Index: head/sys/coda/coda_vfsops.c =================================================================== --- head/sys/coda/coda_vfsops.c (revision 49534) +++ head/sys/coda/coda_vfsops.c (revision 49535) @@ -1,589 +1,587 @@ /* * * Coda: an Experimental Distributed File System * Release 3.1 * * Copyright (c) 1987-1998 Carnegie Mellon University * All Rights Reserved * * Permission to use, copy, modify and distribute this software and its * documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation, and * that credit is given to Carnegie Mellon University in all documents * and publicity pertaining to direct or indirect use of this code or its * derivatives. * * CODA IS AN EXPERIMENTAL SOFTWARE SYSTEM AND IS KNOWN TO HAVE BUGS, * SOME OF WHICH MAY HAVE SERIOUS CONSEQUENCES. CARNEGIE MELLON ALLOWS * FREE USE OF THIS SOFTWARE IN ITS "AS IS" CONDITION. CARNEGIE MELLON * DISCLAIMS ANY LIABILITY OF ANY KIND FOR ANY DAMAGES WHATSOEVER * RESULTING DIRECTLY OR INDIRECTLY FROM THE USE OF THIS SOFTWARE OR OF * ANY DERIVATIVE WORK. * * Carnegie Mellon encourages users of this software to return any * improvements or extensions that they make, and to grant Carnegie * Mellon the rights to redistribute these changes without encumbrance. * * @(#) src/sys/cfs/coda_vfsops.c,v 1.1.1.1 1998/08/29 21:14:52 rvb Exp $ - * $Id: coda_vfsops.c,v 1.15 1999/07/20 07:18:17 phk Exp $ + * $Id: coda_vfsops.c,v 1.16 1999/07/21 12:51:36 phk Exp $ * */ /* * Mach Operating System * Copyright (c) 1989 Carnegie-Mellon University * All rights reserved. The CMU software License Agreement specifies * the terms and conditions for use and redistribution. */ /* * This code was written for the Coda file system at Carnegie Mellon * University. Contributers include David Steere, James Kistler, and * M. Satyanarayanan. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include - -#include MALLOC_DEFINE(M_CODA, "CODA storage", "Various Coda Structures"); int codadebug = 0; int coda_vfsop_print_entry = 0; #define ENTRY if(coda_vfsop_print_entry) myprintf(("Entered %s\n",__FUNCTION__)) struct vnode *coda_ctlvp; struct coda_mntinfo coda_mnttbl[NVCODA]; /* indexed by minor device number */ /* structure to keep statistics of internally generated/satisfied calls */ struct coda_op_stats coda_vfsopstats[CODA_VFSOPS_SIZE]; #define MARK_ENTRY(op) (coda_vfsopstats[op].entries++) #define MARK_INT_SAT(op) (coda_vfsopstats[op].sat_intrn++) #define MARK_INT_FAIL(op) (coda_vfsopstats[op].unsat_intrn++) #define MRAK_INT_GEN(op) (coda_vfsopstats[op].gen_intrn++) extern int coda_nc_initialized; /* Set if cache has been initialized */ extern int vc_nb_open __P((dev_t, int, int, struct proc *)); int coda_vfsopstats_init(void) { register int i; for (i=0;ini_vp; if (error) { MARK_INT_FAIL(CODA_MOUNT_STATS); return (error); } if (dvp->v_type != VCHR) { MARK_INT_FAIL(CODA_MOUNT_STATS); vrele(dvp); return(ENXIO); } dev = dvp->v_rdev; vrele(dvp); /* * See if the device table matches our expectations. 
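 * The mount argument names the Coda kernel device that venus has
 * opened (conventionally something like /dev/cfs0; only the driver
 * identity and the minor number matter here), so a mount is roughly,
 * as a sketch:
 *
 *      mount("coda", "/coda", 0, "/dev/cfs0");
 *
 * The d_open comparison against vc_nb_open below confirms the device
 * really belongs to our driver, and minor(dev) then selects the
 * coda_mnttbl slot.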
*/ if (devsw(dev)->d_open != vc_nb_open) { MARK_INT_FAIL(CODA_MOUNT_STATS); return(ENXIO); } if (minor(dev) >= NVCODA || minor(dev) < 0) { MARK_INT_FAIL(CODA_MOUNT_STATS); return(ENXIO); } /* * Initialize the mount record and link it to the vfs struct */ mi = &coda_mnttbl[minor(dev)]; if (!VC_OPEN(&mi->mi_vcomm)) { MARK_INT_FAIL(CODA_MOUNT_STATS); return(ENODEV); } /* No initialization (here) of mi_vcomm! */ vfsp->mnt_data = (qaddr_t)mi; vfs_getnewfsid (vfsp); mi->mi_vfsp = vfsp; /* * Make a root vnode to placate the Vnode interface, but don't * actually make the CODA_ROOT call to venus until the first call * to coda_root in case a server is down while venus is starting. */ rootfid.Volume = 0; rootfid.Vnode = 0; rootfid.Unique = 0; cp = make_coda_node(&rootfid, vfsp, VDIR); rootvp = CTOV(cp); rootvp->v_flag |= VROOT; ctlfid.Volume = CTL_VOL; ctlfid.Vnode = CTL_VNO; ctlfid.Unique = CTL_UNI; /* cp = make_coda_node(&ctlfid, vfsp, VCHR); The above code seems to cause a loop in the cnode links. I don't totally understand when it happens, it is caught when closing down the system. */ cp = make_coda_node(&ctlfid, 0, VCHR); coda_ctlvp = CTOV(cp); /* Add vfs and rootvp to chain of vfs hanging off mntinfo */ mi->mi_vfsp = vfsp; mi->mi_rootvp = rootvp; /* set filesystem block size */ vfsp->mnt_stat.f_bsize = 8192; /* XXX -JJK */ /* Set f_iosize. XXX -- inamura@isl.ntt.co.jp. For vnode_pager_haspage() references. The value should be obtained from underlying UFS. */ /* Checked UFS. iosize is set as 8192 */ vfsp->mnt_stat.f_iosize = 8192; /* error is currently guaranteed to be zero, but in case some code changes... */ CODADEBUG(1, myprintf(("coda_mount returned %d\n",error));); if (error) MARK_INT_FAIL(CODA_MOUNT_STATS); else MARK_INT_SAT(CODA_MOUNT_STATS); return(error); } int coda_start(vfsp, flags, p) struct mount *vfsp; int flags; struct proc *p; { ENTRY; return (0); } int coda_unmount(vfsp, mntflags, p) struct mount *vfsp; int mntflags; struct proc *p; { struct coda_mntinfo *mi = vftomi(vfsp); int active, error = 0; ENTRY; MARK_ENTRY(CODA_UMOUNT_STATS); if (!CODA_MOUNTED(vfsp)) { MARK_INT_FAIL(CODA_UMOUNT_STATS); return(EINVAL); } if (mi->mi_vfsp == vfsp) { /* We found the victim */ if (!IS_UNMOUNTING(VTOC(mi->mi_rootvp))) return (EBUSY); /* Venus is still running */ #ifdef DEBUG printf("coda_unmount: ROOT: vp %p, cp %p\n", mi->mi_rootvp, VTOC(mi->mi_rootvp)); #endif vrele(mi->mi_rootvp); active = coda_kill(vfsp, NOT_DOWNCALL); mi->mi_rootvp->v_flag &= ~VROOT; error = vflush(mi->mi_vfsp, NULLVP, FORCECLOSE); printf("coda_unmount: active = %d, vflush active %d\n", active, error); error = 0; /* I'm going to take this out to allow lookups to go through. I'm * not sure it's important anyway. -- DCS 2/2/94 */ /* vfsp->VFS_DATA = NULL; */ /* No more vfsp's to hold onto */ mi->mi_vfsp = NULL; mi->mi_rootvp = NULL; if (error) MARK_INT_FAIL(CODA_UMOUNT_STATS); else MARK_INT_SAT(CODA_UMOUNT_STATS); return(error); } return (EINVAL); } /* * find root of cfs */ int coda_root(vfsp, vpp) struct mount *vfsp; struct vnode **vpp; { struct coda_mntinfo *mi = vftomi(vfsp); struct vnode **result; int error; struct proc *p = curproc; /* XXX - bnoble */ ViceFid VFid; ENTRY; MARK_ENTRY(CODA_ROOT_STATS); result = NULL; if (vfsp == mi->mi_vfsp) { if ((VTOC(mi->mi_rootvp)->c_fid.Volume != 0) || (VTOC(mi->mi_rootvp)->c_fid.Vnode != 0) || (VTOC(mi->mi_rootvp)->c_fid.Unique != 0)) { /* Found valid root. */ *vpp = mi->mi_rootvp; /* On Mach, this is vref. 
On NetBSD, VOP_LOCK */ #if 1 vref(*vpp); vn_lock(*vpp, LK_EXCLUSIVE, p); #else vget(*vpp, LK_EXCLUSIVE, p); #endif MARK_INT_SAT(CODA_ROOT_STATS); return(0); } } error = venus_root(vftomi(vfsp), p->p_cred->pc_ucred, p, &VFid); if (!error) { /* * Save the new rootfid in the cnode, and rehash the cnode into the * cnode hash with the new fid key. */ coda_unsave(VTOC(mi->mi_rootvp)); VTOC(mi->mi_rootvp)->c_fid = VFid; coda_save(VTOC(mi->mi_rootvp)); *vpp = mi->mi_rootvp; #if 1 vref(*vpp); vn_lock(*vpp, LK_EXCLUSIVE, p); #else vget(*vpp, LK_EXCLUSIVE, p); #endif MARK_INT_SAT(CODA_ROOT_STATS); goto exit; } else if (error == ENODEV || error == EINTR) { /* Gross hack here! */ /* * If Venus fails to respond to the CODA_ROOT call, coda_call returns * ENODEV. Return the uninitialized root vnode to allow vfs * operations such as unmount to continue. Without this hack, * there is no way to do an unmount if Venus dies before a * successful CODA_ROOT call is done. All vnode operations * will fail. */ *vpp = mi->mi_rootvp; #if 1 vref(*vpp); vn_lock(*vpp, LK_EXCLUSIVE, p); #else vget(*vpp, LK_EXCLUSIVE, p); #endif MARK_INT_FAIL(CODA_ROOT_STATS); error = 0; goto exit; } else { CODADEBUG( CODA_ROOT, myprintf(("error %d in CODA_ROOT\n", error)); ); MARK_INT_FAIL(CODA_ROOT_STATS); goto exit; } exit: return(error); } int coda_quotactl(vfsp, cmd, uid, arg, p) struct mount *vfsp; int cmd; uid_t uid; caddr_t arg; struct proc *p; { ENTRY; return (EOPNOTSUPP); } /* * Get file system statistics. */ int coda_nb_statfs(vfsp, sbp, p) register struct mount *vfsp; struct statfs *sbp; struct proc *p; { ENTRY; /* MARK_ENTRY(CODA_STATFS_STATS); */ if (!CODA_MOUNTED(vfsp)) { /* MARK_INT_FAIL(CODA_STATFS_STATS);*/ return(EINVAL); } bzero(sbp, sizeof(struct statfs)); /* XXX - what to do about f_flags, others? --bnoble */ /* Below This is what AFS does #define NB_SFS_SIZ 0x895440 */ /* Note: Normal fs's have a bsize of 0x400 == 1024 */ sbp->f_type = vfsp->mnt_vfc->vfc_typenum; sbp->f_bsize = 8192; /* XXX */ sbp->f_iosize = 8192; /* XXX */ #define NB_SFS_SIZ 0x8AB75D sbp->f_blocks = NB_SFS_SIZ; sbp->f_bfree = NB_SFS_SIZ; sbp->f_bavail = NB_SFS_SIZ; sbp->f_files = NB_SFS_SIZ; sbp->f_ffree = NB_SFS_SIZ; bcopy((caddr_t)&(vfsp->mnt_stat.f_fsid), (caddr_t)&(sbp->f_fsid), sizeof (fsid_t)); snprintf(sbp->f_mntonname, sizeof(sbp->f_mntonname), "/coda"); snprintf(sbp->f_mntfromname, sizeof(sbp->f_mntfromname), "CODA"); /* MARK_INT_SAT(CODA_STATFS_STATS); */ return(0); } /* * Flush any pending I/O. */ int coda_sync(vfsp, waitfor, cred, p) struct mount *vfsp; int waitfor; struct ucred *cred; struct proc *p; { ENTRY; MARK_ENTRY(CODA_SYNC_STATS); MARK_INT_SAT(CODA_SYNC_STATS); return(0); } int coda_vget(vfsp, ino, vpp) struct mount *vfsp; ino_t ino; struct vnode **vpp; { ENTRY; return (EOPNOTSUPP); } /* * fhtovp is now what vget used to be in 4.3-derived systems. For * some silly reason, vget is now keyed by a 32 bit ino_t, rather than * a type-specific fid. */ int coda_fhtovp(vfsp, fhp, nam, vpp, exflagsp, creadanonp) register struct mount *vfsp; struct fid *fhp; struct mbuf *nam; struct vnode **vpp; int *exflagsp; struct ucred **creadanonp; { struct cfid *cfid = (struct cfid *)fhp; struct cnode *cp = 0; int error; struct proc *p = curproc; /* XXX -mach */ ViceFid VFid; int vtype; ENTRY; MARK_ENTRY(CODA_VGET_STATS); /* Check for vget of control object. 
*/ if (IS_CTL_FID(&cfid->cfid_fid)) { *vpp = coda_ctlvp; vref(coda_ctlvp); MARK_INT_SAT(CODA_VGET_STATS); return(0); } error = venus_fhtovp(vftomi(vfsp), &cfid->cfid_fid, p->p_cred->pc_ucred, p, &VFid, &vtype); if (error) { CODADEBUG(CODA_VGET, myprintf(("vget error %d\n",error));) *vpp = (struct vnode *)0; } else { CODADEBUG(CODA_VGET, myprintf(("vget: vol %lx vno %lx uni %lx type %d result %d\n", VFid.Volume, VFid.Vnode, VFid.Unique, vtype, error)); ) cp = make_coda_node(&VFid, vfsp, vtype); *vpp = CTOV(cp); } return(error); } int coda_vptofh(vnp, fidp) struct vnode *vnp; struct fid *fidp; { ENTRY; return (EOPNOTSUPP); } int coda_init(struct vfsconf *vfsp) { ENTRY; return 0; } /* * To allow for greater ease of use, some vnodes may be orphaned when * Venus dies. Certain operations should still be allowed to go * through, but without propagating ophan-ness. So this function will * get a new vnode for the file from the current run of Venus. */ int getNewVnode(vpp) struct vnode **vpp; { struct cfid cfid; struct coda_mntinfo *mi = vftomi((*vpp)->v_mount); ENTRY; cfid.cfid_len = (short)sizeof(ViceFid); cfid.cfid_fid = VTOC(*vpp)->c_fid; /* Structure assignment. */ /* XXX ? */ /* We're guessing that if set, the 1st element on the list is a * valid vnode to use. If not, return ENODEV as venus is dead. */ if (mi->mi_vfsp == NULL) return ENODEV; return coda_fhtovp(mi->mi_vfsp, (struct fid*)&cfid, NULL, vpp, NULL, NULL); } #include #include /* get the mount structure corresponding to a given device. Assume * device corresponds to a UFS. Return NULL if no device is found. */ struct mount *devtomp(dev) dev_t dev; { struct mount *mp, *nmp; for (mp = mountlist.cqh_first; mp != (void*)&mountlist; mp = nmp) { nmp = mp->mnt_list.cqe_next; if (((VFSTOUFS(mp))->um_dev == dev)) { /* mount corresponds to UFS and the device matches one we want */ return(mp); } } /* mount structure wasn't found */ return(NULL); } struct vfsops coda_vfsops = { coda_mount, coda_start, coda_unmount, coda_root, coda_quotactl, coda_nb_statfs, coda_sync, coda_vget, (int (*) (struct mount *, struct fid *, struct sockaddr *, struct vnode **, int *, struct ucred **)) eopnotsupp, (int (*) (struct vnode *, struct fid *)) eopnotsupp, coda_init, }; VFS_SET(coda_vfsops, coda, VFCF_NETWORK); Index: head/sys/contrib/softupdates/ffs_softdep.c =================================================================== --- head/sys/contrib/softupdates/ffs_softdep.c (revision 49534) +++ head/sys/contrib/softupdates/ffs_softdep.c (revision 49535) @@ -1,4485 +1,4485 @@ /* * Copyright 1998 Marshall Kirk McKusick. All Rights Reserved. * * The soft updates code is derived from the appendix of a University * of Michigan technical report (Gregory R. Ganger and Yale N. Patt, * "Soft Updates: A Solution to the Metadata Update Problem in File * Systems", CSE-TR-254-95, August 1995). * * The following are the copyrights and redistribution conditions that * apply to this copy of the soft update software. For a license * to use, redistribute or sell the soft update software under * conditions other than those described here, please contact the * author at one of the following addresses: * * Marshall Kirk McKusick mckusick@mckusick.com * 1614 Oxford Street +1-510-843-9542 * Berkeley, CA 94709-1608 * USA * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. None of the names of McKusick, Ganger, Patt, or the University of * Michigan may be used to endorse or promote products derived from * this software without specific prior written permission. * 4. Redistributions in any form must be accompanied by information on * how to obtain complete source code for any accompanying software * that uses this software. This source code must either be included * in the distribution or be available for no more than the cost of * distribution plus a nominal fee, and must be freely redistributable * under reasonable conditions. For an executable file, complete * source code means the source code for all modules it contains. * It does not mean source code for modules or files that typically * accompany the operating system on which the executable file runs, * e.g., standard library modules or system header files. * * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)ffs_softdep.c 9.40 (McKusick) 6/15/99 - * $Id: ffs_softdep.c,v 1.33 1999/06/27 13:26:23 peter Exp $ + * $Id: ffs_softdep.c,v 1.34 1999/06/29 15:57:40 mckusick Exp $ */ /* * For now we want the safety net that the DIAGNOSTIC and DEBUG flags provide. */ #ifndef DIAGNOSTIC #define DIAGNOSTIC #endif #ifndef DEBUG #define DEBUG #endif #include #include #include #include #include #include #include #include #include -#include +#include #include #include #include #include #include #include #include #include /* * These definitions need to be adapted to the system to which * this file is being ported. */ /* * malloc types defined for the softdep system. 
*/ MALLOC_DEFINE(M_PAGEDEP, "pagedep","File page dependencies"); MALLOC_DEFINE(M_INODEDEP, "inodedep","Inode dependencies"); MALLOC_DEFINE(M_NEWBLK, "newblk","New block allocation"); MALLOC_DEFINE(M_BMSAFEMAP, "bmsafemap","Block or frag allocated from cyl group map"); MALLOC_DEFINE(M_ALLOCDIRECT, "allocdirect","Block or frag dependency for an inode"); MALLOC_DEFINE(M_INDIRDEP, "indirdep","Indirect block dependencies"); MALLOC_DEFINE(M_ALLOCINDIR, "allocindir","Block dependency for an indirect block"); MALLOC_DEFINE(M_FREEFRAG, "freefrag","Previously used frag for an inode"); MALLOC_DEFINE(M_FREEBLKS, "freeblks","Blocks freed from an inode"); MALLOC_DEFINE(M_FREEFILE, "freefile","Inode deallocated"); MALLOC_DEFINE(M_DIRADD, "diradd","New directory entry"); MALLOC_DEFINE(M_MKDIR, "mkdir","New directory"); MALLOC_DEFINE(M_DIRREM, "dirrem","Directory entry deleted"); #define D_PAGEDEP 0 #define D_INODEDEP 1 #define D_NEWBLK 2 #define D_BMSAFEMAP 3 #define D_ALLOCDIRECT 4 #define D_INDIRDEP 5 #define D_ALLOCINDIR 6 #define D_FREEFRAG 7 #define D_FREEBLKS 8 #define D_FREEFILE 9 #define D_DIRADD 10 #define D_MKDIR 11 #define D_DIRREM 12 #define D_LAST D_DIRREM /* * translate from workitem type to memory type * MUST match the defines above, such that memtype[D_XXX] == M_XXX */ static struct malloc_type *memtype[] = { M_PAGEDEP, M_INODEDEP, M_NEWBLK, M_BMSAFEMAP, M_ALLOCDIRECT, M_INDIRDEP, M_ALLOCINDIR, M_FREEFRAG, M_FREEBLKS, M_FREEFILE, M_DIRADD, M_MKDIR, M_DIRREM }; #define DtoM(type) (memtype[type]) /* * Names of malloc types. */ #define TYPENAME(type) \ ((unsigned)(type) < D_LAST ? memtype[type]->ks_shortdesc : "???") #define CURPROC curproc /* * End system adaptaion definitions. */ /* * Internal function prototypes. */ static void softdep_error __P((char *, int)); static void drain_output __P((struct vnode *, int)); static int getdirtybuf __P((struct buf **, int)); static void clear_remove __P((struct proc *)); static void clear_inodedeps __P((struct proc *)); static int flush_pagedep_deps __P((struct vnode *, struct mount *, struct diraddhd *)); static int flush_inodedep_deps __P((struct fs *, ino_t)); static int handle_written_filepage __P((struct pagedep *, struct buf *)); static void diradd_inode_written __P((struct diradd *, struct inodedep *)); static int handle_written_inodeblock __P((struct inodedep *, struct buf *)); static void handle_allocdirect_partdone __P((struct allocdirect *)); static void handle_allocindir_partdone __P((struct allocindir *)); static void initiate_write_filepage __P((struct pagedep *, struct buf *)); static void handle_written_mkdir __P((struct mkdir *, int)); static void initiate_write_inodeblock __P((struct inodedep *, struct buf *)); static void handle_workitem_freefile __P((struct freefile *)); static void handle_workitem_remove __P((struct dirrem *)); static struct dirrem *newdirrem __P((struct buf *, struct inode *, struct inode *, int)); static void free_diradd __P((struct diradd *)); static void free_allocindir __P((struct allocindir *, struct inodedep *)); static int indir_trunc __P((struct inode *, ufs_daddr_t, int, ufs_lbn_t, long *)); static void deallocate_dependencies __P((struct buf *, struct inodedep *)); static void free_allocdirect __P((struct allocdirectlst *, struct allocdirect *, int)); static int free_inodedep __P((struct inodedep *)); static void handle_workitem_freeblocks __P((struct freeblks *)); static void merge_inode_lists __P((struct inodedep *)); static void setup_allocindir_phase2 __P((struct buf *, struct inode *, struct 
allocindir *)); static struct allocindir *newallocindir __P((struct inode *, int, ufs_daddr_t, ufs_daddr_t)); static void handle_workitem_freefrag __P((struct freefrag *)); static struct freefrag *newfreefrag __P((struct inode *, ufs_daddr_t, long)); static void allocdirect_merge __P((struct allocdirectlst *, struct allocdirect *, struct allocdirect *)); static struct bmsafemap *bmsafemap_lookup __P((struct buf *)); static int newblk_lookup __P((struct fs *, ufs_daddr_t, int, struct newblk **)); static int inodedep_lookup __P((struct fs *, ino_t, int, struct inodedep **)); static int pagedep_lookup __P((struct inode *, ufs_lbn_t, int, struct pagedep **)); static void pause_timer __P((void *)); static int request_cleanup __P((int, int)); static void add_to_worklist __P((struct worklist *)); /* * Exported softdep operations. */ struct bio_ops bioops = { softdep_disk_io_initiation, /* io_start */ softdep_disk_write_complete, /* io_complete */ softdep_deallocate_dependencies, /* io_deallocate */ softdep_fsync, /* io_fsync */ softdep_process_worklist, /* io_sync */ }; /* * Locking primitives. * * For a uniprocessor, all we need to do is protect against disk * interrupts. For a multiprocessor, this lock would have to be * a mutex. A single mutex is used throughout this file, though * finer grain locking could be used if contention warranted it. * * For a multiprocessor, the sleep call would accept a lock and * release it after the sleep processing was complete. In a uniprocessor * implementation there is no such interlock, so we simple mark * the places where it needs to be done with the `interlocked' form * of the lock calls. Since the uniprocessor sleep already interlocks * the spl, there is nothing that really needs to be done. */ #ifndef /* NOT */ DEBUG static struct lockit { int lkt_spl; } lk = { 0 }; #define ACQUIRE_LOCK(lk) (lk)->lkt_spl = splbio() #define FREE_LOCK(lk) splx((lk)->lkt_spl) #define ACQUIRE_LOCK_INTERLOCKED(lk) #define FREE_LOCK_INTERLOCKED(lk) #else /* DEBUG */ static struct lockit { int lkt_spl; pid_t lkt_held; } lk = { 0, -1 }; static int lockcnt; static void acquire_lock __P((struct lockit *)); static void free_lock __P((struct lockit *)); static void acquire_lock_interlocked __P((struct lockit *)); static void free_lock_interlocked __P((struct lockit *)); #define ACQUIRE_LOCK(lk) acquire_lock(lk) #define FREE_LOCK(lk) free_lock(lk) #define ACQUIRE_LOCK_INTERLOCKED(lk) acquire_lock_interlocked(lk) #define FREE_LOCK_INTERLOCKED(lk) free_lock_interlocked(lk) static void acquire_lock(lk) struct lockit *lk; { if (lk->lkt_held != -1) { if (lk->lkt_held == CURPROC->p_pid) panic("softdep_lock: locking against myself"); else panic("softdep_lock: lock held by %d", lk->lkt_held); } lk->lkt_spl = splbio(); lk->lkt_held = CURPROC->p_pid; lockcnt++; } static void free_lock(lk) struct lockit *lk; { if (lk->lkt_held == -1) panic("softdep_unlock: lock not held"); lk->lkt_held = -1; splx(lk->lkt_spl); } static void acquire_lock_interlocked(lk) struct lockit *lk; { if (lk->lkt_held != -1) { if (lk->lkt_held == CURPROC->p_pid) panic("softdep_lock_interlocked: locking against self"); else panic("softdep_lock_interlocked: lock held by %d", lk->lkt_held); } lk->lkt_held = CURPROC->p_pid; lockcnt++; } static void free_lock_interlocked(lk) struct lockit *lk; { if (lk->lkt_held == -1) panic("softdep_unlock_interlocked: lock not held"); lk->lkt_held = -1; } #endif /* DEBUG */ /* * Place holder for real semaphores. 
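 * The hash lookup routines below use one of these as a simple
 * "allocation in progress" gate around the unlocked MALLOC of a new
 * entry.  The pattern, sketched here with the pagedep names that
 * appear later in this file, is:
 *
 *      if (sema_get(&pagedep_in_progress, &lk) == 0) {
 *              ACQUIRE_LOCK(&lk);
 *              goto top;               (lost the race; rescan the chain)
 *      }
 *      MALLOC(pagedep, struct pagedep *, sizeof(struct pagedep),
 *          M_PAGEDEP, M_WAITOK);
 *      (initialize the new pagedep)
 *      ACQUIRE_LOCK(&lk);
 *      LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
 *      sema_release(&pagedep_in_progress);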
*/ struct sema { int value; pid_t holder; char *name; int prio; int timo; }; static void sema_init __P((struct sema *, char *, int, int)); static int sema_get __P((struct sema *, struct lockit *)); static void sema_release __P((struct sema *)); static void sema_init(semap, name, prio, timo) struct sema *semap; char *name; int prio, timo; { semap->holder = -1; semap->value = 0; semap->name = name; semap->prio = prio; semap->timo = timo; } static int sema_get(semap, interlock) struct sema *semap; struct lockit *interlock; { if (semap->value++ > 0) { if (interlock != NULL) FREE_LOCK_INTERLOCKED(interlock); tsleep((caddr_t)semap, semap->prio, semap->name, semap->timo); if (interlock != NULL) { ACQUIRE_LOCK_INTERLOCKED(interlock); FREE_LOCK(interlock); } return (0); } semap->holder = CURPROC->p_pid; if (interlock != NULL) FREE_LOCK(interlock); return (1); } static void sema_release(semap) struct sema *semap; { if (semap->value <= 0 || semap->holder != CURPROC->p_pid) panic("sema_release: not held"); if (--semap->value > 0) { semap->value = 0; wakeup(semap); } semap->holder = -1; } /* * Worklist queue management. * These routines require that the lock be held. */ #ifndef /* NOT */ DEBUG #define WORKLIST_INSERT(head, item) do { \ (item)->wk_state |= ONWORKLIST; \ LIST_INSERT_HEAD(head, item, wk_list); \ } while (0) #define WORKLIST_REMOVE(item) do { \ (item)->wk_state &= ~ONWORKLIST; \ LIST_REMOVE(item, wk_list); \ } while (0) #define WORKITEM_FREE(item, type) FREE(item, DtoM(type)) #else /* DEBUG */ static void worklist_insert __P((struct workhead *, struct worklist *)); static void worklist_remove __P((struct worklist *)); static void workitem_free __P((struct worklist *, int)); #define WORKLIST_INSERT(head, item) worklist_insert(head, item) #define WORKLIST_REMOVE(item) worklist_remove(item) #define WORKITEM_FREE(item, type) workitem_free((struct worklist *)item, type) static void worklist_insert(head, item) struct workhead *head; struct worklist *item; { if (lk.lkt_held == -1) panic("worklist_insert: lock not held"); if (item->wk_state & ONWORKLIST) panic("worklist_insert: already on list"); item->wk_state |= ONWORKLIST; LIST_INSERT_HEAD(head, item, wk_list); } static void worklist_remove(item) struct worklist *item; { if (lk.lkt_held == -1) panic("worklist_remove: lock not held"); if ((item->wk_state & ONWORKLIST) == 0) panic("worklist_remove: not on list"); item->wk_state &= ~ONWORKLIST; LIST_REMOVE(item, wk_list); } static void workitem_free(item, type) struct worklist *item; int type; { if (item->wk_state & ONWORKLIST) panic("workitem_free: still on list"); if (item->wk_type != type) panic("workitem_free: type mismatch"); FREE(item, DtoM(type)); } #endif /* DEBUG */ /* * Workitem queue management */ static struct workhead softdep_workitem_pending; static int softdep_worklist_busy; static int max_softdeps; /* maximum number of structs before slowdown */ static int tickdelay = 2; /* number of ticks to pause during slowdown */ static int proc_waiting; /* tracks whether we have a timeout posted */ static struct proc *filesys_syncer; /* proc of filesystem syncer process */ static int req_clear_inodedeps; /* syncer process flush some inodedeps */ #define FLUSH_INODES 1 static int req_clear_remove; /* syncer process flush some freeblks */ #define FLUSH_REMOVE 2 /* * runtime statistics */ static int stat_blk_limit_push; /* number of times block limit neared */ static int stat_ino_limit_push; /* number of times inode limit neared */ static int stat_blk_limit_hit; /* number of times block slowdown 
imposed */ static int stat_ino_limit_hit; /* number of times inode slowdown imposed */ static int stat_indir_blk_ptrs; /* bufs redirtied as indir ptrs not written */ static int stat_inode_bitmap; /* bufs redirtied as inode bitmap not written */ static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */ static int stat_dir_entry; /* bufs redirtied as dir entry cannot write */ #ifdef DEBUG #include #include #if defined(__FreeBSD__) SYSCTL_INT(_debug, OID_AUTO, max_softdeps, CTLFLAG_RW, &max_softdeps, 0, ""); SYSCTL_INT(_debug, OID_AUTO, tickdelay, CTLFLAG_RW, &tickdelay, 0, ""); SYSCTL_INT(_debug, OID_AUTO, blk_limit_push, CTLFLAG_RW, &stat_blk_limit_push, 0,""); SYSCTL_INT(_debug, OID_AUTO, ino_limit_push, CTLFLAG_RW, &stat_ino_limit_push, 0,""); SYSCTL_INT(_debug, OID_AUTO, blk_limit_hit, CTLFLAG_RW, &stat_blk_limit_hit, 0, ""); SYSCTL_INT(_debug, OID_AUTO, ino_limit_hit, CTLFLAG_RW, &stat_ino_limit_hit, 0, ""); SYSCTL_INT(_debug, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW, &stat_indir_blk_ptrs, 0, ""); SYSCTL_INT(_debug, OID_AUTO, inode_bitmap, CTLFLAG_RW, &stat_inode_bitmap, 0, ""); SYSCTL_INT(_debug, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW, &stat_direct_blk_ptrs, 0, ""); SYSCTL_INT(_debug, OID_AUTO, dir_entry, CTLFLAG_RW, &stat_dir_entry, 0, ""); #else /* !__FreeBSD__ */ struct ctldebug debug20 = { "max_softdeps", &max_softdeps }; struct ctldebug debug21 = { "tickdelay", &tickdelay }; struct ctldebug debug23 = { "blk_limit_push", &stat_blk_limit_push }; struct ctldebug debug24 = { "ino_limit_push", &stat_ino_limit_push }; struct ctldebug debug25 = { "blk_limit_hit", &stat_blk_limit_hit }; struct ctldebug debug26 = { "ino_limit_hit", &stat_ino_limit_hit }; struct ctldebug debug27 = { "indir_blk_ptrs", &stat_indir_blk_ptrs }; struct ctldebug debug28 = { "inode_bitmap", &stat_inode_bitmap }; struct ctldebug debug29 = { "direct_blk_ptrs", &stat_direct_blk_ptrs }; struct ctldebug debug30 = { "dir_entry", &stat_dir_entry }; #endif /* !__FreeBSD__ */ #endif /* DEBUG */ /* * Add an item to the end of the work queue. * This routine requires that the lock be held. * This is the only routine that adds items to the list. * The following routine is the only one that removes items * and does so in order from first to last. */ static void add_to_worklist(wk) struct worklist *wk; { static struct worklist *worklist_tail; if (wk->wk_state & ONWORKLIST) panic("add_to_worklist: already on list"); wk->wk_state |= ONWORKLIST; if (LIST_FIRST(&softdep_workitem_pending) == NULL) LIST_INSERT_HEAD(&softdep_workitem_pending, wk, wk_list); else LIST_INSERT_AFTER(worklist_tail, wk, wk_list); worklist_tail = wk; } /* * Process that runs once per second to handle items in the background queue. * * Note that we ensure that everything is done in the order in which they * appear in the queue. The code below depends on this property to ensure * that blocks of a file are freed before the inode itself is freed. This * ordering ensures that no new triples will be generated * until all the old ones have been purged from the dependency lists. */ int softdep_process_worklist(matchmnt) struct mount *matchmnt; { struct proc *p = CURPROC; struct worklist *wk; struct fs *matchfs; int matchcnt; /* * Record the process identifier of our caller so that we can give * this process preferential treatment in request_cleanup below. */ filesys_syncer = p; matchcnt = 0; matchfs = NULL; if (matchmnt != NULL) matchfs = VFSTOUFS(matchmnt)->um_fs; /* * There is no danger of having multiple processes run this * code. 
It is single threaded solely so that softdep_flushfiles * (below) can get an accurate count of the number of items * related to its mount point that are in the list. */ if (softdep_worklist_busy && matchmnt == NULL) return (-1); /* * If requested, try removing inode or removal dependencies. */ if (req_clear_inodedeps) { clear_inodedeps(p); req_clear_inodedeps = 0; wakeup(&proc_waiting); } if (req_clear_remove) { clear_remove(p); req_clear_remove = 0; wakeup(&proc_waiting); } ACQUIRE_LOCK(&lk); while ((wk = LIST_FIRST(&softdep_workitem_pending)) != 0) { WORKLIST_REMOVE(wk); FREE_LOCK(&lk); switch (wk->wk_type) { case D_DIRREM: /* removal of a directory entry */ if (WK_DIRREM(wk)->dm_mnt == matchmnt) matchcnt += 1; handle_workitem_remove(WK_DIRREM(wk)); break; case D_FREEBLKS: /* releasing blocks and/or fragments from a file */ if (WK_FREEBLKS(wk)->fb_fs == matchfs) matchcnt += 1; handle_workitem_freeblocks(WK_FREEBLKS(wk)); break; case D_FREEFRAG: /* releasing a fragment when replaced as a file grows */ if (WK_FREEFRAG(wk)->ff_fs == matchfs) matchcnt += 1; handle_workitem_freefrag(WK_FREEFRAG(wk)); break; case D_FREEFILE: /* releasing an inode when its link count drops to 0 */ if (WK_FREEFILE(wk)->fx_fs == matchfs) matchcnt += 1; handle_workitem_freefile(WK_FREEFILE(wk)); break; default: panic("%s_process_worklist: Unknown type %s", "softdep", TYPENAME(wk->wk_type)); /* NOTREACHED */ } if (softdep_worklist_busy && matchmnt == NULL) return (-1); /* * If requested, try removing inode or removal dependencies. */ if (req_clear_inodedeps) { clear_inodedeps(p); req_clear_inodedeps = 0; wakeup(&proc_waiting); } if (req_clear_remove) { clear_remove(p); req_clear_remove = 0; wakeup(&proc_waiting); } ACQUIRE_LOCK(&lk); } FREE_LOCK(&lk); return (matchcnt); } /* * Purge the work list of all items associated with a particular mount point. */ int softdep_flushfiles(oldmnt, flags, p) struct mount *oldmnt; int flags; struct proc *p; { struct vnode *devvp; int error, loopcnt; /* * Await our turn to clear out the queue. */ while (softdep_worklist_busy) tsleep(&lbolt, PRIBIO, "softflush", 0); softdep_worklist_busy = 1; if ((error = ffs_flushfiles(oldmnt, flags, p)) != 0) { softdep_worklist_busy = 0; return (error); } /* * Alternately flush the block device associated with the mount * point and process any dependencies that the flushing * creates. In theory, this loop can happen at most twice, * but we give it a few extra just to be sure. */ devvp = VFSTOUFS(oldmnt)->um_devvp; for (loopcnt = 10; loopcnt > 0; loopcnt--) { if (softdep_process_worklist(oldmnt) == 0) { /* * Do another flush in case any vnodes were brought in * as part of the cleanup operations. */ if ((error = ffs_flushfiles(oldmnt, flags, p)) != 0) break; /* * If we still found nothing to do, we are really done. */ if (softdep_process_worklist(oldmnt) == 0) break; } vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, p); error = VOP_FSYNC(devvp, p->p_ucred, MNT_WAIT, p); VOP_UNLOCK(devvp, 0, p); if (error) break; } softdep_worklist_busy = 0; /* * If we are unmounting then it is an error to fail. If we * are simply trying to downgrade to read-only, then filesystem * activity can keep us busy forever, so we just fail with EBUSY. */ if (loopcnt == 0) { if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) panic("softdep_flushfiles: looping"); error = EBUSY; } return (error); } /* * Structure hashing. * * There are three types of structures that can be looked up: * 1) pagedep structures identified by mount point, inode number, * and logical block. 
* 2) inodedep structures identified by mount point and inode number. * 3) newblk structures identified by mount point and * physical block number. * * The "pagedep" and "inodedep" dependency structures are hashed * separately from the file blocks and inodes to which they correspond. * This separation helps when the in-memory copy of an inode or * file block must be replaced. It also obviates the need to access * an inode or file page when simply updating (or de-allocating) * dependency structures. Lookup of newblk structures is needed to * find newly allocated blocks when trying to associate them with * their allocdirect or allocindir structure. * * The lookup routines optionally create and hash a new instance when * an existing entry is not found. */ #define DEPALLOC 0x0001 /* allocate structure if lookup fails */ /* * Structures and routines associated with pagedep caching. */ LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl; u_long pagedep_hash; /* size of hash table - 1 */ #define PAGEDEP_HASH(mp, inum, lbn) \ (&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \ pagedep_hash]) static struct sema pagedep_in_progress; /* * Look up a pagedep. Return 1 if found, 0 if not found. * If not found, allocate if DEPALLOC flag is passed. * Found or allocated entry is returned in pagedeppp. * This routine must be called with splbio interrupts blocked. */ static int pagedep_lookup(ip, lbn, flags, pagedeppp) struct inode *ip; ufs_lbn_t lbn; int flags; struct pagedep **pagedeppp; { struct pagedep *pagedep; struct pagedep_hashhead *pagedephd; struct mount *mp; int i; #ifdef DEBUG if (lk.lkt_held == -1) panic("pagedep_lookup: lock not held"); #endif mp = ITOV(ip)->v_mount; pagedephd = PAGEDEP_HASH(mp, ip->i_number, lbn); top: for (pagedep = LIST_FIRST(pagedephd); pagedep; pagedep = LIST_NEXT(pagedep, pd_hash)) if (ip->i_number == pagedep->pd_ino && lbn == pagedep->pd_lbn && mp == pagedep->pd_mnt) break; if (pagedep) { *pagedeppp = pagedep; return (1); } if ((flags & DEPALLOC) == 0) { *pagedeppp = NULL; return (0); } if (sema_get(&pagedep_in_progress, &lk) == 0) { ACQUIRE_LOCK(&lk); goto top; } MALLOC(pagedep, struct pagedep *, sizeof(struct pagedep), M_PAGEDEP, M_WAITOK); bzero(pagedep, sizeof(struct pagedep)); pagedep->pd_list.wk_type = D_PAGEDEP; pagedep->pd_mnt = mp; pagedep->pd_ino = ip->i_number; pagedep->pd_lbn = lbn; LIST_INIT(&pagedep->pd_dirremhd); LIST_INIT(&pagedep->pd_pendinghd); for (i = 0; i < DAHASHSZ; i++) LIST_INIT(&pagedep->pd_diraddhd[i]); ACQUIRE_LOCK(&lk); LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash); sema_release(&pagedep_in_progress); *pagedeppp = pagedep; return (0); } /* * Structures and routines associated with inodedep caching. */ LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl; static u_long inodedep_hash; /* size of hash table - 1 */ static long num_inodedep; /* number of inodedep allocated */ #define INODEDEP_HASH(fs, inum) \ (&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash]) static struct sema inodedep_in_progress; /* * Look up a inodedep. Return 1 if found, 0 if not found. * If not found, allocate if DEPALLOC flag is passed. * Found or allocated entry is returned in inodedeppp. * This routine must be called with splbio interrupts blocked. 
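 * One caller, softdep_setup_inomapdep below, takes the interlock
 * first and then asks for creation, in outline:
 *
 *      ACQUIRE_LOCK(&lk);
 *      if (inodedep_lookup(ip->i_fs, newinum, DEPALLOC, &inodedep) != 0)
 *              panic("softdep_setup_inomapdep: found inode");
 *      inodedep->id_buf = bp;
 *      inodedep->id_state &= ~DEPCOMPLETE;
 *      FREE_LOCK(&lk);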
*/ static int inodedep_lookup(fs, inum, flags, inodedeppp) struct fs *fs; ino_t inum; int flags; struct inodedep **inodedeppp; { struct inodedep *inodedep; struct inodedep_hashhead *inodedephd; int firsttry; #ifdef DEBUG if (lk.lkt_held == -1) panic("inodedep_lookup: lock not held"); #endif firsttry = 1; inodedephd = INODEDEP_HASH(fs, inum); top: for (inodedep = LIST_FIRST(inodedephd); inodedep; inodedep = LIST_NEXT(inodedep, id_hash)) if (inum == inodedep->id_ino && fs == inodedep->id_fs) break; if (inodedep) { *inodedeppp = inodedep; return (1); } if ((flags & DEPALLOC) == 0) { *inodedeppp = NULL; return (0); } /* * If we are over our limit, try to improve the situation. */ if (num_inodedep > max_softdeps && firsttry && speedup_syncer() == 0 && request_cleanup(FLUSH_INODES, 1)) { firsttry = 0; goto top; } if (sema_get(&inodedep_in_progress, &lk) == 0) { ACQUIRE_LOCK(&lk); goto top; } num_inodedep += 1; MALLOC(inodedep, struct inodedep *, sizeof(struct inodedep), M_INODEDEP, M_WAITOK); inodedep->id_list.wk_type = D_INODEDEP; inodedep->id_fs = fs; inodedep->id_ino = inum; inodedep->id_state = ALLCOMPLETE; inodedep->id_nlinkdelta = 0; inodedep->id_savedino = NULL; inodedep->id_savedsize = -1; inodedep->id_buf = NULL; LIST_INIT(&inodedep->id_pendinghd); LIST_INIT(&inodedep->id_inowait); LIST_INIT(&inodedep->id_bufwait); TAILQ_INIT(&inodedep->id_inoupdt); TAILQ_INIT(&inodedep->id_newinoupdt); ACQUIRE_LOCK(&lk); LIST_INSERT_HEAD(inodedephd, inodedep, id_hash); sema_release(&inodedep_in_progress); *inodedeppp = inodedep; return (0); } /* * Structures and routines associated with newblk caching. */ LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl; u_long newblk_hash; /* size of hash table - 1 */ #define NEWBLK_HASH(fs, inum) \ (&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash]) static struct sema newblk_in_progress; /* * Look up a newblk. Return 1 if found, 0 if not found. * If not found, allocate if DEPALLOC flag is passed. * Found or allocated entry is returned in newblkpp. */ static int newblk_lookup(fs, newblkno, flags, newblkpp) struct fs *fs; ufs_daddr_t newblkno; int flags; struct newblk **newblkpp; { struct newblk *newblk; struct newblk_hashhead *newblkhd; newblkhd = NEWBLK_HASH(fs, newblkno); top: for (newblk = LIST_FIRST(newblkhd); newblk; newblk = LIST_NEXT(newblk, nb_hash)) if (newblkno == newblk->nb_newblkno && fs == newblk->nb_fs) break; if (newblk) { *newblkpp = newblk; return (1); } if ((flags & DEPALLOC) == 0) { *newblkpp = NULL; return (0); } if (sema_get(&newblk_in_progress, 0) == 0) goto top; MALLOC(newblk, struct newblk *, sizeof(struct newblk), M_NEWBLK, M_WAITOK); newblk->nb_state = 0; newblk->nb_fs = fs; newblk->nb_newblkno = newblkno; LIST_INSERT_HEAD(newblkhd, newblk, nb_hash); sema_release(&newblk_in_progress); *newblkpp = newblk; return (0); } /* * Executed during filesystem system initialization before * mounting any file systems. */ void softdep_initialize() { LIST_INIT(&mkdirlisthd); LIST_INIT(&softdep_workitem_pending); max_softdeps = desiredvnodes * 8; pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP, &pagedep_hash); sema_init(&pagedep_in_progress, "pagedep", PRIBIO, 0); inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash); sema_init(&inodedep_in_progress, "inodedep", PRIBIO, 0); newblk_hashtbl = hashinit(64, M_NEWBLK, &newblk_hash); sema_init(&newblk_in_progress, "newblk", PRIBIO, 0); } /* * Called at mount time to notify the dependency code that a * filesystem wishes to use it. 
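 * The hook is reached from ffs_mountfs() when the superblock has soft
 * updates enabled (set administratively with "tunefs -n enable"); the
 * call there is, in rough outline:
 *
 *      if ((fs->fs_flags & FS_DOSOFTDEP) != 0 &&
 *          (error = softdep_mount(devvp, mp, fs, cred)) != 0)
 *              return (error);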
*/ int softdep_mount(devvp, mp, fs, cred) struct vnode *devvp; struct mount *mp; struct fs *fs; struct ucred *cred; { struct csum cstotal; struct cg *cgp; struct buf *bp; int error, cyl; mp->mnt_flag &= ~MNT_ASYNC; mp->mnt_flag |= MNT_SOFTDEP; /* * When doing soft updates, the counters in the * superblock may have gotten out of sync, so we have * to scan the cylinder groups and recalculate them. */ if (fs->fs_clean != 0) return (0); bzero(&cstotal, sizeof cstotal); for (cyl = 0; cyl < fs->fs_ncg; cyl++) { if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)), fs->fs_cgsize, cred, &bp)) != 0) { brelse(bp); return (error); } cgp = (struct cg *)bp->b_data; cstotal.cs_nffree += cgp->cg_cs.cs_nffree; cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree; cstotal.cs_nifree += cgp->cg_cs.cs_nifree; cstotal.cs_ndir += cgp->cg_cs.cs_ndir; fs->fs_cs(fs, cyl) = cgp->cg_cs; brelse(bp); } #ifdef DEBUG if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal)) printf("ffs_mountfs: superblock updated for soft updates\n"); #endif bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal); return (0); } /* * Protecting the freemaps (or bitmaps). * * To eliminate the need to execute fsck before mounting a file system * after a power failure, one must (conservatively) guarantee that the * on-disk copy of the bitmaps never indicate that a live inode or block is * free. So, when a block or inode is allocated, the bitmap should be * updated (on disk) before any new pointers. When a block or inode is * freed, the bitmap should not be updated until all pointers have been * reset. The latter dependency is handled by the delayed de-allocation * approach described below for block and inode de-allocation. The former * dependency is handled by calling the following procedure when a block or * inode is allocated. When an inode is allocated an "inodedep" is created * with its DEPCOMPLETE flag cleared until its bitmap is written to disk. * Each "inodedep" is also inserted into the hash indexing structure so * that any additional link additions can be made dependent on the inode * allocation. * * The ufs file system maintains a number of free block counts (e.g., per * cylinder group, per cylinder and per pair) * in addition to the bitmaps. These counts are used to improve efficiency * during allocation and therefore must be consistent with the bitmaps. * There is no convenient way to guarantee post-crash consistency of these * counts with simple update ordering, for two main reasons: (1) The counts * and bitmaps for a single cylinder group block are not in the same disk * sector. If a disk write is interrupted (e.g., by power failure), one may * be written and the other not. (2) Some of the counts are located in the * superblock rather than the cylinder group block. So, we focus our soft * updates implementation on protecting the bitmaps. When mounting a * filesystem, we recompute the auxiliary counts from the bitmaps. */ /* * Called just after updating the cylinder group block to allocate an inode. */ void softdep_setup_inomapdep(bp, ip, newinum) struct buf *bp; /* buffer for cylgroup block with inode map */ struct inode *ip; /* inode related to allocation */ ino_t newinum; /* new inode number being allocated */ { struct inodedep *inodedep; struct bmsafemap *bmsafemap; /* * Create a dependency for the newly allocated inode. * Panic if it already exists as something is seriously wrong. * Otherwise add it to the dependency list for the buffer holding * the cylinder group map from which it was allocated. 
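 * (For reference, the caller is the cylinder group inode allocator in
 * ffs_alloc.c, which in rough outline does:
 *
 *      setbit(cg_inosused(cgp), ipref);        (mark the inode in use)
 *      if (DOINGSOFTDEP(ITOV(ip)))
 *              softdep_setup_inomapdep(bp, ip, cg * fs->fs_ipg + ipref);
 *      bdwrite(bp);                            (cg buffer, delayed write)
 *
 * so the dependency is recorded while the cylinder group buffer is
 * still locked and before it can reach the disk.)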
*/ ACQUIRE_LOCK(&lk); if (inodedep_lookup(ip->i_fs, newinum, DEPALLOC, &inodedep) != 0) panic("softdep_setup_inomapdep: found inode"); inodedep->id_buf = bp; inodedep->id_state &= ~DEPCOMPLETE; bmsafemap = bmsafemap_lookup(bp); LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps); FREE_LOCK(&lk); } /* * Called just after updating the cylinder group block to * allocate block or fragment. */ void softdep_setup_blkmapdep(bp, fs, newblkno) struct buf *bp; /* buffer for cylgroup block with block map */ struct fs *fs; /* filesystem doing allocation */ ufs_daddr_t newblkno; /* number of newly allocated block */ { struct newblk *newblk; struct bmsafemap *bmsafemap; /* * Create a dependency for the newly allocated block. * Add it to the dependency list for the buffer holding * the cylinder group map from which it was allocated. */ if (newblk_lookup(fs, newblkno, DEPALLOC, &newblk) != 0) panic("softdep_setup_blkmapdep: found block"); ACQUIRE_LOCK(&lk); newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(bp); LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps); FREE_LOCK(&lk); } /* * Find the bmsafemap associated with a cylinder group buffer. * If none exists, create one. The buffer must be locked when * this routine is called and this routine must be called with * splbio interrupts blocked. */ static struct bmsafemap * bmsafemap_lookup(bp) struct buf *bp; { struct bmsafemap *bmsafemap; struct worklist *wk; #ifdef DEBUG if (lk.lkt_held == -1) panic("bmsafemap_lookup: lock not held"); #endif for (wk = LIST_FIRST(&bp->b_dep); wk; wk = LIST_NEXT(wk, wk_list)) if (wk->wk_type == D_BMSAFEMAP) return (WK_BMSAFEMAP(wk)); FREE_LOCK(&lk); MALLOC(bmsafemap, struct bmsafemap *, sizeof(struct bmsafemap), M_BMSAFEMAP, M_WAITOK); bmsafemap->sm_list.wk_type = D_BMSAFEMAP; bmsafemap->sm_list.wk_state = 0; bmsafemap->sm_buf = bp; LIST_INIT(&bmsafemap->sm_allocdirecthd); LIST_INIT(&bmsafemap->sm_allocindirhd); LIST_INIT(&bmsafemap->sm_inodedephd); LIST_INIT(&bmsafemap->sm_newblkhd); ACQUIRE_LOCK(&lk); WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list); return (bmsafemap); } /* * Direct block allocation dependencies. * * When a new block is allocated, the corresponding disk locations must be * initialized (with zeros or new data) before the on-disk inode points to * them. Also, the freemap from which the block was allocated must be * updated (on disk) before the inode's pointer. These two dependencies are * independent of each other and are needed for all file blocks and indirect * blocks that are pointed to directly by the inode. Just before the * "in-core" version of the inode is updated with a newly allocated block * number, a procedure (below) is called to setup allocation dependency * structures. These structures are removed when the corresponding * dependencies are satisfied or when the block allocation becomes obsolete * (i.e., the file is deleted, the block is de-allocated, or the block is a * fragment that gets upgraded). All of these cases are handled in * procedures described later. * * When a file extension causes a fragment to be upgraded, either to a larger * fragment or to a full block, the on-disk location may change (if the * previous fragment could not simply be extended). In this case, the old * fragment must be de-allocated, but not until after the inode's pointer has * been updated. In most cases, this is handled by later procedures, which * will construct a "freefrag" structure to be added to the workitem queue * when the inode update is complete (or obsolete). 
The main exception to * this is when an allocation occurs while a pending allocation dependency * (for the same block pointer) remains. This case is handled in the main * allocation dependency setup procedure by immediately freeing the * unreferenced fragments. */ void softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) struct inode *ip; /* inode to which block is being added */ ufs_lbn_t lbn; /* block pointer within inode */ ufs_daddr_t newblkno; /* disk block number being added */ ufs_daddr_t oldblkno; /* previous block number, 0 unless frag */ long newsize; /* size of new block */ long oldsize; /* size of new block */ struct buf *bp; /* bp for allocated block */ { struct allocdirect *adp, *oldadp; struct allocdirectlst *adphead; struct bmsafemap *bmsafemap; struct inodedep *inodedep; struct pagedep *pagedep; struct newblk *newblk; MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect), M_ALLOCDIRECT, M_WAITOK); bzero(adp, sizeof(struct allocdirect)); adp->ad_list.wk_type = D_ALLOCDIRECT; adp->ad_lbn = lbn; adp->ad_newblkno = newblkno; adp->ad_oldblkno = oldblkno; adp->ad_newsize = newsize; adp->ad_oldsize = oldsize; adp->ad_state = ATTACHED; if (newblkno == oldblkno) adp->ad_freefrag = NULL; else adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize); if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0) panic("softdep_setup_allocdirect: lost block"); ACQUIRE_LOCK(&lk); (void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC, &inodedep); adp->ad_inodedep = inodedep; if (newblk->nb_state == DEPCOMPLETE) { adp->ad_state |= DEPCOMPLETE; adp->ad_buf = NULL; } else { bmsafemap = newblk->nb_bmsafemap; adp->ad_buf = bmsafemap->sm_buf; LIST_REMOVE(newblk, nb_deps); LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps); } LIST_REMOVE(newblk, nb_hash); FREE(newblk, M_NEWBLK); WORKLIST_INSERT(&bp->b_dep, &adp->ad_list); if (lbn >= NDADDR) { /* allocating an indirect block */ if (oldblkno != 0) panic("softdep_setup_allocdirect: non-zero indir"); } else { /* * Allocating a direct block. * * If we are allocating a directory block, then we must * allocate an associated pagedep to track additions and * deletions. */ if ((ip->i_mode & IFMT) == IFDIR && pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0) WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list); } /* * The list of allocdirects must be kept in sorted and ascending * order so that the rollback routines can quickly determine the * first uncommitted block (the size of the file stored on disk * ends at the end of the lowest committed fragment, or if there * are no fragments, at the end of the highest committed block). * Since files generally grow, the typical case is that the new * block is to be added at the end of the list. We speed this * special case by checking against the last allocdirect in the * list before laboriously traversing the list looking for the * insertion point. 
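 * For example, if everything through lbn 6 is committed but the list
 * still holds uncommitted allocdirects for lbns 7 and 9, the rollback
 * routines report the on-disk file as ending with lbn 6's block; the
 * sorted order is what lets them find that first uncommitted entry
 * cheaply, and the tail check below handles the common append case
 * without a scan.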
*/ adphead = &inodedep->id_newinoupdt; oldadp = TAILQ_LAST(adphead, allocdirectlst); if (oldadp == NULL || oldadp->ad_lbn <= lbn) { /* insert at end of list */ TAILQ_INSERT_TAIL(adphead, adp, ad_next); if (oldadp != NULL && oldadp->ad_lbn == lbn) allocdirect_merge(adphead, adp, oldadp); FREE_LOCK(&lk); return; } for (oldadp = TAILQ_FIRST(adphead); oldadp; oldadp = TAILQ_NEXT(oldadp, ad_next)) { if (oldadp->ad_lbn >= lbn) break; } if (oldadp == NULL) panic("softdep_setup_allocdirect: lost entry"); /* insert in middle of list */ TAILQ_INSERT_BEFORE(oldadp, adp, ad_next); if (oldadp->ad_lbn == lbn) allocdirect_merge(adphead, adp, oldadp); FREE_LOCK(&lk); } /* * Replace an old allocdirect dependency with a newer one. * This routine must be called with splbio interrupts blocked. */ static void allocdirect_merge(adphead, newadp, oldadp) struct allocdirectlst *adphead; /* head of list holding allocdirects */ struct allocdirect *newadp; /* allocdirect being added */ struct allocdirect *oldadp; /* existing allocdirect being checked */ { struct freefrag *freefrag; #ifdef DEBUG if (lk.lkt_held == -1) panic("allocdirect_merge: lock not held"); #endif if (newadp->ad_oldblkno != oldadp->ad_newblkno || newadp->ad_oldsize != oldadp->ad_newsize || newadp->ad_lbn >= NDADDR) panic("allocdirect_check: old %d != new %d || lbn %ld >= %d", newadp->ad_oldblkno, oldadp->ad_newblkno, newadp->ad_lbn, NDADDR); newadp->ad_oldblkno = oldadp->ad_oldblkno; newadp->ad_oldsize = oldadp->ad_oldsize; /* * If the old dependency had a fragment to free or had never * previously had a block allocated, then the new dependency * can immediately post its freefrag and adopt the old freefrag. * This action is done by swapping the freefrag dependencies. * The new dependency gains the old one's freefrag, and the * old one gets the new one and then immediately puts it on * the worklist when it is freed by free_allocdirect. It is * not possible to do this swap when the old dependency had a * non-zero size but no previous fragment to free. This condition * arises when the new block is an extension of the old block. * Here, the first part of the fragment allocated to the new * dependency is part of the block currently claimed on disk by * the old dependency, so cannot legitimately be freed until the * conditions for the new dependency are fulfilled. */ if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) { freefrag = newadp->ad_freefrag; newadp->ad_freefrag = oldadp->ad_freefrag; oldadp->ad_freefrag = freefrag; } free_allocdirect(adphead, oldadp, 0); } /* * Allocate a new freefrag structure if needed. */ static struct freefrag * newfreefrag(ip, blkno, size) struct inode *ip; ufs_daddr_t blkno; long size; { struct freefrag *freefrag; struct fs *fs; if (blkno == 0) return (NULL); fs = ip->i_fs; if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag) panic("newfreefrag: frag size"); MALLOC(freefrag, struct freefrag *, sizeof(struct freefrag), M_FREEFRAG, M_WAITOK); freefrag->ff_list.wk_type = D_FREEFRAG; freefrag->ff_state = ip->i_uid & ~ONWORKLIST; /* XXX - used below */ freefrag->ff_inum = ip->i_number; freefrag->ff_fs = fs; freefrag->ff_devvp = ip->i_devvp; freefrag->ff_blkno = blkno; freefrag->ff_fragsize = size; return (freefrag); } /* * This workitem de-allocates fragments that were replaced during * file block allocation. 
*/ static void handle_workitem_freefrag(freefrag) struct freefrag *freefrag; { struct inode tip; tip.i_fs = freefrag->ff_fs; tip.i_devvp = freefrag->ff_devvp; tip.i_dev = freefrag->ff_devvp->v_rdev; tip.i_number = freefrag->ff_inum; tip.i_uid = freefrag->ff_state & ~ONWORKLIST; /* XXX - set above */ ffs_blkfree(&tip, freefrag->ff_blkno, freefrag->ff_fragsize); FREE(freefrag, M_FREEFRAG); } /* * Indirect block allocation dependencies. * * The same dependencies that exist for a direct block also exist when * a new block is allocated and pointed to by an entry in a block of * indirect pointers. The undo/redo states described above are also * used here. Because an indirect block contains many pointers that * may have dependencies, a second copy of the entire in-memory indirect * block is kept. The buffer cache copy is always completely up-to-date. * The second copy, which is used only as a source for disk writes, * contains only the safe pointers (i.e., those that have no remaining * update dependencies). The second copy is freed when all pointers * are safe. The cache is not allowed to replace indirect blocks with * pending update dependencies. If a buffer containing an indirect * block with dependencies is written, these routines will mark it * dirty again. It can only be successfully written once all the * dependencies are removed. The ffs_fsync routine in conjunction with * softdep_sync_metadata work together to get all the dependencies * removed so that a file can be successfully written to disk. Three * procedures are used when setting up indirect block pointer * dependencies. The division is necessary because of the organization * of the "balloc" routine and because of the distinction between file * pages and file metadata blocks. */ /* * Allocate a new allocindir structure. */ static struct allocindir * newallocindir(ip, ptrno, newblkno, oldblkno) struct inode *ip; /* inode for file being extended */ int ptrno; /* offset of pointer in indirect block */ ufs_daddr_t newblkno; /* disk block number being added */ ufs_daddr_t oldblkno; /* previous block number, 0 if none */ { struct allocindir *aip; MALLOC(aip, struct allocindir *, sizeof(struct allocindir), M_ALLOCINDIR, M_WAITOK); bzero(aip, sizeof(struct allocindir)); aip->ai_list.wk_type = D_ALLOCINDIR; aip->ai_state = ATTACHED; aip->ai_offset = ptrno; aip->ai_newblkno = newblkno; aip->ai_oldblkno = oldblkno; aip->ai_freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize); return (aip); } /* * Called just before setting an indirect block pointer * to a newly allocated file page. */ void softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp) struct inode *ip; /* inode for file being extended */ ufs_lbn_t lbn; /* allocated block number within file */ struct buf *bp; /* buffer with indirect blk referencing page */ int ptrno; /* offset of pointer in indirect block */ ufs_daddr_t newblkno; /* disk block number being added */ ufs_daddr_t oldblkno; /* previous block number, 0 if none */ struct buf *nbp; /* buffer holding allocated page */ { struct allocindir *aip; struct pagedep *pagedep; aip = newallocindir(ip, ptrno, newblkno, oldblkno); ACQUIRE_LOCK(&lk); /* * If we are allocating a directory page, then we must * allocate an associated pagedep to track additions and * deletions. 
*/ if ((ip->i_mode & IFMT) == IFDIR && pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0) WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list); WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list); FREE_LOCK(&lk); setup_allocindir_phase2(bp, ip, aip); } /* * Called just before setting an indirect block pointer to a * newly allocated indirect block. */ void softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno) struct buf *nbp; /* newly allocated indirect block */ struct inode *ip; /* inode for file being extended */ struct buf *bp; /* indirect block referencing allocated block */ int ptrno; /* offset of pointer in indirect block */ ufs_daddr_t newblkno; /* disk block number being added */ { struct allocindir *aip; aip = newallocindir(ip, ptrno, newblkno, 0); ACQUIRE_LOCK(&lk); WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list); FREE_LOCK(&lk); setup_allocindir_phase2(bp, ip, aip); } /* * Called to finish the allocation of the "aip" allocated * by one of the two routines above. */ static void setup_allocindir_phase2(bp, ip, aip) struct buf *bp; /* in-memory copy of the indirect block */ struct inode *ip; /* inode for file being extended */ struct allocindir *aip; /* allocindir allocated by the above routines */ { struct worklist *wk; struct indirdep *indirdep, *newindirdep; struct bmsafemap *bmsafemap; struct allocindir *oldaip; struct freefrag *freefrag; struct newblk *newblk; if (bp->b_lblkno >= 0) panic("setup_allocindir_phase2: not indir blk"); for (indirdep = NULL, newindirdep = NULL; ; ) { ACQUIRE_LOCK(&lk); for (wk = LIST_FIRST(&bp->b_dep); wk; wk = LIST_NEXT(wk, wk_list)) { if (wk->wk_type != D_INDIRDEP) continue; indirdep = WK_INDIRDEP(wk); break; } if (indirdep == NULL && newindirdep) { indirdep = newindirdep; WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list); newindirdep = NULL; } FREE_LOCK(&lk); if (indirdep) { if (newblk_lookup(ip->i_fs, aip->ai_newblkno, 0, &newblk) == 0) panic("setup_allocindir: lost block"); ACQUIRE_LOCK(&lk); if (newblk->nb_state == DEPCOMPLETE) { aip->ai_state |= DEPCOMPLETE; aip->ai_buf = NULL; } else { bmsafemap = newblk->nb_bmsafemap; aip->ai_buf = bmsafemap->sm_buf; LIST_REMOVE(newblk, nb_deps); LIST_INSERT_HEAD(&bmsafemap->sm_allocindirhd, aip, ai_deps); } LIST_REMOVE(newblk, nb_hash); FREE(newblk, M_NEWBLK); aip->ai_indirdep = indirdep; /* * Check to see if there is an existing dependency * for this block. If there is, merge the old * dependency into the new one. 
*/ if (aip->ai_oldblkno == 0) oldaip = NULL; else for (oldaip=LIST_FIRST(&indirdep->ir_deplisthd); oldaip; oldaip = LIST_NEXT(oldaip, ai_next)) if (oldaip->ai_offset == aip->ai_offset) break; if (oldaip != NULL) { if (oldaip->ai_newblkno != aip->ai_oldblkno) panic("setup_allocindir_phase2: blkno"); aip->ai_oldblkno = oldaip->ai_oldblkno; freefrag = oldaip->ai_freefrag; oldaip->ai_freefrag = aip->ai_freefrag; aip->ai_freefrag = freefrag; free_allocindir(oldaip, NULL); } LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next); ((ufs_daddr_t *)indirdep->ir_savebp->b_data) [aip->ai_offset] = aip->ai_oldblkno; FREE_LOCK(&lk); } if (newindirdep) { if (indirdep->ir_savebp != NULL) brelse(newindirdep->ir_savebp); WORKITEM_FREE((caddr_t)newindirdep, D_INDIRDEP); } if (indirdep) break; MALLOC(newindirdep, struct indirdep *, sizeof(struct indirdep), M_INDIRDEP, M_WAITOK); newindirdep->ir_list.wk_type = D_INDIRDEP; newindirdep->ir_state = ATTACHED; LIST_INIT(&newindirdep->ir_deplisthd); LIST_INIT(&newindirdep->ir_donehd); if (bp->b_blkno == bp->b_lblkno) { VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); } newindirdep->ir_savebp = getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0); BUF_KERNPROC(newindirdep->ir_savebp); bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount); } } /* * Block de-allocation dependencies. * * When blocks are de-allocated, the on-disk pointers must be nullified before * the blocks are made available for use by other files. (The true * requirement is that old pointers must be nullified before new on-disk * pointers are set. We chose this slightly more stringent requirement to * reduce complexity.) Our implementation handles this dependency by updating * the inode (or indirect block) appropriately but delaying the actual block * de-allocation (i.e., freemap and free space count manipulation) until * after the updated versions reach stable storage. After the disk is * updated, the blocks can be safely de-allocated whenever it is convenient. * This implementation handles only the common case of reducing a file's * length to zero. Other cases are handled by the conventional synchronous * write approach. * * The ffs implementation with which we worked double-checks * the state of the block pointers and file size as it reduces * a file's length. Some of this code is replicated here in our * soft updates implementation. The freeblks->fb_chkcnt field is * used to transfer a part of this information to the procedure * that eventually de-allocates the blocks. * * This routine should be called from the routine that shortens * a file's length, before the inode's size or block pointers * are modified. It will save the block pointer information for * later release and zero the inode so that the calling routine * can release it. */ static long num_freeblks; /* number of freeblks allocated */ void softdep_setup_freeblocks(ip, length) struct inode *ip; /* The inode whose length is to be reduced */ off_t length; /* The new length for the file */ { struct freeblks *freeblks; struct inodedep *inodedep; struct allocdirect *adp; struct vnode *vp; struct buf *bp; struct fs *fs; int i, error; fs = ip->i_fs; if (length != 0) panic("softdep_setup_freeblocks: non-zero length"); /* * If we are over our limit, try to improve the situation. 
*/ if (num_freeblks > max_softdeps / 2 && speedup_syncer() == 0) (void) request_cleanup(FLUSH_REMOVE, 0); num_freeblks += 1; MALLOC(freeblks, struct freeblks *, sizeof(struct freeblks), M_FREEBLKS, M_WAITOK); bzero(freeblks, sizeof(struct freeblks)); freeblks->fb_list.wk_type = D_FREEBLKS; freeblks->fb_uid = ip->i_uid; freeblks->fb_previousinum = ip->i_number; freeblks->fb_devvp = ip->i_devvp; freeblks->fb_fs = fs; freeblks->fb_oldsize = ip->i_size; freeblks->fb_newsize = length; freeblks->fb_chkcnt = ip->i_blocks; for (i = 0; i < NDADDR; i++) { freeblks->fb_dblks[i] = ip->i_db[i]; ip->i_db[i] = 0; } for (i = 0; i < NIADDR; i++) { freeblks->fb_iblks[i] = ip->i_ib[i]; ip->i_ib[i] = 0; } ip->i_blocks = 0; ip->i_size = 0; /* * Push the zero'ed inode to its disk buffer so that we are free * to delete its dependencies below. Once the dependencies are gone * the buffer can be safely released. */ if ((error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), (int)fs->fs_bsize, NOCRED, &bp)) != 0) softdep_error("softdep_setup_freeblocks", error); *((struct dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number)) = ip->i_din; /* * Find and eliminate any inode dependencies. */ ACQUIRE_LOCK(&lk); (void) inodedep_lookup(fs, ip->i_number, DEPALLOC, &inodedep); if ((inodedep->id_state & IOSTARTED) != 0) panic("softdep_setup_freeblocks: inode busy"); /* * Add the freeblks structure to the list of operations that * must await the zero'ed inode being written to disk. */ WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list); /* * Because the file length has been truncated to zero, any * pending block allocation dependency structures associated * with this inode are obsolete and can simply be de-allocated. * We must first merge the two dependency lists to get rid of * any duplicate freefrag structures, then purge the merged list. */ merge_inode_lists(inodedep); while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0) free_allocdirect(&inodedep->id_inoupdt, adp, 1); FREE_LOCK(&lk); bdwrite(bp); /* * We must wait for any I/O in progress to finish so that * all potential buffers on the dirty list will be visible. * Once they are all there, walk the list and get rid of * any dependencies. */ vp = ITOV(ip); ACQUIRE_LOCK(&lk); drain_output(vp, 1); while (getdirtybuf(&TAILQ_FIRST(&vp->v_dirtyblkhd), MNT_WAIT)) { bp = TAILQ_FIRST(&vp->v_dirtyblkhd); (void) inodedep_lookup(fs, ip->i_number, 0, &inodedep); deallocate_dependencies(bp, inodedep); bp->b_flags |= B_INVAL | B_NOCACHE; FREE_LOCK(&lk); brelse(bp); ACQUIRE_LOCK(&lk); } /* * Try freeing the inodedep in case that was the last dependency. */ if ((inodedep_lookup(fs, ip->i_number, 0, &inodedep)) != 0) (void) free_inodedep(inodedep); FREE_LOCK(&lk); } /* * Reclaim any dependency structures from a buffer that is about to * be reallocated to a new vnode. The buffer must be locked, thus, * no I/O completion operations can occur while we are manipulating * its associated dependencies. The mutex is held so that other I/O's * associated with related dependencies do not occur. */ static void deallocate_dependencies(bp, inodedep) struct buf *bp; struct inodedep *inodedep; { struct worklist *wk; struct indirdep *indirdep; struct allocindir *aip; struct pagedep *pagedep; struct dirrem *dirrem; struct diradd *dap; int i; while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) { switch (wk->wk_type) { case D_INDIRDEP: indirdep = WK_INDIRDEP(wk); /* * None of the indirect pointers will ever be visible, * so they can simply be tossed. 
GOINGAWAY ensures * that allocated pointers will be saved in the buffer * cache until they are freed. Note that they will * only be able to be found by their physical address * since the inode mapping the logical address will * be gone. The save buffer used for the safe copy * was allocated in setup_allocindir_phase2 using * the physical address so it could be used for this * purpose. Hence we swap the safe copy with the real * copy, allowing the safe copy to be freed and holding * on to the real copy for later use in indir_trunc. */ if (indirdep->ir_state & GOINGAWAY) panic("deallocate_dependencies: already gone"); indirdep->ir_state |= GOINGAWAY; while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0) free_allocindir(aip, inodedep); if (bp->b_lblkno >= 0 || bp->b_blkno != indirdep->ir_savebp->b_lblkno) panic("deallocate_dependencies: not indir"); bcopy(bp->b_data, indirdep->ir_savebp->b_data, bp->b_bcount); WORKLIST_REMOVE(wk); WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, wk); continue; case D_PAGEDEP: pagedep = WK_PAGEDEP(wk); /* * None of the directory additions will ever be * visible, so they can simply be tossed. */ for (i = 0; i < DAHASHSZ; i++) while ((dap = LIST_FIRST(&pagedep->pd_diraddhd[i]))) free_diradd(dap); while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != 0) free_diradd(dap); /* * Copy any directory remove dependencies to the list * to be processed after the zero'ed inode is written. * If the inode has already been written, then they * can be dumped directly onto the work list. */ for (dirrem = LIST_FIRST(&pagedep->pd_dirremhd); dirrem; dirrem = LIST_NEXT(dirrem, dm_next)) { LIST_REMOVE(dirrem, dm_next); dirrem->dm_dirinum = pagedep->pd_ino; if (inodedep == NULL) add_to_worklist(&dirrem->dm_list); else WORKLIST_INSERT(&inodedep->id_bufwait, &dirrem->dm_list); } WORKLIST_REMOVE(&pagedep->pd_list); LIST_REMOVE(pagedep, pd_hash); WORKITEM_FREE(pagedep, D_PAGEDEP); continue; case D_ALLOCINDIR: free_allocindir(WK_ALLOCINDIR(wk), inodedep); continue; case D_ALLOCDIRECT: case D_INODEDEP: panic("deallocate_dependencies: Unexpected type %s", TYPENAME(wk->wk_type)); /* NOTREACHED */ default: panic("deallocate_dependencies: Unknown type %s", TYPENAME(wk->wk_type)); /* NOTREACHED */ } } } /* * Free an allocdirect. Generate a new freefrag work request if appropriate. * This routine must be called with splbio interrupts blocked. */ static void free_allocdirect(adphead, adp, delay) struct allocdirectlst *adphead; struct allocdirect *adp; int delay; { #ifdef DEBUG if (lk.lkt_held == -1) panic("free_allocdirect: lock not held"); #endif if ((adp->ad_state & DEPCOMPLETE) == 0) LIST_REMOVE(adp, ad_deps); TAILQ_REMOVE(adphead, adp, ad_next); if ((adp->ad_state & COMPLETE) == 0) WORKLIST_REMOVE(&adp->ad_list); if (adp->ad_freefrag != NULL) { if (delay) WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait, &adp->ad_freefrag->ff_list); else add_to_worklist(&adp->ad_freefrag->ff_list); } WORKITEM_FREE(adp, D_ALLOCDIRECT); } /* * Prepare an inode to be freed. The actual free operation is not * done until the zero'ed inode has been written to disk. */ static long num_freefile; /* number of freefile allocated */ void softdep_freefile(pvp, ino, mode) struct vnode *pvp; ino_t ino; int mode; { struct inode *ip = VTOI(pvp); struct inodedep *inodedep; struct freefile *freefile; /* * If we are over our limit, try to improve the situation. */ if (num_freefile > max_softdeps / 2 && speedup_syncer() == 0) (void) request_cleanup(FLUSH_REMOVE, 0); /* * This sets up the inode de-allocation dependency. 
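 *
 * (Editor's note, illustrative only: for an ordinary "remove last link"
 * the expected ordering is roughly
 *	softdep_setup_freeblocks()	- zero in-core pointers, queue freeblks
 *	softdep_freefile()		- queue this freefile
 *	... zero'ed inode block written ...
 *	handle_workitem_freeblocks()	- give the data blocks back
 *	handle_workitem_freefile()	- finally ffs_freefile() the inode
 * so the inode is returned to the map only after its blocks are freed.)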
*/ num_freefile += 1; MALLOC(freefile, struct freefile *, sizeof(struct freefile), M_FREEFILE, M_WAITOK); freefile->fx_list.wk_type = D_FREEFILE; freefile->fx_list.wk_state = 0; freefile->fx_mode = mode; freefile->fx_oldinum = ino; freefile->fx_devvp = ip->i_devvp; freefile->fx_fs = ip->i_fs; /* * If the inodedep does not exist, then the zero'ed inode has * been written to disk and we can free the file immediately. */ ACQUIRE_LOCK(&lk); if (inodedep_lookup(ip->i_fs, ino, 0, &inodedep) == 0) { add_to_worklist(&freefile->fx_list); FREE_LOCK(&lk); return; } /* * If we still have a bitmap dependency, then the inode has never * been written to disk. Drop the dependency as it is no longer * necessary since the inode is being deallocated. We could process * the freefile immediately, but then we would have to clear the * id_inowait dependencies here and it is easier just to let the * zero'ed inode be written and let them be cleaned up in the * normal followup actions that follow the inode write. */ if ((inodedep->id_state & DEPCOMPLETE) == 0) { inodedep->id_state |= DEPCOMPLETE; LIST_REMOVE(inodedep, id_deps); inodedep->id_buf = NULL; } /* * If the inodedep has no dependencies associated with it, * then we must free it here and free the file immediately. * This case arises when an early allocation fails (for * example, the user is over their file quota). */ if (free_inodedep(inodedep) == 0) WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list); else add_to_worklist(&freefile->fx_list); FREE_LOCK(&lk); } /* * Try to free an inodedep structure. Return 1 if it could be freed. */ static int free_inodedep(inodedep) struct inodedep *inodedep; { if ((inodedep->id_state & ONWORKLIST) != 0 || (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE || LIST_FIRST(&inodedep->id_pendinghd) != NULL || LIST_FIRST(&inodedep->id_bufwait) != NULL || LIST_FIRST(&inodedep->id_inowait) != NULL || TAILQ_FIRST(&inodedep->id_inoupdt) != NULL || TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL || inodedep->id_nlinkdelta != 0 || inodedep->id_savedino != NULL) return (0); LIST_REMOVE(inodedep, id_hash); WORKITEM_FREE(inodedep, D_INODEDEP); num_inodedep -= 1; return (1); } /* * This workitem routine performs the block de-allocation. * The workitem is added to the pending list after the updated * inode block has been written to disk. As mentioned above, * checks regarding the number of blocks de-allocated (compared * to the number of blocks allocated for the file) are also * performed in this function. */ static void handle_workitem_freeblocks(freeblks) struct freeblks *freeblks; { struct inode tip; ufs_daddr_t bn; struct fs *fs; int i, level, bsize; long nblocks, blocksreleased = 0; int error, allerror = 0; ufs_lbn_t baselbns[NIADDR], tmpval; tip.i_number = freeblks->fb_previousinum; tip.i_devvp = freeblks->fb_devvp; tip.i_dev = freeblks->fb_devvp->v_rdev; tip.i_fs = freeblks->fb_fs; tip.i_size = freeblks->fb_oldsize; tip.i_uid = freeblks->fb_uid; fs = freeblks->fb_fs; tmpval = 1; baselbns[0] = NDADDR; for (i = 1; i < NIADDR; i++) { tmpval *= NINDIR(fs); baselbns[i] = baselbns[i - 1] + tmpval; } nblocks = btodb(fs->fs_bsize); blocksreleased = 0; /* * Indirect blocks first. */ for (level = (NIADDR - 1); level >= 0; level--) { if ((bn = freeblks->fb_iblks[level]) == 0) continue; if ((error = indir_trunc(&tip, fsbtodb(fs, bn), level, baselbns[level], &blocksreleased)) == 0) allerror = error; ffs_blkfree(&tip, bn, fs->fs_bsize); blocksreleased += nblocks; } /* * All direct blocks or frags. 
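 *
 * (Editor's note: blksize() rather than fs_bsize is used below because,
 * on an 8K/1K filesystem for example, the block underlying the old end
 * of the file may be anything from one fragment up to a full block.)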
*/ for (i = (NDADDR - 1); i >= 0; i--) { if ((bn = freeblks->fb_dblks[i]) == 0) continue; bsize = blksize(fs, &tip, i); ffs_blkfree(&tip, bn, bsize); blocksreleased += btodb(bsize); } #ifdef DIAGNOSTIC if (freeblks->fb_chkcnt != blocksreleased) panic("handle_workitem_freeblocks: block count"); if (allerror) softdep_error("handle_workitem_freeblks", allerror); #endif /* DIAGNOSTIC */ WORKITEM_FREE(freeblks, D_FREEBLKS); num_freeblks -= 1; } /* * Release blocks associated with the inode ip and stored in the indirect * block dbn. If level is greater than SINGLE, the block is an indirect block * and recursive calls to indirtrunc must be used to cleanse other indirect * blocks. */ static int indir_trunc(ip, dbn, level, lbn, countp) struct inode *ip; ufs_daddr_t dbn; int level; ufs_lbn_t lbn; long *countp; { struct buf *bp; ufs_daddr_t *bap; ufs_daddr_t nb; struct fs *fs; struct worklist *wk; struct indirdep *indirdep; int i, lbnadd, nblocks; int error, allerror = 0; fs = ip->i_fs; lbnadd = 1; for (i = level; i > 0; i--) lbnadd *= NINDIR(fs); /* * Get buffer of block pointers to be freed. This routine is not * called until the zero'ed inode has been written, so it is safe * to free blocks as they are encountered. Because the inode has * been zero'ed, calls to bmap on these blocks will fail. So, we * have to use the on-disk address and the block device for the * filesystem to look them up. If the file was deleted before its * indirect blocks were all written to disk, the routine that set * us up (deallocate_dependencies) will have arranged to leave * a complete copy of the indirect block in memory for our use. * Otherwise we have to read the blocks in from the disk. */ ACQUIRE_LOCK(&lk); if ((bp = incore(ip->i_devvp, dbn)) != NULL && (wk = LIST_FIRST(&bp->b_dep)) != NULL) { if (wk->wk_type != D_INDIRDEP || (indirdep = WK_INDIRDEP(wk))->ir_savebp != bp || (indirdep->ir_state & GOINGAWAY) == 0) panic("indir_trunc: lost indirdep"); WORKLIST_REMOVE(wk); WORKITEM_FREE(indirdep, D_INDIRDEP); if (LIST_FIRST(&bp->b_dep) != NULL) panic("indir_trunc: dangling dep"); FREE_LOCK(&lk); } else { FREE_LOCK(&lk); error = bread(ip->i_devvp, dbn, (int)fs->fs_bsize, NOCRED, &bp); if (error) return (error); } /* * Recursively free indirect blocks. */ bap = (ufs_daddr_t *)bp->b_data; nblocks = btodb(fs->fs_bsize); for (i = NINDIR(fs) - 1; i >= 0; i--) { if ((nb = bap[i]) == 0) continue; if (level != 0) { if ((error = indir_trunc(ip, fsbtodb(fs, nb), level - 1, lbn + (i * lbnadd), countp)) != 0) allerror = error; } ffs_blkfree(ip, nb, fs->fs_bsize); *countp += nblocks; } bp->b_flags |= B_INVAL | B_NOCACHE; brelse(bp); return (allerror); } /* * Free an allocindir. * This routine must be called with splbio interrupts blocked. */ static void free_allocindir(aip, inodedep) struct allocindir *aip; struct inodedep *inodedep; { struct freefrag *freefrag; #ifdef DEBUG if (lk.lkt_held == -1) panic("free_allocindir: lock not held"); #endif if ((aip->ai_state & DEPCOMPLETE) == 0) LIST_REMOVE(aip, ai_deps); if (aip->ai_state & ONWORKLIST) WORKLIST_REMOVE(&aip->ai_list); LIST_REMOVE(aip, ai_next); if ((freefrag = aip->ai_freefrag) != NULL) { if (inodedep == NULL) add_to_worklist(&freefrag->ff_list); else WORKLIST_INSERT(&inodedep->id_bufwait, &freefrag->ff_list); } WORKITEM_FREE(aip, D_ALLOCINDIR); } /* * Directory entry addition dependencies. * * When adding a new directory entry, the inode (with its incremented link * count) must be written to disk before the directory entry's pointer to it. 
* Also, if the inode is newly allocated, the corresponding freemap must be * updated (on disk) before the directory entry's pointer. These requirements * are met via undo/redo on the directory entry's pointer, which consists * simply of the inode number. * * As directory entries are added and deleted, the free space within a * directory block can become fragmented. The ufs file system will compact * a fragmented directory block to make space for a new entry. When this * occurs, the offsets of previously added entries change. Any "diradd" * dependency structures corresponding to these entries must be updated with * the new offsets. */ /* * This routine is called after the in-memory inode's link * count has been incremented, but before the directory entry's * pointer to the inode has been set. */ void softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp) struct buf *bp; /* buffer containing directory block */ struct inode *dp; /* inode for directory */ off_t diroffset; /* offset of new entry in directory */ long newinum; /* inode referenced by new directory entry */ struct buf *newdirbp; /* non-NULL => contents of new mkdir */ { int offset; /* offset of new entry within directory block */ ufs_lbn_t lbn; /* block in directory containing new entry */ struct fs *fs; struct diradd *dap; struct pagedep *pagedep; struct inodedep *inodedep; struct mkdir *mkdir1, *mkdir2; /* * Whiteouts have no dependencies. */ if (newinum == WINO) { if (newdirbp != NULL) bdwrite(newdirbp); return; } fs = dp->i_fs; lbn = lblkno(fs, diroffset); offset = blkoff(fs, diroffset); MALLOC(dap, struct diradd *, sizeof(struct diradd), M_DIRADD, M_WAITOK); bzero(dap, sizeof(struct diradd)); dap->da_list.wk_type = D_DIRADD; dap->da_offset = offset; dap->da_newinum = newinum; dap->da_state = ATTACHED; if (newdirbp == NULL) { dap->da_state |= DEPCOMPLETE; ACQUIRE_LOCK(&lk); } else { dap->da_state |= MKDIR_BODY | MKDIR_PARENT; MALLOC(mkdir1, struct mkdir *, sizeof(struct mkdir), M_MKDIR, M_WAITOK); mkdir1->md_list.wk_type = D_MKDIR; mkdir1->md_state = MKDIR_BODY; mkdir1->md_diradd = dap; MALLOC(mkdir2, struct mkdir *, sizeof(struct mkdir), M_MKDIR, M_WAITOK); mkdir2->md_list.wk_type = D_MKDIR; mkdir2->md_state = MKDIR_PARENT; mkdir2->md_diradd = dap; /* * Dependency on "." and ".." being written to disk. */ mkdir1->md_buf = newdirbp; ACQUIRE_LOCK(&lk); LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs); WORKLIST_INSERT(&newdirbp->b_dep, &mkdir1->md_list); FREE_LOCK(&lk); bdwrite(newdirbp); /* * Dependency on link count increase for parent directory */ ACQUIRE_LOCK(&lk); if (inodedep_lookup(dp->i_fs, dp->i_number, 0, &inodedep) == 0 || (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) { dap->da_state &= ~MKDIR_PARENT; WORKITEM_FREE(mkdir2, D_MKDIR); } else { LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs); WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list); } } /* * Link into parent directory pagedep to await its being written. */ if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0) WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list); dap->da_pagedep = pagedep; LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap, da_pdlist); /* * Link into its inodedep. Put it on the id_bufwait list if the inode * is not yet written. If it is written, do the post-inode write * processing to put it on the id_pendinghd list. 
*/ (void) inodedep_lookup(fs, newinum, DEPALLOC, &inodedep); if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) diradd_inode_written(dap, inodedep); else WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list); FREE_LOCK(&lk); } /* * This procedure is called to change the offset of a directory * entry when compacting a directory block which must be owned * exclusively by the caller. Note that the actual entry movement * must be done in this procedure to ensure that no I/O completions * occur while the move is in progress. */ void softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize) struct inode *dp; /* inode for directory */ caddr_t base; /* address of dp->i_offset */ caddr_t oldloc; /* address of old directory location */ caddr_t newloc; /* address of new directory location */ int entrysize; /* size of directory entry */ { int offset, oldoffset, newoffset; struct pagedep *pagedep; struct diradd *dap; ufs_lbn_t lbn; ACQUIRE_LOCK(&lk); lbn = lblkno(dp->i_fs, dp->i_offset); offset = blkoff(dp->i_fs, dp->i_offset); if (pagedep_lookup(dp, lbn, 0, &pagedep) == 0) goto done; oldoffset = offset + (oldloc - base); newoffset = offset + (newloc - base); for (dap = LIST_FIRST(&pagedep->pd_diraddhd[DIRADDHASH(oldoffset)]); dap; dap = LIST_NEXT(dap, da_pdlist)) { if (dap->da_offset != oldoffset) continue; dap->da_offset = newoffset; if (DIRADDHASH(newoffset) == DIRADDHASH(oldoffset)) break; LIST_REMOVE(dap, da_pdlist); LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(newoffset)], dap, da_pdlist); break; } if (dap == NULL) { for (dap = LIST_FIRST(&pagedep->pd_pendinghd); dap; dap = LIST_NEXT(dap, da_pdlist)) { if (dap->da_offset == oldoffset) { dap->da_offset = newoffset; break; } } } done: bcopy(oldloc, newloc, entrysize); FREE_LOCK(&lk); } /* * Free a diradd dependency structure. This routine must be called * with splbio interrupts blocked. */ static void free_diradd(dap) struct diradd *dap; { struct dirrem *dirrem; struct pagedep *pagedep; struct inodedep *inodedep; struct mkdir *mkdir, *nextmd; #ifdef DEBUG if (lk.lkt_held == -1) panic("free_diradd: lock not held"); #endif WORKLIST_REMOVE(&dap->da_list); LIST_REMOVE(dap, da_pdlist); if ((dap->da_state & DIRCHG) == 0) { pagedep = dap->da_pagedep; } else { dirrem = dap->da_previous; pagedep = dirrem->dm_pagedep; dirrem->dm_dirinum = pagedep->pd_ino; add_to_worklist(&dirrem->dm_list); } if (inodedep_lookup(VFSTOUFS(pagedep->pd_mnt)->um_fs, dap->da_newinum, 0, &inodedep) != 0) (void) free_inodedep(inodedep); if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) { for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) { nextmd = LIST_NEXT(mkdir, md_mkdirs); if (mkdir->md_diradd != dap) continue; dap->da_state &= ~mkdir->md_state; WORKLIST_REMOVE(&mkdir->md_list); LIST_REMOVE(mkdir, md_mkdirs); WORKITEM_FREE(mkdir, D_MKDIR); } if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) panic("free_diradd: unfound ref"); } WORKITEM_FREE(dap, D_DIRADD); } /* * Directory entry removal dependencies. * * When removing a directory entry, the entry's inode pointer must be * zero'ed on disk before the corresponding inode's link count is decremented * (possibly freeing the inode for re-use). This dependency is handled by * updating the directory entry but delaying the inode count reduction until * after the directory block has been written to disk. After this point, the * inode count can be decremented whenever it is convenient. */ /* * This routine should be called immediately after removing * a directory entry. 
The inode's link count should not be * decremented by the calling procedure -- the soft updates * code will do this task when it is safe. */ void softdep_setup_remove(bp, dp, ip, isrmdir) struct buf *bp; /* buffer containing directory block */ struct inode *dp; /* inode for the directory being modified */ struct inode *ip; /* inode for directory entry being removed */ int isrmdir; /* indicates if doing RMDIR */ { struct dirrem *dirrem; /* * Allocate a new dirrem if appropriate and ACQUIRE_LOCK. */ dirrem = newdirrem(bp, dp, ip, isrmdir); if ((dirrem->dm_state & COMPLETE) == 0) { LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem, dm_next); } else { dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino; add_to_worklist(&dirrem->dm_list); } FREE_LOCK(&lk); } /* * Allocate a new dirrem if appropriate and return it along with * its associated pagedep. Called without a lock, returns with lock. */ static struct dirrem * newdirrem(bp, dp, ip, isrmdir) struct buf *bp; /* buffer containing directory block */ struct inode *dp; /* inode for the directory being modified */ struct inode *ip; /* inode for directory entry being removed */ int isrmdir; /* indicates if doing RMDIR */ { int offset; ufs_lbn_t lbn; struct diradd *dap; struct dirrem *dirrem; struct pagedep *pagedep; /* * Whiteouts have no deletion dependencies. */ if (ip == NULL) panic("newdirrem: whiteout"); MALLOC(dirrem, struct dirrem *, sizeof(struct dirrem), M_DIRREM, M_WAITOK); bzero(dirrem, sizeof(struct dirrem)); dirrem->dm_list.wk_type = D_DIRREM; dirrem->dm_state = isrmdir ? RMDIR : 0; dirrem->dm_mnt = ITOV(ip)->v_mount; dirrem->dm_oldinum = ip->i_number; ACQUIRE_LOCK(&lk); lbn = lblkno(dp->i_fs, dp->i_offset); offset = blkoff(dp->i_fs, dp->i_offset); if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0) WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list); dirrem->dm_pagedep = pagedep; /* * Check for a diradd dependency for the same directory entry. * If present, then both dependencies become obsolete and can * be de-allocated. Check for an entry on both the pd_dirraddhd * list and the pd_pendinghd list. */ for (dap = LIST_FIRST(&pagedep->pd_diraddhd[DIRADDHASH(offset)]); dap; dap = LIST_NEXT(dap, da_pdlist)) if (dap->da_offset == offset) break; if (dap == NULL) { for (dap = LIST_FIRST(&pagedep->pd_pendinghd); dap; dap = LIST_NEXT(dap, da_pdlist)) if (dap->da_offset == offset) break; if (dap == NULL) return (dirrem); } /* * Must be ATTACHED at this point, so just delete it. */ if ((dap->da_state & ATTACHED) == 0) panic("newdirrem: not ATTACHED"); if (dap->da_newinum != ip->i_number) panic("newdirrem: inum %d should be %d", ip->i_number, dap->da_newinum); free_diradd(dap); dirrem->dm_state |= COMPLETE; return (dirrem); } /* * Directory entry change dependencies. * * Changing an existing directory entry requires that an add operation * be completed first followed by a deletion. The semantics for the addition * are identical to the description of adding a new entry above except * that the rollback is to the old inode number rather than zero. Once * the addition dependency is completed, the removal is done as described * in the removal routine above. */ /* * This routine should be called immediately after changing * a directory entry. The inode's link count should not be * decremented by the calling procedure -- the soft updates * code will perform this task when it is safe. 
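 *
 * (Editor's illustration, not part of this change; the caller-side names
 * are assumptions rather than text from this file.  A directory-entry
 * rewrite in the ufs layer would look roughly like:
 *
 *	ep->d_ino = newinum;			rewrite entry in buffer bp
 *	if (DOINGSOFTDEP(vdp))
 *		softdep_setup_directory_change(bp, dp, oip,
 *		    newinum, isrmdir);
 *	else
 *		oip->i_nlink--;			old synchronous behaviour
 *
 * i.e. the displaced inode's link count is left for the soft updates
 * code to decrement, as required above.)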
*/ void softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir) struct buf *bp; /* buffer containing directory block */ struct inode *dp; /* inode for the directory being modified */ struct inode *ip; /* inode for directory entry being removed */ long newinum; /* new inode number for changed entry */ int isrmdir; /* indicates if doing RMDIR */ { int offset; struct diradd *dap = NULL; struct dirrem *dirrem; struct pagedep *pagedep; struct inodedep *inodedep; offset = blkoff(dp->i_fs, dp->i_offset); /* * Whiteouts do not need diradd dependencies. */ if (newinum != WINO) { MALLOC(dap, struct diradd *, sizeof(struct diradd), M_DIRADD, M_WAITOK); bzero(dap, sizeof(struct diradd)); dap->da_list.wk_type = D_DIRADD; dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE; dap->da_offset = offset; dap->da_newinum = newinum; } /* * Allocate a new dirrem and ACQUIRE_LOCK. */ dirrem = newdirrem(bp, dp, ip, isrmdir); pagedep = dirrem->dm_pagedep; /* * The possible values for isrmdir: * 0 - non-directory file rename * 1 - directory rename within same directory * inum - directory rename to new directory of given inode number * When renaming to a new directory, we are both deleting and * creating a new directory entry, so the link count on the new * directory should not change. Thus we do not need the followup * dirrem which is usually done in handle_workitem_remove. We set * the DIRCHG flag to tell handle_workitem_remove to skip the * followup dirrem. */ if (isrmdir > 1) dirrem->dm_state |= DIRCHG; /* * Whiteouts have no additional dependencies, * so just put the dirrem on the correct list. */ if (newinum == WINO) { if ((dirrem->dm_state & COMPLETE) == 0) { LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem, dm_next); } else { dirrem->dm_dirinum = pagedep->pd_ino; add_to_worklist(&dirrem->dm_list); } FREE_LOCK(&lk); return; } /* * Link into its inodedep. Put it on the id_bufwait list if the inode * is not yet written. If it is written, do the post-inode write * processing to put it on the id_pendinghd list. */ dap->da_previous = dirrem; if (inodedep_lookup(dp->i_fs, newinum, DEPALLOC, &inodedep) == 0 || (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) { dap->da_state |= COMPLETE; LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list); } else { LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap, da_pdlist); WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list); } /* * If the previous inode was never written or its previous directory * entry was never written, then we do not want to roll back to this * previous value. Instead we want to roll back to zero and immediately * free the unwritten or unreferenced inode. */ if (dirrem->dm_state & COMPLETE) { dap->da_state &= ~DIRCHG; dap->da_pagedep = pagedep; dirrem->dm_dirinum = pagedep->pd_ino; add_to_worklist(&dirrem->dm_list); } FREE_LOCK(&lk); } /* * Called whenever the link count on an inode is increased. * It creates an inode dependency so that the new reference(s) * to the inode cannot be committed to disk until the updated * inode has been written. */ void softdep_increase_linkcnt(ip) struct inode *ip; /* the inode with the increased link count */ { struct inodedep *inodedep; ACQUIRE_LOCK(&lk); (void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC, &inodedep); FREE_LOCK(&lk); } /* * This workitem decrements the inode's link count. * If the link count reaches zero, the file is removed. 
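 *
 * (Editor's worked example, not part of the original comment: for an
 * rmdir(2) of an empty directory, the parent's entry and the victim's
 * own "." entry account for the i_nlink -= 2 below; the truncation to
 * length zero then releases the block holding "." and ".."; and the
 * dirrem is re-queued with dm_oldinum rewritten to the parent so that
 * the parent's link count, the ".." reference, is dropped only after
 * the victim's updated inode has reached the disk.)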
*/ static void handle_workitem_remove(dirrem) struct dirrem *dirrem; { struct proc *p = CURPROC; /* XXX */ struct inodedep *inodedep; struct vnode *vp; struct inode *ip; int error; if ((error = VFS_VGET(dirrem->dm_mnt, dirrem->dm_oldinum, &vp)) != 0) { softdep_error("handle_workitem_remove: vget", error); return; } ip = VTOI(vp); /* * Normal file deletion. */ if ((dirrem->dm_state & RMDIR) == 0) { ip->i_nlink--; if (ip->i_nlink < ip->i_effnlink) panic("handle_workitem_remove: bad file delta"); ip->i_flag |= IN_CHANGE; vput(vp); WORKITEM_FREE(dirrem, D_DIRREM); return; } /* * Directory deletion. Decrement reference count for both the * just deleted parent directory entry and the reference for ".". * Next truncate the directory to length zero. When the * truncation completes, arrange to have the reference count on * the parent decremented to account for the loss of "..". */ ip->i_nlink -= 2; if (ip->i_nlink < ip->i_effnlink) panic("handle_workitem_remove: bad dir delta"); ip->i_flag |= IN_CHANGE; if ((error = UFS_TRUNCATE(vp, (off_t)0, 0, p->p_ucred, p)) != 0) softdep_error("handle_workitem_remove: truncate", error); /* * Rename a directory to a new parent. Since, we are both deleting * and creating a new directory entry, the link count on the new * directory should not change. Thus we skip the followup dirrem. */ if (dirrem->dm_state & DIRCHG) { vput(vp); WORKITEM_FREE(dirrem, D_DIRREM); return; } ACQUIRE_LOCK(&lk); (void) inodedep_lookup(ip->i_fs, dirrem->dm_oldinum, DEPALLOC, &inodedep); dirrem->dm_state = 0; dirrem->dm_oldinum = dirrem->dm_dirinum; WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list); FREE_LOCK(&lk); vput(vp); } /* * Inode de-allocation dependencies. * * When an inode's link count is reduced to zero, it can be de-allocated. We * found it convenient to postpone de-allocation until after the inode is * written to disk with its new link count (zero). At this point, all of the * on-disk inode's block pointers are nullified and, with careful dependency * list ordering, all dependencies related to the inode will be satisfied and * the corresponding dependency structures de-allocated. So, if/when the * inode is reused, there will be no mixing of old dependencies with new * ones. This artificial dependency is set up by the block de-allocation * procedure above (softdep_setup_freeblocks) and completed by the * following procedure. */ static void handle_workitem_freefile(freefile) struct freefile *freefile; { struct vnode vp; struct inode tip; struct inodedep *idp; int error; #ifdef DEBUG ACQUIRE_LOCK(&lk); if (inodedep_lookup(freefile->fx_fs, freefile->fx_oldinum, 0, &idp)) panic("handle_workitem_freefile: inodedep survived"); FREE_LOCK(&lk); #endif tip.i_devvp = freefile->fx_devvp; tip.i_dev = freefile->fx_devvp->v_rdev; tip.i_fs = freefile->fx_fs; vp.v_data = &tip; if ((error = ffs_freefile(&vp, freefile->fx_oldinum, freefile->fx_mode)) != 0) softdep_error("handle_workitem_freefile", error); WORKITEM_FREE(freefile, D_FREEFILE); num_freefile -= 1; } /* * Disk writes. * * The dependency structures constructed above are most actively used when file * system blocks are written to disk. No constraints are placed on when a * block can be written, but unsatisfied update dependencies are made safe by * modifying (or replacing) the source memory for the duration of the disk * write. When the disk write completes, the memory block is again brought * up-to-date. * * In-core inode structure reclamation. 
* * Because there are a finite number of "in-core" inode structures, they are * reused regularly. By transferring all inode-related dependencies to the * in-memory inode block and indexing them separately (via "inodedep"s), we * can allow "in-core" inode structures to be reused at any time and avoid * any increase in contention. * * Called just before entering the device driver to initiate a new disk I/O. * The buffer must be locked, thus, no I/O completion operations can occur * while we are manipulating its associated dependencies. */ void softdep_disk_io_initiation(bp) struct buf *bp; /* structure describing disk write to occur */ { struct worklist *wk, *nextwk; struct indirdep *indirdep; /* * We only care about write operations. There should never * be dependencies for reads. */ if (bp->b_flags & B_READ) panic("softdep_disk_io_initiation: read"); /* * Do any necessary pre-I/O processing. */ for (wk = LIST_FIRST(&bp->b_dep); wk; wk = nextwk) { nextwk = LIST_NEXT(wk, wk_list); switch (wk->wk_type) { case D_PAGEDEP: initiate_write_filepage(WK_PAGEDEP(wk), bp); continue; case D_INODEDEP: initiate_write_inodeblock(WK_INODEDEP(wk), bp); continue; case D_INDIRDEP: indirdep = WK_INDIRDEP(wk); if (indirdep->ir_state & GOINGAWAY) panic("disk_io_initiation: indirdep gone"); /* * If there are no remaining dependencies, this * will be writing the real pointers, so the * dependency can be freed. */ if (LIST_FIRST(&indirdep->ir_deplisthd) == NULL) { indirdep->ir_savebp->b_flags |= B_INVAL | B_NOCACHE; brelse(indirdep->ir_savebp); /* inline expand WORKLIST_REMOVE(wk); */ wk->wk_state &= ~ONWORKLIST; LIST_REMOVE(wk, wk_list); WORKITEM_FREE(indirdep, D_INDIRDEP); continue; } /* * Replace up-to-date version with safe version. */ ACQUIRE_LOCK(&lk); indirdep->ir_state &= ~ATTACHED; indirdep->ir_state |= UNDONE; MALLOC(indirdep->ir_saveddata, caddr_t, bp->b_bcount, M_INDIRDEP, M_WAITOK); bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount); bcopy(indirdep->ir_savebp->b_data, bp->b_data, bp->b_bcount); FREE_LOCK(&lk); continue; case D_MKDIR: case D_BMSAFEMAP: case D_ALLOCDIRECT: case D_ALLOCINDIR: continue; default: panic("handle_disk_io_initiation: Unexpected type %s", TYPENAME(wk->wk_type)); /* NOTREACHED */ } } } /* * Called from within the procedure above to deal with unsatisfied * allocation dependencies in a directory. The buffer must be locked, * thus, no I/O completion operations can occur while we are * manipulating its associated dependencies. */ static void initiate_write_filepage(pagedep, bp) struct pagedep *pagedep; struct buf *bp; { struct diradd *dap; struct direct *ep; int i; if (pagedep->pd_state & IOSTARTED) { /* * This can only happen if there is a driver that does not * understand chaining. Here biodone will reissue the call * to strategy for the incomplete buffers. */ printf("initiate_write_filepage: already started\n"); return; } pagedep->pd_state |= IOSTARTED; ACQUIRE_LOCK(&lk); for (i = 0; i < DAHASHSZ; i++) { for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap; dap = LIST_NEXT(dap, da_pdlist)) { ep = (struct direct *) ((char *)bp->b_data + dap->da_offset); if (ep->d_ino != dap->da_newinum) panic("%s: dir inum %d != new %d", "initiate_write_filepage", ep->d_ino, dap->da_newinum); if (dap->da_state & DIRCHG) ep->d_ino = dap->da_previous->dm_oldinum; else ep->d_ino = 0; dap->da_state &= ~ATTACHED; dap->da_state |= UNDONE; } } FREE_LOCK(&lk); } /* * Called from within the procedure above to deal with unsatisfied * allocation dependencies in an inodeblock. 
The buffer must be * locked, thus, no I/O completion operations can occur while we * are manipulating its associated dependencies. */ static void initiate_write_inodeblock(inodedep, bp) struct inodedep *inodedep; struct buf *bp; /* The inode block */ { struct allocdirect *adp, *lastadp; struct dinode *dp; struct fs *fs; ufs_lbn_t prevlbn = 0; int i, deplist; if (inodedep->id_state & IOSTARTED) panic("initiate_write_inodeblock: already started"); inodedep->id_state |= IOSTARTED; fs = inodedep->id_fs; dp = (struct dinode *)bp->b_data + ino_to_fsbo(fs, inodedep->id_ino); /* * If the bitmap is not yet written, then the allocated * inode cannot be written to disk. */ if ((inodedep->id_state & DEPCOMPLETE) == 0) { if (inodedep->id_savedino != NULL) panic("initiate_write_inodeblock: already doing I/O"); MALLOC(inodedep->id_savedino, struct dinode *, sizeof(struct dinode), M_INODEDEP, M_WAITOK); *inodedep->id_savedino = *dp; bzero((caddr_t)dp, sizeof(struct dinode)); return; } /* * If no dependencies, then there is nothing to roll back. */ inodedep->id_savedsize = dp->di_size; if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL) return; /* * Set the dependencies to busy. */ ACQUIRE_LOCK(&lk); for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = TAILQ_NEXT(adp, ad_next)) { #ifdef DIAGNOSTIC if (deplist != 0 && prevlbn >= adp->ad_lbn) panic("softdep_write_inodeblock: lbn order"); prevlbn = adp->ad_lbn; if (adp->ad_lbn < NDADDR && dp->di_db[adp->ad_lbn] != adp->ad_newblkno) panic("%s: direct pointer #%ld mismatch %d != %d", "softdep_write_inodeblock", adp->ad_lbn, dp->di_db[adp->ad_lbn], adp->ad_newblkno); if (adp->ad_lbn >= NDADDR && dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno) panic("%s: indirect pointer #%ld mismatch %d != %d", "softdep_write_inodeblock", adp->ad_lbn - NDADDR, dp->di_ib[adp->ad_lbn - NDADDR], adp->ad_newblkno); deplist |= 1 << adp->ad_lbn; if ((adp->ad_state & ATTACHED) == 0) panic("softdep_write_inodeblock: Unknown state 0x%x", adp->ad_state); #endif /* DIAGNOSTIC */ adp->ad_state &= ~ATTACHED; adp->ad_state |= UNDONE; } /* * The on-disk inode cannot claim to be any larger than the last * fragment that has been written. Otherwise, the on-disk inode * might have fragments that were not the last block in the file * which would corrupt the filesystem. */ for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { if (adp->ad_lbn >= NDADDR) break; dp->di_db[adp->ad_lbn] = adp->ad_oldblkno; /* keep going until hitting a rollback to a frag */ if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) continue; dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize; for (i = adp->ad_lbn + 1; i < NDADDR; i++) { #ifdef DIAGNOSTIC if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) panic("softdep_write_inodeblock: lost dep1"); #endif /* DIAGNOSTIC */ dp->di_db[i] = 0; } for (i = 0; i < NIADDR; i++) { #ifdef DIAGNOSTIC if (dp->di_ib[i] != 0 && (deplist & ((1 << NDADDR) << i)) == 0) panic("softdep_write_inodeblock: lost dep2"); #endif /* DIAGNOSTIC */ dp->di_ib[i] = 0; } FREE_LOCK(&lk); return; } /* * If we have zero'ed out the last allocated block of the file, * roll back the size to the last currently allocated block. * We know that this last allocated block is a full-sized as * we already checked for fragments in the loop above. 
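 *
 * (Editor's worked example with illustrative numbers: suppose an 8K-block
 * file just grew from three to five direct blocks, so allocdirects for
 * lbns 3 and 4 are pending and both di_db slots were rolled back to their
 * old value of zero above.  di_size still says 40960, which is <=
 * (4 + 1) * 8192, so the scan below stops at di_db[2] and di_size is
 * rolled back to 3 * 8192 = 24576 for the duration of this write.)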
*/ if (lastadp != NULL && dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) { for (i = lastadp->ad_lbn; i >= 0; i--) if (dp->di_db[i] != 0) break; dp->di_size = (i + 1) * fs->fs_bsize; } /* * The only dependencies are for indirect blocks. * * The file size for indirect block additions is not guaranteed. * Such a guarantee would be non-trivial to achieve. The conventional * synchronous write implementation also does not make this guarantee. * Fsck should catch and fix discrepancies. Arguably, the file size * can be over-estimated without destroying integrity when the file * moves into the indirect blocks (i.e., is large). If we want to * postpone fsck, we are stuck with this argument. */ for (; adp; adp = TAILQ_NEXT(adp, ad_next)) dp->di_ib[adp->ad_lbn - NDADDR] = 0; FREE_LOCK(&lk); } /* * This routine is called during the completion interrupt * service routine for a disk write (from the procedure called * by the device driver to inform the file system caches of * a request completion). It should be called early in this * procedure, before the block is made available to other * processes or other routines are called. */ void softdep_disk_write_complete(bp) struct buf *bp; /* describes the completed disk write */ { struct worklist *wk; struct workhead reattach; struct newblk *newblk; struct allocindir *aip; struct allocdirect *adp; struct indirdep *indirdep; struct inodedep *inodedep; struct bmsafemap *bmsafemap; #ifdef DEBUG if (lk.lkt_held != -1) panic("softdep_disk_write_complete: lock is held"); lk.lkt_held = -2; #endif LIST_INIT(&reattach); while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) { WORKLIST_REMOVE(wk); switch (wk->wk_type) { case D_PAGEDEP: if (handle_written_filepage(WK_PAGEDEP(wk), bp)) WORKLIST_INSERT(&reattach, wk); continue; case D_INODEDEP: if (handle_written_inodeblock(WK_INODEDEP(wk), bp)) WORKLIST_INSERT(&reattach, wk); continue; case D_BMSAFEMAP: bmsafemap = WK_BMSAFEMAP(wk); while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkhd))) { newblk->nb_state |= DEPCOMPLETE; newblk->nb_bmsafemap = NULL; LIST_REMOVE(newblk, nb_deps); } while ((adp = LIST_FIRST(&bmsafemap->sm_allocdirecthd))) { adp->ad_state |= DEPCOMPLETE; adp->ad_buf = NULL; LIST_REMOVE(adp, ad_deps); handle_allocdirect_partdone(adp); } while ((aip = LIST_FIRST(&bmsafemap->sm_allocindirhd))) { aip->ai_state |= DEPCOMPLETE; aip->ai_buf = NULL; LIST_REMOVE(aip, ai_deps); handle_allocindir_partdone(aip); } while ((inodedep = LIST_FIRST(&bmsafemap->sm_inodedephd)) != NULL) { inodedep->id_state |= DEPCOMPLETE; LIST_REMOVE(inodedep, id_deps); inodedep->id_buf = NULL; } WORKITEM_FREE(bmsafemap, D_BMSAFEMAP); continue; case D_MKDIR: handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY); continue; case D_ALLOCDIRECT: adp = WK_ALLOCDIRECT(wk); adp->ad_state |= COMPLETE; handle_allocdirect_partdone(adp); continue; case D_ALLOCINDIR: aip = WK_ALLOCINDIR(wk); aip->ai_state |= COMPLETE; handle_allocindir_partdone(aip); continue; case D_INDIRDEP: indirdep = WK_INDIRDEP(wk); if (indirdep->ir_state & GOINGAWAY) panic("disk_write_complete: indirdep gone"); bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount); FREE(indirdep->ir_saveddata, M_INDIRDEP); indirdep->ir_saveddata = 0; indirdep->ir_state &= ~UNDONE; indirdep->ir_state |= ATTACHED; while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) { handle_allocindir_partdone(aip); if (aip == LIST_FIRST(&indirdep->ir_donehd)) panic("disk_write_complete: not gone"); } WORKLIST_INSERT(&reattach, wk); if ((bp->b_flags & B_DELWRI) == 0) stat_indir_blk_ptrs++; bdirty(bp); continue; default: 
panic("handle_disk_write_complete: Unknown type %s", TYPENAME(wk->wk_type)); /* NOTREACHED */ } } /* * Reattach any requests that must be redone. */ while ((wk = LIST_FIRST(&reattach)) != NULL) { WORKLIST_REMOVE(wk); WORKLIST_INSERT(&bp->b_dep, wk); } #ifdef DEBUG if (lk.lkt_held != -2) panic("softdep_disk_write_complete: lock lost"); lk.lkt_held = -1; #endif } /* * Called from within softdep_disk_write_complete above. Note that * this routine is always called from interrupt level with further * splbio interrupts blocked. */ static void handle_allocdirect_partdone(adp) struct allocdirect *adp; /* the completed allocdirect */ { struct allocdirect *listadp; struct inodedep *inodedep; long bsize; if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE) return; if (adp->ad_buf != NULL) panic("handle_allocdirect_partdone: dangling dep"); /* * The on-disk inode cannot claim to be any larger than the last * fragment that has been written. Otherwise, the on-disk inode * might have fragments that were not the last block in the file * which would corrupt the filesystem. Thus, we cannot free any * allocdirects after one whose ad_oldblkno claims a fragment as * these blocks must be rolled back to zero before writing the inode. * We check the currently active set of allocdirects in id_inoupdt. */ inodedep = adp->ad_inodedep; bsize = inodedep->id_fs->fs_bsize; for (listadp = TAILQ_FIRST(&inodedep->id_inoupdt); listadp; listadp = TAILQ_NEXT(listadp, ad_next)) { /* found our block */ if (listadp == adp) break; /* continue if ad_oldlbn is not a fragment */ if (listadp->ad_oldsize == 0 || listadp->ad_oldsize == bsize) continue; /* hit a fragment */ return; } /* * If we have reached the end of the current list without * finding the just finished dependency, then it must be * on the future dependency list. Future dependencies cannot * be freed until they are moved to the current list. */ if (listadp == NULL) { #ifdef DEBUG for (listadp = TAILQ_FIRST(&inodedep->id_newinoupdt); listadp; listadp = TAILQ_NEXT(listadp, ad_next)) /* found our block */ if (listadp == adp) break; if (listadp == NULL) panic("handle_allocdirect_partdone: lost dep"); #endif /* DEBUG */ return; } /* * If we have found the just finished dependency, then free * it along with anything that follows it that is complete. */ for (; adp; adp = listadp) { listadp = TAILQ_NEXT(adp, ad_next); if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE) return; free_allocdirect(&inodedep->id_inoupdt, adp, 1); } } /* * Called from within softdep_disk_write_complete above. Note that * this routine is always called from interrupt level with further * splbio interrupts blocked. */ static void handle_allocindir_partdone(aip) struct allocindir *aip; /* the completed allocindir */ { struct indirdep *indirdep; if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE) return; if (aip->ai_buf != NULL) panic("handle_allocindir_partdone: dangling dependency"); indirdep = aip->ai_indirdep; if (indirdep->ir_state & UNDONE) { LIST_REMOVE(aip, ai_next); LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next); return; } ((ufs_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] = aip->ai_newblkno; LIST_REMOVE(aip, ai_next); if (aip->ai_freefrag != NULL) add_to_worklist(&aip->ai_freefrag->ff_list); WORKITEM_FREE(aip, D_ALLOCINDIR); } /* * Called from within softdep_disk_write_complete above to restore * in-memory inode block contents to their most up-to-date state. Note * that this routine is always called from interrupt level with further * splbio interrupts blocked. 
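 *
 * (Editor's note, illustrative: this is the roll-forward half of the
 * scheme.  initiate_write_inodeblock() replaced each still-dependent
 * block pointer with its safe value, the old block number for direct
 * blocks and zero otherwise, before the inode block went to disk; the
 * loop below puts the ad_newblkno values back and re-dirties the buffer,
 * so the real pointers reach the disk in a later, safe write.)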
*/ static int handle_written_inodeblock(inodedep, bp) struct inodedep *inodedep; struct buf *bp; /* buffer containing the inode block */ { struct worklist *wk, *filefree; struct allocdirect *adp, *nextadp; struct dinode *dp; int hadchanges; if ((inodedep->id_state & IOSTARTED) == 0) panic("handle_written_inodeblock: not started"); inodedep->id_state &= ~IOSTARTED; inodedep->id_state |= COMPLETE; dp = (struct dinode *)bp->b_data + ino_to_fsbo(inodedep->id_fs, inodedep->id_ino); /* * If we had to rollback the inode allocation because of * bitmaps being incomplete, then simply restore it. * Keep the block dirty so that it will not be reclaimed until * all associated dependencies have been cleared and the * corresponding updates written to disk. */ if (inodedep->id_savedino != NULL) { *dp = *inodedep->id_savedino; FREE(inodedep->id_savedino, M_INODEDEP); inodedep->id_savedino = NULL; if ((bp->b_flags & B_DELWRI) == 0) stat_inode_bitmap++; bdirty(bp); return (1); } /* * Roll forward anything that had to be rolled back before * the inode could be updated. */ hadchanges = 0; for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) { nextadp = TAILQ_NEXT(adp, ad_next); if (adp->ad_state & ATTACHED) panic("handle_written_inodeblock: new entry"); if (adp->ad_lbn < NDADDR) { if (dp->di_db[adp->ad_lbn] != adp->ad_oldblkno) panic("%s: %s #%ld mismatch %d != %d", "handle_written_inodeblock", "direct pointer", adp->ad_lbn, dp->di_db[adp->ad_lbn], adp->ad_oldblkno); dp->di_db[adp->ad_lbn] = adp->ad_newblkno; } else { if (dp->di_ib[adp->ad_lbn - NDADDR] != 0) panic("%s: %s #%ld allocated as %d", "handle_written_inodeblock", "indirect pointer", adp->ad_lbn - NDADDR, dp->di_ib[adp->ad_lbn - NDADDR]); dp->di_ib[adp->ad_lbn - NDADDR] = adp->ad_newblkno; } adp->ad_state &= ~UNDONE; adp->ad_state |= ATTACHED; hadchanges = 1; } if (hadchanges && (bp->b_flags & B_DELWRI) == 0) stat_direct_blk_ptrs++; /* * Reset the file size to its most up-to-date value. */ if (inodedep->id_savedsize == -1) panic("handle_written_inodeblock: bad size"); if (dp->di_size != inodedep->id_savedsize) { dp->di_size = inodedep->id_savedsize; hadchanges = 1; } inodedep->id_savedsize = -1; /* * If there were any rollbacks in the inode block, then it must be * marked dirty so that its will eventually get written back in * its correct form. */ if (hadchanges) bdirty(bp); /* * Process any allocdirects that completed during the update. */ if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL) handle_allocdirect_partdone(adp); /* * Process deallocations that were held pending until the * inode had been written to disk. Freeing of the inode * is delayed until after all blocks have been freed to * avoid creation of new triples * before the old ones have been deleted. */ filefree = NULL; while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) { WORKLIST_REMOVE(wk); switch (wk->wk_type) { case D_FREEFILE: /* * We defer adding filefree to the worklist until * all other additions have been made to ensure * that it will be done after all the old blocks * have been freed. 
*/ if (filefree != NULL) panic("handle_written_inodeblock: filefree"); filefree = wk; continue; case D_MKDIR: handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT); continue; case D_DIRADD: diradd_inode_written(WK_DIRADD(wk), inodedep); continue; case D_FREEBLKS: case D_FREEFRAG: case D_DIRREM: add_to_worklist(wk); continue; default: panic("handle_written_inodeblock: Unknown type %s", TYPENAME(wk->wk_type)); /* NOTREACHED */ } } if (filefree != NULL) { if (free_inodedep(inodedep) == 0) panic("handle_written_inodeblock: live inodedep"); add_to_worklist(filefree); return (0); } /* * If no outstanding dependencies, free it. */ if (free_inodedep(inodedep) || TAILQ_FIRST(&inodedep->id_inoupdt) == 0) return (0); return (hadchanges); } /* * Process a diradd entry after its dependent inode has been written. * This routine must be called with splbio interrupts blocked. */ static void diradd_inode_written(dap, inodedep) struct diradd *dap; struct inodedep *inodedep; { struct pagedep *pagedep; dap->da_state |= COMPLETE; if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { if (dap->da_state & DIRCHG) pagedep = dap->da_previous->dm_pagedep; else pagedep = dap->da_pagedep; LIST_REMOVE(dap, da_pdlist); LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); } WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list); } /* * Handle the completion of a mkdir dependency. */ static void handle_written_mkdir(mkdir, type) struct mkdir *mkdir; int type; { struct diradd *dap; struct pagedep *pagedep; if (mkdir->md_state != type) panic("handle_written_mkdir: bad type"); dap = mkdir->md_diradd; dap->da_state &= ~type; if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) dap->da_state |= DEPCOMPLETE; if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { if (dap->da_state & DIRCHG) pagedep = dap->da_previous->dm_pagedep; else pagedep = dap->da_pagedep; LIST_REMOVE(dap, da_pdlist); LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); } LIST_REMOVE(mkdir, md_mkdirs); WORKITEM_FREE(mkdir, D_MKDIR); } /* * Called from within softdep_disk_write_complete above. * A write operation was just completed. Removed inodes can * now be freed and associated block pointers may be committed. * Note that this routine is always called from interrupt level * with further splbio interrupts blocked. */ static int handle_written_filepage(pagedep, bp) struct pagedep *pagedep; struct buf *bp; /* buffer containing the written page */ { struct dirrem *dirrem; struct diradd *dap, *nextdap; struct direct *ep; int i, chgs; if ((pagedep->pd_state & IOSTARTED) == 0) panic("handle_written_filepage: not started"); pagedep->pd_state &= ~IOSTARTED; /* * Process any directory removals that have been committed. */ while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) { LIST_REMOVE(dirrem, dm_next); dirrem->dm_dirinum = pagedep->pd_ino; add_to_worklist(&dirrem->dm_list); } /* * Free any directory additions that have been committed. */ while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL) free_diradd(dap); /* * Uncommitted directory entries must be restored. 
*/ for (chgs = 0, i = 0; i < DAHASHSZ; i++) { for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap; dap = nextdap) { nextdap = LIST_NEXT(dap, da_pdlist); if (dap->da_state & ATTACHED) panic("handle_written_filepage: attached"); ep = (struct direct *) ((char *)bp->b_data + dap->da_offset); ep->d_ino = dap->da_newinum; dap->da_state &= ~UNDONE; dap->da_state |= ATTACHED; chgs = 1; /* * If the inode referenced by the directory has * been written out, then the dependency can be * moved to the pending list. */ if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { LIST_REMOVE(dap, da_pdlist); LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); } } } /* * If there were any rollbacks in the directory, then it must be * marked dirty so that its will eventually get written back in * its correct form. */ if (chgs) { if ((bp->b_flags & B_DELWRI) == 0) stat_dir_entry++; bdirty(bp); } /* * If no dependencies remain, the pagedep will be freed. * Otherwise it will remain to update the page before it * is written back to disk. */ if (LIST_FIRST(&pagedep->pd_pendinghd) == 0) { for (i = 0; i < DAHASHSZ; i++) if (LIST_FIRST(&pagedep->pd_diraddhd[i]) != NULL) break; if (i == DAHASHSZ) { LIST_REMOVE(pagedep, pd_hash); WORKITEM_FREE(pagedep, D_PAGEDEP); return (0); } } return (1); } /* * Writing back in-core inode structures. * * The file system only accesses an inode's contents when it occupies an * "in-core" inode structure. These "in-core" structures are separate from * the page frames used to cache inode blocks. Only the latter are * transferred to/from the disk. So, when the updated contents of the * "in-core" inode structure are copied to the corresponding in-memory inode * block, the dependencies are also transferred. The following procedure is * called when copying a dirty "in-core" inode to a cached inode block. */ /* * Called when an inode is loaded from disk. If the effective link count * differed from the actual link count when it was last flushed, then we * need to ensure that the correct effective link count is put back. */ void softdep_load_inodeblock(ip) struct inode *ip; /* the "in_core" copy of the inode */ { struct inodedep *inodedep; /* * Check for alternate nlink count. */ ip->i_effnlink = ip->i_nlink; ACQUIRE_LOCK(&lk); if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) { FREE_LOCK(&lk); return; } if (inodedep->id_nlinkdelta != 0) { ip->i_effnlink -= inodedep->id_nlinkdelta; ip->i_flag |= IN_MODIFIED; inodedep->id_nlinkdelta = 0; (void) free_inodedep(inodedep); } FREE_LOCK(&lk); } /* * This routine is called just before the "in-core" inode * information is to be copied to the in-memory inode block. * Recall that an inode block contains several inodes. If * the force flag is set, then the dependencies will be * cleared so that the update can always be made. Note that * the buffer is locked when this routine is called, so we * will never be in the middle of writing the inode block * to disk. */ void softdep_update_inodeblock(ip, bp, waitfor) struct inode *ip; /* the "in_core" copy of the inode */ struct buf *bp; /* the buffer containing the inode block */ int waitfor; /* nonzero => update must be allowed */ { struct inodedep *inodedep; struct worklist *wk; int error, gotit; /* * If the effective link count is not equal to the actual link * count, then we must track the difference in an inodedep while * the inode is (potentially) tossed out of the cache. Otherwise, * if there is no existing inodedep, then there are no dependencies * to track. 
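 *
 * The bookkeeping is a round trip: the difference between the on-disk
 * link count and the effective link count is parked in the inodedep
 * here, and subtracted again by softdep_load_inodeblock() when the
 * inode is read back in.  A worked example (the sk_* names are local
 * stand-ins for i_nlink, i_effnlink and id_nlinkdelta, not the real
 * structures):
 */
#if 0	/* illustrative sketch only, not part of this file */
#include <assert.h>

struct sk_inode    { int nlink, effnlink; };
struct sk_inodedep { int nlinkdelta; };

static void
sk_update(struct sk_inode *ip, struct sk_inodedep *dep)
{
	dep->nlinkdelta = ip->nlink - ip->effnlink;	/* record at flush */
}

static void
sk_load(struct sk_inode *ip, struct sk_inodedep *dep)
{
	ip->effnlink = ip->nlink - dep->nlinkdelta;	/* recover on reload */
	dep->nlinkdelta = 0;
}

static void
sk_example(void)
{
	/* One unlink still pending: 2 links on disk, effectively 1. */
	struct sk_inode ino = { 2, 1 };
	struct sk_inodedep dep = { 0 };

	sk_update(&ino, &dep);		/* a delta of 1 is remembered */
	ino.effnlink = ino.nlink;	/* inode later reloaded from disk */
	sk_load(&ino, &dep);
	assert(ino.effnlink == 1);	/* effective count restored */
}
#endif
/*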
*/ ACQUIRE_LOCK(&lk); if (ip->i_effnlink != ip->i_nlink) { (void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC, &inodedep); } else if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) { FREE_LOCK(&lk); return; } if (ip->i_nlink < ip->i_effnlink) panic("softdep_update_inodeblock: bad delta"); inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; /* * Changes have been initiated. Anything depending on these * changes cannot occur until this inode has been written. */ inodedep->id_state &= ~COMPLETE; if ((inodedep->id_state & ONWORKLIST) == 0) WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list); /* * Any new dependencies associated with the incore inode must * now be moved to the list associated with the buffer holding * the in-memory copy of the inode. Once merged process any * allocdirects that are completed by the merger. */ merge_inode_lists(inodedep); if (TAILQ_FIRST(&inodedep->id_inoupdt) != NULL) handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt)); /* * Now that the inode has been pushed into the buffer, the * operations dependent on the inode being written to disk * can be moved to the id_bufwait so that they will be * processed when the buffer I/O completes. */ while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) { WORKLIST_REMOVE(wk); WORKLIST_INSERT(&inodedep->id_bufwait, wk); } /* * Newly allocated inodes cannot be written until the bitmap * that allocates them have been written (indicated by * DEPCOMPLETE being set in id_state). If we are doing a * forced sync (e.g., an fsync on a file), we force the bitmap * to be written so that the update can be done. */ if ((inodedep->id_state & DEPCOMPLETE) != 0 || waitfor == 0) { FREE_LOCK(&lk); return; } gotit = getdirtybuf(&inodedep->id_buf, MNT_WAIT); FREE_LOCK(&lk); if (gotit && (error = VOP_BWRITE(inodedep->id_buf->b_vp, inodedep->id_buf)) != 0) softdep_error("softdep_update_inodeblock: bwrite", error); if ((inodedep->id_state & DEPCOMPLETE) == 0) panic("softdep_update_inodeblock: update failed"); } /* * Merge the new inode dependency list (id_newinoupdt) into the old * inode dependency list (id_inoupdt). This routine must be called * with splbio interrupts blocked. */ static void merge_inode_lists(inodedep) struct inodedep *inodedep; { struct allocdirect *listadp, *newadp; newadp = TAILQ_FIRST(&inodedep->id_newinoupdt); for (listadp = TAILQ_FIRST(&inodedep->id_inoupdt); listadp && newadp;) { if (listadp->ad_lbn < newadp->ad_lbn) { listadp = TAILQ_NEXT(listadp, ad_next); continue; } TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next); TAILQ_INSERT_BEFORE(listadp, newadp, ad_next); if (listadp->ad_lbn == newadp->ad_lbn) { allocdirect_merge(&inodedep->id_inoupdt, newadp, listadp); listadp = newadp; } newadp = TAILQ_FIRST(&inodedep->id_newinoupdt); } while ((newadp = TAILQ_FIRST(&inodedep->id_newinoupdt)) != NULL) { TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next); TAILQ_INSERT_TAIL(&inodedep->id_inoupdt, newadp, ad_next); } } /* * If we are doing an fsync, then we must ensure that any directory * entries for the inode have been written after the inode gets to disk. 
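 *
 * (An aside on merge_inode_lists() above: it is the classic merge of
 * two lists kept sorted by logical block number, where an entry present
 * in both lists is resolved in favour of the newer one via
 * allocdirect_merge().  The same shape on plain singly linked lists,
 * hypothetical sk_* names only:)
 */
#if 0	/* illustrative sketch only, not part of this file */
#include <stddef.h>

struct sk_ad {
	struct sk_ad	*next;
	long		 lbn;			/* logical block number */
};

static struct sk_ad *
sk_merge_sorted(struct sk_ad *oldl, struct sk_ad *newl)
{
	struct sk_ad head, *tail;

	head.next = NULL;
	tail = &head;
	while (oldl != NULL && newl != NULL) {
		if (oldl->lbn < newl->lbn) {
			tail->next = oldl;
			oldl = oldl->next;
		} else if (newl->lbn < oldl->lbn) {
			tail->next = newl;
			newl = newl->next;
		} else {
			/* Same block: the newer entry supersedes the old. */
			tail->next = newl;
			newl = newl->next;
			oldl = oldl->next;
		}
		tail = tail->next;
	}
	tail->next = (oldl != NULL) ? oldl : newl;
	return (head.next);
}
#endif
/*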
*/ int softdep_fsync(vp) struct vnode *vp; /* the "in_core" copy of the inode */ { struct diradd *dap, *olddap; struct inodedep *inodedep; struct pagedep *pagedep; struct worklist *wk; struct mount *mnt; struct vnode *pvp; struct inode *ip; struct buf *bp; struct fs *fs; struct proc *p = CURPROC; /* XXX */ int error, ret, flushparent; ino_t parentino; ufs_lbn_t lbn; ip = VTOI(vp); fs = ip->i_fs; for (error = 0, flushparent = 0, olddap = NULL; ; ) { ACQUIRE_LOCK(&lk); if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0) break; if (LIST_FIRST(&inodedep->id_inowait) != NULL || LIST_FIRST(&inodedep->id_bufwait) != NULL || TAILQ_FIRST(&inodedep->id_inoupdt) != NULL || TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL) panic("softdep_fsync: pending ops"); if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL) break; if (wk->wk_type != D_DIRADD) panic("softdep_fsync: Unexpected type %s", TYPENAME(wk->wk_type)); dap = WK_DIRADD(wk); /* * If we have failed to get rid of all the dependencies * then something is seriously wrong. */ if (dap == olddap) panic("softdep_fsync: flush failed"); olddap = dap; /* * Flush our parent if this directory entry * has a MKDIR_PARENT dependency. */ if (dap->da_state & DIRCHG) pagedep = dap->da_previous->dm_pagedep; else pagedep = dap->da_pagedep; mnt = pagedep->pd_mnt; parentino = pagedep->pd_ino; lbn = pagedep->pd_lbn; if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE) panic("softdep_fsync: dirty"); flushparent = dap->da_state & MKDIR_PARENT; /* * If we are being fsync'ed as part of vgone'ing this vnode, * then we will not be able to release and recover the * vnode below, so we just have to give up on writing its * directory entry out. It will eventually be written, just * not now, but then the user was not asking to have it * written, so we are not breaking any promises. */ if (vp->v_flag & VXLOCK) break; /* * We prevent deadlock by always fetching inodes from the * root, moving down the directory tree. Thus, when fetching * our parent directory, we must unlock ourselves before * requesting the lock on our parent. See the comment in * ufs_lookup for details on possible races. */ FREE_LOCK(&lk); VOP_UNLOCK(vp, 0, p); if ((error = VFS_VGET(mnt, parentino, &pvp)) != 0) { vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); return (error); } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); if (flushparent) { if ((error = UFS_UPDATE(pvp, 1)) != 0) { vput(pvp); return (error); } } /* * Flush directory page containing the inode's name. */ error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), p->p_ucred, &bp); ret = VOP_BWRITE(bp->b_vp, bp); vput(pvp); if (error != 0) return (error); if (ret != 0) return (ret); } FREE_LOCK(&lk); return (0); } /* * Flush all the dirty bitmaps associated with the block device * before flushing the rest of the dirty blocks so as to reduce * the number of dependencies that will have to be rolled back. */ void softdep_fsync_mountdev(vp) struct vnode *vp; { struct buf *bp, *nbp; struct worklist *wk; if (vp->v_type != VBLK) panic("softdep_fsync_mountdev: vnode not VBLK"); ACQUIRE_LOCK(&lk); for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { nbp = TAILQ_NEXT(bp, b_vnbufs); /* * If it is already scheduled, skip to the next buffer. */ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) continue; if ((bp->b_flags & B_DELWRI) == 0) panic("softdep_fsync_mountdev: not dirty"); /* * We are only interested in bitmaps with outstanding * dependencies. 
*/ if ((wk = LIST_FIRST(&bp->b_dep)) == NULL || wk->wk_type != D_BMSAFEMAP) { BUF_UNLOCK(bp); continue; } bremfree(bp); FREE_LOCK(&lk); (void) bawrite(bp); ACQUIRE_LOCK(&lk); /* * Since we may have slept during the I/O, we need * to start from a known point. */ nbp = TAILQ_FIRST(&vp->v_dirtyblkhd); } drain_output(vp, 1); FREE_LOCK(&lk); } /* * This routine is called when we are trying to synchronously flush a * file. This routine must eliminate any filesystem metadata dependencies * so that the syncing routine can succeed by pushing the dirty blocks * associated with the file. If any I/O errors occur, they are returned. */ int softdep_sync_metadata(ap) struct vop_fsync_args /* { struct vnode *a_vp; struct ucred *a_cred; int a_waitfor; struct proc *a_p; } */ *ap; { struct vnode *vp = ap->a_vp; struct pagedep *pagedep; struct allocdirect *adp; struct allocindir *aip; struct buf *bp, *nbp; struct worklist *wk; int i, error, waitfor; /* * Check whether this vnode is involved in a filesystem * that is doing soft dependency processing. */ if (vp->v_type != VBLK) { if (!DOINGSOFTDEP(vp)) return (0); } else if (vp->v_specmountpoint == NULL || (vp->v_specmountpoint->mnt_flag & MNT_SOFTDEP) == 0) return (0); /* * Ensure that any direct block dependencies have been cleared. */ ACQUIRE_LOCK(&lk); if ((error = flush_inodedep_deps(VTOI(vp)->i_fs, VTOI(vp)->i_number))) { FREE_LOCK(&lk); return (error); } /* * For most files, the only metadata dependencies are the * cylinder group maps that allocate their inode or blocks. * The block allocation dependencies can be found by traversing * the dependency lists for any buffers that remain on their * dirty buffer list. The inode allocation dependency will * be resolved when the inode is updated with MNT_WAIT. * This work is done in two passes. The first pass grabs most * of the buffers and begins asynchronously writing them. The * only way to wait for these asynchronous writes is to sleep * on the filesystem vnode which may stay busy for a long time * if the filesystem is active. So, instead, we make a second * pass over the dependencies blocking on each write. In the * usual case we will be blocking against a write that we * initiated, so when it is done the dependency will have been * resolved. Thus the second pass is expected to end quickly. */ waitfor = MNT_NOWAIT; top: if (getdirtybuf(&TAILQ_FIRST(&vp->v_dirtyblkhd), MNT_WAIT) == 0) { FREE_LOCK(&lk); return (0); } bp = TAILQ_FIRST(&vp->v_dirtyblkhd); loop: /* * As we hold the buffer locked, none of its dependencies * will disappear. 
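 *
 * (The two-pass strategy described above boils down to a small control
 * structure: the same walk over the dirty buffers is made twice, first
 * issuing asynchronous writes, then waiting on and error-checking each
 * write.  A compressed sketch, with sk_write_buf() as a hypothetical
 * stand-in for the bawrite()/VOP_BWRITE() choice:)
 */
#if 0	/* illustrative sketch only, not part of this file */
static int
sk_two_pass_flush(int nbufs, int (*sk_write_buf)(int idx, int wait))
{
	int wait = 0;			/* pass 1: like MNT_NOWAIT */
	int i, error;

	for (;;) {
		for (i = 0; i < nbufs; i++)
			if ((error = sk_write_buf(i, wait)) != 0)
				return (error);
		if (wait)
			return (0);	/* pass 2 complete */
		wait = 1;		/* pass 2: like MNT_WAIT */
	}
}
#endif
/*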
*/ for (wk = LIST_FIRST(&bp->b_dep); wk; wk = LIST_NEXT(wk, wk_list)) { switch (wk->wk_type) { case D_ALLOCDIRECT: adp = WK_ALLOCDIRECT(wk); if (adp->ad_state & DEPCOMPLETE) break; nbp = adp->ad_buf; if (getdirtybuf(&nbp, waitfor) == 0) break; FREE_LOCK(&lk); if (waitfor == MNT_NOWAIT) { bawrite(nbp); } else if ((error = VOP_BWRITE(nbp->b_vp, nbp)) != 0) { bawrite(bp); return (error); } ACQUIRE_LOCK(&lk); break; case D_ALLOCINDIR: aip = WK_ALLOCINDIR(wk); if (aip->ai_state & DEPCOMPLETE) break; nbp = aip->ai_buf; if (getdirtybuf(&nbp, waitfor) == 0) break; FREE_LOCK(&lk); if (waitfor == MNT_NOWAIT) { bawrite(nbp); } else if ((error = VOP_BWRITE(nbp->b_vp, nbp)) != 0) { bawrite(bp); return (error); } ACQUIRE_LOCK(&lk); break; case D_INDIRDEP: restart: for (aip = LIST_FIRST(&WK_INDIRDEP(wk)->ir_deplisthd); aip; aip = LIST_NEXT(aip, ai_next)) { if (aip->ai_state & DEPCOMPLETE) continue; nbp = aip->ai_buf; if (getdirtybuf(&nbp, MNT_WAIT) == 0) goto restart; FREE_LOCK(&lk); if ((error = VOP_BWRITE(nbp->b_vp, nbp)) != 0) { bawrite(bp); return (error); } ACQUIRE_LOCK(&lk); goto restart; } break; case D_INODEDEP: if ((error = flush_inodedep_deps(WK_INODEDEP(wk)->id_fs, WK_INODEDEP(wk)->id_ino)) != 0) { FREE_LOCK(&lk); bawrite(bp); return (error); } break; case D_PAGEDEP: /* * We are trying to sync a directory that may * have dependencies on both its own metadata * and/or dependencies on the inodes of any * recently allocated files. We walk its diradd * lists pushing out the associated inode. */ pagedep = WK_PAGEDEP(wk); for (i = 0; i < DAHASHSZ; i++) { if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0) continue; if ((error = flush_pagedep_deps(vp, pagedep->pd_mnt, &pagedep->pd_diraddhd[i]))) { FREE_LOCK(&lk); bawrite(bp); return (error); } } break; case D_MKDIR: /* * This case should never happen if the vnode has * been properly sync'ed. However, if this function * is used at a place where the vnode has not yet * been sync'ed, this dependency can show up. So, * rather than panic, just flush it. */ nbp = WK_MKDIR(wk)->md_buf; if (getdirtybuf(&nbp, waitfor) == 0) break; FREE_LOCK(&lk); if (waitfor == MNT_NOWAIT) { bawrite(nbp); } else if ((error = VOP_BWRITE(nbp->b_vp, nbp)) != 0) { bawrite(bp); return (error); } ACQUIRE_LOCK(&lk); break; case D_BMSAFEMAP: /* * This case should never happen if the vnode has * been properly sync'ed. However, if this function * is used at a place where the vnode has not yet * been sync'ed, this dependency can show up. So, * rather than panic, just flush it. */ nbp = WK_BMSAFEMAP(wk)->sm_buf; if (getdirtybuf(&nbp, waitfor) == 0) break; FREE_LOCK(&lk); if (waitfor == MNT_NOWAIT) { bawrite(nbp); } else if ((error = VOP_BWRITE(nbp->b_vp, nbp)) != 0) { bawrite(bp); return (error); } ACQUIRE_LOCK(&lk); break; default: panic("softdep_sync_metadata: Unknown type %s", TYPENAME(wk->wk_type)); /* NOTREACHED */ } } (void) getdirtybuf(&TAILQ_NEXT(bp, b_vnbufs), MNT_WAIT); nbp = TAILQ_NEXT(bp, b_vnbufs); FREE_LOCK(&lk); bawrite(bp); ACQUIRE_LOCK(&lk); if (nbp != NULL) { bp = nbp; goto loop; } /* * We must wait for any I/O in progress to finish so that * all potential buffers on the dirty list will be visible. * Once they are all there, proceed with the second pass * which will wait for the I/O as per above. */ drain_output(vp, 1); /* * The brief unlock is to allow any pent up dependency * processing to be done. 
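 *
 * (Each D_ALLOCDIRECT/D_ALLOCINDIR/D_MKDIR/D_BMSAFEMAP arm of the switch
 * above repeats one idiom: try to grab the buffer the dependency is
 * waiting on, write it asynchronously on the MNT_NOWAIT pass, write it
 * synchronously and propagate errors on the MNT_WAIT pass.  Sketch of
 * that idiom with hypothetical callbacks in place of getdirtybuf(),
 * bawrite() and VOP_BWRITE():)
 */
#if 0	/* illustrative sketch only, not part of this file */
static int
sk_push_dep_buf(void *buf, int wait,
    int (*sk_trylock)(void *),		/* like getdirtybuf() */
    void (*sk_awrite)(void *),		/* like bawrite() */
    int (*sk_bwrite)(void *))		/* like VOP_BWRITE() */
{
	if (!sk_trylock(buf))
		return (0);		/* clean, or not obtainable now */
	if (!wait) {
		sk_awrite(buf);		/* fire and forget */
		return (0);
	}
	return (sk_bwrite(buf));	/* wait and report errors */
}
#endif
/*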
*/ if (waitfor == MNT_NOWAIT) { waitfor = MNT_WAIT; FREE_LOCK(&lk); ACQUIRE_LOCK(&lk); goto top; } /* * If we have managed to get rid of all the dirty buffers, * then we are done. For certain directories and block * devices, we may need to do further work. */ if (TAILQ_FIRST(&vp->v_dirtyblkhd) == NULL) { FREE_LOCK(&lk); return (0); } FREE_LOCK(&lk); /* * If we are trying to sync a block device, some of its buffers may * contain metadata that cannot be written until the contents of some * partially written files have been written to disk. The only easy * way to accomplish this is to sync the entire filesystem (luckily * this happens rarely). */ if (vp->v_type == VBLK && vp->v_specmountpoint && !VOP_ISLOCKED(vp) && (error = VFS_SYNC(vp->v_specmountpoint, MNT_WAIT, ap->a_cred, ap->a_p)) != 0) return (error); return (0); } /* * Flush the dependencies associated with an inodedep. * Called with splbio blocked. */ static int flush_inodedep_deps(fs, ino) struct fs *fs; ino_t ino; { struct inodedep *inodedep; struct allocdirect *adp; int error, waitfor; struct buf *bp; /* * This work is done in two passes. The first pass grabs most * of the buffers and begins asynchronously writing them. The * only way to wait for these asynchronous writes is to sleep * on the filesystem vnode which may stay busy for a long time * if the filesystem is active. So, instead, we make a second * pass over the dependencies blocking on each write. In the * usual case we will be blocking against a write that we * initiated, so when it is done the dependency will have been * resolved. Thus the second pass is expected to end quickly. * We give a brief window at the top of the loop to allow * any pending I/O to complete. */ for (waitfor = MNT_NOWAIT; ; ) { FREE_LOCK(&lk); ACQUIRE_LOCK(&lk); if (inodedep_lookup(fs, ino, 0, &inodedep) == 0) return (0); for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = TAILQ_NEXT(adp, ad_next)) { if (adp->ad_state & DEPCOMPLETE) continue; bp = adp->ad_buf; if (getdirtybuf(&bp, waitfor) == 0) { if (waitfor == MNT_NOWAIT) continue; break; } FREE_LOCK(&lk); if (waitfor == MNT_NOWAIT) { bawrite(bp); } else if ((error = VOP_BWRITE(bp->b_vp, bp)) != 0) { ACQUIRE_LOCK(&lk); return (error); } ACQUIRE_LOCK(&lk); break; } if (adp != NULL) continue; for (adp = TAILQ_FIRST(&inodedep->id_newinoupdt); adp; adp = TAILQ_NEXT(adp, ad_next)) { if (adp->ad_state & DEPCOMPLETE) continue; bp = adp->ad_buf; if (getdirtybuf(&bp, waitfor) == 0) { if (waitfor == MNT_NOWAIT) continue; break; } FREE_LOCK(&lk); if (waitfor == MNT_NOWAIT) { bawrite(bp); } else if ((error = VOP_BWRITE(bp->b_vp, bp)) != 0) { ACQUIRE_LOCK(&lk); return (error); } ACQUIRE_LOCK(&lk); break; } if (adp != NULL) continue; /* * If pass2, we are done, otherwise do pass 2. */ if (waitfor == MNT_WAIT) break; waitfor = MNT_WAIT; } /* * Try freeing inodedep in case all dependencies have been removed. */ if (inodedep_lookup(fs, ino, 0, &inodedep) != 0) (void) free_inodedep(inodedep); return (0); } /* * Eliminate a pagedep dependency by flushing out all its diradd dependencies. * Called with splbio blocked. */ static int flush_pagedep_deps(pvp, mp, diraddhdp) struct vnode *pvp; struct mount *mp; struct diraddhd *diraddhdp; { struct proc *p = CURPROC; /* XXX */ struct inodedep *inodedep; struct ufsmount *ump; struct diradd *dap; struct vnode *vp; int gotit, error = 0; struct buf *bp; ino_t inum; ump = VFSTOUFS(mp); while ((dap = LIST_FIRST(diraddhdp)) != NULL) { /* * Flush ourselves if this directory entry * has a MKDIR_PARENT dependency. 
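 *
 * (Progress through this loop is detected by watching the head of the
 * diradd list: flushing steps may retire entries as a side effect, so
 * "dap != LIST_FIRST(diraddhdp)" means work was done and the loop is
 * restarted, while an unchanged head after a full flush means the
 * dependency graph is wedged and the kernel panics.  Sketched control
 * flow, hypothetical sk_* names:)
 */
#if 0	/* illustrative sketch only, not part of this file */
struct sk_work { struct sk_work *next; };

static int
sk_drain_with_progress_check(struct sk_work **headp,
    void (*sk_flush_one)(struct sk_work *))
{
	struct sk_work *first;

	while ((first = *headp) != NULL) {
		sk_flush_one(first);	/* may retire one or more entries */
		if (first == *headp)
			return (-1);	/* no progress; kernel would panic */
	}
	return (0);
}
#endif
/*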
*/ if (dap->da_state & MKDIR_PARENT) { FREE_LOCK(&lk); if ((error = UFS_UPDATE(pvp, 1)) != 0) break; ACQUIRE_LOCK(&lk); /* * If that cleared dependencies, go on to next. */ if (dap != LIST_FIRST(diraddhdp)) continue; if (dap->da_state & MKDIR_PARENT) panic("flush_pagedep_deps: MKDIR"); } /* * Flush the file on which the directory entry depends. * If the inode has already been pushed out of the cache, * then all the block dependencies will have been flushed * leaving only inode dependencies (e.g., bitmaps). Thus, * we do a ufs_ihashget to check for the vnode in the cache. * If it is there, we do a full flush. If it is no longer * there we need only dispose of any remaining bitmap * dependencies and write the inode to disk. */ inum = dap->da_newinum; FREE_LOCK(&lk); if ((vp = ufs_ihashget(ump->um_dev, inum)) == NULL) { ACQUIRE_LOCK(&lk); if (inodedep_lookup(ump->um_fs, inum, 0, &inodedep) == 0 && dap == LIST_FIRST(diraddhdp)) panic("flush_pagedep_deps: flush 1 failed"); /* * If the inode still has bitmap dependencies, * push them to disk. */ if ((inodedep->id_state & DEPCOMPLETE) == 0) { gotit = getdirtybuf(&inodedep->id_buf,MNT_WAIT); FREE_LOCK(&lk); if (gotit && (error = VOP_BWRITE(inodedep->id_buf->b_vp, inodedep->id_buf)) != 0) break; ACQUIRE_LOCK(&lk); } if (dap != LIST_FIRST(diraddhdp)) continue; /* * If the inode is still sitting in a buffer waiting * to be written, push it to disk. */ FREE_LOCK(&lk); if ((error = bread(ump->um_devvp, fsbtodb(ump->um_fs, ino_to_fsba(ump->um_fs, inum)), (int)ump->um_fs->fs_bsize, NOCRED, &bp)) != 0) break; if ((error = VOP_BWRITE(bp->b_vp, bp)) != 0) break; ACQUIRE_LOCK(&lk); if (dap == LIST_FIRST(diraddhdp)) panic("flush_pagedep_deps: flush 2 failed"); continue; } if (vp->v_type == VDIR) { /* * A newly allocated directory must have its "." and * ".." entries written out before its name can be * committed in its parent. We do not want or need * the full semantics of a synchronous VOP_FSYNC as * that may end up here again, once for each directory * level in the filesystem. Instead, we push the blocks * and wait for them to clear. */ if ((error = VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p))) { vput(vp); break; } drain_output(vp, 0); } error = UFS_UPDATE(vp, 1); vput(vp); if (error) break; /* * If we have failed to get rid of all the dependencies * then something is seriously wrong. */ if (dap == LIST_FIRST(diraddhdp)) panic("flush_pagedep_deps: flush 3 failed"); ACQUIRE_LOCK(&lk); } if (error) ACQUIRE_LOCK(&lk); return (error); } /* * A large burst of file addition or deletion activity can drive the * memory load excessively high. Therefore we deliberately slow things * down and speed up the I/O processing if we find ourselves with too * many dependencies in progress. */ static int request_cleanup(resource, islocked) int resource; int islocked; { struct callout_handle handle; struct proc *p = CURPROC; /* * We never hold up the filesystem syncer process. */ if (p == filesys_syncer) return (0); /* * If we are resource constrained on inode dependencies, try * flushing some dirty inodes. Otherwise, we are constrained * by file deletions, so try accelerating flushes of directories * with removal dependencies. We would like to do the cleanup * here, but we probably hold an inode locked at this point and * that might deadlock against one that we try to clean. So, * the best that we can do is request the syncer daemon to do * the cleanup for us. 
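 *
 * The mechanism used below is a bounded sleep: the requesting process
 * arms a one-shot timer (pause_timer) and sleeps on proc_waiting, so it
 * is released either by the timer firing or by an explicit wakeup.  The
 * same idea, sketched in userland C with a condition variable and a
 * timed wait standing in for timeout()/tsleep()/untimeout():
 */
#if 0	/* illustrative sketch only, not part of this file */
#include <pthread.h>
#include <time.h>

static pthread_mutex_t sk_mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  sk_cv = PTHREAD_COND_INITIALIZER;
static int sk_waiting;

static void
sk_wait_for_cleanup(int max_wait_sec)
{
	struct timespec ts;

	clock_gettime(CLOCK_REALTIME, &ts);
	ts.tv_sec += max_wait_sec;		/* upper bound on the wait */

	pthread_mutex_lock(&sk_mtx);
	sk_waiting = 1;
	while (sk_waiting)
		if (pthread_cond_timedwait(&sk_cv, &sk_mtx, &ts) != 0)
			break;			/* timed out: proceed anyway */
	sk_waiting = 0;
	pthread_mutex_unlock(&sk_mtx);
}

static void
sk_cleanup_done(void)				/* cf. pause_timer()/wakeup() */
{
	pthread_mutex_lock(&sk_mtx);
	sk_waiting = 0;
	pthread_cond_broadcast(&sk_cv);
	pthread_mutex_unlock(&sk_mtx);
}
#endif
/*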
*/ switch (resource) { case FLUSH_INODES: stat_ino_limit_push += 1; req_clear_inodedeps = 1; break; case FLUSH_REMOVE: stat_blk_limit_push += 1; req_clear_remove = 1; break; default: panic("request_cleanup: unknown type"); } /* * Hopefully the syncer daemon will catch up and awaken us. * We wait at most tickdelay before proceeding in any case. */ if (islocked == 0) ACQUIRE_LOCK(&lk); if (proc_waiting == 0) { proc_waiting = 1; handle = timeout(pause_timer, NULL, tickdelay > 2 ? tickdelay : 2); } FREE_LOCK_INTERLOCKED(&lk); (void) tsleep((caddr_t)&proc_waiting, PPAUSE | PCATCH, "softupdate", 0); ACQUIRE_LOCK_INTERLOCKED(&lk); if (proc_waiting) { untimeout(pause_timer, NULL, handle); proc_waiting = 0; } else { switch (resource) { case FLUSH_INODES: stat_ino_limit_hit += 1; break; case FLUSH_REMOVE: stat_blk_limit_hit += 1; break; } } if (islocked == 0) FREE_LOCK(&lk); return (1); } /* * Awaken processes pausing in request_cleanup and clear proc_waiting * to indicate that there is no longer a timer running. */ void pause_timer(arg) void *arg; { proc_waiting = 0; wakeup(&proc_waiting); } /* * Flush out a directory with at least one removal dependency in an effort * to reduce the number of freefile and freeblks dependency structures. */ static void clear_remove(p) struct proc *p; { struct pagedep_hashhead *pagedephd; struct pagedep *pagedep; static int next = 0; struct mount *mp; struct vnode *vp; int error, cnt; ino_t ino; ACQUIRE_LOCK(&lk); for (cnt = 0; cnt < pagedep_hash; cnt++) { pagedephd = &pagedep_hashtbl[next++]; if (next >= pagedep_hash) next = 0; for (pagedep = LIST_FIRST(pagedephd); pagedep; pagedep = LIST_NEXT(pagedep, pd_hash)) { if (LIST_FIRST(&pagedep->pd_dirremhd) == NULL) continue; mp = pagedep->pd_mnt; ino = pagedep->pd_ino; FREE_LOCK(&lk); if ((error = VFS_VGET(mp, ino, &vp)) != 0) { softdep_error("clear_remove: vget", error); return; } if ((error = VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p))) softdep_error("clear_remove: fsync", error); drain_output(vp, 0); vput(vp); return; } } FREE_LOCK(&lk); } /* * Clear out a block of dirty inodes in an effort to reduce * the number of inodedep dependency structures. */ static void clear_inodedeps(p) struct proc *p; { struct inodedep_hashhead *inodedephd; struct inodedep *inodedep; static int next = 0; struct mount *mp; struct vnode *vp; struct fs *fs; int error, cnt; ino_t firstino, lastino, ino; ACQUIRE_LOCK(&lk); /* * Pick a random inode dependency to be cleared. * We will then gather up all the inodes in its block * that have dependencies and flush them out. */ for (cnt = 0; cnt < inodedep_hash; cnt++) { inodedephd = &inodedep_hashtbl[next++]; if (next >= inodedep_hash) next = 0; if ((inodedep = LIST_FIRST(inodedephd)) != NULL) break; } /* * Ugly code to find mount point given pointer to superblock. */ fs = inodedep->id_fs; for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist; mp = CIRCLEQ_NEXT(mp, mnt_list)) if ((mp->mnt_flag & MNT_SOFTDEP) && fs == VFSTOUFS(mp)->um_fs) break; /* * Find the last inode in the block with dependencies. */ firstino = inodedep->id_ino & ~(INOPB(fs) - 1); for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--) if (inodedep_lookup(fs, lastino, 0, &inodedep) != 0) break; /* * Asynchronously push all but the last inode with dependencies. * Synchronously push the last inode with dependencies to ensure * that the inode block gets written to free up the inodedeps. 
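 *
 * (The firstino/lastino computation above relies on INOPB(fs) being a
 * power of two: masking with ~(INOPB(fs) - 1) rounds an inode number
 * down to the first inode of its inode block.  A worked example,
 * assuming 64 inodes per block:)
 */
#if 0	/* illustrative sketch only, not part of this file */
#include <assert.h>

static void
sk_inode_block_bounds(void)
{
	unsigned long inopb = 64;			/* assumed INOPB(fs) */
	unsigned long ino = 1000;
	unsigned long firstino = ino & ~(inopb - 1);	/* 1000 -> 960 */
	unsigned long lastino = firstino + inopb - 1;	/* 960 + 63 */

	assert(firstino == 960);
	assert(lastino == 1023);
}
#endif
/*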
*/ for (ino = firstino; ino <= lastino; ino++) { if (inodedep_lookup(fs, ino, 0, &inodedep) == 0) continue; FREE_LOCK(&lk); if ((error = VFS_VGET(mp, ino, &vp)) != 0) { softdep_error("clear_inodedeps: vget", error); return; } if (ino == lastino) { if ((error = VOP_FSYNC(vp, p->p_ucred, MNT_WAIT, p))) softdep_error("clear_inodedeps: fsync1", error); } else { if ((error = VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p))) softdep_error("clear_inodedeps: fsync2", error); drain_output(vp, 0); } vput(vp); ACQUIRE_LOCK(&lk); } FREE_LOCK(&lk); } /* * Acquire exclusive access to a buffer. * Must be called with splbio blocked. * Return 1 if buffer was acquired. */ static int getdirtybuf(bpp, waitfor) struct buf **bpp; int waitfor; { struct buf *bp; for (;;) { if ((bp = *bpp) == NULL) return (0); if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) == 0) break; if (waitfor != MNT_WAIT) return (0); FREE_LOCK_INTERLOCKED(&lk); if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL) != ENOLCK) panic("getdirtybuf: inconsistent lock"); ACQUIRE_LOCK_INTERLOCKED(&lk); } if ((bp->b_flags & B_DELWRI) == 0) { BUF_UNLOCK(bp); return (0); } bremfree(bp); return (1); } /* * Wait for pending output on a vnode to complete. * Must be called with vnode locked. */ static void drain_output(vp, islocked) struct vnode *vp; int islocked; { if (!islocked) ACQUIRE_LOCK(&lk); while (vp->v_numoutput) { vp->v_flag |= VBWAIT; FREE_LOCK_INTERLOCKED(&lk); tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "drainvp", 0); ACQUIRE_LOCK_INTERLOCKED(&lk); } if (!islocked) FREE_LOCK(&lk); } /* * Called whenever a buffer that is being invalidated or reallocated * contains dependencies. This should only happen if an I/O error has * occurred. The routine is called with the buffer locked. */ void softdep_deallocate_dependencies(bp) struct buf *bp; { if ((bp->b_flags & B_ERROR) == 0) panic("softdep_deallocate_dependencies: dangling deps"); softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error); panic("softdep_deallocate_dependencies: unrecovered I/O error"); } /* * Function to handle asynchronous write errors in the filesystem. */ void softdep_error(func, error) char *func; int error; { /* XXX should do something better! */ printf("%s: got error %d while accessing filesystem\n", func, error); } Index: head/sys/dev/pccard/if_xe.c =================================================================== --- head/sys/dev/pccard/if_xe.c (revision 49534) +++ head/sys/dev/pccard/if_xe.c (revision 49535) @@ -1,2507 +1,2507 @@ /*- * Copyright (c) 1998, 1999 Scott Mitchell * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $Id: if_xe.c,v 1.20 1999/06/13 19:17:40 scott Exp $ * $FreeBSD$ */ /* * Portions of this software were derived from Werner Koch's xirc2ps driver * for Linux under the terms of the following license (from v1.30 of the * xirc2ps driver): * * Copyright (c) 1997 by Werner Koch (dd9jn) * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, and the entire permission notice in its entirety, * including the disclaimer of warranties. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote * products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED * OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * FreeBSD device driver for Xircom CreditCard PCMCIA Ethernet adapters. The * following cards are currently known to work with the driver: * Xircom CreditCard 10/100 (CE3) * Xircom CreditCard Ethernet + Modem 28 (CEM28) * Xircom CreditCard Ethernet 10/100 + Modem 56 (CEM56) * Xircom RealPort Ethernet 10 * Xircom RealPort Ethernet 10/100 * Xircom RealPort Ethernet 10/100 + Modem 56 (REM56, REM56G) * Intel EtherExpress Pro/100 PC Card Mobile Adapter 16 (Pro/100 M16A) * Compaq Netelligent 10/100 PC Card (CPQ-10/100) * * Some other cards *should* work, but support for them is either broken or in * an unknown state at the moment. I'm always interested in hearing from * people who own any of these cards: * Xircom CreditCard 10Base-T (PS-CE2-10) * Xircom CreditCard Ethernet + ModemII (CEM2) * Xircom CEM28 and CEM33 Ethernet/Modem cards (may be variants of CEM2?) * * Thanks to all who assisted with the development and testing of the driver, * especially: Werner Koch, Duke Kamstra, Duncan Barclay, Jason George, Dru * Nelson, Mike Kephart, Bill Rainey and Douglas Rand. Apologies if I've left * out anyone who deserves a mention here. * * Special thanks to Ade Lovett for both hosting the mailing list and doing * the CEM56/REM56 support code; and the FreeBSD UK Users' Group for hosting * the web pages. 
* * Contact points: * * Driver web page: http://ukug.uk.freebsd.org/~scott/xe_drv/ * * Mailing list: http://www.lovett.com/lists/freebsd-xircom/ * or send "subscribe freebsd-xircom" to * * Author email: */ #ifndef XE_DEBUG #define XE_DEBUG 1 /* Increase for more voluminous output! */ #endif #include "xe.h" #include "card.h" #include "apm.h" #include "bpf.h" #if NXE > 0 #if NCARD > 0 #include #include -#include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #if NBPF > 0 #include #endif /* NBPF > 0 */ #include #include #include #include #if NAPM > 0 #include #endif /* NAPM > 0 */ #include #include #include #include /* * One of these structures per allocated device */ struct xe_softc { struct arpcom arpcom; struct ifmedia ifmedia; struct ifmib_iso_8802_3 mibdata; struct callout_handle chand; struct isa_device *dev; struct pccard_devinfo *crd; struct ifnet *ifp; struct ifmedia *ifm; char *card_type; /* Card model name */ char *vendor; /* Card manufacturer */ int unit; /* Unit number, from dev->id_unit */ int srev; /* Silicon revision */ int tx_queued; /* Packets currently waiting to transmit */ int tx_tpr; /* Last value of TPR reg on card */ int tx_collisions; /* Collisions since last successful send */ int tx_timeouts; /* Count of transmit timeouts */ int autoneg_status; /* Autonegotiation progress state */ int media; /* Private media word */ u_char version; /* Bonding Version register from card */ u_char modem; /* 1 = Card has a modem */ u_char ce2; /* 1 = Card has CE2 silicon */ u_char mohawk; /* 1 = Card has Mohawk (CE3) silicon */ u_char dingo; /* 1 = Card has Dingo (CEM56) silicon */ u_char phy_ok; /* 1 = MII-compliant PHY found and initialised */ u_char gone; /* 1 = Card bailed out */ #if NAPM > 0 struct apmhook suspend_hook; struct apmhook resume_hook; #endif /* NAPM > 0 */ }; static struct xe_softc *sca[MAXSLOT]; /* * MII command structure */ struct xe_mii_frame { u_int8_t mii_stdelim; u_int8_t mii_opcode; u_int8_t mii_phyaddr; u_int8_t mii_regaddr; u_int8_t mii_turnaround; u_int16_t mii_data; }; /* * For accessing card registers */ #define XE_INB(r) inb(scp->dev->id_iobase+(r)) #define XE_INW(r) inw(scp->dev->id_iobase+(r)) #define XE_OUTB(r, b) outb(scp->dev->id_iobase+(r), (b)) #define XE_OUTW(r, w) outw(scp->dev->id_iobase+(r), (w)) #define XE_SELECT_PAGE(p) XE_OUTB(XE_PR, (p)) /* * Horrid stuff for accessing CIS tuples */ #define CARD_MAJOR 50 #define CISTPL_BUFSIZE 512 #define CISTPL_TYPE(tpl) tpl[0] #define CISTPL_LEN(tpl) tpl[2] #define CISTPL_DATA(tpl,pos) tpl[4 + ((pos)<<1)] /* * Media autonegotiation progress constants */ #define XE_AUTONEG_NONE 0 /* No autonegotiation in progress */ #define XE_AUTONEG_WAITING 1 /* Waiting for transmitter to go idle */ #define XE_AUTONEG_STARTED 2 /* Waiting for autonegotiation to complete */ #define XE_AUTONEG_100TX 3 /* Trying to force 100baseTX link */ #define XE_AUTONEG_FAIL 4 /* Autonegotiation failed */ /* * Prototypes start here */ static int xe_probe (struct isa_device *dev); static int xe_card_init (struct pccard_devinfo *devi); static int xe_attach (struct isa_device *dev); static void xe_init (void *xscp); static void xe_start (struct ifnet *ifp); static int xe_ioctl (struct ifnet *ifp, u_long command, caddr_t data); static int xe_card_intr (struct pccard_devinfo *devi); static void xe_watchdog (struct ifnet *ifp); static int xe_media_change (struct ifnet *ifp); static void xe_media_status (struct ifnet *ifp, struct ifmediareq *mrp); 
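/*
 * A note on the CISTPL_* macros above: tuple data read from attribute
 * memory is only meaningful in every other byte, which is why
 * CISTPL_DATA() doubles its index and why xe_card_init() steps through
 * the CIS by ((CISTPL_LEN(buf) + 2) << 1).  A stand-alone sketch of
 * that walk over a raw attribute-memory image (hypothetical sk_* name,
 * print-only, no driver state involved):
 */
#if 0	/* illustrative sketch only, not part of this driver */
#include <stdio.h>
#include <stdint.h>

static void
sk_walk_cis(const uint8_t *buf, size_t bufsize)
{
	size_t off = 0;

	while (off + 4 <= bufsize) {
		uint8_t type = buf[off];		/* CISTPL_TYPE() */
		uint8_t len = buf[off + 2];		/* CISTPL_LEN() */

		if (type == 0xff || len == 0xff)
			break;				/* end of chain */
		printf("tuple 0x%02x, %u data bytes\n", type, (unsigned)len);
		off += ((size_t)len + 2) << 1;		/* type, len, data */
	}
}
#endif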
static timeout_t xe_setmedia; static void xe_hard_reset (struct xe_softc *scp); static void xe_soft_reset (struct xe_softc *scp); static void xe_stop (struct xe_softc *scp); static void xe_enable_intr (struct xe_softc *scp); static void xe_disable_intr (struct xe_softc *scp); static void xe_setmulti (struct xe_softc *scp); static void xe_setaddrs (struct xe_softc *scp); static int xe_pio_write_packet (struct xe_softc *scp, struct mbuf *mbp); static void xe_card_unload (struct pccard_devinfo *devi); static u_int32_t xe_compute_crc (u_int8_t *data, int len); static int xe_compute_hashbit (u_int32_t crc); /* * MII functions */ static void xe_mii_sync (struct xe_softc *scp); static int xe_mii_init (struct xe_softc *scp); static void xe_mii_send (struct xe_softc *scp, u_int32_t bits, int cnt); static int xe_mii_readreg (struct xe_softc *scp, struct xe_mii_frame *frame); static int xe_mii_writereg (struct xe_softc *scp, struct xe_mii_frame *frame); static u_int16_t xe_phy_readreg (struct xe_softc *scp, u_int16_t reg); static void xe_phy_writereg (struct xe_softc *scp, u_int16_t reg, u_int16_t data); /* * Debug functions */ #ifdef XE_DEBUG #define XE_REG_DUMP(scp) xe_reg_dump((scp)) #define XE_MII_DUMP(scp) xe_mii_dump((scp)) static void xe_reg_dump (struct xe_softc *scp); static void xe_mii_dump (struct xe_softc *scp); #else #define XE_REG_DUMP(scp) #define XE_MII_DUMP(scp) #endif #if NAPM > 0 /* * APM hook functions */ static int xe_suspend (void *xunit); static int xe_resume (void *xunit); #endif /* NAPM > 0 */ /* * PCMCIA driver hooks */ #ifdef PCCARD_MODULE PCCARD_MODULE(xe, xe_card_init, xe_card_unload, xe_card_intr, 0, net_imask); #else static struct pccard_device xe_info = { /* For pre 3.1-STABLE code */ "xe", xe_card_init, xe_card_unload, xe_card_intr, 0, &net_imask }; DATA_SET(pccarddrv_set, xe_info); #endif /* PCCARD_MODULE */ /* * ISA driver hooks. I'd like to do without these but the kernel config stuff * seems to require them. */ struct isa_driver xedriver = { xe_probe, xe_attach, "xe" }; /* * ISA probe routine. * All of the supported devices are PCMCIA cards. I have no idea if it's even * possible to successfully probe/attach these at boot time (pccardd normally * does a lot of setup work) so I don't even bother trying. 
*/ static int xe_probe (struct isa_device *dev) { #ifdef XE_DEBUG printf("xe%d: probe\n", dev->id_unit); #endif bzero(sca, MAXSLOT * sizeof(sca[0])); return 0; } /* * Two routines to read from/write to the attribute memory * the write portion is used only for fixing up the RealPort cards, * the reader portion was needed for debugging info, and duplicated some * code in xe_card_init(), so it appears here instead with suitable * modifications to xe_card_init() * -aDe Lovett */ static int xe_memwrite(struct pccard_devinfo *devi, off_t offset, u_char byte) { struct iovec iov; struct uio uios; iov.iov_base = &byte; iov.iov_len = sizeof(byte); uios.uio_iov = &iov; uios.uio_iovcnt = 1; uios.uio_offset = offset; uios.uio_resid = sizeof(byte); uios.uio_segflg = UIO_SYSSPACE; uios.uio_rw = UIO_WRITE; uios.uio_procp = 0; #if 0 /* THIS IS BOGUS */ return cdevsw[CARD_MAJOR]->d_write(makedev(CARD_MAJOR, devi->slt->slotnum), &uios, 0); #else return (-1); #endif } static int xe_memread(struct pccard_devinfo *devi, off_t offset, u_char *buf, int size) { struct iovec iov; struct uio uios; iov.iov_base = buf; iov.iov_len = size; uios.uio_iov = &iov; uios.uio_iovcnt = 1; uios.uio_offset = offset; uios.uio_resid = size; uios.uio_segflg = UIO_SYSSPACE; uios.uio_rw = UIO_READ; uios.uio_procp = 0; #if 0 /* THIS IS BOGUS */ return cdevsw[CARD_MAJOR]->d_read(makedev(CARD_MAJOR, devi->slt->slotnum), &uios, 0); #else return (-1); #endif } /* * Hacking for RealPort cards */ static int xe_cem56fix(struct xe_softc *scp) { struct pccard_devinfo *devi; struct slot *slt; struct slot_ctrl *ctrl; int ioport, fail; /* initialise a few variables */ devi = scp->crd; slt = devi->slt; ctrl = slt->ctrl; /* allocate a new I/O slot for the ethernet */ /* XXX: ctrl->mapio() always appears to return 0 (success), so * this may cause problems if another device is listening * on 0x300 already. In this case, you should choose a * known free I/O port address in the kernel config line * for the driver. It will be picked up here and used * instead of the autodetected value. */ slt->io[1].window = 1; slt->io[1].flags = IODF_WS|IODF_16BIT|IODF_ZEROWS|IODF_ACTIVE; slt->io[1].size = 0x10; #ifdef XE_IOBASE printf( "xe%d: user requested ioport 0x%x\n", scp->unit, XE_IOBASE ); ioport = XE_IOBASE; slt->io[1].start = ioport; fail = ctrl->mapio(slt, 1); #else for (ioport = 0x300; ioport < 0x400; ioport += 0x10) { slt->io[1].start = ioport; if ((fail = ctrl->mapio( slt, 1 )) == 0) break; } #endif /* did we find one? */ if (fail) { printf( "xe%d: xe_cem56fix: no free address space\n", scp->unit ); return -1; } /* munge the id_iobase entry for use by the rest of the driver */ #if XE_DEBUG > 1 printf( "xe%d: using 0x%x for RealPort ethernet\n", scp->unit, ioport ); #endif scp->dev->id_iobase = ioport; scp->dev->id_alive = 0x10; /* magic to set up the ethernet */ xe_memwrite( devi, DINGO_ECOR, DINGO_ECOR_IRQ_LEVEL|DINGO_ECOR_INT_ENABLE| DINGO_ECOR_IOB_ENABLE|DINGO_ECOR_ETH_ENABLE ); xe_memwrite( devi, DINGO_EBAR0, ioport & 0xff ); xe_memwrite( devi, DINGO_EBAR1, (ioport >> 8) & 0xff ); xe_memwrite( devi, DINGO_DCOR0, DINGO_DCOR0_SF_INT ); xe_memwrite( devi, DINGO_DCOR1, DINGO_DCOR1_INT_LEVEL|DINGO_DCOR1_EEDIO ); xe_memwrite( devi, DINGO_DCOR2, 0x00 ); xe_memwrite( devi, DINGO_DCOR3, 0x00 ); xe_memwrite( devi, DINGO_DCOR4, 0x00 ); /* success! */ return 0; } /* * PCMCIA probe routine. * Probe and identify the device. Called by the slot manager when the card is * inserted or the machine wakes up from suspend mode. 
Assmes that the slot * structure has been initialised already. */ static int xe_card_init(struct pccard_devinfo *devi) { struct xe_softc *scp; struct isa_device *dev; u_char buf[CISTPL_BUFSIZE]; u_char ver_str[CISTPL_BUFSIZE>>1]; off_t offs; int unit, success, rc, i; unit = devi->isahd.id_unit; scp = sca[unit]; dev = &devi->isahd; success = 0; #ifdef XE_DEBUG printf("xe: Probing for unit %d\n", unit); #endif /* Check that unit number is OK */ if (unit > MAXSLOT) { printf("xe%d: bad unit\n", unit); return (ENODEV); } /* Don't attach an active device */ if (scp && !scp->gone) { printf("xe%d: already attached\n", unit); return (EBUSY); } /* Allocate per-instance storage */ if (!scp) { if ((scp = malloc(sizeof(*scp), M_DEVBUF, M_NOWAIT)) == NULL) { printf("xe%d: failed to allocage driver storage\n", unit); return (ENOMEM); } bzero(scp, sizeof(*scp)); } /* Re-attach an existing device */ if (scp->gone) { scp->gone = 0; return 0; } /* Grep through CIS looking for relevant tuples */ offs = 0; do { u_int16_t vendor; u_int8_t rev, media, prod; /* * Read tuples one at a time into buf. Sucks, but it only happens once. * XXX - This assumes that attribute has been mapped by pccardd, which * XXX - seems to be the default situation. If not, we're well and truly * XXX - FUBAR. This is a general PCCARD problem, not our fault :) */ if ((rc = xe_memread( devi, offs, buf, CISTPL_BUFSIZE )) == 0) { switch (CISTPL_TYPE(buf)) { case 0x15: /* Grab version string (needed to ID some weird CE2's) */ #if XE_DEBUG > 1 printf("xe%d: Got version string (0x15)\n", unit); #endif for (i = 0; i < CISTPL_LEN(buf); ver_str[i] = CISTPL_DATA(buf, i++)); ver_str[i] = '\0'; ver_str[(CISTPL_BUFSIZE>>1) - 1] = CISTPL_LEN(buf); success++; break; case 0x20: /* Figure out what type of card we have */ #if XE_DEBUG > 1 printf("xe%d: Got card ID (0x20)\n", unit); #endif vendor = CISTPL_DATA(buf, 0) + (CISTPL_DATA(buf, 1) << 8); rev = CISTPL_DATA(buf, 2); media = CISTPL_DATA(buf, 3); prod = CISTPL_DATA(buf, 4); switch (vendor) { /* Get vendor ID */ case 0x0105: scp->vendor = "Xircom"; break; case 0x0138: case 0x0183: scp->vendor = "Compaq"; break; case 0x0089: scp->vendor = "Intel"; break; default: scp->vendor = "Unknown"; } if (!((prod & 0x40) && (media & 0x01))) { #if XE_DEBUG > 1 printf("xe%d: Not a PCMCIA Ethernet card!\n", unit); #endif rc = ENODEV; /* Not a PCMCIA Ethernet device */ } else { if (media & 0x10) { /* Ethernet/modem cards */ #if XE_DEBUG > 1 printf("xe%d: Card is Ethernet/modem combo\n", unit); #endif scp->modem = 1; switch (prod & 0x0f) { case 1: scp->card_type = "CEM"; break; case 2: scp->ce2 = 1; scp->card_type = "CEM2"; break; case 3: scp->ce2 = 1; scp->card_type = "CEM3"; break; case 4: scp->ce2 = 1; scp->card_type = "CEM33"; break; case 5: scp->mohawk = 1; scp->card_type = "CEM56M"; break; case 6: case 7: /* Some kind of RealPort card */ scp->mohawk = 1; scp->dingo = 1; scp->card_type = "CEM56"; break; default: rc = ENODEV; } } else { /* Ethernet-only cards */ #if XE_DEBUG > 1 printf("xe%d: Card is Ethernet only\n", unit); #endif switch (prod & 0x0f) { case 1: scp->card_type = "CE"; break; case 2: scp->ce2 = 1; scp->card_type = "CE2"; break; case 3: scp->mohawk = 1; scp->card_type = "CE3"; break; default: rc = ENODEV; } } } success++; break; case 0x22: /* Get MAC address */ if ((CISTPL_LEN(buf) == 8) && (CISTPL_DATA(buf, 0) == 0x04) && (CISTPL_DATA(buf, 1) == ETHER_ADDR_LEN)) { #if XE_DEBUG > 1 printf("xe%d: Got MAC address (0x22)\n", unit); #endif for (i = 0; i < ETHER_ADDR_LEN; scp->arpcom.ac_enaddr[i] = 
CISTPL_DATA(buf, i+2), i++); } success++; break; default: } } /* Skip to next tuple */ offs += ((CISTPL_LEN(buf) + 2) << 1); } while ((CISTPL_TYPE(buf) != 0xff) && (CISTPL_LEN(buf) != 0xff) && (rc == 0)); /* Die now if something went wrong above */ if ((rc != 0) || (success < 3)) { free(scp, M_DEVBUF); return rc; } /* Check for certain strange CE2's that look like CE's */ if (strcmp(scp->card_type, "CE") == 0) { u_char *str = ver_str; #if XE_DEBUG > 1 printf("xe%d: Checking for weird CE2 string\n", unit); #endif str += strlen(str) + 1; /* Skip forward to 3rd version string */ str += strlen(str) + 1; str += strlen(str) + 1; for (i = 0; i < strlen(str) - 2; i++) { if (bcmp(&str[i], "CE2", 3) ==0) { /* Look for "CE2" string */ scp->card_type = "CE2"; } } } /* Reject unsupported cards */ if (strcmp(scp->card_type, "CE") == 0 || strcmp(scp->card_type, "CEM") == 0) { printf("xe%d: Sorry, your %s card is not supported :(\n", unit, scp->card_type); free(scp, M_DEVBUF); return ENODEV; } /* Fill in some private data */ sca[unit] = scp; scp->dev = &devi->isahd; scp->crd = devi; scp->ifp = &scp->arpcom.ac_if; scp->ifm = &scp->ifmedia; scp->unit = unit; scp->autoneg_status = 0; /* Hack RealPorts into submission */ if (scp->dingo && xe_cem56fix(scp) < 0) { printf( "xe%d: Unable to fix your RealPort\n", unit ); sca[unit] = 0; free(scp, M_DEVBUF); return ENODEV; } /* Hopefully safe to read this here */ XE_SELECT_PAGE(4); scp->version = XE_INB(XE_BOV); /* Attempt to attach the device */ if (!xe_attach(scp->dev)) { sca[unit] = 0; free(scp, M_DEVBUF); return ENXIO; } #if NAPM > 0 /* Establish APM hooks once device attached */ scp->suspend_hook.ah_name = "xe_suspend"; scp->suspend_hook.ah_fun = xe_suspend; scp->suspend_hook.ah_arg = (void *)unit; scp->suspend_hook.ah_order = APM_MIN_ORDER; apm_hook_establish(APM_HOOK_SUSPEND, &scp->suspend_hook); scp->resume_hook.ah_name = "xe_resume"; scp->resume_hook.ah_fun = xe_resume; scp->resume_hook.ah_arg = (void *)unit; scp->resume_hook.ah_order = APM_MIN_ORDER; apm_hook_establish(APM_HOOK_RESUME, &scp->resume_hook); #endif /* NAPM > 0 */ /* Success */ return 0; } /* * Attach a device (called when xe_card_init succeeds). Assume that the probe * routine has set up the softc structure correctly and that we can trust the * unit number. */ static int xe_attach (struct isa_device *dev) { struct xe_softc *scp = sca[dev->id_unit]; int i; #ifdef XE_DEBUG printf("xe%d: attach\n", scp->unit); #endif /* Initialise the ifnet structure */ if (!scp->ifp->if_name) { scp->ifp->if_softc = scp; scp->ifp->if_name = "xe"; scp->ifp->if_unit = scp->unit; scp->ifp->if_timer = 0; scp->ifp->if_flags = (IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST); scp->ifp->if_linkmib = &scp->mibdata; scp->ifp->if_linkmiblen = sizeof scp->mibdata; scp->ifp->if_output = ether_output; scp->ifp->if_start = xe_start; scp->ifp->if_ioctl = xe_ioctl; scp->ifp->if_watchdog = xe_watchdog; scp->ifp->if_init = xe_init; scp->ifp->if_snd.ifq_maxlen = IFQ_MAXLEN; } /* Initialise the ifmedia structure */ ifmedia_init(scp->ifm, 0, xe_media_change, xe_media_status); callout_handle_init(&scp->chand); /* * Fill in supported media types. Some cards _do_ support full duplex * operation, but this driver doesn't, yet. Therefore we leave those modes * out of the list. We support some form of autoselection in all cases. 
*/ if (scp->mohawk) { ifmedia_add(scp->ifm, IFM_ETHER|IFM_100_TX, 0, NULL); ifmedia_add(scp->ifm, IFM_ETHER|IFM_10_T, 0, NULL); } else { ifmedia_add(scp->ifm, IFM_ETHER|IFM_10_T, 0, NULL); ifmedia_add(scp->ifm, IFM_ETHER|IFM_10_2, 0, NULL); } ifmedia_add(scp->ifm, IFM_ETHER|IFM_AUTO, 0, NULL); /* Default is to autoselect best supported media type */ ifmedia_set(scp->ifm, IFM_ETHER|IFM_AUTO); /* Print some useful information */ printf("\n"); printf("xe%d: %s %s, bonding version %#x%s%s\n", scp->unit, scp->vendor, scp->card_type, scp->version, scp->mohawk ? ", 100Mbps capable" : "", scp->modem ? ", with modem" : ""); if (scp->mohawk) { XE_SELECT_PAGE(0x10); printf("xe%d: DingoID = %#x, RevisionID = %#x, VendorID = %#x\n", scp->unit, XE_INW(XE_DINGOID), XE_INW(XE_RevID), XE_INW(XE_VendorID)); } if (scp->ce2) { XE_SELECT_PAGE(0x45); printf("xe%d: CE2 version = %#x\n", scp->unit, XE_INB(XE_REV)); } /* Print MAC address */ printf("xe%d: Ethernet address %02x", scp->unit, scp->arpcom.ac_enaddr[0]); for (i = 1; i < ETHER_ADDR_LEN; i++) { printf(":%02x", scp->arpcom.ac_enaddr[i]); } printf("\n"); /* Attach the interface */ if_attach(scp->ifp); ether_ifattach(scp->ifp); #if NBPF > 0 /* If BPF is in the kernel, call the attach for it */ #if XE_DEBUG > 1 printf("xe%d: BPF listener attached\n", scp->unit); #endif bpfattach(scp->ifp, DLT_EN10MB, sizeof(struct ether_header)); #endif /* Done */ return 1; } /* * Initialize device. Completes the reset procedure on the card and starts * output. If there's an autonegotiation in progress we DON'T do anything; * the media selection code will call us again when it's done. */ static void xe_init(void *xscp) { struct xe_softc *scp = xscp; int s; #ifdef XE_DEBUG printf("xe%d: init\n", scp->unit); #endif if (scp->gone) return; if (TAILQ_EMPTY(&scp->ifp->if_addrhead)) return; /* Reset transmitter flags */ scp->tx_queued = 0; scp->tx_tpr = 0; scp->tx_collisions = 0; scp->ifp->if_timer = 0; s = splimp(); XE_SELECT_PAGE(0x42); XE_OUTB(XE_SWC0, 0x20); /* Disable source insertion (WTF is that?) */ /* * Set the 'local memory dividing line' -- splits the 32K card memory into * 8K for transmit buffers and 24K for receive. This is done automatically * on newer revision cards. */ if (scp->srev != 1) { XE_SELECT_PAGE(2); XE_OUTW(XE_RBS, 0x2000); } /* Set up multicast addresses */ xe_setmulti(scp); /* Fix the data offset register -- reset leaves it off-by-one */ XE_SELECT_PAGE(0); XE_OUTW(XE_DO, 0x2000); /* * Set MAC interrupt masks and clear status regs. The bit names are direct * from the Linux code; I have no idea what most of them do. */ XE_SELECT_PAGE(0x40); /* Bit 7..0 */ XE_OUTB(XE_RX0Msk, 0xff); /* ROK, RAB, rsv, RO, CRC, AE, PTL, MP */ XE_OUTB(XE_TX0Msk, 0xff); /* TOK, TAB, SQE, LL, TU, JAB, EXC, CRS */ XE_OUTB(XE_TX0Msk+1, 0xb0); /* rsv, rsv, PTD, EXT, rsv, rsv, rsv, rsv */ XE_OUTB(XE_RST0, 0x00); /* ROK, RAB, REN, RO, CRC, AE, PTL, MP */ XE_OUTB(XE_TXST0, 0x00); /* TOK, TAB, SQE, LL, TU, JAB, EXC, CRS */ XE_OUTB(XE_TXST1, 0x00); /* TEN, rsv, PTD, EXT, retry_counter:4 */ /* * Check for an in-progress autonegotiation. If one is active, just set * IFF_RUNNING and return. The media selection code will call us again when * it's done. 
*/ if (scp->autoneg_status) { scp->ifp->if_flags |= IFF_RUNNING; } else { /* Enable receiver, put MAC online */ XE_SELECT_PAGE(0x40); XE_OUTB(XE_CMD0, XE_CMD0_RX_ENABLE|XE_CMD0_ONLINE); /* Set up IMR, enable interrupts */ xe_enable_intr(scp); /* Attempt to start output */ scp->ifp->if_flags |= IFF_RUNNING; scp->ifp->if_flags &= ~IFF_OACTIVE; xe_start(scp->ifp); } (void)splx(s); } /* * Start output on interface. We make two assumptions here: * 1) that the current priority is set to splimp _before_ this code * is called *and* is returned to the appropriate priority after * return * 2) that the IFF_OACTIVE flag is checked before this code is called * (i.e. that the output part of the interface is idle) */ static void xe_start(struct ifnet *ifp) { struct xe_softc *scp = ifp->if_softc; struct mbuf *mbp; if (scp->gone) return; /* * Loop while there are packets to be sent, and space to send them. */ while (1) { IF_DEQUEUE(&ifp->if_snd, mbp); /* Suck a packet off the send queue */ if (mbp == NULL) { /* * We are using the !OACTIVE flag to indicate to the outside world that * we can accept an additional packet rather than that the transmitter * is _actually_ active. Indeed, the transmitter may be active, but if * we haven't filled all the buffers with data then we still want to * accept more. */ ifp->if_flags &= ~IFF_OACTIVE; return; } if (xe_pio_write_packet(scp, mbp) != 0) { IF_PREPEND(&ifp->if_snd, mbp); /* Push the packet back onto the queue */ ifp->if_flags |= IFF_OACTIVE; return; } #if NBPF > 0 /* Tap off here if there is a bpf listener */ if (ifp->if_bpf) { #if XE_DEBUG > 1 printf("xe%d: sending output packet to BPF\n", scp->unit); #endif bpf_mtap(ifp, mbp); } #endif /* NBPF > 0 */ ifp->if_timer = 5; /* In case we don't hear from the card again */ scp->tx_queued++; m_freem(mbp); } } /* * Process an ioctl request. Adapted from the ed driver. */ static int xe_ioctl (register struct ifnet *ifp, u_long command, caddr_t data) { struct xe_softc *scp; int s, error; scp = ifp->if_softc; error = 0; if (scp->gone) { return ENXIO; } s = splimp(); switch (command) { case SIOCSIFADDR: case SIOCGIFADDR: case SIOCSIFMTU: error = ether_ioctl(ifp, command, data); break; case SIOCSIFFLAGS: /* * If the interface is marked up and stopped, then start it. If it is * marked down and running, then stop it. */ if (ifp->if_flags & IFF_UP) { if (!(ifp->if_flags & IFF_RUNNING)) { xe_hard_reset(scp); xe_setmedia(scp); xe_init(scp); } } else { if (ifp->if_flags & IFF_RUNNING) xe_stop(scp); } case SIOCADDMULTI: case SIOCDELMULTI: /* * Multicast list has (maybe) changed; set the hardware filter * accordingly. This also serves to deal with promiscuous mode if we have * a BPF listener active. */ xe_setmulti(scp); error = 0; break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: /* * Someone wants to get/set media options. */ error = ifmedia_ioctl(ifp, (struct ifreq *)data, &scp->ifmedia, command); break; default: error = EINVAL; } (void)splx(s); return error; } /* * Card interrupt handler: should return true if the interrupt was for us, in * case we are sharing our IRQ line with other devices (this will probably be * the case for multifunction cards). * * This function is probably more complicated than it needs to be, as it * attempts to deal with the case where multiple packets get sent between * interrupts. This is especially annoying when working out the collision * stats. Not sure whether this case ever really happens or not (maybe on a * slow/heavily loaded machine?) so it's probably best to leave this like it * is. 
* * Note that the crappy PIO used to get packets on and off the card means that * you will spend a lot of time in this routine -- I can get my P150 to spend * 90% of its time servicing interrupts if I really hammer the network. Could * fix this, but then you'd start dropping/losing packets. The moral of this * story? If you want good network performance _and_ some cycles left over to * get your work done, don't buy a Xircom card. Or convince them to tell me * how to do memory-mapped I/O :) */ static int xe_card_intr(struct pccard_devinfo *devi) { struct xe_softc *scp; struct ifnet *ifp; int unit, result; u_int16_t rx_bytes, rxs, txs; u_int8_t psr, isr, esr, rsr; unit = devi->isahd.id_unit; scp = sca[unit]; ifp = &scp->arpcom.ac_if; rx_bytes = 0; /* Bytes received on this interrupt */ result = 0; /* Set true if the interrupt is for us */ if (scp->gone) return 0; if (scp->mohawk) { XE_OUTB(XE_CR, 0); /* Disable interrupts */ } psr = XE_INB(XE_PR); /* Stash the current register page */ /* * Read ISR to see what caused this interrupt. Note that this clears the * ISR on CE2 type cards. */ if ((isr = XE_INB(XE_ISR)) && isr != 0xff) { result = 1; /* This device did generate an int */ esr = XE_INB(XE_ESR); /* Read the other status registers */ XE_SELECT_PAGE(0x40); rxs = XE_INB(XE_RST0); XE_OUTB(XE_RST0, ~rxs & 0xff); txs = XE_INB(XE_TXST0); txs |= XE_INB(XE_TXST1) << 8; XE_OUTB(XE_TXST0, 0); XE_OUTB(XE_TXST1, 0); XE_SELECT_PAGE(0); #if XE_DEBUG > 2 printf("xe%d: ISR=%#2.2x ESR=%#2.2x RST=%#2.2x TXST=%#4.4x\n", unit, isr, esr, rxs, txs); #endif /* * Handle transmit interrupts */ if (isr & XE_ISR_TX_PACKET) { u_int8_t new_tpr, sent; if ((new_tpr = XE_INB(XE_TPR)) < scp->tx_tpr) /* Update packet count */ sent = (0xff - scp->tx_tpr) + new_tpr; /* TPR rolled over */ else sent = new_tpr - scp->tx_tpr; if (sent > 0) { /* Packets sent since last interrupt */ scp->tx_tpr = new_tpr; scp->tx_queued -= sent; ifp->if_opackets += sent; ifp->if_collisions += scp->tx_collisions; /* * Collision stats are a PITA. If multiples frames have been sent, we * distribute any outstanding collision count equally amongst them. * However, if we're missing interrupts we're quite likely to also * miss some collisions; thus the total count will be off anyway. * Likewise, if we miss a frame dropped due to excessive collisions * any outstanding collisions count will be held against the next * frame to be successfully sent. Hopefully it averages out in the * end! * XXX - This will screw up if tx_collisions/sent > 14. FIX IT! */ switch (scp->tx_collisions) { case 0: break; case 1: scp->mibdata.dot3StatsSingleCollisionFrames++; scp->mibdata.dot3StatsCollFrequencies[0]++; break; default: if (sent == 1) { scp->mibdata.dot3StatsMultipleCollisionFrames++; scp->mibdata.dot3StatsCollFrequencies[scp->tx_collisions-1]++; } else { /* Distribute across multiple frames */ scp->mibdata.dot3StatsMultipleCollisionFrames += sent; scp->mibdata. dot3StatsCollFrequencies[scp->tx_collisions/sent] += sent - scp->tx_collisions%sent; scp->mibdata. 
dot3StatsCollFrequencies[scp->tx_collisions/sent + 1] += scp->tx_collisions%sent; } } scp->tx_collisions = 0; } ifp->if_timer = 0; ifp->if_flags &= ~IFF_OACTIVE; } if (txs & 0x0002) { /* Excessive collisions (packet dropped) */ ifp->if_collisions += 16; ifp->if_oerrors++; scp->tx_collisions = 0; scp->mibdata.dot3StatsExcessiveCollisions++; scp->mibdata.dot3StatsMultipleCollisionFrames++; scp->mibdata.dot3StatsCollFrequencies[15]++; XE_OUTB(XE_CR, XE_CR_RESTART_TX); } if (txs & 0x0040) /* Transmit aborted -- probably collisions */ scp->tx_collisions++; /* * Handle receive interrupts */ while ((esr = XE_INB(XE_ESR)) & XE_ESR_FULL_PACKET_RX) { if ((rsr = XE_INB(XE_RSR)) & XE_RSR_RX_OK) { struct ether_header *ehp; struct mbuf *mbp; u_int16_t len; len = XE_INW(XE_RBC); if (len == 0) continue; #if 0 /* * Limit the amount of time we spend in this loop, dropping packets if * necessary. The Linux code does this with considerably more * finesse, adjusting the threshold dynamically. */ if ((rx_bytes += len) > 22000) { ifp->if_iqdrops++; scp->mibData.dot3StatsMissedFrames++; XE_OUTW(XE_DO, 0x8000); continue; } #endif if (len & 0x01) len++; MGETHDR(mbp, M_DONTWAIT, MT_DATA); /* Allocate a header mbuf */ if (mbp != NULL) { mbp->m_pkthdr.rcvif = ifp; mbp->m_pkthdr.len = mbp->m_len = len; /* * If the mbuf header isn't big enough for the packet, attach an * mbuf cluster to hold it. The +2 is to allow for the nasty little * alignment hack below. */ if (len + 2 > MHLEN) { MCLGET(mbp, M_DONTWAIT); if ((mbp->m_flags & M_EXT) == 0) { m_freem(mbp); mbp = NULL; } } } if (mbp != NULL) { /* * The Ethernet header is 14 bytes long; thus the actual packet data * won't be 32-bit aligned when it's dumped into the mbuf. We * offset everything by 2 bytes to fix this. Apparently the * alignment is important for NFS, damn its eyes. */ mbp->m_data += 2; ehp = mtod(mbp, struct ether_header *); /* * Now get the packet, including the Ethernet header and trailer (?) * We use programmed I/O, because we don't know how to do shared * memory with these cards. So yes, it's real slow, and heavy on * the interrupts (CPU on my P150 maxed out at ~950KBps incoming). */ if (scp->srev == 0) { /* Workaround a bug in old cards */ u_short rhs; XE_SELECT_PAGE(5); rhs = XE_INW(XE_RHSA); XE_SELECT_PAGE(0); rhs += 3; /* Skip control info */ if (rhs >= 0x8000) rhs = 0; if (rhs + len > 0x8000) { int i; /* * XXX - This i-- seems very wrong, but it's what the Linux guys * XXX - do. Need someone with an old CE2 to test this for me. * XXX - 99/3/28: Changed the first i-- to an i++, maybe that'll * XXX - fix it? It seems as though the previous version would * XXX - have caused an infinite loop (what, another one?). */ for (i = 0; i < len; i++, rhs++) { ((char *)ehp)[i] = XE_INB(XE_EDP); if (rhs == 0x8000) { rhs = 0; i--; } } } else insw(scp->dev->id_iobase+XE_EDP, ehp, len >> 1); } else insw(scp->dev->id_iobase+XE_EDP, ehp, len >> 1); #if NBPF > 0 /* * Check if there's a BPF listener on this interface. If so, hand * off the raw packet to bpf. */ if (ifp->if_bpf) { #if XE_DEBUG > 1 printf("xe%d: passing input packet to BPF\n", scp->unit); #endif bpf_mtap(ifp, mbp); /* * Note that the interface cannot be in promiscuous mode if there * are no BPF listeners. And if we are in promiscuous mode, we * have to check if this packet is really ours. 
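 * (The test below is a best guess at the intent: in promiscuous mode we drop
 * the frame here if its destination address doesn't match our own station
 * address and the receive status flags it as matched on the physical address
 * -- presumably XE_RSR_PHYS_PACKET is what distinguishes unicast frames from
 * multicast/broadcast, which we always want to keep.)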
*/ if ((ifp->if_flags & IFF_PROMISC) && bcmp(ehp->ether_dhost, scp->arpcom.ac_enaddr, sizeof(ehp->ether_dhost)) != 0 && (rsr & XE_RSR_PHYS_PACKET)) { m_freem(mbp); mbp = NULL; } } #endif /* NBPF > 0 */ if (mbp != NULL) { mbp->m_pkthdr.len = mbp->m_len = len - ETHER_HDR_LEN; mbp->m_data += ETHER_HDR_LEN; /* Strip off Ethernet header */ ether_input(ifp, ehp, mbp); /* Send the packet on its way */ ifp->if_ipackets++; /* Success! */ } XE_OUTW(XE_DO, 0x8000); /* skip_rx_packet command */ } } else if (rsr & XE_RSR_LONG_PACKET) { /* Packet length >1518 bytes */ scp->mibdata.dot3StatsFrameTooLongs++; ifp->if_ierrors++; } else if (rsr & XE_RSR_CRC_ERROR) { /* Bad checksum on packet */ scp->mibdata.dot3StatsFCSErrors++; ifp->if_ierrors++; } else if (rsr & XE_RSR_ALIGN_ERROR) { /* Packet alignment error */ scp->mibdata.dot3StatsAlignmentErrors++; ifp->if_ierrors++; } } if (rxs & 0x10) { /* Receiver overrun */ scp->mibdata.dot3StatsInternalMacReceiveErrors++; ifp->if_ierrors++; XE_OUTB(XE_CR, XE_CR_CLEAR_OVERRUN); } } XE_SELECT_PAGE(psr); /* Restore saved page */ XE_OUTB(XE_CR, XE_CR_ENABLE_INTR); /* Re-enable interrupts */ /* Could force an int here, instead of dropping packets? */ /* XE_OUTB(XE_CR, XE_CR_ENABLE_INTR|XE_CE_FORCE_INTR); */ return result; } /* * Device timeout/watchdog routine. Called automatically if we queue a packet * for transmission but don't get an interrupt within a specified timeout * (usually 5 seconds). When this happens we assume the worst and reset the * card. */ static void xe_watchdog(struct ifnet *ifp) { struct xe_softc *scp = ifp->if_softc; if (scp->gone) return; printf("xe%d: watchdog timeout; resetting card\n", scp->unit); scp->tx_timeouts++; ifp->if_oerrors += scp->tx_queued; xe_stop(scp); xe_hard_reset(scp); xe_setmedia(scp); xe_init(scp); } /* * Change media selection. */ static int xe_media_change(struct ifnet *ifp) { struct xe_softc *scp = ifp->if_softc; #ifdef XE_DEBUG printf("xe%d: media_change\n", ifp->if_unit); #endif if (IFM_TYPE(scp->ifm->ifm_media) != IFM_ETHER) return(EINVAL); /* * Some card/media combos aren't always possible -- filter those out here. */ if ((IFM_SUBTYPE(scp->ifm->ifm_media) == IFM_AUTO || IFM_SUBTYPE(scp->ifm->ifm_media) == IFM_100_TX) && !scp->phy_ok) return (EINVAL); xe_setmedia(scp); return 0; } /* * Return current media selection. */ static void xe_media_status(struct ifnet *ifp, struct ifmediareq *mrp) { #ifdef XE_DEBUG printf("xe%d: media_status\n", ifp->if_unit); #endif mrp->ifm_active = ((struct xe_softc *)ifp->if_softc)->media; return; } /* * Select active media. */ static void xe_setmedia(void *xscp) { struct xe_softc *scp = xscp; u_int16_t bmcr, bmsr, anar, lpar; #ifdef XE_DEBUG printf("xe%d: setmedia\n", scp->unit); #endif /* Cancel any pending timeout */ untimeout(xe_setmedia, scp, scp->chand); xe_disable_intr(scp); /* Select media */ scp->media = IFM_ETHER; switch (IFM_SUBTYPE(scp->ifm->ifm_media)) { case IFM_AUTO: /* Autoselect media */ scp->media = IFM_ETHER|IFM_AUTO; /* * Autoselection is really awful. It goes something like this: * * Wait until the transmitter goes idle (2sec timeout). 
* Reset card * IF a 100Mbit PHY exists * Start NWAY autonegotiation (3.5sec timeout) * IF that succeeds * Select 100baseTX or 10baseT, whichever was detected * ELSE * Reset card * IF a 100Mbit PHY exists * Try to force a 100baseTX link (3sec timeout) * IF that succeeds * Select 100baseTX * ELSE * Disable the PHY * ENDIF * ENDIF * ENDIF * ENDIF * IF nothing selected so far * IF a 100Mbit PHY exists * Select 10baseT * ELSE * Select 10baseT or 10base2, whichever is connected * ENDIF * ENDIF */ switch (scp->autoneg_status) { case XE_AUTONEG_NONE: #if XE_DEBUG > 1 printf("xe%d: Waiting for idle transmitter\n", scp->unit); #endif scp->arpcom.ac_if.if_flags |= IFF_OACTIVE; scp->autoneg_status = XE_AUTONEG_WAITING; scp->chand = timeout(xe_setmedia, scp, hz * 2); return; case XE_AUTONEG_WAITING: xe_soft_reset(scp); if (scp->phy_ok) { #if XE_DEBUG > 1 printf("xe%d: Starting autonegotiation\n", scp->unit); #endif bmcr = xe_phy_readreg(scp, PHY_BMCR); bmcr &= ~(PHY_BMCR_AUTONEGENBL); xe_phy_writereg(scp, PHY_BMCR, bmcr); anar = xe_phy_readreg(scp, PHY_ANAR); anar &= ~(PHY_ANAR_100BT4|PHY_ANAR_100BTXFULL|PHY_ANAR_10BTFULL); anar |= PHY_ANAR_100BTXHALF|PHY_ANAR_10BTHALF; xe_phy_writereg(scp, PHY_ANAR, anar); bmcr |= PHY_BMCR_AUTONEGENBL|PHY_BMCR_AUTONEGRSTR; xe_phy_writereg(scp, PHY_BMCR, bmcr); scp->autoneg_status = XE_AUTONEG_STARTED; scp->chand = timeout(xe_setmedia, scp, hz * 7/2); return; } else { scp->autoneg_status = XE_AUTONEG_FAIL; } break; case XE_AUTONEG_STARTED: bmsr = xe_phy_readreg(scp, PHY_BMSR); lpar = xe_phy_readreg(scp, PHY_LPAR); if (bmsr & (PHY_BMSR_AUTONEGCOMP|PHY_BMSR_LINKSTAT)) { #if XE_DEBUG > 1 printf("xe%d: Autonegotiation complete!\n", scp->unit); #endif /* * XXX - Shouldn't have to do this, but (on my hub at least) the * XXX - transmitter won't work after a successful autoneg. So we see * XXX - what the negotiation result was and force that mode. I'm * XXX - sure there is an easy fix for this. */ if (lpar & PHY_LPAR_100BTXHALF) { xe_phy_writereg(scp, PHY_BMCR, PHY_BMCR_SPEEDSEL); XE_MII_DUMP(scp); XE_SELECT_PAGE(2); XE_OUTB(XE_MSR, XE_INB(XE_MSR) | 0x08); scp->media = IFM_ETHER|IFM_100_TX; scp->autoneg_status = XE_AUTONEG_NONE; } else { /* * XXX - Bit of a hack going on in here. * XXX - This is derived from Ken Hughes patch to the Linux driver * XXX - to make it work with 10Mbit _autonegotiated_ links on CE3B * XXX - cards. What's a CE3B and how's it differ from a plain CE3? * XXX - these are the things we need to find out. */ xe_phy_writereg(scp, PHY_BMCR, 0x0000); XE_SELECT_PAGE(2); /* BEGIN HACK */ XE_OUTB(XE_MSR, XE_INB(XE_MSR) | 0x08); XE_SELECT_PAGE(0x42); XE_OUTB(XE_SWC1, 0x80); scp->media = IFM_ETHER|IFM_10_T; scp->autoneg_status = XE_AUTONEG_NONE; /* END HACK */ /*XE_OUTB(XE_MSR, XE_INB(XE_MSR) & ~0x08);*/ /* Disable PHY? 
*/ /*scp->autoneg_status = XE_AUTONEG_FAIL;*/ } } else { #if XE_DEBUG > 1 printf("xe%d: Autonegotiation failed; trying 100baseTX\n", scp->unit); #endif XE_MII_DUMP(scp); xe_soft_reset(scp); if (scp->phy_ok) { xe_phy_writereg(scp, PHY_BMCR, PHY_BMCR_SPEEDSEL); scp->autoneg_status = XE_AUTONEG_100TX; scp->chand = timeout(xe_setmedia, scp, hz * 3); return; } else { scp->autoneg_status = XE_AUTONEG_FAIL; } } break; case XE_AUTONEG_100TX: (void)xe_phy_readreg(scp, PHY_BMSR); bmsr = xe_phy_readreg(scp, PHY_BMSR); if (bmsr & PHY_BMSR_LINKSTAT) { #if XE_DEBUG > 1 printf("xe%d: Got 100baseTX link!\n", scp->unit); #endif XE_MII_DUMP(scp); XE_SELECT_PAGE(2); XE_OUTB(XE_MSR, XE_INB(XE_MSR) | 0x08); scp->media = IFM_ETHER|IFM_100_TX; scp->autoneg_status = XE_AUTONEG_NONE; } else { #if XE_DEBUG > 1 printf("xe%d: Autonegotiation failed; disabling PHY\n", scp->unit); #endif XE_MII_DUMP(scp); xe_phy_writereg(scp, PHY_BMCR, 0x0000); XE_SELECT_PAGE(2); XE_OUTB(XE_MSR, XE_INB(XE_MSR) & ~0x08); /* Disable PHY? */ scp->autoneg_status = XE_AUTONEG_FAIL; } break; } /* * If we got down here _and_ autoneg_status is XE_AUTONEG_FAIL, then * either autonegotiation failed, or never got started to begin with. In * either case, select a suitable 10Mbit media and hope it works. We * don't need to reset the card again, since it will have been done * already by the big switch above. */ if (scp->autoneg_status == XE_AUTONEG_FAIL) { #if XE_DEBUG > 1 printf("xe%d: Selecting 10baseX\n", scp->unit); #endif if (scp->mohawk) { XE_SELECT_PAGE(0x42); XE_OUTB(XE_SWC1, 0x80); scp->media = IFM_ETHER|IFM_10_T; scp->autoneg_status = XE_AUTONEG_NONE; } else { XE_SELECT_PAGE(4); XE_OUTB(XE_GPR0, 4); DELAY(50000); XE_SELECT_PAGE(0x42); XE_OUTB(XE_SWC1, (XE_INB(XE_ESR) & XE_ESR_MEDIA_SELECT) ? 0x80 : 0xc0); scp->media = IFM_ETHER|((XE_INB(XE_ESR) & XE_ESR_MEDIA_SELECT) ? IFM_10_T : IFM_10_2); scp->autoneg_status = XE_AUTONEG_NONE; } } break; /* * If a specific media has been requested, we just reset the card and * select it (one small exception -- if 100baseTX is requested by there is * no PHY, we fall back to 10baseT operation). */ case IFM_100_TX: /* Force 100baseTX */ xe_soft_reset(scp); if (scp->phy_ok) { #if XE_DEBUG > 1 printf("xe%d: Selecting 100baseTX\n", scp->unit); #endif XE_SELECT_PAGE(0x42); XE_OUTB(XE_SWC1, 0); xe_phy_writereg(scp, PHY_BMCR, PHY_BMCR_SPEEDSEL); XE_SELECT_PAGE(2); XE_OUTB(XE_MSR, XE_INB(XE_MSR) | 0x08); scp->media |= IFM_100_TX; break; } /* FALLTHROUGH */ case IFM_10_T: /* Force 10baseT */ xe_soft_reset(scp); #if XE_DEBUG > 1 printf("xe%d: Selecting 10baseT\n", scp->unit); #endif if (scp->phy_ok) { xe_phy_writereg(scp, PHY_BMCR, 0x0000); XE_SELECT_PAGE(2); XE_OUTB(XE_MSR, XE_INB(XE_MSR) & ~0x08); /* Disable PHY */ } XE_SELECT_PAGE(0x42); XE_OUTB(XE_SWC1, 0x80); scp->media |= IFM_10_T; break; case IFM_10_2: xe_soft_reset(scp); #if XE_DEBUG > 1 printf("xe%d: Selecting 10base2\n", scp->unit); #endif XE_SELECT_PAGE(0x42); XE_OUTB(XE_SWC1, 0xc0); scp->media |= IFM_10_2; break; } /* * Finally, the LEDs are set to match whatever media was chosen and the * transmitter is unblocked. */ #if XE_DEBUG > 1 printf("xe%d: Setting LEDs\n", scp->unit); #endif XE_SELECT_PAGE(2); switch (IFM_SUBTYPE(scp->media)) { case IFM_100_TX: case IFM_10_T: XE_OUTB(XE_LED, 0x3b); if (scp->dingo) XE_OUTB(0x0b, 0x04); /* 100Mbit LED */ break; case IFM_10_2: XE_OUTB(XE_LED, 0x3a); break; } /* Restart output? */ scp->ifp->if_flags &= ~IFF_OACTIVE; xe_init(scp); } /* * Hard reset (power cycle) the card. 
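 * Roughly: the card is powered off through GPR1, left off for ~40ms, then
 * powered back on (Mohawk cards only need the power bit; older cards also get
 * the AIC bit set, whatever that turns out to be) and given another ~40ms to
 * settle before we touch it again.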
*/ static void xe_hard_reset(struct xe_softc *scp) { int s; #ifdef XE_DEBUG printf("xe%d: hard_reset\n", scp->unit); #endif if (scp->gone) return; s = splimp(); /* * Power cycle the card. */ XE_SELECT_PAGE(4); XE_OUTB(XE_GPR1, 0); /* Power off */ DELAY(40000); if (scp->mohawk) XE_OUTB(XE_GPR1, 1); /* And back on again */ else XE_OUTB(XE_GPR1, 5); /* Also set AIC bit, whatever that is */ DELAY(40000); XE_SELECT_PAGE(0); (void)splx(s); } /* * Soft reset the card. Also makes sure that the ML6692 and 10Mbit controller * are powered up, sets the silicon revision number in softc, disables * interrupts and checks for the prescence of a 100Mbit PHY. This should * leave us in a position where we can access the PHY and do media * selection. The function imposes a 0.5s delay while the hardware powers up. */ static void xe_soft_reset(struct xe_softc *scp) { int s; #ifdef XE_DEBUG printf("xe%d: soft_reset\n", scp->unit); #endif if (scp->gone) return; s = splimp(); /* * Reset the card, (again). */ XE_SELECT_PAGE(0); XE_OUTB(XE_CR, XE_CR_SOFT_RESET); DELAY(40000); XE_OUTB(XE_CR, 0); DELAY(40000); if (scp->mohawk) { /* * set GP1 and GP2 as outputs (bits 2 & 3) * set GP1 low to power on the ML6692 (bit 0) * set GP2 high to power on the 10Mhz chip (bit 1) */ XE_SELECT_PAGE(4); XE_OUTB(XE_GPR0, 0x0e); } /* * Wait for everything to wake up. */ DELAY(500000); /* * Get silicon revision number. */ XE_SELECT_PAGE(4); if (scp->mohawk) scp->srev = (XE_INB(XE_BOV) & 0x70) >> 4; else scp->srev = (XE_INB(XE_BOV) & 0x30) >> 4; #ifdef XE_DEBUG printf("xe%d: silicon revision = %d\n", scp->unit, scp->srev); #endif /* * Shut off interrupts. */ xe_disable_intr(scp); /* * Check for PHY. */ if (scp->mohawk) { scp->phy_ok = xe_mii_init(scp); } XE_SELECT_PAGE(0); (void)splx(s); } /* * Take interface offline. This is done by powering down the device, which I * assume means just shutting down the transceiver and Ethernet logic. This * requires a _hard_ reset to recover from, as we need to power up again. */ static void xe_stop(struct xe_softc *scp) { int s; #ifdef XE_DEBUG printf("xe%d: stop\n", scp->unit); #endif if (scp->gone) return; s = splimp(); /* * Shut off interrupts. */ xe_disable_intr(scp); /* * Power down. */ XE_SELECT_PAGE(4); XE_OUTB(XE_GPR1, 0); XE_SELECT_PAGE(0); /* * ~IFF_RUNNING == interface down. */ scp->ifp->if_flags &= ~IFF_RUNNING; scp->ifp->if_flags &= ~IFF_OACTIVE; scp->ifp->if_timer = 0; (void)splx(s); } /* * Enable Ethernet interrupts from the card. */ static void xe_enable_intr(struct xe_softc *scp) { #ifdef XE_DEBUG printf("xe%d: enable_intr\n", scp->unit); #endif XE_SELECT_PAGE(1); XE_OUTB(XE_IMR0, 0xff); /* Unmask everything */ XE_OUTB(XE_IMR1, 0x01); /* Unmask TX underrun detection */ DELAY(1); XE_SELECT_PAGE(0); XE_OUTB(XE_CR, XE_CR_ENABLE_INTR); /* Enable interrupts */ if (scp->modem && !scp->dingo) { /* This bit is just magic */ if (!(XE_INB(0x10) & 0x01)) { XE_OUTB(0x10, 0x11); /* Unmask master int enable bit */ } } } /* * Disable all Ethernet interrupts from the card. */ static void xe_disable_intr(struct xe_softc *scp) { #ifdef XE_DEBUG printf("xe%d: disable_intr\n", scp->unit); #endif XE_SELECT_PAGE(0); XE_OUTB(XE_CR, 0); /* Disable interrupts */ if (scp->modem && !scp->dingo) { /* More magic (does this work?) 
*/ XE_OUTB(0x10, 0x10); /* Mask the master int enable bit */ } XE_SELECT_PAGE(1); XE_OUTB(XE_IMR0, 0); /* Forbid all interrupts */ XE_OUTB(XE_IMR1, 0); XE_SELECT_PAGE(0); } /* * Set up multicast filter and promiscuous mode */ static void xe_setmulti(struct xe_softc *scp) { struct ifnet *ifp; struct ifmultiaddr *maddr; int count; ifp = &scp->arpcom.ac_if; maddr = ifp->if_multiaddrs.lh_first; /* Get length of multicast list */ for (count = 0; maddr != NULL; maddr = maddr->ifma_link.le_next, count++); if ((ifp->if_flags & IFF_PROMISC) || (ifp->if_flags & IFF_ALLMULTI) || (count > 9)) { /* * Go into promiscuous mode if either of the PROMISC or ALLMULTI flags are * set, or if we have been asked to deal with more than 9 multicast * addresses. To do this: set MPE and PME in SWC1 */ XE_SELECT_PAGE(0x42); XE_OUTB(XE_SWC1, 0x06); } else if ((ifp->if_flags & IFF_MULTICAST) && (count > 0)) { /* * Program the filters for up to 9 addresses */ XE_SELECT_PAGE(0x42); XE_OUTB(XE_SWC1, 0x01); XE_SELECT_PAGE(0x40); XE_OUTB(XE_CMD0, XE_CMD0_OFFLINE); /*xe_reg_dump(scp);*/ xe_setaddrs(scp); /*xe_reg_dump(scp);*/ XE_SELECT_PAGE(0x40); XE_OUTB(XE_CMD0, XE_CMD0_RX_ENABLE|XE_CMD0_ONLINE); } else { /* * No multicast operation (default) */ XE_SELECT_PAGE(0x42); XE_OUTB(XE_SWC1, 0); } XE_SELECT_PAGE(0); } /* * Set up all on-chip addresses (for multicast). AFAICS, there are 10 * of these things; the first is our MAC address, the other 9 are mcast * addresses, padded with the MAC address if there aren't enough. * XXX - This doesn't work right, but I'm not sure why yet. We seem to be * XXX - doing much the same as the Linux code, which is weird enough that * XXX - it's probably right (despite my earlier comments to the contrary). */ static void xe_setaddrs(struct xe_softc *scp) { struct ifmultiaddr *maddr; u_int8_t *addr; u_int8_t page, slot, byte, i; maddr = scp->arpcom.ac_if.if_multiaddrs.lh_first; XE_SELECT_PAGE(page = 0x50); for (slot = 0, byte = 8; slot < 10; slot++) { if (slot == 0) addr = (u_int8_t *)(&scp->arpcom.ac_enaddr); else { while (maddr != NULL && maddr->ifma_addr->sa_family != AF_LINK) maddr = maddr->ifma_link.le_next; if (maddr != NULL) addr = LLADDR((struct sockaddr_dl *)maddr->ifma_addr); else addr = (u_int8_t *)(&scp->arpcom.ac_enaddr); } for (i = 0; i < 6; i++, byte++) { #if XE_DEBUG > 2 if (i) printf(":%x", addr[i]); else printf("xe%d: individual addresses %d: %x", scp->unit, slot, addr[0]); #endif if (byte > 15) { page++; byte = 8; XE_SELECT_PAGE(page); } if (scp->mohawk) XE_OUTB(byte, addr[5 - i]); else XE_OUTB(byte, addr[i]); } #if XE_DEBUG > 2 printf("\n"); #endif } XE_SELECT_PAGE(0); } /* * Write an outgoing packet to the card using programmed I/O. 
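 * In outline: total up the length of the mbuf chain, noting any padding
 * needed to reach the Ethernet minimum frame size, check that the card's
 * transmit buffer has room (TRS/TSO), then write the length followed by the
 * data to the data port one word at a time, carrying odd bytes across mbuf
 * boundaries. Returns non-zero if the packet wouldn't fit, so the caller can
 * requeue it.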
*/ static int xe_pio_write_packet(struct xe_softc *scp, struct mbuf *mbp) { struct mbuf *mbp2; u_int16_t len, pad, free, ok; u_int8_t *data; u_int8_t savebyte[2], wantbyte; /* Get total packet length */ for (len = 0, mbp2 = mbp; mbp2 != NULL; len += mbp2->m_len, mbp2 = mbp2->m_next); /* Packets < minimum length may need to be padded out */ pad = 0; if (len < ETHER_MIN_LEN - ETHER_CRC_LEN) { pad = (ETHER_MIN_LEN - ETHER_CRC_LEN - len + 1) >> 1; len = ETHER_MIN_LEN - ETHER_CRC_LEN; } /* Check transmit buffer space */ XE_SELECT_PAGE(0); XE_OUTW(XE_TRS, len+2); free = XE_INW(XE_TSO); ok = free & 0x8000; free &= 0x7fff; if (free <= len + 2) return 1; /* Send packet length to card */ XE_OUTW(XE_EDP, len); /* * Write packet to card using PIO (code stolen from the ed driver) */ wantbyte = 0; while (mbp != NULL) { len = mbp->m_len; if (len > 0) { data = mtod(mbp, caddr_t); if (wantbyte) { /* Finish the last word */ savebyte[1] = *data; XE_OUTW(XE_EDP, *(u_short *)savebyte); data++; len--; wantbyte = 0; } if (len > 1) { /* Output contiguous words */ outsw(scp->dev->id_iobase+XE_EDP, data, len >> 1); data += len & ~1; len &= 1; } if (len == 1) { /* Save last byte, if necessary */ savebyte[0] = *data; wantbyte = 1; } } mbp = mbp->m_next; } if (wantbyte) /* Last byte for odd-length packets */ XE_OUTW(XE_EDP, *(u_short *)savebyte); /* * For CE3 cards, just tell 'em to send -- apparently the card will pad out * short packets with random cruft. Otherwise, write nonsense words to fill * out the packet. I guess it is then sent automatically (?) */ if (scp->mohawk) XE_OUTB(XE_CR, XE_CR_TX_PACKET|XE_CR_ENABLE_INTR); else while (pad > 0) { XE_OUTW(XE_EDP, 0xdead); pad--; } return 0; } /* * The device entry is being removed, probably because someone ejected the * card. The interface should have been brought down manually before calling * this function; if not you may well lose packets. In any case, I shut down * the card and the interface, and hope for the best. The 'gone' flag is set, * so hopefully no-one else will try to access the missing card. */ static void xe_card_unload(struct pccard_devinfo *devi) { struct xe_softc *scp; struct ifnet *ifp; int unit; unit = devi->isahd.id_unit; scp = sca[unit]; ifp = &scp->arpcom.ac_if; if (scp->gone) { printf("xe%d: already unloaded\n", unit); return; } if_down(ifp); ifp->if_flags &= ~(IFF_RUNNING|IFF_OACTIVE); xe_stop(scp); scp->gone = 1; } /* * Compute the 32-bit Ethernet CRC for the given buffer. */ static u_int32_t xe_compute_crc(u_int8_t *data, int len) { u_int32_t crc = 0xffffffff; u_int32_t poly = 0x04c11db6; u_int8_t current, crc31, bit; int i, k; for (i = 0; i < len; i++) { current = data[i]; for (k = 1; k <= 8; k++) { if (crc & 0x80000000) { crc31 = 0x01; } else { crc31 = 0; } bit = crc31 ^ (current & 0x01); crc <<= 1; current >>= 1; if (bit) { crc = (crc ^ poly)|1; } } } return crc; } /* * Convert a CRC into an index into the multicast hash table. What we do is * take the most-significant 6 bits of the CRC, reverse them, and use that as * the bit number in the hash table. Bits 5:3 of the result give the byte * within the table (0-7); bits 2:0 give the bit number within that byte (also * 0-7), ie. the number of shifts needed to get it into the lsb position. 
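 * Worked example (from the description above): if the top six bits of the
 * CRC are 110100, reversing them gives 001011 = 11, so the address hashes to
 * byte 1 (11 >> 3) of the table, bit 3 (11 & 7).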
*/ static int xe_compute_hashbit(u_int32_t crc) { u_int8_t hashbit = 0; int i; for (i = 0; i < 6; i++) { hashbit >>= 1; if (crc & 0x80000000) { hashbit &= 0x80; } crc <<= 1; } return (hashbit >> 2); } /************************************************************** * * * M I I F U N C T I O N S * * * **************************************************************/ /* * Alternative MII/PHY handling code adapted from the xl driver. It doesn't * seem to work any better than the xirc2_ps stuff, but it's cleaner code. * XXX - this stuff shouldn't be here. It should all be abstracted off to * XXX - some kind of common MII-handling code, shared by all drivers. But * XXX - that's a whole other mission. */ #define XE_MII_SET(x) XE_OUTB(XE_GPR2, (XE_INB(XE_GPR2) | 0x04) | (x)) #define XE_MII_CLR(x) XE_OUTB(XE_GPR2, (XE_INB(XE_GPR2) | 0x04) & ~(x)) /* * Sync the PHYs by setting data bit and strobing the clock 32 times. */ static void xe_mii_sync(struct xe_softc *scp) { register int i; XE_SELECT_PAGE(2); XE_MII_SET(XE_MII_DIR|XE_MII_WRD); for (i = 0; i < 32; i++) { XE_MII_SET(XE_MII_CLK); DELAY(1); XE_MII_CLR(XE_MII_CLK); DELAY(1); } } /* * Look for a MII-compliant PHY. If we find one, reset it. */ static int xe_mii_init(struct xe_softc *scp) { u_int16_t status; status = xe_phy_readreg(scp, PHY_BMSR); if ((status & 0xff00) != 0x7800) { #if XE_DEBUG > 1 printf("xe%d: no PHY found, %0x\n", scp->unit, status); #endif return 0; } else { #if XE_DEBUG > 1 printf("xe%d: PHY OK!\n", scp->unit); #endif /* Reset the PHY */ xe_phy_writereg(scp, PHY_BMCR, PHY_BMCR_RESET); DELAY(500); while(xe_phy_readreg(scp, PHY_BMCR) & PHY_BMCR_RESET); XE_MII_DUMP(scp); return 1; } } /* * Clock a series of bits through the MII. */ static void xe_mii_send(struct xe_softc *scp, u_int32_t bits, int cnt) { int i; XE_SELECT_PAGE(2); XE_MII_CLR(XE_MII_CLK); for (i = (0x1 << (cnt - 1)); i; i >>= 1) { if (bits & i) { XE_MII_SET(XE_MII_WRD); } else { XE_MII_CLR(XE_MII_WRD); } DELAY(1); XE_MII_CLR(XE_MII_CLK); DELAY(1); XE_MII_SET(XE_MII_CLK); } } /* * Read an PHY register through the MII. */ static int xe_mii_readreg(struct xe_softc *scp, struct xe_mii_frame *frame) { int i, ack, s; s = splimp(); /* * Set up frame for RX. */ frame->mii_stdelim = XE_MII_STARTDELIM; frame->mii_opcode = XE_MII_READOP; frame->mii_turnaround = 0; frame->mii_data = 0; XE_SELECT_PAGE(2); XE_OUTB(XE_GPR2, 0); /* * Turn on data xmit. */ XE_MII_SET(XE_MII_DIR); xe_mii_sync(scp); /* * Send command/address info. */ xe_mii_send(scp, frame->mii_stdelim, 2); xe_mii_send(scp, frame->mii_opcode, 2); xe_mii_send(scp, frame->mii_phyaddr, 5); xe_mii_send(scp, frame->mii_regaddr, 5); /* Idle bit */ XE_MII_CLR((XE_MII_CLK|XE_MII_WRD)); DELAY(1); XE_MII_SET(XE_MII_CLK); DELAY(1); /* Turn off xmit. */ XE_MII_CLR(XE_MII_DIR); /* Check for ack */ XE_MII_CLR(XE_MII_CLK); DELAY(1); XE_MII_SET(XE_MII_CLK); DELAY(1); ack = XE_INB(XE_GPR2) & XE_MII_RDD; /* * Now try reading data bits. If the ack failed, we still * need to clock through 16 cycles to keep the PHY(s) in sync. */ if (ack) { for(i = 0; i < 16; i++) { XE_MII_CLR(XE_MII_CLK); DELAY(1); XE_MII_SET(XE_MII_CLK); DELAY(1); } goto fail; } for (i = 0x8000; i; i >>= 1) { XE_MII_CLR(XE_MII_CLK); DELAY(1); if (!ack) { if (XE_INB(XE_GPR2) & XE_MII_RDD) frame->mii_data |= i; DELAY(1); } XE_MII_SET(XE_MII_CLK); DELAY(1); } fail: XE_MII_CLR(XE_MII_CLK); DELAY(1); XE_MII_SET(XE_MII_CLK); DELAY(1); splx(s); if (ack) return(1); return(0); } /* * Write to a PHY register through the MII. 
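 * The frame clocked out below follows the usual MII management format: a
 * 2-bit start delimiter, 2-bit write opcode, 5-bit PHY address, 5-bit
 * register address and 2-bit turnaround, then 16 bits of data, all shifted
 * out MSB-first by xe_mii_send().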
*/ static int xe_mii_writereg(struct xe_softc *scp, struct xe_mii_frame *frame) { int s; s = splimp(); /* * Set up frame for TX. */ frame->mii_stdelim = XE_MII_STARTDELIM; frame->mii_opcode = XE_MII_WRITEOP; frame->mii_turnaround = XE_MII_TURNAROUND; XE_SELECT_PAGE(2); /* * Turn on data output. */ XE_MII_SET(XE_MII_DIR); xe_mii_sync(scp); xe_mii_send(scp, frame->mii_stdelim, 2); xe_mii_send(scp, frame->mii_opcode, 2); xe_mii_send(scp, frame->mii_phyaddr, 5); xe_mii_send(scp, frame->mii_regaddr, 5); xe_mii_send(scp, frame->mii_turnaround, 2); xe_mii_send(scp, frame->mii_data, 16); /* Idle bit. */ XE_MII_SET(XE_MII_CLK); DELAY(1); XE_MII_CLR(XE_MII_CLK); DELAY(1); /* * Turn off xmit. */ XE_MII_CLR(XE_MII_DIR); splx(s); return(0); } /* * Read a register from the PHY. */ static u_int16_t xe_phy_readreg(struct xe_softc *scp, u_int16_t reg) { struct xe_mii_frame frame; bzero((char *)&frame, sizeof(frame)); frame.mii_phyaddr = 0; frame.mii_regaddr = reg; xe_mii_readreg(scp, &frame); return(frame.mii_data); } /* * Write to a PHY register. */ static void xe_phy_writereg(struct xe_softc *scp, u_int16_t reg, u_int16_t data) { struct xe_mii_frame frame; bzero((char *)&frame, sizeof(frame)); frame.mii_phyaddr = 0; frame.mii_regaddr = reg; frame.mii_data = data; xe_mii_writereg(scp, &frame); return; } #ifdef XE_DEBUG /* * A bit of debugging code. */ static void xe_mii_dump(struct xe_softc *scp) { int i, s; s = splimp(); printf("xe%d: MII registers: ", scp->unit); for (i = 0; i < 2; i++) { printf(" %d:%04x", i, xe_phy_readreg(scp, i)); } for (i = 4; i < 7; i++) { printf(" %d:%04x", i, xe_phy_readreg(scp, i)); } printf("\n"); (void)splx(s); } static void xe_reg_dump(struct xe_softc *scp) { int page, i, s; s = splimp(); printf("xe%d: Common registers: ", scp->unit); for (i = 0; i < 8; i++) { printf(" %2.2x", XE_INB(i)); } printf("\n"); for (page = 0; page <= 8; page++) { printf("xe%d: Register page %2.2x: ", scp->unit, page); XE_SELECT_PAGE(page); for (i = 8; i < 16; i++) { printf(" %2.2x", XE_INB(i)); } printf("\n"); } for (page = 0x10; page < 0x5f; page++) { if ((page >= 0x11 && page <= 0x3f) || (page == 0x41) || (page >= 0x43 && page <= 0x4f) || (page >= 0x59)) continue; printf("xe%d: Register page %2.2x: ", scp->unit, page); XE_SELECT_PAGE(page); for (i = 8; i < 16; i++) { printf(" %2.2x", XE_INB(i)); } printf("\n"); } (void)splx(s); } #endif #if NAPM > 0 /************************************************************** * * * A P M F U N C T I O N S * * * **************************************************************/ /* * This is called when we go into suspend/standby mode */ static int xe_suspend(void *xunit) { #ifdef XE_DEBUG struct xe_softc *scp = sca[(int)xunit]; printf("xe%d: APM suspend\n", scp->unit); #endif return 0; } /* * This is called when we wake up again */ static int xe_resume(void *xunit) { #ifdef XE_DEBUG struct xe_softc *scp = sca[(int)xunit]; printf("xe%d: APM resume\n", scp->unit); #endif return 0; } #endif /* NAPM > 0 */ #endif /* NCARD > 0 */ #endif /* NXE > 0 */ Index: head/sys/dev/vinum/vinuminterrupt.c =================================================================== --- head/sys/dev/vinum/vinuminterrupt.c (revision 49534) +++ head/sys/dev/vinum/vinuminterrupt.c (revision 49535) @@ -1,431 +1,430 @@ /* vinuminterrupt.c: bottom half of the driver */ /*- * Copyright (c) 1997, 1998, 1999 * Nan Yang Computer Services Limited. All rights reserved. * * Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project. 
* * Written by Greg Lehey * * This software is distributed under the so-called ``Berkeley * License'': * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Nan Yang Computer * Services Limited. * 4. Neither the name of the Company nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * This software is provided ``as is'', and any express or implied * warranties, including, but not limited to, the implied warranties of * merchantability and fitness for a particular purpose are disclaimed. * In no event shall the company or contributors be liable for any * direct, indirect, incidental, special, exemplary, or consequential * damages (including, but not limited to, procurement of substitute * goods or services; loss of use, data, or profits; or business * interruption) however caused and on any theory of liability, whether * in contract, strict liability, or tort (including negligence or * otherwise) arising in any way out of the use of this software, even if * advised of the possibility of such damage. * - * $Id: vinuminterrupt.c,v 1.6 1999/06/18 00:50:53 grog Exp grog $ + * $Id: vinuminterrupt.c,v 1.12 1999/08/07 08:06:30 grog Exp $ */ #include #include -#include #include void complete_raid5_write(struct rqelement *); void freerq(struct request *rq); void free_rqg(struct rqgroup *rqg); void complete_rqe(struct buf *bp); void sdio_done(struct buf *bp); /* * Take a completed buffer, transfer the data back if * it's a read, and complete the high-level request * if this is the last subrequest. * * The bp parameter is in fact a struct rqelement, which * includes a couple of extras at the end. */ void complete_rqe(struct buf *bp) { struct rqelement *rqe; struct request *rq; struct rqgroup *rqg; struct buf *ubp; /* user buffer */ rqe = (struct rqelement *) bp; /* point to the element element that completed */ rqg = rqe->rqg; /* and the request group */ rq = rqg->rq; /* and the complete request */ ubp = rq->bp; /* user buffer */ #ifdef VINUMDEBUG if (debug & DEBUG_LASTREQS) logrq(loginfo_iodone, (union rqinfou) rqe, ubp); #endif if ((bp->b_flags & B_ERROR) != 0) { /* transfer in error */ if (bp->b_error != 0) /* did it return a number? */ rq->error = bp->b_error; /* yes, put it in. */ else if (rq->error == 0) /* no: do we have one already? 
*/ rq->error = EIO; /* no: catchall "I/O error" */ SD[rqe->sdno].lasterror = rq->error; if (bp->b_flags & B_READ) { log(LOG_ERR, "%s: fatal read I/O error\n", SD[rqe->sdno].name); set_sd_state(rqe->sdno, sd_crashed, setstate_force); /* subdisk is crashed */ } else { /* write operation */ log(LOG_ERR, "%s: fatal write I/O error\n", SD[rqe->sdno].name); set_sd_state(rqe->sdno, sd_stale, setstate_force); /* subdisk is stale */ } if (rq->error == ENXIO) { /* the drive's down too */ log(LOG_ERR, "%s: fatal drive I/O error\n", DRIVE[rqe->driveno].label.name); DRIVE[rqe->driveno].lasterror = rq->error; set_drive_state(rqe->driveno, /* take the drive down */ drive_down, setstate_force); } } /* Now update the statistics */ if (bp->b_flags & B_READ) { /* read operation */ DRIVE[rqe->driveno].reads++; DRIVE[rqe->driveno].bytes_read += bp->b_bcount; SD[rqe->sdno].reads++; SD[rqe->sdno].bytes_read += bp->b_bcount; PLEX[rqe->rqg->plexno].reads++; PLEX[rqe->rqg->plexno].bytes_read += bp->b_bcount; } else { /* write operation */ DRIVE[rqe->driveno].writes++; DRIVE[rqe->driveno].bytes_written += bp->b_bcount; SD[rqe->sdno].writes++; SD[rqe->sdno].bytes_written += bp->b_bcount; PLEX[rqe->rqg->plexno].writes++; PLEX[rqe->rqg->plexno].bytes_written += bp->b_bcount; } rqg->active--; /* one less request active */ if (rqg->flags & XFR_RECOVERY_READ) { /* recovery read, */ int *sdata; /* source */ int *data; /* and group data */ int length; /* and count involved */ int count; /* loop counter */ struct rqelement *urqe = &rqg->rqe[rqg->badsdno]; /* rqe of the bad subdisk */ /* XOR destination is the user data */ sdata = (int *) &rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT]; /* old data contents */ data = (int *) &urqe->b.b_data[urqe->groupoffset << DEV_BSHIFT]; /* destination */ length = urqe->grouplen << (DEV_BSHIFT - 2); /* and count involved */ for (count = 0; count < length; count++) data[count] ^= sdata[count]; #ifdef VINUMDEBUG if (debug & DEBUG_RESID) { if ((rqg->active == 0) /* XXXX finished this group */ &&(*(char *) data != '<')) /* and not what we expected */ Debugger("complete_request checksum"); } #endif /* * In a normal read, we will normally read directly * into the user buffer. This doesn't work if * we're also doing a recovery, so we have to * copy it */ if (rqe->flags & XFR_NORMAL_READ) { /* normal read as well, */ char *src = &rqe->b.b_data[rqe->dataoffset << DEV_BSHIFT]; /* read data is here */ char *dst; dst = (char *) ubp->b_data + (rqe->useroffset << DEV_BSHIFT); /* where to put it in user buffer */ length = rqe->datalen << DEV_BSHIFT; /* and count involved */ bcopy(src, dst, length); /* move it */ } } else if ((rqg->flags & (XFR_NORMAL_WRITE | XFR_DEGRADED_WRITE)) /* RAID 5 group write operation */ &&(rqg->active == 0)) /* and we've finished phase 1 */ complete_raid5_write(rqe); if (rqg->active == 0) /* request group finished, */ rq->active--; /* one less */ if (rq->active == 0) { /* request finished, */ #if VINUMDEBUG if (debug & DEBUG_RESID) { if (ubp->b_resid != 0) /* still something to transfer? */ Debugger("resid"); { int i; for (i = 0; i < ubp->b_bcount; i += 512) /* XXX debug */ if (((char *) ubp->b_data)[i] != '<') { /* and not what we expected */ log(LOG_DEBUG, "At 0x%x (offset 0x%x): '%c' (0x%x)\n", (int) (&((char *) ubp->b_data)[i]), i, ((char *) ubp->b_data)[i], ((char *) ubp->b_data)[i]); Debugger("complete_request checksum"); } } } #endif if (rq->error) { /* did we have an error? 
*/ if (rq->isplex) { /* plex operation, */ ubp->b_flags |= B_ERROR; /* yes, propagate to user */ ubp->b_error = rq->error; } else /* try to recover */ queue_daemon_request(daemonrq_ioerror, (union daemoninfo) rq); /* let the daemon complete */ } else { ubp->b_resid = 0; /* completed our transfer */ if (rq->isplex == 0) /* volume request, */ VOL[rq->volplex.volno].active--; /* another request finished */ biodone(ubp); /* top level buffer completed */ freerq(rq); /* return the request storage */ } } } /* Free a request block and anything hanging off it */ void freerq(struct request *rq) { struct rqgroup *rqg; struct rqgroup *nrqg; /* next in chain */ int rqno; for (rqg = rq->rqg; rqg != NULL; rqg = nrqg) { /* through the whole request chain */ for (rqno = 0; rqno < rqg->count; rqno++) if ((rqg->rqe[rqno].flags & XFR_MALLOCED) /* data buffer was malloced, */ &&rqg->rqe[rqno].b.b_data) /* and the allocation succeeded */ Free(rqg->rqe[rqno].b.b_data); /* free it */ nrqg = rqg->next; /* note the next one */ Free(rqg); /* and free this one */ } Free(rq); /* free the request itself */ } void free_rqg(struct rqgroup *rqg) { if ((rqg->flags & XFR_GROUPOP) /* RAID 5 request */ &&(rqg->rqe) /* got a buffer structure */ &&(rqg->rqe->b.b_data)) /* and it has a buffer allocated */ Free(rqg->rqe->b.b_data); /* free it */ } /* I/O on subdisk completed */ void sdio_done(struct buf *bp) { struct sdbuf *sbp; sbp = (struct sdbuf *) bp; if (sbp->b.b_flags & B_ERROR) { /* had an error */ bp->b_flags |= B_ERROR; bp->b_error = sbp->b.b_error; } bp->b_resid = sbp->b.b_resid; biodone(sbp->bp); /* complete the caller's I/O */ /* Now update the statistics */ if (bp->b_flags & B_READ) { /* read operation */ DRIVE[sbp->driveno].reads++; DRIVE[sbp->driveno].bytes_read += bp->b_bcount; SD[sbp->sdno].reads++; SD[sbp->sdno].bytes_read += bp->b_bcount; } else { /* write operation */ DRIVE[sbp->driveno].writes++; DRIVE[sbp->driveno].bytes_written += bp->b_bcount; SD[sbp->sdno].writes++; SD[sbp->sdno].bytes_written += bp->b_bcount; } Free(sbp); } /* Start the second phase of a RAID5 group write operation. */ /* * XXX This could be improved on. It's quite CPU intensive, * and doing it at the end tends to lump it all together. * We should do this a transfer at a time */ void complete_raid5_write(struct rqelement *rqe) { int *sdata; /* source */ int *pdata; /* and parity block data */ int length; /* and count involved */ int count; /* loop counter */ int rqno; /* request index */ int rqoffset; /* offset of request data from parity data */ struct buf *bp; /* user buffer header */ struct request *rq; /* pointer to our request */ struct rqgroup *rqg; /* and to the request group */ struct rqelement *prqe; /* point to the parity block */ struct drive *drive; /* drive to access */ rqg = rqe->rqg; /* and to our request group */ rq = rqg->rq; /* point to our request */ bp = rq->bp; /* user's buffer header */ prqe = &rqg->rqe[0]; /* point to the parity block */ /* * If we get to this function, we have normal or * degraded writes, or a combination of both. We do * the same thing in each case: we perform an * exclusive or to the parity block. The only * difference is the origin of the data and the * address range. 
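 * Put another way: the parity block is the XOR of all the data blocks in the
 * stripe, so a normal write can derive the new parity as
 * P' = P ^ D(old) ^ D(new), while a degraded write starts from a zeroed
 * parity buffer and XORs in all of the stripe's data blocks, including the
 * data being written. The loops below do this an int at a time.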
*/ if (rqe->flags & XFR_DEGRADED_WRITE) { /* do the degraded write stuff */ pdata = (int *) (&prqe->b.b_data[(prqe->groupoffset) << DEV_BSHIFT]); /* parity data pointer */ bzero(pdata, prqe->grouplen << DEV_BSHIFT); /* start with nothing in the parity block */ /* Now get what data we need from each block */ for (rqno = 1; rqno < rqg->count; rqno++) { /* for all the data blocks */ /* * This can do with improvement. If we're doing * both a degraded and a normal write, we don't * need to xor (nor to read) the part of the block * that we're going to overwrite. FIXME XXX */ rqe = &rqg->rqe[rqno]; /* this request */ sdata = (int *) (&rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT]); /* old data */ length = rqe->grouplen << (DEV_BSHIFT - 2); /* and count involved */ /* * add the data block to the parity block. Before * we started the request, we zeroed the parity * block, so the result of adding all the other * blocks and the block we want to write will be * the correct parity block. */ /* XXX do this in assembler */ for (count = 0; count < length; count++) pdata[count] ^= sdata[count]; if ((rqe->flags & XFR_MALLOCED) /* the buffer was malloced, */ &&((rqg->flags & XFR_NORMAL_WRITE) == 0)) { /* and we have no normal write, */ Free(rqe->b.b_data); /* free it now */ rqe->flags &= ~XFR_MALLOCED; } } } if (rqg->flags & XFR_NORMAL_WRITE) { /* do normal write stuff */ /* Get what data we need from each block */ for (rqno = 1; rqno < rqg->count; rqno++) { /* for all the data blocks */ rqe = &rqg->rqe[rqno]; /* this request */ if ((rqe->flags & (XFR_DATA_BLOCK | XFR_BAD_SUBDISK | XFR_NORMAL_WRITE)) == (XFR_DATA_BLOCK | XFR_NORMAL_WRITE)) { /* good data block to write */ sdata = (int *) &rqe->b.b_data[rqe->dataoffset << DEV_BSHIFT]; /* old data contents */ rqoffset = rqe->dataoffset + rqe->sdoffset - prqe->sdoffset; /* corresponding parity block offset */ pdata = (int *) (&prqe->b.b_data[rqoffset << DEV_BSHIFT]); /* parity data pointer */ length = rqe->datalen << (DEV_BSHIFT - 2); /* and count involved */ /* * "remove" the old data block * from the parity block */ /* XXX do this in assembler */ if ((pdata < ((int *) prqe->b.b_data)) || (&pdata[length] > ((int *) (prqe->b.b_data + prqe->b.b_bcount))) || (sdata < ((int *) rqe->b.b_data)) || (&sdata[length] > ((int *) (rqe->b.b_data + rqe->b.b_bcount)))) Debugger("Bounds overflow"); /* XXX */ for (count = 0; count < length; count++) pdata[count] ^= sdata[count]; /* "add" the new data block */ sdata = (int *) (&bp->b_data[rqe->useroffset << DEV_BSHIFT]); /* new data */ if ((sdata < ((int *) bp->b_data)) || (&sdata[length] > ((int *) (bp->b_data + bp->b_bcount)))) Debugger("Bounds overflow"); /* XXX */ for (count = 0; count < length; count++) pdata[count] ^= sdata[count]; /* Free the malloced buffer */ if (rqe->flags & XFR_MALLOCED) { /* the buffer was malloced, */ Free(rqe->b.b_data); /* free it */ rqe->flags &= ~XFR_MALLOCED; } else Debugger("not malloced"); /* XXX */ if ((rqe->b.b_flags & B_READ) /* this was a read */ &&((rqe->flags & XFR_BAD_SUBDISK) == 0)) { /* and we can write this block */ rqe->b.b_flags &= ~(B_READ | B_DONE); /* we're writing now */ rqe->b.b_flags |= B_CALL; /* call us when you're done */ rqe->flags &= ~XFR_PARITYOP; /* reset flags that brought use here */ rqe->b.b_data = &bp->b_data[rqe->useroffset << DEV_BSHIFT]; /* point to the user data */ rqe->b.b_bcount = rqe->datalen << DEV_BSHIFT; /* length to write */ rqe->b.b_bufsize = rqe->b.b_bcount; /* don't claim more */ rqe->b.b_resid = rqe->b.b_bcount; /* nothing transferred */ 
rqe->b.b_blkno += rqe->dataoffset; /* point to the correct block */ rqg->active++; /* another active request */ rqe->b.b_vp->v_numoutput++; /* one more output going */ drive = &DRIVE[rqe->driveno]; /* drive to access */ #if VINUMDEBUG if (debug & DEBUG_ADDRESSES) log(LOG_DEBUG, " %s dev %d.%d, sd %d, offset 0x%x, devoffset 0x%x, length %ld\n", rqe->b.b_flags & B_READ ? "Read" : "Write", major(rqe->b.b_dev), minor(rqe->b.b_dev), rqe->sdno, (u_int) (rqe->b.b_blkno - SD[rqe->sdno].driveoffset), rqe->b.b_blkno, rqe->b.b_bcount); /* XXX */ if (debug & DEBUG_NUMOUTPUT) log(LOG_DEBUG, " raid5.2 sd %d numoutput %ld\n", rqe->sdno, rqe->b.b_vp->v_numoutput); if (debug & DEBUG_LASTREQS) logrq(loginfo_raid5_data, (union rqinfou) rqe, bp); #endif (*bdevsw(rqe->b.b_dev)->d_strategy) (&rqe->b); } } } } /* Finally, write the parity block */ rqe = &rqg->rqe[0]; rqe->b.b_flags &= ~(B_READ | B_DONE); /* we're writing now */ rqe->b.b_flags |= B_CALL; /* call us when you're done */ rqe->flags &= ~XFR_PARITYOP; /* reset flags that brought use here */ rqg->flags &= ~XFR_PARITYOP; /* reset flags that brought use here */ rqe->b.b_bcount = rqe->buflen << DEV_BSHIFT; /* length to write */ rqe->b.b_bufsize = rqe->b.b_bcount; /* don't claim we have more */ rqe->b.b_resid = rqe->b.b_bcount; /* nothing transferred */ rqg->active++; /* another active request */ rqe->b.b_vp->v_numoutput++; /* one more output going */ drive = &DRIVE[rqe->driveno]; /* drive to access */ #if VINUMDEBUG if (debug & DEBUG_ADDRESSES) log(LOG_DEBUG, " %s dev %d.%d, sd %d, offset 0x%x, devoffset 0x%x, length %ld\n", rqe->b.b_flags & B_READ ? "Read" : "Write", major(rqe->b.b_dev), minor(rqe->b.b_dev), rqe->sdno, (u_int) (rqe->b.b_blkno - SD[rqe->sdno].driveoffset), rqe->b.b_blkno, rqe->b.b_bcount); /* XXX */ if (debug & DEBUG_NUMOUTPUT) log(LOG_DEBUG, " raid5.3 sd %d numoutput %ld\n", rqe->sdno, rqe->b.b_vp->v_numoutput); if (debug & DEBUG_LASTREQS) logrq(loginfo_raid5_parity, (union rqinfou) rqe, bp); #endif (*bdevsw(rqe->b.b_dev)->d_strategy) (&rqe->b); } Index: head/sys/dev/vinum/vinumio.c =================================================================== --- head/sys/dev/vinum/vinumio.c (revision 49534) +++ head/sys/dev/vinum/vinumio.c (revision 49535) @@ -1,1103 +1,1102 @@ /*- * Copyright (c) 1997, 1998 * Nan Yang Computer Services Limited. All rights reserved. * * This software is distributed under the so-called ``Berkeley * License'': * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Nan Yang Computer * Services Limited. * 4. Neither the name of the Company nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * This software is provided ``as is'', and any express or implied * warranties, including, but not limited to, the implied warranties of * merchantability and fitness for a particular purpose are disclaimed. 
* In no event shall the company or contributors be liable for any * direct, indirect, incidental, special, exemplary, or consequential * damages (including, but not limited to, procurement of substitute * goods or services; loss of use, data, or profits; or business * interruption) however caused and on any theory of liability, whether * in contract, strict liability, or tort (including negligence or * otherwise) arising in any way out of the use of this software, even if * advised of the possibility of such damage. * - * $Id: vinumio.c,v 1.33 1999/08/07 08:07:05 grog Exp $ + * $Id: vinumio.c,v 1.34 1999/08/08 14:11:03 bde Exp $ */ #include #include -#include static char *sappend(char *txt, char *s); static int drivecmp(const void *va, const void *vb); /* * Open the device associated with the drive, and set drive's vp. * Return an error number */ int open_drive(struct drive *drive, struct proc *p, int verbose) { struct nameidata nd; int error; if (drive->devicename[0] != '/') /* no device name */ sprintf(drive->devicename, "/dev/%s", drive->label.name); /* get it from the drive name */ NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, drive->devicename, p); error = vn_open(&nd, FREAD | FWRITE, 0); /* open the device */ if (error != 0) { /* can't open? */ set_drive_state(drive->driveno, drive_down, setstate_force); drive->lasterror = error; if (verbose) log(LOG_WARNING, "vinum open_drive %s: failed with error %d\n", drive->devicename, error); /* XXX */ return error; } drive->vp = nd.ni_vp; drive->p = p; if (drive->vp->v_usecount > 1) { /* already in use? */ if (verbose) log(LOG_WARNING, "open_drive %s: use count %d, ignoring\n", /* XXX where does this come from? */ drive->devicename, drive->vp->v_usecount); } if (drive->vp->v_type != VBLK) { /* only consider block devices */ VOP_UNLOCK(drive->vp, 0, drive->p); close_drive(drive); set_drive_state(drive->driveno, drive_down, setstate_force); /* this also closes the drive */ drive->lasterror = ENOTBLK; if (verbose) log(LOG_WARNING, "vinum open_drive %s: Not a block device\n", drive->devicename); /* XXX */ return ENOTBLK; } drive->vp->v_numoutput = 0; VOP_UNLOCK(drive->vp, 0, drive->p); return 0; } /* * Set some variables in the drive struct * in more convenient form. Return error indication */ int set_drive_parms(struct drive *drive) { drive->blocksize = BLKDEV_IOSIZE; /* XXX do we need this? */ drive->secsperblock = drive->blocksize /* number of sectors per block */ / drive->partinfo.disklab->d_secsize; /* Now update the label part */ bcopy(hostname, drive->label.sysname, VINUMHOSTNAMELEN); /* put in host name */ getmicrotime(&drive->label.date_of_birth); /* and current time */ drive->label.drive_size = ((u_int64_t) drive->partinfo.part->p_size) /* size of the drive in bytes */ *((u_int64_t) drive->partinfo.disklab->d_secsize); #if VINUMDEBUG if (debug & DEBUG_BIGDRIVE) /* pretend we're 100 times as big */ drive->label.drive_size *= 100; #endif /* number of sectors available for subdisks */ drive->sectors_available = drive->label.drive_size / DEV_BSIZE - DATASTART; /* * XXX Bug in 3.0 as of January 1998: you can open * non-existent slices. 
They have a length of 0 */ if (drive->label.drive_size < MINVINUMSLICE) { /* too small to worry about */ set_drive_state(drive->driveno, drive_down, setstate_force); drive->lasterror = ENOSPC; return ENOSPC; } drive->freelist_size = INITIAL_DRIVE_FREELIST; /* initial number of entries */ drive->freelist = (struct drive_freelist *) Malloc(INITIAL_DRIVE_FREELIST * sizeof(struct drive_freelist)); if (drive->freelist == NULL) /* can't malloc, dammit */ return ENOSPC; drive->freelist_entries = 1; /* just (almost) the complete drive */ drive->freelist[0].offset = DATASTART; /* starts here */ drive->freelist[0].sectors = (drive->label.drive_size >> DEV_BSHIFT) - DATASTART; /* and it's this long */ if (drive->label.name[0] != '\0') /* got a name */ set_drive_state(drive->driveno, drive_up, setstate_force); /* our drive is accessible */ else /* we know about it, but that's all */ drive->state = drive_referenced; return 0; } /* * Initialize a drive: open the device and add device * information */ int init_drive(struct drive *drive, int verbose) { int error; if (drive->devicename[0] != '/') { drive->lasterror = EINVAL; log(LOG_ERR, "vinum: Can't open drive without drive name\n"); return EINVAL; } error = open_drive(drive, curproc, verbose); /* open the drive */ if (error) return error; error = VOP_IOCTL(drive->vp, /* get the partition information */ DIOCGPART, (caddr_t) & drive->partinfo, FREAD, NOCRED, curproc); if (error) { if (verbose) log(LOG_WARNING, "vinum open_drive %s: Can't get partition information, error %d\n", drive->devicename, error); /* XXX */ close_drive(drive); drive->lasterror = error; drive->state = drive_down; /* don't tell the system about this one at all */ return error; } if (drive->partinfo.part->p_fstype != FS_VINUM) { /* not Vinum */ drive->lasterror = EFTYPE; if (verbose) log(LOG_WARNING, "vinum open_drive %s: Wrong partition type for vinum\n", drive->devicename); /* XXX */ close_drive(drive); drive->state = drive_down; /* don't tell the system about this one at all */ return EFTYPE; } return set_drive_parms(drive); /* set various odds and ends */ } /* Close a drive if it's open. */ void close_drive(struct drive *drive) { LOCKDRIVE(drive); /* keep the daemon out */ if (drive->vp) close_locked_drive(drive); /* and close it */ unlockdrive(drive); } /* * Real drive close code, called with drive already locked. * We have also checked that the drive is open. No errors. */ void close_locked_drive(struct drive *drive) { /* * If we can't access the drive, we can't flush * the queues, which spec_close() will try to * do. Get rid of them here first. */ if (drive->state < drive_up) { /* we can't access the drive, */ vn_lock(drive->vp, LK_EXCLUSIVE | LK_RETRY, drive->p); vinvalbuf(drive->vp, 0, NOCRED, drive->p, 0, 0); VOP_UNLOCK(drive->vp, 0, drive->p); } vn_close(drive->vp, FREAD | FWRITE, NOCRED, drive->p); #ifdef VINUMDEBUG if ((debug & DEBUG_WARNINGS) /* want to hear about them */ &&(drive->vp->v_usecount)) /* XXX shouldn't happen */ log(LOG_WARNING, "close_drive %s: use count still %d\n", drive->devicename, drive->vp->v_usecount); #endif drive->vp = NULL; } /* * Remove drive from the configuration. 
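 * If the drive is up, the on-disk magic number is first overwritten with
 * VINUM_NOMAGIC, so that a later scan of the device sees a deliberately
 * deleted label (DL_DELETED_LABEL) rather than a live vinum drive; the
 * in-core structures are then freed and the updated configuration saved.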
* Caller must ensure that it isn't active */ void remove_drive(int driveno) { struct drive *drive = &vinum_conf.drive[driveno]; long long int nomagic = VINUM_NOMAGIC; /* no magic number */ if (drive->state > drive_referenced) { /* real drive */ if (drive->state == drive_up) write_drive(drive, /* obliterate the magic, but leave a hint */ (char *) &nomagic, 8, VINUM_LABEL_OFFSET); free_drive(drive); /* close it and free resources */ save_config(); /* and save the updated configuration */ } } /* * Transfer drive data. Usually called from one of these defines; * #define read_drive(a, b, c, d) driveio (a, b, c, d, B_READ) * #define write_drive(a, b, c, d) driveio (a, b, c, d, B_WRITE) * * length and offset are in bytes, but must be multiples of sector * size. The function *does not check* for this condition, and * truncates ruthlessly. * Return error number */ int driveio(struct drive *drive, char *buf, size_t length, off_t offset, int flag) { int error; struct buf *bp; char foo[40]; error = 0; /* to keep the compiler happy */ while (length) { /* divide into small enough blocks */ int len = min(length, MAXBSIZE); /* maximum block device transfer is MAXBSIZE */ bp = geteblk(len); /* get a buffer header */ bp->b_flags = flag; bp->b_dev = drive->vp->v_rdev; /* device */ bp->b_blkno = offset / drive->partinfo.disklab->d_secsize; /* block number */ bp->b_data = buf; bp->b_bcount = len; bp->b_bufsize = len; (*bdevsw(bp->b_dev)->d_strategy) (bp); /* initiate the transfer */ error = biowait(bp); printf("driveio: %s dev %d.%d, block 0x%x, len 0x%lx, error %d\n", /* XXX */ flag ? "read" : "write", major(bp->b_dev), minor(bp->b_dev), bp->b_blkno, bp->b_bcount, error); bcopy(buf, foo, 40); foo[39] = '\0'; printf("---> %s\n", foo); /* XXXXXX */ bp->b_flags |= B_INVAL | B_AGE; brelse(bp); if (error) break; length -= len; /* update pointers */ buf += len; offset += len; } return error; } /* * Read data from a drive * * Return error number */ int read_drive(struct drive *drive, void *buf, size_t length, off_t offset) { int error; struct buf *bp; daddr_t nextbn; long bscale; struct uio uio; struct iovec iov; daddr_t blocknum; /* block number */ int blockoff; /* offset in block */ int count; /* amount to transfer */ iov.iov_base = buf; iov.iov_len = length; uio.uio_iov = &iov; uio.uio_iovcnt = length; uio.uio_offset = offset; uio.uio_resid = length; uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_READ; uio.uio_procp = curproc; bscale = btodb(drive->blocksize); /* mask off offset from block number */ do { blocknum = btodb(uio.uio_offset) & ~(bscale - 1); /* get the block number */ blockoff = uio.uio_offset % drive->blocksize; /* offset in block */ count = min((unsigned) (drive->blocksize - blockoff), /* amount to transfer in this block */ uio.uio_resid); /* XXX Check this. I think the test is wrong */ if (drive->vp->v_lastr + bscale == blocknum) { /* did our last read finish in this block? 
*/ nextbn = blocknum + bscale; /* note the end of the transfer */ error = breadn(drive->vp, /* and read with read-ahead */ blocknum, (int) drive->blocksize, &nextbn, (int *) &drive->blocksize, 1, NOCRED, &bp); } else /* random read: just read this block */ error = bread(drive->vp, blocknum, (int) drive->blocksize, NOCRED, &bp); drive->vp->v_lastr = blocknum; /* note the last block we read */ count = min(count, drive->blocksize - bp->b_resid); if (error) { brelse(bp); return error; } error = uiomove((char *) bp->b_data + blockoff, count, &uio); /* move the data */ brelse(bp); } while (error == 0 && uio.uio_resid > 0 && count != 0); return error; } /* * Write data to a drive * * Return error number */ int write_drive(struct drive *drive, void *buf, size_t length, off_t offset) { int error; struct buf *bp; struct uio uio; struct iovec iov; daddr_t blocknum; /* block number */ int blockoff; /* offset in block */ int count; /* amount to transfer */ int blockshift; if (drive->state == drive_down) /* currently down */ return 0; /* ignore */ if (drive->vp == NULL) { drive->lasterror = ENODEV; return ENODEV; /* not configured yet */ } iov.iov_base = buf; iov.iov_len = length; uio.uio_iov = &iov; uio.uio_iovcnt = length; uio.uio_offset = offset; uio.uio_resid = length; uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_WRITE; uio.uio_procp = curproc; error = 0; blockshift = btodb(drive->blocksize) - 1; /* amount to shift block number * to get sector number */ do { blocknum = btodb(uio.uio_offset) & ~blockshift; /* get the block number */ blockoff = uio.uio_offset % drive->blocksize; /* offset in block */ count = min((unsigned) (drive->blocksize - blockoff), /* amount to transfer in this block */ uio.uio_resid); if (count == drive->blocksize) /* the whole block */ bp = getblk(drive->vp, blocknum, drive->blocksize, 0, 0); /* just transfer it */ else /* partial block: */ error = bread(drive->vp, /* read it first */ blocknum, drive->blocksize, NOCRED, &bp); count = min(count, drive->blocksize - bp->b_resid); /* how much will we transfer now? */ if (error == 0) error = uiomove((char *) bp->b_data + blockoff, /* move the data to the block */ count, &uio); if (error) { brelse(bp); drive->lasterror = error; switch (error) { case EIO: set_drive_state(drive->driveno, drive_down, setstate_force); break; /* XXX Add other possibilities here */ default: } return error; } if (count + blockoff == drive->blocksize) /* * The transfer goes to the end of the block. There's * no need to wait for any more data to arrive. */ bawrite(bp); /* start the write now */ else bdwrite(bp); /* do a delayed write */ } while (error == 0 && uio.uio_resid > 0 && count != 0); if (error) drive->lasterror = error; return error; /* OK */ } /* Wake up on completion */ void drive_io_done(struct buf *bp) { wakeup((caddr_t) bp); /* Wachet auf! */ bp->b_flags &= ~B_CALL; /* don't do this again */ } /* * Check a drive for a vinum header. If found, * update the drive information. We come here * with a partially populated drive structure * which includes the device name. * * Return information on what we found. * * This function is called from two places: check_drive, * which wants to find out whether the drive is a * Vinum drive, and config_drive, which asserts that * it is a vinum drive. In the first case, we don't * print error messages (verbose==0), in the second * we do (verbose==1). 
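 * Possible results: DL_CANT_OPEN (the device couldn't be opened or
 * initialized), DL_OURS (a valid vinum label was found), DL_WRONG_DRIVE (a
 * vinum label was found but its name doesn't match the one we were told to
 * expect), DL_DELETED_LABEL (the magic was deliberately overwritten by an
 * earlier remove) and DL_NOT_OURS (no vinum magic at all).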
*/ enum drive_label_info read_drive_label(struct drive *drive, int verbose) { int error; int result; /* result of our search */ struct vinum_hdr *vhdr; /* and as header */ error = init_drive(drive, 0); /* find the drive */ if (error) /* find the drive */ return DL_CANT_OPEN; /* not ours */ vhdr = (struct vinum_hdr *) Malloc(VINUMHEADERLEN); /* allocate buffers */ CHECKALLOC(vhdr, "Can't allocate memory"); error = read_drive(drive, (void *) vhdr, VINUMHEADERLEN, VINUM_LABEL_OFFSET); if (vhdr->magic == VINUM_MAGIC) { /* ours! */ if (drive->label.name[0] /* we have a name for this drive */ &&(strcmp(drive->label.name, vhdr->label.name))) { /* but it doesn't match the real name */ drive->lasterror = EINVAL; result = DL_WRONG_DRIVE; /* it's the wrong drive */ } else { drive->state = drive_up; /* it's OK by us */ result = DL_OURS; } /* * We copy the drive anyway so that we have * the correct name in the drive info. This * may not be the name specified */ drive->label = vhdr->label; /* put in the label information */ } else if (vhdr->magic == VINUM_NOMAGIC) /* was ours, but we gave it away */ result = DL_DELETED_LABEL; /* and return the info */ else result = DL_NOT_OURS; /* we could have it, but we don't yet */ Free(vhdr); /* that's all. */ return result; } /* * Check a drive for a vinum header. If found, * read configuration information from the drive and * incorporate the data into the configuration. * * Return drive number. */ struct drive * check_drive(char *devicename) { int driveno; int i; struct drive *drive; driveno = find_drive_by_dev(devicename, 1); /* if entry doesn't exist, create it */ drive = &vinum_conf.drive[driveno]; /* and get a pointer */ if (read_drive_label(drive, 0) == DL_OURS) { /* not ours */ for (i = 0; i < vinum_conf.drives_allocated; i++) { /* see if the name already exists */ if ((i != driveno) /* not this drive */ &&(DRIVE[i].state != drive_unallocated) /* and it's allocated */ &&(strcmp(DRIVE[i].label.name, DRIVE[driveno].label.name) == 0)) { /* and it has the same name */ struct drive *mydrive = &DRIVE[i]; if (mydrive->devicename[0] == '/') { /* we know a device name for it */ /* * set an error, but don't take the drive down: * that would cause unneeded error messages. 
*/ drive->lasterror = EEXIST; break; } else { /* it's just a place holder, */ int sdno; for (sdno = 0; sdno < vinum_conf.subdisks_allocated; sdno++) { /* look at each subdisk */ if ((SD[sdno].driveno == i) /* it's pointing to this one, */ &&(SD[sdno].state != sd_unallocated)) { /* and it's a real subdisk */ SD[sdno].driveno = drive->driveno; /* point to the one we found */ update_sd_state(sdno); /* and update its state */ } } bzero(mydrive, sizeof(struct drive)); /* don't deallocate it, just remove it */ } } } } else { if (drive->lasterror == 0) drive->lasterror = ENODEV; set_drive_state(drive->driveno, drive_down, setstate_force); } return drive; } static char * sappend(char *txt, char *s) { while ((*s++ = *txt++) != 0); return s - 1; } void format_config(char *config, int len) { int i; int j; char *s = config; char *configend = &config[len]; bzero(config, len); /* First write the volume configuration */ for (i = 0; i < vinum_conf.volumes_allocated; i++) { struct volume *vol; vol = &vinum_conf.volume[i]; if ((vol->state > volume_uninit) && (vol->name[0] != '\0')) { /* paranoia */ if (vol->preferred_plex >= 0) /* preferences, */ snprintf(s, configend - s, "volume %s state %s readpol prefer %s", vol->name, volume_state(vol->state), vinum_conf.plex[vol->preferred_plex].name); else /* default round-robin */ snprintf(s, configend - s, "volume %s state %s", vol->name, volume_state(vol->state)); while (*s) s++; /* find the end */ s = sappend("\n", s); } } /* Then the plex configuration */ for (i = 0; i < vinum_conf.plexes_allocated; i++) { struct plex *plex; plex = &vinum_conf.plex[i]; if ((plex->state != plex_referenced) && (plex->name[0] != '\0')) { /* paranoia */ snprintf(s, configend - s, "plex name %s state %s org %s ", plex->name, plex_state(plex->state), plex_org(plex->organization)); while (*s) s++; /* find the end */ if ((plex->organization == plex_striped) || (plex->organization == plex_raid5)) { snprintf(s, configend - s, "%ds ", (int) plex->stripesize); while (*s) s++; /* find the end */ } if (plex->volno >= 0) /* we have a volume */ snprintf(s, configend - s, "vol %s ", vinum_conf.volume[plex->volno].name); while (*s) s++; /* find the end */ for (j = 0; j < plex->subdisks; j++) { snprintf(s, configend - s, " sd %s", vinum_conf.sd[plex->sdnos[j]].name); } s = sappend("\n", s); } } /* And finally the subdisk configuration */ for (i = 0; i < vinum_conf.subdisks_allocated; i++) { struct sd *sd; sd = &SD[i]; if ((sd->state != sd_referenced) && (sd->name[0] != '\0')) { /* paranoia */ if (sd->plexno >= 0) snprintf(s, configend - s, "sd name %s drive %s plex %s state %s len %llus driveoffset %llus plexoffset %llds\n", sd->name, vinum_conf.drive[sd->driveno].label.name, vinum_conf.plex[sd->plexno].name, sd_state(sd->state), (unsigned long long) sd->sectors, (unsigned long long) sd->driveoffset, (long long) sd->plexoffset); else snprintf(s, configend - s, "sd name %s drive %s state %s len %llus driveoffset %llus detached\n", sd->name, vinum_conf.drive[sd->driveno].label.name, sd_state(sd->state), (unsigned long long) sd->sectors, (unsigned long long) sd->driveoffset); while (*s) s++; /* find the end */ } } if (s > &config[len - 2]) panic("vinum: configuration data overflow"); } /* * issue a save config request to the dæmon. The actual work * is done in process context by daemon_save_config */ void save_config(void) { queue_daemon_request(daemonrq_saveconfig, (union daemoninfo) NULL); } /* * Write the configuration to all vinum slices. 
This * is performed by the dæmon only */ void daemon_save_config(void) { int error; int written_config; /* set when we first write the config to disk */ int driveno; struct drive *drive; /* point to current drive info */ struct vinum_hdr *vhdr; /* and as header */ char *config; /* point to config data */ int wlabel_on; /* to set writing label on/off */ /* don't save the configuration while we're still working on it */ if (vinum_conf.flags & VF_CONFIGURING) return; written_config = 0; /* no config written yet */ /* Build a volume header */ vhdr = (struct vinum_hdr *) Malloc(VINUMHEADERLEN); /* get space for the config data */ CHECKALLOC(vhdr, "Can't allocate config data"); vhdr->magic = VINUM_MAGIC; /* magic number */ vhdr->config_length = MAXCONFIG; /* length of following config info */ config = Malloc(MAXCONFIG); /* get space for the config data */ CHECKALLOC(config, "Can't allocate config data"); format_config(config, MAXCONFIG); error = 0; /* no errors yet */ for (driveno = 0; driveno < vinum_conf.drives_allocated; driveno++) { drive = &vinum_conf.drive[driveno]; /* point to drive */ if (drive->state > drive_referenced) { LOCKDRIVE(drive); /* don't let it change */ /* * First, do some drive consistency checks. Some * of these are kludges, others require a process * context and couldn't be done before */ if ((drive->devicename[0] == '\0') /* XXX we keep getting these nameless drives */ ||(drive->label.name[0] == '\0')) { /* XXX we keep getting these nameless drives */ unlockdrive(drive); log(LOG_WARNING, "Removing incomplete drive, index %d\n", driveno); if (drive->vp) /* how can it be open without a name? */ close_drive(drive); free_drive(drive); /* get rid of it */ break; } if ((drive->vp == NULL) /* drive not open */ &&(drive->state > drive_down)) { /* and it thinks it's not down */ unlockdrive(drive); set_drive_state(driveno, drive_down, setstate_force); /* tell it what's what */ continue; } if ((drive->state == drive_down) /* it's down */ &&(drive->vp != NULL)) { /* but open, */ unlockdrive(drive); close_drive(drive); /* close it */ } else if (drive->state > drive_down) { getmicrotime(&drive->label.last_update); /* time of last update is now */ bcopy((char *) &drive->label, /* and the label info from the drive structure */ (char *) &vhdr->label, sizeof(vhdr->label)); if ((drive->state != drive_unallocated) && (drive->state != drive_referenced)) { /* and it's a real drive */ wlabel_on = 1; /* enable writing the label */ error = VOP_IOCTL(drive->vp, /* make the label writeable */ DIOCWLABEL, (caddr_t) & wlabel_on, FWRITE, NOCRED, curproc); if (error == 0) error = write_drive(drive, (char *) vhdr, VINUMHEADERLEN, VINUM_LABEL_OFFSET); if (error == 0) error = write_drive(drive, config, MAXCONFIG, VINUM_CONFIG_OFFSET); /* first config copy */ if (error == 0) error = write_drive(drive, config, MAXCONFIG, VINUM_CONFIG_OFFSET + MAXCONFIG); /* second copy */ wlabel_on = 0; /* enable writing the label */ if (error == 0) VOP_IOCTL(drive->vp, /* make the label non-writeable again */ DIOCWLABEL, (caddr_t) & wlabel_on, FWRITE, NOCRED, curproc); unlockdrive(drive); if (error) { log(LOG_ERR, "vinum: Can't write config to %s, error %d\n", drive->devicename, error); set_drive_state(drive->driveno, drive_down, setstate_force); } else written_config = 1; /* we've written it on at least one drive */ } } else /* not worth looking at, */ unlockdrive(drive); /* just unlock it again */ } } Free(vhdr); Free(config); } /* * Disk labels are a mess. 
The correct way to access them * is with the DIOC[GSW]DINFO ioctls, but some programs, such * as newfs, access the disk directly, so we have to write * things there. We do this only on request. If a user * request tries to read it directly, we fake up one on the fly. */ /* * get_volume_label returns a label structure to lp, which * is allocated by the caller */ void get_volume_label(struct volume *vol, struct disklabel *lp) { bzero(lp, sizeof(struct disklabel)); strncpy(lp->d_typename, "vinum", sizeof(lp->d_typename)); lp->d_type = DTYPE_VINUM; strncpy(lp->d_packname, vol->name, min(sizeof(lp->d_packname), sizeof(vol->name))); lp->d_rpm = 14400 * vol->plexes; /* to keep them guessing */ lp->d_interleave = 1; lp->d_flags = 0; /* * Fitting unto the vine, a vinum has a single * track with all its sectors */ lp->d_secsize = DEV_BSIZE; /* bytes per sector */ lp->d_nsectors = vol->size; /* data sectors per track */ lp->d_ntracks = 1; /* tracks per cylinder */ lp->d_ncylinders = 1; /* data cylinders per unit */ lp->d_secpercyl = vol->size; /* data sectors per cylinder */ lp->d_secperunit = vol->size; /* data sectors per unit */ lp->d_bbsize = BBSIZE; lp->d_sbsize = SBSIZE; lp->d_magic = DISKMAGIC; lp->d_magic2 = DISKMAGIC; /* * Set up partitions a, b and c to be identical * and the size of the volume. a is UFS, b is * swap, c is nothing */ lp->d_partitions[0].p_size = vol->size; lp->d_partitions[0].p_fsize = 1024; lp->d_partitions[0].p_fstype = FS_BSDFFS; /* FreeBSD File System :-) */ lp->d_partitions[0].p_fsize = 1024; /* FS fragment size */ lp->d_partitions[0].p_frag = 8; /* and fragments per block */ lp->d_partitions[SWAP_PART].p_size = vol->size; lp->d_partitions[SWAP_PART].p_fstype = FS_SWAP; /* swap partition */ lp->d_partitions[LABEL_PART].p_size = vol->size; lp->d_npartitions = LABEL_PART + 1; strncpy(lp->d_packname, vol->name, min(sizeof(lp->d_packname), sizeof(vol->name))); lp->d_checksum = dkcksum(lp); } /* Write a volume label. This implements the VINUM_LABEL ioctl. */ int write_volume_label(int volno) { struct disklabel *lp; struct buf *bp; struct disklabel *dlp; struct volume *vol; int error; lp = (struct disklabel *) Malloc((sizeof(struct disklabel) + (DEV_BSIZE - 1)) & (DEV_BSIZE - 1)); if (lp == 0) return ENOMEM; if ((unsigned) (volno) >= (unsigned) vinum_conf.volumes_allocated) /* invalid volume */ return ENOENT; vol = &VOL[volno]; /* volume in question */ if (vol->state <= volume_uninit) /* nothing there */ return ENXIO; else if (vol->state < volume_up) /* not accessible */ return EIO; /* I/O error */ get_volume_label(vol, lp); /* get the label */ /* * Now write to disk. This code is derived from the * system writedisklabel (), which does silly things * like reading the label and refusing to write * unless it's already there. 
*/ bp = geteblk((int) lp->d_secsize); /* get a buffer */ bp->b_dev = makedev(CDEV_MAJOR, vol->volno); /* our own raw volume */ bp->b_blkno = LABELSECTOR * ((int) lp->d_secsize / DEV_BSIZE); bp->b_bcount = lp->d_secsize; bzero(bp->b_data, lp->d_secsize); dlp = (struct disklabel *) bp->b_data; *dlp = *lp; bp->b_flags &= ~B_INVAL; bp->b_flags |= B_WRITE; vinumstrategy(bp); /* write it out */ error = biowait(bp); bp->b_flags |= B_INVAL | B_AGE; brelse(bp); return error; } /* Initialize a subdisk */ int initsd(int sdno) { return 0; } /* Look at all disks on the system for vinum slices */ int vinum_scandisk(char *devicename[], int drives) { struct drive *volatile drive; volatile int driveno; int firstdrive; /* first drive in this list */ volatile int gooddrives; /* number of usable drives found */ int firsttime; /* set if we have never configured before */ int error; struct nameidata nd; /* mount point credentials */ char *config_text; /* read the config info from disk into here */ char *volatile cptr; /* pointer into config information */ char *eptr; /* end pointer into config information */ char *config_line; /* copy the config line to */ volatile int status; int *volatile drivelist; /* list of drive indices */ #define DRIVENAMELEN 64 #define DRIVEPARTS 35 /* max partitions per drive, excluding c */ char partname[DRIVENAMELEN]; /* for creating partition names */ status = 0; /* success indication */ vinum_conf.flags |= VF_READING_CONFIG; /* reading config from disk */ gooddrives = 0; /* number of usable drives found */ firstdrive = vinum_conf.drives_used; /* the first drive */ firsttime = vinum_conf.drives_used == 0; /* are we a virgin? */ /* allocate a drive pointer list */ drivelist = (int *) Malloc(drives * DRIVEPARTS * sizeof(int)); CHECKALLOC(drivelist, "Can't allocate memory"); /* Open all drives and find which was modified most recently */ for (driveno = 0; driveno < drives; driveno++) { char part; /* UNIX partition */ int slice; int founddrive; /* flag when we find a vinum drive */ founddrive = 0; /* no vinum drive found yet on this spindle */ /* first try the partition table */ for (slice = 1; slice < 5; slice++) for (part = 'a'; part < 'i'; part++) { if (part != 'c') { /* don't do the c partition */ snprintf(partname, DRIVENAMELEN, "%ss%d%c", devicename[driveno], slice, part); drive = check_drive(partname); /* try to open it */ if (drive->lasterror != 0) /* didn't work, */ free_drive(drive); /* get rid of it */ else if (drive->flags & VF_CONFIGURED) /* already read this config, */ log(LOG_WARNING, "vinum: already read config from %s\n", /* say so */ drive->label.name); else { drivelist[gooddrives] = drive->driveno; /* keep the drive index */ drive->flags &= ~VF_NEWBORN; /* which is no longer newly born */ gooddrives++; } } } if (founddrive == 0) { /* didn't find anything, */ for (part = 'a'; part < 'i'; part++) /* try the compatibility partition */ if (part != 'c') { /* don't do the c partition */ snprintf(partname, /* /dev/sd0a */ DRIVENAMELEN, "%s%c", devicename[driveno], part); drive = check_drive(partname); /* try to open it */ if ((drive->lasterror != 0) /* didn't work, */ ||(drive->state != drive_up)) free_drive(drive); /* get rid of it */ else if (drive->flags & VF_CONFIGURED) /* already read this config, */ log(LOG_WARNING, "vinum: already read config from %s\n", /* say so */ drive->label.name); else { drivelist[gooddrives] = drive->driveno; /* keep the drive index */ drive->flags &= ~VF_NEWBORN; /* which is no longer newly born */ gooddrives++; } } } } if (gooddrives == 0) { 
log(LOG_WARNING, "vinum: no drives found\n"); return ENOENT; } /* * We now have at least one drive * open. Sort them in order of config time * and merge the config info with what we * have already */ qsort(drivelist, gooddrives, sizeof(int), drivecmp); config_text = (char *) Malloc(MAXCONFIG * 2); /* allocate buffers */ CHECKALLOC(config_text, "Can't allocate memory"); config_line = (char *) Malloc(MAXCONFIGLINE * 2); /* allocate buffers */ CHECKALLOC(config_line, "Can't allocate memory"); for (driveno = 0; driveno < gooddrives; driveno++) { /* now include the config */ drive = &DRIVE[drivelist[driveno]]; /* point to the drive */ if (firsttime && (driveno == 0)) /* we've never configured before, */ log(LOG_INFO, "vinum: reading configuration from %s\n", drive->devicename); else log(LOG_INFO, "vinum: updating configuration from %s\n", drive->devicename); /* Read in both copies of the configuration information */ error = read_drive(drive, config_text, MAXCONFIG * 2, VINUM_CONFIG_OFFSET); if (error != 0) { log(LOG_ERR, "vinum: Can't read device %s, error %d\n", drive->devicename, error); Free(config_text); Free(config_line); free_drive(drive); /* give it back */ status = error; } /* * XXX At this point, check that the two copies are the same, and do something useful if not. * In particular, consider which is newer, and what this means for the integrity of the * data on the drive */ else { vinum_conf.drives_used++; /* another drive in use */ /* Parse the configuration, and add it to the global configuration */ for (cptr = config_text; *cptr != '\0';) { /* love this style(9) */ volatile int parse_status; /* return value from parse_config */ for (eptr = config_line; (*cptr != '\n') && (*cptr != '\0');) /* until the end of the line */ *eptr++ = *cptr++; *eptr = '\0'; /* and delimit */ if (setjmp(command_fail) == 0) { /* come back here on error and continue */ parse_status = parse_config(config_line, &keyword_set, 1); /* parse the config line */ if (parse_status < 0) { /* error in config */ /* * This config should have been parsed in user * space. If we run into problems here, something * serious is afoot. Complain and let the user * snarf the config to see what's wrong */ log(LOG_ERR, "vinum: Config error on drive %s, aborting integration\n", nd.ni_dirp); Free(config_text); Free(config_line); free_drive(drive); /* give it back */ status = EINVAL; } } while (*cptr == '\n') cptr++; /* skip to next line */ } } drive->flags |= VF_CONFIGURED; /* read this drive's configuration */ } Free(config_text); Free(drivelist); vinum_conf.flags &= ~VF_READING_CONFIG; /* no longer reading from disk */ if (status != 0) throw_rude_remark(status, "Couldn't read configuration"); updateconfig(VF_READING_CONFIG); /* update from disk config */ return 0; } /* * Compare the modification dates of the drives, for qsort. 
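 * The drive with the most recent configuration update must sort to the front
 * of the list: vinum_scandisk() reads its configuration first and then only
 * updates it from the older drives.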
* Return 1 if a < b, 0 if a == b, 01 if a > b: in other * words, sort backwards */ int drivecmp(const void *va, const void *vb) { const struct drive *a = &DRIVE[*(const int *) va]; const struct drive *b = &DRIVE[*(const int *) vb]; if ((a->label.last_update.tv_sec == b->label.last_update.tv_sec) && (a->label.last_update.tv_usec == b->label.last_update.tv_usec)) return 0; else if ((a->label.last_update.tv_sec > b->label.last_update.tv_sec) || ((a->label.last_update.tv_sec == b->label.last_update.tv_sec) && (a->label.last_update.tv_usec > b->label.last_update.tv_usec))) return -1; else return 1; } Index: head/sys/dev/vinum/vinumraid5.c =================================================================== --- head/sys/dev/vinum/vinumraid5.c (revision 49534) +++ head/sys/dev/vinum/vinumraid5.c (revision 49535) @@ -1,638 +1,637 @@ /*- * Copyright (c) 1997, 1998 * Cybernet Corporation and Nan Yang Computer Services Limited. * All rights reserved. * * This software was developed as part of the NetMAX project. * * Written by Greg Lehey * * This software is distributed under the so-called ``Berkeley * License'': * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Cybernet Corporation * and Nan Yang Computer Services Limited * 4. Neither the name of the Companies nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * This software is provided ``as is'', and any express or implied * warranties, including, but not limited to, the implied warranties of * merchantability and fitness for a particular purpose are disclaimed. * In no event shall the company or contributors be liable for any * direct, indirect, incidental, special, exemplary, or consequential * damages (including, but not limited to, procurement of substitute * goods or services; loss of use, data, or profits; or business * interruption) however caused and on any theory of liability, whether * in contract, strict liability, or tort (including negligence or * otherwise) arising in any way out of the use of this software, even if * advised of the possibility of such damage. * - * $Id: raid5.c,v 1.15 1999/07/07 03:46:01 grog Exp grog $ + * $Id: vinumraid5.c,v 1.1 1999/08/07 08:22:49 grog Exp $ */ /* * XXX To do: * * lock ranges while calculating parity */ #include #include -#include #include /* * Parameters which describe the current transfer. 
* These are only used for calculation, but they * need to be passed to other functions, so it's * tidier to put them in a struct */ struct metrics { daddr_t stripebase; /* base address of stripe (1st subdisk) */ int stripeoffset; /* offset in stripe */ int stripesectors; /* total sectors to transfer in this stripe */ daddr_t sdbase; /* offset in subdisk of stripe base */ int sdcount; /* number of disks involved in this transfer */ daddr_t diskstart; /* remember where this transfer starts */ int psdno; /* number of parity subdisk */ int badsdno; /* number of down subdisk, if there is one */ int firstsdno; /* first data subdisk number */ /* These correspond to the fields in rqelement, sort of */ int useroffset; /* * Initial offset and length values for the first * data block */ int initoffset; /* start address of block to transfer */ short initlen; /* length in sectors of data transfer */ /* Define a normal operation */ int dataoffset; /* start address of block to transfer */ int datalen; /* length in sectors of data transfer */ /* Define a group operation */ int groupoffset; /* subdisk offset of group operation */ int grouplen; /* length in sectors of group operation */ /* Define a normal write operation */ int writeoffset; /* subdisk offset of normal write */ int writelen; /* length in sectors of write operation */ enum xferinfo flags; /* to check what we're doing */ int rqcount; /* number of elements in request */ }; enum requeststatus bre5(struct request *rq, int plexno, daddr_t * diskstart, daddr_t diskend); void complete_raid5_write(struct rqelement *); enum requeststatus build_rq_buffer(struct rqelement *rqe, struct plex *plex); void setrqebounds(struct rqelement *rqe, struct metrics *mp); /* * define the low-level requests needed to perform a * high-level I/O operation for a specific plex 'plexno'. * * Return 0 if all subdisks involved in the request are up, 1 if some * subdisks are not up, and -1 if the request is at least partially * outside the bounds of the subdisks. * * Modify the pointer *diskstart to point to the end address. On * read, return on the first bad subdisk, so that the caller * (build_read_request) can try alternatives. * * On entry to this routine, the prq structures are not assigned. The * assignment is performed by expandrq(). Strictly speaking, the * elements rqe->sdno of all entries should be set to -1, since 0 * (from bzero) is a valid subdisk number. We avoid this problem by * initializing the ones we use, and not looking at the others (index * >= prq->requests). 
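 *
 * To make the address arithmetic in Part A below easier to follow, here is a
 * small standalone sketch (not part of the original source; the function name
 * and the example values are invented for illustration). It applies the same
 * formulas that bre5 uses with plex->stripesize and plex->subdisks to locate
 * the stripe, the parity subdisk and the first data subdisk for a given
 * plex-relative address.
 */
static void
raid5_geometry_example(long diskaddr, long stripesize, int subdisks)
{
    long datastripe = stripesize * (subdisks - 1);  /* data sectors per stripe */
    long stripeoffset = diskaddr % datastripe;      /* offset within the stripe */
    long stripebase = diskaddr - stripeoffset;      /* plex address of stripe start */
    long sdbase = stripebase / (subdisks - 1);      /* subdisk offset of the stripe base */
    int psdno = subdisks - 1 - (diskaddr / datastripe) % subdisks; /* parity subdisk */
    int firstsdno = stripeoffset / stripesize;      /* data subdisk holding the start */

    if (firstsdno >= psdno)                         /* at or past the parity subdisk, */
        firstsdno++;                                /* step over it */
    /*
     * Example: stripesize = 256, subdisks = 4, diskaddr = 1000 gives
     * datastripe = 768, stripeoffset = 232, stripebase = 768, sdbase = 256,
     * psdno = 3 - (1000 / 768) % 4 = 2 and firstsdno = 0.
     */
    (void) sdbase;
    (void) firstsdno;
}

/*
 * bre5 itself follows.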
*/ enum requeststatus bre5(struct request *rq, int plexno, daddr_t * diskaddr, daddr_t diskend) { struct metrics m; /* most of the information */ struct sd *sd; struct plex *plex; struct buf *bp; /* user's bp */ struct rqgroup *rqg; /* the request group that we will create */ struct rqelement *rqe; /* point to this request information */ int rsectors; /* sectors remaining in this stripe */ int mysdno; /* another sd index in loops */ int rqno; /* request number */ m.diskstart = *diskaddr; /* start of transfer */ bp = rq->bp; /* buffer pointer */ plex = &PLEX[plexno]; /* point to the plex */ while (*diskaddr < diskend) { /* until we get it all sorted out */ struct rqelement *prqe = NULL; /* XXX */ m.badsdno = -1; /* no bad subdisk yet */ /* Part A: Define the request */ /* * First, calculate some sizes: * The offset of the start address from * the start of the stripe */ m.stripeoffset = *diskaddr % (plex->stripesize * (plex->subdisks - 1)); /* * The plex-relative address of the * start of the stripe */ m.stripebase = *diskaddr - m.stripeoffset; /* subdisk containing the parity stripe */ m.psdno = plex->subdisks - 1 - (*diskaddr / (plex->stripesize * (plex->subdisks - 1))) % plex->subdisks; /* * The number of the subdisk in which * the start is located */ m.firstsdno = m.stripeoffset / plex->stripesize; if (m.firstsdno >= m.psdno) /* at or past parity sd */ m.firstsdno++; /* increment it */ /* * The offset from the beginning of * the stripe on this subdisk */ m.initoffset = m.stripeoffset % plex->stripesize; /* The offset of the stripe start relative to this subdisk */ m.sdbase = m.stripebase / (plex->subdisks - 1); m.useroffset = *diskaddr - m.diskstart; /* The offset of the start in the user buffer */ /* * The number of sectors to transfer in the * current (first) subdisk */ m.initlen = min(diskend - *diskaddr, /* the amount remaining to transfer */ plex->stripesize - m.initoffset); /* and the amount left in this block */ /* * The number of sectors to transfer in this stripe * is the minumum of the amount remaining to transfer * and the amount left in this stripe */ m.stripesectors = min(diskend - *diskaddr, plex->stripesize * (plex->subdisks - 1) - m.stripeoffset); /* The number of data subdisks involved in this request */ m.sdcount = (m.stripesectors + m.initoffset + plex->stripesize - 1) / plex->stripesize; /* Part B: decide what kind of transfer this will be */ /* * start and end addresses of the transfer in * the current block. * * There are a number of different kinds of transfer, each of which relates to a * specific subdisk: * * 1. Normal read. All participating subdisks are up, and the transfer can be * made directly to the user buffer. The bounds of the transfer are described * by m.dataoffset and m.datalen. We have already calculated m.initoffset and * m.initlen, which define the parameters for the first data block. * * 2. Recovery read. One participating subdisk is down. To recover data, all * the other subdisks, including the parity subdisk, must be read. The data is * recovered by exclusive-oring all the other blocks. The bounds of the transfer * are described by m.groupoffset and m.grouplen. * * 3. A read request may request reading both available data (normal read) and * non-available data (recovery read). This can be a problem if the address ranges * of the two reads do not coincide: in this case, the normal read needs to be * extended to cover the address range of the recovery read, and must thus be * performed out of malloced memory. * * 4. Normal write. 
All the participating subdisks are up. The bounds of the transfer * are described by m.dataoffset and m.datalen. Since these values differ for each * block, we calculate the bounds for the parity block independently as the maximum * of the individual blocks and store these values in m.writeoffset and m.writelen. * This write proceeds in four phases: * * i. Read the old contents of each block and the parity block. * * ii. ``Remove'' the old contents from the parity block with exclusive or. * * iii. ``Insert'' the new contents of the block in the parity block, again with * exclusive or. * * iv. Write the new contents of the data blocks and the parity block. The data block * transfers can be made directly from the user buffer. * * 5. Degraded write where the data block is not available. The bounds of the * transfer are described by m.groupoffset and m.grouplen. This requires the * following steps: * * i. Read in all the other data blocks, excluding the parity block. * * ii. Recreate the parity block from the other data blocks and the data to be written. * * iii. Write the parity block. * * 6. Parityless write, a write where the parity block is not available. This * is in fact the simplest: just write the data blocks. This can proceed directly * from the user buffer. The bounds of the transfer are described * by m.dataoffset and m.datalen. * * 7. Combination of degraded data block write and normal write. In this case the * address ranges of the reads may also need to be extended to cover all * participating blocks. * * All requests in a group transfer transfer the same address range relative * to their subdisk. The individual transfers may vary, but since our group of * requests is all in a single slice, we can define a range in which they all * fall. * * In the following code section, we determine which kind of transfer we will perform. * If there is a group transfer, we also decide its bounds relative to the subdisks. * At the end, we have the following values: * * m.flags indicates the kinds of transfers we will perform * m.initoffset indicates the offset of the beginning of any data * operation relative to the beginning of the stripe base. * m.initlen specifies the length of any data operation. * m.dataoffset contains the same value as m.initoffset. * m.datalen contains the same value as m.initlen. Initially * dataoffset and datalen describe the parameters for the first * data block; while building the data block requests, they are * updated for each block. * m.groupoffset indicates the offset of any group operation relative * to the beginning of the stripe base * m.grouplen specifies the length of any group operation * m.writeoffset indicates the offset of a normal write relative * to the beginning of the stripe base. This value differs from * m.dataoffset in that it applies to the entire operation, and * not just the first block. * m.writelen specifies the total span of a normal write operation. * writeoffset and writelen are used to define the parity block. */ m.groupoffset = 0; /* assume no group... 
*/ m.grouplen = 0; /* until we know we have one */ m.writeoffset = m.initoffset; /* start offset of transfer */ m.writelen = 0; /* nothing to write yet */ m.flags = 0; /* no flags yet */ rsectors = m.stripesectors; /* remaining sectors to examine */ m.dataoffset = m.initoffset; /* start at the beginning of the transfer */ m.datalen = m.initlen; if (m.sdcount > 1) { plex->multiblock++; /* more than one block for the request */ /* * If we have two transfers that don't overlap, * (one at the end of the first block, the other * at the beginning of the second block), * it's cheaper to split them */ if (rsectors < plex->stripesize) { m.sdcount = 1; /* just one subdisk */ m.stripesectors = m.initlen; /* and just this many sectors */ rsectors = m.initlen; /* and in the loop counter */ } } if (SD[plex->sdnos[m.psdno]].state < sd_reborn) /* is our parity subdisk down? */ m.badsdno = m.psdno; /* note that it's down */ if (bp->b_flags & B_READ) { /* read operation */ for (mysdno = m.firstsdno; rsectors > 0; mysdno++) { if (mysdno == m.psdno) /* ignore parity on read */ mysdno++; if (mysdno == plex->subdisks) /* wraparound */ mysdno = 0; if (mysdno == m.psdno) /* parity, */ mysdno++; /* we've given already */ if (SD[plex->sdnos[mysdno]].state < sd_reborn) { /* got a bad subdisk, */ if (m.badsdno >= 0) /* we had one already, */ /* * XXX be cleverer here. We can still * read what we can read. */ return REQUEST_DOWN; /* we can't take a second */ m.badsdno = mysdno; /* got the first */ m.groupoffset = m.dataoffset; /* define the bounds */ m.grouplen = m.datalen; m.flags |= XFR_RECOVERY_READ; /* we need recovery */ plex->recovered_reads++; /* count another one */ } else m.flags |= XFR_NORMAL_READ; /* normal read */ /* Update the pointers for the next block */ m.dataoffset = 0; /* back to the start of the stripe */ rsectors -= m.datalen; /* remaining sectors to examine */ m.datalen = min(rsectors, plex->stripesize); /* amount that will fit in this block */ } } else { /* write operation */ for (mysdno = m.firstsdno; rsectors > 0; mysdno++) { if (mysdno == m.psdno) /* parity stripe, we've dealt with that */ mysdno++; if (mysdno == plex->subdisks) /* wraparound */ mysdno = 0; if (mysdno == m.psdno) /* parity, */ mysdno++; /* we've given already */ sd = &SD[plex->sdnos[mysdno]]; if (sd->state != sd_up) { enum requeststatus s; s = checksdstate(sd, rq, *diskaddr, diskend); /* do we need to change state? */ if (s && (m.badsdno >= 0)) { /* second bad disk, */ int sdno; /* * If the parity disk is down, there's * no recovery. We make all involved * subdisks stale. Otherwise, we * should be able to recover, but it's * like pulling teeth. Fix it later. * * XXX be cleverer here. We should * still write what we can write. 
*/ for (sdno = 0; sdno < m.sdcount; sdno++) { struct sd *sd = &SD[plex->sdnos[sdno]]; if (sd->state >= sd_reborn) /* sort of up, */ set_sd_state(sd->sdno, sd_stale, setstate_force); /* make it stale */ } return s; /* and crap out */ } m.badsdno = mysdno; /* note which one is bad */ m.flags |= XFR_DEGRADED_WRITE; /* we need recovery */ plex->degraded_writes++; /* count another one */ m.groupoffset = m.dataoffset; /* define the bounds */ m.grouplen = m.datalen; } else { m.flags |= XFR_NORMAL_WRITE; /* normal write operation */ if (m.writeoffset > m.dataoffset) { /* move write operation lower */ m.writelen = max(m.writeoffset + m.writelen, m.dataoffset + m.datalen) - m.dataoffset; m.writeoffset = m.dataoffset; } else m.writelen = max(m.writeoffset + m.writelen, m.dataoffset + m.datalen) - m.writeoffset; } /* Update the pointers for the next block */ m.dataoffset = 0; /* back to the start of the stripe */ rsectors -= m.datalen; /* remaining sectors to examine */ m.datalen = min(rsectors, plex->stripesize); /* amount that will fit in this block */ } if (m.badsdno == m.psdno) { /* got a bad parity block, */ struct sd *psd = &SD[plex->sdnos[m.psdno]]; if (psd->state == sd_down) set_sd_state(psd->sdno, sd_obsolete, setstate_force); /* it's obsolete now */ else if (psd->state == sd_crashed) set_sd_state(psd->sdno, sd_stale, setstate_force); /* it's stale now */ m.flags &= ~XFR_NORMAL_WRITE; /* this write isn't normal, */ m.flags |= XFR_PARITYLESS_WRITE; /* it's parityless */ plex->parityless_writes++; /* count another one */ } } /* reset the initial transfer values */ m.dataoffset = m.initoffset; /* start at the beginning of the transfer */ m.datalen = m.initlen; /* * XXX see if we can satisfy a recovery_read from a * different plex. If so, return from here with no requests WRITEME */ /* decide how many requests we need */ if (m.flags & (XFR_RECOVERY_READ | XFR_DEGRADED_WRITE)) /* doing a recovery read or degraded write, */ m.rqcount = plex->subdisks; /* all subdisks */ else if (m.flags & XFR_NORMAL_WRITE) /* normal write, */ m.rqcount = m.sdcount + 1; /* all data blocks and the parity block */ else /* parityless write or normal read */ m.rqcount = m.sdcount; /* just the data blocks */ /* Part C: build the requests */ rqg = allocrqg(rq, m.rqcount); /* get a request group */ if (rqg == NULL) { /* malloc failed */ bp->b_flags |= B_ERROR; bp->b_error = ENOMEM; biodone(bp); return REQUEST_ENOMEM; } rqg->plexno = plexno; rqg->flags = m.flags; rqno = 0; /* index in the request group */ /* 1: PARITY BLOCK */ /* * Are we performing an operation which requires parity? In that case, * work out the parameters and define the parity block. 
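 *
 * For a normal write, the parity block read here is later updated with the
 * classic RAID-5 read-modify-write rule (phases ii and iii of the description
 * above): each old data byte is XORed out of the parity and each new data
 * byte is XORed in. A minimal standalone sketch of that rule follows; the
 * function name is invented for illustration, and the real work in this
 * driver is done elsewhere, presumably in complete_raid5_write(), which is
 * declared above but not part of this excerpt.
 */
static void
raid5_parity_update_example(unsigned char *parity, const unsigned char *olddata,
    const unsigned char *newdata, int len)
{
    int i;

    for (i = 0; i < len; i++)                       /* for every byte in the block: */
        parity[i] ^= olddata[i] ^ newdata[i];       /* remove old data, insert new data */
}

/*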
* XFR_PARITYOP is XFR_NORMAL_WRITE | XFR_RECOVERY_READ | XFR_DEGRADED_WRITE */ if (m.flags & XFR_PARITYOP) { /* need parity */ rqe = &rqg->rqe[rqno]; /* point to element */ sd = &SD[plex->sdnos[m.psdno]]; /* the subdisk in question */ rqe->rqg = rqg; /* point back to group */ rqe->flags = (m.flags | XFR_PARITY_BLOCK | XFR_MALLOCED) /* always malloc parity block */ &~(XFR_NORMAL_READ | XFR_PARITYLESS_WRITE); /* transfer flags without data op stuf */ setrqebounds(rqe, &m); /* set up the bounds of the transfer */ rqe->sdno = sd->sdno; /* subdisk number */ rqe->driveno = sd->driveno; prqe = rqe; /* debug XXX */ if (build_rq_buffer(rqe, plex)) /* build the buffer */ return REQUEST_ENOMEM; /* can't do it */ rqe->b.b_flags |= B_READ; /* we must read first */ m.sdcount++; /* adjust the subdisk count */ rqno++; /* and point to the next request */ } /* * 2: DATA BLOCKS * Now build up requests for the blocks required * for individual transfers */ for (mysdno = m.firstsdno; rqno < m.sdcount; mysdno++, rqno++) { if (mysdno == m.psdno) /* parity, */ mysdno++; /* we've given already */ if (mysdno == plex->subdisks) /* got to the end, */ mysdno = 0; /* wrap around */ if (mysdno == m.psdno) /* parity, */ mysdno++; /* we've given already */ rqe = &rqg->rqe[rqno]; /* point to element */ sd = &SD[plex->sdnos[mysdno]]; /* the subdisk in question */ rqe->rqg = rqg; /* point to group */ if (m.flags & XFR_NEEDS_MALLOC) /* we need a malloced buffer first */ rqe->flags = m.flags | XFR_DATA_BLOCK | XFR_MALLOCED; /* transfer flags */ else rqe->flags = m.flags | XFR_DATA_BLOCK; /* transfer flags */ if (mysdno == m.badsdno) { /* this is the bad subdisk */ rqg->badsdno = rqno; /* note which one */ rqe->flags |= XFR_BAD_SUBDISK; /* note that it's dead */ /* * we can't read or write from/to it, * but we don't need to malloc */ rqe->flags &= ~(XFR_MALLOCED | XFR_NORMAL_READ | XFR_NORMAL_WRITE); } setrqebounds(rqe, &m); /* set up the bounds of the transfer */ #if VINUMDEBUG if (prqe && (rqe->groupoffset + rqe->sdoffset) < prqe->sdoffset) /* XXX */ Debugger("Low data block"); /* XXX */ #endif rqe->useroffset = m.useroffset; /* offset in user buffer */ rqe->sdno = sd->sdno; /* subdisk number */ rqe->driveno = sd->driveno; if (build_rq_buffer(rqe, plex)) /* build the buffer */ return REQUEST_ENOMEM; /* can't do it */ if ((m.flags & XFR_PARITYOP) /* parity operation, */ &&((m.flags & XFR_BAD_SUBDISK) == 0)) /* and not the bad subdisk, */ rqe->b.b_flags |= B_READ; /* we must read first */ /* Now update pointers for the next block */ *diskaddr += m.datalen; /* skip past what we've done */ m.stripesectors -= m.datalen; /* deduct from what's left */ m.useroffset += m.datalen; /* and move on in the user buffer */ m.datalen = min(m.stripesectors, plex->stripesize); /* and recalculate */ m.dataoffset = 0; /* start at the beginning of next block */ } /* * 3: REMAINING BLOCKS FOR RECOVERY * Finally, if we have a recovery operation, build * up transfers for the other subdisks. Follow the * subdisks around until we get to where we started. * These requests use only the group parameters. 
*/ if ((rqno < m.rqcount) /* haven't done them all already */ &&(m.flags & (XFR_RECOVERY_READ | XFR_DEGRADED_WRITE))) { for (; rqno < m.rqcount; rqno++, mysdno++) { if (mysdno == m.psdno) /* parity, */ mysdno++; /* we've given already */ if (mysdno == plex->subdisks) /* got to the end, */ mysdno = 0; /* wrap around */ if (mysdno == m.psdno) /* parity, */ mysdno++; /* we've given already */ rqe = &rqg->rqe[rqno]; /* point to element */ sd = &SD[plex->sdnos[mysdno]]; /* the subdisk in question */ rqe->rqg = rqg; /* point to group */ rqe->sdoffset = m.sdbase + m.groupoffset; /* start of transfer */ rqe->dataoffset = 0; /* for tidiness' sake */ rqe->groupoffset = 0; /* group starts at the beginining */ rqe->datalen = 0; rqe->grouplen = m.grouplen; rqe->buflen = m.grouplen; rqe->flags = (m.flags | XFR_MALLOCED) & ~XFR_DATAOP; /* transfer flags without data op stuf */ rqe->sdno = sd->sdno; /* subdisk number */ rqe->driveno = sd->driveno; if (build_rq_buffer(rqe, plex)) /* build the buffer */ return REQUEST_ENOMEM; /* can't do it */ rqe->b.b_flags |= B_READ; /* we must read first */ } } if (*diskaddr < diskend) /* didn't finish the request on this stripe */ plex->multistripe++; /* count another one */ } return REQUEST_OK; } /* * Helper function for rqe5: adjust the bounds of the transfers to minimize * the buffer allocation. * * Each request can handle two of three different data ranges: * * 1. The range described by the parameters dataoffset and datalen, * for normal read or parityless write. * 2. The range described by the parameters groupoffset and grouplen, * for recovery read and degraded write. * 3. For normal write, the range depends on the kind of block. For * data blocks, the range is defined by dataoffset and datalen. For * parity blocks, it is defined by writeoffset and writelen. * * In order not to allocate more memory than necessary, this function * adjusts the bounds parameter for each request to cover just the minimum * necessary for the function it performs. This will normally vary from one * request to the next. * * Things are slightly different for the parity block. In this case, the bounds * defined by mp->writeoffset and mp->writelen also play a rôle. Select this * case by setting the parameter forparity != 0 */ void setrqebounds(struct rqelement *rqe, struct metrics *mp) { /* parity block of a normal write */ if ((rqe->flags & (XFR_NORMAL_WRITE | XFR_PARITY_BLOCK)) == (XFR_NORMAL_WRITE | XFR_PARITY_BLOCK)) { /* case 3 */ if (rqe->flags & XFR_DEGRADED_WRITE) { /* also degraded write */ /* * With a combined normal and degraded write, we * will zero out the area of the degraded write * in the second phase, so we don't need to read * it in. Unfortunately, we need a way to tell * build_request_buffer the size of the buffer, * and currently that's the length of the read. * As a result, we read everything, even the stuff * that we're going to nuke. 
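 *
 * As a purely illustrative example (the numbers are invented): with
 * writeoffset = 32, writelen = 64, groupoffset = 0 and grouplen = 16, the
 * group operation starts lower, so the first branch below sets
 * sdoffset = sdbase, dataoffset = 32 and groupoffset = 0; datalen becomes
 * writelen (64) and grouplen stays 16, giving a buffer length of
 * max(32 + 64, 0 + 16) = 96 sectors at the end of the function.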
* FIXME XXX */ if (mp->groupoffset < mp->writeoffset) { /* group operation starts lower */ rqe->sdoffset = mp->sdbase + mp->groupoffset; /* start of transfer */ rqe->dataoffset = mp->writeoffset - mp->groupoffset; /* data starts here */ rqe->groupoffset = 0; /* and the group at the beginning */ } else { /* individual data starts first */ rqe->sdoffset = mp->sdbase + mp->writeoffset; /* start of transfer */ rqe->dataoffset = 0; /* individual data starts at the beginning */ rqe->groupoffset = mp->groupoffset - mp->writeoffset; /* group starts here */ } rqe->datalen = mp->writelen; rqe->grouplen = mp->grouplen; } else { /* just normal write (case 3) */ rqe->sdoffset = mp->sdbase + mp->writeoffset; /* start of transfer */ rqe->dataoffset = 0; /* degradation starts at the beginning */ rqe->groupoffset = 0; /* for tidiness' sake */ rqe->datalen = mp->writelen; rqe->grouplen = 0; } } else if (rqe->flags & XFR_DATAOP) { /* data operation (case 1 or 3) */ if (rqe->flags & XFR_GROUPOP) { /* also a group operation (case 2) */ if (mp->groupoffset < mp->dataoffset) { /* group operation starts lower */ rqe->sdoffset = mp->sdbase + mp->groupoffset; /* start of transfer */ rqe->dataoffset = mp->dataoffset - mp->groupoffset; /* data starts here */ rqe->groupoffset = 0; /* and the group at the beginning */ } else { /* individual data starts first */ rqe->sdoffset = mp->sdbase + mp->dataoffset; /* start of transfer */ rqe->dataoffset = 0; /* individual data starts at the beginning */ rqe->groupoffset = mp->groupoffset - mp->dataoffset; /* group starts here */ } rqe->datalen = mp->datalen; rqe->grouplen = mp->grouplen; } else { /* just data operation (case 1) */ rqe->sdoffset = mp->sdbase + mp->dataoffset; /* start of transfer */ rqe->dataoffset = 0; /* degradation starts at the beginning */ rqe->groupoffset = 0; /* for tidiness' sake */ rqe->datalen = mp->datalen; rqe->grouplen = 0; } } else { /* just group operations (case 2) */ rqe->sdoffset = mp->sdbase + mp->groupoffset; /* start of transfer */ rqe->dataoffset = 0; /* for tidiness' sake */ rqe->groupoffset = 0; /* group starts at the beginining */ rqe->datalen = 0; rqe->grouplen = mp->grouplen; } rqe->buflen = max(rqe->dataoffset + rqe->datalen, /* total buffer length */ rqe->groupoffset + rqe->grouplen); } Index: head/sys/dev/vinum/vinumrequest.c =================================================================== --- head/sys/dev/vinum/vinumrequest.c (revision 49534) +++ head/sys/dev/vinum/vinumrequest.c (revision 49535) @@ -1,1064 +1,1063 @@ /*- * Copyright (c) 1997, 1998, 1999 * Nan Yang Computer Services Limited. All rights reserved. * * Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project. * * Written by Greg Lehey * * This software is distributed under the so-called ``Berkeley * License'': * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Nan Yang Computer * Services Limited. * 4. 
Neither the name of the Company nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * This software is provided ``as is'', and any express or implied * warranties, including, but not limited to, the implied warranties of * merchantability and fitness for a particular purpose are disclaimed. * In no event shall the company or contributors be liable for any * direct, indirect, incidental, special, exemplary, or consequential * damages (including, but not limited to, procurement of substitute * goods or services; loss of use, data, or profits; or business * interruption) however caused and on any theory of liability, whether * in contract, strict liability, or tort (including negligence or * otherwise) arising in any way out of the use of this software, even if * advised of the possibility of such damage. * - * $Id: vinumrequest.c,v 1.24 1999/07/05 01:53:14 grog Exp grog $ + * $Id: vinumrequest.c,v 1.29 1999/08/07 08:13:23 grog Exp $ */ #include #include -#include #include enum requeststatus bre(struct request *rq, int plexno, daddr_t * diskstart, daddr_t diskend); enum requeststatus bre5(struct request *rq, int plexno, daddr_t * diskstart, daddr_t diskend); enum requeststatus build_read_request(struct request *rq, int volplexno); enum requeststatus build_write_request(struct request *rq); enum requeststatus build_rq_buffer(struct rqelement *rqe, struct plex *plex); void freerq(struct request *rq); int find_alternate_sd(struct request *rq); int check_range_covered(struct request *); void complete_rqe(struct buf *bp); void complete_raid5_write(struct rqelement *); int abortrequest(struct request *rq, int error); void sdio_done(struct buf *bp); int vinum_bounds_check(struct buf *bp, struct volume *vol); caddr_t allocdatabuf(struct rqelement *rqe); void freedatabuf(struct rqelement *rqe); #ifdef VINUMDEBUG struct rqinfo rqinfo[RQINFO_SIZE]; struct rqinfo *rqip = rqinfo; void logrq(enum rqinfo_type type, union rqinfou info, struct buf *ubp) { int s = splhigh(); microtime(&rqip->timestamp); /* when did this happen? 
*/ rqip->type = type; rqip->bp = ubp; /* user buffer */ switch (type) { case loginfo_user_bp: case loginfo_user_bpl: bcopy(info.bp, &rqip->info.b, sizeof(struct buf)); rqip->devmajor = major(info.bp->b_dev); rqip->devminor = minor(info.bp->b_dev); break; case loginfo_iodone: case loginfo_rqe: case loginfo_raid5_data: case loginfo_raid5_parity: bcopy(info.rqe, &rqip->info.rqe, sizeof(struct rqelement)); rqip->devmajor = major(info.rqe->b.b_dev); rqip->devminor = minor(info.rqe->b.b_dev); break; case loginfo_unused: break; } rqip++; if (rqip >= &rqinfo[RQINFO_SIZE]) /* wrap around */ rqip = rqinfo; splx(s); } #endif void vinumstrategy(struct buf *bp) { int volno; struct volume *vol = NULL; switch (DEVTYPE(bp->b_dev)) { case VINUM_SD_TYPE: case VINUM_RAWSD_TYPE: sdio(bp); return; /* * In fact, vinum doesn't handle drives: they're * handled directly by the disk drivers */ case VINUM_DRIVE_TYPE: default: bp->b_error = EIO; /* I/O error */ bp->b_flags |= B_ERROR; biodone(bp); return; case VINUM_VOLUME_TYPE: /* volume I/O */ volno = Volno(bp->b_dev); vol = &VOL[volno]; if (vol->state != volume_up) { /* can't access this volume */ bp->b_error = EIO; /* I/O error */ bp->b_flags |= B_ERROR; biodone(bp); return; } if (vinum_bounds_check(bp, vol) <= 0) { /* don't like them bounds */ biodone(bp); /* have nothing to do with this */ return; } /* FALLTHROUGH */ /* * Plex I/O is pretty much the same as volume I/O * for a single plex. Indicate this by passing a NULL * pointer (set above) for the volume */ case VINUM_PLEX_TYPE: case VINUM_RAWPLEX_TYPE: bp->b_resid = bp->b_bcount; /* transfer everything */ vinumstart(bp, 0); return; } } /* * Start a transfer. Return -1 on error, * 0 if OK, 1 if we need to retry. * Parameter reviveok is set when doing * transfers for revives: it allows transfers to * be started immediately when a revive is in * progress. During revive, normal transfers * are queued if they share address space with * a currently active revive operation. */ int vinumstart(struct buf *bp, int reviveok) { int plexno; int maxplex; /* maximum number of plexes to handle */ struct volume *vol; struct request *rq; /* build up our request here */ enum requeststatus status; #if VINUMDEBUG if (debug & DEBUG_LASTREQS) logrq(loginfo_user_bp, (union rqinfou) bp, bp); #endif /* * XXX In these routines, we're assuming that * we will always be called with bp->b_bcount * which is a multiple of the sector size. This * is a reasonable assumption, since we are only * called from system routines. Should we check * anyway? */ if ((bp->b_bcount % DEV_BSIZE) != 0) { /* bad length */ bp->b_error = EINVAL; /* invalid size */ bp->b_flags |= B_ERROR; biodone(bp); return -1; } rq = (struct request *) Malloc(sizeof(struct request)); /* allocate a request struct */ if (rq == NULL) { /* can't do it */ bp->b_error = ENOMEM; /* can't get memory */ bp->b_flags |= B_ERROR; biodone(bp); return -1; } bzero(rq, sizeof(struct request)); /* * Note the volume ID. 
This can be NULL, which * the request building functions use as an * indication for single plex I/O */ rq->bp = bp; /* and the user buffer struct */ if (DEVTYPE(bp->b_dev) == VINUM_VOLUME_TYPE) { /* it's a volume, */ rq->volplex.volno = Volno(bp->b_dev); /* get the volume number */ vol = &VOL[rq->volplex.volno]; /* and point to it */ vol->active++; /* one more active request */ maxplex = vol->plexes; /* consider all its plexes */ } else { vol = NULL; /* no volume */ rq->volplex.plexno = Plexno(bp->b_dev); /* point to the plex */ rq->isplex = 1; /* note that it's a plex */ maxplex = 1; /* just the one plex */ } if (bp->b_flags & B_READ) { /* * This is a read request. Decide * which plex to read from. * * There's a potential race condition here, * since we're not locked, and we could end * up multiply incrementing the round-robin * counter. This doesn't have any serious * effects, however. */ if (vol != NULL) { vol->reads++; vol->bytes_read += bp->b_bcount; plexno = vol->preferred_plex; /* get the plex to use */ if (plexno < 0) { /* round robin */ plexno = vol->last_plex_read; vol->last_plex_read++; if (vol->last_plex_read == vol->plexes) /* got the the end? */ vol->last_plex_read = 0; /* wrap around */ } status = build_read_request(rq, plexno); /* build a request */ } else { daddr_t diskaddr = bp->b_blkno; /* start offset of transfer */ status = bre(rq, /* build a request list */ rq->volplex.plexno, &diskaddr, diskaddr + (bp->b_bcount / DEV_BSIZE)); } if ((status > REQUEST_RECOVERED) /* can't satisfy it */ ||(bp->b_flags & B_DONE)) { /* XXX shouldn't get this without bad status */ if (status == REQUEST_DOWN) { /* not enough subdisks */ bp->b_error = EIO; /* I/O error */ bp->b_flags |= B_ERROR; } biodone(bp); freerq(rq); return -1; } return launch_requests(rq, reviveok); /* now start the requests if we can */ } else /* * This is a write operation. We write to all * plexes. If this is a RAID 5 plex, we must also * update the parity stripe. */ { if (vol != NULL) { vol->writes++; vol->bytes_written += bp->b_bcount; status = build_write_request(rq); /* Not all the subdisks are up */ } else { /* plex I/O */ daddr_t diskstart; diskstart = bp->b_blkno; /* start offset of transfer */ status = bre(rq, Plexno(bp->b_dev), &diskstart, bp->b_blkno + (bp->b_bcount / DEV_BSIZE)); /* build requests for the plex */ } if ((status > REQUEST_RECOVERED) /* can't satisfy it */ ||(bp->b_flags & B_DONE)) { /* XXX shouldn't get this without bad status */ if (status == REQUEST_DOWN) { /* not enough subdisks */ bp->b_error = EIO; /* I/O error */ bp->b_flags |= B_ERROR; } if ((bp->b_flags & B_DONE) == 0) biodone(bp); freerq(rq); return -1; } return launch_requests(rq, reviveok); /* now start the requests if we can */ } } /* * Call the low-level strategy routines to * perform the requests in a struct request */ int launch_requests(struct request *rq, int reviveok) { struct rqgroup *rqg; int rqno; /* loop index */ struct rqelement *rqe; /* current element */ int s; /* * First find out whether we're reviving, and the * request contains a conflict. 
If so, we hang * the request off plex->waitlist of the first * plex we find which is reviving */ if ((rq->flags & XFR_REVIVECONFLICT) /* possible revive conflict */ &&(!reviveok)) { /* and we don't want to do it now, */ struct sd *sd; struct request *waitlist; /* point to the waitlist */ sd = &SD[rq->sdno]; if (sd->waitlist != NULL) { /* something there already, */ waitlist = sd->waitlist; while (waitlist->next != NULL) /* find the end */ waitlist = waitlist->next; waitlist->next = rq; /* hook our request there */ } else sd->waitlist = rq; /* hook our request at the front */ #if VINUMDEBUG if (debug & DEBUG_REVIVECONFLICT) log(LOG_DEBUG, "Revive conflict sd %d: %x\n%s dev %d.%d, offset 0x%x, length %ld\n", rq->sdno, (u_int) rq, rq->bp->b_flags & B_READ ? "Read" : "Write", major(rq->bp->b_dev), minor(rq->bp->b_dev), rq->bp->b_blkno, rq->bp->b_bcount); /* XXX */ #endif return 0; /* and get out of here */ } rq->active = 0; /* nothing yet */ /* XXX This is probably due to a bug */ if (rq->rqg == NULL) { /* no request */ log(LOG_ERR, "vinum: null rqg\n"); abortrequest(rq, EINVAL); return -1; } #if VINUMDEBUG if (debug & DEBUG_ADDRESSES) log(LOG_DEBUG, "Request: %x\n%s dev %d.%d, offset 0x%x, length %ld\n", (u_int) rq, rq->bp->b_flags & B_READ ? "Read" : "Write", major(rq->bp->b_dev), minor(rq->bp->b_dev), rq->bp->b_blkno, rq->bp->b_bcount); /* XXX */ vinum_conf.lastrq = (int) rq; vinum_conf.lastbuf = rq->bp; if (debug & DEBUG_LASTREQS) logrq(loginfo_user_bpl, (union rqinfou) rq->bp, rq->bp); #endif s = splbio(); for (rqg = rq->rqg; rqg != NULL; rqg = rqg->next) { /* through the whole request chain */ rqg->active = rqg->count; /* they're all active */ rq->active++; /* one more active request group */ for (rqno = 0; rqno < rqg->count; rqno++) { rqe = &rqg->rqe[rqno]; if (rqe->flags & XFR_BAD_SUBDISK) /* this subdisk is bad, */ rqg->active--; /* one less active request */ else if ((rqe->flags & XFR_BAD_SUBDISK) == 0) { /* subdisk isn't bad, we can do it */ if ((rqe->b.b_flags & B_READ) == 0) rqe->b.b_vp->v_numoutput++; /* one more output going */ rqe->b.b_flags |= B_ORDERED; /* XXX chase SCSI driver */ #if VINUMDEBUG if (debug & DEBUG_ADDRESSES) log(LOG_DEBUG, " %s dev %d.%d, sd %d, offset 0x%x, devoffset 0x%x, length %ld\n", rqe->b.b_flags & B_READ ? "Read" : "Write", major(rqe->b.b_dev), minor(rqe->b.b_dev), rqe->sdno, (u_int) (rqe->b.b_blkno - SD[rqe->sdno].driveoffset), rqe->b.b_blkno, rqe->b.b_bcount); /* XXX */ if (debug & DEBUG_NUMOUTPUT) log(LOG_DEBUG, " vinumstart sd %d numoutput %ld\n", rqe->sdno, rqe->b.b_vp->v_numoutput); if (debug & DEBUG_LASTREQS) logrq(loginfo_rqe, (union rqinfou) rqe, rq->bp); #endif /* fire off the request */ (*bdevsw(rqe->b.b_dev)->d_strategy) (&rqe->b); } } } splx(s); return 0; } /* * define the low-level requests needed to perform a * high-level I/O operation for a specific plex 'plexno'. * * Return REQUEST_OK if all subdisks involved in the request are up, * REQUEST_DOWN if some subdisks are not up, and REQUEST_EOF if the * request is at least partially outside the bounds of the subdisks. * * Modify the pointer *diskstart to point to the end address. On * read, return on the first bad subdisk, so that the caller * (build_read_request) can try alternatives. * * On entry to this routine, the rqg structures are not assigned. The * assignment is performed by expandrq(). Strictly speaking, the * elements rqe->sdno of all entries should be set to -1, since 0 * (from bzero) is a valid subdisk number. 
We avoid this problem by * initializing the ones we use, and not looking at the others (index * >= rqg->requests). */ enum requeststatus bre(struct request *rq, int plexno, daddr_t * diskaddr, daddr_t diskend) { int sdno; struct sd *sd; struct rqgroup *rqg; struct buf *bp; /* user's bp */ struct plex *plex; enum requeststatus status; /* return value */ daddr_t plexoffset; /* offset of transfer in plex */ daddr_t stripebase; /* base address of stripe (1st subdisk) */ daddr_t stripeoffset; /* offset in stripe */ daddr_t blockoffset; /* offset in stripe on subdisk */ struct rqelement *rqe; /* point to this request information */ daddr_t diskstart = *diskaddr; /* remember where this transfer starts */ enum requeststatus s; /* temp return value */ bp = rq->bp; /* buffer pointer */ status = REQUEST_OK; /* return value: OK until proven otherwise */ plex = &PLEX[plexno]; /* point to the plex */ switch (plex->organization) { case plex_concat: sd = NULL; /* (keep compiler quiet) */ for (sdno = 0; sdno < plex->subdisks; sdno++) { sd = &SD[plex->sdnos[sdno]]; if (*diskaddr < sd->plexoffset) /* we must have a hole, */ status = REQUEST_DEGRADED; /* note the fact */ if (*diskaddr < (sd->plexoffset + sd->sectors)) { /* the request starts in this subdisk */ rqg = allocrqg(rq, 1); /* space for the request */ if (rqg == NULL) { /* malloc failed */ bp->b_flags |= B_ERROR; bp->b_error = ENOMEM; biodone(bp); return REQUEST_ENOMEM; } rqg->plexno = plexno; rqe = &rqg->rqe[0]; /* point to the element */ rqe->rqg = rqg; /* group */ rqe->sdno = sd->sdno; /* put in the subdisk number */ plexoffset = *diskaddr; /* start offset in plex */ rqe->sdoffset = plexoffset - sd->plexoffset; /* start offset in subdisk */ rqe->useroffset = plexoffset - diskstart; /* start offset in user buffer */ rqe->dataoffset = 0; rqe->datalen = min(diskend - *diskaddr, /* number of sectors to transfer in this sd */ sd->sectors - rqe->sdoffset); rqe->groupoffset = 0; /* no groups for concatenated plexes */ rqe->grouplen = 0; rqe->buflen = rqe->datalen; /* buffer length is data buffer length */ rqe->flags = 0; rqe->driveno = sd->driveno; if (sd->state != sd_up) { /* *now* we find the sd is down */ s = checksdstate(sd, rq, *diskaddr, diskend); /* do we need to change state? */ if (s == REQUEST_DOWN) { /* down? */ if (rq->bp->b_flags & B_READ) /* read request, */ return REQUEST_DEGRADED; /* give up here */ /* * If we're writing, don't give up * because of a bad subdisk. Go * through to the bitter end, but note * which ones we can't access. */ rqe->flags = XFR_BAD_SUBDISK; status = REQUEST_DEGRADED; /* can't do it all */ } } *diskaddr += rqe->datalen; /* bump the address */ if ((rqe->flags & XFR_BAD_SUBDISK) == 0) { /* subdisk OK, */ /* * We could build the buffer anyway, even if the * subdisk is down, but it's a waste of time and * space. */ if (build_rq_buffer(rqe, plex)) { /* build the buffer */ deallocrqg(rqg); bp->b_flags |= B_ERROR; bp->b_error = ENOMEM; biodone(bp); return REQUEST_ENOMEM; /* can't do it */ } } } if (*diskaddr == diskend) /* we're finished, */ break; /* get out of here */ } /* * We've got to the end of the plex. Have we got to the end of * the transfer? It would seem that having an offset beyond the * end of the subdisk is an error, but in fact it can happen if * the volume has another plex of different size. There's a valid * question as to why you would want to do this, but currently * it's allowed. * * In a previous version, I returned REQUEST_DOWN here. I think * REQUEST_EOF is more appropriate now. 
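* The check below compares the end of the transfer with the end of the last subdisk (its plexoffset plus its length in sectors); anything beyond that point is reported as REQUEST_EOF rather than treated as an error.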
*/ if (diskend > sd->sectors + sd->plexoffset) /* pointing beyond EOF? */ status = REQUEST_EOF; break; case plex_striped: { while (*diskaddr < diskend) { /* until we get it all sorted out */ if (*diskaddr >= plex->length) /* beyond the end of the plex */ return REQUEST_EOF; /* can't continue */ /* The offset of the start address from the start of the stripe. */ stripeoffset = *diskaddr % (plex->stripesize * plex->subdisks); /* The plex-relative address of the start of the stripe. */ stripebase = *diskaddr - stripeoffset; /* The number of the subdisk in which the start is located. */ sdno = stripeoffset / plex->stripesize; /* The offset from the beginning of the stripe on this subdisk. */ blockoffset = stripeoffset % plex->stripesize; sd = &SD[plex->sdnos[sdno]]; /* the subdisk in question */ rqg = allocrqg(rq, 1); /* space for the request */ if (rqg == NULL) { /* malloc failed */ bp->b_flags |= B_ERROR; bp->b_error = ENOMEM; biodone(bp); return REQUEST_ENOMEM; } rqg->plexno = plexno; rqe = &rqg->rqe[0]; /* point to the element */ rqe->rqg = rqg; rqe->sdoffset = stripebase / plex->subdisks + blockoffset; /* start offset in this subdisk */ rqe->useroffset = *diskaddr - diskstart; /* The offset of the start in the user buffer */ rqe->dataoffset = 0; rqe->datalen = min(diskend - *diskaddr, /* the amount remaining to transfer */ plex->stripesize - blockoffset); /* and the amount left in this stripe */ rqe->groupoffset = 0; /* no groups for striped plexes */ rqe->grouplen = 0; rqe->buflen = rqe->datalen; /* buffer length is data buffer length */ rqe->flags = 0; rqe->sdno = sd->sdno; /* put in the subdisk number */ rqe->driveno = sd->driveno; if (sd->state != sd_up) { /* *now* we find the sd is down */ s = checksdstate(sd, rq, *diskaddr, diskend); /* do we need to change state? */ if (s == REQUEST_DOWN) { /* down? */ if (rq->bp->b_flags & B_READ) /* read request, */ return REQUEST_DEGRADED; /* give up here */ /* * If we're writing, don't give up * because of a bad subdisk. Go through * to the bitter end, but note which * ones we can't access. */ rqe->flags = XFR_BAD_SUBDISK; /* yup */ status = REQUEST_DEGRADED; /* can't do it all */ } } /* * It would seem that having an offset * beyond the end of the subdisk is an * error, but in fact it can happen if the * volume has another plex of different * size. There's a valid question as to why * you would want to do this, but currently * it's allowed. */ if (rqe->sdoffset + rqe->datalen > sd->sectors) { /* ends beyond the end of the subdisk? 
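* If so, the transfer is cut back below to what fits in the subdisk (purely illustrative figures, not from this source: a 16 sector request starting 8 sectors before the end of the subdisk is trimmed to 8 sectors), and the DEBUG_EOFINFO code logs the offsets involved.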
*/ rqe->datalen = sd->sectors - rqe->sdoffset; /* truncate */ #if VINUMDEBUG if (debug & DEBUG_EOFINFO) { /* tell on the request */ log(LOG_DEBUG, "vinum: EOF on plex %s, sd %s offset %x (user offset %x)\n", plex->name, sd->name, (u_int) sd->sectors, bp->b_blkno); log(LOG_DEBUG, "vinum: stripebase %x, stripeoffset %x, blockoffset %x\n", stripebase, stripeoffset, blockoffset); } #endif } if ((rqe->flags & XFR_BAD_SUBDISK) == 0) { /* subdisk OK, */ if (build_rq_buffer(rqe, plex)) { /* build the buffer */ deallocrqg(rqg); bp->b_flags |= B_ERROR; bp->b_error = ENOMEM; biodone(bp); return REQUEST_ENOMEM; /* can't do it */ } } *diskaddr += rqe->datalen; /* look at the remainder */ if ((*diskaddr < diskend) /* didn't finish the request on this stripe */ &&(*diskaddr < plex->length)) { /* and there's more to come */ plex->multiblock++; /* count another one */ if (sdno == plex->subdisks - 1) /* last subdisk, */ plex->multistripe++; /* another stripe as well */ } } } break; /* * RAID5 is complicated enough to have * its own function */ case plex_raid5: status = bre5(rq, plexno, diskaddr, diskend); break; default: log(LOG_ERR, "vinum: invalid plex type %d in bre\n", plex->organization); status = REQUEST_DOWN; /* can't access it */ } return status; } /* * Build up a request structure for reading volumes. * This function is not needed for plex reads, since there's * no recovery if a plex read can't be satisfied. */ enum requeststatus build_read_request(struct request *rq, /* request */ int plexindex) { /* index in the volume's plex table */ struct buf *bp; daddr_t startaddr; /* offset of previous part of transfer */ daddr_t diskaddr; /* offset of current part of transfer */ daddr_t diskend; /* and end offset of transfer */ int plexno; /* plex index in vinum_conf */ struct rqgroup *rqg; /* point to the request we're working on */ struct volume *vol; /* volume in question */ off_t oldstart; /* note where we started */ int recovered = 0; /* set if we recover a read */ enum requeststatus status = REQUEST_OK; int plexmask; /* bit mask of plexes, for recovery */ bp = rq->bp; /* buffer pointer */ diskaddr = bp->b_blkno; /* start offset of transfer */ diskend = diskaddr + (bp->b_bcount / DEV_BSIZE); /* and end offset of transfer */ rqg = &rq->rqg[plexindex]; /* plex request */ vol = &VOL[rq->volplex.volno]; /* point to volume */ while (diskaddr < diskend) { /* build up request components */ startaddr = diskaddr; status = bre(rq, vol->plex[plexindex], &diskaddr, diskend); /* build up a request */ switch (status) { case REQUEST_OK: continue; case REQUEST_RECOVERED: /* * XXX FIXME if we have more than one plex, and we can * satisfy the request from another, don't use the * recovered request, since it's more expensive. */ recovered = 1; break; case REQUEST_ENOMEM: return status; /* * If we get here, our request is not complete. Try * to fill in the missing parts from another plex. * This can happen multiple times in this function, * and we reinitialize the plex mask each time, since * we could have a hole in our plexes. 
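* Each retry below resets diskaddr to the start of the unsatisfied extent and tries the remaining plexes in plexmask; a plex that advances diskaddr counts as a recovered read and resets the status to REQUEST_OK.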
*/ case REQUEST_EOF: case REQUEST_DOWN: /* can't access the plex */ case REQUEST_DEGRADED: /* can't access the plex */ plexmask = ((1 << vol->plexes) - 1) /* all plexes in the volume */ &~(1 << plexindex); /* except for the one we were looking at */ for (plexno = 0; plexno < vol->plexes; plexno++) { if (plexmask == 0) /* no plexes left to try */ return REQUEST_DOWN; /* failed */ diskaddr = startaddr; /* start at the beginning again */ oldstart = startaddr; /* and note where that was */ if (plexmask & (1 << plexno)) { /* we haven't tried this plex yet */ bre(rq, vol->plex[plexno], &diskaddr, diskend); /* try a request */ if (diskaddr > oldstart) { /* we satisfied another part */ recovered = 1; /* we recovered from the problem */ status = REQUEST_OK; /* don't complain about it */ break; } } } } if (recovered) vol->recovered_reads += recovered; /* adjust our recovery count */ } return status; } /* * Build up a request structure for writes. * Return 0 if all subdisks involved in the request are up, 1 if some * subdisks are not up, and -1 if the request is at least partially * outside the bounds of the subdisks. */ enum requeststatus build_write_request(struct request *rq) { /* request */ struct buf *bp; daddr_t diskstart; /* offset of current part of transfer */ daddr_t diskend; /* and end offset of transfer */ int plexno; /* plex index in vinum_conf */ struct volume *vol; /* volume in question */ enum requeststatus status; bp = rq->bp; /* buffer pointer */ vol = &VOL[rq->volplex.volno]; /* point to volume */ diskend = bp->b_blkno + (bp->b_bcount / DEV_BSIZE); /* end offset of transfer */ status = REQUEST_DOWN; /* assume the worst */ for (plexno = 0; plexno < vol->plexes; plexno++) { diskstart = bp->b_blkno; /* start offset of transfer */ /* * Build requests for the plex. * We take the best possible result here (min, * not max): we're happy if we can write at all */ status = min(status, bre(rq, vol->plex[plexno], &diskstart, diskend)); } return status; } /* Fill in the struct buf part of a request element. */ enum requeststatus build_rq_buffer(struct rqelement *rqe, struct plex *plex) { struct sd *sd; /* point to subdisk */ struct volume *vol; struct buf *bp; struct buf *ubp; /* user (high level) buffer header */ vol = &VOL[rqe->rqg->rq->volplex.volno]; sd = &SD[rqe->sdno]; /* point to subdisk */ bp = &rqe->b; ubp = rqe->rqg->rq->bp; /* pointer to user buffer header */ /* Initialize the buf struct */ bp->b_flags = ubp->b_flags & (B_NOCACHE | B_READ | B_ASYNC); /* copy these flags from user bp */ bp->b_flags |= B_CALL; /* inform us when it's done */ BUF_LOCKINIT(bp); /* get a lock for the buffer */ BUF_LOCK(bp, LK_EXCLUSIVE); /* and lock it */ /* * XXX Should we check for reviving plexes here, and * set B_ORDERED if so? 
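* The fields filled in below describe the transfer to the underlying drive: the block number is the offset within the subdisk plus the subdisk's offset on the drive, and the byte count is the element's buffer length converted from sectors with DEV_BSHIFT.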
*/ bp->b_iodone = complete_rqe; /* by calling us here */ bp->b_dev = DRIVE[rqe->driveno].vp->v_rdev; /* drive device */ bp->b_blkno = rqe->sdoffset + sd->driveoffset; /* start address */ bp->b_bcount = rqe->buflen << DEV_BSHIFT; /* number of bytes to transfer */ bp->b_resid = bp->b_bcount; /* and it's still all waiting */ bp->b_bufsize = bp->b_bcount; /* and buffer size */ bp->b_vp = DRIVE[rqe->driveno].vp; /* drive vnode */ bp->b_rcred = FSCRED; /* we have the file system credentials */ bp->b_wcred = FSCRED; /* we have the file system credentials */ if (rqe->flags & XFR_MALLOCED) { /* this operation requires a malloced buffer */ bp->b_data = Malloc(bp->b_bcount); /* get a buffer to put it in */ if (bp->b_data == NULL) { /* failed */ Debugger("XXX"); abortrequest(rqe->rqg->rq, ENOMEM); return REQUEST_ENOMEM; /* no memory */ } } else /* * Point directly to user buffer data. This means * that we don't need to do anything when we have * finished the transfer */ bp->b_data = ubp->b_data + rqe->useroffset * DEV_BSIZE; /* * On a recovery read, we perform an XOR of * all blocks to the user buffer. To make * this work, we first clean out the buffer */ if ((rqe->flags & (XFR_RECOVERY_READ | XFR_BAD_SUBDISK)) == (XFR_RECOVERY_READ | XFR_BAD_SUBDISK)) { /* bad subdisk of a recovery read */ int length = rqe->grouplen << DEV_BSHIFT; /* and count involved */ char *data = (char *) &rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT]; /* destination */ bzero(data, length); /* clean it out */ } return 0; } /* * Abort a request: free resources and complete the * user request with the specified error */ int abortrequest(struct request *rq, int error) { struct buf *bp = rq->bp; /* user buffer */ bp->b_flags |= B_ERROR; bp->b_error = error; freerq(rq); /* free everything we're doing */ biodone(bp); return error; /* and give up */ } /* * Check that our transfer will cover the * complete address space of the user request. * * Return 1 if it can, otherwise 0 */ int check_range_covered(struct request *rq) { /* XXX */ return 1; } /* Perform I/O on a subdisk */ void sdio(struct buf *bp) { int s; /* spl */ struct sd *sd; struct sdbuf *sbp; daddr_t endoffset; struct drive *drive; sd = &SD[Sdno(bp->b_dev)]; /* point to the subdisk */ drive = &DRIVE[sd->driveno]; if (drive->state != drive_up) { /* XXX until we get the states fixed */ if (bp->b_flags & B_WRITE) /* writing, */ set_sd_state(Sdno(bp->b_dev), sd_stale, setstate_force); else set_sd_state(Sdno(bp->b_dev), sd_crashed, setstate_force); bp->b_flags |= B_ERROR; bp->b_error = EIO; biodone(bp); return; } if (sd->state < sd_empty) { /* nothing to talk to, */ bp->b_flags |= B_ERROR; bp->b_error = EIO; biodone(bp); return; } /* Get a buffer */ sbp = (struct sdbuf *) Malloc(sizeof(struct sdbuf)); if (sbp == NULL) { bp->b_flags |= B_ERROR; bp->b_error = ENOMEM; biodone(bp); return; } bzero(sbp, sizeof(struct sdbuf)); /* start with nothing */ /* * XXX Should we check for reviving plexes here, and * set B_ORDERED if so? 
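* The sdbuf built below clones the caller's buffer header, maps the block number onto the drive by adding the subdisk's driveoffset, and records the original bp and subdisk so that sdio_done can complete and account for it.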
*/ sbp->b.b_flags = bp->b_flags | B_CALL; /* inform us when it's done */ sbp->b.b_bufsize = bp->b_bufsize; /* buffer size */ sbp->b.b_bcount = bp->b_bcount; /* number of bytes to transfer */ sbp->b.b_resid = bp->b_resid; /* and amount waiting */ sbp->b.b_dev = DRIVE[sd->driveno].vp->v_rdev; /* device */ sbp->b.b_data = bp->b_data; /* data buffer */ sbp->b.b_blkno = bp->b_blkno + sd->driveoffset; sbp->b.b_iodone = sdio_done; /* come here on completion */ BUF_LOCKINIT(&sbp->b); /* get a lock for the buffer */ BUF_LOCK(&sbp->b, LK_EXCLUSIVE); /* and lock it */ sbp->b.b_vp = DRIVE[sd->driveno].vp; /* vnode */ sbp->bp = bp; /* note the address of the original header */ sbp->sdno = sd->sdno; /* note for statistics */ sbp->driveno = sd->driveno; endoffset = bp->b_blkno + sbp->b.b_bcount / DEV_BSIZE; /* final sector offset */ if (endoffset > sd->sectors) { /* beyond the end */ sbp->b.b_bcount -= (endoffset - sd->sectors) * DEV_BSIZE; /* trim */ if (sbp->b.b_bcount <= 0) { /* nothing to transfer */ bp->b_resid = bp->b_bcount; /* nothing transferred */ /* * XXX Grrr. This doesn't seem to work. Return * an error after all */ bp->b_flags |= B_ERROR; bp->b_error = ENOSPC; biodone(bp); Free(sbp); return; } } if ((sbp->b.b_flags & B_READ) == 0) /* write */ sbp->b.b_vp->v_numoutput++; /* one more output going */ #if VINUMDEBUG if (debug & DEBUG_ADDRESSES) log(LOG_DEBUG, " %s dev %d.%d, sd %d, offset 0x%x, devoffset 0x%x, length %ld\n", sbp->b.b_flags & B_READ ? "Read" : "Write", major(sbp->b.b_dev), minor(sbp->b.b_dev), sbp->sdno, (u_int) (sbp->b.b_blkno - SD[sbp->sdno].driveoffset), (int) sbp->b.b_blkno, sbp->b.b_bcount); /* XXX */ if (debug & DEBUG_NUMOUTPUT) log(LOG_DEBUG, " vinumstart sd %d numoutput %ld\n", sbp->sdno, sbp->b.b_vp->v_numoutput); #endif s = splbio(); (*bdevsw(sbp->b.b_dev)->d_strategy) (&sbp->b); splx(s); } /* * Simplified version of bounds_check_with_label * Determine the size of the transfer, and make sure it is * within the boundaries of the partition. Adjust transfer * if needed, and signal errors or early completion. * * Volumes are simpler than disk slices: they only contain * one component (though we call them a, b and c to make * system utilities happy), and they always take up the * complete space of the "partition". * * I'm still not happy with this: why should the label be * protected? If it weren't so damned difficult to write * one in the first place (because it's protected), it wouldn't * be a problem. */ int vinum_bounds_check(struct buf *bp, struct volume *vol) { int maxsize = vol->size; /* size of the partition (sectors) */ int size = (bp->b_bcount + DEV_BSIZE - 1) >> DEV_BSHIFT; /* size of this request (sectors) */ /* Would this transfer overwrite the disk label? */ if (bp->b_blkno <= LABELSECTOR /* starts before or at the label */ #if LABELSECTOR != 0 && bp->b_blkno + size > LABELSECTOR /* and finishes after */ #endif && (!(vol->flags & VF_RAW)) /* and it's not raw */ &&major(bp->b_dev) == BDEV_MAJOR /* and it's the block device */ && (bp->b_flags & B_READ) == 0 /* and it's a write */ && !(vol->flags & (VF_WLABEL | VF_LABELLING))) { /* and we're not allowed to write the label */ bp->b_error = EROFS; /* read-only */ bp->b_flags |= B_ERROR; return -1; } if (size == 0) /* no transfer specified, */ return 0; /* treat as EOF */ /* beyond partition? 
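* A transfer that runs past the end of the volume is truncated to fit; one that starts exactly at the end completes immediately with the full count left in b_resid (EOF), and one that starts beyond the end fails with EINVAL.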
*/ if (bp->b_blkno < 0 /* negative start */ || bp->b_blkno + size > maxsize) { /* or goes beyond the end of the partition */ /* if exactly at end of disk, return an EOF */ if (bp->b_blkno == maxsize) { bp->b_resid = bp->b_bcount; return 0; } /* or truncate if part of it fits */ size = maxsize - bp->b_blkno; if (size <= 0) { /* nothing to transfer */ bp->b_error = EINVAL; bp->b_flags |= B_ERROR; return -1; } bp->b_bcount = size << DEV_BSHIFT; } bp->b_pblkno = bp->b_blkno; return 1; } /* * Allocate a request group and hook * it in in the list for rq */ struct rqgroup * allocrqg(struct request *rq, int elements) { struct rqgroup *rqg; /* the one we're going to allocate */ int size = sizeof(struct rqgroup) + elements * sizeof(struct rqelement); rqg = (struct rqgroup *) Malloc(size); if (rqg != NULL) { /* malloc OK, */ if (rq->rqg) /* we already have requests */ rq->lrqg->next = rqg; /* hang it off the end */ else /* first request */ rq->rqg = rqg; /* at the start */ rq->lrqg = rqg; /* this one is the last in the list */ bzero(rqg, size); /* no old junk */ rqg->rq = rq; /* point back to the parent request */ rqg->count = elements; /* number of requests in the group */ } return rqg; } /* * Deallocate a request group out of a chain. We do * this by linear search: the chain is short, this * almost never happens, and currently it can only * happen to the first member of the chain. */ void deallocrqg(struct rqgroup *rqg) { struct rqgroup *rqgc = rqg->rq->rqg; /* point to the request chain */ if (rqgc == rqg) /* we're first in line */ rqg->rq->rqg = rqg->next; /* unhook ourselves */ else { while ((rqgc->next != NULL) /* find the group */ &&(rqgc->next != rqg)) rqgc = rqgc->next; if (rqgc->next == NULL) log(LOG_ERR, "vinum deallocrqg: rqg %p not found in request %p\n", rqg->rq, rqg); else rqgc->next = rqg->next; /* make the chain jump over us */ } Free(rqg); } Index: head/sys/dev/vn/vn.c =================================================================== --- head/sys/dev/vn/vn.c (revision 49534) +++ head/sys/dev/vn/vn.c (revision 49535) @@ -1,758 +1,757 @@ /* * Copyright (c) 1988 University of Utah. * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: Utah Hdr: vn.c 1.13 94/04/02 * * from: @(#)vn.c 8.6 (Berkeley) 4/1/94 - * $Id: vn.c,v 1.80 1999/05/30 16:51:55 phk Exp $ + * $Id: vn.c,v 1.81 1999/07/20 09:47:33 phk Exp $ */ /* * Vnode disk driver. * * Block/character interface to a vnode. Allows one to treat a file * as a disk (e.g. build a filesystem in it, mount it, etc.). * * NOTE 1: This uses the VOP_BMAP/VOP_STRATEGY interface to the vnode * instead of a simple VOP_RDWR. We do this to avoid distorting the * local buffer cache. * * NOTE 2: There is a security issue involved with this driver. * Once mounted all access to the contents of the "mapped" file via * the special file is controlled by the permissions on the special * file, the protection of the mapped file is ignored (effectively, * by using root credentials in all transactions). * * NOTE 3: Doesn't interact with leases, should it? */ #include "vn.h" #if NVN > 0 /* default is to have 8 VN's */ #if NVN < 8 #undef NVN #define NVN 8 #endif #include "opt_devfs.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DEVFS #include #endif /*DEVFS*/ -#include #include #include #include #include #include #include #include #include #include #include static d_ioctl_t vnioctl; static d_open_t vnopen; static d_close_t vnclose; static d_dump_t vndump; static d_psize_t vnsize; static d_strategy_t vnstrategy; #define CDEV_MAJOR 43 #define BDEV_MAJOR 15 /* * cdevsw * D_DISK we want to look like a disk * ( D_NOCLUSTERRW removed - clustering should be ok ) * D_CANFREE We support B_FREEBUF */ static struct cdevsw vn_cdevsw = { /* open */ vnopen, /* close */ vnclose, /* read */ physread, /* write */ physwrite, /* ioctl */ vnioctl, /* stop */ nostop, /* reset */ noreset, /* devtotty */ nodevtotty, /* poll */ nopoll, /* mmap */ nommap, /* strategy */ vnstrategy, /* name */ "vn", /* parms */ noparms, /* maj */ CDEV_MAJOR, /* dump */ vndump, /* psize */ vnsize, /* flags */ D_DISK|D_CANFREE, /* maxio */ 0, /* bmaj */ BDEV_MAJOR }; #define vnunit(dev) dkunit(dev) #define getvnbuf() \ ((struct buf *)malloc(sizeof(struct buf), M_DEVBUF, M_WAITOK)) #define putvnbuf(bp) \ free((caddr_t)(bp), M_DEVBUF) struct vn_softc { int sc_flags; /* flags */ int sc_size; /* size of vn, sc_secsize scale */ int sc_secsize; /* sector size */ struct diskslices *sc_slices; struct vnode *sc_vp; /* vnode if not NULL */ vm_object_t sc_object; /* backing object if not NULL */ struct ucred *sc_cred; /* credentials */ int sc_maxactive; /* max # of active requests */ struct buf sc_tab; /* transfer queue */ u_long sc_options; /* options */ #ifdef DEVFS void *r_devfs_token; void *devfs_token; #endif }; /* sc_flags */ #define VNF_INITED 0x01 static struct vn_softc *vn_softc[NVN]; static u_long vn_options; #define IFOPT(vn,opt) if (((vn)->sc_options|vn_options) & (opt)) #if 0 static void vniodone (struct buf *bp); #endif static int vnsetcred (struct vn_softc *vn, struct ucred *cred); static void vnclear (struct 
vn_softc *vn); static int vn_modevent (module_t, int, void *); static int vniocattach_file (struct vn_softc *, struct vn_ioctl *, dev_t dev, int flag, struct proc *p); static int vniocattach_swap (struct vn_softc *, struct vn_ioctl *, dev_t dev, int flag, struct proc *p); static int vnclose(dev_t dev, int flags, int mode, struct proc *p) { struct vn_softc *vn = vn_softc[vnunit(dev)]; IFOPT(vn, VN_LABELS) if (vn->sc_slices != NULL) dsclose(dev, mode, vn->sc_slices); return (0); } static int vnopen(dev_t dev, int flags, int mode, struct proc *p) { int unit = vnunit(dev); struct vn_softc *vn; if (unit >= NVN) { if (vn_options & VN_FOLLOW) printf("vnopen(0x%lx, 0x%x, 0x%x, %p)\n", (u_long)dev, flags, mode, (void *)p); return(ENOENT); } vn = vn_softc[unit]; if (!vn) { vn = malloc(sizeof *vn, M_DEVBUF, M_WAITOK); if (!vn) return (ENOMEM); bzero(vn, sizeof *vn); vn_softc[unit] = vn; } IFOPT(vn, VN_FOLLOW) printf("vnopen(0x%lx, 0x%x, 0x%x, %p)\n", (u_long)dev, flags, mode, (void *)p); IFOPT(vn, VN_LABELS) { if (vn->sc_flags & VNF_INITED) { struct disklabel label; /* Build label for whole disk. */ bzero(&label, sizeof label); label.d_secsize = vn->sc_secsize; label.d_nsectors = 32; label.d_ntracks = 64 / (vn->sc_secsize / DEV_BSIZE); label.d_secpercyl = label.d_nsectors * label.d_ntracks; label.d_ncylinders = vn->sc_size / label.d_secpercyl; label.d_secperunit = vn->sc_size; label.d_partitions[RAW_PART].p_size = vn->sc_size; return (dsopen("vn", dev, mode, 0, &vn->sc_slices, &label, vnstrategy, (ds_setgeom_t *)NULL, &vn_cdevsw)); } if (dkslice(dev) != WHOLE_DISK_SLICE || dkpart(dev) != RAW_PART || mode != S_IFCHR) return (ENXIO); } return(0); } /* * vnstrategy: * * Run strategy routine for VN device. We use VOP_READ/VOP_WRITE calls * for vnode-backed vn's, and the new vm_pager_strategy() call for * vm_object-backed vn's. * * Currently B_ASYNC is only partially handled - for OBJT_SWAP I/O only. * * NOTE: bp->b_blkno is DEV_BSIZE'd. We must generate bp->b_pblkno for * our uio or vn_pager_strategy() call that is vn->sc_secsize'd */ static void vnstrategy(struct buf *bp) { int unit = vnunit(bp->b_dev); struct vn_softc *vn = vn_softc[unit]; int error; int isvplocked = 0; long sz; struct uio auio; struct iovec aiov; IFOPT(vn, VN_DEBUG) printf("vnstrategy(%p): unit %d\n", bp, unit); if ((vn->sc_flags & VNF_INITED) == 0) { bp->b_error = ENXIO; bp->b_flags |= B_ERROR; biodone(bp); return; } bp->b_resid = bp->b_bcount; IFOPT(vn, VN_LABELS) { if (vn->sc_slices != NULL && dscheck(bp, vn->sc_slices) <= 0) { bp->b_flags |= B_INVAL; biodone(bp); return; } } else { int pbn; pbn = bp->b_blkno * (vn->sc_secsize / DEV_BSIZE); sz = howmany(bp->b_bcount, vn->sc_secsize); if (pbn < 0 || pbn + sz > vn->sc_size) { if (pbn != vn->sc_size) { bp->b_error = EINVAL; bp->b_flags |= B_ERROR | B_INVAL; } biodone(bp); return; } bp->b_pblkno = pbn; } if (vn->sc_vp && (bp->b_flags & B_FREEBUF)) { /* * Not handled for vnode-backed element yet. 
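* Such a B_FREEBUF request is simply completed here without touching the vnode; only the OBJT_SWAP path below, which goes through vm_pager_strategy(), acts on freebuf requests.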
*/ biodone(bp); } else if (vn->sc_vp) { /* * VNODE I/O */ aiov.iov_base = bp->b_data; aiov.iov_len = bp->b_bcount; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = (vm_ooffset_t)bp->b_pblkno * vn->sc_secsize; auio.uio_segflg = UIO_SYSSPACE; if( bp->b_flags & B_READ) auio.uio_rw = UIO_READ; else auio.uio_rw = UIO_WRITE; auio.uio_resid = bp->b_bcount; auio.uio_procp = curproc; if (!VOP_ISLOCKED(vn->sc_vp)) { isvplocked = 1; vn_lock(vn->sc_vp, LK_EXCLUSIVE | LK_RETRY, curproc); } if( bp->b_flags & B_READ) error = VOP_READ(vn->sc_vp, &auio, 0, vn->sc_cred); else error = VOP_WRITE(vn->sc_vp, &auio, 0, vn->sc_cred); if (isvplocked) { VOP_UNLOCK(vn->sc_vp, 0, curproc); isvplocked = 0; } bp->b_resid = auio.uio_resid; if( error ) { bp->b_error = error; bp->b_flags |= B_ERROR; } biodone(bp); } else if (vn->sc_object) { /* * OBJT_SWAP I/O * * ( handles read, write, freebuf ) */ vm_pager_strategy(vn->sc_object, bp); } else { bp->b_flags |= B_ERROR; bp->b_error = EINVAL; biodone(bp); } } #if 0 void vniodone( struct buf *bp) { bp->b_flags |= B_DONE; wakeup((caddr_t) bp); } #endif /* ARGSUSED */ static int vnioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p) { struct vn_softc *vn = vn_softc[vnunit(dev)]; struct vn_ioctl *vio; int error; u_long *f; IFOPT(vn,VN_FOLLOW) printf("vnioctl(0x%lx, 0x%lx, %p, 0x%x, %p): unit %d\n", (u_long)dev, cmd, (void *)data, flag, (void *)p, vnunit(dev)); switch (cmd) { case VNIOCATTACH: case VNIOCDETACH: case VNIOCGSET: case VNIOCGCLEAR: case VNIOCUSET: case VNIOCUCLEAR: goto vn_specific; } IFOPT(vn,VN_LABELS) { if (vn->sc_slices != NULL) { error = dsioctl("vn", dev, cmd, data, flag, &vn->sc_slices, vnstrategy, (ds_setgeom_t *)NULL); if (error != ENOIOCTL) return (error); } if (dkslice(dev) != WHOLE_DISK_SLICE || dkpart(dev) != RAW_PART) return (ENOTTY); } vn_specific: error = suser(p); if (error) return (error); vio = (struct vn_ioctl *)data; f = (u_long*)data; switch (cmd) { case VNIOCATTACH: if (vn->sc_flags & VNF_INITED) return(EBUSY); if (vio->vn_file == NULL) error = vniocattach_swap(vn, vio, dev, flag, p); else error = vniocattach_file(vn, vio, dev, flag, p); break; case VNIOCDETACH: if ((vn->sc_flags & VNF_INITED) == 0) return(ENXIO); /* * XXX handle i/o in progress. Return EBUSY, or wait, or * flush the i/o. * XXX handle multiple opens of the device. Return EBUSY, * or revoke the fd's. * How are these problems handled for removable and failing * hardware devices? */ vnclear(vn); IFOPT(vn, VN_FOLLOW) printf("vnioctl: CLRed\n"); break; case VNIOCGSET: vn_options |= *f; *f = vn_options; break; case VNIOCGCLEAR: vn_options &= ~(*f); *f = vn_options; break; case VNIOCUSET: vn->sc_options |= *f; *f = vn->sc_options; break; case VNIOCUCLEAR: vn->sc_options &= ~(*f); *f = vn->sc_options; break; default: error = ENOTTY; break; } return(error); } /* * vniocattach_file: * * Attach a file to a VN partition. Return the size in the vn_size * field. */ static int vniocattach_file(vn, vio, dev, flag, p) struct vn_softc *vn; struct vn_ioctl *vio; dev_t dev; int flag; struct proc *p; { struct vattr vattr; struct nameidata nd; int error; /* * Always open for read and write. * This is probably bogus, but it lets vn_open() * weed out directories, sockets, etc. so we don't * have to worry about them. 
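* Once the file is open, its size is taken from VOP_GETATTR() and truncated down to a whole number of DEV_BSIZE sectors to give the size of the vn device.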
*/ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, vio->vn_file, p); error = vn_open(&nd, FREAD|FWRITE, 0); if (error) return(error); error = VOP_GETATTR(nd.ni_vp, &vattr, p->p_ucred, p); if (error) { VOP_UNLOCK(nd.ni_vp, 0, p); (void) vn_close(nd.ni_vp, FREAD|FWRITE, p->p_ucred, p); return(error); } VOP_UNLOCK(nd.ni_vp, 0, p); vn->sc_secsize = DEV_BSIZE; vn->sc_vp = nd.ni_vp; vn->sc_size = vattr.va_size / vn->sc_secsize; /* note truncation */ error = vnsetcred(vn, p->p_ucred); if (error) { (void) vn_close(nd.ni_vp, FREAD|FWRITE, p->p_ucred, p); return(error); } if (dev->si_bsize_phys < vn->sc_secsize) dev->si_bsize_phys = vn->sc_secsize; if (dev->si_bsize_best < vn->sc_secsize) dev->si_bsize_best = vn->sc_secsize; vn->sc_flags |= VNF_INITED; IFOPT(vn, VN_LABELS) { /* * Reopen so that `ds' knows which devices are open. * If this is the first VNIOCSET, then we've * guaranteed that the device is the cdev and that * no other slices or labels are open. Otherwise, * we rely on VNIOCCLR not being abused. */ error = vnopen(dev, flag, S_IFCHR, p); if (error) vnclear(vn); } IFOPT(vn, VN_FOLLOW) printf("vnioctl: SET vp %p size %x blks\n", vn->sc_vp, vn->sc_size); return(0); } /* * vniocattach_swap: * * Attach swap backing store to a VN partition of the size specified * in vn_size. */ static int vniocattach_swap(vn, vio, dev, flag, p) struct vn_softc *vn; struct vn_ioctl *vio; dev_t dev; int flag; struct proc *p; { int error; /* * Range check. Disallow negative sizes or any size less than the * size of a page. Then round to a page. */ if (vio->vn_size <= 0) return(EDOM); /* * Allocate an OBJT_SWAP object. * * sc_secsize is PAGE_SIZE'd * * vio->vn_size is in PAGE_SIZE'd chunks. * sc_size must be in PAGE_SIZE'd chunks. * Note the truncation. */ vn->sc_secsize = PAGE_SIZE; vn->sc_size = vio->vn_size; vn->sc_object = vm_pager_allocate(OBJT_SWAP, NULL, vn->sc_secsize * (vm_ooffset_t)vio->vn_size, VM_PROT_DEFAULT, 0); vn->sc_flags |= VNF_INITED; error = vnsetcred(vn, p->p_ucred); if (error == 0) { IFOPT(vn, VN_LABELS) { /* * Reopen so that `ds' knows which devices are open. * If this is the first VNIOCSET, then we've * guaranteed that the device is the cdev and that * no other slices or labels are open. Otherwise, * we rely on VNIOCCLR not being abused. */ error = vnopen(dev, flag, S_IFCHR, p); } } if (error == 0) { IFOPT(vn, VN_FOLLOW) { printf("vnioctl: SET vp %p size %x\n", vn->sc_vp, vn->sc_size); } } if (error) vnclear(vn); return(error); } /* * Duplicate the current process's credentials. Since we are called only * as the result of a SET ioctl and only root can do that, any future access * to this "disk" is essentially as root. Note that credentials may change * if some other uid can write directly to the mapped file (NFS). */ int vnsetcred(struct vn_softc *vn, struct ucred *cred) { struct uio auio; struct iovec aiov; char *tmpbuf; int error = 0; /* * Set credits in our softc */ if (vn->sc_cred) crfree(vn->sc_cred); vn->sc_cred = crdup(cred); /* * Horrible kludge to establish credentials for NFS XXX. 
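* A single throwaway read of the first sector is issued with the new credentials so that the NFS layer sees them before any real I/O is done.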
*/ if (vn->sc_vp) { tmpbuf = malloc(vn->sc_secsize, M_TEMP, M_WAITOK); aiov.iov_base = tmpbuf; aiov.iov_len = vn->sc_secsize; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_resid = aiov.iov_len; vn_lock(vn->sc_vp, LK_EXCLUSIVE | LK_RETRY, curproc); error = VOP_READ(vn->sc_vp, &auio, 0, vn->sc_cred); VOP_UNLOCK(vn->sc_vp, 0, curproc); free(tmpbuf, M_TEMP); } return (error); } void vnclear(struct vn_softc *vn) { struct proc *p = curproc; /* XXX */ IFOPT(vn, VN_FOLLOW) printf("vnclear(%p): vp=%p\n", vn, vn->sc_vp); if (vn->sc_slices != NULL) dsgone(&vn->sc_slices); vn->sc_flags &= ~VNF_INITED; if (vn->sc_vp != NULL) { (void)vn_close(vn->sc_vp, FREAD|FWRITE, vn->sc_cred, p); vn->sc_vp = NULL; } if (vn->sc_cred) { crfree(vn->sc_cred); vn->sc_cred = NULL; } if (vn->sc_object != NULL) { vm_pager_deallocate(vn->sc_object); vn->sc_object = NULL; } vn->sc_size = 0; } static int vnsize(dev_t dev) { int unit = vnunit(dev); struct vn_softc *vn; if (unit < 0 || unit >= NVN) return(-1); vn = vn_softc[unit]; if ((vn->sc_flags & VNF_INITED) == 0) return(-1); return(vn->sc_size); } static int vndump(dev_t dev) { return (ENODEV); } static int vn_modevent(module_t mod, int type, void *data) { int unit; #ifdef DEVFS struct vn_softc *vn; #endif switch (type) { case MOD_LOAD: #ifdef DEVFS for (unit = 0; unit < NVN; unit++) { vn = malloc(sizeof *vn, M_DEVBUF, M_WAITOK); if (!vn) continue; /* "oops" */ bzero(vn, sizeof *vn); vn_softc[unit] = vn; vn->r_devfs_token = devfs_add_devswf(&vn_cdevsw, dkmakeminor(unit, 0, 0), DV_CHR, UID_ROOT, GID_OPERATOR, 0640, "rvn%d", unit); vn->devfs_token = devfs_add_devswf(&vn_cdevsw, dkmakeminor(unit, 0, 0), DV_BLK, UID_ROOT, GID_OPERATOR, 0640, "vn%d", unit); } #endif break; case MOD_UNLOAD: #ifdef DEVFS for (unit = 0; unit < NVN; unit++) { vn = vn_softc[unit]; if (vn->r_devfs_token) { devfs_remove_dev(vn->r_devfs_token); vn->r_devfs_token = 0; } if (vn->devfs_token) { devfs_remove_dev(vn->devfs_token); vn->devfs_token = 0; } } #endif /* fall through */ case MOD_SHUTDOWN: for (unit = 0; unit < NVN; unit++) if (vn_softc[unit] && vn_softc[unit]->sc_flags & VNF_INITED) vnclear(vn_softc[unit]); break; default: break; } return 0; } DEV_MODULE(vn, CDEV_MAJOR, BDEV_MAJOR, vn_cdevsw, vn_modevent, 0); #endif Index: head/sys/dev/xe/if_xe.c =================================================================== --- head/sys/dev/xe/if_xe.c (revision 49534) +++ head/sys/dev/xe/if_xe.c (revision 49535) @@ -1,2507 +1,2507 @@ /*- * Copyright (c) 1998, 1999 Scott Mitchell * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $Id: if_xe.c,v 1.20 1999/06/13 19:17:40 scott Exp $ * $FreeBSD$ */ /* * Portions of this software were derived from Werner Koch's xirc2ps driver * for Linux under the terms of the following license (from v1.30 of the * xirc2ps driver): * * Copyright (c) 1997 by Werner Koch (dd9jn) * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, and the entire permission notice in its entirety, * including the disclaimer of warranties. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote * products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED * OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * FreeBSD device driver for Xircom CreditCard PCMCIA Ethernet adapters. The * following cards are currently known to work with the driver: * Xircom CreditCard 10/100 (CE3) * Xircom CreditCard Ethernet + Modem 28 (CEM28) * Xircom CreditCard Ethernet 10/100 + Modem 56 (CEM56) * Xircom RealPort Ethernet 10 * Xircom RealPort Ethernet 10/100 * Xircom RealPort Ethernet 10/100 + Modem 56 (REM56, REM56G) * Intel EtherExpress Pro/100 PC Card Mobile Adapter 16 (Pro/100 M16A) * Compaq Netelligent 10/100 PC Card (CPQ-10/100) * * Some other cards *should* work, but support for them is either broken or in * an unknown state at the moment. I'm always interested in hearing from * people who own any of these cards: * Xircom CreditCard 10Base-T (PS-CE2-10) * Xircom CreditCard Ethernet + ModemII (CEM2) * Xircom CEM28 and CEM33 Ethernet/Modem cards (may be variants of CEM2?) * * Thanks to all who assisted with the development and testing of the driver, * especially: Werner Koch, Duke Kamstra, Duncan Barclay, Jason George, Dru * Nelson, Mike Kephart, Bill Rainey and Douglas Rand. Apologies if I've left * out anyone who deserves a mention here. * * Special thanks to Ade Lovett for both hosting the mailing list and doing * the CEM56/REM56 support code; and the FreeBSD UK Users' Group for hosting * the web pages. 
* * Contact points: * * Driver web page: http://ukug.uk.freebsd.org/~scott/xe_drv/ * * Mailing list: http://www.lovett.com/lists/freebsd-xircom/ * or send "subscribe freebsd-xircom" to * * Author email: */ #ifndef XE_DEBUG #define XE_DEBUG 1 /* Increase for more voluminous output! */ #endif #include "xe.h" #include "card.h" #include "apm.h" #include "bpf.h" #if NXE > 0 #if NCARD > 0 #include #include -#include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #if NBPF > 0 #include #endif /* NBPF > 0 */ #include #include #include #include #if NAPM > 0 #include #endif /* NAPM > 0 */ #include #include #include #include /* * One of these structures per allocated device */ struct xe_softc { struct arpcom arpcom; struct ifmedia ifmedia; struct ifmib_iso_8802_3 mibdata; struct callout_handle chand; struct isa_device *dev; struct pccard_devinfo *crd; struct ifnet *ifp; struct ifmedia *ifm; char *card_type; /* Card model name */ char *vendor; /* Card manufacturer */ int unit; /* Unit number, from dev->id_unit */ int srev; /* Silicon revision */ int tx_queued; /* Packets currently waiting to transmit */ int tx_tpr; /* Last value of TPR reg on card */ int tx_collisions; /* Collisions since last successful send */ int tx_timeouts; /* Count of transmit timeouts */ int autoneg_status; /* Autonegotiation progress state */ int media; /* Private media word */ u_char version; /* Bonding Version register from card */ u_char modem; /* 1 = Card has a modem */ u_char ce2; /* 1 = Card has CE2 silicon */ u_char mohawk; /* 1 = Card has Mohawk (CE3) silicon */ u_char dingo; /* 1 = Card has Dingo (CEM56) silicon */ u_char phy_ok; /* 1 = MII-compliant PHY found and initialised */ u_char gone; /* 1 = Card bailed out */ #if NAPM > 0 struct apmhook suspend_hook; struct apmhook resume_hook; #endif /* NAPM > 0 */ }; static struct xe_softc *sca[MAXSLOT]; /* * MII command structure */ struct xe_mii_frame { u_int8_t mii_stdelim; u_int8_t mii_opcode; u_int8_t mii_phyaddr; u_int8_t mii_regaddr; u_int8_t mii_turnaround; u_int16_t mii_data; }; /* * For accessing card registers */ #define XE_INB(r) inb(scp->dev->id_iobase+(r)) #define XE_INW(r) inw(scp->dev->id_iobase+(r)) #define XE_OUTB(r, b) outb(scp->dev->id_iobase+(r), (b)) #define XE_OUTW(r, w) outw(scp->dev->id_iobase+(r), (w)) #define XE_SELECT_PAGE(p) XE_OUTB(XE_PR, (p)) /* * Horrid stuff for accessing CIS tuples */ #define CARD_MAJOR 50 #define CISTPL_BUFSIZE 512 #define CISTPL_TYPE(tpl) tpl[0] #define CISTPL_LEN(tpl) tpl[2] #define CISTPL_DATA(tpl,pos) tpl[4 + ((pos)<<1)] /* * Media autonegotiation progress constants */ #define XE_AUTONEG_NONE 0 /* No autonegotiation in progress */ #define XE_AUTONEG_WAITING 1 /* Waiting for transmitter to go idle */ #define XE_AUTONEG_STARTED 2 /* Waiting for autonegotiation to complete */ #define XE_AUTONEG_100TX 3 /* Trying to force 100baseTX link */ #define XE_AUTONEG_FAIL 4 /* Autonegotiation failed */ /* * Prototypes start here */ static int xe_probe (struct isa_device *dev); static int xe_card_init (struct pccard_devinfo *devi); static int xe_attach (struct isa_device *dev); static void xe_init (void *xscp); static void xe_start (struct ifnet *ifp); static int xe_ioctl (struct ifnet *ifp, u_long command, caddr_t data); static int xe_card_intr (struct pccard_devinfo *devi); static void xe_watchdog (struct ifnet *ifp); static int xe_media_change (struct ifnet *ifp); static void xe_media_status (struct ifnet *ifp, struct ifmediareq *mrp); 
static timeout_t xe_setmedia; static void xe_hard_reset (struct xe_softc *scp); static void xe_soft_reset (struct xe_softc *scp); static void xe_stop (struct xe_softc *scp); static void xe_enable_intr (struct xe_softc *scp); static void xe_disable_intr (struct xe_softc *scp); static void xe_setmulti (struct xe_softc *scp); static void xe_setaddrs (struct xe_softc *scp); static int xe_pio_write_packet (struct xe_softc *scp, struct mbuf *mbp); static void xe_card_unload (struct pccard_devinfo *devi); static u_int32_t xe_compute_crc (u_int8_t *data, int len); static int xe_compute_hashbit (u_int32_t crc); /* * MII functions */ static void xe_mii_sync (struct xe_softc *scp); static int xe_mii_init (struct xe_softc *scp); static void xe_mii_send (struct xe_softc *scp, u_int32_t bits, int cnt); static int xe_mii_readreg (struct xe_softc *scp, struct xe_mii_frame *frame); static int xe_mii_writereg (struct xe_softc *scp, struct xe_mii_frame *frame); static u_int16_t xe_phy_readreg (struct xe_softc *scp, u_int16_t reg); static void xe_phy_writereg (struct xe_softc *scp, u_int16_t reg, u_int16_t data); /* * Debug functions */ #ifdef XE_DEBUG #define XE_REG_DUMP(scp) xe_reg_dump((scp)) #define XE_MII_DUMP(scp) xe_mii_dump((scp)) static void xe_reg_dump (struct xe_softc *scp); static void xe_mii_dump (struct xe_softc *scp); #else #define XE_REG_DUMP(scp) #define XE_MII_DUMP(scp) #endif #if NAPM > 0 /* * APM hook functions */ static int xe_suspend (void *xunit); static int xe_resume (void *xunit); #endif /* NAPM > 0 */ /* * PCMCIA driver hooks */ #ifdef PCCARD_MODULE PCCARD_MODULE(xe, xe_card_init, xe_card_unload, xe_card_intr, 0, net_imask); #else static struct pccard_device xe_info = { /* For pre 3.1-STABLE code */ "xe", xe_card_init, xe_card_unload, xe_card_intr, 0, &net_imask }; DATA_SET(pccarddrv_set, xe_info); #endif /* PCCARD_MODULE */ /* * ISA driver hooks. I'd like to do without these but the kernel config stuff * seems to require them. */ struct isa_driver xedriver = { xe_probe, xe_attach, "xe" }; /* * ISA probe routine. * All of the supported devices are PCMCIA cards. I have no idea if it's even * possible to successfully probe/attach these at boot time (pccardd normally * does a lot of setup work) so I don't even bother trying. 
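* The ISA probe below therefore only clears the softc table and returns; the real work of identifying and attaching a card is done from xe_card_init() when the slot manager calls it.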
*/ static int xe_probe (struct isa_device *dev) { #ifdef XE_DEBUG printf("xe%d: probe\n", dev->id_unit); #endif bzero(sca, MAXSLOT * sizeof(sca[0])); return 0; } /* * Two routines to read from/write to the attribute memory * the write portion is used only for fixing up the RealPort cards, * the reader portion was needed for debugging info, and duplicated some * code in xe_card_init(), so it appears here instead with suitable * modifications to xe_card_init() * -aDe Lovett */ static int xe_memwrite(struct pccard_devinfo *devi, off_t offset, u_char byte) { struct iovec iov; struct uio uios; iov.iov_base = &byte; iov.iov_len = sizeof(byte); uios.uio_iov = &iov; uios.uio_iovcnt = 1; uios.uio_offset = offset; uios.uio_resid = sizeof(byte); uios.uio_segflg = UIO_SYSSPACE; uios.uio_rw = UIO_WRITE; uios.uio_procp = 0; #if 0 /* THIS IS BOGUS */ return cdevsw[CARD_MAJOR]->d_write(makedev(CARD_MAJOR, devi->slt->slotnum), &uios, 0); #else return (-1); #endif } static int xe_memread(struct pccard_devinfo *devi, off_t offset, u_char *buf, int size) { struct iovec iov; struct uio uios; iov.iov_base = buf; iov.iov_len = size; uios.uio_iov = &iov; uios.uio_iovcnt = 1; uios.uio_offset = offset; uios.uio_resid = size; uios.uio_segflg = UIO_SYSSPACE; uios.uio_rw = UIO_READ; uios.uio_procp = 0; #if 0 /* THIS IS BOGUS */ return cdevsw[CARD_MAJOR]->d_read(makedev(CARD_MAJOR, devi->slt->slotnum), &uios, 0); #else return (-1); #endif } /* * Hacking for RealPort cards */ static int xe_cem56fix(struct xe_softc *scp) { struct pccard_devinfo *devi; struct slot *slt; struct slot_ctrl *ctrl; int ioport, fail; /* initialise a few variables */ devi = scp->crd; slt = devi->slt; ctrl = slt->ctrl; /* allocate a new I/O slot for the ethernet */ /* XXX: ctrl->mapio() always appears to return 0 (success), so * this may cause problems if another device is listening * on 0x300 already. In this case, you should choose a * known free I/O port address in the kernel config line * for the driver. It will be picked up here and used * instead of the autodetected value. */ slt->io[1].window = 1; slt->io[1].flags = IODF_WS|IODF_16BIT|IODF_ZEROWS|IODF_ACTIVE; slt->io[1].size = 0x10; #ifdef XE_IOBASE printf( "xe%d: user requested ioport 0x%x\n", scp->unit, XE_IOBASE ); ioport = XE_IOBASE; slt->io[1].start = ioport; fail = ctrl->mapio(slt, 1); #else for (ioport = 0x300; ioport < 0x400; ioport += 0x10) { slt->io[1].start = ioport; if ((fail = ctrl->mapio( slt, 1 )) == 0) break; } #endif /* did we find one? */ if (fail) { printf( "xe%d: xe_cem56fix: no free address space\n", scp->unit ); return -1; } /* munge the id_iobase entry for use by the rest of the driver */ #if XE_DEBUG > 1 printf( "xe%d: using 0x%x for RealPort ethernet\n", scp->unit, ioport ); #endif scp->dev->id_iobase = ioport; scp->dev->id_alive = 0x10; /* magic to set up the ethernet */ xe_memwrite( devi, DINGO_ECOR, DINGO_ECOR_IRQ_LEVEL|DINGO_ECOR_INT_ENABLE| DINGO_ECOR_IOB_ENABLE|DINGO_ECOR_ETH_ENABLE ); xe_memwrite( devi, DINGO_EBAR0, ioport & 0xff ); xe_memwrite( devi, DINGO_EBAR1, (ioport >> 8) & 0xff ); xe_memwrite( devi, DINGO_DCOR0, DINGO_DCOR0_SF_INT ); xe_memwrite( devi, DINGO_DCOR1, DINGO_DCOR1_INT_LEVEL|DINGO_DCOR1_EEDIO ); xe_memwrite( devi, DINGO_DCOR2, 0x00 ); xe_memwrite( devi, DINGO_DCOR3, 0x00 ); xe_memwrite( devi, DINGO_DCOR4, 0x00 ); /* success! */ return 0; } /* * PCMCIA probe routine. * Probe and identify the device. Called by the slot manager when the card is * inserted or the machine wakes up from suspend mode. 
Assmes that the slot * structure has been initialised already. */ static int xe_card_init(struct pccard_devinfo *devi) { struct xe_softc *scp; struct isa_device *dev; u_char buf[CISTPL_BUFSIZE]; u_char ver_str[CISTPL_BUFSIZE>>1]; off_t offs; int unit, success, rc, i; unit = devi->isahd.id_unit; scp = sca[unit]; dev = &devi->isahd; success = 0; #ifdef XE_DEBUG printf("xe: Probing for unit %d\n", unit); #endif /* Check that unit number is OK */ if (unit > MAXSLOT) { printf("xe%d: bad unit\n", unit); return (ENODEV); } /* Don't attach an active device */ if (scp && !scp->gone) { printf("xe%d: already attached\n", unit); return (EBUSY); } /* Allocate per-instance storage */ if (!scp) { if ((scp = malloc(sizeof(*scp), M_DEVBUF, M_NOWAIT)) == NULL) { printf("xe%d: failed to allocage driver storage\n", unit); return (ENOMEM); } bzero(scp, sizeof(*scp)); } /* Re-attach an existing device */ if (scp->gone) { scp->gone = 0; return 0; } /* Grep through CIS looking for relevant tuples */ offs = 0; do { u_int16_t vendor; u_int8_t rev, media, prod; /* * Read tuples one at a time into buf. Sucks, but it only happens once. * XXX - This assumes that attribute has been mapped by pccardd, which * XXX - seems to be the default situation. If not, we're well and truly * XXX - FUBAR. This is a general PCCARD problem, not our fault :) */ if ((rc = xe_memread( devi, offs, buf, CISTPL_BUFSIZE )) == 0) { switch (CISTPL_TYPE(buf)) { case 0x15: /* Grab version string (needed to ID some weird CE2's) */ #if XE_DEBUG > 1 printf("xe%d: Got version string (0x15)\n", unit); #endif for (i = 0; i < CISTPL_LEN(buf); ver_str[i] = CISTPL_DATA(buf, i++)); ver_str[i] = '\0'; ver_str[(CISTPL_BUFSIZE>>1) - 1] = CISTPL_LEN(buf); success++; break; case 0x20: /* Figure out what type of card we have */ #if XE_DEBUG > 1 printf("xe%d: Got card ID (0x20)\n", unit); #endif vendor = CISTPL_DATA(buf, 0) + (CISTPL_DATA(buf, 1) << 8); rev = CISTPL_DATA(buf, 2); media = CISTPL_DATA(buf, 3); prod = CISTPL_DATA(buf, 4); switch (vendor) { /* Get vendor ID */ case 0x0105: scp->vendor = "Xircom"; break; case 0x0138: case 0x0183: scp->vendor = "Compaq"; break; case 0x0089: scp->vendor = "Intel"; break; default: scp->vendor = "Unknown"; } if (!((prod & 0x40) && (media & 0x01))) { #if XE_DEBUG > 1 printf("xe%d: Not a PCMCIA Ethernet card!\n", unit); #endif rc = ENODEV; /* Not a PCMCIA Ethernet device */ } else { if (media & 0x10) { /* Ethernet/modem cards */ #if XE_DEBUG > 1 printf("xe%d: Card is Ethernet/modem combo\n", unit); #endif scp->modem = 1; switch (prod & 0x0f) { case 1: scp->card_type = "CEM"; break; case 2: scp->ce2 = 1; scp->card_type = "CEM2"; break; case 3: scp->ce2 = 1; scp->card_type = "CEM3"; break; case 4: scp->ce2 = 1; scp->card_type = "CEM33"; break; case 5: scp->mohawk = 1; scp->card_type = "CEM56M"; break; case 6: case 7: /* Some kind of RealPort card */ scp->mohawk = 1; scp->dingo = 1; scp->card_type = "CEM56"; break; default: rc = ENODEV; } } else { /* Ethernet-only cards */ #if XE_DEBUG > 1 printf("xe%d: Card is Ethernet only\n", unit); #endif switch (prod & 0x0f) { case 1: scp->card_type = "CE"; break; case 2: scp->ce2 = 1; scp->card_type = "CE2"; break; case 3: scp->mohawk = 1; scp->card_type = "CE3"; break; default: rc = ENODEV; } } } success++; break; case 0x22: /* Get MAC address */ if ((CISTPL_LEN(buf) == 8) && (CISTPL_DATA(buf, 0) == 0x04) && (CISTPL_DATA(buf, 1) == ETHER_ADDR_LEN)) { #if XE_DEBUG > 1 printf("xe%d: Got MAC address (0x22)\n", unit); #endif for (i = 0; i < ETHER_ADDR_LEN; scp->arpcom.ac_enaddr[i] = 
CISTPL_DATA(buf, i+2), i++); } success++; break; default: } } /* Skip to next tuple */ offs += ((CISTPL_LEN(buf) + 2) << 1); } while ((CISTPL_TYPE(buf) != 0xff) && (CISTPL_LEN(buf) != 0xff) && (rc == 0)); /* Die now if something went wrong above */ if ((rc != 0) || (success < 3)) { free(scp, M_DEVBUF); return rc; } /* Check for certain strange CE2's that look like CE's */ if (strcmp(scp->card_type, "CE") == 0) { u_char *str = ver_str; #if XE_DEBUG > 1 printf("xe%d: Checking for weird CE2 string\n", unit); #endif str += strlen(str) + 1; /* Skip forward to 3rd version string */ str += strlen(str) + 1; str += strlen(str) + 1; for (i = 0; i < strlen(str) - 2; i++) { if (bcmp(&str[i], "CE2", 3) ==0) { /* Look for "CE2" string */ scp->card_type = "CE2"; } } } /* Reject unsupported cards */ if (strcmp(scp->card_type, "CE") == 0 || strcmp(scp->card_type, "CEM") == 0) { printf("xe%d: Sorry, your %s card is not supported :(\n", unit, scp->card_type); free(scp, M_DEVBUF); return ENODEV; } /* Fill in some private data */ sca[unit] = scp; scp->dev = &devi->isahd; scp->crd = devi; scp->ifp = &scp->arpcom.ac_if; scp->ifm = &scp->ifmedia; scp->unit = unit; scp->autoneg_status = 0; /* Hack RealPorts into submission */ if (scp->dingo && xe_cem56fix(scp) < 0) { printf( "xe%d: Unable to fix your RealPort\n", unit ); sca[unit] = 0; free(scp, M_DEVBUF); return ENODEV; } /* Hopefully safe to read this here */ XE_SELECT_PAGE(4); scp->version = XE_INB(XE_BOV); /* Attempt to attach the device */ if (!xe_attach(scp->dev)) { sca[unit] = 0; free(scp, M_DEVBUF); return ENXIO; } #if NAPM > 0 /* Establish APM hooks once device attached */ scp->suspend_hook.ah_name = "xe_suspend"; scp->suspend_hook.ah_fun = xe_suspend; scp->suspend_hook.ah_arg = (void *)unit; scp->suspend_hook.ah_order = APM_MIN_ORDER; apm_hook_establish(APM_HOOK_SUSPEND, &scp->suspend_hook); scp->resume_hook.ah_name = "xe_resume"; scp->resume_hook.ah_fun = xe_resume; scp->resume_hook.ah_arg = (void *)unit; scp->resume_hook.ah_order = APM_MIN_ORDER; apm_hook_establish(APM_HOOK_RESUME, &scp->resume_hook); #endif /* NAPM > 0 */ /* Success */ return 0; } /* * Attach a device (called when xe_card_init succeeds). Assume that the probe * routine has set up the softc structure correctly and that we can trust the * unit number. */ static int xe_attach (struct isa_device *dev) { struct xe_softc *scp = sca[dev->id_unit]; int i; #ifdef XE_DEBUG printf("xe%d: attach\n", scp->unit); #endif /* Initialise the ifnet structure */ if (!scp->ifp->if_name) { scp->ifp->if_softc = scp; scp->ifp->if_name = "xe"; scp->ifp->if_unit = scp->unit; scp->ifp->if_timer = 0; scp->ifp->if_flags = (IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST); scp->ifp->if_linkmib = &scp->mibdata; scp->ifp->if_linkmiblen = sizeof scp->mibdata; scp->ifp->if_output = ether_output; scp->ifp->if_start = xe_start; scp->ifp->if_ioctl = xe_ioctl; scp->ifp->if_watchdog = xe_watchdog; scp->ifp->if_init = xe_init; scp->ifp->if_snd.ifq_maxlen = IFQ_MAXLEN; } /* Initialise the ifmedia structure */ ifmedia_init(scp->ifm, 0, xe_media_change, xe_media_status); callout_handle_init(&scp->chand); /* * Fill in supported media types. Some cards _do_ support full duplex * operation, but this driver doesn't, yet. Therefore we leave those modes * out of the list. We support some form of autoselection in all cases. 
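* Mohawk (CE3) cards are offered 100baseTX and 10baseT below, older cards 10baseT and 10base2; every card also gets an autoselect entry, which is made the default.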
*/ if (scp->mohawk) { ifmedia_add(scp->ifm, IFM_ETHER|IFM_100_TX, 0, NULL); ifmedia_add(scp->ifm, IFM_ETHER|IFM_10_T, 0, NULL); } else { ifmedia_add(scp->ifm, IFM_ETHER|IFM_10_T, 0, NULL); ifmedia_add(scp->ifm, IFM_ETHER|IFM_10_2, 0, NULL); } ifmedia_add(scp->ifm, IFM_ETHER|IFM_AUTO, 0, NULL); /* Default is to autoselect best supported media type */ ifmedia_set(scp->ifm, IFM_ETHER|IFM_AUTO); /* Print some useful information */ printf("\n"); printf("xe%d: %s %s, bonding version %#x%s%s\n", scp->unit, scp->vendor, scp->card_type, scp->version, scp->mohawk ? ", 100Mbps capable" : "", scp->modem ? ", with modem" : ""); if (scp->mohawk) { XE_SELECT_PAGE(0x10); printf("xe%d: DingoID = %#x, RevisionID = %#x, VendorID = %#x\n", scp->unit, XE_INW(XE_DINGOID), XE_INW(XE_RevID), XE_INW(XE_VendorID)); } if (scp->ce2) { XE_SELECT_PAGE(0x45); printf("xe%d: CE2 version = %#x\n", scp->unit, XE_INB(XE_REV)); } /* Print MAC address */ printf("xe%d: Ethernet address %02x", scp->unit, scp->arpcom.ac_enaddr[0]); for (i = 1; i < ETHER_ADDR_LEN; i++) { printf(":%02x", scp->arpcom.ac_enaddr[i]); } printf("\n"); /* Attach the interface */ if_attach(scp->ifp); ether_ifattach(scp->ifp); #if NBPF > 0 /* If BPF is in the kernel, call the attach for it */ #if XE_DEBUG > 1 printf("xe%d: BPF listener attached\n", scp->unit); #endif bpfattach(scp->ifp, DLT_EN10MB, sizeof(struct ether_header)); #endif /* Done */ return 1; } /* * Initialize device. Completes the reset procedure on the card and starts * output. If there's an autonegotiation in progress we DON'T do anything; * the media selection code will call us again when it's done. */ static void xe_init(void *xscp) { struct xe_softc *scp = xscp; int s; #ifdef XE_DEBUG printf("xe%d: init\n", scp->unit); #endif if (scp->gone) return; if (TAILQ_EMPTY(&scp->ifp->if_addrhead)) return; /* Reset transmitter flags */ scp->tx_queued = 0; scp->tx_tpr = 0; scp->tx_collisions = 0; scp->ifp->if_timer = 0; s = splimp(); XE_SELECT_PAGE(0x42); XE_OUTB(XE_SWC0, 0x20); /* Disable source insertion (WTF is that?) */ /* * Set the 'local memory dividing line' -- splits the 32K card memory into * 8K for transmit buffers and 24K for receive. This is done automatically * on newer revision cards. */ if (scp->srev != 1) { XE_SELECT_PAGE(2); XE_OUTW(XE_RBS, 0x2000); } /* Set up multicast addresses */ xe_setmulti(scp); /* Fix the data offset register -- reset leaves it off-by-one */ XE_SELECT_PAGE(0); XE_OUTW(XE_DO, 0x2000); /* * Set MAC interrupt masks and clear status regs. The bit names are direct * from the Linux code; I have no idea what most of them do. */ XE_SELECT_PAGE(0x40); /* Bit 7..0 */ XE_OUTB(XE_RX0Msk, 0xff); /* ROK, RAB, rsv, RO, CRC, AE, PTL, MP */ XE_OUTB(XE_TX0Msk, 0xff); /* TOK, TAB, SQE, LL, TU, JAB, EXC, CRS */ XE_OUTB(XE_TX0Msk+1, 0xb0); /* rsv, rsv, PTD, EXT, rsv, rsv, rsv, rsv */ XE_OUTB(XE_RST0, 0x00); /* ROK, RAB, REN, RO, CRC, AE, PTL, MP */ XE_OUTB(XE_TXST0, 0x00); /* TOK, TAB, SQE, LL, TU, JAB, EXC, CRS */ XE_OUTB(XE_TXST1, 0x00); /* TEN, rsv, PTD, EXT, retry_counter:4 */ /* * Check for an in-progress autonegotiation. If one is active, just set * IFF_RUNNING and return. The media selection code will call us again when * it's done. 
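 *
 * (A sketch of the expected flow, for illustration: xe_setmedia() clears
 * autoneg_status back to XE_AUTONEG_NONE once it has settled on a medium
 * and then calls xe_init() again, so on that second pass we fall through
 * to the "enable receiver" branch below.)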
*/ if (scp->autoneg_status) { scp->ifp->if_flags |= IFF_RUNNING; } else { /* Enable receiver, put MAC online */ XE_SELECT_PAGE(0x40); XE_OUTB(XE_CMD0, XE_CMD0_RX_ENABLE|XE_CMD0_ONLINE); /* Set up IMR, enable interrupts */ xe_enable_intr(scp); /* Attempt to start output */ scp->ifp->if_flags |= IFF_RUNNING; scp->ifp->if_flags &= ~IFF_OACTIVE; xe_start(scp->ifp); } (void)splx(s); } /* * Start output on interface. We make two assumptions here: * 1) that the current priority is set to splimp _before_ this code * is called *and* is returned to the appropriate priority after * return * 2) that the IFF_OACTIVE flag is checked before this code is called * (i.e. that the output part of the interface is idle) */ static void xe_start(struct ifnet *ifp) { struct xe_softc *scp = ifp->if_softc; struct mbuf *mbp; if (scp->gone) return; /* * Loop while there are packets to be sent, and space to send them. */ while (1) { IF_DEQUEUE(&ifp->if_snd, mbp); /* Suck a packet off the send queue */ if (mbp == NULL) { /* * We are using the !OACTIVE flag to indicate to the outside world that * we can accept an additional packet rather than that the transmitter * is _actually_ active. Indeed, the transmitter may be active, but if * we haven't filled all the buffers with data then we still want to * accept more. */ ifp->if_flags &= ~IFF_OACTIVE; return; } if (xe_pio_write_packet(scp, mbp) != 0) { IF_PREPEND(&ifp->if_snd, mbp); /* Push the packet back onto the queue */ ifp->if_flags |= IFF_OACTIVE; return; } #if NBPF > 0 /* Tap off here if there is a bpf listener */ if (ifp->if_bpf) { #if XE_DEBUG > 1 printf("xe%d: sending output packet to BPF\n", scp->unit); #endif bpf_mtap(ifp, mbp); } #endif /* NBPF > 0 */ ifp->if_timer = 5; /* In case we don't hear from the card again */ scp->tx_queued++; m_freem(mbp); } } /* * Process an ioctl request. Adapted from the ed driver. */ static int xe_ioctl (register struct ifnet *ifp, u_long command, caddr_t data) { struct xe_softc *scp; int s, error; scp = ifp->if_softc; error = 0; if (scp->gone) { return ENXIO; } s = splimp(); switch (command) { case SIOCSIFADDR: case SIOCGIFADDR: case SIOCSIFMTU: error = ether_ioctl(ifp, command, data); break; case SIOCSIFFLAGS: /* * If the interface is marked up and stopped, then start it. If it is * marked down and running, then stop it. */ if (ifp->if_flags & IFF_UP) { if (!(ifp->if_flags & IFF_RUNNING)) { xe_hard_reset(scp); xe_setmedia(scp); xe_init(scp); } } else { if (ifp->if_flags & IFF_RUNNING) xe_stop(scp); } case SIOCADDMULTI: case SIOCDELMULTI: /* * Multicast list has (maybe) changed; set the hardware filter * accordingly. This also serves to deal with promiscuous mode if we have * a BPF listener active. */ xe_setmulti(scp); error = 0; break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: /* * Someone wants to get/set media options. */ error = ifmedia_ioctl(ifp, (struct ifreq *)data, &scp->ifmedia, command); break; default: error = EINVAL; } (void)splx(s); return error; } /* * Card interrupt handler: should return true if the interrupt was for us, in * case we are sharing our IRQ line with other devices (this will probably be * the case for multifunction cards). * * This function is probably more complicated than it needs to be, as it * attempts to deal with the case where multiple packets get sent between * interrupts. This is especially annoying when working out the collision * stats. Not sure whether this case ever really happens or not (maybe on a * slow/heavily loaded machine?) so it's probably best to leave this like it * is. 
* * Note that the crappy PIO used to get packets on and off the card means that * you will spend a lot of time in this routine -- I can get my P150 to spend * 90% of its time servicing interrupts if I really hammer the network. Could * fix this, but then you'd start dropping/losing packets. The moral of this * story? If you want good network performance _and_ some cycles left over to * get your work done, don't buy a Xircom card. Or convince them to tell me * how to do memory-mapped I/O :) */ static int xe_card_intr(struct pccard_devinfo *devi) { struct xe_softc *scp; struct ifnet *ifp; int unit, result; u_int16_t rx_bytes, rxs, txs; u_int8_t psr, isr, esr, rsr; unit = devi->isahd.id_unit; scp = sca[unit]; ifp = &scp->arpcom.ac_if; rx_bytes = 0; /* Bytes received on this interrupt */ result = 0; /* Set true if the interrupt is for us */ if (scp->gone) return 0; if (scp->mohawk) { XE_OUTB(XE_CR, 0); /* Disable interrupts */ } psr = XE_INB(XE_PR); /* Stash the current register page */ /* * Read ISR to see what caused this interrupt. Note that this clears the * ISR on CE2 type cards. */ if ((isr = XE_INB(XE_ISR)) && isr != 0xff) { result = 1; /* This device did generate an int */ esr = XE_INB(XE_ESR); /* Read the other status registers */ XE_SELECT_PAGE(0x40); rxs = XE_INB(XE_RST0); XE_OUTB(XE_RST0, ~rxs & 0xff); txs = XE_INB(XE_TXST0); txs |= XE_INB(XE_TXST1) << 8; XE_OUTB(XE_TXST0, 0); XE_OUTB(XE_TXST1, 0); XE_SELECT_PAGE(0); #if XE_DEBUG > 2 printf("xe%d: ISR=%#2.2x ESR=%#2.2x RST=%#2.2x TXST=%#4.4x\n", unit, isr, esr, rxs, txs); #endif /* * Handle transmit interrupts */ if (isr & XE_ISR_TX_PACKET) { u_int8_t new_tpr, sent; if ((new_tpr = XE_INB(XE_TPR)) < scp->tx_tpr) /* Update packet count */ sent = (0xff - scp->tx_tpr) + new_tpr; /* TPR rolled over */ else sent = new_tpr - scp->tx_tpr; if (sent > 0) { /* Packets sent since last interrupt */ scp->tx_tpr = new_tpr; scp->tx_queued -= sent; ifp->if_opackets += sent; ifp->if_collisions += scp->tx_collisions; /* * Collision stats are a PITA. If multiples frames have been sent, we * distribute any outstanding collision count equally amongst them. * However, if we're missing interrupts we're quite likely to also * miss some collisions; thus the total count will be off anyway. * Likewise, if we miss a frame dropped due to excessive collisions * any outstanding collisions count will be held against the next * frame to be successfully sent. Hopefully it averages out in the * end! * XXX - This will screw up if tx_collisions/sent > 14. FIX IT! */ switch (scp->tx_collisions) { case 0: break; case 1: scp->mibdata.dot3StatsSingleCollisionFrames++; scp->mibdata.dot3StatsCollFrequencies[0]++; break; default: if (sent == 1) { scp->mibdata.dot3StatsMultipleCollisionFrames++; scp->mibdata.dot3StatsCollFrequencies[scp->tx_collisions-1]++; } else { /* Distribute across multiple frames */ scp->mibdata.dot3StatsMultipleCollisionFrames += sent; scp->mibdata. dot3StatsCollFrequencies[scp->tx_collisions/sent] += sent - scp->tx_collisions%sent; scp->mibdata. 
dot3StatsCollFrequencies[scp->tx_collisions/sent + 1] += scp->tx_collisions%sent; } } scp->tx_collisions = 0; } ifp->if_timer = 0; ifp->if_flags &= ~IFF_OACTIVE; } if (txs & 0x0002) { /* Excessive collisions (packet dropped) */ ifp->if_collisions += 16; ifp->if_oerrors++; scp->tx_collisions = 0; scp->mibdata.dot3StatsExcessiveCollisions++; scp->mibdata.dot3StatsMultipleCollisionFrames++; scp->mibdata.dot3StatsCollFrequencies[15]++; XE_OUTB(XE_CR, XE_CR_RESTART_TX); } if (txs & 0x0040) /* Transmit aborted -- probably collisions */ scp->tx_collisions++; /* * Handle receive interrupts */ while ((esr = XE_INB(XE_ESR)) & XE_ESR_FULL_PACKET_RX) { if ((rsr = XE_INB(XE_RSR)) & XE_RSR_RX_OK) { struct ether_header *ehp; struct mbuf *mbp; u_int16_t len; len = XE_INW(XE_RBC); if (len == 0) continue; #if 0 /* * Limit the amount of time we spend in this loop, dropping packets if * necessary. The Linux code does this with considerably more * finesse, adjusting the threshold dynamically. */ if ((rx_bytes += len) > 22000) { ifp->if_iqdrops++; scp->mibData.dot3StatsMissedFrames++; XE_OUTW(XE_DO, 0x8000); continue; } #endif if (len & 0x01) len++; MGETHDR(mbp, M_DONTWAIT, MT_DATA); /* Allocate a header mbuf */ if (mbp != NULL) { mbp->m_pkthdr.rcvif = ifp; mbp->m_pkthdr.len = mbp->m_len = len; /* * If the mbuf header isn't big enough for the packet, attach an * mbuf cluster to hold it. The +2 is to allow for the nasty little * alignment hack below. */ if (len + 2 > MHLEN) { MCLGET(mbp, M_DONTWAIT); if ((mbp->m_flags & M_EXT) == 0) { m_freem(mbp); mbp = NULL; } } } if (mbp != NULL) { /* * The Ethernet header is 14 bytes long; thus the actual packet data * won't be 32-bit aligned when it's dumped into the mbuf. We * offset everything by 2 bytes to fix this. Apparently the * alignment is important for NFS, damn its eyes. */ mbp->m_data += 2; ehp = mtod(mbp, struct ether_header *); /* * Now get the packet, including the Ethernet header and trailer (?) * We use programmed I/O, because we don't know how to do shared * memory with these cards. So yes, it's real slow, and heavy on * the interrupts (CPU on my P150 maxed out at ~950KBps incoming). */ if (scp->srev == 0) { /* Workaround a bug in old cards */ u_short rhs; XE_SELECT_PAGE(5); rhs = XE_INW(XE_RHSA); XE_SELECT_PAGE(0); rhs += 3; /* Skip control info */ if (rhs >= 0x8000) rhs = 0; if (rhs + len > 0x8000) { int i; /* * XXX - This i-- seems very wrong, but it's what the Linux guys * XXX - do. Need someone with an old CE2 to test this for me. * XXX - 99/3/28: Changed the first i-- to an i++, maybe that'll * XXX - fix it? It seems as though the previous version would * XXX - have caused an infinite loop (what, another one?). */ for (i = 0; i < len; i++, rhs++) { ((char *)ehp)[i] = XE_INB(XE_EDP); if (rhs == 0x8000) { rhs = 0; i--; } } } else insw(scp->dev->id_iobase+XE_EDP, ehp, len >> 1); } else insw(scp->dev->id_iobase+XE_EDP, ehp, len >> 1); #if NBPF > 0 /* * Check if there's a BPF listener on this interface. If so, hand * off the raw packet to bpf. */ if (ifp->if_bpf) { #if XE_DEBUG > 1 printf("xe%d: passing input packet to BPF\n", scp->unit); #endif bpf_mtap(ifp, mbp); /* * Note that the interface cannot be in promiscuous mode if there * are no BPF listeners. And if we are in promiscuous mode, we * have to check if this packet is really ours. 
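 *
 * In other words (an illustrative restatement of the test below): a frame
 * whose destination MAC is not ours -- and which, judging by the
 * XE_RSR_PHYS_PACKET check, is not broadcast/multicast -- was only seen
 * because the receiver is promiscuous, so it is freed here after the BPF
 * tap instead of being handed to ether_input().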
*/ if ((ifp->if_flags & IFF_PROMISC) && bcmp(ehp->ether_dhost, scp->arpcom.ac_enaddr, sizeof(ehp->ether_dhost)) != 0 && (rsr & XE_RSR_PHYS_PACKET)) { m_freem(mbp); mbp = NULL; } } #endif /* NBPF > 0 */ if (mbp != NULL) { mbp->m_pkthdr.len = mbp->m_len = len - ETHER_HDR_LEN; mbp->m_data += ETHER_HDR_LEN; /* Strip off Ethernet header */ ether_input(ifp, ehp, mbp); /* Send the packet on its way */ ifp->if_ipackets++; /* Success! */ } XE_OUTW(XE_DO, 0x8000); /* skip_rx_packet command */ } } else if (rsr & XE_RSR_LONG_PACKET) { /* Packet length >1518 bytes */ scp->mibdata.dot3StatsFrameTooLongs++; ifp->if_ierrors++; } else if (rsr & XE_RSR_CRC_ERROR) { /* Bad checksum on packet */ scp->mibdata.dot3StatsFCSErrors++; ifp->if_ierrors++; } else if (rsr & XE_RSR_ALIGN_ERROR) { /* Packet alignment error */ scp->mibdata.dot3StatsAlignmentErrors++; ifp->if_ierrors++; } } if (rxs & 0x10) { /* Receiver overrun */ scp->mibdata.dot3StatsInternalMacReceiveErrors++; ifp->if_ierrors++; XE_OUTB(XE_CR, XE_CR_CLEAR_OVERRUN); } } XE_SELECT_PAGE(psr); /* Restore saved page */ XE_OUTB(XE_CR, XE_CR_ENABLE_INTR); /* Re-enable interrupts */ /* Could force an int here, instead of dropping packets? */ /* XE_OUTB(XE_CR, XE_CR_ENABLE_INTR|XE_CE_FORCE_INTR); */ return result; } /* * Device timeout/watchdog routine. Called automatically if we queue a packet * for transmission but don't get an interrupt within a specified timeout * (usually 5 seconds). When this happens we assume the worst and reset the * card. */ static void xe_watchdog(struct ifnet *ifp) { struct xe_softc *scp = ifp->if_softc; if (scp->gone) return; printf("xe%d: watchdog timeout; resetting card\n", scp->unit); scp->tx_timeouts++; ifp->if_oerrors += scp->tx_queued; xe_stop(scp); xe_hard_reset(scp); xe_setmedia(scp); xe_init(scp); } /* * Change media selection. */ static int xe_media_change(struct ifnet *ifp) { struct xe_softc *scp = ifp->if_softc; #ifdef XE_DEBUG printf("xe%d: media_change\n", ifp->if_unit); #endif if (IFM_TYPE(scp->ifm->ifm_media) != IFM_ETHER) return(EINVAL); /* * Some card/media combos aren't always possible -- filter those out here. */ if ((IFM_SUBTYPE(scp->ifm->ifm_media) == IFM_AUTO || IFM_SUBTYPE(scp->ifm->ifm_media) == IFM_100_TX) && !scp->phy_ok) return (EINVAL); xe_setmedia(scp); return 0; } /* * Return current media selection. */ static void xe_media_status(struct ifnet *ifp, struct ifmediareq *mrp) { #ifdef XE_DEBUG printf("xe%d: media_status\n", ifp->if_unit); #endif mrp->ifm_active = ((struct xe_softc *)ifp->if_softc)->media; return; } /* * Select active media. */ static void xe_setmedia(void *xscp) { struct xe_softc *scp = xscp; u_int16_t bmcr, bmsr, anar, lpar; #ifdef XE_DEBUG printf("xe%d: setmedia\n", scp->unit); #endif /* Cancel any pending timeout */ untimeout(xe_setmedia, scp, scp->chand); xe_disable_intr(scp); /* Select media */ scp->media = IFM_ETHER; switch (IFM_SUBTYPE(scp->ifm->ifm_media)) { case IFM_AUTO: /* Autoselect media */ scp->media = IFM_ETHER|IFM_AUTO; /* * Autoselection is really awful. It goes something like this: * * Wait until the transmitter goes idle (2sec timeout). 
* Reset card * IF a 100Mbit PHY exists * Start NWAY autonegotiation (3.5sec timeout) * IF that succeeds * Select 100baseTX or 10baseT, whichever was detected * ELSE * Reset card * IF a 100Mbit PHY exists * Try to force a 100baseTX link (3sec timeout) * IF that succeeds * Select 100baseTX * ELSE * Disable the PHY * ENDIF * ENDIF * ENDIF * ENDIF * IF nothing selected so far * IF a 100Mbit PHY exists * Select 10baseT * ELSE * Select 10baseT or 10base2, whichever is connected * ENDIF * ENDIF */ switch (scp->autoneg_status) { case XE_AUTONEG_NONE: #if XE_DEBUG > 1 printf("xe%d: Waiting for idle transmitter\n", scp->unit); #endif scp->arpcom.ac_if.if_flags |= IFF_OACTIVE; scp->autoneg_status = XE_AUTONEG_WAITING; scp->chand = timeout(xe_setmedia, scp, hz * 2); return; case XE_AUTONEG_WAITING: xe_soft_reset(scp); if (scp->phy_ok) { #if XE_DEBUG > 1 printf("xe%d: Starting autonegotiation\n", scp->unit); #endif bmcr = xe_phy_readreg(scp, PHY_BMCR); bmcr &= ~(PHY_BMCR_AUTONEGENBL); xe_phy_writereg(scp, PHY_BMCR, bmcr); anar = xe_phy_readreg(scp, PHY_ANAR); anar &= ~(PHY_ANAR_100BT4|PHY_ANAR_100BTXFULL|PHY_ANAR_10BTFULL); anar |= PHY_ANAR_100BTXHALF|PHY_ANAR_10BTHALF; xe_phy_writereg(scp, PHY_ANAR, anar); bmcr |= PHY_BMCR_AUTONEGENBL|PHY_BMCR_AUTONEGRSTR; xe_phy_writereg(scp, PHY_BMCR, bmcr); scp->autoneg_status = XE_AUTONEG_STARTED; scp->chand = timeout(xe_setmedia, scp, hz * 7/2); return; } else { scp->autoneg_status = XE_AUTONEG_FAIL; } break; case XE_AUTONEG_STARTED: bmsr = xe_phy_readreg(scp, PHY_BMSR); lpar = xe_phy_readreg(scp, PHY_LPAR); if (bmsr & (PHY_BMSR_AUTONEGCOMP|PHY_BMSR_LINKSTAT)) { #if XE_DEBUG > 1 printf("xe%d: Autonegotiation complete!\n", scp->unit); #endif /* * XXX - Shouldn't have to do this, but (on my hub at least) the * XXX - transmitter won't work after a successful autoneg. So we see * XXX - what the negotiation result was and force that mode. I'm * XXX - sure there is an easy fix for this. */ if (lpar & PHY_LPAR_100BTXHALF) { xe_phy_writereg(scp, PHY_BMCR, PHY_BMCR_SPEEDSEL); XE_MII_DUMP(scp); XE_SELECT_PAGE(2); XE_OUTB(XE_MSR, XE_INB(XE_MSR) | 0x08); scp->media = IFM_ETHER|IFM_100_TX; scp->autoneg_status = XE_AUTONEG_NONE; } else { /* * XXX - Bit of a hack going on in here. * XXX - This is derived from Ken Hughes patch to the Linux driver * XXX - to make it work with 10Mbit _autonegotiated_ links on CE3B * XXX - cards. What's a CE3B and how's it differ from a plain CE3? * XXX - these are the things we need to find out. */ xe_phy_writereg(scp, PHY_BMCR, 0x0000); XE_SELECT_PAGE(2); /* BEGIN HACK */ XE_OUTB(XE_MSR, XE_INB(XE_MSR) | 0x08); XE_SELECT_PAGE(0x42); XE_OUTB(XE_SWC1, 0x80); scp->media = IFM_ETHER|IFM_10_T; scp->autoneg_status = XE_AUTONEG_NONE; /* END HACK */ /*XE_OUTB(XE_MSR, XE_INB(XE_MSR) & ~0x08);*/ /* Disable PHY? 
*/ /*scp->autoneg_status = XE_AUTONEG_FAIL;*/ } } else { #if XE_DEBUG > 1 printf("xe%d: Autonegotiation failed; trying 100baseTX\n", scp->unit); #endif XE_MII_DUMP(scp); xe_soft_reset(scp); if (scp->phy_ok) { xe_phy_writereg(scp, PHY_BMCR, PHY_BMCR_SPEEDSEL); scp->autoneg_status = XE_AUTONEG_100TX; scp->chand = timeout(xe_setmedia, scp, hz * 3); return; } else { scp->autoneg_status = XE_AUTONEG_FAIL; } } break; case XE_AUTONEG_100TX: (void)xe_phy_readreg(scp, PHY_BMSR); bmsr = xe_phy_readreg(scp, PHY_BMSR); if (bmsr & PHY_BMSR_LINKSTAT) { #if XE_DEBUG > 1 printf("xe%d: Got 100baseTX link!\n", scp->unit); #endif XE_MII_DUMP(scp); XE_SELECT_PAGE(2); XE_OUTB(XE_MSR, XE_INB(XE_MSR) | 0x08); scp->media = IFM_ETHER|IFM_100_TX; scp->autoneg_status = XE_AUTONEG_NONE; } else { #if XE_DEBUG > 1 printf("xe%d: Autonegotiation failed; disabling PHY\n", scp->unit); #endif XE_MII_DUMP(scp); xe_phy_writereg(scp, PHY_BMCR, 0x0000); XE_SELECT_PAGE(2); XE_OUTB(XE_MSR, XE_INB(XE_MSR) & ~0x08); /* Disable PHY? */ scp->autoneg_status = XE_AUTONEG_FAIL; } break; } /* * If we got down here _and_ autoneg_status is XE_AUTONEG_FAIL, then * either autonegotiation failed, or never got started to begin with. In * either case, select a suitable 10Mbit media and hope it works. We * don't need to reset the card again, since it will have been done * already by the big switch above. */ if (scp->autoneg_status == XE_AUTONEG_FAIL) { #if XE_DEBUG > 1 printf("xe%d: Selecting 10baseX\n", scp->unit); #endif if (scp->mohawk) { XE_SELECT_PAGE(0x42); XE_OUTB(XE_SWC1, 0x80); scp->media = IFM_ETHER|IFM_10_T; scp->autoneg_status = XE_AUTONEG_NONE; } else { XE_SELECT_PAGE(4); XE_OUTB(XE_GPR0, 4); DELAY(50000); XE_SELECT_PAGE(0x42); XE_OUTB(XE_SWC1, (XE_INB(XE_ESR) & XE_ESR_MEDIA_SELECT) ? 0x80 : 0xc0); scp->media = IFM_ETHER|((XE_INB(XE_ESR) & XE_ESR_MEDIA_SELECT) ? IFM_10_T : IFM_10_2); scp->autoneg_status = XE_AUTONEG_NONE; } } break; /* * If a specific media has been requested, we just reset the card and * select it (one small exception -- if 100baseTX is requested by there is * no PHY, we fall back to 10baseT operation). */ case IFM_100_TX: /* Force 100baseTX */ xe_soft_reset(scp); if (scp->phy_ok) { #if XE_DEBUG > 1 printf("xe%d: Selecting 100baseTX\n", scp->unit); #endif XE_SELECT_PAGE(0x42); XE_OUTB(XE_SWC1, 0); xe_phy_writereg(scp, PHY_BMCR, PHY_BMCR_SPEEDSEL); XE_SELECT_PAGE(2); XE_OUTB(XE_MSR, XE_INB(XE_MSR) | 0x08); scp->media |= IFM_100_TX; break; } /* FALLTHROUGH */ case IFM_10_T: /* Force 10baseT */ xe_soft_reset(scp); #if XE_DEBUG > 1 printf("xe%d: Selecting 10baseT\n", scp->unit); #endif if (scp->phy_ok) { xe_phy_writereg(scp, PHY_BMCR, 0x0000); XE_SELECT_PAGE(2); XE_OUTB(XE_MSR, XE_INB(XE_MSR) & ~0x08); /* Disable PHY */ } XE_SELECT_PAGE(0x42); XE_OUTB(XE_SWC1, 0x80); scp->media |= IFM_10_T; break; case IFM_10_2: xe_soft_reset(scp); #if XE_DEBUG > 1 printf("xe%d: Selecting 10base2\n", scp->unit); #endif XE_SELECT_PAGE(0x42); XE_OUTB(XE_SWC1, 0xc0); scp->media |= IFM_10_2; break; } /* * Finally, the LEDs are set to match whatever media was chosen and the * transmitter is unblocked. */ #if XE_DEBUG > 1 printf("xe%d: Setting LEDs\n", scp->unit); #endif XE_SELECT_PAGE(2); switch (IFM_SUBTYPE(scp->media)) { case IFM_100_TX: case IFM_10_T: XE_OUTB(XE_LED, 0x3b); if (scp->dingo) XE_OUTB(0x0b, 0x04); /* 100Mbit LED */ break; case IFM_10_2: XE_OUTB(XE_LED, 0x3a); break; } /* Restart output? */ scp->ifp->if_flags &= ~IFF_OACTIVE; xe_init(scp); } /* * Hard reset (power cycle) the card. 
*/ static void xe_hard_reset(struct xe_softc *scp) { int s; #ifdef XE_DEBUG printf("xe%d: hard_reset\n", scp->unit); #endif if (scp->gone) return; s = splimp(); /* * Power cycle the card. */ XE_SELECT_PAGE(4); XE_OUTB(XE_GPR1, 0); /* Power off */ DELAY(40000); if (scp->mohawk) XE_OUTB(XE_GPR1, 1); /* And back on again */ else XE_OUTB(XE_GPR1, 5); /* Also set AIC bit, whatever that is */ DELAY(40000); XE_SELECT_PAGE(0); (void)splx(s); } /* * Soft reset the card. Also makes sure that the ML6692 and 10Mbit controller * are powered up, sets the silicon revision number in softc, disables * interrupts and checks for the prescence of a 100Mbit PHY. This should * leave us in a position where we can access the PHY and do media * selection. The function imposes a 0.5s delay while the hardware powers up. */ static void xe_soft_reset(struct xe_softc *scp) { int s; #ifdef XE_DEBUG printf("xe%d: soft_reset\n", scp->unit); #endif if (scp->gone) return; s = splimp(); /* * Reset the card, (again). */ XE_SELECT_PAGE(0); XE_OUTB(XE_CR, XE_CR_SOFT_RESET); DELAY(40000); XE_OUTB(XE_CR, 0); DELAY(40000); if (scp->mohawk) { /* * set GP1 and GP2 as outputs (bits 2 & 3) * set GP1 low to power on the ML6692 (bit 0) * set GP2 high to power on the 10Mhz chip (bit 1) */ XE_SELECT_PAGE(4); XE_OUTB(XE_GPR0, 0x0e); } /* * Wait for everything to wake up. */ DELAY(500000); /* * Get silicon revision number. */ XE_SELECT_PAGE(4); if (scp->mohawk) scp->srev = (XE_INB(XE_BOV) & 0x70) >> 4; else scp->srev = (XE_INB(XE_BOV) & 0x30) >> 4; #ifdef XE_DEBUG printf("xe%d: silicon revision = %d\n", scp->unit, scp->srev); #endif /* * Shut off interrupts. */ xe_disable_intr(scp); /* * Check for PHY. */ if (scp->mohawk) { scp->phy_ok = xe_mii_init(scp); } XE_SELECT_PAGE(0); (void)splx(s); } /* * Take interface offline. This is done by powering down the device, which I * assume means just shutting down the transceiver and Ethernet logic. This * requires a _hard_ reset to recover from, as we need to power up again. */ static void xe_stop(struct xe_softc *scp) { int s; #ifdef XE_DEBUG printf("xe%d: stop\n", scp->unit); #endif if (scp->gone) return; s = splimp(); /* * Shut off interrupts. */ xe_disable_intr(scp); /* * Power down. */ XE_SELECT_PAGE(4); XE_OUTB(XE_GPR1, 0); XE_SELECT_PAGE(0); /* * ~IFF_RUNNING == interface down. */ scp->ifp->if_flags &= ~IFF_RUNNING; scp->ifp->if_flags &= ~IFF_OACTIVE; scp->ifp->if_timer = 0; (void)splx(s); } /* * Enable Ethernet interrupts from the card. */ static void xe_enable_intr(struct xe_softc *scp) { #ifdef XE_DEBUG printf("xe%d: enable_intr\n", scp->unit); #endif XE_SELECT_PAGE(1); XE_OUTB(XE_IMR0, 0xff); /* Unmask everything */ XE_OUTB(XE_IMR1, 0x01); /* Unmask TX underrun detection */ DELAY(1); XE_SELECT_PAGE(0); XE_OUTB(XE_CR, XE_CR_ENABLE_INTR); /* Enable interrupts */ if (scp->modem && !scp->dingo) { /* This bit is just magic */ if (!(XE_INB(0x10) & 0x01)) { XE_OUTB(0x10, 0x11); /* Unmask master int enable bit */ } } } /* * Disable all Ethernet interrupts from the card. */ static void xe_disable_intr(struct xe_softc *scp) { #ifdef XE_DEBUG printf("xe%d: disable_intr\n", scp->unit); #endif XE_SELECT_PAGE(0); XE_OUTB(XE_CR, 0); /* Disable interrupts */ if (scp->modem && !scp->dingo) { /* More magic (does this work?) 
*/ XE_OUTB(0x10, 0x10); /* Mask the master int enable bit */ } XE_SELECT_PAGE(1); XE_OUTB(XE_IMR0, 0); /* Forbid all interrupts */ XE_OUTB(XE_IMR1, 0); XE_SELECT_PAGE(0); } /* * Set up multicast filter and promiscuous mode */ static void xe_setmulti(struct xe_softc *scp) { struct ifnet *ifp; struct ifmultiaddr *maddr; int count; ifp = &scp->arpcom.ac_if; maddr = ifp->if_multiaddrs.lh_first; /* Get length of multicast list */ for (count = 0; maddr != NULL; maddr = maddr->ifma_link.le_next, count++); if ((ifp->if_flags & IFF_PROMISC) || (ifp->if_flags & IFF_ALLMULTI) || (count > 9)) { /* * Go into promiscuous mode if either of the PROMISC or ALLMULTI flags are * set, or if we have been asked to deal with more than 9 multicast * addresses. To do this: set MPE and PME in SWC1 */ XE_SELECT_PAGE(0x42); XE_OUTB(XE_SWC1, 0x06); } else if ((ifp->if_flags & IFF_MULTICAST) && (count > 0)) { /* * Program the filters for up to 9 addresses */ XE_SELECT_PAGE(0x42); XE_OUTB(XE_SWC1, 0x01); XE_SELECT_PAGE(0x40); XE_OUTB(XE_CMD0, XE_CMD0_OFFLINE); /*xe_reg_dump(scp);*/ xe_setaddrs(scp); /*xe_reg_dump(scp);*/ XE_SELECT_PAGE(0x40); XE_OUTB(XE_CMD0, XE_CMD0_RX_ENABLE|XE_CMD0_ONLINE); } else { /* * No multicast operation (default) */ XE_SELECT_PAGE(0x42); XE_OUTB(XE_SWC1, 0); } XE_SELECT_PAGE(0); } /* * Set up all on-chip addresses (for multicast). AFAICS, there are 10 * of these things; the first is our MAC address, the other 9 are mcast * addresses, padded with the MAC address if there aren't enough. * XXX - This doesn't work right, but I'm not sure why yet. We seem to be * XXX - doing much the same as the Linux code, which is weird enough that * XXX - it's probably right (despite my earlier comments to the contrary). */ static void xe_setaddrs(struct xe_softc *scp) { struct ifmultiaddr *maddr; u_int8_t *addr; u_int8_t page, slot, byte, i; maddr = scp->arpcom.ac_if.if_multiaddrs.lh_first; XE_SELECT_PAGE(page = 0x50); for (slot = 0, byte = 8; slot < 10; slot++) { if (slot == 0) addr = (u_int8_t *)(&scp->arpcom.ac_enaddr); else { while (maddr != NULL && maddr->ifma_addr->sa_family != AF_LINK) maddr = maddr->ifma_link.le_next; if (maddr != NULL) addr = LLADDR((struct sockaddr_dl *)maddr->ifma_addr); else addr = (u_int8_t *)(&scp->arpcom.ac_enaddr); } for (i = 0; i < 6; i++, byte++) { #if XE_DEBUG > 2 if (i) printf(":%x", addr[i]); else printf("xe%d: individual addresses %d: %x", scp->unit, slot, addr[0]); #endif if (byte > 15) { page++; byte = 8; XE_SELECT_PAGE(page); } if (scp->mohawk) XE_OUTB(byte, addr[5 - i]); else XE_OUTB(byte, addr[i]); } #if XE_DEBUG > 2 printf("\n"); #endif } XE_SELECT_PAGE(0); } /* * Write an outgoing packet to the card using programmed I/O. 
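 *
 * Worked example of the padding arithmetic used below (assuming the
 * stock ETHER_MIN_LEN/ETHER_CRC_LEN values of 64 and 4): a 20-byte
 * frame is shorter than the 60-byte minimum, so
 *     pad = (60 - 20 + 1) >> 1 = 20 filler words
 * and len is bumped to 60 before being written to the card; the filler
 * words themselves are only pushed out explicitly on non-mohawk cards.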
*/ static int xe_pio_write_packet(struct xe_softc *scp, struct mbuf *mbp) { struct mbuf *mbp2; u_int16_t len, pad, free, ok; u_int8_t *data; u_int8_t savebyte[2], wantbyte; /* Get total packet length */ for (len = 0, mbp2 = mbp; mbp2 != NULL; len += mbp2->m_len, mbp2 = mbp2->m_next); /* Packets < minimum length may need to be padded out */ pad = 0; if (len < ETHER_MIN_LEN - ETHER_CRC_LEN) { pad = (ETHER_MIN_LEN - ETHER_CRC_LEN - len + 1) >> 1; len = ETHER_MIN_LEN - ETHER_CRC_LEN; } /* Check transmit buffer space */ XE_SELECT_PAGE(0); XE_OUTW(XE_TRS, len+2); free = XE_INW(XE_TSO); ok = free & 0x8000; free &= 0x7fff; if (free <= len + 2) return 1; /* Send packet length to card */ XE_OUTW(XE_EDP, len); /* * Write packet to card using PIO (code stolen from the ed driver) */ wantbyte = 0; while (mbp != NULL) { len = mbp->m_len; if (len > 0) { data = mtod(mbp, caddr_t); if (wantbyte) { /* Finish the last word */ savebyte[1] = *data; XE_OUTW(XE_EDP, *(u_short *)savebyte); data++; len--; wantbyte = 0; } if (len > 1) { /* Output contiguous words */ outsw(scp->dev->id_iobase+XE_EDP, data, len >> 1); data += len & ~1; len &= 1; } if (len == 1) { /* Save last byte, if necessary */ savebyte[0] = *data; wantbyte = 1; } } mbp = mbp->m_next; } if (wantbyte) /* Last byte for odd-length packets */ XE_OUTW(XE_EDP, *(u_short *)savebyte); /* * For CE3 cards, just tell 'em to send -- apparently the card will pad out * short packets with random cruft. Otherwise, write nonsense words to fill * out the packet. I guess it is then sent automatically (?) */ if (scp->mohawk) XE_OUTB(XE_CR, XE_CR_TX_PACKET|XE_CR_ENABLE_INTR); else while (pad > 0) { XE_OUTW(XE_EDP, 0xdead); pad--; } return 0; } /* * The device entry is being removed, probably because someone ejected the * card. The interface should have been brought down manually before calling * this function; if not you may well lose packets. In any case, I shut down * the card and the interface, and hope for the best. The 'gone' flag is set, * so hopefully no-one else will try to access the missing card. */ static void xe_card_unload(struct pccard_devinfo *devi) { struct xe_softc *scp; struct ifnet *ifp; int unit; unit = devi->isahd.id_unit; scp = sca[unit]; ifp = &scp->arpcom.ac_if; if (scp->gone) { printf("xe%d: already unloaded\n", unit); return; } if_down(ifp); ifp->if_flags &= ~(IFF_RUNNING|IFF_OACTIVE); xe_stop(scp); scp->gone = 1; } /* * Compute the 32-bit Ethernet CRC for the given buffer. */ static u_int32_t xe_compute_crc(u_int8_t *data, int len) { u_int32_t crc = 0xffffffff; u_int32_t poly = 0x04c11db6; u_int8_t current, crc31, bit; int i, k; for (i = 0; i < len; i++) { current = data[i]; for (k = 1; k <= 8; k++) { if (crc & 0x80000000) { crc31 = 0x01; } else { crc31 = 0; } bit = crc31 ^ (current & 0x01); crc <<= 1; current >>= 1; if (bit) { crc = (crc ^ poly)|1; } } } return crc; } /* * Convert a CRC into an index into the multicast hash table. What we do is * take the most-significant 6 bits of the CRC, reverse them, and use that as * the bit number in the hash table. Bits 5:3 of the result give the byte * within the table (0-7); bits 2:0 give the bit number within that byte (also * 0-7), ie. the number of shifts needed to get it into the lsb position. 
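 *
 * A worked example of that mapping (illustrative only -- check it against
 * xe_compute_hashbit() itself): a CRC of 0xd2xxxxxx has top six bits
 * 110100; reversed that is 001011 = 11, i.e. byte 1 of the hash table
 * (bits 5:3 = 001) and bit 3 within that byte (bits 2:0 = 011).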
*/ static int xe_compute_hashbit(u_int32_t crc) { u_int8_t hashbit = 0; int i; for (i = 0; i < 6; i++) { hashbit >>= 1; if (crc & 0x80000000) { hashbit &= 0x80; } crc <<= 1; } return (hashbit >> 2); } /************************************************************** * * * M I I F U N C T I O N S * * * **************************************************************/ /* * Alternative MII/PHY handling code adapted from the xl driver. It doesn't * seem to work any better than the xirc2_ps stuff, but it's cleaner code. * XXX - this stuff shouldn't be here. It should all be abstracted off to * XXX - some kind of common MII-handling code, shared by all drivers. But * XXX - that's a whole other mission. */ #define XE_MII_SET(x) XE_OUTB(XE_GPR2, (XE_INB(XE_GPR2) | 0x04) | (x)) #define XE_MII_CLR(x) XE_OUTB(XE_GPR2, (XE_INB(XE_GPR2) | 0x04) & ~(x)) /* * Sync the PHYs by setting data bit and strobing the clock 32 times. */ static void xe_mii_sync(struct xe_softc *scp) { register int i; XE_SELECT_PAGE(2); XE_MII_SET(XE_MII_DIR|XE_MII_WRD); for (i = 0; i < 32; i++) { XE_MII_SET(XE_MII_CLK); DELAY(1); XE_MII_CLR(XE_MII_CLK); DELAY(1); } } /* * Look for a MII-compliant PHY. If we find one, reset it. */ static int xe_mii_init(struct xe_softc *scp) { u_int16_t status; status = xe_phy_readreg(scp, PHY_BMSR); if ((status & 0xff00) != 0x7800) { #if XE_DEBUG > 1 printf("xe%d: no PHY found, %0x\n", scp->unit, status); #endif return 0; } else { #if XE_DEBUG > 1 printf("xe%d: PHY OK!\n", scp->unit); #endif /* Reset the PHY */ xe_phy_writereg(scp, PHY_BMCR, PHY_BMCR_RESET); DELAY(500); while(xe_phy_readreg(scp, PHY_BMCR) & PHY_BMCR_RESET); XE_MII_DUMP(scp); return 1; } } /* * Clock a series of bits through the MII. */ static void xe_mii_send(struct xe_softc *scp, u_int32_t bits, int cnt) { int i; XE_SELECT_PAGE(2); XE_MII_CLR(XE_MII_CLK); for (i = (0x1 << (cnt - 1)); i; i >>= 1) { if (bits & i) { XE_MII_SET(XE_MII_WRD); } else { XE_MII_CLR(XE_MII_WRD); } DELAY(1); XE_MII_CLR(XE_MII_CLK); DELAY(1); XE_MII_SET(XE_MII_CLK); } } /* * Read an PHY register through the MII. */ static int xe_mii_readreg(struct xe_softc *scp, struct xe_mii_frame *frame) { int i, ack, s; s = splimp(); /* * Set up frame for RX. */ frame->mii_stdelim = XE_MII_STARTDELIM; frame->mii_opcode = XE_MII_READOP; frame->mii_turnaround = 0; frame->mii_data = 0; XE_SELECT_PAGE(2); XE_OUTB(XE_GPR2, 0); /* * Turn on data xmit. */ XE_MII_SET(XE_MII_DIR); xe_mii_sync(scp); /* * Send command/address info. */ xe_mii_send(scp, frame->mii_stdelim, 2); xe_mii_send(scp, frame->mii_opcode, 2); xe_mii_send(scp, frame->mii_phyaddr, 5); xe_mii_send(scp, frame->mii_regaddr, 5); /* Idle bit */ XE_MII_CLR((XE_MII_CLK|XE_MII_WRD)); DELAY(1); XE_MII_SET(XE_MII_CLK); DELAY(1); /* Turn off xmit. */ XE_MII_CLR(XE_MII_DIR); /* Check for ack */ XE_MII_CLR(XE_MII_CLK); DELAY(1); XE_MII_SET(XE_MII_CLK); DELAY(1); ack = XE_INB(XE_GPR2) & XE_MII_RDD; /* * Now try reading data bits. If the ack failed, we still * need to clock through 16 cycles to keep the PHY(s) in sync. */ if (ack) { for(i = 0; i < 16; i++) { XE_MII_CLR(XE_MII_CLK); DELAY(1); XE_MII_SET(XE_MII_CLK); DELAY(1); } goto fail; } for (i = 0x8000; i; i >>= 1) { XE_MII_CLR(XE_MII_CLK); DELAY(1); if (!ack) { if (XE_INB(XE_GPR2) & XE_MII_RDD) frame->mii_data |= i; DELAY(1); } XE_MII_SET(XE_MII_CLK); DELAY(1); } fail: XE_MII_CLR(XE_MII_CLK); DELAY(1); XE_MII_SET(XE_MII_CLK); DELAY(1); splx(s); if (ack) return(1); return(0); } /* * Write to a PHY register through the MII. 
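 *
 * For reference, the frame clocked out by the routine below looks like
 * (field widths in bits, most significant bit first):
 *     <start:2><opcode:2><phyaddr:5><regaddr:5><turnaround:2><data:16>
 * i.e. the usual MII management-frame layout, with the 32-bit preamble
 * supplied beforehand by xe_mii_sync().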
*/ static int xe_mii_writereg(struct xe_softc *scp, struct xe_mii_frame *frame) { int s; s = splimp(); /* * Set up frame for TX. */ frame->mii_stdelim = XE_MII_STARTDELIM; frame->mii_opcode = XE_MII_WRITEOP; frame->mii_turnaround = XE_MII_TURNAROUND; XE_SELECT_PAGE(2); /* * Turn on data output. */ XE_MII_SET(XE_MII_DIR); xe_mii_sync(scp); xe_mii_send(scp, frame->mii_stdelim, 2); xe_mii_send(scp, frame->mii_opcode, 2); xe_mii_send(scp, frame->mii_phyaddr, 5); xe_mii_send(scp, frame->mii_regaddr, 5); xe_mii_send(scp, frame->mii_turnaround, 2); xe_mii_send(scp, frame->mii_data, 16); /* Idle bit. */ XE_MII_SET(XE_MII_CLK); DELAY(1); XE_MII_CLR(XE_MII_CLK); DELAY(1); /* * Turn off xmit. */ XE_MII_CLR(XE_MII_DIR); splx(s); return(0); } /* * Read a register from the PHY. */ static u_int16_t xe_phy_readreg(struct xe_softc *scp, u_int16_t reg) { struct xe_mii_frame frame; bzero((char *)&frame, sizeof(frame)); frame.mii_phyaddr = 0; frame.mii_regaddr = reg; xe_mii_readreg(scp, &frame); return(frame.mii_data); } /* * Write to a PHY register. */ static void xe_phy_writereg(struct xe_softc *scp, u_int16_t reg, u_int16_t data) { struct xe_mii_frame frame; bzero((char *)&frame, sizeof(frame)); frame.mii_phyaddr = 0; frame.mii_regaddr = reg; frame.mii_data = data; xe_mii_writereg(scp, &frame); return; } #ifdef XE_DEBUG /* * A bit of debugging code. */ static void xe_mii_dump(struct xe_softc *scp) { int i, s; s = splimp(); printf("xe%d: MII registers: ", scp->unit); for (i = 0; i < 2; i++) { printf(" %d:%04x", i, xe_phy_readreg(scp, i)); } for (i = 4; i < 7; i++) { printf(" %d:%04x", i, xe_phy_readreg(scp, i)); } printf("\n"); (void)splx(s); } static void xe_reg_dump(struct xe_softc *scp) { int page, i, s; s = splimp(); printf("xe%d: Common registers: ", scp->unit); for (i = 0; i < 8; i++) { printf(" %2.2x", XE_INB(i)); } printf("\n"); for (page = 0; page <= 8; page++) { printf("xe%d: Register page %2.2x: ", scp->unit, page); XE_SELECT_PAGE(page); for (i = 8; i < 16; i++) { printf(" %2.2x", XE_INB(i)); } printf("\n"); } for (page = 0x10; page < 0x5f; page++) { if ((page >= 0x11 && page <= 0x3f) || (page == 0x41) || (page >= 0x43 && page <= 0x4f) || (page >= 0x59)) continue; printf("xe%d: Register page %2.2x: ", scp->unit, page); XE_SELECT_PAGE(page); for (i = 8; i < 16; i++) { printf(" %2.2x", XE_INB(i)); } printf("\n"); } (void)splx(s); } #endif #if NAPM > 0 /************************************************************** * * * A P M F U N C T I O N S * * * **************************************************************/ /* * This is called when we go into suspend/standby mode */ static int xe_suspend(void *xunit) { #ifdef XE_DEBUG struct xe_softc *scp = sca[(int)xunit]; printf("xe%d: APM suspend\n", scp->unit); #endif return 0; } /* * This is called when we wake up again */ static int xe_resume(void *xunit) { #ifdef XE_DEBUG struct xe_softc *scp = sca[(int)xunit]; printf("xe%d: APM resume\n", scp->unit); #endif return 0; } #endif /* NAPM > 0 */ #endif /* NCARD > 0 */ #endif /* NXE > 0 */ Index: head/sys/fs/cd9660/cd9660_vfsops.c =================================================================== --- head/sys/fs/cd9660/cd9660_vfsops.c (revision 49534) +++ head/sys/fs/cd9660/cd9660_vfsops.c (revision 49535) @@ -1,956 +1,955 @@ /*- * Copyright (c) 1994 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley * by Pace Willisson (pace@blitz.com). 
The Rock Ridge Extension * Support code is derived from software contributed to Berkeley * by Atsushi Murai (amurai@spec.co.jp). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)cd9660_vfsops.c 8.18 (Berkeley) 5/22/95 - * $Id: cd9660_vfsops.c,v 1.55 1999/05/08 06:39:32 phk Exp $ + * $Id: cd9660_vfsops.c,v 1.56 1999/05/31 11:27:21 phk Exp $ */ #include #include #include #include #include #include -#include #include #include #include #include #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_ISOFSMNT, "ISOFS mount", "ISOFS mount structure"); MALLOC_DEFINE(M_ISOFSNODE, "ISOFS node", "ISOFS vnode private part"); static int cd9660_mount __P((struct mount *, char *, caddr_t, struct nameidata *, struct proc *)); static int cd9660_start __P((struct mount *, int, struct proc *)); static int cd9660_unmount __P((struct mount *, int, struct proc *)); static int cd9660_root __P((struct mount *, struct vnode **)); static int cd9660_quotactl __P((struct mount *, int, uid_t, caddr_t, struct proc *)); static int cd9660_statfs __P((struct mount *, struct statfs *, struct proc *)); static int cd9660_sync __P((struct mount *, int, struct ucred *, struct proc *)); static int cd9660_vget __P((struct mount *, ino_t, struct vnode **)); static int cd9660_fhtovp __P((struct mount *, struct fid *, struct sockaddr *, struct vnode **, int *, struct ucred **)); static int cd9660_vptofh __P((struct vnode *, struct fid *)); static struct vfsops cd9660_vfsops = { cd9660_mount, cd9660_start, cd9660_unmount, cd9660_root, cd9660_quotactl, cd9660_statfs, cd9660_sync, cd9660_vget, cd9660_fhtovp, cd9660_vptofh, cd9660_init }; VFS_SET(cd9660_vfsops, cd9660, VFCF_READONLY); /* * Called by vfs_mountroot when iso is going to be mounted as root. 
*/ static int iso_get_ssector __P((dev_t dev, struct proc *p)); static int iso_mountfs __P((struct vnode *devvp, struct mount *mp, struct proc *p, struct iso_args *argp)); /* * Try to find the start of the last data track on this CD-ROM. This * is used to mount the last session of a multi-session CD. Bail out * and return 0 if we fail, this is always a safe bet. */ static int iso_get_ssector(dev, p) dev_t dev; struct proc *p; { struct ioc_toc_header h; struct ioc_read_toc_single_entry t; int i; struct cdevsw *bd; d_ioctl_t *ioctlp; bd = bdevsw(dev); ioctlp = bd->d_ioctl; if (ioctlp == NULL) return 0; if (ioctlp(dev, CDIOREADTOCHEADER, (caddr_t)&h, FREAD, p) != 0) return 0; for (i = h.ending_track; i >= 0; i--) { t.address_format = CD_LBA_FORMAT; t.track = i; if (ioctlp(dev, CDIOREADTOCENTRY, (caddr_t)&t, FREAD, p) != 0) return 0; if ((t.entry.control & 4) != 0) /* found a data track */ break; } if (i < 0) return 0; return ntohl(t.entry.addr.lba); } static int iso_mountroot __P((struct mount *mp, struct proc *p)); static int iso_mountroot(mp, p) struct mount *mp; struct proc *p; { struct iso_args args; int error; if ((error = bdevvp(rootdev, &rootvp))) { printf("iso_mountroot: can't find rootvp"); return (error); } args.flags = ISOFSMNT_ROOT; args.ssector = iso_get_ssector(rootdev, p); if (bootverbose) printf("iso_mountroot(): using session at block %d\n", args.ssector); if ((error = iso_mountfs(rootvp, mp, p, &args)) != 0) return (error); (void)cd9660_statfs(mp, &mp->mnt_stat, p); return (0); } /* * VFS Operations. * * mount system call */ static int cd9660_mount(mp, path, data, ndp, p) register struct mount *mp; char *path; caddr_t data; struct nameidata *ndp; struct proc *p; { struct vnode *devvp; struct iso_args args; size_t size; int error; mode_t accessmode; struct iso_mnt *imp = 0; if ((mp->mnt_flag & MNT_ROOTFS) != 0) { if (bdevsw(rootdev)->d_flags & D_NOCLUSTERR) mp->mnt_flag |= MNT_NOCLUSTERR; return (iso_mountroot(mp, p)); } if ((error = copyin(data, (caddr_t)&args, sizeof (struct iso_args)))) return (error); if ((mp->mnt_flag & MNT_RDONLY) == 0) return (EROFS); /* * If updating, check whether changing from read-only to * read/write; if there is no device name, that's all we do. * Disallow clearing MNT_NOCLUSTERR flag, if block device requests. */ if (mp->mnt_flag & MNT_UPDATE) { imp = VFSTOISOFS(mp); if (bdevsw(imp->im_devvp->v_rdev)->d_flags & D_NOCLUSTERR) mp->mnt_flag |= MNT_NOCLUSTERR; if (args.fspec == 0) return (vfs_export(mp, &imp->im_export, &args.export)); } /* * Not an update, or updating the name: look up the name * and verify that it refers to a sensible block device. 
*/ NDINIT(ndp, LOOKUP, FOLLOW, UIO_USERSPACE, args.fspec, p); if ((error = namei(ndp))) return (error); devvp = ndp->ni_vp; if (devvp->v_type != VBLK) { vrele(devvp); return ENOTBLK; } if (bdevsw(devvp->v_rdev) == NULL) { vrele(devvp); return ENXIO; } /* * Verify that user has necessary permissions on the device, * or has superuser abilities */ accessmode = VREAD; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, p); error = VOP_ACCESS(devvp, accessmode, p->p_ucred, p); if (error) error = suser(p); if (error) { vput(devvp); return (error); } VOP_UNLOCK(devvp, 0, p); if ((mp->mnt_flag & MNT_UPDATE) == 0) { if (bdevsw(devvp->v_rdev)->d_flags & D_NOCLUSTERR) mp->mnt_flag |= MNT_NOCLUSTERR; error = iso_mountfs(devvp, mp, p, &args); } else { if (devvp != imp->im_devvp) error = EINVAL; /* needs translation */ else vrele(devvp); } if (error) { vrele(devvp); return error; } imp = VFSTOISOFS(mp); (void) copyinstr(path, mp->mnt_stat.f_mntonname, MNAMELEN - 1, &size); bzero(mp->mnt_stat.f_mntonname + size, MNAMELEN - size); (void) copyinstr(args.fspec, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, &size); bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size); (void) cd9660_statfs(mp, &mp->mnt_stat, p); return 0; } /* * Common code for mount and mountroot */ static int iso_mountfs(devvp, mp, p, argp) register struct vnode *devvp; struct mount *mp; struct proc *p; struct iso_args *argp; { register struct iso_mnt *isomp = (struct iso_mnt *)0; struct buf *bp = NULL; struct buf *pribp = NULL, *supbp = NULL; dev_t dev = devvp->v_rdev; int error = EINVAL; int needclose = 0; int high_sierra = 0; int iso_bsize; int iso_blknum; int joliet_level; struct iso_volume_descriptor *vdp = 0; struct iso_primary_descriptor *pri = NULL; struct iso_sierra_primary_descriptor *pri_sierra = NULL; struct iso_supplementary_descriptor *sup = NULL; struct iso_directory_record *rootp; int logical_block_size; if (!(mp->mnt_flag & MNT_RDONLY)) return EROFS; /* * Disallow multiple mounts of the same device. * Disallow mounting of a device that is currently in use * (except for root, which might share swap device for miniroot). * Flush out any old buffers remaining from a previous use. */ if ((error = vfs_mountedon(devvp))) return error; if (vcount(devvp) > 1 && devvp != rootvp) return EBUSY; if ((error = vinvalbuf(devvp, V_SAVE, p->p_ucred, p, 0, 0))) return (error); if ((error = VOP_OPEN(devvp, FREAD, FSCRED, p))) return error; needclose = 1; /* This is the "logical sector size". The standard says this * should be 2048 or the physical sector size on the device, * whichever is greater. For now, we'll just use a constant. */ iso_bsize = ISO_DEFAULT_BLOCK_SIZE; joliet_level = 0; for (iso_blknum = 16 + argp->ssector; iso_blknum < 100 + argp->ssector; iso_blknum++) { if ((error = bread(devvp, iso_blknum * btodb(iso_bsize), iso_bsize, NOCRED, &bp)) != 0) goto out; vdp = (struct iso_volume_descriptor *)bp->b_data; if (bcmp (vdp->id, ISO_STANDARD_ID, sizeof vdp->id) != 0) { if (bcmp (vdp->id_sierra, ISO_SIERRA_ID, sizeof vdp->id) != 0) { error = EINVAL; goto out; } else high_sierra = 1; } switch (isonum_711 (high_sierra? 
vdp->type_sierra: vdp->type)){ case ISO_VD_PRIMARY: if (pribp == NULL) { pribp = bp; bp = NULL; pri = (struct iso_primary_descriptor *)vdp; pri_sierra = (struct iso_sierra_primary_descriptor *)vdp; } break; case ISO_VD_SUPPLEMENTARY: if (supbp == NULL) { supbp = bp; bp = NULL; sup = (struct iso_supplementary_descriptor *)vdp; if (!(argp->flags & ISOFSMNT_NOJOLIET)) { if (bcmp(sup->escape, "%/@", 3) == 0) joliet_level = 1; if (bcmp(sup->escape, "%/C", 3) == 0) joliet_level = 2; if (bcmp(sup->escape, "%/E", 3) == 0) joliet_level = 3; if (isonum_711 (sup->flags) & 1) joliet_level = 0; } } break; case ISO_VD_END: goto vd_end; default: break; } if (bp) { brelse(bp); bp = NULL; } } vd_end: if (bp) { brelse(bp); bp = NULL; } if (pri == NULL) { error = EINVAL; goto out; } logical_block_size = isonum_723 (high_sierra? pri_sierra->logical_block_size: pri->logical_block_size); if (logical_block_size < DEV_BSIZE || logical_block_size > MAXBSIZE || (logical_block_size & (logical_block_size - 1)) != 0) { error = EINVAL; goto out; } rootp = (struct iso_directory_record *) (high_sierra? pri_sierra->root_directory_record: pri->root_directory_record); isomp = malloc(sizeof *isomp, M_ISOFSMNT, M_WAITOK); bzero((caddr_t)isomp, sizeof *isomp); isomp->logical_block_size = logical_block_size; isomp->volume_space_size = isonum_733 (high_sierra? pri_sierra->volume_space_size: pri->volume_space_size); isomp->joliet_level = 0; /* * Since an ISO9660 multi-session CD can also access previous * sessions, we have to include them into the space consider- * ations. This doesn't yield a very accurate number since * parts of the old sessions might be inaccessible now, but we * can't do much better. This is also important for the NFS * filehandle validation. */ isomp->volume_space_size += argp->ssector; bcopy (rootp, isomp->root, sizeof isomp->root); isomp->root_extent = isonum_733 (rootp->extent); isomp->root_size = isonum_733 (rootp->size); isomp->im_bmask = logical_block_size - 1; isomp->im_bshift = ffs(logical_block_size) - 1; pribp->b_flags |= B_AGE; brelse(pribp); pribp = NULL; mp->mnt_data = (qaddr_t)isomp; mp->mnt_stat.f_fsid.val[0] = (long)dev; mp->mnt_stat.f_fsid.val[1] = mp->mnt_vfc->vfc_typenum; mp->mnt_maxsymlinklen = 0; mp->mnt_flag |= MNT_LOCAL; isomp->im_mountp = mp; isomp->im_dev = dev; isomp->im_devvp = devvp; devvp->v_specmountpoint = mp; /* Check the Rock Ridge Extention support */ if (!(argp->flags & ISOFSMNT_NORRIP)) { if ((error = bread(isomp->im_devvp, (isomp->root_extent + isonum_711(rootp->ext_attr_length)) << (isomp->im_bshift - DEV_BSHIFT), isomp->logical_block_size, NOCRED, &bp)) != 0) goto out; rootp = (struct iso_directory_record *)bp->b_data; if ((isomp->rr_skip = cd9660_rrip_offset(rootp,isomp)) < 0) { argp->flags |= ISOFSMNT_NORRIP; } else { argp->flags &= ~ISOFSMNT_GENS; } /* * The contents are valid, * but they will get reread as part of another vnode, so... 
*/ bp->b_flags |= B_AGE; brelse(bp); bp = NULL; } isomp->im_flags = argp->flags & (ISOFSMNT_NORRIP | ISOFSMNT_GENS | ISOFSMNT_EXTATT | ISOFSMNT_NOJOLIET); if (high_sierra) { /* this effectively ignores all the mount flags */ log(LOG_INFO, "cd9660: High Sierra Format\n"); isomp->iso_ftype = ISO_FTYPE_HIGH_SIERRA; } else switch (isomp->im_flags&(ISOFSMNT_NORRIP|ISOFSMNT_GENS)) { default: isomp->iso_ftype = ISO_FTYPE_DEFAULT; break; case ISOFSMNT_GENS|ISOFSMNT_NORRIP: isomp->iso_ftype = ISO_FTYPE_9660; break; case 0: log(LOG_INFO, "cd9660: RockRidge Extension\n"); isomp->iso_ftype = ISO_FTYPE_RRIP; break; } /* Decide whether to use the Joliet descriptor */ if (isomp->iso_ftype != ISO_FTYPE_RRIP && joliet_level) { log(LOG_INFO, "cd9660: Joliet Extension\n"); rootp = (struct iso_directory_record *) sup->root_directory_record; bcopy (rootp, isomp->root, sizeof isomp->root); isomp->root_extent = isonum_733 (rootp->extent); isomp->root_size = isonum_733 (rootp->size); isomp->joliet_level = joliet_level; supbp->b_flags |= B_AGE; } if (supbp) { brelse(supbp); supbp = NULL; } return 0; out: devvp->v_specmountpoint = NULL; if (bp) brelse(bp); if (pribp) brelse(pribp); if (supbp) brelse(supbp); if (needclose) (void)VOP_CLOSE(devvp, FREAD, NOCRED, p); if (isomp) { free((caddr_t)isomp, M_ISOFSMNT); mp->mnt_data = (qaddr_t)0; } return error; } /* * Make a filesystem operational. * Nothing to do at the moment. */ /* ARGSUSED */ static int cd9660_start(mp, flags, p) struct mount *mp; int flags; struct proc *p; { return 0; } /* * unmount system call */ static int cd9660_unmount(mp, mntflags, p) struct mount *mp; int mntflags; struct proc *p; { register struct iso_mnt *isomp; int error, flags = 0; if (mntflags & MNT_FORCE) flags |= FORCECLOSE; #if 0 mntflushbuf(mp, 0); if (mntinvalbuf(mp)) return EBUSY; #endif if ((error = vflush(mp, NULLVP, flags))) return (error); isomp = VFSTOISOFS(mp); isomp->im_devvp->v_specmountpoint = NULL; error = VOP_CLOSE(isomp->im_devvp, FREAD, NOCRED, p); vrele(isomp->im_devvp); free((caddr_t)isomp, M_ISOFSMNT); mp->mnt_data = (qaddr_t)0; mp->mnt_flag &= ~MNT_LOCAL; return (error); } /* * Return root of a filesystem */ static int cd9660_root(mp, vpp) struct mount *mp; struct vnode **vpp; { struct iso_mnt *imp = VFSTOISOFS(mp); struct iso_directory_record *dp = (struct iso_directory_record *)imp->root; ino_t ino = isodirino(dp, imp); /* * With RRIP we must use the `.' entry of the root directory. * Simply tell vget, that it's a relocated directory. */ return (cd9660_vget_internal(mp, ino, vpp, imp->iso_ftype == ISO_FTYPE_RRIP, dp)); } /* * Do operations associated with quotas, not supported */ /* ARGSUSED */ static int cd9660_quotactl(mp, cmd, uid, arg, p) struct mount *mp; int cmd; uid_t uid; caddr_t arg; struct proc *p; { return (EOPNOTSUPP); } /* * Get file system statistics. 
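 *
 * (Since cd9660 is read-only there is nothing useful to report for the
 * free-space and free-inode fields; the code below simply returns zero
 * for those, and only f_bsize/f_iosize and the total block count carry
 * real information.)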
*/ int cd9660_statfs(mp, sbp, p) struct mount *mp; register struct statfs *sbp; struct proc *p; { register struct iso_mnt *isomp; isomp = VFSTOISOFS(mp); sbp->f_bsize = isomp->logical_block_size; sbp->f_iosize = sbp->f_bsize; /* XXX */ sbp->f_blocks = isomp->volume_space_size; sbp->f_bfree = 0; /* total free blocks */ sbp->f_bavail = 0; /* blocks free for non superuser */ sbp->f_files = 0; /* total files */ sbp->f_ffree = 0; /* free file nodes */ if (sbp != &mp->mnt_stat) { sbp->f_type = mp->mnt_vfc->vfc_typenum; bcopy(mp->mnt_stat.f_mntonname, sbp->f_mntonname, MNAMELEN); bcopy(mp->mnt_stat.f_mntfromname, sbp->f_mntfromname, MNAMELEN); } return 0; } /* ARGSUSED */ static int cd9660_sync(mp, waitfor, cred, p) struct mount *mp; int waitfor; struct ucred *cred; struct proc *p; { return (0); } /* * File handle to vnode * * Have to be really careful about stale file handles: * - check that the inode number is in range * - call iget() to get the locked inode * - check for an unallocated inode (i_mode == 0) * - check that the generation number matches */ struct ifid { ushort ifid_len; ushort ifid_pad; int ifid_ino; long ifid_start; }; /* ARGSUSED */ int cd9660_fhtovp(mp, fhp, nam, vpp, exflagsp, credanonp) register struct mount *mp; struct fid *fhp; struct sockaddr *nam; struct vnode **vpp; int *exflagsp; struct ucred **credanonp; { struct ifid *ifhp = (struct ifid *)fhp; register struct iso_node *ip; register struct netcred *np; register struct iso_mnt *imp = VFSTOISOFS(mp); struct vnode *nvp; int error; #ifdef ISOFS_DBG printf("fhtovp: ino %d, start %ld\n", ifhp->ifid_ino, ifhp->ifid_start); #endif /* * Get the export permission structure for this tuple. */ np = vfs_export_lookup(mp, &imp->im_export, nam); if (np == NULL) return (EACCES); if ((error = VFS_VGET(mp, ifhp->ifid_ino, &nvp)) != 0) { *vpp = NULLVP; return (error); } ip = VTOI(nvp); if (ip->inode.iso_mode == 0) { vput(nvp); *vpp = NULLVP; return (ESTALE); } *vpp = nvp; *exflagsp = np->netc_exflags; *credanonp = &np->netc_anon; return (0); } int cd9660_vget(mp, ino, vpp) struct mount *mp; ino_t ino; struct vnode **vpp; { /* * XXXX * It would be nice if we didn't always set the `relocated' flag * and force the extra read, but I don't want to think about fixing * that right now. */ return (cd9660_vget_internal(mp, ino, vpp, #if 0 VFSTOISOFS(mp)->iso_ftype == ISO_FTYPE_RRIP, #else 0, #endif (struct iso_directory_record *)0)); } int cd9660_vget_internal(mp, ino, vpp, relocated, isodir) struct mount *mp; ino_t ino; struct vnode **vpp; int relocated; struct iso_directory_record *isodir; { struct iso_mnt *imp; struct iso_node *ip; struct buf *bp; struct vnode *vp, *nvp; dev_t dev; int error; imp = VFSTOISOFS(mp); dev = imp->im_dev; if ((*vpp = cd9660_ihashget(dev, ino)) != NULLVP) return (0); /* Allocate a new vnode/iso_node. */ if ((error = getnewvnode(VT_ISOFS, mp, cd9660_vnodeop_p, &vp)) != 0) { *vpp = NULLVP; return (error); } MALLOC(ip, struct iso_node *, sizeof(struct iso_node), M_ISOFSNODE, M_WAITOK); bzero((caddr_t)ip, sizeof(struct iso_node)); lockinit(&ip->i_lock, PINOD, "isonode", 0, 0); vp->v_data = ip; ip->i_vnode = vp; ip->i_dev = dev; ip->i_number = ino; /* * Put it onto its hash chain and lock it so that other requests for * this inode will block if they arrive while we are sleeping waiting * for old data structures to be purged or for the contents of the * disk portion of this inode to be read. 
*/ cd9660_ihashins(ip); if (isodir == 0) { int lbn, off; lbn = lblkno(imp, ino); if (lbn >= imp->volume_space_size) { vput(vp); printf("fhtovp: lbn exceed volume space %d\n", lbn); return (ESTALE); } off = blkoff(imp, ino); if (off + ISO_DIRECTORY_RECORD_SIZE > imp->logical_block_size) { vput(vp); printf("fhtovp: crosses block boundary %d\n", off + ISO_DIRECTORY_RECORD_SIZE); return (ESTALE); } error = bread(imp->im_devvp, lbn << (imp->im_bshift - DEV_BSHIFT), imp->logical_block_size, NOCRED, &bp); if (error) { vput(vp); brelse(bp); printf("fhtovp: bread error %d\n",error); return (error); } isodir = (struct iso_directory_record *)(bp->b_data + off); if (off + isonum_711(isodir->length) > imp->logical_block_size) { vput(vp); if (bp != 0) brelse(bp); printf("fhtovp: directory crosses block boundary %d[off=%d/len=%d]\n", off +isonum_711(isodir->length), off, isonum_711(isodir->length)); return (ESTALE); } #if 0 if (isonum_733(isodir->extent) + isonum_711(isodir->ext_attr_length) != ifhp->ifid_start) { if (bp != 0) brelse(bp); printf("fhtovp: file start miss %d vs %d\n", isonum_733(isodir->extent) + isonum_711(isodir->ext_attr_length), ifhp->ifid_start); return (ESTALE); } #endif } else bp = 0; ip->i_mnt = imp; ip->i_devvp = imp->im_devvp; VREF(ip->i_devvp); if (relocated) { /* * On relocated directories we must * read the `.' entry out of a dir. */ ip->iso_start = ino >> imp->im_bshift; if (bp != 0) brelse(bp); if ((error = cd9660_blkatoff(vp, (off_t)0, NULL, &bp)) != 0) { vput(vp); return (error); } isodir = (struct iso_directory_record *)bp->b_data; } ip->iso_extent = isonum_733(isodir->extent); ip->i_size = isonum_733(isodir->size); ip->iso_start = isonum_711(isodir->ext_attr_length) + ip->iso_extent; /* * Setup time stamp, attribute */ vp->v_type = VNON; switch (imp->iso_ftype) { default: /* ISO_FTYPE_9660 */ { struct buf *bp2; int off; if ((imp->im_flags & ISOFSMNT_EXTATT) && (off = isonum_711(isodir->ext_attr_length))) cd9660_blkatoff(vp, (off_t)-(off << imp->im_bshift), NULL, &bp2); else bp2 = NULL; cd9660_defattr(isodir, ip, bp2, ISO_FTYPE_9660); cd9660_deftstamp(isodir, ip, bp2, ISO_FTYPE_9660); if (bp2) brelse(bp2); break; } case ISO_FTYPE_RRIP: cd9660_rrip_analyze(isodir, ip, imp); break; } if (bp != 0) brelse(bp); /* * Initialize the associated vnode */ switch (vp->v_type = IFTOVT(ip->inode.iso_mode)) { case VFIFO: vp->v_op = cd9660_fifoop_p; break; case VCHR: case VBLK: /* * if device, look at device number table for translation */ vp->v_op = cd9660_specop_p; if ((nvp = checkalias(vp, ip->inode.iso_rdev, mp)) != NULL) { /* * Discard unneeded vnode, but save its iso_node. * Note that the lock is carried over in the iso_node * to the replacement vnode. */ nvp->v_data = vp->v_data; vp->v_data = NULL; vp->v_op = spec_vnodeop_p; vrele(vp); vgone(vp); /* * Reinitialize aliased inode. */ vp = nvp; ip->i_vnode = vp; } break; default: break; } if (ip->iso_extent == imp->root_extent) vp->v_flag |= VROOT; /* * XXX need generation number? 
*/ *vpp = vp; return (0); } /* * Vnode pointer to File handle */ /* ARGSUSED */ int cd9660_vptofh(vp, fhp) struct vnode *vp; struct fid *fhp; { register struct iso_node *ip = VTOI(vp); register struct ifid *ifhp; ifhp = (struct ifid *)fhp; ifhp->ifid_len = sizeof(struct ifid); ifhp->ifid_ino = ip->i_number; ifhp->ifid_start = ip->iso_start; #ifdef ISOFS_DBG printf("vptofh: ino %d, start %ld\n", ifhp->ifid_ino,ifhp->ifid_start); #endif return 0; } Index: head/sys/fs/cd9660/cd9660_vnops.c =================================================================== --- head/sys/fs/cd9660/cd9660_vnops.c (revision 49534) +++ head/sys/fs/cd9660/cd9660_vnops.c (revision 49535) @@ -1,917 +1,917 @@ /*- * Copyright (c) 1994 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley * by Pace Willisson (pace@blitz.com). The Rock Ridge Extension * Support code is derived from software contributed to Berkeley * by Atsushi Murai (amurai@spec.co.jp). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)cd9660_vnops.c 8.19 (Berkeley) 5/27/95 - * $Id: cd9660_vnops.c,v 1.55 1999/04/18 10:58:02 dcs Exp $ + * $Id: cd9660_vnops.c,v 1.56 1999/05/11 19:54:25 phk Exp $ */ #include #include #include #include #include #include #include #include -#include #include #include #include #include +#include #include #include #include #include #include #include static int cd9660_setattr __P((struct vop_setattr_args *)); static int cd9660_access __P((struct vop_access_args *)); static int cd9660_getattr __P((struct vop_getattr_args *)); static int cd9660_pathconf __P((struct vop_pathconf_args *)); static int cd9660_read __P((struct vop_read_args *)); struct isoreaddir; static int iso_uiodir __P((struct isoreaddir *idp, struct dirent *dp, off_t off)); static int iso_shipdir __P((struct isoreaddir *idp)); static int cd9660_readdir __P((struct vop_readdir_args *)); static int cd9660_readlink __P((struct vop_readlink_args *ap)); static int cd9660_abortop __P((struct vop_abortop_args *)); static int cd9660_strategy __P((struct vop_strategy_args *)); static int cd9660_print __P((struct vop_print_args *)); static int cd9660_getpages __P((struct vop_getpages_args *)); static int cd9660_putpages __P((struct vop_putpages_args *)); /* * Setattr call. Only allowed for block and character special devices. */ int cd9660_setattr(ap) struct vop_setattr_args /* { struct vnodeop_desc *a_desc; struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct proc *a_p; } */ *ap; { struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; if (vap->va_flags != (u_long)VNOVAL || vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL || vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL || vap->va_mode != (mode_t)VNOVAL) return (EROFS); if (vap->va_size != (u_quad_t)VNOVAL) { switch (vp->v_type) { case VDIR: return (EISDIR); case VLNK: case VREG: return (EROFS); case VCHR: case VBLK: case VSOCK: case VFIFO: case VNON: case VBAD: return (0); } } return (0); } /* * Check mode permission on inode pointer. Mode is READ, WRITE or EXEC. * The mode is shifted to select the owner/group/other fields. The * super user is granted all permissions. */ /* ARGSUSED */ static int cd9660_access(ap) struct vop_access_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct proc *a_p; } */ *ap; { struct vnode *vp = ap->a_vp; struct iso_node *ip = VTOI(vp); struct ucred *cred = ap->a_cred; mode_t mask, mode = ap->a_mode; gid_t *gp; int i; /* * Disallow write attempts unless the file is a socket, * fifo, or a block or character device resident on the * file system. */ if (mode & VWRITE) { switch (vp->v_type) { case VDIR: case VLNK: case VREG: return (EROFS); /* NOT REACHED */ default: break; } } /* User id 0 always gets access. */ if (cred->cr_uid == 0) return (0); mask = 0; /* Otherwise, check the owner. */ if (cred->cr_uid == ip->inode.iso_uid) { if (mode & VEXEC) mask |= S_IXUSR; if (mode & VREAD) mask |= S_IRUSR; if (mode & VWRITE) mask |= S_IWUSR; return ((ip->inode.iso_mode & mask) == mask ? 0 : EACCES); } /* Otherwise, check the groups. */ for (i = 0, gp = cred->cr_groups; i < cred->cr_ngroups; i++, gp++) if (ip->inode.iso_gid == *gp) { if (mode & VEXEC) mask |= S_IXGRP; if (mode & VREAD) mask |= S_IRGRP; if (mode & VWRITE) mask |= S_IWGRP; return ((ip->inode.iso_mode & mask) == mask ? 0 : EACCES); } /* Otherwise, check everyone else. */ if (mode & VEXEC) mask |= S_IXOTH; if (mode & VREAD) mask |= S_IROTH; if (mode & VWRITE) mask |= S_IWOTH; return ((ip->inode.iso_mode & mask) == mask ? 
0 : EACCES); } static int cd9660_getattr(ap) struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct proc *a_p; } */ *ap; { struct vnode *vp = ap->a_vp; register struct vattr *vap = ap->a_vap; register struct iso_node *ip = VTOI(vp); vap->va_fsid = dev2udev(ip->i_dev); vap->va_fileid = ip->i_number; vap->va_mode = ip->inode.iso_mode; vap->va_nlink = ip->inode.iso_links; vap->va_uid = ip->inode.iso_uid; vap->va_gid = ip->inode.iso_gid; vap->va_atime = ip->inode.iso_atime; vap->va_mtime = ip->inode.iso_mtime; vap->va_ctime = ip->inode.iso_ctime; vap->va_rdev = ip->inode.iso_rdev; vap->va_size = (u_quad_t) ip->i_size; if (ip->i_size == 0 && (vap->va_mode & S_IFMT) == S_IFLNK) { struct vop_readlink_args rdlnk; struct iovec aiov; struct uio auio; char *cp; MALLOC(cp, char *, MAXPATHLEN, M_TEMP, M_WAITOK); aiov.iov_base = cp; aiov.iov_len = MAXPATHLEN; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_procp = ap->a_p; auio.uio_resid = MAXPATHLEN; rdlnk.a_uio = &auio; rdlnk.a_vp = ap->a_vp; rdlnk.a_cred = ap->a_cred; if (cd9660_readlink(&rdlnk) == 0) vap->va_size = MAXPATHLEN - auio.uio_resid; FREE(cp, M_TEMP); } vap->va_flags = 0; vap->va_gen = 1; vap->va_blocksize = ip->i_mnt->logical_block_size; vap->va_bytes = (u_quad_t) ip->i_size; vap->va_type = vp->v_type; vap->va_filerev = 0; return (0); } /* * Vnode op for reading. */ static int cd9660_read(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { struct vnode *vp = ap->a_vp; register struct uio *uio = ap->a_uio; register struct iso_node *ip = VTOI(vp); register struct iso_mnt *imp; struct buf *bp; daddr_t lbn, rablock; off_t diff; int rasize, error = 0; long size, n, on; if (uio->uio_resid == 0) return (0); if (uio->uio_offset < 0) return (EINVAL); ip->i_flag |= IN_ACCESS; imp = ip->i_mnt; do { lbn = lblkno(imp, uio->uio_offset); on = blkoff(imp, uio->uio_offset); n = min((u_int)(imp->logical_block_size - on), uio->uio_resid); diff = (off_t)ip->i_size - uio->uio_offset; if (diff <= 0) return (0); if (diff < n) n = diff; size = blksize(imp, ip, lbn); rablock = lbn + 1; if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) { if (lblktosize(imp, rablock) < ip->i_size) error = cluster_read(vp, (off_t)ip->i_size, lbn, size, NOCRED, uio->uio_resid, (ap->a_ioflag >> 16), &bp); else error = bread(vp, lbn, size, NOCRED, &bp); } else { if (vp->v_lastr + 1 == lbn && lblktosize(imp, rablock) < ip->i_size) { rasize = blksize(imp, ip, rablock); error = breadn(vp, lbn, size, &rablock, &rasize, 1, NOCRED, &bp); } else error = bread(vp, lbn, size, NOCRED, &bp); } vp->v_lastr = lbn; n = min(n, size - bp->b_resid); if (error) { brelse(bp); return (error); } error = uiomove(bp->b_data + on, (int)n, uio); brelse(bp); } while (error == 0 && uio->uio_resid > 0 && n != 0); return (error); } /* * Structure for reading directories */ struct isoreaddir { struct dirent saveent; struct dirent assocent; struct dirent current; off_t saveoff; off_t assocoff; off_t curroff; struct uio *uio; off_t uio_off; int eofflag; u_long *cookies; int ncookies; }; int iso_uiodir(idp,dp,off) struct isoreaddir *idp; struct dirent *dp; off_t off; { int error; dp->d_name[dp->d_namlen] = 0; dp->d_reclen = GENERIC_DIRSIZ(dp); if (idp->uio->uio_resid < dp->d_reclen) { idp->eofflag = 0; return (-1); } if (idp->cookies) { if (idp->ncookies <= 0) { idp->eofflag = 0; return (-1); } *idp->cookies++ = off; 
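/* Each cookie records the directory offset at which a later readdir should resume after this entry; one slot is consumed per entry returned. */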
--idp->ncookies; } if ((error = uiomove((caddr_t) dp,dp->d_reclen,idp->uio)) != 0) return (error); idp->uio_off = off; return (0); } int iso_shipdir(idp) struct isoreaddir *idp; { struct dirent *dp; int cl, sl, assoc; int error; char *cname, *sname; cl = idp->current.d_namlen; cname = idp->current.d_name; assoc = (cl > 1) && (*cname == ASSOCCHAR); if (assoc) { cl--; cname++; } dp = &idp->saveent; sname = dp->d_name; if (!(sl = dp->d_namlen)) { dp = &idp->assocent; sname = dp->d_name + 1; sl = dp->d_namlen - 1; } if (sl > 0) { if (sl != cl || bcmp(sname,cname,sl)) { if (idp->assocent.d_namlen) { if ((error = iso_uiodir(idp,&idp->assocent,idp->assocoff)) != 0) return (error); idp->assocent.d_namlen = 0; } if (idp->saveent.d_namlen) { if ((error = iso_uiodir(idp,&idp->saveent,idp->saveoff)) != 0) return (error); idp->saveent.d_namlen = 0; } } } idp->current.d_reclen = GENERIC_DIRSIZ(&idp->current); if (assoc) { idp->assocoff = idp->curroff; bcopy(&idp->current,&idp->assocent,idp->current.d_reclen); } else { idp->saveoff = idp->curroff; bcopy(&idp->current,&idp->saveent,idp->current.d_reclen); } return (0); } /* * Vnode op for readdir */ static int cd9660_readdir(ap) struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; int *a_ncookies; u_long *a_cookies; } */ *ap; { register struct uio *uio = ap->a_uio; struct isoreaddir *idp; struct vnode *vdp = ap->a_vp; struct iso_node *dp; struct iso_mnt *imp; struct buf *bp = NULL; struct iso_directory_record *ep; int entryoffsetinblock; doff_t endsearch; u_long bmask; int error = 0; int reclen; u_short namelen; int ncookies = 0; u_long *cookies = NULL; dp = VTOI(vdp); imp = dp->i_mnt; bmask = imp->im_bmask; MALLOC(idp, struct isoreaddir *, sizeof(*idp), M_TEMP, M_WAITOK); idp->saveent.d_namlen = idp->assocent.d_namlen = 0; /* * XXX * Is it worth trying to figure out the type? */ idp->saveent.d_type = idp->assocent.d_type = idp->current.d_type = DT_UNKNOWN; idp->uio = uio; if (ap->a_ncookies == NULL) { idp->cookies = NULL; } else { /* * Guess the number of cookies needed. */ ncookies = uio->uio_resid / 16; MALLOC(cookies, u_long *, ncookies * sizeof(u_int), M_TEMP, M_WAITOK); idp->cookies = cookies; idp->ncookies = ncookies; } idp->eofflag = 1; idp->curroff = uio->uio_offset; if ((entryoffsetinblock = idp->curroff & bmask) && (error = cd9660_blkatoff(vdp, (off_t)idp->curroff, NULL, &bp))) { FREE(idp, M_TEMP); return (error); } endsearch = dp->i_size; while (idp->curroff < endsearch) { /* * If offset is on a block boundary, * read the next directory block. * Release previous if it exists. */ if ((idp->curroff & bmask) == 0) { if (bp != NULL) brelse(bp); if ((error = cd9660_blkatoff(vdp, (off_t)idp->curroff, NULL, &bp)) != 0) break; entryoffsetinblock = 0; } /* * Get pointer to next entry. 
*/ ep = (struct iso_directory_record *) ((char *)bp->b_data + entryoffsetinblock); reclen = isonum_711(ep->length); if (reclen == 0) { /* skip to next block, if any */ idp->curroff = (idp->curroff & ~bmask) + imp->logical_block_size; continue; } if (reclen < ISO_DIRECTORY_RECORD_SIZE) { error = EINVAL; /* illegal entry, stop */ break; } if (entryoffsetinblock + reclen > imp->logical_block_size) { error = EINVAL; /* illegal directory, so stop looking */ break; } idp->current.d_namlen = isonum_711(ep->name_len); if (reclen < ISO_DIRECTORY_RECORD_SIZE + idp->current.d_namlen) { error = EINVAL; /* illegal entry, stop */ break; } if (isonum_711(ep->flags)&2) idp->current.d_fileno = isodirino(ep, imp); else idp->current.d_fileno = dbtob(bp->b_blkno) + entryoffsetinblock; idp->curroff += reclen; switch (imp->iso_ftype) { case ISO_FTYPE_RRIP: cd9660_rrip_getname(ep,idp->current.d_name, &namelen, &idp->current.d_fileno,imp); idp->current.d_namlen = (u_char)namelen; if (idp->current.d_namlen) error = iso_uiodir(idp,&idp->current,idp->curroff); break; default: /* ISO_FTYPE_DEFAULT || ISO_FTYPE_9660 || ISO_FTYPE_HIGH_SIERRA*/ strcpy(idp->current.d_name,".."); if (idp->current.d_namlen == 1 && ep->name[0] == 0) { idp->current.d_namlen = 1; error = iso_uiodir(idp,&idp->current,idp->curroff); } else if (idp->current.d_namlen == 1 && ep->name[0] == 1) { idp->current.d_namlen = 2; error = iso_uiodir(idp,&idp->current,idp->curroff); } else { isofntrans(ep->name,idp->current.d_namlen, idp->current.d_name, &namelen, imp->iso_ftype == ISO_FTYPE_9660, isonum_711(ep->flags)&4, imp->joliet_level); idp->current.d_namlen = (u_char)namelen; if (imp->iso_ftype == ISO_FTYPE_DEFAULT) error = iso_shipdir(idp); else error = iso_uiodir(idp,&idp->current,idp->curroff); } } if (error) break; entryoffsetinblock += reclen; } if (!error && imp->iso_ftype == ISO_FTYPE_DEFAULT) { idp->current.d_namlen = 0; error = iso_shipdir(idp); } if (error < 0) error = 0; if (ap->a_ncookies != NULL) { if (error) free(cookies, M_TEMP); else { /* * Work out the number of cookies actually used. */ *ap->a_ncookies = ncookies - idp->ncookies; *ap->a_cookies = cookies; } } if (bp) brelse (bp); uio->uio_offset = idp->uio_off; *ap->a_eofflag = idp->eofflag; FREE(idp, M_TEMP); return (error); } /* * Return target name of a symbolic link * Shouldn't we get the parent vnode and read the data from there? * This could eventually result in deadlocks in cd9660_lookup. * But otherwise the block read here is in the block buffer two times. */ typedef struct iso_directory_record ISODIR; typedef struct iso_node ISONODE; typedef struct iso_mnt ISOMNT; static int cd9660_readlink(ap) struct vop_readlink_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; } */ *ap; { ISONODE *ip; ISODIR *dirp; ISOMNT *imp; struct buf *bp; struct uio *uio; u_short symlen; int error; char *symname; ip = VTOI(ap->a_vp); imp = ip->i_mnt; uio = ap->a_uio; if (imp->iso_ftype != ISO_FTYPE_RRIP) return (EINVAL); /* * Get parents directory record block that this inode included. */ error = bread(imp->im_devvp, (ip->i_number >> imp->im_bshift) << (imp->im_bshift - DEV_BSHIFT), imp->logical_block_size, NOCRED, &bp); if (error) { brelse(bp); return (EINVAL); } /* * Setup the directory pointer for this inode */ dirp = (ISODIR *)(bp->b_data + (ip->i_number & imp->im_bmask)); /* * Just make sure, we have a right one.... 
* 1: Check not cross boundary on block */ if ((ip->i_number & imp->im_bmask) + isonum_711(dirp->length) > (unsigned)imp->logical_block_size) { brelse(bp); return (EINVAL); } /* * Now get a buffer * Abuse a namei buffer for now. */ if (uio->uio_segflg == UIO_SYSSPACE) symname = uio->uio_iov->iov_base; else symname = zalloc(namei_zone); /* * Ok, we just gathering a symbolic name in SL record. */ if (cd9660_rrip_getsymname(dirp, symname, &symlen, imp) == 0) { if (uio->uio_segflg != UIO_SYSSPACE) zfree(namei_zone, symname); brelse(bp); return (EINVAL); } /* * Don't forget before you leave from home ;-) */ brelse(bp); /* * return with the symbolic name to caller's. */ if (uio->uio_segflg != UIO_SYSSPACE) { error = uiomove(symname, symlen, uio); zfree(namei_zone, symname); return (error); } uio->uio_resid -= symlen; uio->uio_iov->iov_base += symlen; uio->uio_iov->iov_len -= symlen; return (0); } /* * Ufs abort op, called after namei() when a CREATE/DELETE isn't actually * done. If a buffer has been saved in anticipation of a CREATE, delete it. */ static int cd9660_abortop(ap) struct vop_abortop_args /* { struct vnode *a_dvp; struct componentname *a_cnp; } */ *ap; { if ((ap->a_cnp->cn_flags & (HASBUF | SAVESTART)) == HASBUF) zfree(namei_zone, ap->a_cnp->cn_pnbuf); return (0); } /* * Calculate the logical to physical mapping if not done already, * then call the device strategy routine. */ static int cd9660_strategy(ap) struct vop_strategy_args /* { struct buf *a_vp; struct buf *a_bp; } */ *ap; { register struct buf *bp = ap->a_bp; register struct vnode *vp = bp->b_vp; register struct iso_node *ip; int error; ip = VTOI(vp); if (vp->v_type == VBLK || vp->v_type == VCHR) panic("cd9660_strategy: spec"); if (bp->b_blkno == bp->b_lblkno) { if ((error = VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL))) { bp->b_error = error; bp->b_flags |= B_ERROR; biodone(bp); return (error); } if ((long)bp->b_blkno == -1) clrbuf(bp); } if ((long)bp->b_blkno == -1) { biodone(bp); return (0); } vp = ip->i_devvp; bp->b_dev = vp->v_rdev; VOP_STRATEGY(vp, bp); return (0); } /* * Print out the contents of an inode. */ static int cd9660_print(ap) struct vop_print_args /* { struct vnode *a_vp; } */ *ap; { printf("tag VT_ISOFS, isofs vnode\n"); return (0); } /* * Return POSIX pathconf information applicable to cd9660 filesystems. */ static int cd9660_pathconf(ap) struct vop_pathconf_args /* { struct vnode *a_vp; int a_name; register_t *a_retval; } */ *ap; { switch (ap->a_name) { case _PC_LINK_MAX: *ap->a_retval = 1; return (0); case _PC_NAME_MAX: if (VTOI(ap->a_vp)->i_mnt->iso_ftype == ISO_FTYPE_RRIP) *ap->a_retval = NAME_MAX; else *ap->a_retval = 37; return (0); case _PC_PATH_MAX: *ap->a_retval = PATH_MAX; return (0); case _PC_PIPE_BUF: *ap->a_retval = PIPE_BUF; return (0); case _PC_CHOWN_RESTRICTED: *ap->a_retval = 1; return (0); case _PC_NO_TRUNC: *ap->a_retval = 1; return (0); default: return (EINVAL); } /* NOTREACHED */ } /* * get page routine * * XXX By default, wimp out... note that a_offset is ignored (and always * XXX has been). */ int cd9660_getpages(ap) struct vop_getpages_args *ap; { return vnode_pager_generic_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_reqpage); } /* * put page routine * * XXX By default, wimp out... note that a_offset is ignored (and always * XXX has been). 
*/ int cd9660_putpages(ap) struct vop_putpages_args *ap; { return vnode_pager_generic_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync, ap->a_rtvals); } /* * Global vfs data structures for cd9660 */ vop_t **cd9660_vnodeop_p; static struct vnodeopv_entry_desc cd9660_vnodeop_entries[] = { { &vop_default_desc, (vop_t *) vop_defaultop }, { &vop_abortop_desc, (vop_t *) cd9660_abortop }, { &vop_access_desc, (vop_t *) cd9660_access }, { &vop_bmap_desc, (vop_t *) cd9660_bmap }, { &vop_cachedlookup_desc, (vop_t *) cd9660_lookup }, { &vop_getattr_desc, (vop_t *) cd9660_getattr }, { &vop_inactive_desc, (vop_t *) cd9660_inactive }, { &vop_islocked_desc, (vop_t *) vop_stdislocked }, { &vop_lock_desc, (vop_t *) vop_stdlock }, { &vop_lookup_desc, (vop_t *) vfs_cache_lookup }, { &vop_pathconf_desc, (vop_t *) cd9660_pathconf }, { &vop_print_desc, (vop_t *) cd9660_print }, { &vop_read_desc, (vop_t *) cd9660_read }, { &vop_readdir_desc, (vop_t *) cd9660_readdir }, { &vop_readlink_desc, (vop_t *) cd9660_readlink }, { &vop_reclaim_desc, (vop_t *) cd9660_reclaim }, { &vop_setattr_desc, (vop_t *) cd9660_setattr }, { &vop_strategy_desc, (vop_t *) cd9660_strategy }, { &vop_unlock_desc, (vop_t *) vop_stdunlock }, { &vop_getpages_desc, (vop_t *) cd9660_getpages }, { &vop_putpages_desc, (vop_t *) cd9660_putpages }, { NULL, NULL } }; static struct vnodeopv_desc cd9660_vnodeop_opv_desc = { &cd9660_vnodeop_p, cd9660_vnodeop_entries }; VNODEOP_SET(cd9660_vnodeop_opv_desc); /* * Special device vnode ops */ vop_t **cd9660_specop_p; static struct vnodeopv_entry_desc cd9660_specop_entries[] = { { &vop_default_desc, (vop_t *) spec_vnoperate }, { &vop_access_desc, (vop_t *) cd9660_access }, { &vop_getattr_desc, (vop_t *) cd9660_getattr }, { &vop_inactive_desc, (vop_t *) cd9660_inactive }, { &vop_islocked_desc, (vop_t *) vop_stdislocked }, { &vop_lock_desc, (vop_t *) vop_stdlock }, { &vop_print_desc, (vop_t *) cd9660_print }, { &vop_reclaim_desc, (vop_t *) cd9660_reclaim }, { &vop_setattr_desc, (vop_t *) cd9660_setattr }, { &vop_unlock_desc, (vop_t *) vop_stdunlock }, { NULL, NULL } }; static struct vnodeopv_desc cd9660_specop_opv_desc = { &cd9660_specop_p, cd9660_specop_entries }; VNODEOP_SET(cd9660_specop_opv_desc); vop_t **cd9660_fifoop_p; static struct vnodeopv_entry_desc cd9660_fifoop_entries[] = { { &vop_default_desc, (vop_t *) fifo_vnoperate }, { &vop_access_desc, (vop_t *) cd9660_access }, { &vop_getattr_desc, (vop_t *) cd9660_getattr }, { &vop_inactive_desc, (vop_t *) cd9660_inactive }, { &vop_islocked_desc, (vop_t *) vop_stdislocked }, { &vop_lock_desc, (vop_t *) vop_stdlock }, { &vop_print_desc, (vop_t *) cd9660_print }, { &vop_reclaim_desc, (vop_t *) cd9660_reclaim }, { &vop_setattr_desc, (vop_t *) cd9660_setattr }, { &vop_unlock_desc, (vop_t *) vop_stdunlock }, { NULL, NULL } }; static struct vnodeopv_desc cd9660_fifoop_opv_desc = { &cd9660_fifoop_p, cd9660_fifoop_entries }; VNODEOP_SET(cd9660_fifoop_opv_desc); Index: head/sys/fs/coda/coda_vfsops.c =================================================================== --- head/sys/fs/coda/coda_vfsops.c (revision 49534) +++ head/sys/fs/coda/coda_vfsops.c (revision 49535) @@ -1,589 +1,587 @@ /* * * Coda: an Experimental Distributed File System * Release 3.1 * * Copyright (c) 1987-1998 Carnegie Mellon University * All Rights Reserved * * Permission to use, copy, modify and distribute this software and its * documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or 
modified versions, and any portions * thereof, and that both notices appear in supporting documentation, and * that credit is given to Carnegie Mellon University in all documents * and publicity pertaining to direct or indirect use of this code or its * derivatives. * * CODA IS AN EXPERIMENTAL SOFTWARE SYSTEM AND IS KNOWN TO HAVE BUGS, * SOME OF WHICH MAY HAVE SERIOUS CONSEQUENCES. CARNEGIE MELLON ALLOWS * FREE USE OF THIS SOFTWARE IN ITS "AS IS" CONDITION. CARNEGIE MELLON * DISCLAIMS ANY LIABILITY OF ANY KIND FOR ANY DAMAGES WHATSOEVER * RESULTING DIRECTLY OR INDIRECTLY FROM THE USE OF THIS SOFTWARE OR OF * ANY DERIVATIVE WORK. * * Carnegie Mellon encourages users of this software to return any * improvements or extensions that they make, and to grant Carnegie * Mellon the rights to redistribute these changes without encumbrance. * * @(#) src/sys/cfs/coda_vfsops.c,v 1.1.1.1 1998/08/29 21:14:52 rvb Exp $ - * $Id: coda_vfsops.c,v 1.15 1999/07/20 07:18:17 phk Exp $ + * $Id: coda_vfsops.c,v 1.16 1999/07/21 12:51:36 phk Exp $ * */ /* * Mach Operating System * Copyright (c) 1989 Carnegie-Mellon University * All rights reserved. The CMU software License Agreement specifies * the terms and conditions for use and redistribution. */ /* * This code was written for the Coda file system at Carnegie Mellon * University. Contributers include David Steere, James Kistler, and * M. Satyanarayanan. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include - -#include MALLOC_DEFINE(M_CODA, "CODA storage", "Various Coda Structures"); int codadebug = 0; int coda_vfsop_print_entry = 0; #define ENTRY if(coda_vfsop_print_entry) myprintf(("Entered %s\n",__FUNCTION__)) struct vnode *coda_ctlvp; struct coda_mntinfo coda_mnttbl[NVCODA]; /* indexed by minor device number */ /* structure to keep statistics of internally generated/satisfied calls */ struct coda_op_stats coda_vfsopstats[CODA_VFSOPS_SIZE]; #define MARK_ENTRY(op) (coda_vfsopstats[op].entries++) #define MARK_INT_SAT(op) (coda_vfsopstats[op].sat_intrn++) #define MARK_INT_FAIL(op) (coda_vfsopstats[op].unsat_intrn++) #define MRAK_INT_GEN(op) (coda_vfsopstats[op].gen_intrn++) extern int coda_nc_initialized; /* Set if cache has been initialized */ extern int vc_nb_open __P((dev_t, int, int, struct proc *)); int coda_vfsopstats_init(void) { register int i; for (i=0;ini_vp; if (error) { MARK_INT_FAIL(CODA_MOUNT_STATS); return (error); } if (dvp->v_type != VCHR) { MARK_INT_FAIL(CODA_MOUNT_STATS); vrele(dvp); return(ENXIO); } dev = dvp->v_rdev; vrele(dvp); /* * See if the device table matches our expectations. */ if (devsw(dev)->d_open != vc_nb_open) { MARK_INT_FAIL(CODA_MOUNT_STATS); return(ENXIO); } if (minor(dev) >= NVCODA || minor(dev) < 0) { MARK_INT_FAIL(CODA_MOUNT_STATS); return(ENXIO); } /* * Initialize the mount record and link it to the vfs struct */ mi = &coda_mnttbl[minor(dev)]; if (!VC_OPEN(&mi->mi_vcomm)) { MARK_INT_FAIL(CODA_MOUNT_STATS); return(ENODEV); } /* No initialization (here) of mi_vcomm! */ vfsp->mnt_data = (qaddr_t)mi; vfs_getnewfsid (vfsp); mi->mi_vfsp = vfsp; /* * Make a root vnode to placate the Vnode interface, but don't * actually make the CODA_ROOT call to venus until the first call * to coda_root in case a server is down while venus is starting. 
*/ rootfid.Volume = 0; rootfid.Vnode = 0; rootfid.Unique = 0; cp = make_coda_node(&rootfid, vfsp, VDIR); rootvp = CTOV(cp); rootvp->v_flag |= VROOT; ctlfid.Volume = CTL_VOL; ctlfid.Vnode = CTL_VNO; ctlfid.Unique = CTL_UNI; /* cp = make_coda_node(&ctlfid, vfsp, VCHR); The above code seems to cause a loop in the cnode links. I don't totally understand when it happens, it is caught when closing down the system. */ cp = make_coda_node(&ctlfid, 0, VCHR); coda_ctlvp = CTOV(cp); /* Add vfs and rootvp to chain of vfs hanging off mntinfo */ mi->mi_vfsp = vfsp; mi->mi_rootvp = rootvp; /* set filesystem block size */ vfsp->mnt_stat.f_bsize = 8192; /* XXX -JJK */ /* Set f_iosize. XXX -- inamura@isl.ntt.co.jp. For vnode_pager_haspage() references. The value should be obtained from underlying UFS. */ /* Checked UFS. iosize is set as 8192 */ vfsp->mnt_stat.f_iosize = 8192; /* error is currently guaranteed to be zero, but in case some code changes... */ CODADEBUG(1, myprintf(("coda_mount returned %d\n",error));); if (error) MARK_INT_FAIL(CODA_MOUNT_STATS); else MARK_INT_SAT(CODA_MOUNT_STATS); return(error); } int coda_start(vfsp, flags, p) struct mount *vfsp; int flags; struct proc *p; { ENTRY; return (0); } int coda_unmount(vfsp, mntflags, p) struct mount *vfsp; int mntflags; struct proc *p; { struct coda_mntinfo *mi = vftomi(vfsp); int active, error = 0; ENTRY; MARK_ENTRY(CODA_UMOUNT_STATS); if (!CODA_MOUNTED(vfsp)) { MARK_INT_FAIL(CODA_UMOUNT_STATS); return(EINVAL); } if (mi->mi_vfsp == vfsp) { /* We found the victim */ if (!IS_UNMOUNTING(VTOC(mi->mi_rootvp))) return (EBUSY); /* Venus is still running */ #ifdef DEBUG printf("coda_unmount: ROOT: vp %p, cp %p\n", mi->mi_rootvp, VTOC(mi->mi_rootvp)); #endif vrele(mi->mi_rootvp); active = coda_kill(vfsp, NOT_DOWNCALL); mi->mi_rootvp->v_flag &= ~VROOT; error = vflush(mi->mi_vfsp, NULLVP, FORCECLOSE); printf("coda_unmount: active = %d, vflush active %d\n", active, error); error = 0; /* I'm going to take this out to allow lookups to go through. I'm * not sure it's important anyway. -- DCS 2/2/94 */ /* vfsp->VFS_DATA = NULL; */ /* No more vfsp's to hold onto */ mi->mi_vfsp = NULL; mi->mi_rootvp = NULL; if (error) MARK_INT_FAIL(CODA_UMOUNT_STATS); else MARK_INT_SAT(CODA_UMOUNT_STATS); return(error); } return (EINVAL); } /* * find root of cfs */ int coda_root(vfsp, vpp) struct mount *vfsp; struct vnode **vpp; { struct coda_mntinfo *mi = vftomi(vfsp); struct vnode **result; int error; struct proc *p = curproc; /* XXX - bnoble */ ViceFid VFid; ENTRY; MARK_ENTRY(CODA_ROOT_STATS); result = NULL; if (vfsp == mi->mi_vfsp) { if ((VTOC(mi->mi_rootvp)->c_fid.Volume != 0) || (VTOC(mi->mi_rootvp)->c_fid.Vnode != 0) || (VTOC(mi->mi_rootvp)->c_fid.Unique != 0)) { /* Found valid root. */ *vpp = mi->mi_rootvp; /* On Mach, this is vref. On NetBSD, VOP_LOCK */ #if 1 vref(*vpp); vn_lock(*vpp, LK_EXCLUSIVE, p); #else vget(*vpp, LK_EXCLUSIVE, p); #endif MARK_INT_SAT(CODA_ROOT_STATS); return(0); } } error = venus_root(vftomi(vfsp), p->p_cred->pc_ucred, p, &VFid); if (!error) { /* * Save the new rootfid in the cnode, and rehash the cnode into the * cnode hash with the new fid key. */ coda_unsave(VTOC(mi->mi_rootvp)); VTOC(mi->mi_rootvp)->c_fid = VFid; coda_save(VTOC(mi->mi_rootvp)); *vpp = mi->mi_rootvp; #if 1 vref(*vpp); vn_lock(*vpp, LK_EXCLUSIVE, p); #else vget(*vpp, LK_EXCLUSIVE, p); #endif MARK_INT_SAT(CODA_ROOT_STATS); goto exit; } else if (error == ENODEV || error == EINTR) { /* Gross hack here! 
*/ /* * If Venus fails to respond to the CODA_ROOT call, coda_call returns * ENODEV. Return the uninitialized root vnode to allow vfs * operations such as unmount to continue. Without this hack, * there is no way to do an unmount if Venus dies before a * successful CODA_ROOT call is done. All vnode operations * will fail. */ *vpp = mi->mi_rootvp; #if 1 vref(*vpp); vn_lock(*vpp, LK_EXCLUSIVE, p); #else vget(*vpp, LK_EXCLUSIVE, p); #endif MARK_INT_FAIL(CODA_ROOT_STATS); error = 0; goto exit; } else { CODADEBUG( CODA_ROOT, myprintf(("error %d in CODA_ROOT\n", error)); ); MARK_INT_FAIL(CODA_ROOT_STATS); goto exit; } exit: return(error); } int coda_quotactl(vfsp, cmd, uid, arg, p) struct mount *vfsp; int cmd; uid_t uid; caddr_t arg; struct proc *p; { ENTRY; return (EOPNOTSUPP); } /* * Get file system statistics. */ int coda_nb_statfs(vfsp, sbp, p) register struct mount *vfsp; struct statfs *sbp; struct proc *p; { ENTRY; /* MARK_ENTRY(CODA_STATFS_STATS); */ if (!CODA_MOUNTED(vfsp)) { /* MARK_INT_FAIL(CODA_STATFS_STATS);*/ return(EINVAL); } bzero(sbp, sizeof(struct statfs)); /* XXX - what to do about f_flags, others? --bnoble */ /* Below This is what AFS does #define NB_SFS_SIZ 0x895440 */ /* Note: Normal fs's have a bsize of 0x400 == 1024 */ sbp->f_type = vfsp->mnt_vfc->vfc_typenum; sbp->f_bsize = 8192; /* XXX */ sbp->f_iosize = 8192; /* XXX */ #define NB_SFS_SIZ 0x8AB75D sbp->f_blocks = NB_SFS_SIZ; sbp->f_bfree = NB_SFS_SIZ; sbp->f_bavail = NB_SFS_SIZ; sbp->f_files = NB_SFS_SIZ; sbp->f_ffree = NB_SFS_SIZ; bcopy((caddr_t)&(vfsp->mnt_stat.f_fsid), (caddr_t)&(sbp->f_fsid), sizeof (fsid_t)); snprintf(sbp->f_mntonname, sizeof(sbp->f_mntonname), "/coda"); snprintf(sbp->f_mntfromname, sizeof(sbp->f_mntfromname), "CODA"); /* MARK_INT_SAT(CODA_STATFS_STATS); */ return(0); } /* * Flush any pending I/O. */ int coda_sync(vfsp, waitfor, cred, p) struct mount *vfsp; int waitfor; struct ucred *cred; struct proc *p; { ENTRY; MARK_ENTRY(CODA_SYNC_STATS); MARK_INT_SAT(CODA_SYNC_STATS); return(0); } int coda_vget(vfsp, ino, vpp) struct mount *vfsp; ino_t ino; struct vnode **vpp; { ENTRY; return (EOPNOTSUPP); } /* * fhtovp is now what vget used to be in 4.3-derived systems. For * some silly reason, vget is now keyed by a 32 bit ino_t, rather than * a type-specific fid. */ int coda_fhtovp(vfsp, fhp, nam, vpp, exflagsp, creadanonp) register struct mount *vfsp; struct fid *fhp; struct mbuf *nam; struct vnode **vpp; int *exflagsp; struct ucred **creadanonp; { struct cfid *cfid = (struct cfid *)fhp; struct cnode *cp = 0; int error; struct proc *p = curproc; /* XXX -mach */ ViceFid VFid; int vtype; ENTRY; MARK_ENTRY(CODA_VGET_STATS); /* Check for vget of control object. */ if (IS_CTL_FID(&cfid->cfid_fid)) { *vpp = coda_ctlvp; vref(coda_ctlvp); MARK_INT_SAT(CODA_VGET_STATS); return(0); } error = venus_fhtovp(vftomi(vfsp), &cfid->cfid_fid, p->p_cred->pc_ucred, p, &VFid, &vtype); if (error) { CODADEBUG(CODA_VGET, myprintf(("vget error %d\n",error));) *vpp = (struct vnode *)0; } else { CODADEBUG(CODA_VGET, myprintf(("vget: vol %lx vno %lx uni %lx type %d result %d\n", VFid.Volume, VFid.Vnode, VFid.Unique, vtype, error)); ) cp = make_coda_node(&VFid, vfsp, vtype); *vpp = CTOV(cp); } return(error); } int coda_vptofh(vnp, fidp) struct vnode *vnp; struct fid *fidp; { ENTRY; return (EOPNOTSUPP); } int coda_init(struct vfsconf *vfsp) { ENTRY; return 0; } /* * To allow for greater ease of use, some vnodes may be orphaned when * Venus dies. 
Certain operations should still be allowed to go * through, but without propagating ophan-ness. So this function will * get a new vnode for the file from the current run of Venus. */ int getNewVnode(vpp) struct vnode **vpp; { struct cfid cfid; struct coda_mntinfo *mi = vftomi((*vpp)->v_mount); ENTRY; cfid.cfid_len = (short)sizeof(ViceFid); cfid.cfid_fid = VTOC(*vpp)->c_fid; /* Structure assignment. */ /* XXX ? */ /* We're guessing that if set, the 1st element on the list is a * valid vnode to use. If not, return ENODEV as venus is dead. */ if (mi->mi_vfsp == NULL) return ENODEV; return coda_fhtovp(mi->mi_vfsp, (struct fid*)&cfid, NULL, vpp, NULL, NULL); } #include #include /* get the mount structure corresponding to a given device. Assume * device corresponds to a UFS. Return NULL if no device is found. */ struct mount *devtomp(dev) dev_t dev; { struct mount *mp, *nmp; for (mp = mountlist.cqh_first; mp != (void*)&mountlist; mp = nmp) { nmp = mp->mnt_list.cqe_next; if (((VFSTOUFS(mp))->um_dev == dev)) { /* mount corresponds to UFS and the device matches one we want */ return(mp); } } /* mount structure wasn't found */ return(NULL); } struct vfsops coda_vfsops = { coda_mount, coda_start, coda_unmount, coda_root, coda_quotactl, coda_nb_statfs, coda_sync, coda_vget, (int (*) (struct mount *, struct fid *, struct sockaddr *, struct vnode **, int *, struct ucred **)) eopnotsupp, (int (*) (struct vnode *, struct fid *)) eopnotsupp, coda_init, }; VFS_SET(coda_vfsops, coda, VFCF_NETWORK); Index: head/sys/fs/msdosfs/msdosfs_vfsops.c =================================================================== --- head/sys/fs/msdosfs/msdosfs_vfsops.c (revision 49534) +++ head/sys/fs/msdosfs/msdosfs_vfsops.c (revision 49535) @@ -1,1017 +1,1016 @@ -/* $Id: msdosfs_vfsops.c,v 1.44 1999/05/08 06:40:00 phk Exp $ */ +/* $Id: msdosfs_vfsops.c,v 1.45 1999/05/31 11:28:02 phk Exp $ */ /* $NetBSD: msdosfs_vfsops.c,v 1.51 1997/11/17 15:36:58 ws Exp $ */ /*- * Copyright (C) 1994, 1995, 1997 Wolfgang Solfrank. * Copyright (C) 1994, 1995, 1997 TooLs GmbH. * All rights reserved. * Original code by Paul Popelka (paulp@uts.amdahl.com) (see below). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by TooLs GmbH. * 4. The name of TooLs GmbH may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * Written by Paul Popelka (paulp@uts.amdahl.com) * * You can do anything you want with this software, just don't say you wrote * it, and don't remove this notice. * * This software is provided "as is". * * The author supplies this software to be publicly redistributed on the * understanding that the author is not responsible for the correct * functioning of this software in any circumstances and is not liable for * any damages caused by this software. * * October 1992 */ #include #include #include #include #include #include #include -#include /* XXX */ /* defines v_rdev */ #include #include #include #include #include /* defines ALLPERMS */ #include #include #include #include #include #include MALLOC_DEFINE(M_MSDOSFSMNT, "MSDOSFS mount", "MSDOSFS mount structure"); static MALLOC_DEFINE(M_MSDOSFSFAT, "MSDOSFS FAT", "MSDOSFS file allocation table"); static int update_mp __P((struct mount *mp, struct msdosfs_args *argp)); static int mountmsdosfs __P((struct vnode *devvp, struct mount *mp, struct proc *p, struct msdosfs_args *argp)); static int msdosfs_fhtovp __P((struct mount *, struct fid *, struct sockaddr *, struct vnode **, int *, struct ucred **)); static int msdosfs_mount __P((struct mount *, char *, caddr_t, struct nameidata *, struct proc *)); static int msdosfs_quotactl __P((struct mount *, int, uid_t, caddr_t, struct proc *)); static int msdosfs_root __P((struct mount *, struct vnode **)); static int msdosfs_start __P((struct mount *, int, struct proc *)); static int msdosfs_statfs __P((struct mount *, struct statfs *, struct proc *)); static int msdosfs_sync __P((struct mount *, int, struct ucred *, struct proc *)); static int msdosfs_unmount __P((struct mount *, int, struct proc *)); static int msdosfs_vget __P((struct mount *mp, ino_t ino, struct vnode **vpp)); static int msdosfs_vptofh __P((struct vnode *, struct fid *)); static int update_mp(mp, argp) struct mount *mp; struct msdosfs_args *argp; { struct msdosfsmount *pmp = VFSTOMSDOSFS(mp); int error; pmp->pm_gid = argp->gid; pmp->pm_uid = argp->uid; pmp->pm_mask = argp->mask & ALLPERMS; pmp->pm_flags |= argp->flags & MSDOSFSMNT_MNTOPT; if (pmp->pm_flags & MSDOSFSMNT_U2WTABLE) { bcopy(argp->u2w, pmp->pm_u2w, sizeof(pmp->pm_u2w)); bcopy(argp->d2u, pmp->pm_d2u, sizeof(pmp->pm_d2u)); bcopy(argp->u2d, pmp->pm_u2d, sizeof(pmp->pm_u2d)); } if (pmp->pm_flags & MSDOSFSMNT_ULTABLE) { bcopy(argp->ul, pmp->pm_ul, sizeof(pmp->pm_ul)); bcopy(argp->lu, pmp->pm_lu, sizeof(pmp->pm_lu)); } #ifndef __FreeBSD__ /* * GEMDOS knows nothing (yet) about win95 */ if (pmp->pm_flags & MSDOSFSMNT_GEMDOSFS) pmp->pm_flags |= MSDOSFSMNT_NOWIN95; #endif if (pmp->pm_flags & MSDOSFSMNT_NOWIN95) pmp->pm_flags |= MSDOSFSMNT_SHORTNAME; else if (!(pmp->pm_flags & (MSDOSFSMNT_SHORTNAME | MSDOSFSMNT_LONGNAME))) { struct vnode *rootvp; /* * Try to divine whether to support Win'95 long filenames */ if (FAT32(pmp)) pmp->pm_flags |= MSDOSFSMNT_LONGNAME; else { if ((error = msdosfs_root(mp, &rootvp)) != 0) return error; pmp->pm_flags |= findwin95(VTODE(rootvp)) ? 
MSDOSFSMNT_LONGNAME : MSDOSFSMNT_SHORTNAME; vput(rootvp); } } return 0; } #ifndef __FreeBSD__ int msdosfs_mountroot() { register struct mount *mp; struct proc *p = curproc; /* XXX */ size_t size; int error; struct msdosfs_args args; if (root_device->dv_class != DV_DISK) return (ENODEV); /* * Get vnodes for swapdev and rootdev. */ if (bdevvp(rootdev, &rootvp)) panic("msdosfs_mountroot: can't setup rootvp"); mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK); bzero((char *)mp, (u_long)sizeof(struct mount)); mp->mnt_op = &msdosfs_vfsops; mp->mnt_flag = 0; LIST_INIT(&mp->mnt_vnodelist); args.flags = 0; args.uid = 0; args.gid = 0; args.mask = 0777; if ((error = mountmsdosfs(rootvp, mp, p, &args)) != 0) { free(mp, M_MOUNT); return (error); } if ((error = update_mp(mp, &args)) != 0) { (void)msdosfs_unmount(mp, 0, p); free(mp, M_MOUNT); return (error); } if ((error = vfs_lock(mp)) != 0) { (void)msdosfs_unmount(mp, 0, p); free(mp, M_MOUNT); return (error); } CIRCLEQ_INSERT_TAIL(&mountlist, mp, mnt_list); mp->mnt_vnodecovered = NULLVP; (void) copystr("/", mp->mnt_stat.f_mntonname, MNAMELEN - 1, &size); bzero(mp->mnt_stat.f_mntonname + size, MNAMELEN - size); (void) copystr(ROOTNAME, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, &size); bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size); (void)msdosfs_statfs(mp, &mp->mnt_stat, p); vfs_unlock(mp); return (0); } #endif /* * mp - path - addr in user space of mount point (ie /usr or whatever) * data - addr in user space of mount params including the name of the block * special file to treat as a filesystem. */ static int msdosfs_mount(mp, path, data, ndp, p) struct mount *mp; char *path; caddr_t data; struct nameidata *ndp; struct proc *p; { struct vnode *devvp; /* vnode for blk device to mount */ struct msdosfs_args args; /* will hold data from mount request */ /* msdosfs specific mount control block */ struct msdosfsmount *pmp = NULL; size_t size; int error, flags; mode_t accessmode; error = copyin(data, (caddr_t)&args, sizeof(struct msdosfs_args)); if (error) return (error); if (args.magic != MSDOSFS_ARGSMAGIC) args.flags = 0; /* * If updating, check whether changing from read-only to * read/write; if there is no device name, that's all we do. */ if (mp->mnt_flag & MNT_UPDATE) { pmp = VFSTOMSDOSFS(mp); error = 0; if (!(pmp->pm_flags & MSDOSFSMNT_RONLY) && (mp->mnt_flag & MNT_RDONLY)) { flags = WRITECLOSE; if (mp->mnt_flag & MNT_FORCE) flags |= FORCECLOSE; error = vflush(mp, NULLVP, flags); } if (!error && (mp->mnt_flag & MNT_RELOAD)) /* not yet implemented */ error = EOPNOTSUPP; if (error) return (error); if ((pmp->pm_flags & MSDOSFSMNT_RONLY) && (mp->mnt_kern_flag & MNTK_WANTRDWR)) { /* * If upgrade to read-write by non-root, then verify * that user has necessary permissions on the device. */ if (p->p_ucred->cr_uid != 0) { devvp = pmp->pm_devvp; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, p); error = VOP_ACCESS(devvp, VREAD | VWRITE, p->p_ucred, p); if (error) { VOP_UNLOCK(devvp, 0, p); return (error); } VOP_UNLOCK(devvp, 0, p); } pmp->pm_flags &= ~MSDOSFSMNT_RONLY; } if (args.fspec == 0) { #ifdef __notyet__ /* doesn't work correctly with current mountd XXX */ if (args.flags & MSDOSFSMNT_MNTOPT) { pmp->pm_flags &= ~MSDOSFSMNT_MNTOPT; pmp->pm_flags |= args.flags & MSDOSFSMNT_MNTOPT; if (pmp->pm_flags & MSDOSFSMNT_NOWIN95) pmp->pm_flags |= MSDOSFSMNT_SHORTNAME; } #endif /* * Process export requests. 
*/ return (vfs_export(mp, &pmp->pm_export, &args.export)); } } /* * Not an update, or updating the name: look up the name * and verify that it refers to a sensible block device. */ NDINIT(ndp, LOOKUP, FOLLOW, UIO_USERSPACE, args.fspec, p); error = namei(ndp); if (error) return (error); devvp = ndp->ni_vp; if (devvp->v_type != VBLK) { vrele(devvp); return (ENOTBLK); } if (bdevsw(devvp->v_rdev) == NULL) { vrele(devvp); return (ENXIO); } /* * If mount by non-root, then verify that user has necessary * permissions on the device. */ if (p->p_ucred->cr_uid != 0) { accessmode = VREAD; if ((mp->mnt_flag & MNT_RDONLY) == 0) accessmode |= VWRITE; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, p); error = VOP_ACCESS(devvp, accessmode, p->p_ucred, p); if (error) { vput(devvp); return (error); } VOP_UNLOCK(devvp, 0, p); } if ((mp->mnt_flag & MNT_UPDATE) == 0) { error = mountmsdosfs(devvp, mp, p, &args); #ifdef MSDOSFS_DEBUG /* only needed for the printf below */ pmp = VFSTOMSDOSFS(mp); #endif } else { if (devvp != pmp->pm_devvp) error = EINVAL; /* XXX needs translation */ else vrele(devvp); } if (error) { vrele(devvp); return (error); } error = update_mp(mp, &args); if (error) { msdosfs_unmount(mp, MNT_FORCE, p); return error; } (void) copyinstr(path, mp->mnt_stat.f_mntonname, MNAMELEN - 1, &size); bzero(mp->mnt_stat.f_mntonname + size, MNAMELEN - size); (void) copyinstr(args.fspec, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, &size); bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size); (void) msdosfs_statfs(mp, &mp->mnt_stat, p); #ifdef MSDOSFS_DEBUG printf("msdosfs_mount(): mp %p, pmp %p, inusemap %p\n", mp, pmp, pmp->pm_inusemap); #endif return (0); } static int mountmsdosfs(devvp, mp, p, argp) struct vnode *devvp; struct mount *mp; struct proc *p; struct msdosfs_args *argp; { struct msdosfsmount *pmp; struct buf *bp; dev_t dev = devvp->v_rdev; #ifndef __FreeBSD__ struct partinfo dpart; int bsize = 0, dtype = 0, tmp; #endif union bootsector *bsp; struct byte_bpb33 *b33; struct byte_bpb50 *b50; struct byte_bpb710 *b710; u_int8_t SecPerClust; int ronly, error; /* * Disallow multiple mounts of the same device. * Disallow mounting of a device that is currently in use * (except for root, which might share swap device for miniroot). * Flush out any old buffers remaining from a previous use. */ error = vfs_mountedon(devvp); if (error) return (error); if (vcount(devvp) > 1 && devvp != rootvp) return (EBUSY); vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, p); error = vinvalbuf(devvp, V_SAVE, p->p_ucred, p, 0, 0); VOP_UNLOCK(devvp, 0, p); if (error) return (error); ronly = (mp->mnt_flag & MNT_RDONLY) != 0; error = VOP_OPEN(devvp, ronly ? FREAD : FREAD|FWRITE, FSCRED, p); if (error) return (error); bp = NULL; /* both used in error_exit */ pmp = NULL; #ifndef __FreeBSD__ if (argp->flags & MSDOSFSMNT_GEMDOSFS) { /* * We need the disklabel to calculate the size of a FAT entry * later on. Also make sure the partition contains a filesystem * of type FS_MSDOS. This doesn't work for floppies, so we have * to check for them too. * * At least some parts of the msdos fs driver seem to assume * that the size of a disk block will always be 512 bytes. * Let's check it... 
*/ error = VOP_IOCTL(devvp, DIOCGPART, (caddr_t)&dpart, FREAD, NOCRED, p); if (error) goto error_exit; tmp = dpart.part->p_fstype; dtype = dpart.disklab->d_type; bsize = dpart.disklab->d_secsize; if (bsize != 512 || (dtype!=DTYPE_FLOPPY && tmp!=FS_MSDOS)) { error = EINVAL; goto error_exit; } } #endif /* * Read the boot sector of the filesystem, and then check the * boot signature. If not a dos boot sector then error out. */ #ifdef PC98 error = bread(devvp, 0, 1024, NOCRED, &bp); #else error = bread(devvp, 0, 512, NOCRED, &bp); #endif if (error) goto error_exit; bp->b_flags |= B_AGE; bsp = (union bootsector *)bp->b_data; b33 = (struct byte_bpb33 *)bsp->bs33.bsBPB; b50 = (struct byte_bpb50 *)bsp->bs50.bsBPB; b710 = (struct byte_bpb710 *)bsp->bs710.bsPBP; #ifndef __FreeBSD__ if (!(argp->flags & MSDOSFSMNT_GEMDOSFS)) { #endif #ifdef PC98 if ((bsp->bs50.bsBootSectSig0 != BOOTSIG0 || bsp->bs50.bsBootSectSig1 != BOOTSIG1) && (bsp->bs50.bsBootSectSig0 != 0 /* PC98 DOS 3.3x */ || bsp->bs50.bsBootSectSig1 != 0) && (bsp->bs50.bsBootSectSig0 != 0x90 /* PC98 DOS 5.0 */ || bsp->bs50.bsBootSectSig1 != 0x3d) && (bsp->bs50.bsBootSectSig0 != 0x46 /* PC98 DOS 3.3B */ || bsp->bs50.bsBootSectSig1 != 0xfa)) { #else if (bsp->bs50.bsBootSectSig0 != BOOTSIG0 || bsp->bs50.bsBootSectSig1 != BOOTSIG1) { #endif error = EINVAL; goto error_exit; } #ifndef __FreeBSD__ } #endif pmp = malloc(sizeof *pmp, M_MSDOSFSMNT, M_WAITOK); bzero((caddr_t)pmp, sizeof *pmp); pmp->pm_mountp = mp; /* * Compute several useful quantities from the bpb in the * bootsector. Copy in the dos 5 variant of the bpb then fix up * the fields that are different between dos 5 and dos 3.3. */ SecPerClust = b50->bpbSecPerClust; pmp->pm_BytesPerSec = getushort(b50->bpbBytesPerSec); pmp->pm_ResSectors = getushort(b50->bpbResSectors); pmp->pm_FATs = b50->bpbFATs; pmp->pm_RootDirEnts = getushort(b50->bpbRootDirEnts); pmp->pm_Sectors = getushort(b50->bpbSectors); pmp->pm_FATsecs = getushort(b50->bpbFATsecs); pmp->pm_SecPerTrack = getushort(b50->bpbSecPerTrack); pmp->pm_Heads = getushort(b50->bpbHeads); pmp->pm_Media = b50->bpbMedia; #ifndef __FreeBSD__ if (!(argp->flags & MSDOSFSMNT_GEMDOSFS)) { #endif /* XXX - We should probably check more values here */ if (!pmp->pm_BytesPerSec || !SecPerClust || !pmp->pm_Heads || pmp->pm_Heads > 255 #ifdef PC98 || !pmp->pm_SecPerTrack || pmp->pm_SecPerTrack > 255) { #else || !pmp->pm_SecPerTrack || pmp->pm_SecPerTrack > 63) { #endif error = EINVAL; goto error_exit; } #ifndef __FreeBSD__ } #endif if (pmp->pm_Sectors == 0) { pmp->pm_HiddenSects = getulong(b50->bpbHiddenSecs); pmp->pm_HugeSectors = getulong(b50->bpbHugeSectors); } else { pmp->pm_HiddenSects = getushort(b33->bpbHiddenSecs); pmp->pm_HugeSectors = pmp->pm_Sectors; } if (pmp->pm_HugeSectors > 0xffffffff / (pmp->pm_BytesPerSec / sizeof(struct direntry)) + 1) { /* * We cannot deal currently with this size of disk * due to fileid limitations (see msdosfs_getattr and * msdosfs_readdir) */ error = EINVAL; printf("mountmsdosfs(): disk too big, sorry\n"); goto error_exit; } if (pmp->pm_RootDirEnts == 0) { if (bsp->bs710.bsBootSectSig2 != BOOTSIG2 || bsp->bs710.bsBootSectSig3 != BOOTSIG3 || pmp->pm_Sectors || pmp->pm_FATsecs || getushort(b710->bpbFSVers)) { error = EINVAL; printf("mountmsdosfs(): bad FAT32 filesystem\n"); goto error_exit; } pmp->pm_fatmask = FAT32_MASK; pmp->pm_fatmult = 4; pmp->pm_fatdiv = 1; pmp->pm_FATsecs = getulong(b710->bpbBigFATsecs); if (getushort(b710->bpbExtFlags) & FATMIRROR) pmp->pm_curfat = getushort(b710->bpbExtFlags) & FATNUM; else 
pmp->pm_flags |= MSDOSFS_FATMIRROR; } else pmp->pm_flags |= MSDOSFS_FATMIRROR; #ifndef __FreeBSD__ if (argp->flags & MSDOSFSMNT_GEMDOSFS) { if (FAT32(pmp)) { /* * GEMDOS doesn't know fat32. */ error = EINVAL; goto error_exit; } /* * Check a few values (could do some more): * - logical sector size: power of 2, >= block size * - sectors per cluster: power of 2, >= 1 * - number of sectors: >= 1, <= size of partition */ if ( (SecPerClust == 0) || (SecPerClust & (SecPerClust - 1)) || (pmp->pm_BytesPerSec < bsize) || (pmp->pm_BytesPerSec & (pmp->pm_BytesPerSec - 1)) || (pmp->pm_HugeSectors == 0) || (pmp->pm_HugeSectors * (pmp->pm_BytesPerSec / bsize) > dpart.part->p_size) ) { error = EINVAL; goto error_exit; } /* * XXX - Many parts of the msdos fs driver seem to assume that * the number of bytes per logical sector (BytesPerSec) will * always be the same as the number of bytes per disk block * Let's pretend it is. */ tmp = pmp->pm_BytesPerSec / bsize; pmp->pm_BytesPerSec = bsize; pmp->pm_HugeSectors *= tmp; pmp->pm_HiddenSects *= tmp; pmp->pm_ResSectors *= tmp; pmp->pm_Sectors *= tmp; pmp->pm_FATsecs *= tmp; SecPerClust *= tmp; } #endif pmp->pm_fatblk = pmp->pm_ResSectors; if (FAT32(pmp)) { pmp->pm_rootdirblk = getulong(b710->bpbRootClust); pmp->pm_firstcluster = pmp->pm_fatblk + (pmp->pm_FATs * pmp->pm_FATsecs); pmp->pm_fsinfo = getushort(b710->bpbFSInfo); } else { pmp->pm_rootdirblk = pmp->pm_fatblk + (pmp->pm_FATs * pmp->pm_FATsecs); pmp->pm_rootdirsize = (pmp->pm_RootDirEnts * sizeof(struct direntry) + pmp->pm_BytesPerSec - 1) / pmp->pm_BytesPerSec;/* in sectors */ pmp->pm_firstcluster = pmp->pm_rootdirblk + pmp->pm_rootdirsize; } pmp->pm_nmbrofclusters = (pmp->pm_HugeSectors - pmp->pm_firstcluster) / SecPerClust; pmp->pm_maxcluster = pmp->pm_nmbrofclusters + 1; pmp->pm_fatsize = pmp->pm_FATsecs * pmp->pm_BytesPerSec; #ifndef __FreeBSD__ if (argp->flags & MSDOSFSMNT_GEMDOSFS) { if ((pmp->pm_nmbrofclusters <= (0xff0 - 2)) && ((dtype == DTYPE_FLOPPY) || ((dtype == DTYPE_VNODE) && ((pmp->pm_Heads == 1) || (pmp->pm_Heads == 2)))) ) { pmp->pm_fatmask = FAT12_MASK; pmp->pm_fatmult = 3; pmp->pm_fatdiv = 2; } else { pmp->pm_fatmask = FAT16_MASK; pmp->pm_fatmult = 2; pmp->pm_fatdiv = 1; } } else #endif if (pmp->pm_fatmask == 0) { if (pmp->pm_maxcluster <= ((CLUST_RSRVD - CLUST_FIRST) & FAT12_MASK)) { /* * This will usually be a floppy disk. This size makes * sure that one fat entry will not be split across * multiple blocks. */ pmp->pm_fatmask = FAT12_MASK; pmp->pm_fatmult = 3; pmp->pm_fatdiv = 2; } else { pmp->pm_fatmask = FAT16_MASK; pmp->pm_fatmult = 2; pmp->pm_fatdiv = 1; } } if (FAT12(pmp)) pmp->pm_fatblocksize = 3 * pmp->pm_BytesPerSec; else pmp->pm_fatblocksize = DFLTBSIZE; pmp->pm_fatblocksec = pmp->pm_fatblocksize / pmp->pm_BytesPerSec; pmp->pm_bnshift = ffs(pmp->pm_BytesPerSec) - 1; /* * Compute mask and shift value for isolating cluster relative byte * offsets and cluster numbers from a file offset. */ pmp->pm_bpcluster = SecPerClust * pmp->pm_BytesPerSec; pmp->pm_crbomask = pmp->pm_bpcluster - 1; pmp->pm_cnshift = ffs(pmp->pm_bpcluster) - 1; /* * Check for valid cluster size * must be a power of 2 */ if (pmp->pm_bpcluster ^ (1 << pmp->pm_cnshift)) { error = EINVAL; goto error_exit; } /* * Release the bootsector buffer. */ brelse(bp); bp = NULL; /* * Check FSInfo. 
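 * The FSInfo block is trusted only if all four signature fields match;
 * otherwise pm_fsinfo is cleared and the next-free-cluster hint is ignored.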
*/ if (pmp->pm_fsinfo) { struct fsinfo *fp; if ((error = bread(devvp, pmp->pm_fsinfo, 1024, NOCRED, &bp)) != 0) goto error_exit; fp = (struct fsinfo *)bp->b_data; if (!bcmp(fp->fsisig1, "RRaA", 4) && !bcmp(fp->fsisig2, "rrAa", 4) && !bcmp(fp->fsisig3, "\0\0\125\252", 4) && !bcmp(fp->fsisig4, "\0\0\125\252", 4)) pmp->pm_nxtfree = getulong(fp->fsinxtfree); else pmp->pm_fsinfo = 0; brelse(bp); bp = NULL; } /* * Check and validate (or perhaps invalidate?) the fsinfo structure? XXX */ /* * Allocate memory for the bitmap of allocated clusters, and then * fill it in. */ pmp->pm_inusemap = malloc(((pmp->pm_maxcluster + N_INUSEBITS - 1) / N_INUSEBITS) * sizeof(*pmp->pm_inusemap), M_MSDOSFSFAT, M_WAITOK); /* * fillinusemap() needs pm_devvp. */ pmp->pm_dev = dev; pmp->pm_devvp = devvp; /* * Have the inuse map filled in. */ if ((error = fillinusemap(pmp)) != 0) goto error_exit; /* * If they want fat updates to be synchronous then let them suffer * the performance degradation in exchange for the on disk copy of * the fat being correct just about all the time. I suppose this * would be a good thing to turn on if the kernel is still flakey. */ if (mp->mnt_flag & MNT_SYNCHRONOUS) pmp->pm_flags |= MSDOSFSMNT_WAITONFAT; /* * Finish up. */ if (ronly) pmp->pm_flags |= MSDOSFSMNT_RONLY; else pmp->pm_fmod = 1; mp->mnt_data = (qaddr_t) pmp; mp->mnt_stat.f_fsid.val[0] = (long)dev; mp->mnt_stat.f_fsid.val[1] = mp->mnt_vfc->vfc_typenum; mp->mnt_flag |= MNT_LOCAL; devvp->v_specmountpoint = mp; return 0; error_exit: if (bp) brelse(bp); (void) VOP_CLOSE(devvp, ronly ? FREAD : FREAD | FWRITE, NOCRED, p); if (pmp) { if (pmp->pm_inusemap) free(pmp->pm_inusemap, M_MSDOSFSFAT); free(pmp, M_MSDOSFSMNT); mp->mnt_data = (qaddr_t)0; } return (error); } static int msdosfs_start(mp, flags, p) struct mount *mp; int flags; struct proc *p; { return (0); } /* * Unmount the filesystem described by mp. */ static int msdosfs_unmount(mp, mntflags, p) struct mount *mp; int mntflags; struct proc *p; { struct msdosfsmount *pmp; int error, flags; flags = 0; if (mntflags & MNT_FORCE) flags |= FORCECLOSE; error = vflush(mp, NULLVP, flags); if (error) return error; pmp = VFSTOMSDOSFS(mp); pmp->pm_devvp->v_specmountpoint = NULL; #ifdef MSDOSFS_DEBUG { struct vnode *vp = pmp->pm_devvp; printf("msdosfs_umount(): just before calling VOP_CLOSE()\n"); printf("flag %08lx, usecount %d, writecount %d, holdcnt %ld\n", vp->v_flag, vp->v_usecount, vp->v_writecount, vp->v_holdcnt); printf("lastr %d, id %lu, mount %p, op %p\n", vp->v_lastr, vp->v_id, vp->v_mount, vp->v_op); printf("freef %p, freeb %p, mount %p\n", vp->v_freelist.tqe_next, vp->v_freelist.tqe_prev, vp->v_mount); printf("cleanblkhd %p, dirtyblkhd %p, numoutput %ld, type %d\n", TAILQ_FIRST(&vp->v_cleanblkhd), TAILQ_FIRST(&vp->v_dirtyblkhd), vp->v_numoutput, vp->v_type); printf("union %p, tag %d, data[0] %08x, data[1] %08x\n", vp->v_socket, vp->v_tag, ((u_int *)vp->v_data)[0], ((u_int *)vp->v_data)[1]); } #endif error = VOP_CLOSE(pmp->pm_devvp, (pmp->pm_flags&MSDOSFSMNT_RONLY) ? 
FREAD : FREAD | FWRITE, NOCRED, p); vrele(pmp->pm_devvp); free(pmp->pm_inusemap, M_MSDOSFSFAT); free(pmp, M_MSDOSFSMNT); mp->mnt_data = (qaddr_t)0; mp->mnt_flag &= ~MNT_LOCAL; return (error); } static int msdosfs_root(mp, vpp) struct mount *mp; struct vnode **vpp; { struct msdosfsmount *pmp = VFSTOMSDOSFS(mp); struct denode *ndep; int error; #ifdef MSDOSFS_DEBUG printf("msdosfs_root(); mp %p, pmp %p\n", mp, pmp); #endif error = deget(pmp, MSDOSFSROOT, MSDOSFSROOT_OFS, &ndep); if (error) return (error); *vpp = DETOV(ndep); return (0); } static int msdosfs_quotactl(mp, cmds, uid, arg, p) struct mount *mp; int cmds; uid_t uid; caddr_t arg; struct proc *p; { return EOPNOTSUPP; } static int msdosfs_statfs(mp, sbp, p) struct mount *mp; struct statfs *sbp; struct proc *p; { struct msdosfsmount *pmp; pmp = VFSTOMSDOSFS(mp); sbp->f_bsize = pmp->pm_bpcluster; sbp->f_iosize = pmp->pm_bpcluster; sbp->f_blocks = pmp->pm_nmbrofclusters; sbp->f_bfree = pmp->pm_freeclustercount; sbp->f_bavail = pmp->pm_freeclustercount; sbp->f_files = pmp->pm_RootDirEnts; /* XXX */ sbp->f_ffree = 0; /* what to put in here? */ if (sbp != &mp->mnt_stat) { sbp->f_type = mp->mnt_vfc->vfc_typenum; bcopy(mp->mnt_stat.f_mntonname, sbp->f_mntonname, MNAMELEN); bcopy(mp->mnt_stat.f_mntfromname, sbp->f_mntfromname, MNAMELEN); } strncpy(sbp->f_fstypename, mp->mnt_vfc->vfc_name, MFSNAMELEN); return (0); } static int msdosfs_sync(mp, waitfor, cred, p) struct mount *mp; int waitfor; struct ucred *cred; struct proc *p; { struct vnode *vp, *nvp; struct denode *dep; struct msdosfsmount *pmp = VFSTOMSDOSFS(mp); int error, allerror = 0; /* * If we ever switch to not updating all of the fats all the time, * this would be the place to update them from the first one. */ if (pmp->pm_fmod != 0) { if (pmp->pm_flags & MSDOSFSMNT_RONLY) panic("msdosfs_sync: rofs mod"); else { /* update fats here */ } } /* * Write back each (modified) denode. */ simple_lock(&mntvnode_slock); loop: for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) { /* * If the vnode that we are about to sync is no longer * associated with this mount point, start over. */ if (vp->v_mount != mp) goto loop; simple_lock(&vp->v_interlock); nvp = vp->v_mntvnodes.le_next; dep = VTODE(vp); if (vp->v_type == VNON || ((dep->de_flag & (DE_ACCESS | DE_CREATE | DE_UPDATE | DE_MODIFIED)) == 0 && (TAILQ_EMPTY(&vp->v_dirtyblkhd) || waitfor == MNT_LAZY))) { simple_unlock(&vp->v_interlock); continue; } simple_unlock(&mntvnode_slock); error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, p); if (error) { simple_lock(&mntvnode_slock); if (error == ENOENT) goto loop; continue; } error = VOP_FSYNC(vp, cred, waitfor, p); if (error) allerror = error; VOP_UNLOCK(vp, 0, p); vrele(vp); simple_lock(&mntvnode_slock); } simple_unlock(&mntvnode_slock); /* * Flush filesystem control info. 
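msdosfs_statfs() above reports capacity in whole clusters (f_bsize is pm_bpcluster, f_blocks is pm_nmbrofclusters, f_bfree is pm_freeclustercount); a rough sketch of turning those counts into byte figures, with stand-in field names:

#include <stdio.h>

/* Illustrative stand-ins for pm_bpcluster, pm_nmbrofclusters, pm_freeclustercount. */
struct fat_space {
	unsigned long bytes_per_cluster;
	unsigned long total_clusters;
	unsigned long free_clusters;
};

static void
fat_space_in_bytes(const struct fat_space *fs,
    unsigned long long *totalp, unsigned long long *freep)
{
	*totalp = (unsigned long long)fs->total_clusters * fs->bytes_per_cluster;
	*freep  = (unsigned long long)fs->free_clusters  * fs->bytes_per_cluster;
}

int
main(void)
{
	struct fat_space fs = { 4096, 65518, 1200 };
	unsigned long long total, freeb;

	fat_space_in_bytes(&fs, &total, &freeb);
	printf("total %llu bytes, free %llu bytes\n", total, freeb);
	return (0);
}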
*/ if (waitfor != MNT_LAZY) { vn_lock(pmp->pm_devvp, LK_EXCLUSIVE | LK_RETRY, p); error = VOP_FSYNC(pmp->pm_devvp, cred, waitfor, p); if (error) allerror = error; VOP_UNLOCK(pmp->pm_devvp, 0, p); } return (allerror); } static int msdosfs_fhtovp(mp, fhp, nam, vpp, exflagsp, credanonp) struct mount *mp; struct fid *fhp; struct sockaddr *nam; struct vnode **vpp; int *exflagsp; struct ucred **credanonp; { struct msdosfsmount *pmp = VFSTOMSDOSFS(mp); struct defid *defhp = (struct defid *) fhp; struct denode *dep; struct netcred *np; int error; np = vfs_export_lookup(mp, &pmp->pm_export, nam); if (np == NULL) return (EACCES); error = deget(pmp, defhp->defid_dirclust, defhp->defid_dirofs, &dep); if (error) { *vpp = NULLVP; return (error); } *vpp = DETOV(dep); *exflagsp = np->netc_exflags; *credanonp = &np->netc_anon; return (0); } static int msdosfs_vptofh(vp, fhp) struct vnode *vp; struct fid *fhp; { struct denode *dep; struct defid *defhp; dep = VTODE(vp); defhp = (struct defid *)fhp; defhp->defid_len = sizeof(struct defid); defhp->defid_dirclust = dep->de_dirclust; defhp->defid_dirofs = dep->de_diroffset; /* defhp->defid_gen = dep->de_gen; */ return (0); } static int msdosfs_vget(mp, ino, vpp) struct mount *mp; ino_t ino; struct vnode **vpp; { return EOPNOTSUPP; } static struct vfsops msdosfs_vfsops = { msdosfs_mount, msdosfs_start, msdosfs_unmount, msdosfs_root, msdosfs_quotactl, msdosfs_statfs, msdosfs_sync, msdosfs_vget, msdosfs_fhtovp, msdosfs_vptofh, msdosfs_init }; VFS_SET(msdosfs_vfsops, msdos, 0); Index: head/sys/fs/msdosfs/msdosfs_vnops.c =================================================================== --- head/sys/fs/msdosfs/msdosfs_vnops.c (revision 49534) +++ head/sys/fs/msdosfs/msdosfs_vnops.c (revision 49535) @@ -1,1986 +1,1985 @@ -/* $Id: msdosfs_vnops.c,v 1.86 1999/06/26 02:46:26 mckusick Exp $ */ +/* $Id: msdosfs_vnops.c,v 1.87 1999/07/25 04:01:32 bde Exp $ */ /* $NetBSD: msdosfs_vnops.c,v 1.68 1998/02/10 14:10:04 mrg Exp $ */ /*- * Copyright (C) 1994, 1995, 1997 Wolfgang Solfrank. * Copyright (C) 1994, 1995, 1997 TooLs GmbH. * All rights reserved. * Original code by Paul Popelka (paulp@uts.amdahl.com) (see below). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by TooLs GmbH. * 4. The name of TooLs GmbH may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * Written by Paul Popelka (paulp@uts.amdahl.com) * * You can do anything you want with this software, just don't say you wrote * it, and don't remove this notice. * * This software is provided "as is". * * The author supplies this software to be publicly redistributed on the * understanding that the author is not responsible for the correct * functioning of this software in any circumstances and is not liable for * any damages caused by this software. * * October 1992 */ #include #include #include #include /* defines plimit structure in proc struct */ #include #include #include #include #include #include #include -#include /* XXX */ /* defines v_rdev */ #include #include #include #include #include #include #include #include #include #include #include #include /* * Prototypes for MSDOSFS vnode operations */ static int msdosfs_create __P((struct vop_create_args *)); static int msdosfs_mknod __P((struct vop_mknod_args *)); static int msdosfs_close __P((struct vop_close_args *)); static int msdosfs_access __P((struct vop_access_args *)); static int msdosfs_getattr __P((struct vop_getattr_args *)); static int msdosfs_setattr __P((struct vop_setattr_args *)); static int msdosfs_read __P((struct vop_read_args *)); static int msdosfs_write __P((struct vop_write_args *)); static int msdosfs_fsync __P((struct vop_fsync_args *)); static int msdosfs_remove __P((struct vop_remove_args *)); static int msdosfs_link __P((struct vop_link_args *)); static int msdosfs_rename __P((struct vop_rename_args *)); static int msdosfs_mkdir __P((struct vop_mkdir_args *)); static int msdosfs_rmdir __P((struct vop_rmdir_args *)); static int msdosfs_symlink __P((struct vop_symlink_args *)); static int msdosfs_readdir __P((struct vop_readdir_args *)); static int msdosfs_abortop __P((struct vop_abortop_args *)); static int msdosfs_bmap __P((struct vop_bmap_args *)); static int msdosfs_strategy __P((struct vop_strategy_args *)); static int msdosfs_print __P((struct vop_print_args *)); static int msdosfs_pathconf __P((struct vop_pathconf_args *ap)); static int msdosfs_getpages __P((struct vop_getpages_args *)); static int msdosfs_putpages __P((struct vop_putpages_args *)); /* * Some general notes: * * In the ufs filesystem the inodes, superblocks, and indirect blocks are * read/written using the vnode for the filesystem. Blocks that represent * the contents of a file are read/written using the vnode for the file * (including directories when they are read/written as files). This * presents problems for the dos filesystem because data that should be in * an inode (if dos had them) resides in the directory itself. Since we * must update directory entries without the benefit of having the vnode * for the directory we must use the vnode for the filesystem. This means * that when a directory is actually read/written (via read, write, or * readdir, or seek) we must use the vnode for the filesystem instead of * the vnode for the directory as would happen in ufs. 
This is to insure we * retreive the correct block from the buffer cache since the hash value is * based upon the vnode address and the desired block number. */ /* * Create a regular file. On entry the directory to contain the file being * created is locked. We must release before we return. We must also free * the pathname buffer pointed at by cnp->cn_pnbuf, always on error, or * only if the SAVESTART bit in cn_flags is clear on success. */ static int msdosfs_create(ap) struct vop_create_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { struct componentname *cnp = ap->a_cnp; struct denode ndirent; struct denode *dep; struct denode *pdep = VTODE(ap->a_dvp); struct timespec ts; int error; #ifdef MSDOSFS_DEBUG printf("msdosfs_create(cnp %p, vap %p\n", cnp, ap->a_vap); #endif /* * If this is the root directory and there is no space left we * can't do anything. This is because the root directory can not * change size. */ if (pdep->de_StartCluster == MSDOSFSROOT && pdep->de_fndoffset >= pdep->de_FileSize) { error = ENOSPC; goto bad; } /* * Create a directory entry for the file, then call createde() to * have it installed. NOTE: DOS files are always executable. We * use the absence of the owner write bit to make the file * readonly. */ #ifdef DIAGNOSTIC if ((cnp->cn_flags & HASBUF) == 0) panic("msdosfs_create: no name"); #endif bzero(&ndirent, sizeof(ndirent)); error = uniqdosname(pdep, cnp, ndirent.de_Name); if (error) goto bad; ndirent.de_Attributes = (ap->a_vap->va_mode & VWRITE) ? ATTR_ARCHIVE : ATTR_ARCHIVE | ATTR_READONLY; ndirent.de_LowerCase = 0; ndirent.de_StartCluster = 0; ndirent.de_FileSize = 0; ndirent.de_dev = pdep->de_dev; ndirent.de_devvp = pdep->de_devvp; ndirent.de_pmp = pdep->de_pmp; ndirent.de_flag = DE_ACCESS | DE_CREATE | DE_UPDATE; getnanotime(&ts); DETIMES(&ndirent, &ts, &ts, &ts); error = createde(&ndirent, pdep, &dep, cnp); if (error) goto bad; if ((cnp->cn_flags & SAVESTART) == 0) zfree(namei_zone, cnp->cn_pnbuf); *ap->a_vpp = DETOV(dep); return (0); bad: zfree(namei_zone, cnp->cn_pnbuf); return (error); } static int msdosfs_mknod(ap) struct vop_mknod_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { switch (ap->a_vap->va_type) { case VDIR: return (msdosfs_mkdir((struct vop_mkdir_args *)ap)); break; case VREG: return (msdosfs_create((struct vop_create_args *)ap)); break; default: zfree(namei_zone, ap->a_cnp->cn_pnbuf); return (EINVAL); } /* NOTREACHED */ } static int msdosfs_close(ap) struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct proc *a_p; } */ *ap; { struct vnode *vp = ap->a_vp; struct denode *dep = VTODE(vp); struct timespec ts; simple_lock(&vp->v_interlock); if (vp->v_usecount > 1) { getnanotime(&ts); DETIMES(dep, &ts, &ts, &ts); } simple_unlock(&vp->v_interlock); return 0; } static int msdosfs_access(ap) struct vop_access_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct proc *a_p; } */ *ap; { struct vnode *vp = ap->a_vp; struct denode *dep = VTODE(ap->a_vp); struct msdosfsmount *pmp = dep->de_pmp; struct ucred *cred = ap->a_cred; mode_t mask, file_mode, mode = ap->a_mode; register gid_t *gp; int i; file_mode = (S_IXUSR|S_IXGRP|S_IXOTH) | (S_IRUSR|S_IRGRP|S_IROTH) | ((dep->de_Attributes & ATTR_READONLY) ? 
0 : (S_IWUSR|S_IWGRP|S_IWOTH)); file_mode &= pmp->pm_mask; /* * Disallow write attempts on read-only file systems; * unless the file is a socket, fifo, or a block or * character device resident on the file system. */ if (mode & VWRITE) { switch (vp->v_type) { case VDIR: case VLNK: case VREG: if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); break; default: break; } } /* User id 0 always gets access. */ if (cred->cr_uid == 0) return 0; mask = 0; /* Otherwise, check the owner. */ if (cred->cr_uid == pmp->pm_uid) { if (mode & VEXEC) mask |= S_IXUSR; if (mode & VREAD) mask |= S_IRUSR; if (mode & VWRITE) mask |= S_IWUSR; return (file_mode & mask) == mask ? 0 : EACCES; } /* Otherwise, check the groups. */ for (i = 0, gp = cred->cr_groups; i < cred->cr_ngroups; i++, gp++) if (pmp->pm_gid == *gp) { if (mode & VEXEC) mask |= S_IXGRP; if (mode & VREAD) mask |= S_IRGRP; if (mode & VWRITE) mask |= S_IWGRP; return (file_mode & mask) == mask ? 0 : EACCES; } /* Otherwise, check everyone else. */ if (mode & VEXEC) mask |= S_IXOTH; if (mode & VREAD) mask |= S_IROTH; if (mode & VWRITE) mask |= S_IWOTH; return (file_mode & mask) == mask ? 0 : EACCES; } static int msdosfs_getattr(ap) struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct proc *a_p; } */ *ap; { struct denode *dep = VTODE(ap->a_vp); struct msdosfsmount *pmp = dep->de_pmp; struct vattr *vap = ap->a_vap; mode_t mode; struct timespec ts; u_long dirsperblk = pmp->pm_BytesPerSec / sizeof(struct direntry); u_long fileid; getnanotime(&ts); DETIMES(dep, &ts, &ts, &ts); vap->va_fsid = dev2udev(dep->de_dev); /* * The following computation of the fileid must be the same as that * used in msdosfs_readdir() to compute d_fileno. If not, pwd * doesn't work. */ if (dep->de_Attributes & ATTR_DIRECTORY) { fileid = cntobn(pmp, dep->de_StartCluster) * dirsperblk; if (dep->de_StartCluster == MSDOSFSROOT) fileid = 1; } else { fileid = cntobn(pmp, dep->de_dirclust) * dirsperblk; if (dep->de_dirclust == MSDOSFSROOT) fileid = roottobn(pmp, 0) * dirsperblk; fileid += dep->de_diroffset / sizeof(struct direntry); } vap->va_fileid = fileid; if ((dep->de_Attributes & ATTR_READONLY) == 0) mode = S_IRWXU|S_IRWXG|S_IRWXO; else mode = S_IRUSR|S_IXUSR|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH; vap->va_mode = mode & pmp->pm_mask; vap->va_uid = pmp->pm_uid; vap->va_gid = pmp->pm_gid; vap->va_nlink = 1; vap->va_rdev = 0; vap->va_size = dep->de_FileSize; dos2unixtime(dep->de_MDate, dep->de_MTime, 0, &vap->va_mtime); if (pmp->pm_flags & MSDOSFSMNT_LONGNAME) { dos2unixtime(dep->de_ADate, 0, 0, &vap->va_atime); dos2unixtime(dep->de_CDate, dep->de_CTime, dep->de_CHun, &vap->va_ctime); } else { vap->va_atime = vap->va_mtime; vap->va_ctime = vap->va_mtime; } vap->va_flags = 0; if ((dep->de_Attributes & ATTR_ARCHIVE) == 0) vap->va_flags |= SF_ARCHIVED; vap->va_gen = 0; vap->va_blocksize = pmp->pm_bpcluster; vap->va_bytes = (dep->de_FileSize + pmp->pm_crbomask) & ~pmp->pm_crbomask; vap->va_type = ap->a_vp->v_type; vap->va_filerev = dep->de_modrev; return (0); } static int msdosfs_setattr(ap) struct vop_setattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct proc *a_p; } */ *ap; { struct vnode *vp = ap->a_vp; struct denode *dep = VTODE(ap->a_vp); struct msdosfsmount *pmp = dep->de_pmp; struct vattr *vap = ap->a_vap; struct ucred *cred = ap->a_cred; int error = 0; #ifdef MSDOSFS_DEBUG printf("msdosfs_setattr(): vp %p, vap %p, cred %p, p %p\n", ap->a_vp, vap, cred, ap->a_p); #endif /* * Check for unsettable attributes. 
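msdosfs_getattr() and msdosfs_readdir() have to agree on the synthesized file number because FAT has no inodes; both derive it from where the directory entry sits on disk. A stand-alone sketch of that arithmetic; the sector number would come from the driver's cntobn(), which is not reproduced here:

#include <stdio.h>

#define DIRENTRY_SIZE	32	/* bytes per on-disk FAT directory entry */

/*
 * Illustrative only: build a "file id" from the sector holding the
 * directory entry plus the entry's index within that sector, in the
 * spirit of the fileid computation in msdosfs_getattr().
 */
static unsigned long
fat_fileid(unsigned long entry_sector, unsigned long entry_offset,
    unsigned int bytes_per_sector)
{
	unsigned long dirs_per_sector = bytes_per_sector / DIRENTRY_SIZE;

	return (entry_sector * dirs_per_sector +
	    entry_offset / DIRENTRY_SIZE);
}

int
main(void)
{
	/* entry at byte offset 96 of sector 2050, 512-byte sectors */
	printf("fileid = %lu\n", fat_fileid(2050, 96, 512));
	return (0);
}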
*/ if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) || (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) || (vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) { #ifdef MSDOSFS_DEBUG printf("msdosfs_setattr(): returning EINVAL\n"); printf(" va_type %d, va_nlink %x, va_fsid %lx, va_fileid %lx\n", vap->va_type, vap->va_nlink, vap->va_fsid, vap->va_fileid); printf(" va_blocksize %lx, va_rdev %x, va_bytes %qx, va_gen %lx\n", vap->va_blocksize, vap->va_rdev, vap->va_bytes, vap->va_gen); printf(" va_uid %x, va_gid %x\n", vap->va_uid, vap->va_gid); #endif return (EINVAL); } if (vap->va_flags != VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if (cred->cr_uid != pmp->pm_uid && (error = suser_xxx(cred, ap->a_p, PRISON_ROOT))) return (error); /* * We are very inconsistent about handling unsupported * attributes. We ignored the access time and the * read and execute bits. We were strict for the other * attributes. * * Here we are strict, stricter than ufs in not allowing * users to attempt to set SF_SETTABLE bits or anyone to * set unsupported bits. However, we ignore attempts to * set ATTR_ARCHIVE for directories `cp -pr' from a more * sensible file system attempts it a lot. */ if (cred->cr_uid != 0) { if (vap->va_flags & SF_SETTABLE) return EPERM; } if (vap->va_flags & ~SF_ARCHIVED) return EOPNOTSUPP; if (vap->va_flags & SF_ARCHIVED) dep->de_Attributes &= ~ATTR_ARCHIVE; else if (!(dep->de_Attributes & ATTR_DIRECTORY)) dep->de_Attributes |= ATTR_ARCHIVE; dep->de_flag |= DE_MODIFIED; } if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) { uid_t uid; gid_t gid; if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); uid = vap->va_uid; if (uid == (uid_t)VNOVAL) uid = pmp->pm_uid; gid = vap->va_gid; if (gid == (gid_t)VNOVAL) gid = pmp->pm_gid; if ((cred->cr_uid != pmp->pm_uid || uid != pmp->pm_uid || (gid != pmp->pm_gid && !groupmember(gid, cred))) && (error = suser_xxx(cred, ap->a_p, PRISON_ROOT))) return error; if (uid != pmp->pm_uid || gid != pmp->pm_gid) return EINVAL; } if (vap->va_size != VNOVAL) { /* * Disallow write attempts on read-only file systems; * unless the file is a socket, fifo, or a block or * character device resident on the file system. */ switch (vp->v_type) { case VDIR: return (EISDIR); /* NOT REACHED */ case VLNK: case VREG: if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); break; default: break; } error = detrunc(dep, vap->va_size, 0, cred, ap->a_p); if (error) return error; } if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if (cred->cr_uid != pmp->pm_uid && (error = suser_xxx(cred, ap->a_p, PRISON_ROOT)) && ((vap->va_vaflags & VA_UTIMES_NULL) == 0 || (error = VOP_ACCESS(ap->a_vp, VWRITE, cred, ap->a_p)))) return (error); if (vp->v_type != VDIR) { if ((pmp->pm_flags & MSDOSFSMNT_NOWIN95) == 0 && vap->va_atime.tv_sec != VNOVAL) unix2dostime(&vap->va_atime, &dep->de_ADate, NULL, NULL); if (vap->va_mtime.tv_sec != VNOVAL) unix2dostime(&vap->va_mtime, &dep->de_MDate, &dep->de_MTime, NULL); dep->de_Attributes |= ATTR_ARCHIVE; dep->de_flag |= DE_MODIFIED; } } /* * DOS files only have the ability to have their writability * attribute set, so we use the owner write bit to set the readonly * attribute. 
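As the comment above says, the only permission FAT can persist is the read-only attribute, so setattr reduces a full mode to the owner write bit. A small sketch of that mapping, assuming the usual S_IWUSR bit stands in for VWRITE:

#include <stdio.h>
#include <sys/stat.h>

#define ATTR_READONLY	0x01	/* DOS read-only attribute bit */

/*
 * Sketch of the mapping described above: a chmod-style mode is
 * reduced to the presence or absence of the owner write bit.
 */
static unsigned char
mode_to_fat_attr(mode_t mode, unsigned char attr)
{
	if (mode & S_IWUSR)
		attr &= ~ATTR_READONLY;	/* writable */
	else
		attr |= ATTR_READONLY;	/* read-only */
	return (attr);
}

int
main(void)
{
	printf("0644 -> attr 0x%02x\n", (unsigned)mode_to_fat_attr(0644, 0));
	printf("0444 -> attr 0x%02x\n", (unsigned)mode_to_fat_attr(0444, 0));
	return (0);
}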
*/ if (vap->va_mode != (mode_t)VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if (cred->cr_uid != pmp->pm_uid && (error = suser_xxx(cred, ap->a_p, PRISON_ROOT))) return (error); if (vp->v_type != VDIR) { /* We ignore the read and execute bits. */ if (vap->va_mode & VWRITE) dep->de_Attributes &= ~ATTR_READONLY; else dep->de_Attributes |= ATTR_READONLY; dep->de_flag |= DE_MODIFIED; } } return (deupdat(dep, 1)); } static int msdosfs_read(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { int error = 0; int diff; int blsize; int isadir; int orig_resid; long n; long on; daddr_t lbn; daddr_t rablock; int rasize; struct buf *bp; struct vnode *vp = ap->a_vp; struct denode *dep = VTODE(vp); struct msdosfsmount *pmp = dep->de_pmp; struct uio *uio = ap->a_uio; if (uio->uio_offset < 0) return (EINVAL); /* * If they didn't ask for any data, then we are done. */ orig_resid = uio->uio_resid; if (orig_resid <= 0) return (0); isadir = dep->de_Attributes & ATTR_DIRECTORY; do { lbn = de_cluster(pmp, uio->uio_offset); on = uio->uio_offset & pmp->pm_crbomask; n = min((u_long) (pmp->pm_bpcluster - on), uio->uio_resid); diff = dep->de_FileSize - uio->uio_offset; if (diff <= 0) break; if (diff < n) n = diff; /* convert cluster # to block # if a directory */ if (isadir) { error = pcbmap(dep, lbn, &lbn, 0, &blsize); if (error) break; } /* * If we are operating on a directory file then be sure to * do i/o with the vnode for the filesystem instead of the * vnode for the directory. */ if (isadir) { error = bread(pmp->pm_devvp, lbn, blsize, NOCRED, &bp); } else { rablock = lbn + 1; if (vp->v_lastr + 1 == lbn && de_cn2off(pmp, rablock) < dep->de_FileSize) { rasize = pmp->pm_bpcluster; error = breadn(vp, lbn, pmp->pm_bpcluster, &rablock, &rasize, 1, NOCRED, &bp); } else error = bread(vp, lbn, pmp->pm_bpcluster, NOCRED, &bp); vp->v_lastr = lbn; } n = min(n, pmp->pm_bpcluster - bp->b_resid); if (error) { brelse(bp); break; } error = uiomove(bp->b_data + on, (int) n, uio); brelse(bp); } while (error == 0 && uio->uio_resid > 0 && n != 0); if (!isadir && (error == 0 || uio->uio_resid != orig_resid) && (vp->v_mount->mnt_flag & MNT_NOATIME) == 0) dep->de_flag |= DE_ACCESS; return (error); } /* * Write data to a file or directory. */ static int msdosfs_write(ap) struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { int n; int croffset; int resid; u_long osize; int error = 0; u_long count; daddr_t bn, lastcn; struct buf *bp; int ioflag = ap->a_ioflag; struct uio *uio = ap->a_uio; struct proc *p = uio->uio_procp; struct vnode *vp = ap->a_vp; struct vnode *thisvp; struct denode *dep = VTODE(vp); struct msdosfsmount *pmp = dep->de_pmp; struct ucred *cred = ap->a_cred; #ifdef MSDOSFS_DEBUG printf("msdosfs_write(vp %p, uio %p, ioflag %x, cred %p\n", vp, uio, ioflag, cred); printf("msdosfs_write(): diroff %lu, dirclust %lu, startcluster %lu\n", dep->de_diroffset, dep->de_dirclust, dep->de_StartCluster); #endif switch (vp->v_type) { case VREG: if (ioflag & IO_APPEND) uio->uio_offset = dep->de_FileSize; thisvp = vp; break; case VDIR: return EISDIR; default: panic("msdosfs_write(): bad file type"); } if (uio->uio_offset < 0) return (EINVAL); if (uio->uio_resid == 0) return (0); /* * If they've exceeded their filesize limit, tell them about it. 
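The read loop earlier sizes each transfer three ways: to the end of the current cluster, to what the caller asked for, and to end of file. One iteration of that sizing, with the driver's de_cluster()/pm_crbomask macros replaced by plain masks and made-up numbers:

#include <stdio.h>

static unsigned long
min_ul(unsigned long a, unsigned long b)
{
	return (a < b ? a : b);
}

/*
 * One iteration of the sizing logic in msdosfs_read(): transfer at
 * most to the end of the current cluster, at most what the caller
 * asked for, and never past end of file.
 */
int
main(void)
{
	unsigned long bpcluster = 4096;
	unsigned long filesize = 10000;
	unsigned long offset = 9000, resid = 4000;

	unsigned long on = offset & (bpcluster - 1);	  /* offset in cluster */
	unsigned long n = min_ul(bpcluster - on, resid);  /* room left in cluster */
	unsigned long diff = filesize - offset;		  /* bytes before EOF */

	if (diff < n)
		n = diff;
	printf("copy %lu bytes starting at in-cluster offset %lu\n", n, on);
	return (0);
}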
*/ if (p && ((uio->uio_offset + uio->uio_resid) > p->p_rlimit[RLIMIT_FSIZE].rlim_cur)) { psignal(p, SIGXFSZ); return (EFBIG); } /* * If the offset we are starting the write at is beyond the end of * the file, then they've done a seek. Unix filesystems allow * files with holes in them, DOS doesn't so we must fill the hole * with zeroed blocks. */ if (uio->uio_offset > dep->de_FileSize) { error = deextend(dep, uio->uio_offset, cred); if (error) return (error); } /* * Remember some values in case the write fails. */ resid = uio->uio_resid; osize = dep->de_FileSize; /* * If we write beyond the end of the file, extend it to its ultimate * size ahead of the time to hopefully get a contiguous area. */ if (uio->uio_offset + resid > osize) { count = de_clcount(pmp, uio->uio_offset + resid) - de_clcount(pmp, osize); error = extendfile(dep, count, NULL, NULL, 0); if (error && (error != ENOSPC || (ioflag & IO_UNIT))) goto errexit; lastcn = dep->de_fc[FC_LASTFC].fc_frcn; } else lastcn = de_clcount(pmp, osize) - 1; do { if (de_cluster(pmp, uio->uio_offset) > lastcn) { error = ENOSPC; break; } croffset = uio->uio_offset & pmp->pm_crbomask; n = min(uio->uio_resid, pmp->pm_bpcluster - croffset); if (uio->uio_offset + n > dep->de_FileSize) { dep->de_FileSize = uio->uio_offset + n; /* The object size needs to be set before buffer is allocated */ vnode_pager_setsize(vp, dep->de_FileSize); } bn = de_cluster(pmp, uio->uio_offset); if ((uio->uio_offset & pmp->pm_crbomask) == 0 && (de_cluster(pmp, uio->uio_offset + uio->uio_resid) > de_cluster(pmp, uio->uio_offset) || uio->uio_offset + uio->uio_resid >= dep->de_FileSize)) { /* * If either the whole cluster gets written, * or we write the cluster from its start beyond EOF, * then no need to read data from disk. */ bp = getblk(thisvp, bn, pmp->pm_bpcluster, 0, 0); clrbuf(bp); /* * Do the bmap now, since pcbmap needs buffers * for the fat table. (see msdosfs_strategy) */ if (bp->b_blkno == bp->b_lblkno) { error = pcbmap(dep, bp->b_lblkno, &bp->b_blkno, 0, 0); if (error) bp->b_blkno = -1; } if (bp->b_blkno == -1) { brelse(bp); if (!error) error = EIO; /* XXX */ break; } } else { /* * The block we need to write into exists, so read it in. */ error = bread(thisvp, bn, pmp->pm_bpcluster, cred, &bp); if (error) { brelse(bp); break; } } /* * Should these vnode_pager_* functions be done on dir * files? */ /* * Copy the data from user space into the buf header. */ error = uiomove(bp->b_data + croffset, n, uio); /* * If they want this synchronous then write it and wait for * it. Otherwise, if on a cluster boundary write it * asynchronously so we can move on to the next block * without delay. Otherwise do a delayed write because we * may want to write somemore into the block later. */ if (ioflag & IO_SYNC) (void) bwrite(bp); else if (n + croffset == pmp->pm_bpcluster) bawrite(bp); else bdwrite(bp); dep->de_flag |= DE_UPDATE; } while (error == 0 && uio->uio_resid > 0); /* * If the write failed and they want us to, truncate the file back * to the size it was before the write was attempted. */ errexit: if (error) { if (ioflag & IO_UNIT) { detrunc(dep, osize, ioflag & IO_SYNC, NOCRED, NULL); uio->uio_offset -= resid - uio->uio_resid; uio->uio_resid = resid; } else { detrunc(dep, dep->de_FileSize, ioflag & IO_SYNC, NOCRED, NULL); if (uio->uio_resid != resid) error = 0; } } else if (ioflag & IO_SYNC) error = deupdat(dep, 1); return (error); } /* * Flush the blocks of a file to disk. * * This function is worthless for vnodes that represent directories. 
Maybe we * could just do a sync if they try an fsync on a directory file. */ static int msdosfs_fsync(ap) struct vop_fsync_args /* { struct vnode *a_vp; struct ucred *a_cred; int a_waitfor; struct proc *a_p; } */ *ap; { struct vnode *vp = ap->a_vp; int s; struct buf *bp, *nbp; /* * Flush all dirty buffers associated with a vnode. */ loop: s = splbio(); for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { nbp = TAILQ_NEXT(bp, b_vnbufs); if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) continue; if ((bp->b_flags & B_DELWRI) == 0) panic("msdosfs_fsync: not dirty"); bremfree(bp); splx(s); (void) bwrite(bp); goto loop; } while (vp->v_numoutput) { vp->v_flag |= VBWAIT; (void) tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "msdosfsn", 0); } #ifdef DIAGNOSTIC if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) { vprint("msdosfs_fsync: dirty", vp); goto loop; } #endif splx(s); return (deupdat(VTODE(vp), ap->a_waitfor == MNT_WAIT)); } static int msdosfs_remove(ap) struct vop_remove_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { struct denode *dep = VTODE(ap->a_vp); struct denode *ddep = VTODE(ap->a_dvp); int error; if (ap->a_vp->v_type == VDIR) error = EPERM; else error = removede(ddep, dep); #ifdef MSDOSFS_DEBUG printf("msdosfs_remove(), dep %p, v_usecount %d\n", dep, ap->a_vp->v_usecount); #endif return (error); } /* * DOS filesystems don't know what links are. But since we already called * msdosfs_lookup() with create and lockparent, the parent is locked so we * have to free it before we return the error. */ static int msdosfs_link(ap) struct vop_link_args /* { struct vnode *a_tdvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { VOP_ABORTOP(ap->a_tdvp, ap->a_cnp); return (EOPNOTSUPP); } /* * Renames on files require moving the denode to a new hash queue since the * denode's location is used to compute which hash queue to put the file * in. Unless it is a rename in place. For example "mv a b". * * What follows is the basic algorithm: * * if (file move) { * if (dest file exists) { * remove dest file * } * if (dest and src in same directory) { * rewrite name in existing directory slot * } else { * write new entry in dest directory * update offset and dirclust in denode * move denode to new hash chain * clear old directory entry * } * } else { * directory move * if (dest directory exists) { * if (dest is not empty) { * return ENOTEMPTY * } * remove dest directory * } * if (dest and src in same directory) { * rewrite name in existing entry * } else { * be sure dest is not a child of src directory * write entry in dest directory * update "." and ".." in moved directory * clear old directory entry for moved directory * } * } * * On entry: * source's parent directory is unlocked * source file or directory is unlocked * destination's parent directory is locked * destination file or directory is locked if it exists * * On exit: * all denodes should be released * * Notes: * I'm not sure how the memory containing the pathnames pointed at by the * componentname structures is freed, there may be some memory bleeding * for each rename done. 
*/ static int msdosfs_rename(ap) struct vop_rename_args /* { struct vnode *a_fdvp; struct vnode *a_fvp; struct componentname *a_fcnp; struct vnode *a_tdvp; struct vnode *a_tvp; struct componentname *a_tcnp; } */ *ap; { struct vnode *tdvp = ap->a_tdvp; struct vnode *fvp = ap->a_fvp; struct vnode *fdvp = ap->a_fdvp; struct vnode *tvp = ap->a_tvp; struct componentname *tcnp = ap->a_tcnp; struct componentname *fcnp = ap->a_fcnp; struct proc *p = fcnp->cn_proc; struct denode *ip, *xp, *dp, *zp; u_char toname[11], oldname[11]; u_long from_diroffset, to_diroffset; u_char to_count; int doingdirectory = 0, newparent = 0; int error; u_long cn; daddr_t bn; struct denode *fddep; /* from file's parent directory */ struct denode *fdep; /* from file or directory */ struct denode *tddep; /* to file's parent directory */ struct denode *tdep; /* to file or directory */ struct msdosfsmount *pmp; struct direntry *dotdotp; struct buf *bp; fddep = VTODE(ap->a_fdvp); fdep = VTODE(ap->a_fvp); tddep = VTODE(ap->a_tdvp); tdep = tvp ? VTODE(tvp) : NULL; pmp = fddep->de_pmp; pmp = VFSTOMSDOSFS(fdvp->v_mount); #ifdef DIAGNOSTIC if ((tcnp->cn_flags & HASBUF) == 0 || (fcnp->cn_flags & HASBUF) == 0) panic("msdosfs_rename: no name"); #endif /* * Check for cross-device rename. */ if ((fvp->v_mount != tdvp->v_mount) || (tvp && (fvp->v_mount != tvp->v_mount))) { error = EXDEV; abortit: VOP_ABORTOP(tdvp, tcnp); if (tdvp == tvp) vrele(tdvp); else vput(tdvp); if (tvp) vput(tvp); VOP_ABORTOP(fdvp, fcnp); vrele(fdvp); vrele(fvp); return (error); } /* * If source and dest are the same, do nothing. */ if (tvp == fvp) { error = 0; goto abortit; } error = vn_lock(fvp, LK_EXCLUSIVE, p); if (error) goto abortit; dp = VTODE(fdvp); ip = VTODE(fvp); /* * Be sure we are not renaming ".", "..", or an alias of ".". This * leads to a crippled directory tree. It's pretty tough to do a * "ls" or "pwd" with the "." directory entry missing, and "cd .." * doesn't work if the ".." entry is missing. */ if (ip->de_Attributes & ATTR_DIRECTORY) { /* * Avoid ".", "..", and aliases of "." for obvious reasons. */ if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') || dp == ip || (fcnp->cn_flags & ISDOTDOT) || (tcnp->cn_flags & ISDOTDOT) || (ip->de_flag & DE_RENAME)) { VOP_UNLOCK(fvp, 0, p); error = EINVAL; goto abortit; } ip->de_flag |= DE_RENAME; doingdirectory++; } /* * When the target exists, both the directory * and target vnodes are returned locked. */ dp = VTODE(tdvp); xp = tvp ? VTODE(tvp) : NULL; /* * Remember direntry place to use for destination */ to_diroffset = dp->de_fndoffset; to_count = dp->de_fndcnt; /* * If ".." must be changed (ie the directory gets a new * parent) then the source directory must not be in the * directory heirarchy above the target, as this would * orphan everything below the source directory. Also * the user must have write permission in the source so * as to be able to change "..". We must repeat the call * to namei, as the parent directory is unlocked by the * call to doscheckpath(). 
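doscheckpath(), called below, guards against moving a directory underneath one of its own descendants. A toy model of that ancestor walk on a parent-pointer structure (purely illustrative, not the driver's on-disk walk over ".." entries):

#include <stdio.h>
#include <stddef.h>

struct dir {
	const struct dir *parent;	/* NULL for the root */
	const char *name;
};

/*
 * Walk from the destination's parent up toward the root; if the
 * source directory is found on the way, the rename would orphan a
 * subtree and must be refused.
 */
static int
is_ancestor(const struct dir *src, const struct dir *dstparent)
{
	const struct dir *d;

	for (d = dstparent; d != NULL; d = d->parent)
		if (d == src)
			return (1);	/* src is above dst: disallow */
	return (0);
}

int
main(void)
{
	struct dir root = { NULL, "/" };
	struct dir a = { &root, "a" };
	struct dir b = { &a, "b" };

	printf("mv a into b allowed? %s\n", is_ancestor(&a, &b) ? "no" : "yes");
	return (0);
}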
*/ error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_proc); VOP_UNLOCK(fvp, 0, p); if (VTODE(fdvp)->de_StartCluster != VTODE(tdvp)->de_StartCluster) newparent = 1; vrele(fdvp); if (doingdirectory && newparent) { if (error) /* write access check above */ goto bad; if (xp != NULL) vput(tvp); /* * doscheckpath() vput()'s dp, * so we have to do a relookup afterwards */ error = doscheckpath(ip, dp); if (error) goto out; if ((tcnp->cn_flags & SAVESTART) == 0) panic("msdosfs_rename: lost to startdir"); error = relookup(tdvp, &tvp, tcnp); if (error) goto out; dp = VTODE(tdvp); xp = tvp ? VTODE(tvp) : NULL; } if (xp != NULL) { /* * Target must be empty if a directory and have no links * to it. Also, ensure source and target are compatible * (both directories, or both not directories). */ if (xp->de_Attributes & ATTR_DIRECTORY) { if (!dosdirempty(xp)) { error = ENOTEMPTY; goto bad; } if (!doingdirectory) { error = ENOTDIR; goto bad; } cache_purge(tdvp); } else if (doingdirectory) { error = EISDIR; goto bad; } error = removede(dp, xp); if (error) goto bad; vput(tvp); xp = NULL; } /* * Convert the filename in tcnp into a dos filename. We copy this * into the denode and directory entry for the destination * file/directory. */ error = uniqdosname(VTODE(tdvp), tcnp, toname); if (error) goto abortit; /* * Since from wasn't locked at various places above, * have to do a relookup here. */ fcnp->cn_flags &= ~MODMASK; fcnp->cn_flags |= LOCKPARENT | LOCKLEAF; if ((fcnp->cn_flags & SAVESTART) == 0) panic("msdosfs_rename: lost from startdir"); if (!newparent) VOP_UNLOCK(tdvp, 0, p); (void) relookup(fdvp, &fvp, fcnp); if (fvp == NULL) { /* * From name has disappeared. */ if (doingdirectory) panic("rename: lost dir entry"); vrele(ap->a_fvp); if (newparent) VOP_UNLOCK(tdvp, 0, p); vrele(tdvp); return 0; } xp = VTODE(fvp); zp = VTODE(fdvp); from_diroffset = zp->de_fndoffset; /* * Ensure that the directory entry still exists and has not * changed till now. If the source is a file the entry may * have been unlinked or renamed. In either case there is * no further work to be done. If the source is a directory * then it cannot have been rmdir'ed or renamed; this is * prohibited by the DE_RENAME flag. */ if (xp != ip) { if (doingdirectory) panic("rename: lost dir entry"); vrele(ap->a_fvp); VOP_UNLOCK(fvp, 0, p); if (newparent) VOP_UNLOCK(fdvp, 0, p); xp = NULL; } else { vrele(fvp); xp = NULL; /* * First write a new entry in the destination * directory and mark the entry in the source directory * as deleted. Then move the denode to the correct hash * chain for its new location in the filesystem. And, if * we moved a directory, then update its .. entry to point * to the new parent directory. 
*/ bcopy(ip->de_Name, oldname, 11); bcopy(toname, ip->de_Name, 11); /* update denode */ dp->de_fndoffset = to_diroffset; dp->de_fndcnt = to_count; error = createde(ip, dp, (struct denode **)0, tcnp); if (error) { bcopy(oldname, ip->de_Name, 11); if (newparent) VOP_UNLOCK(fdvp, 0, p); VOP_UNLOCK(fvp, 0, p); goto bad; } ip->de_refcnt++; zp->de_fndoffset = from_diroffset; error = removede(zp, ip); if (error) { /* XXX should really panic here, fs is corrupt */ if (newparent) VOP_UNLOCK(fdvp, 0, p); VOP_UNLOCK(fvp, 0, p); goto bad; } if (!doingdirectory) { error = pcbmap(dp, de_cluster(pmp, to_diroffset), 0, &ip->de_dirclust, 0); if (error) { /* XXX should really panic here, fs is corrupt */ if (newparent) VOP_UNLOCK(fdvp, 0, p); VOP_UNLOCK(fvp, 0, p); goto bad; } if (ip->de_dirclust == MSDOSFSROOT) ip->de_diroffset = to_diroffset; else ip->de_diroffset = to_diroffset & pmp->pm_crbomask; } reinsert(ip); if (newparent) VOP_UNLOCK(fdvp, 0, p); } /* * If we moved a directory to a new parent directory, then we must * fixup the ".." entry in the moved directory. */ if (doingdirectory && newparent) { cn = ip->de_StartCluster; if (cn == MSDOSFSROOT) { /* this should never happen */ panic("msdosfs_rename(): updating .. in root directory?"); } else bn = cntobn(pmp, cn); error = bread(pmp->pm_devvp, bn, pmp->pm_bpcluster, NOCRED, &bp); if (error) { /* XXX should really panic here, fs is corrupt */ brelse(bp); VOP_UNLOCK(fvp, 0, p); goto bad; } dotdotp = (struct direntry *)bp->b_data + 1; putushort(dotdotp->deStartCluster, dp->de_StartCluster); if (FAT32(pmp)) putushort(dotdotp->deHighClust, dp->de_StartCluster >> 16); error = bwrite(bp); if (error) { /* XXX should really panic here, fs is corrupt */ VOP_UNLOCK(fvp, 0, p); goto bad; } } VOP_UNLOCK(fvp, 0, p); bad: if (xp) vput(tvp); vput(tdvp); out: ip->de_flag &= ~DE_RENAME; vrele(fdvp); vrele(fvp); return (error); } static struct { struct direntry dot; struct direntry dotdot; } dosdirtemplate = { { ". ", " ", /* the . entry */ ATTR_DIRECTORY, /* file attribute */ 0, /* reserved */ 0, { 0, 0 }, { 0, 0 }, /* create time & date */ { 0, 0 }, /* access date */ { 0, 0 }, /* high bits of start cluster */ { 210, 4 }, { 210, 4 }, /* modify time & date */ { 0, 0 }, /* startcluster */ { 0, 0, 0, 0 } /* filesize */ }, { ".. ", " ", /* the .. entry */ ATTR_DIRECTORY, /* file attribute */ 0, /* reserved */ 0, { 0, 0 }, { 0, 0 }, /* create time & date */ { 0, 0 }, /* access date */ { 0, 0 }, /* high bits of start cluster */ { 210, 4 }, { 210, 4 }, /* modify time & date */ { 0, 0 }, /* startcluster */ { 0, 0, 0, 0 } /* filesize */ } }; static int msdosfs_mkdir(ap) struct vop_mkdir_args /* { struct vnode *a_dvp; struvt vnode **a_vpp; struvt componentname *a_cnp; struct vattr *a_vap; } */ *ap; { struct componentname *cnp = ap->a_cnp; struct denode *dep; struct denode *pdep = VTODE(ap->a_dvp); struct direntry *denp; struct msdosfsmount *pmp = pdep->de_pmp; struct buf *bp; u_long newcluster, pcl; int bn; int error; struct denode ndirent; struct timespec ts; /* * If this is the root directory and there is no space left we * can't do anything. This is because the root directory can not * change size. */ if (pdep->de_StartCluster == MSDOSFSROOT && pdep->de_fndoffset >= pdep->de_FileSize) { error = ENOSPC; goto bad2; } /* * Allocate a cluster to hold the about to be created directory. 
*/ error = clusteralloc(pmp, 0, 1, CLUST_EOFE, &newcluster, NULL); if (error) goto bad2; bzero(&ndirent, sizeof(ndirent)); ndirent.de_pmp = pmp; ndirent.de_flag = DE_ACCESS | DE_CREATE | DE_UPDATE; getnanotime(&ts); DETIMES(&ndirent, &ts, &ts, &ts); /* * Now fill the cluster with the "." and ".." entries. And write * the cluster to disk. This way it is there for the parent * directory to be pointing at if there were a crash. */ bn = cntobn(pmp, newcluster); /* always succeeds */ bp = getblk(pmp->pm_devvp, bn, pmp->pm_bpcluster, 0, 0); bzero(bp->b_data, pmp->pm_bpcluster); bcopy(&dosdirtemplate, bp->b_data, sizeof dosdirtemplate); denp = (struct direntry *)bp->b_data; putushort(denp[0].deStartCluster, newcluster); putushort(denp[0].deCDate, ndirent.de_CDate); putushort(denp[0].deCTime, ndirent.de_CTime); denp[0].deCHundredth = ndirent.de_CHun; putushort(denp[0].deADate, ndirent.de_ADate); putushort(denp[0].deMDate, ndirent.de_MDate); putushort(denp[0].deMTime, ndirent.de_MTime); pcl = pdep->de_StartCluster; if (FAT32(pmp) && pcl == pmp->pm_rootdirblk) pcl = 0; putushort(denp[1].deStartCluster, pcl); putushort(denp[1].deCDate, ndirent.de_CDate); putushort(denp[1].deCTime, ndirent.de_CTime); denp[1].deCHundredth = ndirent.de_CHun; putushort(denp[1].deADate, ndirent.de_ADate); putushort(denp[1].deMDate, ndirent.de_MDate); putushort(denp[1].deMTime, ndirent.de_MTime); if (FAT32(pmp)) { putushort(denp[0].deHighClust, newcluster >> 16); putushort(denp[1].deHighClust, pdep->de_StartCluster >> 16); } error = bwrite(bp); if (error) goto bad; /* * Now build up a directory entry pointing to the newly allocated * cluster. This will be written to an empty slot in the parent * directory. */ #ifdef DIAGNOSTIC if ((cnp->cn_flags & HASBUF) == 0) panic("msdosfs_mkdir: no name"); #endif error = uniqdosname(pdep, cnp, ndirent.de_Name); if (error) goto bad; ndirent.de_Attributes = ATTR_DIRECTORY; ndirent.de_LowerCase = 0; ndirent.de_StartCluster = newcluster; ndirent.de_FileSize = 0; ndirent.de_dev = pdep->de_dev; ndirent.de_devvp = pdep->de_devvp; error = createde(&ndirent, pdep, &dep, cnp); if (error) goto bad; if ((cnp->cn_flags & SAVESTART) == 0) zfree(namei_zone, cnp->cn_pnbuf); *ap->a_vpp = DETOV(dep); return (0); bad: clusterfree(pmp, newcluster, NULL); bad2: zfree(namei_zone, cnp->cn_pnbuf); return (error); } static int msdosfs_rmdir(ap) struct vop_rmdir_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct vnode *dvp = ap->a_dvp; register struct componentname *cnp = ap->a_cnp; register struct denode *ip, *dp; struct proc *p = cnp->cn_proc; int error; ip = VTODE(vp); dp = VTODE(dvp); /* * Verify the directory is empty (and valid). * (Rmdir ".." won't be valid since * ".." will contain a reference to * the current directory and thus be * non-empty.) */ error = 0; if (!dosdirempty(ip) || ip->de_flag & DE_RENAME) { error = ENOTEMPTY; goto out; } /* * Delete the entry from the directory. For dos filesystems this * gets rid of the directory entry on disk, the in memory copy * still exists but the de_refcnt is <= 0. This prevents it from * being found by deget(). When the vput() on dep is done we give * up access and eventually msdosfs_reclaim() will be called which * will remove it from the denode cache. */ error = removede(dp, ip); if (error) goto out; /* * This is where we decrement the link count in the parent * directory. Since dos filesystems don't do this we just purge * the name cache. 
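Both the ".." fixup in rename and the template filling in mkdir store a FAT32 start cluster as two 16-bit halves via putushort(deStartCluster) and putushort(deHighClust). A sketch of that split; the little-endian byte swapping done by putushort() itself is omitted:

#include <stdio.h>
#include <stdint.h>

/*
 * A FAT directory entry keeps the start cluster in two 16-bit
 * fields; the high half is only meaningful on FAT32.
 */
static void
split_cluster(uint32_t cluster, uint16_t *lowp, uint16_t *highp)
{
	*lowp = (uint16_t)(cluster & 0xffff);
	*highp = (uint16_t)(cluster >> 16);
}

int
main(void)
{
	uint16_t lo, hi;

	split_cluster(0x0003abcdUL, &lo, &hi);
	printf("low 0x%04x, high 0x%04x\n", (unsigned)lo, (unsigned)hi);
	return (0);
}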
*/ cache_purge(dvp); VOP_UNLOCK(dvp, 0, p); /* * Truncate the directory that is being deleted. */ error = detrunc(ip, (u_long)0, IO_SYNC, cnp->cn_cred, p); cache_purge(vp); vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, p); out: return (error); } /* * DOS filesystems don't know what symlinks are. */ static int msdosfs_symlink(ap) struct vop_symlink_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; char *a_target; } */ *ap; { zfree(namei_zone, ap->a_cnp->cn_pnbuf); /* VOP_ABORTOP(ap->a_dvp, ap->a_cnp); ??? */ return (EOPNOTSUPP); } static int msdosfs_readdir(ap) struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; int *a_ncookies; u_long **a_cookies; } */ *ap; { int error = 0; int diff; long n; int blsize; long on; u_long cn; u_long fileno; u_long dirsperblk; long bias = 0; daddr_t bn, lbn; struct buf *bp; struct denode *dep = VTODE(ap->a_vp); struct msdosfsmount *pmp = dep->de_pmp; struct direntry *dentp; struct dirent dirbuf; struct uio *uio = ap->a_uio; u_long *cookies = NULL; int ncookies = 0; off_t offset, off; int chksum = -1; #ifdef MSDOSFS_DEBUG printf("msdosfs_readdir(): vp %p, uio %p, cred %p, eofflagp %p\n", ap->a_vp, uio, ap->a_cred, ap->a_eofflag); #endif /* * msdosfs_readdir() won't operate properly on regular files since * it does i/o only with the the filesystem vnode, and hence can * retrieve the wrong block from the buffer cache for a plain file. * So, fail attempts to readdir() on a plain file. */ if ((dep->de_Attributes & ATTR_DIRECTORY) == 0) return (ENOTDIR); /* * To be safe, initialize dirbuf */ bzero(dirbuf.d_name, sizeof(dirbuf.d_name)); /* * If the user buffer is smaller than the size of one dos directory * entry or the file offset is not a multiple of the size of a * directory entry, then we fail the read. */ off = offset = uio->uio_offset; if (uio->uio_resid < sizeof(struct direntry) || (offset & (sizeof(struct direntry) - 1))) return (EINVAL); if (ap->a_ncookies) { ncookies = uio->uio_resid / 16; MALLOC(cookies, u_long *, ncookies * sizeof(u_long), M_TEMP, M_WAITOK); *ap->a_cookies = cookies; *ap->a_ncookies = ncookies; } dirsperblk = pmp->pm_BytesPerSec / sizeof(struct direntry); /* * If they are reading from the root directory then, we simulate * the . and .. entries since these don't exist in the root * directory. We also set the offset bias to make up for having to * simulate these entries. By this I mean that at file offset 64 we * read the first entry in the root directory that lives on disk. */ if (dep->de_StartCluster == MSDOSFSROOT || (FAT32(pmp) && dep->de_StartCluster == pmp->pm_rootdirblk)) { #if 0 printf("msdosfs_readdir(): going after . or .. 
in root dir, offset %d\n", offset); #endif bias = 2 * sizeof(struct direntry); if (offset < bias) { for (n = (int)offset / sizeof(struct direntry); n < 2; n++) { if (FAT32(pmp)) dirbuf.d_fileno = cntobn(pmp, pmp->pm_rootdirblk) * dirsperblk; else dirbuf.d_fileno = 1; dirbuf.d_type = DT_DIR; switch (n) { case 0: dirbuf.d_namlen = 1; strcpy(dirbuf.d_name, "."); break; case 1: dirbuf.d_namlen = 2; strcpy(dirbuf.d_name, ".."); break; } dirbuf.d_reclen = GENERIC_DIRSIZ(&dirbuf); if (uio->uio_resid < dirbuf.d_reclen) goto out; error = uiomove((caddr_t) &dirbuf, dirbuf.d_reclen, uio); if (error) goto out; offset += sizeof(struct direntry); off = offset; if (cookies) { *cookies++ = offset; if (--ncookies <= 0) goto out; } } } } off = offset; while (uio->uio_resid > 0) { lbn = de_cluster(pmp, offset - bias); on = (offset - bias) & pmp->pm_crbomask; n = min(pmp->pm_bpcluster - on, uio->uio_resid); diff = dep->de_FileSize - (offset - bias); if (diff <= 0) break; n = min(n, diff); error = pcbmap(dep, lbn, &bn, &cn, &blsize); if (error) break; error = bread(pmp->pm_devvp, bn, blsize, NOCRED, &bp); if (error) { brelse(bp); return (error); } n = min(n, blsize - bp->b_resid); /* * Convert from dos directory entries to fs-independent * directory entries. */ for (dentp = (struct direntry *)(bp->b_data + on); (char *)dentp < bp->b_data + on + n; dentp++, offset += sizeof(struct direntry)) { #if 0 printf("rd: dentp %08x prev %08x crnt %08x deName %02x attr %02x\n", dentp, prev, crnt, dentp->deName[0], dentp->deAttributes); #endif /* * If this is an unused entry, we can stop. */ if (dentp->deName[0] == SLOT_EMPTY) { brelse(bp); goto out; } /* * Skip deleted entries. */ if (dentp->deName[0] == SLOT_DELETED) { chksum = -1; continue; } /* * Handle Win95 long directory entries */ if (dentp->deAttributes == ATTR_WIN95) { if (pmp->pm_flags & MSDOSFSMNT_SHORTNAME) continue; chksum = win2unixfn((struct winentry *)dentp, &dirbuf, chksum, pmp->pm_flags & MSDOSFSMNT_U2WTABLE, pmp->pm_u2w); continue; } /* * Skip volume labels */ if (dentp->deAttributes & ATTR_VOLUME) { chksum = -1; continue; } /* * This computation of d_fileno must match * the computation of va_fileid in * msdosfs_getattr. */ if (dentp->deAttributes & ATTR_DIRECTORY) { fileno = getushort(dentp->deStartCluster); if (FAT32(pmp)) fileno |= getushort(dentp->deHighClust) << 16; /* if this is the root directory */ if (fileno == MSDOSFSROOT) if (FAT32(pmp)) fileno = cntobn(pmp, pmp->pm_rootdirblk) * dirsperblk; else fileno = 1; else fileno = cntobn(pmp, fileno) * dirsperblk; dirbuf.d_fileno = fileno; dirbuf.d_type = DT_DIR; } else { dirbuf.d_fileno = offset / sizeof(struct direntry); dirbuf.d_type = DT_REG; } if (chksum != winChksum(dentp->deName)) dirbuf.d_namlen = dos2unixfn(dentp->deName, (u_char *)dirbuf.d_name, dentp->deLowerCase | ((pmp->pm_flags & MSDOSFSMNT_SHORTNAME) ? 
(LCASE_BASE | LCASE_EXT) : 0), pmp->pm_flags & MSDOSFSMNT_U2WTABLE, pmp->pm_d2u, pmp->pm_flags & MSDOSFSMNT_ULTABLE, pmp->pm_ul); else dirbuf.d_name[dirbuf.d_namlen] = 0; chksum = -1; dirbuf.d_reclen = GENERIC_DIRSIZ(&dirbuf); if (uio->uio_resid < dirbuf.d_reclen) { brelse(bp); goto out; } error = uiomove((caddr_t) &dirbuf, dirbuf.d_reclen, uio); if (error) { brelse(bp); goto out; } if (cookies) { *cookies++ = offset + sizeof(struct direntry); if (--ncookies <= 0) { brelse(bp); goto out; } } off = offset + sizeof(struct direntry); } brelse(bp); } out: /* Subtract unused cookies */ if (ap->a_ncookies) *ap->a_ncookies -= ncookies; uio->uio_offset = off; /* * Set the eofflag (NFS uses it) */ if (ap->a_eofflag) { if (dep->de_FileSize - (offset - bias) <= 0) *ap->a_eofflag = 1; else *ap->a_eofflag = 0; } return (error); } static int msdosfs_abortop(ap) struct vop_abortop_args /* { struct vnode *a_dvp; struct componentname *a_cnp; } */ *ap; { if ((ap->a_cnp->cn_flags & (HASBUF | SAVESTART)) == HASBUF) zfree(namei_zone, ap->a_cnp->cn_pnbuf); return (0); } /* * vp - address of vnode file the file * bn - which cluster we are interested in mapping to a filesystem block number. * vpp - returns the vnode for the block special file holding the filesystem * containing the file of interest * bnp - address of where to return the filesystem relative block number */ static int msdosfs_bmap(ap) struct vop_bmap_args /* { struct vnode *a_vp; daddr_t a_bn; struct vnode **a_vpp; daddr_t *a_bnp; int *a_runp; int *a_runb; } */ *ap; { struct denode *dep = VTODE(ap->a_vp); if (ap->a_vpp != NULL) *ap->a_vpp = dep->de_devvp; if (ap->a_bnp == NULL) return (0); if (ap->a_runp) { /* * Sequential clusters should be counted here. */ *ap->a_runp = 0; } if (ap->a_runb) { *ap->a_runb = 0; } return (pcbmap(dep, ap->a_bn, ap->a_bnp, 0, 0)); } static int msdosfs_strategy(ap) struct vop_strategy_args /* { struct vnode *a_vp; struct buf *a_bp; } */ *ap; { struct buf *bp = ap->a_bp; struct denode *dep = VTODE(bp->b_vp); struct vnode *vp; int error = 0; if (bp->b_vp->v_type == VBLK || bp->b_vp->v_type == VCHR) panic("msdosfs_strategy: spec"); /* * If we don't already know the filesystem relative block number * then get it using pcbmap(). If pcbmap() returns the block * number as -1 then we've got a hole in the file. DOS filesystems * don't allow files with holes, so we shouldn't ever see this. */ if (bp->b_blkno == bp->b_lblkno) { error = pcbmap(dep, bp->b_lblkno, &bp->b_blkno, 0, 0); if (error) { bp->b_error = error; bp->b_flags |= B_ERROR; biodone(bp); return (error); } if ((long)bp->b_blkno == -1) vfs_bio_clrbuf(bp); } if (bp->b_blkno == -1) { biodone(bp); return (0); } /* * Read/write the block from/to the disk that contains the desired * file block. */ vp = dep->de_devvp; bp->b_dev = vp->v_rdev; VOP_STRATEGY(vp, bp); return (0); } static int msdosfs_print(ap) struct vop_print_args /* { struct vnode *vp; } */ *ap; { struct denode *dep = VTODE(ap->a_vp); printf( "tag VT_MSDOSFS, startcluster %lu, dircluster %lu, diroffset %lu ", dep->de_StartCluster, dep->de_dirclust, dep->de_diroffset); printf(" dev %d, %d", major(dep->de_dev), minor(dep->de_dev)); lockmgr_printinfo(&dep->de_lock); printf("\n"); return (0); } static int msdosfs_pathconf(ap) struct vop_pathconf_args /* { struct vnode *a_vp; int a_name; int *a_retval; } */ *ap; { struct msdosfsmount *pmp = VTODE(ap->a_vp)->de_pmp; switch (ap->a_name) { case _PC_LINK_MAX: *ap->a_retval = 1; return (0); case _PC_NAME_MAX: *ap->a_retval = pmp->pm_flags & MSDOSFSMNT_LONGNAME ? 
WIN_MAXLEN : 12; return (0); case _PC_PATH_MAX: *ap->a_retval = PATH_MAX; return (0); case _PC_CHOWN_RESTRICTED: *ap->a_retval = 1; return (0); case _PC_NO_TRUNC: *ap->a_retval = 0; return (0); default: return (EINVAL); } /* NOTREACHED */ } /* * get page routine * * XXX By default, wimp out... note that a_offset is ignored (and always * XXX has been). */ int msdosfs_getpages(ap) struct vop_getpages_args *ap; { return vnode_pager_generic_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_reqpage); } /* * put page routine * * XXX By default, wimp out... note that a_offset is ignored (and always * XXX has been). */ int msdosfs_putpages(ap) struct vop_putpages_args *ap; { return vnode_pager_generic_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync, ap->a_rtvals); } /* Global vfs data structures for msdosfs */ vop_t **msdosfs_vnodeop_p; static struct vnodeopv_entry_desc msdosfs_vnodeop_entries[] = { { &vop_default_desc, (vop_t *) vop_defaultop }, { &vop_abortop_desc, (vop_t *) msdosfs_abortop }, { &vop_access_desc, (vop_t *) msdosfs_access }, { &vop_bmap_desc, (vop_t *) msdosfs_bmap }, { &vop_cachedlookup_desc, (vop_t *) msdosfs_lookup }, { &vop_close_desc, (vop_t *) msdosfs_close }, { &vop_create_desc, (vop_t *) msdosfs_create }, { &vop_fsync_desc, (vop_t *) msdosfs_fsync }, { &vop_getattr_desc, (vop_t *) msdosfs_getattr }, { &vop_inactive_desc, (vop_t *) msdosfs_inactive }, { &vop_islocked_desc, (vop_t *) vop_stdislocked }, { &vop_link_desc, (vop_t *) msdosfs_link }, { &vop_lock_desc, (vop_t *) vop_stdlock }, { &vop_lookup_desc, (vop_t *) vfs_cache_lookup }, { &vop_mkdir_desc, (vop_t *) msdosfs_mkdir }, { &vop_mknod_desc, (vop_t *) msdosfs_mknod }, { &vop_pathconf_desc, (vop_t *) msdosfs_pathconf }, { &vop_print_desc, (vop_t *) msdosfs_print }, { &vop_read_desc, (vop_t *) msdosfs_read }, { &vop_readdir_desc, (vop_t *) msdosfs_readdir }, { &vop_reclaim_desc, (vop_t *) msdosfs_reclaim }, { &vop_remove_desc, (vop_t *) msdosfs_remove }, { &vop_rename_desc, (vop_t *) msdosfs_rename }, { &vop_rmdir_desc, (vop_t *) msdosfs_rmdir }, { &vop_setattr_desc, (vop_t *) msdosfs_setattr }, { &vop_strategy_desc, (vop_t *) msdosfs_strategy }, { &vop_symlink_desc, (vop_t *) msdosfs_symlink }, { &vop_unlock_desc, (vop_t *) vop_stdunlock }, { &vop_write_desc, (vop_t *) msdosfs_write }, { &vop_getpages_desc, (vop_t *) msdosfs_getpages }, { &vop_putpages_desc, (vop_t *) msdosfs_putpages }, { NULL, NULL } }; static struct vnodeopv_desc msdosfs_vnodeop_opv_desc = { &msdosfs_vnodeop_p, msdosfs_vnodeop_entries }; VNODEOP_SET(msdosfs_vnodeop_opv_desc); Index: head/sys/fs/ntfs/ntfs_compr.c =================================================================== --- head/sys/fs/ntfs/ntfs_compr.c (revision 49534) +++ head/sys/fs/ntfs/ntfs_compr.c (revision 49535) @@ -1,120 +1,118 @@ /* $NetBSD: ntfs_compr.c,v 1.2 1999/05/06 15:43:18 christos Exp $ */ /*- * Copyright (c) 1998, 1999 Semen Ustimenko * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $Id: ntfs_compr.c,v 1.3 1999/04/20 21:06:43 semenu Exp $ + * $Id: ntfs_compr.c,v 1.4 1999/05/12 09:42:54 semenu Exp $ */ #include #include #include #include #include #include #include #include #include #include #ifdef __FreeBSD__ #include #endif - -#include #include #include #define GET_UINT16(addr) (*((u_int16_t *)(addr))) int ntfs_uncompblock( u_int8_t * buf, u_int8_t * cbuf) { u_int32_t ctag; int len, dshift, lmask; int blen, boff; int i, j; int pos, cpos; len = GET_UINT16(cbuf) & 0xFFF; dprintf(("ntfs_uncompblock: block length: %d + 3, 0x%x,0x%04x\n", len, len, GET_UINT16(cbuf))); if (!(GET_UINT16(cbuf) & 0x8000)) { if ((len + 1) != NTFS_COMPBLOCK_SIZE) { dprintf(("ntfs_uncompblock: len: %x instead of %d\n", len, 0xfff)); } memcpy(buf, cbuf + 2, len + 1); bzero(buf + len + 1, NTFS_COMPBLOCK_SIZE - 1 - len); return len + 3; } cpos = 2; pos = 0; while ((cpos < len + 3) && (pos < NTFS_COMPBLOCK_SIZE)) { ctag = cbuf[cpos++]; for (i = 0; (i < 8) && (pos < NTFS_COMPBLOCK_SIZE); i++) { if (ctag & 1) { for (j = pos - 1, lmask = 0xFFF, dshift = 12; j >= 0x10; j >>= 1) { dshift--; lmask >>= 1; } boff = -1 - (GET_UINT16(cbuf + cpos) >> dshift); blen = 3 + (GET_UINT16(cbuf + cpos) & lmask); for (j = 0; (j < blen) && (pos < NTFS_COMPBLOCK_SIZE); j++) { buf[pos] = buf[pos + boff]; pos++; } cpos += 2; } else { buf[pos++] = cbuf[cpos++]; } ctag >>= 1; } } return len + 3; } int ntfs_uncompunit( struct ntfsmount * ntmp, u_int8_t * uup, u_int8_t * cup) { int i; int off = 0; int new; for (i = 0; i * NTFS_COMPBLOCK_SIZE < ntfs_cntob(NTFS_COMPUNIT_CL); i++) { new = ntfs_uncompblock(uup + i * NTFS_COMPBLOCK_SIZE, cup + off); if (new == 0) return (EINVAL); off += new; } return (0); } Index: head/sys/fs/ntfs/ntfs_subr.c =================================================================== --- head/sys/fs/ntfs/ntfs_subr.c (revision 49534) +++ head/sys/fs/ntfs/ntfs_subr.c (revision 49535) @@ -1,1901 +1,1899 @@ /* $NetBSD: ntfs_subr.c,v 1.2 1999/05/06 15:43:19 christos Exp $ */ /*- * Copyright (c) 1998, 1999 Semen Ustimenko (semenu@FreeBSD.org) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
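ntfs_uncompblock() above walks tag bytes where each bit selects either a literal byte or a 16-bit back-reference whose distance/length split depends on how much output already exists. A sketch of just that back-reference decoding, lifted out of the loop; it is not a complete decoder:

#include <stdio.h>
#include <stdint.h>

/*
 * Interpret one 16-bit back-reference the way ntfs_uncompblock()
 * does: as the output position grows, more bits of the field are
 * spent on the distance and fewer on the length (the dshift/lmask
 * loop).  Lengths have an implicit bias of 3, distances of 1.
 */
static void
decode_backref(uint16_t field, int pos, int *distp, int *lenp)
{
	int dshift = 12, lmask = 0xfff, j;

	for (j = pos - 1; j >= 0x10; j >>= 1) {
		dshift--;
		lmask >>= 1;
	}
	*distp = (field >> dshift) + 1;		/* copy starts dist bytes back */
	*lenp = (field & lmask) + 3;		/* at least 3 bytes copied */
}

int
main(void)
{
	int dist, len;

	decode_backref(0x2005, 100, &dist, &len);
	printf("copy %d bytes from %d bytes back\n", len, dist);
	return (0);
}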
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $Id: ntfs_subr.c,v 1.3 1999/04/20 21:06:43 semenu Exp $ + * $Id: ntfs_subr.c,v 1.4 1999/05/12 09:43:01 semenu Exp $ */ #include #include #include #include #include #include #include #include #include #include #include #if defined(__FreeBSD__) #include #endif - -#include /* #define NTFS_DEBUG 1 */ #include #include #include #include #include #include #include #include #if defined(__FreeBSD__) MALLOC_DEFINE(M_NTFSNTVATTR, "NTFS vattr", "NTFS file attribute information"); MALLOC_DEFINE(M_NTFSRDATA, "NTFS res data", "NTFS resident data"); MALLOC_DEFINE(M_NTFSRUN, "NTFS vrun", "NTFS vrun storage"); MALLOC_DEFINE(M_NTFSDECOMP, "NTFS decomp", "NTFS decompression temporary"); #endif /* * */ int ntfs_ntvattrrele( struct ntvattr * vap) { dprintf(("ntfs_ntvattrrele: ino: %d, type: 0x%x\n", vap->va_ip->i_number, vap->va_type)); ntfs_ntrele(vap->va_ip); return (0); } /* * Search attribute specifed in ntnode (load ntnode if nessecary). * If not found but ATTR_A_ATTRLIST present, read it in and search throught. * VOP_VGET node needed, and lookup througth it's ntnode (load if nessesary). 
* * ntnode should be locked */ int ntfs_ntvattrget( struct ntfsmount * ntmp, struct ntnode * ip, u_int32_t type, char *name, cn_t vcn, struct ntvattr ** vapp) { int error; struct ntvattr *vap; struct ntvattr *lvap = NULL; struct attr_attrlist *aalp; struct attr_attrlist *nextaalp; caddr_t alpool; int len, namelen; *vapp = NULL; if (name) { dprintf(("ntfs_ntvattrget: " \ "ino: %d, type: 0x%x, name: %s, vcn: %d\n", \ ip->i_number, type, name, (u_int32_t) vcn)); namelen = strlen(name); } else { dprintf(("ntfs_ntvattrget: " \ "ino: %d, type: 0x%x, vcn: %d\n", \ ip->i_number, type, (u_int32_t) vcn)); name = ""; namelen = 0; } if((ip->i_flag & IN_LOADED) == 0) { dprintf(("ntfs_ntvattrget: node not loaded, ino: %d\n", ip->i_number)); error = ntfs_loadntnode(ntmp,ip); if(error) { printf("ntfs_ntvattrget: FAILED TO LOAD INO: %d\n", ip->i_number); return (error); } } for (vap = ip->i_valist.lh_first; vap; vap = vap->va_list.le_next) { ddprintf(("type: 0x%x, vcn: %d - %d\n", \ vap->va_type, (u_int32_t) vap->va_vcnstart, \ (u_int32_t) vap->va_vcnend)); if ((vap->va_type == type) && (vap->va_vcnstart <= vcn) && (vap->va_vcnend >= vcn) && (vap->va_namelen == namelen) && (!strncmp(name, vap->va_name, namelen))) { *vapp = vap; ntfs_ntref(vap->va_ip); return (0); } if (vap->va_type == NTFS_A_ATTRLIST) lvap = vap; } if (!lvap) { dprintf(("ntfs_ntvattrget: UNEXISTED ATTRIBUTE: " \ "ino: %d, type: 0x%x, name: %s, vcn: %d\n", \ ip->i_number, type, name, (u_int32_t) vcn)); return (ENOENT); } /* Scan $ATTRIBUTE_LIST for requested attribute */ len = lvap->va_datalen; MALLOC(alpool, caddr_t, len, M_TEMP, M_WAITOK); error = ntfs_readntvattr_plain(ntmp, ip, lvap, 0, len, alpool, &len); if (error) goto out; aalp = (struct attr_attrlist *) alpool; nextaalp = NULL; while (len > 0) { dprintf(("ntfs_ntvattrget: " \ "attrlist: ino: %d, attr: 0x%x, vcn: %d\n", \ aalp->al_inumber, aalp->al_type, \ (u_int32_t) aalp->al_vcnstart)); if (len > aalp->reclen) { nextaalp = NTFS_NEXTREC(aalp, struct attr_attrlist *); } else { nextaalp = NULL; } len -= aalp->reclen; #define AALPCMP(aalp,type,name,namelen) ( \ (aalp->al_type == type) && (aalp->al_namelen == namelen) && \ !uastrcmp(aalp->al_name,aalp->al_namelen,name,namelen) ) if (AALPCMP(aalp, type, name, namelen) && (!nextaalp || (nextaalp->al_vcnstart > vcn) || !AALPCMP(nextaalp, type, name, namelen))) { struct vnode *newvp; struct ntnode *newip; dprintf(("ntfs_ntvattrget: attrbute in ino: %d\n", aalp->al_inumber)); /* error = VFS_VGET(ntmp->ntm_mountp, aalp->al_inumber, &newvp); */ error = ntfs_vgetex(ntmp->ntm_mountp, aalp->al_inumber, NTFS_A_DATA, NULL, LK_EXCLUSIVE, VG_EXT, curproc, &newvp); if (error) { printf("ntfs_ntvattrget: CAN'T VGET INO: %d\n", aalp->al_inumber); goto out; } newip = VTONT(newvp); /* XXX have to lock ntnode */ if(~newip->i_flag & IN_LOADED) { dprintf(("ntfs_ntvattrget: node not loaded," \ " ino: %d\n", newip->i_number)); error = ntfs_loadntnode(ntmp,ip); if(error) { printf("ntfs_ntvattrget: CAN'T LOAD " \ "INO: %d\n", newip->i_number); vput(newvp); goto out; } } for (vap = newip->i_valist.lh_first; vap; vap = vap->va_list.le_next) { if ((vap->va_type == type) && (vap->va_vcnstart <= vcn) && (vap->va_vcnend >= vcn) && (vap->va_namelen == namelen) && (!strncmp(name, vap->va_name, namelen))) { *vapp = vap; ntfs_ntref(vap->va_ip); vput(newvp); error = 0; goto out; } if (vap->va_type == NTFS_A_ATTRLIST) lvap = vap; } printf("ntfs_ntvattrget: ATTRLIST ERROR.\n"); vput(newvp); break; } #undef AALPCMP aalp = nextaalp; } error = ENOENT; dprintf(("ntfs_ntvattrget: 
UNEXISTED ATTRIBUTE: " \ "ino: %d, type: 0x%x, name: %s, vcn: %d\n", \ ip->i_number, type, name, (u_int32_t) vcn)); out: FREE(alpool, M_TEMP); return (error); } /* * Read ntnode from disk, make ntvattr list. * * ntnode should be locked */ int ntfs_loadntnode( struct ntfsmount * ntmp, struct ntnode * ip) { struct filerec *mfrp; daddr_t bn; int error,off; struct attr *ap; struct ntvattr *nvap; dprintf(("ntfs_loadnode: loading ino: %d\n",ip->i_number)); MALLOC(mfrp, struct filerec *, ntfs_bntob(ntmp->ntm_bpmftrec), M_TEMP, M_WAITOK); if (ip->i_number < NTFS_SYSNODESNUM) { struct buf *bp; dprintf(("ntfs_loadnode: read system node\n")); bn = ntfs_cntobn(ntmp->ntm_mftcn) + ntmp->ntm_bpmftrec * ip->i_number; error = bread(ntmp->ntm_devvp, bn, ntfs_bntob(ntmp->ntm_bpmftrec), NOCRED, &bp); if (error) { printf("ntfs_loadnode: BREAD FAILED\n"); brelse(bp); goto out; } memcpy(mfrp, bp->b_data, ntfs_bntob(ntmp->ntm_bpmftrec)); bqrelse(bp); } else { struct vnode *vp; vp = ntmp->ntm_sysvn[NTFS_MFTINO]; error = ntfs_readattr(ntmp, VTONT(vp), NTFS_A_DATA, NULL, ip->i_number * ntfs_bntob(ntmp->ntm_bpmftrec), ntfs_bntob(ntmp->ntm_bpmftrec), mfrp); if (error) { printf("ntfs_loadnode: ntfs_readattr failed\n"); goto out; } } /* Check if magic and fixups are correct */ error = ntfs_procfixups(ntmp, NTFS_FILEMAGIC, (caddr_t)mfrp, ntfs_bntob(ntmp->ntm_bpmftrec)); if (error) { printf("ntfs_loadnode: BAD MFT RECORD %d\n", (u_int32_t) ip->i_number); goto out; } dprintf(("ntfs_loadnode: load attrs for ino: %d\n",ip->i_number)); off = mfrp->fr_attroff; ap = (struct attr *) ((caddr_t)mfrp + off); LIST_INIT(&ip->i_valist); while (ap->a_hdr.a_type != -1) { error = ntfs_attrtontvattr(ntmp, &nvap, ap); if (error) break; nvap->va_ip = ip; LIST_INSERT_HEAD(&ip->i_valist, nvap, va_list); off += ap->a_hdr.reclen; ap = (struct attr *) ((caddr_t)mfrp + off); } if (error) { printf("ntfs_loadnode: failed to load attr ino: %d\n", ip->i_number); goto out; } ip->i_mainrec = mfrp->fr_mainrec; ip->i_nlink = mfrp->fr_nlink; ip->i_frflag = mfrp->fr_flags; ip->i_flag |= IN_LOADED; out: FREE(mfrp, M_TEMP); return (error); } /* * Routine locks ntnode and increase usecount, just opposite of * ntfs_ntput. */ int ntfs_ntget( struct ntnode *ip) { dprintf(("ntfs_ntget: get ntnode %d: %p, usecount: %d\n", ip->i_number, ip, ip->i_usecount)); ip->i_usecount++; restart: if (ip->i_lock) { while (ip->i_lock) { ip->i_lock = -1; tsleep(&ip->i_lock, PVM, "ntnode", 0); } goto restart; } ip->i_lock = 1; return 0; } /* * Routine search ntnode in hash, if found: lock, inc usecount and return. * If not in hash allocate structure for ntnode, prefill it, lock, * inc count and return. 
* * ntnode returned locked */ static int ntfs_ntnode_hash_lock; int ntfs_ntlookup( struct ntfsmount * ntmp, ino_t ino, struct ntnode ** ipp) { struct ntnode *ip; dprintf(("ntfs_ntlookup: for ntnode %d\n", ino)); *ipp = NULL; restart: ip = ntfs_nthashlookup(ntmp->ntm_dev, ino); /* XXX */ if (ip) { ntfs_ntget(ip); *ipp = ip; dprintf(("ntfs_ntlookup: ntnode %d: %p, usecount: %d\n", ino, ip, ip->i_usecount)); return (0); } if (ntfs_ntnode_hash_lock) { while(ntfs_ntnode_hash_lock) { ntfs_ntnode_hash_lock = -1; tsleep(&ntfs_ntnode_hash_lock, PVM, "ntfsntgt", 0); } goto restart; } ntfs_ntnode_hash_lock = 1; MALLOC(ip, struct ntnode *, sizeof(struct ntnode), M_NTFSNTNODE, M_WAITOK); ddprintf(("ntfs_ntlookup: allocating ntnode: %d: %p\n", ino, ip)); bzero((caddr_t) ip, sizeof(struct ntnode)); /* Generic initialization */ ip->i_number = ino; ip->i_mp = ntmp; ip->i_dev = ntmp->ntm_dev; ip->i_uid = ntmp->ntm_uid; ip->i_gid = ntmp->ntm_gid; ip->i_mode = ntmp->ntm_mode; ip->i_usecount++; ip->i_lock = 1; LIST_INIT(&ip->i_fnlist); ntfs_nthashins(ip); if (ntfs_ntnode_hash_lock < 0) wakeup(&ntfs_ntnode_hash_lock); ntfs_ntnode_hash_lock = 0; *ipp = ip; dprintf(("ntfs_ntlookup: ntnode %d: %p, usecount: %d\n", ino, ip, ip->i_usecount)); return (0); } /* * Decrement usecount of ntnode and unlock it, if usecount reach zero, * deallocate ntnode. * * ntnode should be locked on entry, and unlocked on return. */ void ntfs_ntput( struct ntnode *ip) { struct ntvattr *vap; if (!ip->i_lock) printf("ntfs_ntput: NOT LOCKED"); dprintf(("ntfs_ntput: rele ntnode %d: %p, usecount: %d\n", ip->i_number, ip, ip->i_usecount)); ip->i_usecount--; if (ip->i_usecount < 0) { panic("ntfs_ntput: ino: %d usecount: %d \n", ip->i_number,ip->i_usecount); } else if (ip->i_usecount == 0) { dprintf(("ntfs_ntput: deallocating ntnode: %d\n", ip->i_number)); if (ip->i_fnlist.lh_first) panic("ntfs_ntput: ntnode has fnodes\n"); ntfs_nthashrem(ip); while (ip->i_valist.lh_first != NULL) { vap = ip->i_valist.lh_first; LIST_REMOVE(vap,va_list); ntfs_freentvattr(vap); } FREE(ip, M_NTFSNTNODE); } else { if (ip->i_lock < 0) wakeup(&ip->i_lock); ip->i_lock = 0; } } /* * Decrement usecount of ntnode. */ void ntfs_ntrele( struct ntnode * ip) { dprintf(("ntfs_ntrele: rele ntnode %d: %p, usecount: %d\n", ip->i_number, ip, ip->i_usecount)); ip->i_usecount--; if (ip->i_usecount < 0) panic("ntfs_ntrele: ino: %d usecount: %d \n", ip->i_number,ip->i_usecount); } /* * Deallocate all memory allocated for ntvattr by call to * ntfs_attrtontvattr and some other functions. */ void ntfs_freentvattr( struct ntvattr * vap) { if (vap->va_flag & NTFS_AF_INRUN) { if (vap->va_vruncn) FREE(vap->va_vruncn, M_NTFSRUN); if (vap->va_vruncl) FREE(vap->va_vruncl, M_NTFSRUN); } else { if (vap->va_datap) FREE(vap->va_datap, M_NTFSRDATA); } FREE(vap, M_NTFSNTVATTR); } /* * Convert disk image of attribute into ntvattr structure, * runs are expanded also. 
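ntfs_ntget() and ntfs_ntput() above implement a hand-rolled sleep lock on the ntnode: i_lock is 0 when free, 1 when held, and is set to -1 by a contender before it tsleep()s, so the holder knows it must wakeup() on release. A user-space sketch of the same protocol, with tsleep/wakeup modelled by a pthread condition variable; the struct and function names are illustrative.

#include <pthread.h>

struct flaglock {
	int		lock;		/* 0 = free, 1 = held, -1 = held w/ waiters */
	pthread_mutex_t	mtx;
	pthread_cond_t	cv;
};

void
flaglock_acquire(struct flaglock *fl)
{
	pthread_mutex_lock(&fl->mtx);
	while (fl->lock != 0) {
		fl->lock = -1;			/* note that someone is waiting */
		pthread_cond_wait(&fl->cv, &fl->mtx);	/* tsleep() */
	}
	fl->lock = 1;
	pthread_mutex_unlock(&fl->mtx);
}

void
flaglock_release(struct flaglock *fl)
{
	pthread_mutex_lock(&fl->mtx);
	if (fl->lock < 0)
		pthread_cond_broadcast(&fl->cv);	/* wakeup() */
	fl->lock = 0;
	pthread_mutex_unlock(&fl->mtx);
}

The same flag-and-sleep pattern serializes hash insertion through ntfs_ntnode_hash_lock in ntfs_ntlookup() above.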
*/ int ntfs_attrtontvattr( struct ntfsmount * ntmp, struct ntvattr ** rvapp, struct attr * rap) { int error, i; struct ntvattr *vap; error = 0; *rvapp = NULL; MALLOC(vap, struct ntvattr *, sizeof(struct ntvattr), M_NTFSNTVATTR, M_WAITOK); bzero(vap, sizeof(struct ntvattr)); vap->va_ip = NULL; vap->va_flag = rap->a_hdr.a_flag; vap->va_type = rap->a_hdr.a_type; vap->va_compression = rap->a_hdr.a_compression; vap->va_index = rap->a_hdr.a_index; ddprintf(("type: 0x%x, index: %d", vap->va_type, vap->va_index)); vap->va_namelen = rap->a_hdr.a_namelen; if (rap->a_hdr.a_namelen) { wchar *unp = (wchar *) ((caddr_t) rap + rap->a_hdr.a_nameoff); ddprintf((", name:[")); for (i = 0; i < vap->va_namelen; i++) { vap->va_name[i] = unp[i]; ddprintf(("%c", vap->va_name[i])); } ddprintf(("]")); } if (vap->va_flag & NTFS_AF_INRUN) { ddprintf((", nonres.")); vap->va_datalen = rap->a_nr.a_datalen; vap->va_allocated = rap->a_nr.a_allocated; vap->va_vcnstart = rap->a_nr.a_vcnstart; vap->va_vcnend = rap->a_nr.a_vcnend; vap->va_compressalg = rap->a_nr.a_compressalg; error = ntfs_runtovrun(&(vap->va_vruncn), &(vap->va_vruncl), &(vap->va_vruncnt), (caddr_t) rap + rap->a_nr.a_dataoff); } else { vap->va_compressalg = 0; ddprintf((", res.")); vap->va_datalen = rap->a_r.a_datalen; vap->va_allocated = rap->a_r.a_datalen; vap->va_vcnstart = 0; vap->va_vcnend = ntfs_btocn(vap->va_allocated); MALLOC(vap->va_datap, caddr_t, vap->va_datalen, M_NTFSRDATA, M_WAITOK); memcpy(vap->va_datap, (caddr_t) rap + rap->a_r.a_dataoff, rap->a_r.a_datalen); } ddprintf((", len: %d", vap->va_datalen)); if (error) FREE(vap, M_NTFSNTVATTR); else *rvapp = vap; ddprintf(("\n")); return (error); } /* * Expand run into more utilizable and more memory eating format. */ int ntfs_runtovrun( cn_t ** rcnp, cn_t ** rclp, u_long * rcntp, u_int8_t * run) { u_int32_t off; u_int32_t sz, i; cn_t *cn; cn_t *cl; u_long cnt; cn_t prev; cn_t tmp; off = 0; cnt = 0; i = 0; while (run[off]) { off += (run[off] & 0xF) + ((run[off] >> 4) & 0xF) + 1; cnt++; } MALLOC(cn, cn_t *, cnt * sizeof(cn_t), M_NTFSRUN, M_WAITOK); MALLOC(cl, cn_t *, cnt * sizeof(cn_t), M_NTFSRUN, M_WAITOK); off = 0; cnt = 0; prev = 0; while (run[off]) { sz = run[off++]; cl[cnt] = 0; for (i = 0; i < (sz & 0xF); i++) cl[cnt] += (u_int32_t) run[off++] << (i << 3); sz >>= 4; if (run[off + sz - 1] & 0x80) { tmp = ((u_int64_t) - 1) << (sz << 3); for (i = 0; i < sz; i++) tmp |= (u_int64_t) run[off++] << (i << 3); } else { tmp = 0; for (i = 0; i < sz; i++) tmp |= (u_int64_t) run[off++] << (i << 3); } if (tmp) prev = cn[cnt] = prev + tmp; else cn[cnt] = tmp; cnt++; } *rcnp = cn; *rclp = cl; *rcntp = cnt; return (0); } /* * Convert wchar to uppercase wchar, should be macros? */ wchar ntfs_toupper( struct ntfsmount * ntmp, wchar wc) { return (ntmp->ntm_upcase[wc & 0xFF]); } /* * Compare to unicode strings case insensible. */ int ntfs_uustricmp( struct ntfsmount * ntmp, wchar * str1, int str1len, wchar * str2, int str2len) { int i; int res; for (i = 0; i < str1len && i < str2len; i++) { res = (int) ntfs_toupper(ntmp, str1[i]) - (int) ntfs_toupper(ntmp, str2[i]); if (res) return res; } return (str1len - str2len); } /* * Compare unicode and ascii string case insens. 
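ntfs_runtovrun() above expands the on-disk run list: each run begins with a header byte whose low nibble gives the size of the cluster-count field and whose high nibble gives the size of a signed cluster-offset field, both little endian; the offset is a delta from the previous run's start, and a zero header terminates the list. A self-contained sketch of decoding a single run, with illustrative names and an explicit sign-extension step.

#include <stddef.h>
#include <stdint.h>

/*
 * Decode one data run starting at run[0].  Returns the number of bytes
 * consumed, or 0 on the terminating zero header.  'len' receives the run
 * length in clusters, 'delta' the signed change from the previous run's
 * starting cluster (an absent offset field, osz == 0, denotes a sparse
 * run).  Names are illustrative.
 */
size_t
decode_run(const uint8_t *run, uint64_t *len, int64_t *delta)
{
	size_t off = 0;
	uint8_t hdr = run[off++];
	int lsz = hdr & 0xF;		/* bytes in the length field */
	int osz = (hdr >> 4) & 0xF;	/* bytes in the offset field */
	uint64_t l = 0, d = 0;
	int i;

	if (hdr == 0)
		return (0);		/* end of run list */
	for (i = 0; i < lsz; i++)
		l |= (uint64_t)run[off++] << (i * 8);
	for (i = 0; i < osz; i++)
		d |= (uint64_t)run[off++] << (i * 8);
	/* sign-extend the osz-byte offset, as ntfs_runtovrun() does */
	if (osz > 0 && osz < 8 && (d & ((uint64_t)1 << (osz * 8 - 1))))
		d |= ~(uint64_t)0 << (osz * 8);
	*len = l;
	*delta = (int64_t)d;
	return (off);
}

The caller accumulates the deltas into absolute cluster numbers, exactly as the loop above does with its prev variable.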
*/ int ntfs_uastricmp( struct ntfsmount * ntmp, const wchar *str1, int str1len, const char *str2, int str2len) { int i; int res; for (i = 0; i < str1len && i < str2len; i++) { res = (int) ntfs_toupper(ntmp, str1[i]) - (int) ntfs_toupper(ntmp, (wchar) str2[i]); if (res) return res; } return (str1len - str2len); } /* * Compare unicode and ascii string case sens. */ int ntfs_uastrcmp( struct ntfsmount *ntmp, const wchar *str1, int str1len, const char *str2, int str2len) { int i; int res; for (i = 0; (i < str1len) && (i < str2len); i++) { res = ((int) str1[i]) - ((int) str2[i]); if (res) return res; } return (str1len - str2len); } /* * Search fnode in ntnode, if not found allocate and preinitialize. * * ntnode should be locked on entry. */ int ntfs_fget( struct ntfsmount *ntmp, struct ntnode *ip, int attrtype, char *attrname, struct fnode **fpp) { struct fnode *fp; dprintf(("ntfs_fget: ino: %d, attrtype: 0x%x, attrname: %s\n", ip->i_number,attrtype, attrname?attrname:"")); *fpp = NULL; for (fp = ip->i_fnlist.lh_first; fp != NULL; fp = fp->f_fnlist.le_next){ dprintf(("ntfs_fget: fnode: attrtype: %d, attrname: %s\n", fp->f_attrtype, fp->f_attrname?fp->f_attrname:"")); if ((attrtype == fp->f_attrtype) && ((!attrname && !fp->f_attrname) || (attrname && fp->f_attrname && !strcmp(attrname,fp->f_attrname)))){ dprintf(("ntfs_fget: found existed: %p\n",fp)); *fpp = fp; } } if (*fpp) return (0); MALLOC(fp, struct fnode *, sizeof(struct fnode), M_NTFSFNODE, M_WAITOK); bzero(fp, sizeof(struct fnode)); dprintf(("ntfs_fget: allocating fnode: %p\n",fp)); fp->f_devvp = ntmp->ntm_devvp; fp->f_dev = ntmp->ntm_dev; fp->f_mp = ntmp; fp->f_ip = ip; fp->f_attrname = attrname; if (fp->f_attrname) fp->f_flag |= FN_AATTRNAME; fp->f_attrtype = attrtype; ntfs_ntref(ip); LIST_INSERT_HEAD(&ip->i_fnlist, fp, f_fnlist); *fpp = fp; return (0); } /* * Deallocate fnode, remove it from ntnode's fnode list. * * ntnode should be locked. */ void ntfs_frele( struct fnode *fp) { struct ntnode *ip = FTONT(fp); dprintf(("ntfs_frele: fnode: %p for %d: %p\n", fp, ip->i_number, ip)); dprintf(("ntfs_frele: deallocating fnode\n")); LIST_REMOVE(fp,f_fnlist); if (fp->f_flag & FN_AATTRNAME) FREE(fp->f_attrname, M_TEMP); if (fp->f_dirblbuf) FREE(fp->f_dirblbuf, M_NTFSDIR); FREE(fp, M_NTFSFNODE); ntfs_ntrele(ip); } /* * Lookup attribute name in format: [[:$ATTR_TYPE]:$ATTR_NAME], * $ATTR_TYPE is searched in attrdefs read from $AttrDefs. * If $ATTR_TYPE nott specifed, ATTR_A_DATA assumed. */ int ntfs_ntlookupattr( struct ntfsmount * ntmp, const char * name, int namelen, int *attrtype, char **attrname) { const char *sys; size_t syslen, i; struct ntvattrdef *adp; if (namelen == 0) return (0); if (name[0] == '$') { sys = name; for (syslen = 0; syslen < namelen; syslen++) { if(sys[syslen] == ':') { name++; namelen--; break; } } name += syslen; namelen -= syslen; adp = ntmp->ntm_ad; for (i = 0; i < ntmp->ntm_adnum; i++){ if((syslen == adp->ad_namelen) && (!strncmp(sys,adp->ad_name,syslen))) { *attrtype = adp->ad_type; if(namelen) { MALLOC((*attrname), char *, namelen, M_TEMP, M_WAITOK); memcpy((*attrname), name, namelen); (*attrname)[namelen] = '\0'; } return (0); } adp++; } return (ENOENT); } if(namelen) { MALLOC((*attrname), char *, namelen, M_TEMP, M_WAITOK); memcpy((*attrname), name, namelen); (*attrname)[namelen] = '\0'; *attrtype = NTFS_A_DATA; } return (0); } /* * Lookup specifed node for filename, matching cnp, * return fnode filled. 
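ntfs_toupper() and the comparison helpers above fold case through the volume's $UpCase table, which is read in full at mount time. A sketch of the same table-driven, length-aware comparison between a UTF-16 on-disk name and an ASCII name; the 65536-entry table and the names here are assumptions for illustration, and unlike ntfs_toupper() above, which masks to the low byte, this sketch indexes the full 16-bit code point.

#include <stdint.h>

typedef uint16_t wchar16;

int
ua_stricmp(const wchar16 *upcase, const wchar16 *u, int ulen,
    const char *a, int alen)
{
	int i, res;

	for (i = 0; i < ulen && i < alen; i++) {
		res = (int)upcase[u[i]] -
		    (int)upcase[(wchar16)(unsigned char)a[i]];
		if (res != 0)
			return (res);
	}
	return (ulen - alen);	/* equal prefixes: shorter name sorts first */
}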
*/ int ntfs_ntlookupfile( struct ntfsmount * ntmp, struct vnode * vp, struct componentname * cnp, struct vnode ** vpp) { struct fnode *fp = VTOF(vp); struct ntnode *ip = FTONT(fp); struct ntvattr *vap; /* Root attribute */ cn_t cn; /* VCN in current attribute */ caddr_t rdbuf; /* Buffer to read directory's blocks */ u_int32_t blsize; u_int32_t rdsize; /* Length of data to read from current block */ struct attr_indexentry *iep; int error, res, anamelen, fnamelen; const char *fname,*aname; u_int32_t aoff; error = ntfs_ntget(ip); if (error) return (error); error = ntfs_ntvattrget(ntmp, ip, NTFS_A_INDXROOT, "$I30", 0, &vap); if (error || (vap->va_flag & NTFS_AF_INRUN)) return (ENOTDIR); blsize = vap->va_a_iroot->ir_size; rdsize = vap->va_datalen; /* * Divide file name into: foofilefoofilefoofile[:attrspec] * Store like this: fname:fnamelen [aname:anamelen] */ fname = cnp->cn_nameptr; aname = NULL; anamelen = 0; for (fnamelen = 0; fnamelen < cnp->cn_namelen; fnamelen++) if(fname[fnamelen] == ':') { aname = fname + fnamelen + 1; anamelen = cnp->cn_namelen - fnamelen - 1; dprintf(("ntfs_ntlookupfile: %s (%d), attr: %s (%d)\n", fname, fnamelen, aname, anamelen)); break; } dprintf(("ntfs_ntlookupfile: blksz: %d, rdsz: %d\n", blsize, rdsize)); MALLOC(rdbuf, caddr_t, blsize, M_TEMP, M_WAITOK); error = ntfs_readattr(ntmp, ip, NTFS_A_INDXROOT, "$I30", 0, rdsize, rdbuf); if (error) goto fail; aoff = sizeof(struct attr_indexroot); do { iep = (struct attr_indexentry *) (rdbuf + aoff); while (!(iep->ie_flag & NTFS_IEFLAG_LAST) && (rdsize > aoff)) { ddprintf(("scan: %d, %d\n", (u_int32_t) iep->ie_number, (u_int32_t) iep->ie_fnametype)); res = ntfs_uastricmp(ntmp, iep->ie_fname, iep->ie_fnamelen, fname, fnamelen); if (res == 0) { /* Matched something (case ins.) */ if (iep->ie_fnametype == 0 || !(ntmp->ntm_flag & NTFS_MFLAG_CASEINS)) res = ntfs_uastrcmp(ntmp, iep->ie_fname, iep->ie_fnamelen, fname, fnamelen); if (res == 0) { int attrtype = NTFS_A_DATA; char *attrname = NULL; struct fnode *nfp; struct vnode *nvp; if (aname) { error = ntfs_ntlookupattr(ntmp, aname, anamelen, &attrtype, &attrname); if (error) goto fail; } /* Check if we've found ourself */ if ((iep->ie_number == ip->i_number) && (attrtype == fp->f_attrtype) && ((!attrname && !fp->f_attrname) || (attrname && fp->f_attrname && !strcmp(attrname, fp->f_attrname)))) { VREF(vp); *vpp = vp; goto fail; } /* vget node, but don't load it */ error = ntfs_vgetex(ntmp->ntm_mountp, iep->ie_number, attrtype, attrname, LK_EXCLUSIVE, VG_DONTLOADIN | VG_DONTVALIDFN, curproc, &nvp); if(error) goto fail; nfp = VTOF(nvp); if (nfp->f_flag & FN_VALID) { *vpp = nvp; goto fail; } nfp->f_fflag = iep->ie_fflag; nfp->f_pnumber = iep->ie_fpnumber; nfp->f_times = iep->ie_ftimes; if((nfp->f_fflag & NTFS_FFLAG_DIR) && (nfp->f_attrtype == NTFS_A_DATA) && (nfp->f_attrname == NULL)) nfp->f_type = VDIR; else nfp->f_type = VREG; nvp->v_type = nfp->f_type; if ((nfp->f_attrtype == NTFS_A_DATA) && (nfp->f_attrname == NULL)) { /* Opening default attribute */ nfp->f_size = iep->ie_fsize; nfp->f_allocated = iep->ie_fallocated; nfp->f_flag |= FN_PRELOADED; } else { error = ntfs_filesize(ntmp, nfp, &nfp->f_size, &nfp->f_allocated); if (error) { vput(nvp); goto fail; } } nfp->f_flag &= ~FN_VALID; *vpp = nvp; goto fail; } } else if (res > 0) break; aoff += iep->reclen; iep = (struct attr_indexentry *) (rdbuf + aoff); } /* Dive if possible */ if (iep->ie_flag & NTFS_IEFLAG_SUBNODE) { dprintf(("ntfs_ntlookupfile: diving\n")); cn = *(cn_t *) (rdbuf + aoff + iep->reclen - sizeof(cn_t)); rdsize 
= blsize; error = ntfs_readattr(ntmp, ip, NTFS_A_INDX, "$I30", ntfs_cntob(cn), rdsize, rdbuf); if (error) goto fail; error = ntfs_procfixups(ntmp, NTFS_INDXMAGIC, rdbuf, rdsize); if (error) goto fail; aoff = (((struct attr_indexalloc *) rdbuf)->ia_hdrsize + 0x18); } else { dprintf(("ntfs_ntlookupfile: nowhere to dive :-(\n")); error = ENOENT; break; } } while (1); dprintf(("finish\n")); fail: ntfs_ntvattrrele(vap); ntfs_ntput(ip); FREE(rdbuf, M_TEMP); return (error); } /* * Check if name type is permitted to show. */ int ntfs_isnamepermitted( struct ntfsmount * ntmp, struct attr_indexentry * iep) { if (ntmp->ntm_flag & NTFS_MFLAG_ALLNAMES) return 1; switch (iep->ie_fnametype) { case 2: ddprintf(("ntfs_isnamepermitted: skiped DOS name\n")); return 0; case 0: case 1: case 3: return 1; default: printf("ntfs_isnamepermitted: " \ "WARNING! Unknown file name type: %d\n", iep->ie_fnametype); break; } return 0; } /* * Read ntfs dir like stream of attr_indexentry, not like btree of them. * This is done by scaning $BITMAP:$I30 for busy clusters and reading them. * Ofcouse $INDEX_ROOT:$I30 is read before. Last read values are stored in * fnode, so we can skip toward record number num almost immediatly. * Anyway this is rather slow routine. The problem is that we don't know * how many records are there in $INDEX_ALLOCATION:$I30 block. */ int ntfs_ntreaddir( struct ntfsmount * ntmp, struct fnode * fp, u_int32_t num, struct attr_indexentry ** riepp) { struct ntnode *ip = FTONT(fp); struct ntvattr *vap = NULL; /* IndexRoot attribute */ struct ntvattr *bmvap = NULL; /* BitMap attribute */ struct ntvattr *iavap = NULL; /* IndexAllocation attribute */ caddr_t rdbuf; /* Buffer to read directory's blocks */ u_char *bmp = NULL; /* Bitmap */ u_int32_t blsize; /* Index allocation size (2048) */ u_int32_t rdsize; /* Length of data to read */ u_int32_t attrnum; /* Current attribute type */ u_int32_t cpbl = 1; /* Clusters per directory block */ u_int32_t blnum; struct attr_indexentry *iep; int error = ENOENT; u_int32_t aoff, cnum; dprintf(("ntfs_ntreaddir: read ino: %d, num: %d\n", ip->i_number, num)); error = ntfs_ntget(ip); if (error) return (error); error = ntfs_ntvattrget(ntmp, ip, NTFS_A_INDXROOT, "$I30", 0, &vap); if (error) return (ENOTDIR); if (fp->f_dirblbuf == NULL) { fp->f_dirblsz = vap->va_a_iroot->ir_size; MALLOC(fp->f_dirblbuf, caddr_t, max(vap->va_datalen,fp->f_dirblsz), M_NTFSDIR, M_WAITOK); } blsize = fp->f_dirblsz; rdbuf = fp->f_dirblbuf; dprintf(("ntfs_ntreaddir: rdbuf: 0x%p, blsize: %d\n", rdbuf, blsize)); if (vap->va_a_iroot->ir_flag & NTFS_IRFLAG_INDXALLOC) { error = ntfs_ntvattrget(ntmp, ip, NTFS_A_INDXBITMAP, "$I30", 0, &bmvap); if (error) { error = ENOTDIR; goto fail; } MALLOC(bmp, u_char *, bmvap->va_datalen, M_TEMP, M_WAITOK); error = ntfs_readattr(ntmp, ip, NTFS_A_INDXBITMAP, "$I30", 0, bmvap->va_datalen, bmp); if (error) goto fail; error = ntfs_ntvattrget(ntmp, ip, NTFS_A_INDX, "$I30", 0, &iavap); if (error) { error = ENOTDIR; goto fail; } cpbl = ntfs_btocn(blsize + ntfs_cntob(1) - 1); dprintf(("ntfs_ntreaddir: indexalloc: %d, cpbl: %d\n", iavap->va_datalen, cpbl)); } else { dprintf(("ntfs_ntreadidir: w/o BitMap and IndexAllocation\n")); iavap = bmvap = NULL; bmp = NULL; } /* Try use previous values */ if ((fp->f_lastdnum < num) && (fp->f_lastdnum != 0)) { attrnum = fp->f_lastdattr; aoff = fp->f_lastdoff; blnum = fp->f_lastdblnum; cnum = fp->f_lastdnum; } else { attrnum = NTFS_A_INDXROOT; aoff = sizeof(struct attr_indexroot); blnum = 0; cnum = 0; } do { dprintf(("ntfs_ntreaddir: scan: 
0x%x, %d, %d, %d, %d\n", attrnum, (u_int32_t) blnum, cnum, num, aoff)); rdsize = (attrnum == NTFS_A_INDXROOT) ? vap->va_datalen : blsize; error = ntfs_readattr(ntmp, ip, attrnum, "$I30", ntfs_cntob(blnum * cpbl), rdsize, rdbuf); if (error) goto fail; if (attrnum == NTFS_A_INDX) { error = ntfs_procfixups(ntmp, NTFS_INDXMAGIC, rdbuf, rdsize); if (error) goto fail; } if (aoff == 0) aoff = (attrnum == NTFS_A_INDX) ? (0x18 + ((struct attr_indexalloc *) rdbuf)->ia_hdrsize) : sizeof(struct attr_indexroot); iep = (struct attr_indexentry *) (rdbuf + aoff); while (!(iep->ie_flag & NTFS_IEFLAG_LAST) && (rdsize > aoff)) { if (ntfs_isnamepermitted(ntmp, iep)) { if (cnum >= num) { fp->f_lastdnum = cnum; fp->f_lastdoff = aoff; fp->f_lastdblnum = blnum; fp->f_lastdattr = attrnum; *riepp = iep; error = 0; goto fail; } cnum++; } aoff += iep->reclen; iep = (struct attr_indexentry *) (rdbuf + aoff); } if (iavap) { if (attrnum == NTFS_A_INDXROOT) blnum = 0; else blnum++; while (ntfs_cntob(blnum * cpbl) < iavap->va_datalen) { if (bmp[blnum >> 3] & (1 << (blnum & 3))) break; blnum++; } attrnum = NTFS_A_INDX; aoff = 0; if (ntfs_cntob(blnum * cpbl) >= iavap->va_datalen) break; dprintf(("ntfs_ntreaddir: blnum: %d\n", (u_int32_t) blnum)); } } while (iavap); *riepp = NULL; fp->f_lastdnum = 0; fail: if (vap) ntfs_ntvattrrele(vap); if (bmvap) ntfs_ntvattrrele(bmvap); if (iavap) ntfs_ntvattrrele(iavap); if (bmp) FREE(bmp, M_TEMP); ntfs_ntput(ip); return (error); } /* * Convert NTFS times that are in 100 ns units and begins from * 1601 Jan 1 into unix times. */ struct timespec ntfs_nttimetounix( u_int64_t nt) { struct timespec t; /* WindowNT times are in 100 ns and from 1601 Jan 1 */ t.tv_nsec = (nt % (1000 * 1000 * 10)) * 100; t.tv_sec = nt / (1000 * 1000 * 10) - 369LL * 365LL * 24LL * 60LL * 60LL - 89LL * 1LL * 24LL * 60LL * 60LL; return (t); } /* * Get file times from NTFS_A_NAME attribute. */ int ntfs_times( struct ntfsmount * ntmp, struct ntnode * ip, ntfs_times_t * tm) { struct ntvattr *vap; int error; dprintf(("ntfs_times: ino: %d...\n", ip->i_number)); error = ntfs_ntget(ip); if (error) return (error); error = ntfs_ntvattrget(ntmp, ip, NTFS_A_NAME, NULL, 0, &vap); if (error) { ntfs_ntput(ip); return (error); } *tm = vap->va_a_name->n_times; ntfs_ntvattrrele(vap); ntfs_ntput(ip); return (0); } /* * Get file sizes from corresponding attribute. * * ntnode under fnode should be locked. */ int ntfs_filesize( struct ntfsmount * ntmp, struct fnode * fp, u_int64_t * size, u_int64_t * bytes) { struct ntvattr *vap; struct ntnode *ip = FTONT(fp); u_int64_t sz, bn; int error; dprintf(("ntfs_filesize: ino: %d\n", ip->i_number)); error = ntfs_ntvattrget(ntmp, ip, fp->f_attrtype, fp->f_attrname, 0, &vap); if (error) return (error); bn = vap->va_allocated; sz = vap->va_datalen; dprintf(("ntfs_filesize: %d bytes (%d bytes allocated)\n", (u_int32_t) sz, (u_int32_t) bn)); if (size) *size = sz; if (bytes) *bytes = bn; ntfs_ntvattrrele(vap); return (0); } /* * This is one of write routine. * * ntnode should be locked. 
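ntfs_nttimetounix() above converts Windows NT timestamps, which count 100-nanosecond ticks since 1601-01-01, into Unix time by subtracting the 369-year gap: 369 years of 365 days plus 89 leap days, i.e. 11644473600 seconds. The same conversion as a standalone sketch, with the offset folded into one named constant; the constant and function names are illustrative.

#include <stdint.h>
#include <time.h>

/* seconds between 1601-01-01 and 1970-01-01: (369 * 365 + 89) * 86400 */
#define	NT_EPOCH_OFFSET	11644473600ULL

struct timespec
nt_time_to_unix(uint64_t nt)	/* nt counts 100-ns ticks since 1601 */
{
	struct timespec ts;

	ts.tv_nsec = (long)((nt % 10000000ULL) * 100);	/* 10^7 ticks per second */
	ts.tv_sec = (time_t)(nt / 10000000ULL - NT_EPOCH_OFFSET);
	return (ts);
}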
*/ int ntfs_writeattr_plain( struct ntfsmount * ntmp, struct ntnode * ip, u_int32_t attrnum, char *attrname, off_t roff, size_t rsize, void *rdata, size_t * initp) { size_t init; int error = 0; off_t off = roff, left = rsize, towrite; caddr_t data = rdata; struct ntvattr *vap; *initp = 0; while (left) { error = ntfs_ntvattrget(ntmp, ip, attrnum, attrname, ntfs_btocn(off), &vap); if (error) return (error); towrite = min(left, ntfs_cntob(vap->va_vcnend + 1) - off); ddprintf(("ntfs_writeattr_plain: o: %d, s: %d (%d - %d)\n", (u_int32_t) off, (u_int32_t) towrite, (u_int32_t) vap->va_vcnstart, (u_int32_t) vap->va_vcnend)); error = ntfs_writentvattr_plain(ntmp, ip, vap, off - ntfs_cntob(vap->va_vcnstart), towrite, data, &init); if (error) { printf("ntfs_writeattr_plain: " \ "ntfs_writentvattr_plain failed: o: %d, s: %d\n", (u_int32_t) off, (u_int32_t) towrite); printf("ntfs_writeattr_plain: attrib: %d - %d\n", (u_int32_t) vap->va_vcnstart, (u_int32_t) vap->va_vcnend); ntfs_ntvattrrele(vap); break; } ntfs_ntvattrrele(vap); left -= towrite; off += towrite; data = data + towrite; *initp += init; } return (error); } /* * This is one of write routine. * * ntnode should be locked. */ int ntfs_writentvattr_plain( struct ntfsmount * ntmp, struct ntnode * ip, struct ntvattr * vap, off_t roff, size_t rsize, void *rdata, size_t * initp) { int error = 0; int off; *initp = 0; if (vap->va_flag & NTFS_AF_INRUN) { int cnt; cn_t ccn, ccl, cn, left, cl; caddr_t data = rdata; struct buf *bp; size_t tocopy; ddprintf(("ntfs_writentvattr_plain: data in run: %d chains\n", vap->va_vruncnt)); off = roff; left = rsize; ccl = 0; ccn = 0; cnt = 0; while (left && (cnt < vap->va_vruncnt)) { ccn = vap->va_vruncn[cnt]; ccl = vap->va_vruncl[cnt]; ddprintf(("ntfs_writentvattr_plain: " \ "left %d, cn: 0x%x, cl: %d, off: %d\n", \ (u_int32_t) left, (u_int32_t) ccn, \ (u_int32_t) ccl, (u_int32_t) off)); if (ntfs_cntob(ccl) < off) { off -= ntfs_cntob(ccl); cnt++; continue; } if (ccn || ip->i_number == NTFS_BOOTINO) { /* XXX */ ccl -= ntfs_btocn(off); cn = ccn + ntfs_btocn(off); off = ntfs_btocnoff(off); while (left && ccl) { tocopy = min(left, min(ntfs_cntob(ccl) - off, MAXBSIZE - off)); cl = ntfs_btocl(tocopy + off); ddprintf(("ntfs_writentvattr_plain: " \ "write: cn: 0x%x cl: %d, " \ "off: %d len: %d, left: %d\n", (u_int32_t) cn, (u_int32_t) cl, (u_int32_t) off, (u_int32_t) tocopy, (u_int32_t) left)); if ((off == 0) && (tocopy == ntfs_cntob(cl))) { bp = getblk(ntmp->ntm_devvp, ntfs_cntobn(cn), ntfs_cntob(cl), 0, 0); clrbuf(bp); } else { error = bread(ntmp->ntm_devvp, ntfs_cntobn(cn), ntfs_cntob(cl), NOCRED, &bp); if (error) { brelse(bp); return (error); } } memcpy(bp->b_data + off, data, tocopy); bawrite(bp); data = data + tocopy; *initp += tocopy; off = 0; left -= tocopy; cn += cl; ccl -= cl; } } cnt++; } if (left) { printf("ntfs_writentvattr_plain: POSSIBLE RUN ERROR\n"); error = EINVAL; } } else { printf("ntfs_writevattr_plain: CAN'T WRITE RES. ATTRIBUTE\n"); error = ENOTTY; } return (error); } /* * This is one of read routines. * * ntnode should be locked. 
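ntfs_writentvattr_plain() above goes through the buffer cache: a write that covers a whole cluster-sized buffer uses getblk()/clrbuf() and skips the read, while a partial write bread()s the existing block first and overwrites only the affected bytes before bawrite(). The same read-modify-write decision in a user-space sketch; pread/pwrite stand in for the buffer-cache calls, and the function name and caller-supplied scratch block are assumptions.

#include <sys/types.h>
#include <unistd.h>
#include <string.h>
#include <stdint.h>

/*
 * Write 'len' bytes at byte offset 'off' of fd in 'bsize'-byte blocks,
 * reading back only the blocks the write covers partially.  'blk' is a
 * caller-supplied scratch buffer of bsize bytes (assumed).
 */
int
blockwrite(int fd, size_t bsize, uint64_t off,
    const uint8_t *data, size_t len, uint8_t *blk)
{
	while (len > 0) {
		uint64_t bno = off / bsize;
		size_t boff = (size_t)(off % bsize);
		size_t chunk = bsize - boff;

		if (chunk > len)
			chunk = len;
		if (boff != 0 || chunk != bsize) {
			/* partial block: read-modify-write, like bread() */
			if (pread(fd, blk, bsize, (off_t)(bno * bsize)) !=
			    (ssize_t)bsize)
				return (-1);
		}
		memcpy(blk + boff, data, chunk);
		if (pwrite(fd, blk, bsize, (off_t)(bno * bsize)) !=
		    (ssize_t)bsize)
			return (-1);	/* like bawrite() */
		data += chunk;
		off += chunk;
		len -= chunk;
	}
	return (0);
}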
*/ int ntfs_readntvattr_plain( struct ntfsmount * ntmp, struct ntnode * ip, struct ntvattr * vap, off_t roff, size_t rsize, void *rdata, size_t * initp) { int error = 0; int off; *initp = 0; if (vap->va_flag & NTFS_AF_INRUN) { int cnt; cn_t ccn, ccl, cn, left, cl; caddr_t data = rdata; struct buf *bp; size_t tocopy; ddprintf(("ntfs_readntvattr_plain: data in run: %d chains\n", vap->va_vruncnt)); off = roff; left = rsize; ccl = 0; ccn = 0; cnt = 0; while (left && (cnt < vap->va_vruncnt)) { ccn = vap->va_vruncn[cnt]; ccl = vap->va_vruncl[cnt]; ddprintf(("ntfs_readntvattr_plain: " \ "left %d, cn: 0x%x, cl: %d, off: %d\n", \ (u_int32_t) left, (u_int32_t) ccn, \ (u_int32_t) ccl, (u_int32_t) off)); if (ntfs_cntob(ccl) < off) { off -= ntfs_cntob(ccl); cnt++; continue; } if (ccn || ip->i_number == NTFS_BOOTINO) { ccl -= ntfs_btocn(off); cn = ccn + ntfs_btocn(off); off = ntfs_btocnoff(off); while (left && ccl) { tocopy = min(left, min(ntfs_cntob(ccl) - off, MAXBSIZE - off)); cl = ntfs_btocl(tocopy + off); ddprintf(("ntfs_readntvattr_plain: " \ "read: cn: 0x%x cl: %d, " \ "off: %d len: %d, left: %d\n", (u_int32_t) cn, (u_int32_t) cl, (u_int32_t) off, (u_int32_t) tocopy, (u_int32_t) left)); error = bread(ntmp->ntm_devvp, ntfs_cntobn(cn), ntfs_cntob(cl), NOCRED, &bp); if (error) { brelse(bp); return (error); } memcpy(data, bp->b_data + off, tocopy); brelse(bp); data = data + tocopy; *initp += tocopy; off = 0; left -= tocopy; cn += cl; ccl -= cl; } } else { tocopy = min(left, ntfs_cntob(ccl) - off); ddprintf(("ntfs_readntvattr_plain: " "sparce: ccn: 0x%x ccl: %d, off: %d, " \ " len: %d, left: %d\n", (u_int32_t) ccn, (u_int32_t) ccl, (u_int32_t) off, (u_int32_t) tocopy, (u_int32_t) left)); left -= tocopy; off = 0; bzero(data, tocopy); data = data + tocopy; } cnt++; } if (left) { printf("ntfs_readntvattr_plain: POSSIBLE RUN ERROR\n"); error = E2BIG; } } else { ddprintf(("ntfs_readnvattr_plain: data is in mft record\n")); memcpy(rdata, vap->va_datap + roff, rsize); *initp += rsize; } return (error); } /* * This is one of read routines. * * ntnode should be locked. */ int ntfs_readattr_plain( struct ntfsmount * ntmp, struct ntnode * ip, u_int32_t attrnum, char *attrname, off_t roff, size_t rsize, void *rdata, size_t * initp) { size_t init; int error = 0; off_t off = roff, left = rsize, toread; caddr_t data = rdata; struct ntvattr *vap; *initp = 0; while (left) { error = ntfs_ntvattrget(ntmp, ip, attrnum, attrname, ntfs_btocn(off), &vap); if (error) return (error); toread = min(left, ntfs_cntob(vap->va_vcnend + 1) - off); ddprintf(("ntfs_readattr_plain: o: %d, s: %d (%d - %d)\n", (u_int32_t) off, (u_int32_t) toread, (u_int32_t) vap->va_vcnstart, (u_int32_t) vap->va_vcnend)); error = ntfs_readntvattr_plain(ntmp, ip, vap, off - ntfs_cntob(vap->va_vcnstart), toread, data, &init); if (error) { printf("ntfs_readattr_plain: " \ "ntfs_readntvattr_plain failed: o: %d, s: %d\n", (u_int32_t) off, (u_int32_t) toread); printf("ntfs_readattr_plain: attrib: %d - %d\n", (u_int32_t) vap->va_vcnstart, (u_int32_t) vap->va_vcnend); ntfs_ntvattrrele(vap); break; } ntfs_ntvattrrele(vap); left -= toread; off += toread; data = data + toread; *initp += init; } return (error); } /* * This is one of read routines. * * ntnode should be locked. 
*/ int ntfs_readattr( struct ntfsmount * ntmp, struct ntnode * ip, u_int32_t attrnum, char *attrname, off_t roff, size_t rsize, void *rdata) { int error = 0; struct ntvattr *vap; size_t init; ddprintf(("ntfs_readattr: reading %d: 0x%x, from %d size %d bytes\n", ip->i_number, attrnum, (u_int32_t) roff, (u_int32_t) rsize)); error = ntfs_ntvattrget(ntmp, ip, attrnum, attrname, 0, &vap); if (error) return (error); if ((roff > vap->va_datalen) || (roff + rsize > vap->va_datalen)) { ddprintf(("ntfs_readattr: offset too big\n")); ntfs_ntvattrrele(vap); return (E2BIG); } if (vap->va_compression && vap->va_compressalg) { u_int8_t *cup; u_int8_t *uup; off_t off = roff, left = rsize, tocopy; caddr_t data = rdata; cn_t cn; ddprintf(("ntfs_ntreadattr: compression: %d\n", vap->va_compressalg)); MALLOC(cup, u_int8_t *, ntfs_cntob(NTFS_COMPUNIT_CL), M_NTFSDECOMP, M_WAITOK); MALLOC(uup, u_int8_t *, ntfs_cntob(NTFS_COMPUNIT_CL), M_NTFSDECOMP, M_WAITOK); cn = (ntfs_btocn(roff)) & (~(NTFS_COMPUNIT_CL - 1)); off = roff - ntfs_cntob(cn); while (left) { error = ntfs_readattr_plain(ntmp, ip, attrnum, attrname, ntfs_cntob(cn), ntfs_cntob(NTFS_COMPUNIT_CL), cup, &init); if (error) break; tocopy = min(left, ntfs_cntob(NTFS_COMPUNIT_CL) - off); if (init == ntfs_cntob(NTFS_COMPUNIT_CL)) { memcpy(data, cup + off, tocopy); } else if (init == 0) { bzero(data, tocopy); } else { error = ntfs_uncompunit(ntmp, uup, cup); if (error) break; memcpy(data, uup + off, tocopy); } left -= tocopy; data = data + tocopy; off += tocopy - ntfs_cntob(NTFS_COMPUNIT_CL); cn += NTFS_COMPUNIT_CL; } FREE(uup, M_NTFSDECOMP); FREE(cup, M_NTFSDECOMP); } else error = ntfs_readattr_plain(ntmp, ip, attrnum, attrname, roff, rsize, rdata, &init); ntfs_ntvattrrele(vap); return (error); } #if UNUSED_CODE int ntfs_parserun( cn_t * cn, cn_t * cl, u_int8_t * run, u_long len, u_long *off) { u_int8_t sz; int i; if (NULL == run) { printf("ntfs_parsetun: run == NULL\n"); return (EINVAL); } sz = run[(*off)++]; if (0 == sz) { printf("ntfs_parserun: trying to go out of run\n"); return (E2BIG); } *cl = 0; if ((sz & 0xF) > 8 || (*off) + (sz & 0xF) > len) { printf("ntfs_parserun: " \ "bad run: length too big: sz: 0x%02x (%ld < %ld + sz)\n", sz, len, *off); return (EINVAL); } for (i = 0; i < (sz & 0xF); i++) *cl += (u_int32_t) run[(*off)++] << (i << 3); sz >>= 4; if ((sz & 0xF) > 8 || (*off) + (sz & 0xF) > len) { printf("ntfs_parserun: " \ "bad run: length too big: sz: 0x%02x (%ld < %ld + sz)\n", sz, len, *off); return (EINVAL); } for (i = 0; i < (sz & 0xF); i++) *cn += (u_int32_t) run[(*off)++] << (i << 3); return (0); } #endif /* * Process fixup routine on given buffer. 
*/ int ntfs_procfixups( struct ntfsmount * ntmp, u_int32_t magic, caddr_t buf, size_t len) { struct fixuphdr *fhp = (struct fixuphdr *) buf; int i; u_int16_t fixup; u_int16_t *fxp; u_int16_t *cfxp; if (fhp->fh_magic != magic) { printf("ntfs_procfixups: magic doesn't match: %08x != %08x\n", fhp->fh_magic, magic); return (EINVAL); } if ((fhp->fh_fnum - 1) * ntmp->ntm_bps != len) { printf("ntfs_procfixups: " \ "bad fixups number: %d for %d bytes block\n", fhp->fh_fnum, len); return (EINVAL); } if (fhp->fh_foff >= ntmp->ntm_spc * ntmp->ntm_mftrecsz * ntmp->ntm_bps) { printf("ntfs_procfixups: invalid offset: %x", fhp->fh_foff); return (EINVAL); } fxp = (u_int16_t *) (buf + fhp->fh_foff); cfxp = (u_int16_t *) (buf + ntmp->ntm_bps - 2); fixup = *fxp++; for (i = 1; i < fhp->fh_fnum; i++, fxp++) { if (*cfxp != fixup) { printf("ntfs_procfixups: fixup %d doesn't match\n", i); return (EINVAL); } *cfxp = *fxp; ((caddr_t) cfxp) += ntmp->ntm_bps; } return (0); } #if UNUSED_CODE int ntfs_runtocn( cn_t * cn, struct ntfsmount * ntmp, u_int8_t * run, u_long len, cn_t vcn) { cn_t ccn = 0; cn_t ccl = 0; u_long off = 0; int error = 0; #if NTFS_DEBUG int i; printf("ntfs_runtocn: run: 0x%p, %ld bytes, vcn:%ld\n", run, len, (u_long) vcn); printf("ntfs_runtocn: run: "); for (i = 0; i < len; i++) printf("0x%02x ", run[i]); printf("\n"); #endif if (NULL == run) { printf("ntfs_runtocn: run == NULL\n"); return (EINVAL); } do { if (run[off] == 0) { printf("ntfs_runtocn: vcn too big\n"); return (E2BIG); } vcn -= ccl; error = ntfs_parserun(&ccn, &ccl, run, len, &off); if (error) { printf("ntfs_runtocn: ntfs_parserun failed\n"); return (error); } } while (ccl <= vcn); *cn = ccn + vcn; return (0); } #endif Index: head/sys/fs/ntfs/ntfs_vfsops.c =================================================================== --- head/sys/fs/ntfs/ntfs_vfsops.c (revision 49534) +++ head/sys/fs/ntfs/ntfs_vfsops.c (revision 49535) @@ -1,996 +1,994 @@ /* $NetBSD: ntfs_vfsops.c,v 1.2 1999/05/06 15:43:20 christos Exp $ */ /*- * Copyright (c) 1998, 1999 Semen Ustimenko * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
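ntfs_procfixups() above handles NTFS update sequences ("fixups"): on disk, the last two bytes of every sector of a multi-sector record are replaced by a check word, and the original bytes are stashed in an array just after the record header, so a torn write is detected before the bytes are put back. A minimal sketch of the same check; the header field offsets (magic at 0, fixup-array offset at 4, entry count at 6) are assumed to mirror the fixuphdr layout used above, a little-endian host is assumed, and the names are illustrative.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/*
 * Verify and undo the fixups of one multi-sector record of 'len' bytes
 * made of 'bps'-byte sectors.  Returns 0 on success, -1 on a bad magic,
 * size mismatch, or torn write.
 */
int
apply_fixups(uint8_t *buf, size_t len, uint32_t magic, unsigned bps)
{
	uint32_t got_magic;
	uint16_t foff, fnum, usn, atend, saved;
	unsigned i;

	memcpy(&got_magic, buf + 0, 4);
	memcpy(&foff, buf + 4, 2);
	memcpy(&fnum, buf + 6, 2);
	if (got_magic != magic || (size_t)(fnum - 1) * bps != len)
		return (-1);
	memcpy(&usn, buf + foff, 2);	/* the update sequence number */
	for (i = 1; i < fnum; i++) {
		memcpy(&atend, buf + i * bps - 2, 2);
		if (atend != usn)
			return (-1);	/* torn multi-sector write */
		memcpy(&saved, buf + foff + 2 * i, 2);
		memcpy(buf + i * bps - 2, &saved, 2);	/* restore real bytes */
	}
	return (0);
}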
* - * $Id: ntfs_vfsops.c,v 1.6 1999/05/12 09:43:04 semenu Exp $ + * $Id: ntfs_vfsops.c,v 1.7 1999/05/31 11:28:30 phk Exp $ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include - -#include /*#define NTFS_DEBUG 1*/ #include #include #include #include #include #include #include #if defined(__FreeBSD__) MALLOC_DEFINE(M_NTFSMNT, "NTFS mount", "NTFS mount structure"); MALLOC_DEFINE(M_NTFSNTNODE,"NTFS ntnode", "NTFS ntnode information"); MALLOC_DEFINE(M_NTFSFNODE,"NTFS fnode", "NTFS fnode information"); MALLOC_DEFINE(M_NTFSDIR,"NTFS dir", "NTFS dir buffer"); #endif #if defined(__FreeBSD__) static int ntfs_mount __P((struct mount *, char *, caddr_t, struct nameidata *, struct proc *)); #else static int ntfs_mount __P((struct mount *, const char *, void *, struct nameidata *, struct proc *)); #endif static int ntfs_quotactl __P((struct mount *, int, uid_t, caddr_t, struct proc *)); static int ntfs_root __P((struct mount *, struct vnode **)); static int ntfs_start __P((struct mount *, int, struct proc *)); static int ntfs_statfs __P((struct mount *, struct statfs *, struct proc *)); static int ntfs_sync __P((struct mount *, int, struct ucred *, struct proc *)); static int ntfs_unmount __P((struct mount *, int, struct proc *)); static int ntfs_vget __P((struct mount *mp, ino_t ino, struct vnode **vpp)); static int ntfs_mountfs __P((register struct vnode *, struct mount *, struct ntfs_args *, struct proc *)); static int ntfs_vptofh __P((struct vnode *, struct fid *)); #if defined(__FreeBSD__) static int ntfs_init __P((struct vfsconf *)); static int ntfs_fhtovp __P((struct mount *, struct fid *, struct sockaddr *, struct vnode **, int *, struct ucred **)); #elif defined(__NetBSD__) static void ntfs_init __P((void)); static int ntfs_fhtovp __P((struct mount *, struct fid *, struct vnode **)); static int ntfs_checkexp __P((struct mount *, struct mbuf *, int *, struct ucred **)); static int ntfs_mountroot __P((void)); static int ntfs_sysctl __P((int *, u_int, void *, size_t *, void *, size_t, struct proc *)); #else static int ntfs_init __P((void)); static int ntfs_fhtovp __P((struct mount *, struct fid *, struct mbuf *, struct vnode **, int *, struct ucred **)); #endif #ifdef __NetBSD__ /*ARGSUSED*/ static int ntfs_checkexp(mp, nam, exflagsp, credanonp) register struct mount *mp; struct mbuf *nam; int *exflagsp; struct ucred **credanonp; { return (EINVAL); } /*ARGSUSED*/ static int ntfs_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) int *name; u_int namelen; void *oldp; size_t *oldlenp; void *newp; size_t newlen; struct proc *p; { return (EINVAL); } static int ntfs_mountroot() { return (EINVAL); } #endif #if defined(__FreeBSD__) static int ntfs_init ( struct vfsconf *vcp ) #elif defined(__NetBSD__) static void ntfs_init () #else static int ntfs_init () #endif { ntfs_nthashinit(); #if !defined(__NetBSD__) return 0; #endif } static int ntfs_mount ( struct mount *mp, #if defined(__FreeBSD__) char *path, caddr_t data, #else const char *path, void *data, #endif struct nameidata *ndp, struct proc *p ) { u_int size; int err = 0; struct vnode *devvp; struct ntfs_args args; /* * Use NULL path to flag a root mount */ if( path == NULL) { /* *** * Mounting root file system *** */ /* Get vnode for root device*/ if( bdevvp( rootdev, &rootvp)) panic("ffs_mountroot: can't setup bdevvp for root"); /* * FS specific handling */ mp->mnt_flag |= MNT_RDONLY; /* XXX globally applicable?*/ /* * Attempt mount */ if( ( err 
= ntfs_mountfs(rootvp, mp, &args, p)) != 0) { /* fs specific cleanup (if any)*/ goto error_1; } goto dostatfs; /* success*/ } /* *** * Mounting non-root file system or updating a file system *** */ /* copy in user arguments*/ err = copyin(data, (caddr_t)&args, sizeof (struct ntfs_args)); if (err) goto error_1; /* can't get arguments*/ /* * If updating, check whether changing from read-only to * read/write; if there is no device name, that's all we do. */ if (mp->mnt_flag & MNT_UPDATE) { printf("ntfs_mount(): MNT_UPDATE not supported\n"); err = EINVAL; goto error_1; #if 0 ump = VFSTOUFS(mp); fs = ump->um_fs; err = 0; if (fs->fs_ronly == 0 && (mp->mnt_flag & MNT_RDONLY)) { flags = WRITECLOSE; if (mp->mnt_flag & MNT_FORCE) flags |= FORCECLOSE; if (vfs_busy(mp)) { err = EBUSY; goto error_1; } err = ffs_flushfiles(mp, flags, p); vfs_unbusy(mp); } if (!err && (mp->mnt_flag & MNT_RELOAD)) err = ffs_reload(mp, ndp->ni_cnd.cn_cred, p); if (err) { goto error_1; } if (fs->fs_ronly && (mp->mnt_flag & MNT_WANTRDWR)) { if (!fs->fs_clean) { if (mp->mnt_flag & MNT_FORCE) { printf("WARNING: %s was not properly dismounted.\n",fs->fs_fsmnt); } else { printf("WARNING: R/W mount of %s denied. Filesystem is not clean - run fsck.\n", fs->fs_fsmnt); err = EPERM; goto error_1; } } fs->fs_ronly = 0; } if (fs->fs_ronly == 0) { fs->fs_clean = 0; ffs_sbupdate(ump, MNT_WAIT); } /* if not updating name...*/ if (args.fspec == 0) { /* * Process export requests. Jumping to "success" * will return the vfs_export() error code. */ err = vfs_export(mp, &ump->um_export, &args.export); goto success; } #endif } /* * Not an update, or updating the name: look up the name * and verify that it refers to a sensible block device. */ NDINIT(ndp, LOOKUP, FOLLOW, UIO_USERSPACE, args.fspec, p); err = namei(ndp); if (err) { /* can't get devvp!*/ goto error_1; } devvp = ndp->ni_vp; if (devvp->v_type != VBLK) { err = ENOTBLK; goto error_2; } if (bdevsw(devvp->v_rdev) == NULL) { err = ENXIO; goto error_2; } if (mp->mnt_flag & MNT_UPDATE) { #if 0 /* ******************** * UPDATE ******************** */ if (devvp != ntmp->um_devvp) err = EINVAL; /* needs translation */ else vrele(devvp); /* * Update device name only on success */ if( !err) { /* Save "mounted from" info for mount point (NULL pad)*/ copyinstr( args.fspec, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, &size); bzero( mp->mnt_stat.f_mntfromname + size, MNAMELEN - size); } #endif } else { /* ******************** * NEW MOUNT ******************** */ /* * Since this is a new mount, we want the names for * the device and the mount point copied in. If an * error occurs, the mountpoint is discarded by the * upper level code. 
*/ /* Save "last mounted on" info for mount point (NULL pad)*/ copyinstr( path, /* mount point*/ mp->mnt_stat.f_mntonname, /* save area*/ MNAMELEN - 1, /* max size*/ &size); /* real size*/ bzero( mp->mnt_stat.f_mntonname + size, MNAMELEN - size); /* Save "mounted from" info for mount point (NULL pad)*/ copyinstr( args.fspec, /* device name*/ mp->mnt_stat.f_mntfromname, /* save area*/ MNAMELEN - 1, /* max size*/ &size); /* real size*/ bzero( mp->mnt_stat.f_mntfromname + size, MNAMELEN - size); err = ntfs_mountfs(devvp, mp, &args, p); } if (err) { goto error_2; } dostatfs: /* * Initialize FS stat information in mount struct; uses both * mp->mnt_stat.f_mntonname and mp->mnt_stat.f_mntfromname * * This code is common to root and non-root mounts */ (void)VFS_STATFS(mp, &mp->mnt_stat, p); goto success; error_2: /* error with devvp held*/ /* release devvp before failing*/ vrele(devvp); error_1: /* no state to back out*/ success: return( err); } /* * Common code for mount and mountroot */ int ntfs_mountfs(devvp, mp, argsp, p) register struct vnode *devvp; struct mount *mp; struct ntfs_args *argsp; struct proc *p; { struct buf *bp; struct ntfsmount *ntmp; dev_t dev = devvp->v_rdev; int error, ronly, ncount, i; struct vnode *vp; /* * Disallow multiple mounts of the same device. * Disallow mounting of a device that is currently in use * (except for root, which might share swap device for miniroot). * Flush out any old buffers remaining from a previous use. */ error = vfs_mountedon(devvp); if (error) return (error); ncount = vcount(devvp); #if defined(__FreeBSD__) if (devvp->v_object) ncount -= 1; #endif if (ncount > 1 && devvp != rootvp) return (EBUSY); #if defined(__FreeBSD__) vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, p); error = vinvalbuf(devvp, V_SAVE, p->p_ucred, p, 0, 0); VOP_UNLOCK(devvp, 0, p); #else error = vinvalbuf(devvp, V_SAVE, p->p_ucred, p, 0, 0); #endif if (error) return (error); ronly = (mp->mnt_flag & MNT_RDONLY) != 0; error = VOP_OPEN(devvp, ronly ? FREAD : FREAD|FWRITE, FSCRED, p); if (error) return (error); bp = NULL; error = bread(devvp, BBLOCK, BBSIZE, NOCRED, &bp); if (error) goto out; ntmp = malloc( sizeof *ntmp, M_NTFSMNT, M_WAITOK ); bzero( ntmp, sizeof *ntmp ); bcopy( bp->b_data, &ntmp->ntm_bootfile, sizeof(struct bootfile) ); brelse( bp ); bp = NULL; if (strncmp(ntmp->ntm_bootfile.bf_sysid, NTFS_BBID, NTFS_BBIDLEN)) { error = EINVAL; printf("ntfs_mountfs: invalid boot block\n"); goto out; } { int8_t cpr = ntmp->ntm_mftrecsz; if( cpr > 0 ) ntmp->ntm_bpmftrec = ntmp->ntm_spc * cpr; else ntmp->ntm_bpmftrec = (1 << (-cpr)) / ntmp->ntm_bps; } dprintf(("ntfs_mountfs(): bps: %d, spc: %d, media: %x, mftrecsz: %d (%d sects)\n", ntmp->ntm_bps,ntmp->ntm_spc,ntmp->ntm_bootfile.bf_media, ntmp->ntm_mftrecsz,ntmp->ntm_bpmftrec)); dprintf(("ntfs_mountfs(): mftcn: 0x%x|0x%x\n", (u_int32_t)ntmp->ntm_mftcn,(u_int32_t)ntmp->ntm_mftmirrcn)); ntmp->ntm_mountp = mp; ntmp->ntm_dev = dev; ntmp->ntm_devvp = devvp; ntmp->ntm_uid = argsp->uid; ntmp->ntm_gid = argsp->gid; ntmp->ntm_mode = argsp->mode; ntmp->ntm_flag = argsp->flag; mp->mnt_data = (qaddr_t)ntmp; dprintf(("ntfs_mountfs(): case-%s,%s uid: %d, gid: %d, mode: %o\n", (ntmp->ntm_flag & NTFS_MFLAG_CASEINS)?"insens.":"sens.", (ntmp->ntm_flag & NTFS_MFLAG_ALLNAMES)?" allnames,":"", ntmp->ntm_uid, ntmp->ntm_gid, ntmp->ntm_mode)); /* * We read in some system nodes to do not allow * reclaim them and to have everytime access to them. 
*/ { int pi[3] = { NTFS_MFTINO, NTFS_ROOTINO, NTFS_BITMAPINO }; for (i=0; i<3; i++) { error = VFS_VGET(mp, pi[i], &(ntmp->ntm_sysvn[pi[i]])); if(error) goto out1; ntmp->ntm_sysvn[pi[i]]->v_flag |= VSYSTEM; VREF(ntmp->ntm_sysvn[pi[i]]); vput(ntmp->ntm_sysvn[pi[i]]); } } /* * Read in WHOLE lowcase -> upcase translation * file. */ MALLOC(ntmp->ntm_upcase, wchar *, 65536 * sizeof(wchar), M_NTFSMNT, M_WAITOK); error = VFS_VGET(mp, NTFS_UPCASEINO, &vp); if(error) goto out1; error = ntfs_readattr(ntmp, VTONT(vp), NTFS_A_DATA, NULL, 0, 65536*sizeof(wchar), ntmp->ntm_upcase); vput(vp); if(error) goto out1; /* * Scan $BitMap and count free clusters */ error = ntfs_calccfree(ntmp, &ntmp->ntm_cfree); if(error) goto out1; /* * Read and translate to internal format attribute * definition file. */ { int num,j; struct attrdef ad; /* Open $AttrDef */ error = VFS_VGET(mp, NTFS_ATTRDEFINO, &vp ); if(error) goto out1; /* Count valid entries */ for(num=0;;num++) { error = ntfs_readattr(ntmp, VTONT(vp), NTFS_A_DATA, NULL, num * sizeof(ad), sizeof(ad), &ad); if (error) goto out1; if (ad.ad_name[0] == 0) break; } /* Alloc memory for attribute definitions */ MALLOC(ntmp->ntm_ad, struct ntvattrdef *, num * sizeof(struct ntvattrdef), M_NTFSMNT, M_WAITOK); ntmp->ntm_adnum = num; /* Read them and translate */ for(i=0;intm_ad[i].ad_name[j] = ad.ad_name[j]; } while(ad.ad_name[j++]); ntmp->ntm_ad[i].ad_namelen = j - 1; ntmp->ntm_ad[i].ad_type = ad.ad_type; } vput(vp); } mp->mnt_stat.f_fsid.val[0] = dev2udev(dev); #if defined(__FreeBSD__) mp->mnt_stat.f_fsid.val[1] = mp->mnt_vfc->vfc_typenum; #else mp->mnt_stat.f_fsid.val[1] = makefstype(MOUNT_NTFS); #endif mp->mnt_maxsymlinklen = 0; mp->mnt_flag |= MNT_LOCAL; #if defined(__FreeBSD__) devvp->v_specmountpoint = mp; #else devvp->v_specflags |= SI_MOUNTEDON; #endif return (0); out1: for(i=0;intm_sysvn[i]) vrele(ntmp->ntm_sysvn[i]); if (vflush(mp,NULLVP,0)) printf("ntfs_mountfs: vflush failed\n"); out: #if defined(__FreeBSD__) devvp->v_specmountpoint = NULL; #else devvp->v_specflags &= ~SI_MOUNTEDON; #endif if (bp) brelse(bp); (void)VOP_CLOSE(devvp, ronly ? FREAD : FREAD|FWRITE, NOCRED, p); return (error); } static int ntfs_start ( struct mount *mp, int flags, struct proc *p ) { return (0); } static int ntfs_unmount( struct mount *mp, int mntflags, struct proc *p) { register struct ntfsmount *ntmp; int error, ronly = 0, flags, i; dprintf(("ntfs_unmount: unmounting...\n")); ntmp = VFSTONTFS(mp); flags = 0; if(mntflags & MNT_FORCE) flags |= FORCECLOSE; dprintf(("ntfs_unmount: vflushing...\n")); error = vflush(mp,NULLVP,flags | SKIPSYSTEM); if (error) { printf("ntfs_unmount: vflush failed: %d\n",error); return (error); } /* Check if only system vnodes are rest */ for(i=0;intm_sysvn[i]) && (ntmp->ntm_sysvn[i]->v_usecount > 1)) return (EBUSY); /* Derefernce all system vnodes */ for(i=0;intm_sysvn[i]) vrele(ntmp->ntm_sysvn[i]); /* vflush system vnodes */ error = vflush(mp,NULLVP,flags); if (error) printf("ntfs_unmount: vflush failed(sysnodes): %d\n",error); #if defined(__FreeBSD__) ntmp->ntm_devvp->v_specmountpoint = NULL; #else ntmp->ntm_devvp->v_specflags &= ~SI_MOUNTEDON; #endif vinvalbuf(ntmp->ntm_devvp, V_SAVE, NOCRED, p, 0, 0); error = VOP_CLOSE(ntmp->ntm_devvp, ronly ? 
FREAD : FREAD|FWRITE, NOCRED, p); vrele(ntmp->ntm_devvp); dprintf(("ntfs_umount: freeing memory...\n")); mp->mnt_data = (qaddr_t)0; mp->mnt_flag &= ~MNT_LOCAL; FREE(ntmp->ntm_ad, M_NTFSMNT); FREE(ntmp->ntm_upcase, M_NTFSMNT); FREE(ntmp, M_NTFSMNT); return (error); } static int ntfs_root( struct mount *mp, struct vnode **vpp ) { struct vnode *nvp; int error = 0; dprintf(("ntfs_root(): sysvn: %p\n", VFSTONTFS(mp)->ntm_sysvn[NTFS_ROOTINO])); error = VFS_VGET(mp, (ino_t)NTFS_ROOTINO, &nvp); if(error) { printf("ntfs_root: VFS_VGET failed: %d\n",error); return (error); } *vpp = nvp; return (0); } static int ntfs_quotactl ( struct mount *mp, int cmds, uid_t uid, caddr_t arg, struct proc *p) { printf("\nntfs_quotactl():\n"); return EOPNOTSUPP; } int ntfs_calccfree( struct ntfsmount *ntmp, cn_t *cfreep) { struct vnode *vp; u_int8_t *tmp; int j, error; long cfree = 0; size_t bmsize, i; vp = ntmp->ntm_sysvn[NTFS_BITMAPINO]; bmsize = VTOF(vp)->f_size; MALLOC(tmp, u_int8_t *, bmsize, M_TEMP, M_WAITOK); error = ntfs_readattr(ntmp, VTONT(vp), NTFS_A_DATA, NULL, 0, bmsize, tmp); if(error) { FREE(tmp, M_TEMP); return (error); } for(i=0;intm_sysvn[NTFS_MFTINO])->f_size; mftallocated = VTOF(ntmp->ntm_sysvn[NTFS_MFTINO])->f_allocated; #if defined(__FreeBSD__) sbp->f_type = mp->mnt_vfc->vfc_typenum; #elif defined(__NetBSD__) sbp->f_type = 0; #else sbp->f_type = MOUNT_NTFS; #endif sbp->f_bsize = ntmp->ntm_bps; sbp->f_iosize = ntmp->ntm_bps * ntmp->ntm_spc; sbp->f_blocks = ntmp->ntm_bootfile.bf_spv; sbp->f_bfree = sbp->f_bavail = ntfs_cntobn(ntmp->ntm_cfree); sbp->f_ffree = sbp->f_bfree / ntmp->ntm_bpmftrec; sbp->f_files = mftallocated / ntfs_bntob(ntmp->ntm_bpmftrec) + sbp->f_ffree; if (sbp != &mp->mnt_stat) { bcopy((caddr_t)mp->mnt_stat.f_mntonname, (caddr_t)&sbp->f_mntonname[0], MNAMELEN); bcopy((caddr_t)mp->mnt_stat.f_mntfromname, (caddr_t)&sbp->f_mntfromname[0], MNAMELEN); } sbp->f_flags = mp->mnt_flag; return (0); } static int ntfs_sync ( struct mount *mp, int waitfor, struct ucred *cred, struct proc *p) { /*dprintf(("ntfs_sync():\n"));*/ return (0); } /*ARGSUSED*/ static int ntfs_fhtovp( #if defined(__FreeBSD__) struct mount *mp, struct fid *fhp, struct sockaddr *nam, struct vnode **vpp, int *exflagsp, struct ucred **credanonp) #elif defined(__NetBSD__) struct mount *mp, struct fid *fhp, struct vnode **vpp) #else struct mount *mp, struct fid *fhp, struct mbuf *nam, struct vnode **vpp, int *exflagsp, struct ucred **credanonp) #endif { printf("\ntfs_fhtovp():\n"); return 0; } static int ntfs_vptofh( struct vnode *vp, struct fid *fhp) { printf("ntfs_vptofh():\n"); return EOPNOTSUPP; } int ntfs_vgetex( struct mount *mp, ino_t ino, u_int32_t attrtype, char *attrname, u_long lkflags, u_long flags, struct proc *p, struct vnode **vpp) { int error; register struct ntfsmount *ntmp; struct ntnode *ip; struct fnode *fp; struct vnode *vp; dprintf(("ntfs_vgetex: ino: %d, attr: 0x%x:%s, lkf: 0x%x, f: 0x%x\n", ino, attrtype, attrname?attrname:"", lkflags, flags )); ntmp = VFSTONTFS(mp); *vpp = NULL; /* Get ntnode */ error = ntfs_ntlookup(ntmp, ino, &ip); if (error) { printf("ntfs_vget: ntfs_ntget failed\n"); return (error); } /* It may be not initialized fully, so force load it */ if (!(flags & VG_DONTLOADIN) && !(ip->i_flag & IN_LOADED)) { error = ntfs_loadntnode(ntmp, ip); if(error) { printf("ntfs_vget: CAN'T LOAD ATTRIBUTES FOR INO: %d\n", ip->i_number); ntfs_ntput(ip); return (error); } } error = ntfs_fget(ntmp, ip, attrtype, attrname, &fp); if (error) { printf("ntfs_vget: ntfs_fget failed\n"); ntfs_ntput(ip); return 
(error); } if (!(flags & VG_DONTVALIDFN) && !(fp->f_flag & FN_VALID)) { if ((ip->i_frflag & NTFS_FRFLAG_DIR) && (fp->f_attrtype == 0x80 && fp->f_attrname == NULL)) { fp->f_type = VDIR; } else if(flags & VG_EXT) { fp->f_type = VNON; fp->f_size =fp->f_allocated = 0; } else { fp->f_type = VREG; error = ntfs_filesize(ntmp, fp, &fp->f_size, &fp->f_allocated); if (error) { ntfs_ntput(ip); return (error); } } fp->f_flag |= FN_VALID; } if (FTOV(fp)) { VGET(FTOV(fp), lkflags, p); *vpp = FTOV(fp); ntfs_ntput(ip); return (0); } error = getnewvnode(VT_NTFS, ntmp->ntm_mountp, ntfs_vnodeop_p, &vp); if(error) { ntfs_frele(fp); ntfs_ntput(ip); return (error); } dprintf(("ntfs_vget: vnode: %p for ntnode: %d\n", vp,ino)); lockinit(&fp->f_lock, PINOD, "fnode", 0, 0); fp->f_vp = vp; vp->v_data = fp; vp->v_type = fp->f_type; if (ino == NTFS_ROOTINO) vp->v_flag |= VROOT; ntfs_ntput(ip); if (lkflags & LK_TYPE_MASK) { error = VN_LOCK(vp, lkflags, p); if (error) { vput(vp); return (error); } } VREF(fp->f_devvp); *vpp = vp; return (0); } static int ntfs_vget( struct mount *mp, ino_t ino, struct vnode **vpp) { return ntfs_vgetex(mp, ino, NTFS_A_DATA, NULL, LK_EXCLUSIVE, 0, curproc, vpp); } #if defined(__FreeBSD__) static struct vfsops ntfs_vfsops = { ntfs_mount, ntfs_start, ntfs_unmount, ntfs_root, ntfs_quotactl, ntfs_statfs, ntfs_sync, ntfs_vget, ntfs_fhtovp, ntfs_vptofh, ntfs_init, NULL }; VFS_SET(ntfs_vfsops, ntfs, 0); #elif defined(__NetBSD__) extern struct vnodeopv_desc ntfs_vnodeop_opv_desc; struct vnodeopv_desc *ntfs_vnodeopv_descs[] = { &ntfs_vnodeop_opv_desc, NULL, }; struct vfsops ntfs_vfsops = { MOUNT_NTFS, ntfs_mount, ntfs_start, ntfs_unmount, ntfs_root, ntfs_quotactl, ntfs_statfs, ntfs_sync, ntfs_vget, ntfs_fhtovp, ntfs_vptofh, ntfs_init, ntfs_sysctl, ntfs_mountroot, ntfs_checkexp, ntfs_vnodeopv_descs, }; #else static struct vfsops ntfs_vfsops = { ntfs_mount, ntfs_start, ntfs_unmount, ntfs_root, ntfs_quotactl, ntfs_statfs, ntfs_sync, ntfs_vget, ntfs_fhtovp, ntfs_vptofh, ntfs_init, }; VFS_SET(ntfs_vfsops, ntfs, MOUNT_NTFS, 0); #endif Index: head/sys/fs/ntfs/ntfs_vnops.c =================================================================== --- head/sys/fs/ntfs/ntfs_vnops.c (revision 49534) +++ head/sys/fs/ntfs/ntfs_vnops.c (revision 49535) @@ -1,1030 +1,1029 @@ /* $NetBSD: ntfs_vnops.c,v 1.2 1999/05/06 15:43:20 christos Exp $ */ /* * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * John Heidemann of the UCLA Ficus project. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $Id: ntfs_vnops.c,v 1.4 1999/05/11 19:54:52 phk Exp $ + * $Id: ntfs_vnops.c,v 1.5 1999/05/12 09:43:06 semenu Exp $ * */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(__FreeBSD__) #include #endif #include #include /*#define NTFS_DEBUG 1*/ #include #include #include #include -#include static int ntfs_bypass __P((struct vop_generic_args *ap)); static int ntfs_read __P((struct vop_read_args *)); static int ntfs_write __P((struct vop_write_args *ap)); static int ntfs_getattr __P((struct vop_getattr_args *ap)); static int ntfs_inactive __P((struct vop_inactive_args *ap)); static int ntfs_print __P((struct vop_print_args *ap)); static int ntfs_reclaim __P((struct vop_reclaim_args *ap)); static int ntfs_strategy __P((struct vop_strategy_args *ap)); #if defined(__NetBSD__) static int ntfs_islocked __P((struct vop_islocked_args *ap)); static int ntfs_unlock __P((struct vop_unlock_args *ap)); static int ntfs_lock __P((struct vop_lock_args *ap)); #endif static int ntfs_access __P((struct vop_access_args *ap)); static int ntfs_open __P((struct vop_open_args *ap)); static int ntfs_close __P((struct vop_close_args *ap)); static int ntfs_readdir __P((struct vop_readdir_args *ap)); static int ntfs_lookup __P((struct vop_lookup_args *ap)); static int ntfs_bmap __P((struct vop_bmap_args *ap)); #if defined(__FreeBSD__) static int ntfs_getpages __P((struct vop_getpages_args *ap)); static int ntfs_putpages __P((struct vop_putpages_args *)); #endif static int ntfs_fsync __P((struct vop_fsync_args *ap)); int ntfs_prtactive = 1; /* 1 => print out reclaim of active vnodes */ #if defined(__FreeBSD__) int ntfs_getpages(ap) struct vop_getpages_args *ap; { return vnode_pager_generic_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_reqpage); } int ntfs_putpages(ap) struct vop_putpages_args *ap; { return vnode_pager_generic_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync, ap->a_rtvals); } #endif /* * This is a noop, simply returning what one has been given. 
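 *
 * Put differently, for every logical block a_bn the answers handed back
 * below are:
 *
 *	*a_vpp  = a_vp		the "device" is this ntfs vnode itself
 *	*a_bnp  = a_bn		identity mapping, no translation
 *	*a_runp = 0		never promise contiguous blocks, so the
 *				clustering code stays out of the way
 *	*a_runb = 0		(where that field exists on this platform)
 *
 * so generic callers get a well-defined answer while the real work is
 * still done through ntfs_readattr() in the read and strategy paths.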
*/ int ntfs_bmap(ap) struct vop_bmap_args /* { struct vnode *a_vp; daddr_t a_bn; struct vnode **a_vpp; daddr_t *a_bnp; int *a_runp; int *a_runb; } */ *ap; { dprintf(("ntfs_bmap: vn: %p, blk: %d\n", ap->a_vp,(u_int32_t)ap->a_bn)); if (ap->a_vpp != NULL) *ap->a_vpp = ap->a_vp; if (ap->a_bnp != NULL) *ap->a_bnp = ap->a_bn; if (ap->a_runp != NULL) *ap->a_runp = 0; #if !defined(__NetBSD__) if (ap->a_runb != NULL) *ap->a_runb = 0; #endif return (0); } static int ntfs_read(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct fnode *fp = VTOF(vp); register struct ntnode *ip = FTONT(fp); struct uio *uio = ap->a_uio; struct ntfsmount *ntmp = ip->i_mp; u_int8_t *data; u_int64_t toread; int error; dprintf(("ntfs_read: ino: %d, off: %d resid: %d, segflg: %d\n",ip->i_number,(u_int32_t)uio->uio_offset,uio->uio_resid,uio->uio_segflg)); toread = fp->f_size; dprintf(("ntfs_read: filesize: %d",(u_int32_t)toread)); toread = min( uio->uio_resid, toread - uio->uio_offset ); dprintf((", toread: %d\n",(u_int32_t)toread)); MALLOC(data, u_int8_t *, toread, M_TEMP,M_WAITOK); error = ntfs_readattr(ntmp, ip, fp->f_attrtype, fp->f_attrname, uio->uio_offset, toread, data); if(error) { printf("ntfs_read: ntfs_readattr failed: %d\n",error); FREE(data, M_TEMP); return (error); } error = uiomove(data, (int) toread, uio); if(error) { printf("ntfs_read: uiomove failed: %d\n",error); FREE(data, M_TEMP); return (error); } FREE(data, M_TEMP); return (0); } static int ntfs_bypass(ap) struct vop_generic_args /* { struct vnodeop_desc *a_desc; } */ *ap; { int error = ENOTTY; dprintf(("ntfs_bypass: %s\n", ap->a_desc->vdesc_name)); return (error); } static int ntfs_getattr(ap) struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct proc *a_p; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct fnode *fp = VTOF(vp); register struct ntnode *ip = FTONT(fp); register struct vattr *vap = ap->a_vap; dprintf(("ntfs_getattr: %d, flags: %d\n",ip->i_number,ip->i_flag)); vap->va_fsid = dev2udev(fp->f_dev); vap->va_fileid = ip->i_number; vap->va_mode = ip->i_mode; vap->va_nlink = ip->i_nlink; vap->va_uid = ip->i_uid; vap->va_gid = ip->i_gid; vap->va_rdev = 0; /* XXX UNODEV ? */ vap->va_size = fp->f_size; vap->va_bytes = fp->f_allocated; vap->va_atime = ntfs_nttimetounix(fp->f_times.t_access); vap->va_mtime = ntfs_nttimetounix(fp->f_times.t_write); vap->va_ctime = ntfs_nttimetounix(fp->f_times.t_create); vap->va_flags = ip->i_flag; vap->va_gen = 0; vap->va_blocksize = ip->i_mp->ntm_spc * ip->i_mp->ntm_bps; vap->va_type = fp->f_type; vap->va_filerev = 0; return (0); } /* * Last reference to an ntnode. If necessary, write or delete it. */ int ntfs_inactive(ap) struct vop_inactive_args /* { struct vnode *a_vp; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct ntnode *ip = VTONT(vp); int error; dprintf(("ntfs_inactive: vnode: %p, ntnode: %d\n", vp, ip->i_number)); if (ntfs_prtactive && vp->v_usecount != 0) vprint("ntfs_inactive: pushing active", vp); error = 0; VOP__UNLOCK(vp,0,ap->a_p); /* * If we are done with the ntnode, reclaim it * so that it can be reused immediately. */ if (vp->v_usecount == 0 && ip->i_mode == 0) #if defined(__FreeBSD__) vrecycle(vp, (struct simplelock *)0, ap->a_p); #else /* defined(__NetBSD__) */ vgone(vp); #endif return (error); } /* * Reclaim an inode so that it can be used for other purposes. 
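 *
 * The body below runs a fixed teardown sequence; condensed here for
 * orientation:
 *
 *	ntfs_ntget(ip);			grab the underlying ntnode
 *	cache_purge(vp);		drop name cache entries for vp
 *	vrele(fp->f_devvp);		release the device vnode, if set
 *	ntfs_frele(fp);			free the fnode hung off v_data
 *	vp->v_data = NULL;		detach the vnode
 *	ntfs_ntput(ip);			drop (and possibly free) the ntnode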
*/ int ntfs_reclaim(ap) struct vop_reclaim_args /* { struct vnode *a_vp; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct fnode *fp = VTOF(vp); register struct ntnode *ip = FTONT(fp); int error; dprintf(("ntfs_reclaim: vnode: %p, ntnode: %d\n", vp, ip->i_number)); error = ntfs_ntget(ip); if (error) return (error); #if defined(__FreeBSD__) VOP__UNLOCK(vp,0,ap->a_p); #endif /* Purge old data structures associated with the inode. */ cache_purge(vp); if (fp->f_devvp) { vrele(fp->f_devvp); fp->f_devvp = NULL; } ntfs_frele(fp); vp->v_data = NULL; ntfs_ntput(ip); return (0); } static int ntfs_print(ap) struct vop_print_args /* { struct vnode *a_vp; } */ *ap; { /* printf("[ntfs_print]");*/ return (0); } /* * Calculate the logical to physical mapping if not done already, * then call the device strategy routine. */ int ntfs_strategy(ap) struct vop_strategy_args /* { struct buf *a_bp; } */ *ap; { register struct buf *bp = ap->a_bp; register struct vnode *vp = bp->b_vp; register struct fnode *fp = VTOF(vp); register struct ntnode *ip = FTONT(fp); struct ntfsmount *ntmp = ip->i_mp; int error; dprintf(("ntfs_strategy: offset: %d, blkno: %d, lblkno: %d\n", (u_int32_t)bp->b_offset,(u_int32_t)bp->b_blkno, (u_int32_t)bp->b_lblkno)); dprintf(("strategy: bcount: %d flags: 0x%x\n", (u_int32_t)bp->b_bcount,bp->b_flags)); if (bp->b_flags & B_READ) { u_int32_t toread; if (ntfs_cntob(bp->b_blkno) >= fp->f_size) { clrbuf(bp); error = 0; } else { toread = min(bp->b_bcount, fp->f_size-ntfs_cntob(bp->b_blkno)); dprintf(("ntfs_strategy: toread: %d, fsize: %d\n", toread,(u_int32_t)fp->f_size)); error = ntfs_readattr(ntmp, ip, fp->f_attrtype, fp->f_attrname, ntfs_cntob(bp->b_blkno), toread, bp->b_data); if (error) { printf("ntfs_strategy: ntfs_readattr failed\n"); bp->b_error = error; bp->b_flags |= B_ERROR; } bzero(bp->b_data + toread, bp->b_bcount - toread); } } else { size_t tmp; u_int32_t towrite; if (ntfs_cntob(bp->b_blkno) + bp->b_bcount >= fp->f_size) { printf("ntfs_strategy: CAN'T EXTEND FILE\n"); bp->b_error = error = EFBIG; bp->b_flags |= B_ERROR; } else { towrite = min(bp->b_bcount, fp->f_size-ntfs_cntob(bp->b_blkno)); dprintf(("ntfs_strategy: towrite: %d, fsize: %d\n", towrite,(u_int32_t)fp->f_size)); error = ntfs_writeattr_plain(ntmp, ip, fp->f_attrtype, fp->f_attrname, ntfs_cntob(bp->b_blkno),towrite, bp->b_data, &tmp); if (error) { printf("ntfs_strategy: ntfs_writeattr fail\n"); bp->b_error = error; bp->b_flags |= B_ERROR; } } } biodone(bp); return (error); } static int ntfs_write(ap) struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct fnode *fp = VTOF(vp); register struct ntnode *ip = FTONT(fp); struct uio *uio = ap->a_uio; struct ntfsmount *ntmp = ip->i_mp; u_int8_t *data; u_int64_t towrite; off_t off; size_t written; int error; dprintf(("ntfs_write: ino: %d, off: %d resid: %d, segflg: %d\n",ip->i_number,(u_int32_t)uio->uio_offset,uio->uio_resid,uio->uio_segflg)); towrite = fp->f_size; dprintf(("ntfs_write: filesize: %d",(u_int32_t)towrite)); if (uio->uio_resid + uio->uio_offset > towrite) { printf("ntfs_write: CAN'T WRITE BEYOND OF FILE\n"); return (EFBIG); } towrite = min(uio->uio_resid, towrite - uio->uio_offset); off = uio->uio_offset; dprintf((", towrite: %d\n",(u_int32_t)towrite)); MALLOC(data, u_int8_t *, towrite, M_TEMP,M_WAITOK); error = uiomove(data, (int) towrite, uio); if(error) { FREE(data, M_TEMP); return (error); } error = ntfs_writeattr_plain(ntmp, ip, 
fp->f_attrtype, fp->f_attrname, off, towrite, data, &written); if(error) { printf("ntfs_write: ntfs_writeattr failed: %d\n",error); FREE(data, M_TEMP); return (error); } FREE(data, M_TEMP); return (0); } #if defined(__NetBSD__) /* * Check for a locked ntnode. */ int ntfs_islocked(ap) struct vop_islocked_args /* { struct vnode *a_vp; } */ *ap; { register struct ntnode *ip = VTONT(ap->a_vp); dprintf(("ntfs_islocked %d\n",ip->i_number)); if (ip->i_flag & IN_LOCKED) return (1); return (0); } /* * Unlock an ntnode. If WANT bit is on, wakeup. */ int ntfs_lockcount = 90; int ntfs_unlock(ap) struct vop_unlock_args /* { struct vnode *a_vp; } */ *ap; { register struct ntnode *ip = VTONT(ap->a_vp); #ifdef DIAGNOSTIC struct proc *p = curproc; #endif dprintf(("ntfs_unlock %d\n",ip->i_number)); #ifdef DIAGNOSTIC if ((ip->i_flag & IN_LOCKED) == 0) { vprint("ntfs_unlock: unlocked ntnode", ap->a_vp); panic("ntfs_unlock NOT LOCKED"); } if (p && p->p_pid != ip->i_lockholder && p->p_pid > -1 && ip->i_lockholder > -1 && ntfs_lockcount++ < 100) panic("unlocker (%d) != lock holder (%d)", p->p_pid, ip->i_lockholder); #endif if (--ip->i_lockcount > 0) { if ((ip->i_flag & IN_RECURSE) == 0) panic("ntfs_unlock: recursive lock prematurely released, pid=%d\n", ip->i_lockholder); return (0); } ip->i_lockholder = 0; ip->i_flag &= ~(IN_LOCKED|IN_RECURSE); if (ip->i_flag & IN_WANTED) { ip->i_flag &= ~IN_WANTED; wakeup((caddr_t)ip); } return (0); } /* * Lock an ntnode. If its already locked, set the WANT bit and sleep. */ int ntfs_lock(ap) struct vop_lock_args /* { struct vnode *a_vp; } */ *ap; { struct proc *p = curproc; register struct vnode *vp = ap->a_vp; register struct ntnode *ip = VTONT(vp); dprintf(("ntfs_lock %d (%d locks)\n",ip->i_number,ip->i_lockcount)); start: while (vp->v_flag & VXLOCK) { vp->v_flag |= VXWANT; (void) tsleep((caddr_t)vp, PINOD, "ntflk1", 0); } if (vp->v_tag == VT_NON) return (ENOENT); ip = VTONT(vp); if (ip->i_flag & IN_LOCKED) { if (p->p_pid == ip->i_lockholder) { if( (ip->i_flag & IN_RECURSE) == 0) panic("ntfs_lock: recursive lock not expected, pid: %d\n", ip->i_lockholder); } else { ip->i_flag |= IN_WANTED; #ifdef DIAGNOSTIC if (p) ip->i_lockwaiter = p->p_pid; else ip->i_lockwaiter = -1; #endif (void) tsleep((caddr_t)ip, PINOD, "ntflk2", 0); goto start; } } #ifdef DIAGNOSTIC ip->i_lockwaiter = 0; if (((ip->i_flag & IN_RECURSE) == 0) && (ip->i_lockholder != 0)) panic("lockholder (%d) != 0", ip->i_lockholder); if (p && p->p_pid == 0) printf("locking by process 0\n"); #endif if ((ip->i_flag & IN_RECURSE) == 0) ip->i_lockcount = 1; else ++ip->i_lockcount; if (p) ip->i_lockholder = p->p_pid; else ip->i_lockholder = -1; ip->i_flag |= IN_LOCKED; return (0); } #endif int ntfs_access(ap) struct vop_access_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct proc *a_p; } */ *ap; { struct vnode *vp = ap->a_vp; struct ntnode *ip = VTONT(vp); struct ucred *cred = ap->a_cred; mode_t mask, mode = ap->a_mode; register gid_t *gp; int i; #ifdef QUOTA int error; #endif dprintf(("ntfs_access: %d\n",ip->i_number)); /* * Disallow write attempts on read-only file systems; * unless the file is a socket, fifo, or a block or * character device resident on the file system. */ if (mode & VWRITE) { switch ((int)vp->v_type) { case VDIR: case VLNK: case VREG: if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); #ifdef QUOTA if (error = getinoquota(ip)) return (error); #endif break; } } /* If immutable bit set, nobody gets to write it. 
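 * (The corresponding test is left commented out below.)  The check that
 * does run is the classic owner/group/other walk: collect the S_I{R,W,X}*
 * bits matching the requested access and require all of them in i_mode.
 * The owner case in isolation, with a hypothetical "req" standing for the
 * requested VREAD/VWRITE/VEXEC bits, boils down to:
 *
 *	mode_t mask = 0;
 *	if (req & VREAD)  mask |= S_IRUSR;
 *	if (req & VWRITE) mask |= S_IWUSR;
 *	if (req & VEXEC)  mask |= S_IXUSR;
 *	return ((ip->i_mode & mask) == mask ? 0 : EACCES);
 *
 * and the group and other cases repeat the pattern with the GRP/OTH bits.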
*/ /* if ((mode & VWRITE) && (ip->i_flags & IMMUTABLE)) return (EPERM); */ /* Otherwise, user id 0 always gets access. */ if (cred->cr_uid == 0) return (0); mask = 0; /* Otherwise, check the owner. */ if (cred->cr_uid == ip->i_uid) { if (mode & VEXEC) mask |= S_IXUSR; if (mode & VREAD) mask |= S_IRUSR; if (mode & VWRITE) mask |= S_IWUSR; return ((ip->i_mode & mask) == mask ? 0 : EACCES); } /* Otherwise, check the groups. */ for (i = 0, gp = cred->cr_groups; i < cred->cr_ngroups; i++, gp++) if (ip->i_gid == *gp) { if (mode & VEXEC) mask |= S_IXGRP; if (mode & VREAD) mask |= S_IRGRP; if (mode & VWRITE) mask |= S_IWGRP; return ((ip->i_mode & mask) == mask ? 0 : EACCES); } /* Otherwise, check everyone else. */ if (mode & VEXEC) mask |= S_IXOTH; if (mode & VREAD) mask |= S_IROTH; if (mode & VWRITE) mask |= S_IWOTH; return ((ip->i_mode & mask) == mask ? 0 : EACCES); } /* * Open called. * * Nothing to do. */ /* ARGSUSED */ static int ntfs_open(ap) struct vop_open_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct proc *a_p; } */ *ap; { #if NTFS_DEBUG register struct vnode *vp = ap->a_vp; register struct ntnode *ip = VTONT(vp); printf("ntfs_open: %d\n",ip->i_number); #endif /* * Files marked append-only must be opened for appending. */ return (0); } /* * Close called. * * Update the times on the inode. */ /* ARGSUSED */ static int ntfs_close(ap) struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct proc *a_p; } */ *ap; { #if NTFS_DEBUG register struct vnode *vp = ap->a_vp; register struct ntnode *ip = VTONT(vp); printf("ntfs_close: %d\n",ip->i_number); #endif return (0); } int ntfs_readdir(ap) struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_ncookies; u_int **cookies; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct fnode *fp = VTOF(vp); register struct ntnode *ip = FTONT(fp); struct uio *uio = ap->a_uio; struct ntfsmount *ntmp = ip->i_mp; int i, error = 0; u_int32_t faked = 0, num; int ncookies = 0; struct dirent cde; off_t off; dprintf(("ntfs_readdir %d off: %d resid: %d\n",ip->i_number,(u_int32_t)uio->uio_offset,uio->uio_resid)); off = uio->uio_offset; /* Simulate . in every dir except ROOT */ if( ip->i_number != NTFS_ROOTINO ) { struct dirent dot = { NTFS_ROOTINO, sizeof(struct dirent), DT_DIR, 1, "." }; if( uio->uio_offset < sizeof(struct dirent) ) { dot.d_fileno = ip->i_number; error = uiomove((char *)&dot,sizeof(struct dirent),uio); if(error) return (error); ncookies ++; } } /* Simulate .. in every dir including ROOT */ if( uio->uio_offset < 2 * sizeof(struct dirent) ) { struct dirent dotdot = { NTFS_ROOTINO, sizeof(struct dirent), DT_DIR, 2, ".." }; error = uiomove((char *)&dotdot,sizeof(struct dirent),uio); if(error) return (error); ncookies ++; } faked = (ip->i_number == NTFS_ROOTINO) ? 
1 : 2; num = uio->uio_offset / sizeof(struct dirent) - faked; while( uio->uio_resid >= sizeof(struct dirent) ) { struct attr_indexentry *iep; error = ntfs_ntreaddir(ntmp, fp, num, &iep); if(error) return (error); if( NULL == iep ) break; while( !(iep->ie_flag & NTFS_IEFLAG_LAST) && (uio->uio_resid >= sizeof(struct dirent)) ) { if( ntfs_isnamepermitted(ntmp,iep) ) { dprintf(("ntfs_readdir: elem: %d, fname:[",num)); for(i=0;iie_fnamelen;i++) { cde.d_name[i] = (char)iep->ie_fname[i]; dprintf(("%c", cde.d_name[i])); } dprintf(("] type: %d, flag: %d, ",iep->ie_fnametype, iep->ie_flag)); cde.d_name[i] = '\0'; cde.d_namlen = iep->ie_fnamelen; cde.d_fileno = iep->ie_number; cde.d_type = (iep->ie_fflag & NTFS_FFLAG_DIR) ? DT_DIR : DT_REG; cde.d_reclen = sizeof(struct dirent); dprintf(("%s\n", (cde.d_type == DT_DIR) ? "dir":"reg")); error = uiomove((char *)&cde, sizeof(struct dirent), uio); if(error) return (error); ncookies++; num++; } iep = NTFS_NEXTREC(iep,struct attr_indexentry *); } } dprintf(("ntfs_readdir: %d entries (%d bytes) read\n", ncookies,(u_int)(uio->uio_offset - off))); dprintf(("ntfs_readdir: off: %d resid: %d\n", (u_int32_t)uio->uio_offset,uio->uio_resid)); if (!error && ap->a_ncookies != NULL) { struct dirent* dpStart; struct dirent* dp; #if defined(__FreeBSD__) u_long *cookies; u_long *cookiep; #else /* defined(__NetBSD__) */ off_t *cookies; off_t *cookiep; #endif printf("ntfs_readdir: %d cookies\n",ncookies); if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) panic("ntfs_readdir: unexpected uio from NFS server"); dpStart = (struct dirent *) ((caddr_t)uio->uio_iov->iov_base - (uio->uio_offset - off)); #if defined(__FreeBSD__) MALLOC(cookies, u_long *, ncookies * sizeof(u_long), M_TEMP, M_WAITOK); #else /* defined(__NetBSD__) */ MALLOC(cookies, off_t *, ncookies * sizeof(off_t), M_TEMP, M_WAITOK); #endif for (dp = dpStart, cookiep = cookies, i=0; i < ncookies; dp = (struct dirent *)((caddr_t) dp + dp->d_reclen), i++) { off += dp->d_reclen; *cookiep++ = (u_int) off; } *ap->a_ncookies = ncookies; *ap->a_cookies = cookies; } /* if (ap->a_eofflag) *ap->a_eofflag = VTONT(ap->a_vp)->i_size <= uio->uio_offset; */ return (error); } int ntfs_lookup(ap) struct vop_lookup_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ *ap; { register struct vnode *dvp = ap->a_dvp; register struct ntnode *dip = VTONT(dvp); struct ntfsmount *ntmp = dip->i_mp; struct componentname *cnp = ap->a_cnp; struct ucred *cred = cnp->cn_cred; int error; int lockparent = cnp->cn_flags & LOCKPARENT; #if NTFS_DEBUG int wantparent = cnp->cn_flags & (LOCKPARENT|WANTPARENT); #endif dprintf(("ntfs_lookup: %s (%ld bytes) in %d, lp: %d, wp: %d \n", cnp->cn_nameptr, cnp->cn_namelen, dip->i_number,lockparent, wantparent)); error = VOP_ACCESS(dvp, VEXEC, cred, cnp->cn_proc); if(error) return (error); if( (cnp->cn_namelen == 1) && !strncmp(cnp->cn_nameptr,".",1) ) { dprintf(("ntfs_lookup: faking . directory in %d\n", dip->i_number)); VREF(dvp); *ap->a_vpp = dvp; return (0); } else if( (cnp->cn_namelen == 2) && !strncmp(cnp->cn_nameptr,"..",2) && (cnp->cn_flags & ISDOTDOT) ) { struct ntvattr *vap; dprintf(("ntfs_lookup: faking .. 
directory in %d\n", dip->i_number)); error = ntfs_ntvattrget(ntmp, dip, NTFS_A_NAME, NULL, 0, &vap); if(error) return (error); VOP__UNLOCK(dvp,0,cnp->cn_proc); dprintf(("ntfs_lookup: parentdir: %d\n", vap->va_a_name->n_pnumber)); error = VFS_VGET(ntmp->ntm_mountp, vap->va_a_name->n_pnumber,ap->a_vpp); ntfs_ntvattrrele(vap); if(error) { VOP__LOCK(dvp, 0, cnp->cn_proc); return(error); } if( lockparent && (cnp->cn_flags & ISLASTCN) && (error = VOP__LOCK(dvp, 0, cnp->cn_proc)) ) { vput( *(ap->a_vpp) ); return (error); } return (error); } else { error = ntfs_ntlookupfile(ntmp, dvp, cnp, ap->a_vpp); if(error) return (error); dprintf(("ntfs_lookup: found ino: %d\n", VTONT(*ap->a_vpp)->i_number)); if(!lockparent || !(cnp->cn_flags & ISLASTCN)) VOP__UNLOCK(dvp, 0, cnp->cn_proc); if (cnp->cn_flags & MAKEENTRY) cache_enter(dvp, *ap->a_vpp, cnp); } return (error); } /* * Flush the blocks of a file to disk. * * This function is worthless for vnodes that represent directories. Maybe we * could just do a sync if they try an fsync on a directory file. */ static int ntfs_fsync(ap) struct vop_fsync_args /* { struct vnode *a_vp; struct ucred *a_cred; int a_waitfor; struct proc *a_p; } */ *ap; { return (0); } /* * Global vfs data structures */ vop_t **ntfs_vnodeop_p; #if defined(__FreeBSD__) static #endif struct vnodeopv_entry_desc ntfs_vnodeop_entries[] = { { &vop_default_desc, (vop_t *)ntfs_bypass }, { &vop_getattr_desc, (vop_t *)ntfs_getattr }, { &vop_inactive_desc, (vop_t *)ntfs_inactive }, { &vop_reclaim_desc, (vop_t *)ntfs_reclaim }, { &vop_print_desc, (vop_t *)ntfs_print }, #if defined(__FreeBSD__) { &vop_islocked_desc, (vop_t *)vop_stdislocked }, { &vop_unlock_desc, (vop_t *)vop_stdunlock }, { &vop_lock_desc, (vop_t *)vop_stdlock }, { &vop_cachedlookup_desc, (vop_t *)ntfs_lookup }, { &vop_lookup_desc, (vop_t *)vfs_cache_lookup }, #else { &vop_islocked_desc, (vop_t *)ntfs_islocked }, { &vop_unlock_desc, (vop_t *)ntfs_unlock }, { &vop_lock_desc, (vop_t *)ntfs_lock }, { &vop_lookup_desc, (vop_t *)ntfs_lookup }, #endif { &vop_access_desc, (vop_t *)ntfs_access }, { &vop_close_desc, (vop_t *)ntfs_close }, { &vop_open_desc, (vop_t *)ntfs_open }, { &vop_readdir_desc, (vop_t *)ntfs_readdir }, { &vop_fsync_desc, (vop_t *)ntfs_fsync }, { &vop_bmap_desc, (vop_t *)ntfs_bmap }, #if defined(__FreeBSD__) { &vop_getpages_desc, (vop_t *) ntfs_getpages }, { &vop_putpages_desc, (vop_t *) ntfs_putpages }, #endif { &vop_strategy_desc, (vop_t *)ntfs_strategy }, #if defined(__FreeBSD__) { &vop_bwrite_desc, (vop_t *)vop_stdbwrite }, #else /* defined(__NetBSD__) */ { &vop_bwrite_desc, (vop_t *)vn_bwrite }, #endif { &vop_read_desc, (vop_t *)ntfs_read }, { &vop_write_desc, (vop_t *)ntfs_write }, { NULL, NULL } }; #if defined(__FreeBSD__) static #endif struct vnodeopv_desc ntfs_vnodeop_opv_desc = { &ntfs_vnodeop_p, ntfs_vnodeop_entries }; #if defined(__FreeBSD__) VNODEOP_SET(ntfs_vnodeop_opv_desc); #endif Index: head/sys/fs/specfs/spec_vnops.c =================================================================== --- head/sys/fs/specfs/spec_vnops.c (revision 49534) +++ head/sys/fs/specfs/spec_vnops.c (revision 49535) @@ -1,963 +1,961 @@ /* * Copyright (c) 1989, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)spec_vnops.c 8.14 (Berkeley) 5/21/95 - * $Id: spec_vnops.c,v 1.89 1999/06/26 02:46:21 mckusick Exp $ + * $Id: spec_vnops.c,v 1.90 1999/07/20 09:47:45 phk Exp $ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include - -#include static int spec_advlock __P((struct vop_advlock_args *)); static int spec_badop __P((void)); static int spec_bmap __P((struct vop_bmap_args *)); static int spec_close __P((struct vop_close_args *)); static int spec_freeblks __P((struct vop_freeblks_args *)); static int spec_fsync __P((struct vop_fsync_args *)); static int spec_getattr __P((struct vop_getattr_args *)); static int spec_getpages __P((struct vop_getpages_args *)); static int spec_inactive __P((struct vop_inactive_args *)); static int spec_ioctl __P((struct vop_ioctl_args *)); static int spec_lookup __P((struct vop_lookup_args *)); static int spec_open __P((struct vop_open_args *)); static int spec_poll __P((struct vop_poll_args *)); static int spec_print __P((struct vop_print_args *)); static int spec_read __P((struct vop_read_args *)); static int spec_strategy __P((struct vop_strategy_args *)); static int spec_write __P((struct vop_write_args *)); vop_t **spec_vnodeop_p; static struct vnodeopv_entry_desc spec_vnodeop_entries[] = { { &vop_default_desc, (vop_t *) vop_defaultop }, { &vop_access_desc, (vop_t *) vop_ebadf }, { &vop_advlock_desc, (vop_t *) spec_advlock }, { &vop_bmap_desc, (vop_t *) spec_bmap }, { &vop_close_desc, (vop_t *) spec_close }, { &vop_create_desc, (vop_t *) spec_badop }, { &vop_freeblks_desc, (vop_t *) spec_freeblks }, { &vop_fsync_desc, (vop_t *) spec_fsync }, { &vop_getattr_desc, (vop_t *) spec_getattr }, { &vop_getpages_desc, (vop_t *) spec_getpages }, { &vop_inactive_desc, (vop_t *) spec_inactive }, { &vop_ioctl_desc, (vop_t *) spec_ioctl }, { &vop_lease_desc, (vop_t *) vop_null }, { &vop_link_desc, (vop_t *) spec_badop }, { &vop_lookup_desc, (vop_t *) spec_lookup }, { &vop_mkdir_desc, (vop_t *) spec_badop }, { &vop_mknod_desc, (vop_t *) spec_badop }, { &vop_open_desc, (vop_t *) spec_open 
}, { &vop_pathconf_desc, (vop_t *) vop_stdpathconf }, { &vop_poll_desc, (vop_t *) spec_poll }, { &vop_print_desc, (vop_t *) spec_print }, { &vop_read_desc, (vop_t *) spec_read }, { &vop_readdir_desc, (vop_t *) spec_badop }, { &vop_readlink_desc, (vop_t *) spec_badop }, { &vop_reallocblks_desc, (vop_t *) spec_badop }, { &vop_reclaim_desc, (vop_t *) vop_null }, { &vop_remove_desc, (vop_t *) spec_badop }, { &vop_rename_desc, (vop_t *) spec_badop }, { &vop_rmdir_desc, (vop_t *) spec_badop }, { &vop_setattr_desc, (vop_t *) vop_ebadf }, { &vop_strategy_desc, (vop_t *) spec_strategy }, { &vop_symlink_desc, (vop_t *) spec_badop }, { &vop_write_desc, (vop_t *) spec_write }, { NULL, NULL } }; static struct vnodeopv_desc spec_vnodeop_opv_desc = { &spec_vnodeop_p, spec_vnodeop_entries }; VNODEOP_SET(spec_vnodeop_opv_desc); int spec_vnoperate(ap) struct vop_generic_args /* { struct vnodeop_desc *a_desc; } */ *ap; { return (VOCALL(spec_vnodeop_p, ap->a_desc->vdesc_offset, ap)); } static void spec_getpages_iodone __P((struct buf *bp)); /* * Trivial lookup routine that always fails. */ static int spec_lookup(ap) struct vop_lookup_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ *ap; { *ap->a_vpp = NULL; return (ENOTDIR); } /* * Open a special file. */ /* ARGSUSED */ static int spec_open(ap) struct vop_open_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct proc *a_p; } */ *ap; { struct proc *p = ap->a_p; struct vnode *bvp, *vp = ap->a_vp; dev_t bdev, dev = vp->v_rdev; int error; struct cdevsw *dsw; /* * Don't allow open if fs is mounted -nodev. */ if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_NODEV)) return (ENXIO); switch (vp->v_type) { case VCHR: dsw = devsw(dev); if ( (dsw == NULL) || (dsw->d_open == NULL)) return ENXIO; if (ap->a_cred != FSCRED && (ap->a_mode & FWRITE)) { /* * When running in very secure mode, do not allow * opens for writing of any disk character devices. */ if (securelevel >= 2 && dsw->d_bmaj != -1 && (dsw->d_flags & D_TYPEMASK) == D_DISK) return (EPERM); /* * When running in secure mode, do not allow opens * for writing of /dev/mem, /dev/kmem, or character * devices whose corresponding block devices are * currently mounted. */ if (securelevel >= 1) { if ((bdev = chrtoblk(dev)) != NODEV && vfinddev(bdev, VBLK, &bvp) && bvp->v_usecount > 0 && (error = vfs_mountedon(bvp))) return (error); if (iskmemdev(dev)) return (EPERM); } } if ((dsw->d_flags & D_TYPEMASK) == D_TTY) vp->v_flag |= VISTTY; VOP_UNLOCK(vp, 0, p); error = (*dsw->d_open)(dev, ap->a_mode, S_IFCHR, p); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); return (error); /* NOT REACHED */ case VBLK: dsw = bdevsw(dev); if ( (dsw == NULL) || (dsw->d_open == NULL)) return ENXIO; /* * When running in very secure mode, do not allow * opens for writing of any disk block devices. */ if (securelevel >= 2 && ap->a_cred != FSCRED && (ap->a_mode & FWRITE) && (dsw->d_flags & D_TYPEMASK) == D_DISK) return (EPERM); /* * Do not allow opens of block devices that are * currently mounted. 
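 *
 * Opening the raw block device while a filesystem is mounted on it would
 * create a second, uncoordinated path to the same blocks, so
 * vfs_mountedon() is consulted first and its error is returned as-is;
 * only a device that is not mounted anywhere reaches the driver's d_open
 * entry point.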
	 */
		error = vfs_mountedon(vp);
		if (error)
			return (error);
		return ((*dsw->d_open)(dev, ap->a_mode, S_IFBLK, p));
		/* NOT REACHED */
	default:
		break;
	}
	return (0);
}

/*
 * Vnode op for read
 */
/* ARGSUSED */
static int
spec_read(ap)
	struct vop_read_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	register struct vnode *vp = ap->a_vp;
	register struct uio *uio = ap->a_uio;
	struct proc *p = uio->uio_procp;
	struct buf *bp;
	daddr_t bn, nextbn;
	long bsize, bscale;
	struct partinfo dpart;
	int n, on;
	d_ioctl_t *ioctl;
	int error = 0;
	dev_t dev;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_READ)
		panic("spec_read mode");
	if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc)
		panic("spec_read proc");
#endif
	if (uio->uio_resid == 0)
		return (0);

	switch (vp->v_type) {

	case VCHR:
		VOP_UNLOCK(vp, 0, p);
		error = (*devsw(vp->v_rdev)->d_read)
			(vp->v_rdev, uio, ap->a_ioflag);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
		return (error);

	case VBLK:
		if (uio->uio_offset < 0)
			return (EINVAL);
		dev = vp->v_rdev;

		/*
		 * Calculate block size for block device. The block size must
		 * be larger than the physical minimum.
		 */
		bsize = vp->v_specinfo->si_bsize_best;
		if ((ioctl = bdevsw(dev)->d_ioctl) != NULL &&
		    (*ioctl)(dev, DIOCGPART, (caddr_t)&dpart, FREAD, p) == 0 &&
		    dpart.part->p_fstype == FS_BSDFFS &&
		    dpart.part->p_frag != 0 && dpart.part->p_fsize != 0)
			bsize = dpart.part->p_frag * dpart.part->p_fsize;
		bscale = btodb(bsize);
		do {
			bn = btodb(uio->uio_offset) & ~(bscale - 1);
			on = uio->uio_offset % bsize;
			n = min((unsigned)(bsize - on), uio->uio_resid);
			if (vp->v_lastr + bscale == bn) {
				nextbn = bn + bscale;
				error = breadn(vp, bn, (int)bsize, &nextbn,
					(int *)&bsize, 1, NOCRED, &bp);
			} else
				error = bread(vp, bn, (int)bsize, NOCRED, &bp);
			vp->v_lastr = bn;
			n = min(n, bsize - bp->b_resid);
			if (error) {
				brelse(bp);
				return (error);
			}
			error = uiomove((char *)bp->b_data + on, n, uio);
			brelse(bp);
		} while (error == 0 && uio->uio_resid > 0 && n != 0);
		return (error);

	default:
		panic("spec_read type");
	}
	/* NOTREACHED */
}

/*
 * Vnode op for write
 */
/* ARGSUSED */
static int
spec_write(ap)
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	register struct vnode *vp = ap->a_vp;
	register struct uio *uio = ap->a_uio;
	struct proc *p = uio->uio_procp;
	struct buf *bp;
	daddr_t bn;
	int bsize, blkmask;
	struct partinfo dpart;
	register int n, on;
	int error = 0;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("spec_write mode");
	if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc)
		panic("spec_write proc");
#endif

	switch (vp->v_type) {

	case VCHR:
		VOP_UNLOCK(vp, 0, p);
		error = (*devsw(vp->v_rdev)->d_write)
			(vp->v_rdev, uio, ap->a_ioflag);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
		return (error);

	case VBLK:
		if (uio->uio_resid == 0)
			return (0);
		if (uio->uio_offset < 0)
			return (EINVAL);

		/*
		 * Calculate block size for block device. The block size must
		 * be larger than the physical minimum.
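		 *
		 * A worked example of the arithmetic in the loop below,
		 * assuming DEV_BSIZE is 512 and a partition block size
		 * (bsize) of 8192: for a write starting at byte offset 10000,
		 *
		 *	blkmask = btodb(8192) - 1         = 15
		 *	bn      = btodb(10000) & ~15      = 16   (byte 8192)
		 *	on      = 10000 % 8192            = 1808
		 *	n       = min(8192 - 1808, resid) = at most 6384
		 *
		 * so the transfer proceeds in block-aligned pieces: a piece
		 * covering a whole block goes through getblk(), a partial
		 * piece is read in with bread() first and then modified.
		 */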
*/ bsize = vp->v_specinfo->si_bsize_best; if ((*bdevsw(vp->v_rdev)->d_ioctl)(vp->v_rdev, DIOCGPART, (caddr_t)&dpart, FREAD, p) == 0) { if (dpart.part->p_fstype == FS_BSDFFS && dpart.part->p_frag != 0 && dpart.part->p_fsize != 0) bsize = dpart.part->p_frag * dpart.part->p_fsize; } blkmask = btodb(bsize) - 1; do { bn = btodb(uio->uio_offset) & ~blkmask; on = uio->uio_offset % bsize; n = min((unsigned)(bsize - on), uio->uio_resid); if (n == bsize) bp = getblk(vp, bn, bsize, 0, 0); else error = bread(vp, bn, bsize, NOCRED, &bp); if (error) { brelse(bp); return (error); } n = min(n, bsize - bp->b_resid); error = uiomove((char *)bp->b_data + on, n, uio); if (n + on == bsize) bawrite(bp); else bdwrite(bp); } while (error == 0 && uio->uio_resid > 0 && n != 0); return (error); default: panic("spec_write type"); } /* NOTREACHED */ } /* * Device ioctl operation. */ /* ARGSUSED */ static int spec_ioctl(ap) struct vop_ioctl_args /* { struct vnode *a_vp; int a_command; caddr_t a_data; int a_fflag; struct ucred *a_cred; struct proc *a_p; } */ *ap; { dev_t dev = ap->a_vp->v_rdev; switch (ap->a_vp->v_type) { case VCHR: return ((*devsw(dev)->d_ioctl)(dev, ap->a_command, ap->a_data, ap->a_fflag, ap->a_p)); case VBLK: return ((*bdevsw(dev)->d_ioctl)(dev, ap->a_command, ap->a_data, ap->a_fflag, ap->a_p)); default: panic("spec_ioctl"); /* NOTREACHED */ } } /* ARGSUSED */ static int spec_poll(ap) struct vop_poll_args /* { struct vnode *a_vp; int a_events; struct ucred *a_cred; struct proc *a_p; } */ *ap; { register dev_t dev; switch (ap->a_vp->v_type) { case VCHR: dev = ap->a_vp->v_rdev; return (*devsw(dev)->d_poll)(dev, ap->a_events, ap->a_p); default: return (vop_defaultop((struct vop_generic_args *)ap)); } } /* * Synch buffers associated with a block device */ /* ARGSUSED */ static int spec_fsync(ap) struct vop_fsync_args /* { struct vnode *a_vp; struct ucred *a_cred; int a_waitfor; struct proc *a_p; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct buf *bp; struct buf *nbp; int s; if (vp->v_type == VCHR) return (0); /* * Flush all dirty buffers associated with a block device. 
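 *
 * Condensed, the loop below is:
 *
 *	s = splbio();
 *	for each bp on vp->v_dirtyblkhd {
 *		if it cannot be locked without sleeping, skip it;
 *		otherwise write it out (vfs_bio_awrite() if clustering is
 *		allowed, else bremfree() + bawrite());
 *		drop spl and restart from the head, the list has changed;
 *	}
 *
 * and for MNT_WAIT callers it then sleeps on vp->v_numoutput until the
 * writes have drained.
 */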
*/ loop: s = splbio(); for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { nbp = TAILQ_NEXT(bp, b_vnbufs); if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) continue; if ((bp->b_flags & B_DELWRI) == 0) panic("spec_fsync: not dirty"); if ((vp->v_flag & VOBJBUF) && (bp->b_flags & B_CLUSTEROK)) { BUF_UNLOCK(bp); vfs_bio_awrite(bp); splx(s); } else { bremfree(bp); splx(s); bawrite(bp); } goto loop; } if (ap->a_waitfor == MNT_WAIT) { while (vp->v_numoutput) { vp->v_flag |= VBWAIT; (void) tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "spfsyn", 0); } #ifdef DIAGNOSTIC if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) { vprint("spec_fsync: dirty", vp); splx(s); goto loop; } #endif } splx(s); return (0); } static int spec_inactive(ap) struct vop_inactive_args /* { struct vnode *a_vp; struct proc *a_p; } */ *ap; { VOP_UNLOCK(ap->a_vp, 0, ap->a_p); return (0); } /* * Just call the device strategy routine */ static int spec_strategy(ap) struct vop_strategy_args /* { struct buf *a_bp; } */ *ap; { struct buf *bp; bp = ap->a_bp; if (((bp->b_flags & B_READ) == 0) && (LIST_FIRST(&bp->b_dep)) != NULL && bioops.io_start) (*bioops.io_start)(bp); (*bdevsw(bp->b_dev)->d_strategy)(bp); return (0); } static int spec_freeblks(ap) struct vop_freeblks_args /* { struct vnode *a_vp; daddr_t a_addr; daddr_t a_length; } */ *ap; { struct cdevsw *bsw; struct buf *bp; bsw = bdevsw(ap->a_vp->v_rdev); if ((bsw->d_flags & D_CANFREE) == 0) return (0); bp = geteblk(ap->a_length); bp->b_flags |= B_FREEBUF; bp->b_dev = ap->a_vp->v_rdev; bp->b_blkno = ap->a_addr; bp->b_offset = dbtob(ap->a_addr); bp->b_bcount = ap->a_length; (*bsw->d_strategy)(bp); return (0); } /* * This is a noop, simply returning what one has been given. */ static int spec_bmap(ap) struct vop_bmap_args /* { struct vnode *a_vp; daddr_t a_bn; struct vnode **a_vpp; daddr_t *a_bnp; int *a_runp; int *a_runb; } */ *ap; { if (ap->a_vpp != NULL) *ap->a_vpp = ap->a_vp; if (ap->a_bnp != NULL) *ap->a_bnp = ap->a_bn; if (ap->a_runp != NULL) *ap->a_runp = 0; if (ap->a_runb != NULL) *ap->a_runb = 0; return (0); } /* * Device close routine */ /* ARGSUSED */ static int spec_close(ap) struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct proc *a_p; } */ *ap; { register struct vnode *vp = ap->a_vp; dev_t dev = vp->v_rdev; d_close_t *devclose; int mode, error; switch (vp->v_type) { case VCHR: /* * Hack: a tty device that is a controlling terminal * has a reference from the session structure. * We cannot easily tell that a character device is * a controlling terminal, unless it is the closing * process' controlling terminal. In that case, * if the reference count is 2 (this last descriptor * plus the session), release the reference from the session. */ if (vcount(vp) == 2 && ap->a_p && (vp->v_flag & VXLOCK) == 0 && vp == ap->a_p->p_session->s_ttyvp) { vrele(vp); ap->a_p->p_session->s_ttyvp = NULL; } /* * If the vnode is locked, then we are in the midst * of forcably closing the device, otherwise we only * close on last reference. */ if (vcount(vp) > 1 && (vp->v_flag & VXLOCK) == 0) return (0); devclose = devsw(dev)->d_close; mode = S_IFCHR; break; case VBLK: /* * On last close of a block device (that isn't mounted) * we must invalidate any in core blocks, so that * we can, for instance, change floppy disks. 
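 *
 * In outline, the VBLK case below does:
 *
 *	vn_lock(vp, ...); vinvalbuf(vp, V_SAVE, ...); VOP_UNLOCK(vp, ...);
 *	if (vcount(vp) > 1 && (vp->v_flag & VXLOCK) == 0)
 *		return (0);		still referenced elsewhere
 *	(*bdevsw(dev)->d_close)(dev, ap->a_fflag, S_IFBLK, ap->a_p);
 */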
*/ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, ap->a_p); error = vinvalbuf(vp, V_SAVE, ap->a_cred, ap->a_p, 0, 0); VOP_UNLOCK(vp, 0, ap->a_p); if (error) return (error); /* * We do not want to really close the device if it * is still in use unless we are trying to close it * forcibly. Since every use (buffer, vnode, swap, cmap) * holds a reference to the vnode, and because we mark * any other vnodes that alias this device, when the * sum of the reference counts on all the aliased * vnodes descends to one, we are on last close. */ if ((vcount(vp) > 1) && (vp->v_flag & VXLOCK) == 0) return (0); devclose = bdevsw(dev)->d_close; mode = S_IFBLK; break; default: panic("spec_close: not special"); } return ((*devclose)(dev, ap->a_fflag, mode, ap->a_p)); } /* * Print out the contents of a special device vnode. */ static int spec_print(ap) struct vop_print_args /* { struct vnode *a_vp; } */ *ap; { printf("tag VT_NON, dev %d, %d\n", major(ap->a_vp->v_rdev), minor(ap->a_vp->v_rdev)); return (0); } /* * Special device advisory byte-level locks. */ /* ARGSUSED */ static int spec_advlock(ap) struct vop_advlock_args /* { struct vnode *a_vp; caddr_t a_id; int a_op; struct flock *a_fl; int a_flags; } */ *ap; { return (ap->a_flags & F_FLOCK ? EOPNOTSUPP : EINVAL); } /* * Special device bad operation */ static int spec_badop() { panic("spec_badop called"); /* NOTREACHED */ } static void spec_getpages_iodone(bp) struct buf *bp; { bp->b_flags |= B_DONE; wakeup(bp); } static int spec_getpages(ap) struct vop_getpages_args *ap; { vm_offset_t kva; int error; int i, pcount, size, s; daddr_t blkno; struct buf *bp; vm_page_t m; vm_ooffset_t offset; int toff, nextoff, nread; struct vnode *vp = ap->a_vp; int blksiz; int gotreqpage; error = 0; pcount = round_page(ap->a_count) / PAGE_SIZE; /* * Calculate the offset of the transfer and do sanity check. * FreeBSD currently only supports an 8 TB range due to b_blkno * being in DEV_BSIZE ( usually 512 ) byte chunks on call to * VOP_STRATEGY. XXX */ offset = IDX_TO_OFF(ap->a_m[0]->pindex) + ap->a_offset; #define DADDR_T_BIT (sizeof(daddr_t)*8) #define OFFSET_MAX ((1LL << (DADDR_T_BIT + DEV_BSHIFT)) - 1) if (offset < 0 || offset > OFFSET_MAX) { /* XXX still no %q in kernel. */ printf("spec_getpages: preposterous offset 0x%x%08x\n", (u_int)((u_quad_t)offset >> 32), (u_int)(offset & 0xffffffff)); return (VM_PAGER_ERROR); } blkno = btodb(offset); /* * Round up physical size for real devices. We cannot round using * v_mount's block size data because v_mount has nothing to do with * the device. i.e. it's usually '/dev'. We need the physical block * size for the device itself. * * We can't use v_specmountpoint because it only exists when the * block device is mounted. However, we can use v_specinfo. */ if (vp->v_type == VBLK) blksiz = vp->v_specinfo->si_bsize_phys; else blksiz = DEV_BSIZE; size = (ap->a_count + blksiz - 1) & ~(blksiz - 1); bp = getpbuf(NULL); kva = (vm_offset_t)bp->b_data; /* * Map the pages to be read into the kva. */ pmap_qenter(kva, ap->a_m, pcount); /* Build a minimal buffer header. */ bp->b_flags = B_READ | B_CALL; bp->b_iodone = spec_getpages_iodone; /* B_PHYS is not set, but it is nice to fill this in. */ bp->b_rcred = bp->b_wcred = curproc->p_ucred; if (bp->b_rcred != NOCRED) crhold(bp->b_rcred); if (bp->b_wcred != NOCRED) crhold(bp->b_wcred); bp->b_blkno = blkno; bp->b_lblkno = blkno; pbgetvp(ap->a_vp, bp); bp->b_bcount = size; bp->b_bufsize = size; bp->b_resid = 0; cnt.v_vnodein++; cnt.v_vnodepgsin += pcount; /* Do the input. 
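 *
 * The completion handshake that follows is the classic one: the buffer
 * was set up with B_CALL and b_iodone = spec_getpages_iodone, which just
 * sets B_DONE and wakeup()s the buffer address.  Testing B_DONE and
 * calling tsleep() therefore have to happen at splbio, otherwise the
 * disk interrupt could complete the buffer between the flag test and
 * the sleep and the wakeup would be lost.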
*/ VOP_STRATEGY(bp->b_vp, bp); s = splbio(); /* We definitely need to be at splbio here. */ while ((bp->b_flags & B_DONE) == 0) tsleep(bp, PVM, "spread", 0); splx(s); if ((bp->b_flags & B_ERROR) != 0) { if (bp->b_error) error = bp->b_error; else error = EIO; } nread = size - bp->b_resid; if (nread < ap->a_count) { bzero((caddr_t)kva + nread, ap->a_count - nread); } pmap_qremove(kva, pcount); gotreqpage = 0; for (i = 0, toff = 0; i < pcount; i++, toff = nextoff) { nextoff = toff + PAGE_SIZE; m = ap->a_m[i]; m->flags &= ~PG_ZERO; if (nextoff <= nread) { m->valid = VM_PAGE_BITS_ALL; m->dirty = 0; } else if (toff < nread) { /* * Since this is a VM request, we have to supply the * unaligned offset to allow vm_page_set_validclean() * to zero sub-DEV_BSIZE'd portions of the page. */ vm_page_set_validclean(m, 0, nread - toff); } else { m->valid = 0; m->dirty = 0; } if (i != ap->a_reqpage) { /* * Just in case someone was asking for this page we * now tell them that it is ok to use. */ if (!error || (m->valid == VM_PAGE_BITS_ALL)) { if (m->valid) { if (m->flags & PG_WANTED) { vm_page_activate(m); } else { vm_page_deactivate(m); } vm_page_wakeup(m); } else { vm_page_free(m); } } else { vm_page_free(m); } } else if (m->valid) { gotreqpage = 1; /* * Since this is a VM request, we need to make the * entire page presentable by zeroing invalid sections. */ if (m->valid != VM_PAGE_BITS_ALL) vm_page_zero_invalid(m, FALSE); } } if (!gotreqpage) { m = ap->a_m[ap->a_reqpage]; #ifndef MAX_PERF printf( "spec_getpages: I/O read failure: (error code=%d) bp %p vp %p\n", error, bp, bp->b_vp); printf( " size: %d, resid: %ld, a_count: %d, valid: 0x%x\n", size, bp->b_resid, ap->a_count, m->valid); printf( " nread: %d, reqpage: %d, pindex: %lu, pcount: %d\n", nread, ap->a_reqpage, (u_long)m->pindex, pcount); #endif /* * Free the buffer header back to the swap buffer pool. */ relpbuf(bp, NULL); return VM_PAGER_ERROR; } /* * Free the buffer header back to the swap buffer pool. */ relpbuf(bp, NULL); return VM_PAGER_OK; } /* ARGSUSED */ static int spec_getattr(ap) struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct proc *a_p; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct vattr *vap = ap->a_vap; struct partinfo dpart; bzero(vap, sizeof (*vap)); if (vp->v_type == VBLK) { if (vp->v_specinfo) vap->va_blocksize = vp->v_specmountpoint->mnt_stat.f_iosize; else vap->va_blocksize = BLKDEV_IOSIZE; } else if (vp->v_type == VCHR) { vap->va_blocksize = MAXBSIZE; } if ((*bdevsw(vp->v_rdev)->d_ioctl)(vp->v_rdev, DIOCGPART, (caddr_t)&dpart, FREAD, ap->a_p) == 0) { vap->va_bytes = dbtob(dpart.disklab->d_partitions [minor(vp->v_rdev)].p_size); vap->va_size = vap->va_bytes; } return (0); } Index: head/sys/gnu/ext2fs/ext2_bmap.c =================================================================== --- head/sys/gnu/ext2fs/ext2_bmap.c (revision 49534) +++ head/sys/gnu/ext2fs/ext2_bmap.c (revision 49535) @@ -1,355 +1,354 @@ /* * Copyright (c) 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ufs_bmap.c 8.7 (Berkeley) 3/21/95 - * $Id: ufs_bmap.c,v 1.27 1999/05/07 10:11:36 phk Exp $ + * $Id: ufs_bmap.c,v 1.28 1999/05/08 06:40:25 phk Exp $ */ #include #include #include #include #include #include #include #include #include #include #include #include -#include /* * Bmap converts a the logical block number of a file to its physical block * number on the disk. The conversion is done by using the logical block * number to index into the array of block pointers described by the dinode. */ int ufs_bmap(ap) struct vop_bmap_args /* { struct vnode *a_vp; ufs_daddr_t a_bn; struct vnode **a_vpp; ufs_daddr_t *a_bnp; int *a_runp; int *a_runb; } */ *ap; { /* * Check for underlying vnode requests and ensure that logical * to physical mapping is requested. */ if (ap->a_vpp != NULL) *ap->a_vpp = VTOI(ap->a_vp)->i_devvp; if (ap->a_bnp == NULL) return (0); return (ufs_bmaparray(ap->a_vp, ap->a_bn, ap->a_bnp, NULL, NULL, ap->a_runp, ap->a_runb)); } /* * Indirect blocks are now on the vnode for the file. They are given negative * logical block numbers. Indirect blocks are addressed by the negative * address of the first data block to which they point. Double indirect blocks * are addressed by one less than the address of the first indirect block to * which they point. Triple indirect blocks are addressed by one less than * the address of the first double indirect block to which they point. * * ufs_bmaparray does the bmap conversion, and if requested returns the * array of logical blocks which must be traversed to get to a block. * Each entry contains the offset into that block that gets you to the * next block and the disk address of the block (if it is assigned). 
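 *
 * A worked example, assuming the usual constants for an 8K-block
 * filesystem with 32-bit block pointers (NDADDR = 12, NIADDR = 3,
 * MNINDIR(ump) = 2048 pointers per indirect block):
 *
 *	file block 0 .. 11			direct, found in i_db[]
 *	file block 12 .. 12+2048-1		one level (single indirect)
 *	file block 12+2048 .. 12+2048+2048^2-1	two levels (double indirect)
 *	anything larger, up to three levels	triple indirect
 *
 * ufs_getlbns() below builds exactly this path, one (negative) indirect
 * block number plus in-block offset per level.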
*/ int ufs_bmaparray(vp, bn, bnp, ap, nump, runp, runb) struct vnode *vp; ufs_daddr_t bn; ufs_daddr_t *bnp; struct indir *ap; int *nump; int *runp; int *runb; { register struct inode *ip; struct buf *bp; struct ufsmount *ump; struct mount *mp; struct vnode *devvp; struct indir a[NIADDR+1], *xap; ufs_daddr_t daddr; long metalbn; int error, maxrun = 0, num; ip = VTOI(vp); mp = vp->v_mount; ump = VFSTOUFS(mp); #ifdef DIAGNOSTIC if ((ap != NULL && nump == NULL) || (ap == NULL && nump != NULL)) panic("ufs_bmaparray: invalid arguments"); #endif if (runp) { *runp = 0; } if (runb) { *runb = 0; } maxrun = 0; if (runp || runb || (vp->v_maxio == 0)) { struct vnode *devvp; int blksize; blksize = mp->mnt_stat.f_iosize; /* * XXX * If MAXPHYS is the largest transfer the disks can handle, * we probably want maxrun to be 1 block less so that we * don't create a block larger than the device can handle. */ devvp = ip->i_devvp; if (devvp != NULL && devvp->v_tag != VT_MFS && devvp->v_type == VBLK) { if (bdevsw(devvp->v_rdev)->d_maxio > MAXPHYS) { maxrun = MAXPHYS; vp->v_maxio = MAXPHYS; } else { maxrun = bdevsw(devvp->v_rdev)->d_maxio; vp->v_maxio = bdevsw(devvp->v_rdev)->d_maxio; } maxrun = maxrun / blksize; maxrun -= 1; } if (maxrun <= 0) { vp->v_maxio = DFLTPHYS; maxrun = DFLTPHYS / blksize; maxrun -= 1; } } xap = ap == NULL ? a : ap; if (!nump) nump = # error = ufs_getlbns(vp, bn, xap, nump); if (error) return (error); num = *nump; if (num == 0) { *bnp = blkptrtodb(ump, ip->i_db[bn]); if (*bnp == 0) *bnp = -1; else if (runp) { daddr_t bnb = bn; for (++bn; bn < NDADDR && *runp < maxrun && is_sequential(ump, ip->i_db[bn - 1], ip->i_db[bn]); ++bn, ++*runp); bn = bnb; if (runb && (bn > 0)) { for (--bn; (bn >= 0) && (*runb < maxrun) && is_sequential(ump, ip->i_db[bn], ip->i_db[bn+1]); --bn, ++*runb); } } return (0); } /* Get disk address out of indirect block array */ daddr = ip->i_ib[xap->in_off]; devvp = VFSTOUFS(vp->v_mount)->um_devvp; for (bp = NULL, ++xap; --num; ++xap) { /* * Exit the loop if there is no disk address assigned yet and * the indirect block isn't in the cache, or if we were * looking for an indirect block and we've found it. */ metalbn = xap->in_lbn; if ((daddr == 0 && !incore(vp, metalbn)) || metalbn == bn) break; /* * If we get here, we've either got the block in the cache * or we have a disk address for it, go fetch it. */ if (bp) bqrelse(bp); xap->in_exists = 1; bp = getblk(vp, metalbn, mp->mnt_stat.f_iosize, 0, 0); if ((bp->b_flags & B_CACHE) == 0) { #ifdef DIAGNOSTIC if (!daddr) panic("ufs_bmaparray: indirect block not in cache"); #endif bp->b_blkno = blkptrtodb(ump, daddr); bp->b_flags |= B_READ; bp->b_flags &= ~(B_INVAL|B_ERROR); vfs_busy_pages(bp, 0); VOP_STRATEGY(bp->b_vp, bp); curproc->p_stats->p_ru.ru_inblock++; /* XXX */ error = biowait(bp); if (error) { brelse(bp); return (error); } } daddr = ((ufs_daddr_t *)bp->b_data)[xap->in_off]; if (num == 1 && daddr && runp) { for (bn = xap->in_off + 1; bn < MNINDIR(ump) && *runp < maxrun && is_sequential(ump, ((ufs_daddr_t *)bp->b_data)[bn - 1], ((ufs_daddr_t *)bp->b_data)[bn]); ++bn, ++*runp); bn = xap->in_off; if (runb && bn) { for(--bn; bn > 0 && *runb < maxrun && is_sequential(ump, ((daddr_t *)bp->b_data)[bn], ((daddr_t *)bp->b_data)[bn+1]); --bn, ++*runb); } } } if (bp) bqrelse(bp); daddr = blkptrtodb(ump, daddr); *bnp = daddr == 0 ? -1 : daddr; return (0); } /* * Create an array of logical block number/offset pairs which represent the * path of indirect blocks required to access a data block. 
The first "pair" * contains the logical block number of the appropriate single, double or * triple indirect block and the offset into the inode indirect block array. * Note, the logical block number of the inode single/double/triple indirect * block appears twice in the array, once with the offset into the i_ib and * once with the offset into the page itself. */ int ufs_getlbns(vp, bn, ap, nump) struct vnode *vp; ufs_daddr_t bn; struct indir *ap; int *nump; { long blockcnt, metalbn, realbn; struct ufsmount *ump; int i, numlevels, off; int64_t qblockcnt; ump = VFSTOUFS(vp->v_mount); if (nump) *nump = 0; numlevels = 0; realbn = bn; if ((long)bn < 0) bn = -(long)bn; /* The first NDADDR blocks are direct blocks. */ if (bn < NDADDR) return (0); /* * Determine the number of levels of indirection. After this loop * is done, blockcnt indicates the number of data blocks possible * at the previous level of indirection, and NIADDR - i is the number * of levels of indirection needed to locate the requested block. */ for (blockcnt = 1, i = NIADDR, bn -= NDADDR;; i--, bn -= blockcnt) { if (i == 0) return (EFBIG); /* * Use int64_t's here to avoid overflow for triple indirect * blocks when longs have 32 bits and the block size is more * than 4K. */ qblockcnt = (int64_t)blockcnt * MNINDIR(ump); if (bn < qblockcnt) break; blockcnt = qblockcnt; } /* Calculate the address of the first meta-block. */ if (realbn >= 0) metalbn = -(realbn - bn + NIADDR - i); else metalbn = -(-realbn - bn + NIADDR - i); /* * At each iteration, off is the offset into the bap array which is * an array of disk addresses at the current level of indirection. * The logical block number and the offset in that block are stored * into the argument array. */ ap->in_lbn = metalbn; ap->in_off = off = NIADDR - i; ap->in_exists = 0; ap++; for (++numlevels; i <= NIADDR; i++) { /* If searching for a meta-data block, quit when found. */ if (metalbn == realbn) break; off = (bn / blockcnt) % MNINDIR(ump); ++numlevels; ap->in_lbn = metalbn; ap->in_off = off; ap->in_exists = 0; ++ap; metalbn -= -1 + off * blockcnt; blockcnt /= MNINDIR(ump); } if (nump) *nump = numlevels; return (0); } Index: head/sys/gnu/ext2fs/ext2_vfsops.c =================================================================== --- head/sys/gnu/ext2fs/ext2_vfsops.c (revision 49534) +++ head/sys/gnu/ext2fs/ext2_vfsops.c (revision 49535) @@ -1,1190 +1,1188 @@ /* * modified for EXT2FS support in Lites 1.1 * * Aug 1995, Godmar Back (gback@cs.utah.edu) * University of Utah, Department of Computer Science */ /* * Copyright (c) 1989, 1991, 1993, 1994 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_vfsops.c 8.8 (Berkeley) 4/18/94 */ #include "opt_quota.h" #include #include #include #include #include #include #include #include #include #include #include #include #include -#include - #include #include #include #include #include #include #include #include static int ext2_fhtovp __P((struct mount *, struct fid *, struct sockaddr *, struct vnode **, int *, struct ucred **)); static int ext2_flushfiles __P((struct mount *mp, int flags, struct proc *p)); static int ext2_mount __P((struct mount *, char *, caddr_t, struct nameidata *, struct proc *)); static int ext2_mountfs __P((struct vnode *, struct mount *, struct proc *)); static int ext2_reload __P((struct mount *mountp, struct ucred *cred, struct proc *p)); static int ext2_sbupdate __P((struct ufsmount *, int)); static int ext2_statfs __P((struct mount *, struct statfs *, struct proc *)); static int ext2_sync __P((struct mount *, int, struct ucred *, struct proc *)); static int ext2_unmount __P((struct mount *, int, struct proc *)); static int ext2_vget __P((struct mount *, ino_t, struct vnode **)); static int ext2_vptofh __P((struct vnode *, struct fid *)); static MALLOC_DEFINE(M_EXT2NODE, "EXT2 node", "EXT2 vnode private part"); static struct vfsops ext2fs_vfsops = { ext2_mount, ufs_start, /* empty function */ ext2_unmount, ufs_root, /* root inode via vget */ ufs_quotactl, /* does operations associated with quotas */ ext2_statfs, ext2_sync, ext2_vget, ext2_fhtovp, ext2_vptofh, ext2_init, }; VFS_SET(ext2fs_vfsops, ext2fs, 0); #define bsd_malloc malloc #define bsd_free free static int ext2fs_inode_hash_lock; static int compute_sb_data __P((struct vnode * devvp, struct ext2_super_block * es, struct ext2_sb_info * fs)); #ifdef notyet static int ext2_mountroot __P((void)); /* * Called by main() when ext2fs is going to be mounted as root. * * Name is updated by mount(8) after booting. 
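 *
 * Condensed call sequence of the (still "notyet") function below:
 *
 *	bdevvp(rootdev, &rootvp);
 *	mp = bsd_malloc(sizeof(struct mount), M_MOUNT, M_WAITOK);
 *	mp->mnt_op = &ext2fs_vfsops;  mp->mnt_flag = MNT_RDONLY;
 *	ext2_mountfs(rootvp, mp, p);
 *	CIRCLEQ_INSERT_HEAD(&mountlist, mp, mnt_list);
 *	ext2_statfs(mp, &mp->mnt_stat, p);
 *	inittodr(fs->s_es->s_wtime);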
*/ #define ROOTNAME "root_device" static int ext2_mountroot() { register struct ext2_sb_info *fs; register struct mount *mp; struct proc *p = curproc; struct ufsmount *ump; u_int size; int error; if ((error = bdevvp(rootdev, &rootvp))) { printf("ext2_mountroot: can't find rootvp"); return (error); } mp = bsd_malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK); bzero((char *)mp, (u_long)sizeof(struct mount)); mp->mnt_op = &ext2fs_vfsops; mp->mnt_flag = MNT_RDONLY; if (bdevsw(rootdev)->d_flags & D_NOCLUSTERR) mp->mnt_flag |= MNT_NOCLUSTERR; if (bdevsw(rootdev)->d_flags & D_NOCLUSTERW) mp->mnt_flag |= MNT_NOCLUSTERW; if (error = ext2_mountfs(rootvp, mp, p)) { bsd_free(mp, M_MOUNT); return (error); } if (error = vfs_lock(mp)) { (void)ext2_unmount(mp, 0, p); bsd_free(mp, M_MOUNT); return (error); } CIRCLEQ_INSERT_HEAD(&mountlist, mp, mnt_list); mp->mnt_flag |= MNT_ROOTFS; mp->mnt_vnodecovered = NULLVP; ump = VFSTOUFS(mp); fs = ump->um_e2fs; bzero(fs->fs_fsmnt, sizeof(fs->fs_fsmnt)); fs->fs_fsmnt[0] = '/'; bcopy((caddr_t)fs->fs_fsmnt, (caddr_t)mp->mnt_stat.f_mntonname, MNAMELEN); (void) copystr(ROOTNAME, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, &size); bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size); (void)ext2_statfs(mp, &mp->mnt_stat, p); vfs_unlock(mp); inittodr(fs->s_es->s_wtime); /* this helps to set the time */ return (0); } #endif /* * VFS Operations. * * mount system call */ static int ext2_mount(mp, path, data, ndp, p) register struct mount *mp; char *path; caddr_t data; /* this is actually a (struct ufs_args *) */ struct nameidata *ndp; struct proc *p; { struct vnode *devvp; struct ufs_args args; struct ufsmount *ump = 0; register struct ext2_sb_info *fs; u_int size; int error, flags; mode_t accessmode; if ((error = copyin(data, (caddr_t)&args, sizeof (struct ufs_args))) != 0) return (error); /* * If updating, check whether changing from read-only to * read/write; if there is no device name, that's all we do. * Disallow clearing MNT_NOCLUSTERR and MNT_NOCLUSTERW flags, * if block device requests. */ if (mp->mnt_flag & MNT_UPDATE) { ump = VFSTOUFS(mp); fs = ump->um_e2fs; error = 0; if (bdevsw(ump->um_dev)->d_flags & D_NOCLUSTERR) mp->mnt_flag |= MNT_NOCLUSTERR; if (bdevsw(ump->um_dev)->d_flags & D_NOCLUSTERW) mp->mnt_flag |= MNT_NOCLUSTERW; if (fs->s_rd_only == 0 && (mp->mnt_flag & MNT_RDONLY)) { flags = WRITECLOSE; if (mp->mnt_flag & MNT_FORCE) flags |= FORCECLOSE; if (vfs_busy(mp, LK_NOWAIT, 0, p)) return (EBUSY); error = ext2_flushfiles(mp, flags, p); vfs_unbusy(mp, p); if (!error && fs->s_wasvalid) { fs->s_es->s_state |= EXT2_VALID_FS; ext2_sbupdate(ump, MNT_WAIT); } fs->s_rd_only = 1; } if (!error && (mp->mnt_flag & MNT_RELOAD)) error = ext2_reload(mp, ndp->ni_cnd.cn_cred, p); if (error) return (error); if (fs->s_rd_only && (mp->mnt_kern_flag & MNTK_WANTRDWR)) { /* * If upgrade to read-write by non-root, then verify * that user has necessary permissions on the device. */ if (p->p_ucred->cr_uid != 0) { devvp = ump->um_devvp; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, p); if ((error = VOP_ACCESS(devvp, VREAD | VWRITE, p->p_ucred, p)) != 0) { VOP_UNLOCK(devvp, 0, p); return (error); } VOP_UNLOCK(devvp, 0, p); } if ((fs->s_es->s_state & EXT2_VALID_FS) == 0 || (fs->s_es->s_state & EXT2_ERROR_FS)) { if (mp->mnt_flag & MNT_FORCE) { printf( "WARNING: %s was not properly dismounted\n", fs->fs_fsmnt); } else { printf( "WARNING: R/W mount of %s denied. 
Filesystem is not clean - run fsck\n", fs->fs_fsmnt); return (EPERM); } } fs->s_es->s_state &= ~EXT2_VALID_FS; ext2_sbupdate(ump, MNT_WAIT); fs->s_rd_only = 0; } if (args.fspec == 0) { /* * Process export requests. */ return (vfs_export(mp, &ump->um_export, &args.export)); } } /* * Not an update, or updating the name: look up the name * and verify that it refers to a sensible block device. */ NDINIT(ndp, LOOKUP, FOLLOW, UIO_USERSPACE, args.fspec, p); if ((error = namei(ndp)) != 0) return (error); devvp = ndp->ni_vp; if (devvp->v_type != VBLK) { vrele(devvp); return (ENOTBLK); } if (bdevsw(devvp->v_rdev) == NULL) { vrele(devvp); return (ENXIO); } /* * If mount by non-root, then verify that user has necessary * permissions on the device. */ if (p->p_ucred->cr_uid != 0) { accessmode = VREAD; if ((mp->mnt_flag & MNT_RDONLY) == 0) accessmode |= VWRITE; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, p); if ((error = VOP_ACCESS(devvp, accessmode, p->p_ucred, p)) != 0) { vput(devvp); return (error); } VOP_UNLOCK(devvp, 0, p); } if ((mp->mnt_flag & MNT_UPDATE) == 0) { if (bdevsw(devvp->v_rdev)->d_flags & D_NOCLUSTERR) mp->mnt_flag |= MNT_NOCLUSTERR; if (bdevsw(devvp->v_rdev)->d_flags & D_NOCLUSTERW) mp->mnt_flag |= MNT_NOCLUSTERW; error = ext2_mountfs(devvp, mp, p); } else { if (devvp != ump->um_devvp) error = EINVAL; /* needs translation */ else vrele(devvp); } if (error) { vrele(devvp); return (error); } ump = VFSTOUFS(mp); fs = ump->um_e2fs; (void) copyinstr(path, fs->fs_fsmnt, sizeof(fs->fs_fsmnt) - 1, &size); bzero(fs->fs_fsmnt + size, sizeof(fs->fs_fsmnt) - size); bcopy((caddr_t)fs->fs_fsmnt, (caddr_t)mp->mnt_stat.f_mntonname, MNAMELEN); (void) copyinstr(args.fspec, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, &size); bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size); (void)ext2_statfs(mp, &mp->mnt_stat, p); return (0); } /* * checks that the data in the descriptor blocks make sense * this is taken from ext2/super.c */ static int ext2_check_descriptors (struct ext2_sb_info * sb) { int i; int desc_block = 0; unsigned long block = sb->s_es->s_first_data_block; struct ext2_group_desc * gdp = NULL; /* ext2_debug ("Checking group descriptors"); */ for (i = 0; i < sb->s_groups_count; i++) { /* examine next descriptor block */ if ((i % EXT2_DESC_PER_BLOCK(sb)) == 0) gdp = (struct ext2_group_desc *) sb->s_group_desc[desc_block++]->b_data; if (gdp->bg_block_bitmap < block || gdp->bg_block_bitmap >= block + EXT2_BLOCKS_PER_GROUP(sb)) { printf ("ext2_check_descriptors: " "Block bitmap for group %d" " not in group (block %lu)!\n", i, (unsigned long) gdp->bg_block_bitmap); return 0; } if (gdp->bg_inode_bitmap < block || gdp->bg_inode_bitmap >= block + EXT2_BLOCKS_PER_GROUP(sb)) { printf ("ext2_check_descriptors: " "Inode bitmap for group %d" " not in group (block %lu)!\n", i, (unsigned long) gdp->bg_inode_bitmap); return 0; } if (gdp->bg_inode_table < block || gdp->bg_inode_table + sb->s_itb_per_group >= block + EXT2_BLOCKS_PER_GROUP(sb)) { printf ("ext2_check_descriptors: " "Inode table for group %d" " not in group (block %lu)!\n", i, (unsigned long) gdp->bg_inode_table); return 0; } block += EXT2_BLOCKS_PER_GROUP(sb); gdp++; } return 1; } /* * this computes the fields of the ext2_sb_info structure from the * data in the ext2_super_block structure read in */ static int compute_sb_data(devvp, es, fs) struct vnode * devvp; struct ext2_super_block * es; struct ext2_sb_info * fs; { int db_count, error; int i, j; int logic_sb_block = 1; /* XXX for now */ #if 1 #define V(v) #else #define V(v) printf(#v"= %d\n", 
fs->v); #endif fs->s_blocksize = EXT2_MIN_BLOCK_SIZE << es->s_log_block_size; V(s_blocksize) fs->s_bshift = EXT2_MIN_BLOCK_LOG_SIZE + es->s_log_block_size; V(s_bshift) fs->s_fsbtodb = es->s_log_block_size + 1; V(s_fsbtodb) fs->s_qbmask = fs->s_blocksize - 1; V(s_bmask) fs->s_blocksize_bits = EXT2_BLOCK_SIZE_BITS(es); V(s_blocksize_bits) fs->s_frag_size = EXT2_MIN_FRAG_SIZE << es->s_log_frag_size; V(s_frag_size) if (fs->s_frag_size) fs->s_frags_per_block = fs->s_blocksize / fs->s_frag_size; V(s_frags_per_block) fs->s_blocks_per_group = es->s_blocks_per_group; V(s_blocks_per_group) fs->s_frags_per_group = es->s_frags_per_group; V(s_frags_per_group) fs->s_inodes_per_group = es->s_inodes_per_group; V(s_inodes_per_group) fs->s_inodes_per_block = fs->s_blocksize / EXT2_INODE_SIZE; V(s_inodes_per_block) fs->s_itb_per_group = fs->s_inodes_per_group /fs->s_inodes_per_block; V(s_itb_per_group) fs->s_desc_per_block = fs->s_blocksize / sizeof (struct ext2_group_desc); V(s_desc_per_block) /* s_resuid / s_resgid ? */ fs->s_groups_count = (es->s_blocks_count - es->s_first_data_block + EXT2_BLOCKS_PER_GROUP(fs) - 1) / EXT2_BLOCKS_PER_GROUP(fs); V(s_groups_count) db_count = (fs->s_groups_count + EXT2_DESC_PER_BLOCK(fs) - 1) / EXT2_DESC_PER_BLOCK(fs); fs->s_db_per_group = db_count; V(s_db_per_group) fs->s_group_desc = bsd_malloc(db_count * sizeof (struct buf *), M_UFSMNT, M_WAITOK); /* adjust logic_sb_block */ if(fs->s_blocksize > SBSIZE) /* Godmar thinks: if the blocksize is greater than 1024, then the superblock is logically part of block zero. */ logic_sb_block = 0; for (i = 0; i < db_count; i++) { error = bread(devvp , fsbtodb(fs, logic_sb_block + i + 1), fs->s_blocksize, NOCRED, &fs->s_group_desc[i]); if(error) { for (j = 0; j < i; j++) brelse(fs->s_group_desc[j]); bsd_free(fs->s_group_desc, M_UFSMNT); printf("EXT2-fs: unable to read group descriptors (%d)\n", error); return EIO; } /* Set the B_LOCKED flag on the buffer, then brelse() it */ LCK_BUF(fs->s_group_desc[i]) } if(!ext2_check_descriptors(fs)) { for (j = 0; j < db_count; j++) ULCK_BUF(fs->s_group_desc[j]) bsd_free(fs->s_group_desc, M_UFSMNT); printf("EXT2-fs: (ext2_check_descriptors failure) " "unable to read group descriptors\n"); return EIO; } for (i = 0; i < EXT2_MAX_GROUP_LOADED; i++) { fs->s_inode_bitmap_number[i] = 0; fs->s_inode_bitmap[i] = NULL; fs->s_block_bitmap_number[i] = 0; fs->s_block_bitmap[i] = NULL; } fs->s_loaded_inode_bitmaps = 0; fs->s_loaded_block_bitmaps = 0; return 0; } /* * Reload all incore data for a filesystem (used after running fsck on * the root filesystem and finding things to fix). The filesystem must * be mounted read-only. * * Things to do to update the mount: * 1) invalidate all cached meta-data. * 2) re-read superblock from disk. * 3) re-read summary information from disk. * 4) invalidate all inactive vnodes. * 5) invalidate all cached file data. * 6) re-read inode data for all active vnodes. */ static int ext2_reload(mountp, cred, p) register struct mount *mountp; struct ucred *cred; struct proc *p; { register struct vnode *vp, *nvp, *devvp; struct inode *ip; struct buf *bp; struct ext2_super_block * es; struct ext2_sb_info *fs; int error; if ((mountp->mnt_flag & MNT_RDONLY) == 0) return (EINVAL); /* * Step 1: invalidate all cached meta-data. */ devvp = VFSTOUFS(mountp)->um_devvp; if (vinvalbuf(devvp, 0, cred, p, 0, 0)) panic("ext2_reload: dirty1"); /* * Step 2: re-read superblock from disk. 
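
A quick cross-check, not part of the patch: the arithmetic compute_sb_data() performs above is easy to verify in isolation. The sketch below is a minimal user-space rendition of the same derivations, using invented superblock values (4K blocks, 2 GB of space, 32768 blocks and 8192 inodes per group) rather than the kernel structures; it only illustrates the shift and the rounded-up divisions, and does not mirror the function exactly.

#include <stdio.h>

#define EXT2_MIN_BLOCK_SIZE 1024	/* as in the ext2 headers */

int
main(void)
{
    /* Hypothetical superblock fields, chosen for illustration only. */
    unsigned long s_log_block_size = 2;		/* 1024 << 2 = 4096-byte blocks */
    unsigned long s_blocks_count = 524288;	/* 2 GB at 4K per block */
    unsigned long s_first_data_block = 0;	/* 0 whenever blocksize > 1024 */
    unsigned long s_blocks_per_group = 32768;
    unsigned long s_inodes_per_group = 8192;
    unsigned long inode_size = 128;		/* EXT2_INODE_SIZE */
    unsigned long desc_size = 32;		/* sizeof(struct ext2_group_desc) */

    unsigned long blocksize = EXT2_MIN_BLOCK_SIZE << s_log_block_size;
    unsigned long inodes_per_block = blocksize / inode_size;
    unsigned long itb_per_group = s_inodes_per_group / inodes_per_block;
    unsigned long desc_per_block = blocksize / desc_size;

    /* Rounded-up divisions, as in compute_sb_data(). */
    unsigned long groups_count = (s_blocks_count - s_first_data_block +
        s_blocks_per_group - 1) / s_blocks_per_group;
    unsigned long db_count = (groups_count + desc_per_block - 1) /
        desc_per_block;

    printf("blocksize %lu, %lu groups, %lu descriptor block(s), "
        "%lu inode-table blocks per group\n",
        blocksize, groups_count, db_count, itb_per_group);
    return (0);
}

With these numbers the program reports a 4096-byte block size, 16 groups, 1 descriptor block and 256 inode-table blocks per group, which is the shape ext2_statfs() later relies on.
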
* constants have been adjusted for ext2 */ if ((error = bread(devvp, SBLOCK, SBSIZE, NOCRED, &bp)) != 0) return (error); es = (struct ext2_super_block *)bp->b_data; if (es->s_magic != EXT2_SUPER_MAGIC) { if(es->s_magic == EXT2_PRE_02B_MAGIC) printf("This filesystem bears the magic number of a pre " "0.2b version of ext2. This is not supported by " "Lites.\n"); else printf("Wrong magic number: %x (expected %x for ext2 fs\n", es->s_magic, EXT2_SUPER_MAGIC); brelse(bp); return (EIO); /* XXX needs translation */ } fs = VFSTOUFS(mountp)->um_e2fs; bcopy(bp->b_data, fs->s_es, sizeof(struct ext2_super_block)); if((error = compute_sb_data(devvp, es, fs)) != 0) { brelse(bp); return error; } #ifdef UNKLAR if (fs->fs_sbsize < SBSIZE) bp->b_flags |= B_INVAL; #endif brelse(bp); loop: simple_lock(&mntvnode_slock); for (vp = mountp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) { if (vp->v_mount != mountp) { simple_unlock(&mntvnode_slock); goto loop; } nvp = vp->v_mntvnodes.le_next; /* * Step 4: invalidate all inactive vnodes. */ if (vrecycle(vp, &mntvnode_slock, p)) goto loop; /* * Step 5: invalidate all cached file data. */ simple_lock(&vp->v_interlock); simple_unlock(&mntvnode_slock); if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, p)) { goto loop; } if (vinvalbuf(vp, 0, cred, p, 0, 0)) panic("ext2_reload: dirty2"); /* * Step 6: re-read inode data for all active vnodes. */ ip = VTOI(vp); error = bread(devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), (int)fs->s_blocksize, NOCRED, &bp); if (error) { vput(vp); return (error); } ext2_ei2di((struct ext2_inode *) ((char *)bp->b_data + EXT2_INODE_SIZE * ino_to_fsbo(fs, ip->i_number)), &ip->i_din); brelse(bp); vput(vp); simple_lock(&mntvnode_slock); } simple_unlock(&mntvnode_slock); return (0); } /* * Common code for mount and mountroot */ static int ext2_mountfs(devvp, mp, p) register struct vnode *devvp; struct mount *mp; struct proc *p; { register struct ufsmount *ump; struct buf *bp; register struct ext2_sb_info *fs; struct ext2_super_block * es; dev_t dev = devvp->v_rdev; struct partinfo dpart; int havepart = 0; int error, i, size; int ronly; /* * Disallow multiple mounts of the same device. * Disallow mounting of a device that is currently in use * (except for root, which might share swap device for miniroot). * Flush out any old buffers remaining from a previous use. */ if ((error = vfs_mountedon(devvp)) != 0) return (error); if (vcount(devvp) > 1 && devvp != rootvp) return (EBUSY); if ((error = vinvalbuf(devvp, V_SAVE, p->p_ucred, p, 0, 0)) != 0) return (error); #ifdef READONLY /* turn on this to force it to be read-only */ mp->mnt_flag |= MNT_RDONLY; #endif ronly = (mp->mnt_flag & MNT_RDONLY) != 0; if ((error = VOP_OPEN(devvp, ronly ? FREAD : FREAD|FWRITE, FSCRED, p)) != 0) return (error); if (VOP_IOCTL(devvp, DIOCGPART, (caddr_t)&dpart, FREAD, NOCRED, p) != 0) size = DEV_BSIZE; else { havepart = 1; size = dpart.disklab->d_secsize; } bp = NULL; ump = NULL; if ((error = bread(devvp, SBLOCK, SBSIZE, NOCRED, &bp)) != 0) goto out; es = (struct ext2_super_block *)bp->b_data; if (es->s_magic != EXT2_SUPER_MAGIC) { if(es->s_magic == EXT2_PRE_02B_MAGIC) printf("This filesystem bears the magic number of a pre " "0.2b version of ext2. 
This is not supported by " "Lites.\n"); else printf("Wrong magic number: %x (expected %x for EXT2FS)\n", es->s_magic, EXT2_SUPER_MAGIC); error = EINVAL; /* XXX needs translation */ goto out; } if ((es->s_state & EXT2_VALID_FS) == 0 || (es->s_state & EXT2_ERROR_FS)) { if (ronly || (mp->mnt_flag & MNT_FORCE)) { printf( "WARNING: Filesystem was not properly dismounted\n"); } else { printf( "WARNING: R/W mount denied. Filesystem is not clean - run fsck\n"); error = EPERM; goto out; } } ump = bsd_malloc(sizeof *ump, M_UFSMNT, M_WAITOK); bzero((caddr_t)ump, sizeof *ump); ump->um_malloctype = M_EXT2NODE; ump->um_blkatoff = ext2_blkatoff; ump->um_truncate = ext2_truncate; ump->um_update = ext2_update; ump->um_valloc = ext2_valloc; ump->um_vfree = ext2_vfree; /* I don't know whether this is the right strategy. Note that we dynamically allocate both a ext2_sb_info and a ext2_super_block while Linux keeps the super block in a locked buffer */ ump->um_e2fs = bsd_malloc(sizeof(struct ext2_sb_info), M_UFSMNT, M_WAITOK); ump->um_e2fs->s_es = bsd_malloc(sizeof(struct ext2_super_block), M_UFSMNT, M_WAITOK); bcopy(es, ump->um_e2fs->s_es, (u_int)sizeof(struct ext2_super_block)); if ((error = compute_sb_data(devvp, ump->um_e2fs->s_es, ump->um_e2fs))) goto out; /* * We don't free the group descriptors allocated by compute_sb_data() * until ext2_unmount(). This is OK since the mount will succeed. */ brelse(bp); bp = NULL; fs = ump->um_e2fs; fs->s_rd_only = ronly; /* ronly is set according to mnt_flags */ /* if the fs is not mounted read-only, make sure the super block is always written back on a sync() */ fs->s_wasvalid = fs->s_es->s_state & EXT2_VALID_FS ? 1 : 0; if (ronly == 0) { fs->s_dirt = 1; /* mark it modified */ fs->s_es->s_state &= ~EXT2_VALID_FS; /* set fs invalid */ } mp->mnt_data = (qaddr_t)ump; mp->mnt_stat.f_fsid.val[0] = (long)dev; mp->mnt_stat.f_fsid.val[1] = mp->mnt_vfc->vfc_typenum; mp->mnt_maxsymlinklen = EXT2_MAXSYMLINKLEN; mp->mnt_flag |= MNT_LOCAL; ump->um_mountp = mp; ump->um_dev = dev; ump->um_devvp = devvp; /* setting those two parameters allows us to use ufs_bmap w/o changse ! */ ump->um_nindir = EXT2_ADDR_PER_BLOCK(fs); ump->um_bptrtodb = fs->s_es->s_log_block_size + 1; ump->um_seqinc = EXT2_FRAGS_PER_BLOCK(fs); for (i = 0; i < MAXQUOTAS; i++) ump->um_quotas[i] = NULLVP; devvp->v_specmountpoint = mp; if (ronly == 0) ext2_sbupdate(ump, MNT_WAIT); return (0); out: if (bp) brelse(bp); (void)VOP_CLOSE(devvp, ronly ? 
FREAD : FREAD|FWRITE, NOCRED, p); if (ump) { bsd_free(ump->um_e2fs->s_es, M_UFSMNT); bsd_free(ump->um_e2fs, M_UFSMNT); bsd_free(ump, M_UFSMNT); mp->mnt_data = (qaddr_t)0; } return (error); } /* * unmount system call */ static int ext2_unmount(mp, mntflags, p) struct mount *mp; int mntflags; struct proc *p; { register struct ufsmount *ump; register struct ext2_sb_info *fs; int error, flags, ronly, i; flags = 0; if (mntflags & MNT_FORCE) { if (mp->mnt_flag & MNT_ROOTFS) return (EINVAL); flags |= FORCECLOSE; } if ((error = ext2_flushfiles(mp, flags, p)) != 0) return (error); ump = VFSTOUFS(mp); fs = ump->um_e2fs; ronly = fs->s_rd_only; if (ronly == 0) { if (fs->s_wasvalid) fs->s_es->s_state |= EXT2_VALID_FS; ext2_sbupdate(ump, MNT_WAIT); } /* release buffers containing group descriptors */ for(i = 0; i < fs->s_db_per_group; i++) ULCK_BUF(fs->s_group_desc[i]) bsd_free(fs->s_group_desc, M_UFSMNT); /* release cached inode/block bitmaps */ for (i = 0; i < EXT2_MAX_GROUP_LOADED; i++) if (fs->s_inode_bitmap[i]) ULCK_BUF(fs->s_inode_bitmap[i]) for (i = 0; i < EXT2_MAX_GROUP_LOADED; i++) if (fs->s_block_bitmap[i]) ULCK_BUF(fs->s_block_bitmap[i]) ump->um_devvp->v_specmountpoint = NULL; error = VOP_CLOSE(ump->um_devvp, ronly ? FREAD : FREAD|FWRITE, NOCRED, p); vrele(ump->um_devvp); bsd_free(fs->s_es, M_UFSMNT); bsd_free(fs, M_UFSMNT); bsd_free(ump, M_UFSMNT); mp->mnt_data = (qaddr_t)0; mp->mnt_flag &= ~MNT_LOCAL; return (error); } /* * Flush out all the files in a filesystem. */ static int ext2_flushfiles(mp, flags, p) register struct mount *mp; int flags; struct proc *p; { register struct ufsmount *ump; int error; #if QUOTA int i; #endif ump = VFSTOUFS(mp); #if QUOTA if (mp->mnt_flag & MNT_QUOTA) { if ((error = vflush(mp, NULLVP, SKIPSYSTEM|flags)) != 0) return (error); for (i = 0; i < MAXQUOTAS; i++) { if (ump->um_quotas[i] == NULLVP) continue; quotaoff(p, mp, i); } /* * Here we fall through to vflush again to ensure * that we have gotten rid of all the system vnodes. */ } #endif error = vflush(mp, NULLVP, flags); return (error); } /* * Get file system statistics. * taken from ext2/super.c ext2_statfs */ static int ext2_statfs(mp, sbp, p) struct mount *mp; register struct statfs *sbp; struct proc *p; { unsigned long overhead; unsigned long overhead_per_group; register struct ufsmount *ump; register struct ext2_sb_info *fs; register struct ext2_super_block *es; ump = VFSTOUFS(mp); fs = ump->um_e2fs; es = fs->s_es; if (es->s_magic != EXT2_SUPER_MAGIC) panic("ext2_statfs - magic number spoiled"); /* * Compute the overhead (FS structures) */ overhead_per_group = 1 /* super block */ + fs->s_db_per_group + 1 /* block bitmap */ + 1 /* inode bitmap */ + fs->s_itb_per_group; overhead = es->s_first_data_block + fs->s_groups_count * overhead_per_group; sbp->f_bsize = EXT2_FRAG_SIZE(fs); sbp->f_iosize = EXT2_BLOCK_SIZE(fs); sbp->f_blocks = es->s_blocks_count - overhead; sbp->f_bfree = es->s_free_blocks_count; sbp->f_bavail = sbp->f_bfree - es->s_r_blocks_count; sbp->f_files = es->s_inodes_count; sbp->f_ffree = es->s_free_inodes_count; if (sbp != &mp->mnt_stat) { sbp->f_type = mp->mnt_vfc->vfc_typenum; bcopy((caddr_t)mp->mnt_stat.f_mntonname, (caddr_t)&sbp->f_mntonname[0], MNAMELEN); bcopy((caddr_t)mp->mnt_stat.f_mntfromname, (caddr_t)&sbp->f_mntfromname[0], MNAMELEN); } return (0); } /* * Go through the disk queues to initiate sandbagged IO; * go through the inodes to write those that have been modified; * initiate the writing of the super block if it has been modified. 
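
A side note, not part of the diff: the overhead that ext2_statfs() above subtracts from s_blocks_count is just the per-group metadata (superblock copy, group descriptors, two bitmaps, inode table) multiplied by the number of groups. A small worked example, reusing the hypothetical 16-group layout from the earlier sketch (one descriptor block and 256 inode-table blocks per group; the numbers are invented, not taken from a real filesystem):

#include <stdio.h>

int
main(void)
{
    unsigned long first_data_block = 0;
    unsigned long groups_count = 16;
    unsigned long db_per_group = 1;	/* group-descriptor blocks */
    unsigned long itb_per_group = 256;	/* inode-table blocks */

    /* superblock + descriptors + block bitmap + inode bitmap + inode table */
    unsigned long overhead_per_group = 1 + db_per_group + 1 + 1 +
        itb_per_group;
    unsigned long overhead = first_data_block +
        groups_count * overhead_per_group;

    printf("%lu metadata blocks per group, %lu blocks of overhead in all\n",
        overhead_per_group, overhead);	/* 260 and 4160 here */
    return (0);
}

f_blocks is then reported as s_blocks_count minus that overhead, while f_bavail further subtracts the reserved blocks (s_r_blocks_count) from f_bfree.
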
* * Note: we are always called with the filesystem marked `MPBUSY'. */ static int ext2_sync(mp, waitfor, cred, p) struct mount *mp; int waitfor; struct ucred *cred; struct proc *p; { struct vnode *nvp, *vp; struct inode *ip; struct ufsmount *ump = VFSTOUFS(mp); struct ext2_sb_info *fs; int error, allerror = 0; fs = ump->um_e2fs; if (fs->s_dirt != 0 && fs->s_rd_only != 0) { /* XXX */ printf("fs = %s\n", fs->fs_fsmnt); panic("ext2_sync: rofs mod"); } /* * Write back each (modified) inode. */ simple_lock(&mntvnode_slock); loop: for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) { /* * If the vnode that we are about to sync is no longer * associated with this mount point, start over. */ if (vp->v_mount != mp) goto loop; simple_lock(&vp->v_interlock); nvp = vp->v_mntvnodes.le_next; ip = VTOI(vp); if (vp->v_type == VNON || ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0 && (TAILQ_EMPTY(&vp->v_dirtyblkhd) || waitfor == MNT_LAZY))) { simple_unlock(&vp->v_interlock); continue; } simple_unlock(&mntvnode_slock); error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, p); if (error) { simple_lock(&mntvnode_slock); if (error == ENOENT) goto loop; continue; } if ((error = VOP_FSYNC(vp, cred, waitfor, p)) != 0) allerror = error; VOP_UNLOCK(vp, 0, p); vrele(vp); simple_lock(&mntvnode_slock); } simple_unlock(&mntvnode_slock); /* * Force stale file system control information to be flushed. */ if (waitfor != MNT_LAZY) { vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY, p); if ((error = VOP_FSYNC(ump->um_devvp, cred, waitfor, p)) != 0) allerror = error; VOP_UNLOCK(ump->um_devvp, 0, p); } #if QUOTA qsync(mp); #endif /* * Write back modified superblock. */ if (fs->s_dirt != 0) { fs->s_dirt = 0; fs->s_es->s_wtime = time_second; if ((error = ext2_sbupdate(ump, waitfor)) != 0) allerror = error; } return (allerror); } /* * Look up a EXT2FS dinode number to find its incore vnode, otherwise read it * in from disk. If it is in core, wait for the lock bit to clear, then * return the inode locked. Detection and handling of mount points must be * done by the calling routine. */ static int ext2_vget(mp, ino, vpp) struct mount *mp; ino_t ino; struct vnode **vpp; { register struct ext2_sb_info *fs; register struct inode *ip; struct ufsmount *ump; struct buf *bp; struct vnode *vp; dev_t dev; int i, error; int used_blocks; ump = VFSTOUFS(mp); dev = ump->um_dev; restart: if ((*vpp = ufs_ihashget(dev, ino)) != NULL) return (0); /* * Lock out the creation of new entries in the FFS hash table in * case getnewvnode() or MALLOC() blocks, otherwise a duplicate * may occur! */ if (ext2fs_inode_hash_lock) { while (ext2fs_inode_hash_lock) { ext2fs_inode_hash_lock = -1; tsleep(&ext2fs_inode_hash_lock, PVM, "e2vget", 0); } goto restart; } ext2fs_inode_hash_lock = 1; /* * If this MALLOC() is performed after the getnewvnode() * it might block, leaving a vnode with a NULL v_data to be * found by ext2_sync() if a sync happens to fire right then, * which will cause a panic because ext2_sync() blindly * dereferences vp->v_data (as well it should). */ MALLOC(ip, struct inode *, sizeof(struct inode), M_EXT2NODE, M_WAITOK); /* Allocate a new vnode/inode. 
*/ if ((error = getnewvnode(VT_UFS, mp, ext2_vnodeop_p, &vp)) != 0) { if (ext2fs_inode_hash_lock < 0) wakeup(&ext2fs_inode_hash_lock); ext2fs_inode_hash_lock = 0; *vpp = NULL; FREE(ip, M_EXT2NODE); return (error); } bzero((caddr_t)ip, sizeof(struct inode)); lockinit(&ip->i_lock, PINOD, "ext2in", 0, 0); vp->v_data = ip; ip->i_vnode = vp; ip->i_e2fs = fs = ump->um_e2fs; ip->i_dev = dev; ip->i_number = ino; #if QUOTA for (i = 0; i < MAXQUOTAS; i++) ip->i_dquot[i] = NODQUOT; #endif /* * Put it onto its hash chain and lock it so that other requests for * this inode will block if they arrive while we are sleeping waiting * for old data structures to be purged or for the contents of the * disk portion of this inode to be read. */ ufs_ihashins(ip); if (ext2fs_inode_hash_lock < 0) wakeup(&ext2fs_inode_hash_lock); ext2fs_inode_hash_lock = 0; /* Read in the disk contents for the inode, copy into the inode. */ #if 0 printf("ext2_vget(%d) dbn= %d ", ino, fsbtodb(fs, ino_to_fsba(fs, ino))); #endif if ((error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ino)), (int)fs->s_blocksize, NOCRED, &bp)) != 0) { /* * The inode does not contain anything useful, so it would * be misleading to leave it on its hash chain. With mode * still zero, it will be unlinked and returned to the free * list by vput(). */ vput(vp); brelse(bp); *vpp = NULL; return (error); } /* convert ext2 inode to dinode */ ext2_ei2di((struct ext2_inode *) ((char *)bp->b_data + EXT2_INODE_SIZE * ino_to_fsbo(fs, ino)), &ip->i_din); ip->i_block_group = ino_to_cg(fs, ino); ip->i_next_alloc_block = 0; ip->i_next_alloc_goal = 0; ip->i_prealloc_count = 0; ip->i_prealloc_block = 0; /* now we want to make sure that block pointers for unused blocks are zeroed out - ext2_balloc depends on this although for regular files and directories only */ if(S_ISDIR(ip->i_mode) || S_ISREG(ip->i_mode)) { used_blocks = (ip->i_size+fs->s_blocksize-1) / fs->s_blocksize; for(i = used_blocks; i < EXT2_NDIR_BLOCKS; i++) ip->i_db[i] = 0; } /* ext2_print_inode(ip); */ brelse(bp); /* * Initialize the vnode from the inode, check for aliases. * Note that the underlying vnode may have changed. */ if ((error = ufs_vinit(mp, ext2_specop_p, ext2_fifoop_p, &vp)) != 0) { vput(vp); *vpp = NULL; return (error); } /* * Finish inode initialization now that aliasing has been resolved. */ ip->i_devvp = ump->um_devvp; VREF(ip->i_devvp); /* * Set up a generation number for this inode if it does not * already have one. This should only happen on old filesystems. */ if (ip->i_gen == 0) { ip->i_gen = random() / 2 + 1; if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) ip->i_flag |= IN_MODIFIED; } *vpp = vp; return (0); } /* * File handle to vnode * * Have to be really careful about stale file handles: * - check that the inode number is valid * - call ext2_vget() to get the locked inode * - check for an unallocated inode (i_mode == 0) * - check that the given client host has export rights and return * those rights via. 
exflagsp and credanonp */ static int ext2_fhtovp(mp, fhp, nam, vpp, exflagsp, credanonp) register struct mount *mp; struct fid *fhp; struct sockaddr *nam; struct vnode **vpp; int *exflagsp; struct ucred **credanonp; { register struct ufid *ufhp; struct ext2_sb_info *fs; ufhp = (struct ufid *)fhp; fs = VFSTOUFS(mp)->um_e2fs; if (ufhp->ufid_ino < ROOTINO || ufhp->ufid_ino >= fs->s_groups_count * fs->s_es->s_inodes_per_group) return (ESTALE); return (ufs_check_export(mp, ufhp, nam, vpp, exflagsp, credanonp)); } /* * Vnode pointer to File handle */ /* ARGSUSED */ static int ext2_vptofh(vp, fhp) struct vnode *vp; struct fid *fhp; { register struct inode *ip; register struct ufid *ufhp; ip = VTOI(vp); ufhp = (struct ufid *)fhp; ufhp->ufid_len = sizeof(struct ufid); ufhp->ufid_ino = ip->i_number; ufhp->ufid_gen = ip->i_gen; return (0); } /* * Write a superblock and associated information back to disk. */ static int ext2_sbupdate(mp, waitfor) struct ufsmount *mp; int waitfor; { register struct ext2_sb_info *fs = mp->um_e2fs; register struct ext2_super_block *es = fs->s_es; register struct buf *bp; int error = 0; /* printf("\nupdating superblock, waitfor=%s\n", waitfor == MNT_WAIT ? "yes":"no"); */ bp = getblk(mp->um_devvp, SBLOCK, SBSIZE, 0, 0); bcopy((caddr_t)es, bp->b_data, (u_int)sizeof(struct ext2_super_block)); if (waitfor == MNT_WAIT) error = bwrite(bp); else bawrite(bp); /* * The buffers for group descriptors, inode bitmaps and block bitmaps * are not busy at this point and are (hopefully) written by the * usual sync mechanism. No need to write them here */ return (error); } Index: head/sys/gnu/fs/ext2fs/ext2_bmap.c =================================================================== --- head/sys/gnu/fs/ext2fs/ext2_bmap.c (revision 49534) +++ head/sys/gnu/fs/ext2fs/ext2_bmap.c (revision 49535) @@ -1,355 +1,354 @@ /* * Copyright (c) 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
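
For orientation only, not part of this change: ext2_vptofh() and ext2_fhtovp() above round-trip an NFS file handle through a struct ufid, which carries little more than the inode number and its generation count; the staleness test rejects inode numbers outside the range the superblock allows before handing off to ufs_check_export(). A minimal user-space analogue of that range check, with invented limits and an invented helper name (check_handle):

#include <stdio.h>

#define ROOTINO 2	/* lowest ordinary inode number */

struct handle {		/* simplified stand-in for struct ufid */
    unsigned long ino;
    unsigned long gen;
};

/* Returns 0 if the handle could still be valid, -1 if it is clearly stale. */
static int
check_handle(const struct handle *h, unsigned long groups_count,
    unsigned long inodes_per_group)
{
    if (h->ino < ROOTINO || h->ino >= groups_count * inodes_per_group)
        return (-1);
    return (0);
}

int
main(void)
{
    struct handle h = { 12345, 7 };	/* invented values */

    printf("handle is %s\n",
        check_handle(&h, 16, 8192) == 0 ? "plausible" : "stale");
    return (0);
}

The range check only filters out handles that could never have been issued; catching reuse of an inode number after deletion is the job of the generation number carried in the handle.
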
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ufs_bmap.c 8.7 (Berkeley) 3/21/95 - * $Id: ufs_bmap.c,v 1.27 1999/05/07 10:11:36 phk Exp $ + * $Id: ufs_bmap.c,v 1.28 1999/05/08 06:40:25 phk Exp $ */ #include #include #include #include #include #include #include #include #include #include #include #include -#include /* * Bmap converts a the logical block number of a file to its physical block * number on the disk. The conversion is done by using the logical block * number to index into the array of block pointers described by the dinode. */ int ufs_bmap(ap) struct vop_bmap_args /* { struct vnode *a_vp; ufs_daddr_t a_bn; struct vnode **a_vpp; ufs_daddr_t *a_bnp; int *a_runp; int *a_runb; } */ *ap; { /* * Check for underlying vnode requests and ensure that logical * to physical mapping is requested. */ if (ap->a_vpp != NULL) *ap->a_vpp = VTOI(ap->a_vp)->i_devvp; if (ap->a_bnp == NULL) return (0); return (ufs_bmaparray(ap->a_vp, ap->a_bn, ap->a_bnp, NULL, NULL, ap->a_runp, ap->a_runb)); } /* * Indirect blocks are now on the vnode for the file. They are given negative * logical block numbers. Indirect blocks are addressed by the negative * address of the first data block to which they point. Double indirect blocks * are addressed by one less than the address of the first indirect block to * which they point. Triple indirect blocks are addressed by one less than * the address of the first double indirect block to which they point. * * ufs_bmaparray does the bmap conversion, and if requested returns the * array of logical blocks which must be traversed to get to a block. * Each entry contains the offset into that block that gets you to the * next block and the disk address of the block (if it is assigned). */ int ufs_bmaparray(vp, bn, bnp, ap, nump, runp, runb) struct vnode *vp; ufs_daddr_t bn; ufs_daddr_t *bnp; struct indir *ap; int *nump; int *runp; int *runb; { register struct inode *ip; struct buf *bp; struct ufsmount *ump; struct mount *mp; struct vnode *devvp; struct indir a[NIADDR+1], *xap; ufs_daddr_t daddr; long metalbn; int error, maxrun = 0, num; ip = VTOI(vp); mp = vp->v_mount; ump = VFSTOUFS(mp); #ifdef DIAGNOSTIC if ((ap != NULL && nump == NULL) || (ap == NULL && nump != NULL)) panic("ufs_bmaparray: invalid arguments"); #endif if (runp) { *runp = 0; } if (runb) { *runb = 0; } maxrun = 0; if (runp || runb || (vp->v_maxio == 0)) { struct vnode *devvp; int blksize; blksize = mp->mnt_stat.f_iosize; /* * XXX * If MAXPHYS is the largest transfer the disks can handle, * we probably want maxrun to be 1 block less so that we * don't create a block larger than the device can handle. */ devvp = ip->i_devvp; if (devvp != NULL && devvp->v_tag != VT_MFS && devvp->v_type == VBLK) { if (bdevsw(devvp->v_rdev)->d_maxio > MAXPHYS) { maxrun = MAXPHYS; vp->v_maxio = MAXPHYS; } else { maxrun = bdevsw(devvp->v_rdev)->d_maxio; vp->v_maxio = bdevsw(devvp->v_rdev)->d_maxio; } maxrun = maxrun / blksize; maxrun -= 1; } if (maxrun <= 0) { vp->v_maxio = DFLTPHYS; maxrun = DFLTPHYS / blksize; maxrun -= 1; } } xap = ap == NULL ? 
a : ap; if (!nump) nump = # error = ufs_getlbns(vp, bn, xap, nump); if (error) return (error); num = *nump; if (num == 0) { *bnp = blkptrtodb(ump, ip->i_db[bn]); if (*bnp == 0) *bnp = -1; else if (runp) { daddr_t bnb = bn; for (++bn; bn < NDADDR && *runp < maxrun && is_sequential(ump, ip->i_db[bn - 1], ip->i_db[bn]); ++bn, ++*runp); bn = bnb; if (runb && (bn > 0)) { for (--bn; (bn >= 0) && (*runb < maxrun) && is_sequential(ump, ip->i_db[bn], ip->i_db[bn+1]); --bn, ++*runb); } } return (0); } /* Get disk address out of indirect block array */ daddr = ip->i_ib[xap->in_off]; devvp = VFSTOUFS(vp->v_mount)->um_devvp; for (bp = NULL, ++xap; --num; ++xap) { /* * Exit the loop if there is no disk address assigned yet and * the indirect block isn't in the cache, or if we were * looking for an indirect block and we've found it. */ metalbn = xap->in_lbn; if ((daddr == 0 && !incore(vp, metalbn)) || metalbn == bn) break; /* * If we get here, we've either got the block in the cache * or we have a disk address for it, go fetch it. */ if (bp) bqrelse(bp); xap->in_exists = 1; bp = getblk(vp, metalbn, mp->mnt_stat.f_iosize, 0, 0); if ((bp->b_flags & B_CACHE) == 0) { #ifdef DIAGNOSTIC if (!daddr) panic("ufs_bmaparray: indirect block not in cache"); #endif bp->b_blkno = blkptrtodb(ump, daddr); bp->b_flags |= B_READ; bp->b_flags &= ~(B_INVAL|B_ERROR); vfs_busy_pages(bp, 0); VOP_STRATEGY(bp->b_vp, bp); curproc->p_stats->p_ru.ru_inblock++; /* XXX */ error = biowait(bp); if (error) { brelse(bp); return (error); } } daddr = ((ufs_daddr_t *)bp->b_data)[xap->in_off]; if (num == 1 && daddr && runp) { for (bn = xap->in_off + 1; bn < MNINDIR(ump) && *runp < maxrun && is_sequential(ump, ((ufs_daddr_t *)bp->b_data)[bn - 1], ((ufs_daddr_t *)bp->b_data)[bn]); ++bn, ++*runp); bn = xap->in_off; if (runb && bn) { for(--bn; bn > 0 && *runb < maxrun && is_sequential(ump, ((daddr_t *)bp->b_data)[bn], ((daddr_t *)bp->b_data)[bn+1]); --bn, ++*runb); } } } if (bp) bqrelse(bp); daddr = blkptrtodb(ump, daddr); *bnp = daddr == 0 ? -1 : daddr; return (0); } /* * Create an array of logical block number/offset pairs which represent the * path of indirect blocks required to access a data block. The first "pair" * contains the logical block number of the appropriate single, double or * triple indirect block and the offset into the inode indirect block array. * Note, the logical block number of the inode single/double/triple indirect * block appears twice in the array, once with the offset into the i_ib and * once with the offset into the page itself. */ int ufs_getlbns(vp, bn, ap, nump) struct vnode *vp; ufs_daddr_t bn; struct indir *ap; int *nump; { long blockcnt, metalbn, realbn; struct ufsmount *ump; int i, numlevels, off; int64_t qblockcnt; ump = VFSTOUFS(vp->v_mount); if (nump) *nump = 0; numlevels = 0; realbn = bn; if ((long)bn < 0) bn = -(long)bn; /* The first NDADDR blocks are direct blocks. */ if (bn < NDADDR) return (0); /* * Determine the number of levels of indirection. After this loop * is done, blockcnt indicates the number of data blocks possible * at the previous level of indirection, and NIADDR - i is the number * of levels of indirection needed to locate the requested block. */ for (blockcnt = 1, i = NIADDR, bn -= NDADDR;; i--, bn -= blockcnt) { if (i == 0) return (EFBIG); /* * Use int64_t's here to avoid overflow for triple indirect * blocks when longs have 32 bits and the block size is more * than 4K. 
*/ qblockcnt = (int64_t)blockcnt * MNINDIR(ump); if (bn < qblockcnt) break; blockcnt = qblockcnt; } /* Calculate the address of the first meta-block. */ if (realbn >= 0) metalbn = -(realbn - bn + NIADDR - i); else metalbn = -(-realbn - bn + NIADDR - i); /* * At each iteration, off is the offset into the bap array which is * an array of disk addresses at the current level of indirection. * The logical block number and the offset in that block are stored * into the argument array. */ ap->in_lbn = metalbn; ap->in_off = off = NIADDR - i; ap->in_exists = 0; ap++; for (++numlevels; i <= NIADDR; i++) { /* If searching for a meta-data block, quit when found. */ if (metalbn == realbn) break; off = (bn / blockcnt) % MNINDIR(ump); ++numlevels; ap->in_lbn = metalbn; ap->in_off = off; ap->in_exists = 0; ++ap; metalbn -= -1 + off * blockcnt; blockcnt /= MNINDIR(ump); } if (nump) *nump = numlevels; return (0); } Index: head/sys/gnu/fs/ext2fs/ext2_vfsops.c =================================================================== --- head/sys/gnu/fs/ext2fs/ext2_vfsops.c (revision 49534) +++ head/sys/gnu/fs/ext2fs/ext2_vfsops.c (revision 49535) @@ -1,1190 +1,1188 @@ /* * modified for EXT2FS support in Lites 1.1 * * Aug 1995, Godmar Back (gback@cs.utah.edu) * University of Utah, Department of Computer Science */ /* * Copyright (c) 1989, 1991, 1993, 1994 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
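
An aside on the code just above, not part of the patch: the loop at the top of ufs_getlbns() decides how many levels of indirection a file-relative block number needs, and the negative metalbn values it then produces follow the addressing convention described before ufs_bmaparray(). The sketch below reproduces just the level-counting part in user space, with invented geometry (12 direct pointers, 1024 pointers per indirect block) standing in for NDADDR and MNINDIR(ump), and an invented helper name (levels_needed):

#include <stdio.h>
#include <stdint.h>

#define NDIRECT 12	/* direct pointers in the inode (cf. NDADDR) */
#define NINDIR  1024	/* pointers per indirect block (cf. MNINDIR) */
#define NLEVELS 3	/* single, double, triple indirect (cf. NIADDR) */

/* Levels of indirection needed for block bn; -1 means "too big" (EFBIG). */
static int
levels_needed(long bn)
{
    int64_t blockcnt = 1, next;
    int i;

    if (bn < NDIRECT)
        return (0);
    bn -= NDIRECT;
    for (i = NLEVELS; ; i--, bn -= blockcnt) {
        if (i == 0)
            return (-1);
        next = blockcnt * NINDIR;
        if (bn < next)
            break;
        blockcnt = next;
    }
    return (NLEVELS - i + 1);
}

int
main(void)
{
    long samples[] = { 5, 800, 200000, 900000000L, 2000000000L };
    int i, n;

    for (i = 0; i < 5; i++) {
        n = levels_needed(samples[i]);
        if (n < 0)
            printf("block %ld: beyond triple indirect (EFBIG)\n", samples[i]);
        else
            printf("block %ld: %d level(s) of indirection\n", samples[i], n);
    }
    return (0);
}

With these constants, blocks 5, 800, 200000 and 900000000 need 0, 1, 2 and 3 levels respectively, and 2000000000 overflows even the triple indirect block.
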
* * @(#)ffs_vfsops.c 8.8 (Berkeley) 4/18/94 */ #include "opt_quota.h" #include #include #include #include #include #include #include #include #include #include #include #include #include -#include - #include #include #include #include #include #include #include #include static int ext2_fhtovp __P((struct mount *, struct fid *, struct sockaddr *, struct vnode **, int *, struct ucred **)); static int ext2_flushfiles __P((struct mount *mp, int flags, struct proc *p)); static int ext2_mount __P((struct mount *, char *, caddr_t, struct nameidata *, struct proc *)); static int ext2_mountfs __P((struct vnode *, struct mount *, struct proc *)); static int ext2_reload __P((struct mount *mountp, struct ucred *cred, struct proc *p)); static int ext2_sbupdate __P((struct ufsmount *, int)); static int ext2_statfs __P((struct mount *, struct statfs *, struct proc *)); static int ext2_sync __P((struct mount *, int, struct ucred *, struct proc *)); static int ext2_unmount __P((struct mount *, int, struct proc *)); static int ext2_vget __P((struct mount *, ino_t, struct vnode **)); static int ext2_vptofh __P((struct vnode *, struct fid *)); static MALLOC_DEFINE(M_EXT2NODE, "EXT2 node", "EXT2 vnode private part"); static struct vfsops ext2fs_vfsops = { ext2_mount, ufs_start, /* empty function */ ext2_unmount, ufs_root, /* root inode via vget */ ufs_quotactl, /* does operations associated with quotas */ ext2_statfs, ext2_sync, ext2_vget, ext2_fhtovp, ext2_vptofh, ext2_init, }; VFS_SET(ext2fs_vfsops, ext2fs, 0); #define bsd_malloc malloc #define bsd_free free static int ext2fs_inode_hash_lock; static int compute_sb_data __P((struct vnode * devvp, struct ext2_super_block * es, struct ext2_sb_info * fs)); #ifdef notyet static int ext2_mountroot __P((void)); /* * Called by main() when ext2fs is going to be mounted as root. * * Name is updated by mount(8) after booting. */ #define ROOTNAME "root_device" static int ext2_mountroot() { register struct ext2_sb_info *fs; register struct mount *mp; struct proc *p = curproc; struct ufsmount *ump; u_int size; int error; if ((error = bdevvp(rootdev, &rootvp))) { printf("ext2_mountroot: can't find rootvp"); return (error); } mp = bsd_malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK); bzero((char *)mp, (u_long)sizeof(struct mount)); mp->mnt_op = &ext2fs_vfsops; mp->mnt_flag = MNT_RDONLY; if (bdevsw(rootdev)->d_flags & D_NOCLUSTERR) mp->mnt_flag |= MNT_NOCLUSTERR; if (bdevsw(rootdev)->d_flags & D_NOCLUSTERW) mp->mnt_flag |= MNT_NOCLUSTERW; if (error = ext2_mountfs(rootvp, mp, p)) { bsd_free(mp, M_MOUNT); return (error); } if (error = vfs_lock(mp)) { (void)ext2_unmount(mp, 0, p); bsd_free(mp, M_MOUNT); return (error); } CIRCLEQ_INSERT_HEAD(&mountlist, mp, mnt_list); mp->mnt_flag |= MNT_ROOTFS; mp->mnt_vnodecovered = NULLVP; ump = VFSTOUFS(mp); fs = ump->um_e2fs; bzero(fs->fs_fsmnt, sizeof(fs->fs_fsmnt)); fs->fs_fsmnt[0] = '/'; bcopy((caddr_t)fs->fs_fsmnt, (caddr_t)mp->mnt_stat.f_mntonname, MNAMELEN); (void) copystr(ROOTNAME, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, &size); bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size); (void)ext2_statfs(mp, &mp->mnt_stat, p); vfs_unlock(mp); inittodr(fs->s_es->s_wtime); /* this helps to set the time */ return (0); } #endif /* * VFS Operations. 
* * mount system call */ static int ext2_mount(mp, path, data, ndp, p) register struct mount *mp; char *path; caddr_t data; /* this is actually a (struct ufs_args *) */ struct nameidata *ndp; struct proc *p; { struct vnode *devvp; struct ufs_args args; struct ufsmount *ump = 0; register struct ext2_sb_info *fs; u_int size; int error, flags; mode_t accessmode; if ((error = copyin(data, (caddr_t)&args, sizeof (struct ufs_args))) != 0) return (error); /* * If updating, check whether changing from read-only to * read/write; if there is no device name, that's all we do. * Disallow clearing MNT_NOCLUSTERR and MNT_NOCLUSTERW flags, * if block device requests. */ if (mp->mnt_flag & MNT_UPDATE) { ump = VFSTOUFS(mp); fs = ump->um_e2fs; error = 0; if (bdevsw(ump->um_dev)->d_flags & D_NOCLUSTERR) mp->mnt_flag |= MNT_NOCLUSTERR; if (bdevsw(ump->um_dev)->d_flags & D_NOCLUSTERW) mp->mnt_flag |= MNT_NOCLUSTERW; if (fs->s_rd_only == 0 && (mp->mnt_flag & MNT_RDONLY)) { flags = WRITECLOSE; if (mp->mnt_flag & MNT_FORCE) flags |= FORCECLOSE; if (vfs_busy(mp, LK_NOWAIT, 0, p)) return (EBUSY); error = ext2_flushfiles(mp, flags, p); vfs_unbusy(mp, p); if (!error && fs->s_wasvalid) { fs->s_es->s_state |= EXT2_VALID_FS; ext2_sbupdate(ump, MNT_WAIT); } fs->s_rd_only = 1; } if (!error && (mp->mnt_flag & MNT_RELOAD)) error = ext2_reload(mp, ndp->ni_cnd.cn_cred, p); if (error) return (error); if (fs->s_rd_only && (mp->mnt_kern_flag & MNTK_WANTRDWR)) { /* * If upgrade to read-write by non-root, then verify * that user has necessary permissions on the device. */ if (p->p_ucred->cr_uid != 0) { devvp = ump->um_devvp; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, p); if ((error = VOP_ACCESS(devvp, VREAD | VWRITE, p->p_ucred, p)) != 0) { VOP_UNLOCK(devvp, 0, p); return (error); } VOP_UNLOCK(devvp, 0, p); } if ((fs->s_es->s_state & EXT2_VALID_FS) == 0 || (fs->s_es->s_state & EXT2_ERROR_FS)) { if (mp->mnt_flag & MNT_FORCE) { printf( "WARNING: %s was not properly dismounted\n", fs->fs_fsmnt); } else { printf( "WARNING: R/W mount of %s denied. Filesystem is not clean - run fsck\n", fs->fs_fsmnt); return (EPERM); } } fs->s_es->s_state &= ~EXT2_VALID_FS; ext2_sbupdate(ump, MNT_WAIT); fs->s_rd_only = 0; } if (args.fspec == 0) { /* * Process export requests. */ return (vfs_export(mp, &ump->um_export, &args.export)); } } /* * Not an update, or updating the name: look up the name * and verify that it refers to a sensible block device. */ NDINIT(ndp, LOOKUP, FOLLOW, UIO_USERSPACE, args.fspec, p); if ((error = namei(ndp)) != 0) return (error); devvp = ndp->ni_vp; if (devvp->v_type != VBLK) { vrele(devvp); return (ENOTBLK); } if (bdevsw(devvp->v_rdev) == NULL) { vrele(devvp); return (ENXIO); } /* * If mount by non-root, then verify that user has necessary * permissions on the device. 
*/ if (p->p_ucred->cr_uid != 0) { accessmode = VREAD; if ((mp->mnt_flag & MNT_RDONLY) == 0) accessmode |= VWRITE; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, p); if ((error = VOP_ACCESS(devvp, accessmode, p->p_ucred, p)) != 0) { vput(devvp); return (error); } VOP_UNLOCK(devvp, 0, p); } if ((mp->mnt_flag & MNT_UPDATE) == 0) { if (bdevsw(devvp->v_rdev)->d_flags & D_NOCLUSTERR) mp->mnt_flag |= MNT_NOCLUSTERR; if (bdevsw(devvp->v_rdev)->d_flags & D_NOCLUSTERW) mp->mnt_flag |= MNT_NOCLUSTERW; error = ext2_mountfs(devvp, mp, p); } else { if (devvp != ump->um_devvp) error = EINVAL; /* needs translation */ else vrele(devvp); } if (error) { vrele(devvp); return (error); } ump = VFSTOUFS(mp); fs = ump->um_e2fs; (void) copyinstr(path, fs->fs_fsmnt, sizeof(fs->fs_fsmnt) - 1, &size); bzero(fs->fs_fsmnt + size, sizeof(fs->fs_fsmnt) - size); bcopy((caddr_t)fs->fs_fsmnt, (caddr_t)mp->mnt_stat.f_mntonname, MNAMELEN); (void) copyinstr(args.fspec, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, &size); bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size); (void)ext2_statfs(mp, &mp->mnt_stat, p); return (0); } /* * checks that the data in the descriptor blocks make sense * this is taken from ext2/super.c */ static int ext2_check_descriptors (struct ext2_sb_info * sb) { int i; int desc_block = 0; unsigned long block = sb->s_es->s_first_data_block; struct ext2_group_desc * gdp = NULL; /* ext2_debug ("Checking group descriptors"); */ for (i = 0; i < sb->s_groups_count; i++) { /* examine next descriptor block */ if ((i % EXT2_DESC_PER_BLOCK(sb)) == 0) gdp = (struct ext2_group_desc *) sb->s_group_desc[desc_block++]->b_data; if (gdp->bg_block_bitmap < block || gdp->bg_block_bitmap >= block + EXT2_BLOCKS_PER_GROUP(sb)) { printf ("ext2_check_descriptors: " "Block bitmap for group %d" " not in group (block %lu)!\n", i, (unsigned long) gdp->bg_block_bitmap); return 0; } if (gdp->bg_inode_bitmap < block || gdp->bg_inode_bitmap >= block + EXT2_BLOCKS_PER_GROUP(sb)) { printf ("ext2_check_descriptors: " "Inode bitmap for group %d" " not in group (block %lu)!\n", i, (unsigned long) gdp->bg_inode_bitmap); return 0; } if (gdp->bg_inode_table < block || gdp->bg_inode_table + sb->s_itb_per_group >= block + EXT2_BLOCKS_PER_GROUP(sb)) { printf ("ext2_check_descriptors: " "Inode table for group %d" " not in group (block %lu)!\n", i, (unsigned long) gdp->bg_inode_table); return 0; } block += EXT2_BLOCKS_PER_GROUP(sb); gdp++; } return 1; } /* * this computes the fields of the ext2_sb_info structure from the * data in the ext2_super_block structure read in */ static int compute_sb_data(devvp, es, fs) struct vnode * devvp; struct ext2_super_block * es; struct ext2_sb_info * fs; { int db_count, error; int i, j; int logic_sb_block = 1; /* XXX for now */ #if 1 #define V(v) #else #define V(v) printf(#v"= %d\n", fs->v); #endif fs->s_blocksize = EXT2_MIN_BLOCK_SIZE << es->s_log_block_size; V(s_blocksize) fs->s_bshift = EXT2_MIN_BLOCK_LOG_SIZE + es->s_log_block_size; V(s_bshift) fs->s_fsbtodb = es->s_log_block_size + 1; V(s_fsbtodb) fs->s_qbmask = fs->s_blocksize - 1; V(s_bmask) fs->s_blocksize_bits = EXT2_BLOCK_SIZE_BITS(es); V(s_blocksize_bits) fs->s_frag_size = EXT2_MIN_FRAG_SIZE << es->s_log_frag_size; V(s_frag_size) if (fs->s_frag_size) fs->s_frags_per_block = fs->s_blocksize / fs->s_frag_size; V(s_frags_per_block) fs->s_blocks_per_group = es->s_blocks_per_group; V(s_blocks_per_group) fs->s_frags_per_group = es->s_frags_per_group; V(s_frags_per_group) fs->s_inodes_per_group = es->s_inodes_per_group; V(s_inodes_per_group) 
fs->s_inodes_per_block = fs->s_blocksize / EXT2_INODE_SIZE; V(s_inodes_per_block) fs->s_itb_per_group = fs->s_inodes_per_group /fs->s_inodes_per_block; V(s_itb_per_group) fs->s_desc_per_block = fs->s_blocksize / sizeof (struct ext2_group_desc); V(s_desc_per_block) /* s_resuid / s_resgid ? */ fs->s_groups_count = (es->s_blocks_count - es->s_first_data_block + EXT2_BLOCKS_PER_GROUP(fs) - 1) / EXT2_BLOCKS_PER_GROUP(fs); V(s_groups_count) db_count = (fs->s_groups_count + EXT2_DESC_PER_BLOCK(fs) - 1) / EXT2_DESC_PER_BLOCK(fs); fs->s_db_per_group = db_count; V(s_db_per_group) fs->s_group_desc = bsd_malloc(db_count * sizeof (struct buf *), M_UFSMNT, M_WAITOK); /* adjust logic_sb_block */ if(fs->s_blocksize > SBSIZE) /* Godmar thinks: if the blocksize is greater than 1024, then the superblock is logically part of block zero. */ logic_sb_block = 0; for (i = 0; i < db_count; i++) { error = bread(devvp , fsbtodb(fs, logic_sb_block + i + 1), fs->s_blocksize, NOCRED, &fs->s_group_desc[i]); if(error) { for (j = 0; j < i; j++) brelse(fs->s_group_desc[j]); bsd_free(fs->s_group_desc, M_UFSMNT); printf("EXT2-fs: unable to read group descriptors (%d)\n", error); return EIO; } /* Set the B_LOCKED flag on the buffer, then brelse() it */ LCK_BUF(fs->s_group_desc[i]) } if(!ext2_check_descriptors(fs)) { for (j = 0; j < db_count; j++) ULCK_BUF(fs->s_group_desc[j]) bsd_free(fs->s_group_desc, M_UFSMNT); printf("EXT2-fs: (ext2_check_descriptors failure) " "unable to read group descriptors\n"); return EIO; } for (i = 0; i < EXT2_MAX_GROUP_LOADED; i++) { fs->s_inode_bitmap_number[i] = 0; fs->s_inode_bitmap[i] = NULL; fs->s_block_bitmap_number[i] = 0; fs->s_block_bitmap[i] = NULL; } fs->s_loaded_inode_bitmaps = 0; fs->s_loaded_block_bitmaps = 0; return 0; } /* * Reload all incore data for a filesystem (used after running fsck on * the root filesystem and finding things to fix). The filesystem must * be mounted read-only. * * Things to do to update the mount: * 1) invalidate all cached meta-data. * 2) re-read superblock from disk. * 3) re-read summary information from disk. * 4) invalidate all inactive vnodes. * 5) invalidate all cached file data. * 6) re-read inode data for all active vnodes. */ static int ext2_reload(mountp, cred, p) register struct mount *mountp; struct ucred *cred; struct proc *p; { register struct vnode *vp, *nvp, *devvp; struct inode *ip; struct buf *bp; struct ext2_super_block * es; struct ext2_sb_info *fs; int error; if ((mountp->mnt_flag & MNT_RDONLY) == 0) return (EINVAL); /* * Step 1: invalidate all cached meta-data. */ devvp = VFSTOUFS(mountp)->um_devvp; if (vinvalbuf(devvp, 0, cred, p, 0, 0)) panic("ext2_reload: dirty1"); /* * Step 2: re-read superblock from disk. * constants have been adjusted for ext2 */ if ((error = bread(devvp, SBLOCK, SBSIZE, NOCRED, &bp)) != 0) return (error); es = (struct ext2_super_block *)bp->b_data; if (es->s_magic != EXT2_SUPER_MAGIC) { if(es->s_magic == EXT2_PRE_02B_MAGIC) printf("This filesystem bears the magic number of a pre " "0.2b version of ext2. 
This is not supported by " "Lites.\n"); else printf("Wrong magic number: %x (expected %x for ext2 fs\n", es->s_magic, EXT2_SUPER_MAGIC); brelse(bp); return (EIO); /* XXX needs translation */ } fs = VFSTOUFS(mountp)->um_e2fs; bcopy(bp->b_data, fs->s_es, sizeof(struct ext2_super_block)); if((error = compute_sb_data(devvp, es, fs)) != 0) { brelse(bp); return error; } #ifdef UNKLAR if (fs->fs_sbsize < SBSIZE) bp->b_flags |= B_INVAL; #endif brelse(bp); loop: simple_lock(&mntvnode_slock); for (vp = mountp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) { if (vp->v_mount != mountp) { simple_unlock(&mntvnode_slock); goto loop; } nvp = vp->v_mntvnodes.le_next; /* * Step 4: invalidate all inactive vnodes. */ if (vrecycle(vp, &mntvnode_slock, p)) goto loop; /* * Step 5: invalidate all cached file data. */ simple_lock(&vp->v_interlock); simple_unlock(&mntvnode_slock); if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, p)) { goto loop; } if (vinvalbuf(vp, 0, cred, p, 0, 0)) panic("ext2_reload: dirty2"); /* * Step 6: re-read inode data for all active vnodes. */ ip = VTOI(vp); error = bread(devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), (int)fs->s_blocksize, NOCRED, &bp); if (error) { vput(vp); return (error); } ext2_ei2di((struct ext2_inode *) ((char *)bp->b_data + EXT2_INODE_SIZE * ino_to_fsbo(fs, ip->i_number)), &ip->i_din); brelse(bp); vput(vp); simple_lock(&mntvnode_slock); } simple_unlock(&mntvnode_slock); return (0); } /* * Common code for mount and mountroot */ static int ext2_mountfs(devvp, mp, p) register struct vnode *devvp; struct mount *mp; struct proc *p; { register struct ufsmount *ump; struct buf *bp; register struct ext2_sb_info *fs; struct ext2_super_block * es; dev_t dev = devvp->v_rdev; struct partinfo dpart; int havepart = 0; int error, i, size; int ronly; /* * Disallow multiple mounts of the same device. * Disallow mounting of a device that is currently in use * (except for root, which might share swap device for miniroot). * Flush out any old buffers remaining from a previous use. */ if ((error = vfs_mountedon(devvp)) != 0) return (error); if (vcount(devvp) > 1 && devvp != rootvp) return (EBUSY); if ((error = vinvalbuf(devvp, V_SAVE, p->p_ucred, p, 0, 0)) != 0) return (error); #ifdef READONLY /* turn on this to force it to be read-only */ mp->mnt_flag |= MNT_RDONLY; #endif ronly = (mp->mnt_flag & MNT_RDONLY) != 0; if ((error = VOP_OPEN(devvp, ronly ? FREAD : FREAD|FWRITE, FSCRED, p)) != 0) return (error); if (VOP_IOCTL(devvp, DIOCGPART, (caddr_t)&dpart, FREAD, NOCRED, p) != 0) size = DEV_BSIZE; else { havepart = 1; size = dpart.disklab->d_secsize; } bp = NULL; ump = NULL; if ((error = bread(devvp, SBLOCK, SBSIZE, NOCRED, &bp)) != 0) goto out; es = (struct ext2_super_block *)bp->b_data; if (es->s_magic != EXT2_SUPER_MAGIC) { if(es->s_magic == EXT2_PRE_02B_MAGIC) printf("This filesystem bears the magic number of a pre " "0.2b version of ext2. This is not supported by " "Lites.\n"); else printf("Wrong magic number: %x (expected %x for EXT2FS)\n", es->s_magic, EXT2_SUPER_MAGIC); error = EINVAL; /* XXX needs translation */ goto out; } if ((es->s_state & EXT2_VALID_FS) == 0 || (es->s_state & EXT2_ERROR_FS)) { if (ronly || (mp->mnt_flag & MNT_FORCE)) { printf( "WARNING: Filesystem was not properly dismounted\n"); } else { printf( "WARNING: R/W mount denied. 
Filesystem is not clean - run fsck\n"); error = EPERM; goto out; } } ump = bsd_malloc(sizeof *ump, M_UFSMNT, M_WAITOK); bzero((caddr_t)ump, sizeof *ump); ump->um_malloctype = M_EXT2NODE; ump->um_blkatoff = ext2_blkatoff; ump->um_truncate = ext2_truncate; ump->um_update = ext2_update; ump->um_valloc = ext2_valloc; ump->um_vfree = ext2_vfree; /* I don't know whether this is the right strategy. Note that we dynamically allocate both a ext2_sb_info and a ext2_super_block while Linux keeps the super block in a locked buffer */ ump->um_e2fs = bsd_malloc(sizeof(struct ext2_sb_info), M_UFSMNT, M_WAITOK); ump->um_e2fs->s_es = bsd_malloc(sizeof(struct ext2_super_block), M_UFSMNT, M_WAITOK); bcopy(es, ump->um_e2fs->s_es, (u_int)sizeof(struct ext2_super_block)); if ((error = compute_sb_data(devvp, ump->um_e2fs->s_es, ump->um_e2fs))) goto out; /* * We don't free the group descriptors allocated by compute_sb_data() * until ext2_unmount(). This is OK since the mount will succeed. */ brelse(bp); bp = NULL; fs = ump->um_e2fs; fs->s_rd_only = ronly; /* ronly is set according to mnt_flags */ /* if the fs is not mounted read-only, make sure the super block is always written back on a sync() */ fs->s_wasvalid = fs->s_es->s_state & EXT2_VALID_FS ? 1 : 0; if (ronly == 0) { fs->s_dirt = 1; /* mark it modified */ fs->s_es->s_state &= ~EXT2_VALID_FS; /* set fs invalid */ } mp->mnt_data = (qaddr_t)ump; mp->mnt_stat.f_fsid.val[0] = (long)dev; mp->mnt_stat.f_fsid.val[1] = mp->mnt_vfc->vfc_typenum; mp->mnt_maxsymlinklen = EXT2_MAXSYMLINKLEN; mp->mnt_flag |= MNT_LOCAL; ump->um_mountp = mp; ump->um_dev = dev; ump->um_devvp = devvp; /* setting those two parameters allows us to use ufs_bmap w/o changse ! */ ump->um_nindir = EXT2_ADDR_PER_BLOCK(fs); ump->um_bptrtodb = fs->s_es->s_log_block_size + 1; ump->um_seqinc = EXT2_FRAGS_PER_BLOCK(fs); for (i = 0; i < MAXQUOTAS; i++) ump->um_quotas[i] = NULLVP; devvp->v_specmountpoint = mp; if (ronly == 0) ext2_sbupdate(ump, MNT_WAIT); return (0); out: if (bp) brelse(bp); (void)VOP_CLOSE(devvp, ronly ? FREAD : FREAD|FWRITE, NOCRED, p); if (ump) { bsd_free(ump->um_e2fs->s_es, M_UFSMNT); bsd_free(ump->um_e2fs, M_UFSMNT); bsd_free(ump, M_UFSMNT); mp->mnt_data = (qaddr_t)0; } return (error); } /* * unmount system call */ static int ext2_unmount(mp, mntflags, p) struct mount *mp; int mntflags; struct proc *p; { register struct ufsmount *ump; register struct ext2_sb_info *fs; int error, flags, ronly, i; flags = 0; if (mntflags & MNT_FORCE) { if (mp->mnt_flag & MNT_ROOTFS) return (EINVAL); flags |= FORCECLOSE; } if ((error = ext2_flushfiles(mp, flags, p)) != 0) return (error); ump = VFSTOUFS(mp); fs = ump->um_e2fs; ronly = fs->s_rd_only; if (ronly == 0) { if (fs->s_wasvalid) fs->s_es->s_state |= EXT2_VALID_FS; ext2_sbupdate(ump, MNT_WAIT); } /* release buffers containing group descriptors */ for(i = 0; i < fs->s_db_per_group; i++) ULCK_BUF(fs->s_group_desc[i]) bsd_free(fs->s_group_desc, M_UFSMNT); /* release cached inode/block bitmaps */ for (i = 0; i < EXT2_MAX_GROUP_LOADED; i++) if (fs->s_inode_bitmap[i]) ULCK_BUF(fs->s_inode_bitmap[i]) for (i = 0; i < EXT2_MAX_GROUP_LOADED; i++) if (fs->s_block_bitmap[i]) ULCK_BUF(fs->s_block_bitmap[i]) ump->um_devvp->v_specmountpoint = NULL; error = VOP_CLOSE(ump->um_devvp, ronly ? FREAD : FREAD|FWRITE, NOCRED, p); vrele(ump->um_devvp); bsd_free(fs->s_es, M_UFSMNT); bsd_free(fs, M_UFSMNT); bsd_free(ump, M_UFSMNT); mp->mnt_data = (qaddr_t)0; mp->mnt_flag &= ~MNT_LOCAL; return (error); } /* * Flush out all the files in a filesystem. 
*/ static int ext2_flushfiles(mp, flags, p) register struct mount *mp; int flags; struct proc *p; { register struct ufsmount *ump; int error; #if QUOTA int i; #endif ump = VFSTOUFS(mp); #if QUOTA if (mp->mnt_flag & MNT_QUOTA) { if ((error = vflush(mp, NULLVP, SKIPSYSTEM|flags)) != 0) return (error); for (i = 0; i < MAXQUOTAS; i++) { if (ump->um_quotas[i] == NULLVP) continue; quotaoff(p, mp, i); } /* * Here we fall through to vflush again to ensure * that we have gotten rid of all the system vnodes. */ } #endif error = vflush(mp, NULLVP, flags); return (error); } /* * Get file system statistics. * taken from ext2/super.c ext2_statfs */ static int ext2_statfs(mp, sbp, p) struct mount *mp; register struct statfs *sbp; struct proc *p; { unsigned long overhead; unsigned long overhead_per_group; register struct ufsmount *ump; register struct ext2_sb_info *fs; register struct ext2_super_block *es; ump = VFSTOUFS(mp); fs = ump->um_e2fs; es = fs->s_es; if (es->s_magic != EXT2_SUPER_MAGIC) panic("ext2_statfs - magic number spoiled"); /* * Compute the overhead (FS structures) */ overhead_per_group = 1 /* super block */ + fs->s_db_per_group + 1 /* block bitmap */ + 1 /* inode bitmap */ + fs->s_itb_per_group; overhead = es->s_first_data_block + fs->s_groups_count * overhead_per_group; sbp->f_bsize = EXT2_FRAG_SIZE(fs); sbp->f_iosize = EXT2_BLOCK_SIZE(fs); sbp->f_blocks = es->s_blocks_count - overhead; sbp->f_bfree = es->s_free_blocks_count; sbp->f_bavail = sbp->f_bfree - es->s_r_blocks_count; sbp->f_files = es->s_inodes_count; sbp->f_ffree = es->s_free_inodes_count; if (sbp != &mp->mnt_stat) { sbp->f_type = mp->mnt_vfc->vfc_typenum; bcopy((caddr_t)mp->mnt_stat.f_mntonname, (caddr_t)&sbp->f_mntonname[0], MNAMELEN); bcopy((caddr_t)mp->mnt_stat.f_mntfromname, (caddr_t)&sbp->f_mntfromname[0], MNAMELEN); } return (0); } /* * Go through the disk queues to initiate sandbagged IO; * go through the inodes to write those that have been modified; * initiate the writing of the super block if it has been modified. * * Note: we are always called with the filesystem marked `MPBUSY'. */ static int ext2_sync(mp, waitfor, cred, p) struct mount *mp; int waitfor; struct ucred *cred; struct proc *p; { struct vnode *nvp, *vp; struct inode *ip; struct ufsmount *ump = VFSTOUFS(mp); struct ext2_sb_info *fs; int error, allerror = 0; fs = ump->um_e2fs; if (fs->s_dirt != 0 && fs->s_rd_only != 0) { /* XXX */ printf("fs = %s\n", fs->fs_fsmnt); panic("ext2_sync: rofs mod"); } /* * Write back each (modified) inode. */ simple_lock(&mntvnode_slock); loop: for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) { /* * If the vnode that we are about to sync is no longer * associated with this mount point, start over. */ if (vp->v_mount != mp) goto loop; simple_lock(&vp->v_interlock); nvp = vp->v_mntvnodes.le_next; ip = VTOI(vp); if (vp->v_type == VNON || ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0 && (TAILQ_EMPTY(&vp->v_dirtyblkhd) || waitfor == MNT_LAZY))) { simple_unlock(&vp->v_interlock); continue; } simple_unlock(&mntvnode_slock); error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, p); if (error) { simple_lock(&mntvnode_slock); if (error == ENOENT) goto loop; continue; } if ((error = VOP_FSYNC(vp, cred, waitfor, p)) != 0) allerror = error; VOP_UNLOCK(vp, 0, p); vrele(vp); simple_lock(&mntvnode_slock); } simple_unlock(&mntvnode_slock); /* * Force stale file system control information to be flushed. 
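/*
 * Illustrative sketch (not from the tree): the statfs overhead arithmetic
 * above, pulled out as a standalone function.  Every input value below is a
 * made-up example, chosen only to show how the per-group metadata (superblock
 * copy, group descriptor blocks, two bitmaps, inode table) is subtracted from
 * the raw block count before it is reported to userland.
 */
#include <stdio.h>

struct e2geom {
    unsigned long first_data_block;   /* 1 for 1 KiB blocks, 0 otherwise */
    unsigned long groups_count;
    unsigned long db_per_group;       /* blocks of group descriptors per group */
    unsigned long itb_per_group;      /* blocks of inode table per group */
    unsigned long blocks_count;
};

static unsigned long e2_usable_blocks(const struct e2geom *g)
{
    unsigned long per_group = 1            /* superblock copy   */
        + g->db_per_group                  /* group descriptors */
        + 1                                /* block bitmap      */
        + 1                                /* inode bitmap      */
        + g->itb_per_group;                /* inode table       */
    unsigned long overhead = g->first_data_block
        + g->groups_count * per_group;
    return g->blocks_count - overhead;
}

int main(void)
{
    /* hypothetical 512 MiB filesystem with 1 KiB blocks and 64 groups */
    struct e2geom g = { 1, 64, 2, 256, 524288 };
    printf("f_blocks = %lu\n", e2_usable_blocks(&g));
    return 0;
}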
*/ if (waitfor != MNT_LAZY) { vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY, p); if ((error = VOP_FSYNC(ump->um_devvp, cred, waitfor, p)) != 0) allerror = error; VOP_UNLOCK(ump->um_devvp, 0, p); } #if QUOTA qsync(mp); #endif /* * Write back modified superblock. */ if (fs->s_dirt != 0) { fs->s_dirt = 0; fs->s_es->s_wtime = time_second; if ((error = ext2_sbupdate(ump, waitfor)) != 0) allerror = error; } return (allerror); } /* * Look up a EXT2FS dinode number to find its incore vnode, otherwise read it * in from disk. If it is in core, wait for the lock bit to clear, then * return the inode locked. Detection and handling of mount points must be * done by the calling routine. */ static int ext2_vget(mp, ino, vpp) struct mount *mp; ino_t ino; struct vnode **vpp; { register struct ext2_sb_info *fs; register struct inode *ip; struct ufsmount *ump; struct buf *bp; struct vnode *vp; dev_t dev; int i, error; int used_blocks; ump = VFSTOUFS(mp); dev = ump->um_dev; restart: if ((*vpp = ufs_ihashget(dev, ino)) != NULL) return (0); /* * Lock out the creation of new entries in the FFS hash table in * case getnewvnode() or MALLOC() blocks, otherwise a duplicate * may occur! */ if (ext2fs_inode_hash_lock) { while (ext2fs_inode_hash_lock) { ext2fs_inode_hash_lock = -1; tsleep(&ext2fs_inode_hash_lock, PVM, "e2vget", 0); } goto restart; } ext2fs_inode_hash_lock = 1; /* * If this MALLOC() is performed after the getnewvnode() * it might block, leaving a vnode with a NULL v_data to be * found by ext2_sync() if a sync happens to fire right then, * which will cause a panic because ext2_sync() blindly * dereferences vp->v_data (as well it should). */ MALLOC(ip, struct inode *, sizeof(struct inode), M_EXT2NODE, M_WAITOK); /* Allocate a new vnode/inode. */ if ((error = getnewvnode(VT_UFS, mp, ext2_vnodeop_p, &vp)) != 0) { if (ext2fs_inode_hash_lock < 0) wakeup(&ext2fs_inode_hash_lock); ext2fs_inode_hash_lock = 0; *vpp = NULL; FREE(ip, M_EXT2NODE); return (error); } bzero((caddr_t)ip, sizeof(struct inode)); lockinit(&ip->i_lock, PINOD, "ext2in", 0, 0); vp->v_data = ip; ip->i_vnode = vp; ip->i_e2fs = fs = ump->um_e2fs; ip->i_dev = dev; ip->i_number = ino; #if QUOTA for (i = 0; i < MAXQUOTAS; i++) ip->i_dquot[i] = NODQUOT; #endif /* * Put it onto its hash chain and lock it so that other requests for * this inode will block if they arrive while we are sleeping waiting * for old data structures to be purged or for the contents of the * disk portion of this inode to be read. */ ufs_ihashins(ip); if (ext2fs_inode_hash_lock < 0) wakeup(&ext2fs_inode_hash_lock); ext2fs_inode_hash_lock = 0; /* Read in the disk contents for the inode, copy into the inode. */ #if 0 printf("ext2_vget(%d) dbn= %d ", ino, fsbtodb(fs, ino_to_fsba(fs, ino))); #endif if ((error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ino)), (int)fs->s_blocksize, NOCRED, &bp)) != 0) { /* * The inode does not contain anything useful, so it would * be misleading to leave it on its hash chain. With mode * still zero, it will be unlinked and returned to the free * list by vput(). 
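/*
 * Illustrative sketch (not from the tree): the ino_to_fsba()/ino_to_fsbo()
 * calls in the vget path above boil down to the usual ext2 arithmetic --
 * derive the group from the inode number, then index into that group's inode
 * table.  The struct and helper below are hypothetical stand-ins for the real
 * macros, not their definitions.
 */
#include <stdint.h>

struct e2layout {
    uint32_t inodes_per_group;
    uint32_t inode_size;              /* 128 bytes in classic ext2 */
    uint32_t block_size;
    const uint32_t *inode_table_start; /* per-group first block of the inode table */
};

/* Locate inode `ino` (1-based): which filesystem block, and which slot in it. */
static void locate_inode(const struct e2layout *l, uint32_t ino,
    uint32_t *blockp, uint32_t *slotp)
{
    uint32_t group = (ino - 1) / l->inodes_per_group;
    uint32_t index = (ino - 1) % l->inodes_per_group;
    uint32_t per_block = l->block_size / l->inode_size;

    *blockp = l->inode_table_start[group] + index / per_block;
    *slotp  = index % per_block;   /* multiply by inode_size for a byte offset */
}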
*/ vput(vp); brelse(bp); *vpp = NULL; return (error); } /* convert ext2 inode to dinode */ ext2_ei2di((struct ext2_inode *) ((char *)bp->b_data + EXT2_INODE_SIZE * ino_to_fsbo(fs, ino)), &ip->i_din); ip->i_block_group = ino_to_cg(fs, ino); ip->i_next_alloc_block = 0; ip->i_next_alloc_goal = 0; ip->i_prealloc_count = 0; ip->i_prealloc_block = 0; /* now we want to make sure that block pointers for unused blocks are zeroed out - ext2_balloc depends on this although for regular files and directories only */ if(S_ISDIR(ip->i_mode) || S_ISREG(ip->i_mode)) { used_blocks = (ip->i_size+fs->s_blocksize-1) / fs->s_blocksize; for(i = used_blocks; i < EXT2_NDIR_BLOCKS; i++) ip->i_db[i] = 0; } /* ext2_print_inode(ip); */ brelse(bp); /* * Initialize the vnode from the inode, check for aliases. * Note that the underlying vnode may have changed. */ if ((error = ufs_vinit(mp, ext2_specop_p, ext2_fifoop_p, &vp)) != 0) { vput(vp); *vpp = NULL; return (error); } /* * Finish inode initialization now that aliasing has been resolved. */ ip->i_devvp = ump->um_devvp; VREF(ip->i_devvp); /* * Set up a generation number for this inode if it does not * already have one. This should only happen on old filesystems. */ if (ip->i_gen == 0) { ip->i_gen = random() / 2 + 1; if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) ip->i_flag |= IN_MODIFIED; } *vpp = vp; return (0); } /* * File handle to vnode * * Have to be really careful about stale file handles: * - check that the inode number is valid * - call ext2_vget() to get the locked inode * - check for an unallocated inode (i_mode == 0) * - check that the given client host has export rights and return * those rights via. exflagsp and credanonp */ static int ext2_fhtovp(mp, fhp, nam, vpp, exflagsp, credanonp) register struct mount *mp; struct fid *fhp; struct sockaddr *nam; struct vnode **vpp; int *exflagsp; struct ucred **credanonp; { register struct ufid *ufhp; struct ext2_sb_info *fs; ufhp = (struct ufid *)fhp; fs = VFSTOUFS(mp)->um_e2fs; if (ufhp->ufid_ino < ROOTINO || ufhp->ufid_ino >= fs->s_groups_count * fs->s_es->s_inodes_per_group) return (ESTALE); return (ufs_check_export(mp, ufhp, nam, vpp, exflagsp, credanonp)); } /* * Vnode pointer to File handle */ /* ARGSUSED */ static int ext2_vptofh(vp, fhp) struct vnode *vp; struct fid *fhp; { register struct inode *ip; register struct ufid *ufhp; ip = VTOI(vp); ufhp = (struct ufid *)fhp; ufhp->ufid_len = sizeof(struct ufid); ufhp->ufid_ino = ip->i_number; ufhp->ufid_gen = ip->i_gen; return (0); } /* * Write a superblock and associated information back to disk. */ static int ext2_sbupdate(mp, waitfor) struct ufsmount *mp; int waitfor; { register struct ext2_sb_info *fs = mp->um_e2fs; register struct ext2_super_block *es = fs->s_es; register struct buf *bp; int error = 0; /* printf("\nupdating superblock, waitfor=%s\n", waitfor == MNT_WAIT ? "yes":"no"); */ bp = getblk(mp->um_devvp, SBLOCK, SBSIZE, 0, 0); bcopy((caddr_t)es, bp->b_data, (u_int)sizeof(struct ext2_super_block)); if (waitfor == MNT_WAIT) error = bwrite(bp); else bawrite(bp); /* * The buffers for group descriptors, inode bitmaps and block bitmaps * are not busy at this point and are (hopefully) written by the * usual sync mechanism. 
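/*
 * Illustrative sketch (not from the tree): the fhtovp/vptofh pair above keeps
 * NFS file handles honest by (a) range-checking the inode number against the
 * filesystem geometry and (b) carrying a generation number so a handle for a
 * deleted-and-reused inode goes stale.  A generic standalone rendition of
 * that idea, with invented names, looks like this.
 */
#include <stdint.h>
#include <string.h>

struct myfid {                  /* hypothetical stand-in for struct ufid */
    uint32_t fid_ino;
    uint32_t fid_gen;
};

struct fsinfo {
    uint32_t root_ino;          /* smallest valid inode number */
    uint32_t max_ino;           /* groups_count * inodes_per_group */
};

static void fid_pack(struct myfid *f, uint32_t ino, uint32_t gen)
{
    memset(f, 0, sizeof(*f));   /* handles may be copied out; avoid stack garbage */
    f->fid_ino = ino;
    f->fid_gen = gen;
}

/* Return 0 if the handle could still be valid, -1 (caller maps to ESTALE) otherwise. */
static int fid_check(const struct fsinfo *fs, const struct myfid *f,
    uint32_t current_gen_of_inode)
{
    if (f->fid_ino < fs->root_ino || f->fid_ino >= fs->max_ino)
        return -1;
    if (f->fid_gen != current_gen_of_inode)
        return -1;
    return 0;
}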
No need to write them here */ return (error); } Index: head/sys/isofs/cd9660/cd9660_vfsops.c =================================================================== --- head/sys/isofs/cd9660/cd9660_vfsops.c (revision 49534) +++ head/sys/isofs/cd9660/cd9660_vfsops.c (revision 49535) @@ -1,956 +1,955 @@ /*- * Copyright (c) 1994 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley * by Pace Willisson (pace@blitz.com). The Rock Ridge Extension * Support code is derived from software contributed to Berkeley * by Atsushi Murai (amurai@spec.co.jp). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)cd9660_vfsops.c 8.18 (Berkeley) 5/22/95 - * $Id: cd9660_vfsops.c,v 1.55 1999/05/08 06:39:32 phk Exp $ + * $Id: cd9660_vfsops.c,v 1.56 1999/05/31 11:27:21 phk Exp $ */ #include #include #include #include #include #include -#include #include #include #include #include #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_ISOFSMNT, "ISOFS mount", "ISOFS mount structure"); MALLOC_DEFINE(M_ISOFSNODE, "ISOFS node", "ISOFS vnode private part"); static int cd9660_mount __P((struct mount *, char *, caddr_t, struct nameidata *, struct proc *)); static int cd9660_start __P((struct mount *, int, struct proc *)); static int cd9660_unmount __P((struct mount *, int, struct proc *)); static int cd9660_root __P((struct mount *, struct vnode **)); static int cd9660_quotactl __P((struct mount *, int, uid_t, caddr_t, struct proc *)); static int cd9660_statfs __P((struct mount *, struct statfs *, struct proc *)); static int cd9660_sync __P((struct mount *, int, struct ucred *, struct proc *)); static int cd9660_vget __P((struct mount *, ino_t, struct vnode **)); static int cd9660_fhtovp __P((struct mount *, struct fid *, struct sockaddr *, struct vnode **, int *, struct ucred **)); static int cd9660_vptofh __P((struct vnode *, struct fid *)); static struct vfsops cd9660_vfsops = { cd9660_mount, cd9660_start, cd9660_unmount, cd9660_root, cd9660_quotactl, cd9660_statfs, cd9660_sync, cd9660_vget, cd9660_fhtovp, cd9660_vptofh, cd9660_init }; VFS_SET(cd9660_vfsops, cd9660, VFCF_READONLY); /* * Called by vfs_mountroot when iso is going to be mounted as root. */ static int iso_get_ssector __P((dev_t dev, struct proc *p)); static int iso_mountfs __P((struct vnode *devvp, struct mount *mp, struct proc *p, struct iso_args *argp)); /* * Try to find the start of the last data track on this CD-ROM. This * is used to mount the last session of a multi-session CD. Bail out * and return 0 if we fail, this is always a safe bet. */ static int iso_get_ssector(dev, p) dev_t dev; struct proc *p; { struct ioc_toc_header h; struct ioc_read_toc_single_entry t; int i; struct cdevsw *bd; d_ioctl_t *ioctlp; bd = bdevsw(dev); ioctlp = bd->d_ioctl; if (ioctlp == NULL) return 0; if (ioctlp(dev, CDIOREADTOCHEADER, (caddr_t)&h, FREAD, p) != 0) return 0; for (i = h.ending_track; i >= 0; i--) { t.address_format = CD_LBA_FORMAT; t.track = i; if (ioctlp(dev, CDIOREADTOCENTRY, (caddr_t)&t, FREAD, p) != 0) return 0; if ((t.entry.control & 4) != 0) /* found a data track */ break; } if (i < 0) return 0; return ntohl(t.entry.addr.lba); } static int iso_mountroot __P((struct mount *mp, struct proc *p)); static int iso_mountroot(mp, p) struct mount *mp; struct proc *p; { struct iso_args args; int error; if ((error = bdevvp(rootdev, &rootvp))) { printf("iso_mountroot: can't find rootvp"); return (error); } args.flags = ISOFSMNT_ROOT; args.ssector = iso_get_ssector(rootdev, p); if (bootverbose) printf("iso_mountroot(): using session at block %d\n", args.ssector); if ((error = iso_mountfs(rootvp, mp, p, &args)) != 0) return (error); (void)cd9660_statfs(mp, &mp->mnt_stat, p); return (0); } /* * VFS Operations. 
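/*
 * Illustrative sketch (not from the tree): iso_get_ssector() above walks the
 * CD table of contents backwards from the last track and picks the first one
 * whose control bits mark it as a data track (bit 2, value 4), so that a
 * multi-session disc is mounted at its most recent session.  The same scan
 * over an already-fetched TOC array, with invented types:
 */
#include <stdint.h>

struct toc_entry {
    uint8_t  control;           /* bit 2 set => data track */
    uint32_t lba;               /* start of the track, already host-endian */
};

/* Return the start LBA of the last data track, or 0 if none is found. */
static uint32_t last_data_session(const struct toc_entry *toc, int ntracks)
{
    int i;

    for (i = ntracks - 1; i >= 0; i--)
        if (toc[i].control & 4)
            return toc[i].lba;
    return 0;                   /* 0 is always a safe fallback, as the code notes */
}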
* * mount system call */ static int cd9660_mount(mp, path, data, ndp, p) register struct mount *mp; char *path; caddr_t data; struct nameidata *ndp; struct proc *p; { struct vnode *devvp; struct iso_args args; size_t size; int error; mode_t accessmode; struct iso_mnt *imp = 0; if ((mp->mnt_flag & MNT_ROOTFS) != 0) { if (bdevsw(rootdev)->d_flags & D_NOCLUSTERR) mp->mnt_flag |= MNT_NOCLUSTERR; return (iso_mountroot(mp, p)); } if ((error = copyin(data, (caddr_t)&args, sizeof (struct iso_args)))) return (error); if ((mp->mnt_flag & MNT_RDONLY) == 0) return (EROFS); /* * If updating, check whether changing from read-only to * read/write; if there is no device name, that's all we do. * Disallow clearing MNT_NOCLUSTERR flag, if block device requests. */ if (mp->mnt_flag & MNT_UPDATE) { imp = VFSTOISOFS(mp); if (bdevsw(imp->im_devvp->v_rdev)->d_flags & D_NOCLUSTERR) mp->mnt_flag |= MNT_NOCLUSTERR; if (args.fspec == 0) return (vfs_export(mp, &imp->im_export, &args.export)); } /* * Not an update, or updating the name: look up the name * and verify that it refers to a sensible block device. */ NDINIT(ndp, LOOKUP, FOLLOW, UIO_USERSPACE, args.fspec, p); if ((error = namei(ndp))) return (error); devvp = ndp->ni_vp; if (devvp->v_type != VBLK) { vrele(devvp); return ENOTBLK; } if (bdevsw(devvp->v_rdev) == NULL) { vrele(devvp); return ENXIO; } /* * Verify that user has necessary permissions on the device, * or has superuser abilities */ accessmode = VREAD; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, p); error = VOP_ACCESS(devvp, accessmode, p->p_ucred, p); if (error) error = suser(p); if (error) { vput(devvp); return (error); } VOP_UNLOCK(devvp, 0, p); if ((mp->mnt_flag & MNT_UPDATE) == 0) { if (bdevsw(devvp->v_rdev)->d_flags & D_NOCLUSTERR) mp->mnt_flag |= MNT_NOCLUSTERR; error = iso_mountfs(devvp, mp, p, &args); } else { if (devvp != imp->im_devvp) error = EINVAL; /* needs translation */ else vrele(devvp); } if (error) { vrele(devvp); return error; } imp = VFSTOISOFS(mp); (void) copyinstr(path, mp->mnt_stat.f_mntonname, MNAMELEN - 1, &size); bzero(mp->mnt_stat.f_mntonname + size, MNAMELEN - size); (void) copyinstr(args.fspec, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, &size); bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size); (void) cd9660_statfs(mp, &mp->mnt_stat, p); return 0; } /* * Common code for mount and mountroot */ static int iso_mountfs(devvp, mp, p, argp) register struct vnode *devvp; struct mount *mp; struct proc *p; struct iso_args *argp; { register struct iso_mnt *isomp = (struct iso_mnt *)0; struct buf *bp = NULL; struct buf *pribp = NULL, *supbp = NULL; dev_t dev = devvp->v_rdev; int error = EINVAL; int needclose = 0; int high_sierra = 0; int iso_bsize; int iso_blknum; int joliet_level; struct iso_volume_descriptor *vdp = 0; struct iso_primary_descriptor *pri = NULL; struct iso_sierra_primary_descriptor *pri_sierra = NULL; struct iso_supplementary_descriptor *sup = NULL; struct iso_directory_record *rootp; int logical_block_size; if (!(mp->mnt_flag & MNT_RDONLY)) return EROFS; /* * Disallow multiple mounts of the same device. * Disallow mounting of a device that is currently in use * (except for root, which might share swap device for miniroot). * Flush out any old buffers remaining from a previous use. 
*/ if ((error = vfs_mountedon(devvp))) return error; if (vcount(devvp) > 1 && devvp != rootvp) return EBUSY; if ((error = vinvalbuf(devvp, V_SAVE, p->p_ucred, p, 0, 0))) return (error); if ((error = VOP_OPEN(devvp, FREAD, FSCRED, p))) return error; needclose = 1; /* This is the "logical sector size". The standard says this * should be 2048 or the physical sector size on the device, * whichever is greater. For now, we'll just use a constant. */ iso_bsize = ISO_DEFAULT_BLOCK_SIZE; joliet_level = 0; for (iso_blknum = 16 + argp->ssector; iso_blknum < 100 + argp->ssector; iso_blknum++) { if ((error = bread(devvp, iso_blknum * btodb(iso_bsize), iso_bsize, NOCRED, &bp)) != 0) goto out; vdp = (struct iso_volume_descriptor *)bp->b_data; if (bcmp (vdp->id, ISO_STANDARD_ID, sizeof vdp->id) != 0) { if (bcmp (vdp->id_sierra, ISO_SIERRA_ID, sizeof vdp->id) != 0) { error = EINVAL; goto out; } else high_sierra = 1; } switch (isonum_711 (high_sierra? vdp->type_sierra: vdp->type)){ case ISO_VD_PRIMARY: if (pribp == NULL) { pribp = bp; bp = NULL; pri = (struct iso_primary_descriptor *)vdp; pri_sierra = (struct iso_sierra_primary_descriptor *)vdp; } break; case ISO_VD_SUPPLEMENTARY: if (supbp == NULL) { supbp = bp; bp = NULL; sup = (struct iso_supplementary_descriptor *)vdp; if (!(argp->flags & ISOFSMNT_NOJOLIET)) { if (bcmp(sup->escape, "%/@", 3) == 0) joliet_level = 1; if (bcmp(sup->escape, "%/C", 3) == 0) joliet_level = 2; if (bcmp(sup->escape, "%/E", 3) == 0) joliet_level = 3; if (isonum_711 (sup->flags) & 1) joliet_level = 0; } } break; case ISO_VD_END: goto vd_end; default: break; } if (bp) { brelse(bp); bp = NULL; } } vd_end: if (bp) { brelse(bp); bp = NULL; } if (pri == NULL) { error = EINVAL; goto out; } logical_block_size = isonum_723 (high_sierra? pri_sierra->logical_block_size: pri->logical_block_size); if (logical_block_size < DEV_BSIZE || logical_block_size > MAXBSIZE || (logical_block_size & (logical_block_size - 1)) != 0) { error = EINVAL; goto out; } rootp = (struct iso_directory_record *) (high_sierra? pri_sierra->root_directory_record: pri->root_directory_record); isomp = malloc(sizeof *isomp, M_ISOFSMNT, M_WAITOK); bzero((caddr_t)isomp, sizeof *isomp); isomp->logical_block_size = logical_block_size; isomp->volume_space_size = isonum_733 (high_sierra? pri_sierra->volume_space_size: pri->volume_space_size); isomp->joliet_level = 0; /* * Since an ISO9660 multi-session CD can also access previous * sessions, we have to include them into the space consider- * ations. This doesn't yield a very accurate number since * parts of the old sessions might be inaccessible now, but we * can't do much better. This is also important for the NFS * filehandle validation. 
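/*
 * Illustrative sketch (not from the tree): two small decisions from the
 * volume-descriptor scan above, as standalone helpers.  The block-size test
 * is the usual power-of-two-in-range check, and the Joliet level comes from
 * the escape sequence in the supplementary descriptor ("%/@", "%/C", "%/E"
 * for UCS-2 levels 1-3).  The bounds below are example values, not the
 * kernel's DEV_BSIZE/MAXBSIZE.
 */
#include <string.h>

static int blocksize_ok(unsigned int bs)
{
    const unsigned int lo = 512, hi = 65536;   /* stand-ins for DEV_BSIZE/MAXBSIZE */
    return bs >= lo && bs <= hi && (bs & (bs - 1)) == 0;
}

static int joliet_level(const char escape[3])
{
    if (memcmp(escape, "%/@", 3) == 0) return 1;
    if (memcmp(escape, "%/C", 3) == 0) return 2;
    if (memcmp(escape, "%/E", 3) == 0) return 3;
    return 0;                                  /* not a Joliet descriptor */
}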
*/ isomp->volume_space_size += argp->ssector; bcopy (rootp, isomp->root, sizeof isomp->root); isomp->root_extent = isonum_733 (rootp->extent); isomp->root_size = isonum_733 (rootp->size); isomp->im_bmask = logical_block_size - 1; isomp->im_bshift = ffs(logical_block_size) - 1; pribp->b_flags |= B_AGE; brelse(pribp); pribp = NULL; mp->mnt_data = (qaddr_t)isomp; mp->mnt_stat.f_fsid.val[0] = (long)dev; mp->mnt_stat.f_fsid.val[1] = mp->mnt_vfc->vfc_typenum; mp->mnt_maxsymlinklen = 0; mp->mnt_flag |= MNT_LOCAL; isomp->im_mountp = mp; isomp->im_dev = dev; isomp->im_devvp = devvp; devvp->v_specmountpoint = mp; /* Check the Rock Ridge Extention support */ if (!(argp->flags & ISOFSMNT_NORRIP)) { if ((error = bread(isomp->im_devvp, (isomp->root_extent + isonum_711(rootp->ext_attr_length)) << (isomp->im_bshift - DEV_BSHIFT), isomp->logical_block_size, NOCRED, &bp)) != 0) goto out; rootp = (struct iso_directory_record *)bp->b_data; if ((isomp->rr_skip = cd9660_rrip_offset(rootp,isomp)) < 0) { argp->flags |= ISOFSMNT_NORRIP; } else { argp->flags &= ~ISOFSMNT_GENS; } /* * The contents are valid, * but they will get reread as part of another vnode, so... */ bp->b_flags |= B_AGE; brelse(bp); bp = NULL; } isomp->im_flags = argp->flags & (ISOFSMNT_NORRIP | ISOFSMNT_GENS | ISOFSMNT_EXTATT | ISOFSMNT_NOJOLIET); if (high_sierra) { /* this effectively ignores all the mount flags */ log(LOG_INFO, "cd9660: High Sierra Format\n"); isomp->iso_ftype = ISO_FTYPE_HIGH_SIERRA; } else switch (isomp->im_flags&(ISOFSMNT_NORRIP|ISOFSMNT_GENS)) { default: isomp->iso_ftype = ISO_FTYPE_DEFAULT; break; case ISOFSMNT_GENS|ISOFSMNT_NORRIP: isomp->iso_ftype = ISO_FTYPE_9660; break; case 0: log(LOG_INFO, "cd9660: RockRidge Extension\n"); isomp->iso_ftype = ISO_FTYPE_RRIP; break; } /* Decide whether to use the Joliet descriptor */ if (isomp->iso_ftype != ISO_FTYPE_RRIP && joliet_level) { log(LOG_INFO, "cd9660: Joliet Extension\n"); rootp = (struct iso_directory_record *) sup->root_directory_record; bcopy (rootp, isomp->root, sizeof isomp->root); isomp->root_extent = isonum_733 (rootp->extent); isomp->root_size = isonum_733 (rootp->size); isomp->joliet_level = joliet_level; supbp->b_flags |= B_AGE; } if (supbp) { brelse(supbp); supbp = NULL; } return 0; out: devvp->v_specmountpoint = NULL; if (bp) brelse(bp); if (pribp) brelse(pribp); if (supbp) brelse(supbp); if (needclose) (void)VOP_CLOSE(devvp, FREAD, NOCRED, p); if (isomp) { free((caddr_t)isomp, M_ISOFSMNT); mp->mnt_data = (qaddr_t)0; } return error; } /* * Make a filesystem operational. * Nothing to do at the moment. 
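/*
 * Illustrative sketch (not from the tree): im_bmask/im_bshift above turn the
 * (power-of-two) logical block size into a mask and a shift, so the block
 * number and in-block offset become a shift and an AND instead of a divide
 * and a modulo.  A standalone demonstration of the same arithmetic:
 */
#include <assert.h>
#include <strings.h>            /* ffs() */

static void split_offset(unsigned long off, unsigned int blocksize,
    unsigned long *lbn, unsigned long *inblk)
{
    unsigned int bshift = ffs((int)blocksize) - 1;  /* log2 for powers of two */
    unsigned long bmask = blocksize - 1;

    *lbn   = off >> bshift;     /* same as off / blocksize */
    *inblk = off & bmask;       /* same as off % blocksize */
    assert(*lbn == off / blocksize && *inblk == off % blocksize);
}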
*/ /* ARGSUSED */ static int cd9660_start(mp, flags, p) struct mount *mp; int flags; struct proc *p; { return 0; } /* * unmount system call */ static int cd9660_unmount(mp, mntflags, p) struct mount *mp; int mntflags; struct proc *p; { register struct iso_mnt *isomp; int error, flags = 0; if (mntflags & MNT_FORCE) flags |= FORCECLOSE; #if 0 mntflushbuf(mp, 0); if (mntinvalbuf(mp)) return EBUSY; #endif if ((error = vflush(mp, NULLVP, flags))) return (error); isomp = VFSTOISOFS(mp); isomp->im_devvp->v_specmountpoint = NULL; error = VOP_CLOSE(isomp->im_devvp, FREAD, NOCRED, p); vrele(isomp->im_devvp); free((caddr_t)isomp, M_ISOFSMNT); mp->mnt_data = (qaddr_t)0; mp->mnt_flag &= ~MNT_LOCAL; return (error); } /* * Return root of a filesystem */ static int cd9660_root(mp, vpp) struct mount *mp; struct vnode **vpp; { struct iso_mnt *imp = VFSTOISOFS(mp); struct iso_directory_record *dp = (struct iso_directory_record *)imp->root; ino_t ino = isodirino(dp, imp); /* * With RRIP we must use the `.' entry of the root directory. * Simply tell vget, that it's a relocated directory. */ return (cd9660_vget_internal(mp, ino, vpp, imp->iso_ftype == ISO_FTYPE_RRIP, dp)); } /* * Do operations associated with quotas, not supported */ /* ARGSUSED */ static int cd9660_quotactl(mp, cmd, uid, arg, p) struct mount *mp; int cmd; uid_t uid; caddr_t arg; struct proc *p; { return (EOPNOTSUPP); } /* * Get file system statistics. */ int cd9660_statfs(mp, sbp, p) struct mount *mp; register struct statfs *sbp; struct proc *p; { register struct iso_mnt *isomp; isomp = VFSTOISOFS(mp); sbp->f_bsize = isomp->logical_block_size; sbp->f_iosize = sbp->f_bsize; /* XXX */ sbp->f_blocks = isomp->volume_space_size; sbp->f_bfree = 0; /* total free blocks */ sbp->f_bavail = 0; /* blocks free for non superuser */ sbp->f_files = 0; /* total files */ sbp->f_ffree = 0; /* free file nodes */ if (sbp != &mp->mnt_stat) { sbp->f_type = mp->mnt_vfc->vfc_typenum; bcopy(mp->mnt_stat.f_mntonname, sbp->f_mntonname, MNAMELEN); bcopy(mp->mnt_stat.f_mntfromname, sbp->f_mntfromname, MNAMELEN); } return 0; } /* ARGSUSED */ static int cd9660_sync(mp, waitfor, cred, p) struct mount *mp; int waitfor; struct ucred *cred; struct proc *p; { return (0); } /* * File handle to vnode * * Have to be really careful about stale file handles: * - check that the inode number is in range * - call iget() to get the locked inode * - check for an unallocated inode (i_mode == 0) * - check that the generation number matches */ struct ifid { ushort ifid_len; ushort ifid_pad; int ifid_ino; long ifid_start; }; /* ARGSUSED */ int cd9660_fhtovp(mp, fhp, nam, vpp, exflagsp, credanonp) register struct mount *mp; struct fid *fhp; struct sockaddr *nam; struct vnode **vpp; int *exflagsp; struct ucred **credanonp; { struct ifid *ifhp = (struct ifid *)fhp; register struct iso_node *ip; register struct netcred *np; register struct iso_mnt *imp = VFSTOISOFS(mp); struct vnode *nvp; int error; #ifdef ISOFS_DBG printf("fhtovp: ino %d, start %ld\n", ifhp->ifid_ino, ifhp->ifid_start); #endif /* * Get the export permission structure for this tuple. 
*/ np = vfs_export_lookup(mp, &imp->im_export, nam); if (np == NULL) return (EACCES); if ((error = VFS_VGET(mp, ifhp->ifid_ino, &nvp)) != 0) { *vpp = NULLVP; return (error); } ip = VTOI(nvp); if (ip->inode.iso_mode == 0) { vput(nvp); *vpp = NULLVP; return (ESTALE); } *vpp = nvp; *exflagsp = np->netc_exflags; *credanonp = &np->netc_anon; return (0); } int cd9660_vget(mp, ino, vpp) struct mount *mp; ino_t ino; struct vnode **vpp; { /* * XXXX * It would be nice if we didn't always set the `relocated' flag * and force the extra read, but I don't want to think about fixing * that right now. */ return (cd9660_vget_internal(mp, ino, vpp, #if 0 VFSTOISOFS(mp)->iso_ftype == ISO_FTYPE_RRIP, #else 0, #endif (struct iso_directory_record *)0)); } int cd9660_vget_internal(mp, ino, vpp, relocated, isodir) struct mount *mp; ino_t ino; struct vnode **vpp; int relocated; struct iso_directory_record *isodir; { struct iso_mnt *imp; struct iso_node *ip; struct buf *bp; struct vnode *vp, *nvp; dev_t dev; int error; imp = VFSTOISOFS(mp); dev = imp->im_dev; if ((*vpp = cd9660_ihashget(dev, ino)) != NULLVP) return (0); /* Allocate a new vnode/iso_node. */ if ((error = getnewvnode(VT_ISOFS, mp, cd9660_vnodeop_p, &vp)) != 0) { *vpp = NULLVP; return (error); } MALLOC(ip, struct iso_node *, sizeof(struct iso_node), M_ISOFSNODE, M_WAITOK); bzero((caddr_t)ip, sizeof(struct iso_node)); lockinit(&ip->i_lock, PINOD, "isonode", 0, 0); vp->v_data = ip; ip->i_vnode = vp; ip->i_dev = dev; ip->i_number = ino; /* * Put it onto its hash chain and lock it so that other requests for * this inode will block if they arrive while we are sleeping waiting * for old data structures to be purged or for the contents of the * disk portion of this inode to be read. */ cd9660_ihashins(ip); if (isodir == 0) { int lbn, off; lbn = lblkno(imp, ino); if (lbn >= imp->volume_space_size) { vput(vp); printf("fhtovp: lbn exceed volume space %d\n", lbn); return (ESTALE); } off = blkoff(imp, ino); if (off + ISO_DIRECTORY_RECORD_SIZE > imp->logical_block_size) { vput(vp); printf("fhtovp: crosses block boundary %d\n", off + ISO_DIRECTORY_RECORD_SIZE); return (ESTALE); } error = bread(imp->im_devvp, lbn << (imp->im_bshift - DEV_BSHIFT), imp->logical_block_size, NOCRED, &bp); if (error) { vput(vp); brelse(bp); printf("fhtovp: bread error %d\n",error); return (error); } isodir = (struct iso_directory_record *)(bp->b_data + off); if (off + isonum_711(isodir->length) > imp->logical_block_size) { vput(vp); if (bp != 0) brelse(bp); printf("fhtovp: directory crosses block boundary %d[off=%d/len=%d]\n", off +isonum_711(isodir->length), off, isonum_711(isodir->length)); return (ESTALE); } #if 0 if (isonum_733(isodir->extent) + isonum_711(isodir->ext_attr_length) != ifhp->ifid_start) { if (bp != 0) brelse(bp); printf("fhtovp: file start miss %d vs %d\n", isonum_733(isodir->extent) + isonum_711(isodir->ext_attr_length), ifhp->ifid_start); return (ESTALE); } #endif } else bp = 0; ip->i_mnt = imp; ip->i_devvp = imp->im_devvp; VREF(ip->i_devvp); if (relocated) { /* * On relocated directories we must * read the `.' entry out of a dir. 
*/ ip->iso_start = ino >> imp->im_bshift; if (bp != 0) brelse(bp); if ((error = cd9660_blkatoff(vp, (off_t)0, NULL, &bp)) != 0) { vput(vp); return (error); } isodir = (struct iso_directory_record *)bp->b_data; } ip->iso_extent = isonum_733(isodir->extent); ip->i_size = isonum_733(isodir->size); ip->iso_start = isonum_711(isodir->ext_attr_length) + ip->iso_extent; /* * Setup time stamp, attribute */ vp->v_type = VNON; switch (imp->iso_ftype) { default: /* ISO_FTYPE_9660 */ { struct buf *bp2; int off; if ((imp->im_flags & ISOFSMNT_EXTATT) && (off = isonum_711(isodir->ext_attr_length))) cd9660_blkatoff(vp, (off_t)-(off << imp->im_bshift), NULL, &bp2); else bp2 = NULL; cd9660_defattr(isodir, ip, bp2, ISO_FTYPE_9660); cd9660_deftstamp(isodir, ip, bp2, ISO_FTYPE_9660); if (bp2) brelse(bp2); break; } case ISO_FTYPE_RRIP: cd9660_rrip_analyze(isodir, ip, imp); break; } if (bp != 0) brelse(bp); /* * Initialize the associated vnode */ switch (vp->v_type = IFTOVT(ip->inode.iso_mode)) { case VFIFO: vp->v_op = cd9660_fifoop_p; break; case VCHR: case VBLK: /* * if device, look at device number table for translation */ vp->v_op = cd9660_specop_p; if ((nvp = checkalias(vp, ip->inode.iso_rdev, mp)) != NULL) { /* * Discard unneeded vnode, but save its iso_node. * Note that the lock is carried over in the iso_node * to the replacement vnode. */ nvp->v_data = vp->v_data; vp->v_data = NULL; vp->v_op = spec_vnodeop_p; vrele(vp); vgone(vp); /* * Reinitialize aliased inode. */ vp = nvp; ip->i_vnode = vp; } break; default: break; } if (ip->iso_extent == imp->root_extent) vp->v_flag |= VROOT; /* * XXX need generation number? */ *vpp = vp; return (0); } /* * Vnode pointer to File handle */ /* ARGSUSED */ int cd9660_vptofh(vp, fhp) struct vnode *vp; struct fid *fhp; { register struct iso_node *ip = VTOI(vp); register struct ifid *ifhp; ifhp = (struct ifid *)fhp; ifhp->ifid_len = sizeof(struct ifid); ifhp->ifid_ino = ip->i_number; ifhp->ifid_start = ip->iso_start; #ifdef ISOFS_DBG printf("vptofh: ino %d, start %ld\n", ifhp->ifid_ino,ifhp->ifid_start); #endif return 0; } Index: head/sys/isofs/cd9660/cd9660_vnops.c =================================================================== --- head/sys/isofs/cd9660/cd9660_vnops.c (revision 49534) +++ head/sys/isofs/cd9660/cd9660_vnops.c (revision 49535) @@ -1,917 +1,917 @@ /*- * Copyright (c) 1994 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley * by Pace Willisson (pace@blitz.com). The Rock Ridge Extension * Support code is derived from software contributed to Berkeley * by Atsushi Murai (amurai@spec.co.jp). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)cd9660_vnops.c 8.19 (Berkeley) 5/27/95 - * $Id: cd9660_vnops.c,v 1.55 1999/04/18 10:58:02 dcs Exp $ + * $Id: cd9660_vnops.c,v 1.56 1999/05/11 19:54:25 phk Exp $ */ #include #include #include #include #include #include #include #include -#include #include #include #include #include +#include #include #include #include #include #include #include static int cd9660_setattr __P((struct vop_setattr_args *)); static int cd9660_access __P((struct vop_access_args *)); static int cd9660_getattr __P((struct vop_getattr_args *)); static int cd9660_pathconf __P((struct vop_pathconf_args *)); static int cd9660_read __P((struct vop_read_args *)); struct isoreaddir; static int iso_uiodir __P((struct isoreaddir *idp, struct dirent *dp, off_t off)); static int iso_shipdir __P((struct isoreaddir *idp)); static int cd9660_readdir __P((struct vop_readdir_args *)); static int cd9660_readlink __P((struct vop_readlink_args *ap)); static int cd9660_abortop __P((struct vop_abortop_args *)); static int cd9660_strategy __P((struct vop_strategy_args *)); static int cd9660_print __P((struct vop_print_args *)); static int cd9660_getpages __P((struct vop_getpages_args *)); static int cd9660_putpages __P((struct vop_putpages_args *)); /* * Setattr call. Only allowed for block and character special devices. */ int cd9660_setattr(ap) struct vop_setattr_args /* { struct vnodeop_desc *a_desc; struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct proc *a_p; } */ *ap; { struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; if (vap->va_flags != (u_long)VNOVAL || vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL || vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL || vap->va_mode != (mode_t)VNOVAL) return (EROFS); if (vap->va_size != (u_quad_t)VNOVAL) { switch (vp->v_type) { case VDIR: return (EISDIR); case VLNK: case VREG: return (EROFS); case VCHR: case VBLK: case VSOCK: case VFIFO: case VNON: case VBAD: return (0); } } return (0); } /* * Check mode permission on inode pointer. Mode is READ, WRITE or EXEC. * The mode is shifted to select the owner/group/other fields. The * super user is granted all permissions. */ /* ARGSUSED */ static int cd9660_access(ap) struct vop_access_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct proc *a_p; } */ *ap; { struct vnode *vp = ap->a_vp; struct iso_node *ip = VTOI(vp); struct ucred *cred = ap->a_cred; mode_t mask, mode = ap->a_mode; gid_t *gp; int i; /* * Disallow write attempts unless the file is a socket, * fifo, or a block or character device resident on the * file system. 
*/ if (mode & VWRITE) { switch (vp->v_type) { case VDIR: case VLNK: case VREG: return (EROFS); /* NOT REACHED */ default: break; } } /* User id 0 always gets access. */ if (cred->cr_uid == 0) return (0); mask = 0; /* Otherwise, check the owner. */ if (cred->cr_uid == ip->inode.iso_uid) { if (mode & VEXEC) mask |= S_IXUSR; if (mode & VREAD) mask |= S_IRUSR; if (mode & VWRITE) mask |= S_IWUSR; return ((ip->inode.iso_mode & mask) == mask ? 0 : EACCES); } /* Otherwise, check the groups. */ for (i = 0, gp = cred->cr_groups; i < cred->cr_ngroups; i++, gp++) if (ip->inode.iso_gid == *gp) { if (mode & VEXEC) mask |= S_IXGRP; if (mode & VREAD) mask |= S_IRGRP; if (mode & VWRITE) mask |= S_IWGRP; return ((ip->inode.iso_mode & mask) == mask ? 0 : EACCES); } /* Otherwise, check everyone else. */ if (mode & VEXEC) mask |= S_IXOTH; if (mode & VREAD) mask |= S_IROTH; if (mode & VWRITE) mask |= S_IWOTH; return ((ip->inode.iso_mode & mask) == mask ? 0 : EACCES); } static int cd9660_getattr(ap) struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct proc *a_p; } */ *ap; { struct vnode *vp = ap->a_vp; register struct vattr *vap = ap->a_vap; register struct iso_node *ip = VTOI(vp); vap->va_fsid = dev2udev(ip->i_dev); vap->va_fileid = ip->i_number; vap->va_mode = ip->inode.iso_mode; vap->va_nlink = ip->inode.iso_links; vap->va_uid = ip->inode.iso_uid; vap->va_gid = ip->inode.iso_gid; vap->va_atime = ip->inode.iso_atime; vap->va_mtime = ip->inode.iso_mtime; vap->va_ctime = ip->inode.iso_ctime; vap->va_rdev = ip->inode.iso_rdev; vap->va_size = (u_quad_t) ip->i_size; if (ip->i_size == 0 && (vap->va_mode & S_IFMT) == S_IFLNK) { struct vop_readlink_args rdlnk; struct iovec aiov; struct uio auio; char *cp; MALLOC(cp, char *, MAXPATHLEN, M_TEMP, M_WAITOK); aiov.iov_base = cp; aiov.iov_len = MAXPATHLEN; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_procp = ap->a_p; auio.uio_resid = MAXPATHLEN; rdlnk.a_uio = &auio; rdlnk.a_vp = ap->a_vp; rdlnk.a_cred = ap->a_cred; if (cd9660_readlink(&rdlnk) == 0) vap->va_size = MAXPATHLEN - auio.uio_resid; FREE(cp, M_TEMP); } vap->va_flags = 0; vap->va_gen = 1; vap->va_blocksize = ip->i_mnt->logical_block_size; vap->va_bytes = (u_quad_t) ip->i_size; vap->va_type = vp->v_type; vap->va_filerev = 0; return (0); } /* * Vnode op for reading. 
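/*
 * Illustrative sketch (not from the tree): cd9660_access() above is the
 * classic owner/group/other check -- pick one class for the caller, build a
 * mask of the bits that class would need, and require them all.  A minimal
 * userland rendition, without the superuser shortcut or group-list walk:
 */
#include <sys/types.h>
#include <sys/stat.h>

#define WANT_READ  04
#define WANT_WRITE 02
#define WANT_EXEC  01

static int simple_access(mode_t file_mode, uid_t file_uid, gid_t file_gid,
    uid_t uid, gid_t gid, int want)
{
    mode_t mask = 0;

    if (uid == file_uid) {                     /* owner class */
        if (want & WANT_READ)  mask |= S_IRUSR;
        if (want & WANT_WRITE) mask |= S_IWUSR;
        if (want & WANT_EXEC)  mask |= S_IXUSR;
    } else if (gid == file_gid) {              /* group class */
        if (want & WANT_READ)  mask |= S_IRGRP;
        if (want & WANT_WRITE) mask |= S_IWGRP;
        if (want & WANT_EXEC)  mask |= S_IXGRP;
    } else {                                   /* everyone else */
        if (want & WANT_READ)  mask |= S_IROTH;
        if (want & WANT_WRITE) mask |= S_IWOTH;
        if (want & WANT_EXEC)  mask |= S_IXOTH;
    }
    return (file_mode & mask) == mask ? 0 : -1;    /* -1 ~ EACCES */
}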
*/ static int cd9660_read(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { struct vnode *vp = ap->a_vp; register struct uio *uio = ap->a_uio; register struct iso_node *ip = VTOI(vp); register struct iso_mnt *imp; struct buf *bp; daddr_t lbn, rablock; off_t diff; int rasize, error = 0; long size, n, on; if (uio->uio_resid == 0) return (0); if (uio->uio_offset < 0) return (EINVAL); ip->i_flag |= IN_ACCESS; imp = ip->i_mnt; do { lbn = lblkno(imp, uio->uio_offset); on = blkoff(imp, uio->uio_offset); n = min((u_int)(imp->logical_block_size - on), uio->uio_resid); diff = (off_t)ip->i_size - uio->uio_offset; if (diff <= 0) return (0); if (diff < n) n = diff; size = blksize(imp, ip, lbn); rablock = lbn + 1; if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) { if (lblktosize(imp, rablock) < ip->i_size) error = cluster_read(vp, (off_t)ip->i_size, lbn, size, NOCRED, uio->uio_resid, (ap->a_ioflag >> 16), &bp); else error = bread(vp, lbn, size, NOCRED, &bp); } else { if (vp->v_lastr + 1 == lbn && lblktosize(imp, rablock) < ip->i_size) { rasize = blksize(imp, ip, rablock); error = breadn(vp, lbn, size, &rablock, &rasize, 1, NOCRED, &bp); } else error = bread(vp, lbn, size, NOCRED, &bp); } vp->v_lastr = lbn; n = min(n, size - bp->b_resid); if (error) { brelse(bp); return (error); } error = uiomove(bp->b_data + on, (int)n, uio); brelse(bp); } while (error == 0 && uio->uio_resid > 0 && n != 0); return (error); } /* * Structure for reading directories */ struct isoreaddir { struct dirent saveent; struct dirent assocent; struct dirent current; off_t saveoff; off_t assocoff; off_t curroff; struct uio *uio; off_t uio_off; int eofflag; u_long *cookies; int ncookies; }; int iso_uiodir(idp,dp,off) struct isoreaddir *idp; struct dirent *dp; off_t off; { int error; dp->d_name[dp->d_namlen] = 0; dp->d_reclen = GENERIC_DIRSIZ(dp); if (idp->uio->uio_resid < dp->d_reclen) { idp->eofflag = 0; return (-1); } if (idp->cookies) { if (idp->ncookies <= 0) { idp->eofflag = 0; return (-1); } *idp->cookies++ = off; --idp->ncookies; } if ((error = uiomove((caddr_t) dp,dp->d_reclen,idp->uio)) != 0) return (error); idp->uio_off = off; return (0); } int iso_shipdir(idp) struct isoreaddir *idp; { struct dirent *dp; int cl, sl, assoc; int error; char *cname, *sname; cl = idp->current.d_namlen; cname = idp->current.d_name; assoc = (cl > 1) && (*cname == ASSOCCHAR); if (assoc) { cl--; cname++; } dp = &idp->saveent; sname = dp->d_name; if (!(sl = dp->d_namlen)) { dp = &idp->assocent; sname = dp->d_name + 1; sl = dp->d_namlen - 1; } if (sl > 0) { if (sl != cl || bcmp(sname,cname,sl)) { if (idp->assocent.d_namlen) { if ((error = iso_uiodir(idp,&idp->assocent,idp->assocoff)) != 0) return (error); idp->assocent.d_namlen = 0; } if (idp->saveent.d_namlen) { if ((error = iso_uiodir(idp,&idp->saveent,idp->saveoff)) != 0) return (error); idp->saveent.d_namlen = 0; } } } idp->current.d_reclen = GENERIC_DIRSIZ(&idp->current); if (assoc) { idp->assocoff = idp->curroff; bcopy(&idp->current,&idp->assocent,idp->current.d_reclen); } else { idp->saveoff = idp->curroff; bcopy(&idp->current,&idp->saveent,idp->current.d_reclen); } return (0); } /* * Vnode op for readdir */ static int cd9660_readdir(ap) struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; int *a_ncookies; u_long *a_cookies; } */ *ap; { register struct uio *uio = ap->a_uio; struct isoreaddir *idp; struct vnode *vdp = ap->a_vp; struct iso_node *dp; struct iso_mnt *imp; 
struct buf *bp = NULL; struct iso_directory_record *ep; int entryoffsetinblock; doff_t endsearch; u_long bmask; int error = 0; int reclen; u_short namelen; int ncookies = 0; u_long *cookies = NULL; dp = VTOI(vdp); imp = dp->i_mnt; bmask = imp->im_bmask; MALLOC(idp, struct isoreaddir *, sizeof(*idp), M_TEMP, M_WAITOK); idp->saveent.d_namlen = idp->assocent.d_namlen = 0; /* * XXX * Is it worth trying to figure out the type? */ idp->saveent.d_type = idp->assocent.d_type = idp->current.d_type = DT_UNKNOWN; idp->uio = uio; if (ap->a_ncookies == NULL) { idp->cookies = NULL; } else { /* * Guess the number of cookies needed. */ ncookies = uio->uio_resid / 16; MALLOC(cookies, u_long *, ncookies * sizeof(u_int), M_TEMP, M_WAITOK); idp->cookies = cookies; idp->ncookies = ncookies; } idp->eofflag = 1; idp->curroff = uio->uio_offset; if ((entryoffsetinblock = idp->curroff & bmask) && (error = cd9660_blkatoff(vdp, (off_t)idp->curroff, NULL, &bp))) { FREE(idp, M_TEMP); return (error); } endsearch = dp->i_size; while (idp->curroff < endsearch) { /* * If offset is on a block boundary, * read the next directory block. * Release previous if it exists. */ if ((idp->curroff & bmask) == 0) { if (bp != NULL) brelse(bp); if ((error = cd9660_blkatoff(vdp, (off_t)idp->curroff, NULL, &bp)) != 0) break; entryoffsetinblock = 0; } /* * Get pointer to next entry. */ ep = (struct iso_directory_record *) ((char *)bp->b_data + entryoffsetinblock); reclen = isonum_711(ep->length); if (reclen == 0) { /* skip to next block, if any */ idp->curroff = (idp->curroff & ~bmask) + imp->logical_block_size; continue; } if (reclen < ISO_DIRECTORY_RECORD_SIZE) { error = EINVAL; /* illegal entry, stop */ break; } if (entryoffsetinblock + reclen > imp->logical_block_size) { error = EINVAL; /* illegal directory, so stop looking */ break; } idp->current.d_namlen = isonum_711(ep->name_len); if (reclen < ISO_DIRECTORY_RECORD_SIZE + idp->current.d_namlen) { error = EINVAL; /* illegal entry, stop */ break; } if (isonum_711(ep->flags)&2) idp->current.d_fileno = isodirino(ep, imp); else idp->current.d_fileno = dbtob(bp->b_blkno) + entryoffsetinblock; idp->curroff += reclen; switch (imp->iso_ftype) { case ISO_FTYPE_RRIP: cd9660_rrip_getname(ep,idp->current.d_name, &namelen, &idp->current.d_fileno,imp); idp->current.d_namlen = (u_char)namelen; if (idp->current.d_namlen) error = iso_uiodir(idp,&idp->current,idp->curroff); break; default: /* ISO_FTYPE_DEFAULT || ISO_FTYPE_9660 || ISO_FTYPE_HIGH_SIERRA*/ strcpy(idp->current.d_name,".."); if (idp->current.d_namlen == 1 && ep->name[0] == 0) { idp->current.d_namlen = 1; error = iso_uiodir(idp,&idp->current,idp->curroff); } else if (idp->current.d_namlen == 1 && ep->name[0] == 1) { idp->current.d_namlen = 2; error = iso_uiodir(idp,&idp->current,idp->curroff); } else { isofntrans(ep->name,idp->current.d_namlen, idp->current.d_name, &namelen, imp->iso_ftype == ISO_FTYPE_9660, isonum_711(ep->flags)&4, imp->joliet_level); idp->current.d_namlen = (u_char)namelen; if (imp->iso_ftype == ISO_FTYPE_DEFAULT) error = iso_shipdir(idp); else error = iso_uiodir(idp,&idp->current,idp->curroff); } } if (error) break; entryoffsetinblock += reclen; } if (!error && imp->iso_ftype == ISO_FTYPE_DEFAULT) { idp->current.d_namlen = 0; error = iso_shipdir(idp); } if (error < 0) error = 0; if (ap->a_ncookies != NULL) { if (error) free(cookies, M_TEMP); else { /* * Work out the number of cookies actually used. 
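/*
 * Illustrative sketch (not from the tree): the readdir loop above walks
 * variable-length ISO 9660 directory records and treats three conditions
 * specially -- a zero length (the rest of the block is padding), a record
 * shorter than the 33-byte fixed header, and a record that would cross the
 * block boundary or not contain its own name.  The same validation over one
 * in-memory block, with a made-up callback:
 */
#include <stddef.h>

#define DIRREC_FIXED 33         /* fixed part of an ISO 9660 directory record */

/* Visit each record in one logical block; return 0 on success, -1 on a bad record. */
static int walk_dir_block(const unsigned char *blk, size_t blksize,
    void (*visit)(const unsigned char *rec, size_t len))
{
    size_t off = 0;

    while (off < blksize) {
        size_t reclen = blk[off];              /* isonum_711: a single byte */

        if (reclen == 0)
            break;                             /* rest of block is padding */
        if (reclen < DIRREC_FIXED || off + reclen > blksize)
            return -1;                         /* corrupt directory */
        if (reclen < DIRREC_FIXED + blk[off + 32])
            return -1;                         /* name does not fit the record */
        visit(blk + off, reclen);
        off += reclen;
    }
    return 0;
}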
*/ *ap->a_ncookies = ncookies - idp->ncookies; *ap->a_cookies = cookies; } } if (bp) brelse (bp); uio->uio_offset = idp->uio_off; *ap->a_eofflag = idp->eofflag; FREE(idp, M_TEMP); return (error); } /* * Return target name of a symbolic link * Shouldn't we get the parent vnode and read the data from there? * This could eventually result in deadlocks in cd9660_lookup. * But otherwise the block read here is in the block buffer two times. */ typedef struct iso_directory_record ISODIR; typedef struct iso_node ISONODE; typedef struct iso_mnt ISOMNT; static int cd9660_readlink(ap) struct vop_readlink_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; } */ *ap; { ISONODE *ip; ISODIR *dirp; ISOMNT *imp; struct buf *bp; struct uio *uio; u_short symlen; int error; char *symname; ip = VTOI(ap->a_vp); imp = ip->i_mnt; uio = ap->a_uio; if (imp->iso_ftype != ISO_FTYPE_RRIP) return (EINVAL); /* * Get parents directory record block that this inode included. */ error = bread(imp->im_devvp, (ip->i_number >> imp->im_bshift) << (imp->im_bshift - DEV_BSHIFT), imp->logical_block_size, NOCRED, &bp); if (error) { brelse(bp); return (EINVAL); } /* * Setup the directory pointer for this inode */ dirp = (ISODIR *)(bp->b_data + (ip->i_number & imp->im_bmask)); /* * Just make sure, we have a right one.... * 1: Check not cross boundary on block */ if ((ip->i_number & imp->im_bmask) + isonum_711(dirp->length) > (unsigned)imp->logical_block_size) { brelse(bp); return (EINVAL); } /* * Now get a buffer * Abuse a namei buffer for now. */ if (uio->uio_segflg == UIO_SYSSPACE) symname = uio->uio_iov->iov_base; else symname = zalloc(namei_zone); /* * Ok, we just gathering a symbolic name in SL record. */ if (cd9660_rrip_getsymname(dirp, symname, &symlen, imp) == 0) { if (uio->uio_segflg != UIO_SYSSPACE) zfree(namei_zone, symname); brelse(bp); return (EINVAL); } /* * Don't forget before you leave from home ;-) */ brelse(bp); /* * return with the symbolic name to caller's. */ if (uio->uio_segflg != UIO_SYSSPACE) { error = uiomove(symname, symlen, uio); zfree(namei_zone, symname); return (error); } uio->uio_resid -= symlen; uio->uio_iov->iov_base += symlen; uio->uio_iov->iov_len -= symlen; return (0); } /* * Ufs abort op, called after namei() when a CREATE/DELETE isn't actually * done. If a buffer has been saved in anticipation of a CREATE, delete it. */ static int cd9660_abortop(ap) struct vop_abortop_args /* { struct vnode *a_dvp; struct componentname *a_cnp; } */ *ap; { if ((ap->a_cnp->cn_flags & (HASBUF | SAVESTART)) == HASBUF) zfree(namei_zone, ap->a_cnp->cn_pnbuf); return (0); } /* * Calculate the logical to physical mapping if not done already, * then call the device strategy routine. */ static int cd9660_strategy(ap) struct vop_strategy_args /* { struct buf *a_vp; struct buf *a_bp; } */ *ap; { register struct buf *bp = ap->a_bp; register struct vnode *vp = bp->b_vp; register struct iso_node *ip; int error; ip = VTOI(vp); if (vp->v_type == VBLK || vp->v_type == VCHR) panic("cd9660_strategy: spec"); if (bp->b_blkno == bp->b_lblkno) { if ((error = VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL))) { bp->b_error = error; bp->b_flags |= B_ERROR; biodone(bp); return (error); } if ((long)bp->b_blkno == -1) clrbuf(bp); } if ((long)bp->b_blkno == -1) { biodone(bp); return (0); } vp = ip->i_devvp; bp->b_dev = vp->v_rdev; VOP_STRATEGY(vp, bp); return (0); } /* * Print out the contents of an inode. 
*/ static int cd9660_print(ap) struct vop_print_args /* { struct vnode *a_vp; } */ *ap; { printf("tag VT_ISOFS, isofs vnode\n"); return (0); } /* * Return POSIX pathconf information applicable to cd9660 filesystems. */ static int cd9660_pathconf(ap) struct vop_pathconf_args /* { struct vnode *a_vp; int a_name; register_t *a_retval; } */ *ap; { switch (ap->a_name) { case _PC_LINK_MAX: *ap->a_retval = 1; return (0); case _PC_NAME_MAX: if (VTOI(ap->a_vp)->i_mnt->iso_ftype == ISO_FTYPE_RRIP) *ap->a_retval = NAME_MAX; else *ap->a_retval = 37; return (0); case _PC_PATH_MAX: *ap->a_retval = PATH_MAX; return (0); case _PC_PIPE_BUF: *ap->a_retval = PIPE_BUF; return (0); case _PC_CHOWN_RESTRICTED: *ap->a_retval = 1; return (0); case _PC_NO_TRUNC: *ap->a_retval = 1; return (0); default: return (EINVAL); } /* NOTREACHED */ } /* * get page routine * * XXX By default, wimp out... note that a_offset is ignored (and always * XXX has been). */ int cd9660_getpages(ap) struct vop_getpages_args *ap; { return vnode_pager_generic_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_reqpage); } /* * put page routine * * XXX By default, wimp out... note that a_offset is ignored (and always * XXX has been). */ int cd9660_putpages(ap) struct vop_putpages_args *ap; { return vnode_pager_generic_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync, ap->a_rtvals); } /* * Global vfs data structures for cd9660 */ vop_t **cd9660_vnodeop_p; static struct vnodeopv_entry_desc cd9660_vnodeop_entries[] = { { &vop_default_desc, (vop_t *) vop_defaultop }, { &vop_abortop_desc, (vop_t *) cd9660_abortop }, { &vop_access_desc, (vop_t *) cd9660_access }, { &vop_bmap_desc, (vop_t *) cd9660_bmap }, { &vop_cachedlookup_desc, (vop_t *) cd9660_lookup }, { &vop_getattr_desc, (vop_t *) cd9660_getattr }, { &vop_inactive_desc, (vop_t *) cd9660_inactive }, { &vop_islocked_desc, (vop_t *) vop_stdislocked }, { &vop_lock_desc, (vop_t *) vop_stdlock }, { &vop_lookup_desc, (vop_t *) vfs_cache_lookup }, { &vop_pathconf_desc, (vop_t *) cd9660_pathconf }, { &vop_print_desc, (vop_t *) cd9660_print }, { &vop_read_desc, (vop_t *) cd9660_read }, { &vop_readdir_desc, (vop_t *) cd9660_readdir }, { &vop_readlink_desc, (vop_t *) cd9660_readlink }, { &vop_reclaim_desc, (vop_t *) cd9660_reclaim }, { &vop_setattr_desc, (vop_t *) cd9660_setattr }, { &vop_strategy_desc, (vop_t *) cd9660_strategy }, { &vop_unlock_desc, (vop_t *) vop_stdunlock }, { &vop_getpages_desc, (vop_t *) cd9660_getpages }, { &vop_putpages_desc, (vop_t *) cd9660_putpages }, { NULL, NULL } }; static struct vnodeopv_desc cd9660_vnodeop_opv_desc = { &cd9660_vnodeop_p, cd9660_vnodeop_entries }; VNODEOP_SET(cd9660_vnodeop_opv_desc); /* * Special device vnode ops */ vop_t **cd9660_specop_p; static struct vnodeopv_entry_desc cd9660_specop_entries[] = { { &vop_default_desc, (vop_t *) spec_vnoperate }, { &vop_access_desc, (vop_t *) cd9660_access }, { &vop_getattr_desc, (vop_t *) cd9660_getattr }, { &vop_inactive_desc, (vop_t *) cd9660_inactive }, { &vop_islocked_desc, (vop_t *) vop_stdislocked }, { &vop_lock_desc, (vop_t *) vop_stdlock }, { &vop_print_desc, (vop_t *) cd9660_print }, { &vop_reclaim_desc, (vop_t *) cd9660_reclaim }, { &vop_setattr_desc, (vop_t *) cd9660_setattr }, { &vop_unlock_desc, (vop_t *) vop_stdunlock }, { NULL, NULL } }; static struct vnodeopv_desc cd9660_specop_opv_desc = { &cd9660_specop_p, cd9660_specop_entries }; VNODEOP_SET(cd9660_specop_opv_desc); vop_t **cd9660_fifoop_p; static struct vnodeopv_entry_desc cd9660_fifoop_entries[] = { { &vop_default_desc, (vop_t *) fifo_vnoperate }, 
{ &vop_access_desc, (vop_t *) cd9660_access }, { &vop_getattr_desc, (vop_t *) cd9660_getattr }, { &vop_inactive_desc, (vop_t *) cd9660_inactive }, { &vop_islocked_desc, (vop_t *) vop_stdislocked }, { &vop_lock_desc, (vop_t *) vop_stdlock }, { &vop_print_desc, (vop_t *) cd9660_print }, { &vop_reclaim_desc, (vop_t *) cd9660_reclaim }, { &vop_setattr_desc, (vop_t *) cd9660_setattr }, { &vop_unlock_desc, (vop_t *) vop_stdunlock }, { NULL, NULL } }; static struct vnodeopv_desc cd9660_fifoop_opv_desc = { &cd9660_fifoop_p, cd9660_fifoop_entries }; VNODEOP_SET(cd9660_fifoop_opv_desc); Index: head/sys/kern/kern_conf.c =================================================================== --- head/sys/kern/kern_conf.c (revision 49534) +++ head/sys/kern/kern_conf.c (revision 49535) @@ -1,285 +1,304 @@ /*- * Parts Copyright (c) 1995 Terrence R. Lambert * Copyright (c) 1995 Julian R. Elischer * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Terrence R. Lambert. * 4. The name Terrence R. Lambert may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY Julian R. Elischer ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE TERRENCE R. LAMBERT BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $Id: kern_conf.c,v 1.53 1999/07/20 21:51:12 green Exp $ + * $Id: kern_conf.c,v 1.54 1999/08/08 00:34:00 grog Exp $ */ #include #include #include #include #include #include #include #include +#include -#include - #define cdevsw_ALLOCSTART (NUMCDEVSW/2) struct cdevsw *cdevsw[NUMCDEVSW]; static int bmaj2cmaj[NUMCDEVSW]; MALLOC_DEFINE(M_DEVT, "dev_t", "dev_t storage"); #define DEVT_HASH 83 #define DEVT_STASH 50 static struct specinfo devt_stash[DEVT_STASH]; static SLIST_HEAD(devt_hash_head, specinfo) dev_hash[DEVT_HASH]; /* * Routine to convert from character to block device number. * * A minimal stub routine can always return NODEV. 
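/*
 * Illustrative sketch (not from the tree): the dev_t helpers that follow pack
 * the major number into bits 8..15 of a udev_t and spread the minor over the
 * remaining bits (the low byte plus everything above bit 15), which is why
 * minor() masks with 0xffff00ff.  A standalone round-trip of that encoding,
 * using an invented type name:
 */
#include <assert.h>
#include <stdint.h>

typedef uint32_t my_udev_t;     /* hypothetical stand-in for udev_t */

static my_udev_t pack_udev(int maj, int min) { return ((my_udev_t)maj << 8) | (my_udev_t)min; }
static int unpack_major(my_udev_t u)         { return (u >> 8) & 0xff; }
static int unpack_minor(my_udev_t u)         { return u & 0xffff00ff; }

int main(void)
{
    my_udev_t u = pack_udev(13, 0x10003);     /* a minor larger than 255 still fits */

    assert(unpack_major(u) == 13);
    assert(unpack_minor(u) == 0x10003);
    return 0;
}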
*/ dev_t chrtoblk(dev_t dev) { struct cdevsw *cd; if((cd = devsw(dev)) != NULL) { if (cd->d_bmaj != -1) return(makebdev(cd->d_bmaj,minor(dev))); } return(NODEV); } struct cdevsw * devsw(dev_t dev) { return(cdevsw[major(dev)]); } struct cdevsw * bdevsw(dev_t dev) { return(cdevsw[major(dev)]); } /* * Add a cdevsw entry */ int cdevsw_add(struct cdevsw *newentry) { int i; static int setup; if (!setup) { for (i = 0; i < NUMCDEVSW; i++) if (!bmaj2cmaj[i]) bmaj2cmaj[i] = 254; setup++; } if (newentry->d_maj < 0 || newentry->d_maj >= NUMCDEVSW) { printf("%s: ERROR: driver has bogus cdevsw->d_maj = %d\n", newentry->d_name, newentry->d_maj); return EINVAL; } if (cdevsw[newentry->d_maj]) { printf("WARNING: \"%s\" is usurping \"%s\"'s cdevsw[]\n", newentry->d_name, cdevsw[newentry->d_maj]->d_name); } cdevsw[newentry->d_maj] = newentry; if (newentry->d_bmaj >= 0 && newentry->d_bmaj < NUMCDEVSW) { if (bmaj2cmaj[newentry->d_bmaj] != 254) { printf("WARNING: \"%s\" is usurping \"%s\"'s bmaj\n", newentry->d_name, cdevsw[bmaj2cmaj[newentry->d_bmaj]]->d_name); } bmaj2cmaj[newentry->d_bmaj] = newentry->d_maj; } return 0; } /* * Remove a cdevsw entry */ int cdevsw_remove(struct cdevsw *oldentry) { if (oldentry->d_maj < 0 || oldentry->d_maj >= NUMCDEVSW) { printf("%s: ERROR: driver has bogus cdevsw->d_maj = %d\n", oldentry->d_name, oldentry->d_maj); return EINVAL; } cdevsw[oldentry->d_maj] = NULL; if (oldentry->d_bmaj >= 0 && oldentry->d_bmaj < NUMCDEVSW) bmaj2cmaj[oldentry->d_bmaj] = 254; return 0; } int devsw_module_handler(module_t mod, int what, void* arg) { struct devsw_module_data* data = (struct devsw_module_data*) arg; int error = 0; switch (what) { case MOD_LOAD: error = cdevsw_add(data->cdevsw); if (!error && data->chainevh) error = data->chainevh(mod, what, data->chainarg); return error; case MOD_UNLOAD: if (data->chainevh) { error = data->chainevh(mod, what, data->chainarg); if (error) return error; } cdevsw_remove(data->cdevsw); return error; } if (data->chainevh) return data->chainevh(mod, what, data->chainarg); else return 0; } /* * dev_t and u_dev_t primitives */ int major(dev_t x) { if (x == NODEV) return NOUDEV; return((x->si_udev >> 8) & 0xff); } int minor(dev_t x) { if (x == NODEV) return NOUDEV; return(x->si_udev & 0xffff00ff); } dev_t makebdev(int x, int y) { return (makedev(bmaj2cmaj[x], y)); } dev_t makedev(int x, int y) { struct specinfo *si; udev_t udev; int hash; static int stashed; udev = (x << 8) | y; hash = udev % DEVT_HASH; SLIST_FOREACH(si, &dev_hash[hash], si_hash) { if (si->si_udev == udev) return (si); } if (stashed >= DEVT_STASH) { MALLOC(si, struct specinfo *, sizeof(*si), M_DEVT, M_USE_RESERVE); } else { si = devt_stash + stashed++; } bzero(si, sizeof(*si)); si->si_udev = udev; si->si_bsize_phys = DEV_BSIZE; si->si_bsize_best = BLKDEV_IOSIZE; si->si_bsize_max = MAXBSIZE; + if (y > 256) + sprintf(si->si_name, "#%d/0x%x", x, y); + else + sprintf(si->si_name, "#%d/%d", x, y); SLIST_INSERT_HEAD(&dev_hash[hash], si, si_hash); return (si); } udev_t dev2udev(dev_t x) { if (x == NODEV) return NOUDEV; return (x->si_udev); } udev_t dev2budev(dev_t x) { if (x == NODEV) return NOUDEV; else return makeudev(devsw(x)->d_bmaj, minor(x)); } dev_t udev2dev(udev_t x, int b) { switch (b) { case 0: return makedev(umajor(x), uminor(x)); case 1: return makebdev(umajor(x), uminor(x)); default: Debugger("udev2dev(...,X)"); return NODEV; } } int uminor(udev_t dev) { return(dev & 0xffff00ff); } int umajor(udev_t dev) { return((dev & 0xff00) >> 8); } udev_t makeudev(int x, int y) { return ((x << 8) | 
y); +} + +dev_t +make_dev(struct cdevsw *devsw, int minor, uid_t uid, gid_t gid, int perms, char *fmt, ...) +{ + dev_t dev; + va_list ap; + int i; + + dev = makedev(devsw->d_maj, minor); + va_start(ap, fmt); + i = kvprintf(fmt, NULL, dev->si_name, 32, ap); + dev->si_name[i] = '\0'; + va_end(ap); + dev->si_devsw = devsw; + return (dev); } Index: head/sys/kern/kern_mib.c =================================================================== --- head/sys/kern/kern_mib.c (revision 49534) +++ head/sys/kern/kern_mib.c (revision 49535) @@ -1,251 +1,251 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Mike Karels at Berkeley Software Design, Inc. * * Quite extensively rewritten by Poul-Henning Kamp of the FreeBSD * project, to make these variables more userfriendly. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)kern_sysctl.c 8.4 (Berkeley) 4/14/94 - * $Id: kern_mib.c,v 1.21 1999/07/19 09:13:12 phk Exp $ + * $Id: kern_mib.c,v 1.22 1999/07/20 07:19:32 phk Exp $ */ #include #include #include #include #include #include #include #if defined(SMP) #include #endif SYSCTL_NODE(, 0, sysctl, CTLFLAG_RW, 0, "Sysctl internal magic"); SYSCTL_NODE(, CTL_KERN, kern, CTLFLAG_RW, 0, "High kernel, proc, limits &c"); SYSCTL_NODE(, CTL_VM, vm, CTLFLAG_RW, 0, "Virtual memory"); SYSCTL_NODE(, CTL_VFS, vfs, CTLFLAG_RW, 0, "File system"); SYSCTL_NODE(, CTL_NET, net, CTLFLAG_RW, 0, "Network, (see socket.h)"); SYSCTL_NODE(, CTL_DEBUG, debug, CTLFLAG_RW, 0, "Debugging"); SYSCTL_NODE(_debug, OID_AUTO, sizeof, CTLFLAG_RW, 0, "Sizeof various things"); SYSCTL_NODE(, CTL_HW, hw, CTLFLAG_RW, 0, "hardware"); SYSCTL_NODE(, CTL_MACHDEP, machdep, CTLFLAG_RW, 0, "machine dependent"); SYSCTL_NODE(, CTL_USER, user, CTLFLAG_RW, 0, "user-level"); SYSCTL_NODE(, CTL_P1003_1B, p1003_1b, CTLFLAG_RW, 0, "p1003_1b, (see p1003_1b.h)"); SYSCTL_NODE(_kern, OID_AUTO, prison, CTLFLAG_RW, 0, "Prison rules"); SYSCTL_STRING(_kern, KERN_OSRELEASE, osrelease, CTLFLAG_RD, osrelease, 0, "Operating system type"); SYSCTL_INT(_kern, KERN_OSREV, osrevision, CTLFLAG_RD, 0, BSD, "Operating system revision"); SYSCTL_STRING(_kern, KERN_VERSION, version, CTLFLAG_RD, version, 0, "Kernel version"); SYSCTL_STRING(_kern, KERN_OSTYPE, ostype, CTLFLAG_RD, ostype, 0, "Operating system type"); extern int osreldate; SYSCTL_INT(_kern, KERN_OSRELDATE, osreldate, CTLFLAG_RD, &osreldate, 0, "Operating system release date"); SYSCTL_INT(_kern, KERN_MAXPROC, maxproc, CTLFLAG_RD, &maxproc, 0, "Maximum number of processes"); SYSCTL_INT(_kern, KERN_MAXPROCPERUID, maxprocperuid, CTLFLAG_RW, &maxprocperuid, 0, "Maximum processes allowed per userid"); SYSCTL_INT(_kern, KERN_ARGMAX, argmax, CTLFLAG_RD, 0, ARG_MAX, "Maximum bytes of argument to execve(2)"); SYSCTL_INT(_kern, KERN_POSIX1, posix1version, CTLFLAG_RD, 0, _KPOSIX_VERSION, "Version of POSIX attempting to comply to"); SYSCTL_INT(_kern, KERN_NGROUPS, ngroups, CTLFLAG_RD, 0, NGROUPS_MAX, "Maximum number of groups a user can belong to"); SYSCTL_INT(_kern, KERN_JOB_CONTROL, job_control, CTLFLAG_RD, 0, 1, "Whether job control is available"); #ifdef _POSIX_SAVED_IDS SYSCTL_INT(_kern, KERN_SAVED_IDS, saved_ids, CTLFLAG_RD, 0, 1, "Whether saved set-group/user ID is available"); #else SYSCTL_INT(_kern, KERN_SAVED_IDS, saved_ids, CTLFLAG_RD, 0, 0, "Whether saved set-group/user ID is available"); #endif char kernelname[MAXPATHLEN] = "/kernel"; /* XXX bloat */ SYSCTL_STRING(_kern, KERN_BOOTFILE, bootfile, CTLFLAG_RW, kernelname, sizeof kernelname, "Name of kernel file booted"); #ifdef SMP SYSCTL_INT(_hw, HW_NCPU, ncpu, CTLFLAG_RD, &mp_ncpus, 0, "Number of active CPUs"); #else SYSCTL_INT(_hw, HW_NCPU, ncpu, CTLFLAG_RD, 0, 1, "Number of active CPUs"); #endif SYSCTL_INT(_hw, HW_BYTEORDER, byteorder, CTLFLAG_RD, 0, BYTE_ORDER, "System byte order"); SYSCTL_INT(_hw, HW_PAGESIZE, pagesize, CTLFLAG_RD, 0, PAGE_SIZE, "System memory page size"); static char machine_arch[] = MACHINE_ARCH; SYSCTL_STRING(_hw, HW_MACHINE_ARCH, machine_arch, CTLFLAG_RD, machine_arch, 0, "System architecture"); char hostname[MAXHOSTNAMELEN]; static int sysctl_hostname SYSCTL_HANDLER_ARGS { int error; if (req->p->p_prison) error = sysctl_handle_string(oidp, req->p->p_prison->pr_host, sizeof req->p->p_prison->pr_host, req); else error = sysctl_handle_string(oidp, hostname, sizeof hostname, req); return (error); } SYSCTL_PROC(_kern, KERN_HOSTNAME, hostname, 
CTLTYPE_STRING|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, sysctl_hostname, "A", "Hostname"); int securelevel = -1; static int sysctl_kern_securelvl SYSCTL_HANDLER_ARGS { int error, level; level = securelevel; error = sysctl_handle_int(oidp, &level, 0, req); if (error || !req->newptr) return (error); if (level < securelevel) return (EPERM); securelevel = level; return (error); } SYSCTL_PROC(_kern, KERN_SECURELVL, securelevel, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_kern_securelvl, "I", "Current secure level"); char domainname[MAXHOSTNAMELEN]; SYSCTL_STRING(_kern, KERN_NISDOMAINNAME, domainname, CTLFLAG_RW, &domainname, sizeof(domainname), "Name of the current YP/NIS domain"); long hostid; /* Some trouble here, if sizeof (int) != sizeof (long) */ SYSCTL_INT(_kern, KERN_HOSTID, hostid, CTLFLAG_RW, &hostid, 0, "Host ID"); /* * This is really cheating. These actually live in the libc, something * which I'm not quite sure is a good idea anyway, but in order for * getnext and friends to actually work, we define dummies here. */ SYSCTL_STRING(_user, USER_CS_PATH, cs_path, CTLFLAG_RD, "", 0, "PATH that finds all the standard utilities"); SYSCTL_INT(_user, USER_BC_BASE_MAX, bc_base_max, CTLFLAG_RD, 0, 0, "Max ibase/obase values in bc(1)"); SYSCTL_INT(_user, USER_BC_DIM_MAX, bc_dim_max, CTLFLAG_RD, 0, 0, "Max array size in bc(1)"); SYSCTL_INT(_user, USER_BC_SCALE_MAX, bc_scale_max, CTLFLAG_RD, 0, 0, "Max scale value in bc(1)"); SYSCTL_INT(_user, USER_BC_STRING_MAX, bc_string_max, CTLFLAG_RD, 0, 0, "Max string length in bc(1)"); SYSCTL_INT(_user, USER_COLL_WEIGHTS_MAX, coll_weights_max, CTLFLAG_RD, 0, 0, "Maximum number of weights assigned to an LC_COLLATE locale entry"); SYSCTL_INT(_user, USER_EXPR_NEST_MAX, expr_nest_max, CTLFLAG_RD, 0, 0, ""); SYSCTL_INT(_user, USER_LINE_MAX, line_max, CTLFLAG_RD, 0, 0, "Max length (bytes) of a text-processing utility's input line"); SYSCTL_INT(_user, USER_RE_DUP_MAX, re_dup_max, CTLFLAG_RD, 0, 0, "Maximum number of repeats of a regexp permitted"); SYSCTL_INT(_user, USER_POSIX2_VERSION, posix2_version, CTLFLAG_RD, 0, 0, "The version of POSIX 1003.2 with which the system attempts to comply"); SYSCTL_INT(_user, USER_POSIX2_C_BIND, posix2_c_bind, CTLFLAG_RD, 0, 0, "Whether C development supports the C bindings option"); SYSCTL_INT(_user, USER_POSIX2_C_DEV, posix2_c_dev, CTLFLAG_RD, 0, 0, "Whether system supports the C development utilities option"); SYSCTL_INT(_user, USER_POSIX2_CHAR_TERM, posix2_char_term, CTLFLAG_RD, 0, 0, ""); SYSCTL_INT(_user, USER_POSIX2_FORT_DEV, posix2_fort_dev, CTLFLAG_RD, 0, 0, "Whether system supports FORTRAN development utilities"); SYSCTL_INT(_user, USER_POSIX2_FORT_RUN, posix2_fort_run, CTLFLAG_RD, 0, 0, "Whether system supports FORTRAN runtime utilities"); SYSCTL_INT(_user, USER_POSIX2_LOCALEDEF, posix2_localedef, CTLFLAG_RD, 0, 0, "Whether system supports creation of locales"); SYSCTL_INT(_user, USER_POSIX2_SW_DEV, posix2_sw_dev, CTLFLAG_RD, 0, 0, "Whether system supports software development utilities"); SYSCTL_INT(_user, USER_POSIX2_UPE, posix2_upe, CTLFLAG_RD, 0, 0, "Whether system supports the user portability utilities"); SYSCTL_INT(_user, USER_STREAM_MAX, stream_max, CTLFLAG_RD, 0, 0, "Min Maximum number of streams a process may have open at one time"); SYSCTL_INT(_user, USER_TZNAME_MAX, tzname_max, CTLFLAG_RD, 0, 0, "Min Maximum number of types supported for timezone names"); #include SYSCTL_INT(_debug_sizeof, OID_AUTO, vnode, CTLFLAG_RD, 0, sizeof(struct vnode), "sizeof(struct vnode)"); SYSCTL_INT(_debug_sizeof, OID_AUTO, proc, CTLFLAG_RD, 
0, sizeof(struct proc), "sizeof(struct proc)"); -#include +#include SYSCTL_INT(_debug_sizeof, OID_AUTO, specinfo, CTLFLAG_RD, 0, sizeof(struct specinfo), "sizeof(struct specinfo)"); Index: head/sys/kern/vfs_aio.c =================================================================== --- head/sys/kern/vfs_aio.c (revision 49534) +++ head/sys/kern/vfs_aio.c (revision 49535) @@ -1,2008 +1,2007 @@ /* * Copyright (c) 1997 John S. Dyson. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. John S. Dyson's name may not be used to endorse or promote products * derived from this software without specific prior written permission. * * DISCLAIMER: This code isn't warranted to do anything useful. Anything * bad that happens because of using this software isn't the responsibility * of the author. This software is distributed AS-IS. * - * $Id: vfs_aio.c,v 1.53 1999/06/30 15:33:36 peter Exp $ + * $Id: vfs_aio.c,v 1.54 1999/07/01 13:21:40 peter Exp $ */ /* * This file contains support for the POSIX 1003.1B AIO/LIO facility. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#include #include #include #include #include #include #include #include #include #include #include static long jobrefid; #define JOBST_NULL 0x0 #define JOBST_JOBQPROC 0x1 #define JOBST_JOBQGLOBAL 0x2 #define JOBST_JOBRUNNING 0x3 #define JOBST_JOBFINISHED 0x4 #define JOBST_JOBQBUF 0x5 #define JOBST_JOBBFINISHED 0x6 #ifndef MAX_AIO_PER_PROC #define MAX_AIO_PER_PROC 32 #endif #ifndef MAX_AIO_QUEUE_PER_PROC #define MAX_AIO_QUEUE_PER_PROC 256 /* Bigger than AIO_LISTIO_MAX */ #endif #ifndef MAX_AIO_PROCS #define MAX_AIO_PROCS 32 #endif #ifndef MAX_AIO_QUEUE #define MAX_AIO_QUEUE 1024 /* Bigger than AIO_LISTIO_MAX */ #endif #ifndef TARGET_AIO_PROCS #define TARGET_AIO_PROCS 0 #endif #ifndef MAX_BUF_AIO #define MAX_BUF_AIO 16 #endif #ifndef AIOD_TIMEOUT_DEFAULT #define AIOD_TIMEOUT_DEFAULT (10 * hz) #endif #ifndef AIOD_LIFETIME_DEFAULT #define AIOD_LIFETIME_DEFAULT (30 * hz) #endif static int max_aio_procs = MAX_AIO_PROCS; static int num_aio_procs = 0; static int target_aio_procs = TARGET_AIO_PROCS; static int max_queue_count = MAX_AIO_QUEUE; static int num_queue_count = 0; static int num_buf_aio = 0; static int num_aio_resv_start = 0; static int aiod_timeout; static int aiod_lifetime; static int max_aio_per_proc = MAX_AIO_PER_PROC, max_aio_queue_per_proc=MAX_AIO_QUEUE_PER_PROC; static int max_buf_aio = MAX_BUF_AIO; SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "AIO mgmt"); SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, CTLFLAG_RW, &max_aio_per_proc, 0, ""); SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, CTLFLAG_RW, &max_aio_queue_per_proc, 0, ""); SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs, CTLFLAG_RW, &max_aio_procs, 0, ""); SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs, CTLFLAG_RD, &num_aio_procs, 0, ""); SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, CTLFLAG_RD, &num_queue_count, 0, ""); SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, CTLFLAG_RW, &max_queue_count, 0, ""); SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, CTLFLAG_RW, &target_aio_procs, 0, ""); SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, CTLFLAG_RW, &max_buf_aio, 0, ""); SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, 
CTLFLAG_RD, &num_buf_aio, 0, ""); SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, CTLFLAG_RW, &aiod_lifetime, 0, ""); SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_timeout, CTLFLAG_RW, &aiod_timeout, 0, ""); /* * Job queue item */ #define AIOCBLIST_CANCELLED 0x1 #define AIOCBLIST_RUNDOWN 0x4 #define AIOCBLIST_ASYNCFREE 0x8 #define AIOCBLIST_DONE 0x10 struct aiocblist { TAILQ_ENTRY (aiocblist) list; /* List of jobs */ TAILQ_ENTRY (aiocblist) plist; /* List of jobs for proc */ int jobflags; int jobstate; int inputcharge, outputcharge; struct buf *bp; /* buffer pointer */ struct proc *userproc; /* User process */ struct aioproclist *jobaioproc; /* AIO process descriptor */ struct aio_liojob *lio; /* optional lio job */ struct aiocb *uuaiocb; /* pointer in userspace of aiocb */ struct aiocb uaiocb; /* Kernel I/O control block */ }; /* * AIO process info */ #define AIOP_FREE 0x1 /* proc on free queue */ #define AIOP_SCHED 0x2 /* proc explicitly scheduled */ struct aioproclist { int aioprocflags; /* AIO proc flags */ TAILQ_ENTRY(aioproclist) list; /* List of processes */ struct proc *aioproc; /* The AIO thread */ TAILQ_HEAD (,aiocblist) jobtorun; /* suggested job to run */ }; /* * data-structure for lio signal management */ struct aio_liojob { int lioj_flags; int lioj_buffer_count; int lioj_buffer_finished_count; int lioj_queue_count; int lioj_queue_finished_count; struct sigevent lioj_signal; /* signal on all I/O done */ TAILQ_ENTRY (aio_liojob) lioj_list; struct kaioinfo *lioj_ki; }; #define LIOJ_SIGNAL 0x1 /* signal on all done (lio) */ #define LIOJ_SIGNAL_POSTED 0x2 /* signal has been posted */ /* * per process aio data structure */ struct kaioinfo { int kaio_flags; /* per process kaio flags */ int kaio_maxactive_count; /* maximum number of AIOs */ int kaio_active_count; /* number of currently used AIOs */ int kaio_qallowed_count; /* maxiumu size of AIO queue */ int kaio_queue_count; /* size of AIO queue */ int kaio_ballowed_count; /* maximum number of buffers */ int kaio_queue_finished_count; /* number of daemon jobs finished */ int kaio_buffer_count; /* number of physio buffers */ int kaio_buffer_finished_count; /* count of I/O done */ struct proc *kaio_p; /* process that uses this kaio block */ TAILQ_HEAD (,aio_liojob) kaio_liojoblist; /* list of lio jobs */ TAILQ_HEAD (,aiocblist) kaio_jobqueue; /* job queue for process */ TAILQ_HEAD (,aiocblist) kaio_jobdone; /* done queue for process */ TAILQ_HEAD (,aiocblist) kaio_bufqueue; /* buffer job queue for process */ TAILQ_HEAD (,aiocblist) kaio_bufdone; /* buffer done queue for process */ }; #define KAIO_RUNDOWN 0x1 /* process is being run down */ #define KAIO_WAKEUP 0x2 /* wakeup process when there is a significant event */ static TAILQ_HEAD (,aioproclist) aio_freeproc, aio_activeproc; static TAILQ_HEAD(,aiocblist) aio_jobs; /* Async job list */ static TAILQ_HEAD(,aiocblist) aio_bufjobs; /* Phys I/O job list */ static TAILQ_HEAD(,aiocblist) aio_freejobs; /* Pool of free jobs */ static void aio_init_aioinfo(struct proc *p) ; static void aio_onceonly(void *) ; static int aio_free_entry(struct aiocblist *aiocbe); static void aio_process(struct aiocblist *aiocbe); static int aio_newproc(void) ; static int aio_aqueue(struct proc *p, struct aiocb *job, int type) ; static void aio_physwakeup(struct buf *bp); static int aio_fphysio(struct proc *p, struct aiocblist *aiocbe, int type); static int aio_qphysio(struct proc *p, struct aiocblist *iocb); static void aio_daemon(void *uproc); SYSINIT(aio, SI_SUB_VFS, SI_ORDER_ANY, aio_onceonly, NULL); static vm_zone_t 
kaio_zone=0, aiop_zone=0, aiocb_zone=0, aiol_zone=0, aiolio_zone=0; /* * Startup initialization */ void aio_onceonly(void *na) { TAILQ_INIT(&aio_freeproc); TAILQ_INIT(&aio_activeproc); TAILQ_INIT(&aio_jobs); TAILQ_INIT(&aio_bufjobs); TAILQ_INIT(&aio_freejobs); kaio_zone = zinit("AIO", sizeof (struct kaioinfo), 0, 0, 1); aiop_zone = zinit("AIOP", sizeof (struct aioproclist), 0, 0, 1); aiocb_zone = zinit("AIOCB", sizeof (struct aiocblist), 0, 0, 1); aiol_zone = zinit("AIOL", AIO_LISTIO_MAX * sizeof (int), 0, 0, 1); aiolio_zone = zinit("AIOLIO", AIO_LISTIO_MAX * sizeof (struct aio_liojob), 0, 0, 1); aiod_timeout = AIOD_TIMEOUT_DEFAULT; aiod_lifetime = AIOD_LIFETIME_DEFAULT; jobrefid = 1; } /* * Init the per-process aioinfo structure. * The aioinfo limits are set per-process for user limit (resource) management. */ void aio_init_aioinfo(struct proc *p) { struct kaioinfo *ki; if (p->p_aioinfo == NULL) { ki = zalloc(kaio_zone); p->p_aioinfo = ki; ki->kaio_flags = 0; ki->kaio_maxactive_count = max_aio_per_proc; ki->kaio_active_count = 0; ki->kaio_qallowed_count = max_aio_queue_per_proc; ki->kaio_queue_count = 0; ki->kaio_ballowed_count = max_buf_aio; ki->kaio_buffer_count = 0; ki->kaio_buffer_finished_count = 0; ki->kaio_p = p; TAILQ_INIT(&ki->kaio_jobdone); TAILQ_INIT(&ki->kaio_jobqueue); TAILQ_INIT(&ki->kaio_bufdone); TAILQ_INIT(&ki->kaio_bufqueue); TAILQ_INIT(&ki->kaio_liojoblist); } } /* * Free a job entry. Wait for completion if it is currently * active, but don't delay forever. If we delay, we return * a flag that says that we have to restart the queue scan. */ int aio_free_entry(struct aiocblist *aiocbe) { struct kaioinfo *ki; struct aioproclist *aiop; struct aio_liojob *lj; struct proc *p; int error; int s; if (aiocbe->jobstate == JOBST_NULL) panic("aio_free_entry: freeing already free job"); p = aiocbe->userproc; ki = p->p_aioinfo; lj = aiocbe->lio; if (ki == NULL) panic("aio_free_entry: missing p->p_aioinfo"); if (aiocbe->jobstate == JOBST_JOBRUNNING) { if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE) return 0; aiocbe->jobflags |= AIOCBLIST_RUNDOWN; tsleep(aiocbe, PRIBIO|PCATCH, "jobwai", 0); } aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE; if (aiocbe->bp == NULL) { if (ki->kaio_queue_count <= 0) panic("aio_free_entry: process queue size <= 0"); if (num_queue_count <= 0) panic("aio_free_entry: system wide queue size <= 0"); if(lj) { lj->lioj_queue_count--; if (aiocbe->jobflags & AIOCBLIST_DONE) lj->lioj_queue_finished_count--; } ki->kaio_queue_count--; if (aiocbe->jobflags & AIOCBLIST_DONE) ki->kaio_queue_finished_count--; num_queue_count--; } else { if(lj) { lj->lioj_buffer_count--; if (aiocbe->jobflags & AIOCBLIST_DONE) lj->lioj_buffer_finished_count--; } if (aiocbe->jobflags & AIOCBLIST_DONE) ki->kaio_buffer_finished_count--; ki->kaio_buffer_count--; num_buf_aio--; } if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags & KAIO_RUNDOWN) && ((ki->kaio_buffer_count == 0) && (ki->kaio_queue_count == 0)))) { ki->kaio_flags &= ~KAIO_WAKEUP; wakeup(p); } if ( aiocbe->jobstate == JOBST_JOBQBUF) { if ((error = aio_fphysio(p, aiocbe, 1)) != 0) return error; if (aiocbe->jobstate != JOBST_JOBBFINISHED) panic("aio_free_entry: invalid physio finish-up state"); s = splbio(); TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist); splx(s); } else if ( aiocbe->jobstate == JOBST_JOBQPROC) { aiop = aiocbe->jobaioproc; TAILQ_REMOVE(&aiop->jobtorun, aiocbe, list); } else if ( aiocbe->jobstate == JOBST_JOBQGLOBAL) { TAILQ_REMOVE(&aio_jobs, aiocbe, list); } else if ( aiocbe->jobstate == JOBST_JOBFINISHED) { 
TAILQ_REMOVE(&ki->kaio_jobdone, aiocbe, plist); } else if ( aiocbe->jobstate == JOBST_JOBBFINISHED) { s = splbio(); TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist); splx(s); if (aiocbe->bp) { vunmapbuf(aiocbe->bp); relpbuf(aiocbe->bp, NULL); aiocbe->bp = NULL; } } if (lj && (lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 0)) { TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); zfree(aiolio_zone, lj); } TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); aiocbe->jobstate = JOBST_NULL; return 0; } /* * Rundown the jobs for a given process. */ void aio_proc_rundown(struct proc *p) { int s; struct kaioinfo *ki; struct aio_liojob *lj, *ljn; struct aiocblist *aiocbe, *aiocbn; ki = p->p_aioinfo; if (ki == NULL) return; ki->kaio_flags |= LIOJ_SIGNAL_POSTED; while ((ki->kaio_active_count > 0) || (ki->kaio_buffer_count > ki->kaio_buffer_finished_count)) { ki->kaio_flags |= KAIO_RUNDOWN; if (tsleep(p, PRIBIO, "kaiowt", aiod_timeout)) break; } restart1: for ( aiocbe = TAILQ_FIRST(&ki->kaio_jobdone); aiocbe; aiocbe = aiocbn) { aiocbn = TAILQ_NEXT(aiocbe, plist); if (aio_free_entry(aiocbe)) goto restart1; } restart2: for ( aiocbe = TAILQ_FIRST(&ki->kaio_jobqueue); aiocbe; aiocbe = aiocbn) { aiocbn = TAILQ_NEXT(aiocbe, plist); if (aio_free_entry(aiocbe)) goto restart2; } /* * Note the use of lots of splbio here, trying to avoid * splbio for long chains of I/O. Probably unnecessary. */ restart3: s = splbio(); while (TAILQ_FIRST(&ki->kaio_bufqueue)) { ki->kaio_flags |= KAIO_WAKEUP; tsleep (p, PRIBIO, "aioprn", 0); splx(s); goto restart3; } splx(s); restart4: s = splbio(); for ( aiocbe = TAILQ_FIRST(&ki->kaio_bufdone); aiocbe; aiocbe = aiocbn) { aiocbn = TAILQ_NEXT(aiocbe, plist); if (aio_free_entry(aiocbe)) { splx(s); goto restart4; } } splx(s); for ( lj = TAILQ_FIRST(&ki->kaio_liojoblist); lj; lj = ljn) { ljn = TAILQ_NEXT(lj, lioj_list); if ((lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 0)) { TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); zfree(aiolio_zone, lj); } else { #if defined(DIAGNOSTIC) printf("LIO job not cleaned up: B:%d, BF:%d, Q:%d, QF:%d\n", lj->lioj_buffer_count, lj->lioj_buffer_finished_count, lj->lioj_queue_count, lj->lioj_queue_finished_count); #endif } } zfree(kaio_zone, ki); p->p_aioinfo = NULL; } /* * Select a job to run (called by an AIO daemon) */ static struct aiocblist * aio_selectjob(struct aioproclist *aiop) { struct aiocblist *aiocbe; aiocbe = TAILQ_FIRST(&aiop->jobtorun); if (aiocbe) { TAILQ_REMOVE(&aiop->jobtorun, aiocbe, list); return aiocbe; } for (aiocbe = TAILQ_FIRST(&aio_jobs); aiocbe; aiocbe = TAILQ_NEXT(aiocbe, list)) { struct kaioinfo *ki; struct proc *userp; userp = aiocbe->userproc; ki = userp->p_aioinfo; if (ki->kaio_active_count < ki->kaio_maxactive_count) { TAILQ_REMOVE(&aio_jobs, aiocbe, list); return aiocbe; } } return NULL; } /* * The AIO processing activity. This is the code that does the * I/O request for the non-physio version of the operations. The * normal vn operations are used, and this code should work in * all instances for every type of file, including pipes, sockets, * fifos, and regular files. 
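 *
 * Illustrative userland analogue (not part of this change): aio_process()
 * below takes the transfer offset from the control block and passes
 * FOF_OFFSET to fo_read/fo_write, so the I/O happens at the requested
 * offset without moving the descriptor's seek position.  pread(2) has the
 * same semantics from user code; the path used here is only an example.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	char buf[16];
	ssize_t n;
	int fd = open("/etc/motd", O_RDONLY);	/* any readable file works */

	if (fd == -1)
		return (1);
	n = pread(fd, buf, sizeof(buf), 8);	/* read at offset 8 ... */
	printf("read %zd bytes, seek position still %ld\n",
	    n, (long)lseek(fd, 0, SEEK_CUR));	/* ... position stays at 0 */
	close(fd);
	return (0);
}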
*/ void aio_process(struct aiocblist *aiocbe) { struct filedesc *fdp; struct proc *userp, *mycp; struct aiocb *cb; struct file *fp; struct uio auio; struct iovec aiov; unsigned int fd; int cnt; int error; off_t offset; int oublock_st, oublock_end; int inblock_st, inblock_end; userp = aiocbe->userproc; cb = &aiocbe->uaiocb; mycp = curproc; fdp = mycp->p_fd; fd = cb->aio_fildes; fp = fdp->fd_ofiles[fd]; aiov.iov_base = (void *) cb->aio_buf; aiov.iov_len = cb->aio_nbytes; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = offset = cb->aio_offset; auio.uio_resid = cb->aio_nbytes; cnt = cb->aio_nbytes; auio.uio_segflg = UIO_USERSPACE; auio.uio_procp = mycp; inblock_st = mycp->p_stats->p_ru.ru_inblock; oublock_st = mycp->p_stats->p_ru.ru_oublock; if (cb->aio_lio_opcode == LIO_READ) { auio.uio_rw = UIO_READ; error = (*fp->f_ops->fo_read)(fp, &auio, fp->f_cred, FOF_OFFSET); } else { auio.uio_rw = UIO_WRITE; error = (*fp->f_ops->fo_write)(fp, &auio, fp->f_cred, FOF_OFFSET); } inblock_end = mycp->p_stats->p_ru.ru_inblock; oublock_end = mycp->p_stats->p_ru.ru_oublock; aiocbe->inputcharge = inblock_end - inblock_st; aiocbe->outputcharge = oublock_end - oublock_st; if (error) { if (auio.uio_resid != cnt) { if (error == ERESTART || error == EINTR || error == EWOULDBLOCK) error = 0; if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) psignal(userp, SIGPIPE); } } cnt -= auio.uio_resid; cb->_aiocb_private.error = error; cb->_aiocb_private.status = cnt; return; } /* * The AIO daemon, most of the actual work is done in aio_process, * but the setup (and address space mgmt) is done in this routine. */ static void aio_daemon(void *uproc) { int s; struct aioproclist *aiop; struct vmspace *myvm; struct proc *mycp; /* * Local copies of curproc (cp) and vmspace (myvm) */ mycp = curproc; myvm = mycp->p_vmspace; if (mycp->p_textvp) { vrele(mycp->p_textvp); mycp->p_textvp = NULL; } /* * Allocate and ready the aio control info. There is one * aiop structure per daemon. */ aiop = zalloc(aiop_zone); aiop->aioproc = mycp; aiop->aioprocflags |= AIOP_FREE; TAILQ_INIT(&aiop->jobtorun); /* * Place thread (lightweight process) onto the AIO free thread list */ if (TAILQ_EMPTY(&aio_freeproc)) wakeup(&aio_freeproc); TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list); /* * Make up a name for the daemon */ strcpy(mycp->p_comm, "aiod"); /* * Get rid of our current filedescriptors. AIOD's don't need any * filedescriptors, except as temporarily inherited from the client. * Credentials are also cloned, and made equivalent to "root." */ fdfree(mycp); mycp->p_fd = NULL; mycp->p_ucred = crcopy(mycp->p_ucred); mycp->p_ucred->cr_uid = 0; mycp->p_ucred->cr_ngroups = 1; mycp->p_ucred->cr_groups[0] = 1; /* * The daemon resides in its own pgrp. */ enterpgrp(mycp, mycp->p_pid, 1); /* * Mark special process type */ mycp->p_flag |= P_SYSTEM|P_KTHREADP; /* * Wakeup parent process. (Parent sleeps to keep from blasting away * creating to many daemons.) */ wakeup(mycp); while(1) { struct proc *curcp; struct aiocblist *aiocbe; /* * curcp is the current daemon process context. * userp is the current user process context. 
*/ curcp = mycp; /* * Take daemon off of free queue */ if (aiop->aioprocflags & AIOP_FREE) { TAILQ_REMOVE(&aio_freeproc, aiop, list); TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list); aiop->aioprocflags &= ~AIOP_FREE; } aiop->aioprocflags &= ~AIOP_SCHED; /* * Check for jobs */ while ((aiocbe = aio_selectjob(aiop)) != NULL) { struct proc *userp; struct aiocb *cb; struct kaioinfo *ki; struct aio_liojob *lj; cb = &aiocbe->uaiocb; userp = aiocbe->userproc; aiocbe->jobstate = JOBST_JOBRUNNING; /* * Connect to process address space for user program */ if (userp != curcp) { struct vmspace *tmpvm; /* * Save the current address space that we are connected to. */ tmpvm = mycp->p_vmspace; /* * Point to the new user address space, and refer to it. */ mycp->p_vmspace = userp->p_vmspace; mycp->p_vmspace->vm_refcnt++; /* * Activate the new mapping. */ pmap_activate(mycp); /* * If the old address space wasn't the daemons own address * space, then we need to remove the daemon's reference from * the other process that it was acting on behalf of. */ if (tmpvm != myvm) { vmspace_free(tmpvm); } /* * Disassociate from previous clients file descriptors, and * associate to the new clients descriptors. Note that * the daemon doesn't need to worry about its orginal * descriptors, because they were originally freed. */ if (mycp->p_fd) fdfree(mycp); mycp->p_fd = fdshare(userp); curcp = userp; } ki = userp->p_aioinfo; lj = aiocbe->lio; /* * Account for currently active jobs */ ki->kaio_active_count++; /* * Do the I/O function */ aiocbe->jobaioproc = aiop; aio_process(aiocbe); /* * decrement the active job count */ ki->kaio_active_count--; /* * increment the completion count for wakeup/signal comparisons */ aiocbe->jobflags |= AIOCBLIST_DONE; ki->kaio_queue_finished_count++; if (lj) { lj->lioj_queue_finished_count++; } if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags & KAIO_RUNDOWN) && (ki->kaio_active_count == 0))) { ki->kaio_flags &= ~KAIO_WAKEUP; wakeup(userp); } s = splbio(); if (lj && (lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL) { if ((lj->lioj_queue_finished_count == lj->lioj_queue_count) && (lj->lioj_buffer_finished_count == lj->lioj_buffer_count)) { psignal(userp, lj->lioj_signal.sigev_signo); lj->lioj_flags |= LIOJ_SIGNAL_POSTED; } } splx(s); aiocbe->jobstate = JOBST_JOBFINISHED; /* * If the I/O request should be automatically rundown, do the * needed cleanup. Otherwise, place the queue entry for * the just finished I/O request into the done queue for the * associated client. */ if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE) { aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE; TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); } else { TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist); TAILQ_INSERT_TAIL(&ki->kaio_jobdone, aiocbe, plist); } if (aiocbe->jobflags & AIOCBLIST_RUNDOWN) { wakeup(aiocbe); aiocbe->jobflags &= ~AIOCBLIST_RUNDOWN; } if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) { psignal(userp, cb->aio_sigevent.sigev_signo); } } /* * Disconnect from user address space */ if (curcp != mycp) { struct vmspace *tmpvm; /* * Get the user address space to disconnect from. */ tmpvm = mycp->p_vmspace; /* * Get original address space for daemon. */ mycp->p_vmspace = myvm; /* * Activate the daemon's address space. */ pmap_activate(mycp); #if defined(DIAGNOSTIC) if (tmpvm == myvm) printf("AIOD: vmspace problem -- %d\n", mycp->p_pid); #endif /* * remove our vmspace reference. */ vmspace_free(tmpvm); /* * disassociate from the user process's file descriptors. 
*/ if (mycp->p_fd) fdfree(mycp); mycp->p_fd = NULL; curcp = mycp; } /* * If we are the first to be put onto the free queue, wakeup * anyone waiting for a daemon. */ TAILQ_REMOVE(&aio_activeproc, aiop, list); if (TAILQ_EMPTY(&aio_freeproc)) wakeup(&aio_freeproc); TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list); aiop->aioprocflags |= AIOP_FREE; /* * If daemon is inactive for a long time, allow it to exit, thereby * freeing resources. */ if (((aiop->aioprocflags & AIOP_SCHED) == 0) && tsleep(mycp, PRIBIO, "aiordy", aiod_lifetime)) { if ((TAILQ_FIRST(&aio_jobs) == NULL) && (TAILQ_FIRST(&aiop->jobtorun) == NULL)) { if ((aiop->aioprocflags & AIOP_FREE) && (num_aio_procs > target_aio_procs)) { TAILQ_REMOVE(&aio_freeproc, aiop, list); zfree(aiop_zone, aiop); num_aio_procs--; #if defined(DIAGNOSTIC) if (mycp->p_vmspace->vm_refcnt <= 1) printf("AIOD: bad vm refcnt for exiting daemon: %d\n", mycp->p_vmspace->vm_refcnt); #endif exit1(mycp, 0); } } } } } /* * Create a new AIO daemon. This is mostly a kernel-thread fork routine. * The AIO daemon modifies its environment itself. */ static int aio_newproc() { int error; struct proc *p, *np; p = &proc0; error = fork1(p, RFPROC|RFMEM|RFNOWAIT, &np); if (error) return error; cpu_set_fork_handler(np, aio_daemon, curproc); /* * Wait until daemon is started, but continue on just in case (to * handle error conditions. */ error = tsleep(np, PZERO, "aiosta", aiod_timeout); num_aio_procs++; return error; } /* * Try the high-performance physio method for eligible VCHR devices. This * routine doesn't require the use of any additional threads, and have * overhead. */ int aio_qphysio(p, aiocbe) struct proc *p; struct aiocblist *aiocbe; { int error; struct aiocb *cb; struct file *fp; struct buf *bp; int bflags; struct vnode *vp; struct kaioinfo *ki; struct filedesc *fdp; struct aio_liojob *lj; int fd; int s; int cnt; dev_t dev; int rw; d_strategy_t *fstrategy; struct cdevsw *cdev; struct cdevsw *bdev; cb = &aiocbe->uaiocb; fdp = p->p_fd; fd = cb->aio_fildes; fp = fdp->fd_ofiles[fd]; if (fp->f_type != DTYPE_VNODE) { return -1; } vp = (struct vnode *)fp->f_data; if (vp->v_type != VCHR || ((cb->aio_nbytes & (DEV_BSIZE - 1)) != 0)) { return -1; } if ((cb->aio_nbytes > MAXPHYS) && (num_buf_aio >= max_buf_aio)) { return -1; } if ((vp->v_specinfo == NULL) || (vp->v_flag & VISTTY)) { return -1; } if (vp->v_rdev == NODEV) { return -1; } cdev = devsw(vp->v_rdev); if (cdev == NULL) { return -1; } if (cdev->d_bmaj == -1) { return -1; } bdev = cdev; ki = p->p_aioinfo; if (ki->kaio_buffer_count >= ki->kaio_ballowed_count) { return -1; } cnt = cb->aio_nbytes; if (cnt > MAXPHYS) { return -1; } dev = makebdev(bdev->d_bmaj, minor(vp->v_rdev)); /* * Physical I/O is charged directly to the process, so we don't have * to fake it. 
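 *
 * Illustrative sketch (not part of this change): the eligibility tests
 * earlier in aio_qphysio() reject transfers whose length is not a whole
 * number of DEV_BSIZE blocks.  Since DEV_BSIZE is a power of two,
 * "len & (DEV_BSIZE - 1)" isolates the remainder without a division;
 * a hypothetical stand-alone check:
 */
#include <stdio.h>

#define SKETCH_DEV_BSIZE 512		/* power-of-two block size */

static int
is_blk_aligned(size_t len)
{
	/* non-zero low bits mean len is not a whole number of blocks */
	return ((len & (SKETCH_DEV_BSIZE - 1)) == 0);
}

int
main(void)
{
	printf("%d %d\n", is_blk_aligned(4096), is_blk_aligned(1000));	/* 1 0 */
	return (0);
}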
*/ aiocbe->inputcharge = 0; aiocbe->outputcharge = 0; ki->kaio_buffer_count++; lj = aiocbe->lio; if (lj) { lj->lioj_buffer_count++; } /* create and build a buffer header for a transfer */ bp = (struct buf *)getpbuf(NULL); /* * get a copy of the kva from the physical buffer */ bp->b_caller1 = p; bp->b_dev = dev; error = bp->b_error = 0; if (cb->aio_lio_opcode == LIO_WRITE) { rw = 0; bflags = B_WRITE; } else { rw = 1; bflags = B_READ; } bp->b_bcount = cb->aio_nbytes; bp->b_bufsize = cb->aio_nbytes; bp->b_flags = B_PHYS | B_CALL | bflags; bp->b_iodone = aio_physwakeup; bp->b_saveaddr = bp->b_data; bp->b_data = (void *) cb->aio_buf; bp->b_blkno = btodb(cb->aio_offset); if (rw && !useracc(bp->b_data, bp->b_bufsize, B_WRITE)) { error = EFAULT; goto doerror; } if (!rw && !useracc(bp->b_data, bp->b_bufsize, B_READ)) { error = EFAULT; goto doerror; } /* bring buffer into kernel space */ vmapbuf(bp); s = splbio(); aiocbe->bp = bp; bp->b_spc = (void *)aiocbe; TAILQ_INSERT_TAIL(&aio_bufjobs, aiocbe, list); TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist); aiocbe->jobstate = JOBST_JOBQBUF; cb->_aiocb_private.status = cb->aio_nbytes; num_buf_aio++; fstrategy = bdev->d_strategy; bp->b_error = 0; splx(s); /* perform transfer */ (*fstrategy)(bp); s = splbio(); /* * If we had an error invoking the request, or an error in processing * the request before we have returned, we process it as an error * in transfer. Note that such an I/O error is not indicated immediately, * but is returned using the aio_error mechanism. In this case, aio_suspend * will return immediately. */ if (bp->b_error || (bp->b_flags & B_ERROR)) { struct aiocb *job = aiocbe->uuaiocb; aiocbe->uaiocb._aiocb_private.status = 0; suword(&job->_aiocb_private.status, 0); aiocbe->uaiocb._aiocb_private.error = bp->b_error; suword(&job->_aiocb_private.error, bp->b_error); ki->kaio_buffer_finished_count++; if (aiocbe->jobstate != JOBST_JOBBFINISHED) { aiocbe->jobstate = JOBST_JOBBFINISHED; aiocbe->jobflags |= AIOCBLIST_DONE; TAILQ_REMOVE(&aio_bufjobs, aiocbe, list); TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist); TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist); } } splx(s); return 0; doerror: ki->kaio_buffer_count--; if (lj) { lj->lioj_buffer_count--; } aiocbe->bp = NULL; relpbuf(bp, NULL); return error; } /* * This waits/tests physio completion. */ int aio_fphysio(p, iocb, flgwait) struct proc *p; struct aiocblist *iocb; int flgwait; { int s; struct buf *bp; int error; bp = iocb->bp; s = splbio(); if (flgwait == 0) { if ((bp->b_flags & B_DONE) == 0) { splx(s); return EINPROGRESS; } } while ((bp->b_flags & B_DONE) == 0) { if (tsleep((caddr_t)bp, PRIBIO, "physstr", aiod_timeout)) { if ((bp->b_flags & B_DONE) == 0) { splx(s); return EINPROGRESS; } else { break; } } } /* release mapping into kernel space */ vunmapbuf(bp); iocb->bp = 0; error = 0; /* * check for an error */ if (bp->b_flags & B_ERROR) { error = bp->b_error; } relpbuf(bp, NULL); return (error); } /* * Queue a new AIO request. Choosing either the threaded or direct physio * VCHR technique is done in this code. 
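 *
 * Illustrative sketch (not part of this change): _aio_aqueue() below pops
 * a control block off the aio_freejobs TAILQ when one is cached and only
 * allocates when the free list is empty, pushing entries back on the head
 * for reuse.  The same pattern works in a stand-alone program with the
 * <sys/queue.h> macros (all "sketch_*" names are hypothetical):
 */
#include <sys/queue.h>
#include <stdio.h>
#include <stdlib.h>

struct sketch_job {
	int				id;
	TAILQ_ENTRY(sketch_job)		list;	/* linkage, like aiocblist */
};

static TAILQ_HEAD(, sketch_job) free_jobs = TAILQ_HEAD_INITIALIZER(free_jobs);

static struct sketch_job *
job_get(void)
{
	struct sketch_job *j;

	if ((j = TAILQ_FIRST(&free_jobs)) != NULL)
		TAILQ_REMOVE(&free_jobs, j, list);	/* reuse a cached entry */
	else
		j = malloc(sizeof(*j));			/* otherwise allocate */
	return (j);
}

static void
job_put(struct sketch_job *j)
{
	TAILQ_INSERT_HEAD(&free_jobs, j, list);		/* back onto the free list */
}

int
main(void)
{
	struct sketch_job *a = job_get(), *b;

	if (a == NULL)
		return (1);
	a->id = 1;
	job_put(a);
	b = job_get();				/* gets the recycled entry */
	printf("recycled id %d\n", b->id);	/* prints 1 */
	free(b);
	return (0);
}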
*/ static int _aio_aqueue(struct proc *p, struct aiocb *job, struct aio_liojob *lj, int type) { struct filedesc *fdp; struct file *fp; unsigned int fd; int error; int opcode; struct aiocblist *aiocbe; struct aioproclist *aiop; struct kaioinfo *ki; if ((aiocbe = TAILQ_FIRST(&aio_freejobs)) != NULL) { TAILQ_REMOVE(&aio_freejobs, aiocbe, list); } else { aiocbe = zalloc (aiocb_zone); } aiocbe->inputcharge = 0; aiocbe->outputcharge = 0; suword(&job->_aiocb_private.status, -1); suword(&job->_aiocb_private.error, 0); suword(&job->_aiocb_private.kernelinfo, -1); error = copyin((caddr_t)job, (caddr_t) &aiocbe->uaiocb, sizeof aiocbe->uaiocb); if (error) { suword(&job->_aiocb_private.error, error); TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); return error; } /* * Save userspace address of the job info */ aiocbe->uuaiocb = job; /* * Get the opcode */ if (type != LIO_NOP) { aiocbe->uaiocb.aio_lio_opcode = type; } opcode = aiocbe->uaiocb.aio_lio_opcode; /* * Get the fd info for process */ fdp = p->p_fd; /* * Range check file descriptor */ fd = aiocbe->uaiocb.aio_fildes; if (fd >= fdp->fd_nfiles) { TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); if (type == 0) { suword(&job->_aiocb_private.error, EBADF); } return EBADF; } fp = fdp->fd_ofiles[fd]; if ((fp == NULL) || ((opcode == LIO_WRITE) && ((fp->f_flag & FWRITE) == 0))) { TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); if (type == 0) { suword(&job->_aiocb_private.error, EBADF); } return EBADF; } if (aiocbe->uaiocb.aio_offset == -1LL) { TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); if (type == 0) { suword(&job->_aiocb_private.error, EINVAL); } return EINVAL; } error = suword(&job->_aiocb_private.kernelinfo, jobrefid); if (error) { TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); if (type == 0) { suword(&job->_aiocb_private.error, EINVAL); } return error; } aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jobrefid; if (jobrefid == LONG_MAX) jobrefid = 1; else jobrefid++; if (opcode == LIO_NOP) { TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); if (type == 0) { suword(&job->_aiocb_private.error, 0); suword(&job->_aiocb_private.status, 0); suword(&job->_aiocb_private.kernelinfo, 0); } return 0; } if ((opcode != LIO_READ) && (opcode != LIO_WRITE)) { TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); if (type == 0) { suword(&job->_aiocb_private.status, 0); suword(&job->_aiocb_private.error, EINVAL); } return EINVAL; } suword(&job->_aiocb_private.error, EINPROGRESS); aiocbe->uaiocb._aiocb_private.error = EINPROGRESS; aiocbe->userproc = p; aiocbe->jobflags = 0; aiocbe->lio = lj; ki = p->p_aioinfo; if ((error = aio_qphysio(p, aiocbe)) == 0) { return 0; } else if (error > 0) { suword(&job->_aiocb_private.status, 0); aiocbe->uaiocb._aiocb_private.error = error; suword(&job->_aiocb_private.error, error); return error; } /* * No buffer for daemon I/O */ aiocbe->bp = NULL; ki->kaio_queue_count++; if (lj) { lj->lioj_queue_count++; } TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist); TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list); aiocbe->jobstate = JOBST_JOBQGLOBAL; num_queue_count++; error = 0; /* * If we don't have a free AIO process, and we are below our * quota, then start one. Otherwise, depend on the subsequent * I/O completions to pick-up this job. If we don't sucessfully * create the new process (thread) due to resource issues, we * return an error for now (EAGAIN), which is likely not the * correct thing to do. 
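 *
 * Illustrative userland analogue (not part of this change): the
 * "retryproc" logic that follows hands the job to an idle daemon when one
 * exists and only creates a new one while below the configured maximum,
 * reporting EAGAIN only if creation itself fails.  A threaded sketch of
 * that policy (all names hypothetical):
 */
#include <errno.h>
#include <pthread.h>
#include <stdio.h>

#define SKETCH_MAX_WORKERS 4

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int idle_workers;	/* workers waiting for a job */
static int num_workers;		/* workers created so far */

static void *
worker(void *arg)
{
	(void)arg;		/* a real worker would loop on a job queue */
	return (NULL);
}

/* Returns 0 if a worker is available or was started, EAGAIN otherwise. */
static int
ensure_worker(void)
{
	pthread_t tid;
	int error = 0;

	pthread_mutex_lock(&lock);
	if (idle_workers > 0) {
		idle_workers--;			/* hand the job to an idle worker */
	} else if (num_workers < SKETCH_MAX_WORKERS) {
		if (pthread_create(&tid, NULL, worker, NULL) == 0) {
			pthread_detach(tid);
			num_workers++;
		} else {
			error = EAGAIN;		/* resource shortage, as in the kernel path */
		}
	}
	pthread_mutex_unlock(&lock);
	return (error);
}

int
main(void)
{
	printf("ensure_worker -> %d (workers now %d)\n", ensure_worker(), num_workers);
	return (0);
}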
*/ retryproc: if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) { TAILQ_REMOVE(&aio_freeproc, aiop, list); TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list); aiop->aioprocflags &= ~AIOP_FREE; wakeup(aiop->aioproc); } else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) && ((ki->kaio_active_count + num_aio_resv_start) < ki->kaio_maxactive_count)) { num_aio_resv_start++; if ((error = aio_newproc()) == 0) { num_aio_resv_start--; p->p_retval[0] = 0; goto retryproc; } num_aio_resv_start--; } return error; } /* * This routine queues an AIO request, checking for quotas. */ static int aio_aqueue(struct proc *p, struct aiocb *job, int type) { struct kaioinfo *ki; if (p->p_aioinfo == NULL) { aio_init_aioinfo(p); } if (num_queue_count >= max_queue_count) return EAGAIN; ki = p->p_aioinfo; if (ki->kaio_queue_count >= ki->kaio_qallowed_count) return EAGAIN; return _aio_aqueue(p, job, NULL, type); } /* * Support the aio_return system call, as a side-effect, kernel * resources are released. */ int aio_return(struct proc *p, struct aio_return_args *uap) { int s; int jobref; struct aiocblist *cb, *ncb; struct aiocb *ujob; struct kaioinfo *ki; ki = p->p_aioinfo; if (ki == NULL) { return EINVAL; } ujob = uap->aiocbp; jobref = fuword(&ujob->_aiocb_private.kernelinfo); if (jobref == -1 || jobref == 0) return EINVAL; for (cb = TAILQ_FIRST(&ki->kaio_jobdone); cb; cb = TAILQ_NEXT(cb, plist)) { if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == jobref) { if (ujob == cb->uuaiocb) { p->p_retval[0] = cb->uaiocb._aiocb_private.status; } else { p->p_retval[0] = EFAULT; } if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) { curproc->p_stats->p_ru.ru_oublock += cb->outputcharge; cb->outputcharge = 0; } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) { curproc->p_stats->p_ru.ru_inblock += cb->inputcharge; cb->inputcharge = 0; } aio_free_entry(cb); return 0; } } s = splbio(); for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = ncb) { ncb = TAILQ_NEXT(cb, plist); if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == jobref) { splx(s); if (ujob == cb->uuaiocb) { p->p_retval[0] = cb->uaiocb._aiocb_private.status; } else { p->p_retval[0] = EFAULT; } aio_free_entry(cb); return 0; } } splx(s); return (EINVAL); } /* * Allow a process to wakeup when any of the I/O requests are * completed. 
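 *
 * Illustrative userland sketch (not part of this change): how the
 * facility implemented in this file is driven from a program.  One
 * request is queued with aio_read(), aio_suspend() sleeps until it
 * completes or the timeout expires, and aio_return() fetches the byte
 * count and releases the kernel resources.  The file name is a made-up
 * example; some systems need -lrt or the AIO option enabled.
 */
#include <aio.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <unistd.h>

int
main(void)
{
	static char buf[512];
	struct aiocb cb;
	const struct aiocb *list[1];
	struct timespec ts = { 5, 0 };		/* wait at most five seconds */
	int fd = open("/etc/services", O_RDONLY);

	if (fd == -1)
		return (1);
	memset(&cb, 0, sizeof(cb));
	cb.aio_fildes = fd;
	cb.aio_buf = buf;
	cb.aio_nbytes = sizeof(buf);
	cb.aio_offset = 0;
	if (aio_read(&cb) != 0)			/* queue the request */
		return (1);
	list[0] = &cb;
	if (aio_suspend(list, 1, &ts) == 0)	/* sleep until it completes */
		printf("read %zd bytes asynchronously\n", aio_return(&cb));
	close(fd);
	return (0);
}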
*/ int aio_suspend(struct proc *p, struct aio_suspend_args *uap) { struct timeval atv; struct timespec ts; struct aiocb *const *cbptr, *cbp; struct kaioinfo *ki; struct aiocblist *cb; int i; int njoblist; int error, s, timo; int *ijoblist; struct aiocb **ujoblist; if (uap->nent >= AIO_LISTIO_MAX) return EINVAL; timo = 0; if (uap->timeout) { /* * Get timespec struct */ if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0) { return error; } if (ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000) return (EINVAL); TIMESPEC_TO_TIMEVAL(&atv, &ts); if (itimerfix(&atv)) return (EINVAL); timo = tvtohz(&atv); } ki = p->p_aioinfo; if (ki == NULL) return EAGAIN; njoblist = 0; ijoblist = zalloc(aiol_zone); ujoblist = zalloc(aiol_zone); cbptr = uap->aiocbp; for(i = 0; i < uap->nent; i++) { cbp = (struct aiocb *) (intptr_t) fuword((caddr_t) &cbptr[i]); if (cbp == 0) continue; ujoblist[njoblist] = cbp; ijoblist[njoblist] = fuword(&cbp->_aiocb_private.kernelinfo); njoblist++; } if (njoblist == 0) { zfree(aiol_zone, ijoblist); zfree(aiol_zone, ujoblist); return 0; } error = 0; while (1) { for (cb = TAILQ_FIRST(&ki->kaio_jobdone); cb; cb = TAILQ_NEXT(cb, plist)) { for(i = 0; i < njoblist; i++) { if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == ijoblist[i]) { if (ujoblist[i] != cb->uuaiocb) error = EINVAL; zfree(aiol_zone, ijoblist); zfree(aiol_zone, ujoblist); return error; } } } s = splbio(); for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = TAILQ_NEXT(cb, plist)) { for(i = 0; i < njoblist; i++) { if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == ijoblist[i]) { splx(s); if (ujoblist[i] != cb->uuaiocb) error = EINVAL; zfree(aiol_zone, ijoblist); zfree(aiol_zone, ujoblist); return error; } } } ki->kaio_flags |= KAIO_WAKEUP; error = tsleep(p, PRIBIO|PCATCH, "aiospn", timo); splx(s); if (error == EINTR) { zfree(aiol_zone, ijoblist); zfree(aiol_zone, ujoblist); return EINTR; } else if (error == EWOULDBLOCK) { zfree(aiol_zone, ijoblist); zfree(aiol_zone, ujoblist); return EAGAIN; } } /* NOTREACHED */ return EINVAL; } /* * aio_cancel at the kernel level is a NOOP right now. It * might be possible to support it partially in user mode, or * in kernel mode later on. */ int aio_cancel(struct proc *p, struct aio_cancel_args *uap) { return ENOSYS; } /* * aio_error is implemented in the kernel level for compatibility * purposes only. For a user mode async implementation, it would be * best to do it in a userland subroutine. 
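 *
 * Illustrative userland sketch (not part of this change): the polling
 * counterpart to the aio_suspend() example above.  aio_error() keeps
 * returning EINPROGRESS until the request finishes, so a caller can
 * overlap other work instead of sleeping.  "cb" is assumed to be an
 * aiocb that was already queued with aio_read().
 */
#include <aio.h>
#include <errno.h>
#include <stdio.h>

void
wait_polling(struct aiocb *cb)
{
	int err;

	while ((err = aio_error(cb)) == EINPROGRESS)
		;				/* do other useful work here */
	if (err == 0)
		printf("done, %zd bytes\n", aio_return(cb));
	else
		printf("failed: %d\n", err);
}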
*/ int aio_error(struct proc *p, struct aio_error_args *uap) { int s; struct aiocblist *cb; struct kaioinfo *ki; int jobref; ki = p->p_aioinfo; if (ki == NULL) return EINVAL; jobref = fuword(&uap->aiocbp->_aiocb_private.kernelinfo); if ((jobref == -1) || (jobref == 0)) return EINVAL; for (cb = TAILQ_FIRST(&ki->kaio_jobdone); cb; cb = TAILQ_NEXT(cb, plist)) { if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == jobref) { p->p_retval[0] = cb->uaiocb._aiocb_private.error; return 0; } } for (cb = TAILQ_FIRST(&ki->kaio_jobqueue); cb; cb = TAILQ_NEXT(cb, plist)) { if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == jobref) { p->p_retval[0] = EINPROGRESS; return 0; } } s = splbio(); for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = TAILQ_NEXT(cb, plist)) { if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == jobref) { p->p_retval[0] = cb->uaiocb._aiocb_private.error; splx(s); return 0; } } for (cb = TAILQ_FIRST(&ki->kaio_bufqueue); cb; cb = TAILQ_NEXT(cb, plist)) { if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == jobref) { p->p_retval[0] = EINPROGRESS; splx(s); return 0; } } splx(s); /* * Hack for lio */ /* status = fuword(&uap->aiocbp->_aiocb_private.status); if (status == -1) { return fuword(&uap->aiocbp->_aiocb_private.error); } */ return EINVAL; } int aio_read(struct proc *p, struct aio_read_args *uap) { struct filedesc *fdp; struct file *fp; struct uio auio; struct iovec aiov; unsigned int fd; int cnt; struct aiocb iocb; int error, pmodes; pmodes = fuword(&uap->aiocbp->_aiocb_private.privatemodes); if ((pmodes & AIO_PMODE_SYNC) == 0) { return aio_aqueue(p, (struct aiocb *) uap->aiocbp, LIO_READ); } /* * Get control block */ if ((error = copyin((caddr_t) uap->aiocbp, (caddr_t) &iocb, sizeof iocb)) != 0) return error; /* * Get the fd info for process */ fdp = p->p_fd; /* * Range check file descriptor */ fd = iocb.aio_fildes; if (fd >= fdp->fd_nfiles) return EBADF; fp = fdp->fd_ofiles[fd]; if ((fp == NULL) || ((fp->f_flag & FREAD) == 0)) return EBADF; if (iocb.aio_offset == -1LL) return EINVAL; auio.uio_resid = iocb.aio_nbytes; if (auio.uio_resid < 0) return (EINVAL); /* * Process sync simply -- queue async request. */ if ((iocb._aiocb_private.privatemodes & AIO_PMODE_SYNC) == 0) { return aio_aqueue(p, (struct aiocb *) uap->aiocbp, LIO_READ); } aiov.iov_base = (void *) iocb.aio_buf; aiov.iov_len = iocb.aio_nbytes; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = iocb.aio_offset; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_USERSPACE; auio.uio_procp = p; cnt = iocb.aio_nbytes; error = (*fp->f_ops->fo_read)(fp, &auio, fp->f_cred, FOF_OFFSET); if (error && (auio.uio_resid != cnt) && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; cnt -= auio.uio_resid; p->p_retval[0] = cnt; return error; } int aio_write(struct proc *p, struct aio_write_args *uap) { struct filedesc *fdp; struct file *fp; struct uio auio; struct iovec aiov; unsigned int fd; int cnt; struct aiocb iocb; int error; int pmodes; /* * Process sync simply -- queue async request. 
*/ pmodes = fuword(&uap->aiocbp->_aiocb_private.privatemodes); if ((pmodes & AIO_PMODE_SYNC) == 0) { return aio_aqueue(p, (struct aiocb *) uap->aiocbp, LIO_WRITE); } if ((error = copyin((caddr_t) uap->aiocbp, (caddr_t) &iocb, sizeof iocb)) != 0) return error; /* * Get the fd info for process */ fdp = p->p_fd; /* * Range check file descriptor */ fd = iocb.aio_fildes; if (fd >= fdp->fd_nfiles) return EBADF; fp = fdp->fd_ofiles[fd]; if ((fp == NULL) || ((fp->f_flag & FWRITE) == 0)) return EBADF; if (iocb.aio_offset == -1LL) return EINVAL; aiov.iov_base = (void *) iocb.aio_buf; aiov.iov_len = iocb.aio_nbytes; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = iocb.aio_offset; auio.uio_resid = iocb.aio_nbytes; if (auio.uio_resid < 0) return (EINVAL); auio.uio_rw = UIO_WRITE; auio.uio_segflg = UIO_USERSPACE; auio.uio_procp = p; cnt = iocb.aio_nbytes; error = (*fp->f_ops->fo_write)(fp, &auio, fp->f_cred, FOF_OFFSET); if (error) { if (auio.uio_resid != cnt) { if (error == ERESTART || error == EINTR || error == EWOULDBLOCK) error = 0; if (error == EPIPE) psignal(p, SIGPIPE); } } cnt -= auio.uio_resid; p->p_retval[0] = cnt; return error; } int lio_listio(struct proc *p, struct lio_listio_args *uap) { int nent, nentqueued; struct aiocb *iocb, * const *cbptr; struct aiocblist *cb; struct kaioinfo *ki; struct aio_liojob *lj; int error, runningcode; int nerror; int i; int s; if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) { return EINVAL; } nent = uap->nent; if (nent > AIO_LISTIO_MAX) { return EINVAL; } if (p->p_aioinfo == NULL) { aio_init_aioinfo(p); } if ((nent + num_queue_count) > max_queue_count) { return EAGAIN; } ki = p->p_aioinfo; if ((nent + ki->kaio_queue_count) > ki->kaio_qallowed_count) { return EAGAIN; } lj = zalloc(aiolio_zone); if (!lj) { return EAGAIN; } lj->lioj_flags = 0; lj->lioj_buffer_count = 0; lj->lioj_buffer_finished_count = 0; lj->lioj_queue_count = 0; lj->lioj_queue_finished_count = 0; lj->lioj_ki = ki; TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list); /* * Setup signal */ if (uap->sig && (uap->mode == LIO_NOWAIT)) { error = copyin(uap->sig, &lj->lioj_signal, sizeof lj->lioj_signal); if (error) return error; lj->lioj_flags |= LIOJ_SIGNAL; lj->lioj_flags &= ~LIOJ_SIGNAL_POSTED; } else { lj->lioj_flags &= ~LIOJ_SIGNAL; } /* * get pointers to the list of I/O requests */ nerror = 0; nentqueued = 0; cbptr = uap->acb_list; for(i = 0; i < uap->nent; i++) { iocb = (struct aiocb *) (intptr_t) fuword((caddr_t) &cbptr[i]); if (((intptr_t) iocb != -1) && ((intptr_t) iocb != NULL)) { error = _aio_aqueue(p, iocb, lj, 0); if (error == 0) { nentqueued++; } else { nerror++; } } } /* * If we haven't queued any, then just return error */ if (nentqueued == 0) { return 0; } /* * Calculate the appropriate error return */ runningcode = 0; if (nerror) runningcode = EIO; if (uap->mode == LIO_WAIT) { while (1) { int found; found = 0; for(i = 0; i < uap->nent; i++) { int jobref, command; /* * Fetch address of the control buf pointer in user space */ iocb = (struct aiocb *) (intptr_t) fuword((caddr_t) &cbptr[i]); if (((intptr_t) iocb == -1) || ((intptr_t) iocb == 0)) continue; /* * Fetch the associated command from user space */ command = fuword(&iocb->aio_lio_opcode); if (command == LIO_NOP) { found++; continue; } jobref = fuword(&iocb->_aiocb_private.kernelinfo); for (cb = TAILQ_FIRST(&ki->kaio_jobdone); cb; cb = TAILQ_NEXT(cb, plist)) { if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == jobref) { if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) { 
curproc->p_stats->p_ru.ru_oublock += cb->outputcharge; cb->outputcharge = 0; } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) { curproc->p_stats->p_ru.ru_inblock += cb->inputcharge; cb->inputcharge = 0; } found++; break; } } s = splbio(); for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = TAILQ_NEXT(cb, plist)) { if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == jobref) { found++; break; } } splx(s); } /* * If all I/Os have been disposed of, then we can return */ if (found == nentqueued) { return runningcode; } ki->kaio_flags |= KAIO_WAKEUP; error = tsleep(p, PRIBIO|PCATCH, "aiospn", 0); if (error == EINTR) { return EINTR; } else if (error == EWOULDBLOCK) { return EAGAIN; } } } return runningcode; } /* * This is a wierd hack so that we can post a signal. It is safe * to do so from a timeout routine, but *not* from an interrupt routine. */ static void process_signal(void *ljarg) { struct aio_liojob *lj = ljarg; if (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL) { if (lj->lioj_queue_count == lj->lioj_queue_finished_count) { psignal(lj->lioj_ki->kaio_p, lj->lioj_signal.sigev_signo); lj->lioj_flags |= LIOJ_SIGNAL_POSTED; } } } /* * Interrupt handler for physio, performs the necessary process wakeups, * and signals. */ static void aio_physwakeup(bp) struct buf *bp; { struct aiocblist *aiocbe; struct proc *p; struct kaioinfo *ki; struct aio_liojob *lj; int s; s = splbio(); wakeup((caddr_t) bp); bp->b_flags &= ~B_CALL; bp->b_flags |= B_DONE; aiocbe = (struct aiocblist *)bp->b_spc; if (aiocbe) { p = bp->b_caller1; aiocbe->jobstate = JOBST_JOBBFINISHED; aiocbe->uaiocb._aiocb_private.status -= bp->b_resid; aiocbe->uaiocb._aiocb_private.error = 0; aiocbe->jobflags |= AIOCBLIST_DONE; if (bp->b_flags & B_ERROR) { aiocbe->uaiocb._aiocb_private.error = bp->b_error; } lj = aiocbe->lio; if (lj) { lj->lioj_buffer_finished_count++; /* * wakeup/signal if all of the interrupt jobs are done */ if (lj->lioj_buffer_finished_count == lj->lioj_buffer_count) { /* * post a signal if it is called for */ if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL) { lj->lioj_flags |= LIOJ_SIGNAL_POSTED; timeout(process_signal, lj, 0); } } } ki = p->p_aioinfo; if (ki) { ki->kaio_buffer_finished_count++; TAILQ_REMOVE(&aio_bufjobs, aiocbe, list); TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist); TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist); /* * and do the wakeup */ if (ki->kaio_flags & (KAIO_RUNDOWN|KAIO_WAKEUP)) { ki->kaio_flags &= ~KAIO_WAKEUP; wakeup(p); } } } splx(s); } Index: head/sys/kern/vfs_bio.c =================================================================== --- head/sys/kern/vfs_bio.c (revision 49534) +++ head/sys/kern/vfs_bio.c (revision 49535) @@ -1,3106 +1,3106 @@ /* * Copyright (c) 1994,1997 John S. Dyson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice immediately at the beginning of the file, without modification, * this list of conditions, and the following disclaimer. * 2. Absolutely no warranty of function or purpose is made by the author * John S. Dyson. * - * $Id: vfs_bio.c,v 1.223 1999/07/09 16:41:19 peter Exp $ + * $Id: vfs_bio.c,v 1.224 1999/07/26 06:25:16 alc Exp $ */ /* * this file contains a new buffer I/O scheme implementing a coherent * VM object and buffer cache scheme. 
Pains have been taken to make * sure that the performance degradation associated with schemes such * as this is not realized. * * Author: John S. Dyson * Significant help during the development and debugging phases * had been provided by David Greenman, also of the FreeBSD core team. * * see man buf(9) for more info. */ #define VMIO #include #include #include #include #include #include #include #include #include #include -#include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include static MALLOC_DEFINE(M_BIOBUF, "BIO buffer", "BIO buffer"); struct bio_ops bioops; /* I/O operation notification */ struct buf *buf; /* buffer header pool */ struct swqueue bswlist; static void vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to); static void vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to); static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m); static void vfs_clean_pages(struct buf * bp); static void vfs_setdirty(struct buf *bp); static void vfs_vmio_release(struct buf *bp); static int flushbufqueues(void); static int bd_request; static void buf_daemon __P((void)); /* * bogus page -- for I/O to/from partially complete buffers * this is a temporary solution to the problem, but it is not * really that bad. it would be better to split the buffer * for input in the case of buffers partially already in memory, * but the code is intricate enough already. */ vm_page_t bogus_page; int runningbufspace; int vmiodirenable = FALSE; static vm_offset_t bogus_offset; static int bufspace, maxbufspace, vmiospace, bufmallocspace, maxbufmallocspace, hibufspace; #if 0 static int maxvmiobufspace; #endif static int maxbdrun; static int needsbuffer; static int numdirtybuffers, lodirtybuffers, hidirtybuffers; static int numfreebuffers, lofreebuffers, hifreebuffers; static int getnewbufcalls; static int getnewbufrestarts; static int kvafreespace; SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD, &numdirtybuffers, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW, &lodirtybuffers, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW, &hidirtybuffers, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, &lofreebuffers, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RW, &maxbufspace, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD, &hibufspace, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, maxbdrun, CTLFLAG_RW, &maxbdrun, 0, ""); #if 0 SYSCTL_INT(_vfs, OID_AUTO, maxvmiobufspace, CTLFLAG_RW, &maxvmiobufspace, 0, ""); #endif SYSCTL_INT(_vfs, OID_AUTO, vmiospace, CTLFLAG_RD, &vmiospace, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, kvafreespace, CTLFLAG_RD, &kvafreespace, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0, ""); static int bufhashmask; static LIST_HEAD(bufhashhdr, buf) 
*bufhashtbl, invalhash; struct bqueues bufqueues[BUFFER_QUEUES] = { { 0 } }; char *buf_wmesg = BUF_WMESG; extern int vm_swap_size; #define BUF_MAXUSE 24 #define VFS_BIO_NEED_ANY 0x01 /* any freeable buffer */ #define VFS_BIO_NEED_DIRTYFLUSH 0x02 /* waiting for dirty buffer flush */ #define VFS_BIO_NEED_FREE 0x04 /* wait for free bufs, hi hysteresis */ #define VFS_BIO_NEED_BUFSPACE 0x08 /* wait for buf space, lo hysteresis */ #define VFS_BIO_NEED_KVASPACE 0x10 /* wait for buffer_map space, emerg */ /* * Buffer hash table code. Note that the logical block scans linearly, which * gives us some L1 cache locality. */ static __inline struct bufhashhdr * bufhash(struct vnode *vnp, daddr_t bn) { return(&bufhashtbl[(((uintptr_t)(vnp) >> 7) + (int)bn) & bufhashmask]); } /* * kvaspacewakeup: * * Called when kva space is potential available for recovery or when * kva space is recovered in the buffer_map. This function wakes up * anyone waiting for buffer_map kva space. Even though the buffer_map * is larger then maxbufspace, this situation will typically occur * when the buffer_map gets fragmented. */ static __inline void kvaspacewakeup(void) { /* * If someone is waiting for KVA space, wake them up. Even * though we haven't freed the kva space yet, the waiting * process will be able to now. */ if (needsbuffer & VFS_BIO_NEED_KVASPACE) { needsbuffer &= ~VFS_BIO_NEED_KVASPACE; wakeup(&needsbuffer); } } /* * numdirtywakeup: * * If someone is blocked due to there being too many dirty buffers, * and numdirtybuffers is now reasonable, wake them up. */ static __inline void numdirtywakeup(void) { if (numdirtybuffers < hidirtybuffers) { if (needsbuffer & VFS_BIO_NEED_DIRTYFLUSH) { needsbuffer &= ~VFS_BIO_NEED_DIRTYFLUSH; wakeup(&needsbuffer); } } } /* * bufspacewakeup: * * Called when buffer space is potentially available for recovery or when * buffer space is recovered. getnewbuf() will block on this flag when * it is unable to free sufficient buffer space. Buffer space becomes * recoverable when bp's get placed back in the queues. */ static __inline void bufspacewakeup(void) { /* * If someone is waiting for BUF space, wake them up. Even * though we haven't freed the kva space yet, the waiting * process will be able to now. */ if (needsbuffer & VFS_BIO_NEED_BUFSPACE) { needsbuffer &= ~VFS_BIO_NEED_BUFSPACE; wakeup(&needsbuffer); } } /* * bufcountwakeup: * * Called when a buffer has been added to one of the free queues to * account for the buffer and to wakeup anyone waiting for free buffers. * This typically occurs when large amounts of metadata are being handled * by the buffer cache ( else buffer space runs out first, usually ). */ static __inline void bufcountwakeup(void) { ++numfreebuffers; if (needsbuffer) { needsbuffer &= ~VFS_BIO_NEED_ANY; if (numfreebuffers >= hifreebuffers) needsbuffer &= ~VFS_BIO_NEED_FREE; wakeup(&needsbuffer); } } /* * vfs_buf_test_cache: * * Called when a buffer is extended. This function clears the B_CACHE * bit if the newly extended portion of the buffer does not contain * valid data. */ static __inline__ void vfs_buf_test_cache(struct buf *bp, vm_ooffset_t foff, vm_offset_t off, vm_offset_t size, vm_page_t m) { if (bp->b_flags & B_CACHE) { int base = (foff + off) & PAGE_MASK; if (vm_page_is_valid(m, base, size) == 0) bp->b_flags &= ~B_CACHE; } } static __inline__ void bd_wakeup(int dirtybuflevel) { if (numdirtybuffers >= dirtybuflevel && bd_request == 0) { bd_request = 1; wakeup(&bd_request); } } /* * Initialize buffer headers and related structures. 
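 *
 * Hash sizing sketch (hypothetical nbuf, for illustration only): with
 * nbuf = 1024, bufhashinit() doubles bufhashmask from 8 until it reaches
 * nbuf / 4 = 256, reserves 256 list heads at vaddr, and then drops the
 * mask to 255.  bufhash() above then indexes the table with
 * ((vnp >> 7) + blkno) & 255, so consecutive logical blocks of the same
 * vnode land in consecutive chains.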
*/ caddr_t bufhashinit(caddr_t vaddr) { /* first, make a null hash table */ for (bufhashmask = 8; bufhashmask < nbuf / 4; bufhashmask <<= 1) ; bufhashtbl = (void *)vaddr; vaddr = vaddr + sizeof(*bufhashtbl) * bufhashmask; --bufhashmask; return(vaddr); } void bufinit(void) { struct buf *bp; int i; TAILQ_INIT(&bswlist); LIST_INIT(&invalhash); simple_lock_init(&buftimelock); for (i = 0; i <= bufhashmask; i++) LIST_INIT(&bufhashtbl[i]); /* next, make a null set of free lists */ for (i = 0; i < BUFFER_QUEUES; i++) TAILQ_INIT(&bufqueues[i]); /* finally, initialize each buffer header and stick on empty q */ for (i = 0; i < nbuf; i++) { bp = &buf[i]; bzero(bp, sizeof *bp); bp->b_flags = B_INVAL; /* we're just an empty header */ bp->b_dev = NODEV; bp->b_rcred = NOCRED; bp->b_wcred = NOCRED; bp->b_qindex = QUEUE_EMPTY; bp->b_xflags = 0; LIST_INIT(&bp->b_dep); BUF_LOCKINIT(bp); TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist); LIST_INSERT_HEAD(&invalhash, bp, b_hash); } /* * maxbufspace is currently calculated to support all filesystem * blocks to be 8K. If you happen to use a 16K filesystem, the size * of the buffer cache is still the same as it would be for 8K * filesystems. This keeps the size of the buffer cache "in check" * for big block filesystems. * * maxbufspace is calculated as around 50% of the KVA available in * the buffer_map ( DFLTSIZE vs BKVASIZE ), I presume to reduce the * effect of fragmentation. */ maxbufspace = (nbuf + 8) * DFLTBSIZE; if ((hibufspace = maxbufspace - MAXBSIZE * 5) <= MAXBSIZE) hibufspace = 3 * maxbufspace / 4; #if 0 /* * reserve 1/3 of the buffers for metadata (VDIR) which might not be VMIO'ed */ maxvmiobufspace = 2 * hibufspace / 3; #endif /* * Limit the amount of malloc memory since it is wired permanently into * the kernel space. Even though this is accounted for in the buffer * allocation, we don't want the malloced region to grow uncontrolled. * The malloc scheme improves memory utilization significantly on average * (small) directories. */ maxbufmallocspace = hibufspace / 20; /* * Reduce the chance of a deadlock occuring by limiting the number * of delayed-write dirty buffers we allow to stack up. */ lodirtybuffers = nbuf / 7 + 10; hidirtybuffers = nbuf / 4 + 20; numdirtybuffers = 0; /* * Try to keep the number of free buffers in the specified range, * and give the syncer access to an emergency reserve. */ lofreebuffers = nbuf / 18 + 5; hifreebuffers = 2 * lofreebuffers; numfreebuffers = nbuf; /* * Maximum number of async ops initiated per buf_daemon loop. This is * somewhat of a hack at the moment, we really need to limit ourselves * based on the number of bytes of I/O in-transit that were initiated * from buf_daemon. */ if ((maxbdrun = nswbuf / 4) < 4) maxbdrun = 4; kvafreespace = 0; bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE); bogus_page = vm_page_alloc(kernel_object, ((bogus_offset - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT), VM_ALLOC_NORMAL); } /* * Free the kva allocation for a buffer * Must be called only at splbio or higher, * as this is the only locking for buffer_map. */ static void bfreekva(struct buf * bp) { if (bp->b_kvasize) { vm_map_delete(buffer_map, (vm_offset_t) bp->b_kvabase, (vm_offset_t) bp->b_kvabase + bp->b_kvasize ); bp->b_kvasize = 0; kvaspacewakeup(); } } /* * bremfree: * * Remove the buffer from the appropriate free list. 
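 *
 *	Typical caller sketch (illustrative only): the buffer must already
 *	be locked before it is pulled off its queue, e.g.
 *
 *		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
 *			bremfree(bp);
 *			... use or write the buffer ...
 *		}
 *
 *	bremfree() takes its own splbio() and asserts that exactly one
 *	reference holds the buffer lock while it is still on a queue.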
*/ void bremfree(struct buf * bp) { int s = splbio(); int old_qindex = bp->b_qindex; if (bp->b_qindex != QUEUE_NONE) { if (bp->b_qindex == QUEUE_EMPTYKVA) { kvafreespace -= bp->b_kvasize; } KASSERT(BUF_REFCNT(bp) == 1, ("bremfree: bp %p not locked",bp)); TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist); bp->b_qindex = QUEUE_NONE; runningbufspace += bp->b_bufsize; } else { #if !defined(MAX_PERF) if (BUF_REFCNT(bp) <= 1) panic("bremfree: removing a buffer not on a queue"); #endif } /* * Fixup numfreebuffers count. If the buffer is invalid or not * delayed-write, and it was on the EMPTY, LRU, or AGE queues, * the buffer was free and we must decrement numfreebuffers. */ if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) { switch(old_qindex) { case QUEUE_DIRTY: case QUEUE_CLEAN: case QUEUE_EMPTY: case QUEUE_EMPTYKVA: --numfreebuffers; break; default: break; } } splx(s); } /* * Get a buffer with the specified data. Look in the cache first. We * must clear B_ERROR and B_INVAL prior to initiating I/O. If B_CACHE * is set, the buffer is valid and we do not have to do anything ( see * getblk() ). */ int bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred, struct buf ** bpp) { struct buf *bp; bp = getblk(vp, blkno, size, 0, 0); *bpp = bp; /* if not found in cache, do some I/O */ if ((bp->b_flags & B_CACHE) == 0) { if (curproc != NULL) curproc->p_stats->p_ru.ru_inblock++; KASSERT(!(bp->b_flags & B_ASYNC), ("bread: illegal async bp %p", bp)); bp->b_flags |= B_READ; bp->b_flags &= ~(B_ERROR | B_INVAL); if (bp->b_rcred == NOCRED) { if (cred != NOCRED) crhold(cred); bp->b_rcred = cred; } vfs_busy_pages(bp, 0); VOP_STRATEGY(vp, bp); return (biowait(bp)); } return (0); } /* * Operates like bread, but also starts asynchronous I/O on * read-ahead blocks. We must clear B_ERROR and B_INVAL prior * to initiating I/O . If B_CACHE is set, the buffer is valid * and we do not have to do anything. */ int breadn(struct vnode * vp, daddr_t blkno, int size, daddr_t * rablkno, int *rabsize, int cnt, struct ucred * cred, struct buf ** bpp) { struct buf *bp, *rabp; int i; int rv = 0, readwait = 0; *bpp = bp = getblk(vp, blkno, size, 0, 0); /* if not found in cache, do some I/O */ if ((bp->b_flags & B_CACHE) == 0) { if (curproc != NULL) curproc->p_stats->p_ru.ru_inblock++; bp->b_flags |= B_READ; bp->b_flags &= ~(B_ERROR | B_INVAL); if (bp->b_rcred == NOCRED) { if (cred != NOCRED) crhold(cred); bp->b_rcred = cred; } vfs_busy_pages(bp, 0); VOP_STRATEGY(vp, bp); ++readwait; } for (i = 0; i < cnt; i++, rablkno++, rabsize++) { if (inmem(vp, *rablkno)) continue; rabp = getblk(vp, *rablkno, *rabsize, 0, 0); if ((rabp->b_flags & B_CACHE) == 0) { if (curproc != NULL) curproc->p_stats->p_ru.ru_inblock++; rabp->b_flags |= B_READ | B_ASYNC; rabp->b_flags &= ~(B_ERROR | B_INVAL); if (rabp->b_rcred == NOCRED) { if (cred != NOCRED) crhold(cred); rabp->b_rcred = cred; } vfs_busy_pages(rabp, 0); BUF_KERNPROC(rabp); VOP_STRATEGY(vp, rabp); } else { brelse(rabp); } } if (readwait) { rv = biowait(bp); } return (rv); } /* * Write, release buffer on completion. (Done by iodone * if async). Do not bother writing anything if the buffer * is invalid. * * Note that we set B_CACHE here, indicating that buffer is * fully valid and thus cacheable. This is true even of NFS * now so we set it generally. This could be set either here * or in biodone() since the I/O is synchronous. We put it * here. 
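 *
 *	Typical synchronous update sketch (illustrative only, hypothetical
 *	block size):
 *
 *		error = bread(vp, blkno, 8192, NOCRED, &bp);
 *		if (error == 0) {
 *			... modify bp->b_data ...
 *			error = bwrite(bp);	... waits, then releases bp ...
 *		} else
 *			brelse(bp);
 *
 *	For an asynchronous write the caller sets B_ASYNC instead (see
 *	bawrite() below) and bwrite() returns without waiting.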
*/ int bwrite(struct buf * bp) { int oldflags, s; struct vnode *vp; struct mount *mp; if (bp->b_flags & B_INVAL) { brelse(bp); return (0); } oldflags = bp->b_flags; #if !defined(MAX_PERF) if (BUF_REFCNT(bp) == 0) panic("bwrite: buffer is not busy???"); #endif s = splbio(); bundirty(bp); bp->b_flags &= ~(B_READ | B_DONE | B_ERROR); bp->b_flags |= B_WRITEINPROG | B_CACHE; bp->b_vp->v_numoutput++; vfs_busy_pages(bp, 1); if (curproc != NULL) curproc->p_stats->p_ru.ru_oublock++; splx(s); if (oldflags & B_ASYNC) BUF_KERNPROC(bp); VOP_STRATEGY(bp->b_vp, bp); /* * Collect statistics on synchronous and asynchronous writes. * Writes to block devices are charged to their associated * filesystem (if any). */ if ((vp = bp->b_vp) != NULL) { if (vp->v_type == VBLK) mp = vp->v_specmountpoint; else mp = vp->v_mount; if (mp != NULL) { if ((oldflags & B_ASYNC) == 0) mp->mnt_stat.f_syncwrites++; else mp->mnt_stat.f_asyncwrites++; } } if ((oldflags & B_ASYNC) == 0) { int rtval = biowait(bp); brelse(bp); return (rtval); } return (0); } /* * Delayed write. (Buffer is marked dirty). Do not bother writing * anything if the buffer is marked invalid. * * Note that since the buffer must be completely valid, we can safely * set B_CACHE. In fact, we have to set B_CACHE here rather then in * biodone() in order to prevent getblk from writing the buffer * out synchronously. */ void bdwrite(struct buf * bp) { #if 0 struct vnode *vp; #endif #if !defined(MAX_PERF) if (BUF_REFCNT(bp) == 0) panic("bdwrite: buffer is not busy"); #endif if (bp->b_flags & B_INVAL) { brelse(bp); return; } bdirty(bp); /* * Set B_CACHE, indicating that the buffer is fully valid. This is * true even of NFS now. */ bp->b_flags |= B_CACHE; /* * This bmap keeps the system from needing to do the bmap later, * perhaps when the system is attempting to do a sync. Since it * is likely that the indirect block -- or whatever other datastructure * that the filesystem needs is still in memory now, it is a good * thing to do this. Note also, that if the pageout daemon is * requesting a sync -- there might not be enough memory to do * the bmap then... So, this is important to do. */ if (bp->b_lblkno == bp->b_blkno) { VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); } /* * Set the *dirty* buffer range based upon the VM system dirty pages. */ vfs_setdirty(bp); /* * We need to do this here to satisfy the vnode_pager and the * pageout daemon, so that it thinks that the pages have been * "cleaned". Note that since the pages are in a delayed write * buffer -- the VFS layer "will" see that the pages get written * out on the next sync, or perhaps the cluster will be completed. */ vfs_clean_pages(bp); bqrelse(bp); /* * Wakeup the buffer flushing daemon if we have saturated the * buffer cache. */ bd_wakeup(hidirtybuffers); /* * note: we cannot initiate I/O from a bdwrite even if we wanted to, * due to the softdep code. */ #if 0 /* * XXX The soft dependency code is not prepared to * have I/O done when a bdwrite is requested. For * now we just let the write be delayed if it is * requested by the soft dependency code. */ if ((vp = bp->b_vp) && ((vp->v_type == VBLK && vp->v_specmountpoint && (vp->v_specmountpoint->mnt_flag & MNT_SOFTDEP)) || (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SOFTDEP)))) return; #endif } /* * bdirty: * * Turn buffer into delayed write request. We must clear B_READ and * B_RELBUF, and we must set B_DELWRI. We reassign the buffer to * itself to properly update it in the dirty/clean lists. 
We mark it * B_DONE to ensure that any asynchronization of the buffer properly * clears B_DONE ( else a panic will occur later ). * * bdirty() is kinda like bdwrite() - we have to clear B_INVAL which * might have been set pre-getblk(). Unlike bwrite/bdwrite, bdirty() * should only be called if the buffer is known-good. * * Since the buffer is not on a queue, we do not update the numfreebuffers * count. * * Must be called at splbio(). * The buffer must be on QUEUE_NONE. */ void bdirty(bp) struct buf *bp; { KASSERT(bp->b_qindex == QUEUE_NONE, ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex)); bp->b_flags &= ~(B_READ|B_RELBUF); if ((bp->b_flags & B_DELWRI) == 0) { bp->b_flags |= B_DONE | B_DELWRI; reassignbuf(bp, bp->b_vp); ++numdirtybuffers; bd_wakeup(hidirtybuffers); } } /* * bundirty: * * Clear B_DELWRI for buffer. * * Since the buffer is not on a queue, we do not update the numfreebuffers * count. * * Must be called at splbio(). * The buffer must be on QUEUE_NONE. */ void bundirty(bp) struct buf *bp; { KASSERT(bp->b_qindex == QUEUE_NONE, ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex)); if (bp->b_flags & B_DELWRI) { bp->b_flags &= ~B_DELWRI; reassignbuf(bp, bp->b_vp); --numdirtybuffers; numdirtywakeup(); } } /* * bawrite: * * Asynchronous write. Start output on a buffer, but do not wait for * it to complete. The buffer is released when the output completes. * * bwrite() ( or the VOP routine anyway ) is responsible for handling * B_INVAL buffers. Not us. */ void bawrite(struct buf * bp) { bp->b_flags |= B_ASYNC; (void) VOP_BWRITE(bp->b_vp, bp); } /* * bowrite: * * Ordered write. Start output on a buffer, and flag it so that the * device will write it in the order it was queued. The buffer is * released when the output completes. bwrite() ( or the VOP routine * anyway ) is responsible for handling B_INVAL buffers. */ int bowrite(struct buf * bp) { bp->b_flags |= B_ORDERED | B_ASYNC; return (VOP_BWRITE(bp->b_vp, bp)); } /* * bwillwrite: * * Called prior to the locking of any vnodes when we are expecting to * write. We do not want to starve the buffer cache with too many * dirty buffers so we block here. By blocking prior to the locking * of any vnodes we attempt to avoid the situation where a locked vnode * prevents the various system daemons from flushing related buffers. */ void bwillwrite(void) { int twenty = (hidirtybuffers - lodirtybuffers) / 5; if (numdirtybuffers > hidirtybuffers + twenty) { int s; s = splbio(); while (numdirtybuffers > hidirtybuffers) { bd_wakeup(hidirtybuffers); needsbuffer |= VFS_BIO_NEED_DIRTYFLUSH; tsleep(&needsbuffer, (PRIBIO + 4), "flswai", 0); } splx(s); } } /* * brelse: * * Release a busy buffer and, if requested, free its resources. The * buffer will be stashed in the appropriate bufqueue[] allowing it * to be accessed later as a cache entity or reused for other purposes. */ void brelse(struct buf * bp) { int s; KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); #if 0 if (bp->b_flags & B_CLUSTER) { relpbuf(bp, NULL); return; } #endif s = splbio(); if (bp->b_flags & B_LOCKED) bp->b_flags &= ~B_ERROR; if ((bp->b_flags & (B_READ | B_ERROR)) == B_ERROR) { /* * Failed write, redirty. Must clear B_ERROR to prevent * pages from being scrapped. Note: B_INVAL is ignored * here but will presumably be dealt with later. 
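 * A failed read, by contrast, has B_READ set and therefore falls
 * through to the B_ERROR/B_INVAL case below, where the buffer is
 * invalidated rather than redirtied.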
*/ bp->b_flags &= ~B_ERROR; bdirty(bp); } else if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_FREEBUF)) || (bp->b_bufsize <= 0)) { /* * Either a failed I/O or we were asked to free or not * cache the buffer. */ bp->b_flags |= B_INVAL; if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate) (*bioops.io_deallocate)(bp); if (bp->b_flags & B_DELWRI) { --numdirtybuffers; numdirtywakeup(); } bp->b_flags &= ~(B_DELWRI | B_CACHE | B_FREEBUF); if ((bp->b_flags & B_VMIO) == 0) { if (bp->b_bufsize) allocbuf(bp, 0); if (bp->b_vp) brelvp(bp); } } /* * We must clear B_RELBUF if B_DELWRI is set. If vfs_vmio_release() * is called with B_DELWRI set, the underlying pages may wind up * getting freed causing a previous write (bdwrite()) to get 'lost' * because pages associated with a B_DELWRI bp are marked clean. * * We still allow the B_INVAL case to call vfs_vmio_release(), even * if B_DELWRI is set. */ if (bp->b_flags & B_DELWRI) bp->b_flags &= ~B_RELBUF; /* * VMIO buffer rundown. It is not very necessary to keep a VMIO buffer * constituted, not even NFS buffers now. Two flags effect this. If * B_INVAL, the struct buf is invalidated but the VM object is kept * around ( i.e. so it is trivial to reconstitute the buffer later ). * * If B_ERROR or B_NOCACHE is set, pages in the VM object will be * invalidated. B_ERROR cannot be set for a failed write unless the * buffer is also B_INVAL because it hits the re-dirtying code above. * * Normally we can do this whether a buffer is B_DELWRI or not. If * the buffer is an NFS buffer, it is tracking piecemeal writes or * the commit state and we cannot afford to lose the buffer. */ if ((bp->b_flags & B_VMIO) && !(bp->b_vp->v_tag == VT_NFS && bp->b_vp->v_type != VBLK && (bp->b_flags & B_DELWRI)) ) { int i, j, resid; vm_page_t m; off_t foff; vm_pindex_t poff; vm_object_t obj; struct vnode *vp; vp = bp->b_vp; /* * Get the base offset and length of the buffer. Note that * for block sizes that are less then PAGE_SIZE, the b_data * base of the buffer does not represent exactly b_offset and * neither b_offset nor b_size are necessarily page aligned. * Instead, the starting position of b_offset is: * * b_data + (b_offset & PAGE_MASK) * * block sizes less then DEV_BSIZE (usually 512) are not * supported due to the page granularity bits (m->valid, * m->dirty, etc...). * * See man buf(9) for more information */ resid = bp->b_bufsize; foff = bp->b_offset; for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; vm_page_flag_clear(m, PG_ZERO); if (m == bogus_page) { obj = (vm_object_t) vp->v_object; poff = OFF_TO_IDX(bp->b_offset); for (j = i; j < bp->b_npages; j++) { m = bp->b_pages[j]; if (m == bogus_page) { m = vm_page_lookup(obj, poff + j); #if !defined(MAX_PERF) if (!m) { panic("brelse: page missing\n"); } #endif bp->b_pages[j] = m; } } if ((bp->b_flags & B_INVAL) == 0) { pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } } if (bp->b_flags & (B_NOCACHE|B_ERROR)) { int poffset = foff & PAGE_MASK; int presid = resid > (PAGE_SIZE - poffset) ? 
(PAGE_SIZE - poffset) : resid; KASSERT(presid >= 0, ("brelse: extra page")); vm_page_set_invalid(m, poffset, presid); } resid -= PAGE_SIZE - (foff & PAGE_MASK); foff = (foff + PAGE_SIZE) & ~PAGE_MASK; } if (bp->b_flags & (B_INVAL | B_RELBUF)) vfs_vmio_release(bp); } else if (bp->b_flags & B_VMIO) { if (bp->b_flags & (B_INVAL | B_RELBUF)) vfs_vmio_release(bp); } #if !defined(MAX_PERF) if (bp->b_qindex != QUEUE_NONE) panic("brelse: free buffer onto another queue???"); #endif if (BUF_REFCNT(bp) > 1) { /* Temporary panic to verify exclusive locking */ /* This panic goes away when we allow shared refs */ panic("brelse: multiple refs"); /* do not release to free list */ BUF_UNLOCK(bp); splx(s); return; } /* enqueue */ /* buffers with no memory */ if (bp->b_bufsize == 0) { bp->b_flags |= B_INVAL; if (bp->b_kvasize) bp->b_qindex = QUEUE_EMPTYKVA; else bp->b_qindex = QUEUE_EMPTY; TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist); LIST_REMOVE(bp, b_hash); LIST_INSERT_HEAD(&invalhash, bp, b_hash); bp->b_dev = NODEV; kvafreespace += bp->b_kvasize; if (bp->b_kvasize) kvaspacewakeup(); /* buffers with junk contents */ } else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) { bp->b_flags |= B_INVAL; bp->b_qindex = QUEUE_CLEAN; TAILQ_INSERT_HEAD(&bufqueues[QUEUE_CLEAN], bp, b_freelist); LIST_REMOVE(bp, b_hash); LIST_INSERT_HEAD(&invalhash, bp, b_hash); bp->b_dev = NODEV; /* buffers that are locked */ } else if (bp->b_flags & B_LOCKED) { bp->b_qindex = QUEUE_LOCKED; TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist); /* remaining buffers */ } else { switch(bp->b_flags & (B_DELWRI|B_AGE)) { case B_DELWRI | B_AGE: bp->b_qindex = QUEUE_DIRTY; TAILQ_INSERT_HEAD(&bufqueues[QUEUE_DIRTY], bp, b_freelist); break; case B_DELWRI: bp->b_qindex = QUEUE_DIRTY; TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY], bp, b_freelist); break; case B_AGE: bp->b_qindex = QUEUE_CLEAN; TAILQ_INSERT_HEAD(&bufqueues[QUEUE_CLEAN], bp, b_freelist); break; default: bp->b_qindex = QUEUE_CLEAN; TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp, b_freelist); break; } } /* * If B_INVAL, clear B_DELWRI. We've already placed the buffer * on the correct queue. */ if ((bp->b_flags & (B_INVAL|B_DELWRI)) == (B_INVAL|B_DELWRI)) { bp->b_flags &= ~B_DELWRI; --numdirtybuffers; numdirtywakeup(); } runningbufspace -= bp->b_bufsize; /* * Fixup numfreebuffers count. The bp is on an appropriate queue * unless locked. We then bump numfreebuffers if it is not B_DELWRI. * We've already handled the B_INVAL case ( B_DELWRI will be clear * if B_INVAL is set ). */ if ((bp->b_flags & B_LOCKED) == 0 && !(bp->b_flags & B_DELWRI)) bufcountwakeup(); /* * Something we can maybe free. */ if (bp->b_bufsize) bufspacewakeup(); /* unlock */ BUF_UNLOCK(bp); bp->b_flags &= ~(B_ORDERED | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); splx(s); } /* * Release a buffer back to the appropriate queue but do not try to free * it. * * bqrelse() is used by bdwrite() to requeue a delayed write, and used by * biodone() to requeue an async I/O on completion. It is also used when * known good buffers need to be requeued but we think we may need the data * again soon. 
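 *
 * Informal rule of thumb: brelse() is for error paths or buffers whose
 * contents may be thrown away (B_INVAL, B_RELBUF, B_NOCACHE), while
 * bqrelse() simply puts a known-good buffer back on QUEUE_DIRTY or
 * QUEUE_CLEAN; bqrelse() never invalidates the buffer or tears down its
 * VM backing.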
*/ void bqrelse(struct buf * bp) { int s; s = splbio(); KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); #if !defined(MAX_PERF) if (bp->b_qindex != QUEUE_NONE) panic("bqrelse: free buffer onto another queue???"); #endif if (BUF_REFCNT(bp) > 1) { /* do not release to free list */ panic("bqrelse: multiple refs"); BUF_UNLOCK(bp); splx(s); return; } if (bp->b_flags & B_LOCKED) { bp->b_flags &= ~B_ERROR; bp->b_qindex = QUEUE_LOCKED; TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist); /* buffers with stale but valid contents */ } else if (bp->b_flags & B_DELWRI) { bp->b_qindex = QUEUE_DIRTY; TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY], bp, b_freelist); } else { bp->b_qindex = QUEUE_CLEAN; TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp, b_freelist); } runningbufspace -= bp->b_bufsize; if ((bp->b_flags & B_LOCKED) == 0 && ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI))) { bufcountwakeup(); } /* * Something we can maybe wakeup */ if (bp->b_bufsize && !(bp->b_flags & B_DELWRI)) bufspacewakeup(); /* unlock */ BUF_UNLOCK(bp); bp->b_flags &= ~(B_ORDERED | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); splx(s); } static void vfs_vmio_release(bp) struct buf *bp; { int i, s; vm_page_t m; s = splvm(); for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; bp->b_pages[i] = NULL; /* * In order to keep page LRU ordering consistent, put * everything on the inactive queue. */ vm_page_unwire(m, 0); /* * We don't mess with busy pages, it is * the responsibility of the process that * busied the pages to deal with them. */ if ((m->flags & PG_BUSY) || (m->busy != 0)) continue; if (m->wire_count == 0) { vm_page_flag_clear(m, PG_ZERO); /* * Might as well free the page if we can and it has * no valid data. */ if ((bp->b_flags & B_ASYNC) == 0 && !m->valid && m->hold_count == 0) { vm_page_busy(m); vm_page_protect(m, VM_PROT_NONE); vm_page_free(m); } } } bufspace -= bp->b_bufsize; vmiospace -= bp->b_bufsize; runningbufspace -= bp->b_bufsize; splx(s); pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages); if (bp->b_bufsize) bufspacewakeup(); bp->b_npages = 0; bp->b_bufsize = 0; bp->b_flags &= ~B_VMIO; if (bp->b_vp) brelvp(bp); } /* * Check to see if a block is currently memory resident. */ struct buf * gbincore(struct vnode * vp, daddr_t blkno) { struct buf *bp; struct bufhashhdr *bh; bh = bufhash(vp, blkno); bp = bh->lh_first; /* Search hash chain */ while (bp != NULL) { /* hit */ if (bp->b_vp == vp && bp->b_lblkno == blkno && (bp->b_flags & B_INVAL) == 0) { break; } bp = bp->b_hash.le_next; } return (bp); } /* * vfs_bio_awrite: * * Implement clustered async writes for clearing out B_DELWRI buffers. * This is much better then the old way of writing only one buffer at * a time. Note that we may not be presented with the buffers in the * correct order, so we search for the cluster in both directions. */ int vfs_bio_awrite(struct buf * bp) { int i; int j; daddr_t lblkno = bp->b_lblkno; struct vnode *vp = bp->b_vp; int s; int ncl; struct buf *bpa; int nwritten; int size; int maxcl; s = splbio(); /* * right now we support clustered writing only to regular files. If * we find a clusterable block we could be in the middle of a cluster * rather then at the beginning. 
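 *
 * Worked example (hypothetical sizes): assuming MAXPHYS of 128K and an
 * 8K f_iosize, maxcl is 16, so the forward and backward scans together
 * can gather at most 16 adjacent delayed-write, clusterable blocks of
 * equal size that are physically contiguous on disk, and hand them to
 * cluster_wbuild() as a single I/O starting at lblkno - j.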
*/ if ((vp->v_type == VREG) && (vp->v_mount != 0) && /* Only on nodes that have the size info */ (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) { size = vp->v_mount->mnt_stat.f_iosize; maxcl = MAXPHYS / size; for (i = 1; i < maxcl; i++) { if ((bpa = gbincore(vp, lblkno + i)) && BUF_REFCNT(bpa) == 0 && ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) == (B_DELWRI | B_CLUSTEROK)) && (bpa->b_bufsize == size)) { if ((bpa->b_blkno == bpa->b_lblkno) || (bpa->b_blkno != bp->b_blkno + ((i * size) >> DEV_BSHIFT))) break; } else { break; } } for (j = 1; i + j <= maxcl && j <= lblkno; j++) { if ((bpa = gbincore(vp, lblkno - j)) && BUF_REFCNT(bpa) == 0 && ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) == (B_DELWRI | B_CLUSTEROK)) && (bpa->b_bufsize == size)) { if ((bpa->b_blkno == bpa->b_lblkno) || (bpa->b_blkno != bp->b_blkno - ((j * size) >> DEV_BSHIFT))) break; } else { break; } } --j; ncl = i + j; /* * this is a possible cluster write */ if (ncl != 1) { nwritten = cluster_wbuild(vp, size, lblkno - j, ncl); splx(s); return nwritten; } } BUF_LOCK(bp, LK_EXCLUSIVE); bremfree(bp); bp->b_flags |= B_ASYNC; splx(s); /* * default (old) behavior, writing out only one block * * XXX returns b_bufsize instead of b_bcount for nwritten? */ nwritten = bp->b_bufsize; (void) VOP_BWRITE(bp->b_vp, bp); return nwritten; } /* * getnewbuf: * * Find and initialize a new buffer header, freeing up existing buffers * in the bufqueues as necessary. The new buffer is returned locked. * * Important: B_INVAL is not set. If the caller wishes to throw the * buffer away, the caller must set B_INVAL prior to calling brelse(). * * We block if: * We have insufficient buffer headers * We have insufficient buffer space * buffer_map is too fragmented ( space reservation fails ) * If we have to flush dirty buffers ( but we try to avoid this ) * * To avoid VFS layer recursion we do not flush dirty buffers ourselves. * Instead we ask the buf daemon to do it for us. We attempt to * avoid piecemeal wakeups of the pageout daemon. */ static struct buf * getnewbuf(int slpflag, int slptimeo, int size, int maxsize) { struct buf *bp; struct buf *nbp; struct buf *dbp; int outofspace; int nqindex; int defrag = 0; ++getnewbufcalls; --getnewbufrestarts; restart: ++getnewbufrestarts; /* * Calculate whether we are out of buffer space. This state is * recalculated on every restart. If we are out of space, we * have to turn off defragmentation. Setting defrag to -1 when * outofspace is positive means "defrag while freeing buffers". * The looping conditional will be muffed up if defrag is left * positive when outofspace is positive. */ dbp = NULL; outofspace = 0; if (bufspace >= hibufspace) { if ((curproc->p_flag & P_BUFEXHAUST) == 0 || bufspace >= maxbufspace) { outofspace = 1; if (defrag > 0) defrag = -1; } } /* * defrag state is semi-persistant. 1 means we are flagged for * defragging. -1 means we actually defragged something. */ /* nop */ /* * Setup for scan. If we do not have enough free buffers, * we setup a degenerate case that immediately fails. Note * that if we are specially marked process, we are allowed to * dip into our reserves. * * Normally we want to find an EMPTYKVA buffer. That is, a * buffer with kva already allocated. If there are no EMPTYKVA * buffers we back up to the truely EMPTY buffers. When defragging * we do not bother backing up since we have to locate buffers with * kva to defrag. If we are out of space we skip both EMPTY and * EMPTYKVA and dig right into the CLEAN queue. 
* * In this manner we avoid scanning unnecessary buffers. It is very * important for us to do this because the buffer cache is almost * constantly out of space or in need of defragmentation. */ if ((curproc->p_flag & P_BUFEXHAUST) == 0 && numfreebuffers < lofreebuffers) { nqindex = QUEUE_CLEAN; nbp = NULL; } else { nqindex = QUEUE_EMPTYKVA; nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]); if (nbp == NULL) { if (defrag <= 0) { nqindex = QUEUE_EMPTY; nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]); } } if (outofspace || nbp == NULL) { nqindex = QUEUE_CLEAN; nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]); } } /* * Run scan, possibly freeing data and/or kva mappings on the fly * depending. */ while ((bp = nbp) != NULL) { int qindex = nqindex; /* * Calculate next bp ( we can only use it if we do not block * or do other fancy things ). */ if ((nbp = TAILQ_NEXT(bp, b_freelist)) == NULL) { switch(qindex) { case QUEUE_EMPTY: nqindex = QUEUE_EMPTYKVA; if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]))) break; /* fall through */ case QUEUE_EMPTYKVA: nqindex = QUEUE_CLEAN; if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]))) break; /* fall through */ case QUEUE_CLEAN: /* * nbp is NULL. */ break; } } /* * Sanity Checks */ KASSERT(bp->b_qindex == qindex, ("getnewbuf: inconsistant queue %d bp %p", qindex, bp)); /* * Note: we no longer distinguish between VMIO and non-VMIO * buffers. */ KASSERT((bp->b_flags & B_DELWRI) == 0, ("delwri buffer %p found in queue %d", bp, qindex)); /* * If we are defragging and the buffer isn't useful for fixing * that problem we continue. If we are out of space and the * buffer isn't useful for fixing that problem we continue. */ if (defrag > 0 && bp->b_kvasize == 0) continue; if (outofspace > 0 && bp->b_bufsize == 0) continue; /* * Start freeing the bp. This is somewhat involved. nbp * remains valid only for QUEUE_EMPTY[KVA] bp's. */ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0) panic("getnewbuf: locked buf"); bremfree(bp); if (qindex == QUEUE_CLEAN) { if (bp->b_flags & B_VMIO) { bp->b_flags &= ~B_ASYNC; vfs_vmio_release(bp); } if (bp->b_vp) brelvp(bp); } /* * NOTE: nbp is now entirely invalid. We can only restart * the scan from this point on. * * Get the rest of the buffer freed up. b_kva* is still * valid after this operation. */ if (bp->b_rcred != NOCRED) { crfree(bp->b_rcred); bp->b_rcred = NOCRED; } if (bp->b_wcred != NOCRED) { crfree(bp->b_wcred); bp->b_wcred = NOCRED; } if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate) (*bioops.io_deallocate)(bp); LIST_REMOVE(bp, b_hash); LIST_INSERT_HEAD(&invalhash, bp, b_hash); if (bp->b_bufsize) allocbuf(bp, 0); bp->b_flags = 0; bp->b_dev = NODEV; bp->b_vp = NULL; bp->b_blkno = bp->b_lblkno = 0; bp->b_offset = NOOFFSET; bp->b_iodone = 0; bp->b_error = 0; bp->b_resid = 0; bp->b_bcount = 0; bp->b_npages = 0; bp->b_dirtyoff = bp->b_dirtyend = 0; LIST_INIT(&bp->b_dep); /* * Ok, now that we have a free buffer, if we are defragging * we have to recover the kvaspace. If we are out of space * we have to free the buffer (which we just did), but we * do not have to recover kva space unless we hit a defrag * hicup. Being able to avoid freeing the kva space leads * to a significant reduction in overhead. */ if (defrag > 0) { defrag = -1; bp->b_flags |= B_INVAL; bfreekva(bp); brelse(bp); goto restart; } if (outofspace > 0) { outofspace = -1; bp->b_flags |= B_INVAL; if (defrag < 0) bfreekva(bp); brelse(bp); goto restart; } /* * We are done */ break; } /* * If we exhausted our list, sleep as appropriate. 
We may have to * wakeup various daemons and write out some dirty buffers. * * Generally we are sleeping due to insufficient buffer space. */ if (bp == NULL) { int flags; char *waitmsg; dosleep: if (defrag > 0) { flags = VFS_BIO_NEED_KVASPACE; waitmsg = "nbufkv"; } else if (outofspace > 0) { waitmsg = "nbufbs"; flags = VFS_BIO_NEED_BUFSPACE; } else { waitmsg = "newbuf"; flags = VFS_BIO_NEED_ANY; } /* XXX */ (void) speedup_syncer(); needsbuffer |= flags; while (needsbuffer & flags) { if (tsleep(&needsbuffer, (PRIBIO + 4) | slpflag, waitmsg, slptimeo)) return (NULL); } } else { /* * We finally have a valid bp. We aren't quite out of the * woods, we still have to reserve kva space. */ vm_offset_t addr = 0; maxsize = (maxsize + PAGE_MASK) & ~PAGE_MASK; if (maxsize != bp->b_kvasize) { bfreekva(bp); if (vm_map_findspace(buffer_map, vm_map_min(buffer_map), maxsize, &addr)) { /* * Uh oh. Buffer map is to fragmented. Try * to defragment. */ if (defrag <= 0) { defrag = 1; bp->b_flags |= B_INVAL; brelse(bp); goto restart; } /* * Uh oh. We couldn't seem to defragment */ bp = NULL; goto dosleep; } } if (addr) { vm_map_insert(buffer_map, NULL, 0, addr, addr + maxsize, VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT); bp->b_kvabase = (caddr_t) addr; bp->b_kvasize = maxsize; } bp->b_data = bp->b_kvabase; } return(bp); } /* * waitfreebuffers: * * Wait for sufficient free buffers. Only called from normal processes. */ static void waitfreebuffers(int slpflag, int slptimeo) { while (numfreebuffers < hifreebuffers) { if (numfreebuffers >= hifreebuffers) break; needsbuffer |= VFS_BIO_NEED_FREE; if (tsleep(&needsbuffer, (PRIBIO + 4)|slpflag, "biofre", slptimeo)) break; } } /* * buf_daemon: * * buffer flushing daemon. Buffers are normally flushed by the * update daemon but if it cannot keep up this process starts to * take the load in an attempt to prevent getnewbuf() from blocking. */ static struct proc *bufdaemonproc; static int bd_interval; static int bd_flushto; static struct kproc_desc buf_kp = { "bufdaemon", buf_daemon, &bufdaemonproc }; SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp) static void buf_daemon() { int s; /* * This process is allowed to take the buffer cache to the limit */ curproc->p_flag |= P_BUFEXHAUST; s = splbio(); bd_interval = 5 * hz; /* dynamically adjusted */ bd_flushto = hidirtybuffers; /* dynamically adjusted */ while (TRUE) { bd_request = 0; /* * Do the flush. Limit the number of buffers we flush in one * go. The failure condition occurs when processes are writing * buffers faster then we can dispose of them. In this case * we may be flushing so often that the previous set of flushes * have not had time to complete, causing us to run out of * physical buffers and block. */ { int runcount = maxbdrun; while (numdirtybuffers > bd_flushto && runcount) { --runcount; if (flushbufqueues() == 0) break; } } /* * If nobody is requesting anything we sleep */ if (bd_request == 0) tsleep(&bd_request, PVM, "psleep", bd_interval); /* * We calculate how much to add or subtract from bd_flushto * and bd_interval based on how far off we are from the * optimal number of dirty buffers, which is 20% below the * hidirtybuffers mark. We cannot use hidirtybuffers straight * because being right on the mark will cause getnewbuf() * to oscillate our wakeup. * * The larger the error in either direction, the more we adjust * bd_flushto and bd_interval. The time interval is adjusted * by 2 seconds per whole-buffer-range of error. 
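 * (Worked example, hypothetical tuning: with lodirtybuffers = 100 and
 * hidirtybuffers = 300, brange = 200 and the target
 * middb = hidirtybuffers - brange / 5 = 260; if numdirtybuffers is 160
 * the error deltabuf = 100, so bd_flushto rises by 100 / 20 = 5 and
 * bd_interval grows by 100 * 2 * hz / 200 = hz, i.e. one second.)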
This is an * exponential convergence algorithm, with large errors * producing large changes and small errors producing small * changes. */ { int brange = hidirtybuffers - lodirtybuffers; int middb = hidirtybuffers - brange / 5; int deltabuf = middb - numdirtybuffers; bd_flushto += deltabuf / 20; bd_interval += deltabuf * (2 * hz) / (brange * 1); } if (bd_flushto < lodirtybuffers) bd_flushto = lodirtybuffers; if (bd_flushto > hidirtybuffers) bd_flushto = hidirtybuffers; if (bd_interval < hz / 10) bd_interval = hz / 10; if (bd_interval > 5 * hz) bd_interval = 5 * hz; } } /* * flushbufqueues: * * Try to flush a buffer in the dirty queue. We must be careful to * free up B_INVAL buffers instead of write them, which NFS is * particularly sensitive to. */ static int flushbufqueues(void) { struct buf *bp; int r = 0; bp = TAILQ_FIRST(&bufqueues[QUEUE_DIRTY]); while (bp) { KASSERT((bp->b_flags & B_DELWRI), ("unexpected clean buffer %p", bp)); if ((bp->b_flags & B_DELWRI) != 0) { if (bp->b_flags & B_INVAL) { if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0) panic("flushbufqueues: locked buf"); bremfree(bp); brelse(bp); ++r; break; } vfs_bio_awrite(bp); ++r; break; } bp = TAILQ_NEXT(bp, b_freelist); } return(r); } /* * Check to see if a block is currently memory resident. */ struct buf * incore(struct vnode * vp, daddr_t blkno) { struct buf *bp; int s = splbio(); bp = gbincore(vp, blkno); splx(s); return (bp); } /* * Returns true if no I/O is needed to access the * associated VM object. This is like incore except * it also hunts around in the VM system for the data. */ int inmem(struct vnode * vp, daddr_t blkno) { vm_object_t obj; vm_offset_t toff, tinc, size; vm_page_t m; vm_ooffset_t off; if (incore(vp, blkno)) return 1; if (vp->v_mount == NULL) return 0; if ((vp->v_object == NULL) || (vp->v_flag & VOBJBUF) == 0) return 0; obj = vp->v_object; size = PAGE_SIZE; if (size > vp->v_mount->mnt_stat.f_iosize) size = vp->v_mount->mnt_stat.f_iosize; off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize; for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) { m = vm_page_lookup(obj, OFF_TO_IDX(off + toff)); if (!m) return 0; tinc = size; if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK)) tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK); if (vm_page_is_valid(m, (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0) return 0; } return 1; } /* * vfs_setdirty: * * Sets the dirty range for a buffer based on the status of the dirty * bits in the pages comprising the buffer. * * The range is limited to the size of the buffer. * * This routine is primarily used by NFS, but is generalized for the * B_VMIO case. */ static void vfs_setdirty(struct buf *bp) { int i; vm_object_t object; /* * Degenerate case - empty buffer */ if (bp->b_bufsize == 0) return; /* * We qualify the scan for modified pages on whether the * object has been flushed yet. The OBJ_WRITEABLE flag * is not cleared simply by protecting pages off. */ if ((bp->b_flags & B_VMIO) == 0) return; object = bp->b_pages[0]->object; if ((object->flags & OBJ_WRITEABLE) && !(object->flags & OBJ_MIGHTBEDIRTY)) printf("Warning: object %p writeable but not mightbedirty\n", object); if (!(object->flags & OBJ_WRITEABLE) && (object->flags & OBJ_MIGHTBEDIRTY)) printf("Warning: object %p mightbedirty but not writeable\n", object); if (object->flags & (OBJ_MIGHTBEDIRTY|OBJ_CLEANING)) { vm_offset_t boffset; vm_offset_t eoffset; /* * test the pages to see if they have been modified directly * by users through the VM system. 
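 * (vm_page_test_dirty() consults the pmap-level modified bit and
 * transfers it into m->dirty, so stores made through an mmap()ed
 * mapping are noticed here as well.)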
*/ for (i = 0; i < bp->b_npages; i++) { vm_page_flag_clear(bp->b_pages[i], PG_ZERO); vm_page_test_dirty(bp->b_pages[i]); } /* * Calculate the encompassing dirty range, boffset and eoffset, * (eoffset - boffset) bytes. */ for (i = 0; i < bp->b_npages; i++) { if (bp->b_pages[i]->dirty) break; } boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); for (i = bp->b_npages - 1; i >= 0; --i) { if (bp->b_pages[i]->dirty) { break; } } eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); /* * Fit it to the buffer. */ if (eoffset > bp->b_bcount) eoffset = bp->b_bcount; /* * If we have a good dirty range, merge with the existing * dirty range. */ if (boffset < eoffset) { if (bp->b_dirtyoff > boffset) bp->b_dirtyoff = boffset; if (bp->b_dirtyend < eoffset) bp->b_dirtyend = eoffset; } } } /* * getblk: * * Get a block given a specified block and offset into a file/device. * The buffers B_DONE bit will be cleared on return, making it almost * ready for an I/O initiation. B_INVAL may or may not be set on * return. The caller should clear B_INVAL prior to initiating a * READ. * * For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for * an existing buffer. * * For a VMIO buffer, B_CACHE is modified according to the backing VM. * If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set * and then cleared based on the backing VM. If the previous buffer is * non-0-sized but invalid, B_CACHE will be cleared. * * If getblk() must create a new buffer, the new buffer is returned with * both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which * case it is returned with B_INVAL clear and B_CACHE set based on the * backing VM. * * getblk() also forces a VOP_BWRITE() for any B_DELWRI buffer whos * B_CACHE bit is clear. * * What this means, basically, is that the caller should use B_CACHE to * determine whether the buffer is fully valid or not and should clear * B_INVAL prior to issuing a read. If the caller intends to validate * the buffer by loading its data area with something, the caller needs * to clear B_INVAL. If the caller does this without issuing an I/O, * the caller should set B_CACHE ( as an optimization ), else the caller * should issue the I/O and biodone() will set B_CACHE if the I/O was * a write attempt or if it was a successfull read. If the caller * intends to issue a READ, the caller must clear B_INVAL and B_ERROR * prior to issuing the READ. biodone() will *not* clear B_INVAL. */ struct buf * getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo) { struct buf *bp; int s; struct bufhashhdr *bh; #if !defined(MAX_PERF) if (size > MAXBSIZE) panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE); #endif s = splbio(); loop: /* * Block if we are low on buffers. Certain processes are allowed * to completely exhaust the buffer cache. */ if (curproc->p_flag & P_BUFEXHAUST) { if (numfreebuffers == 0) { needsbuffer |= VFS_BIO_NEED_ANY; tsleep(&needsbuffer, (PRIBIO + 4) | slpflag, "newbuf", slptimeo); } } else if (numfreebuffers < lofreebuffers) { waitfreebuffers(slpflag, slptimeo); } if ((bp = gbincore(vp, blkno))) { /* * Buffer is in-core. If the buffer is not busy, it must * be on a queue. */ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { if (BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL, "getblk", slpflag, slptimeo) == ENOLCK) goto loop; splx(s); return (struct buf *) NULL; } /* * The buffer is locked. B_CACHE is cleared if the buffer is * invalid. 
Ohterwise, for a non-VMIO buffer, B_CACHE is set * and for a VMIO buffer B_CACHE is adjusted according to the * backing VM cache. */ if (bp->b_flags & B_INVAL) bp->b_flags &= ~B_CACHE; else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0) bp->b_flags |= B_CACHE; bremfree(bp); /* * check for size inconsistancies for non-VMIO case. */ if (bp->b_bcount != size) { if ((bp->b_flags & B_VMIO) == 0 || (size > bp->b_kvasize)) { if (bp->b_flags & B_DELWRI) { bp->b_flags |= B_NOCACHE; VOP_BWRITE(bp->b_vp, bp); } else { if ((bp->b_flags & B_VMIO) && (LIST_FIRST(&bp->b_dep) == NULL)) { bp->b_flags |= B_RELBUF; brelse(bp); } else { bp->b_flags |= B_NOCACHE; VOP_BWRITE(bp->b_vp, bp); } } goto loop; } } /* * If the size is inconsistant in the VMIO case, we can resize * the buffer. This might lead to B_CACHE getting set or * cleared. If the size has not changed, B_CACHE remains * unchanged from its previous state. */ if (bp->b_bcount != size) allocbuf(bp, size); KASSERT(bp->b_offset != NOOFFSET, ("getblk: no buffer offset")); /* * A buffer with B_DELWRI set and B_CACHE clear must * be committed before we can return the buffer in * order to prevent the caller from issuing a read * ( due to B_CACHE not being set ) and overwriting * it. * * Most callers, including NFS and FFS, need this to * operate properly either because they assume they * can issue a read if B_CACHE is not set, or because * ( for example ) an uncached B_DELWRI might loop due * to softupdates re-dirtying the buffer. In the latter * case, B_CACHE is set after the first write completes, * preventing further loops. */ if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) { VOP_BWRITE(bp->b_vp, bp); goto loop; } splx(s); bp->b_flags &= ~B_DONE; } else { /* * Buffer is not in-core, create new buffer. The buffer * returned by getnewbuf() is locked. Note that the returned * buffer is also considered valid (not marked B_INVAL). */ int bsize, maxsize, vmio; off_t offset; if (vp->v_type == VBLK) bsize = DEV_BSIZE; else if (vp->v_mountedhere) bsize = vp->v_mountedhere->mnt_stat.f_iosize; else if (vp->v_mount) bsize = vp->v_mount->mnt_stat.f_iosize; else bsize = size; offset = (off_t)blkno * bsize; vmio = (vp->v_object != 0) && (vp->v_flag & VOBJBUF); maxsize = vmio ? size + (offset & PAGE_MASK) : size; maxsize = imax(maxsize, bsize); if ((bp = getnewbuf(slpflag, slptimeo, size, maxsize)) == NULL) { if (slpflag || slptimeo) { splx(s); return NULL; } goto loop; } /* * This code is used to make sure that a buffer is not * created while the getnewbuf routine is blocked. * This can be a problem whether the vnode is locked or not. * If the buffer is created out from under us, we have to * throw away the one we just created. There is now window * race because we are safely running at splbio() from the * point of the duplicate buffer creation through to here, * and we've locked the buffer. */ if (gbincore(vp, blkno)) { bp->b_flags |= B_INVAL; brelse(bp); goto loop; } /* * Insert the buffer into the hash, so that it can * be found by incore. */ bp->b_blkno = bp->b_lblkno = blkno; bp->b_offset = offset; bgetvp(vp, bp); LIST_REMOVE(bp, b_hash); bh = bufhash(vp, blkno); LIST_INSERT_HEAD(bh, bp, b_hash); /* * set B_VMIO bit. allocbuf() the buffer bigger. Since the * buffer size starts out as 0, B_CACHE will be set by * allocbuf() for the VMIO case prior to it testing the * backing store for validity. 
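 *
 * Sizing sketch (hypothetical numbers): for a 2048 byte block at
 * blkno 7 on a 2K filesystem, b_offset is 14336 and, assuming 4K pages,
 * offset & PAGE_MASK is 2048, so the vmio maxsize passed to getnewbuf()
 * above was 2048 + 2048 = 4096 and the KVA reservation covers the whole
 * page the data lands in.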
*/ if (vmio) { bp->b_flags |= B_VMIO; #if defined(VFS_BIO_DEBUG) if (vp->v_type != VREG && vp->v_type != VBLK) printf("getblk: vmioing file type %d???\n", vp->v_type); #endif } else { bp->b_flags &= ~B_VMIO; } allocbuf(bp, size); splx(s); bp->b_flags &= ~B_DONE; } return (bp); } /* * Get an empty, disassociated buffer of given size. The buffer is initially * set to B_INVAL. */ struct buf * geteblk(int size) { struct buf *bp; int s; s = splbio(); while ((bp = getnewbuf(0, 0, size, MAXBSIZE)) == 0); splx(s); allocbuf(bp, size); bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */ return (bp); } /* * This code constitutes the buffer memory from either anonymous system * memory (in the case of non-VMIO operations) or from an associated * VM object (in the case of VMIO operations). This code is able to * resize a buffer up or down. * * Note that this code is tricky, and has many complications to resolve * deadlock or inconsistant data situations. Tread lightly!!! * There are B_CACHE and B_DELWRI interactions that must be dealt with by * the caller. Calling this code willy nilly can result in the loss of data. * * allocbuf() only adjusts B_CACHE for VMIO buffers. getblk() deals with * B_CACHE for the non-VMIO case. */ int allocbuf(struct buf *bp, int size) { int newbsize, mbsize; int i; #if !defined(MAX_PERF) if (BUF_REFCNT(bp) == 0) panic("allocbuf: buffer not busy"); if (bp->b_kvasize < size) panic("allocbuf: buffer too small"); #endif if ((bp->b_flags & B_VMIO) == 0) { caddr_t origbuf; int origbufsize; /* * Just get anonymous memory from the kernel. Don't * mess with B_CACHE. */ mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); #if !defined(NO_B_MALLOC) if (bp->b_flags & B_MALLOC) newbsize = mbsize; else #endif newbsize = round_page(size); if (newbsize < bp->b_bufsize) { #if !defined(NO_B_MALLOC) /* * malloced buffers are not shrunk */ if (bp->b_flags & B_MALLOC) { if (newbsize) { bp->b_bcount = size; } else { free(bp->b_data, M_BIOBUF); bufspace -= bp->b_bufsize; bufmallocspace -= bp->b_bufsize; runningbufspace -= bp->b_bufsize; if (bp->b_bufsize) bufspacewakeup(); bp->b_data = bp->b_kvabase; bp->b_bufsize = 0; bp->b_bcount = 0; bp->b_flags &= ~B_MALLOC; } return 1; } #endif vm_hold_free_pages( bp, (vm_offset_t) bp->b_data + newbsize, (vm_offset_t) bp->b_data + bp->b_bufsize); } else if (newbsize > bp->b_bufsize) { #if !defined(NO_B_MALLOC) /* * We only use malloced memory on the first allocation. * and revert to page-allocated memory when the buffer * grows. */ if ( (bufmallocspace < maxbufmallocspace) && (bp->b_bufsize == 0) && (mbsize <= PAGE_SIZE/2)) { bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK); bp->b_bufsize = mbsize; bp->b_bcount = size; bp->b_flags |= B_MALLOC; bufspace += mbsize; bufmallocspace += mbsize; runningbufspace += bp->b_bufsize; return 1; } #endif origbuf = NULL; origbufsize = 0; #if !defined(NO_B_MALLOC) /* * If the buffer is growing on its other-than-first allocation, * then we revert to the page-allocation scheme. 
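 * For example (hypothetical sizes): a first allocbuf(bp, 512) may be
 * satisfied from malloc() above (first allocation, mbsize at most
 * PAGE_SIZE / 2 and bufmallocspace below its limit); a later
 * allocbuf(bp, 3072) takes this path, copies the original 512 bytes
 * into the page-backed KVA via the bcopy() below, and clears B_MALLOC.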
*/ if (bp->b_flags & B_MALLOC) { origbuf = bp->b_data; origbufsize = bp->b_bufsize; bp->b_data = bp->b_kvabase; bufspace -= bp->b_bufsize; bufmallocspace -= bp->b_bufsize; runningbufspace -= bp->b_bufsize; if (bp->b_bufsize) bufspacewakeup(); bp->b_bufsize = 0; bp->b_flags &= ~B_MALLOC; newbsize = round_page(newbsize); } #endif vm_hold_load_pages( bp, (vm_offset_t) bp->b_data + bp->b_bufsize, (vm_offset_t) bp->b_data + newbsize); #if !defined(NO_B_MALLOC) if (origbuf) { bcopy(origbuf, bp->b_data, origbufsize); free(origbuf, M_BIOBUF); } #endif } } else { vm_page_t m; int desiredpages; newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); desiredpages = (size == 0) ? 0 : num_pages((bp->b_offset & PAGE_MASK) + newbsize); #if !defined(NO_B_MALLOC) if (bp->b_flags & B_MALLOC) panic("allocbuf: VMIO buffer can't be malloced"); #endif /* * Set B_CACHE initially if buffer is 0 length or will become * 0-length. */ if (size == 0 || bp->b_bufsize == 0) bp->b_flags |= B_CACHE; if (newbsize < bp->b_bufsize) { /* * DEV_BSIZE aligned new buffer size is less then the * DEV_BSIZE aligned existing buffer size. Figure out * if we have to remove any pages. */ if (desiredpages < bp->b_npages) { for (i = desiredpages; i < bp->b_npages; i++) { /* * the page is not freed here -- it * is the responsibility of * vnode_pager_setsize */ m = bp->b_pages[i]; KASSERT(m != bogus_page, ("allocbuf: bogus page found")); while (vm_page_sleep_busy(m, TRUE, "biodep")) ; bp->b_pages[i] = NULL; vm_page_unwire(m, 0); } pmap_qremove((vm_offset_t) trunc_page((vm_offset_t)bp->b_data) + (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages)); bp->b_npages = desiredpages; } } else if (size > bp->b_bcount) { /* * We are growing the buffer, possibly in a * byte-granular fashion. */ struct vnode *vp; vm_object_t obj; vm_offset_t toff; vm_offset_t tinc; /* * Step 1, bring in the VM pages from the object, * allocating them if necessary. We must clear * B_CACHE if these pages are not valid for the * range covered by the buffer. */ vp = bp->b_vp; obj = vp->v_object; while (bp->b_npages < desiredpages) { vm_page_t m; vm_pindex_t pi; pi = OFF_TO_IDX(bp->b_offset) + bp->b_npages; if ((m = vm_page_lookup(obj, pi)) == NULL) { m = vm_page_alloc(obj, pi, VM_ALLOC_NORMAL); if (m == NULL) { VM_WAIT; vm_pageout_deficit += desiredpages - bp->b_npages; } else { vm_page_wire(m); vm_page_wakeup(m); bp->b_flags &= ~B_CACHE; bp->b_pages[bp->b_npages] = m; ++bp->b_npages; } continue; } /* * We found a page. If we have to sleep on it, * retry because it might have gotten freed out * from under us. * * We can only test PG_BUSY here. Blocking on * m->busy might lead to a deadlock: * * vm_fault->getpages->cluster_read->allocbuf * */ if (vm_page_sleep_busy(m, FALSE, "pgtblk")) continue; /* * We have a good page. Should we wakeup the * page daemon? */ if ((curproc != pageproc) && ((m->queue - m->pc) == PQ_CACHE) && ((cnt.v_free_count + cnt.v_cache_count) < (cnt.v_free_min + cnt.v_cache_min))) { pagedaemon_wakeup(); } vm_page_flag_clear(m, PG_ZERO); vm_page_wire(m); bp->b_pages[bp->b_npages] = m; ++bp->b_npages; } /* * Step 2. We've loaded the pages into the buffer, * we have to figure out if we can still have B_CACHE * set. Note that B_CACHE is set according to the * byte-granular range ( bcount and size ), new the * aligned range ( newbsize ). * * The VM test is against m->valid, which is DEV_BSIZE * aligned. Needless to say, the validity of the data * needs to also be DEV_BSIZE aligned. 
Note that this * fails with NFS if the server or some other client * extends the file's EOF. If our buffer is resized, * B_CACHE may remain set! XXX */ toff = bp->b_bcount; tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK); while ((bp->b_flags & B_CACHE) && toff < size) { vm_pindex_t pi; if (tinc > (size - toff)) tinc = size - toff; pi = ((bp->b_offset & PAGE_MASK) + toff) >> PAGE_SHIFT; vfs_buf_test_cache( bp, bp->b_offset, toff, tinc, bp->b_pages[pi] ); toff += tinc; tinc = PAGE_SIZE; } /* * Step 3, fixup the KVM pmap. Remember that * bp->b_data is relative to bp->b_offset, but * bp->b_offset may be offset into the first page. */ bp->b_data = (caddr_t) trunc_page((vm_offset_t)bp->b_data); pmap_qenter( (vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages ); bp->b_data = (caddr_t)((vm_offset_t)bp->b_data | (vm_offset_t)(bp->b_offset & PAGE_MASK)); } } if (bp->b_flags & B_VMIO) vmiospace += (newbsize - bp->b_bufsize); bufspace += (newbsize - bp->b_bufsize); runningbufspace += (newbsize - bp->b_bufsize); if (newbsize < bp->b_bufsize) bufspacewakeup(); bp->b_bufsize = newbsize; /* actual buffer allocation */ bp->b_bcount = size; /* requested buffer size */ return 1; } /* * biowait: * * Wait for buffer I/O completion, returning error status. The buffer * is left locked and B_DONE on return. B_EINTR is converted into a EINTR * error and cleared. */ int biowait(register struct buf * bp) { int s; s = splbio(); while ((bp->b_flags & B_DONE) == 0) { #if defined(NO_SCHEDULE_MODS) tsleep(bp, PRIBIO, "biowait", 0); #else if (bp->b_flags & B_READ) tsleep(bp, PRIBIO, "biord", 0); else tsleep(bp, PRIBIO, "biowr", 0); #endif } splx(s); if (bp->b_flags & B_EINTR) { bp->b_flags &= ~B_EINTR; return (EINTR); } if (bp->b_flags & B_ERROR) { return (bp->b_error ? bp->b_error : EIO); } else { return (0); } } /* * biodone: * * Finish I/O on a buffer, optionally calling a completion function. * This is usually called from an interrupt so process blocking is * not allowed. * * biodone is also responsible for setting B_CACHE in a B_VMIO bp. * In a non-VMIO bp, B_CACHE will be set on the next getblk() * assuming B_INVAL is clear. * * For the VMIO case, we set B_CACHE if the op was a read and no * read error occured, or if the op was a write. B_CACHE is never * set if the buffer is invalid or otherwise uncacheable. * * biodone does not mess with B_INVAL, allowing the I/O routine or the * initiator to leave B_INVAL set to brelse the buffer out of existance * in the biodone routine. 
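 *
 *	Typical completion from a driver interrupt (illustrative sketch;
 *	hard_error is a stand-in for the driver's own error state):
 *
 *		bp->b_resid = 0;
 *		if (hard_error) {
 *			bp->b_error = EIO;
 *			bp->b_flags |= B_ERROR;
 *		}
 *		biodone(bp);
 *
 *	If B_CALL is set, biodone() hands the buffer to bp->b_iodone()
 *	instead of releasing or waking it up itself; aio_physwakeup()
 *	earlier in this change is an example of such a completion hook.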
*/ void biodone(register struct buf * bp) { int s; s = splbio(); KASSERT(BUF_REFCNT(bp) > 0, ("biodone: bp %p not busy %d", bp, BUF_REFCNT(bp))); KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp)); bp->b_flags |= B_DONE; if (bp->b_flags & B_FREEBUF) { brelse(bp); splx(s); return; } if ((bp->b_flags & B_READ) == 0) { vwakeup(bp); } /* call optional completion function if requested */ if (bp->b_flags & B_CALL) { bp->b_flags &= ~B_CALL; (*bp->b_iodone) (bp); splx(s); return; } if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_complete) (*bioops.io_complete)(bp); if (bp->b_flags & B_VMIO) { int i, resid; vm_ooffset_t foff; vm_page_t m; vm_object_t obj; int iosize; struct vnode *vp = bp->b_vp; obj = vp->v_object; #if defined(VFS_BIO_DEBUG) if (vp->v_usecount == 0) { panic("biodone: zero vnode ref count"); } if (vp->v_object == NULL) { panic("biodone: missing VM object"); } if ((vp->v_flag & VOBJBUF) == 0) { panic("biodone: vnode is not setup for merged cache"); } #endif foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("biodone: no buffer offset")); #if !defined(MAX_PERF) if (!obj) { panic("biodone: no object"); } #endif #if defined(VFS_BIO_DEBUG) if (obj->paging_in_progress < bp->b_npages) { printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n", obj->paging_in_progress, bp->b_npages); } #endif /* * Set B_CACHE if the op was a normal read and no error * occured. B_CACHE is set for writes in the b*write() * routines. */ iosize = bp->b_bcount; if ((bp->b_flags & (B_READ|B_FREEBUF|B_INVAL|B_NOCACHE|B_ERROR)) == B_READ) { bp->b_flags |= B_CACHE; } for (i = 0; i < bp->b_npages; i++) { int bogusflag = 0; m = bp->b_pages[i]; if (m == bogus_page) { bogusflag = 1; m = vm_page_lookup(obj, OFF_TO_IDX(foff)); if (!m) { #if defined(VFS_BIO_DEBUG) printf("biodone: page disappeared\n"); #endif vm_object_pip_subtract(obj, 1); bp->b_flags &= ~B_CACHE; continue; } bp->b_pages[i] = m; pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } #if defined(VFS_BIO_DEBUG) if (OFF_TO_IDX(foff) != m->pindex) { printf( "biodone: foff(%lu)/m->pindex(%d) mismatch\n", (unsigned long)foff, m->pindex); } #endif resid = IDX_TO_OFF(m->pindex + 1) - foff; if (resid > iosize) resid = iosize; /* * In the write case, the valid and clean bits are * already changed correctly ( see bdwrite() ), so we * only need to do this here in the read case. */ if ((bp->b_flags & B_READ) && !bogusflag && resid > 0) { vfs_page_set_valid(bp, foff, i, m); } vm_page_flag_clear(m, PG_ZERO); /* * when debugging new filesystems or buffer I/O methods, this * is the most common error that pops up. if you see this, you * have not set the page busy flag correctly!!! */ if (m->busy == 0) { #if !defined(MAX_PERF) printf("biodone: page busy < 0, " "pindex: %d, foff: 0x(%x,%x), " "resid: %d, index: %d\n", (int) m->pindex, (int)(foff >> 32), (int) foff & 0xffffffff, resid, i); #endif if (vp->v_type != VBLK) #if !defined(MAX_PERF) printf(" iosize: %ld, lblkno: %d, flags: 0x%lx, npages: %d\n", bp->b_vp->v_mount->mnt_stat.f_iosize, (int) bp->b_lblkno, bp->b_flags, bp->b_npages); else printf(" VDEV, lblkno: %d, flags: 0x%lx, npages: %d\n", (int) bp->b_lblkno, bp->b_flags, bp->b_npages); printf(" valid: 0x%x, dirty: 0x%x, wired: %d\n", m->valid, m->dirty, m->wire_count); #endif panic("biodone: page busy < 0\n"); } vm_page_io_finish(m); vm_object_pip_subtract(obj, 1); foff += resid; iosize -= resid; } if (obj) vm_object_pip_wakeupn(obj, 0); } /* * For asynchronous completions, release the buffer now. 
The brelse
	 * will do a wakeup there if necessary - so no need to do a wakeup
	 * here in the async case.  The sync case always needs to do a wakeup.
	 */

	if (bp->b_flags & B_ASYNC) {
		if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_RELBUF)) != 0)
			brelse(bp);
		else
			bqrelse(bp);
	} else {
		wakeup(bp);
	}
	splx(s);
}

/*
 * This routine is called in lieu of iodone in the case of
 * incomplete I/O.  This keeps the busy status for pages
 * consistent.
 */
void
vfs_unbusy_pages(struct buf * bp)
{
	int i;

	if (bp->b_flags & B_VMIO) {
		struct vnode *vp = bp->b_vp;
		vm_object_t obj = vp->v_object;

		for (i = 0; i < bp->b_npages; i++) {
			vm_page_t m = bp->b_pages[i];

			if (m == bogus_page) {
				m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i);
#if !defined(MAX_PERF)
				if (!m) {
					panic("vfs_unbusy_pages: page missing\n");
				}
#endif
				bp->b_pages[i] = m;
				pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
			}
			vm_object_pip_subtract(obj, 1);
			vm_page_flag_clear(m, PG_ZERO);
			vm_page_io_finish(m);
		}
		vm_object_pip_wakeupn(obj, 0);
	}
}

/*
 *	vfs_page_set_valid:
 *
 *	Set the valid bits in a page based on the supplied offset.   The
 *	range is restricted to the buffer's size.
 *
 *	This routine is typically called after a read completes.
 */
static void
vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m)
{
	vm_ooffset_t soff, eoff;

	/*
	 * Start and end offsets in buffer.  eoff - soff may not cross a
	 * page boundary or cross the end of the buffer.  The end of the
	 * buffer, in this case, is our file EOF, not the allocation size
	 * of the buffer.
	 */
	soff = off;
	eoff = (off + PAGE_SIZE) & ~PAGE_MASK;
	if (eoff > bp->b_offset + bp->b_bcount)
		eoff = bp->b_offset + bp->b_bcount;

	/*
	 * Set valid range.  This is typically the entire buffer and thus the
	 * entire page.
	 */
	if (eoff > soff) {
		vm_page_set_validclean(
		    m,
		    (vm_offset_t) (soff & PAGE_MASK),
		    (vm_offset_t) (eoff - soff)
		);
	}
}

/*
 * This routine is called before a device strategy routine.
 * It is used to tell the VM system that paging I/O is in
 * progress, and treat the pages associated with the buffer
 * almost as being PG_BUSY.  Also the object paging_in_progress
 * flag is handled to make sure that the object doesn't become
 * inconsistent.
 *
 * Since I/O has not been initiated yet, certain buffer flags
 * such as B_ERROR or B_INVAL may be in an inconsistent state
 * and should be ignored.
 */
void
vfs_busy_pages(struct buf * bp, int clear_modify)
{
	int i, bogus;

	if (bp->b_flags & B_VMIO) {
		struct vnode *vp = bp->b_vp;
		vm_object_t obj = vp->v_object;
		vm_ooffset_t foff;

		foff = bp->b_offset;
		KASSERT(bp->b_offset != NOOFFSET,
		    ("vfs_busy_pages: no buffer offset"));
		vfs_setdirty(bp);

retry:
		for (i = 0; i < bp->b_npages; i++) {
			vm_page_t m = bp->b_pages[i];
			if (vm_page_sleep_busy(m, FALSE, "vbpage"))
				goto retry;
		}

		bogus = 0;
		for (i = 0; i < bp->b_npages; i++) {
			vm_page_t m = bp->b_pages[i];

			vm_page_flag_clear(m, PG_ZERO);
			if ((bp->b_flags & B_CLUSTER) == 0) {
				vm_object_pip_add(obj, 1);
				vm_page_io_start(m);
			}

			/*
			 * When readying a buffer for a read ( i.e
			 * clear_modify == 0 ), it is important to do
			 * bogus_page replacement for valid pages in
			 * partially instantiated buffers.  Partially
			 * instantiated buffers can, in turn, occur when
			 * reconstituting a buffer from its VM backing store
			 * base.  We only have to do this if B_CACHE is
			 * clear ( which causes the I/O to occur in the
			 * first place ).  The replacement prevents the read
			 * I/O from overwriting potentially dirty VM-backed
			 * pages.  XXX bogus page replacement is, uh, bogus.
* It may not work properly with small-block devices. * We need to find a better way. */ vm_page_protect(m, VM_PROT_NONE); if (clear_modify) vfs_page_set_valid(bp, foff, i, m); else if (m->valid == VM_PAGE_BITS_ALL && (bp->b_flags & B_CACHE) == 0) { bp->b_pages[i] = bogus_page; bogus++; } foff = (foff + PAGE_SIZE) & ~PAGE_MASK; } if (bogus) pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } } /* * Tell the VM system that the pages associated with this buffer * are clean. This is used for delayed writes where the data is * going to go to disk eventually without additional VM intevention. * * Note that while we only really need to clean through to b_bcount, we * just go ahead and clean through to b_bufsize. */ static void vfs_clean_pages(struct buf * bp) { int i; if (bp->b_flags & B_VMIO) { vm_ooffset_t foff; foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("vfs_clean_pages: no buffer offset")); for (i = 0; i < bp->b_npages; i++) { vm_page_t m = bp->b_pages[i]; vm_ooffset_t noff = (foff + PAGE_SIZE) & ~PAGE_MASK; vm_ooffset_t eoff = noff; if (eoff > bp->b_offset + bp->b_bufsize) eoff = bp->b_offset + bp->b_bufsize; vfs_page_set_valid(bp, foff, i, m); /* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */ foff = noff; } } } /* * vfs_bio_set_validclean: * * Set the range within the buffer to valid and clean. The range is * relative to the beginning of the buffer, b_offset. Note that b_offset * itself may be offset from the beginning of the first page. */ void vfs_bio_set_validclean(struct buf *bp, int base, int size) { if (bp->b_flags & B_VMIO) { int i; int n; /* * Fixup base to be relative to beginning of first page. * Set initial n to be the maximum number of bytes in the * first page that can be validated. */ base += (bp->b_offset & PAGE_MASK); n = PAGE_SIZE - (base & PAGE_MASK); for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) { vm_page_t m = bp->b_pages[i]; if (n > size) n = size; vm_page_set_validclean(m, base & PAGE_MASK, n); base += n; size -= n; n = PAGE_SIZE; } } } /* * vfs_bio_clrbuf: * * clear a buffer. This routine essentially fakes an I/O, so we need * to clear B_ERROR and B_INVAL. * * Note that while we only theoretically need to clear through b_bcount, * we go ahead and clear through b_bufsize. 
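 *
 *	Worked example (illustrative): for a single-page buffer with
 *	b_bufsize = 2048 and DEV_BSIZE = 512, the mask computed below is
 *	(1 << (2048 / 512)) - 1 = 0x0f, i.e. only the first four 512-byte
 *	chunks of the page are cleared and marked valid.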
*/ void vfs_bio_clrbuf(struct buf *bp) { int i, mask = 0; caddr_t sa, ea; if ((bp->b_flags & (B_VMIO | B_MALLOC)) == B_VMIO) { bp->b_flags &= ~(B_INVAL|B_ERROR); if( (bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) && (bp->b_offset & PAGE_MASK) == 0) { mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1; if (((bp->b_pages[0]->flags & PG_ZERO) == 0) && ((bp->b_pages[0]->valid & mask) != mask)) { bzero(bp->b_data, bp->b_bufsize); } bp->b_pages[0]->valid |= mask; bp->b_resid = 0; return; } ea = sa = bp->b_data; for(i=0;ib_npages;i++,sa=ea) { int j = ((u_long)sa & PAGE_MASK) / DEV_BSIZE; ea = (caddr_t)trunc_page((vm_offset_t)sa + PAGE_SIZE); ea = (caddr_t)ulmin((u_long)ea, (u_long)bp->b_data + bp->b_bufsize); mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j; if ((bp->b_pages[i]->valid & mask) == mask) continue; if ((bp->b_pages[i]->valid & mask) == 0) { if ((bp->b_pages[i]->flags & PG_ZERO) == 0) { bzero(sa, ea - sa); } } else { for (; sa < ea; sa += DEV_BSIZE, j++) { if (((bp->b_pages[i]->flags & PG_ZERO) == 0) && (bp->b_pages[i]->valid & (1<b_pages[i]->valid |= mask; vm_page_flag_clear(bp->b_pages[i], PG_ZERO); } bp->b_resid = 0; } else { clrbuf(bp); } } /* * vm_hold_load_pages and vm_hold_unload pages get pages into * a buffers address space. The pages are anonymous and are * not associated with a file object. */ void vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to) { vm_offset_t pg; vm_page_t p; int index; to = round_page(to); from = round_page(from); index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; for (pg = from; pg < to; pg += PAGE_SIZE, index++) { tryagain: p = vm_page_alloc(kernel_object, ((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT), VM_ALLOC_NORMAL); if (!p) { vm_pageout_deficit += (to - from) >> PAGE_SHIFT; VM_WAIT; goto tryagain; } vm_page_wire(p); p->valid = VM_PAGE_BITS_ALL; vm_page_flag_clear(p, PG_ZERO); pmap_kenter(pg, VM_PAGE_TO_PHYS(p)); bp->b_pages[index] = p; vm_page_wakeup(p); } bp->b_npages = index; } void vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to) { vm_offset_t pg; vm_page_t p; int index, newnpages; from = round_page(from); to = round_page(to); newnpages = index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; for (pg = from; pg < to; pg += PAGE_SIZE, index++) { p = bp->b_pages[index]; if (p && (index < bp->b_npages)) { #if !defined(MAX_PERF) if (p->busy) { printf("vm_hold_free_pages: blkno: %d, lblkno: %d\n", bp->b_blkno, bp->b_lblkno); } #endif bp->b_pages[index] = NULL; pmap_kremove(pg); vm_page_busy(p); vm_page_unwire(p, 0); vm_page_free(p); } } bp->b_npages = newnpages; } #include "opt_ddb.h" #ifdef DDB #include DB_SHOW_COMMAND(buffer, db_show_buffer) { /* get args */ struct buf *bp = (struct buf *)addr; if (!have_addr) { db_printf("usage: show buffer \n"); return; } db_printf("b_flags = 0x%b\n", (u_int)bp->b_flags, PRINT_BUF_FLAGS); db_printf("b_error = %d, b_bufsize = %ld, b_bcount = %ld, " "b_resid = %ld\nb_dev = (%d,%d), b_data = %p, " "b_blkno = %d, b_pblkno = %d\n", bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid, major(bp->b_dev), minor(bp->b_dev), bp->b_data, bp->b_blkno, bp->b_pblkno); if (bp->b_npages) { int i; db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages); for (i = 0; i < bp->b_npages; i++) { vm_page_t m; m = bp->b_pages[i]; db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object, (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m)); if ((i + 1) < bp->b_npages) db_printf(","); } db_printf("\n"); } } #endif /* DDB */ Index: head/sys/kern/vfs_export.c 
=================================================================== --- head/sys/kern/vfs_export.c (revision 49534) +++ head/sys/kern/vfs_export.c (revision 49535) @@ -1,2978 +1,2976 @@ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 - * $Id: vfs_subr.c,v 1.213 1999/07/20 09:47:44 phk Exp $ + * $Id: vfs_subr.c,v 1.214 1999/07/26 06:25:17 alc Exp $ */ /* * External virtual filesystem routines */ #include "opt_ddb.h" #include #include -#include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include - -#include static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure"); static struct vnode *checkalias2 __P((struct vnode *nvp, dev_t dev, struct mount *mp)); static void insmntque __P((struct vnode *vp, struct mount *mp)); static void vclean __P((struct vnode *vp, int flags, struct proc *p)); static void vfree __P((struct vnode *)); static void vgonel __P((struct vnode *vp, struct proc *p)); static unsigned long numvnodes; SYSCTL_INT(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, ""); enum vtype iftovt_tab[16] = { VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD, }; int vttoif_tab[9] = { 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFSOCK, S_IFIFO, S_IFMT, }; static TAILQ_HEAD(freelst, vnode) vnode_free_list; /* vnode free list */ struct tobefreelist vnode_tobefree_list; /* vnode free list */ static u_long wantfreevnodes = 25; SYSCTL_INT(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, ""); static u_long freevnodes = 0; SYSCTL_INT(_debug, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, ""); static int reassignbufcalls; SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, ""); static int reassignbufloops; SYSCTL_INT(_vfs, OID_AUTO, reassignbufloops, CTLFLAG_RW, &reassignbufloops, 0, ""); static int reassignbufsortgood; SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortgood, CTLFLAG_RW, &reassignbufsortgood, 0, ""); static int reassignbufsortbad; SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW, &reassignbufsortbad, 0, ""); static int reassignbufmethod = 1; SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW, &reassignbufmethod, 0, ""); #ifdef ENABLE_VFS_IOOPT int vfs_ioopt = 0; SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, ""); #endif struct mntlist mountlist; /* mounted filesystem list */ struct simplelock mountlist_slock; struct simplelock mntvnode_slock; int nfs_mount_type = -1; #ifndef NULL_SIMPLELOCKS static struct simplelock mntid_slock; static struct simplelock vnode_free_list_slock; static struct simplelock spechash_slock; #endif struct nfs_public nfs_pub; /* publicly exported FS */ static vm_zone_t vnode_zone; /* * The workitem queue. 
*/ #define SYNCER_MAXDELAY 32 static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ time_t syncdelay = 30; /* max time to delay syncing data */ time_t filedelay = 30; /* time to delay syncing files */ SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, ""); time_t dirdelay = 29; /* time to delay syncing directories */ SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, ""); time_t metadelay = 28; /* time to delay syncing metadata */ SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, ""); static int rushjob; /* number of slots to run ASAP */ static int stat_rush_requests; /* number of times I/O speeded up */ SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, ""); static int syncer_delayno = 0; static long syncer_mask; LIST_HEAD(synclist, vnode); static struct synclist *syncer_workitem_pending; int desiredvnodes; SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW, &desiredvnodes, 0, "Maximum number of vnodes"); static void vfs_free_addrlist __P((struct netexport *nep)); static int vfs_free_netcred __P((struct radix_node *rn, void *w)); static int vfs_hang_addrlist __P((struct mount *mp, struct netexport *nep, struct export_args *argp)); /* * Initialize the vnode management data structures. */ void vntblinit() { desiredvnodes = maxproc + cnt.v_page_count / 4; simple_lock_init(&mntvnode_slock); simple_lock_init(&mntid_slock); simple_lock_init(&spechash_slock); TAILQ_INIT(&vnode_free_list); TAILQ_INIT(&vnode_tobefree_list); simple_lock_init(&vnode_free_list_slock); CIRCLEQ_INIT(&mountlist); vnode_zone = zinit("VNODE", sizeof (struct vnode), 0, 0, 5); /* * Initialize the filesystem syncer. */ syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE, &syncer_mask); syncer_maxdelay = syncer_mask + 1; } /* * Mark a mount point as busy. Used to synchronize access and to delay * unmounting. Interlock is not released on failure. */ int vfs_busy(mp, flags, interlkp, p) struct mount *mp; int flags; struct simplelock *interlkp; struct proc *p; { int lkflags; if (mp->mnt_kern_flag & MNTK_UNMOUNT) { if (flags & LK_NOWAIT) return (ENOENT); mp->mnt_kern_flag |= MNTK_MWAIT; if (interlkp) { simple_unlock(interlkp); } /* * Since all busy locks are shared except the exclusive * lock granted when unmounting, the only place that a * wakeup needs to be done is at the release of the * exclusive lock at the end of dounmount. */ tsleep((caddr_t)mp, PVFS, "vfs_busy", 0); if (interlkp) { simple_lock(interlkp); } return (ENOENT); } lkflags = LK_SHARED | LK_NOPAUSE; if (interlkp) lkflags |= LK_INTERLOCK; if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p)) panic("vfs_busy: unexpected lock failure"); return (0); } /* * Free a busy filesystem. */ void vfs_unbusy(mp, p) struct mount *mp; struct proc *p; { lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p); } /* * Lookup a filesystem type, and if found allocate and initialize * a mount structure for it. * * Devname is usually updated by mount(8) after booting. 
*/ int vfs_rootmountalloc(fstypename, devname, mpp) char *fstypename; char *devname; struct mount **mpp; { struct proc *p = curproc; /* XXX */ struct vfsconf *vfsp; struct mount *mp; if (fstypename == NULL) return (ENODEV); for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) if (!strcmp(vfsp->vfc_name, fstypename)) break; if (vfsp == NULL) return (ENODEV); mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK); bzero((char *)mp, (u_long)sizeof(struct mount)); lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE); (void)vfs_busy(mp, LK_NOWAIT, 0, p); LIST_INIT(&mp->mnt_vnodelist); mp->mnt_vfc = vfsp; mp->mnt_op = vfsp->vfc_vfsops; mp->mnt_flag = MNT_RDONLY; mp->mnt_vnodecovered = NULLVP; vfsp->vfc_refcount++; mp->mnt_stat.f_type = vfsp->vfc_typenum; mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK; strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN); mp->mnt_stat.f_mntonname[0] = '/'; mp->mnt_stat.f_mntonname[1] = 0; (void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0); *mpp = mp; return (0); } /* * Find an appropriate filesystem to use for the root. If a filesystem * has not been preselected, walk through the list of known filesystems * trying those that have mountroot routines, and try them until one * works or we have tried them all. */ #ifdef notdef /* XXX JH */ int lite2_vfs_mountroot() { struct vfsconf *vfsp; extern int (*lite2_mountroot) __P((void)); int error; if (lite2_mountroot != NULL) return ((*lite2_mountroot)()); for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) { if (vfsp->vfc_mountroot == NULL) continue; if ((error = (*vfsp->vfc_mountroot)()) == 0) return (0); printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error); } return (ENODEV); } #endif /* * Lookup a mount point by filesystem identifier. */ struct mount * vfs_getvfs(fsid) fsid_t *fsid; { register struct mount *mp; simple_lock(&mountlist_slock); for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = mp->mnt_list.cqe_next) { if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) { simple_unlock(&mountlist_slock); return (mp); } } simple_unlock(&mountlist_slock); return ((struct mount *) 0); } /* * Get a new unique fsid */ void vfs_getnewfsid(mp) struct mount *mp; { static u_short xxxfs_mntid; fsid_t tfsid; int mtype; simple_lock(&mntid_slock); mtype = mp->mnt_vfc->vfc_typenum; mp->mnt_stat.f_fsid.val[0] = makeudev(255, mtype); mp->mnt_stat.f_fsid.val[1] = mtype; if (xxxfs_mntid == 0) ++xxxfs_mntid; tfsid.val[0] = makeudev(255, mtype + (xxxfs_mntid << 16)); tfsid.val[1] = mtype; if (mountlist.cqh_first != (void *)&mountlist) { while (vfs_getvfs(&tfsid)) { xxxfs_mntid++; tfsid.val[0] = makeudev(255, mtype + (xxxfs_mntid << 16)); } } mp->mnt_stat.f_fsid.val[0] = tfsid.val[0]; simple_unlock(&mntid_slock); } /* * Set vnode attributes to VNOVAL */ void vattr_null(vap) register struct vattr *vap; { vap->va_type = VNON; vap->va_size = VNOVAL; vap->va_bytes = VNOVAL; vap->va_mode = VNOVAL; vap->va_nlink = VNOVAL; vap->va_uid = VNOVAL; vap->va_gid = VNOVAL; vap->va_fsid = VNOVAL; vap->va_fileid = VNOVAL; vap->va_blocksize = VNOVAL; vap->va_rdev = VNOVAL; vap->va_atime.tv_sec = VNOVAL; vap->va_atime.tv_nsec = VNOVAL; vap->va_mtime.tv_sec = VNOVAL; vap->va_mtime.tv_nsec = VNOVAL; vap->va_ctime.tv_sec = VNOVAL; vap->va_ctime.tv_nsec = VNOVAL; vap->va_flags = VNOVAL; vap->va_gen = VNOVAL; vap->va_vaflags = 0; } /* * Routines having to do with the management of the vnode table. 
*/ extern vop_t **dead_vnodeop_p; /* * Return the next vnode from the free list. */ int getnewvnode(tag, mp, vops, vpp) enum vtagtype tag; struct mount *mp; vop_t **vops; struct vnode **vpp; { int s; struct proc *p = curproc; /* XXX */ struct vnode *vp, *tvp, *nvp; vm_object_t object; TAILQ_HEAD(freelst, vnode) vnode_tmp_list; /* * We take the least recently used vnode from the freelist * if we can get it and it has no cached pages, and no * namecache entries are relative to it. * Otherwise we allocate a new vnode */ s = splbio(); simple_lock(&vnode_free_list_slock); TAILQ_INIT(&vnode_tmp_list); for (vp = TAILQ_FIRST(&vnode_tobefree_list); vp; vp = nvp) { nvp = TAILQ_NEXT(vp, v_freelist); TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist); if (vp->v_flag & VAGE) { TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); } else { TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); } vp->v_flag &= ~(VTBFREE|VAGE); vp->v_flag |= VFREE; if (vp->v_usecount) panic("tobe free vnode isn't"); freevnodes++; } if (wantfreevnodes && freevnodes < wantfreevnodes) { vp = NULL; } else if (!wantfreevnodes && freevnodes <= desiredvnodes) { /* * XXX: this is only here to be backwards compatible */ vp = NULL; } else { for (vp = TAILQ_FIRST(&vnode_free_list); vp; vp = nvp) { nvp = TAILQ_NEXT(vp, v_freelist); if (!simple_lock_try(&vp->v_interlock)) continue; if (vp->v_usecount) panic("free vnode isn't"); object = vp->v_object; if (object && (object->resident_page_count || object->ref_count)) { printf("object inconsistant state: RPC: %d, RC: %d\n", object->resident_page_count, object->ref_count); /* Don't recycle if it's caching some pages */ TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); TAILQ_INSERT_TAIL(&vnode_tmp_list, vp, v_freelist); continue; } else if (LIST_FIRST(&vp->v_cache_src)) { /* Don't recycle if active in the namecache */ simple_unlock(&vp->v_interlock); continue; } else { break; } } } for (tvp = TAILQ_FIRST(&vnode_tmp_list); tvp; tvp = nvp) { nvp = TAILQ_NEXT(tvp, v_freelist); TAILQ_REMOVE(&vnode_tmp_list, tvp, v_freelist); TAILQ_INSERT_TAIL(&vnode_free_list, tvp, v_freelist); simple_unlock(&tvp->v_interlock); } if (vp) { vp->v_flag |= VDOOMED; TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); freevnodes--; simple_unlock(&vnode_free_list_slock); cache_purge(vp); vp->v_lease = NULL; if (vp->v_type != VBAD) { vgonel(vp, p); } else { simple_unlock(&vp->v_interlock); } #ifdef INVARIANTS { int s; if (vp->v_data) panic("cleaned vnode isn't"); s = splbio(); if (vp->v_numoutput) panic("Clean vnode has pending I/O's"); splx(s); } #endif vp->v_flag = 0; vp->v_lastr = 0; vp->v_lastw = 0; vp->v_lasta = 0; vp->v_cstart = 0; vp->v_clen = 0; vp->v_socket = 0; vp->v_writecount = 0; /* XXX */ vp->v_maxio = 0; } else { simple_unlock(&vnode_free_list_slock); vp = (struct vnode *) zalloc(vnode_zone); bzero((char *) vp, sizeof *vp); simple_lock_init(&vp->v_interlock); vp->v_dd = vp; cache_purge(vp); LIST_INIT(&vp->v_cache_src); TAILQ_INIT(&vp->v_cache_dst); numvnodes++; } TAILQ_INIT(&vp->v_cleanblkhd); TAILQ_INIT(&vp->v_dirtyblkhd); vp->v_type = VNON; vp->v_tag = tag; vp->v_op = vops; insmntque(vp, mp); *vpp = vp; vp->v_usecount = 1; vp->v_data = 0; splx(s); vfs_object_create(vp, p, p->p_ucred); return (0); } /* * Move a vnode from one mount queue to another. */ static void insmntque(vp, mp) register struct vnode *vp; register struct mount *mp; { simple_lock(&mntvnode_slock); /* * Delete from old mount point vnode list, if on one. 
*/ if (vp->v_mount != NULL) LIST_REMOVE(vp, v_mntvnodes); /* * Insert into list of vnodes for the new mount point, if available. */ if ((vp->v_mount = mp) == NULL) { simple_unlock(&mntvnode_slock); return; } LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes); simple_unlock(&mntvnode_slock); } /* * Update outstanding I/O count and do wakeup if requested. */ void vwakeup(bp) register struct buf *bp; { register struct vnode *vp; bp->b_flags &= ~B_WRITEINPROG; if ((vp = bp->b_vp)) { vp->v_numoutput--; if (vp->v_numoutput < 0) panic("vwakeup: neg numoutput"); if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) { vp->v_flag &= ~VBWAIT; wakeup((caddr_t) &vp->v_numoutput); } } } /* * Flush out and invalidate all buffers associated with a vnode. * Called with the underlying object locked. */ int vinvalbuf(vp, flags, cred, p, slpflag, slptimeo) register struct vnode *vp; int flags; struct ucred *cred; struct proc *p; int slpflag, slptimeo; { register struct buf *bp; struct buf *nbp, *blist; int s, error; vm_object_t object; if (flags & V_SAVE) { s = splbio(); while (vp->v_numoutput) { vp->v_flag |= VBWAIT; error = tsleep((caddr_t)&vp->v_numoutput, slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo); if (error) { splx(s); return (error); } } if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) { splx(s); if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) != 0) return (error); s = splbio(); if (vp->v_numoutput > 0 || !TAILQ_EMPTY(&vp->v_dirtyblkhd)) panic("vinvalbuf: dirty bufs"); } splx(s); } s = splbio(); for (;;) { blist = TAILQ_FIRST(&vp->v_cleanblkhd); if (!blist) blist = TAILQ_FIRST(&vp->v_dirtyblkhd); if (!blist) break; for (bp = blist; bp; bp = nbp) { nbp = TAILQ_NEXT(bp, b_vnbufs); if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL, "vinvalbuf", slpflag, slptimeo); if (error == ENOLCK) break; splx(s); return (error); } /* * XXX Since there are no node locks for NFS, I * believe there is a slight chance that a delayed * write will occur while sleeping just above, so * check for it. Note that vfs_bio_awrite expects * buffers to reside on a queue, while VOP_BWRITE and * brelse do not. */ if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) && (flags & V_SAVE)) { if (bp->b_vp == vp) { if (bp->b_flags & B_CLUSTEROK) { BUF_UNLOCK(bp); vfs_bio_awrite(bp); } else { bremfree(bp); bp->b_flags |= B_ASYNC; VOP_BWRITE(bp->b_vp, bp); } } else { bremfree(bp); (void) VOP_BWRITE(bp->b_vp, bp); } break; } bremfree(bp); bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF); bp->b_flags &= ~B_ASYNC; brelse(bp); } } while (vp->v_numoutput > 0) { vp->v_flag |= VBWAIT; tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0); } splx(s); /* * Destroy the copy in the VM cache, too. */ simple_lock(&vp->v_interlock); object = vp->v_object; if (object != NULL) { vm_object_page_remove(object, 0, 0, (flags & V_SAVE) ? TRUE : FALSE); } simple_unlock(&vp->v_interlock); if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) || !TAILQ_EMPTY(&vp->v_cleanblkhd)) panic("vinvalbuf: flush failed"); return (0); } /* * Truncate a file's buffer and pages to a specified length. This * is in lieu of the old vinvalbuf mechanism, which performed unneeded * sync activity. */ int vtruncbuf(vp, cred, p, length, blksize) register struct vnode *vp; struct ucred *cred; struct proc *p; off_t length; int blksize; { register struct buf *bp; struct buf *nbp; int s, anyfreed; int trunclbn; /* * Round up to the *next* lbn. 
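	 * e.g. with blksize 8192, any length in 1..8192 yields trunclbn 1,
	 * so the buffer holding the new EOF (lbn 0) is kept and buffers at
	 * lbn 1 and above are invalidated below.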
*/ trunclbn = (length + blksize - 1) / blksize; s = splbio(); restart: anyfreed = 1; for (;anyfreed;) { anyfreed = 0; for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) { nbp = TAILQ_NEXT(bp, b_vnbufs); if (bp->b_lblkno >= trunclbn) { if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL); goto restart; } else { bremfree(bp); bp->b_flags |= (B_INVAL | B_RELBUF); bp->b_flags &= ~B_ASYNC; brelse(bp); anyfreed = 1; } if (nbp && (((nbp->b_xflags & B_VNCLEAN) == 0)|| (nbp->b_vp != vp) || (nbp->b_flags & B_DELWRI))) { goto restart; } } } for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { nbp = TAILQ_NEXT(bp, b_vnbufs); if (bp->b_lblkno >= trunclbn) { if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL); goto restart; } else { bremfree(bp); bp->b_flags |= (B_INVAL | B_RELBUF); bp->b_flags &= ~B_ASYNC; brelse(bp); anyfreed = 1; } if (nbp && (((nbp->b_xflags & B_VNDIRTY) == 0)|| (nbp->b_vp != vp) || (nbp->b_flags & B_DELWRI) == 0)) { goto restart; } } } } if (length > 0) { restartsync: for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { nbp = TAILQ_NEXT(bp, b_vnbufs); if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) { if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL); goto restart; } else { bremfree(bp); if (bp->b_vp == vp) { bp->b_flags |= B_ASYNC; } else { bp->b_flags &= ~B_ASYNC; } VOP_BWRITE(bp->b_vp, bp); } goto restartsync; } } } while (vp->v_numoutput > 0) { vp->v_flag |= VBWAIT; tsleep(&vp->v_numoutput, PVM, "vbtrunc", 0); } splx(s); vnode_pager_setsize(vp, length); return (0); } /* * Associate a buffer with a vnode. */ void bgetvp(vp, bp) register struct vnode *vp; register struct buf *bp; { int s; KASSERT(bp->b_vp == NULL, ("bgetvp: not free")); vhold(vp); bp->b_vp = vp; if (vp->v_type == VBLK || vp->v_type == VCHR) bp->b_dev = vp->v_rdev; else bp->b_dev = NODEV; /* * Insert onto list for new vnode. */ s = splbio(); bp->b_xflags |= B_VNCLEAN; bp->b_xflags &= ~B_VNDIRTY; TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs); splx(s); } /* * Disassociate a buffer from a vnode. */ void brelvp(bp) register struct buf *bp; { struct vnode *vp; struct buflists *listheadp; int s; KASSERT(bp->b_vp != NULL, ("brelvp: NULL")); /* * Delete from old vnode list, if on one. */ vp = bp->b_vp; s = splbio(); if (bp->b_xflags & (B_VNDIRTY|B_VNCLEAN)) { if (bp->b_xflags & B_VNDIRTY) listheadp = &vp->v_dirtyblkhd; else listheadp = &vp->v_cleanblkhd; TAILQ_REMOVE(listheadp, bp, b_vnbufs); bp->b_xflags &= ~(B_VNDIRTY|B_VNCLEAN); } if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) { vp->v_flag &= ~VONWORKLST; LIST_REMOVE(vp, v_synclist); } splx(s); bp->b_vp = (struct vnode *) 0; vdrop(vp); } /* * The workitem queue. * * It is useful to delay writes of file data and filesystem metadata * for tens of seconds so that quickly created and deleted files need * not waste disk bandwidth being created and removed. To realize this, * we append vnodes to a "workitem" queue. When running with a soft * updates implementation, most pending metadata dependencies should * not wait for more than a few seconds. Thus, mounted on block devices * are delayed only about a half the time that file data is delayed. * Similarly, directory updates are more critical, so are only delayed * about a third the time that file data is delayed. Thus, there are * SYNCER_MAXDELAY queues that are processed round-robin at a rate of * one each second (driven off the filesystem syncer process). 
The * syncer_delayno variable indicates the next queue that is to be processed. * Items that need to be processed soon are placed in this queue: * * syncer_workitem_pending[syncer_delayno] * * A delay of fifteen seconds is done by placing the request fifteen * entries later in the queue: * * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask] * */ /* * Add an item to the syncer work queue. */ static void vn_syncer_add_to_worklist(struct vnode *vp, int delay) { int s, slot; s = splbio(); if (vp->v_flag & VONWORKLST) { LIST_REMOVE(vp, v_synclist); } if (delay > syncer_maxdelay - 2) delay = syncer_maxdelay - 2; slot = (syncer_delayno + delay) & syncer_mask; LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist); vp->v_flag |= VONWORKLST; splx(s); } struct proc *updateproc; static void sched_sync __P((void)); static struct kproc_desc up_kp = { "syncer", sched_sync, &updateproc }; SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp) /* * System filesystem synchronizer daemon. */ void sched_sync(void) { struct synclist *slp; struct vnode *vp; long starttime; int s; struct proc *p = updateproc; p->p_flag |= P_BUFEXHAUST; for (;;) { starttime = time_second; /* * Push files whose dirty time has expired. Be careful * of interrupt race on slp queue. */ s = splbio(); slp = &syncer_workitem_pending[syncer_delayno]; syncer_delayno += 1; if (syncer_delayno == syncer_maxdelay) syncer_delayno = 0; splx(s); while ((vp = LIST_FIRST(slp)) != NULL) { if (VOP_ISLOCKED(vp) == 0) { vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); (void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p); VOP_UNLOCK(vp, 0, p); } s = splbio(); if (LIST_FIRST(slp) == vp) { /* * Note: v_tag VT_VFS vps can remain on the * worklist too with no dirty blocks, but * since sync_fsync() moves it to a different * slot we are safe. */ if (TAILQ_EMPTY(&vp->v_dirtyblkhd) && vp->v_type != VBLK) panic("sched_sync: fsync failed vp %p tag %d", vp, vp->v_tag); /* * Put us back on the worklist. The worklist * routine will remove us from our current * position and then add us back in at a later * position. */ vn_syncer_add_to_worklist(vp, syncdelay); } splx(s); } /* * Do soft update processing. */ if (bioops.io_sync) (*bioops.io_sync)(NULL); /* * The variable rushjob allows the kernel to speed up the * processing of the filesystem syncer process. A rushjob * value of N tells the filesystem syncer to process the next * N seconds worth of work on its queue ASAP. Currently rushjob * is used by the soft update code to speed up the filesystem * syncer process when the incore state is getting so far * ahead of the disk that the kernel memory pool is being * threatened with exhaustion. */ if (rushjob > 0) { rushjob -= 1; continue; } /* * If it has taken us less than a second to process the * current work, then wait. Otherwise start right over * again. We can still lose time if any single round * takes more than two seconds, but it does not really * matter as we are just trying to generally pace the * filesystem activity. */ if (time_second == starttime) tsleep(&lbolt, PPAUSE, "syncer", 0); } } /* * Request the syncer daemon to speed up its work. * We never push it to speed up more than half of its * normal turn time, otherwise it could take over the cpu. */ int speedup_syncer() { int s; s = splhigh(); if (updateproc->p_wchan == &lbolt) setrunnable(updateproc); splx(s); if (rushjob < syncdelay / 2) { rushjob += 1; stat_rush_requests += 1; return (1); } return(0); } /* * Associate a p-buffer with a vnode. 
* * Also sets B_PAGING flag to indicate that vnode is not fully associated * with the buffer. i.e. the bp has not been linked into the vnode or * ref-counted. */ void pbgetvp(vp, bp) register struct vnode *vp; register struct buf *bp; { KASSERT(bp->b_vp == NULL, ("pbgetvp: not free")); bp->b_vp = vp; bp->b_flags |= B_PAGING; if (vp->v_type == VBLK || vp->v_type == VCHR) bp->b_dev = vp->v_rdev; else bp->b_dev = NODEV; } /* * Disassociate a p-buffer from a vnode. */ void pbrelvp(bp) register struct buf *bp; { KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL")); #if !defined(MAX_PERF) /* XXX REMOVE ME */ if (bp->b_vnbufs.tqe_next != NULL) { panic( "relpbuf(): b_vp was probably reassignbuf()d %p %x", bp, (int)bp->b_flags ); } #endif bp->b_vp = (struct vnode *) 0; bp->b_flags &= ~B_PAGING; } void pbreassignbuf(bp, newvp) struct buf *bp; struct vnode *newvp; { #if !defined(MAX_PERF) if ((bp->b_flags & B_PAGING) == 0) { panic( "pbreassignbuf() on non phys bp %p", bp ); } #endif bp->b_vp = newvp; } /* * Reassign a buffer from one vnode to another. * Used to assign file specific control information * (indirect blocks) to the vnode to which they belong. */ void reassignbuf(bp, newvp) register struct buf *bp; register struct vnode *newvp; { struct buflists *listheadp; int delay; int s; if (newvp == NULL) { printf("reassignbuf: NULL"); return; } ++reassignbufcalls; #if !defined(MAX_PERF) /* * B_PAGING flagged buffers cannot be reassigned because their vp * is not fully linked in. */ if (bp->b_flags & B_PAGING) panic("cannot reassign paging buffer"); #endif s = splbio(); /* * Delete from old vnode list, if on one. */ if (bp->b_xflags & (B_VNDIRTY|B_VNCLEAN)) { if (bp->b_xflags & B_VNDIRTY) listheadp = &bp->b_vp->v_dirtyblkhd; else listheadp = &bp->b_vp->v_cleanblkhd; TAILQ_REMOVE(listheadp, bp, b_vnbufs); bp->b_xflags &= ~(B_VNDIRTY|B_VNCLEAN); if (bp->b_vp != newvp) { vdrop(bp->b_vp); bp->b_vp = NULL; /* for clarification */ } } /* * If dirty, put on list of dirty buffers; otherwise insert onto list * of clean buffers. */ if (bp->b_flags & B_DELWRI) { struct buf *tbp; listheadp = &newvp->v_dirtyblkhd; if ((newvp->v_flag & VONWORKLST) == 0) { switch (newvp->v_type) { case VDIR: delay = dirdelay; break; case VBLK: if (newvp->v_specmountpoint != NULL) { delay = metadelay; break; } /* fall through */ default: delay = filedelay; } vn_syncer_add_to_worklist(newvp, delay); } bp->b_xflags |= B_VNDIRTY; tbp = TAILQ_FIRST(listheadp); if (tbp == NULL || bp->b_lblkno == 0 || (bp->b_lblkno > 0 && bp->b_lblkno < tbp->b_lblkno)) { TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs); ++reassignbufsortgood; } else if (bp->b_lblkno < 0) { TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs); ++reassignbufsortgood; } else if (reassignbufmethod == 1) { /* * New sorting algorithm, only handle sequential case, * otherwise guess. 
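			 * i.e. if the logically preceding buffer
			 * (b_lblkno - 1) is already incore and on the dirty
			 * list, insert after it; otherwise guess: insert at
			 * the head and count it as a bad sort.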
*/ if ((tbp = gbincore(newvp, bp->b_lblkno - 1)) != NULL && (tbp->b_xflags & B_VNDIRTY)) { TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs); ++reassignbufsortgood; } else { TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs); ++reassignbufsortbad; } } else { /* * Old sorting algorithm, scan queue and insert */ struct buf *ttbp; while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) && (ttbp->b_lblkno < bp->b_lblkno)) { ++reassignbufloops; tbp = ttbp; } TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs); } } else { bp->b_xflags |= B_VNCLEAN; TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs); if ((newvp->v_flag & VONWORKLST) && TAILQ_EMPTY(&newvp->v_dirtyblkhd)) { newvp->v_flag &= ~VONWORKLST; LIST_REMOVE(newvp, v_synclist); } } if (bp->b_vp != newvp) { bp->b_vp = newvp; vhold(bp->b_vp); } splx(s); } /* * Create a vnode for a block device. * Used for mounting the root file system. */ int bdevvp(dev, vpp) dev_t dev; struct vnode **vpp; { register struct vnode *vp; struct vnode *nvp; int error; if (dev == NODEV) { *vpp = NULLVP; return (ENXIO); } error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp); if (error) { *vpp = NULLVP; return (error); } vp = nvp; /* dev2udev() results in a CDEV, so we need to cheat here. */ vp->v_type = VBLK; if ((nvp = checkalias2(vp, dev, (struct mount *)0)) != NULL) { vput(vp); vp = nvp; } *vpp = vp; return (0); } /* * Check to see if the new vnode represents a special device * for which we already have a vnode (either because of * bdevvp() or because of a different vnode representing * the same block device). If such an alias exists, deallocate * the existing contents and return the aliased vnode. The * caller is responsible for filling it with its new contents. */ struct vnode * checkalias(nvp, nvp_rdev, mp) register struct vnode *nvp; udev_t nvp_rdev; struct mount *mp; { dev_t dev; if (nvp->v_type != VBLK && nvp->v_type != VCHR) return (NULLVP); dev = udev2dev(nvp_rdev, nvp->v_type == VBLK ? 1 : 0); return (checkalias2(nvp, dev, mp)); } static struct vnode * checkalias2(nvp, dev, mp) register struct vnode *nvp; dev_t dev; struct mount *mp; { struct proc *p = curproc; /* XXX */ struct vnode *vp; struct vnode **vpp; if (nvp->v_type != VBLK && nvp->v_type != VCHR) return (NULLVP); vpp = &dev->si_hlist; loop: simple_lock(&spechash_slock); for (vp = *vpp; vp; vp = vp->v_specnext) { if (nvp->v_type != vp->v_type) continue; /* * Alias, but not in use, so flush it out. * Only alias active device nodes. * Not sure why we don't re-use this like we do below. */ simple_lock(&vp->v_interlock); if (vp->v_usecount == 0) { simple_unlock(&spechash_slock); vgonel(vp, p); goto loop; } if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, p)) { /* * It dissappeared, and we may have slept. * Restart from the beginning */ simple_unlock(&spechash_slock); goto loop; } break; } /* * It would be a lot clearer what is going on here if * this had been expressed as: * if ( vp && (vp->v_tag == VT_NULL)) * and the clauses had been swapped. */ if (vp == NULL || vp->v_tag != VT_NON) { struct specinfo *sinfo; /* * Put the new vnode into the hash chain. * and if there was an alias, connect them. */ nvp->v_specnext = *vpp; *vpp = nvp; nvp->v_specinfo = sinfo = dev; simple_unlock(&spechash_slock); if (vp != NULLVP) { nvp->v_flag |= VALIASED; vp->v_flag |= VALIASED; vput(vp); } return (NULLVP); } /* * if ( vp && (vp->v_tag == VT_NULL)) * We have a vnode alias, but it is a trashed. * Make it look like it's newly allocated. (by getnewvnode()) * The caller should use this instead. 
*/ simple_unlock(&spechash_slock); VOP_UNLOCK(vp, 0, p); simple_lock(&vp->v_interlock); vclean(vp, 0, p); vp->v_op = nvp->v_op; vp->v_tag = nvp->v_tag; nvp->v_type = VNON; insmntque(vp, mp); return (vp); } /* * Grab a particular vnode from the free list, increment its * reference count and lock it. The vnode lock bit is set if the * vnode is being eliminated in vgone. The process is awakened * when the transition is completed, and an error returned to * indicate that the vnode is no longer usable (possibly having * been changed to a new file system type). */ int vget(vp, flags, p) register struct vnode *vp; int flags; struct proc *p; { int error; /* * If the vnode is in the process of being cleaned out for * another use, we wait for the cleaning to finish and then * return failure. Cleaning is determined by checking that * the VXLOCK flag is set. */ if ((flags & LK_INTERLOCK) == 0) { simple_lock(&vp->v_interlock); } if (vp->v_flag & VXLOCK) { vp->v_flag |= VXWANT; simple_unlock(&vp->v_interlock); tsleep((caddr_t)vp, PINOD, "vget", 0); return (ENOENT); } vp->v_usecount++; if (VSHOULDBUSY(vp)) vbusy(vp); if (flags & LK_TYPE_MASK) { if ((error = vn_lock(vp, flags | LK_INTERLOCK, p)) != 0) { /* * must expand vrele here because we do not want * to call VOP_INACTIVE if the reference count * drops back to zero since it was never really * active. We must remove it from the free list * before sleeping so that multiple processes do * not try to recycle it. */ simple_lock(&vp->v_interlock); vp->v_usecount--; if (VSHOULDFREE(vp)) vfree(vp); simple_unlock(&vp->v_interlock); } return (error); } simple_unlock(&vp->v_interlock); return (0); } void vref(struct vnode *vp) { simple_lock(&vp->v_interlock); vp->v_usecount++; simple_unlock(&vp->v_interlock); } /* * Vnode put/release. * If count drops to zero, call inactive routine and return to freelist. */ void vrele(vp) struct vnode *vp; { struct proc *p = curproc; /* XXX */ KASSERT(vp != NULL, ("vrele: null vp")); simple_lock(&vp->v_interlock); if (vp->v_usecount > 1) { vp->v_usecount--; simple_unlock(&vp->v_interlock); return; } if (vp->v_usecount == 1) { vp->v_usecount--; if (VSHOULDFREE(vp)) vfree(vp); /* * If we are doing a vput, the node is already locked, and we must * call VOP_INACTIVE with the node locked. So, in the case of * vrele, we explicitly lock the vnode before calling VOP_INACTIVE. */ if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0) { VOP_INACTIVE(vp, p); } } else { #ifdef DIAGNOSTIC vprint("vrele: negative ref count", vp); simple_unlock(&vp->v_interlock); #endif panic("vrele: negative ref cnt"); } } void vput(vp) struct vnode *vp; { struct proc *p = curproc; /* XXX */ KASSERT(vp != NULL, ("vput: null vp")); simple_lock(&vp->v_interlock); if (vp->v_usecount > 1) { vp->v_usecount--; VOP_UNLOCK(vp, LK_INTERLOCK, p); return; } if (vp->v_usecount == 1) { vp->v_usecount--; if (VSHOULDFREE(vp)) vfree(vp); /* * If we are doing a vput, the node is already locked, and we must * call VOP_INACTIVE with the node locked. So, in the case of * vrele, we explicitly lock the vnode before calling VOP_INACTIVE. */ simple_unlock(&vp->v_interlock); VOP_INACTIVE(vp, p); } else { #ifdef DIAGNOSTIC vprint("vput: negative ref count", vp); #endif panic("vput: negative ref cnt"); } } /* * Somebody doesn't want the vnode recycled. */ void vhold(vp) register struct vnode *vp; { int s; s = splbio(); vp->v_holdcnt++; if (VSHOULDBUSY(vp)) vbusy(vp); splx(s); } /* * One less who cares about this vnode. 
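 * Each vhold() is expected to be balanced by a vdrop(); once the hold
 * count returns to zero and the vnode is otherwise unreferenced it is
 * handed back to the free list via vfree() below.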
*/ void vdrop(vp) register struct vnode *vp; { int s; s = splbio(); if (vp->v_holdcnt <= 0) panic("vdrop: holdcnt"); vp->v_holdcnt--; if (VSHOULDFREE(vp)) vfree(vp); splx(s); } /* * Remove any vnodes in the vnode table belonging to mount point mp. * * If MNT_NOFORCE is specified, there should not be any active ones, * return error if any are found (nb: this is a user error, not a * system error). If MNT_FORCE is specified, detach any active vnodes * that are found. */ #ifdef DIAGNOSTIC static int busyprt = 0; /* print out busy vnodes */ SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, ""); #endif int vflush(mp, skipvp, flags) struct mount *mp; struct vnode *skipvp; int flags; { struct proc *p = curproc; /* XXX */ struct vnode *vp, *nvp; int busy = 0; simple_lock(&mntvnode_slock); loop: for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) { /* * Make sure this vnode wasn't reclaimed in getnewvnode(). * Start over if it has (it won't be on the list anymore). */ if (vp->v_mount != mp) goto loop; nvp = vp->v_mntvnodes.le_next; /* * Skip over a selected vnode. */ if (vp == skipvp) continue; simple_lock(&vp->v_interlock); /* * Skip over a vnodes marked VSYSTEM. */ if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) { simple_unlock(&vp->v_interlock); continue; } /* * If WRITECLOSE is set, only flush out regular file vnodes * open for writing. */ if ((flags & WRITECLOSE) && (vp->v_writecount == 0 || vp->v_type != VREG)) { simple_unlock(&vp->v_interlock); continue; } /* * With v_usecount == 0, all we need to do is clear out the * vnode data structures and we are done. */ if (vp->v_usecount == 0) { simple_unlock(&mntvnode_slock); vgonel(vp, p); simple_lock(&mntvnode_slock); continue; } /* * If FORCECLOSE is set, forcibly close the vnode. For block * or character devices, revert to an anonymous device. For * all other files, just kill them. */ if (flags & FORCECLOSE) { simple_unlock(&mntvnode_slock); if (vp->v_type != VBLK && vp->v_type != VCHR) { vgonel(vp, p); } else { vclean(vp, 0, p); vp->v_op = spec_vnodeop_p; insmntque(vp, (struct mount *) 0); } simple_lock(&mntvnode_slock); continue; } #ifdef DIAGNOSTIC if (busyprt) vprint("vflush: busy vnode", vp); #endif simple_unlock(&vp->v_interlock); busy++; } simple_unlock(&mntvnode_slock); if (busy) return (EBUSY); return (0); } /* * Disassociate the underlying file system from a vnode. */ static void vclean(vp, flags, p) struct vnode *vp; int flags; struct proc *p; { int active; vm_object_t obj; /* * Check to see if the vnode is in use. If so we have to reference it * before we clean it out so that its count cannot fall to zero and * generate a race against ourselves to recycle it. */ if ((active = vp->v_usecount)) vp->v_usecount++; /* * Prevent the vnode from being recycled or brought into use while we * clean it out. */ if (vp->v_flag & VXLOCK) panic("vclean: deadlock"); vp->v_flag |= VXLOCK; /* * Even if the count is zero, the VOP_INACTIVE routine may still * have the object locked while it cleans it out. The VOP_LOCK * ensures that the VOP_INACTIVE routine is done with its work. * For active vnodes, it ensures that no other activity can * occur while the underlying object is being cleaned out. */ VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p); /* * Clean out any buffers associated with the vnode. */ vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0); if ((obj = vp->v_object) != NULL) { if (obj->ref_count == 0) { /* * This is a normal way of shutting down the object/vnode * association. 
*/ vm_object_terminate(obj); } else { /* * Woe to the process that tries to page now :-). */ vm_pager_deallocate(obj); } } /* * If purging an active vnode, it must be closed and * deactivated before being reclaimed. Note that the * VOP_INACTIVE will unlock the vnode. */ if (active) { if (flags & DOCLOSE) VOP_CLOSE(vp, FNONBLOCK, NOCRED, p); VOP_INACTIVE(vp, p); } else { /* * Any other processes trying to obtain this lock must first * wait for VXLOCK to clear, then call the new lock operation. */ VOP_UNLOCK(vp, 0, p); } /* * Reclaim the vnode. */ if (VOP_RECLAIM(vp, p)) panic("vclean: cannot reclaim"); if (active) vrele(vp); cache_purge(vp); if (vp->v_vnlock) { #if 0 /* This is the only place we have LK_DRAINED in the entire kernel ??? */ #ifdef DIAGNOSTIC if ((vp->v_vnlock->lk_flags & LK_DRAINED) == 0) vprint("vclean: lock not drained", vp); #endif #endif FREE(vp->v_vnlock, M_VNODE); vp->v_vnlock = NULL; } if (VSHOULDFREE(vp)) vfree(vp); /* * Done with purge, notify sleepers of the grim news. */ vp->v_op = dead_vnodeop_p; vn_pollgone(vp); vp->v_tag = VT_NON; vp->v_flag &= ~VXLOCK; if (vp->v_flag & VXWANT) { vp->v_flag &= ~VXWANT; wakeup((caddr_t) vp); } } /* * Eliminate all activity associated with the requested vnode * and with all vnodes aliased to the requested vnode. */ int vop_revoke(ap) struct vop_revoke_args /* { struct vnode *a_vp; int a_flags; } */ *ap; { struct vnode *vp, *vq; struct proc *p = curproc; /* XXX */ KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke")); vp = ap->a_vp; simple_lock(&vp->v_interlock); if (vp->v_flag & VALIASED) { /* * If a vgone (or vclean) is already in progress, * wait until it is done and return. */ if (vp->v_flag & VXLOCK) { vp->v_flag |= VXWANT; simple_unlock(&vp->v_interlock); tsleep((caddr_t)vp, PINOD, "vop_revokeall", 0); return (0); } /* * Ensure that vp will not be vgone'd while we * are eliminating its aliases. */ vp->v_flag |= VXLOCK; simple_unlock(&vp->v_interlock); while (vp->v_flag & VALIASED) { simple_lock(&spechash_slock); for (vq = vp->v_hashchain; vq; vq = vq->v_specnext) { if (vq->v_type != vp->v_type || vp == vq) continue; simple_unlock(&spechash_slock); vgone(vq); break; } if (vq == NULLVP) { simple_unlock(&spechash_slock); } } /* * Remove the lock so that vgone below will * really eliminate the vnode after which time * vgone will awaken any sleepers. */ simple_lock(&vp->v_interlock); vp->v_flag &= ~VXLOCK; if (vp->v_flag & VXWANT) { vp->v_flag &= ~VXWANT; wakeup(vp); } } vgonel(vp, p); return (0); } /* * Recycle an unused vnode to the front of the free list. * Release the passed interlock if the vnode will be recycled. */ int vrecycle(vp, inter_lkp, p) struct vnode *vp; struct simplelock *inter_lkp; struct proc *p; { simple_lock(&vp->v_interlock); if (vp->v_usecount == 0) { if (inter_lkp) { simple_unlock(inter_lkp); } vgonel(vp, p); return (1); } simple_unlock(&vp->v_interlock); return (0); } /* * Eliminate all activity associated with a vnode * in preparation for reuse. */ void vgone(vp) register struct vnode *vp; { struct proc *p = curproc; /* XXX */ simple_lock(&vp->v_interlock); vgonel(vp, p); } /* * vgone, with the vp interlock held. */ static void vgonel(vp, p) struct vnode *vp; struct proc *p; { int s; struct vnode *vq; struct vnode *vx; /* * If a vgone (or vclean) is already in progress, * wait until it is done and return. */ if (vp->v_flag & VXLOCK) { vp->v_flag |= VXWANT; simple_unlock(&vp->v_interlock); tsleep((caddr_t)vp, PINOD, "vgone", 0); return; } /* * Clean out the filesystem specific data. 
*/ vclean(vp, DOCLOSE, p); simple_lock(&vp->v_interlock); /* * Delete from old mount point vnode list, if on one. */ if (vp->v_mount != NULL) insmntque(vp, (struct mount *)0); /* * If special device, remove it from special device alias list * if it is on one. */ if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_specinfo != 0) { simple_lock(&spechash_slock); if (vp->v_hashchain == vp) { vp->v_hashchain = vp->v_specnext; } else { for (vq = vp->v_hashchain; vq; vq = vq->v_specnext) { if (vq->v_specnext != vp) continue; vq->v_specnext = vp->v_specnext; break; } if (vq == NULL) panic("missing bdev"); } if (vp->v_flag & VALIASED) { vx = NULL; for (vq = vp->v_hashchain; vq; vq = vq->v_specnext) { if (vq->v_type != vp->v_type) continue; if (vx) break; vx = vq; } if (vx == NULL) panic("missing alias"); if (vq == NULL) vx->v_flag &= ~VALIASED; vp->v_flag &= ~VALIASED; } simple_unlock(&spechash_slock); vp->v_specinfo = NULL; } /* * If it is on the freelist and not already at the head, * move it to the head of the list. The test of the back * pointer and the reference count of zero is because * it will be removed from the free list by getnewvnode, * but will not have its reference count incremented until * after calling vgone. If the reference count were * incremented first, vgone would (incorrectly) try to * close the previous instance of the underlying object. */ if (vp->v_usecount == 0 && !(vp->v_flag & VDOOMED)) { s = splbio(); simple_lock(&vnode_free_list_slock); if (vp->v_flag & VFREE) { TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); } else if (vp->v_flag & VTBFREE) { TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist); vp->v_flag &= ~VTBFREE; freevnodes++; } else freevnodes++; vp->v_flag |= VFREE; TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); simple_unlock(&vnode_free_list_slock); splx(s); } vp->v_type = VBAD; simple_unlock(&vp->v_interlock); } /* * Lookup a vnode by device number. */ int vfinddev(dev, type, vpp) dev_t dev; enum vtype type; struct vnode **vpp; { register struct vnode *vp; int rc = 0; simple_lock(&spechash_slock); for (vp = dev->si_hlist; vp; vp = vp->v_specnext) { if (type != vp->v_type) continue; *vpp = vp; rc = 1; break; } simple_unlock(&spechash_slock); return (rc); } /* * Calculate the total number of references to a special device. */ int vcount(vp) register struct vnode *vp; { struct vnode *vq, *vnext; int count; loop: if ((vp->v_flag & VALIASED) == 0) return (vp->v_usecount); simple_lock(&spechash_slock); for (count = 0, vq = vp->v_hashchain; vq; vq = vnext) { vnext = vq->v_specnext; if (vq->v_type != vp->v_type) continue; /* * Alias, but not in use, so flush it out. */ if (vq->v_usecount == 0 && vq != vp) { simple_unlock(&spechash_slock); vgone(vq); goto loop; } count += vq->v_usecount; } simple_unlock(&spechash_slock); return (count); } /* * Print out a description of a vnode. 
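 * Used from diagnostic paths: vflush() prints busy vnodes through
 * vprint("vflush: busy vnode", vp) when the busyprt knob is set, and
 * the DDB "show lockedvnodes" command below uses it as well.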
*/ static char *typename[] = {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"}; void vprint(label, vp) char *label; register struct vnode *vp; { char buf[96]; if (label != NULL) printf("%s: %p: ", label, (void *)vp); else printf("%p: ", (void *)vp); printf("type %s, usecount %d, writecount %d, refcount %d,", typename[vp->v_type], vp->v_usecount, vp->v_writecount, vp->v_holdcnt); buf[0] = '\0'; if (vp->v_flag & VROOT) strcat(buf, "|VROOT"); if (vp->v_flag & VTEXT) strcat(buf, "|VTEXT"); if (vp->v_flag & VSYSTEM) strcat(buf, "|VSYSTEM"); if (vp->v_flag & VXLOCK) strcat(buf, "|VXLOCK"); if (vp->v_flag & VXWANT) strcat(buf, "|VXWANT"); if (vp->v_flag & VBWAIT) strcat(buf, "|VBWAIT"); if (vp->v_flag & VALIASED) strcat(buf, "|VALIASED"); if (vp->v_flag & VDOOMED) strcat(buf, "|VDOOMED"); if (vp->v_flag & VFREE) strcat(buf, "|VFREE"); if (vp->v_flag & VOBJBUF) strcat(buf, "|VOBJBUF"); if (buf[0] != '\0') printf(" flags (%s)", &buf[1]); if (vp->v_data == NULL) { printf("\n"); } else { printf("\n\t"); VOP_PRINT(vp); } } #ifdef DDB #include /* * List all of the locked vnodes in the system. * Called when debugging the kernel. */ DB_SHOW_COMMAND(lockedvnodes, lockedvnodes) { struct proc *p = curproc; /* XXX */ struct mount *mp, *nmp; struct vnode *vp; printf("Locked vnodes\n"); simple_lock(&mountlist_slock); for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) { if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) { nmp = mp->mnt_list.cqe_next; continue; } for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = vp->v_mntvnodes.le_next) { if (VOP_ISLOCKED(vp)) vprint((char *)0, vp); } simple_lock(&mountlist_slock); nmp = mp->mnt_list.cqe_next; vfs_unbusy(mp, p); } simple_unlock(&mountlist_slock); } #endif /* * Top level filesystem related information gathering. */ static int sysctl_ovfs_conf __P(SYSCTL_HANDLER_ARGS); static int vfs_sysctl SYSCTL_HANDLER_ARGS { int *name = (int *)arg1 - 1; /* XXX */ u_int namelen = arg2 + 1; /* XXX */ struct vfsconf *vfsp; #if 1 || defined(COMPAT_PRELITE2) /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. 
*/ if (namelen == 1) return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); #endif #ifdef notyet /* all sysctl names at this level are at least name and field */ if (namelen < 2) return (ENOTDIR); /* overloaded */ if (name[0] != VFS_GENERIC) { for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) if (vfsp->vfc_typenum == name[0]) break; if (vfsp == NULL) return (EOPNOTSUPP); return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1, oldp, oldlenp, newp, newlen, p)); } #endif switch (name[1]) { case VFS_MAXTYPENUM: if (namelen != 2) return (ENOTDIR); return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); case VFS_CONF: if (namelen != 3) return (ENOTDIR); /* overloaded */ for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) if (vfsp->vfc_typenum == name[2]) break; if (vfsp == NULL) return (EOPNOTSUPP); return (SYSCTL_OUT(req, vfsp, sizeof *vfsp)); } return (EOPNOTSUPP); } SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl, "Generic filesystem"); #if 1 || defined(COMPAT_PRELITE2) static int sysctl_ovfs_conf SYSCTL_HANDLER_ARGS { int error; struct vfsconf *vfsp; struct ovfsconf ovfs; for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) { ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ strcpy(ovfs.vfc_name, vfsp->vfc_name); ovfs.vfc_index = vfsp->vfc_typenum; ovfs.vfc_refcount = vfsp->vfc_refcount; ovfs.vfc_flags = vfsp->vfc_flags; error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); if (error) return error; } return 0; } #endif /* 1 || COMPAT_PRELITE2 */ #if 0 #define KINFO_VNODESLOP 10 /* * Dump vnode list (via sysctl). * Copyout address of vnode followed by vnode. */ /* ARGSUSED */ static int sysctl_vnode SYSCTL_HANDLER_ARGS { struct proc *p = curproc; /* XXX */ struct mount *mp, *nmp; struct vnode *nvp, *vp; int error; #define VPTRSZ sizeof (struct vnode *) #define VNODESZ sizeof (struct vnode) req->lock = 0; if (!req->oldptr) /* Make an estimate */ return (SYSCTL_OUT(req, 0, (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ))); simple_lock(&mountlist_slock); for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) { if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) { nmp = mp->mnt_list.cqe_next; continue; } again: simple_lock(&mntvnode_slock); for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) { /* * Check that the vp is still associated with * this filesystem. RACE: could have been * recycled onto the same filesystem. */ if (vp->v_mount != mp) { simple_unlock(&mntvnode_slock); goto again; } nvp = vp->v_mntvnodes.le_next; simple_unlock(&mntvnode_slock); if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) || (error = SYSCTL_OUT(req, vp, VNODESZ))) return (error); simple_lock(&mntvnode_slock); } simple_unlock(&mntvnode_slock); simple_lock(&mountlist_slock); nmp = mp->mnt_list.cqe_next; vfs_unbusy(mp, p); } simple_unlock(&mountlist_slock); return (0); } #endif /* * XXX * Exporting the vnode list on large systems causes them to crash. * Exporting the vnode list on medium systems causes sysctl to coredump. */ #if 0 SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD, 0, 0, sysctl_vnode, "S,vnode", ""); #endif /* * Check to see if a filesystem is mounted on a block device. 
*/ int vfs_mountedon(vp) struct vnode *vp; { struct vnode *vq; int error = 0; if (vp->v_specmountpoint != NULL) return (EBUSY); if (vp->v_flag & VALIASED) { simple_lock(&spechash_slock); for (vq = vp->v_hashchain; vq; vq = vq->v_specnext) { if (vq->v_type != vp->v_type) continue; if (vq->v_specmountpoint != NULL) { error = EBUSY; break; } } simple_unlock(&spechash_slock); } return (error); } /* * Unmount all filesystems. The list is traversed in reverse order * of mounting to avoid dependencies. */ void vfs_unmountall() { struct mount *mp, *nmp; struct proc *p; int error; if (curproc != NULL) p = curproc; else p = initproc; /* XXX XXX should this be proc0? */ /* * Since this only runs when rebooting, it is not interlocked. */ for (mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) { nmp = mp->mnt_list.cqe_prev; error = dounmount(mp, MNT_FORCE, p); if (error) { printf("unmount of %s failed (", mp->mnt_stat.f_mntonname); if (error == EBUSY) printf("BUSY)\n"); else printf("%d)\n", error); } } } /* * Build hash lists of net addresses and hang them off the mount point. * Called by ufs_mount() to set up the lists of export addresses. */ static int vfs_hang_addrlist(mp, nep, argp) struct mount *mp; struct netexport *nep; struct export_args *argp; { register struct netcred *np; register struct radix_node_head *rnh; register int i; struct radix_node *rn; struct sockaddr *saddr, *smask = 0; struct domain *dom; int error; if (argp->ex_addrlen == 0) { if (mp->mnt_flag & MNT_DEFEXPORTED) return (EPERM); np = &nep->ne_defexported; np->netc_exflags = argp->ex_flags; np->netc_anon = argp->ex_anon; np->netc_anon.cr_ref = 1; mp->mnt_flag |= MNT_DEFEXPORTED; return (0); } i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen; np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK); bzero((caddr_t) np, i); saddr = (struct sockaddr *) (np + 1); if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen))) goto out; if (saddr->sa_len > argp->ex_addrlen) saddr->sa_len = argp->ex_addrlen; if (argp->ex_masklen) { smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen); error = copyin(argp->ex_mask, (caddr_t) smask, argp->ex_masklen); if (error) goto out; if (smask->sa_len > argp->ex_masklen) smask->sa_len = argp->ex_masklen; } i = saddr->sa_family; if ((rnh = nep->ne_rtable[i]) == 0) { /* * Seems silly to initialize every AF when most are not used, * do so on demand here */ for (dom = domains; dom; dom = dom->dom_next) if (dom->dom_family == i && dom->dom_rtattach) { dom->dom_rtattach((void **) &nep->ne_rtable[i], dom->dom_rtoffset); break; } if ((rnh = nep->ne_rtable[i]) == 0) { error = ENOBUFS; goto out; } } rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh, np->netc_rnodes); if (rn == 0 || np != (struct netcred *) rn) { /* already exists */ error = EPERM; goto out; } np->netc_exflags = argp->ex_flags; np->netc_anon = argp->ex_anon; np->netc_anon.cr_ref = 1; return (0); out: free(np, M_NETADDR); return (error); } /* ARGSUSED */ static int vfs_free_netcred(rn, w) struct radix_node *rn; void *w; { register struct radix_node_head *rnh = (struct radix_node_head *) w; (*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh); free((caddr_t) rn, M_NETADDR); return (0); } /* * Free the net address hash lists that are hanging off the mount points. 
*/ static void vfs_free_addrlist(nep) struct netexport *nep; { register int i; register struct radix_node_head *rnh; for (i = 0; i <= AF_MAX; i++) if ((rnh = nep->ne_rtable[i])) { (*rnh->rnh_walktree) (rnh, vfs_free_netcred, (caddr_t) rnh); free((caddr_t) rnh, M_RTABLE); nep->ne_rtable[i] = 0; } } int vfs_export(mp, nep, argp) struct mount *mp; struct netexport *nep; struct export_args *argp; { int error; if (argp->ex_flags & MNT_DELEXPORT) { if (mp->mnt_flag & MNT_EXPUBLIC) { vfs_setpublicfs(NULL, NULL, NULL); mp->mnt_flag &= ~MNT_EXPUBLIC; } vfs_free_addrlist(nep); mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED); } if (argp->ex_flags & MNT_EXPORTED) { if (argp->ex_flags & MNT_EXPUBLIC) { if ((error = vfs_setpublicfs(mp, nep, argp)) != 0) return (error); mp->mnt_flag |= MNT_EXPUBLIC; } if ((error = vfs_hang_addrlist(mp, nep, argp))) return (error); mp->mnt_flag |= MNT_EXPORTED; } return (0); } /* * Set the publicly exported filesystem (WebNFS). Currently, only * one public filesystem is possible in the spec (RFC 2054 and 2055) */ int vfs_setpublicfs(mp, nep, argp) struct mount *mp; struct netexport *nep; struct export_args *argp; { int error; struct vnode *rvp; char *cp; /* * mp == NULL -> invalidate the current info, the FS is * no longer exported. May be called from either vfs_export * or unmount, so check if it hasn't already been done. */ if (mp == NULL) { if (nfs_pub.np_valid) { nfs_pub.np_valid = 0; if (nfs_pub.np_index != NULL) { FREE(nfs_pub.np_index, M_TEMP); nfs_pub.np_index = NULL; } } return (0); } /* * Only one allowed at a time. */ if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount) return (EBUSY); /* * Get real filehandle for root of exported FS. */ bzero((caddr_t)&nfs_pub.np_handle, sizeof(nfs_pub.np_handle)); nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid; if ((error = VFS_ROOT(mp, &rvp))) return (error); if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid))) return (error); vput(rvp); /* * If an indexfile was specified, pull it in. */ if (argp->ex_indexfile != NULL) { MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP, M_WAITOK); error = copyinstr(argp->ex_indexfile, nfs_pub.np_index, MAXNAMLEN, (size_t *)0); if (!error) { /* * Check for illegal filenames. */ for (cp = nfs_pub.np_index; *cp; cp++) { if (*cp == '/') { error = EINVAL; break; } } } if (error) { FREE(nfs_pub.np_index, M_TEMP); return (error); } } nfs_pub.np_mount = mp; nfs_pub.np_valid = 1; return (0); } struct netcred * vfs_export_lookup(mp, nep, nam) register struct mount *mp; struct netexport *nep; struct sockaddr *nam; { register struct netcred *np; register struct radix_node_head *rnh; struct sockaddr *saddr; np = NULL; if (mp->mnt_flag & MNT_EXPORTED) { /* * Lookup in the export list first. */ if (nam != NULL) { saddr = nam; rnh = nep->ne_rtable[saddr->sa_family]; if (rnh != NULL) { np = (struct netcred *) (*rnh->rnh_matchaddr)((caddr_t)saddr, rnh); if (np && np->netc_rnodes->rn_flags & RNF_ROOT) np = NULL; } } /* * If no address match, use the default if it exists. */ if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED) np = &nep->ne_defexported; } return (np); } /* * perform msync on all vnodes under a mount point * the mount point must be locked. */ void vfs_msync(struct mount *mp, int flags) { struct vnode *vp, *nvp; struct vm_object *obj; int anyio, tries; tries = 5; loop: anyio = 0; for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) { nvp = vp->v_mntvnodes.le_next; if (vp->v_mount != mp) { goto loop; } if (vp->v_flag & VXLOCK) /* XXX: what if MNT_WAIT? 
*/ continue; if (flags != MNT_WAIT) { obj = vp->v_object; if (obj == NULL || (obj->flags & OBJ_MIGHTBEDIRTY) == 0) continue; if (VOP_ISLOCKED(vp)) continue; } simple_lock(&vp->v_interlock); if (vp->v_object && (vp->v_object->flags & OBJ_MIGHTBEDIRTY)) { if (!vget(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, curproc)) { if (vp->v_object) { vm_object_page_clean(vp->v_object, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : 0); anyio = 1; } vput(vp); } } else { simple_unlock(&vp->v_interlock); } } if (anyio && (--tries > 0)) goto loop; } /* * Create the VM object needed for VMIO and mmap support. This * is done for all VREG files in the system. Some filesystems might * afford the additional metadata buffering capability of the * VMIO code by making the device node be VMIO mode also. * * vp must be locked when vfs_object_create is called. */ int vfs_object_create(vp, p, cred) struct vnode *vp; struct proc *p; struct ucred *cred; { struct vattr vat; vm_object_t object; int error = 0; if (vp->v_type != VBLK && vn_canvmio(vp) == FALSE) return 0; retry: if ((object = vp->v_object) == NULL) { if (vp->v_type == VREG || vp->v_type == VDIR) { if ((error = VOP_GETATTR(vp, &vat, cred, p)) != 0) goto retn; object = vnode_pager_alloc(vp, vat.va_size, 0, 0); } else if (bdevsw(vp->v_rdev) != NULL) { /* * This simply allocates the biggest object possible * for a VBLK vnode. This should be fixed, but doesn't * cause any problems (yet). */ object = vnode_pager_alloc(vp, IDX_TO_OFF(INT_MAX), 0, 0); } else { goto retn; } /* * Dereference the reference we just created. This assumes * that the object is associated with the vp. */ object->ref_count--; vp->v_usecount--; } else { if (object->flags & OBJ_DEAD) { VOP_UNLOCK(vp, 0, p); tsleep(object, PVM, "vodead", 0); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); goto retry; } } KASSERT(vp->v_object != NULL, ("vfs_object_create: NULL object")); vp->v_flag |= VOBJBUF; retn: return error; } static void vfree(vp) struct vnode *vp; { int s; s = splbio(); simple_lock(&vnode_free_list_slock); if (vp->v_flag & VTBFREE) { TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist); vp->v_flag &= ~VTBFREE; } if (vp->v_flag & VAGE) { TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); } else { TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); } freevnodes++; simple_unlock(&vnode_free_list_slock); vp->v_flag &= ~VAGE; vp->v_flag |= VFREE; splx(s); } void vbusy(vp) struct vnode *vp; { int s; s = splbio(); simple_lock(&vnode_free_list_slock); if (vp->v_flag & VTBFREE) { TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist); vp->v_flag &= ~VTBFREE; } else { TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); freevnodes--; } simple_unlock(&vnode_free_list_slock); vp->v_flag &= ~(VFREE|VAGE); splx(s); } /* * Record a process's interest in events which might happen to * a vnode. Because poll uses the historic select-style interface * internally, this routine serves as both the ``check for any * pending events'' and the ``record my interest in future events'' * functions. (These are done together, while the lock is held, * to avoid race conditions.) */ int vn_pollrecord(vp, p, events) struct vnode *vp; struct proc *p; short events; { simple_lock(&vp->v_pollinfo.vpi_lock); if (vp->v_pollinfo.vpi_revents & events) { /* * This leaves events we are not interested * in available for the other process which * presumably had requested them * (otherwise they would never have been * recorded). 
*/ events &= vp->v_pollinfo.vpi_revents; vp->v_pollinfo.vpi_revents &= ~events; simple_unlock(&vp->v_pollinfo.vpi_lock); return events; } vp->v_pollinfo.vpi_events |= events; selrecord(p, &vp->v_pollinfo.vpi_selinfo); simple_unlock(&vp->v_pollinfo.vpi_lock); return 0; } /* * Note the occurrence of an event. If the VN_POLLEVENT macro is used, * it is possible for us to miss an event due to race conditions, but * that condition is expected to be rare, so for the moment it is the * preferred interface. */ void vn_pollevent(vp, events) struct vnode *vp; short events; { simple_lock(&vp->v_pollinfo.vpi_lock); if (vp->v_pollinfo.vpi_events & events) { /* * We clear vpi_events so that we don't * call selwakeup() twice if two events are * posted before the polling process(es) is * awakened. This also ensures that we take at * most one selwakeup() if the polling process * is no longer interested. However, it does * mean that only one event can be noticed at * a time. (Perhaps we should only clear those * event bits which we note?) XXX */ vp->v_pollinfo.vpi_events = 0; /* &= ~events ??? */ vp->v_pollinfo.vpi_revents |= events; selwakeup(&vp->v_pollinfo.vpi_selinfo); } simple_unlock(&vp->v_pollinfo.vpi_lock); } /* * Wake up anyone polling on vp because it is being revoked. * This depends on dead_poll() returning POLLHUP for correct * behavior. */ void vn_pollgone(vp) struct vnode *vp; { simple_lock(&vp->v_pollinfo.vpi_lock); if (vp->v_pollinfo.vpi_events) { vp->v_pollinfo.vpi_events = 0; selwakeup(&vp->v_pollinfo.vpi_selinfo); } simple_unlock(&vp->v_pollinfo.vpi_lock); } /* * Routine to create and manage a filesystem syncer vnode. */ #define sync_close ((int (*) __P((struct vop_close_args *)))nullop) static int sync_fsync __P((struct vop_fsync_args *)); static int sync_inactive __P((struct vop_inactive_args *)); static int sync_reclaim __P((struct vop_reclaim_args *)); #define sync_lock ((int (*) __P((struct vop_lock_args *)))vop_nolock) #define sync_unlock ((int (*) __P((struct vop_unlock_args *)))vop_nounlock) static int sync_print __P((struct vop_print_args *)); #define sync_islocked ((int(*) __P((struct vop_islocked_args *)))vop_noislocked) static vop_t **sync_vnodeop_p; static struct vnodeopv_entry_desc sync_vnodeop_entries[] = { { &vop_default_desc, (vop_t *) vop_eopnotsupp }, { &vop_close_desc, (vop_t *) sync_close }, /* close */ { &vop_fsync_desc, (vop_t *) sync_fsync }, /* fsync */ { &vop_inactive_desc, (vop_t *) sync_inactive }, /* inactive */ { &vop_reclaim_desc, (vop_t *) sync_reclaim }, /* reclaim */ { &vop_lock_desc, (vop_t *) sync_lock }, /* lock */ { &vop_unlock_desc, (vop_t *) sync_unlock }, /* unlock */ { &vop_print_desc, (vop_t *) sync_print }, /* print */ { &vop_islocked_desc, (vop_t *) sync_islocked }, /* islocked */ { NULL, NULL } }; static struct vnodeopv_desc sync_vnodeop_opv_desc = { &sync_vnodeop_p, sync_vnodeop_entries }; VNODEOP_SET(sync_vnodeop_opv_desc); /* * Create a new filesystem syncer vnode for the specified mount point. */ int vfs_allocate_syncvnode(mp) struct mount *mp; { struct vnode *vp; static long start, incr, next; int error; /* Allocate a new vnode */ if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) { mp->mnt_syncer = NULL; return (error); } vp->v_type = VNON; /* * Place the vnode onto the syncer worklist. We attempt to * scatter them about on the list so that they will go off * at evenly distributed times even if all the filesystems * are mounted at once. 
*/ next += incr; if (next == 0 || next > syncer_maxdelay) { start /= 2; incr /= 2; if (start == 0) { start = syncer_maxdelay / 2; incr = syncer_maxdelay; } next = start; } vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0); mp->mnt_syncer = vp; return (0); } /* * Do a lazy sync of the filesystem. */ static int sync_fsync(ap) struct vop_fsync_args /* { struct vnode *a_vp; struct ucred *a_cred; int a_waitfor; struct proc *a_p; } */ *ap; { struct vnode *syncvp = ap->a_vp; struct mount *mp = syncvp->v_mount; struct proc *p = ap->a_p; int asyncflag; /* * We only need to do something if this is a lazy evaluation. */ if (ap->a_waitfor != MNT_LAZY) return (0); /* * Move ourselves to the back of the sync list. */ vn_syncer_add_to_worklist(syncvp, syncdelay); /* * Walk the list of vnodes pushing all that are dirty and * not already on the sync list. */ simple_lock(&mountlist_slock); if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_slock, p) != 0) { simple_unlock(&mountlist_slock); return (0); } asyncflag = mp->mnt_flag & MNT_ASYNC; mp->mnt_flag &= ~MNT_ASYNC; vfs_msync(mp, MNT_NOWAIT); VFS_SYNC(mp, MNT_LAZY, ap->a_cred, p); if (asyncflag) mp->mnt_flag |= MNT_ASYNC; vfs_unbusy(mp, p); return (0); } /* * The syncer vnode is no referenced. */ static int sync_inactive(ap) struct vop_inactive_args /* { struct vnode *a_vp; struct proc *a_p; } */ *ap; { vgone(ap->a_vp); return (0); } /* * The syncer vnode is no longer needed and is being decommissioned. * * Modifications to the worklist must be protected at splbio(). */ static int sync_reclaim(ap) struct vop_reclaim_args /* { struct vnode *a_vp; } */ *ap; { struct vnode *vp = ap->a_vp; int s; s = splbio(); vp->v_mount->mnt_syncer = NULL; if (vp->v_flag & VONWORKLST) { LIST_REMOVE(vp, v_synclist); vp->v_flag &= ~VONWORKLST; } splx(s); return (0); } /* * Print out a syncer vnode. */ static int sync_print(ap) struct vop_print_args /* { struct vnode *a_vp; } */ *ap; { struct vnode *vp = ap->a_vp; printf("syncer vnode"); if (vp->v_vnlock != NULL) lockmgr_printinfo(vp->v_vnlock); printf("\n"); return (0); } /* * extract the dev_t from a VBLK or VCHR */ dev_t vn_todev(vp) struct vnode *vp; { if (vp->v_type != VBLK && vp->v_type != VCHR) return (NODEV); return (vp->v_rdev); } Index: head/sys/kern/vfs_subr.c =================================================================== --- head/sys/kern/vfs_subr.c (revision 49534) +++ head/sys/kern/vfs_subr.c (revision 49535) @@ -1,2978 +1,2976 @@ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 - * $Id: vfs_subr.c,v 1.213 1999/07/20 09:47:44 phk Exp $ + * $Id: vfs_subr.c,v 1.214 1999/07/26 06:25:17 alc Exp $ */ /* * External virtual filesystem routines */ #include "opt_ddb.h" #include #include -#include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include - -#include static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure"); static struct vnode *checkalias2 __P((struct vnode *nvp, dev_t dev, struct mount *mp)); static void insmntque __P((struct vnode *vp, struct mount *mp)); static void vclean __P((struct vnode *vp, int flags, struct proc *p)); static void vfree __P((struct vnode *)); static void vgonel __P((struct vnode *vp, struct proc *p)); static unsigned long numvnodes; SYSCTL_INT(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, ""); enum vtype iftovt_tab[16] = { VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD, }; int vttoif_tab[9] = { 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFSOCK, S_IFIFO, S_IFMT, }; static TAILQ_HEAD(freelst, vnode) vnode_free_list; /* vnode free list */ struct tobefreelist vnode_tobefree_list; /* vnode free list */ static u_long wantfreevnodes = 25; SYSCTL_INT(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, ""); static u_long freevnodes = 0; SYSCTL_INT(_debug, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, ""); static int reassignbufcalls; SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, ""); static int reassignbufloops; SYSCTL_INT(_vfs, OID_AUTO, reassignbufloops, CTLFLAG_RW, &reassignbufloops, 0, ""); static int reassignbufsortgood; SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortgood, CTLFLAG_RW, &reassignbufsortgood, 0, ""); static int reassignbufsortbad; SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW, &reassignbufsortbad, 0, ""); static int reassignbufmethod = 1; SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW, &reassignbufmethod, 0, ""); #ifdef ENABLE_VFS_IOOPT int vfs_ioopt = 0; SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, ""); #endif struct mntlist mountlist; /* mounted filesystem list */ struct 
simplelock mountlist_slock; struct simplelock mntvnode_slock; int nfs_mount_type = -1; #ifndef NULL_SIMPLELOCKS static struct simplelock mntid_slock; static struct simplelock vnode_free_list_slock; static struct simplelock spechash_slock; #endif struct nfs_public nfs_pub; /* publicly exported FS */ static vm_zone_t vnode_zone; /* * The workitem queue. */ #define SYNCER_MAXDELAY 32 static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ time_t syncdelay = 30; /* max time to delay syncing data */ time_t filedelay = 30; /* time to delay syncing files */ SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, ""); time_t dirdelay = 29; /* time to delay syncing directories */ SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, ""); time_t metadelay = 28; /* time to delay syncing metadata */ SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, ""); static int rushjob; /* number of slots to run ASAP */ static int stat_rush_requests; /* number of times I/O speeded up */ SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, ""); static int syncer_delayno = 0; static long syncer_mask; LIST_HEAD(synclist, vnode); static struct synclist *syncer_workitem_pending; int desiredvnodes; SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW, &desiredvnodes, 0, "Maximum number of vnodes"); static void vfs_free_addrlist __P((struct netexport *nep)); static int vfs_free_netcred __P((struct radix_node *rn, void *w)); static int vfs_hang_addrlist __P((struct mount *mp, struct netexport *nep, struct export_args *argp)); /* * Initialize the vnode management data structures. */ void vntblinit() { desiredvnodes = maxproc + cnt.v_page_count / 4; simple_lock_init(&mntvnode_slock); simple_lock_init(&mntid_slock); simple_lock_init(&spechash_slock); TAILQ_INIT(&vnode_free_list); TAILQ_INIT(&vnode_tobefree_list); simple_lock_init(&vnode_free_list_slock); CIRCLEQ_INIT(&mountlist); vnode_zone = zinit("VNODE", sizeof (struct vnode), 0, 0, 5); /* * Initialize the filesystem syncer. */ syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE, &syncer_mask); syncer_maxdelay = syncer_mask + 1; } /* * Mark a mount point as busy. Used to synchronize access and to delay * unmounting. Interlock is not released on failure. */ int vfs_busy(mp, flags, interlkp, p) struct mount *mp; int flags; struct simplelock *interlkp; struct proc *p; { int lkflags; if (mp->mnt_kern_flag & MNTK_UNMOUNT) { if (flags & LK_NOWAIT) return (ENOENT); mp->mnt_kern_flag |= MNTK_MWAIT; if (interlkp) { simple_unlock(interlkp); } /* * Since all busy locks are shared except the exclusive * lock granted when unmounting, the only place that a * wakeup needs to be done is at the release of the * exclusive lock at the end of dounmount. */ tsleep((caddr_t)mp, PVFS, "vfs_busy", 0); if (interlkp) { simple_lock(interlkp); } return (ENOENT); } lkflags = LK_SHARED | LK_NOPAUSE; if (interlkp) lkflags |= LK_INTERLOCK; if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p)) panic("vfs_busy: unexpected lock failure"); return (0); } /* * Free a busy filesystem. */ void vfs_unbusy(mp, p) struct mount *mp; struct proc *p; { lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p); } /* * Lookup a filesystem type, and if found allocate and initialize * a mount structure for it. * * Devname is usually updated by mount(8) after booting. 
*/ int vfs_rootmountalloc(fstypename, devname, mpp) char *fstypename; char *devname; struct mount **mpp; { struct proc *p = curproc; /* XXX */ struct vfsconf *vfsp; struct mount *mp; if (fstypename == NULL) return (ENODEV); for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) if (!strcmp(vfsp->vfc_name, fstypename)) break; if (vfsp == NULL) return (ENODEV); mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK); bzero((char *)mp, (u_long)sizeof(struct mount)); lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE); (void)vfs_busy(mp, LK_NOWAIT, 0, p); LIST_INIT(&mp->mnt_vnodelist); mp->mnt_vfc = vfsp; mp->mnt_op = vfsp->vfc_vfsops; mp->mnt_flag = MNT_RDONLY; mp->mnt_vnodecovered = NULLVP; vfsp->vfc_refcount++; mp->mnt_stat.f_type = vfsp->vfc_typenum; mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK; strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN); mp->mnt_stat.f_mntonname[0] = '/'; mp->mnt_stat.f_mntonname[1] = 0; (void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0); *mpp = mp; return (0); } /* * Find an appropriate filesystem to use for the root. If a filesystem * has not been preselected, walk through the list of known filesystems * trying those that have mountroot routines, and try them until one * works or we have tried them all. */ #ifdef notdef /* XXX JH */ int lite2_vfs_mountroot() { struct vfsconf *vfsp; extern int (*lite2_mountroot) __P((void)); int error; if (lite2_mountroot != NULL) return ((*lite2_mountroot)()); for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) { if (vfsp->vfc_mountroot == NULL) continue; if ((error = (*vfsp->vfc_mountroot)()) == 0) return (0); printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error); } return (ENODEV); } #endif /* * Lookup a mount point by filesystem identifier. */ struct mount * vfs_getvfs(fsid) fsid_t *fsid; { register struct mount *mp; simple_lock(&mountlist_slock); for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = mp->mnt_list.cqe_next) { if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) { simple_unlock(&mountlist_slock); return (mp); } } simple_unlock(&mountlist_slock); return ((struct mount *) 0); } /* * Get a new unique fsid */ void vfs_getnewfsid(mp) struct mount *mp; { static u_short xxxfs_mntid; fsid_t tfsid; int mtype; simple_lock(&mntid_slock); mtype = mp->mnt_vfc->vfc_typenum; mp->mnt_stat.f_fsid.val[0] = makeudev(255, mtype); mp->mnt_stat.f_fsid.val[1] = mtype; if (xxxfs_mntid == 0) ++xxxfs_mntid; tfsid.val[0] = makeudev(255, mtype + (xxxfs_mntid << 16)); tfsid.val[1] = mtype; if (mountlist.cqh_first != (void *)&mountlist) { while (vfs_getvfs(&tfsid)) { xxxfs_mntid++; tfsid.val[0] = makeudev(255, mtype + (xxxfs_mntid << 16)); } } mp->mnt_stat.f_fsid.val[0] = tfsid.val[0]; simple_unlock(&mntid_slock); } /* * Set vnode attributes to VNOVAL */ void vattr_null(vap) register struct vattr *vap; { vap->va_type = VNON; vap->va_size = VNOVAL; vap->va_bytes = VNOVAL; vap->va_mode = VNOVAL; vap->va_nlink = VNOVAL; vap->va_uid = VNOVAL; vap->va_gid = VNOVAL; vap->va_fsid = VNOVAL; vap->va_fileid = VNOVAL; vap->va_blocksize = VNOVAL; vap->va_rdev = VNOVAL; vap->va_atime.tv_sec = VNOVAL; vap->va_atime.tv_nsec = VNOVAL; vap->va_mtime.tv_sec = VNOVAL; vap->va_mtime.tv_nsec = VNOVAL; vap->va_ctime.tv_sec = VNOVAL; vap->va_ctime.tv_nsec = VNOVAL; vap->va_flags = VNOVAL; vap->va_gen = VNOVAL; vap->va_vaflags = 0; } /* * Routines having to do with the management of the vnode table. 
*/ extern vop_t **dead_vnodeop_p; /* * Return the next vnode from the free list. */ int getnewvnode(tag, mp, vops, vpp) enum vtagtype tag; struct mount *mp; vop_t **vops; struct vnode **vpp; { int s; struct proc *p = curproc; /* XXX */ struct vnode *vp, *tvp, *nvp; vm_object_t object; TAILQ_HEAD(freelst, vnode) vnode_tmp_list; /* * We take the least recently used vnode from the freelist * if we can get it and it has no cached pages, and no * namecache entries are relative to it. * Otherwise we allocate a new vnode */ s = splbio(); simple_lock(&vnode_free_list_slock); TAILQ_INIT(&vnode_tmp_list); for (vp = TAILQ_FIRST(&vnode_tobefree_list); vp; vp = nvp) { nvp = TAILQ_NEXT(vp, v_freelist); TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist); if (vp->v_flag & VAGE) { TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); } else { TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); } vp->v_flag &= ~(VTBFREE|VAGE); vp->v_flag |= VFREE; if (vp->v_usecount) panic("tobe free vnode isn't"); freevnodes++; } if (wantfreevnodes && freevnodes < wantfreevnodes) { vp = NULL; } else if (!wantfreevnodes && freevnodes <= desiredvnodes) { /* * XXX: this is only here to be backwards compatible */ vp = NULL; } else { for (vp = TAILQ_FIRST(&vnode_free_list); vp; vp = nvp) { nvp = TAILQ_NEXT(vp, v_freelist); if (!simple_lock_try(&vp->v_interlock)) continue; if (vp->v_usecount) panic("free vnode isn't"); object = vp->v_object; if (object && (object->resident_page_count || object->ref_count)) { printf("object inconsistant state: RPC: %d, RC: %d\n", object->resident_page_count, object->ref_count); /* Don't recycle if it's caching some pages */ TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); TAILQ_INSERT_TAIL(&vnode_tmp_list, vp, v_freelist); continue; } else if (LIST_FIRST(&vp->v_cache_src)) { /* Don't recycle if active in the namecache */ simple_unlock(&vp->v_interlock); continue; } else { break; } } } for (tvp = TAILQ_FIRST(&vnode_tmp_list); tvp; tvp = nvp) { nvp = TAILQ_NEXT(tvp, v_freelist); TAILQ_REMOVE(&vnode_tmp_list, tvp, v_freelist); TAILQ_INSERT_TAIL(&vnode_free_list, tvp, v_freelist); simple_unlock(&tvp->v_interlock); } if (vp) { vp->v_flag |= VDOOMED; TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); freevnodes--; simple_unlock(&vnode_free_list_slock); cache_purge(vp); vp->v_lease = NULL; if (vp->v_type != VBAD) { vgonel(vp, p); } else { simple_unlock(&vp->v_interlock); } #ifdef INVARIANTS { int s; if (vp->v_data) panic("cleaned vnode isn't"); s = splbio(); if (vp->v_numoutput) panic("Clean vnode has pending I/O's"); splx(s); } #endif vp->v_flag = 0; vp->v_lastr = 0; vp->v_lastw = 0; vp->v_lasta = 0; vp->v_cstart = 0; vp->v_clen = 0; vp->v_socket = 0; vp->v_writecount = 0; /* XXX */ vp->v_maxio = 0; } else { simple_unlock(&vnode_free_list_slock); vp = (struct vnode *) zalloc(vnode_zone); bzero((char *) vp, sizeof *vp); simple_lock_init(&vp->v_interlock); vp->v_dd = vp; cache_purge(vp); LIST_INIT(&vp->v_cache_src); TAILQ_INIT(&vp->v_cache_dst); numvnodes++; } TAILQ_INIT(&vp->v_cleanblkhd); TAILQ_INIT(&vp->v_dirtyblkhd); vp->v_type = VNON; vp->v_tag = tag; vp->v_op = vops; insmntque(vp, mp); *vpp = vp; vp->v_usecount = 1; vp->v_data = 0; splx(s); vfs_object_create(vp, p, p->p_ucred); return (0); } /* * Move a vnode from one mount queue to another. */ static void insmntque(vp, mp) register struct vnode *vp; register struct mount *mp; { simple_lock(&mntvnode_slock); /* * Delete from old mount point vnode list, if on one. 
*/ if (vp->v_mount != NULL) LIST_REMOVE(vp, v_mntvnodes); /* * Insert into list of vnodes for the new mount point, if available. */ if ((vp->v_mount = mp) == NULL) { simple_unlock(&mntvnode_slock); return; } LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes); simple_unlock(&mntvnode_slock); } /* * Update outstanding I/O count and do wakeup if requested. */ void vwakeup(bp) register struct buf *bp; { register struct vnode *vp; bp->b_flags &= ~B_WRITEINPROG; if ((vp = bp->b_vp)) { vp->v_numoutput--; if (vp->v_numoutput < 0) panic("vwakeup: neg numoutput"); if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) { vp->v_flag &= ~VBWAIT; wakeup((caddr_t) &vp->v_numoutput); } } } /* * Flush out and invalidate all buffers associated with a vnode. * Called with the underlying object locked. */ int vinvalbuf(vp, flags, cred, p, slpflag, slptimeo) register struct vnode *vp; int flags; struct ucred *cred; struct proc *p; int slpflag, slptimeo; { register struct buf *bp; struct buf *nbp, *blist; int s, error; vm_object_t object; if (flags & V_SAVE) { s = splbio(); while (vp->v_numoutput) { vp->v_flag |= VBWAIT; error = tsleep((caddr_t)&vp->v_numoutput, slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo); if (error) { splx(s); return (error); } } if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) { splx(s); if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) != 0) return (error); s = splbio(); if (vp->v_numoutput > 0 || !TAILQ_EMPTY(&vp->v_dirtyblkhd)) panic("vinvalbuf: dirty bufs"); } splx(s); } s = splbio(); for (;;) { blist = TAILQ_FIRST(&vp->v_cleanblkhd); if (!blist) blist = TAILQ_FIRST(&vp->v_dirtyblkhd); if (!blist) break; for (bp = blist; bp; bp = nbp) { nbp = TAILQ_NEXT(bp, b_vnbufs); if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL, "vinvalbuf", slpflag, slptimeo); if (error == ENOLCK) break; splx(s); return (error); } /* * XXX Since there are no node locks for NFS, I * believe there is a slight chance that a delayed * write will occur while sleeping just above, so * check for it. Note that vfs_bio_awrite expects * buffers to reside on a queue, while VOP_BWRITE and * brelse do not. */ if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) && (flags & V_SAVE)) { if (bp->b_vp == vp) { if (bp->b_flags & B_CLUSTEROK) { BUF_UNLOCK(bp); vfs_bio_awrite(bp); } else { bremfree(bp); bp->b_flags |= B_ASYNC; VOP_BWRITE(bp->b_vp, bp); } } else { bremfree(bp); (void) VOP_BWRITE(bp->b_vp, bp); } break; } bremfree(bp); bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF); bp->b_flags &= ~B_ASYNC; brelse(bp); } } while (vp->v_numoutput > 0) { vp->v_flag |= VBWAIT; tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0); } splx(s); /* * Destroy the copy in the VM cache, too. */ simple_lock(&vp->v_interlock); object = vp->v_object; if (object != NULL) { vm_object_page_remove(object, 0, 0, (flags & V_SAVE) ? TRUE : FALSE); } simple_unlock(&vp->v_interlock); if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) || !TAILQ_EMPTY(&vp->v_cleanblkhd)) panic("vinvalbuf: flush failed"); return (0); } /* * Truncate a file's buffer and pages to a specified length. This * is in lieu of the old vinvalbuf mechanism, which performed unneeded * sync activity. */ int vtruncbuf(vp, cred, p, length, blksize) register struct vnode *vp; struct ucred *cred; struct proc *p; off_t length; int blksize; { register struct buf *bp; struct buf *nbp; int s, anyfreed; int trunclbn; /* * Round up to the *next* lbn. 
*/ trunclbn = (length + blksize - 1) / blksize; s = splbio(); restart: anyfreed = 1; for (;anyfreed;) { anyfreed = 0; for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) { nbp = TAILQ_NEXT(bp, b_vnbufs); if (bp->b_lblkno >= trunclbn) { if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL); goto restart; } else { bremfree(bp); bp->b_flags |= (B_INVAL | B_RELBUF); bp->b_flags &= ~B_ASYNC; brelse(bp); anyfreed = 1; } if (nbp && (((nbp->b_xflags & B_VNCLEAN) == 0)|| (nbp->b_vp != vp) || (nbp->b_flags & B_DELWRI))) { goto restart; } } } for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { nbp = TAILQ_NEXT(bp, b_vnbufs); if (bp->b_lblkno >= trunclbn) { if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL); goto restart; } else { bremfree(bp); bp->b_flags |= (B_INVAL | B_RELBUF); bp->b_flags &= ~B_ASYNC; brelse(bp); anyfreed = 1; } if (nbp && (((nbp->b_xflags & B_VNDIRTY) == 0)|| (nbp->b_vp != vp) || (nbp->b_flags & B_DELWRI) == 0)) { goto restart; } } } } if (length > 0) { restartsync: for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { nbp = TAILQ_NEXT(bp, b_vnbufs); if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) { if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL); goto restart; } else { bremfree(bp); if (bp->b_vp == vp) { bp->b_flags |= B_ASYNC; } else { bp->b_flags &= ~B_ASYNC; } VOP_BWRITE(bp->b_vp, bp); } goto restartsync; } } } while (vp->v_numoutput > 0) { vp->v_flag |= VBWAIT; tsleep(&vp->v_numoutput, PVM, "vbtrunc", 0); } splx(s); vnode_pager_setsize(vp, length); return (0); } /* * Associate a buffer with a vnode. */ void bgetvp(vp, bp) register struct vnode *vp; register struct buf *bp; { int s; KASSERT(bp->b_vp == NULL, ("bgetvp: not free")); vhold(vp); bp->b_vp = vp; if (vp->v_type == VBLK || vp->v_type == VCHR) bp->b_dev = vp->v_rdev; else bp->b_dev = NODEV; /* * Insert onto list for new vnode. */ s = splbio(); bp->b_xflags |= B_VNCLEAN; bp->b_xflags &= ~B_VNDIRTY; TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs); splx(s); } /* * Disassociate a buffer from a vnode. */ void brelvp(bp) register struct buf *bp; { struct vnode *vp; struct buflists *listheadp; int s; KASSERT(bp->b_vp != NULL, ("brelvp: NULL")); /* * Delete from old vnode list, if on one. */ vp = bp->b_vp; s = splbio(); if (bp->b_xflags & (B_VNDIRTY|B_VNCLEAN)) { if (bp->b_xflags & B_VNDIRTY) listheadp = &vp->v_dirtyblkhd; else listheadp = &vp->v_cleanblkhd; TAILQ_REMOVE(listheadp, bp, b_vnbufs); bp->b_xflags &= ~(B_VNDIRTY|B_VNCLEAN); } if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) { vp->v_flag &= ~VONWORKLST; LIST_REMOVE(vp, v_synclist); } splx(s); bp->b_vp = (struct vnode *) 0; vdrop(vp); } /* * The workitem queue. * * It is useful to delay writes of file data and filesystem metadata * for tens of seconds so that quickly created and deleted files need * not waste disk bandwidth being created and removed. To realize this, * we append vnodes to a "workitem" queue. When running with a soft * updates implementation, most pending metadata dependencies should * not wait for more than a few seconds. Thus, mounted on block devices * are delayed only about a half the time that file data is delayed. * Similarly, directory updates are more critical, so are only delayed * about a third the time that file data is delayed. Thus, there are * SYNCER_MAXDELAY queues that are processed round-robin at a rate of * one each second (driven off the filesystem syncer process). 
The * syncer_delayno variable indicates the next queue that is to be processed. * Items that need to be processed soon are placed in this queue: * * syncer_workitem_pending[syncer_delayno] * * A delay of fifteen seconds is done by placing the request fifteen * entries later in the queue: * * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask] * */ /* * Add an item to the syncer work queue. */ static void vn_syncer_add_to_worklist(struct vnode *vp, int delay) { int s, slot; s = splbio(); if (vp->v_flag & VONWORKLST) { LIST_REMOVE(vp, v_synclist); } if (delay > syncer_maxdelay - 2) delay = syncer_maxdelay - 2; slot = (syncer_delayno + delay) & syncer_mask; LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist); vp->v_flag |= VONWORKLST; splx(s); } struct proc *updateproc; static void sched_sync __P((void)); static struct kproc_desc up_kp = { "syncer", sched_sync, &updateproc }; SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp) /* * System filesystem synchronizer daemon. */ void sched_sync(void) { struct synclist *slp; struct vnode *vp; long starttime; int s; struct proc *p = updateproc; p->p_flag |= P_BUFEXHAUST; for (;;) { starttime = time_second; /* * Push files whose dirty time has expired. Be careful * of interrupt race on slp queue. */ s = splbio(); slp = &syncer_workitem_pending[syncer_delayno]; syncer_delayno += 1; if (syncer_delayno == syncer_maxdelay) syncer_delayno = 0; splx(s); while ((vp = LIST_FIRST(slp)) != NULL) { if (VOP_ISLOCKED(vp) == 0) { vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); (void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p); VOP_UNLOCK(vp, 0, p); } s = splbio(); if (LIST_FIRST(slp) == vp) { /* * Note: v_tag VT_VFS vps can remain on the * worklist too with no dirty blocks, but * since sync_fsync() moves it to a different * slot we are safe. */ if (TAILQ_EMPTY(&vp->v_dirtyblkhd) && vp->v_type != VBLK) panic("sched_sync: fsync failed vp %p tag %d", vp, vp->v_tag); /* * Put us back on the worklist. The worklist * routine will remove us from our current * position and then add us back in at a later * position. */ vn_syncer_add_to_worklist(vp, syncdelay); } splx(s); } /* * Do soft update processing. */ if (bioops.io_sync) (*bioops.io_sync)(NULL); /* * The variable rushjob allows the kernel to speed up the * processing of the filesystem syncer process. A rushjob * value of N tells the filesystem syncer to process the next * N seconds worth of work on its queue ASAP. Currently rushjob * is used by the soft update code to speed up the filesystem * syncer process when the incore state is getting so far * ahead of the disk that the kernel memory pool is being * threatened with exhaustion. */ if (rushjob > 0) { rushjob -= 1; continue; } /* * If it has taken us less than a second to process the * current work, then wait. Otherwise start right over * again. We can still lose time if any single round * takes more than two seconds, but it does not really * matter as we are just trying to generally pace the * filesystem activity. */ if (time_second == starttime) tsleep(&lbolt, PPAUSE, "syncer", 0); } } /* * Request the syncer daemon to speed up its work. * We never push it to speed up more than half of its * normal turn time, otherwise it could take over the cpu. */ int speedup_syncer() { int s; s = splhigh(); if (updateproc->p_wchan == &lbolt) setrunnable(updateproc); splx(s); if (rushjob < syncdelay / 2) { rushjob += 1; stat_rush_requests += 1; return (1); } return(0); } /* * Associate a p-buffer with a vnode. 
* * Also sets B_PAGING flag to indicate that vnode is not fully associated * with the buffer. i.e. the bp has not been linked into the vnode or * ref-counted. */ void pbgetvp(vp, bp) register struct vnode *vp; register struct buf *bp; { KASSERT(bp->b_vp == NULL, ("pbgetvp: not free")); bp->b_vp = vp; bp->b_flags |= B_PAGING; if (vp->v_type == VBLK || vp->v_type == VCHR) bp->b_dev = vp->v_rdev; else bp->b_dev = NODEV; } /* * Disassociate a p-buffer from a vnode. */ void pbrelvp(bp) register struct buf *bp; { KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL")); #if !defined(MAX_PERF) /* XXX REMOVE ME */ if (bp->b_vnbufs.tqe_next != NULL) { panic( "relpbuf(): b_vp was probably reassignbuf()d %p %x", bp, (int)bp->b_flags ); } #endif bp->b_vp = (struct vnode *) 0; bp->b_flags &= ~B_PAGING; } void pbreassignbuf(bp, newvp) struct buf *bp; struct vnode *newvp; { #if !defined(MAX_PERF) if ((bp->b_flags & B_PAGING) == 0) { panic( "pbreassignbuf() on non phys bp %p", bp ); } #endif bp->b_vp = newvp; } /* * Reassign a buffer from one vnode to another. * Used to assign file specific control information * (indirect blocks) to the vnode to which they belong. */ void reassignbuf(bp, newvp) register struct buf *bp; register struct vnode *newvp; { struct buflists *listheadp; int delay; int s; if (newvp == NULL) { printf("reassignbuf: NULL"); return; } ++reassignbufcalls; #if !defined(MAX_PERF) /* * B_PAGING flagged buffers cannot be reassigned because their vp * is not fully linked in. */ if (bp->b_flags & B_PAGING) panic("cannot reassign paging buffer"); #endif s = splbio(); /* * Delete from old vnode list, if on one. */ if (bp->b_xflags & (B_VNDIRTY|B_VNCLEAN)) { if (bp->b_xflags & B_VNDIRTY) listheadp = &bp->b_vp->v_dirtyblkhd; else listheadp = &bp->b_vp->v_cleanblkhd; TAILQ_REMOVE(listheadp, bp, b_vnbufs); bp->b_xflags &= ~(B_VNDIRTY|B_VNCLEAN); if (bp->b_vp != newvp) { vdrop(bp->b_vp); bp->b_vp = NULL; /* for clarification */ } } /* * If dirty, put on list of dirty buffers; otherwise insert onto list * of clean buffers. */ if (bp->b_flags & B_DELWRI) { struct buf *tbp; listheadp = &newvp->v_dirtyblkhd; if ((newvp->v_flag & VONWORKLST) == 0) { switch (newvp->v_type) { case VDIR: delay = dirdelay; break; case VBLK: if (newvp->v_specmountpoint != NULL) { delay = metadelay; break; } /* fall through */ default: delay = filedelay; } vn_syncer_add_to_worklist(newvp, delay); } bp->b_xflags |= B_VNDIRTY; tbp = TAILQ_FIRST(listheadp); if (tbp == NULL || bp->b_lblkno == 0 || (bp->b_lblkno > 0 && bp->b_lblkno < tbp->b_lblkno)) { TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs); ++reassignbufsortgood; } else if (bp->b_lblkno < 0) { TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs); ++reassignbufsortgood; } else if (reassignbufmethod == 1) { /* * New sorting algorithm, only handle sequential case, * otherwise guess. 
*/ if ((tbp = gbincore(newvp, bp->b_lblkno - 1)) != NULL && (tbp->b_xflags & B_VNDIRTY)) { TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs); ++reassignbufsortgood; } else { TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs); ++reassignbufsortbad; } } else { /* * Old sorting algorithm, scan queue and insert */ struct buf *ttbp; while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) && (ttbp->b_lblkno < bp->b_lblkno)) { ++reassignbufloops; tbp = ttbp; } TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs); } } else { bp->b_xflags |= B_VNCLEAN; TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs); if ((newvp->v_flag & VONWORKLST) && TAILQ_EMPTY(&newvp->v_dirtyblkhd)) { newvp->v_flag &= ~VONWORKLST; LIST_REMOVE(newvp, v_synclist); } } if (bp->b_vp != newvp) { bp->b_vp = newvp; vhold(bp->b_vp); } splx(s); } /* * Create a vnode for a block device. * Used for mounting the root file system. */ int bdevvp(dev, vpp) dev_t dev; struct vnode **vpp; { register struct vnode *vp; struct vnode *nvp; int error; if (dev == NODEV) { *vpp = NULLVP; return (ENXIO); } error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp); if (error) { *vpp = NULLVP; return (error); } vp = nvp; /* dev2udev() results in a CDEV, so we need to cheat here. */ vp->v_type = VBLK; if ((nvp = checkalias2(vp, dev, (struct mount *)0)) != NULL) { vput(vp); vp = nvp; } *vpp = vp; return (0); } /* * Check to see if the new vnode represents a special device * for which we already have a vnode (either because of * bdevvp() or because of a different vnode representing * the same block device). If such an alias exists, deallocate * the existing contents and return the aliased vnode. The * caller is responsible for filling it with its new contents. */ struct vnode * checkalias(nvp, nvp_rdev, mp) register struct vnode *nvp; udev_t nvp_rdev; struct mount *mp; { dev_t dev; if (nvp->v_type != VBLK && nvp->v_type != VCHR) return (NULLVP); dev = udev2dev(nvp_rdev, nvp->v_type == VBLK ? 1 : 0); return (checkalias2(nvp, dev, mp)); } static struct vnode * checkalias2(nvp, dev, mp) register struct vnode *nvp; dev_t dev; struct mount *mp; { struct proc *p = curproc; /* XXX */ struct vnode *vp; struct vnode **vpp; if (nvp->v_type != VBLK && nvp->v_type != VCHR) return (NULLVP); vpp = &dev->si_hlist; loop: simple_lock(&spechash_slock); for (vp = *vpp; vp; vp = vp->v_specnext) { if (nvp->v_type != vp->v_type) continue; /* * Alias, but not in use, so flush it out. * Only alias active device nodes. * Not sure why we don't re-use this like we do below. */ simple_lock(&vp->v_interlock); if (vp->v_usecount == 0) { simple_unlock(&spechash_slock); vgonel(vp, p); goto loop; } if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, p)) { /* * It disappeared, and we may have slept. * Restart from the beginning */ simple_unlock(&spechash_slock); goto loop; } break; } /* * It would be a lot clearer what is going on here if * this had been expressed as: * if ( vp && (vp->v_tag == VT_NULL)) * and the clauses had been swapped. */ if (vp == NULL || vp->v_tag != VT_NON) { struct specinfo *sinfo; /* * Put the new vnode into the hash chain. * and if there was an alias, connect them. */ nvp->v_specnext = *vpp; *vpp = nvp; nvp->v_specinfo = sinfo = dev; simple_unlock(&spechash_slock); if (vp != NULLVP) { nvp->v_flag |= VALIASED; vp->v_flag |= VALIASED; vput(vp); } return (NULLVP); } /* * if ( vp && (vp->v_tag == VT_NULL)) * We have a vnode alias, but it is trashed. * Make it look like it's newly allocated. (by getnewvnode()) * The caller should use this instead. 
*/ simple_unlock(&spechash_slock); VOP_UNLOCK(vp, 0, p); simple_lock(&vp->v_interlock); vclean(vp, 0, p); vp->v_op = nvp->v_op; vp->v_tag = nvp->v_tag; nvp->v_type = VNON; insmntque(vp, mp); return (vp); } /* * Grab a particular vnode from the free list, increment its * reference count and lock it. The vnode lock bit is set if the * vnode is being eliminated in vgone. The process is awakened * when the transition is completed, and an error returned to * indicate that the vnode is no longer usable (possibly having * been changed to a new file system type). */ int vget(vp, flags, p) register struct vnode *vp; int flags; struct proc *p; { int error; /* * If the vnode is in the process of being cleaned out for * another use, we wait for the cleaning to finish and then * return failure. Cleaning is determined by checking that * the VXLOCK flag is set. */ if ((flags & LK_INTERLOCK) == 0) { simple_lock(&vp->v_interlock); } if (vp->v_flag & VXLOCK) { vp->v_flag |= VXWANT; simple_unlock(&vp->v_interlock); tsleep((caddr_t)vp, PINOD, "vget", 0); return (ENOENT); } vp->v_usecount++; if (VSHOULDBUSY(vp)) vbusy(vp); if (flags & LK_TYPE_MASK) { if ((error = vn_lock(vp, flags | LK_INTERLOCK, p)) != 0) { /* * must expand vrele here because we do not want * to call VOP_INACTIVE if the reference count * drops back to zero since it was never really * active. We must remove it from the free list * before sleeping so that multiple processes do * not try to recycle it. */ simple_lock(&vp->v_interlock); vp->v_usecount--; if (VSHOULDFREE(vp)) vfree(vp); simple_unlock(&vp->v_interlock); } return (error); } simple_unlock(&vp->v_interlock); return (0); } void vref(struct vnode *vp) { simple_lock(&vp->v_interlock); vp->v_usecount++; simple_unlock(&vp->v_interlock); } /* * Vnode put/release. * If count drops to zero, call inactive routine and return to freelist. */ void vrele(vp) struct vnode *vp; { struct proc *p = curproc; /* XXX */ KASSERT(vp != NULL, ("vrele: null vp")); simple_lock(&vp->v_interlock); if (vp->v_usecount > 1) { vp->v_usecount--; simple_unlock(&vp->v_interlock); return; } if (vp->v_usecount == 1) { vp->v_usecount--; if (VSHOULDFREE(vp)) vfree(vp); /* * If we are doing a vput, the node is already locked, and we must * call VOP_INACTIVE with the node locked. So, in the case of * vrele, we explicitly lock the vnode before calling VOP_INACTIVE. */ if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0) { VOP_INACTIVE(vp, p); } } else { #ifdef DIAGNOSTIC vprint("vrele: negative ref count", vp); simple_unlock(&vp->v_interlock); #endif panic("vrele: negative ref cnt"); } } void vput(vp) struct vnode *vp; { struct proc *p = curproc; /* XXX */ KASSERT(vp != NULL, ("vput: null vp")); simple_lock(&vp->v_interlock); if (vp->v_usecount > 1) { vp->v_usecount--; VOP_UNLOCK(vp, LK_INTERLOCK, p); return; } if (vp->v_usecount == 1) { vp->v_usecount--; if (VSHOULDFREE(vp)) vfree(vp); /* * If we are doing a vput, the node is already locked, and we must * call VOP_INACTIVE with the node locked. So, in the case of * vrele, we explicitly lock the vnode before calling VOP_INACTIVE. */ simple_unlock(&vp->v_interlock); VOP_INACTIVE(vp, p); } else { #ifdef DIAGNOSTIC vprint("vput: negative ref count", vp); #endif panic("vput: negative ref cnt"); } } /* * Somebody doesn't want the vnode recycled. */ void vhold(vp) register struct vnode *vp; { int s; s = splbio(); vp->v_holdcnt++; if (VSHOULDBUSY(vp)) vbusy(vp); splx(s); } /* * One less who cares about this vnode. 
*/ void vdrop(vp) register struct vnode *vp; { int s; s = splbio(); if (vp->v_holdcnt <= 0) panic("vdrop: holdcnt"); vp->v_holdcnt--; if (VSHOULDFREE(vp)) vfree(vp); splx(s); } /* * Remove any vnodes in the vnode table belonging to mount point mp. * * If MNT_NOFORCE is specified, there should not be any active ones, * return error if any are found (nb: this is a user error, not a * system error). If MNT_FORCE is specified, detach any active vnodes * that are found. */ #ifdef DIAGNOSTIC static int busyprt = 0; /* print out busy vnodes */ SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, ""); #endif int vflush(mp, skipvp, flags) struct mount *mp; struct vnode *skipvp; int flags; { struct proc *p = curproc; /* XXX */ struct vnode *vp, *nvp; int busy = 0; simple_lock(&mntvnode_slock); loop: for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) { /* * Make sure this vnode wasn't reclaimed in getnewvnode(). * Start over if it has (it won't be on the list anymore). */ if (vp->v_mount != mp) goto loop; nvp = vp->v_mntvnodes.le_next; /* * Skip over a selected vnode. */ if (vp == skipvp) continue; simple_lock(&vp->v_interlock); /* * Skip over vnodes marked VSYSTEM. */ if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) { simple_unlock(&vp->v_interlock); continue; } /* * If WRITECLOSE is set, only flush out regular file vnodes * open for writing. */ if ((flags & WRITECLOSE) && (vp->v_writecount == 0 || vp->v_type != VREG)) { simple_unlock(&vp->v_interlock); continue; } /* * With v_usecount == 0, all we need to do is clear out the * vnode data structures and we are done. */ if (vp->v_usecount == 0) { simple_unlock(&mntvnode_slock); vgonel(vp, p); simple_lock(&mntvnode_slock); continue; } /* * If FORCECLOSE is set, forcibly close the vnode. For block * or character devices, revert to an anonymous device. For * all other files, just kill them. */ if (flags & FORCECLOSE) { simple_unlock(&mntvnode_slock); if (vp->v_type != VBLK && vp->v_type != VCHR) { vgonel(vp, p); } else { vclean(vp, 0, p); vp->v_op = spec_vnodeop_p; insmntque(vp, (struct mount *) 0); } simple_lock(&mntvnode_slock); continue; } #ifdef DIAGNOSTIC if (busyprt) vprint("vflush: busy vnode", vp); #endif simple_unlock(&vp->v_interlock); busy++; } simple_unlock(&mntvnode_slock); if (busy) return (EBUSY); return (0); } /* * Disassociate the underlying file system from a vnode. */ static void vclean(vp, flags, p) struct vnode *vp; int flags; struct proc *p; { int active; vm_object_t obj; /* * Check to see if the vnode is in use. If so we have to reference it * before we clean it out so that its count cannot fall to zero and * generate a race against ourselves to recycle it. */ if ((active = vp->v_usecount)) vp->v_usecount++; /* * Prevent the vnode from being recycled or brought into use while we * clean it out. */ if (vp->v_flag & VXLOCK) panic("vclean: deadlock"); vp->v_flag |= VXLOCK; /* * Even if the count is zero, the VOP_INACTIVE routine may still * have the object locked while it cleans it out. The VOP_LOCK * ensures that the VOP_INACTIVE routine is done with its work. * For active vnodes, it ensures that no other activity can * occur while the underlying object is being cleaned out. */ VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p); /* * Clean out any buffers associated with the vnode. */ vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0); if ((obj = vp->v_object) != NULL) { if (obj->ref_count == 0) { /* * This is a normal way of shutting down the object/vnode * association. 
*/ vm_object_terminate(obj); } else { /* * Woe to the process that tries to page now :-). */ vm_pager_deallocate(obj); } } /* * If purging an active vnode, it must be closed and * deactivated before being reclaimed. Note that the * VOP_INACTIVE will unlock the vnode. */ if (active) { if (flags & DOCLOSE) VOP_CLOSE(vp, FNONBLOCK, NOCRED, p); VOP_INACTIVE(vp, p); } else { /* * Any other processes trying to obtain this lock must first * wait for VXLOCK to clear, then call the new lock operation. */ VOP_UNLOCK(vp, 0, p); } /* * Reclaim the vnode. */ if (VOP_RECLAIM(vp, p)) panic("vclean: cannot reclaim"); if (active) vrele(vp); cache_purge(vp); if (vp->v_vnlock) { #if 0 /* This is the only place we have LK_DRAINED in the entire kernel ??? */ #ifdef DIAGNOSTIC if ((vp->v_vnlock->lk_flags & LK_DRAINED) == 0) vprint("vclean: lock not drained", vp); #endif #endif FREE(vp->v_vnlock, M_VNODE); vp->v_vnlock = NULL; } if (VSHOULDFREE(vp)) vfree(vp); /* * Done with purge, notify sleepers of the grim news. */ vp->v_op = dead_vnodeop_p; vn_pollgone(vp); vp->v_tag = VT_NON; vp->v_flag &= ~VXLOCK; if (vp->v_flag & VXWANT) { vp->v_flag &= ~VXWANT; wakeup((caddr_t) vp); } } /* * Eliminate all activity associated with the requested vnode * and with all vnodes aliased to the requested vnode. */ int vop_revoke(ap) struct vop_revoke_args /* { struct vnode *a_vp; int a_flags; } */ *ap; { struct vnode *vp, *vq; struct proc *p = curproc; /* XXX */ KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke")); vp = ap->a_vp; simple_lock(&vp->v_interlock); if (vp->v_flag & VALIASED) { /* * If a vgone (or vclean) is already in progress, * wait until it is done and return. */ if (vp->v_flag & VXLOCK) { vp->v_flag |= VXWANT; simple_unlock(&vp->v_interlock); tsleep((caddr_t)vp, PINOD, "vop_revokeall", 0); return (0); } /* * Ensure that vp will not be vgone'd while we * are eliminating its aliases. */ vp->v_flag |= VXLOCK; simple_unlock(&vp->v_interlock); while (vp->v_flag & VALIASED) { simple_lock(&spechash_slock); for (vq = vp->v_hashchain; vq; vq = vq->v_specnext) { if (vq->v_type != vp->v_type || vp == vq) continue; simple_unlock(&spechash_slock); vgone(vq); break; } if (vq == NULLVP) { simple_unlock(&spechash_slock); } } /* * Remove the lock so that vgone below will * really eliminate the vnode after which time * vgone will awaken any sleepers. */ simple_lock(&vp->v_interlock); vp->v_flag &= ~VXLOCK; if (vp->v_flag & VXWANT) { vp->v_flag &= ~VXWANT; wakeup(vp); } } vgonel(vp, p); return (0); } /* * Recycle an unused vnode to the front of the free list. * Release the passed interlock if the vnode will be recycled. */ int vrecycle(vp, inter_lkp, p) struct vnode *vp; struct simplelock *inter_lkp; struct proc *p; { simple_lock(&vp->v_interlock); if (vp->v_usecount == 0) { if (inter_lkp) { simple_unlock(inter_lkp); } vgonel(vp, p); return (1); } simple_unlock(&vp->v_interlock); return (0); } /* * Eliminate all activity associated with a vnode * in preparation for reuse. */ void vgone(vp) register struct vnode *vp; { struct proc *p = curproc; /* XXX */ simple_lock(&vp->v_interlock); vgonel(vp, p); } /* * vgone, with the vp interlock held. */ static void vgonel(vp, p) struct vnode *vp; struct proc *p; { int s; struct vnode *vq; struct vnode *vx; /* * If a vgone (or vclean) is already in progress, * wait until it is done and return. */ if (vp->v_flag & VXLOCK) { vp->v_flag |= VXWANT; simple_unlock(&vp->v_interlock); tsleep((caddr_t)vp, PINOD, "vgone", 0); return; } /* * Clean out the filesystem specific data. 
*/ vclean(vp, DOCLOSE, p); simple_lock(&vp->v_interlock); /* * Delete from old mount point vnode list, if on one. */ if (vp->v_mount != NULL) insmntque(vp, (struct mount *)0); /* * If special device, remove it from special device alias list * if it is on one. */ if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_specinfo != 0) { simple_lock(&spechash_slock); if (vp->v_hashchain == vp) { vp->v_hashchain = vp->v_specnext; } else { for (vq = vp->v_hashchain; vq; vq = vq->v_specnext) { if (vq->v_specnext != vp) continue; vq->v_specnext = vp->v_specnext; break; } if (vq == NULL) panic("missing bdev"); } if (vp->v_flag & VALIASED) { vx = NULL; for (vq = vp->v_hashchain; vq; vq = vq->v_specnext) { if (vq->v_type != vp->v_type) continue; if (vx) break; vx = vq; } if (vx == NULL) panic("missing alias"); if (vq == NULL) vx->v_flag &= ~VALIASED; vp->v_flag &= ~VALIASED; } simple_unlock(&spechash_slock); vp->v_specinfo = NULL; } /* * If it is on the freelist and not already at the head, * move it to the head of the list. The test of the back * pointer and the reference count of zero is because * it will be removed from the free list by getnewvnode, * but will not have its reference count incremented until * after calling vgone. If the reference count were * incremented first, vgone would (incorrectly) try to * close the previous instance of the underlying object. */ if (vp->v_usecount == 0 && !(vp->v_flag & VDOOMED)) { s = splbio(); simple_lock(&vnode_free_list_slock); if (vp->v_flag & VFREE) { TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); } else if (vp->v_flag & VTBFREE) { TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist); vp->v_flag &= ~VTBFREE; freevnodes++; } else freevnodes++; vp->v_flag |= VFREE; TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); simple_unlock(&vnode_free_list_slock); splx(s); } vp->v_type = VBAD; simple_unlock(&vp->v_interlock); } /* * Lookup a vnode by device number. */ int vfinddev(dev, type, vpp) dev_t dev; enum vtype type; struct vnode **vpp; { register struct vnode *vp; int rc = 0; simple_lock(&spechash_slock); for (vp = dev->si_hlist; vp; vp = vp->v_specnext) { if (type != vp->v_type) continue; *vpp = vp; rc = 1; break; } simple_unlock(&spechash_slock); return (rc); } /* * Calculate the total number of references to a special device. */ int vcount(vp) register struct vnode *vp; { struct vnode *vq, *vnext; int count; loop: if ((vp->v_flag & VALIASED) == 0) return (vp->v_usecount); simple_lock(&spechash_slock); for (count = 0, vq = vp->v_hashchain; vq; vq = vnext) { vnext = vq->v_specnext; if (vq->v_type != vp->v_type) continue; /* * Alias, but not in use, so flush it out. */ if (vq->v_usecount == 0 && vq != vp) { simple_unlock(&spechash_slock); vgone(vq); goto loop; } count += vq->v_usecount; } simple_unlock(&spechash_slock); return (count); } /* * Print out a description of a vnode. 
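 *
 * As a minimal usage note (no new code, just for orientation): the
 * DIAGNOSTIC branch of vflush() above calls this as
 *
 *    vprint("vflush: busy vnode", vp);
 *
 * and the DDB "show lockedvnodes" command below passes a nil label to get
 * the bare description.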
*/ static char *typename[] = {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"}; void vprint(label, vp) char *label; register struct vnode *vp; { char buf[96]; if (label != NULL) printf("%s: %p: ", label, (void *)vp); else printf("%p: ", (void *)vp); printf("type %s, usecount %d, writecount %d, refcount %d,", typename[vp->v_type], vp->v_usecount, vp->v_writecount, vp->v_holdcnt); buf[0] = '\0'; if (vp->v_flag & VROOT) strcat(buf, "|VROOT"); if (vp->v_flag & VTEXT) strcat(buf, "|VTEXT"); if (vp->v_flag & VSYSTEM) strcat(buf, "|VSYSTEM"); if (vp->v_flag & VXLOCK) strcat(buf, "|VXLOCK"); if (vp->v_flag & VXWANT) strcat(buf, "|VXWANT"); if (vp->v_flag & VBWAIT) strcat(buf, "|VBWAIT"); if (vp->v_flag & VALIASED) strcat(buf, "|VALIASED"); if (vp->v_flag & VDOOMED) strcat(buf, "|VDOOMED"); if (vp->v_flag & VFREE) strcat(buf, "|VFREE"); if (vp->v_flag & VOBJBUF) strcat(buf, "|VOBJBUF"); if (buf[0] != '\0') printf(" flags (%s)", &buf[1]); if (vp->v_data == NULL) { printf("\n"); } else { printf("\n\t"); VOP_PRINT(vp); } } #ifdef DDB #include /* * List all of the locked vnodes in the system. * Called when debugging the kernel. */ DB_SHOW_COMMAND(lockedvnodes, lockedvnodes) { struct proc *p = curproc; /* XXX */ struct mount *mp, *nmp; struct vnode *vp; printf("Locked vnodes\n"); simple_lock(&mountlist_slock); for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) { if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) { nmp = mp->mnt_list.cqe_next; continue; } for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = vp->v_mntvnodes.le_next) { if (VOP_ISLOCKED(vp)) vprint((char *)0, vp); } simple_lock(&mountlist_slock); nmp = mp->mnt_list.cqe_next; vfs_unbusy(mp, p); } simple_unlock(&mountlist_slock); } #endif /* * Top level filesystem related information gathering. */ static int sysctl_ovfs_conf __P(SYSCTL_HANDLER_ARGS); static int vfs_sysctl SYSCTL_HANDLER_ARGS { int *name = (int *)arg1 - 1; /* XXX */ u_int namelen = arg2 + 1; /* XXX */ struct vfsconf *vfsp; #if 1 || defined(COMPAT_PRELITE2) /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. 
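 *
 * As a rough sketch (userland side, not part of this change), the handler
 * below is normally reached through a MIB of the form
 *
 *    int mib[3], maxtypenum;
 *    size_t len = sizeof(maxtypenum);
 *
 *    mib[0] = CTL_VFS;
 *    mib[1] = VFS_GENERIC;
 *    mib[2] = VFS_MAXTYPENUM;
 *    sysctl(mib, 3, &maxtypenum, &len, NULL, 0);
 *
 * so that name[1] in the switch below is VFS_MAXTYPENUM, or VFS_CONF with
 * one further element selecting the filesystem type number.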
*/ if (namelen == 1) return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); #endif #ifdef notyet /* all sysctl names at this level are at least name and field */ if (namelen < 2) return (ENOTDIR); /* overloaded */ if (name[0] != VFS_GENERIC) { for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) if (vfsp->vfc_typenum == name[0]) break; if (vfsp == NULL) return (EOPNOTSUPP); return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1, oldp, oldlenp, newp, newlen, p)); } #endif switch (name[1]) { case VFS_MAXTYPENUM: if (namelen != 2) return (ENOTDIR); return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); case VFS_CONF: if (namelen != 3) return (ENOTDIR); /* overloaded */ for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) if (vfsp->vfc_typenum == name[2]) break; if (vfsp == NULL) return (EOPNOTSUPP); return (SYSCTL_OUT(req, vfsp, sizeof *vfsp)); } return (EOPNOTSUPP); } SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl, "Generic filesystem"); #if 1 || defined(COMPAT_PRELITE2) static int sysctl_ovfs_conf SYSCTL_HANDLER_ARGS { int error; struct vfsconf *vfsp; struct ovfsconf ovfs; for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) { ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ strcpy(ovfs.vfc_name, vfsp->vfc_name); ovfs.vfc_index = vfsp->vfc_typenum; ovfs.vfc_refcount = vfsp->vfc_refcount; ovfs.vfc_flags = vfsp->vfc_flags; error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); if (error) return error; } return 0; } #endif /* 1 || COMPAT_PRELITE2 */ #if 0 #define KINFO_VNODESLOP 10 /* * Dump vnode list (via sysctl). * Copyout address of vnode followed by vnode. */ /* ARGSUSED */ static int sysctl_vnode SYSCTL_HANDLER_ARGS { struct proc *p = curproc; /* XXX */ struct mount *mp, *nmp; struct vnode *nvp, *vp; int error; #define VPTRSZ sizeof (struct vnode *) #define VNODESZ sizeof (struct vnode) req->lock = 0; if (!req->oldptr) /* Make an estimate */ return (SYSCTL_OUT(req, 0, (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ))); simple_lock(&mountlist_slock); for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) { if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) { nmp = mp->mnt_list.cqe_next; continue; } again: simple_lock(&mntvnode_slock); for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) { /* * Check that the vp is still associated with * this filesystem. RACE: could have been * recycled onto the same filesystem. */ if (vp->v_mount != mp) { simple_unlock(&mntvnode_slock); goto again; } nvp = vp->v_mntvnodes.le_next; simple_unlock(&mntvnode_slock); if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) || (error = SYSCTL_OUT(req, vp, VNODESZ))) return (error); simple_lock(&mntvnode_slock); } simple_unlock(&mntvnode_slock); simple_lock(&mountlist_slock); nmp = mp->mnt_list.cqe_next; vfs_unbusy(mp, p); } simple_unlock(&mountlist_slock); return (0); } #endif /* * XXX * Exporting the vnode list on large systems causes them to crash. * Exporting the vnode list on medium systems causes sysctl to coredump. */ #if 0 SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD, 0, 0, sysctl_vnode, "S,vnode", ""); #endif /* * Check to see if a filesystem is mounted on a block device. 
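 *
 * As an illustrative sketch (hypothetical caller, not part of this change),
 * a disk filesystem's mount routine would typically reject a device that is
 * already in use with something like
 *
 *    if ((error = vfs_mountedon(devvp)) != 0) {
 *            vrele(devvp);
 *            return (error);
 *    }
 *
 * relying on the EBUSY returned below when v_specmountpoint is already set
 * on the device vnode or one of its aliases.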
*/ int vfs_mountedon(vp) struct vnode *vp; { struct vnode *vq; int error = 0; if (vp->v_specmountpoint != NULL) return (EBUSY); if (vp->v_flag & VALIASED) { simple_lock(&spechash_slock); for (vq = vp->v_hashchain; vq; vq = vq->v_specnext) { if (vq->v_type != vp->v_type) continue; if (vq->v_specmountpoint != NULL) { error = EBUSY; break; } } simple_unlock(&spechash_slock); } return (error); } /* * Unmount all filesystems. The list is traversed in reverse order * of mounting to avoid dependencies. */ void vfs_unmountall() { struct mount *mp, *nmp; struct proc *p; int error; if (curproc != NULL) p = curproc; else p = initproc; /* XXX XXX should this be proc0? */ /* * Since this only runs when rebooting, it is not interlocked. */ for (mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) { nmp = mp->mnt_list.cqe_prev; error = dounmount(mp, MNT_FORCE, p); if (error) { printf("unmount of %s failed (", mp->mnt_stat.f_mntonname); if (error == EBUSY) printf("BUSY)\n"); else printf("%d)\n", error); } } } /* * Build hash lists of net addresses and hang them off the mount point. * Called by ufs_mount() to set up the lists of export addresses. */ static int vfs_hang_addrlist(mp, nep, argp) struct mount *mp; struct netexport *nep; struct export_args *argp; { register struct netcred *np; register struct radix_node_head *rnh; register int i; struct radix_node *rn; struct sockaddr *saddr, *smask = 0; struct domain *dom; int error; if (argp->ex_addrlen == 0) { if (mp->mnt_flag & MNT_DEFEXPORTED) return (EPERM); np = &nep->ne_defexported; np->netc_exflags = argp->ex_flags; np->netc_anon = argp->ex_anon; np->netc_anon.cr_ref = 1; mp->mnt_flag |= MNT_DEFEXPORTED; return (0); } i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen; np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK); bzero((caddr_t) np, i); saddr = (struct sockaddr *) (np + 1); if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen))) goto out; if (saddr->sa_len > argp->ex_addrlen) saddr->sa_len = argp->ex_addrlen; if (argp->ex_masklen) { smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen); error = copyin(argp->ex_mask, (caddr_t) smask, argp->ex_masklen); if (error) goto out; if (smask->sa_len > argp->ex_masklen) smask->sa_len = argp->ex_masklen; } i = saddr->sa_family; if ((rnh = nep->ne_rtable[i]) == 0) { /* * Seems silly to initialize every AF when most are not used, * do so on demand here */ for (dom = domains; dom; dom = dom->dom_next) if (dom->dom_family == i && dom->dom_rtattach) { dom->dom_rtattach((void **) &nep->ne_rtable[i], dom->dom_rtoffset); break; } if ((rnh = nep->ne_rtable[i]) == 0) { error = ENOBUFS; goto out; } } rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh, np->netc_rnodes); if (rn == 0 || np != (struct netcred *) rn) { /* already exists */ error = EPERM; goto out; } np->netc_exflags = argp->ex_flags; np->netc_anon = argp->ex_anon; np->netc_anon.cr_ref = 1; return (0); out: free(np, M_NETADDR); return (error); } /* ARGSUSED */ static int vfs_free_netcred(rn, w) struct radix_node *rn; void *w; { register struct radix_node_head *rnh = (struct radix_node_head *) w; (*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh); free((caddr_t) rn, M_NETADDR); return (0); } /* * Free the net address hash lists that are hanging off the mount points. 
*/ static void vfs_free_addrlist(nep) struct netexport *nep; { register int i; register struct radix_node_head *rnh; for (i = 0; i <= AF_MAX; i++) if ((rnh = nep->ne_rtable[i])) { (*rnh->rnh_walktree) (rnh, vfs_free_netcred, (caddr_t) rnh); free((caddr_t) rnh, M_RTABLE); nep->ne_rtable[i] = 0; } } int vfs_export(mp, nep, argp) struct mount *mp; struct netexport *nep; struct export_args *argp; { int error; if (argp->ex_flags & MNT_DELEXPORT) { if (mp->mnt_flag & MNT_EXPUBLIC) { vfs_setpublicfs(NULL, NULL, NULL); mp->mnt_flag &= ~MNT_EXPUBLIC; } vfs_free_addrlist(nep); mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED); } if (argp->ex_flags & MNT_EXPORTED) { if (argp->ex_flags & MNT_EXPUBLIC) { if ((error = vfs_setpublicfs(mp, nep, argp)) != 0) return (error); mp->mnt_flag |= MNT_EXPUBLIC; } if ((error = vfs_hang_addrlist(mp, nep, argp))) return (error); mp->mnt_flag |= MNT_EXPORTED; } return (0); } /* * Set the publicly exported filesystem (WebNFS). Currently, only * one public filesystem is possible in the spec (RFC 2054 and 2055) */ int vfs_setpublicfs(mp, nep, argp) struct mount *mp; struct netexport *nep; struct export_args *argp; { int error; struct vnode *rvp; char *cp; /* * mp == NULL -> invalidate the current info, the FS is * no longer exported. May be called from either vfs_export * or unmount, so check if it hasn't already been done. */ if (mp == NULL) { if (nfs_pub.np_valid) { nfs_pub.np_valid = 0; if (nfs_pub.np_index != NULL) { FREE(nfs_pub.np_index, M_TEMP); nfs_pub.np_index = NULL; } } return (0); } /* * Only one allowed at a time. */ if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount) return (EBUSY); /* * Get real filehandle for root of exported FS. */ bzero((caddr_t)&nfs_pub.np_handle, sizeof(nfs_pub.np_handle)); nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid; if ((error = VFS_ROOT(mp, &rvp))) return (error); if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid))) return (error); vput(rvp); /* * If an indexfile was specified, pull it in. */ if (argp->ex_indexfile != NULL) { MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP, M_WAITOK); error = copyinstr(argp->ex_indexfile, nfs_pub.np_index, MAXNAMLEN, (size_t *)0); if (!error) { /* * Check for illegal filenames. */ for (cp = nfs_pub.np_index; *cp; cp++) { if (*cp == '/') { error = EINVAL; break; } } } if (error) { FREE(nfs_pub.np_index, M_TEMP); return (error); } } nfs_pub.np_mount = mp; nfs_pub.np_valid = 1; return (0); } struct netcred * vfs_export_lookup(mp, nep, nam) register struct mount *mp; struct netexport *nep; struct sockaddr *nam; { register struct netcred *np; register struct radix_node_head *rnh; struct sockaddr *saddr; np = NULL; if (mp->mnt_flag & MNT_EXPORTED) { /* * Lookup in the export list first. */ if (nam != NULL) { saddr = nam; rnh = nep->ne_rtable[saddr->sa_family]; if (rnh != NULL) { np = (struct netcred *) (*rnh->rnh_matchaddr)((caddr_t)saddr, rnh); if (np && np->netc_rnodes->rn_flags & RNF_ROOT) np = NULL; } } /* * If no address match, use the default if it exists. */ if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED) np = &nep->ne_defexported; } return (np); } /* * perform msync on all vnodes under a mount point * the mount point must be locked. */ void vfs_msync(struct mount *mp, int flags) { struct vnode *vp, *nvp; struct vm_object *obj; int anyio, tries; tries = 5; loop: anyio = 0; for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) { nvp = vp->v_mntvnodes.le_next; if (vp->v_mount != mp) { goto loop; } if (vp->v_flag & VXLOCK) /* XXX: what if MNT_WAIT? 
*/ continue; if (flags != MNT_WAIT) { obj = vp->v_object; if (obj == NULL || (obj->flags & OBJ_MIGHTBEDIRTY) == 0) continue; if (VOP_ISLOCKED(vp)) continue; } simple_lock(&vp->v_interlock); if (vp->v_object && (vp->v_object->flags & OBJ_MIGHTBEDIRTY)) { if (!vget(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, curproc)) { if (vp->v_object) { vm_object_page_clean(vp->v_object, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : 0); anyio = 1; } vput(vp); } } else { simple_unlock(&vp->v_interlock); } } if (anyio && (--tries > 0)) goto loop; } /* * Create the VM object needed for VMIO and mmap support. This * is done for all VREG files in the system. Some filesystems might * afford the additional metadata buffering capability of the * VMIO code by making the device node be VMIO mode also. * * vp must be locked when vfs_object_create is called. */ int vfs_object_create(vp, p, cred) struct vnode *vp; struct proc *p; struct ucred *cred; { struct vattr vat; vm_object_t object; int error = 0; if (vp->v_type != VBLK && vn_canvmio(vp) == FALSE) return 0; retry: if ((object = vp->v_object) == NULL) { if (vp->v_type == VREG || vp->v_type == VDIR) { if ((error = VOP_GETATTR(vp, &vat, cred, p)) != 0) goto retn; object = vnode_pager_alloc(vp, vat.va_size, 0, 0); } else if (bdevsw(vp->v_rdev) != NULL) { /* * This simply allocates the biggest object possible * for a VBLK vnode. This should be fixed, but doesn't * cause any problems (yet). */ object = vnode_pager_alloc(vp, IDX_TO_OFF(INT_MAX), 0, 0); } else { goto retn; } /* * Dereference the reference we just created. This assumes * that the object is associated with the vp. */ object->ref_count--; vp->v_usecount--; } else { if (object->flags & OBJ_DEAD) { VOP_UNLOCK(vp, 0, p); tsleep(object, PVM, "vodead", 0); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); goto retry; } } KASSERT(vp->v_object != NULL, ("vfs_object_create: NULL object")); vp->v_flag |= VOBJBUF; retn: return error; } static void vfree(vp) struct vnode *vp; { int s; s = splbio(); simple_lock(&vnode_free_list_slock); if (vp->v_flag & VTBFREE) { TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist); vp->v_flag &= ~VTBFREE; } if (vp->v_flag & VAGE) { TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); } else { TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); } freevnodes++; simple_unlock(&vnode_free_list_slock); vp->v_flag &= ~VAGE; vp->v_flag |= VFREE; splx(s); } void vbusy(vp) struct vnode *vp; { int s; s = splbio(); simple_lock(&vnode_free_list_slock); if (vp->v_flag & VTBFREE) { TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist); vp->v_flag &= ~VTBFREE; } else { TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); freevnodes--; } simple_unlock(&vnode_free_list_slock); vp->v_flag &= ~(VFREE|VAGE); splx(s); } /* * Record a process's interest in events which might happen to * a vnode. Because poll uses the historic select-style interface * internally, this routine serves as both the ``check for any * pending events'' and the ``record my interest in future events'' * functions. (These are done together, while the lock is held, * to avoid race conditions.) */ int vn_pollrecord(vp, p, events) struct vnode *vp; struct proc *p; short events; { simple_lock(&vp->v_pollinfo.vpi_lock); if (vp->v_pollinfo.vpi_revents & events) { /* * This leaves events we are not interested * in available for the other process which * which presumably had requested them * (otherwise they would never have been * recorded). 
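 *
 * As a general usage sketch (hypothetical filesystem code, not part of this
 * change): a VOP_POLL implementation that has nothing pending typically ends
 * with
 *
 *    return (vn_pollrecord(vp, p, events));
 *
 * while the side that later produces activity calls
 *
 *    vn_pollevent(vp, POLLIN | POLLRDNORM);
 *
 * (or uses the VN_POLLEVENT macro discussed below) to wake the recorded
 * waiters.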
*/ events &= vp->v_pollinfo.vpi_revents; vp->v_pollinfo.vpi_revents &= ~events; simple_unlock(&vp->v_pollinfo.vpi_lock); return events; } vp->v_pollinfo.vpi_events |= events; selrecord(p, &vp->v_pollinfo.vpi_selinfo); simple_unlock(&vp->v_pollinfo.vpi_lock); return 0; } /* * Note the occurrence of an event. If the VN_POLLEVENT macro is used, * it is possible for us to miss an event due to race conditions, but * that condition is expected to be rare, so for the moment it is the * preferred interface. */ void vn_pollevent(vp, events) struct vnode *vp; short events; { simple_lock(&vp->v_pollinfo.vpi_lock); if (vp->v_pollinfo.vpi_events & events) { /* * We clear vpi_events so that we don't * call selwakeup() twice if two events are * posted before the polling process(es) is * awakened. This also ensures that we take at * most one selwakeup() if the polling process * is no longer interested. However, it does * mean that only one event can be noticed at * a time. (Perhaps we should only clear those * event bits which we note?) XXX */ vp->v_pollinfo.vpi_events = 0; /* &= ~events ??? */ vp->v_pollinfo.vpi_revents |= events; selwakeup(&vp->v_pollinfo.vpi_selinfo); } simple_unlock(&vp->v_pollinfo.vpi_lock); } /* * Wake up anyone polling on vp because it is being revoked. * This depends on dead_poll() returning POLLHUP for correct * behavior. */ void vn_pollgone(vp) struct vnode *vp; { simple_lock(&vp->v_pollinfo.vpi_lock); if (vp->v_pollinfo.vpi_events) { vp->v_pollinfo.vpi_events = 0; selwakeup(&vp->v_pollinfo.vpi_selinfo); } simple_unlock(&vp->v_pollinfo.vpi_lock); } /* * Routine to create and manage a filesystem syncer vnode. */ #define sync_close ((int (*) __P((struct vop_close_args *)))nullop) static int sync_fsync __P((struct vop_fsync_args *)); static int sync_inactive __P((struct vop_inactive_args *)); static int sync_reclaim __P((struct vop_reclaim_args *)); #define sync_lock ((int (*) __P((struct vop_lock_args *)))vop_nolock) #define sync_unlock ((int (*) __P((struct vop_unlock_args *)))vop_nounlock) static int sync_print __P((struct vop_print_args *)); #define sync_islocked ((int(*) __P((struct vop_islocked_args *)))vop_noislocked) static vop_t **sync_vnodeop_p; static struct vnodeopv_entry_desc sync_vnodeop_entries[] = { { &vop_default_desc, (vop_t *) vop_eopnotsupp }, { &vop_close_desc, (vop_t *) sync_close }, /* close */ { &vop_fsync_desc, (vop_t *) sync_fsync }, /* fsync */ { &vop_inactive_desc, (vop_t *) sync_inactive }, /* inactive */ { &vop_reclaim_desc, (vop_t *) sync_reclaim }, /* reclaim */ { &vop_lock_desc, (vop_t *) sync_lock }, /* lock */ { &vop_unlock_desc, (vop_t *) sync_unlock }, /* unlock */ { &vop_print_desc, (vop_t *) sync_print }, /* print */ { &vop_islocked_desc, (vop_t *) sync_islocked }, /* islocked */ { NULL, NULL } }; static struct vnodeopv_desc sync_vnodeop_opv_desc = { &sync_vnodeop_p, sync_vnodeop_entries }; VNODEOP_SET(sync_vnodeop_opv_desc); /* * Create a new filesystem syncer vnode for the specified mount point. */ int vfs_allocate_syncvnode(mp) struct mount *mp; { struct vnode *vp; static long start, incr, next; int error; /* Allocate a new vnode */ if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) { mp->mnt_syncer = NULL; return (error); } vp->v_type = VNON; /* * Place the vnode onto the syncer worklist. We attempt to * scatter them about on the list so that they will go off * at evenly distributed times even if all the filesystems * are mounted at once. 
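 *
 * As a worked example (assuming syncer_maxdelay is 32 and syncdelay is at
 * least that large), successive calls below hand out the slots
 *
 *    16, 8, 24, 4, 12, 20, 28, 2, ...
 *
 * i.e. the start/incr arithmetic repeatedly bisects the delay range, so
 * syncer vnodes stay roughly evenly spread even when many filesystems are
 * mounted back to back.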
*/ next += incr; if (next == 0 || next > syncer_maxdelay) { start /= 2; incr /= 2; if (start == 0) { start = syncer_maxdelay / 2; incr = syncer_maxdelay; } next = start; } vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0); mp->mnt_syncer = vp; return (0); } /* * Do a lazy sync of the filesystem. */ static int sync_fsync(ap) struct vop_fsync_args /* { struct vnode *a_vp; struct ucred *a_cred; int a_waitfor; struct proc *a_p; } */ *ap; { struct vnode *syncvp = ap->a_vp; struct mount *mp = syncvp->v_mount; struct proc *p = ap->a_p; int asyncflag; /* * We only need to do something if this is a lazy evaluation. */ if (ap->a_waitfor != MNT_LAZY) return (0); /* * Move ourselves to the back of the sync list. */ vn_syncer_add_to_worklist(syncvp, syncdelay); /* * Walk the list of vnodes pushing all that are dirty and * not already on the sync list. */ simple_lock(&mountlist_slock); if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_slock, p) != 0) { simple_unlock(&mountlist_slock); return (0); } asyncflag = mp->mnt_flag & MNT_ASYNC; mp->mnt_flag &= ~MNT_ASYNC; vfs_msync(mp, MNT_NOWAIT); VFS_SYNC(mp, MNT_LAZY, ap->a_cred, p); if (asyncflag) mp->mnt_flag |= MNT_ASYNC; vfs_unbusy(mp, p); return (0); } /* * The syncer vnode is no referenced. */ static int sync_inactive(ap) struct vop_inactive_args /* { struct vnode *a_vp; struct proc *a_p; } */ *ap; { vgone(ap->a_vp); return (0); } /* * The syncer vnode is no longer needed and is being decommissioned. * * Modifications to the worklist must be protected at splbio(). */ static int sync_reclaim(ap) struct vop_reclaim_args /* { struct vnode *a_vp; } */ *ap; { struct vnode *vp = ap->a_vp; int s; s = splbio(); vp->v_mount->mnt_syncer = NULL; if (vp->v_flag & VONWORKLST) { LIST_REMOVE(vp, v_synclist); vp->v_flag &= ~VONWORKLST; } splx(s); return (0); } /* * Print out a syncer vnode. */ static int sync_print(ap) struct vop_print_args /* { struct vnode *a_vp; } */ *ap; { struct vnode *vp = ap->a_vp; printf("syncer vnode"); if (vp->v_vnlock != NULL) lockmgr_printinfo(vp->v_vnlock); printf("\n"); return (0); } /* * extract the dev_t from a VBLK or VCHR */ dev_t vn_todev(vp) struct vnode *vp; { if (vp->v_type != VBLK && vp->v_type != VCHR) return (NODEV); return (vp->v_rdev); } Index: head/sys/miscfs/devfs/devfs_vnops.c =================================================================== --- head/sys/miscfs/devfs/devfs_vnops.c (revision 49534) +++ head/sys/miscfs/devfs/devfs_vnops.c (revision 49535) @@ -1,2135 +1,2134 @@ /* * Copyright 1997,1998 Julian Elischer. All rights reserved. * julian@freebsd.org * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER ``AS IS'' AND ANY EXPRESS * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE HOLDER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $Id: devfs_vnops.c,v 1.74 1999/05/11 19:54:35 phk Exp $ + * $Id: devfs_vnops.c,v 1.75 1999/06/26 02:46:17 mckusick Exp $ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#include /* definitions of spec functions we use */ #include #include #include #include #include #include #include #include #include #include /* * Insert description here */ /* * Convert a component of a pathname into a pointer to a locked node. * This is a very central and rather complicated routine. * If the file system is not maintained in a strict tree hierarchy, * this can result in a deadlock situation (see comments in code below). * * The flag argument is LOOKUP, CREATE, RENAME, or DELETE depending on * whether the name is to be looked up, created, renamed, or deleted. * When CREATE, RENAME, or DELETE is specified, information usable in * creating, renaming, or deleting a directory entry may be calculated. * If flag has LOCKPARENT or'ed into it and the target of the pathname * exists, lookup returns both the target and its parent directory locked. * When creating or renaming and LOCKPARENT is specified, the target may * not be ".". When deleting and LOCKPARENT is specified, the target may * be "."., but the caller must check to ensure it does an vrele and DNUNLOCK * instead of two DNUNLOCKs. * * Overall outline of devfs_lookup: * * check accessibility of directory * null terminate the component (lookup leaves the whole string alone) * look for name in cache, if found, then if at end of path * and deleting or creating, drop it, else return name * search for name in directory, to found or notfound * notfound: * if creating, return locked directory, * else return error * found: * if at end of path and deleting, return information to allow delete * if at end of path and rewriting (RENAME and LOCKPARENT), lock target * node and return info to allow rewrite * if not at end, add name to cache; if at end and neither creating * nor deleting, add name to cache * On return to lookup, remove the null termination we put in at the start. * * NOTE: (LOOKUP | LOCKPARENT) currently returns the parent node unlocked. 
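 *
 * As a rough illustration (hypothetical caller, not part of this change),
 * the flag combinations discussed above normally arrive via namei(), e.g.
 *
 *    NDINIT(&nd, DELETE, LOCKPARENT, UIO_USERSPACE, path, p);
 *    error = namei(&nd);
 *
 * which is what drives the DELETE/LOCKPARENT handling below into returning
 * both the locked parent directory and the target vnode.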
*/ static int devfs_lookup(struct vop_lookup_args *ap) /*struct vop_lookup_args { struct vnode * a_dvp; directory vnode ptr struct vnode ** a_vpp; where to put the result struct componentname * a_cnp; the name we want };*/ { struct componentname *cnp = ap->a_cnp; struct vnode *dir_vnode = ap->a_dvp; struct vnode **result_vnode = ap->a_vpp; dn_p dir_node; /* the directory we are searching */ dn_p new_node; /* the node we are searching for */ devnm_p new_nodename; int flags = cnp->cn_flags; int op = cnp->cn_nameiop; /* LOOKUP, CREATE, RENAME, or DELETE */ int lockparent = flags & LOCKPARENT; int wantparent = flags & (LOCKPARENT|WANTPARENT); int error = 0; struct proc *p = cnp->cn_proc; char heldchar; /* the char at the end of the name componet */ *result_vnode = NULL; /* safe not sorry */ /*XXX*/ DBPRINT(("lookup\n")); if (dir_vnode->v_usecount == 0) printf("dir had no refs "); if (devfs_vntodn(dir_vnode,&dir_node)) { printf("vnode has changed?\n"); vprint("=",dir_vnode); return(EINVAL); } /* * Check accessiblity of directory. */ if (dir_node->type != DEV_DIR) /* XXX or symlink? */ { return (ENOTDIR); } if ((error = VOP_ACCESS(dir_vnode, VEXEC, cnp->cn_cred, p)) != 0) { return (error); } /* * We now have a segment name to search for, and a directory to search. * */ /***********************************************************************\ * SEARCH FOR NAME * * while making sure the component is null terminated for the strcmp * \***********************************************************************/ heldchar = cnp->cn_nameptr[cnp->cn_namelen]; cnp->cn_nameptr[cnp->cn_namelen] = '\0'; new_nodename = dev_findname(dir_node,cnp->cn_nameptr); cnp->cn_nameptr[cnp->cn_namelen] = heldchar; if(!new_nodename) { /*******************************************************\ * Failed to find it.. (That may be good) * \*******************************************************/ new_node = NULL; /* to be safe */ /* * If creating, and at end of pathname * then can consider * allowing file to be created. */ if (!(flags & ISLASTCN) || !(op == CREATE || op == RENAME)) { return ENOENT; } /* * Access for write is interpreted as allowing * creation of files in the directory. */ if ((error = VOP_ACCESS(dir_vnode, VWRITE, cnp->cn_cred, p)) != 0) { DBPRINT(("MKACCESS ")); return (error); } /* * We return with the directory locked, so that * the parameters we set up above will still be * valid if we actually decide to add a new entry. * We return ni_vp == NULL to indicate that the entry * does not currently exist; we leave a pointer to * the (locked) directory vnode in namei_data->ni_dvp. * The pathname buffer is saved so that the name * can be obtained later. * * NB - if the directory is unlocked, then this * information cannot be used. */ cnp->cn_flags |= SAVENAME; /*XXX why? */ if (!lockparent) VOP_UNLOCK(dir_vnode, 0, p); return (EJUSTRETURN); } /***************************************************************\ * Found it.. this is not always a good thing.. * \***************************************************************/ new_node = new_nodename->dnp; new_node->last_lookup = new_nodename; /* for unlink */ /* * If deleting, and at end of pathname, return * parameters which can be used to remove file. * If the wantparent flag isn't set, we return only * the directory (in namei_data->ni_dvp), otherwise we go * on and lock the node, being careful with ".". */ if (op == DELETE && (flags & ISLASTCN)) { /* * Write access to directory required to delete files. 
*/ if ((error = VOP_ACCESS(dir_vnode, VWRITE, cnp->cn_cred, p)) != 0) return (error); /* * we are trying to delete '.'. What does this mean? XXX */ if (dir_node == new_node) { VREF(dir_vnode); *result_vnode = dir_vnode; return (0); } /* * If directory is "sticky", then user must own * the directory, or the file in it, else she * may not delete it (unless she's root). This * implements append-only directories. */ devfs_dntovn(new_node,result_vnode); #ifdef NOTYET if ((dir_node->mode & ISVTX) && cnp->cn_cred->cr_uid != 0 && cnp->cn_cred->cr_uid != dir_node->uid && cnp->cn_cred->cr_uid != new_node->uid) { VOP_UNLOCK(*result_vnode, 0, p); return (EPERM); } #endif if (!lockparent) VOP_UNLOCK(dir_vnode, 0, p); return (0); } /* * If rewriting (RENAME), return the vnode and the * information required to rewrite the present directory * Must get node of directory entry to verify it's a * regular file, or empty directory. */ if (op == RENAME && wantparent && (flags & ISLASTCN)) { /* * Are we allowed to change the holding directory? */ if ((error = VOP_ACCESS(dir_vnode, VWRITE, cnp->cn_cred, p)) != 0) return (error); /* * Careful about locking second node. * This can only occur if the target is ".". */ if (dir_node == new_node) return (EISDIR); devfs_dntovn(new_node,result_vnode); /* hmm save the 'from' name (we need to delete it) */ cnp->cn_flags |= SAVENAME; if (!lockparent) VOP_UNLOCK(dir_vnode, 0, p); return (0); } /* * Step through the translation in the name. We do not unlock the * directory because we may need it again if a symbolic link * is relative to the current directory. Instead we save it * unlocked as "saved_dir_node" XXX. We must get the target * node before unlocking * the directory to insure that the node will not be removed * before we get it. We prevent deadlock by always fetching * nodes from the root, moving down the directory tree. Thus * when following backward pointers ".." we must unlock the * parent directory before getting the requested directory. * There is a potential race condition here if both the current * and parent directories are removed before the lock for the * node associated with ".." returns. We hope that this occurs * infrequently since we cannot avoid this race condition without * implementing a sophisticated deadlock detection algorithm. * Note also that this simple deadlock detection scheme will not * work if the file system has any hard links other than ".." * that point backwards in the directory structure. */ if (flags & ISDOTDOT) { VOP_UNLOCK(dir_vnode, 0, p); /* race to get the node */ devfs_dntovn(new_node,result_vnode); if (lockparent && (flags & ISLASTCN)) vn_lock(dir_vnode, LK_EXCLUSIVE | LK_RETRY, p); } else if (dir_node == new_node) { VREF(dir_vnode); /* we want ourself, ie "." */ *result_vnode = dir_vnode; } else { devfs_dntovn(new_node,result_vnode); if (!lockparent || (flags & ISLASTCN)) VOP_UNLOCK(dir_vnode, 0, p); } DBPRINT(("GOT\n")); return (0); } /* */ static int devfs_access(struct vop_access_args *ap) /*struct vop_access_args { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct proc *a_p; } */ { /* * mode is filled with a combination of VREAD, VWRITE, * and/or VEXEC bits turned on. In an octal number these * are the Y in 0Y00. 
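 *
 * As a concrete example (assuming the usual VREAD = 0400, VWRITE = 0200,
 * VEXEC = 0100 encoding): a request for VREAD|VEXEC arrives here as
 * mode = 0500.  If the caller is not the owner, the code below shifts it
 * to 0050 to test the group bits and, failing that, to 0005 to test the
 * "other" bits before comparing against file_node->mode.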
*/ struct vnode *vp = ap->a_vp; int mode = ap->a_mode; struct ucred *cred = ap->a_cred; dn_p file_node; int error; gid_t *gp; int i; DBPRINT(("access\n")); if ((error = devfs_vntodn(vp,&file_node)) != 0) { printf("devfs_vntodn returned %d ",error); return error; } /* * if we are not running as a process, we are in the * kernel and we DO have permission */ if (ap->a_p == NULL) return 0; /* * Access check is based on only one of owner, group, public. * If not owner, then check group. If not a member of the * group, then check public access. */ if (cred->cr_uid != file_node->uid) { /* failing that.. try groups */ mode >>= 3; gp = cred->cr_groups; for (i = 0; i < cred->cr_ngroups; i++, gp++) { if (file_node->gid == *gp) { goto found; } } /* failing that.. try general access */ mode >>= 3; found: ; } if ((file_node->mode & mode) == mode) return (0); /* * Root gets to do anything. * but only use suser_xxx prives as a last resort * (Use of super powers is recorded in ap->a_p->p_acflag) */ if( suser_xxx(cred, ap->a_p, 0) == 0) /* XXX what if no proc? */ return 0; return (EACCES); } static int devfs_getattr(struct vop_getattr_args *ap) /*struct vop_getattr_args { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct proc *a_p; } */ { struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; dn_p file_node; int error; DBPRINT(("getattr\n")); if ((error = devfs_vntodn(vp,&file_node)) != 0) { printf("devfs_vntodn returned %d ",error); return error; } vap->va_rdev = 0;/* default value only */ vap->va_mode = file_node->mode; switch (file_node->type) { case DEV_DIR: vap->va_rdev = (udev_t)file_node->dvm; vap->va_mode |= (S_IFDIR); break; case DEV_CDEV: vap->va_rdev = dev2udev(file_node->by.Cdev.dev); vap->va_mode |= (S_IFCHR); break; case DEV_BDEV: vap->va_rdev = dev2udev(file_node->by.Bdev.dev); vap->va_mode |= (S_IFBLK); break; case DEV_SLNK: break; } vap->va_type = vp->v_type; vap->va_nlink = file_node->links; vap->va_uid = file_node->uid; vap->va_gid = file_node->gid; vap->va_fsid = (intptr_t)(void *)file_node->dvm; vap->va_fileid = (intptr_t)(void *)file_node; vap->va_size = file_node->len; /* now a u_quad_t */ vap->va_blocksize = 512; /* * XXX If the node times are in Jan 1, 1970, then * update them to the boot time. * When we made the node, the date/time was not yet known. 
*/ if(file_node->ctime.tv_sec < (24 * 3600)) { TIMEVAL_TO_TIMESPEC(&boottime,&(file_node->ctime)); TIMEVAL_TO_TIMESPEC(&boottime,&(file_node->mtime)); TIMEVAL_TO_TIMESPEC(&boottime,&(file_node->atime)); } if (file_node->flags & IN_ACCESS) { nanotime(&file_node->atime); file_node->flags &= ~IN_ACCESS; } vap->va_ctime = file_node->ctime; vap->va_mtime = file_node->mtime; vap->va_atime = file_node->atime; vap->va_gen = 0; vap->va_flags = 0; vap->va_bytes = file_node->len; /* u_quad_t */ vap->va_filerev = 0; /* XXX */ /* u_quad_t */ vap->va_vaflags = 0; /* XXX */ return 0; } static int devfs_setattr(struct vop_setattr_args *ap) /*struct vop_setattr_args { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct proc *a_p; } */ { struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; struct ucred *cred = ap->a_cred; struct proc *p = ap->a_p; int error = 0; gid_t *gp; int i; dn_p file_node; if (vap->va_flags != VNOVAL) /* XXX needs to be implemented */ return (EOPNOTSUPP); if ((error = devfs_vntodn(vp,&file_node)) != 0) { printf("devfs_vntodn returned %d ",error); return error; } DBPRINT(("setattr\n")); if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) || (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) || (vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL )) { return EINVAL; } /* * Anyone can touch the files in such a way that the times are set * to NOW (e.g. run 'touch') if they have write permissions * however only the owner or root can set "un-natural times. * They also don't need write permissions. */ if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) { #if 0 /* * This next test is pointless under devfs for now.. * as there is only one devfs hiding under potentially many * mountpoints and actual device node are really 'mounted' under * a FAKE mountpoint inside the kernel only, no matter where it * APPEARS they are mounted to the outside world.. * A readonly devfs doesn't exist anyway. */ if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); #endif if (((vap->va_vaflags & VA_UTIMES_NULL) == 0) && (cred->cr_uid != file_node->uid) && suser_xxx(cred, p, 0)) return (EPERM); if(VOP_ACCESS(vp, VWRITE, cred, p)) return (EACCES); file_node->atime = vap->va_atime; file_node->mtime = vap->va_mtime; nanotime(&file_node->ctime); return (0); } /* * Change the permissions.. must be root or owner to do this. */ if (vap->va_mode != (u_short)VNOVAL) { if ((cred->cr_uid != file_node->uid) && suser_xxx(cred, p, 0)) return (EPERM); /* set drwxwxrwx stuff */ file_node->mode &= ~07777; file_node->mode |= vap->va_mode & 07777; } /* * Change the owner.. must be root to do this. */ if (vap->va_uid != (uid_t)VNOVAL) { if (suser_xxx(cred, p, 0)) return (EPERM); file_node->uid = vap->va_uid; } /* * Change the group.. must be root or owner to do this. * If we are the owner, we must be in the target group too. * don't use suser_xxx() unless you have to as it reports * whether you needed suser_xxx powers or not. */ if (vap->va_gid != (gid_t)VNOVAL) { if (cred->cr_uid == file_node->uid){ gp = cred->cr_groups; for (i = 0; i < cred->cr_ngroups; i++, gp++) { if (vap->va_gid == *gp) goto cando; } } /* * we can't do it with normal privs, * do we have an ace up our sleeve? 
*/ if( suser_xxx(cred, p, 0)) return (EPERM); cando: file_node->gid = vap->va_gid; } #if 0 /* * Copied from somewhere else * but only kept as a marker and reminder of the fact that * flags should be handled some day */ if (vap->va_flags != VNOVAL) { if (error = suser_xxx(cred, p, 0)) return error; if (cred->cr_uid == 0) ; else { } } #endif return error; } static int devfs_xread(struct vop_read_args *ap) /*struct vop_read_args { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ { int error = 0; dn_p file_node; DBPRINT(("read\n")); if ((error = devfs_vntodn(ap->a_vp,&file_node)) != 0) { printf("devfs_vntodn returned %d ",error); return error; } switch (ap->a_vp->v_type) { case VREG: return(EINVAL); case VDIR: return VOP_READDIR(ap->a_vp,ap->a_uio,ap->a_cred, NULL,NULL,NULL); case VCHR: case VBLK: panic("devfs: vnode methods"); default: panic("devfs_read(): bad file type"); break; } } /* * Write data to a file or directory. */ static int devfs_xwrite(struct vop_write_args *ap) /*struct vop_write_args { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ { switch (ap->a_vp->v_type) { case VREG: return(EINVAL); case VDIR: return(EISDIR); case VCHR: case VBLK: panic("devfs: vnode methods"); default: panic("devfs_write(): bad file type"); } } static int devfs_remove(struct vop_remove_args *ap) /*struct vop_remove_args { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ { struct vnode *vp = ap->a_vp; struct vnode *dvp = ap->a_dvp; struct componentname *cnp = ap->a_cnp; dn_p tp, tdp; devnm_p tnp; int doingdirectory = 0; int error = 0; uid_t ouruid = cnp->cn_cred->cr_uid; DBPRINT(("remove\n")); /* * Lock our directories and get our name pointers * assume that the names are null terminated as they * are the end of the path. Get pointers to all our * devfs structures. */ if ((error = devfs_vntodn(dvp, &tdp)) != 0) { abortit: VOP_ABORTOP(dvp, cnp); return (error); } if ((error = devfs_vntodn(vp, &tp)) != 0) goto abortit; /* * Assuming we are atomic, dev_lookup left this for us */ tnp = tp->last_lookup; /* * Check we are doing legal things WRT the new flags */ if ((tp->flags & (IMMUTABLE | APPEND)) || (tdp->flags & APPEND) /*XXX eh?*/ ) { error = EPERM; goto abortit; } /* * Make sure that we don't try do something stupid */ if ((tp->type) == DEV_DIR) { /* * Avoid ".", "..", and aliases of "." for obvious reasons. */ if ( (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') || (cnp->cn_flags&ISDOTDOT) ) { error = EINVAL; goto abortit; } doingdirectory++; } /*********************************** * Start actually doing things.... * ***********************************/ getnanotime(&(tdp->mtime)); /* * own the parent directory, or the destination of the rename, * otherwise the destination may not be changed (except by * root). This implements append-only directories. * XXX shoudn't this be in generic code? */ if ((tdp->mode & S_ISTXT) && ouruid != 0 && ouruid != tdp->uid && ouruid != tp->uid ) { error = EPERM; goto abortit; } /* * Target must be empty if a directory and have no links * to it. Also, ensure source and target are compatible * (both directories, or both not directories). 
*/ if (( doingdirectory) && (tp->links > 2)) { printf("nlink = %d\n",tp->links); /*XXX*/ error = ENOTEMPTY; goto abortit; } dev_free_name(tnp); tp = NULL; return (error); } /* */ static int devfs_link(struct vop_link_args *ap) /*struct vop_link_args { struct vnode *a_tdvp; struct vnode *a_vp; struct componentname *a_cnp; } */ { struct vnode *vp = ap->a_vp; struct vnode *tdvp = ap->a_tdvp; struct componentname *cnp = ap->a_cnp; dn_p fp, tdp; devnm_p tnp; int error = 0; DBPRINT(("link\n")); /* * First catch an arbitrary restriction for this FS */ if(cnp->cn_namelen > DEVMAXNAMESIZE) { error = ENAMETOOLONG; goto abortit; } /* * Lock our directories and get our name pointers * assume that the names are null terminated as they * are the end of the path. Get pointers to all our * devfs structures. */ if ((error = devfs_vntodn(tdvp,&tdp)) != 0) goto abortit; if ((error = devfs_vntodn(vp,&fp)) != 0) goto abortit; /* * trying to move it out of devfs? (v_tag == VT_DEVFS) */ if ( (vp->v_tag != VT_DEVFS) || (vp->v_tag != tdvp->v_tag) ) { error = EXDEV; abortit: VOP_ABORTOP(tdvp, cnp); goto out; } /* * Check we are doing legal things WRT the new flags */ if (fp->flags & (IMMUTABLE | APPEND)) { error = EPERM; goto abortit; } /*********************************** * Start actually doing things.... * ***********************************/ getnanotime(&(tdp->atime)); error = dev_add_name(cnp->cn_nameptr, tdp, NULL, fp, &tnp); out: return (error); } /* * Rename system call. Seems overly complicated to me... * rename("foo", "bar"); * is essentially * unlink("bar"); * link("foo", "bar"); * unlink("foo"); * but ``atomically''. * * When the target exists, both the directory * and target vnodes are locked. * the source and source-parent vnodes are referenced * * * Basic algorithm is: * * 1) Bump link count on source while we're linking it to the * target. This also ensure the inode won't be deleted out * from underneath us while we work (it may be truncated by * a concurrent `trunc' or `open' for creation). * 2) Link source to destination. If destination already exists, * delete it first. * 3) Unlink source reference to node if still around. If a * directory was moved and the parent of the destination * is different from the source, patch the ".." entry in the * directory. */ static int devfs_rename(struct vop_rename_args *ap) /*struct vop_rename_args { struct vnode *a_fdvp; struct vnode *a_fvp; struct componentname *a_fcnp; struct vnode *a_tdvp; struct vnode *a_tvp; struct componentname *a_tcnp; } */ { struct vnode *tvp = ap->a_tvp; struct vnode *tdvp = ap->a_tdvp; struct vnode *fvp = ap->a_fvp; struct vnode *fdvp = ap->a_fdvp; struct componentname *tcnp = ap->a_tcnp; struct componentname *fcnp = ap->a_fcnp; struct proc *p = fcnp->cn_proc; dn_p fp, fdp, tp, tdp; devnm_p fnp,tnp; int doingdirectory = 0; int error = 0; /* * First catch an arbitrary restriction for this FS */ if(tcnp->cn_namelen > DEVMAXNAMESIZE) { error = ENAMETOOLONG; goto abortit; } /* * Lock our directories and get our name pointers * assume that the names are null terminated as they * are the end of the path. Get pointers to all our * devfs structures. */ if ((error = devfs_vntodn(tdvp,&tdp)) != 0) goto abortit; if ((error = devfs_vntodn(fdvp,&fdp)) != 0) goto abortit; if ((error = devfs_vntodn(fvp,&fp)) != 0) goto abortit; fnp = fp->last_lookup; if (tvp) { if ((error = devfs_vntodn(tvp,&tp)) != 0) goto abortit; tnp = tp->last_lookup; } else { tp = NULL; tnp = NULL; } /* * trying to move it out of devfs? 
(v_tag == VT_DEVFS) * if we move a dir across mnt points. we need to fix all * the mountpoint pointers! XXX * so for now keep dirs within the same mount */ if ( (fvp->v_tag != VT_DEVFS) || (fvp->v_tag != tdvp->v_tag) || (tvp && (fvp->v_tag != tvp->v_tag)) || ((fp->type == DEV_DIR) && (fp->dvm != tdp->dvm ))) { error = EXDEV; abortit: VOP_ABORTOP(tdvp, tcnp); if (tdvp == tvp) /* eh? */ vrele(tdvp); else vput(tdvp); if (tvp) vput(tvp); VOP_ABORTOP(fdvp, fcnp); /* XXX, why not in NFS? */ vrele(fdvp); vrele(fvp); return (error); } /* * Check we are doing legal things WRT the new flags */ if ((tp && (tp->flags & (IMMUTABLE | APPEND))) || (fp->flags & (IMMUTABLE | APPEND)) || (fdp->flags & APPEND)) { error = EPERM; goto abortit; } /* * Make sure that we don't try do something stupid */ if ((fp->type) == DEV_DIR) { /* * Avoid ".", "..", and aliases of "." for obvious reasons. */ if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') || (fcnp->cn_flags&ISDOTDOT) || (tcnp->cn_namelen == 1 && tcnp->cn_nameptr[0] == '.') || (tcnp->cn_flags&ISDOTDOT) || (tdp == fp )) { error = EINVAL; goto abortit; } doingdirectory++; } /* * If ".." must be changed (ie the directory gets a new * parent) then the source directory must not be in the * directory heirarchy above the target, as this would * orphan everything below the source directory. Also * the user must have write permission in the source so * as to be able to change "..". */ if (doingdirectory && (tdp != fdp)) { dn_p tmp,ntmp; error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_proc); tmp = tdp; do { if(tmp == fp) { /* XXX unlock stuff here probably */ error = EINVAL; goto out; } ntmp = tmp; } while ((tmp = tmp->by.Dir.parent) != ntmp); } /*********************************** * Start actually doing things.... * ***********************************/ getnanotime(&(fp->atime)); /* * Check if just deleting a link name. */ if (fvp == tvp) { if (fvp->v_type == VDIR) { error = EINVAL; goto abortit; } /* Release destination completely. */ VOP_ABORTOP(tdvp, tcnp); vput(tdvp); vput(tvp); /* Delete source. */ VOP_ABORTOP(fdvp, fcnp); /*XXX*/ vrele(fdvp); vrele(fvp); dev_free_name(fnp); return 0; } /* * 1) Bump link count while we're moving stuff * around. If we crash somewhere before * completing our work, too bad :) */ fp->links++; /* * If the target exists zap it (unless it's a non-empty directory) * We could do that as well but won't */ if (tp) { int ouruid = tcnp->cn_cred->cr_uid; /* * If the parent directory is "sticky", then the user must * own the parent directory, or the destination of the rename, * otherwise the destination may not be changed (except by * root). This implements append-only directories. * XXX shoudn't this be in generic code? */ if ((tdp->mode & S_ISTXT) && ouruid != 0 && ouruid != tdp->uid && ouruid != tp->uid ) { error = EPERM; goto bad; } /* * Target must be empty if a directory and have no links * to it. Also, ensure source and target are compatible * (both directories, or both not directories). */ if (( doingdirectory) && (tp->links > 2)) { printf("nlink = %d\n",tp->links); /*XXX*/ error = ENOTEMPTY; goto bad; } dev_free_name(tnp); tp = NULL; } dev_add_name(tcnp->cn_nameptr,tdp,fnp->as.front.realthing,fp,&tnp); fnp->dnp = NULL; fp->links--; /* one less link to it.. 
*/ dev_free_name(fnp); fp->links--; /* we added one earlier*/ if (tdp) vput(tdvp); if (tp) vput(fvp); vrele(ap->a_fvp); return (error); bad: if (tp) vput(tvp); vput(tdvp); out: if (vn_lock(fvp, LK_EXCLUSIVE | LK_RETRY, p) == 0) { fp->links--; /* we added one earlier*/ vput(fvp); } else vrele(fvp); return (error); } static int devfs_symlink(struct vop_symlink_args *ap) /*struct vop_symlink_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; char *a_target; } */ { struct vnode *vp; int error; dn_p dnp; union typeinfo by; devnm_p nm_p; DBPRINT(("symlink\n")); if((error = devfs_vntodn(ap->a_dvp, &dnp)) != 0) { return (error); } by.Slnk.name = ap->a_target; by.Slnk.namelen = strlen(ap->a_target); dev_add_entry(ap->a_cnp->cn_nameptr, dnp, DEV_SLNK, &by, NULL, NULL, &nm_p); if((error = devfs_dntovn(nm_p->dnp, &vp)) != 0) { return (error); } VOP_SETATTR(vp, ap->a_vap, ap->a_cnp->cn_cred, ap->a_cnp->cn_proc); *ap->a_vpp = NULL; vput(vp); return 0; } /* * Vnode op for readdir */ static int devfs_readdir(struct vop_readdir_args *ap) /*struct vop_readdir_args { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *eofflag; int *ncookies; u_int **cookies; } */ { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct dirent dirent; dn_p dir_node; devnm_p name_node; char *name; int error = 0; int reclen; int nodenumber; int startpos,pos; DBPRINT(("readdir\n")); /* set up refs to dir */ if ((error = devfs_vntodn(vp,&dir_node)) != 0) return error; if(dir_node->type != DEV_DIR) return(ENOTDIR); pos = 0; startpos = uio->uio_offset; name_node = dir_node->by.Dir.dirlist; nodenumber = 0; getnanotime(&(dir_node->atime)); while ((name_node || (nodenumber < 2)) && (uio->uio_resid > 0)) { switch(nodenumber) { case 0: dirent.d_fileno = (uintptr_t)(void *)dir_node; name = "."; dirent.d_namlen = 1; dirent.d_type = DT_DIR; break; case 1: if(dir_node->by.Dir.parent) dirent.d_fileno = (uintptr_t)(void *)dir_node->by.Dir.parent; else dirent.d_fileno = (uintptr_t)(void *)dir_node; name = ".."; dirent.d_namlen = 2; dirent.d_type = DT_DIR; break; default: dirent.d_fileno = (uintptr_t)(void *)name_node->dnp; dirent.d_namlen = strlen(name_node->name); name = name_node->name; switch(name_node->dnp->type) { case DEV_BDEV: dirent.d_type = DT_BLK; break; case DEV_CDEV: dirent.d_type = DT_CHR; break; case DEV_DDEV: dirent.d_type = DT_SOCK; /*XXX*/ break; case DEV_DIR: dirent.d_type = DT_DIR; break; case DEV_SLNK: dirent.d_type = DT_LNK; break; default: dirent.d_type = DT_UNKNOWN; } } reclen = dirent.d_reclen = GENERIC_DIRSIZ(&dirent); if(pos >= startpos) /* made it to the offset yet? */ { if (uio->uio_resid < reclen) /* will it fit? 
*/ break; strcpy( dirent.d_name,name); if ((error = uiomove ((caddr_t)&dirent, dirent.d_reclen, uio)) != 0) break; } pos += reclen; if((nodenumber >1) && name_node) name_node = name_node->next; nodenumber++; } uio->uio_offset = pos; return (error); } /* */ static int devfs_readlink(struct vop_readlink_args *ap) /*struct vop_readlink_args { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; } */ { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; dn_p lnk_node; int error = 0; DBPRINT(("readlink\n")); /* set up refs to dir */ if ((error = devfs_vntodn(vp,&lnk_node)) != 0) return error; if(lnk_node->type != DEV_SLNK) return(EINVAL); if ((error = VOP_ACCESS(vp, VREAD, ap->a_cred, NULL)) != 0) { /* XXX */ return error; } error = uiomove(lnk_node->by.Slnk.name, lnk_node->by.Slnk.namelen, uio); return error; } #ifdef notyet static int devfs_abortop(struct vop_abortop_args *ap) /*struct vop_abortop_args { struct vnode *a_dvp; struct componentname *a_cnp; } */ { DBPRINT(("abortop\n")); if ((ap->a_cnp->cn_flags & (HASBUF | SAVESTART)) == HASBUF) zfree(namei_zone, ap->a_cnp->cn_pnbuf); return 0; } #endif /* notyet */ static int devfs_reclaim(struct vop_reclaim_args *ap) /*struct vop_reclaim_args { struct vnode *a_vp; } */ { dn_p file_node = NULL; int error; DBPRINT(("reclaim\n")); if ((error = devfs_vntodn(ap->a_vp,&file_node)) != 0) { printf("devfs_vntodn returned %d ",error); return error; } ap->a_vp->v_data = NULL; if (file_node) { file_node->vn = 0; file_node->vn_id = 0; } return(0); } /* * Print out the contents of a /devfs vnode. */ static int devfs_print(struct vop_print_args *ap) /*struct vop_print_args { struct vnode *a_vp; } */ { printf("tag VT_DEVFS, devfs vnode\n"); return (0); } /**************************************************************************\ * pseudo ops * \**************************************************************************/ /*proto*/ void devfs_dropvnode(dn_p dnp) { struct vnode *vn_p; #ifdef PARANOID if(!dnp) { printf("devfs: dn count dropped too early\n"); } #endif vn_p = dnp->vn; /* * check if we have a vnode....... */ if((vn_p) && ( dnp->vn_id == vn_p->v_id) && (dnp == (dn_p)vn_p->v_data)) { VOP_REVOKE(vn_p, REVOKEALL); } dnp->vn = NULL; /* be pedantic about this */ } /* struct vnode *speclisth[SPECHSZ];*/ /* till specfs goes away */ /* * Open a special file. 
struct vop_open_args { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct proc *a_p; } *ap; */ /* ARGSUSED */ static int devfs_open( struct vop_open_args *ap) { struct proc *p = ap->a_p; struct vnode *vp = ap->a_vp; int error; dn_p dnp; if ((error = devfs_vntodn(vp,&dnp)) != 0) return error; switch (vp->v_type) { case VCHR: VOP_UNLOCK(vp, 0, p); error = (*dnp->by.Cdev.cdevsw->d_open)( dnp->by.Cdev.dev, ap->a_mode, S_IFCHR, p); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); return (error); /* NOT REACHED */ case VBLK: error = (*dnp->by.Bdev.bdevsw->d_open)( dnp->by.Bdev.dev, ap->a_mode, S_IFBLK, p); break; default: break; } return (error); } /* * Vnode op for read struct vop_read_args { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ /* ARGSUSED */ static int devfs_read( struct vop_read_args *ap) { register struct vnode *vp = ap->a_vp; register struct uio *uio = ap->a_uio; struct proc *p = uio->uio_procp; struct buf *bp; daddr_t bn, nextbn; long bsize, bscale; struct partinfo dpart; int n, on; d_ioctl_t *ioctl; int error = 0; dev_t dev; dn_p dnp; if ((error = devfs_vntodn(vp,&dnp)) != 0) return error; #ifdef DIAGNOSTIC if (uio->uio_rw != UIO_READ) panic("devfs_read mode"); if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc) panic("devfs_read proc"); #endif if (uio->uio_resid == 0) return (0); switch (vp->v_type) { case VCHR: VOP_UNLOCK(vp, 0, p); error = (*dnp->by.Cdev.cdevsw->d_read) (dnp->by.Cdev.dev, uio, ap->a_ioflag); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); break; case VBLK: if (uio->uio_offset < 0) return (EINVAL); bsize = BLKDEV_IOSIZE; dev = dnp->by.Bdev.dev; /* * This is a hack! */ if ( (ioctl = dnp->by.Bdev.bdevsw->d_ioctl) != NULL && (*ioctl)(dev, DIOCGPART, (caddr_t)&dpart, FREAD, p) == 0 && dpart.part->p_fstype == FS_BSDFFS && dpart.part->p_frag != 0 && dpart.part->p_fsize != 0) bsize = dpart.part->p_frag * dpart.part->p_fsize; bscale = btodb(bsize); /* * Get buffers with this data from the buffer cache. * If it's not there the strategy() entrypoint will be called. * We may do this in several chunks. 
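 *
 * As a worked example (assuming bsize stays at its BLKDEV_IOSIZE default,
 * here taken to be 2048 bytes, so bscale = btodb(2048) = 4): a 1000 byte
 * read at byte offset 3000 computes
 *
 *    bn = btodb(3000) & ~(4 - 1) = 4     (the buffer starting at byte 2048)
 *    on = 3000 % 2048 = 952              (offset of the data in that buffer)
 *    n  = min(2048 - 952, 1000) = 1000
 *
 * so the whole request is satisfied by a single bread() of that buffer and
 * one uiomove() of 1000 bytes starting at offset 952.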
*/ do { bn = btodb(uio->uio_offset) & ~(bscale - 1); on = uio->uio_offset % bsize; n = min((unsigned)(bsize - on), uio->uio_resid); if (vp->v_lastr + bscale == bn) { nextbn = bn + bscale; error = breadn(vp, bn, (int)bsize, &nextbn, (int *)&bsize, 1, NOCRED, &bp); } else error = bread(vp, bn, (int)bsize, NOCRED, &bp); vp->v_lastr = bn; n = min(n, bsize - bp->b_resid); if (error) { brelse(bp); return (error); } /* * Copy it to the user's space */ error = uiomove((char *)bp->b_data + on, n, uio); brelse(bp); } while (error == 0 && uio->uio_resid > 0 && n != 0); break; default: panic("devfs_read type"); } if (!(vp->v_mount->mnt_flag & MNT_NOATIME)) dnp->flags |= IN_ACCESS; return (error); } /* * Vnode op for write struct vop_write_args { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ /* ARGSUSED */ static int devfs_write( struct vop_write_args *ap) { register struct vnode *vp = ap->a_vp; register struct uio *uio = ap->a_uio; struct proc *p = uio->uio_procp; struct buf *bp; daddr_t bn; int bsize, blkmask; struct partinfo dpart; register int n, on; int error = 0; dn_p dnp; if ((error = devfs_vntodn(vp,&dnp)) != 0) return error; #ifdef DIAGNOSTIC if (uio->uio_rw != UIO_WRITE) panic("devfs_write mode"); if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc) panic("devfs_write proc"); #endif switch (vp->v_type) { case VCHR: VOP_UNLOCK(vp, 0, p); error = (*dnp->by.Cdev.cdevsw->d_write) (dnp->by.Cdev.dev, uio, ap->a_ioflag); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); return (error); case VBLK: if (uio->uio_resid == 0) return (0); if (uio->uio_offset < 0) return (EINVAL); bsize = BLKDEV_IOSIZE; if ((dnp->by.Bdev.bdevsw->d_ioctl != NULL) && ((*dnp->by.Bdev.bdevsw->d_ioctl)(dnp->by.Bdev.dev, DIOCGPART, (caddr_t)&dpart, FREAD, p) == 0) && (dpart.part->p_fstype == FS_BSDFFS) && (dpart.part->p_frag != 0) && (dpart.part->p_fsize != 0)) { bsize = dpart.part->p_frag * dpart.part->p_fsize; } blkmask = btodb(bsize) - 1; do { bn = btodb(uio->uio_offset) & ~blkmask; on = uio->uio_offset % bsize; n = min((unsigned)(bsize - on), uio->uio_resid); if (n == bsize) bp = getblk(vp, bn, bsize, 0, 0); else error = bread(vp, bn, bsize, NOCRED, &bp); if (error) { brelse(bp); return (error); } n = min(n, bsize - bp->b_resid); error = uiomove((char *)bp->b_data + on, n, uio); if (n + on == bsize) bawrite(bp); else bdwrite(bp); } while (error == 0 && uio->uio_resid > 0 && n != 0); return (error); default: panic("devfs_write type"); } /* NOTREACHED */ } /* * Device ioctl operation. 
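 * Ioctls are passed straight through to the driver: d_ioctl from the
 * cdevsw for VCHR nodes and from the bdevsw for VBLK nodes.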
struct vop_ioctl_args { struct vnode *a_vp; int a_command; caddr_t a_data; int a_fflag; struct ucred *a_cred; struct proc *a_p; } */ /* ARGSUSED */ static int devfs_ioctl(struct vop_ioctl_args *ap) { dn_p dnp; int error; if ((error = devfs_vntodn(ap->a_vp,&dnp)) != 0) return error; switch (ap->a_vp->v_type) { case VCHR: return ((*dnp->by.Cdev.cdevsw->d_ioctl)(dnp->by.Cdev.dev, ap->a_command, ap->a_data, ap->a_fflag, ap->a_p)); case VBLK: return ((*dnp->by.Bdev.bdevsw->d_ioctl)(dnp->by.Bdev.dev, ap->a_command, ap->a_data, ap->a_fflag, ap->a_p)); default: panic("devfs_ioctl"); /* NOTREACHED */ } } /* struct vop_poll_args { struct vnode *a_vp; int a_events; struct ucred *a_cred; struct proc *a_p; } *ap; */ /* ARGSUSED */ static int devfs_poll(struct vop_poll_args *ap) { dn_p dnp; int error; if ((error = devfs_vntodn(ap->a_vp,&dnp)) != 0) return error; switch (ap->a_vp->v_type) { case VCHR: return (*dnp->by.Cdev.cdevsw->d_poll)(dnp->by.Cdev.dev, ap->a_events, ap->a_p); default: return (vop_defaultop((struct vop_generic_args *)ap)); } } /* * Synch buffers associated with a block device struct vop_fsync_args { struct vnode *a_vp; struct ucred *a_cred; int a_waitfor; struct proc *a_p; } */ /* ARGSUSED */ static int devfs_fsync(struct vop_fsync_args *ap) { register struct vnode *vp = ap->a_vp; register struct buf *bp; struct buf *nbp; int s; dn_p dnp; int error; if ((error = devfs_vntodn(vp,&dnp)) != 0) return error; if (vp->v_type == VCHR) return (0); /* * Flush all dirty buffers associated with a block device. */ loop: s = splbio(); for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { nbp = TAILQ_NEXT(bp, b_vnbufs); if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) continue; if ((bp->b_flags & B_DELWRI) == 0) panic("devfs_fsync: not dirty"); if ((vp->v_flag & VOBJBUF) && (bp->b_flags & B_CLUSTEROK)) { BUF_UNLOCK(bp); vfs_bio_awrite(bp); splx(s); } else { bremfree(bp); splx(s); bawrite(bp); } goto loop; } if (ap->a_waitfor == MNT_WAIT) { while (vp->v_numoutput) { vp->v_flag |= VBWAIT; (void) tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "spfsyn", 0); } #ifdef DIAGNOSTIC if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) { vprint("devfs_fsync: dirty", vp); splx(s); goto loop; } #endif } splx(s); return (0); } /* * * struct vop_inactive_args { * struct vnode *a_vp; * struct proc *a_p; * } */ static int devfs_inactive(struct vop_inactive_args *ap) { VOP_UNLOCK(ap->a_vp, 0, ap->a_p); return (0); } /* * Just call the device strategy routine struct vop_strategy_args { struct vnode *a_vp; struct buf *a_bp; } */ static int devfs_strategy(struct vop_strategy_args *ap) { struct buf *bp = ap->a_bp; dn_p dnp; int error; if ((ap->a_vp->v_type != VCHR) && (ap->a_vp->v_type != VBLK)) panic ("devfs_strat:badvnode type"); if ((error = devfs_vntodn(ap->a_vp,&dnp)) != 0) return error; if (((bp->b_flags & B_READ) == 0) && (LIST_FIRST(&bp->b_dep)) != NULL && bioops.io_start) (*bioops.io_start)(bp); switch (ap->a_vp->v_type) { case VCHR: (*dnp->by.Cdev.cdevsw->d_strategy)(bp); break; case VBLK: (*dnp->by.Bdev.bdevsw->d_strategy)(bp); break; default: /* XXX set error code? */ break; } return (0); } /* * This is a noop, simply returning what one has been given. 
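 * For a device vnode the logical and physical block numbers are the
 * same, so the requested block is handed back unchanged and the
 * read-ahead/read-behind run lengths are reported as zero.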
struct vop_bmap_args { struct vnode *a_vp; daddr_t a_bn; struct vnode **a_vpp; daddr_t *a_bnp; int *a_runp; int *a_runb; } */ static int devfs_bmap(struct vop_bmap_args *ap) { if (ap->a_vpp != NULL) *ap->a_vpp = ap->a_vp; if (ap->a_bnp != NULL) *ap->a_bnp = ap->a_bn; if (ap->a_runp != NULL) *ap->a_runp = 0; if (ap->a_runb != NULL) *ap->a_runb = 0; return (0); } /* * Device close routine struct vop_close_args { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct proc *a_p; } */ /* ARGSUSED */ static int devfs_close(struct vop_close_args *ap) { register struct vnode *vp = ap->a_vp; int error; dn_p dnp; if ((error = devfs_vntodn(vp,&dnp)) != 0) return error; switch (vp->v_type) { case VCHR: /* * Hack: a tty device that is a controlling terminal * has a reference from the session structure. * We cannot easily tell that a character device is * a controlling terminal, unless it is the closing * process' controlling terminal. In that case, * if the reference count is 2 (this last descriptor * plus the session), release the reference from the session. */ if (vcount(vp) == 2 && ap->a_p && (vp->v_flag & VXLOCK) == 0 && vp == ap->a_p->p_session->s_ttyvp) { vrele(vp); ap->a_p->p_session->s_ttyvp = NULL; } /* * If the vnode is locked, then we are in the midst * of forcably closing the device, otherwise we only * close on last reference. */ if (vcount(vp) > 1 && (vp->v_flag & VXLOCK) == 0) return (0); return ((*dnp->by.Cdev.cdevsw->d_close)(dnp->by.Cdev.dev, ap->a_fflag, S_IFCHR, ap->a_p)); /* NOT REACHED */ case VBLK: /* * On last close of a block device (that isn't mounted) * we must invalidate any in core blocks, so that * we can, for instance, change floppy disks. */ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, ap->a_p); error = vinvalbuf(vp, V_SAVE, ap->a_cred, ap->a_p, 0, 0); VOP_UNLOCK(vp, 0, ap->a_p); if (error) return (error); /* * We do not want to really close the device if it * is still in use unless we are trying to close it * forcibly. Since every use (buffer, vnode, swap, cmap) * holds a reference to the vnode, and because we mark * any other vnodes that alias this device, when the * sum of the reference counts on all the aliased * vnodes descends to one, we are on last close. */ if ((vcount(vp) > 1) && (vp->v_flag & VXLOCK) == 0) return (0); return ((*dnp->by.Bdev.bdevsw->d_close)(dnp->by.Bdev.dev, ap->a_fflag, S_IFBLK, ap->a_p)); /* NOT REACHED */ default: panic("devfs_close: not special"); } } /* * Print out the contents of a special device vnode. struct vop_print_args { struct vnode *a_vp; } */ /* * Special device advisory byte-level locks. struct vop_advlock_args { struct vnode *a_vp; caddr_t a_id; int a_op; struct flock *a_fl; int a_flags; } */ /* ARGSUSED */ static int devfs_advlock(struct vop_advlock_args *ap) { return (ap->a_flags & F_FLOCK ? EOPNOTSUPP : EINVAL); } /* * Special device bad operation */ static int devfs_badop(void) { panic("devfs_badop called"); /* NOTREACHED */ } static void devfs_getpages_iodone(struct buf *bp) { bp->b_flags |= B_DONE; wakeup(bp); } static int devfs_getpages(struct vop_getpages_args *ap) { vm_offset_t kva; int error; int i, pcount, size, s; daddr_t blkno; struct buf *bp; vm_page_t m; vm_ooffset_t offset; int toff, nextoff, nread; struct vnode *vp = ap->a_vp; int blksiz; int gotreqpage; error = 0; pcount = round_page(ap->a_count) / PAGE_SIZE; /* * Calculate the offset of the transfer. */ offset = IDX_TO_OFF(ap->a_m[0]->pindex) + ap->a_offset; /* XXX sanity check before we go into details. */ /* XXX limits should be defined elsewhere. 
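 * With DADDR_T_BIT hard-wired to 32 below and the usual DEV_BSHIFT of 9
 * (512-byte DEV_BSIZE), OFFSET_MAX works out to 2^41 - 1 bytes, i.e. a
 * 2TB cap on the transfer offset.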
*/ #define DADDR_T_BIT 32 #define OFFSET_MAX ((1LL << (DADDR_T_BIT + DEV_BSHIFT)) - 1) if (offset < 0 || offset > OFFSET_MAX) { /* XXX still no %q in kernel. */ printf("devfs_getpages: preposterous offset 0x%x%08x\n", (u_int)((u_quad_t)offset >> 32), (u_int)(offset & 0xffffffff)); return (VM_PAGER_ERROR); } blkno = btodb(offset); /* * Round up physical size for real devices, use the * fundamental blocksize of the fs if possible. */ if (vp && vp->v_mount) { if (vp->v_type != VBLK) { vprint("Non VBLK", vp); } blksiz = vp->v_mount->mnt_stat.f_bsize; if (blksiz < DEV_BSIZE) { blksiz = DEV_BSIZE; } } else blksiz = DEV_BSIZE; size = (ap->a_count + blksiz - 1) & ~(blksiz - 1); bp = getpbuf(NULL); kva = (vm_offset_t)bp->b_data; /* * Map the pages to be read into the kva. */ pmap_qenter(kva, ap->a_m, pcount); /* Build a minimal buffer header. */ bp->b_flags = B_READ | B_CALL; bp->b_iodone = devfs_getpages_iodone; /* B_PHYS is not set, but it is nice to fill this in. */ bp->b_rcred = bp->b_wcred = curproc->p_ucred; if (bp->b_rcred != NOCRED) crhold(bp->b_rcred); if (bp->b_wcred != NOCRED) crhold(bp->b_wcred); bp->b_blkno = blkno; bp->b_lblkno = blkno; pbgetvp(ap->a_vp, bp); bp->b_bcount = size; bp->b_bufsize = size; bp->b_resid = 0; cnt.v_vnodein++; cnt.v_vnodepgsin += pcount; /* Do the input. */ VOP_STRATEGY(bp->b_vp, bp); s = splbio(); /* We definitely need to be at splbio here. */ while ((bp->b_flags & B_DONE) == 0) tsleep(bp, PVM, "spread", 0); splx(s); if ((bp->b_flags & B_ERROR) != 0) { if (bp->b_error) error = bp->b_error; else error = EIO; } nread = size - bp->b_resid; if (nread < ap->a_count) { bzero((caddr_t)kva + nread, ap->a_count - nread); } pmap_qremove(kva, pcount); gotreqpage = 0; for (i = 0, toff = 0; i < pcount; i++, toff = nextoff) { nextoff = toff + PAGE_SIZE; m = ap->a_m[i]; m->flags &= ~PG_ZERO; if (nextoff <= nread) { m->valid = VM_PAGE_BITS_ALL; m->dirty = 0; } else if (toff < nread) { int nvalid = ((nread + DEV_BSIZE - 1) - toff) & ~(DEV_BSIZE - 1); vm_page_set_validclean(m, 0, nvalid); } else { m->valid = 0; m->dirty = 0; } if (i != ap->a_reqpage) { /* * Just in case someone was asking for this page we * now tell them that it is ok to use. */ if (!error || (m->valid == VM_PAGE_BITS_ALL)) { if (m->valid) { if (m->flags & PG_WANTED) { vm_page_activate(m); } else { vm_page_deactivate(m); } vm_page_wakeup(m); } else { vm_page_free(m); } } else { vm_page_free(m); } } else if (m->valid) { gotreqpage = 1; } } if (!gotreqpage) { m = ap->a_m[ap->a_reqpage]; #ifndef MAX_PERF printf("devfs_getpages: I/O read failure: (error code=%d)\n", error); printf(" size: %d, resid: %ld, a_count: %d, valid: 0x%x\n", size, bp->b_resid, ap->a_count, m->valid); printf(" nread: %d, reqpage: %d, pindex: %d, pcount: %d\n", nread, ap->a_reqpage, m->pindex, pcount); #endif /* * Free the buffer header back to the swap buffer pool. */ relpbuf(bp, NULL); return VM_PAGER_ERROR; } /* * Free the buffer header back to the swap buffer pool. 
*/ relpbuf(bp, NULL); return VM_PAGER_OK; } /* These are the operations used by directories etc in a devfs */ vop_t **devfs_vnodeop_p; static struct vnodeopv_entry_desc devfs_vnodeop_entries[] = { { &vop_default_desc, (vop_t *) vop_defaultop }, { &vop_access_desc, (vop_t *) devfs_access }, { &vop_bmap_desc, (vop_t *) devfs_badop }, { &vop_getattr_desc, (vop_t *) devfs_getattr }, { &vop_inactive_desc, (vop_t *) devfs_inactive }, { &vop_link_desc, (vop_t *) devfs_link }, { &vop_lookup_desc, (vop_t *) devfs_lookup }, { &vop_pathconf_desc, (vop_t *) vop_stdpathconf }, { &vop_print_desc, (vop_t *) devfs_print }, { &vop_read_desc, (vop_t *) devfs_xread }, { &vop_readdir_desc, (vop_t *) devfs_readdir }, { &vop_readlink_desc, (vop_t *) devfs_readlink }, { &vop_reclaim_desc, (vop_t *) devfs_reclaim }, { &vop_remove_desc, (vop_t *) devfs_remove }, { &vop_rename_desc, (vop_t *) devfs_rename }, { &vop_setattr_desc, (vop_t *) devfs_setattr }, { &vop_symlink_desc, (vop_t *) devfs_symlink }, { &vop_write_desc, (vop_t *) devfs_xwrite }, { NULL, NULL } }; static struct vnodeopv_desc devfs_vnodeop_opv_desc = { &devfs_vnodeop_p, devfs_vnodeop_entries }; VNODEOP_SET(devfs_vnodeop_opv_desc); vop_t **devfs_spec_vnodeop_p; static struct vnodeopv_entry_desc devfs_spec_vnodeop_entries[] = { { &vop_default_desc, (vop_t *) vop_defaultop }, { &vop_access_desc, (vop_t *) devfs_access }, { &vop_advlock_desc, (vop_t *) devfs_advlock }, { &vop_bmap_desc, (vop_t *) devfs_bmap }, { &vop_close_desc, (vop_t *) devfs_close }, { &vop_create_desc, (vop_t *) devfs_badop }, { &vop_fsync_desc, (vop_t *) devfs_fsync }, { &vop_getattr_desc, (vop_t *) devfs_getattr }, { &vop_getpages_desc, (vop_t *) devfs_getpages }, { &vop_inactive_desc, (vop_t *) devfs_inactive }, { &vop_ioctl_desc, (vop_t *) devfs_ioctl }, { &vop_lease_desc, (vop_t *) vop_null }, { &vop_link_desc, (vop_t *) devfs_badop }, { &vop_lookup_desc, (vop_t *) devfs_lookup }, { &vop_mkdir_desc, (vop_t *) devfs_badop }, { &vop_mknod_desc, (vop_t *) devfs_badop }, { &vop_open_desc, (vop_t *) devfs_open }, { &vop_pathconf_desc, (vop_t *) vop_stdpathconf }, { &vop_poll_desc, (vop_t *) devfs_poll }, { &vop_print_desc, (vop_t *) devfs_print }, { &vop_read_desc, (vop_t *) devfs_read }, { &vop_readdir_desc, (vop_t *) devfs_badop }, { &vop_readlink_desc, (vop_t *) devfs_badop }, { &vop_reallocblks_desc, (vop_t *) devfs_badop }, { &vop_reclaim_desc, (vop_t *) devfs_reclaim }, { &vop_remove_desc, (vop_t *) devfs_badop }, { &vop_rename_desc, (vop_t *) devfs_badop }, { &vop_rmdir_desc, (vop_t *) devfs_badop }, { &vop_setattr_desc, (vop_t *) devfs_setattr }, { &vop_strategy_desc, (vop_t *) devfs_strategy }, { &vop_symlink_desc, (vop_t *) devfs_symlink }, { &vop_write_desc, (vop_t *) devfs_write }, { NULL, NULL } }; static struct vnodeopv_desc devfs_spec_vnodeop_opv_desc = { &devfs_spec_vnodeop_p, devfs_spec_vnodeop_entries }; VNODEOP_SET(devfs_spec_vnodeop_opv_desc); Index: head/sys/miscfs/specfs/specdev.h =================================================================== --- head/sys/miscfs/specfs/specdev.h (revision 49534) +++ head/sys/miscfs/specfs/specdev.h (nonexistent) @@ -1,76 +0,0 @@ -/* - * Copyright (c) 1990, 1993 - * The Regents of the University of California. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. 
- * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)specdev.h 8.6 (Berkeley) 5/21/95 - * $Id: specdev.h,v 1.17 1999/05/11 19:54:39 phk Exp $ - */ - -/* - * This structure defines the information maintained about - * special devices. It is allocated in checkalias and freed - * in vgone. - */ -struct specinfo { - struct mount *si_mountpoint; - int si_bsize_phys; /* minimum physical block size */ - int si_bsize_best; /* optimal block size / VBLK */ - int si_bsize_max; /* maximum block size */ - - udev_t si_udev; - SLIST_ENTRY(specinfo) si_hash; - struct vnode *si_hlist; -}; -/* - * Exported shorthand - */ -#define v_hashchain v_specinfo->si_hlist -#define v_specmountpoint v_specinfo->si_mountpoint - -/* - * Special device management - */ -#define SPECHSZ 64 -#define SPECHASH(rdev) (((unsigned)(minor(rdev)))%SPECHSZ) - - -/* - * Prototypes for special file operations on vnodes. - */ -extern vop_t **spec_vnodeop_p; -struct nameidata; -struct componentname; -struct ucred; -struct flock; -struct buf; -struct uio; - -int spec_vnoperate __P((struct vop_generic_args *)); Property changes on: head/sys/miscfs/specfs/specdev.h ___________________________________________________________________ Deleted: svn:keywords ## -1 +0,0 ## -FreeBSD=%H \ No newline at end of property Index: head/sys/miscfs/specfs/spec_vnops.c =================================================================== --- head/sys/miscfs/specfs/spec_vnops.c (revision 49534) +++ head/sys/miscfs/specfs/spec_vnops.c (revision 49535) @@ -1,963 +1,961 @@ /* * Copyright (c) 1989, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)spec_vnops.c 8.14 (Berkeley) 5/21/95 - * $Id: spec_vnops.c,v 1.89 1999/06/26 02:46:21 mckusick Exp $ + * $Id: spec_vnops.c,v 1.90 1999/07/20 09:47:45 phk Exp $ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include - -#include static int spec_advlock __P((struct vop_advlock_args *)); static int spec_badop __P((void)); static int spec_bmap __P((struct vop_bmap_args *)); static int spec_close __P((struct vop_close_args *)); static int spec_freeblks __P((struct vop_freeblks_args *)); static int spec_fsync __P((struct vop_fsync_args *)); static int spec_getattr __P((struct vop_getattr_args *)); static int spec_getpages __P((struct vop_getpages_args *)); static int spec_inactive __P((struct vop_inactive_args *)); static int spec_ioctl __P((struct vop_ioctl_args *)); static int spec_lookup __P((struct vop_lookup_args *)); static int spec_open __P((struct vop_open_args *)); static int spec_poll __P((struct vop_poll_args *)); static int spec_print __P((struct vop_print_args *)); static int spec_read __P((struct vop_read_args *)); static int spec_strategy __P((struct vop_strategy_args *)); static int spec_write __P((struct vop_write_args *)); vop_t **spec_vnodeop_p; static struct vnodeopv_entry_desc spec_vnodeop_entries[] = { { &vop_default_desc, (vop_t *) vop_defaultop }, { &vop_access_desc, (vop_t *) vop_ebadf }, { &vop_advlock_desc, (vop_t *) spec_advlock }, { &vop_bmap_desc, (vop_t *) spec_bmap }, { &vop_close_desc, (vop_t *) spec_close }, { &vop_create_desc, (vop_t *) spec_badop }, { &vop_freeblks_desc, (vop_t *) spec_freeblks }, { &vop_fsync_desc, (vop_t *) spec_fsync }, { &vop_getattr_desc, (vop_t *) spec_getattr }, { &vop_getpages_desc, (vop_t *) spec_getpages }, { &vop_inactive_desc, (vop_t *) spec_inactive }, { &vop_ioctl_desc, (vop_t *) spec_ioctl }, { &vop_lease_desc, (vop_t *) vop_null }, { &vop_link_desc, (vop_t *) spec_badop }, { &vop_lookup_desc, (vop_t *) spec_lookup }, { &vop_mkdir_desc, (vop_t *) spec_badop }, { &vop_mknod_desc, (vop_t *) spec_badop }, { &vop_open_desc, (vop_t *) spec_open }, { &vop_pathconf_desc, (vop_t *) vop_stdpathconf }, { &vop_poll_desc, (vop_t *) spec_poll }, { &vop_print_desc, (vop_t *) spec_print }, { &vop_read_desc, (vop_t *) spec_read }, { &vop_readdir_desc, (vop_t *) 
spec_badop }, { &vop_readlink_desc, (vop_t *) spec_badop }, { &vop_reallocblks_desc, (vop_t *) spec_badop }, { &vop_reclaim_desc, (vop_t *) vop_null }, { &vop_remove_desc, (vop_t *) spec_badop }, { &vop_rename_desc, (vop_t *) spec_badop }, { &vop_rmdir_desc, (vop_t *) spec_badop }, { &vop_setattr_desc, (vop_t *) vop_ebadf }, { &vop_strategy_desc, (vop_t *) spec_strategy }, { &vop_symlink_desc, (vop_t *) spec_badop }, { &vop_write_desc, (vop_t *) spec_write }, { NULL, NULL } }; static struct vnodeopv_desc spec_vnodeop_opv_desc = { &spec_vnodeop_p, spec_vnodeop_entries }; VNODEOP_SET(spec_vnodeop_opv_desc); int spec_vnoperate(ap) struct vop_generic_args /* { struct vnodeop_desc *a_desc; } */ *ap; { return (VOCALL(spec_vnodeop_p, ap->a_desc->vdesc_offset, ap)); } static void spec_getpages_iodone __P((struct buf *bp)); /* * Trivial lookup routine that always fails. */ static int spec_lookup(ap) struct vop_lookup_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ *ap; { *ap->a_vpp = NULL; return (ENOTDIR); } /* * Open a special file. */ /* ARGSUSED */ static int spec_open(ap) struct vop_open_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct proc *a_p; } */ *ap; { struct proc *p = ap->a_p; struct vnode *bvp, *vp = ap->a_vp; dev_t bdev, dev = vp->v_rdev; int error; struct cdevsw *dsw; /* * Don't allow open if fs is mounted -nodev. */ if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_NODEV)) return (ENXIO); switch (vp->v_type) { case VCHR: dsw = devsw(dev); if ( (dsw == NULL) || (dsw->d_open == NULL)) return ENXIO; if (ap->a_cred != FSCRED && (ap->a_mode & FWRITE)) { /* * When running in very secure mode, do not allow * opens for writing of any disk character devices. */ if (securelevel >= 2 && dsw->d_bmaj != -1 && (dsw->d_flags & D_TYPEMASK) == D_DISK) return (EPERM); /* * When running in secure mode, do not allow opens * for writing of /dev/mem, /dev/kmem, or character * devices whose corresponding block devices are * currently mounted. */ if (securelevel >= 1) { if ((bdev = chrtoblk(dev)) != NODEV && vfinddev(bdev, VBLK, &bvp) && bvp->v_usecount > 0 && (error = vfs_mountedon(bvp))) return (error); if (iskmemdev(dev)) return (EPERM); } } if ((dsw->d_flags & D_TYPEMASK) == D_TTY) vp->v_flag |= VISTTY; VOP_UNLOCK(vp, 0, p); error = (*dsw->d_open)(dev, ap->a_mode, S_IFCHR, p); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); return (error); /* NOT REACHED */ case VBLK: dsw = bdevsw(dev); if ( (dsw == NULL) || (dsw->d_open == NULL)) return ENXIO; /* * When running in very secure mode, do not allow * opens for writing of any disk block devices. */ if (securelevel >= 2 && ap->a_cred != FSCRED && (ap->a_mode & FWRITE) && (dsw->d_flags & D_TYPEMASK) == D_DISK) return (EPERM); /* * Do not allow opens of block devices that are * currently mounted. 
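 * vfs_mountedon() reports EBUSY when this device, or one of its
 * aliased vnodes, is the backing store of a mounted filesystem.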
*/ error = vfs_mountedon(vp); if (error) return (error); return ((*dsw->d_open)(dev, ap->a_mode, S_IFBLK, p)); /* NOT REACHED */ default: break; } return (0); } /* * Vnode op for read */ /* ARGSUSED */ static int spec_read(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct uio *uio = ap->a_uio; struct proc *p = uio->uio_procp; struct buf *bp; daddr_t bn, nextbn; long bsize, bscale; struct partinfo dpart; int n, on; d_ioctl_t *ioctl; int error = 0; dev_t dev; #ifdef DIAGNOSTIC if (uio->uio_rw != UIO_READ) panic("spec_read mode"); if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc) panic("spec_read proc"); #endif if (uio->uio_resid == 0) return (0); switch (vp->v_type) { case VCHR: VOP_UNLOCK(vp, 0, p); error = (*devsw(vp->v_rdev)->d_read) (vp->v_rdev, uio, ap->a_ioflag); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); return (error); case VBLK: if (uio->uio_offset < 0) return (EINVAL); dev = vp->v_rdev; /* * Calculate block size for block device. The block size must * be larger then the physical minimum. */ bsize = vp->v_specinfo->si_bsize_best; if ((ioctl = bdevsw(dev)->d_ioctl) != NULL && (*ioctl)(dev, DIOCGPART, (caddr_t)&dpart, FREAD, p) == 0 && dpart.part->p_fstype == FS_BSDFFS && dpart.part->p_frag != 0 && dpart.part->p_fsize != 0) bsize = dpart.part->p_frag * dpart.part->p_fsize; bscale = btodb(bsize); do { bn = btodb(uio->uio_offset) & ~(bscale - 1); on = uio->uio_offset % bsize; n = min((unsigned)(bsize - on), uio->uio_resid); if (vp->v_lastr + bscale == bn) { nextbn = bn + bscale; error = breadn(vp, bn, (int)bsize, &nextbn, (int *)&bsize, 1, NOCRED, &bp); } else error = bread(vp, bn, (int)bsize, NOCRED, &bp); vp->v_lastr = bn; n = min(n, bsize - bp->b_resid); if (error) { brelse(bp); return (error); } error = uiomove((char *)bp->b_data + on, n, uio); brelse(bp); } while (error == 0 && uio->uio_resid > 0 && n != 0); return (error); default: panic("spec_read type"); } /* NOTREACHED */ } /* * Vnode op for write */ /* ARGSUSED */ static int spec_write(ap) struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct uio *uio = ap->a_uio; struct proc *p = uio->uio_procp; struct buf *bp; daddr_t bn; int bsize, blkmask; struct partinfo dpart; register int n, on; int error = 0; #ifdef DIAGNOSTIC if (uio->uio_rw != UIO_WRITE) panic("spec_write mode"); if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc) panic("spec_write proc"); #endif switch (vp->v_type) { case VCHR: VOP_UNLOCK(vp, 0, p); error = (*devsw(vp->v_rdev)->d_write) (vp->v_rdev, uio, ap->a_ioflag); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); return (error); case VBLK: if (uio->uio_resid == 0) return (0); if (uio->uio_offset < 0) return (EINVAL); /* * Calculate block size for block device. The block size must * be larger then the physical minimum. 
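 * si_bsize_best is the default; a BSD FFS partition overrides it with
 * the FFS block size (p_frag * p_fsize).  A write that covers a whole
 * block can use getblk() without reading, while a partial block is
 * first read with bread() and modified in place (read-modify-write).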
*/ bsize = vp->v_specinfo->si_bsize_best; if ((*bdevsw(vp->v_rdev)->d_ioctl)(vp->v_rdev, DIOCGPART, (caddr_t)&dpart, FREAD, p) == 0) { if (dpart.part->p_fstype == FS_BSDFFS && dpart.part->p_frag != 0 && dpart.part->p_fsize != 0) bsize = dpart.part->p_frag * dpart.part->p_fsize; } blkmask = btodb(bsize) - 1; do { bn = btodb(uio->uio_offset) & ~blkmask; on = uio->uio_offset % bsize; n = min((unsigned)(bsize - on), uio->uio_resid); if (n == bsize) bp = getblk(vp, bn, bsize, 0, 0); else error = bread(vp, bn, bsize, NOCRED, &bp); if (error) { brelse(bp); return (error); } n = min(n, bsize - bp->b_resid); error = uiomove((char *)bp->b_data + on, n, uio); if (n + on == bsize) bawrite(bp); else bdwrite(bp); } while (error == 0 && uio->uio_resid > 0 && n != 0); return (error); default: panic("spec_write type"); } /* NOTREACHED */ } /* * Device ioctl operation. */ /* ARGSUSED */ static int spec_ioctl(ap) struct vop_ioctl_args /* { struct vnode *a_vp; int a_command; caddr_t a_data; int a_fflag; struct ucred *a_cred; struct proc *a_p; } */ *ap; { dev_t dev = ap->a_vp->v_rdev; switch (ap->a_vp->v_type) { case VCHR: return ((*devsw(dev)->d_ioctl)(dev, ap->a_command, ap->a_data, ap->a_fflag, ap->a_p)); case VBLK: return ((*bdevsw(dev)->d_ioctl)(dev, ap->a_command, ap->a_data, ap->a_fflag, ap->a_p)); default: panic("spec_ioctl"); /* NOTREACHED */ } } /* ARGSUSED */ static int spec_poll(ap) struct vop_poll_args /* { struct vnode *a_vp; int a_events; struct ucred *a_cred; struct proc *a_p; } */ *ap; { register dev_t dev; switch (ap->a_vp->v_type) { case VCHR: dev = ap->a_vp->v_rdev; return (*devsw(dev)->d_poll)(dev, ap->a_events, ap->a_p); default: return (vop_defaultop((struct vop_generic_args *)ap)); } } /* * Synch buffers associated with a block device */ /* ARGSUSED */ static int spec_fsync(ap) struct vop_fsync_args /* { struct vnode *a_vp; struct ucred *a_cred; int a_waitfor; struct proc *a_p; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct buf *bp; struct buf *nbp; int s; if (vp->v_type == VCHR) return (0); /* * Flush all dirty buffers associated with a block device. 
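 * The scan restarts from the head of the dirty list after every write,
 * because bawrite()/vfs_bio_awrite() are called after splx() and the
 * list may change underneath us.  With MNT_WAIT we also wait for
 * v_numoutput to drain before returning.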
*/ loop: s = splbio(); for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { nbp = TAILQ_NEXT(bp, b_vnbufs); if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) continue; if ((bp->b_flags & B_DELWRI) == 0) panic("spec_fsync: not dirty"); if ((vp->v_flag & VOBJBUF) && (bp->b_flags & B_CLUSTEROK)) { BUF_UNLOCK(bp); vfs_bio_awrite(bp); splx(s); } else { bremfree(bp); splx(s); bawrite(bp); } goto loop; } if (ap->a_waitfor == MNT_WAIT) { while (vp->v_numoutput) { vp->v_flag |= VBWAIT; (void) tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "spfsyn", 0); } #ifdef DIAGNOSTIC if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) { vprint("spec_fsync: dirty", vp); splx(s); goto loop; } #endif } splx(s); return (0); } static int spec_inactive(ap) struct vop_inactive_args /* { struct vnode *a_vp; struct proc *a_p; } */ *ap; { VOP_UNLOCK(ap->a_vp, 0, ap->a_p); return (0); } /* * Just call the device strategy routine */ static int spec_strategy(ap) struct vop_strategy_args /* { struct buf *a_bp; } */ *ap; { struct buf *bp; bp = ap->a_bp; if (((bp->b_flags & B_READ) == 0) && (LIST_FIRST(&bp->b_dep)) != NULL && bioops.io_start) (*bioops.io_start)(bp); (*bdevsw(bp->b_dev)->d_strategy)(bp); return (0); } static int spec_freeblks(ap) struct vop_freeblks_args /* { struct vnode *a_vp; daddr_t a_addr; daddr_t a_length; } */ *ap; { struct cdevsw *bsw; struct buf *bp; bsw = bdevsw(ap->a_vp->v_rdev); if ((bsw->d_flags & D_CANFREE) == 0) return (0); bp = geteblk(ap->a_length); bp->b_flags |= B_FREEBUF; bp->b_dev = ap->a_vp->v_rdev; bp->b_blkno = ap->a_addr; bp->b_offset = dbtob(ap->a_addr); bp->b_bcount = ap->a_length; (*bsw->d_strategy)(bp); return (0); } /* * This is a noop, simply returning what one has been given. */ static int spec_bmap(ap) struct vop_bmap_args /* { struct vnode *a_vp; daddr_t a_bn; struct vnode **a_vpp; daddr_t *a_bnp; int *a_runp; int *a_runb; } */ *ap; { if (ap->a_vpp != NULL) *ap->a_vpp = ap->a_vp; if (ap->a_bnp != NULL) *ap->a_bnp = ap->a_bn; if (ap->a_runp != NULL) *ap->a_runp = 0; if (ap->a_runb != NULL) *ap->a_runb = 0; return (0); } /* * Device close routine */ /* ARGSUSED */ static int spec_close(ap) struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct proc *a_p; } */ *ap; { register struct vnode *vp = ap->a_vp; dev_t dev = vp->v_rdev; d_close_t *devclose; int mode, error; switch (vp->v_type) { case VCHR: /* * Hack: a tty device that is a controlling terminal * has a reference from the session structure. * We cannot easily tell that a character device is * a controlling terminal, unless it is the closing * process' controlling terminal. In that case, * if the reference count is 2 (this last descriptor * plus the session), release the reference from the session. */ if (vcount(vp) == 2 && ap->a_p && (vp->v_flag & VXLOCK) == 0 && vp == ap->a_p->p_session->s_ttyvp) { vrele(vp); ap->a_p->p_session->s_ttyvp = NULL; } /* * If the vnode is locked, then we are in the midst * of forcably closing the device, otherwise we only * close on last reference. */ if (vcount(vp) > 1 && (vp->v_flag & VXLOCK) == 0) return (0); devclose = devsw(dev)->d_close; mode = S_IFCHR; break; case VBLK: /* * On last close of a block device (that isn't mounted) * we must invalidate any in core blocks, so that * we can, for instance, change floppy disks. 
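 * vinvalbuf() is called with V_SAVE, so any dirty buffers are written
 * back before the cached blocks are thrown away.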
*/ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, ap->a_p); error = vinvalbuf(vp, V_SAVE, ap->a_cred, ap->a_p, 0, 0); VOP_UNLOCK(vp, 0, ap->a_p); if (error) return (error); /* * We do not want to really close the device if it * is still in use unless we are trying to close it * forcibly. Since every use (buffer, vnode, swap, cmap) * holds a reference to the vnode, and because we mark * any other vnodes that alias this device, when the * sum of the reference counts on all the aliased * vnodes descends to one, we are on last close. */ if ((vcount(vp) > 1) && (vp->v_flag & VXLOCK) == 0) return (0); devclose = bdevsw(dev)->d_close; mode = S_IFBLK; break; default: panic("spec_close: not special"); } return ((*devclose)(dev, ap->a_fflag, mode, ap->a_p)); } /* * Print out the contents of a special device vnode. */ static int spec_print(ap) struct vop_print_args /* { struct vnode *a_vp; } */ *ap; { printf("tag VT_NON, dev %d, %d\n", major(ap->a_vp->v_rdev), minor(ap->a_vp->v_rdev)); return (0); } /* * Special device advisory byte-level locks. */ /* ARGSUSED */ static int spec_advlock(ap) struct vop_advlock_args /* { struct vnode *a_vp; caddr_t a_id; int a_op; struct flock *a_fl; int a_flags; } */ *ap; { return (ap->a_flags & F_FLOCK ? EOPNOTSUPP : EINVAL); } /* * Special device bad operation */ static int spec_badop() { panic("spec_badop called"); /* NOTREACHED */ } static void spec_getpages_iodone(bp) struct buf *bp; { bp->b_flags |= B_DONE; wakeup(bp); } static int spec_getpages(ap) struct vop_getpages_args *ap; { vm_offset_t kva; int error; int i, pcount, size, s; daddr_t blkno; struct buf *bp; vm_page_t m; vm_ooffset_t offset; int toff, nextoff, nread; struct vnode *vp = ap->a_vp; int blksiz; int gotreqpage; error = 0; pcount = round_page(ap->a_count) / PAGE_SIZE; /* * Calculate the offset of the transfer and do sanity check. * FreeBSD currently only supports an 8 TB range due to b_blkno * being in DEV_BSIZE ( usually 512 ) byte chunks on call to * VOP_STRATEGY. XXX */ offset = IDX_TO_OFF(ap->a_m[0]->pindex) + ap->a_offset; #define DADDR_T_BIT (sizeof(daddr_t)*8) #define OFFSET_MAX ((1LL << (DADDR_T_BIT + DEV_BSHIFT)) - 1) if (offset < 0 || offset > OFFSET_MAX) { /* XXX still no %q in kernel. */ printf("spec_getpages: preposterous offset 0x%x%08x\n", (u_int)((u_quad_t)offset >> 32), (u_int)(offset & 0xffffffff)); return (VM_PAGER_ERROR); } blkno = btodb(offset); /* * Round up physical size for real devices. We cannot round using * v_mount's block size data because v_mount has nothing to do with * the device. i.e. it's usually '/dev'. We need the physical block * size for the device itself. * * We can't use v_specmountpoint because it only exists when the * block device is mounted. However, we can use v_specinfo. */ if (vp->v_type == VBLK) blksiz = vp->v_specinfo->si_bsize_phys; else blksiz = DEV_BSIZE; size = (ap->a_count + blksiz - 1) & ~(blksiz - 1); bp = getpbuf(NULL); kva = (vm_offset_t)bp->b_data; /* * Map the pages to be read into the kva. */ pmap_qenter(kva, ap->a_m, pcount); /* Build a minimal buffer header. */ bp->b_flags = B_READ | B_CALL; bp->b_iodone = spec_getpages_iodone; /* B_PHYS is not set, but it is nice to fill this in. */ bp->b_rcred = bp->b_wcred = curproc->p_ucred; if (bp->b_rcred != NOCRED) crhold(bp->b_rcred); if (bp->b_wcred != NOCRED) crhold(bp->b_wcred); bp->b_blkno = blkno; bp->b_lblkno = blkno; pbgetvp(ap->a_vp, bp); bp->b_bcount = size; bp->b_bufsize = size; bp->b_resid = 0; cnt.v_vnodein++; cnt.v_vnodepgsin += pcount; /* Do the input. 
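 * B_CALL is set on the pbuf, so spec_getpages_iodone() marks it B_DONE
 * and wakes us up; we sleep at splbio until that happens and then pick
 * up any error from b_error.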
*/ VOP_STRATEGY(bp->b_vp, bp); s = splbio(); /* We definitely need to be at splbio here. */ while ((bp->b_flags & B_DONE) == 0) tsleep(bp, PVM, "spread", 0); splx(s); if ((bp->b_flags & B_ERROR) != 0) { if (bp->b_error) error = bp->b_error; else error = EIO; } nread = size - bp->b_resid; if (nread < ap->a_count) { bzero((caddr_t)kva + nread, ap->a_count - nread); } pmap_qremove(kva, pcount); gotreqpage = 0; for (i = 0, toff = 0; i < pcount; i++, toff = nextoff) { nextoff = toff + PAGE_SIZE; m = ap->a_m[i]; m->flags &= ~PG_ZERO; if (nextoff <= nread) { m->valid = VM_PAGE_BITS_ALL; m->dirty = 0; } else if (toff < nread) { /* * Since this is a VM request, we have to supply the * unaligned offset to allow vm_page_set_validclean() * to zero sub-DEV_BSIZE'd portions of the page. */ vm_page_set_validclean(m, 0, nread - toff); } else { m->valid = 0; m->dirty = 0; } if (i != ap->a_reqpage) { /* * Just in case someone was asking for this page we * now tell them that it is ok to use. */ if (!error || (m->valid == VM_PAGE_BITS_ALL)) { if (m->valid) { if (m->flags & PG_WANTED) { vm_page_activate(m); } else { vm_page_deactivate(m); } vm_page_wakeup(m); } else { vm_page_free(m); } } else { vm_page_free(m); } } else if (m->valid) { gotreqpage = 1; /* * Since this is a VM request, we need to make the * entire page presentable by zeroing invalid sections. */ if (m->valid != VM_PAGE_BITS_ALL) vm_page_zero_invalid(m, FALSE); } } if (!gotreqpage) { m = ap->a_m[ap->a_reqpage]; #ifndef MAX_PERF printf( "spec_getpages: I/O read failure: (error code=%d) bp %p vp %p\n", error, bp, bp->b_vp); printf( " size: %d, resid: %ld, a_count: %d, valid: 0x%x\n", size, bp->b_resid, ap->a_count, m->valid); printf( " nread: %d, reqpage: %d, pindex: %lu, pcount: %d\n", nread, ap->a_reqpage, (u_long)m->pindex, pcount); #endif /* * Free the buffer header back to the swap buffer pool. */ relpbuf(bp, NULL); return VM_PAGER_ERROR; } /* * Free the buffer header back to the swap buffer pool. */ relpbuf(bp, NULL); return VM_PAGER_OK; } /* ARGSUSED */ static int spec_getattr(ap) struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct proc *a_p; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct vattr *vap = ap->a_vap; struct partinfo dpart; bzero(vap, sizeof (*vap)); if (vp->v_type == VBLK) { if (vp->v_specinfo) vap->va_blocksize = vp->v_specmountpoint->mnt_stat.f_iosize; else vap->va_blocksize = BLKDEV_IOSIZE; } else if (vp->v_type == VCHR) { vap->va_blocksize = MAXBSIZE; } if ((*bdevsw(vp->v_rdev)->d_ioctl)(vp->v_rdev, DIOCGPART, (caddr_t)&dpart, FREAD, ap->a_p) == 0) { vap->va_bytes = dbtob(dpart.disklab->d_partitions [minor(vp->v_rdev)].p_size); vap->va_size = vap->va_bytes; } return (0); } Index: head/sys/msdosfs/msdosfs_vfsops.c =================================================================== --- head/sys/msdosfs/msdosfs_vfsops.c (revision 49534) +++ head/sys/msdosfs/msdosfs_vfsops.c (revision 49535) @@ -1,1017 +1,1016 @@ -/* $Id: msdosfs_vfsops.c,v 1.44 1999/05/08 06:40:00 phk Exp $ */ +/* $Id: msdosfs_vfsops.c,v 1.45 1999/05/31 11:28:02 phk Exp $ */ /* $NetBSD: msdosfs_vfsops.c,v 1.51 1997/11/17 15:36:58 ws Exp $ */ /*- * Copyright (C) 1994, 1995, 1997 Wolfgang Solfrank. * Copyright (C) 1994, 1995, 1997 TooLs GmbH. * All rights reserved. * Original code by Paul Popelka (paulp@uts.amdahl.com) (see below). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by TooLs GmbH. * 4. The name of TooLs GmbH may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * Written by Paul Popelka (paulp@uts.amdahl.com) * * You can do anything you want with this software, just don't say you wrote * it, and don't remove this notice. * * This software is provided "as is". * * The author supplies this software to be publicly redistributed on the * understanding that the author is not responsible for the correct * functioning of this software in any circumstances and is not liable for * any damages caused by this software. 
* * October 1992 */ #include #include #include #include #include #include #include -#include /* XXX */ /* defines v_rdev */ #include #include #include #include #include /* defines ALLPERMS */ #include #include #include #include #include #include MALLOC_DEFINE(M_MSDOSFSMNT, "MSDOSFS mount", "MSDOSFS mount structure"); static MALLOC_DEFINE(M_MSDOSFSFAT, "MSDOSFS FAT", "MSDOSFS file allocation table"); static int update_mp __P((struct mount *mp, struct msdosfs_args *argp)); static int mountmsdosfs __P((struct vnode *devvp, struct mount *mp, struct proc *p, struct msdosfs_args *argp)); static int msdosfs_fhtovp __P((struct mount *, struct fid *, struct sockaddr *, struct vnode **, int *, struct ucred **)); static int msdosfs_mount __P((struct mount *, char *, caddr_t, struct nameidata *, struct proc *)); static int msdosfs_quotactl __P((struct mount *, int, uid_t, caddr_t, struct proc *)); static int msdosfs_root __P((struct mount *, struct vnode **)); static int msdosfs_start __P((struct mount *, int, struct proc *)); static int msdosfs_statfs __P((struct mount *, struct statfs *, struct proc *)); static int msdosfs_sync __P((struct mount *, int, struct ucred *, struct proc *)); static int msdosfs_unmount __P((struct mount *, int, struct proc *)); static int msdosfs_vget __P((struct mount *mp, ino_t ino, struct vnode **vpp)); static int msdosfs_vptofh __P((struct vnode *, struct fid *)); static int update_mp(mp, argp) struct mount *mp; struct msdosfs_args *argp; { struct msdosfsmount *pmp = VFSTOMSDOSFS(mp); int error; pmp->pm_gid = argp->gid; pmp->pm_uid = argp->uid; pmp->pm_mask = argp->mask & ALLPERMS; pmp->pm_flags |= argp->flags & MSDOSFSMNT_MNTOPT; if (pmp->pm_flags & MSDOSFSMNT_U2WTABLE) { bcopy(argp->u2w, pmp->pm_u2w, sizeof(pmp->pm_u2w)); bcopy(argp->d2u, pmp->pm_d2u, sizeof(pmp->pm_d2u)); bcopy(argp->u2d, pmp->pm_u2d, sizeof(pmp->pm_u2d)); } if (pmp->pm_flags & MSDOSFSMNT_ULTABLE) { bcopy(argp->ul, pmp->pm_ul, sizeof(pmp->pm_ul)); bcopy(argp->lu, pmp->pm_lu, sizeof(pmp->pm_lu)); } #ifndef __FreeBSD__ /* * GEMDOS knows nothing (yet) about win95 */ if (pmp->pm_flags & MSDOSFSMNT_GEMDOSFS) pmp->pm_flags |= MSDOSFSMNT_NOWIN95; #endif if (pmp->pm_flags & MSDOSFSMNT_NOWIN95) pmp->pm_flags |= MSDOSFSMNT_SHORTNAME; else if (!(pmp->pm_flags & (MSDOSFSMNT_SHORTNAME | MSDOSFSMNT_LONGNAME))) { struct vnode *rootvp; /* * Try to divine whether to support Win'95 long filenames */ if (FAT32(pmp)) pmp->pm_flags |= MSDOSFSMNT_LONGNAME; else { if ((error = msdosfs_root(mp, &rootvp)) != 0) return error; pmp->pm_flags |= findwin95(VTODE(rootvp)) ? MSDOSFSMNT_LONGNAME : MSDOSFSMNT_SHORTNAME; vput(rootvp); } } return 0; } #ifndef __FreeBSD__ int msdosfs_mountroot() { register struct mount *mp; struct proc *p = curproc; /* XXX */ size_t size; int error; struct msdosfs_args args; if (root_device->dv_class != DV_DISK) return (ENODEV); /* * Get vnodes for swapdev and rootdev. 
*/ if (bdevvp(rootdev, &rootvp)) panic("msdosfs_mountroot: can't setup rootvp"); mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK); bzero((char *)mp, (u_long)sizeof(struct mount)); mp->mnt_op = &msdosfs_vfsops; mp->mnt_flag = 0; LIST_INIT(&mp->mnt_vnodelist); args.flags = 0; args.uid = 0; args.gid = 0; args.mask = 0777; if ((error = mountmsdosfs(rootvp, mp, p, &args)) != 0) { free(mp, M_MOUNT); return (error); } if ((error = update_mp(mp, &args)) != 0) { (void)msdosfs_unmount(mp, 0, p); free(mp, M_MOUNT); return (error); } if ((error = vfs_lock(mp)) != 0) { (void)msdosfs_unmount(mp, 0, p); free(mp, M_MOUNT); return (error); } CIRCLEQ_INSERT_TAIL(&mountlist, mp, mnt_list); mp->mnt_vnodecovered = NULLVP; (void) copystr("/", mp->mnt_stat.f_mntonname, MNAMELEN - 1, &size); bzero(mp->mnt_stat.f_mntonname + size, MNAMELEN - size); (void) copystr(ROOTNAME, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, &size); bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size); (void)msdosfs_statfs(mp, &mp->mnt_stat, p); vfs_unlock(mp); return (0); } #endif /* * mp - path - addr in user space of mount point (ie /usr or whatever) * data - addr in user space of mount params including the name of the block * special file to treat as a filesystem. */ static int msdosfs_mount(mp, path, data, ndp, p) struct mount *mp; char *path; caddr_t data; struct nameidata *ndp; struct proc *p; { struct vnode *devvp; /* vnode for blk device to mount */ struct msdosfs_args args; /* will hold data from mount request */ /* msdosfs specific mount control block */ struct msdosfsmount *pmp = NULL; size_t size; int error, flags; mode_t accessmode; error = copyin(data, (caddr_t)&args, sizeof(struct msdosfs_args)); if (error) return (error); if (args.magic != MSDOSFS_ARGSMAGIC) args.flags = 0; /* * If updating, check whether changing from read-only to * read/write; if there is no device name, that's all we do. */ if (mp->mnt_flag & MNT_UPDATE) { pmp = VFSTOMSDOSFS(mp); error = 0; if (!(pmp->pm_flags & MSDOSFSMNT_RONLY) && (mp->mnt_flag & MNT_RDONLY)) { flags = WRITECLOSE; if (mp->mnt_flag & MNT_FORCE) flags |= FORCECLOSE; error = vflush(mp, NULLVP, flags); } if (!error && (mp->mnt_flag & MNT_RELOAD)) /* not yet implemented */ error = EOPNOTSUPP; if (error) return (error); if ((pmp->pm_flags & MSDOSFSMNT_RONLY) && (mp->mnt_kern_flag & MNTK_WANTRDWR)) { /* * If upgrade to read-write by non-root, then verify * that user has necessary permissions on the device. */ if (p->p_ucred->cr_uid != 0) { devvp = pmp->pm_devvp; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, p); error = VOP_ACCESS(devvp, VREAD | VWRITE, p->p_ucred, p); if (error) { VOP_UNLOCK(devvp, 0, p); return (error); } VOP_UNLOCK(devvp, 0, p); } pmp->pm_flags &= ~MSDOSFSMNT_RONLY; } if (args.fspec == 0) { #ifdef __notyet__ /* doesn't work correctly with current mountd XXX */ if (args.flags & MSDOSFSMNT_MNTOPT) { pmp->pm_flags &= ~MSDOSFSMNT_MNTOPT; pmp->pm_flags |= args.flags & MSDOSFSMNT_MNTOPT; if (pmp->pm_flags & MSDOSFSMNT_NOWIN95) pmp->pm_flags |= MSDOSFSMNT_SHORTNAME; } #endif /* * Process export requests. */ return (vfs_export(mp, &pmp->pm_export, &args.export)); } } /* * Not an update, or updating the name: look up the name * and verify that it refers to a sensible block device. 
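 * namei() resolves the user-supplied args.fspec; the result must be a
 * VBLK vnode with a registered bdevsw entry, and a non-root caller
 * must also have the appropriate access rights on the device node.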
*/ NDINIT(ndp, LOOKUP, FOLLOW, UIO_USERSPACE, args.fspec, p); error = namei(ndp); if (error) return (error); devvp = ndp->ni_vp; if (devvp->v_type != VBLK) { vrele(devvp); return (ENOTBLK); } if (bdevsw(devvp->v_rdev) == NULL) { vrele(devvp); return (ENXIO); } /* * If mount by non-root, then verify that user has necessary * permissions on the device. */ if (p->p_ucred->cr_uid != 0) { accessmode = VREAD; if ((mp->mnt_flag & MNT_RDONLY) == 0) accessmode |= VWRITE; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, p); error = VOP_ACCESS(devvp, accessmode, p->p_ucred, p); if (error) { vput(devvp); return (error); } VOP_UNLOCK(devvp, 0, p); } if ((mp->mnt_flag & MNT_UPDATE) == 0) { error = mountmsdosfs(devvp, mp, p, &args); #ifdef MSDOSFS_DEBUG /* only needed for the printf below */ pmp = VFSTOMSDOSFS(mp); #endif } else { if (devvp != pmp->pm_devvp) error = EINVAL; /* XXX needs translation */ else vrele(devvp); } if (error) { vrele(devvp); return (error); } error = update_mp(mp, &args); if (error) { msdosfs_unmount(mp, MNT_FORCE, p); return error; } (void) copyinstr(path, mp->mnt_stat.f_mntonname, MNAMELEN - 1, &size); bzero(mp->mnt_stat.f_mntonname + size, MNAMELEN - size); (void) copyinstr(args.fspec, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, &size); bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size); (void) msdosfs_statfs(mp, &mp->mnt_stat, p); #ifdef MSDOSFS_DEBUG printf("msdosfs_mount(): mp %p, pmp %p, inusemap %p\n", mp, pmp, pmp->pm_inusemap); #endif return (0); } static int mountmsdosfs(devvp, mp, p, argp) struct vnode *devvp; struct mount *mp; struct proc *p; struct msdosfs_args *argp; { struct msdosfsmount *pmp; struct buf *bp; dev_t dev = devvp->v_rdev; #ifndef __FreeBSD__ struct partinfo dpart; int bsize = 0, dtype = 0, tmp; #endif union bootsector *bsp; struct byte_bpb33 *b33; struct byte_bpb50 *b50; struct byte_bpb710 *b710; u_int8_t SecPerClust; int ronly, error; /* * Disallow multiple mounts of the same device. * Disallow mounting of a device that is currently in use * (except for root, which might share swap device for miniroot). * Flush out any old buffers remaining from a previous use. */ error = vfs_mountedon(devvp); if (error) return (error); if (vcount(devvp) > 1 && devvp != rootvp) return (EBUSY); vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, p); error = vinvalbuf(devvp, V_SAVE, p->p_ucred, p, 0, 0); VOP_UNLOCK(devvp, 0, p); if (error) return (error); ronly = (mp->mnt_flag & MNT_RDONLY) != 0; error = VOP_OPEN(devvp, ronly ? FREAD : FREAD|FWRITE, FSCRED, p); if (error) return (error); bp = NULL; /* both used in error_exit */ pmp = NULL; #ifndef __FreeBSD__ if (argp->flags & MSDOSFSMNT_GEMDOSFS) { /* * We need the disklabel to calculate the size of a FAT entry * later on. Also make sure the partition contains a filesystem * of type FS_MSDOS. This doesn't work for floppies, so we have * to check for them too. * * At least some parts of the msdos fs driver seem to assume * that the size of a disk block will always be 512 bytes. * Let's check it... */ error = VOP_IOCTL(devvp, DIOCGPART, (caddr_t)&dpart, FREAD, NOCRED, p); if (error) goto error_exit; tmp = dpart.part->p_fstype; dtype = dpart.disklab->d_type; bsize = dpart.disklab->d_secsize; if (bsize != 512 || (dtype!=DTYPE_FLOPPY && tmp!=FS_MSDOS)) { error = EINVAL; goto error_exit; } } #endif /* * Read the boot sector of the filesystem, and then check the * boot signature. If not a dos boot sector then error out. 
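 * The stock check compares bsBootSectSig0/1 against BOOTSIG0/BOOTSIG1
 * (the usual 0x55/0xaa pair); the PC98 build additionally accepts the
 * signatures written by the PC-98 DOS variants listed in the #ifdef
 * below.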
*/ #ifdef PC98 error = bread(devvp, 0, 1024, NOCRED, &bp); #else error = bread(devvp, 0, 512, NOCRED, &bp); #endif if (error) goto error_exit; bp->b_flags |= B_AGE; bsp = (union bootsector *)bp->b_data; b33 = (struct byte_bpb33 *)bsp->bs33.bsBPB; b50 = (struct byte_bpb50 *)bsp->bs50.bsBPB; b710 = (struct byte_bpb710 *)bsp->bs710.bsPBP; #ifndef __FreeBSD__ if (!(argp->flags & MSDOSFSMNT_GEMDOSFS)) { #endif #ifdef PC98 if ((bsp->bs50.bsBootSectSig0 != BOOTSIG0 || bsp->bs50.bsBootSectSig1 != BOOTSIG1) && (bsp->bs50.bsBootSectSig0 != 0 /* PC98 DOS 3.3x */ || bsp->bs50.bsBootSectSig1 != 0) && (bsp->bs50.bsBootSectSig0 != 0x90 /* PC98 DOS 5.0 */ || bsp->bs50.bsBootSectSig1 != 0x3d) && (bsp->bs50.bsBootSectSig0 != 0x46 /* PC98 DOS 3.3B */ || bsp->bs50.bsBootSectSig1 != 0xfa)) { #else if (bsp->bs50.bsBootSectSig0 != BOOTSIG0 || bsp->bs50.bsBootSectSig1 != BOOTSIG1) { #endif error = EINVAL; goto error_exit; } #ifndef __FreeBSD__ } #endif pmp = malloc(sizeof *pmp, M_MSDOSFSMNT, M_WAITOK); bzero((caddr_t)pmp, sizeof *pmp); pmp->pm_mountp = mp; /* * Compute several useful quantities from the bpb in the * bootsector. Copy in the dos 5 variant of the bpb then fix up * the fields that are different between dos 5 and dos 3.3. */ SecPerClust = b50->bpbSecPerClust; pmp->pm_BytesPerSec = getushort(b50->bpbBytesPerSec); pmp->pm_ResSectors = getushort(b50->bpbResSectors); pmp->pm_FATs = b50->bpbFATs; pmp->pm_RootDirEnts = getushort(b50->bpbRootDirEnts); pmp->pm_Sectors = getushort(b50->bpbSectors); pmp->pm_FATsecs = getushort(b50->bpbFATsecs); pmp->pm_SecPerTrack = getushort(b50->bpbSecPerTrack); pmp->pm_Heads = getushort(b50->bpbHeads); pmp->pm_Media = b50->bpbMedia; #ifndef __FreeBSD__ if (!(argp->flags & MSDOSFSMNT_GEMDOSFS)) { #endif /* XXX - We should probably check more values here */ if (!pmp->pm_BytesPerSec || !SecPerClust || !pmp->pm_Heads || pmp->pm_Heads > 255 #ifdef PC98 || !pmp->pm_SecPerTrack || pmp->pm_SecPerTrack > 255) { #else || !pmp->pm_SecPerTrack || pmp->pm_SecPerTrack > 63) { #endif error = EINVAL; goto error_exit; } #ifndef __FreeBSD__ } #endif if (pmp->pm_Sectors == 0) { pmp->pm_HiddenSects = getulong(b50->bpbHiddenSecs); pmp->pm_HugeSectors = getulong(b50->bpbHugeSectors); } else { pmp->pm_HiddenSects = getushort(b33->bpbHiddenSecs); pmp->pm_HugeSectors = pmp->pm_Sectors; } if (pmp->pm_HugeSectors > 0xffffffff / (pmp->pm_BytesPerSec / sizeof(struct direntry)) + 1) { /* * We cannot deal currently with this size of disk * due to fileid limitations (see msdosfs_getattr and * msdosfs_readdir) */ error = EINVAL; printf("mountmsdosfs(): disk too big, sorry\n"); goto error_exit; } if (pmp->pm_RootDirEnts == 0) { if (bsp->bs710.bsBootSectSig2 != BOOTSIG2 || bsp->bs710.bsBootSectSig3 != BOOTSIG3 || pmp->pm_Sectors || pmp->pm_FATsecs || getushort(b710->bpbFSVers)) { error = EINVAL; printf("mountmsdosfs(): bad FAT32 filesystem\n"); goto error_exit; } pmp->pm_fatmask = FAT32_MASK; pmp->pm_fatmult = 4; pmp->pm_fatdiv = 1; pmp->pm_FATsecs = getulong(b710->bpbBigFATsecs); if (getushort(b710->bpbExtFlags) & FATMIRROR) pmp->pm_curfat = getushort(b710->bpbExtFlags) & FATNUM; else pmp->pm_flags |= MSDOSFS_FATMIRROR; } else pmp->pm_flags |= MSDOSFS_FATMIRROR; #ifndef __FreeBSD__ if (argp->flags & MSDOSFSMNT_GEMDOSFS) { if (FAT32(pmp)) { /* * GEMDOS doesn't know fat32. 
*/ error = EINVAL; goto error_exit; } /* * Check a few values (could do some more): * - logical sector size: power of 2, >= block size * - sectors per cluster: power of 2, >= 1 * - number of sectors: >= 1, <= size of partition */ if ( (SecPerClust == 0) || (SecPerClust & (SecPerClust - 1)) || (pmp->pm_BytesPerSec < bsize) || (pmp->pm_BytesPerSec & (pmp->pm_BytesPerSec - 1)) || (pmp->pm_HugeSectors == 0) || (pmp->pm_HugeSectors * (pmp->pm_BytesPerSec / bsize) > dpart.part->p_size) ) { error = EINVAL; goto error_exit; } /* * XXX - Many parts of the msdos fs driver seem to assume that * the number of bytes per logical sector (BytesPerSec) will * always be the same as the number of bytes per disk block * Let's pretend it is. */ tmp = pmp->pm_BytesPerSec / bsize; pmp->pm_BytesPerSec = bsize; pmp->pm_HugeSectors *= tmp; pmp->pm_HiddenSects *= tmp; pmp->pm_ResSectors *= tmp; pmp->pm_Sectors *= tmp; pmp->pm_FATsecs *= tmp; SecPerClust *= tmp; } #endif pmp->pm_fatblk = pmp->pm_ResSectors; if (FAT32(pmp)) { pmp->pm_rootdirblk = getulong(b710->bpbRootClust); pmp->pm_firstcluster = pmp->pm_fatblk + (pmp->pm_FATs * pmp->pm_FATsecs); pmp->pm_fsinfo = getushort(b710->bpbFSInfo); } else { pmp->pm_rootdirblk = pmp->pm_fatblk + (pmp->pm_FATs * pmp->pm_FATsecs); pmp->pm_rootdirsize = (pmp->pm_RootDirEnts * sizeof(struct direntry) + pmp->pm_BytesPerSec - 1) / pmp->pm_BytesPerSec;/* in sectors */ pmp->pm_firstcluster = pmp->pm_rootdirblk + pmp->pm_rootdirsize; } pmp->pm_nmbrofclusters = (pmp->pm_HugeSectors - pmp->pm_firstcluster) / SecPerClust; pmp->pm_maxcluster = pmp->pm_nmbrofclusters + 1; pmp->pm_fatsize = pmp->pm_FATsecs * pmp->pm_BytesPerSec; #ifndef __FreeBSD__ if (argp->flags & MSDOSFSMNT_GEMDOSFS) { if ((pmp->pm_nmbrofclusters <= (0xff0 - 2)) && ((dtype == DTYPE_FLOPPY) || ((dtype == DTYPE_VNODE) && ((pmp->pm_Heads == 1) || (pmp->pm_Heads == 2)))) ) { pmp->pm_fatmask = FAT12_MASK; pmp->pm_fatmult = 3; pmp->pm_fatdiv = 2; } else { pmp->pm_fatmask = FAT16_MASK; pmp->pm_fatmult = 2; pmp->pm_fatdiv = 1; } } else #endif if (pmp->pm_fatmask == 0) { if (pmp->pm_maxcluster <= ((CLUST_RSRVD - CLUST_FIRST) & FAT12_MASK)) { /* * This will usually be a floppy disk. This size makes * sure that one fat entry will not be split across * multiple blocks. */ pmp->pm_fatmask = FAT12_MASK; pmp->pm_fatmult = 3; pmp->pm_fatdiv = 2; } else { pmp->pm_fatmask = FAT16_MASK; pmp->pm_fatmult = 2; pmp->pm_fatdiv = 1; } } if (FAT12(pmp)) pmp->pm_fatblocksize = 3 * pmp->pm_BytesPerSec; else pmp->pm_fatblocksize = DFLTBSIZE; pmp->pm_fatblocksec = pmp->pm_fatblocksize / pmp->pm_BytesPerSec; pmp->pm_bnshift = ffs(pmp->pm_BytesPerSec) - 1; /* * Compute mask and shift value for isolating cluster relative byte * offsets and cluster numbers from a file offset. */ pmp->pm_bpcluster = SecPerClust * pmp->pm_BytesPerSec; pmp->pm_crbomask = pmp->pm_bpcluster - 1; pmp->pm_cnshift = ffs(pmp->pm_bpcluster) - 1; /* * Check for valid cluster size * must be a power of 2 */ if (pmp->pm_bpcluster ^ (1 << pmp->pm_cnshift)) { error = EINVAL; goto error_exit; } /* * Release the bootsector buffer. */ brelse(bp); bp = NULL; /* * Check FSInfo. 
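 * The FAT32 FSInfo block is recognized by its "RRaA"/"rrAa" signatures;
 * when they check out, pm_nxtfree is seeded from the stored
 * next-free-cluster hint, otherwise pm_fsinfo is cleared and the hint
 * is ignored.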
*/ if (pmp->pm_fsinfo) { struct fsinfo *fp; if ((error = bread(devvp, pmp->pm_fsinfo, 1024, NOCRED, &bp)) != 0) goto error_exit; fp = (struct fsinfo *)bp->b_data; if (!bcmp(fp->fsisig1, "RRaA", 4) && !bcmp(fp->fsisig2, "rrAa", 4) && !bcmp(fp->fsisig3, "\0\0\125\252", 4) && !bcmp(fp->fsisig4, "\0\0\125\252", 4)) pmp->pm_nxtfree = getulong(fp->fsinxtfree); else pmp->pm_fsinfo = 0; brelse(bp); bp = NULL; } /* * Check and validate (or perhaps invalidate?) the fsinfo structure? XXX */ /* * Allocate memory for the bitmap of allocated clusters, and then * fill it in. */ pmp->pm_inusemap = malloc(((pmp->pm_maxcluster + N_INUSEBITS - 1) / N_INUSEBITS) * sizeof(*pmp->pm_inusemap), M_MSDOSFSFAT, M_WAITOK); /* * fillinusemap() needs pm_devvp. */ pmp->pm_dev = dev; pmp->pm_devvp = devvp; /* * Have the inuse map filled in. */ if ((error = fillinusemap(pmp)) != 0) goto error_exit; /* * If they want fat updates to be synchronous then let them suffer * the performance degradation in exchange for the on disk copy of * the fat being correct just about all the time. I suppose this * would be a good thing to turn on if the kernel is still flakey. */ if (mp->mnt_flag & MNT_SYNCHRONOUS) pmp->pm_flags |= MSDOSFSMNT_WAITONFAT; /* * Finish up. */ if (ronly) pmp->pm_flags |= MSDOSFSMNT_RONLY; else pmp->pm_fmod = 1; mp->mnt_data = (qaddr_t) pmp; mp->mnt_stat.f_fsid.val[0] = (long)dev; mp->mnt_stat.f_fsid.val[1] = mp->mnt_vfc->vfc_typenum; mp->mnt_flag |= MNT_LOCAL; devvp->v_specmountpoint = mp; return 0; error_exit: if (bp) brelse(bp); (void) VOP_CLOSE(devvp, ronly ? FREAD : FREAD | FWRITE, NOCRED, p); if (pmp) { if (pmp->pm_inusemap) free(pmp->pm_inusemap, M_MSDOSFSFAT); free(pmp, M_MSDOSFSMNT); mp->mnt_data = (qaddr_t)0; } return (error); } static int msdosfs_start(mp, flags, p) struct mount *mp; int flags; struct proc *p; { return (0); } /* * Unmount the filesystem described by mp. */ static int msdosfs_unmount(mp, mntflags, p) struct mount *mp; int mntflags; struct proc *p; { struct msdosfsmount *pmp; int error, flags; flags = 0; if (mntflags & MNT_FORCE) flags |= FORCECLOSE; error = vflush(mp, NULLVP, flags); if (error) return error; pmp = VFSTOMSDOSFS(mp); pmp->pm_devvp->v_specmountpoint = NULL; #ifdef MSDOSFS_DEBUG { struct vnode *vp = pmp->pm_devvp; printf("msdosfs_umount(): just before calling VOP_CLOSE()\n"); printf("flag %08lx, usecount %d, writecount %d, holdcnt %ld\n", vp->v_flag, vp->v_usecount, vp->v_writecount, vp->v_holdcnt); printf("lastr %d, id %lu, mount %p, op %p\n", vp->v_lastr, vp->v_id, vp->v_mount, vp->v_op); printf("freef %p, freeb %p, mount %p\n", vp->v_freelist.tqe_next, vp->v_freelist.tqe_prev, vp->v_mount); printf("cleanblkhd %p, dirtyblkhd %p, numoutput %ld, type %d\n", TAILQ_FIRST(&vp->v_cleanblkhd), TAILQ_FIRST(&vp->v_dirtyblkhd), vp->v_numoutput, vp->v_type); printf("union %p, tag %d, data[0] %08x, data[1] %08x\n", vp->v_socket, vp->v_tag, ((u_int *)vp->v_data)[0], ((u_int *)vp->v_data)[1]); } #endif error = VOP_CLOSE(pmp->pm_devvp, (pmp->pm_flags&MSDOSFSMNT_RONLY) ? 
FREAD : FREAD | FWRITE, NOCRED, p); vrele(pmp->pm_devvp); free(pmp->pm_inusemap, M_MSDOSFSFAT); free(pmp, M_MSDOSFSMNT); mp->mnt_data = (qaddr_t)0; mp->mnt_flag &= ~MNT_LOCAL; return (error); } static int msdosfs_root(mp, vpp) struct mount *mp; struct vnode **vpp; { struct msdosfsmount *pmp = VFSTOMSDOSFS(mp); struct denode *ndep; int error; #ifdef MSDOSFS_DEBUG printf("msdosfs_root(); mp %p, pmp %p\n", mp, pmp); #endif error = deget(pmp, MSDOSFSROOT, MSDOSFSROOT_OFS, &ndep); if (error) return (error); *vpp = DETOV(ndep); return (0); } static int msdosfs_quotactl(mp, cmds, uid, arg, p) struct mount *mp; int cmds; uid_t uid; caddr_t arg; struct proc *p; { return EOPNOTSUPP; } static int msdosfs_statfs(mp, sbp, p) struct mount *mp; struct statfs *sbp; struct proc *p; { struct msdosfsmount *pmp; pmp = VFSTOMSDOSFS(mp); sbp->f_bsize = pmp->pm_bpcluster; sbp->f_iosize = pmp->pm_bpcluster; sbp->f_blocks = pmp->pm_nmbrofclusters; sbp->f_bfree = pmp->pm_freeclustercount; sbp->f_bavail = pmp->pm_freeclustercount; sbp->f_files = pmp->pm_RootDirEnts; /* XXX */ sbp->f_ffree = 0; /* what to put in here? */ if (sbp != &mp->mnt_stat) { sbp->f_type = mp->mnt_vfc->vfc_typenum; bcopy(mp->mnt_stat.f_mntonname, sbp->f_mntonname, MNAMELEN); bcopy(mp->mnt_stat.f_mntfromname, sbp->f_mntfromname, MNAMELEN); } strncpy(sbp->f_fstypename, mp->mnt_vfc->vfc_name, MFSNAMELEN); return (0); } static int msdosfs_sync(mp, waitfor, cred, p) struct mount *mp; int waitfor; struct ucred *cred; struct proc *p; { struct vnode *vp, *nvp; struct denode *dep; struct msdosfsmount *pmp = VFSTOMSDOSFS(mp); int error, allerror = 0; /* * If we ever switch to not updating all of the fats all the time, * this would be the place to update them from the first one. */ if (pmp->pm_fmod != 0) { if (pmp->pm_flags & MSDOSFSMNT_RONLY) panic("msdosfs_sync: rofs mod"); else { /* update fats here */ } } /* * Write back each (modified) denode. */ simple_lock(&mntvnode_slock); loop: for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) { /* * If the vnode that we are about to sync is no longer * associated with this mount point, start over. */ if (vp->v_mount != mp) goto loop; simple_lock(&vp->v_interlock); nvp = vp->v_mntvnodes.le_next; dep = VTODE(vp); if (vp->v_type == VNON || ((dep->de_flag & (DE_ACCESS | DE_CREATE | DE_UPDATE | DE_MODIFIED)) == 0 && (TAILQ_EMPTY(&vp->v_dirtyblkhd) || waitfor == MNT_LAZY))) { simple_unlock(&vp->v_interlock); continue; } simple_unlock(&mntvnode_slock); error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, p); if (error) { simple_lock(&mntvnode_slock); if (error == ENOENT) goto loop; continue; } error = VOP_FSYNC(vp, cred, waitfor, p); if (error) allerror = error; VOP_UNLOCK(vp, 0, p); vrele(vp); simple_lock(&mntvnode_slock); } simple_unlock(&mntvnode_slock); /* * Flush filesystem control info. 
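 *
 * "Control info" here means the metadata that is buffered under the
 * device vnode rather than under individual file vnodes -- the FAT
 * and the directory contents -- so flushing it amounts to a single
 * VOP_FSYNC() on pm_devvp, done below unless this is a lazy sync.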
*/ if (waitfor != MNT_LAZY) { vn_lock(pmp->pm_devvp, LK_EXCLUSIVE | LK_RETRY, p); error = VOP_FSYNC(pmp->pm_devvp, cred, waitfor, p); if (error) allerror = error; VOP_UNLOCK(pmp->pm_devvp, 0, p); } return (allerror); } static int msdosfs_fhtovp(mp, fhp, nam, vpp, exflagsp, credanonp) struct mount *mp; struct fid *fhp; struct sockaddr *nam; struct vnode **vpp; int *exflagsp; struct ucred **credanonp; { struct msdosfsmount *pmp = VFSTOMSDOSFS(mp); struct defid *defhp = (struct defid *) fhp; struct denode *dep; struct netcred *np; int error; np = vfs_export_lookup(mp, &pmp->pm_export, nam); if (np == NULL) return (EACCES); error = deget(pmp, defhp->defid_dirclust, defhp->defid_dirofs, &dep); if (error) { *vpp = NULLVP; return (error); } *vpp = DETOV(dep); *exflagsp = np->netc_exflags; *credanonp = &np->netc_anon; return (0); } static int msdosfs_vptofh(vp, fhp) struct vnode *vp; struct fid *fhp; { struct denode *dep; struct defid *defhp; dep = VTODE(vp); defhp = (struct defid *)fhp; defhp->defid_len = sizeof(struct defid); defhp->defid_dirclust = dep->de_dirclust; defhp->defid_dirofs = dep->de_diroffset; /* defhp->defid_gen = dep->de_gen; */ return (0); } static int msdosfs_vget(mp, ino, vpp) struct mount *mp; ino_t ino; struct vnode **vpp; { return EOPNOTSUPP; } static struct vfsops msdosfs_vfsops = { msdosfs_mount, msdosfs_start, msdosfs_unmount, msdosfs_root, msdosfs_quotactl, msdosfs_statfs, msdosfs_sync, msdosfs_vget, msdosfs_fhtovp, msdosfs_vptofh, msdosfs_init }; VFS_SET(msdosfs_vfsops, msdos, 0); Index: head/sys/msdosfs/msdosfs_vnops.c =================================================================== --- head/sys/msdosfs/msdosfs_vnops.c (revision 49534) +++ head/sys/msdosfs/msdosfs_vnops.c (revision 49535) @@ -1,1986 +1,1985 @@ -/* $Id: msdosfs_vnops.c,v 1.86 1999/06/26 02:46:26 mckusick Exp $ */ +/* $Id: msdosfs_vnops.c,v 1.87 1999/07/25 04:01:32 bde Exp $ */ /* $NetBSD: msdosfs_vnops.c,v 1.68 1998/02/10 14:10:04 mrg Exp $ */ /*- * Copyright (C) 1994, 1995, 1997 Wolfgang Solfrank. * Copyright (C) 1994, 1995, 1997 TooLs GmbH. * All rights reserved. * Original code by Paul Popelka (paulp@uts.amdahl.com) (see below). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by TooLs GmbH. * 4. The name of TooLs GmbH may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * Written by Paul Popelka (paulp@uts.amdahl.com) * * You can do anything you want with this software, just don't say you wrote * it, and don't remove this notice. * * This software is provided "as is". * * The author supplies this software to be publicly redistributed on the * understanding that the author is not responsible for the correct * functioning of this software in any circumstances and is not liable for * any damages caused by this software. * * October 1992 */ #include #include #include #include /* defines plimit structure in proc struct */ #include #include #include #include #include #include #include -#include /* XXX */ /* defines v_rdev */ #include #include #include #include #include #include #include #include #include #include #include #include /* * Prototypes for MSDOSFS vnode operations */ static int msdosfs_create __P((struct vop_create_args *)); static int msdosfs_mknod __P((struct vop_mknod_args *)); static int msdosfs_close __P((struct vop_close_args *)); static int msdosfs_access __P((struct vop_access_args *)); static int msdosfs_getattr __P((struct vop_getattr_args *)); static int msdosfs_setattr __P((struct vop_setattr_args *)); static int msdosfs_read __P((struct vop_read_args *)); static int msdosfs_write __P((struct vop_write_args *)); static int msdosfs_fsync __P((struct vop_fsync_args *)); static int msdosfs_remove __P((struct vop_remove_args *)); static int msdosfs_link __P((struct vop_link_args *)); static int msdosfs_rename __P((struct vop_rename_args *)); static int msdosfs_mkdir __P((struct vop_mkdir_args *)); static int msdosfs_rmdir __P((struct vop_rmdir_args *)); static int msdosfs_symlink __P((struct vop_symlink_args *)); static int msdosfs_readdir __P((struct vop_readdir_args *)); static int msdosfs_abortop __P((struct vop_abortop_args *)); static int msdosfs_bmap __P((struct vop_bmap_args *)); static int msdosfs_strategy __P((struct vop_strategy_args *)); static int msdosfs_print __P((struct vop_print_args *)); static int msdosfs_pathconf __P((struct vop_pathconf_args *ap)); static int msdosfs_getpages __P((struct vop_getpages_args *)); static int msdosfs_putpages __P((struct vop_putpages_args *)); /* * Some general notes: * * In the ufs filesystem the inodes, superblocks, and indirect blocks are * read/written using the vnode for the filesystem. Blocks that represent * the contents of a file are read/written using the vnode for the file * (including directories when they are read/written as files). This * presents problems for the dos filesystem because data that should be in * an inode (if dos had them) resides in the directory itself. Since we * must update directory entries without the benefit of having the vnode * for the directory we must use the vnode for the filesystem. This means * that when a directory is actually read/written (via read, write, or * readdir, or seek) we must use the vnode for the filesystem instead of * the vnode for the directory as would happen in ufs. 
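 * (Concretely, msdosfs_read() below does, roughly,
 *
 *	if (isadir)
 *		error = bread(pmp->pm_devvp, bn, blsize, NOCRED, &bp);
 *	else
 *		error = bread(vp, lbn, pmp->pm_bpcluster, NOCRED, &bp);
 *
 * after using pcbmap() to turn the directory's cluster number into a
 * device block number, while plain files do i/o through their own
 * vnode.)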
This is to insure we * retreive the correct block from the buffer cache since the hash value is * based upon the vnode address and the desired block number. */ /* * Create a regular file. On entry the directory to contain the file being * created is locked. We must release before we return. We must also free * the pathname buffer pointed at by cnp->cn_pnbuf, always on error, or * only if the SAVESTART bit in cn_flags is clear on success. */ static int msdosfs_create(ap) struct vop_create_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { struct componentname *cnp = ap->a_cnp; struct denode ndirent; struct denode *dep; struct denode *pdep = VTODE(ap->a_dvp); struct timespec ts; int error; #ifdef MSDOSFS_DEBUG printf("msdosfs_create(cnp %p, vap %p\n", cnp, ap->a_vap); #endif /* * If this is the root directory and there is no space left we * can't do anything. This is because the root directory can not * change size. */ if (pdep->de_StartCluster == MSDOSFSROOT && pdep->de_fndoffset >= pdep->de_FileSize) { error = ENOSPC; goto bad; } /* * Create a directory entry for the file, then call createde() to * have it installed. NOTE: DOS files are always executable. We * use the absence of the owner write bit to make the file * readonly. */ #ifdef DIAGNOSTIC if ((cnp->cn_flags & HASBUF) == 0) panic("msdosfs_create: no name"); #endif bzero(&ndirent, sizeof(ndirent)); error = uniqdosname(pdep, cnp, ndirent.de_Name); if (error) goto bad; ndirent.de_Attributes = (ap->a_vap->va_mode & VWRITE) ? ATTR_ARCHIVE : ATTR_ARCHIVE | ATTR_READONLY; ndirent.de_LowerCase = 0; ndirent.de_StartCluster = 0; ndirent.de_FileSize = 0; ndirent.de_dev = pdep->de_dev; ndirent.de_devvp = pdep->de_devvp; ndirent.de_pmp = pdep->de_pmp; ndirent.de_flag = DE_ACCESS | DE_CREATE | DE_UPDATE; getnanotime(&ts); DETIMES(&ndirent, &ts, &ts, &ts); error = createde(&ndirent, pdep, &dep, cnp); if (error) goto bad; if ((cnp->cn_flags & SAVESTART) == 0) zfree(namei_zone, cnp->cn_pnbuf); *ap->a_vpp = DETOV(dep); return (0); bad: zfree(namei_zone, cnp->cn_pnbuf); return (error); } static int msdosfs_mknod(ap) struct vop_mknod_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { switch (ap->a_vap->va_type) { case VDIR: return (msdosfs_mkdir((struct vop_mkdir_args *)ap)); break; case VREG: return (msdosfs_create((struct vop_create_args *)ap)); break; default: zfree(namei_zone, ap->a_cnp->cn_pnbuf); return (EINVAL); } /* NOTREACHED */ } static int msdosfs_close(ap) struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct proc *a_p; } */ *ap; { struct vnode *vp = ap->a_vp; struct denode *dep = VTODE(vp); struct timespec ts; simple_lock(&vp->v_interlock); if (vp->v_usecount > 1) { getnanotime(&ts); DETIMES(dep, &ts, &ts, &ts); } simple_unlock(&vp->v_interlock); return 0; } static int msdosfs_access(ap) struct vop_access_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct proc *a_p; } */ *ap; { struct vnode *vp = ap->a_vp; struct denode *dep = VTODE(ap->a_vp); struct msdosfsmount *pmp = dep->de_pmp; struct ucred *cred = ap->a_cred; mode_t mask, file_mode, mode = ap->a_mode; register gid_t *gp; int i; file_mode = (S_IXUSR|S_IXGRP|S_IXOTH) | (S_IRUSR|S_IRGRP|S_IROTH) | ((dep->de_Attributes & ATTR_READONLY) ? 
0 : (S_IWUSR|S_IWGRP|S_IWOTH)); file_mode &= pmp->pm_mask; /* * Disallow write attempts on read-only file systems; * unless the file is a socket, fifo, or a block or * character device resident on the file system. */ if (mode & VWRITE) { switch (vp->v_type) { case VDIR: case VLNK: case VREG: if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); break; default: break; } } /* User id 0 always gets access. */ if (cred->cr_uid == 0) return 0; mask = 0; /* Otherwise, check the owner. */ if (cred->cr_uid == pmp->pm_uid) { if (mode & VEXEC) mask |= S_IXUSR; if (mode & VREAD) mask |= S_IRUSR; if (mode & VWRITE) mask |= S_IWUSR; return (file_mode & mask) == mask ? 0 : EACCES; } /* Otherwise, check the groups. */ for (i = 0, gp = cred->cr_groups; i < cred->cr_ngroups; i++, gp++) if (pmp->pm_gid == *gp) { if (mode & VEXEC) mask |= S_IXGRP; if (mode & VREAD) mask |= S_IRGRP; if (mode & VWRITE) mask |= S_IWGRP; return (file_mode & mask) == mask ? 0 : EACCES; } /* Otherwise, check everyone else. */ if (mode & VEXEC) mask |= S_IXOTH; if (mode & VREAD) mask |= S_IROTH; if (mode & VWRITE) mask |= S_IWOTH; return (file_mode & mask) == mask ? 0 : EACCES; } static int msdosfs_getattr(ap) struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct proc *a_p; } */ *ap; { struct denode *dep = VTODE(ap->a_vp); struct msdosfsmount *pmp = dep->de_pmp; struct vattr *vap = ap->a_vap; mode_t mode; struct timespec ts; u_long dirsperblk = pmp->pm_BytesPerSec / sizeof(struct direntry); u_long fileid; getnanotime(&ts); DETIMES(dep, &ts, &ts, &ts); vap->va_fsid = dev2udev(dep->de_dev); /* * The following computation of the fileid must be the same as that * used in msdosfs_readdir() to compute d_fileno. If not, pwd * doesn't work. */ if (dep->de_Attributes & ATTR_DIRECTORY) { fileid = cntobn(pmp, dep->de_StartCluster) * dirsperblk; if (dep->de_StartCluster == MSDOSFSROOT) fileid = 1; } else { fileid = cntobn(pmp, dep->de_dirclust) * dirsperblk; if (dep->de_dirclust == MSDOSFSROOT) fileid = roottobn(pmp, 0) * dirsperblk; fileid += dep->de_diroffset / sizeof(struct direntry); } vap->va_fileid = fileid; if ((dep->de_Attributes & ATTR_READONLY) == 0) mode = S_IRWXU|S_IRWXG|S_IRWXO; else mode = S_IRUSR|S_IXUSR|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH; vap->va_mode = mode & pmp->pm_mask; vap->va_uid = pmp->pm_uid; vap->va_gid = pmp->pm_gid; vap->va_nlink = 1; vap->va_rdev = 0; vap->va_size = dep->de_FileSize; dos2unixtime(dep->de_MDate, dep->de_MTime, 0, &vap->va_mtime); if (pmp->pm_flags & MSDOSFSMNT_LONGNAME) { dos2unixtime(dep->de_ADate, 0, 0, &vap->va_atime); dos2unixtime(dep->de_CDate, dep->de_CTime, dep->de_CHun, &vap->va_ctime); } else { vap->va_atime = vap->va_mtime; vap->va_ctime = vap->va_mtime; } vap->va_flags = 0; if ((dep->de_Attributes & ATTR_ARCHIVE) == 0) vap->va_flags |= SF_ARCHIVED; vap->va_gen = 0; vap->va_blocksize = pmp->pm_bpcluster; vap->va_bytes = (dep->de_FileSize + pmp->pm_crbomask) & ~pmp->pm_crbomask; vap->va_type = ap->a_vp->v_type; vap->va_filerev = dep->de_modrev; return (0); } static int msdosfs_setattr(ap) struct vop_setattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct proc *a_p; } */ *ap; { struct vnode *vp = ap->a_vp; struct denode *dep = VTODE(ap->a_vp); struct msdosfsmount *pmp = dep->de_pmp; struct vattr *vap = ap->a_vap; struct ucred *cred = ap->a_cred; int error = 0; #ifdef MSDOSFS_DEBUG printf("msdosfs_setattr(): vp %p, vap %p, cred %p, p %p\n", ap->a_vp, vap, cred, ap->a_p); #endif /* * Check for unsettable attributes. 
*/ if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) || (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) || (vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) { #ifdef MSDOSFS_DEBUG printf("msdosfs_setattr(): returning EINVAL\n"); printf(" va_type %d, va_nlink %x, va_fsid %lx, va_fileid %lx\n", vap->va_type, vap->va_nlink, vap->va_fsid, vap->va_fileid); printf(" va_blocksize %lx, va_rdev %x, va_bytes %qx, va_gen %lx\n", vap->va_blocksize, vap->va_rdev, vap->va_bytes, vap->va_gen); printf(" va_uid %x, va_gid %x\n", vap->va_uid, vap->va_gid); #endif return (EINVAL); } if (vap->va_flags != VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if (cred->cr_uid != pmp->pm_uid && (error = suser_xxx(cred, ap->a_p, PRISON_ROOT))) return (error); /* * We are very inconsistent about handling unsupported * attributes. We ignored the access time and the * read and execute bits. We were strict for the other * attributes. * * Here we are strict, stricter than ufs in not allowing * users to attempt to set SF_SETTABLE bits or anyone to * set unsupported bits. However, we ignore attempts to * set ATTR_ARCHIVE for directories `cp -pr' from a more * sensible file system attempts it a lot. */ if (cred->cr_uid != 0) { if (vap->va_flags & SF_SETTABLE) return EPERM; } if (vap->va_flags & ~SF_ARCHIVED) return EOPNOTSUPP; if (vap->va_flags & SF_ARCHIVED) dep->de_Attributes &= ~ATTR_ARCHIVE; else if (!(dep->de_Attributes & ATTR_DIRECTORY)) dep->de_Attributes |= ATTR_ARCHIVE; dep->de_flag |= DE_MODIFIED; } if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) { uid_t uid; gid_t gid; if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); uid = vap->va_uid; if (uid == (uid_t)VNOVAL) uid = pmp->pm_uid; gid = vap->va_gid; if (gid == (gid_t)VNOVAL) gid = pmp->pm_gid; if ((cred->cr_uid != pmp->pm_uid || uid != pmp->pm_uid || (gid != pmp->pm_gid && !groupmember(gid, cred))) && (error = suser_xxx(cred, ap->a_p, PRISON_ROOT))) return error; if (uid != pmp->pm_uid || gid != pmp->pm_gid) return EINVAL; } if (vap->va_size != VNOVAL) { /* * Disallow write attempts on read-only file systems; * unless the file is a socket, fifo, or a block or * character device resident on the file system. */ switch (vp->v_type) { case VDIR: return (EISDIR); /* NOT REACHED */ case VLNK: case VREG: if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); break; default: break; } error = detrunc(dep, vap->va_size, 0, cred, ap->a_p); if (error) return error; } if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if (cred->cr_uid != pmp->pm_uid && (error = suser_xxx(cred, ap->a_p, PRISON_ROOT)) && ((vap->va_vaflags & VA_UTIMES_NULL) == 0 || (error = VOP_ACCESS(ap->a_vp, VWRITE, cred, ap->a_p)))) return (error); if (vp->v_type != VDIR) { if ((pmp->pm_flags & MSDOSFSMNT_NOWIN95) == 0 && vap->va_atime.tv_sec != VNOVAL) unix2dostime(&vap->va_atime, &dep->de_ADate, NULL, NULL); if (vap->va_mtime.tv_sec != VNOVAL) unix2dostime(&vap->va_mtime, &dep->de_MDate, &dep->de_MTime, NULL); dep->de_Attributes |= ATTR_ARCHIVE; dep->de_flag |= DE_MODIFIED; } } /* * DOS files only have the ability to have their writability * attribute set, so we use the owner write bit to set the readonly * attribute. 
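 * A chmod therefore boils down to a single bit, roughly
 *
 *	if (vap->va_mode & VWRITE)
 *		dep->de_Attributes &= ~ATTR_READONLY;
 *	else
 *		dep->de_Attributes |= ATTR_READONLY;
 *
 * which is the counterpart of msdosfs_getattr() reporting the write
 * bits (masked by pm_mask) whenever ATTR_READONLY is clear.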
*/ if (vap->va_mode != (mode_t)VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if (cred->cr_uid != pmp->pm_uid && (error = suser_xxx(cred, ap->a_p, PRISON_ROOT))) return (error); if (vp->v_type != VDIR) { /* We ignore the read and execute bits. */ if (vap->va_mode & VWRITE) dep->de_Attributes &= ~ATTR_READONLY; else dep->de_Attributes |= ATTR_READONLY; dep->de_flag |= DE_MODIFIED; } } return (deupdat(dep, 1)); } static int msdosfs_read(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { int error = 0; int diff; int blsize; int isadir; int orig_resid; long n; long on; daddr_t lbn; daddr_t rablock; int rasize; struct buf *bp; struct vnode *vp = ap->a_vp; struct denode *dep = VTODE(vp); struct msdosfsmount *pmp = dep->de_pmp; struct uio *uio = ap->a_uio; if (uio->uio_offset < 0) return (EINVAL); /* * If they didn't ask for any data, then we are done. */ orig_resid = uio->uio_resid; if (orig_resid <= 0) return (0); isadir = dep->de_Attributes & ATTR_DIRECTORY; do { lbn = de_cluster(pmp, uio->uio_offset); on = uio->uio_offset & pmp->pm_crbomask; n = min((u_long) (pmp->pm_bpcluster - on), uio->uio_resid); diff = dep->de_FileSize - uio->uio_offset; if (diff <= 0) break; if (diff < n) n = diff; /* convert cluster # to block # if a directory */ if (isadir) { error = pcbmap(dep, lbn, &lbn, 0, &blsize); if (error) break; } /* * If we are operating on a directory file then be sure to * do i/o with the vnode for the filesystem instead of the * vnode for the directory. */ if (isadir) { error = bread(pmp->pm_devvp, lbn, blsize, NOCRED, &bp); } else { rablock = lbn + 1; if (vp->v_lastr + 1 == lbn && de_cn2off(pmp, rablock) < dep->de_FileSize) { rasize = pmp->pm_bpcluster; error = breadn(vp, lbn, pmp->pm_bpcluster, &rablock, &rasize, 1, NOCRED, &bp); } else error = bread(vp, lbn, pmp->pm_bpcluster, NOCRED, &bp); vp->v_lastr = lbn; } n = min(n, pmp->pm_bpcluster - bp->b_resid); if (error) { brelse(bp); break; } error = uiomove(bp->b_data + on, (int) n, uio); brelse(bp); } while (error == 0 && uio->uio_resid > 0 && n != 0); if (!isadir && (error == 0 || uio->uio_resid != orig_resid) && (vp->v_mount->mnt_flag & MNT_NOATIME) == 0) dep->de_flag |= DE_ACCESS; return (error); } /* * Write data to a file or directory. */ static int msdosfs_write(ap) struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { int n; int croffset; int resid; u_long osize; int error = 0; u_long count; daddr_t bn, lastcn; struct buf *bp; int ioflag = ap->a_ioflag; struct uio *uio = ap->a_uio; struct proc *p = uio->uio_procp; struct vnode *vp = ap->a_vp; struct vnode *thisvp; struct denode *dep = VTODE(vp); struct msdosfsmount *pmp = dep->de_pmp; struct ucred *cred = ap->a_cred; #ifdef MSDOSFS_DEBUG printf("msdosfs_write(vp %p, uio %p, ioflag %x, cred %p\n", vp, uio, ioflag, cred); printf("msdosfs_write(): diroff %lu, dirclust %lu, startcluster %lu\n", dep->de_diroffset, dep->de_dirclust, dep->de_StartCluster); #endif switch (vp->v_type) { case VREG: if (ioflag & IO_APPEND) uio->uio_offset = dep->de_FileSize; thisvp = vp; break; case VDIR: return EISDIR; default: panic("msdosfs_write(): bad file type"); } if (uio->uio_offset < 0) return (EINVAL); if (uio->uio_resid == 0) return (0); /* * If they've exceeded their filesize limit, tell them about it. 
*/ if (p && ((uio->uio_offset + uio->uio_resid) > p->p_rlimit[RLIMIT_FSIZE].rlim_cur)) { psignal(p, SIGXFSZ); return (EFBIG); } /* * If the offset we are starting the write at is beyond the end of * the file, then they've done a seek. Unix filesystems allow * files with holes in them, DOS doesn't so we must fill the hole * with zeroed blocks. */ if (uio->uio_offset > dep->de_FileSize) { error = deextend(dep, uio->uio_offset, cred); if (error) return (error); } /* * Remember some values in case the write fails. */ resid = uio->uio_resid; osize = dep->de_FileSize; /* * If we write beyond the end of the file, extend it to its ultimate * size ahead of the time to hopefully get a contiguous area. */ if (uio->uio_offset + resid > osize) { count = de_clcount(pmp, uio->uio_offset + resid) - de_clcount(pmp, osize); error = extendfile(dep, count, NULL, NULL, 0); if (error && (error != ENOSPC || (ioflag & IO_UNIT))) goto errexit; lastcn = dep->de_fc[FC_LASTFC].fc_frcn; } else lastcn = de_clcount(pmp, osize) - 1; do { if (de_cluster(pmp, uio->uio_offset) > lastcn) { error = ENOSPC; break; } croffset = uio->uio_offset & pmp->pm_crbomask; n = min(uio->uio_resid, pmp->pm_bpcluster - croffset); if (uio->uio_offset + n > dep->de_FileSize) { dep->de_FileSize = uio->uio_offset + n; /* The object size needs to be set before buffer is allocated */ vnode_pager_setsize(vp, dep->de_FileSize); } bn = de_cluster(pmp, uio->uio_offset); if ((uio->uio_offset & pmp->pm_crbomask) == 0 && (de_cluster(pmp, uio->uio_offset + uio->uio_resid) > de_cluster(pmp, uio->uio_offset) || uio->uio_offset + uio->uio_resid >= dep->de_FileSize)) { /* * If either the whole cluster gets written, * or we write the cluster from its start beyond EOF, * then no need to read data from disk. */ bp = getblk(thisvp, bn, pmp->pm_bpcluster, 0, 0); clrbuf(bp); /* * Do the bmap now, since pcbmap needs buffers * for the fat table. (see msdosfs_strategy) */ if (bp->b_blkno == bp->b_lblkno) { error = pcbmap(dep, bp->b_lblkno, &bp->b_blkno, 0, 0); if (error) bp->b_blkno = -1; } if (bp->b_blkno == -1) { brelse(bp); if (!error) error = EIO; /* XXX */ break; } } else { /* * The block we need to write into exists, so read it in. */ error = bread(thisvp, bn, pmp->pm_bpcluster, cred, &bp); if (error) { brelse(bp); break; } } /* * Should these vnode_pager_* functions be done on dir * files? */ /* * Copy the data from user space into the buf header. */ error = uiomove(bp->b_data + croffset, n, uio); /* * If they want this synchronous then write it and wait for * it. Otherwise, if on a cluster boundary write it * asynchronously so we can move on to the next block * without delay. Otherwise do a delayed write because we * may want to write somemore into the block later. */ if (ioflag & IO_SYNC) (void) bwrite(bp); else if (n + croffset == pmp->pm_bpcluster) bawrite(bp); else bdwrite(bp); dep->de_flag |= DE_UPDATE; } while (error == 0 && uio->uio_resid > 0); /* * If the write failed and they want us to, truncate the file back * to the size it was before the write was attempted. */ errexit: if (error) { if (ioflag & IO_UNIT) { detrunc(dep, osize, ioflag & IO_SYNC, NOCRED, NULL); uio->uio_offset -= resid - uio->uio_resid; uio->uio_resid = resid; } else { detrunc(dep, dep->de_FileSize, ioflag & IO_SYNC, NOCRED, NULL); if (uio->uio_resid != resid) error = 0; } } else if (ioflag & IO_SYNC) error = deupdat(dep, 1); return (error); } /* * Flush the blocks of a file to disk. * * This function is worthless for vnodes that represent directories. 
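 * (That is because directory contents are buffered under the
 * filesystem's device vnode, not under the directory's own vnode --
 * see the general notes near the top of this file -- so the
 * v_dirtyblkhd walk below has nothing to flush for a directory.)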
Maybe we * could just do a sync if they try an fsync on a directory file. */ static int msdosfs_fsync(ap) struct vop_fsync_args /* { struct vnode *a_vp; struct ucred *a_cred; int a_waitfor; struct proc *a_p; } */ *ap; { struct vnode *vp = ap->a_vp; int s; struct buf *bp, *nbp; /* * Flush all dirty buffers associated with a vnode. */ loop: s = splbio(); for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { nbp = TAILQ_NEXT(bp, b_vnbufs); if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) continue; if ((bp->b_flags & B_DELWRI) == 0) panic("msdosfs_fsync: not dirty"); bremfree(bp); splx(s); (void) bwrite(bp); goto loop; } while (vp->v_numoutput) { vp->v_flag |= VBWAIT; (void) tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "msdosfsn", 0); } #ifdef DIAGNOSTIC if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) { vprint("msdosfs_fsync: dirty", vp); goto loop; } #endif splx(s); return (deupdat(VTODE(vp), ap->a_waitfor == MNT_WAIT)); } static int msdosfs_remove(ap) struct vop_remove_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { struct denode *dep = VTODE(ap->a_vp); struct denode *ddep = VTODE(ap->a_dvp); int error; if (ap->a_vp->v_type == VDIR) error = EPERM; else error = removede(ddep, dep); #ifdef MSDOSFS_DEBUG printf("msdosfs_remove(), dep %p, v_usecount %d\n", dep, ap->a_vp->v_usecount); #endif return (error); } /* * DOS filesystems don't know what links are. But since we already called * msdosfs_lookup() with create and lockparent, the parent is locked so we * have to free it before we return the error. */ static int msdosfs_link(ap) struct vop_link_args /* { struct vnode *a_tdvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { VOP_ABORTOP(ap->a_tdvp, ap->a_cnp); return (EOPNOTSUPP); } /* * Renames on files require moving the denode to a new hash queue since the * denode's location is used to compute which hash queue to put the file * in. Unless it is a rename in place. For example "mv a b". * * What follows is the basic algorithm: * * if (file move) { * if (dest file exists) { * remove dest file * } * if (dest and src in same directory) { * rewrite name in existing directory slot * } else { * write new entry in dest directory * update offset and dirclust in denode * move denode to new hash chain * clear old directory entry * } * } else { * directory move * if (dest directory exists) { * if (dest is not empty) { * return ENOTEMPTY * } * remove dest directory * } * if (dest and src in same directory) { * rewrite name in existing entry * } else { * be sure dest is not a child of src directory * write entry in dest directory * update "." and ".." in moved directory * clear old directory entry for moved directory * } * } * * On entry: * source's parent directory is unlocked * source file or directory is unlocked * destination's parent directory is locked * destination file or directory is locked if it exists * * On exit: * all denodes should be released * * Notes: * I'm not sure how the memory containing the pathnames pointed at by the * componentname structures is freed, there may be some memory bleeding * for each rename done. 
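 *
 * Note also that a rename can change a plain file's fileid, because
 * msdosfs_getattr() synthesizes fileids from the directory entry's
 * location (de_dirclust/de_diroffset), both of which are rewritten
 * below.  Directories keep a stable fileid across a rename since
 * theirs is derived from de_StartCluster instead.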
*/ static int msdosfs_rename(ap) struct vop_rename_args /* { struct vnode *a_fdvp; struct vnode *a_fvp; struct componentname *a_fcnp; struct vnode *a_tdvp; struct vnode *a_tvp; struct componentname *a_tcnp; } */ *ap; { struct vnode *tdvp = ap->a_tdvp; struct vnode *fvp = ap->a_fvp; struct vnode *fdvp = ap->a_fdvp; struct vnode *tvp = ap->a_tvp; struct componentname *tcnp = ap->a_tcnp; struct componentname *fcnp = ap->a_fcnp; struct proc *p = fcnp->cn_proc; struct denode *ip, *xp, *dp, *zp; u_char toname[11], oldname[11]; u_long from_diroffset, to_diroffset; u_char to_count; int doingdirectory = 0, newparent = 0; int error; u_long cn; daddr_t bn; struct denode *fddep; /* from file's parent directory */ struct denode *fdep; /* from file or directory */ struct denode *tddep; /* to file's parent directory */ struct denode *tdep; /* to file or directory */ struct msdosfsmount *pmp; struct direntry *dotdotp; struct buf *bp; fddep = VTODE(ap->a_fdvp); fdep = VTODE(ap->a_fvp); tddep = VTODE(ap->a_tdvp); tdep = tvp ? VTODE(tvp) : NULL; pmp = fddep->de_pmp; pmp = VFSTOMSDOSFS(fdvp->v_mount); #ifdef DIAGNOSTIC if ((tcnp->cn_flags & HASBUF) == 0 || (fcnp->cn_flags & HASBUF) == 0) panic("msdosfs_rename: no name"); #endif /* * Check for cross-device rename. */ if ((fvp->v_mount != tdvp->v_mount) || (tvp && (fvp->v_mount != tvp->v_mount))) { error = EXDEV; abortit: VOP_ABORTOP(tdvp, tcnp); if (tdvp == tvp) vrele(tdvp); else vput(tdvp); if (tvp) vput(tvp); VOP_ABORTOP(fdvp, fcnp); vrele(fdvp); vrele(fvp); return (error); } /* * If source and dest are the same, do nothing. */ if (tvp == fvp) { error = 0; goto abortit; } error = vn_lock(fvp, LK_EXCLUSIVE, p); if (error) goto abortit; dp = VTODE(fdvp); ip = VTODE(fvp); /* * Be sure we are not renaming ".", "..", or an alias of ".". This * leads to a crippled directory tree. It's pretty tough to do a * "ls" or "pwd" with the "." directory entry missing, and "cd .." * doesn't work if the ".." entry is missing. */ if (ip->de_Attributes & ATTR_DIRECTORY) { /* * Avoid ".", "..", and aliases of "." for obvious reasons. */ if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') || dp == ip || (fcnp->cn_flags & ISDOTDOT) || (tcnp->cn_flags & ISDOTDOT) || (ip->de_flag & DE_RENAME)) { VOP_UNLOCK(fvp, 0, p); error = EINVAL; goto abortit; } ip->de_flag |= DE_RENAME; doingdirectory++; } /* * When the target exists, both the directory * and target vnodes are returned locked. */ dp = VTODE(tdvp); xp = tvp ? VTODE(tvp) : NULL; /* * Remember direntry place to use for destination */ to_diroffset = dp->de_fndoffset; to_count = dp->de_fndcnt; /* * If ".." must be changed (ie the directory gets a new * parent) then the source directory must not be in the * directory heirarchy above the target, as this would * orphan everything below the source directory. Also * the user must have write permission in the source so * as to be able to change "..". We must repeat the call * to namei, as the parent directory is unlocked by the * call to doscheckpath(). 
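 *
 * In outline, doscheckpath() climbs from the destination directory
 * toward the root by following each directory's on-disk ".." entry
 * and fails if it ever lands on the source directory, which is what
 * enforces the "not above the target" rule described here.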
*/ error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_proc); VOP_UNLOCK(fvp, 0, p); if (VTODE(fdvp)->de_StartCluster != VTODE(tdvp)->de_StartCluster) newparent = 1; vrele(fdvp); if (doingdirectory && newparent) { if (error) /* write access check above */ goto bad; if (xp != NULL) vput(tvp); /* * doscheckpath() vput()'s dp, * so we have to do a relookup afterwards */ error = doscheckpath(ip, dp); if (error) goto out; if ((tcnp->cn_flags & SAVESTART) == 0) panic("msdosfs_rename: lost to startdir"); error = relookup(tdvp, &tvp, tcnp); if (error) goto out; dp = VTODE(tdvp); xp = tvp ? VTODE(tvp) : NULL; } if (xp != NULL) { /* * Target must be empty if a directory and have no links * to it. Also, ensure source and target are compatible * (both directories, or both not directories). */ if (xp->de_Attributes & ATTR_DIRECTORY) { if (!dosdirempty(xp)) { error = ENOTEMPTY; goto bad; } if (!doingdirectory) { error = ENOTDIR; goto bad; } cache_purge(tdvp); } else if (doingdirectory) { error = EISDIR; goto bad; } error = removede(dp, xp); if (error) goto bad; vput(tvp); xp = NULL; } /* * Convert the filename in tcnp into a dos filename. We copy this * into the denode and directory entry for the destination * file/directory. */ error = uniqdosname(VTODE(tdvp), tcnp, toname); if (error) goto abortit; /* * Since from wasn't locked at various places above, * have to do a relookup here. */ fcnp->cn_flags &= ~MODMASK; fcnp->cn_flags |= LOCKPARENT | LOCKLEAF; if ((fcnp->cn_flags & SAVESTART) == 0) panic("msdosfs_rename: lost from startdir"); if (!newparent) VOP_UNLOCK(tdvp, 0, p); (void) relookup(fdvp, &fvp, fcnp); if (fvp == NULL) { /* * From name has disappeared. */ if (doingdirectory) panic("rename: lost dir entry"); vrele(ap->a_fvp); if (newparent) VOP_UNLOCK(tdvp, 0, p); vrele(tdvp); return 0; } xp = VTODE(fvp); zp = VTODE(fdvp); from_diroffset = zp->de_fndoffset; /* * Ensure that the directory entry still exists and has not * changed till now. If the source is a file the entry may * have been unlinked or renamed. In either case there is * no further work to be done. If the source is a directory * then it cannot have been rmdir'ed or renamed; this is * prohibited by the DE_RENAME flag. */ if (xp != ip) { if (doingdirectory) panic("rename: lost dir entry"); vrele(ap->a_fvp); VOP_UNLOCK(fvp, 0, p); if (newparent) VOP_UNLOCK(fdvp, 0, p); xp = NULL; } else { vrele(fvp); xp = NULL; /* * First write a new entry in the destination * directory and mark the entry in the source directory * as deleted. Then move the denode to the correct hash * chain for its new location in the filesystem. And, if * we moved a directory, then update its .. entry to point * to the new parent directory. 
*/ bcopy(ip->de_Name, oldname, 11); bcopy(toname, ip->de_Name, 11); /* update denode */ dp->de_fndoffset = to_diroffset; dp->de_fndcnt = to_count; error = createde(ip, dp, (struct denode **)0, tcnp); if (error) { bcopy(oldname, ip->de_Name, 11); if (newparent) VOP_UNLOCK(fdvp, 0, p); VOP_UNLOCK(fvp, 0, p); goto bad; } ip->de_refcnt++; zp->de_fndoffset = from_diroffset; error = removede(zp, ip); if (error) { /* XXX should really panic here, fs is corrupt */ if (newparent) VOP_UNLOCK(fdvp, 0, p); VOP_UNLOCK(fvp, 0, p); goto bad; } if (!doingdirectory) { error = pcbmap(dp, de_cluster(pmp, to_diroffset), 0, &ip->de_dirclust, 0); if (error) { /* XXX should really panic here, fs is corrupt */ if (newparent) VOP_UNLOCK(fdvp, 0, p); VOP_UNLOCK(fvp, 0, p); goto bad; } if (ip->de_dirclust == MSDOSFSROOT) ip->de_diroffset = to_diroffset; else ip->de_diroffset = to_diroffset & pmp->pm_crbomask; } reinsert(ip); if (newparent) VOP_UNLOCK(fdvp, 0, p); } /* * If we moved a directory to a new parent directory, then we must * fixup the ".." entry in the moved directory. */ if (doingdirectory && newparent) { cn = ip->de_StartCluster; if (cn == MSDOSFSROOT) { /* this should never happen */ panic("msdosfs_rename(): updating .. in root directory?"); } else bn = cntobn(pmp, cn); error = bread(pmp->pm_devvp, bn, pmp->pm_bpcluster, NOCRED, &bp); if (error) { /* XXX should really panic here, fs is corrupt */ brelse(bp); VOP_UNLOCK(fvp, 0, p); goto bad; } dotdotp = (struct direntry *)bp->b_data + 1; putushort(dotdotp->deStartCluster, dp->de_StartCluster); if (FAT32(pmp)) putushort(dotdotp->deHighClust, dp->de_StartCluster >> 16); error = bwrite(bp); if (error) { /* XXX should really panic here, fs is corrupt */ VOP_UNLOCK(fvp, 0, p); goto bad; } } VOP_UNLOCK(fvp, 0, p); bad: if (xp) vput(tvp); vput(tdvp); out: ip->de_flag &= ~DE_RENAME; vrele(fdvp); vrele(fvp); return (error); } static struct { struct direntry dot; struct direntry dotdot; } dosdirtemplate = { { ". ", " ", /* the . entry */ ATTR_DIRECTORY, /* file attribute */ 0, /* reserved */ 0, { 0, 0 }, { 0, 0 }, /* create time & date */ { 0, 0 }, /* access date */ { 0, 0 }, /* high bits of start cluster */ { 210, 4 }, { 210, 4 }, /* modify time & date */ { 0, 0 }, /* startcluster */ { 0, 0, 0, 0 } /* filesize */ }, { ".. ", " ", /* the .. entry */ ATTR_DIRECTORY, /* file attribute */ 0, /* reserved */ 0, { 0, 0 }, { 0, 0 }, /* create time & date */ { 0, 0 }, /* access date */ { 0, 0 }, /* high bits of start cluster */ { 210, 4 }, { 210, 4 }, /* modify time & date */ { 0, 0 }, /* startcluster */ { 0, 0, 0, 0 } /* filesize */ } }; static int msdosfs_mkdir(ap) struct vop_mkdir_args /* { struct vnode *a_dvp; struvt vnode **a_vpp; struvt componentname *a_cnp; struct vattr *a_vap; } */ *ap; { struct componentname *cnp = ap->a_cnp; struct denode *dep; struct denode *pdep = VTODE(ap->a_dvp); struct direntry *denp; struct msdosfsmount *pmp = pdep->de_pmp; struct buf *bp; u_long newcluster, pcl; int bn; int error; struct denode ndirent; struct timespec ts; /* * If this is the root directory and there is no space left we * can't do anything. This is because the root directory can not * change size. */ if (pdep->de_StartCluster == MSDOSFSROOT && pdep->de_fndoffset >= pdep->de_FileSize) { error = ENOSPC; goto bad2; } /* * Allocate a cluster to hold the about to be created directory. 
*/ error = clusteralloc(pmp, 0, 1, CLUST_EOFE, &newcluster, NULL); if (error) goto bad2; bzero(&ndirent, sizeof(ndirent)); ndirent.de_pmp = pmp; ndirent.de_flag = DE_ACCESS | DE_CREATE | DE_UPDATE; getnanotime(&ts); DETIMES(&ndirent, &ts, &ts, &ts); /* * Now fill the cluster with the "." and ".." entries. And write * the cluster to disk. This way it is there for the parent * directory to be pointing at if there were a crash. */ bn = cntobn(pmp, newcluster); /* always succeeds */ bp = getblk(pmp->pm_devvp, bn, pmp->pm_bpcluster, 0, 0); bzero(bp->b_data, pmp->pm_bpcluster); bcopy(&dosdirtemplate, bp->b_data, sizeof dosdirtemplate); denp = (struct direntry *)bp->b_data; putushort(denp[0].deStartCluster, newcluster); putushort(denp[0].deCDate, ndirent.de_CDate); putushort(denp[0].deCTime, ndirent.de_CTime); denp[0].deCHundredth = ndirent.de_CHun; putushort(denp[0].deADate, ndirent.de_ADate); putushort(denp[0].deMDate, ndirent.de_MDate); putushort(denp[0].deMTime, ndirent.de_MTime); pcl = pdep->de_StartCluster; if (FAT32(pmp) && pcl == pmp->pm_rootdirblk) pcl = 0; putushort(denp[1].deStartCluster, pcl); putushort(denp[1].deCDate, ndirent.de_CDate); putushort(denp[1].deCTime, ndirent.de_CTime); denp[1].deCHundredth = ndirent.de_CHun; putushort(denp[1].deADate, ndirent.de_ADate); putushort(denp[1].deMDate, ndirent.de_MDate); putushort(denp[1].deMTime, ndirent.de_MTime); if (FAT32(pmp)) { putushort(denp[0].deHighClust, newcluster >> 16); putushort(denp[1].deHighClust, pdep->de_StartCluster >> 16); } error = bwrite(bp); if (error) goto bad; /* * Now build up a directory entry pointing to the newly allocated * cluster. This will be written to an empty slot in the parent * directory. */ #ifdef DIAGNOSTIC if ((cnp->cn_flags & HASBUF) == 0) panic("msdosfs_mkdir: no name"); #endif error = uniqdosname(pdep, cnp, ndirent.de_Name); if (error) goto bad; ndirent.de_Attributes = ATTR_DIRECTORY; ndirent.de_LowerCase = 0; ndirent.de_StartCluster = newcluster; ndirent.de_FileSize = 0; ndirent.de_dev = pdep->de_dev; ndirent.de_devvp = pdep->de_devvp; error = createde(&ndirent, pdep, &dep, cnp); if (error) goto bad; if ((cnp->cn_flags & SAVESTART) == 0) zfree(namei_zone, cnp->cn_pnbuf); *ap->a_vpp = DETOV(dep); return (0); bad: clusterfree(pmp, newcluster, NULL); bad2: zfree(namei_zone, cnp->cn_pnbuf); return (error); } static int msdosfs_rmdir(ap) struct vop_rmdir_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct vnode *dvp = ap->a_dvp; register struct componentname *cnp = ap->a_cnp; register struct denode *ip, *dp; struct proc *p = cnp->cn_proc; int error; ip = VTODE(vp); dp = VTODE(dvp); /* * Verify the directory is empty (and valid). * (Rmdir ".." won't be valid since * ".." will contain a reference to * the current directory and thus be * non-empty.) */ error = 0; if (!dosdirempty(ip) || ip->de_flag & DE_RENAME) { error = ENOTEMPTY; goto out; } /* * Delete the entry from the directory. For dos filesystems this * gets rid of the directory entry on disk, the in memory copy * still exists but the de_refcnt is <= 0. This prevents it from * being found by deget(). When the vput() on dep is done we give * up access and eventually msdosfs_reclaim() will be called which * will remove it from the denode cache. */ error = removede(dp, ip); if (error) goto out; /* * This is where we decrement the link count in the parent * directory. Since dos filesystems don't do this we just purge * the name cache. 
*/ cache_purge(dvp); VOP_UNLOCK(dvp, 0, p); /* * Truncate the directory that is being deleted. */ error = detrunc(ip, (u_long)0, IO_SYNC, cnp->cn_cred, p); cache_purge(vp); vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, p); out: return (error); } /* * DOS filesystems don't know what symlinks are. */ static int msdosfs_symlink(ap) struct vop_symlink_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; char *a_target; } */ *ap; { zfree(namei_zone, ap->a_cnp->cn_pnbuf); /* VOP_ABORTOP(ap->a_dvp, ap->a_cnp); ??? */ return (EOPNOTSUPP); } static int msdosfs_readdir(ap) struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; int *a_ncookies; u_long **a_cookies; } */ *ap; { int error = 0; int diff; long n; int blsize; long on; u_long cn; u_long fileno; u_long dirsperblk; long bias = 0; daddr_t bn, lbn; struct buf *bp; struct denode *dep = VTODE(ap->a_vp); struct msdosfsmount *pmp = dep->de_pmp; struct direntry *dentp; struct dirent dirbuf; struct uio *uio = ap->a_uio; u_long *cookies = NULL; int ncookies = 0; off_t offset, off; int chksum = -1; #ifdef MSDOSFS_DEBUG printf("msdosfs_readdir(): vp %p, uio %p, cred %p, eofflagp %p\n", ap->a_vp, uio, ap->a_cred, ap->a_eofflag); #endif /* * msdosfs_readdir() won't operate properly on regular files since * it does i/o only with the the filesystem vnode, and hence can * retrieve the wrong block from the buffer cache for a plain file. * So, fail attempts to readdir() on a plain file. */ if ((dep->de_Attributes & ATTR_DIRECTORY) == 0) return (ENOTDIR); /* * To be safe, initialize dirbuf */ bzero(dirbuf.d_name, sizeof(dirbuf.d_name)); /* * If the user buffer is smaller than the size of one dos directory * entry or the file offset is not a multiple of the size of a * directory entry, then we fail the read. */ off = offset = uio->uio_offset; if (uio->uio_resid < sizeof(struct direntry) || (offset & (sizeof(struct direntry) - 1))) return (EINVAL); if (ap->a_ncookies) { ncookies = uio->uio_resid / 16; MALLOC(cookies, u_long *, ncookies * sizeof(u_long), M_TEMP, M_WAITOK); *ap->a_cookies = cookies; *ap->a_ncookies = ncookies; } dirsperblk = pmp->pm_BytesPerSec / sizeof(struct direntry); /* * If they are reading from the root directory then, we simulate * the . and .. entries since these don't exist in the root * directory. We also set the offset bias to make up for having to * simulate these entries. By this I mean that at file offset 64 we * read the first entry in the root directory that lives on disk. */ if (dep->de_StartCluster == MSDOSFSROOT || (FAT32(pmp) && dep->de_StartCluster == pmp->pm_rootdirblk)) { #if 0 printf("msdosfs_readdir(): going after . or .. 
in root dir, offset %d\n", offset); #endif bias = 2 * sizeof(struct direntry); if (offset < bias) { for (n = (int)offset / sizeof(struct direntry); n < 2; n++) { if (FAT32(pmp)) dirbuf.d_fileno = cntobn(pmp, pmp->pm_rootdirblk) * dirsperblk; else dirbuf.d_fileno = 1; dirbuf.d_type = DT_DIR; switch (n) { case 0: dirbuf.d_namlen = 1; strcpy(dirbuf.d_name, "."); break; case 1: dirbuf.d_namlen = 2; strcpy(dirbuf.d_name, ".."); break; } dirbuf.d_reclen = GENERIC_DIRSIZ(&dirbuf); if (uio->uio_resid < dirbuf.d_reclen) goto out; error = uiomove((caddr_t) &dirbuf, dirbuf.d_reclen, uio); if (error) goto out; offset += sizeof(struct direntry); off = offset; if (cookies) { *cookies++ = offset; if (--ncookies <= 0) goto out; } } } } off = offset; while (uio->uio_resid > 0) { lbn = de_cluster(pmp, offset - bias); on = (offset - bias) & pmp->pm_crbomask; n = min(pmp->pm_bpcluster - on, uio->uio_resid); diff = dep->de_FileSize - (offset - bias); if (diff <= 0) break; n = min(n, diff); error = pcbmap(dep, lbn, &bn, &cn, &blsize); if (error) break; error = bread(pmp->pm_devvp, bn, blsize, NOCRED, &bp); if (error) { brelse(bp); return (error); } n = min(n, blsize - bp->b_resid); /* * Convert from dos directory entries to fs-independent * directory entries. */ for (dentp = (struct direntry *)(bp->b_data + on); (char *)dentp < bp->b_data + on + n; dentp++, offset += sizeof(struct direntry)) { #if 0 printf("rd: dentp %08x prev %08x crnt %08x deName %02x attr %02x\n", dentp, prev, crnt, dentp->deName[0], dentp->deAttributes); #endif /* * If this is an unused entry, we can stop. */ if (dentp->deName[0] == SLOT_EMPTY) { brelse(bp); goto out; } /* * Skip deleted entries. */ if (dentp->deName[0] == SLOT_DELETED) { chksum = -1; continue; } /* * Handle Win95 long directory entries */ if (dentp->deAttributes == ATTR_WIN95) { if (pmp->pm_flags & MSDOSFSMNT_SHORTNAME) continue; chksum = win2unixfn((struct winentry *)dentp, &dirbuf, chksum, pmp->pm_flags & MSDOSFSMNT_U2WTABLE, pmp->pm_u2w); continue; } /* * Skip volume labels */ if (dentp->deAttributes & ATTR_VOLUME) { chksum = -1; continue; } /* * This computation of d_fileno must match * the computation of va_fileid in * msdosfs_getattr. */ if (dentp->deAttributes & ATTR_DIRECTORY) { fileno = getushort(dentp->deStartCluster); if (FAT32(pmp)) fileno |= getushort(dentp->deHighClust) << 16; /* if this is the root directory */ if (fileno == MSDOSFSROOT) if (FAT32(pmp)) fileno = cntobn(pmp, pmp->pm_rootdirblk) * dirsperblk; else fileno = 1; else fileno = cntobn(pmp, fileno) * dirsperblk; dirbuf.d_fileno = fileno; dirbuf.d_type = DT_DIR; } else { dirbuf.d_fileno = offset / sizeof(struct direntry); dirbuf.d_type = DT_REG; } if (chksum != winChksum(dentp->deName)) dirbuf.d_namlen = dos2unixfn(dentp->deName, (u_char *)dirbuf.d_name, dentp->deLowerCase | ((pmp->pm_flags & MSDOSFSMNT_SHORTNAME) ? 
(LCASE_BASE | LCASE_EXT) : 0), pmp->pm_flags & MSDOSFSMNT_U2WTABLE, pmp->pm_d2u, pmp->pm_flags & MSDOSFSMNT_ULTABLE, pmp->pm_ul); else dirbuf.d_name[dirbuf.d_namlen] = 0; chksum = -1; dirbuf.d_reclen = GENERIC_DIRSIZ(&dirbuf); if (uio->uio_resid < dirbuf.d_reclen) { brelse(bp); goto out; } error = uiomove((caddr_t) &dirbuf, dirbuf.d_reclen, uio); if (error) { brelse(bp); goto out; } if (cookies) { *cookies++ = offset + sizeof(struct direntry); if (--ncookies <= 0) { brelse(bp); goto out; } } off = offset + sizeof(struct direntry); } brelse(bp); } out: /* Subtract unused cookies */ if (ap->a_ncookies) *ap->a_ncookies -= ncookies; uio->uio_offset = off; /* * Set the eofflag (NFS uses it) */ if (ap->a_eofflag) { if (dep->de_FileSize - (offset - bias) <= 0) *ap->a_eofflag = 1; else *ap->a_eofflag = 0; } return (error); } static int msdosfs_abortop(ap) struct vop_abortop_args /* { struct vnode *a_dvp; struct componentname *a_cnp; } */ *ap; { if ((ap->a_cnp->cn_flags & (HASBUF | SAVESTART)) == HASBUF) zfree(namei_zone, ap->a_cnp->cn_pnbuf); return (0); } /* * vp - address of vnode file the file * bn - which cluster we are interested in mapping to a filesystem block number. * vpp - returns the vnode for the block special file holding the filesystem * containing the file of interest * bnp - address of where to return the filesystem relative block number */ static int msdosfs_bmap(ap) struct vop_bmap_args /* { struct vnode *a_vp; daddr_t a_bn; struct vnode **a_vpp; daddr_t *a_bnp; int *a_runp; int *a_runb; } */ *ap; { struct denode *dep = VTODE(ap->a_vp); if (ap->a_vpp != NULL) *ap->a_vpp = dep->de_devvp; if (ap->a_bnp == NULL) return (0); if (ap->a_runp) { /* * Sequential clusters should be counted here. */ *ap->a_runp = 0; } if (ap->a_runb) { *ap->a_runb = 0; } return (pcbmap(dep, ap->a_bn, ap->a_bnp, 0, 0)); } static int msdosfs_strategy(ap) struct vop_strategy_args /* { struct vnode *a_vp; struct buf *a_bp; } */ *ap; { struct buf *bp = ap->a_bp; struct denode *dep = VTODE(bp->b_vp); struct vnode *vp; int error = 0; if (bp->b_vp->v_type == VBLK || bp->b_vp->v_type == VCHR) panic("msdosfs_strategy: spec"); /* * If we don't already know the filesystem relative block number * then get it using pcbmap(). If pcbmap() returns the block * number as -1 then we've got a hole in the file. DOS filesystems * don't allow files with holes, so we shouldn't ever see this. */ if (bp->b_blkno == bp->b_lblkno) { error = pcbmap(dep, bp->b_lblkno, &bp->b_blkno, 0, 0); if (error) { bp->b_error = error; bp->b_flags |= B_ERROR; biodone(bp); return (error); } if ((long)bp->b_blkno == -1) vfs_bio_clrbuf(bp); } if (bp->b_blkno == -1) { biodone(bp); return (0); } /* * Read/write the block from/to the disk that contains the desired * file block. */ vp = dep->de_devvp; bp->b_dev = vp->v_rdev; VOP_STRATEGY(vp, bp); return (0); } static int msdosfs_print(ap) struct vop_print_args /* { struct vnode *vp; } */ *ap; { struct denode *dep = VTODE(ap->a_vp); printf( "tag VT_MSDOSFS, startcluster %lu, dircluster %lu, diroffset %lu ", dep->de_StartCluster, dep->de_dirclust, dep->de_diroffset); printf(" dev %d, %d", major(dep->de_dev), minor(dep->de_dev)); lockmgr_printinfo(&dep->de_lock); printf("\n"); return (0); } static int msdosfs_pathconf(ap) struct vop_pathconf_args /* { struct vnode *a_vp; int a_name; int *a_retval; } */ *ap; { struct msdosfsmount *pmp = VTODE(ap->a_vp)->de_pmp; switch (ap->a_name) { case _PC_LINK_MAX: *ap->a_retval = 1; return (0); case _PC_NAME_MAX: *ap->a_retval = pmp->pm_flags & MSDOSFSMNT_LONGNAME ? 
WIN_MAXLEN : 12; return (0); case _PC_PATH_MAX: *ap->a_retval = PATH_MAX; return (0); case _PC_CHOWN_RESTRICTED: *ap->a_retval = 1; return (0); case _PC_NO_TRUNC: *ap->a_retval = 0; return (0); default: return (EINVAL); } /* NOTREACHED */ } /* * get page routine * * XXX By default, wimp out... note that a_offset is ignored (and always * XXX has been). */ int msdosfs_getpages(ap) struct vop_getpages_args *ap; { return vnode_pager_generic_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_reqpage); } /* * put page routine * * XXX By default, wimp out... note that a_offset is ignored (and always * XXX has been). */ int msdosfs_putpages(ap) struct vop_putpages_args *ap; { return vnode_pager_generic_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync, ap->a_rtvals); } /* Global vfs data structures for msdosfs */ vop_t **msdosfs_vnodeop_p; static struct vnodeopv_entry_desc msdosfs_vnodeop_entries[] = { { &vop_default_desc, (vop_t *) vop_defaultop }, { &vop_abortop_desc, (vop_t *) msdosfs_abortop }, { &vop_access_desc, (vop_t *) msdosfs_access }, { &vop_bmap_desc, (vop_t *) msdosfs_bmap }, { &vop_cachedlookup_desc, (vop_t *) msdosfs_lookup }, { &vop_close_desc, (vop_t *) msdosfs_close }, { &vop_create_desc, (vop_t *) msdosfs_create }, { &vop_fsync_desc, (vop_t *) msdosfs_fsync }, { &vop_getattr_desc, (vop_t *) msdosfs_getattr }, { &vop_inactive_desc, (vop_t *) msdosfs_inactive }, { &vop_islocked_desc, (vop_t *) vop_stdislocked }, { &vop_link_desc, (vop_t *) msdosfs_link }, { &vop_lock_desc, (vop_t *) vop_stdlock }, { &vop_lookup_desc, (vop_t *) vfs_cache_lookup }, { &vop_mkdir_desc, (vop_t *) msdosfs_mkdir }, { &vop_mknod_desc, (vop_t *) msdosfs_mknod }, { &vop_pathconf_desc, (vop_t *) msdosfs_pathconf }, { &vop_print_desc, (vop_t *) msdosfs_print }, { &vop_read_desc, (vop_t *) msdosfs_read }, { &vop_readdir_desc, (vop_t *) msdosfs_readdir }, { &vop_reclaim_desc, (vop_t *) msdosfs_reclaim }, { &vop_remove_desc, (vop_t *) msdosfs_remove }, { &vop_rename_desc, (vop_t *) msdosfs_rename }, { &vop_rmdir_desc, (vop_t *) msdosfs_rmdir }, { &vop_setattr_desc, (vop_t *) msdosfs_setattr }, { &vop_strategy_desc, (vop_t *) msdosfs_strategy }, { &vop_symlink_desc, (vop_t *) msdosfs_symlink }, { &vop_unlock_desc, (vop_t *) vop_stdunlock }, { &vop_write_desc, (vop_t *) msdosfs_write }, { &vop_getpages_desc, (vop_t *) msdosfs_getpages }, { &vop_putpages_desc, (vop_t *) msdosfs_putpages }, { NULL, NULL } }; static struct vnodeopv_desc msdosfs_vnodeop_opv_desc = { &msdosfs_vnodeop_p, msdosfs_vnodeop_entries }; VNODEOP_SET(msdosfs_vnodeop_opv_desc); Index: head/sys/nfs/nfs_common.c =================================================================== --- head/sys/nfs/nfs_common.c (revision 49534) +++ head/sys/nfs/nfs_common.c (revision 49535) @@ -1,2281 +1,2280 @@ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Rick Macklem at The University of Guelph. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)nfs_subs.c 8.8 (Berkeley) 5/22/95 - * $Id: nfs_subs.c,v 1.78 1999/06/27 11:44:19 peter Exp $ + * $Id: nfs_subs.c,v 1.79 1999/07/17 18:43:47 phk Exp $ */ /* * These functions support the macros and help fiddle mbuf chains for * the nfs op functions. They do things like create the rpc header and * copy data between mbuf chains and uio lists. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include - -#include #include #ifdef ISO #include #endif /* * Data items converted to xdr at startup, since they are constant * This is kinda hokey, but may save a little time doing byte swaps */ u_int32_t nfs_xdrneg1; u_int32_t rpc_call, rpc_vers, rpc_reply, rpc_msgdenied, rpc_autherr, rpc_mismatch, rpc_auth_unix, rpc_msgaccepted, rpc_auth_kerb; u_int32_t nfs_prog, nqnfs_prog, nfs_true, nfs_false; /* And other global data */ static u_int32_t nfs_xid = 0; static enum vtype nv2tov_type[8]= { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VNON, VNON }; enum vtype nv3tov_type[8]= { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO }; int nfs_ticks; int nfs_pbuf_freecnt = -1; /* start out unlimited */ struct nfs_reqq nfs_reqq; struct nfssvc_sockhead nfssvc_sockhead; int nfssvc_sockhead_flag; struct nfsd_head nfsd_head; int nfsd_head_flag; struct nfs_bufq nfs_bufq; struct nqtimerhead nqtimerhead; struct nqfhhashhead *nqfhhashtbl; u_long nqfhhash; static void (*nfs_prev_lease_updatetime) __P((int)); static int nfs_prev_nfssvc_sy_narg; static sy_call_t *nfs_prev_nfssvc_sy_call; #ifndef NFS_NOSERVER static vop_t *nfs_prev_vop_lease_check; static int nfs_prev_getfh_sy_narg; static sy_call_t *nfs_prev_getfh_sy_call; /* * Mapping of old NFS Version 2 RPC numbers to generic numbers. 
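
/*
 * Illustrative aside (not part of this commit): the globals above (rpc_call,
 * rpc_vers, nfs_true, and friends) are converted to XDR form once at startup
 * so the request-building code can store them without repeated byte swaps.
 * XDR encodes 32-bit quantities big-endian, so conceptually this is the same
 * swap htonl() performs.  A minimal user-space sketch of the idea, using
 * hypothetical names rather than the kernel's txdr_unsigned() macro:
 */
#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>

static uint32_t example_rpc_vers;   /* ONC RPC version 2, already in XDR form */
static uint32_t example_nfs_true;   /* XDR boolean TRUE, already in XDR form */

static void
example_xdr_init(void)
{
	example_rpc_vers = htonl(2);
	example_nfs_true = htonl(1);
}

int
main(void)
{
	example_xdr_init();
	/* Later code can copy the pre-swapped constants straight to the wire. */
	printf("wire rpc_vers=0x%08x true=0x%08x\n",
	    (unsigned)example_rpc_vers, (unsigned)example_nfs_true);
	return (0);
}
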
*/ int nfsv3_procid[NFS_NPROCS] = { NFSPROC_NULL, NFSPROC_GETATTR, NFSPROC_SETATTR, NFSPROC_NOOP, NFSPROC_LOOKUP, NFSPROC_READLINK, NFSPROC_READ, NFSPROC_NOOP, NFSPROC_WRITE, NFSPROC_CREATE, NFSPROC_REMOVE, NFSPROC_RENAME, NFSPROC_LINK, NFSPROC_SYMLINK, NFSPROC_MKDIR, NFSPROC_RMDIR, NFSPROC_READDIR, NFSPROC_FSSTAT, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP }; #endif /* NFS_NOSERVER */ /* * and the reverse mapping from generic to Version 2 procedure numbers */ int nfsv2_procid[NFS_NPROCS] = { NFSV2PROC_NULL, NFSV2PROC_GETATTR, NFSV2PROC_SETATTR, NFSV2PROC_LOOKUP, NFSV2PROC_NOOP, NFSV2PROC_READLINK, NFSV2PROC_READ, NFSV2PROC_WRITE, NFSV2PROC_CREATE, NFSV2PROC_MKDIR, NFSV2PROC_SYMLINK, NFSV2PROC_CREATE, NFSV2PROC_REMOVE, NFSV2PROC_RMDIR, NFSV2PROC_RENAME, NFSV2PROC_LINK, NFSV2PROC_READDIR, NFSV2PROC_NOOP, NFSV2PROC_STATFS, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, }; #ifndef NFS_NOSERVER /* * Maps errno values to nfs error numbers. * Use NFSERR_IO as the catch all for ones not specifically defined in * RFC 1094. */ static u_char nfsrv_v2errmap[ELAST] = { NFSERR_PERM, NFSERR_NOENT, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_NXIO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_EXIST, NFSERR_IO, NFSERR_NODEV, NFSERR_NOTDIR, NFSERR_ISDIR, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_FBIG, NFSERR_NOSPC, NFSERR_IO, NFSERR_ROFS, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_NAMETOL, NFSERR_IO, NFSERR_IO, NFSERR_NOTEMPTY, NFSERR_IO, NFSERR_IO, NFSERR_DQUOT, NFSERR_STALE, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO /* << Last is 86 */ }; /* * Maps errno values to nfs error numbers. * Although it is not obvious whether or not NFS clients really care if * a returned error value is in the specified list for the procedure, the * safest thing to do is filter them appropriately. For Version 2, the * X/Open XNFS document is the only specification that defines error values * for each RPC (The RFC simply lists all possible error values for all RPCs), * so I have decided to not do this for Version 2. * The first entry is the default error return and the rest are the valid * errors for that RPC in increasing numeric order. 
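
/*
 * Illustrative aside (not part of this commit): the nfsv3err_* tables that
 * follow put the default reply error first and then the permitted errors in
 * ascending order, terminated by 0.  A user-space sketch of the scan that
 * nfsrv_errmap() performs over such a table (hypothetical names; the raw
 * numbers stand in for NFSERR_* constants):
 */
#include <stdio.h>

static short example_err_table[] = { 5, 5, 13, 20, 70, 10006, 0 };

static int
example_errmap(const short *tbl, int err)
{
	const short *p = tbl;

	/* Skip the default entry, then walk the sorted list. */
	while (*++p) {
		if (*p == err)
			return (err);       /* error is allowed for this procedure */
		if (*p > err)
			break;              /* list is ascending; no match possible */
	}
	return ((int)tbl[0]);               /* fall back to the default error */
}

int
main(void)
{
	printf("%d %d\n", example_errmap(example_err_table, 13),
	    example_errmap(example_err_table, 99));    /* prints: 13 5 */
	return (0);
}
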
*/ static short nfsv3err_null[] = { 0, 0, }; static short nfsv3err_getattr[] = { NFSERR_IO, NFSERR_IO, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_setattr[] = { NFSERR_IO, NFSERR_PERM, NFSERR_IO, NFSERR_ACCES, NFSERR_INVAL, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOT_SYNC, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_lookup[] = { NFSERR_IO, NFSERR_NOENT, NFSERR_IO, NFSERR_ACCES, NFSERR_NOTDIR, NFSERR_NAMETOL, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_access[] = { NFSERR_IO, NFSERR_IO, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_readlink[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_INVAL, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_read[] = { NFSERR_IO, NFSERR_IO, NFSERR_NXIO, NFSERR_ACCES, NFSERR_INVAL, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_write[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_INVAL, NFSERR_FBIG, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_create[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_mkdir[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_symlink[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_mknod[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, NFSERR_BADTYPE, 0, }; static short nfsv3err_remove[] = { NFSERR_IO, NFSERR_NOENT, NFSERR_IO, NFSERR_ACCES, NFSERR_NOTDIR, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_rmdir[] = { NFSERR_IO, NFSERR_NOENT, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_INVAL, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_NOTEMPTY, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_rename[] = { NFSERR_IO, NFSERR_NOENT, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_XDEV, NFSERR_NOTDIR, NFSERR_ISDIR, NFSERR_INVAL, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_MLINK, NFSERR_NAMETOL, NFSERR_NOTEMPTY, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_link[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_XDEV, NFSERR_NOTDIR, NFSERR_INVAL, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_MLINK, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_readdir[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_NOTDIR, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_BAD_COOKIE, NFSERR_TOOSMALL, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_readdirplus[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_NOTDIR, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_BAD_COOKIE, NFSERR_NOTSUPP, NFSERR_TOOSMALL, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_fsstat[] = { 
NFSERR_IO, NFSERR_IO, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_fsinfo[] = { NFSERR_STALE, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_pathconf[] = { NFSERR_STALE, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_commit[] = { NFSERR_IO, NFSERR_IO, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short *nfsrv_v3errmap[] = { nfsv3err_null, nfsv3err_getattr, nfsv3err_setattr, nfsv3err_lookup, nfsv3err_access, nfsv3err_readlink, nfsv3err_read, nfsv3err_write, nfsv3err_create, nfsv3err_mkdir, nfsv3err_symlink, nfsv3err_mknod, nfsv3err_remove, nfsv3err_rmdir, nfsv3err_rename, nfsv3err_link, nfsv3err_readdir, nfsv3err_readdirplus, nfsv3err_fsstat, nfsv3err_fsinfo, nfsv3err_pathconf, nfsv3err_commit, }; #endif /* NFS_NOSERVER */ extern struct nfsrtt nfsrtt; extern time_t nqnfsstarttime; extern int nqsrv_clockskew; extern int nqsrv_writeslack; extern int nqsrv_maxlease; extern struct nfsstats nfsstats; extern int nqnfs_piggy[NFS_NPROCS]; extern nfstype nfsv2_type[9]; extern nfstype nfsv3_type[9]; extern struct nfsnodehashhead *nfsnodehashtbl; extern u_long nfsnodehash; struct getfh_args; extern int getfh(struct proc *, struct getfh_args *, int *); struct nfssvc_args; extern int nfssvc(struct proc *, struct nfssvc_args *, int *); LIST_HEAD(nfsnodehashhead, nfsnode); int nfs_webnamei __P((struct nameidata *, struct vnode *, struct proc *)); u_quad_t nfs_curusec() { struct timeval tv; getmicrotime(&tv); return ((u_quad_t)tv.tv_sec * 1000000 + (u_quad_t)tv.tv_usec); } /* * Create the header for an rpc request packet * The hsiz is the size of the rest of the nfs request header. * (just used to decide if a cluster is a good idea) */ struct mbuf * nfsm_reqh(vp, procid, hsiz, bposp) struct vnode *vp; u_long procid; int hsiz; caddr_t *bposp; { register struct mbuf *mb; register u_int32_t *tl; register caddr_t bpos; struct mbuf *mb2; struct nfsmount *nmp; int nqflag; MGET(mb, M_WAIT, MT_DATA); if (hsiz >= MINCLSIZE) MCLGET(mb, M_WAIT); mb->m_len = 0; bpos = mtod(mb, caddr_t); /* * For NQNFS, add lease request. */ if (vp) { nmp = VFSTONFS(vp->v_mount); if (nmp->nm_flag & NFSMNT_NQNFS) { nqflag = NQNFS_NEEDLEASE(vp, procid); if (nqflag) { nfsm_build(tl, u_int32_t *, 2*NFSX_UNSIGNED); *tl++ = txdr_unsigned(nqflag); *tl = txdr_unsigned(nmp->nm_leaseterm); } else { nfsm_build(tl, u_int32_t *, NFSX_UNSIGNED); *tl = 0; } } } /* Finally, return values */ *bposp = bpos; return (mb); } /* * Build the RPC header and fill in the authorization info. * The authorization string argument is only used when the credentials * come from outside of the kernel. * Returns the head of the mbuf list. */ struct mbuf * nfsm_rpchead(cr, nmflag, procid, auth_type, auth_len, auth_str, verf_len, verf_str, mrest, mrest_len, mbp, xidp) register struct ucred *cr; int nmflag; int procid; int auth_type; int auth_len; char *auth_str; int verf_len; char *verf_str; struct mbuf *mrest; int mrest_len; struct mbuf **mbp; u_int32_t *xidp; { register struct mbuf *mb; register u_int32_t *tl; register caddr_t bpos; register int i; struct mbuf *mreq, *mb2; int siz, grpsiz, authsiz; authsiz = nfsm_rndup(auth_len); MGETHDR(mb, M_WAIT, MT_DATA); if ((authsiz + 10 * NFSX_UNSIGNED) >= MINCLSIZE) { MCLGET(mb, M_WAIT); } else if ((authsiz + 10 * NFSX_UNSIGNED) < MHLEN) { MH_ALIGN(mb, authsiz + 10 * NFSX_UNSIGNED); } else { MH_ALIGN(mb, 8 * NFSX_UNSIGNED); } mb->m_len = 0; mreq = mb; bpos = mtod(mb, caddr_t); /* * First the RPC header. 
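
/*
 * Illustrative aside (not part of this commit): the RPC header built here
 * consists of eight XDR words: xid, message type CALL, RPC version 2,
 * program, program version, procedure, then the credential flavor and its
 * length.  A flat-buffer user-space sketch of that layout (hypothetical
 * names; the program/procedure values are the caller's responsibility):
 */
#include <stdint.h>
#include <stddef.h>
#include <arpa/inet.h>

static size_t
example_rpc_call_header(uint32_t *buf, uint32_t xid, uint32_t prog,
    uint32_t vers, uint32_t proc, uint32_t authflavor, uint32_t authlen)
{
	buf[0] = htonl(xid);         /* transaction id */
	buf[1] = htonl(0);           /* msg_type CALL */
	buf[2] = htonl(2);           /* RPC protocol version 2 */
	buf[3] = htonl(prog);        /* e.g. 100003 for NFS */
	buf[4] = htonl(vers);        /* e.g. 3 for NFSv3 */
	buf[5] = htonl(proc);        /* procedure number */
	buf[6] = htonl(authflavor);  /* e.g. 1 for AUTH_UNIX */
	buf[7] = htonl(authlen);     /* length of the credential body that follows */
	return (8 * sizeof(uint32_t));
}
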
*/ nfsm_build(tl, u_int32_t *, 8 * NFSX_UNSIGNED); /* Get a pretty random xid to start with */ if (!nfs_xid) nfs_xid = random(); /* * Skip zero xid if it should ever happen. */ if (++nfs_xid == 0) nfs_xid++; *tl++ = *xidp = txdr_unsigned(nfs_xid); *tl++ = rpc_call; *tl++ = rpc_vers; if (nmflag & NFSMNT_NQNFS) { *tl++ = txdr_unsigned(NQNFS_PROG); *tl++ = txdr_unsigned(NQNFS_VER3); } else { *tl++ = txdr_unsigned(NFS_PROG); if (nmflag & NFSMNT_NFSV3) *tl++ = txdr_unsigned(NFS_VER3); else *tl++ = txdr_unsigned(NFS_VER2); } if (nmflag & NFSMNT_NFSV3) *tl++ = txdr_unsigned(procid); else *tl++ = txdr_unsigned(nfsv2_procid[procid]); /* * And then the authorization cred. */ *tl++ = txdr_unsigned(auth_type); *tl = txdr_unsigned(authsiz); switch (auth_type) { case RPCAUTH_UNIX: nfsm_build(tl, u_int32_t *, auth_len); *tl++ = 0; /* stamp ?? */ *tl++ = 0; /* NULL hostname */ *tl++ = txdr_unsigned(cr->cr_uid); *tl++ = txdr_unsigned(cr->cr_groups[0]); grpsiz = (auth_len >> 2) - 5; *tl++ = txdr_unsigned(grpsiz); for (i = 1; i <= grpsiz; i++) *tl++ = txdr_unsigned(cr->cr_groups[i]); break; case RPCAUTH_KERB4: siz = auth_len; while (siz > 0) { if (M_TRAILINGSPACE(mb) == 0) { MGET(mb2, M_WAIT, MT_DATA); if (siz >= MINCLSIZE) MCLGET(mb2, M_WAIT); mb->m_next = mb2; mb = mb2; mb->m_len = 0; bpos = mtod(mb, caddr_t); } i = min(siz, M_TRAILINGSPACE(mb)); bcopy(auth_str, bpos, i); mb->m_len += i; auth_str += i; bpos += i; siz -= i; } if ((siz = (nfsm_rndup(auth_len) - auth_len)) > 0) { for (i = 0; i < siz; i++) *bpos++ = '\0'; mb->m_len += siz; } break; }; /* * And the verifier... */ nfsm_build(tl, u_int32_t *, 2 * NFSX_UNSIGNED); if (verf_str) { *tl++ = txdr_unsigned(RPCAUTH_KERB4); *tl = txdr_unsigned(verf_len); siz = verf_len; while (siz > 0) { if (M_TRAILINGSPACE(mb) == 0) { MGET(mb2, M_WAIT, MT_DATA); if (siz >= MINCLSIZE) MCLGET(mb2, M_WAIT); mb->m_next = mb2; mb = mb2; mb->m_len = 0; bpos = mtod(mb, caddr_t); } i = min(siz, M_TRAILINGSPACE(mb)); bcopy(verf_str, bpos, i); mb->m_len += i; verf_str += i; bpos += i; siz -= i; } if ((siz = (nfsm_rndup(verf_len) - verf_len)) > 0) { for (i = 0; i < siz; i++) *bpos++ = '\0'; mb->m_len += siz; } } else { *tl++ = txdr_unsigned(RPCAUTH_NULL); *tl = 0; } mb->m_next = mrest; mreq->m_pkthdr.len = authsiz + 10 * NFSX_UNSIGNED + mrest_len; mreq->m_pkthdr.rcvif = (struct ifnet *)0; *mbp = mb; return (mreq); } /* * copies mbuf chain to the uio scatter/gather list */ int nfsm_mbuftouio(mrep, uiop, siz, dpos) struct mbuf **mrep; register struct uio *uiop; int siz; caddr_t *dpos; { register char *mbufcp, *uiocp; register int xfer, left, len; register struct mbuf *mp; long uiosiz, rem; int error = 0; mp = *mrep; mbufcp = *dpos; len = mtod(mp, caddr_t)+mp->m_len-mbufcp; rem = nfsm_rndup(siz)-siz; while (siz > 0) { if (uiop->uio_iovcnt <= 0 || uiop->uio_iov == NULL) return (EFBIG); left = uiop->uio_iov->iov_len; uiocp = uiop->uio_iov->iov_base; if (left > siz) left = siz; uiosiz = left; while (left > 0) { while (len == 0) { mp = mp->m_next; if (mp == NULL) return (EBADRPC); mbufcp = mtod(mp, caddr_t); len = mp->m_len; } xfer = (left > len) ? len : left; #ifdef notdef /* Not Yet.. 
*/ if (uiop->uio_iov->iov_op != NULL) (*(uiop->uio_iov->iov_op)) (mbufcp, uiocp, xfer); else #endif if (uiop->uio_segflg == UIO_SYSSPACE) bcopy(mbufcp, uiocp, xfer); else copyout(mbufcp, uiocp, xfer); left -= xfer; len -= xfer; mbufcp += xfer; uiocp += xfer; uiop->uio_offset += xfer; uiop->uio_resid -= xfer; } if (uiop->uio_iov->iov_len <= siz) { uiop->uio_iovcnt--; uiop->uio_iov++; } else { uiop->uio_iov->iov_base += uiosiz; uiop->uio_iov->iov_len -= uiosiz; } siz -= uiosiz; } *dpos = mbufcp; *mrep = mp; if (rem > 0) { if (len < rem) error = nfs_adv(mrep, dpos, rem, len); else *dpos += rem; } return (error); } /* * copies a uio scatter/gather list to an mbuf chain. * NOTE: can ony handle iovcnt == 1 */ int nfsm_uiotombuf(uiop, mq, siz, bpos) register struct uio *uiop; struct mbuf **mq; int siz; caddr_t *bpos; { register char *uiocp; register struct mbuf *mp, *mp2; register int xfer, left, mlen; int uiosiz, clflg, rem; char *cp; #ifdef DIAGNOSTIC if (uiop->uio_iovcnt != 1) panic("nfsm_uiotombuf: iovcnt != 1"); #endif if (siz > MLEN) /* or should it >= MCLBYTES ?? */ clflg = 1; else clflg = 0; rem = nfsm_rndup(siz)-siz; mp = mp2 = *mq; while (siz > 0) { left = uiop->uio_iov->iov_len; uiocp = uiop->uio_iov->iov_base; if (left > siz) left = siz; uiosiz = left; while (left > 0) { mlen = M_TRAILINGSPACE(mp); if (mlen == 0) { MGET(mp, M_WAIT, MT_DATA); if (clflg) MCLGET(mp, M_WAIT); mp->m_len = 0; mp2->m_next = mp; mp2 = mp; mlen = M_TRAILINGSPACE(mp); } xfer = (left > mlen) ? mlen : left; #ifdef notdef /* Not Yet.. */ if (uiop->uio_iov->iov_op != NULL) (*(uiop->uio_iov->iov_op)) (uiocp, mtod(mp, caddr_t)+mp->m_len, xfer); else #endif if (uiop->uio_segflg == UIO_SYSSPACE) bcopy(uiocp, mtod(mp, caddr_t)+mp->m_len, xfer); else copyin(uiocp, mtod(mp, caddr_t)+mp->m_len, xfer); mp->m_len += xfer; left -= xfer; uiocp += xfer; uiop->uio_offset += xfer; uiop->uio_resid -= xfer; } uiop->uio_iov->iov_base += uiosiz; uiop->uio_iov->iov_len -= uiosiz; siz -= uiosiz; } if (rem > 0) { if (rem > M_TRAILINGSPACE(mp)) { MGET(mp, M_WAIT, MT_DATA); mp->m_len = 0; mp2->m_next = mp; } cp = mtod(mp, caddr_t)+mp->m_len; for (left = 0; left < rem; left++) *cp++ = '\0'; mp->m_len += rem; *bpos = cp; } else *bpos = mtod(mp, caddr_t)+mp->m_len; *mq = mp; return (0); } /* * Help break down an mbuf chain by setting the first siz bytes contiguous * pointed to by returned val. * This is used by the macros nfsm_dissect and nfsm_dissecton for tough * cases. (The macros use the vars. dpos and dpos2) */ int nfsm_disct(mdp, dposp, siz, left, cp2) struct mbuf **mdp; caddr_t *dposp; int siz; int left; caddr_t *cp2; { register struct mbuf *mp, *mp2; register int siz2, xfer; register caddr_t p; mp = *mdp; while (left == 0) { *mdp = mp = mp->m_next; if (mp == NULL) return (EBADRPC); left = mp->m_len; *dposp = mtod(mp, caddr_t); } if (left >= siz) { *cp2 = *dposp; *dposp += siz; } else if (mp->m_next == NULL) { return (EBADRPC); } else if (siz > MHLEN) { panic("nfs S too big"); } else { MGET(mp2, M_WAIT, MT_DATA); mp2->m_next = mp->m_next; mp->m_next = mp2; mp->m_len -= left; mp = mp2; *cp2 = p = mtod(mp, caddr_t); bcopy(*dposp, p, left); /* Copy what was left */ siz2 = siz-left; p += left; mp2 = mp->m_next; /* Loop around copying up the siz2 bytes */ while (siz2 > 0) { if (mp2 == NULL) return (EBADRPC); xfer = (siz2 > mp2->m_len) ? 
mp2->m_len : siz2; if (xfer > 0) { bcopy(mtod(mp2, caddr_t), p, xfer); NFSMADV(mp2, xfer); mp2->m_len -= xfer; p += xfer; siz2 -= xfer; } if (siz2 > 0) mp2 = mp2->m_next; } mp->m_len = siz; *mdp = mp2; *dposp = mtod(mp2, caddr_t); } return (0); } /* * Advance the position in the mbuf chain. */ int nfs_adv(mdp, dposp, offs, left) struct mbuf **mdp; caddr_t *dposp; int offs; int left; { register struct mbuf *m; register int s; m = *mdp; s = left; while (s < offs) { offs -= s; m = m->m_next; if (m == NULL) return (EBADRPC); s = m->m_len; } *mdp = m; *dposp = mtod(m, caddr_t)+offs; return (0); } /* * Copy a string into mbufs for the hard cases... */ int nfsm_strtmbuf(mb, bpos, cp, siz) struct mbuf **mb; char **bpos; const char *cp; long siz; { register struct mbuf *m1 = NULL, *m2; long left, xfer, len, tlen; u_int32_t *tl; int putsize; putsize = 1; m2 = *mb; left = M_TRAILINGSPACE(m2); if (left > 0) { tl = ((u_int32_t *)(*bpos)); *tl++ = txdr_unsigned(siz); putsize = 0; left -= NFSX_UNSIGNED; m2->m_len += NFSX_UNSIGNED; if (left > 0) { bcopy(cp, (caddr_t) tl, left); siz -= left; cp += left; m2->m_len += left; left = 0; } } /* Loop around adding mbufs */ while (siz > 0) { MGET(m1, M_WAIT, MT_DATA); if (siz > MLEN) MCLGET(m1, M_WAIT); m1->m_len = NFSMSIZ(m1); m2->m_next = m1; m2 = m1; tl = mtod(m1, u_int32_t *); tlen = 0; if (putsize) { *tl++ = txdr_unsigned(siz); m1->m_len -= NFSX_UNSIGNED; tlen = NFSX_UNSIGNED; putsize = 0; } if (siz < m1->m_len) { len = nfsm_rndup(siz); xfer = siz; if (xfer < len) *(tl+(xfer>>2)) = 0; } else { xfer = len = m1->m_len; } bcopy(cp, (caddr_t) tl, xfer); m1->m_len = len+tlen; siz -= xfer; cp += xfer; } *mb = m1; *bpos = mtod(m1, caddr_t)+m1->m_len; return (0); } /* * Called once to initialize data structures... */ int nfs_init(vfsp) struct vfsconf *vfsp; { register int i; nfsmount_zone = zinit("NFSMOUNT", sizeof(struct nfsmount), 0, 0, 1); /* * Check to see if major data structures haven't bloated. */ if (sizeof (struct nfssvc_sock) > NFS_SVCALLOC) { printf("struct nfssvc_sock bloated (> %dbytes)\n",NFS_SVCALLOC); printf("Try reducing NFS_UIDHASHSIZ\n"); } if (sizeof (struct nfsuid) > NFS_UIDALLOC) { printf("struct nfsuid bloated (> %dbytes)\n",NFS_UIDALLOC); printf("Try unionizing the nu_nickname and nu_flag fields\n"); } nfs_mount_type = vfsp->vfc_typenum; nfsrtt.pos = 0; rpc_vers = txdr_unsigned(RPC_VER2); rpc_call = txdr_unsigned(RPC_CALL); rpc_reply = txdr_unsigned(RPC_REPLY); rpc_msgdenied = txdr_unsigned(RPC_MSGDENIED); rpc_msgaccepted = txdr_unsigned(RPC_MSGACCEPTED); rpc_mismatch = txdr_unsigned(RPC_MISMATCH); rpc_autherr = txdr_unsigned(RPC_AUTHERR); rpc_auth_unix = txdr_unsigned(RPCAUTH_UNIX); rpc_auth_kerb = txdr_unsigned(RPCAUTH_KERB4); nfs_prog = txdr_unsigned(NFS_PROG); nqnfs_prog = txdr_unsigned(NQNFS_PROG); nfs_true = txdr_unsigned(TRUE); nfs_false = txdr_unsigned(FALSE); nfs_xdrneg1 = txdr_unsigned(-1); nfs_ticks = (hz * NFS_TICKINTVL + 500) / 1000; if (nfs_ticks < 1) nfs_ticks = 1; /* Ensure async daemons disabled */ for (i = 0; i < NFS_MAXASYNCDAEMON; i++) { nfs_iodwant[i] = (struct proc *)0; nfs_iodmount[i] = (struct nfsmount *)0; } nfs_nhinit(); /* Init the nfsnode table */ #ifndef NFS_NOSERVER nfsrv_init(0); /* Init server data structures */ nfsrv_initcache(); /* Init the server request cache */ #endif /* * Initialize the nqnfs server stuff. 
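
/*
 * Illustrative aside (not part of this commit): nfsm_strtmbuf() above writes
 * a string as XDR opaque data, that is a 32-bit length word, the bytes
 * themselves, and zero padding out to the next 4-byte boundary (the quantity
 * nfsm_rndup() computes).  A flat-buffer sketch of the same encoding, with
 * hypothetical names; the destination is assumed large enough:
 */
#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <arpa/inet.h>

static size_t
example_xdr_put_string(unsigned char *dst, const char *s, size_t len)
{
	size_t padded = (len + 3) & ~(size_t)3;   /* round up to 4-byte boundary */
	uint32_t xlen = htonl((uint32_t)len);

	memcpy(dst, &xlen, 4);                    /* length word */
	memcpy(dst + 4, s, len);                  /* the string bytes */
	memset(dst + 4 + len, 0, padded - len);   /* zero fill the padding */
	return (4 + padded);                      /* total bytes emitted */
}
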
*/ if (nqnfsstarttime == 0) { nqnfsstarttime = boottime.tv_sec + nqsrv_maxlease + nqsrv_clockskew + nqsrv_writeslack; NQLOADNOVRAM(nqnfsstarttime); CIRCLEQ_INIT(&nqtimerhead); nqfhhashtbl = hashinit(NQLCHSZ, M_NQLEASE, &nqfhhash); } /* * Initialize reply list and start timer */ TAILQ_INIT(&nfs_reqq); nfs_timer(0); /* * Set up lease_check and lease_updatetime so that other parts * of the system can call us, if we are loadable. */ #ifndef NFS_NOSERVER nfs_prev_vop_lease_check = default_vnodeop_p[VOFFSET(vop_lease)]; default_vnodeop_p[VOFFSET(vop_lease)] = (vop_t *)nqnfs_vop_lease_check; #endif nfs_prev_lease_updatetime = lease_updatetime; lease_updatetime = nfs_lease_updatetime; nfs_prev_nfssvc_sy_narg = sysent[SYS_nfssvc].sy_narg; sysent[SYS_nfssvc].sy_narg = 2; nfs_prev_nfssvc_sy_call = sysent[SYS_nfssvc].sy_call; sysent[SYS_nfssvc].sy_call = (sy_call_t *)nfssvc; #ifndef NFS_NOSERVER nfs_prev_getfh_sy_narg = sysent[SYS_getfh].sy_narg; sysent[SYS_getfh].sy_narg = 2; nfs_prev_getfh_sy_call = sysent[SYS_getfh].sy_call; sysent[SYS_getfh].sy_call = (sy_call_t *)getfh; #endif nfs_pbuf_freecnt = nswbuf / 2 + 1; return (0); } int nfs_uninit(vfsp) struct vfsconf *vfsp; { untimeout(nfs_timer, (void *)NULL, nfs_timer_handle); nfs_mount_type = -1; #ifndef NFS_NOSERVER default_vnodeop_p[VOFFSET(vop_lease)] = nfs_prev_vop_lease_check; #endif lease_updatetime = nfs_prev_lease_updatetime; sysent[SYS_nfssvc].sy_narg = nfs_prev_nfssvc_sy_narg; sysent[SYS_nfssvc].sy_call = nfs_prev_nfssvc_sy_call; #ifndef NFS_NOSERVER sysent[SYS_getfh].sy_narg = nfs_prev_getfh_sy_narg; sysent[SYS_getfh].sy_call = nfs_prev_getfh_sy_call; #endif return (0); } /* * Attribute cache routines. * nfs_loadattrcache() - loads or updates the cache contents from attributes * that are on the mbuf list * nfs_getattrcache() - returns valid attributes if found in cache, returns * error otherwise */ /* * Load the attribute cache (that lives in the nfsnode entry) with * the values on the mbuf list and * Iff vap not NULL * copy the attributes to *vaper */ int nfs_loadattrcache(vpp, mdp, dposp, vaper) struct vnode **vpp; struct mbuf **mdp; caddr_t *dposp; struct vattr *vaper; { register struct vnode *vp = *vpp; register struct vattr *vap; register struct nfs_fattr *fp; register struct nfsnode *np; register int32_t t1; caddr_t cp2; int error = 0, rdev; struct mbuf *md; enum vtype vtyp; u_short vmode; struct timespec mtime; struct vnode *nvp; int v3 = NFS_ISV3(vp); md = *mdp; t1 = (mtod(md, caddr_t) + md->m_len) - *dposp; if ((error = nfsm_disct(mdp, dposp, NFSX_FATTR(v3), t1, &cp2)) != 0) return (error); fp = (struct nfs_fattr *)cp2; if (v3) { vtyp = nfsv3tov_type(fp->fa_type); vmode = fxdr_unsigned(u_short, fp->fa_mode); rdev = makeudev(fxdr_unsigned(int, fp->fa3_rdev.specdata1), fxdr_unsigned(int, fp->fa3_rdev.specdata2)); fxdr_nfsv3time(&fp->fa3_mtime, &mtime); } else { vtyp = nfsv2tov_type(fp->fa_type); vmode = fxdr_unsigned(u_short, fp->fa_mode); /* * XXX * * The duplicate information returned in fa_type and fa_mode * is an ambiguity in the NFS version 2 protocol. * * VREG should be taken literally as a regular file. If a * server intents to return some type information differently * in the upper bits of the mode field (e.g. for sockets, or * FIFOs), NFSv2 mandates fa_type to be VNON. Anyway, we * leave the examination of the mode bits even in the VREG * case to avoid breakage for bogus servers, but we make sure * that there are actually type bits set in the upper part of * fa_mode (and failing that, trust the va_type field). 
* * NFSv3 cleared the issue, and requires fa_mode to not * contain any type information (while also introduing sockets * and FIFOs for fa_type). */ if (vtyp == VNON || (vtyp == VREG && (vmode & S_IFMT) != 0)) vtyp = IFTOVT(vmode); rdev = fxdr_unsigned(int32_t, fp->fa2_rdev); fxdr_nfsv2time(&fp->fa2_mtime, &mtime); /* * Really ugly NFSv2 kludge. */ if (vtyp == VCHR && rdev == 0xffffffff) vtyp = VFIFO; } /* * If v_type == VNON it is a new node, so fill in the v_type, * n_mtime fields. Check to see if it represents a special * device, and if so, check for a possible alias. Once the * correct vnode has been obtained, fill in the rest of the * information. */ np = VTONFS(vp); if (vp->v_type != vtyp) { vp->v_type = vtyp; if (vp->v_type == VFIFO) { vp->v_op = fifo_nfsv2nodeop_p; } if (vp->v_type == VCHR || vp->v_type == VBLK) { vp->v_op = spec_nfsv2nodeop_p; nvp = checkalias(vp, rdev, vp->v_mount); if (nvp) { /* * Discard unneeded vnode, but save its nfsnode. * Since the nfsnode does not have a lock, its * vnode lock has to be carried over. */ nvp->v_vnlock = vp->v_vnlock; vp->v_vnlock = NULL; nvp->v_data = vp->v_data; vp->v_data = NULL; vp->v_op = spec_vnodeop_p; vrele(vp); vgone(vp); /* * Reinitialize aliased node. */ np->n_vnode = nvp; *vpp = vp = nvp; } } np->n_mtime = mtime.tv_sec; } vap = &np->n_vattr; vap->va_type = vtyp; vap->va_mode = (vmode & 07777); vap->va_rdev = rdev; vap->va_mtime = mtime; vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; if (v3) { vap->va_nlink = fxdr_unsigned(u_short, fp->fa_nlink); vap->va_uid = fxdr_unsigned(uid_t, fp->fa_uid); vap->va_gid = fxdr_unsigned(gid_t, fp->fa_gid); vap->va_size = fxdr_hyper(&fp->fa3_size); vap->va_blocksize = NFS_FABLKSIZE; vap->va_bytes = fxdr_hyper(&fp->fa3_used); vap->va_fileid = fxdr_unsigned(int32_t, fp->fa3_fileid.nfsuquad[1]); fxdr_nfsv3time(&fp->fa3_atime, &vap->va_atime); fxdr_nfsv3time(&fp->fa3_ctime, &vap->va_ctime); vap->va_flags = 0; vap->va_filerev = 0; } else { vap->va_nlink = fxdr_unsigned(u_short, fp->fa_nlink); vap->va_uid = fxdr_unsigned(uid_t, fp->fa_uid); vap->va_gid = fxdr_unsigned(gid_t, fp->fa_gid); vap->va_size = fxdr_unsigned(u_int32_t, fp->fa2_size); vap->va_blocksize = fxdr_unsigned(int32_t, fp->fa2_blocksize); vap->va_bytes = (u_quad_t)fxdr_unsigned(int32_t, fp->fa2_blocks) * NFS_FABLKSIZE; vap->va_fileid = fxdr_unsigned(int32_t, fp->fa2_fileid); fxdr_nfsv2time(&fp->fa2_atime, &vap->va_atime); vap->va_flags = 0; vap->va_ctime.tv_sec = fxdr_unsigned(u_int32_t, fp->fa2_ctime.nfsv2_sec); vap->va_ctime.tv_nsec = 0; vap->va_gen = fxdr_unsigned(u_int32_t,fp->fa2_ctime.nfsv2_usec); vap->va_filerev = 0; } if (vap->va_size != np->n_size) { if (vap->va_type == VREG) { if (np->n_flag & NMODIFIED) { if (vap->va_size < np->n_size) vap->va_size = np->n_size; else np->n_size = vap->va_size; } else np->n_size = vap->va_size; vnode_pager_setsize(vp, np->n_size); } else np->n_size = vap->va_size; } np->n_attrstamp = time_second; if (vaper != NULL) { bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(*vap)); if (np->n_flag & NCHG) { if (np->n_flag & NACC) vaper->va_atime = np->n_atim; if (np->n_flag & NUPD) vaper->va_mtime = np->n_mtim; } } return (0); } #ifdef NFS_ACDEBUG #include SYSCTL_DECL(_vfs_nfs); static int nfs_acdebug; SYSCTL_INT(_vfs_nfs, OID_AUTO, acdebug, CTLFLAG_RW, &nfs_acdebug, 0, ""); #endif /* * Check the time stamp * If the cache is valid, copy contents to *vap and return 0 * otherwise return an error */ int nfs_getattrcache(vp, vaper) register struct vnode *vp; struct vattr *vaper; { register struct nfsnode *np; 
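
/*
 * Illustrative aside (not part of this commit): the NFSv2 branch above has
 * to reconcile fa_type with the file-type bits that some servers also put in
 * fa_mode, and it turns a character device whose rdev is all ones back into
 * a FIFO.  A user-space sketch of that disambiguation using the standard
 * S_IFMT bits (hypothetical names; returns an S_IF* type value):
 */
#include <stdint.h>
#include <sys/stat.h>

static mode_t
example_v2_filetype(int wire_is_reg, int wire_is_none, mode_t mode,
    uint32_t rdev)
{
	mode_t type = wire_is_reg ? S_IFREG : 0;

	/*
	 * Trust the mode's type bits when fa_type said "none", or when a
	 * supposedly regular file unexpectedly carries type bits in fa_mode.
	 */
	if (wire_is_none || (wire_is_reg && (mode & S_IFMT) != 0))
		type = mode & S_IFMT;

	/* NFSv2 kludge: a character device with an all-ones rdev is a FIFO. */
	if (type == S_IFCHR && rdev == 0xffffffffu)
		type = S_IFIFO;
	return (type);
}
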
register struct vattr *vap; struct nfsmount *nmp; int timeo; np = VTONFS(vp); vap = &np->n_vattr; nmp = VFSTONFS(vp->v_mount); /* XXX n_mtime doesn't seem to be updated on a miss-and-reload */ timeo = (time_second - np->n_mtime) / 10; #ifdef NFS_ACDEBUG if (nfs_acdebug>1) printf("nfs_getattrcache: initial timeo = %d\n", timeo); #endif if (vap->va_type == VDIR) { if ((np->n_flag & NMODIFIED) || timeo < nmp->nm_acdirmin) timeo = nmp->nm_acdirmin; else if (timeo > nmp->nm_acdirmax) timeo = nmp->nm_acdirmax; } else { if ((np->n_flag & NMODIFIED) || timeo < nmp->nm_acregmin) timeo = nmp->nm_acregmin; else if (timeo > nmp->nm_acregmax) timeo = nmp->nm_acregmax; } #ifdef NFS_ACDEBUG if (nfs_acdebug > 2) printf("acregmin %d; acregmax %d; acdirmin %d; acdirmax %d\n", nmp->nm_acregmin, nmp->nm_acregmax, nmp->nm_acdirmin, nmp->nm_acdirmax); if (nfs_acdebug) printf("nfs_getattrcache: age = %d; final timeo = %d\n", (time_second - np->n_attrstamp), timeo); #endif if ((time_second - np->n_attrstamp) >= timeo) { nfsstats.attrcache_misses++; return (ENOENT); } nfsstats.attrcache_hits++; if (vap->va_size != np->n_size) { if (vap->va_type == VREG) { if (np->n_flag & NMODIFIED) { if (vap->va_size < np->n_size) vap->va_size = np->n_size; else np->n_size = vap->va_size; } else np->n_size = vap->va_size; vnode_pager_setsize(vp, np->n_size); } else np->n_size = vap->va_size; } bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(struct vattr)); if (np->n_flag & NCHG) { if (np->n_flag & NACC) vaper->va_atime = np->n_atim; if (np->n_flag & NUPD) vaper->va_mtime = np->n_mtim; } return (0); } #ifndef NFS_NOSERVER /* * Set up nameidata for a lookup() call and do it. * * If pubflag is set, this call is done for a lookup operation on the * public filehandle. In that case we allow crossing mountpoints and * absolute pathnames. However, the caller is expected to check that * the lookup result is within the public fs, and deny access if * it is not. * * nfs_namei() clears out garbage fields that namei() might leave garbage. * This is mainly ni_vp and ni_dvp when an error occurs, and ni_dvp when no * error occurs but the parent was not requested. * * dirp may be set whether an error is returned or not, and must be * released by the caller. */ int nfs_namei(ndp, fhp, len, slp, nam, mdp, dposp, retdirp, p, kerbflag, pubflag) register struct nameidata *ndp; fhandle_t *fhp; int len; struct nfssvc_sock *slp; struct sockaddr *nam; struct mbuf **mdp; caddr_t *dposp; struct vnode **retdirp; struct proc *p; int kerbflag, pubflag; { register int i, rem; register struct mbuf *md; register char *fromcp, *tocp, *cp; struct iovec aiov; struct uio auio; struct vnode *dp; int error, rdonly, linklen; struct componentname *cnp = &ndp->ni_cnd; *retdirp = (struct vnode *)0; cnp->cn_pnbuf = zalloc(namei_zone); /* * Copy the name from the mbuf list to ndp->ni_pnbuf * and set the various ndp fields appropriately. */ fromcp = *dposp; tocp = cnp->cn_pnbuf; md = *mdp; rem = mtod(md, caddr_t) + md->m_len - fromcp; cnp->cn_hash = 0; for (i = 0; i < len; i++) { while (rem == 0) { md = md->m_next; if (md == NULL) { error = EBADRPC; goto out; } fromcp = mtod(md, caddr_t); rem = md->m_len; } if (*fromcp == '\0' || (!pubflag && *fromcp == '/')) { error = EACCES; goto out; } cnp->cn_hash += (unsigned char)*fromcp; *tocp++ = *fromcp++; rem--; } *tocp = '\0'; *mdp = md; *dposp = fromcp; len = nfsm_rndup(len)-len; if (len > 0) { if (rem >= len) *dposp += len; else if ((error = nfs_adv(mdp, dposp, len, rem)) != 0) goto out; } /* * Extract and set starting directory. 
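
/*
 * Illustrative aside (not part of this commit): nfs_getattrcache() above
 * derives the cache timeout from how recently the file changed (one tenth of
 * its age), clamps it to the mount's min/max bounds (separate bounds exist
 * for directories, and the minimum is forced for locally modified nodes),
 * and declares a miss once the cached attributes are older than that.  A
 * standalone sketch with hypothetical names:
 */
#include <time.h>

static int
example_attrcache_valid(time_t now, time_t file_mtime, time_t cached_at,
    int modified, int acmin, int acmax)
{
	time_t timeo = (now - file_mtime) / 10;   /* stable files live longer */

	if (modified || timeo < acmin)
		timeo = acmin;                    /* dirty or young: short TTL */
	else if (timeo > acmax)
		timeo = acmax;                    /* never beyond the mount's max */
	return ((now - cached_at) < timeo);       /* nonzero means still valid */
}
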
*/ error = nfsrv_fhtovp(fhp, FALSE, &dp, ndp->ni_cnd.cn_cred, slp, nam, &rdonly, kerbflag, pubflag); if (error) goto out; if (dp->v_type != VDIR) { vrele(dp); error = ENOTDIR; goto out; } if (rdonly) cnp->cn_flags |= RDONLY; /* * Set return directory. Reference to dp is implicitly transfered * to the returned pointer */ *retdirp = dp; if (pubflag) { /* * Oh joy. For WebNFS, handle those pesky '%' escapes, * and the 'native path' indicator. */ cp = zalloc(namei_zone); fromcp = cnp->cn_pnbuf; tocp = cp; if ((unsigned char)*fromcp >= WEBNFS_SPECCHAR_START) { switch ((unsigned char)*fromcp) { case WEBNFS_NATIVE_CHAR: /* * 'Native' path for us is the same * as a path according to the NFS spec, * just skip the escape char. */ fromcp++; break; /* * More may be added in the future, range 0x80-0xff */ default: error = EIO; zfree(namei_zone, cp); goto out; } } /* * Translate the '%' escapes, URL-style. */ while (*fromcp != '\0') { if (*fromcp == WEBNFS_ESC_CHAR) { if (fromcp[1] != '\0' && fromcp[2] != '\0') { fromcp++; *tocp++ = HEXSTRTOI(fromcp); fromcp += 2; continue; } else { error = ENOENT; zfree(namei_zone, cp); goto out; } } else *tocp++ = *fromcp++; } *tocp = '\0'; zfree(namei_zone, cnp->cn_pnbuf); cnp->cn_pnbuf = cp; } ndp->ni_pathlen = (tocp - cnp->cn_pnbuf) + 1; ndp->ni_segflg = UIO_SYSSPACE; if (pubflag) { ndp->ni_rootdir = rootvnode; ndp->ni_loopcnt = 0; if (cnp->cn_pnbuf[0] == '/') dp = rootvnode; } else { cnp->cn_flags |= NOCROSSMOUNT; } /* * Initialize for scan, set ni_startdir and bump ref on dp again * becuase lookup() will dereference ni_startdir. */ cnp->cn_proc = p; VREF(dp); ndp->ni_startdir = dp; for (;;) { cnp->cn_nameptr = cnp->cn_pnbuf; /* * Call lookup() to do the real work. If an error occurs, * ndp->ni_vp and ni_dvp are left uninitialized or NULL and * we do not have to dereference anything before returning. * In either case ni_startdir will be dereferenced and NULLed * out. */ error = lookup(ndp); if (error) break; /* * Check for encountering a symbolic link. Trivial * termination occurs if no symlink encountered. * Note: zfree is safe because error is 0, so we will * not zfree it again when we break. 
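
/*
 * Illustrative aside (not part of this commit): the WebNFS branch above
 * rewrites URL-style "%XX" escapes in the public-filehandle path, and an
 * escape with fewer than two following characters is an error.  A user-space
 * sketch of that decoding (hypothetical name; returns -1 on a truncated
 * escape, 0 on success):
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int
example_webnfs_unescape(const char *src, char *dst, size_t dstlen)
{
	size_t o = 0;

	if (dstlen == 0)
		return (-1);
	while (*src != '\0' && o + 1 < dstlen) {
		if (*src == '%') {
			if (src[1] == '\0' || src[2] == '\0')
				return (-1);          /* truncated escape */
			char hex[3] = { src[1], src[2], '\0' };
			dst[o++] = (char)strtol(hex, NULL, 16);
			src += 3;
		} else
			dst[o++] = *src++;
	}
	dst[o] = '\0';
	return (0);
}

int
main(void)
{
	char buf[64];

	if (example_webnfs_unescape("a%2fb%20c", buf, sizeof(buf)) == 0)
		printf("%s\n", buf);                  /* prints: a/b c */
	return (0);
}
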
*/ if ((cnp->cn_flags & ISSYMLINK) == 0) { nfsrv_object_create(ndp->ni_vp); if (cnp->cn_flags & (SAVENAME | SAVESTART)) cnp->cn_flags |= HASBUF; else zfree(namei_zone, cnp->cn_pnbuf); break; } /* * Validate symlink */ if ((cnp->cn_flags & LOCKPARENT) && ndp->ni_pathlen == 1) VOP_UNLOCK(ndp->ni_dvp, 0, p); if (!pubflag) { error = EINVAL; goto badlink2; } if (ndp->ni_loopcnt++ >= MAXSYMLINKS) { error = ELOOP; goto badlink2; } if (ndp->ni_pathlen > 1) cp = zalloc(namei_zone); else cp = cnp->cn_pnbuf; aiov.iov_base = cp; aiov.iov_len = MAXPATHLEN; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_procp = (struct proc *)0; auio.uio_resid = MAXPATHLEN; error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred); if (error) { badlink1: if (ndp->ni_pathlen > 1) zfree(namei_zone, cp); badlink2: vrele(ndp->ni_dvp); vput(ndp->ni_vp); break; } linklen = MAXPATHLEN - auio.uio_resid; if (linklen == 0) { error = ENOENT; goto badlink1; } if (linklen + ndp->ni_pathlen >= MAXPATHLEN) { error = ENAMETOOLONG; goto badlink1; } /* * Adjust or replace path */ if (ndp->ni_pathlen > 1) { bcopy(ndp->ni_next, cp + linklen, ndp->ni_pathlen); zfree(namei_zone, cnp->cn_pnbuf); cnp->cn_pnbuf = cp; } else cnp->cn_pnbuf[linklen] = '\0'; ndp->ni_pathlen += linklen; /* * Cleanup refs for next loop and check if root directory * should replace current directory. Normally ni_dvp * becomes the new base directory and is cleaned up when * we loop. Explicitly null pointers after invalidation * to clarify operation. */ vput(ndp->ni_vp); ndp->ni_vp = NULL; if (cnp->cn_pnbuf[0] == '/') { vrele(ndp->ni_dvp); ndp->ni_dvp = ndp->ni_rootdir; VREF(ndp->ni_dvp); } ndp->ni_startdir = ndp->ni_dvp; ndp->ni_dvp = NULL; } /* * nfs_namei() guarentees that fields will not contain garbage * whether an error occurs or not. This allows the caller to track * cleanup state trivially. */ out: if (error) { zfree(namei_zone, cnp->cn_pnbuf); ndp->ni_vp = NULL; ndp->ni_dvp = NULL; ndp->ni_startdir = NULL; cnp->cn_flags &= ~HASBUF; } else if ((ndp->ni_cnd.cn_flags & (WANTPARENT|LOCKPARENT)) == 0) { ndp->ni_dvp = NULL; } return (error); } /* * A fiddled version of m_adj() that ensures null fill to a long * boundary and only trims off the back end */ void nfsm_adj(mp, len, nul) struct mbuf *mp; register int len; int nul; { register struct mbuf *m; register int count, i; register char *cp; /* * Trim from tail. Scan the mbuf chain, * calculating its length and finding the last mbuf. * If the adjustment only affects this mbuf, then just * adjust and return. Otherwise, rescan and truncate * after the remaining size. */ count = 0; m = mp; for (;;) { count += m->m_len; if (m->m_next == (struct mbuf *)0) break; m = m->m_next; } if (m->m_len > len) { m->m_len -= len; if (nul > 0) { cp = mtod(m, caddr_t)+m->m_len-nul; for (i = 0; i < nul; i++) *cp++ = '\0'; } return; } count -= len; if (count < 0) count = 0; /* * Correct length for chain is "count". * Find the mbuf with last data, adjust its length, * and toss data from remaining mbufs on chain. */ for (m = mp; m; m = m->m_next) { if (m->m_len >= count) { m->m_len = count; if (nul > 0) { cp = mtod(m, caddr_t)+m->m_len-nul; for (i = 0; i < nul; i++) *cp++ = '\0'; } break; } count -= m->m_len; } for (m = m->m_next;m;m = m->m_next) m->m_len = 0; } /* * Make these functions instead of macros, so that the kernel text size * doesn't get too big... 
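
/*
 * Illustrative aside (not part of this commit): nfsm_adj() above trims a
 * given number of bytes off the tail of an mbuf chain and NUL-fills the last
 * few bytes that remain, so trailing XDR padding stays zeroed.  A sketch of
 * the same walk over a simple singly linked buffer chain (hypothetical types
 * and names):
 */
#include <stddef.h>
#include <string.h>

struct example_buf {
	struct example_buf *next;
	size_t len;
	unsigned char data[256];
};

static void
example_chain_trim(struct example_buf *head, size_t trim, size_t nul)
{
	struct example_buf *b;
	size_t total = 0, keep;

	for (b = head; b != NULL; b = b->next)    /* total bytes in the chain */
		total += b->len;
	keep = total > trim ? total - trim : 0;

	/*
	 * Walk again: the buffer holding the new end keeps a partial length
	 * and has its last 'nul' bytes zeroed; buffers after it become empty.
	 */
	for (b = head; b != NULL; b = b->next) {
		if (keep <= b->len) {
			b->len = keep;
			if (nul > 0 && b->len >= nul)
				memset(b->data + b->len - nul, 0, nul);
			for (b = b->next; b != NULL; b = b->next)
				b->len = 0;
			break;
		}
		keep -= b->len;
	}
}
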
*/ void nfsm_srvwcc(nfsd, before_ret, before_vap, after_ret, after_vap, mbp, bposp) struct nfsrv_descript *nfsd; int before_ret; register struct vattr *before_vap; int after_ret; struct vattr *after_vap; struct mbuf **mbp; char **bposp; { register struct mbuf *mb = *mbp, *mb2; register char *bpos = *bposp; register u_int32_t *tl; if (before_ret) { nfsm_build(tl, u_int32_t *, NFSX_UNSIGNED); *tl = nfs_false; } else { nfsm_build(tl, u_int32_t *, 7 * NFSX_UNSIGNED); *tl++ = nfs_true; txdr_hyper(before_vap->va_size, tl); tl += 2; txdr_nfsv3time(&(before_vap->va_mtime), tl); tl += 2; txdr_nfsv3time(&(before_vap->va_ctime), tl); } *bposp = bpos; *mbp = mb; nfsm_srvpostopattr(nfsd, after_ret, after_vap, mbp, bposp); } void nfsm_srvpostopattr(nfsd, after_ret, after_vap, mbp, bposp) struct nfsrv_descript *nfsd; int after_ret; struct vattr *after_vap; struct mbuf **mbp; char **bposp; { register struct mbuf *mb = *mbp, *mb2; register char *bpos = *bposp; register u_int32_t *tl; register struct nfs_fattr *fp; if (after_ret) { nfsm_build(tl, u_int32_t *, NFSX_UNSIGNED); *tl = nfs_false; } else { nfsm_build(tl, u_int32_t *, NFSX_UNSIGNED + NFSX_V3FATTR); *tl++ = nfs_true; fp = (struct nfs_fattr *)tl; nfsm_srvfattr(nfsd, after_vap, fp); } *mbp = mb; *bposp = bpos; } void nfsm_srvfattr(nfsd, vap, fp) register struct nfsrv_descript *nfsd; register struct vattr *vap; register struct nfs_fattr *fp; { fp->fa_nlink = txdr_unsigned(vap->va_nlink); fp->fa_uid = txdr_unsigned(vap->va_uid); fp->fa_gid = txdr_unsigned(vap->va_gid); if (nfsd->nd_flag & ND_NFSV3) { fp->fa_type = vtonfsv3_type(vap->va_type); fp->fa_mode = vtonfsv3_mode(vap->va_mode); txdr_hyper(vap->va_size, &fp->fa3_size); txdr_hyper(vap->va_bytes, &fp->fa3_used); fp->fa3_rdev.specdata1 = txdr_unsigned(umajor(vap->va_rdev)); fp->fa3_rdev.specdata2 = txdr_unsigned(uminor(vap->va_rdev)); fp->fa3_fsid.nfsuquad[0] = 0; fp->fa3_fsid.nfsuquad[1] = txdr_unsigned(vap->va_fsid); fp->fa3_fileid.nfsuquad[0] = 0; fp->fa3_fileid.nfsuquad[1] = txdr_unsigned(vap->va_fileid); txdr_nfsv3time(&vap->va_atime, &fp->fa3_atime); txdr_nfsv3time(&vap->va_mtime, &fp->fa3_mtime); txdr_nfsv3time(&vap->va_ctime, &fp->fa3_ctime); } else { fp->fa_type = vtonfsv2_type(vap->va_type); fp->fa_mode = vtonfsv2_mode(vap->va_type, vap->va_mode); fp->fa2_size = txdr_unsigned(vap->va_size); fp->fa2_blocksize = txdr_unsigned(vap->va_blocksize); if (vap->va_type == VFIFO) fp->fa2_rdev = 0xffffffff; else fp->fa2_rdev = txdr_unsigned(vap->va_rdev); fp->fa2_blocks = txdr_unsigned(vap->va_bytes / NFS_FABLKSIZE); fp->fa2_fsid = txdr_unsigned(vap->va_fsid); fp->fa2_fileid = txdr_unsigned(vap->va_fileid); txdr_nfsv2time(&vap->va_atime, &fp->fa2_atime); txdr_nfsv2time(&vap->va_mtime, &fp->fa2_mtime); txdr_nfsv2time(&vap->va_ctime, &fp->fa2_ctime); } } /* * nfsrv_fhtovp() - convert a fh to a vnode ptr (optionally locked) * - look up fsid in mount list (if not found ret error) * - get vp and export rights by calling VFS_FHTOVP() * - if cred->cr_uid == 0 or MNT_EXPORTANON set it to credanon * - if not lockflag unlock it with VOP_UNLOCK() */ int nfsrv_fhtovp(fhp, lockflag, vpp, cred, slp, nam, rdonlyp, kerbflag, pubflag) fhandle_t *fhp; int lockflag; struct vnode **vpp; struct ucred *cred; struct nfssvc_sock *slp; struct sockaddr *nam; int *rdonlyp; int kerbflag; int pubflag; { struct proc *p = curproc; /* XXX */ register struct mount *mp; register int i; struct ucred *credanon; int error, exflags; #ifdef MNT_EXNORESPORT /* XXX needs mountd and /etc/exports help yet */ struct sockaddr_int *saddr; #endif 
*vpp = (struct vnode *)0; if (nfs_ispublicfh(fhp)) { if (!pubflag || !nfs_pub.np_valid) return (ESTALE); fhp = &nfs_pub.np_handle; } mp = vfs_getvfs(&fhp->fh_fsid); if (!mp) return (ESTALE); error = VFS_FHTOVP(mp, &fhp->fh_fid, nam, vpp, &exflags, &credanon); if (error) return (error); #ifdef MNT_EXNORESPORT if (!(exflags & (MNT_EXNORESPORT|MNT_EXPUBLIC))) { saddr = (struct sockaddr_in *)nam; if (saddr->sin_family == AF_INET && ntohs(saddr->sin_port) >= IPPORT_RESERVED) { vput(*vpp); return (NFSERR_AUTHERR | AUTH_TOOWEAK); } } #endif /* * Check/setup credentials. */ if (exflags & MNT_EXKERB) { if (!kerbflag) { vput(*vpp); return (NFSERR_AUTHERR | AUTH_TOOWEAK); } } else if (kerbflag) { vput(*vpp); return (NFSERR_AUTHERR | AUTH_TOOWEAK); } else if (cred->cr_uid == 0 || (exflags & MNT_EXPORTANON)) { cred->cr_uid = credanon->cr_uid; for (i = 0; i < credanon->cr_ngroups && i < NGROUPS; i++) cred->cr_groups[i] = credanon->cr_groups[i]; cred->cr_ngroups = i; } if (exflags & MNT_EXRDONLY) *rdonlyp = 1; else *rdonlyp = 0; nfsrv_object_create(*vpp); if (!lockflag) VOP_UNLOCK(*vpp, 0, p); return (0); } /* * WebNFS: check if a filehandle is a public filehandle. For v3, this * means a length of 0, for v2 it means all zeroes. nfsm_srvmtofh has * transformed this to all zeroes in both cases, so check for it. */ int nfs_ispublicfh(fhp) fhandle_t *fhp; { char *cp = (char *)fhp; int i; for (i = 0; i < NFSX_V3FH; i++) if (*cp++ != 0) return (FALSE); return (TRUE); } #endif /* NFS_NOSERVER */ /* * This function compares two net addresses by family and returns TRUE * if they are the same host. * If there is any doubt, return FALSE. * The AF_INET family is handled as a special case so that address mbufs * don't need to be saved to store "struct in_addr", which is only 4 bytes. */ int netaddr_match(family, haddr, nam) int family; union nethostaddr *haddr; struct sockaddr *nam; { register struct sockaddr_in *inetaddr; switch (family) { case AF_INET: inetaddr = (struct sockaddr_in *)nam; if (inetaddr->sin_family == AF_INET && inetaddr->sin_addr.s_addr == haddr->had_inetaddr) return (1); break; #ifdef ISO case AF_ISO: { register struct sockaddr_iso *isoaddr1, *isoaddr2; isoaddr1 = (struct sockaddr_iso *)nam; isoaddr2 = (struct sockaddr_iso *)haddr->had_nam; if (isoaddr1->siso_family == AF_ISO && isoaddr1->siso_nlen > 0 && isoaddr1->siso_nlen == isoaddr2->siso_nlen && SAME_ISOADDR(isoaddr1, isoaddr2)) return (1); break; } #endif /* ISO */ default: break; }; return (0); } static nfsuint64 nfs_nullcookie = { { 0, 0 } }; /* * This function finds the directory cookie that corresponds to the * logical byte offset given. 
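
/*
 * Illustrative aside (not part of this commit): nfs_getcookie() below turns
 * a logical directory offset into a slot in a linked list of fixed-size
 * cookie blocks: offset 0 maps to a well-known null cookie, and every
 * directory block beyond that advances one slot.  A sketch of the index
 * arithmetic with hypothetical names (the 512 and 31 here are illustrative
 * stand-ins for NFS_DIRBLKSIZ and NFSNUMCOOKIES):
 */
#include <stdio.h>

#define EX_DIRBLKSIZ	512
#define EX_NUMCOOKIES	31

/* Returns -1 for the null cookie, otherwise fills in block/slot indices. */
static int
example_cookie_slot(long long off, int *block, int *slot)
{
	long long pos = off / EX_DIRBLKSIZ;

	if (off < 0 || pos == 0)
		return (-1);                       /* offset 0: use the null cookie */
	pos--;
	*block = (int)(pos / EX_NUMCOOKIES);       /* which list element */
	*slot = (int)(pos % EX_NUMCOOKIES);        /* index inside that element */
	return (0);
}

int
main(void)
{
	int b, s;

	if (example_cookie_slot(512 * 40, &b, &s) == 0)
		printf("block %d slot %d\n", b, s);    /* prints: block 1 slot 8 */
	return (0);
}
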
*/ nfsuint64 * nfs_getcookie(np, off, add) register struct nfsnode *np; off_t off; int add; { register struct nfsdmap *dp, *dp2; register int pos; pos = (uoff_t)off / NFS_DIRBLKSIZ; if (pos == 0 || off < 0) { #ifdef DIAGNOSTIC if (add) panic("nfs getcookie add at <= 0"); #endif return (&nfs_nullcookie); } pos--; dp = np->n_cookies.lh_first; if (!dp) { if (add) { MALLOC(dp, struct nfsdmap *, sizeof (struct nfsdmap), M_NFSDIROFF, M_WAITOK); dp->ndm_eocookie = 0; LIST_INSERT_HEAD(&np->n_cookies, dp, ndm_list); } else return ((nfsuint64 *)0); } while (pos >= NFSNUMCOOKIES) { pos -= NFSNUMCOOKIES; if (dp->ndm_list.le_next) { if (!add && dp->ndm_eocookie < NFSNUMCOOKIES && pos >= dp->ndm_eocookie) return ((nfsuint64 *)0); dp = dp->ndm_list.le_next; } else if (add) { MALLOC(dp2, struct nfsdmap *, sizeof (struct nfsdmap), M_NFSDIROFF, M_WAITOK); dp2->ndm_eocookie = 0; LIST_INSERT_AFTER(dp, dp2, ndm_list); dp = dp2; } else return ((nfsuint64 *)0); } if (pos >= dp->ndm_eocookie) { if (add) dp->ndm_eocookie = pos + 1; else return ((nfsuint64 *)0); } return (&dp->ndm_cookies[pos]); } /* * Invalidate cached directory information, except for the actual directory * blocks (which are invalidated separately). * Done mainly to avoid the use of stale offset cookies. */ void nfs_invaldir(vp) register struct vnode *vp; { register struct nfsnode *np = VTONFS(vp); #ifdef DIAGNOSTIC if (vp->v_type != VDIR) panic("nfs: invaldir not dir"); #endif np->n_direofoffset = 0; np->n_cookieverf.nfsuquad[0] = 0; np->n_cookieverf.nfsuquad[1] = 0; if (np->n_cookies.lh_first) np->n_cookies.lh_first->ndm_eocookie = 0; } /* * The write verifier has changed (probably due to a server reboot), so all * B_NEEDCOMMIT blocks will have to be written again. Since they are on the * dirty block list as B_DELWRI, all this takes is clearing the B_NEEDCOMMIT * flag. Once done the new write verifier can be set for the mount point. */ void nfs_clearcommit(mp) struct mount *mp; { register struct vnode *vp, *nvp; register struct buf *bp, *nbp; int s; s = splbio(); loop: for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) { if (vp->v_mount != mp) /* Paranoia */ goto loop; nvp = vp->v_mntvnodes.le_next; for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { nbp = TAILQ_NEXT(bp, b_vnbufs); if (BUF_REFCNT(bp) == 0 && (bp->b_flags & (B_DELWRI | B_NEEDCOMMIT)) == (B_DELWRI | B_NEEDCOMMIT)) bp->b_flags &= ~B_NEEDCOMMIT; } } splx(s); } #ifndef NFS_NOSERVER /* * Map errnos to NFS error numbers. For Version 3 also filter out error * numbers not specified for the associated procedure. */ int nfsrv_errmap(nd, err) struct nfsrv_descript *nd; register int err; { register short *defaulterrp, *errp; if (nd->nd_flag & ND_NFSV3) { if (nd->nd_procnum <= NFSPROC_COMMIT) { errp = defaulterrp = nfsrv_v3errmap[nd->nd_procnum]; while (*++errp) { if (*errp == err) return (err); else if (*errp > err) break; } return ((int)*defaulterrp); } else return (err & 0xffff); } if (err <= ELAST) return ((int)nfsrv_v2errmap[err - 1]); return (NFSERR_IO); } int nfsrv_object_create(vp) struct vnode *vp; { if (vp == NULL || vp->v_type != VREG) return (1); return (vfs_object_create(vp, curproc, curproc ? curproc->p_ucred : NULL)); } /* * Sort the group list in increasing numerical order. * (Insertion sort by Chris Torek, who was grossed out by the bubble sort * that used to be here.) */ void nfsrvw_sort(list, num) register gid_t *list; register int num; { register int i, j; gid_t v; /* Insertion sort. 
*/ for (i = 1; i < num; i++) { v = list[i]; /* find correct slot for value v, moving others up */ for (j = i; --j >= 0 && v < list[j];) list[j + 1] = list[j]; list[j + 1] = v; } } /* * copy credentials making sure that the result can be compared with bcmp(). */ void nfsrv_setcred(incred, outcred) register struct ucred *incred, *outcred; { register int i; bzero((caddr_t)outcred, sizeof (struct ucred)); outcred->cr_ref = 1; outcred->cr_uid = incred->cr_uid; outcred->cr_ngroups = incred->cr_ngroups; for (i = 0; i < incred->cr_ngroups; i++) outcred->cr_groups[i] = incred->cr_groups[i]; nfsrvw_sort(outcred->cr_groups, outcred->cr_ngroups); } #endif /* NFS_NOSERVER */ Index: head/sys/nfs/nfs_subs.c =================================================================== --- head/sys/nfs/nfs_subs.c (revision 49534) +++ head/sys/nfs/nfs_subs.c (revision 49535) @@ -1,2281 +1,2280 @@ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Rick Macklem at The University of Guelph. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)nfs_subs.c 8.8 (Berkeley) 5/22/95 - * $Id: nfs_subs.c,v 1.78 1999/06/27 11:44:19 peter Exp $ + * $Id: nfs_subs.c,v 1.79 1999/07/17 18:43:47 phk Exp $ */ /* * These functions support the macros and help fiddle mbuf chains for * the nfs op functions. They do things like create the rpc header and * copy data between mbuf chains and uio lists. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include - -#include #include #ifdef ISO #include #endif /* * Data items converted to xdr at startup, since they are constant * This is kinda hokey, but may save a little time doing byte swaps */ u_int32_t nfs_xdrneg1; u_int32_t rpc_call, rpc_vers, rpc_reply, rpc_msgdenied, rpc_autherr, rpc_mismatch, rpc_auth_unix, rpc_msgaccepted, rpc_auth_kerb; u_int32_t nfs_prog, nqnfs_prog, nfs_true, nfs_false; /* And other global data */ static u_int32_t nfs_xid = 0; static enum vtype nv2tov_type[8]= { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VNON, VNON }; enum vtype nv3tov_type[8]= { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO }; int nfs_ticks; int nfs_pbuf_freecnt = -1; /* start out unlimited */ struct nfs_reqq nfs_reqq; struct nfssvc_sockhead nfssvc_sockhead; int nfssvc_sockhead_flag; struct nfsd_head nfsd_head; int nfsd_head_flag; struct nfs_bufq nfs_bufq; struct nqtimerhead nqtimerhead; struct nqfhhashhead *nqfhhashtbl; u_long nqfhhash; static void (*nfs_prev_lease_updatetime) __P((int)); static int nfs_prev_nfssvc_sy_narg; static sy_call_t *nfs_prev_nfssvc_sy_call; #ifndef NFS_NOSERVER static vop_t *nfs_prev_vop_lease_check; static int nfs_prev_getfh_sy_narg; static sy_call_t *nfs_prev_getfh_sy_call; /* * Mapping of old NFS Version 2 RPC numbers to generic numbers. */ int nfsv3_procid[NFS_NPROCS] = { NFSPROC_NULL, NFSPROC_GETATTR, NFSPROC_SETATTR, NFSPROC_NOOP, NFSPROC_LOOKUP, NFSPROC_READLINK, NFSPROC_READ, NFSPROC_NOOP, NFSPROC_WRITE, NFSPROC_CREATE, NFSPROC_REMOVE, NFSPROC_RENAME, NFSPROC_LINK, NFSPROC_SYMLINK, NFSPROC_MKDIR, NFSPROC_RMDIR, NFSPROC_READDIR, NFSPROC_FSSTAT, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP }; #endif /* NFS_NOSERVER */ /* * and the reverse mapping from generic to Version 2 procedure numbers */ int nfsv2_procid[NFS_NPROCS] = { NFSV2PROC_NULL, NFSV2PROC_GETATTR, NFSV2PROC_SETATTR, NFSV2PROC_LOOKUP, NFSV2PROC_NOOP, NFSV2PROC_READLINK, NFSV2PROC_READ, NFSV2PROC_WRITE, NFSV2PROC_CREATE, NFSV2PROC_MKDIR, NFSV2PROC_SYMLINK, NFSV2PROC_CREATE, NFSV2PROC_REMOVE, NFSV2PROC_RMDIR, NFSV2PROC_RENAME, NFSV2PROC_LINK, NFSV2PROC_READDIR, NFSV2PROC_NOOP, NFSV2PROC_STATFS, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, }; #ifndef NFS_NOSERVER /* * Maps errno values to nfs error numbers. * Use NFSERR_IO as the catch all for ones not specifically defined in * RFC 1094. 
*/ static u_char nfsrv_v2errmap[ELAST] = { NFSERR_PERM, NFSERR_NOENT, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_NXIO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_EXIST, NFSERR_IO, NFSERR_NODEV, NFSERR_NOTDIR, NFSERR_ISDIR, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_FBIG, NFSERR_NOSPC, NFSERR_IO, NFSERR_ROFS, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_NAMETOL, NFSERR_IO, NFSERR_IO, NFSERR_NOTEMPTY, NFSERR_IO, NFSERR_IO, NFSERR_DQUOT, NFSERR_STALE, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO /* << Last is 86 */ }; /* * Maps errno values to nfs error numbers. * Although it is not obvious whether or not NFS clients really care if * a returned error value is in the specified list for the procedure, the * safest thing to do is filter them appropriately. For Version 2, the * X/Open XNFS document is the only specification that defines error values * for each RPC (The RFC simply lists all possible error values for all RPCs), * so I have decided to not do this for Version 2. * The first entry is the default error return and the rest are the valid * errors for that RPC in increasing numeric order. */ static short nfsv3err_null[] = { 0, 0, }; static short nfsv3err_getattr[] = { NFSERR_IO, NFSERR_IO, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_setattr[] = { NFSERR_IO, NFSERR_PERM, NFSERR_IO, NFSERR_ACCES, NFSERR_INVAL, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOT_SYNC, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_lookup[] = { NFSERR_IO, NFSERR_NOENT, NFSERR_IO, NFSERR_ACCES, NFSERR_NOTDIR, NFSERR_NAMETOL, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_access[] = { NFSERR_IO, NFSERR_IO, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_readlink[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_INVAL, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_read[] = { NFSERR_IO, NFSERR_IO, NFSERR_NXIO, NFSERR_ACCES, NFSERR_INVAL, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_write[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_INVAL, NFSERR_FBIG, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_create[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_mkdir[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_symlink[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_mknod[] = { 
NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, NFSERR_BADTYPE, 0, }; static short nfsv3err_remove[] = { NFSERR_IO, NFSERR_NOENT, NFSERR_IO, NFSERR_ACCES, NFSERR_NOTDIR, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_rmdir[] = { NFSERR_IO, NFSERR_NOENT, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_INVAL, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_NOTEMPTY, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_rename[] = { NFSERR_IO, NFSERR_NOENT, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_XDEV, NFSERR_NOTDIR, NFSERR_ISDIR, NFSERR_INVAL, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_MLINK, NFSERR_NAMETOL, NFSERR_NOTEMPTY, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_link[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_XDEV, NFSERR_NOTDIR, NFSERR_INVAL, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_MLINK, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_readdir[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_NOTDIR, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_BAD_COOKIE, NFSERR_TOOSMALL, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_readdirplus[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_NOTDIR, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_BAD_COOKIE, NFSERR_NOTSUPP, NFSERR_TOOSMALL, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_fsstat[] = { NFSERR_IO, NFSERR_IO, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_fsinfo[] = { NFSERR_STALE, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_pathconf[] = { NFSERR_STALE, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_commit[] = { NFSERR_IO, NFSERR_IO, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short *nfsrv_v3errmap[] = { nfsv3err_null, nfsv3err_getattr, nfsv3err_setattr, nfsv3err_lookup, nfsv3err_access, nfsv3err_readlink, nfsv3err_read, nfsv3err_write, nfsv3err_create, nfsv3err_mkdir, nfsv3err_symlink, nfsv3err_mknod, nfsv3err_remove, nfsv3err_rmdir, nfsv3err_rename, nfsv3err_link, nfsv3err_readdir, nfsv3err_readdirplus, nfsv3err_fsstat, nfsv3err_fsinfo, nfsv3err_pathconf, nfsv3err_commit, }; #endif /* NFS_NOSERVER */ extern struct nfsrtt nfsrtt; extern time_t nqnfsstarttime; extern int nqsrv_clockskew; extern int nqsrv_writeslack; extern int nqsrv_maxlease; extern struct nfsstats nfsstats; extern int nqnfs_piggy[NFS_NPROCS]; extern nfstype nfsv2_type[9]; extern nfstype nfsv3_type[9]; extern struct nfsnodehashhead *nfsnodehashtbl; extern u_long nfsnodehash; struct getfh_args; extern int getfh(struct proc *, struct getfh_args *, int *); struct nfssvc_args; extern int nfssvc(struct proc *, struct nfssvc_args *, int *); LIST_HEAD(nfsnodehashhead, nfsnode); int nfs_webnamei __P((struct nameidata *, struct vnode *, struct proc *)); u_quad_t nfs_curusec() { struct timeval tv; getmicrotime(&tv); return ((u_quad_t)tv.tv_sec * 1000000 + (u_quad_t)tv.tv_usec); } /* * Create the header for an rpc request packet * The hsiz is the size of the rest of the nfs request header. 
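/*
 * The nfsv3err_* tables above follow the convention stated in the
 * comment: entry 0 holds the default reply status and the remaining
 * entries list the errors permitted for that procedure in increasing
 * order, terminated by 0.  A standalone sketch of how such a list is
 * consulted (the real filtering is done by nfsrv_errmap() further down;
 * the numeric values here are placeholders, not the kernel constants).
 */
#include <stdio.h>

static int
filter_v3_error(const short *list, int err)
{
	const short *p = list;

	while (*++p) {			/* slot 0 is the default, skip it */
		if (*p == err)
			return (err);	/* err is permitted as-is */
		if (*p > err)		/* list is sorted, stop early */
			break;
	}
	return ((int)list[0]);		/* fall back to the default status */
}

int
main(void)
{
	/* Shape mirrors nfsv3err_getattr above; values are placeholders. */
	static const short getattr_errs[] = { 5, 5, 70, 10001, 10006, 0 };

	printf("%d\n", filter_v3_error(getattr_errs, 70));	/* listed: 70 */
	printf("%d\n", filter_v3_error(getattr_errs, 13));	/* not listed: 5 */
	return (0);
}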
* (just used to decide if a cluster is a good idea) */ struct mbuf * nfsm_reqh(vp, procid, hsiz, bposp) struct vnode *vp; u_long procid; int hsiz; caddr_t *bposp; { register struct mbuf *mb; register u_int32_t *tl; register caddr_t bpos; struct mbuf *mb2; struct nfsmount *nmp; int nqflag; MGET(mb, M_WAIT, MT_DATA); if (hsiz >= MINCLSIZE) MCLGET(mb, M_WAIT); mb->m_len = 0; bpos = mtod(mb, caddr_t); /* * For NQNFS, add lease request. */ if (vp) { nmp = VFSTONFS(vp->v_mount); if (nmp->nm_flag & NFSMNT_NQNFS) { nqflag = NQNFS_NEEDLEASE(vp, procid); if (nqflag) { nfsm_build(tl, u_int32_t *, 2*NFSX_UNSIGNED); *tl++ = txdr_unsigned(nqflag); *tl = txdr_unsigned(nmp->nm_leaseterm); } else { nfsm_build(tl, u_int32_t *, NFSX_UNSIGNED); *tl = 0; } } } /* Finally, return values */ *bposp = bpos; return (mb); } /* * Build the RPC header and fill in the authorization info. * The authorization string argument is only used when the credentials * come from outside of the kernel. * Returns the head of the mbuf list. */ struct mbuf * nfsm_rpchead(cr, nmflag, procid, auth_type, auth_len, auth_str, verf_len, verf_str, mrest, mrest_len, mbp, xidp) register struct ucred *cr; int nmflag; int procid; int auth_type; int auth_len; char *auth_str; int verf_len; char *verf_str; struct mbuf *mrest; int mrest_len; struct mbuf **mbp; u_int32_t *xidp; { register struct mbuf *mb; register u_int32_t *tl; register caddr_t bpos; register int i; struct mbuf *mreq, *mb2; int siz, grpsiz, authsiz; authsiz = nfsm_rndup(auth_len); MGETHDR(mb, M_WAIT, MT_DATA); if ((authsiz + 10 * NFSX_UNSIGNED) >= MINCLSIZE) { MCLGET(mb, M_WAIT); } else if ((authsiz + 10 * NFSX_UNSIGNED) < MHLEN) { MH_ALIGN(mb, authsiz + 10 * NFSX_UNSIGNED); } else { MH_ALIGN(mb, 8 * NFSX_UNSIGNED); } mb->m_len = 0; mreq = mb; bpos = mtod(mb, caddr_t); /* * First the RPC header. */ nfsm_build(tl, u_int32_t *, 8 * NFSX_UNSIGNED); /* Get a pretty random xid to start with */ if (!nfs_xid) nfs_xid = random(); /* * Skip zero xid if it should ever happen. */ if (++nfs_xid == 0) nfs_xid++; *tl++ = *xidp = txdr_unsigned(nfs_xid); *tl++ = rpc_call; *tl++ = rpc_vers; if (nmflag & NFSMNT_NQNFS) { *tl++ = txdr_unsigned(NQNFS_PROG); *tl++ = txdr_unsigned(NQNFS_VER3); } else { *tl++ = txdr_unsigned(NFS_PROG); if (nmflag & NFSMNT_NFSV3) *tl++ = txdr_unsigned(NFS_VER3); else *tl++ = txdr_unsigned(NFS_VER2); } if (nmflag & NFSMNT_NFSV3) *tl++ = txdr_unsigned(procid); else *tl++ = txdr_unsigned(nfsv2_procid[procid]); /* * And then the authorization cred. */ *tl++ = txdr_unsigned(auth_type); *tl = txdr_unsigned(authsiz); switch (auth_type) { case RPCAUTH_UNIX: nfsm_build(tl, u_int32_t *, auth_len); *tl++ = 0; /* stamp ?? */ *tl++ = 0; /* NULL hostname */ *tl++ = txdr_unsigned(cr->cr_uid); *tl++ = txdr_unsigned(cr->cr_groups[0]); grpsiz = (auth_len >> 2) - 5; *tl++ = txdr_unsigned(grpsiz); for (i = 1; i <= grpsiz; i++) *tl++ = txdr_unsigned(cr->cr_groups[i]); break; case RPCAUTH_KERB4: siz = auth_len; while (siz > 0) { if (M_TRAILINGSPACE(mb) == 0) { MGET(mb2, M_WAIT, MT_DATA); if (siz >= MINCLSIZE) MCLGET(mb2, M_WAIT); mb->m_next = mb2; mb = mb2; mb->m_len = 0; bpos = mtod(mb, caddr_t); } i = min(siz, M_TRAILINGSPACE(mb)); bcopy(auth_str, bpos, i); mb->m_len += i; auth_str += i; bpos += i; siz -= i; } if ((siz = (nfsm_rndup(auth_len) - auth_len)) > 0) { for (i = 0; i < siz; i++) *bpos++ = '\0'; mb->m_len += siz; } break; }; /* * And the verifier... 
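/*
 * nfsm_rpchead() above seeds the transaction id lazily from random(),
 * then increments it per request and never hands out zero, so a zero
 * xid can keep meaning "unused".  The same scheme as a standalone
 * sketch; the function name and the single static counter are
 * assumptions of the sketch, and the value is left in host order here
 * (the kernel stores it with txdr_unsigned()).
 */
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

static uint32_t sketch_xid;		/* 0 means "not seeded yet" */

static uint32_t
next_xid(void)
{
	if (sketch_xid == 0)		/* pick a pretty random start once */
		sketch_xid = (uint32_t)random();
	if (++sketch_xid == 0)		/* skip zero if it ever comes up */
		sketch_xid++;
	return (sketch_xid);
}

int
main(void)
{
	for (int i = 0; i < 3; i++)
		printf("xid %u\n", next_xid());
	return (0);
}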
*/ nfsm_build(tl, u_int32_t *, 2 * NFSX_UNSIGNED); if (verf_str) { *tl++ = txdr_unsigned(RPCAUTH_KERB4); *tl = txdr_unsigned(verf_len); siz = verf_len; while (siz > 0) { if (M_TRAILINGSPACE(mb) == 0) { MGET(mb2, M_WAIT, MT_DATA); if (siz >= MINCLSIZE) MCLGET(mb2, M_WAIT); mb->m_next = mb2; mb = mb2; mb->m_len = 0; bpos = mtod(mb, caddr_t); } i = min(siz, M_TRAILINGSPACE(mb)); bcopy(verf_str, bpos, i); mb->m_len += i; verf_str += i; bpos += i; siz -= i; } if ((siz = (nfsm_rndup(verf_len) - verf_len)) > 0) { for (i = 0; i < siz; i++) *bpos++ = '\0'; mb->m_len += siz; } } else { *tl++ = txdr_unsigned(RPCAUTH_NULL); *tl = 0; } mb->m_next = mrest; mreq->m_pkthdr.len = authsiz + 10 * NFSX_UNSIGNED + mrest_len; mreq->m_pkthdr.rcvif = (struct ifnet *)0; *mbp = mb; return (mreq); } /* * copies mbuf chain to the uio scatter/gather list */ int nfsm_mbuftouio(mrep, uiop, siz, dpos) struct mbuf **mrep; register struct uio *uiop; int siz; caddr_t *dpos; { register char *mbufcp, *uiocp; register int xfer, left, len; register struct mbuf *mp; long uiosiz, rem; int error = 0; mp = *mrep; mbufcp = *dpos; len = mtod(mp, caddr_t)+mp->m_len-mbufcp; rem = nfsm_rndup(siz)-siz; while (siz > 0) { if (uiop->uio_iovcnt <= 0 || uiop->uio_iov == NULL) return (EFBIG); left = uiop->uio_iov->iov_len; uiocp = uiop->uio_iov->iov_base; if (left > siz) left = siz; uiosiz = left; while (left > 0) { while (len == 0) { mp = mp->m_next; if (mp == NULL) return (EBADRPC); mbufcp = mtod(mp, caddr_t); len = mp->m_len; } xfer = (left > len) ? len : left; #ifdef notdef /* Not Yet.. */ if (uiop->uio_iov->iov_op != NULL) (*(uiop->uio_iov->iov_op)) (mbufcp, uiocp, xfer); else #endif if (uiop->uio_segflg == UIO_SYSSPACE) bcopy(mbufcp, uiocp, xfer); else copyout(mbufcp, uiocp, xfer); left -= xfer; len -= xfer; mbufcp += xfer; uiocp += xfer; uiop->uio_offset += xfer; uiop->uio_resid -= xfer; } if (uiop->uio_iov->iov_len <= siz) { uiop->uio_iovcnt--; uiop->uio_iov++; } else { uiop->uio_iov->iov_base += uiosiz; uiop->uio_iov->iov_len -= uiosiz; } siz -= uiosiz; } *dpos = mbufcp; *mrep = mp; if (rem > 0) { if (len < rem) error = nfs_adv(mrep, dpos, rem, len); else *dpos += rem; } return (error); } /* * copies a uio scatter/gather list to an mbuf chain. * NOTE: can ony handle iovcnt == 1 */ int nfsm_uiotombuf(uiop, mq, siz, bpos) register struct uio *uiop; struct mbuf **mq; int siz; caddr_t *bpos; { register char *uiocp; register struct mbuf *mp, *mp2; register int xfer, left, mlen; int uiosiz, clflg, rem; char *cp; #ifdef DIAGNOSTIC if (uiop->uio_iovcnt != 1) panic("nfsm_uiotombuf: iovcnt != 1"); #endif if (siz > MLEN) /* or should it >= MCLBYTES ?? */ clflg = 1; else clflg = 0; rem = nfsm_rndup(siz)-siz; mp = mp2 = *mq; while (siz > 0) { left = uiop->uio_iov->iov_len; uiocp = uiop->uio_iov->iov_base; if (left > siz) left = siz; uiosiz = left; while (left > 0) { mlen = M_TRAILINGSPACE(mp); if (mlen == 0) { MGET(mp, M_WAIT, MT_DATA); if (clflg) MCLGET(mp, M_WAIT); mp->m_len = 0; mp2->m_next = mp; mp2 = mp; mlen = M_TRAILINGSPACE(mp); } xfer = (left > mlen) ? mlen : left; #ifdef notdef /* Not Yet.. 
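/*
 * nfsm_mbuftouio() above computes rem = nfsm_rndup(siz) - siz and skips
 * that many bytes after the copy, because XDR opaque data is always
 * padded out to a 4-byte boundary on the wire.  A tiny standalone sketch
 * of just that rounding; the macro below is local to the sketch and only
 * mirrors what nfsm_rndup() does.
 */
#include <stdio.h>

#define SKETCH_RNDUP(n)	(((n) + 3) & ~3)	/* next multiple of 4 */

int
main(void)
{
	for (int siz = 1; siz <= 8; siz++) {
		int rem = SKETCH_RNDUP(siz) - siz;	/* pad bytes to skip */
		printf("payload %d -> wire %d (skip %d)\n",
		    siz, SKETCH_RNDUP(siz), rem);
	}
	return (0);
}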
*/ if (uiop->uio_iov->iov_op != NULL) (*(uiop->uio_iov->iov_op)) (uiocp, mtod(mp, caddr_t)+mp->m_len, xfer); else #endif if (uiop->uio_segflg == UIO_SYSSPACE) bcopy(uiocp, mtod(mp, caddr_t)+mp->m_len, xfer); else copyin(uiocp, mtod(mp, caddr_t)+mp->m_len, xfer); mp->m_len += xfer; left -= xfer; uiocp += xfer; uiop->uio_offset += xfer; uiop->uio_resid -= xfer; } uiop->uio_iov->iov_base += uiosiz; uiop->uio_iov->iov_len -= uiosiz; siz -= uiosiz; } if (rem > 0) { if (rem > M_TRAILINGSPACE(mp)) { MGET(mp, M_WAIT, MT_DATA); mp->m_len = 0; mp2->m_next = mp; } cp = mtod(mp, caddr_t)+mp->m_len; for (left = 0; left < rem; left++) *cp++ = '\0'; mp->m_len += rem; *bpos = cp; } else *bpos = mtod(mp, caddr_t)+mp->m_len; *mq = mp; return (0); } /* * Help break down an mbuf chain by setting the first siz bytes contiguous * pointed to by returned val. * This is used by the macros nfsm_dissect and nfsm_dissecton for tough * cases. (The macros use the vars. dpos and dpos2) */ int nfsm_disct(mdp, dposp, siz, left, cp2) struct mbuf **mdp; caddr_t *dposp; int siz; int left; caddr_t *cp2; { register struct mbuf *mp, *mp2; register int siz2, xfer; register caddr_t p; mp = *mdp; while (left == 0) { *mdp = mp = mp->m_next; if (mp == NULL) return (EBADRPC); left = mp->m_len; *dposp = mtod(mp, caddr_t); } if (left >= siz) { *cp2 = *dposp; *dposp += siz; } else if (mp->m_next == NULL) { return (EBADRPC); } else if (siz > MHLEN) { panic("nfs S too big"); } else { MGET(mp2, M_WAIT, MT_DATA); mp2->m_next = mp->m_next; mp->m_next = mp2; mp->m_len -= left; mp = mp2; *cp2 = p = mtod(mp, caddr_t); bcopy(*dposp, p, left); /* Copy what was left */ siz2 = siz-left; p += left; mp2 = mp->m_next; /* Loop around copying up the siz2 bytes */ while (siz2 > 0) { if (mp2 == NULL) return (EBADRPC); xfer = (siz2 > mp2->m_len) ? mp2->m_len : siz2; if (xfer > 0) { bcopy(mtod(mp2, caddr_t), p, xfer); NFSMADV(mp2, xfer); mp2->m_len -= xfer; p += xfer; siz2 -= xfer; } if (siz2 > 0) mp2 = mp2->m_next; } mp->m_len = siz; *mdp = mp2; *dposp = mtod(mp2, caddr_t); } return (0); } /* * Advance the position in the mbuf chain. */ int nfs_adv(mdp, dposp, offs, left) struct mbuf **mdp; caddr_t *dposp; int offs; int left; { register struct mbuf *m; register int s; m = *mdp; s = left; while (s < offs) { offs -= s; m = m->m_next; if (m == NULL) return (EBADRPC); s = m->m_len; } *mdp = m; *dposp = mtod(m, caddr_t)+offs; return (0); } /* * Copy a string into mbufs for the hard cases... 
*/ int nfsm_strtmbuf(mb, bpos, cp, siz) struct mbuf **mb; char **bpos; const char *cp; long siz; { register struct mbuf *m1 = NULL, *m2; long left, xfer, len, tlen; u_int32_t *tl; int putsize; putsize = 1; m2 = *mb; left = M_TRAILINGSPACE(m2); if (left > 0) { tl = ((u_int32_t *)(*bpos)); *tl++ = txdr_unsigned(siz); putsize = 0; left -= NFSX_UNSIGNED; m2->m_len += NFSX_UNSIGNED; if (left > 0) { bcopy(cp, (caddr_t) tl, left); siz -= left; cp += left; m2->m_len += left; left = 0; } } /* Loop around adding mbufs */ while (siz > 0) { MGET(m1, M_WAIT, MT_DATA); if (siz > MLEN) MCLGET(m1, M_WAIT); m1->m_len = NFSMSIZ(m1); m2->m_next = m1; m2 = m1; tl = mtod(m1, u_int32_t *); tlen = 0; if (putsize) { *tl++ = txdr_unsigned(siz); m1->m_len -= NFSX_UNSIGNED; tlen = NFSX_UNSIGNED; putsize = 0; } if (siz < m1->m_len) { len = nfsm_rndup(siz); xfer = siz; if (xfer < len) *(tl+(xfer>>2)) = 0; } else { xfer = len = m1->m_len; } bcopy(cp, (caddr_t) tl, xfer); m1->m_len = len+tlen; siz -= xfer; cp += xfer; } *mb = m1; *bpos = mtod(m1, caddr_t)+m1->m_len; return (0); } /* * Called once to initialize data structures... */ int nfs_init(vfsp) struct vfsconf *vfsp; { register int i; nfsmount_zone = zinit("NFSMOUNT", sizeof(struct nfsmount), 0, 0, 1); /* * Check to see if major data structures haven't bloated. */ if (sizeof (struct nfssvc_sock) > NFS_SVCALLOC) { printf("struct nfssvc_sock bloated (> %dbytes)\n",NFS_SVCALLOC); printf("Try reducing NFS_UIDHASHSIZ\n"); } if (sizeof (struct nfsuid) > NFS_UIDALLOC) { printf("struct nfsuid bloated (> %dbytes)\n",NFS_UIDALLOC); printf("Try unionizing the nu_nickname and nu_flag fields\n"); } nfs_mount_type = vfsp->vfc_typenum; nfsrtt.pos = 0; rpc_vers = txdr_unsigned(RPC_VER2); rpc_call = txdr_unsigned(RPC_CALL); rpc_reply = txdr_unsigned(RPC_REPLY); rpc_msgdenied = txdr_unsigned(RPC_MSGDENIED); rpc_msgaccepted = txdr_unsigned(RPC_MSGACCEPTED); rpc_mismatch = txdr_unsigned(RPC_MISMATCH); rpc_autherr = txdr_unsigned(RPC_AUTHERR); rpc_auth_unix = txdr_unsigned(RPCAUTH_UNIX); rpc_auth_kerb = txdr_unsigned(RPCAUTH_KERB4); nfs_prog = txdr_unsigned(NFS_PROG); nqnfs_prog = txdr_unsigned(NQNFS_PROG); nfs_true = txdr_unsigned(TRUE); nfs_false = txdr_unsigned(FALSE); nfs_xdrneg1 = txdr_unsigned(-1); nfs_ticks = (hz * NFS_TICKINTVL + 500) / 1000; if (nfs_ticks < 1) nfs_ticks = 1; /* Ensure async daemons disabled */ for (i = 0; i < NFS_MAXASYNCDAEMON; i++) { nfs_iodwant[i] = (struct proc *)0; nfs_iodmount[i] = (struct nfsmount *)0; } nfs_nhinit(); /* Init the nfsnode table */ #ifndef NFS_NOSERVER nfsrv_init(0); /* Init server data structures */ nfsrv_initcache(); /* Init the server request cache */ #endif /* * Initialize the nqnfs server stuff. */ if (nqnfsstarttime == 0) { nqnfsstarttime = boottime.tv_sec + nqsrv_maxlease + nqsrv_clockskew + nqsrv_writeslack; NQLOADNOVRAM(nqnfsstarttime); CIRCLEQ_INIT(&nqtimerhead); nqfhhashtbl = hashinit(NQLCHSZ, M_NQLEASE, &nqfhhash); } /* * Initialize reply list and start timer */ TAILQ_INIT(&nfs_reqq); nfs_timer(0); /* * Set up lease_check and lease_updatetime so that other parts * of the system can call us, if we are loadable. 
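/*
 * nfs_init() above derives nfs_ticks with (hz * NFS_TICKINTVL + 500) /
 * 1000, i.e. a round-to-nearest conversion of a millisecond interval
 * into clock ticks, clamped to at least one tick.  A standalone sketch,
 * assuming NFS_TICKINTVL is expressed in milliseconds (which the +500
 * rounding implies); the hz values below are only examples.
 */
#include <stdio.h>

static int
msec_to_ticks(int hz, int msec)
{
	int ticks = (hz * msec + 500) / 1000;	/* round to nearest tick */

	return (ticks < 1 ? 1 : ticks);		/* never less than one */
}

int
main(void)
{
	printf("hz=100,  5ms -> %d tick(s)\n", msec_to_ticks(100, 5));
	printf("hz=1000, 5ms -> %d tick(s)\n", msec_to_ticks(1000, 5));
	return (0);
}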
*/ #ifndef NFS_NOSERVER nfs_prev_vop_lease_check = default_vnodeop_p[VOFFSET(vop_lease)]; default_vnodeop_p[VOFFSET(vop_lease)] = (vop_t *)nqnfs_vop_lease_check; #endif nfs_prev_lease_updatetime = lease_updatetime; lease_updatetime = nfs_lease_updatetime; nfs_prev_nfssvc_sy_narg = sysent[SYS_nfssvc].sy_narg; sysent[SYS_nfssvc].sy_narg = 2; nfs_prev_nfssvc_sy_call = sysent[SYS_nfssvc].sy_call; sysent[SYS_nfssvc].sy_call = (sy_call_t *)nfssvc; #ifndef NFS_NOSERVER nfs_prev_getfh_sy_narg = sysent[SYS_getfh].sy_narg; sysent[SYS_getfh].sy_narg = 2; nfs_prev_getfh_sy_call = sysent[SYS_getfh].sy_call; sysent[SYS_getfh].sy_call = (sy_call_t *)getfh; #endif nfs_pbuf_freecnt = nswbuf / 2 + 1; return (0); } int nfs_uninit(vfsp) struct vfsconf *vfsp; { untimeout(nfs_timer, (void *)NULL, nfs_timer_handle); nfs_mount_type = -1; #ifndef NFS_NOSERVER default_vnodeop_p[VOFFSET(vop_lease)] = nfs_prev_vop_lease_check; #endif lease_updatetime = nfs_prev_lease_updatetime; sysent[SYS_nfssvc].sy_narg = nfs_prev_nfssvc_sy_narg; sysent[SYS_nfssvc].sy_call = nfs_prev_nfssvc_sy_call; #ifndef NFS_NOSERVER sysent[SYS_getfh].sy_narg = nfs_prev_getfh_sy_narg; sysent[SYS_getfh].sy_call = nfs_prev_getfh_sy_call; #endif return (0); } /* * Attribute cache routines. * nfs_loadattrcache() - loads or updates the cache contents from attributes * that are on the mbuf list * nfs_getattrcache() - returns valid attributes if found in cache, returns * error otherwise */ /* * Load the attribute cache (that lives in the nfsnode entry) with * the values on the mbuf list and * Iff vap not NULL * copy the attributes to *vaper */ int nfs_loadattrcache(vpp, mdp, dposp, vaper) struct vnode **vpp; struct mbuf **mdp; caddr_t *dposp; struct vattr *vaper; { register struct vnode *vp = *vpp; register struct vattr *vap; register struct nfs_fattr *fp; register struct nfsnode *np; register int32_t t1; caddr_t cp2; int error = 0, rdev; struct mbuf *md; enum vtype vtyp; u_short vmode; struct timespec mtime; struct vnode *nvp; int v3 = NFS_ISV3(vp); md = *mdp; t1 = (mtod(md, caddr_t) + md->m_len) - *dposp; if ((error = nfsm_disct(mdp, dposp, NFSX_FATTR(v3), t1, &cp2)) != 0) return (error); fp = (struct nfs_fattr *)cp2; if (v3) { vtyp = nfsv3tov_type(fp->fa_type); vmode = fxdr_unsigned(u_short, fp->fa_mode); rdev = makeudev(fxdr_unsigned(int, fp->fa3_rdev.specdata1), fxdr_unsigned(int, fp->fa3_rdev.specdata2)); fxdr_nfsv3time(&fp->fa3_mtime, &mtime); } else { vtyp = nfsv2tov_type(fp->fa_type); vmode = fxdr_unsigned(u_short, fp->fa_mode); /* * XXX * * The duplicate information returned in fa_type and fa_mode * is an ambiguity in the NFS version 2 protocol. * * VREG should be taken literally as a regular file. If a * server intents to return some type information differently * in the upper bits of the mode field (e.g. for sockets, or * FIFOs), NFSv2 mandates fa_type to be VNON. Anyway, we * leave the examination of the mode bits even in the VREG * case to avoid breakage for bogus servers, but we make sure * that there are actually type bits set in the upper part of * fa_mode (and failing that, trust the va_type field). * * NFSv3 cleared the issue, and requires fa_mode to not * contain any type information (while also introduing sockets * and FIFOs for fa_type). */ if (vtyp == VNON || (vtyp == VREG && (vmode & S_IFMT) != 0)) vtyp = IFTOVT(vmode); rdev = fxdr_unsigned(int32_t, fp->fa2_rdev); fxdr_nfsv2time(&fp->fa2_mtime, &mtime); /* * Really ugly NFSv2 kludge. 
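/*
 * The long XXX comment above describes how the v2 branch of
 * nfs_loadattrcache() disambiguates fa_type and fa_mode: trust fa_type
 * unless it is VNON, or a VREG that nevertheless carries type bits in
 * the mode, in which case the type is derived from the mode.  A hedged
 * standalone sketch of that decision; the enum and helper below are
 * local to the sketch and only stand in for the vtype values and
 * IFTOVT() used by the kernel.
 */
#include <stdio.h>
#include <sys/stat.h>

enum sketch_vtype { SK_VNON, SK_VREG, SK_VDIR, SK_VCHR, SK_VFIFO };

static enum sketch_vtype
mode_to_vtype(unsigned short mode)		/* stand-in for IFTOVT() */
{
	switch (mode & S_IFMT) {
	case S_IFREG:	return (SK_VREG);
	case S_IFDIR:	return (SK_VDIR);
	case S_IFCHR:	return (SK_VCHR);
	case S_IFIFO:	return (SK_VFIFO);
	default:	return (SK_VNON);
	}
}

static enum sketch_vtype
v2_resolve_type(enum sketch_vtype fa_type, unsigned short fa_mode)
{
	/* The rule sketched from the comment above. */
	if (fa_type == SK_VNON ||
	    (fa_type == SK_VREG && (fa_mode & S_IFMT) != 0))
		return (mode_to_vtype(fa_mode));
	return (fa_type);
}

int
main(void)
{
	/* A "regular file" whose mode bits actually say character device. */
	printf("%d\n", (int)v2_resolve_type(SK_VREG, S_IFCHR | 0644));
	return (0);
}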
*/ if (vtyp == VCHR && rdev == 0xffffffff) vtyp = VFIFO; } /* * If v_type == VNON it is a new node, so fill in the v_type, * n_mtime fields. Check to see if it represents a special * device, and if so, check for a possible alias. Once the * correct vnode has been obtained, fill in the rest of the * information. */ np = VTONFS(vp); if (vp->v_type != vtyp) { vp->v_type = vtyp; if (vp->v_type == VFIFO) { vp->v_op = fifo_nfsv2nodeop_p; } if (vp->v_type == VCHR || vp->v_type == VBLK) { vp->v_op = spec_nfsv2nodeop_p; nvp = checkalias(vp, rdev, vp->v_mount); if (nvp) { /* * Discard unneeded vnode, but save its nfsnode. * Since the nfsnode does not have a lock, its * vnode lock has to be carried over. */ nvp->v_vnlock = vp->v_vnlock; vp->v_vnlock = NULL; nvp->v_data = vp->v_data; vp->v_data = NULL; vp->v_op = spec_vnodeop_p; vrele(vp); vgone(vp); /* * Reinitialize aliased node. */ np->n_vnode = nvp; *vpp = vp = nvp; } } np->n_mtime = mtime.tv_sec; } vap = &np->n_vattr; vap->va_type = vtyp; vap->va_mode = (vmode & 07777); vap->va_rdev = rdev; vap->va_mtime = mtime; vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; if (v3) { vap->va_nlink = fxdr_unsigned(u_short, fp->fa_nlink); vap->va_uid = fxdr_unsigned(uid_t, fp->fa_uid); vap->va_gid = fxdr_unsigned(gid_t, fp->fa_gid); vap->va_size = fxdr_hyper(&fp->fa3_size); vap->va_blocksize = NFS_FABLKSIZE; vap->va_bytes = fxdr_hyper(&fp->fa3_used); vap->va_fileid = fxdr_unsigned(int32_t, fp->fa3_fileid.nfsuquad[1]); fxdr_nfsv3time(&fp->fa3_atime, &vap->va_atime); fxdr_nfsv3time(&fp->fa3_ctime, &vap->va_ctime); vap->va_flags = 0; vap->va_filerev = 0; } else { vap->va_nlink = fxdr_unsigned(u_short, fp->fa_nlink); vap->va_uid = fxdr_unsigned(uid_t, fp->fa_uid); vap->va_gid = fxdr_unsigned(gid_t, fp->fa_gid); vap->va_size = fxdr_unsigned(u_int32_t, fp->fa2_size); vap->va_blocksize = fxdr_unsigned(int32_t, fp->fa2_blocksize); vap->va_bytes = (u_quad_t)fxdr_unsigned(int32_t, fp->fa2_blocks) * NFS_FABLKSIZE; vap->va_fileid = fxdr_unsigned(int32_t, fp->fa2_fileid); fxdr_nfsv2time(&fp->fa2_atime, &vap->va_atime); vap->va_flags = 0; vap->va_ctime.tv_sec = fxdr_unsigned(u_int32_t, fp->fa2_ctime.nfsv2_sec); vap->va_ctime.tv_nsec = 0; vap->va_gen = fxdr_unsigned(u_int32_t,fp->fa2_ctime.nfsv2_usec); vap->va_filerev = 0; } if (vap->va_size != np->n_size) { if (vap->va_type == VREG) { if (np->n_flag & NMODIFIED) { if (vap->va_size < np->n_size) vap->va_size = np->n_size; else np->n_size = vap->va_size; } else np->n_size = vap->va_size; vnode_pager_setsize(vp, np->n_size); } else np->n_size = vap->va_size; } np->n_attrstamp = time_second; if (vaper != NULL) { bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(*vap)); if (np->n_flag & NCHG) { if (np->n_flag & NACC) vaper->va_atime = np->n_atim; if (np->n_flag & NUPD) vaper->va_mtime = np->n_mtim; } } return (0); } #ifdef NFS_ACDEBUG #include SYSCTL_DECL(_vfs_nfs); static int nfs_acdebug; SYSCTL_INT(_vfs_nfs, OID_AUTO, acdebug, CTLFLAG_RW, &nfs_acdebug, 0, ""); #endif /* * Check the time stamp * If the cache is valid, copy contents to *vap and return 0 * otherwise return an error */ int nfs_getattrcache(vp, vaper) register struct vnode *vp; struct vattr *vaper; { register struct nfsnode *np; register struct vattr *vap; struct nfsmount *nmp; int timeo; np = VTONFS(vp); vap = &np->n_vattr; nmp = VFSTONFS(vp->v_mount); /* XXX n_mtime doesn't seem to be updated on a miss-and-reload */ timeo = (time_second - np->n_mtime) / 10; #ifdef NFS_ACDEBUG if (nfs_acdebug>1) printf("nfs_getattrcache: initial timeo = %d\n", timeo); #endif if 
(vap->va_type == VDIR) { if ((np->n_flag & NMODIFIED) || timeo < nmp->nm_acdirmin) timeo = nmp->nm_acdirmin; else if (timeo > nmp->nm_acdirmax) timeo = nmp->nm_acdirmax; } else { if ((np->n_flag & NMODIFIED) || timeo < nmp->nm_acregmin) timeo = nmp->nm_acregmin; else if (timeo > nmp->nm_acregmax) timeo = nmp->nm_acregmax; } #ifdef NFS_ACDEBUG if (nfs_acdebug > 2) printf("acregmin %d; acregmax %d; acdirmin %d; acdirmax %d\n", nmp->nm_acregmin, nmp->nm_acregmax, nmp->nm_acdirmin, nmp->nm_acdirmax); if (nfs_acdebug) printf("nfs_getattrcache: age = %d; final timeo = %d\n", (time_second - np->n_attrstamp), timeo); #endif if ((time_second - np->n_attrstamp) >= timeo) { nfsstats.attrcache_misses++; return (ENOENT); } nfsstats.attrcache_hits++; if (vap->va_size != np->n_size) { if (vap->va_type == VREG) { if (np->n_flag & NMODIFIED) { if (vap->va_size < np->n_size) vap->va_size = np->n_size; else np->n_size = vap->va_size; } else np->n_size = vap->va_size; vnode_pager_setsize(vp, np->n_size); } else np->n_size = vap->va_size; } bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(struct vattr)); if (np->n_flag & NCHG) { if (np->n_flag & NACC) vaper->va_atime = np->n_atim; if (np->n_flag & NUPD) vaper->va_mtime = np->n_mtim; } return (0); } #ifndef NFS_NOSERVER /* * Set up nameidata for a lookup() call and do it. * * If pubflag is set, this call is done for a lookup operation on the * public filehandle. In that case we allow crossing mountpoints and * absolute pathnames. However, the caller is expected to check that * the lookup result is within the public fs, and deny access if * it is not. * * nfs_namei() clears out garbage fields that namei() might leave garbage. * This is mainly ni_vp and ni_dvp when an error occurs, and ni_dvp when no * error occurs but the parent was not requested. * * dirp may be set whether an error is returned or not, and must be * released by the caller. */ int nfs_namei(ndp, fhp, len, slp, nam, mdp, dposp, retdirp, p, kerbflag, pubflag) register struct nameidata *ndp; fhandle_t *fhp; int len; struct nfssvc_sock *slp; struct sockaddr *nam; struct mbuf **mdp; caddr_t *dposp; struct vnode **retdirp; struct proc *p; int kerbflag, pubflag; { register int i, rem; register struct mbuf *md; register char *fromcp, *tocp, *cp; struct iovec aiov; struct uio auio; struct vnode *dp; int error, rdonly, linklen; struct componentname *cnp = &ndp->ni_cnd; *retdirp = (struct vnode *)0; cnp->cn_pnbuf = zalloc(namei_zone); /* * Copy the name from the mbuf list to ndp->ni_pnbuf * and set the various ndp fields appropriately. */ fromcp = *dposp; tocp = cnp->cn_pnbuf; md = *mdp; rem = mtod(md, caddr_t) + md->m_len - fromcp; cnp->cn_hash = 0; for (i = 0; i < len; i++) { while (rem == 0) { md = md->m_next; if (md == NULL) { error = EBADRPC; goto out; } fromcp = mtod(md, caddr_t); rem = md->m_len; } if (*fromcp == '\0' || (!pubflag && *fromcp == '/')) { error = EACCES; goto out; } cnp->cn_hash += (unsigned char)*fromcp; *tocp++ = *fromcp++; rem--; } *tocp = '\0'; *mdp = md; *dposp = fromcp; len = nfsm_rndup(len)-len; if (len > 0) { if (rem >= len) *dposp += len; else if ((error = nfs_adv(mdp, dposp, len, rem)) != 0) goto out; } /* * Extract and set starting directory. */ error = nfsrv_fhtovp(fhp, FALSE, &dp, ndp->ni_cnd.cn_cred, slp, nam, &rdonly, kerbflag, pubflag); if (error) goto out; if (dp->v_type != VDIR) { vrele(dp); error = ENOTDIR; goto out; } if (rdonly) cnp->cn_flags |= RDONLY; /* * Set return directory. 
Reference to dp is implicitly transfered * to the returned pointer */ *retdirp = dp; if (pubflag) { /* * Oh joy. For WebNFS, handle those pesky '%' escapes, * and the 'native path' indicator. */ cp = zalloc(namei_zone); fromcp = cnp->cn_pnbuf; tocp = cp; if ((unsigned char)*fromcp >= WEBNFS_SPECCHAR_START) { switch ((unsigned char)*fromcp) { case WEBNFS_NATIVE_CHAR: /* * 'Native' path for us is the same * as a path according to the NFS spec, * just skip the escape char. */ fromcp++; break; /* * More may be added in the future, range 0x80-0xff */ default: error = EIO; zfree(namei_zone, cp); goto out; } } /* * Translate the '%' escapes, URL-style. */ while (*fromcp != '\0') { if (*fromcp == WEBNFS_ESC_CHAR) { if (fromcp[1] != '\0' && fromcp[2] != '\0') { fromcp++; *tocp++ = HEXSTRTOI(fromcp); fromcp += 2; continue; } else { error = ENOENT; zfree(namei_zone, cp); goto out; } } else *tocp++ = *fromcp++; } *tocp = '\0'; zfree(namei_zone, cnp->cn_pnbuf); cnp->cn_pnbuf = cp; } ndp->ni_pathlen = (tocp - cnp->cn_pnbuf) + 1; ndp->ni_segflg = UIO_SYSSPACE; if (pubflag) { ndp->ni_rootdir = rootvnode; ndp->ni_loopcnt = 0; if (cnp->cn_pnbuf[0] == '/') dp = rootvnode; } else { cnp->cn_flags |= NOCROSSMOUNT; } /* * Initialize for scan, set ni_startdir and bump ref on dp again * becuase lookup() will dereference ni_startdir. */ cnp->cn_proc = p; VREF(dp); ndp->ni_startdir = dp; for (;;) { cnp->cn_nameptr = cnp->cn_pnbuf; /* * Call lookup() to do the real work. If an error occurs, * ndp->ni_vp and ni_dvp are left uninitialized or NULL and * we do not have to dereference anything before returning. * In either case ni_startdir will be dereferenced and NULLed * out. */ error = lookup(ndp); if (error) break; /* * Check for encountering a symbolic link. Trivial * termination occurs if no symlink encountered. * Note: zfree is safe because error is 0, so we will * not zfree it again when we break. */ if ((cnp->cn_flags & ISSYMLINK) == 0) { nfsrv_object_create(ndp->ni_vp); if (cnp->cn_flags & (SAVENAME | SAVESTART)) cnp->cn_flags |= HASBUF; else zfree(namei_zone, cnp->cn_pnbuf); break; } /* * Validate symlink */ if ((cnp->cn_flags & LOCKPARENT) && ndp->ni_pathlen == 1) VOP_UNLOCK(ndp->ni_dvp, 0, p); if (!pubflag) { error = EINVAL; goto badlink2; } if (ndp->ni_loopcnt++ >= MAXSYMLINKS) { error = ELOOP; goto badlink2; } if (ndp->ni_pathlen > 1) cp = zalloc(namei_zone); else cp = cnp->cn_pnbuf; aiov.iov_base = cp; aiov.iov_len = MAXPATHLEN; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_procp = (struct proc *)0; auio.uio_resid = MAXPATHLEN; error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred); if (error) { badlink1: if (ndp->ni_pathlen > 1) zfree(namei_zone, cp); badlink2: vrele(ndp->ni_dvp); vput(ndp->ni_vp); break; } linklen = MAXPATHLEN - auio.uio_resid; if (linklen == 0) { error = ENOENT; goto badlink1; } if (linklen + ndp->ni_pathlen >= MAXPATHLEN) { error = ENAMETOOLONG; goto badlink1; } /* * Adjust or replace path */ if (ndp->ni_pathlen > 1) { bcopy(ndp->ni_next, cp + linklen, ndp->ni_pathlen); zfree(namei_zone, cnp->cn_pnbuf); cnp->cn_pnbuf = cp; } else cnp->cn_pnbuf[linklen] = '\0'; ndp->ni_pathlen += linklen; /* * Cleanup refs for next loop and check if root directory * should replace current directory. Normally ni_dvp * becomes the new base directory and is cleaned up when * we loop. Explicitly null pointers after invalidation * to clarify operation. 
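/*
 * The WebNFS branch above rewrites the looked-up name, translating
 * URL-style '%xx' escapes via HEXSTRTOI() and rejecting a '%' that is
 * not followed by two more characters.  A standalone sketch of the same
 * translation; the helper names are local to the sketch and, unlike the
 * kernel loop, this variant also validates that the two characters are
 * hex digits.
 */
#include <stdio.h>
#include <stddef.h>

static int
hexval(char c)				/* one hex digit, or -1 */
{
	if (c >= '0' && c <= '9') return (c - '0');
	if (c >= 'a' && c <= 'f') return (c - 'a' + 10);
	if (c >= 'A' && c <= 'F') return (c - 'A' + 10);
	return (-1);
}

static int
webnfs_unescape(const char *from, char *to, size_t tolen)
{
	size_t n = 0;

	while (*from != '\0') {
		int c = *from++;
		if (c == '%') {
			int hi, lo;
			/* As above: '%' must be followed by two characters. */
			if ((hi = hexval(from[0])) < 0 ||
			    (lo = hexval(from[1])) < 0)
				return (-1);
			c = (hi << 4) | lo;
			from += 2;
		}
		if (n + 1 >= tolen)
			return (-1);
		to[n++] = (char)c;
	}
	to[n] = '\0';
	return (0);
}

int
main(void)
{
	char buf[64];

	if (webnfs_unescape("a%20b", buf, sizeof(buf)) == 0)
		printf("'%s'\n", buf);		/* prints 'a b' */
	return (0);
}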
*/ vput(ndp->ni_vp); ndp->ni_vp = NULL; if (cnp->cn_pnbuf[0] == '/') { vrele(ndp->ni_dvp); ndp->ni_dvp = ndp->ni_rootdir; VREF(ndp->ni_dvp); } ndp->ni_startdir = ndp->ni_dvp; ndp->ni_dvp = NULL; } /* * nfs_namei() guarentees that fields will not contain garbage * whether an error occurs or not. This allows the caller to track * cleanup state trivially. */ out: if (error) { zfree(namei_zone, cnp->cn_pnbuf); ndp->ni_vp = NULL; ndp->ni_dvp = NULL; ndp->ni_startdir = NULL; cnp->cn_flags &= ~HASBUF; } else if ((ndp->ni_cnd.cn_flags & (WANTPARENT|LOCKPARENT)) == 0) { ndp->ni_dvp = NULL; } return (error); } /* * A fiddled version of m_adj() that ensures null fill to a long * boundary and only trims off the back end */ void nfsm_adj(mp, len, nul) struct mbuf *mp; register int len; int nul; { register struct mbuf *m; register int count, i; register char *cp; /* * Trim from tail. Scan the mbuf chain, * calculating its length and finding the last mbuf. * If the adjustment only affects this mbuf, then just * adjust and return. Otherwise, rescan and truncate * after the remaining size. */ count = 0; m = mp; for (;;) { count += m->m_len; if (m->m_next == (struct mbuf *)0) break; m = m->m_next; } if (m->m_len > len) { m->m_len -= len; if (nul > 0) { cp = mtod(m, caddr_t)+m->m_len-nul; for (i = 0; i < nul; i++) *cp++ = '\0'; } return; } count -= len; if (count < 0) count = 0; /* * Correct length for chain is "count". * Find the mbuf with last data, adjust its length, * and toss data from remaining mbufs on chain. */ for (m = mp; m; m = m->m_next) { if (m->m_len >= count) { m->m_len = count; if (nul > 0) { cp = mtod(m, caddr_t)+m->m_len-nul; for (i = 0; i < nul; i++) *cp++ = '\0'; } break; } count -= m->m_len; } for (m = m->m_next;m;m = m->m_next) m->m_len = 0; } /* * Make these functions instead of macros, so that the kernel text size * doesn't get too big... 
*/ void nfsm_srvwcc(nfsd, before_ret, before_vap, after_ret, after_vap, mbp, bposp) struct nfsrv_descript *nfsd; int before_ret; register struct vattr *before_vap; int after_ret; struct vattr *after_vap; struct mbuf **mbp; char **bposp; { register struct mbuf *mb = *mbp, *mb2; register char *bpos = *bposp; register u_int32_t *tl; if (before_ret) { nfsm_build(tl, u_int32_t *, NFSX_UNSIGNED); *tl = nfs_false; } else { nfsm_build(tl, u_int32_t *, 7 * NFSX_UNSIGNED); *tl++ = nfs_true; txdr_hyper(before_vap->va_size, tl); tl += 2; txdr_nfsv3time(&(before_vap->va_mtime), tl); tl += 2; txdr_nfsv3time(&(before_vap->va_ctime), tl); } *bposp = bpos; *mbp = mb; nfsm_srvpostopattr(nfsd, after_ret, after_vap, mbp, bposp); } void nfsm_srvpostopattr(nfsd, after_ret, after_vap, mbp, bposp) struct nfsrv_descript *nfsd; int after_ret; struct vattr *after_vap; struct mbuf **mbp; char **bposp; { register struct mbuf *mb = *mbp, *mb2; register char *bpos = *bposp; register u_int32_t *tl; register struct nfs_fattr *fp; if (after_ret) { nfsm_build(tl, u_int32_t *, NFSX_UNSIGNED); *tl = nfs_false; } else { nfsm_build(tl, u_int32_t *, NFSX_UNSIGNED + NFSX_V3FATTR); *tl++ = nfs_true; fp = (struct nfs_fattr *)tl; nfsm_srvfattr(nfsd, after_vap, fp); } *mbp = mb; *bposp = bpos; } void nfsm_srvfattr(nfsd, vap, fp) register struct nfsrv_descript *nfsd; register struct vattr *vap; register struct nfs_fattr *fp; { fp->fa_nlink = txdr_unsigned(vap->va_nlink); fp->fa_uid = txdr_unsigned(vap->va_uid); fp->fa_gid = txdr_unsigned(vap->va_gid); if (nfsd->nd_flag & ND_NFSV3) { fp->fa_type = vtonfsv3_type(vap->va_type); fp->fa_mode = vtonfsv3_mode(vap->va_mode); txdr_hyper(vap->va_size, &fp->fa3_size); txdr_hyper(vap->va_bytes, &fp->fa3_used); fp->fa3_rdev.specdata1 = txdr_unsigned(umajor(vap->va_rdev)); fp->fa3_rdev.specdata2 = txdr_unsigned(uminor(vap->va_rdev)); fp->fa3_fsid.nfsuquad[0] = 0; fp->fa3_fsid.nfsuquad[1] = txdr_unsigned(vap->va_fsid); fp->fa3_fileid.nfsuquad[0] = 0; fp->fa3_fileid.nfsuquad[1] = txdr_unsigned(vap->va_fileid); txdr_nfsv3time(&vap->va_atime, &fp->fa3_atime); txdr_nfsv3time(&vap->va_mtime, &fp->fa3_mtime); txdr_nfsv3time(&vap->va_ctime, &fp->fa3_ctime); } else { fp->fa_type = vtonfsv2_type(vap->va_type); fp->fa_mode = vtonfsv2_mode(vap->va_type, vap->va_mode); fp->fa2_size = txdr_unsigned(vap->va_size); fp->fa2_blocksize = txdr_unsigned(vap->va_blocksize); if (vap->va_type == VFIFO) fp->fa2_rdev = 0xffffffff; else fp->fa2_rdev = txdr_unsigned(vap->va_rdev); fp->fa2_blocks = txdr_unsigned(vap->va_bytes / NFS_FABLKSIZE); fp->fa2_fsid = txdr_unsigned(vap->va_fsid); fp->fa2_fileid = txdr_unsigned(vap->va_fileid); txdr_nfsv2time(&vap->va_atime, &fp->fa2_atime); txdr_nfsv2time(&vap->va_mtime, &fp->fa2_mtime); txdr_nfsv2time(&vap->va_ctime, &fp->fa2_ctime); } } /* * nfsrv_fhtovp() - convert a fh to a vnode ptr (optionally locked) * - look up fsid in mount list (if not found ret error) * - get vp and export rights by calling VFS_FHTOVP() * - if cred->cr_uid == 0 or MNT_EXPORTANON set it to credanon * - if not lockflag unlock it with VOP_UNLOCK() */ int nfsrv_fhtovp(fhp, lockflag, vpp, cred, slp, nam, rdonlyp, kerbflag, pubflag) fhandle_t *fhp; int lockflag; struct vnode **vpp; struct ucred *cred; struct nfssvc_sock *slp; struct sockaddr *nam; int *rdonlyp; int kerbflag; int pubflag; { struct proc *p = curproc; /* XXX */ register struct mount *mp; register int i; struct ucred *credanon; int error, exflags; #ifdef MNT_EXNORESPORT /* XXX needs mountd and /etc/exports help yet */ struct sockaddr_int *saddr; #endif 
*vpp = (struct vnode *)0; if (nfs_ispublicfh(fhp)) { if (!pubflag || !nfs_pub.np_valid) return (ESTALE); fhp = &nfs_pub.np_handle; } mp = vfs_getvfs(&fhp->fh_fsid); if (!mp) return (ESTALE); error = VFS_FHTOVP(mp, &fhp->fh_fid, nam, vpp, &exflags, &credanon); if (error) return (error); #ifdef MNT_EXNORESPORT if (!(exflags & (MNT_EXNORESPORT|MNT_EXPUBLIC))) { saddr = (struct sockaddr_in *)nam; if (saddr->sin_family == AF_INET && ntohs(saddr->sin_port) >= IPPORT_RESERVED) { vput(*vpp); return (NFSERR_AUTHERR | AUTH_TOOWEAK); } } #endif /* * Check/setup credentials. */ if (exflags & MNT_EXKERB) { if (!kerbflag) { vput(*vpp); return (NFSERR_AUTHERR | AUTH_TOOWEAK); } } else if (kerbflag) { vput(*vpp); return (NFSERR_AUTHERR | AUTH_TOOWEAK); } else if (cred->cr_uid == 0 || (exflags & MNT_EXPORTANON)) { cred->cr_uid = credanon->cr_uid; for (i = 0; i < credanon->cr_ngroups && i < NGROUPS; i++) cred->cr_groups[i] = credanon->cr_groups[i]; cred->cr_ngroups = i; } if (exflags & MNT_EXRDONLY) *rdonlyp = 1; else *rdonlyp = 0; nfsrv_object_create(*vpp); if (!lockflag) VOP_UNLOCK(*vpp, 0, p); return (0); } /* * WebNFS: check if a filehandle is a public filehandle. For v3, this * means a length of 0, for v2 it means all zeroes. nfsm_srvmtofh has * transformed this to all zeroes in both cases, so check for it. */ int nfs_ispublicfh(fhp) fhandle_t *fhp; { char *cp = (char *)fhp; int i; for (i = 0; i < NFSX_V3FH; i++) if (*cp++ != 0) return (FALSE); return (TRUE); } #endif /* NFS_NOSERVER */ /* * This function compares two net addresses by family and returns TRUE * if they are the same host. * If there is any doubt, return FALSE. * The AF_INET family is handled as a special case so that address mbufs * don't need to be saved to store "struct in_addr", which is only 4 bytes. */ int netaddr_match(family, haddr, nam) int family; union nethostaddr *haddr; struct sockaddr *nam; { register struct sockaddr_in *inetaddr; switch (family) { case AF_INET: inetaddr = (struct sockaddr_in *)nam; if (inetaddr->sin_family == AF_INET && inetaddr->sin_addr.s_addr == haddr->had_inetaddr) return (1); break; #ifdef ISO case AF_ISO: { register struct sockaddr_iso *isoaddr1, *isoaddr2; isoaddr1 = (struct sockaddr_iso *)nam; isoaddr2 = (struct sockaddr_iso *)haddr->had_nam; if (isoaddr1->siso_family == AF_ISO && isoaddr1->siso_nlen > 0 && isoaddr1->siso_nlen == isoaddr2->siso_nlen && SAME_ISOADDR(isoaddr1, isoaddr2)) return (1); break; } #endif /* ISO */ default: break; }; return (0); } static nfsuint64 nfs_nullcookie = { { 0, 0 } }; /* * This function finds the directory cookie that corresponds to the * logical byte offset given. 
*/ nfsuint64 * nfs_getcookie(np, off, add) register struct nfsnode *np; off_t off; int add; { register struct nfsdmap *dp, *dp2; register int pos; pos = (uoff_t)off / NFS_DIRBLKSIZ; if (pos == 0 || off < 0) { #ifdef DIAGNOSTIC if (add) panic("nfs getcookie add at <= 0"); #endif return (&nfs_nullcookie); } pos--; dp = np->n_cookies.lh_first; if (!dp) { if (add) { MALLOC(dp, struct nfsdmap *, sizeof (struct nfsdmap), M_NFSDIROFF, M_WAITOK); dp->ndm_eocookie = 0; LIST_INSERT_HEAD(&np->n_cookies, dp, ndm_list); } else return ((nfsuint64 *)0); } while (pos >= NFSNUMCOOKIES) { pos -= NFSNUMCOOKIES; if (dp->ndm_list.le_next) { if (!add && dp->ndm_eocookie < NFSNUMCOOKIES && pos >= dp->ndm_eocookie) return ((nfsuint64 *)0); dp = dp->ndm_list.le_next; } else if (add) { MALLOC(dp2, struct nfsdmap *, sizeof (struct nfsdmap), M_NFSDIROFF, M_WAITOK); dp2->ndm_eocookie = 0; LIST_INSERT_AFTER(dp, dp2, ndm_list); dp = dp2; } else return ((nfsuint64 *)0); } if (pos >= dp->ndm_eocookie) { if (add) dp->ndm_eocookie = pos + 1; else return ((nfsuint64 *)0); } return (&dp->ndm_cookies[pos]); } /* * Invalidate cached directory information, except for the actual directory * blocks (which are invalidated separately). * Done mainly to avoid the use of stale offset cookies. */ void nfs_invaldir(vp) register struct vnode *vp; { register struct nfsnode *np = VTONFS(vp); #ifdef DIAGNOSTIC if (vp->v_type != VDIR) panic("nfs: invaldir not dir"); #endif np->n_direofoffset = 0; np->n_cookieverf.nfsuquad[0] = 0; np->n_cookieverf.nfsuquad[1] = 0; if (np->n_cookies.lh_first) np->n_cookies.lh_first->ndm_eocookie = 0; } /* * The write verifier has changed (probably due to a server reboot), so all * B_NEEDCOMMIT blocks will have to be written again. Since they are on the * dirty block list as B_DELWRI, all this takes is clearing the B_NEEDCOMMIT * flag. Once done the new write verifier can be set for the mount point. */ void nfs_clearcommit(mp) struct mount *mp; { register struct vnode *vp, *nvp; register struct buf *bp, *nbp; int s; s = splbio(); loop: for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) { if (vp->v_mount != mp) /* Paranoia */ goto loop; nvp = vp->v_mntvnodes.le_next; for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { nbp = TAILQ_NEXT(bp, b_vnbufs); if (BUF_REFCNT(bp) == 0 && (bp->b_flags & (B_DELWRI | B_NEEDCOMMIT)) == (B_DELWRI | B_NEEDCOMMIT)) bp->b_flags &= ~B_NEEDCOMMIT; } } splx(s); } #ifndef NFS_NOSERVER /* * Map errnos to NFS error numbers. For Version 3 also filter out error * numbers not specified for the associated procedure. */ int nfsrv_errmap(nd, err) struct nfsrv_descript *nd; register int err; { register short *defaulterrp, *errp; if (nd->nd_flag & ND_NFSV3) { if (nd->nd_procnum <= NFSPROC_COMMIT) { errp = defaulterrp = nfsrv_v3errmap[nd->nd_procnum]; while (*++errp) { if (*errp == err) return (err); else if (*errp > err) break; } return ((int)*defaulterrp); } else return (err & 0xffff); } if (err <= ELAST) return ((int)nfsrv_v2errmap[err - 1]); return (NFSERR_IO); } int nfsrv_object_create(vp) struct vnode *vp; { if (vp == NULL || vp->v_type != VREG) return (1); return (vfs_object_create(vp, curproc, curproc ? curproc->p_ucred : NULL)); } /* * Sort the group list in increasing numerical order. * (Insertion sort by Chris Torek, who was grossed out by the bubble sort * that used to be here.) */ void nfsrvw_sort(list, num) register gid_t *list; register int num; { register int i, j; gid_t v; /* Insertion sort. 
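/*
 * nfs_getcookie() above maps a logical directory offset to a cookie by
 * dividing by NFS_DIRBLKSIZ, treating block 0 (and negative offsets) as
 * the null cookie, and then walking chained nfsdmap blocks that each
 * hold NFSNUMCOOKIES entries.  The index arithmetic alone, as a
 * standalone sketch: the two constants below are placeholders, not the
 * kernel values, and the (block, slot) pair stands in for the list walk.
 */
#include <stdio.h>

#define SK_DIRBLKSIZ	512
#define SK_NUMCOOKIES	31

static void
cookie_index(long long off, int *blockp, int *slotp)
{
	long long pos = off / SK_DIRBLKSIZ;

	if (pos == 0 || off < 0) {
		*blockp = *slotp = -1;		/* the nfs_nullcookie case */
		return;
	}
	pos--;					/* first stored cookie is pos 1 */
	*blockp = (int)(pos / SK_NUMCOOKIES);
	*slotp = (int)(pos % SK_NUMCOOKIES);
}

int
main(void)
{
	int blk, slot;

	cookie_index(3 * SK_DIRBLKSIZ, &blk, &slot);
	printf("block %d slot %d\n", blk, slot);	/* block 0 slot 2 */
	return (0);
}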
*/ for (i = 1; i < num; i++) { v = list[i]; /* find correct slot for value v, moving others up */ for (j = i; --j >= 0 && v < list[j];) list[j + 1] = list[j]; list[j + 1] = v; } } /* * copy credentials making sure that the result can be compared with bcmp(). */ void nfsrv_setcred(incred, outcred) register struct ucred *incred, *outcred; { register int i; bzero((caddr_t)outcred, sizeof (struct ucred)); outcred->cr_ref = 1; outcred->cr_uid = incred->cr_uid; outcred->cr_ngroups = incred->cr_ngroups; for (i = 0; i < incred->cr_ngroups; i++) outcred->cr_groups[i] = incred->cr_groups[i]; nfsrvw_sort(outcred->cr_groups, outcred->cr_ngroups); } #endif /* NFS_NOSERVER */ Index: head/sys/nfs/nfs_vnops.c =================================================================== --- head/sys/nfs/nfs_vnops.c (revision 49534) +++ head/sys/nfs/nfs_vnops.c (revision 49535) @@ -1,3372 +1,3372 @@ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Rick Macklem at The University of Guelph. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)nfs_vnops.c 8.16 (Berkeley) 5/27/95 - * $Id: nfs_vnops.c,v 1.137 1999/07/30 04:51:35 wpaul Exp $ + * $Id: nfs_vnops.c,v 1.138 1999/07/31 01:51:58 msmith Exp $ */ /* * vnode op calls for Sun NFS version 2 and 3 */ #include "opt_inet.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include -#include #include #include #include #include #include #include #include #include #include #include #include /* Defs */ #define TRUE 1 #define FALSE 0 /* * Ifdef for FreeBSD-current merged buffer cache. It is unfortunate that these * calls are not in getblk() and brelse() so that they would not be necessary * here. 
*/ #ifndef B_VMIO #define vfs_busy_pages(bp, f) #endif static int nfsspec_read __P((struct vop_read_args *)); static int nfsspec_write __P((struct vop_write_args *)); static int nfsfifo_read __P((struct vop_read_args *)); static int nfsfifo_write __P((struct vop_write_args *)); static int nfsspec_close __P((struct vop_close_args *)); static int nfsfifo_close __P((struct vop_close_args *)); #define nfs_poll vop_nopoll static int nfs_flush __P((struct vnode *,struct ucred *,int,struct proc *,int)); static int nfs_setattrrpc __P((struct vnode *,struct vattr *,struct ucred *,struct proc *)); static int nfs_lookup __P((struct vop_lookup_args *)); static int nfs_create __P((struct vop_create_args *)); static int nfs_mknod __P((struct vop_mknod_args *)); static int nfs_open __P((struct vop_open_args *)); static int nfs_close __P((struct vop_close_args *)); static int nfs_access __P((struct vop_access_args *)); static int nfs_getattr __P((struct vop_getattr_args *)); static int nfs_setattr __P((struct vop_setattr_args *)); static int nfs_read __P((struct vop_read_args *)); static int nfs_mmap __P((struct vop_mmap_args *)); static int nfs_fsync __P((struct vop_fsync_args *)); static int nfs_remove __P((struct vop_remove_args *)); static int nfs_link __P((struct vop_link_args *)); static int nfs_rename __P((struct vop_rename_args *)); static int nfs_mkdir __P((struct vop_mkdir_args *)); static int nfs_rmdir __P((struct vop_rmdir_args *)); static int nfs_symlink __P((struct vop_symlink_args *)); static int nfs_readdir __P((struct vop_readdir_args *)); static int nfs_bmap __P((struct vop_bmap_args *)); static int nfs_strategy __P((struct vop_strategy_args *)); static int nfs_lookitup __P((struct vnode *, const char *, int, struct ucred *, struct proc *, struct nfsnode **)); static int nfs_sillyrename __P((struct vnode *,struct vnode *,struct componentname *)); static int nfsspec_access __P((struct vop_access_args *)); static int nfs_readlink __P((struct vop_readlink_args *)); static int nfs_print __P((struct vop_print_args *)); static int nfs_advlock __P((struct vop_advlock_args *)); static int nfs_bwrite __P((struct vop_bwrite_args *)); /* * Global vfs data structures for nfs */ vop_t **nfsv2_vnodeop_p; static struct vnodeopv_entry_desc nfsv2_vnodeop_entries[] = { { &vop_default_desc, (vop_t *) vop_defaultop }, { &vop_abortop_desc, (vop_t *) nfs_abortop }, { &vop_access_desc, (vop_t *) nfs_access }, { &vop_advlock_desc, (vop_t *) nfs_advlock }, { &vop_bmap_desc, (vop_t *) nfs_bmap }, { &vop_bwrite_desc, (vop_t *) nfs_bwrite }, { &vop_close_desc, (vop_t *) nfs_close }, { &vop_create_desc, (vop_t *) nfs_create }, { &vop_fsync_desc, (vop_t *) nfs_fsync }, { &vop_getattr_desc, (vop_t *) nfs_getattr }, { &vop_getpages_desc, (vop_t *) nfs_getpages }, { &vop_putpages_desc, (vop_t *) nfs_putpages }, { &vop_inactive_desc, (vop_t *) nfs_inactive }, { &vop_lease_desc, (vop_t *) vop_null }, { &vop_link_desc, (vop_t *) nfs_link }, { &vop_lock_desc, (vop_t *) vop_sharedlock }, { &vop_lookup_desc, (vop_t *) nfs_lookup }, { &vop_mkdir_desc, (vop_t *) nfs_mkdir }, { &vop_mknod_desc, (vop_t *) nfs_mknod }, { &vop_mmap_desc, (vop_t *) nfs_mmap }, { &vop_open_desc, (vop_t *) nfs_open }, { &vop_poll_desc, (vop_t *) nfs_poll }, { &vop_print_desc, (vop_t *) nfs_print }, { &vop_read_desc, (vop_t *) nfs_read }, { &vop_readdir_desc, (vop_t *) nfs_readdir }, { &vop_readlink_desc, (vop_t *) nfs_readlink }, { &vop_reclaim_desc, (vop_t *) nfs_reclaim }, { &vop_remove_desc, (vop_t *) nfs_remove }, { &vop_rename_desc, (vop_t *) 
nfs_rename }, { &vop_rmdir_desc, (vop_t *) nfs_rmdir }, { &vop_setattr_desc, (vop_t *) nfs_setattr }, { &vop_strategy_desc, (vop_t *) nfs_strategy }, { &vop_symlink_desc, (vop_t *) nfs_symlink }, { &vop_write_desc, (vop_t *) nfs_write }, { NULL, NULL } }; static struct vnodeopv_desc nfsv2_vnodeop_opv_desc = { &nfsv2_vnodeop_p, nfsv2_vnodeop_entries }; VNODEOP_SET(nfsv2_vnodeop_opv_desc); /* * Special device vnode ops */ vop_t **spec_nfsv2nodeop_p; static struct vnodeopv_entry_desc nfsv2_specop_entries[] = { { &vop_default_desc, (vop_t *) spec_vnoperate }, { &vop_access_desc, (vop_t *) nfsspec_access }, { &vop_close_desc, (vop_t *) nfsspec_close }, { &vop_fsync_desc, (vop_t *) nfs_fsync }, { &vop_getattr_desc, (vop_t *) nfs_getattr }, { &vop_inactive_desc, (vop_t *) nfs_inactive }, { &vop_lock_desc, (vop_t *) vop_sharedlock }, { &vop_print_desc, (vop_t *) nfs_print }, { &vop_read_desc, (vop_t *) nfsspec_read }, { &vop_reclaim_desc, (vop_t *) nfs_reclaim }, { &vop_setattr_desc, (vop_t *) nfs_setattr }, { &vop_write_desc, (vop_t *) nfsspec_write }, { NULL, NULL } }; static struct vnodeopv_desc spec_nfsv2nodeop_opv_desc = { &spec_nfsv2nodeop_p, nfsv2_specop_entries }; VNODEOP_SET(spec_nfsv2nodeop_opv_desc); vop_t **fifo_nfsv2nodeop_p; static struct vnodeopv_entry_desc nfsv2_fifoop_entries[] = { { &vop_default_desc, (vop_t *) fifo_vnoperate }, { &vop_access_desc, (vop_t *) nfsspec_access }, { &vop_close_desc, (vop_t *) nfsfifo_close }, { &vop_fsync_desc, (vop_t *) nfs_fsync }, { &vop_getattr_desc, (vop_t *) nfs_getattr }, { &vop_inactive_desc, (vop_t *) nfs_inactive }, { &vop_lock_desc, (vop_t *) vop_sharedlock }, { &vop_print_desc, (vop_t *) nfs_print }, { &vop_read_desc, (vop_t *) nfsfifo_read }, { &vop_reclaim_desc, (vop_t *) nfs_reclaim }, { &vop_setattr_desc, (vop_t *) nfs_setattr }, { &vop_write_desc, (vop_t *) nfsfifo_write }, { NULL, NULL } }; static struct vnodeopv_desc fifo_nfsv2nodeop_opv_desc = { &fifo_nfsv2nodeop_p, nfsv2_fifoop_entries }; VNODEOP_SET(fifo_nfsv2nodeop_opv_desc); static int nfs_commit __P((struct vnode *vp, u_quad_t offset, int cnt, struct ucred *cred, struct proc *procp)); static int nfs_mknodrpc __P((struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct vattr *vap)); static int nfs_removerpc __P((struct vnode *dvp, const char *name, int namelen, struct ucred *cred, struct proc *proc)); static int nfs_renamerpc __P((struct vnode *fdvp, const char *fnameptr, int fnamelen, struct vnode *tdvp, const char *tnameptr, int tnamelen, struct ucred *cred, struct proc *proc)); static int nfs_renameit __P((struct vnode *sdvp, struct componentname *scnp, struct sillyrename *sp)); /* * Global variables */ extern u_int32_t nfs_true, nfs_false; extern u_int32_t nfs_xdrneg1; extern struct nfsstats nfsstats; extern nfstype nfsv3_type[9]; struct proc *nfs_iodwant[NFS_MAXASYNCDAEMON]; struct nfsmount *nfs_iodmount[NFS_MAXASYNCDAEMON]; int nfs_numasync = 0; #define DIRHDSIZ (sizeof (struct dirent) - (MAXNAMLEN + 1)) SYSCTL_DECL(_vfs_nfs); static int nfsaccess_cache_timeout = NFS_MAXATTRTIMO; SYSCTL_INT(_vfs_nfs, OID_AUTO, access_cache_timeout, CTLFLAG_RW, &nfsaccess_cache_timeout, 0, "NFS ACCESS cache timeout"); static int nfsaccess_cache_hits; SYSCTL_INT(_vfs_nfs, OID_AUTO, access_cache_hits, CTLFLAG_RD, &nfsaccess_cache_hits, 0, "NFS ACCESS cache hit count"); static int nfsaccess_cache_misses; SYSCTL_INT(_vfs_nfs, OID_AUTO, access_cache_misses, CTLFLAG_RD, &nfsaccess_cache_misses, 0, "NFS ACCESS cache miss count"); #define NFSV3ACCESS_ALL (NFSV3ACCESS_READ | 
NFSV3ACCESS_MODIFY \ | NFSV3ACCESS_EXTEND | NFSV3ACCESS_EXECUTE \ | NFSV3ACCESS_DELETE | NFSV3ACCESS_LOOKUP) static int nfs3_access_otw(struct vnode *vp, int wmode, struct proc *p, struct ucred *cred) { const int v3 = 1; u_int32_t *tl; int error = 0, attrflag; struct mbuf *mreq, *mrep, *md, *mb, *mb2; caddr_t bpos, dpos, cp2; register int32_t t1, t2; register caddr_t cp; u_int32_t rmode; struct nfsnode *np = VTONFS(vp); nfsstats.rpccnt[NFSPROC_ACCESS]++; nfsm_reqhead(vp, NFSPROC_ACCESS, NFSX_FH(v3) + NFSX_UNSIGNED); nfsm_fhtom(vp, v3); nfsm_build(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(wmode); nfsm_request(vp, NFSPROC_ACCESS, p, cred); nfsm_postop_attr(vp, attrflag); if (!error) { nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED); rmode = fxdr_unsigned(u_int32_t, *tl); np->n_mode = rmode; np->n_modeuid = cred->cr_uid; np->n_modestamp = time_second; } nfsm_reqdone; return error; } /* * nfs access vnode op. * For nfs version 2, just return ok. File accesses may fail later. * For nfs version 3, use the access rpc to check accessibility. If file modes * are changed on the server, accesses might still fail later. */ static int nfs_access(ap) struct vop_access_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct proc *a_p; } */ *ap; { register struct vnode *vp = ap->a_vp; int error = 0; u_int32_t mode, wmode; int v3 = NFS_ISV3(vp); struct nfsnode *np = VTONFS(vp); /* * Disallow write attempts on filesystems mounted read-only; * unless the file is a socket, fifo, or a block or character * device resident on the filesystem. */ if ((ap->a_mode & VWRITE) && (vp->v_mount->mnt_flag & MNT_RDONLY)) { switch (vp->v_type) { case VREG: case VDIR: case VLNK: return (EROFS); default: break; } } /* * For nfs v3, check to see if we have done this recently, and if * so return our cached result instead of making an ACCESS call. * If not, do an access rpc, otherwise you are stuck emulating * ufs_access() locally using the vattr. This may not be correct, * since the server may apply other access criteria such as * client uid-->server uid mapping that we do not know about. */ if (v3) { if (ap->a_mode & VREAD) mode = NFSV3ACCESS_READ; else mode = 0; if (vp->v_type != VDIR) { if (ap->a_mode & VWRITE) mode |= (NFSV3ACCESS_MODIFY | NFSV3ACCESS_EXTEND); if (ap->a_mode & VEXEC) mode |= NFSV3ACCESS_EXECUTE; } else { if (ap->a_mode & VWRITE) mode |= (NFSV3ACCESS_MODIFY | NFSV3ACCESS_EXTEND | NFSV3ACCESS_DELETE); if (ap->a_mode & VEXEC) mode |= NFSV3ACCESS_LOOKUP; } /* XXX safety belt, only make blanket request if caching */ if (nfsaccess_cache_timeout > 0) { wmode = NFSV3ACCESS_READ | NFSV3ACCESS_MODIFY | NFSV3ACCESS_EXTEND | NFSV3ACCESS_EXECUTE | NFSV3ACCESS_DELETE | NFSV3ACCESS_LOOKUP; } else { wmode = mode; } /* * Does our cached result allow us to give a definite yes to * this request? */ if ((time_second < (np->n_modestamp + nfsaccess_cache_timeout)) && (ap->a_cred->cr_uid == np->n_modeuid) && ((np->n_mode & mode) == mode)) { nfsaccess_cache_hits++; } else { /* * Either a no, or a don't know. Go to the wire. */ nfsaccess_cache_misses++; error = nfs3_access_otw(vp, wmode, ap->a_p,ap->a_cred); if (!error) { if ((np->n_mode & mode) != mode) { error = EACCES; } } } return (error); } else { if ((error = nfsspec_access(ap)) != 0) return (error); /* * Attempt to prevent a mapped root from accessing a file * which it shouldn't. We try to read a byte from the file * if the user is root and the file is not zero length. * After calling nfsspec_access, we should have the correct * file size cached. 
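/*
 * For v3 the code above folds the requested a_mode into NFSV3ACCESS_*
 * bits, asks the server for a blanket mask when caching is enabled, and
 * later answers from the cached (uid, mask, timestamp) triple whenever
 * every requested bit is present and the entry is still fresh.  A
 * standalone sketch of just the cache test; the struct, names, bit
 * values and timeout below are local to the sketch.
 */
#include <stdio.h>
#include <time.h>

struct sketch_accesscache {
	unsigned int	mask;		/* bits the server said were allowed */
	unsigned int	uid;		/* credential the reply was for */
	time_t		stamp;		/* when the reply was cached */
};

#define SK_ACCESS_TIMEOUT	60	/* seconds, illustrative only */

static int				/* 1 = definite yes, 0 = go to wire */
access_cache_hit(const struct sketch_accesscache *ac, unsigned int uid,
    unsigned int wanted, time_t now)
{
	return (now < ac->stamp + SK_ACCESS_TIMEOUT &&
	    uid == ac->uid &&
	    (ac->mask & wanted) == wanted);
}

int
main(void)
{
	struct sketch_accesscache ac = { 0x1 | 0x2, 100, time(NULL) };

	printf("%d\n", access_cache_hit(&ac, 100, 0x1, time(NULL)));	/* 1 */
	printf("%d\n", access_cache_hit(&ac, 100, 0x4, time(NULL)));	/* 0 */
	return (0);
}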
*/ if (ap->a_cred->cr_uid == 0 && (ap->a_mode & VREAD) && VTONFS(vp)->n_size > 0) { struct iovec aiov; struct uio auio; char buf[1]; aiov.iov_base = buf; aiov.iov_len = 1; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; auio.uio_resid = 1; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_READ; auio.uio_procp = ap->a_p; if (vp->v_type == VREG) error = nfs_readrpc(vp, &auio, ap->a_cred); else if (vp->v_type == VDIR) { char* bp; bp = malloc(NFS_DIRBLKSIZ, M_TEMP, M_WAITOK); aiov.iov_base = bp; aiov.iov_len = auio.uio_resid = NFS_DIRBLKSIZ; error = nfs_readdirrpc(vp, &auio, ap->a_cred); free(bp, M_TEMP); } else if (vp->v_type == VLNK) error = nfs_readlinkrpc(vp, &auio, ap->a_cred); else error = EACCES; } return (error); } } /* * nfs open vnode op * Check to see if the type is ok * and that deletion is not in progress. * For paged in text files, you will need to flush the page cache * if consistency is lost. */ /* ARGSUSED */ static int nfs_open(ap) struct vop_open_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct proc *a_p; } */ *ap; { register struct vnode *vp = ap->a_vp; struct nfsnode *np = VTONFS(vp); struct nfsmount *nmp = VFSTONFS(vp->v_mount); struct vattr vattr; int error; if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) { #ifdef DIAGNOSTIC printf("open eacces vtyp=%d\n",vp->v_type); #endif return (EACCES); } /* * Get a valid lease. If cached data is stale, flush it. */ if (nmp->nm_flag & NFSMNT_NQNFS) { if (NQNFS_CKINVALID(vp, np, ND_READ)) { do { error = nqnfs_getlease(vp, ND_READ, ap->a_cred, ap->a_p); } while (error == NQNFS_EXPIRED); if (error) return (error); if (np->n_lrev != np->n_brev || (np->n_flag & NQNFSNONCACHE)) { if ((error = nfs_vinvalbuf(vp, V_SAVE, ap->a_cred, ap->a_p, 1)) == EINTR) return (error); np->n_brev = np->n_lrev; } } } else { if (np->n_flag & NMODIFIED) { if ((error = nfs_vinvalbuf(vp, V_SAVE, ap->a_cred, ap->a_p, 1)) == EINTR) return (error); np->n_attrstamp = 0; if (vp->v_type == VDIR) np->n_direofoffset = 0; error = VOP_GETATTR(vp, &vattr, ap->a_cred, ap->a_p); if (error) return (error); np->n_mtime = vattr.va_mtime.tv_sec; } else { error = VOP_GETATTR(vp, &vattr, ap->a_cred, ap->a_p); if (error) return (error); if (np->n_mtime != vattr.va_mtime.tv_sec) { if (vp->v_type == VDIR) np->n_direofoffset = 0; if ((error = nfs_vinvalbuf(vp, V_SAVE, ap->a_cred, ap->a_p, 1)) == EINTR) return (error); np->n_mtime = vattr.va_mtime.tv_sec; } } } if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) np->n_attrstamp = 0; /* For Open/Close consistency */ return (0); } /* * nfs close vnode op * What an NFS client should do upon close after writing is a debatable issue. * Most NFS clients push delayed writes to the server upon close, basically for * two reasons: * 1 - So that any write errors may be reported back to the client process * doing the close system call. By far the two most likely errors are * NFSERR_NOSPC and NFSERR_DQUOT to indicate space allocation failure. * 2 - To put a worst case upper bound on cache inconsistency between * multiple clients for the file. * There is also a consistency problem for Version 2 of the protocol w.r.t. * not being able to tell if other clients are writing a file concurrently, * since there is no way of knowing if the changed modify time in the reply * is only due to the write for this client. * (NFS Version 3 provides weak cache consistency data in the reply that * should be sufficient to detect and handle this case.) 
* * The current code does the following: * for NFS Version 2 - play it safe and flush/invalidate all dirty buffers * for NFS Version 3 - flush dirty buffers to the server but don't invalidate * or commit them (this satisfies 1 and 2 except for the * case where the server crashes after this close but * before the commit RPC, which is felt to be "good * enough". Changing the last argument to nfs_flush() to * a 1 would force a commit operation, if it is felt a * commit is necessary now. * for NQNFS - do nothing now, since 2 is dealt with via leases and * 1 should be dealt with via an fsync() system call for * cases where write errors are important. */ /* ARGSUSED */ static int nfs_close(ap) struct vop_close_args /* { struct vnodeop_desc *a_desc; struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct proc *a_p; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct nfsnode *np = VTONFS(vp); int error = 0; if (vp->v_type == VREG) { if ((VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NQNFS) == 0 && (np->n_flag & NMODIFIED)) { if (NFS_ISV3(vp)) { error = nfs_flush(vp, ap->a_cred, MNT_WAIT, ap->a_p, 0); np->n_flag &= ~NMODIFIED; } else error = nfs_vinvalbuf(vp, V_SAVE, ap->a_cred, ap->a_p, 1); np->n_attrstamp = 0; } if (np->n_flag & NWRITEERR) { np->n_flag &= ~NWRITEERR; error = np->n_error; } } return (error); } /* * nfs getattr call from vfs. */ static int nfs_getattr(ap) struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct proc *a_p; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct nfsnode *np = VTONFS(vp); register caddr_t cp; register u_int32_t *tl; register int32_t t1, t2; caddr_t bpos, dpos; int error = 0; struct mbuf *mreq, *mrep, *md, *mb, *mb2; int v3 = NFS_ISV3(vp); /* * Update local times for special files. */ if (np->n_flag & (NACC | NUPD)) np->n_flag |= NCHG; /* * First look in the cache. */ if (nfs_getattrcache(vp, ap->a_vap) == 0) return (0); if (v3 && nfsaccess_cache_timeout > 0) { nfs3_access_otw(vp, NFSV3ACCESS_ALL, ap->a_p, ap->a_cred); if (nfs_getattrcache(vp, ap->a_vap) == 0) return (0); } nfsstats.rpccnt[NFSPROC_GETATTR]++; nfsm_reqhead(vp, NFSPROC_GETATTR, NFSX_FH(v3)); nfsm_fhtom(vp, v3); nfsm_request(vp, NFSPROC_GETATTR, ap->a_p, ap->a_cred); if (!error) { nfsm_loadattr(vp, ap->a_vap); } nfsm_reqdone; return (error); } /* * nfs setattr call. */ static int nfs_setattr(ap) struct vop_setattr_args /* { struct vnodeop_desc *a_desc; struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct proc *a_p; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct nfsnode *np = VTONFS(vp); register struct vattr *vap = ap->a_vap; int error = 0; u_quad_t tsize; #ifndef nolint tsize = (u_quad_t)0; #endif /* * Setting of flags is not supported. */ if (vap->va_flags != VNOVAL) return (EOPNOTSUPP); /* * Disallow write attempts if the filesystem is mounted read-only. 
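 * (The test below covers ownership, mode and timestamp changes as
 * well; size changes on a read-only mount are rejected separately
 * further down.)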
*/ if ((vap->va_flags != VNOVAL || vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL || vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL || vap->va_mode != (mode_t)VNOVAL) && (vp->v_mount->mnt_flag & MNT_RDONLY)) return (EROFS); if (vap->va_size != VNOVAL) { switch (vp->v_type) { case VDIR: return (EISDIR); case VCHR: case VBLK: case VSOCK: case VFIFO: if (vap->va_mtime.tv_sec == VNOVAL && vap->va_atime.tv_sec == VNOVAL && vap->va_mode == (mode_t)VNOVAL && vap->va_uid == (uid_t)VNOVAL && vap->va_gid == (gid_t)VNOVAL) return (0); vap->va_size = VNOVAL; break; default: /* * Disallow write attempts if the filesystem is * mounted read-only. */ if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); vnode_pager_setsize(vp, vap->va_size); if (np->n_flag & NMODIFIED) { if (vap->va_size == 0) error = nfs_vinvalbuf(vp, 0, ap->a_cred, ap->a_p, 1); else error = nfs_vinvalbuf(vp, V_SAVE, ap->a_cred, ap->a_p, 1); if (error) { vnode_pager_setsize(vp, np->n_size); return (error); } } tsize = np->n_size; np->n_size = np->n_vattr.va_size = vap->va_size; }; } else if ((vap->va_mtime.tv_sec != VNOVAL || vap->va_atime.tv_sec != VNOVAL) && (np->n_flag & NMODIFIED) && vp->v_type == VREG && (error = nfs_vinvalbuf(vp, V_SAVE, ap->a_cred, ap->a_p, 1)) == EINTR) return (error); error = nfs_setattrrpc(vp, vap, ap->a_cred, ap->a_p); if (error && vap->va_size != VNOVAL) { np->n_size = np->n_vattr.va_size = tsize; vnode_pager_setsize(vp, np->n_size); } return (error); } /* * Do an nfs setattr rpc. */ static int nfs_setattrrpc(vp, vap, cred, procp) register struct vnode *vp; register struct vattr *vap; struct ucred *cred; struct proc *procp; { register struct nfsv2_sattr *sp; register caddr_t cp; register int32_t t1, t2; caddr_t bpos, dpos, cp2; u_int32_t *tl; int error = 0, wccflag = NFSV3_WCCRATTR; struct mbuf *mreq, *mrep, *md, *mb, *mb2; int v3 = NFS_ISV3(vp); nfsstats.rpccnt[NFSPROC_SETATTR]++; nfsm_reqhead(vp, NFSPROC_SETATTR, NFSX_FH(v3) + NFSX_SATTR(v3)); nfsm_fhtom(vp, v3); if (v3) { nfsm_v3attrbuild(vap, TRUE); nfsm_build(tl, u_int32_t *, NFSX_UNSIGNED); *tl = nfs_false; } else { nfsm_build(sp, struct nfsv2_sattr *, NFSX_V2SATTR); if (vap->va_mode == (mode_t)VNOVAL) sp->sa_mode = nfs_xdrneg1; else sp->sa_mode = vtonfsv2_mode(vp->v_type, vap->va_mode); if (vap->va_uid == (uid_t)VNOVAL) sp->sa_uid = nfs_xdrneg1; else sp->sa_uid = txdr_unsigned(vap->va_uid); if (vap->va_gid == (gid_t)VNOVAL) sp->sa_gid = nfs_xdrneg1; else sp->sa_gid = txdr_unsigned(vap->va_gid); sp->sa_size = txdr_unsigned(vap->va_size); txdr_nfsv2time(&vap->va_atime, &sp->sa_atime); txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime); } nfsm_request(vp, NFSPROC_SETATTR, procp, cred); if (v3) { nfsm_wcc_data(vp, wccflag); } else nfsm_loadattr(vp, (struct vattr *)0); nfsm_reqdone; return (error); } /* * nfs lookup call, one step at a time... 
* First look in cache * If not found, unlock the directory nfsnode and do the rpc */ static int nfs_lookup(ap) struct vop_lookup_args /* { struct vnodeop_desc *a_desc; struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ *ap; { struct componentname *cnp = ap->a_cnp; struct vnode *dvp = ap->a_dvp; struct vnode **vpp = ap->a_vpp; int flags = cnp->cn_flags; struct vnode *newvp; u_int32_t *tl; caddr_t cp; int32_t t1, t2; struct nfsmount *nmp; caddr_t bpos, dpos, cp2; struct mbuf *mreq, *mrep, *md, *mb, *mb2; long len; nfsfh_t *fhp; struct nfsnode *np; int lockparent, wantparent, error = 0, attrflag, fhsize; int v3 = NFS_ISV3(dvp); struct proc *p = cnp->cn_proc; *vpp = NULLVP; if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) return (EROFS); if (dvp->v_type != VDIR) return (ENOTDIR); lockparent = flags & LOCKPARENT; wantparent = flags & (LOCKPARENT|WANTPARENT); nmp = VFSTONFS(dvp->v_mount); np = VTONFS(dvp); if ((error = cache_lookup(dvp, vpp, cnp)) && error != ENOENT) { struct vattr vattr; int vpid; if ((error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, p)) != 0) { *vpp = NULLVP; return (error); } newvp = *vpp; vpid = newvp->v_id; /* * See the comment starting `Step through' in ufs/ufs_lookup.c * for an explanation of the locking protocol */ if (dvp == newvp) { VREF(newvp); error = 0; } else if (flags & ISDOTDOT) { VOP_UNLOCK(dvp, 0, p); error = vget(newvp, LK_EXCLUSIVE, p); if (!error && lockparent && (flags & ISLASTCN)) error = vn_lock(dvp, LK_EXCLUSIVE, p); } else { error = vget(newvp, LK_EXCLUSIVE, p); if (!lockparent || error || !(flags & ISLASTCN)) VOP_UNLOCK(dvp, 0, p); } if (!error) { if (vpid == newvp->v_id) { if (!VOP_GETATTR(newvp, &vattr, cnp->cn_cred, p) && vattr.va_ctime.tv_sec == VTONFS(newvp)->n_ctime) { nfsstats.lookupcache_hits++; if (cnp->cn_nameiop != LOOKUP && (flags & ISLASTCN)) cnp->cn_flags |= SAVENAME; return (0); } cache_purge(newvp); } vput(newvp); if (lockparent && dvp != newvp && (flags & ISLASTCN)) VOP_UNLOCK(dvp, 0, p); } error = vn_lock(dvp, LK_EXCLUSIVE, p); *vpp = NULLVP; if (error) return (error); } error = 0; newvp = NULLVP; nfsstats.lookupcache_misses++; nfsstats.rpccnt[NFSPROC_LOOKUP]++; len = cnp->cn_namelen; nfsm_reqhead(dvp, NFSPROC_LOOKUP, NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(len)); nfsm_fhtom(dvp, v3); nfsm_strtom(cnp->cn_nameptr, len, NFS_MAXNAMLEN); nfsm_request(dvp, NFSPROC_LOOKUP, cnp->cn_proc, cnp->cn_cred); if (error) { nfsm_postop_attr(dvp, attrflag); m_freem(mrep); goto nfsmout; } nfsm_getfh(fhp, fhsize, v3); /* * Handle RENAME case... 
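 * The target of the rename must not be the directory itself (EISDIR);
 * otherwise its nfsnode is fetched so the caller can replace or
 * remove it.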
*/ if (cnp->cn_nameiop == RENAME && wantparent && (flags & ISLASTCN)) { if (NFS_CMPFH(np, fhp, fhsize)) { m_freem(mrep); return (EISDIR); } error = nfs_nget(dvp->v_mount, fhp, fhsize, &np); if (error) { m_freem(mrep); return (error); } newvp = NFSTOV(np); if (v3) { nfsm_postop_attr(newvp, attrflag); nfsm_postop_attr(dvp, attrflag); } else nfsm_loadattr(newvp, (struct vattr *)0); *vpp = newvp; m_freem(mrep); cnp->cn_flags |= SAVENAME; if (!lockparent) VOP_UNLOCK(dvp, 0, p); return (0); } if (flags & ISDOTDOT) { VOP_UNLOCK(dvp, 0, p); error = nfs_nget(dvp->v_mount, fhp, fhsize, &np); if (error) { vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, p); return (error); } newvp = NFSTOV(np); if (lockparent && (flags & ISLASTCN) && (error = vn_lock(dvp, LK_EXCLUSIVE, p))) { vput(newvp); return (error); } } else if (NFS_CMPFH(np, fhp, fhsize)) { VREF(dvp); newvp = dvp; } else { error = nfs_nget(dvp->v_mount, fhp, fhsize, &np); if (error) { m_freem(mrep); return (error); } if (!lockparent || !(flags & ISLASTCN)) VOP_UNLOCK(dvp, 0, p); newvp = NFSTOV(np); } if (v3) { nfsm_postop_attr(newvp, attrflag); nfsm_postop_attr(dvp, attrflag); } else nfsm_loadattr(newvp, (struct vattr *)0); if (cnp->cn_nameiop != LOOKUP && (flags & ISLASTCN)) cnp->cn_flags |= SAVENAME; if ((cnp->cn_flags & MAKEENTRY) && (cnp->cn_nameiop != DELETE || !(flags & ISLASTCN))) { np->n_ctime = np->n_vattr.va_ctime.tv_sec; cache_enter(dvp, newvp, cnp); } *vpp = newvp; nfsm_reqdone; if (error) { if (newvp != NULLVP) { vrele(newvp); *vpp = NULLVP; } if ((cnp->cn_nameiop == CREATE || cnp->cn_nameiop == RENAME) && (flags & ISLASTCN) && error == ENOENT) { if (!lockparent) VOP_UNLOCK(dvp, 0, p); if (dvp->v_mount->mnt_flag & MNT_RDONLY) error = EROFS; else error = EJUSTRETURN; } if (cnp->cn_nameiop != LOOKUP && (flags & ISLASTCN)) cnp->cn_flags |= SAVENAME; } return (error); } /* * nfs read call. * Just call nfs_bioread() to do the work. */ static int nfs_read(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { register struct vnode *vp = ap->a_vp; if (vp->v_type != VREG) return (EPERM); return (nfs_bioread(vp, ap->a_uio, ap->a_ioflag, ap->a_cred)); } /* * nfs readlink call */ static int nfs_readlink(ap) struct vop_readlink_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; } */ *ap; { register struct vnode *vp = ap->a_vp; if (vp->v_type != VLNK) return (EINVAL); return (nfs_bioread(vp, ap->a_uio, 0, ap->a_cred)); } /* * Do a readlink rpc. * Called by nfs_doio() from below the buffer cache. 
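 * The reply length is clamped to NFS_MAXPATHLEN; if the server reports
 * exactly NFS_MAXPATHLEN bytes, the cached n_size is used instead,
 * presumably to cope with servers that return a padded length.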
*/ int nfs_readlinkrpc(vp, uiop, cred) register struct vnode *vp; struct uio *uiop; struct ucred *cred; { register u_int32_t *tl; register caddr_t cp; register int32_t t1, t2; caddr_t bpos, dpos, cp2; int error = 0, len, attrflag; struct mbuf *mreq, *mrep, *md, *mb, *mb2; int v3 = NFS_ISV3(vp); nfsstats.rpccnt[NFSPROC_READLINK]++; nfsm_reqhead(vp, NFSPROC_READLINK, NFSX_FH(v3)); nfsm_fhtom(vp, v3); nfsm_request(vp, NFSPROC_READLINK, uiop->uio_procp, cred); if (v3) nfsm_postop_attr(vp, attrflag); if (!error) { nfsm_strsiz(len, NFS_MAXPATHLEN); if (len == NFS_MAXPATHLEN) { struct nfsnode *np = VTONFS(vp); if (np->n_size && np->n_size < NFS_MAXPATHLEN) len = np->n_size; } nfsm_mtouio(uiop, len); } nfsm_reqdone; return (error); } /* * nfs read rpc call * Ditto above */ int nfs_readrpc(vp, uiop, cred) register struct vnode *vp; struct uio *uiop; struct ucred *cred; { register u_int32_t *tl; register caddr_t cp; register int32_t t1, t2; caddr_t bpos, dpos, cp2; struct mbuf *mreq, *mrep, *md, *mb, *mb2; struct nfsmount *nmp; int error = 0, len, retlen, tsiz, eof, attrflag; int v3 = NFS_ISV3(vp); #ifndef nolint eof = 0; #endif nmp = VFSTONFS(vp->v_mount); tsiz = uiop->uio_resid; if (uiop->uio_offset + tsiz > nmp->nm_maxfilesize) return (EFBIG); while (tsiz > 0) { nfsstats.rpccnt[NFSPROC_READ]++; len = (tsiz > nmp->nm_rsize) ? nmp->nm_rsize : tsiz; nfsm_reqhead(vp, NFSPROC_READ, NFSX_FH(v3) + NFSX_UNSIGNED * 3); nfsm_fhtom(vp, v3); nfsm_build(tl, u_int32_t *, NFSX_UNSIGNED * 3); if (v3) { txdr_hyper(uiop->uio_offset, tl); *(tl + 2) = txdr_unsigned(len); } else { *tl++ = txdr_unsigned(uiop->uio_offset); *tl++ = txdr_unsigned(len); *tl = 0; } nfsm_request(vp, NFSPROC_READ, uiop->uio_procp, cred); if (v3) { nfsm_postop_attr(vp, attrflag); if (error) { m_freem(mrep); goto nfsmout; } nfsm_dissect(tl, u_int32_t *, 2 * NFSX_UNSIGNED); eof = fxdr_unsigned(int, *(tl + 1)); } else nfsm_loadattr(vp, (struct vattr *)0); nfsm_strsiz(retlen, nmp->nm_rsize); nfsm_mtouio(uiop, retlen); m_freem(mrep); tsiz -= retlen; if (v3) { if (eof || retlen == 0) tsiz = 0; } else if (retlen < len) tsiz = 0; } nfsmout: return (error); } /* * nfs write call */ int nfs_writerpc(vp, uiop, cred, iomode, must_commit) register struct vnode *vp; register struct uio *uiop; struct ucred *cred; int *iomode, *must_commit; { register u_int32_t *tl; register caddr_t cp; register int32_t t1, t2, backup; caddr_t bpos, dpos, cp2; struct mbuf *mreq, *mrep, *md, *mb, *mb2; struct nfsmount *nmp = VFSTONFS(vp->v_mount); int error = 0, len, tsiz, wccflag = NFSV3_WCCRATTR, rlen, commit; int v3 = NFS_ISV3(vp), committed = NFSV3WRITE_FILESYNC; #ifndef DIAGNOSTIC if (uiop->uio_iovcnt != 1) panic("nfs: writerpc iovcnt > 1"); #endif *must_commit = 0; tsiz = uiop->uio_resid; if (uiop->uio_offset + tsiz > nmp->nm_maxfilesize) return (EFBIG); while (tsiz > 0) { nfsstats.rpccnt[NFSPROC_WRITE]++; len = (tsiz > nmp->nm_wsize) ? nmp->nm_wsize : tsiz; nfsm_reqhead(vp, NFSPROC_WRITE, NFSX_FH(v3) + 5 * NFSX_UNSIGNED + nfsm_rndup(len)); nfsm_fhtom(vp, v3); if (v3) { nfsm_build(tl, u_int32_t *, 5 * NFSX_UNSIGNED); txdr_hyper(uiop->uio_offset, tl); tl += 2; *tl++ = txdr_unsigned(len); *tl++ = txdr_unsigned(*iomode); *tl = txdr_unsigned(len); } else { register u_int32_t x; nfsm_build(tl, u_int32_t *, 4 * NFSX_UNSIGNED); /* Set both "begin" and "current" to non-garbage. 
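 * (V2 servers ignore the "begin offset" and "total count" words, but
 * filling them in avoids putting stack garbage on the wire.)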
*/ x = txdr_unsigned((u_int32_t)uiop->uio_offset); *tl++ = x; /* "begin offset" */ *tl++ = x; /* "current offset" */ x = txdr_unsigned(len); *tl++ = x; /* total to this offset */ *tl = x; /* size of this write */ } nfsm_uiotom(uiop, len); nfsm_request(vp, NFSPROC_WRITE, uiop->uio_procp, cred); if (v3) { wccflag = NFSV3_WCCCHK; nfsm_wcc_data(vp, wccflag); if (!error) { nfsm_dissect(tl, u_int32_t *, 2 * NFSX_UNSIGNED + NFSX_V3WRITEVERF); rlen = fxdr_unsigned(int, *tl++); if (rlen == 0) { error = NFSERR_IO; m_freem(mrep); break; } else if (rlen < len) { backup = len - rlen; uiop->uio_iov->iov_base -= backup; uiop->uio_iov->iov_len += backup; uiop->uio_offset -= backup; uiop->uio_resid += backup; len = rlen; } commit = fxdr_unsigned(int, *tl++); /* * Return the lowest committment level * obtained by any of the RPCs. */ if (committed == NFSV3WRITE_FILESYNC) committed = commit; else if (committed == NFSV3WRITE_DATASYNC && commit == NFSV3WRITE_UNSTABLE) committed = commit; if ((nmp->nm_state & NFSSTA_HASWRITEVERF) == 0){ bcopy((caddr_t)tl, (caddr_t)nmp->nm_verf, NFSX_V3WRITEVERF); nmp->nm_state |= NFSSTA_HASWRITEVERF; } else if (bcmp((caddr_t)tl, (caddr_t)nmp->nm_verf, NFSX_V3WRITEVERF)) { *must_commit = 1; bcopy((caddr_t)tl, (caddr_t)nmp->nm_verf, NFSX_V3WRITEVERF); } } } else nfsm_loadattr(vp, (struct vattr *)0); if (wccflag) VTONFS(vp)->n_mtime = VTONFS(vp)->n_vattr.va_mtime.tv_sec; m_freem(mrep); if (error) break; tsiz -= len; } nfsmout: if (vp->v_mount->mnt_flag & MNT_ASYNC) committed = NFSV3WRITE_FILESYNC; *iomode = committed; if (error) uiop->uio_resid = tsiz; return (error); } /* * nfs mknod rpc * For NFS v2 this is a kludge. Use a create rpc but with the IFMT bits of the * mode set to specify the file type and the size field for rdev. */ static int nfs_mknodrpc(dvp, vpp, cnp, vap) register struct vnode *dvp; register struct vnode **vpp; register struct componentname *cnp; register struct vattr *vap; { register struct nfsv2_sattr *sp; register u_int32_t *tl; register caddr_t cp; register int32_t t1, t2; struct vnode *newvp = (struct vnode *)0; struct nfsnode *np = (struct nfsnode *)0; struct vattr vattr; char *cp2; caddr_t bpos, dpos; int error = 0, wccflag = NFSV3_WCCRATTR, gotvp = 0; struct mbuf *mreq, *mrep, *md, *mb, *mb2; u_int32_t rdev; int v3 = NFS_ISV3(dvp); if (vap->va_type == VCHR || vap->va_type == VBLK) rdev = txdr_unsigned(vap->va_rdev); else if (vap->va_type == VFIFO || vap->va_type == VSOCK) rdev = nfs_xdrneg1; else { VOP_ABORTOP(dvp, cnp); return (EOPNOTSUPP); } if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred, cnp->cn_proc)) != 0) { VOP_ABORTOP(dvp, cnp); return (error); } nfsstats.rpccnt[NFSPROC_MKNOD]++; nfsm_reqhead(dvp, NFSPROC_MKNOD, NFSX_FH(v3) + 4 * NFSX_UNSIGNED + + nfsm_rndup(cnp->cn_namelen) + NFSX_SATTR(v3)); nfsm_fhtom(dvp, v3); nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN); if (v3) { nfsm_build(tl, u_int32_t *, NFSX_UNSIGNED); *tl++ = vtonfsv3_type(vap->va_type); nfsm_v3attrbuild(vap, FALSE); if (vap->va_type == VCHR || vap->va_type == VBLK) { nfsm_build(tl, u_int32_t *, 2 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(umajor(vap->va_rdev)); *tl = txdr_unsigned(uminor(vap->va_rdev)); } } else { nfsm_build(sp, struct nfsv2_sattr *, NFSX_V2SATTR); sp->sa_mode = vtonfsv2_mode(vap->va_type, vap->va_mode); sp->sa_uid = nfs_xdrneg1; sp->sa_gid = nfs_xdrneg1; sp->sa_size = rdev; txdr_nfsv2time(&vap->va_atime, &sp->sa_atime); txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime); } nfsm_request(dvp, NFSPROC_MKNOD, cnp->cn_proc, cnp->cn_cred); if (!error) { 
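		/*
		 * The MKNOD reply may or may not carry the new file handle;
		 * if it does not, look the name up again to get the vnode.
		 */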
nfsm_mtofh(dvp, newvp, v3, gotvp); if (!gotvp) { if (newvp) { vput(newvp); newvp = (struct vnode *)0; } error = nfs_lookitup(dvp, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_cred, cnp->cn_proc, &np); if (!error) newvp = NFSTOV(np); } } if (v3) nfsm_wcc_data(dvp, wccflag); nfsm_reqdone; if (error) { if (newvp) vput(newvp); } else { if (cnp->cn_flags & MAKEENTRY) cache_enter(dvp, newvp, cnp); *vpp = newvp; } zfree(namei_zone, cnp->cn_pnbuf); VTONFS(dvp)->n_flag |= NMODIFIED; if (!wccflag) VTONFS(dvp)->n_attrstamp = 0; return (error); } /* * nfs mknod vop * just call nfs_mknodrpc() to do the work. */ /* ARGSUSED */ static int nfs_mknod(ap) struct vop_mknod_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { struct vnode *newvp; int error; error = nfs_mknodrpc(ap->a_dvp, &newvp, ap->a_cnp, ap->a_vap); if (!error) vput(newvp); return (error); } static u_long create_verf; /* * nfs file create call */ static int nfs_create(ap) struct vop_create_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { register struct vnode *dvp = ap->a_dvp; register struct vattr *vap = ap->a_vap; register struct componentname *cnp = ap->a_cnp; register struct nfsv2_sattr *sp; register u_int32_t *tl; register caddr_t cp; register int32_t t1, t2; struct nfsnode *np = (struct nfsnode *)0; struct vnode *newvp = (struct vnode *)0; caddr_t bpos, dpos, cp2; int error = 0, wccflag = NFSV3_WCCRATTR, gotvp = 0, fmode = 0; struct mbuf *mreq, *mrep, *md, *mb, *mb2; struct vattr vattr; int v3 = NFS_ISV3(dvp); /* * Oops, not for me.. */ if (vap->va_type == VSOCK) return (nfs_mknodrpc(dvp, ap->a_vpp, cnp, vap)); if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred, cnp->cn_proc)) != 0) { VOP_ABORTOP(dvp, cnp); return (error); } if (vap->va_vaflags & VA_EXCLUSIVE) fmode |= O_EXCL; again: nfsstats.rpccnt[NFSPROC_CREATE]++; nfsm_reqhead(dvp, NFSPROC_CREATE, NFSX_FH(v3) + 2 * NFSX_UNSIGNED + nfsm_rndup(cnp->cn_namelen) + NFSX_SATTR(v3)); nfsm_fhtom(dvp, v3); nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN); if (v3) { nfsm_build(tl, u_int32_t *, NFSX_UNSIGNED); if (fmode & O_EXCL) { *tl = txdr_unsigned(NFSV3CREATE_EXCLUSIVE); nfsm_build(tl, u_int32_t *, NFSX_V3CREATEVERF); #ifdef INET if (!TAILQ_EMPTY(&in_ifaddrhead)) *tl++ = IA_SIN(in_ifaddrhead.tqh_first)->sin_addr.s_addr; else #endif *tl++ = create_verf; *tl = ++create_verf; } else { *tl = txdr_unsigned(NFSV3CREATE_UNCHECKED); nfsm_v3attrbuild(vap, FALSE); } } else { nfsm_build(sp, struct nfsv2_sattr *, NFSX_V2SATTR); sp->sa_mode = vtonfsv2_mode(vap->va_type, vap->va_mode); sp->sa_uid = nfs_xdrneg1; sp->sa_gid = nfs_xdrneg1; sp->sa_size = 0; txdr_nfsv2time(&vap->va_atime, &sp->sa_atime); txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime); } nfsm_request(dvp, NFSPROC_CREATE, cnp->cn_proc, cnp->cn_cred); if (!error) { nfsm_mtofh(dvp, newvp, v3, gotvp); if (!gotvp) { if (newvp) { vput(newvp); newvp = (struct vnode *)0; } error = nfs_lookitup(dvp, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_cred, cnp->cn_proc, &np); if (!error) newvp = NFSTOV(np); } } if (v3) nfsm_wcc_data(dvp, wccflag); nfsm_reqdone; if (error) { if (v3 && (fmode & O_EXCL) && error == NFSERR_NOTSUPP) { fmode &= ~O_EXCL; goto again; } if (newvp) vput(newvp); } else if (v3 && (fmode & O_EXCL)) error = nfs_setattrrpc(newvp, vap, cnp->cn_cred, cnp->cn_proc); if (!error) { if (cnp->cn_flags & MAKEENTRY) cache_enter(dvp, newvp, cnp); *ap->a_vpp = newvp; } if (error || (cnp->cn_flags & SAVESTART) == 0) 
zfree(namei_zone, cnp->cn_pnbuf); VTONFS(dvp)->n_flag |= NMODIFIED; if (!wccflag) VTONFS(dvp)->n_attrstamp = 0; return (error); } /* * nfs file remove call * To try and make nfs semantics closer to ufs semantics, a file that has * other processes using the vnode is renamed instead of removed and then * removed later on the last close. * - If v_usecount > 1 * If a rename is not already in the works * call nfs_sillyrename() to set it up * else * do the remove rpc */ static int nfs_remove(ap) struct vop_remove_args /* { struct vnodeop_desc *a_desc; struct vnode * a_dvp; struct vnode * a_vp; struct componentname * a_cnp; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct vnode *dvp = ap->a_dvp; register struct componentname *cnp = ap->a_cnp; register struct nfsnode *np = VTONFS(vp); int error = 0; struct vattr vattr; #ifndef DIAGNOSTIC if ((cnp->cn_flags & HASBUF) == 0) panic("nfs_remove: no name"); if (vp->v_usecount < 1) panic("nfs_remove: bad v_usecount"); #endif if (vp->v_type == VDIR) error = EPERM; else if (vp->v_usecount == 1 || (np->n_sillyrename && VOP_GETATTR(vp, &vattr, cnp->cn_cred, cnp->cn_proc) == 0 && vattr.va_nlink > 1)) { /* * Purge the name cache so that the chance of a lookup for * the name succeeding while the remove is in progress is * minimized. Without node locking it can still happen, such * that an I/O op returns ESTALE, but since you get this if * another host removes the file.. */ cache_purge(vp); /* * throw away biocache buffers, mainly to avoid * unnecessary delayed writes later. */ error = nfs_vinvalbuf(vp, 0, cnp->cn_cred, cnp->cn_proc, 1); /* Do the rpc */ if (error != EINTR) error = nfs_removerpc(dvp, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_cred, cnp->cn_proc); /* * Kludge City: If the first reply to the remove rpc is lost.. * the reply to the retransmitted request will be ENOENT * since the file was in fact removed * Therefore, we cheat and return success. */ if (error == ENOENT) error = 0; } else if (!np->n_sillyrename) error = nfs_sillyrename(dvp, vp, cnp); zfree(namei_zone, cnp->cn_pnbuf); np->n_attrstamp = 0; return (error); } /* * nfs file remove rpc called from nfs_inactive */ int nfs_removeit(sp) register struct sillyrename *sp; { return (nfs_removerpc(sp->s_dvp, sp->s_name, sp->s_namlen, sp->s_cred, (struct proc *)0)); } /* * Nfs remove rpc, called from nfs_remove() and nfs_removeit(). 
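 * The parent directory is marked NMODIFIED and its attribute cache is
 * invalidated unless the V3 reply supplied fresh wcc attributes.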
*/ static int nfs_removerpc(dvp, name, namelen, cred, proc) register struct vnode *dvp; const char *name; int namelen; struct ucred *cred; struct proc *proc; { register u_int32_t *tl; register caddr_t cp; register int32_t t1, t2; caddr_t bpos, dpos, cp2; int error = 0, wccflag = NFSV3_WCCRATTR; struct mbuf *mreq, *mrep, *md, *mb, *mb2; int v3 = NFS_ISV3(dvp); nfsstats.rpccnt[NFSPROC_REMOVE]++; nfsm_reqhead(dvp, NFSPROC_REMOVE, NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(namelen)); nfsm_fhtom(dvp, v3); nfsm_strtom(name, namelen, NFS_MAXNAMLEN); nfsm_request(dvp, NFSPROC_REMOVE, proc, cred); if (v3) nfsm_wcc_data(dvp, wccflag); nfsm_reqdone; VTONFS(dvp)->n_flag |= NMODIFIED; if (!wccflag) VTONFS(dvp)->n_attrstamp = 0; return (error); } /* * nfs file rename call */ static int nfs_rename(ap) struct vop_rename_args /* { struct vnode *a_fdvp; struct vnode *a_fvp; struct componentname *a_fcnp; struct vnode *a_tdvp; struct vnode *a_tvp; struct componentname *a_tcnp; } */ *ap; { register struct vnode *fvp = ap->a_fvp; register struct vnode *tvp = ap->a_tvp; register struct vnode *fdvp = ap->a_fdvp; register struct vnode *tdvp = ap->a_tdvp; register struct componentname *tcnp = ap->a_tcnp; register struct componentname *fcnp = ap->a_fcnp; int error; #ifndef DIAGNOSTIC if ((tcnp->cn_flags & HASBUF) == 0 || (fcnp->cn_flags & HASBUF) == 0) panic("nfs_rename: no name"); #endif /* Check for cross-device rename */ if ((fvp->v_mount != tdvp->v_mount) || (tvp && (fvp->v_mount != tvp->v_mount))) { error = EXDEV; goto out; } /* * We have to flush B_DELWRI data prior to renaming * the file. If we don't, the delayed-write buffers * can be flushed out later after the file has gone stale * under NFSV3. NFSV2 does not have this problem because * ( as far as I can tell ) it flushes dirty buffers more * often. */ VOP_FSYNC(fvp, fcnp->cn_cred, MNT_WAIT, fcnp->cn_proc); if (tvp) VOP_FSYNC(tvp, tcnp->cn_cred, MNT_WAIT, tcnp->cn_proc); /* * If the tvp exists and is in use, sillyrename it before doing the * rename of the new file over it. * XXX Can't sillyrename a directory. */ if (tvp && tvp->v_usecount > 1 && !VTONFS(tvp)->n_sillyrename && tvp->v_type != VDIR && !nfs_sillyrename(tdvp, tvp, tcnp)) { vput(tvp); tvp = NULL; } error = nfs_renamerpc(fdvp, fcnp->cn_nameptr, fcnp->cn_namelen, tdvp, tcnp->cn_nameptr, tcnp->cn_namelen, tcnp->cn_cred, tcnp->cn_proc); if (fvp->v_type == VDIR) { if (tvp != NULL && tvp->v_type == VDIR) cache_purge(tdvp); cache_purge(fdvp); } out: VOP_ABORTOP(tdvp, tcnp); if (tdvp == tvp) vrele(tdvp); else vput(tdvp); if (tvp) vput(tvp); VOP_ABORTOP(fdvp, fcnp); vrele(fdvp); vrele(fvp); /* * Kludge: Map ENOENT => 0 assuming that it is a reply to a retry. */ if (error == ENOENT) error = 0; return (error); } /* * nfs file rename rpc called from nfs_remove() above */ static int nfs_renameit(sdvp, scnp, sp) struct vnode *sdvp; struct componentname *scnp; register struct sillyrename *sp; { return (nfs_renamerpc(sdvp, scnp->cn_nameptr, scnp->cn_namelen, sdvp, sp->s_name, sp->s_namlen, scnp->cn_cred, scnp->cn_proc)); } /* * Do an nfs rename rpc. Called from nfs_rename() and nfs_renameit(). 
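 * No sillyrename handling is done here; the caller is expected to have
 * dealt with a busy target before issuing the RPC.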
*/ static int nfs_renamerpc(fdvp, fnameptr, fnamelen, tdvp, tnameptr, tnamelen, cred, proc) register struct vnode *fdvp; const char *fnameptr; int fnamelen; register struct vnode *tdvp; const char *tnameptr; int tnamelen; struct ucred *cred; struct proc *proc; { register u_int32_t *tl; register caddr_t cp; register int32_t t1, t2; caddr_t bpos, dpos, cp2; int error = 0, fwccflag = NFSV3_WCCRATTR, twccflag = NFSV3_WCCRATTR; struct mbuf *mreq, *mrep, *md, *mb, *mb2; int v3 = NFS_ISV3(fdvp); nfsstats.rpccnt[NFSPROC_RENAME]++; nfsm_reqhead(fdvp, NFSPROC_RENAME, (NFSX_FH(v3) + NFSX_UNSIGNED)*2 + nfsm_rndup(fnamelen) + nfsm_rndup(tnamelen)); nfsm_fhtom(fdvp, v3); nfsm_strtom(fnameptr, fnamelen, NFS_MAXNAMLEN); nfsm_fhtom(tdvp, v3); nfsm_strtom(tnameptr, tnamelen, NFS_MAXNAMLEN); nfsm_request(fdvp, NFSPROC_RENAME, proc, cred); if (v3) { nfsm_wcc_data(fdvp, fwccflag); nfsm_wcc_data(tdvp, twccflag); } nfsm_reqdone; VTONFS(fdvp)->n_flag |= NMODIFIED; VTONFS(tdvp)->n_flag |= NMODIFIED; if (!fwccflag) VTONFS(fdvp)->n_attrstamp = 0; if (!twccflag) VTONFS(tdvp)->n_attrstamp = 0; return (error); } /* * nfs hard link create call */ static int nfs_link(ap) struct vop_link_args /* { struct vnode *a_tdvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct vnode *tdvp = ap->a_tdvp; register struct componentname *cnp = ap->a_cnp; register u_int32_t *tl; register caddr_t cp; register int32_t t1, t2; caddr_t bpos, dpos, cp2; int error = 0, wccflag = NFSV3_WCCRATTR, attrflag = 0; struct mbuf *mreq, *mrep, *md, *mb, *mb2; int v3; if (vp->v_mount != tdvp->v_mount) { VOP_ABORTOP(tdvp, cnp); return (EXDEV); } /* * Push all writes to the server, so that the attribute cache * doesn't get "out of sync" with the server. * XXX There should be a better way! */ VOP_FSYNC(vp, cnp->cn_cred, MNT_WAIT, cnp->cn_proc); v3 = NFS_ISV3(vp); nfsstats.rpccnt[NFSPROC_LINK]++; nfsm_reqhead(vp, NFSPROC_LINK, NFSX_FH(v3)*2 + NFSX_UNSIGNED + nfsm_rndup(cnp->cn_namelen)); nfsm_fhtom(vp, v3); nfsm_fhtom(tdvp, v3); nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN); nfsm_request(vp, NFSPROC_LINK, cnp->cn_proc, cnp->cn_cred); if (v3) { nfsm_postop_attr(vp, attrflag); nfsm_wcc_data(tdvp, wccflag); } nfsm_reqdone; zfree(namei_zone, cnp->cn_pnbuf); VTONFS(tdvp)->n_flag |= NMODIFIED; if (!attrflag) VTONFS(vp)->n_attrstamp = 0; if (!wccflag) VTONFS(tdvp)->n_attrstamp = 0; /* * Kludge: Map EEXIST => 0 assuming that it is a reply to a retry. 
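 * (If the first reply is lost, the retransmitted LINK finds the name
 * already present and the server answers EEXIST even though the link
 * was in fact created.)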
*/ if (error == EEXIST) error = 0; return (error); } /* * nfs symbolic link create call */ static int nfs_symlink(ap) struct vop_symlink_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; char *a_target; } */ *ap; { register struct vnode *dvp = ap->a_dvp; register struct vattr *vap = ap->a_vap; register struct componentname *cnp = ap->a_cnp; register struct nfsv2_sattr *sp; register u_int32_t *tl; register caddr_t cp; register int32_t t1, t2; caddr_t bpos, dpos, cp2; int slen, error = 0, wccflag = NFSV3_WCCRATTR, gotvp; struct mbuf *mreq, *mrep, *md, *mb, *mb2; struct vnode *newvp = (struct vnode *)0; int v3 = NFS_ISV3(dvp); nfsstats.rpccnt[NFSPROC_SYMLINK]++; slen = strlen(ap->a_target); nfsm_reqhead(dvp, NFSPROC_SYMLINK, NFSX_FH(v3) + 2*NFSX_UNSIGNED + nfsm_rndup(cnp->cn_namelen) + nfsm_rndup(slen) + NFSX_SATTR(v3)); nfsm_fhtom(dvp, v3); nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN); if (v3) { nfsm_v3attrbuild(vap, FALSE); } nfsm_strtom(ap->a_target, slen, NFS_MAXPATHLEN); if (!v3) { nfsm_build(sp, struct nfsv2_sattr *, NFSX_V2SATTR); sp->sa_mode = vtonfsv2_mode(VLNK, vap->va_mode); sp->sa_uid = nfs_xdrneg1; sp->sa_gid = nfs_xdrneg1; sp->sa_size = nfs_xdrneg1; txdr_nfsv2time(&vap->va_atime, &sp->sa_atime); txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime); } nfsm_request(dvp, NFSPROC_SYMLINK, cnp->cn_proc, cnp->cn_cred); if (v3) { if (!error) nfsm_mtofh(dvp, newvp, v3, gotvp); nfsm_wcc_data(dvp, wccflag); } nfsm_reqdone; if (newvp) vput(newvp); VTONFS(dvp)->n_flag |= NMODIFIED; if (!wccflag) VTONFS(dvp)->n_attrstamp = 0; /* * Kludge: Map EEXIST => 0 assuming that it is a reply to a retry. */ if (error == EEXIST) error = 0; /* * cnp's buffer expected to be freed if SAVESTART not set or * if an error was returned. 
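 * Note that any vnode obtained from the reply is released via vput()
 * above rather than handed back through a_vpp.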
*/ if (error || (cnp->cn_flags & SAVESTART) == 0) zfree(namei_zone, cnp->cn_pnbuf); return (error); } /* * nfs make dir call */ static int nfs_mkdir(ap) struct vop_mkdir_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { register struct vnode *dvp = ap->a_dvp; register struct vattr *vap = ap->a_vap; register struct componentname *cnp = ap->a_cnp; register struct nfsv2_sattr *sp; register u_int32_t *tl; register caddr_t cp; register int32_t t1, t2; register int len; struct nfsnode *np = (struct nfsnode *)0; struct vnode *newvp = (struct vnode *)0; caddr_t bpos, dpos, cp2; int error = 0, wccflag = NFSV3_WCCRATTR; int gotvp = 0; struct mbuf *mreq, *mrep, *md, *mb, *mb2; struct vattr vattr; int v3 = NFS_ISV3(dvp); if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred, cnp->cn_proc)) != 0) { VOP_ABORTOP(dvp, cnp); return (error); } len = cnp->cn_namelen; nfsstats.rpccnt[NFSPROC_MKDIR]++; nfsm_reqhead(dvp, NFSPROC_MKDIR, NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(len) + NFSX_SATTR(v3)); nfsm_fhtom(dvp, v3); nfsm_strtom(cnp->cn_nameptr, len, NFS_MAXNAMLEN); if (v3) { nfsm_v3attrbuild(vap, FALSE); } else { nfsm_build(sp, struct nfsv2_sattr *, NFSX_V2SATTR); sp->sa_mode = vtonfsv2_mode(VDIR, vap->va_mode); sp->sa_uid = nfs_xdrneg1; sp->sa_gid = nfs_xdrneg1; sp->sa_size = nfs_xdrneg1; txdr_nfsv2time(&vap->va_atime, &sp->sa_atime); txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime); } nfsm_request(dvp, NFSPROC_MKDIR, cnp->cn_proc, cnp->cn_cred); if (!error) nfsm_mtofh(dvp, newvp, v3, gotvp); if (v3) nfsm_wcc_data(dvp, wccflag); nfsm_reqdone; VTONFS(dvp)->n_flag |= NMODIFIED; if (!wccflag) VTONFS(dvp)->n_attrstamp = 0; /* * Kludge: Map EEXIST => 0 assuming that you have a reply to a retry * if we can succeed in looking up the directory. */ if (error == EEXIST || (!error && !gotvp)) { if (newvp) { vrele(newvp); newvp = (struct vnode *)0; } error = nfs_lookitup(dvp, cnp->cn_nameptr, len, cnp->cn_cred, cnp->cn_proc, &np); if (!error) { newvp = NFSTOV(np); if (newvp->v_type != VDIR) error = EEXIST; } } if (error) { if (newvp) vrele(newvp); } else *ap->a_vpp = newvp; if (error || (cnp->cn_flags & SAVESTART) == 0) zfree(namei_zone, cnp->cn_pnbuf); return (error); } /* * nfs remove directory call */ static int nfs_rmdir(ap) struct vop_rmdir_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct vnode *dvp = ap->a_dvp; register struct componentname *cnp = ap->a_cnp; register u_int32_t *tl; register caddr_t cp; register int32_t t1, t2; caddr_t bpos, dpos, cp2; int error = 0, wccflag = NFSV3_WCCRATTR; struct mbuf *mreq, *mrep, *md, *mb, *mb2; int v3 = NFS_ISV3(dvp); if (dvp == vp) return (EINVAL); nfsstats.rpccnt[NFSPROC_RMDIR]++; nfsm_reqhead(dvp, NFSPROC_RMDIR, NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(cnp->cn_namelen)); nfsm_fhtom(dvp, v3); nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN); nfsm_request(dvp, NFSPROC_RMDIR, cnp->cn_proc, cnp->cn_cred); if (v3) nfsm_wcc_data(dvp, wccflag); nfsm_reqdone; zfree(namei_zone, cnp->cn_pnbuf); VTONFS(dvp)->n_flag |= NMODIFIED; if (!wccflag) VTONFS(dvp)->n_attrstamp = 0; cache_purge(dvp); cache_purge(vp); /* * Kludge: Map ENOENT => 0 assuming that you have a reply to a retry. 
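 * (Same retransmission situation as the EEXIST case in nfs_link()
 * above, only here the second reply is ENOENT.)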
*/ if (error == ENOENT) error = 0; return (error); } /* * nfs readdir call */ static int nfs_readdir(ap) struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct nfsnode *np = VTONFS(vp); register struct uio *uio = ap->a_uio; int tresid, error; struct vattr vattr; if (vp->v_type != VDIR) return (EPERM); /* * First, check for hit on the EOF offset cache */ if (np->n_direofoffset > 0 && uio->uio_offset >= np->n_direofoffset && (np->n_flag & NMODIFIED) == 0) { if (VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NQNFS) { if (NQNFS_CKCACHABLE(vp, ND_READ)) { nfsstats.direofcache_hits++; return (0); } } else if (VOP_GETATTR(vp, &vattr, ap->a_cred, uio->uio_procp) == 0 && np->n_mtime == vattr.va_mtime.tv_sec) { nfsstats.direofcache_hits++; return (0); } } /* * Call nfs_bioread() to do the real work. */ tresid = uio->uio_resid; error = nfs_bioread(vp, uio, 0, ap->a_cred); if (!error && uio->uio_resid == tresid) nfsstats.direofcache_misses++; return (error); } /* * Readdir rpc call. * Called from below the buffer cache by nfs_doio(). */ int nfs_readdirrpc(vp, uiop, cred) struct vnode *vp; register struct uio *uiop; struct ucred *cred; { register int len, left; register struct dirent *dp = NULL; register u_int32_t *tl; register caddr_t cp; register int32_t t1, t2; register nfsuint64 *cookiep; caddr_t bpos, dpos, cp2; struct mbuf *mreq, *mrep, *md, *mb, *mb2; nfsuint64 cookie; struct nfsmount *nmp = VFSTONFS(vp->v_mount); struct nfsnode *dnp = VTONFS(vp); u_quad_t fileno; int error = 0, tlen, more_dirs = 1, blksiz = 0, bigenough = 1; int attrflag; int v3 = NFS_ISV3(vp); #ifndef DIAGNOSTIC if (uiop->uio_iovcnt != 1 || (uiop->uio_offset & (DIRBLKSIZ - 1)) || (uiop->uio_resid & (DIRBLKSIZ - 1))) panic("nfs readdirrpc bad uio"); #endif /* * If there is no cookie, assume directory was stale. */ cookiep = nfs_getcookie(dnp, uiop->uio_offset, 0); if (cookiep) cookie = *cookiep; else return (NFSERR_BAD_COOKIE); /* * Loop around doing readdir rpc's of size nm_readdirsize * truncated to a multiple of DIRBLKSIZ. * The stopping criteria is EOF or buffer full. 
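 * Each pass sends back the cookie (and, for V3, the cookie verifier)
 * from the previous reply so the server can resume the directory scan.
 */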
*/ while (more_dirs && bigenough) { nfsstats.rpccnt[NFSPROC_READDIR]++; nfsm_reqhead(vp, NFSPROC_READDIR, NFSX_FH(v3) + NFSX_READDIR(v3)); nfsm_fhtom(vp, v3); if (v3) { nfsm_build(tl, u_int32_t *, 5 * NFSX_UNSIGNED); *tl++ = cookie.nfsuquad[0]; *tl++ = cookie.nfsuquad[1]; *tl++ = dnp->n_cookieverf.nfsuquad[0]; *tl++ = dnp->n_cookieverf.nfsuquad[1]; } else { nfsm_build(tl, u_int32_t *, 2 * NFSX_UNSIGNED); *tl++ = cookie.nfsuquad[0]; } *tl = txdr_unsigned(nmp->nm_readdirsize); nfsm_request(vp, NFSPROC_READDIR, uiop->uio_procp, cred); if (v3) { nfsm_postop_attr(vp, attrflag); if (!error) { nfsm_dissect(tl, u_int32_t *, 2 * NFSX_UNSIGNED); dnp->n_cookieverf.nfsuquad[0] = *tl++; dnp->n_cookieverf.nfsuquad[1] = *tl; } else { m_freem(mrep); goto nfsmout; } } nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED); more_dirs = fxdr_unsigned(int, *tl); /* loop thru the dir entries, doctoring them to 4bsd form */ while (more_dirs && bigenough) { if (v3) { nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED); fileno = fxdr_hyper(tl); len = fxdr_unsigned(int, *(tl + 2)); } else { nfsm_dissect(tl, u_int32_t *, 2 * NFSX_UNSIGNED); fileno = fxdr_unsigned(u_quad_t, *tl++); len = fxdr_unsigned(int, *tl); } if (len <= 0 || len > NFS_MAXNAMLEN) { error = EBADRPC; m_freem(mrep); goto nfsmout; } tlen = nfsm_rndup(len); if (tlen == len) tlen += 4; /* To ensure null termination */ left = DIRBLKSIZ - blksiz; if ((tlen + DIRHDSIZ) > left) { dp->d_reclen += left; uiop->uio_iov->iov_base += left; uiop->uio_iov->iov_len -= left; uiop->uio_offset += left; uiop->uio_resid -= left; blksiz = 0; } if ((tlen + DIRHDSIZ) > uiop->uio_resid) bigenough = 0; if (bigenough) { dp = (struct dirent *)uiop->uio_iov->iov_base; dp->d_fileno = (int)fileno; dp->d_namlen = len; dp->d_reclen = tlen + DIRHDSIZ; dp->d_type = DT_UNKNOWN; blksiz += dp->d_reclen; if (blksiz == DIRBLKSIZ) blksiz = 0; uiop->uio_offset += DIRHDSIZ; uiop->uio_resid -= DIRHDSIZ; uiop->uio_iov->iov_base += DIRHDSIZ; uiop->uio_iov->iov_len -= DIRHDSIZ; nfsm_mtouio(uiop, len); cp = uiop->uio_iov->iov_base; tlen -= len; *cp = '\0'; /* null terminate */ uiop->uio_iov->iov_base += tlen; uiop->uio_iov->iov_len -= tlen; uiop->uio_offset += tlen; uiop->uio_resid -= tlen; } else nfsm_adv(nfsm_rndup(len)); if (v3) { nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED); } else { nfsm_dissect(tl, u_int32_t *, 2 * NFSX_UNSIGNED); } if (bigenough) { cookie.nfsuquad[0] = *tl++; if (v3) cookie.nfsuquad[1] = *tl++; } else if (v3) tl += 2; else tl++; more_dirs = fxdr_unsigned(int, *tl); } /* * If at end of rpc data, get the eof boolean */ if (!more_dirs) { nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED); more_dirs = (fxdr_unsigned(int, *tl) == 0); } m_freem(mrep); } /* * Fill last record, iff any, out to a multiple of DIRBLKSIZ * by increasing d_reclen for the last record. */ if (blksiz > 0) { left = DIRBLKSIZ - blksiz; dp->d_reclen += left; uiop->uio_iov->iov_base += left; uiop->uio_iov->iov_len -= left; uiop->uio_offset += left; uiop->uio_resid -= left; } /* * We are now either at the end of the directory or have filled the * block. */ if (bigenough) dnp->n_direofoffset = uiop->uio_offset; else { if (uiop->uio_resid > 0) printf("EEK! readdirrpc resid > 0\n"); cookiep = nfs_getcookie(dnp, uiop->uio_offset, 1); *cookiep = cookie; } nfsmout: return (error); } /* * NFS V3 readdir plus RPC. Used in place of nfs_readdirrpc(). 
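 * READDIRPLUS also returns attributes and a file handle for each entry,
 * so every name parsed here can be primed into the namei cache.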
*/ int nfs_readdirplusrpc(vp, uiop, cred) struct vnode *vp; register struct uio *uiop; struct ucred *cred; { register int len, left; register struct dirent *dp; register u_int32_t *tl; register caddr_t cp; register int32_t t1, t2; register struct vnode *newvp; register nfsuint64 *cookiep; caddr_t bpos, dpos, cp2, dpossav1, dpossav2; struct mbuf *mreq, *mrep, *md, *mb, *mb2, *mdsav1, *mdsav2; struct nameidata nami, *ndp = &nami; struct componentname *cnp = &ndp->ni_cnd; nfsuint64 cookie; struct nfsmount *nmp = VFSTONFS(vp->v_mount); struct nfsnode *dnp = VTONFS(vp), *np; nfsfh_t *fhp; u_quad_t fileno; int error = 0, tlen, more_dirs = 1, blksiz = 0, doit, bigenough = 1, i; int attrflag, fhsize; #ifndef nolint dp = (struct dirent *)0; #endif #ifndef DIAGNOSTIC if (uiop->uio_iovcnt != 1 || (uiop->uio_offset & (DIRBLKSIZ - 1)) || (uiop->uio_resid & (DIRBLKSIZ - 1))) panic("nfs readdirplusrpc bad uio"); #endif ndp->ni_dvp = vp; newvp = NULLVP; /* * If there is no cookie, assume directory was stale. */ cookiep = nfs_getcookie(dnp, uiop->uio_offset, 0); if (cookiep) cookie = *cookiep; else return (NFSERR_BAD_COOKIE); /* * Loop around doing readdir rpc's of size nm_readdirsize * truncated to a multiple of DIRBLKSIZ. * The stopping criteria is EOF or buffer full. */ while (more_dirs && bigenough) { nfsstats.rpccnt[NFSPROC_READDIRPLUS]++; nfsm_reqhead(vp, NFSPROC_READDIRPLUS, NFSX_FH(1) + 6 * NFSX_UNSIGNED); nfsm_fhtom(vp, 1); nfsm_build(tl, u_int32_t *, 6 * NFSX_UNSIGNED); *tl++ = cookie.nfsuquad[0]; *tl++ = cookie.nfsuquad[1]; *tl++ = dnp->n_cookieverf.nfsuquad[0]; *tl++ = dnp->n_cookieverf.nfsuquad[1]; *tl++ = txdr_unsigned(nmp->nm_readdirsize); *tl = txdr_unsigned(nmp->nm_rsize); nfsm_request(vp, NFSPROC_READDIRPLUS, uiop->uio_procp, cred); nfsm_postop_attr(vp, attrflag); if (error) { m_freem(mrep); goto nfsmout; } nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED); dnp->n_cookieverf.nfsuquad[0] = *tl++; dnp->n_cookieverf.nfsuquad[1] = *tl++; more_dirs = fxdr_unsigned(int, *tl); /* loop thru the dir entries, doctoring them to 4bsd form */ while (more_dirs && bigenough) { nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED); fileno = fxdr_hyper(tl); len = fxdr_unsigned(int, *(tl + 2)); if (len <= 0 || len > NFS_MAXNAMLEN) { error = EBADRPC; m_freem(mrep); goto nfsmout; } tlen = nfsm_rndup(len); if (tlen == len) tlen += 4; /* To ensure null termination*/ left = DIRBLKSIZ - blksiz; if ((tlen + DIRHDSIZ) > left) { dp->d_reclen += left; uiop->uio_iov->iov_base += left; uiop->uio_iov->iov_len -= left; uiop->uio_offset += left; uiop->uio_resid -= left; blksiz = 0; } if ((tlen + DIRHDSIZ) > uiop->uio_resid) bigenough = 0; if (bigenough) { dp = (struct dirent *)uiop->uio_iov->iov_base; dp->d_fileno = (int)fileno; dp->d_namlen = len; dp->d_reclen = tlen + DIRHDSIZ; dp->d_type = DT_UNKNOWN; blksiz += dp->d_reclen; if (blksiz == DIRBLKSIZ) blksiz = 0; uiop->uio_offset += DIRHDSIZ; uiop->uio_resid -= DIRHDSIZ; uiop->uio_iov->iov_base += DIRHDSIZ; uiop->uio_iov->iov_len -= DIRHDSIZ; cnp->cn_nameptr = uiop->uio_iov->iov_base; cnp->cn_namelen = len; nfsm_mtouio(uiop, len); cp = uiop->uio_iov->iov_base; tlen -= len; *cp = '\0'; uiop->uio_iov->iov_base += tlen; uiop->uio_iov->iov_len -= tlen; uiop->uio_offset += tlen; uiop->uio_resid -= tlen; } else nfsm_adv(nfsm_rndup(len)); nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED); if (bigenough) { cookie.nfsuquad[0] = *tl++; cookie.nfsuquad[1] = *tl++; } else tl += 2; /* * Since the attributes are before the file handle * (sigh), we must skip over the attributes and then * 
come back and get them. */ attrflag = fxdr_unsigned(int, *tl); if (attrflag) { dpossav1 = dpos; mdsav1 = md; nfsm_adv(NFSX_V3FATTR); nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED); doit = fxdr_unsigned(int, *tl); if (doit) { nfsm_getfh(fhp, fhsize, 1); if (NFS_CMPFH(dnp, fhp, fhsize)) { VREF(vp); newvp = vp; np = dnp; } else { error = nfs_nget(vp->v_mount, fhp, fhsize, &np); if (error) doit = 0; else newvp = NFSTOV(np); } } if (doit && bigenough) { dpossav2 = dpos; dpos = dpossav1; mdsav2 = md; md = mdsav1; nfsm_loadattr(newvp, (struct vattr *)0); dpos = dpossav2; md = mdsav2; dp->d_type = IFTODT(VTTOIF(np->n_vattr.va_type)); ndp->ni_vp = newvp; cnp->cn_hash = 0; for (cp = cnp->cn_nameptr, i = 1; i <= len; i++, cp++) cnp->cn_hash += (unsigned char)*cp; cache_enter(ndp->ni_dvp, ndp->ni_vp, cnp); } } else { /* Just skip over the file handle */ nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED); i = fxdr_unsigned(int, *tl); nfsm_adv(nfsm_rndup(i)); } if (newvp != NULLVP) { if (newvp == vp) vrele(newvp); else vput(newvp); newvp = NULLVP; } nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED); more_dirs = fxdr_unsigned(int, *tl); } /* * If at end of rpc data, get the eof boolean */ if (!more_dirs) { nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED); more_dirs = (fxdr_unsigned(int, *tl) == 0); } m_freem(mrep); } /* * Fill last record, iff any, out to a multiple of DIRBLKSIZ * by increasing d_reclen for the last record. */ if (blksiz > 0) { left = DIRBLKSIZ - blksiz; dp->d_reclen += left; uiop->uio_iov->iov_base += left; uiop->uio_iov->iov_len -= left; uiop->uio_offset += left; uiop->uio_resid -= left; } /* * We are now either at the end of the directory or have filled the * block. */ if (bigenough) dnp->n_direofoffset = uiop->uio_offset; else { if (uiop->uio_resid > 0) printf("EEK! readdirplusrpc resid > 0\n"); cookiep = nfs_getcookie(dnp, uiop->uio_offset, 1); *cookiep = cookie; } nfsmout: if (newvp != NULLVP) { if (newvp == vp) vrele(newvp); else vput(newvp); newvp = NULLVP; } return (error); } /* * Silly rename. To make the NFS filesystem that is stateless look a little * more like the "ufs" a remove of an active vnode is translated to a rename * to a funny looking filename that is removed by nfs_inactive on the * nfsnode. There is the potential for another process on a different client * to create the same funny name between the nfs_lookitup() fails and the * nfs_rename() completes, but... 
*/ static int nfs_sillyrename(dvp, vp, cnp) struct vnode *dvp, *vp; struct componentname *cnp; { register struct sillyrename *sp; struct nfsnode *np; int error; short pid; cache_purge(dvp); np = VTONFS(vp); #ifndef DIAGNOSTIC if (vp->v_type == VDIR) panic("nfs: sillyrename dir"); #endif MALLOC(sp, struct sillyrename *, sizeof (struct sillyrename), M_NFSREQ, M_WAITOK); sp->s_cred = crdup(cnp->cn_cred); sp->s_dvp = dvp; VREF(dvp); /* Fudge together a funny name */ pid = cnp->cn_proc->p_pid; sp->s_namlen = sprintf(sp->s_name, ".nfsA%04x4.4", pid); /* Try lookitups until we get one that isn't there */ while (nfs_lookitup(dvp, sp->s_name, sp->s_namlen, sp->s_cred, cnp->cn_proc, (struct nfsnode **)0) == 0) { sp->s_name[4]++; if (sp->s_name[4] > 'z') { error = EINVAL; goto bad; } } error = nfs_renameit(dvp, cnp, sp); if (error) goto bad; error = nfs_lookitup(dvp, sp->s_name, sp->s_namlen, sp->s_cred, cnp->cn_proc, &np); np->n_sillyrename = sp; return (0); bad: vrele(sp->s_dvp); crfree(sp->s_cred); free((caddr_t)sp, M_NFSREQ); return (error); } /* * Look up a file name and optionally either update the file handle or * allocate an nfsnode, depending on the value of npp. * npp == NULL --> just do the lookup * *npp == NULL --> allocate a new nfsnode and make sure attributes are * handled too * *npp != NULL --> update the file handle in the vnode */ static int nfs_lookitup(dvp, name, len, cred, procp, npp) register struct vnode *dvp; const char *name; int len; struct ucred *cred; struct proc *procp; struct nfsnode **npp; { register u_int32_t *tl; register caddr_t cp; register int32_t t1, t2; struct vnode *newvp = (struct vnode *)0; struct nfsnode *np, *dnp = VTONFS(dvp); caddr_t bpos, dpos, cp2; int error = 0, fhlen, attrflag; struct mbuf *mreq, *mrep, *md, *mb, *mb2; nfsfh_t *nfhp; int v3 = NFS_ISV3(dvp); nfsstats.rpccnt[NFSPROC_LOOKUP]++; nfsm_reqhead(dvp, NFSPROC_LOOKUP, NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(len)); nfsm_fhtom(dvp, v3); nfsm_strtom(name, len, NFS_MAXNAMLEN); nfsm_request(dvp, NFSPROC_LOOKUP, procp, cred); if (npp && !error) { nfsm_getfh(nfhp, fhlen, v3); if (*npp) { np = *npp; if (np->n_fhsize > NFS_SMALLFH && fhlen <= NFS_SMALLFH) { free((caddr_t)np->n_fhp, M_NFSBIGFH); np->n_fhp = &np->n_fh; } else if (np->n_fhsize <= NFS_SMALLFH && fhlen>NFS_SMALLFH) np->n_fhp =(nfsfh_t *)malloc(fhlen,M_NFSBIGFH,M_WAITOK); bcopy((caddr_t)nfhp, (caddr_t)np->n_fhp, fhlen); np->n_fhsize = fhlen; newvp = NFSTOV(np); } else if (NFS_CMPFH(dnp, nfhp, fhlen)) { VREF(dvp); newvp = dvp; } else { error = nfs_nget(dvp->v_mount, nfhp, fhlen, &np); if (error) { m_freem(mrep); return (error); } newvp = NFSTOV(np); } if (v3) { nfsm_postop_attr(newvp, attrflag); if (!attrflag && *npp == NULL) { m_freem(mrep); if (newvp == dvp) vrele(newvp); else vput(newvp); return (ENOENT); } } else nfsm_loadattr(newvp, (struct vattr *)0); } nfsm_reqdone; if (npp && *npp == NULL) { if (error) { if (newvp) { if (newvp == dvp) vrele(newvp); else vput(newvp); } } else *npp = np; } return (error); } /* * Nfs Version 3 commit rpc */ static int nfs_commit(vp, offset, cnt, cred, procp) register struct vnode *vp; u_quad_t offset; int cnt; struct ucred *cred; struct proc *procp; { register caddr_t cp; register u_int32_t *tl; register int32_t t1, t2; register struct nfsmount *nmp = VFSTONFS(vp->v_mount); caddr_t bpos, dpos, cp2; int error = 0, wccflag = NFSV3_WCCRATTR; struct mbuf *mreq, *mrep, *md, *mb, *mb2; if ((nmp->nm_state & NFSSTA_HASWRITEVERF) == 0) return (0); nfsstats.rpccnt[NFSPROC_COMMIT]++; nfsm_reqhead(vp, NFSPROC_COMMIT, 
NFSX_FH(1)); nfsm_fhtom(vp, 1); nfsm_build(tl, u_int32_t *, 3 * NFSX_UNSIGNED); txdr_hyper(offset, tl); tl += 2; *tl = txdr_unsigned(cnt); nfsm_request(vp, NFSPROC_COMMIT, procp, cred); nfsm_wcc_data(vp, wccflag); if (!error) { nfsm_dissect(tl, u_int32_t *, NFSX_V3WRITEVERF); if (bcmp((caddr_t)nmp->nm_verf, (caddr_t)tl, NFSX_V3WRITEVERF)) { bcopy((caddr_t)tl, (caddr_t)nmp->nm_verf, NFSX_V3WRITEVERF); error = NFSERR_STALEWRITEVERF; } } nfsm_reqdone; return (error); } /* * Kludge City.. * - make nfs_bmap() essentially a no-op that does no translation * - do nfs_strategy() by doing I/O with nfs_readrpc/nfs_writerpc * (Maybe I could use the process's page mapping, but I was concerned that * Kernel Write might not be enabled and also figured copyout() would do * a lot more work than bcopy() and also it currently happens in the * context of the swapper process (2). */ static int nfs_bmap(ap) struct vop_bmap_args /* { struct vnode *a_vp; daddr_t a_bn; struct vnode **a_vpp; daddr_t *a_bnp; int *a_runp; int *a_runb; } */ *ap; { register struct vnode *vp = ap->a_vp; if (ap->a_vpp != NULL) *ap->a_vpp = vp; if (ap->a_bnp != NULL) *ap->a_bnp = ap->a_bn * btodb(vp->v_mount->mnt_stat.f_iosize); if (ap->a_runp != NULL) *ap->a_runp = 0; if (ap->a_runb != NULL) *ap->a_runb = 0; return (0); } /* * Strategy routine. * For async requests when nfsiod(s) are running, queue the request by * calling nfs_asyncio(), otherwise just all nfs_doio() to do the * request. */ static int nfs_strategy(ap) struct vop_strategy_args *ap; { register struct buf *bp = ap->a_bp; struct ucred *cr; struct proc *p; int error = 0; KASSERT(!(bp->b_flags & B_DONE), ("nfs_strategy: buffer %p unexpectedly marked B_DONE", bp)); KASSERT(BUF_REFCNT(bp) > 0, ("nfs_strategy: buffer %p not locked", bp)); if (bp->b_flags & B_PHYS) panic("nfs physio"); if (bp->b_flags & B_ASYNC) p = (struct proc *)0; else p = curproc; /* XXX */ if (bp->b_flags & B_READ) cr = bp->b_rcred; else cr = bp->b_wcred; /* * If the op is asynchronous and an i/o daemon is waiting * queue the request, wake it up and wait for completion * otherwise just do it ourselves. */ if ((bp->b_flags & B_ASYNC) == 0 || nfs_asyncio(bp, NOCRED, p)) error = nfs_doio(bp, cr, p); return (error); } /* * Mmap a file * * NB Currently unsupported. */ /* ARGSUSED */ static int nfs_mmap(ap) struct vop_mmap_args /* { struct vnode *a_vp; int a_fflags; struct ucred *a_cred; struct proc *a_p; } */ *ap; { return (EINVAL); } /* * fsync vnode op. Just call nfs_flush() with commit == 1. */ /* ARGSUSED */ static int nfs_fsync(ap) struct vop_fsync_args /* { struct vnodeop_desc *a_desc; struct vnode * a_vp; struct ucred * a_cred; int a_waitfor; struct proc * a_p; } */ *ap; { return (nfs_flush(ap->a_vp, ap->a_cred, ap->a_waitfor, ap->a_p, 1)); } /* * Flush all the blocks associated with a vnode. * Walk through the buffer pool and push any dirty pages * associated with the vnode. 
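 * When "commit" is set on a V3 mount, buffers already written unstably
 * (B_DELWRI|B_NEEDCOMMIT) are batched into commit RPCs first; the
 * remaining dirty buffers are then written out, in two passes when a
 * commit is involved.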
*/ static int nfs_flush(vp, cred, waitfor, p, commit) register struct vnode *vp; struct ucred *cred; int waitfor; struct proc *p; int commit; { register struct nfsnode *np = VTONFS(vp); register struct buf *bp; register int i; struct buf *nbp; struct nfsmount *nmp = VFSTONFS(vp->v_mount); int s, error = 0, slptimeo = 0, slpflag = 0, retv, bvecpos; int passone = 1; u_quad_t off, endoff, toff; struct ucred* wcred = NULL; struct buf **bvec = NULL; #ifndef NFS_COMMITBVECSIZ #define NFS_COMMITBVECSIZ 20 #endif struct buf *bvec_on_stack[NFS_COMMITBVECSIZ]; int bvecsize = 0, bveccount; if (nmp->nm_flag & NFSMNT_INT) slpflag = PCATCH; if (!commit) passone = 0; /* * A b_flags == (B_DELWRI | B_NEEDCOMMIT) block has been written to the * server, but nas not been committed to stable storage on the server * yet. On the first pass, the byte range is worked out and the commit * rpc is done. On the second pass, nfs_writebp() is called to do the * job. */ again: off = (u_quad_t)-1; endoff = 0; bvecpos = 0; if (NFS_ISV3(vp) && commit) { s = splbio(); /* * Count up how many buffers waiting for a commit. */ bveccount = 0; for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { nbp = TAILQ_NEXT(bp, b_vnbufs); if (BUF_REFCNT(bp) == 0 && (bp->b_flags & (B_DELWRI | B_NEEDCOMMIT)) == (B_DELWRI | B_NEEDCOMMIT)) bveccount++; } /* * Allocate space to remember the list of bufs to commit. It is * important to use M_NOWAIT here to avoid a race with nfs_write. * If we can't get memory (for whatever reason), we will end up * committing the buffers one-by-one in the loop below. */ if (bveccount > NFS_COMMITBVECSIZ) { if (bvec != NULL && bvec != bvec_on_stack) free(bvec, M_TEMP); bvec = (struct buf **) malloc(bveccount * sizeof(struct buf *), M_TEMP, M_NOWAIT); if (bvec == NULL) { bvec = bvec_on_stack; bvecsize = NFS_COMMITBVECSIZ; } else bvecsize = bveccount; } else { bvec = bvec_on_stack; bvecsize = NFS_COMMITBVECSIZ; } for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { nbp = TAILQ_NEXT(bp, b_vnbufs); if (bvecpos >= bvecsize) break; if ((bp->b_flags & (B_DELWRI | B_NEEDCOMMIT)) != (B_DELWRI | B_NEEDCOMMIT) || BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) continue; bremfree(bp); /* * Work out if all buffers are using the same cred * so we can deal with them all with one commit. * * NOTE: we are not clearing B_DONE here, so we have * to do it later on in this routine if we intend to * initiate I/O on the bp. */ if (wcred == NULL) wcred = bp->b_wcred; else if (wcred != bp->b_wcred) wcred = NOCRED; bp->b_flags |= B_WRITEINPROG; vfs_busy_pages(bp, 1); /* * bp is protected by being locked, but nbp is not * and vfs_busy_pages() may sleep. We have to * recalculate nbp. */ nbp = TAILQ_NEXT(bp, b_vnbufs); /* * A list of these buffers is kept so that the * second loop knows which buffers have actually * been committed. This is necessary, since there * may be a race between the commit rpc and new * uncommitted writes on the file. */ bvec[bvecpos++] = bp; toff = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff; if (toff < off) off = toff; toff += (u_quad_t)(bp->b_dirtyend - bp->b_dirtyoff); if (toff > endoff) endoff = toff; } splx(s); } if (bvecpos > 0) { /* * Commit data on the server, as required. * If all bufs are using the same wcred, then use that with * one call for all of them, otherwise commit each one * separately. 
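 * A NFSERR_STALEWRITEVERF reply means the server has rebooted and lost
 * the unstable writes, so nfs_clearcommit() makes every uncommitted
 * buffer in the mount get written again.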
*/ if (wcred != NOCRED) retv = nfs_commit(vp, off, (int)(endoff - off), wcred, p); else { retv = 0; for (i = 0; i < bvecpos; i++) { off_t off, size; bp = bvec[i]; off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff; size = (u_quad_t)(bp->b_dirtyend - bp->b_dirtyoff); retv = nfs_commit(vp, off, (int)size, bp->b_wcred, p); if (retv) break; } } if (retv == NFSERR_STALEWRITEVERF) nfs_clearcommit(vp->v_mount); /* * Now, either mark the blocks I/O done or mark the * blocks dirty, depending on whether the commit * succeeded. */ for (i = 0; i < bvecpos; i++) { bp = bvec[i]; bp->b_flags &= ~(B_NEEDCOMMIT | B_WRITEINPROG); if (retv) { /* * Error, leave B_DELWRI intact */ vfs_unbusy_pages(bp); brelse(bp); } else { /* * Success, remove B_DELWRI ( bundirty() ). * * b_dirtyoff/b_dirtyend seem to be NFS * specific. We should probably move that * into bundirty(). XXX */ s = splbio(); vp->v_numoutput++; bp->b_flags |= B_ASYNC; bundirty(bp); bp->b_flags &= ~(B_READ|B_DONE|B_ERROR); bp->b_dirtyoff = bp->b_dirtyend = 0; splx(s); biodone(bp); } } } /* * Start/do any write(s) that are required. */ loop: s = splbio(); for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { nbp = TAILQ_NEXT(bp, b_vnbufs); if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { if (waitfor != MNT_WAIT || passone) continue; error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL, "nfsfsync", slpflag, slptimeo); splx(s); if (error == 0) panic("nfs_fsync: inconsistent lock"); if (error == ENOLCK) goto loop; if (nfs_sigintr(nmp, (struct nfsreq *)0, p)) { error = EINTR; goto done; } if (slpflag == PCATCH) { slpflag = 0; slptimeo = 2 * hz; } goto loop; } if ((bp->b_flags & B_DELWRI) == 0) panic("nfs_fsync: not dirty"); if ((passone || !commit) && (bp->b_flags & B_NEEDCOMMIT)) { BUF_UNLOCK(bp); continue; } bremfree(bp); if (passone || !commit) bp->b_flags |= B_ASYNC; else bp->b_flags |= (B_ASYNC | B_WRITEINPROG | B_NEEDCOMMIT); splx(s); VOP_BWRITE(bp->b_vp, bp); goto loop; } splx(s); if (passone) { passone = 0; goto again; } if (waitfor == MNT_WAIT) { while (vp->v_numoutput) { vp->v_flag |= VBWAIT; error = tsleep((caddr_t)&vp->v_numoutput, slpflag | (PRIBIO + 1), "nfsfsync", slptimeo); if (error) { if (nfs_sigintr(nmp, (struct nfsreq *)0, p)) { error = EINTR; goto done; } if (slpflag == PCATCH) { slpflag = 0; slptimeo = 2 * hz; } } } if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) && commit) { goto loop; } } if (np->n_flag & NWRITEERR) { error = np->n_error; np->n_flag &= ~NWRITEERR; } done: if (bvec != NULL && bvec != bvec_on_stack) free(bvec, M_TEMP); return (error); } /* * NFS advisory byte-level locks. * Currently unsupported. */ static int nfs_advlock(ap) struct vop_advlock_args /* { struct vnode *a_vp; caddr_t a_id; int a_op; struct flock *a_fl; int a_flags; } */ *ap; { register struct nfsnode *np = VTONFS(ap->a_vp); /* * The following kludge is to allow diskless support to work * until a real NFS lockd is implemented. Basically, just pretend * that this is a local lock. */ return (lf_advlock(ap, &(np->n_lockf), np->n_size)); } /* * Print out the contents of an nfsnode. */ static int nfs_print(ap) struct vop_print_args /* { struct vnode *a_vp; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct nfsnode *np = VTONFS(vp); printf("tag VT_NFS, fileid %ld fsid 0x%x", np->n_vattr.va_fileid, np->n_vattr.va_fsid); if (vp->v_type == VFIFO) fifo_printinfo(vp); printf("\n"); return (0); } /* * Just call nfs_writebp() with the force argument set to 1. * * NOTE: B_DONE may or may not be set in a_bp on call. 
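 * nfs_writebp() decides whether a commit RPC is sufficient or whether a
 * full write must be scheduled through VOP_STRATEGY().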
*/ static int nfs_bwrite(ap) struct vop_bwrite_args /* { struct vnode *a_bp; } */ *ap; { return (nfs_writebp(ap->a_bp, 1, curproc)); } /* * This is a clone of vn_bwrite(), except that B_WRITEINPROG isn't set unless * the force flag is one and it also handles the B_NEEDCOMMIT flag. We set * B_CACHE if this is a VMIO buffer. */ int nfs_writebp(bp, force, procp) register struct buf *bp; int force; struct proc *procp; { int s; int oldflags = bp->b_flags; int retv = 1; off_t off; if (BUF_REFCNT(bp) == 0) panic("bwrite: buffer is not locked???"); if (bp->b_flags & B_INVAL) { brelse(bp); return(0); } bp->b_flags |= B_CACHE; /* * Undirty the bp. We will redirty it later if the I/O fails. */ s = splbio(); bundirty(bp); bp->b_flags &= ~(B_READ|B_DONE|B_ERROR); bp->b_vp->v_numoutput++; curproc->p_stats->p_ru.ru_oublock++; splx(s); /* * If B_NEEDCOMMIT is set, a commit rpc may do the trick. If not * an actual write will have to be scheduled via. VOP_STRATEGY(). * If B_WRITEINPROG is already set, then push it with a write anyhow. */ vfs_busy_pages(bp, 1); if ((oldflags & (B_NEEDCOMMIT | B_WRITEINPROG)) == B_NEEDCOMMIT) { off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff; bp->b_flags |= B_WRITEINPROG; retv = nfs_commit(bp->b_vp, off, bp->b_dirtyend-bp->b_dirtyoff, bp->b_wcred, procp); bp->b_flags &= ~B_WRITEINPROG; if (!retv) { bp->b_dirtyoff = bp->b_dirtyend = 0; bp->b_flags &= ~B_NEEDCOMMIT; biodone(bp); } else if (retv == NFSERR_STALEWRITEVERF) { nfs_clearcommit(bp->b_vp->v_mount); } } if (retv) { if (force) bp->b_flags |= B_WRITEINPROG; BUF_KERNPROC(bp); VOP_STRATEGY(bp->b_vp, bp); } if( (oldflags & B_ASYNC) == 0) { int rtval = biowait(bp); if (oldflags & B_DELWRI) { s = splbio(); reassignbuf(bp, bp->b_vp); splx(s); } brelse(bp); return (rtval); } return (0); } /* * nfs special file access vnode op. * Essentially just get vattr and then imitate iaccess() since the device is * local to the client. */ static int nfsspec_access(ap) struct vop_access_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct proc *a_p; } */ *ap; { register struct vattr *vap; register gid_t *gp; register struct ucred *cred = ap->a_cred; struct vnode *vp = ap->a_vp; mode_t mode = ap->a_mode; struct vattr vattr; register int i; int error; /* * Disallow write attempts on filesystems mounted read-only; * unless the file is a socket, fifo, or a block or character * device resident on the filesystem. */ if ((mode & VWRITE) && (vp->v_mount->mnt_flag & MNT_RDONLY)) { switch (vp->v_type) { case VREG: case VDIR: case VLNK: return (EROFS); default: break; } } /* * If you're the super-user, * you always get access. */ if (cred->cr_uid == 0) return (0); vap = &vattr; error = VOP_GETATTR(vp, vap, cred, ap->a_p); if (error) return (error); /* * Access check is based on only one of owner, group, public. * If not owner, then check group. If not a member of the * group, then check public access. */ if (cred->cr_uid != vap->va_uid) { mode >>= 3; gp = cred->cr_groups; for (i = 0; i < cred->cr_ngroups; i++, gp++) if (vap->va_gid == *gp) goto found; mode >>= 3; found: ; } error = (vap->va_mode & mode) == mode ? 0 : EACCES; return (error); } /* * Read wrapper for special devices. */ static int nfsspec_read(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { register struct nfsnode *np = VTONFS(ap->a_vp); /* * Set access flag. 
*/ np->n_flag |= NACC; getnanotime(&np->n_atim); return (VOCALL(spec_vnodeop_p, VOFFSET(vop_read), ap)); } /* * Write wrapper for special devices. */ static int nfsspec_write(ap) struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { register struct nfsnode *np = VTONFS(ap->a_vp); /* * Set update flag. */ np->n_flag |= NUPD; getnanotime(&np->n_mtim); return (VOCALL(spec_vnodeop_p, VOFFSET(vop_write), ap)); } /* * Close wrapper for special devices. * * Update the times on the nfsnode then do device close. */ static int nfsspec_close(ap) struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct proc *a_p; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct nfsnode *np = VTONFS(vp); struct vattr vattr; if (np->n_flag & (NACC | NUPD)) { np->n_flag |= NCHG; if (vp->v_usecount == 1 && (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { VATTR_NULL(&vattr); if (np->n_flag & NACC) vattr.va_atime = np->n_atim; if (np->n_flag & NUPD) vattr.va_mtime = np->n_mtim; (void)VOP_SETATTR(vp, &vattr, ap->a_cred, ap->a_p); } } return (VOCALL(spec_vnodeop_p, VOFFSET(vop_close), ap)); } /* * Read wrapper for fifos. */ static int nfsfifo_read(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { register struct nfsnode *np = VTONFS(ap->a_vp); /* * Set access flag. */ np->n_flag |= NACC; getnanotime(&np->n_atim); return (VOCALL(fifo_vnodeop_p, VOFFSET(vop_read), ap)); } /* * Write wrapper for fifos. */ static int nfsfifo_write(ap) struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { register struct nfsnode *np = VTONFS(ap->a_vp); /* * Set update flag. */ np->n_flag |= NUPD; getnanotime(&np->n_mtim); return (VOCALL(fifo_vnodeop_p, VOFFSET(vop_write), ap)); } /* * Close wrapper for fifos. * * Update the times on the nfsnode then do fifo close. */ static int nfsfifo_close(ap) struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct proc *a_p; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct nfsnode *np = VTONFS(vp); struct vattr vattr; struct timespec ts; if (np->n_flag & (NACC | NUPD)) { getnanotime(&ts); if (np->n_flag & NACC) np->n_atim = ts; if (np->n_flag & NUPD) np->n_mtim = ts; np->n_flag |= NCHG; if (vp->v_usecount == 1 && (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { VATTR_NULL(&vattr); if (np->n_flag & NACC) vattr.va_atime = np->n_atim; if (np->n_flag & NUPD) vattr.va_mtime = np->n_mtim; (void)VOP_SETATTR(vp, &vattr, ap->a_cred, ap->a_p); } } return (VOCALL(fifo_vnodeop_p, VOFFSET(vop_close), ap)); } Index: head/sys/nfsclient/nfs_subs.c =================================================================== --- head/sys/nfsclient/nfs_subs.c (revision 49534) +++ head/sys/nfsclient/nfs_subs.c (revision 49535) @@ -1,2281 +1,2280 @@ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Rick Macklem at The University of Guelph. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)nfs_subs.c 8.8 (Berkeley) 5/22/95 - * $Id: nfs_subs.c,v 1.78 1999/06/27 11:44:19 peter Exp $ + * $Id: nfs_subs.c,v 1.79 1999/07/17 18:43:47 phk Exp $ */ /* * These functions support the macros and help fiddle mbuf chains for * the nfs op functions. They do things like create the rpc header and * copy data between mbuf chains and uio lists. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include - -#include #include #ifdef ISO #include #endif /* * Data items converted to xdr at startup, since they are constant * This is kinda hokey, but may save a little time doing byte swaps */ u_int32_t nfs_xdrneg1; u_int32_t rpc_call, rpc_vers, rpc_reply, rpc_msgdenied, rpc_autherr, rpc_mismatch, rpc_auth_unix, rpc_msgaccepted, rpc_auth_kerb; u_int32_t nfs_prog, nqnfs_prog, nfs_true, nfs_false; /* And other global data */ static u_int32_t nfs_xid = 0; static enum vtype nv2tov_type[8]= { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VNON, VNON }; enum vtype nv3tov_type[8]= { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO }; int nfs_ticks; int nfs_pbuf_freecnt = -1; /* start out unlimited */ struct nfs_reqq nfs_reqq; struct nfssvc_sockhead nfssvc_sockhead; int nfssvc_sockhead_flag; struct nfsd_head nfsd_head; int nfsd_head_flag; struct nfs_bufq nfs_bufq; struct nqtimerhead nqtimerhead; struct nqfhhashhead *nqfhhashtbl; u_long nqfhhash; static void (*nfs_prev_lease_updatetime) __P((int)); static int nfs_prev_nfssvc_sy_narg; static sy_call_t *nfs_prev_nfssvc_sy_call; #ifndef NFS_NOSERVER static vop_t *nfs_prev_vop_lease_check; static int nfs_prev_getfh_sy_narg; static sy_call_t *nfs_prev_getfh_sy_call; /* * Mapping of old NFS Version 2 RPC numbers to generic numbers. 
*/ int nfsv3_procid[NFS_NPROCS] = { NFSPROC_NULL, NFSPROC_GETATTR, NFSPROC_SETATTR, NFSPROC_NOOP, NFSPROC_LOOKUP, NFSPROC_READLINK, NFSPROC_READ, NFSPROC_NOOP, NFSPROC_WRITE, NFSPROC_CREATE, NFSPROC_REMOVE, NFSPROC_RENAME, NFSPROC_LINK, NFSPROC_SYMLINK, NFSPROC_MKDIR, NFSPROC_RMDIR, NFSPROC_READDIR, NFSPROC_FSSTAT, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP }; #endif /* NFS_NOSERVER */ /* * and the reverse mapping from generic to Version 2 procedure numbers */ int nfsv2_procid[NFS_NPROCS] = { NFSV2PROC_NULL, NFSV2PROC_GETATTR, NFSV2PROC_SETATTR, NFSV2PROC_LOOKUP, NFSV2PROC_NOOP, NFSV2PROC_READLINK, NFSV2PROC_READ, NFSV2PROC_WRITE, NFSV2PROC_CREATE, NFSV2PROC_MKDIR, NFSV2PROC_SYMLINK, NFSV2PROC_CREATE, NFSV2PROC_REMOVE, NFSV2PROC_RMDIR, NFSV2PROC_RENAME, NFSV2PROC_LINK, NFSV2PROC_READDIR, NFSV2PROC_NOOP, NFSV2PROC_STATFS, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, }; #ifndef NFS_NOSERVER /* * Maps errno values to nfs error numbers. * Use NFSERR_IO as the catch all for ones not specifically defined in * RFC 1094. */ static u_char nfsrv_v2errmap[ELAST] = { NFSERR_PERM, NFSERR_NOENT, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_NXIO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_EXIST, NFSERR_IO, NFSERR_NODEV, NFSERR_NOTDIR, NFSERR_ISDIR, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_FBIG, NFSERR_NOSPC, NFSERR_IO, NFSERR_ROFS, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_NAMETOL, NFSERR_IO, NFSERR_IO, NFSERR_NOTEMPTY, NFSERR_IO, NFSERR_IO, NFSERR_DQUOT, NFSERR_STALE, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO /* << Last is 86 */ }; /* * Maps errno values to nfs error numbers. * Although it is not obvious whether or not NFS clients really care if * a returned error value is in the specified list for the procedure, the * safest thing to do is filter them appropriately. For Version 2, the * X/Open XNFS document is the only specification that defines error values * for each RPC (The RFC simply lists all possible error values for all RPCs), * so I have decided to not do this for Version 2. * The first entry is the default error return and the rest are the valid * errors for that RPC in increasing numeric order. 
*/ static short nfsv3err_null[] = { 0, 0, }; static short nfsv3err_getattr[] = { NFSERR_IO, NFSERR_IO, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_setattr[] = { NFSERR_IO, NFSERR_PERM, NFSERR_IO, NFSERR_ACCES, NFSERR_INVAL, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOT_SYNC, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_lookup[] = { NFSERR_IO, NFSERR_NOENT, NFSERR_IO, NFSERR_ACCES, NFSERR_NOTDIR, NFSERR_NAMETOL, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_access[] = { NFSERR_IO, NFSERR_IO, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_readlink[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_INVAL, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_read[] = { NFSERR_IO, NFSERR_IO, NFSERR_NXIO, NFSERR_ACCES, NFSERR_INVAL, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_write[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_INVAL, NFSERR_FBIG, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_create[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_mkdir[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_symlink[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_mknod[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, NFSERR_BADTYPE, 0, }; static short nfsv3err_remove[] = { NFSERR_IO, NFSERR_NOENT, NFSERR_IO, NFSERR_ACCES, NFSERR_NOTDIR, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_rmdir[] = { NFSERR_IO, NFSERR_NOENT, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_INVAL, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_NOTEMPTY, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_rename[] = { NFSERR_IO, NFSERR_NOENT, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_XDEV, NFSERR_NOTDIR, NFSERR_ISDIR, NFSERR_INVAL, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_MLINK, NFSERR_NAMETOL, NFSERR_NOTEMPTY, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_link[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_XDEV, NFSERR_NOTDIR, NFSERR_INVAL, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_MLINK, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_readdir[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_NOTDIR, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_BAD_COOKIE, NFSERR_TOOSMALL, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_readdirplus[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_NOTDIR, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_BAD_COOKIE, NFSERR_NOTSUPP, NFSERR_TOOSMALL, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_fsstat[] = { 
NFSERR_IO, NFSERR_IO, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_fsinfo[] = { NFSERR_STALE, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_pathconf[] = { NFSERR_STALE, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_commit[] = { NFSERR_IO, NFSERR_IO, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short *nfsrv_v3errmap[] = { nfsv3err_null, nfsv3err_getattr, nfsv3err_setattr, nfsv3err_lookup, nfsv3err_access, nfsv3err_readlink, nfsv3err_read, nfsv3err_write, nfsv3err_create, nfsv3err_mkdir, nfsv3err_symlink, nfsv3err_mknod, nfsv3err_remove, nfsv3err_rmdir, nfsv3err_rename, nfsv3err_link, nfsv3err_readdir, nfsv3err_readdirplus, nfsv3err_fsstat, nfsv3err_fsinfo, nfsv3err_pathconf, nfsv3err_commit, }; #endif /* NFS_NOSERVER */ extern struct nfsrtt nfsrtt; extern time_t nqnfsstarttime; extern int nqsrv_clockskew; extern int nqsrv_writeslack; extern int nqsrv_maxlease; extern struct nfsstats nfsstats; extern int nqnfs_piggy[NFS_NPROCS]; extern nfstype nfsv2_type[9]; extern nfstype nfsv3_type[9]; extern struct nfsnodehashhead *nfsnodehashtbl; extern u_long nfsnodehash; struct getfh_args; extern int getfh(struct proc *, struct getfh_args *, int *); struct nfssvc_args; extern int nfssvc(struct proc *, struct nfssvc_args *, int *); LIST_HEAD(nfsnodehashhead, nfsnode); int nfs_webnamei __P((struct nameidata *, struct vnode *, struct proc *)); u_quad_t nfs_curusec() { struct timeval tv; getmicrotime(&tv); return ((u_quad_t)tv.tv_sec * 1000000 + (u_quad_t)tv.tv_usec); } /* * Create the header for an rpc request packet * The hsiz is the size of the rest of the nfs request header. * (just used to decide if a cluster is a good idea) */ struct mbuf * nfsm_reqh(vp, procid, hsiz, bposp) struct vnode *vp; u_long procid; int hsiz; caddr_t *bposp; { register struct mbuf *mb; register u_int32_t *tl; register caddr_t bpos; struct mbuf *mb2; struct nfsmount *nmp; int nqflag; MGET(mb, M_WAIT, MT_DATA); if (hsiz >= MINCLSIZE) MCLGET(mb, M_WAIT); mb->m_len = 0; bpos = mtod(mb, caddr_t); /* * For NQNFS, add lease request. */ if (vp) { nmp = VFSTONFS(vp->v_mount); if (nmp->nm_flag & NFSMNT_NQNFS) { nqflag = NQNFS_NEEDLEASE(vp, procid); if (nqflag) { nfsm_build(tl, u_int32_t *, 2*NFSX_UNSIGNED); *tl++ = txdr_unsigned(nqflag); *tl = txdr_unsigned(nmp->nm_leaseterm); } else { nfsm_build(tl, u_int32_t *, NFSX_UNSIGNED); *tl = 0; } } } /* Finally, return values */ *bposp = bpos; return (mb); } /* * Build the RPC header and fill in the authorization info. * The authorization string argument is only used when the credentials * come from outside of the kernel. * Returns the head of the mbuf list. */ struct mbuf * nfsm_rpchead(cr, nmflag, procid, auth_type, auth_len, auth_str, verf_len, verf_str, mrest, mrest_len, mbp, xidp) register struct ucred *cr; int nmflag; int procid; int auth_type; int auth_len; char *auth_str; int verf_len; char *verf_str; struct mbuf *mrest; int mrest_len; struct mbuf **mbp; u_int32_t *xidp; { register struct mbuf *mb; register u_int32_t *tl; register caddr_t bpos; register int i; struct mbuf *mreq, *mb2; int siz, grpsiz, authsiz; authsiz = nfsm_rndup(auth_len); MGETHDR(mb, M_WAIT, MT_DATA); if ((authsiz + 10 * NFSX_UNSIGNED) >= MINCLSIZE) { MCLGET(mb, M_WAIT); } else if ((authsiz + 10 * NFSX_UNSIGNED) < MHLEN) { MH_ALIGN(mb, authsiz + 10 * NFSX_UNSIGNED); } else { MH_ALIGN(mb, 8 * NFSX_UNSIGNED); } mb->m_len = 0; mreq = mb; bpos = mtod(mb, caddr_t); /* * First the RPC header. 
*/ nfsm_build(tl, u_int32_t *, 8 * NFSX_UNSIGNED); /* Get a pretty random xid to start with */ if (!nfs_xid) nfs_xid = random(); /* * Skip zero xid if it should ever happen. */ if (++nfs_xid == 0) nfs_xid++; *tl++ = *xidp = txdr_unsigned(nfs_xid); *tl++ = rpc_call; *tl++ = rpc_vers; if (nmflag & NFSMNT_NQNFS) { *tl++ = txdr_unsigned(NQNFS_PROG); *tl++ = txdr_unsigned(NQNFS_VER3); } else { *tl++ = txdr_unsigned(NFS_PROG); if (nmflag & NFSMNT_NFSV3) *tl++ = txdr_unsigned(NFS_VER3); else *tl++ = txdr_unsigned(NFS_VER2); } if (nmflag & NFSMNT_NFSV3) *tl++ = txdr_unsigned(procid); else *tl++ = txdr_unsigned(nfsv2_procid[procid]); /* * And then the authorization cred. */ *tl++ = txdr_unsigned(auth_type); *tl = txdr_unsigned(authsiz); switch (auth_type) { case RPCAUTH_UNIX: nfsm_build(tl, u_int32_t *, auth_len); *tl++ = 0; /* stamp ?? */ *tl++ = 0; /* NULL hostname */ *tl++ = txdr_unsigned(cr->cr_uid); *tl++ = txdr_unsigned(cr->cr_groups[0]); grpsiz = (auth_len >> 2) - 5; *tl++ = txdr_unsigned(grpsiz); for (i = 1; i <= grpsiz; i++) *tl++ = txdr_unsigned(cr->cr_groups[i]); break; case RPCAUTH_KERB4: siz = auth_len; while (siz > 0) { if (M_TRAILINGSPACE(mb) == 0) { MGET(mb2, M_WAIT, MT_DATA); if (siz >= MINCLSIZE) MCLGET(mb2, M_WAIT); mb->m_next = mb2; mb = mb2; mb->m_len = 0; bpos = mtod(mb, caddr_t); } i = min(siz, M_TRAILINGSPACE(mb)); bcopy(auth_str, bpos, i); mb->m_len += i; auth_str += i; bpos += i; siz -= i; } if ((siz = (nfsm_rndup(auth_len) - auth_len)) > 0) { for (i = 0; i < siz; i++) *bpos++ = '\0'; mb->m_len += siz; } break; }; /* * And the verifier... */ nfsm_build(tl, u_int32_t *, 2 * NFSX_UNSIGNED); if (verf_str) { *tl++ = txdr_unsigned(RPCAUTH_KERB4); *tl = txdr_unsigned(verf_len); siz = verf_len; while (siz > 0) { if (M_TRAILINGSPACE(mb) == 0) { MGET(mb2, M_WAIT, MT_DATA); if (siz >= MINCLSIZE) MCLGET(mb2, M_WAIT); mb->m_next = mb2; mb = mb2; mb->m_len = 0; bpos = mtod(mb, caddr_t); } i = min(siz, M_TRAILINGSPACE(mb)); bcopy(verf_str, bpos, i); mb->m_len += i; verf_str += i; bpos += i; siz -= i; } if ((siz = (nfsm_rndup(verf_len) - verf_len)) > 0) { for (i = 0; i < siz; i++) *bpos++ = '\0'; mb->m_len += siz; } } else { *tl++ = txdr_unsigned(RPCAUTH_NULL); *tl = 0; } mb->m_next = mrest; mreq->m_pkthdr.len = authsiz + 10 * NFSX_UNSIGNED + mrest_len; mreq->m_pkthdr.rcvif = (struct ifnet *)0; *mbp = mb; return (mreq); } /* * copies mbuf chain to the uio scatter/gather list */ int nfsm_mbuftouio(mrep, uiop, siz, dpos) struct mbuf **mrep; register struct uio *uiop; int siz; caddr_t *dpos; { register char *mbufcp, *uiocp; register int xfer, left, len; register struct mbuf *mp; long uiosiz, rem; int error = 0; mp = *mrep; mbufcp = *dpos; len = mtod(mp, caddr_t)+mp->m_len-mbufcp; rem = nfsm_rndup(siz)-siz; while (siz > 0) { if (uiop->uio_iovcnt <= 0 || uiop->uio_iov == NULL) return (EFBIG); left = uiop->uio_iov->iov_len; uiocp = uiop->uio_iov->iov_base; if (left > siz) left = siz; uiosiz = left; while (left > 0) { while (len == 0) { mp = mp->m_next; if (mp == NULL) return (EBADRPC); mbufcp = mtod(mp, caddr_t); len = mp->m_len; } xfer = (left > len) ? len : left; #ifdef notdef /* Not Yet.. 
*/ if (uiop->uio_iov->iov_op != NULL) (*(uiop->uio_iov->iov_op)) (mbufcp, uiocp, xfer); else #endif if (uiop->uio_segflg == UIO_SYSSPACE) bcopy(mbufcp, uiocp, xfer); else copyout(mbufcp, uiocp, xfer); left -= xfer; len -= xfer; mbufcp += xfer; uiocp += xfer; uiop->uio_offset += xfer; uiop->uio_resid -= xfer; } if (uiop->uio_iov->iov_len <= siz) { uiop->uio_iovcnt--; uiop->uio_iov++; } else { uiop->uio_iov->iov_base += uiosiz; uiop->uio_iov->iov_len -= uiosiz; } siz -= uiosiz; } *dpos = mbufcp; *mrep = mp; if (rem > 0) { if (len < rem) error = nfs_adv(mrep, dpos, rem, len); else *dpos += rem; } return (error); } /* * copies a uio scatter/gather list to an mbuf chain. * NOTE: can only handle iovcnt == 1 */ int nfsm_uiotombuf(uiop, mq, siz, bpos) register struct uio *uiop; struct mbuf **mq; int siz; caddr_t *bpos; { register char *uiocp; register struct mbuf *mp, *mp2; register int xfer, left, mlen; int uiosiz, clflg, rem; char *cp; #ifdef DIAGNOSTIC if (uiop->uio_iovcnt != 1) panic("nfsm_uiotombuf: iovcnt != 1"); #endif if (siz > MLEN) /* or should it >= MCLBYTES ?? */ clflg = 1; else clflg = 0; rem = nfsm_rndup(siz)-siz; mp = mp2 = *mq; while (siz > 0) { left = uiop->uio_iov->iov_len; uiocp = uiop->uio_iov->iov_base; if (left > siz) left = siz; uiosiz = left; while (left > 0) { mlen = M_TRAILINGSPACE(mp); if (mlen == 0) { MGET(mp, M_WAIT, MT_DATA); if (clflg) MCLGET(mp, M_WAIT); mp->m_len = 0; mp2->m_next = mp; mp2 = mp; mlen = M_TRAILINGSPACE(mp); } xfer = (left > mlen) ? mlen : left; #ifdef notdef /* Not Yet.. */ if (uiop->uio_iov->iov_op != NULL) (*(uiop->uio_iov->iov_op)) (uiocp, mtod(mp, caddr_t)+mp->m_len, xfer); else #endif if (uiop->uio_segflg == UIO_SYSSPACE) bcopy(uiocp, mtod(mp, caddr_t)+mp->m_len, xfer); else copyin(uiocp, mtod(mp, caddr_t)+mp->m_len, xfer); mp->m_len += xfer; left -= xfer; uiocp += xfer; uiop->uio_offset += xfer; uiop->uio_resid -= xfer; } uiop->uio_iov->iov_base += uiosiz; uiop->uio_iov->iov_len -= uiosiz; siz -= uiosiz; } if (rem > 0) { if (rem > M_TRAILINGSPACE(mp)) { MGET(mp, M_WAIT, MT_DATA); mp->m_len = 0; mp2->m_next = mp; } cp = mtod(mp, caddr_t)+mp->m_len; for (left = 0; left < rem; left++) *cp++ = '\0'; mp->m_len += rem; *bpos = cp; } else *bpos = mtod(mp, caddr_t)+mp->m_len; *mq = mp; return (0); } /* * Help break down an mbuf chain by setting the first siz bytes contiguous * pointed to by returned val. * This is used by the macros nfsm_dissect and nfsm_dissecton for tough * cases. (The macros use the vars. dpos and dpos2) */ int nfsm_disct(mdp, dposp, siz, left, cp2) struct mbuf **mdp; caddr_t *dposp; int siz; int left; caddr_t *cp2; { register struct mbuf *mp, *mp2; register int siz2, xfer; register caddr_t p; mp = *mdp; while (left == 0) { *mdp = mp = mp->m_next; if (mp == NULL) return (EBADRPC); left = mp->m_len; *dposp = mtod(mp, caddr_t); } if (left >= siz) { *cp2 = *dposp; *dposp += siz; } else if (mp->m_next == NULL) { return (EBADRPC); } else if (siz > MHLEN) { panic("nfs S too big"); } else { MGET(mp2, M_WAIT, MT_DATA); mp2->m_next = mp->m_next; mp->m_next = mp2; mp->m_len -= left; mp = mp2; *cp2 = p = mtod(mp, caddr_t); bcopy(*dposp, p, left); /* Copy what was left */ siz2 = siz-left; p += left; mp2 = mp->m_next; /* Loop around copying up the siz2 bytes */ while (siz2 > 0) { if (mp2 == NULL) return (EBADRPC); xfer = (siz2 > mp2->m_len) ?
mp2->m_len : siz2; if (xfer > 0) { bcopy(mtod(mp2, caddr_t), p, xfer); NFSMADV(mp2, xfer); mp2->m_len -= xfer; p += xfer; siz2 -= xfer; } if (siz2 > 0) mp2 = mp2->m_next; } mp->m_len = siz; *mdp = mp2; *dposp = mtod(mp2, caddr_t); } return (0); } /* * Advance the position in the mbuf chain. */ int nfs_adv(mdp, dposp, offs, left) struct mbuf **mdp; caddr_t *dposp; int offs; int left; { register struct mbuf *m; register int s; m = *mdp; s = left; while (s < offs) { offs -= s; m = m->m_next; if (m == NULL) return (EBADRPC); s = m->m_len; } *mdp = m; *dposp = mtod(m, caddr_t)+offs; return (0); } /* * Copy a string into mbufs for the hard cases... */ int nfsm_strtmbuf(mb, bpos, cp, siz) struct mbuf **mb; char **bpos; const char *cp; long siz; { register struct mbuf *m1 = NULL, *m2; long left, xfer, len, tlen; u_int32_t *tl; int putsize; putsize = 1; m2 = *mb; left = M_TRAILINGSPACE(m2); if (left > 0) { tl = ((u_int32_t *)(*bpos)); *tl++ = txdr_unsigned(siz); putsize = 0; left -= NFSX_UNSIGNED; m2->m_len += NFSX_UNSIGNED; if (left > 0) { bcopy(cp, (caddr_t) tl, left); siz -= left; cp += left; m2->m_len += left; left = 0; } } /* Loop around adding mbufs */ while (siz > 0) { MGET(m1, M_WAIT, MT_DATA); if (siz > MLEN) MCLGET(m1, M_WAIT); m1->m_len = NFSMSIZ(m1); m2->m_next = m1; m2 = m1; tl = mtod(m1, u_int32_t *); tlen = 0; if (putsize) { *tl++ = txdr_unsigned(siz); m1->m_len -= NFSX_UNSIGNED; tlen = NFSX_UNSIGNED; putsize = 0; } if (siz < m1->m_len) { len = nfsm_rndup(siz); xfer = siz; if (xfer < len) *(tl+(xfer>>2)) = 0; } else { xfer = len = m1->m_len; } bcopy(cp, (caddr_t) tl, xfer); m1->m_len = len+tlen; siz -= xfer; cp += xfer; } *mb = m1; *bpos = mtod(m1, caddr_t)+m1->m_len; return (0); } /* * Called once to initialize data structures... */ int nfs_init(vfsp) struct vfsconf *vfsp; { register int i; nfsmount_zone = zinit("NFSMOUNT", sizeof(struct nfsmount), 0, 0, 1); /* * Check to see if major data structures haven't bloated. */ if (sizeof (struct nfssvc_sock) > NFS_SVCALLOC) { printf("struct nfssvc_sock bloated (> %dbytes)\n",NFS_SVCALLOC); printf("Try reducing NFS_UIDHASHSIZ\n"); } if (sizeof (struct nfsuid) > NFS_UIDALLOC) { printf("struct nfsuid bloated (> %dbytes)\n",NFS_UIDALLOC); printf("Try unionizing the nu_nickname and nu_flag fields\n"); } nfs_mount_type = vfsp->vfc_typenum; nfsrtt.pos = 0; rpc_vers = txdr_unsigned(RPC_VER2); rpc_call = txdr_unsigned(RPC_CALL); rpc_reply = txdr_unsigned(RPC_REPLY); rpc_msgdenied = txdr_unsigned(RPC_MSGDENIED); rpc_msgaccepted = txdr_unsigned(RPC_MSGACCEPTED); rpc_mismatch = txdr_unsigned(RPC_MISMATCH); rpc_autherr = txdr_unsigned(RPC_AUTHERR); rpc_auth_unix = txdr_unsigned(RPCAUTH_UNIX); rpc_auth_kerb = txdr_unsigned(RPCAUTH_KERB4); nfs_prog = txdr_unsigned(NFS_PROG); nqnfs_prog = txdr_unsigned(NQNFS_PROG); nfs_true = txdr_unsigned(TRUE); nfs_false = txdr_unsigned(FALSE); nfs_xdrneg1 = txdr_unsigned(-1); nfs_ticks = (hz * NFS_TICKINTVL + 500) / 1000; if (nfs_ticks < 1) nfs_ticks = 1; /* Ensure async daemons disabled */ for (i = 0; i < NFS_MAXASYNCDAEMON; i++) { nfs_iodwant[i] = (struct proc *)0; nfs_iodmount[i] = (struct nfsmount *)0; } nfs_nhinit(); /* Init the nfsnode table */ #ifndef NFS_NOSERVER nfsrv_init(0); /* Init server data structures */ nfsrv_initcache(); /* Init the server request cache */ #endif /* * Initialize the nqnfs server stuff. 
*/ if (nqnfsstarttime == 0) { nqnfsstarttime = boottime.tv_sec + nqsrv_maxlease + nqsrv_clockskew + nqsrv_writeslack; NQLOADNOVRAM(nqnfsstarttime); CIRCLEQ_INIT(&nqtimerhead); nqfhhashtbl = hashinit(NQLCHSZ, M_NQLEASE, &nqfhhash); } /* * Initialize reply list and start timer */ TAILQ_INIT(&nfs_reqq); nfs_timer(0); /* * Set up lease_check and lease_updatetime so that other parts * of the system can call us, if we are loadable. */ #ifndef NFS_NOSERVER nfs_prev_vop_lease_check = default_vnodeop_p[VOFFSET(vop_lease)]; default_vnodeop_p[VOFFSET(vop_lease)] = (vop_t *)nqnfs_vop_lease_check; #endif nfs_prev_lease_updatetime = lease_updatetime; lease_updatetime = nfs_lease_updatetime; nfs_prev_nfssvc_sy_narg = sysent[SYS_nfssvc].sy_narg; sysent[SYS_nfssvc].sy_narg = 2; nfs_prev_nfssvc_sy_call = sysent[SYS_nfssvc].sy_call; sysent[SYS_nfssvc].sy_call = (sy_call_t *)nfssvc; #ifndef NFS_NOSERVER nfs_prev_getfh_sy_narg = sysent[SYS_getfh].sy_narg; sysent[SYS_getfh].sy_narg = 2; nfs_prev_getfh_sy_call = sysent[SYS_getfh].sy_call; sysent[SYS_getfh].sy_call = (sy_call_t *)getfh; #endif nfs_pbuf_freecnt = nswbuf / 2 + 1; return (0); } int nfs_uninit(vfsp) struct vfsconf *vfsp; { untimeout(nfs_timer, (void *)NULL, nfs_timer_handle); nfs_mount_type = -1; #ifndef NFS_NOSERVER default_vnodeop_p[VOFFSET(vop_lease)] = nfs_prev_vop_lease_check; #endif lease_updatetime = nfs_prev_lease_updatetime; sysent[SYS_nfssvc].sy_narg = nfs_prev_nfssvc_sy_narg; sysent[SYS_nfssvc].sy_call = nfs_prev_nfssvc_sy_call; #ifndef NFS_NOSERVER sysent[SYS_getfh].sy_narg = nfs_prev_getfh_sy_narg; sysent[SYS_getfh].sy_call = nfs_prev_getfh_sy_call; #endif return (0); } /* * Attribute cache routines. * nfs_loadattrcache() - loads or updates the cache contents from attributes * that are on the mbuf list * nfs_getattrcache() - returns valid attributes if found in cache, returns * error otherwise */ /* * Load the attribute cache (that lives in the nfsnode entry) with * the values on the mbuf list and * Iff vap not NULL * copy the attributes to *vaper */ int nfs_loadattrcache(vpp, mdp, dposp, vaper) struct vnode **vpp; struct mbuf **mdp; caddr_t *dposp; struct vattr *vaper; { register struct vnode *vp = *vpp; register struct vattr *vap; register struct nfs_fattr *fp; register struct nfsnode *np; register int32_t t1; caddr_t cp2; int error = 0, rdev; struct mbuf *md; enum vtype vtyp; u_short vmode; struct timespec mtime; struct vnode *nvp; int v3 = NFS_ISV3(vp); md = *mdp; t1 = (mtod(md, caddr_t) + md->m_len) - *dposp; if ((error = nfsm_disct(mdp, dposp, NFSX_FATTR(v3), t1, &cp2)) != 0) return (error); fp = (struct nfs_fattr *)cp2; if (v3) { vtyp = nfsv3tov_type(fp->fa_type); vmode = fxdr_unsigned(u_short, fp->fa_mode); rdev = makeudev(fxdr_unsigned(int, fp->fa3_rdev.specdata1), fxdr_unsigned(int, fp->fa3_rdev.specdata2)); fxdr_nfsv3time(&fp->fa3_mtime, &mtime); } else { vtyp = nfsv2tov_type(fp->fa_type); vmode = fxdr_unsigned(u_short, fp->fa_mode); /* * XXX * * The duplicate information returned in fa_type and fa_mode * is an ambiguity in the NFS version 2 protocol. * * VREG should be taken literally as a regular file. If a * server intends to return some type information differently * in the upper bits of the mode field (e.g. for sockets, or * FIFOs), NFSv2 mandates fa_type to be VNON. Anyway, we * leave the examination of the mode bits even in the VREG * case to avoid breakage for bogus servers, but we make sure * that there are actually type bits set in the upper part of * fa_mode (and failing that, trust the va_type field).
* * NFSv3 cleared the issue, and requires fa_mode to not * contain any type information (while also introducing sockets * and FIFOs for fa_type). */ if (vtyp == VNON || (vtyp == VREG && (vmode & S_IFMT) != 0)) vtyp = IFTOVT(vmode); rdev = fxdr_unsigned(int32_t, fp->fa2_rdev); fxdr_nfsv2time(&fp->fa2_mtime, &mtime); /* * Really ugly NFSv2 kludge. */ if (vtyp == VCHR && rdev == 0xffffffff) vtyp = VFIFO; } /* * If v_type == VNON it is a new node, so fill in the v_type, * n_mtime fields. Check to see if it represents a special * device, and if so, check for a possible alias. Once the * correct vnode has been obtained, fill in the rest of the * information. */ np = VTONFS(vp); if (vp->v_type != vtyp) { vp->v_type = vtyp; if (vp->v_type == VFIFO) { vp->v_op = fifo_nfsv2nodeop_p; } if (vp->v_type == VCHR || vp->v_type == VBLK) { vp->v_op = spec_nfsv2nodeop_p; nvp = checkalias(vp, rdev, vp->v_mount); if (nvp) { /* * Discard unneeded vnode, but save its nfsnode. * Since the nfsnode does not have a lock, its * vnode lock has to be carried over. */ nvp->v_vnlock = vp->v_vnlock; vp->v_vnlock = NULL; nvp->v_data = vp->v_data; vp->v_data = NULL; vp->v_op = spec_vnodeop_p; vrele(vp); vgone(vp); /* * Reinitialize aliased node. */ np->n_vnode = nvp; *vpp = vp = nvp; } } np->n_mtime = mtime.tv_sec; } vap = &np->n_vattr; vap->va_type = vtyp; vap->va_mode = (vmode & 07777); vap->va_rdev = rdev; vap->va_mtime = mtime; vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; if (v3) { vap->va_nlink = fxdr_unsigned(u_short, fp->fa_nlink); vap->va_uid = fxdr_unsigned(uid_t, fp->fa_uid); vap->va_gid = fxdr_unsigned(gid_t, fp->fa_gid); vap->va_size = fxdr_hyper(&fp->fa3_size); vap->va_blocksize = NFS_FABLKSIZE; vap->va_bytes = fxdr_hyper(&fp->fa3_used); vap->va_fileid = fxdr_unsigned(int32_t, fp->fa3_fileid.nfsuquad[1]); fxdr_nfsv3time(&fp->fa3_atime, &vap->va_atime); fxdr_nfsv3time(&fp->fa3_ctime, &vap->va_ctime); vap->va_flags = 0; vap->va_filerev = 0; } else { vap->va_nlink = fxdr_unsigned(u_short, fp->fa_nlink); vap->va_uid = fxdr_unsigned(uid_t, fp->fa_uid); vap->va_gid = fxdr_unsigned(gid_t, fp->fa_gid); vap->va_size = fxdr_unsigned(u_int32_t, fp->fa2_size); vap->va_blocksize = fxdr_unsigned(int32_t, fp->fa2_blocksize); vap->va_bytes = (u_quad_t)fxdr_unsigned(int32_t, fp->fa2_blocks) * NFS_FABLKSIZE; vap->va_fileid = fxdr_unsigned(int32_t, fp->fa2_fileid); fxdr_nfsv2time(&fp->fa2_atime, &vap->va_atime); vap->va_flags = 0; vap->va_ctime.tv_sec = fxdr_unsigned(u_int32_t, fp->fa2_ctime.nfsv2_sec); vap->va_ctime.tv_nsec = 0; vap->va_gen = fxdr_unsigned(u_int32_t,fp->fa2_ctime.nfsv2_usec); vap->va_filerev = 0; } if (vap->va_size != np->n_size) { if (vap->va_type == VREG) { if (np->n_flag & NMODIFIED) { if (vap->va_size < np->n_size) vap->va_size = np->n_size; else np->n_size = vap->va_size; } else np->n_size = vap->va_size; vnode_pager_setsize(vp, np->n_size); } else np->n_size = vap->va_size; } np->n_attrstamp = time_second; if (vaper != NULL) { bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(*vap)); if (np->n_flag & NCHG) { if (np->n_flag & NACC) vaper->va_atime = np->n_atim; if (np->n_flag & NUPD) vaper->va_mtime = np->n_mtim; } } return (0); } #ifdef NFS_ACDEBUG #include SYSCTL_DECL(_vfs_nfs); static int nfs_acdebug; SYSCTL_INT(_vfs_nfs, OID_AUTO, acdebug, CTLFLAG_RW, &nfs_acdebug, 0, ""); #endif /* * Check the time stamp * If the cache is valid, copy contents to *vap and return 0 * otherwise return an error */ int nfs_getattrcache(vp, vaper) register struct vnode *vp; struct vattr *vaper; { register struct nfsnode *np;
register struct vattr *vap; struct nfsmount *nmp; int timeo; np = VTONFS(vp); vap = &np->n_vattr; nmp = VFSTONFS(vp->v_mount); /* XXX n_mtime doesn't seem to be updated on a miss-and-reload */ timeo = (time_second - np->n_mtime) / 10; #ifdef NFS_ACDEBUG if (nfs_acdebug>1) printf("nfs_getattrcache: initial timeo = %d\n", timeo); #endif if (vap->va_type == VDIR) { if ((np->n_flag & NMODIFIED) || timeo < nmp->nm_acdirmin) timeo = nmp->nm_acdirmin; else if (timeo > nmp->nm_acdirmax) timeo = nmp->nm_acdirmax; } else { if ((np->n_flag & NMODIFIED) || timeo < nmp->nm_acregmin) timeo = nmp->nm_acregmin; else if (timeo > nmp->nm_acregmax) timeo = nmp->nm_acregmax; } #ifdef NFS_ACDEBUG if (nfs_acdebug > 2) printf("acregmin %d; acregmax %d; acdirmin %d; acdirmax %d\n", nmp->nm_acregmin, nmp->nm_acregmax, nmp->nm_acdirmin, nmp->nm_acdirmax); if (nfs_acdebug) printf("nfs_getattrcache: age = %d; final timeo = %d\n", (time_second - np->n_attrstamp), timeo); #endif if ((time_second - np->n_attrstamp) >= timeo) { nfsstats.attrcache_misses++; return (ENOENT); } nfsstats.attrcache_hits++; if (vap->va_size != np->n_size) { if (vap->va_type == VREG) { if (np->n_flag & NMODIFIED) { if (vap->va_size < np->n_size) vap->va_size = np->n_size; else np->n_size = vap->va_size; } else np->n_size = vap->va_size; vnode_pager_setsize(vp, np->n_size); } else np->n_size = vap->va_size; } bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(struct vattr)); if (np->n_flag & NCHG) { if (np->n_flag & NACC) vaper->va_atime = np->n_atim; if (np->n_flag & NUPD) vaper->va_mtime = np->n_mtim; } return (0); } #ifndef NFS_NOSERVER /* * Set up nameidata for a lookup() call and do it. * * If pubflag is set, this call is done for a lookup operation on the * public filehandle. In that case we allow crossing mountpoints and * absolute pathnames. However, the caller is expected to check that * the lookup result is within the public fs, and deny access if * it is not. * * nfs_namei() clears out garbage fields that namei() might leave garbage. * This is mainly ni_vp and ni_dvp when an error occurs, and ni_dvp when no * error occurs but the parent was not requested. * * dirp may be set whether an error is returned or not, and must be * released by the caller. */ int nfs_namei(ndp, fhp, len, slp, nam, mdp, dposp, retdirp, p, kerbflag, pubflag) register struct nameidata *ndp; fhandle_t *fhp; int len; struct nfssvc_sock *slp; struct sockaddr *nam; struct mbuf **mdp; caddr_t *dposp; struct vnode **retdirp; struct proc *p; int kerbflag, pubflag; { register int i, rem; register struct mbuf *md; register char *fromcp, *tocp, *cp; struct iovec aiov; struct uio auio; struct vnode *dp; int error, rdonly, linklen; struct componentname *cnp = &ndp->ni_cnd; *retdirp = (struct vnode *)0; cnp->cn_pnbuf = zalloc(namei_zone); /* * Copy the name from the mbuf list to ndp->ni_pnbuf * and set the various ndp fields appropriately. */ fromcp = *dposp; tocp = cnp->cn_pnbuf; md = *mdp; rem = mtod(md, caddr_t) + md->m_len - fromcp; cnp->cn_hash = 0; for (i = 0; i < len; i++) { while (rem == 0) { md = md->m_next; if (md == NULL) { error = EBADRPC; goto out; } fromcp = mtod(md, caddr_t); rem = md->m_len; } if (*fromcp == '\0' || (!pubflag && *fromcp == '/')) { error = EACCES; goto out; } cnp->cn_hash += (unsigned char)*fromcp; *tocp++ = *fromcp++; rem--; } *tocp = '\0'; *mdp = md; *dposp = fromcp; len = nfsm_rndup(len)-len; if (len > 0) { if (rem >= len) *dposp += len; else if ((error = nfs_adv(mdp, dposp, len, rem)) != 0) goto out; } /* * Extract and set starting directory. 
*/ error = nfsrv_fhtovp(fhp, FALSE, &dp, ndp->ni_cnd.cn_cred, slp, nam, &rdonly, kerbflag, pubflag); if (error) goto out; if (dp->v_type != VDIR) { vrele(dp); error = ENOTDIR; goto out; } if (rdonly) cnp->cn_flags |= RDONLY; /* * Set return directory. Reference to dp is implicitly transferred * to the returned pointer */ *retdirp = dp; if (pubflag) { /* * Oh joy. For WebNFS, handle those pesky '%' escapes, * and the 'native path' indicator. */ cp = zalloc(namei_zone); fromcp = cnp->cn_pnbuf; tocp = cp; if ((unsigned char)*fromcp >= WEBNFS_SPECCHAR_START) { switch ((unsigned char)*fromcp) { case WEBNFS_NATIVE_CHAR: /* * 'Native' path for us is the same * as a path according to the NFS spec, * just skip the escape char. */ fromcp++; break; /* * More may be added in the future, range 0x80-0xff */ default: error = EIO; zfree(namei_zone, cp); goto out; } } /* * Translate the '%' escapes, URL-style. */ while (*fromcp != '\0') { if (*fromcp == WEBNFS_ESC_CHAR) { if (fromcp[1] != '\0' && fromcp[2] != '\0') { fromcp++; *tocp++ = HEXSTRTOI(fromcp); fromcp += 2; continue; } else { error = ENOENT; zfree(namei_zone, cp); goto out; } } else *tocp++ = *fromcp++; } *tocp = '\0'; zfree(namei_zone, cnp->cn_pnbuf); cnp->cn_pnbuf = cp; } ndp->ni_pathlen = (tocp - cnp->cn_pnbuf) + 1; ndp->ni_segflg = UIO_SYSSPACE; if (pubflag) { ndp->ni_rootdir = rootvnode; ndp->ni_loopcnt = 0; if (cnp->cn_pnbuf[0] == '/') dp = rootvnode; } else { cnp->cn_flags |= NOCROSSMOUNT; } /* * Initialize for scan, set ni_startdir and bump ref on dp again * because lookup() will dereference ni_startdir. */ cnp->cn_proc = p; VREF(dp); ndp->ni_startdir = dp; for (;;) { cnp->cn_nameptr = cnp->cn_pnbuf; /* * Call lookup() to do the real work. If an error occurs, * ndp->ni_vp and ni_dvp are left uninitialized or NULL and * we do not have to dereference anything before returning. * In either case ni_startdir will be dereferenced and NULLed * out. */ error = lookup(ndp); if (error) break; /* * Check for encountering a symbolic link. Trivial * termination occurs if no symlink encountered. * Note: zfree is safe because error is 0, so we will * not zfree it again when we break.
*/ if ((cnp->cn_flags & ISSYMLINK) == 0) { nfsrv_object_create(ndp->ni_vp); if (cnp->cn_flags & (SAVENAME | SAVESTART)) cnp->cn_flags |= HASBUF; else zfree(namei_zone, cnp->cn_pnbuf); break; } /* * Validate symlink */ if ((cnp->cn_flags & LOCKPARENT) && ndp->ni_pathlen == 1) VOP_UNLOCK(ndp->ni_dvp, 0, p); if (!pubflag) { error = EINVAL; goto badlink2; } if (ndp->ni_loopcnt++ >= MAXSYMLINKS) { error = ELOOP; goto badlink2; } if (ndp->ni_pathlen > 1) cp = zalloc(namei_zone); else cp = cnp->cn_pnbuf; aiov.iov_base = cp; aiov.iov_len = MAXPATHLEN; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_procp = (struct proc *)0; auio.uio_resid = MAXPATHLEN; error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred); if (error) { badlink1: if (ndp->ni_pathlen > 1) zfree(namei_zone, cp); badlink2: vrele(ndp->ni_dvp); vput(ndp->ni_vp); break; } linklen = MAXPATHLEN - auio.uio_resid; if (linklen == 0) { error = ENOENT; goto badlink1; } if (linklen + ndp->ni_pathlen >= MAXPATHLEN) { error = ENAMETOOLONG; goto badlink1; } /* * Adjust or replace path */ if (ndp->ni_pathlen > 1) { bcopy(ndp->ni_next, cp + linklen, ndp->ni_pathlen); zfree(namei_zone, cnp->cn_pnbuf); cnp->cn_pnbuf = cp; } else cnp->cn_pnbuf[linklen] = '\0'; ndp->ni_pathlen += linklen; /* * Cleanup refs for next loop and check if root directory * should replace current directory. Normally ni_dvp * becomes the new base directory and is cleaned up when * we loop. Explicitly null pointers after invalidation * to clarify operation. */ vput(ndp->ni_vp); ndp->ni_vp = NULL; if (cnp->cn_pnbuf[0] == '/') { vrele(ndp->ni_dvp); ndp->ni_dvp = ndp->ni_rootdir; VREF(ndp->ni_dvp); } ndp->ni_startdir = ndp->ni_dvp; ndp->ni_dvp = NULL; } /* * nfs_namei() guarantees that fields will not contain garbage * whether an error occurs or not. This allows the caller to track * cleanup state trivially. */ out: if (error) { zfree(namei_zone, cnp->cn_pnbuf); ndp->ni_vp = NULL; ndp->ni_dvp = NULL; ndp->ni_startdir = NULL; cnp->cn_flags &= ~HASBUF; } else if ((ndp->ni_cnd.cn_flags & (WANTPARENT|LOCKPARENT)) == 0) { ndp->ni_dvp = NULL; } return (error); } /* * A fiddled version of m_adj() that ensures null fill to a long * boundary and only trims off the back end */ void nfsm_adj(mp, len, nul) struct mbuf *mp; register int len; int nul; { register struct mbuf *m; register int count, i; register char *cp; /* * Trim from tail. Scan the mbuf chain, * calculating its length and finding the last mbuf. * If the adjustment only affects this mbuf, then just * adjust and return. Otherwise, rescan and truncate * after the remaining size. */ count = 0; m = mp; for (;;) { count += m->m_len; if (m->m_next == (struct mbuf *)0) break; m = m->m_next; } if (m->m_len > len) { m->m_len -= len; if (nul > 0) { cp = mtod(m, caddr_t)+m->m_len-nul; for (i = 0; i < nul; i++) *cp++ = '\0'; } return; } count -= len; if (count < 0) count = 0; /* * Correct length for chain is "count". * Find the mbuf with last data, adjust its length, * and toss data from remaining mbufs on chain. */ for (m = mp; m; m = m->m_next) { if (m->m_len >= count) { m->m_len = count; if (nul > 0) { cp = mtod(m, caddr_t)+m->m_len-nul; for (i = 0; i < nul; i++) *cp++ = '\0'; } break; } count -= m->m_len; } for (m = m->m_next;m;m = m->m_next) m->m_len = 0; } /* * Make these functions instead of macros, so that the kernel text size * doesn't get too big...
*/ void nfsm_srvwcc(nfsd, before_ret, before_vap, after_ret, after_vap, mbp, bposp) struct nfsrv_descript *nfsd; int before_ret; register struct vattr *before_vap; int after_ret; struct vattr *after_vap; struct mbuf **mbp; char **bposp; { register struct mbuf *mb = *mbp, *mb2; register char *bpos = *bposp; register u_int32_t *tl; if (before_ret) { nfsm_build(tl, u_int32_t *, NFSX_UNSIGNED); *tl = nfs_false; } else { nfsm_build(tl, u_int32_t *, 7 * NFSX_UNSIGNED); *tl++ = nfs_true; txdr_hyper(before_vap->va_size, tl); tl += 2; txdr_nfsv3time(&(before_vap->va_mtime), tl); tl += 2; txdr_nfsv3time(&(before_vap->va_ctime), tl); } *bposp = bpos; *mbp = mb; nfsm_srvpostopattr(nfsd, after_ret, after_vap, mbp, bposp); } void nfsm_srvpostopattr(nfsd, after_ret, after_vap, mbp, bposp) struct nfsrv_descript *nfsd; int after_ret; struct vattr *after_vap; struct mbuf **mbp; char **bposp; { register struct mbuf *mb = *mbp, *mb2; register char *bpos = *bposp; register u_int32_t *tl; register struct nfs_fattr *fp; if (after_ret) { nfsm_build(tl, u_int32_t *, NFSX_UNSIGNED); *tl = nfs_false; } else { nfsm_build(tl, u_int32_t *, NFSX_UNSIGNED + NFSX_V3FATTR); *tl++ = nfs_true; fp = (struct nfs_fattr *)tl; nfsm_srvfattr(nfsd, after_vap, fp); } *mbp = mb; *bposp = bpos; } void nfsm_srvfattr(nfsd, vap, fp) register struct nfsrv_descript *nfsd; register struct vattr *vap; register struct nfs_fattr *fp; { fp->fa_nlink = txdr_unsigned(vap->va_nlink); fp->fa_uid = txdr_unsigned(vap->va_uid); fp->fa_gid = txdr_unsigned(vap->va_gid); if (nfsd->nd_flag & ND_NFSV3) { fp->fa_type = vtonfsv3_type(vap->va_type); fp->fa_mode = vtonfsv3_mode(vap->va_mode); txdr_hyper(vap->va_size, &fp->fa3_size); txdr_hyper(vap->va_bytes, &fp->fa3_used); fp->fa3_rdev.specdata1 = txdr_unsigned(umajor(vap->va_rdev)); fp->fa3_rdev.specdata2 = txdr_unsigned(uminor(vap->va_rdev)); fp->fa3_fsid.nfsuquad[0] = 0; fp->fa3_fsid.nfsuquad[1] = txdr_unsigned(vap->va_fsid); fp->fa3_fileid.nfsuquad[0] = 0; fp->fa3_fileid.nfsuquad[1] = txdr_unsigned(vap->va_fileid); txdr_nfsv3time(&vap->va_atime, &fp->fa3_atime); txdr_nfsv3time(&vap->va_mtime, &fp->fa3_mtime); txdr_nfsv3time(&vap->va_ctime, &fp->fa3_ctime); } else { fp->fa_type = vtonfsv2_type(vap->va_type); fp->fa_mode = vtonfsv2_mode(vap->va_type, vap->va_mode); fp->fa2_size = txdr_unsigned(vap->va_size); fp->fa2_blocksize = txdr_unsigned(vap->va_blocksize); if (vap->va_type == VFIFO) fp->fa2_rdev = 0xffffffff; else fp->fa2_rdev = txdr_unsigned(vap->va_rdev); fp->fa2_blocks = txdr_unsigned(vap->va_bytes / NFS_FABLKSIZE); fp->fa2_fsid = txdr_unsigned(vap->va_fsid); fp->fa2_fileid = txdr_unsigned(vap->va_fileid); txdr_nfsv2time(&vap->va_atime, &fp->fa2_atime); txdr_nfsv2time(&vap->va_mtime, &fp->fa2_mtime); txdr_nfsv2time(&vap->va_ctime, &fp->fa2_ctime); } } /* * nfsrv_fhtovp() - convert a fh to a vnode ptr (optionally locked) * - look up fsid in mount list (if not found ret error) * - get vp and export rights by calling VFS_FHTOVP() * - if cred->cr_uid == 0 or MNT_EXPORTANON set it to credanon * - if not lockflag unlock it with VOP_UNLOCK() */ int nfsrv_fhtovp(fhp, lockflag, vpp, cred, slp, nam, rdonlyp, kerbflag, pubflag) fhandle_t *fhp; int lockflag; struct vnode **vpp; struct ucred *cred; struct nfssvc_sock *slp; struct sockaddr *nam; int *rdonlyp; int kerbflag; int pubflag; { struct proc *p = curproc; /* XXX */ register struct mount *mp; register int i; struct ucred *credanon; int error, exflags; #ifdef MNT_EXNORESPORT /* XXX needs mountd and /etc/exports help yet */ struct sockaddr_int *saddr; #endif 
*vpp = (struct vnode *)0; if (nfs_ispublicfh(fhp)) { if (!pubflag || !nfs_pub.np_valid) return (ESTALE); fhp = &nfs_pub.np_handle; } mp = vfs_getvfs(&fhp->fh_fsid); if (!mp) return (ESTALE); error = VFS_FHTOVP(mp, &fhp->fh_fid, nam, vpp, &exflags, &credanon); if (error) return (error); #ifdef MNT_EXNORESPORT if (!(exflags & (MNT_EXNORESPORT|MNT_EXPUBLIC))) { saddr = (struct sockaddr_in *)nam; if (saddr->sin_family == AF_INET && ntohs(saddr->sin_port) >= IPPORT_RESERVED) { vput(*vpp); return (NFSERR_AUTHERR | AUTH_TOOWEAK); } } #endif /* * Check/setup credentials. */ if (exflags & MNT_EXKERB) { if (!kerbflag) { vput(*vpp); return (NFSERR_AUTHERR | AUTH_TOOWEAK); } } else if (kerbflag) { vput(*vpp); return (NFSERR_AUTHERR | AUTH_TOOWEAK); } else if (cred->cr_uid == 0 || (exflags & MNT_EXPORTANON)) { cred->cr_uid = credanon->cr_uid; for (i = 0; i < credanon->cr_ngroups && i < NGROUPS; i++) cred->cr_groups[i] = credanon->cr_groups[i]; cred->cr_ngroups = i; } if (exflags & MNT_EXRDONLY) *rdonlyp = 1; else *rdonlyp = 0; nfsrv_object_create(*vpp); if (!lockflag) VOP_UNLOCK(*vpp, 0, p); return (0); } /* * WebNFS: check if a filehandle is a public filehandle. For v3, this * means a length of 0, for v2 it means all zeroes. nfsm_srvmtofh has * transformed this to all zeroes in both cases, so check for it. */ int nfs_ispublicfh(fhp) fhandle_t *fhp; { char *cp = (char *)fhp; int i; for (i = 0; i < NFSX_V3FH; i++) if (*cp++ != 0) return (FALSE); return (TRUE); } #endif /* NFS_NOSERVER */ /* * This function compares two net addresses by family and returns TRUE * if they are the same host. * If there is any doubt, return FALSE. * The AF_INET family is handled as a special case so that address mbufs * don't need to be saved to store "struct in_addr", which is only 4 bytes. */ int netaddr_match(family, haddr, nam) int family; union nethostaddr *haddr; struct sockaddr *nam; { register struct sockaddr_in *inetaddr; switch (family) { case AF_INET: inetaddr = (struct sockaddr_in *)nam; if (inetaddr->sin_family == AF_INET && inetaddr->sin_addr.s_addr == haddr->had_inetaddr) return (1); break; #ifdef ISO case AF_ISO: { register struct sockaddr_iso *isoaddr1, *isoaddr2; isoaddr1 = (struct sockaddr_iso *)nam; isoaddr2 = (struct sockaddr_iso *)haddr->had_nam; if (isoaddr1->siso_family == AF_ISO && isoaddr1->siso_nlen > 0 && isoaddr1->siso_nlen == isoaddr2->siso_nlen && SAME_ISOADDR(isoaddr1, isoaddr2)) return (1); break; } #endif /* ISO */ default: break; }; return (0); } static nfsuint64 nfs_nullcookie = { { 0, 0 } }; /* * This function finds the directory cookie that corresponds to the * logical byte offset given. 
*/ nfsuint64 * nfs_getcookie(np, off, add) register struct nfsnode *np; off_t off; int add; { register struct nfsdmap *dp, *dp2; register int pos; pos = (uoff_t)off / NFS_DIRBLKSIZ; if (pos == 0 || off < 0) { #ifdef DIAGNOSTIC if (add) panic("nfs getcookie add at <= 0"); #endif return (&nfs_nullcookie); } pos--; dp = np->n_cookies.lh_first; if (!dp) { if (add) { MALLOC(dp, struct nfsdmap *, sizeof (struct nfsdmap), M_NFSDIROFF, M_WAITOK); dp->ndm_eocookie = 0; LIST_INSERT_HEAD(&np->n_cookies, dp, ndm_list); } else return ((nfsuint64 *)0); } while (pos >= NFSNUMCOOKIES) { pos -= NFSNUMCOOKIES; if (dp->ndm_list.le_next) { if (!add && dp->ndm_eocookie < NFSNUMCOOKIES && pos >= dp->ndm_eocookie) return ((nfsuint64 *)0); dp = dp->ndm_list.le_next; } else if (add) { MALLOC(dp2, struct nfsdmap *, sizeof (struct nfsdmap), M_NFSDIROFF, M_WAITOK); dp2->ndm_eocookie = 0; LIST_INSERT_AFTER(dp, dp2, ndm_list); dp = dp2; } else return ((nfsuint64 *)0); } if (pos >= dp->ndm_eocookie) { if (add) dp->ndm_eocookie = pos + 1; else return ((nfsuint64 *)0); } return (&dp->ndm_cookies[pos]); } /* * Invalidate cached directory information, except for the actual directory * blocks (which are invalidated separately). * Done mainly to avoid the use of stale offset cookies. */ void nfs_invaldir(vp) register struct vnode *vp; { register struct nfsnode *np = VTONFS(vp); #ifdef DIAGNOSTIC if (vp->v_type != VDIR) panic("nfs: invaldir not dir"); #endif np->n_direofoffset = 0; np->n_cookieverf.nfsuquad[0] = 0; np->n_cookieverf.nfsuquad[1] = 0; if (np->n_cookies.lh_first) np->n_cookies.lh_first->ndm_eocookie = 0; } /* * The write verifier has changed (probably due to a server reboot), so all * B_NEEDCOMMIT blocks will have to be written again. Since they are on the * dirty block list as B_DELWRI, all this takes is clearing the B_NEEDCOMMIT * flag. Once done the new write verifier can be set for the mount point. */ void nfs_clearcommit(mp) struct mount *mp; { register struct vnode *vp, *nvp; register struct buf *bp, *nbp; int s; s = splbio(); loop: for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) { if (vp->v_mount != mp) /* Paranoia */ goto loop; nvp = vp->v_mntvnodes.le_next; for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { nbp = TAILQ_NEXT(bp, b_vnbufs); if (BUF_REFCNT(bp) == 0 && (bp->b_flags & (B_DELWRI | B_NEEDCOMMIT)) == (B_DELWRI | B_NEEDCOMMIT)) bp->b_flags &= ~B_NEEDCOMMIT; } } splx(s); } #ifndef NFS_NOSERVER /* * Map errnos to NFS error numbers. For Version 3 also filter out error * numbers not specified for the associated procedure. */ int nfsrv_errmap(nd, err) struct nfsrv_descript *nd; register int err; { register short *defaulterrp, *errp; if (nd->nd_flag & ND_NFSV3) { if (nd->nd_procnum <= NFSPROC_COMMIT) { errp = defaulterrp = nfsrv_v3errmap[nd->nd_procnum]; while (*++errp) { if (*errp == err) return (err); else if (*errp > err) break; } return ((int)*defaulterrp); } else return (err & 0xffff); } if (err <= ELAST) return ((int)nfsrv_v2errmap[err - 1]); return (NFSERR_IO); } int nfsrv_object_create(vp) struct vnode *vp; { if (vp == NULL || vp->v_type != VREG) return (1); return (vfs_object_create(vp, curproc, curproc ? curproc->p_ucred : NULL)); } /* * Sort the group list in increasing numerical order. * (Insertion sort by Chris Torek, who was grossed out by the bubble sort * that used to be here.) */ void nfsrvw_sort(list, num) register gid_t *list; register int num; { register int i, j; gid_t v; /* Insertion sort. 
*/ for (i = 1; i < num; i++) { v = list[i]; /* find correct slot for value v, moving others up */ for (j = i; --j >= 0 && v < list[j];) list[j + 1] = list[j]; list[j + 1] = v; } } /* * copy credentials making sure that the result can be compared with bcmp(). */ void nfsrv_setcred(incred, outcred) register struct ucred *incred, *outcred; { register int i; bzero((caddr_t)outcred, sizeof (struct ucred)); outcred->cr_ref = 1; outcred->cr_uid = incred->cr_uid; outcred->cr_ngroups = incred->cr_ngroups; for (i = 0; i < incred->cr_ngroups; i++) outcred->cr_groups[i] = incred->cr_groups[i]; nfsrvw_sort(outcred->cr_groups, outcred->cr_ngroups); } #endif /* NFS_NOSERVER */ Index: head/sys/nfsclient/nfs_vnops.c =================================================================== --- head/sys/nfsclient/nfs_vnops.c (revision 49534) +++ head/sys/nfsclient/nfs_vnops.c (revision 49535) @@ -1,3372 +1,3372 @@ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Rick Macklem at The University of Guelph. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)nfs_vnops.c 8.16 (Berkeley) 5/27/95 - * $Id: nfs_vnops.c,v 1.137 1999/07/30 04:51:35 wpaul Exp $ + * $Id: nfs_vnops.c,v 1.138 1999/07/31 01:51:58 msmith Exp $ */ /* * vnode op calls for Sun NFS version 2 and 3 */ #include "opt_inet.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include -#include #include #include #include #include #include #include #include #include #include #include #include /* Defs */ #define TRUE 1 #define FALSE 0 /* * Ifdef for FreeBSD-current merged buffer cache. It is unfortunate that these * calls are not in getblk() and brelse() so that they would not be necessary * here. 
*/ #ifndef B_VMIO #define vfs_busy_pages(bp, f) #endif static int nfsspec_read __P((struct vop_read_args *)); static int nfsspec_write __P((struct vop_write_args *)); static int nfsfifo_read __P((struct vop_read_args *)); static int nfsfifo_write __P((struct vop_write_args *)); static int nfsspec_close __P((struct vop_close_args *)); static int nfsfifo_close __P((struct vop_close_args *)); #define nfs_poll vop_nopoll static int nfs_flush __P((struct vnode *,struct ucred *,int,struct proc *,int)); static int nfs_setattrrpc __P((struct vnode *,struct vattr *,struct ucred *,struct proc *)); static int nfs_lookup __P((struct vop_lookup_args *)); static int nfs_create __P((struct vop_create_args *)); static int nfs_mknod __P((struct vop_mknod_args *)); static int nfs_open __P((struct vop_open_args *)); static int nfs_close __P((struct vop_close_args *)); static int nfs_access __P((struct vop_access_args *)); static int nfs_getattr __P((struct vop_getattr_args *)); static int nfs_setattr __P((struct vop_setattr_args *)); static int nfs_read __P((struct vop_read_args *)); static int nfs_mmap __P((struct vop_mmap_args *)); static int nfs_fsync __P((struct vop_fsync_args *)); static int nfs_remove __P((struct vop_remove_args *)); static int nfs_link __P((struct vop_link_args *)); static int nfs_rename __P((struct vop_rename_args *)); static int nfs_mkdir __P((struct vop_mkdir_args *)); static int nfs_rmdir __P((struct vop_rmdir_args *)); static int nfs_symlink __P((struct vop_symlink_args *)); static int nfs_readdir __P((struct vop_readdir_args *)); static int nfs_bmap __P((struct vop_bmap_args *)); static int nfs_strategy __P((struct vop_strategy_args *)); static int nfs_lookitup __P((struct vnode *, const char *, int, struct ucred *, struct proc *, struct nfsnode **)); static int nfs_sillyrename __P((struct vnode *,struct vnode *,struct componentname *)); static int nfsspec_access __P((struct vop_access_args *)); static int nfs_readlink __P((struct vop_readlink_args *)); static int nfs_print __P((struct vop_print_args *)); static int nfs_advlock __P((struct vop_advlock_args *)); static int nfs_bwrite __P((struct vop_bwrite_args *)); /* * Global vfs data structures for nfs */ vop_t **nfsv2_vnodeop_p; static struct vnodeopv_entry_desc nfsv2_vnodeop_entries[] = { { &vop_default_desc, (vop_t *) vop_defaultop }, { &vop_abortop_desc, (vop_t *) nfs_abortop }, { &vop_access_desc, (vop_t *) nfs_access }, { &vop_advlock_desc, (vop_t *) nfs_advlock }, { &vop_bmap_desc, (vop_t *) nfs_bmap }, { &vop_bwrite_desc, (vop_t *) nfs_bwrite }, { &vop_close_desc, (vop_t *) nfs_close }, { &vop_create_desc, (vop_t *) nfs_create }, { &vop_fsync_desc, (vop_t *) nfs_fsync }, { &vop_getattr_desc, (vop_t *) nfs_getattr }, { &vop_getpages_desc, (vop_t *) nfs_getpages }, { &vop_putpages_desc, (vop_t *) nfs_putpages }, { &vop_inactive_desc, (vop_t *) nfs_inactive }, { &vop_lease_desc, (vop_t *) vop_null }, { &vop_link_desc, (vop_t *) nfs_link }, { &vop_lock_desc, (vop_t *) vop_sharedlock }, { &vop_lookup_desc, (vop_t *) nfs_lookup }, { &vop_mkdir_desc, (vop_t *) nfs_mkdir }, { &vop_mknod_desc, (vop_t *) nfs_mknod }, { &vop_mmap_desc, (vop_t *) nfs_mmap }, { &vop_open_desc, (vop_t *) nfs_open }, { &vop_poll_desc, (vop_t *) nfs_poll }, { &vop_print_desc, (vop_t *) nfs_print }, { &vop_read_desc, (vop_t *) nfs_read }, { &vop_readdir_desc, (vop_t *) nfs_readdir }, { &vop_readlink_desc, (vop_t *) nfs_readlink }, { &vop_reclaim_desc, (vop_t *) nfs_reclaim }, { &vop_remove_desc, (vop_t *) nfs_remove }, { &vop_rename_desc, (vop_t *) 
nfs_rename }, { &vop_rmdir_desc, (vop_t *) nfs_rmdir }, { &vop_setattr_desc, (vop_t *) nfs_setattr }, { &vop_strategy_desc, (vop_t *) nfs_strategy }, { &vop_symlink_desc, (vop_t *) nfs_symlink }, { &vop_write_desc, (vop_t *) nfs_write }, { NULL, NULL } }; static struct vnodeopv_desc nfsv2_vnodeop_opv_desc = { &nfsv2_vnodeop_p, nfsv2_vnodeop_entries }; VNODEOP_SET(nfsv2_vnodeop_opv_desc); /* * Special device vnode ops */ vop_t **spec_nfsv2nodeop_p; static struct vnodeopv_entry_desc nfsv2_specop_entries[] = { { &vop_default_desc, (vop_t *) spec_vnoperate }, { &vop_access_desc, (vop_t *) nfsspec_access }, { &vop_close_desc, (vop_t *) nfsspec_close }, { &vop_fsync_desc, (vop_t *) nfs_fsync }, { &vop_getattr_desc, (vop_t *) nfs_getattr }, { &vop_inactive_desc, (vop_t *) nfs_inactive }, { &vop_lock_desc, (vop_t *) vop_sharedlock }, { &vop_print_desc, (vop_t *) nfs_print }, { &vop_read_desc, (vop_t *) nfsspec_read }, { &vop_reclaim_desc, (vop_t *) nfs_reclaim }, { &vop_setattr_desc, (vop_t *) nfs_setattr }, { &vop_write_desc, (vop_t *) nfsspec_write }, { NULL, NULL } }; static struct vnodeopv_desc spec_nfsv2nodeop_opv_desc = { &spec_nfsv2nodeop_p, nfsv2_specop_entries }; VNODEOP_SET(spec_nfsv2nodeop_opv_desc); vop_t **fifo_nfsv2nodeop_p; static struct vnodeopv_entry_desc nfsv2_fifoop_entries[] = { { &vop_default_desc, (vop_t *) fifo_vnoperate }, { &vop_access_desc, (vop_t *) nfsspec_access }, { &vop_close_desc, (vop_t *) nfsfifo_close }, { &vop_fsync_desc, (vop_t *) nfs_fsync }, { &vop_getattr_desc, (vop_t *) nfs_getattr }, { &vop_inactive_desc, (vop_t *) nfs_inactive }, { &vop_lock_desc, (vop_t *) vop_sharedlock }, { &vop_print_desc, (vop_t *) nfs_print }, { &vop_read_desc, (vop_t *) nfsfifo_read }, { &vop_reclaim_desc, (vop_t *) nfs_reclaim }, { &vop_setattr_desc, (vop_t *) nfs_setattr }, { &vop_write_desc, (vop_t *) nfsfifo_write }, { NULL, NULL } }; static struct vnodeopv_desc fifo_nfsv2nodeop_opv_desc = { &fifo_nfsv2nodeop_p, nfsv2_fifoop_entries }; VNODEOP_SET(fifo_nfsv2nodeop_opv_desc); static int nfs_commit __P((struct vnode *vp, u_quad_t offset, int cnt, struct ucred *cred, struct proc *procp)); static int nfs_mknodrpc __P((struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct vattr *vap)); static int nfs_removerpc __P((struct vnode *dvp, const char *name, int namelen, struct ucred *cred, struct proc *proc)); static int nfs_renamerpc __P((struct vnode *fdvp, const char *fnameptr, int fnamelen, struct vnode *tdvp, const char *tnameptr, int tnamelen, struct ucred *cred, struct proc *proc)); static int nfs_renameit __P((struct vnode *sdvp, struct componentname *scnp, struct sillyrename *sp)); /* * Global variables */ extern u_int32_t nfs_true, nfs_false; extern u_int32_t nfs_xdrneg1; extern struct nfsstats nfsstats; extern nfstype nfsv3_type[9]; struct proc *nfs_iodwant[NFS_MAXASYNCDAEMON]; struct nfsmount *nfs_iodmount[NFS_MAXASYNCDAEMON]; int nfs_numasync = 0; #define DIRHDSIZ (sizeof (struct dirent) - (MAXNAMLEN + 1)) SYSCTL_DECL(_vfs_nfs); static int nfsaccess_cache_timeout = NFS_MAXATTRTIMO; SYSCTL_INT(_vfs_nfs, OID_AUTO, access_cache_timeout, CTLFLAG_RW, &nfsaccess_cache_timeout, 0, "NFS ACCESS cache timeout"); static int nfsaccess_cache_hits; SYSCTL_INT(_vfs_nfs, OID_AUTO, access_cache_hits, CTLFLAG_RD, &nfsaccess_cache_hits, 0, "NFS ACCESS cache hit count"); static int nfsaccess_cache_misses; SYSCTL_INT(_vfs_nfs, OID_AUTO, access_cache_misses, CTLFLAG_RD, &nfsaccess_cache_misses, 0, "NFS ACCESS cache miss count"); #define NFSV3ACCESS_ALL (NFSV3ACCESS_READ | 
NFSV3ACCESS_MODIFY \ | NFSV3ACCESS_EXTEND | NFSV3ACCESS_EXECUTE \ | NFSV3ACCESS_DELETE | NFSV3ACCESS_LOOKUP) static int nfs3_access_otw(struct vnode *vp, int wmode, struct proc *p, struct ucred *cred) { const int v3 = 1; u_int32_t *tl; int error = 0, attrflag; struct mbuf *mreq, *mrep, *md, *mb, *mb2; caddr_t bpos, dpos, cp2; register int32_t t1, t2; register caddr_t cp; u_int32_t rmode; struct nfsnode *np = VTONFS(vp); nfsstats.rpccnt[NFSPROC_ACCESS]++; nfsm_reqhead(vp, NFSPROC_ACCESS, NFSX_FH(v3) + NFSX_UNSIGNED); nfsm_fhtom(vp, v3); nfsm_build(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(wmode); nfsm_request(vp, NFSPROC_ACCESS, p, cred); nfsm_postop_attr(vp, attrflag); if (!error) { nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED); rmode = fxdr_unsigned(u_int32_t, *tl); np->n_mode = rmode; np->n_modeuid = cred->cr_uid; np->n_modestamp = time_second; } nfsm_reqdone; return error; } /* * nfs access vnode op. * For nfs version 2, just return ok. File accesses may fail later. * For nfs version 3, use the access rpc to check accessibility. If file modes * are changed on the server, accesses might still fail later. */ static int nfs_access(ap) struct vop_access_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct proc *a_p; } */ *ap; { register struct vnode *vp = ap->a_vp; int error = 0; u_int32_t mode, wmode; int v3 = NFS_ISV3(vp); struct nfsnode *np = VTONFS(vp); /* * Disallow write attempts on filesystems mounted read-only; * unless the file is a socket, fifo, or a block or character * device resident on the filesystem. */ if ((ap->a_mode & VWRITE) && (vp->v_mount->mnt_flag & MNT_RDONLY)) { switch (vp->v_type) { case VREG: case VDIR: case VLNK: return (EROFS); default: break; } } /* * For nfs v3, check to see if we have done this recently, and if * so return our cached result instead of making an ACCESS call. * If not, do an access rpc, otherwise you are stuck emulating * ufs_access() locally using the vattr. This may not be correct, * since the server may apply other access criteria such as * client uid-->server uid mapping that we do not know about. */ if (v3) { if (ap->a_mode & VREAD) mode = NFSV3ACCESS_READ; else mode = 0; if (vp->v_type != VDIR) { if (ap->a_mode & VWRITE) mode |= (NFSV3ACCESS_MODIFY | NFSV3ACCESS_EXTEND); if (ap->a_mode & VEXEC) mode |= NFSV3ACCESS_EXECUTE; } else { if (ap->a_mode & VWRITE) mode |= (NFSV3ACCESS_MODIFY | NFSV3ACCESS_EXTEND | NFSV3ACCESS_DELETE); if (ap->a_mode & VEXEC) mode |= NFSV3ACCESS_LOOKUP; } /* XXX safety belt, only make blanket request if caching */ if (nfsaccess_cache_timeout > 0) { wmode = NFSV3ACCESS_READ | NFSV3ACCESS_MODIFY | NFSV3ACCESS_EXTEND | NFSV3ACCESS_EXECUTE | NFSV3ACCESS_DELETE | NFSV3ACCESS_LOOKUP; } else { wmode = mode; } /* * Does our cached result allow us to give a definite yes to * this request? */ if ((time_second < (np->n_modestamp + nfsaccess_cache_timeout)) && (ap->a_cred->cr_uid == np->n_modeuid) && ((np->n_mode & mode) == mode)) { nfsaccess_cache_hits++; } else { /* * Either a no, or a don't know. Go to the wire. */ nfsaccess_cache_misses++; error = nfs3_access_otw(vp, wmode, ap->a_p,ap->a_cred); if (!error) { if ((np->n_mode & mode) != mode) { error = EACCES; } } } return (error); } else { if ((error = nfsspec_access(ap)) != 0) return (error); /* * Attempt to prevent a mapped root from accessing a file * which it shouldn't. We try to read a byte from the file * if the user is root and the file is not zero length. * After calling nfsspec_access, we should have the correct * file size cached. 
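nfs_access() above answers from a small per-node cache when the stored ACCESS result is recent enough, was obtained for the same uid, and already grants every requested bit; otherwise it goes to the wire. A hypothetical userland sketch of that check (structure and names are illustrative, not the kernel's):

#include <stdio.h>
#include <time.h>

/* cached result of one over-the-wire ACCESS reply */
struct access_cache {
	time_t		stamp;		/* when the reply was received */
	unsigned int	uid;		/* credential it was obtained for */
	unsigned int	granted;	/* bitmap of rights the server granted */
};

/* 1 = definite yes from cache, 0 = unknown or no, must ask the server */
static int
access_cache_hit(const struct access_cache *c, unsigned int uid,
    unsigned int wanted, int timeout)
{
	if (timeout <= 0)
		return (0);				/* caching disabled */
	if (time(NULL) >= c->stamp + timeout)
		return (0);				/* entry too old */
	if (uid != c->uid)
		return (0);				/* different credential */
	return ((c->granted & wanted) == wanted);	/* every wanted bit must be cached */
}

int
main(void)
{
	struct access_cache c = { time(NULL), 100, 0x1 | 0x2 };

	printf("%d\n", access_cache_hit(&c, 100, 0x1, 60));	/* 1: cached yes */
	printf("%d\n", access_cache_hit(&c, 100, 0x4, 60));	/* 0: must go to the wire */
	return (0);
}
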
*/ if (ap->a_cred->cr_uid == 0 && (ap->a_mode & VREAD) && VTONFS(vp)->n_size > 0) { struct iovec aiov; struct uio auio; char buf[1]; aiov.iov_base = buf; aiov.iov_len = 1; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; auio.uio_resid = 1; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_READ; auio.uio_procp = ap->a_p; if (vp->v_type == VREG) error = nfs_readrpc(vp, &auio, ap->a_cred); else if (vp->v_type == VDIR) { char* bp; bp = malloc(NFS_DIRBLKSIZ, M_TEMP, M_WAITOK); aiov.iov_base = bp; aiov.iov_len = auio.uio_resid = NFS_DIRBLKSIZ; error = nfs_readdirrpc(vp, &auio, ap->a_cred); free(bp, M_TEMP); } else if (vp->v_type == VLNK) error = nfs_readlinkrpc(vp, &auio, ap->a_cred); else error = EACCES; } return (error); } } /* * nfs open vnode op * Check to see if the type is ok * and that deletion is not in progress. * For paged in text files, you will need to flush the page cache * if consistency is lost. */ /* ARGSUSED */ static int nfs_open(ap) struct vop_open_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct proc *a_p; } */ *ap; { register struct vnode *vp = ap->a_vp; struct nfsnode *np = VTONFS(vp); struct nfsmount *nmp = VFSTONFS(vp->v_mount); struct vattr vattr; int error; if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) { #ifdef DIAGNOSTIC printf("open eacces vtyp=%d\n",vp->v_type); #endif return (EACCES); } /* * Get a valid lease. If cached data is stale, flush it. */ if (nmp->nm_flag & NFSMNT_NQNFS) { if (NQNFS_CKINVALID(vp, np, ND_READ)) { do { error = nqnfs_getlease(vp, ND_READ, ap->a_cred, ap->a_p); } while (error == NQNFS_EXPIRED); if (error) return (error); if (np->n_lrev != np->n_brev || (np->n_flag & NQNFSNONCACHE)) { if ((error = nfs_vinvalbuf(vp, V_SAVE, ap->a_cred, ap->a_p, 1)) == EINTR) return (error); np->n_brev = np->n_lrev; } } } else { if (np->n_flag & NMODIFIED) { if ((error = nfs_vinvalbuf(vp, V_SAVE, ap->a_cred, ap->a_p, 1)) == EINTR) return (error); np->n_attrstamp = 0; if (vp->v_type == VDIR) np->n_direofoffset = 0; error = VOP_GETATTR(vp, &vattr, ap->a_cred, ap->a_p); if (error) return (error); np->n_mtime = vattr.va_mtime.tv_sec; } else { error = VOP_GETATTR(vp, &vattr, ap->a_cred, ap->a_p); if (error) return (error); if (np->n_mtime != vattr.va_mtime.tv_sec) { if (vp->v_type == VDIR) np->n_direofoffset = 0; if ((error = nfs_vinvalbuf(vp, V_SAVE, ap->a_cred, ap->a_p, 1)) == EINTR) return (error); np->n_mtime = vattr.va_mtime.tv_sec; } } } if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) np->n_attrstamp = 0; /* For Open/Close consistency */ return (0); } /* * nfs close vnode op * What an NFS client should do upon close after writing is a debatable issue. * Most NFS clients push delayed writes to the server upon close, basically for * two reasons: * 1 - So that any write errors may be reported back to the client process * doing the close system call. By far the two most likely errors are * NFSERR_NOSPC and NFSERR_DQUOT to indicate space allocation failure. * 2 - To put a worst case upper bound on cache inconsistency between * multiple clients for the file. * There is also a consistency problem for Version 2 of the protocol w.r.t. * not being able to tell if other clients are writing a file concurrently, * since there is no way of knowing if the changed modify time in the reply * is only due to the write for this client. * (NFS Version 3 provides weak cache consistency data in the reply that * should be sufficient to detect and handle this case.) 
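The comment here (continued just below) explains why the client pushes delayed writes at close time; together with the modify-time check nfs_open() performs above, this gives the usual close-to-open consistency model. A compact userland sketch of the two halves, using illustrative names rather than the real nfsnode fields:

#include <stdio.h>
#include <time.h>

/* per-file client state (illustrative, not struct nfsnode) */
struct rfile {
	int	dirty;		/* delayed writes not yet pushed to the server */
	time_t	cached_mtime;	/* server mtime recorded at last validation */
};

/* close: push dirty data so other clients' opens will see a new mtime */
static void
remote_close(struct rfile *f, time_t *server_mtime)
{
	if (f->dirty) {
		f->dirty = 0;
		*server_mtime = time(NULL);	/* the flush bumps the server's mtime */
	}
}

/* open: revalidate; returns 1 if cached data must be thrown away */
static int
remote_open(struct rfile *f, time_t server_mtime)
{
	if (f->cached_mtime != server_mtime) {
		f->cached_mtime = server_mtime;
		return (1);		/* someone else changed the file */
	}
	return (0);
}

int
main(void)
{
	time_t srv = 1000;
	struct rfile writer = { 1, 1000 }, reader = { 0, 1000 };

	remote_close(&writer, &srv);	/* writer flushes at close */
	printf("reader invalidates: %d\n", remote_open(&reader, srv));	/* 1 */
	return (0);
}
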
* * The current code does the following: * for NFS Version 2 - play it safe and flush/invalidate all dirty buffers * for NFS Version 3 - flush dirty buffers to the server but don't invalidate * or commit them (this satisfies 1 and 2 except for the * case where the server crashes after this close but * before the commit RPC, which is felt to be "good * enough". Changing the last argument to nfs_flush() to * a 1 would force a commit operation, if it is felt a * commit is necessary now. * for NQNFS - do nothing now, since 2 is dealt with via leases and * 1 should be dealt with via an fsync() system call for * cases where write errors are important. */ /* ARGSUSED */ static int nfs_close(ap) struct vop_close_args /* { struct vnodeop_desc *a_desc; struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct proc *a_p; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct nfsnode *np = VTONFS(vp); int error = 0; if (vp->v_type == VREG) { if ((VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NQNFS) == 0 && (np->n_flag & NMODIFIED)) { if (NFS_ISV3(vp)) { error = nfs_flush(vp, ap->a_cred, MNT_WAIT, ap->a_p, 0); np->n_flag &= ~NMODIFIED; } else error = nfs_vinvalbuf(vp, V_SAVE, ap->a_cred, ap->a_p, 1); np->n_attrstamp = 0; } if (np->n_flag & NWRITEERR) { np->n_flag &= ~NWRITEERR; error = np->n_error; } } return (error); } /* * nfs getattr call from vfs. */ static int nfs_getattr(ap) struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct proc *a_p; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct nfsnode *np = VTONFS(vp); register caddr_t cp; register u_int32_t *tl; register int32_t t1, t2; caddr_t bpos, dpos; int error = 0; struct mbuf *mreq, *mrep, *md, *mb, *mb2; int v3 = NFS_ISV3(vp); /* * Update local times for special files. */ if (np->n_flag & (NACC | NUPD)) np->n_flag |= NCHG; /* * First look in the cache. */ if (nfs_getattrcache(vp, ap->a_vap) == 0) return (0); if (v3 && nfsaccess_cache_timeout > 0) { nfs3_access_otw(vp, NFSV3ACCESS_ALL, ap->a_p, ap->a_cred); if (nfs_getattrcache(vp, ap->a_vap) == 0) return (0); } nfsstats.rpccnt[NFSPROC_GETATTR]++; nfsm_reqhead(vp, NFSPROC_GETATTR, NFSX_FH(v3)); nfsm_fhtom(vp, v3); nfsm_request(vp, NFSPROC_GETATTR, ap->a_p, ap->a_cred); if (!error) { nfsm_loadattr(vp, ap->a_vap); } nfsm_reqdone; return (error); } /* * nfs setattr call. */ static int nfs_setattr(ap) struct vop_setattr_args /* { struct vnodeop_desc *a_desc; struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct proc *a_p; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct nfsnode *np = VTONFS(vp); register struct vattr *vap = ap->a_vap; int error = 0; u_quad_t tsize; #ifndef nolint tsize = (u_quad_t)0; #endif /* * Setting of flags is not supported. */ if (vap->va_flags != VNOVAL) return (EOPNOTSUPP); /* * Disallow write attempts if the filesystem is mounted read-only. 
*/ if ((vap->va_flags != VNOVAL || vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL || vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL || vap->va_mode != (mode_t)VNOVAL) && (vp->v_mount->mnt_flag & MNT_RDONLY)) return (EROFS); if (vap->va_size != VNOVAL) { switch (vp->v_type) { case VDIR: return (EISDIR); case VCHR: case VBLK: case VSOCK: case VFIFO: if (vap->va_mtime.tv_sec == VNOVAL && vap->va_atime.tv_sec == VNOVAL && vap->va_mode == (mode_t)VNOVAL && vap->va_uid == (uid_t)VNOVAL && vap->va_gid == (gid_t)VNOVAL) return (0); vap->va_size = VNOVAL; break; default: /* * Disallow write attempts if the filesystem is * mounted read-only. */ if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); vnode_pager_setsize(vp, vap->va_size); if (np->n_flag & NMODIFIED) { if (vap->va_size == 0) error = nfs_vinvalbuf(vp, 0, ap->a_cred, ap->a_p, 1); else error = nfs_vinvalbuf(vp, V_SAVE, ap->a_cred, ap->a_p, 1); if (error) { vnode_pager_setsize(vp, np->n_size); return (error); } } tsize = np->n_size; np->n_size = np->n_vattr.va_size = vap->va_size; }; } else if ((vap->va_mtime.tv_sec != VNOVAL || vap->va_atime.tv_sec != VNOVAL) && (np->n_flag & NMODIFIED) && vp->v_type == VREG && (error = nfs_vinvalbuf(vp, V_SAVE, ap->a_cred, ap->a_p, 1)) == EINTR) return (error); error = nfs_setattrrpc(vp, vap, ap->a_cred, ap->a_p); if (error && vap->va_size != VNOVAL) { np->n_size = np->n_vattr.va_size = tsize; vnode_pager_setsize(vp, np->n_size); } return (error); } /* * Do an nfs setattr rpc. */ static int nfs_setattrrpc(vp, vap, cred, procp) register struct vnode *vp; register struct vattr *vap; struct ucred *cred; struct proc *procp; { register struct nfsv2_sattr *sp; register caddr_t cp; register int32_t t1, t2; caddr_t bpos, dpos, cp2; u_int32_t *tl; int error = 0, wccflag = NFSV3_WCCRATTR; struct mbuf *mreq, *mrep, *md, *mb, *mb2; int v3 = NFS_ISV3(vp); nfsstats.rpccnt[NFSPROC_SETATTR]++; nfsm_reqhead(vp, NFSPROC_SETATTR, NFSX_FH(v3) + NFSX_SATTR(v3)); nfsm_fhtom(vp, v3); if (v3) { nfsm_v3attrbuild(vap, TRUE); nfsm_build(tl, u_int32_t *, NFSX_UNSIGNED); *tl = nfs_false; } else { nfsm_build(sp, struct nfsv2_sattr *, NFSX_V2SATTR); if (vap->va_mode == (mode_t)VNOVAL) sp->sa_mode = nfs_xdrneg1; else sp->sa_mode = vtonfsv2_mode(vp->v_type, vap->va_mode); if (vap->va_uid == (uid_t)VNOVAL) sp->sa_uid = nfs_xdrneg1; else sp->sa_uid = txdr_unsigned(vap->va_uid); if (vap->va_gid == (gid_t)VNOVAL) sp->sa_gid = nfs_xdrneg1; else sp->sa_gid = txdr_unsigned(vap->va_gid); sp->sa_size = txdr_unsigned(vap->va_size); txdr_nfsv2time(&vap->va_atime, &sp->sa_atime); txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime); } nfsm_request(vp, NFSPROC_SETATTR, procp, cred); if (v3) { nfsm_wcc_data(vp, wccflag); } else nfsm_loadattr(vp, (struct vattr *)0); nfsm_reqdone; return (error); } /* * nfs lookup call, one step at a time... 
* First look in cache * If not found, unlock the directory nfsnode and do the rpc */ static int nfs_lookup(ap) struct vop_lookup_args /* { struct vnodeop_desc *a_desc; struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ *ap; { struct componentname *cnp = ap->a_cnp; struct vnode *dvp = ap->a_dvp; struct vnode **vpp = ap->a_vpp; int flags = cnp->cn_flags; struct vnode *newvp; u_int32_t *tl; caddr_t cp; int32_t t1, t2; struct nfsmount *nmp; caddr_t bpos, dpos, cp2; struct mbuf *mreq, *mrep, *md, *mb, *mb2; long len; nfsfh_t *fhp; struct nfsnode *np; int lockparent, wantparent, error = 0, attrflag, fhsize; int v3 = NFS_ISV3(dvp); struct proc *p = cnp->cn_proc; *vpp = NULLVP; if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) return (EROFS); if (dvp->v_type != VDIR) return (ENOTDIR); lockparent = flags & LOCKPARENT; wantparent = flags & (LOCKPARENT|WANTPARENT); nmp = VFSTONFS(dvp->v_mount); np = VTONFS(dvp); if ((error = cache_lookup(dvp, vpp, cnp)) && error != ENOENT) { struct vattr vattr; int vpid; if ((error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, p)) != 0) { *vpp = NULLVP; return (error); } newvp = *vpp; vpid = newvp->v_id; /* * See the comment starting `Step through' in ufs/ufs_lookup.c * for an explanation of the locking protocol */ if (dvp == newvp) { VREF(newvp); error = 0; } else if (flags & ISDOTDOT) { VOP_UNLOCK(dvp, 0, p); error = vget(newvp, LK_EXCLUSIVE, p); if (!error && lockparent && (flags & ISLASTCN)) error = vn_lock(dvp, LK_EXCLUSIVE, p); } else { error = vget(newvp, LK_EXCLUSIVE, p); if (!lockparent || error || !(flags & ISLASTCN)) VOP_UNLOCK(dvp, 0, p); } if (!error) { if (vpid == newvp->v_id) { if (!VOP_GETATTR(newvp, &vattr, cnp->cn_cred, p) && vattr.va_ctime.tv_sec == VTONFS(newvp)->n_ctime) { nfsstats.lookupcache_hits++; if (cnp->cn_nameiop != LOOKUP && (flags & ISLASTCN)) cnp->cn_flags |= SAVENAME; return (0); } cache_purge(newvp); } vput(newvp); if (lockparent && dvp != newvp && (flags & ISLASTCN)) VOP_UNLOCK(dvp, 0, p); } error = vn_lock(dvp, LK_EXCLUSIVE, p); *vpp = NULLVP; if (error) return (error); } error = 0; newvp = NULLVP; nfsstats.lookupcache_misses++; nfsstats.rpccnt[NFSPROC_LOOKUP]++; len = cnp->cn_namelen; nfsm_reqhead(dvp, NFSPROC_LOOKUP, NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(len)); nfsm_fhtom(dvp, v3); nfsm_strtom(cnp->cn_nameptr, len, NFS_MAXNAMLEN); nfsm_request(dvp, NFSPROC_LOOKUP, cnp->cn_proc, cnp->cn_cred); if (error) { nfsm_postop_attr(dvp, attrflag); m_freem(mrep); goto nfsmout; } nfsm_getfh(fhp, fhsize, v3); /* * Handle RENAME case... 
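The cache hit path above only trusts a name-cache entry after a fresh VOP_GETATTR confirms the file's change time still matches the value recorded when the entry was made; on a mismatch the entry is purged and the lookup goes over the wire. A small sketch of that validation step (names here are illustrative, not the kernel namecache API):

#include <stdio.h>
#include <time.h>

/* a cached name->file translation (illustrative) */
struct name_entry {
	time_t	ctime_at_enter;	/* change time recorded when the entry was made */
};

static void
name_entry_enter(struct name_entry *e, time_t ctime_now)
{
	e->ctime_at_enter = ctime_now;
}

/* A hit counts only if the file's ctime is unchanged since the entry was made. */
static int
name_entry_valid(const struct name_entry *e, time_t ctime_now)
{
	return (e->ctime_at_enter == ctime_now);
}

int
main(void)
{
	struct name_entry e;

	name_entry_enter(&e, 5000);
	printf("%d\n", name_entry_valid(&e, 5000));	/* 1: use the cached vnode */
	printf("%d\n", name_entry_valid(&e, 5042));	/* 0: purge, redo the LOOKUP RPC */
	return (0);
}
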
*/ if (cnp->cn_nameiop == RENAME && wantparent && (flags & ISLASTCN)) { if (NFS_CMPFH(np, fhp, fhsize)) { m_freem(mrep); return (EISDIR); } error = nfs_nget(dvp->v_mount, fhp, fhsize, &np); if (error) { m_freem(mrep); return (error); } newvp = NFSTOV(np); if (v3) { nfsm_postop_attr(newvp, attrflag); nfsm_postop_attr(dvp, attrflag); } else nfsm_loadattr(newvp, (struct vattr *)0); *vpp = newvp; m_freem(mrep); cnp->cn_flags |= SAVENAME; if (!lockparent) VOP_UNLOCK(dvp, 0, p); return (0); } if (flags & ISDOTDOT) { VOP_UNLOCK(dvp, 0, p); error = nfs_nget(dvp->v_mount, fhp, fhsize, &np); if (error) { vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, p); return (error); } newvp = NFSTOV(np); if (lockparent && (flags & ISLASTCN) && (error = vn_lock(dvp, LK_EXCLUSIVE, p))) { vput(newvp); return (error); } } else if (NFS_CMPFH(np, fhp, fhsize)) { VREF(dvp); newvp = dvp; } else { error = nfs_nget(dvp->v_mount, fhp, fhsize, &np); if (error) { m_freem(mrep); return (error); } if (!lockparent || !(flags & ISLASTCN)) VOP_UNLOCK(dvp, 0, p); newvp = NFSTOV(np); } if (v3) { nfsm_postop_attr(newvp, attrflag); nfsm_postop_attr(dvp, attrflag); } else nfsm_loadattr(newvp, (struct vattr *)0); if (cnp->cn_nameiop != LOOKUP && (flags & ISLASTCN)) cnp->cn_flags |= SAVENAME; if ((cnp->cn_flags & MAKEENTRY) && (cnp->cn_nameiop != DELETE || !(flags & ISLASTCN))) { np->n_ctime = np->n_vattr.va_ctime.tv_sec; cache_enter(dvp, newvp, cnp); } *vpp = newvp; nfsm_reqdone; if (error) { if (newvp != NULLVP) { vrele(newvp); *vpp = NULLVP; } if ((cnp->cn_nameiop == CREATE || cnp->cn_nameiop == RENAME) && (flags & ISLASTCN) && error == ENOENT) { if (!lockparent) VOP_UNLOCK(dvp, 0, p); if (dvp->v_mount->mnt_flag & MNT_RDONLY) error = EROFS; else error = EJUSTRETURN; } if (cnp->cn_nameiop != LOOKUP && (flags & ISLASTCN)) cnp->cn_flags |= SAVENAME; } return (error); } /* * nfs read call. * Just call nfs_bioread() to do the work. */ static int nfs_read(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { register struct vnode *vp = ap->a_vp; if (vp->v_type != VREG) return (EPERM); return (nfs_bioread(vp, ap->a_uio, ap->a_ioflag, ap->a_cred)); } /* * nfs readlink call */ static int nfs_readlink(ap) struct vop_readlink_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; } */ *ap; { register struct vnode *vp = ap->a_vp; if (vp->v_type != VLNK) return (EINVAL); return (nfs_bioread(vp, ap->a_uio, 0, ap->a_cred)); } /* * Do a readlink rpc. * Called by nfs_doio() from below the buffer cache. 
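nfs_readrpc() below issues a sequence of READ RPCs, each at most the mount's rsize, and stops when the residual count is consumed, the server reports EOF (v3), or a short reply comes back (v2). The chunking logic, reduced to a userland sketch with a fake transfer function (all names here are made up for illustration):

#include <stdio.h>

#define RSIZE	8192	/* stand-in for the mount's nm_rsize */

/*
 * Fake "READ RPC": pretend the file is file_size bytes long and return how
 * many bytes the server sent back, setting *eof when the end was reached.
 */
static int
fake_read_rpc(long offset, int len, long file_size, int *eof)
{
	long left = file_size - offset;
	int got = left <= 0 ? 0 : (left < len ? (int)left : len);

	*eof = (offset + got >= file_size);
	return (got);
}

/* Returns total bytes transferred for a request of 'resid' bytes at 'offset'. */
static long
chunked_read(long offset, long resid, long file_size)
{
	long total = 0;
	int eof = 0;

	while (resid > 0) {
		int len = resid > RSIZE ? RSIZE : (int)resid;
		int got = fake_read_rpc(offset, len, file_size, &eof);

		total += got;
		offset += got;
		resid -= got;
		if (eof || got == 0)	/* v3 eof flag, or nothing more came back */
			break;
	}
	return (total);
}

int
main(void)
{
	/* ask for 20000 bytes of a 12000-byte file: two RPCs, 12000 back */
	printf("%ld\n", chunked_read(0, 20000, 12000));
	return (0);
}
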
*/ int nfs_readlinkrpc(vp, uiop, cred) register struct vnode *vp; struct uio *uiop; struct ucred *cred; { register u_int32_t *tl; register caddr_t cp; register int32_t t1, t2; caddr_t bpos, dpos, cp2; int error = 0, len, attrflag; struct mbuf *mreq, *mrep, *md, *mb, *mb2; int v3 = NFS_ISV3(vp); nfsstats.rpccnt[NFSPROC_READLINK]++; nfsm_reqhead(vp, NFSPROC_READLINK, NFSX_FH(v3)); nfsm_fhtom(vp, v3); nfsm_request(vp, NFSPROC_READLINK, uiop->uio_procp, cred); if (v3) nfsm_postop_attr(vp, attrflag); if (!error) { nfsm_strsiz(len, NFS_MAXPATHLEN); if (len == NFS_MAXPATHLEN) { struct nfsnode *np = VTONFS(vp); if (np->n_size && np->n_size < NFS_MAXPATHLEN) len = np->n_size; } nfsm_mtouio(uiop, len); } nfsm_reqdone; return (error); } /* * nfs read rpc call * Ditto above */ int nfs_readrpc(vp, uiop, cred) register struct vnode *vp; struct uio *uiop; struct ucred *cred; { register u_int32_t *tl; register caddr_t cp; register int32_t t1, t2; caddr_t bpos, dpos, cp2; struct mbuf *mreq, *mrep, *md, *mb, *mb2; struct nfsmount *nmp; int error = 0, len, retlen, tsiz, eof, attrflag; int v3 = NFS_ISV3(vp); #ifndef nolint eof = 0; #endif nmp = VFSTONFS(vp->v_mount); tsiz = uiop->uio_resid; if (uiop->uio_offset + tsiz > nmp->nm_maxfilesize) return (EFBIG); while (tsiz > 0) { nfsstats.rpccnt[NFSPROC_READ]++; len = (tsiz > nmp->nm_rsize) ? nmp->nm_rsize : tsiz; nfsm_reqhead(vp, NFSPROC_READ, NFSX_FH(v3) + NFSX_UNSIGNED * 3); nfsm_fhtom(vp, v3); nfsm_build(tl, u_int32_t *, NFSX_UNSIGNED * 3); if (v3) { txdr_hyper(uiop->uio_offset, tl); *(tl + 2) = txdr_unsigned(len); } else { *tl++ = txdr_unsigned(uiop->uio_offset); *tl++ = txdr_unsigned(len); *tl = 0; } nfsm_request(vp, NFSPROC_READ, uiop->uio_procp, cred); if (v3) { nfsm_postop_attr(vp, attrflag); if (error) { m_freem(mrep); goto nfsmout; } nfsm_dissect(tl, u_int32_t *, 2 * NFSX_UNSIGNED); eof = fxdr_unsigned(int, *(tl + 1)); } else nfsm_loadattr(vp, (struct vattr *)0); nfsm_strsiz(retlen, nmp->nm_rsize); nfsm_mtouio(uiop, retlen); m_freem(mrep); tsiz -= retlen; if (v3) { if (eof || retlen == 0) tsiz = 0; } else if (retlen < len) tsiz = 0; } nfsmout: return (error); } /* * nfs write call */ int nfs_writerpc(vp, uiop, cred, iomode, must_commit) register struct vnode *vp; register struct uio *uiop; struct ucred *cred; int *iomode, *must_commit; { register u_int32_t *tl; register caddr_t cp; register int32_t t1, t2, backup; caddr_t bpos, dpos, cp2; struct mbuf *mreq, *mrep, *md, *mb, *mb2; struct nfsmount *nmp = VFSTONFS(vp->v_mount); int error = 0, len, tsiz, wccflag = NFSV3_WCCRATTR, rlen, commit; int v3 = NFS_ISV3(vp), committed = NFSV3WRITE_FILESYNC; #ifndef DIAGNOSTIC if (uiop->uio_iovcnt != 1) panic("nfs: writerpc iovcnt > 1"); #endif *must_commit = 0; tsiz = uiop->uio_resid; if (uiop->uio_offset + tsiz > nmp->nm_maxfilesize) return (EFBIG); while (tsiz > 0) { nfsstats.rpccnt[NFSPROC_WRITE]++; len = (tsiz > nmp->nm_wsize) ? nmp->nm_wsize : tsiz; nfsm_reqhead(vp, NFSPROC_WRITE, NFSX_FH(v3) + 5 * NFSX_UNSIGNED + nfsm_rndup(len)); nfsm_fhtom(vp, v3); if (v3) { nfsm_build(tl, u_int32_t *, 5 * NFSX_UNSIGNED); txdr_hyper(uiop->uio_offset, tl); tl += 2; *tl++ = txdr_unsigned(len); *tl++ = txdr_unsigned(*iomode); *tl = txdr_unsigned(len); } else { register u_int32_t x; nfsm_build(tl, u_int32_t *, 4 * NFSX_UNSIGNED); /* Set both "begin" and "current" to non-garbage. 
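In the v3 branch of nfs_writerpc() below, the client remembers the weakest commitment level any individual WRITE reply reported and, when the server's write verifier changes (typically after a server reboot), flags that all uncommitted buffers must be rewritten. A compact userland sketch of that bookkeeping; the enum values mirror the weakest-to-strongest ordering of UNSTABLE, DATASYNC and FILESYNC, and the names are illustrative:

#include <stdio.h>
#include <string.h>

/* ordered weakest to strongest */
enum commit_level { WR_UNSTABLE = 0, WR_DATASYNC = 1, WR_FILESYNC = 2 };

#define VERF_LEN 8

struct write_state {
	enum commit_level committed;	/* weakest level seen so far */
	int	have_verf;
	unsigned char verf[VERF_LEN];	/* server's write verifier */
	int	must_commit;		/* set when the verifier changed */
};

/* Fold one WRITE reply into the running state. */
static void
note_write_reply(struct write_state *ws, enum commit_level this_commit,
    const unsigned char *this_verf)
{
	if (this_commit < ws->committed)	/* keep the weakest guarantee */
		ws->committed = this_commit;
	if (!ws->have_verf) {
		memcpy(ws->verf, this_verf, VERF_LEN);
		ws->have_verf = 1;
	} else if (memcmp(ws->verf, this_verf, VERF_LEN) != 0) {
		/* server rebooted: everything written unstably must be redone */
		ws->must_commit = 1;
		memcpy(ws->verf, this_verf, VERF_LEN);
	}
}

int
main(void)
{
	unsigned char v1[VERF_LEN] = "AAAAAAA", v2[VERF_LEN] = "BBBBBBB";
	struct write_state ws = { WR_FILESYNC, 0, { 0 }, 0 };

	note_write_reply(&ws, WR_UNSTABLE, v1);
	note_write_reply(&ws, WR_DATASYNC, v2);	/* verifier changed */
	printf("committed=%d must_commit=%d\n", ws.committed, ws.must_commit);
	return (0);
}
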
*/ x = txdr_unsigned((u_int32_t)uiop->uio_offset); *tl++ = x; /* "begin offset" */ *tl++ = x; /* "current offset" */ x = txdr_unsigned(len); *tl++ = x; /* total to this offset */ *tl = x; /* size of this write */ } nfsm_uiotom(uiop, len); nfsm_request(vp, NFSPROC_WRITE, uiop->uio_procp, cred); if (v3) { wccflag = NFSV3_WCCCHK; nfsm_wcc_data(vp, wccflag); if (!error) { nfsm_dissect(tl, u_int32_t *, 2 * NFSX_UNSIGNED + NFSX_V3WRITEVERF); rlen = fxdr_unsigned(int, *tl++); if (rlen == 0) { error = NFSERR_IO; m_freem(mrep); break; } else if (rlen < len) { backup = len - rlen; uiop->uio_iov->iov_base -= backup; uiop->uio_iov->iov_len += backup; uiop->uio_offset -= backup; uiop->uio_resid += backup; len = rlen; } commit = fxdr_unsigned(int, *tl++); /* * Return the lowest committment level * obtained by any of the RPCs. */ if (committed == NFSV3WRITE_FILESYNC) committed = commit; else if (committed == NFSV3WRITE_DATASYNC && commit == NFSV3WRITE_UNSTABLE) committed = commit; if ((nmp->nm_state & NFSSTA_HASWRITEVERF) == 0){ bcopy((caddr_t)tl, (caddr_t)nmp->nm_verf, NFSX_V3WRITEVERF); nmp->nm_state |= NFSSTA_HASWRITEVERF; } else if (bcmp((caddr_t)tl, (caddr_t)nmp->nm_verf, NFSX_V3WRITEVERF)) { *must_commit = 1; bcopy((caddr_t)tl, (caddr_t)nmp->nm_verf, NFSX_V3WRITEVERF); } } } else nfsm_loadattr(vp, (struct vattr *)0); if (wccflag) VTONFS(vp)->n_mtime = VTONFS(vp)->n_vattr.va_mtime.tv_sec; m_freem(mrep); if (error) break; tsiz -= len; } nfsmout: if (vp->v_mount->mnt_flag & MNT_ASYNC) committed = NFSV3WRITE_FILESYNC; *iomode = committed; if (error) uiop->uio_resid = tsiz; return (error); } /* * nfs mknod rpc * For NFS v2 this is a kludge. Use a create rpc but with the IFMT bits of the * mode set to specify the file type and the size field for rdev. */ static int nfs_mknodrpc(dvp, vpp, cnp, vap) register struct vnode *dvp; register struct vnode **vpp; register struct componentname *cnp; register struct vattr *vap; { register struct nfsv2_sattr *sp; register u_int32_t *tl; register caddr_t cp; register int32_t t1, t2; struct vnode *newvp = (struct vnode *)0; struct nfsnode *np = (struct nfsnode *)0; struct vattr vattr; char *cp2; caddr_t bpos, dpos; int error = 0, wccflag = NFSV3_WCCRATTR, gotvp = 0; struct mbuf *mreq, *mrep, *md, *mb, *mb2; u_int32_t rdev; int v3 = NFS_ISV3(dvp); if (vap->va_type == VCHR || vap->va_type == VBLK) rdev = txdr_unsigned(vap->va_rdev); else if (vap->va_type == VFIFO || vap->va_type == VSOCK) rdev = nfs_xdrneg1; else { VOP_ABORTOP(dvp, cnp); return (EOPNOTSUPP); } if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred, cnp->cn_proc)) != 0) { VOP_ABORTOP(dvp, cnp); return (error); } nfsstats.rpccnt[NFSPROC_MKNOD]++; nfsm_reqhead(dvp, NFSPROC_MKNOD, NFSX_FH(v3) + 4 * NFSX_UNSIGNED + + nfsm_rndup(cnp->cn_namelen) + NFSX_SATTR(v3)); nfsm_fhtom(dvp, v3); nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN); if (v3) { nfsm_build(tl, u_int32_t *, NFSX_UNSIGNED); *tl++ = vtonfsv3_type(vap->va_type); nfsm_v3attrbuild(vap, FALSE); if (vap->va_type == VCHR || vap->va_type == VBLK) { nfsm_build(tl, u_int32_t *, 2 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(umajor(vap->va_rdev)); *tl = txdr_unsigned(uminor(vap->va_rdev)); } } else { nfsm_build(sp, struct nfsv2_sattr *, NFSX_V2SATTR); sp->sa_mode = vtonfsv2_mode(vap->va_type, vap->va_mode); sp->sa_uid = nfs_xdrneg1; sp->sa_gid = nfs_xdrneg1; sp->sa_size = rdev; txdr_nfsv2time(&vap->va_atime, &sp->sa_atime); txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime); } nfsm_request(dvp, NFSPROC_MKNOD, cnp->cn_proc, cnp->cn_cred); if (!error) { 
nfsm_mtofh(dvp, newvp, v3, gotvp); if (!gotvp) { if (newvp) { vput(newvp); newvp = (struct vnode *)0; } error = nfs_lookitup(dvp, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_cred, cnp->cn_proc, &np); if (!error) newvp = NFSTOV(np); } } if (v3) nfsm_wcc_data(dvp, wccflag); nfsm_reqdone; if (error) { if (newvp) vput(newvp); } else { if (cnp->cn_flags & MAKEENTRY) cache_enter(dvp, newvp, cnp); *vpp = newvp; } zfree(namei_zone, cnp->cn_pnbuf); VTONFS(dvp)->n_flag |= NMODIFIED; if (!wccflag) VTONFS(dvp)->n_attrstamp = 0; return (error); } /* * nfs mknod vop * just call nfs_mknodrpc() to do the work. */ /* ARGSUSED */ static int nfs_mknod(ap) struct vop_mknod_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { struct vnode *newvp; int error; error = nfs_mknodrpc(ap->a_dvp, &newvp, ap->a_cnp, ap->a_vap); if (!error) vput(newvp); return (error); } static u_long create_verf; /* * nfs file create call */ static int nfs_create(ap) struct vop_create_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { register struct vnode *dvp = ap->a_dvp; register struct vattr *vap = ap->a_vap; register struct componentname *cnp = ap->a_cnp; register struct nfsv2_sattr *sp; register u_int32_t *tl; register caddr_t cp; register int32_t t1, t2; struct nfsnode *np = (struct nfsnode *)0; struct vnode *newvp = (struct vnode *)0; caddr_t bpos, dpos, cp2; int error = 0, wccflag = NFSV3_WCCRATTR, gotvp = 0, fmode = 0; struct mbuf *mreq, *mrep, *md, *mb, *mb2; struct vattr vattr; int v3 = NFS_ISV3(dvp); /* * Oops, not for me.. */ if (vap->va_type == VSOCK) return (nfs_mknodrpc(dvp, ap->a_vpp, cnp, vap)); if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred, cnp->cn_proc)) != 0) { VOP_ABORTOP(dvp, cnp); return (error); } if (vap->va_vaflags & VA_EXCLUSIVE) fmode |= O_EXCL; again: nfsstats.rpccnt[NFSPROC_CREATE]++; nfsm_reqhead(dvp, NFSPROC_CREATE, NFSX_FH(v3) + 2 * NFSX_UNSIGNED + nfsm_rndup(cnp->cn_namelen) + NFSX_SATTR(v3)); nfsm_fhtom(dvp, v3); nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN); if (v3) { nfsm_build(tl, u_int32_t *, NFSX_UNSIGNED); if (fmode & O_EXCL) { *tl = txdr_unsigned(NFSV3CREATE_EXCLUSIVE); nfsm_build(tl, u_int32_t *, NFSX_V3CREATEVERF); #ifdef INET if (!TAILQ_EMPTY(&in_ifaddrhead)) *tl++ = IA_SIN(in_ifaddrhead.tqh_first)->sin_addr.s_addr; else #endif *tl++ = create_verf; *tl = ++create_verf; } else { *tl = txdr_unsigned(NFSV3CREATE_UNCHECKED); nfsm_v3attrbuild(vap, FALSE); } } else { nfsm_build(sp, struct nfsv2_sattr *, NFSX_V2SATTR); sp->sa_mode = vtonfsv2_mode(vap->va_type, vap->va_mode); sp->sa_uid = nfs_xdrneg1; sp->sa_gid = nfs_xdrneg1; sp->sa_size = 0; txdr_nfsv2time(&vap->va_atime, &sp->sa_atime); txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime); } nfsm_request(dvp, NFSPROC_CREATE, cnp->cn_proc, cnp->cn_cred); if (!error) { nfsm_mtofh(dvp, newvp, v3, gotvp); if (!gotvp) { if (newvp) { vput(newvp); newvp = (struct vnode *)0; } error = nfs_lookitup(dvp, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_cred, cnp->cn_proc, &np); if (!error) newvp = NFSTOV(np); } } if (v3) nfsm_wcc_data(dvp, wccflag); nfsm_reqdone; if (error) { if (v3 && (fmode & O_EXCL) && error == NFSERR_NOTSUPP) { fmode &= ~O_EXCL; goto again; } if (newvp) vput(newvp); } else if (v3 && (fmode & O_EXCL)) error = nfs_setattrrpc(newvp, vap, cnp->cn_cred, cnp->cn_proc); if (!error) { if (cnp->cn_flags & MAKEENTRY) cache_enter(dvp, newvp, cnp); *ap->a_vpp = newvp; } if (error || (cnp->cn_flags & SAVESTART) == 0) 
zfree(namei_zone, cnp->cn_pnbuf); VTONFS(dvp)->n_flag |= NMODIFIED; if (!wccflag) VTONFS(dvp)->n_attrstamp = 0; return (error); } /* * nfs file remove call * To try and make nfs semantics closer to ufs semantics, a file that has * other processes using the vnode is renamed instead of removed and then * removed later on the last close. * - If v_usecount > 1 * If a rename is not already in the works * call nfs_sillyrename() to set it up * else * do the remove rpc */ static int nfs_remove(ap) struct vop_remove_args /* { struct vnodeop_desc *a_desc; struct vnode * a_dvp; struct vnode * a_vp; struct componentname * a_cnp; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct vnode *dvp = ap->a_dvp; register struct componentname *cnp = ap->a_cnp; register struct nfsnode *np = VTONFS(vp); int error = 0; struct vattr vattr; #ifndef DIAGNOSTIC if ((cnp->cn_flags & HASBUF) == 0) panic("nfs_remove: no name"); if (vp->v_usecount < 1) panic("nfs_remove: bad v_usecount"); #endif if (vp->v_type == VDIR) error = EPERM; else if (vp->v_usecount == 1 || (np->n_sillyrename && VOP_GETATTR(vp, &vattr, cnp->cn_cred, cnp->cn_proc) == 0 && vattr.va_nlink > 1)) { /* * Purge the name cache so that the chance of a lookup for * the name succeeding while the remove is in progress is * minimized. Without node locking it can still happen, such * that an I/O op returns ESTALE, but since you get this if * another host removes the file.. */ cache_purge(vp); /* * throw away biocache buffers, mainly to avoid * unnecessary delayed writes later. */ error = nfs_vinvalbuf(vp, 0, cnp->cn_cred, cnp->cn_proc, 1); /* Do the rpc */ if (error != EINTR) error = nfs_removerpc(dvp, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_cred, cnp->cn_proc); /* * Kludge City: If the first reply to the remove rpc is lost.. * the reply to the retransmitted request will be ENOENT * since the file was in fact removed * Therefore, we cheat and return success. */ if (error == ENOENT) error = 0; } else if (!np->n_sillyrename) error = nfs_sillyrename(dvp, vp, cnp); zfree(namei_zone, cnp->cn_pnbuf); np->n_attrstamp = 0; return (error); } /* * nfs file remove rpc called from nfs_inactive */ int nfs_removeit(sp) register struct sillyrename *sp; { return (nfs_removerpc(sp->s_dvp, sp->s_name, sp->s_namlen, sp->s_cred, (struct proc *)0)); } /* * Nfs remove rpc, called from nfs_remove() and nfs_removeit(). 
*/ static int nfs_removerpc(dvp, name, namelen, cred, proc) register struct vnode *dvp; const char *name; int namelen; struct ucred *cred; struct proc *proc; { register u_int32_t *tl; register caddr_t cp; register int32_t t1, t2; caddr_t bpos, dpos, cp2; int error = 0, wccflag = NFSV3_WCCRATTR; struct mbuf *mreq, *mrep, *md, *mb, *mb2; int v3 = NFS_ISV3(dvp); nfsstats.rpccnt[NFSPROC_REMOVE]++; nfsm_reqhead(dvp, NFSPROC_REMOVE, NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(namelen)); nfsm_fhtom(dvp, v3); nfsm_strtom(name, namelen, NFS_MAXNAMLEN); nfsm_request(dvp, NFSPROC_REMOVE, proc, cred); if (v3) nfsm_wcc_data(dvp, wccflag); nfsm_reqdone; VTONFS(dvp)->n_flag |= NMODIFIED; if (!wccflag) VTONFS(dvp)->n_attrstamp = 0; return (error); } /* * nfs file rename call */ static int nfs_rename(ap) struct vop_rename_args /* { struct vnode *a_fdvp; struct vnode *a_fvp; struct componentname *a_fcnp; struct vnode *a_tdvp; struct vnode *a_tvp; struct componentname *a_tcnp; } */ *ap; { register struct vnode *fvp = ap->a_fvp; register struct vnode *tvp = ap->a_tvp; register struct vnode *fdvp = ap->a_fdvp; register struct vnode *tdvp = ap->a_tdvp; register struct componentname *tcnp = ap->a_tcnp; register struct componentname *fcnp = ap->a_fcnp; int error; #ifndef DIAGNOSTIC if ((tcnp->cn_flags & HASBUF) == 0 || (fcnp->cn_flags & HASBUF) == 0) panic("nfs_rename: no name"); #endif /* Check for cross-device rename */ if ((fvp->v_mount != tdvp->v_mount) || (tvp && (fvp->v_mount != tvp->v_mount))) { error = EXDEV; goto out; } /* * We have to flush B_DELWRI data prior to renaming * the file. If we don't, the delayed-write buffers * can be flushed out later after the file has gone stale * under NFSV3. NFSV2 does not have this problem because * ( as far as I can tell ) it flushes dirty buffers more * often. */ VOP_FSYNC(fvp, fcnp->cn_cred, MNT_WAIT, fcnp->cn_proc); if (tvp) VOP_FSYNC(tvp, tcnp->cn_cred, MNT_WAIT, tcnp->cn_proc); /* * If the tvp exists and is in use, sillyrename it before doing the * rename of the new file over it. * XXX Can't sillyrename a directory. */ if (tvp && tvp->v_usecount > 1 && !VTONFS(tvp)->n_sillyrename && tvp->v_type != VDIR && !nfs_sillyrename(tdvp, tvp, tcnp)) { vput(tvp); tvp = NULL; } error = nfs_renamerpc(fdvp, fcnp->cn_nameptr, fcnp->cn_namelen, tdvp, tcnp->cn_nameptr, tcnp->cn_namelen, tcnp->cn_cred, tcnp->cn_proc); if (fvp->v_type == VDIR) { if (tvp != NULL && tvp->v_type == VDIR) cache_purge(tdvp); cache_purge(fdvp); } out: VOP_ABORTOP(tdvp, tcnp); if (tdvp == tvp) vrele(tdvp); else vput(tdvp); if (tvp) vput(tvp); VOP_ABORTOP(fdvp, fcnp); vrele(fdvp); vrele(fvp); /* * Kludge: Map ENOENT => 0 assuming that it is a reply to a retry. */ if (error == ENOENT) error = 0; return (error); } /* * nfs file rename rpc called from nfs_remove() above */ static int nfs_renameit(sdvp, scnp, sp) struct vnode *sdvp; struct componentname *scnp; register struct sillyrename *sp; { return (nfs_renamerpc(sdvp, scnp->cn_nameptr, scnp->cn_namelen, sdvp, sp->s_name, sp->s_namlen, scnp->cn_cred, scnp->cn_proc)); } /* * Do an nfs rename rpc. Called from nfs_rename() and nfs_renameit(). 
*/ static int nfs_renamerpc(fdvp, fnameptr, fnamelen, tdvp, tnameptr, tnamelen, cred, proc) register struct vnode *fdvp; const char *fnameptr; int fnamelen; register struct vnode *tdvp; const char *tnameptr; int tnamelen; struct ucred *cred; struct proc *proc; { register u_int32_t *tl; register caddr_t cp; register int32_t t1, t2; caddr_t bpos, dpos, cp2; int error = 0, fwccflag = NFSV3_WCCRATTR, twccflag = NFSV3_WCCRATTR; struct mbuf *mreq, *mrep, *md, *mb, *mb2; int v3 = NFS_ISV3(fdvp); nfsstats.rpccnt[NFSPROC_RENAME]++; nfsm_reqhead(fdvp, NFSPROC_RENAME, (NFSX_FH(v3) + NFSX_UNSIGNED)*2 + nfsm_rndup(fnamelen) + nfsm_rndup(tnamelen)); nfsm_fhtom(fdvp, v3); nfsm_strtom(fnameptr, fnamelen, NFS_MAXNAMLEN); nfsm_fhtom(tdvp, v3); nfsm_strtom(tnameptr, tnamelen, NFS_MAXNAMLEN); nfsm_request(fdvp, NFSPROC_RENAME, proc, cred); if (v3) { nfsm_wcc_data(fdvp, fwccflag); nfsm_wcc_data(tdvp, twccflag); } nfsm_reqdone; VTONFS(fdvp)->n_flag |= NMODIFIED; VTONFS(tdvp)->n_flag |= NMODIFIED; if (!fwccflag) VTONFS(fdvp)->n_attrstamp = 0; if (!twccflag) VTONFS(tdvp)->n_attrstamp = 0; return (error); } /* * nfs hard link create call */ static int nfs_link(ap) struct vop_link_args /* { struct vnode *a_tdvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct vnode *tdvp = ap->a_tdvp; register struct componentname *cnp = ap->a_cnp; register u_int32_t *tl; register caddr_t cp; register int32_t t1, t2; caddr_t bpos, dpos, cp2; int error = 0, wccflag = NFSV3_WCCRATTR, attrflag = 0; struct mbuf *mreq, *mrep, *md, *mb, *mb2; int v3; if (vp->v_mount != tdvp->v_mount) { VOP_ABORTOP(tdvp, cnp); return (EXDEV); } /* * Push all writes to the server, so that the attribute cache * doesn't get "out of sync" with the server. * XXX There should be a better way! */ VOP_FSYNC(vp, cnp->cn_cred, MNT_WAIT, cnp->cn_proc); v3 = NFS_ISV3(vp); nfsstats.rpccnt[NFSPROC_LINK]++; nfsm_reqhead(vp, NFSPROC_LINK, NFSX_FH(v3)*2 + NFSX_UNSIGNED + nfsm_rndup(cnp->cn_namelen)); nfsm_fhtom(vp, v3); nfsm_fhtom(tdvp, v3); nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN); nfsm_request(vp, NFSPROC_LINK, cnp->cn_proc, cnp->cn_cred); if (v3) { nfsm_postop_attr(vp, attrflag); nfsm_wcc_data(tdvp, wccflag); } nfsm_reqdone; zfree(namei_zone, cnp->cn_pnbuf); VTONFS(tdvp)->n_flag |= NMODIFIED; if (!attrflag) VTONFS(vp)->n_attrstamp = 0; if (!wccflag) VTONFS(tdvp)->n_attrstamp = 0; /* * Kludge: Map EEXIST => 0 assuming that it is a reply to a retry. 
*/ if (error == EEXIST) error = 0; return (error); } /* * nfs symbolic link create call */ static int nfs_symlink(ap) struct vop_symlink_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; char *a_target; } */ *ap; { register struct vnode *dvp = ap->a_dvp; register struct vattr *vap = ap->a_vap; register struct componentname *cnp = ap->a_cnp; register struct nfsv2_sattr *sp; register u_int32_t *tl; register caddr_t cp; register int32_t t1, t2; caddr_t bpos, dpos, cp2; int slen, error = 0, wccflag = NFSV3_WCCRATTR, gotvp; struct mbuf *mreq, *mrep, *md, *mb, *mb2; struct vnode *newvp = (struct vnode *)0; int v3 = NFS_ISV3(dvp); nfsstats.rpccnt[NFSPROC_SYMLINK]++; slen = strlen(ap->a_target); nfsm_reqhead(dvp, NFSPROC_SYMLINK, NFSX_FH(v3) + 2*NFSX_UNSIGNED + nfsm_rndup(cnp->cn_namelen) + nfsm_rndup(slen) + NFSX_SATTR(v3)); nfsm_fhtom(dvp, v3); nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN); if (v3) { nfsm_v3attrbuild(vap, FALSE); } nfsm_strtom(ap->a_target, slen, NFS_MAXPATHLEN); if (!v3) { nfsm_build(sp, struct nfsv2_sattr *, NFSX_V2SATTR); sp->sa_mode = vtonfsv2_mode(VLNK, vap->va_mode); sp->sa_uid = nfs_xdrneg1; sp->sa_gid = nfs_xdrneg1; sp->sa_size = nfs_xdrneg1; txdr_nfsv2time(&vap->va_atime, &sp->sa_atime); txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime); } nfsm_request(dvp, NFSPROC_SYMLINK, cnp->cn_proc, cnp->cn_cred); if (v3) { if (!error) nfsm_mtofh(dvp, newvp, v3, gotvp); nfsm_wcc_data(dvp, wccflag); } nfsm_reqdone; if (newvp) vput(newvp); VTONFS(dvp)->n_flag |= NMODIFIED; if (!wccflag) VTONFS(dvp)->n_attrstamp = 0; /* * Kludge: Map EEXIST => 0 assuming that it is a reply to a retry. */ if (error == EEXIST) error = 0; /* * cnp's buffer expected to be freed if SAVESTART not set or * if an error was returned. 
*/ if (error || (cnp->cn_flags & SAVESTART) == 0) zfree(namei_zone, cnp->cn_pnbuf); return (error); } /* * nfs make dir call */ static int nfs_mkdir(ap) struct vop_mkdir_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { register struct vnode *dvp = ap->a_dvp; register struct vattr *vap = ap->a_vap; register struct componentname *cnp = ap->a_cnp; register struct nfsv2_sattr *sp; register u_int32_t *tl; register caddr_t cp; register int32_t t1, t2; register int len; struct nfsnode *np = (struct nfsnode *)0; struct vnode *newvp = (struct vnode *)0; caddr_t bpos, dpos, cp2; int error = 0, wccflag = NFSV3_WCCRATTR; int gotvp = 0; struct mbuf *mreq, *mrep, *md, *mb, *mb2; struct vattr vattr; int v3 = NFS_ISV3(dvp); if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred, cnp->cn_proc)) != 0) { VOP_ABORTOP(dvp, cnp); return (error); } len = cnp->cn_namelen; nfsstats.rpccnt[NFSPROC_MKDIR]++; nfsm_reqhead(dvp, NFSPROC_MKDIR, NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(len) + NFSX_SATTR(v3)); nfsm_fhtom(dvp, v3); nfsm_strtom(cnp->cn_nameptr, len, NFS_MAXNAMLEN); if (v3) { nfsm_v3attrbuild(vap, FALSE); } else { nfsm_build(sp, struct nfsv2_sattr *, NFSX_V2SATTR); sp->sa_mode = vtonfsv2_mode(VDIR, vap->va_mode); sp->sa_uid = nfs_xdrneg1; sp->sa_gid = nfs_xdrneg1; sp->sa_size = nfs_xdrneg1; txdr_nfsv2time(&vap->va_atime, &sp->sa_atime); txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime); } nfsm_request(dvp, NFSPROC_MKDIR, cnp->cn_proc, cnp->cn_cred); if (!error) nfsm_mtofh(dvp, newvp, v3, gotvp); if (v3) nfsm_wcc_data(dvp, wccflag); nfsm_reqdone; VTONFS(dvp)->n_flag |= NMODIFIED; if (!wccflag) VTONFS(dvp)->n_attrstamp = 0; /* * Kludge: Map EEXIST => 0 assuming that you have a reply to a retry * if we can succeed in looking up the directory. */ if (error == EEXIST || (!error && !gotvp)) { if (newvp) { vrele(newvp); newvp = (struct vnode *)0; } error = nfs_lookitup(dvp, cnp->cn_nameptr, len, cnp->cn_cred, cnp->cn_proc, &np); if (!error) { newvp = NFSTOV(np); if (newvp->v_type != VDIR) error = EEXIST; } } if (error) { if (newvp) vrele(newvp); } else *ap->a_vpp = newvp; if (error || (cnp->cn_flags & SAVESTART) == 0) zfree(namei_zone, cnp->cn_pnbuf); return (error); } /* * nfs remove directory call */ static int nfs_rmdir(ap) struct vop_rmdir_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct vnode *dvp = ap->a_dvp; register struct componentname *cnp = ap->a_cnp; register u_int32_t *tl; register caddr_t cp; register int32_t t1, t2; caddr_t bpos, dpos, cp2; int error = 0, wccflag = NFSV3_WCCRATTR; struct mbuf *mreq, *mrep, *md, *mb, *mb2; int v3 = NFS_ISV3(dvp); if (dvp == vp) return (EINVAL); nfsstats.rpccnt[NFSPROC_RMDIR]++; nfsm_reqhead(dvp, NFSPROC_RMDIR, NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(cnp->cn_namelen)); nfsm_fhtom(dvp, v3); nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN); nfsm_request(dvp, NFSPROC_RMDIR, cnp->cn_proc, cnp->cn_cred); if (v3) nfsm_wcc_data(dvp, wccflag); nfsm_reqdone; zfree(namei_zone, cnp->cn_pnbuf); VTONFS(dvp)->n_flag |= NMODIFIED; if (!wccflag) VTONFS(dvp)->n_attrstamp = 0; cache_purge(dvp); cache_purge(vp); /* * Kludge: Map ENOENT => 0 assuming that you have a reply to a retry. 
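Several routines above and below map ENOENT (for REMOVE, RMDIR and RENAME) or EEXIST (for LINK, SYMLINK and MKDIR) to success, on the assumption that the first reply was lost and the error is just the retransmitted request observing that the earlier attempt already took effect. The surrounding comments call this a kludge because the client cannot distinguish that case from a genuine failure. The pattern, factored into a tiny helper purely for illustration (this helper does not exist in the kernel):

#include <errno.h>
#include <stdio.h>

/*
 * Map the error from a possibly-retransmitted, non-idempotent RPC.
 * 'benign' is the error a duplicate of an already-successful request
 * would produce: ENOENT for remove-style ops, EEXIST for create-style ops.
 */
static int
map_retry_error(int error, int benign)
{
	return (error == benign ? 0 : error);
}

int
main(void)
{
	printf("%d\n", map_retry_error(ENOENT, ENOENT));	/* 0: treat as success */
	printf("%d\n", map_retry_error(EACCES, ENOENT));	/* EACCES: real failure */
	return (0);
}
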
*/ if (error == ENOENT) error = 0; return (error); } /* * nfs readdir call */ static int nfs_readdir(ap) struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct nfsnode *np = VTONFS(vp); register struct uio *uio = ap->a_uio; int tresid, error; struct vattr vattr; if (vp->v_type != VDIR) return (EPERM); /* * First, check for hit on the EOF offset cache */ if (np->n_direofoffset > 0 && uio->uio_offset >= np->n_direofoffset && (np->n_flag & NMODIFIED) == 0) { if (VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NQNFS) { if (NQNFS_CKCACHABLE(vp, ND_READ)) { nfsstats.direofcache_hits++; return (0); } } else if (VOP_GETATTR(vp, &vattr, ap->a_cred, uio->uio_procp) == 0 && np->n_mtime == vattr.va_mtime.tv_sec) { nfsstats.direofcache_hits++; return (0); } } /* * Call nfs_bioread() to do the real work. */ tresid = uio->uio_resid; error = nfs_bioread(vp, uio, 0, ap->a_cred); if (!error && uio->uio_resid == tresid) nfsstats.direofcache_misses++; return (error); } /* * Readdir rpc call. * Called from below the buffer cache by nfs_doio(). */ int nfs_readdirrpc(vp, uiop, cred) struct vnode *vp; register struct uio *uiop; struct ucred *cred; { register int len, left; register struct dirent *dp = NULL; register u_int32_t *tl; register caddr_t cp; register int32_t t1, t2; register nfsuint64 *cookiep; caddr_t bpos, dpos, cp2; struct mbuf *mreq, *mrep, *md, *mb, *mb2; nfsuint64 cookie; struct nfsmount *nmp = VFSTONFS(vp->v_mount); struct nfsnode *dnp = VTONFS(vp); u_quad_t fileno; int error = 0, tlen, more_dirs = 1, blksiz = 0, bigenough = 1; int attrflag; int v3 = NFS_ISV3(vp); #ifndef DIAGNOSTIC if (uiop->uio_iovcnt != 1 || (uiop->uio_offset & (DIRBLKSIZ - 1)) || (uiop->uio_resid & (DIRBLKSIZ - 1))) panic("nfs readdirrpc bad uio"); #endif /* * If there is no cookie, assume directory was stale. */ cookiep = nfs_getcookie(dnp, uiop->uio_offset, 0); if (cookiep) cookie = *cookiep; else return (NFSERR_BAD_COOKIE); /* * Loop around doing readdir rpc's of size nm_readdirsize * truncated to a multiple of DIRBLKSIZ. * The stopping criteria is EOF or buffer full. 
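The readdir loop below converts each wire entry into a struct dirent, rounding the name up to a 4-byte boundary (leaving room for a terminating NUL) and padding the final record so every block it fills is exactly DIRBLKSIZ bytes. The size arithmetic on its own, as a hedged sketch; the block and header sizes here are illustrative constants, not the kernel's:

#include <stdio.h>

#define XDIRBLKSIZ	512	/* illustrative directory block size */
#define XDIRHDSIZ	12	/* illustrative fixed dirent header size */

/* round a name length up to a 4-byte boundary, as nfsm_rndup() does */
static int
rndup4(int len)
{
	return ((len + 3) & ~3);
}

/* record length for one entry, always leaving space for a NUL terminator */
static int
entry_reclen(int namelen)
{
	int tlen = rndup4(namelen);

	if (tlen == namelen)	/* name exactly filled the slot: add a word for '\0' */
		tlen += 4;
	return (tlen + XDIRHDSIZ);
}

int
main(void)
{
	int used = 0, reclen;

	reclen = entry_reclen(5);		/* e.g. "hello" */
	printf("reclen=%d\n", reclen);		/* 8 + 12 = 20 */
	used += reclen;
	reclen = entry_reclen(8);		/* name already a multiple of 4 */
	printf("reclen=%d\n", reclen);		/* 12 + 12 = 24 */
	used += reclen;
	/* the last record in the block absorbs the leftover space */
	printf("padding added to last record: %d\n", XDIRBLKSIZ - used);
	return (0);
}
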
*/ while (more_dirs && bigenough) { nfsstats.rpccnt[NFSPROC_READDIR]++; nfsm_reqhead(vp, NFSPROC_READDIR, NFSX_FH(v3) + NFSX_READDIR(v3)); nfsm_fhtom(vp, v3); if (v3) { nfsm_build(tl, u_int32_t *, 5 * NFSX_UNSIGNED); *tl++ = cookie.nfsuquad[0]; *tl++ = cookie.nfsuquad[1]; *tl++ = dnp->n_cookieverf.nfsuquad[0]; *tl++ = dnp->n_cookieverf.nfsuquad[1]; } else { nfsm_build(tl, u_int32_t *, 2 * NFSX_UNSIGNED); *tl++ = cookie.nfsuquad[0]; } *tl = txdr_unsigned(nmp->nm_readdirsize); nfsm_request(vp, NFSPROC_READDIR, uiop->uio_procp, cred); if (v3) { nfsm_postop_attr(vp, attrflag); if (!error) { nfsm_dissect(tl, u_int32_t *, 2 * NFSX_UNSIGNED); dnp->n_cookieverf.nfsuquad[0] = *tl++; dnp->n_cookieverf.nfsuquad[1] = *tl; } else { m_freem(mrep); goto nfsmout; } } nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED); more_dirs = fxdr_unsigned(int, *tl); /* loop thru the dir entries, doctoring them to 4bsd form */ while (more_dirs && bigenough) { if (v3) { nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED); fileno = fxdr_hyper(tl); len = fxdr_unsigned(int, *(tl + 2)); } else { nfsm_dissect(tl, u_int32_t *, 2 * NFSX_UNSIGNED); fileno = fxdr_unsigned(u_quad_t, *tl++); len = fxdr_unsigned(int, *tl); } if (len <= 0 || len > NFS_MAXNAMLEN) { error = EBADRPC; m_freem(mrep); goto nfsmout; } tlen = nfsm_rndup(len); if (tlen == len) tlen += 4; /* To ensure null termination */ left = DIRBLKSIZ - blksiz; if ((tlen + DIRHDSIZ) > left) { dp->d_reclen += left; uiop->uio_iov->iov_base += left; uiop->uio_iov->iov_len -= left; uiop->uio_offset += left; uiop->uio_resid -= left; blksiz = 0; } if ((tlen + DIRHDSIZ) > uiop->uio_resid) bigenough = 0; if (bigenough) { dp = (struct dirent *)uiop->uio_iov->iov_base; dp->d_fileno = (int)fileno; dp->d_namlen = len; dp->d_reclen = tlen + DIRHDSIZ; dp->d_type = DT_UNKNOWN; blksiz += dp->d_reclen; if (blksiz == DIRBLKSIZ) blksiz = 0; uiop->uio_offset += DIRHDSIZ; uiop->uio_resid -= DIRHDSIZ; uiop->uio_iov->iov_base += DIRHDSIZ; uiop->uio_iov->iov_len -= DIRHDSIZ; nfsm_mtouio(uiop, len); cp = uiop->uio_iov->iov_base; tlen -= len; *cp = '\0'; /* null terminate */ uiop->uio_iov->iov_base += tlen; uiop->uio_iov->iov_len -= tlen; uiop->uio_offset += tlen; uiop->uio_resid -= tlen; } else nfsm_adv(nfsm_rndup(len)); if (v3) { nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED); } else { nfsm_dissect(tl, u_int32_t *, 2 * NFSX_UNSIGNED); } if (bigenough) { cookie.nfsuquad[0] = *tl++; if (v3) cookie.nfsuquad[1] = *tl++; } else if (v3) tl += 2; else tl++; more_dirs = fxdr_unsigned(int, *tl); } /* * If at end of rpc data, get the eof boolean */ if (!more_dirs) { nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED); more_dirs = (fxdr_unsigned(int, *tl) == 0); } m_freem(mrep); } /* * Fill last record, iff any, out to a multiple of DIRBLKSIZ * by increasing d_reclen for the last record. */ if (blksiz > 0) { left = DIRBLKSIZ - blksiz; dp->d_reclen += left; uiop->uio_iov->iov_base += left; uiop->uio_iov->iov_len -= left; uiop->uio_offset += left; uiop->uio_resid -= left; } /* * We are now either at the end of the directory or have filled the * block. */ if (bigenough) dnp->n_direofoffset = uiop->uio_offset; else { if (uiop->uio_resid > 0) printf("EEK! readdirrpc resid > 0\n"); cookiep = nfs_getcookie(dnp, uiop->uio_offset, 1); *cookiep = cookie; } nfsmout: return (error); } /* * NFS V3 readdir plus RPC. Used in place of nfs_readdirrpc(). 
*/ int nfs_readdirplusrpc(vp, uiop, cred) struct vnode *vp; register struct uio *uiop; struct ucred *cred; { register int len, left; register struct dirent *dp; register u_int32_t *tl; register caddr_t cp; register int32_t t1, t2; register struct vnode *newvp; register nfsuint64 *cookiep; caddr_t bpos, dpos, cp2, dpossav1, dpossav2; struct mbuf *mreq, *mrep, *md, *mb, *mb2, *mdsav1, *mdsav2; struct nameidata nami, *ndp = &nami; struct componentname *cnp = &ndp->ni_cnd; nfsuint64 cookie; struct nfsmount *nmp = VFSTONFS(vp->v_mount); struct nfsnode *dnp = VTONFS(vp), *np; nfsfh_t *fhp; u_quad_t fileno; int error = 0, tlen, more_dirs = 1, blksiz = 0, doit, bigenough = 1, i; int attrflag, fhsize; #ifndef nolint dp = (struct dirent *)0; #endif #ifndef DIAGNOSTIC if (uiop->uio_iovcnt != 1 || (uiop->uio_offset & (DIRBLKSIZ - 1)) || (uiop->uio_resid & (DIRBLKSIZ - 1))) panic("nfs readdirplusrpc bad uio"); #endif ndp->ni_dvp = vp; newvp = NULLVP; /* * If there is no cookie, assume directory was stale. */ cookiep = nfs_getcookie(dnp, uiop->uio_offset, 0); if (cookiep) cookie = *cookiep; else return (NFSERR_BAD_COOKIE); /* * Loop around doing readdir rpc's of size nm_readdirsize * truncated to a multiple of DIRBLKSIZ. * The stopping criteria is EOF or buffer full. */ while (more_dirs && bigenough) { nfsstats.rpccnt[NFSPROC_READDIRPLUS]++; nfsm_reqhead(vp, NFSPROC_READDIRPLUS, NFSX_FH(1) + 6 * NFSX_UNSIGNED); nfsm_fhtom(vp, 1); nfsm_build(tl, u_int32_t *, 6 * NFSX_UNSIGNED); *tl++ = cookie.nfsuquad[0]; *tl++ = cookie.nfsuquad[1]; *tl++ = dnp->n_cookieverf.nfsuquad[0]; *tl++ = dnp->n_cookieverf.nfsuquad[1]; *tl++ = txdr_unsigned(nmp->nm_readdirsize); *tl = txdr_unsigned(nmp->nm_rsize); nfsm_request(vp, NFSPROC_READDIRPLUS, uiop->uio_procp, cred); nfsm_postop_attr(vp, attrflag); if (error) { m_freem(mrep); goto nfsmout; } nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED); dnp->n_cookieverf.nfsuquad[0] = *tl++; dnp->n_cookieverf.nfsuquad[1] = *tl++; more_dirs = fxdr_unsigned(int, *tl); /* loop thru the dir entries, doctoring them to 4bsd form */ while (more_dirs && bigenough) { nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED); fileno = fxdr_hyper(tl); len = fxdr_unsigned(int, *(tl + 2)); if (len <= 0 || len > NFS_MAXNAMLEN) { error = EBADRPC; m_freem(mrep); goto nfsmout; } tlen = nfsm_rndup(len); if (tlen == len) tlen += 4; /* To ensure null termination*/ left = DIRBLKSIZ - blksiz; if ((tlen + DIRHDSIZ) > left) { dp->d_reclen += left; uiop->uio_iov->iov_base += left; uiop->uio_iov->iov_len -= left; uiop->uio_offset += left; uiop->uio_resid -= left; blksiz = 0; } if ((tlen + DIRHDSIZ) > uiop->uio_resid) bigenough = 0; if (bigenough) { dp = (struct dirent *)uiop->uio_iov->iov_base; dp->d_fileno = (int)fileno; dp->d_namlen = len; dp->d_reclen = tlen + DIRHDSIZ; dp->d_type = DT_UNKNOWN; blksiz += dp->d_reclen; if (blksiz == DIRBLKSIZ) blksiz = 0; uiop->uio_offset += DIRHDSIZ; uiop->uio_resid -= DIRHDSIZ; uiop->uio_iov->iov_base += DIRHDSIZ; uiop->uio_iov->iov_len -= DIRHDSIZ; cnp->cn_nameptr = uiop->uio_iov->iov_base; cnp->cn_namelen = len; nfsm_mtouio(uiop, len); cp = uiop->uio_iov->iov_base; tlen -= len; *cp = '\0'; uiop->uio_iov->iov_base += tlen; uiop->uio_iov->iov_len -= tlen; uiop->uio_offset += tlen; uiop->uio_resid -= tlen; } else nfsm_adv(nfsm_rndup(len)); nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED); if (bigenough) { cookie.nfsuquad[0] = *tl++; cookie.nfsuquad[1] = *tl++; } else tl += 2; /* * Since the attributes are before the file handle * (sigh), we must skip over the attributes and then * 
come back and get them. */ attrflag = fxdr_unsigned(int, *tl); if (attrflag) { dpossav1 = dpos; mdsav1 = md; nfsm_adv(NFSX_V3FATTR); nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED); doit = fxdr_unsigned(int, *tl); if (doit) { nfsm_getfh(fhp, fhsize, 1); if (NFS_CMPFH(dnp, fhp, fhsize)) { VREF(vp); newvp = vp; np = dnp; } else { error = nfs_nget(vp->v_mount, fhp, fhsize, &np); if (error) doit = 0; else newvp = NFSTOV(np); } } if (doit && bigenough) { dpossav2 = dpos; dpos = dpossav1; mdsav2 = md; md = mdsav1; nfsm_loadattr(newvp, (struct vattr *)0); dpos = dpossav2; md = mdsav2; dp->d_type = IFTODT(VTTOIF(np->n_vattr.va_type)); ndp->ni_vp = newvp; cnp->cn_hash = 0; for (cp = cnp->cn_nameptr, i = 1; i <= len; i++, cp++) cnp->cn_hash += (unsigned char)*cp; cache_enter(ndp->ni_dvp, ndp->ni_vp, cnp); } } else { /* Just skip over the file handle */ nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED); i = fxdr_unsigned(int, *tl); nfsm_adv(nfsm_rndup(i)); } if (newvp != NULLVP) { if (newvp == vp) vrele(newvp); else vput(newvp); newvp = NULLVP; } nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED); more_dirs = fxdr_unsigned(int, *tl); } /* * If at end of rpc data, get the eof boolean */ if (!more_dirs) { nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED); more_dirs = (fxdr_unsigned(int, *tl) == 0); } m_freem(mrep); } /* * Fill last record, iff any, out to a multiple of DIRBLKSIZ * by increasing d_reclen for the last record. */ if (blksiz > 0) { left = DIRBLKSIZ - blksiz; dp->d_reclen += left; uiop->uio_iov->iov_base += left; uiop->uio_iov->iov_len -= left; uiop->uio_offset += left; uiop->uio_resid -= left; } /* * We are now either at the end of the directory or have filled the * block. */ if (bigenough) dnp->n_direofoffset = uiop->uio_offset; else { if (uiop->uio_resid > 0) printf("EEK! readdirplusrpc resid > 0\n"); cookiep = nfs_getcookie(dnp, uiop->uio_offset, 1); *cookiep = cookie; } nfsmout: if (newvp != NULLVP) { if (newvp == vp) vrele(newvp); else vput(newvp); newvp = NULLVP; } return (error); } /* * Silly rename. To make the NFS filesystem that is stateless look a little * more like the "ufs" a remove of an active vnode is translated to a rename * to a funny looking filename that is removed by nfs_inactive on the * nfsnode. There is the potential for another process on a different client * to create the same funny name between the nfs_lookitup() fails and the * nfs_rename() completes, but... 
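 * The funny name is of the form ".nfsAxxxx4.4", where xxxx is the pid of
 * the removing process in hex (e.g. ".nfsA1a2b4.4" for pid 0x1a2b).  If
 * that name is already in use, the 'A' is bumped one character at a time
 * until nfs_lookitup() reports the name free, giving up past 'z'.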
*/ static int nfs_sillyrename(dvp, vp, cnp) struct vnode *dvp, *vp; struct componentname *cnp; { register struct sillyrename *sp; struct nfsnode *np; int error; short pid; cache_purge(dvp); np = VTONFS(vp); #ifndef DIAGNOSTIC if (vp->v_type == VDIR) panic("nfs: sillyrename dir"); #endif MALLOC(sp, struct sillyrename *, sizeof (struct sillyrename), M_NFSREQ, M_WAITOK); sp->s_cred = crdup(cnp->cn_cred); sp->s_dvp = dvp; VREF(dvp); /* Fudge together a funny name */ pid = cnp->cn_proc->p_pid; sp->s_namlen = sprintf(sp->s_name, ".nfsA%04x4.4", pid); /* Try lookitups until we get one that isn't there */ while (nfs_lookitup(dvp, sp->s_name, sp->s_namlen, sp->s_cred, cnp->cn_proc, (struct nfsnode **)0) == 0) { sp->s_name[4]++; if (sp->s_name[4] > 'z') { error = EINVAL; goto bad; } } error = nfs_renameit(dvp, cnp, sp); if (error) goto bad; error = nfs_lookitup(dvp, sp->s_name, sp->s_namlen, sp->s_cred, cnp->cn_proc, &np); np->n_sillyrename = sp; return (0); bad: vrele(sp->s_dvp); crfree(sp->s_cred); free((caddr_t)sp, M_NFSREQ); return (error); } /* * Look up a file name and optionally either update the file handle or * allocate an nfsnode, depending on the value of npp. * npp == NULL --> just do the lookup * *npp == NULL --> allocate a new nfsnode and make sure attributes are * handled too * *npp != NULL --> update the file handle in the vnode */ static int nfs_lookitup(dvp, name, len, cred, procp, npp) register struct vnode *dvp; const char *name; int len; struct ucred *cred; struct proc *procp; struct nfsnode **npp; { register u_int32_t *tl; register caddr_t cp; register int32_t t1, t2; struct vnode *newvp = (struct vnode *)0; struct nfsnode *np, *dnp = VTONFS(dvp); caddr_t bpos, dpos, cp2; int error = 0, fhlen, attrflag; struct mbuf *mreq, *mrep, *md, *mb, *mb2; nfsfh_t *nfhp; int v3 = NFS_ISV3(dvp); nfsstats.rpccnt[NFSPROC_LOOKUP]++; nfsm_reqhead(dvp, NFSPROC_LOOKUP, NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(len)); nfsm_fhtom(dvp, v3); nfsm_strtom(name, len, NFS_MAXNAMLEN); nfsm_request(dvp, NFSPROC_LOOKUP, procp, cred); if (npp && !error) { nfsm_getfh(nfhp, fhlen, v3); if (*npp) { np = *npp; if (np->n_fhsize > NFS_SMALLFH && fhlen <= NFS_SMALLFH) { free((caddr_t)np->n_fhp, M_NFSBIGFH); np->n_fhp = &np->n_fh; } else if (np->n_fhsize <= NFS_SMALLFH && fhlen>NFS_SMALLFH) np->n_fhp =(nfsfh_t *)malloc(fhlen,M_NFSBIGFH,M_WAITOK); bcopy((caddr_t)nfhp, (caddr_t)np->n_fhp, fhlen); np->n_fhsize = fhlen; newvp = NFSTOV(np); } else if (NFS_CMPFH(dnp, nfhp, fhlen)) { VREF(dvp); newvp = dvp; } else { error = nfs_nget(dvp->v_mount, nfhp, fhlen, &np); if (error) { m_freem(mrep); return (error); } newvp = NFSTOV(np); } if (v3) { nfsm_postop_attr(newvp, attrflag); if (!attrflag && *npp == NULL) { m_freem(mrep); if (newvp == dvp) vrele(newvp); else vput(newvp); return (ENOENT); } } else nfsm_loadattr(newvp, (struct vattr *)0); } nfsm_reqdone; if (npp && *npp == NULL) { if (error) { if (newvp) { if (newvp == dvp) vrele(newvp); else vput(newvp); } } else *npp = np; } return (error); } /* * Nfs Version 3 commit rpc */ static int nfs_commit(vp, offset, cnt, cred, procp) register struct vnode *vp; u_quad_t offset; int cnt; struct ucred *cred; struct proc *procp; { register caddr_t cp; register u_int32_t *tl; register int32_t t1, t2; register struct nfsmount *nmp = VFSTONFS(vp->v_mount); caddr_t bpos, dpos, cp2; int error = 0, wccflag = NFSV3_WCCRATTR; struct mbuf *mreq, *mrep, *md, *mb, *mb2; if ((nmp->nm_state & NFSSTA_HASWRITEVERF) == 0) return (0); nfsstats.rpccnt[NFSPROC_COMMIT]++; nfsm_reqhead(vp, NFSPROC_COMMIT, 
NFSX_FH(1)); nfsm_fhtom(vp, 1); nfsm_build(tl, u_int32_t *, 3 * NFSX_UNSIGNED); txdr_hyper(offset, tl); tl += 2; *tl = txdr_unsigned(cnt); nfsm_request(vp, NFSPROC_COMMIT, procp, cred); nfsm_wcc_data(vp, wccflag); if (!error) { nfsm_dissect(tl, u_int32_t *, NFSX_V3WRITEVERF); if (bcmp((caddr_t)nmp->nm_verf, (caddr_t)tl, NFSX_V3WRITEVERF)) { bcopy((caddr_t)tl, (caddr_t)nmp->nm_verf, NFSX_V3WRITEVERF); error = NFSERR_STALEWRITEVERF; } } nfsm_reqdone; return (error); } /* * Kludge City.. * - make nfs_bmap() essentially a no-op that does no translation * - do nfs_strategy() by doing I/O with nfs_readrpc/nfs_writerpc * (Maybe I could use the process's page mapping, but I was concerned that * Kernel Write might not be enabled and also figured copyout() would do * a lot more work than bcopy() and also it currently happens in the * context of the swapper process (2). */ static int nfs_bmap(ap) struct vop_bmap_args /* { struct vnode *a_vp; daddr_t a_bn; struct vnode **a_vpp; daddr_t *a_bnp; int *a_runp; int *a_runb; } */ *ap; { register struct vnode *vp = ap->a_vp; if (ap->a_vpp != NULL) *ap->a_vpp = vp; if (ap->a_bnp != NULL) *ap->a_bnp = ap->a_bn * btodb(vp->v_mount->mnt_stat.f_iosize); if (ap->a_runp != NULL) *ap->a_runp = 0; if (ap->a_runb != NULL) *ap->a_runb = 0; return (0); } /* * Strategy routine. * For async requests when nfsiod(s) are running, queue the request by * calling nfs_asyncio(), otherwise just all nfs_doio() to do the * request. */ static int nfs_strategy(ap) struct vop_strategy_args *ap; { register struct buf *bp = ap->a_bp; struct ucred *cr; struct proc *p; int error = 0; KASSERT(!(bp->b_flags & B_DONE), ("nfs_strategy: buffer %p unexpectedly marked B_DONE", bp)); KASSERT(BUF_REFCNT(bp) > 0, ("nfs_strategy: buffer %p not locked", bp)); if (bp->b_flags & B_PHYS) panic("nfs physio"); if (bp->b_flags & B_ASYNC) p = (struct proc *)0; else p = curproc; /* XXX */ if (bp->b_flags & B_READ) cr = bp->b_rcred; else cr = bp->b_wcred; /* * If the op is asynchronous and an i/o daemon is waiting * queue the request, wake it up and wait for completion * otherwise just do it ourselves. */ if ((bp->b_flags & B_ASYNC) == 0 || nfs_asyncio(bp, NOCRED, p)) error = nfs_doio(bp, cr, p); return (error); } /* * Mmap a file * * NB Currently unsupported. */ /* ARGSUSED */ static int nfs_mmap(ap) struct vop_mmap_args /* { struct vnode *a_vp; int a_fflags; struct ucred *a_cred; struct proc *a_p; } */ *ap; { return (EINVAL); } /* * fsync vnode op. Just call nfs_flush() with commit == 1. */ /* ARGSUSED */ static int nfs_fsync(ap) struct vop_fsync_args /* { struct vnodeop_desc *a_desc; struct vnode * a_vp; struct ucred * a_cred; int a_waitfor; struct proc * a_p; } */ *ap; { return (nfs_flush(ap->a_vp, ap->a_cred, ap->a_waitfor, ap->a_p, 1)); } /* * Flush all the blocks associated with a vnode. * Walk through the buffer pool and push any dirty pages * associated with the vnode. 
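 * For NFSv3 with commit set, buffers that have already been written to
 * the server but not yet committed (B_DELWRI | B_NEEDCOMMIT) are covered
 * by COMMIT rpcs first; the remaining dirty buffers are then pushed via
 * VOP_BWRITE(), and with MNT_WAIT the routine sleeps on v_numoutput
 * until the writes have drained.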
*/ static int nfs_flush(vp, cred, waitfor, p, commit) register struct vnode *vp; struct ucred *cred; int waitfor; struct proc *p; int commit; { register struct nfsnode *np = VTONFS(vp); register struct buf *bp; register int i; struct buf *nbp; struct nfsmount *nmp = VFSTONFS(vp->v_mount); int s, error = 0, slptimeo = 0, slpflag = 0, retv, bvecpos; int passone = 1; u_quad_t off, endoff, toff; struct ucred* wcred = NULL; struct buf **bvec = NULL; #ifndef NFS_COMMITBVECSIZ #define NFS_COMMITBVECSIZ 20 #endif struct buf *bvec_on_stack[NFS_COMMITBVECSIZ]; int bvecsize = 0, bveccount; if (nmp->nm_flag & NFSMNT_INT) slpflag = PCATCH; if (!commit) passone = 0; /* * A b_flags == (B_DELWRI | B_NEEDCOMMIT) block has been written to the * server, but nas not been committed to stable storage on the server * yet. On the first pass, the byte range is worked out and the commit * rpc is done. On the second pass, nfs_writebp() is called to do the * job. */ again: off = (u_quad_t)-1; endoff = 0; bvecpos = 0; if (NFS_ISV3(vp) && commit) { s = splbio(); /* * Count up how many buffers waiting for a commit. */ bveccount = 0; for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { nbp = TAILQ_NEXT(bp, b_vnbufs); if (BUF_REFCNT(bp) == 0 && (bp->b_flags & (B_DELWRI | B_NEEDCOMMIT)) == (B_DELWRI | B_NEEDCOMMIT)) bveccount++; } /* * Allocate space to remember the list of bufs to commit. It is * important to use M_NOWAIT here to avoid a race with nfs_write. * If we can't get memory (for whatever reason), we will end up * committing the buffers one-by-one in the loop below. */ if (bveccount > NFS_COMMITBVECSIZ) { if (bvec != NULL && bvec != bvec_on_stack) free(bvec, M_TEMP); bvec = (struct buf **) malloc(bveccount * sizeof(struct buf *), M_TEMP, M_NOWAIT); if (bvec == NULL) { bvec = bvec_on_stack; bvecsize = NFS_COMMITBVECSIZ; } else bvecsize = bveccount; } else { bvec = bvec_on_stack; bvecsize = NFS_COMMITBVECSIZ; } for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { nbp = TAILQ_NEXT(bp, b_vnbufs); if (bvecpos >= bvecsize) break; if ((bp->b_flags & (B_DELWRI | B_NEEDCOMMIT)) != (B_DELWRI | B_NEEDCOMMIT) || BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) continue; bremfree(bp); /* * Work out if all buffers are using the same cred * so we can deal with them all with one commit. * * NOTE: we are not clearing B_DONE here, so we have * to do it later on in this routine if we intend to * initiate I/O on the bp. */ if (wcred == NULL) wcred = bp->b_wcred; else if (wcred != bp->b_wcred) wcred = NOCRED; bp->b_flags |= B_WRITEINPROG; vfs_busy_pages(bp, 1); /* * bp is protected by being locked, but nbp is not * and vfs_busy_pages() may sleep. We have to * recalculate nbp. */ nbp = TAILQ_NEXT(bp, b_vnbufs); /* * A list of these buffers is kept so that the * second loop knows which buffers have actually * been committed. This is necessary, since there * may be a race between the commit rpc and new * uncommitted writes on the file. */ bvec[bvecpos++] = bp; toff = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff; if (toff < off) off = toff; toff += (u_quad_t)(bp->b_dirtyend - bp->b_dirtyoff); if (toff > endoff) endoff = toff; } splx(s); } if (bvecpos > 0) { /* * Commit data on the server, as required. * If all bufs are using the same wcred, then use that with * one call for all of them, otherwise commit each one * separately. 
*/ if (wcred != NOCRED) retv = nfs_commit(vp, off, (int)(endoff - off), wcred, p); else { retv = 0; for (i = 0; i < bvecpos; i++) { off_t off, size; bp = bvec[i]; off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff; size = (u_quad_t)(bp->b_dirtyend - bp->b_dirtyoff); retv = nfs_commit(vp, off, (int)size, bp->b_wcred, p); if (retv) break; } } if (retv == NFSERR_STALEWRITEVERF) nfs_clearcommit(vp->v_mount); /* * Now, either mark the blocks I/O done or mark the * blocks dirty, depending on whether the commit * succeeded. */ for (i = 0; i < bvecpos; i++) { bp = bvec[i]; bp->b_flags &= ~(B_NEEDCOMMIT | B_WRITEINPROG); if (retv) { /* * Error, leave B_DELWRI intact */ vfs_unbusy_pages(bp); brelse(bp); } else { /* * Success, remove B_DELWRI ( bundirty() ). * * b_dirtyoff/b_dirtyend seem to be NFS * specific. We should probably move that * into bundirty(). XXX */ s = splbio(); vp->v_numoutput++; bp->b_flags |= B_ASYNC; bundirty(bp); bp->b_flags &= ~(B_READ|B_DONE|B_ERROR); bp->b_dirtyoff = bp->b_dirtyend = 0; splx(s); biodone(bp); } } } /* * Start/do any write(s) that are required. */ loop: s = splbio(); for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { nbp = TAILQ_NEXT(bp, b_vnbufs); if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { if (waitfor != MNT_WAIT || passone) continue; error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL, "nfsfsync", slpflag, slptimeo); splx(s); if (error == 0) panic("nfs_fsync: inconsistent lock"); if (error == ENOLCK) goto loop; if (nfs_sigintr(nmp, (struct nfsreq *)0, p)) { error = EINTR; goto done; } if (slpflag == PCATCH) { slpflag = 0; slptimeo = 2 * hz; } goto loop; } if ((bp->b_flags & B_DELWRI) == 0) panic("nfs_fsync: not dirty"); if ((passone || !commit) && (bp->b_flags & B_NEEDCOMMIT)) { BUF_UNLOCK(bp); continue; } bremfree(bp); if (passone || !commit) bp->b_flags |= B_ASYNC; else bp->b_flags |= (B_ASYNC | B_WRITEINPROG | B_NEEDCOMMIT); splx(s); VOP_BWRITE(bp->b_vp, bp); goto loop; } splx(s); if (passone) { passone = 0; goto again; } if (waitfor == MNT_WAIT) { while (vp->v_numoutput) { vp->v_flag |= VBWAIT; error = tsleep((caddr_t)&vp->v_numoutput, slpflag | (PRIBIO + 1), "nfsfsync", slptimeo); if (error) { if (nfs_sigintr(nmp, (struct nfsreq *)0, p)) { error = EINTR; goto done; } if (slpflag == PCATCH) { slpflag = 0; slptimeo = 2 * hz; } } } if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) && commit) { goto loop; } } if (np->n_flag & NWRITEERR) { error = np->n_error; np->n_flag &= ~NWRITEERR; } done: if (bvec != NULL && bvec != bvec_on_stack) free(bvec, M_TEMP); return (error); } /* * NFS advisory byte-level locks. * Currently unsupported. */ static int nfs_advlock(ap) struct vop_advlock_args /* { struct vnode *a_vp; caddr_t a_id; int a_op; struct flock *a_fl; int a_flags; } */ *ap; { register struct nfsnode *np = VTONFS(ap->a_vp); /* * The following kludge is to allow diskless support to work * until a real NFS lockd is implemented. Basically, just pretend * that this is a local lock. */ return (lf_advlock(ap, &(np->n_lockf), np->n_size)); } /* * Print out the contents of an nfsnode. */ static int nfs_print(ap) struct vop_print_args /* { struct vnode *a_vp; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct nfsnode *np = VTONFS(vp); printf("tag VT_NFS, fileid %ld fsid 0x%x", np->n_vattr.va_fileid, np->n_vattr.va_fsid); if (vp->v_type == VFIFO) fifo_printinfo(vp); printf("\n"); return (0); } /* * Just call nfs_writebp() with the force argument set to 1. * * NOTE: B_DONE may or may not be set in a_bp on call. 
*/ static int nfs_bwrite(ap) struct vop_bwrite_args /* { struct vnode *a_bp; } */ *ap; { return (nfs_writebp(ap->a_bp, 1, curproc)); } /* * This is a clone of vn_bwrite(), except that B_WRITEINPROG isn't set unless * the force flag is one and it also handles the B_NEEDCOMMIT flag. We set * B_CACHE if this is a VMIO buffer. */ int nfs_writebp(bp, force, procp) register struct buf *bp; int force; struct proc *procp; { int s; int oldflags = bp->b_flags; int retv = 1; off_t off; if (BUF_REFCNT(bp) == 0) panic("bwrite: buffer is not locked???"); if (bp->b_flags & B_INVAL) { brelse(bp); return(0); } bp->b_flags |= B_CACHE; /* * Undirty the bp. We will redirty it later if the I/O fails. */ s = splbio(); bundirty(bp); bp->b_flags &= ~(B_READ|B_DONE|B_ERROR); bp->b_vp->v_numoutput++; curproc->p_stats->p_ru.ru_oublock++; splx(s); /* * If B_NEEDCOMMIT is set, a commit rpc may do the trick. If not * an actual write will have to be scheduled via. VOP_STRATEGY(). * If B_WRITEINPROG is already set, then push it with a write anyhow. */ vfs_busy_pages(bp, 1); if ((oldflags & (B_NEEDCOMMIT | B_WRITEINPROG)) == B_NEEDCOMMIT) { off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff; bp->b_flags |= B_WRITEINPROG; retv = nfs_commit(bp->b_vp, off, bp->b_dirtyend-bp->b_dirtyoff, bp->b_wcred, procp); bp->b_flags &= ~B_WRITEINPROG; if (!retv) { bp->b_dirtyoff = bp->b_dirtyend = 0; bp->b_flags &= ~B_NEEDCOMMIT; biodone(bp); } else if (retv == NFSERR_STALEWRITEVERF) { nfs_clearcommit(bp->b_vp->v_mount); } } if (retv) { if (force) bp->b_flags |= B_WRITEINPROG; BUF_KERNPROC(bp); VOP_STRATEGY(bp->b_vp, bp); } if( (oldflags & B_ASYNC) == 0) { int rtval = biowait(bp); if (oldflags & B_DELWRI) { s = splbio(); reassignbuf(bp, bp->b_vp); splx(s); } brelse(bp); return (rtval); } return (0); } /* * nfs special file access vnode op. * Essentially just get vattr and then imitate iaccess() since the device is * local to the client. */ static int nfsspec_access(ap) struct vop_access_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct proc *a_p; } */ *ap; { register struct vattr *vap; register gid_t *gp; register struct ucred *cred = ap->a_cred; struct vnode *vp = ap->a_vp; mode_t mode = ap->a_mode; struct vattr vattr; register int i; int error; /* * Disallow write attempts on filesystems mounted read-only; * unless the file is a socket, fifo, or a block or character * device resident on the filesystem. */ if ((mode & VWRITE) && (vp->v_mount->mnt_flag & MNT_RDONLY)) { switch (vp->v_type) { case VREG: case VDIR: case VLNK: return (EROFS); default: break; } } /* * If you're the super-user, * you always get access. */ if (cred->cr_uid == 0) return (0); vap = &vattr; error = VOP_GETATTR(vp, vap, cred, ap->a_p); if (error) return (error); /* * Access check is based on only one of owner, group, public. * If not owner, then check group. If not a member of the * group, then check public access. */ if (cred->cr_uid != vap->va_uid) { mode >>= 3; gp = cred->cr_groups; for (i = 0; i < cred->cr_ngroups; i++, gp++) if (vap->va_gid == *gp) goto found; mode >>= 3; found: ; } error = (vap->va_mode & mode) == mode ? 0 : EACCES; return (error); } /* * Read wrapper for special devices. */ static int nfsspec_read(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { register struct nfsnode *np = VTONFS(ap->a_vp); /* * Set access flag. 
*/ np->n_flag |= NACC; getnanotime(&np->n_atim); return (VOCALL(spec_vnodeop_p, VOFFSET(vop_read), ap)); } /* * Write wrapper for special devices. */ static int nfsspec_write(ap) struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { register struct nfsnode *np = VTONFS(ap->a_vp); /* * Set update flag. */ np->n_flag |= NUPD; getnanotime(&np->n_mtim); return (VOCALL(spec_vnodeop_p, VOFFSET(vop_write), ap)); } /* * Close wrapper for special devices. * * Update the times on the nfsnode then do device close. */ static int nfsspec_close(ap) struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct proc *a_p; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct nfsnode *np = VTONFS(vp); struct vattr vattr; if (np->n_flag & (NACC | NUPD)) { np->n_flag |= NCHG; if (vp->v_usecount == 1 && (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { VATTR_NULL(&vattr); if (np->n_flag & NACC) vattr.va_atime = np->n_atim; if (np->n_flag & NUPD) vattr.va_mtime = np->n_mtim; (void)VOP_SETATTR(vp, &vattr, ap->a_cred, ap->a_p); } } return (VOCALL(spec_vnodeop_p, VOFFSET(vop_close), ap)); } /* * Read wrapper for fifos. */ static int nfsfifo_read(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { register struct nfsnode *np = VTONFS(ap->a_vp); /* * Set access flag. */ np->n_flag |= NACC; getnanotime(&np->n_atim); return (VOCALL(fifo_vnodeop_p, VOFFSET(vop_read), ap)); } /* * Write wrapper for fifos. */ static int nfsfifo_write(ap) struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { register struct nfsnode *np = VTONFS(ap->a_vp); /* * Set update flag. */ np->n_flag |= NUPD; getnanotime(&np->n_mtim); return (VOCALL(fifo_vnodeop_p, VOFFSET(vop_write), ap)); } /* * Close wrapper for fifos. * * Update the times on the nfsnode then do fifo close. */ static int nfsfifo_close(ap) struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct proc *a_p; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct nfsnode *np = VTONFS(vp); struct vattr vattr; struct timespec ts; if (np->n_flag & (NACC | NUPD)) { getnanotime(&ts); if (np->n_flag & NACC) np->n_atim = ts; if (np->n_flag & NUPD) np->n_mtim = ts; np->n_flag |= NCHG; if (vp->v_usecount == 1 && (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { VATTR_NULL(&vattr); if (np->n_flag & NACC) vattr.va_atime = np->n_atim; if (np->n_flag & NUPD) vattr.va_mtime = np->n_mtim; (void)VOP_SETATTR(vp, &vattr, ap->a_cred, ap->a_p); } } return (VOCALL(fifo_vnodeop_p, VOFFSET(vop_close), ap)); } Index: head/sys/nfsserver/nfs_srvsubs.c =================================================================== --- head/sys/nfsserver/nfs_srvsubs.c (revision 49534) +++ head/sys/nfsserver/nfs_srvsubs.c (revision 49535) @@ -1,2281 +1,2280 @@ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Rick Macklem at The University of Guelph. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)nfs_subs.c 8.8 (Berkeley) 5/22/95 - * $Id: nfs_subs.c,v 1.78 1999/06/27 11:44:19 peter Exp $ + * $Id: nfs_subs.c,v 1.79 1999/07/17 18:43:47 phk Exp $ */ /* * These functions support the macros and help fiddle mbuf chains for * the nfs op functions. They do things like create the rpc header and * copy data between mbuf chains and uio lists. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include - -#include #include #ifdef ISO #include #endif /* * Data items converted to xdr at startup, since they are constant * This is kinda hokey, but may save a little time doing byte swaps */ u_int32_t nfs_xdrneg1; u_int32_t rpc_call, rpc_vers, rpc_reply, rpc_msgdenied, rpc_autherr, rpc_mismatch, rpc_auth_unix, rpc_msgaccepted, rpc_auth_kerb; u_int32_t nfs_prog, nqnfs_prog, nfs_true, nfs_false; /* And other global data */ static u_int32_t nfs_xid = 0; static enum vtype nv2tov_type[8]= { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VNON, VNON }; enum vtype nv3tov_type[8]= { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO }; int nfs_ticks; int nfs_pbuf_freecnt = -1; /* start out unlimited */ struct nfs_reqq nfs_reqq; struct nfssvc_sockhead nfssvc_sockhead; int nfssvc_sockhead_flag; struct nfsd_head nfsd_head; int nfsd_head_flag; struct nfs_bufq nfs_bufq; struct nqtimerhead nqtimerhead; struct nqfhhashhead *nqfhhashtbl; u_long nqfhhash; static void (*nfs_prev_lease_updatetime) __P((int)); static int nfs_prev_nfssvc_sy_narg; static sy_call_t *nfs_prev_nfssvc_sy_call; #ifndef NFS_NOSERVER static vop_t *nfs_prev_vop_lease_check; static int nfs_prev_getfh_sy_narg; static sy_call_t *nfs_prev_getfh_sy_call; /* * Mapping of old NFS Version 2 RPC numbers to generic numbers. 
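 * The table is indexed by the Version 2 procedure number; procedures
 * with no generic equivalent map to NFSPROC_NOOP.  For example the V2
 * STATFS procedure (17) maps to NFSPROC_FSSTAT.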
*/ int nfsv3_procid[NFS_NPROCS] = { NFSPROC_NULL, NFSPROC_GETATTR, NFSPROC_SETATTR, NFSPROC_NOOP, NFSPROC_LOOKUP, NFSPROC_READLINK, NFSPROC_READ, NFSPROC_NOOP, NFSPROC_WRITE, NFSPROC_CREATE, NFSPROC_REMOVE, NFSPROC_RENAME, NFSPROC_LINK, NFSPROC_SYMLINK, NFSPROC_MKDIR, NFSPROC_RMDIR, NFSPROC_READDIR, NFSPROC_FSSTAT, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP }; #endif /* NFS_NOSERVER */ /* * and the reverse mapping from generic to Version 2 procedure numbers */ int nfsv2_procid[NFS_NPROCS] = { NFSV2PROC_NULL, NFSV2PROC_GETATTR, NFSV2PROC_SETATTR, NFSV2PROC_LOOKUP, NFSV2PROC_NOOP, NFSV2PROC_READLINK, NFSV2PROC_READ, NFSV2PROC_WRITE, NFSV2PROC_CREATE, NFSV2PROC_MKDIR, NFSV2PROC_SYMLINK, NFSV2PROC_CREATE, NFSV2PROC_REMOVE, NFSV2PROC_RMDIR, NFSV2PROC_RENAME, NFSV2PROC_LINK, NFSV2PROC_READDIR, NFSV2PROC_NOOP, NFSV2PROC_STATFS, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, }; #ifndef NFS_NOSERVER /* * Maps errno values to nfs error numbers. * Use NFSERR_IO as the catch all for ones not specifically defined in * RFC 1094. */ static u_char nfsrv_v2errmap[ELAST] = { NFSERR_PERM, NFSERR_NOENT, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_NXIO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_EXIST, NFSERR_IO, NFSERR_NODEV, NFSERR_NOTDIR, NFSERR_ISDIR, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_FBIG, NFSERR_NOSPC, NFSERR_IO, NFSERR_ROFS, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_NAMETOL, NFSERR_IO, NFSERR_IO, NFSERR_NOTEMPTY, NFSERR_IO, NFSERR_IO, NFSERR_DQUOT, NFSERR_STALE, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO /* << Last is 86 */ }; /* * Maps errno values to nfs error numbers. * Although it is not obvious whether or not NFS clients really care if * a returned error value is in the specified list for the procedure, the * safest thing to do is filter them appropriately. For Version 2, the * X/Open XNFS document is the only specification that defines error values * for each RPC (The RFC simply lists all possible error values for all RPCs), * so I have decided to not do this for Version 2. * The first entry is the default error return and the rest are the valid * errors for that RPC in increasing numeric order. 
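 * Each list is terminated by a zero entry; an error that is not found
 * in the list for the current procedure is replaced by the first
 * (default) entry.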
*/ static short nfsv3err_null[] = { 0, 0, }; static short nfsv3err_getattr[] = { NFSERR_IO, NFSERR_IO, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_setattr[] = { NFSERR_IO, NFSERR_PERM, NFSERR_IO, NFSERR_ACCES, NFSERR_INVAL, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOT_SYNC, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_lookup[] = { NFSERR_IO, NFSERR_NOENT, NFSERR_IO, NFSERR_ACCES, NFSERR_NOTDIR, NFSERR_NAMETOL, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_access[] = { NFSERR_IO, NFSERR_IO, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_readlink[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_INVAL, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_read[] = { NFSERR_IO, NFSERR_IO, NFSERR_NXIO, NFSERR_ACCES, NFSERR_INVAL, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_write[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_INVAL, NFSERR_FBIG, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_create[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_mkdir[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_symlink[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_mknod[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, NFSERR_BADTYPE, 0, }; static short nfsv3err_remove[] = { NFSERR_IO, NFSERR_NOENT, NFSERR_IO, NFSERR_ACCES, NFSERR_NOTDIR, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_rmdir[] = { NFSERR_IO, NFSERR_NOENT, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_INVAL, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_NOTEMPTY, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_rename[] = { NFSERR_IO, NFSERR_NOENT, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_XDEV, NFSERR_NOTDIR, NFSERR_ISDIR, NFSERR_INVAL, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_MLINK, NFSERR_NAMETOL, NFSERR_NOTEMPTY, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_link[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_XDEV, NFSERR_NOTDIR, NFSERR_INVAL, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_MLINK, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_readdir[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_NOTDIR, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_BAD_COOKIE, NFSERR_TOOSMALL, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_readdirplus[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_NOTDIR, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_BAD_COOKIE, NFSERR_NOTSUPP, NFSERR_TOOSMALL, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_fsstat[] = { 
NFSERR_IO, NFSERR_IO, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_fsinfo[] = { NFSERR_STALE, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_pathconf[] = { NFSERR_STALE, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_commit[] = { NFSERR_IO, NFSERR_IO, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short *nfsrv_v3errmap[] = { nfsv3err_null, nfsv3err_getattr, nfsv3err_setattr, nfsv3err_lookup, nfsv3err_access, nfsv3err_readlink, nfsv3err_read, nfsv3err_write, nfsv3err_create, nfsv3err_mkdir, nfsv3err_symlink, nfsv3err_mknod, nfsv3err_remove, nfsv3err_rmdir, nfsv3err_rename, nfsv3err_link, nfsv3err_readdir, nfsv3err_readdirplus, nfsv3err_fsstat, nfsv3err_fsinfo, nfsv3err_pathconf, nfsv3err_commit, }; #endif /* NFS_NOSERVER */ extern struct nfsrtt nfsrtt; extern time_t nqnfsstarttime; extern int nqsrv_clockskew; extern int nqsrv_writeslack; extern int nqsrv_maxlease; extern struct nfsstats nfsstats; extern int nqnfs_piggy[NFS_NPROCS]; extern nfstype nfsv2_type[9]; extern nfstype nfsv3_type[9]; extern struct nfsnodehashhead *nfsnodehashtbl; extern u_long nfsnodehash; struct getfh_args; extern int getfh(struct proc *, struct getfh_args *, int *); struct nfssvc_args; extern int nfssvc(struct proc *, struct nfssvc_args *, int *); LIST_HEAD(nfsnodehashhead, nfsnode); int nfs_webnamei __P((struct nameidata *, struct vnode *, struct proc *)); u_quad_t nfs_curusec() { struct timeval tv; getmicrotime(&tv); return ((u_quad_t)tv.tv_sec * 1000000 + (u_quad_t)tv.tv_usec); } /* * Create the header for an rpc request packet * The hsiz is the size of the rest of the nfs request header. * (just used to decide if a cluster is a good idea) */ struct mbuf * nfsm_reqh(vp, procid, hsiz, bposp) struct vnode *vp; u_long procid; int hsiz; caddr_t *bposp; { register struct mbuf *mb; register u_int32_t *tl; register caddr_t bpos; struct mbuf *mb2; struct nfsmount *nmp; int nqflag; MGET(mb, M_WAIT, MT_DATA); if (hsiz >= MINCLSIZE) MCLGET(mb, M_WAIT); mb->m_len = 0; bpos = mtod(mb, caddr_t); /* * For NQNFS, add lease request. */ if (vp) { nmp = VFSTONFS(vp->v_mount); if (nmp->nm_flag & NFSMNT_NQNFS) { nqflag = NQNFS_NEEDLEASE(vp, procid); if (nqflag) { nfsm_build(tl, u_int32_t *, 2*NFSX_UNSIGNED); *tl++ = txdr_unsigned(nqflag); *tl = txdr_unsigned(nmp->nm_leaseterm); } else { nfsm_build(tl, u_int32_t *, NFSX_UNSIGNED); *tl = 0; } } } /* Finally, return values */ *bposp = bpos; return (mb); } /* * Build the RPC header and fill in the authorization info. * The authorization string argument is only used when the credentials * come from outside of the kernel. * Returns the head of the mbuf list. */ struct mbuf * nfsm_rpchead(cr, nmflag, procid, auth_type, auth_len, auth_str, verf_len, verf_str, mrest, mrest_len, mbp, xidp) register struct ucred *cr; int nmflag; int procid; int auth_type; int auth_len; char *auth_str; int verf_len; char *verf_str; struct mbuf *mrest; int mrest_len; struct mbuf **mbp; u_int32_t *xidp; { register struct mbuf *mb; register u_int32_t *tl; register caddr_t bpos; register int i; struct mbuf *mreq, *mb2; int siz, grpsiz, authsiz; authsiz = nfsm_rndup(auth_len); MGETHDR(mb, M_WAIT, MT_DATA); if ((authsiz + 10 * NFSX_UNSIGNED) >= MINCLSIZE) { MCLGET(mb, M_WAIT); } else if ((authsiz + 10 * NFSX_UNSIGNED) < MHLEN) { MH_ALIGN(mb, authsiz + 10 * NFSX_UNSIGNED); } else { MH_ALIGN(mb, 8 * NFSX_UNSIGNED); } mb->m_len = 0; mreq = mb; bpos = mtod(mb, caddr_t); /* * First the RPC header. 
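 * Eight words: xid, CALL, RPC version 2, program number, program
 * version, procedure number, and then the start of the credential
 * (auth flavor and length).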
*/ nfsm_build(tl, u_int32_t *, 8 * NFSX_UNSIGNED); /* Get a pretty random xid to start with */ if (!nfs_xid) nfs_xid = random(); /* * Skip zero xid if it should ever happen. */ if (++nfs_xid == 0) nfs_xid++; *tl++ = *xidp = txdr_unsigned(nfs_xid); *tl++ = rpc_call; *tl++ = rpc_vers; if (nmflag & NFSMNT_NQNFS) { *tl++ = txdr_unsigned(NQNFS_PROG); *tl++ = txdr_unsigned(NQNFS_VER3); } else { *tl++ = txdr_unsigned(NFS_PROG); if (nmflag & NFSMNT_NFSV3) *tl++ = txdr_unsigned(NFS_VER3); else *tl++ = txdr_unsigned(NFS_VER2); } if (nmflag & NFSMNT_NFSV3) *tl++ = txdr_unsigned(procid); else *tl++ = txdr_unsigned(nfsv2_procid[procid]); /* * And then the authorization cred. */ *tl++ = txdr_unsigned(auth_type); *tl = txdr_unsigned(authsiz); switch (auth_type) { case RPCAUTH_UNIX: nfsm_build(tl, u_int32_t *, auth_len); *tl++ = 0; /* stamp ?? */ *tl++ = 0; /* NULL hostname */ *tl++ = txdr_unsigned(cr->cr_uid); *tl++ = txdr_unsigned(cr->cr_groups[0]); grpsiz = (auth_len >> 2) - 5; *tl++ = txdr_unsigned(grpsiz); for (i = 1; i <= grpsiz; i++) *tl++ = txdr_unsigned(cr->cr_groups[i]); break; case RPCAUTH_KERB4: siz = auth_len; while (siz > 0) { if (M_TRAILINGSPACE(mb) == 0) { MGET(mb2, M_WAIT, MT_DATA); if (siz >= MINCLSIZE) MCLGET(mb2, M_WAIT); mb->m_next = mb2; mb = mb2; mb->m_len = 0; bpos = mtod(mb, caddr_t); } i = min(siz, M_TRAILINGSPACE(mb)); bcopy(auth_str, bpos, i); mb->m_len += i; auth_str += i; bpos += i; siz -= i; } if ((siz = (nfsm_rndup(auth_len) - auth_len)) > 0) { for (i = 0; i < siz; i++) *bpos++ = '\0'; mb->m_len += siz; } break; }; /* * And the verifier... */ nfsm_build(tl, u_int32_t *, 2 * NFSX_UNSIGNED); if (verf_str) { *tl++ = txdr_unsigned(RPCAUTH_KERB4); *tl = txdr_unsigned(verf_len); siz = verf_len; while (siz > 0) { if (M_TRAILINGSPACE(mb) == 0) { MGET(mb2, M_WAIT, MT_DATA); if (siz >= MINCLSIZE) MCLGET(mb2, M_WAIT); mb->m_next = mb2; mb = mb2; mb->m_len = 0; bpos = mtod(mb, caddr_t); } i = min(siz, M_TRAILINGSPACE(mb)); bcopy(verf_str, bpos, i); mb->m_len += i; verf_str += i; bpos += i; siz -= i; } if ((siz = (nfsm_rndup(verf_len) - verf_len)) > 0) { for (i = 0; i < siz; i++) *bpos++ = '\0'; mb->m_len += siz; } } else { *tl++ = txdr_unsigned(RPCAUTH_NULL); *tl = 0; } mb->m_next = mrest; mreq->m_pkthdr.len = authsiz + 10 * NFSX_UNSIGNED + mrest_len; mreq->m_pkthdr.rcvif = (struct ifnet *)0; *mbp = mb; return (mreq); } /* * copies mbuf chain to the uio scatter/gather list */ int nfsm_mbuftouio(mrep, uiop, siz, dpos) struct mbuf **mrep; register struct uio *uiop; int siz; caddr_t *dpos; { register char *mbufcp, *uiocp; register int xfer, left, len; register struct mbuf *mp; long uiosiz, rem; int error = 0; mp = *mrep; mbufcp = *dpos; len = mtod(mp, caddr_t)+mp->m_len-mbufcp; rem = nfsm_rndup(siz)-siz; while (siz > 0) { if (uiop->uio_iovcnt <= 0 || uiop->uio_iov == NULL) return (EFBIG); left = uiop->uio_iov->iov_len; uiocp = uiop->uio_iov->iov_base; if (left > siz) left = siz; uiosiz = left; while (left > 0) { while (len == 0) { mp = mp->m_next; if (mp == NULL) return (EBADRPC); mbufcp = mtod(mp, caddr_t); len = mp->m_len; } xfer = (left > len) ? len : left; #ifdef notdef /* Not Yet.. 
*/ if (uiop->uio_iov->iov_op != NULL) (*(uiop->uio_iov->iov_op)) (mbufcp, uiocp, xfer); else #endif if (uiop->uio_segflg == UIO_SYSSPACE) bcopy(mbufcp, uiocp, xfer); else copyout(mbufcp, uiocp, xfer); left -= xfer; len -= xfer; mbufcp += xfer; uiocp += xfer; uiop->uio_offset += xfer; uiop->uio_resid -= xfer; } if (uiop->uio_iov->iov_len <= siz) { uiop->uio_iovcnt--; uiop->uio_iov++; } else { uiop->uio_iov->iov_base += uiosiz; uiop->uio_iov->iov_len -= uiosiz; } siz -= uiosiz; } *dpos = mbufcp; *mrep = mp; if (rem > 0) { if (len < rem) error = nfs_adv(mrep, dpos, rem, len); else *dpos += rem; } return (error); } /* * copies a uio scatter/gather list to an mbuf chain. * NOTE: can ony handle iovcnt == 1 */ int nfsm_uiotombuf(uiop, mq, siz, bpos) register struct uio *uiop; struct mbuf **mq; int siz; caddr_t *bpos; { register char *uiocp; register struct mbuf *mp, *mp2; register int xfer, left, mlen; int uiosiz, clflg, rem; char *cp; #ifdef DIAGNOSTIC if (uiop->uio_iovcnt != 1) panic("nfsm_uiotombuf: iovcnt != 1"); #endif if (siz > MLEN) /* or should it >= MCLBYTES ?? */ clflg = 1; else clflg = 0; rem = nfsm_rndup(siz)-siz; mp = mp2 = *mq; while (siz > 0) { left = uiop->uio_iov->iov_len; uiocp = uiop->uio_iov->iov_base; if (left > siz) left = siz; uiosiz = left; while (left > 0) { mlen = M_TRAILINGSPACE(mp); if (mlen == 0) { MGET(mp, M_WAIT, MT_DATA); if (clflg) MCLGET(mp, M_WAIT); mp->m_len = 0; mp2->m_next = mp; mp2 = mp; mlen = M_TRAILINGSPACE(mp); } xfer = (left > mlen) ? mlen : left; #ifdef notdef /* Not Yet.. */ if (uiop->uio_iov->iov_op != NULL) (*(uiop->uio_iov->iov_op)) (uiocp, mtod(mp, caddr_t)+mp->m_len, xfer); else #endif if (uiop->uio_segflg == UIO_SYSSPACE) bcopy(uiocp, mtod(mp, caddr_t)+mp->m_len, xfer); else copyin(uiocp, mtod(mp, caddr_t)+mp->m_len, xfer); mp->m_len += xfer; left -= xfer; uiocp += xfer; uiop->uio_offset += xfer; uiop->uio_resid -= xfer; } uiop->uio_iov->iov_base += uiosiz; uiop->uio_iov->iov_len -= uiosiz; siz -= uiosiz; } if (rem > 0) { if (rem > M_TRAILINGSPACE(mp)) { MGET(mp, M_WAIT, MT_DATA); mp->m_len = 0; mp2->m_next = mp; } cp = mtod(mp, caddr_t)+mp->m_len; for (left = 0; left < rem; left++) *cp++ = '\0'; mp->m_len += rem; *bpos = cp; } else *bpos = mtod(mp, caddr_t)+mp->m_len; *mq = mp; return (0); } /* * Help break down an mbuf chain by setting the first siz bytes contiguous * pointed to by returned val. * This is used by the macros nfsm_dissect and nfsm_dissecton for tough * cases. (The macros use the vars. dpos and dpos2) */ int nfsm_disct(mdp, dposp, siz, left, cp2) struct mbuf **mdp; caddr_t *dposp; int siz; int left; caddr_t *cp2; { register struct mbuf *mp, *mp2; register int siz2, xfer; register caddr_t p; mp = *mdp; while (left == 0) { *mdp = mp = mp->m_next; if (mp == NULL) return (EBADRPC); left = mp->m_len; *dposp = mtod(mp, caddr_t); } if (left >= siz) { *cp2 = *dposp; *dposp += siz; } else if (mp->m_next == NULL) { return (EBADRPC); } else if (siz > MHLEN) { panic("nfs S too big"); } else { MGET(mp2, M_WAIT, MT_DATA); mp2->m_next = mp->m_next; mp->m_next = mp2; mp->m_len -= left; mp = mp2; *cp2 = p = mtod(mp, caddr_t); bcopy(*dposp, p, left); /* Copy what was left */ siz2 = siz-left; p += left; mp2 = mp->m_next; /* Loop around copying up the siz2 bytes */ while (siz2 > 0) { if (mp2 == NULL) return (EBADRPC); xfer = (siz2 > mp2->m_len) ? 
mp2->m_len : siz2; if (xfer > 0) { bcopy(mtod(mp2, caddr_t), p, xfer); NFSMADV(mp2, xfer); mp2->m_len -= xfer; p += xfer; siz2 -= xfer; } if (siz2 > 0) mp2 = mp2->m_next; } mp->m_len = siz; *mdp = mp2; *dposp = mtod(mp2, caddr_t); } return (0); } /* * Advance the position in the mbuf chain. */ int nfs_adv(mdp, dposp, offs, left) struct mbuf **mdp; caddr_t *dposp; int offs; int left; { register struct mbuf *m; register int s; m = *mdp; s = left; while (s < offs) { offs -= s; m = m->m_next; if (m == NULL) return (EBADRPC); s = m->m_len; } *mdp = m; *dposp = mtod(m, caddr_t)+offs; return (0); } /* * Copy a string into mbufs for the hard cases... */ int nfsm_strtmbuf(mb, bpos, cp, siz) struct mbuf **mb; char **bpos; const char *cp; long siz; { register struct mbuf *m1 = NULL, *m2; long left, xfer, len, tlen; u_int32_t *tl; int putsize; putsize = 1; m2 = *mb; left = M_TRAILINGSPACE(m2); if (left > 0) { tl = ((u_int32_t *)(*bpos)); *tl++ = txdr_unsigned(siz); putsize = 0; left -= NFSX_UNSIGNED; m2->m_len += NFSX_UNSIGNED; if (left > 0) { bcopy(cp, (caddr_t) tl, left); siz -= left; cp += left; m2->m_len += left; left = 0; } } /* Loop around adding mbufs */ while (siz > 0) { MGET(m1, M_WAIT, MT_DATA); if (siz > MLEN) MCLGET(m1, M_WAIT); m1->m_len = NFSMSIZ(m1); m2->m_next = m1; m2 = m1; tl = mtod(m1, u_int32_t *); tlen = 0; if (putsize) { *tl++ = txdr_unsigned(siz); m1->m_len -= NFSX_UNSIGNED; tlen = NFSX_UNSIGNED; putsize = 0; } if (siz < m1->m_len) { len = nfsm_rndup(siz); xfer = siz; if (xfer < len) *(tl+(xfer>>2)) = 0; } else { xfer = len = m1->m_len; } bcopy(cp, (caddr_t) tl, xfer); m1->m_len = len+tlen; siz -= xfer; cp += xfer; } *mb = m1; *bpos = mtod(m1, caddr_t)+m1->m_len; return (0); } /* * Called once to initialize data structures... */ int nfs_init(vfsp) struct vfsconf *vfsp; { register int i; nfsmount_zone = zinit("NFSMOUNT", sizeof(struct nfsmount), 0, 0, 1); /* * Check to see if major data structures haven't bloated. */ if (sizeof (struct nfssvc_sock) > NFS_SVCALLOC) { printf("struct nfssvc_sock bloated (> %dbytes)\n",NFS_SVCALLOC); printf("Try reducing NFS_UIDHASHSIZ\n"); } if (sizeof (struct nfsuid) > NFS_UIDALLOC) { printf("struct nfsuid bloated (> %dbytes)\n",NFS_UIDALLOC); printf("Try unionizing the nu_nickname and nu_flag fields\n"); } nfs_mount_type = vfsp->vfc_typenum; nfsrtt.pos = 0; rpc_vers = txdr_unsigned(RPC_VER2); rpc_call = txdr_unsigned(RPC_CALL); rpc_reply = txdr_unsigned(RPC_REPLY); rpc_msgdenied = txdr_unsigned(RPC_MSGDENIED); rpc_msgaccepted = txdr_unsigned(RPC_MSGACCEPTED); rpc_mismatch = txdr_unsigned(RPC_MISMATCH); rpc_autherr = txdr_unsigned(RPC_AUTHERR); rpc_auth_unix = txdr_unsigned(RPCAUTH_UNIX); rpc_auth_kerb = txdr_unsigned(RPCAUTH_KERB4); nfs_prog = txdr_unsigned(NFS_PROG); nqnfs_prog = txdr_unsigned(NQNFS_PROG); nfs_true = txdr_unsigned(TRUE); nfs_false = txdr_unsigned(FALSE); nfs_xdrneg1 = txdr_unsigned(-1); nfs_ticks = (hz * NFS_TICKINTVL + 500) / 1000; if (nfs_ticks < 1) nfs_ticks = 1; /* Ensure async daemons disabled */ for (i = 0; i < NFS_MAXASYNCDAEMON; i++) { nfs_iodwant[i] = (struct proc *)0; nfs_iodmount[i] = (struct nfsmount *)0; } nfs_nhinit(); /* Init the nfsnode table */ #ifndef NFS_NOSERVER nfsrv_init(0); /* Init server data structures */ nfsrv_initcache(); /* Init the server request cache */ #endif /* * Initialize the nqnfs server stuff. 
*/ if (nqnfsstarttime == 0) { nqnfsstarttime = boottime.tv_sec + nqsrv_maxlease + nqsrv_clockskew + nqsrv_writeslack; NQLOADNOVRAM(nqnfsstarttime); CIRCLEQ_INIT(&nqtimerhead); nqfhhashtbl = hashinit(NQLCHSZ, M_NQLEASE, &nqfhhash); } /* * Initialize reply list and start timer */ TAILQ_INIT(&nfs_reqq); nfs_timer(0); /* * Set up lease_check and lease_updatetime so that other parts * of the system can call us, if we are loadable. */ #ifndef NFS_NOSERVER nfs_prev_vop_lease_check = default_vnodeop_p[VOFFSET(vop_lease)]; default_vnodeop_p[VOFFSET(vop_lease)] = (vop_t *)nqnfs_vop_lease_check; #endif nfs_prev_lease_updatetime = lease_updatetime; lease_updatetime = nfs_lease_updatetime; nfs_prev_nfssvc_sy_narg = sysent[SYS_nfssvc].sy_narg; sysent[SYS_nfssvc].sy_narg = 2; nfs_prev_nfssvc_sy_call = sysent[SYS_nfssvc].sy_call; sysent[SYS_nfssvc].sy_call = (sy_call_t *)nfssvc; #ifndef NFS_NOSERVER nfs_prev_getfh_sy_narg = sysent[SYS_getfh].sy_narg; sysent[SYS_getfh].sy_narg = 2; nfs_prev_getfh_sy_call = sysent[SYS_getfh].sy_call; sysent[SYS_getfh].sy_call = (sy_call_t *)getfh; #endif nfs_pbuf_freecnt = nswbuf / 2 + 1; return (0); } int nfs_uninit(vfsp) struct vfsconf *vfsp; { untimeout(nfs_timer, (void *)NULL, nfs_timer_handle); nfs_mount_type = -1; #ifndef NFS_NOSERVER default_vnodeop_p[VOFFSET(vop_lease)] = nfs_prev_vop_lease_check; #endif lease_updatetime = nfs_prev_lease_updatetime; sysent[SYS_nfssvc].sy_narg = nfs_prev_nfssvc_sy_narg; sysent[SYS_nfssvc].sy_call = nfs_prev_nfssvc_sy_call; #ifndef NFS_NOSERVER sysent[SYS_getfh].sy_narg = nfs_prev_getfh_sy_narg; sysent[SYS_getfh].sy_call = nfs_prev_getfh_sy_call; #endif return (0); } /* * Attribute cache routines. * nfs_loadattrcache() - loads or updates the cache contents from attributes * that are on the mbuf list * nfs_getattrcache() - returns valid attributes if found in cache, returns * error otherwise */ /* * Load the attribute cache (that lives in the nfsnode entry) with * the values on the mbuf list and * Iff vap not NULL * copy the attributes to *vaper */ int nfs_loadattrcache(vpp, mdp, dposp, vaper) struct vnode **vpp; struct mbuf **mdp; caddr_t *dposp; struct vattr *vaper; { register struct vnode *vp = *vpp; register struct vattr *vap; register struct nfs_fattr *fp; register struct nfsnode *np; register int32_t t1; caddr_t cp2; int error = 0, rdev; struct mbuf *md; enum vtype vtyp; u_short vmode; struct timespec mtime; struct vnode *nvp; int v3 = NFS_ISV3(vp); md = *mdp; t1 = (mtod(md, caddr_t) + md->m_len) - *dposp; if ((error = nfsm_disct(mdp, dposp, NFSX_FATTR(v3), t1, &cp2)) != 0) return (error); fp = (struct nfs_fattr *)cp2; if (v3) { vtyp = nfsv3tov_type(fp->fa_type); vmode = fxdr_unsigned(u_short, fp->fa_mode); rdev = makeudev(fxdr_unsigned(int, fp->fa3_rdev.specdata1), fxdr_unsigned(int, fp->fa3_rdev.specdata2)); fxdr_nfsv3time(&fp->fa3_mtime, &mtime); } else { vtyp = nfsv2tov_type(fp->fa_type); vmode = fxdr_unsigned(u_short, fp->fa_mode); /* * XXX * * The duplicate information returned in fa_type and fa_mode * is an ambiguity in the NFS version 2 protocol. * * VREG should be taken literally as a regular file. If a * server intents to return some type information differently * in the upper bits of the mode field (e.g. for sockets, or * FIFOs), NFSv2 mandates fa_type to be VNON. Anyway, we * leave the examination of the mode bits even in the VREG * case to avoid breakage for bogus servers, but we make sure * that there are actually type bits set in the upper part of * fa_mode (and failing that, trust the va_type field). 
* * NFSv3 cleared the issue, and requires fa_mode to not * contain any type information (while also introduing sockets * and FIFOs for fa_type). */ if (vtyp == VNON || (vtyp == VREG && (vmode & S_IFMT) != 0)) vtyp = IFTOVT(vmode); rdev = fxdr_unsigned(int32_t, fp->fa2_rdev); fxdr_nfsv2time(&fp->fa2_mtime, &mtime); /* * Really ugly NFSv2 kludge. */ if (vtyp == VCHR && rdev == 0xffffffff) vtyp = VFIFO; } /* * If v_type == VNON it is a new node, so fill in the v_type, * n_mtime fields. Check to see if it represents a special * device, and if so, check for a possible alias. Once the * correct vnode has been obtained, fill in the rest of the * information. */ np = VTONFS(vp); if (vp->v_type != vtyp) { vp->v_type = vtyp; if (vp->v_type == VFIFO) { vp->v_op = fifo_nfsv2nodeop_p; } if (vp->v_type == VCHR || vp->v_type == VBLK) { vp->v_op = spec_nfsv2nodeop_p; nvp = checkalias(vp, rdev, vp->v_mount); if (nvp) { /* * Discard unneeded vnode, but save its nfsnode. * Since the nfsnode does not have a lock, its * vnode lock has to be carried over. */ nvp->v_vnlock = vp->v_vnlock; vp->v_vnlock = NULL; nvp->v_data = vp->v_data; vp->v_data = NULL; vp->v_op = spec_vnodeop_p; vrele(vp); vgone(vp); /* * Reinitialize aliased node. */ np->n_vnode = nvp; *vpp = vp = nvp; } } np->n_mtime = mtime.tv_sec; } vap = &np->n_vattr; vap->va_type = vtyp; vap->va_mode = (vmode & 07777); vap->va_rdev = rdev; vap->va_mtime = mtime; vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; if (v3) { vap->va_nlink = fxdr_unsigned(u_short, fp->fa_nlink); vap->va_uid = fxdr_unsigned(uid_t, fp->fa_uid); vap->va_gid = fxdr_unsigned(gid_t, fp->fa_gid); vap->va_size = fxdr_hyper(&fp->fa3_size); vap->va_blocksize = NFS_FABLKSIZE; vap->va_bytes = fxdr_hyper(&fp->fa3_used); vap->va_fileid = fxdr_unsigned(int32_t, fp->fa3_fileid.nfsuquad[1]); fxdr_nfsv3time(&fp->fa3_atime, &vap->va_atime); fxdr_nfsv3time(&fp->fa3_ctime, &vap->va_ctime); vap->va_flags = 0; vap->va_filerev = 0; } else { vap->va_nlink = fxdr_unsigned(u_short, fp->fa_nlink); vap->va_uid = fxdr_unsigned(uid_t, fp->fa_uid); vap->va_gid = fxdr_unsigned(gid_t, fp->fa_gid); vap->va_size = fxdr_unsigned(u_int32_t, fp->fa2_size); vap->va_blocksize = fxdr_unsigned(int32_t, fp->fa2_blocksize); vap->va_bytes = (u_quad_t)fxdr_unsigned(int32_t, fp->fa2_blocks) * NFS_FABLKSIZE; vap->va_fileid = fxdr_unsigned(int32_t, fp->fa2_fileid); fxdr_nfsv2time(&fp->fa2_atime, &vap->va_atime); vap->va_flags = 0; vap->va_ctime.tv_sec = fxdr_unsigned(u_int32_t, fp->fa2_ctime.nfsv2_sec); vap->va_ctime.tv_nsec = 0; vap->va_gen = fxdr_unsigned(u_int32_t,fp->fa2_ctime.nfsv2_usec); vap->va_filerev = 0; } if (vap->va_size != np->n_size) { if (vap->va_type == VREG) { if (np->n_flag & NMODIFIED) { if (vap->va_size < np->n_size) vap->va_size = np->n_size; else np->n_size = vap->va_size; } else np->n_size = vap->va_size; vnode_pager_setsize(vp, np->n_size); } else np->n_size = vap->va_size; } np->n_attrstamp = time_second; if (vaper != NULL) { bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(*vap)); if (np->n_flag & NCHG) { if (np->n_flag & NACC) vaper->va_atime = np->n_atim; if (np->n_flag & NUPD) vaper->va_mtime = np->n_mtim; } } return (0); } #ifdef NFS_ACDEBUG #include SYSCTL_DECL(_vfs_nfs); static int nfs_acdebug; SYSCTL_INT(_vfs_nfs, OID_AUTO, acdebug, CTLFLAG_RW, &nfs_acdebug, 0, ""); #endif /* * Check the time stamp * If the cache is valid, copy contents to *vap and return 0 * otherwise return an error */ int nfs_getattrcache(vp, vaper) register struct vnode *vp; struct vattr *vaper; { register struct nfsnode *np; 
register struct vattr *vap; struct nfsmount *nmp; int timeo; np = VTONFS(vp); vap = &np->n_vattr; nmp = VFSTONFS(vp->v_mount); /* XXX n_mtime doesn't seem to be updated on a miss-and-reload */ timeo = (time_second - np->n_mtime) / 10; #ifdef NFS_ACDEBUG if (nfs_acdebug>1) printf("nfs_getattrcache: initial timeo = %d\n", timeo); #endif if (vap->va_type == VDIR) { if ((np->n_flag & NMODIFIED) || timeo < nmp->nm_acdirmin) timeo = nmp->nm_acdirmin; else if (timeo > nmp->nm_acdirmax) timeo = nmp->nm_acdirmax; } else { if ((np->n_flag & NMODIFIED) || timeo < nmp->nm_acregmin) timeo = nmp->nm_acregmin; else if (timeo > nmp->nm_acregmax) timeo = nmp->nm_acregmax; } #ifdef NFS_ACDEBUG if (nfs_acdebug > 2) printf("acregmin %d; acregmax %d; acdirmin %d; acdirmax %d\n", nmp->nm_acregmin, nmp->nm_acregmax, nmp->nm_acdirmin, nmp->nm_acdirmax); if (nfs_acdebug) printf("nfs_getattrcache: age = %d; final timeo = %d\n", (time_second - np->n_attrstamp), timeo); #endif if ((time_second - np->n_attrstamp) >= timeo) { nfsstats.attrcache_misses++; return (ENOENT); } nfsstats.attrcache_hits++; if (vap->va_size != np->n_size) { if (vap->va_type == VREG) { if (np->n_flag & NMODIFIED) { if (vap->va_size < np->n_size) vap->va_size = np->n_size; else np->n_size = vap->va_size; } else np->n_size = vap->va_size; vnode_pager_setsize(vp, np->n_size); } else np->n_size = vap->va_size; } bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(struct vattr)); if (np->n_flag & NCHG) { if (np->n_flag & NACC) vaper->va_atime = np->n_atim; if (np->n_flag & NUPD) vaper->va_mtime = np->n_mtim; } return (0); } #ifndef NFS_NOSERVER /* * Set up nameidata for a lookup() call and do it. * * If pubflag is set, this call is done for a lookup operation on the * public filehandle. In that case we allow crossing mountpoints and * absolute pathnames. However, the caller is expected to check that * the lookup result is within the public fs, and deny access if * it is not. * * nfs_namei() clears out garbage fields that namei() might leave garbage. * This is mainly ni_vp and ni_dvp when an error occurs, and ni_dvp when no * error occurs but the parent was not requested. * * dirp may be set whether an error is returned or not, and must be * released by the caller. */ int nfs_namei(ndp, fhp, len, slp, nam, mdp, dposp, retdirp, p, kerbflag, pubflag) register struct nameidata *ndp; fhandle_t *fhp; int len; struct nfssvc_sock *slp; struct sockaddr *nam; struct mbuf **mdp; caddr_t *dposp; struct vnode **retdirp; struct proc *p; int kerbflag, pubflag; { register int i, rem; register struct mbuf *md; register char *fromcp, *tocp, *cp; struct iovec aiov; struct uio auio; struct vnode *dp; int error, rdonly, linklen; struct componentname *cnp = &ndp->ni_cnd; *retdirp = (struct vnode *)0; cnp->cn_pnbuf = zalloc(namei_zone); /* * Copy the name from the mbuf list to ndp->ni_pnbuf * and set the various ndp fields appropriately. */ fromcp = *dposp; tocp = cnp->cn_pnbuf; md = *mdp; rem = mtod(md, caddr_t) + md->m_len - fromcp; cnp->cn_hash = 0; for (i = 0; i < len; i++) { while (rem == 0) { md = md->m_next; if (md == NULL) { error = EBADRPC; goto out; } fromcp = mtod(md, caddr_t); rem = md->m_len; } if (*fromcp == '\0' || (!pubflag && *fromcp == '/')) { error = EACCES; goto out; } cnp->cn_hash += (unsigned char)*fromcp; *tocp++ = *fromcp++; rem--; } *tocp = '\0'; *mdp = md; *dposp = fromcp; len = nfsm_rndup(len)-len; if (len > 0) { if (rem >= len) *dposp += len; else if ((error = nfs_adv(mdp, dposp, len, rem)) != 0) goto out; } /* * Extract and set starting directory. 
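 * nfsrv_fhtovp() translates the client-supplied file handle into a
 * vnode and checks it against the export list; the result must be a
 * directory or the lookup below cannot proceed (ENOTDIR).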
*/ error = nfsrv_fhtovp(fhp, FALSE, &dp, ndp->ni_cnd.cn_cred, slp, nam, &rdonly, kerbflag, pubflag); if (error) goto out; if (dp->v_type != VDIR) { vrele(dp); error = ENOTDIR; goto out; } if (rdonly) cnp->cn_flags |= RDONLY; /* * Set return directory. Reference to dp is implicitly transfered * to the returned pointer */ *retdirp = dp; if (pubflag) { /* * Oh joy. For WebNFS, handle those pesky '%' escapes, * and the 'native path' indicator. */ cp = zalloc(namei_zone); fromcp = cnp->cn_pnbuf; tocp = cp; if ((unsigned char)*fromcp >= WEBNFS_SPECCHAR_START) { switch ((unsigned char)*fromcp) { case WEBNFS_NATIVE_CHAR: /* * 'Native' path for us is the same * as a path according to the NFS spec, * just skip the escape char. */ fromcp++; break; /* * More may be added in the future, range 0x80-0xff */ default: error = EIO; zfree(namei_zone, cp); goto out; } } /* * Translate the '%' escapes, URL-style. */ while (*fromcp != '\0') { if (*fromcp == WEBNFS_ESC_CHAR) { if (fromcp[1] != '\0' && fromcp[2] != '\0') { fromcp++; *tocp++ = HEXSTRTOI(fromcp); fromcp += 2; continue; } else { error = ENOENT; zfree(namei_zone, cp); goto out; } } else *tocp++ = *fromcp++; } *tocp = '\0'; zfree(namei_zone, cnp->cn_pnbuf); cnp->cn_pnbuf = cp; } ndp->ni_pathlen = (tocp - cnp->cn_pnbuf) + 1; ndp->ni_segflg = UIO_SYSSPACE; if (pubflag) { ndp->ni_rootdir = rootvnode; ndp->ni_loopcnt = 0; if (cnp->cn_pnbuf[0] == '/') dp = rootvnode; } else { cnp->cn_flags |= NOCROSSMOUNT; } /* * Initialize for scan, set ni_startdir and bump ref on dp again * becuase lookup() will dereference ni_startdir. */ cnp->cn_proc = p; VREF(dp); ndp->ni_startdir = dp; for (;;) { cnp->cn_nameptr = cnp->cn_pnbuf; /* * Call lookup() to do the real work. If an error occurs, * ndp->ni_vp and ni_dvp are left uninitialized or NULL and * we do not have to dereference anything before returning. * In either case ni_startdir will be dereferenced and NULLed * out. */ error = lookup(ndp); if (error) break; /* * Check for encountering a symbolic link. Trivial * termination occurs if no symlink encountered. * Note: zfree is safe because error is 0, so we will * not zfree it again when we break. 
*/ if ((cnp->cn_flags & ISSYMLINK) == 0) { nfsrv_object_create(ndp->ni_vp); if (cnp->cn_flags & (SAVENAME | SAVESTART)) cnp->cn_flags |= HASBUF; else zfree(namei_zone, cnp->cn_pnbuf); break; } /* * Validate symlink */ if ((cnp->cn_flags & LOCKPARENT) && ndp->ni_pathlen == 1) VOP_UNLOCK(ndp->ni_dvp, 0, p); if (!pubflag) { error = EINVAL; goto badlink2; } if (ndp->ni_loopcnt++ >= MAXSYMLINKS) { error = ELOOP; goto badlink2; } if (ndp->ni_pathlen > 1) cp = zalloc(namei_zone); else cp = cnp->cn_pnbuf; aiov.iov_base = cp; aiov.iov_len = MAXPATHLEN; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_procp = (struct proc *)0; auio.uio_resid = MAXPATHLEN; error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred); if (error) { badlink1: if (ndp->ni_pathlen > 1) zfree(namei_zone, cp); badlink2: vrele(ndp->ni_dvp); vput(ndp->ni_vp); break; } linklen = MAXPATHLEN - auio.uio_resid; if (linklen == 0) { error = ENOENT; goto badlink1; } if (linklen + ndp->ni_pathlen >= MAXPATHLEN) { error = ENAMETOOLONG; goto badlink1; } /* * Adjust or replace path */ if (ndp->ni_pathlen > 1) { bcopy(ndp->ni_next, cp + linklen, ndp->ni_pathlen); zfree(namei_zone, cnp->cn_pnbuf); cnp->cn_pnbuf = cp; } else cnp->cn_pnbuf[linklen] = '\0'; ndp->ni_pathlen += linklen; /* * Cleanup refs for next loop and check if root directory * should replace current directory. Normally ni_dvp * becomes the new base directory and is cleaned up when * we loop. Explicitly null pointers after invalidation * to clarify operation. */ vput(ndp->ni_vp); ndp->ni_vp = NULL; if (cnp->cn_pnbuf[0] == '/') { vrele(ndp->ni_dvp); ndp->ni_dvp = ndp->ni_rootdir; VREF(ndp->ni_dvp); } ndp->ni_startdir = ndp->ni_dvp; ndp->ni_dvp = NULL; } /* * nfs_namei() guarentees that fields will not contain garbage * whether an error occurs or not. This allows the caller to track * cleanup state trivially. */ out: if (error) { zfree(namei_zone, cnp->cn_pnbuf); ndp->ni_vp = NULL; ndp->ni_dvp = NULL; ndp->ni_startdir = NULL; cnp->cn_flags &= ~HASBUF; } else if ((ndp->ni_cnd.cn_flags & (WANTPARENT|LOCKPARENT)) == 0) { ndp->ni_dvp = NULL; } return (error); } /* * A fiddled version of m_adj() that ensures null fill to a long * boundary and only trims off the back end */ void nfsm_adj(mp, len, nul) struct mbuf *mp; register int len; int nul; { register struct mbuf *m; register int count, i; register char *cp; /* * Trim from tail. Scan the mbuf chain, * calculating its length and finding the last mbuf. * If the adjustment only affects this mbuf, then just * adjust and return. Otherwise, rescan and truncate * after the remaining size. */ count = 0; m = mp; for (;;) { count += m->m_len; if (m->m_next == (struct mbuf *)0) break; m = m->m_next; } if (m->m_len > len) { m->m_len -= len; if (nul > 0) { cp = mtod(m, caddr_t)+m->m_len-nul; for (i = 0; i < nul; i++) *cp++ = '\0'; } return; } count -= len; if (count < 0) count = 0; /* * Correct length for chain is "count". * Find the mbuf with last data, adjust its length, * and toss data from remaining mbufs on chain. */ for (m = mp; m; m = m->m_next) { if (m->m_len >= count) { m->m_len = count; if (nul > 0) { cp = mtod(m, caddr_t)+m->m_len-nul; for (i = 0; i < nul; i++) *cp++ = '\0'; } break; } count -= m->m_len; } for (m = m->m_next;m;m = m->m_next) m->m_len = 0; } /* * Make these functions instead of macros, so that the kernel text size * doesn't get too big... 
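 *
 * For orientation, a sketch of the NFSv3 wire layout produced below (not
 * normative): nfsm_srvwcc() emits the weak cache consistency block -- a
 * boolean follows-flag and, when before_ret == 0, the pre-operation size
 * as a 64-bit hyper plus mtime and ctime, 7 32-bit words in all -- and
 * then chains to nfsm_srvpostopattr() for the post-operation attributes.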
*/ void nfsm_srvwcc(nfsd, before_ret, before_vap, after_ret, after_vap, mbp, bposp) struct nfsrv_descript *nfsd; int before_ret; register struct vattr *before_vap; int after_ret; struct vattr *after_vap; struct mbuf **mbp; char **bposp; { register struct mbuf *mb = *mbp, *mb2; register char *bpos = *bposp; register u_int32_t *tl; if (before_ret) { nfsm_build(tl, u_int32_t *, NFSX_UNSIGNED); *tl = nfs_false; } else { nfsm_build(tl, u_int32_t *, 7 * NFSX_UNSIGNED); *tl++ = nfs_true; txdr_hyper(before_vap->va_size, tl); tl += 2; txdr_nfsv3time(&(before_vap->va_mtime), tl); tl += 2; txdr_nfsv3time(&(before_vap->va_ctime), tl); } *bposp = bpos; *mbp = mb; nfsm_srvpostopattr(nfsd, after_ret, after_vap, mbp, bposp); } void nfsm_srvpostopattr(nfsd, after_ret, after_vap, mbp, bposp) struct nfsrv_descript *nfsd; int after_ret; struct vattr *after_vap; struct mbuf **mbp; char **bposp; { register struct mbuf *mb = *mbp, *mb2; register char *bpos = *bposp; register u_int32_t *tl; register struct nfs_fattr *fp; if (after_ret) { nfsm_build(tl, u_int32_t *, NFSX_UNSIGNED); *tl = nfs_false; } else { nfsm_build(tl, u_int32_t *, NFSX_UNSIGNED + NFSX_V3FATTR); *tl++ = nfs_true; fp = (struct nfs_fattr *)tl; nfsm_srvfattr(nfsd, after_vap, fp); } *mbp = mb; *bposp = bpos; } void nfsm_srvfattr(nfsd, vap, fp) register struct nfsrv_descript *nfsd; register struct vattr *vap; register struct nfs_fattr *fp; { fp->fa_nlink = txdr_unsigned(vap->va_nlink); fp->fa_uid = txdr_unsigned(vap->va_uid); fp->fa_gid = txdr_unsigned(vap->va_gid); if (nfsd->nd_flag & ND_NFSV3) { fp->fa_type = vtonfsv3_type(vap->va_type); fp->fa_mode = vtonfsv3_mode(vap->va_mode); txdr_hyper(vap->va_size, &fp->fa3_size); txdr_hyper(vap->va_bytes, &fp->fa3_used); fp->fa3_rdev.specdata1 = txdr_unsigned(umajor(vap->va_rdev)); fp->fa3_rdev.specdata2 = txdr_unsigned(uminor(vap->va_rdev)); fp->fa3_fsid.nfsuquad[0] = 0; fp->fa3_fsid.nfsuquad[1] = txdr_unsigned(vap->va_fsid); fp->fa3_fileid.nfsuquad[0] = 0; fp->fa3_fileid.nfsuquad[1] = txdr_unsigned(vap->va_fileid); txdr_nfsv3time(&vap->va_atime, &fp->fa3_atime); txdr_nfsv3time(&vap->va_mtime, &fp->fa3_mtime); txdr_nfsv3time(&vap->va_ctime, &fp->fa3_ctime); } else { fp->fa_type = vtonfsv2_type(vap->va_type); fp->fa_mode = vtonfsv2_mode(vap->va_type, vap->va_mode); fp->fa2_size = txdr_unsigned(vap->va_size); fp->fa2_blocksize = txdr_unsigned(vap->va_blocksize); if (vap->va_type == VFIFO) fp->fa2_rdev = 0xffffffff; else fp->fa2_rdev = txdr_unsigned(vap->va_rdev); fp->fa2_blocks = txdr_unsigned(vap->va_bytes / NFS_FABLKSIZE); fp->fa2_fsid = txdr_unsigned(vap->va_fsid); fp->fa2_fileid = txdr_unsigned(vap->va_fileid); txdr_nfsv2time(&vap->va_atime, &fp->fa2_atime); txdr_nfsv2time(&vap->va_mtime, &fp->fa2_mtime); txdr_nfsv2time(&vap->va_ctime, &fp->fa2_ctime); } } /* * nfsrv_fhtovp() - convert a fh to a vnode ptr (optionally locked) * - look up fsid in mount list (if not found ret error) * - get vp and export rights by calling VFS_FHTOVP() * - if cred->cr_uid == 0 or MNT_EXPORTANON set it to credanon * - if not lockflag unlock it with VOP_UNLOCK() */ int nfsrv_fhtovp(fhp, lockflag, vpp, cred, slp, nam, rdonlyp, kerbflag, pubflag) fhandle_t *fhp; int lockflag; struct vnode **vpp; struct ucred *cred; struct nfssvc_sock *slp; struct sockaddr *nam; int *rdonlyp; int kerbflag; int pubflag; { struct proc *p = curproc; /* XXX */ register struct mount *mp; register int i; struct ucred *credanon; int error, exflags; #ifdef MNT_EXNORESPORT /* XXX needs mountd and /etc/exports help yet */ struct sockaddr_int *saddr; #endif 
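	/*
	 * Typical use, as an illustrative sketch only (the real callers
	 * live in the server RPC handlers, e.g. nfs_serv.c):
	 *
	 *	error = nfsrv_fhtovp(fhp, TRUE, &vp, cred, slp, nam,
	 *	    &rdonly, kerbflag, pubflag);
	 *	if (error)
	 *		reply with nfsrv_errmap(nd, error);
	 *	... operate on the locked vp ...
	 *	vput(vp);
	 *
	 * With lockflag == FALSE the vnode comes back referenced but
	 * unlocked, and vrele() is the matching release.
	 */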
*vpp = (struct vnode *)0; if (nfs_ispublicfh(fhp)) { if (!pubflag || !nfs_pub.np_valid) return (ESTALE); fhp = &nfs_pub.np_handle; } mp = vfs_getvfs(&fhp->fh_fsid); if (!mp) return (ESTALE); error = VFS_FHTOVP(mp, &fhp->fh_fid, nam, vpp, &exflags, &credanon); if (error) return (error); #ifdef MNT_EXNORESPORT if (!(exflags & (MNT_EXNORESPORT|MNT_EXPUBLIC))) { saddr = (struct sockaddr_in *)nam; if (saddr->sin_family == AF_INET && ntohs(saddr->sin_port) >= IPPORT_RESERVED) { vput(*vpp); return (NFSERR_AUTHERR | AUTH_TOOWEAK); } } #endif /* * Check/setup credentials. */ if (exflags & MNT_EXKERB) { if (!kerbflag) { vput(*vpp); return (NFSERR_AUTHERR | AUTH_TOOWEAK); } } else if (kerbflag) { vput(*vpp); return (NFSERR_AUTHERR | AUTH_TOOWEAK); } else if (cred->cr_uid == 0 || (exflags & MNT_EXPORTANON)) { cred->cr_uid = credanon->cr_uid; for (i = 0; i < credanon->cr_ngroups && i < NGROUPS; i++) cred->cr_groups[i] = credanon->cr_groups[i]; cred->cr_ngroups = i; } if (exflags & MNT_EXRDONLY) *rdonlyp = 1; else *rdonlyp = 0; nfsrv_object_create(*vpp); if (!lockflag) VOP_UNLOCK(*vpp, 0, p); return (0); } /* * WebNFS: check if a filehandle is a public filehandle. For v3, this * means a length of 0, for v2 it means all zeroes. nfsm_srvmtofh has * transformed this to all zeroes in both cases, so check for it. */ int nfs_ispublicfh(fhp) fhandle_t *fhp; { char *cp = (char *)fhp; int i; for (i = 0; i < NFSX_V3FH; i++) if (*cp++ != 0) return (FALSE); return (TRUE); } #endif /* NFS_NOSERVER */ /* * This function compares two net addresses by family and returns TRUE * if they are the same host. * If there is any doubt, return FALSE. * The AF_INET family is handled as a special case so that address mbufs * don't need to be saved to store "struct in_addr", which is only 4 bytes. */ int netaddr_match(family, haddr, nam) int family; union nethostaddr *haddr; struct sockaddr *nam; { register struct sockaddr_in *inetaddr; switch (family) { case AF_INET: inetaddr = (struct sockaddr_in *)nam; if (inetaddr->sin_family == AF_INET && inetaddr->sin_addr.s_addr == haddr->had_inetaddr) return (1); break; #ifdef ISO case AF_ISO: { register struct sockaddr_iso *isoaddr1, *isoaddr2; isoaddr1 = (struct sockaddr_iso *)nam; isoaddr2 = (struct sockaddr_iso *)haddr->had_nam; if (isoaddr1->siso_family == AF_ISO && isoaddr1->siso_nlen > 0 && isoaddr1->siso_nlen == isoaddr2->siso_nlen && SAME_ISOADDR(isoaddr1, isoaddr2)) return (1); break; } #endif /* ISO */ default: break; }; return (0); } static nfsuint64 nfs_nullcookie = { { 0, 0 } }; /* * This function finds the directory cookie that corresponds to the * logical byte offset given. 
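 *
 * A quick worked example of the mapping done below (constants left
 * symbolic on purpose): an offset of 3 * NFS_DIRBLKSIZ gives pos = 3,
 * which is decremented to 2 because offset 0 always maps to the static
 * null cookie; slot 2 then lives in the first nfsdmap block as long as
 * NFSNUMCOOKIES > 2, otherwise the loop walks (or, when add is set,
 * allocates) further blocks of NFSNUMCOOKIES cookies each.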
*/ nfsuint64 * nfs_getcookie(np, off, add) register struct nfsnode *np; off_t off; int add; { register struct nfsdmap *dp, *dp2; register int pos; pos = (uoff_t)off / NFS_DIRBLKSIZ; if (pos == 0 || off < 0) { #ifdef DIAGNOSTIC if (add) panic("nfs getcookie add at <= 0"); #endif return (&nfs_nullcookie); } pos--; dp = np->n_cookies.lh_first; if (!dp) { if (add) { MALLOC(dp, struct nfsdmap *, sizeof (struct nfsdmap), M_NFSDIROFF, M_WAITOK); dp->ndm_eocookie = 0; LIST_INSERT_HEAD(&np->n_cookies, dp, ndm_list); } else return ((nfsuint64 *)0); } while (pos >= NFSNUMCOOKIES) { pos -= NFSNUMCOOKIES; if (dp->ndm_list.le_next) { if (!add && dp->ndm_eocookie < NFSNUMCOOKIES && pos >= dp->ndm_eocookie) return ((nfsuint64 *)0); dp = dp->ndm_list.le_next; } else if (add) { MALLOC(dp2, struct nfsdmap *, sizeof (struct nfsdmap), M_NFSDIROFF, M_WAITOK); dp2->ndm_eocookie = 0; LIST_INSERT_AFTER(dp, dp2, ndm_list); dp = dp2; } else return ((nfsuint64 *)0); } if (pos >= dp->ndm_eocookie) { if (add) dp->ndm_eocookie = pos + 1; else return ((nfsuint64 *)0); } return (&dp->ndm_cookies[pos]); } /* * Invalidate cached directory information, except for the actual directory * blocks (which are invalidated separately). * Done mainly to avoid the use of stale offset cookies. */ void nfs_invaldir(vp) register struct vnode *vp; { register struct nfsnode *np = VTONFS(vp); #ifdef DIAGNOSTIC if (vp->v_type != VDIR) panic("nfs: invaldir not dir"); #endif np->n_direofoffset = 0; np->n_cookieverf.nfsuquad[0] = 0; np->n_cookieverf.nfsuquad[1] = 0; if (np->n_cookies.lh_first) np->n_cookies.lh_first->ndm_eocookie = 0; } /* * The write verifier has changed (probably due to a server reboot), so all * B_NEEDCOMMIT blocks will have to be written again. Since they are on the * dirty block list as B_DELWRI, all this takes is clearing the B_NEEDCOMMIT * flag. Once done the new write verifier can be set for the mount point. */ void nfs_clearcommit(mp) struct mount *mp; { register struct vnode *vp, *nvp; register struct buf *bp, *nbp; int s; s = splbio(); loop: for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) { if (vp->v_mount != mp) /* Paranoia */ goto loop; nvp = vp->v_mntvnodes.le_next; for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { nbp = TAILQ_NEXT(bp, b_vnbufs); if (BUF_REFCNT(bp) == 0 && (bp->b_flags & (B_DELWRI | B_NEEDCOMMIT)) == (B_DELWRI | B_NEEDCOMMIT)) bp->b_flags &= ~B_NEEDCOMMIT; } } splx(s); } #ifndef NFS_NOSERVER /* * Map errnos to NFS error numbers. For Version 3 also filter out error * numbers not specified for the associated procedure. */ int nfsrv_errmap(nd, err) struct nfsrv_descript *nd; register int err; { register short *defaulterrp, *errp; if (nd->nd_flag & ND_NFSV3) { if (nd->nd_procnum <= NFSPROC_COMMIT) { errp = defaulterrp = nfsrv_v3errmap[nd->nd_procnum]; while (*++errp) { if (*errp == err) return (err); else if (*errp > err) break; } return ((int)*defaulterrp); } else return (err & 0xffff); } if (err <= ELAST) return ((int)nfsrv_v2errmap[err - 1]); return (NFSERR_IO); } int nfsrv_object_create(vp) struct vnode *vp; { if (vp == NULL || vp->v_type != VREG) return (1); return (vfs_object_create(vp, curproc, curproc ? curproc->p_ucred : NULL)); } /* * Sort the group list in increasing numerical order. * (Insertion sort by Chris Torek, who was grossed out by the bubble sort * that used to be here.) */ void nfsrvw_sort(list, num) register gid_t *list; register int num; { register int i, j; gid_t v; /* Insertion sort. 
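 *
 * For example (illustrative values): sorting the group list {5, 3, 4}
 * proceeds as i = 1 takes v = 3 and shifts the 5 up, giving {3, 5, 4},
 * then i = 2 takes v = 4 and shifts only the 5, ending with {3, 4, 5};
 * the sorted order is what lets nfsrv_setcred() results be compared
 * with a plain bcmp().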
*/ for (i = 1; i < num; i++) { v = list[i]; /* find correct slot for value v, moving others up */ for (j = i; --j >= 0 && v < list[j];) list[j + 1] = list[j]; list[j + 1] = v; } } /* * copy credentials making sure that the result can be compared with bcmp(). */ void nfsrv_setcred(incred, outcred) register struct ucred *incred, *outcred; { register int i; bzero((caddr_t)outcred, sizeof (struct ucred)); outcred->cr_ref = 1; outcred->cr_uid = incred->cr_uid; outcred->cr_ngroups = incred->cr_ngroups; for (i = 0; i < incred->cr_ngroups; i++) outcred->cr_groups[i] = incred->cr_groups[i]; nfsrvw_sort(outcred->cr_groups, outcred->cr_ngroups); } #endif /* NFS_NOSERVER */ Index: head/sys/ntfs/ntfs_compr.c =================================================================== --- head/sys/ntfs/ntfs_compr.c (revision 49534) +++ head/sys/ntfs/ntfs_compr.c (revision 49535) @@ -1,120 +1,118 @@ /* $NetBSD: ntfs_compr.c,v 1.2 1999/05/06 15:43:18 christos Exp $ */ /*- * Copyright (c) 1998, 1999 Semen Ustimenko * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $Id: ntfs_compr.c,v 1.3 1999/04/20 21:06:43 semenu Exp $ + * $Id: ntfs_compr.c,v 1.4 1999/05/12 09:42:54 semenu Exp $ */ #include #include #include #include #include #include #include #include #include #include #ifdef __FreeBSD__ #include #endif - -#include #include #include #define GET_UINT16(addr) (*((u_int16_t *)(addr))) int ntfs_uncompblock( u_int8_t * buf, u_int8_t * cbuf) { u_int32_t ctag; int len, dshift, lmask; int blen, boff; int i, j; int pos, cpos; len = GET_UINT16(cbuf) & 0xFFF; dprintf(("ntfs_uncompblock: block length: %d + 3, 0x%x,0x%04x\n", len, len, GET_UINT16(cbuf))); if (!(GET_UINT16(cbuf) & 0x8000)) { if ((len + 1) != NTFS_COMPBLOCK_SIZE) { dprintf(("ntfs_uncompblock: len: %x instead of %d\n", len, 0xfff)); } memcpy(buf, cbuf + 2, len + 1); bzero(buf + len + 1, NTFS_COMPBLOCK_SIZE - 1 - len); return len + 3; } cpos = 2; pos = 0; while ((cpos < len + 3) && (pos < NTFS_COMPBLOCK_SIZE)) { ctag = cbuf[cpos++]; for (i = 0; (i < 8) && (pos < NTFS_COMPBLOCK_SIZE); i++) { if (ctag & 1) { for (j = pos - 1, lmask = 0xFFF, dshift = 12; j >= 0x10; j >>= 1) { dshift--; lmask >>= 1; } boff = -1 - (GET_UINT16(cbuf + cpos) >> dshift); blen = 3 + (GET_UINT16(cbuf + cpos) & lmask); for (j = 0; (j < blen) && (pos < NTFS_COMPBLOCK_SIZE); j++) { buf[pos] = buf[pos + boff]; pos++; } cpos += 2; } else { buf[pos++] = cbuf[cpos++]; } ctag >>= 1; } } return len + 3; } int ntfs_uncompunit( struct ntfsmount * ntmp, u_int8_t * uup, u_int8_t * cup) { int i; int off = 0; int new; for (i = 0; i * NTFS_COMPBLOCK_SIZE < ntfs_cntob(NTFS_COMPUNIT_CL); i++) { new = ntfs_uncompblock(uup + i * NTFS_COMPBLOCK_SIZE, cup + off); if (new == 0) return (EINVAL); off += new; } return (0); } Index: head/sys/ntfs/ntfs_subr.c =================================================================== --- head/sys/ntfs/ntfs_subr.c (revision 49534) +++ head/sys/ntfs/ntfs_subr.c (revision 49535) @@ -1,1901 +1,1899 @@ /* $NetBSD: ntfs_subr.c,v 1.2 1999/05/06 15:43:19 christos Exp $ */ /*- * Copyright (c) 1998, 1999 Semen Ustimenko (semenu@FreeBSD.org) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $Id: ntfs_subr.c,v 1.3 1999/04/20 21:06:43 semenu Exp $ + * $Id: ntfs_subr.c,v 1.4 1999/05/12 09:43:01 semenu Exp $ */ #include #include #include #include #include #include #include #include #include #include #include #if defined(__FreeBSD__) #include #endif - -#include /* #define NTFS_DEBUG 1 */ #include #include #include #include #include #include #include #include #if defined(__FreeBSD__) MALLOC_DEFINE(M_NTFSNTVATTR, "NTFS vattr", "NTFS file attribute information"); MALLOC_DEFINE(M_NTFSRDATA, "NTFS res data", "NTFS resident data"); MALLOC_DEFINE(M_NTFSRUN, "NTFS vrun", "NTFS vrun storage"); MALLOC_DEFINE(M_NTFSDECOMP, "NTFS decomp", "NTFS decompression temporary"); #endif /* * */ int ntfs_ntvattrrele( struct ntvattr * vap) { dprintf(("ntfs_ntvattrrele: ino: %d, type: 0x%x\n", vap->va_ip->i_number, vap->va_type)); ntfs_ntrele(vap->va_ip); return (0); } /* * Search attribute specifed in ntnode (load ntnode if nessecary). * If not found but ATTR_A_ATTRLIST present, read it in and search throught. * VOP_VGET node needed, and lookup througth it's ntnode (load if nessesary). * * ntnode should be locked */ int ntfs_ntvattrget( struct ntfsmount * ntmp, struct ntnode * ip, u_int32_t type, char *name, cn_t vcn, struct ntvattr ** vapp) { int error; struct ntvattr *vap; struct ntvattr *lvap = NULL; struct attr_attrlist *aalp; struct attr_attrlist *nextaalp; caddr_t alpool; int len, namelen; *vapp = NULL; if (name) { dprintf(("ntfs_ntvattrget: " \ "ino: %d, type: 0x%x, name: %s, vcn: %d\n", \ ip->i_number, type, name, (u_int32_t) vcn)); namelen = strlen(name); } else { dprintf(("ntfs_ntvattrget: " \ "ino: %d, type: 0x%x, vcn: %d\n", \ ip->i_number, type, (u_int32_t) vcn)); name = ""; namelen = 0; } if((ip->i_flag & IN_LOADED) == 0) { dprintf(("ntfs_ntvattrget: node not loaded, ino: %d\n", ip->i_number)); error = ntfs_loadntnode(ntmp,ip); if(error) { printf("ntfs_ntvattrget: FAILED TO LOAD INO: %d\n", ip->i_number); return (error); } } for (vap = ip->i_valist.lh_first; vap; vap = vap->va_list.le_next) { ddprintf(("type: 0x%x, vcn: %d - %d\n", \ vap->va_type, (u_int32_t) vap->va_vcnstart, \ (u_int32_t) vap->va_vcnend)); if ((vap->va_type == type) && (vap->va_vcnstart <= vcn) && (vap->va_vcnend >= vcn) && (vap->va_namelen == namelen) && (!strncmp(name, vap->va_name, namelen))) { *vapp = vap; ntfs_ntref(vap->va_ip); return (0); } if (vap->va_type == NTFS_A_ATTRLIST) lvap = vap; } if (!lvap) { dprintf(("ntfs_ntvattrget: UNEXISTED ATTRIBUTE: " \ "ino: %d, type: 0x%x, name: %s, vcn: %d\n", \ ip->i_number, type, name, (u_int32_t) vcn)); return (ENOENT); } /* Scan $ATTRIBUTE_LIST for requested attribute */ len = lvap->va_datalen; MALLOC(alpool, caddr_t, len, M_TEMP, M_WAITOK); error = ntfs_readntvattr_plain(ntmp, ip, lvap, 0, len, alpool, &len); if (error) goto out; aalp = (struct attr_attrlist *) alpool; nextaalp = NULL; while (len > 0) { dprintf(("ntfs_ntvattrget: " \ "attrlist: ino: %d, attr: 0x%x, vcn: %d\n", \ aalp->al_inumber, aalp->al_type, \ (u_int32_t) aalp->al_vcnstart)); if (len > aalp->reclen) { nextaalp = NTFS_NEXTREC(aalp, struct attr_attrlist *); } else { nextaalp = NULL; } len -= aalp->reclen; #define AALPCMP(aalp,type,name,namelen) ( \ (aalp->al_type == type) && (aalp->al_namelen == namelen) && \ !uastrcmp(aalp->al_name,aalp->al_namelen,name,namelen) ) if (AALPCMP(aalp, type, name, namelen) && (!nextaalp || (nextaalp->al_vcnstart > vcn) || !AALPCMP(nextaalp, type, name, namelen))) { struct vnode *newvp; struct ntnode *newip; dprintf(("ntfs_ntvattrget: attrbute in ino: %d\n", 
aalp->al_inumber)); /* error = VFS_VGET(ntmp->ntm_mountp, aalp->al_inumber, &newvp); */ error = ntfs_vgetex(ntmp->ntm_mountp, aalp->al_inumber, NTFS_A_DATA, NULL, LK_EXCLUSIVE, VG_EXT, curproc, &newvp); if (error) { printf("ntfs_ntvattrget: CAN'T VGET INO: %d\n", aalp->al_inumber); goto out; } newip = VTONT(newvp); /* XXX have to lock ntnode */ if(~newip->i_flag & IN_LOADED) { dprintf(("ntfs_ntvattrget: node not loaded," \ " ino: %d\n", newip->i_number)); error = ntfs_loadntnode(ntmp,ip); if(error) { printf("ntfs_ntvattrget: CAN'T LOAD " \ "INO: %d\n", newip->i_number); vput(newvp); goto out; } } for (vap = newip->i_valist.lh_first; vap; vap = vap->va_list.le_next) { if ((vap->va_type == type) && (vap->va_vcnstart <= vcn) && (vap->va_vcnend >= vcn) && (vap->va_namelen == namelen) && (!strncmp(name, vap->va_name, namelen))) { *vapp = vap; ntfs_ntref(vap->va_ip); vput(newvp); error = 0; goto out; } if (vap->va_type == NTFS_A_ATTRLIST) lvap = vap; } printf("ntfs_ntvattrget: ATTRLIST ERROR.\n"); vput(newvp); break; } #undef AALPCMP aalp = nextaalp; } error = ENOENT; dprintf(("ntfs_ntvattrget: UNEXISTED ATTRIBUTE: " \ "ino: %d, type: 0x%x, name: %s, vcn: %d\n", \ ip->i_number, type, name, (u_int32_t) vcn)); out: FREE(alpool, M_TEMP); return (error); } /* * Read ntnode from disk, make ntvattr list. * * ntnode should be locked */ int ntfs_loadntnode( struct ntfsmount * ntmp, struct ntnode * ip) { struct filerec *mfrp; daddr_t bn; int error,off; struct attr *ap; struct ntvattr *nvap; dprintf(("ntfs_loadnode: loading ino: %d\n",ip->i_number)); MALLOC(mfrp, struct filerec *, ntfs_bntob(ntmp->ntm_bpmftrec), M_TEMP, M_WAITOK); if (ip->i_number < NTFS_SYSNODESNUM) { struct buf *bp; dprintf(("ntfs_loadnode: read system node\n")); bn = ntfs_cntobn(ntmp->ntm_mftcn) + ntmp->ntm_bpmftrec * ip->i_number; error = bread(ntmp->ntm_devvp, bn, ntfs_bntob(ntmp->ntm_bpmftrec), NOCRED, &bp); if (error) { printf("ntfs_loadnode: BREAD FAILED\n"); brelse(bp); goto out; } memcpy(mfrp, bp->b_data, ntfs_bntob(ntmp->ntm_bpmftrec)); bqrelse(bp); } else { struct vnode *vp; vp = ntmp->ntm_sysvn[NTFS_MFTINO]; error = ntfs_readattr(ntmp, VTONT(vp), NTFS_A_DATA, NULL, ip->i_number * ntfs_bntob(ntmp->ntm_bpmftrec), ntfs_bntob(ntmp->ntm_bpmftrec), mfrp); if (error) { printf("ntfs_loadnode: ntfs_readattr failed\n"); goto out; } } /* Check if magic and fixups are correct */ error = ntfs_procfixups(ntmp, NTFS_FILEMAGIC, (caddr_t)mfrp, ntfs_bntob(ntmp->ntm_bpmftrec)); if (error) { printf("ntfs_loadnode: BAD MFT RECORD %d\n", (u_int32_t) ip->i_number); goto out; } dprintf(("ntfs_loadnode: load attrs for ino: %d\n",ip->i_number)); off = mfrp->fr_attroff; ap = (struct attr *) ((caddr_t)mfrp + off); LIST_INIT(&ip->i_valist); while (ap->a_hdr.a_type != -1) { error = ntfs_attrtontvattr(ntmp, &nvap, ap); if (error) break; nvap->va_ip = ip; LIST_INSERT_HEAD(&ip->i_valist, nvap, va_list); off += ap->a_hdr.reclen; ap = (struct attr *) ((caddr_t)mfrp + off); } if (error) { printf("ntfs_loadnode: failed to load attr ino: %d\n", ip->i_number); goto out; } ip->i_mainrec = mfrp->fr_mainrec; ip->i_nlink = mfrp->fr_nlink; ip->i_frflag = mfrp->fr_flags; ip->i_flag |= IN_LOADED; out: FREE(mfrp, M_TEMP); return (error); } /* * Routine locks ntnode and increase usecount, just opposite of * ntfs_ntput. 
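 *
 * Usage pattern, as a sketch of the convention used throughout this file:
 *
 *	error = ntfs_ntget(ip);		bumps i_usecount, takes i_lock
 *	if (error)
 *		return (error);
 *	... work on the locked ntnode ...
 *	ntfs_ntput(ip);			drops i_lock, releases the usecount
 *
 * i_lock itself is a tiny hand-rolled sleep lock: a waiter sets it to -1
 * and tsleep()s on it, and the holder wakeup()s when it sees -1 at
 * release time.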
*/ int ntfs_ntget( struct ntnode *ip) { dprintf(("ntfs_ntget: get ntnode %d: %p, usecount: %d\n", ip->i_number, ip, ip->i_usecount)); ip->i_usecount++; restart: if (ip->i_lock) { while (ip->i_lock) { ip->i_lock = -1; tsleep(&ip->i_lock, PVM, "ntnode", 0); } goto restart; } ip->i_lock = 1; return 0; } /* * Routine search ntnode in hash, if found: lock, inc usecount and return. * If not in hash allocate structure for ntnode, prefill it, lock, * inc count and return. * * ntnode returned locked */ static int ntfs_ntnode_hash_lock; int ntfs_ntlookup( struct ntfsmount * ntmp, ino_t ino, struct ntnode ** ipp) { struct ntnode *ip; dprintf(("ntfs_ntlookup: for ntnode %d\n", ino)); *ipp = NULL; restart: ip = ntfs_nthashlookup(ntmp->ntm_dev, ino); /* XXX */ if (ip) { ntfs_ntget(ip); *ipp = ip; dprintf(("ntfs_ntlookup: ntnode %d: %p, usecount: %d\n", ino, ip, ip->i_usecount)); return (0); } if (ntfs_ntnode_hash_lock) { while(ntfs_ntnode_hash_lock) { ntfs_ntnode_hash_lock = -1; tsleep(&ntfs_ntnode_hash_lock, PVM, "ntfsntgt", 0); } goto restart; } ntfs_ntnode_hash_lock = 1; MALLOC(ip, struct ntnode *, sizeof(struct ntnode), M_NTFSNTNODE, M_WAITOK); ddprintf(("ntfs_ntlookup: allocating ntnode: %d: %p\n", ino, ip)); bzero((caddr_t) ip, sizeof(struct ntnode)); /* Generic initialization */ ip->i_number = ino; ip->i_mp = ntmp; ip->i_dev = ntmp->ntm_dev; ip->i_uid = ntmp->ntm_uid; ip->i_gid = ntmp->ntm_gid; ip->i_mode = ntmp->ntm_mode; ip->i_usecount++; ip->i_lock = 1; LIST_INIT(&ip->i_fnlist); ntfs_nthashins(ip); if (ntfs_ntnode_hash_lock < 0) wakeup(&ntfs_ntnode_hash_lock); ntfs_ntnode_hash_lock = 0; *ipp = ip; dprintf(("ntfs_ntlookup: ntnode %d: %p, usecount: %d\n", ino, ip, ip->i_usecount)); return (0); } /* * Decrement usecount of ntnode and unlock it, if usecount reach zero, * deallocate ntnode. * * ntnode should be locked on entry, and unlocked on return. */ void ntfs_ntput( struct ntnode *ip) { struct ntvattr *vap; if (!ip->i_lock) printf("ntfs_ntput: NOT LOCKED"); dprintf(("ntfs_ntput: rele ntnode %d: %p, usecount: %d\n", ip->i_number, ip, ip->i_usecount)); ip->i_usecount--; if (ip->i_usecount < 0) { panic("ntfs_ntput: ino: %d usecount: %d \n", ip->i_number,ip->i_usecount); } else if (ip->i_usecount == 0) { dprintf(("ntfs_ntput: deallocating ntnode: %d\n", ip->i_number)); if (ip->i_fnlist.lh_first) panic("ntfs_ntput: ntnode has fnodes\n"); ntfs_nthashrem(ip); while (ip->i_valist.lh_first != NULL) { vap = ip->i_valist.lh_first; LIST_REMOVE(vap,va_list); ntfs_freentvattr(vap); } FREE(ip, M_NTFSNTNODE); } else { if (ip->i_lock < 0) wakeup(&ip->i_lock); ip->i_lock = 0; } } /* * Decrement usecount of ntnode. */ void ntfs_ntrele( struct ntnode * ip) { dprintf(("ntfs_ntrele: rele ntnode %d: %p, usecount: %d\n", ip->i_number, ip, ip->i_usecount)); ip->i_usecount--; if (ip->i_usecount < 0) panic("ntfs_ntrele: ino: %d usecount: %d \n", ip->i_number,ip->i_usecount); } /* * Deallocate all memory allocated for ntvattr by call to * ntfs_attrtontvattr and some other functions. */ void ntfs_freentvattr( struct ntvattr * vap) { if (vap->va_flag & NTFS_AF_INRUN) { if (vap->va_vruncn) FREE(vap->va_vruncn, M_NTFSRUN); if (vap->va_vruncl) FREE(vap->va_vruncl, M_NTFSRUN); } else { if (vap->va_datap) FREE(vap->va_datap, M_NTFSRDATA); } FREE(vap, M_NTFSNTVATTR); } /* * Convert disk image of attribute into ntvattr structure, * runs are expanded also. 
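 *
 * The on-disk run list expanded here is worth a worked example (bytes
 * invented for illustration): the sequence 21 04 34 12 00 has a header
 * byte 0x21, i.e. one length byte and two offset bytes, so it describes
 * a run of 0x04 clusters whose start is the previous run's start plus
 * the signed delta 0x1234 (just 0x1234 for the first run); the trailing
 * 00 terminates the list.  ntfs_runtovrun() below turns such a list into
 * the parallel va_vruncn/va_vruncl arrays.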
*/ int ntfs_attrtontvattr( struct ntfsmount * ntmp, struct ntvattr ** rvapp, struct attr * rap) { int error, i; struct ntvattr *vap; error = 0; *rvapp = NULL; MALLOC(vap, struct ntvattr *, sizeof(struct ntvattr), M_NTFSNTVATTR, M_WAITOK); bzero(vap, sizeof(struct ntvattr)); vap->va_ip = NULL; vap->va_flag = rap->a_hdr.a_flag; vap->va_type = rap->a_hdr.a_type; vap->va_compression = rap->a_hdr.a_compression; vap->va_index = rap->a_hdr.a_index; ddprintf(("type: 0x%x, index: %d", vap->va_type, vap->va_index)); vap->va_namelen = rap->a_hdr.a_namelen; if (rap->a_hdr.a_namelen) { wchar *unp = (wchar *) ((caddr_t) rap + rap->a_hdr.a_nameoff); ddprintf((", name:[")); for (i = 0; i < vap->va_namelen; i++) { vap->va_name[i] = unp[i]; ddprintf(("%c", vap->va_name[i])); } ddprintf(("]")); } if (vap->va_flag & NTFS_AF_INRUN) { ddprintf((", nonres.")); vap->va_datalen = rap->a_nr.a_datalen; vap->va_allocated = rap->a_nr.a_allocated; vap->va_vcnstart = rap->a_nr.a_vcnstart; vap->va_vcnend = rap->a_nr.a_vcnend; vap->va_compressalg = rap->a_nr.a_compressalg; error = ntfs_runtovrun(&(vap->va_vruncn), &(vap->va_vruncl), &(vap->va_vruncnt), (caddr_t) rap + rap->a_nr.a_dataoff); } else { vap->va_compressalg = 0; ddprintf((", res.")); vap->va_datalen = rap->a_r.a_datalen; vap->va_allocated = rap->a_r.a_datalen; vap->va_vcnstart = 0; vap->va_vcnend = ntfs_btocn(vap->va_allocated); MALLOC(vap->va_datap, caddr_t, vap->va_datalen, M_NTFSRDATA, M_WAITOK); memcpy(vap->va_datap, (caddr_t) rap + rap->a_r.a_dataoff, rap->a_r.a_datalen); } ddprintf((", len: %d", vap->va_datalen)); if (error) FREE(vap, M_NTFSNTVATTR); else *rvapp = vap; ddprintf(("\n")); return (error); } /* * Expand run into more utilizable and more memory eating format. */ int ntfs_runtovrun( cn_t ** rcnp, cn_t ** rclp, u_long * rcntp, u_int8_t * run) { u_int32_t off; u_int32_t sz, i; cn_t *cn; cn_t *cl; u_long cnt; cn_t prev; cn_t tmp; off = 0; cnt = 0; i = 0; while (run[off]) { off += (run[off] & 0xF) + ((run[off] >> 4) & 0xF) + 1; cnt++; } MALLOC(cn, cn_t *, cnt * sizeof(cn_t), M_NTFSRUN, M_WAITOK); MALLOC(cl, cn_t *, cnt * sizeof(cn_t), M_NTFSRUN, M_WAITOK); off = 0; cnt = 0; prev = 0; while (run[off]) { sz = run[off++]; cl[cnt] = 0; for (i = 0; i < (sz & 0xF); i++) cl[cnt] += (u_int32_t) run[off++] << (i << 3); sz >>= 4; if (run[off + sz - 1] & 0x80) { tmp = ((u_int64_t) - 1) << (sz << 3); for (i = 0; i < sz; i++) tmp |= (u_int64_t) run[off++] << (i << 3); } else { tmp = 0; for (i = 0; i < sz; i++) tmp |= (u_int64_t) run[off++] << (i << 3); } if (tmp) prev = cn[cnt] = prev + tmp; else cn[cnt] = tmp; cnt++; } *rcnp = cn; *rclp = cl; *rcntp = cnt; return (0); } /* * Convert wchar to uppercase wchar, should be macros? */ wchar ntfs_toupper( struct ntfsmount * ntmp, wchar wc) { return (ntmp->ntm_upcase[wc & 0xFF]); } /* * Compare to unicode strings case insensible. */ int ntfs_uustricmp( struct ntfsmount * ntmp, wchar * str1, int str1len, wchar * str2, int str2len) { int i; int res; for (i = 0; i < str1len && i < str2len; i++) { res = (int) ntfs_toupper(ntmp, str1[i]) - (int) ntfs_toupper(ntmp, str2[i]); if (res) return res; } return (str1len - str2len); } /* * Compare unicode and ascii string case insens. 
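 *
 * For example (names purely illustrative): comparing the on-disk Unicode
 * name "Foo" against the ASCII lookup string "fOO" maps both sides
 * through ntfs_toupper(), which indexes the volume's uppercase table
 * ntm_upcase, so every position compares equal and the result is the
 * length difference, 0 here.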
*/ int ntfs_uastricmp( struct ntfsmount * ntmp, const wchar *str1, int str1len, const char *str2, int str2len) { int i; int res; for (i = 0; i < str1len && i < str2len; i++) { res = (int) ntfs_toupper(ntmp, str1[i]) - (int) ntfs_toupper(ntmp, (wchar) str2[i]); if (res) return res; } return (str1len - str2len); } /* * Compare unicode and ascii string case sens. */ int ntfs_uastrcmp( struct ntfsmount *ntmp, const wchar *str1, int str1len, const char *str2, int str2len) { int i; int res; for (i = 0; (i < str1len) && (i < str2len); i++) { res = ((int) str1[i]) - ((int) str2[i]); if (res) return res; } return (str1len - str2len); } /* * Search fnode in ntnode, if not found allocate and preinitialize. * * ntnode should be locked on entry. */ int ntfs_fget( struct ntfsmount *ntmp, struct ntnode *ip, int attrtype, char *attrname, struct fnode **fpp) { struct fnode *fp; dprintf(("ntfs_fget: ino: %d, attrtype: 0x%x, attrname: %s\n", ip->i_number,attrtype, attrname?attrname:"")); *fpp = NULL; for (fp = ip->i_fnlist.lh_first; fp != NULL; fp = fp->f_fnlist.le_next){ dprintf(("ntfs_fget: fnode: attrtype: %d, attrname: %s\n", fp->f_attrtype, fp->f_attrname?fp->f_attrname:"")); if ((attrtype == fp->f_attrtype) && ((!attrname && !fp->f_attrname) || (attrname && fp->f_attrname && !strcmp(attrname,fp->f_attrname)))){ dprintf(("ntfs_fget: found existed: %p\n",fp)); *fpp = fp; } } if (*fpp) return (0); MALLOC(fp, struct fnode *, sizeof(struct fnode), M_NTFSFNODE, M_WAITOK); bzero(fp, sizeof(struct fnode)); dprintf(("ntfs_fget: allocating fnode: %p\n",fp)); fp->f_devvp = ntmp->ntm_devvp; fp->f_dev = ntmp->ntm_dev; fp->f_mp = ntmp; fp->f_ip = ip; fp->f_attrname = attrname; if (fp->f_attrname) fp->f_flag |= FN_AATTRNAME; fp->f_attrtype = attrtype; ntfs_ntref(ip); LIST_INSERT_HEAD(&ip->i_fnlist, fp, f_fnlist); *fpp = fp; return (0); } /* * Deallocate fnode, remove it from ntnode's fnode list. * * ntnode should be locked. */ void ntfs_frele( struct fnode *fp) { struct ntnode *ip = FTONT(fp); dprintf(("ntfs_frele: fnode: %p for %d: %p\n", fp, ip->i_number, ip)); dprintf(("ntfs_frele: deallocating fnode\n")); LIST_REMOVE(fp,f_fnlist); if (fp->f_flag & FN_AATTRNAME) FREE(fp->f_attrname, M_TEMP); if (fp->f_dirblbuf) FREE(fp->f_dirblbuf, M_NTFSDIR); FREE(fp, M_NTFSFNODE); ntfs_ntrele(ip); } /* * Lookup attribute name in format: [[:$ATTR_TYPE]:$ATTR_NAME], * $ATTR_TYPE is searched in attrdefs read from $AttrDefs. * If $ATTR_TYPE nott specifed, ATTR_A_DATA assumed. */ int ntfs_ntlookupattr( struct ntfsmount * ntmp, const char * name, int namelen, int *attrtype, char **attrname) { const char *sys; size_t syslen, i; struct ntvattrdef *adp; if (namelen == 0) return (0); if (name[0] == '$') { sys = name; for (syslen = 0; syslen < namelen; syslen++) { if(sys[syslen] == ':') { name++; namelen--; break; } } name += syslen; namelen -= syslen; adp = ntmp->ntm_ad; for (i = 0; i < ntmp->ntm_adnum; i++){ if((syslen == adp->ad_namelen) && (!strncmp(sys,adp->ad_name,syslen))) { *attrtype = adp->ad_type; if(namelen) { MALLOC((*attrname), char *, namelen, M_TEMP, M_WAITOK); memcpy((*attrname), name, namelen); (*attrname)[namelen] = '\0'; } return (0); } adp++; } return (ENOENT); } if(namelen) { MALLOC((*attrname), char *, namelen, M_TEMP, M_WAITOK); memcpy((*attrname), name, namelen); (*attrname)[namelen] = '\0'; *attrtype = NTFS_A_DATA; } return (0); } /* * Lookup specifed node for filename, matching cnp, * return fnode filled. 
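 *
 * The names accepted here follow the "file[:attrspec]" convention parsed
 * by ntfs_ntlookupattr() above; for instance (file names invented), a
 * lookup of "readme.txt:alt" opens the named $DATA stream "alt" of
 * readme.txt, "readme.txt:$INDEX_ROOT" selects a different attribute
 * type altogether, and a bare "readme.txt" falls through to the unnamed
 * $DATA attribute.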
*/ int ntfs_ntlookupfile( struct ntfsmount * ntmp, struct vnode * vp, struct componentname * cnp, struct vnode ** vpp) { struct fnode *fp = VTOF(vp); struct ntnode *ip = FTONT(fp); struct ntvattr *vap; /* Root attribute */ cn_t cn; /* VCN in current attribute */ caddr_t rdbuf; /* Buffer to read directory's blocks */ u_int32_t blsize; u_int32_t rdsize; /* Length of data to read from current block */ struct attr_indexentry *iep; int error, res, anamelen, fnamelen; const char *fname,*aname; u_int32_t aoff; error = ntfs_ntget(ip); if (error) return (error); error = ntfs_ntvattrget(ntmp, ip, NTFS_A_INDXROOT, "$I30", 0, &vap); if (error || (vap->va_flag & NTFS_AF_INRUN)) return (ENOTDIR); blsize = vap->va_a_iroot->ir_size; rdsize = vap->va_datalen; /* * Divide file name into: foofilefoofilefoofile[:attrspec] * Store like this: fname:fnamelen [aname:anamelen] */ fname = cnp->cn_nameptr; aname = NULL; anamelen = 0; for (fnamelen = 0; fnamelen < cnp->cn_namelen; fnamelen++) if(fname[fnamelen] == ':') { aname = fname + fnamelen + 1; anamelen = cnp->cn_namelen - fnamelen - 1; dprintf(("ntfs_ntlookupfile: %s (%d), attr: %s (%d)\n", fname, fnamelen, aname, anamelen)); break; } dprintf(("ntfs_ntlookupfile: blksz: %d, rdsz: %d\n", blsize, rdsize)); MALLOC(rdbuf, caddr_t, blsize, M_TEMP, M_WAITOK); error = ntfs_readattr(ntmp, ip, NTFS_A_INDXROOT, "$I30", 0, rdsize, rdbuf); if (error) goto fail; aoff = sizeof(struct attr_indexroot); do { iep = (struct attr_indexentry *) (rdbuf + aoff); while (!(iep->ie_flag & NTFS_IEFLAG_LAST) && (rdsize > aoff)) { ddprintf(("scan: %d, %d\n", (u_int32_t) iep->ie_number, (u_int32_t) iep->ie_fnametype)); res = ntfs_uastricmp(ntmp, iep->ie_fname, iep->ie_fnamelen, fname, fnamelen); if (res == 0) { /* Matched something (case ins.) */ if (iep->ie_fnametype == 0 || !(ntmp->ntm_flag & NTFS_MFLAG_CASEINS)) res = ntfs_uastrcmp(ntmp, iep->ie_fname, iep->ie_fnamelen, fname, fnamelen); if (res == 0) { int attrtype = NTFS_A_DATA; char *attrname = NULL; struct fnode *nfp; struct vnode *nvp; if (aname) { error = ntfs_ntlookupattr(ntmp, aname, anamelen, &attrtype, &attrname); if (error) goto fail; } /* Check if we've found ourself */ if ((iep->ie_number == ip->i_number) && (attrtype == fp->f_attrtype) && ((!attrname && !fp->f_attrname) || (attrname && fp->f_attrname && !strcmp(attrname, fp->f_attrname)))) { VREF(vp); *vpp = vp; goto fail; } /* vget node, but don't load it */ error = ntfs_vgetex(ntmp->ntm_mountp, iep->ie_number, attrtype, attrname, LK_EXCLUSIVE, VG_DONTLOADIN | VG_DONTVALIDFN, curproc, &nvp); if(error) goto fail; nfp = VTOF(nvp); if (nfp->f_flag & FN_VALID) { *vpp = nvp; goto fail; } nfp->f_fflag = iep->ie_fflag; nfp->f_pnumber = iep->ie_fpnumber; nfp->f_times = iep->ie_ftimes; if((nfp->f_fflag & NTFS_FFLAG_DIR) && (nfp->f_attrtype == NTFS_A_DATA) && (nfp->f_attrname == NULL)) nfp->f_type = VDIR; else nfp->f_type = VREG; nvp->v_type = nfp->f_type; if ((nfp->f_attrtype == NTFS_A_DATA) && (nfp->f_attrname == NULL)) { /* Opening default attribute */ nfp->f_size = iep->ie_fsize; nfp->f_allocated = iep->ie_fallocated; nfp->f_flag |= FN_PRELOADED; } else { error = ntfs_filesize(ntmp, nfp, &nfp->f_size, &nfp->f_allocated); if (error) { vput(nvp); goto fail; } } nfp->f_flag &= ~FN_VALID; *vpp = nvp; goto fail; } } else if (res > 0) break; aoff += iep->reclen; iep = (struct attr_indexentry *) (rdbuf + aoff); } /* Dive if possible */ if (iep->ie_flag & NTFS_IEFLAG_SUBNODE) { dprintf(("ntfs_ntlookupfile: diving\n")); cn = *(cn_t *) (rdbuf + aoff + iep->reclen - sizeof(cn_t)); rdsize 
= blsize; error = ntfs_readattr(ntmp, ip, NTFS_A_INDX, "$I30", ntfs_cntob(cn), rdsize, rdbuf); if (error) goto fail; error = ntfs_procfixups(ntmp, NTFS_INDXMAGIC, rdbuf, rdsize); if (error) goto fail; aoff = (((struct attr_indexalloc *) rdbuf)->ia_hdrsize + 0x18); } else { dprintf(("ntfs_ntlookupfile: nowhere to dive :-(\n")); error = ENOENT; break; } } while (1); dprintf(("finish\n")); fail: ntfs_ntvattrrele(vap); ntfs_ntput(ip); FREE(rdbuf, M_TEMP); return (error); } /* * Check if name type is permitted to show. */ int ntfs_isnamepermitted( struct ntfsmount * ntmp, struct attr_indexentry * iep) { if (ntmp->ntm_flag & NTFS_MFLAG_ALLNAMES) return 1; switch (iep->ie_fnametype) { case 2: ddprintf(("ntfs_isnamepermitted: skiped DOS name\n")); return 0; case 0: case 1: case 3: return 1; default: printf("ntfs_isnamepermitted: " \ "WARNING! Unknown file name type: %d\n", iep->ie_fnametype); break; } return 0; } /* * Read ntfs dir like stream of attr_indexentry, not like btree of them. * This is done by scaning $BITMAP:$I30 for busy clusters and reading them. * Ofcouse $INDEX_ROOT:$I30 is read before. Last read values are stored in * fnode, so we can skip toward record number num almost immediatly. * Anyway this is rather slow routine. The problem is that we don't know * how many records are there in $INDEX_ALLOCATION:$I30 block. */ int ntfs_ntreaddir( struct ntfsmount * ntmp, struct fnode * fp, u_int32_t num, struct attr_indexentry ** riepp) { struct ntnode *ip = FTONT(fp); struct ntvattr *vap = NULL; /* IndexRoot attribute */ struct ntvattr *bmvap = NULL; /* BitMap attribute */ struct ntvattr *iavap = NULL; /* IndexAllocation attribute */ caddr_t rdbuf; /* Buffer to read directory's blocks */ u_char *bmp = NULL; /* Bitmap */ u_int32_t blsize; /* Index allocation size (2048) */ u_int32_t rdsize; /* Length of data to read */ u_int32_t attrnum; /* Current attribute type */ u_int32_t cpbl = 1; /* Clusters per directory block */ u_int32_t blnum; struct attr_indexentry *iep; int error = ENOENT; u_int32_t aoff, cnum; dprintf(("ntfs_ntreaddir: read ino: %d, num: %d\n", ip->i_number, num)); error = ntfs_ntget(ip); if (error) return (error); error = ntfs_ntvattrget(ntmp, ip, NTFS_A_INDXROOT, "$I30", 0, &vap); if (error) return (ENOTDIR); if (fp->f_dirblbuf == NULL) { fp->f_dirblsz = vap->va_a_iroot->ir_size; MALLOC(fp->f_dirblbuf, caddr_t, max(vap->va_datalen,fp->f_dirblsz), M_NTFSDIR, M_WAITOK); } blsize = fp->f_dirblsz; rdbuf = fp->f_dirblbuf; dprintf(("ntfs_ntreaddir: rdbuf: 0x%p, blsize: %d\n", rdbuf, blsize)); if (vap->va_a_iroot->ir_flag & NTFS_IRFLAG_INDXALLOC) { error = ntfs_ntvattrget(ntmp, ip, NTFS_A_INDXBITMAP, "$I30", 0, &bmvap); if (error) { error = ENOTDIR; goto fail; } MALLOC(bmp, u_char *, bmvap->va_datalen, M_TEMP, M_WAITOK); error = ntfs_readattr(ntmp, ip, NTFS_A_INDXBITMAP, "$I30", 0, bmvap->va_datalen, bmp); if (error) goto fail; error = ntfs_ntvattrget(ntmp, ip, NTFS_A_INDX, "$I30", 0, &iavap); if (error) { error = ENOTDIR; goto fail; } cpbl = ntfs_btocn(blsize + ntfs_cntob(1) - 1); dprintf(("ntfs_ntreaddir: indexalloc: %d, cpbl: %d\n", iavap->va_datalen, cpbl)); } else { dprintf(("ntfs_ntreadidir: w/o BitMap and IndexAllocation\n")); iavap = bmvap = NULL; bmp = NULL; } /* Try use previous values */ if ((fp->f_lastdnum < num) && (fp->f_lastdnum != 0)) { attrnum = fp->f_lastdattr; aoff = fp->f_lastdoff; blnum = fp->f_lastdblnum; cnum = fp->f_lastdnum; } else { attrnum = NTFS_A_INDXROOT; aoff = sizeof(struct attr_indexroot); blnum = 0; cnum = 0; } do { dprintf(("ntfs_ntreaddir: scan: 
0x%x, %d, %d, %d, %d\n", attrnum, (u_int32_t) blnum, cnum, num, aoff)); rdsize = (attrnum == NTFS_A_INDXROOT) ? vap->va_datalen : blsize; error = ntfs_readattr(ntmp, ip, attrnum, "$I30", ntfs_cntob(blnum * cpbl), rdsize, rdbuf); if (error) goto fail; if (attrnum == NTFS_A_INDX) { error = ntfs_procfixups(ntmp, NTFS_INDXMAGIC, rdbuf, rdsize); if (error) goto fail; } if (aoff == 0) aoff = (attrnum == NTFS_A_INDX) ? (0x18 + ((struct attr_indexalloc *) rdbuf)->ia_hdrsize) : sizeof(struct attr_indexroot); iep = (struct attr_indexentry *) (rdbuf + aoff); while (!(iep->ie_flag & NTFS_IEFLAG_LAST) && (rdsize > aoff)) { if (ntfs_isnamepermitted(ntmp, iep)) { if (cnum >= num) { fp->f_lastdnum = cnum; fp->f_lastdoff = aoff; fp->f_lastdblnum = blnum; fp->f_lastdattr = attrnum; *riepp = iep; error = 0; goto fail; } cnum++; } aoff += iep->reclen; iep = (struct attr_indexentry *) (rdbuf + aoff); } if (iavap) { if (attrnum == NTFS_A_INDXROOT) blnum = 0; else blnum++; while (ntfs_cntob(blnum * cpbl) < iavap->va_datalen) { if (bmp[blnum >> 3] & (1 << (blnum & 3))) break; blnum++; } attrnum = NTFS_A_INDX; aoff = 0; if (ntfs_cntob(blnum * cpbl) >= iavap->va_datalen) break; dprintf(("ntfs_ntreaddir: blnum: %d\n", (u_int32_t) blnum)); } } while (iavap); *riepp = NULL; fp->f_lastdnum = 0; fail: if (vap) ntfs_ntvattrrele(vap); if (bmvap) ntfs_ntvattrrele(bmvap); if (iavap) ntfs_ntvattrrele(iavap); if (bmp) FREE(bmp, M_TEMP); ntfs_ntput(ip); return (error); } /* * Convert NTFS times that are in 100 ns units and begins from * 1601 Jan 1 into unix times. */ struct timespec ntfs_nttimetounix( u_int64_t nt) { struct timespec t; /* WindowNT times are in 100 ns and from 1601 Jan 1 */ t.tv_nsec = (nt % (1000 * 1000 * 10)) * 100; t.tv_sec = nt / (1000 * 1000 * 10) - 369LL * 365LL * 24LL * 60LL * 60LL - 89LL * 1LL * 24LL * 60LL * 60LL; return (t); } /* * Get file times from NTFS_A_NAME attribute. */ int ntfs_times( struct ntfsmount * ntmp, struct ntnode * ip, ntfs_times_t * tm) { struct ntvattr *vap; int error; dprintf(("ntfs_times: ino: %d...\n", ip->i_number)); error = ntfs_ntget(ip); if (error) return (error); error = ntfs_ntvattrget(ntmp, ip, NTFS_A_NAME, NULL, 0, &vap); if (error) { ntfs_ntput(ip); return (error); } *tm = vap->va_a_name->n_times; ntfs_ntvattrrele(vap); ntfs_ntput(ip); return (0); } /* * Get file sizes from corresponding attribute. * * ntnode under fnode should be locked. */ int ntfs_filesize( struct ntfsmount * ntmp, struct fnode * fp, u_int64_t * size, u_int64_t * bytes) { struct ntvattr *vap; struct ntnode *ip = FTONT(fp); u_int64_t sz, bn; int error; dprintf(("ntfs_filesize: ino: %d\n", ip->i_number)); error = ntfs_ntvattrget(ntmp, ip, fp->f_attrtype, fp->f_attrname, 0, &vap); if (error) return (error); bn = vap->va_allocated; sz = vap->va_datalen; dprintf(("ntfs_filesize: %d bytes (%d bytes allocated)\n", (u_int32_t) sz, (u_int32_t) bn)); if (size) *size = sz; if (bytes) *bytes = bn; ntfs_ntvattrrele(vap); return (0); } /* * This is one of write routine. * * ntnode should be locked. 
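 *
 * (On the conversion in ntfs_nttimetounix() above: NT timestamps count
 * 100 ns units from 1601-01-01, so the code divides by 10,000,000 to get
 * seconds and subtracts the 1601..1970 gap -- 369 years of 365 days plus
 * 89 leap days, i.e. 11,644,473,600 seconds -- which means an NT value
 * of 116444736000000000 maps to the Unix epoch, 1970-01-01 00:00:00.)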
*/ int ntfs_writeattr_plain( struct ntfsmount * ntmp, struct ntnode * ip, u_int32_t attrnum, char *attrname, off_t roff, size_t rsize, void *rdata, size_t * initp) { size_t init; int error = 0; off_t off = roff, left = rsize, towrite; caddr_t data = rdata; struct ntvattr *vap; *initp = 0; while (left) { error = ntfs_ntvattrget(ntmp, ip, attrnum, attrname, ntfs_btocn(off), &vap); if (error) return (error); towrite = min(left, ntfs_cntob(vap->va_vcnend + 1) - off); ddprintf(("ntfs_writeattr_plain: o: %d, s: %d (%d - %d)\n", (u_int32_t) off, (u_int32_t) towrite, (u_int32_t) vap->va_vcnstart, (u_int32_t) vap->va_vcnend)); error = ntfs_writentvattr_plain(ntmp, ip, vap, off - ntfs_cntob(vap->va_vcnstart), towrite, data, &init); if (error) { printf("ntfs_writeattr_plain: " \ "ntfs_writentvattr_plain failed: o: %d, s: %d\n", (u_int32_t) off, (u_int32_t) towrite); printf("ntfs_writeattr_plain: attrib: %d - %d\n", (u_int32_t) vap->va_vcnstart, (u_int32_t) vap->va_vcnend); ntfs_ntvattrrele(vap); break; } ntfs_ntvattrrele(vap); left -= towrite; off += towrite; data = data + towrite; *initp += init; } return (error); } /* * This is one of write routine. * * ntnode should be locked. */ int ntfs_writentvattr_plain( struct ntfsmount * ntmp, struct ntnode * ip, struct ntvattr * vap, off_t roff, size_t rsize, void *rdata, size_t * initp) { int error = 0; int off; *initp = 0; if (vap->va_flag & NTFS_AF_INRUN) { int cnt; cn_t ccn, ccl, cn, left, cl; caddr_t data = rdata; struct buf *bp; size_t tocopy; ddprintf(("ntfs_writentvattr_plain: data in run: %d chains\n", vap->va_vruncnt)); off = roff; left = rsize; ccl = 0; ccn = 0; cnt = 0; while (left && (cnt < vap->va_vruncnt)) { ccn = vap->va_vruncn[cnt]; ccl = vap->va_vruncl[cnt]; ddprintf(("ntfs_writentvattr_plain: " \ "left %d, cn: 0x%x, cl: %d, off: %d\n", \ (u_int32_t) left, (u_int32_t) ccn, \ (u_int32_t) ccl, (u_int32_t) off)); if (ntfs_cntob(ccl) < off) { off -= ntfs_cntob(ccl); cnt++; continue; } if (ccn || ip->i_number == NTFS_BOOTINO) { /* XXX */ ccl -= ntfs_btocn(off); cn = ccn + ntfs_btocn(off); off = ntfs_btocnoff(off); while (left && ccl) { tocopy = min(left, min(ntfs_cntob(ccl) - off, MAXBSIZE - off)); cl = ntfs_btocl(tocopy + off); ddprintf(("ntfs_writentvattr_plain: " \ "write: cn: 0x%x cl: %d, " \ "off: %d len: %d, left: %d\n", (u_int32_t) cn, (u_int32_t) cl, (u_int32_t) off, (u_int32_t) tocopy, (u_int32_t) left)); if ((off == 0) && (tocopy == ntfs_cntob(cl))) { bp = getblk(ntmp->ntm_devvp, ntfs_cntobn(cn), ntfs_cntob(cl), 0, 0); clrbuf(bp); } else { error = bread(ntmp->ntm_devvp, ntfs_cntobn(cn), ntfs_cntob(cl), NOCRED, &bp); if (error) { brelse(bp); return (error); } } memcpy(bp->b_data + off, data, tocopy); bawrite(bp); data = data + tocopy; *initp += tocopy; off = 0; left -= tocopy; cn += cl; ccl -= cl; } } cnt++; } if (left) { printf("ntfs_writentvattr_plain: POSSIBLE RUN ERROR\n"); error = EINVAL; } } else { printf("ntfs_writevattr_plain: CAN'T WRITE RES. ATTRIBUTE\n"); error = ENOTTY; } return (error); } /* * This is one of read routines. * * ntnode should be locked. 
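 *
 * One subtlety of the run walk below: a run whose starting cluster is 0
 * denotes a sparse hole (except for the boot file, NTFS_BOOTINO, whose
 * data really does live at cluster 0), so such a run is satisfied by
 * bzero()ing the caller's buffer instead of reading the disk.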
*/ int ntfs_readntvattr_plain( struct ntfsmount * ntmp, struct ntnode * ip, struct ntvattr * vap, off_t roff, size_t rsize, void *rdata, size_t * initp) { int error = 0; int off; *initp = 0; if (vap->va_flag & NTFS_AF_INRUN) { int cnt; cn_t ccn, ccl, cn, left, cl; caddr_t data = rdata; struct buf *bp; size_t tocopy; ddprintf(("ntfs_readntvattr_plain: data in run: %d chains\n", vap->va_vruncnt)); off = roff; left = rsize; ccl = 0; ccn = 0; cnt = 0; while (left && (cnt < vap->va_vruncnt)) { ccn = vap->va_vruncn[cnt]; ccl = vap->va_vruncl[cnt]; ddprintf(("ntfs_readntvattr_plain: " \ "left %d, cn: 0x%x, cl: %d, off: %d\n", \ (u_int32_t) left, (u_int32_t) ccn, \ (u_int32_t) ccl, (u_int32_t) off)); if (ntfs_cntob(ccl) < off) { off -= ntfs_cntob(ccl); cnt++; continue; } if (ccn || ip->i_number == NTFS_BOOTINO) { ccl -= ntfs_btocn(off); cn = ccn + ntfs_btocn(off); off = ntfs_btocnoff(off); while (left && ccl) { tocopy = min(left, min(ntfs_cntob(ccl) - off, MAXBSIZE - off)); cl = ntfs_btocl(tocopy + off); ddprintf(("ntfs_readntvattr_plain: " \ "read: cn: 0x%x cl: %d, " \ "off: %d len: %d, left: %d\n", (u_int32_t) cn, (u_int32_t) cl, (u_int32_t) off, (u_int32_t) tocopy, (u_int32_t) left)); error = bread(ntmp->ntm_devvp, ntfs_cntobn(cn), ntfs_cntob(cl), NOCRED, &bp); if (error) { brelse(bp); return (error); } memcpy(data, bp->b_data + off, tocopy); brelse(bp); data = data + tocopy; *initp += tocopy; off = 0; left -= tocopy; cn += cl; ccl -= cl; } } else { tocopy = min(left, ntfs_cntob(ccl) - off); ddprintf(("ntfs_readntvattr_plain: " "sparce: ccn: 0x%x ccl: %d, off: %d, " \ " len: %d, left: %d\n", (u_int32_t) ccn, (u_int32_t) ccl, (u_int32_t) off, (u_int32_t) tocopy, (u_int32_t) left)); left -= tocopy; off = 0; bzero(data, tocopy); data = data + tocopy; } cnt++; } if (left) { printf("ntfs_readntvattr_plain: POSSIBLE RUN ERROR\n"); error = E2BIG; } } else { ddprintf(("ntfs_readnvattr_plain: data is in mft record\n")); memcpy(rdata, vap->va_datap + roff, rsize); *initp += rsize; } return (error); } /* * This is one of read routines. * * ntnode should be locked. */ int ntfs_readattr_plain( struct ntfsmount * ntmp, struct ntnode * ip, u_int32_t attrnum, char *attrname, off_t roff, size_t rsize, void *rdata, size_t * initp) { size_t init; int error = 0; off_t off = roff, left = rsize, toread; caddr_t data = rdata; struct ntvattr *vap; *initp = 0; while (left) { error = ntfs_ntvattrget(ntmp, ip, attrnum, attrname, ntfs_btocn(off), &vap); if (error) return (error); toread = min(left, ntfs_cntob(vap->va_vcnend + 1) - off); ddprintf(("ntfs_readattr_plain: o: %d, s: %d (%d - %d)\n", (u_int32_t) off, (u_int32_t) toread, (u_int32_t) vap->va_vcnstart, (u_int32_t) vap->va_vcnend)); error = ntfs_readntvattr_plain(ntmp, ip, vap, off - ntfs_cntob(vap->va_vcnstart), toread, data, &init); if (error) { printf("ntfs_readattr_plain: " \ "ntfs_readntvattr_plain failed: o: %d, s: %d\n", (u_int32_t) off, (u_int32_t) toread); printf("ntfs_readattr_plain: attrib: %d - %d\n", (u_int32_t) vap->va_vcnstart, (u_int32_t) vap->va_vcnend); ntfs_ntvattrrele(vap); break; } ntfs_ntvattrrele(vap); left -= toread; off += toread; data = data + toread; *initp += init; } return (error); } /* * This is one of read routines. * * ntnode should be locked. 
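 *
 * ntfs_readattr() below handles compressed attributes in whole
 * compression units of NTFS_COMPUNIT_CL clusters (16 in the usual NTFS
 * layout); as a sketch, a read starting in the middle of a unit first
 * rounds the cluster number down to a unit boundary, reads the whole
 * unit, and then copies from the in-unit offset: a fully backed unit
 * (init equal to the unit size) is stored raw, a unit with init == 0 is
 * sparse and read back as zeroes, and anything in between is passed to
 * ntfs_uncompunit().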
*/ int ntfs_readattr( struct ntfsmount * ntmp, struct ntnode * ip, u_int32_t attrnum, char *attrname, off_t roff, size_t rsize, void *rdata) { int error = 0; struct ntvattr *vap; size_t init; ddprintf(("ntfs_readattr: reading %d: 0x%x, from %d size %d bytes\n", ip->i_number, attrnum, (u_int32_t) roff, (u_int32_t) rsize)); error = ntfs_ntvattrget(ntmp, ip, attrnum, attrname, 0, &vap); if (error) return (error); if ((roff > vap->va_datalen) || (roff + rsize > vap->va_datalen)) { ddprintf(("ntfs_readattr: offset too big\n")); ntfs_ntvattrrele(vap); return (E2BIG); } if (vap->va_compression && vap->va_compressalg) { u_int8_t *cup; u_int8_t *uup; off_t off = roff, left = rsize, tocopy; caddr_t data = rdata; cn_t cn; ddprintf(("ntfs_ntreadattr: compression: %d\n", vap->va_compressalg)); MALLOC(cup, u_int8_t *, ntfs_cntob(NTFS_COMPUNIT_CL), M_NTFSDECOMP, M_WAITOK); MALLOC(uup, u_int8_t *, ntfs_cntob(NTFS_COMPUNIT_CL), M_NTFSDECOMP, M_WAITOK); cn = (ntfs_btocn(roff)) & (~(NTFS_COMPUNIT_CL - 1)); off = roff - ntfs_cntob(cn); while (left) { error = ntfs_readattr_plain(ntmp, ip, attrnum, attrname, ntfs_cntob(cn), ntfs_cntob(NTFS_COMPUNIT_CL), cup, &init); if (error) break; tocopy = min(left, ntfs_cntob(NTFS_COMPUNIT_CL) - off); if (init == ntfs_cntob(NTFS_COMPUNIT_CL)) { memcpy(data, cup + off, tocopy); } else if (init == 0) { bzero(data, tocopy); } else { error = ntfs_uncompunit(ntmp, uup, cup); if (error) break; memcpy(data, uup + off, tocopy); } left -= tocopy; data = data + tocopy; off += tocopy - ntfs_cntob(NTFS_COMPUNIT_CL); cn += NTFS_COMPUNIT_CL; } FREE(uup, M_NTFSDECOMP); FREE(cup, M_NTFSDECOMP); } else error = ntfs_readattr_plain(ntmp, ip, attrnum, attrname, roff, rsize, rdata, &init); ntfs_ntvattrrele(vap); return (error); } #if UNUSED_CODE int ntfs_parserun( cn_t * cn, cn_t * cl, u_int8_t * run, u_long len, u_long *off) { u_int8_t sz; int i; if (NULL == run) { printf("ntfs_parsetun: run == NULL\n"); return (EINVAL); } sz = run[(*off)++]; if (0 == sz) { printf("ntfs_parserun: trying to go out of run\n"); return (E2BIG); } *cl = 0; if ((sz & 0xF) > 8 || (*off) + (sz & 0xF) > len) { printf("ntfs_parserun: " \ "bad run: length too big: sz: 0x%02x (%ld < %ld + sz)\n", sz, len, *off); return (EINVAL); } for (i = 0; i < (sz & 0xF); i++) *cl += (u_int32_t) run[(*off)++] << (i << 3); sz >>= 4; if ((sz & 0xF) > 8 || (*off) + (sz & 0xF) > len) { printf("ntfs_parserun: " \ "bad run: length too big: sz: 0x%02x (%ld < %ld + sz)\n", sz, len, *off); return (EINVAL); } for (i = 0; i < (sz & 0xF); i++) *cn += (u_int32_t) run[(*off)++] << (i << 3); return (0); } #endif /* * Process fixup routine on given buffer. 
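 *
 * Rough sketch of what the fixup pass does (numbers invented): an MFT
 * record spanning two 512-byte sectors carries an update sequence value
 * plus one saved word per sector; on disk the last two bytes of each
 * sector hold the sequence value, and ntfs_procfixups() checks each of
 * those words against the value at fh_foff and, when they match, puts
 * the saved original word back in place -- a torn write shows up as a
 * mismatch and the record is rejected with EINVAL.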
*/ int ntfs_procfixups( struct ntfsmount * ntmp, u_int32_t magic, caddr_t buf, size_t len) { struct fixuphdr *fhp = (struct fixuphdr *) buf; int i; u_int16_t fixup; u_int16_t *fxp; u_int16_t *cfxp; if (fhp->fh_magic != magic) { printf("ntfs_procfixups: magic doesn't match: %08x != %08x\n", fhp->fh_magic, magic); return (EINVAL); } if ((fhp->fh_fnum - 1) * ntmp->ntm_bps != len) { printf("ntfs_procfixups: " \ "bad fixups number: %d for %d bytes block\n", fhp->fh_fnum, len); return (EINVAL); } if (fhp->fh_foff >= ntmp->ntm_spc * ntmp->ntm_mftrecsz * ntmp->ntm_bps) { printf("ntfs_procfixups: invalid offset: %x", fhp->fh_foff); return (EINVAL); } fxp = (u_int16_t *) (buf + fhp->fh_foff); cfxp = (u_int16_t *) (buf + ntmp->ntm_bps - 2); fixup = *fxp++; for (i = 1; i < fhp->fh_fnum; i++, fxp++) { if (*cfxp != fixup) { printf("ntfs_procfixups: fixup %d doesn't match\n", i); return (EINVAL); } *cfxp = *fxp; ((caddr_t) cfxp) += ntmp->ntm_bps; } return (0); } #if UNUSED_CODE int ntfs_runtocn( cn_t * cn, struct ntfsmount * ntmp, u_int8_t * run, u_long len, cn_t vcn) { cn_t ccn = 0; cn_t ccl = 0; u_long off = 0; int error = 0; #if NTFS_DEBUG int i; printf("ntfs_runtocn: run: 0x%p, %ld bytes, vcn:%ld\n", run, len, (u_long) vcn); printf("ntfs_runtocn: run: "); for (i = 0; i < len; i++) printf("0x%02x ", run[i]); printf("\n"); #endif if (NULL == run) { printf("ntfs_runtocn: run == NULL\n"); return (EINVAL); } do { if (run[off] == 0) { printf("ntfs_runtocn: vcn too big\n"); return (E2BIG); } vcn -= ccl; error = ntfs_parserun(&ccn, &ccl, run, len, &off); if (error) { printf("ntfs_runtocn: ntfs_parserun failed\n"); return (error); } } while (ccl <= vcn); *cn = ccn + vcn; return (0); } #endif Index: head/sys/ntfs/ntfs_vfsops.c =================================================================== --- head/sys/ntfs/ntfs_vfsops.c (revision 49534) +++ head/sys/ntfs/ntfs_vfsops.c (revision 49535) @@ -1,996 +1,994 @@ /* $NetBSD: ntfs_vfsops.c,v 1.2 1999/05/06 15:43:20 christos Exp $ */ /*- * Copyright (c) 1998, 1999 Semen Ustimenko * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $Id: ntfs_vfsops.c,v 1.6 1999/05/12 09:43:04 semenu Exp $ + * $Id: ntfs_vfsops.c,v 1.7 1999/05/31 11:28:30 phk Exp $ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include - -#include /*#define NTFS_DEBUG 1*/ #include #include #include #include #include #include #include #if defined(__FreeBSD__) MALLOC_DEFINE(M_NTFSMNT, "NTFS mount", "NTFS mount structure"); MALLOC_DEFINE(M_NTFSNTNODE,"NTFS ntnode", "NTFS ntnode information"); MALLOC_DEFINE(M_NTFSFNODE,"NTFS fnode", "NTFS fnode information"); MALLOC_DEFINE(M_NTFSDIR,"NTFS dir", "NTFS dir buffer"); #endif #if defined(__FreeBSD__) static int ntfs_mount __P((struct mount *, char *, caddr_t, struct nameidata *, struct proc *)); #else static int ntfs_mount __P((struct mount *, const char *, void *, struct nameidata *, struct proc *)); #endif static int ntfs_quotactl __P((struct mount *, int, uid_t, caddr_t, struct proc *)); static int ntfs_root __P((struct mount *, struct vnode **)); static int ntfs_start __P((struct mount *, int, struct proc *)); static int ntfs_statfs __P((struct mount *, struct statfs *, struct proc *)); static int ntfs_sync __P((struct mount *, int, struct ucred *, struct proc *)); static int ntfs_unmount __P((struct mount *, int, struct proc *)); static int ntfs_vget __P((struct mount *mp, ino_t ino, struct vnode **vpp)); static int ntfs_mountfs __P((register struct vnode *, struct mount *, struct ntfs_args *, struct proc *)); static int ntfs_vptofh __P((struct vnode *, struct fid *)); #if defined(__FreeBSD__) static int ntfs_init __P((struct vfsconf *)); static int ntfs_fhtovp __P((struct mount *, struct fid *, struct sockaddr *, struct vnode **, int *, struct ucred **)); #elif defined(__NetBSD__) static void ntfs_init __P((void)); static int ntfs_fhtovp __P((struct mount *, struct fid *, struct vnode **)); static int ntfs_checkexp __P((struct mount *, struct mbuf *, int *, struct ucred **)); static int ntfs_mountroot __P((void)); static int ntfs_sysctl __P((int *, u_int, void *, size_t *, void *, size_t, struct proc *)); #else static int ntfs_init __P((void)); static int ntfs_fhtovp __P((struct mount *, struct fid *, struct mbuf *, struct vnode **, int *, struct ucred **)); #endif #ifdef __NetBSD__ /*ARGSUSED*/ static int ntfs_checkexp(mp, nam, exflagsp, credanonp) register struct mount *mp; struct mbuf *nam; int *exflagsp; struct ucred **credanonp; { return (EINVAL); } /*ARGSUSED*/ static int ntfs_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) int *name; u_int namelen; void *oldp; size_t *oldlenp; void *newp; size_t newlen; struct proc *p; { return (EINVAL); } static int ntfs_mountroot() { return (EINVAL); } #endif #if defined(__FreeBSD__) static int ntfs_init ( struct vfsconf *vcp ) #elif defined(__NetBSD__) static void ntfs_init () #else static int ntfs_init () #endif { ntfs_nthashinit(); #if !defined(__NetBSD__) return 0; #endif } static int ntfs_mount ( struct mount *mp, #if defined(__FreeBSD__) char *path, caddr_t data, #else const char *path, void *data, #endif struct nameidata *ndp, struct proc *p ) { u_int size; int err = 0; struct vnode *devvp; struct ntfs_args args; /* * Use NULL path to flag a root mount */ if( path == NULL) { /* *** * Mounting root file system *** */ /* Get vnode for root device*/ if( bdevvp( rootdev, &rootvp)) panic("ffs_mountroot: can't setup bdevvp for root"); /* * FS specific handling */ mp->mnt_flag |= MNT_RDONLY; /* XXX globally applicable?*/ /* * Attempt mount */ if( ( err 
= ntfs_mountfs(rootvp, mp, &args, p)) != 0) { /* fs specific cleanup (if any)*/ goto error_1; } goto dostatfs; /* success*/ } /* *** * Mounting non-root file system or updating a file system *** */ /* copy in user arguments*/ err = copyin(data, (caddr_t)&args, sizeof (struct ntfs_args)); if (err) goto error_1; /* can't get arguments*/ /* * If updating, check whether changing from read-only to * read/write; if there is no device name, that's all we do. */ if (mp->mnt_flag & MNT_UPDATE) { printf("ntfs_mount(): MNT_UPDATE not supported\n"); err = EINVAL; goto error_1; #if 0 ump = VFSTOUFS(mp); fs = ump->um_fs; err = 0; if (fs->fs_ronly == 0 && (mp->mnt_flag & MNT_RDONLY)) { flags = WRITECLOSE; if (mp->mnt_flag & MNT_FORCE) flags |= FORCECLOSE; if (vfs_busy(mp)) { err = EBUSY; goto error_1; } err = ffs_flushfiles(mp, flags, p); vfs_unbusy(mp); } if (!err && (mp->mnt_flag & MNT_RELOAD)) err = ffs_reload(mp, ndp->ni_cnd.cn_cred, p); if (err) { goto error_1; } if (fs->fs_ronly && (mp->mnt_flag & MNT_WANTRDWR)) { if (!fs->fs_clean) { if (mp->mnt_flag & MNT_FORCE) { printf("WARNING: %s was not properly dismounted.\n",fs->fs_fsmnt); } else { printf("WARNING: R/W mount of %s denied. Filesystem is not clean - run fsck.\n", fs->fs_fsmnt); err = EPERM; goto error_1; } } fs->fs_ronly = 0; } if (fs->fs_ronly == 0) { fs->fs_clean = 0; ffs_sbupdate(ump, MNT_WAIT); } /* if not updating name...*/ if (args.fspec == 0) { /* * Process export requests. Jumping to "success" * will return the vfs_export() error code. */ err = vfs_export(mp, &ump->um_export, &args.export); goto success; } #endif } /* * Not an update, or updating the name: look up the name * and verify that it refers to a sensible block device. */ NDINIT(ndp, LOOKUP, FOLLOW, UIO_USERSPACE, args.fspec, p); err = namei(ndp); if (err) { /* can't get devvp!*/ goto error_1; } devvp = ndp->ni_vp; if (devvp->v_type != VBLK) { err = ENOTBLK; goto error_2; } if (bdevsw(devvp->v_rdev) == NULL) { err = ENXIO; goto error_2; } if (mp->mnt_flag & MNT_UPDATE) { #if 0 /* ******************** * UPDATE ******************** */ if (devvp != ntmp->um_devvp) err = EINVAL; /* needs translation */ else vrele(devvp); /* * Update device name only on success */ if( !err) { /* Save "mounted from" info for mount point (NULL pad)*/ copyinstr( args.fspec, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, &size); bzero( mp->mnt_stat.f_mntfromname + size, MNAMELEN - size); } #endif } else { /* ******************** * NEW MOUNT ******************** */ /* * Since this is a new mount, we want the names for * the device and the mount point copied in. If an * error occurs, the mountpoint is discarded by the * upper level code. 
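The new-mount path just above copies the mount point and device names into fixed-size f_mntonname/f_mntfromname fields with copyinstr() and then bzero()s whatever is left, so the fields are always NUL-terminated with no stale bytes. A small user-space sketch of that copy-and-pad idiom (the MNAMELEN value and helper name here are picked only for illustration):

#include <stdio.h>
#include <string.h>
#include <strings.h>

#define MNAMELEN 80			/* illustrative size of the name fields */

static void
save_name(char dst[MNAMELEN], const char *src)
{
	size_t size;

	/* snprintf() stands in for copyinstr(): bounded copy, reports full length */
	size = (size_t)snprintf(dst, MNAMELEN, "%s", src);
	if (size >= MNAMELEN)
		size = MNAMELEN - 1;
	bzero(dst + size, MNAMELEN - size);	/* zero the tail, NUL pad included */
}

int
main(void)
{
	char mntonname[MNAMELEN];

	memset(mntonname, 'X', sizeof(mntonname));	/* simulate stale contents */
	save_name(mntonname, "/mnt/ntfs");
	printf("mounted on: %s\n", mntonname);
	return (0);
}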
*/ /* Save "last mounted on" info for mount point (NULL pad)*/ copyinstr( path, /* mount point*/ mp->mnt_stat.f_mntonname, /* save area*/ MNAMELEN - 1, /* max size*/ &size); /* real size*/ bzero( mp->mnt_stat.f_mntonname + size, MNAMELEN - size); /* Save "mounted from" info for mount point (NULL pad)*/ copyinstr( args.fspec, /* device name*/ mp->mnt_stat.f_mntfromname, /* save area*/ MNAMELEN - 1, /* max size*/ &size); /* real size*/ bzero( mp->mnt_stat.f_mntfromname + size, MNAMELEN - size); err = ntfs_mountfs(devvp, mp, &args, p); } if (err) { goto error_2; } dostatfs: /* * Initialize FS stat information in mount struct; uses both * mp->mnt_stat.f_mntonname and mp->mnt_stat.f_mntfromname * * This code is common to root and non-root mounts */ (void)VFS_STATFS(mp, &mp->mnt_stat, p); goto success; error_2: /* error with devvp held*/ /* release devvp before failing*/ vrele(devvp); error_1: /* no state to back out*/ success: return( err); } /* * Common code for mount and mountroot */ int ntfs_mountfs(devvp, mp, argsp, p) register struct vnode *devvp; struct mount *mp; struct ntfs_args *argsp; struct proc *p; { struct buf *bp; struct ntfsmount *ntmp; dev_t dev = devvp->v_rdev; int error, ronly, ncount, i; struct vnode *vp; /* * Disallow multiple mounts of the same device. * Disallow mounting of a device that is currently in use * (except for root, which might share swap device for miniroot). * Flush out any old buffers remaining from a previous use. */ error = vfs_mountedon(devvp); if (error) return (error); ncount = vcount(devvp); #if defined(__FreeBSD__) if (devvp->v_object) ncount -= 1; #endif if (ncount > 1 && devvp != rootvp) return (EBUSY); #if defined(__FreeBSD__) vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, p); error = vinvalbuf(devvp, V_SAVE, p->p_ucred, p, 0, 0); VOP_UNLOCK(devvp, 0, p); #else error = vinvalbuf(devvp, V_SAVE, p->p_ucred, p, 0, 0); #endif if (error) return (error); ronly = (mp->mnt_flag & MNT_RDONLY) != 0; error = VOP_OPEN(devvp, ronly ? FREAD : FREAD|FWRITE, FSCRED, p); if (error) return (error); bp = NULL; error = bread(devvp, BBLOCK, BBSIZE, NOCRED, &bp); if (error) goto out; ntmp = malloc( sizeof *ntmp, M_NTFSMNT, M_WAITOK ); bzero( ntmp, sizeof *ntmp ); bcopy( bp->b_data, &ntmp->ntm_bootfile, sizeof(struct bootfile) ); brelse( bp ); bp = NULL; if (strncmp(ntmp->ntm_bootfile.bf_sysid, NTFS_BBID, NTFS_BBIDLEN)) { error = EINVAL; printf("ntfs_mountfs: invalid boot block\n"); goto out; } { int8_t cpr = ntmp->ntm_mftrecsz; if( cpr > 0 ) ntmp->ntm_bpmftrec = ntmp->ntm_spc * cpr; else ntmp->ntm_bpmftrec = (1 << (-cpr)) / ntmp->ntm_bps; } dprintf(("ntfs_mountfs(): bps: %d, spc: %d, media: %x, mftrecsz: %d (%d sects)\n", ntmp->ntm_bps,ntmp->ntm_spc,ntmp->ntm_bootfile.bf_media, ntmp->ntm_mftrecsz,ntmp->ntm_bpmftrec)); dprintf(("ntfs_mountfs(): mftcn: 0x%x|0x%x\n", (u_int32_t)ntmp->ntm_mftcn,(u_int32_t)ntmp->ntm_mftmirrcn)); ntmp->ntm_mountp = mp; ntmp->ntm_dev = dev; ntmp->ntm_devvp = devvp; ntmp->ntm_uid = argsp->uid; ntmp->ntm_gid = argsp->gid; ntmp->ntm_mode = argsp->mode; ntmp->ntm_flag = argsp->flag; mp->mnt_data = (qaddr_t)ntmp; dprintf(("ntfs_mountfs(): case-%s,%s uid: %d, gid: %d, mode: %o\n", (ntmp->ntm_flag & NTFS_MFLAG_CASEINS)?"insens.":"sens.", (ntmp->ntm_flag & NTFS_MFLAG_ALLNAMES)?" allnames,":"", ntmp->ntm_uid, ntmp->ntm_gid, ntmp->ntm_mode)); /* * We read in some system nodes to do not allow * reclaim them and to have everytime access to them. 
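A note on the boot-block math earlier in ntfs_mountfs(): ntm_mftrecsz is a signed byte — a positive value is the number of clusters per MFT record, while a negative value -n means each record is 2^n bytes regardless of cluster size — and the code above converts it into sectors per record (ntm_bpmftrec). The sketch below (illustrative only; names assumed) performs the same conversion but reports bytes:

#include <stdio.h>
#include <stdint.h>

static unsigned
mft_record_bytes(int8_t clust_per_rec, unsigned bytes_per_sector,
    unsigned sectors_per_cluster)
{
	if (clust_per_rec > 0)
		return ((unsigned)clust_per_rec * sectors_per_cluster * bytes_per_sector);
	return (1u << -clust_per_rec);		/* negative: record is 2^n bytes */
}

int
main(void)
{
	/* a typical volume: 512-byte sectors, 8-sector (4K) clusters */
	printf("mftrecsz -10 -> %u bytes\n", mft_record_bytes(-10, 512, 8));
	printf("mftrecsz   1 -> %u bytes\n", mft_record_bytes(1, 512, 8));
	return (0);
}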
*/ { int pi[3] = { NTFS_MFTINO, NTFS_ROOTINO, NTFS_BITMAPINO }; for (i=0; i<3; i++) { error = VFS_VGET(mp, pi[i], &(ntmp->ntm_sysvn[pi[i]])); if(error) goto out1; ntmp->ntm_sysvn[pi[i]]->v_flag |= VSYSTEM; VREF(ntmp->ntm_sysvn[pi[i]]); vput(ntmp->ntm_sysvn[pi[i]]); } } /* * Read in WHOLE lowcase -> upcase translation * file. */ MALLOC(ntmp->ntm_upcase, wchar *, 65536 * sizeof(wchar), M_NTFSMNT, M_WAITOK); error = VFS_VGET(mp, NTFS_UPCASEINO, &vp); if(error) goto out1; error = ntfs_readattr(ntmp, VTONT(vp), NTFS_A_DATA, NULL, 0, 65536*sizeof(wchar), ntmp->ntm_upcase); vput(vp); if(error) goto out1; /* * Scan $BitMap and count free clusters */ error = ntfs_calccfree(ntmp, &ntmp->ntm_cfree); if(error) goto out1; /* * Read and translate to internal format attribute * definition file. */ { int num,j; struct attrdef ad; /* Open $AttrDef */ error = VFS_VGET(mp, NTFS_ATTRDEFINO, &vp ); if(error) goto out1; /* Count valid entries */ for(num=0;;num++) { error = ntfs_readattr(ntmp, VTONT(vp), NTFS_A_DATA, NULL, num * sizeof(ad), sizeof(ad), &ad); if (error) goto out1; if (ad.ad_name[0] == 0) break; } /* Alloc memory for attribute definitions */ MALLOC(ntmp->ntm_ad, struct ntvattrdef *, num * sizeof(struct ntvattrdef), M_NTFSMNT, M_WAITOK); ntmp->ntm_adnum = num; /* Read them and translate */ for(i=0;intm_ad[i].ad_name[j] = ad.ad_name[j]; } while(ad.ad_name[j++]); ntmp->ntm_ad[i].ad_namelen = j - 1; ntmp->ntm_ad[i].ad_type = ad.ad_type; } vput(vp); } mp->mnt_stat.f_fsid.val[0] = dev2udev(dev); #if defined(__FreeBSD__) mp->mnt_stat.f_fsid.val[1] = mp->mnt_vfc->vfc_typenum; #else mp->mnt_stat.f_fsid.val[1] = makefstype(MOUNT_NTFS); #endif mp->mnt_maxsymlinklen = 0; mp->mnt_flag |= MNT_LOCAL; #if defined(__FreeBSD__) devvp->v_specmountpoint = mp; #else devvp->v_specflags |= SI_MOUNTEDON; #endif return (0); out1: for(i=0;intm_sysvn[i]) vrele(ntmp->ntm_sysvn[i]); if (vflush(mp,NULLVP,0)) printf("ntfs_mountfs: vflush failed\n"); out: #if defined(__FreeBSD__) devvp->v_specmountpoint = NULL; #else devvp->v_specflags &= ~SI_MOUNTEDON; #endif if (bp) brelse(bp); (void)VOP_CLOSE(devvp, ronly ? FREAD : FREAD|FWRITE, NOCRED, p); return (error); } static int ntfs_start ( struct mount *mp, int flags, struct proc *p ) { return (0); } static int ntfs_unmount( struct mount *mp, int mntflags, struct proc *p) { register struct ntfsmount *ntmp; int error, ronly = 0, flags, i; dprintf(("ntfs_unmount: unmounting...\n")); ntmp = VFSTONTFS(mp); flags = 0; if(mntflags & MNT_FORCE) flags |= FORCECLOSE; dprintf(("ntfs_unmount: vflushing...\n")); error = vflush(mp,NULLVP,flags | SKIPSYSTEM); if (error) { printf("ntfs_unmount: vflush failed: %d\n",error); return (error); } /* Check if only system vnodes are rest */ for(i=0;intm_sysvn[i]) && (ntmp->ntm_sysvn[i]->v_usecount > 1)) return (EBUSY); /* Derefernce all system vnodes */ for(i=0;intm_sysvn[i]) vrele(ntmp->ntm_sysvn[i]); /* vflush system vnodes */ error = vflush(mp,NULLVP,flags); if (error) printf("ntfs_unmount: vflush failed(sysnodes): %d\n",error); #if defined(__FreeBSD__) ntmp->ntm_devvp->v_specmountpoint = NULL; #else ntmp->ntm_devvp->v_specflags &= ~SI_MOUNTEDON; #endif vinvalbuf(ntmp->ntm_devvp, V_SAVE, NOCRED, p, 0, 0); error = VOP_CLOSE(ntmp->ntm_devvp, ronly ? 
FREAD : FREAD|FWRITE, NOCRED, p); vrele(ntmp->ntm_devvp); dprintf(("ntfs_umount: freeing memory...\n")); mp->mnt_data = (qaddr_t)0; mp->mnt_flag &= ~MNT_LOCAL; FREE(ntmp->ntm_ad, M_NTFSMNT); FREE(ntmp->ntm_upcase, M_NTFSMNT); FREE(ntmp, M_NTFSMNT); return (error); } static int ntfs_root( struct mount *mp, struct vnode **vpp ) { struct vnode *nvp; int error = 0; dprintf(("ntfs_root(): sysvn: %p\n", VFSTONTFS(mp)->ntm_sysvn[NTFS_ROOTINO])); error = VFS_VGET(mp, (ino_t)NTFS_ROOTINO, &nvp); if(error) { printf("ntfs_root: VFS_VGET failed: %d\n",error); return (error); } *vpp = nvp; return (0); } static int ntfs_quotactl ( struct mount *mp, int cmds, uid_t uid, caddr_t arg, struct proc *p) { printf("\nntfs_quotactl():\n"); return EOPNOTSUPP; } int ntfs_calccfree( struct ntfsmount *ntmp, cn_t *cfreep) { struct vnode *vp; u_int8_t *tmp; int j, error; long cfree = 0; size_t bmsize, i; vp = ntmp->ntm_sysvn[NTFS_BITMAPINO]; bmsize = VTOF(vp)->f_size; MALLOC(tmp, u_int8_t *, bmsize, M_TEMP, M_WAITOK); error = ntfs_readattr(ntmp, VTONT(vp), NTFS_A_DATA, NULL, 0, bmsize, tmp); if(error) { FREE(tmp, M_TEMP); return (error); } for(i=0;intm_sysvn[NTFS_MFTINO])->f_size; mftallocated = VTOF(ntmp->ntm_sysvn[NTFS_MFTINO])->f_allocated; #if defined(__FreeBSD__) sbp->f_type = mp->mnt_vfc->vfc_typenum; #elif defined(__NetBSD__) sbp->f_type = 0; #else sbp->f_type = MOUNT_NTFS; #endif sbp->f_bsize = ntmp->ntm_bps; sbp->f_iosize = ntmp->ntm_bps * ntmp->ntm_spc; sbp->f_blocks = ntmp->ntm_bootfile.bf_spv; sbp->f_bfree = sbp->f_bavail = ntfs_cntobn(ntmp->ntm_cfree); sbp->f_ffree = sbp->f_bfree / ntmp->ntm_bpmftrec; sbp->f_files = mftallocated / ntfs_bntob(ntmp->ntm_bpmftrec) + sbp->f_ffree; if (sbp != &mp->mnt_stat) { bcopy((caddr_t)mp->mnt_stat.f_mntonname, (caddr_t)&sbp->f_mntonname[0], MNAMELEN); bcopy((caddr_t)mp->mnt_stat.f_mntfromname, (caddr_t)&sbp->f_mntfromname[0], MNAMELEN); } sbp->f_flags = mp->mnt_flag; return (0); } static int ntfs_sync ( struct mount *mp, int waitfor, struct ucred *cred, struct proc *p) { /*dprintf(("ntfs_sync():\n"));*/ return (0); } /*ARGSUSED*/ static int ntfs_fhtovp( #if defined(__FreeBSD__) struct mount *mp, struct fid *fhp, struct sockaddr *nam, struct vnode **vpp, int *exflagsp, struct ucred **credanonp) #elif defined(__NetBSD__) struct mount *mp, struct fid *fhp, struct vnode **vpp) #else struct mount *mp, struct fid *fhp, struct mbuf *nam, struct vnode **vpp, int *exflagsp, struct ucred **credanonp) #endif { printf("\ntfs_fhtovp():\n"); return 0; } static int ntfs_vptofh( struct vnode *vp, struct fid *fhp) { printf("ntfs_vptofh():\n"); return EOPNOTSUPP; } int ntfs_vgetex( struct mount *mp, ino_t ino, u_int32_t attrtype, char *attrname, u_long lkflags, u_long flags, struct proc *p, struct vnode **vpp) { int error; register struct ntfsmount *ntmp; struct ntnode *ip; struct fnode *fp; struct vnode *vp; dprintf(("ntfs_vgetex: ino: %d, attr: 0x%x:%s, lkf: 0x%x, f: 0x%x\n", ino, attrtype, attrname?attrname:"", lkflags, flags )); ntmp = VFSTONTFS(mp); *vpp = NULL; /* Get ntnode */ error = ntfs_ntlookup(ntmp, ino, &ip); if (error) { printf("ntfs_vget: ntfs_ntget failed\n"); return (error); } /* It may be not initialized fully, so force load it */ if (!(flags & VG_DONTLOADIN) && !(ip->i_flag & IN_LOADED)) { error = ntfs_loadntnode(ntmp, ip); if(error) { printf("ntfs_vget: CAN'T LOAD ATTRIBUTES FOR INO: %d\n", ip->i_number); ntfs_ntput(ip); return (error); } } error = ntfs_fget(ntmp, ip, attrtype, attrname, &fp); if (error) { printf("ntfs_vget: ntfs_fget failed\n"); ntfs_ntput(ip); return 
(error); } if (!(flags & VG_DONTVALIDFN) && !(fp->f_flag & FN_VALID)) { if ((ip->i_frflag & NTFS_FRFLAG_DIR) && (fp->f_attrtype == 0x80 && fp->f_attrname == NULL)) { fp->f_type = VDIR; } else if(flags & VG_EXT) { fp->f_type = VNON; fp->f_size =fp->f_allocated = 0; } else { fp->f_type = VREG; error = ntfs_filesize(ntmp, fp, &fp->f_size, &fp->f_allocated); if (error) { ntfs_ntput(ip); return (error); } } fp->f_flag |= FN_VALID; } if (FTOV(fp)) { VGET(FTOV(fp), lkflags, p); *vpp = FTOV(fp); ntfs_ntput(ip); return (0); } error = getnewvnode(VT_NTFS, ntmp->ntm_mountp, ntfs_vnodeop_p, &vp); if(error) { ntfs_frele(fp); ntfs_ntput(ip); return (error); } dprintf(("ntfs_vget: vnode: %p for ntnode: %d\n", vp,ino)); lockinit(&fp->f_lock, PINOD, "fnode", 0, 0); fp->f_vp = vp; vp->v_data = fp; vp->v_type = fp->f_type; if (ino == NTFS_ROOTINO) vp->v_flag |= VROOT; ntfs_ntput(ip); if (lkflags & LK_TYPE_MASK) { error = VN_LOCK(vp, lkflags, p); if (error) { vput(vp); return (error); } } VREF(fp->f_devvp); *vpp = vp; return (0); } static int ntfs_vget( struct mount *mp, ino_t ino, struct vnode **vpp) { return ntfs_vgetex(mp, ino, NTFS_A_DATA, NULL, LK_EXCLUSIVE, 0, curproc, vpp); } #if defined(__FreeBSD__) static struct vfsops ntfs_vfsops = { ntfs_mount, ntfs_start, ntfs_unmount, ntfs_root, ntfs_quotactl, ntfs_statfs, ntfs_sync, ntfs_vget, ntfs_fhtovp, ntfs_vptofh, ntfs_init, NULL }; VFS_SET(ntfs_vfsops, ntfs, 0); #elif defined(__NetBSD__) extern struct vnodeopv_desc ntfs_vnodeop_opv_desc; struct vnodeopv_desc *ntfs_vnodeopv_descs[] = { &ntfs_vnodeop_opv_desc, NULL, }; struct vfsops ntfs_vfsops = { MOUNT_NTFS, ntfs_mount, ntfs_start, ntfs_unmount, ntfs_root, ntfs_quotactl, ntfs_statfs, ntfs_sync, ntfs_vget, ntfs_fhtovp, ntfs_vptofh, ntfs_init, ntfs_sysctl, ntfs_mountroot, ntfs_checkexp, ntfs_vnodeopv_descs, }; #else static struct vfsops ntfs_vfsops = { ntfs_mount, ntfs_start, ntfs_unmount, ntfs_root, ntfs_quotactl, ntfs_statfs, ntfs_sync, ntfs_vget, ntfs_fhtovp, ntfs_vptofh, ntfs_init, }; VFS_SET(ntfs_vfsops, ntfs, MOUNT_NTFS, 0); #endif Index: head/sys/ntfs/ntfs_vnops.c =================================================================== --- head/sys/ntfs/ntfs_vnops.c (revision 49534) +++ head/sys/ntfs/ntfs_vnops.c (revision 49535) @@ -1,1030 +1,1029 @@ /* $NetBSD: ntfs_vnops.c,v 1.2 1999/05/06 15:43:20 christos Exp $ */ /* * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * John Heidemann of the UCLA Ficus project. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $Id: ntfs_vnops.c,v 1.4 1999/05/11 19:54:52 phk Exp $ + * $Id: ntfs_vnops.c,v 1.5 1999/05/12 09:43:06 semenu Exp $ * */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(__FreeBSD__) #include #endif #include #include /*#define NTFS_DEBUG 1*/ #include #include #include #include -#include static int ntfs_bypass __P((struct vop_generic_args *ap)); static int ntfs_read __P((struct vop_read_args *)); static int ntfs_write __P((struct vop_write_args *ap)); static int ntfs_getattr __P((struct vop_getattr_args *ap)); static int ntfs_inactive __P((struct vop_inactive_args *ap)); static int ntfs_print __P((struct vop_print_args *ap)); static int ntfs_reclaim __P((struct vop_reclaim_args *ap)); static int ntfs_strategy __P((struct vop_strategy_args *ap)); #if defined(__NetBSD__) static int ntfs_islocked __P((struct vop_islocked_args *ap)); static int ntfs_unlock __P((struct vop_unlock_args *ap)); static int ntfs_lock __P((struct vop_lock_args *ap)); #endif static int ntfs_access __P((struct vop_access_args *ap)); static int ntfs_open __P((struct vop_open_args *ap)); static int ntfs_close __P((struct vop_close_args *ap)); static int ntfs_readdir __P((struct vop_readdir_args *ap)); static int ntfs_lookup __P((struct vop_lookup_args *ap)); static int ntfs_bmap __P((struct vop_bmap_args *ap)); #if defined(__FreeBSD__) static int ntfs_getpages __P((struct vop_getpages_args *ap)); static int ntfs_putpages __P((struct vop_putpages_args *)); #endif static int ntfs_fsync __P((struct vop_fsync_args *ap)); int ntfs_prtactive = 1; /* 1 => print out reclaim of active vnodes */ #if defined(__FreeBSD__) int ntfs_getpages(ap) struct vop_getpages_args *ap; { return vnode_pager_generic_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_reqpage); } int ntfs_putpages(ap) struct vop_putpages_args *ap; { return vnode_pager_generic_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync, ap->a_rtvals); } #endif /* * This is a noop, simply returning what one has been given. 
*/ int ntfs_bmap(ap) struct vop_bmap_args /* { struct vnode *a_vp; daddr_t a_bn; struct vnode **a_vpp; daddr_t *a_bnp; int *a_runp; int *a_runb; } */ *ap; { dprintf(("ntfs_bmap: vn: %p, blk: %d\n", ap->a_vp,(u_int32_t)ap->a_bn)); if (ap->a_vpp != NULL) *ap->a_vpp = ap->a_vp; if (ap->a_bnp != NULL) *ap->a_bnp = ap->a_bn; if (ap->a_runp != NULL) *ap->a_runp = 0; #if !defined(__NetBSD__) if (ap->a_runb != NULL) *ap->a_runb = 0; #endif return (0); } static int ntfs_read(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct fnode *fp = VTOF(vp); register struct ntnode *ip = FTONT(fp); struct uio *uio = ap->a_uio; struct ntfsmount *ntmp = ip->i_mp; u_int8_t *data; u_int64_t toread; int error; dprintf(("ntfs_read: ino: %d, off: %d resid: %d, segflg: %d\n",ip->i_number,(u_int32_t)uio->uio_offset,uio->uio_resid,uio->uio_segflg)); toread = fp->f_size; dprintf(("ntfs_read: filesize: %d",(u_int32_t)toread)); toread = min( uio->uio_resid, toread - uio->uio_offset ); dprintf((", toread: %d\n",(u_int32_t)toread)); MALLOC(data, u_int8_t *, toread, M_TEMP,M_WAITOK); error = ntfs_readattr(ntmp, ip, fp->f_attrtype, fp->f_attrname, uio->uio_offset, toread, data); if(error) { printf("ntfs_read: ntfs_readattr failed: %d\n",error); FREE(data, M_TEMP); return (error); } error = uiomove(data, (int) toread, uio); if(error) { printf("ntfs_read: uiomove failed: %d\n",error); FREE(data, M_TEMP); return (error); } FREE(data, M_TEMP); return (0); } static int ntfs_bypass(ap) struct vop_generic_args /* { struct vnodeop_desc *a_desc; } */ *ap; { int error = ENOTTY; dprintf(("ntfs_bypass: %s\n", ap->a_desc->vdesc_name)); return (error); } static int ntfs_getattr(ap) struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct proc *a_p; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct fnode *fp = VTOF(vp); register struct ntnode *ip = FTONT(fp); register struct vattr *vap = ap->a_vap; dprintf(("ntfs_getattr: %d, flags: %d\n",ip->i_number,ip->i_flag)); vap->va_fsid = dev2udev(fp->f_dev); vap->va_fileid = ip->i_number; vap->va_mode = ip->i_mode; vap->va_nlink = ip->i_nlink; vap->va_uid = ip->i_uid; vap->va_gid = ip->i_gid; vap->va_rdev = 0; /* XXX UNODEV ? */ vap->va_size = fp->f_size; vap->va_bytes = fp->f_allocated; vap->va_atime = ntfs_nttimetounix(fp->f_times.t_access); vap->va_mtime = ntfs_nttimetounix(fp->f_times.t_write); vap->va_ctime = ntfs_nttimetounix(fp->f_times.t_create); vap->va_flags = ip->i_flag; vap->va_gen = 0; vap->va_blocksize = ip->i_mp->ntm_spc * ip->i_mp->ntm_bps; vap->va_type = fp->f_type; vap->va_filerev = 0; return (0); } /* * Last reference to an ntnode. If necessary, write or delete it. */ int ntfs_inactive(ap) struct vop_inactive_args /* { struct vnode *a_vp; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct ntnode *ip = VTONT(vp); int error; dprintf(("ntfs_inactive: vnode: %p, ntnode: %d\n", vp, ip->i_number)); if (ntfs_prtactive && vp->v_usecount != 0) vprint("ntfs_inactive: pushing active", vp); error = 0; VOP__UNLOCK(vp,0,ap->a_p); /* * If we are done with the ntnode, reclaim it * so that it can be reused immediately. */ if (vp->v_usecount == 0 && ip->i_mode == 0) #if defined(__FreeBSD__) vrecycle(vp, (struct simplelock *)0, ap->a_p); #else /* defined(__NetBSD__) */ vgone(vp); #endif return (error); } /* * Reclaim an inode so that it can be used for other purposes. 
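ntfs_read() above sizes the transfer as toread = min(uio_resid, filesize - uio_offset) before allocating a temporary buffer for ntfs_readattr() and uiomove(). A minimal sketch of that clamping; the explicit end-of-file guard is an addition here (in the kernel path an out-of-range offset is caught later by ntfs_readattr()'s E2BIG check):

#include <stdio.h>
#include <stdint.h>

/* Clamp a request for `resid' bytes at `offset' against the file size. */
static uint64_t
clamp_read(uint64_t fsize, uint64_t offset, uint64_t resid)
{
	if (offset >= fsize)
		return (0);			/* at or past EOF */
	return (resid < fsize - offset ? resid : fsize - offset);
}

int
main(void)
{
	printf("%llu\n", (unsigned long long)clamp_read(1000, 900, 512));	/* 100 */
	printf("%llu\n", (unsigned long long)clamp_read(1000, 0, 512));	/* 512 */
	printf("%llu\n", (unsigned long long)clamp_read(1000, 1200, 512));	/* 0 */
	return (0);
}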
*/ int ntfs_reclaim(ap) struct vop_reclaim_args /* { struct vnode *a_vp; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct fnode *fp = VTOF(vp); register struct ntnode *ip = FTONT(fp); int error; dprintf(("ntfs_reclaim: vnode: %p, ntnode: %d\n", vp, ip->i_number)); error = ntfs_ntget(ip); if (error) return (error); #if defined(__FreeBSD__) VOP__UNLOCK(vp,0,ap->a_p); #endif /* Purge old data structures associated with the inode. */ cache_purge(vp); if (fp->f_devvp) { vrele(fp->f_devvp); fp->f_devvp = NULL; } ntfs_frele(fp); vp->v_data = NULL; ntfs_ntput(ip); return (0); } static int ntfs_print(ap) struct vop_print_args /* { struct vnode *a_vp; } */ *ap; { /* printf("[ntfs_print]");*/ return (0); } /* * Calculate the logical to physical mapping if not done already, * then call the device strategy routine. */ int ntfs_strategy(ap) struct vop_strategy_args /* { struct buf *a_bp; } */ *ap; { register struct buf *bp = ap->a_bp; register struct vnode *vp = bp->b_vp; register struct fnode *fp = VTOF(vp); register struct ntnode *ip = FTONT(fp); struct ntfsmount *ntmp = ip->i_mp; int error; dprintf(("ntfs_strategy: offset: %d, blkno: %d, lblkno: %d\n", (u_int32_t)bp->b_offset,(u_int32_t)bp->b_blkno, (u_int32_t)bp->b_lblkno)); dprintf(("strategy: bcount: %d flags: 0x%x\n", (u_int32_t)bp->b_bcount,bp->b_flags)); if (bp->b_flags & B_READ) { u_int32_t toread; if (ntfs_cntob(bp->b_blkno) >= fp->f_size) { clrbuf(bp); error = 0; } else { toread = min(bp->b_bcount, fp->f_size-ntfs_cntob(bp->b_blkno)); dprintf(("ntfs_strategy: toread: %d, fsize: %d\n", toread,(u_int32_t)fp->f_size)); error = ntfs_readattr(ntmp, ip, fp->f_attrtype, fp->f_attrname, ntfs_cntob(bp->b_blkno), toread, bp->b_data); if (error) { printf("ntfs_strategy: ntfs_readattr failed\n"); bp->b_error = error; bp->b_flags |= B_ERROR; } bzero(bp->b_data + toread, bp->b_bcount - toread); } } else { size_t tmp; u_int32_t towrite; if (ntfs_cntob(bp->b_blkno) + bp->b_bcount >= fp->f_size) { printf("ntfs_strategy: CAN'T EXTEND FILE\n"); bp->b_error = error = EFBIG; bp->b_flags |= B_ERROR; } else { towrite = min(bp->b_bcount, fp->f_size-ntfs_cntob(bp->b_blkno)); dprintf(("ntfs_strategy: towrite: %d, fsize: %d\n", towrite,(u_int32_t)fp->f_size)); error = ntfs_writeattr_plain(ntmp, ip, fp->f_attrtype, fp->f_attrname, ntfs_cntob(bp->b_blkno),towrite, bp->b_data, &tmp); if (error) { printf("ntfs_strategy: ntfs_writeattr fail\n"); bp->b_error = error; bp->b_flags |= B_ERROR; } } } biodone(bp); return (error); } static int ntfs_write(ap) struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct fnode *fp = VTOF(vp); register struct ntnode *ip = FTONT(fp); struct uio *uio = ap->a_uio; struct ntfsmount *ntmp = ip->i_mp; u_int8_t *data; u_int64_t towrite; off_t off; size_t written; int error; dprintf(("ntfs_write: ino: %d, off: %d resid: %d, segflg: %d\n",ip->i_number,(u_int32_t)uio->uio_offset,uio->uio_resid,uio->uio_segflg)); towrite = fp->f_size; dprintf(("ntfs_write: filesize: %d",(u_int32_t)towrite)); if (uio->uio_resid + uio->uio_offset > towrite) { printf("ntfs_write: CAN'T WRITE BEYOND OF FILE\n"); return (EFBIG); } towrite = min(uio->uio_resid, towrite - uio->uio_offset); off = uio->uio_offset; dprintf((", towrite: %d\n",(u_int32_t)towrite)); MALLOC(data, u_int8_t *, towrite, M_TEMP,M_WAITOK); error = uiomove(data, (int) towrite, uio); if(error) { FREE(data, M_TEMP); return (error); } error = ntfs_writeattr_plain(ntmp, ip, 
fp->f_attrtype, fp->f_attrname, off, towrite, data, &written); if(error) { printf("ntfs_write: ntfs_writeattr failed: %d\n",error); FREE(data, M_TEMP); return (error); } FREE(data, M_TEMP); return (0); } #if defined(__NetBSD__) /* * Check for a locked ntnode. */ int ntfs_islocked(ap) struct vop_islocked_args /* { struct vnode *a_vp; } */ *ap; { register struct ntnode *ip = VTONT(ap->a_vp); dprintf(("ntfs_islocked %d\n",ip->i_number)); if (ip->i_flag & IN_LOCKED) return (1); return (0); } /* * Unlock an ntnode. If WANT bit is on, wakeup. */ int ntfs_lockcount = 90; int ntfs_unlock(ap) struct vop_unlock_args /* { struct vnode *a_vp; } */ *ap; { register struct ntnode *ip = VTONT(ap->a_vp); #ifdef DIAGNOSTIC struct proc *p = curproc; #endif dprintf(("ntfs_unlock %d\n",ip->i_number)); #ifdef DIAGNOSTIC if ((ip->i_flag & IN_LOCKED) == 0) { vprint("ntfs_unlock: unlocked ntnode", ap->a_vp); panic("ntfs_unlock NOT LOCKED"); } if (p && p->p_pid != ip->i_lockholder && p->p_pid > -1 && ip->i_lockholder > -1 && ntfs_lockcount++ < 100) panic("unlocker (%d) != lock holder (%d)", p->p_pid, ip->i_lockholder); #endif if (--ip->i_lockcount > 0) { if ((ip->i_flag & IN_RECURSE) == 0) panic("ntfs_unlock: recursive lock prematurely released, pid=%d\n", ip->i_lockholder); return (0); } ip->i_lockholder = 0; ip->i_flag &= ~(IN_LOCKED|IN_RECURSE); if (ip->i_flag & IN_WANTED) { ip->i_flag &= ~IN_WANTED; wakeup((caddr_t)ip); } return (0); } /* * Lock an ntnode. If its already locked, set the WANT bit and sleep. */ int ntfs_lock(ap) struct vop_lock_args /* { struct vnode *a_vp; } */ *ap; { struct proc *p = curproc; register struct vnode *vp = ap->a_vp; register struct ntnode *ip = VTONT(vp); dprintf(("ntfs_lock %d (%d locks)\n",ip->i_number,ip->i_lockcount)); start: while (vp->v_flag & VXLOCK) { vp->v_flag |= VXWANT; (void) tsleep((caddr_t)vp, PINOD, "ntflk1", 0); } if (vp->v_tag == VT_NON) return (ENOENT); ip = VTONT(vp); if (ip->i_flag & IN_LOCKED) { if (p->p_pid == ip->i_lockholder) { if( (ip->i_flag & IN_RECURSE) == 0) panic("ntfs_lock: recursive lock not expected, pid: %d\n", ip->i_lockholder); } else { ip->i_flag |= IN_WANTED; #ifdef DIAGNOSTIC if (p) ip->i_lockwaiter = p->p_pid; else ip->i_lockwaiter = -1; #endif (void) tsleep((caddr_t)ip, PINOD, "ntflk2", 0); goto start; } } #ifdef DIAGNOSTIC ip->i_lockwaiter = 0; if (((ip->i_flag & IN_RECURSE) == 0) && (ip->i_lockholder != 0)) panic("lockholder (%d) != 0", ip->i_lockholder); if (p && p->p_pid == 0) printf("locking by process 0\n"); #endif if ((ip->i_flag & IN_RECURSE) == 0) ip->i_lockcount = 1; else ++ip->i_lockcount; if (p) ip->i_lockholder = p->p_pid; else ip->i_lockholder = -1; ip->i_flag |= IN_LOCKED; return (0); } #endif int ntfs_access(ap) struct vop_access_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct proc *a_p; } */ *ap; { struct vnode *vp = ap->a_vp; struct ntnode *ip = VTONT(vp); struct ucred *cred = ap->a_cred; mode_t mask, mode = ap->a_mode; register gid_t *gp; int i; #ifdef QUOTA int error; #endif dprintf(("ntfs_access: %d\n",ip->i_number)); /* * Disallow write attempts on read-only file systems; * unless the file is a socket, fifo, or a block or * character device resident on the file system. */ if (mode & VWRITE) { switch ((int)vp->v_type) { case VDIR: case VLNK: case VREG: if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); #ifdef QUOTA if (error = getinoquota(ip)) return (error); #endif break; } } /* If immutable bit set, nobody gets to write it. 
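ntfs_access(), which starts just above and continues below, uses the classic owner/group/other test: build a mask of the S_I* bits the request needs for whichever class the caller falls into, then require all of them in the file mode. A self-contained restatement of that test (the WANT_* constants and the example in main() are invented for illustration):

#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>

#define WANT_READ	0x1
#define WANT_WRITE	0x2
#define WANT_EXEC	0x4

static int
check_access(mode_t fmode, uid_t fuid, gid_t fgid,
    uid_t uid, const gid_t *groups, int ngroups, int want)
{
	mode_t mask = 0;
	int i;

	if (uid == 0)				/* user id 0 always gets access */
		return (0);

	if (uid == fuid) {			/* owner class */
		if (want & WANT_EXEC)  mask |= S_IXUSR;
		if (want & WANT_READ)  mask |= S_IRUSR;
		if (want & WANT_WRITE) mask |= S_IWUSR;
		return ((fmode & mask) == mask ? 0 : -1);
	}
	for (i = 0; i < ngroups; i++)		/* group class */
		if (groups[i] == fgid) {
			if (want & WANT_EXEC)  mask |= S_IXGRP;
			if (want & WANT_READ)  mask |= S_IRGRP;
			if (want & WANT_WRITE) mask |= S_IWGRP;
			return ((fmode & mask) == mask ? 0 : -1);
		}
	if (want & WANT_EXEC)  mask |= S_IXOTH;	/* everyone else */
	if (want & WANT_READ)  mask |= S_IROTH;
	if (want & WANT_WRITE) mask |= S_IWOTH;
	return ((fmode & mask) == mask ? 0 : -1);
}

int
main(void)
{
	gid_t groups[] = { 20, 100 };

	/* 0640 file owned by 1001:100; caller 1002 is in group 100 and wants read */
	printf("%s\n", check_access(0640, 1001, 100, 1002, groups, 2,
	    WANT_READ) == 0 ? "granted" : "denied");
	return (0);
}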
*/ /* if ((mode & VWRITE) && (ip->i_flags & IMMUTABLE)) return (EPERM); */ /* Otherwise, user id 0 always gets access. */ if (cred->cr_uid == 0) return (0); mask = 0; /* Otherwise, check the owner. */ if (cred->cr_uid == ip->i_uid) { if (mode & VEXEC) mask |= S_IXUSR; if (mode & VREAD) mask |= S_IRUSR; if (mode & VWRITE) mask |= S_IWUSR; return ((ip->i_mode & mask) == mask ? 0 : EACCES); } /* Otherwise, check the groups. */ for (i = 0, gp = cred->cr_groups; i < cred->cr_ngroups; i++, gp++) if (ip->i_gid == *gp) { if (mode & VEXEC) mask |= S_IXGRP; if (mode & VREAD) mask |= S_IRGRP; if (mode & VWRITE) mask |= S_IWGRP; return ((ip->i_mode & mask) == mask ? 0 : EACCES); } /* Otherwise, check everyone else. */ if (mode & VEXEC) mask |= S_IXOTH; if (mode & VREAD) mask |= S_IROTH; if (mode & VWRITE) mask |= S_IWOTH; return ((ip->i_mode & mask) == mask ? 0 : EACCES); } /* * Open called. * * Nothing to do. */ /* ARGSUSED */ static int ntfs_open(ap) struct vop_open_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct proc *a_p; } */ *ap; { #if NTFS_DEBUG register struct vnode *vp = ap->a_vp; register struct ntnode *ip = VTONT(vp); printf("ntfs_open: %d\n",ip->i_number); #endif /* * Files marked append-only must be opened for appending. */ return (0); } /* * Close called. * * Update the times on the inode. */ /* ARGSUSED */ static int ntfs_close(ap) struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct proc *a_p; } */ *ap; { #if NTFS_DEBUG register struct vnode *vp = ap->a_vp; register struct ntnode *ip = VTONT(vp); printf("ntfs_close: %d\n",ip->i_number); #endif return (0); } int ntfs_readdir(ap) struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_ncookies; u_int **cookies; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct fnode *fp = VTOF(vp); register struct ntnode *ip = FTONT(fp); struct uio *uio = ap->a_uio; struct ntfsmount *ntmp = ip->i_mp; int i, error = 0; u_int32_t faked = 0, num; int ncookies = 0; struct dirent cde; off_t off; dprintf(("ntfs_readdir %d off: %d resid: %d\n",ip->i_number,(u_int32_t)uio->uio_offset,uio->uio_resid)); off = uio->uio_offset; /* Simulate . in every dir except ROOT */ if( ip->i_number != NTFS_ROOTINO ) { struct dirent dot = { NTFS_ROOTINO, sizeof(struct dirent), DT_DIR, 1, "." }; if( uio->uio_offset < sizeof(struct dirent) ) { dot.d_fileno = ip->i_number; error = uiomove((char *)&dot,sizeof(struct dirent),uio); if(error) return (error); ncookies ++; } } /* Simulate .. in every dir including ROOT */ if( uio->uio_offset < 2 * sizeof(struct dirent) ) { struct dirent dotdot = { NTFS_ROOTINO, sizeof(struct dirent), DT_DIR, 2, ".." }; error = uiomove((char *)&dotdot,sizeof(struct dirent),uio); if(error) return (error); ncookies ++; } faked = (ip->i_number == NTFS_ROOTINO) ? 
1 : 2; num = uio->uio_offset / sizeof(struct dirent) - faked; while( uio->uio_resid >= sizeof(struct dirent) ) { struct attr_indexentry *iep; error = ntfs_ntreaddir(ntmp, fp, num, &iep); if(error) return (error); if( NULL == iep ) break; while( !(iep->ie_flag & NTFS_IEFLAG_LAST) && (uio->uio_resid >= sizeof(struct dirent)) ) { if( ntfs_isnamepermitted(ntmp,iep) ) { dprintf(("ntfs_readdir: elem: %d, fname:[",num)); for(i=0;iie_fnamelen;i++) { cde.d_name[i] = (char)iep->ie_fname[i]; dprintf(("%c", cde.d_name[i])); } dprintf(("] type: %d, flag: %d, ",iep->ie_fnametype, iep->ie_flag)); cde.d_name[i] = '\0'; cde.d_namlen = iep->ie_fnamelen; cde.d_fileno = iep->ie_number; cde.d_type = (iep->ie_fflag & NTFS_FFLAG_DIR) ? DT_DIR : DT_REG; cde.d_reclen = sizeof(struct dirent); dprintf(("%s\n", (cde.d_type == DT_DIR) ? "dir":"reg")); error = uiomove((char *)&cde, sizeof(struct dirent), uio); if(error) return (error); ncookies++; num++; } iep = NTFS_NEXTREC(iep,struct attr_indexentry *); } } dprintf(("ntfs_readdir: %d entries (%d bytes) read\n", ncookies,(u_int)(uio->uio_offset - off))); dprintf(("ntfs_readdir: off: %d resid: %d\n", (u_int32_t)uio->uio_offset,uio->uio_resid)); if (!error && ap->a_ncookies != NULL) { struct dirent* dpStart; struct dirent* dp; #if defined(__FreeBSD__) u_long *cookies; u_long *cookiep; #else /* defined(__NetBSD__) */ off_t *cookies; off_t *cookiep; #endif printf("ntfs_readdir: %d cookies\n",ncookies); if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) panic("ntfs_readdir: unexpected uio from NFS server"); dpStart = (struct dirent *) ((caddr_t)uio->uio_iov->iov_base - (uio->uio_offset - off)); #if defined(__FreeBSD__) MALLOC(cookies, u_long *, ncookies * sizeof(u_long), M_TEMP, M_WAITOK); #else /* defined(__NetBSD__) */ MALLOC(cookies, off_t *, ncookies * sizeof(off_t), M_TEMP, M_WAITOK); #endif for (dp = dpStart, cookiep = cookies, i=0; i < ncookies; dp = (struct dirent *)((caddr_t) dp + dp->d_reclen), i++) { off += dp->d_reclen; *cookiep++ = (u_int) off; } *ap->a_ncookies = ncookies; *ap->a_cookies = cookies; } /* if (ap->a_eofflag) *ap->a_eofflag = VTONT(ap->a_vp)->i_size <= uio->uio_offset; */ return (error); } int ntfs_lookup(ap) struct vop_lookup_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ *ap; { register struct vnode *dvp = ap->a_dvp; register struct ntnode *dip = VTONT(dvp); struct ntfsmount *ntmp = dip->i_mp; struct componentname *cnp = ap->a_cnp; struct ucred *cred = cnp->cn_cred; int error; int lockparent = cnp->cn_flags & LOCKPARENT; #if NTFS_DEBUG int wantparent = cnp->cn_flags & (LOCKPARENT|WANTPARENT); #endif dprintf(("ntfs_lookup: %s (%ld bytes) in %d, lp: %d, wp: %d \n", cnp->cn_nameptr, cnp->cn_namelen, dip->i_number,lockparent, wantparent)); error = VOP_ACCESS(dvp, VEXEC, cred, cnp->cn_proc); if(error) return (error); if( (cnp->cn_namelen == 1) && !strncmp(cnp->cn_nameptr,".",1) ) { dprintf(("ntfs_lookup: faking . directory in %d\n", dip->i_number)); VREF(dvp); *ap->a_vpp = dvp; return (0); } else if( (cnp->cn_namelen == 2) && !strncmp(cnp->cn_nameptr,"..",2) && (cnp->cn_flags & ISDOTDOT) ) { struct ntvattr *vap; dprintf(("ntfs_lookup: faking .. 
directory in %d\n", dip->i_number)); error = ntfs_ntvattrget(ntmp, dip, NTFS_A_NAME, NULL, 0, &vap); if(error) return (error); VOP__UNLOCK(dvp,0,cnp->cn_proc); dprintf(("ntfs_lookup: parentdir: %d\n", vap->va_a_name->n_pnumber)); error = VFS_VGET(ntmp->ntm_mountp, vap->va_a_name->n_pnumber,ap->a_vpp); ntfs_ntvattrrele(vap); if(error) { VOP__LOCK(dvp, 0, cnp->cn_proc); return(error); } if( lockparent && (cnp->cn_flags & ISLASTCN) && (error = VOP__LOCK(dvp, 0, cnp->cn_proc)) ) { vput( *(ap->a_vpp) ); return (error); } return (error); } else { error = ntfs_ntlookupfile(ntmp, dvp, cnp, ap->a_vpp); if(error) return (error); dprintf(("ntfs_lookup: found ino: %d\n", VTONT(*ap->a_vpp)->i_number)); if(!lockparent || !(cnp->cn_flags & ISLASTCN)) VOP__UNLOCK(dvp, 0, cnp->cn_proc); if (cnp->cn_flags & MAKEENTRY) cache_enter(dvp, *ap->a_vpp, cnp); } return (error); } /* * Flush the blocks of a file to disk. * * This function is worthless for vnodes that represent directories. Maybe we * could just do a sync if they try an fsync on a directory file. */ static int ntfs_fsync(ap) struct vop_fsync_args /* { struct vnode *a_vp; struct ucred *a_cred; int a_waitfor; struct proc *a_p; } */ *ap; { return (0); } /* * Global vfs data structures */ vop_t **ntfs_vnodeop_p; #if defined(__FreeBSD__) static #endif struct vnodeopv_entry_desc ntfs_vnodeop_entries[] = { { &vop_default_desc, (vop_t *)ntfs_bypass }, { &vop_getattr_desc, (vop_t *)ntfs_getattr }, { &vop_inactive_desc, (vop_t *)ntfs_inactive }, { &vop_reclaim_desc, (vop_t *)ntfs_reclaim }, { &vop_print_desc, (vop_t *)ntfs_print }, #if defined(__FreeBSD__) { &vop_islocked_desc, (vop_t *)vop_stdislocked }, { &vop_unlock_desc, (vop_t *)vop_stdunlock }, { &vop_lock_desc, (vop_t *)vop_stdlock }, { &vop_cachedlookup_desc, (vop_t *)ntfs_lookup }, { &vop_lookup_desc, (vop_t *)vfs_cache_lookup }, #else { &vop_islocked_desc, (vop_t *)ntfs_islocked }, { &vop_unlock_desc, (vop_t *)ntfs_unlock }, { &vop_lock_desc, (vop_t *)ntfs_lock }, { &vop_lookup_desc, (vop_t *)ntfs_lookup }, #endif { &vop_access_desc, (vop_t *)ntfs_access }, { &vop_close_desc, (vop_t *)ntfs_close }, { &vop_open_desc, (vop_t *)ntfs_open }, { &vop_readdir_desc, (vop_t *)ntfs_readdir }, { &vop_fsync_desc, (vop_t *)ntfs_fsync }, { &vop_bmap_desc, (vop_t *)ntfs_bmap }, #if defined(__FreeBSD__) { &vop_getpages_desc, (vop_t *) ntfs_getpages }, { &vop_putpages_desc, (vop_t *) ntfs_putpages }, #endif { &vop_strategy_desc, (vop_t *)ntfs_strategy }, #if defined(__FreeBSD__) { &vop_bwrite_desc, (vop_t *)vop_stdbwrite }, #else /* defined(__NetBSD__) */ { &vop_bwrite_desc, (vop_t *)vn_bwrite }, #endif { &vop_read_desc, (vop_t *)ntfs_read }, { &vop_write_desc, (vop_t *)ntfs_write }, { NULL, NULL } }; #if defined(__FreeBSD__) static #endif struct vnodeopv_desc ntfs_vnodeop_opv_desc = { &ntfs_vnodeop_p, ntfs_vnodeop_entries }; #if defined(__FreeBSD__) VNODEOP_SET(ntfs_vnodeop_opv_desc); #endif Index: head/sys/sys/conf.h =================================================================== --- head/sys/sys/conf.h (revision 49534) +++ head/sys/sys/conf.h (revision 49535) @@ -1,231 +1,278 @@ /*- * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. 
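The ntfs_vnodeop_entries[] table above pairs vnode operation descriptors with handlers and routes everything unlisted through ntfs_bypass() via vop_default_desc. A toy user-space analogue of that table-driven dispatch (operation names and handlers here are invented):

#include <stdio.h>
#include <string.h>

typedef int (*op_t)(const char *arg);

static int op_default(const char *arg) { printf("bypass: %s\n", arg); return (-1); }
static int op_read(const char *arg)    { printf("read: %s\n", arg); return (0); }
static int op_getattr(const char *arg) { printf("getattr: %s\n", arg); return (0); }

static const struct {
	const char *name;
	op_t fn;
} ops[] = {
	{ "default", op_default },		/* plays the role of vop_default_desc */
	{ "read",    op_read },
	{ "getattr", op_getattr },
};

static int
vop_call(const char *name, const char *arg)
{
	size_t i;

	for (i = 1; i < sizeof(ops) / sizeof(ops[0]); i++)
		if (strcmp(ops[i].name, name) == 0)
			return (ops[i].fn(arg));
	return (ops[0].fn(arg));		/* unimplemented ops hit the default */
}

int
main(void)
{
	vop_call("read", "file.txt");
	vop_call("rename", "file.txt");		/* not in the table: bypassed */
	return (0);
}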
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)conf.h 8.5 (Berkeley) 1/9/95 - * $Id: conf.h,v 1.66 1999/07/17 19:58:51 phk Exp $ + * $Id: conf.h,v 1.67 1999/07/20 09:47:50 phk Exp $ */ #ifndef _SYS_CONF_H_ #define _SYS_CONF_H_ +#define SPECNAMELEN 15 + +struct tty; +struct vnode; + +struct specinfo { + struct mount *si_mountpoint; + int si_bsize_phys; /* minimum physical block size */ + int si_bsize_best; /* optimal block size / VBLK */ + int si_bsize_max; /* maximum block size */ + + udev_t si_udev; + SLIST_ENTRY(specinfo) si_hash; + struct vnode *si_hlist; + char si_name[SPECNAMELEN + 1]; + void *si_drv1, *si_drv2; + struct cdevsw *si_devsw; + union { + struct { + struct tty *__sit_tty; + } __si_tty; + } __si_u; +}; + +#define si_tty_tty __si_u.__si_tty.__sit_tty + /* + * Exported shorthand + */ +#define v_hashchain v_specinfo->si_hlist +#define v_specmountpoint v_specinfo->si_mountpoint + +/* + * Special device management + */ +#define SPECHSZ 64 +#define SPECHASH(rdev) (((unsigned)(minor(rdev)))%SPECHSZ) + +/* * Definitions of device driver entry switches */ struct buf; struct proc; -struct specinfo; -struct tty; struct uio; -struct vnode; typedef int d_open_t __P((dev_t dev, int oflags, int devtype, struct proc *p)); typedef int d_close_t __P((dev_t dev, int fflag, int devtype, struct proc *p)); typedef void d_strategy_t __P((struct buf *bp)); typedef int d_parms_t __P((dev_t dev, struct specinfo *sinfo, int ctl)); typedef int d_ioctl_t __P((dev_t dev, u_long cmd, caddr_t data, int fflag, struct proc *p)); typedef int d_dump_t __P((dev_t dev)); typedef int d_psize_t __P((dev_t dev)); typedef int d_read_t __P((dev_t dev, struct uio *uio, int ioflag)); typedef int d_write_t __P((dev_t dev, struct uio *uio, int ioflag)); typedef void d_stop_t __P((struct tty *tp, int rw)); typedef int d_reset_t __P((dev_t dev)); typedef struct tty *d_devtotty_t __P((dev_t dev)); typedef int 
d_poll_t __P((dev_t dev, int events, struct proc *p)); typedef int d_mmap_t __P((dev_t dev, vm_offset_t offset, int nprot)); typedef int l_open_t __P((dev_t dev, struct tty *tp)); typedef int l_close_t __P((struct tty *tp, int flag)); typedef int l_read_t __P((struct tty *tp, struct uio *uio, int flag)); typedef int l_write_t __P((struct tty *tp, struct uio *uio, int flag)); typedef int l_ioctl_t __P((struct tty *tp, u_long cmd, caddr_t data, int flag, struct proc *p)); typedef int l_rint_t __P((int c, struct tty *tp)); typedef int l_start_t __P((struct tty *tp)); typedef int l_modem_t __P((struct tty *tp, int flag)); /* * Types for d_type. */ #define D_TAPE 1 #define D_DISK 2 #define D_TTY 3 #define D_TYPEMASK 0xffff /* * Flags for d_flags. */ #define D_NOCLUSTERR 0x10000 /* disables cluter read */ #define D_NOCLUSTERW 0x20000 /* disables cluster write */ #define D_NOCLUSTERRW (D_NOCLUSTERR | D_NOCLUSTERW) #define D_CANFREE 0x40000 /* can free blocks */ /* - * Control type for d_parms() call. - */ -#define DPARM_GET 0 /* ask device to load parms in */ - -/* * Character device switch table */ struct cdevsw { d_open_t *d_open; d_close_t *d_close; d_read_t *d_read; d_write_t *d_write; d_ioctl_t *d_ioctl; d_stop_t *d_stop; d_reset_t *d_bogoreset; /* XXX not used */ d_devtotty_t *d_devtotty; d_poll_t *d_poll; d_mmap_t *d_mmap; d_strategy_t *d_strategy; char *d_name; /* base device name, e.g. 'vn' */ d_parms_t *d_bogoparms; /* XXX not used */ int d_maj; d_dump_t *d_dump; d_psize_t *d_psize; u_int d_flags; int d_maxio; int d_bmaj; }; /* * Line discipline switch table */ struct linesw { l_open_t *l_open; l_close_t *l_close; l_read_t *l_read; l_write_t *l_write; l_ioctl_t *l_ioctl; l_rint_t *l_rint; l_start_t *l_start; l_modem_t *l_modem; u_char l_hotchar; }; #ifdef KERNEL extern struct linesw linesw[]; extern int nlinesw; int ldisc_register __P((int , struct linesw *)); void ldisc_deregister __P((int)); #define LDISC_LOAD -1 /* Loadable line discipline */ #endif /* * Swap device table */ struct swdevt { udev_t sw_dev; /* For quasibogus swapdev reporting */ int sw_flags; int sw_nblks; struct vnode *sw_vp; dev_t sw_device; }; #define SW_FREED 0x01 #define SW_SEQUENTIAL 0x02 #define sw_freed sw_flags /* XXX compat */ #ifdef KERNEL d_open_t noopen; d_close_t noclose; d_read_t noread; d_write_t nowrite; d_ioctl_t noioctl; d_stop_t nostop; d_reset_t noreset; d_devtotty_t nodevtotty; d_mmap_t nommap; #define nostrategy ((d_strategy_t *)NULL) #define noparms ((d_parms_t *)NULL) #define nopoll seltrue d_dump_t nodump; #define NUMCDEVSW 256 /* * nopsize is little used, so not worth having dummy functions for. 
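The struct specinfo introduced above hangs off hash chains through si_hash, with SPECHASH() bucketing devices by minor number so the same specinfo node can be found again for a given device. A user-space analogue of that lookup-or-create scheme (types are simplified and the device numbers in main() are hypothetical):

#include <stdio.h>
#include <stdlib.h>

#define SPECHSZ		64
#define SPECHASH(minor)	((unsigned)(minor) % SPECHSZ)

struct devinfo {			/* toy stand-in for struct specinfo */
	unsigned major, minor;
	struct devinfo *next;		/* plays the role of SLIST_ENTRY(specinfo) */
};

static struct devinfo *spec_hash[SPECHSZ];

/* Return the single node for (major, minor), creating it on first use. */
static struct devinfo *
dev_enter(unsigned major, unsigned minor)
{
	struct devinfo **head = &spec_hash[SPECHASH(minor)];
	struct devinfo *d;

	for (d = *head; d != NULL; d = d->next)
		if (d->major == major && d->minor == minor)
			return (d);
	if ((d = calloc(1, sizeof(*d))) == NULL)
		return (NULL);
	d->major = major;
	d->minor = minor;
	d->next = *head;
	*head = d;
	return (d);
}

int
main(void)
{
	struct devinfo *a = dev_enter(116, 4);
	struct devinfo *b = dev_enter(116, 4);

	printf("repeated lookups share one node: %s\n", a == b ? "yes" : "no");
	return (0);
}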
*/ #define nopsize ((d_psize_t *)NULL) d_open_t nullopen; d_close_t nullclose; l_read_t l_noread; l_write_t l_nowrite; struct module; struct devsw_module_data { int (*chainevh)(struct module *, int, void *); /* next handler */ void *chainarg; /* arg for next event handler */ struct cdevsw *cdevsw; /* device functions */ /* Do not initialize fields hereafter */ }; #define DEV_MODULE(name, cmaj, bmaj, devsw, evh, arg) \ static struct devsw_module_data name##_devsw_mod = { \ evh, arg, &devsw \ }; \ \ static moduledata_t name##_mod = { \ #name, \ devsw_module_handler, \ &name##_devsw_mod \ }; \ DECLARE_MODULE(name, name##_mod, SI_SUB_DRIVERS, SI_ORDER_MIDDLE+cmaj*256+bmaj) struct cdevsw *bdevsw __P((dev_t dev)); int cdevsw_add __P((struct cdevsw *new)); int cdevsw_remove __P((struct cdevsw *old)); dev_t chrtoblk __P((dev_t dev)); struct cdevsw *devsw __P((dev_t dev)); int devsw_module_handler __P((struct module *mod, int what, void *arg)); int iskmemdev __P((dev_t dev)); int iszerodev __P((dev_t dev)); dev_t makebdev __P((int maj, int min)); +dev_t make_dev __P((struct cdevsw *devsw, int minor, uid_t uid, gid_t gid, int perms, char *fmt, ...)) __printflike(6, 7); void setconf __P((void)); + +/* + * XXX: This gunk included in case DEVFS resurfaces + */ + +#define UID_ROOT 0 +#define UID_BIN 3 +#define UID_UUCP 66 + +#define GID_WHEEL 0 +#define GID_KMEM 2 +#define GID_OPERATOR 5 +#define GID_BIN 7 +#define GID_GAMES 13 +#define GID_DIALER 68 + #endif /* KERNEL */ #endif /* !_SYS_CONF_H_ */ Index: head/sys/sys/devfsext.h =================================================================== --- head/sys/sys/devfsext.h (revision 49534) +++ head/sys/sys/devfsext.h (revision 49535) @@ -1,85 +1,72 @@ /* * Copyright 1997,1998 Julian Elischer. All rights reserved. * julian@freebsd.org * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER ``AS IS'' AND ANY EXPRESS * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE HOLDER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $Id: devfsext.h,v 1.21 1998/07/13 06:45:16 bde Exp $ + * $Id: devfsext.h,v 1.22 1998/12/10 19:57:01 eivind Exp $ */ #ifndef _SYS_DEVFSEXT_H_ #define _SYS_DEVFSEXT_H_ /* * Make a device at a path, and get a cookie for it in return. * Specify the type, the minor number and the devsw entry to use, * and the initial default perms/ownerships. 
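make_dev(), declared above, and devfs_add_devswf(), whose prototype follows, both take printf-style arguments for the device node name, bounded by the SPECNAMELEN limit defined earlier. A sketch of composing such a name safely on the caller's side (the helper here is invented; the kernel routines do their own formatting internally):

#include <stdio.h>
#include <stdarg.h>

#define SPECNAMELEN 15			/* matches the limit defined above */

static int
format_devname(char buf[SPECNAMELEN + 1], const char *fmt, ...)
{
	va_list ap;
	int n;

	va_start(ap, fmt);
	n = vsnprintf(buf, SPECNAMELEN + 1, fmt, ap);
	va_end(ap);
	return (n < 0 || n > SPECNAMELEN ? -1 : 0);	/* reject truncated names */
}

int
main(void)
{
	char name[SPECNAMELEN + 1];

	if (format_devname(name, "vn%d", 4) == 0)
		printf("device node: /dev/%s\n", name);
	return (0);
}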
void *devfs_add_devswf __P((void *devsw, int minor, int chrblk, uid_t uid, gid_t gid, int perms, char *fmt, ...)) __printflike(7, 8); /* * Make a link to a device you already made, and have the cookie for * We get another cookie, but for now, it can be discarded, as * at the moment there is nothing you can do with it that you couldn't do * with the original cookie. ( XXX this might be something I should change ) */ void *devfs_makelink __P((void *original, char *fmt, ...)) __printflike(2, 3); /* * Remove all instances of a device you have made. INCLUDING LINKS. * I.e. either the cookie from the original device or the cookie * from a link will have the effect of removing both entries. * Removing with BOTH an original cookie and one from a link is * likely to cause a panic. */ void devfs_remove_dev __P((void *devnmp)); /* * Check if a device exists and is the type you need. Returns NULL or a * cookie that can be used to try 'open' the device. XXX This is a bit * of a duplication of devfs_lookup(). I might one day try merge them a bit. * Used for mountroot under DEVFS. Path is relative to the base of the devfs. */ struct vnode *devfs_open_device __P((char *path, int devtype)); void devfs_close_device __P((struct vnode *vn)); dev_t devfs_vntodev __P((struct vnode *vn)); /* extract dev_t from devfs vn */ #define DV_CHR 0 #define DV_BLK 1 #define DV_DEV 2 - -/* XXX */ -#define UID_ROOT 0 -#define UID_BIN 3 -#define UID_UUCP 66 - -/* XXX */ -#define GID_WHEEL 0 -#define GID_KMEM 2 -#define GID_OPERATOR 5 -#define GID_BIN 7 -#define GID_GAMES 13 -#define GID_DIALER 68 #endif /* !_SYS_DEVFSEXT_H_ */
Index: head/sys/sys/vnode.h =================================================================== --- head/sys/sys/vnode.h (revision 49534) +++ head/sys/sys/vnode.h (revision 49535) @@ -1,582 +1,584 @@ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vnode.h 8.7 (Berkeley) 2/4/94 - * $Id: vnode.h,v 1.91 1999/07/20 09:47:54 phk Exp $ + * $Id: vnode.h,v 1.92 1999/07/26 06:25:53 alc Exp $ */ #ifndef _SYS_VNODE_H_ #define _SYS_VNODE_H_ #include #include #include #include /* * The vnode is the focus of all file activity in UNIX. There is a * unique vnode allocated for each active file, each current directory, * each mounted-on file, text file, and the root. */ /* * Vnode types. VNON means no type. */ enum vtype { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO, VBAD }; /* * Vnode tag types. * These are for the benefit of external programs only (e.g., pstat) * and should NEVER be inspected by the kernel. */ enum vtagtype { VT_NON, VT_UFS, VT_NFS, VT_MFS, VT_PC, VT_LFS, VT_LOFS, VT_FDESC, VT_PORTAL, VT_NULL, VT_UMAP, VT_KERNFS, VT_PROCFS, VT_AFS, VT_ISOFS, VT_UNION, VT_MSDOSFS, VT_DEVFS, VT_TFS, VT_VFS, VT_CODA, VT_NTFS }; /* * Each underlying filesystem allocates its own private area and hangs * it from v_data. If non-null, this area is freed in getnewvnode(). */ TAILQ_HEAD(buflists, buf); typedef int vop_t __P((void *)); struct namecache; /* * Reading or writing any of these items requires holding the appropriate lock. * v_freelist is locked by the global vnode_free_list simple lock. * v_mntvnodes is locked by the global mntvnodes simple lock. * v_flag, v_usecount, v_holdcount and v_writecount are * locked by the v_interlock simple lock. * v_pollinfo is locked by the lock contained inside it. 
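As a concrete illustration of these rules (a sketch only, not a substitute for vref()/vget()): a field such as v_usecount may only be touched with the interlock held, so a caller that already holds a valid vp would bracket the update roughly as follows, assuming the usual simple_lock()/simple_unlock() primitives:

	simple_lock(&vp->v_interlock);
	vp->v_usecount++;		/* reference taken under v_interlock */
	simple_unlock(&vp->v_interlock);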
*/ struct vnode { u_long v_flag; /* vnode flags (see below) */ int v_usecount; /* reference count of users */ int v_writecount; /* reference count of writers */ int v_holdcnt; /* page & buffer references */ daddr_t v_lastr; /* last read (read-ahead) */ u_long v_id; /* capability identifier */ struct mount *v_mount; /* ptr to vfs we are in */ vop_t **v_op; /* vnode operations vector */ TAILQ_ENTRY(vnode) v_freelist; /* vnode freelist */ LIST_ENTRY(vnode) v_mntvnodes; /* vnodes for mount point */ struct buflists v_cleanblkhd; /* clean blocklist head */ struct buflists v_dirtyblkhd; /* dirty blocklist head */ LIST_ENTRY(vnode) v_synclist; /* vnodes with dirty buffers */ long v_numoutput; /* num of writes in progress */ enum vtype v_type; /* vnode type */ union { struct mount *vu_mountedhere;/* ptr to mounted vfs (VDIR) */ struct socket *vu_socket; /* unix ipc (VSOCK) */ struct { struct specinfo *vu_specinfo; /* device (VCHR, VBLK) */ struct vnode *vu_specnext; } vu_spec; struct fifoinfo *vu_fifoinfo; /* fifo (VFIFO) */ } v_un; struct nqlease *v_lease; /* Soft reference to lease */ daddr_t v_lastw; /* last write (write cluster) */ daddr_t v_cstart; /* start block of cluster */ daddr_t v_lasta; /* last allocation */ int v_clen; /* length of current cluster */ int v_maxio; /* maximum I/O cluster size */ struct vm_object *v_object; /* Place to store VM object */ struct simplelock v_interlock; /* lock on usecount and flag */ struct lock *v_vnlock; /* used for non-locking fs's */ enum vtagtype v_tag; /* type of underlying data */ void *v_data; /* private data for fs */ LIST_HEAD(, namecache) v_cache_src; /* Cache entries from us */ TAILQ_HEAD(, namecache) v_cache_dst; /* Cache entries to us */ struct vnode *v_dd; /* .. vnode */ u_long v_ddid; /* .. capability identifier */ struct { struct simplelock vpi_lock; /* lock to protect below */ struct selinfo vpi_selinfo; /* identity of poller(s) */ short vpi_events; /* what they are looking for */ short vpi_revents; /* what has happened */ } v_pollinfo; #ifdef DEBUG_LOCKS const char *filename; /* Source file doing locking */ int line; /* Line number doing locking */ #endif }; #define v_mountedhere v_un.vu_mountedhere #define v_socket v_un.vu_socket #define v_specinfo v_un.vu_spec.vu_specinfo #define v_rdev v_un.vu_spec.vu_specinfo #define v_specnext v_un.vu_spec.vu_specnext #define v_fifoinfo v_un.vu_fifoinfo #define VN_POLLEVENT(vp, events) \ do { \ if ((vp)->v_pollinfo.vpi_events & (events)) \ vn_pollevent((vp), (events)); \ } while (0) /* * Vnode flags. 
*/ #define VROOT 0x00001 /* root of its file system */ #define VTEXT 0x00002 /* vnode is a pure text prototype */ #define VSYSTEM 0x00004 /* vnode being used by kernel */ #define VISTTY 0x00008 /* vnode represents a tty */ #define VXLOCK 0x00100 /* vnode is locked to change underlying type */ #define VXWANT 0x00200 /* process is waiting for vnode */ #define VBWAIT 0x00400 /* waiting for output to complete */ #define VALIASED 0x00800 /* vnode has an alias */ #define VDIROP 0x01000 /* LFS: vnode is involved in a directory op */ #define VOBJBUF 0x02000 /* Allocate buffers in VM object */ #define VNINACT 0x04000 /* LFS: skip ufs_inactive() in lfs_vunref */ #define VAGE 0x08000 /* Insert vnode at head of free list */ #define VOLOCK 0x10000 /* vnode is locked waiting for an object */ #define VOWANT 0x20000 /* a process is waiting for VOLOCK */ #define VDOOMED 0x40000 /* This vnode is being recycled */ #define VFREE 0x80000 /* This vnode is on the freelist */ #define VTBFREE 0x100000 /* This vnode is on the to-be-freelist */ #define VONWORKLST 0x200000 /* On syncer work-list */ #define VMOUNT 0x400000 /* Mount in progress */ /* * Vnode attributes. A field value of VNOVAL represents a field whose value * is unavailable (getattr) or which is not to be changed (setattr). */ struct vattr { enum vtype va_type; /* vnode type (for create) */ u_short va_mode; /* files access mode and type */ short va_nlink; /* number of references to file */ uid_t va_uid; /* owner user id */ gid_t va_gid; /* owner group id */ udev_t va_fsid; /* file system id */ long va_fileid; /* file id */ u_quad_t va_size; /* file size in bytes */ long va_blocksize; /* blocksize preferred for i/o */ struct timespec va_atime; /* time of last access */ struct timespec va_mtime; /* time of last modification */ struct timespec va_ctime; /* time file changed */ u_long va_gen; /* generation number of file */ u_long va_flags; /* flags defined for file */ udev_t va_rdev; /* device the special file represents */ u_quad_t va_bytes; /* bytes of disk space held by file */ u_quad_t va_filerev; /* file modification number */ u_int va_vaflags; /* operations flags, see below */ long va_spare; /* remain quad aligned */ }; /* * Flags for va_vaflags. */ #define VA_UTIMES_NULL 0x01 /* utimes argument was NULL */ #define VA_EXCLUSIVE 0x02 /* exclusive create request */ /* * Flags for ioflag. */ #define IO_UNIT 0x01 /* do I/O as atomic unit */ #define IO_APPEND 0x02 /* append write to end */ #define IO_SYNC 0x04 /* do I/O synchronously */ #define IO_NODELOCKED 0x08 /* underlying node already locked */ #define IO_NDELAY 0x10 /* FNDELAY flag set in file table */ #define IO_VMIO 0x20 /* data already in VMIO space */ #define IO_INVAL 0x40 /* invalidate after I/O */ /* * Modes. Some values same as Ixxx entries from inode.h for now. */ #define VSUID 04000 /* set user id on execution */ #define VSGID 02000 /* set group id on execution */ #define VSVTX 01000 /* save swapped text even after use */ #define VREAD 00400 /* read, write, execute permissions */ #define VWRITE 00200 #define VEXEC 00100 /* * Token indicating no attribute value yet assigned. */ #define VNOVAL (-1) #ifdef KERNEL #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_VNODE); #endif /* * Convert between vnode types and inode formats (since POSIX.1 * defines mode word of stat structure in terms of inode formats). 
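For example, assuming the conventional vttoif_tab contents, MAKEIMODE(VREG, 0644) evaluates to S_IFREG | 0644 (octal 0100644), i.e. the mode word stat(2) reports for an ordinary read/write regular file.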
*/ extern enum vtype iftovt_tab[]; extern int vttoif_tab[]; #define IFTOVT(mode) (iftovt_tab[((mode) & S_IFMT) >> 12]) #define VTTOIF(indx) (vttoif_tab[(int)(indx)]) #define MAKEIMODE(indx, mode) (int)(VTTOIF(indx) | (mode)) /* * Flags to various vnode functions. */ #define SKIPSYSTEM 0x0001 /* vflush: skip vnodes marked VSYSTEM */ #define FORCECLOSE 0x0002 /* vflush: force file closure */ #define WRITECLOSE 0x0004 /* vflush: only close writable files */ #define DOCLOSE 0x0008 /* vclean: close active files */ #define V_SAVE 0x0001 /* vinvalbuf: sync file first */ #define REVOKEALL 0x0001 /* vop_revoke: revoke all aliases */ #define VREF(vp) vref(vp) #ifdef DIAGNOSTIC #define VATTR_NULL(vap) vattr_null(vap) #else #define VATTR_NULL(vap) (*(vap) = va_null) /* initialize a vattr */ #endif /* DIAGNOSTIC */ #define NULLVP ((struct vnode *)NULL) #define VNODEOP_SET(f) \ C_SYSINIT(f##init, SI_SUB_VFS, SI_ORDER_SECOND, vfs_add_vnodeops, &f); \ C_SYSUNINIT(f##uninit, SI_SUB_VFS, SI_ORDER_SECOND, vfs_rm_vnodeops, &f); /* * Global vnode data. */ extern struct vnode *rootvnode; /* root (i.e. "/") vnode */ extern int desiredvnodes; /* number of vnodes desired */ extern time_t syncdelay; /* max time to delay syncing data */ extern time_t filedelay; /* time to delay syncing files */ extern time_t dirdelay; /* time to delay syncing directories */ extern time_t metadelay; /* time to delay syncing metadata */ extern struct vm_zone *namei_zone; extern int prtactive; /* nonzero to call vprint() */ extern struct vattr va_null; /* predefined null vattr structure */ extern int vfs_ioopt; /* * Macro/function to check for client cache inconsistency w.r.t. leasing. */ #define LEASE_READ 0x1 /* Check lease for readers */ #define LEASE_WRITE 0x2 /* Check lease for modifiers */ extern void (*lease_updatetime) __P((int deltat)); #define VSHOULDFREE(vp) \ (!((vp)->v_flag & (VFREE|VDOOMED)) && \ !(vp)->v_holdcnt && !(vp)->v_usecount && \ (!(vp)->v_object || \ !((vp)->v_object->ref_count || (vp)->v_object->resident_page_count))) #define VSHOULDBUSY(vp) \ (((vp)->v_flag & (VFREE|VTBFREE)) && \ ((vp)->v_holdcnt || (vp)->v_usecount)) #endif /* KERNEL */ /* * Mods for extensibility. */ /* * Flags for vdesc_flags: */ #define VDESC_MAX_VPS 16 /* Low order 16 flag bits are reserved for willrele flags for vp arguments. */ #define VDESC_VP0_WILLRELE 0x0001 #define VDESC_VP1_WILLRELE 0x0002 #define VDESC_VP2_WILLRELE 0x0004 #define VDESC_VP3_WILLRELE 0x0008 #define VDESC_NOMAP_VPP 0x0100 #define VDESC_VPP_WILLRELE 0x0200 /* * VDESC_NO_OFFSET is used to identify the end of the offset list * and in places where no such field exists. */ #define VDESC_NO_OFFSET -1 /* * This structure describes the vnode operation taking place. */ struct vnodeop_desc { int vdesc_offset; /* offset in vector--first for speed */ char *vdesc_name; /* a readable name for debugging */ int vdesc_flags; /* VDESC_* flags */ /* * These ops are used by bypass routines to map and locate arguments. * Creds and procs are not needed in bypass routines, but sometimes * they are useful to (for example) transport layers. * Nameidata is useful because it has a cred in it. */ int *vdesc_vp_offsets; /* list ended by VDESC_NO_OFFSET */ int vdesc_vpp_offset; /* return vpp location */ int vdesc_cred_offset; /* cred location, if any */ int vdesc_proc_offset; /* proc location, if any */ int vdesc_componentname_offset; /* if any */ /* * Finally, we've got a list of private data (about each operation) * for each transport layer. 
(Support to manage this list is not * yet part of BSD.) */ caddr_t *vdesc_transports; }; #ifdef KERNEL /* * A list of all the operation descs. */ extern struct vnodeop_desc *vnodeop_descs[]; /* * Interlock for scanning list of vnodes attached to a mountpoint */ extern struct simplelock mntvnode_slock; /* * This macro is very helpful in defining those offsets in the vdesc struct. * * This is stolen from X11R4. I ignored all the fancy stuff for * Crays, so if you decide to port this to such a serious machine, * you might want to consult Intrinsic.h's XtOffset{,Of,To}. */ #define VOPARG_OFFSET(p_type,field) \ ((int) (((char *) (&(((p_type)NULL)->field))) - ((char *) NULL))) #define VOPARG_OFFSETOF(s_type,field) \ VOPARG_OFFSET(s_type*,field) #define VOPARG_OFFSETTO(S_TYPE,S_OFFSET,STRUCT_P) \ ((S_TYPE)(((char*)(STRUCT_P))+(S_OFFSET))) /* * This structure is used to configure the new vnodeops vector. */ struct vnodeopv_entry_desc { struct vnodeop_desc *opve_op; /* which operation this is */ vop_t *opve_impl; /* code implementing this operation */ }; struct vnodeopv_desc { /* ptr to the ptr to the vector where op should go */ vop_t ***opv_desc_vector_p; struct vnodeopv_entry_desc *opv_desc_ops; /* null terminated list */ }; /* * A generic structure. * This can be used by bypass routines to identify generic arguments. */ struct vop_generic_args { struct vnodeop_desc *a_desc; /* other random data follows, presumably */ }; #ifdef DEBUG_VFS_LOCKS /* * Macros to aid in tracing VFS locking problems. Not totally * reliable since if the process sleeps between changing the lock * state and checking it with the assert, some other process could * change the state. They are good enough for debugging a single * filesystem using a single-threaded test. I find that 'cvs co src' * is a pretty good test. */ /* * [dfr] Kludge until I get around to fixing all the vfs locking. */ #define IS_LOCKING_VFS(vp) ((vp)->v_tag == VT_UFS \ || (vp)->v_tag == VT_MFS \ || (vp)->v_tag == VT_NFS \ || (vp)->v_tag == VT_LFS \ || (vp)->v_tag == VT_ISOFS \ || (vp)->v_tag == VT_MSDOSFS \ || (vp)->v_tag == VT_DEVFS) #define ASSERT_VOP_LOCKED(vp, str) \ if ((vp) && IS_LOCKING_VFS(vp) && !VOP_ISLOCKED(vp)) { \ panic("%s: %p is not locked but should be", str, vp); \ } #define ASSERT_VOP_UNLOCKED(vp, str) \ if ((vp) && IS_LOCKING_VFS(vp) && VOP_ISLOCKED(vp)) { \ panic("%s: %p is locked but shouldn't be", str, vp); \ } #else #define ASSERT_VOP_LOCKED(vp, str) #define ASSERT_VOP_UNLOCKED(vp, str) #endif /* * VOCALL calls an op given an ops vector. We break it out because BSD's * vclean changes the ops vector and then wants to call ops with the old * vector. */ #define VOCALL(OPSV,OFF,AP) (( *((OPSV)[(OFF)])) (AP)) /* * This call works for vnodes in the kernel. */ #define VCALL(VP,OFF,AP) VOCALL((VP)->v_op,(OFF),(AP)) #define VDESC(OP) (& __CONCAT(OP,_desc)) #define VOFFSET(OP) (VDESC(OP)->vdesc_offset) /* * VMIO support inline */ extern int vmiodirenable; static __inline int vn_canvmio(struct vnode *vp) { if (vp && (vp->v_type == VREG || (vmiodirenable && vp->v_type == VDIR))) return(TRUE); return(FALSE); } /* * Finally, include the default set of vnode operations. */ #include "vnode_if.h" /* * Public vnode manipulation functions. 
*/ struct componentname; struct file; struct mount; struct nameidata; struct ostat; struct proc; struct stat; struct nstat; struct ucred; struct uio; struct vattr; struct vnode; struct vop_bwrite_args; extern int (*lease_check_hook) __P((struct vop_lease_args *)); int bdevvp __P((dev_t dev, struct vnode **vpp)); /* cache_* may belong in namei.h. */ void cache_enter __P((struct vnode *dvp, struct vnode *vp, struct componentname *cnp)); int cache_lookup __P((struct vnode *dvp, struct vnode **vpp, struct componentname *cnp)); void cache_purge __P((struct vnode *vp)); void cache_purgevfs __P((struct mount *mp)); void cvtstat __P((struct stat *st, struct ostat *ost)); void cvtnstat __P((struct stat *sb, struct nstat *nsb)); int getnewvnode __P((enum vtagtype tag, struct mount *mp, vop_t **vops, struct vnode **vpp)); int lease_check __P((struct vop_lease_args *ap)); +int spec_vnoperate __P((struct vop_generic_args *)); int speedup_syncer __P((void)); void vattr_null __P((struct vattr *vap)); int vcount __P((struct vnode *vp)); void vdrop __P((struct vnode *)); int vfinddev __P((dev_t dev, enum vtype type, struct vnode **vpp)); void vfs_add_vnodeops __P((const void *)); void vfs_rm_vnodeops __P((const void *)); int vflush __P((struct mount *mp, struct vnode *skipvp, int flags)); int vget __P((struct vnode *vp, int lockflag, struct proc *p)); void vgone __P((struct vnode *vp)); void vhold __P((struct vnode *)); int vinvalbuf __P((struct vnode *vp, int save, struct ucred *cred, struct proc *p, int slpflag, int slptimeo)); int vtruncbuf __P((struct vnode *vp, struct ucred *cred, struct proc *p, off_t length, int blksize)); void vprint __P((char *label, struct vnode *vp)); int vrecycle __P((struct vnode *vp, struct simplelock *inter_lkp, struct proc *p)); int vn_close __P((struct vnode *vp, int flags, struct ucred *cred, struct proc *p)); int vn_lock __P((struct vnode *vp, int flags, struct proc *p)); #ifdef DEBUG_LOCKS int debug_vn_lock __P((struct vnode *vp, int flags, struct proc *p, const char *filename, int line)); #define vn_lock(vp,flags,p) debug_vn_lock(vp,flags,p,__FILE__,__LINE__) #endif int vn_open __P((struct nameidata *ndp, int fmode, int cmode)); void vn_pollevent __P((struct vnode *vp, int events)); void vn_pollgone __P((struct vnode *vp)); int vn_pollrecord __P((struct vnode *vp, struct proc *p, int events)); int vn_rdwr __P((enum uio_rw rw, struct vnode *vp, caddr_t base, int len, off_t offset, enum uio_seg segflg, int ioflg, struct ucred *cred, int *aresid, struct proc *p)); int vn_stat __P((struct vnode *vp, struct stat *sb, struct proc *p)); dev_t vn_todev __P((struct vnode *vp)); int vfs_cache_lookup __P((struct vop_lookup_args *ap)); int vfs_object_create __P((struct vnode *vp, struct proc *p, struct ucred *cred)); int vn_writechk __P((struct vnode *vp)); int vop_stdbwrite __P((struct vop_bwrite_args *ap)); int vop_stdislocked __P((struct vop_islocked_args *)); int vop_stdlock __P((struct vop_lock_args *)); int vop_stdunlock __P((struct vop_unlock_args *)); int vop_noislocked __P((struct vop_islocked_args *)); int vop_nolock __P((struct vop_lock_args *)); int vop_nopoll __P((struct vop_poll_args *)); int vop_nounlock __P((struct vop_unlock_args *)); int vop_stdpathconf __P((struct vop_pathconf_args *)); int vop_stdpoll __P((struct vop_poll_args *)); int vop_revoke __P((struct vop_revoke_args *)); int vop_sharedlock __P((struct vop_lock_args *)); int vop_eopnotsupp __P((struct vop_generic_args *ap)); int vop_ebadf __P((struct vop_generic_args *ap)); int vop_einval __P((struct 
vop_generic_args *ap)); int vop_enotty __P((struct vop_generic_args *ap)); int vop_defaultop __P((struct vop_generic_args *ap)); int vop_null __P((struct vop_generic_args *ap)); int vop_panic __P((struct vop_generic_args *ap)); struct vnode * checkalias __P((struct vnode *vp, udev_t nvp_rdev, struct mount *mp)); void vput __P((struct vnode *vp)); void vrele __P((struct vnode *vp)); void vref __P((struct vnode *vp)); void vbusy __P((struct vnode *vp)); extern vop_t **default_vnodeop_p; +extern vop_t **spec_vnodeop_p; extern TAILQ_HEAD(tobefreelist, vnode) vnode_tobefree_list; /* vnode free list */ #endif /* KERNEL */ #endif /* !_SYS_VNODE_H_ */ Index: head/sys/ufs/ffs/ffs_softdep.c =================================================================== --- head/sys/ufs/ffs/ffs_softdep.c (revision 49534) +++ head/sys/ufs/ffs/ffs_softdep.c (revision 49535) @@ -1,4485 +1,4485 @@ /* * Copyright 1998 Marshall Kirk McKusick. All Rights Reserved. * * The soft updates code is derived from the appendix of a University * of Michigan technical report (Gregory R. Ganger and Yale N. Patt, * "Soft Updates: A Solution to the Metadata Update Problem in File * Systems", CSE-TR-254-95, August 1995). * * The following are the copyrights and redistribution conditions that * apply to this copy of the soft update software. For a license * to use, redistribute or sell the soft update software under * conditions other than those described here, please contact the * author at one of the following addresses: * * Marshall Kirk McKusick mckusick@mckusick.com * 1614 Oxford Street +1-510-843-9542 * Berkeley, CA 94709-1608 * USA * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. None of the names of McKusick, Ganger, Patt, or the University of * Michigan may be used to endorse or promote products derived from * this software without specific prior written permission. * 4. Redistributions in any form must be accompanied by information on * how to obtain complete source code for any accompanying software * that uses this software. This source code must either be included * in the distribution or be available for no more than the cost of * distribution plus a nominal fee, and must be freely redistributable * under reasonable conditions. For an executable file, complete * source code means the source code for all modules it contains. * It does not mean source code for modules or files that typically * accompany the operating system on which the executable file runs, * e.g., standard library modules or system header files. * * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)ffs_softdep.c 9.40 (McKusick) 6/15/99 - * $Id: ffs_softdep.c,v 1.33 1999/06/27 13:26:23 peter Exp $ + * $Id: ffs_softdep.c,v 1.34 1999/06/29 15:57:40 mckusick Exp $ */ /* * For now we want the safety net that the DIAGNOSTIC and DEBUG flags provide. */ #ifndef DIAGNOSTIC #define DIAGNOSTIC #endif #ifndef DEBUG #define DEBUG #endif #include #include #include #include #include #include #include #include #include -#include +#include #include #include #include #include #include #include #include #include /* * These definitions need to be adapted to the system to which * this file is being ported. */ /* * malloc types defined for the softdep system. */ MALLOC_DEFINE(M_PAGEDEP, "pagedep","File page dependencies"); MALLOC_DEFINE(M_INODEDEP, "inodedep","Inode dependencies"); MALLOC_DEFINE(M_NEWBLK, "newblk","New block allocation"); MALLOC_DEFINE(M_BMSAFEMAP, "bmsafemap","Block or frag allocated from cyl group map"); MALLOC_DEFINE(M_ALLOCDIRECT, "allocdirect","Block or frag dependency for an inode"); MALLOC_DEFINE(M_INDIRDEP, "indirdep","Indirect block dependencies"); MALLOC_DEFINE(M_ALLOCINDIR, "allocindir","Block dependency for an indirect block"); MALLOC_DEFINE(M_FREEFRAG, "freefrag","Previously used frag for an inode"); MALLOC_DEFINE(M_FREEBLKS, "freeblks","Blocks freed from an inode"); MALLOC_DEFINE(M_FREEFILE, "freefile","Inode deallocated"); MALLOC_DEFINE(M_DIRADD, "diradd","New directory entry"); MALLOC_DEFINE(M_MKDIR, "mkdir","New directory"); MALLOC_DEFINE(M_DIRREM, "dirrem","Directory entry deleted"); #define D_PAGEDEP 0 #define D_INODEDEP 1 #define D_NEWBLK 2 #define D_BMSAFEMAP 3 #define D_ALLOCDIRECT 4 #define D_INDIRDEP 5 #define D_ALLOCINDIR 6 #define D_FREEFRAG 7 #define D_FREEBLKS 8 #define D_FREEFILE 9 #define D_DIRADD 10 #define D_MKDIR 11 #define D_DIRREM 12 #define D_LAST D_DIRREM /* * translate from workitem type to memory type * MUST match the defines above, such that memtype[D_XXX] == M_XXX */ static struct malloc_type *memtype[] = { M_PAGEDEP, M_INODEDEP, M_NEWBLK, M_BMSAFEMAP, M_ALLOCDIRECT, M_INDIRDEP, M_ALLOCINDIR, M_FREEFRAG, M_FREEBLKS, M_FREEFILE, M_DIRADD, M_MKDIR, M_DIRREM }; #define DtoM(type) (memtype[type]) /* * Names of malloc types. */ #define TYPENAME(type) \ ((unsigned)(type) < D_LAST ? memtype[type]->ks_shortdesc : "???") #define CURPROC curproc /* * End system adaptaion definitions. */ /* * Internal function prototypes. 
*/ static void softdep_error __P((char *, int)); static void drain_output __P((struct vnode *, int)); static int getdirtybuf __P((struct buf **, int)); static void clear_remove __P((struct proc *)); static void clear_inodedeps __P((struct proc *)); static int flush_pagedep_deps __P((struct vnode *, struct mount *, struct diraddhd *)); static int flush_inodedep_deps __P((struct fs *, ino_t)); static int handle_written_filepage __P((struct pagedep *, struct buf *)); static void diradd_inode_written __P((struct diradd *, struct inodedep *)); static int handle_written_inodeblock __P((struct inodedep *, struct buf *)); static void handle_allocdirect_partdone __P((struct allocdirect *)); static void handle_allocindir_partdone __P((struct allocindir *)); static void initiate_write_filepage __P((struct pagedep *, struct buf *)); static void handle_written_mkdir __P((struct mkdir *, int)); static void initiate_write_inodeblock __P((struct inodedep *, struct buf *)); static void handle_workitem_freefile __P((struct freefile *)); static void handle_workitem_remove __P((struct dirrem *)); static struct dirrem *newdirrem __P((struct buf *, struct inode *, struct inode *, int)); static void free_diradd __P((struct diradd *)); static void free_allocindir __P((struct allocindir *, struct inodedep *)); static int indir_trunc __P((struct inode *, ufs_daddr_t, int, ufs_lbn_t, long *)); static void deallocate_dependencies __P((struct buf *, struct inodedep *)); static void free_allocdirect __P((struct allocdirectlst *, struct allocdirect *, int)); static int free_inodedep __P((struct inodedep *)); static void handle_workitem_freeblocks __P((struct freeblks *)); static void merge_inode_lists __P((struct inodedep *)); static void setup_allocindir_phase2 __P((struct buf *, struct inode *, struct allocindir *)); static struct allocindir *newallocindir __P((struct inode *, int, ufs_daddr_t, ufs_daddr_t)); static void handle_workitem_freefrag __P((struct freefrag *)); static struct freefrag *newfreefrag __P((struct inode *, ufs_daddr_t, long)); static void allocdirect_merge __P((struct allocdirectlst *, struct allocdirect *, struct allocdirect *)); static struct bmsafemap *bmsafemap_lookup __P((struct buf *)); static int newblk_lookup __P((struct fs *, ufs_daddr_t, int, struct newblk **)); static int inodedep_lookup __P((struct fs *, ino_t, int, struct inodedep **)); static int pagedep_lookup __P((struct inode *, ufs_lbn_t, int, struct pagedep **)); static void pause_timer __P((void *)); static int request_cleanup __P((int, int)); static void add_to_worklist __P((struct worklist *)); /* * Exported softdep operations. */ struct bio_ops bioops = { softdep_disk_io_initiation, /* io_start */ softdep_disk_write_complete, /* io_complete */ softdep_deallocate_dependencies, /* io_deallocate */ softdep_fsync, /* io_fsync */ softdep_process_worklist, /* io_sync */ }; /* * Locking primitives. * * For a uniprocessor, all we need to do is protect against disk * interrupts. For a multiprocessor, this lock would have to be * a mutex. A single mutex is used throughout this file, though * finer grain locking could be used if contention warranted it. * * For a multiprocessor, the sleep call would accept a lock and * release it after the sleep processing was complete. In a uniprocessor * implementation there is no such interlock, so we simple mark * the places where it needs to be done with the `interlocked' form * of the lock calls. 
Since the uniprocessor sleep already interlocks * the spl, there is nothing that really needs to be done. */ #ifndef /* NOT */ DEBUG static struct lockit { int lkt_spl; } lk = { 0 }; #define ACQUIRE_LOCK(lk) (lk)->lkt_spl = splbio() #define FREE_LOCK(lk) splx((lk)->lkt_spl) #define ACQUIRE_LOCK_INTERLOCKED(lk) #define FREE_LOCK_INTERLOCKED(lk) #else /* DEBUG */ static struct lockit { int lkt_spl; pid_t lkt_held; } lk = { 0, -1 }; static int lockcnt; static void acquire_lock __P((struct lockit *)); static void free_lock __P((struct lockit *)); static void acquire_lock_interlocked __P((struct lockit *)); static void free_lock_interlocked __P((struct lockit *)); #define ACQUIRE_LOCK(lk) acquire_lock(lk) #define FREE_LOCK(lk) free_lock(lk) #define ACQUIRE_LOCK_INTERLOCKED(lk) acquire_lock_interlocked(lk) #define FREE_LOCK_INTERLOCKED(lk) free_lock_interlocked(lk) static void acquire_lock(lk) struct lockit *lk; { if (lk->lkt_held != -1) { if (lk->lkt_held == CURPROC->p_pid) panic("softdep_lock: locking against myself"); else panic("softdep_lock: lock held by %d", lk->lkt_held); } lk->lkt_spl = splbio(); lk->lkt_held = CURPROC->p_pid; lockcnt++; } static void free_lock(lk) struct lockit *lk; { if (lk->lkt_held == -1) panic("softdep_unlock: lock not held"); lk->lkt_held = -1; splx(lk->lkt_spl); } static void acquire_lock_interlocked(lk) struct lockit *lk; { if (lk->lkt_held != -1) { if (lk->lkt_held == CURPROC->p_pid) panic("softdep_lock_interlocked: locking against self"); else panic("softdep_lock_interlocked: lock held by %d", lk->lkt_held); } lk->lkt_held = CURPROC->p_pid; lockcnt++; } static void free_lock_interlocked(lk) struct lockit *lk; { if (lk->lkt_held == -1) panic("softdep_unlock_interlocked: lock not held"); lk->lkt_held = -1; } #endif /* DEBUG */ /* * Place holder for real semaphores. */ struct sema { int value; pid_t holder; char *name; int prio; int timo; }; static void sema_init __P((struct sema *, char *, int, int)); static int sema_get __P((struct sema *, struct lockit *)); static void sema_release __P((struct sema *)); static void sema_init(semap, name, prio, timo) struct sema *semap; char *name; int prio, timo; { semap->holder = -1; semap->value = 0; semap->name = name; semap->prio = prio; semap->timo = timo; } static int sema_get(semap, interlock) struct sema *semap; struct lockit *interlock; { if (semap->value++ > 0) { if (interlock != NULL) FREE_LOCK_INTERLOCKED(interlock); tsleep((caddr_t)semap, semap->prio, semap->name, semap->timo); if (interlock != NULL) { ACQUIRE_LOCK_INTERLOCKED(interlock); FREE_LOCK(interlock); } return (0); } semap->holder = CURPROC->p_pid; if (interlock != NULL) FREE_LOCK(interlock); return (1); } static void sema_release(semap) struct sema *semap; { if (semap->value <= 0 || semap->holder != CURPROC->p_pid) panic("sema_release: not held"); if (--semap->value > 0) { semap->value = 0; wakeup(semap); } semap->holder = -1; } /* * Worklist queue management. * These routines require that the lock be held. 
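Holding the lock here is, on a uniprocessor, nothing more than having raised the interrupt priority level; a minimal sketch of what the non-DEBUG ACQUIRE_LOCK/FREE_LOCK macros above boil down to (the work in the middle is a placeholder):

	int s;

	s = splbio();		/* block disk interrupts around list manipulation */
	/* ... insert or remove dependency structures ... */
	splx(s);		/* restore the previous priority level */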
*/ #ifndef /* NOT */ DEBUG #define WORKLIST_INSERT(head, item) do { \ (item)->wk_state |= ONWORKLIST; \ LIST_INSERT_HEAD(head, item, wk_list); \ } while (0) #define WORKLIST_REMOVE(item) do { \ (item)->wk_state &= ~ONWORKLIST; \ LIST_REMOVE(item, wk_list); \ } while (0) #define WORKITEM_FREE(item, type) FREE(item, DtoM(type)) #else /* DEBUG */ static void worklist_insert __P((struct workhead *, struct worklist *)); static void worklist_remove __P((struct worklist *)); static void workitem_free __P((struct worklist *, int)); #define WORKLIST_INSERT(head, item) worklist_insert(head, item) #define WORKLIST_REMOVE(item) worklist_remove(item) #define WORKITEM_FREE(item, type) workitem_free((struct worklist *)item, type) static void worklist_insert(head, item) struct workhead *head; struct worklist *item; { if (lk.lkt_held == -1) panic("worklist_insert: lock not held"); if (item->wk_state & ONWORKLIST) panic("worklist_insert: already on list"); item->wk_state |= ONWORKLIST; LIST_INSERT_HEAD(head, item, wk_list); } static void worklist_remove(item) struct worklist *item; { if (lk.lkt_held == -1) panic("worklist_remove: lock not held"); if ((item->wk_state & ONWORKLIST) == 0) panic("worklist_remove: not on list"); item->wk_state &= ~ONWORKLIST; LIST_REMOVE(item, wk_list); } static void workitem_free(item, type) struct worklist *item; int type; { if (item->wk_state & ONWORKLIST) panic("workitem_free: still on list"); if (item->wk_type != type) panic("workitem_free: type mismatch"); FREE(item, DtoM(type)); } #endif /* DEBUG */ /* * Workitem queue management */ static struct workhead softdep_workitem_pending; static int softdep_worklist_busy; static int max_softdeps; /* maximum number of structs before slowdown */ static int tickdelay = 2; /* number of ticks to pause during slowdown */ static int proc_waiting; /* tracks whether we have a timeout posted */ static struct proc *filesys_syncer; /* proc of filesystem syncer process */ static int req_clear_inodedeps; /* syncer process flush some inodedeps */ #define FLUSH_INODES 1 static int req_clear_remove; /* syncer process flush some freeblks */ #define FLUSH_REMOVE 2 /* * runtime statistics */ static int stat_blk_limit_push; /* number of times block limit neared */ static int stat_ino_limit_push; /* number of times inode limit neared */ static int stat_blk_limit_hit; /* number of times block slowdown imposed */ static int stat_ino_limit_hit; /* number of times inode slowdown imposed */ static int stat_indir_blk_ptrs; /* bufs redirtied as indir ptrs not written */ static int stat_inode_bitmap; /* bufs redirtied as inode bitmap not written */ static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */ static int stat_dir_entry; /* bufs redirtied as dir entry cannot write */ #ifdef DEBUG #include #include #if defined(__FreeBSD__) SYSCTL_INT(_debug, OID_AUTO, max_softdeps, CTLFLAG_RW, &max_softdeps, 0, ""); SYSCTL_INT(_debug, OID_AUTO, tickdelay, CTLFLAG_RW, &tickdelay, 0, ""); SYSCTL_INT(_debug, OID_AUTO, blk_limit_push, CTLFLAG_RW, &stat_blk_limit_push, 0,""); SYSCTL_INT(_debug, OID_AUTO, ino_limit_push, CTLFLAG_RW, &stat_ino_limit_push, 0,""); SYSCTL_INT(_debug, OID_AUTO, blk_limit_hit, CTLFLAG_RW, &stat_blk_limit_hit, 0, ""); SYSCTL_INT(_debug, OID_AUTO, ino_limit_hit, CTLFLAG_RW, &stat_ino_limit_hit, 0, ""); SYSCTL_INT(_debug, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW, &stat_indir_blk_ptrs, 0, ""); SYSCTL_INT(_debug, OID_AUTO, inode_bitmap, CTLFLAG_RW, &stat_inode_bitmap, 0, ""); SYSCTL_INT(_debug, OID_AUTO, direct_blk_ptrs, 
CTLFLAG_RW, &stat_direct_blk_ptrs, 0, ""); SYSCTL_INT(_debug, OID_AUTO, dir_entry, CTLFLAG_RW, &stat_dir_entry, 0, ""); #else /* !__FreeBSD__ */ struct ctldebug debug20 = { "max_softdeps", &max_softdeps }; struct ctldebug debug21 = { "tickdelay", &tickdelay }; struct ctldebug debug23 = { "blk_limit_push", &stat_blk_limit_push }; struct ctldebug debug24 = { "ino_limit_push", &stat_ino_limit_push }; struct ctldebug debug25 = { "blk_limit_hit", &stat_blk_limit_hit }; struct ctldebug debug26 = { "ino_limit_hit", &stat_ino_limit_hit }; struct ctldebug debug27 = { "indir_blk_ptrs", &stat_indir_blk_ptrs }; struct ctldebug debug28 = { "inode_bitmap", &stat_inode_bitmap }; struct ctldebug debug29 = { "direct_blk_ptrs", &stat_direct_blk_ptrs }; struct ctldebug debug30 = { "dir_entry", &stat_dir_entry }; #endif /* !__FreeBSD__ */ #endif /* DEBUG */ /* * Add an item to the end of the work queue. * This routine requires that the lock be held. * This is the only routine that adds items to the list. * The following routine is the only one that removes items * and does so in order from first to last. */ static void add_to_worklist(wk) struct worklist *wk; { static struct worklist *worklist_tail; if (wk->wk_state & ONWORKLIST) panic("add_to_worklist: already on list"); wk->wk_state |= ONWORKLIST; if (LIST_FIRST(&softdep_workitem_pending) == NULL) LIST_INSERT_HEAD(&softdep_workitem_pending, wk, wk_list); else LIST_INSERT_AFTER(worklist_tail, wk, wk_list); worklist_tail = wk; } /* * Process that runs once per second to handle items in the background queue. * * Note that we ensure that everything is done in the order in which they * appear in the queue. The code below depends on this property to ensure * that blocks of a file are freed before the inode itself is freed. This * ordering ensures that no new triples will be generated * until all the old ones have been purged from the dependency lists. */ int softdep_process_worklist(matchmnt) struct mount *matchmnt; { struct proc *p = CURPROC; struct worklist *wk; struct fs *matchfs; int matchcnt; /* * Record the process identifier of our caller so that we can give * this process preferential treatment in request_cleanup below. */ filesys_syncer = p; matchcnt = 0; matchfs = NULL; if (matchmnt != NULL) matchfs = VFSTOUFS(matchmnt)->um_fs; /* * There is no danger of having multiple processes run this * code. It is single threaded solely so that softdep_flushfiles * (below) can get an accurate count of the number of items * related to its mount point that are in the list. */ if (softdep_worklist_busy && matchmnt == NULL) return (-1); /* * If requested, try removing inode or removal dependencies. 
*/ if (req_clear_inodedeps) { clear_inodedeps(p); req_clear_inodedeps = 0; wakeup(&proc_waiting); } if (req_clear_remove) { clear_remove(p); req_clear_remove = 0; wakeup(&proc_waiting); } ACQUIRE_LOCK(&lk); while ((wk = LIST_FIRST(&softdep_workitem_pending)) != 0) { WORKLIST_REMOVE(wk); FREE_LOCK(&lk); switch (wk->wk_type) { case D_DIRREM: /* removal of a directory entry */ if (WK_DIRREM(wk)->dm_mnt == matchmnt) matchcnt += 1; handle_workitem_remove(WK_DIRREM(wk)); break; case D_FREEBLKS: /* releasing blocks and/or fragments from a file */ if (WK_FREEBLKS(wk)->fb_fs == matchfs) matchcnt += 1; handle_workitem_freeblocks(WK_FREEBLKS(wk)); break; case D_FREEFRAG: /* releasing a fragment when replaced as a file grows */ if (WK_FREEFRAG(wk)->ff_fs == matchfs) matchcnt += 1; handle_workitem_freefrag(WK_FREEFRAG(wk)); break; case D_FREEFILE: /* releasing an inode when its link count drops to 0 */ if (WK_FREEFILE(wk)->fx_fs == matchfs) matchcnt += 1; handle_workitem_freefile(WK_FREEFILE(wk)); break; default: panic("%s_process_worklist: Unknown type %s", "softdep", TYPENAME(wk->wk_type)); /* NOTREACHED */ } if (softdep_worklist_busy && matchmnt == NULL) return (-1); /* * If requested, try removing inode or removal dependencies. */ if (req_clear_inodedeps) { clear_inodedeps(p); req_clear_inodedeps = 0; wakeup(&proc_waiting); } if (req_clear_remove) { clear_remove(p); req_clear_remove = 0; wakeup(&proc_waiting); } ACQUIRE_LOCK(&lk); } FREE_LOCK(&lk); return (matchcnt); } /* * Purge the work list of all items associated with a particular mount point. */ int softdep_flushfiles(oldmnt, flags, p) struct mount *oldmnt; int flags; struct proc *p; { struct vnode *devvp; int error, loopcnt; /* * Await our turn to clear out the queue. */ while (softdep_worklist_busy) tsleep(&lbolt, PRIBIO, "softflush", 0); softdep_worklist_busy = 1; if ((error = ffs_flushfiles(oldmnt, flags, p)) != 0) { softdep_worklist_busy = 0; return (error); } /* * Alternately flush the block device associated with the mount * point and process any dependencies that the flushing * creates. In theory, this loop can happen at most twice, * but we give it a few extra just to be sure. */ devvp = VFSTOUFS(oldmnt)->um_devvp; for (loopcnt = 10; loopcnt > 0; loopcnt--) { if (softdep_process_worklist(oldmnt) == 0) { /* * Do another flush in case any vnodes were brought in * as part of the cleanup operations. */ if ((error = ffs_flushfiles(oldmnt, flags, p)) != 0) break; /* * If we still found nothing to do, we are really done. */ if (softdep_process_worklist(oldmnt) == 0) break; } vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, p); error = VOP_FSYNC(devvp, p->p_ucred, MNT_WAIT, p); VOP_UNLOCK(devvp, 0, p); if (error) break; } softdep_worklist_busy = 0; /* * If we are unmounting then it is an error to fail. If we * are simply trying to downgrade to read-only, then filesystem * activity can keep us busy forever, so we just fail with EBUSY. */ if (loopcnt == 0) { if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) panic("softdep_flushfiles: looping"); error = EBUSY; } return (error); } /* * Structure hashing. * * There are three types of structures that can be looked up: * 1) pagedep structures identified by mount point, inode number, * and logical block. * 2) inodedep structures identified by mount point and inode number. * 3) newblk structures identified by mount point and * physical block number. * * The "pagedep" and "inodedep" dependency structures are hashed * separately from the file blocks and inodes to which they correspond. 
* This separation helps when the in-memory copy of an inode or * file block must be replaced. It also obviates the need to access * an inode or file page when simply updating (or de-allocating) * dependency structures. Lookup of newblk structures is needed to * find newly allocated blocks when trying to associate them with * their allocdirect or allocindir structure. * * The lookup routines optionally create and hash a new instance when * an existing entry is not found. */ #define DEPALLOC 0x0001 /* allocate structure if lookup fails */ /* * Structures and routines associated with pagedep caching. */ LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl; u_long pagedep_hash; /* size of hash table - 1 */ #define PAGEDEP_HASH(mp, inum, lbn) \ (&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \ pagedep_hash]) static struct sema pagedep_in_progress; /* * Look up a pagedep. Return 1 if found, 0 if not found. * If not found, allocate if DEPALLOC flag is passed. * Found or allocated entry is returned in pagedeppp. * This routine must be called with splbio interrupts blocked. */ static int pagedep_lookup(ip, lbn, flags, pagedeppp) struct inode *ip; ufs_lbn_t lbn; int flags; struct pagedep **pagedeppp; { struct pagedep *pagedep; struct pagedep_hashhead *pagedephd; struct mount *mp; int i; #ifdef DEBUG if (lk.lkt_held == -1) panic("pagedep_lookup: lock not held"); #endif mp = ITOV(ip)->v_mount; pagedephd = PAGEDEP_HASH(mp, ip->i_number, lbn); top: for (pagedep = LIST_FIRST(pagedephd); pagedep; pagedep = LIST_NEXT(pagedep, pd_hash)) if (ip->i_number == pagedep->pd_ino && lbn == pagedep->pd_lbn && mp == pagedep->pd_mnt) break; if (pagedep) { *pagedeppp = pagedep; return (1); } if ((flags & DEPALLOC) == 0) { *pagedeppp = NULL; return (0); } if (sema_get(&pagedep_in_progress, &lk) == 0) { ACQUIRE_LOCK(&lk); goto top; } MALLOC(pagedep, struct pagedep *, sizeof(struct pagedep), M_PAGEDEP, M_WAITOK); bzero(pagedep, sizeof(struct pagedep)); pagedep->pd_list.wk_type = D_PAGEDEP; pagedep->pd_mnt = mp; pagedep->pd_ino = ip->i_number; pagedep->pd_lbn = lbn; LIST_INIT(&pagedep->pd_dirremhd); LIST_INIT(&pagedep->pd_pendinghd); for (i = 0; i < DAHASHSZ; i++) LIST_INIT(&pagedep->pd_diraddhd[i]); ACQUIRE_LOCK(&lk); LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash); sema_release(&pagedep_in_progress); *pagedeppp = pagedep; return (0); } /* * Structures and routines associated with inodedep caching. */ LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl; static u_long inodedep_hash; /* size of hash table - 1 */ static long num_inodedep; /* number of inodedep allocated */ #define INODEDEP_HASH(fs, inum) \ (&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash]) static struct sema inodedep_in_progress; /* * Look up a inodedep. Return 1 if found, 0 if not found. * If not found, allocate if DEPALLOC flag is passed. * Found or allocated entry is returned in inodedeppp. * This routine must be called with splbio interrupts blocked. 
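The pagedep and inodedep caches above (and the newblk cache below) share the same open-hashing scheme: the key is folded into a bucket index and masked with the size-minus-one value stored by hashinit(). A small self-contained sketch of that folding, using made-up values (the table size and the pointer value here are hypothetical):

	#include <stdio.h>

	int
	main(void)
	{
		unsigned long fsptr = 0xc0a1e000UL;	/* stand-in for a struct fs pointer */
		unsigned long inum = 1234;		/* inode number being looked up */
		unsigned long mask = 256 - 1;		/* mask for a hypothetical 256-bucket table */
		unsigned long bucket;

		/* same folding as INODEDEP_HASH: drop low pointer bits, add the key, mask */
		bucket = ((fsptr >> 13) + inum) & mask;
		printf("bucket %lu\n", bucket);
		return (0);
	}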
*/ static int inodedep_lookup(fs, inum, flags, inodedeppp) struct fs *fs; ino_t inum; int flags; struct inodedep **inodedeppp; { struct inodedep *inodedep; struct inodedep_hashhead *inodedephd; int firsttry; #ifdef DEBUG if (lk.lkt_held == -1) panic("inodedep_lookup: lock not held"); #endif firsttry = 1; inodedephd = INODEDEP_HASH(fs, inum); top: for (inodedep = LIST_FIRST(inodedephd); inodedep; inodedep = LIST_NEXT(inodedep, id_hash)) if (inum == inodedep->id_ino && fs == inodedep->id_fs) break; if (inodedep) { *inodedeppp = inodedep; return (1); } if ((flags & DEPALLOC) == 0) { *inodedeppp = NULL; return (0); } /* * If we are over our limit, try to improve the situation. */ if (num_inodedep > max_softdeps && firsttry && speedup_syncer() == 0 && request_cleanup(FLUSH_INODES, 1)) { firsttry = 0; goto top; } if (sema_get(&inodedep_in_progress, &lk) == 0) { ACQUIRE_LOCK(&lk); goto top; } num_inodedep += 1; MALLOC(inodedep, struct inodedep *, sizeof(struct inodedep), M_INODEDEP, M_WAITOK); inodedep->id_list.wk_type = D_INODEDEP; inodedep->id_fs = fs; inodedep->id_ino = inum; inodedep->id_state = ALLCOMPLETE; inodedep->id_nlinkdelta = 0; inodedep->id_savedino = NULL; inodedep->id_savedsize = -1; inodedep->id_buf = NULL; LIST_INIT(&inodedep->id_pendinghd); LIST_INIT(&inodedep->id_inowait); LIST_INIT(&inodedep->id_bufwait); TAILQ_INIT(&inodedep->id_inoupdt); TAILQ_INIT(&inodedep->id_newinoupdt); ACQUIRE_LOCK(&lk); LIST_INSERT_HEAD(inodedephd, inodedep, id_hash); sema_release(&inodedep_in_progress); *inodedeppp = inodedep; return (0); } /* * Structures and routines associated with newblk caching. */ LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl; u_long newblk_hash; /* size of hash table - 1 */ #define NEWBLK_HASH(fs, inum) \ (&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash]) static struct sema newblk_in_progress; /* * Look up a newblk. Return 1 if found, 0 if not found. * If not found, allocate if DEPALLOC flag is passed. * Found or allocated entry is returned in newblkpp. */ static int newblk_lookup(fs, newblkno, flags, newblkpp) struct fs *fs; ufs_daddr_t newblkno; int flags; struct newblk **newblkpp; { struct newblk *newblk; struct newblk_hashhead *newblkhd; newblkhd = NEWBLK_HASH(fs, newblkno); top: for (newblk = LIST_FIRST(newblkhd); newblk; newblk = LIST_NEXT(newblk, nb_hash)) if (newblkno == newblk->nb_newblkno && fs == newblk->nb_fs) break; if (newblk) { *newblkpp = newblk; return (1); } if ((flags & DEPALLOC) == 0) { *newblkpp = NULL; return (0); } if (sema_get(&newblk_in_progress, 0) == 0) goto top; MALLOC(newblk, struct newblk *, sizeof(struct newblk), M_NEWBLK, M_WAITOK); newblk->nb_state = 0; newblk->nb_fs = fs; newblk->nb_newblkno = newblkno; LIST_INSERT_HEAD(newblkhd, newblk, nb_hash); sema_release(&newblk_in_progress); *newblkpp = newblk; return (0); } /* * Executed during filesystem system initialization before * mounting any file systems. */ void softdep_initialize() { LIST_INIT(&mkdirlisthd); LIST_INIT(&softdep_workitem_pending); max_softdeps = desiredvnodes * 8; pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP, &pagedep_hash); sema_init(&pagedep_in_progress, "pagedep", PRIBIO, 0); inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash); sema_init(&inodedep_in_progress, "inodedep", PRIBIO, 0); newblk_hashtbl = hashinit(64, M_NEWBLK, &newblk_hash); sema_init(&newblk_in_progress, "newblk", PRIBIO, 0); } /* * Called at mount time to notify the dependency code that a * filesystem wishes to use it. 
*/ int softdep_mount(devvp, mp, fs, cred) struct vnode *devvp; struct mount *mp; struct fs *fs; struct ucred *cred; { struct csum cstotal; struct cg *cgp; struct buf *bp; int error, cyl; mp->mnt_flag &= ~MNT_ASYNC; mp->mnt_flag |= MNT_SOFTDEP; /* * When doing soft updates, the counters in the * superblock may have gotten out of sync, so we have * to scan the cylinder groups and recalculate them. */ if (fs->fs_clean != 0) return (0); bzero(&cstotal, sizeof cstotal); for (cyl = 0; cyl < fs->fs_ncg; cyl++) { if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)), fs->fs_cgsize, cred, &bp)) != 0) { brelse(bp); return (error); } cgp = (struct cg *)bp->b_data; cstotal.cs_nffree += cgp->cg_cs.cs_nffree; cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree; cstotal.cs_nifree += cgp->cg_cs.cs_nifree; cstotal.cs_ndir += cgp->cg_cs.cs_ndir; fs->fs_cs(fs, cyl) = cgp->cg_cs; brelse(bp); } #ifdef DEBUG if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal)) printf("ffs_mountfs: superblock updated for soft updates\n"); #endif bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal); return (0); } /* * Protecting the freemaps (or bitmaps). * * To eliminate the need to execute fsck before mounting a file system * after a power failure, one must (conservatively) guarantee that the * on-disk copy of the bitmaps never indicate that a live inode or block is * free. So, when a block or inode is allocated, the bitmap should be * updated (on disk) before any new pointers. When a block or inode is * freed, the bitmap should not be updated until all pointers have been * reset. The latter dependency is handled by the delayed de-allocation * approach described below for block and inode de-allocation. The former * dependency is handled by calling the following procedure when a block or * inode is allocated. When an inode is allocated an "inodedep" is created * with its DEPCOMPLETE flag cleared until its bitmap is written to disk. * Each "inodedep" is also inserted into the hash indexing structure so * that any additional link additions can be made dependent on the inode * allocation. * * The ufs file system maintains a number of free block counts (e.g., per * cylinder group, per cylinder and per pair) * in addition to the bitmaps. These counts are used to improve efficiency * during allocation and therefore must be consistent with the bitmaps. * There is no convenient way to guarantee post-crash consistency of these * counts with simple update ordering, for two main reasons: (1) The counts * and bitmaps for a single cylinder group block are not in the same disk * sector. If a disk write is interrupted (e.g., by power failure), one may * be written and the other not. (2) Some of the counts are located in the * superblock rather than the cylinder group block. So, we focus our soft * updates implementation on protecting the bitmaps. When mounting a * filesystem, we recompute the auxiliary counts from the bitmaps. */ /* * Called just after updating the cylinder group block to allocate an inode. */ void softdep_setup_inomapdep(bp, ip, newinum) struct buf *bp; /* buffer for cylgroup block with inode map */ struct inode *ip; /* inode related to allocation */ ino_t newinum; /* new inode number being allocated */ { struct inodedep *inodedep; struct bmsafemap *bmsafemap; /* * Create a dependency for the newly allocated inode. * Panic if it already exists as something is seriously wrong. * Otherwise add it to the dependency list for the buffer holding * the cylinder group map from which it was allocated. 
*/ ACQUIRE_LOCK(&lk); if (inodedep_lookup(ip->i_fs, newinum, DEPALLOC, &inodedep) != 0) panic("softdep_setup_inomapdep: found inode"); inodedep->id_buf = bp; inodedep->id_state &= ~DEPCOMPLETE; bmsafemap = bmsafemap_lookup(bp); LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps); FREE_LOCK(&lk); } /* * Called just after updating the cylinder group block to * allocate block or fragment. */ void softdep_setup_blkmapdep(bp, fs, newblkno) struct buf *bp; /* buffer for cylgroup block with block map */ struct fs *fs; /* filesystem doing allocation */ ufs_daddr_t newblkno; /* number of newly allocated block */ { struct newblk *newblk; struct bmsafemap *bmsafemap; /* * Create a dependency for the newly allocated block. * Add it to the dependency list for the buffer holding * the cylinder group map from which it was allocated. */ if (newblk_lookup(fs, newblkno, DEPALLOC, &newblk) != 0) panic("softdep_setup_blkmapdep: found block"); ACQUIRE_LOCK(&lk); newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(bp); LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps); FREE_LOCK(&lk); } /* * Find the bmsafemap associated with a cylinder group buffer. * If none exists, create one. The buffer must be locked when * this routine is called and this routine must be called with * splbio interrupts blocked. */ static struct bmsafemap * bmsafemap_lookup(bp) struct buf *bp; { struct bmsafemap *bmsafemap; struct worklist *wk; #ifdef DEBUG if (lk.lkt_held == -1) panic("bmsafemap_lookup: lock not held"); #endif for (wk = LIST_FIRST(&bp->b_dep); wk; wk = LIST_NEXT(wk, wk_list)) if (wk->wk_type == D_BMSAFEMAP) return (WK_BMSAFEMAP(wk)); FREE_LOCK(&lk); MALLOC(bmsafemap, struct bmsafemap *, sizeof(struct bmsafemap), M_BMSAFEMAP, M_WAITOK); bmsafemap->sm_list.wk_type = D_BMSAFEMAP; bmsafemap->sm_list.wk_state = 0; bmsafemap->sm_buf = bp; LIST_INIT(&bmsafemap->sm_allocdirecthd); LIST_INIT(&bmsafemap->sm_allocindirhd); LIST_INIT(&bmsafemap->sm_inodedephd); LIST_INIT(&bmsafemap->sm_newblkhd); ACQUIRE_LOCK(&lk); WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list); return (bmsafemap); } /* * Direct block allocation dependencies. * * When a new block is allocated, the corresponding disk locations must be * initialized (with zeros or new data) before the on-disk inode points to * them. Also, the freemap from which the block was allocated must be * updated (on disk) before the inode's pointer. These two dependencies are * independent of each other and are needed for all file blocks and indirect * blocks that are pointed to directly by the inode. Just before the * "in-core" version of the inode is updated with a newly allocated block * number, a procedure (below) is called to setup allocation dependency * structures. These structures are removed when the corresponding * dependencies are satisfied or when the block allocation becomes obsolete * (i.e., the file is deleted, the block is de-allocated, or the block is a * fragment that gets upgraded). All of these cases are handled in * procedures described later. * * When a file extension causes a fragment to be upgraded, either to a larger * fragment or to a full block, the on-disk location may change (if the * previous fragment could not simply be extended). In this case, the old * fragment must be de-allocated, but not until after the inode's pointer has * been updated. In most cases, this is handled by later procedures, which * will construct a "freefrag" structure to be added to the workitem queue * when the inode update is complete (or obsolete). 
The main exception to * this is when an allocation occurs while a pending allocation dependency * (for the same block pointer) remains. This case is handled in the main * allocation dependency setup procedure by immediately freeing the * unreferenced fragments. */ void softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) struct inode *ip; /* inode to which block is being added */ ufs_lbn_t lbn; /* block pointer within inode */ ufs_daddr_t newblkno; /* disk block number being added */ ufs_daddr_t oldblkno; /* previous block number, 0 unless frag */ long newsize; /* size of new block */ long oldsize; /* size of new block */ struct buf *bp; /* bp for allocated block */ { struct allocdirect *adp, *oldadp; struct allocdirectlst *adphead; struct bmsafemap *bmsafemap; struct inodedep *inodedep; struct pagedep *pagedep; struct newblk *newblk; MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect), M_ALLOCDIRECT, M_WAITOK); bzero(adp, sizeof(struct allocdirect)); adp->ad_list.wk_type = D_ALLOCDIRECT; adp->ad_lbn = lbn; adp->ad_newblkno = newblkno; adp->ad_oldblkno = oldblkno; adp->ad_newsize = newsize; adp->ad_oldsize = oldsize; adp->ad_state = ATTACHED; if (newblkno == oldblkno) adp->ad_freefrag = NULL; else adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize); if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0) panic("softdep_setup_allocdirect: lost block"); ACQUIRE_LOCK(&lk); (void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC, &inodedep); adp->ad_inodedep = inodedep; if (newblk->nb_state == DEPCOMPLETE) { adp->ad_state |= DEPCOMPLETE; adp->ad_buf = NULL; } else { bmsafemap = newblk->nb_bmsafemap; adp->ad_buf = bmsafemap->sm_buf; LIST_REMOVE(newblk, nb_deps); LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps); } LIST_REMOVE(newblk, nb_hash); FREE(newblk, M_NEWBLK); WORKLIST_INSERT(&bp->b_dep, &adp->ad_list); if (lbn >= NDADDR) { /* allocating an indirect block */ if (oldblkno != 0) panic("softdep_setup_allocdirect: non-zero indir"); } else { /* * Allocating a direct block. * * If we are allocating a directory block, then we must * allocate an associated pagedep to track additions and * deletions. */ if ((ip->i_mode & IFMT) == IFDIR && pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0) WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list); } /* * The list of allocdirects must be kept in sorted and ascending * order so that the rollback routines can quickly determine the * first uncommitted block (the size of the file stored on disk * ends at the end of the lowest committed fragment, or if there * are no fragments, at the end of the highest committed block). * Since files generally grow, the typical case is that the new * block is to be added at the end of the list. We speed this * special case by checking against the last allocdirect in the * list before laboriously traversing the list looking for the * insertion point. 
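/*
 * An illustrative sketch of the sorted-insertion strategy described in the
 * comment above: the allocdirect list is kept ascending by logical block
 * number, and because files usually grow, the tail of the list is checked
 * first before falling back to a linear scan. The merge of an entry with
 * the same lbn is omitted here; the types are simplified stand-ins for the
 * kernel ones.
 */
#include <stdio.h>
#include <stdlib.h>
#include <sys/queue.h>

struct ad {
        TAILQ_ENTRY(ad) ad_next;
        long ad_lbn;
};
TAILQ_HEAD(adlist, ad);

static void
ad_insert_sorted(struct adlist *head, struct ad *adp)
{
        struct ad *oldadp;

        /* Fast path: the new block usually belongs at the end. */
        oldadp = TAILQ_LAST(head, adlist);
        if (oldadp == NULL || oldadp->ad_lbn <= adp->ad_lbn) {
                TAILQ_INSERT_TAIL(head, adp, ad_next);
                return;
        }
        /* Slow path: find the first entry at or beyond the new lbn. */
        TAILQ_FOREACH(oldadp, head, ad_next)
                if (oldadp->ad_lbn >= adp->ad_lbn)
                        break;
        TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
}

int
main(void)
{
        struct adlist head = TAILQ_HEAD_INITIALIZER(head);
        long lbns[] = { 0, 1, 2, 5, 3 };        /* the 3 forces the slow path */
        struct ad *adp;
        size_t i;

        for (i = 0; i < sizeof(lbns) / sizeof(lbns[0]); i++) {
                adp = calloc(1, sizeof(*adp));
                if (adp == NULL)
                        abort();
                adp->ad_lbn = lbns[i];
                ad_insert_sorted(&head, adp);
        }
        TAILQ_FOREACH(adp, &head, ad_next)
                printf("%ld ", adp->ad_lbn);
        printf("\n");           /* prints: 0 1 2 3 5 */
        return (0);
}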
*/ adphead = &inodedep->id_newinoupdt; oldadp = TAILQ_LAST(adphead, allocdirectlst); if (oldadp == NULL || oldadp->ad_lbn <= lbn) { /* insert at end of list */ TAILQ_INSERT_TAIL(adphead, adp, ad_next); if (oldadp != NULL && oldadp->ad_lbn == lbn) allocdirect_merge(adphead, adp, oldadp); FREE_LOCK(&lk); return; } for (oldadp = TAILQ_FIRST(adphead); oldadp; oldadp = TAILQ_NEXT(oldadp, ad_next)) { if (oldadp->ad_lbn >= lbn) break; } if (oldadp == NULL) panic("softdep_setup_allocdirect: lost entry"); /* insert in middle of list */ TAILQ_INSERT_BEFORE(oldadp, adp, ad_next); if (oldadp->ad_lbn == lbn) allocdirect_merge(adphead, adp, oldadp); FREE_LOCK(&lk); } /* * Replace an old allocdirect dependency with a newer one. * This routine must be called with splbio interrupts blocked. */ static void allocdirect_merge(adphead, newadp, oldadp) struct allocdirectlst *adphead; /* head of list holding allocdirects */ struct allocdirect *newadp; /* allocdirect being added */ struct allocdirect *oldadp; /* existing allocdirect being checked */ { struct freefrag *freefrag; #ifdef DEBUG if (lk.lkt_held == -1) panic("allocdirect_merge: lock not held"); #endif if (newadp->ad_oldblkno != oldadp->ad_newblkno || newadp->ad_oldsize != oldadp->ad_newsize || newadp->ad_lbn >= NDADDR) panic("allocdirect_check: old %d != new %d || lbn %ld >= %d", newadp->ad_oldblkno, oldadp->ad_newblkno, newadp->ad_lbn, NDADDR); newadp->ad_oldblkno = oldadp->ad_oldblkno; newadp->ad_oldsize = oldadp->ad_oldsize; /* * If the old dependency had a fragment to free or had never * previously had a block allocated, then the new dependency * can immediately post its freefrag and adopt the old freefrag. * This action is done by swapping the freefrag dependencies. * The new dependency gains the old one's freefrag, and the * old one gets the new one and then immediately puts it on * the worklist when it is freed by free_allocdirect. It is * not possible to do this swap when the old dependency had a * non-zero size but no previous fragment to free. This condition * arises when the new block is an extension of the old block. * Here, the first part of the fragment allocated to the new * dependency is part of the block currently claimed on disk by * the old dependency, so cannot legitimately be freed until the * conditions for the new dependency are fulfilled. */ if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) { freefrag = newadp->ad_freefrag; newadp->ad_freefrag = oldadp->ad_freefrag; oldadp->ad_freefrag = freefrag; } free_allocdirect(adphead, oldadp, 0); } /* * Allocate a new freefrag structure if needed. */ static struct freefrag * newfreefrag(ip, blkno, size) struct inode *ip; ufs_daddr_t blkno; long size; { struct freefrag *freefrag; struct fs *fs; if (blkno == 0) return (NULL); fs = ip->i_fs; if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag) panic("newfreefrag: frag size"); MALLOC(freefrag, struct freefrag *, sizeof(struct freefrag), M_FREEFRAG, M_WAITOK); freefrag->ff_list.wk_type = D_FREEFRAG; freefrag->ff_state = ip->i_uid & ~ONWORKLIST; /* XXX - used below */ freefrag->ff_inum = ip->i_number; freefrag->ff_fs = fs; freefrag->ff_devvp = ip->i_devvp; freefrag->ff_blkno = blkno; freefrag->ff_fragsize = size; return (freefrag); } /* * This workitem de-allocates fragments that were replaced during * file block allocation. 
*/ static void handle_workitem_freefrag(freefrag) struct freefrag *freefrag; { struct inode tip; tip.i_fs = freefrag->ff_fs; tip.i_devvp = freefrag->ff_devvp; tip.i_dev = freefrag->ff_devvp->v_rdev; tip.i_number = freefrag->ff_inum; tip.i_uid = freefrag->ff_state & ~ONWORKLIST; /* XXX - set above */ ffs_blkfree(&tip, freefrag->ff_blkno, freefrag->ff_fragsize); FREE(freefrag, M_FREEFRAG); } /* * Indirect block allocation dependencies. * * The same dependencies that exist for a direct block also exist when * a new block is allocated and pointed to by an entry in a block of * indirect pointers. The undo/redo states described above are also * used here. Because an indirect block contains many pointers that * may have dependencies, a second copy of the entire in-memory indirect * block is kept. The buffer cache copy is always completely up-to-date. * The second copy, which is used only as a source for disk writes, * contains only the safe pointers (i.e., those that have no remaining * update dependencies). The second copy is freed when all pointers * are safe. The cache is not allowed to replace indirect blocks with * pending update dependencies. If a buffer containing an indirect * block with dependencies is written, these routines will mark it * dirty again. It can only be successfully written once all the * dependencies are removed. The ffs_fsync routine in conjunction with * softdep_sync_metadata work together to get all the dependencies * removed so that a file can be successfully written to disk. Three * procedures are used when setting up indirect block pointer * dependencies. The division is necessary because of the organization * of the "balloc" routine and because of the distinction between file * pages and file metadata blocks. */ /* * Allocate a new allocindir structure. */ static struct allocindir * newallocindir(ip, ptrno, newblkno, oldblkno) struct inode *ip; /* inode for file being extended */ int ptrno; /* offset of pointer in indirect block */ ufs_daddr_t newblkno; /* disk block number being added */ ufs_daddr_t oldblkno; /* previous block number, 0 if none */ { struct allocindir *aip; MALLOC(aip, struct allocindir *, sizeof(struct allocindir), M_ALLOCINDIR, M_WAITOK); bzero(aip, sizeof(struct allocindir)); aip->ai_list.wk_type = D_ALLOCINDIR; aip->ai_state = ATTACHED; aip->ai_offset = ptrno; aip->ai_newblkno = newblkno; aip->ai_oldblkno = oldblkno; aip->ai_freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize); return (aip); } /* * Called just before setting an indirect block pointer * to a newly allocated file page. */ void softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp) struct inode *ip; /* inode for file being extended */ ufs_lbn_t lbn; /* allocated block number within file */ struct buf *bp; /* buffer with indirect blk referencing page */ int ptrno; /* offset of pointer in indirect block */ ufs_daddr_t newblkno; /* disk block number being added */ ufs_daddr_t oldblkno; /* previous block number, 0 if none */ struct buf *nbp; /* buffer holding allocated page */ { struct allocindir *aip; struct pagedep *pagedep; aip = newallocindir(ip, ptrno, newblkno, oldblkno); ACQUIRE_LOCK(&lk); /* * If we are allocating a directory page, then we must * allocate an associated pagedep to track additions and * deletions. 
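/*
 * An illustrative sketch of the "safe copy" idea described in the indirect
 * block allocation comment above: the in-memory block always holds the
 * up-to-date pointers, while a second copy holds only pointers whose
 * dependencies are complete, and it is that copy which would be handed to
 * the disk. The array size and types here are arbitrary stand-ins.
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define NPTRS 4

struct shadow_indir {
        uint32_t live[NPTRS];   /* buffer cache copy: always up to date */
        uint32_t safe[NPTRS];   /* source for disk writes: safe pointers only */
};

/* Record a newly allocated block; the safe copy keeps the rollback value. */
static void
set_pointer(struct shadow_indir *ib, int idx, uint32_t newblkno)
{
        ib->live[idx] = newblkno;
        /* ib->safe[idx] is left alone until the dependency completes */
}

/* The bitmap (and data) have reached the disk: publish the pointer. */
static void
dep_complete(struct shadow_indir *ib, int idx)
{
        ib->safe[idx] = ib->live[idx];
}

int
main(void)
{
        struct shadow_indir ib;

        memset(&ib, 0, sizeof(ib));
        set_pointer(&ib, 2, 1234);
        printf("a write now would store %u\n", ib.safe[2]);     /* 0: rolled back */
        dep_complete(&ib, 2);
        printf("a write now would store %u\n", ib.safe[2]);     /* 1234 */
        return (0);
}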
*/ if ((ip->i_mode & IFMT) == IFDIR && pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0) WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list); WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list); FREE_LOCK(&lk); setup_allocindir_phase2(bp, ip, aip); } /* * Called just before setting an indirect block pointer to a * newly allocated indirect block. */ void softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno) struct buf *nbp; /* newly allocated indirect block */ struct inode *ip; /* inode for file being extended */ struct buf *bp; /* indirect block referencing allocated block */ int ptrno; /* offset of pointer in indirect block */ ufs_daddr_t newblkno; /* disk block number being added */ { struct allocindir *aip; aip = newallocindir(ip, ptrno, newblkno, 0); ACQUIRE_LOCK(&lk); WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list); FREE_LOCK(&lk); setup_allocindir_phase2(bp, ip, aip); } /* * Called to finish the allocation of the "aip" allocated * by one of the two routines above. */ static void setup_allocindir_phase2(bp, ip, aip) struct buf *bp; /* in-memory copy of the indirect block */ struct inode *ip; /* inode for file being extended */ struct allocindir *aip; /* allocindir allocated by the above routines */ { struct worklist *wk; struct indirdep *indirdep, *newindirdep; struct bmsafemap *bmsafemap; struct allocindir *oldaip; struct freefrag *freefrag; struct newblk *newblk; if (bp->b_lblkno >= 0) panic("setup_allocindir_phase2: not indir blk"); for (indirdep = NULL, newindirdep = NULL; ; ) { ACQUIRE_LOCK(&lk); for (wk = LIST_FIRST(&bp->b_dep); wk; wk = LIST_NEXT(wk, wk_list)) { if (wk->wk_type != D_INDIRDEP) continue; indirdep = WK_INDIRDEP(wk); break; } if (indirdep == NULL && newindirdep) { indirdep = newindirdep; WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list); newindirdep = NULL; } FREE_LOCK(&lk); if (indirdep) { if (newblk_lookup(ip->i_fs, aip->ai_newblkno, 0, &newblk) == 0) panic("setup_allocindir: lost block"); ACQUIRE_LOCK(&lk); if (newblk->nb_state == DEPCOMPLETE) { aip->ai_state |= DEPCOMPLETE; aip->ai_buf = NULL; } else { bmsafemap = newblk->nb_bmsafemap; aip->ai_buf = bmsafemap->sm_buf; LIST_REMOVE(newblk, nb_deps); LIST_INSERT_HEAD(&bmsafemap->sm_allocindirhd, aip, ai_deps); } LIST_REMOVE(newblk, nb_hash); FREE(newblk, M_NEWBLK); aip->ai_indirdep = indirdep; /* * Check to see if there is an existing dependency * for this block. If there is, merge the old * dependency into the new one. 
*/ if (aip->ai_oldblkno == 0) oldaip = NULL; else for (oldaip=LIST_FIRST(&indirdep->ir_deplisthd); oldaip; oldaip = LIST_NEXT(oldaip, ai_next)) if (oldaip->ai_offset == aip->ai_offset) break; if (oldaip != NULL) { if (oldaip->ai_newblkno != aip->ai_oldblkno) panic("setup_allocindir_phase2: blkno"); aip->ai_oldblkno = oldaip->ai_oldblkno; freefrag = oldaip->ai_freefrag; oldaip->ai_freefrag = aip->ai_freefrag; aip->ai_freefrag = freefrag; free_allocindir(oldaip, NULL); } LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next); ((ufs_daddr_t *)indirdep->ir_savebp->b_data) [aip->ai_offset] = aip->ai_oldblkno; FREE_LOCK(&lk); } if (newindirdep) { if (indirdep->ir_savebp != NULL) brelse(newindirdep->ir_savebp); WORKITEM_FREE((caddr_t)newindirdep, D_INDIRDEP); } if (indirdep) break; MALLOC(newindirdep, struct indirdep *, sizeof(struct indirdep), M_INDIRDEP, M_WAITOK); newindirdep->ir_list.wk_type = D_INDIRDEP; newindirdep->ir_state = ATTACHED; LIST_INIT(&newindirdep->ir_deplisthd); LIST_INIT(&newindirdep->ir_donehd); if (bp->b_blkno == bp->b_lblkno) { VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); } newindirdep->ir_savebp = getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0); BUF_KERNPROC(newindirdep->ir_savebp); bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount); } } /* * Block de-allocation dependencies. * * When blocks are de-allocated, the on-disk pointers must be nullified before * the blocks are made available for use by other files. (The true * requirement is that old pointers must be nullified before new on-disk * pointers are set. We chose this slightly more stringent requirement to * reduce complexity.) Our implementation handles this dependency by updating * the inode (or indirect block) appropriately but delaying the actual block * de-allocation (i.e., freemap and free space count manipulation) until * after the updated versions reach stable storage. After the disk is * updated, the blocks can be safely de-allocated whenever it is convenient. * This implementation handles only the common case of reducing a file's * length to zero. Other cases are handled by the conventional synchronous * write approach. * * The ffs implementation with which we worked double-checks * the state of the block pointers and file size as it reduces * a file's length. Some of this code is replicated here in our * soft updates implementation. The freeblks->fb_chkcnt field is * used to transfer a part of this information to the procedure * that eventually de-allocates the blocks. * * This routine should be called from the routine that shortens * a file's length, before the inode's size or block pointers * are modified. It will save the block pointer information for * later release and zero the inode so that the calling routine * can release it. */ static long num_freeblks; /* number of freeblks allocated */ void softdep_setup_freeblocks(ip, length) struct inode *ip; /* The inode whose length is to be reduced */ off_t length; /* The new length for the file */ { struct freeblks *freeblks; struct inodedep *inodedep; struct allocdirect *adp; struct vnode *vp; struct buf *bp; struct fs *fs; int i, error; fs = ip->i_fs; if (length != 0) panic("softdep_setup_freeblocks: non-zero length"); /* * If we are over our limit, try to improve the situation.
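/*
 * An illustrative sketch of the deferred release described in the block
 * de-allocation comment above: the block pointers are moved out of the
 * inode into a work item and the inode is zeroed, but nothing is actually
 * freed until the zeroed inode is known to be on disk. The structures are
 * simplified stand-ins, not the kernel freeblks/inode types.
 */
#include <stdio.h>
#include <string.h>

#define NDIRECT 4

struct mini_inode {
        long db[NDIRECT];       /* direct block pointers */
        long blocks;            /* blocks accounted to the file */
};

struct mini_freeblks {
        long db[NDIRECT];       /* pointers saved for later release */
        long chkcnt;            /* expected number of blocks to release */
};

/* Capture and clear the pointers; the caller then writes the zeroed inode. */
static void
setup_freeblocks(struct mini_inode *ip, struct mini_freeblks *fb)
{
        memcpy(fb->db, ip->db, sizeof(fb->db));
        fb->chkcnt = ip->blocks;
        memset(ip->db, 0, sizeof(ip->db));
        ip->blocks = 0;
}

/* Runs only after the zeroed inode has reached stable storage. */
static void
handle_freeblocks(const struct mini_freeblks *fb)
{
        int i;

        for (i = 0; i < NDIRECT; i++)
                if (fb->db[i] != 0)
                        printf("free block %ld\n", fb->db[i]);
}

int
main(void)
{
        struct mini_inode ip = { { 11, 12, 0, 14 }, 3 };
        struct mini_freeblks fb;

        setup_freeblocks(&ip, &fb);
        /* ... the zeroed inode block is written here ... */
        handle_freeblocks(&fb);
        return (0);
}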
*/ if (num_freeblks > max_softdeps / 2 && speedup_syncer() == 0) (void) request_cleanup(FLUSH_REMOVE, 0); num_freeblks += 1; MALLOC(freeblks, struct freeblks *, sizeof(struct freeblks), M_FREEBLKS, M_WAITOK); bzero(freeblks, sizeof(struct freeblks)); freeblks->fb_list.wk_type = D_FREEBLKS; freeblks->fb_uid = ip->i_uid; freeblks->fb_previousinum = ip->i_number; freeblks->fb_devvp = ip->i_devvp; freeblks->fb_fs = fs; freeblks->fb_oldsize = ip->i_size; freeblks->fb_newsize = length; freeblks->fb_chkcnt = ip->i_blocks; for (i = 0; i < NDADDR; i++) { freeblks->fb_dblks[i] = ip->i_db[i]; ip->i_db[i] = 0; } for (i = 0; i < NIADDR; i++) { freeblks->fb_iblks[i] = ip->i_ib[i]; ip->i_ib[i] = 0; } ip->i_blocks = 0; ip->i_size = 0; /* * Push the zero'ed inode to its disk buffer so that we are free * to delete its dependencies below. Once the dependencies are gone * the buffer can be safely released. */ if ((error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), (int)fs->fs_bsize, NOCRED, &bp)) != 0) softdep_error("softdep_setup_freeblocks", error); *((struct dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number)) = ip->i_din; /* * Find and eliminate any inode dependencies. */ ACQUIRE_LOCK(&lk); (void) inodedep_lookup(fs, ip->i_number, DEPALLOC, &inodedep); if ((inodedep->id_state & IOSTARTED) != 0) panic("softdep_setup_freeblocks: inode busy"); /* * Add the freeblks structure to the list of operations that * must await the zero'ed inode being written to disk. */ WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list); /* * Because the file length has been truncated to zero, any * pending block allocation dependency structures associated * with this inode are obsolete and can simply be de-allocated. * We must first merge the two dependency lists to get rid of * any duplicate freefrag structures, then purge the merged list. */ merge_inode_lists(inodedep); while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0) free_allocdirect(&inodedep->id_inoupdt, adp, 1); FREE_LOCK(&lk); bdwrite(bp); /* * We must wait for any I/O in progress to finish so that * all potential buffers on the dirty list will be visible. * Once they are all there, walk the list and get rid of * any dependencies. */ vp = ITOV(ip); ACQUIRE_LOCK(&lk); drain_output(vp, 1); while (getdirtybuf(&TAILQ_FIRST(&vp->v_dirtyblkhd), MNT_WAIT)) { bp = TAILQ_FIRST(&vp->v_dirtyblkhd); (void) inodedep_lookup(fs, ip->i_number, 0, &inodedep); deallocate_dependencies(bp, inodedep); bp->b_flags |= B_INVAL | B_NOCACHE; FREE_LOCK(&lk); brelse(bp); ACQUIRE_LOCK(&lk); } /* * Try freeing the inodedep in case that was the last dependency. */ if ((inodedep_lookup(fs, ip->i_number, 0, &inodedep)) != 0) (void) free_inodedep(inodedep); FREE_LOCK(&lk); } /* * Reclaim any dependency structures from a buffer that is about to * be reallocated to a new vnode. The buffer must be locked, thus, * no I/O completion operations can occur while we are manipulating * its associated dependencies. The mutex is held so that other I/O's * associated with related dependencies do not occur. */ static void deallocate_dependencies(bp, inodedep) struct buf *bp; struct inodedep *inodedep; { struct worklist *wk; struct indirdep *indirdep; struct allocindir *aip; struct pagedep *pagedep; struct dirrem *dirrem; struct diradd *dap; int i; while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) { switch (wk->wk_type) { case D_INDIRDEP: indirdep = WK_INDIRDEP(wk); /* * None of the indirect pointers will ever be visible, * so they can simply be tossed.
GOINGAWAY ensures * that allocated pointers will be saved in the buffer * cache until they are freed. Note that they will * only be able to be found by their physical address * since the inode mapping the logical address will * be gone. The save buffer used for the safe copy * was allocated in setup_allocindir_phase2 using * the physical address so it could be used for this * purpose. Hence we swap the safe copy with the real * copy, allowing the safe copy to be freed and holding * on to the real copy for later use in indir_trunc. */ if (indirdep->ir_state & GOINGAWAY) panic("deallocate_dependencies: already gone"); indirdep->ir_state |= GOINGAWAY; while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0) free_allocindir(aip, inodedep); if (bp->b_lblkno >= 0 || bp->b_blkno != indirdep->ir_savebp->b_lblkno) panic("deallocate_dependencies: not indir"); bcopy(bp->b_data, indirdep->ir_savebp->b_data, bp->b_bcount); WORKLIST_REMOVE(wk); WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, wk); continue; case D_PAGEDEP: pagedep = WK_PAGEDEP(wk); /* * None of the directory additions will ever be * visible, so they can simply be tossed. */ for (i = 0; i < DAHASHSZ; i++) while ((dap = LIST_FIRST(&pagedep->pd_diraddhd[i]))) free_diradd(dap); while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != 0) free_diradd(dap); /* * Copy any directory remove dependencies to the list * to be processed after the zero'ed inode is written. * If the inode has already been written, then they * can be dumped directly onto the work list. */ for (dirrem = LIST_FIRST(&pagedep->pd_dirremhd); dirrem; dirrem = LIST_NEXT(dirrem, dm_next)) { LIST_REMOVE(dirrem, dm_next); dirrem->dm_dirinum = pagedep->pd_ino; if (inodedep == NULL) add_to_worklist(&dirrem->dm_list); else WORKLIST_INSERT(&inodedep->id_bufwait, &dirrem->dm_list); } WORKLIST_REMOVE(&pagedep->pd_list); LIST_REMOVE(pagedep, pd_hash); WORKITEM_FREE(pagedep, D_PAGEDEP); continue; case D_ALLOCINDIR: free_allocindir(WK_ALLOCINDIR(wk), inodedep); continue; case D_ALLOCDIRECT: case D_INODEDEP: panic("deallocate_dependencies: Unexpected type %s", TYPENAME(wk->wk_type)); /* NOTREACHED */ default: panic("deallocate_dependencies: Unknown type %s", TYPENAME(wk->wk_type)); /* NOTREACHED */ } } } /* * Free an allocdirect. Generate a new freefrag work request if appropriate. * This routine must be called with splbio interrupts blocked. */ static void free_allocdirect(adphead, adp, delay) struct allocdirectlst *adphead; struct allocdirect *adp; int delay; { #ifdef DEBUG if (lk.lkt_held == -1) panic("free_allocdirect: lock not held"); #endif if ((adp->ad_state & DEPCOMPLETE) == 0) LIST_REMOVE(adp, ad_deps); TAILQ_REMOVE(adphead, adp, ad_next); if ((adp->ad_state & COMPLETE) == 0) WORKLIST_REMOVE(&adp->ad_list); if (adp->ad_freefrag != NULL) { if (delay) WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait, &adp->ad_freefrag->ff_list); else add_to_worklist(&adp->ad_freefrag->ff_list); } WORKITEM_FREE(adp, D_ALLOCDIRECT); } /* * Prepare an inode to be freed. The actual free operation is not * done until the zero'ed inode has been written to disk. */ static long num_freefile; /* number of freefile allocated */ void softdep_freefile(pvp, ino, mode) struct vnode *pvp; ino_t ino; int mode; { struct inode *ip = VTOI(pvp); struct inodedep *inodedep; struct freefile *freefile; /* * If we are over our limit, try to improve the situation. */ if (num_freefile > max_softdeps / 2 && speedup_syncer() == 0) (void) request_cleanup(FLUSH_REMOVE, 0); /* * This sets up the inode de-allocation dependency. 
*/ num_freefile += 1; MALLOC(freefile, struct freefile *, sizeof(struct freefile), M_FREEFILE, M_WAITOK); freefile->fx_list.wk_type = D_FREEFILE; freefile->fx_list.wk_state = 0; freefile->fx_mode = mode; freefile->fx_oldinum = ino; freefile->fx_devvp = ip->i_devvp; freefile->fx_fs = ip->i_fs; /* * If the inodedep does not exist, then the zero'ed inode has * been written to disk and we can free the file immediately. */ ACQUIRE_LOCK(&lk); if (inodedep_lookup(ip->i_fs, ino, 0, &inodedep) == 0) { add_to_worklist(&freefile->fx_list); FREE_LOCK(&lk); return; } /* * If we still have a bitmap dependency, then the inode has never * been written to disk. Drop the dependency as it is no longer * necessary since the inode is being deallocated. We could process * the freefile immediately, but then we would have to clear the * id_inowait dependencies here and it is easier just to let the * zero'ed inode be written and let them be cleaned up in the * normal followup actions that follow the inode write. */ if ((inodedep->id_state & DEPCOMPLETE) == 0) { inodedep->id_state |= DEPCOMPLETE; LIST_REMOVE(inodedep, id_deps); inodedep->id_buf = NULL; } /* * If the inodedep has no dependencies associated with it, * then we must free it here and free the file immediately. * This case arises when an early allocation fails (for * example, the user is over their file quota). */ if (free_inodedep(inodedep) == 0) WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list); else add_to_worklist(&freefile->fx_list); FREE_LOCK(&lk); } /* * Try to free an inodedep structure. Return 1 if it could be freed. */ static int free_inodedep(inodedep) struct inodedep *inodedep; { if ((inodedep->id_state & ONWORKLIST) != 0 || (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE || LIST_FIRST(&inodedep->id_pendinghd) != NULL || LIST_FIRST(&inodedep->id_bufwait) != NULL || LIST_FIRST(&inodedep->id_inowait) != NULL || TAILQ_FIRST(&inodedep->id_inoupdt) != NULL || TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL || inodedep->id_nlinkdelta != 0 || inodedep->id_savedino != NULL) return (0); LIST_REMOVE(inodedep, id_hash); WORKITEM_FREE(inodedep, D_INODEDEP); num_inodedep -= 1; return (1); } /* * This workitem routine performs the block de-allocation. * The workitem is added to the pending list after the updated * inode block has been written to disk. As mentioned above, * checks regarding the number of blocks de-allocated (compared * to the number of blocks allocated for the file) are also * performed in this function. */ static void handle_workitem_freeblocks(freeblks) struct freeblks *freeblks; { struct inode tip; ufs_daddr_t bn; struct fs *fs; int i, level, bsize; long nblocks, blocksreleased = 0; int error, allerror = 0; ufs_lbn_t baselbns[NIADDR], tmpval; tip.i_number = freeblks->fb_previousinum; tip.i_devvp = freeblks->fb_devvp; tip.i_dev = freeblks->fb_devvp->v_rdev; tip.i_fs = freeblks->fb_fs; tip.i_size = freeblks->fb_oldsize; tip.i_uid = freeblks->fb_uid; fs = freeblks->fb_fs; tmpval = 1; baselbns[0] = NDADDR; for (i = 1; i < NIADDR; i++) { tmpval *= NINDIR(fs); baselbns[i] = baselbns[i - 1] + tmpval; } nblocks = btodb(fs->fs_bsize); blocksreleased = 0; /* * Indirect blocks first. */ for (level = (NIADDR - 1); level >= 0; level--) { if ((bn = freeblks->fb_iblks[level]) == 0) continue; if ((error = indir_trunc(&tip, fsbtodb(fs, bn), level, baselbns[level], &blocksreleased)) == 0) allerror = error; ffs_blkfree(&tip, bn, fs->fs_bsize); blocksreleased += nblocks; } /* * All direct blocks or frags. 
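/*
 * An illustrative sketch of the baselbns[] computation performed in
 * handle_workitem_freeblocks() above: each level of indirection begins at
 * the logical block just past everything the previous levels can address.
 * The constants are assumptions chosen for the example (12 direct pointers,
 * 2048 pointers per indirect block), not values read from a superblock.
 */
#include <stdio.h>

#define EX_NDADDR       12      /* direct block pointers in the inode */
#define EX_NIADDR       3       /* levels of indirection */
#define EX_NINDIR       2048L   /* pointers per indirect block */

int
main(void)
{
        long baselbns[EX_NIADDR], tmpval;
        int i;

        tmpval = 1;
        baselbns[0] = EX_NDADDR;
        for (i = 1; i < EX_NIADDR; i++) {
                tmpval *= EX_NINDIR;
                baselbns[i] = baselbns[i - 1] + tmpval;
        }
        for (i = 0; i < EX_NIADDR; i++)
                printf("indirect level %d starts at lbn %ld\n", i, baselbns[i]);
        return (0);
}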
*/ for (i = (NDADDR - 1); i >= 0; i--) { if ((bn = freeblks->fb_dblks[i]) == 0) continue; bsize = blksize(fs, &tip, i); ffs_blkfree(&tip, bn, bsize); blocksreleased += btodb(bsize); } #ifdef DIAGNOSTIC if (freeblks->fb_chkcnt != blocksreleased) panic("handle_workitem_freeblocks: block count"); if (allerror) softdep_error("handle_workitem_freeblks", allerror); #endif /* DIAGNOSTIC */ WORKITEM_FREE(freeblks, D_FREEBLKS); num_freeblks -= 1; } /* * Release blocks associated with the inode ip and stored in the indirect * block dbn. If level is greater than SINGLE, the block is an indirect block * and recursive calls to indirtrunc must be used to cleanse other indirect * blocks. */ static int indir_trunc(ip, dbn, level, lbn, countp) struct inode *ip; ufs_daddr_t dbn; int level; ufs_lbn_t lbn; long *countp; { struct buf *bp; ufs_daddr_t *bap; ufs_daddr_t nb; struct fs *fs; struct worklist *wk; struct indirdep *indirdep; int i, lbnadd, nblocks; int error, allerror = 0; fs = ip->i_fs; lbnadd = 1; for (i = level; i > 0; i--) lbnadd *= NINDIR(fs); /* * Get buffer of block pointers to be freed. This routine is not * called until the zero'ed inode has been written, so it is safe * to free blocks as they are encountered. Because the inode has * been zero'ed, calls to bmap on these blocks will fail. So, we * have to use the on-disk address and the block device for the * filesystem to look them up. If the file was deleted before its * indirect blocks were all written to disk, the routine that set * us up (deallocate_dependencies) will have arranged to leave * a complete copy of the indirect block in memory for our use. * Otherwise we have to read the blocks in from the disk. */ ACQUIRE_LOCK(&lk); if ((bp = incore(ip->i_devvp, dbn)) != NULL && (wk = LIST_FIRST(&bp->b_dep)) != NULL) { if (wk->wk_type != D_INDIRDEP || (indirdep = WK_INDIRDEP(wk))->ir_savebp != bp || (indirdep->ir_state & GOINGAWAY) == 0) panic("indir_trunc: lost indirdep"); WORKLIST_REMOVE(wk); WORKITEM_FREE(indirdep, D_INDIRDEP); if (LIST_FIRST(&bp->b_dep) != NULL) panic("indir_trunc: dangling dep"); FREE_LOCK(&lk); } else { FREE_LOCK(&lk); error = bread(ip->i_devvp, dbn, (int)fs->fs_bsize, NOCRED, &bp); if (error) return (error); } /* * Recursively free indirect blocks. */ bap = (ufs_daddr_t *)bp->b_data; nblocks = btodb(fs->fs_bsize); for (i = NINDIR(fs) - 1; i >= 0; i--) { if ((nb = bap[i]) == 0) continue; if (level != 0) { if ((error = indir_trunc(ip, fsbtodb(fs, nb), level - 1, lbn + (i * lbnadd), countp)) != 0) allerror = error; } ffs_blkfree(ip, nb, fs->fs_bsize); *countp += nblocks; } bp->b_flags |= B_INVAL | B_NOCACHE; brelse(bp); return (allerror); } /* * Free an allocindir. * This routine must be called with splbio interrupts blocked. */ static void free_allocindir(aip, inodedep) struct allocindir *aip; struct inodedep *inodedep; { struct freefrag *freefrag; #ifdef DEBUG if (lk.lkt_held == -1) panic("free_allocindir: lock not held"); #endif if ((aip->ai_state & DEPCOMPLETE) == 0) LIST_REMOVE(aip, ai_deps); if (aip->ai_state & ONWORKLIST) WORKLIST_REMOVE(&aip->ai_list); LIST_REMOVE(aip, ai_next); if ((freefrag = aip->ai_freefrag) != NULL) { if (inodedep == NULL) add_to_worklist(&freefrag->ff_list); else WORKLIST_INSERT(&inodedep->id_bufwait, &freefrag->ff_list); } WORKITEM_FREE(aip, D_ALLOCINDIR); } /* * Directory entry addition dependencies. * * When adding a new directory entry, the inode (with its incremented link * count) must be written to disk before the directory entry's pointer to it. 
* Also, if the inode is newly allocated, the corresponding freemap must be * updated (on disk) before the directory entry's pointer. These requirements * are met via undo/redo on the directory entry's pointer, which consists * simply of the inode number. * * As directory entries are added and deleted, the free space within a * directory block can become fragmented. The ufs file system will compact * a fragmented directory block to make space for a new entry. When this * occurs, the offsets of previously added entries change. Any "diradd" * dependency structures corresponding to these entries must be updated with * the new offsets. */ /* * This routine is called after the in-memory inode's link * count has been incremented, but before the directory entry's * pointer to the inode has been set. */ void softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp) struct buf *bp; /* buffer containing directory block */ struct inode *dp; /* inode for directory */ off_t diroffset; /* offset of new entry in directory */ long newinum; /* inode referenced by new directory entry */ struct buf *newdirbp; /* non-NULL => contents of new mkdir */ { int offset; /* offset of new entry within directory block */ ufs_lbn_t lbn; /* block in directory containing new entry */ struct fs *fs; struct diradd *dap; struct pagedep *pagedep; struct inodedep *inodedep; struct mkdir *mkdir1, *mkdir2; /* * Whiteouts have no dependencies. */ if (newinum == WINO) { if (newdirbp != NULL) bdwrite(newdirbp); return; } fs = dp->i_fs; lbn = lblkno(fs, diroffset); offset = blkoff(fs, diroffset); MALLOC(dap, struct diradd *, sizeof(struct diradd), M_DIRADD, M_WAITOK); bzero(dap, sizeof(struct diradd)); dap->da_list.wk_type = D_DIRADD; dap->da_offset = offset; dap->da_newinum = newinum; dap->da_state = ATTACHED; if (newdirbp == NULL) { dap->da_state |= DEPCOMPLETE; ACQUIRE_LOCK(&lk); } else { dap->da_state |= MKDIR_BODY | MKDIR_PARENT; MALLOC(mkdir1, struct mkdir *, sizeof(struct mkdir), M_MKDIR, M_WAITOK); mkdir1->md_list.wk_type = D_MKDIR; mkdir1->md_state = MKDIR_BODY; mkdir1->md_diradd = dap; MALLOC(mkdir2, struct mkdir *, sizeof(struct mkdir), M_MKDIR, M_WAITOK); mkdir2->md_list.wk_type = D_MKDIR; mkdir2->md_state = MKDIR_PARENT; mkdir2->md_diradd = dap; /* * Dependency on "." and ".." being written to disk. */ mkdir1->md_buf = newdirbp; ACQUIRE_LOCK(&lk); LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs); WORKLIST_INSERT(&newdirbp->b_dep, &mkdir1->md_list); FREE_LOCK(&lk); bdwrite(newdirbp); /* * Dependency on link count increase for parent directory */ ACQUIRE_LOCK(&lk); if (inodedep_lookup(dp->i_fs, dp->i_number, 0, &inodedep) == 0 || (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) { dap->da_state &= ~MKDIR_PARENT; WORKITEM_FREE(mkdir2, D_MKDIR); } else { LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs); WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list); } } /* * Link into parent directory pagedep to await its being written. */ if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0) WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list); dap->da_pagedep = pagedep; LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap, da_pdlist); /* * Link into its inodedep. Put it on the id_bufwait list if the inode * is not yet written. If it is written, do the post-inode write * processing to put it on the id_pendinghd list. 
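/*
 * An illustrative sketch of the undo/redo applied to a new directory entry,
 * as described in the directory-add comment above: until the inode with its
 * incremented link count is on disk, any write of the directory block
 * stores a rolled-back inode number (zero for a new entry, the old number
 * for a changed entry). The types and field names are simplified stand-ins
 * for the kernel structures.
 */
#include <stdio.h>
#include <stdint.h>

struct mini_dirent {
        uint32_t d_ino;                 /* in-memory (up to date) inode number */
};

struct mini_diradd {
        uint32_t da_prev_ino;           /* rollback value: 0, or old inum on change */
        int inode_written;              /* has the referenced inode reached disk? */
};

/* Value the directory block would carry if written right now. */
static uint32_t
dirent_for_disk(const struct mini_dirent *ep, const struct mini_diradd *dap)
{
        if (dap != NULL && !dap->inode_written)
                return (dap->da_prev_ino);
        return (ep->d_ino);
}

int
main(void)
{
        struct mini_dirent ep = { 77 };
        struct mini_diradd dap = { 0, 0 };

        printf("before inode write, disk sees inum %u\n", dirent_for_disk(&ep, &dap));
        dap.inode_written = 1;          /* the inode block completed its write */
        printf("after inode write, disk sees inum %u\n", dirent_for_disk(&ep, &dap));
        return (0);
}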
*/ (void) inodedep_lookup(fs, newinum, DEPALLOC, &inodedep); if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) diradd_inode_written(dap, inodedep); else WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list); FREE_LOCK(&lk); } /* * This procedure is called to change the offset of a directory * entry when compacting a directory block which must be owned * exclusively by the caller. Note that the actual entry movement * must be done in this procedure to ensure that no I/O completions * occur while the move is in progress. */ void softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize) struct inode *dp; /* inode for directory */ caddr_t base; /* address of dp->i_offset */ caddr_t oldloc; /* address of old directory location */ caddr_t newloc; /* address of new directory location */ int entrysize; /* size of directory entry */ { int offset, oldoffset, newoffset; struct pagedep *pagedep; struct diradd *dap; ufs_lbn_t lbn; ACQUIRE_LOCK(&lk); lbn = lblkno(dp->i_fs, dp->i_offset); offset = blkoff(dp->i_fs, dp->i_offset); if (pagedep_lookup(dp, lbn, 0, &pagedep) == 0) goto done; oldoffset = offset + (oldloc - base); newoffset = offset + (newloc - base); for (dap = LIST_FIRST(&pagedep->pd_diraddhd[DIRADDHASH(oldoffset)]); dap; dap = LIST_NEXT(dap, da_pdlist)) { if (dap->da_offset != oldoffset) continue; dap->da_offset = newoffset; if (DIRADDHASH(newoffset) == DIRADDHASH(oldoffset)) break; LIST_REMOVE(dap, da_pdlist); LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(newoffset)], dap, da_pdlist); break; } if (dap == NULL) { for (dap = LIST_FIRST(&pagedep->pd_pendinghd); dap; dap = LIST_NEXT(dap, da_pdlist)) { if (dap->da_offset == oldoffset) { dap->da_offset = newoffset; break; } } } done: bcopy(oldloc, newloc, entrysize); FREE_LOCK(&lk); } /* * Free a diradd dependency structure. This routine must be called * with splbio interrupts blocked. */ static void free_diradd(dap) struct diradd *dap; { struct dirrem *dirrem; struct pagedep *pagedep; struct inodedep *inodedep; struct mkdir *mkdir, *nextmd; #ifdef DEBUG if (lk.lkt_held == -1) panic("free_diradd: lock not held"); #endif WORKLIST_REMOVE(&dap->da_list); LIST_REMOVE(dap, da_pdlist); if ((dap->da_state & DIRCHG) == 0) { pagedep = dap->da_pagedep; } else { dirrem = dap->da_previous; pagedep = dirrem->dm_pagedep; dirrem->dm_dirinum = pagedep->pd_ino; add_to_worklist(&dirrem->dm_list); } if (inodedep_lookup(VFSTOUFS(pagedep->pd_mnt)->um_fs, dap->da_newinum, 0, &inodedep) != 0) (void) free_inodedep(inodedep); if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) { for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) { nextmd = LIST_NEXT(mkdir, md_mkdirs); if (mkdir->md_diradd != dap) continue; dap->da_state &= ~mkdir->md_state; WORKLIST_REMOVE(&mkdir->md_list); LIST_REMOVE(mkdir, md_mkdirs); WORKITEM_FREE(mkdir, D_MKDIR); } if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) panic("free_diradd: unfound ref"); } WORKITEM_FREE(dap, D_DIRADD); } /* * Directory entry removal dependencies. * * When removing a directory entry, the entry's inode pointer must be * zero'ed on disk before the corresponding inode's link count is decremented * (possibly freeing the inode for re-use). This dependency is handled by * updating the directory entry but delaying the inode count reduction until * after the directory block has been written to disk. After this point, the * inode count can be decremented whenever it is convenient. */ /* * This routine should be called immediately after removing * a directory entry. 
The inode's link count should not be * decremented by the calling procedure -- the soft updates * code will do this task when it is safe. */ void softdep_setup_remove(bp, dp, ip, isrmdir) struct buf *bp; /* buffer containing directory block */ struct inode *dp; /* inode for the directory being modified */ struct inode *ip; /* inode for directory entry being removed */ int isrmdir; /* indicates if doing RMDIR */ { struct dirrem *dirrem; /* * Allocate a new dirrem if appropriate and ACQUIRE_LOCK. */ dirrem = newdirrem(bp, dp, ip, isrmdir); if ((dirrem->dm_state & COMPLETE) == 0) { LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem, dm_next); } else { dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino; add_to_worklist(&dirrem->dm_list); } FREE_LOCK(&lk); } /* * Allocate a new dirrem if appropriate and return it along with * its associated pagedep. Called without a lock, returns with lock. */ static struct dirrem * newdirrem(bp, dp, ip, isrmdir) struct buf *bp; /* buffer containing directory block */ struct inode *dp; /* inode for the directory being modified */ struct inode *ip; /* inode for directory entry being removed */ int isrmdir; /* indicates if doing RMDIR */ { int offset; ufs_lbn_t lbn; struct diradd *dap; struct dirrem *dirrem; struct pagedep *pagedep; /* * Whiteouts have no deletion dependencies. */ if (ip == NULL) panic("newdirrem: whiteout"); MALLOC(dirrem, struct dirrem *, sizeof(struct dirrem), M_DIRREM, M_WAITOK); bzero(dirrem, sizeof(struct dirrem)); dirrem->dm_list.wk_type = D_DIRREM; dirrem->dm_state = isrmdir ? RMDIR : 0; dirrem->dm_mnt = ITOV(ip)->v_mount; dirrem->dm_oldinum = ip->i_number; ACQUIRE_LOCK(&lk); lbn = lblkno(dp->i_fs, dp->i_offset); offset = blkoff(dp->i_fs, dp->i_offset); if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0) WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list); dirrem->dm_pagedep = pagedep; /* * Check for a diradd dependency for the same directory entry. * If present, then both dependencies become obsolete and can * be de-allocated. Check for an entry on both the pd_dirraddhd * list and the pd_pendinghd list. */ for (dap = LIST_FIRST(&pagedep->pd_diraddhd[DIRADDHASH(offset)]); dap; dap = LIST_NEXT(dap, da_pdlist)) if (dap->da_offset == offset) break; if (dap == NULL) { for (dap = LIST_FIRST(&pagedep->pd_pendinghd); dap; dap = LIST_NEXT(dap, da_pdlist)) if (dap->da_offset == offset) break; if (dap == NULL) return (dirrem); } /* * Must be ATTACHED at this point, so just delete it. */ if ((dap->da_state & ATTACHED) == 0) panic("newdirrem: not ATTACHED"); if (dap->da_newinum != ip->i_number) panic("newdirrem: inum %d should be %d", ip->i_number, dap->da_newinum); free_diradd(dap); dirrem->dm_state |= COMPLETE; return (dirrem); } /* * Directory entry change dependencies. * * Changing an existing directory entry requires that an add operation * be completed first followed by a deletion. The semantics for the addition * are identical to the description of adding a new entry above except * that the rollback is to the old inode number rather than zero. Once * the addition dependency is completed, the removal is done as described * in the removal routine above. */ /* * This routine should be called immediately after changing * a directory entry. The inode's link count should not be * decremented by the calling procedure -- the soft updates * code will perform this task when it is safe. 
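/*
 * An illustrative sketch of the cancellation performed by newdirrem() above:
 * if the entry being removed still has a pending add dependency at the same
 * offset, neither the add nor the remove ever needs to reach the disk, so
 * the add is dropped and the remove is treated as complete at once. This
 * userland model keeps pending adds in a plain array instead of the
 * kernel's hashed pagedep lists.
 */
#include <stdio.h>

#define MAXADDS 8

struct pending_adds {
        int offsets[MAXADDS];   /* directory offsets with unwritten adds; 0 = free slot */
};

/* Returns 1 if the remove is already complete (cancelled against an add). */
static int
setup_remove(struct pending_adds *pa, int offset)
{
        int i;

        for (i = 0; i < MAXADDS; i++) {
                if (pa->offsets[i] == offset) {
                        pa->offsets[i] = 0;     /* obsolete add: drop it */
                        return (1);             /* no further ordering needed */
                }
        }
        return (0);     /* must wait for the directory block write as usual */
}

int
main(void)
{
        struct pending_adds pa = { { 512, 0, 0, 0, 0, 0, 0, 0 } };

        printf("remove at 512: complete=%d\n", setup_remove(&pa, 512));
        printf("remove at 640: complete=%d\n", setup_remove(&pa, 640));
        return (0);
}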
*/ void softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir) struct buf *bp; /* buffer containing directory block */ struct inode *dp; /* inode for the directory being modified */ struct inode *ip; /* inode for directory entry being removed */ long newinum; /* new inode number for changed entry */ int isrmdir; /* indicates if doing RMDIR */ { int offset; struct diradd *dap = NULL; struct dirrem *dirrem; struct pagedep *pagedep; struct inodedep *inodedep; offset = blkoff(dp->i_fs, dp->i_offset); /* * Whiteouts do not need diradd dependencies. */ if (newinum != WINO) { MALLOC(dap, struct diradd *, sizeof(struct diradd), M_DIRADD, M_WAITOK); bzero(dap, sizeof(struct diradd)); dap->da_list.wk_type = D_DIRADD; dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE; dap->da_offset = offset; dap->da_newinum = newinum; } /* * Allocate a new dirrem and ACQUIRE_LOCK. */ dirrem = newdirrem(bp, dp, ip, isrmdir); pagedep = dirrem->dm_pagedep; /* * The possible values for isrmdir: * 0 - non-directory file rename * 1 - directory rename within same directory * inum - directory rename to new directory of given inode number * When renaming to a new directory, we are both deleting and * creating a new directory entry, so the link count on the new * directory should not change. Thus we do not need the followup * dirrem which is usually done in handle_workitem_remove. We set * the DIRCHG flag to tell handle_workitem_remove to skip the * followup dirrem. */ if (isrmdir > 1) dirrem->dm_state |= DIRCHG; /* * Whiteouts have no additional dependencies, * so just put the dirrem on the correct list. */ if (newinum == WINO) { if ((dirrem->dm_state & COMPLETE) == 0) { LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem, dm_next); } else { dirrem->dm_dirinum = pagedep->pd_ino; add_to_worklist(&dirrem->dm_list); } FREE_LOCK(&lk); return; } /* * Link into its inodedep. Put it on the id_bufwait list if the inode * is not yet written. If it is written, do the post-inode write * processing to put it on the id_pendinghd list. */ dap->da_previous = dirrem; if (inodedep_lookup(dp->i_fs, newinum, DEPALLOC, &inodedep) == 0 || (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) { dap->da_state |= COMPLETE; LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list); } else { LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap, da_pdlist); WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list); } /* * If the previous inode was never written or its previous directory * entry was never written, then we do not want to roll back to this * previous value. Instead we want to roll back to zero and immediately * free the unwritten or unreferenced inode. */ if (dirrem->dm_state & COMPLETE) { dap->da_state &= ~DIRCHG; dap->da_pagedep = pagedep; dirrem->dm_dirinum = pagedep->pd_ino; add_to_worklist(&dirrem->dm_list); } FREE_LOCK(&lk); } /* * Called whenever the link count on an inode is increased. * It creates an inode dependency so that the new reference(s) * to the inode cannot be committed to disk until the updated * inode has been written. */ void softdep_increase_linkcnt(ip) struct inode *ip; /* the inode with the increased link count */ { struct inodedep *inodedep; ACQUIRE_LOCK(&lk); (void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC, &inodedep); FREE_LOCK(&lk); } /* * This workitem decrements the inode's link count. * If the link count reaches zero, the file is removed. 
*/ static void handle_workitem_remove(dirrem) struct dirrem *dirrem; { struct proc *p = CURPROC; /* XXX */ struct inodedep *inodedep; struct vnode *vp; struct inode *ip; int error; if ((error = VFS_VGET(dirrem->dm_mnt, dirrem->dm_oldinum, &vp)) != 0) { softdep_error("handle_workitem_remove: vget", error); return; } ip = VTOI(vp); /* * Normal file deletion. */ if ((dirrem->dm_state & RMDIR) == 0) { ip->i_nlink--; if (ip->i_nlink < ip->i_effnlink) panic("handle_workitem_remove: bad file delta"); ip->i_flag |= IN_CHANGE; vput(vp); WORKITEM_FREE(dirrem, D_DIRREM); return; } /* * Directory deletion. Decrement reference count for both the * just deleted parent directory entry and the reference for ".". * Next truncate the directory to length zero. When the * truncation completes, arrange to have the reference count on * the parent decremented to account for the loss of "..". */ ip->i_nlink -= 2; if (ip->i_nlink < ip->i_effnlink) panic("handle_workitem_remove: bad dir delta"); ip->i_flag |= IN_CHANGE; if ((error = UFS_TRUNCATE(vp, (off_t)0, 0, p->p_ucred, p)) != 0) softdep_error("handle_workitem_remove: truncate", error); /* * Rename a directory to a new parent. Since, we are both deleting * and creating a new directory entry, the link count on the new * directory should not change. Thus we skip the followup dirrem. */ if (dirrem->dm_state & DIRCHG) { vput(vp); WORKITEM_FREE(dirrem, D_DIRREM); return; } ACQUIRE_LOCK(&lk); (void) inodedep_lookup(ip->i_fs, dirrem->dm_oldinum, DEPALLOC, &inodedep); dirrem->dm_state = 0; dirrem->dm_oldinum = dirrem->dm_dirinum; WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list); FREE_LOCK(&lk); vput(vp); } /* * Inode de-allocation dependencies. * * When an inode's link count is reduced to zero, it can be de-allocated. We * found it convenient to postpone de-allocation until after the inode is * written to disk with its new link count (zero). At this point, all of the * on-disk inode's block pointers are nullified and, with careful dependency * list ordering, all dependencies related to the inode will be satisfied and * the corresponding dependency structures de-allocated. So, if/when the * inode is reused, there will be no mixing of old dependencies with new * ones. This artificial dependency is set up by the block de-allocation * procedure above (softdep_setup_freeblocks) and completed by the * following procedure. */ static void handle_workitem_freefile(freefile) struct freefile *freefile; { struct vnode vp; struct inode tip; struct inodedep *idp; int error; #ifdef DEBUG ACQUIRE_LOCK(&lk); if (inodedep_lookup(freefile->fx_fs, freefile->fx_oldinum, 0, &idp)) panic("handle_workitem_freefile: inodedep survived"); FREE_LOCK(&lk); #endif tip.i_devvp = freefile->fx_devvp; tip.i_dev = freefile->fx_devvp->v_rdev; tip.i_fs = freefile->fx_fs; vp.v_data = &tip; if ((error = ffs_freefile(&vp, freefile->fx_oldinum, freefile->fx_mode)) != 0) softdep_error("handle_workitem_freefile", error); WORKITEM_FREE(freefile, D_FREEFILE); num_freefile -= 1; } /* * Disk writes. * * The dependency structures constructed above are most actively used when file * system blocks are written to disk. No constraints are placed on when a * block can be written, but unsatisfied update dependencies are made safe by * modifying (or replacing) the source memory for the duration of the disk * write. When the disk write completes, the memory block is again brought * up-to-date. * * In-core inode structure reclamation. 
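/*
 * An illustrative sketch of the scheme described in the "Disk writes"
 * comment above: just before a write, contents covered by unsatisfied
 * dependencies are swapped out for a safe (rolled-back) copy; on completion
 * the real contents are restored and the buffer is left dirty so it will be
 * written again later. The buffer and "disk" here are plain arrays, not the
 * kernel buf structures.
 */
#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define BSIZE 8

int
main(void)
{
        uint8_t data[BSIZE] = { 1, 2, 3, 4, 5, 6, 7, 8 };       /* up-to-date contents */
        uint8_t safe[BSIZE] = { 1, 2, 3, 4, 0, 0, 0, 0 };       /* unsafe part rolled back */
        uint8_t saved[BSIZE], disk[BSIZE];
        int dirty;

        /* I/O initiation: stash the real contents and present the safe copy. */
        memcpy(saved, data, BSIZE);
        memcpy(data, safe, BSIZE);
        memcpy(disk, data, BSIZE);              /* the simulated disk write */

        /* I/O completion: roll forward and keep the buffer dirty so the
         * real contents are eventually written once the dependencies clear. */
        memcpy(data, saved, BSIZE);
        dirty = 1;

        printf("disk holds %u, memory holds %u, dirty=%d\n",
            (unsigned)disk[4], (unsigned)data[4], dirty);
        return (0);
}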
* * Because there are a finite number of "in-core" inode structures, they are * reused regularly. By transferring all inode-related dependencies to the * in-memory inode block and indexing them separately (via "inodedep"s), we * can allow "in-core" inode structures to be reused at any time and avoid * any increase in contention. * * Called just before entering the device driver to initiate a new disk I/O. * The buffer must be locked, thus, no I/O completion operations can occur * while we are manipulating its associated dependencies. */ void softdep_disk_io_initiation(bp) struct buf *bp; /* structure describing disk write to occur */ { struct worklist *wk, *nextwk; struct indirdep *indirdep; /* * We only care about write operations. There should never * be dependencies for reads. */ if (bp->b_flags & B_READ) panic("softdep_disk_io_initiation: read"); /* * Do any necessary pre-I/O processing. */ for (wk = LIST_FIRST(&bp->b_dep); wk; wk = nextwk) { nextwk = LIST_NEXT(wk, wk_list); switch (wk->wk_type) { case D_PAGEDEP: initiate_write_filepage(WK_PAGEDEP(wk), bp); continue; case D_INODEDEP: initiate_write_inodeblock(WK_INODEDEP(wk), bp); continue; case D_INDIRDEP: indirdep = WK_INDIRDEP(wk); if (indirdep->ir_state & GOINGAWAY) panic("disk_io_initiation: indirdep gone"); /* * If there are no remaining dependencies, this * will be writing the real pointers, so the * dependency can be freed. */ if (LIST_FIRST(&indirdep->ir_deplisthd) == NULL) { indirdep->ir_savebp->b_flags |= B_INVAL | B_NOCACHE; brelse(indirdep->ir_savebp); /* inline expand WORKLIST_REMOVE(wk); */ wk->wk_state &= ~ONWORKLIST; LIST_REMOVE(wk, wk_list); WORKITEM_FREE(indirdep, D_INDIRDEP); continue; } /* * Replace up-to-date version with safe version. */ ACQUIRE_LOCK(&lk); indirdep->ir_state &= ~ATTACHED; indirdep->ir_state |= UNDONE; MALLOC(indirdep->ir_saveddata, caddr_t, bp->b_bcount, M_INDIRDEP, M_WAITOK); bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount); bcopy(indirdep->ir_savebp->b_data, bp->b_data, bp->b_bcount); FREE_LOCK(&lk); continue; case D_MKDIR: case D_BMSAFEMAP: case D_ALLOCDIRECT: case D_ALLOCINDIR: continue; default: panic("handle_disk_io_initiation: Unexpected type %s", TYPENAME(wk->wk_type)); /* NOTREACHED */ } } } /* * Called from within the procedure above to deal with unsatisfied * allocation dependencies in a directory. The buffer must be locked, * thus, no I/O completion operations can occur while we are * manipulating its associated dependencies. */ static void initiate_write_filepage(pagedep, bp) struct pagedep *pagedep; struct buf *bp; { struct diradd *dap; struct direct *ep; int i; if (pagedep->pd_state & IOSTARTED) { /* * This can only happen if there is a driver that does not * understand chaining. Here biodone will reissue the call * to strategy for the incomplete buffers. */ printf("initiate_write_filepage: already started\n"); return; } pagedep->pd_state |= IOSTARTED; ACQUIRE_LOCK(&lk); for (i = 0; i < DAHASHSZ; i++) { for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap; dap = LIST_NEXT(dap, da_pdlist)) { ep = (struct direct *) ((char *)bp->b_data + dap->da_offset); if (ep->d_ino != dap->da_newinum) panic("%s: dir inum %d != new %d", "initiate_write_filepage", ep->d_ino, dap->da_newinum); if (dap->da_state & DIRCHG) ep->d_ino = dap->da_previous->dm_oldinum; else ep->d_ino = 0; dap->da_state &= ~ATTACHED; dap->da_state |= UNDONE; } } FREE_LOCK(&lk); } /* * Called from within the procedure above to deal with unsatisfied * allocation dependencies in an inodeblock. 
The buffer must be * locked, thus, no I/O completion operations can occur while we * are manipulating its associated dependencies. */ static void initiate_write_inodeblock(inodedep, bp) struct inodedep *inodedep; struct buf *bp; /* The inode block */ { struct allocdirect *adp, *lastadp; struct dinode *dp; struct fs *fs; ufs_lbn_t prevlbn = 0; int i, deplist; if (inodedep->id_state & IOSTARTED) panic("initiate_write_inodeblock: already started"); inodedep->id_state |= IOSTARTED; fs = inodedep->id_fs; dp = (struct dinode *)bp->b_data + ino_to_fsbo(fs, inodedep->id_ino); /* * If the bitmap is not yet written, then the allocated * inode cannot be written to disk. */ if ((inodedep->id_state & DEPCOMPLETE) == 0) { if (inodedep->id_savedino != NULL) panic("initiate_write_inodeblock: already doing I/O"); MALLOC(inodedep->id_savedino, struct dinode *, sizeof(struct dinode), M_INODEDEP, M_WAITOK); *inodedep->id_savedino = *dp; bzero((caddr_t)dp, sizeof(struct dinode)); return; } /* * If no dependencies, then there is nothing to roll back. */ inodedep->id_savedsize = dp->di_size; if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL) return; /* * Set the dependencies to busy. */ ACQUIRE_LOCK(&lk); for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = TAILQ_NEXT(adp, ad_next)) { #ifdef DIAGNOSTIC if (deplist != 0 && prevlbn >= adp->ad_lbn) panic("softdep_write_inodeblock: lbn order"); prevlbn = adp->ad_lbn; if (adp->ad_lbn < NDADDR && dp->di_db[adp->ad_lbn] != adp->ad_newblkno) panic("%s: direct pointer #%ld mismatch %d != %d", "softdep_write_inodeblock", adp->ad_lbn, dp->di_db[adp->ad_lbn], adp->ad_newblkno); if (adp->ad_lbn >= NDADDR && dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno) panic("%s: indirect pointer #%ld mismatch %d != %d", "softdep_write_inodeblock", adp->ad_lbn - NDADDR, dp->di_ib[adp->ad_lbn - NDADDR], adp->ad_newblkno); deplist |= 1 << adp->ad_lbn; if ((adp->ad_state & ATTACHED) == 0) panic("softdep_write_inodeblock: Unknown state 0x%x", adp->ad_state); #endif /* DIAGNOSTIC */ adp->ad_state &= ~ATTACHED; adp->ad_state |= UNDONE; } /* * The on-disk inode cannot claim to be any larger than the last * fragment that has been written. Otherwise, the on-disk inode * might have fragments that were not the last block in the file * which would corrupt the filesystem. */ for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { if (adp->ad_lbn >= NDADDR) break; dp->di_db[adp->ad_lbn] = adp->ad_oldblkno; /* keep going until hitting a rollback to a frag */ if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) continue; dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize; for (i = adp->ad_lbn + 1; i < NDADDR; i++) { #ifdef DIAGNOSTIC if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) panic("softdep_write_inodeblock: lost dep1"); #endif /* DIAGNOSTIC */ dp->di_db[i] = 0; } for (i = 0; i < NIADDR; i++) { #ifdef DIAGNOSTIC if (dp->di_ib[i] != 0 && (deplist & ((1 << NDADDR) << i)) == 0) panic("softdep_write_inodeblock: lost dep2"); #endif /* DIAGNOSTIC */ dp->di_ib[i] = 0; } FREE_LOCK(&lk); return; } /* * If we have zero'ed out the last allocated block of the file, * roll back the size to the last currently allocated block. * We know that this last allocated block is a full-sized as * we already checked for fragments in the loop above. 
*/ if (lastadp != NULL && dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) { for (i = lastadp->ad_lbn; i >= 0; i--) if (dp->di_db[i] != 0) break; dp->di_size = (i + 1) * fs->fs_bsize; } /* * The only dependencies are for indirect blocks. * * The file size for indirect block additions is not guaranteed. * Such a guarantee would be non-trivial to achieve. The conventional * synchronous write implementation also does not make this guarantee. * Fsck should catch and fix discrepancies. Arguably, the file size * can be over-estimated without destroying integrity when the file * moves into the indirect blocks (i.e., is large). If we want to * postpone fsck, we are stuck with this argument. */ for (; adp; adp = TAILQ_NEXT(adp, ad_next)) dp->di_ib[adp->ad_lbn - NDADDR] = 0; FREE_LOCK(&lk); } /* * This routine is called during the completion interrupt * service routine for a disk write (from the procedure called * by the device driver to inform the file system caches of * a request completion). It should be called early in this * procedure, before the block is made available to other * processes or other routines are called. */ void softdep_disk_write_complete(bp) struct buf *bp; /* describes the completed disk write */ { struct worklist *wk; struct workhead reattach; struct newblk *newblk; struct allocindir *aip; struct allocdirect *adp; struct indirdep *indirdep; struct inodedep *inodedep; struct bmsafemap *bmsafemap; #ifdef DEBUG if (lk.lkt_held != -1) panic("softdep_disk_write_complete: lock is held"); lk.lkt_held = -2; #endif LIST_INIT(&reattach); while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) { WORKLIST_REMOVE(wk); switch (wk->wk_type) { case D_PAGEDEP: if (handle_written_filepage(WK_PAGEDEP(wk), bp)) WORKLIST_INSERT(&reattach, wk); continue; case D_INODEDEP: if (handle_written_inodeblock(WK_INODEDEP(wk), bp)) WORKLIST_INSERT(&reattach, wk); continue; case D_BMSAFEMAP: bmsafemap = WK_BMSAFEMAP(wk); while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkhd))) { newblk->nb_state |= DEPCOMPLETE; newblk->nb_bmsafemap = NULL; LIST_REMOVE(newblk, nb_deps); } while ((adp = LIST_FIRST(&bmsafemap->sm_allocdirecthd))) { adp->ad_state |= DEPCOMPLETE; adp->ad_buf = NULL; LIST_REMOVE(adp, ad_deps); handle_allocdirect_partdone(adp); } while ((aip = LIST_FIRST(&bmsafemap->sm_allocindirhd))) { aip->ai_state |= DEPCOMPLETE; aip->ai_buf = NULL; LIST_REMOVE(aip, ai_deps); handle_allocindir_partdone(aip); } while ((inodedep = LIST_FIRST(&bmsafemap->sm_inodedephd)) != NULL) { inodedep->id_state |= DEPCOMPLETE; LIST_REMOVE(inodedep, id_deps); inodedep->id_buf = NULL; } WORKITEM_FREE(bmsafemap, D_BMSAFEMAP); continue; case D_MKDIR: handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY); continue; case D_ALLOCDIRECT: adp = WK_ALLOCDIRECT(wk); adp->ad_state |= COMPLETE; handle_allocdirect_partdone(adp); continue; case D_ALLOCINDIR: aip = WK_ALLOCINDIR(wk); aip->ai_state |= COMPLETE; handle_allocindir_partdone(aip); continue; case D_INDIRDEP: indirdep = WK_INDIRDEP(wk); if (indirdep->ir_state & GOINGAWAY) panic("disk_write_complete: indirdep gone"); bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount); FREE(indirdep->ir_saveddata, M_INDIRDEP); indirdep->ir_saveddata = 0; indirdep->ir_state &= ~UNDONE; indirdep->ir_state |= ATTACHED; while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) { handle_allocindir_partdone(aip); if (aip == LIST_FIRST(&indirdep->ir_donehd)) panic("disk_write_complete: not gone"); } WORKLIST_INSERT(&reattach, wk); if ((bp->b_flags & B_DELWRI) == 0) stat_indir_blk_ptrs++; bdirty(bp); continue; default: 
panic("handle_disk_write_complete: Unknown type %s", TYPENAME(wk->wk_type)); /* NOTREACHED */ } } /* * Reattach any requests that must be redone. */ while ((wk = LIST_FIRST(&reattach)) != NULL) { WORKLIST_REMOVE(wk); WORKLIST_INSERT(&bp->b_dep, wk); } #ifdef DEBUG if (lk.lkt_held != -2) panic("softdep_disk_write_complete: lock lost"); lk.lkt_held = -1; #endif } /* * Called from within softdep_disk_write_complete above. Note that * this routine is always called from interrupt level with further * splbio interrupts blocked. */ static void handle_allocdirect_partdone(adp) struct allocdirect *adp; /* the completed allocdirect */ { struct allocdirect *listadp; struct inodedep *inodedep; long bsize; if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE) return; if (adp->ad_buf != NULL) panic("handle_allocdirect_partdone: dangling dep"); /* * The on-disk inode cannot claim to be any larger than the last * fragment that has been written. Otherwise, the on-disk inode * might have fragments that were not the last block in the file * which would corrupt the filesystem. Thus, we cannot free any * allocdirects after one whose ad_oldblkno claims a fragment as * these blocks must be rolled back to zero before writing the inode. * We check the currently active set of allocdirects in id_inoupdt. */ inodedep = adp->ad_inodedep; bsize = inodedep->id_fs->fs_bsize; for (listadp = TAILQ_FIRST(&inodedep->id_inoupdt); listadp; listadp = TAILQ_NEXT(listadp, ad_next)) { /* found our block */ if (listadp == adp) break; /* continue if ad_oldlbn is not a fragment */ if (listadp->ad_oldsize == 0 || listadp->ad_oldsize == bsize) continue; /* hit a fragment */ return; } /* * If we have reached the end of the current list without * finding the just finished dependency, then it must be * on the future dependency list. Future dependencies cannot * be freed until they are moved to the current list. */ if (listadp == NULL) { #ifdef DEBUG for (listadp = TAILQ_FIRST(&inodedep->id_newinoupdt); listadp; listadp = TAILQ_NEXT(listadp, ad_next)) /* found our block */ if (listadp == adp) break; if (listadp == NULL) panic("handle_allocdirect_partdone: lost dep"); #endif /* DEBUG */ return; } /* * If we have found the just finished dependency, then free * it along with anything that follows it that is complete. */ for (; adp; adp = listadp) { listadp = TAILQ_NEXT(adp, ad_next); if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE) return; free_allocdirect(&inodedep->id_inoupdt, adp, 1); } } /* * Called from within softdep_disk_write_complete above. Note that * this routine is always called from interrupt level with further * splbio interrupts blocked. */ static void handle_allocindir_partdone(aip) struct allocindir *aip; /* the completed allocindir */ { struct indirdep *indirdep; if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE) return; if (aip->ai_buf != NULL) panic("handle_allocindir_partdone: dangling dependency"); indirdep = aip->ai_indirdep; if (indirdep->ir_state & UNDONE) { LIST_REMOVE(aip, ai_next); LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next); return; } ((ufs_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] = aip->ai_newblkno; LIST_REMOVE(aip, ai_next); if (aip->ai_freefrag != NULL) add_to_worklist(&aip->ai_freefrag->ff_list); WORKITEM_FREE(aip, D_ALLOCINDIR); } /* * Called from within softdep_disk_write_complete above to restore * in-memory inode block contents to their most up-to-date state. Note * that this routine is always called from interrupt level with further * splbio interrupts blocked. 
*/ static int handle_written_inodeblock(inodedep, bp) struct inodedep *inodedep; struct buf *bp; /* buffer containing the inode block */ { struct worklist *wk, *filefree; struct allocdirect *adp, *nextadp; struct dinode *dp; int hadchanges; if ((inodedep->id_state & IOSTARTED) == 0) panic("handle_written_inodeblock: not started"); inodedep->id_state &= ~IOSTARTED; inodedep->id_state |= COMPLETE; dp = (struct dinode *)bp->b_data + ino_to_fsbo(inodedep->id_fs, inodedep->id_ino); /* * If we had to rollback the inode allocation because of * bitmaps being incomplete, then simply restore it. * Keep the block dirty so that it will not be reclaimed until * all associated dependencies have been cleared and the * corresponding updates written to disk. */ if (inodedep->id_savedino != NULL) { *dp = *inodedep->id_savedino; FREE(inodedep->id_savedino, M_INODEDEP); inodedep->id_savedino = NULL; if ((bp->b_flags & B_DELWRI) == 0) stat_inode_bitmap++; bdirty(bp); return (1); } /* * Roll forward anything that had to be rolled back before * the inode could be updated. */ hadchanges = 0; for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) { nextadp = TAILQ_NEXT(adp, ad_next); if (adp->ad_state & ATTACHED) panic("handle_written_inodeblock: new entry"); if (adp->ad_lbn < NDADDR) { if (dp->di_db[adp->ad_lbn] != adp->ad_oldblkno) panic("%s: %s #%ld mismatch %d != %d", "handle_written_inodeblock", "direct pointer", adp->ad_lbn, dp->di_db[adp->ad_lbn], adp->ad_oldblkno); dp->di_db[adp->ad_lbn] = adp->ad_newblkno; } else { if (dp->di_ib[adp->ad_lbn - NDADDR] != 0) panic("%s: %s #%ld allocated as %d", "handle_written_inodeblock", "indirect pointer", adp->ad_lbn - NDADDR, dp->di_ib[adp->ad_lbn - NDADDR]); dp->di_ib[adp->ad_lbn - NDADDR] = adp->ad_newblkno; } adp->ad_state &= ~UNDONE; adp->ad_state |= ATTACHED; hadchanges = 1; } if (hadchanges && (bp->b_flags & B_DELWRI) == 0) stat_direct_blk_ptrs++; /* * Reset the file size to its most up-to-date value. */ if (inodedep->id_savedsize == -1) panic("handle_written_inodeblock: bad size"); if (dp->di_size != inodedep->id_savedsize) { dp->di_size = inodedep->id_savedsize; hadchanges = 1; } inodedep->id_savedsize = -1; /* * If there were any rollbacks in the inode block, then it must be * marked dirty so that its will eventually get written back in * its correct form. */ if (hadchanges) bdirty(bp); /* * Process any allocdirects that completed during the update. */ if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL) handle_allocdirect_partdone(adp); /* * Process deallocations that were held pending until the * inode had been written to disk. Freeing of the inode * is delayed until after all blocks have been freed to * avoid creation of new triples * before the old ones have been deleted. */ filefree = NULL; while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) { WORKLIST_REMOVE(wk); switch (wk->wk_type) { case D_FREEFILE: /* * We defer adding filefree to the worklist until * all other additions have been made to ensure * that it will be done after all the old blocks * have been freed. 
*/ if (filefree != NULL) panic("handle_written_inodeblock: filefree"); filefree = wk; continue; case D_MKDIR: handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT); continue; case D_DIRADD: diradd_inode_written(WK_DIRADD(wk), inodedep); continue; case D_FREEBLKS: case D_FREEFRAG: case D_DIRREM: add_to_worklist(wk); continue; default: panic("handle_written_inodeblock: Unknown type %s", TYPENAME(wk->wk_type)); /* NOTREACHED */ } } if (filefree != NULL) { if (free_inodedep(inodedep) == 0) panic("handle_written_inodeblock: live inodedep"); add_to_worklist(filefree); return (0); } /* * If no outstanding dependencies, free it. */ if (free_inodedep(inodedep) || TAILQ_FIRST(&inodedep->id_inoupdt) == 0) return (0); return (hadchanges); } /* * Process a diradd entry after its dependent inode has been written. * This routine must be called with splbio interrupts blocked. */ static void diradd_inode_written(dap, inodedep) struct diradd *dap; struct inodedep *inodedep; { struct pagedep *pagedep; dap->da_state |= COMPLETE; if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { if (dap->da_state & DIRCHG) pagedep = dap->da_previous->dm_pagedep; else pagedep = dap->da_pagedep; LIST_REMOVE(dap, da_pdlist); LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); } WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list); } /* * Handle the completion of a mkdir dependency. */ static void handle_written_mkdir(mkdir, type) struct mkdir *mkdir; int type; { struct diradd *dap; struct pagedep *pagedep; if (mkdir->md_state != type) panic("handle_written_mkdir: bad type"); dap = mkdir->md_diradd; dap->da_state &= ~type; if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) dap->da_state |= DEPCOMPLETE; if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { if (dap->da_state & DIRCHG) pagedep = dap->da_previous->dm_pagedep; else pagedep = dap->da_pagedep; LIST_REMOVE(dap, da_pdlist); LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); } LIST_REMOVE(mkdir, md_mkdirs); WORKITEM_FREE(mkdir, D_MKDIR); } /* * Called from within softdep_disk_write_complete above. * A write operation was just completed. Removed inodes can * now be freed and associated block pointers may be committed. * Note that this routine is always called from interrupt level * with further splbio interrupts blocked. */ static int handle_written_filepage(pagedep, bp) struct pagedep *pagedep; struct buf *bp; /* buffer containing the written page */ { struct dirrem *dirrem; struct diradd *dap, *nextdap; struct direct *ep; int i, chgs; if ((pagedep->pd_state & IOSTARTED) == 0) panic("handle_written_filepage: not started"); pagedep->pd_state &= ~IOSTARTED; /* * Process any directory removals that have been committed. */ while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) { LIST_REMOVE(dirrem, dm_next); dirrem->dm_dirinum = pagedep->pd_ino; add_to_worklist(&dirrem->dm_list); } /* * Free any directory additions that have been committed. */ while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL) free_diradd(dap); /* * Uncommitted directory entries must be restored. 
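 * Restoring them here undoes a rollback made when the write of this page
 * was started: entries that name inodes not yet safely on disk had their
 * d_ino fields cleared so the directory block never references an
 * unwritten inode.  The loop below puts the saved da_newinum back into
 * each entry and marks the diradd ATTACHED again.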
	 */
	for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
		for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
		     dap = nextdap) {
			nextdap = LIST_NEXT(dap, da_pdlist);
			if (dap->da_state & ATTACHED)
				panic("handle_written_filepage: attached");
			ep = (struct direct *)
			    ((char *)bp->b_data + dap->da_offset);
			ep->d_ino = dap->da_newinum;
			dap->da_state &= ~UNDONE;
			dap->da_state |= ATTACHED;
			chgs = 1;
			/*
			 * If the inode referenced by the directory has
			 * been written out, then the dependency can be
			 * moved to the pending list.
			 */
			if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
				LIST_REMOVE(dap, da_pdlist);
				LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
				    da_pdlist);
			}
		}
	}
	/*
	 * If there were any rollbacks in the directory, then it must be
	 * marked dirty so that it will eventually get written back in
	 * its correct form.
	 */
	if (chgs) {
		if ((bp->b_flags & B_DELWRI) == 0)
			stat_dir_entry++;
		bdirty(bp);
	}
	/*
	 * If no dependencies remain, the pagedep will be freed.
	 * Otherwise it will remain to update the page before it
	 * is written back to disk.
	 */
	if (LIST_FIRST(&pagedep->pd_pendinghd) == 0) {
		for (i = 0; i < DAHASHSZ; i++)
			if (LIST_FIRST(&pagedep->pd_diraddhd[i]) != NULL)
				break;
		if (i == DAHASHSZ) {
			LIST_REMOVE(pagedep, pd_hash);
			WORKITEM_FREE(pagedep, D_PAGEDEP);
			return (0);
		}
	}
	return (1);
}

/*
 * Writing back in-core inode structures.
 *
 * The file system only accesses an inode's contents when it occupies an
 * "in-core" inode structure. These "in-core" structures are separate from
 * the page frames used to cache inode blocks. Only the latter are
 * transferred to/from the disk. So, when the updated contents of the
 * "in-core" inode structure are copied to the corresponding in-memory inode
 * block, the dependencies are also transferred. The following procedure is
 * called when copying a dirty "in-core" inode to a cached inode block.
 */

/*
 * Called when an inode is loaded from disk. If the effective link count
 * differed from the actual link count when it was last flushed, then we
 * need to ensure that the correct effective link count is put back.
 */
void
softdep_load_inodeblock(ip)
	struct inode *ip;	/* the "in_core" copy of the inode */
{
	struct inodedep *inodedep;

	/*
	 * Check for alternate nlink count.
	 */
	ip->i_effnlink = ip->i_nlink;
	ACQUIRE_LOCK(&lk);
	if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
		FREE_LOCK(&lk);
		return;
	}
	if (inodedep->id_nlinkdelta != 0) {
		ip->i_effnlink -= inodedep->id_nlinkdelta;
		ip->i_flag |= IN_MODIFIED;
		inodedep->id_nlinkdelta = 0;
		(void) free_inodedep(inodedep);
	}
	FREE_LOCK(&lk);
}

/*
 * This routine is called just before the "in-core" inode
 * information is to be copied to the in-memory inode block.
 * Recall that an inode block contains several inodes. If
 * the force flag is set, then the dependencies will be
 * cleared so that the update can always be made. Note that
 * the buffer is locked when this routine is called, so we
 * will never be in the middle of writing the inode block
 * to disk.
 */
void
softdep_update_inodeblock(ip, bp, waitfor)
	struct inode *ip;	/* the "in_core" copy of the inode */
	struct buf *bp;		/* the buffer containing the inode block */
	int waitfor;		/* nonzero => update must be allowed */
{
	struct inodedep *inodedep;
	struct worklist *wk;
	int error, gotit;

	/*
	 * If the effective link count is not equal to the actual link
	 * count, then we must track the difference in an inodedep while
	 * the inode is (potentially) tossed out of the cache. Otherwise,
	 * if there is no existing inodedep, then there are no dependencies
	 * to track.
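	 *
	 * A worked example with hypothetical counts: if the inode will go
	 * to disk with i_nlink 2 while an uncommitted unlink leaves
	 * i_effnlink at 1, id_nlinkdelta is recorded as 1 below; should the
	 * in-core inode be reclaimed and later reloaded,
	 * softdep_load_inodeblock() above subtracts that delta so the
	 * effective count of 1 is re-established.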
	 */
	ACQUIRE_LOCK(&lk);
	if (ip->i_effnlink != ip->i_nlink) {
		(void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC,
		    &inodedep);
	} else if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
		FREE_LOCK(&lk);
		return;
	}
	if (ip->i_nlink < ip->i_effnlink)
		panic("softdep_update_inodeblock: bad delta");
	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
	/*
	 * Changes have been initiated. Anything depending on these
	 * changes cannot occur until this inode has been written.
	 */
	inodedep->id_state &= ~COMPLETE;
	if ((inodedep->id_state & ONWORKLIST) == 0)
		WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
	/*
	 * Any new dependencies associated with the incore inode must
	 * now be moved to the list associated with the buffer holding
	 * the in-memory copy of the inode. Once merged, process any
	 * allocdirects that are completed by the merger.
	 */
	merge_inode_lists(inodedep);
	if (TAILQ_FIRST(&inodedep->id_inoupdt) != NULL)
		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt));
	/*
	 * Now that the inode has been pushed into the buffer, the
	 * operations dependent on the inode being written to disk
	 * can be moved to the id_bufwait so that they will be
	 * processed when the buffer I/O completes.
	 */
	while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
		WORKLIST_REMOVE(wk);
		WORKLIST_INSERT(&inodedep->id_bufwait, wk);
	}
	/*
	 * Newly allocated inodes cannot be written until the bitmap
	 * that allocates them has been written (indicated by
	 * DEPCOMPLETE being set in id_state). If we are doing a
	 * forced sync (e.g., an fsync on a file), we force the bitmap
	 * to be written so that the update can be done.
	 */
	if ((inodedep->id_state & DEPCOMPLETE) != 0 || waitfor == 0) {
		FREE_LOCK(&lk);
		return;
	}
	gotit = getdirtybuf(&inodedep->id_buf, MNT_WAIT);
	FREE_LOCK(&lk);
	if (gotit &&
	    (error = VOP_BWRITE(inodedep->id_buf->b_vp, inodedep->id_buf)) != 0)
		softdep_error("softdep_update_inodeblock: bwrite", error);
	if ((inodedep->id_state & DEPCOMPLETE) == 0)
		panic("softdep_update_inodeblock: update failed");
}

/*
 * Merge the new inode dependency list (id_newinoupdt) into the old
 * inode dependency list (id_inoupdt). This routine must be called
 * with splbio interrupts blocked.
 */
static void
merge_inode_lists(inodedep)
	struct inodedep *inodedep;
{
	struct allocdirect *listadp, *newadp;

	newadp = TAILQ_FIRST(&inodedep->id_newinoupdt);
	for (listadp = TAILQ_FIRST(&inodedep->id_inoupdt); listadp && newadp;) {
		if (listadp->ad_lbn < newadp->ad_lbn) {
			listadp = TAILQ_NEXT(listadp, ad_next);
			continue;
		}
		TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next);
		TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
		if (listadp->ad_lbn == newadp->ad_lbn) {
			allocdirect_merge(&inodedep->id_inoupdt, newadp,
			    listadp);
			listadp = newadp;
		}
		newadp = TAILQ_FIRST(&inodedep->id_newinoupdt);
	}
	while ((newadp = TAILQ_FIRST(&inodedep->id_newinoupdt)) != NULL) {
		TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next);
		TAILQ_INSERT_TAIL(&inodedep->id_inoupdt, newadp, ad_next);
	}
}

/*
 * If we are doing an fsync, then we must ensure that any directory
 * entries for the inode have been written after the inode gets to disk.
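 *
 * (An aside on merge_inode_lists() just above: it folds the "new"
 * allocdirect list into the current one, keeping the result sorted by
 * logical block number and merging the two entries when both lists name
 * the same lbn.  The fragment below is a self-contained, user-space
 * sketch of that merge, offered for illustration only; every name and
 * value in it is hypothetical and nothing in it is part of this file.)
 */
#if 0	/* illustrative sketch, never compiled */
#include <stdio.h>

static int
merge_lbns(const int *cur, int ncur, const int *add, int nadd, int *out)
{
	int i = 0, j = 0, n = 0;

	while (i < ncur && j < nadd) {
		if (cur[i] < add[j])
			out[n++] = cur[i++];
		else if (add[j] < cur[i])
			out[n++] = add[j++];
		else {
			/* Same lbn in both lists: merge into one entry. */
			out[n++] = cur[i++];
			j++;
		}
	}
	while (i < ncur)
		out[n++] = cur[i++];
	while (j < nadd)
		out[n++] = add[j++];
	return (n);
}

int
main(void)
{
	int cur[] = { 0, 2, 5 }, add[] = { 2, 3, 7 }, out[6];
	int i, n;

	n = merge_lbns(cur, 3, add, 3, out);
	for (i = 0; i < n; i++)
		printf("%d ", out[i]);		/* prints: 0 2 3 5 7 */
	printf("\n");
	return (0);
}
#endif
/*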
*/ int softdep_fsync(vp) struct vnode *vp; /* the "in_core" copy of the inode */ { struct diradd *dap, *olddap; struct inodedep *inodedep; struct pagedep *pagedep; struct worklist *wk; struct mount *mnt; struct vnode *pvp; struct inode *ip; struct buf *bp; struct fs *fs; struct proc *p = CURPROC; /* XXX */ int error, ret, flushparent; ino_t parentino; ufs_lbn_t lbn; ip = VTOI(vp); fs = ip->i_fs; for (error = 0, flushparent = 0, olddap = NULL; ; ) { ACQUIRE_LOCK(&lk); if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0) break; if (LIST_FIRST(&inodedep->id_inowait) != NULL || LIST_FIRST(&inodedep->id_bufwait) != NULL || TAILQ_FIRST(&inodedep->id_inoupdt) != NULL || TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL) panic("softdep_fsync: pending ops"); if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL) break; if (wk->wk_type != D_DIRADD) panic("softdep_fsync: Unexpected type %s", TYPENAME(wk->wk_type)); dap = WK_DIRADD(wk); /* * If we have failed to get rid of all the dependencies * then something is seriously wrong. */ if (dap == olddap) panic("softdep_fsync: flush failed"); olddap = dap; /* * Flush our parent if this directory entry * has a MKDIR_PARENT dependency. */ if (dap->da_state & DIRCHG) pagedep = dap->da_previous->dm_pagedep; else pagedep = dap->da_pagedep; mnt = pagedep->pd_mnt; parentino = pagedep->pd_ino; lbn = pagedep->pd_lbn; if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE) panic("softdep_fsync: dirty"); flushparent = dap->da_state & MKDIR_PARENT; /* * If we are being fsync'ed as part of vgone'ing this vnode, * then we will not be able to release and recover the * vnode below, so we just have to give up on writing its * directory entry out. It will eventually be written, just * not now, but then the user was not asking to have it * written, so we are not breaking any promises. */ if (vp->v_flag & VXLOCK) break; /* * We prevent deadlock by always fetching inodes from the * root, moving down the directory tree. Thus, when fetching * our parent directory, we must unlock ourselves before * requesting the lock on our parent. See the comment in * ufs_lookup for details on possible races. */ FREE_LOCK(&lk); VOP_UNLOCK(vp, 0, p); if ((error = VFS_VGET(mnt, parentino, &pvp)) != 0) { vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); return (error); } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); if (flushparent) { if ((error = UFS_UPDATE(pvp, 1)) != 0) { vput(pvp); return (error); } } /* * Flush directory page containing the inode's name. */ error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), p->p_ucred, &bp); ret = VOP_BWRITE(bp->b_vp, bp); vput(pvp); if (error != 0) return (error); if (ret != 0) return (ret); } FREE_LOCK(&lk); return (0); } /* * Flush all the dirty bitmaps associated with the block device * before flushing the rest of the dirty blocks so as to reduce * the number of dependencies that will have to be rolled back. */ void softdep_fsync_mountdev(vp) struct vnode *vp; { struct buf *bp, *nbp; struct worklist *wk; if (vp->v_type != VBLK) panic("softdep_fsync_mountdev: vnode not VBLK"); ACQUIRE_LOCK(&lk); for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { nbp = TAILQ_NEXT(bp, b_vnbufs); /* * If it is already scheduled, skip to the next buffer. */ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) continue; if ((bp->b_flags & B_DELWRI) == 0) panic("softdep_fsync_mountdev: not dirty"); /* * We are only interested in bitmaps with outstanding * dependencies. 
*/ if ((wk = LIST_FIRST(&bp->b_dep)) == NULL || wk->wk_type != D_BMSAFEMAP) { BUF_UNLOCK(bp); continue; } bremfree(bp); FREE_LOCK(&lk); (void) bawrite(bp); ACQUIRE_LOCK(&lk); /* * Since we may have slept during the I/O, we need * to start from a known point. */ nbp = TAILQ_FIRST(&vp->v_dirtyblkhd); } drain_output(vp, 1); FREE_LOCK(&lk); } /* * This routine is called when we are trying to synchronously flush a * file. This routine must eliminate any filesystem metadata dependencies * so that the syncing routine can succeed by pushing the dirty blocks * associated with the file. If any I/O errors occur, they are returned. */ int softdep_sync_metadata(ap) struct vop_fsync_args /* { struct vnode *a_vp; struct ucred *a_cred; int a_waitfor; struct proc *a_p; } */ *ap; { struct vnode *vp = ap->a_vp; struct pagedep *pagedep; struct allocdirect *adp; struct allocindir *aip; struct buf *bp, *nbp; struct worklist *wk; int i, error, waitfor; /* * Check whether this vnode is involved in a filesystem * that is doing soft dependency processing. */ if (vp->v_type != VBLK) { if (!DOINGSOFTDEP(vp)) return (0); } else if (vp->v_specmountpoint == NULL || (vp->v_specmountpoint->mnt_flag & MNT_SOFTDEP) == 0) return (0); /* * Ensure that any direct block dependencies have been cleared. */ ACQUIRE_LOCK(&lk); if ((error = flush_inodedep_deps(VTOI(vp)->i_fs, VTOI(vp)->i_number))) { FREE_LOCK(&lk); return (error); } /* * For most files, the only metadata dependencies are the * cylinder group maps that allocate their inode or blocks. * The block allocation dependencies can be found by traversing * the dependency lists for any buffers that remain on their * dirty buffer list. The inode allocation dependency will * be resolved when the inode is updated with MNT_WAIT. * This work is done in two passes. The first pass grabs most * of the buffers and begins asynchronously writing them. The * only way to wait for these asynchronous writes is to sleep * on the filesystem vnode which may stay busy for a long time * if the filesystem is active. So, instead, we make a second * pass over the dependencies blocking on each write. In the * usual case we will be blocking against a write that we * initiated, so when it is done the dependency will have been * resolved. Thus the second pass is expected to end quickly. */ waitfor = MNT_NOWAIT; top: if (getdirtybuf(&TAILQ_FIRST(&vp->v_dirtyblkhd), MNT_WAIT) == 0) { FREE_LOCK(&lk); return (0); } bp = TAILQ_FIRST(&vp->v_dirtyblkhd); loop: /* * As we hold the buffer locked, none of its dependencies * will disappear. 
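 *
 * Concretely (hypothetical numbers): with three dirty buffers carrying
 * unsatisfied dependencies, the MNT_NOWAIT pass issues three asynchronous
 * bawrite()s and never sleeps; by the time the MNT_WAIT pass revisits the
 * same dependencies most of those writes have finished, so it either finds
 * them already DEPCOMPLETE or blocks briefly in VOP_BWRITE on the few
 * still in flight.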
*/ for (wk = LIST_FIRST(&bp->b_dep); wk; wk = LIST_NEXT(wk, wk_list)) { switch (wk->wk_type) { case D_ALLOCDIRECT: adp = WK_ALLOCDIRECT(wk); if (adp->ad_state & DEPCOMPLETE) break; nbp = adp->ad_buf; if (getdirtybuf(&nbp, waitfor) == 0) break; FREE_LOCK(&lk); if (waitfor == MNT_NOWAIT) { bawrite(nbp); } else if ((error = VOP_BWRITE(nbp->b_vp, nbp)) != 0) { bawrite(bp); return (error); } ACQUIRE_LOCK(&lk); break; case D_ALLOCINDIR: aip = WK_ALLOCINDIR(wk); if (aip->ai_state & DEPCOMPLETE) break; nbp = aip->ai_buf; if (getdirtybuf(&nbp, waitfor) == 0) break; FREE_LOCK(&lk); if (waitfor == MNT_NOWAIT) { bawrite(nbp); } else if ((error = VOP_BWRITE(nbp->b_vp, nbp)) != 0) { bawrite(bp); return (error); } ACQUIRE_LOCK(&lk); break; case D_INDIRDEP: restart: for (aip = LIST_FIRST(&WK_INDIRDEP(wk)->ir_deplisthd); aip; aip = LIST_NEXT(aip, ai_next)) { if (aip->ai_state & DEPCOMPLETE) continue; nbp = aip->ai_buf; if (getdirtybuf(&nbp, MNT_WAIT) == 0) goto restart; FREE_LOCK(&lk); if ((error = VOP_BWRITE(nbp->b_vp, nbp)) != 0) { bawrite(bp); return (error); } ACQUIRE_LOCK(&lk); goto restart; } break; case D_INODEDEP: if ((error = flush_inodedep_deps(WK_INODEDEP(wk)->id_fs, WK_INODEDEP(wk)->id_ino)) != 0) { FREE_LOCK(&lk); bawrite(bp); return (error); } break; case D_PAGEDEP: /* * We are trying to sync a directory that may * have dependencies on both its own metadata * and/or dependencies on the inodes of any * recently allocated files. We walk its diradd * lists pushing out the associated inode. */ pagedep = WK_PAGEDEP(wk); for (i = 0; i < DAHASHSZ; i++) { if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0) continue; if ((error = flush_pagedep_deps(vp, pagedep->pd_mnt, &pagedep->pd_diraddhd[i]))) { FREE_LOCK(&lk); bawrite(bp); return (error); } } break; case D_MKDIR: /* * This case should never happen if the vnode has * been properly sync'ed. However, if this function * is used at a place where the vnode has not yet * been sync'ed, this dependency can show up. So, * rather than panic, just flush it. */ nbp = WK_MKDIR(wk)->md_buf; if (getdirtybuf(&nbp, waitfor) == 0) break; FREE_LOCK(&lk); if (waitfor == MNT_NOWAIT) { bawrite(nbp); } else if ((error = VOP_BWRITE(nbp->b_vp, nbp)) != 0) { bawrite(bp); return (error); } ACQUIRE_LOCK(&lk); break; case D_BMSAFEMAP: /* * This case should never happen if the vnode has * been properly sync'ed. However, if this function * is used at a place where the vnode has not yet * been sync'ed, this dependency can show up. So, * rather than panic, just flush it. */ nbp = WK_BMSAFEMAP(wk)->sm_buf; if (getdirtybuf(&nbp, waitfor) == 0) break; FREE_LOCK(&lk); if (waitfor == MNT_NOWAIT) { bawrite(nbp); } else if ((error = VOP_BWRITE(nbp->b_vp, nbp)) != 0) { bawrite(bp); return (error); } ACQUIRE_LOCK(&lk); break; default: panic("softdep_sync_metadata: Unknown type %s", TYPENAME(wk->wk_type)); /* NOTREACHED */ } } (void) getdirtybuf(&TAILQ_NEXT(bp, b_vnbufs), MNT_WAIT); nbp = TAILQ_NEXT(bp, b_vnbufs); FREE_LOCK(&lk); bawrite(bp); ACQUIRE_LOCK(&lk); if (nbp != NULL) { bp = nbp; goto loop; } /* * We must wait for any I/O in progress to finish so that * all potential buffers on the dirty list will be visible. * Once they are all there, proceed with the second pass * which will wait for the I/O as per above. */ drain_output(vp, 1); /* * The brief unlock is to allow any pent up dependency * processing to be done. 
*/ if (waitfor == MNT_NOWAIT) { waitfor = MNT_WAIT; FREE_LOCK(&lk); ACQUIRE_LOCK(&lk); goto top; } /* * If we have managed to get rid of all the dirty buffers, * then we are done. For certain directories and block * devices, we may need to do further work. */ if (TAILQ_FIRST(&vp->v_dirtyblkhd) == NULL) { FREE_LOCK(&lk); return (0); } FREE_LOCK(&lk); /* * If we are trying to sync a block device, some of its buffers may * contain metadata that cannot be written until the contents of some * partially written files have been written to disk. The only easy * way to accomplish this is to sync the entire filesystem (luckily * this happens rarely). */ if (vp->v_type == VBLK && vp->v_specmountpoint && !VOP_ISLOCKED(vp) && (error = VFS_SYNC(vp->v_specmountpoint, MNT_WAIT, ap->a_cred, ap->a_p)) != 0) return (error); return (0); } /* * Flush the dependencies associated with an inodedep. * Called with splbio blocked. */ static int flush_inodedep_deps(fs, ino) struct fs *fs; ino_t ino; { struct inodedep *inodedep; struct allocdirect *adp; int error, waitfor; struct buf *bp; /* * This work is done in two passes. The first pass grabs most * of the buffers and begins asynchronously writing them. The * only way to wait for these asynchronous writes is to sleep * on the filesystem vnode which may stay busy for a long time * if the filesystem is active. So, instead, we make a second * pass over the dependencies blocking on each write. In the * usual case we will be blocking against a write that we * initiated, so when it is done the dependency will have been * resolved. Thus the second pass is expected to end quickly. * We give a brief window at the top of the loop to allow * any pending I/O to complete. */ for (waitfor = MNT_NOWAIT; ; ) { FREE_LOCK(&lk); ACQUIRE_LOCK(&lk); if (inodedep_lookup(fs, ino, 0, &inodedep) == 0) return (0); for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = TAILQ_NEXT(adp, ad_next)) { if (adp->ad_state & DEPCOMPLETE) continue; bp = adp->ad_buf; if (getdirtybuf(&bp, waitfor) == 0) { if (waitfor == MNT_NOWAIT) continue; break; } FREE_LOCK(&lk); if (waitfor == MNT_NOWAIT) { bawrite(bp); } else if ((error = VOP_BWRITE(bp->b_vp, bp)) != 0) { ACQUIRE_LOCK(&lk); return (error); } ACQUIRE_LOCK(&lk); break; } if (adp != NULL) continue; for (adp = TAILQ_FIRST(&inodedep->id_newinoupdt); adp; adp = TAILQ_NEXT(adp, ad_next)) { if (adp->ad_state & DEPCOMPLETE) continue; bp = adp->ad_buf; if (getdirtybuf(&bp, waitfor) == 0) { if (waitfor == MNT_NOWAIT) continue; break; } FREE_LOCK(&lk); if (waitfor == MNT_NOWAIT) { bawrite(bp); } else if ((error = VOP_BWRITE(bp->b_vp, bp)) != 0) { ACQUIRE_LOCK(&lk); return (error); } ACQUIRE_LOCK(&lk); break; } if (adp != NULL) continue; /* * If pass2, we are done, otherwise do pass 2. */ if (waitfor == MNT_WAIT) break; waitfor = MNT_WAIT; } /* * Try freeing inodedep in case all dependencies have been removed. */ if (inodedep_lookup(fs, ino, 0, &inodedep) != 0) (void) free_inodedep(inodedep); return (0); } /* * Eliminate a pagedep dependency by flushing out all its diradd dependencies. * Called with splbio blocked. */ static int flush_pagedep_deps(pvp, mp, diraddhdp) struct vnode *pvp; struct mount *mp; struct diraddhd *diraddhdp; { struct proc *p = CURPROC; /* XXX */ struct inodedep *inodedep; struct ufsmount *ump; struct diradd *dap; struct vnode *vp; int gotit, error = 0; struct buf *bp; ino_t inum; ump = VFSTOUFS(mp); while ((dap = LIST_FIRST(diraddhdp)) != NULL) { /* * Flush ourselves if this directory entry * has a MKDIR_PARENT dependency. 
*/ if (dap->da_state & MKDIR_PARENT) { FREE_LOCK(&lk); if ((error = UFS_UPDATE(pvp, 1)) != 0) break; ACQUIRE_LOCK(&lk); /* * If that cleared dependencies, go on to next. */ if (dap != LIST_FIRST(diraddhdp)) continue; if (dap->da_state & MKDIR_PARENT) panic("flush_pagedep_deps: MKDIR"); } /* * Flush the file on which the directory entry depends. * If the inode has already been pushed out of the cache, * then all the block dependencies will have been flushed * leaving only inode dependencies (e.g., bitmaps). Thus, * we do a ufs_ihashget to check for the vnode in the cache. * If it is there, we do a full flush. If it is no longer * there we need only dispose of any remaining bitmap * dependencies and write the inode to disk. */ inum = dap->da_newinum; FREE_LOCK(&lk); if ((vp = ufs_ihashget(ump->um_dev, inum)) == NULL) { ACQUIRE_LOCK(&lk); if (inodedep_lookup(ump->um_fs, inum, 0, &inodedep) == 0 && dap == LIST_FIRST(diraddhdp)) panic("flush_pagedep_deps: flush 1 failed"); /* * If the inode still has bitmap dependencies, * push them to disk. */ if ((inodedep->id_state & DEPCOMPLETE) == 0) { gotit = getdirtybuf(&inodedep->id_buf,MNT_WAIT); FREE_LOCK(&lk); if (gotit && (error = VOP_BWRITE(inodedep->id_buf->b_vp, inodedep->id_buf)) != 0) break; ACQUIRE_LOCK(&lk); } if (dap != LIST_FIRST(diraddhdp)) continue; /* * If the inode is still sitting in a buffer waiting * to be written, push it to disk. */ FREE_LOCK(&lk); if ((error = bread(ump->um_devvp, fsbtodb(ump->um_fs, ino_to_fsba(ump->um_fs, inum)), (int)ump->um_fs->fs_bsize, NOCRED, &bp)) != 0) break; if ((error = VOP_BWRITE(bp->b_vp, bp)) != 0) break; ACQUIRE_LOCK(&lk); if (dap == LIST_FIRST(diraddhdp)) panic("flush_pagedep_deps: flush 2 failed"); continue; } if (vp->v_type == VDIR) { /* * A newly allocated directory must have its "." and * ".." entries written out before its name can be * committed in its parent. We do not want or need * the full semantics of a synchronous VOP_FSYNC as * that may end up here again, once for each directory * level in the filesystem. Instead, we push the blocks * and wait for them to clear. */ if ((error = VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p))) { vput(vp); break; } drain_output(vp, 0); } error = UFS_UPDATE(vp, 1); vput(vp); if (error) break; /* * If we have failed to get rid of all the dependencies * then something is seriously wrong. */ if (dap == LIST_FIRST(diraddhdp)) panic("flush_pagedep_deps: flush 3 failed"); ACQUIRE_LOCK(&lk); } if (error) ACQUIRE_LOCK(&lk); return (error); } /* * A large burst of file addition or deletion activity can drive the * memory load excessively high. Therefore we deliberately slow things * down and speed up the I/O processing if we find ourselves with too * many dependencies in progress. */ static int request_cleanup(resource, islocked) int resource; int islocked; { struct callout_handle handle; struct proc *p = CURPROC; /* * We never hold up the filesystem syncer process. */ if (p == filesys_syncer) return (0); /* * If we are resource constrained on inode dependencies, try * flushing some dirty inodes. Otherwise, we are constrained * by file deletions, so try accelerating flushes of directories * with removal dependencies. We would like to do the cleanup * here, but we probably hold an inode locked at this point and * that might deadlock against one that we try to clean. So, * the best that we can do is request the syncer daemon to do * the cleanup for us. 
*/ switch (resource) { case FLUSH_INODES: stat_ino_limit_push += 1; req_clear_inodedeps = 1; break; case FLUSH_REMOVE: stat_blk_limit_push += 1; req_clear_remove = 1; break; default: panic("request_cleanup: unknown type"); } /* * Hopefully the syncer daemon will catch up and awaken us. * We wait at most tickdelay before proceeding in any case. */ if (islocked == 0) ACQUIRE_LOCK(&lk); if (proc_waiting == 0) { proc_waiting = 1; handle = timeout(pause_timer, NULL, tickdelay > 2 ? tickdelay : 2); } FREE_LOCK_INTERLOCKED(&lk); (void) tsleep((caddr_t)&proc_waiting, PPAUSE | PCATCH, "softupdate", 0); ACQUIRE_LOCK_INTERLOCKED(&lk); if (proc_waiting) { untimeout(pause_timer, NULL, handle); proc_waiting = 0; } else { switch (resource) { case FLUSH_INODES: stat_ino_limit_hit += 1; break; case FLUSH_REMOVE: stat_blk_limit_hit += 1; break; } } if (islocked == 0) FREE_LOCK(&lk); return (1); } /* * Awaken processes pausing in request_cleanup and clear proc_waiting * to indicate that there is no longer a timer running. */ void pause_timer(arg) void *arg; { proc_waiting = 0; wakeup(&proc_waiting); } /* * Flush out a directory with at least one removal dependency in an effort * to reduce the number of freefile and freeblks dependency structures. */ static void clear_remove(p) struct proc *p; { struct pagedep_hashhead *pagedephd; struct pagedep *pagedep; static int next = 0; struct mount *mp; struct vnode *vp; int error, cnt; ino_t ino; ACQUIRE_LOCK(&lk); for (cnt = 0; cnt < pagedep_hash; cnt++) { pagedephd = &pagedep_hashtbl[next++]; if (next >= pagedep_hash) next = 0; for (pagedep = LIST_FIRST(pagedephd); pagedep; pagedep = LIST_NEXT(pagedep, pd_hash)) { if (LIST_FIRST(&pagedep->pd_dirremhd) == NULL) continue; mp = pagedep->pd_mnt; ino = pagedep->pd_ino; FREE_LOCK(&lk); if ((error = VFS_VGET(mp, ino, &vp)) != 0) { softdep_error("clear_remove: vget", error); return; } if ((error = VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p))) softdep_error("clear_remove: fsync", error); drain_output(vp, 0); vput(vp); return; } } FREE_LOCK(&lk); } /* * Clear out a block of dirty inodes in an effort to reduce * the number of inodedep dependency structures. */ static void clear_inodedeps(p) struct proc *p; { struct inodedep_hashhead *inodedephd; struct inodedep *inodedep; static int next = 0; struct mount *mp; struct vnode *vp; struct fs *fs; int error, cnt; ino_t firstino, lastino, ino; ACQUIRE_LOCK(&lk); /* * Pick a random inode dependency to be cleared. * We will then gather up all the inodes in its block * that have dependencies and flush them out. */ for (cnt = 0; cnt < inodedep_hash; cnt++) { inodedephd = &inodedep_hashtbl[next++]; if (next >= inodedep_hash) next = 0; if ((inodedep = LIST_FIRST(inodedephd)) != NULL) break; } /* * Ugly code to find mount point given pointer to superblock. */ fs = inodedep->id_fs; for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist; mp = CIRCLEQ_NEXT(mp, mnt_list)) if ((mp->mnt_flag & MNT_SOFTDEP) && fs == VFSTOUFS(mp)->um_fs) break; /* * Find the last inode in the block with dependencies. */ firstino = inodedep->id_ino & ~(INOPB(fs) - 1); for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--) if (inodedep_lookup(fs, lastino, 0, &inodedep) != 0) break; /* * Asynchronously push all but the last inode with dependencies. * Synchronously push the last inode with dependencies to ensure * that the inode block gets written to free up the inodedeps. 
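 *
 * For example (hypothetical numbers), with INOPB(fs) == 64 and a victim
 * inodedep for inode 200, firstino becomes 200 & ~63 == 192, and the scan
 * above walks lastino down from 255 to the highest inode in that block
 * which still has an inodedep; the loop below then pushes each inode in
 * 192..lastino that has dependencies, doing only the last one
 * synchronously.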
*/ for (ino = firstino; ino <= lastino; ino++) { if (inodedep_lookup(fs, ino, 0, &inodedep) == 0) continue; FREE_LOCK(&lk); if ((error = VFS_VGET(mp, ino, &vp)) != 0) { softdep_error("clear_inodedeps: vget", error); return; } if (ino == lastino) { if ((error = VOP_FSYNC(vp, p->p_ucred, MNT_WAIT, p))) softdep_error("clear_inodedeps: fsync1", error); } else { if ((error = VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p))) softdep_error("clear_inodedeps: fsync2", error); drain_output(vp, 0); } vput(vp); ACQUIRE_LOCK(&lk); } FREE_LOCK(&lk); } /* * Acquire exclusive access to a buffer. * Must be called with splbio blocked. * Return 1 if buffer was acquired. */ static int getdirtybuf(bpp, waitfor) struct buf **bpp; int waitfor; { struct buf *bp; for (;;) { if ((bp = *bpp) == NULL) return (0); if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) == 0) break; if (waitfor != MNT_WAIT) return (0); FREE_LOCK_INTERLOCKED(&lk); if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL) != ENOLCK) panic("getdirtybuf: inconsistent lock"); ACQUIRE_LOCK_INTERLOCKED(&lk); } if ((bp->b_flags & B_DELWRI) == 0) { BUF_UNLOCK(bp); return (0); } bremfree(bp); return (1); } /* * Wait for pending output on a vnode to complete. * Must be called with vnode locked. */ static void drain_output(vp, islocked) struct vnode *vp; int islocked; { if (!islocked) ACQUIRE_LOCK(&lk); while (vp->v_numoutput) { vp->v_flag |= VBWAIT; FREE_LOCK_INTERLOCKED(&lk); tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "drainvp", 0); ACQUIRE_LOCK_INTERLOCKED(&lk); } if (!islocked) FREE_LOCK(&lk); } /* * Called whenever a buffer that is being invalidated or reallocated * contains dependencies. This should only happen if an I/O error has * occurred. The routine is called with the buffer locked. */ void softdep_deallocate_dependencies(bp) struct buf *bp; { if ((bp->b_flags & B_ERROR) == 0) panic("softdep_deallocate_dependencies: dangling deps"); softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error); panic("softdep_deallocate_dependencies: unrecovered I/O error"); } /* * Function to handle asynchronous write errors in the filesystem. */ void softdep_error(func, error) char *func; int error; { /* XXX should do something better! */ printf("%s: got error %d while accessing filesystem\n", func, error); } Index: head/sys/ufs/ffs/ffs_vfsops.c =================================================================== --- head/sys/ufs/ffs/ffs_vfsops.c (revision 49534) +++ head/sys/ufs/ffs/ffs_vfsops.c (revision 49535) @@ -1,1299 +1,1297 @@ /* * Copyright (c) 1989, 1991, 1993, 1994 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_vfsops.c 8.31 (Berkeley) 5/20/95 - * $Id: ffs_vfsops.c,v 1.99 1999/05/31 11:29:24 phk Exp $ + * $Id: ffs_vfsops.c,v 1.100 1999/07/11 19:16:50 phk Exp $ */ #include "opt_quota.h" #include #include #include #include #include #include #include #include #include #include #include #include - -#include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_FFSNODE, "FFS node", "FFS vnode private part"); static int ffs_sbupdate __P((struct ufsmount *, int)); static int ffs_reload __P((struct mount *,struct ucred *,struct proc *)); static int ffs_oldfscompat __P((struct fs *)); static int ffs_mount __P((struct mount *, char *, caddr_t, struct nameidata *, struct proc *)); static int ffs_init __P((struct vfsconf *)); static struct vfsops ufs_vfsops = { ffs_mount, ufs_start, ffs_unmount, ufs_root, ufs_quotactl, ffs_statfs, ffs_sync, ffs_vget, ffs_fhtovp, ffs_vptofh, ffs_init, }; VFS_SET(ufs_vfsops, ufs, 0); /* * ffs_mount * * Called when mounting local physical media * * PARAMETERS: * mountroot * mp mount point structure * path NULL (flag for root mount!!!) * data * ndp * p process (user credentials check [statfs]) * * mount * mp mount point structure * path path to mount point * data pointer to argument struct in user space * ndp mount point namei() return (used for * credentials on reload), reused to look * up block device. * p process (user credentials check) * * RETURNS: 0 Success * !0 error number (errno.h) * * LOCK STATE: * * ENTRY * mount point is locked * EXIT * mount point is locked * * NOTES: * A NULL path can be used for a flag since the mount * system call will fail with EFAULT in copyinstr in * namei() if it is a genuine NULL from the user. 
*/ static int ffs_mount( mp, path, data, ndp, p) struct mount *mp; /* mount struct pointer*/ char *path; /* path to mount point*/ caddr_t data; /* arguments to FS specific mount*/ struct nameidata *ndp; /* mount point credentials*/ struct proc *p; /* process requesting mount*/ { size_t size; int err = 0; struct vnode *devvp; struct ufs_args args; struct ufsmount *ump = 0; register struct fs *fs; int error, flags, ronly = 0; mode_t accessmode; /* * Use NULL path to flag a root mount */ if( path == NULL) { /* *** * Mounting root file system *** */ if ((err = bdevvp(rootdev, &rootvp))) { printf("ffs_mountroot: can't find rootvp"); return (err); } if (bdevsw(rootdev)->d_flags & D_NOCLUSTERR) mp->mnt_flag |= MNT_NOCLUSTERR; if (bdevsw(rootdev)->d_flags & D_NOCLUSTERW) mp->mnt_flag |= MNT_NOCLUSTERW; if( ( err = ffs_mountfs(rootvp, mp, p, M_FFSNODE)) != 0) { /* fs specific cleanup (if any)*/ goto error_1; } goto dostatfs; /* success*/ } /* *** * Mounting non-root file system or updating a file system *** */ /* copy in user arguments*/ err = copyin(data, (caddr_t)&args, sizeof (struct ufs_args)); if (err) goto error_1; /* can't get arguments*/ /* * If updating, check whether changing from read-only to * read/write; if there is no device name, that's all we do. * Disallow clearing MNT_NOCLUSTERR and MNT_NOCLUSTERW flags, * if block device requests. */ if (mp->mnt_flag & MNT_UPDATE) { ump = VFSTOUFS(mp); fs = ump->um_fs; devvp = ump->um_devvp; err = 0; ronly = fs->fs_ronly; /* MNT_RELOAD might change this */ if (bdevsw(ump->um_dev)->d_flags & D_NOCLUSTERR) mp->mnt_flag |= MNT_NOCLUSTERR; if (bdevsw(ump->um_dev)->d_flags & D_NOCLUSTERW) mp->mnt_flag |= MNT_NOCLUSTERW; if (ronly == 0 && (mp->mnt_flag & MNT_RDONLY)) { flags = WRITECLOSE; if (mp->mnt_flag & MNT_FORCE) flags |= FORCECLOSE; if (mp->mnt_flag & MNT_SOFTDEP) { err = softdep_flushfiles(mp, flags, p); } else { err = ffs_flushfiles(mp, flags, p); } ronly = 1; } if (!err && (mp->mnt_flag & MNT_RELOAD)) err = ffs_reload(mp, ndp->ni_cnd.cn_cred, p); if (err) { goto error_1; } if (ronly && (mp->mnt_kern_flag & MNTK_WANTRDWR)) { /* * If upgrade to read-write by non-root, then verify * that user has necessary permissions on the device. */ if (p->p_ucred->cr_uid != 0) { vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, p); if ((error = VOP_ACCESS(devvp, VREAD | VWRITE, p->p_ucred, p)) != 0) { VOP_UNLOCK(devvp, 0, p); return (error); } VOP_UNLOCK(devvp, 0, p); } if (fs->fs_clean == 0) { if (mp->mnt_flag & MNT_FORCE) { printf( "WARNING: %s was not properly dismounted\n", fs->fs_fsmnt); } else { printf( "WARNING: R/W mount of %s denied. Filesystem is not clean - run fsck\n", fs->fs_fsmnt); err = EPERM; goto error_1; } } /* check to see if we need to start softdep */ if (fs->fs_flags & FS_DOSOFTDEP) { err = softdep_mount(devvp, mp, fs, p->p_ucred); if (err) goto error_1; } ronly = 0; } /* * Soft updates is incompatible with "async", * so if we are doing softupdates stop the user * from setting the async flag in an update. * Softdep_mount() clears it in an initial mount * or ro->rw remount. */ if (mp->mnt_flag & MNT_SOFTDEP) { mp->mnt_flag &= ~MNT_ASYNC; } /* if not updating name...*/ if (args.fspec == 0) { /* * Process export requests. Jumping to "success" * will return the vfs_export() error code. */ err = vfs_export(mp, &ump->um_export, &args.export); goto success; } } /* * Not an update, or updating the name: look up the name * and verify that it refers to a sensible block device. 
*/ NDINIT(ndp, LOOKUP, FOLLOW, UIO_USERSPACE, args.fspec, p); err = namei(ndp); if (err) { /* can't get devvp!*/ goto error_1; } devvp = ndp->ni_vp; if (devvp->v_type != VBLK) { err = ENOTBLK; goto error_2; } if (bdevsw(devvp->v_rdev) == NULL) { err = ENXIO; goto error_2; } /* * If mount by non-root, then verify that user has necessary * permissions on the device. */ if (p->p_ucred->cr_uid != 0) { accessmode = VREAD; if ((mp->mnt_flag & MNT_RDONLY) == 0) accessmode |= VWRITE; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, p); if ((error = VOP_ACCESS(devvp, accessmode, p->p_ucred, p)) != 0) { vput(devvp); return (error); } VOP_UNLOCK(devvp, 0, p); } if (mp->mnt_flag & MNT_UPDATE) { /* ******************** * UPDATE * If it's not the same vnode, or at least the same device * then it's not correct. ******************** */ if (devvp != ump->um_devvp) { if ( devvp->v_rdev == ump->um_devvp->v_rdev) { vrele(devvp); } else { err = EINVAL; /* needs translation */ } } else vrele(devvp); /* * Update device name only on success */ if( !err) { /* Save "mounted from" info for mount point (NULL pad)*/ copyinstr( args.fspec, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, &size); bzero( mp->mnt_stat.f_mntfromname + size, MNAMELEN - size); } } else { /* ******************** * NEW MOUNT ******************** */ if (bdevsw(devvp->v_rdev)->d_flags & D_NOCLUSTERR) mp->mnt_flag |= MNT_NOCLUSTERR; if (bdevsw(devvp->v_rdev)->d_flags & D_NOCLUSTERW) mp->mnt_flag |= MNT_NOCLUSTERW; /* * Since this is a new mount, we want the names for * the device and the mount point copied in. If an * error occurs, the mountpoint is discarded by the * upper level code. */ /* Save "last mounted on" info for mount point (NULL pad)*/ copyinstr( path, /* mount point*/ mp->mnt_stat.f_mntonname, /* save area*/ MNAMELEN - 1, /* max size*/ &size); /* real size*/ bzero( mp->mnt_stat.f_mntonname + size, MNAMELEN - size); /* Save "mounted from" info for mount point (NULL pad)*/ copyinstr( args.fspec, /* device name*/ mp->mnt_stat.f_mntfromname, /* save area*/ MNAMELEN - 1, /* max size*/ &size); /* real size*/ bzero( mp->mnt_stat.f_mntfromname + size, MNAMELEN - size); err = ffs_mountfs(devvp, mp, p, M_FFSNODE); } if (err) { goto error_2; } dostatfs: /* * Initialize FS stat information in mount struct; uses both * mp->mnt_stat.f_mntonname and mp->mnt_stat.f_mntfromname * * This code is common to root and non-root mounts */ (void)VFS_STATFS(mp, &mp->mnt_stat, p); goto success; error_2: /* error with devvp held*/ /* release devvp before failing*/ vrele(devvp); error_1: /* no state to back out*/ success: if (!err && path && (mp->mnt_flag & MNT_UPDATE)) { /* Update clean flag after changing read-onlyness. */ fs = ump->um_fs; if (ronly != fs->fs_ronly) { fs->fs_ronly = ronly; fs->fs_clean = ronly && (fs->fs_flags & FS_UNCLEAN) == 0 ? 1 : 0; ffs_sbupdate(ump, MNT_WAIT); } } return (err); } /* * Reload all incore data for a filesystem (used after running fsck on * the root filesystem and finding things to fix). The filesystem must * be mounted read-only. * * Things to do to update the mount: * 1) invalidate all cached meta-data. * 2) re-read superblock from disk. * 3) re-read summary information from disk. * 4) invalidate all inactive vnodes. * 5) invalidate all cached file data. * 6) re-read inode data for all active vnodes. 
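 *
 * (Steps 1-3 below operate on the device vnode; steps 4-6 are then carried
 * out in a single pass over the mount's vnode list, restarting that pass
 * whenever the list changes underneath us.)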
*/ static int ffs_reload(mp, cred, p) register struct mount *mp; struct ucred *cred; struct proc *p; { register struct vnode *vp, *nvp, *devvp; struct inode *ip; struct csum *space; struct buf *bp; struct fs *fs, *newfs; struct partinfo dpart; dev_t dev; int i, blks, size, error; int32_t *lp; if ((mp->mnt_flag & MNT_RDONLY) == 0) return (EINVAL); /* * Step 1: invalidate all cached meta-data. */ devvp = VFSTOUFS(mp)->um_devvp; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, p); error = vinvalbuf(devvp, 0, cred, p, 0, 0); VOP_UNLOCK(devvp, 0, p); if (error) panic("ffs_reload: dirty1"); dev = devvp->v_rdev; /* * Only VMIO the backing device if the backing device is a real * block device. See ffs_mountmfs() for more details. */ if (devvp->v_tag != VT_MFS && devvp->v_type == VBLK) { vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, p); vfs_object_create(devvp, p, p->p_ucred); simple_lock(&devvp->v_interlock); VOP_UNLOCK(devvp, LK_INTERLOCK, p); } /* * Step 2: re-read superblock from disk. */ if (VOP_IOCTL(devvp, DIOCGPART, (caddr_t)&dpart, FREAD, NOCRED, p) != 0) size = DEV_BSIZE; else size = dpart.disklab->d_secsize; if ((error = bread(devvp, (ufs_daddr_t)(SBOFF/size), SBSIZE, NOCRED,&bp)) != 0) return (error); newfs = (struct fs *)bp->b_data; if (newfs->fs_magic != FS_MAGIC || newfs->fs_bsize > MAXBSIZE || newfs->fs_bsize < sizeof(struct fs)) { brelse(bp); return (EIO); /* XXX needs translation */ } fs = VFSTOUFS(mp)->um_fs; /* * Copy pointer fields back into superblock before copying in XXX * new superblock. These should really be in the ufsmount. XXX * Note that important parameters (eg fs_ncg) are unchanged. */ bcopy(&fs->fs_csp[0], &newfs->fs_csp[0], sizeof(fs->fs_csp)); newfs->fs_maxcluster = fs->fs_maxcluster; bcopy(newfs, fs, (u_int)fs->fs_sbsize); if (fs->fs_sbsize < SBSIZE) bp->b_flags |= B_INVAL; brelse(bp); mp->mnt_maxsymlinklen = fs->fs_maxsymlinklen; ffs_oldfscompat(fs); /* * Step 3: re-read summary information from disk. */ blks = howmany(fs->fs_cssize, fs->fs_fsize); space = fs->fs_csp[0]; for (i = 0; i < blks; i += fs->fs_frag) { size = fs->fs_bsize; if (i + fs->fs_frag > blks) size = (blks - i) * fs->fs_fsize; error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), size, NOCRED, &bp); if (error) return (error); bcopy(bp->b_data, fs->fs_csp[fragstoblks(fs, i)], (u_int)size); brelse(bp); } /* * We no longer know anything about clusters per cylinder group. */ if (fs->fs_contigsumsize > 0) { lp = fs->fs_maxcluster; for (i = 0; i < fs->fs_ncg; i++) *lp++ = fs->fs_contigsumsize; } loop: simple_lock(&mntvnode_slock); for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) { if (vp->v_mount != mp) { simple_unlock(&mntvnode_slock); goto loop; } nvp = vp->v_mntvnodes.le_next; /* * Step 4: invalidate all inactive vnodes. */ if (vrecycle(vp, &mntvnode_slock, p)) goto loop; /* * Step 5: invalidate all cached file data. */ simple_lock(&vp->v_interlock); simple_unlock(&mntvnode_slock); if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, p)) { goto loop; } if (vinvalbuf(vp, 0, cred, p, 0, 0)) panic("ffs_reload: dirty2"); /* * Step 6: re-read inode data for all active vnodes. 
*/ ip = VTOI(vp); error = bread(devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), (int)fs->fs_bsize, NOCRED, &bp); if (error) { vput(vp); return (error); } ip->i_din = *((struct dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number)); ip->i_effnlink = ip->i_nlink; brelse(bp); vput(vp); simple_lock(&mntvnode_slock); } simple_unlock(&mntvnode_slock); return (0); } /* * Common code for mount and mountroot */ int ffs_mountfs(devvp, mp, p, malloctype) register struct vnode *devvp; struct mount *mp; struct proc *p; struct malloc_type *malloctype; { register struct ufsmount *ump; struct buf *bp; register struct fs *fs; dev_t dev; struct partinfo dpart; caddr_t base, space; int error, i, blks, size, ronly; int32_t *lp; struct ucred *cred; u_int64_t maxfilesize; /* XXX */ size_t strsize; int ncount; dev = devvp->v_rdev; cred = p ? p->p_ucred : NOCRED; /* * Disallow multiple mounts of the same device. * Disallow mounting of a device that is currently in use * (except for root, which might share swap device for miniroot). * Flush out any old buffers remaining from a previous use. */ error = vfs_mountedon(devvp); if (error) return (error); ncount = vcount(devvp); if (ncount > 1 && devvp != rootvp) return (EBUSY); vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, p); error = vinvalbuf(devvp, V_SAVE, cred, p, 0, 0); VOP_UNLOCK(devvp, 0, p); if (error) return (error); /* * Only VMIO the backing device if the backing device is a real * block device. This excludes the original MFS implementation. * Note that it is optional that the backing device be VMIOed. This * increases the opportunity for metadata caching. */ if (devvp->v_tag != VT_MFS && devvp->v_type == VBLK) { vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, p); vfs_object_create(devvp, p, p->p_ucred); simple_lock(&devvp->v_interlock); VOP_UNLOCK(devvp, LK_INTERLOCK, p); } ronly = (mp->mnt_flag & MNT_RDONLY) != 0; error = VOP_OPEN(devvp, ronly ? FREAD : FREAD|FWRITE, FSCRED, p); if (error) return (error); if (VOP_IOCTL(devvp, DIOCGPART, (caddr_t)&dpart, FREAD, cred, p) != 0) size = DEV_BSIZE; else size = dpart.disklab->d_secsize; bp = NULL; ump = NULL; if ((error = bread(devvp, SBLOCK, SBSIZE, cred, &bp)) != 0) goto out; fs = (struct fs *)bp->b_data; if (fs->fs_magic != FS_MAGIC || fs->fs_bsize > MAXBSIZE || fs->fs_bsize < sizeof(struct fs)) { error = EINVAL; /* XXX needs translation */ goto out; } fs->fs_fmod = 0; fs->fs_flags &= ~FS_UNCLEAN; if (fs->fs_clean == 0) { fs->fs_flags |= FS_UNCLEAN; if (ronly || (mp->mnt_flag & MNT_FORCE)) { printf( "WARNING: %s was not properly dismounted\n", fs->fs_fsmnt); } else { printf( "WARNING: R/W mount of %s denied. 
Filesystem is not clean - run fsck\n", fs->fs_fsmnt); error = EPERM; goto out; } } /* XXX updating 4.2 FFS superblocks trashes rotational layout tables */ if (fs->fs_postblformat == FS_42POSTBLFMT && !ronly) { error = EROFS; /* needs translation */ goto out; } ump = malloc(sizeof *ump, M_UFSMNT, M_WAITOK); bzero((caddr_t)ump, sizeof *ump); ump->um_malloctype = malloctype; ump->um_fs = malloc((u_long)fs->fs_sbsize, M_UFSMNT, M_WAITOK); ump->um_blkatoff = ffs_blkatoff; ump->um_truncate = ffs_truncate; ump->um_update = ffs_update; ump->um_valloc = ffs_valloc; ump->um_vfree = ffs_vfree; bcopy(bp->b_data, ump->um_fs, (u_int)fs->fs_sbsize); if (fs->fs_sbsize < SBSIZE) bp->b_flags |= B_INVAL; brelse(bp); bp = NULL; fs = ump->um_fs; fs->fs_ronly = ronly; if (ronly == 0) { fs->fs_fmod = 1; fs->fs_clean = 0; } size = fs->fs_cssize; blks = howmany(size, fs->fs_fsize); if (fs->fs_contigsumsize > 0) size += fs->fs_ncg * sizeof(int32_t); base = space = malloc((u_long)size, M_UFSMNT, M_WAITOK); for (i = 0; i < blks; i += fs->fs_frag) { size = fs->fs_bsize; if (i + fs->fs_frag > blks) size = (blks - i) * fs->fs_fsize; if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), size, cred, &bp)) != 0) { free(base, M_UFSMNT); goto out; } bcopy(bp->b_data, space, (u_int)size); fs->fs_csp[fragstoblks(fs, i)] = (struct csum *)space; space += size; brelse(bp); bp = NULL; } if (fs->fs_contigsumsize > 0) { fs->fs_maxcluster = lp = (int32_t *)space; for (i = 0; i < fs->fs_ncg; i++) *lp++ = fs->fs_contigsumsize; } mp->mnt_data = (qaddr_t)ump; mp->mnt_stat.f_fsid.val[0] = fs->fs_id[0]; mp->mnt_stat.f_fsid.val[1] = fs->fs_id[1]; if (fs->fs_id[0] == 0 || fs->fs_id[1] == 0 || vfs_getvfs(&mp->mnt_stat.f_fsid)) vfs_getnewfsid(mp); mp->mnt_maxsymlinklen = fs->fs_maxsymlinklen; mp->mnt_flag |= MNT_LOCAL; ump->um_mountp = mp; ump->um_dev = dev; ump->um_devvp = devvp; ump->um_nindir = fs->fs_nindir; ump->um_bptrtodb = fs->fs_fsbtodb; ump->um_seqinc = fs->fs_frag; for (i = 0; i < MAXQUOTAS; i++) ump->um_quotas[i] = NULLVP; devvp->v_specmountpoint = mp; ffs_oldfscompat(fs); /* * Set FS local "last mounted on" information (NULL pad) */ copystr( mp->mnt_stat.f_mntonname, /* mount point*/ fs->fs_fsmnt, /* copy area*/ sizeof(fs->fs_fsmnt) - 1, /* max size*/ &strsize); /* real size*/ bzero( fs->fs_fsmnt + strsize, sizeof(fs->fs_fsmnt) - strsize); if( mp->mnt_flag & MNT_ROOTFS) { /* * Root mount; update timestamp in mount structure. * this will be used by the common root mount code * to update the system clock. */ mp->mnt_time = fs->fs_time; } ump->um_savedmaxfilesize = fs->fs_maxfilesize; /* XXX */ maxfilesize = (u_int64_t)0x40000000 * fs->fs_bsize - 1; /* XXX */ if (fs->fs_maxfilesize > maxfilesize) /* XXX */ fs->fs_maxfilesize = maxfilesize; /* XXX */ if (ronly == 0) { if ((fs->fs_flags & FS_DOSOFTDEP) && (error = softdep_mount(devvp, mp, fs, cred)) != 0) { free(base, M_UFSMNT); goto out; } fs->fs_clean = 0; (void) ffs_sbupdate(ump, MNT_WAIT); } return (0); out: devvp->v_specmountpoint = NULL; if (bp) brelse(bp); (void)VOP_CLOSE(devvp, ronly ? FREAD : FREAD|FWRITE, cred, p); if (ump) { free(ump->um_fs, M_UFSMNT); free(ump, M_UFSMNT); mp->mnt_data = (qaddr_t)0; } return (error); } /* * Sanity checks for old file systems. * * XXX - goes away some day. 
*/ static int ffs_oldfscompat(fs) struct fs *fs; { fs->fs_npsect = max(fs->fs_npsect, fs->fs_nsect); /* XXX */ fs->fs_interleave = max(fs->fs_interleave, 1); /* XXX */ if (fs->fs_postblformat == FS_42POSTBLFMT) /* XXX */ fs->fs_nrpos = 8; /* XXX */ if (fs->fs_inodefmt < FS_44INODEFMT) { /* XXX */ #if 0 int i; /* XXX */ u_int64_t sizepb = fs->fs_bsize; /* XXX */ /* XXX */ fs->fs_maxfilesize = fs->fs_bsize * NDADDR - 1; /* XXX */ for (i = 0; i < NIADDR; i++) { /* XXX */ sizepb *= NINDIR(fs); /* XXX */ fs->fs_maxfilesize += sizepb; /* XXX */ } /* XXX */ #endif fs->fs_maxfilesize = (u_quad_t) 1LL << 39; fs->fs_qbmask = ~fs->fs_bmask; /* XXX */ fs->fs_qfmask = ~fs->fs_fmask; /* XXX */ } /* XXX */ return (0); } /* * unmount system call */ int ffs_unmount(mp, mntflags, p) struct mount *mp; int mntflags; struct proc *p; { register struct ufsmount *ump; register struct fs *fs; int error, flags; flags = 0; if (mntflags & MNT_FORCE) { flags |= FORCECLOSE; } if (mp->mnt_flag & MNT_SOFTDEP) { if ((error = softdep_flushfiles(mp, flags, p)) != 0) return (error); } else { if ((error = ffs_flushfiles(mp, flags, p)) != 0) return (error); } ump = VFSTOUFS(mp); fs = ump->um_fs; if (fs->fs_ronly == 0) { fs->fs_clean = fs->fs_flags & FS_UNCLEAN ? 0 : 1; error = ffs_sbupdate(ump, MNT_WAIT); if (error) { fs->fs_clean = 0; return (error); } } ump->um_devvp->v_specmountpoint = NULL; vinvalbuf(ump->um_devvp, V_SAVE, NOCRED, p, 0, 0); error = VOP_CLOSE(ump->um_devvp, fs->fs_ronly ? FREAD : FREAD|FWRITE, NOCRED, p); vrele(ump->um_devvp); free(fs->fs_csp[0], M_UFSMNT); free(fs, M_UFSMNT); free(ump, M_UFSMNT); mp->mnt_data = (qaddr_t)0; mp->mnt_flag &= ~MNT_LOCAL; return (error); } /* * Flush out all the files in a filesystem. */ int ffs_flushfiles(mp, flags, p) register struct mount *mp; int flags; struct proc *p; { register struct ufsmount *ump; int error; ump = VFSTOUFS(mp); #ifdef QUOTA if (mp->mnt_flag & MNT_QUOTA) { int i; error = vflush(mp, NULLVP, SKIPSYSTEM|flags); if (error) return (error); for (i = 0; i < MAXQUOTAS; i++) { if (ump->um_quotas[i] == NULLVP) continue; quotaoff(p, mp, i); } /* * Here we fall through to vflush again to ensure * that we have gotten rid of all the system vnodes. */ } #endif /* * Flush all the files. */ if ((error = vflush(mp, NULL, flags)) != 0) return (error); /* * Flush filesystem metadata. */ vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY, p); error = VOP_FSYNC(ump->um_devvp, p->p_ucred, MNT_WAIT, p); VOP_UNLOCK(ump->um_devvp, 0, p); return (error); } /* * Get file system statistics. */ int ffs_statfs(mp, sbp, p) struct mount *mp; register struct statfs *sbp; struct proc *p; { register struct ufsmount *ump; register struct fs *fs; ump = VFSTOUFS(mp); fs = ump->um_fs; if (fs->fs_magic != FS_MAGIC) panic("ffs_statfs"); sbp->f_bsize = fs->fs_fsize; sbp->f_iosize = fs->fs_bsize; sbp->f_blocks = fs->fs_dsize; sbp->f_bfree = fs->fs_cstotal.cs_nbfree * fs->fs_frag + fs->fs_cstotal.cs_nffree; sbp->f_bavail = freespace(fs, fs->fs_minfree); sbp->f_files = fs->fs_ncg * fs->fs_ipg - ROOTINO; sbp->f_ffree = fs->fs_cstotal.cs_nifree; if (sbp != &mp->mnt_stat) { sbp->f_type = mp->mnt_vfc->vfc_typenum; bcopy((caddr_t)mp->mnt_stat.f_mntonname, (caddr_t)&sbp->f_mntonname[0], MNAMELEN); bcopy((caddr_t)mp->mnt_stat.f_mntfromname, (caddr_t)&sbp->f_mntfromname[0], MNAMELEN); } return (0); } /* * Go through the disk queues to initiate sandbagged IO; * go through the inodes to write those that have been modified; * initiate the writing of the super block if it has been modified. 
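 *
 * Roughly, the pass below walks mp->mnt_vnodelist under mntvnode_slock,
 * skipping vnodes whose inode has none of IN_ACCESS, IN_CHANGE,
 * IN_MODIFIED or IN_UPDATE set and whose dirty buffer list is empty.
 * Each remaining vnode gets vget()/VOP_FSYNC()/vrele(); character-device
 * vnodes are just pushed with UFS_UPDATE().  After the loop the
 * filesystem's own device vnode is fsync'ed (unless MNT_LAZY) and a
 * modified superblock is rewritten via ffs_sbupdate().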
* * Note: we are always called with the filesystem marked `MPBUSY'. */ int ffs_sync(mp, waitfor, cred, p) struct mount *mp; int waitfor; struct ucred *cred; struct proc *p; { struct vnode *nvp, *vp; struct inode *ip; struct ufsmount *ump = VFSTOUFS(mp); struct fs *fs; int error, allerror = 0; fs = ump->um_fs; if (fs->fs_fmod != 0 && fs->fs_ronly != 0) { /* XXX */ printf("fs = %s\n", fs->fs_fsmnt); panic("ffs_sync: rofs mod"); } /* * Write back each (modified) inode. */ simple_lock(&mntvnode_slock); loop: for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) { /* * If the vnode that we are about to sync is no longer * associated with this mount point, start over. */ if (vp->v_mount != mp) goto loop; simple_lock(&vp->v_interlock); nvp = vp->v_mntvnodes.le_next; ip = VTOI(vp); if ((vp->v_type == VNON) || (((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0) && (TAILQ_EMPTY(&vp->v_dirtyblkhd) || (waitfor == MNT_LAZY)))) { simple_unlock(&vp->v_interlock); continue; } if (vp->v_type != VCHR) { simple_unlock(&mntvnode_slock); error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, p); if (error) { simple_lock(&mntvnode_slock); if (error == ENOENT) goto loop; continue; } if ((error = VOP_FSYNC(vp, cred, waitfor, p)) != 0) allerror = error; VOP_UNLOCK(vp, 0, p); vrele(vp); simple_lock(&mntvnode_slock); } else { simple_unlock(&mntvnode_slock); simple_unlock(&vp->v_interlock); /* UFS_UPDATE(vp, waitfor == MNT_WAIT); */ UFS_UPDATE(vp, 0); simple_lock(&mntvnode_slock); } } simple_unlock(&mntvnode_slock); /* * Force stale file system control information to be flushed. */ if (waitfor != MNT_LAZY) { if (ump->um_mountp->mnt_flag & MNT_SOFTDEP) waitfor = MNT_NOWAIT; vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY, p); if ((error = VOP_FSYNC(ump->um_devvp, cred, waitfor, p)) != 0) allerror = error; VOP_UNLOCK(ump->um_devvp, 0, p); } #ifdef QUOTA qsync(mp); #endif /* * Write back modified superblock. */ if (fs->fs_fmod != 0 && (error = ffs_sbupdate(ump, waitfor)) != 0) allerror = error; return (allerror); } /* * Look up a FFS dinode number to find its incore vnode, otherwise read it * in from disk. If it is in core, wait for the lock bit to clear, then * return the inode locked. Detection and handling of mount points must be * done by the calling routine. */ static int ffs_inode_hash_lock; int ffs_vget(mp, ino, vpp) struct mount *mp; ino_t ino; struct vnode **vpp; { struct fs *fs; struct inode *ip; struct ufsmount *ump; struct buf *bp; struct vnode *vp; dev_t dev; int error; ump = VFSTOUFS(mp); dev = ump->um_dev; restart: if ((*vpp = ufs_ihashget(dev, ino)) != NULL) { return (0); } /* * Lock out the creation of new entries in the FFS hash table in * case getnewvnode() or MALLOC() blocks, otherwise a duplicate * may occur! */ if (ffs_inode_hash_lock) { while (ffs_inode_hash_lock) { ffs_inode_hash_lock = -1; tsleep(&ffs_inode_hash_lock, PVM, "ffsvgt", 0); } goto restart; } ffs_inode_hash_lock = 1; /* * If this MALLOC() is performed after the getnewvnode() * it might block, leaving a vnode with a NULL v_data to be * found by ffs_sync() if a sync happens to fire right then, * which will cause a panic because ffs_sync() blindly * dereferences vp->v_data (as well it should). */ MALLOC(ip, struct inode *, sizeof(struct inode), ump->um_malloctype, M_WAITOK); /* Allocate a new vnode/inode. 
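 *
 * ffs_inode_hash_lock doubles as a wakeup flag: a waiter stores -1
 * before sleeping, so whoever clears the lock knows whether a wakeup()
 * on &ffs_inode_hash_lock is required.  A rough sketch of the release
 * sequence used on every exit path below:
 *
 *	if (ffs_inode_hash_lock < 0)
 *		wakeup(&ffs_inode_hash_lock);
 *	ffs_inode_hash_lock = 0;
 *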
*/ error = getnewvnode(VT_UFS, mp, ffs_vnodeop_p, &vp); if (error) { if (ffs_inode_hash_lock < 0) wakeup(&ffs_inode_hash_lock); ffs_inode_hash_lock = 0; *vpp = NULL; FREE(ip, ump->um_malloctype); return (error); } bzero((caddr_t)ip, sizeof(struct inode)); lockinit(&ip->i_lock, PINOD, "inode", 0, 0); vp->v_data = ip; ip->i_vnode = vp; ip->i_fs = fs = ump->um_fs; ip->i_dev = dev; ip->i_number = ino; #ifdef QUOTA { int i; for (i = 0; i < MAXQUOTAS; i++) ip->i_dquot[i] = NODQUOT; } #endif /* * Put it onto its hash chain and lock it so that other requests for * this inode will block if they arrive while we are sleeping waiting * for old data structures to be purged or for the contents of the * disk portion of this inode to be read. */ ufs_ihashins(ip); if (ffs_inode_hash_lock < 0) wakeup(&ffs_inode_hash_lock); ffs_inode_hash_lock = 0; /* Read in the disk contents for the inode, copy into the inode. */ error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ino)), (int)fs->fs_bsize, NOCRED, &bp); if (error) { /* * The inode does not contain anything useful, so it would * be misleading to leave it on its hash chain. With mode * still zero, it will be unlinked and returned to the free * list by vput(). */ brelse(bp); vput(vp); *vpp = NULL; return (error); } ip->i_din = *((struct dinode *)bp->b_data + ino_to_fsbo(fs, ino)); if (DOINGSOFTDEP(vp)) softdep_load_inodeblock(ip); else ip->i_effnlink = ip->i_nlink; bqrelse(bp); /* * Initialize the vnode from the inode, check for aliases. * Note that the underlying vnode may have changed. */ error = ufs_vinit(mp, ffs_specop_p, ffs_fifoop_p, &vp); if (error) { vput(vp); *vpp = NULL; return (error); } /* * Finish inode initialization now that aliasing has been resolved. */ ip->i_devvp = ump->um_devvp; VREF(ip->i_devvp); /* * Set up a generation number for this inode if it does not * already have one. This should only happen on old filesystems. */ if (ip->i_gen == 0) { ip->i_gen = random() / 2 + 1; if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) ip->i_flag |= IN_MODIFIED; } /* * Ensure that uid and gid are correct. This is a temporary * fix until fsck has been changed to do the update. */ if (fs->fs_inodefmt < FS_44INODEFMT) { /* XXX */ ip->i_uid = ip->i_din.di_ouid; /* XXX */ ip->i_gid = ip->i_din.di_ogid; /* XXX */ } /* XXX */ *vpp = vp; return (0); } /* * File handle to vnode * * Have to be really careful about stale file handles: * - check that the inode number is valid * - call ffs_vget() to get the locked inode * - check for an unallocated inode (i_mode == 0) * - check that the given client host has export rights and return * those rights via. exflagsp and credanonp */ int ffs_fhtovp(mp, fhp, nam, vpp, exflagsp, credanonp) register struct mount *mp; struct fid *fhp; struct sockaddr *nam; struct vnode **vpp; int *exflagsp; struct ucred **credanonp; { register struct ufid *ufhp; struct fs *fs; ufhp = (struct ufid *)fhp; fs = VFSTOUFS(mp)->um_fs; if (ufhp->ufid_ino < ROOTINO || ufhp->ufid_ino >= fs->fs_ncg * fs->fs_ipg) return (ESTALE); return (ufs_check_export(mp, ufhp, nam, vpp, exflagsp, credanonp)); } /* * Vnode pointer to File handle */ /* ARGSUSED */ int ffs_vptofh(vp, fhp) struct vnode *vp; struct fid *fhp; { register struct inode *ip; register struct ufid *ufhp; ip = VTOI(vp); ufhp = (struct ufid *)fhp; ufhp->ufid_len = sizeof(struct ufid); ufhp->ufid_ino = ip->i_number; ufhp->ufid_gen = ip->i_gen; return (0); } /* * Initialize the filesystem; just use ufs_init. 
*/ static int ffs_init(vfsp) struct vfsconf *vfsp; { softdep_initialize(); return (ufs_init(vfsp)); } /* * Write a superblock and associated information back to disk. */ static int ffs_sbupdate(mp, waitfor) struct ufsmount *mp; int waitfor; { register struct fs *dfs, *fs = mp->um_fs; register struct buf *bp; int blks; caddr_t space; int i, size, error, allerror = 0; /* * First write back the summary information. */ blks = howmany(fs->fs_cssize, fs->fs_fsize); space = (caddr_t)fs->fs_csp[0]; for (i = 0; i < blks; i += fs->fs_frag) { size = fs->fs_bsize; if (i + fs->fs_frag > blks) size = (blks - i) * fs->fs_fsize; bp = getblk(mp->um_devvp, fsbtodb(fs, fs->fs_csaddr + i), size, 0, 0); bcopy(space, bp->b_data, (u_int)size); space += size; if (waitfor != MNT_WAIT) bawrite(bp); else if ((error = bwrite(bp)) != 0) allerror = error; } /* * Now write back the superblock itself. If any errors occurred * up to this point, then fail so that the superblock avoids * being written out as clean. */ if (allerror) return (allerror); bp = getblk(mp->um_devvp, SBLOCK, (int)fs->fs_sbsize, 0, 0); fs->fs_fmod = 0; fs->fs_time = time_second; bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize); /* Restore compatibility to old file systems. XXX */ dfs = (struct fs *)bp->b_data; /* XXX */ if (fs->fs_postblformat == FS_42POSTBLFMT) /* XXX */ dfs->fs_nrpos = -1; /* XXX */ if (fs->fs_inodefmt < FS_44INODEFMT) { /* XXX */ int32_t *lp, tmp; /* XXX */ /* XXX */ lp = (int32_t *)&dfs->fs_qbmask; /* XXX */ tmp = lp[4]; /* XXX */ for (i = 4; i > 0; i--) /* XXX */ lp[i] = lp[i-1]; /* XXX */ lp[0] = tmp; /* XXX */ } /* XXX */ dfs->fs_maxfilesize = mp->um_savedmaxfilesize; /* XXX */ if (waitfor != MNT_WAIT) bawrite(bp); else if ((error = bwrite(bp)) != 0) allerror = error; return (allerror); } Index: head/sys/ufs/ffs/ffs_vnops.c =================================================================== --- head/sys/ufs/ffs/ffs_vnops.c (revision 49534) +++ head/sys/ufs/ffs/ffs_vnops.c (revision 49535) @@ -1,266 +1,265 @@ /* * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_vnops.c 8.15 (Berkeley) 5/14/95 - * $Id: ffs_vnops.c,v 1.57 1999/06/18 05:49:46 mckusick Exp $ + * $Id: ffs_vnops.c,v 1.58 1999/06/26 02:46:39 mckusick Exp $ */ #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include - -#include static int ffs_fsync __P((struct vop_fsync_args *)); static int ffs_getpages __P((struct vop_getpages_args *)); static int ffs_putpages __P((struct vop_putpages_args *)); static int ffs_read __P((struct vop_read_args *)); static int ffs_write __P((struct vop_write_args *)); /* Global vfs data structures for ufs. */ vop_t **ffs_vnodeop_p; static struct vnodeopv_entry_desc ffs_vnodeop_entries[] = { { &vop_default_desc, (vop_t *) ufs_vnoperate }, { &vop_fsync_desc, (vop_t *) ffs_fsync }, { &vop_getpages_desc, (vop_t *) ffs_getpages }, { &vop_putpages_desc, (vop_t *) ffs_putpages }, { &vop_read_desc, (vop_t *) ffs_read }, { &vop_balloc_desc, (vop_t *) ffs_balloc }, { &vop_reallocblks_desc, (vop_t *) ffs_reallocblks }, { &vop_write_desc, (vop_t *) ffs_write }, { NULL, NULL } }; static struct vnodeopv_desc ffs_vnodeop_opv_desc = { &ffs_vnodeop_p, ffs_vnodeop_entries }; vop_t **ffs_specop_p; static struct vnodeopv_entry_desc ffs_specop_entries[] = { { &vop_default_desc, (vop_t *) ufs_vnoperatespec }, { &vop_fsync_desc, (vop_t *) ffs_fsync }, { NULL, NULL } }; static struct vnodeopv_desc ffs_specop_opv_desc = { &ffs_specop_p, ffs_specop_entries }; vop_t **ffs_fifoop_p; static struct vnodeopv_entry_desc ffs_fifoop_entries[] = { { &vop_default_desc, (vop_t *) ufs_vnoperatefifo }, { &vop_fsync_desc, (vop_t *) ffs_fsync }, { NULL, NULL } }; static struct vnodeopv_desc ffs_fifoop_opv_desc = { &ffs_fifoop_p, ffs_fifoop_entries }; VNODEOP_SET(ffs_vnodeop_opv_desc); VNODEOP_SET(ffs_specop_opv_desc); VNODEOP_SET(ffs_fifoop_opv_desc); #include /* * Synch an open file. */ /* ARGSUSED */ static int ffs_fsync(ap) struct vop_fsync_args /* { struct vnode *a_vp; struct ucred *a_cred; int a_waitfor; struct proc *a_p; } */ *ap; { struct vnode *vp = ap->a_vp; struct buf *bp; struct buf *nbp; int s, error, passes, skipmeta; daddr_t lbn; if (vp->v_type == VBLK) { lbn = INT_MAX; if (vp->v_specmountpoint != NULL && (vp->v_specmountpoint->mnt_flag & MNT_SOFTDEP)) softdep_fsync_mountdev(vp); } else { struct inode *ip; ip = VTOI(vp); lbn = lblkno(ip->i_fs, (ip->i_size + ip->i_fs->fs_bsize - 1)); } /* * Flush all dirty buffers associated with a vnode. 
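 *
 * Roughly: for an MNT_WAIT fsync the scan below runs twice, the first
 * pass (skipmeta == 1) skipping metadata buffers (b_lblkno < 0) so file
 * data reaches disk before the metadata picked up on the second pass.
 * B_SCANNED marks buffers already handled, since the dirty list must be
 * rescanned from the head after any sleep.  Up to NIADDR + 1 passes are
 * made, the last of them using synchronous bwrite() so that write
 * errors can be reported to the caller.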
*/ passes = NIADDR + 1; skipmeta = 0; if (ap->a_waitfor == MNT_WAIT) skipmeta = 1; s = splbio(); loop: for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = TAILQ_NEXT(bp, b_vnbufs)) bp->b_flags &= ~B_SCANNED; for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { nbp = TAILQ_NEXT(bp, b_vnbufs); /* * First time through on a synchronous call, * or if it's already scheduled, skip to the next * buffer */ if ((bp->b_flags & B_SCANNED) || ((skipmeta == 1) && (bp->b_lblkno < 0)) || BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) continue; if ((bp->b_flags & B_DELWRI) == 0) panic("ffs_fsync: not dirty"); /* * If data is outstanding to another vnode, or we were * asked to wait for everything, or it's not a file or BDEV, * start the IO on this buffer immediatly. */ bp->b_flags |= B_SCANNED; if (((bp->b_vp != vp) || (ap->a_waitfor == MNT_WAIT)) || ((vp->v_type != VREG) && (vp->v_type != VBLK))) { /* * On our final pass through, do all I/O synchronously * so that we can find out if our flush is failing * because of write errors. */ if (passes > 0 || (ap->a_waitfor != MNT_WAIT)) { if ((bp->b_flags & B_CLUSTEROK) && ap->a_waitfor != MNT_WAIT) { BUF_UNLOCK(bp); (void) vfs_bio_awrite(bp); } else { bremfree(bp); splx(s); (void) bawrite(bp); s = splbio(); } } else { bremfree(bp); splx(s); if ((error = bwrite(bp)) != 0) return (error); s = splbio(); } } else if ((vp->v_type == VREG) && (bp->b_lblkno >= lbn)) { /* * If the buffer is for data that has been truncated * off the file, then throw it away. */ bremfree(bp); bp->b_flags |= B_INVAL | B_NOCACHE; splx(s); brelse(bp); s = splbio(); } else { BUF_UNLOCK(bp); vfs_bio_awrite(bp); } /* * Since we may have slept during the I/O, we need * to start from a known point. */ nbp = TAILQ_FIRST(&vp->v_dirtyblkhd); } /* * If we were asked to do this synchronously, then go back for * another pass, this time doing the metadata. */ if (skipmeta) { skipmeta = 0; goto loop; } if (ap->a_waitfor == MNT_WAIT) { while (vp->v_numoutput) { vp->v_flag |= VBWAIT; (void) tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 4, "ffsfsn", 0); } /* * Ensure that any filesystem metatdata associated * with the vnode has been written. */ splx(s); if ((error = softdep_sync_metadata(ap)) != 0) return (error); s = splbio(); if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) { /* * Block devices associated with filesystems may * have new I/O requests posted for them even if * the vnode is locked, so no amount of trying will * get them clean. Thus we give block devices a * good effort, then just give up. For all other file * types, go around and try again until it is clean. */ if (passes > 0) { passes -= 1; goto loop; } #ifdef DIAGNOSTIC if (vp->v_type != VBLK) vprint("ffs_fsync: dirty", vp); #endif } } splx(s); return (UFS_UPDATE(vp, ap->a_waitfor == MNT_WAIT)); } Index: head/sys/ufs/mfs/mfs_vnops.c =================================================================== --- head/sys/ufs/mfs/mfs_vnops.c (revision 49534) +++ head/sys/ufs/mfs/mfs_vnops.c (revision 49535) @@ -1,429 +1,428 @@ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)mfs_vnops.c 8.11 (Berkeley) 5/22/95 - * $Id: mfs_vnops.c,v 1.44 1999/05/02 23:56:57 alc Exp $ + * $Id: mfs_vnops.c,v 1.45 1999/06/26 02:46:41 mckusick Exp $ */ #include #include #include #include #include #include #include #include #include - -#include +#include #include #include static int mfs_badop __P((struct vop_generic_args *)); static int mfs_bmap __P((struct vop_bmap_args *)); static int mfs_close __P((struct vop_close_args *)); static int mfs_fsync __P((struct vop_fsync_args *)); static int mfs_freeblks __P((struct vop_freeblks_args *)); static int mfs_inactive __P((struct vop_inactive_args *)); /* XXX */ static int mfs_open __P((struct vop_open_args *)); static int mfs_reclaim __P((struct vop_reclaim_args *)); /* XXX */ static int mfs_print __P((struct vop_print_args *)); /* XXX */ static int mfs_strategy __P((struct vop_strategy_args *)); /* XXX */ static int mfs_getpages __P((struct vop_getpages_args *)); /* XXX */ /* * mfs vnode operations. */ vop_t **mfs_vnodeop_p; static struct vnodeopv_entry_desc mfs_vnodeop_entries[] = { { &vop_default_desc, (vop_t *) mfs_badop }, { &vop_bmap_desc, (vop_t *) mfs_bmap }, { &vop_bwrite_desc, (vop_t *) vop_defaultop }, { &vop_close_desc, (vop_t *) mfs_close }, { &vop_freeblks_desc, (vop_t *) mfs_freeblks }, { &vop_fsync_desc, (vop_t *) mfs_fsync }, { &vop_getpages_desc, (vop_t *) mfs_getpages }, { &vop_inactive_desc, (vop_t *) mfs_inactive }, { &vop_ioctl_desc, (vop_t *) vop_enotty }, { &vop_islocked_desc, (vop_t *) vop_defaultop }, { &vop_lock_desc, (vop_t *) vop_defaultop }, { &vop_open_desc, (vop_t *) mfs_open }, { &vop_print_desc, (vop_t *) mfs_print }, { &vop_reclaim_desc, (vop_t *) mfs_reclaim }, { &vop_strategy_desc, (vop_t *) mfs_strategy }, { &vop_unlock_desc, (vop_t *) vop_defaultop }, { NULL, NULL } }; static struct vnodeopv_desc mfs_vnodeop_opv_desc = { &mfs_vnodeop_p, mfs_vnodeop_entries }; VNODEOP_SET(mfs_vnodeop_opv_desc); /* * Vnode Operations. * * Open called to allow memory filesystem to initialize and * validate before actual IO. Record our process identifier * so we can tell when we are doing I/O to ourself. 
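 *
 * mfs_strategy() below uses that recorded pid to choose one of three
 * paths, roughly:
 *
 *	if (mfsp->mfs_pid == 0)             bcopy to/from mfs_baseoff (mini-root)
 *	else if (mfsp->mfs_pid == p->p_pid) mfs_doio(bp, mfsp)  (I/O to self)
 *	else                                queue bp and wakeup() the server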
*/ /* ARGSUSED */ static int mfs_open(ap) struct vop_open_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct proc *a_p; } */ *ap; { if (ap->a_vp->v_type != VBLK) { panic("mfs_open not VBLK"); /* NOTREACHED */ } return (0); } static int mfs_fsync(ap) struct vop_fsync_args *ap; { return (VOCALL(spec_vnodeop_p, VOFFSET(vop_fsync), ap)); } /* * mfs_freeblks() - hook to allow us to free physical memory. * * We implement the B_FREEBUF strategy. We can't just madvise() * here because we have to do it in the correct order vs other bio * requests, so we queue it. * * Note: geteblk() sets B_INVAL. We leave it set to guarentee buffer * throw-away on brelse()? XXX */ static int mfs_freeblks(ap) struct vop_freeblks_args /* { struct vnode *a_vp; daddr_t a_addr; daddr_t a_length; } */ *ap; { struct buf *bp; struct vnode *vp; if (!vfinddev(ap->a_vp->v_rdev, VBLK, &vp) || vp->v_usecount == 0) panic("mfs_freeblks: bad dev"); bp = geteblk(ap->a_length); bp->b_flags |= B_FREEBUF | B_ASYNC; bp->b_dev = ap->a_vp->v_rdev; bp->b_blkno = ap->a_addr; bp->b_offset = dbtob(ap->a_addr); bp->b_bcount = ap->a_length; BUF_KERNPROC(bp); VOP_STRATEGY(vp, bp); return(0); } /* * Pass I/O requests to the memory filesystem process. */ static int mfs_strategy(ap) struct vop_strategy_args /* { struct vnode *a_vp; struct buf *a_bp; } */ *ap; { register struct buf *bp = ap->a_bp; register struct mfsnode *mfsp; struct vnode *vp; struct proc *p = curproc; /* XXX */ int s; if (!vfinddev(bp->b_dev, VBLK, &vp) || vp->v_usecount == 0) panic("mfs_strategy: bad dev"); mfsp = VTOMFS(vp); /* * splbio required for queueing/dequeueing, in case of forwarded * BPs from bio interrupts (?). It may not be necessary. */ s = splbio(); if (mfsp->mfs_pid == 0) { /* * mini-root. Note: B_FREEBUF not supported at the moment, * I'm not sure what kind of dataspace b_data is in. */ caddr_t base; base = mfsp->mfs_baseoff + (bp->b_blkno << DEV_BSHIFT); if (bp->b_flags & B_FREEBUF) ; if (bp->b_flags & B_READ) bcopy(base, bp->b_data, bp->b_bcount); else bcopy(bp->b_data, base, bp->b_bcount); biodone(bp); } else if (mfsp->mfs_pid == p->p_pid) { /* * VOP to self */ splx(s); mfs_doio(bp, mfsp); s = splbio(); } else { /* * VOP from some other process, queue to MFS process and * wake it up. */ bufq_insert_tail(&mfsp->buf_queue, bp); wakeup((caddr_t)vp); } splx(s); return (0); } /* * Memory file system I/O. * * Trivial on the HP since buffer has already been mapping into KVA space. * * Read and Write are handled with a simple copyin and copyout. * * We also partially support VOP_FREEBLKS() via B_FREEBUF. We can't implement * completely -- for example, on fragments or inode metadata, but we can * implement it for page-aligned requests. */ void mfs_doio(bp, mfsp) register struct buf *bp; struct mfsnode *mfsp; { caddr_t base = mfsp->mfs_baseoff + (bp->b_blkno << DEV_BSHIFT); if (bp->b_flags & B_FREEBUF) { /* * Implement B_FREEBUF, which allows the filesystem to tell * a block device when blocks are no longer needed (like when * a file is deleted). We use the hook to MADV_FREE the VM. * This makes an MFS filesystem work as well or better then * a sun-style swap-mounted filesystem. 
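 *
 * Only whole pages can be given back: the code below first advances
 * base to the next page boundary, then rounds the byte count down to a
 * page multiple before issuing madvise(MADV_FREE).  For example, an
 * 8192-byte buffer whose base sits 512 bytes into a 4K page loses 3584
 * bytes at the front and 512 at the tail, so exactly one 4096-byte page
 * is freed.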
*/ int bytes = bp->b_bcount; if ((vm_offset_t)base & PAGE_MASK) { int n = PAGE_SIZE - ((vm_offset_t)base & PAGE_MASK); bytes -= n; base += n; } if (bytes > 0) { struct madvise_args uap; bytes &= ~PAGE_MASK; if (bytes != 0) { bzero(&uap, sizeof(uap)); uap.addr = base; uap.len = bytes; uap.behav = MADV_FREE; madvise(curproc, &uap); } } bp->b_error = 0; } else if (bp->b_flags & B_READ) { /* * Read data from our 'memory' disk */ bp->b_error = copyin(base, bp->b_data, bp->b_bcount); } else { /* * Write data to our 'memory' disk */ bp->b_error = copyout(bp->b_data, base, bp->b_bcount); } if (bp->b_error) bp->b_flags |= B_ERROR; biodone(bp); } /* * This is a noop, simply returning what one has been given. */ static int mfs_bmap(ap) struct vop_bmap_args /* { struct vnode *a_vp; ufs_daddr_t a_bn; struct vnode **a_vpp; ufs_daddr_t *a_bnp; int *a_runp; } */ *ap; { if (ap->a_vpp != NULL) *ap->a_vpp = ap->a_vp; if (ap->a_bnp != NULL) *ap->a_bnp = ap->a_bn; if (ap->a_runp != NULL) *ap->a_runp = 0; return (0); } /* * Memory filesystem close routine */ /* ARGSUSED */ static int mfs_close(ap) struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct proc *a_p; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct mfsnode *mfsp = VTOMFS(vp); register struct buf *bp; int error; /* * Finish any pending I/O requests. */ while ((bp = bufq_first(&mfsp->buf_queue)) != NULL) { bufq_remove(&mfsp->buf_queue, bp); mfs_doio(bp, mfsp); wakeup((caddr_t)bp); } /* * On last close of a memory filesystem * we must invalidate any in core blocks, so that * we can, free up its vnode. */ if ((error = vinvalbuf(vp, 1, ap->a_cred, ap->a_p, 0, 0)) != 0) return (error); /* * There should be no way to have any more uses of this * vnode, so if we find any other uses, it is a panic. */ if (vp->v_usecount > 1) printf("mfs_close: ref count %d > 1\n", vp->v_usecount); if (vp->v_usecount > 1 || (bufq_first(&mfsp->buf_queue) != NULL)) panic("mfs_close"); /* * Send a request to the filesystem server to exit. */ mfsp->mfs_active = 0; wakeup((caddr_t)vp); return (0); } /* * Memory filesystem inactive routine */ /* ARGSUSED */ static int mfs_inactive(ap) struct vop_inactive_args /* { struct vnode *a_vp; struct proc *a_p; } */ *ap; { struct vnode *vp = ap->a_vp; struct mfsnode *mfsp = VTOMFS(vp); if (bufq_first(&mfsp->buf_queue) != NULL) panic("mfs_inactive: not inactive (next buffer %p)", bufq_first(&mfsp->buf_queue)); VOP_UNLOCK(vp, 0, ap->a_p); return (0); } /* * Reclaim a memory filesystem devvp so that it can be reused. */ static int mfs_reclaim(ap) struct vop_reclaim_args /* { struct vnode *a_vp; } */ *ap; { register struct vnode *vp = ap->a_vp; FREE(vp->v_data, M_MFSNODE); vp->v_data = NULL; return (0); } /* * Print out the contents of an mfsnode. 
*/ static int mfs_print(ap) struct vop_print_args /* { struct vnode *a_vp; } */ *ap; { register struct mfsnode *mfsp = VTOMFS(ap->a_vp); printf("tag VT_MFS, pid %ld, base %p, size %ld\n", (long)mfsp->mfs_pid, (void *)mfsp->mfs_baseoff, mfsp->mfs_size); return (0); } /* * Block device bad operation */ static int mfs_badop(struct vop_generic_args *ap) { int i; printf("mfs_badop[%s]\n", ap->a_desc->vdesc_name); i = vop_defaultop(ap); printf("mfs_badop[%s] = %d\n", ap->a_desc->vdesc_name,i); return (i); } static int mfs_getpages(ap) struct vop_getpages_args *ap; { return (VOCALL(spec_vnodeop_p, VOFFSET(vop_getpages), ap)); } Index: head/sys/ufs/ufs/ufs_bmap.c =================================================================== --- head/sys/ufs/ufs/ufs_bmap.c (revision 49534) +++ head/sys/ufs/ufs/ufs_bmap.c (revision 49535) @@ -1,355 +1,354 @@ /* * Copyright (c) 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ufs_bmap.c 8.7 (Berkeley) 3/21/95 - * $Id: ufs_bmap.c,v 1.27 1999/05/07 10:11:36 phk Exp $ + * $Id: ufs_bmap.c,v 1.28 1999/05/08 06:40:25 phk Exp $ */ #include #include #include #include #include #include #include #include #include #include #include #include -#include /* * Bmap converts a the logical block number of a file to its physical block * number on the disk. The conversion is done by using the logical block * number to index into the array of block pointers described by the dinode. 
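 *
 * For scale: with an 8K block size and 4-byte disk addresses,
 * NINDIR(fs) is 8192 / 4 = 2048, so the NDADDR (12) direct pointers
 * cover the first 96KB of a file, the single indirect block the next
 * 2048 * 8K = 16MB, the double indirect 2048^2 * 8K = 32GB, and the
 * triple indirect 2048^3 * 8K = 64TB.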
*/ int ufs_bmap(ap) struct vop_bmap_args /* { struct vnode *a_vp; ufs_daddr_t a_bn; struct vnode **a_vpp; ufs_daddr_t *a_bnp; int *a_runp; int *a_runb; } */ *ap; { /* * Check for underlying vnode requests and ensure that logical * to physical mapping is requested. */ if (ap->a_vpp != NULL) *ap->a_vpp = VTOI(ap->a_vp)->i_devvp; if (ap->a_bnp == NULL) return (0); return (ufs_bmaparray(ap->a_vp, ap->a_bn, ap->a_bnp, NULL, NULL, ap->a_runp, ap->a_runb)); } /* * Indirect blocks are now on the vnode for the file. They are given negative * logical block numbers. Indirect blocks are addressed by the negative * address of the first data block to which they point. Double indirect blocks * are addressed by one less than the address of the first indirect block to * which they point. Triple indirect blocks are addressed by one less than * the address of the first double indirect block to which they point. * * ufs_bmaparray does the bmap conversion, and if requested returns the * array of logical blocks which must be traversed to get to a block. * Each entry contains the offset into that block that gets you to the * next block and the disk address of the block (if it is assigned). */ int ufs_bmaparray(vp, bn, bnp, ap, nump, runp, runb) struct vnode *vp; ufs_daddr_t bn; ufs_daddr_t *bnp; struct indir *ap; int *nump; int *runp; int *runb; { register struct inode *ip; struct buf *bp; struct ufsmount *ump; struct mount *mp; struct vnode *devvp; struct indir a[NIADDR+1], *xap; ufs_daddr_t daddr; long metalbn; int error, maxrun = 0, num; ip = VTOI(vp); mp = vp->v_mount; ump = VFSTOUFS(mp); #ifdef DIAGNOSTIC if ((ap != NULL && nump == NULL) || (ap == NULL && nump != NULL)) panic("ufs_bmaparray: invalid arguments"); #endif if (runp) { *runp = 0; } if (runb) { *runb = 0; } maxrun = 0; if (runp || runb || (vp->v_maxio == 0)) { struct vnode *devvp; int blksize; blksize = mp->mnt_stat.f_iosize; /* * XXX * If MAXPHYS is the largest transfer the disks can handle, * we probably want maxrun to be 1 block less so that we * don't create a block larger than the device can handle. */ devvp = ip->i_devvp; if (devvp != NULL && devvp->v_tag != VT_MFS && devvp->v_type == VBLK) { if (bdevsw(devvp->v_rdev)->d_maxio > MAXPHYS) { maxrun = MAXPHYS; vp->v_maxio = MAXPHYS; } else { maxrun = bdevsw(devvp->v_rdev)->d_maxio; vp->v_maxio = bdevsw(devvp->v_rdev)->d_maxio; } maxrun = maxrun / blksize; maxrun -= 1; } if (maxrun <= 0) { vp->v_maxio = DFLTPHYS; maxrun = DFLTPHYS / blksize; maxrun -= 1; } } xap = ap == NULL ? a : ap; if (!nump) nump = # error = ufs_getlbns(vp, bn, xap, nump); if (error) return (error); num = *nump; if (num == 0) { *bnp = blkptrtodb(ump, ip->i_db[bn]); if (*bnp == 0) *bnp = -1; else if (runp) { daddr_t bnb = bn; for (++bn; bn < NDADDR && *runp < maxrun && is_sequential(ump, ip->i_db[bn - 1], ip->i_db[bn]); ++bn, ++*runp); bn = bnb; if (runb && (bn > 0)) { for (--bn; (bn >= 0) && (*runb < maxrun) && is_sequential(ump, ip->i_db[bn], ip->i_db[bn+1]); --bn, ++*runb); } } return (0); } /* Get disk address out of indirect block array */ daddr = ip->i_ib[xap->in_off]; devvp = VFSTOUFS(vp->v_mount)->um_devvp; for (bp = NULL, ++xap; --num; ++xap) { /* * Exit the loop if there is no disk address assigned yet and * the indirect block isn't in the cache, or if we were * looking for an indirect block and we've found it. */ metalbn = xap->in_lbn; if ((daddr == 0 && !incore(vp, metalbn)) || metalbn == bn) break; /* * If we get here, we've either got the block in the cache * or we have a disk address for it, go fetch it. 
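 *
 * Note that indirect blocks live in the buffer cache under the file
 * vnode itself, at the negative logical block numbers (metalbn)
 * described above; that is why incore(vp, metalbn) and
 * getblk(vp, metalbn, ...) are used here rather than the device vnode.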
*/ if (bp) bqrelse(bp); xap->in_exists = 1; bp = getblk(vp, metalbn, mp->mnt_stat.f_iosize, 0, 0); if ((bp->b_flags & B_CACHE) == 0) { #ifdef DIAGNOSTIC if (!daddr) panic("ufs_bmaparray: indirect block not in cache"); #endif bp->b_blkno = blkptrtodb(ump, daddr); bp->b_flags |= B_READ; bp->b_flags &= ~(B_INVAL|B_ERROR); vfs_busy_pages(bp, 0); VOP_STRATEGY(bp->b_vp, bp); curproc->p_stats->p_ru.ru_inblock++; /* XXX */ error = biowait(bp); if (error) { brelse(bp); return (error); } } daddr = ((ufs_daddr_t *)bp->b_data)[xap->in_off]; if (num == 1 && daddr && runp) { for (bn = xap->in_off + 1; bn < MNINDIR(ump) && *runp < maxrun && is_sequential(ump, ((ufs_daddr_t *)bp->b_data)[bn - 1], ((ufs_daddr_t *)bp->b_data)[bn]); ++bn, ++*runp); bn = xap->in_off; if (runb && bn) { for(--bn; bn > 0 && *runb < maxrun && is_sequential(ump, ((daddr_t *)bp->b_data)[bn], ((daddr_t *)bp->b_data)[bn+1]); --bn, ++*runb); } } } if (bp) bqrelse(bp); daddr = blkptrtodb(ump, daddr); *bnp = daddr == 0 ? -1 : daddr; return (0); } /* * Create an array of logical block number/offset pairs which represent the * path of indirect blocks required to access a data block. The first "pair" * contains the logical block number of the appropriate single, double or * triple indirect block and the offset into the inode indirect block array. * Note, the logical block number of the inode single/double/triple indirect * block appears twice in the array, once with the offset into the i_ib and * once with the offset into the page itself. */ int ufs_getlbns(vp, bn, ap, nump) struct vnode *vp; ufs_daddr_t bn; struct indir *ap; int *nump; { long blockcnt, metalbn, realbn; struct ufsmount *ump; int i, numlevels, off; int64_t qblockcnt; ump = VFSTOUFS(vp->v_mount); if (nump) *nump = 0; numlevels = 0; realbn = bn; if ((long)bn < 0) bn = -(long)bn; /* The first NDADDR blocks are direct blocks. */ if (bn < NDADDR) return (0); /* * Determine the number of levels of indirection. After this loop * is done, blockcnt indicates the number of data blocks possible * at the previous level of indirection, and NIADDR - i is the number * of levels of indirection needed to locate the requested block. */ for (blockcnt = 1, i = NIADDR, bn -= NDADDR;; i--, bn -= blockcnt) { if (i == 0) return (EFBIG); /* * Use int64_t's here to avoid overflow for triple indirect * blocks when longs have 32 bits and the block size is more * than 4K. */ qblockcnt = (int64_t)blockcnt * MNINDIR(ump); if (bn < qblockcnt) break; blockcnt = qblockcnt; } /* Calculate the address of the first meta-block. */ if (realbn >= 0) metalbn = -(realbn - bn + NIADDR - i); else metalbn = -(-realbn - bn + NIADDR - i); /* * At each iteration, off is the offset into the bap array which is * an array of disk addresses at the current level of indirection. * The logical block number and the offset in that block are stored * into the argument array. */ ap->in_lbn = metalbn; ap->in_off = off = NIADDR - i; ap->in_exists = 0; ap++; for (++numlevels; i <= NIADDR; i++) { /* If searching for a meta-data block, quit when found. 
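 *
 * A worked example, with the standard NDADDR of 12: for data block 17
 * (the sixth block past the direct pointers) ufs_getlbns() returns two
 * entries, both naming the single indirect block at logical block -12;
 * the first carries in_off 0 (its slot in i_ib[]), the second in_off 5
 * (the slot inside the indirect block that points at block 17).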
*/ if (metalbn == realbn) break; off = (bn / blockcnt) % MNINDIR(ump); ++numlevels; ap->in_lbn = metalbn; ap->in_off = off; ap->in_exists = 0; ++ap; metalbn -= -1 + off * blockcnt; blockcnt /= MNINDIR(ump); } if (nump) *nump = numlevels; return (0); } Index: head/sys/ufs/ufs/ufs_vnops.c =================================================================== --- head/sys/ufs/ufs/ufs_vnops.c (revision 49534) +++ head/sys/ufs/ufs/ufs_vnops.c (revision 49535) @@ -1,2337 +1,2337 @@ /* * Copyright (c) 1982, 1986, 1989, 1993, 1995 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)ufs_vnops.c 8.27 (Berkeley) 5/27/95 - * $Id: ufs_vnops.c,v 1.115 1999/06/16 23:27:53 mckusick Exp $ + * $Id: ufs_vnops.c,v 1.116 1999/07/13 18:20:13 mckusick Exp $ */ #include "opt_quota.h" #include "opt_suiddir.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include -#include #include #include #include #include #include #include static int ufs_abortop __P((struct vop_abortop_args *)); static int ufs_access __P((struct vop_access_args *)); static int ufs_advlock __P((struct vop_advlock_args *)); static int ufs_chmod __P((struct vnode *, int, struct ucred *, struct proc *)); static int ufs_chown __P((struct vnode *, uid_t, gid_t, struct ucred *, struct proc *)); static int ufs_close __P((struct vop_close_args *)); static int ufs_create __P((struct vop_create_args *)); static int ufs_getattr __P((struct vop_getattr_args *)); static int ufs_link __P((struct vop_link_args *)); static int ufs_makeinode __P((int mode, struct vnode *, struct vnode **, struct componentname *)); static int ufs_missingop __P((struct vop_generic_args *ap)); static int ufs_mkdir __P((struct vop_mkdir_args *)); static int ufs_mknod __P((struct vop_mknod_args *)); static int ufs_mmap __P((struct vop_mmap_args *)); static int ufs_open __P((struct vop_open_args *)); static int ufs_pathconf __P((struct vop_pathconf_args *)); static int ufs_print __P((struct vop_print_args *)); static int ufs_readdir __P((struct vop_readdir_args *)); static int ufs_readlink __P((struct vop_readlink_args *)); static int ufs_remove __P((struct vop_remove_args *)); static int ufs_rename __P((struct vop_rename_args *)); static int ufs_rmdir __P((struct vop_rmdir_args *)); static int ufs_setattr __P((struct vop_setattr_args *)); static int ufs_strategy __P((struct vop_strategy_args *)); static int ufs_symlink __P((struct vop_symlink_args *)); static int ufs_whiteout __P((struct vop_whiteout_args *)); static int ufsfifo_close __P((struct vop_close_args *)); static int ufsfifo_read __P((struct vop_read_args *)); static int ufsfifo_write __P((struct vop_write_args *)); static int ufsspec_close __P((struct vop_close_args *)); static int ufsspec_read __P((struct vop_read_args *)); static int ufsspec_write __P((struct vop_write_args *)); union _qcvt { int64_t qcvt; int32_t val[2]; }; #define SETHIGH(q, h) { \ union _qcvt tmp; \ tmp.qcvt = (q); \ tmp.val[_QUAD_HIGHWORD] = (h); \ (q) = tmp.qcvt; \ } #define SETLOW(q, l) { \ union _qcvt tmp; \ tmp.qcvt = (q); \ tmp.val[_QUAD_LOWWORD] = (l); \ (q) = tmp.qcvt; \ } /* * A virgin directory (no blushing please). */ static struct dirtemplate mastertemplate = { 0, 12, DT_DIR, 1, ".", 0, DIRBLKSIZ - 12, DT_DIR, 2, ".." }; static struct odirtemplate omastertemplate = { 0, 12, 1, ".", 0, DIRBLKSIZ - 12, 2, ".." 
}; void ufs_itimes(vp) struct vnode *vp; { struct inode *ip; time_t tv_sec; ip = VTOI(vp); if ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE)) == 0) return; if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { tv_sec = time_second; if ((vp->v_type == VBLK || vp->v_type == VCHR) && !DOINGSOFTDEP(vp)) ip->i_flag |= IN_LAZYMOD; else ip->i_flag |= IN_MODIFIED; if (ip->i_flag & IN_ACCESS) ip->i_atime = tv_sec; if (ip->i_flag & IN_UPDATE) { ip->i_mtime = tv_sec; ip->i_modrev++; } if (ip->i_flag & IN_CHANGE) ip->i_ctime = tv_sec; } ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE); } /* * Create a regular file */ int ufs_create(ap) struct vop_create_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { int error; error = ufs_makeinode(MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode), ap->a_dvp, ap->a_vpp, ap->a_cnp); if (error) return (error); VN_POLLEVENT(ap->a_dvp, POLLWRITE); return (0); } /* * Mknod vnode call */ /* ARGSUSED */ int ufs_mknod(ap) struct vop_mknod_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { struct vattr *vap = ap->a_vap; struct vnode **vpp = ap->a_vpp; struct inode *ip; int error; error = ufs_makeinode(MAKEIMODE(vap->va_type, vap->va_mode), ap->a_dvp, vpp, ap->a_cnp); if (error) return (error); VN_POLLEVENT(ap->a_dvp, POLLWRITE); ip = VTOI(*vpp); ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; if (vap->va_rdev != VNOVAL) { /* * Want to be able to use this to make badblock * inodes, so don't truncate the dev number. */ ip->i_rdev = vap->va_rdev; } /* * Remove inode so that it will be reloaded by VFS_VGET and * checked to see if it is an alias of an existing entry in * the inode cache. */ vput(*vpp); (*vpp)->v_type = VNON; vgone(*vpp); *vpp = 0; return (0); } /* * Open called. * * Nothing to do. */ /* ARGSUSED */ int ufs_open(ap) struct vop_open_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct proc *a_p; } */ *ap; { /* * Files marked append-only must be opened for appending. */ if ((VTOI(ap->a_vp)->i_flags & APPEND) && (ap->a_mode & (FWRITE | O_APPEND)) == FWRITE) return (EPERM); return (0); } /* * Close called. * * Update the times on the inode. */ /* ARGSUSED */ int ufs_close(ap) struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct proc *a_p; } */ *ap; { register struct vnode *vp = ap->a_vp; simple_lock(&vp->v_interlock); if (vp->v_usecount > 1) ufs_itimes(vp); simple_unlock(&vp->v_interlock); return (0); } int ufs_access(ap) struct vop_access_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct proc *a_p; } */ *ap; { struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); struct ucred *cred = ap->a_cred; mode_t mask, mode = ap->a_mode; register gid_t *gp; int i; #ifdef QUOTA int error; #endif /* * Disallow write attempts on read-only file systems; * unless the file is a socket, fifo, or a block or * character device resident on the file system. */ if (mode & VWRITE) { switch (vp->v_type) { case VDIR: case VLNK: case VREG: if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); #ifdef QUOTA if ((error = getinoquota(ip)) != 0) return (error); #endif break; default: break; } } /* If immutable bit set, nobody gets to write it. */ if ((mode & VWRITE) && (ip->i_flags & IMMUTABLE)) return (EPERM); /* Otherwise, user id 0 always gets access. */ if (cred->cr_uid == 0) return (0); mask = 0; /* Otherwise, check the owner. 
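 *
 * For example, on a mode 0640 file the owner asking for VREAD|VWRITE
 * builds mask = S_IRUSR|S_IWUSR = 0600 and passes, while a group
 * member asking for VWRITE builds mask = S_IWGRP = 0020 and gets
 * EACCES, since 0640 & 0020 != 0020.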
*/ if (cred->cr_uid == ip->i_uid) { if (mode & VEXEC) mask |= S_IXUSR; if (mode & VREAD) mask |= S_IRUSR; if (mode & VWRITE) mask |= S_IWUSR; return ((ip->i_mode & mask) == mask ? 0 : EACCES); } /* Otherwise, check the groups. */ for (i = 0, gp = cred->cr_groups; i < cred->cr_ngroups; i++, gp++) if (ip->i_gid == *gp) { if (mode & VEXEC) mask |= S_IXGRP; if (mode & VREAD) mask |= S_IRGRP; if (mode & VWRITE) mask |= S_IWGRP; return ((ip->i_mode & mask) == mask ? 0 : EACCES); } /* Otherwise, check everyone else. */ if (mode & VEXEC) mask |= S_IXOTH; if (mode & VREAD) mask |= S_IROTH; if (mode & VWRITE) mask |= S_IWOTH; return ((ip->i_mode & mask) == mask ? 0 : EACCES); } /* ARGSUSED */ int ufs_getattr(ap) struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct proc *a_p; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct inode *ip = VTOI(vp); register struct vattr *vap = ap->a_vap; ufs_itimes(vp); /* * Copy from inode table */ vap->va_fsid = dev2udev(ip->i_dev); vap->va_fileid = ip->i_number; vap->va_mode = ip->i_mode & ~IFMT; vap->va_nlink = ip->i_effnlink; vap->va_uid = ip->i_uid; vap->va_gid = ip->i_gid; vap->va_rdev = ip->i_rdev; vap->va_size = ip->i_din.di_size; vap->va_atime.tv_sec = ip->i_atime; vap->va_atime.tv_nsec = ip->i_atimensec; vap->va_mtime.tv_sec = ip->i_mtime; vap->va_mtime.tv_nsec = ip->i_mtimensec; vap->va_ctime.tv_sec = ip->i_ctime; vap->va_ctime.tv_nsec = ip->i_ctimensec; vap->va_flags = ip->i_flags; vap->va_gen = ip->i_gen; /* * Use the information contained in v_specinfo for VBLK and VCHR * vnodes, and in the underlying mount point for (typically) VREG * vnodes. Note that vp->v_specmountpoint can be NULL. */ if (vp->v_type == VBLK) { vap->va_blocksize = vp->v_specinfo->si_bsize_best; } else if (vp->v_type == VCHR) { vap->va_blocksize = vp->v_specinfo->si_bsize_max; } else { vap->va_blocksize = vp->v_mount->mnt_stat.f_iosize; } vap->va_bytes = dbtob((u_quad_t)ip->i_blocks); vap->va_type = IFTOVT(ip->i_mode); vap->va_filerev = ip->i_modrev; return (0); } /* * Set attribute vnode op. called from several syscalls */ int ufs_setattr(ap) struct vop_setattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct proc *a_p; } */ *ap; { struct vattr *vap = ap->a_vap; struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); struct ucred *cred = ap->a_cred; struct proc *p = ap->a_p; int error; /* * Check for unsettable attributes. */ if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) || (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) || ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) { return (EINVAL); } if (vap->va_flags != VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if (cred->cr_uid != ip->i_uid && (error = suser_xxx(cred, p, PRISON_ROOT))) return (error); if (cred->cr_uid == 0) { if ((ip->i_flags & (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND)) && securelevel > 0) return (EPERM); ip->i_flags = vap->va_flags; } else { if (ip->i_flags & (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND) || (vap->va_flags & UF_SETTABLE) != vap->va_flags) return (EPERM); ip->i_flags &= SF_SETTABLE; ip->i_flags |= (vap->va_flags & UF_SETTABLE); } ip->i_flag |= IN_CHANGE; if (vap->va_flags & (IMMUTABLE | APPEND)) return (0); } if (ip->i_flags & (IMMUTABLE | APPEND)) return (EPERM); /* * Go through the fields and update iff not VNOVAL. 
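 *
 * VNOVAL is the "not requested" sentinel, so each block below runs only
 * for attributes the caller actually supplied: ownership first
 * (ufs_chown), then size (UFS_TRUNCATE), then the timestamps, and mode
 * last (ufs_chmod).  A utimes()-style caller, for instance, arrives
 * with only va_atime and va_mtime set and touches nothing else.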
*/ if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if ((error = ufs_chown(vp, vap->va_uid, vap->va_gid, cred, p)) != 0) return (error); } if (vap->va_size != VNOVAL) { /* * Disallow write attempts on read-only file systems; * unless the file is a socket, fifo, or a block or * character device resident on the file system. */ switch (vp->v_type) { case VDIR: return (EISDIR); case VLNK: case VREG: if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); break; default: break; } if ((error = UFS_TRUNCATE(vp, vap->va_size, 0, cred, p)) != 0) return (error); } ip = VTOI(vp); if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if (cred->cr_uid != ip->i_uid && (error = suser_xxx(cred, p, PRISON_ROOT)) && ((vap->va_vaflags & VA_UTIMES_NULL) == 0 || (error = VOP_ACCESS(vp, VWRITE, cred, p)))) return (error); if (vap->va_atime.tv_sec != VNOVAL) ip->i_flag |= IN_ACCESS; if (vap->va_mtime.tv_sec != VNOVAL) ip->i_flag |= IN_CHANGE | IN_UPDATE; ufs_itimes(vp); if (vap->va_atime.tv_sec != VNOVAL) ip->i_atime = vap->va_atime.tv_sec; if (vap->va_mtime.tv_sec != VNOVAL) ip->i_mtime = vap->va_mtime.tv_sec; error = UFS_UPDATE(vp, 0); if (error) return (error); } error = 0; if (vap->va_mode != (mode_t)VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); error = ufs_chmod(vp, (int)vap->va_mode, cred, p); } VN_POLLEVENT(vp, POLLATTRIB); return (error); } /* * Change the mode on a file. * Inode must be locked before calling. */ static int ufs_chmod(vp, mode, cred, p) register struct vnode *vp; register int mode; register struct ucred *cred; struct proc *p; { register struct inode *ip = VTOI(vp); int error; if (cred->cr_uid != ip->i_uid) { error = suser_xxx(cred, p, PRISON_ROOT); if (error) return (error); } if (cred->cr_uid) { if (vp->v_type != VDIR && (mode & S_ISTXT)) return (EFTYPE); if (!groupmember(ip->i_gid, cred) && (mode & ISGID)) return (EPERM); } ip->i_mode &= ~ALLPERMS; ip->i_mode |= (mode & ALLPERMS); ip->i_flag |= IN_CHANGE; return (0); } /* * Perform chown operation on inode ip; * inode must be locked prior to call. */ static int ufs_chown(vp, uid, gid, cred, p) register struct vnode *vp; uid_t uid; gid_t gid; struct ucred *cred; struct proc *p; { register struct inode *ip = VTOI(vp); uid_t ouid; gid_t ogid; int error = 0; #ifdef QUOTA register int i; long change; #endif if (uid == (uid_t)VNOVAL) uid = ip->i_uid; if (gid == (gid_t)VNOVAL) gid = ip->i_gid; /* * If we don't own the file, are trying to change the owner * of the file, or are not a member of the target group, * the caller must be superuser or the call fails. 
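 *
 * With QUOTA the id change is bracketed by a charge/uncharge dance:
 * the old uid/gid give back i_blocks blocks and one inode (chkdq/chkiq
 * with negative counts), the new ids are installed and charged, and if
 * either charge fails the original ids are restored and recharged so
 * the quota files stay balanced.  A non-root chown that actually
 * changes the ids also clears the setuid and setgid bits at the end.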
*/ if ((cred->cr_uid != ip->i_uid || uid != ip->i_uid || (gid != ip->i_gid && !groupmember((gid_t)gid, cred))) && (error = suser_xxx(cred, p, PRISON_ROOT))) return (error); ogid = ip->i_gid; ouid = ip->i_uid; #ifdef QUOTA if ((error = getinoquota(ip)) != 0) return (error); if (ouid == uid) { dqrele(vp, ip->i_dquot[USRQUOTA]); ip->i_dquot[USRQUOTA] = NODQUOT; } if (ogid == gid) { dqrele(vp, ip->i_dquot[GRPQUOTA]); ip->i_dquot[GRPQUOTA] = NODQUOT; } change = ip->i_blocks; (void) chkdq(ip, -change, cred, CHOWN); (void) chkiq(ip, -1, cred, CHOWN); for (i = 0; i < MAXQUOTAS; i++) { dqrele(vp, ip->i_dquot[i]); ip->i_dquot[i] = NODQUOT; } #endif ip->i_gid = gid; ip->i_uid = uid; #ifdef QUOTA if ((error = getinoquota(ip)) == 0) { if (ouid == uid) { dqrele(vp, ip->i_dquot[USRQUOTA]); ip->i_dquot[USRQUOTA] = NODQUOT; } if (ogid == gid) { dqrele(vp, ip->i_dquot[GRPQUOTA]); ip->i_dquot[GRPQUOTA] = NODQUOT; } if ((error = chkdq(ip, change, cred, CHOWN)) == 0) { if ((error = chkiq(ip, 1, cred, CHOWN)) == 0) goto good; else (void) chkdq(ip, -change, cred, CHOWN|FORCE); } for (i = 0; i < MAXQUOTAS; i++) { dqrele(vp, ip->i_dquot[i]); ip->i_dquot[i] = NODQUOT; } } ip->i_gid = ogid; ip->i_uid = ouid; if (getinoquota(ip) == 0) { if (ouid == uid) { dqrele(vp, ip->i_dquot[USRQUOTA]); ip->i_dquot[USRQUOTA] = NODQUOT; } if (ogid == gid) { dqrele(vp, ip->i_dquot[GRPQUOTA]); ip->i_dquot[GRPQUOTA] = NODQUOT; } (void) chkdq(ip, change, cred, FORCE|CHOWN); (void) chkiq(ip, 1, cred, FORCE|CHOWN); (void) getinoquota(ip); } return (error); good: if (getinoquota(ip)) panic("ufs_chown: lost quota"); #endif /* QUOTA */ ip->i_flag |= IN_CHANGE; if (cred->cr_uid != 0 && (ouid != uid || ogid != gid)) ip->i_mode &= ~(ISUID | ISGID); return (0); } /* * Mmap a file * * NB Currently unsupported. 
*/ /* ARGSUSED */ int ufs_mmap(ap) struct vop_mmap_args /* { struct vnode *a_vp; int a_fflags; struct ucred *a_cred; struct proc *a_p; } */ *ap; { return (EINVAL); } int ufs_remove(ap) struct vop_remove_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { struct inode *ip; struct vnode *vp = ap->a_vp; struct vnode *dvp = ap->a_dvp; int error; ip = VTOI(vp); if ((ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) || (VTOI(dvp)->i_flags & APPEND)) { error = EPERM; goto out; } error = ufs_dirremove(dvp, ip, ap->a_cnp->cn_flags, 0); VN_POLLEVENT(vp, POLLNLINK); VN_POLLEVENT(dvp, POLLWRITE); out: return (error); } /* * link vnode call */ int ufs_link(ap) struct vop_link_args /* { struct vnode *a_tdvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { struct vnode *vp = ap->a_vp; struct vnode *tdvp = ap->a_tdvp; struct componentname *cnp = ap->a_cnp; struct proc *p = cnp->cn_proc; struct inode *ip; struct direct newdir; int error; #ifdef DIAGNOSTIC if ((cnp->cn_flags & HASBUF) == 0) panic("ufs_link: no name"); #endif if (tdvp->v_mount != vp->v_mount) { VOP_ABORTOP(tdvp, cnp); error = EXDEV; goto out2; } if (tdvp != vp && (error = vn_lock(vp, LK_EXCLUSIVE, p))) { VOP_ABORTOP(tdvp, cnp); goto out2; } ip = VTOI(vp); if ((nlink_t)ip->i_nlink >= LINK_MAX) { VOP_ABORTOP(tdvp, cnp); error = EMLINK; goto out1; } if (ip->i_flags & (IMMUTABLE | APPEND)) { VOP_ABORTOP(tdvp, cnp); error = EPERM; goto out1; } ip->i_effnlink++; ip->i_nlink++; ip->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(vp)) softdep_increase_linkcnt(ip); error = UFS_UPDATE(vp, !(DOINGSOFTDEP(vp) | DOINGASYNC(vp))); if (!error) { ufs_makedirentry(ip, cnp, &newdir); error = ufs_direnter(tdvp, vp, &newdir, cnp, NULL); } if (error) { ip->i_effnlink--; ip->i_nlink--; ip->i_flag |= IN_CHANGE; } zfree(namei_zone, cnp->cn_pnbuf); out1: if (tdvp != vp) VOP_UNLOCK(vp, 0, p); out2: VN_POLLEVENT(vp, POLLNLINK); VN_POLLEVENT(tdvp, POLLWRITE); return (error); } /* * whiteout vnode call */ int ufs_whiteout(ap) struct vop_whiteout_args /* { struct vnode *a_dvp; struct componentname *a_cnp; int a_flags; } */ *ap; { struct vnode *dvp = ap->a_dvp; struct componentname *cnp = ap->a_cnp; struct direct newdir; int error = 0; switch (ap->a_flags) { case LOOKUP: /* 4.4 format directories support whiteout operations */ if (dvp->v_mount->mnt_maxsymlinklen > 0) return (0); return (EOPNOTSUPP); case CREATE: /* create a new directory whiteout */ #ifdef DIAGNOSTIC if ((cnp->cn_flags & SAVENAME) == 0) panic("ufs_whiteout: missing name"); if (dvp->v_mount->mnt_maxsymlinklen <= 0) panic("ufs_whiteout: old format filesystem"); #endif newdir.d_ino = WINO; newdir.d_namlen = cnp->cn_namelen; bcopy(cnp->cn_nameptr, newdir.d_name, (unsigned)cnp->cn_namelen + 1); newdir.d_type = DT_WHT; error = ufs_direnter(dvp, NULL, &newdir, cnp, NULL); break; case DELETE: /* remove an existing directory whiteout */ #ifdef DIAGNOSTIC if (dvp->v_mount->mnt_maxsymlinklen <= 0) panic("ufs_whiteout: old format filesystem"); #endif cnp->cn_flags &= ~DOWHITEOUT; error = ufs_dirremove(dvp, NULL, cnp->cn_flags, 0); break; default: panic("ufs_whiteout: unknown op"); } if (cnp->cn_flags & HASBUF) { zfree(namei_zone, cnp->cn_pnbuf); cnp->cn_flags &= ~HASBUF; } return (error); } /* * Rename system call. * rename("foo", "bar"); * is essentially * unlink("bar"); * link("foo", "bar"); * unlink("foo"); * but ``atomically''. Can't do full commit without saving state in the * inode on disk which isn't feasible at this time. 
Best we can do is * always guarantee the target exists. * * Basic algorithm is: * * 1) Bump link count on source while we're linking it to the * target. This also ensure the inode won't be deleted out * from underneath us while we work (it may be truncated by * a concurrent `trunc' or `open' for creation). * 2) Link source to destination. If destination already exists, * delete it first. * 3) Unlink source reference to inode if still around. If a * directory was moved and the parent of the destination * is different from the source, patch the ".." entry in the * directory. */ int ufs_rename(ap) struct vop_rename_args /* { struct vnode *a_fdvp; struct vnode *a_fvp; struct componentname *a_fcnp; struct vnode *a_tdvp; struct vnode *a_tvp; struct componentname *a_tcnp; } */ *ap; { struct vnode *tvp = ap->a_tvp; register struct vnode *tdvp = ap->a_tdvp; struct vnode *fvp = ap->a_fvp; struct vnode *fdvp = ap->a_fdvp; struct componentname *tcnp = ap->a_tcnp; struct componentname *fcnp = ap->a_fcnp; struct proc *p = fcnp->cn_proc; struct inode *ip, *xp, *dp; struct direct newdir; int doingdirectory = 0, oldparent = 0, newparent = 0; int error = 0, ioflag; #ifdef DIAGNOSTIC if ((tcnp->cn_flags & HASBUF) == 0 || (fcnp->cn_flags & HASBUF) == 0) panic("ufs_rename: no name"); #endif /* * Check for cross-device rename. */ if ((fvp->v_mount != tdvp->v_mount) || (tvp && (fvp->v_mount != tvp->v_mount))) { error = EXDEV; abortit: VOP_ABORTOP(tdvp, tcnp); if (tdvp == tvp) vrele(tdvp); else vput(tdvp); if (tvp) vput(tvp); VOP_ABORTOP(fdvp, fcnp); vrele(fdvp); vrele(fvp); return (error); } if (tvp && ((VTOI(tvp)->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) || (VTOI(tdvp)->i_flags & APPEND))) { error = EPERM; goto abortit; } /* * Check if just deleting a link name or if we've lost a race. * If another process completes the same rename after we've looked * up the source and have blocked looking up the target, then the * source and target inodes may be identical now although the * names were never linked. */ if (fvp == tvp) { if (fvp->v_type == VDIR) { /* * Linked directories are impossible, so we must * have lost the race. Pretend that the rename * completed before the lookup. */ #ifdef UFS_RENAME_DEBUG printf("ufs_rename: fvp == tvp for directories\n"); #endif error = ENOENT; goto abortit; } /* Release destination completely. */ VOP_ABORTOP(tdvp, tcnp); vput(tdvp); vput(tvp); /* * Delete source. There is another race now that everything * is unlocked, but this doesn't cause any new complications. * Relookup() may find a file that is unrelated to the * original one, or it may fail. Too bad. */ vrele(fdvp); vrele(fvp); fcnp->cn_flags &= ~MODMASK; fcnp->cn_flags |= LOCKPARENT | LOCKLEAF; if ((fcnp->cn_flags & SAVESTART) == 0) panic("ufs_rename: lost from startdir"); fcnp->cn_nameiop = DELETE; VREF(fdvp); error = relookup(fdvp, &fvp, fcnp); if (error == 0) vrele(fdvp); if (fvp == NULL) { #ifdef UFS_RENAME_DEBUG printf("ufs_rename: from name disappeared\n"); #endif return (ENOENT); } error = VOP_REMOVE(fdvp, fvp, fcnp); if (fdvp == fvp) vrele(fdvp); else vput(fdvp); vput(fvp); return (error); } if ((error = vn_lock(fvp, LK_EXCLUSIVE, p)) != 0) goto abortit; dp = VTOI(fdvp); ip = VTOI(fvp); if (ip->i_nlink >= LINK_MAX) { VOP_UNLOCK(fvp, 0, p); error = EMLINK; goto abortit; } if ((ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) || (dp->i_flags & APPEND)) { VOP_UNLOCK(fvp, 0, p); error = EPERM; goto abortit; } if ((ip->i_mode & IFMT) == IFDIR) { /* * Avoid ".", "..", and aliases of "." for obvious reasons. 
*/ if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') || dp == ip || (fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT || (ip->i_flag & IN_RENAME)) { VOP_UNLOCK(fvp, 0, p); error = EINVAL; goto abortit; } ip->i_flag |= IN_RENAME; oldparent = dp->i_number; doingdirectory = 1; } VN_POLLEVENT(fdvp, POLLWRITE); vrele(fdvp); /* * When the target exists, both the directory * and target vnodes are returned locked. */ dp = VTOI(tdvp); xp = NULL; if (tvp) xp = VTOI(tvp); /* * 1) Bump link count while we're moving stuff * around. If we crash somewhere before * completing our work, the link count * may be wrong, but correctable. */ ip->i_effnlink++; ip->i_nlink++; ip->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(fvp)) softdep_increase_linkcnt(ip); if ((error = UFS_UPDATE(fvp, !(DOINGSOFTDEP(fvp) | DOINGASYNC(fvp)))) != 0) { VOP_UNLOCK(fvp, 0, p); goto bad; } /* * If ".." must be changed (ie the directory gets a new * parent) then the source directory must not be in the * directory heirarchy above the target, as this would * orphan everything below the source directory. Also * the user must have write permission in the source so * as to be able to change "..". We must repeat the call * to namei, as the parent directory is unlocked by the * call to checkpath(). */ error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_proc); VOP_UNLOCK(fvp, 0, p); if (oldparent != dp->i_number) newparent = dp->i_number; if (doingdirectory && newparent) { if (error) /* write access check above */ goto bad; if (xp != NULL) vput(tvp); error = ufs_checkpath(ip, dp, tcnp->cn_cred); if (error) goto out; if ((tcnp->cn_flags & SAVESTART) == 0) panic("ufs_rename: lost to startdir"); VREF(tdvp); error = relookup(tdvp, &tvp, tcnp); if (error) goto out; vrele(tdvp); dp = VTOI(tdvp); xp = NULL; if (tvp) xp = VTOI(tvp); } /* * 2) If target doesn't exist, link the target * to the source and unlink the source. * Otherwise, rewrite the target directory * entry to reference the source inode and * expunge the original entry's existence. */ if (xp == NULL) { if (dp->i_dev != ip->i_dev) panic("ufs_rename: EXDEV"); /* * Account for ".." in new directory. * When source and destination have the same * parent we don't fool with the link count. */ if (doingdirectory && newparent) { if ((nlink_t)dp->i_nlink >= LINK_MAX) { error = EMLINK; goto bad; } dp->i_effnlink++; dp->i_nlink++; dp->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(tdvp)) softdep_increase_linkcnt(dp); error = UFS_UPDATE(tdvp, !(DOINGSOFTDEP(tdvp) | DOINGASYNC(tdvp))); if (error) goto bad; } ufs_makedirentry(ip, tcnp, &newdir); error = ufs_direnter(tdvp, NULL, &newdir, tcnp, NULL); if (error) { if (doingdirectory && newparent) { dp->i_effnlink--; dp->i_nlink--; dp->i_flag |= IN_CHANGE; (void)UFS_UPDATE(tdvp, 1); } goto bad; } VN_POLLEVENT(tdvp, POLLWRITE); vput(tdvp); } else { if (xp->i_dev != dp->i_dev || xp->i_dev != ip->i_dev) panic("ufs_rename: EXDEV"); /* * Short circuit rename(foo, foo). */ if (xp->i_number == ip->i_number) panic("ufs_rename: same file"); /* * If the parent directory is "sticky", then the user must * own the parent directory, or the destination of the rename, * otherwise the destination may not be changed (except by * root). This implements append-only directories. */ if ((dp->i_mode & S_ISTXT) && tcnp->cn_cred->cr_uid != 0 && tcnp->cn_cred->cr_uid != dp->i_uid && xp->i_uid != tcnp->cn_cred->cr_uid) { error = EPERM; goto bad; } /* * Target must be empty if a directory and have no links * to it. 
Also, ensure source and target are compatible * (both directories, or both not directories). */ if ((xp->i_mode&IFMT) == IFDIR) { if ((xp->i_effnlink > 2) || !ufs_dirempty(xp, dp->i_number, tcnp->cn_cred)) { error = ENOTEMPTY; goto bad; } if (!doingdirectory) { error = ENOTDIR; goto bad; } cache_purge(tdvp); } else if (doingdirectory) { error = EISDIR; goto bad; } error = ufs_dirrewrite(dp, xp, ip->i_number, IFTODT(ip->i_mode), (doingdirectory && newparent) ? newparent : doingdirectory); if (error) goto bad; if (doingdirectory) { if (!newparent) { dp->i_effnlink--; dp->i_flag |= IN_CHANGE; } xp->i_effnlink--; xp->i_flag |= IN_CHANGE; } VN_POLLEVENT(tdvp, POLLWRITE); if (doingdirectory && !DOINGSOFTDEP(tvp)) { /* * Truncate inode. The only stuff left in the directory * is "." and "..". The "." reference is inconsequential * since we are quashing it. We have removed the "." * reference and the reference in the parent directory, * but there may be other hard links. The soft * dependency code will arrange to do these operations * after the parent directory entry has been deleted on * disk, so when running with that code we avoid doing * them now. */ if (!newparent) dp->i_nlink--; xp->i_nlink--; ioflag = DOINGASYNC(tvp) ? 0 : IO_SYNC; if ((error = UFS_TRUNCATE(tvp, (off_t)0, ioflag, tcnp->cn_cred, tcnp->cn_proc)) != 0) goto bad; } vput(tdvp); VN_POLLEVENT(tvp, POLLNLINK); /* XXX this right? */ vput(tvp); xp = NULL; } /* * 3) Unlink the source. */ fcnp->cn_flags &= ~MODMASK; fcnp->cn_flags |= LOCKPARENT | LOCKLEAF; if ((fcnp->cn_flags & SAVESTART) == 0) panic("ufs_rename: lost from startdir"); VREF(fdvp); error = relookup(fdvp, &fvp, fcnp); if (error == 0) vrele(fdvp); if (fvp != NULL) { xp = VTOI(fvp); dp = VTOI(fdvp); } else { /* * From name has disappeared. */ if (doingdirectory) panic("ufs_rename: lost dir entry"); vrele(ap->a_fvp); return (0); } /* * Ensure that the directory entry still exists and has not * changed while the new name has been entered. If the source is * a file then the entry may have been unlinked or renamed. In * either case there is no further work to be done. If the source * is a directory then it cannot have been rmdir'ed; the IN_RENAME * flag ensures that it cannot be moved by another rename or removed * by a rmdir. */ if (xp != ip) { if (doingdirectory) panic("ufs_rename: lost dir entry"); } else { /* * If the source is a directory with a * new parent, the link count of the old * parent directory must be decremented * and ".." set to point to the new parent. 
*/ if (doingdirectory && newparent) { xp->i_offset = mastertemplate.dot_reclen; ufs_dirrewrite(xp, dp, newparent, DT_DIR, 0); cache_purge(fdvp); } error = ufs_dirremove(fdvp, xp, fcnp->cn_flags, 0); xp->i_flag &= ~IN_RENAME; } if (dp) vput(fdvp); if (xp) vput(fvp); vrele(ap->a_fvp); return (error); bad: if (xp) vput(ITOV(xp)); vput(ITOV(dp)); out: if (doingdirectory) ip->i_flag &= ~IN_RENAME; if (vn_lock(fvp, LK_EXCLUSIVE, p) == 0) { ip->i_effnlink--; ip->i_nlink--; ip->i_flag |= IN_CHANGE; ip->i_flag &= ~IN_RENAME; vput(fvp); } else vrele(fvp); return (error); } /* * Mkdir system call */ int ufs_mkdir(ap) struct vop_mkdir_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { register struct vnode *dvp = ap->a_dvp; register struct vattr *vap = ap->a_vap; register struct componentname *cnp = ap->a_cnp; register struct inode *ip, *dp; struct vnode *tvp; struct buf *bp; struct dirtemplate dirtemplate, *dtp; struct direct newdir; int error, dmode; long blkoff; #ifdef DIAGNOSTIC if ((cnp->cn_flags & HASBUF) == 0) panic("ufs_mkdir: no name"); #endif dp = VTOI(dvp); if ((nlink_t)dp->i_nlink >= LINK_MAX) { error = EMLINK; goto out; } dmode = vap->va_mode & 0777; dmode |= IFDIR; /* * Must simulate part of ufs_makeinode here to acquire the inode, * but not have it entered in the parent directory. The entry is * made later after writing "." and ".." entries. */ error = UFS_VALLOC(dvp, dmode, cnp->cn_cred, &tvp); if (error) goto out; ip = VTOI(tvp); ip->i_gid = dp->i_gid; #ifdef SUIDDIR { #ifdef QUOTA struct ucred ucred, *ucp; ucp = cnp->cn_cred; #endif I /* * If we are hacking owners here, (only do this where told to) * and we are not giving it TOO root, (would subvert quotas) * then go ahead and give it to the other user. * The new directory also inherits the SUID bit. * If user's UID and dir UID are the same, * 'give it away' so that the SUID is still forced on. */ if ((dvp->v_mount->mnt_flag & MNT_SUIDDIR) && (dp->i_mode & ISUID) && dp->i_uid) { dmode |= ISUID; ip->i_uid = dp->i_uid; #ifdef QUOTA if (dp->i_uid != cnp->cn_cred->cr_uid) { /* * Make sure the correct user gets charged * for the space. * Make a dummy credential for the victim. * XXX This seems to never be accessed out of * our context so a stack variable is ok. */ ucred.cr_ref = 1; ucred.cr_uid = ip->i_uid; ucred.cr_ngroups = 1; ucred.cr_groups[0] = dp->i_gid; ucp = &ucred; } #endif } else ip->i_uid = cnp->cn_cred->cr_uid; #ifdef QUOTA if ((error = getinoquota(ip)) || (error = chkiq(ip, 1, ucp, 0))) { zfree(namei_zone, cnp->cn_pnbuf); UFS_VFREE(tvp, ip->i_number, dmode); vput(tvp); return (error); } #endif } #else /* !SUIDDIR */ ip->i_uid = cnp->cn_cred->cr_uid; #ifdef QUOTA if ((error = getinoquota(ip)) || (error = chkiq(ip, 1, cnp->cn_cred, 0))) { zfree(namei_zone, cnp->cn_pnbuf); UFS_VFREE(tvp, ip->i_number, dmode); vput(tvp); return (error); } #endif #endif /* !SUIDDIR */ ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; ip->i_mode = dmode; tvp->v_type = VDIR; /* Rest init'd in getnewvnode(). */ ip->i_effnlink = 2; ip->i_nlink = 2; if (DOINGSOFTDEP(tvp)) softdep_increase_linkcnt(ip); if (cnp->cn_flags & ISWHITEOUT) ip->i_flags |= UF_OPAQUE; /* * Bump link count in parent directory to reflect work done below. * Should be done before reference is created so cleanup is * possible if we crash. 
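 * A newly made directory contributes one ".." reference back to the
 * parent, so a parent whose link count was 2 before this mkdir will sit
 * at 3 once the operation completes.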
*/ dp->i_effnlink++; dp->i_nlink++; dp->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(dvp)) softdep_increase_linkcnt(dp); error = UFS_UPDATE(tvp, !(DOINGSOFTDEP(dvp) | DOINGASYNC(dvp))); if (error) goto bad; /* * Initialize directory with "." and ".." from static template. */ if (dvp->v_mount->mnt_maxsymlinklen > 0 ) dtp = &mastertemplate; else dtp = (struct dirtemplate *)&omastertemplate; dirtemplate = *dtp; dirtemplate.dot_ino = ip->i_number; dirtemplate.dotdot_ino = dp->i_number; if ((error = VOP_BALLOC(tvp, (off_t)0, DIRBLKSIZ, cnp->cn_cred, B_CLRBUF, &bp)) != 0) goto bad; ip->i_size = DIRBLKSIZ; ip->i_flag |= IN_CHANGE | IN_UPDATE; vnode_pager_setsize(tvp, (u_long)ip->i_size); bcopy((caddr_t)&dirtemplate, (caddr_t)bp->b_data, sizeof dirtemplate); if (DOINGSOFTDEP(tvp)) { /* * Ensure that the entire newly allocated block is a * valid directory so that future growth within the * block does not have to ensure that the block is * written before the inode. */ blkoff = DIRBLKSIZ; while (blkoff < bp->b_bcount) { ((struct direct *) (bp->b_data + blkoff))->d_reclen = DIRBLKSIZ; blkoff += DIRBLKSIZ; } } if ((error = UFS_UPDATE(tvp, !(DOINGSOFTDEP(tvp) | DOINGASYNC(tvp)))) != 0) { (void)VOP_BWRITE(bp->b_vp, bp); goto bad; } VN_POLLEVENT(dvp, POLLWRITE); /* XXX right place? */ /* * Directory set up, now install its entry in the parent directory. * * If we are not doing soft dependencies, then we must write out the * buffer containing the new directory body before entering the new * name in the parent. If we are doing soft dependencies, then the * buffer containing the new directory body will be passed to and * released in the soft dependency code after the code has attached * an appropriate ordering dependency to the buffer which ensures that * the buffer is written before the new name is written in the parent. */ if (DOINGASYNC(dvp)) bdwrite(bp); else if (!DOINGSOFTDEP(dvp) && ((error = VOP_BWRITE(bp->b_vp, bp)))) goto bad; ufs_makedirentry(ip, cnp, &newdir); error = ufs_direnter(dvp, tvp, &newdir, cnp, bp); bad: if (error == 0) { *ap->a_vpp = tvp; } else { dp->i_effnlink--; dp->i_nlink--; dp->i_flag |= IN_CHANGE; /* * No need to do an explicit VOP_TRUNCATE here, vrele will * do this for us because we set the link count to 0. */ ip->i_effnlink = 0; ip->i_nlink = 0; ip->i_flag |= IN_CHANGE; vput(tvp); } out: zfree(namei_zone, cnp->cn_pnbuf); return (error); } /* * Rmdir system call. */ int ufs_rmdir(ap) struct vop_rmdir_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { struct vnode *vp = ap->a_vp; struct vnode *dvp = ap->a_dvp; struct componentname *cnp = ap->a_cnp; struct inode *ip, *dp; int error, ioflag; ip = VTOI(vp); dp = VTOI(dvp); /* * Do not remove a directory that is in the process of being renamed. * Verify the directory is empty (and valid). Rmdir ".." will not be * valid since ".." will contain a reference to the current directory * and thus be non-empty. Do not allow the removal of mounted on * directories (this can happen when an NFS exported filesystem * tries to remove a locally mounted on directory). */ error = 0; if (ip->i_flag & IN_RENAME) { error = EINVAL; goto out; } if (ip->i_effnlink != 2 || !ufs_dirempty(ip, dp->i_number, cnp->cn_cred)) { error = ENOTEMPTY; goto out; } if ((dp->i_flags & APPEND) || (ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND))) { error = EPERM; goto out; } if (vp->v_mountedhere != 0) { error = EINVAL; goto out; } /* * Delete reference to directory before purging * inode. 
If we crash in between, the directory * will be reattached to lost+found, */ error = ufs_dirremove(dvp, ip, cnp->cn_flags, 1); if (error) goto out; VN_POLLEVENT(dvp, POLLWRITE|POLLNLINK); cache_purge(dvp); /* * Truncate inode. The only stuff left in the directory is "." and * "..". The "." reference is inconsequential since we are quashing * it. We have removed the "." reference and the reference in the * parent directory, but there may be other hard links. So, * ufs_dirremove will set the UF_IMMUTABLE flag to ensure that no * new entries are made. The soft dependency code will arrange to * do these operations after the parent directory entry has been * deleted on disk, so when running with that code we avoid doing * them now. */ dp->i_effnlink--; dp->i_flag |= IN_CHANGE; ip->i_effnlink--; ip->i_flag |= IN_CHANGE; if (!DOINGSOFTDEP(vp)) { dp->i_nlink--; ip->i_nlink--; ioflag = DOINGASYNC(vp) ? 0 : IO_SYNC; error = UFS_TRUNCATE(vp, (off_t)0, ioflag, cnp->cn_cred, cnp->cn_proc); } cache_purge(vp); out: VN_POLLEVENT(vp, POLLNLINK); return (error); } /* * symlink -- make a symbolic link */ int ufs_symlink(ap) struct vop_symlink_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; char *a_target; } */ *ap; { register struct vnode *vp, **vpp = ap->a_vpp; register struct inode *ip; int len, error; error = ufs_makeinode(IFLNK | ap->a_vap->va_mode, ap->a_dvp, vpp, ap->a_cnp); if (error) return (error); VN_POLLEVENT(ap->a_dvp, POLLWRITE); vp = *vpp; len = strlen(ap->a_target); if (len < vp->v_mount->mnt_maxsymlinklen) { ip = VTOI(vp); bcopy(ap->a_target, (char *)ip->i_shortlink, len); ip->i_size = len; ip->i_flag |= IN_CHANGE | IN_UPDATE; } else error = vn_rdwr(UIO_WRITE, vp, ap->a_target, len, (off_t)0, UIO_SYSSPACE, IO_NODELOCKED, ap->a_cnp->cn_cred, (int *)0, (struct proc *)0); vput(vp); return (error); } /* * Vnode op for reading directories. * * The routine below assumes that the on-disk format of a directory * is the same as that defined by . If the on-disk * format changes, then it will be necessary to do a conversion * from the on-disk format that read returns to the format defined * by . */ int ufs_readdir(ap) struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; int *ncookies; u_long **a_cookies; } */ *ap; { register struct uio *uio = ap->a_uio; int error; size_t count, lost; off_t off; if (ap->a_ncookies != NULL) /* * Ensure that the block is aligned. The caller can use * the cookies to determine where in the block to start. */ uio->uio_offset &= ~(DIRBLKSIZ - 1); off = uio->uio_offset; count = uio->uio_resid; /* Make sure we don't return partial entries. 
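 * For example, with DIRBLKSIZ at its usual 512, an offset of 100 and a
 * resid of 1000 trims (100 + 1000) & 511 = 76 bytes, so the transfer
 * ends exactly on a directory-block boundary.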
*/ if (count <= ((uio->uio_offset + count) & (DIRBLKSIZ -1))) return (EINVAL); count -= (uio->uio_offset + count) & (DIRBLKSIZ -1); lost = uio->uio_resid - count; uio->uio_resid = count; uio->uio_iov->iov_len = count; # if (BYTE_ORDER == LITTLE_ENDIAN) if (ap->a_vp->v_mount->mnt_maxsymlinklen > 0) { error = VOP_READ(ap->a_vp, uio, 0, ap->a_cred); } else { struct dirent *dp, *edp; struct uio auio; struct iovec aiov; caddr_t dirbuf; int readcnt; u_char tmp; auio = *uio; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_segflg = UIO_SYSSPACE; aiov.iov_len = count; MALLOC(dirbuf, caddr_t, count, M_TEMP, M_WAITOK); aiov.iov_base = dirbuf; error = VOP_READ(ap->a_vp, &auio, 0, ap->a_cred); if (error == 0) { readcnt = count - auio.uio_resid; edp = (struct dirent *)&dirbuf[readcnt]; for (dp = (struct dirent *)dirbuf; dp < edp; ) { tmp = dp->d_namlen; dp->d_namlen = dp->d_type; dp->d_type = tmp; if (dp->d_reclen > 0) { dp = (struct dirent *) ((char *)dp + dp->d_reclen); } else { error = EIO; break; } } if (dp >= edp) error = uiomove(dirbuf, readcnt, uio); } FREE(dirbuf, M_TEMP); } # else error = VOP_READ(ap->a_vp, uio, 0, ap->a_cred); # endif if (!error && ap->a_ncookies != NULL) { struct dirent* dpStart; struct dirent* dpEnd; struct dirent* dp; int ncookies; u_long *cookies; u_long *cookiep; if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) panic("ufs_readdir: unexpected uio from NFS server"); dpStart = (struct dirent *) (uio->uio_iov->iov_base - (uio->uio_offset - off)); dpEnd = (struct dirent *) uio->uio_iov->iov_base; for (dp = dpStart, ncookies = 0; dp < dpEnd; dp = (struct dirent *)((caddr_t) dp + dp->d_reclen)) ncookies++; MALLOC(cookies, u_long *, ncookies * sizeof(u_long), M_TEMP, M_WAITOK); for (dp = dpStart, cookiep = cookies; dp < dpEnd; dp = (struct dirent *)((caddr_t) dp + dp->d_reclen)) { off += dp->d_reclen; *cookiep++ = (u_long) off; } *ap->a_ncookies = ncookies; *ap->a_cookies = cookies; } uio->uio_resid += lost; if (ap->a_eofflag) *ap->a_eofflag = VTOI(ap->a_vp)->i_size <= uio->uio_offset; return (error); } /* * Return target name of a symbolic link */ int ufs_readlink(ap) struct vop_readlink_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct inode *ip = VTOI(vp); int isize; isize = ip->i_size; if ((isize < vp->v_mount->mnt_maxsymlinklen) || (ip->i_din.di_blocks == 0)) { /* XXX - for old fastlink support */ uiomove((char *)ip->i_shortlink, isize, ap->a_uio); return (0); } return (VOP_READ(vp, ap->a_uio, 0, ap->a_cred)); } /* * Ufs abort op, called after namei() when a CREATE/DELETE isn't actually * done. If a buffer has been saved in anticipation of a CREATE, delete it. */ /* ARGSUSED */ int ufs_abortop(ap) struct vop_abortop_args /* { struct vnode *a_dvp; struct componentname *a_cnp; } */ *ap; { if ((ap->a_cnp->cn_flags & (HASBUF | SAVESTART)) == HASBUF) zfree(namei_zone, ap->a_cnp->cn_pnbuf); return (0); } /* * Calculate the logical to physical mapping if not done already, * then call the device strategy routine. * * In order to be able to swap to a file, the VOP_BMAP operation may not * deadlock on memory. See ufs_bmap() for details. 
*/ int ufs_strategy(ap) struct vop_strategy_args /* { struct vnode *a_vp; struct buf *a_bp; } */ *ap; { register struct buf *bp = ap->a_bp; register struct vnode *vp = ap->a_vp; register struct inode *ip; int error; ip = VTOI(vp); if (vp->v_type == VBLK || vp->v_type == VCHR) panic("ufs_strategy: spec"); if (bp->b_blkno == bp->b_lblkno) { error = VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); if (error) { bp->b_error = error; bp->b_flags |= B_ERROR; biodone(bp); return (error); } if ((long)bp->b_blkno == -1) vfs_bio_clrbuf(bp); } if ((long)bp->b_blkno == -1) { biodone(bp); return (0); } vp = ip->i_devvp; bp->b_dev = vp->v_rdev; VOP_STRATEGY(vp, bp); return (0); } /* * Print out the contents of an inode. */ int ufs_print(ap) struct vop_print_args /* { struct vnode *a_vp; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct inode *ip = VTOI(vp); printf("tag VT_UFS, ino %lu, on dev %#lx (%d, %d)", (u_long)ip->i_number, (u_long)ip->i_dev, major(ip->i_dev), minor(ip->i_dev)); if (vp->v_type == VFIFO) fifo_printinfo(vp); lockmgr_printinfo(&ip->i_lock); printf("\n"); return (0); } /* * Read wrapper for special devices. */ int ufsspec_read(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { int error, resid; struct inode *ip; struct uio *uio; uio = ap->a_uio; resid = uio->uio_resid; error = VOCALL(spec_vnodeop_p, VOFFSET(vop_read), ap); /* * The inode may have been revoked during the call, so it must not * be accessed blindly here or in the other wrapper functions. */ ip = VTOI(ap->a_vp); if (ip != NULL && (uio->uio_resid != resid || (error == 0 && resid != 0))) ip->i_flag |= IN_ACCESS; return (error); } /* * Write wrapper for special devices. */ int ufsspec_write(ap) struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { int error, resid; struct inode *ip; struct uio *uio; uio = ap->a_uio; resid = uio->uio_resid; error = VOCALL(spec_vnodeop_p, VOFFSET(vop_write), ap); ip = VTOI(ap->a_vp); if (ip != NULL && (uio->uio_resid != resid || (error == 0 && resid != 0))) VTOI(ap->a_vp)->i_flag |= IN_CHANGE | IN_UPDATE; return (error); } /* * Close wrapper for special devices. * * Update the times on the inode then do device close. */ int ufsspec_close(ap) struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct proc *a_p; } */ *ap; { struct vnode *vp = ap->a_vp; simple_lock(&vp->v_interlock); if (vp->v_usecount > 1) ufs_itimes(vp); simple_unlock(&vp->v_interlock); return (VOCALL(spec_vnodeop_p, VOFFSET(vop_close), ap)); } /* * Read wrapper for fifos. */ int ufsfifo_read(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { int error, resid; struct inode *ip; struct uio *uio; uio = ap->a_uio; resid = uio->uio_resid; error = VOCALL(fifo_vnodeop_p, VOFFSET(vop_read), ap); ip = VTOI(ap->a_vp); if ((ap->a_vp->v_mount->mnt_flag & MNT_NOATIME) == 0 && ip != NULL && (uio->uio_resid != resid || (error == 0 && resid != 0))) VTOI(ap->a_vp)->i_flag |= IN_ACCESS; return (error); } /* * Write wrapper for fifos. 
*/ int ufsfifo_write(ap) struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { int error, resid; struct inode *ip; struct uio *uio; uio = ap->a_uio; resid = uio->uio_resid; error = VOCALL(fifo_vnodeop_p, VOFFSET(vop_write), ap); ip = VTOI(ap->a_vp); if (ip != NULL && (uio->uio_resid != resid || (error == 0 && resid != 0))) VTOI(ap->a_vp)->i_flag |= IN_CHANGE | IN_UPDATE; return (error); } /* * Close wrapper for fifos. * * Update the times on the inode then do device close. */ int ufsfifo_close(ap) struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct proc *a_p; } */ *ap; { struct vnode *vp = ap->a_vp; simple_lock(&vp->v_interlock); if (vp->v_usecount > 1) ufs_itimes(vp); simple_unlock(&vp->v_interlock); return (VOCALL(fifo_vnodeop_p, VOFFSET(vop_close), ap)); } /* * Return POSIX pathconf information applicable to ufs filesystems. */ int ufs_pathconf(ap) struct vop_pathconf_args /* { struct vnode *a_vp; int a_name; int *a_retval; } */ *ap; { switch (ap->a_name) { case _PC_LINK_MAX: *ap->a_retval = LINK_MAX; return (0); case _PC_NAME_MAX: *ap->a_retval = NAME_MAX; return (0); case _PC_PATH_MAX: *ap->a_retval = PATH_MAX; return (0); case _PC_PIPE_BUF: *ap->a_retval = PIPE_BUF; return (0); case _PC_CHOWN_RESTRICTED: *ap->a_retval = 1; return (0); case _PC_NO_TRUNC: *ap->a_retval = 1; return (0); default: return (EINVAL); } /* NOTREACHED */ } /* * Advisory record locking support */ int ufs_advlock(ap) struct vop_advlock_args /* { struct vnode *a_vp; caddr_t a_id; int a_op; struct flock *a_fl; int a_flags; } */ *ap; { register struct inode *ip = VTOI(ap->a_vp); return (lf_advlock(ap, &(ip->i_lockf), ip->i_size)); } /* * Initialize the vnode associated with a new inode, handle aliased * vnodes. */ int ufs_vinit(mntp, specops, fifoops, vpp) struct mount *mntp; vop_t **specops; vop_t **fifoops; struct vnode **vpp; { struct inode *ip; struct vnode *vp, *nvp; struct timeval tv; vp = *vpp; ip = VTOI(vp); switch(vp->v_type = IFTOVT(ip->i_mode)) { case VCHR: case VBLK: vp->v_op = specops; nvp = checkalias(vp, ip->i_rdev, mntp); if (nvp) { /* * Discard unneeded vnode, but save its inode. * Note that the lock is carried over in the inode * to the replacement vnode. */ nvp->v_data = vp->v_data; vp->v_data = NULL; vp->v_op = spec_vnodeop_p; vrele(vp); vgone(vp); /* * Reinitialize aliased inode. */ vp = nvp; ip->i_vnode = vp; } break; case VFIFO: vp->v_op = fifoops; break; default: break; } if (ip->i_number == ROOTINO) vp->v_flag |= VROOT; /* * Initialize modrev times */ getmicrouptime(&tv); SETHIGH(ip->i_modrev, tv.tv_sec); SETLOW(ip->i_modrev, tv.tv_usec * 4294); *vpp = vp; return (0); } /* * Allocate a new inode. 
*/ int ufs_makeinode(mode, dvp, vpp, cnp) int mode; struct vnode *dvp; struct vnode **vpp; struct componentname *cnp; { register struct inode *ip, *pdir; struct direct newdir; struct vnode *tvp; int error; pdir = VTOI(dvp); #ifdef DIAGNOSTIC if ((cnp->cn_flags & HASBUF) == 0) panic("ufs_makeinode: no name"); #endif *vpp = NULL; if ((mode & IFMT) == 0) mode |= IFREG; error = UFS_VALLOC(dvp, mode, cnp->cn_cred, &tvp); if (error) { zfree(namei_zone, cnp->cn_pnbuf); return (error); } ip = VTOI(tvp); ip->i_gid = pdir->i_gid; #ifdef SUIDDIR { #ifdef QUOTA struct ucred ucred, *ucp; ucp = cnp->cn_cred; #endif I /* * If we are not the owner of the directory, * and we are hacking owners here, (only do this where told to) * and we are not giving it TOO root, (would subvert quotas) * then go ahead and give it to the other user. * Note that this drops off the execute bits for security. */ if ((dvp->v_mount->mnt_flag & MNT_SUIDDIR) && (pdir->i_mode & ISUID) && (pdir->i_uid != cnp->cn_cred->cr_uid) && pdir->i_uid) { ip->i_uid = pdir->i_uid; mode &= ~07111; #ifdef QUOTA /* * Make sure the correct user gets charged * for the space. * Quickly knock up a dummy credential for the victim. * XXX This seems to never be accessed out of our * context so a stack variable is ok. */ ucred.cr_ref = 1; ucred.cr_uid = ip->i_uid; ucred.cr_ngroups = 1; ucred.cr_groups[0] = pdir->i_gid; ucp = &ucred; #endif } else ip->i_uid = cnp->cn_cred->cr_uid; #ifdef QUOTA if ((error = getinoquota(ip)) || (error = chkiq(ip, 1, ucp, 0))) { zfree(namei_zone, cnp->cn_pnbuf); UFS_VFREE(tvp, ip->i_number, mode); vput(tvp); return (error); } #endif } #else /* !SUIDDIR */ ip->i_uid = cnp->cn_cred->cr_uid; #ifdef QUOTA if ((error = getinoquota(ip)) || (error = chkiq(ip, 1, cnp->cn_cred, 0))) { zfree(namei_zone, cnp->cn_pnbuf); UFS_VFREE(tvp, ip->i_number, mode); vput(tvp); return (error); } #endif #endif /* !SUIDDIR */ ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; ip->i_mode = mode; tvp->v_type = IFTOVT(mode); /* Rest init'd in getnewvnode(). */ ip->i_effnlink = 1; ip->i_nlink = 1; if (DOINGSOFTDEP(tvp)) softdep_increase_linkcnt(ip); if ((ip->i_mode & ISGID) && !groupmember(ip->i_gid, cnp->cn_cred) && suser_xxx(cnp->cn_cred, 0, 0)) ip->i_mode &= ~ISGID; if (cnp->cn_flags & ISWHITEOUT) ip->i_flags |= UF_OPAQUE; /* * Make sure inode goes to disk before directory entry. */ error = UFS_UPDATE(tvp, !(DOINGSOFTDEP(tvp) | DOINGASYNC(tvp))); if (error) goto bad; ufs_makedirentry(ip, cnp, &newdir); error = ufs_direnter(dvp, tvp, &newdir, cnp, NULL); if (error) goto bad; if ((cnp->cn_flags & SAVESTART) == 0) zfree(namei_zone, cnp->cn_pnbuf); *vpp = tvp; return (0); bad: /* * Write error occurred trying to update the inode * or the directory so must deallocate the inode. */ zfree(namei_zone, cnp->cn_pnbuf); ip->i_effnlink = 0; ip->i_nlink = 0; ip->i_flag |= IN_CHANGE; vput(tvp); return (error); } static int ufs_missingop(ap) struct vop_generic_args *ap; { panic("no vop function for %s in ufs child", ap->a_desc->vdesc_name); return (EOPNOTSUPP); } /* Global vfs data structures for ufs. 
*/ static vop_t **ufs_vnodeop_p; static struct vnodeopv_entry_desc ufs_vnodeop_entries[] = { { &vop_default_desc, (vop_t *) vop_defaultop }, { &vop_fsync_desc, (vop_t *) ufs_missingop }, { &vop_read_desc, (vop_t *) ufs_missingop }, { &vop_reallocblks_desc, (vop_t *) ufs_missingop }, { &vop_write_desc, (vop_t *) ufs_missingop }, { &vop_abortop_desc, (vop_t *) ufs_abortop }, { &vop_access_desc, (vop_t *) ufs_access }, { &vop_advlock_desc, (vop_t *) ufs_advlock }, { &vop_bmap_desc, (vop_t *) ufs_bmap }, { &vop_cachedlookup_desc, (vop_t *) ufs_lookup }, { &vop_close_desc, (vop_t *) ufs_close }, { &vop_create_desc, (vop_t *) ufs_create }, { &vop_getattr_desc, (vop_t *) ufs_getattr }, { &vop_inactive_desc, (vop_t *) ufs_inactive }, { &vop_islocked_desc, (vop_t *) vop_stdislocked }, { &vop_link_desc, (vop_t *) ufs_link }, { &vop_lock_desc, (vop_t *) vop_stdlock }, { &vop_lookup_desc, (vop_t *) vfs_cache_lookup }, { &vop_mkdir_desc, (vop_t *) ufs_mkdir }, { &vop_mknod_desc, (vop_t *) ufs_mknod }, { &vop_mmap_desc, (vop_t *) ufs_mmap }, { &vop_open_desc, (vop_t *) ufs_open }, { &vop_pathconf_desc, (vop_t *) ufs_pathconf }, { &vop_poll_desc, (vop_t *) vop_stdpoll }, { &vop_print_desc, (vop_t *) ufs_print }, { &vop_readdir_desc, (vop_t *) ufs_readdir }, { &vop_readlink_desc, (vop_t *) ufs_readlink }, { &vop_reclaim_desc, (vop_t *) ufs_reclaim }, { &vop_remove_desc, (vop_t *) ufs_remove }, { &vop_rename_desc, (vop_t *) ufs_rename }, { &vop_rmdir_desc, (vop_t *) ufs_rmdir }, { &vop_setattr_desc, (vop_t *) ufs_setattr }, { &vop_strategy_desc, (vop_t *) ufs_strategy }, { &vop_symlink_desc, (vop_t *) ufs_symlink }, { &vop_unlock_desc, (vop_t *) vop_stdunlock }, { &vop_whiteout_desc, (vop_t *) ufs_whiteout }, { NULL, NULL } }; static struct vnodeopv_desc ufs_vnodeop_opv_desc = { &ufs_vnodeop_p, ufs_vnodeop_entries }; static vop_t **ufs_specop_p; static struct vnodeopv_entry_desc ufs_specop_entries[] = { { &vop_default_desc, (vop_t *) spec_vnoperate }, { &vop_fsync_desc, (vop_t *) ufs_missingop }, { &vop_access_desc, (vop_t *) ufs_access }, { &vop_close_desc, (vop_t *) ufsspec_close }, { &vop_getattr_desc, (vop_t *) ufs_getattr }, { &vop_inactive_desc, (vop_t *) ufs_inactive }, { &vop_islocked_desc, (vop_t *) vop_stdislocked }, { &vop_lock_desc, (vop_t *) vop_stdlock }, { &vop_print_desc, (vop_t *) ufs_print }, { &vop_read_desc, (vop_t *) ufsspec_read }, { &vop_reclaim_desc, (vop_t *) ufs_reclaim }, { &vop_setattr_desc, (vop_t *) ufs_setattr }, { &vop_unlock_desc, (vop_t *) vop_stdunlock }, { &vop_write_desc, (vop_t *) ufsspec_write }, { NULL, NULL } }; static struct vnodeopv_desc ufs_specop_opv_desc = { &ufs_specop_p, ufs_specop_entries }; static vop_t **ufs_fifoop_p; static struct vnodeopv_entry_desc ufs_fifoop_entries[] = { { &vop_default_desc, (vop_t *) fifo_vnoperate }, { &vop_fsync_desc, (vop_t *) ufs_missingop }, { &vop_access_desc, (vop_t *) ufs_access }, { &vop_close_desc, (vop_t *) ufsfifo_close }, { &vop_getattr_desc, (vop_t *) ufs_getattr }, { &vop_inactive_desc, (vop_t *) ufs_inactive }, { &vop_islocked_desc, (vop_t *) vop_stdislocked }, { &vop_lock_desc, (vop_t *) vop_stdlock }, { &vop_print_desc, (vop_t *) ufs_print }, { &vop_read_desc, (vop_t *) ufsfifo_read }, { &vop_reclaim_desc, (vop_t *) ufs_reclaim }, { &vop_setattr_desc, (vop_t *) ufs_setattr }, { &vop_unlock_desc, (vop_t *) vop_stdunlock }, { &vop_write_desc, (vop_t *) ufsfifo_write }, { NULL, NULL } }; static struct vnodeopv_desc ufs_fifoop_opv_desc = { &ufs_fifoop_p, ufs_fifoop_entries }; VNODEOP_SET(ufs_vnodeop_opv_desc); 
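/*
 * Illustrative aside, not part of the UFS sources: a minimal userland
 * sketch (the names demo_*, op_table are invented for this sketch) of
 * the table-driven dispatch that the vnodeopv tables above express --
 * each operation descriptor maps to a handler, and anything without an
 * explicit entry falls back to the default operation, much as
 * ufs_vnoperate() below resolves a vop through VOCALL().  Kept under
 * "#if 0" so it is never compiled into the kernel.
 */
#if 0
#include <stdio.h>

enum demo_op { DEMO_OPEN, DEMO_READ, DEMO_CLOSE, DEMO_NOPS };

static int
demo_default(const char *name)
{
	printf("default handler: %s\n", name);
	return (0);
}

static int
demo_read(const char *name)
{
	printf("read handler: %s\n", name);
	return (0);
}

/* One slot per operation; empty slots fall back to the default handler. */
static int (*op_table[DEMO_NOPS])(const char *) = {
	[DEMO_READ] = demo_read,
};

static int
demo_vocall(enum demo_op op, const char *name)
{
	int (*fn)(const char *);

	fn = op_table[op] != NULL ? op_table[op] : demo_default;
	return ((*fn)(name));
}

int
main(void)
{
	demo_vocall(DEMO_OPEN, "somefile");	/* no entry -> demo_default */
	demo_vocall(DEMO_READ, "somefile");	/* entry -> demo_read */
	return (0);
}
#endif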
VNODEOP_SET(ufs_specop_opv_desc); VNODEOP_SET(ufs_fifoop_opv_desc); int ufs_vnoperate(ap) struct vop_generic_args /* { struct vnodeop_desc *a_desc; } */ *ap; { return (VOCALL(ufs_vnodeop_p, ap->a_desc->vdesc_offset, ap)); } int ufs_vnoperatefifo(ap) struct vop_generic_args /* { struct vnodeop_desc *a_desc; } */ *ap; { return (VOCALL(ufs_fifoop_p, ap->a_desc->vdesc_offset, ap)); } int ufs_vnoperatespec(ap) struct vop_generic_args /* { struct vnodeop_desc *a_desc; } */ *ap; { return (VOCALL(ufs_specop_p, ap->a_desc->vdesc_offset, ap)); } Index: head/sys/vm/vm_mmap.c =================================================================== --- head/sys/vm/vm_mmap.c (revision 49534) +++ head/sys/vm/vm_mmap.c (revision 49535) @@ -1,1093 +1,1091 @@ /* * Copyright (c) 1988 University of Utah. * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$ * * @(#)vm_mmap.c 8.4 (Berkeley) 1/12/94 - * $Id: vm_mmap.c,v 1.99 1999/05/17 00:53:56 alc Exp $ + * $Id: vm_mmap.c,v 1.100 1999/06/05 18:21:53 alc Exp $ */ /* * Mapped file (mmap) interface to VM */ #include "opt_compat.h" #include "opt_rlimit.h" #include #include #include #include #include #include #include #include #include #include #include #include - -#include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef _SYS_SYSPROTO_H_ struct sbrk_args { int incr; }; #endif /* ARGSUSED */ int sbrk(p, uap) struct proc *p; struct sbrk_args *uap; { /* Not yet implemented */ return (EOPNOTSUPP); } #ifndef _SYS_SYSPROTO_H_ struct sstk_args { int incr; }; #endif /* ARGSUSED */ int sstk(p, uap) struct proc *p; struct sstk_args *uap; { /* Not yet implemented */ return (EOPNOTSUPP); } #if defined(COMPAT_43) || defined(COMPAT_SUNOS) #ifndef _SYS_SYSPROTO_H_ struct getpagesize_args { int dummy; }; #endif /* ARGSUSED */ int ogetpagesize(p, uap) struct proc *p; struct getpagesize_args *uap; { p->p_retval[0] = PAGE_SIZE; return (0); } #endif /* COMPAT_43 || COMPAT_SUNOS */ /* * Memory Map (mmap) system call. Note that the file offset * and address are allowed to be NOT page aligned, though if * the MAP_FIXED flag it set, both must have the same remainder * modulo the PAGE_SIZE (POSIX 1003.1b). If the address is not * page-aligned, the actual mapping starts at trunc_page(addr) * and the return value is adjusted up by the page offset. */ #ifndef _SYS_SYSPROTO_H_ struct mmap_args { void *addr; size_t len; int prot; int flags; int fd; long pad; off_t pos; }; #endif int mmap(p, uap) struct proc *p; register struct mmap_args *uap; { register struct filedesc *fdp = p->p_fd; register struct file *fp; struct vnode *vp; vm_offset_t addr; vm_size_t size, pageoff; vm_prot_t prot, maxprot; void *handle; int flags, error; int disablexworkaround; off_t pos; addr = (vm_offset_t) uap->addr; size = uap->len; prot = uap->prot & VM_PROT_ALL; flags = uap->flags; pos = uap->pos; /* make sure mapping fits into numeric range etc */ if ((ssize_t) uap->len < 0 || ((flags & MAP_ANON) && uap->fd != -1)) return (EINVAL); if (flags & MAP_STACK) { if ((uap->fd != -1) || ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE))) return (EINVAL); flags |= MAP_ANON; pos = 0; } /* * Align the file position to a page boundary, * and save its page offset component. */ pageoff = (pos & PAGE_MASK); pos -= pageoff; /* Adjust size for rounding (on both ends). */ size += pageoff; /* low end... */ size = (vm_size_t) round_page(size); /* hi end */ /* * Check for illegal addresses. Watch out for address wrap... Note * that VM_*_ADDRESS are not constants due to casts (argh). */ if (flags & MAP_FIXED) { /* * The specified address must have the same remainder * as the file offset taken modulo PAGE_SIZE, so it * should be aligned after adjustment by pageoff. */ addr -= pageoff; if (addr & PAGE_MASK) return (EINVAL); /* Address range must be all in user VM space. */ if (VM_MAXUSER_ADDRESS > 0 && addr + size > VM_MAXUSER_ADDRESS) return (EINVAL); #ifndef i386 if (VM_MIN_ADDRESS > 0 && addr < VM_MIN_ADDRESS) return (EINVAL); #endif if (addr + size < addr) return (EINVAL); } /* * XXX for non-fixed mappings where no hint is provided or * the hint would fall in the potential heap space, * place it after the end of the largest possible heap. * * There should really be a pmap call to determine a reasonable * location. 
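 * (Illustrative consequence: a NULL hint is bumped to the first page
 * past vm_daddr + MAXDSIZ, i.e. just beyond where the data segment
 * could ever grow; the exact address depends on the executable.)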
*/ else if (addr == 0 || (addr >= round_page((vm_offset_t)p->p_vmspace->vm_taddr) && addr < round_page((vm_offset_t)p->p_vmspace->vm_daddr + MAXDSIZ))) addr = round_page((vm_offset_t)p->p_vmspace->vm_daddr + MAXDSIZ); if (flags & MAP_ANON) { /* * Mapping blank space is trivial. */ handle = NULL; maxprot = VM_PROT_ALL; pos = 0; } else { /* * Mapping file, get fp for validation. Obtain vnode and make * sure it is of appropriate type. */ if (((unsigned) uap->fd) >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[uap->fd]) == NULL) return (EBADF); if (fp->f_type != DTYPE_VNODE) return (EINVAL); vp = (struct vnode *) fp->f_data; if (vp->v_type != VREG && vp->v_type != VCHR) return (EINVAL); /* * XXX hack to handle use of /dev/zero to map anon memory (ala * SunOS). */ if (vp->v_type == VCHR && iszerodev(vp->v_rdev)) { handle = NULL; maxprot = VM_PROT_ALL; flags |= MAP_ANON; pos = 0; } else { /* * cdevs does not provide private mappings of any kind. */ /* * However, for XIG X server to continue to work, * we should allow the superuser to do it anyway. * We only allow it at securelevel < 1. * (Because the XIG X server writes directly to video * memory via /dev/mem, it should never work at any * other securelevel. * XXX this will have to go */ if (securelevel >= 1) disablexworkaround = 1; else disablexworkaround = suser(p); if (vp->v_type == VCHR && disablexworkaround && (flags & (MAP_PRIVATE|MAP_COPY))) return (EINVAL); /* * Ensure that file and memory protections are * compatible. Note that we only worry about * writability if mapping is shared; in this case, * current and max prot are dictated by the open file. * XXX use the vnode instead? Problem is: what * credentials do we use for determination? What if * proc does a setuid? */ maxprot = VM_PROT_EXECUTE; /* ??? */ if (fp->f_flag & FREAD) maxprot |= VM_PROT_READ; else if (prot & PROT_READ) return (EACCES); /* * If we are sharing potential changes (either via * MAP_SHARED or via the implicit sharing of character * device mappings), and we are trying to get write * permission although we opened it without asking * for it, bail out. Check for superuser, only if * we're at securelevel < 1, to allow the XIG X server * to continue to work. 
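 * (Concretely: a descriptor opened read-only and mapped MAP_SHARED ends
 * up with a maxprot lacking VM_PROT_WRITE, and an explicit PROT_WRITE
 * request on it is refused with EACCES.)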
*/ if ((flags & MAP_SHARED) != 0 || (vp->v_type == VCHR && disablexworkaround)) { if ((fp->f_flag & FWRITE) != 0) { struct vattr va; if ((error = VOP_GETATTR(vp, &va, p->p_ucred, p))) return (error); if ((va.va_flags & (IMMUTABLE|APPEND)) == 0) maxprot |= VM_PROT_WRITE; else if (prot & PROT_WRITE) return (EPERM); } else if ((prot & PROT_WRITE) != 0) return (EACCES); } else maxprot |= VM_PROT_WRITE; handle = (void *)vp; } } error = vm_mmap(&p->p_vmspace->vm_map, &addr, size, prot, maxprot, flags, handle, pos); if (error == 0) p->p_retval[0] = (register_t) (addr + pageoff); return (error); } #ifdef COMPAT_43 #ifndef _SYS_SYSPROTO_H_ struct ommap_args { caddr_t addr; int len; int prot; int flags; int fd; long pos; }; #endif int ommap(p, uap) struct proc *p; register struct ommap_args *uap; { struct mmap_args nargs; static const char cvtbsdprot[8] = { 0, PROT_EXEC, PROT_WRITE, PROT_EXEC | PROT_WRITE, PROT_READ, PROT_EXEC | PROT_READ, PROT_WRITE | PROT_READ, PROT_EXEC | PROT_WRITE | PROT_READ, }; #define OMAP_ANON 0x0002 #define OMAP_COPY 0x0020 #define OMAP_SHARED 0x0010 #define OMAP_FIXED 0x0100 #define OMAP_INHERIT 0x0800 nargs.addr = uap->addr; nargs.len = uap->len; nargs.prot = cvtbsdprot[uap->prot & 0x7]; nargs.flags = 0; if (uap->flags & OMAP_ANON) nargs.flags |= MAP_ANON; if (uap->flags & OMAP_COPY) nargs.flags |= MAP_COPY; if (uap->flags & OMAP_SHARED) nargs.flags |= MAP_SHARED; else nargs.flags |= MAP_PRIVATE; if (uap->flags & OMAP_FIXED) nargs.flags |= MAP_FIXED; if (uap->flags & OMAP_INHERIT) nargs.flags |= MAP_INHERIT; nargs.fd = uap->fd; nargs.pos = uap->pos; return (mmap(p, &nargs)); } #endif /* COMPAT_43 */ #ifndef _SYS_SYSPROTO_H_ struct msync_args { void *addr; int len; int flags; }; #endif int msync(p, uap) struct proc *p; struct msync_args *uap; { vm_offset_t addr; vm_size_t size, pageoff; int flags; vm_map_t map; int rv; addr = (vm_offset_t) uap->addr; size = uap->len; flags = uap->flags; pageoff = (addr & PAGE_MASK); addr -= pageoff; size += pageoff; size = (vm_size_t) round_page(size); if (addr + size < addr) return(EINVAL); if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE)) return (EINVAL); map = &p->p_vmspace->vm_map; /* * XXX Gak! If size is zero we are supposed to sync "all modified * pages with the region containing addr". Unfortunately, we don't * really keep track of individual mmaps so we approximate by flushing * the range of the map entry containing addr. This can be incorrect * if the region splits or is coalesced with a neighbor. */ if (size == 0) { vm_map_entry_t entry; vm_map_lock_read(map); rv = vm_map_lookup_entry(map, addr, &entry); vm_map_unlock_read(map); if (rv == FALSE) return (EINVAL); addr = entry->start; size = entry->end - entry->start; } /* * Clean the pages and interpret the return value. */ rv = vm_map_clean(map, addr, addr + size, (flags & MS_ASYNC) == 0, (flags & MS_INVALIDATE) != 0); switch (rv) { case KERN_SUCCESS: break; case KERN_INVALID_ADDRESS: return (EINVAL); /* Sun returns ENOMEM? */ case KERN_FAILURE: return (EIO); default: return (EINVAL); } return (0); } #ifndef _SYS_SYSPROTO_H_ struct munmap_args { void *addr; size_t len; }; #endif int munmap(p, uap) register struct proc *p; register struct munmap_args *uap; { vm_offset_t addr; vm_size_t size, pageoff; vm_map_t map; addr = (vm_offset_t) uap->addr; size = uap->len; pageoff = (addr & PAGE_MASK); addr -= pageoff; size += pageoff; size = (vm_size_t) round_page(size); if (addr + size < addr) return(EINVAL); if (size == 0) return (0); /* * Check for illegal addresses. 
Watch out for address wrap... Note * that VM_*_ADDRESS are not constants due to casts (argh). */ if (VM_MAXUSER_ADDRESS > 0 && addr + size > VM_MAXUSER_ADDRESS) return (EINVAL); #ifndef i386 if (VM_MIN_ADDRESS > 0 && addr < VM_MIN_ADDRESS) return (EINVAL); #endif map = &p->p_vmspace->vm_map; /* * Make sure entire range is allocated. */ if (!vm_map_check_protection(map, addr, addr + size, VM_PROT_NONE)) return (EINVAL); /* returns nothing but KERN_SUCCESS anyway */ (void) vm_map_remove(map, addr, addr + size); return (0); } void munmapfd(p, fd) struct proc *p; int fd; { /* * XXX should unmap any regions mapped to this file */ p->p_fd->fd_ofileflags[fd] &= ~UF_MAPPED; } #ifndef _SYS_SYSPROTO_H_ struct mprotect_args { const void *addr; size_t len; int prot; }; #endif int mprotect(p, uap) struct proc *p; struct mprotect_args *uap; { vm_offset_t addr; vm_size_t size, pageoff; register vm_prot_t prot; addr = (vm_offset_t) uap->addr; size = uap->len; prot = uap->prot & VM_PROT_ALL; #if defined(VM_PROT_READ_IS_EXEC) if (prot & VM_PROT_READ) prot |= VM_PROT_EXECUTE; #endif pageoff = (addr & PAGE_MASK); addr -= pageoff; size += pageoff; size = (vm_size_t) round_page(size); if (addr + size < addr) return(EINVAL); switch (vm_map_protect(&p->p_vmspace->vm_map, addr, addr + size, prot, FALSE)) { case KERN_SUCCESS: return (0); case KERN_PROTECTION_FAILURE: return (EACCES); } return (EINVAL); } #ifndef _SYS_SYSPROTO_H_ struct minherit_args { void *addr; size_t len; int inherit; }; #endif int minherit(p, uap) struct proc *p; struct minherit_args *uap; { vm_offset_t addr; vm_size_t size, pageoff; register vm_inherit_t inherit; addr = (vm_offset_t)uap->addr; size = uap->len; inherit = uap->inherit; pageoff = (addr & PAGE_MASK); addr -= pageoff; size += pageoff; size = (vm_size_t) round_page(size); if (addr + size < addr) return(EINVAL); switch (vm_map_inherit(&p->p_vmspace->vm_map, addr, addr+size, inherit)) { case KERN_SUCCESS: return (0); case KERN_PROTECTION_FAILURE: return (EACCES); } return (EINVAL); } #ifndef _SYS_SYSPROTO_H_ struct madvise_args { void *addr; size_t len; int behav; }; #endif /* ARGSUSED */ int madvise(p, uap) struct proc *p; struct madvise_args *uap; { vm_offset_t start, end; /* * Check for illegal addresses. Watch out for address wrap... Note * that VM_*_ADDRESS are not constants due to casts (argh). */ if (VM_MAXUSER_ADDRESS > 0 && ((vm_offset_t) uap->addr + uap->len) > VM_MAXUSER_ADDRESS) return (EINVAL); #ifndef i386 if (VM_MIN_ADDRESS > 0 && uap->addr < VM_MIN_ADDRESS) return (EINVAL); #endif if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr) return (EINVAL); /* * Since this routine is only advisory, we default to conservative * behavior. */ start = trunc_page((vm_offset_t) uap->addr); end = round_page((vm_offset_t) uap->addr + uap->len); vm_map_madvise(&p->p_vmspace->vm_map, start, end, uap->behav); return (0); } #ifndef _SYS_SYSPROTO_H_ struct mincore_args { const void *addr; size_t len; char *vec; }; #endif /* ARGSUSED */ int mincore(p, uap) struct proc *p; struct mincore_args *uap; { vm_offset_t addr, first_addr; vm_offset_t end, cend; pmap_t pmap; vm_map_t map; char *vec; int error; int vecindex, lastvecindex; register vm_map_entry_t current; vm_map_entry_t entry; int mincoreinfo; unsigned int timestamp; /* * Make sure that the addresses presented are valid for user * mode. 
*/ first_addr = addr = trunc_page((vm_offset_t) uap->addr); end = addr + (vm_size_t)round_page(uap->len); if (VM_MAXUSER_ADDRESS > 0 && end > VM_MAXUSER_ADDRESS) return (EINVAL); if (end < addr) return (EINVAL); /* * Address of byte vector */ vec = uap->vec; map = &p->p_vmspace->vm_map; pmap = vmspace_pmap(p->p_vmspace); vm_map_lock_read(map); RestartScan: timestamp = map->timestamp; if (!vm_map_lookup_entry(map, addr, &entry)) entry = entry->next; /* * Do this on a map entry basis so that if the pages are not * in the current processes address space, we can easily look * up the pages elsewhere. */ lastvecindex = -1; for(current = entry; (current != &map->header) && (current->start < end); current = current->next) { /* * ignore submaps (for now) or null objects */ if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) || current->object.vm_object == NULL) continue; /* * limit this scan to the current map entry and the * limits for the mincore call */ if (addr < current->start) addr = current->start; cend = current->end; if (cend > end) cend = end; /* * scan this entry one page at a time */ while(addr < cend) { /* * Check pmap first, it is likely faster, also * it can provide info as to whether we are the * one referencing or modifying the page. */ mincoreinfo = pmap_mincore(pmap, addr); if (!mincoreinfo) { vm_pindex_t pindex; vm_ooffset_t offset; vm_page_t m; /* * calculate the page index into the object */ offset = current->offset + (addr - current->start); pindex = OFF_TO_IDX(offset); m = vm_page_lookup(current->object.vm_object, pindex); /* * if the page is resident, then gather information about * it. */ if (m) { mincoreinfo = MINCORE_INCORE; if (m->dirty || pmap_is_modified(VM_PAGE_TO_PHYS(m))) mincoreinfo |= MINCORE_MODIFIED_OTHER; if ((m->flags & PG_REFERENCED) || pmap_ts_referenced(VM_PAGE_TO_PHYS(m))) { vm_page_flag_set(m, PG_REFERENCED); mincoreinfo |= MINCORE_REFERENCED_OTHER; } } } /* * subyte may page fault. In case it needs to modify * the map, we release the lock. */ vm_map_unlock_read(map); /* * calculate index into user supplied byte vector */ vecindex = OFF_TO_IDX(addr - first_addr); /* * If we have skipped map entries, we need to make sure that * the byte vector is zeroed for those skipped entries. */ while((lastvecindex + 1) < vecindex) { error = subyte( vec + lastvecindex, 0); if (error) { return (EFAULT); } ++lastvecindex; } /* * Pass the page information to the user */ error = subyte( vec + vecindex, mincoreinfo); if (error) { return (EFAULT); } /* * If the map has changed, due to the subyte, the previous * output may be invalid. */ vm_map_lock_read(map); if (timestamp != map->timestamp) goto RestartScan; lastvecindex = vecindex; addr += PAGE_SIZE; } } /* * subyte may page fault. In case it needs to modify * the map, we release the lock. */ vm_map_unlock_read(map); /* * Zero the last entries in the byte vector. */ vecindex = OFF_TO_IDX(end - first_addr); while((lastvecindex + 1) < vecindex) { error = subyte( vec + lastvecindex, 0); if (error) { return (EFAULT); } ++lastvecindex; } /* * If the map has changed, due to the subyte, the previous * output may be invalid. 
*/ vm_map_lock_read(map); if (timestamp != map->timestamp) goto RestartScan; vm_map_unlock_read(map); return (0); } #ifndef _SYS_SYSPROTO_H_ struct mlock_args { const void *addr; size_t len; }; #endif int mlock(p, uap) struct proc *p; struct mlock_args *uap; { vm_offset_t addr; vm_size_t size, pageoff; int error; addr = (vm_offset_t) uap->addr; size = uap->len; pageoff = (addr & PAGE_MASK); addr -= pageoff; size += pageoff; size = (vm_size_t) round_page(size); /* disable wrap around */ if (addr + size < addr) return (EINVAL); if (atop(size) + cnt.v_wire_count > vm_page_max_wired) return (EAGAIN); #ifdef pmap_wired_count if (size + ptoa(pmap_wired_count(vm_map_pmap(&p->p_vmspace->vm_map))) > p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur) return (ENOMEM); #else error = suser(p); if (error) return (error); #endif error = vm_map_user_pageable(&p->p_vmspace->vm_map, addr, addr + size, FALSE); return (error == KERN_SUCCESS ? 0 : ENOMEM); } #ifndef _SYS_SYSPROTO_H_ struct mlockall_args { int how; }; #endif int mlockall(p, uap) struct proc *p; struct mlockall_args *uap; { return 0; } #ifndef _SYS_SYSPROTO_H_ struct mlockall_args { int how; }; #endif int munlockall(p, uap) struct proc *p; struct munlockall_args *uap; { return 0; } #ifndef _SYS_SYSPROTO_H_ struct munlock_args { const void *addr; size_t len; }; #endif int munlock(p, uap) struct proc *p; struct munlock_args *uap; { vm_offset_t addr; vm_size_t size, pageoff; int error; addr = (vm_offset_t) uap->addr; size = uap->len; pageoff = (addr & PAGE_MASK); addr -= pageoff; size += pageoff; size = (vm_size_t) round_page(size); /* disable wrap around */ if (addr + size < addr) return (EINVAL); #ifndef pmap_wired_count error = suser(p); if (error) return (error); #endif error = vm_map_user_pageable(&p->p_vmspace->vm_map, addr, addr + size, TRUE); return (error == KERN_SUCCESS ? 0 : ENOMEM); } /* * Internal version of mmap. * Currently used by mmap, exec, and sys5 shared memory. * Handle is either a vnode pointer or NULL for MAP_ANON. */ int vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, vm_prot_t maxprot, int flags, void *handle, vm_ooffset_t foff) { boolean_t fitit; vm_object_t object; struct vnode *vp = NULL; objtype_t type; int rv = KERN_SUCCESS; vm_ooffset_t objsize; int docow; struct proc *p = curproc; if (size == 0) return (0); objsize = size = round_page(size); /* * We currently can only deal with page aligned file offsets. * The check is here rather than in the syscall because the * kernel calls this function internally for other mmaping * operations (such as in exec) and non-aligned offsets will * cause pmap inconsistencies...so we want to be sure to * disallow this in all cases. */ if (foff & PAGE_MASK) return (EINVAL); if ((flags & MAP_FIXED) == 0) { fitit = TRUE; *addr = round_page(*addr); } else { if (*addr != trunc_page(*addr)) return (EINVAL); fitit = FALSE; (void) vm_map_remove(map, *addr, *addr + size); } /* * Lookup/allocate object. */ if (flags & MAP_ANON) { type = OBJT_DEFAULT; /* * Unnamed anonymous regions always start at 0. */ if (handle == 0) foff = 0; } else { vp = (struct vnode *) handle; if (vp->v_type == VCHR) { type = OBJT_DEVICE; handle = (void *)(intptr_t)vp->v_rdev; } else { struct vattr vat; int error; error = VOP_GETATTR(vp, &vat, p->p_ucred, p); if (error) return (error); objsize = round_page(vat.va_size); type = OBJT_VNODE; } } if (handle == NULL) { object = NULL; docow = 0; } else { object = vm_pager_allocate(type, handle, objsize, prot, foff); if (object == NULL) return (type == OBJT_DEVICE ? 
			    EINVAL : ENOMEM);
		docow = MAP_PREFAULT_PARTIAL;
	}

	/*
	 * Force device mappings to be shared.
	 */
	if (type == OBJT_DEVICE) {
		flags &= ~(MAP_PRIVATE|MAP_COPY);
		flags |= MAP_SHARED;
	}

	if ((flags & (MAP_ANON|MAP_SHARED)) == 0) {
		docow |= MAP_COPY_ON_WRITE;
	}

#if defined(VM_PROT_READ_IS_EXEC)
	if (prot & VM_PROT_READ)
		prot |= VM_PROT_EXECUTE;

	if (maxprot & VM_PROT_READ)
		maxprot |= VM_PROT_EXECUTE;
#endif

	if (fitit) {
		*addr = pmap_addr_hint(object, *addr, size);
	}

	if (flags & MAP_STACK)
		rv = vm_map_stack (map, *addr, size, prot,
				   maxprot, docow);
	else
		rv = vm_map_find(map, object, foff, addr, size, fitit,
				 prot, maxprot, docow);

	if (rv != KERN_SUCCESS) {
		/*
		 * Lose the object reference. Will destroy the
		 * object if it's an unnamed anonymous mapping
		 * or named anonymous without other references.
		 */
		vm_object_deallocate(object);
		goto out;
	}

	/*
	 * Shared memory is also shared with children.
	 */
	if (flags & (MAP_SHARED|MAP_INHERIT)) {
		rv = vm_map_inherit(map, *addr, *addr + size, VM_INHERIT_SHARE);
		if (rv != KERN_SUCCESS) {
			(void) vm_map_remove(map, *addr, *addr + size);
			goto out;
		}
	}
out:
	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
	case KERN_NO_SPACE:
		return (ENOMEM);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	default:
		return (EINVAL);
	}
}
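A hypothetical userland sketch, not part of this change, showing the consumer side of the two paths above: mmap() ends up in vm_mmap(), and mincore() fills one status byte per page via the subyte() loop shown earlier. The MINCORE_INCORE test and the char-typed vector follow the FreeBSD <sys/mman.h> definitions; the buffer sizes are arbitrary.

/*
 * Hypothetical example (assumptions: FreeBSD-style mincore() taking a
 * char *vec, MINCORE_INCORE from <sys/mman.h>).  Maps four anonymous
 * pages, touches the first, and reports which pages are resident.
 */
#include <sys/types.h>
#include <sys/mman.h>
#include <err.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int
main(void)
{
	size_t pagesz = (size_t)sysconf(_SC_PAGESIZE);
	size_t len = 4 * pagesz;
	char *p, *vec;
	size_t i;

	/* Anonymous private mapping; pages become resident only when touched. */
	p = mmap(NULL, len, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_PRIVATE, -1, 0);
	if (p == MAP_FAILED)
		err(1, "mmap");

	p[0] = 1;		/* touch page 0 only */

	vec = malloc(len / pagesz);
	if (vec == NULL)
		err(1, "malloc");

	/* One status byte per page, filled in by the kernel loop above. */
	if (mincore(p, len, vec) == -1)
		err(1, "mincore");

	for (i = 0; i < len / pagesz; i++)
		printf("page %lu: %s\n", (unsigned long)i,
		    (vec[i] & MINCORE_INCORE) ? "resident" : "not resident");

	free(vec);
	munmap(p, len);
	return (0);
}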
Index: head/sys/vm/vm_swap.c
===================================================================
--- head/sys/vm/vm_swap.c	(revision 49534)
+++ head/sys/vm/vm_swap.c	(revision 49535)
@@ -1,388 +1,386 @@
/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vm_swap.c	8.5 (Berkeley) 2/17/94
- * $Id: vm_swap.c,v 1.78 1999/07/17 19:59:55 phk Exp $
+ * $Id: vm_swap.c,v 1.79 1999/07/20 21:29:11 green Exp $
 */

#include "opt_devfs.h"
#include "opt_swap.h"
#include
#include
#include
#include
-#include
#ifdef DEVFS
#include
#endif
#include
#include
#include	/* XXX */
#include
#include
#include
#include
#include
+#include
#include
#include
#include
-
-#include

/*
 * "sw" is a fake device implemented
 * in vm_swap.c and used only internally to get to swstrategy.
 * It cannot be provided to the users, because the
 * swstrategy routine munches the b_dev and b_blkno entries
 * before calling the appropriate driver. This would horribly
 * confuse, e.g. the hashing routines. Instead, /dev/drum is
 * provided as a character (raw) device.
 */

static d_strategy_t swstrategy;

#define CDEV_MAJOR 4
#define BDEV_MAJOR 26

static struct cdevsw sw_cdevsw = {
	/* open */	nullopen,
	/* close */	nullclose,
	/* read */	physread,
	/* write */	physwrite,
	/* ioctl */	noioctl,
	/* stop */	nostop,
	/* reset */	noreset,
	/* devtotty */	nodevtotty,
	/* poll */	nopoll,
	/* mmap */	nommap,
	/* strategy */	swstrategy,
	/* name */	"sw",
	/* parms */	noparms,
	/* maj */	CDEV_MAJOR,
	/* dump */	nodump,
	/* psize */	nopsize,
	/* flags */	0,
	/* maxio */	0,
	/* bmaj */	BDEV_MAJOR
};

/*
 * Indirect driver for multi-controller paging.
 */

#ifndef NSWAPDEV
#define NSWAPDEV	4
#endif
static struct swdevt should_be_malloced[NSWAPDEV];
static struct swdevt *swdevt = should_be_malloced;
struct vnode *swapdev_vp;
static int nswap;		/* first block after the interleaved devs */
static int nswdev = NSWAPDEV;
int vm_swap_size;

/*
 *	swstrategy:
 *
 *	Perform swap strategy interleave device selection
 *
 *	The bp is expected to be locked and *not* B_DONE on call.
 */

static void
swstrategy(bp)
	register struct buf *bp;
{
	int s, sz, off, seg, index;
	register struct swdevt *sp;
	struct vnode *vp;

	sz = howmany(bp->b_bcount, PAGE_SIZE);

	/*
	 * Convert interleaved swap into per-device swap.  Note that
	 * the block size is left in PAGE_SIZE'd chunks (for the newswap)
	 * here.
	 */
	if (nswdev > 1) {
		off = bp->b_blkno % dmmax;
		if (off + sz > dmmax) {
			bp->b_error = EINVAL;
			bp->b_flags |= B_ERROR;
			biodone(bp);
			return;
		}
		seg = bp->b_blkno / dmmax;
		index = seg % nswdev;
		seg /= nswdev;
		bp->b_blkno = seg * dmmax + off;
	} else {
		index = 0;
	}
	sp = &swdevt[index];
	if (bp->b_blkno + sz > sp->sw_nblks) {
		bp->b_error = EINVAL;
		bp->b_flags |= B_ERROR;
		biodone(bp);
		return;
	}
	bp->b_dev = sp->sw_device;
	if (sp->sw_vp == NULL) {
		bp->b_error = ENODEV;
		bp->b_flags |= B_ERROR;
		biodone(bp);
		return;
	}

	/*
	 * Convert from PAGE_SIZE'd to DEV_BSIZE'd chunks for the actual I/O
	 */
	bp->b_blkno = ctodb(bp->b_blkno);

	vhold(sp->sw_vp);
	s = splvm();
	if ((bp->b_flags & B_READ) == 0) {
		vp = bp->b_vp;
		if (vp) {
			vp->v_numoutput--;
			if ((vp->v_flag & VBWAIT) && vp->v_numoutput <= 0) {
				vp->v_flag &= ~VBWAIT;
				wakeup(&vp->v_numoutput);
			}
		}
		sp->sw_vp->v_numoutput++;
	}
	pbreassignbuf(bp, sp->sw_vp);
	splx(s);
	VOP_STRATEGY(bp->b_vp, bp);
}
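A hypothetical standalone sketch, not from the file above, pulling the interleave arithmetic of swstrategy() out into a plain function so the mapping from a logical swap block to a (device index, per-device block) pair can be checked in userland. The dmmax and nswdev values are assumed constants chosen only for illustration.

/*
 * Hypothetical sketch (assumptions: DMMAX and NSWDEV are stand-ins for the
 * kernel's dmmax and nswdev).  Mirrors the nswdev > 1 branch of swstrategy().
 */
#include <stdio.h>

#define DMMAX	1024	/* pages per interleave stripe (assumed) */
#define NSWDEV	4	/* number of interleaved swap devices (assumed) */

static void
swap_interleave(long blkno, int *index, long *devblk)
{
	long off, seg;

	off = blkno % DMMAX;		/* offset within the stripe */
	seg = blkno / DMMAX;		/* stripe number */
	*index = seg % NSWDEV;		/* stripes rotate across the devices */
	seg /= NSWDEV;			/* stripe number on that device */
	*devblk = seg * DMMAX + off;	/* block number on the device */
}

int
main(void)
{
	long blkno, devblk;
	int index;

	/* Walk stripe boundaries: consecutive stripes land on consecutive devices. */
	for (blkno = 0; blkno < 8 * DMMAX; blkno += DMMAX) {
		swap_interleave(blkno, &index, &devblk);
		printf("logical %6ld -> device %d, block %6ld\n",
		    blkno, index, devblk);
	}
	return (0);
}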
/*
 * System call swapon(name) enables swapping on device name,
 * which must be in the swdevsw.  Return EBUSY
 * if already swapping on this device.
 */
#ifndef _SYS_SYSPROTO_H_
struct swapon_args {
	char *name;
};
#endif

/* ARGSUSED */
int
swapon(p, uap)
	struct proc *p;
	struct swapon_args *uap;
{
	register struct vnode *vp;
	dev_t dev;
	struct nameidata nd;
	int error;

	error = suser(p);
	if (error)
		return (error);

	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->name, p);
	error = namei(&nd);
	if (error)
		return (error);

	vp = nd.ni_vp;

	switch (vp->v_type) {
	case VBLK:
		dev = vp->v_rdev;
		if (bdevsw(dev) == NULL) {
			error = ENXIO;
			break;
		}
		error = swaponvp(p, vp, dev, 0);
		break;
	case VCHR:
		/*
		 * For now, we disallow swapping to regular files.
		 * It requires logical->physical block translation
		 * support in the swap pager before it will work.
		 */
		error = ENOTBLK;
		break;
#if 0
		error = VOP_GETATTR(vp, &attr, p->p_ucred, p);
		if (!error)
			error = swaponvp(p, vp, NODEV,
					 attr.va_size / DEV_BSIZE);
		break;
#endif
	default:
		error = EINVAL;
		break;
	}

	if (error)
		vrele(vp);

	return (error);
}

/*
 * swaponvp() enables swapping on the given vnode, which occupies one
 * slot in the swap map.  Each of the nswdev devices provides 1/nswdev'th
 * of the swap space, which is laid out with blocks of dmmax pages
 * circularly among the devices.
 *
 * The new swap code uses page-sized blocks.  The old swap code used
 * DEV_BSIZE'd chunks.
 *
 * XXX locking when multiple swapon's run in parallel
 */
int
swaponvp(p, vp, dev, nblks)
	struct proc *p;
	struct vnode *vp;
	dev_t dev;
	u_long nblks;
{
	int index;
	register struct swdevt *sp;
	register swblk_t vsbase;
	register long blk;
	swblk_t dvbase;
	int error;

	ASSERT_VOP_UNLOCKED(vp, "swaponvp");
	for (sp = swdevt, index = 0 ; index < nswdev; index++, sp++) {
		if (sp->sw_vp == vp)
			return EBUSY;
		if (!sp->sw_vp)
			goto found;
	}
	return EINVAL;
found:
	(void) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
	error = VOP_OPEN(vp, FREAD | FWRITE, p->p_ucred, p);
	(void) VOP_UNLOCK(vp, 0, p);
	if (error)
		return (error);

	if (nblks == 0 && dev != NODEV && (bdevsw(dev)->d_psize == 0 ||
	    (nblks = (*bdevsw(dev)->d_psize) (dev)) == -1)) {
		(void) VOP_CLOSE(vp, FREAD | FWRITE, p->p_ucred, p);
		return (ENXIO);
	}
	if (nblks == 0) {
		(void) VOP_CLOSE(vp, FREAD | FWRITE, p->p_ucred, p);
		return (ENXIO);
	}

	/*
	 * nblks is in DEV_BSIZE'd chunks, convert to PAGE_SIZE'd chunks.
	 * First chop nblks off to page-align it, then convert.
	 *
	 * sw->sw_nblks is in page-sized chunks now too.
	 */
	nblks &= ~(ctodb(1) - 1);
	nblks = dbtoc(nblks);

	sp->sw_vp = vp;
	sp->sw_dev = dev2budev(dev);
	sp->sw_device = dev;
	sp->sw_flags |= SW_FREED;
	sp->sw_nblks = nblks;

	/*
	 * nblks, nswap, and dmmax are PAGE_SIZE'd parameters now, not
	 * DEV_BSIZE'd.
	 */
	if (nblks * nswdev > nswap)
		nswap = (nblks+1) * nswdev;

	if (swapblist == NULL)
		swapblist = blist_create(nswap);
	else
		blist_resize(&swapblist, nswap, 0);

	for (dvbase = dmmax; dvbase < nblks; dvbase += dmmax) {
		blk = min(nblks - dvbase, dmmax);
		vsbase = index * dmmax + dvbase * nswdev;
		blist_free(swapblist, vsbase, blk);
		vm_swap_size += blk;
	}

	if (!swapdev_vp) {
		struct vnode *vp1;
		struct vnode *nvp;

		error = getnewvnode(VT_NON, (struct mount *) 0,
		    spec_vnodeop_p, &nvp);
		if (error)
			panic("Cannot get vnode for swapdev");
		vp1 = nvp;
		vp1->v_type = VBLK;

		if ((nvp = checkalias(vp1,
		    makeudev(BDEV_MAJOR, 0), (struct mount *) 0))) {
			vput(vp1);
			vp1 = nvp;
		}
		swapdev_vp = vp1;
	}
	return (0);
}
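A hypothetical sketch, not from the file above, spelling out the unit conversion and interleaved placement that swaponvp() performs, with ctodb()/dbtoc() written out for an assumed 4K page and 512-byte sector size. The device size, slot index, and stripe constants are illustrative assumptions only.

/*
 * Hypothetical sketch (assumptions: PAGE_SIZE 4096, DEV_BSIZE 512, and the
 * DMMAX/NSWDEV constants stand in for the kernel's dmmax and nswdev).
 * Converts a device size from disk blocks to pages, then shows which
 * interleaved ranges the blist_free() loop would release.
 */
#include <stdio.h>

#define PAGE_SIZE	4096
#define DEV_BSIZE	512
#define CTODB(pg)	((pg) * (PAGE_SIZE / DEV_BSIZE))  /* pages -> disk blocks */
#define DBTOC(db)	((db) / (PAGE_SIZE / DEV_BSIZE))  /* disk blocks -> pages */
#define DMMAX		1024	/* interleave stripe, in pages (assumed) */
#define NSWDEV		4	/* number of swap devices (assumed) */

int
main(void)
{
	long nblks = 200000;	/* device size in DEV_BSIZE blocks (assumed) */
	int index = 1;		/* swap-map slot this device landed in (assumed) */
	long dvbase, blk, vsbase;

	/* Chop to a page boundary, then convert to page-sized chunks. */
	nblks &= ~(CTODB(1) - 1);
	nblks = DBTOC(nblks);
	printf("device contributes %ld pages of swap\n", nblks);

	/* Each dmmax-sized stripe lands at index*dmmax + dvbase*nswdev. */
	for (dvbase = DMMAX; dvbase < nblks; dvbase += DMMAX) {
		blk = (nblks - dvbase < DMMAX) ? nblks - dvbase : DMMAX;
		vsbase = index * DMMAX + dvbase * NSWDEV;
		printf("free %4ld pages at interleaved block %ld\n",
		    blk, vsbase);
	}
	return (0);
}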
static int sw_devsw_installed;
#ifdef DEVFS
static void *drum_devfs_token;
#endif

static void
sw_drvinit(void *unused)
{

	if( ! sw_devsw_installed ) {
		cdevsw_add(&sw_cdevsw);
		/*
		 * XXX: This is pretty gross, but it will disappear with
		 * the blockdevices RSN.
		 */
		sw_cdevsw.d_open = nullopen;
		sw_cdevsw.d_close = nullclose;
		sw_devsw_installed = 1;
#ifdef DEVFS
		drum_devfs_token = devfs_add_devswf(&sw_cdevsw, 0, DV_CHR,
		    UID_ROOT, GID_KMEM, 0640, "drum");
#endif
	}
}

SYSINIT(swdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR,sw_drvinit,NULL)
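Finally, a hypothetical userland sketch, not part of this change, of the page-rounding that the mlock()/munlock() handlers earlier in this diff apply before calling vm_map_user_pageable(), followed by an actual mlock()/munlock() call. The kernel performs the same rounding itself, so doing it here is only to make the arithmetic visible; failures roughly match the kernel paths shown (EAGAIN when the wired-page ceiling would be exceeded, ENOMEM or a privilege error otherwise).

/*
 * Hypothetical example (assumption: page size from sysconf stands in for
 * the kernel's PAGE_MASK/round_page macros).  Locks and unlocks a small
 * malloc'd buffer after extending the range to page boundaries.
 */
#include <sys/types.h>
#include <sys/mman.h>
#include <err.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int
main(void)
{
	size_t pagesz = (size_t)sysconf(_SC_PAGESIZE);
	uintptr_t addr, pageoff;
	size_t size;
	char *buf;

	buf = malloc(100);
	if (buf == NULL)
		err(1, "malloc");

	/* Mirror the kernel's rounding: pull addr back, push size out. */
	addr = (uintptr_t)buf;
	size = 100;
	pageoff = addr & (pagesz - 1);
	addr -= pageoff;
	size += pageoff;
	size = (size + pagesz - 1) & ~(pagesz - 1);	/* round_page() */
	printf("locking %zu bytes at %p\n", size, (void *)addr);

	if (mlock((void *)addr, size) == -1)
		err(1, "mlock");	/* may fail on wired-page or rlimit checks */
	if (munlock((void *)addr, size) == -1)
		err(1, "munlock");

	free(buf);
	return (0);
}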