Index: head/sys/fs/devfs/devfs_vnops.c =================================================================== --- head/sys/fs/devfs/devfs_vnops.c (revision 137028) +++ head/sys/fs/devfs/devfs_vnops.c (revision 137029) @@ -1,1443 +1,1443 @@ /* * Copyright (c) 2000-2004 * Poul-Henning Kamp. All rights reserved. * Copyright (c) 1989, 1992-1993, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software donated to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kernfs_vnops.c 8.15 (Berkeley) 5/21/95 * From: FreeBSD: src/sys/miscfs/kernfs/kernfs_vnops.c 1.43 * * $FreeBSD$ */ /* * TODO: * remove empty directories * mknod: hunt down DE_DELETED, compare name, reinstantiate. * mkdir: want it ? */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int devfs_access(struct vop_access_args *ap); static int devfs_advlock(struct vop_advlock_args *ap); static int devfs_close(struct vop_close_args *ap); static int devfs_fsync(struct vop_fsync_args *ap); static int devfs_getattr(struct vop_getattr_args *ap); static int devfs_ioctl(struct vop_ioctl_args *ap); static int devfs_kqfilter(struct vop_kqfilter_args *ap); static int devfs_lookupx(struct vop_lookup_args *ap); static int devfs_mknod(struct vop_mknod_args *ap); static int devfs_open(struct vop_open_args *ap); static int devfs_pathconf(struct vop_pathconf_args *ap); static int devfs_poll(struct vop_poll_args *ap); static int devfs_print(struct vop_print_args *ap); static int devfs_read(struct vop_read_args *ap); static int devfs_readdir(struct vop_readdir_args *ap); static int devfs_readlink(struct vop_readlink_args *ap); static int devfs_reclaim(struct vop_reclaim_args *ap); static int devfs_remove(struct vop_remove_args *ap); static int devfs_revoke(struct vop_revoke_args *ap); static int devfs_rioctl(struct vop_ioctl_args *ap); static int devfs_rread(struct vop_read_args *ap); static int devfs_setattr(struct vop_setattr_args *ap); #ifdef MAC static int devfs_setlabel(struct vop_setlabel_args *ap); #endif static int devfs_specstrategy(struct vop_specstrategy_args *); static int devfs_symlink(struct vop_symlink_args *ap); static int devfs_write(struct vop_write_args *ap); static vop_t **devfs_vnodeop_p; vop_t **devfs_specop_p; /* * Construct the fully qualified path name relative to the mountpoint */ static char * devfs_fqpn(char *buf, struct vnode *dvp, struct componentname *cnp) { int i; struct devfs_dirent *de, *dd; struct devfs_mount *dmp; dmp = VFSTODEVFS(dvp->v_mount); dd = dvp->v_data; i = SPECNAMELEN; buf[i] = '\0'; i -= cnp->cn_namelen; if (i < 0) return (NULL); bcopy(cnp->cn_nameptr, buf + i, cnp->cn_namelen); de = dd; while (de != dmp->dm_basedir) { i--; if (i < 0) return (NULL); buf[i] = '/'; i -= de->de_dirent->d_namlen; if (i < 0) return (NULL); bcopy(de->de_dirent->d_name, buf + i, de->de_dirent->d_namlen); de = TAILQ_FIRST(&de->de_dlist); /* "." */ de = TAILQ_NEXT(de, de_list); /* ".." */ de = de->de_dir; } return (buf + i); } int devfs_allocv(struct devfs_dirent *de, struct mount *mp, struct vnode **vpp, struct thread *td) { int error; struct vnode *vp; struct cdev *dev; KASSERT(td == curthread, ("devfs_allocv: td != curthread")); loop: vp = de->de_vnode; if (vp != NULL) { if (vget(vp, LK_EXCLUSIVE, td)) goto loop; *vpp = vp; return (0); } if (de->de_dirent->d_type == DT_CHR) { dev = *devfs_itod(de->de_inode); if (dev == NULL) return (ENOENT); } else { dev = NULL; } error = getnewvnode("devfs", mp, devfs_vnodeop_p, &vp); if (error != 0) { printf("devfs_allocv: failed to allocate new vnode\n"); return (error); } if (de->de_dirent->d_type == DT_CHR) { vp->v_type = VCHR; vp = addaliasu(vp, dev->si_udev); vp->v_op = devfs_specop_p; } else if (de->de_dirent->d_type == DT_DIR) { vp->v_type = VDIR; } else if (de->de_dirent->d_type == DT_LNK) { vp->v_type = VLNK; } else { vp->v_type = VBAD; } vp->v_data = de; de->de_vnode = vp; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); #ifdef MAC mac_associate_vnode_devfs(mp, de, vp); #endif *vpp = vp; return (0); } static int devfs_access(ap) struct vop_access_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; struct devfs_dirent *de; int error; de = vp->v_data; if (vp->v_type == VDIR) de = de->de_dir; error = vaccess(vp->v_type, de->de_mode, de->de_uid, de->de_gid, ap->a_mode, ap->a_cred, NULL); if (!error) return (error); if (error != EACCES) return (error); /* We do, however, allow access to the controlling terminal */ if (!(ap->a_td->td_proc->p_flag & P_CONTROLT)) return (error); if (ap->a_td->td_proc->p_session->s_ttyvp == de->de_vnode) return (0); return (error); } /* * Special device advisory byte-level locks. */ /* ARGSUSED */ static int devfs_advlock(ap) struct vop_advlock_args /* { struct vnode *a_vp; caddr_t a_id; int a_op; struct flock *a_fl; int a_flags; } */ *ap; { return (ap->a_flags & F_FLOCK ? EOPNOTSUPP : EINVAL); } /* * Device close routine */ /* ARGSUSED */ static int devfs_close(ap) struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp, *oldvp; struct thread *td = ap->a_td; struct cdev *dev = vp->v_rdev; struct cdevsw *dsw; int error; /* * Hack: a tty device that is a controlling terminal * has a reference from the session structure. * We cannot easily tell that a character device is * a controlling terminal, unless it is the closing * process' controlling terminal. In that case, * if the reference count is 2 (this last descriptor * plus the session), release the reference from the session. */ /* * This needs to be rewritten to take the vp interlock into * consideration. */ oldvp = NULL; sx_xlock(&proctree_lock); if (td && vp == td->td_proc->p_session->s_ttyvp) { SESS_LOCK(td->td_proc->p_session); VI_LOCK(vp); if (count_dev(dev) == 2 && (vp->v_iflag & VI_XLOCK) == 0) { td->td_proc->p_session->s_ttyvp = NULL; oldvp = vp; } VI_UNLOCK(vp); SESS_UNLOCK(td->td_proc->p_session); } sx_xunlock(&proctree_lock); if (oldvp != NULL) vrele(oldvp); /* * We do not want to really close the device if it * is still in use unless we are trying to close it * forcibly. Since every use (buffer, vnode, swap, cmap) * holds a reference to the vnode, and because we mark * any other vnodes that alias this device, when the * sum of the reference counts on all the aliased * vnodes descends to one, we are on last close. */ dsw = dev_refthread(dev); if (dsw == NULL) return (ENXIO); VI_LOCK(vp); if (vp->v_iflag & VI_XLOCK) { /* Forced close. */ } else if (dsw->d_flags & D_TRACKCLOSE) { /* Keep device updated on status. */ } else if (count_dev(dev) > 1) { VI_UNLOCK(vp); dev_relthread(dev); return (0); } VI_UNLOCK(vp); KASSERT(dev->si_refcount > 0, ("devfs_close() on un-referenced struct cdev *(%s)", devtoname(dev))); if (!(dsw->d_flags & D_NEEDGIANT)) { DROP_GIANT(); error = dsw->d_close(dev, ap->a_fflag, S_IFCHR, td); PICKUP_GIANT(); } else error = dsw->d_close(dev, ap->a_fflag, S_IFCHR, td); dev_relthread(dev); return (error); } /* * Synch buffers associated with a block device */ /* ARGSUSED */ static int devfs_fsync(ap) struct vop_fsync_args /* { struct vnode *a_vp; struct ucred *a_cred; int a_waitfor; struct thread *a_td; } */ *ap; { if (!vn_isdisk(ap->a_vp, NULL)) return (0); return (vop_stdfsync(ap)); } static int devfs_getattr(ap) struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; int error = 0; struct devfs_dirent *de; struct cdev *dev; de = vp->v_data; if (vp->v_type == VDIR) de = de->de_dir; bzero((caddr_t) vap, sizeof(*vap)); vattr_null(vap); vap->va_uid = de->de_uid; vap->va_gid = de->de_gid; vap->va_mode = de->de_mode; if (vp->v_type == VLNK) vap->va_size = strlen(de->de_symlink); else if (vp->v_type == VDIR) vap->va_size = vap->va_bytes = DEV_BSIZE; else vap->va_size = 0; if (vp->v_type != VDIR) vap->va_bytes = 0; vap->va_blocksize = DEV_BSIZE; vap->va_type = vp->v_type; #define fix(aa) \ do { \ if ((aa).tv_sec == 0) { \ (aa).tv_sec = boottime.tv_sec; \ (aa).tv_nsec = boottime.tv_usec * 1000; \ } \ } while (0) if (vp->v_type != VCHR) { fix(de->de_atime); vap->va_atime = de->de_atime; fix(de->de_mtime); vap->va_mtime = de->de_mtime; fix(de->de_ctime); vap->va_ctime = de->de_ctime; } else { dev = vp->v_rdev; fix(dev->si_atime); vap->va_atime = dev->si_atime; fix(dev->si_mtime); vap->va_mtime = dev->si_mtime; fix(dev->si_ctime); vap->va_ctime = dev->si_ctime; vap->va_rdev = dev->si_udev; } vap->va_gen = 0; vap->va_flags = 0; vap->va_nlink = de->de_links; vap->va_fileid = de->de_inode; return (error); } /* * Device ioctl operation. */ /* ARGSUSED */ static int devfs_ioctl(ap) struct vop_ioctl_args /* { struct vnode *a_vp; u_long a_command; caddr_t a_data; int a_fflag; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct cdev *dev; int error; struct cdevsw *dsw; dev = ap->a_vp->v_rdev; dsw = dev_refthread(dev); if (dsw == NULL) return (ENXIO); KASSERT(dev->si_refcount > 0, ("devfs_ioctl() on un-referenced struct cdev *(%s)", devtoname(dev))); if (!(dsw->d_flags & D_NEEDGIANT)) { DROP_GIANT(); error = dsw->d_ioctl(dev, ap->a_command, ap->a_data, ap->a_fflag, ap->a_td); PICKUP_GIANT(); } else error = dsw->d_ioctl(dev, ap->a_command, ap->a_data, ap->a_fflag, ap->a_td); dev_relthread(dev); if (error == ENOIOCTL) error = ENOTTY; return (error); } /* ARGSUSED */ static int devfs_kqfilter(ap) struct vop_kqfilter_args /* { struct vnode *a_vp; struct knote *a_kn; } */ *ap; { struct cdev *dev; struct cdevsw *dsw; int error; dev = ap->a_vp->v_rdev; dsw = dev_refthread(dev); if (dsw == NULL) return(0); KASSERT(dev->si_refcount > 0, ("devfs_kqfilter() on un-referenced struct cdev *(%s)", devtoname(dev))); if (!(dsw->d_flags & D_NEEDGIANT)) { DROP_GIANT(); error = dsw->d_kqfilter(dev, ap->a_kn); PICKUP_GIANT(); } else error = dsw->d_kqfilter(dev, ap->a_kn); dev_relthread(dev); return (error); } static int devfs_lookupx(ap) struct vop_lookup_args /* { struct vnode * a_dvp; struct vnode ** a_vpp; struct componentname * a_cnp; } */ *ap; { struct componentname *cnp; struct vnode *dvp, **vpp; struct thread *td; struct devfs_dirent *de, *dd; struct devfs_dirent **dde; struct devfs_mount *dmp; struct cdev *cdev; int error, flags, nameiop; char specname[SPECNAMELEN + 1], *pname; cnp = ap->a_cnp; vpp = ap->a_vpp; dvp = ap->a_dvp; pname = cnp->cn_nameptr; td = cnp->cn_thread; flags = cnp->cn_flags; nameiop = cnp->cn_nameiop; dmp = VFSTODEVFS(dvp->v_mount); dd = dvp->v_data; *vpp = NULLVP; cnp->cn_flags &= ~PDIRUNLOCK; if ((flags & ISLASTCN) && nameiop == RENAME) return (EOPNOTSUPP); if (dvp->v_type != VDIR) return (ENOTDIR); if ((flags & ISDOTDOT) && (dvp->v_vflag & VV_ROOT)) return (EIO); error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, td); if (error) return (error); if (cnp->cn_namelen == 1 && *pname == '.') { if ((flags & ISLASTCN) && nameiop != LOOKUP) return (EINVAL); *vpp = dvp; VREF(dvp); return (0); } if (flags & ISDOTDOT) { if ((flags & ISLASTCN) && nameiop != LOOKUP) return (EINVAL); VOP_UNLOCK(dvp, 0, td); cnp->cn_flags |= PDIRUNLOCK; de = TAILQ_FIRST(&dd->de_dlist); /* "." */ de = TAILQ_NEXT(de, de_list); /* ".." */ de = de->de_dir; error = devfs_allocv(de, dvp->v_mount, vpp, td); if (error || ((flags & LOCKPARENT) && (flags & ISLASTCN))) { vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, td); cnp->cn_flags &= ~PDIRUNLOCK; } return (error); } devfs_populate(dmp); dd = dvp->v_data; TAILQ_FOREACH(de, &dd->de_dlist, de_list) { if (cnp->cn_namelen != de->de_dirent->d_namlen) continue; if (bcmp(cnp->cn_nameptr, de->de_dirent->d_name, de->de_dirent->d_namlen) != 0) continue; if (de->de_flags & DE_WHITEOUT) goto notfound; goto found; } if (nameiop == DELETE) goto notfound; /* * OK, we didn't have an entry for the name we were asked for * so we try to see if anybody can create it on demand. */ pname = devfs_fqpn(specname, dvp, cnp); if (pname == NULL) goto notfound; cdev = NULL; EVENTHANDLER_INVOKE(dev_clone, pname, strlen(pname), &cdev); if (cdev == NULL) goto notfound; devfs_populate(dmp); dde = devfs_itode(dmp, cdev->si_inode); if (dde == NULL || *dde == NULL || *dde == DE_DELETED) goto notfound; if ((*dde)->de_flags & DE_WHITEOUT) goto notfound; de = *dde; goto found; notfound: if ((nameiop == CREATE || nameiop == RENAME) && (flags & (LOCKPARENT | WANTPARENT)) && (flags & ISLASTCN)) { cnp->cn_flags |= SAVENAME; if (!(flags & LOCKPARENT)) { VOP_UNLOCK(dvp, 0, td); cnp->cn_flags |= PDIRUNLOCK; } return (EJUSTRETURN); } return (ENOENT); found: if ((cnp->cn_nameiop == DELETE) && (flags & ISLASTCN)) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td); if (error) return (error); if (*vpp == dvp) { VREF(dvp); *vpp = dvp; return (0); } error = devfs_allocv(de, dvp->v_mount, vpp, td); if (error) return (error); if (!(flags & LOCKPARENT)) { VOP_UNLOCK(dvp, 0, td); cnp->cn_flags |= PDIRUNLOCK; } return (0); } error = devfs_allocv(de, dvp->v_mount, vpp, td); if (error) return (error); if (!(flags & LOCKPARENT) || !(flags & ISLASTCN)) { VOP_UNLOCK(dvp, 0, td); cnp->cn_flags |= PDIRUNLOCK; } return (0); } static int devfs_lookup(struct vop_lookup_args *ap) { int j; struct devfs_mount *dmp; dmp = VFSTODEVFS(ap->a_dvp->v_mount); lockmgr(&dmp->dm_lock, LK_SHARED, 0, curthread); j = devfs_lookupx(ap); lockmgr(&dmp->dm_lock, LK_RELEASE, 0, curthread); return (j); } static int devfs_mknod(struct vop_mknod_args *ap) /* struct vop_mknod_args { struct vnodeop_desc *a_desc; struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; }; */ { struct componentname *cnp; struct vnode *dvp, **vpp; struct thread *td; struct devfs_dirent *dd, *de; struct devfs_mount *dmp; int error; dvp = ap->a_dvp; dmp = VFSTODEVFS(dvp->v_mount); lockmgr(&dmp->dm_lock, LK_EXCLUSIVE, 0, curthread); cnp = ap->a_cnp; vpp = ap->a_vpp; td = cnp->cn_thread; dd = dvp->v_data; error = ENOENT; TAILQ_FOREACH(de, &dd->de_dlist, de_list) { if (cnp->cn_namelen != de->de_dirent->d_namlen) continue; if (bcmp(cnp->cn_nameptr, de->de_dirent->d_name, de->de_dirent->d_namlen) != 0) continue; if (de->de_flags & DE_WHITEOUT) break; goto notfound; } if (de == NULL) goto notfound; de->de_flags &= ~DE_WHITEOUT; error = devfs_allocv(de, dvp->v_mount, vpp, td); notfound: lockmgr(&dmp->dm_lock, LK_RELEASE, 0, curthread); return (error); } /* * Open a special file. */ /* ARGSUSED */ static int devfs_open(ap) struct vop_open_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct thread *td = ap->a_td; struct vnode *vp = ap->a_vp; struct cdev *dev = vp->v_rdev; int error; struct cdevsw *dsw; if (vp->v_type == VBLK) return (ENXIO); if (dev == NULL) return (ENXIO); /* Make this field valid before any I/O in d_open. */ if (dev->si_iosize_max == 0) dev->si_iosize_max = DFLTPHYS; /* * XXX: Disks get special billing here, but it is mostly wrong. * XXX: Disk partitions can overlap and the real checks should * XXX: take this into account, and consequently they need to * XXX: live in the disk slice code. Some checks do. */ if (vn_isdisk(vp, NULL) && ap->a_cred != FSCRED && (ap->a_mode & FWRITE)) { /* * Never allow opens for write if the disk is mounted R/W. */ if (vp->v_rdev->si_mountpoint != NULL && !(vp->v_rdev->si_mountpoint->mnt_flag & MNT_RDONLY)) return (EBUSY); /* * When running in secure mode, do not allow opens * for writing if the disk is mounted. */ error = securelevel_ge(td->td_ucred, 1); if (error && vfs_mountedon(vp)) return (error); /* * When running in very secure mode, do not allow * opens for writing of any disks. */ error = securelevel_ge(td->td_ucred, 2); if (error) return (error); } dsw = dev_refthread(dev); if (dsw == NULL) return (ENXIO); /* XXX: Special casing of ttys for deadfs. Probably redundant. */ if (dsw->d_flags & D_TTY) vp->v_vflag |= VV_ISTTY; VOP_UNLOCK(vp, 0, td); if(!(dsw->d_flags & D_NEEDGIANT)) { DROP_GIANT(); if (dsw->d_fdopen != NULL) error = dsw->d_fdopen(dev, ap->a_mode, td, ap->a_fdidx); else error = dsw->d_open(dev, ap->a_mode, S_IFCHR, td); PICKUP_GIANT(); } else if (dsw->d_fdopen != NULL) error = dsw->d_fdopen(dev, ap->a_mode, td, ap->a_fdidx); else error = dsw->d_open(dev, ap->a_mode, S_IFCHR, td); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); dev_relthread(dev); if (error) return (error); if (vn_isdisk(vp, NULL)) { if (!dev->si_bsize_phys) dev->si_bsize_phys = DEV_BSIZE; vp->v_bufobj.bo_bsize = dev->si_bsize_phys; } return (error); } static int devfs_pathconf(ap) struct vop_pathconf_args /* { struct vnode *a_vp; int a_name; int *a_retval; } */ *ap; { switch (ap->a_name) { case _PC_NAME_MAX: *ap->a_retval = NAME_MAX; return (0); case _PC_PATH_MAX: *ap->a_retval = PATH_MAX; return (0); case _PC_MAC_PRESENT: #ifdef MAC /* * If MAC is enabled, devfs automatically supports * trivial non-persistant label storage. */ *ap->a_retval = 1; #else *ap->a_retval = 0; #endif return (0); default: return (vop_stdpathconf(ap)); } /* NOTREACHED */ } /* ARGSUSED */ static int devfs_poll(ap) struct vop_poll_args /* { struct vnode *a_vp; int a_events; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct cdev *dev; struct cdevsw *dsw; int error; dev = ap->a_vp->v_rdev; dsw = dev_refthread(dev); if (dsw == NULL) return(0); KASSERT(dev->si_refcount > 0, ("devfs_poll() on un-referenced struct cdev *(%s)", devtoname(dev))); if (!(dsw->d_flags & D_NEEDGIANT)) { /* XXX: not yet DROP_GIANT(); */ error = dsw->d_poll(dev, ap->a_events, ap->a_td); /* XXX: not yet PICKUP_GIANT(); */ } else error = dsw->d_poll(dev, ap->a_events, ap->a_td); dev_relthread(dev); return(error); } /* * Print out the contents of a special device vnode. */ static int devfs_print(ap) struct vop_print_args /* { struct vnode *a_vp; } */ *ap; { printf("\tdev %s\n", devtoname(ap->a_vp->v_rdev)); return (0); } /* * Vnode op for read */ /* ARGSUSED */ static int devfs_read(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { struct vnode *vp; struct thread *td; struct uio *uio; struct cdev *dev; int error, resid; struct cdevsw *dsw; vp = ap->a_vp; dev = vp->v_rdev; uio = ap->a_uio; td = uio->uio_td; resid = uio->uio_resid; if (resid == 0) return (0); KASSERT(dev->si_refcount > 0, ("specread() on un-referenced struct cdev *(%s)", devtoname(dev))); dsw = dev_refthread(dev); if (dsw == NULL) return (ENXIO); VOP_UNLOCK(vp, 0, td); if (!(dsw->d_flags & D_NEEDGIANT)) { DROP_GIANT(); error = dsw->d_read(dev, uio, ap->a_ioflag); PICKUP_GIANT(); } else error = dsw->d_read(dev, uio, ap->a_ioflag); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); dev_relthread(dev); if (uio->uio_resid != resid || (error == 0 && resid != 0)) vfs_timestamp(&dev->si_atime); return (error); } static int devfs_readdir(ap) struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; int *a_ncookies; u_long **a_cookies; } */ *ap; { int error; struct uio *uio; struct dirent *dp; struct devfs_dirent *dd; struct devfs_dirent *de; struct devfs_mount *dmp; off_t off, oldoff; int ncookies = 0; u_long *cookiebuf, *cookiep; struct dirent *dps, *dpe; if (ap->a_vp->v_type != VDIR) return (ENOTDIR); uio = ap->a_uio; if (uio->uio_offset < 0) return (EINVAL); dmp = VFSTODEVFS(ap->a_vp->v_mount); lockmgr(&dmp->dm_lock, LK_SHARED, 0, curthread); devfs_populate(dmp); error = 0; de = ap->a_vp->v_data; off = 0; oldoff = uio->uio_offset; TAILQ_FOREACH(dd, &de->de_dlist, de_list) { if (dd->de_flags & DE_WHITEOUT) continue; if (dd->de_dirent->d_type == DT_DIR) de = dd->de_dir; else de = dd; dp = dd->de_dirent; if (dp->d_reclen > uio->uio_resid) break; dp->d_fileno = de->de_inode; if (off >= uio->uio_offset) { ncookies++; error = uiomove(dp, dp->d_reclen, uio); if (error) break; } off += dp->d_reclen; } if( !error && ap->a_ncookies != NULL && ap->a_cookies != NULL ) { MALLOC(cookiebuf, u_long *, ncookies * sizeof(u_long), M_TEMP, M_WAITOK); cookiep = cookiebuf; dps = (struct dirent *)((char *)uio->uio_iov->iov_base - (uio->uio_offset - oldoff)); dpe = (struct dirent *) uio->uio_iov->iov_base; for( dp = dps; dp < dpe; dp = (struct dirent *)((caddr_t) dp + dp->d_reclen)) { oldoff += dp->d_reclen; *cookiep++ = (u_long) oldoff; } *ap->a_ncookies = ncookies; *ap->a_cookies = cookiebuf; } lockmgr(&dmp->dm_lock, LK_RELEASE, 0, curthread); uio->uio_offset = off; return (error); } static int devfs_readlink(ap) struct vop_readlink_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cead; } */ *ap; { int error; struct devfs_dirent *de; de = ap->a_vp->v_data; error = uiomove(de->de_symlink, strlen(de->de_symlink), ap->a_uio); return (error); } static int devfs_reclaim(ap) struct vop_reclaim_args /* { struct vnode *a_vp; } */ *ap; { struct vnode *vp = ap->a_vp; struct devfs_dirent *de; int i; de = vp->v_data; if (de != NULL) de->de_vnode = NULL; vp->v_data = NULL; if (vp->v_rdev != NULL && vp->v_rdev != NULL) { i = vcount(vp); if ((vp->v_rdev->si_flags & SI_CHEAPCLONE) && i == 0 && (vp->v_rdev->si_flags & SI_NAMED)) destroy_dev(vp->v_rdev); } return (0); } static int devfs_remove(ap) struct vop_remove_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { struct vnode *vp = ap->a_vp; struct devfs_dirent *dd; struct devfs_dirent *de; struct devfs_mount *dmp = VFSTODEVFS(vp->v_mount); lockmgr(&dmp->dm_lock, LK_EXCLUSIVE, 0, curthread); dd = ap->a_dvp->v_data; de = vp->v_data; if (de->de_dirent->d_type == DT_LNK) { TAILQ_REMOVE(&dd->de_dlist, de, de_list); if (de->de_vnode) de->de_vnode->v_data = NULL; #ifdef MAC mac_destroy_devfsdirent(de); #endif FREE(de, M_DEVFS); } else { de->de_flags |= DE_WHITEOUT; } lockmgr(&dmp->dm_lock, LK_RELEASE, 0, curthread); return (0); } /* * Revoke is called on a tty when a terminal session ends. The vnode * is orphaned by setting v_op to deadfs so we need to let go of it * as well so that we create a new one next time around. */ static int devfs_revoke(ap) struct vop_revoke_args /* { struct vnode *a_vp; int a_flags; } */ *ap; { struct vnode *vp = ap->a_vp; struct devfs_dirent *de; de = vp->v_data; de->de_vnode = NULL; vop_revoke(ap); return (0); } static int devfs_rioctl(ap) struct vop_ioctl_args /* { struct vnode *a_vp; u_long a_command; caddr_t a_data; int a_fflag; struct ucred *a_cred; struct thread *a_td; } */ *ap; { int error; struct devfs_mount *dmp; dmp = VFSTODEVFS(ap->a_vp->v_mount); lockmgr(&dmp->dm_lock, LK_SHARED, 0, curthread); devfs_populate(dmp); lockmgr(&dmp->dm_lock, LK_RELEASE, 0, curthread); error = devfs_rules_ioctl(ap->a_vp->v_mount, ap->a_command, ap->a_data, ap->a_td); return (error); } static int devfs_rread(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { if (ap->a_vp->v_type != VDIR) return (EINVAL); return (VOP_READDIR(ap->a_vp, ap->a_uio, ap->a_cred, NULL, NULL, NULL)); } static int devfs_setattr(ap) struct vop_setattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct proc *a_p; } */ *ap; { struct devfs_dirent *de; struct vattr *vap; struct vnode *vp; int c, error; uid_t uid; gid_t gid; vap = ap->a_vap; vp = ap->a_vp; if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) || (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || (vap->va_blocksize != VNOVAL) || (vap->va_flags != VNOVAL && vap->va_flags != 0) || (vap->va_rdev != VNOVAL) || ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) { return (EINVAL); } de = vp->v_data; if (vp->v_type == VDIR) de = de->de_dir; error = c = 0; if (vap->va_uid == (uid_t)VNOVAL) uid = de->de_uid; else uid = vap->va_uid; if (vap->va_gid == (gid_t)VNOVAL) gid = de->de_gid; else gid = vap->va_gid; if (uid != de->de_uid || gid != de->de_gid) { if (((ap->a_cred->cr_uid != de->de_uid) || uid != de->de_uid || (gid != de->de_gid && !groupmember(gid, ap->a_cred))) && (error = suser_cred(ap->a_td->td_ucred, SUSER_ALLOWJAIL)) != 0) return (error); de->de_uid = uid; de->de_gid = gid; c = 1; } if (vap->va_mode != (mode_t)VNOVAL) { if ((ap->a_cred->cr_uid != de->de_uid) && (error = suser_cred(ap->a_td->td_ucred, SUSER_ALLOWJAIL))) return (error); de->de_mode = vap->va_mode; c = 1; } if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) { /* See the comment in ufs_vnops::ufs_setattr(). */ if ((error = VOP_ACCESS(vp, VADMIN, ap->a_cred, ap->a_td)) && ((vap->va_vaflags & VA_UTIMES_NULL) == 0 || (error = VOP_ACCESS(vp, VWRITE, ap->a_cred, ap->a_td)))) return (error); if (vap->va_atime.tv_sec != VNOVAL) { if (vp->v_type == VCHR) vp->v_rdev->si_atime = vap->va_atime; else de->de_atime = vap->va_atime; } if (vap->va_mtime.tv_sec != VNOVAL) { if (vp->v_type == VCHR) vp->v_rdev->si_mtime = vap->va_mtime; else de->de_mtime = vap->va_mtime; } c = 1; } if (c) { if (vp->v_type == VCHR) vfs_timestamp(&vp->v_rdev->si_ctime); else vfs_timestamp(&de->de_mtime); } return (0); } #ifdef MAC static int devfs_setlabel(ap) struct vop_setlabel_args /* { struct vnode *a_vp; struct mac *a_label; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp; struct devfs_dirent *de; vp = ap->a_vp; de = vp->v_data; mac_relabel_vnode(ap->a_cred, vp, ap->a_label); mac_update_devfsdirent(vp->v_mount, de, vp); return (0); } #endif static int doslowdown = 0; SYSCTL_INT(_debug, OID_AUTO, doslowdown, CTLFLAG_RW, &doslowdown, 0, ""); static int devfs_specstrategy(ap) struct vop_specstrategy_args /* { struct vnode *a_vp; struct buf *a_bp; } */ *ap; { struct vnode *vp = ap->a_vp; struct buf *bp = ap->a_bp; struct mount *mp; struct thread *td = curthread; KASSERT(ap->a_vp->v_rdev == ap->a_bp->b_dev, ("%s, dev %s != %s", __func__, devtoname(ap->a_vp->v_rdev), devtoname(ap->a_bp->b_dev))); KASSERT(bp->b_iocmd == BIO_READ || bp->b_iocmd == BIO_WRITE, ("Wrong b_iocmd buf=%p cmd=%d", bp, bp->b_iocmd)); /* * Slow down disk requests for niced processes. */ if (doslowdown && td && td->td_proc->p_nice > 0) { msleep(td, NULL, PPAUSE | PCATCH, "ioslow", td->td_proc->p_nice); } /* * Collect statistics on synchronous and asynchronous read * and write counts for disks that have associated filesystems. */ if (vn_isdisk(vp, NULL) && (mp = vp->v_rdev->si_mountpoint) != NULL) { if (bp->b_iocmd == BIO_WRITE) { if (bp->b_lock.lk_lockholder == LK_KERNPROC) mp->mnt_stat.f_asyncwrites++; else mp->mnt_stat.f_syncwrites++; } else { if (bp->b_lock.lk_lockholder == LK_KERNPROC) mp->mnt_stat.f_asyncreads++; else mp->mnt_stat.f_syncreads++; } } - dev_strategy(bp); + dev_strategy(bp->b_dev, bp); return (0); } static int devfs_symlink(ap) struct vop_symlink_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; char *a_target; } */ *ap; { int i, error; struct devfs_dirent *dd; struct devfs_dirent *de; struct devfs_mount *dmp; struct thread *td; td = ap->a_cnp->cn_thread; KASSERT(td == curthread, ("devfs_symlink: td != curthread")); error = suser(td); if (error) return(error); dmp = VFSTODEVFS(ap->a_dvp->v_mount); dd = ap->a_dvp->v_data; de = devfs_newdirent(ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen); de->de_uid = 0; de->de_gid = 0; de->de_mode = 0755; de->de_inode = dmp->dm_inode++; de->de_dirent->d_type = DT_LNK; i = strlen(ap->a_target) + 1; MALLOC(de->de_symlink, char *, i, M_DEVFS, M_WAITOK); bcopy(ap->a_target, de->de_symlink, i); lockmgr(&dmp->dm_lock, LK_EXCLUSIVE, 0, td); #ifdef MAC mac_create_devfs_symlink(ap->a_cnp->cn_cred, dmp->dm_mount, dd, de); #endif TAILQ_INSERT_TAIL(&dd->de_dlist, de, de_list); devfs_allocv(de, ap->a_dvp->v_mount, ap->a_vpp, td); lockmgr(&dmp->dm_lock, LK_RELEASE, 0, td); return (0); } /* * Vnode op for write */ /* ARGSUSED */ static int devfs_write(ap) struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { struct vnode *vp; struct thread *td; struct uio *uio; struct cdev *dev; int error, resid; struct cdevsw *dsw; vp = ap->a_vp; dev = vp->v_rdev; uio = ap->a_uio; td = uio->uio_td; resid = uio->uio_resid; dsw = dev_refthread(dev); if (dsw == NULL) return (ENXIO); VOP_UNLOCK(vp, 0, td); KASSERT(dev->si_refcount > 0, ("devfs_write() on un-referenced struct cdev *(%s)", devtoname(dev))); if (!(dsw->d_flags & D_NEEDGIANT)) { DROP_GIANT(); error = dsw->d_write(dev, uio, ap->a_ioflag); PICKUP_GIANT(); } else error = dsw->d_write(dev, uio, ap->a_ioflag); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); dev_relthread(dev); if (uio->uio_resid != resid || (error == 0 && resid != 0)) { vfs_timestamp(&dev->si_ctime); dev->si_mtime = dev->si_ctime; } return (error); } static struct vnodeopv_entry_desc devfs_vnodeop_entries[] = { { &vop_default_desc, (vop_t *) vop_defaultop }, { &vop_access_desc, (vop_t *) devfs_access }, { &vop_getattr_desc, (vop_t *) devfs_getattr }, { &vop_ioctl_desc, (vop_t *) devfs_rioctl }, { &vop_lookup_desc, (vop_t *) devfs_lookup }, { &vop_mknod_desc, (vop_t *) devfs_mknod }, { &vop_pathconf_desc, (vop_t *) devfs_pathconf }, { &vop_read_desc, (vop_t *) devfs_rread }, { &vop_readdir_desc, (vop_t *) devfs_readdir }, { &vop_readlink_desc, (vop_t *) devfs_readlink }, { &vop_reclaim_desc, (vop_t *) devfs_reclaim }, { &vop_remove_desc, (vop_t *) devfs_remove }, { &vop_revoke_desc, (vop_t *) devfs_revoke }, { &vop_setattr_desc, (vop_t *) devfs_setattr }, #ifdef MAC { &vop_setlabel_desc, (vop_t *) devfs_setlabel }, #endif { &vop_symlink_desc, (vop_t *) devfs_symlink }, { NULL, NULL } }; static struct vnodeopv_desc devfs_vnodeop_opv_desc = { &devfs_vnodeop_p, devfs_vnodeop_entries }; VNODEOP_SET(devfs_vnodeop_opv_desc); static struct vnodeopv_entry_desc devfs_specop_entries[] = { { &vop_default_desc, (vop_t *) vop_defaultop }, { &vop_access_desc, (vop_t *) devfs_access }, { &vop_advlock_desc, (vop_t *) devfs_advlock }, { &vop_bmap_desc, (vop_t *) vop_panic }, { &vop_close_desc, (vop_t *) devfs_close }, { &vop_create_desc, (vop_t *) vop_panic }, { &vop_fsync_desc, (vop_t *) devfs_fsync }, { &vop_getattr_desc, (vop_t *) devfs_getattr }, { &vop_getwritemount_desc, (vop_t *) vop_stdgetwritemount }, { &vop_ioctl_desc, (vop_t *) devfs_ioctl }, { &vop_kqfilter_desc, (vop_t *) devfs_kqfilter }, { &vop_lease_desc, (vop_t *) vop_null }, { &vop_link_desc, (vop_t *) vop_panic }, { &vop_mkdir_desc, (vop_t *) vop_panic }, { &vop_mknod_desc, (vop_t *) vop_panic }, { &vop_open_desc, (vop_t *) devfs_open }, { &vop_pathconf_desc, (vop_t *) devfs_pathconf }, { &vop_poll_desc, (vop_t *) devfs_poll }, { &vop_print_desc, (vop_t *) devfs_print }, { &vop_read_desc, (vop_t *) devfs_read }, { &vop_readdir_desc, (vop_t *) vop_panic }, { &vop_readlink_desc, (vop_t *) vop_panic }, { &vop_reallocblks_desc, (vop_t *) vop_panic }, { &vop_reclaim_desc, (vop_t *) devfs_reclaim }, { &vop_remove_desc, (vop_t *) devfs_remove }, { &vop_rename_desc, (vop_t *) vop_panic }, { &vop_revoke_desc, (vop_t *) devfs_revoke }, { &vop_rmdir_desc, (vop_t *) vop_panic }, { &vop_setattr_desc, (vop_t *) devfs_setattr }, #ifdef MAC { &vop_setlabel_desc, (vop_t *) devfs_setlabel }, #endif { &vop_specstrategy_desc, (vop_t *) devfs_specstrategy }, { &vop_strategy_desc, (vop_t *) vop_panic }, { &vop_symlink_desc, (vop_t *) vop_panic }, { &vop_write_desc, (vop_t *) devfs_write }, { NULL, NULL } }; static struct vnodeopv_desc devfs_specop_opv_desc = { &devfs_specop_p, devfs_specop_entries }; VNODEOP_SET(devfs_specop_opv_desc); Index: head/sys/geom/geom_dev.c =================================================================== --- head/sys/geom/geom_dev.c (revision 137028) +++ head/sys/geom/geom_dev.c (revision 137029) @@ -1,432 +1,432 @@ /*- * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The names of the authors may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static d_open_t g_dev_open; static d_close_t g_dev_close; static d_strategy_t g_dev_strategy; static d_ioctl_t g_dev_ioctl; static struct cdevsw g_dev_cdevsw = { .d_version = D_VERSION, .d_open = g_dev_open, .d_close = g_dev_close, .d_read = physread, .d_write = physwrite, .d_ioctl = g_dev_ioctl, .d_strategy = g_dev_strategy, .d_name = "g_dev", .d_flags = D_DISK | D_TRACKCLOSE, }; static g_taste_t g_dev_taste; static g_orphan_t g_dev_orphan; static g_init_t g_dev_init; static struct g_class g_dev_class = { .name = "DEV", .version = G_VERSION, .taste = g_dev_taste, .orphan = g_dev_orphan, .init = g_dev_init, }; static struct unrhdr *unithdr; /* Locked by topology */ static void g_dev_init(struct g_class *mp) { /* XXX: should have a #define MAX_UNIT_MINOR */ unithdr = new_unrhdr(0, 0xffffff); } void g_dev_print(void) { struct g_geom *gp; char const *p = ""; LIST_FOREACH(gp, &g_dev_class.geom, geom) { printf("%s%s", p, gp->name); p = " "; } printf("\n"); } struct g_provider * g_dev_getprovider(struct cdev *dev) { struct g_consumer *cp; g_topology_assert(); if (dev == NULL) return (NULL); if (dev->si_devsw != &g_dev_cdevsw) cp = NULL; else cp = dev->si_drv2; return (cp->provider); } static struct g_geom * g_dev_taste(struct g_class *mp, struct g_provider *pp, int insist __unused) { struct g_geom *gp; struct g_consumer *cp; int error; struct cdev *dev; u_int unit; g_trace(G_T_TOPOLOGY, "dev_taste(%s,%s)", mp->name, pp->name); g_topology_assert(); LIST_FOREACH(cp, &pp->consumers, consumers) if (cp->geom->class == mp) return (NULL); gp = g_new_geomf(mp, pp->name); cp = g_new_consumer(gp); error = g_attach(cp, pp); KASSERT(error == 0, ("g_dev_taste(%s) failed to g_attach, err=%d", pp->name, error)); /* * XXX: I'm not 100% sure we can call make_dev(9) without Giant * yet. Once we can, we don't need to drop topology here either. */ unit = alloc_unr(unithdr); g_topology_unlock(); mtx_lock(&Giant); dev = make_dev(&g_dev_cdevsw, unit2minor(unit), UID_ROOT, GID_OPERATOR, 0640, gp->name); if (pp->flags & G_PF_CANDELETE) dev->si_flags |= SI_CANDELETE; mtx_unlock(&Giant); g_topology_lock(); dev->si_iosize_max = MAXPHYS; gp->softc = dev; dev->si_drv1 = gp; dev->si_drv2 = cp; return (gp); } static int g_dev_open(struct cdev *dev, int flags, int fmt, struct thread *td) { struct g_geom *gp; struct g_consumer *cp; int error, r, w, e; gp = dev->si_drv1; cp = dev->si_drv2; if (gp == NULL || cp == NULL || gp->softc != dev) return(ENXIO); /* g_dev_taste() not done yet */ g_trace(G_T_ACCESS, "g_dev_open(%s, %d, %d, %p)", gp->name, flags, fmt, td); r = flags & FREAD ? 1 : 0; w = flags & FWRITE ? 1 : 0; #ifdef notyet e = flags & O_EXCL ? 1 : 0; #else e = 0; #endif if (w) { /* * When running in very secure mode, do not allow * opens for writing of any disks. */ error = securelevel_ge(td->td_ucred, 2); if (error) return (error); } g_topology_lock(); if (dev->si_devsw == NULL) error = ENXIO; /* We were orphaned */ else error = g_access(cp, r, w, e); g_topology_unlock(); if (!error) dev->si_bsize_phys = cp->provider->sectorsize; return(error); } static int g_dev_close(struct cdev *dev, int flags, int fmt, struct thread *td) { struct g_geom *gp; struct g_consumer *cp; int error, r, w, e, i; gp = dev->si_drv1; cp = dev->si_drv2; if (gp == NULL || cp == NULL) return(ENXIO); g_trace(G_T_ACCESS, "g_dev_close(%s, %d, %d, %p)", gp->name, flags, fmt, td); r = flags & FREAD ? -1 : 0; w = flags & FWRITE ? -1 : 0; #ifdef notyet e = flags & O_EXCL ? -1 : 0; #else e = 0; #endif g_topology_lock(); if (dev->si_devsw == NULL) error = ENXIO; /* We were orphaned */ else error = g_access(cp, r, w, e); for (i = 0; i < 10 * hz;) { if (cp->acr != 0 || cp->acw != 0) break; if (cp->nstart == cp->nend) break; tsleep(&i, PRIBIO, "gdevwclose", hz / 10); i += hz / 10; } if (cp->acr == 0 && cp->acw == 0 && cp->nstart != cp->nend) { printf("WARNING: Final close of geom_dev(%s) %s %s\n", gp->name, "still has outstanding I/O after 10 seconds.", "Completing close anyway, panic may happen later."); } g_topology_unlock(); return (error); } /* * XXX: Until we have unmessed the ioctl situation, there is a race against * XXX: a concurrent orphanization. We cannot close it by holding topology * XXX: since that would prevent us from doing our job, and stalling events * XXX: will break (actually: stall) the BSD disklabel hacks. */ static int g_dev_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td) { struct g_geom *gp; struct g_consumer *cp; struct g_kerneldump kd; int i, error; u_int u; gp = dev->si_drv1; cp = dev->si_drv2; error = 0; KASSERT(cp->acr || cp->acw, ("Consumer with zero access count in g_dev_ioctl")); i = IOCPARM_LEN(cmd); switch (cmd) { case DIOCGSECTORSIZE: *(u_int *)data = cp->provider->sectorsize; if (*(u_int *)data == 0) error = ENOENT; break; case DIOCGMEDIASIZE: *(off_t *)data = cp->provider->mediasize; if (*(off_t *)data == 0) error = ENOENT; break; case DIOCGFWSECTORS: error = g_io_getattr("GEOM::fwsectors", cp, &i, data); if (error == 0 && *(u_int *)data == 0) error = ENOENT; break; case DIOCGFWHEADS: error = g_io_getattr("GEOM::fwheads", cp, &i, data); if (error == 0 && *(u_int *)data == 0) error = ENOENT; break; case DIOCGFRONTSTUFF: error = g_io_getattr("GEOM::frontstuff", cp, &i, data); break; case DIOCSKERNELDUMP: u = *((u_int *)data); if (!u) { set_dumper(NULL); error = 0; break; } kd.offset = 0; kd.length = OFF_MAX; i = sizeof kd; error = g_io_getattr("GEOM::kerneldump", cp, &i, &kd); if (!error) dev->si_flags |= SI_DUMPDEV; break; default: if (cp->provider->geom->ioctl != NULL) { error = cp->provider->geom->ioctl(cp->provider, cmd, data, td); } else { error = ENOIOCTL; } } return (error); } static void g_dev_done(struct bio *bp2) { struct bio *bp; bp = bp2->bio_parent; bp->bio_error = bp2->bio_error; if (bp->bio_error != 0) { g_trace(G_T_BIO, "g_dev_done(%p) had error %d", bp2, bp->bio_error); bp->bio_flags |= BIO_ERROR; } else { g_trace(G_T_BIO, "g_dev_done(%p/%p) resid %ld completed %jd", bp2, bp, bp->bio_resid, (intmax_t)bp2->bio_completed); } - bp->bio_resid = bp->bio_bcount - bp2->bio_completed; + bp->bio_resid = bp->bio_length - bp2->bio_completed; + bp->bio_completed = bp2->bio_completed; g_destroy_bio(bp2); biodone(bp); } static void g_dev_strategy(struct bio *bp) { struct g_consumer *cp; struct bio *bp2; struct cdev *dev; KASSERT(bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_DELETE, ("Wrong bio_cmd bio=%p cmd=%d", bp, bp->bio_cmd)); dev = bp->bio_dev; cp = dev->si_drv2; KASSERT(cp->acr || cp->acw, ("Consumer with zero access count in g_dev_strategy")); if ((bp->bio_offset % cp->provider->sectorsize) != 0 || (bp->bio_bcount % cp->provider->sectorsize) != 0) { biofinish(bp, NULL, EINVAL); return; } for (;;) { /* * XXX: This is not an ideal solution, but I belive it to * XXX: deadlock safe, all things considered. */ bp2 = g_clone_bio(bp); if (bp2 != NULL) break; tsleep(&bp, PRIBIO, "gdstrat", hz / 10); } KASSERT(bp2 != NULL, ("XXX: ENOMEM in a bad place")); - bp2->bio_length = (off_t)bp->bio_bcount; bp2->bio_done = g_dev_done; g_trace(G_T_BIO, "g_dev_strategy(%p/%p) offset %jd length %jd data %p cmd %d", bp, bp2, (intmax_t)bp->bio_offset, (intmax_t)bp2->bio_length, bp2->bio_data, bp2->bio_cmd); g_io_request(bp2, cp); KASSERT(cp->acr || cp->acw, ("g_dev_strategy raced with g_dev_close and lost")); } /* * g_dev_orphan() * * Called from below when the provider orphaned us. * - Clear any dump settings. * - Destroy the struct cdev *to prevent any more request from coming in. The * provider is already marked with an error, so anything which comes in * in the interrim will be returned immediately. * - Wait for any outstanding I/O to finish. * - Set our access counts to zero, whatever they were. * - Detach and self-destruct. */ static void g_dev_orphan(struct g_consumer *cp) { struct g_geom *gp; struct cdev *dev; u_int unit; g_topology_assert(); gp = cp->geom; dev = gp->softc; g_trace(G_T_TOPOLOGY, "g_dev_orphan(%p(%s))", cp, gp->name); /* Reset any dump-area set on this device */ if (dev->si_flags & SI_DUMPDEV) set_dumper(NULL); /* Destroy the struct cdev *so we get no more requests */ unit = dev2unit(dev); destroy_dev(dev); free_unr(unithdr, unit); /* Wait for the cows to come home */ while (cp->nstart != cp->nend) msleep(&dev, NULL, PRIBIO, "gdevorphan", hz / 10); if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) g_access(cp, -cp->acr, -cp->acw, -cp->ace); g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); } DECLARE_GEOM_CLASS(g_dev_class, g_dev); Index: head/sys/kern/kern_physio.c =================================================================== --- head/sys/kern/kern_physio.c (revision 137028) +++ head/sys/kern/kern_physio.c (revision 137029) @@ -1,124 +1,124 @@ /* * Copyright (c) 1994 John S. Dyson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice immediately at the beginning of the file, without modification, * this list of conditions, and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Absolutely no warranty of function or purpose is made by the author * John S. Dyson. * 4. Modifications may be freely made to this file if the above conditions * are met. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include int physio(struct cdev *dev, struct uio *uio, int ioflag) { int i; int error; caddr_t sa; u_int iolen; struct buf *bp; /* Keep the process UPAGES from being swapped. XXX: why ? */ PHOLD(curproc); bp = getpbuf(NULL); sa = bp->b_data; error = 0; /* XXX: sanity check */ if(dev->si_iosize_max < PAGE_SIZE) { printf("WARNING: %s si_iosize_max=%d, using DFLTPHYS.\n", devtoname(dev), dev->si_iosize_max); dev->si_iosize_max = DFLTPHYS; } for (i = 0; i < uio->uio_iovcnt; i++) { while (uio->uio_iov[i].iov_len) { bp->b_flags = 0; if (uio->uio_rw == UIO_READ) bp->b_iocmd = BIO_READ; else bp->b_iocmd = BIO_WRITE; bp->b_dev = dev; bp->b_iodone = bdone; bp->b_data = uio->uio_iov[i].iov_base; bp->b_bcount = uio->uio_iov[i].iov_len; bp->b_offset = uio->uio_offset; bp->b_iooffset = uio->uio_offset; bp->b_saveaddr = sa; /* Don't exceed drivers iosize limit */ if (bp->b_bcount > dev->si_iosize_max) bp->b_bcount = dev->si_iosize_max; /* * Make sure the pbuf can map the request * XXX: The pbuf has kvasize = MAXPHYS so a request * XXX: larger than MAXPHYS - PAGE_SIZE must be * XXX: page aligned or it will be fragmented. */ iolen = ((vm_offset_t) bp->b_data) & PAGE_MASK; if ((bp->b_bcount + iolen) > bp->b_kvasize) { bp->b_bcount = bp->b_kvasize; if (iolen != 0) bp->b_bcount -= PAGE_SIZE; } bp->b_bufsize = bp->b_bcount; bp->b_blkno = btodb(bp->b_offset); if (uio->uio_segflg == UIO_USERSPACE) if (vmapbuf(bp) < 0) { error = EFAULT; goto doerror; } - dev_strategy(bp); + dev_strategy(dev, bp); if (uio->uio_rw == UIO_READ) bwait(bp, PRIBIO, "physrd"); else bwait(bp, PRIBIO, "physwr"); if (uio->uio_segflg == UIO_USERSPACE) vunmapbuf(bp); iolen = bp->b_bcount - bp->b_resid; if (iolen == 0 && !(bp->b_ioflags & BIO_ERROR)) goto doerror; /* EOF */ uio->uio_iov[i].iov_len -= iolen; uio->uio_iov[i].iov_base = (char *)uio->uio_iov[i].iov_base + iolen; uio->uio_resid -= iolen; uio->uio_offset += iolen; if( bp->b_ioflags & BIO_ERROR) { error = bp->b_error; goto doerror; } } } doerror: relpbuf(bp, NULL); PRELE(curproc); return (error); } Index: head/sys/kern/vfs_aio.c =================================================================== --- head/sys/kern/vfs_aio.c (revision 137028) +++ head/sys/kern/vfs_aio.c (revision 137029) @@ -1,2326 +1,2326 @@ /* * Copyright (c) 1997 John S. Dyson. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. John S. Dyson's name may not be used to endorse or promote products * derived from this software without specific prior written permission. * * DISCLAIMER: This code isn't warranted to do anything useful. Anything * bad that happens because of using this software isn't the responsibility * of the author. This software is distributed AS-IS. */ /* * This file contains support for the POSIX 1003.1B AIO/LIO facility. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "opt_vfs_aio.h" NET_NEEDS_GIANT("aio"); /* * Counter for allocating reference ids to new jobs. Wrapped to 1 on * overflow. */ static long jobrefid; #define JOBST_NULL 0x0 #define JOBST_JOBQGLOBAL 0x2 #define JOBST_JOBRUNNING 0x3 #define JOBST_JOBFINISHED 0x4 #define JOBST_JOBQBUF 0x5 #define JOBST_JOBBFINISHED 0x6 #ifndef MAX_AIO_PER_PROC #define MAX_AIO_PER_PROC 32 #endif #ifndef MAX_AIO_QUEUE_PER_PROC #define MAX_AIO_QUEUE_PER_PROC 256 /* Bigger than AIO_LISTIO_MAX */ #endif #ifndef MAX_AIO_PROCS #define MAX_AIO_PROCS 32 #endif #ifndef MAX_AIO_QUEUE #define MAX_AIO_QUEUE 1024 /* Bigger than AIO_LISTIO_MAX */ #endif #ifndef TARGET_AIO_PROCS #define TARGET_AIO_PROCS 4 #endif #ifndef MAX_BUF_AIO #define MAX_BUF_AIO 16 #endif #ifndef AIOD_TIMEOUT_DEFAULT #define AIOD_TIMEOUT_DEFAULT (10 * hz) #endif #ifndef AIOD_LIFETIME_DEFAULT #define AIOD_LIFETIME_DEFAULT (30 * hz) #endif SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "Async IO management"); static int max_aio_procs = MAX_AIO_PROCS; SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs, CTLFLAG_RW, &max_aio_procs, 0, "Maximum number of kernel threads to use for handling async IO "); static int num_aio_procs = 0; SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs, CTLFLAG_RD, &num_aio_procs, 0, "Number of presently active kernel threads for async IO"); /* * The code will adjust the actual number of AIO processes towards this * number when it gets a chance. */ static int target_aio_procs = TARGET_AIO_PROCS; SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, CTLFLAG_RW, &target_aio_procs, 0, "Preferred number of ready kernel threads for async IO"); static int max_queue_count = MAX_AIO_QUEUE; SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, CTLFLAG_RW, &max_queue_count, 0, "Maximum number of aio requests to queue, globally"); static int num_queue_count = 0; SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, CTLFLAG_RD, &num_queue_count, 0, "Number of queued aio requests"); static int num_buf_aio = 0; SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, CTLFLAG_RD, &num_buf_aio, 0, "Number of aio requests presently handled by the buf subsystem"); /* Number of async I/O thread in the process of being started */ /* XXX This should be local to _aio_aqueue() */ static int num_aio_resv_start = 0; static int aiod_timeout; SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_timeout, CTLFLAG_RW, &aiod_timeout, 0, "Timeout value for synchronous aio operations"); static int aiod_lifetime; SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, CTLFLAG_RW, &aiod_lifetime, 0, "Maximum lifetime for idle aiod"); static int unloadable = 0; SYSCTL_INT(_vfs_aio, OID_AUTO, unloadable, CTLFLAG_RW, &unloadable, 0, "Allow unload of aio (not recommended)"); static int max_aio_per_proc = MAX_AIO_PER_PROC; SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, CTLFLAG_RW, &max_aio_per_proc, 0, "Maximum active aio requests per process (stored in the process)"); static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC; SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, CTLFLAG_RW, &max_aio_queue_per_proc, 0, "Maximum queued aio requests per process (stored in the process)"); static int max_buf_aio = MAX_BUF_AIO; SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, CTLFLAG_RW, &max_buf_aio, 0, "Maximum buf aio requests per process (stored in the process)"); struct aiocblist { TAILQ_ENTRY(aiocblist) list; /* List of jobs */ TAILQ_ENTRY(aiocblist) plist; /* List of jobs for proc */ int jobflags; int jobstate; int inputcharge; int outputcharge; struct callout_handle timeouthandle; struct buf *bp; /* Buffer pointer */ struct proc *userproc; /* User process */ /* Not td! */ struct ucred *cred; /* Active credential when created */ struct file *fd_file; /* Pointer to file structure */ struct aio_liojob *lio; /* Optional lio job */ struct aiocb *uuaiocb; /* Pointer in userspace of aiocb */ struct knlist klist; /* list of knotes */ struct aiocb uaiocb; /* Kernel I/O control block */ }; /* jobflags */ #define AIOCBLIST_RUNDOWN 0x4 #define AIOCBLIST_DONE 0x10 /* * AIO process info */ #define AIOP_FREE 0x1 /* proc on free queue */ #define AIOP_SCHED 0x2 /* proc explicitly scheduled */ struct aiothreadlist { int aiothreadflags; /* AIO proc flags */ TAILQ_ENTRY(aiothreadlist) list; /* List of processes */ struct thread *aiothread; /* The AIO thread */ }; /* * data-structure for lio signal management */ struct aio_liojob { int lioj_flags; int lioj_buffer_count; int lioj_buffer_finished_count; int lioj_queue_count; int lioj_queue_finished_count; struct sigevent lioj_signal; /* signal on all I/O done */ TAILQ_ENTRY(aio_liojob) lioj_list; struct kaioinfo *lioj_ki; }; #define LIOJ_SIGNAL 0x1 /* signal on all done (lio) */ #define LIOJ_SIGNAL_POSTED 0x2 /* signal has been posted */ /* * per process aio data structure */ struct kaioinfo { int kaio_flags; /* per process kaio flags */ int kaio_maxactive_count; /* maximum number of AIOs */ int kaio_active_count; /* number of currently used AIOs */ int kaio_qallowed_count; /* maxiumu size of AIO queue */ int kaio_queue_count; /* size of AIO queue */ int kaio_ballowed_count; /* maximum number of buffers */ int kaio_queue_finished_count; /* number of daemon jobs finished */ int kaio_buffer_count; /* number of physio buffers */ int kaio_buffer_finished_count; /* count of I/O done */ struct proc *kaio_p; /* process that uses this kaio block */ TAILQ_HEAD(,aio_liojob) kaio_liojoblist; /* list of lio jobs */ TAILQ_HEAD(,aiocblist) kaio_jobqueue; /* job queue for process */ TAILQ_HEAD(,aiocblist) kaio_jobdone; /* done queue for process */ TAILQ_HEAD(,aiocblist) kaio_bufqueue; /* buffer job queue for process */ TAILQ_HEAD(,aiocblist) kaio_bufdone; /* buffer done queue for process */ TAILQ_HEAD(,aiocblist) kaio_sockqueue; /* queue for aios waiting on sockets */ }; #define KAIO_RUNDOWN 0x1 /* process is being run down */ #define KAIO_WAKEUP 0x2 /* wakeup process when there is a significant event */ static TAILQ_HEAD(,aiothreadlist) aio_activeproc; /* Active daemons */ static TAILQ_HEAD(,aiothreadlist) aio_freeproc; /* Idle daemons */ static TAILQ_HEAD(,aiocblist) aio_jobs; /* Async job list */ static TAILQ_HEAD(,aiocblist) aio_bufjobs; /* Phys I/O job list */ static void aio_init_aioinfo(struct proc *p); static void aio_onceonly(void); static int aio_free_entry(struct aiocblist *aiocbe); static void aio_process(struct aiocblist *aiocbe); static int aio_newproc(void); static int aio_aqueue(struct thread *td, struct aiocb *job, int type); static void aio_physwakeup(struct buf *bp); static void aio_proc_rundown(void *arg, struct proc *p); static int aio_fphysio(struct aiocblist *aiocbe); static int aio_qphysio(struct proc *p, struct aiocblist *iocb); static void aio_daemon(void *uproc); static void aio_swake_cb(struct socket *, struct sockbuf *); static int aio_unload(void); static void process_signal(void *aioj); static int filt_aioattach(struct knote *kn); static void filt_aiodetach(struct knote *kn); static int filt_aio(struct knote *kn, long hint); /* * Zones for: * kaio Per process async io info * aiop async io thread data * aiocb async io jobs * aiol list io job pointer - internal to aio_suspend XXX * aiolio list io jobs */ static uma_zone_t kaio_zone, aiop_zone, aiocb_zone, aiol_zone, aiolio_zone; /* kqueue filters for aio */ static struct filterops aio_filtops = { 0, filt_aioattach, filt_aiodetach, filt_aio }; static eventhandler_tag exit_tag, exec_tag; /* * Main operations function for use as a kernel module. */ static int aio_modload(struct module *module, int cmd, void *arg) { int error = 0; switch (cmd) { case MOD_LOAD: aio_onceonly(); break; case MOD_UNLOAD: error = aio_unload(); break; case MOD_SHUTDOWN: break; default: error = EINVAL; break; } return (error); } static moduledata_t aio_mod = { "aio", &aio_modload, NULL }; SYSCALL_MODULE_HELPER(aio_return); SYSCALL_MODULE_HELPER(aio_suspend); SYSCALL_MODULE_HELPER(aio_cancel); SYSCALL_MODULE_HELPER(aio_error); SYSCALL_MODULE_HELPER(aio_read); SYSCALL_MODULE_HELPER(aio_write); SYSCALL_MODULE_HELPER(aio_waitcomplete); SYSCALL_MODULE_HELPER(lio_listio); DECLARE_MODULE(aio, aio_mod, SI_SUB_VFS, SI_ORDER_ANY); MODULE_VERSION(aio, 1); /* * Startup initialization */ static void aio_onceonly(void) { /* XXX: should probably just use so->callback */ aio_swake = &aio_swake_cb; exit_tag = EVENTHANDLER_REGISTER(process_exit, aio_proc_rundown, NULL, EVENTHANDLER_PRI_ANY); exec_tag = EVENTHANDLER_REGISTER(process_exec, aio_proc_rundown, NULL, EVENTHANDLER_PRI_ANY); kqueue_add_filteropts(EVFILT_AIO, &aio_filtops); TAILQ_INIT(&aio_freeproc); TAILQ_INIT(&aio_activeproc); TAILQ_INIT(&aio_jobs); TAILQ_INIT(&aio_bufjobs); kaio_zone = uma_zcreate("AIO", sizeof(struct kaioinfo), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); aiop_zone = uma_zcreate("AIOP", sizeof(struct aiothreadlist), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); aiocb_zone = uma_zcreate("AIOCB", sizeof(struct aiocblist), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); aiol_zone = uma_zcreate("AIOL", AIO_LISTIO_MAX*sizeof(intptr_t) , NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); aiolio_zone = uma_zcreate("AIOLIO", sizeof(struct aio_liojob), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); aiod_timeout = AIOD_TIMEOUT_DEFAULT; aiod_lifetime = AIOD_LIFETIME_DEFAULT; jobrefid = 1; async_io_version = _POSIX_VERSION; p31b_setcfg(CTL_P1003_1B_AIO_LISTIO_MAX, AIO_LISTIO_MAX); p31b_setcfg(CTL_P1003_1B_AIO_MAX, MAX_AIO_QUEUE); p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, 0); } /* * Callback for unload of AIO when used as a module. */ static int aio_unload(void) { int error; /* * XXX: no unloads by default, it's too dangerous. * perhaps we could do it if locked out callers and then * did an aio_proc_rundown() on each process. */ if (!unloadable) return (EOPNOTSUPP); error = kqueue_del_filteropts(EVFILT_AIO); if (error) return error; async_io_version = 0; aio_swake = NULL; EVENTHANDLER_DEREGISTER(process_exit, exit_tag); EVENTHANDLER_DEREGISTER(process_exec, exec_tag); p31b_setcfg(CTL_P1003_1B_AIO_LISTIO_MAX, -1); p31b_setcfg(CTL_P1003_1B_AIO_MAX, -1); p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, -1); return (0); } /* * Init the per-process aioinfo structure. The aioinfo limits are set * per-process for user limit (resource) management. */ static void aio_init_aioinfo(struct proc *p) { struct kaioinfo *ki; if (p->p_aioinfo == NULL) { ki = uma_zalloc(kaio_zone, M_WAITOK); p->p_aioinfo = ki; ki->kaio_flags = 0; ki->kaio_maxactive_count = max_aio_per_proc; ki->kaio_active_count = 0; ki->kaio_qallowed_count = max_aio_queue_per_proc; ki->kaio_queue_count = 0; ki->kaio_ballowed_count = max_buf_aio; ki->kaio_buffer_count = 0; ki->kaio_buffer_finished_count = 0; ki->kaio_p = p; TAILQ_INIT(&ki->kaio_jobdone); TAILQ_INIT(&ki->kaio_jobqueue); TAILQ_INIT(&ki->kaio_bufdone); TAILQ_INIT(&ki->kaio_bufqueue); TAILQ_INIT(&ki->kaio_liojoblist); TAILQ_INIT(&ki->kaio_sockqueue); } while (num_aio_procs < target_aio_procs) aio_newproc(); } /* * Free a job entry. Wait for completion if it is currently active, but don't * delay forever. If we delay, we return a flag that says that we have to * restart the queue scan. */ static int aio_free_entry(struct aiocblist *aiocbe) { struct kaioinfo *ki; struct aio_liojob *lj; struct proc *p; int error; int s; if (aiocbe->jobstate == JOBST_NULL) panic("aio_free_entry: freeing already free job"); p = aiocbe->userproc; ki = p->p_aioinfo; lj = aiocbe->lio; if (ki == NULL) panic("aio_free_entry: missing p->p_aioinfo"); while (aiocbe->jobstate == JOBST_JOBRUNNING) { aiocbe->jobflags |= AIOCBLIST_RUNDOWN; tsleep(aiocbe, PRIBIO, "jobwai", 0); } if (aiocbe->bp == NULL) { if (ki->kaio_queue_count <= 0) panic("aio_free_entry: process queue size <= 0"); if (num_queue_count <= 0) panic("aio_free_entry: system wide queue size <= 0"); if (lj) { lj->lioj_queue_count--; if (aiocbe->jobflags & AIOCBLIST_DONE) lj->lioj_queue_finished_count--; } ki->kaio_queue_count--; if (aiocbe->jobflags & AIOCBLIST_DONE) ki->kaio_queue_finished_count--; num_queue_count--; } else { if (lj) { lj->lioj_buffer_count--; if (aiocbe->jobflags & AIOCBLIST_DONE) lj->lioj_buffer_finished_count--; } if (aiocbe->jobflags & AIOCBLIST_DONE) ki->kaio_buffer_finished_count--; ki->kaio_buffer_count--; num_buf_aio--; } /* aiocbe is going away, we need to destroy any knotes */ /* XXXKSE Note the thread here is used to eventually find the * owning process again, but it is also used to do a fo_close * and that requires the thread. (but does it require the * OWNING thread? (or maybe the running thread?) * There is a semantic problem here... */ knlist_clear(&aiocbe->klist, 0); /* XXXKSE */ if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags & KAIO_RUNDOWN) && ((ki->kaio_buffer_count == 0) && (ki->kaio_queue_count == 0)))) { ki->kaio_flags &= ~KAIO_WAKEUP; wakeup(p); } if (aiocbe->jobstate == JOBST_JOBQBUF) { if ((error = aio_fphysio(aiocbe)) != 0) return (error); if (aiocbe->jobstate != JOBST_JOBBFINISHED) panic("aio_free_entry: invalid physio finish-up state"); s = splbio(); TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist); splx(s); } else if (aiocbe->jobstate == JOBST_JOBQGLOBAL) { s = splnet(); TAILQ_REMOVE(&aio_jobs, aiocbe, list); TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist); splx(s); } else if (aiocbe->jobstate == JOBST_JOBFINISHED) TAILQ_REMOVE(&ki->kaio_jobdone, aiocbe, plist); else if (aiocbe->jobstate == JOBST_JOBBFINISHED) { s = splbio(); TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist); splx(s); if (aiocbe->bp) { vunmapbuf(aiocbe->bp); relpbuf(aiocbe->bp, NULL); aiocbe->bp = NULL; } } if (lj && (lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 0)) { TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); uma_zfree(aiolio_zone, lj); } aiocbe->jobstate = JOBST_NULL; untimeout(process_signal, aiocbe, aiocbe->timeouthandle); fdrop(aiocbe->fd_file, curthread); crfree(aiocbe->cred); uma_zfree(aiocb_zone, aiocbe); return (0); } /* * Rundown the jobs for a given process. */ static void aio_proc_rundown(void *arg, struct proc *p) { int s; struct kaioinfo *ki; struct aio_liojob *lj, *ljn; struct aiocblist *aiocbe, *aiocbn; struct file *fp; struct socket *so; ki = p->p_aioinfo; if (ki == NULL) return; mtx_lock(&Giant); ki->kaio_flags |= LIOJ_SIGNAL_POSTED; while ((ki->kaio_active_count > 0) || (ki->kaio_buffer_count > ki->kaio_buffer_finished_count)) { ki->kaio_flags |= KAIO_RUNDOWN; if (tsleep(p, PRIBIO, "kaiowt", aiod_timeout)) break; } /* * Move any aio ops that are waiting on socket I/O to the normal job * queues so they are cleaned up with any others. */ s = splnet(); for (aiocbe = TAILQ_FIRST(&ki->kaio_sockqueue); aiocbe; aiocbe = aiocbn) { aiocbn = TAILQ_NEXT(aiocbe, plist); fp = aiocbe->fd_file; if (fp != NULL) { so = fp->f_data; TAILQ_REMOVE(&so->so_aiojobq, aiocbe, list); if (TAILQ_EMPTY(&so->so_aiojobq)) { SOCKBUF_LOCK(&so->so_snd); so->so_snd.sb_flags &= ~SB_AIO; SOCKBUF_UNLOCK(&so->so_snd); SOCKBUF_LOCK(&so->so_rcv); so->so_rcv.sb_flags &= ~SB_AIO; SOCKBUF_UNLOCK(&so->so_rcv); } } TAILQ_REMOVE(&ki->kaio_sockqueue, aiocbe, plist); TAILQ_INSERT_HEAD(&aio_jobs, aiocbe, list); TAILQ_INSERT_HEAD(&ki->kaio_jobqueue, aiocbe, plist); } splx(s); restart1: for (aiocbe = TAILQ_FIRST(&ki->kaio_jobdone); aiocbe; aiocbe = aiocbn) { aiocbn = TAILQ_NEXT(aiocbe, plist); if (aio_free_entry(aiocbe)) goto restart1; } restart2: for (aiocbe = TAILQ_FIRST(&ki->kaio_jobqueue); aiocbe; aiocbe = aiocbn) { aiocbn = TAILQ_NEXT(aiocbe, plist); if (aio_free_entry(aiocbe)) goto restart2; } /* * Note the use of lots of splbio here, trying to avoid splbio for long chains * of I/O. Probably unnecessary. */ restart3: s = splbio(); while (TAILQ_FIRST(&ki->kaio_bufqueue)) { ki->kaio_flags |= KAIO_WAKEUP; tsleep(p, PRIBIO, "aioprn", 0); splx(s); goto restart3; } splx(s); restart4: s = splbio(); for (aiocbe = TAILQ_FIRST(&ki->kaio_bufdone); aiocbe; aiocbe = aiocbn) { aiocbn = TAILQ_NEXT(aiocbe, plist); if (aio_free_entry(aiocbe)) { splx(s); goto restart4; } } splx(s); /* * If we've slept, jobs might have moved from one queue to another. * Retry rundown if we didn't manage to empty the queues. */ if (TAILQ_FIRST(&ki->kaio_jobdone) != NULL || TAILQ_FIRST(&ki->kaio_jobqueue) != NULL || TAILQ_FIRST(&ki->kaio_bufqueue) != NULL || TAILQ_FIRST(&ki->kaio_bufdone) != NULL) goto restart1; for (lj = TAILQ_FIRST(&ki->kaio_liojoblist); lj; lj = ljn) { ljn = TAILQ_NEXT(lj, lioj_list); if ((lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 0)) { TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); uma_zfree(aiolio_zone, lj); } else { #ifdef DIAGNOSTIC printf("LIO job not cleaned up: B:%d, BF:%d, Q:%d, " "QF:%d\n", lj->lioj_buffer_count, lj->lioj_buffer_finished_count, lj->lioj_queue_count, lj->lioj_queue_finished_count); #endif } } uma_zfree(kaio_zone, ki); p->p_aioinfo = NULL; mtx_unlock(&Giant); } /* * Select a job to run (called by an AIO daemon). */ static struct aiocblist * aio_selectjob(struct aiothreadlist *aiop) { int s; struct aiocblist *aiocbe; struct kaioinfo *ki; struct proc *userp; s = splnet(); for (aiocbe = TAILQ_FIRST(&aio_jobs); aiocbe; aiocbe = TAILQ_NEXT(aiocbe, list)) { userp = aiocbe->userproc; ki = userp->p_aioinfo; if (ki->kaio_active_count < ki->kaio_maxactive_count) { TAILQ_REMOVE(&aio_jobs, aiocbe, list); splx(s); return (aiocbe); } } splx(s); return (NULL); } /* * The AIO processing activity. This is the code that does the I/O request for * the non-physio version of the operations. The normal vn operations are used, * and this code should work in all instances for every type of file, including * pipes, sockets, fifos, and regular files. */ static void aio_process(struct aiocblist *aiocbe) { struct ucred *td_savedcred; struct thread *td; struct proc *mycp; struct aiocb *cb; struct file *fp; struct uio auio; struct iovec aiov; int cnt; int error; int oublock_st, oublock_end; int inblock_st, inblock_end; td = curthread; td_savedcred = td->td_ucred; td->td_ucred = aiocbe->cred; mycp = td->td_proc; cb = &aiocbe->uaiocb; fp = aiocbe->fd_file; aiov.iov_base = (void *)(uintptr_t)cb->aio_buf; aiov.iov_len = cb->aio_nbytes; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = cb->aio_offset; auio.uio_resid = cb->aio_nbytes; cnt = cb->aio_nbytes; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; inblock_st = mycp->p_stats->p_ru.ru_inblock; oublock_st = mycp->p_stats->p_ru.ru_oublock; /* * _aio_aqueue() acquires a reference to the file that is * released in aio_free_entry(). */ if (cb->aio_lio_opcode == LIO_READ) { auio.uio_rw = UIO_READ; error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, td); } else { auio.uio_rw = UIO_WRITE; error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, td); } inblock_end = mycp->p_stats->p_ru.ru_inblock; oublock_end = mycp->p_stats->p_ru.ru_oublock; aiocbe->inputcharge = inblock_end - inblock_st; aiocbe->outputcharge = oublock_end - oublock_st; if ((error) && (auio.uio_resid != cnt)) { if (error == ERESTART || error == EINTR || error == EWOULDBLOCK) error = 0; if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) { PROC_LOCK(aiocbe->userproc); psignal(aiocbe->userproc, SIGPIPE); PROC_UNLOCK(aiocbe->userproc); } } cnt -= auio.uio_resid; cb->_aiocb_private.error = error; cb->_aiocb_private.status = cnt; td->td_ucred = td_savedcred; } /* * The AIO daemon, most of the actual work is done in aio_process, * but the setup (and address space mgmt) is done in this routine. */ static void aio_daemon(void *uproc) { int s; struct aio_liojob *lj; struct aiocb *cb; struct aiocblist *aiocbe; struct aiothreadlist *aiop; struct kaioinfo *ki; struct proc *curcp, *mycp, *userp; struct vmspace *myvm, *tmpvm; struct thread *td = curthread; struct pgrp *newpgrp; struct session *newsess; mtx_lock(&Giant); /* * Local copies of curproc (cp) and vmspace (myvm) */ mycp = td->td_proc; myvm = mycp->p_vmspace; KASSERT(mycp->p_textvp == NULL, ("kthread has a textvp")); /* * Allocate and ready the aio control info. There is one aiop structure * per daemon. */ aiop = uma_zalloc(aiop_zone, M_WAITOK); aiop->aiothread = td; aiop->aiothreadflags |= AIOP_FREE; s = splnet(); /* * Place thread (lightweight process) onto the AIO free thread list. */ if (TAILQ_EMPTY(&aio_freeproc)) wakeup(&aio_freeproc); TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list); splx(s); /* * Get rid of our current filedescriptors. AIOD's don't need any * filedescriptors, except as temporarily inherited from the client. */ fdfree(td); mtx_unlock(&Giant); /* The daemon resides in its own pgrp. */ MALLOC(newpgrp, struct pgrp *, sizeof(struct pgrp), M_PGRP, M_WAITOK | M_ZERO); MALLOC(newsess, struct session *, sizeof(struct session), M_SESSION, M_WAITOK | M_ZERO); sx_xlock(&proctree_lock); enterpgrp(mycp, mycp->p_pid, newpgrp, newsess); sx_xunlock(&proctree_lock); mtx_lock(&Giant); /* * Wakeup parent process. (Parent sleeps to keep from blasting away * and creating too many daemons.) */ wakeup(mycp); for (;;) { /* * curcp is the current daemon process context. * userp is the current user process context. */ curcp = mycp; /* * Take daemon off of free queue */ if (aiop->aiothreadflags & AIOP_FREE) { s = splnet(); TAILQ_REMOVE(&aio_freeproc, aiop, list); TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list); aiop->aiothreadflags &= ~AIOP_FREE; splx(s); } aiop->aiothreadflags &= ~AIOP_SCHED; /* * Check for jobs. */ while ((aiocbe = aio_selectjob(aiop)) != NULL) { cb = &aiocbe->uaiocb; userp = aiocbe->userproc; aiocbe->jobstate = JOBST_JOBRUNNING; /* * Connect to process address space for user program. */ if (userp != curcp) { /* * Save the current address space that we are * connected to. */ tmpvm = mycp->p_vmspace; /* * Point to the new user address space, and * refer to it. */ mycp->p_vmspace = userp->p_vmspace; atomic_add_int(&mycp->p_vmspace->vm_refcnt, 1); /* Activate the new mapping. */ pmap_activate(FIRST_THREAD_IN_PROC(mycp)); /* * If the old address space wasn't the daemons * own address space, then we need to remove the * daemon's reference from the other process * that it was acting on behalf of. */ if (tmpvm != myvm) { vmspace_free(tmpvm); } curcp = userp; } ki = userp->p_aioinfo; lj = aiocbe->lio; /* Account for currently active jobs. */ ki->kaio_active_count++; /* Do the I/O function. */ aio_process(aiocbe); /* Decrement the active job count. */ ki->kaio_active_count--; /* * Increment the completion count for wakeup/signal * comparisons. */ aiocbe->jobflags |= AIOCBLIST_DONE; ki->kaio_queue_finished_count++; if (lj) lj->lioj_queue_finished_count++; if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags & KAIO_RUNDOWN) && (ki->kaio_active_count == 0))) { ki->kaio_flags &= ~KAIO_WAKEUP; wakeup(userp); } s = splbio(); if (lj && (lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL) { if ((lj->lioj_queue_finished_count == lj->lioj_queue_count) && (lj->lioj_buffer_finished_count == lj->lioj_buffer_count)) { PROC_LOCK(userp); psignal(userp, lj->lioj_signal.sigev_signo); PROC_UNLOCK(userp); lj->lioj_flags |= LIOJ_SIGNAL_POSTED; } } splx(s); aiocbe->jobstate = JOBST_JOBFINISHED; s = splnet(); TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist); TAILQ_INSERT_TAIL(&ki->kaio_jobdone, aiocbe, plist); splx(s); KNOTE_UNLOCKED(&aiocbe->klist, 0); if (aiocbe->jobflags & AIOCBLIST_RUNDOWN) { wakeup(aiocbe); aiocbe->jobflags &= ~AIOCBLIST_RUNDOWN; } if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) { PROC_LOCK(userp); psignal(userp, cb->aio_sigevent.sigev_signo); PROC_UNLOCK(userp); } } /* * Disconnect from user address space. */ if (curcp != mycp) { /* Get the user address space to disconnect from. */ tmpvm = mycp->p_vmspace; /* Get original address space for daemon. */ mycp->p_vmspace = myvm; /* Activate the daemon's address space. */ pmap_activate(FIRST_THREAD_IN_PROC(mycp)); #ifdef DIAGNOSTIC if (tmpvm == myvm) { printf("AIOD: vmspace problem -- %d\n", mycp->p_pid); } #endif /* Remove our vmspace reference. */ vmspace_free(tmpvm); curcp = mycp; } /* * If we are the first to be put onto the free queue, wakeup * anyone waiting for a daemon. */ s = splnet(); TAILQ_REMOVE(&aio_activeproc, aiop, list); if (TAILQ_EMPTY(&aio_freeproc)) wakeup(&aio_freeproc); TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list); aiop->aiothreadflags |= AIOP_FREE; splx(s); /* * If daemon is inactive for a long time, allow it to exit, * thereby freeing resources. */ if ((aiop->aiothreadflags & AIOP_SCHED) == 0 && tsleep(aiop->aiothread, PRIBIO, "aiordy", aiod_lifetime)) { s = splnet(); if (TAILQ_EMPTY(&aio_jobs)) { if ((aiop->aiothreadflags & AIOP_FREE) && (num_aio_procs > target_aio_procs)) { TAILQ_REMOVE(&aio_freeproc, aiop, list); splx(s); uma_zfree(aiop_zone, aiop); num_aio_procs--; #ifdef DIAGNOSTIC if (mycp->p_vmspace->vm_refcnt <= 1) { printf("AIOD: bad vm refcnt for" " exiting daemon: %d\n", mycp->p_vmspace->vm_refcnt); } #endif kthread_exit(0); } } splx(s); } } } /* * Create a new AIO daemon. This is mostly a kernel-thread fork routine. The * AIO daemon modifies its environment itself. */ static int aio_newproc(void) { int error; struct proc *p; error = kthread_create(aio_daemon, curproc, &p, RFNOWAIT, 0, "aiod%d", num_aio_procs); if (error) return (error); /* * Wait until daemon is started, but continue on just in case to * handle error conditions. */ error = tsleep(p, PZERO, "aiosta", aiod_timeout); num_aio_procs++; return (error); } /* * Try the high-performance, low-overhead physio method for eligible * VCHR devices. This method doesn't use an aio helper thread, and * thus has very low overhead. * * Assumes that the caller, _aio_aqueue(), has incremented the file * structure's reference count, preventing its deallocation for the * duration of this call. */ static int aio_qphysio(struct proc *p, struct aiocblist *aiocbe) { int error; struct aiocb *cb; struct file *fp; struct buf *bp; struct vnode *vp; struct kaioinfo *ki; struct aio_liojob *lj; int s; int notify; cb = &aiocbe->uaiocb; fp = aiocbe->fd_file; if (fp->f_type != DTYPE_VNODE) return (-1); vp = fp->f_vnode; /* * If its not a disk, we don't want to return a positive error. * It causes the aio code to not fall through to try the thread * way when you're talking to a regular file. */ if (!vn_isdisk(vp, &error)) { if (error == ENOTBLK) return (-1); else return (error); } if (cb->aio_nbytes % vp->v_bufobj.bo_bsize) return (-1); if (cb->aio_nbytes > MAXPHYS - (((vm_offset_t) cb->aio_buf) & PAGE_MASK)) return (-1); ki = p->p_aioinfo; if (ki->kaio_buffer_count >= ki->kaio_ballowed_count) return (-1); ki->kaio_buffer_count++; lj = aiocbe->lio; if (lj) lj->lioj_buffer_count++; /* Create and build a buffer header for a transfer. */ bp = (struct buf *)getpbuf(NULL); BUF_KERNPROC(bp); /* * Get a copy of the kva from the physical buffer. */ bp->b_dev = vp->v_rdev; error = 0; bp->b_bcount = cb->aio_nbytes; bp->b_bufsize = cb->aio_nbytes; bp->b_iodone = aio_physwakeup; bp->b_saveaddr = bp->b_data; bp->b_data = (void *)(uintptr_t)cb->aio_buf; bp->b_offset = cb->aio_offset; bp->b_iooffset = cb->aio_offset; bp->b_blkno = btodb(cb->aio_offset); bp->b_iocmd = cb->aio_lio_opcode == LIO_WRITE ? BIO_WRITE : BIO_READ; /* * Bring buffer into kernel space. */ if (vmapbuf(bp) < 0) { error = EFAULT; goto doerror; } s = splbio(); aiocbe->bp = bp; bp->b_caller1 = (void *)aiocbe; TAILQ_INSERT_TAIL(&aio_bufjobs, aiocbe, list); TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist); aiocbe->jobstate = JOBST_JOBQBUF; cb->_aiocb_private.status = cb->aio_nbytes; num_buf_aio++; bp->b_error = 0; splx(s); /* Perform transfer. */ - dev_strategy(bp); + dev_strategy(vp->v_rdev, bp); notify = 0; s = splbio(); /* * If we had an error invoking the request, or an error in processing * the request before we have returned, we process it as an error in * transfer. Note that such an I/O error is not indicated immediately, * but is returned using the aio_error mechanism. In this case, * aio_suspend will return immediately. */ if (bp->b_error || (bp->b_ioflags & BIO_ERROR)) { struct aiocb *job = aiocbe->uuaiocb; aiocbe->uaiocb._aiocb_private.status = 0; suword(&job->_aiocb_private.status, 0); aiocbe->uaiocb._aiocb_private.error = bp->b_error; suword(&job->_aiocb_private.error, bp->b_error); ki->kaio_buffer_finished_count++; if (aiocbe->jobstate != JOBST_JOBBFINISHED) { aiocbe->jobstate = JOBST_JOBBFINISHED; aiocbe->jobflags |= AIOCBLIST_DONE; TAILQ_REMOVE(&aio_bufjobs, aiocbe, list); TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist); TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist); notify = 1; } } splx(s); if (notify) KNOTE_UNLOCKED(&aiocbe->klist, 0); return (0); doerror: ki->kaio_buffer_count--; if (lj) lj->lioj_buffer_count--; aiocbe->bp = NULL; relpbuf(bp, NULL); return (error); } /* * This waits/tests physio completion. */ static int aio_fphysio(struct aiocblist *iocb) { int s; struct buf *bp; int error; bp = iocb->bp; s = splbio(); while ((bp->b_flags & B_DONE) == 0) { if (tsleep(bp, PRIBIO, "physstr", aiod_timeout)) { if ((bp->b_flags & B_DONE) == 0) { splx(s); return (EINPROGRESS); } else break; } } splx(s); /* Release mapping into kernel space. */ vunmapbuf(bp); iocb->bp = 0; error = 0; /* Check for an error. */ if (bp->b_ioflags & BIO_ERROR) error = bp->b_error; relpbuf(bp, NULL); return (error); } /* * Wake up aio requests that may be serviceable now. */ static void aio_swake_cb(struct socket *so, struct sockbuf *sb) { struct aiocblist *cb,*cbn; struct proc *p; struct kaioinfo *ki = NULL; int opcode, wakecount = 0; struct aiothreadlist *aiop; if (sb == &so->so_snd) { opcode = LIO_WRITE; SOCKBUF_LOCK(&so->so_snd); so->so_snd.sb_flags &= ~SB_AIO; SOCKBUF_UNLOCK(&so->so_snd); } else { opcode = LIO_READ; SOCKBUF_LOCK(&so->so_rcv); so->so_rcv.sb_flags &= ~SB_AIO; SOCKBUF_UNLOCK(&so->so_rcv); } for (cb = TAILQ_FIRST(&so->so_aiojobq); cb; cb = cbn) { cbn = TAILQ_NEXT(cb, list); if (opcode == cb->uaiocb.aio_lio_opcode) { p = cb->userproc; ki = p->p_aioinfo; TAILQ_REMOVE(&so->so_aiojobq, cb, list); TAILQ_REMOVE(&ki->kaio_sockqueue, cb, plist); TAILQ_INSERT_TAIL(&aio_jobs, cb, list); TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, cb, plist); wakecount++; if (cb->jobstate != JOBST_JOBQGLOBAL) panic("invalid queue value"); } } while (wakecount--) { if ((aiop = TAILQ_FIRST(&aio_freeproc)) != 0) { TAILQ_REMOVE(&aio_freeproc, aiop, list); TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list); aiop->aiothreadflags &= ~AIOP_FREE; wakeup(aiop->aiothread); } } } /* * Queue a new AIO request. Choosing either the threaded or direct physio VCHR * technique is done in this code. */ static int _aio_aqueue(struct thread *td, struct aiocb *job, struct aio_liojob *lj, int type) { struct proc *p = td->td_proc; struct filedesc *fdp; struct file *fp; unsigned int fd; struct socket *so; int s; int error; int opcode, user_opcode; struct aiocblist *aiocbe; struct aiothreadlist *aiop; struct kaioinfo *ki; struct kevent kev; struct kqueue *kq; struct file *kq_fp; struct sockbuf *sb; aiocbe = uma_zalloc(aiocb_zone, M_WAITOK); aiocbe->inputcharge = 0; aiocbe->outputcharge = 0; callout_handle_init(&aiocbe->timeouthandle); /* XXX - need a lock */ knlist_init(&aiocbe->klist, NULL); suword(&job->_aiocb_private.status, -1); suword(&job->_aiocb_private.error, 0); suword(&job->_aiocb_private.kernelinfo, -1); error = copyin(job, &aiocbe->uaiocb, sizeof(aiocbe->uaiocb)); if (error) { suword(&job->_aiocb_private.error, error); uma_zfree(aiocb_zone, aiocbe); return (error); } if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL && !_SIG_VALID(aiocbe->uaiocb.aio_sigevent.sigev_signo)) { uma_zfree(aiocb_zone, aiocbe); return (EINVAL); } /* Save userspace address of the job info. */ aiocbe->uuaiocb = job; /* Get the opcode. */ user_opcode = aiocbe->uaiocb.aio_lio_opcode; if (type != LIO_NOP) aiocbe->uaiocb.aio_lio_opcode = type; opcode = aiocbe->uaiocb.aio_lio_opcode; /* Get the fd info for process. */ fdp = p->p_fd; /* * Range check file descriptor. */ FILEDESC_LOCK(fdp); fd = aiocbe->uaiocb.aio_fildes; if (fd >= fdp->fd_nfiles) { FILEDESC_UNLOCK(fdp); uma_zfree(aiocb_zone, aiocbe); if (type == 0) suword(&job->_aiocb_private.error, EBADF); return (EBADF); } fp = aiocbe->fd_file = fdp->fd_ofiles[fd]; if ((fp == NULL) || ((opcode == LIO_WRITE) && ((fp->f_flag & FWRITE) == 0)) || ((opcode == LIO_READ) && ((fp->f_flag & FREAD) == 0))) { FILEDESC_UNLOCK(fdp); uma_zfree(aiocb_zone, aiocbe); if (type == 0) suword(&job->_aiocb_private.error, EBADF); return (EBADF); } fhold(fp); FILEDESC_UNLOCK(fdp); if (aiocbe->uaiocb.aio_offset == -1LL) { error = EINVAL; goto aqueue_fail; } error = suword(&job->_aiocb_private.kernelinfo, jobrefid); if (error) { error = EINVAL; goto aqueue_fail; } aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jobrefid; if (jobrefid == LONG_MAX) jobrefid = 1; else jobrefid++; if (opcode == LIO_NOP) { fdrop(fp, td); uma_zfree(aiocb_zone, aiocbe); if (type == 0) { suword(&job->_aiocb_private.error, 0); suword(&job->_aiocb_private.status, 0); suword(&job->_aiocb_private.kernelinfo, 0); } return (0); } if ((opcode != LIO_READ) && (opcode != LIO_WRITE)) { if (type == 0) suword(&job->_aiocb_private.status, 0); error = EINVAL; goto aqueue_fail; } if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_KEVENT) { kev.ident = aiocbe->uaiocb.aio_sigevent.sigev_notify_kqueue; kev.udata = aiocbe->uaiocb.aio_sigevent.sigev_value.sigval_ptr; } else { /* * This method for requesting kevent-based notification won't * work on the alpha, since we're passing in a pointer * via aio_lio_opcode, which is an int. Use the SIGEV_KEVENT- * based method instead. */ if (user_opcode == LIO_NOP || user_opcode == LIO_READ || user_opcode == LIO_WRITE) goto no_kqueue; error = copyin((struct kevent *)(uintptr_t)user_opcode, &kev, sizeof(kev)); if (error) goto aqueue_fail; } if ((u_int)kev.ident >= fdp->fd_nfiles || (kq_fp = fdp->fd_ofiles[kev.ident]) == NULL || (kq_fp->f_type != DTYPE_KQUEUE)) { error = EBADF; goto aqueue_fail; } kq = kq_fp->f_data; kev.ident = (uintptr_t)aiocbe->uuaiocb; kev.filter = EVFILT_AIO; kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1; kev.data = (intptr_t)aiocbe; error = kqueue_register(kq, &kev, td, 1); aqueue_fail: if (error) { fdrop(fp, td); uma_zfree(aiocb_zone, aiocbe); if (type == 0) suword(&job->_aiocb_private.error, error); goto done; } no_kqueue: suword(&job->_aiocb_private.error, EINPROGRESS); aiocbe->uaiocb._aiocb_private.error = EINPROGRESS; aiocbe->userproc = p; aiocbe->cred = crhold(td->td_ucred); aiocbe->jobflags = 0; aiocbe->lio = lj; ki = p->p_aioinfo; if (fp->f_type == DTYPE_SOCKET) { /* * Alternate queueing for socket ops: Reach down into the * descriptor to get the socket data. Then check to see if the * socket is ready to be read or written (based on the requested * operation). * * If it is not ready for io, then queue the aiocbe on the * socket, and set the flags so we get a call when sbnotify() * happens. * * Note if opcode is neither LIO_WRITE nor LIO_READ we lock * and unlock the snd sockbuf for no reason. */ so = fp->f_data; sb = (opcode == LIO_READ) ? &so->so_rcv : &so->so_snd; SOCKBUF_LOCK(sb); s = splnet(); if (((opcode == LIO_READ) && (!soreadable(so))) || ((opcode == LIO_WRITE) && (!sowriteable(so)))) { TAILQ_INSERT_TAIL(&so->so_aiojobq, aiocbe, list); TAILQ_INSERT_TAIL(&ki->kaio_sockqueue, aiocbe, plist); sb->sb_flags |= SB_AIO; aiocbe->jobstate = JOBST_JOBQGLOBAL; /* XXX */ ki->kaio_queue_count++; num_queue_count++; SOCKBUF_UNLOCK(sb); splx(s); error = 0; goto done; } SOCKBUF_UNLOCK(sb); splx(s); } if ((error = aio_qphysio(p, aiocbe)) == 0) goto done; if (error > 0) { suword(&job->_aiocb_private.status, 0); aiocbe->uaiocb._aiocb_private.error = error; suword(&job->_aiocb_private.error, error); goto done; } /* No buffer for daemon I/O. */ aiocbe->bp = NULL; ki->kaio_queue_count++; if (lj) lj->lioj_queue_count++; s = splnet(); TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist); TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list); splx(s); aiocbe->jobstate = JOBST_JOBQGLOBAL; num_queue_count++; error = 0; /* * If we don't have a free AIO process, and we are below our quota, then * start one. Otherwise, depend on the subsequent I/O completions to * pick-up this job. If we don't sucessfully create the new process * (thread) due to resource issues, we return an error for now (EAGAIN), * which is likely not the correct thing to do. */ s = splnet(); retryproc: if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) { TAILQ_REMOVE(&aio_freeproc, aiop, list); TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list); aiop->aiothreadflags &= ~AIOP_FREE; wakeup(aiop->aiothread); } else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) && ((ki->kaio_active_count + num_aio_resv_start) < ki->kaio_maxactive_count)) { num_aio_resv_start++; if ((error = aio_newproc()) == 0) { num_aio_resv_start--; goto retryproc; } num_aio_resv_start--; } splx(s); done: return (error); } /* * This routine queues an AIO request, checking for quotas. */ static int aio_aqueue(struct thread *td, struct aiocb *job, int type) { struct proc *p = td->td_proc; struct kaioinfo *ki; if (p->p_aioinfo == NULL) aio_init_aioinfo(p); if (num_queue_count >= max_queue_count) return (EAGAIN); ki = p->p_aioinfo; if (ki->kaio_queue_count >= ki->kaio_qallowed_count) return (EAGAIN); return _aio_aqueue(td, job, NULL, type); } /* * Support the aio_return system call, as a side-effect, kernel resources are * released. */ int aio_return(struct thread *td, struct aio_return_args *uap) { struct proc *p = td->td_proc; int s; long jobref; struct aiocblist *cb, *ncb; struct aiocb *ujob; struct kaioinfo *ki; ujob = uap->aiocbp; jobref = fuword(&ujob->_aiocb_private.kernelinfo); if (jobref == -1 || jobref == 0) return (EINVAL); ki = p->p_aioinfo; if (ki == NULL) return (EINVAL); TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) { if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == jobref) { if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) { p->p_stats->p_ru.ru_oublock += cb->outputcharge; cb->outputcharge = 0; } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) { p->p_stats->p_ru.ru_inblock += cb->inputcharge; cb->inputcharge = 0; } goto done; } } s = splbio(); for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = ncb) { ncb = TAILQ_NEXT(cb, plist); if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == jobref) { break; } } splx(s); done: if (cb != NULL) { if (ujob == cb->uuaiocb) { td->td_retval[0] = cb->uaiocb._aiocb_private.status; } else td->td_retval[0] = EFAULT; aio_free_entry(cb); return (0); } return (EINVAL); } /* * Allow a process to wakeup when any of the I/O requests are completed. */ int aio_suspend(struct thread *td, struct aio_suspend_args *uap) { struct proc *p = td->td_proc; struct timeval atv; struct timespec ts; struct aiocb *const *cbptr, *cbp; struct kaioinfo *ki; struct aiocblist *cb; int i; int njoblist; int error, s, timo; long *ijoblist; struct aiocb **ujoblist; if (uap->nent < 0 || uap->nent > AIO_LISTIO_MAX) return (EINVAL); timo = 0; if (uap->timeout) { /* Get timespec struct. */ if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0) return (error); if (ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000) return (EINVAL); TIMESPEC_TO_TIMEVAL(&atv, &ts); if (itimerfix(&atv)) return (EINVAL); timo = tvtohz(&atv); } ki = p->p_aioinfo; if (ki == NULL) return (EAGAIN); njoblist = 0; ijoblist = uma_zalloc(aiol_zone, M_WAITOK); ujoblist = uma_zalloc(aiol_zone, M_WAITOK); cbptr = uap->aiocbp; for (i = 0; i < uap->nent; i++) { cbp = (struct aiocb *)(intptr_t)fuword(&cbptr[i]); if (cbp == 0) continue; ujoblist[njoblist] = cbp; ijoblist[njoblist] = fuword(&cbp->_aiocb_private.kernelinfo); njoblist++; } if (njoblist == 0) { uma_zfree(aiol_zone, ijoblist); uma_zfree(aiol_zone, ujoblist); return (0); } error = 0; for (;;) { TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) { for (i = 0; i < njoblist; i++) { if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == ijoblist[i]) { if (ujoblist[i] != cb->uuaiocb) error = EINVAL; uma_zfree(aiol_zone, ijoblist); uma_zfree(aiol_zone, ujoblist); return (error); } } } s = splbio(); for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = TAILQ_NEXT(cb, plist)) { for (i = 0; i < njoblist; i++) { if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == ijoblist[i]) { splx(s); if (ujoblist[i] != cb->uuaiocb) error = EINVAL; uma_zfree(aiol_zone, ijoblist); uma_zfree(aiol_zone, ujoblist); return (error); } } } ki->kaio_flags |= KAIO_WAKEUP; error = tsleep(p, PRIBIO | PCATCH, "aiospn", timo); splx(s); if (error == ERESTART || error == EINTR) { uma_zfree(aiol_zone, ijoblist); uma_zfree(aiol_zone, ujoblist); return (EINTR); } else if (error == EWOULDBLOCK) { uma_zfree(aiol_zone, ijoblist); uma_zfree(aiol_zone, ujoblist); return (EAGAIN); } } /* NOTREACHED */ return (EINVAL); } /* * aio_cancel cancels any non-physio aio operations not currently in * progress. */ int aio_cancel(struct thread *td, struct aio_cancel_args *uap) { struct proc *p = td->td_proc; struct kaioinfo *ki; struct aiocblist *cbe, *cbn; struct file *fp; struct filedesc *fdp; struct socket *so; struct proc *po; int s,error; int cancelled=0; int notcancelled=0; struct vnode *vp; fdp = p->p_fd; if ((u_int)uap->fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[uap->fd]) == NULL) return (EBADF); if (fp->f_type == DTYPE_VNODE) { vp = fp->f_vnode; if (vn_isdisk(vp,&error)) { td->td_retval[0] = AIO_NOTCANCELED; return (0); } } else if (fp->f_type == DTYPE_SOCKET) { so = fp->f_data; s = splnet(); for (cbe = TAILQ_FIRST(&so->so_aiojobq); cbe; cbe = cbn) { cbn = TAILQ_NEXT(cbe, list); if ((uap->aiocbp == NULL) || (uap->aiocbp == cbe->uuaiocb) ) { po = cbe->userproc; ki = po->p_aioinfo; TAILQ_REMOVE(&so->so_aiojobq, cbe, list); TAILQ_REMOVE(&ki->kaio_sockqueue, cbe, plist); TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe, plist); if (ki->kaio_flags & KAIO_WAKEUP) { wakeup(po); } cbe->jobstate = JOBST_JOBFINISHED; cbe->uaiocb._aiocb_private.status=-1; cbe->uaiocb._aiocb_private.error=ECANCELED; cancelled++; /* XXX cancelled, knote? */ if (cbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL) { PROC_LOCK(cbe->userproc); psignal(cbe->userproc, cbe->uaiocb.aio_sigevent.sigev_signo); PROC_UNLOCK(cbe->userproc); } if (uap->aiocbp) break; } } splx(s); if ((cancelled) && (uap->aiocbp)) { td->td_retval[0] = AIO_CANCELED; return (0); } } ki=p->p_aioinfo; if (ki == NULL) goto done; s = splnet(); for (cbe = TAILQ_FIRST(&ki->kaio_jobqueue); cbe; cbe = cbn) { cbn = TAILQ_NEXT(cbe, plist); if ((uap->fd == cbe->uaiocb.aio_fildes) && ((uap->aiocbp == NULL ) || (uap->aiocbp == cbe->uuaiocb))) { if (cbe->jobstate == JOBST_JOBQGLOBAL) { TAILQ_REMOVE(&aio_jobs, cbe, list); TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist); TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe, plist); cancelled++; ki->kaio_queue_finished_count++; cbe->jobstate = JOBST_JOBFINISHED; cbe->uaiocb._aiocb_private.status = -1; cbe->uaiocb._aiocb_private.error = ECANCELED; /* XXX cancelled, knote? */ if (cbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL) { PROC_LOCK(cbe->userproc); psignal(cbe->userproc, cbe->uaiocb.aio_sigevent.sigev_signo); PROC_UNLOCK(cbe->userproc); } } else { notcancelled++; } } } splx(s); done: if (notcancelled) { td->td_retval[0] = AIO_NOTCANCELED; return (0); } if (cancelled) { td->td_retval[0] = AIO_CANCELED; return (0); } td->td_retval[0] = AIO_ALLDONE; return (0); } /* * aio_error is implemented in the kernel level for compatibility purposes only. * For a user mode async implementation, it would be best to do it in a userland * subroutine. */ int aio_error(struct thread *td, struct aio_error_args *uap) { struct proc *p = td->td_proc; int s; struct aiocblist *cb; struct kaioinfo *ki; long jobref; ki = p->p_aioinfo; if (ki == NULL) return (EINVAL); jobref = fuword(&uap->aiocbp->_aiocb_private.kernelinfo); if ((jobref == -1) || (jobref == 0)) return (EINVAL); TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) { if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) == jobref) { td->td_retval[0] = cb->uaiocb._aiocb_private.error; return (0); } } s = splnet(); for (cb = TAILQ_FIRST(&ki->kaio_jobqueue); cb; cb = TAILQ_NEXT(cb, plist)) { if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) == jobref) { td->td_retval[0] = EINPROGRESS; splx(s); return (0); } } for (cb = TAILQ_FIRST(&ki->kaio_sockqueue); cb; cb = TAILQ_NEXT(cb, plist)) { if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) == jobref) { td->td_retval[0] = EINPROGRESS; splx(s); return (0); } } splx(s); s = splbio(); for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = TAILQ_NEXT(cb, plist)) { if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) == jobref) { td->td_retval[0] = cb->uaiocb._aiocb_private.error; splx(s); return (0); } } for (cb = TAILQ_FIRST(&ki->kaio_bufqueue); cb; cb = TAILQ_NEXT(cb, plist)) { if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) == jobref) { td->td_retval[0] = EINPROGRESS; splx(s); return (0); } } splx(s); #if (0) /* * Hack for lio. */ status = fuword(&uap->aiocbp->_aiocb_private.status); if (status == -1) return fuword(&uap->aiocbp->_aiocb_private.error); #endif return (EINVAL); } /* syscall - asynchronous read from a file (REALTIME) */ int aio_read(struct thread *td, struct aio_read_args *uap) { return aio_aqueue(td, uap->aiocbp, LIO_READ); } /* syscall - asynchronous write to a file (REALTIME) */ int aio_write(struct thread *td, struct aio_write_args *uap) { return aio_aqueue(td, uap->aiocbp, LIO_WRITE); } /* syscall - list directed I/O (REALTIME) */ int lio_listio(struct thread *td, struct lio_listio_args *uap) { struct proc *p = td->td_proc; int nent, nentqueued; struct aiocb *iocb, * const *cbptr; struct aiocblist *cb; struct kaioinfo *ki; struct aio_liojob *lj; int error, runningcode; int nerror; int i; int s; if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) return (EINVAL); nent = uap->nent; if (nent < 0 || nent > AIO_LISTIO_MAX) return (EINVAL); if (p->p_aioinfo == NULL) aio_init_aioinfo(p); if ((nent + num_queue_count) > max_queue_count) return (EAGAIN); ki = p->p_aioinfo; if ((nent + ki->kaio_queue_count) > ki->kaio_qallowed_count) return (EAGAIN); lj = uma_zalloc(aiolio_zone, M_WAITOK); if (!lj) return (EAGAIN); lj->lioj_flags = 0; lj->lioj_buffer_count = 0; lj->lioj_buffer_finished_count = 0; lj->lioj_queue_count = 0; lj->lioj_queue_finished_count = 0; lj->lioj_ki = ki; /* * Setup signal. */ if (uap->sig && (uap->mode == LIO_NOWAIT)) { error = copyin(uap->sig, &lj->lioj_signal, sizeof(lj->lioj_signal)); if (error) { uma_zfree(aiolio_zone, lj); return (error); } if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) { uma_zfree(aiolio_zone, lj); return (EINVAL); } lj->lioj_flags |= LIOJ_SIGNAL; } TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list); /* * Get pointers to the list of I/O requests. */ nerror = 0; nentqueued = 0; cbptr = uap->acb_list; for (i = 0; i < uap->nent; i++) { iocb = (struct aiocb *)(intptr_t)fuword(&cbptr[i]); if (((intptr_t)iocb != -1) && ((intptr_t)iocb != 0)) { error = _aio_aqueue(td, iocb, lj, 0); if (error == 0) nentqueued++; else nerror++; } } /* * If we haven't queued any, then just return error. */ if (nentqueued == 0) return (0); /* * Calculate the appropriate error return. */ runningcode = 0; if (nerror) runningcode = EIO; if (uap->mode == LIO_WAIT) { int command, found, jobref; for (;;) { found = 0; for (i = 0; i < uap->nent; i++) { /* * Fetch address of the control buf pointer in * user space. */ iocb = (struct aiocb *) (intptr_t)fuword(&cbptr[i]); if (((intptr_t)iocb == -1) || ((intptr_t)iocb == 0)) continue; /* * Fetch the associated command from user space. */ command = fuword(&iocb->aio_lio_opcode); if (command == LIO_NOP) { found++; continue; } jobref = fuword(&iocb->_aiocb_private.kernelinfo); TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) { if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) == jobref) { if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) { p->p_stats->p_ru.ru_oublock += cb->outputcharge; cb->outputcharge = 0; } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) { p->p_stats->p_ru.ru_inblock += cb->inputcharge; cb->inputcharge = 0; } found++; break; } } s = splbio(); TAILQ_FOREACH(cb, &ki->kaio_bufdone, plist) { if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) == jobref) { found++; break; } } splx(s); } /* * If all I/Os have been disposed of, then we can * return. */ if (found == nentqueued) return (runningcode); ki->kaio_flags |= KAIO_WAKEUP; error = tsleep(p, PRIBIO | PCATCH, "aiospn", 0); if (error == EINTR) return (EINTR); else if (error == EWOULDBLOCK) return (EAGAIN); } } return (runningcode); } /* * This is a weird hack so that we can post a signal. It is safe to do so from * a timeout routine, but *not* from an interrupt routine. */ static void process_signal(void *aioj) { struct aiocblist *aiocbe = aioj; struct aio_liojob *lj = aiocbe->lio; struct aiocb *cb = &aiocbe->uaiocb; if ((lj) && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL) && (lj->lioj_queue_count == lj->lioj_queue_finished_count)) { PROC_LOCK(lj->lioj_ki->kaio_p); psignal(lj->lioj_ki->kaio_p, lj->lioj_signal.sigev_signo); PROC_UNLOCK(lj->lioj_ki->kaio_p); lj->lioj_flags |= LIOJ_SIGNAL_POSTED; } if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) { PROC_LOCK(aiocbe->userproc); psignal(aiocbe->userproc, cb->aio_sigevent.sigev_signo); PROC_UNLOCK(aiocbe->userproc); } } /* * Interrupt handler for physio, performs the necessary process wakeups, and * signals. */ static void aio_physwakeup(struct buf *bp) { struct aiocblist *aiocbe; struct proc *p; struct kaioinfo *ki; struct aio_liojob *lj; wakeup(bp); aiocbe = (struct aiocblist *)bp->b_caller1; if (aiocbe) { p = aiocbe->userproc; aiocbe->jobstate = JOBST_JOBBFINISHED; aiocbe->uaiocb._aiocb_private.status -= bp->b_resid; aiocbe->uaiocb._aiocb_private.error = 0; aiocbe->jobflags |= AIOCBLIST_DONE; if (bp->b_ioflags & BIO_ERROR) aiocbe->uaiocb._aiocb_private.error = bp->b_error; lj = aiocbe->lio; if (lj) { lj->lioj_buffer_finished_count++; /* * wakeup/signal if all of the interrupt jobs are done. */ if (lj->lioj_buffer_finished_count == lj->lioj_buffer_count) { /* * Post a signal if it is called for. */ if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL) { lj->lioj_flags |= LIOJ_SIGNAL_POSTED; aiocbe->timeouthandle = timeout(process_signal, aiocbe, 0); } } } ki = p->p_aioinfo; if (ki) { ki->kaio_buffer_finished_count++; TAILQ_REMOVE(&aio_bufjobs, aiocbe, list); TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist); TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist); KNOTE_UNLOCKED(&aiocbe->klist, 0); /* Do the wakeup. */ if (ki->kaio_flags & (KAIO_RUNDOWN|KAIO_WAKEUP)) { ki->kaio_flags &= ~KAIO_WAKEUP; wakeup(p); } } if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL) aiocbe->timeouthandle = timeout(process_signal, aiocbe, 0); } } /* syscall - wait for the next completion of an aio request */ int aio_waitcomplete(struct thread *td, struct aio_waitcomplete_args *uap) { struct proc *p = td->td_proc; struct timeval atv; struct timespec ts; struct kaioinfo *ki; struct aiocblist *cb = NULL; int error, s, timo; suword(uap->aiocbp, (int)NULL); timo = 0; if (uap->timeout) { /* Get timespec struct. */ error = copyin(uap->timeout, &ts, sizeof(ts)); if (error) return (error); if ((ts.tv_nsec < 0) || (ts.tv_nsec >= 1000000000)) return (EINVAL); TIMESPEC_TO_TIMEVAL(&atv, &ts); if (itimerfix(&atv)) return (EINVAL); timo = tvtohz(&atv); } ki = p->p_aioinfo; if (ki == NULL) return (EAGAIN); for (;;) { if ((cb = TAILQ_FIRST(&ki->kaio_jobdone)) != 0) { suword(uap->aiocbp, (uintptr_t)cb->uuaiocb); td->td_retval[0] = cb->uaiocb._aiocb_private.status; if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) { p->p_stats->p_ru.ru_oublock += cb->outputcharge; cb->outputcharge = 0; } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) { p->p_stats->p_ru.ru_inblock += cb->inputcharge; cb->inputcharge = 0; } aio_free_entry(cb); return (cb->uaiocb._aiocb_private.error); } s = splbio(); if ((cb = TAILQ_FIRST(&ki->kaio_bufdone)) != 0 ) { splx(s); suword(uap->aiocbp, (uintptr_t)cb->uuaiocb); td->td_retval[0] = cb->uaiocb._aiocb_private.status; aio_free_entry(cb); return (cb->uaiocb._aiocb_private.error); } ki->kaio_flags |= KAIO_WAKEUP; error = tsleep(p, PRIBIO | PCATCH, "aiowc", timo); splx(s); if (error == ERESTART) return (EINTR); else if (error < 0) return (error); else if (error == EINTR) return (EINTR); else if (error == EWOULDBLOCK) return (EAGAIN); } } /* kqueue attach function */ static int filt_aioattach(struct knote *kn) { struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata; /* * The aiocbe pointer must be validated before using it, so * registration is restricted to the kernel; the user cannot * set EV_FLAG1. */ if ((kn->kn_flags & EV_FLAG1) == 0) return (EPERM); kn->kn_flags &= ~EV_FLAG1; knlist_add(&aiocbe->klist, kn, 0); return (0); } /* kqueue detach function */ static void filt_aiodetach(struct knote *kn) { struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata; knlist_remove(&aiocbe->klist, kn, 0); } /* kqueue filter function */ /*ARGSUSED*/ static int filt_aio(struct knote *kn, long hint) { struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata; kn->kn_data = aiocbe->uaiocb._aiocb_private.error; if (aiocbe->jobstate != JOBST_JOBFINISHED && aiocbe->jobstate != JOBST_JOBBFINISHED) return (0); kn->kn_flags |= EV_EOF; return (1); } Index: head/sys/kern/vfs_bio.c =================================================================== --- head/sys/kern/vfs_bio.c (revision 137028) +++ head/sys/kern/vfs_bio.c (revision 137029) @@ -1,3865 +1,3901 @@ -/* +/*- + * Copyright (c) 2004 Poul-Henning Kamp * Copyright (c) 1994,1997 John S. Dyson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright - * notice immediately at the beginning of the file, without modification, - * this list of conditions, and the following disclaimer. - * 2. Absolutely no warranty of function or purpose is made by the author - * John S. Dyson. + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. */ /* * this file contains a new buffer I/O scheme implementing a coherent * VM object and buffer cache scheme. Pains have been taken to make * sure that the performance degradation associated with schemes such * as this is not realized. * * Author: John S. Dyson * Significant help during the development and debugging phases * had been provided by David Greenman, also of the FreeBSD core team. * * see man buf(9) for more info. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include "opt_directio.h" #include "opt_swap.h" static MALLOC_DEFINE(M_BIOBUF, "BIO buffer", "BIO buffer"); struct bio_ops bioops; /* I/O operation notification */ struct buf_ops buf_ops_bio = { .bop_name = "buf_ops_bio", .bop_write = bufwrite, .bop_strategy = bufstrategy, }; /* * XXX buf is global because kern_shutdown.c and ffs_checkoverlap has * carnal knowledge of buffers. This knowledge should be moved to vfs_bio.c. */ struct buf *buf; /* buffer header pool */ static struct proc *bufdaemonproc; static int inmem(struct vnode * vp, daddr_t blkno); static void vm_hold_free_pages(struct buf *bp, vm_offset_t from, vm_offset_t to); static void vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to); static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m); static void vfs_clean_pages(struct buf *bp); static void vfs_setdirty(struct buf *bp); static void vfs_vmio_release(struct buf *bp); static void vfs_backgroundwritedone(struct buf *bp); static int vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno); static int flushbufqueues(int flushdeps); static void buf_daemon(void); void bremfreel(struct buf *bp); int vmiodirenable = TRUE; SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0, "Use the VM system for directory writes"); int runningbufspace; SYSCTL_INT(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0, "Amount of presently outstanding async buffer io"); static int bufspace; SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0, "KVA memory used for bufs"); static int maxbufspace; SYSCTL_INT(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD, &maxbufspace, 0, "Maximum allowed value of bufspace (including buf_daemon)"); static int bufmallocspace; SYSCTL_INT(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0, "Amount of malloced memory for buffers"); static int maxbufmallocspace; SYSCTL_INT(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, 0, "Maximum amount of malloced memory for buffers"); static int lobufspace; SYSCTL_INT(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD, &lobufspace, 0, "Minimum amount of buffers we want to have"); static int hibufspace; SYSCTL_INT(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD, &hibufspace, 0, "Maximum allowed value of bufspace (excluding buf_daemon)"); static int bufreusecnt; SYSCTL_INT(_vfs, OID_AUTO, bufreusecnt, CTLFLAG_RW, &bufreusecnt, 0, "Number of times we have reused a buffer"); static int buffreekvacnt; SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, 0, "Number of times we have freed the KVA space from some buffer"); static int bufdefragcnt; SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, 0, "Number of times we have had to repeat buffer allocation to defragment"); static int lorunningspace; SYSCTL_INT(_vfs, OID_AUTO, lorunningspace, CTLFLAG_RW, &lorunningspace, 0, "Minimum preferred space used for in-progress I/O"); static int hirunningspace; SYSCTL_INT(_vfs, OID_AUTO, hirunningspace, CTLFLAG_RW, &hirunningspace, 0, "Maximum amount of space to use for in-progress I/O"); static int dirtybufferflushes; SYSCTL_INT(_vfs, OID_AUTO, dirtybufferflushes, CTLFLAG_RW, &dirtybufferflushes, 0, "Number of bdwrite to bawrite conversions to limit dirty buffers"); static int altbufferflushes; SYSCTL_INT(_vfs, OID_AUTO, altbufferflushes, CTLFLAG_RW, &altbufferflushes, 0, "Number of fsync flushes to limit dirty buffers"); static int recursiveflushes; SYSCTL_INT(_vfs, OID_AUTO, recursiveflushes, CTLFLAG_RW, &recursiveflushes, 0, "Number of flushes skipped due to being recursive"); static int numdirtybuffers; SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD, &numdirtybuffers, 0, "Number of buffers that are dirty (has unwritten changes) at the moment"); static int lodirtybuffers; SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW, &lodirtybuffers, 0, "How many buffers we want to have free before bufdaemon can sleep"); static int hidirtybuffers; SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW, &hidirtybuffers, 0, "When the number of dirty buffers is considered severe"); static int dirtybufthresh; SYSCTL_INT(_vfs, OID_AUTO, dirtybufthresh, CTLFLAG_RW, &dirtybufthresh, 0, "Number of bdwrite to bawrite conversions to clear dirty buffers"); static int numfreebuffers; SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0, "Number of free buffers"); static int lofreebuffers; SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, &lofreebuffers, 0, "XXX Unused"); static int hifreebuffers; SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0, "XXX Complicatedly unused"); static int getnewbufcalls; SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0, "Number of calls to getnewbuf"); static int getnewbufrestarts; SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0, "Number of times getnewbuf has had to restart a buffer aquisition"); static int dobkgrdwrite = 1; SYSCTL_INT(_debug, OID_AUTO, dobkgrdwrite, CTLFLAG_RW, &dobkgrdwrite, 0, "Do background writes (honoring the BV_BKGRDWRITE flag)?"); /* * Wakeup point for bufdaemon, as well as indicator of whether it is already * active. Set to 1 when the bufdaemon is already "on" the queue, 0 when it * is idling. */ static int bd_request; /* * This lock synchronizes access to bd_request. */ static struct mtx bdlock; /* * bogus page -- for I/O to/from partially complete buffers * this is a temporary solution to the problem, but it is not * really that bad. it would be better to split the buffer * for input in the case of buffers partially already in memory, * but the code is intricate enough already. */ vm_page_t bogus_page; /* * Synchronization (sleep/wakeup) variable for active buffer space requests. * Set when wait starts, cleared prior to wakeup(). * Used in runningbufwakeup() and waitrunningbufspace(). */ static int runningbufreq; /* * This lock protects the runningbufreq and synchronizes runningbufwakeup and * waitrunningbufspace(). */ static struct mtx rbreqlock; /* * Synchronization (sleep/wakeup) variable for buffer requests. * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done * by and/or. * Used in numdirtywakeup(), bufspacewakeup(), bufcountwakeup(), bwillwrite(), * getnewbuf(), and getblk(). */ static int needsbuffer; /* * Lock that protects needsbuffer and the sleeps/wakeups surrounding it. */ static struct mtx nblock; /* * Lock that protects against bwait()/bdone()/B_DONE races. */ static struct mtx bdonelock; /* * Definitions for the buffer free lists. */ #define BUFFER_QUEUES 5 /* number of free buffer queues */ #define QUEUE_NONE 0 /* on no queue */ #define QUEUE_CLEAN 1 /* non-B_DELWRI buffers */ #define QUEUE_DIRTY 2 /* B_DELWRI buffers */ #define QUEUE_EMPTYKVA 3 /* empty buffer headers w/KVA assignment */ #define QUEUE_EMPTY 4 /* empty buffer headers */ /* Queues for free buffers with various properties */ static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } }; /* Lock for the bufqueues */ static struct mtx bqlock; /* * Single global constant for BUF_WMESG, to avoid getting multiple references. * buf_wmesg is referred from macros. */ const char *buf_wmesg = BUF_WMESG; #define VFS_BIO_NEED_ANY 0x01 /* any freeable buffer */ #define VFS_BIO_NEED_DIRTYFLUSH 0x02 /* waiting for dirty buffer flush */ #define VFS_BIO_NEED_FREE 0x04 /* wait for free bufs, hi hysteresis */ #define VFS_BIO_NEED_BUFSPACE 0x08 /* wait for buf space, lo hysteresis */ #ifdef DIRECTIO extern void ffs_rawread_setup(void); #endif /* DIRECTIO */ /* * numdirtywakeup: * * If someone is blocked due to there being too many dirty buffers, * and numdirtybuffers is now reasonable, wake them up. */ static __inline void numdirtywakeup(int level) { if (numdirtybuffers <= level) { mtx_lock(&nblock); if (needsbuffer & VFS_BIO_NEED_DIRTYFLUSH) { needsbuffer &= ~VFS_BIO_NEED_DIRTYFLUSH; wakeup(&needsbuffer); } mtx_unlock(&nblock); } } /* * bufspacewakeup: * * Called when buffer space is potentially available for recovery. * getnewbuf() will block on this flag when it is unable to free * sufficient buffer space. Buffer space becomes recoverable when * bp's get placed back in the queues. */ static __inline void bufspacewakeup(void) { /* * If someone is waiting for BUF space, wake them up. Even * though we haven't freed the kva space yet, the waiting * process will be able to now. */ mtx_lock(&nblock); if (needsbuffer & VFS_BIO_NEED_BUFSPACE) { needsbuffer &= ~VFS_BIO_NEED_BUFSPACE; wakeup(&needsbuffer); } mtx_unlock(&nblock); } /* * runningbufwakeup() - in-progress I/O accounting. * */ static __inline void runningbufwakeup(struct buf *bp) { if (bp->b_runningbufspace) { atomic_subtract_int(&runningbufspace, bp->b_runningbufspace); bp->b_runningbufspace = 0; mtx_lock(&rbreqlock); if (runningbufreq && runningbufspace <= lorunningspace) { runningbufreq = 0; wakeup(&runningbufreq); } mtx_unlock(&rbreqlock); } } /* * bufcountwakeup: * * Called when a buffer has been added to one of the free queues to * account for the buffer and to wakeup anyone waiting for free buffers. * This typically occurs when large amounts of metadata are being handled * by the buffer cache ( else buffer space runs out first, usually ). */ static __inline void bufcountwakeup(void) { atomic_add_int(&numfreebuffers, 1); mtx_lock(&nblock); if (needsbuffer) { needsbuffer &= ~VFS_BIO_NEED_ANY; if (numfreebuffers >= hifreebuffers) needsbuffer &= ~VFS_BIO_NEED_FREE; wakeup(&needsbuffer); } mtx_unlock(&nblock); } /* * waitrunningbufspace() * * runningbufspace is a measure of the amount of I/O currently * running. This routine is used in async-write situations to * prevent creating huge backups of pending writes to a device. * Only asynchronous writes are governed by this function. * * Reads will adjust runningbufspace, but will not block based on it. * The read load has a side effect of reducing the allowed write load. * * This does NOT turn an async write into a sync write. It waits * for earlier writes to complete and generally returns before the * caller's write has reached the device. */ static __inline void waitrunningbufspace(void) { mtx_lock(&rbreqlock); while (runningbufspace > hirunningspace) { ++runningbufreq; msleep(&runningbufreq, &rbreqlock, PVM, "wdrain", 0); } mtx_unlock(&rbreqlock); } /* * vfs_buf_test_cache: * * Called when a buffer is extended. This function clears the B_CACHE * bit if the newly extended portion of the buffer does not contain * valid data. */ static __inline void vfs_buf_test_cache(struct buf *bp, vm_ooffset_t foff, vm_offset_t off, vm_offset_t size, vm_page_t m) { GIANT_REQUIRED; VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); if (bp->b_flags & B_CACHE) { int base = (foff + off) & PAGE_MASK; if (vm_page_is_valid(m, base, size) == 0) bp->b_flags &= ~B_CACHE; } } /* Wake up the buffer deamon if necessary */ static __inline void bd_wakeup(int dirtybuflevel) { mtx_lock(&bdlock); if (bd_request == 0 && numdirtybuffers >= dirtybuflevel) { bd_request = 1; wakeup(&bd_request); } mtx_unlock(&bdlock); } /* * bd_speedup - speedup the buffer cache flushing code */ static __inline void bd_speedup(void) { bd_wakeup(1); } /* * Calculating buffer cache scaling values and reserve space for buffer * headers. This is called during low level kernel initialization and * may be called more then once. We CANNOT write to the memory area * being reserved at this time. */ caddr_t kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est) { /* * physmem_est is in pages. Convert it to kilobytes (assumes * PAGE_SIZE is >= 1K) */ physmem_est = physmem_est * (PAGE_SIZE / 1024); /* * The nominal buffer size (and minimum KVA allocation) is BKVASIZE. * For the first 64MB of ram nominally allocate sufficient buffers to * cover 1/4 of our ram. Beyond the first 64MB allocate additional * buffers to cover 1/20 of our ram over 64MB. When auto-sizing * the buffer cache we limit the eventual kva reservation to * maxbcache bytes. * * factor represents the 1/4 x ram conversion. */ if (nbuf == 0) { int factor = 4 * BKVASIZE / 1024; nbuf = 50; if (physmem_est > 4096) nbuf += min((physmem_est - 4096) / factor, 65536 / factor); if (physmem_est > 65536) nbuf += (physmem_est - 65536) * 2 / (factor * 5); if (maxbcache && nbuf > maxbcache / BKVASIZE) nbuf = maxbcache / BKVASIZE; } #if 0 /* * Do not allow the buffer_map to be more then 1/2 the size of the * kernel_map. */ if (nbuf > (kernel_map->max_offset - kernel_map->min_offset) / (BKVASIZE * 2)) { nbuf = (kernel_map->max_offset - kernel_map->min_offset) / (BKVASIZE * 2); printf("Warning: nbufs capped at %d\n", nbuf); } #endif /* * swbufs are used as temporary holders for I/O, such as paging I/O. * We have no less then 16 and no more then 256. */ nswbuf = max(min(nbuf/4, 256), 16); #ifdef NSWBUF_MIN if (nswbuf < NSWBUF_MIN) nswbuf = NSWBUF_MIN; #endif #ifdef DIRECTIO ffs_rawread_setup(); #endif /* * Reserve space for the buffer cache buffers */ swbuf = (void *)v; v = (caddr_t)(swbuf + nswbuf); buf = (void *)v; v = (caddr_t)(buf + nbuf); return(v); } /* Initialize the buffer subsystem. Called before use of any buffers. */ void bufinit(void) { struct buf *bp; int i; GIANT_REQUIRED; mtx_init(&bqlock, "buf queue lock", NULL, MTX_DEF); mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF); mtx_init(&nblock, "needsbuffer lock", NULL, MTX_DEF); mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF); mtx_init(&bdonelock, "bdone lock", NULL, MTX_DEF); /* next, make a null set of free lists */ for (i = 0; i < BUFFER_QUEUES; i++) TAILQ_INIT(&bufqueues[i]); /* finally, initialize each buffer header and stick on empty q */ for (i = 0; i < nbuf; i++) { bp = &buf[i]; bzero(bp, sizeof *bp); bp->b_flags = B_INVAL; /* we're just an empty header */ bp->b_dev = NULL; bp->b_rcred = NOCRED; bp->b_wcred = NOCRED; bp->b_qindex = QUEUE_EMPTY; bp->b_vflags = 0; bp->b_xflags = 0; LIST_INIT(&bp->b_dep); BUF_LOCKINIT(bp); TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist); } /* * maxbufspace is the absolute maximum amount of buffer space we are * allowed to reserve in KVM and in real terms. The absolute maximum * is nominally used by buf_daemon. hibufspace is the nominal maximum * used by most other processes. The differential is required to * ensure that buf_daemon is able to run when other processes might * be blocked waiting for buffer space. * * maxbufspace is based on BKVASIZE. Allocating buffers larger then * this may result in KVM fragmentation which is not handled optimally * by the system. */ maxbufspace = nbuf * BKVASIZE; hibufspace = imax(3 * maxbufspace / 4, maxbufspace - MAXBSIZE * 10); lobufspace = hibufspace - MAXBSIZE; lorunningspace = 512 * 1024; hirunningspace = 1024 * 1024; /* * Limit the amount of malloc memory since it is wired permanently into * the kernel space. Even though this is accounted for in the buffer * allocation, we don't want the malloced region to grow uncontrolled. * The malloc scheme improves memory utilization significantly on average * (small) directories. */ maxbufmallocspace = hibufspace / 20; /* * Reduce the chance of a deadlock occuring by limiting the number * of delayed-write dirty buffers we allow to stack up. */ hidirtybuffers = nbuf / 4 + 20; dirtybufthresh = hidirtybuffers * 9 / 10; numdirtybuffers = 0; /* * To support extreme low-memory systems, make sure hidirtybuffers cannot * eat up all available buffer space. This occurs when our minimum cannot * be met. We try to size hidirtybuffers to 3/4 our buffer space assuming * BKVASIZE'd (8K) buffers. */ while (hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) { hidirtybuffers >>= 1; } lodirtybuffers = hidirtybuffers / 2; /* * Try to keep the number of free buffers in the specified range, * and give special processes (e.g. like buf_daemon) access to an * emergency reserve. */ lofreebuffers = nbuf / 18 + 5; hifreebuffers = 2 * lofreebuffers; numfreebuffers = nbuf; /* * Maximum number of async ops initiated per buf_daemon loop. This is * somewhat of a hack at the moment, we really need to limit ourselves * based on the number of bytes of I/O in-transit that were initiated * from buf_daemon. */ bogus_page = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL | VM_ALLOC_WIRED); } /* * bfreekva() - free the kva allocation for a buffer. * * Must be called at splbio() or higher as this is the only locking for * buffer_map. * * Since this call frees up buffer space, we call bufspacewakeup(). */ static void bfreekva(struct buf *bp) { GIANT_REQUIRED; if (bp->b_kvasize) { atomic_add_int(&buffreekvacnt, 1); atomic_subtract_int(&bufspace, bp->b_kvasize); vm_map_delete(buffer_map, (vm_offset_t) bp->b_kvabase, (vm_offset_t) bp->b_kvabase + bp->b_kvasize ); bp->b_kvasize = 0; bufspacewakeup(); } } /* * bremfree: * * Remove the buffer from the appropriate free list. */ void bremfree(struct buf *bp) { mtx_lock(&bqlock); bremfreel(bp); mtx_unlock(&bqlock); } void bremfreel(struct buf *bp) { int s = splbio(); int old_qindex = bp->b_qindex; GIANT_REQUIRED; if (bp->b_qindex != QUEUE_NONE) { KASSERT(BUF_REFCNT(bp) == 1, ("bremfree: bp %p not locked",bp)); TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist); bp->b_qindex = QUEUE_NONE; } else { if (BUF_REFCNT(bp) <= 1) panic("bremfree: removing a buffer not on a queue"); } /* * Fixup numfreebuffers count. If the buffer is invalid or not * delayed-write, and it was on the EMPTY, LRU, or AGE queues, * the buffer was free and we must decrement numfreebuffers. */ if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) { switch(old_qindex) { case QUEUE_DIRTY: case QUEUE_CLEAN: case QUEUE_EMPTY: case QUEUE_EMPTYKVA: atomic_subtract_int(&numfreebuffers, 1); break; default: break; } } splx(s); } /* * Get a buffer with the specified data. Look in the cache first. We * must clear BIO_ERROR and B_INVAL prior to initiating I/O. If B_CACHE * is set, the buffer is valid and we do not have to do anything ( see * getblk() ). This is really just a special case of breadn(). */ int bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred, struct buf **bpp) { return (breadn(vp, blkno, size, 0, 0, 0, cred, bpp)); } /* * Operates like bread, but also starts asynchronous I/O on * read-ahead blocks. We must clear BIO_ERROR and B_INVAL prior * to initiating I/O . If B_CACHE is set, the buffer is valid * and we do not have to do anything. */ int breadn(struct vnode * vp, daddr_t blkno, int size, daddr_t * rablkno, int *rabsize, int cnt, struct ucred * cred, struct buf **bpp) { struct buf *bp, *rabp; int i; int rv = 0, readwait = 0; *bpp = bp = getblk(vp, blkno, size, 0, 0, 0); /* if not found in cache, do some I/O */ if ((bp->b_flags & B_CACHE) == 0) { if (curthread != PCPU_GET(idlethread)) curthread->td_proc->p_stats->p_ru.ru_inblock++; bp->b_iocmd = BIO_READ; bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; if (bp->b_rcred == NOCRED && cred != NOCRED) bp->b_rcred = crhold(cred); vfs_busy_pages(bp, 0); bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); ++readwait; } for (i = 0; i < cnt; i++, rablkno++, rabsize++) { if (inmem(vp, *rablkno)) continue; rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0); if ((rabp->b_flags & B_CACHE) == 0) { if (curthread != PCPU_GET(idlethread)) curthread->td_proc->p_stats->p_ru.ru_inblock++; rabp->b_flags |= B_ASYNC; rabp->b_flags &= ~B_INVAL; rabp->b_ioflags &= ~BIO_ERROR; rabp->b_iocmd = BIO_READ; if (rabp->b_rcred == NOCRED && cred != NOCRED) rabp->b_rcred = crhold(cred); vfs_busy_pages(rabp, 0); BUF_KERNPROC(rabp); rabp->b_iooffset = dbtob(rabp->b_blkno); bstrategy(rabp); } else { brelse(rabp); } } if (readwait) { rv = bufwait(bp); } return (rv); } /* * Write, release buffer on completion. (Done by iodone * if async). Do not bother writing anything if the buffer * is invalid. * * Note that we set B_CACHE here, indicating that buffer is * fully valid and thus cacheable. This is true even of NFS * now so we set it generally. This could be set either here * or in biodone() since the I/O is synchronous. We put it * here. */ int bufwrite(struct buf *bp) { int oldflags, s; struct buf *newbp; if (bp->b_flags & B_INVAL) { brelse(bp); return (0); } oldflags = bp->b_flags; if (BUF_REFCNT(bp) == 0) panic("bufwrite: buffer is not busy???"); s = splbio(); /* * If a background write is already in progress, delay * writing this block if it is asynchronous. Otherwise * wait for the background write to complete. */ BO_LOCK(bp->b_bufobj); if (bp->b_vflags & BV_BKGRDINPROG) { if (bp->b_flags & B_ASYNC) { BO_UNLOCK(bp->b_bufobj); splx(s); bdwrite(bp); return (0); } bp->b_vflags |= BV_BKGRDWAIT; msleep(&bp->b_xflags, BO_MTX(bp->b_bufobj), PRIBIO, "bwrbg", 0); if (bp->b_vflags & BV_BKGRDINPROG) panic("bufwrite: still writing"); } BO_UNLOCK(bp->b_bufobj); /* Mark the buffer clean */ bundirty(bp); /* * If this buffer is marked for background writing and we * do not have to wait for it, make a copy and write the * copy so as to leave this buffer ready for further use. * * This optimization eats a lot of memory. If we have a page * or buffer shortfall we can't do it. */ if (dobkgrdwrite && (bp->b_xflags & BX_BKGRDWRITE) && (bp->b_flags & B_ASYNC) && !vm_page_count_severe() && !buf_dirty_count_severe()) { KASSERT(bp->b_iodone == NULL, ("bufwrite: needs chained iodone (%p)", bp->b_iodone)); /* get a new block */ newbp = geteblk(bp->b_bufsize); /* * set it to be identical to the old block. We have to * set b_lblkno and BKGRDMARKER before calling bgetvp() * to avoid confusing the splay tree and gbincore(). */ memcpy(newbp->b_data, bp->b_data, bp->b_bufsize); newbp->b_lblkno = bp->b_lblkno; newbp->b_xflags |= BX_BKGRDMARKER; BO_LOCK(bp->b_bufobj); bp->b_vflags |= BV_BKGRDINPROG; bgetvp(bp->b_vp, newbp); BO_UNLOCK(bp->b_bufobj); newbp->b_bufobj = &bp->b_vp->v_bufobj; newbp->b_blkno = bp->b_blkno; newbp->b_offset = bp->b_offset; newbp->b_iodone = vfs_backgroundwritedone; newbp->b_flags |= B_ASYNC; newbp->b_flags &= ~B_INVAL; /* move over the dependencies */ if (LIST_FIRST(&bp->b_dep) != NULL) buf_movedeps(bp, newbp); /* * Initiate write on the copy, release the original to * the B_LOCKED queue so that it cannot go away until * the background write completes. If not locked it could go * away and then be reconstituted while it was being written. * If the reconstituted buffer were written, we could end up * with two background copies being written at the same time. */ bqrelse(bp); bp = newbp; } bp->b_flags &= ~B_DONE; bp->b_ioflags &= ~BIO_ERROR; bp->b_flags |= B_CACHE; bp->b_iocmd = BIO_WRITE; bufobj_wref(bp->b_bufobj); vfs_busy_pages(bp, 1); /* * Normal bwrites pipeline writes */ bp->b_runningbufspace = bp->b_bufsize; atomic_add_int(&runningbufspace, bp->b_runningbufspace); if (curthread != PCPU_GET(idlethread)) curthread->td_proc->p_stats->p_ru.ru_oublock++; splx(s); if (oldflags & B_ASYNC) BUF_KERNPROC(bp); bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); if ((oldflags & B_ASYNC) == 0) { int rtval = bufwait(bp); brelse(bp); return (rtval); } else { /* * don't allow the async write to saturate the I/O * system. We will not deadlock here because * we are blocking waiting for I/O that is already in-progress * to complete. We do not block here if it is the update * or syncer daemon trying to clean up as that can lead * to deadlock. */ if (curthread->td_proc != bufdaemonproc && curthread->td_proc != updateproc) waitrunningbufspace(); } return (0); } /* * Complete a background write started from bwrite. */ static void vfs_backgroundwritedone(struct buf *bp) { struct buf *origbp; /* * Find the original buffer that we are writing. */ BO_LOCK(bp->b_bufobj); if ((origbp = gbincore(bp->b_bufobj, bp->b_lblkno)) == NULL) panic("backgroundwritedone: lost buffer"); /* * Clear the BV_BKGRDINPROG flag in the original buffer * and awaken it if it is waiting for the write to complete. * If BV_BKGRDINPROG is not set in the original buffer it must * have been released and re-instantiated - which is not legal. */ KASSERT((origbp->b_vflags & BV_BKGRDINPROG), ("backgroundwritedone: lost buffer2")); origbp->b_vflags &= ~BV_BKGRDINPROG; if (origbp->b_vflags & BV_BKGRDWAIT) { origbp->b_vflags &= ~BV_BKGRDWAIT; wakeup(&origbp->b_xflags); } BO_UNLOCK(bp->b_bufobj); /* * Process dependencies then return any unfinished ones. */ if (LIST_FIRST(&bp->b_dep) != NULL) buf_complete(bp); if (LIST_FIRST(&bp->b_dep) != NULL) buf_movedeps(bp, origbp); /* * This buffer is marked B_NOCACHE, so when it is released * by biodone, it will be tossed. We mark it with BIO_READ * to avoid biodone doing a second bufobj_wdrop. */ bp->b_flags |= B_NOCACHE; bp->b_iocmd = BIO_READ; bp->b_flags &= ~(B_CACHE | B_DONE); bp->b_iodone = 0; bufdone(bp); } /* * Delayed write. (Buffer is marked dirty). Do not bother writing * anything if the buffer is marked invalid. * * Note that since the buffer must be completely valid, we can safely * set B_CACHE. In fact, we have to set B_CACHE here rather then in * biodone() in order to prevent getblk from writing the buffer * out synchronously. */ void bdwrite(struct buf *bp) { struct thread *td = curthread; struct vnode *vp; struct buf *nbp; struct bufobj *bo; GIANT_REQUIRED; KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); KASSERT(BUF_REFCNT(bp) != 0, ("bdwrite: buffer is not busy")); if (bp->b_flags & B_INVAL) { brelse(bp); return; } /* * If we have too many dirty buffers, don't create any more. * If we are wildly over our limit, then force a complete * cleanup. Otherwise, just keep the situation from getting * out of control. Note that we have to avoid a recursive * disaster and not try to clean up after our own cleanup! */ vp = bp->b_vp; bo = bp->b_bufobj; BO_LOCK(bo); if (td->td_pflags & TDP_COWINPROGRESS) { recursiveflushes++; } else if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10) { BO_UNLOCK(bo); (void) VOP_FSYNC(vp, td->td_ucred, MNT_NOWAIT, td); BO_LOCK(bo); altbufferflushes++; } else if (bo->bo_dirty.bv_cnt > dirtybufthresh) { /* * Try to find a buffer to flush. */ TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) { if ((nbp->b_vflags & BV_BKGRDINPROG) || buf_countdeps(nbp, 0) || BUF_LOCK(nbp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) continue; if (bp == nbp) panic("bdwrite: found ourselves"); BO_UNLOCK(bo); if (nbp->b_flags & B_CLUSTEROK) { vfs_bio_awrite(nbp); } else { bremfree(nbp); bawrite(nbp); } BO_LOCK(bo); dirtybufferflushes++; break; } } BO_UNLOCK(bo); bdirty(bp); /* * Set B_CACHE, indicating that the buffer is fully valid. This is * true even of NFS now. */ bp->b_flags |= B_CACHE; /* * This bmap keeps the system from needing to do the bmap later, * perhaps when the system is attempting to do a sync. Since it * is likely that the indirect block -- or whatever other datastructure * that the filesystem needs is still in memory now, it is a good * thing to do this. Note also, that if the pageout daemon is * requesting a sync -- there might not be enough memory to do * the bmap then... So, this is important to do. */ if (vp->v_type != VCHR && bp->b_lblkno == bp->b_blkno) { VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); } /* * Set the *dirty* buffer range based upon the VM system dirty pages. */ vfs_setdirty(bp); /* * We need to do this here to satisfy the vnode_pager and the * pageout daemon, so that it thinks that the pages have been * "cleaned". Note that since the pages are in a delayed write * buffer -- the VFS layer "will" see that the pages get written * out on the next sync, or perhaps the cluster will be completed. */ vfs_clean_pages(bp); bqrelse(bp); /* * Wakeup the buffer flushing daemon if we have a lot of dirty * buffers (midpoint between our recovery point and our stall * point). */ bd_wakeup((lodirtybuffers + hidirtybuffers) / 2); /* * note: we cannot initiate I/O from a bdwrite even if we wanted to, * due to the softdep code. */ } /* * bdirty: * * Turn buffer into delayed write request. We must clear BIO_READ and * B_RELBUF, and we must set B_DELWRI. We reassign the buffer to * itself to properly update it in the dirty/clean lists. We mark it * B_DONE to ensure that any asynchronization of the buffer properly * clears B_DONE ( else a panic will occur later ). * * bdirty() is kinda like bdwrite() - we have to clear B_INVAL which * might have been set pre-getblk(). Unlike bwrite/bdwrite, bdirty() * should only be called if the buffer is known-good. * * Since the buffer is not on a queue, we do not update the numfreebuffers * count. * * Must be called at splbio(). * The buffer must be on QUEUE_NONE. */ void bdirty(struct buf *bp) { KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); KASSERT(bp->b_qindex == QUEUE_NONE, ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex)); bp->b_flags &= ~(B_RELBUF); bp->b_iocmd = BIO_WRITE; if ((bp->b_flags & B_DELWRI) == 0) { bp->b_flags |= B_DONE | B_DELWRI; reassignbuf(bp); atomic_add_int(&numdirtybuffers, 1); bd_wakeup((lodirtybuffers + hidirtybuffers) / 2); } } /* * bundirty: * * Clear B_DELWRI for buffer. * * Since the buffer is not on a queue, we do not update the numfreebuffers * count. * * Must be called at splbio(). * The buffer must be on QUEUE_NONE. */ void bundirty(struct buf *bp) { KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); KASSERT(bp->b_qindex == QUEUE_NONE, ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex)); if (bp->b_flags & B_DELWRI) { bp->b_flags &= ~B_DELWRI; reassignbuf(bp); atomic_subtract_int(&numdirtybuffers, 1); numdirtywakeup(lodirtybuffers); } /* * Since it is now being written, we can clear its deferred write flag. */ bp->b_flags &= ~B_DEFERRED; } /* * bawrite: * * Asynchronous write. Start output on a buffer, but do not wait for * it to complete. The buffer is released when the output completes. * * bwrite() ( or the VOP routine anyway ) is responsible for handling * B_INVAL buffers. Not us. */ void bawrite(struct buf *bp) { bp->b_flags |= B_ASYNC; (void) bwrite(bp); } /* * bwillwrite: * * Called prior to the locking of any vnodes when we are expecting to * write. We do not want to starve the buffer cache with too many * dirty buffers so we block here. By blocking prior to the locking * of any vnodes we attempt to avoid the situation where a locked vnode * prevents the various system daemons from flushing related buffers. */ void bwillwrite(void) { if (numdirtybuffers >= hidirtybuffers) { int s; mtx_lock(&Giant); s = splbio(); mtx_lock(&nblock); while (numdirtybuffers >= hidirtybuffers) { bd_wakeup(1); needsbuffer |= VFS_BIO_NEED_DIRTYFLUSH; msleep(&needsbuffer, &nblock, (PRIBIO + 4), "flswai", 0); } splx(s); mtx_unlock(&nblock); mtx_unlock(&Giant); } } /* * Return true if we have too many dirty buffers. */ int buf_dirty_count_severe(void) { return(numdirtybuffers >= hidirtybuffers); } /* * brelse: * * Release a busy buffer and, if requested, free its resources. The * buffer will be stashed in the appropriate bufqueue[] allowing it * to be accessed later as a cache entity or reused for other purposes. */ void brelse(struct buf *bp) { int s; GIANT_REQUIRED; KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); s = splbio(); if (bp->b_iocmd == BIO_WRITE && (bp->b_ioflags & BIO_ERROR) && !(bp->b_flags & B_INVAL)) { /* * Failed write, redirty. Must clear BIO_ERROR to prevent * pages from being scrapped. If B_INVAL is set then * this case is not run and the next case is run to * destroy the buffer. B_INVAL can occur if the buffer * is outside the range supported by the underlying device. */ bp->b_ioflags &= ~BIO_ERROR; bdirty(bp); } else if ((bp->b_flags & (B_NOCACHE | B_INVAL)) || (bp->b_ioflags & BIO_ERROR) || (bp->b_bufsize <= 0)) { /* * Either a failed I/O or we were asked to free or not * cache the buffer. */ bp->b_flags |= B_INVAL; if (LIST_FIRST(&bp->b_dep) != NULL) buf_deallocate(bp); if (bp->b_flags & B_DELWRI) { atomic_subtract_int(&numdirtybuffers, 1); numdirtywakeup(lodirtybuffers); } bp->b_flags &= ~(B_DELWRI | B_CACHE); if ((bp->b_flags & B_VMIO) == 0) { if (bp->b_bufsize) allocbuf(bp, 0); if (bp->b_vp) brelvp(bp); } } /* * We must clear B_RELBUF if B_DELWRI is set. If vfs_vmio_release() * is called with B_DELWRI set, the underlying pages may wind up * getting freed causing a previous write (bdwrite()) to get 'lost' * because pages associated with a B_DELWRI bp are marked clean. * * We still allow the B_INVAL case to call vfs_vmio_release(), even * if B_DELWRI is set. * * If B_DELWRI is not set we may have to set B_RELBUF if we are low * on pages to return pages to the VM page queues. */ if (bp->b_flags & B_DELWRI) bp->b_flags &= ~B_RELBUF; else if (vm_page_count_severe()) { /* * XXX This lock may not be necessary since BKGRDINPROG * cannot be set while we hold the buf lock, it can only be * cleared if it is already pending. */ if (bp->b_vp) { BO_LOCK(bp->b_bufobj); if (!(bp->b_vflags & BV_BKGRDINPROG)) bp->b_flags |= B_RELBUF; BO_UNLOCK(bp->b_bufobj); } else bp->b_flags |= B_RELBUF; } /* * VMIO buffer rundown. It is not very necessary to keep a VMIO buffer * constituted, not even NFS buffers now. Two flags effect this. If * B_INVAL, the struct buf is invalidated but the VM object is kept * around ( i.e. so it is trivial to reconstitute the buffer later ). * * If BIO_ERROR or B_NOCACHE is set, pages in the VM object will be * invalidated. BIO_ERROR cannot be set for a failed write unless the * buffer is also B_INVAL because it hits the re-dirtying code above. * * Normally we can do this whether a buffer is B_DELWRI or not. If * the buffer is an NFS buffer, it is tracking piecemeal writes or * the commit state and we cannot afford to lose the buffer. If the * buffer has a background write in progress, we need to keep it * around to prevent it from being reconstituted and starting a second * background write. */ if ((bp->b_flags & B_VMIO) && !(bp->b_vp->v_mount != NULL && (bp->b_vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 && !vn_isdisk(bp->b_vp, NULL) && (bp->b_flags & B_DELWRI)) ) { int i, j, resid; vm_page_t m; off_t foff; vm_pindex_t poff; vm_object_t obj; obj = bp->b_object; /* * Get the base offset and length of the buffer. Note that * in the VMIO case if the buffer block size is not * page-aligned then b_data pointer may not be page-aligned. * But our b_pages[] array *IS* page aligned. * * block sizes less then DEV_BSIZE (usually 512) are not * supported due to the page granularity bits (m->valid, * m->dirty, etc...). * * See man buf(9) for more information */ resid = bp->b_bufsize; foff = bp->b_offset; VM_OBJECT_LOCK(obj); for (i = 0; i < bp->b_npages; i++) { int had_bogus = 0; m = bp->b_pages[i]; /* * If we hit a bogus page, fixup *all* the bogus pages * now. */ if (m == bogus_page) { poff = OFF_TO_IDX(bp->b_offset); had_bogus = 1; for (j = i; j < bp->b_npages; j++) { vm_page_t mtmp; mtmp = bp->b_pages[j]; if (mtmp == bogus_page) { mtmp = vm_page_lookup(obj, poff + j); if (!mtmp) { panic("brelse: page missing\n"); } bp->b_pages[j] = mtmp; } } if ((bp->b_flags & B_INVAL) == 0) { pmap_qenter( trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } m = bp->b_pages[i]; } if ((bp->b_flags & B_NOCACHE) || (bp->b_ioflags & BIO_ERROR)) { int poffset = foff & PAGE_MASK; int presid = resid > (PAGE_SIZE - poffset) ? (PAGE_SIZE - poffset) : resid; KASSERT(presid >= 0, ("brelse: extra page")); vm_page_lock_queues(); vm_page_set_invalid(m, poffset, presid); vm_page_unlock_queues(); if (had_bogus) printf("avoided corruption bug in bogus_page/brelse code\n"); } resid -= PAGE_SIZE - (foff & PAGE_MASK); foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; } VM_OBJECT_UNLOCK(obj); if (bp->b_flags & (B_INVAL | B_RELBUF)) vfs_vmio_release(bp); } else if (bp->b_flags & B_VMIO) { if (bp->b_flags & (B_INVAL | B_RELBUF)) { vfs_vmio_release(bp); } } if (bp->b_qindex != QUEUE_NONE) panic("brelse: free buffer onto another queue???"); if (BUF_REFCNT(bp) > 1) { /* do not release to free list */ BUF_UNLOCK(bp); splx(s); return; } /* enqueue */ mtx_lock(&bqlock); /* buffers with no memory */ if (bp->b_bufsize == 0) { bp->b_flags |= B_INVAL; bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA); if (bp->b_vflags & BV_BKGRDINPROG) panic("losing buffer 1"); if (bp->b_kvasize) { bp->b_qindex = QUEUE_EMPTYKVA; } else { bp->b_qindex = QUEUE_EMPTY; } TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist); bp->b_dev = NULL; /* buffers with junk contents */ } else if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) || (bp->b_ioflags & BIO_ERROR)) { bp->b_flags |= B_INVAL; bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA); if (bp->b_vflags & BV_BKGRDINPROG) panic("losing buffer 2"); bp->b_qindex = QUEUE_CLEAN; TAILQ_INSERT_HEAD(&bufqueues[QUEUE_CLEAN], bp, b_freelist); bp->b_dev = NULL; /* remaining buffers */ } else { if (bp->b_flags & B_DELWRI) bp->b_qindex = QUEUE_DIRTY; else bp->b_qindex = QUEUE_CLEAN; if (bp->b_flags & B_AGE) TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist); else TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist); } mtx_unlock(&bqlock); /* * If B_INVAL and B_DELWRI is set, clear B_DELWRI. We have already * placed the buffer on the correct queue. We must also disassociate * the device and vnode for a B_INVAL buffer so gbincore() doesn't * find it. */ if (bp->b_flags & B_INVAL) { if (bp->b_flags & B_DELWRI) bundirty(bp); if (bp->b_vp) brelvp(bp); } /* * Fixup numfreebuffers count. The bp is on an appropriate queue * unless locked. We then bump numfreebuffers if it is not B_DELWRI. * We've already handled the B_INVAL case ( B_DELWRI will be clear * if B_INVAL is set ). */ if (!(bp->b_flags & B_DELWRI)) bufcountwakeup(); /* * Something we can maybe free or reuse */ if (bp->b_bufsize || bp->b_kvasize) bufspacewakeup(); bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | B_DIRECT); if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY)) panic("brelse: not dirty"); /* unlock */ BUF_UNLOCK(bp); splx(s); } /* * Release a buffer back to the appropriate queue but do not try to free * it. The buffer is expected to be used again soon. * * bqrelse() is used by bdwrite() to requeue a delayed write, and used by * biodone() to requeue an async I/O on completion. It is also used when * known good buffers need to be requeued but we think we may need the data * again soon. * * XXX we should be able to leave the B_RELBUF hint set on completion. */ void bqrelse(struct buf *bp) { int s; s = splbio(); KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); if (bp->b_qindex != QUEUE_NONE) panic("bqrelse: free buffer onto another queue???"); if (BUF_REFCNT(bp) > 1) { /* do not release to free list */ BUF_UNLOCK(bp); splx(s); return; } mtx_lock(&bqlock); /* buffers with stale but valid contents */ if (bp->b_flags & B_DELWRI) { bp->b_qindex = QUEUE_DIRTY; TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY], bp, b_freelist); } else { /* * XXX This lock may not be necessary since BKGRDINPROG * cannot be set while we hold the buf lock, it can only be * cleared if it is already pending. */ BO_LOCK(bp->b_bufobj); if (!vm_page_count_severe() || bp->b_vflags & BV_BKGRDINPROG) { BO_UNLOCK(bp->b_bufobj); bp->b_qindex = QUEUE_CLEAN; TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp, b_freelist); } else { /* * We are too low on memory, we have to try to free * the buffer (most importantly: the wired pages * making up its backing store) *now*. */ BO_UNLOCK(bp->b_bufobj); mtx_unlock(&bqlock); splx(s); brelse(bp); return; } } mtx_unlock(&bqlock); if ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI)) bufcountwakeup(); /* * Something we can maybe free or reuse. */ if (bp->b_bufsize && !(bp->b_flags & B_DELWRI)) bufspacewakeup(); bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY)) panic("bqrelse: not dirty"); /* unlock */ BUF_UNLOCK(bp); splx(s); } /* Give pages used by the bp back to the VM system (where possible) */ static void vfs_vmio_release(struct buf *bp) { int i; vm_page_t m; GIANT_REQUIRED; VM_OBJECT_LOCK(bp->b_bufobj->bo_object); vm_page_lock_queues(); for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; bp->b_pages[i] = NULL; /* * In order to keep page LRU ordering consistent, put * everything on the inactive queue. */ vm_page_unwire(m, 0); /* * We don't mess with busy pages, it is * the responsibility of the process that * busied the pages to deal with them. */ if ((m->flags & PG_BUSY) || (m->busy != 0)) continue; if (m->wire_count == 0) { /* * Might as well free the page if we can and it has * no valid data. We also free the page if the * buffer was used for direct I/O */ if ((bp->b_flags & B_ASYNC) == 0 && !m->valid && m->hold_count == 0) { vm_page_busy(m); pmap_remove_all(m); vm_page_free(m); } else if (bp->b_flags & B_DIRECT) { vm_page_try_to_free(m); } else if (vm_page_count_severe()) { vm_page_try_to_cache(m); } } } vm_page_unlock_queues(); VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object); pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages); if (bp->b_bufsize) { bufspacewakeup(); bp->b_bufsize = 0; } bp->b_npages = 0; bp->b_flags &= ~B_VMIO; if (bp->b_vp) brelvp(bp); } /* * Check to see if a block at a particular lbn is available for a clustered * write. */ static int vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno) { struct buf *bpa; int match; match = 0; /* If the buf isn't in core skip it */ if ((bpa = gbincore(&vp->v_bufobj, lblkno)) == NULL) return (0); /* If the buf is busy we don't want to wait for it */ if (BUF_LOCK(bpa, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) return (0); /* Only cluster with valid clusterable delayed write buffers */ if ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) != (B_DELWRI | B_CLUSTEROK)) goto done; if (bpa->b_bufsize != size) goto done; /* * Check to see if it is in the expected place on disk and that the * block has been mapped. */ if ((bpa->b_blkno != bpa->b_lblkno) && (bpa->b_blkno == blkno)) match = 1; done: BUF_UNLOCK(bpa); return (match); } /* * vfs_bio_awrite: * * Implement clustered async writes for clearing out B_DELWRI buffers. * This is much better then the old way of writing only one buffer at * a time. Note that we may not be presented with the buffers in the * correct order, so we search for the cluster in both directions. */ int vfs_bio_awrite(struct buf *bp) { int i; int j; daddr_t lblkno = bp->b_lblkno; struct vnode *vp = bp->b_vp; int s; int ncl; int nwritten; int size; int maxcl; s = splbio(); /* * right now we support clustered writing only to regular files. If * we find a clusterable block we could be in the middle of a cluster * rather then at the beginning. */ if ((vp->v_type == VREG) && (vp->v_mount != 0) && /* Only on nodes that have the size info */ (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) { size = vp->v_mount->mnt_stat.f_iosize; maxcl = MAXPHYS / size; VI_LOCK(vp); for (i = 1; i < maxcl; i++) if (vfs_bio_clcheck(vp, size, lblkno + i, bp->b_blkno + ((i * size) >> DEV_BSHIFT)) == 0) break; for (j = 1; i + j <= maxcl && j <= lblkno; j++) if (vfs_bio_clcheck(vp, size, lblkno - j, bp->b_blkno - ((j * size) >> DEV_BSHIFT)) == 0) break; VI_UNLOCK(vp); --j; ncl = i + j; /* * this is a possible cluster write */ if (ncl != 1) { BUF_UNLOCK(bp); nwritten = cluster_wbuild(vp, size, lblkno - j, ncl); splx(s); return nwritten; } } bremfree(bp); bp->b_flags |= B_ASYNC; splx(s); /* * default (old) behavior, writing out only one block * * XXX returns b_bufsize instead of b_bcount for nwritten? */ nwritten = bp->b_bufsize; (void) bwrite(bp); return nwritten; } /* * getnewbuf: * * Find and initialize a new buffer header, freeing up existing buffers * in the bufqueues as necessary. The new buffer is returned locked. * * Important: B_INVAL is not set. If the caller wishes to throw the * buffer away, the caller must set B_INVAL prior to calling brelse(). * * We block if: * We have insufficient buffer headers * We have insufficient buffer space * buffer_map is too fragmented ( space reservation fails ) * If we have to flush dirty buffers ( but we try to avoid this ) * * To avoid VFS layer recursion we do not flush dirty buffers ourselves. * Instead we ask the buf daemon to do it for us. We attempt to * avoid piecemeal wakeups of the pageout daemon. */ static struct buf * getnewbuf(int slpflag, int slptimeo, int size, int maxsize) { struct buf *bp; struct buf *nbp; int defrag = 0; int nqindex; static int flushingbufs; GIANT_REQUIRED; /* * We can't afford to block since we might be holding a vnode lock, * which may prevent system daemons from running. We deal with * low-memory situations by proactively returning memory and running * async I/O rather then sync I/O. */ atomic_add_int(&getnewbufcalls, 1); atomic_subtract_int(&getnewbufrestarts, 1); restart: atomic_add_int(&getnewbufrestarts, 1); /* * Setup for scan. If we do not have enough free buffers, * we setup a degenerate case that immediately fails. Note * that if we are specially marked process, we are allowed to * dip into our reserves. * * The scanning sequence is nominally: EMPTY->EMPTYKVA->CLEAN * * We start with EMPTYKVA. If the list is empty we backup to EMPTY. * However, there are a number of cases (defragging, reusing, ...) * where we cannot backup. */ mtx_lock(&bqlock); nqindex = QUEUE_EMPTYKVA; nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]); if (nbp == NULL) { /* * If no EMPTYKVA buffers and we are either * defragging or reusing, locate a CLEAN buffer * to free or reuse. If bufspace useage is low * skip this step so we can allocate a new buffer. */ if (defrag || bufspace >= lobufspace) { nqindex = QUEUE_CLEAN; nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]); } /* * If we could not find or were not allowed to reuse a * CLEAN buffer, check to see if it is ok to use an EMPTY * buffer. We can only use an EMPTY buffer if allocating * its KVA would not otherwise run us out of buffer space. */ if (nbp == NULL && defrag == 0 && bufspace + maxsize < hibufspace) { nqindex = QUEUE_EMPTY; nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]); } } /* * Run scan, possibly freeing data and/or kva mappings on the fly * depending. */ while ((bp = nbp) != NULL) { int qindex = nqindex; /* * Calculate next bp ( we can only use it if we do not block * or do other fancy things ). */ if ((nbp = TAILQ_NEXT(bp, b_freelist)) == NULL) { switch(qindex) { case QUEUE_EMPTY: nqindex = QUEUE_EMPTYKVA; if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]))) break; /* FALLTHROUGH */ case QUEUE_EMPTYKVA: nqindex = QUEUE_CLEAN; if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]))) break; /* FALLTHROUGH */ case QUEUE_CLEAN: /* * nbp is NULL. */ break; } } if (bp->b_vp) { BO_LOCK(bp->b_bufobj); if (bp->b_vflags & BV_BKGRDINPROG) { BO_UNLOCK(bp->b_bufobj); continue; } BO_UNLOCK(bp->b_bufobj); } /* * Sanity Checks */ KASSERT(bp->b_qindex == qindex, ("getnewbuf: inconsistant queue %d bp %p", qindex, bp)); /* * Note: we no longer distinguish between VMIO and non-VMIO * buffers. */ KASSERT((bp->b_flags & B_DELWRI) == 0, ("delwri buffer %p found in queue %d", bp, qindex)); /* * If we are defragging then we need a buffer with * b_kvasize != 0. XXX this situation should no longer * occur, if defrag is non-zero the buffer's b_kvasize * should also be non-zero at this point. XXX */ if (defrag && bp->b_kvasize == 0) { printf("Warning: defrag empty buffer %p\n", bp); continue; } /* * Start freeing the bp. This is somewhat involved. nbp * remains valid only for QUEUE_EMPTY[KVA] bp's. */ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) panic("getnewbuf: locked buf"); bremfreel(bp); mtx_unlock(&bqlock); if (qindex == QUEUE_CLEAN) { if (bp->b_flags & B_VMIO) { bp->b_flags &= ~B_ASYNC; vfs_vmio_release(bp); } if (bp->b_vp) brelvp(bp); } /* * NOTE: nbp is now entirely invalid. We can only restart * the scan from this point on. * * Get the rest of the buffer freed up. b_kva* is still * valid after this operation. */ if (bp->b_rcred != NOCRED) { crfree(bp->b_rcred); bp->b_rcred = NOCRED; } if (bp->b_wcred != NOCRED) { crfree(bp->b_wcred); bp->b_wcred = NOCRED; } if (LIST_FIRST(&bp->b_dep) != NULL) buf_deallocate(bp); if (bp->b_vflags & BV_BKGRDINPROG) panic("losing buffer 3"); if (bp->b_bufsize) allocbuf(bp, 0); bp->b_flags = 0; bp->b_ioflags = 0; bp->b_xflags = 0; bp->b_vflags = 0; bp->b_dev = NULL; bp->b_vp = NULL; bp->b_blkno = bp->b_lblkno = 0; bp->b_offset = NOOFFSET; bp->b_iodone = 0; bp->b_error = 0; bp->b_resid = 0; bp->b_bcount = 0; bp->b_npages = 0; bp->b_dirtyoff = bp->b_dirtyend = 0; bp->b_magic = B_MAGIC_BIO; bp->b_bufobj = NULL; LIST_INIT(&bp->b_dep); /* * If we are defragging then free the buffer. */ if (defrag) { bp->b_flags |= B_INVAL; bfreekva(bp); brelse(bp); defrag = 0; goto restart; } /* * If we are overcomitted then recover the buffer and its * KVM space. This occurs in rare situations when multiple * processes are blocked in getnewbuf() or allocbuf(). */ if (bufspace >= hibufspace) flushingbufs = 1; if (flushingbufs && bp->b_kvasize != 0) { bp->b_flags |= B_INVAL; bfreekva(bp); brelse(bp); goto restart; } if (bufspace < lobufspace) flushingbufs = 0; break; } /* * If we exhausted our list, sleep as appropriate. We may have to * wakeup various daemons and write out some dirty buffers. * * Generally we are sleeping due to insufficient buffer space. */ if (bp == NULL) { int flags; char *waitmsg; mtx_unlock(&bqlock); if (defrag) { flags = VFS_BIO_NEED_BUFSPACE; waitmsg = "nbufkv"; } else if (bufspace >= hibufspace) { waitmsg = "nbufbs"; flags = VFS_BIO_NEED_BUFSPACE; } else { waitmsg = "newbuf"; flags = VFS_BIO_NEED_ANY; } bd_speedup(); /* heeeelp */ mtx_lock(&nblock); needsbuffer |= flags; while (needsbuffer & flags) { if (msleep(&needsbuffer, &nblock, (PRIBIO + 4) | slpflag, waitmsg, slptimeo)) { mtx_unlock(&nblock); return (NULL); } } mtx_unlock(&nblock); } else { /* * We finally have a valid bp. We aren't quite out of the * woods, we still have to reserve kva space. In order * to keep fragmentation sane we only allocate kva in * BKVASIZE chunks. */ maxsize = (maxsize + BKVAMASK) & ~BKVAMASK; if (maxsize != bp->b_kvasize) { vm_offset_t addr = 0; bfreekva(bp); if (vm_map_findspace(buffer_map, vm_map_min(buffer_map), maxsize, &addr)) { /* * Uh oh. Buffer map is to fragmented. We * must defragment the map. */ atomic_add_int(&bufdefragcnt, 1); defrag = 1; bp->b_flags |= B_INVAL; brelse(bp); goto restart; } if (addr) { vm_map_insert(buffer_map, NULL, 0, addr, addr + maxsize, VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT); bp->b_kvabase = (caddr_t) addr; bp->b_kvasize = maxsize; atomic_add_int(&bufspace, bp->b_kvasize); atomic_add_int(&bufreusecnt, 1); } } bp->b_saveaddr = bp->b_kvabase; bp->b_data = bp->b_saveaddr; } return(bp); } /* * buf_daemon: * * buffer flushing daemon. Buffers are normally flushed by the * update daemon but if it cannot keep up this process starts to * take the load in an attempt to prevent getnewbuf() from blocking. */ static struct kproc_desc buf_kp = { "bufdaemon", buf_daemon, &bufdaemonproc }; SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp) static void buf_daemon() { int s; mtx_lock(&Giant); /* * This process needs to be suspended prior to shutdown sync. */ EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, bufdaemonproc, SHUTDOWN_PRI_LAST); /* * This process is allowed to take the buffer cache to the limit */ s = splbio(); mtx_lock(&bdlock); for (;;) { bd_request = 0; mtx_unlock(&bdlock); kthread_suspend_check(bufdaemonproc); /* * Do the flush. Limit the amount of in-transit I/O we * allow to build up, otherwise we would completely saturate * the I/O system. Wakeup any waiting processes before we * normally would so they can run in parallel with our drain. */ while (numdirtybuffers > lodirtybuffers) { if (flushbufqueues(0) == 0) { /* * Could not find any buffers without rollback * dependencies, so just write the first one * in the hopes of eventually making progress. */ flushbufqueues(1); break; } waitrunningbufspace(); numdirtywakeup((lodirtybuffers + hidirtybuffers) / 2); } /* * Only clear bd_request if we have reached our low water * mark. The buf_daemon normally waits 1 second and * then incrementally flushes any dirty buffers that have * built up, within reason. * * If we were unable to hit our low water mark and couldn't * find any flushable buffers, we sleep half a second. * Otherwise we loop immediately. */ mtx_lock(&bdlock); if (numdirtybuffers <= lodirtybuffers) { /* * We reached our low water mark, reset the * request and sleep until we are needed again. * The sleep is just so the suspend code works. */ bd_request = 0; msleep(&bd_request, &bdlock, PVM, "psleep", hz); } else { /* * We couldn't find any flushable dirty buffers but * still have too many dirty buffers, we * have to sleep and try again. (rare) */ msleep(&bd_request, &bdlock, PVM, "qsleep", hz / 10); } } } /* * flushbufqueues: * * Try to flush a buffer in the dirty queue. We must be careful to * free up B_INVAL buffers instead of write them, which NFS is * particularly sensitive to. */ int flushwithdeps = 0; SYSCTL_INT(_vfs, OID_AUTO, flushwithdeps, CTLFLAG_RW, &flushwithdeps, 0, "Number of buffers flushed with dependecies that require rollbacks"); static int flushbufqueues(int flushdeps) { struct thread *td = curthread; struct vnode *vp; struct mount *mp; struct buf *bp; int hasdeps; mtx_lock(&bqlock); TAILQ_FOREACH(bp, &bufqueues[QUEUE_DIRTY], b_freelist) { if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) continue; KASSERT((bp->b_flags & B_DELWRI), ("unexpected clean buffer %p", bp)); BO_LOCK(bp->b_bufobj); if ((bp->b_vflags & BV_BKGRDINPROG) != 0) { BO_UNLOCK(bp->b_bufobj); BUF_UNLOCK(bp); continue; } BO_UNLOCK(bp->b_bufobj); if (bp->b_flags & B_INVAL) { bremfreel(bp); mtx_unlock(&bqlock); brelse(bp); return (1); } if (LIST_FIRST(&bp->b_dep) != NULL && buf_countdeps(bp, 0)) { if (flushdeps == 0) { BUF_UNLOCK(bp); continue; } hasdeps = 1; } else hasdeps = 0; /* * We must hold the lock on a vnode before writing * one of its buffers. Otherwise we may confuse, or * in the case of a snapshot vnode, deadlock the * system. * * The lock order here is the reverse of the normal * of vnode followed by buf lock. This is ok because * the NOWAIT will prevent deadlock. */ vp = bp->b_vp; if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { BUF_UNLOCK(bp); continue; } if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT, td) == 0) { mtx_unlock(&bqlock); vfs_bio_awrite(bp); vn_finished_write(mp); VOP_UNLOCK(vp, 0, td); flushwithdeps += hasdeps; return (1); } vn_finished_write(mp); BUF_UNLOCK(bp); } mtx_unlock(&bqlock); return (0); } /* * Check to see if a block is currently memory resident. */ struct buf * incore(struct bufobj *bo, daddr_t blkno) { struct buf *bp; int s = splbio(); BO_LOCK(bo); bp = gbincore(bo, blkno); BO_UNLOCK(bo); splx(s); return (bp); } /* * Returns true if no I/O is needed to access the * associated VM object. This is like incore except * it also hunts around in the VM system for the data. */ static int inmem(struct vnode * vp, daddr_t blkno) { vm_object_t obj; vm_offset_t toff, tinc, size; vm_page_t m; vm_ooffset_t off; GIANT_REQUIRED; ASSERT_VOP_LOCKED(vp, "inmem"); if (incore(&vp->v_bufobj, blkno)) return 1; if (vp->v_mount == NULL) return 0; if (VOP_GETVOBJECT(vp, &obj) != 0 || (vp->v_vflag & VV_OBJBUF) == 0) return 0; size = PAGE_SIZE; if (size > vp->v_mount->mnt_stat.f_iosize) size = vp->v_mount->mnt_stat.f_iosize; off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize; VM_OBJECT_LOCK(obj); for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) { m = vm_page_lookup(obj, OFF_TO_IDX(off + toff)); if (!m) goto notinmem; tinc = size; if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK)) tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK); if (vm_page_is_valid(m, (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0) goto notinmem; } VM_OBJECT_UNLOCK(obj); return 1; notinmem: VM_OBJECT_UNLOCK(obj); return (0); } /* * vfs_setdirty: * * Sets the dirty range for a buffer based on the status of the dirty * bits in the pages comprising the buffer. * * The range is limited to the size of the buffer. * * This routine is primarily used by NFS, but is generalized for the * B_VMIO case. */ static void vfs_setdirty(struct buf *bp) { int i; vm_object_t object; GIANT_REQUIRED; /* * Degenerate case - empty buffer */ if (bp->b_bufsize == 0) return; /* * We qualify the scan for modified pages on whether the * object has been flushed yet. The OBJ_WRITEABLE flag * is not cleared simply by protecting pages off. */ if ((bp->b_flags & B_VMIO) == 0) return; object = bp->b_pages[0]->object; VM_OBJECT_LOCK(object); if ((object->flags & OBJ_WRITEABLE) && !(object->flags & OBJ_MIGHTBEDIRTY)) printf("Warning: object %p writeable but not mightbedirty\n", object); if (!(object->flags & OBJ_WRITEABLE) && (object->flags & OBJ_MIGHTBEDIRTY)) printf("Warning: object %p mightbedirty but not writeable\n", object); if (object->flags & (OBJ_MIGHTBEDIRTY|OBJ_CLEANING)) { vm_offset_t boffset; vm_offset_t eoffset; vm_page_lock_queues(); /* * test the pages to see if they have been modified directly * by users through the VM system. */ for (i = 0; i < bp->b_npages; i++) vm_page_test_dirty(bp->b_pages[i]); /* * Calculate the encompassing dirty range, boffset and eoffset, * (eoffset - boffset) bytes. */ for (i = 0; i < bp->b_npages; i++) { if (bp->b_pages[i]->dirty) break; } boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); for (i = bp->b_npages - 1; i >= 0; --i) { if (bp->b_pages[i]->dirty) { break; } } eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); vm_page_unlock_queues(); /* * Fit it to the buffer. */ if (eoffset > bp->b_bcount) eoffset = bp->b_bcount; /* * If we have a good dirty range, merge with the existing * dirty range. */ if (boffset < eoffset) { if (bp->b_dirtyoff > boffset) bp->b_dirtyoff = boffset; if (bp->b_dirtyend < eoffset) bp->b_dirtyend = eoffset; } } VM_OBJECT_UNLOCK(object); } /* * getblk: * * Get a block given a specified block and offset into a file/device. * The buffers B_DONE bit will be cleared on return, making it almost * ready for an I/O initiation. B_INVAL may or may not be set on * return. The caller should clear B_INVAL prior to initiating a * READ. * * For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for * an existing buffer. * * For a VMIO buffer, B_CACHE is modified according to the backing VM. * If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set * and then cleared based on the backing VM. If the previous buffer is * non-0-sized but invalid, B_CACHE will be cleared. * * If getblk() must create a new buffer, the new buffer is returned with * both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which * case it is returned with B_INVAL clear and B_CACHE set based on the * backing VM. * * getblk() also forces a bwrite() for any B_DELWRI buffer whos * B_CACHE bit is clear. * * What this means, basically, is that the caller should use B_CACHE to * determine whether the buffer is fully valid or not and should clear * B_INVAL prior to issuing a read. If the caller intends to validate * the buffer by loading its data area with something, the caller needs * to clear B_INVAL. If the caller does this without issuing an I/O, * the caller should set B_CACHE ( as an optimization ), else the caller * should issue the I/O and biodone() will set B_CACHE if the I/O was * a write attempt or if it was a successfull read. If the caller * intends to issue a READ, the caller must clear B_INVAL and BIO_ERROR * prior to issuing the READ. biodone() will *not* clear B_INVAL. */ struct buf * getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo, int flags) { struct buf *bp; struct bufobj *bo; int s; int error; ASSERT_VOP_LOCKED(vp, "getblk"); struct vm_object *vmo; if (size > MAXBSIZE) panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE); bo = &vp->v_bufobj; s = splbio(); loop: /* * Block if we are low on buffers. Certain processes are allowed * to completely exhaust the buffer cache. * * If this check ever becomes a bottleneck it may be better to * move it into the else, when gbincore() fails. At the moment * it isn't a problem. * * XXX remove if 0 sections (clean this up after its proven) */ if (numfreebuffers == 0) { if (curthread == PCPU_GET(idlethread)) return NULL; mtx_lock(&nblock); needsbuffer |= VFS_BIO_NEED_ANY; mtx_unlock(&nblock); } VI_LOCK(vp); bp = gbincore(bo, blkno); if (bp != NULL) { int lockflags; /* * Buffer is in-core. If the buffer is not busy, it must * be on a queue. */ lockflags = LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK; if (flags & GB_LOCK_NOWAIT) lockflags |= LK_NOWAIT; error = BUF_TIMELOCK(bp, lockflags, VI_MTX(vp), "getblk", slpflag, slptimeo); /* * If we slept and got the lock we have to restart in case * the buffer changed identities. */ if (error == ENOLCK) goto loop; /* We timed out or were interrupted. */ else if (error) return (NULL); /* * The buffer is locked. B_CACHE is cleared if the buffer is * invalid. Otherwise, for a non-VMIO buffer, B_CACHE is set * and for a VMIO buffer B_CACHE is adjusted according to the * backing VM cache. */ if (bp->b_flags & B_INVAL) bp->b_flags &= ~B_CACHE; else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0) bp->b_flags |= B_CACHE; bremfree(bp); /* * check for size inconsistancies for non-VMIO case. */ if (bp->b_bcount != size) { if ((bp->b_flags & B_VMIO) == 0 || (size > bp->b_kvasize)) { if (bp->b_flags & B_DELWRI) { bp->b_flags |= B_NOCACHE; bwrite(bp); } else { if ((bp->b_flags & B_VMIO) && (LIST_FIRST(&bp->b_dep) == NULL)) { bp->b_flags |= B_RELBUF; brelse(bp); } else { bp->b_flags |= B_NOCACHE; bwrite(bp); } } goto loop; } } /* * If the size is inconsistant in the VMIO case, we can resize * the buffer. This might lead to B_CACHE getting set or * cleared. If the size has not changed, B_CACHE remains * unchanged from its previous state. */ if (bp->b_bcount != size) allocbuf(bp, size); KASSERT(bp->b_offset != NOOFFSET, ("getblk: no buffer offset")); /* * A buffer with B_DELWRI set and B_CACHE clear must * be committed before we can return the buffer in * order to prevent the caller from issuing a read * ( due to B_CACHE not being set ) and overwriting * it. * * Most callers, including NFS and FFS, need this to * operate properly either because they assume they * can issue a read if B_CACHE is not set, or because * ( for example ) an uncached B_DELWRI might loop due * to softupdates re-dirtying the buffer. In the latter * case, B_CACHE is set after the first write completes, * preventing further loops. * NOTE! b*write() sets B_CACHE. If we cleared B_CACHE * above while extending the buffer, we cannot allow the * buffer to remain with B_CACHE set after the write * completes or it will represent a corrupt state. To * deal with this we set B_NOCACHE to scrap the buffer * after the write. * * We might be able to do something fancy, like setting * B_CACHE in bwrite() except if B_DELWRI is already set, * so the below call doesn't set B_CACHE, but that gets real * confusing. This is much easier. */ if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) { bp->b_flags |= B_NOCACHE; bwrite(bp); goto loop; } splx(s); bp->b_flags &= ~B_DONE; } else { int bsize, maxsize, vmio; off_t offset; /* * Buffer is not in-core, create new buffer. The buffer * returned by getnewbuf() is locked. Note that the returned * buffer is also considered valid (not marked B_INVAL). */ VI_UNLOCK(vp); /* * If the user does not want us to create the buffer, bail out * here. */ if (flags & GB_NOCREAT) { splx(s); return NULL; } bsize = bo->bo_bsize; offset = blkno * bsize; vmio = (VOP_GETVOBJECT(vp, NULL) == 0) && (vp->v_vflag & VV_OBJBUF); maxsize = vmio ? size + (offset & PAGE_MASK) : size; maxsize = imax(maxsize, bsize); bp = getnewbuf(slpflag, slptimeo, size, maxsize); if (bp == NULL) { if (slpflag || slptimeo) { splx(s); return NULL; } goto loop; } /* * This code is used to make sure that a buffer is not * created while the getnewbuf routine is blocked. * This can be a problem whether the vnode is locked or not. * If the buffer is created out from under us, we have to * throw away the one we just created. There is now window * race because we are safely running at splbio() from the * point of the duplicate buffer creation through to here, * and we've locked the buffer. * * Note: this must occur before we associate the buffer * with the vp especially considering limitations in * the splay tree implementation when dealing with duplicate * lblkno's. */ BO_LOCK(bo); if (gbincore(bo, blkno)) { BO_UNLOCK(bo); bp->b_flags |= B_INVAL; brelse(bp); goto loop; } /* * Insert the buffer into the hash, so that it can * be found by incore. */ bp->b_blkno = bp->b_lblkno = blkno; bp->b_offset = offset; bgetvp(vp, bp); BO_UNLOCK(bo); /* * set B_VMIO bit. allocbuf() the buffer bigger. Since the * buffer size starts out as 0, B_CACHE will be set by * allocbuf() for the VMIO case prior to it testing the * backing store for validity. */ if (vmio) { bp->b_flags |= B_VMIO; #if defined(VFS_BIO_DEBUG) if (vn_canvmio(vp) != TRUE) printf("getblk: VMIO on vnode type %d\n", vp->v_type); #endif VOP_GETVOBJECT(vp, &vmo); KASSERT(vmo == bp->b_object, ("ARGH! different b_object %p %p %p\n", bp, vmo, bp->b_object)); } else { bp->b_flags &= ~B_VMIO; KASSERT(bp->b_object == NULL, ("ARGH! has b_object %p %p\n", bp, bp->b_object)); } allocbuf(bp, size); splx(s); bp->b_flags &= ~B_DONE; } KASSERT(BUF_REFCNT(bp) == 1, ("getblk: bp %p not locked",bp)); KASSERT(bp->b_bufobj == bo, ("wrong b_bufobj %p should be %p", bp->b_bufobj, bo)); return (bp); } /* * Get an empty, disassociated buffer of given size. The buffer is initially * set to B_INVAL. */ struct buf * geteblk(int size) { struct buf *bp; int s; int maxsize; maxsize = (size + BKVAMASK) & ~BKVAMASK; s = splbio(); while ((bp = getnewbuf(0, 0, size, maxsize)) == 0) continue; splx(s); allocbuf(bp, size); bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */ KASSERT(BUF_REFCNT(bp) == 1, ("geteblk: bp %p not locked",bp)); return (bp); } /* * This code constitutes the buffer memory from either anonymous system * memory (in the case of non-VMIO operations) or from an associated * VM object (in the case of VMIO operations). This code is able to * resize a buffer up or down. * * Note that this code is tricky, and has many complications to resolve * deadlock or inconsistant data situations. Tread lightly!!! * There are B_CACHE and B_DELWRI interactions that must be dealt with by * the caller. Calling this code willy nilly can result in the loss of data. * * allocbuf() only adjusts B_CACHE for VMIO buffers. getblk() deals with * B_CACHE for the non-VMIO case. */ int allocbuf(struct buf *bp, int size) { int newbsize, mbsize; int i; GIANT_REQUIRED; if (BUF_REFCNT(bp) == 0) panic("allocbuf: buffer not busy"); if (bp->b_kvasize < size) panic("allocbuf: buffer too small"); if ((bp->b_flags & B_VMIO) == 0) { caddr_t origbuf; int origbufsize; /* * Just get anonymous memory from the kernel. Don't * mess with B_CACHE. */ mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); if (bp->b_flags & B_MALLOC) newbsize = mbsize; else newbsize = round_page(size); if (newbsize < bp->b_bufsize) { /* * malloced buffers are not shrunk */ if (bp->b_flags & B_MALLOC) { if (newbsize) { bp->b_bcount = size; } else { free(bp->b_data, M_BIOBUF); if (bp->b_bufsize) { atomic_subtract_int( &bufmallocspace, bp->b_bufsize); bufspacewakeup(); bp->b_bufsize = 0; } bp->b_saveaddr = bp->b_kvabase; bp->b_data = bp->b_saveaddr; bp->b_bcount = 0; bp->b_flags &= ~B_MALLOC; } return 1; } vm_hold_free_pages( bp, (vm_offset_t) bp->b_data + newbsize, (vm_offset_t) bp->b_data + bp->b_bufsize); } else if (newbsize > bp->b_bufsize) { /* * We only use malloced memory on the first allocation. * and revert to page-allocated memory when the buffer * grows. */ /* * There is a potential smp race here that could lead * to bufmallocspace slightly passing the max. It * is probably extremely rare and not worth worrying * over. */ if ( (bufmallocspace < maxbufmallocspace) && (bp->b_bufsize == 0) && (mbsize <= PAGE_SIZE/2)) { bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK); bp->b_bufsize = mbsize; bp->b_bcount = size; bp->b_flags |= B_MALLOC; atomic_add_int(&bufmallocspace, mbsize); return 1; } origbuf = NULL; origbufsize = 0; /* * If the buffer is growing on its other-than-first allocation, * then we revert to the page-allocation scheme. */ if (bp->b_flags & B_MALLOC) { origbuf = bp->b_data; origbufsize = bp->b_bufsize; bp->b_data = bp->b_kvabase; if (bp->b_bufsize) { atomic_subtract_int(&bufmallocspace, bp->b_bufsize); bufspacewakeup(); bp->b_bufsize = 0; } bp->b_flags &= ~B_MALLOC; newbsize = round_page(newbsize); } vm_hold_load_pages( bp, (vm_offset_t) bp->b_data + bp->b_bufsize, (vm_offset_t) bp->b_data + newbsize); if (origbuf) { bcopy(origbuf, bp->b_data, origbufsize); free(origbuf, M_BIOBUF); } } } else { int desiredpages; newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); desiredpages = (size == 0) ? 0 : num_pages((bp->b_offset & PAGE_MASK) + newbsize); if (bp->b_flags & B_MALLOC) panic("allocbuf: VMIO buffer can't be malloced"); /* * Set B_CACHE initially if buffer is 0 length or will become * 0-length. */ if (size == 0 || bp->b_bufsize == 0) bp->b_flags |= B_CACHE; if (newbsize < bp->b_bufsize) { /* * DEV_BSIZE aligned new buffer size is less then the * DEV_BSIZE aligned existing buffer size. Figure out * if we have to remove any pages. */ if (desiredpages < bp->b_npages) { vm_page_t m; VM_OBJECT_LOCK(bp->b_bufobj->bo_object); vm_page_lock_queues(); for (i = desiredpages; i < bp->b_npages; i++) { /* * the page is not freed here -- it * is the responsibility of * vnode_pager_setsize */ m = bp->b_pages[i]; KASSERT(m != bogus_page, ("allocbuf: bogus page found")); while (vm_page_sleep_if_busy(m, TRUE, "biodep")) vm_page_lock_queues(); bp->b_pages[i] = NULL; vm_page_unwire(m, 0); } vm_page_unlock_queues(); VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object); pmap_qremove((vm_offset_t) trunc_page((vm_offset_t)bp->b_data) + (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages)); bp->b_npages = desiredpages; } } else if (size > bp->b_bcount) { /* * We are growing the buffer, possibly in a * byte-granular fashion. */ struct vnode *vp; vm_object_t obj; vm_offset_t toff; vm_offset_t tinc; /* * Step 1, bring in the VM pages from the object, * allocating them if necessary. We must clear * B_CACHE if these pages are not valid for the * range covered by the buffer. */ vp = bp->b_vp; obj = bp->b_object; VM_OBJECT_LOCK(obj); while (bp->b_npages < desiredpages) { vm_page_t m; vm_pindex_t pi; pi = OFF_TO_IDX(bp->b_offset) + bp->b_npages; if ((m = vm_page_lookup(obj, pi)) == NULL) { /* * note: must allocate system pages * since blocking here could intefere * with paging I/O, no matter which * process we are. */ m = vm_page_alloc(obj, pi, VM_ALLOC_NOBUSY | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED); if (m == NULL) { atomic_add_int(&vm_pageout_deficit, desiredpages - bp->b_npages); VM_OBJECT_UNLOCK(obj); VM_WAIT; VM_OBJECT_LOCK(obj); } else { bp->b_flags &= ~B_CACHE; bp->b_pages[bp->b_npages] = m; ++bp->b_npages; } continue; } /* * We found a page. If we have to sleep on it, * retry because it might have gotten freed out * from under us. * * We can only test PG_BUSY here. Blocking on * m->busy might lead to a deadlock: * * vm_fault->getpages->cluster_read->allocbuf * */ vm_page_lock_queues(); if (vm_page_sleep_if_busy(m, FALSE, "pgtblk")) continue; /* * We have a good page. Should we wakeup the * page daemon? */ if ((curproc != pageproc) && ((m->queue - m->pc) == PQ_CACHE) && ((cnt.v_free_count + cnt.v_cache_count) < (cnt.v_free_min + cnt.v_cache_min))) { pagedaemon_wakeup(); } vm_page_wire(m); vm_page_unlock_queues(); bp->b_pages[bp->b_npages] = m; ++bp->b_npages; } /* * Step 2. We've loaded the pages into the buffer, * we have to figure out if we can still have B_CACHE * set. Note that B_CACHE is set according to the * byte-granular range ( bcount and size ), new the * aligned range ( newbsize ). * * The VM test is against m->valid, which is DEV_BSIZE * aligned. Needless to say, the validity of the data * needs to also be DEV_BSIZE aligned. Note that this * fails with NFS if the server or some other client * extends the file's EOF. If our buffer is resized, * B_CACHE may remain set! XXX */ toff = bp->b_bcount; tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK); while ((bp->b_flags & B_CACHE) && toff < size) { vm_pindex_t pi; if (tinc > (size - toff)) tinc = size - toff; pi = ((bp->b_offset & PAGE_MASK) + toff) >> PAGE_SHIFT; vfs_buf_test_cache( bp, bp->b_offset, toff, tinc, bp->b_pages[pi] ); toff += tinc; tinc = PAGE_SIZE; } VM_OBJECT_UNLOCK(obj); /* * Step 3, fixup the KVM pmap. Remember that * bp->b_data is relative to bp->b_offset, but * bp->b_offset may be offset into the first page. */ bp->b_data = (caddr_t) trunc_page((vm_offset_t)bp->b_data); pmap_qenter( (vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages ); bp->b_data = (caddr_t)((vm_offset_t)bp->b_data | (vm_offset_t)(bp->b_offset & PAGE_MASK)); } } if (newbsize < bp->b_bufsize) bufspacewakeup(); bp->b_bufsize = newbsize; /* actual buffer allocation */ bp->b_bcount = size; /* requested buffer size */ return 1; } void biodone(struct bio *bp) { mtx_lock(&bdonelock); bp->bio_flags |= BIO_DONE; if (bp->bio_done == NULL) wakeup(bp); mtx_unlock(&bdonelock); if (bp->bio_done != NULL) bp->bio_done(bp); } /* * Wait for a BIO to finish. * * XXX: resort to a timeout for now. The optimal locking (if any) for this * case is not yet clear. */ int biowait(struct bio *bp, const char *wchan) { mtx_lock(&bdonelock); while ((bp->bio_flags & BIO_DONE) == 0) msleep(bp, &bdonelock, PRIBIO, wchan, hz / 10); mtx_unlock(&bdonelock); if (bp->bio_error != 0) return (bp->bio_error); if (!(bp->bio_flags & BIO_ERROR)) return (0); return (EIO); } void biofinish(struct bio *bp, struct devstat *stat, int error) { if (error) { bp->bio_error = error; bp->bio_flags |= BIO_ERROR; } if (stat != NULL) devstat_end_transaction_bio(stat, bp); biodone(bp); } /* * bufwait: * * Wait for buffer I/O completion, returning error status. The buffer * is left locked and B_DONE on return. B_EINTR is converted into an EINTR * error and cleared. */ int bufwait(struct buf *bp) { int s; s = splbio(); if (bp->b_iocmd == BIO_READ) bwait(bp, PRIBIO, "biord"); else bwait(bp, PRIBIO, "biowr"); splx(s); if (bp->b_flags & B_EINTR) { bp->b_flags &= ~B_EINTR; return (EINTR); } if (bp->b_ioflags & BIO_ERROR) { return (bp->b_error ? bp->b_error : EIO); } else { return (0); } } /* * Call back function from struct bio back up to struct buf. */ static void -bufdonebio(struct bio *bp) +bufdonebio(struct bio *bip) { + struct buf *bp; /* Device drivers may or may not hold giant, hold it here. */ mtx_lock(&Giant); - bufdone(bp->bio_caller2); + bp = bip->bio_caller2; + bp->b_resid = bp->b_bcount - bip->bio_completed; + bp->b_resid = bip->bio_resid; /* XXX: remove */ + bp->b_ioflags = bip->bio_flags; + bp->b_error = bip->bio_error; + if (bp->b_error) + bp->b_ioflags |= BIO_ERROR; + bufdone(bp); mtx_unlock(&Giant); + g_destroy_bio(bip); } void -dev_strategy(struct buf *bp) +dev_strategy(struct cdev *dev, struct buf *bp) { struct cdevsw *csw; - struct cdev *dev; + struct bio *bip; if ((!bp->b_iocmd) || (bp->b_iocmd & (bp->b_iocmd - 1))) panic("b_iocmd botch"); - bp->b_io.bio_done = bufdonebio; - bp->b_io.bio_caller2 = bp; - dev = bp->b_io.bio_dev; + for (;;) { + bip = g_new_bio(); + if (bip != NULL) + break; + /* Try again later */ + tsleep(&bp, PRIBIO, "dev_strat", hz/10); + } + bip->bio_cmd = bp->b_iocmd; + bip->bio_offset = bp->b_iooffset; + bip->bio_length = bp->b_bcount; + bip->bio_bcount = bp->b_bcount; /* XXX: remove */ + bip->bio_data = bp->b_data; + bip->bio_done = bufdonebio; + bip->bio_caller2 = bp; + bip->bio_dev = dev; + KASSERT(dev->si_refcount > 0, ("dev_strategy on un-referenced struct cdev *(%s)", devtoname(dev))); csw = dev_refthread(dev); if (csw == NULL) { bp->b_error = ENXIO; bp->b_ioflags = BIO_ERROR; mtx_lock(&Giant); /* XXX: too defensive ? */ bufdone(bp); mtx_unlock(&Giant); /* XXX: too defensive ? */ return; } - (*csw->d_strategy)(&bp->b_io); + (*csw->d_strategy)(bip); dev_relthread(dev); } /* * bufdone: * * Finish I/O on a buffer, optionally calling a completion function. * This is usually called from an interrupt so process blocking is * not allowed. * * biodone is also responsible for setting B_CACHE in a B_VMIO bp. * In a non-VMIO bp, B_CACHE will be set on the next getblk() * assuming B_INVAL is clear. * * For the VMIO case, we set B_CACHE if the op was a read and no * read error occured, or if the op was a write. B_CACHE is never * set if the buffer is invalid or otherwise uncacheable. * * biodone does not mess with B_INVAL, allowing the I/O routine or the * initiator to leave B_INVAL set to brelse the buffer out of existance * in the biodone routine. */ void bufdone(struct buf *bp) { int s; void (*biodone)(struct buf *); s = splbio(); KASSERT(BUF_REFCNT(bp) > 0, ("biodone: bp %p not busy %d", bp, BUF_REFCNT(bp))); KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp)); bp->b_flags |= B_DONE; runningbufwakeup(bp); if (bp->b_iocmd == BIO_WRITE && bp->b_bufobj != NULL) bufobj_wdrop(bp->b_bufobj); /* call optional completion function if requested */ if (bp->b_iodone != NULL) { biodone = bp->b_iodone; bp->b_iodone = NULL; (*biodone) (bp); splx(s); return; } if (LIST_FIRST(&bp->b_dep) != NULL) buf_complete(bp); if (bp->b_flags & B_VMIO) { int i; vm_ooffset_t foff; vm_page_t m; vm_object_t obj; int iosize; struct vnode *vp = bp->b_vp; obj = bp->b_object; #if defined(VFS_BIO_DEBUG) mp_fixme("usecount and vflag accessed without locks."); if (vp->v_usecount == 0) { panic("biodone: zero vnode ref count"); } if ((vp->v_vflag & VV_OBJBUF) == 0) { panic("biodone: vnode is not setup for merged cache"); } #endif foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("biodone: no buffer offset")); VM_OBJECT_LOCK(obj); #if defined(VFS_BIO_DEBUG) if (obj->paging_in_progress < bp->b_npages) { printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n", obj->paging_in_progress, bp->b_npages); } #endif /* * Set B_CACHE if the op was a normal read and no error * occured. B_CACHE is set for writes in the b*write() * routines. */ iosize = bp->b_bcount - bp->b_resid; if (bp->b_iocmd == BIO_READ && !(bp->b_flags & (B_INVAL|B_NOCACHE)) && !(bp->b_ioflags & BIO_ERROR)) { bp->b_flags |= B_CACHE; } vm_page_lock_queues(); for (i = 0; i < bp->b_npages; i++) { int bogusflag = 0; int resid; resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff; if (resid > iosize) resid = iosize; /* * cleanup bogus pages, restoring the originals */ m = bp->b_pages[i]; if (m == bogus_page) { bogusflag = 1; m = vm_page_lookup(obj, OFF_TO_IDX(foff)); if (m == NULL) panic("biodone: page disappeared!"); bp->b_pages[i] = m; pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } #if defined(VFS_BIO_DEBUG) if (OFF_TO_IDX(foff) != m->pindex) { printf( "biodone: foff(%jd)/m->pindex(%ju) mismatch\n", (intmax_t)foff, (uintmax_t)m->pindex); } #endif /* * In the write case, the valid and clean bits are * already changed correctly ( see bdwrite() ), so we * only need to do this here in the read case. */ if ((bp->b_iocmd == BIO_READ) && !bogusflag && resid > 0) { vfs_page_set_valid(bp, foff, i, m); } /* * when debugging new filesystems or buffer I/O methods, this * is the most common error that pops up. if you see this, you * have not set the page busy flag correctly!!! */ if (m->busy == 0) { printf("biodone: page busy < 0, " "pindex: %d, foff: 0x(%x,%x), " "resid: %d, index: %d\n", (int) m->pindex, (int)(foff >> 32), (int) foff & 0xffffffff, resid, i); if (!vn_isdisk(vp, NULL)) printf(" iosize: %jd, lblkno: %jd, flags: 0x%x, npages: %d\n", (intmax_t)bp->b_vp->v_mount->mnt_stat.f_iosize, (intmax_t) bp->b_lblkno, bp->b_flags, bp->b_npages); else printf(" VDEV, lblkno: %jd, flags: 0x%x, npages: %d\n", (intmax_t) bp->b_lblkno, bp->b_flags, bp->b_npages); printf(" valid: 0x%lx, dirty: 0x%lx, wired: %d\n", (u_long)m->valid, (u_long)m->dirty, m->wire_count); panic("biodone: page busy < 0\n"); } vm_page_io_finish(m); vm_object_pip_subtract(obj, 1); foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; iosize -= resid; } vm_page_unlock_queues(); vm_object_pip_wakeupn(obj, 0); VM_OBJECT_UNLOCK(obj); } /* * For asynchronous completions, release the buffer now. The brelse * will do a wakeup there if necessary - so no need to do a wakeup * here in the async case. The sync case always needs to do a wakeup. */ if (bp->b_flags & B_ASYNC) { if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_RELBUF)) || (bp->b_ioflags & BIO_ERROR)) brelse(bp); else bqrelse(bp); } else { bdone(bp); } splx(s); } /* * This routine is called in lieu of iodone in the case of * incomplete I/O. This keeps the busy status for pages * consistant. */ void vfs_unbusy_pages(struct buf *bp) { int i; vm_object_t obj; vm_page_t m; runningbufwakeup(bp); if (!(bp->b_flags & B_VMIO)) return; obj = bp->b_object; VM_OBJECT_LOCK(obj); vm_page_lock_queues(); for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if (m == bogus_page) { m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i); if (!m) panic("vfs_unbusy_pages: page missing\n"); bp->b_pages[i] = m; pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } vm_object_pip_subtract(obj, 1); vm_page_io_finish(m); } vm_page_unlock_queues(); vm_object_pip_wakeupn(obj, 0); VM_OBJECT_UNLOCK(obj); } /* * vfs_page_set_valid: * * Set the valid bits in a page based on the supplied offset. The * range is restricted to the buffer's size. * * This routine is typically called after a read completes. */ static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m) { vm_ooffset_t soff, eoff; mtx_assert(&vm_page_queue_mtx, MA_OWNED); /* * Start and end offsets in buffer. eoff - soff may not cross a * page boundry or cross the end of the buffer. The end of the * buffer, in this case, is our file EOF, not the allocation size * of the buffer. */ soff = off; eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK; if (eoff > bp->b_offset + bp->b_bcount) eoff = bp->b_offset + bp->b_bcount; /* * Set valid range. This is typically the entire buffer and thus the * entire page. */ if (eoff > soff) { vm_page_set_validclean( m, (vm_offset_t) (soff & PAGE_MASK), (vm_offset_t) (eoff - soff) ); } } /* * This routine is called before a device strategy routine. * It is used to tell the VM system that paging I/O is in * progress, and treat the pages associated with the buffer * almost as being PG_BUSY. Also the object paging_in_progress * flag is handled to make sure that the object doesn't become * inconsistant. * * Since I/O has not been initiated yet, certain buffer flags * such as BIO_ERROR or B_INVAL may be in an inconsistant state * and should be ignored. */ void vfs_busy_pages(struct buf *bp, int clear_modify) { int i, bogus; vm_object_t obj; vm_ooffset_t foff; vm_page_t m; if (!(bp->b_flags & B_VMIO)) return; obj = bp->b_object; foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("vfs_busy_pages: no buffer offset")); vfs_setdirty(bp); VM_OBJECT_LOCK(obj); retry: vm_page_lock_queues(); for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if (vm_page_sleep_if_busy(m, FALSE, "vbpage")) goto retry; } bogus = 0; for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if ((bp->b_flags & B_CLUSTER) == 0) { vm_object_pip_add(obj, 1); vm_page_io_start(m); } /* * When readying a buffer for a read ( i.e * clear_modify == 0 ), it is important to do * bogus_page replacement for valid pages in * partially instantiated buffers. Partially * instantiated buffers can, in turn, occur when * reconstituting a buffer from its VM backing store * base. We only have to do this if B_CACHE is * clear ( which causes the I/O to occur in the * first place ). The replacement prevents the read * I/O from overwriting potentially dirty VM-backed * pages. XXX bogus page replacement is, uh, bogus. * It may not work properly with small-block devices. * We need to find a better way. */ pmap_remove_all(m); if (clear_modify) vfs_page_set_valid(bp, foff, i, m); else if (m->valid == VM_PAGE_BITS_ALL && (bp->b_flags & B_CACHE) == 0) { bp->b_pages[i] = bogus_page; bogus++; } foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; } vm_page_unlock_queues(); VM_OBJECT_UNLOCK(obj); if (bogus) pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } /* * Tell the VM system that the pages associated with this buffer * are clean. This is used for delayed writes where the data is * going to go to disk eventually without additional VM intevention. * * Note that while we only really need to clean through to b_bcount, we * just go ahead and clean through to b_bufsize. */ static void vfs_clean_pages(struct buf *bp) { int i; vm_ooffset_t foff, noff, eoff; vm_page_t m; if (!(bp->b_flags & B_VMIO)) return; foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("vfs_clean_pages: no buffer offset")); VM_OBJECT_LOCK(bp->b_bufobj->bo_object); vm_page_lock_queues(); for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; eoff = noff; if (eoff > bp->b_offset + bp->b_bufsize) eoff = bp->b_offset + bp->b_bufsize; vfs_page_set_valid(bp, foff, i, m); /* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */ foff = noff; } vm_page_unlock_queues(); VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object); } /* * vfs_bio_set_validclean: * * Set the range within the buffer to valid and clean. The range is * relative to the beginning of the buffer, b_offset. Note that b_offset * itself may be offset from the beginning of the first page. * */ void vfs_bio_set_validclean(struct buf *bp, int base, int size) { int i, n; vm_page_t m; if (!(bp->b_flags & B_VMIO)) return; /* * Fixup base to be relative to beginning of first page. * Set initial n to be the maximum number of bytes in the * first page that can be validated. */ base += (bp->b_offset & PAGE_MASK); n = PAGE_SIZE - (base & PAGE_MASK); VM_OBJECT_LOCK(bp->b_bufobj->bo_object); vm_page_lock_queues(); for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) { m = bp->b_pages[i]; if (n > size) n = size; vm_page_set_validclean(m, base & PAGE_MASK, n); base += n; size -= n; n = PAGE_SIZE; } vm_page_unlock_queues(); VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object); } /* * vfs_bio_clrbuf: * * clear a buffer. This routine essentially fakes an I/O, so we need * to clear BIO_ERROR and B_INVAL. * * Note that while we only theoretically need to clear through b_bcount, * we go ahead and clear through b_bufsize. */ void vfs_bio_clrbuf(struct buf *bp) { int i, j, mask = 0; caddr_t sa, ea; GIANT_REQUIRED; if ((bp->b_flags & (B_VMIO | B_MALLOC)) != B_VMIO) { clrbuf(bp); return; } bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; VM_OBJECT_LOCK(bp->b_bufobj->bo_object); if( (bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) && (bp->b_offset & PAGE_MASK) == 0) { if (bp->b_pages[0] == bogus_page) goto unlock; mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1; VM_OBJECT_LOCK_ASSERT(bp->b_pages[0]->object, MA_OWNED); if ((bp->b_pages[0]->valid & mask) == mask) goto unlock; if (((bp->b_pages[0]->flags & PG_ZERO) == 0) && ((bp->b_pages[0]->valid & mask) == 0)) { bzero(bp->b_data, bp->b_bufsize); bp->b_pages[0]->valid |= mask; goto unlock; } } ea = sa = bp->b_data; for(i = 0; i < bp->b_npages; i++, sa = ea) { ea = (caddr_t)trunc_page((vm_offset_t)sa + PAGE_SIZE); ea = (caddr_t)(vm_offset_t)ulmin( (u_long)(vm_offset_t)ea, (u_long)(vm_offset_t)bp->b_data + bp->b_bufsize); if (bp->b_pages[i] == bogus_page) continue; j = ((vm_offset_t)sa & PAGE_MASK) / DEV_BSIZE; mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j; VM_OBJECT_LOCK_ASSERT(bp->b_pages[i]->object, MA_OWNED); if ((bp->b_pages[i]->valid & mask) == mask) continue; if ((bp->b_pages[i]->valid & mask) == 0) { if ((bp->b_pages[i]->flags & PG_ZERO) == 0) bzero(sa, ea - sa); } else { for (; sa < ea; sa += DEV_BSIZE, j++) { if (((bp->b_pages[i]->flags & PG_ZERO) == 0) && (bp->b_pages[i]->valid & (1<b_pages[i]->valid |= mask; } unlock: VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object); bp->b_resid = 0; } /* * vm_hold_load_pages and vm_hold_free_pages get pages into * a buffers address space. The pages are anonymous and are * not associated with a file object. */ static void vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to) { vm_offset_t pg; vm_page_t p; int index; to = round_page(to); from = round_page(from); index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; VM_OBJECT_LOCK(kernel_object); for (pg = from; pg < to; pg += PAGE_SIZE, index++) { tryagain: /* * note: must allocate system pages since blocking here * could intefere with paging I/O, no matter which * process we are. */ p = vm_page_alloc(kernel_object, ((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT), VM_ALLOC_NOBUSY | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED); if (!p) { atomic_add_int(&vm_pageout_deficit, (to - pg) >> PAGE_SHIFT); VM_OBJECT_UNLOCK(kernel_object); VM_WAIT; VM_OBJECT_LOCK(kernel_object); goto tryagain; } p->valid = VM_PAGE_BITS_ALL; pmap_qenter(pg, &p, 1); bp->b_pages[index] = p; } VM_OBJECT_UNLOCK(kernel_object); bp->b_npages = index; } /* Return pages associated with this buf to the vm system */ static void vm_hold_free_pages(struct buf *bp, vm_offset_t from, vm_offset_t to) { vm_offset_t pg; vm_page_t p; int index, newnpages; from = round_page(from); to = round_page(to); newnpages = index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; VM_OBJECT_LOCK(kernel_object); for (pg = from; pg < to; pg += PAGE_SIZE, index++) { p = bp->b_pages[index]; if (p && (index < bp->b_npages)) { if (p->busy) { printf( "vm_hold_free_pages: blkno: %jd, lblkno: %jd\n", (intmax_t)bp->b_blkno, (intmax_t)bp->b_lblkno); } bp->b_pages[index] = NULL; pmap_qremove(pg, 1); vm_page_lock_queues(); vm_page_busy(p); vm_page_unwire(p, 0); vm_page_free(p); vm_page_unlock_queues(); } } VM_OBJECT_UNLOCK(kernel_object); bp->b_npages = newnpages; } /* * Map an IO request into kernel virtual address space. * * All requests are (re)mapped into kernel VA space. * Notice that we use b_bufsize for the size of the buffer * to be mapped. b_bcount might be modified by the driver. * * Note that even if the caller determines that the address space should * be valid, a race or a smaller-file mapped into a larger space may * actually cause vmapbuf() to fail, so all callers of vmapbuf() MUST * check the return value. */ int vmapbuf(struct buf *bp) { caddr_t addr, kva; vm_prot_t prot; int pidx, i; struct vm_page *m; struct pmap *pmap = &curproc->p_vmspace->vm_pmap; if (bp->b_bufsize < 0) return (-1); prot = VM_PROT_READ; if (bp->b_iocmd == BIO_READ) prot |= VM_PROT_WRITE; /* Less backwards than it looks */ for (addr = (caddr_t)trunc_page((vm_offset_t)bp->b_data), pidx = 0; addr < bp->b_data + bp->b_bufsize; addr += PAGE_SIZE, pidx++) { /* * Do the vm_fault if needed; do the copy-on-write thing * when reading stuff off device into memory. * * NOTE! Must use pmap_extract() because addr may be in * the userland address space, and kextract is only guarenteed * to work for the kernland address space (see: sparc64 port). */ retry: if (vm_fault_quick(addr >= bp->b_data ? addr : bp->b_data, prot) < 0) { vm_page_lock_queues(); for (i = 0; i < pidx; ++i) { vm_page_unhold(bp->b_pages[i]); bp->b_pages[i] = NULL; } vm_page_unlock_queues(); return(-1); } m = pmap_extract_and_hold(pmap, (vm_offset_t)addr, prot); if (m == NULL) goto retry; bp->b_pages[pidx] = m; } if (pidx > btoc(MAXPHYS)) panic("vmapbuf: mapped more than MAXPHYS"); pmap_qenter((vm_offset_t)bp->b_saveaddr, bp->b_pages, pidx); kva = bp->b_saveaddr; bp->b_npages = pidx; bp->b_saveaddr = bp->b_data; bp->b_data = kva + (((vm_offset_t) bp->b_data) & PAGE_MASK); return(0); } /* * Free the io map PTEs associated with this IO operation. * We also invalidate the TLB entries and restore the original b_addr. */ void vunmapbuf(struct buf *bp) { int pidx; int npages; npages = bp->b_npages; pmap_qremove(trunc_page((vm_offset_t)bp->b_data), npages); vm_page_lock_queues(); for (pidx = 0; pidx < npages; pidx++) vm_page_unhold(bp->b_pages[pidx]); vm_page_unlock_queues(); bp->b_data = bp->b_saveaddr; } void bdone(struct buf *bp) { mtx_lock(&bdonelock); bp->b_flags |= B_DONE; wakeup(bp); mtx_unlock(&bdonelock); } void bwait(struct buf *bp, u_char pri, const char *wchan) { mtx_lock(&bdonelock); while ((bp->b_flags & B_DONE) == 0) msleep(bp, &bdonelock, pri, wchan, 0); mtx_unlock(&bdonelock); } void bufstrategy(struct bufobj *bo, struct buf *bp) { int i = 0; struct vnode *vp; vp = bp->b_vp; #if 0 KASSERT(vp == bo->bo_vnode, ("Inconsistent vnode bufstrategy")); KASSERT(vp->v_type != VCHR && vp->v_type != VBLK, ("Wrong vnode in bufstrategy(bp=%p, vp=%p)", bp, vp)); #endif if (vp->v_type == VCHR) i = VOP_SPECSTRATEGY(vp, bp); else i = VOP_STRATEGY(vp, bp); KASSERT(i == 0, ("VOP_STRATEGY failed bp=%p vp=%p", bp, bp->b_vp)); } void bufobj_wref(struct bufobj *bo) { KASSERT(bo != NULL, ("NULL bo in bufobj_wref")); BO_LOCK(bo); bo->bo_numoutput++; BO_UNLOCK(bo); } void bufobj_wdrop(struct bufobj *bo) { KASSERT(bo != NULL, ("NULL bo in bufobj_wdrop")); BO_LOCK(bo); KASSERT(bo->bo_numoutput > 0, ("bufobj_wdrop non-positive count")); if ((--bo->bo_numoutput == 0) && (bo->bo_flag & BO_WWAIT)) { bo->bo_flag &= ~BO_WWAIT; wakeup(&bo->bo_numoutput); } BO_UNLOCK(bo); } int bufobj_wwait(struct bufobj *bo, int slpflag, int timeo) { int error; KASSERT(bo != NULL, ("NULL bo in bufobj_wwait")); ASSERT_BO_LOCKED(bo); error = 0; while (bo->bo_numoutput) { bo->bo_flag |= BO_WWAIT; error = msleep(&bo->bo_numoutput, BO_MTX(bo), slpflag | (PRIBIO + 1), "bo_wwait", timeo); if (error) break; } return (error); } #include "opt_ddb.h" #ifdef DDB #include /* DDB command to show buffer data */ DB_SHOW_COMMAND(buffer, db_show_buffer) { /* get args */ struct buf *bp = (struct buf *)addr; if (!have_addr) { db_printf("usage: show buffer \n"); return; } db_printf("b_flags = 0x%b\n", (u_int)bp->b_flags, PRINT_BUF_FLAGS); db_printf( "b_error = %d, b_bufsize = %ld, b_bcount = %ld, b_resid = %ld\n" "b_dev = (%d,%d), b_data = %p, b_blkno = %jd\n", bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid, major(bp->b_dev), minor(bp->b_dev), bp->b_data, (intmax_t)bp->b_blkno); if (bp->b_npages) { int i; db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages); for (i = 0; i < bp->b_npages; i++) { vm_page_t m; m = bp->b_pages[i]; db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object, (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m)); if ((i + 1) < bp->b_npages) db_printf(","); } db_printf("\n"); } } #endif /* DDB */ Index: head/sys/sys/conf.h =================================================================== --- head/sys/sys/conf.h (revision 137028) +++ head/sys/sys/conf.h (revision 137029) @@ -1,324 +1,324 @@ /*- * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * Copyright (c) 2000 * Poul-Henning Kamp. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)conf.h 8.5 (Berkeley) 1/9/95 * $FreeBSD$ */ #ifndef _SYS_CONF_H_ #define _SYS_CONF_H_ #ifdef _KERNEL #include #else #include #endif struct tty; struct disk; struct vnode; struct buf; struct snapdata; struct cdev { u_int si_flags; #define SI_ALIAS 0x0002 /* carrier of alias name */ #define SI_NAMED 0x0004 /* make_dev{_alias} has been called */ #define SI_CHEAPCLONE 0x0008 /* can be removed_dev'ed when vnode reclaims */ #define SI_CHILD 0x0010 /* child of another struct cdev **/ #define SI_DEVOPEN 0x0020 /* opened by device */ #define SI_CONSOPEN 0x0040 /* opened by console */ #define SI_DUMPDEV 0x0080 /* is kernel dumpdev */ #define SI_CANDELETE 0x0100 /* can do BIO_DELETE */ #define SI_CLONELIST 0x0200 /* on a clone list */ struct timespec si_atime; struct timespec si_ctime; struct timespec si_mtime; dev_t si_udev; int si_refcount; LIST_ENTRY(cdev) si_list; LIST_ENTRY(cdev) si_clone; LIST_ENTRY(cdev) si_hash; SLIST_HEAD(, vnode) si_hlist; LIST_HEAD(, cdev) si_children; LIST_ENTRY(cdev) si_siblings; struct cdev *si_parent; u_int si_inode; char *si_name; void *si_drv1, *si_drv2; struct cdevsw *si_devsw; int si_iosize_max; /* maximum I/O size (for physio &al) */ uid_t si_uid; gid_t si_gid; mode_t si_mode; u_long si_usecount; u_long si_threadcount; union { struct { struct tty *__sit_tty; } __si_tty; struct { struct mount *__sid_mountpoint; int __sid_bsize_phys; /* min physical block size */ struct snapdata *__sid_snapdata; } __si_disk; } __si_u; char __si_namebuf[SPECNAMELEN + 1]; }; #define si_tty __si_u.__si_tty.__sit_tty #define si_mountpoint __si_u.__si_disk.__sid_mountpoint #define si_bsize_phys __si_u.__si_disk.__sid_bsize_phys #define si_snapdata __si_u.__si_disk.__sid_snapdata #ifdef _KERNEL /* * Definitions of device driver entry switches */ struct bio; struct buf; struct thread; struct uio; struct knote; struct clonedevs; /* * Note: d_thread_t is provided as a transition aid for those drivers * that treat struct proc/struct thread as an opaque data type and * exist in substantially the same form in both 4.x and 5.x. Writers * of drivers that dips into the d_thread_t structure should use * struct thread or struct proc as appropriate for the version of the * OS they are using. It is provided in lieu of each device driver * inventing its own way of doing this. While it does violate style(9) * in a number of ways, this violation is deemed to be less * important than the benefits that a uniform API between releases * gives. * * Users of struct thread/struct proc that aren't device drivers should * not use d_thread_t. */ typedef struct thread d_thread_t; typedef int d_open_t(struct cdev *dev, int oflags, int devtype, struct thread *td); typedef int d_fdopen_t(struct cdev *dev, int oflags, struct thread *td, int fdidx); typedef int d_close_t(struct cdev *dev, int fflag, int devtype, struct thread *td); typedef void d_strategy_t(struct bio *bp); typedef int d_ioctl_t(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td); typedef int d_read_t(struct cdev *dev, struct uio *uio, int ioflag); typedef int d_write_t(struct cdev *dev, struct uio *uio, int ioflag); typedef int d_poll_t(struct cdev *dev, int events, struct thread *td); typedef int d_kqfilter_t(struct cdev *dev, struct knote *kn); typedef int d_mmap_t(struct cdev *dev, vm_offset_t offset, vm_paddr_t *paddr, int nprot); typedef void d_purge_t(struct cdev *dev); typedef int d_spare2_t(struct cdev *dev); typedef int dumper_t( void *priv, /* Private to the driver. */ void *virtual, /* Virtual (mapped) address. */ vm_offset_t physical, /* Physical address of virtual. */ off_t offset, /* Byte-offset to write at. */ size_t length); /* Number of bytes to dump. */ #endif /* _KERNEL */ /* * Types for d_flags. */ #define D_TAPE 0x0001 #define D_DISK 0x0002 #define D_TTY 0x0004 #define D_MEM 0x0008 #ifdef _KERNEL #define D_TYPEMASK 0xffff /* * Flags for d_flags which the drivers can set. */ #define D_MEMDISK 0x00010000 /* memory type disk */ #define D_TRACKCLOSE 0x00080000 /* track all closes */ #define D_MMAP_ANON 0x00100000 /* special treatment in vm_mmap.c */ #define D_PSEUDO 0x00200000 /* make_dev() can return NULL */ #define D_NEEDGIANT 0x00400000 /* driver want Giant */ /* * Version numbers. */ #define D_VERSION_00 0x20011966 #define D_VERSION D_VERSION_00 /* * Flags used for internal housekeeping */ #define D_INIT 0x80000000 /* cdevsw initialized */ #define D_ALLOCMAJ 0x40000000 /* major# is allocated */ /* * Character device switch table */ struct cdevsw { int d_version; int d_maj; u_int d_flags; const char *d_name; d_open_t *d_open; d_fdopen_t *d_fdopen; d_close_t *d_close; d_read_t *d_read; d_write_t *d_write; d_ioctl_t *d_ioctl; d_poll_t *d_poll; d_mmap_t *d_mmap; d_strategy_t *d_strategy; dumper_t *d_dump; d_kqfilter_t *d_kqfilter; d_purge_t *d_purge; d_spare2_t *d_spare2; /* These fields should not be messed with by drivers */ LIST_ENTRY(cdevsw) d_list; LIST_HEAD(, cdev) d_devs; int d_spare3; }; #endif /* _KERNEL */ #ifdef _KERNEL #define NUMCDEVSW 256 /* * XXX: do not use MAJOR_AUTO unless you have no choice. In general drivers * should just not initialize .d_maj and that will DTRT. */ #define MAJOR_AUTO 0 /* XXX: Not GM */ struct module; struct devsw_module_data { int (*chainevh)(struct module *, int, void *); /* next handler */ void *chainarg; /* arg for next event handler */ /* Do not initialize fields hereafter */ }; #define DEV_MODULE(name, evh, arg) \ static moduledata_t name##_mod = { \ #name, \ evh, \ arg \ }; \ DECLARE_MODULE(name, name##_mod, SI_SUB_DRIVERS, SI_ORDER_MIDDLE) void clone_setup(struct clonedevs **cdp); void clone_cleanup(struct clonedevs **); #define CLONE_UNITMASK 0xfffff #define CLONE_FLAG0 (CLONE_UNITMASK + 1) int clone_create(struct clonedevs **, struct cdevsw *, int *unit, struct cdev **dev, u_int extra); int count_dev(struct cdev *_dev); void destroy_dev(struct cdev *_dev); struct cdevsw *dev_refthread(struct cdev *_dev); void dev_relthread(struct cdev *_dev); const char *devtoname(struct cdev *_dev); int dev_named(struct cdev *_pdev, const char *_name); void dev_depends(struct cdev *_pdev, struct cdev *_cdev); void dev_ref(struct cdev *dev); void dev_rel(struct vnode *vp); -void dev_strategy(struct buf *bp); +void dev_strategy(struct cdev *dev, struct buf *bp); struct cdev *makebdev(int _maj, int _min); struct cdev *make_dev(struct cdevsw *_devsw, int _minor, uid_t _uid, gid_t _gid, int _perms, const char *_fmt, ...) __printflike(6, 7); struct cdev *make_dev_alias(struct cdev *_pdev, const char *_fmt, ...) __printflike(2, 3); int dev2unit(struct cdev *_dev); void dev_lock(void); void dev_unlock(void); int unit2minor(int _unit); void setconf(void); struct cdev *getdiskbyname(char *_name); void devfs_create(struct cdev *dev); void devfs_destroy(struct cdev *dev); #define UID_ROOT 0 #define UID_BIN 3 #define UID_UUCP 66 #define GID_WHEEL 0 #define GID_KMEM 2 #define GID_OPERATOR 5 #define GID_BIN 7 #define GID_GAMES 13 #define GID_DIALER 68 typedef void (*dev_clone_fn)(void *arg, char *name, int namelen, struct cdev **result); int dev_stdclone(char *_name, char **_namep, const char *_stem, int *_unit); EVENTHANDLER_DECLARE(dev_clone, dev_clone_fn); /* Stuff relating to kernel-dump */ struct dumperinfo { dumper_t *dumper; /* Dumping function. */ void *priv; /* Private parts. */ u_int blocksize; /* Size of block in bytes. */ off_t mediaoffset; /* Initial offset in bytes. */ off_t mediasize; /* Space available in bytes. */ }; int set_dumper(struct dumperinfo *); void dumpsys(struct dumperinfo *); extern int dumping; /* system is dumping */ /* D_TTY related functions */ d_close_t ttyclose; d_ioctl_t ttyioctl; d_kqfilter_t ttykqfilter; d_open_t ttyopen; d_poll_t ttypoll; d_read_t ttyread; d_write_t ttywrite; #endif /* _KERNEL */ #endif /* !_SYS_CONF_H_ */