Index: head/sys/fs/specfs/spec_vnops.c =================================================================== --- head/sys/fs/specfs/spec_vnops.c (revision 130550) +++ head/sys/fs/specfs/spec_vnops.c (revision 130551) @@ -1,852 +1,852 @@ /* * Copyright (c) 1989, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)spec_vnops.c 8.14 (Berkeley) 5/21/95 * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int spec_advlock(struct vop_advlock_args *); static int spec_close(struct vop_close_args *); static int spec_freeblks(struct vop_freeblks_args *); static int spec_fsync(struct vop_fsync_args *); static int spec_getpages(struct vop_getpages_args *); static int spec_ioctl(struct vop_ioctl_args *); static int spec_kqfilter(struct vop_kqfilter_args *); static int spec_open(struct vop_open_args *); static int spec_poll(struct vop_poll_args *); static int spec_print(struct vop_print_args *); static int spec_read(struct vop_read_args *); static int spec_specstrategy(struct vop_specstrategy_args *); static int spec_write(struct vop_write_args *); vop_t **spec_vnodeop_p; static struct vnodeopv_entry_desc spec_vnodeop_entries[] = { { &vop_default_desc, (vop_t *) vop_defaultop }, { &vop_access_desc, (vop_t *) vop_ebadf }, { &vop_advlock_desc, (vop_t *) spec_advlock }, { &vop_bmap_desc, (vop_t *) vop_panic }, { &vop_close_desc, (vop_t *) spec_close }, { &vop_create_desc, (vop_t *) vop_panic }, { &vop_freeblks_desc, (vop_t *) spec_freeblks }, { &vop_fsync_desc, (vop_t *) spec_fsync }, { &vop_getpages_desc, (vop_t *) spec_getpages }, { &vop_getwritemount_desc, (vop_t *) vop_stdgetwritemount }, { &vop_ioctl_desc, (vop_t *) spec_ioctl }, { &vop_kqfilter_desc, (vop_t *) spec_kqfilter }, { &vop_lease_desc, (vop_t *) vop_null }, { &vop_link_desc, (vop_t *) vop_panic }, { &vop_mkdir_desc, (vop_t *) vop_panic }, { &vop_mknod_desc, (vop_t *) vop_panic }, { &vop_open_desc, (vop_t *) spec_open }, { &vop_pathconf_desc, (vop_t *) vop_stdpathconf }, { &vop_poll_desc, (vop_t *) spec_poll }, { &vop_print_desc, (vop_t *) spec_print }, { &vop_read_desc, (vop_t *) spec_read }, { &vop_readdir_desc, (vop_t *) vop_panic }, { &vop_readlink_desc, 
(vop_t *) vop_panic },
	{ &vop_reallocblks_desc,	(vop_t *) vop_panic },
	{ &vop_reclaim_desc,		(vop_t *) vop_null },
	{ &vop_remove_desc,		(vop_t *) vop_panic },
	{ &vop_rename_desc,		(vop_t *) vop_panic },
	{ &vop_rmdir_desc,		(vop_t *) vop_panic },
	{ &vop_setattr_desc,		(vop_t *) vop_ebadf },
	{ &vop_specstrategy_desc,	(vop_t *) spec_specstrategy },
	{ &vop_strategy_desc,		(vop_t *) vop_panic },
	{ &vop_symlink_desc,		(vop_t *) vop_panic },
	{ &vop_write_desc,		(vop_t *) spec_write },
	{ NULL, NULL }
};

/* Pairs the entry table with the spec_vnodeop_p operations vector. */
static struct vnodeopv_desc spec_vnodeop_opv_desc =
	{ &spec_vnodeop_p, spec_vnodeop_entries };

VNODEOP_SET(spec_vnodeop_opv_desc);

/*
 * Dispatch a generic vnode operation through the specfs vector.
 *
 * The slot is selected by the descriptor's vdesc_offset and the argument
 * block is handed to the selected operation unchanged; the return value
 * is whatever that operation returns.
 */
int
spec_vnoperate(ap)
	struct vop_generic_args /* {
		struct vnodeop_desc *a_desc;
	} */ *ap;
{
	return (VOCALL(spec_vnodeop_p, ap->a_desc->vdesc_offset, ap));
}

/*
 * Open a special file.
 *
 * Returns ENXIO when the vnode is a block device (VBLK), when its
 * filesystem is mounted with MNT_NODEV, when the vnode has no device
 * (NODEV), or when no driver / no d_open entry exists for the device.
 * Write opens of disks are subject to the extra mount and securelevel
 * checks that follow.
 */
/* ARGSUSED */
static int
spec_open(ap)
	struct vop_open_args /* {
		struct vnode *a_vp;
		int  a_mode;
		struct ucred *a_cred;
		struct thread *a_td;
	} */ *ap;
{
	struct thread *td = ap->a_td;
	struct vnode *vp = ap->a_vp;
	dev_t dev = vp->v_rdev;
	int error;
	struct cdevsw *dsw;

	if (vp->v_type == VBLK)
		return (ENXIO);

	/* Don't allow open if fs is mounted -nodev. */
	if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_NODEV))
		return (ENXIO);

	if (dev == NODEV)
		return (ENXIO);

	dsw = devsw(dev);
	if (dsw == NULL || dsw->d_open == NULL)
		return (ENXIO);

	/* Make this field valid before any I/O in d_open. */
	if (dev->si_iosize_max == 0)
		dev->si_iosize_max = DFLTPHYS;

	/*
	 * XXX: Disks get special billing here, but it is mostly wrong.
	 * XXX: Disk partitions can overlap and the real checks should
	 * XXX: take this into account, and consequently they need to
	 * XXX: live in the disk slice code.  Some checks do.
	 */
	if (vn_isdisk(vp, NULL) && ap->a_cred != FSCRED &&
	    (ap->a_mode & FWRITE)) {
		/*
		 * Never allow opens for write if the disk is mounted R/W.
*/ if (vp->v_rdev->si_mountpoint != NULL && !(vp->v_rdev->si_mountpoint->mnt_flag & MNT_RDONLY)) return (EBUSY); /* * When running in secure mode, do not allow opens * for writing if the disk is mounted. */ error = securelevel_ge(td->td_ucred, 1); if (error && vfs_mountedon(vp)) return (error); /* * When running in very secure mode, do not allow * opens for writing of any disks. */ error = securelevel_ge(td->td_ucred, 2); if (error) return (error); } /* XXX: Special casing of ttys for deadfs. Probably redundant. */ if (dsw->d_flags & D_TTY) vp->v_vflag |= VV_ISTTY; VOP_UNLOCK(vp, 0, td); dev_ref(dev); cdevsw_ref(dsw); if(!(dsw->d_flags & D_NEEDGIANT)) { DROP_GIANT(); if (dsw->d_fdopen != NULL) error = dsw->d_fdopen(dev, ap->a_mode, td, ap->a_fdidx); else error = dsw->d_open(dev, ap->a_mode, S_IFCHR, td); PICKUP_GIANT(); } else if (dsw->d_fdopen != NULL) error = dsw->d_fdopen(dev, ap->a_mode, td, ap->a_fdidx); else error = dsw->d_open(dev, ap->a_mode, S_IFCHR, td); cdevsw_rel(dsw); if (error != 0) dev_rel(dev); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); if (error) return (error); if (dsw->d_flags & D_TTY) { if (dev->si_tty) { struct tty *tp; tp = dev->si_tty; if (!tp->t_stop) { printf("Warning:%s: no t_stop, using nottystop\n", devtoname(dev)); tp->t_stop = nottystop; } } } if (vn_isdisk(vp, NULL)) { if (!dev->si_bsize_phys) dev->si_bsize_phys = DEV_BSIZE; } return (error); } /* * Vnode op for read */ /* ARGSUSED */ static int spec_read(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { struct vnode *vp; struct thread *td; struct uio *uio; dev_t dev; int error, resid; struct cdevsw *dsw; vp = ap->a_vp; dev = vp->v_rdev; uio = ap->a_uio; td = uio->uio_td; resid = uio->uio_resid; if (resid == 0) return (0); dsw = devsw(dev); VOP_UNLOCK(vp, 0, td); KASSERT(dev->si_refcount > 0, ("specread() on un-referenced dev_t (%s)", devtoname(dev))); cdevsw_ref(dsw); if (!(dsw->d_flags & D_NEEDGIANT)) { DROP_GIANT(); 
error = dsw->d_read(dev, uio, ap->a_ioflag); PICKUP_GIANT(); } else error = dsw->d_read(dev, uio, ap->a_ioflag); cdevsw_rel(dsw); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); if (uio->uio_resid != resid || (error == 0 && resid != 0)) vfs_timestamp(&dev->si_atime); return (error); } /* * Vnode op for write */ /* ARGSUSED */ static int spec_write(ap) struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { struct vnode *vp; struct thread *td; struct uio *uio; dev_t dev; int error, resid; struct cdevsw *dsw; vp = ap->a_vp; dev = vp->v_rdev; dsw = devsw(dev); uio = ap->a_uio; td = uio->uio_td; resid = uio->uio_resid; VOP_UNLOCK(vp, 0, td); KASSERT(dev->si_refcount > 0, ("spec_write() on un-referenced dev_t (%s)", devtoname(dev))); cdevsw_ref(dsw); if (!(dsw->d_flags & D_NEEDGIANT)) { DROP_GIANT(); error = dsw->d_write(dev, uio, ap->a_ioflag); PICKUP_GIANT(); } else error = dsw->d_write(dev, uio, ap->a_ioflag); cdevsw_rel(dsw); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); if (uio->uio_resid != resid || (error == 0 && resid != 0)) { vfs_timestamp(&dev->si_ctime); dev->si_mtime = dev->si_ctime; } return (error); } /* * Device ioctl operation. 
*/ /* ARGSUSED */ static int spec_ioctl(ap) struct vop_ioctl_args /* { struct vnode *a_vp; u_long a_command; caddr_t a_data; int a_fflag; struct ucred *a_cred; struct thread *a_td; } */ *ap; { dev_t dev; int error; struct cdevsw *dsw; dev = ap->a_vp->v_rdev; dsw = devsw(dev); KASSERT(dev->si_refcount > 0, ("spec_ioctl() on un-referenced dev_t (%s)", devtoname(dev))); cdevsw_ref(dsw); if (!(dsw->d_flags & D_NEEDGIANT)) { DROP_GIANT(); error = dsw->d_ioctl(dev, ap->a_command, ap->a_data, ap->a_fflag, ap->a_td); PICKUP_GIANT(); } else error = dsw->d_ioctl(dev, ap->a_command, ap->a_data, ap->a_fflag, ap->a_td); cdevsw_rel(dsw); if (error == ENOIOCTL) error = ENOTTY; return (error); } /* ARGSUSED */ static int spec_poll(ap) struct vop_poll_args /* { struct vnode *a_vp; int a_events; struct ucred *a_cred; struct thread *a_td; } */ *ap; { dev_t dev; struct cdevsw *dsw; int error; dev = ap->a_vp->v_rdev; dsw = devsw(dev); KASSERT(dev->si_refcount > 0, ("spec_poll() on un-referenced dev_t (%s)", devtoname(dev))); cdevsw_ref(dsw); if (!(dsw->d_flags & D_NEEDGIANT)) { /* XXX: not yet DROP_GIANT(); */ error = dsw->d_poll(dev, ap->a_events, ap->a_td); /* XXX: not yet PICKUP_GIANT(); */ } else error = dsw->d_poll(dev, ap->a_events, ap->a_td); cdevsw_rel(dsw); return(error); } /* ARGSUSED */ static int spec_kqfilter(ap) struct vop_kqfilter_args /* { struct vnode *a_vp; struct knote *a_kn; } */ *ap; { dev_t dev; struct cdevsw *dsw; int error; dev = ap->a_vp->v_rdev; dsw = devsw(dev); KASSERT(dev->si_refcount > 0, ("spec_kqfilter() on un-referenced dev_t (%s)", devtoname(dev))); cdevsw_ref(dsw); if (!(dsw->d_flags & D_NEEDGIANT)) { DROP_GIANT(); error = dsw->d_kqfilter(dev, ap->a_kn); PICKUP_GIANT(); } else error = dsw->d_kqfilter(dev, ap->a_kn); cdevsw_rel(dsw); return (error); } /* * Synch buffers associated with a block device */ /* ARGSUSED */ static int spec_fsync(ap) struct vop_fsync_args /* { struct vnode *a_vp; struct ucred *a_cred; int a_waitfor; struct thread *a_td; } */ 
*ap;
{
	/* Vnodes that are not disks: nothing is done, success is reported. */
	if (!vn_isdisk(ap->a_vp, NULL))
		return (0);

	/* Disks are handed to the standard fsync implementation. */
	return (vop_stdfsync(ap));
}

/*
 * Mutex to use when delaying niced I/O bound processes in spec_xstrategy().
 */
static struct mtx strategy_mtx;

/* Sets up strategy_mtx; hooked in through the SYSINIT entry that follows. */
static void
strategy_init(void)
{

	mtx_init(&strategy_mtx, "strategy", NULL, MTX_DEF);
}
SYSINIT(strategy, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, strategy_init, NULL)

/*
 * Read/write debug sysctl (OID name "doslowdown"); when non-zero,
 * spec_xstrategy() delays requests issued by threads whose nice value
 * is positive.  Off by default.
 */
static int doslowdown = 0;
SYSCTL_INT(_debug, OID_AUTO, doslowdown, CTLFLAG_RW, &doslowdown, 0, "");

/*
 * Hand a buffer to the device strategy routine.  Before doing so this
 * optionally delays niced callers (see doslowdown) and, for disks with a
 * mounted filesystem, updates that mount's sync/async read/write counters.
 * Only BIO_READ, BIO_WRITE and BIO_DELETE commands are expected.
 */
static int
spec_xstrategy(struct vnode *vp, struct buf *bp)
{
	struct mount *mp;
	struct cdevsw *dsw;
	struct thread *td = curthread;

	KASSERT(bp->b_iocmd == BIO_READ ||
	    bp->b_iocmd == BIO_WRITE ||
	    bp->b_iocmd == BIO_DELETE,
	    ("Wrong b_iocmd buf=%p cmd=%d", bp, bp->b_iocmd));

	/*
	 * Slow down disk requests for niced processes.
	 * The nice value itself is passed as the msleep() timeout argument.
	 */
-	if (doslowdown && td && td->td_ksegrp->kg_nice > 0) {
+	if (doslowdown && td && td->td_proc->p_nice > 0) {
		mtx_lock(&strategy_mtx);
		msleep(&strategy_mtx, &strategy_mtx,
		    PPAUSE | PCATCH | PDROP, "ioslow",
-		    td->td_ksegrp->kg_nice);
+		    td->td_proc->p_nice);
	}
	/*
	 * Collect statistics on synchronous and asynchronous read
	 * and write counts for disks that have associated filesystems.
*/ if (vn_isdisk(vp, NULL) && (mp = vp->v_rdev->si_mountpoint) != NULL) { if (bp->b_iocmd == BIO_WRITE) { if (bp->b_lock.lk_lockholder == LK_KERNPROC) mp->mnt_stat.f_asyncwrites++; else mp->mnt_stat.f_syncwrites++; } else { if (bp->b_lock.lk_lockholder == LK_KERNPROC) mp->mnt_stat.f_asyncreads++; else mp->mnt_stat.f_syncreads++; } } dsw = devsw(bp->b_dev); if (dsw == NULL) { bp->b_io.bio_error = ENXIO; bp->b_io.bio_flags |= BIO_ERROR; biodone(&bp->b_io); return (0); } KASSERT(dsw->d_strategy != NULL, ("No strategy on dev %s responsible for buffer %p\n", devtoname(bp->b_dev), bp)); if (!(dsw->d_flags & D_NEEDGIANT)) { /* XXX: notyet DROP_GIANT(); */ DEV_STRATEGY(bp); /* XXX: notyet PICKUP_GIANT(); */ } else DEV_STRATEGY(bp); return (0); } static int spec_specstrategy(ap) struct vop_specstrategy_args /* { struct vnode *a_vp; struct buf *a_bp; } */ *ap; { KASSERT(ap->a_vp->v_rdev == ap->a_bp->b_dev, ("%s, dev %s != %s", __func__, devtoname(ap->a_vp->v_rdev), devtoname(ap->a_bp->b_dev))); return spec_xstrategy(ap->a_vp, ap->a_bp); } static int spec_freeblks(ap) struct vop_freeblks_args /* { struct vnode *a_vp; daddr_t a_addr; daddr_t a_length; } */ *ap; { struct buf *bp; /* * XXX: This assumes that strategy does the deed right away. * XXX: this may not be TRTTD. 
*/ if ((ap->a_vp->v_rdev->si_flags & SI_CANDELETE) == 0) return (0); bp = geteblk(ap->a_length); bp->b_iocmd = BIO_DELETE; bp->b_dev = ap->a_vp->v_rdev; bp->b_blkno = ap->a_addr; bp->b_offset = dbtob(ap->a_addr); bp->b_iooffset = bp->b_offset; bp->b_bcount = ap->a_length; BUF_KERNPROC(bp); DEV_STRATEGY(bp); return (0); } /* * Device close routine */ /* ARGSUSED */ static int spec_close(ap) struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp, *oldvp; struct thread *td = ap->a_td; dev_t dev = vp->v_rdev; struct cdevsw *dsw; int error; /* * Hack: a tty device that is a controlling terminal * has a reference from the session structure. * We cannot easily tell that a character device is * a controlling terminal, unless it is the closing * process' controlling terminal. In that case, * if the reference count is 2 (this last descriptor * plus the session), release the reference from the session. */ /* * This needs to be rewritten to take the vp interlock into * consideration. */ dsw = devsw(dev); oldvp = NULL; sx_xlock(&proctree_lock); if (td && vp == td->td_proc->p_session->s_ttyvp) { SESS_LOCK(td->td_proc->p_session); VI_LOCK(vp); if (count_dev(dev) == 2 && (vp->v_iflag & VI_XLOCK) == 0) { td->td_proc->p_session->s_ttyvp = NULL; oldvp = vp; } VI_UNLOCK(vp); SESS_UNLOCK(td->td_proc->p_session); } sx_xunlock(&proctree_lock); if (oldvp != NULL) vrele(oldvp); /* * We do not want to really close the device if it * is still in use unless we are trying to close it * forcibly. Since every use (buffer, vnode, swap, cmap) * holds a reference to the vnode, and because we mark * any other vnodes that alias this device, when the * sum of the reference counts on all the aliased * vnodes descends to one, we are on last close. */ VI_LOCK(vp); if (vp->v_iflag & VI_XLOCK) { /* Forced close. */ } else if (dsw->d_flags & D_TRACKCLOSE) { /* Keep device updated on status. 
*/ } else if (count_dev(dev) > 1) { VI_UNLOCK(vp); return (0); } VI_UNLOCK(vp); KASSERT(dev->si_refcount > 0, ("spec_close() on un-referenced dev_t (%s)", devtoname(dev))); cdevsw_ref(dsw); if (!(dsw->d_flags & D_NEEDGIANT)) { DROP_GIANT(); error = dsw->d_close(dev, ap->a_fflag, S_IFCHR, td); PICKUP_GIANT(); } else error = dsw->d_close(dev, ap->a_fflag, S_IFCHR, td); cdevsw_rel(dsw); dev_rel(dev); return (error); } /* * Print out the contents of a special device vnode. */ static int spec_print(ap) struct vop_print_args /* { struct vnode *a_vp; } */ *ap; { printf("\tdev %s\n", devtoname(ap->a_vp->v_rdev)); return (0); } /* * Special device advisory byte-level locks. */ /* ARGSUSED */ static int spec_advlock(ap) struct vop_advlock_args /* { struct vnode *a_vp; caddr_t a_id; int a_op; struct flock *a_fl; int a_flags; } */ *ap; { return (ap->a_flags & F_FLOCK ? EOPNOTSUPP : EINVAL); } static int spec_getpages(ap) struct vop_getpages_args *ap; { vm_offset_t kva; int error; int i, pcount, size, s; daddr_t blkno; struct buf *bp; vm_page_t m; vm_ooffset_t offset; int toff, nextoff, nread; struct vnode *vp = ap->a_vp; int blksiz; int gotreqpage; GIANT_REQUIRED; error = 0; pcount = round_page(ap->a_count) / PAGE_SIZE; /* * Calculate the offset of the transfer and do a sanity check. * FreeBSD currently only supports an 8 TB range due to b_blkno * being in DEV_BSIZE ( usually 512 ) byte chunks on call to * VOP_STRATEGY. XXX */ offset = IDX_TO_OFF(ap->a_m[0]->pindex) + ap->a_offset; blkno = btodb(offset); /* * Round up physical size for real devices. We cannot round using * v_mount's block size data because v_mount has nothing to do with * the device. i.e. it's usually '/dev'. We need the physical block * size for the device itself. * * We can't use v_rdev->si_mountpoint because it only exists when the * block device is mounted. However, we can use v_rdev. 
*/ if (vn_isdisk(vp, NULL)) blksiz = vp->v_rdev->si_bsize_phys; else blksiz = DEV_BSIZE; size = (ap->a_count + blksiz - 1) & ~(blksiz - 1); bp = getpbuf(NULL); kva = (vm_offset_t)bp->b_data; /* * Map the pages to be read into the kva. */ pmap_qenter(kva, ap->a_m, pcount); /* Build a minimal buffer header. */ bp->b_iocmd = BIO_READ; bp->b_iodone = bdone; KASSERT(bp->b_rcred == NOCRED, ("leaking read ucred")); KASSERT(bp->b_wcred == NOCRED, ("leaking write ucred")); bp->b_rcred = crhold(curthread->td_ucred); bp->b_wcred = crhold(curthread->td_ucred); bp->b_iooffset = offset; bp->b_blkno = blkno; bp->b_lblkno = blkno; pbgetvp(ap->a_vp, bp); bp->b_bcount = size; bp->b_bufsize = size; bp->b_resid = 0; bp->b_runningbufspace = bp->b_bufsize; runningbufspace += bp->b_runningbufspace; cnt.v_vnodein++; cnt.v_vnodepgsin += pcount; /* Do the input. */ spec_xstrategy(bp->b_vp, bp); s = splbio(); bwait(bp, PVM, "spread"); splx(s); if ((bp->b_ioflags & BIO_ERROR) != 0) { if (bp->b_error) error = bp->b_error; else error = EIO; } nread = size - bp->b_resid; if (nread < ap->a_count) { bzero((caddr_t)kva + nread, ap->a_count - nread); } pmap_qremove(kva, pcount); gotreqpage = 0; /* * While the page is busy, its object field is immutable. */ VM_OBJECT_LOCK(ap->a_m[ap->a_reqpage]->object); vm_page_lock_queues(); for (i = 0, toff = 0; i < pcount; i++, toff = nextoff) { nextoff = toff + PAGE_SIZE; m = ap->a_m[i]; if (nextoff <= nread) { m->valid = VM_PAGE_BITS_ALL; vm_page_undirty(m); } else if (toff < nread) { /* * Since this is a VM request, we have to supply the * unaligned offset to allow vm_page_set_validclean() * to zero sub-DEV_BSIZE'd portions of the page. */ vm_page_set_validclean(m, 0, nread - toff); } else { m->valid = 0; vm_page_undirty(m); } if (i != ap->a_reqpage) { /* * Just in case someone was asking for this page we * now tell them that it is ok to use. 
*/ if (!error || (m->valid == VM_PAGE_BITS_ALL)) { if (m->valid) { if (m->flags & PG_WANTED) { vm_page_activate(m); } else { vm_page_deactivate(m); } vm_page_wakeup(m); } else { vm_page_free(m); } } else { vm_page_free(m); } } else if (m->valid) { gotreqpage = 1; /* * Since this is a VM request, we need to make the * entire page presentable by zeroing invalid sections. */ if (m->valid != VM_PAGE_BITS_ALL) vm_page_zero_invalid(m, FALSE); } } vm_page_unlock_queues(); if (!gotreqpage) { m = ap->a_m[ap->a_reqpage]; printf( "spec_getpages:(%s) I/O read failure: (error=%d) bp %p vp %p\n", devtoname(bp->b_dev), error, bp, bp->b_vp); printf( " size: %d, resid: %ld, a_count: %d, valid: 0x%lx\n", size, bp->b_resid, ap->a_count, (u_long)m->valid); printf( " nread: %d, reqpage: %d, pindex: %lu, pcount: %d\n", nread, ap->a_reqpage, (u_long)m->pindex, pcount); VM_OBJECT_UNLOCK(m->object); /* * Free the buffer header back to the swap buffer pool. */ relpbuf(bp, NULL); return VM_PAGER_ERROR; } VM_OBJECT_UNLOCK(ap->a_m[ap->a_reqpage]->object); /* * Free the buffer header back to the swap buffer pool. */ relpbuf(bp, NULL); return VM_PAGER_OK; } Index: head/sys/i386/ibcs2/ibcs2_misc.c =================================================================== --- head/sys/i386/ibcs2/ibcs2_misc.c (revision 130550) +++ head/sys/i386/ibcs2/ibcs2_misc.c (revision 130551) @@ -1,1205 +1,1205 @@ /* * Copyright (c) 1995 Steven Wallace * Copyright (c) 1994, 1995 Scott Bartram * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This software was developed by the Computer Systems Engineering group * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and * contributed to Berkeley. * * All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Lawrence Berkeley Laboratory. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: Header: sun_misc.c,v 1.16 93/04/07 02:46:27 torek Exp * * @(#)sun_misc.c 8.1 (Berkeley) 6/18/93 */ #include __FBSDID("$FreeBSD$"); /* * IBCS2 compatibility module. * * IBCS2 system calls that are implemented differently in BSD are * handled here. 
*/ #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include /* Must come after sys/malloc.h */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include int ibcs2_ulimit(td, uap) struct thread *td; struct ibcs2_ulimit_args *uap; { struct rlimit rl; struct proc *p; int error; #define IBCS2_GETFSIZE 1 #define IBCS2_SETFSIZE 2 #define IBCS2_GETPSIZE 3 #define IBCS2_GETDTABLESIZE 4 p = td->td_proc; switch (uap->cmd) { case IBCS2_GETFSIZE: PROC_LOCK(p); td->td_retval[0] = lim_cur(p, RLIMIT_FSIZE); PROC_UNLOCK(p); if (td->td_retval[0] == -1) td->td_retval[0] = 0x7fffffff; return 0; case IBCS2_SETFSIZE: PROC_LOCK(p); rl.rlim_max = lim_max(p, RLIMIT_FSIZE); PROC_UNLOCK(p); rl.rlim_cur = uap->newlimit; error = kern_setrlimit(td, RLIMIT_FSIZE, &rl); if (!error) { PROC_LOCK(p); td->td_retval[0] = lim_cur(p, RLIMIT_FSIZE); PROC_UNLOCK(p); } else { DPRINTF(("failed ")); } return error; case IBCS2_GETPSIZE: PROC_LOCK(p); td->td_retval[0] = lim_cur(p, RLIMIT_RSS); /* XXX */ PROC_UNLOCK(p); return 0; case IBCS2_GETDTABLESIZE: uap->cmd = IBCS2_SC_OPEN_MAX; return ibcs2_sysconf(td, (struct ibcs2_sysconf_args *)uap); default: return ENOSYS; } } #define IBCS2_WSTOPPED 0177 #define IBCS2_STOPCODE(sig) ((sig) << 8 | IBCS2_WSTOPPED) int ibcs2_wait(td, uap) struct thread *td; struct ibcs2_wait_args *uap; { int error, options, status; int *statusp; pid_t pid; struct trapframe *tf = td->td_frame; if ((tf->tf_eflags & (PSL_Z|PSL_PF|PSL_N|PSL_V)) == (PSL_Z|PSL_PF|PSL_N|PSL_V)) { /* waitpid */ pid = uap->a1; statusp = (int *)uap->a2; options = uap->a3; } else { /* wait */ pid = WAIT_ANY; statusp = (int *)uap->a1; options = 0; } error = kern_wait(td, pid, &status, options, NULL); if (error) return error; if (statusp) { /* * Convert status/signal result. 
*/ if (WIFSTOPPED(status)) { if (WSTOPSIG(status) <= 0 || WSTOPSIG(status) > IBCS2_SIGTBLSZ) return (EINVAL); status = IBCS2_STOPCODE(bsd_to_ibcs2_sig[_SIG_IDX(WSTOPSIG(status))]); } else if (WIFSIGNALED(status)) { if (WTERMSIG(status) <= 0 || WTERMSIG(status) > IBCS2_SIGTBLSZ) return (EINVAL); status = bsd_to_ibcs2_sig[_SIG_IDX(WTERMSIG(status))]; } /* else exit status -- identical */ /* record result/status */ td->td_retval[1] = status; return copyout(&status, statusp, sizeof(status)); } return 0; } int ibcs2_execv(td, uap) struct thread *td; struct ibcs2_execv_args *uap; { struct execve_args ea; caddr_t sg = stackgap_init(); CHECKALTEXIST(td, &sg, uap->path); ea.fname = uap->path; ea.argv = uap->argp; ea.envv = NULL; return execve(td, &ea); } int ibcs2_execve(td, uap) struct thread *td; struct ibcs2_execve_args *uap; { caddr_t sg = stackgap_init(); CHECKALTEXIST(td, &sg, uap->path); return execve(td, (struct execve_args *)uap); } int ibcs2_umount(td, uap) struct thread *td; struct ibcs2_umount_args *uap; { struct unmount_args um; um.path = uap->name; um.flags = 0; return unmount(td, &um); } int ibcs2_mount(td, uap) struct thread *td; struct ibcs2_mount_args *uap; { #ifdef notyet int oflags = uap->flags, nflags, error; char fsname[MFSNAMELEN]; if (oflags & (IBCS2_MS_NOSUB | IBCS2_MS_SYS5)) return (EINVAL); if ((oflags & IBCS2_MS_NEWTYPE) == 0) return (EINVAL); nflags = 0; if (oflags & IBCS2_MS_RDONLY) nflags |= MNT_RDONLY; if (oflags & IBCS2_MS_NOSUID) nflags |= MNT_NOSUID; if (oflags & IBCS2_MS_REMOUNT) nflags |= MNT_UPDATE; uap->flags = nflags; if (error = copyinstr((caddr_t)uap->type, fsname, sizeof fsname, (u_int *)0)) return (error); if (strcmp(fsname, "4.2") == 0) { uap->type = (caddr_t)STACK_ALLOC(); if (error = copyout("ufs", uap->type, sizeof("ufs"))) return (error); } else if (strcmp(fsname, "nfs") == 0) { struct ibcs2_nfs_args sna; struct sockaddr_in sain; struct nfs_args na; struct sockaddr sa; if (error = copyin(uap->data, &sna, sizeof sna)) return 
(error); if (error = copyin(sna.addr, &sain, sizeof sain)) return (error); bcopy(&sain, &sa, sizeof sa); sa.sa_len = sizeof(sain); uap->data = (caddr_t)STACK_ALLOC(); na.addr = (struct sockaddr *)((int)uap->data + sizeof na); na.sotype = SOCK_DGRAM; na.proto = IPPROTO_UDP; na.fh = (nfsv2fh_t *)sna.fh; na.flags = sna.flags; na.wsize = sna.wsize; na.rsize = sna.rsize; na.timeo = sna.timeo; na.retrans = sna.retrans; na.hostname = sna.hostname; if (error = copyout(&sa, na.addr, sizeof sa)) return (error); if (error = copyout(&na, uap->data, sizeof na)) return (error); } return (mount(td, uap)); #else return EINVAL; #endif } /* * Read iBCS2-style directory entries. We suck them into kernel space so * that they can be massaged before being copied out to user code. Like * SunOS, we squish out `empty' entries. * * This is quite ugly, but what do you expect from compatibility code? */ int ibcs2_getdents(td, uap) struct thread *td; register struct ibcs2_getdents_args *uap; { register struct vnode *vp; register caddr_t inp, buf; /* BSD-format */ register int len, reclen; /* BSD-format */ register caddr_t outp; /* iBCS2-format */ register int resid; /* iBCS2-format */ struct file *fp; struct uio auio; struct iovec aiov; struct ibcs2_dirent idb; off_t off; /* true file offset */ int buflen, error, eofflag; u_long *cookies = NULL, *cookiep; int ncookies; #define BSD_DIRENT(cp) ((struct dirent *)(cp)) #define IBCS2_RECLEN(reclen) (reclen + sizeof(u_short)) if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) return (error); if ((fp->f_flag & FREAD) == 0) { fdrop(fp, td); return (EBADF); } vp = fp->f_vnode; if (vp->v_type != VDIR) { /* XXX vnode readdir op should do this */ fdrop(fp, td); return (EINVAL); } off = fp->f_offset; #define DIRBLKSIZ 512 /* XXX we used to use ufs's DIRBLKSIZ */ buflen = max(DIRBLKSIZ, uap->nbytes); buflen = min(buflen, MAXBSIZE); buf = malloc(buflen, M_TEMP, M_WAITOK); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); again: aiov.iov_base = buf; 
aiov.iov_len = buflen; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_resid = buflen; auio.uio_offset = off; if (cookies) { free(cookies, M_TEMP); cookies = NULL; } #ifdef MAC error = mac_check_vnode_readdir(td->td_ucred, vp); if (error) goto out; #endif /* * First we read into the malloc'ed buffer, then * we massage it into user space, one record at a time. */ if ((error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, &ncookies, &cookies)) != 0) goto out; inp = buf; outp = uap->buf; resid = uap->nbytes; if ((len = buflen - auio.uio_resid) <= 0) goto eof; cookiep = cookies; if (cookies) { /* * When using cookies, the vfs has the option of reading from * a different offset than that supplied (UFS truncates the * offset to a block boundary to make sure that it never reads * partway through a directory entry, even if the directory * has been compacted). */ while (len > 0 && ncookies > 0 && *cookiep <= off) { len -= BSD_DIRENT(inp)->d_reclen; inp += BSD_DIRENT(inp)->d_reclen; cookiep++; ncookies--; } } for (; len > 0; len -= reclen) { if (cookiep && ncookies == 0) break; reclen = BSD_DIRENT(inp)->d_reclen; if (reclen & 3) { printf("ibcs2_getdents: reclen=%d\n", reclen); error = EFAULT; goto out; } if (BSD_DIRENT(inp)->d_fileno == 0) { inp += reclen; /* it is a hole; squish it out */ if (cookiep) { off = *cookiep++; ncookies--; } else off += reclen; continue; } if (reclen > len || resid < IBCS2_RECLEN(reclen)) { /* entry too big for buffer, so just stop */ outp++; break; } /* * Massage in place to make an iBCS2-shaped dirent (otherwise * we have to worry about touching user memory outside of * the copyout() call). 
*/ idb.d_ino = (ibcs2_ino_t)BSD_DIRENT(inp)->d_fileno; idb.d_off = (ibcs2_off_t)off; idb.d_reclen = (u_short)IBCS2_RECLEN(reclen); if ((error = copyout((caddr_t)&idb, outp, 10)) != 0 || (error = copyout(BSD_DIRENT(inp)->d_name, outp + 10, BSD_DIRENT(inp)->d_namlen + 1)) != 0) goto out; /* advance past this real entry */ if (cookiep) { off = *cookiep++; ncookies--; } else off += reclen; inp += reclen; /* advance output past iBCS2-shaped entry */ outp += IBCS2_RECLEN(reclen); resid -= IBCS2_RECLEN(reclen); } /* if we squished out the whole block, try again */ if (outp == uap->buf) goto again; fp->f_offset = off; /* update the vnode offset */ eof: td->td_retval[0] = uap->nbytes - resid; out: VOP_UNLOCK(vp, 0, td); fdrop(fp, td); if (cookies) free(cookies, M_TEMP); free(buf, M_TEMP); return (error); } int ibcs2_read(td, uap) struct thread *td; struct ibcs2_read_args *uap; { register struct vnode *vp; register caddr_t inp, buf; /* BSD-format */ register int len, reclen; /* BSD-format */ register caddr_t outp; /* iBCS2-format */ register int resid; /* iBCS2-format */ struct file *fp; struct uio auio; struct iovec aiov; struct ibcs2_direct { ibcs2_ino_t ino; char name[14]; } idb; off_t off; /* true file offset */ int buflen, error, eofflag, size; u_long *cookies = NULL, *cookiep; int ncookies; if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) { if (error == EINVAL) return read(td, (struct read_args *)uap); else return error; } if ((fp->f_flag & FREAD) == 0) { fdrop(fp, td); return (EBADF); } vp = fp->f_vnode; if (vp->v_type != VDIR) { fdrop(fp, td); return read(td, (struct read_args *)uap); } off = fp->f_offset; if (vp->v_type != VDIR) return read(td, (struct read_args *)uap); DPRINTF(("ibcs2_read: read directory\n")); buflen = max(DIRBLKSIZ, uap->nbytes); buflen = min(buflen, MAXBSIZE); buf = malloc(buflen, M_TEMP, M_WAITOK); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); again: aiov.iov_base = buf; aiov.iov_len = buflen; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; 
auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_td = td;
	auio.uio_resid = buflen;
	auio.uio_offset = off;

	/* Throw away cookies left over from a previous pass. */
	if (cookies) {
		free(cookies, M_TEMP);
		cookies = NULL;
	}

#ifdef MAC
	error = mac_check_vnode_readdir(td->td_ucred, vp);
	if (error)
		goto out;
#endif

	/*
	 * First we read into the malloc'ed buffer, then
	 * we massage it into user space, one record at a time.
	 */
	if ((error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, &ncookies, &cookies)) != 0) {
		DPRINTF(("VOP_READDIR failed: %d\n", error));
		goto out;
	}
	inp = buf;
	outp = uap->buf;
	resid = uap->nbytes;
	/* len = number of bytes VOP_READDIR actually produced */
	if ((len = buflen - auio.uio_resid) <= 0)
		goto eof;

	cookiep = cookies;

	if (cookies) {
		/*
		 * When using cookies, the vfs has the option of reading from
		 * a different offset than that supplied (UFS truncates the
		 * offset to a block boundary to make sure that it never reads
		 * partway through a directory entry, even if the directory
		 * has been compacted).
		 */
		while (len > 0 && ncookies > 0 && *cookiep <= off) {
			len -= BSD_DIRENT(inp)->d_reclen;
			inp += BSD_DIRENT(inp)->d_reclen;
			cookiep++;
			ncookies--;
		}
	}

	for (; len > 0 && resid > 0; len -= reclen) {
		/* 'off' is advanced from the cookies; stop when they run out */
		if (cookiep && ncookies == 0)
			break;
		reclen = BSD_DIRENT(inp)->d_reclen;
		/* a record length that is not a multiple of 4 is rejected */
		if (reclen & 3) {
			printf("ibcs2_read: reclen=%d\n", reclen);
			error = EFAULT;
			goto out;
		}
		if (BSD_DIRENT(inp)->d_fileno == 0) {
			inp += reclen;	/* it is a hole; squish it out */
			if (cookiep) {
				off = *cookiep++;
				ncookies--;
			} else
				off += reclen;
			continue;
		}
		if (reclen > len || resid < sizeof(struct ibcs2_direct)) {
			/* entry too big for buffer, so just stop */
			/*
			 * Bumping outp makes the "outp == uap->buf" test
			 * that follows the loop false, so no retry happens.
			 */
			outp++;
			break;
		}
		/*
		 * Massage in place to make an iBCS2-shaped dirent (otherwise
		 * we have to worry about touching user memory outside of
		 * the copyout() call).
		 *
		 * TODO: if length(filename) > 14, then break filename into
		 * multiple entries and set inode = 0xffff except last
		 */
		/* inode numbers above 0xfffe are clamped to 0xfffe */
		idb.ino = (BSD_DIRENT(inp)->d_fileno > 0xfffe) ?
0xfffe : BSD_DIRENT(inp)->d_fileno; (void)copystr(BSD_DIRENT(inp)->d_name, idb.name, 14, &size); bzero(idb.name + size, 14 - size); if ((error = copyout(&idb, outp, sizeof(struct ibcs2_direct))) != 0) goto out; /* advance past this real entry */ if (cookiep) { off = *cookiep++; ncookies--; } else off += reclen; inp += reclen; /* advance output past iBCS2-shaped entry */ outp += sizeof(struct ibcs2_direct); resid -= sizeof(struct ibcs2_direct); } /* if we squished out the whole block, try again */ if (outp == uap->buf) goto again; fp->f_offset = off; /* update the vnode offset */ eof: td->td_retval[0] = uap->nbytes - resid; out: VOP_UNLOCK(vp, 0, td); fdrop(fp, td); if (cookies) free(cookies, M_TEMP); free(buf, M_TEMP); return (error); } int ibcs2_mknod(td, uap) struct thread *td; struct ibcs2_mknod_args *uap; { caddr_t sg = stackgap_init(); CHECKALTCREAT(td, &sg, uap->path); if (S_ISFIFO(uap->mode)) { struct mkfifo_args ap; ap.path = uap->path; ap.mode = uap->mode; return mkfifo(td, &ap); } else { struct mknod_args ap; ap.path = uap->path; ap.mode = uap->mode; ap.dev = uap->dev; return mknod(td, &ap); } } int ibcs2_getgroups(td, uap) struct thread *td; struct ibcs2_getgroups_args *uap; { int error, i; ibcs2_gid_t *iset = NULL; struct getgroups_args sa; gid_t *gp; caddr_t sg = stackgap_init(); if (uap->gidsetsize < 0) return (EINVAL); if (uap->gidsetsize > NGROUPS_MAX) uap->gidsetsize = NGROUPS_MAX; sa.gidsetsize = uap->gidsetsize; if (uap->gidsetsize) { sa.gidset = stackgap_alloc(&sg, NGROUPS_MAX * sizeof(gid_t *)); iset = stackgap_alloc(&sg, uap->gidsetsize * sizeof(ibcs2_gid_t)); } if ((error = getgroups(td, &sa)) != 0) return error; if (uap->gidsetsize == 0) return 0; for (i = 0, gp = sa.gidset; i < td->td_retval[0]; i++) iset[i] = (ibcs2_gid_t)*gp++; if (td->td_retval[0] && (error = copyout((caddr_t)iset, (caddr_t)uap->gidset, sizeof(ibcs2_gid_t) * td->td_retval[0]))) return error; return 0; } int ibcs2_setgroups(td, uap) struct thread *td; struct 
ibcs2_setgroups_args *uap; { int error, i; ibcs2_gid_t *iset; struct setgroups_args sa; gid_t *gp; caddr_t sg = stackgap_init(); if (uap->gidsetsize < 0 || uap->gidsetsize > NGROUPS_MAX) return (EINVAL); sa.gidsetsize = uap->gidsetsize; sa.gidset = stackgap_alloc(&sg, sa.gidsetsize * sizeof(gid_t *)); iset = stackgap_alloc(&sg, sa.gidsetsize * sizeof(ibcs2_gid_t *)); if (sa.gidsetsize) { if ((error = copyin((caddr_t)uap->gidset, (caddr_t)iset, sizeof(ibcs2_gid_t *) * uap->gidsetsize)) != 0) return error; } for (i = 0, gp = sa.gidset; i < sa.gidsetsize; i++) *gp++ = (gid_t)iset[i]; return setgroups(td, &sa); } int ibcs2_setuid(td, uap) struct thread *td; struct ibcs2_setuid_args *uap; { struct setuid_args sa; sa.uid = (uid_t)uap->uid; return setuid(td, &sa); } int ibcs2_setgid(td, uap) struct thread *td; struct ibcs2_setgid_args *uap; { struct setgid_args sa; sa.gid = (gid_t)uap->gid; return setgid(td, &sa); } int ibcs2_time(td, uap) struct thread *td; struct ibcs2_time_args *uap; { struct timeval tv; microtime(&tv); td->td_retval[0] = tv.tv_sec; if (uap->tp) return copyout((caddr_t)&tv.tv_sec, (caddr_t)uap->tp, sizeof(ibcs2_time_t)); else return 0; } int ibcs2_pathconf(td, uap) struct thread *td; struct ibcs2_pathconf_args *uap; { uap->name++; /* iBCS2 _PC_* defines are offset by one */ return pathconf(td, (struct pathconf_args *)uap); } int ibcs2_fpathconf(td, uap) struct thread *td; struct ibcs2_fpathconf_args *uap; { uap->name++; /* iBCS2 _PC_* defines are offset by one */ return fpathconf(td, (struct fpathconf_args *)uap); } int ibcs2_sysconf(td, uap) struct thread *td; struct ibcs2_sysconf_args *uap; { int mib[2], value, len, error; struct proc *p; p = td->td_proc; switch(uap->name) { case IBCS2_SC_ARG_MAX: mib[1] = KERN_ARGMAX; break; case IBCS2_SC_CHILD_MAX: PROC_LOCK(p); td->td_retval[0] = lim_cur(td->td_proc, RLIMIT_NPROC); PROC_UNLOCK(p); return 0; case IBCS2_SC_CLK_TCK: td->td_retval[0] = hz; return 0; case IBCS2_SC_NGROUPS_MAX: mib[1] = KERN_NGROUPS; 
break;

	case IBCS2_SC_OPEN_MAX:
		PROC_LOCK(p);
		td->td_retval[0] = lim_cur(td->td_proc, RLIMIT_NOFILE);
		PROC_UNLOCK(p);
		return 0;

	case IBCS2_SC_JOB_CONTROL:
		mib[1] = KERN_JOB_CONTROL;
		break;

	case IBCS2_SC_SAVED_IDS:
		mib[1] = KERN_SAVED_IDS;
		break;

	case IBCS2_SC_VERSION:
		mib[1] = KERN_POSIX1;
		break;

	case IBCS2_SC_PASS_MAX:
		td->td_retval[0] = 128;		/* XXX - should we create PASS_MAX ? */
		return 0;

	case IBCS2_SC_XOPEN_VERSION:
		td->td_retval[0] = 2;		/* XXX: What should that be? */
		return 0;

	default:
		return EINVAL;
	}

	/* Cases that only picked mib[1] are answered by a kern.* sysctl. */
	mib[0] = CTL_KERN;
	len = sizeof(value);
	error = kernel_sysctl(td, mib, 2, &value, &len, NULL, 0, NULL);
	if (error)
		return error;
	td->td_retval[0] = value;
	return 0;
}

/*
 * iBCS2 alarm(2): arm ITIMER_REAL for uap->sec seconds through
 * setitimer() and return, in td_retval[0], the time that was left on the
 * previous timer in whole seconds (a non-zero microsecond part rounds
 * the result up by one).  Both itimerval structures live in the stack gap.
 */
int
ibcs2_alarm(td, uap)
	struct thread *td;
	struct ibcs2_alarm_args *uap;
{
	int error;
	struct itimerval *itp, *oitp;
	struct setitimer_args sa;
	caddr_t sg = stackgap_init();

	itp = stackgap_alloc(&sg, sizeof(*itp));
	oitp = stackgap_alloc(&sg, sizeof(*oitp));
	timevalclear(&itp->it_interval);
	itp->it_value.tv_sec = uap->sec;
	itp->it_value.tv_usec = 0;

	sa.which = ITIMER_REAL;
	sa.itv = itp;
	sa.oitv = oitp;
	error = setitimer(td, &sa);
	if (error)
		return error;
	if (oitp->it_value.tv_usec)
		oitp->it_value.tv_sec++;
	td->td_retval[0] = oitp->it_value.tv_sec;
	return 0;
}

/*
 * iBCS2 times(2): fill a struct tms from getrusage() for the process
 * itself and for its children, converting each timeval to clock ticks
 * with CONVTCK, and copy it out to uap->tp.  td_retval[0] is set to the
 * current microtime() expressed in ticks.
 */
int
ibcs2_times(td, uap)
	struct thread *td;
	struct ibcs2_times_args *uap;
{
	int error;
	struct getrusage_args ga;
	struct tms tms;
	struct timeval t;
	caddr_t sg = stackgap_init();
	struct rusage *ru = stackgap_alloc(&sg, sizeof(*ru));
/* timeval -> ticks: whole seconds times hz plus microseconds per tick */
#define CONVTCK(r)      (r.tv_sec * hz + r.tv_usec / (1000000 / hz))

	ga.who = RUSAGE_SELF;
	ga.rusage = ru;
	error = getrusage(td, &ga);
	if (error)
		return error;
	tms.tms_utime = CONVTCK(ru->ru_utime);
	tms.tms_stime = CONVTCK(ru->ru_stime);

	/* the same rusage buffer is reused for the children's totals */
	ga.who = RUSAGE_CHILDREN;
	error = getrusage(td, &ga);
	if (error)
		return error;
	tms.tms_cutime = CONVTCK(ru->ru_utime);
	tms.tms_cstime = CONVTCK(ru->ru_stime);

	microtime(&t);
	td->td_retval[0] = CONVTCK(t);

	return copyout((caddr_t)&tms, (caddr_t)uap->tp, sizeof(struct tms));
}

int
ibcs2_stime(td, uap) struct thread *td; struct ibcs2_stime_args *uap; { int error; struct settimeofday_args sa; caddr_t sg = stackgap_init(); sa.tv = stackgap_alloc(&sg, sizeof(*sa.tv)); sa.tzp = NULL; if ((error = copyin((caddr_t)uap->timep, &(sa.tv->tv_sec), sizeof(long))) != 0) return error; sa.tv->tv_usec = 0; if ((error = settimeofday(td, &sa)) != 0) return EPERM; return 0; } int ibcs2_utime(td, uap) struct thread *td; struct ibcs2_utime_args *uap; { int error; struct utimes_args sa; struct timeval *tp; caddr_t sg = stackgap_init(); CHECKALTEXIST(td, &sg, uap->path); sa.path = uap->path; if (uap->buf) { struct ibcs2_utimbuf ubuf; if ((error = copyin((caddr_t)uap->buf, (caddr_t)&ubuf, sizeof(ubuf))) != 0) return error; sa.tptr = stackgap_alloc(&sg, 2 * sizeof(struct timeval *)); tp = (struct timeval *)sa.tptr; tp->tv_sec = ubuf.actime; tp->tv_usec = 0; tp++; tp->tv_sec = ubuf.modtime; tp->tv_usec = 0; } else sa.tptr = NULL; return utimes(td, &sa); } int ibcs2_nice(td, uap) struct thread *td; struct ibcs2_nice_args *uap; { int error; struct setpriority_args sa; sa.which = PRIO_PROCESS; sa.who = 0; - sa.prio = td->td_ksegrp->kg_nice + uap->incr; + sa.prio = td->td_proc->p_nice + uap->incr; if ((error = setpriority(td, &sa)) != 0) return EPERM; - td->td_retval[0] = td->td_ksegrp->kg_nice; + td->td_retval[0] = td->td_proc->p_nice; return 0; } /* * iBCS2 getpgrp, setpgrp, setsid, and setpgid */ int ibcs2_pgrpsys(td, uap) struct thread *td; struct ibcs2_pgrpsys_args *uap; { struct proc *p = td->td_proc; switch (uap->type) { case 0: /* getpgrp */ PROC_LOCK(p); td->td_retval[0] = p->p_pgrp->pg_id; PROC_UNLOCK(p); return 0; case 1: /* setpgrp */ { struct setpgid_args sa; sa.pid = 0; sa.pgid = 0; setpgid(td, &sa); PROC_LOCK(p); td->td_retval[0] = p->p_pgrp->pg_id; PROC_UNLOCK(p); return 0; } case 2: /* setpgid */ { struct setpgid_args sa; sa.pid = uap->pid; sa.pgid = uap->pgid; return setpgid(td, &sa); } case 3: /* setsid */ return setsid(td, NULL); default: return 
EINVAL; } } /* * XXX - need to check for nested calls */ int ibcs2_plock(td, uap) struct thread *td; struct ibcs2_plock_args *uap; { int error; #define IBCS2_UNLOCK 0 #define IBCS2_PROCLOCK 1 #define IBCS2_TEXTLOCK 2 #define IBCS2_DATALOCK 4 if ((error = suser(td)) != 0) return EPERM; switch(uap->cmd) { case IBCS2_UNLOCK: case IBCS2_PROCLOCK: case IBCS2_TEXTLOCK: case IBCS2_DATALOCK: return 0; /* XXX - TODO */ } return EINVAL; } int ibcs2_uadmin(td, uap) struct thread *td; struct ibcs2_uadmin_args *uap; { #define SCO_A_REBOOT 1 #define SCO_A_SHUTDOWN 2 #define SCO_A_REMOUNT 4 #define SCO_A_CLOCK 8 #define SCO_A_SETCONFIG 128 #define SCO_A_GETDEV 130 #define SCO_AD_HALT 0 #define SCO_AD_BOOT 1 #define SCO_AD_IBOOT 2 #define SCO_AD_PWRDOWN 3 #define SCO_AD_PWRNAP 4 #define SCO_AD_PANICBOOT 1 #define SCO_AD_GETBMAJ 0 #define SCO_AD_GETCMAJ 1 if (suser(td)) return EPERM; switch(uap->cmd) { case SCO_A_REBOOT: case SCO_A_SHUTDOWN: switch(uap->func) { struct reboot_args r; case SCO_AD_HALT: case SCO_AD_PWRDOWN: case SCO_AD_PWRNAP: r.opt = RB_HALT; reboot(td, &r); case SCO_AD_BOOT: case SCO_AD_IBOOT: r.opt = RB_AUTOBOOT; reboot(td, &r); } return EINVAL; case SCO_A_REMOUNT: case SCO_A_CLOCK: case SCO_A_SETCONFIG: return 0; case SCO_A_GETDEV: return EINVAL; /* XXX - TODO */ } return EINVAL; } int ibcs2_sysfs(td, uap) struct thread *td; struct ibcs2_sysfs_args *uap; { #define IBCS2_GETFSIND 1 #define IBCS2_GETFSTYP 2 #define IBCS2_GETNFSTYP 3 switch(uap->cmd) { case IBCS2_GETFSIND: case IBCS2_GETFSTYP: case IBCS2_GETNFSTYP: break; } return EINVAL; /* XXX - TODO */ } int ibcs2_unlink(td, uap) struct thread *td; struct ibcs2_unlink_args *uap; { caddr_t sg = stackgap_init(); CHECKALTEXIST(td, &sg, uap->path); return unlink(td, (struct unlink_args *)uap); } int ibcs2_chdir(td, uap) struct thread *td; struct ibcs2_chdir_args *uap; { caddr_t sg = stackgap_init(); CHECKALTEXIST(td, &sg, uap->path); return chdir(td, (struct chdir_args *)uap); } int ibcs2_chmod(td, uap) struct thread 
*td;
	struct ibcs2_chmod_args *uap;
{
	/*
	 * Like the wrappers that follow: run the CHECKALT* macro on each
	 * path argument, then pass the argument structure, cast to the
	 * native type, to the native system call.
	 * NOTE(review): the CHECKALT* macros are defined outside this
	 * chunk; presumably they look the path up in the emulation tree
	 * and may rewrite it via the stack gap -- confirm.
	 */
	caddr_t sg = stackgap_init();

	CHECKALTEXIST(td, &sg, uap->path);
	return chmod(td, (struct chmod_args *)uap);
}

/* iBCS2 chown(2): CHECKALTEXIST on the path, then the native chown(). */
int
ibcs2_chown(td, uap)
	struct thread *td;
	struct ibcs2_chown_args *uap;
{
	caddr_t sg = stackgap_init();

	CHECKALTEXIST(td, &sg, uap->path);
	return chown(td, (struct chown_args *)uap);
}

/* iBCS2 rmdir(2): CHECKALTEXIST on the path, then the native rmdir(). */
int
ibcs2_rmdir(td, uap)
	struct thread *td;
	struct ibcs2_rmdir_args *uap;
{
	caddr_t sg = stackgap_init();

	CHECKALTEXIST(td, &sg, uap->path);
	return rmdir(td, (struct rmdir_args *)uap);
}

/* iBCS2 mkdir(2): CHECKALTCREAT on the path, then the native mkdir(). */
int
ibcs2_mkdir(td, uap)
	struct thread *td;
	struct ibcs2_mkdir_args *uap;
{
	caddr_t sg = stackgap_init();

	CHECKALTCREAT(td, &sg, uap->path);
	return mkdir(td, (struct mkdir_args *)uap);
}

/*
 * iBCS2 symlink(2): CHECKALTEXIST on the target path and CHECKALTCREAT
 * on the link name, then the native symlink().
 */
int
ibcs2_symlink(td, uap)
	struct thread *td;
	struct ibcs2_symlink_args *uap;
{
	caddr_t sg = stackgap_init();

	CHECKALTEXIST(td, &sg, uap->path);
	CHECKALTCREAT(td, &sg, uap->link);
	return symlink(td, (struct symlink_args *)uap);
}

/*
 * iBCS2 rename(2): CHECKALTEXIST on the source and CHECKALTCREAT on the
 * destination, then the native rename().
 */
int
ibcs2_rename(td, uap)
	struct thread *td;
	struct ibcs2_rename_args *uap;
{
	caddr_t sg = stackgap_init();

	CHECKALTEXIST(td, &sg, uap->from);
	CHECKALTCREAT(td, &sg, uap->to);
	return rename(td, (struct rename_args *)uap);
}

/* iBCS2 readlink(2): CHECKALTEXIST on the path, then native readlink(). */
int
ibcs2_readlink(td, uap)
	struct thread *td;
	struct ibcs2_readlink_args *uap;
{
	caddr_t sg = stackgap_init();

	CHECKALTEXIST(td, &sg, uap->path);
	return readlink(td, (struct readlink_args *) uap);
}
Index: head/sys/kern/init_main.c =================================================================== --- head/sys/kern/init_main.c (revision 130550) +++ head/sys/kern/init_main.c (revision 130551) @@ -1,733 +1,733 @@ /* * Copyright (c) 1995 Terrence R. Lambert * All rights reserved. * * Copyright (c) 1982, 1986, 1989, 1991, 1992, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc.
and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)init_main.c 8.9 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_init_path.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include void mi_startup(void); /* Should be elsewhere */ /* Components of the first process -- never freed. */ static struct session session0; static struct pgrp pgrp0; struct proc proc0; struct thread thread0; struct kse kse0; struct ksegrp ksegrp0; static struct filedesc0 filedesc0; struct vmspace vmspace0; struct proc *initproc; struct vnode *rootvp; int boothowto = 0; /* initialized so that it can be patched */ SYSCTL_INT(_debug, OID_AUTO, boothowto, CTLFLAG_RD, &boothowto, 0, ""); int bootverbose; SYSCTL_INT(_debug, OID_AUTO, bootverbose, CTLFLAG_RW, &bootverbose, 0, ""); /* * This ensures that there is at least one entry so that the sysinit_set * symbol is not undefined. A subsystem ID of SI_SUB_DUMMY is never * executed. */ SYSINIT(placeholder, SI_SUB_DUMMY, SI_ORDER_ANY, NULL, NULL) /* * The sysinit table itself. Items are checked off as they are run. * If we want to register new sysinit types, add them to newsysinit. */ SET_DECLARE(sysinit_set, struct sysinit); struct sysinit **sysinit, **sysinit_end; struct sysinit **newsysinit, **newsysinit_end; /* * Merge a new sysinit set into the current set, reallocating it if * necessary. This can only be called after malloc is running.
*/ void sysinit_add(struct sysinit **set, struct sysinit **set_end) { struct sysinit **newset; struct sysinit **sipp; struct sysinit **xipp; int count; count = set_end - set; if (newsysinit) count += newsysinit_end - newsysinit; else count += sysinit_end - sysinit; newset = malloc(count * sizeof(*sipp), M_TEMP, M_NOWAIT); if (newset == NULL) panic("cannot malloc for sysinit"); xipp = newset; if (newsysinit) for (sipp = newsysinit; sipp < newsysinit_end; sipp++) *xipp++ = *sipp; else for (sipp = sysinit; sipp < sysinit_end; sipp++) *xipp++ = *sipp; for (sipp = set; sipp < set_end; sipp++) *xipp++ = *sipp; if (newsysinit) free(newsysinit, M_TEMP); newsysinit = newset; newsysinit_end = newset + count; } /* * System startup; initialize the world, create process 0, mount root * filesystem, and fork to create init and pagedaemon. Most of the * hard work is done in the lower-level initialization routines including * startup(), which does memory initialization and autoconfiguration. * * This allows simple addition of new kernel subsystems that require * boot time initialization. It also allows substitution of subsystem * (for instance, a scheduler, kernel profiler, or VM system) by object * module. Finally, it allows for optional "kernel threads". */ void mi_startup(void) { register struct sysinit **sipp; /* system initialization*/ register struct sysinit **xipp; /* interior loop of sort*/ register struct sysinit *save; /* bubble*/ if (sysinit == NULL) { sysinit = SET_BEGIN(sysinit_set); sysinit_end = SET_LIMIT(sysinit_set); } restart: /* * Perform a bubble sort of the system initialization objects by * their subsystem (primary key) and order (secondary key). 
*/
	for (sipp = sysinit; sipp < sysinit_end; sipp++) {
		for (xipp = sipp + 1; xipp < sysinit_end; xipp++) {
			/* already in order (ties keep their position) */
			if ((*sipp)->subsystem < (*xipp)->subsystem ||
			     ((*sipp)->subsystem == (*xipp)->subsystem &&
			      (*sipp)->order <= (*xipp)->order))
				continue;	/* skip*/
			save = *sipp;
			*sipp = *xipp;
			*xipp = save;
		}
	}

	/*
	 * Traverse the (now) ordered list of system initialization tasks.
	 * Perform each task, and continue on to the next task.
	 *
	 * The last item on the list is expected to be the scheduler,
	 * which will not return.
	 */
	for (sipp = sysinit; sipp < sysinit_end; sipp++) {

		if ((*sipp)->subsystem == SI_SUB_DUMMY)
			continue;	/* skip dummy task(s)*/

		/* already run on an earlier pass (see the restart below) */
		if ((*sipp)->subsystem == SI_SUB_DONE)
			continue;

		/* Call function */
		(*((*sipp)->func))((*sipp)->udata);

		/* Check off the one we're just done */
		(*sipp)->subsystem = SI_SUB_DONE;

		/* Check if we've installed more sysinit items via KLD */
		if (newsysinit != NULL) {
			/*
			 * Switch to the merged table built by sysinit_add()
			 * and start over; the linker-set table itself was
			 * not malloc'ed, so it is never freed.
			 */
			if (sysinit != SET_BEGIN(sysinit_set))
				free(sysinit, M_TEMP);
			sysinit = newsysinit;
			sysinit_end = newsysinit_end;
			newsysinit = NULL;
			newsysinit_end = NULL;
			goto restart;
		}
	}

	panic("Shouldn't get here!");
	/* NOTREACHED*/
}


/*
 ***************************************************************************
 ****
 ****	The following SYSINIT's belong elsewhere, but have not yet
 ****	been moved.
**** *************************************************************************** */ static void print_caddr_t(void *data __unused) { printf("%s", (char *)data); } SYSINIT(announce, SI_SUB_COPYRIGHT, SI_ORDER_FIRST, print_caddr_t, copyright) SYSINIT(version, SI_SUB_COPYRIGHT, SI_ORDER_SECOND, print_caddr_t, version) #ifdef WITNESS static char wit_warn[] = "WARNING: WITNESS option enabled, expect reduced performance.\n"; SYSINIT(witwarn, SI_SUB_COPYRIGHT, SI_ORDER_SECOND + 1, print_caddr_t, wit_warn) #endif #ifdef DIAGNOSTIC static char diag_warn[] = "WARNING: DIAGNOSTIC option enabled, expect reduced performance.\n"; SYSINIT(diagwarn, SI_SUB_COPYRIGHT, SI_ORDER_SECOND + 2, print_caddr_t, diag_warn) #endif static void set_boot_verbose(void *data __unused) { if (boothowto & RB_VERBOSE) bootverbose++; } SYSINIT(boot_verbose, SI_SUB_TUNABLES, SI_ORDER_ANY, set_boot_verbose, NULL) struct sysentvec null_sysvec = { 0, NULL, 0, 0, NULL, 0, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "null", NULL, NULL, 0, PAGE_SIZE, VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS, USRSTACK, PS_STRINGS, VM_PROT_ALL, NULL, NULL, NULL }; /* *************************************************************************** **** **** The two following SYSINIT's are proc0 specific glue code. I am not **** convinced that they can not be safely combined, but their order of **** operation has been maintained as the same as the original init_main.c **** for right now. **** **** These probably belong in init_proc.c or kern_proc.c, since they **** deal with proc0 (the fork template process). 
****
 ***************************************************************************
 */
/*
 * Build process 0 by hand from the statically allocated proc0, thread0,
 * kse0, ksegrp0, pgrp0 and session0: link them together, insert proc0
 * into the process and pgrp hash lists, and give it its initial
 * scheduling state, name, credentials, descriptor table, limits and
 * address space.  Must be called with Giant held.
 */
/* ARGSUSED*/
static void
proc0_init(void *dummy __unused)
{
	register struct proc		*p;
	register struct filedesc0	*fdp;
	register unsigned i;
	struct thread *td;
	struct ksegrp *kg;
	struct kse *ke;

	GIANT_REQUIRED;
	p = &proc0;
	td = &thread0;
	ke = &kse0;
	kg = &ksegrp0;

	/* Attach the scheduler's statically provided private data. */
	ke->ke_sched = kse0_sched;
	kg->kg_sched = ksegrp0_sched;
	p->p_sched = proc0_sched;
	td->td_sched = thread0_sched;

	/*
	 * Initialize magic number.
	 */
	p->p_magic = P_MAGIC;

	/*
	 * Initialize thread, process and pgrp structures.
	 */
	procinit();
	threadinit();

	/*
	 * Initialize sleep queue hash table
	 */
	sleepinit();

	/*
	 * additional VM structures
	 */
	vm_init2();

	/*
	 * Create process 0 (the swapper).
	 */
	LIST_INSERT_HEAD(&allproc, p, p_list);
	LIST_INSERT_HEAD(PIDHASH(0), p, p_hash);
	mtx_init(&pgrp0.pg_mtx, "process group", NULL, MTX_DEF | MTX_DUPOK);
	p->p_pgrp = &pgrp0;
	LIST_INSERT_HEAD(PGRPHASH(0), &pgrp0, pg_hash);
	LIST_INIT(&pgrp0.pg_members);
	LIST_INSERT_HEAD(&pgrp0.pg_members, p, p_pglist);

	/* proc0 leads session0, which is the session of pgrp0. */
	pgrp0.pg_session = &session0;
	mtx_init(&session0.s_mtx, "session", NULL, MTX_DEF);
	session0.s_count = 1;
	session0.s_leader = p;

	p->p_sysent = &null_sysvec;

	/*
	 * proc_linkup was already done in init_i386() or alphainit() etc.
	 * because the earlier code needed to follow td->td_proc. Otherwise
	 * I would have done it here.. maybe this means this should be
	 * done earlier too.
	 */
	p->p_flag = P_SYSTEM;
	p->p_sflag = PS_INMEM;
	p->p_state = PRS_NORMAL;
+	p->p_nice = NZERO;
	td->td_state = TDS_RUNNING;
-	kg->kg_nice = NZERO;
	kg->kg_pri_class = PRI_TIMESHARE;
	kg->kg_user_pri = PUSER;
	td->td_priority = PVM;
	td->td_base_pri = PUSER;
	td->td_kse = ke; /* XXXKSE */
	td->td_oncpu = 0;
	ke->ke_state = KES_THREAD;
	ke->ke_thread = td;
	p->p_peers = 0;
	p->p_leader = p;


	/* sizeof ("swapper") includes the terminating NUL */
	bcopy("swapper", p->p_comm, sizeof ("swapper"));

	callout_init(&p->p_itcallout, CALLOUT_MPSAFE);
	callout_init(&td->td_slpcallout, CALLOUT_MPSAFE);

	/* Create credentials.
*/
	p->p_ucred = crget();
	p->p_ucred->cr_ngroups = 1;	/* group 0 */
	p->p_ucred->cr_uidinfo = uifind(0);
	p->p_ucred->cr_ruidinfo = uifind(0);
	p->p_ucred->cr_prison = NULL;	/* Don't jail it. */
#ifdef MAC
	mac_create_proc0(p->p_ucred);
#endif
	/* thread0 takes its own reference on the process credential */
	td->td_ucred = crhold(p->p_ucred);

	/* Create sigacts. */
	p->p_sigacts = sigacts_alloc();

	/* Initialize signal state for process 0. */
	siginit(&proc0);

	/* Create the file descriptor table. */
	/* XXX this duplicates part of fdinit() */
	fdp = &filedesc0;
	p->p_fd = &fdp->fd_fd;
	p->p_fdtol = NULL;
	mtx_init(&fdp->fd_fd.fd_mtx, FILEDESC_LOCK_DESC, NULL, MTX_DEF);
	fdp->fd_fd.fd_refcnt = 1;
	fdp->fd_fd.fd_cmask = CMASK;
	/* the table starts out using the arrays embedded in filedesc0 */
	fdp->fd_fd.fd_ofiles = fdp->fd_dfiles;
	fdp->fd_fd.fd_ofileflags = fdp->fd_dfileflags;
	fdp->fd_fd.fd_nfiles = NDFILE;
	fdp->fd_fd.fd_map = fdp->fd_dmap;

	/* Create the limits structures. */
	p->p_limit = lim_alloc();
	/* everything is unlimited unless overridden below */
	for (i = 0; i < RLIM_NLIMITS; i++)
		p->p_limit->pl_rlimit[i].rlim_cur =
		    p->p_limit->pl_rlimit[i].rlim_max = RLIM_INFINITY;
	p->p_limit->pl_rlimit[RLIMIT_NOFILE].rlim_cur =
	    p->p_limit->pl_rlimit[RLIMIT_NOFILE].rlim_max = maxfiles;
	p->p_limit->pl_rlimit[RLIMIT_NPROC].rlim_cur =
	    p->p_limit->pl_rlimit[RLIMIT_NPROC].rlim_max = maxproc;
	/* RSS and MEMLOCK limits are derived from the free page count */
	i = ptoa(cnt.v_free_count);
	p->p_limit->pl_rlimit[RLIMIT_RSS].rlim_max = i;
	p->p_limit->pl_rlimit[RLIMIT_MEMLOCK].rlim_max = i;
	p->p_limit->pl_rlimit[RLIMIT_MEMLOCK].rlim_cur = i / 3;
	p->p_cpulimit = RLIM_INFINITY;

	/* Allocate a prototype map so we have something to fork. */
	pmap_pinit0(vmspace_pmap(&vmspace0));
	p->p_vmspace = &vmspace0;
	vmspace0.vm_refcnt = 1;
	vm_map_init(&vmspace0.vm_map, p->p_sysent->sv_minuser,
	    p->p_sysent->sv_maxuser);
	vmspace0.vm_map.pmap = vmspace_pmap(&vmspace0);

	/*
	 * We continue to place resource usage info
	 * in the user struct so that it's pageable.
	 */
	p->p_stats = &p->p_uarea->u_stats;

	/*
	 * Charge root for one process.
*/
	(void)chgproccnt(p->p_ucred->cr_ruidinfo, 1, 0);
}
SYSINIT(p0init, SI_SUB_INTRINSIC, SI_ORDER_FIRST, proc0_init, NULL)

/*
 * Runs at SI_SUB_INTRINSIC_POST: reset the start time and accumulated
 * run time of every process on allproc, restart this CPU's switch-time
 * accounting, and seed random() from the current time.
 */
/* ARGSUSED*/
static void
proc0_post(void *dummy __unused)
{
	struct timespec ts;
	struct proc *p;

	/*
	 * Now we can look at the time, having had a chance to verify the
	 * time from the filesystem.  Pretend that proc0 started now.
	 */
	sx_slock(&allproc_lock);
	LIST_FOREACH(p, &allproc, p_list) {
		microuptime(&p->p_stats->p_start);
		p->p_runtime.sec = 0;
		p->p_runtime.frac = 0;
	}
	sx_sunlock(&allproc_lock);
	binuptime(PCPU_PTR(switchtime));
	PCPU_SET(switchticks, ticks);

	/*
	 * Give the ``random'' number generator a thump.
	 */
	nanotime(&ts);
	srandom(ts.tv_sec ^ ts.tv_nsec);
}
SYSINIT(p0post, SI_SUB_INTRINSIC_POST, SI_ORDER_FIRST, proc0_post, NULL)

/*
 ***************************************************************************
 ****
 ****	The following SYSINIT's and glue code should be moved to the
 ****	respective files on a per subsystem basis.
 ****
 ***************************************************************************
 */


/*
 ***************************************************************************
 ****
 ****	The following code probably belongs in another file, like
 ****	kern/init_init.c.
 ****
 ***************************************************************************
 */

/*
 * List of paths to try when searching for "init".
 * The entries are separated by ':' and tried in order by start_init().
 */
static char init_path[MAXPATHLEN] =
#ifdef	INIT_PATH
    __XSTRING(INIT_PATH);
#else
    "/sbin/init:/sbin/oinit:/sbin/init.bak:/stand/sysinstall";
#endif
SYSCTL_STRING(_kern, OID_AUTO, init_path, CTLFLAG_RD, init_path, 0,
	"Path used to search the init process");

/*
 * Start the initial user process; try exec'ing each pathname in init_path.
 * The program is invoked with one argument containing the boot flags.
*/
static void
start_init(void *dummy)
{
	vm_offset_t addr;
	struct execve_args args;
	int options, error;
	char *var, *path, *next, *s;
	char *ucp, **uap, *arg0, *arg1;
	struct thread *td;
	struct proc *p;
	int init_does_devfs = 0;

	/* Giant is released again only on the successful execve() path. */
	mtx_lock(&Giant);

	GIANT_REQUIRED;

	td = curthread;
	p = td->td_proc;

	vfs_mountroot();

	/* Get the vnode for '/'.  Set p->p_fd->fd_cdir to reference it. */
	if (VFS_ROOT(TAILQ_FIRST(&mountlist), &rootvnode))
		panic("cannot find root vnode");
	FILEDESC_LOCK(p->p_fd);
	/* root becomes both the current and the root directory */
	p->p_fd->fd_cdir = rootvnode;
	VREF(p->p_fd->fd_cdir);
	p->p_fd->fd_rdir = rootvnode;
	VREF(p->p_fd->fd_rdir);
	FILEDESC_UNLOCK(p->p_fd);
	VOP_UNLOCK(rootvnode, 0, td);
#ifdef MAC
	mac_create_root_mount(td->td_ucred, TAILQ_FIRST(&mountlist));
#endif

	/*
	 * For disk based systems, we probably cannot do this yet
	 * since the fs will be read-only.  But a NFS root
	 * might be ok.  It is worth a shot.
	 */
	error = kern_mkdir(td, "/dev", UIO_SYSSPACE, 0700);
	if (error == EEXIST)
		error = 0;
	if (error == 0)
		error = kernel_vmount(0, "fstype", "devfs",
		    "fspath", "/dev", NULL);
	/* if devfs could not be mounted here, init is told via the 'd' flag */
	if (error != 0)
		init_does_devfs = 1;

	/*
	 * Need just enough stack to hold the faked-up "execve()" arguments.
	 */
	addr = p->p_sysent->sv_usrstack - PAGE_SIZE;
	if (vm_map_find(&p->p_vmspace->vm_map, NULL, 0, &addr, PAGE_SIZE,
			FALSE, VM_PROT_ALL, VM_PROT_ALL, 0) != 0)
		panic("init: couldn't allocate argument space");
	p->p_vmspace->vm_maxsaddr = (caddr_t)addr;
	p->p_vmspace->vm_ssize = 1;

	/* the "init_path" kernel environment variable overrides the default */
	if ((var = getenv("init_path")) != NULL) {
		strlcpy(init_path, var, sizeof(init_path));
		freeenv(var);
	}

	/* walk the ':'-separated list; [path, next) is the current entry */
	for (path = init_path; *path != '\0'; path = next) {
		while (*path == ':')
			path++;
		if (*path == '\0')
			break;
		for (next = path; *next != '\0' && *next != ':'; next++)
			/* nothing */ ;
		if (bootverbose)
			printf("start_init: trying %.*s\n", (int)(next - path),
			    path);

		/*
		 * Move out the boot flag argument.
*/
		/* strings are stored backwards from the top of the user stack */
		options = 0;
		ucp = (char *)p->p_sysent->sv_usrstack;
		(void)subyte(--ucp, 0);		/* trailing zero */
		if (boothowto & RB_SINGLE) {
			(void)subyte(--ucp, 's');
			options = 1;
		}
#ifdef notyet
		if (boothowto & RB_FASTBOOT) {
			(void)subyte(--ucp, 'f');
			options = 1;
		}
#endif

#ifdef BOOTCDROM
		(void)subyte(--ucp, 'C');
		options = 1;
#endif
		if (init_does_devfs) {
			(void)subyte(--ucp, 'd');
			options = 1;
		}

		/* with no flags set the argument is "--" */
		if (options == 0)
			(void)subyte(--ucp, '-');
		(void)subyte(--ucp, '-');		/* leading hyphen */
		arg1 = ucp;

		/*
		 * Move out the file name (also arg 0).
		 */
		(void)subyte(--ucp, 0);
		for (s = next - 1; s >= path; s--)
			(void)subyte(--ucp, *s);
		arg0 = ucp;

		/*
		 * Move out the arg pointers.
		 */
		/* round down to an intptr_t boundary before storing pointers */
		uap = (char **)((intptr_t)ucp & ~(sizeof(intptr_t)-1));
		(void)suword((caddr_t)--uap, (long)0);	/* terminator */
		(void)suword((caddr_t)--uap, (long)(intptr_t)arg1);
		(void)suword((caddr_t)--uap, (long)(intptr_t)arg0);

		/*
		 * Point at the arguments.
		 */
		args.fname = arg0;
		args.argv = uap;
		args.envv = NULL;

		/*
		 * Now try to exec the program.  If can't for any reason
		 * other than it doesn't exist, complain.
		 *
		 * Otherwise, return via fork_trampoline() all the way
		 * to user mode as init!
		 */
		if ((error = execve(td, &args)) == 0) {
			mtx_unlock(&Giant);
			return;
		}
		if (error != ENOENT)
			printf("exec %.*s: error %d\n",
			    (int)(next - path), path, error);
	}
	/* every entry in init_path failed */
	printf("init: not found in path %s\n", init_path);
	panic("no init");
}

/*
 * Like kthread_create(), but runs in its own address space.
 * We do this early to reserve pid 1.
 *
 * Note special case - do not make it runnable yet.  Other work
 * in progress will change this more.
*/
static void
create_init(const void *udata __unused)
{
	struct ucred *newcred, *oldcred;
	int error;

	/* RFSTOPPED: the new process is created but not yet runnable */
	error = fork1(&thread0, RFFDG | RFPROC | RFSTOPPED, 0, &initproc);
	if (error)
		panic("cannot fork init: %d\n", error);
	KASSERT(initproc->p_pid == 1, ("create_init: initproc->p_pid != 1"));
	/* divorce init's credentials from the kernel's */
	newcred = crget();
	PROC_LOCK(initproc);
	initproc->p_flag |= P_SYSTEM;
	oldcred = initproc->p_ucred;
	crcopy(newcred, oldcred);
#ifdef MAC
	mac_create_proc1(newcred);
#endif
	initproc->p_ucred = newcred;
	PROC_UNLOCK(initproc);
	/* the old credential is released only after the proc lock is dropped */
	crfree(oldcred);
	cred_update_thread(FIRST_THREAD_IN_PROC(initproc));
	mtx_lock_spin(&sched_lock);
	initproc->p_sflag |= PS_INMEM;
	mtx_unlock_spin(&sched_lock);
	/* when first scheduled, the new thread begins in start_init() */
	cpu_set_fork_handler(FIRST_THREAD_IN_PROC(initproc), start_init, NULL);
}
SYSINIT(init, SI_SUB_CREATE_INIT, SI_ORDER_FIRST, create_init, NULL)

/*
 * Make it runnable now.
 *
 * Marks init's first thread as able to run and puts it on the run queue,
 * under sched_lock.
 */
static void
kick_init(const void *udata __unused)
{
	struct thread *td;

	td = FIRST_THREAD_IN_PROC(initproc);
	mtx_lock_spin(&sched_lock);
	TD_SET_CAN_RUN(td);
	setrunqueue(td);	/* XXXKSE */
	mtx_unlock_spin(&sched_lock);
}
SYSINIT(kickinit, SI_SUB_KTHREAD_INIT, SI_ORDER_FIRST, kick_init, NULL)
Index: head/sys/kern/kern_clock.c =================================================================== --- head/sys/kern/kern_clock.c (revision 130550) +++ head/sys/kern/kern_clock.c (revision 130551) @@ -1,555 +1,555 @@ /*- * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1.
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_clock.c 8.5 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_ntp.h" #include "opt_ddb.h" #include "opt_watchdog.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef GPROF #include #endif #ifdef DDB #include #endif #ifdef DEVICE_POLLING extern void hardclock_device_poll(void); #endif /* DEVICE_POLLING */ static void initclocks(void *dummy); SYSINIT(clocks, SI_SUB_CLOCKS, SI_ORDER_FIRST, initclocks, NULL) /* Some of these don't belong here, but it's easiest to concentrate them. 
*/ long cp_time[CPUSTATES]; SYSCTL_OPAQUE(_kern, OID_AUTO, cp_time, CTLFLAG_RD, &cp_time, sizeof(cp_time), "LU", "CPU time statistics"); #ifdef SW_WATCHDOG #include static int watchdog_ticks; static int watchdog_enabled; static void watchdog_fire(void); static void watchdog_config(void *, u_int, int *); #endif /* SW_WATCHDOG */ /* * Clock handling routines. * * This code is written to operate with two timers that run independently of * each other. * * The main timer, running hz times per second, is used to trigger interval * timers, timeouts and rescheduling as needed. * * The second timer handles kernel and user profiling, * and does resource use estimation. If the second timer is programmable, * it is randomized to avoid aliasing between the two clocks. For example, * the randomization prevents an adversary from always giving up the cpu * just before its quantum expires. Otherwise, it would never accumulate * cpu ticks. The mean frequency of the second timer is stathz. * * If no second timer exists, stathz will be zero; in this case we drive * profiling and statistics off the main clock. This WILL NOT be accurate; * do not do it unless absolutely necessary. * * The statistics clock may (or may not) be run at a higher rate while * profiling. This profile clock runs at profhz. We require that profhz * be an integral multiple of stathz. * * If the statistics clock is running fast, it must be divided by the ratio * profhz/stathz for statistics. (For profiling, every tick counts.) * * Time-of-day is maintained using a "timecounter", which may or may * not be related to the hardware generating the above mentioned * interrupts. */ int stathz; int profhz; int profprocs; int ticks; int psratio; /* * Initialize clock frequencies and start both clocks running. */ /* ARGSUSED*/ static void initclocks(dummy) void *dummy; { register int i; /* * Set divisors to 1 (normal case) and let the machine-specific * code do its bit. 
*/ cpu_initclocks(); /* * Compute profhz/stathz, and fix profhz if needed. */ i = stathz ? stathz : hz; if (profhz == 0) profhz = i; psratio = profhz / i; #ifdef SW_WATCHDOG EVENTHANDLER_REGISTER(watchdog_list, watchdog_config, NULL, 0); #endif } /* * Each time the real-time timer fires, this function is called on all CPUs. * Note that hardclock() calls hardclock_process() for the boot CPU, so only * the other CPUs in the system need to call this function. */ void hardclock_process(frame) register struct clockframe *frame; { struct pstats *pstats; struct thread *td = curthread; struct proc *p = td->td_proc; /* * Run current process's virtual and profile time, as needed. */ mtx_lock_spin_flags(&sched_lock, MTX_QUIET); if (p->p_flag & P_SA) { /* XXXKSE What to do? */ } else { pstats = p->p_stats; if (CLKF_USERMODE(frame) && timevalisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value) && itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0) { p->p_sflag |= PS_ALRMPEND; td->td_flags |= TDF_ASTPENDING; } if (timevalisset(&pstats->p_timer[ITIMER_PROF].it_value) && itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0) { p->p_sflag |= PS_PROFPEND; td->td_flags |= TDF_ASTPENDING; } } mtx_unlock_spin_flags(&sched_lock, MTX_QUIET); } /* * The real-time timer, interrupting hz times per second. */ void hardclock(frame) register struct clockframe *frame; { int need_softclock = 0; CTR0(KTR_CLK, "hardclock fired"); hardclock_process(frame); tc_ticktock(); /* * If no separate statistics clock is available, run it from here. * * XXX: this only works for UP */ if (stathz == 0) { profclock(frame); statclock(frame); } #ifdef DEVICE_POLLING hardclock_device_poll(); /* this is very short and quick */ #endif /* DEVICE_POLLING */ /* * Process callouts at a very low cpu priority, so we don't keep the * relatively high clock interrupt priority any longer than necessary. 
*/ mtx_lock_spin_flags(&callout_lock, MTX_QUIET); ticks++; if (TAILQ_FIRST(&callwheel[ticks & callwheelmask]) != NULL) { need_softclock = 1; } else if (softticks + 1 == ticks) ++softticks; mtx_unlock_spin_flags(&callout_lock, MTX_QUIET); /* * swi_sched acquires sched_lock, so we don't want to call it with * callout_lock held; incorrect locking order. */ if (need_softclock) swi_sched(softclock_ih, 0); #ifdef SW_WATCHDOG if (watchdog_enabled > 0 && --watchdog_ticks <= 0) watchdog_fire(); #endif /* SW_WATCHDOG */ } /* * Compute number of ticks in the specified amount of time. * * tv is the time interval to convert. A negative interval yields 1 * (and, under DIAGNOSTIC, a console message); otherwise the result is * the rounded-up tick count plus one, clamped to INT_MAX. */ int tvtohz(tv) struct timeval *tv; { register unsigned long ticks; register long sec, usec; /* * If the number of usecs in the whole seconds part of the time * difference fits in a long, then the total number of usecs will * fit in an unsigned long. Compute the total and convert it to * ticks, rounding up and adding 1 to allow for the current tick * to expire. Rounding also depends on unsigned long arithmetic * to avoid overflow. * * Otherwise, if the number of ticks in the whole seconds part of * the time difference fits in a long, then convert the parts to * ticks separately and add, using similar rounding methods and * overflow avoidance. This method would work in the previous * case but it is slightly slower and assumes that hz is integral. * * Otherwise, round the time difference down to the maximum * representable value. * * If ints have 32 bits, then the maximum value for any timeout in * 10ms ticks is 248 days. 
*/ sec = tv->tv_sec; usec = tv->tv_usec; if (usec < 0) { sec--; usec += 1000000; } if (sec < 0) { #ifdef DIAGNOSTIC if (usec > 0) { sec++; usec -= 1000000; } printf("tvtohz: negative time difference %ld sec %ld usec\n", sec, usec); #endif ticks = 1; } else if (sec <= LONG_MAX / 1000000) ticks = (sec * 1000000 + (unsigned long)usec + (tick - 1)) / tick + 1; else if (sec <= LONG_MAX / hz) ticks = sec * hz + ((unsigned long)usec + (tick - 1)) / tick + 1; else ticks = LONG_MAX; if (ticks > INT_MAX) ticks = INT_MAX; return ((int)ticks); } /* * Start profiling on a process. * * Kernel profiling passes proc0 which never exits and hence * keeps the profile clock running constantly. */ void startprofclock(p) register struct proc *p; { /* * XXX; Right now sched_lock protects statclock(), but perhaps * it should be protected later on by a time_lock, which would * cover psdiv, etc. as well. */ PROC_LOCK_ASSERT(p, MA_OWNED); if (p->p_flag & P_STOPPROF) return; if ((p->p_flag & P_PROFIL) == 0) { mtx_lock_spin(&sched_lock); p->p_flag |= P_PROFIL; if (++profprocs == 1) cpu_startprofclock(); mtx_unlock_spin(&sched_lock); } } /* * Stop profiling on a process. */ void stopprofclock(p) register struct proc *p; { PROC_LOCK_ASSERT(p, MA_OWNED); if (p->p_flag & P_PROFIL) { if (p->p_profthreads != 0) { p->p_flag |= P_STOPPROF; while (p->p_profthreads != 0) msleep(&p->p_profthreads, &p->p_mtx, PPAUSE, "stopprof", 0); p->p_flag &= ~P_STOPPROF; } if ((p->p_flag & P_PROFIL) == 0) return; mtx_lock_spin(&sched_lock); p->p_flag &= ~P_PROFIL; if (--profprocs == 0) cpu_stopprofclock(); mtx_unlock_spin(&sched_lock); } } /* * Statistics clock. Grab profile sample, and if divider reaches 0, * do process and kernel statistics. Most of the statistics are only * used by user-level statistics programs. The main exceptions are * ke->ke_uticks, p->p_sticks, p->p_iticks, and p->p_estcpu. * This should be called by all active processors. 
*/ void statclock(frame) register struct clockframe *frame; { struct pstats *pstats; struct rusage *ru; struct vmspace *vm; struct thread *td; struct proc *p; long rss; td = curthread; p = td->td_proc; mtx_lock_spin_flags(&sched_lock, MTX_QUIET); if (CLKF_USERMODE(frame)) { /* * Charge the time as appropriate. */ if (p->p_flag & P_SA) thread_statclock(1); p->p_uticks++; - if (td->td_ksegrp->kg_nice > NZERO) + if (p->p_nice > NZERO) cp_time[CP_NICE]++; else cp_time[CP_USER]++; } else { /* * Came from kernel mode, so we were: * - handling an interrupt, * - doing syscall or trap work on behalf of the current * user process, or * - spinning in the idle loop. * Whichever it is, charge the time as appropriate. * Note that we charge interrupts to the current process, * regardless of whether they are ``for'' that process, * so that we know how much of its real time was spent * in ``non-process'' (i.e., interrupt) work. */ if ((td->td_ithd != NULL) || td->td_intr_nesting_level >= 2) { p->p_iticks++; cp_time[CP_INTR]++; } else { if (p->p_flag & P_SA) thread_statclock(0); td->td_sticks++; p->p_sticks++; if (p != PCPU_GET(idlethread)->td_proc) cp_time[CP_SYS]++; else cp_time[CP_IDLE]++; } } sched_clock(td); /* Update resource usage integrals and maximums. */ if ((pstats = p->p_stats) != NULL && (ru = &pstats->p_ru) != NULL && (vm = p->p_vmspace) != NULL) { ru->ru_ixrss += pgtok(vm->vm_tsize); ru->ru_idrss += pgtok(vm->vm_dsize); ru->ru_isrss += pgtok(vm->vm_ssize); rss = pgtok(vmspace_resident_count(vm)); if (ru->ru_maxrss < rss) ru->ru_maxrss = rss; } mtx_unlock_spin_flags(&sched_lock, MTX_QUIET); } void profclock(frame) register struct clockframe *frame; { struct thread *td; #ifdef GPROF struct gmonparam *g; int i; #endif td = curthread; if (CLKF_USERMODE(frame)) { /* * Came from user mode; CPU was in user state. * If this process is being profiled, record the tick. * if there is no related user location yet, don't * bother trying to count it. 
*/ if (td->td_proc->p_flag & P_PROFIL) addupc_intr(td, CLKF_PC(frame), 1); } #ifdef GPROF else { /* * Kernel statistics are just like addupc_intr, only easier. */ g = &_gmonparam; if (g->state == GMON_PROF_ON) { i = CLKF_PC(frame) - g->lowpc; if (i < g->textsize) { i /= HISTFRACTION * sizeof(*g->kcount); g->kcount[i]++; } } } #endif } /* * Return information about system clocks. */ static int sysctl_kern_clockrate(SYSCTL_HANDLER_ARGS) { struct clockinfo clkinfo; /* * Construct clockinfo structure. */ bzero(&clkinfo, sizeof(clkinfo)); clkinfo.hz = hz; clkinfo.tick = tick; clkinfo.profhz = profhz; clkinfo.stathz = stathz ? stathz : hz; return (sysctl_handle_opaque(oidp, &clkinfo, sizeof clkinfo, req)); } SYSCTL_PROC(_kern, KERN_CLOCKRATE, clockrate, CTLTYPE_STRUCT|CTLFLAG_RD, 0, 0, sysctl_kern_clockrate, "S,clockinfo", "Rate and period of various kernel clocks"); #ifdef SW_WATCHDOG static void watchdog_config(void *unused __unused, u_int cmd, int *err) { u_int u; u = cmd & WD_INTERVAL; if (cmd && u >= WD_TO_1SEC) { u = cmd & WD_INTERVAL; watchdog_ticks = (1 << (u - WD_TO_1SEC)) * hz; watchdog_enabled = 1; *err = 0; } else { watchdog_enabled = 0; } } /* * Handle a watchdog timeout by dumping interrupt information and * then either dropping to DDB or panicking. 
*/ static void watchdog_fire(void) { int nintr; u_int64_t inttotal; u_long *curintr; char *curname; curintr = intrcnt; curname = intrnames; inttotal = 0; nintr = eintrcnt - intrcnt; printf("interrupt total\n"); while (--nintr >= 0) { if (*curintr) printf("%-12s %20lu\n", curname, *curintr); curname += strlen(curname) + 1; inttotal += *curintr++; } printf("Total %20ju\n", (uintmax_t)inttotal); #ifdef DDB db_print_backtrace(); Debugger("watchdog timeout"); #else /* !DDB */ panic("watchdog timeout"); #endif /* DDB */ } #endif /* SW_WATCHDOG */ Index: head/sys/kern/kern_proc.c =================================================================== --- head/sys/kern/kern_proc.c (revision 130550) +++ head/sys/kern/kern_proc.c (revision 130551) @@ -1,1242 +1,1242 @@ /* * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_proc.c 8.7 (Berkeley) 2/14/95 * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include "opt_ktrace.h" #include "opt_kstack_pages.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #include #endif #include #include #include #include #include #include MALLOC_DEFINE(M_PGRP, "pgrp", "process group header"); MALLOC_DEFINE(M_SESSION, "session", "session header"); static MALLOC_DEFINE(M_PROC, "proc", "Proc structures"); MALLOC_DEFINE(M_SUBPROC, "subproc", "Proc sub-structures"); static void doenterpgrp(struct proc *, struct pgrp *); static void orphanpg(struct pgrp *pg); static void pgadjustjobc(struct pgrp *pgrp, int entering); static void pgdelete(struct pgrp *); static void proc_ctor(void *mem, int size, void *arg); static void proc_dtor(void *mem, int size, void *arg); static void proc_init(void *mem, int size); static void proc_fini(void *mem, int size); /* * Other process lists */ struct pidhashhead *pidhashtbl; u_long pidhash; struct pgrphashhead *pgrphashtbl; u_long pgrphash; struct proclist allproc; struct proclist zombproc; struct sx allproc_lock; struct sx proctree_lock; struct mtx pargs_ref_lock; struct mtx ppeers_lock; uma_zone_t proc_zone; uma_zone_t ithread_zone; int kstack_pages = KSTACK_PAGES; int uarea_pages = UAREA_PAGES; SYSCTL_INT(_kern, OID_AUTO, kstack_pages, CTLFLAG_RD, &kstack_pages, 0, ""); 
SYSCTL_INT(_kern, OID_AUTO, uarea_pages, CTLFLAG_RD, &uarea_pages, 0, ""); #define RANGEOF(type, start, end) (offsetof(type, end) - offsetof(type, start)) CTASSERT(sizeof(struct kinfo_proc) == KINFO_PROC_SIZE); /* * Initialize global process hashing structures. */ void procinit() { sx_init(&allproc_lock, "allproc"); sx_init(&proctree_lock, "proctree"); mtx_init(&pargs_ref_lock, "struct pargs.ref", NULL, MTX_DEF); mtx_init(&ppeers_lock, "p_peers", NULL, MTX_DEF); LIST_INIT(&allproc); LIST_INIT(&zombproc); pidhashtbl = hashinit(maxproc / 4, M_PROC, &pidhash); pgrphashtbl = hashinit(maxproc / 4, M_PROC, &pgrphash); proc_zone = uma_zcreate("PROC", sched_sizeof_proc(), proc_ctor, proc_dtor, proc_init, proc_fini, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uihashinit(); } /* * Prepare a proc for use. */ static void proc_ctor(void *mem, int size, void *arg) { struct proc *p; p = (struct proc *)mem; } /* * Reclaim a proc after use. */ static void proc_dtor(void *mem, int size, void *arg) { struct proc *p; struct thread *td; struct ksegrp *kg; struct kse *ke; /* INVARIANTS checks go here */ p = (struct proc *)mem; KASSERT((p->p_numthreads == 1), ("bad number of threads in exiting process")); td = FIRST_THREAD_IN_PROC(p); KASSERT((td != NULL), ("proc_dtor: bad thread pointer")); kg = FIRST_KSEGRP_IN_PROC(p); KASSERT((kg != NULL), ("proc_dtor: bad kg pointer")); ke = FIRST_KSE_IN_KSEGRP(kg); KASSERT((ke != NULL), ("proc_dtor: bad ke pointer")); /* Dispose of an alternate kstack, if it exists. * XXX What if there are more than one thread in the proc? * The first thread in the proc is special and not * freed, so you gotta do this here. */ if (((p->p_flag & P_KTHREAD) != 0) && (td->td_altkstack != 0)) vm_thread_dispose_altkstack(td); /* * We want to make sure we know the initial linkages. * so for now tear them down and remake them. * This is probably un-needed as we can probably rely * on the state coming in here from wait4(). 
*/ proc_linkup(p, kg, ke, td); } /* * Initialize type-stable parts of a proc (when newly created). */ static void proc_init(void *mem, int size) { struct proc *p; struct thread *td; struct ksegrp *kg; struct kse *ke; p = (struct proc *)mem; p->p_sched = (struct p_sched *)&p[1]; vm_proc_new(p); td = thread_alloc(); ke = kse_alloc(); kg = ksegrp_alloc(); proc_linkup(p, kg, ke, td); bzero(&p->p_mtx, sizeof(struct mtx)); mtx_init(&p->p_mtx, "process lock", NULL, MTX_DEF | MTX_DUPOK); } /* * Tear down type-stable parts of a proc (just before being discarded). * * mem is the struct proc being discarded; size is unused. Asserts that * the proc still has exactly one thread and a first ksegrp and kse, * then disposes of the proc via vm_proc_dispose(), frees that thread, * ksegrp and kse, and destroys p_mtx. */ static void proc_fini(void *mem, int size) { struct proc *p; struct thread *td; struct ksegrp *kg; struct kse *ke; p = (struct proc *)mem; KASSERT((p->p_numthreads == 1), ("bad number of threads in freeing process")); td = FIRST_THREAD_IN_PROC(p); KASSERT((td != NULL), ("proc_fini: bad thread pointer")); kg = FIRST_KSEGRP_IN_PROC(p); KASSERT((kg != NULL), ("proc_fini: bad kg pointer")); ke = FIRST_KSE_IN_KSEGRP(kg); KASSERT((ke != NULL), ("proc_fini: bad ke pointer")); vm_proc_dispose(p); thread_free(td); ksegrp_free(kg); kse_free(ke); mtx_destroy(&p->p_mtx); } /* * Is p an inferior of the current process? */ int inferior(p) register struct proc *p; { sx_assert(&proctree_lock, SX_LOCKED); for (; p != curproc; p = p->p_pptr) if (p->p_pid == 0) return (0); return (1); } /* * Locate a process by number */ struct proc * pfind(pid) register pid_t pid; { register struct proc *p; sx_slock(&allproc_lock); LIST_FOREACH(p, PIDHASH(pid), p_hash) if (p->p_pid == pid) { PROC_LOCK(p); break; } sx_sunlock(&allproc_lock); return (p); } /* * Locate a process group by number. * The caller must hold proctree_lock. */ struct pgrp * pgfind(pgid) register pid_t pgid; { register struct pgrp *pgrp; sx_assert(&proctree_lock, SX_LOCKED); LIST_FOREACH(pgrp, PGRPHASH(pgid), pg_hash) { if (pgrp->pg_id == pgid) { PGRP_LOCK(pgrp); return (pgrp); } } return (NULL); } /* * Create a new process group. * pgid must be equal to the pid of p. 
* Begin a new session if required. */ int enterpgrp(p, pgid, pgrp, sess) register struct proc *p; pid_t pgid; struct pgrp *pgrp; struct session *sess; { struct pgrp *pgrp2; sx_assert(&proctree_lock, SX_XLOCKED); KASSERT(pgrp != NULL, ("enterpgrp: pgrp == NULL")); KASSERT(p->p_pid == pgid, ("enterpgrp: new pgrp and pid != pgid")); pgrp2 = pgfind(pgid); KASSERT(pgrp2 == NULL, ("enterpgrp: pgrp with pgid exists")); KASSERT(!SESS_LEADER(p), ("enterpgrp: session leader attempted setpgrp")); mtx_init(&pgrp->pg_mtx, "process group", NULL, MTX_DEF | MTX_DUPOK); if (sess != NULL) { /* * new session */ mtx_init(&sess->s_mtx, "session", NULL, MTX_DEF); PROC_LOCK(p); p->p_flag &= ~P_CONTROLT; PROC_UNLOCK(p); PGRP_LOCK(pgrp); sess->s_leader = p; sess->s_sid = p->p_pid; sess->s_count = 1; sess->s_ttyvp = NULL; sess->s_ttyp = NULL; bcopy(p->p_session->s_login, sess->s_login, sizeof(sess->s_login)); pgrp->pg_session = sess; KASSERT(p == curproc, ("enterpgrp: mksession and p != curproc")); } else { pgrp->pg_session = p->p_session; SESS_LOCK(pgrp->pg_session); pgrp->pg_session->s_count++; SESS_UNLOCK(pgrp->pg_session); PGRP_LOCK(pgrp); } pgrp->pg_id = pgid; LIST_INIT(&pgrp->pg_members); /* * As we have an exclusive lock of proctree_lock, * this should not deadlock. 
*/ LIST_INSERT_HEAD(PGRPHASH(pgid), pgrp, pg_hash); pgrp->pg_jobc = 0; SLIST_INIT(&pgrp->pg_sigiolst); PGRP_UNLOCK(pgrp); doenterpgrp(p, pgrp); return (0); } /* * Move p to an existing process group */ int enterthispgrp(p, pgrp) register struct proc *p; struct pgrp *pgrp; { sx_assert(&proctree_lock, SX_XLOCKED); PROC_LOCK_ASSERT(p, MA_NOTOWNED); PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED); PGRP_LOCK_ASSERT(p->p_pgrp, MA_NOTOWNED); SESS_LOCK_ASSERT(p->p_session, MA_NOTOWNED); KASSERT(pgrp->pg_session == p->p_session, ("%s: pgrp's session %p, p->p_session %p.\n", __func__, pgrp->pg_session, p->p_session)); KASSERT(pgrp != p->p_pgrp, ("%s: p belongs to pgrp.", __func__)); doenterpgrp(p, pgrp); return (0); } /* * Move p to a process group */ static void doenterpgrp(p, pgrp) struct proc *p; struct pgrp *pgrp; { struct pgrp *savepgrp; sx_assert(&proctree_lock, SX_XLOCKED); PROC_LOCK_ASSERT(p, MA_NOTOWNED); PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED); PGRP_LOCK_ASSERT(p->p_pgrp, MA_NOTOWNED); SESS_LOCK_ASSERT(p->p_session, MA_NOTOWNED); savepgrp = p->p_pgrp; /* * Adjust eligibility of affected pgrps to participate in job control. * Increment eligibility counts before decrementing, otherwise we * could reach 0 spuriously during the first call. 
*/ fixjobc(p, pgrp, 1); fixjobc(p, p->p_pgrp, 0); PGRP_LOCK(pgrp); PGRP_LOCK(savepgrp); PROC_LOCK(p); LIST_REMOVE(p, p_pglist); p->p_pgrp = pgrp; PROC_UNLOCK(p); LIST_INSERT_HEAD(&pgrp->pg_members, p, p_pglist); PGRP_UNLOCK(savepgrp); PGRP_UNLOCK(pgrp); if (LIST_EMPTY(&savepgrp->pg_members)) pgdelete(savepgrp); } /* * remove process from process group */ int leavepgrp(p) register struct proc *p; { struct pgrp *savepgrp; sx_assert(&proctree_lock, SX_XLOCKED); savepgrp = p->p_pgrp; PGRP_LOCK(savepgrp); PROC_LOCK(p); LIST_REMOVE(p, p_pglist); p->p_pgrp = NULL; PROC_UNLOCK(p); PGRP_UNLOCK(savepgrp); if (LIST_EMPTY(&savepgrp->pg_members)) pgdelete(savepgrp); return (0); } /* * delete a process group */ static void pgdelete(pgrp) register struct pgrp *pgrp; { struct session *savesess; int i; sx_assert(&proctree_lock, SX_XLOCKED); PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED); SESS_LOCK_ASSERT(pgrp->pg_session, MA_NOTOWNED); /* * Reset any sigio structures pointing to us as a result of * F_SETOWN with our pgid. */ funsetownlst(&pgrp->pg_sigiolst); PGRP_LOCK(pgrp); if (pgrp->pg_session->s_ttyp != NULL && pgrp->pg_session->s_ttyp->t_pgrp == pgrp) pgrp->pg_session->s_ttyp->t_pgrp = NULL; LIST_REMOVE(pgrp, pg_hash); savesess = pgrp->pg_session; SESS_LOCK(savesess); i = --savesess->s_count; SESS_UNLOCK(savesess); PGRP_UNLOCK(pgrp); if (i == 0) { if (savesess->s_ttyp != NULL) ttyrel(savesess->s_ttyp); mtx_destroy(&savesess->s_mtx); FREE(savesess, M_SESSION); } mtx_destroy(&pgrp->pg_mtx); FREE(pgrp, M_PGRP); } static void pgadjustjobc(pgrp, entering) struct pgrp *pgrp; int entering; { PGRP_LOCK(pgrp); if (entering) pgrp->pg_jobc++; else { --pgrp->pg_jobc; if (pgrp->pg_jobc == 0) orphanpg(pgrp); } PGRP_UNLOCK(pgrp); } /* * Adjust pgrp jobc counters when specified process changes process group. * We count the number of processes in each process group that "qualify" * the group for terminal job control (those with a parent in a different * process group of the same session). 
If that count reaches zero, the * process group becomes orphaned. Check both the specified process' * process group and that of its children. * entering == 0 => p is leaving specified group. * entering == 1 => p is entering specified group. */ void fixjobc(p, pgrp, entering) register struct proc *p; register struct pgrp *pgrp; int entering; { register struct pgrp *hispgrp; register struct session *mysession; sx_assert(&proctree_lock, SX_LOCKED); PROC_LOCK_ASSERT(p, MA_NOTOWNED); PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED); SESS_LOCK_ASSERT(pgrp->pg_session, MA_NOTOWNED); /* * Check p's parent to see whether p qualifies its own process * group; if so, adjust count for p's process group. */ mysession = pgrp->pg_session; if ((hispgrp = p->p_pptr->p_pgrp) != pgrp && hispgrp->pg_session == mysession) pgadjustjobc(pgrp, entering); /* * Check this process' children to see whether they qualify * their process groups; if so, adjust counts for children's * process groups. */ LIST_FOREACH(p, &p->p_children, p_sibling) { hispgrp = p->p_pgrp; if (hispgrp == pgrp || hispgrp->pg_session != mysession) continue; PROC_LOCK(p); if (p->p_state == PRS_ZOMBIE) { PROC_UNLOCK(p); continue; } PROC_UNLOCK(p); pgadjustjobc(hispgrp, entering); } } /* * A process group has become orphaned; * if there are any stopped processes in the group, * hang up all processes in that group. 
*/ static void orphanpg(pg) struct pgrp *pg; { register struct proc *p; PGRP_LOCK_ASSERT(pg, MA_OWNED); LIST_FOREACH(p, &pg->pg_members, p_pglist) { PROC_LOCK(p); if (P_SHOULDSTOP(p)) { PROC_UNLOCK(p); LIST_FOREACH(p, &pg->pg_members, p_pglist) { PROC_LOCK(p); psignal(p, SIGHUP); psignal(p, SIGCONT); PROC_UNLOCK(p); } return; } PROC_UNLOCK(p); } } #include "opt_ddb.h" #ifdef DDB #include DB_SHOW_COMMAND(pgrpdump, pgrpdump) { register struct pgrp *pgrp; register struct proc *p; register int i; for (i = 0; i <= pgrphash; i++) { if (!LIST_EMPTY(&pgrphashtbl[i])) { printf("\tindx %d\n", i); LIST_FOREACH(pgrp, &pgrphashtbl[i], pg_hash) { printf( "\tpgrp %p, pgid %ld, sess %p, sesscnt %d, mem %p\n", (void *)pgrp, (long)pgrp->pg_id, (void *)pgrp->pg_session, pgrp->pg_session->s_count, (void *)LIST_FIRST(&pgrp->pg_members)); LIST_FOREACH(p, &pgrp->pg_members, p_pglist) { printf("\t\tpid %ld addr %p pgrp %p\n", (long)p->p_pid, (void *)p, (void *)p->p_pgrp); } } } } } #endif /* DDB */ void fill_kinfo_thread(struct thread *td, struct kinfo_proc *kp); /* * Fill in a kinfo_proc structure for the specified process. * Must be called with the target process locked. 
*/ void fill_kinfo_proc(struct proc *p, struct kinfo_proc *kp) { fill_kinfo_thread(FIRST_THREAD_IN_PROC(p), kp); } void fill_kinfo_thread(struct thread *td, struct kinfo_proc *kp) { struct proc *p; struct thread *td0; struct kse *ke; struct ksegrp *kg; struct tty *tp; struct session *sp; struct timeval tv; struct sigacts *ps; p = td->td_proc; bzero(kp, sizeof(*kp)); kp->ki_structsize = sizeof(*kp); kp->ki_paddr = p; PROC_LOCK_ASSERT(p, MA_OWNED); kp->ki_addr =/* p->p_addr; */0; /* XXXKSE */ kp->ki_args = p->p_args; kp->ki_textvp = p->p_textvp; #ifdef KTRACE kp->ki_tracep = p->p_tracevp; mtx_lock(&ktrace_mtx); kp->ki_traceflag = p->p_traceflag; mtx_unlock(&ktrace_mtx); #endif kp->ki_fd = p->p_fd; kp->ki_vmspace = p->p_vmspace; if (p->p_ucred) { kp->ki_uid = p->p_ucred->cr_uid; kp->ki_ruid = p->p_ucred->cr_ruid; kp->ki_svuid = p->p_ucred->cr_svuid; /* XXX bde doesn't like KI_NGROUPS */ kp->ki_ngroups = min(p->p_ucred->cr_ngroups, KI_NGROUPS); bcopy(p->p_ucred->cr_groups, kp->ki_groups, kp->ki_ngroups * sizeof(gid_t)); kp->ki_rgid = p->p_ucred->cr_rgid; kp->ki_svgid = p->p_ucred->cr_svgid; } if (p->p_sigacts) { ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); kp->ki_sigignore = ps->ps_sigignore; kp->ki_sigcatch = ps->ps_sigcatch; mtx_unlock(&ps->ps_mtx); } mtx_lock_spin(&sched_lock); if (p->p_state != PRS_NEW && p->p_state != PRS_ZOMBIE && p->p_vmspace != NULL) { struct vmspace *vm = p->p_vmspace; kp->ki_size = vm->vm_map.size; kp->ki_rssize = vmspace_resident_count(vm); /*XXX*/ if (p->p_sflag & PS_INMEM) kp->ki_rssize += UAREA_PAGES; FOREACH_THREAD_IN_PROC(p, td0) { if (!TD_IS_SWAPPED(td0)) kp->ki_rssize += td0->td_kstack_pages; if (td0->td_altkstack_obj != NULL) kp->ki_rssize += td0->td_altkstack_pages; } kp->ki_swrss = vm->vm_swrss; kp->ki_tsize = vm->vm_tsize; kp->ki_dsize = vm->vm_dsize; kp->ki_ssize = vm->vm_ssize; } if ((p->p_sflag & PS_INMEM) && p->p_stats) { kp->ki_start = p->p_stats->p_start; timevaladd(&kp->ki_start, &boottime); kp->ki_rusage = p->p_stats->p_ru; 
kp->ki_childtime.tv_sec = p->p_stats->p_cru.ru_utime.tv_sec + p->p_stats->p_cru.ru_stime.tv_sec; kp->ki_childtime.tv_usec = p->p_stats->p_cru.ru_utime.tv_usec + p->p_stats->p_cru.ru_stime.tv_usec; } if (p->p_state != PRS_ZOMBIE) { #if 0 if (td == NULL) { /* XXXKSE: This should never happen. */ printf("fill_kinfo_proc(): pid %d has no threads!\n", p->p_pid); mtx_unlock_spin(&sched_lock); return; } #endif if (td->td_wmesg != NULL) { strlcpy(kp->ki_wmesg, td->td_wmesg, sizeof(kp->ki_wmesg)); } if (TD_ON_LOCK(td)) { kp->ki_kiflag |= KI_LOCKBLOCK; strlcpy(kp->ki_lockname, td->td_lockname, sizeof(kp->ki_lockname)); } if (p->p_state == PRS_NORMAL) { /* XXXKSE very approximate */ if (TD_ON_RUNQ(td) || TD_CAN_RUN(td) || TD_IS_RUNNING(td)) { kp->ki_stat = SRUN; } else if (P_SHOULDSTOP(p)) { kp->ki_stat = SSTOP; } else if (TD_IS_SLEEPING(td)) { kp->ki_stat = SSLEEP; } else if (TD_ON_LOCK(td)) { kp->ki_stat = SLOCK; } else { kp->ki_stat = SWAIT; } } else { kp->ki_stat = SIDL; } kp->ki_sflag = p->p_sflag; kp->ki_swtime = p->p_swtime; kp->ki_pid = p->p_pid; + kp->ki_nice = p->p_nice; kg = td->td_ksegrp; ke = td->td_kse; bintime2timeval(&p->p_runtime, &tv); kp->ki_runtime = tv.tv_sec * (u_int64_t)1000000 + tv.tv_usec; /* things in the KSE GROUP */ kp->ki_estcpu = kg->kg_estcpu; kp->ki_slptime = kg->kg_slptime; kp->ki_pri.pri_user = kg->kg_user_pri; kp->ki_pri.pri_class = kg->kg_pri_class; - kp->ki_nice = kg->kg_nice; /* Things in the thread */ kp->ki_wchan = td->td_wchan; kp->ki_pri.pri_level = td->td_priority; kp->ki_pri.pri_native = td->td_base_pri; kp->ki_lastcpu = td->td_lastcpu; kp->ki_oncpu = td->td_oncpu; kp->ki_tdflags = td->td_flags; kp->ki_pcb = td->td_pcb; kp->ki_kstack = (void *)td->td_kstack; kp->ki_pctcpu = sched_pctcpu(td); /* Things in the kse */ if (ke) kp->ki_rqindex = ke->ke_rqindex; else kp->ki_rqindex = 0; } else { kp->ki_stat = SZOMB; } mtx_unlock_spin(&sched_lock); sp = NULL; tp = NULL; if (p->p_pgrp) { kp->ki_pgid = p->p_pgrp->pg_id; kp->ki_jobc = 
p->p_pgrp->pg_jobc; sp = p->p_pgrp->pg_session; if (sp != NULL) { kp->ki_sid = sp->s_sid; SESS_LOCK(sp); strlcpy(kp->ki_login, sp->s_login, sizeof(kp->ki_login)); if (sp->s_ttyvp) kp->ki_kiflag |= KI_CTTY; if (SESS_LEADER(p)) kp->ki_kiflag |= KI_SLEADER; tp = sp->s_ttyp; SESS_UNLOCK(sp); } } if ((p->p_flag & P_CONTROLT) && tp != NULL) { kp->ki_tdev = dev2udev(tp->t_dev); kp->ki_tpgid = tp->t_pgrp ? tp->t_pgrp->pg_id : NO_PID; if (tp->t_session) kp->ki_tsid = tp->t_session->s_sid; } else kp->ki_tdev = NOUDEV; if (p->p_comm[0] != '\0') { strlcpy(kp->ki_comm, p->p_comm, sizeof(kp->ki_comm)); strlcpy(kp->ki_ocomm, p->p_comm, sizeof(kp->ki_ocomm)); } kp->ki_siglist = p->p_siglist; SIGSETOR(kp->ki_siglist, td->td_siglist); kp->ki_sigmask = td->td_sigmask; kp->ki_xstat = p->p_xstat; kp->ki_acflag = p->p_acflag; kp->ki_flag = p->p_flag; /* If jailed(p->p_ucred), emulate the old P_JAILED flag. */ if (jailed(p->p_ucred)) kp->ki_flag |= P_JAILED; kp->ki_lock = p->p_lock; if (p->p_pptr) kp->ki_ppid = p->p_pptr->p_pid; } /* * Locate a zombie process by number */ struct proc * zpfind(pid_t pid) { struct proc *p; sx_slock(&allproc_lock); LIST_FOREACH(p, &zombproc, p_list) if (p->p_pid == pid) { PROC_LOCK(p); break; } sx_sunlock(&allproc_lock); return (p); } #define KERN_PROC_ZOMBMASK 0x3 #define KERN_PROC_NOTHREADS 0x4 /* * Must be called with the process locked and will return with it unlocked. 
*/ static int sysctl_out_proc(struct proc *p, struct sysctl_req *req, int flags) { struct thread *td; struct kinfo_proc kinfo_proc; int error = 0; struct proc *np; pid_t pid = p->p_pid; PROC_LOCK_ASSERT(p, MA_OWNED); if (flags & KERN_PROC_NOTHREADS) { fill_kinfo_proc(p, &kinfo_proc); PROC_UNLOCK(p); error = SYSCTL_OUT(req, (caddr_t)&kinfo_proc, sizeof(kinfo_proc)); PROC_LOCK(p); } else { _PHOLD(p); FOREACH_THREAD_IN_PROC(p, td) { fill_kinfo_thread(td, &kinfo_proc); PROC_UNLOCK(p); error = SYSCTL_OUT(req, (caddr_t)&kinfo_proc, sizeof(kinfo_proc)); PROC_LOCK(p); if (error) break; } _PRELE(p); } PROC_UNLOCK(p); if (error) return (error); if (flags & KERN_PROC_ZOMBMASK) np = zpfind(pid); else { if (pid == 0) return (0); np = pfind(pid); } if (np == NULL) return EAGAIN; if (np != p) { PROC_UNLOCK(np); return EAGAIN; } PROC_UNLOCK(np); return (0); } static int sysctl_kern_proc(SYSCTL_HANDLER_ARGS) { int *name = (int*) arg1; u_int namelen = arg2; struct proc *p; int flags, doingzomb, oid_number; int error = 0; oid_number = oidp->oid_number; if (oid_number != KERN_PROC_ALL && (oid_number & KERN_PROC_INC_THREAD) == 0) flags = KERN_PROC_NOTHREADS; else { flags = 0; oid_number &= ~KERN_PROC_INC_THREAD; } if (oid_number == KERN_PROC_PID) { if (namelen != 1) return (EINVAL); p = pfind((pid_t)name[0]); if (!p) return (ESRCH); if ((error = p_cansee(curthread, p))) { PROC_UNLOCK(p); return (error); } error = sysctl_out_proc(p, req, flags); return (error); } switch (oid_number) { case KERN_PROC_ALL: if (namelen != 0) return (EINVAL); break; case KERN_PROC_PROC: if (namelen != 0 && namelen != 1) return (EINVAL); break; default: if (namelen != 1) return (EINVAL); break; } if (!req->oldptr) { /* overestimate by 5 procs */ error = SYSCTL_OUT(req, 0, sizeof (struct kinfo_proc) * 5); if (error) return (error); } error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sx_slock(&allproc_lock); for (doingzomb=0 ; doingzomb < 2 ; doingzomb++) { if (!doingzomb) p = 
LIST_FIRST(&allproc); else p = LIST_FIRST(&zombproc); for (; p != 0; p = LIST_NEXT(p, p_list)) { /* * Skip embryonic processes. */ mtx_lock_spin(&sched_lock); if (p->p_state == PRS_NEW) { mtx_unlock_spin(&sched_lock); continue; } mtx_unlock_spin(&sched_lock); PROC_LOCK(p); /* * Show a user only appropriate processes. */ if (p_cansee(curthread, p)) { PROC_UNLOCK(p); continue; } /* * TODO - make more efficient (see notes below). * do by session. */ switch (oid_number) { case KERN_PROC_PGRP: /* could do this by traversing pgrp */ if (p->p_pgrp == NULL || p->p_pgrp->pg_id != (pid_t)name[0]) { PROC_UNLOCK(p); continue; } break; case KERN_PROC_RGID: if (p->p_ucred == NULL || p->p_ucred->cr_rgid != (gid_t)name[0]) { PROC_UNLOCK(p); continue; } break; case KERN_PROC_SESSION: if (p->p_session == NULL || p->p_session->s_sid != (pid_t)name[0]) { PROC_UNLOCK(p); continue; } break; case KERN_PROC_TTY: if ((p->p_flag & P_CONTROLT) == 0 || p->p_session == NULL) { PROC_UNLOCK(p); continue; } SESS_LOCK(p->p_session); if (p->p_session->s_ttyp == NULL || dev2udev(p->p_session->s_ttyp->t_dev) != (udev_t)name[0]) { SESS_UNLOCK(p->p_session); PROC_UNLOCK(p); continue; } SESS_UNLOCK(p->p_session); break; case KERN_PROC_UID: if (p->p_ucred == NULL || p->p_ucred->cr_uid != (uid_t)name[0]) { PROC_UNLOCK(p); continue; } break; case KERN_PROC_RUID: if (p->p_ucred == NULL || p->p_ucred->cr_ruid != (uid_t)name[0]) { PROC_UNLOCK(p); continue; } break; case KERN_PROC_PROC: break; default: break; } error = sysctl_out_proc(p, req, flags | doingzomb); if (error) { sx_sunlock(&allproc_lock); return (error); } } } sx_sunlock(&allproc_lock); return (0); } struct pargs * pargs_alloc(int len) { struct pargs *pa; MALLOC(pa, struct pargs *, sizeof(struct pargs) + len, M_PARGS, M_WAITOK); pa->ar_ref = 1; pa->ar_length = len; return (pa); } void pargs_free(struct pargs *pa) { FREE(pa, M_PARGS); } void pargs_hold(struct pargs *pa) { if (pa == NULL) return; PARGS_LOCK(pa); pa->ar_ref++; PARGS_UNLOCK(pa); } 
void pargs_drop(struct pargs *pa) { if (pa == NULL) return; PARGS_LOCK(pa); if (--pa->ar_ref == 0) { PARGS_UNLOCK(pa); pargs_free(pa); } else PARGS_UNLOCK(pa); } /* * This sysctl allows a process to retrieve the argument list or process * title for another process without groping around in the address space * of the other process. It also allow a process to set its own "process * title to a string of its own choice. */ static int sysctl_kern_proc_args(SYSCTL_HANDLER_ARGS) { int *name = (int*) arg1; u_int namelen = arg2; struct pargs *newpa, *pa; struct proc *p; int error = 0; if (namelen != 1) return (EINVAL); p = pfind((pid_t)name[0]); if (!p) return (ESRCH); if ((error = p_cansee(curthread, p)) != 0) { PROC_UNLOCK(p); return (error); } if (req->newptr && curproc != p) { PROC_UNLOCK(p); return (EPERM); } pa = p->p_args; pargs_hold(pa); PROC_UNLOCK(p); if (req->oldptr != NULL && pa != NULL) error = SYSCTL_OUT(req, pa->ar_args, pa->ar_length); pargs_drop(pa); if (error != 0 || req->newptr == NULL) return (error); if (req->newlen + sizeof(struct pargs) > ps_arg_cache_limit) return (ENOMEM); newpa = pargs_alloc(req->newlen); error = SYSCTL_IN(req, newpa->ar_args, req->newlen); if (error != 0) { pargs_free(newpa); return (error); } PROC_LOCK(p); pa = p->p_args; p->p_args = newpa; PROC_UNLOCK(p); pargs_drop(pa); return (0); } static int sysctl_kern_proc_sv_name(SYSCTL_HANDLER_ARGS) { struct proc *p; char *sv_name; int *name; int namelen; int error; namelen = arg2; if (namelen != 1) return (EINVAL); name = (int *)arg1; if ((p = pfind((pid_t)name[0])) == NULL) return (ESRCH); if ((error = p_cansee(curthread, p))) { PROC_UNLOCK(p); return (error); } sv_name = p->p_sysent->sv_name; PROC_UNLOCK(p); return (sysctl_handle_string(oidp, sv_name, 0, req)); } SYSCTL_NODE(_kern, KERN_PROC, proc, CTLFLAG_RD, 0, "Process table"); SYSCTL_PROC(_kern_proc, KERN_PROC_ALL, all, CTLFLAG_RD|CTLTYPE_STRUCT, 0, 0, sysctl_kern_proc, "S,proc", "Return entire process table"); 
SYSCTL_NODE(_kern_proc, KERN_PROC_PGRP, pgrp, CTLFLAG_RD, sysctl_kern_proc, "Process table"); SYSCTL_NODE(_kern_proc, KERN_PROC_RGID, rgid, CTLFLAG_RD, sysctl_kern_proc, "Process table"); SYSCTL_NODE(_kern_proc, KERN_PROC_SESSION, sid, CTLFLAG_RD, sysctl_kern_proc, "Process table"); SYSCTL_NODE(_kern_proc, KERN_PROC_TTY, tty, CTLFLAG_RD, sysctl_kern_proc, "Process table"); SYSCTL_NODE(_kern_proc, KERN_PROC_UID, uid, CTLFLAG_RD, sysctl_kern_proc, "Process table"); SYSCTL_NODE(_kern_proc, KERN_PROC_RUID, ruid, CTLFLAG_RD, sysctl_kern_proc, "Process table"); SYSCTL_NODE(_kern_proc, KERN_PROC_PID, pid, CTLFLAG_RD, sysctl_kern_proc, "Process table"); SYSCTL_NODE(_kern_proc, KERN_PROC_PROC, proc, CTLFLAG_RD, sysctl_kern_proc, "Return process table, no threads"); SYSCTL_NODE(_kern_proc, KERN_PROC_ARGS, args, CTLFLAG_RW | CTLFLAG_ANYBODY, sysctl_kern_proc_args, "Process argument list"); SYSCTL_NODE(_kern_proc, KERN_PROC_SV_NAME, sv_name, CTLFLAG_RD, sysctl_kern_proc_sv_name, "Process syscall vector name (ABI type)"); SYSCTL_NODE(_kern_proc, (KERN_PROC_PGRP | KERN_PROC_INC_THREAD), pgrp_td, CTLFLAG_RD, sysctl_kern_proc, "Process table"); SYSCTL_NODE(_kern_proc, (KERN_PROC_RGID | KERN_PROC_INC_THREAD), rgid_td, CTLFLAG_RD, sysctl_kern_proc, "Process table"); SYSCTL_NODE(_kern_proc, (KERN_PROC_SESSION | KERN_PROC_INC_THREAD), sid_td, CTLFLAG_RD, sysctl_kern_proc, "Process table"); SYSCTL_NODE(_kern_proc, (KERN_PROC_TTY | KERN_PROC_INC_THREAD), tty_td, CTLFLAG_RD, sysctl_kern_proc, "Process table"); SYSCTL_NODE(_kern_proc, (KERN_PROC_UID | KERN_PROC_INC_THREAD), uid_td, CTLFLAG_RD, sysctl_kern_proc, "Process table"); SYSCTL_NODE(_kern_proc, (KERN_PROC_RUID | KERN_PROC_INC_THREAD), ruid_td, CTLFLAG_RD, sysctl_kern_proc, "Process table"); SYSCTL_NODE(_kern_proc, (KERN_PROC_PID | KERN_PROC_INC_THREAD), pid_td, CTLFLAG_RD, sysctl_kern_proc, "Process table"); SYSCTL_NODE(_kern_proc, (KERN_PROC_PROC | KERN_PROC_INC_THREAD), proc_td, CTLFLAG_RD, sysctl_kern_proc, "Return process 
table, no threads"); Index: head/sys/kern/kern_resource.c =================================================================== --- head/sys/kern/kern_resource.c (revision 130550) +++ head/sys/kern/kern_resource.c (revision 130551) @@ -1,1156 +1,1132 @@ /*- * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_resource.c 8.5 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int donice(struct thread *td, struct proc *chgp, int n); static MALLOC_DEFINE(M_PLIMIT, "plimit", "plimit structures"); static MALLOC_DEFINE(M_UIDINFO, "uidinfo", "uidinfo structures"); #define UIHASH(uid) (&uihashtbl[(uid) & uihash]) static struct mtx uihashtbl_mtx; static LIST_HEAD(uihashhead, uidinfo) *uihashtbl; static u_long uihash; /* size of hash table - 1 */ static struct uidinfo *uilookup(uid_t uid); /* * Resource controls and accounting. 
*/ #ifndef _SYS_SYSPROTO_H_ struct getpriority_args { int which; int who; }; #endif /* * MPSAFE */ int getpriority(td, uap) struct thread *td; register struct getpriority_args *uap; { - struct ksegrp *kg; struct proc *p; int error, low; error = 0; low = PRIO_MAX + 1; switch (uap->which) { case PRIO_PROCESS: if (uap->who == 0) - low = td->td_ksegrp->kg_nice; + low = td->td_proc->p_nice; else { p = pfind(uap->who); if (p == NULL) break; if (p_cansee(td, p) == 0) { - FOREACH_KSEGRP_IN_PROC(p, kg) { - if (kg->kg_nice < low) - low = kg->kg_nice; - } + low = p->p_nice; } PROC_UNLOCK(p); } break; case PRIO_PGRP: { register struct pgrp *pg; sx_slock(&proctree_lock); if (uap->who == 0) { pg = td->td_proc->p_pgrp; PGRP_LOCK(pg); } else { pg = pgfind(uap->who); if (pg == NULL) { sx_sunlock(&proctree_lock); break; } } sx_sunlock(&proctree_lock); LIST_FOREACH(p, &pg->pg_members, p_pglist) { PROC_LOCK(p); if (!p_cansee(td, p)) { - FOREACH_KSEGRP_IN_PROC(p, kg) { - if (kg->kg_nice < low) - low = kg->kg_nice; - } + if (p->p_nice < low) + low = p->p_nice; } PROC_UNLOCK(p); } PGRP_UNLOCK(pg); break; } case PRIO_USER: if (uap->who == 0) uap->who = td->td_ucred->cr_uid; sx_slock(&allproc_lock); LIST_FOREACH(p, &allproc, p_list) { PROC_LOCK(p); if (!p_cansee(td, p) && p->p_ucred->cr_uid == uap->who) { - FOREACH_KSEGRP_IN_PROC(p, kg) { - if (kg->kg_nice < low) - low = kg->kg_nice; - } + if (p->p_nice < low) + low = p->p_nice; } PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); break; default: error = EINVAL; break; } if (low == PRIO_MAX + 1 && error == 0) error = ESRCH; td->td_retval[0] = low; return (error); } #ifndef _SYS_SYSPROTO_H_ struct setpriority_args { int which; int who; int prio; }; #endif /* * MPSAFE */ int setpriority(td, uap) struct thread *td; register struct setpriority_args *uap; { struct proc *curp; register struct proc *p; int found = 0, error = 0; curp = td->td_proc; switch (uap->which) { case PRIO_PROCESS: if (uap->who == 0) { PROC_LOCK(curp); error = donice(td, curp, 
uap->prio); PROC_UNLOCK(curp); } else { p = pfind(uap->who); if (p == 0) break; if (p_cansee(td, p) == 0) error = donice(td, p, uap->prio); PROC_UNLOCK(p); } found++; break; case PRIO_PGRP: { register struct pgrp *pg; sx_slock(&proctree_lock); if (uap->who == 0) { pg = curp->p_pgrp; PGRP_LOCK(pg); } else { pg = pgfind(uap->who); if (pg == NULL) { sx_sunlock(&proctree_lock); break; } } sx_sunlock(&proctree_lock); LIST_FOREACH(p, &pg->pg_members, p_pglist) { PROC_LOCK(p); if (!p_cansee(td, p)) { error = donice(td, p, uap->prio); found++; } PROC_UNLOCK(p); } PGRP_UNLOCK(pg); break; } case PRIO_USER: if (uap->who == 0) uap->who = td->td_ucred->cr_uid; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_ucred->cr_uid == uap->who && !p_cansee(td, p)) { error = donice(td, p, uap->prio); found++; } PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); break; default: error = EINVAL; break; } if (found == 0 && error == 0) error = ESRCH; return (error); } /* - * Set "nice" for a process. Doesn't really understand threaded processes - * well but does try. Has the unfortunate side effect of making all the NICE - * values for a process's ksegrps the same. This suggests that - * NICE values should be stored as a process nice and deltas for the ksegrps. - * (but not yet). + * Set "nice" for a (whole) process. */ static int donice(struct thread *td, struct proc *p, int n) { - struct ksegrp *kg; - int error, low; + int error; - low = PRIO_MAX + 1; PROC_LOCK_ASSERT(p, MA_OWNED); if ((error = p_cansched(td, p))) return (error); if (n > PRIO_MAX) n = PRIO_MAX; if (n < PRIO_MIN) n = PRIO_MIN; - /* - * Only allow nicing if to more than the lowest nice. 
- * E.g., for nices of 4,3,2 allow nice to 3 but not 1 - */ - FOREACH_KSEGRP_IN_PROC(p, kg) { - if (kg->kg_nice < low) - low = kg->kg_nice; - } - if (n < low && suser(td) != 0) + if (n < p->p_nice && suser(td) != 0) return (EACCES); mtx_lock_spin(&sched_lock); - FOREACH_KSEGRP_IN_PROC(p, kg) { - sched_nice(kg, n); - } + sched_nice(p, n); mtx_unlock_spin(&sched_lock); return (0); } /* * Set realtime priority * * MPSAFE */ #ifndef _SYS_SYSPROTO_H_ struct rtprio_args { int function; pid_t pid; struct rtprio *rtp; }; #endif int rtprio(td, uap) struct thread *td; /* curthread */ register struct rtprio_args *uap; { struct proc *curp; struct proc *p; struct ksegrp *kg; struct rtprio rtp; int cierror, error; /* Perform copyin before acquiring locks if needed. */ if (uap->function == RTP_SET) cierror = copyin(uap->rtp, &rtp, sizeof(struct rtprio)); else cierror = 0; curp = td->td_proc; if (uap->pid == 0) { p = curp; PROC_LOCK(p); } else { p = pfind(uap->pid); if (p == NULL) return (ESRCH); } switch (uap->function) { case RTP_LOOKUP: if ((error = p_cansee(td, p))) break; mtx_lock_spin(&sched_lock); /* * Return OUR priority if no pid specified, * or if one is, report the highest priority * in the process. There isn't much more you can do as * there is only room to return a single priority. * XXXKSE Maybe need a new interface to report * priorities of multiple system scope threads. * Note: specifying our own pid is not the same * as leaving it zero. 
*/ if (uap->pid == 0) { pri_to_rtp(td->td_ksegrp, &rtp); } else { struct rtprio rtp2; rtp.type = RTP_PRIO_IDLE; rtp.prio = RTP_PRIO_MAX; FOREACH_KSEGRP_IN_PROC(p, kg) { pri_to_rtp(kg, &rtp2); if ((rtp2.type < rtp.type) || ((rtp2.type == rtp.type) && (rtp2.prio < rtp.prio))) { rtp.type = rtp2.type; rtp.prio = rtp2.prio; } } } mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); return (copyout(&rtp, uap->rtp, sizeof(struct rtprio))); case RTP_SET: if ((error = p_cansched(td, p)) || (error = cierror)) break; /* disallow setting rtprio in most cases if not superuser */ if (suser(td) != 0) { /* can't set someone else's */ if (uap->pid) { error = EPERM; break; } /* can't set realtime priority */ /* * Realtime priority has to be restricted for reasons which should be * obvious. However, for idle priority, there is a potential for * system deadlock if an idleprio process gains a lock on a resource * that other processes need (and the idleprio process can't run * due to a CPU-bound normal process). Fix me! XXX */ #if 0 if (RTP_PRIO_IS_REALTIME(rtp.type)) #endif if (rtp.type != RTP_PRIO_NORMAL) { error = EPERM; break; } } mtx_lock_spin(&sched_lock); /* * If we are setting our own priority, set just our * KSEGRP but if we are doing another process, * do all the groups on that process. If we * specify our own pid we do the latter. 
*/ if (uap->pid == 0) { error = rtp_to_pri(&rtp, td->td_ksegrp); } else { FOREACH_KSEGRP_IN_PROC(p, kg) { if ((error = rtp_to_pri(&rtp, kg)) != 0) { break; } } } mtx_unlock_spin(&sched_lock); break; default: error = EINVAL; break; } PROC_UNLOCK(p); return (error); } int rtp_to_pri(struct rtprio *rtp, struct ksegrp *kg) { mtx_assert(&sched_lock, MA_OWNED); if (rtp->prio > RTP_PRIO_MAX) return (EINVAL); switch (RTP_PRIO_BASE(rtp->type)) { case RTP_PRIO_REALTIME: kg->kg_user_pri = PRI_MIN_REALTIME + rtp->prio; break; case RTP_PRIO_NORMAL: kg->kg_user_pri = PRI_MIN_TIMESHARE + rtp->prio; break; case RTP_PRIO_IDLE: kg->kg_user_pri = PRI_MIN_IDLE + rtp->prio; break; default: return (EINVAL); } sched_class(kg, rtp->type); if (curthread->td_ksegrp == kg) { curthread->td_base_pri = kg->kg_user_pri; sched_prio(curthread, kg->kg_user_pri); /* XXX dubious */ } return (0); } void pri_to_rtp(struct ksegrp *kg, struct rtprio *rtp) { mtx_assert(&sched_lock, MA_OWNED); switch (PRI_BASE(kg->kg_pri_class)) { case PRI_REALTIME: rtp->prio = kg->kg_user_pri - PRI_MIN_REALTIME; break; case PRI_TIMESHARE: rtp->prio = kg->kg_user_pri - PRI_MIN_TIMESHARE; break; case PRI_IDLE: rtp->prio = kg->kg_user_pri - PRI_MIN_IDLE; break; default: break; } rtp->type = kg->kg_pri_class; } #if defined(COMPAT_43) #ifndef _SYS_SYSPROTO_H_ struct osetrlimit_args { u_int which; struct orlimit *rlp; }; #endif /* * MPSAFE */ int osetrlimit(td, uap) struct thread *td; register struct osetrlimit_args *uap; { struct orlimit olim; struct rlimit lim; int error; if ((error = copyin(uap->rlp, &olim, sizeof(struct orlimit)))) return (error); lim.rlim_cur = olim.rlim_cur; lim.rlim_max = olim.rlim_max; error = kern_setrlimit(td, uap->which, &lim); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ogetrlimit_args { u_int which; struct orlimit *rlp; }; #endif /* * MPSAFE */ int ogetrlimit(td, uap) struct thread *td; register struct ogetrlimit_args *uap; { struct orlimit olim; struct rlimit rl; struct proc *p; int error; 
if (uap->which >= RLIM_NLIMITS) return (EINVAL); p = td->td_proc; PROC_LOCK(p); lim_rlimit(p, uap->which, &rl); PROC_UNLOCK(p); /* * XXX would be more correct to convert only RLIM_INFINITY to the * old RLIM_INFINITY and fail with EOVERFLOW for other larger * values. Most 64->32 and 32->16 conversions, including not * unimportant ones of uids are even more broken than what we * do here (they blindly truncate). We don't do this correctly * here since we have little experience with EOVERFLOW yet. * Elsewhere, getuid() can't fail... */ olim.rlim_cur = rl.rlim_cur > 0x7fffffff ? 0x7fffffff : rl.rlim_cur; olim.rlim_max = rl.rlim_max > 0x7fffffff ? 0x7fffffff : rl.rlim_max; error = copyout(&olim, uap->rlp, sizeof(olim)); return (error); } #endif /* COMPAT_43 */ #ifndef _SYS_SYSPROTO_H_ struct __setrlimit_args { u_int which; struct rlimit *rlp; }; #endif /* * MPSAFE */ int setrlimit(td, uap) struct thread *td; register struct __setrlimit_args *uap; { struct rlimit alim; int error; if ((error = copyin(uap->rlp, &alim, sizeof(struct rlimit)))) return (error); error = kern_setrlimit(td, uap->which, &alim); return (error); } int kern_setrlimit(td, which, limp) struct thread *td; u_int which; struct rlimit *limp; { struct plimit *newlim, *oldlim; struct proc *p; register struct rlimit *alimp; rlim_t oldssiz; int error; if (which >= RLIM_NLIMITS) return (EINVAL); /* * Preserve historical bugs by treating negative limits as unsigned. 
*/ if (limp->rlim_cur < 0) limp->rlim_cur = RLIM_INFINITY; if (limp->rlim_max < 0) limp->rlim_max = RLIM_INFINITY; oldssiz = 0; p = td->td_proc; newlim = lim_alloc(); PROC_LOCK(p); oldlim = p->p_limit; alimp = &oldlim->pl_rlimit[which]; if (limp->rlim_cur > alimp->rlim_max || limp->rlim_max > alimp->rlim_max) if ((error = suser_cred(td->td_ucred, PRISON_ROOT))) { PROC_UNLOCK(p); lim_free(newlim); return (error); } if (limp->rlim_cur > limp->rlim_max) limp->rlim_cur = limp->rlim_max; lim_copy(newlim, oldlim); alimp = &newlim->pl_rlimit[which]; switch (which) { case RLIMIT_CPU: mtx_lock_spin(&sched_lock); p->p_cpulimit = limp->rlim_cur; mtx_unlock_spin(&sched_lock); break; case RLIMIT_DATA: if (limp->rlim_cur > maxdsiz) limp->rlim_cur = maxdsiz; if (limp->rlim_max > maxdsiz) limp->rlim_max = maxdsiz; break; case RLIMIT_STACK: if (limp->rlim_cur > maxssiz) limp->rlim_cur = maxssiz; if (limp->rlim_max > maxssiz) limp->rlim_max = maxssiz; oldssiz = alimp->rlim_cur; break; case RLIMIT_NOFILE: if (limp->rlim_cur > maxfilesperproc) limp->rlim_cur = maxfilesperproc; if (limp->rlim_max > maxfilesperproc) limp->rlim_max = maxfilesperproc; break; case RLIMIT_NPROC: if (limp->rlim_cur > maxprocperuid) limp->rlim_cur = maxprocperuid; if (limp->rlim_max > maxprocperuid) limp->rlim_max = maxprocperuid; if (limp->rlim_cur < 1) limp->rlim_cur = 1; if (limp->rlim_max < 1) limp->rlim_max = 1; break; } *alimp = *limp; p->p_limit = newlim; PROC_UNLOCK(p); lim_free(oldlim); if (which == RLIMIT_STACK) { /* * Stack is allocated to the max at exec time with only * "rlim_cur" bytes accessible. If stack limit is going * up make more accessible, if going down make inaccessible. 
*/ if (limp->rlim_cur != oldssiz) { vm_offset_t addr; vm_size_t size; vm_prot_t prot; mtx_lock(&Giant); if (limp->rlim_cur > oldssiz) { prot = p->p_sysent->sv_stackprot; size = limp->rlim_cur - oldssiz; addr = p->p_sysent->sv_usrstack - limp->rlim_cur; } else { prot = VM_PROT_NONE; size = oldssiz - limp->rlim_cur; addr = p->p_sysent->sv_usrstack - oldssiz; } addr = trunc_page(addr); size = round_page(size); (void) vm_map_protect(&p->p_vmspace->vm_map, addr, addr+size, prot, FALSE); mtx_unlock(&Giant); } } return (0); } #ifndef _SYS_SYSPROTO_H_ struct __getrlimit_args { u_int which; struct rlimit *rlp; }; #endif /* * MPSAFE */ /* ARGSUSED */ int getrlimit(td, uap) struct thread *td; register struct __getrlimit_args *uap; { struct rlimit rlim; struct proc *p; int error; if (uap->which >= RLIM_NLIMITS) return (EINVAL); p = td->td_proc; PROC_LOCK(p); lim_rlimit(p, uap->which, &rlim); PROC_UNLOCK(p); error = copyout(&rlim, uap->rlp, sizeof(struct rlimit)); return(error); } /* * Transform the running time and tick information in proc p into user, * system, and interrupt time usage. */ void calcru(p, up, sp, ip) struct proc *p; struct timeval *up; struct timeval *sp; struct timeval *ip; { struct bintime bt; struct timeval tv; /* {user, system, interrupt, total} {ticks, usec}; previous tu: */ u_int64_t ut, uu, st, su, it, iu, tt, tu, ptu; mtx_assert(&sched_lock, MA_OWNED); /* XXX: why spl-protect ? worst case is an off-by-one report */ ut = p->p_uticks; st = p->p_sticks; it = p->p_iticks; tt = ut + st + it; if (tt == 0) { st = 1; tt = 1; } if (p == curthread->td_proc) { /* * Adjust for the current time slice. This is actually fairly * important since the error here is on the order of a time * quantum, which is much greater than the sampling error. * XXXKSE use a different test due to threads on other * processors also being 'current'. 
*/ binuptime(&bt); bintime_sub(&bt, PCPU_PTR(switchtime)); bintime_add(&bt, &p->p_runtime); } else bt = p->p_runtime; bintime2timeval(&bt, &tv); tu = (u_int64_t)tv.tv_sec * 1000000 + tv.tv_usec; ptu = p->p_uu + p->p_su + p->p_iu; if (tu < ptu || (int64_t)tu < 0) { printf("calcru: negative time of %jd usec for pid %d (%s)\n", (intmax_t)tu, p->p_pid, p->p_comm); tu = ptu; } /* Subdivide tu. */ uu = (tu * ut) / tt; su = (tu * st) / tt; iu = tu - uu - su; /* Enforce monotonicity. */ if (uu < p->p_uu || su < p->p_su || iu < p->p_iu) { if (uu < p->p_uu) uu = p->p_uu; else if (uu + p->p_su + p->p_iu > tu) uu = tu - p->p_su - p->p_iu; if (st == 0) su = p->p_su; else { su = ((tu - uu) * st) / (st + it); if (su < p->p_su) su = p->p_su; else if (uu + su + p->p_iu > tu) su = tu - uu - p->p_iu; } KASSERT(uu + su + p->p_iu <= tu, ("calcru: monotonisation botch 1")); iu = tu - uu - su; KASSERT(iu >= p->p_iu, ("calcru: monotonisation botch 2")); } p->p_uu = uu; p->p_su = su; p->p_iu = iu; up->tv_sec = uu / 1000000; up->tv_usec = uu % 1000000; sp->tv_sec = su / 1000000; sp->tv_usec = su % 1000000; if (ip != NULL) { ip->tv_sec = iu / 1000000; ip->tv_usec = iu % 1000000; } } #ifndef _SYS_SYSPROTO_H_ struct getrusage_args { int who; struct rusage *rusage; }; #endif /* * MPSAFE */ /* ARGSUSED */ int getrusage(td, uap) register struct thread *td; register struct getrusage_args *uap; { struct rusage ru; struct proc *p; p = td->td_proc; switch (uap->who) { case RUSAGE_SELF: mtx_lock(&Giant); mtx_lock_spin(&sched_lock); calcru(p, &p->p_stats->p_ru.ru_utime, &p->p_stats->p_ru.ru_stime, NULL); mtx_unlock_spin(&sched_lock); ru = p->p_stats->p_ru; mtx_unlock(&Giant); break; case RUSAGE_CHILDREN: mtx_lock(&Giant); ru = p->p_stats->p_cru; mtx_unlock(&Giant); break; default: return (EINVAL); break; } return (copyout(&ru, uap->rusage, sizeof(struct rusage))); } void ruadd(ru, ru2) register struct rusage *ru, *ru2; { register long *ip, *ip2; register int i; timevaladd(&ru->ru_utime, 
&ru2->ru_utime); timevaladd(&ru->ru_stime, &ru2->ru_stime); if (ru->ru_maxrss < ru2->ru_maxrss) ru->ru_maxrss = ru2->ru_maxrss; ip = &ru->ru_first; ip2 = &ru2->ru_first; for (i = &ru->ru_last - &ru->ru_first; i >= 0; i--) *ip++ += *ip2++; } /* * Allocate a new resource limits structure and initialize its * reference count and mutex pointer. */ struct plimit * lim_alloc() { struct plimit *limp; limp = (struct plimit *)malloc(sizeof(struct plimit), M_PLIMIT, M_WAITOK); limp->pl_refcnt = 1; limp->pl_mtx = mtx_pool_alloc(mtxpool_sleep); return (limp); } struct plimit * lim_hold(limp) struct plimit *limp; { LIM_LOCK(limp); limp->pl_refcnt++; LIM_UNLOCK(limp); return (limp); } void lim_free(limp) struct plimit *limp; { LIM_LOCK(limp); KASSERT(limp->pl_refcnt > 0, ("plimit refcnt underflow")); if (--limp->pl_refcnt == 0) { LIM_UNLOCK(limp); free((void *)limp, M_PLIMIT); return; } LIM_UNLOCK(limp); } /* * Make a copy of the plimit structure. * We share these structures copy-on-write after fork. */ void lim_copy(dst, src) struct plimit *dst, *src; { KASSERT(dst->pl_refcnt == 1, ("lim_copy to shared limit")); bcopy(src->pl_rlimit, dst->pl_rlimit, sizeof(src->pl_rlimit)); } /* * Return the hard limit for a particular system resource. The * which parameter specifies the index into the rlimit array. */ rlim_t lim_max(struct proc *p, int which) { struct rlimit rl; lim_rlimit(p, which, &rl); return (rl.rlim_max); } /* * Return the current (soft) limit for a particular system resource. * The which parameter which specifies the index into the rlimit array */ rlim_t lim_cur(struct proc *p, int which) { struct rlimit rl; lim_rlimit(p, which, &rl); return (rl.rlim_cur); } /* * Return a copy of the entire rlimit structure for the system limit * specified by 'which' in the rlimit structure pointed to by 'rlp'. 
*/ void lim_rlimit(struct proc *p, int which, struct rlimit *rlp) { PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(which >= 0 && which < RLIM_NLIMITS, ("request for invalid resource limit")); *rlp = p->p_limit->pl_rlimit[which]; } /* * Find the uidinfo structure for a uid. This structure is used to * track the total resource consumption (process count, socket buffer * size, etc.) for the uid and impose limits. */ void uihashinit() { uihashtbl = hashinit(maxproc / 16, M_UIDINFO, &uihash); mtx_init(&uihashtbl_mtx, "uidinfo hash", NULL, MTX_DEF); } /* * Look up a uidinfo struct for the parameter uid. * uihashtbl_mtx must be locked. */ static struct uidinfo * uilookup(uid) uid_t uid; { struct uihashhead *uipp; struct uidinfo *uip; mtx_assert(&uihashtbl_mtx, MA_OWNED); uipp = UIHASH(uid); LIST_FOREACH(uip, uipp, ui_hash) if (uip->ui_uid == uid) break; return (uip); } /* * Find or allocate a struct uidinfo for a particular uid. * Increase refcount on uidinfo struct returned. * uifree() should be called on a struct uidinfo when released. */ struct uidinfo * uifind(uid) uid_t uid; { struct uidinfo *old_uip, *uip; mtx_lock(&uihashtbl_mtx); uip = uilookup(uid); if (uip == NULL) { mtx_unlock(&uihashtbl_mtx); uip = malloc(sizeof(*uip), M_UIDINFO, M_WAITOK | M_ZERO); mtx_lock(&uihashtbl_mtx); /* * There's a chance someone created our uidinfo while we * were in malloc and not holding the lock, so we have to * make sure we don't insert a duplicate uidinfo. */ if ((old_uip = uilookup(uid)) != NULL) { /* Someone else beat us to it. */ free(uip, M_UIDINFO); uip = old_uip; } else { uip->ui_mtxp = mtx_pool_alloc(mtxpool_sleep); uip->ui_uid = uid; LIST_INSERT_HEAD(UIHASH(uid), uip, ui_hash); } } uihold(uip); mtx_unlock(&uihashtbl_mtx); return (uip); } /* * Place another refcount on a uidinfo struct. 
*/ void uihold(uip) struct uidinfo *uip; { UIDINFO_LOCK(uip); uip->ui_ref++; UIDINFO_UNLOCK(uip); } /*- * Since uidinfo structs have a long lifetime, we use an * opportunistic refcounting scheme to avoid locking the lookup hash * for each release. * * If the refcount hits 0, we need to free the structure, * which means we need to lock the hash. * Optimal case: * After locking the struct and lowering the refcount, if we find * that we don't need to free, simply unlock and return. * Suboptimal case: * If refcount lowering results in need to free, bump the count * back up, loose the lock and aquire the locks in the proper * order to try again. */ void uifree(uip) struct uidinfo *uip; { /* Prepare for optimal case. */ UIDINFO_LOCK(uip); if (--uip->ui_ref != 0) { UIDINFO_UNLOCK(uip); return; } /* Prepare for suboptimal case. */ uip->ui_ref++; UIDINFO_UNLOCK(uip); mtx_lock(&uihashtbl_mtx); UIDINFO_LOCK(uip); /* * We must subtract one from the count again because we backed out * our initial subtraction before dropping the lock. * Since another thread may have added a reference after we dropped the * initial lock we have to test for zero again. */ if (--uip->ui_ref == 0) { LIST_REMOVE(uip, ui_hash); mtx_unlock(&uihashtbl_mtx); if (uip->ui_sbsize != 0) printf("freeing uidinfo: uid = %d, sbsize = %jd\n", uip->ui_uid, (intmax_t)uip->ui_sbsize); if (uip->ui_proccnt != 0) printf("freeing uidinfo: uid = %d, proccnt = %ld\n", uip->ui_uid, uip->ui_proccnt); UIDINFO_UNLOCK(uip); FREE(uip, M_UIDINFO); return; } mtx_unlock(&uihashtbl_mtx); UIDINFO_UNLOCK(uip); } /* * Change the count associated with number of processes * a given user is using. When 'max' is 0, don't enforce a limit */ int chgproccnt(uip, diff, max) struct uidinfo *uip; int diff; int max; { UIDINFO_LOCK(uip); /* Don't allow them to exceed max, but allow subtraction. 
*/ if (diff > 0 && uip->ui_proccnt + diff > max && max != 0) { UIDINFO_UNLOCK(uip); return (0); } uip->ui_proccnt += diff; if (uip->ui_proccnt < 0) printf("negative proccnt for uid = %d\n", uip->ui_uid); UIDINFO_UNLOCK(uip); return (1); } /* * Change the total socket buffer size a user has used. */ int chgsbsize(uip, hiwat, to, max) struct uidinfo *uip; u_int *hiwat; u_int to; rlim_t max; { rlim_t new; int s; s = splnet(); UIDINFO_LOCK(uip); new = uip->ui_sbsize + to - *hiwat; /* Don't allow them to exceed max, but allow subtraction */ if (to > *hiwat && new > max) { splx(s); UIDINFO_UNLOCK(uip); return (0); } uip->ui_sbsize = new; *hiwat = to; if (uip->ui_sbsize < 0) printf("negative sbsize for uid = %d\n", uip->ui_uid); splx(s); UIDINFO_UNLOCK(uip); return (1); } Index: head/sys/kern/sched_4bsd.c =================================================================== --- head/sys/kern/sched_4bsd.c (revision 130550) +++ head/sys/kern/sched_4bsd.c (revision 130551) @@ -1,873 +1,876 @@ /*- * Copyright (c) 1982, 1986, 1990, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #define KTR_4BSD 0x0 /* * INVERSE_ESTCPU_WEIGHT is only suitable for statclock() frequencies in * the range 100-256 Hz (approximately). */ #define ESTCPULIM(e) \ min((e), INVERSE_ESTCPU_WEIGHT * (NICE_WEIGHT * (PRIO_MAX - PRIO_MIN) - \ RQ_PPQ) + INVERSE_ESTCPU_WEIGHT - 1) #ifdef SMP #define INVERSE_ESTCPU_WEIGHT (8 * smp_cpus) #else #define INVERSE_ESTCPU_WEIGHT 8 /* 1 / (priorities per estcpu level). */ #endif #define NICE_WEIGHT 1 /* Priorities per nice level. */ struct ke_sched { int ske_cpticks; /* (j) Ticks of cpu time. */ struct runq *ske_runq; /* runq the kse is currently on */ }; #define ke_runq ke_sched->ske_runq #define KEF_BOUND KEF_SCHED1 #define SKE_RUNQ_PCPU(ke) \ ((ke)->ke_runq != 0 && (ke)->ke_runq != &runq) /* * KSE_CAN_MIGRATE macro returns true if the kse can migrate between * cpus. 
*/ #define KSE_CAN_MIGRATE(ke) \ ((ke)->ke_thread->td_pinned == 0 && ((ke)->ke_flags & KEF_BOUND) == 0) static struct ke_sched ke_sched; struct ke_sched *kse0_sched = &ke_sched; struct kg_sched *ksegrp0_sched = NULL; struct p_sched *proc0_sched = NULL; struct td_sched *thread0_sched = NULL; static int sched_tdcnt; /* Total runnable threads in the system. */ static int sched_quantum; /* Roundrobin scheduling quantum in ticks. */ #define SCHED_QUANTUM (hz / 10) /* Default sched quantum */ static struct callout roundrobin_callout; static void setup_runqs(void); static void roundrobin(void *arg); static void schedcpu(void); static void schedcpu_thread(void); static void sched_setup(void *dummy); static void maybe_resched(struct thread *td); static void updatepri(struct ksegrp *kg); static void resetpriority(struct ksegrp *kg); static struct kproc_desc sched_kp = { "schedcpu", schedcpu_thread, NULL }; SYSINIT(schedcpu, SI_SUB_RUN_SCHEDULER, SI_ORDER_FIRST, kproc_start, &sched_kp) SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL) /* * Global run queue. */ static struct runq runq; #ifdef SMP /* * Per-CPU run queues */ static struct runq runq_pcpu[MAXCPU]; #endif static void setup_runqs(void) { #ifdef SMP int i; for (i = 0; i < MAXCPU; ++i) runq_init(&runq_pcpu[i]); #endif runq_init(&runq); } static int sysctl_kern_quantum(SYSCTL_HANDLER_ARGS) { int error, new_val; new_val = sched_quantum * tick; error = sysctl_handle_int(oidp, &new_val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (new_val < tick) return (EINVAL); sched_quantum = new_val / tick; hogticks = 2 * sched_quantum; return (0); } SYSCTL_PROC(_kern, OID_AUTO, quantum, CTLTYPE_INT|CTLFLAG_RW, 0, sizeof sched_quantum, sysctl_kern_quantum, "I", "Roundrobin scheduling quantum in microseconds"); /* * Arrange to reschedule if necessary, taking the priorities and * schedulers into account. 
*/ static void maybe_resched(struct thread *td) { mtx_assert(&sched_lock, MA_OWNED); if (td->td_priority < curthread->td_priority && curthread->td_kse) curthread->td_flags |= TDF_NEEDRESCHED; } /* * Force switch among equal priority processes every 100ms. * We don't actually need to force a context switch of the current process. * The act of firing the event triggers a context switch to softclock() and * then switching back out again which is equivalent to a preemption, thus * no further work is needed on the local CPU. */ /* ARGSUSED */ static void roundrobin(void *arg) { #ifdef SMP mtx_lock_spin(&sched_lock); forward_roundrobin(); mtx_unlock_spin(&sched_lock); #endif callout_reset(&roundrobin_callout, sched_quantum, roundrobin, NULL); } /* * Constants for digital decay and forget: * 90% of (kg_estcpu) usage in 5 * loadav time * 95% of (ke_pctcpu) usage in 60 seconds (load insensitive) * Note that, as ps(1) mentions, this can let percentages * total over 100% (I've seen 137.9% for 3 processes). * * Note that schedclock() updates kg_estcpu and p_cpticks asynchronously. * * We wish to decay away 90% of kg_estcpu in (5 * loadavg) seconds. * That is, the system wants to compute a value of decay such * that the following for loop: * for (i = 0; i < (5 * loadavg); i++) * kg_estcpu *= decay; * will compute * kg_estcpu *= 0.1; * for all values of loadavg: * * Mathematically this loop can be expressed by saying: * decay ** (5 * loadavg) ~= .1 * * The system computes decay as: * decay = (2 * loadavg) / (2 * loadavg + 1) * * We wish to prove that the system's computation of decay * will always fulfill the equation: * decay ** (5 * loadavg) ~= .1 * * If we compute b as: * b = 2 * loadavg * then * decay = b / (b + 1) * * We now need to prove two things: * 1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1) * 2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg) * * Facts: * For x close to zero, exp(x) =~ 1 + x, since * exp(x) = 0! + x**1/1! + x**2/2! + ... 
. * therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b. * For x close to zero, ln(1+x) =~ x, since * ln(1+x) = x - x**2/2 + x**3/3 - ... -1 < x < 1 * therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1). * ln(.1) =~ -2.30 * * Proof of (1): * Solve (factor)**(power) =~ .1 given power (5*loadav): * solving for factor, * ln(factor) =~ (-2.30/5*loadav), or * factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) = * exp(-1/b) =~ (b-1)/b =~ b/(b+1). QED * * Proof of (2): * Solve (factor)**(power) =~ .1 given factor == (b/(b+1)): * solving for power, * power*ln(b/(b+1)) =~ -2.30, or * power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav. QED * * Actual power values for the implemented algorithm are as follows: * loadav: 1 2 3 4 * power: 5.68 10.32 14.94 19.55 */ /* calculations for digital decay to forget 90% of usage in 5*loadav sec */ #define loadfactor(loadav) (2 * (loadav)) #define decay_cpu(loadfac, cpu) (((loadfac) * (cpu)) / ((loadfac) + FSCALE)) /* decay 95% of `ke_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */ static fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */ SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, ""); /* * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT). * * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used: * 1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits). * * If you don't want to bother with the faster/more-accurate formula, you * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate * (more general) method of calculating the %age of CPU used by a process. */ #define CCPU_SHIFT 11 /* * Recompute process priorities, every hz ticks. * MP-safe, called without the Giant mutex. 
*/ /* ARGSUSED */ static void schedcpu(void) { register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]); struct thread *td; struct proc *p; struct kse *ke; struct ksegrp *kg; int awake, realstathz; realstathz = stathz ? stathz : hz; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { /* * Prevent state changes and protect run queue. */ mtx_lock_spin(&sched_lock); /* * Increment time in/out of memory. We ignore overflow; with * 16-bit int's (remember them?) overflow takes 45 days. */ p->p_swtime++; FOREACH_KSEGRP_IN_PROC(p, kg) { awake = 0; FOREACH_KSE_IN_GROUP(kg, ke) { /* * Increment sleep time (if sleeping). We * ignore overflow, as above. */ /* * The kse slptimes are not touched in wakeup * because the thread may not HAVE a KSE. */ if (ke->ke_state == KES_ONRUNQ) { awake = 1; ke->ke_flags &= ~KEF_DIDRUN; } else if ((ke->ke_state == KES_THREAD) && (TD_IS_RUNNING(ke->ke_thread))) { awake = 1; /* Do not clear KEF_DIDRUN */ } else if (ke->ke_flags & KEF_DIDRUN) { awake = 1; ke->ke_flags &= ~KEF_DIDRUN; } /* * ke_pctcpu is only for ps and ttyinfo(). * Do it per kse, and add them up at the end? * XXXKSE */ ke->ke_pctcpu = (ke->ke_pctcpu * ccpu) >> FSHIFT; /* * If the kse has been idle the entire second, * stop recalculating its priority until * it wakes up. */ if (ke->ke_sched->ske_cpticks == 0) continue; #if (FSHIFT >= CCPU_SHIFT) ke->ke_pctcpu += (realstathz == 100) ? ((fixpt_t) ke->ke_sched->ske_cpticks) << (FSHIFT - CCPU_SHIFT) : 100 * (((fixpt_t) ke->ke_sched->ske_cpticks) << (FSHIFT - CCPU_SHIFT)) / realstathz; #else ke->ke_pctcpu += ((FSCALE - ccpu) * (ke->ke_sched->ske_cpticks * FSCALE / realstathz)) >> FSHIFT; #endif ke->ke_sched->ske_cpticks = 0; } /* end of kse loop */ /* * If there are ANY running threads in this KSEGRP, * then don't count it as sleeping. 
*/ if (awake) { if (kg->kg_slptime > 1) { /* * In an ideal world, this should not * happen, because whoever woke us * up from the long sleep should have * unwound the slptime and reset our * priority before we run at the stale * priority. Should KASSERT at some * point when all the cases are fixed. */ updatepri(kg); } kg->kg_slptime = 0; } else kg->kg_slptime++; if (kg->kg_slptime > 1) continue; kg->kg_estcpu = decay_cpu(loadfac, kg->kg_estcpu); resetpriority(kg); FOREACH_THREAD_IN_GROUP(kg, td) { if (td->td_priority >= PUSER) { sched_prio(td, kg->kg_user_pri); } } } /* end of ksegrp loop */ mtx_unlock_spin(&sched_lock); } /* end of process loop */ sx_sunlock(&allproc_lock); } /* * Main loop for a kthread that executes schedcpu once a second. */ static void schedcpu_thread(void) { int nowake; for (;;) { schedcpu(); tsleep(&nowake, curthread->td_priority, "-", hz); } } /* * Recalculate the priority of a process after it has slept for a while. * For all load averages >= 1 and max kg_estcpu of 255, sleeping for at * least six times the loadfactor will decay kg_estcpu to zero. */ static void updatepri(struct ksegrp *kg) { register fixpt_t loadfac; register unsigned int newcpu; loadfac = loadfactor(averunnable.ldavg[0]); if (kg->kg_slptime > 5 * loadfac) kg->kg_estcpu = 0; else { newcpu = kg->kg_estcpu; kg->kg_slptime--; /* was incremented in schedcpu() */ while (newcpu && --kg->kg_slptime) newcpu = decay_cpu(loadfac, newcpu); kg->kg_estcpu = newcpu; } resetpriority(kg); } /* * Compute the priority of a process when running in user mode. * Arrange to reschedule if the resulting priority is better * than that of the current process. 
*/ static void resetpriority(struct ksegrp *kg) { register unsigned int newpriority; struct thread *td; if (kg->kg_pri_class == PRI_TIMESHARE) { newpriority = PUSER + kg->kg_estcpu / INVERSE_ESTCPU_WEIGHT + - NICE_WEIGHT * (kg->kg_nice - PRIO_MIN); + NICE_WEIGHT * (kg->kg_proc->p_nice - PRIO_MIN); newpriority = min(max(newpriority, PRI_MIN_TIMESHARE), PRI_MAX_TIMESHARE); kg->kg_user_pri = newpriority; } FOREACH_THREAD_IN_GROUP(kg, td) { maybe_resched(td); /* XXXKSE silly */ } } /* ARGSUSED */ static void sched_setup(void *dummy) { setup_runqs(); if (sched_quantum == 0) sched_quantum = SCHED_QUANTUM; hogticks = 2 * sched_quantum; callout_init(&roundrobin_callout, CALLOUT_MPSAFE); /* Kick off timeout driven events by calling first time. */ roundrobin(NULL); /* Account for thread0. */ sched_tdcnt++; } /* External interfaces start here */ int sched_runnable(void) { #ifdef SMP return runq_check(&runq) + runq_check(&runq_pcpu[PCPU_GET(cpuid)]); #else return runq_check(&runq); #endif } int sched_rr_interval(void) { if (sched_quantum == 0) sched_quantum = SCHED_QUANTUM; return (sched_quantum); } /* * We adjust the priority of the current process. The priority of * a process gets worse as it accumulates CPU time. The cpu usage * estimator (kg_estcpu) is increased here. resetpriority() will * compute a different priority each time kg_estcpu increases by * INVERSE_ESTCPU_WEIGHT * (until MAXPRI is reached). The cpu usage estimator ramps up * quite quickly when the process is running (linearly), and decays * away exponentially, at a rate which is proportionally slower when * the system is busy. The basic principle is that the system will * 90% forget that the process used a lot of CPU time in 5 * loadav * seconds. This causes the system to favor processes which haven't * run much recently, and to round-robin among other processes. 
*/ void sched_clock(struct thread *td) { struct ksegrp *kg; struct kse *ke; mtx_assert(&sched_lock, MA_OWNED); kg = td->td_ksegrp; ke = td->td_kse; ke->ke_sched->ske_cpticks++; kg->kg_estcpu = ESTCPULIM(kg->kg_estcpu + 1); if ((kg->kg_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) { resetpriority(kg); if (td->td_priority >= PUSER) td->td_priority = kg->kg_user_pri; } } /* * Charge the child's scheduling CPU usage to the parent. * * XXXKSE assume only one thread & kse & ksegrp keep estcpu in each ksegrp. * Charge it to the ksegrp that did the wait; since process estcpu is the sum of * all ksegrps, this is strictly as expected. Assume that the child process * aggregated all the estcpu into the 'built-in' ksegrp. */ void sched_exit(struct proc *p, struct proc *p1) { sched_exit_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(p1)); sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(p1)); sched_exit_thread(FIRST_THREAD_IN_PROC(p), FIRST_THREAD_IN_PROC(p1)); } /* Nothing is accounted per KSE on exit in this scheduler. */ void sched_exit_kse(struct kse *ke, struct kse *child) { } /* Add the child's estcpu to kg's, clamped by ESTCPULIM. */ void sched_exit_ksegrp(struct ksegrp *kg, struct ksegrp *child) { mtx_assert(&sched_lock, MA_OWNED); kg->kg_estcpu = ESTCPULIM(kg->kg_estcpu + child->kg_estcpu); } /* Drop the child from sched_tdcnt unless its process is marked P_NOLOAD. */ void sched_exit_thread(struct thread *td, struct thread *child) { if ((child->td_proc->p_flag & P_NOLOAD) == 0) sched_tdcnt--; } /* * Set up scheduler state at fork by running the per-kse, per-ksegrp and * per-thread hooks on the first kse/ksegrp/thread of p and p1; p1's * objects are passed as the 'child' arguments. */ void sched_fork(struct proc *p, struct proc *p1) { sched_fork_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(p1)); sched_fork_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(p1)); sched_fork_thread(FIRST_THREAD_IN_PROC(p), FIRST_THREAD_IN_PROC(p1)); } /* The child KSE starts with no accumulated cpu ticks. */ void sched_fork_kse(struct kse *ke, struct kse *child) { child->ke_sched->ske_cpticks = 0; } /* The child ksegrp inherits kg's estcpu. */ void sched_fork_ksegrp(struct ksegrp *kg, struct ksegrp *child) { mtx_assert(&sched_lock, MA_OWNED); child->kg_estcpu = kg->kg_estcpu; } /* No per-thread scheduler state is set up at fork. */ void sched_fork_thread(struct thread *td, struct thread *child) { } void -sched_nice(struct ksegrp *kg, int nice) +sched_nice(struct proc *p, int nice) { + struct ksegrp *kg; - PROC_LOCK_ASSERT(kg->kg_proc, 
MA_OWNED); + PROC_LOCK_ASSERT(p, MA_OWNED); mtx_assert(&sched_lock, MA_OWNED); - kg->kg_nice = nice; - resetpriority(kg); + p->p_nice = nice; + FOREACH_KSEGRP_IN_PROC(p, kg) { + resetpriority(kg); + } } void sched_class(struct ksegrp *kg, int class) { mtx_assert(&sched_lock, MA_OWNED); kg->kg_pri_class = class; } /* * Adjust the priority of a thread. * This may include moving the thread within the KSEGRP, * changing the assignment of a kse to the thread, * and moving a KSE in the system run queue. */ void sched_prio(struct thread *td, u_char prio) { mtx_assert(&sched_lock, MA_OWNED); if (TD_ON_RUNQ(td)) { adjustrunqueue(td, prio); } else { td->td_priority = prio; } } void sched_sleep(struct thread *td) { mtx_assert(&sched_lock, MA_OWNED); td->td_ksegrp->kg_slptime = 0; td->td_base_pri = td->td_priority; } void sched_switch(struct thread *td) { struct thread *newtd; struct kse *ke; struct proc *p; ke = td->td_kse; p = td->td_proc; mtx_assert(&sched_lock, MA_OWNED); KASSERT((ke->ke_state == KES_THREAD), ("sched_switch: kse state?")); if ((p->p_flag & P_NOLOAD) == 0) sched_tdcnt--; td->td_lastcpu = td->td_oncpu; td->td_last_kse = ke; td->td_flags &= ~TDF_NEEDRESCHED; td->td_oncpu = NOCPU; /* * At the last moment, if this thread is still marked RUNNING, * then put it back on the run queue as it has not been suspended * or stopped or any thing else similar. */ if (TD_IS_RUNNING(td)) { /* Put us back on the run queue (kse and all). */ setrunqueue(td); } else if (p->p_flag & P_SA) { /* * We will not be on the run queue. So we must be * sleeping or similar. As it's available, * someone else can use the KSE if they need it. 
*/ kse_reassign(ke); } newtd = choosethread(); if (td != newtd) cpu_switch(td, newtd); sched_lock.mtx_lock = (uintptr_t)td; td->td_oncpu = PCPU_GET(cpuid); } void sched_wakeup(struct thread *td) { struct ksegrp *kg; mtx_assert(&sched_lock, MA_OWNED); kg = td->td_ksegrp; if (kg->kg_slptime > 1) updatepri(kg); kg->kg_slptime = 0; setrunqueue(td); maybe_resched(td); } void sched_add(struct thread *td) { struct kse *ke; ke = td->td_kse; mtx_assert(&sched_lock, MA_OWNED); KASSERT((ke->ke_thread != NULL), ("sched_add: No thread on KSE")); KASSERT((ke->ke_thread->td_kse != NULL), ("sched_add: No KSE on thread")); KASSERT(ke->ke_state != KES_ONRUNQ, ("sched_add: kse %p (%s) already in run queue", ke, ke->ke_proc->p_comm)); KASSERT(ke->ke_proc->p_sflag & PS_INMEM, ("sched_add: process swapped out")); ke->ke_ksegrp->kg_runq_kses++; ke->ke_state = KES_ONRUNQ; #ifdef SMP if (KSE_CAN_MIGRATE(ke)) { CTR1(KTR_4BSD, "adding kse:%p to gbl runq", ke); ke->ke_runq = &runq; } else { CTR1(KTR_4BSD, "adding kse:%p to pcpu runq", ke); if (!SKE_RUNQ_PCPU(ke)) ke->ke_runq = &runq_pcpu[PCPU_GET(cpuid)]; } #else ke->ke_runq = &runq; #endif if ((td->td_proc->p_flag & P_NOLOAD) == 0) sched_tdcnt++; runq_add(ke->ke_runq, ke); } void sched_rem(struct thread *td) { struct kse *ke; ke = td->td_kse; KASSERT(ke->ke_proc->p_sflag & PS_INMEM, ("sched_rem: process swapped out")); KASSERT((ke->ke_state == KES_ONRUNQ), ("sched_rem: KSE not on run queue")); mtx_assert(&sched_lock, MA_OWNED); if ((td->td_proc->p_flag & P_NOLOAD) == 0) sched_tdcnt--; runq_remove(ke->ke_sched->ske_runq, ke); ke->ke_state = KES_THREAD; ke->ke_ksegrp->kg_runq_kses--; } struct kse * sched_choose(void) { struct kse *ke; struct runq *rq; #ifdef SMP struct kse *kecpu; rq = &runq; ke = runq_choose(&runq); kecpu = runq_choose(&runq_pcpu[PCPU_GET(cpuid)]); if (ke == NULL || (kecpu != NULL && kecpu->ke_thread->td_priority < ke->ke_thread->td_priority)) { CTR2(KTR_4BSD, "choosing kse %p from pcpu runq %d", kecpu, PCPU_GET(cpuid)); ke = 
kecpu; rq = &runq_pcpu[PCPU_GET(cpuid)]; } else { CTR1(KTR_4BSD, "choosing kse %p from main runq", ke); } #else rq = &runq; ke = runq_choose(&runq); #endif if (ke != NULL) { runq_remove(rq, ke); ke->ke_state = KES_THREAD; KASSERT((ke->ke_thread != NULL), ("sched_choose: No thread on KSE")); KASSERT((ke->ke_thread->td_kse != NULL), ("sched_choose: No KSE on thread")); KASSERT(ke->ke_proc->p_sflag & PS_INMEM, ("sched_choose: process swapped out")); } return (ke); } void sched_userret(struct thread *td) { struct ksegrp *kg; /* * XXX we cheat slightly on the locking here to avoid locking in * the usual case. Setting td_priority here is essentially an * incomplete workaround for not setting it properly elsewhere. * Now that some interrupt handlers are threads, not setting it * properly elsewhere can clobber it in the window between setting * it here and returning to user mode, so don't waste time setting * it perfectly here. */ kg = td->td_ksegrp; if (td->td_priority != kg->kg_user_pri) { mtx_lock_spin(&sched_lock); td->td_priority = kg->kg_user_pri; mtx_unlock_spin(&sched_lock); } } void sched_bind(struct thread *td, int cpu) { struct kse *ke; mtx_assert(&sched_lock, MA_OWNED); KASSERT(TD_IS_RUNNING(td), ("sched_bind: cannot bind non-running thread")); ke = td->td_kse; ke->ke_flags |= KEF_BOUND; #ifdef SMP ke->ke_runq = &runq_pcpu[cpu]; if (PCPU_GET(cpuid) == cpu) return; ke->ke_state = KES_THREAD; mi_switch(SW_VOL); #endif } void sched_unbind(struct thread* td) { mtx_assert(&sched_lock, MA_OWNED); td->td_kse->ke_flags &= ~KEF_BOUND; } int sched_load(void) { return (sched_tdcnt); } int sched_sizeof_kse(void) { return (sizeof(struct kse) + sizeof(struct ke_sched)); } int sched_sizeof_ksegrp(void) { return (sizeof(struct ksegrp)); } int sched_sizeof_proc(void) { return (sizeof(struct proc)); } int sched_sizeof_thread(void) { return (sizeof(struct thread)); } fixpt_t sched_pctcpu(struct thread *td) { struct kse *ke; ke = td->td_kse; if (ke == NULL) ke = td->td_last_kse; 
if (ke) return (ke->ke_pctcpu); return (0); } Index: head/sys/kern/sched_ule.c =================================================================== --- head/sys/kern/sched_ule.c (revision 130550) +++ head/sys/kern/sched_ule.c (revision 130551) @@ -1,1744 +1,1750 @@ /*- * Copyright (c) 2002-2003, Jeffrey Roberson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #ifdef KTRACE #include #include #endif #include #include #define KTR_ULE KTR_NFS /* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */ /* XXX This is bogus compatibility crap for ps */ static fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */ SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, ""); static void sched_setup(void *dummy); SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL) static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "SCHED"); /* Read-write tunables backing SCHED_SLICE_MIN and SCHED_SLICE_MAX. */ static int slice_min = 1; SYSCTL_INT(_kern_sched, OID_AUTO, slice_min, CTLFLAG_RW, &slice_min, 0, ""); static int slice_max = 10; SYSCTL_INT(_kern_sched, OID_AUTO, slice_max, CTLFLAG_RW, &slice_max, 0, ""); int realstathz; int tickincr = 1; /* * These data structures are allocated within their parent data structure but * are scheduler specific. */ struct ke_sched { int ske_slice; struct runq *ske_runq; /* The following variables are only used for pctcpu calculation */ int ske_ltick; /* Last tick that we were running on */ int ske_ftick; /* First tick that we were running on */ int ske_ticks; /* Tick count */ /* CPU that we have affinity for. */ u_char ske_cpu; }; #define ke_slice ke_sched->ske_slice #define ke_runq ke_sched->ske_runq #define ke_ltick ke_sched->ske_ltick #define ke_ftick ke_sched->ske_ftick #define ke_ticks ke_sched->ske_ticks #define ke_cpu ke_sched->ske_cpu #define ke_assign ke_procq.tqe_next #define KEF_ASSIGNED KEF_SCHED0 /* KSE is being migrated. */ #define KEF_BOUND KEF_SCHED1 /* KSE can not migrate. */ struct kg_sched { int skg_slptime; /* Number of ticks we vol. 
slept */ int skg_runtime; /* Number of ticks we were running */ }; #define kg_slptime kg_sched->skg_slptime #define kg_runtime kg_sched->skg_runtime struct td_sched { int std_slptime; }; #define td_slptime td_sched->std_slptime struct td_sched td_sched; struct ke_sched ke_sched; struct kg_sched kg_sched; struct ke_sched *kse0_sched = &ke_sched; struct kg_sched *ksegrp0_sched = &kg_sched; struct p_sched *proc0_sched = NULL; struct td_sched *thread0_sched = &td_sched; /* * The priority is primarily determined by the interactivity score. Thus, we * give lower (better) priorities to kse groups that use less CPU. The nice * value is then directly added to this to allow nice to have some effect * on latency. * * PRI_RANGE: Total priority range for timeshare threads. * PRI_NRESV: Number of nice values. * PRI_NHALF: Half of PRI_NRESV. * PRI_BASE: The start of the dynamic range. * PRI_INTERACT: Scales an interactivity score into PRI_RANGE. */ #define SCHED_PRI_RANGE (PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1) #define SCHED_PRI_NRESV ((PRIO_MAX - PRIO_MIN) + 1) #define SCHED_PRI_NHALF (SCHED_PRI_NRESV / 2) #define SCHED_PRI_BASE (PRI_MIN_TIMESHARE) #define SCHED_PRI_INTERACT(score) \ ((score) * SCHED_PRI_RANGE / SCHED_INTERACT_MAX) /* * These determine the interactivity of a process. * * SLP_RUN_MAX: Maximum amount of sleep time + run time we'll accumulate * before throttling back. * SLP_RUN_FORK: Maximum slp+run time to inherit at fork time. * INTERACT_MAX: Maximum interactivity value. Smaller is better. * INTERACT_HALF: Half of INTERACT_MAX. * INTERACT_THRESH: Threshold for placement on the current runq. */ #define SCHED_SLP_RUN_MAX ((hz * 5) << 10) #define SCHED_SLP_RUN_FORK ((hz / 2) << 10) #define SCHED_INTERACT_MAX (100) #define SCHED_INTERACT_HALF (SCHED_INTERACT_MAX / 2) #define SCHED_INTERACT_THRESH (30) /* * These parameters and macros determine the size of the time slice that is * granted to each thread. * * SLICE_MIN: Minimum time slice granted, in units of ticks. * SLICE_MAX: Maximum time slice granted. * SLICE_RANGE: Range of available time slices scaled by hz. 
* SLICE_SCALE: The number of slices granted per val in the range of [0, max]. * SLICE_NICE: Determine the amount of slice granted to a scaled nice. * SLICE_NTHRESH: The nice cutoff point for slice assignment. */ #define SCHED_SLICE_MIN (slice_min) #define SCHED_SLICE_MAX (slice_max) #define SCHED_SLICE_INTERACTIVE (slice_max) #define SCHED_SLICE_NTHRESH (SCHED_PRI_NHALF - 1) #define SCHED_SLICE_RANGE (SCHED_SLICE_MAX - SCHED_SLICE_MIN + 1) #define SCHED_SLICE_SCALE(val, max) (((val) * SCHED_SLICE_RANGE) / (max)) #define SCHED_SLICE_NICE(nice) \ (SCHED_SLICE_MAX - SCHED_SLICE_SCALE((nice), SCHED_SLICE_NTHRESH)) /* * These macros determine whether or not the kse belongs on the current or * next run queue: SCHED_INTERACTIVE tests the ksegrp's interactivity score * against the threshold, and SCHED_CURR also accepts a kse whose thread * priority is numerically below the ksegrp's user priority. */ #define SCHED_INTERACTIVE(kg) \ (sched_interact_score(kg) < SCHED_INTERACT_THRESH) #define SCHED_CURR(kg, ke) \ (ke->ke_thread->td_priority < kg->kg_user_pri || \ SCHED_INTERACTIVE(kg)) /* * Cpu percentage computation macros and defines. * * SCHED_CPU_TIME: Number of seconds to average the cpu usage across. * SCHED_CPU_TICKS: Number of hz ticks to average the cpu usage across. */ #define SCHED_CPU_TIME 10 #define SCHED_CPU_TICKS (hz * SCHED_CPU_TIME) /* * kseq - per processor runqs and statistics. */ struct kseq { struct runq ksq_idle; /* Queue of IDLE threads. */ struct runq ksq_timeshare[2]; /* Run queues for !IDLE. */ struct runq *ksq_next; /* Next timeshare queue. */ struct runq *ksq_curr; /* Current queue. */ int ksq_load_timeshare; /* Load for timeshare. */ int ksq_load; /* Aggregate load. */ short ksq_nice[SCHED_PRI_NRESV]; /* KSEs in each nice bin. */ short ksq_nicemin; /* Least nice. */ #ifdef SMP int ksq_transferable; /* Queued KSEs that can migrate. */ LIST_ENTRY(kseq) ksq_siblings; /* Next in kseq group. */ struct kseq_group *ksq_group; /* Our processor group. */ volatile struct kse *ksq_assigned; /* assigned by another CPU. */ #else int ksq_sysload; /* For loadavg, !ITHD load. */ #endif }; #ifdef SMP /* * kseq groups are groups of processors which can cheaply share threads. 
When * one processor in the group goes idle it will check the runqs of the other * processors in its group prior to halting and waiting for an interrupt. * These groups are suitable for SMT (Simultaneous Multi-Threading) and not NUMA. * In a NUMA environment we'd want an idle bitmap per group and a two-tiered * load balancer. */ struct kseq_group { int ksg_cpus; /* Count of CPUs in this kseq group. */ cpumask_t ksg_cpumask; /* Mask of cpus in this group. */ cpumask_t ksg_idlemask; /* Idle cpus in this group. */ cpumask_t ksg_mask; /* Bit mask for first cpu. */ int ksg_load; /* Total load of this group. */ int ksg_transferable; /* Transferable load of this group. */ LIST_HEAD(, kseq) ksg_members; /* Linked list of all members. */ }; #endif /* * One kse queue per processor. */ #ifdef SMP static cpumask_t kseq_idle; static int ksg_maxid; static struct kseq kseq_cpu[MAXCPU]; static struct kseq_group kseq_groups[MAXCPU]; static int bal_tick; static int gbal_tick; #define KSEQ_SELF() (&kseq_cpu[PCPU_GET(cpuid)]) #define KSEQ_CPU(x) (&kseq_cpu[(x)]) #define KSEQ_ID(x) ((x) - kseq_cpu) #define KSEQ_GROUP(x) (&kseq_groups[(x)]) #else /* !SMP */ static struct kseq kseq_cpu; #define KSEQ_SELF() (&kseq_cpu) #define KSEQ_CPU(x) (&kseq_cpu) #endif static void sched_slice(struct kse *ke); static void sched_priority(struct ksegrp *kg); static int sched_interact_score(struct ksegrp *kg); static void sched_interact_update(struct ksegrp *kg); static void sched_interact_fork(struct ksegrp *kg); static void sched_pctcpu_update(struct kse *ke); /* Operations on per processor queues */ static struct kse * kseq_choose(struct kseq *kseq); static void kseq_setup(struct kseq *kseq); static void kseq_load_add(struct kseq *kseq, struct kse *ke); static void kseq_load_rem(struct kseq *kseq, struct kse *ke); static __inline void kseq_runq_add(struct kseq *kseq, struct kse *ke); static __inline void kseq_runq_rem(struct kseq *kseq, struct kse *ke); static void kseq_nice_add(struct kseq *kseq, int 
nice); static void kseq_nice_rem(struct kseq *kseq, int nice); void kseq_print(int cpu); #ifdef SMP static int kseq_transfer(struct kseq *ksq, struct kse *ke, int class); static struct kse *runq_steal(struct runq *rq); static void sched_balance(void); static void sched_balance_groups(void); static void sched_balance_group(struct kseq_group *ksg); static void sched_balance_pair(struct kseq *high, struct kseq *low); static void kseq_move(struct kseq *from, int cpu); static int kseq_idled(struct kseq *kseq); static void kseq_notify(struct kse *ke, int cpu); static void kseq_assign(struct kseq *); static struct kse *kseq_steal(struct kseq *kseq, int stealidle); /* * On P4 Xeons the round-robin interrupt delivery is broken. As a result of * this, we can't pin interrupts to the cpu that they were delivered to, * otherwise all ithreads only run on CPU 0. */ #ifdef __i386__ #define KSE_CAN_MIGRATE(ke, class) \ ((ke)->ke_thread->td_pinned == 0 && ((ke)->ke_flags & KEF_BOUND) == 0) #else /* !__i386__ */ #define KSE_CAN_MIGRATE(ke, class) \ ((class) != PRI_ITHD && (ke)->ke_thread->td_pinned == 0 && \ ((ke)->ke_flags & KEF_BOUND) == 0) #endif /* !__i386__ */ #endif void kseq_print(int cpu) { struct kseq *kseq; int i; kseq = KSEQ_CPU(cpu); printf("kseq:\n"); printf("\tload: %d\n", kseq->ksq_load); printf("\tload TIMESHARE: %d\n", kseq->ksq_load_timeshare); #ifdef SMP printf("\tload transferable: %d\n", kseq->ksq_transferable); #endif printf("\tnicemin:\t%d\n", kseq->ksq_nicemin); printf("\tnice counts:\n"); for (i = 0; i < SCHED_PRI_NRESV; i++) if (kseq->ksq_nice[i]) printf("\t\t%d = %d\n", i - SCHED_PRI_NHALF, kseq->ksq_nice[i]); } static __inline void kseq_runq_add(struct kseq *kseq, struct kse *ke) { #ifdef SMP if (KSE_CAN_MIGRATE(ke, PRI_BASE(ke->ke_ksegrp->kg_pri_class))) { kseq->ksq_transferable++; kseq->ksq_group->ksg_transferable++; } #endif runq_add(ke->ke_runq, ke); } static __inline void kseq_runq_rem(struct kseq *kseq, struct kse *ke) { #ifdef SMP if 
(KSE_CAN_MIGRATE(ke, PRI_BASE(ke->ke_ksegrp->kg_pri_class))) { kseq->ksq_transferable--; kseq->ksq_group->ksg_transferable--; } #endif runq_remove(ke->ke_runq, ke); } static void kseq_load_add(struct kseq *kseq, struct kse *ke) { int class; mtx_assert(&sched_lock, MA_OWNED); class = PRI_BASE(ke->ke_ksegrp->kg_pri_class); if (class == PRI_TIMESHARE) kseq->ksq_load_timeshare++; kseq->ksq_load++; if (class != PRI_ITHD && (ke->ke_proc->p_flag & P_NOLOAD) == 0) #ifdef SMP kseq->ksq_group->ksg_load++; #else kseq->ksq_sysload++; #endif if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) CTR6(KTR_ULE, "Add kse %p to %p (slice: %d, pri: %d, nice: %d(%d))", ke, ke->ke_runq, ke->ke_slice, ke->ke_thread->td_priority, - ke->ke_ksegrp->kg_nice, kseq->ksq_nicemin); + ke->ke_proc->p_nice, kseq->ksq_nicemin); if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) - kseq_nice_add(kseq, ke->ke_ksegrp->kg_nice); + kseq_nice_add(kseq, ke->ke_proc->p_nice); } static void kseq_load_rem(struct kseq *kseq, struct kse *ke) { int class; mtx_assert(&sched_lock, MA_OWNED); class = PRI_BASE(ke->ke_ksegrp->kg_pri_class); if (class == PRI_TIMESHARE) kseq->ksq_load_timeshare--; if (class != PRI_ITHD && (ke->ke_proc->p_flag & P_NOLOAD) == 0) #ifdef SMP kseq->ksq_group->ksg_load--; #else kseq->ksq_sysload--; #endif kseq->ksq_load--; ke->ke_runq = NULL; if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) - kseq_nice_rem(kseq, ke->ke_ksegrp->kg_nice); + kseq_nice_rem(kseq, ke->ke_proc->p_nice); } static void kseq_nice_add(struct kseq *kseq, int nice) { mtx_assert(&sched_lock, MA_OWNED); /* Normalize to zero. */ kseq->ksq_nice[nice + SCHED_PRI_NHALF]++; if (nice < kseq->ksq_nicemin || kseq->ksq_load_timeshare == 1) kseq->ksq_nicemin = nice; } static void kseq_nice_rem(struct kseq *kseq, int nice) { int n; mtx_assert(&sched_lock, MA_OWNED); /* Normalize to zero. 
*/ n = nice + SCHED_PRI_NHALF; kseq->ksq_nice[n]--; KASSERT(kseq->ksq_nice[n] >= 0, ("Negative nice count.")); /* * If this wasn't the smallest nice value or there are more in * this bucket we can just return. Otherwise we have to recalculate * the smallest nice. */ if (nice != kseq->ksq_nicemin || kseq->ksq_nice[n] != 0 || kseq->ksq_load_timeshare == 0) return; for (; n < SCHED_PRI_NRESV; n++) if (kseq->ksq_nice[n]) { kseq->ksq_nicemin = n - SCHED_PRI_NHALF; return; } } #ifdef SMP /* * sched_balance is a simple CPU load balancing algorithm. It operates by * finding the least loaded and most loaded cpu and equalizing their load * by migrating some processes. * * Dealing only with two CPUs at a time has two advantages. Firstly, most * installations will only have 2 cpus. Secondly, load balancing too much at * once can have an unpleasant effect on the system. The scheduler rarely has * enough information to make perfect decisions. So this algorithm chooses * algorithm simplicity and more gradual effects on load in larger systems. * * It could be improved by considering the priorities and slices assigned to * each task prior to balancing them. There are many pathological cases with * any approach and so the semi random algorithm below may work as well as any. * */ static void sched_balance(void) { struct kseq_group *high; struct kseq_group *low; struct kseq_group *ksg; int cnt; int i; if (smp_started == 0) goto out; low = high = NULL; i = random() % (ksg_maxid + 1); for (cnt = 0; cnt <= ksg_maxid; cnt++) { ksg = KSEQ_GROUP(i); /* * Find the CPU with the highest load that has some * threads to transfer. 
*/ if ((high == NULL || ksg->ksg_load > high->ksg_load) && ksg->ksg_transferable) high = ksg; if (low == NULL || ksg->ksg_load < low->ksg_load) low = ksg; if (++i > ksg_maxid) i = 0; } if (low != NULL && high != NULL && high != low) sched_balance_pair(LIST_FIRST(&high->ksg_members), LIST_FIRST(&low->ksg_members)); out: bal_tick = ticks + (random() % (hz * 2)); } static void sched_balance_groups(void) { int i; mtx_assert(&sched_lock, MA_OWNED); if (smp_started) for (i = 0; i <= ksg_maxid; i++) sched_balance_group(KSEQ_GROUP(i)); gbal_tick = ticks + (random() % (hz * 2)); } static void sched_balance_group(struct kseq_group *ksg) { struct kseq *kseq; struct kseq *high; struct kseq *low; int load; if (ksg->ksg_transferable == 0) return; low = NULL; high = NULL; LIST_FOREACH(kseq, &ksg->ksg_members, ksq_siblings) { load = kseq->ksq_load; if (high == NULL || load > high->ksq_load) high = kseq; if (low == NULL || load < low->ksq_load) low = kseq; } if (high != NULL && low != NULL && high != low) sched_balance_pair(high, low); } static void sched_balance_pair(struct kseq *high, struct kseq *low) { int transferable; int high_load; int low_load; int move; int diff; int i; /* * If we're transfering within a group we have to use this specific * kseq's transferable count, otherwise we can steal from other members * of the group. */ if (high->ksq_group == low->ksq_group) { transferable = high->ksq_transferable; high_load = high->ksq_load; low_load = low->ksq_load; } else { transferable = high->ksq_group->ksg_transferable; high_load = high->ksq_group->ksg_load; low_load = low->ksq_group->ksg_load; } if (transferable == 0) return; /* * Determine what the imbalance is and then adjust that to how many * kses we actually have to give up (transferable). 
*/ diff = high_load - low_load; move = diff / 2; if (diff & 0x1) move++; move = min(move, transferable); for (i = 0; i < move; i++) kseq_move(high, KSEQ_ID(low)); return; } static void kseq_move(struct kseq *from, int cpu) { struct kseq *kseq; struct kseq *to; struct kse *ke; kseq = from; to = KSEQ_CPU(cpu); ke = kseq_steal(kseq, 1); if (ke == NULL) { struct kseq_group *ksg; ksg = kseq->ksq_group; LIST_FOREACH(kseq, &ksg->ksg_members, ksq_siblings) { if (kseq == from || kseq->ksq_transferable == 0) continue; ke = kseq_steal(kseq, 1); break; } if (ke == NULL) panic("kseq_move: No KSEs available with a " "transferable count of %d\n", ksg->ksg_transferable); } if (kseq == to) return; ke->ke_state = KES_THREAD; kseq_runq_rem(kseq, ke); kseq_load_rem(kseq, ke); kseq_notify(ke, cpu); } static int kseq_idled(struct kseq *kseq) { struct kseq_group *ksg; struct kseq *steal; struct kse *ke; ksg = kseq->ksq_group; /* * If we're in a cpu group, try and steal kses from another cpu in * the group before idling. */ if (ksg->ksg_cpus > 1 && ksg->ksg_transferable) { LIST_FOREACH(steal, &ksg->ksg_members, ksq_siblings) { if (steal == kseq || steal->ksq_transferable == 0) continue; ke = kseq_steal(steal, 0); if (ke == NULL) continue; ke->ke_state = KES_THREAD; kseq_runq_rem(steal, ke); kseq_load_rem(steal, ke); ke->ke_cpu = PCPU_GET(cpuid); sched_add(ke->ke_thread); return (0); } } /* * We only set the idled bit when all of the cpus in the group are * idle. Otherwise we could get into a situation where a KSE bounces * back and forth between two idle cores on seperate physical CPUs. 
*/ ksg->ksg_idlemask |= PCPU_GET(cpumask); if (ksg->ksg_idlemask != ksg->ksg_cpumask) return (1); atomic_set_int(&kseq_idle, ksg->ksg_mask); return (1); } static void kseq_assign(struct kseq *kseq) { struct kse *nke; struct kse *ke; do { (volatile struct kse *)ke = kseq->ksq_assigned; } while(!atomic_cmpset_ptr(&kseq->ksq_assigned, ke, NULL)); for (; ke != NULL; ke = nke) { nke = ke->ke_assign; ke->ke_flags &= ~KEF_ASSIGNED; sched_add(ke->ke_thread); } } static void kseq_notify(struct kse *ke, int cpu) { struct kseq *kseq; struct thread *td; struct pcpu *pcpu; ke->ke_cpu = cpu; ke->ke_flags |= KEF_ASSIGNED; kseq = KSEQ_CPU(cpu); /* * Place a KSE on another cpu's queue and force a resched. */ do { (volatile struct kse *)ke->ke_assign = kseq->ksq_assigned; } while(!atomic_cmpset_ptr(&kseq->ksq_assigned, ke->ke_assign, ke)); pcpu = pcpu_find(cpu); td = pcpu->pc_curthread; if (ke->ke_thread->td_priority < td->td_priority || td == pcpu->pc_idlethread) { td->td_flags |= TDF_NEEDRESCHED; ipi_selected(1 << cpu, IPI_AST); } } static struct kse * runq_steal(struct runq *rq) { struct rqhead *rqh; struct rqbits *rqb; struct kse *ke; int word; int bit; mtx_assert(&sched_lock, MA_OWNED); rqb = &rq->rq_status; for (word = 0; word < RQB_LEN; word++) { if (rqb->rqb_bits[word] == 0) continue; for (bit = 0; bit < RQB_BPW; bit++) { if ((rqb->rqb_bits[word] & (1ul << bit)) == 0) continue; rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)]; TAILQ_FOREACH(ke, rqh, ke_procq) { if (KSE_CAN_MIGRATE(ke, PRI_BASE(ke->ke_ksegrp->kg_pri_class))) return (ke); } } } return (NULL); } static struct kse * kseq_steal(struct kseq *kseq, int stealidle) { struct kse *ke; /* * Steal from next first to try to get a non-interactive task that * may not have run for a while. 
*/ if ((ke = runq_steal(kseq->ksq_next)) != NULL) return (ke); if ((ke = runq_steal(kseq->ksq_curr)) != NULL) return (ke); if (stealidle) return (runq_steal(&kseq->ksq_idle)); return (NULL); } int kseq_transfer(struct kseq *kseq, struct kse *ke, int class) { struct kseq_group *ksg; int cpu; if (smp_started == 0) return (0); cpu = 0; ksg = kseq->ksq_group; /* * If there are any idle groups, give them our extra load. The * threshold at which we start to reassign kses has a large impact * on the overall performance of the system. Tuned too high and * some CPUs may idle. Too low and there will be excess migration * and context switches. */ if (ksg->ksg_load > (ksg->ksg_cpus * 2) && kseq_idle) { /* * Multiple cpus could find this bit simultaneously * but the race shouldn't be terrible. */ cpu = ffs(kseq_idle); if (cpu) atomic_clear_int(&kseq_idle, 1 << (cpu - 1)); } /* * If another cpu in this group has idled, assign a thread over * to them after checking to see if there are idled groups. */ if (cpu == 0 && kseq->ksq_load > 1 && ksg->ksg_idlemask) { cpu = ffs(ksg->ksg_idlemask); if (cpu) ksg->ksg_idlemask &= ~(1 << (cpu - 1)); } /* * Now that we've found an idle CPU, migrate the thread. */ if (cpu) { cpu--; ke->ke_runq = NULL; kseq_notify(ke, cpu); return (1); } return (0); } #endif /* SMP */ /* * Pick the highest priority task we have and return it. */ static struct kse * kseq_choose(struct kseq *kseq) { struct kse *ke; struct runq *swap; mtx_assert(&sched_lock, MA_OWNED); swap = NULL; for (;;) { ke = runq_choose(kseq->ksq_curr); if (ke == NULL) { /* * We already swaped once and didn't get anywhere. */ if (swap) break; swap = kseq->ksq_curr; kseq->ksq_curr = kseq->ksq_next; kseq->ksq_next = swap; continue; } /* * If we encounter a slice of 0 the kse is in a * TIMESHARE kse group and its nice was too far out * of the range that receives slices. 
*/ if (ke->ke_slice == 0) { runq_remove(ke->ke_runq, ke); sched_slice(ke); ke->ke_runq = kseq->ksq_next; runq_add(ke->ke_runq, ke); continue; } return (ke); } return (runq_choose(&kseq->ksq_idle)); } static void kseq_setup(struct kseq *kseq) { runq_init(&kseq->ksq_timeshare[0]); runq_init(&kseq->ksq_timeshare[1]); runq_init(&kseq->ksq_idle); kseq->ksq_curr = &kseq->ksq_timeshare[0]; kseq->ksq_next = &kseq->ksq_timeshare[1]; kseq->ksq_load = 0; kseq->ksq_load_timeshare = 0; } static void sched_setup(void *dummy) { #ifdef SMP int balance_groups; int i; #endif slice_min = (hz/100); /* 10ms */ slice_max = (hz/7); /* ~140ms */ #ifdef SMP balance_groups = 0; /* * Initialize the kseqs. */ for (i = 0; i < MAXCPU; i++) { struct kseq *ksq; ksq = &kseq_cpu[i]; ksq->ksq_assigned = NULL; kseq_setup(&kseq_cpu[i]); } if (smp_topology == NULL) { struct kseq_group *ksg; struct kseq *ksq; for (i = 0; i < MAXCPU; i++) { ksq = &kseq_cpu[i]; ksg = &kseq_groups[i]; /* * Setup a kseq group with one member. */ ksq->ksq_transferable = 0; ksq->ksq_group = ksg; ksg->ksg_cpus = 1; ksg->ksg_idlemask = 0; ksg->ksg_cpumask = ksg->ksg_mask = 1 << i; ksg->ksg_load = 0; ksg->ksg_transferable = 0; LIST_INIT(&ksg->ksg_members); LIST_INSERT_HEAD(&ksg->ksg_members, ksq, ksq_siblings); } } else { struct kseq_group *ksg; struct cpu_group *cg; int j; for (i = 0; i < smp_topology->ct_count; i++) { cg = &smp_topology->ct_group[i]; ksg = &kseq_groups[i]; /* * Initialize the group. */ ksg->ksg_idlemask = 0; ksg->ksg_load = 0; ksg->ksg_transferable = 0; ksg->ksg_cpus = cg->cg_count; ksg->ksg_cpumask = cg->cg_mask; LIST_INIT(&ksg->ksg_members); /* * Find all of the group members and add them. 
*/ for (j = 0; j < MAXCPU; j++) { if ((cg->cg_mask & (1 << j)) != 0) { if (ksg->ksg_mask == 0) ksg->ksg_mask = 1 << j; kseq_cpu[j].ksq_transferable = 0; kseq_cpu[j].ksq_group = ksg; LIST_INSERT_HEAD(&ksg->ksg_members, &kseq_cpu[j], ksq_siblings); } } if (ksg->ksg_cpus > 1) balance_groups = 1; } ksg_maxid = smp_topology->ct_count - 1; } /* * Stagger the group and global load balancer so they do not * interfere with each other. */ bal_tick = ticks + hz; if (balance_groups) gbal_tick = ticks + (hz / 2); #else kseq_setup(KSEQ_SELF()); #endif mtx_lock_spin(&sched_lock); kseq_load_add(KSEQ_SELF(), &kse0); mtx_unlock_spin(&sched_lock); } /* * Scale the scheduling priority according to the "interactivity" of this * process. */ static void sched_priority(struct ksegrp *kg) { int pri; if (kg->kg_pri_class != PRI_TIMESHARE) return; pri = SCHED_PRI_INTERACT(sched_interact_score(kg)); pri += SCHED_PRI_BASE; - pri += kg->kg_nice; + pri += kg->kg_proc->p_nice; if (pri > PRI_MAX_TIMESHARE) pri = PRI_MAX_TIMESHARE; else if (pri < PRI_MIN_TIMESHARE) pri = PRI_MIN_TIMESHARE; kg->kg_user_pri = pri; return; } /* * Calculate a time slice based on the properties of the kseg and the runq * that we're on. This is only for PRI_TIMESHARE ksegrps. */ static void sched_slice(struct kse *ke) { struct kseq *kseq; struct ksegrp *kg; kg = ke->ke_ksegrp; kseq = KSEQ_CPU(ke->ke_cpu); /* * Rationale: * KSEs in interactive ksegs get the minimum slice so that we * quickly notice if it abuses its advantage. * * KSEs in non-interactive ksegs are assigned a slice that is * based on the ksegs nice value relative to the least nice kseg * on the run queue for this cpu. * * If the KSE is less nice than all others it gets the maximum * slice and other KSEs will adjust their slice relative to * this when they first expire. * * There is 20 point window that starts relative to the least * nice kse on the run queue. Slice size is determined by * the kse distance from the last nice ksegrp. 
* * If the kse is outside of the window it will get no slice * and will be reevaluated each time it is selected on the * run queue. The exception to this is nice 0 ksegs when * a nice -20 is running. They are always granted a minimum * slice. */ if (!SCHED_INTERACTIVE(kg)) { int nice; - nice = kg->kg_nice + (0 - kseq->ksq_nicemin); + nice = kg->kg_proc->p_nice + (0 - kseq->ksq_nicemin); if (kseq->ksq_load_timeshare == 0 || - kg->kg_nice < kseq->ksq_nicemin) + kg->kg_proc->p_nice < kseq->ksq_nicemin) ke->ke_slice = SCHED_SLICE_MAX; else if (nice <= SCHED_SLICE_NTHRESH) ke->ke_slice = SCHED_SLICE_NICE(nice); - else if (kg->kg_nice == 0) + else if (kg->kg_proc->p_nice == 0) ke->ke_slice = SCHED_SLICE_MIN; else ke->ke_slice = 0; } else ke->ke_slice = SCHED_SLICE_INTERACTIVE; CTR6(KTR_ULE, "Sliced %p(%d) (nice: %d, nicemin: %d, load: %d, interactive: %d)", - ke, ke->ke_slice, kg->kg_nice, kseq->ksq_nicemin, + ke, ke->ke_slice, kg->kg_proc->p_nice, kseq->ksq_nicemin, kseq->ksq_load_timeshare, SCHED_INTERACTIVE(kg)); return; } /* * This routine enforces a maximum limit on the amount of scheduling history * kept. It is called after either the slptime or runtime is adjusted. * This routine will not operate correctly when slp or run times have been * adjusted to more than double their maximum. */ static void sched_interact_update(struct ksegrp *kg) { int sum; sum = kg->kg_runtime + kg->kg_slptime; if (sum < SCHED_SLP_RUN_MAX) return; /* * If we have exceeded by more than 1/5th then the algorithm below * will not bring us back into range. 
Dividing by two here forces * us into the range of [3/5 * SCHED_INTERACT_MAX, SCHED_INTERACT_MAX] */ if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) { kg->kg_runtime /= 2; kg->kg_slptime /= 2; return; } kg->kg_runtime = (kg->kg_runtime / 5) * 4; kg->kg_slptime = (kg->kg_slptime / 5) * 4; } static void sched_interact_fork(struct ksegrp *kg) { int ratio; int sum; sum = kg->kg_runtime + kg->kg_slptime; if (sum > SCHED_SLP_RUN_FORK) { ratio = sum / SCHED_SLP_RUN_FORK; kg->kg_runtime /= ratio; kg->kg_slptime /= ratio; } } static int sched_interact_score(struct ksegrp *kg) { int div; if (kg->kg_runtime > kg->kg_slptime) { div = max(1, kg->kg_runtime / SCHED_INTERACT_HALF); return (SCHED_INTERACT_HALF + (SCHED_INTERACT_HALF - (kg->kg_slptime / div))); } if (kg->kg_slptime > kg->kg_runtime) { div = max(1, kg->kg_slptime / SCHED_INTERACT_HALF); return (kg->kg_runtime / div); } /* * This can happen if slptime and runtime are 0. */ return (0); } /* * This is only somewhat accurate since given many processes of the same * priority they will switch when their slices run out, which will be * at most SCHED_SLICE_MAX. */ int sched_rr_interval(void) { return (SCHED_SLICE_MAX); } static void sched_pctcpu_update(struct kse *ke) { /* * Adjust counters and watermark for pctcpu calc. */ if (ke->ke_ltick > ticks - SCHED_CPU_TICKS) { /* * Shift the tick count out so that the divide doesn't * round away our results. */ ke->ke_ticks <<= 10; ke->ke_ticks = (ke->ke_ticks / (ticks - ke->ke_ftick)) * SCHED_CPU_TICKS; ke->ke_ticks >>= 10; } else ke->ke_ticks = 0; ke->ke_ltick = ticks; ke->ke_ftick = ke->ke_ltick - SCHED_CPU_TICKS; } void sched_prio(struct thread *td, u_char prio) { struct kse *ke; ke = td->td_kse; mtx_assert(&sched_lock, MA_OWNED); if (TD_ON_RUNQ(td)) { /* * If the priority has been elevated due to priority * propagation, we may have to move ourselves to a new * queue. We still call adjustrunqueue below in case kse * needs to fix things up. 
*/ if (prio < td->td_priority && ke && (ke->ke_flags & KEF_ASSIGNED) == 0 && ke->ke_runq != KSEQ_CPU(ke->ke_cpu)->ksq_curr) { runq_remove(ke->ke_runq, ke); ke->ke_runq = KSEQ_CPU(ke->ke_cpu)->ksq_curr; runq_add(ke->ke_runq, ke); } adjustrunqueue(td, prio); } else td->td_priority = prio; } void sched_switch(struct thread *td) { struct thread *newtd; struct kse *ke; mtx_assert(&sched_lock, MA_OWNED); ke = td->td_kse; td->td_last_kse = ke; td->td_lastcpu = td->td_oncpu; td->td_oncpu = NOCPU; td->td_flags &= ~TDF_NEEDRESCHED; /* * If the KSE has been assigned it may be in the process of switching * to the new cpu. This is the case in sched_bind(). */ if ((ke->ke_flags & KEF_ASSIGNED) == 0) { if (TD_IS_RUNNING(td)) { kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke); setrunqueue(td); } else { if (ke->ke_runq) { kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke); } else if ((td->td_flags & TDF_IDLETD) == 0) backtrace(); /* * We will not be on the run queue. So we must be * sleeping or similar. */ if (td->td_proc->p_flag & P_SA) kse_reassign(ke); } } newtd = choosethread(); if (td != newtd) cpu_switch(td, newtd); sched_lock.mtx_lock = (uintptr_t)td; td->td_oncpu = PCPU_GET(cpuid); } void -sched_nice(struct ksegrp *kg, int nice) +sched_nice(struct proc *p, int nice) { + struct ksegrp *kg; struct kse *ke; struct thread *td; struct kseq *kseq; - PROC_LOCK_ASSERT(kg->kg_proc, MA_OWNED); + PROC_LOCK_ASSERT(p, MA_OWNED); mtx_assert(&sched_lock, MA_OWNED); /* * We need to adjust the nice counts for running KSEs. 
*/ - if (kg->kg_pri_class == PRI_TIMESHARE) - FOREACH_KSE_IN_GROUP(kg, ke) { - if (ke->ke_runq == NULL) - continue; - kseq = KSEQ_CPU(ke->ke_cpu); - kseq_nice_rem(kseq, kg->kg_nice); - kseq_nice_add(kseq, nice); + FOREACH_KSEGRP_IN_PROC(p, kg) { + if (kg->kg_pri_class == PRI_TIMESHARE) { + FOREACH_KSE_IN_GROUP(kg, ke) { + if (ke->ke_runq == NULL) + continue; + kseq = KSEQ_CPU(ke->ke_cpu); + kseq_nice_rem(kseq, p->p_nice); + kseq_nice_add(kseq, nice); + } } - kg->kg_nice = nice; - sched_priority(kg); - FOREACH_THREAD_IN_GROUP(kg, td) - td->td_flags |= TDF_NEEDRESCHED; + } + p->p_nice = nice; + FOREACH_KSEGRP_IN_PROC(p, kg) { + sched_priority(kg); + FOREACH_THREAD_IN_GROUP(kg, td) + td->td_flags |= TDF_NEEDRESCHED; + } } void sched_sleep(struct thread *td) { mtx_assert(&sched_lock, MA_OWNED); td->td_slptime = ticks; td->td_base_pri = td->td_priority; CTR2(KTR_ULE, "sleep kse %p (tick: %d)", td->td_kse, td->td_slptime); } void sched_wakeup(struct thread *td) { mtx_assert(&sched_lock, MA_OWNED); /* * Let the kseg know how long we slept for. This is because process * interactivity behavior is modeled in the kseg. */ if (td->td_slptime) { struct ksegrp *kg; int hzticks; kg = td->td_ksegrp; hzticks = (ticks - td->td_slptime) << 10; if (hzticks >= SCHED_SLP_RUN_MAX) { kg->kg_slptime = SCHED_SLP_RUN_MAX; kg->kg_runtime = 1; } else { kg->kg_slptime += hzticks; sched_interact_update(kg); } sched_priority(kg); if (td->td_kse) sched_slice(td->td_kse); CTR2(KTR_ULE, "wakeup kse %p (%d ticks)", td->td_kse, hzticks); td->td_slptime = 0; } setrunqueue(td); } /* * Penalize the parent for creating a new child and initialize the child's * priority. 
*/ void sched_fork(struct proc *p, struct proc *p1) { mtx_assert(&sched_lock, MA_OWNED); + p1->p_nice = p->p_nice; sched_fork_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(p1)); sched_fork_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(p1)); sched_fork_thread(FIRST_THREAD_IN_PROC(p), FIRST_THREAD_IN_PROC(p1)); } void sched_fork_kse(struct kse *ke, struct kse *child) { child->ke_slice = 1; /* Attempt to quickly learn interactivity. */ child->ke_cpu = ke->ke_cpu; child->ke_runq = NULL; /* Grab our parents cpu estimation information. */ child->ke_ticks = ke->ke_ticks; child->ke_ltick = ke->ke_ltick; child->ke_ftick = ke->ke_ftick; } void sched_fork_ksegrp(struct ksegrp *kg, struct ksegrp *child) { PROC_LOCK_ASSERT(child->kg_proc, MA_OWNED); child->kg_slptime = kg->kg_slptime; child->kg_runtime = kg->kg_runtime; child->kg_user_pri = kg->kg_user_pri; - child->kg_nice = kg->kg_nice; sched_interact_fork(child); kg->kg_runtime += tickincr << 10; sched_interact_update(kg); CTR6(KTR_ULE, "sched_fork_ksegrp: %d(%d, %d) - %d(%d, %d)", kg->kg_proc->p_pid, kg->kg_slptime, kg->kg_runtime, child->kg_proc->p_pid, child->kg_slptime, child->kg_runtime); } void sched_fork_thread(struct thread *td, struct thread *child) { } void sched_class(struct ksegrp *kg, int class) { struct kseq *kseq; struct kse *ke; int nclass; int oclass; mtx_assert(&sched_lock, MA_OWNED); if (kg->kg_pri_class == class) return; nclass = PRI_BASE(class); oclass = PRI_BASE(kg->kg_pri_class); FOREACH_KSE_IN_GROUP(kg, ke) { if (ke->ke_state != KES_ONRUNQ && ke->ke_state != KES_THREAD) continue; kseq = KSEQ_CPU(ke->ke_cpu); #ifdef SMP /* * On SMP if we're on the RUNQ we must adjust the transferable * count because could be changing to or from an interrupt * class. 
*/ if (ke->ke_state == KES_ONRUNQ) { if (KSE_CAN_MIGRATE(ke, oclass)) { kseq->ksq_transferable--; kseq->ksq_group->ksg_transferable--; } if (KSE_CAN_MIGRATE(ke, nclass)) { kseq->ksq_transferable++; kseq->ksq_group->ksg_transferable++; } } #endif if (oclass == PRI_TIMESHARE) { kseq->ksq_load_timeshare--; - kseq_nice_rem(kseq, kg->kg_nice); + kseq_nice_rem(kseq, kg->kg_proc->p_nice); } if (nclass == PRI_TIMESHARE) { kseq->ksq_load_timeshare++; - kseq_nice_add(kseq, kg->kg_nice); + kseq_nice_add(kseq, kg->kg_proc->p_nice); } } kg->kg_pri_class = class; } /* * Return some of the child's priority and interactivity to the parent. */ void sched_exit(struct proc *p, struct proc *child) { mtx_assert(&sched_lock, MA_OWNED); sched_exit_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(child)); sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(child)); } void sched_exit_kse(struct kse *ke, struct kse *child) { kseq_load_rem(KSEQ_CPU(child->ke_cpu), child); } void sched_exit_ksegrp(struct ksegrp *kg, struct ksegrp *child) { /* kg->kg_slptime += child->kg_slptime; */ kg->kg_runtime += child->kg_runtime; sched_interact_update(kg); } void sched_exit_thread(struct thread *td, struct thread *child) { } void sched_clock(struct thread *td) { struct kseq *kseq; struct ksegrp *kg; struct kse *ke; mtx_assert(&sched_lock, MA_OWNED); #ifdef SMP if (ticks == bal_tick) sched_balance(); if (ticks == gbal_tick) sched_balance_groups(); #endif /* * sched_setup() apparently happens prior to stathz being set. We * need to resolve the timers earlier in the boot so we can avoid * calculating this here. */ if (realstathz == 0) { realstathz = stathz ? stathz : hz; tickincr = hz / realstathz; /* * XXX This does not work for values of stathz that are much * larger than hz. 
*/ if (tickincr == 0) tickincr = 1; } ke = td->td_kse; kg = ke->ke_ksegrp; /* Adjust ticks for pctcpu */ ke->ke_ticks++; ke->ke_ltick = ticks; /* Go up to one second beyond our max and then trim back down */ if (ke->ke_ftick + SCHED_CPU_TICKS + hz < ke->ke_ltick) sched_pctcpu_update(ke); if (td->td_flags & TDF_IDLETD) return; CTR4(KTR_ULE, "Tick kse %p (slice: %d, slptime: %d, runtime: %d)", ke, ke->ke_slice, kg->kg_slptime >> 10, kg->kg_runtime >> 10); /* * We only do slicing code for TIMESHARE ksegrps. */ if (kg->kg_pri_class != PRI_TIMESHARE) return; /* * We used a tick charge it to the ksegrp so that we can compute our * interactivity. */ kg->kg_runtime += tickincr << 10; sched_interact_update(kg); /* * We used up one time slice. */ if (--ke->ke_slice > 0) return; /* * We're out of time, recompute priorities and requeue. */ kseq = KSEQ_SELF(); kseq_load_rem(kseq, ke); sched_priority(kg); sched_slice(ke); if (SCHED_CURR(kg, ke)) ke->ke_runq = kseq->ksq_curr; else ke->ke_runq = kseq->ksq_next; kseq_load_add(kseq, ke); td->td_flags |= TDF_NEEDRESCHED; } int sched_runnable(void) { struct kseq *kseq; int load; load = 1; kseq = KSEQ_SELF(); #ifdef SMP if (kseq->ksq_assigned) { mtx_lock_spin(&sched_lock); kseq_assign(kseq); mtx_unlock_spin(&sched_lock); } #endif if ((curthread->td_flags & TDF_IDLETD) != 0) { if (kseq->ksq_load > 0) goto out; } else if (kseq->ksq_load - 1 > 0) goto out; load = 0; out: return (load); } void sched_userret(struct thread *td) { struct ksegrp *kg; kg = td->td_ksegrp; if (td->td_priority != kg->kg_user_pri) { mtx_lock_spin(&sched_lock); td->td_priority = kg->kg_user_pri; mtx_unlock_spin(&sched_lock); } } struct kse * sched_choose(void) { struct kseq *kseq; struct kse *ke; mtx_assert(&sched_lock, MA_OWNED); kseq = KSEQ_SELF(); #ifdef SMP restart: if (kseq->ksq_assigned) kseq_assign(kseq); #endif ke = kseq_choose(kseq); if (ke) { #ifdef SMP if (ke->ke_ksegrp->kg_pri_class == PRI_IDLE) if (kseq_idled(kseq) == 0) goto restart; #endif 
kseq_runq_rem(kseq, ke); ke->ke_state = KES_THREAD; if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) { CTR4(KTR_ULE, "Run kse %p from %p (slice: %d, pri: %d)", ke, ke->ke_runq, ke->ke_slice, ke->ke_thread->td_priority); } return (ke); } #ifdef SMP if (kseq_idled(kseq) == 0) goto restart; #endif return (NULL); } void sched_add(struct thread *td) { struct kseq *kseq; struct ksegrp *kg; struct kse *ke; int class; mtx_assert(&sched_lock, MA_OWNED); ke = td->td_kse; kg = td->td_ksegrp; if (ke->ke_flags & KEF_ASSIGNED) return; kseq = KSEQ_SELF(); KASSERT((ke->ke_thread != NULL), ("sched_add: No thread on KSE")); KASSERT((ke->ke_thread->td_kse != NULL), ("sched_add: No KSE on thread")); KASSERT(ke->ke_state != KES_ONRUNQ, ("sched_add: kse %p (%s) already in run queue", ke, ke->ke_proc->p_comm)); KASSERT(ke->ke_proc->p_sflag & PS_INMEM, ("sched_add: process swapped out")); KASSERT(ke->ke_runq == NULL, ("sched_add: KSE %p is still assigned to a run queue", ke)); class = PRI_BASE(kg->kg_pri_class); switch (class) { case PRI_ITHD: case PRI_REALTIME: ke->ke_runq = kseq->ksq_curr; ke->ke_slice = SCHED_SLICE_MAX; ke->ke_cpu = PCPU_GET(cpuid); break; case PRI_TIMESHARE: if (SCHED_CURR(kg, ke)) ke->ke_runq = kseq->ksq_curr; else ke->ke_runq = kseq->ksq_next; break; case PRI_IDLE: /* * This is for priority prop. */ if (ke->ke_thread->td_priority < PRI_MIN_IDLE) ke->ke_runq = kseq->ksq_curr; else ke->ke_runq = &kseq->ksq_idle; ke->ke_slice = SCHED_SLICE_MIN; break; default: panic("Unknown pri class."); break; } #ifdef SMP if (ke->ke_cpu != PCPU_GET(cpuid)) { ke->ke_runq = NULL; kseq_notify(ke, ke->ke_cpu); return; } /* * If we had been idle, clear our bit in the group and potentially * the global bitmap. If not, see if we should transfer this thread. */ if ((class == PRI_TIMESHARE || class == PRI_REALTIME) && (kseq->ksq_group->ksg_idlemask & PCPU_GET(cpumask)) != 0) { /* * Check to see if our group is unidling, and if so, remove it * from the global idle mask. 
*/ if (kseq->ksq_group->ksg_idlemask == kseq->ksq_group->ksg_cpumask) atomic_clear_int(&kseq_idle, kseq->ksq_group->ksg_mask); /* * Now remove ourselves from the group specific idle mask. */ kseq->ksq_group->ksg_idlemask &= ~PCPU_GET(cpumask); } else if (kseq->ksq_load > 1 && KSE_CAN_MIGRATE(ke, class)) if (kseq_transfer(kseq, ke, class)) return; #endif if (td->td_priority < curthread->td_priority) curthread->td_flags |= TDF_NEEDRESCHED; ke->ke_ksegrp->kg_runq_kses++; ke->ke_state = KES_ONRUNQ; kseq_runq_add(kseq, ke); kseq_load_add(kseq, ke); } void sched_rem(struct thread *td) { struct kseq *kseq; struct kse *ke; ke = td->td_kse; /* * It is safe to just return here because sched_rem() is only ever * used in places where we're immediately going to add the * kse back on again. In that case it'll be added with the correct * thread and priority when the caller drops the sched_lock. */ if (ke->ke_flags & KEF_ASSIGNED) return; mtx_assert(&sched_lock, MA_OWNED); KASSERT((ke->ke_state == KES_ONRUNQ), ("sched_rem: KSE not on run queue")); ke->ke_state = KES_THREAD; ke->ke_ksegrp->kg_runq_kses--; kseq = KSEQ_CPU(ke->ke_cpu); kseq_runq_rem(kseq, ke); kseq_load_rem(kseq, ke); } fixpt_t sched_pctcpu(struct thread *td) { fixpt_t pctcpu; struct kse *ke; pctcpu = 0; ke = td->td_kse; if (ke == NULL) return (0); mtx_lock_spin(&sched_lock); if (ke->ke_ticks) { int rtick; /* * Don't update more frequently than twice a second. Allowing * this causes the cpu usage to decay away too quickly due to * rounding errors. */ if (ke->ke_ftick + SCHED_CPU_TICKS < ke->ke_ltick || ke->ke_ltick < (ticks - (hz / 2))) sched_pctcpu_update(ke); /* How many rtick per second ? 
*/ rtick = min(ke->ke_ticks / SCHED_CPU_TIME, SCHED_CPU_TICKS); pctcpu = (FSCALE * ((FSCALE * rtick)/realstathz)) >> FSHIFT; } ke->ke_proc->p_swtime = ke->ke_ltick - ke->ke_ftick; mtx_unlock_spin(&sched_lock); return (pctcpu); } void sched_bind(struct thread *td, int cpu) { struct kse *ke; mtx_assert(&sched_lock, MA_OWNED); ke = td->td_kse; ke->ke_flags |= KEF_BOUND; #ifdef SMP if (PCPU_GET(cpuid) == cpu) return; /* sched_rem without the runq_remove */ ke->ke_state = KES_THREAD; ke->ke_ksegrp->kg_runq_kses--; kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke); kseq_notify(ke, cpu); /* When we return from mi_switch we'll be on the correct cpu. */ mi_switch(SW_VOL); #endif } void sched_unbind(struct thread *td) { mtx_assert(&sched_lock, MA_OWNED); td->td_kse->ke_flags &= ~KEF_BOUND; } int sched_load(void) { #ifdef SMP int total; int i; total = 0; for (i = 0; i <= ksg_maxid; i++) total += KSEQ_GROUP(i)->ksg_load; return (total); #else return (KSEQ_SELF()->ksq_sysload); #endif } int sched_sizeof_kse(void) { return (sizeof(struct kse) + sizeof(struct ke_sched)); } int sched_sizeof_ksegrp(void) { return (sizeof(struct ksegrp) + sizeof(struct kg_sched)); } int sched_sizeof_proc(void) { return (sizeof(struct proc)); } int sched_sizeof_thread(void) { return (sizeof(struct thread) + sizeof(struct td_sched)); } Index: head/sys/sys/proc.h =================================================================== --- head/sys/sys/proc.h (revision 130550) +++ head/sys/sys/proc.h (revision 130551) @@ -1,936 +1,936 @@ /*- * Copyright (c) 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)proc.h 8.15 (Berkeley) 5/19/95 * $FreeBSD$ */ #ifndef _SYS_PROC_H_ #define _SYS_PROC_H_ #include /* For struct callout. */ #include /* For struct klist. */ #ifndef _KERNEL #include #endif #include #include #include #include #include /* XXX. */ #include #include #include #ifndef _KERNEL #include /* For structs itimerval, timeval. */ #else #include #endif #include #include #include /* Machine-dependent proc substruct. */ /* * One structure allocated per session. 
* * List of locks * (m) locked by s_mtx mtx * (e) locked by proctree_lock sx * (c) const until freeing */ struct session { int s_count; /* (m) Ref cnt; pgrps in session. */ struct proc *s_leader; /* (m + e) Session leader. */ struct vnode *s_ttyvp; /* (m) Vnode of controlling tty. */ struct tty *s_ttyp; /* (m) Controlling tty. */ pid_t s_sid; /* (c) Session ID. */ /* (m) Setlogin() name: */ char s_login[roundup(MAXLOGNAME, sizeof(long))]; struct mtx s_mtx; /* Mutex to protect members. */ }; /* * One structure allocated per process group. * * List of locks * (m) locked by pg_mtx mtx * (e) locked by proctree_lock sx * (c) const until freeing */ struct pgrp { LIST_ENTRY(pgrp) pg_hash; /* (e) Hash chain. */ LIST_HEAD(, proc) pg_members; /* (m + e) Pointer to pgrp members. */ struct session *pg_session; /* (c) Pointer to session. */ struct sigiolst pg_sigiolst; /* (m) List of sigio sources. */ pid_t pg_id; /* (c) Pgrp id. */ int pg_jobc; /* (m) job cntl proc count */ struct mtx pg_mtx; /* Mutex to protect members */ }; /* * pargs, used to hold a copy of the command line, if it had a sane length. */ struct pargs { u_int ar_ref; /* Reference count. */ u_int ar_length; /* Length. */ u_char ar_args[1]; /* Arguments. */ }; /*- * Description of a process. * * This structure contains the information needed to manage a thread of * control, known in UN*X as a process; it has references to substructures * containing descriptions of things that the process uses, but may share * with related processes. The process structure and the substructures * are always addressable except for those marked "(CPU)" below, * which might be addressable only on a processor on which the process * is running. * * Below is a key of locks used to protect each member of struct proc. The * lock is indicated by a reference to a specific character in parens in the * associated comment. 
* * - not yet protected * a - only touched by curproc or parent during fork/wait * b - created at fork, never changes * (exception aiods switch vmspaces, but they are also * marked 'P_SYSTEM' so hopefully it will be left alone) * c - locked by proc mtx * d - locked by allproc_lock lock * e - locked by proctree_lock lock * f - session mtx * g - process group mtx * h - callout_lock mtx * i - by curproc or the master session mtx * j - locked by sched_lock mtx * k - only accessed by curthread * l - the attaching proc or attaching proc parent * m - Giant * n - not locked, lazy * o - ktrace lock * p - select lock (sellock) * q - td_contested lock * r - p_peers lock * x - created at fork, only changes during single threading in exec * z - zombie threads/kse/ksegroup lock * * If the locking key specifies two identifiers (for example, p_pptr) then * either lock is sufficient for read access, but both locks must be held * for write access. */ struct ithd; struct ke_sched; struct kg_sched; struct nlminfo; struct p_sched; struct sleepqueue; struct td_sched; struct trapframe; struct turnstile; /* * Here we define the four structures used for process information. * * The first is the thread. It might be though of as a "Kernel * Schedulable Entity Context". * This structure contains all the information as to where a thread of * execution is now, or was when it was suspended, why it was suspended, * and anything else that will be needed to restart it when it is * rescheduled. Always associated with a KSE when running, but can be * reassigned to an equivalent KSE when being restarted for * load balancing. Each of these is associated with a kernel stack * and a pcb. * * It is important to remember that a particular thread structure only * exists as long as the system call or kernel entrance (e.g. by pagefault) * which it is currently executing. It should therefore NEVER be referenced * by pointers in long lived structures that live longer than a single * request. 
If several threads complete their work at the same time, * they will all rewind their stacks to the user boundary, report their * completion state, and all but one will be freed. That last one will * be kept to provide a kernel stack and pcb for the NEXT syscall or kernel * entrance. (basically to save freeing and then re-allocating it) The KSE * keeps a cached thread available to allow it to quickly * get one when it needs a new one. There is also a system * cache of free threads. Threads have priority and partake in priority * inheritance schemes. */ struct thread; /* * The second structure is the Kernel Schedulable Entity. (KSE) * It represents the ability to take a slot in the scheduler queue. * As long as this is scheduled, it could continue to run any threads that * are assigned to the KSEGRP (see later) until either it runs out * of runnable threads of high enough priority, or CPU. * It runs on one CPU and is assigned a quantum of time. When a thread is * blocked, the KSE continues to run and will search for another thread * in a runnable state amongst those it has. It may decide to return to user * mode with a new 'empty' thread if there are no runnable threads. * Threads are temporarily associated with a KSE for scheduling reasons. */ struct kse; /* * The KSEGRP is allocated resources across a number of CPUs. * (Including a number of CPUxQUANTA.) It parcels these QUANTA up among * its KSEs, each of which should be running in a different CPU. * BASE priority and total available quanta are properties of a KSEGRP. * Multiple KSEGRPs in a single process compete against each other * for total quanta in the same way that a forked child competes against * its parent process. */ struct ksegrp; /* * A process is the owner of all system resources allocated to a task * except CPU quanta. * All KSEGs under one process see, and have the same access to, these * resources (e.g. files, memory, sockets, permissions kqueues).
* A process may compete for CPU cycles on the same basis as a * forked process cluster by spawning several KSEGRPs. */ struct proc; /*************** * In pictures: With a single run queue used by all processors: RUNQ: --->KSE---KSE--... SLEEPQ:[]---THREAD---THREAD---THREAD | / []---THREAD KSEG---THREAD--THREAD--THREAD [] []---THREAD---THREAD (processors run THREADs from the KSEG until they are exhausted or the KSEG exhausts its quantum) With PER-CPU run queues: KSEs on the separate run queues directly They would be given priorities calculated from the KSEG. * *****************/ /* * Kernel runnable context (thread). * This is what is put to sleep and reactivated. * The first KSE available in the correct group will run this thread. * If several are available, use the one on the same CPU as last time. * When waiting to be run, threads are hung off the KSEGRP in priority order. * with N runnable and queued KSEs in the KSEGRP, the first N threads * are linked to them. Other threads are not yet assigned. */ struct thread { struct proc *td_proc; /* (*) Associated process. */ struct ksegrp *td_ksegrp; /* (*) Associated KSEG. */ TAILQ_ENTRY(thread) td_plist; /* (*) All threads in this proc. */ TAILQ_ENTRY(thread) td_kglist; /* (*) All threads in this ksegrp. */ /* The two queues below should someday be merged. */ TAILQ_ENTRY(thread) td_slpq; /* (j) Sleep queue. */ TAILQ_ENTRY(thread) td_lockq; /* (j) Lock queue. */ TAILQ_ENTRY(thread) td_runq; /* (j/z) Run queue(s). XXXKSE */ TAILQ_HEAD(, selinfo) td_selq; /* (p) List of selinfos. */ struct sleepqueue *td_sleepqueue; /* (k) Associated sleep queue. */ struct turnstile *td_turnstile; /* (k) Associated turnstile. */ int td_tid; /* (b) Thread ID. */ /* Cleared during fork1() or thread_sched_upcall(). */ #define td_startzero td_flags int td_flags; /* (j) TDF_* flags. */ int td_inhibitors; /* (j) Why can not run. */ int td_pflags; /* (k) Private thread (TDP_*) flags. */ struct kse *td_last_kse; /* (j) Previous value of td_kse. 
*/ struct kse *td_kse; /* (j) Current KSE if running. */ int td_dupfd; /* (k) Ret value from fdopen. XXX */ void *td_wchan; /* (j) Sleep address. */ const char *td_wmesg; /* (j) Reason for sleep. */ u_char td_lastcpu; /* (j) Last cpu we were on. */ u_char td_oncpu; /* (j) Which cpu we are on. */ short td_locks; /* (k) DEBUG: lockmgr count of locks. */ struct turnstile *td_blocked; /* (j) Lock process is blocked on. */ struct ithd *td_ithd; /* (b) For interrupt threads only. */ const char *td_lockname; /* (j) Name of lock blocked on. */ LIST_HEAD(, turnstile) td_contested; /* (q) Contested locks. */ struct lock_list_entry *td_sleeplocks; /* (k) Held sleep locks. */ int td_intr_nesting_level; /* (k) Interrupt recursion. */ int td_pinned; /* (k) Temporary cpu pin count. */ struct kse_thr_mailbox *td_mailbox; /* (*) Userland mailbox address. */ struct ucred *td_ucred; /* (k) Reference to credentials. */ struct thread *td_standin; /* (*) Use this for an upcall. */ u_int td_prticks; /* (*) Profclock hits in sys for user */ struct kse_upcall *td_upcall; /* (*) Upcall structure. */ u_int64_t td_sticks; /* (j) Statclock hits in system mode. */ u_int td_uuticks; /* (*) Statclock in user, for UTS. */ u_int td_usticks; /* (*) Statclock in kernel, for UTS. */ int td_intrval; /* (*) Return value of TDF_INTERRUPT. */ sigset_t td_oldsigmask; /* (k) Saved mask from pre sigpause. */ sigset_t td_sigmask; /* (c) Current signal mask. */ sigset_t td_siglist; /* (c) Sigs arrived, not delivered. */ sigset_t *td_waitset; /* (c) Wait set for sigwait. */ TAILQ_ENTRY(thread) td_umtx; /* (c?) Link for when we're blocked. */ volatile u_int td_generation; /* (k) Enable detection of preemption */ stack_t td_sigstk; /* (k) Stack ptr and on-stack flag. */ int td_kflags; /* (c) Flags for KSE threading. */ #define td_endzero td_base_pri /* Copied during fork1() or thread_sched_upcall(). */ #define td_startcopy td_endzero u_char td_base_pri; /* (j) Thread base kernel priority. 
*/ u_char td_priority; /* (j) Thread active priority. */ #define td_endcopy td_pcb /* * Fields that must be manually set in fork1() or thread_sched_upcall() * or already have been set in the allocator, constructor, etc. */ struct pcb *td_pcb; /* (k) Kernel VA of pcb and kstack. */ enum { TDS_INACTIVE = 0x0, TDS_INHIBITED, TDS_CAN_RUN, TDS_RUNQ, TDS_RUNNING } td_state; /* Thread run state; compared by TD_IS_RUNNING() and friends. */ register_t td_retval[2]; /* (k) Syscall aux returns. */ struct callout td_slpcallout; /* (h) Callout for sleep. */ struct trapframe *td_frame; /* (k) */ struct vm_object *td_kstack_obj;/* (a) Kstack object. */ vm_offset_t td_kstack; /* (a) Kernel VA of kstack. */ int td_kstack_pages; /* (a) Size of the kstack. */ struct vm_object *td_altkstack_obj;/* (a) Alternate kstack object. */ vm_offset_t td_altkstack; /* (a) Kernel VA of alternate kstack. */ int td_altkstack_pages; /* (a) Size of the alternate kstack */ u_int td_critnest; /* (k) Critical section nest level. */ struct mdthread td_md; /* (k) Any machine-dependent fields. */ struct td_sched *td_sched; /* (*) Scheduler-specific data. */ }; /* Flags kept in td_flags: */ #define TDF_INPANIC 0x000002 /* Caused a panic, let it drive crashdump. */ #define TDF_CAN_UNBIND 0x000004 /* Only temporarily bound. */ #define TDF_SINTR 0x000008 /* Sleep is interruptible. */ #define TDF_TIMEOUT 0x000010 /* Timing out during sleep. */ #define TDF_IDLETD 0x000020 /* This is one of the per-CPU idle threads. */ #define TDF_SELECT 0x000040 /* Selecting; wakeup/waiting danger. */ #define TDF_TSNOBLOCK 0x000100 /* Don't block on a turnstile due to race. */ #define TDF_ASTPENDING 0x000800 /* Thread has some asynchronous events. */ #define TDF_TIMOFAIL 0x001000 /* Timeout from sleep after we were awake. */ #define TDF_INTERRUPT 0x002000 /* Thread is marked as interrupted. */ #define TDF_USTATCLOCK 0x004000 /* Finish user statclock hit at next AST. */ #define TDF_OWEUPC 0x008000 /* Owe thread an addupc() call at next AST.
*/ #define TDF_NEEDRESCHED 0x010000 /* Thread needs to yield. */ #define TDF_NEEDSIGCHK 0x020000 /* Thread may need signal delivery. */ #define TDF_UMTXWAKEUP 0x080000 /* Libthr thread must not sleep on a umtx. */ #define TDF_THRWAKEUP 0x100000 /* Libthr thread must not suspend itself. */ /* "Private" flags kept in td_pflags: */ #define TDP_OLDMASK 0x0001 /* Need to restore mask after suspend. */ #define TDP_INKTR 0x0002 /* Thread is currently in KTR code. */ #define TDP_INKTRACE 0x0004 /* Thread is currently in KTRACE code. */ #define TDP_UPCALLING 0x0008 /* This thread is doing an upcall. */ #define TDP_COWINPROGRESS 0x0010 /* Snapshot copy-on-write in progress. */ #define TDP_ALTSTACK 0x0020 /* Have alternate signal stack. */ #define TDP_DEADLKTREAT 0x0040 /* Lock acquisition - deadlock treatment. */ #define TDP_SA 0x0080 /* A scheduler activation based thread. */ /* Inhibitor bits kept in td_inhibitors: */ #define TDI_SUSPENDED 0x0001 /* On suspension queue. */ #define TDI_SLEEPING 0x0002 /* Actually asleep! (tricky). */ #define TDI_SWAPPED 0x0004 /* Stack not in mem.. bad juju if run. */ #define TDI_LOCK 0x0008 /* Stopped on a lock. */ #define TDI_IWAIT 0x0010 /* Awaiting interrupt. */ /* NOTE(review): the TDK_ flags presumably live in td_kflags; confirm. */ #define TDK_KSEREL 0x0001 /* Blocked in msleep on kg->kg_completed. */ #define TDK_KSERELSIG 0x0002 /* Blocked in msleep on p->p_siglist. */ #define TDK_WAKEUP 0x0004 /* Thread has been woken by kse_wakeup.
*/ #define TD_CAN_UNBIND(td) \ (((td)->td_flags & TDF_CAN_UNBIND) == TDF_CAN_UNBIND && \ ((td)->td_upcall != NULL)) #define TD_IS_SLEEPING(td) ((td)->td_inhibitors & TDI_SLEEPING) #define TD_ON_SLEEPQ(td) ((td)->td_wchan != NULL) #define TD_IS_SUSPENDED(td) ((td)->td_inhibitors & TDI_SUSPENDED) #define TD_IS_SWAPPED(td) ((td)->td_inhibitors & TDI_SWAPPED) #define TD_ON_LOCK(td) ((td)->td_inhibitors & TDI_LOCK) #define TD_AWAITING_INTR(td) ((td)->td_inhibitors & TDI_IWAIT) #define TD_IS_RUNNING(td) ((td)->td_state == TDS_RUNNING) #define TD_ON_RUNQ(td) ((td)->td_state == TDS_RUNQ) #define TD_CAN_RUN(td) ((td)->td_state == TDS_CAN_RUN) #define TD_IS_INHIBITED(td) ((td)->td_state == TDS_INHIBITED) /* Mark the thread TDS_INHIBITED and record the reason bit(s) in td_inhibitors. */ #define TD_SET_INHIB(td, inhib) do { \ (td)->td_state = TDS_INHIBITED; \ (td)->td_inhibitors |= (inhib); \ } while (0) /* Clear the reason bit(s); the state becomes TDS_CAN_RUN only if one of them was set and no inhibitor bits remain afterwards. */ #define TD_CLR_INHIB(td, inhib) do { \ if (((td)->td_inhibitors & (inhib)) && \ (((td)->td_inhibitors &= ~(inhib)) == 0)) \ (td)->td_state = TDS_CAN_RUN; \ } while (0) #define TD_SET_SLEEPING(td) TD_SET_INHIB((td), TDI_SLEEPING) #define TD_SET_SWAPPED(td) TD_SET_INHIB((td), TDI_SWAPPED) #define TD_SET_LOCK(td) TD_SET_INHIB((td), TDI_LOCK) #define TD_SET_SUSPENDED(td) TD_SET_INHIB((td), TDI_SUSPENDED) #define TD_SET_IWAIT(td) TD_SET_INHIB((td), TDI_IWAIT) /* NOTE(review): no TDI_EXITING bit is defined next to the other TDI_ inhibitor bits; unless it is defined elsewhere, this macro cannot compile if expanded -- confirm it is unused. */ #define TD_SET_EXITING(td) TD_SET_INHIB((td), TDI_EXITING) #define TD_CLR_SLEEPING(td) TD_CLR_INHIB((td), TDI_SLEEPING) #define TD_CLR_SWAPPED(td) TD_CLR_INHIB((td), TDI_SWAPPED) #define TD_CLR_LOCK(td) TD_CLR_INHIB((td), TDI_LOCK) #define TD_CLR_SUSPENDED(td) TD_CLR_INHIB((td), TDI_SUSPENDED) #define TD_CLR_IWAIT(td) TD_CLR_INHIB((td), TDI_IWAIT) /* NOTE(review): the three macros below expand to a bare assignment with no enclosing parentheses or do/while wrapper; use them only as complete statements. */ #define TD_SET_RUNNING(td) (td)->td_state = TDS_RUNNING #define TD_SET_RUNQ(td) (td)->td_state = TDS_RUNQ #define TD_SET_CAN_RUN(td) (td)->td_state = TDS_CAN_RUN /* * The schedulable entity that can be given a context to run. * A process may have several of these. Probably one per processor * but posibly a few more.
In this universe they are grouped * with a KSEG that contains the priority and niceness * for the group. */ struct kse { struct proc *ke_proc; /* (*) Associated process. */ struct ksegrp *ke_ksegrp; /* (*) Associated KSEG. */ TAILQ_ENTRY(kse) ke_kglist; /* (*) Queue of KSEs in ke_ksegrp. */ TAILQ_ENTRY(kse) ke_kgrlist; /* (*) Queue of KSEs in this state. */ TAILQ_ENTRY(kse) ke_procq; /* (j/z) Run queue. */ #define ke_startzero ke_flags int ke_flags; /* (j) KEF_* flags. */ struct thread *ke_thread; /* (*) Active associated thread. */ fixpt_t ke_pctcpu; /* (j) %cpu during p_swtime. */ u_char ke_oncpu; /* (j) Which cpu we are on. */ char ke_rqindex; /* (j) Run queue index. */ enum { KES_UNUSED = 0x0, KES_IDLE, KES_ONRUNQ, KES_UNQUEUED, /* in transit */ KES_THREAD /* slaved to thread state */ } ke_state; /* (j) KSE status. */ #define ke_endzero ke_dummy u_char ke_dummy; struct ke_sched *ke_sched; /* (*) Scheduler-specific data. */ }; /* flags kept in ke_flags */ #define KEF_SCHED0 0x00001 /* For scheduler-specific use. */ #define KEF_SCHED1 0x00002 /* For scheduler-specific use. */ #define KEF_SCHED2 0X00004 /* For scheduler-specific use. */ #define KEF_SCHED3 0x00008 /* For scheduler-specific use. */ #define KEF_DIDRUN 0x02000 /* KSE actually ran. */ #define KEF_EXIT 0x04000 /* KSE is being killed. */ /* * The upcall management structure. * The upcall is used when returning to userland. If a thread does not have * an upcall on return to userland the thread exports its context and exits. */ struct kse_upcall { TAILQ_ENTRY(kse_upcall) ku_link; /* List of upcalls in KSEG. */ struct ksegrp *ku_ksegrp; /* Associated KSEG. */ struct thread *ku_owner; /* owning thread */ int ku_flags; /* KUF_* flags. */ struct kse_mailbox *ku_mailbox; /* userland mailbox address. */ stack_t ku_stack; /* userland upcall stack. */ void *ku_func; /* userland upcall function. 
*/ unsigned int ku_mflags; /* cached upcall mailbox flags */ }; #define KUF_DOUPCALL 0x00001 /* Do upcall now, don't wait. */ #define KUF_EXITING 0x00002 /* Upcall structure is exiting. */ /* * Kernel-scheduled entity group (KSEG). The scheduler considers each KSEG to * be an indivisible unit from a time-sharing perspective, though each KSEG may * contain multiple KSEs. */ struct ksegrp { struct proc *kg_proc; /* (*) Process that contains this KSEG. */ TAILQ_ENTRY(ksegrp) kg_ksegrp; /* (*) Queue of KSEGs in kg_proc. */ TAILQ_HEAD(, kse) kg_kseq; /* (ke_kglist) All KSEs. */ TAILQ_HEAD(, kse) kg_iq; /* (ke_kgrlist) All idle KSEs. */ TAILQ_HEAD(, thread) kg_threads;/* (td_kglist) All threads. */ TAILQ_HEAD(, thread) kg_runq; /* (td_runq) waiting RUNNABLE threads */ TAILQ_HEAD(, thread) kg_slpq; /* (td_runq) NONRUNNABLE threads. */ TAILQ_HEAD(, kse_upcall) kg_upcalls; /* All upcalls in the group. */ #define kg_startzero kg_estcpu u_int kg_estcpu; /* (j) Sum of the same field in KSEs. */ u_int kg_slptime; /* (j) How long completely blocked. */ struct thread *kg_last_assigned; /* (j) Last thread assigned to a KSE. */ int kg_runnable; /* (j) Num runnable threads on queue. */ int kg_runq_kses; /* (j) Num KSEs on runq. */ int kg_idle_kses; /* (j) Num KSEs on iq. */ int kg_numupcalls; /* (j) Num upcalls. */ int kg_upsleeps; /* (c) Num threads in kse_release(). */ struct kse_thr_mailbox *kg_completed; /* (c) Completed thread mboxes. */ int kg_nextupcall; /* (*) Next upcall time. */ int kg_upquantum; /* (*) Quantum to schedule an upcall. */ #define kg_endzero kg_pri_class #define kg_startcopy kg_endzero u_char kg_pri_class; /* (j) Scheduling class. */ u_char kg_user_pri; /* (j) User pri from estcpu and nice. */ - signed char kg_nice; /* (c + j) Process "nice" value. */ #define kg_endcopy kg_numthreads int kg_numthreads; /* (j) Num threads in total. */ int kg_kses; /* (j) Num KSEs in group. */ struct kg_sched *kg_sched; /* (*) Scheduler-specific data. 
*/ }; /* * The old fashioned process. May have multiple threads, KSEGRPs * and KSEs. Starts off with a single embedded KSEGRP, KSE and THREAD. */ struct proc { LIST_ENTRY(proc) p_list; /* (d) List of all processes. */ TAILQ_HEAD(, ksegrp) p_ksegrps; /* (kg_ksegrp) All KSEGs. */ TAILQ_HEAD(, thread) p_threads; /* (td_plist) Threads. (shortcut) */ TAILQ_HEAD(, thread) p_suspended; /* (td_runq) Suspended threads. */ struct ucred *p_ucred; /* (c) Process owner's identity. */ struct filedesc *p_fd; /* (b) Ptr to open files structure. */ struct filedesc_to_leader *p_fdtol; /* (b) Ptr to tracking node */ /* Accumulated stats for all KSEs? */ struct pstats *p_stats; /* (b) Accounting/statistics (CPU). */ struct plimit *p_limit; /* (c) Process limits. */ struct vm_object *p_upages_obj; /* (a) Upages object. */ struct sigacts *p_sigacts; /* (x) Signal actions, state (CPU). */ /* * The following don't make too much sense.. * See the td_ or ke_ versions of the same flags */ int p_flag; /* (c) P_* flags. */ int p_sflag; /* (j) PS_* flags. */ enum { PRS_NEW = 0, /* In creation */ PRS_NORMAL, /* KSEs can be run. */ PRS_ZOMBIE } p_state; /* (j/c) PRS_* process status. */ pid_t p_pid; /* (b) Process identifier. */ LIST_ENTRY(proc) p_hash; /* (d) Hash chain. */ LIST_ENTRY(proc) p_pglist; /* (g + e) List of processes in pgrp. */ struct proc *p_pptr; /* (c + e) Pointer to parent process. */ LIST_ENTRY(proc) p_sibling; /* (e) List of sibling processes. */ LIST_HEAD(, proc) p_children; /* (e) Pointer to list of children. */ struct mtx p_mtx; /* (n) Lock for this struct. */ /* The following fields are all zeroed upon creation in fork. */ #define p_startzero p_oppid pid_t p_oppid; /* (c + e) Save ppid in ptrace. XXX */ struct vmspace *p_vmspace; /* (b) Address space. */ u_int p_swtime; /* (j) Time swapped in or out. */ struct itimerval p_realtimer; /* (c) Alarm timer. */ struct bintime p_runtime; /* (j) Real time. */ u_int64_t p_uu; /* (j) Previous user time in usec.
*/ u_int64_t p_su; /* (j) Previous system time in usec. */ u_int64_t p_iu; /* (j) Previous intr time in usec. */ u_int64_t p_uticks; /* (j) Statclock hits in user mode. */ u_int64_t p_sticks; /* (j) Statclock hits in system mode. */ u_int64_t p_iticks; /* (j) Statclock hits in intr. */ int p_profthreads; /* (c) Num threads in addupc_task. */ int p_maxthrwaits; /* (c) Max threads num waiters */ int p_traceflag; /* (o) Kernel trace points. */ struct vnode *p_tracevp; /* (c + o) Trace to vnode. */ struct ucred *p_tracecred; /* (o) Credentials to trace with. */ struct vnode *p_textvp; /* (b) Vnode of executable. */ sigset_t p_siglist; /* (c) Sigs not delivered to a td. */ char p_lock; /* (c) Proclock (prevent swap) count. */ struct klist p_klist; /* (c) Knotes attached to this proc. */ struct sigiolst p_sigiolst; /* (c) List of sigio sources. */ int p_sigparent; /* (c) Signal to parent on exit. */ int p_sig; /* (n) For core dump/debugger XXX. */ u_long p_code; /* (n) For core dump/debugger XXX. */ u_int p_stops; /* (c) Stop event bitmask. */ u_int p_stype; /* (c) Stop event type. */ char p_step; /* (c) Process is stopped. */ u_char p_pfsflags; /* (c) Procfs flags. */ struct nlminfo *p_nlminfo; /* (?) Only used by/for lockd. */ void *p_aioinfo; /* (?) ASYNC I/O info. */ struct thread *p_singlethread;/* (c + j) If single threading this is it */ int p_suspcount; /* (c) # threads in suspended mode */ /* End area that is zeroed on creation. */ #define p_endzero p_magic /* The following fields are all copied upon creation in fork. */ #define p_startcopy p_endzero u_int p_magic; /* (b) Magic number. */ char p_comm[MAXCOMLEN + 1]; /* (b) Process name. */ struct pgrp *p_pgrp; /* (c + e) Pointer to process group. */ struct sysentvec *p_sysent; /* (b) Syscall dispatch info. */ struct pargs *p_args; /* (c) Process arguments. */ rlim_t p_cpulimit; /* (j) Current CPU limit in seconds. */ + signed char p_nice; /* (c + j) Process "nice" value. 
*/ /* End area that is copied on creation. */ #define p_endcopy p_xstat u_short p_xstat; /* (c) Exit status; also stop sig. */ int p_numthreads; /* (j) Number of threads. */ int p_numksegrps; /* (?) number of ksegrps */ struct mdproc p_md; /* Any machine-dependent fields. */ struct callout p_itcallout; /* (h + c) Interval timer callout. */ struct user *p_uarea; /* (k) Kernel VA of u-area (CPU). */ u_short p_acflag; /* (c) Accounting flags. */ struct rusage *p_ru; /* (a) Exit information. XXX */ struct proc *p_peers; /* (r) */ struct proc *p_leader; /* (b) */ void *p_emuldata; /* (c) Emulator state data. */ struct label *p_label; /* (*) Proc (not subject) MAC label. */ struct p_sched *p_sched; /* (*) Scheduler-specific data. */ }; #define p_session p_pgrp->pg_session #define p_pgid p_pgrp->pg_id #define NOCPU 0xff /* For when we aren't on a CPU. (SMP) */ /* Status values (p_stat). */ /* These flags are kept in p_flag. */ #define P_ADVLOCK 0x00001 /* Process may hold a POSIX advisory lock. */ #define P_CONTROLT 0x00002 /* Has a controlling terminal. */ #define P_KTHREAD 0x00004 /* Kernel thread. (*)*/ #define P_NOLOAD 0x00008 /* Ignore during load avg calculations. */ #define P_PPWAIT 0x00010 /* Parent is waiting for child to exec/exit. */ #define P_PROFIL 0x00020 /* Has started profiling. */ #define P_STOPPROF 0x00040 /* Has thread in requesting to stop prof */ #define P_SUGID 0x00100 /* Had set id privileges since last exec. */ #define P_SYSTEM 0x00200 /* System proc: no sigs, stats or swapping. */ #define P_SINGLE_EXIT 0x00400 /* Threads suspending should exit, not wait. */ #define P_TRACED 0x00800 /* Debugged process being traced. */ #define P_WAITED 0x01000 /* Someone is waiting for us. */ #define P_WEXIT 0x02000 /* Working on exiting. */ #define P_EXEC 0x04000 /* Process called exec. */ #define P_SA 0x08000 /* Using scheduler activations. */ #define P_CONTINUED 0x10000 /* Proc has continued from a stopped state. 
*/ #define P_STOPPED_SIG 0x20000 /* Stopped due to SIGSTOP/SIGTSTP. */ #define P_STOPPED_TRACE 0x40000 /* Stopped because of tracing. */ #define P_STOPPED_SINGLE 0x80000 /* Only one thread can continue */ /* (not to user) */ #define P_PROTECTED 0x100000 /* Do not kill on memory overcommit. */ #define P_SIGEVENT 0x200000 /* Process pending signals changed. */ #define P_JAILED 0x1000000 /* Process is in jail. */ #define P_INEXEC 0x4000000 /* Process is in execve(). */ #define P_STOPPED (P_STOPPED_SIG|P_STOPPED_SINGLE|P_STOPPED_TRACE) #define P_SHOULDSTOP(p) ((p)->p_flag & P_STOPPED) /* These flags are kept in p_sflag and are protected with sched_lock. */ #define PS_INMEM 0x00001 /* Loaded into memory. */ #define PS_XCPU 0x00002 /* Exceeded CPU limit. */ #define PS_ALRMPEND 0x00020 /* Pending SIGVTALRM needs to be posted. */ #define PS_PROFPEND 0x00040 /* Pending SIGPROF needs to be posted. */ #define PS_SWAPINREQ 0x00100 /* Swapin request due to wakeup. */ #define PS_SWAPPINGOUT 0x00200 /* Process is being swapped out. */ #define PS_SWAPPINGIN 0x04000 /* Process is being swapped in. */ #define PS_MACPEND 0x08000 /* Ast()-based MAC event pending. */ /* used only in legacy conversion code */ #define SIDL 1 /* Process being created by fork. */ #define SRUN 2 /* Currently runnable. */ #define SSLEEP 3 /* Sleeping on an address. */ #define SSTOP 4 /* Process debugging or suspension. */ #define SZOMB 5 /* Awaiting collection by parent. */ #define SWAIT 6 /* Waiting for interrupt. */ #define SLOCK 7 /* Blocked on a lock. 
*/ #define P_MAGIC 0xbeefface #ifdef _KERNEL #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_PARGS); MALLOC_DECLARE(M_PGRP); MALLOC_DECLARE(M_SESSION); MALLOC_DECLARE(M_SUBPROC); MALLOC_DECLARE(M_ZOMBIE); #endif #define FOREACH_PROC_IN_SYSTEM(p) \ LIST_FOREACH((p), &allproc, p_list) #define FOREACH_KSEGRP_IN_PROC(p, kg) \ TAILQ_FOREACH((kg), &(p)->p_ksegrps, kg_ksegrp) #define FOREACH_THREAD_IN_GROUP(kg, td) \ TAILQ_FOREACH((td), &(kg)->kg_threads, td_kglist) #define FOREACH_KSE_IN_GROUP(kg, ke) \ TAILQ_FOREACH((ke), &(kg)->kg_kseq, ke_kglist) #define FOREACH_UPCALL_IN_GROUP(kg, ku) \ TAILQ_FOREACH((ku), &(kg)->kg_upcalls, ku_link) #define FOREACH_THREAD_IN_PROC(p, td) \ TAILQ_FOREACH((td), &(p)->p_threads, td_plist) /* XXXKSE the lines below should probably only be used in 1:1 code */ #define FIRST_THREAD_IN_PROC(p) TAILQ_FIRST(&(p)->p_threads) #define FIRST_KSEGRP_IN_PROC(p) TAILQ_FIRST(&(p)->p_ksegrps) #define FIRST_KSE_IN_KSEGRP(kg) TAILQ_FIRST(&(kg)->kg_kseq) #define FIRST_KSE_IN_PROC(p) FIRST_KSE_IN_KSEGRP(FIRST_KSEGRP_IN_PROC(p)) /* * We use process IDs <= PID_MAX; PID_MAX + 1 must also fit in a pid_t, * as it is used to represent "no process group". */ #define PID_MAX 99999 #define NO_PID 100000 #define SESS_LEADER(p) ((p)->p_session->s_leader == (p)) #define SESSHOLD(s) ((s)->s_count++) #define SESSRELE(s) { \ if (--(s)->s_count == 0) \ FREE(s, M_SESSION); \ } #define STOPEVENT(p, e, v) do { \ if ((p)->p_stops & (e)) { \ PROC_LOCK(p); \ stopevent((p), (e), (v)); \ PROC_UNLOCK(p); \ } \ } while (0) #define _STOPEVENT(p, e, v) do { \ PROC_LOCK_ASSERT(p, MA_OWNED); \ if ((p)->p_stops & (e)) \ stopevent((p), (e), (v)); \ } while (0) /* Lock and unlock a process. */ #define PROC_LOCK(p) mtx_lock(&(p)->p_mtx) #define PROC_TRYLOCK(p) mtx_trylock(&(p)->p_mtx) #define PROC_UNLOCK(p) mtx_unlock(&(p)->p_mtx) #define PROC_LOCKED(p) mtx_owned(&(p)->p_mtx) #define PROC_LOCK_ASSERT(p, type) mtx_assert(&(p)->p_mtx, (type)) /* Lock and unlock a process group. 
*/ #define PGRP_LOCK(pg) mtx_lock(&(pg)->pg_mtx) #define PGRP_UNLOCK(pg) mtx_unlock(&(pg)->pg_mtx) #define PGRP_LOCKED(pg) mtx_owned(&(pg)->pg_mtx) #define PGRP_LOCK_ASSERT(pg, type) mtx_assert(&(pg)->pg_mtx, (type)) #define PGRP_LOCK_PGSIGNAL(pg) do { \ if ((pg) != NULL) \ PGRP_LOCK(pg); \ } while (0) #define PGRP_UNLOCK_PGSIGNAL(pg) do { \ if ((pg) != NULL) \ PGRP_UNLOCK(pg); \ } while (0) /* Lock and unlock a session. */ #define SESS_LOCK(s) mtx_lock(&(s)->s_mtx) #define SESS_UNLOCK(s) mtx_unlock(&(s)->s_mtx) #define SESS_LOCKED(s) mtx_owned(&(s)->s_mtx) #define SESS_LOCK_ASSERT(s, type) mtx_assert(&(s)->s_mtx, (type)) /* Hold process U-area in memory, normally for ptrace/procfs work. */ #define PHOLD(p) do { \ PROC_LOCK(p); \ _PHOLD(p); \ PROC_UNLOCK(p); \ } while (0) #define _PHOLD(p) do { \ PROC_LOCK_ASSERT((p), MA_OWNED); \ (p)->p_lock++; \ if (((p)->p_sflag & PS_INMEM) == 0) \ faultin((p)); \ } while (0) #define PRELE(p) do { \ PROC_LOCK((p)); \ _PRELE((p)); \ PROC_UNLOCK((p)); \ } while (0) #define _PRELE(p) do { \ PROC_LOCK_ASSERT((p), MA_OWNED); \ (--(p)->p_lock); \ } while (0) /* Check whether a thread is safe to be swapped out. */ #define thread_safetoswapout(td) (TD_IS_SLEEPING(td) || TD_IS_SUSPENDED(td)) /* Lock and unlock process arguments. */ #define PARGS_LOCK(p) mtx_lock(&pargs_ref_lock) #define PARGS_UNLOCK(p) mtx_unlock(&pargs_ref_lock) #define PIDHASH(pid) (&pidhashtbl[(pid) & pidhash]) extern LIST_HEAD(pidhashhead, proc) *pidhashtbl; extern u_long pidhash; #define PGRPHASH(pgid) (&pgrphashtbl[(pgid) & pgrphash]) extern LIST_HEAD(pgrphashhead, pgrp) *pgrphashtbl; extern u_long pgrphash; extern struct sx allproc_lock; extern struct sx proctree_lock; extern struct mtx pargs_ref_lock; extern struct mtx ppeers_lock; extern struct proc proc0; /* Process slot for swapper. */ extern struct thread thread0; /* Primary thread in proc0. */ extern struct ksegrp ksegrp0; /* Primary ksegrp in proc0. */ extern struct kse kse0; /* Primary kse in proc0. 
*/ extern struct vmspace vmspace0; /* VM space for proc0. */ extern int hogticks; /* Limit on kernel cpu hogs. */ extern int nprocs, maxproc; /* Current and max number of procs. */ extern int maxprocperuid; /* Max procs per uid. */ extern u_long ps_arg_cache_limit; extern int sched_quantum; /* Scheduling quantum in ticks. */ LIST_HEAD(proclist, proc); TAILQ_HEAD(procqueue, proc); TAILQ_HEAD(threadqueue, thread); extern struct proclist allproc; /* List of all processes. */ extern struct proclist zombproc; /* List of zombie processes. */ extern struct proc *initproc, *pageproc; /* Process slots for init, pager. */ extern struct proc *updateproc; /* Process slot for syncer (sic). */ extern struct uma_zone *proc_zone; extern int lastpid; struct proc *pfind(pid_t); /* Find process by id. */ struct pgrp *pgfind(pid_t); /* Find process group by id. */ struct proc *zpfind(pid_t); /* Find zombie process by id. */ void adjustrunqueue(struct thread *, int newpri); void ast(struct trapframe *framep); struct thread *choosethread(void); int cr_cansignal(struct ucred *cred, struct proc *proc, int signum); int enterpgrp(struct proc *p, pid_t pgid, struct pgrp *pgrp, struct session *sess); int enterthispgrp(struct proc *p, struct pgrp *pgrp); void faultin(struct proc *p); void fixjobc(struct proc *p, struct pgrp *pgrp, int entering); int fork1(struct thread *, int, int, struct proc **); void fork_exit(void (*)(void *, struct trapframe *), void *, struct trapframe *); void fork_return(struct thread *, struct trapframe *); int inferior(struct proc *p); int leavepgrp(struct proc *p); void mi_switch(int flags); /* Flags for mi_switch(). */ #define SW_VOL 0x0001 /* Voluntary switch. */ #define SW_INVOL 0x0002 /* Involuntary switch. 
*/ int p_candebug(struct thread *td, struct proc *p); int p_cansee(struct thread *td, struct proc *p); int p_cansched(struct thread *td, struct proc *p); int p_cansignal(struct thread *td, struct proc *p, int signum); struct pargs *pargs_alloc(int len); void pargs_drop(struct pargs *pa); void pargs_free(struct pargs *pa); void pargs_hold(struct pargs *pa); void procinit(void); void threadinit(void); void proc_linkup(struct proc *p, struct ksegrp *kg, struct kse *ke, struct thread *td); void proc_reparent(struct proc *child, struct proc *newparent); int securelevel_ge(struct ucred *cr, int level); int securelevel_gt(struct ucred *cr, int level); void setrunnable(struct thread *); void setrunqueue(struct thread *); void setsugid(struct proc *p); int sigonstack(size_t sp); void sleepinit(void); void stopevent(struct proc *, u_int, u_int); void cpu_idle(void); extern void (*cpu_idle_hook)(void); /* Hook to machdep CPU idler. */ void cpu_switch(struct thread *old, struct thread *new); void cpu_throw(struct thread *old, struct thread *new) __dead2; void unsleep(struct thread *); void userret(struct thread *, struct trapframe *, u_int); void cpu_exit(struct thread *); void exit1(struct thread *, int) __dead2; void cpu_fork(struct thread *, struct proc *, struct thread *, int); void cpu_set_fork_handler(struct thread *, void (*)(void *), void *); /* New in KSE. 
*/ struct ksegrp *ksegrp_alloc(void); void ksegrp_free(struct ksegrp *kg); void ksegrp_stash(struct ksegrp *kg); struct kse *kse_alloc(void); void kse_free(struct kse *ke); void kse_stash(struct kse *ke); void cpu_set_upcall(struct thread *td, struct thread *td0); void cpu_set_upcall_kse(struct thread *td, struct kse_upcall *ku); void cpu_thread_clean(struct thread *); void cpu_thread_exit(struct thread *); void cpu_thread_setup(struct thread *td); void cpu_thread_siginfo(int sig, u_long code, siginfo_t *si); void cpu_thread_swapin(struct thread *); void cpu_thread_swapout(struct thread *); void kse_reassign(struct kse *ke); void kse_link(struct kse *ke, struct ksegrp *kg); void kse_unlink(struct kse *ke); void ksegrp_link(struct ksegrp *kg, struct proc *p); void ksegrp_unlink(struct ksegrp *kg); void thread_signal_add(struct thread *td, int sig); struct thread *thread_alloc(void); void thread_exit(void) __dead2; int thread_export_context(struct thread *td, int willexit); void thread_free(struct thread *td); void thread_link(struct thread *td, struct ksegrp *kg); int thread_new_tid(void); void thread_reap(void); struct thread *thread_schedule_upcall(struct thread *td, struct kse_upcall *ku); int thread_single(int how); #define SINGLE_NO_EXIT 0 /* values for 'how' */ #define SINGLE_EXIT 1 void thread_single_end(void); void thread_stash(struct thread *td); int thread_suspend_check(int how); void thread_suspend_one(struct thread *td); void thread_unlink(struct thread *td); void thread_unsuspend(struct proc *p); void thread_unsuspend_one(struct thread *td); int thread_userret(struct thread *td, struct trapframe *frame); int thread_upcall_check(struct thread *td); void thread_user_enter(struct proc *p, struct thread *td); void thread_wait(struct proc *p); int thread_statclock(int user); struct kse_upcall *upcall_alloc(void); void upcall_free(struct kse_upcall *ku); void upcall_link(struct kse_upcall *ku, struct ksegrp *kg); void upcall_unlink(struct kse_upcall *ku); 
void upcall_remove(struct thread *td); void upcall_stash(struct kse_upcall *ke); void thread_sanity_check(struct thread *td, char *); void thread_stopped(struct proc *p); void thread_switchout(struct thread *td); void thr_exit1(void); #endif /* _KERNEL */ #endif /* !_SYS_PROC_H_ */ Index: head/sys/sys/sched.h =================================================================== --- head/sys/sys/sched.h (revision 130550) +++ head/sys/sys/sched.h (revision 130551) @@ -1,122 +1,122 @@ /*- * Copyright (c) 2002, Jeffrey Roberson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _SYS_SCHED_H_ #define _SYS_SCHED_H_ /* * General scheduling info. * * sched_load: * Total runnable non-ithread threads in the system. 
* * sched_runnable: * Runnable threads for this processor. */ int sched_load(void); int sched_rr_interval(void); int sched_runnable(void); /* * Proc related scheduling hooks. */ void sched_exit(struct proc *p, struct proc *child); void sched_fork(struct proc *p, struct proc *child); /* * KSE Groups contain scheduling priority information. They record the * behavior of groups of KSEs and threads. */ void sched_class(struct ksegrp *kg, int class); void sched_exit_ksegrp(struct ksegrp *kg, struct ksegrp *child); void sched_fork_ksegrp(struct ksegrp *kg, struct ksegrp *child); -void sched_nice(struct ksegrp *kg, int nice); +void sched_nice(struct proc *p, int nice); /* * Threads are switched in and out, block on resources, have temporary * priorities inherited from their ksegs, and use up cpu time. */ void sched_exit_thread(struct thread *td, struct thread *child); void sched_fork_thread(struct thread *td, struct thread *child); fixpt_t sched_pctcpu(struct thread *td); void sched_prio(struct thread *td, u_char prio); void sched_sleep(struct thread *td); void sched_switch(struct thread *td); void sched_userret(struct thread *td); void sched_wakeup(struct thread *td); /* * Threads are moved on and off of run queues */ void sched_add(struct thread *td); struct kse *sched_choose(void); /* XXX Should be thread * */ void sched_clock(struct thread *td); void sched_rem(struct thread *td); /* * Binding makes cpu affinity permanent while pinning is used to temporarily * hold a thread on a particular CPU. */ void sched_bind(struct thread *td, int cpu); static __inline void sched_pin(void); void sched_unbind(struct thread *td); static __inline void sched_unpin(void); /* * These interfaces will eventually be removed. */ void sched_exit_kse(struct kse *ke, struct kse *child); void sched_fork_kse(struct kse *ke, struct kse *child); /* * These procedures tell the process data structure allocation code how * many bytes to actually allocate. 
*/ int sched_sizeof_kse(void); int sched_sizeof_ksegrp(void); int sched_sizeof_proc(void); int sched_sizeof_thread(void); extern struct ke_sched *kse0_sched; extern struct kg_sched *ksegrp0_sched; extern struct p_sched *proc0_sched; extern struct td_sched *thread0_sched; static __inline void sched_pin(void) { curthread->td_pinned++; } static __inline void sched_unpin(void) { curthread->td_pinned--; } #endif /* !_SYS_SCHED_H_ */ Index: head/sys/ufs/ffs/ffs_snapshot.c =================================================================== --- head/sys/ufs/ffs/ffs_snapshot.c (revision 130550) +++ head/sys/ufs/ffs/ffs_snapshot.c (revision 130551) @@ -1,2120 +1,2120 @@ /* * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved. * * Further information about snapshots can be obtained from: * * Marshall Kirk McKusick http://www.mckusick.com/softdep/ * 1614 Oxford Street mckusick@mckusick.com * Berkeley, CA 94709-1608 +1-510-843-9542 * USA * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_snapshot.c 8.11 (McKusick) 7/23/00 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define KERNCRED thread0.td_ucred #define DEBUG 1 static int cgaccount(int, struct vnode *, struct buf *, int); static int expunge_ufs1(struct vnode *, struct inode *, struct fs *, int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int), int); static int indiracct_ufs1(struct vnode *, struct vnode *, int, ufs1_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *, int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int), int); static int fullacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int); static int snapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int); static int mapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int); static int expunge_ufs2(struct vnode *, struct inode *, struct fs *, int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int), int); static int indiracct_ufs2(struct vnode *, struct vnode *, int, ufs2_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *, int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int), int); static int 
fullacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int); static int snapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int); static int mapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int); static int ffs_copyonwrite(struct vnode *, struct buf *); static int readblock(struct buf *, ufs2_daddr_t); /* * To ensure the consistency of snapshots across crashes, we must * synchronously write out copied blocks before allowing the * originals to be modified. Because of the rather severe speed * penalty that this imposes, the following flag allows this * crash persistence to be disabled. */ int dopersistence = 0; #ifdef DEBUG #include SYSCTL_INT(_debug, OID_AUTO, dopersistence, CTLFLAG_RW, &dopersistence, 0, ""); static int snapdebug = 0; SYSCTL_INT(_debug, OID_AUTO, snapdebug, CTLFLAG_RW, &snapdebug, 0, ""); int collectsnapstats = 0; SYSCTL_INT(_debug, OID_AUTO, collectsnapstats, CTLFLAG_RW, &collectsnapstats, 0, ""); #endif /* DEBUG */ /* * Create a snapshot file and initialize it for the filesystem. */ int ffs_snapshot(mp, snapfile) struct mount *mp; char *snapfile; { ufs2_daddr_t numblks, blkno, *blkp, *snapblklist; int error, cg, snaploc; int i, size, len, loc; int flag = mp->mnt_flag; struct timespec starttime = {0, 0}, endtime; char saved_nice = 0; long redo = 0, snaplistsize = 0; int32_t *lp; void *space; struct fs *copy_fs = NULL, *fs = VFSTOUFS(mp)->um_fs; struct snaphead *snaphead; struct thread *td = curthread; struct inode *ip, *xp; struct buf *bp, *nbp, *ibp, *sbp = NULL; struct nameidata nd; struct mount *wrtmp; struct vattr vat; struct vnode *vp, *xvp, *nvp, *devvp; struct uio auio; struct iovec aiov; /* * Need to serialize access to snapshot code per filesystem. */ /* * Assign a snapshot slot in the superblock. 
*/ for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) if (fs->fs_snapinum[snaploc] == 0) break; if (snaploc == FSMAXSNAP) return (ENOSPC); /* * Create the snapshot file. */ restart: NDINIT(&nd, CREATE, LOCKPARENT | LOCKLEAF, UIO_USERSPACE, snapfile, td); if ((error = namei(&nd)) != 0) return (error); if (nd.ni_vp != NULL) { vput(nd.ni_vp); error = EEXIST; } if (nd.ni_dvp->v_mount != mp) error = EXDEV; if (error) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_dvp == nd.ni_vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); return (error); } VATTR_NULL(&vat); vat.va_type = VREG; vat.va_mode = S_IRUSR; vat.va_vaflags |= VA_EXCLUSIVE; if (VOP_GETWRITEMOUNT(nd.ni_dvp, &wrtmp)) wrtmp = NULL; if (wrtmp != mp) panic("ffs_snapshot: mount mismatch"); if (vn_start_write(NULL, &wrtmp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &wrtmp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VOP_LEASE(nd.ni_dvp, td, KERNCRED, LEASE_WRITE); error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vat); vput(nd.ni_dvp); if (error) { NDFREE(&nd, NDF_ONLY_PNBUF); vn_finished_write(wrtmp); return (error); } vp = nd.ni_vp; ip = VTOI(vp); devvp = ip->i_devvp; /* * Allocate and copy the last block contents so as to be able * to set size to that of the filesystem. */ numblks = howmany(fs->fs_size, fs->fs_frag); error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)), fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp); if (error) goto out; ip->i_size = lblktosize(fs, (off_t)numblks); DIP(ip, i_size) = ip->i_size; ip->i_flag |= IN_CHANGE | IN_UPDATE; if ((error = readblock(bp, numblks - 1)) != 0) goto out; bawrite(bp); /* * Preallocate critical data structures so that we can copy * them in without further allocation after we suspend all * operations on the filesystem. 
We would like to just release * the allocated buffers without writing them since they will * be filled in below once we are ready to go, but this upsets * the soft update code, so we go ahead and write the new buffers. * * Allocate all indirect blocks and mark all of them as not * needing to be copied. */ for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) { error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno), fs->fs_bsize, td->td_ucred, BA_METAONLY, &ibp); if (error) goto out; bawrite(ibp); } /* * Allocate copies for the superblock and its summary information. */ error = UFS_BALLOC(vp, fs->fs_sblockloc, fs->fs_sbsize, KERNCRED, 0, &nbp); if (error) goto out; bawrite(nbp); blkno = fragstoblks(fs, fs->fs_csaddr); len = howmany(fs->fs_cssize, fs->fs_bsize); for (loc = 0; loc < len; loc++) { error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)), fs->fs_bsize, KERNCRED, 0, &nbp); if (error) goto out; bawrite(nbp); } /* * Allocate all cylinder group blocks. */ for (cg = 0; cg < fs->fs_ncg; cg++) { error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)), fs->fs_bsize, KERNCRED, 0, &nbp); if (error) goto out; bawrite(nbp); } /* * Copy all the cylinder group maps. Although the * filesystem is still active, we hope that only a few * cylinder groups will change between now and when we * suspend operations. Thus, we will be able to quickly * touch up the few cylinder groups that changed during * the suspension period. */ len = howmany(fs->fs_ncg, NBBY); MALLOC(fs->fs_active, int *, len, M_DEVBUF, M_WAITOK); bzero(fs->fs_active, len); for (cg = 0; cg < fs->fs_ncg; cg++) { error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)), fs->fs_bsize, KERNCRED, 0, &nbp); if (error) goto out; error = cgaccount(cg, vp, nbp, 1); bawrite(nbp); if (error) goto out; } /* * Change inode to snapshot type file. */ ip->i_flags |= SF_SNAPSHOT; DIP(ip, i_flags) = ip->i_flags; ip->i_flag |= IN_CHANGE | IN_UPDATE; /* * Ensure that the snapshot is completely on disk. 
* Since we have marked it as a snapshot it is safe to * unlock it as no process will be allowed to write to it. */ if ((error = VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td)) != 0) goto out; VOP_UNLOCK(vp, 0, td); /* * All allocations are done, so we can now snapshot the system. * * Recind nice scheduling while running with the filesystem suspended. */ - if (td->td_ksegrp->kg_nice > 0) { + if (td->td_proc->p_nice > 0) { PROC_LOCK(td->td_proc); mtx_lock_spin(&sched_lock); - saved_nice = td->td_ksegrp->kg_nice; - sched_nice(td->td_ksegrp, 0); + saved_nice = td->td_proc->p_nice; + sched_nice(td->td_proc, 0); mtx_unlock_spin(&sched_lock); PROC_UNLOCK(td->td_proc); } /* * Suspend operation on filesystem. */ for (;;) { vn_finished_write(wrtmp); if ((error = vfs_write_suspend(vp->v_mount)) != 0) { vn_start_write(NULL, &wrtmp, V_WAIT); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); goto out; } if (mp->mnt_kern_flag & MNTK_SUSPENDED) break; vn_start_write(NULL, &wrtmp, V_WAIT); } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); if (collectsnapstats) nanotime(&starttime); /* * First, copy all the cylinder group maps that have changed. */ for (cg = 0; cg < fs->fs_ncg; cg++) { if ((ACTIVECGNUM(fs, cg) & ACTIVECGOFF(cg)) != 0) continue; redo++; error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)), fs->fs_bsize, KERNCRED, 0, &nbp); if (error) goto out1; error = cgaccount(cg, vp, nbp, 2); bawrite(nbp); if (error) goto out1; } /* * Grab a copy of the superblock and its summary information. * We delay writing it until the suspension is released below. */ error = bread(vp, lblkno(fs, fs->fs_sblockloc), fs->fs_bsize, KERNCRED, &sbp); if (error) { brelse(sbp); sbp = NULL; goto out1; } loc = blkoff(fs, fs->fs_sblockloc); copy_fs = (struct fs *)(sbp->b_data + loc); bcopy(fs, copy_fs, fs->fs_sbsize); if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0) copy_fs->fs_clean = 1; size = fs->fs_bsize < SBLOCKSIZE ? 
fs->fs_bsize : SBLOCKSIZE; if (fs->fs_sbsize < size) bzero(&sbp->b_data[loc + fs->fs_sbsize], size - fs->fs_sbsize); size = blkroundup(fs, fs->fs_cssize); if (fs->fs_contigsumsize > 0) size += fs->fs_ncg * sizeof(int32_t); space = malloc((u_long)size, M_UFSMNT, M_WAITOK); copy_fs->fs_csp = space; bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize); space = (char *)space + fs->fs_cssize; loc = howmany(fs->fs_cssize, fs->fs_fsize); i = fs->fs_frag - loc % fs->fs_frag; len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize; if (len > 0) { if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc), len, KERNCRED, &bp)) != 0) { brelse(bp); free(copy_fs->fs_csp, M_UFSMNT); bawrite(sbp); sbp = NULL; goto out1; } bcopy(bp->b_data, space, (u_int)len); space = (char *)space + len; bp->b_flags |= B_INVAL | B_NOCACHE; brelse(bp); } if (fs->fs_contigsumsize > 0) { copy_fs->fs_maxcluster = lp = space; for (i = 0; i < fs->fs_ncg; i++) *lp++ = fs->fs_contigsumsize; } /* * We must check for active files that have been unlinked * (e.g., with a zero link count). We have to expunge all * trace of these files from the snapshot so that they are * not reclaimed prematurely by fsck or unnecessarily dumped. * We turn off the MNTK_SUSPENDED flag to avoid a panic from * spec_strategy about writing on a suspended filesystem. * Note that we skip unlinked snapshot files as they will * be handled separately below. * * We also calculate the needed size for the snapshot list. */ snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) + FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */; mp->mnt_kern_flag &= ~MNTK_SUSPENDED; MNT_ILOCK(mp); loop: for (xvp = TAILQ_FIRST(&mp->mnt_nvnodelist); xvp; xvp = nvp) { /* * Make sure this vnode wasn't reclaimed in getnewvnode(). * Start over if it has (it won't be on the list anymore). 
*/ if (xvp->v_mount != mp) goto loop; nvp = TAILQ_NEXT(xvp, v_nmntvnodes); VI_LOCK(xvp); MNT_IUNLOCK(mp); if ((xvp->v_iflag & VI_XLOCK) || xvp->v_usecount == 0 || xvp->v_type == VNON || (VTOI(xvp)->i_flags & SF_SNAPSHOT)) { VI_UNLOCK(xvp); MNT_ILOCK(mp); continue; } if (vn_lock(xvp, LK_EXCLUSIVE | LK_INTERLOCK, td) != 0) { MNT_ILOCK(mp); goto loop; } if (snapdebug) vprint("ffs_snapshot: busy vnode", xvp); if (VOP_GETATTR(xvp, &vat, td->td_ucred, td) == 0 && vat.va_nlink > 0) { VOP_UNLOCK(xvp, 0, td); MNT_ILOCK(mp); continue; } xp = VTOI(xvp); if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) { VOP_UNLOCK(xvp, 0, td); MNT_ILOCK(mp); continue; } /* * If there is a fragment, clear it here. */ blkno = 0; loc = howmany(xp->i_size, fs->fs_bsize) - 1; if (loc < NDADDR) { len = fragroundup(fs, blkoff(fs, xp->i_size)); if (len < fs->fs_bsize) { ffs_blkfree(copy_fs, vp, DIP(xp, i_db[loc]), len, xp->i_number); blkno = DIP(xp, i_db[loc]); DIP(xp, i_db[loc]) = 0; } } snaplistsize += 1; if (xp->i_ump->um_fstype == UFS1) error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1, BLK_NOCOPY); else error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2, BLK_NOCOPY); if (blkno) DIP(xp, i_db[loc]) = blkno; if (!error) error = ffs_freefile(copy_fs, vp, xp->i_number, xp->i_mode); VOP_UNLOCK(xvp, 0, td); if (error) { free(copy_fs->fs_csp, M_UFSMNT); bawrite(sbp); sbp = NULL; goto out1; } MNT_ILOCK(mp); } MNT_IUNLOCK(mp); /* * If there already exist snapshots on this filesystem, grab a * reference to their shared lock. If this is the first snapshot * on this filesystem, we need to allocate a lock for the snapshots * to share. In either case, acquire the snapshot lock and give * up our original private lock. 
*/ VI_LOCK(devvp); snaphead = &devvp->v_rdev->si_snapshots; if ((xp = TAILQ_FIRST(snaphead)) != NULL) { struct lock *lkp; lkp = ITOV(xp)->v_vnlock; VI_UNLOCK(devvp); VI_LOCK(vp); vp->v_vnlock = lkp; } else { struct lock *lkp; VI_UNLOCK(devvp); MALLOC(lkp, struct lock *, sizeof(struct lock), M_UFSMNT, M_WAITOK); lockinit(lkp, PVFS, "snaplk", VLKTIMEOUT, LK_CANRECURSE | LK_NOPAUSE); VI_LOCK(vp); vp->v_vnlock = lkp; } vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY, td); transferlockers(&vp->v_lock, vp->v_vnlock); lockmgr(&vp->v_lock, LK_RELEASE, NULL, td); /* * If this is the first snapshot on this filesystem, then we need * to allocate the space for the list of preallocated snapshot blocks. * This list will be refined below, but this preliminary one will * keep us out of deadlock until the full one is ready. */ if (xp == NULL) { MALLOC(snapblklist, daddr_t *, snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK); blkp = &snapblklist[1]; *blkp++ = lblkno(fs, fs->fs_sblockloc); blkno = fragstoblks(fs, fs->fs_csaddr); for (cg = 0; cg < fs->fs_ncg; cg++) { if (fragstoblks(fs, cgtod(fs, cg) > blkno)) break; *blkp++ = fragstoblks(fs, cgtod(fs, cg)); } len = howmany(fs->fs_cssize, fs->fs_bsize); for (loc = 0; loc < len; loc++) *blkp++ = blkno + loc; for (; cg < fs->fs_ncg; cg++) *blkp++ = fragstoblks(fs, cgtod(fs, cg)); snapblklist[0] = blkp - snapblklist; VI_LOCK(devvp); if (devvp->v_rdev->si_snapblklist != NULL) panic("ffs_snapshot: non-empty list"); devvp->v_rdev->si_snapblklist = snapblklist; devvp->v_rdev->si_snaplistsize = blkp - snapblklist; VI_UNLOCK(devvp); } /* * Record snapshot inode. Since this is the newest snapshot, * it must be placed at the end of the list. 
*/ VI_LOCK(devvp); fs->fs_snapinum[snaploc] = ip->i_number; if (ip->i_nextsnap.tqe_prev != 0) panic("ffs_snapshot: %d already on list", ip->i_number); TAILQ_INSERT_TAIL(snaphead, ip, i_nextsnap); devvp->v_rdev->si_copyonwrite = ffs_copyonwrite; devvp->v_vflag |= VV_COPYONWRITE; VI_UNLOCK(devvp); ASSERT_VOP_LOCKED(vp, "ffs_snapshot vp"); vp->v_vflag |= VV_SYSTEM; out1: /* * Resume operation on filesystem. */ vfs_write_resume(vp->v_mount); vn_start_write(NULL, &wrtmp, V_WAIT); if (collectsnapstats && starttime.tv_sec > 0) { nanotime(&endtime); timespecsub(&endtime, &starttime); printf("%s: suspended %ld.%03ld sec, redo %ld of %d\n", vp->v_mount->mnt_stat.f_mntonname, (long)endtime.tv_sec, endtime.tv_nsec / 1000000, redo, fs->fs_ncg); } if (sbp == NULL) goto out; /* * Copy allocation information from all the snapshots in * this snapshot and then expunge them from its view. */ snaphead = &devvp->v_rdev->si_snapshots; TAILQ_FOREACH(xp, snaphead, i_nextsnap) { if (xp == ip) break; if (xp->i_ump->um_fstype == UFS1) error = expunge_ufs1(vp, xp, fs, snapacct_ufs1, BLK_SNAP); else error = expunge_ufs2(vp, xp, fs, snapacct_ufs2, BLK_SNAP); if (error) { fs->fs_snapinum[snaploc] = 0; goto done; } } /* * Allocate space for the full list of preallocated snapshot blocks. */ MALLOC(snapblklist, daddr_t *, snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK); ip->i_snapblklist = &snapblklist[1]; /* * Expunge the blocks used by the snapshots from the set of * blocks marked as used in the snapshot bitmaps. Also, collect * the list of allocated blocks in i_snapblklist. 
*/ if (ip->i_ump->um_fstype == UFS1) error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1, BLK_SNAP); else error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2, BLK_SNAP); if (error) { fs->fs_snapinum[snaploc] = 0; FREE(snapblklist, M_UFSMNT); goto done; } if (snaplistsize < ip->i_snapblklist - snapblklist) panic("ffs_snapshot: list too small"); snaplistsize = ip->i_snapblklist - snapblklist; snapblklist[0] = snaplistsize; ip->i_snapblklist = 0; /* * Write out the list of allocated blocks to the end of the snapshot. */ auio.uio_iov = &aiov; auio.uio_iovcnt = 1; aiov.iov_base = (void *)snapblklist; aiov.iov_len = snaplistsize * sizeof(daddr_t); auio.uio_resid = aiov.iov_len;; auio.uio_offset = ip->i_size; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_WRITE; auio.uio_td = td; if ((error = VOP_WRITE(vp, &auio, IO_UNIT, td->td_ucred)) != 0) { fs->fs_snapinum[snaploc] = 0; FREE(snapblklist, M_UFSMNT); goto done; } /* * Write the superblock and its summary information * to the snapshot. */ blkno = fragstoblks(fs, fs->fs_csaddr); len = howmany(fs->fs_cssize, fs->fs_bsize); space = copy_fs->fs_csp; for (loc = 0; loc < len; loc++) { error = bread(vp, blkno + loc, fs->fs_bsize, KERNCRED, &nbp); if (error) { brelse(nbp); fs->fs_snapinum[snaploc] = 0; FREE(snapblklist, M_UFSMNT); goto done; } bcopy(space, nbp->b_data, fs->fs_bsize); space = (char *)space + fs->fs_bsize; bawrite(nbp); } /* * As this is the newest list, it is the most inclusive, so * should replace the previous list. 
*/ VI_LOCK(devvp); space = devvp->v_rdev->si_snapblklist; devvp->v_rdev->si_snapblklist = snapblklist; devvp->v_rdev->si_snaplistsize = snaplistsize; VI_UNLOCK(devvp); if (space != NULL) FREE(space, M_UFSMNT); done: free(copy_fs->fs_csp, M_UFSMNT); bawrite(sbp); out: if (saved_nice > 0) { PROC_LOCK(td->td_proc); mtx_lock_spin(&sched_lock); - sched_nice(td->td_ksegrp, saved_nice); + sched_nice(td->td_proc, saved_nice); mtx_unlock_spin(&sched_lock); PROC_UNLOCK(td->td_proc); } if (fs->fs_active != 0) { FREE(fs->fs_active, M_DEVBUF); fs->fs_active = 0; } mp->mnt_flag = flag; if (error) (void) UFS_TRUNCATE(vp, (off_t)0, 0, NOCRED, td); (void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td); if (error) vput(vp); else VOP_UNLOCK(vp, 0, td); vn_finished_write(wrtmp); return (error); } /* * Copy a cylinder group map. All the unallocated blocks are marked * BLK_NOCOPY so that the snapshot knows that it need not copy them * if they are later written. If passno is one, then this is a first * pass, so only setting needs to be done. If passno is 2, then this * is a revision to a previous pass which must be undone as the * replacement pass is done. 
*/ static int cgaccount(cg, vp, nbp, passno) int cg; struct vnode *vp; struct buf *nbp; int passno; { struct buf *bp, *ibp; struct inode *ip; struct cg *cgp; struct fs *fs; ufs2_daddr_t base, numblks; int error, len, loc, indiroff; ip = VTOI(vp); fs = ip->i_fs; error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), (int)fs->fs_cgsize, KERNCRED, &bp); if (error) { brelse(bp); return (error); } cgp = (struct cg *)bp->b_data; if (!cg_chkmagic(cgp)) { brelse(bp); return (EIO); } atomic_set_int(&ACTIVECGNUM(fs, cg), ACTIVECGOFF(cg)); bcopy(bp->b_data, nbp->b_data, fs->fs_cgsize); if (fs->fs_cgsize < fs->fs_bsize) bzero(&nbp->b_data[fs->fs_cgsize], fs->fs_bsize - fs->fs_cgsize); if (passno == 2) nbp->b_flags |= B_VALIDSUSPWRT; numblks = howmany(fs->fs_size, fs->fs_frag); len = howmany(fs->fs_fpg, fs->fs_frag); base = cg * fs->fs_fpg / fs->fs_frag; if (base + len >= numblks) len = numblks - base - 1; loc = 0; if (base < NDADDR) { for ( ; loc < NDADDR; loc++) { if (ffs_isblock(fs, cg_blksfree(cgp), loc)) DIP(ip, i_db[loc]) = BLK_NOCOPY; else if (passno == 2 && DIP(ip, i_db[loc])== BLK_NOCOPY) DIP(ip, i_db[loc]) = 0; else if (passno == 1 && DIP(ip, i_db[loc])== BLK_NOCOPY) panic("ffs_snapshot: lost direct block"); } } error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); if (error) { brelse(bp); return (error); } indiroff = (base + loc - NDADDR) % NINDIR(fs); for ( ; loc < len; loc++, indiroff++) { if (indiroff >= NINDIR(fs)) { if (passno == 2) ibp->b_flags |= B_VALIDSUSPWRT; bawrite(ibp); error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); if (error) { brelse(bp); return (error); } indiroff = 0; } if (ip->i_ump->um_fstype == UFS1) { if (ffs_isblock(fs, cg_blksfree(cgp), loc)) ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY; else if (passno == 2 && ((ufs1_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY) ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = 0; else if (passno 
== 1 && ((ufs1_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY) panic("ffs_snapshot: lost indirect block"); continue; } if (ffs_isblock(fs, cg_blksfree(cgp), loc)) ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY; else if (passno == 2 && ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY) ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = 0; else if (passno == 1 && ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY) panic("ffs_snapshot: lost indirect block"); } bqrelse(bp); if (passno == 2) ibp->b_flags |= B_VALIDSUSPWRT; bdwrite(ibp); return (0); } /* * Before expunging a snapshot inode, note all the * blocks that it claims with BLK_SNAP so that fsck will * be able to account for those blocks properly and so * that this snapshot knows that it need not copy them * if the other snapshot holding them is freed. This code * is reproduced once each for UFS1 and UFS2. */ static int expunge_ufs1(snapvp, cancelip, fs, acctfunc, expungetype) struct vnode *snapvp; struct inode *cancelip; struct fs *fs; int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int); int expungetype; { int i, error, indiroff; ufs_lbn_t lbn, rlbn; ufs2_daddr_t len, blkno, numblks, blksperindir; struct ufs1_dinode *dip; struct thread *td = curthread; struct buf *bp; /* * Prepare to expunge the inode. If its inode block has not * yet been copied, then allocate and fill the copy. 
*/ lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number)); blkno = 0; if (lbn < NDADDR) { blkno = VTOI(snapvp)->i_din1->di_db[lbn]; } else { td->td_pflags |= TDP_COWINPROGRESS; error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, BA_METAONLY, &bp); td->td_pflags &= ~TDP_COWINPROGRESS; if (error) return (error); indiroff = (lbn - NDADDR) % NINDIR(fs); blkno = ((ufs1_daddr_t *)(bp->b_data))[indiroff]; bqrelse(bp); } if (blkno != 0) { if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp))) return (error); } else { error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, 0, &bp); if (error) return (error); if ((error = readblock(bp, lbn)) != 0) return (error); } /* * Set a snapshot inode to be a zero length file, regular files * to be completely unallocated. */ dip = (struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, cancelip->i_number); if (expungetype == BLK_NOCOPY) dip->di_mode = 0; dip->di_size = 0; dip->di_blocks = 0; dip->di_flags &= ~SF_SNAPSHOT; bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs1_daddr_t)); bdwrite(bp); /* * Now go through and expunge all the blocks in the file * using the function requested. 
*/ numblks = howmany(cancelip->i_size, fs->fs_bsize); if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_db[0], &cancelip->i_din1->di_db[NDADDR], fs, 0, expungetype))) return (error); if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_ib[0], &cancelip->i_din1->di_ib[NIADDR], fs, -1, expungetype))) return (error); blksperindir = 1; lbn = -NDADDR; len = numblks - NDADDR; rlbn = NDADDR; for (i = 0; len > 0 && i < NIADDR; i++) { error = indiracct_ufs1(snapvp, ITOV(cancelip), i, cancelip->i_din1->di_ib[i], lbn, rlbn, len, blksperindir, fs, acctfunc, expungetype); if (error) return (error); blksperindir *= NINDIR(fs); lbn -= blksperindir + 1; len -= blksperindir; rlbn += blksperindir; } return (0); } /* * Descend an indirect block chain for vnode cancelvp accounting for all * its indirect blocks in snapvp. */ static int indiracct_ufs1(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks, blksperindir, fs, acctfunc, expungetype) struct vnode *snapvp; struct vnode *cancelvp; int level; ufs1_daddr_t blkno; ufs_lbn_t lbn; ufs_lbn_t rlbn; ufs_lbn_t remblks; ufs_lbn_t blksperindir; struct fs *fs; int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int); int expungetype; { int error, num, i; ufs_lbn_t subblksperindir; struct indir indirs[NIADDR + 2]; ufs1_daddr_t last, *bap; struct buf *bp; if (blkno == 0) { if (expungetype == BLK_NOCOPY) return (0); panic("indiracct_ufs1: missing indir"); } if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0) return (error); if (lbn != indirs[num - 1 - level].in_lbn || num < 2) panic("indiracct_ufs1: botched params"); /* * We have to expand bread here since it will deadlock looking * up the block number for any blocks that are not in the cache. 
*/ bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0); bp->b_blkno = fsbtodb(fs, blkno); if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 && (error = readblock(bp, fragstoblks(fs, blkno)))) { brelse(bp); return (error); } /* * Account for the block pointers in this indirect block. */ last = howmany(remblks, blksperindir); if (last > NINDIR(fs)) last = NINDIR(fs); MALLOC(bap, ufs1_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK); bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize); bqrelse(bp); error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs, level == 0 ? rlbn : -1, expungetype); if (error || level == 0) goto out; /* * Account for the block pointers in each of the indirect blocks * in the levels below us. */ subblksperindir = blksperindir / NINDIR(fs); for (lbn++, level--, i = 0; i < last; i++) { error = indiracct_ufs1(snapvp, cancelvp, level, bap[i], lbn, rlbn, remblks, subblksperindir, fs, acctfunc, expungetype); if (error) goto out; rlbn += blksperindir; lbn -= blksperindir; remblks -= blksperindir; } out: FREE(bap, M_DEVBUF); return (error); } /* * Do both snap accounting and map accounting. */ static int fullacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype) struct vnode *vp; ufs1_daddr_t *oldblkp, *lastblkp; struct fs *fs; ufs_lbn_t lblkno; int exptype; /* BLK_SNAP or BLK_NOCOPY */ { int error; if ((error = snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype))) return (error); return (mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)); } /* * Identify a set of blocks allocated in a snapshot inode. 
*/
/*
 * For every usable pointer in [oldblkp, lastblkp) the block address is
 * converted to a logical block number of the snapshot vp and the
 * snapshot's own pointer for that logical block is set to expungetype.
 * Pointers that are 0, BLK_NOCOPY or BLK_SNAP are skipped.  The lblkno
 * argument is accepted for signature compatibility but is not used here.
 *
 * Returns 0, or the error from UFS_BALLOC() if an indirect block of the
 * snapshot could not be obtained.
 */
static int
snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;
	ufs1_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_lbn_t lblkno;
	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
{
	struct inode *snapip;
	struct buf *indbp;
	ufs1_daddr_t *slot, cur;
	ufs_lbn_t snaplbn;
	int error, indirect;

	snapip = VTOI(vp);
	while (oldblkp < lastblkp) {
		cur = *oldblkp++;
		if (cur == 0 || cur == BLK_NOCOPY || cur == BLK_SNAP)
			continue;

		/* Locate the snapshot's pointer slot for this block. */
		snaplbn = fragstoblks(fs, cur);
		indirect = (snaplbn >= NDADDR);
		if (indirect) {
			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)snaplbn),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &indbp);
			if (error)
				return (error);
			slot = &((ufs1_daddr_t *)(indbp->b_data))
			    [(snaplbn - NDADDR) % NINDIR(fs)];
		} else {
			slot = &snapip->i_din1->di_db[snaplbn];
			snapip->i_flag |= IN_CHANGE | IN_UPDATE;
		}

		/*
		 * If we are expunging a snapshot vnode and we
		 * find a block marked BLK_NOCOPY, then it is
		 * one that has been allocated to this snapshot after
		 * we took our current snapshot and can be ignored.
		 */
		if (expungetype == BLK_SNAP && *slot == BLK_NOCOPY) {
			if (indirect)
				brelse(indbp);
			continue;
		}

		if (*slot != 0)
			panic("snapacct_ufs1: bad block");
		*slot = expungetype;
		if (indirect)
			bdwrite(indbp);
	}
	return (0);
}

/*
 * Account for a set of blocks allocated in a snapshot inode.
*/ static int mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype) struct vnode *vp; ufs1_daddr_t *oldblkp, *lastblkp; struct fs *fs; ufs_lbn_t lblkno; int expungetype; { ufs1_daddr_t blkno; struct inode *ip; ino_t inum; int acctit; ip = VTOI(vp); inum = ip->i_number; if (lblkno == -1) acctit = 0; else acctit = 1; for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) { blkno = *oldblkp; if (blkno == 0 || blkno == BLK_NOCOPY) continue; if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP) *ip->i_snapblklist++ = lblkno; if (blkno == BLK_SNAP) blkno = blkstofrags(fs, lblkno); ffs_blkfree(fs, vp, blkno, fs->fs_bsize, inum); } return (0); } /* * Before expunging a snapshot inode, note all the * blocks that it claims with BLK_SNAP so that fsck will * be able to account for those blocks properly and so * that this snapshot knows that it need not copy them * if the other snapshot holding them is freed. This code * is reproduced once each for UFS1 and UFS2. */ static int expunge_ufs2(snapvp, cancelip, fs, acctfunc, expungetype) struct vnode *snapvp; struct inode *cancelip; struct fs *fs; int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int); int expungetype; { int i, error, indiroff; ufs_lbn_t lbn, rlbn; ufs2_daddr_t len, blkno, numblks, blksperindir; struct ufs2_dinode *dip; struct thread *td = curthread; struct buf *bp; /* * Prepare to expunge the inode. If its inode block has not * yet been copied, then allocate and fill the copy. 
*/ lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number)); blkno = 0; if (lbn < NDADDR) { blkno = VTOI(snapvp)->i_din2->di_db[lbn]; } else { td->td_pflags |= TDP_COWINPROGRESS; error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, BA_METAONLY, &bp); td->td_pflags &= ~TDP_COWINPROGRESS; if (error) return (error); indiroff = (lbn - NDADDR) % NINDIR(fs); blkno = ((ufs2_daddr_t *)(bp->b_data))[indiroff]; bqrelse(bp); } if (blkno != 0) { if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp))) return (error); } else { error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, 0, &bp); if (error) return (error); if ((error = readblock(bp, lbn)) != 0) return (error); } /* * Set a snapshot inode to be a zero length file, regular files * to be completely unallocated. */ dip = (struct ufs2_dinode *)bp->b_data + ino_to_fsbo(fs, cancelip->i_number); if (expungetype == BLK_NOCOPY) dip->di_mode = 0; dip->di_size = 0; dip->di_blocks = 0; dip->di_flags &= ~SF_SNAPSHOT; bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs2_daddr_t)); bdwrite(bp); /* * Now go through and expunge all the blocks in the file * using the function requested. 
*/ numblks = howmany(cancelip->i_size, fs->fs_bsize); if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_db[0], &cancelip->i_din2->di_db[NDADDR], fs, 0, expungetype))) return (error); if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_ib[0], &cancelip->i_din2->di_ib[NIADDR], fs, -1, expungetype))) return (error); blksperindir = 1; lbn = -NDADDR; len = numblks - NDADDR; rlbn = NDADDR; for (i = 0; len > 0 && i < NIADDR; i++) { error = indiracct_ufs2(snapvp, ITOV(cancelip), i, cancelip->i_din2->di_ib[i], lbn, rlbn, len, blksperindir, fs, acctfunc, expungetype); if (error) return (error); blksperindir *= NINDIR(fs); lbn -= blksperindir + 1; len -= blksperindir; rlbn += blksperindir; } return (0); } /* * Descend an indirect block chain for vnode cancelvp accounting for all * its indirect blocks in snapvp. */ static int indiracct_ufs2(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks, blksperindir, fs, acctfunc, expungetype) struct vnode *snapvp; struct vnode *cancelvp; int level; ufs2_daddr_t blkno; ufs_lbn_t lbn; ufs_lbn_t rlbn; ufs_lbn_t remblks; ufs_lbn_t blksperindir; struct fs *fs; int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int); int expungetype; { int error, num, i; ufs_lbn_t subblksperindir; struct indir indirs[NIADDR + 2]; ufs2_daddr_t last, *bap; struct buf *bp; if (blkno == 0) { if (expungetype == BLK_NOCOPY) return (0); panic("indiracct_ufs2: missing indir"); } if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0) return (error); if (lbn != indirs[num - 1 - level].in_lbn || num < 2) panic("indiracct_ufs2: botched params"); /* * We have to expand bread here since it will deadlock looking * up the block number for any blocks that are not in the cache. 
*/ bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0); bp->b_blkno = fsbtodb(fs, blkno); if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 && (error = readblock(bp, fragstoblks(fs, blkno)))) { brelse(bp); return (error); } /* * Account for the block pointers in this indirect block. */ last = howmany(remblks, blksperindir); if (last > NINDIR(fs)) last = NINDIR(fs); MALLOC(bap, ufs2_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK); bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize); bqrelse(bp); error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs, level == 0 ? rlbn : -1, expungetype); if (error || level == 0) goto out; /* * Account for the block pointers in each of the indirect blocks * in the levels below us. */ subblksperindir = blksperindir / NINDIR(fs); for (lbn++, level--, i = 0; i < last; i++) { error = indiracct_ufs2(snapvp, cancelvp, level, bap[i], lbn, rlbn, remblks, subblksperindir, fs, acctfunc, expungetype); if (error) goto out; rlbn += blksperindir; lbn -= blksperindir; remblks -= blksperindir; } out: FREE(bap, M_DEVBUF); return (error); } /* * Do both snap accounting and map accounting. */ static int fullacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype) struct vnode *vp; ufs2_daddr_t *oldblkp, *lastblkp; struct fs *fs; ufs_lbn_t lblkno; int exptype; /* BLK_SNAP or BLK_NOCOPY */ { int error; if ((error = snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype))) return (error); return (mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)); } /* * Identify a set of blocks allocated in a snapshot inode. 
*/ static int snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype) struct vnode *vp; ufs2_daddr_t *oldblkp, *lastblkp; struct fs *fs; ufs_lbn_t lblkno; int expungetype; /* BLK_SNAP or BLK_NOCOPY */ { struct inode *ip = VTOI(vp); ufs2_daddr_t blkno, *blkp; ufs_lbn_t lbn; struct buf *ibp; int error; for ( ; oldblkp < lastblkp; oldblkp++) { blkno = *oldblkp; if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP) continue; lbn = fragstoblks(fs, blkno); if (lbn < NDADDR) { blkp = &ip->i_din2->di_db[lbn]; ip->i_flag |= IN_CHANGE | IN_UPDATE; } else { error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); if (error) return (error); blkp = &((ufs2_daddr_t *)(ibp->b_data)) [(lbn - NDADDR) % NINDIR(fs)]; } /* * If we are expunging a snapshot vnode and we * find a block marked BLK_NOCOPY, then it is * one that has been allocated to this snapshot after * we took our current snapshot and can be ignored. */ if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) { if (lbn >= NDADDR) brelse(ibp); } else { if (*blkp != 0) panic("snapacct_ufs2: bad block"); *blkp = expungetype; if (lbn >= NDADDR) bdwrite(ibp); } } return (0); } /* * Account for a set of blocks allocated in a snapshot inode. */ static int mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype) struct vnode *vp; ufs2_daddr_t *oldblkp, *lastblkp; struct fs *fs; ufs_lbn_t lblkno; int expungetype; { ufs2_daddr_t blkno; struct inode *ip; ino_t inum; int acctit; ip = VTOI(vp); inum = ip->i_number; if (lblkno == -1) acctit = 0; else acctit = 1; for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) { blkno = *oldblkp; if (blkno == 0 || blkno == BLK_NOCOPY) continue; if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP) *ip->i_snapblklist++ = lblkno; if (blkno == BLK_SNAP) blkno = blkstofrags(fs, lblkno); ffs_blkfree(fs, vp, blkno, fs->fs_bsize, inum); } return (0); } /* * Decrement extra reference on snapshot when last name is removed. 
* It will not be freed until the last open reference goes away.
 *
 * The vnode reference is dropped only if ip is found on the device's
 * in-core snapshot list; in either case ip's inode number is removed
 * from the superblock's fs_snapinum[] array.
 */
void
ffs_snapgone(ip)
	struct inode *ip;
{
	struct inode *xp;
	struct fs *fs;
	int snaploc;

	/*
	 * Find snapshot in incore list.
	 */
	TAILQ_FOREACH(xp, &ip->i_devvp->v_rdev->si_snapshots, i_nextsnap)
		if (xp == ip)
			break;
	if (xp != NULL)
		vrele(ITOV(ip));
	else if (snapdebug)
		printf("ffs_snapgone: lost snapshot vnode %d\n",
		    ip->i_number);
	/*
	 * Delete snapshot inode from superblock. Keep list dense.
	 */
	fs = ip->i_fs;
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == ip->i_number)
			break;
	if (snaploc < FSMAXSNAP) {
		/* Shift the following entries down by one slot. */
		for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
			if (fs->fs_snapinum[snaploc] == 0)
				break;
			fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
		}
		fs->fs_snapinum[snaploc - 1] = 0;
	}
}

/*
 * Prepare a snapshot file for being removed.
 *
 * The snapshot is unlinked from the device's in-core list (tearing down
 * the shared snapshot lock and copy-on-write hook if it was the last
 * one), its BLK_NOCOPY/BLK_SNAP markers are cleared, claimed blocks are
 * offered to other snapshots through ffs_snapblkfree(), and finally the
 * SF_SNAPSHOT flag is cleared.
 */
void
ffs_snapremove(vp)
	struct vnode *vp;
{
	struct inode *ip;
	struct vnode *devvp;
	struct lock *lkp;
	struct buf *ibp;
	struct fs *fs;
	struct thread *td = curthread;
	ufs2_daddr_t numblks, blkno, dblk, *snapblklist;
	int error, loc, last;

	ip = VTOI(vp);
	fs = ip->i_fs;
	devvp = ip->i_devvp;
	/*
	 * If active, delete from incore list (this snapshot may
	 * already have been in the process of being deleted, so
	 * would not have been active).
	 *
	 * Clear copy-on-write flag if last snapshot.
	 */
	if (ip->i_nextsnap.tqe_prev != 0) {
		VI_LOCK(devvp);
		/* LK_INTERLOCK hands the devvp interlock to lockmgr. */
		lockmgr(&vp->v_lock, LK_INTERLOCK | LK_EXCLUSIVE,
		    VI_MTX(devvp), td);
		VI_LOCK(devvp);
		TAILQ_REMOVE(&devvp->v_rdev->si_snapshots, ip, i_nextsnap);
		ip->i_nextsnap.tqe_prev = 0;
		/* Switch the vnode back to its private lock. */
		lkp = vp->v_vnlock;
		vp->v_vnlock = &vp->v_lock;
		lockmgr(lkp, LK_RELEASE, NULL, td);
		if (TAILQ_FIRST(&devvp->v_rdev->si_snapshots) != 0) {
			VI_UNLOCK(devvp);
		} else {
			/* Last snapshot: dismantle the shared state. */
			snapblklist = devvp->v_rdev->si_snapblklist;
			devvp->v_rdev->si_snapblklist = 0;
			devvp->v_rdev->si_snaplistsize = 0;
			devvp->v_rdev->si_copyonwrite = 0;
			devvp->v_vflag &= ~VV_COPYONWRITE;
			lockmgr(lkp, LK_DRAIN|LK_INTERLOCK, VI_MTX(devvp), td);
			lockmgr(lkp, LK_RELEASE, NULL, td);
			lockdestroy(lkp);
			FREE(lkp, M_UFSMNT);
			FREE(snapblklist, M_UFSMNT);
		}
	}
	/*
	 * Clear all BLK_NOCOPY fields. Pass any block claims to other
	 * snapshots that want them (see ffs_snapblkfree below).
	 *
	 * NOTE(review): the direct-block scan starts at index 1, so
	 * i_db[0] is never examined here; confirm this is intentional.
	 */
	for (blkno = 1; blkno < NDADDR; blkno++) {
		dblk = DIP(ip, i_db[blkno]);
		if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
			DIP(ip, i_db[blkno]) = 0;
		else if ((dblk == blkstofrags(fs, blkno) &&
		     ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize,
		     ip->i_number))) {
			DIP(ip, i_blocks) -= btodb(fs->fs_bsize);
			DIP(ip, i_db[blkno]) = 0;
		}
	}
	numblks = howmany(ip->i_size, fs->fs_bsize);
	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
		if (error)
			continue;
		if (fs->fs_size - blkno > NINDIR(fs))
			last = NINDIR(fs);
		else
			last = fs->fs_size - blkno;
		/*
		 * NOTE(review): inside this loop the "claimed block" test
		 * compares against blkstofrags(fs, blkno), i.e. the first
		 * block covered by this indirect block, not blkno + loc;
		 * confirm against the intended semantics.
		 */
		for (loc = 0; loc < last; loc++) {
			if (ip->i_ump->um_fstype == UFS1) {
				dblk = ((ufs1_daddr_t *)(ibp->b_data))[loc];
				if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
					((ufs1_daddr_t *)(ibp->b_data))[loc]= 0;
				else if ((dblk == blkstofrags(fs, blkno) &&
				     ffs_snapblkfree(fs, ip->i_devvp, dblk,
				     fs->fs_bsize, ip->i_number))) {
					ip->i_din1->di_blocks -=
					    btodb(fs->fs_bsize);
					((ufs1_daddr_t *)(ibp->b_data))[loc]= 0;
				}
				continue;
			}
			dblk = ((ufs2_daddr_t *)(ibp->b_data))[loc];
			if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
				((ufs2_daddr_t *)(ibp->b_data))[loc] = 0;
			else if ((dblk == blkstofrags(fs, blkno) &&
			     ffs_snapblkfree(fs, ip->i_devvp, dblk,
			     fs->fs_bsize, ip->i_number))) {
				ip->i_din2->di_blocks -= btodb(fs->fs_bsize);
				((ufs2_daddr_t *)(ibp->b_data))[loc] = 0;
			}
		}
		bawrite(ibp);
	}
	/*
	 * Clear snapshot flag and drop reference.
	 */
	ip->i_flags &= ~SF_SNAPSHOT;
	DIP(ip, i_flags) = ip->i_flags;
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
}

/*
 * Notification that a block is being freed. Return zero if the free
 * should be allowed to proceed. Return non-zero if the snapshot file
 * wants to claim the block. The block will be claimed if it is an
 * uncopied part of one of the snapshots. It will be freed if it is
 * either a BLK_NOCOPY or has already been copied in all of the snapshots.
 * If a fragment is being freed, then all snapshots that care about
 * it must make a copy since a snapshot file can only claim full sized
 * blocks. Note that if more than one snapshot file maps the block,
 * we can pick one at random to claim it. Since none of the snapshots
 * can change, we are assured that they will all see the same unmodified
 * image. When deleting a snapshot file (see ffs_snapremove above), we
 * must push any of these claimed blocks to one of the other snapshots
 * that maps it. These claimed blocks are easily identified as they will
 * have a block number equal to their logical block number within the
 * snapshot. A copied block can never have this property because they
 * must always have been allocated from a BLK_NOCOPY location.
*/
/*
 * Arguments: fs is the filesystem, devvp the device vnode whose snapshot
 * list is walked, bno the address of the block or fragment being freed,
 * size its length in bytes, and inum the owning inode number (used here
 * only in DEBUG output).
 *
 * Returns 1 as soon as a snapshot has claimed a full-sized block;
 * otherwise returns `error', which is 0 unless UFS_BALLOC() or
 * readblock() failed.
 */
int
ffs_snapblkfree(fs, devvp, bno, size, inum)
	struct fs *fs;
	struct vnode *devvp;
	ufs2_daddr_t bno;
	long size;
	ino_t inum;
{
	struct buf *ibp, *cbp, *savedcbp = 0;
	struct thread *td = curthread;
	struct inode *ip;
	struct vnode *vp = NULL;
	ufs_lbn_t lbn;
	ufs2_daddr_t blkno;
	int indiroff = 0, snapshot_locked = 0, error = 0, claimedblk = 0;
	struct snaphead *snaphead;

	lbn = fragstoblks(fs, bno);
retry:
	/* Every restart re-takes the interlock and rescans the list. */
	VI_LOCK(devvp);
	snaphead = &devvp->v_rdev->si_snapshots;
	TAILQ_FOREACH(ip, snaphead, i_nextsnap) {
		vp = ITOV(ip);
		/*
		 * Lookup block being written.
		 */
		if (lbn < NDADDR) {
			blkno = DIP(ip, i_db[lbn]);
		} else {
			/*
			 * The indirect lookup needs the snapshot lock;
			 * LK_INTERLOCK releases the devvp interlock.
			 */
			if (snapshot_locked == 0 &&
			    lockmgr(vp->v_vnlock,
			      LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
			      VI_MTX(devvp), td) != 0)
				goto retry;
			snapshot_locked = 1;
			td->td_pflags |= TDP_COWINPROGRESS;
			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			td->td_pflags &= ~TDP_COWINPROGRESS;
			if (error)
				break;
			indiroff = (lbn - NDADDR) % NINDIR(fs);
			if (ip->i_ump->um_fstype == UFS1)
				blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff];
			else
				blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff];
		}
		/*
		 * Check to see if block needs to be copied.
		 */
		if (blkno == 0) {
			/*
			 * A block that we map is being freed. If it has not
			 * been claimed yet, we will claim or copy it (below).
			 */
			claimedblk = 1;
		} else if (blkno == BLK_SNAP) {
			/*
			 * No previous snapshot claimed the block,
			 * so it will be freed and become a BLK_NOCOPY
			 * (don't care) for us.
			 */
			if (claimedblk)
				panic("snapblkfree: inconsistent block type");
			if (snapshot_locked == 0 &&
			    lockmgr(vp->v_vnlock,
			      LK_INTERLOCK | LK_EXCLUSIVE | LK_NOWAIT,
			      VI_MTX(devvp), td) != 0) {
				/* Could not lock without sleeping: start over. */
				if (lbn >= NDADDR)
					bqrelse(ibp);
				vn_lock(vp, LK_EXCLUSIVE | LK_SLEEPFAIL, td);
				goto retry;
			}
			snapshot_locked = 1;
			if (lbn < NDADDR) {
				DIP(ip, i_db[lbn]) = BLK_NOCOPY;
				ip->i_flag |= IN_CHANGE | IN_UPDATE;
			} else if (ip->i_ump->um_fstype == UFS1) {
				((ufs1_daddr_t *)(ibp->b_data))[indiroff] =
				    BLK_NOCOPY;
				bdwrite(ibp);
			} else {
				((ufs2_daddr_t *)(ibp->b_data))[indiroff] =
				    BLK_NOCOPY;
				bdwrite(ibp);
			}
			continue;
		} else /* BLK_NOCOPY or default */ {
			/*
			 * If the snapshot has already copied the block
			 * (default), or does not care about the block,
			 * it is not needed.
			 */
			if (lbn >= NDADDR)
				bqrelse(ibp);
			continue;
		}
		/*
		 * If this is a full size block, we will just grab it
		 * and assign it to the snapshot inode. Otherwise we
		 * will proceed to copy it. See explanation for this
		 * routine as to why only a single snapshot needs to
		 * claim this block.
		 */
		if (snapshot_locked == 0 &&
		    lockmgr(vp->v_vnlock,
		      LK_INTERLOCK | LK_EXCLUSIVE | LK_NOWAIT,
		      VI_MTX(devvp), td) != 0) {
			if (lbn >= NDADDR)
				bqrelse(ibp);
			vn_lock(vp, LK_EXCLUSIVE | LK_SLEEPFAIL, td);
			goto retry;
		}
		snapshot_locked = 1;
		if (size == fs->fs_bsize) {
#ifdef DEBUG
			if (snapdebug)
				printf("%s %d lbn %jd from inum %d\n",
				    "Grabonremove: snapino", ip->i_number,
				    (intmax_t)lbn, inum);
#endif
			if (lbn < NDADDR) {
				DIP(ip, i_db[lbn]) = bno;
			} else if (ip->i_ump->um_fstype == UFS1) {
				((ufs1_daddr_t *)(ibp->b_data))[indiroff] = bno;
				bdwrite(ibp);
			} else {
				((ufs2_daddr_t *)(ibp->b_data))[indiroff] = bno;
				bdwrite(ibp);
			}
			DIP(ip, i_blocks) += btodb(size);
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
			VOP_UNLOCK(vp, 0, td);
			/* Claimed: tell the caller not to free the block. */
			return (1);
		}
		if (lbn >= NDADDR)
			bqrelse(ibp);
		/*
		 * Allocate the block into which to do the copy. Note that this
		 * allocation will never require any additional allocations for
		 * the snapshot inode.
		 */
		td->td_pflags |= TDP_COWINPROGRESS;
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &cbp);
		td->td_pflags &= ~TDP_COWINPROGRESS;
		if (error)
			break;
#ifdef DEBUG
		if (snapdebug)
			printf("%s%d lbn %jd %s %d size %ld to blkno %jd\n",
			    "Copyonremove: snapino ", ip->i_number,
			    (intmax_t)lbn, "for inum", inum, size,
			    (intmax_t)cbp->b_blkno);
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block. Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity.
		 */
		if (savedcbp != 0) {
			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			continue;
		}
		/*
		 * Otherwise, read the old block contents into the buffer.
		 */
		if ((error = readblock(cbp, lbn)) != 0) {
			/* Do not leave stale data in the copy block. */
			bzero(cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			break;
		}
		/* Keep the first copy around as the source for later ones. */
		savedcbp = cbp;
	}
	/*
	 * Note that we need to synchronously write snapshots that
	 * have not been unlinked, and hence will be visible after
	 * a crash, to ensure their integrity.
	 */
	if (savedcbp) {
		vp = savedcbp->b_vp;
		bawrite(savedcbp);
		if (dopersistence && VTOI(vp)->i_effnlink > 0)
			(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
	}
	/*
	 * If we have been unable to allocate a block in which to do
	 * the copy, then return non-zero so that the fragment will
	 * not be freed. Although space will be lost, the snapshot
	 * will stay consistent.
	 */
	if (snapshot_locked)
		VOP_UNLOCK(vp, 0, td);
	else
		VI_UNLOCK(devvp);
	return (error);
}

/*
 * Associate snapshot files when mounting.
*/
/*
 * Each inode listed in the superblock's fs_snapinum[] is fetched; entries
 * that are not snapshots (or use the old format) are dropped from the
 * array, the rest are switched to the shared snapshot lock and linked on
 * the device's snapshot list.  The block hint list is then read from the
 * last snapshot processed and the copy-on-write hook is installed.
 */
void
ffs_snapshot_mount(mp)
	struct mount *mp;
{
	struct ufsmount *ump = VFSTOUFS(mp);
	struct vnode *devvp = ump->um_devvp;
	struct fs *fs = ump->um_fs;
	struct thread *td = curthread;
	struct snaphead *snaphead;
	struct vnode *vp;
	struct inode *ip, *xp;
	struct uio auio;
	struct iovec aiov;
	void *snapblklist;
	char *reason;
	daddr_t snaplistsize;
	int error, snaploc, loc;

	/*
	 * XXX The following needs to be set before UFS_TRUNCATE or
	 * VOP_READ can be called.
	 */
	mp->mnt_stat.f_iosize = fs->fs_bsize;
	/*
	 * Process each snapshot listed in the superblock.
	 */
	vp = NULL;
	snaphead = &devvp->v_rdev->si_snapshots;
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
		if (fs->fs_snapinum[snaploc] == 0)
			break;
		if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc],
		    LK_EXCLUSIVE, &vp)) != 0){
			printf("ffs_snapshot_mount: vget failed %d\n", error);
			continue;
		}
		ip = VTOI(vp);
		if ((ip->i_flags & SF_SNAPSHOT) == 0 || ip->i_size ==
		    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag))) {
			if ((ip->i_flags & SF_SNAPSHOT) == 0) {
				reason = "non-snapshot";
			} else {
				reason = "old format snapshot";
				(void)UFS_TRUNCATE(vp, (off_t)0, 0, NOCRED, td);
				(void)VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			}
			printf("ffs_snapshot_mount: %s inode %d\n",
			    reason, fs->fs_snapinum[snaploc]);
			vput(vp);
			vp = NULL;
			/* Close the gap left by the rejected entry. */
			for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
				if (fs->fs_snapinum[loc] == 0)
					break;
				fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
			}
			fs->fs_snapinum[loc - 1] = 0;
			/* Re-examine the slot that was just refilled. */
			snaploc--;
			continue;
		}
		/*
		 * If there already exist snapshots on this filesystem, grab a
		 * reference to their shared lock. If this is the first snapshot
		 * on this filesystem, we need to allocate a lock for the
		 * snapshots to share. In either case, acquire the snapshot
		 * lock and give up our original private lock.
		 */
		VI_LOCK(devvp);
		if ((xp = TAILQ_FIRST(snaphead)) != NULL) {
			struct lock *lkp;

			lkp = ITOV(xp)->v_vnlock;
			VI_UNLOCK(devvp);
			VI_LOCK(vp);
			vp->v_vnlock = lkp;
		} else {
			struct lock *lkp;

			VI_UNLOCK(devvp);
			MALLOC(lkp, struct lock *, sizeof(struct lock),
			    M_UFSMNT, M_WAITOK);
			lockinit(lkp, PVFS, "snaplk", VLKTIMEOUT,
			    LK_CANRECURSE | LK_NOPAUSE);
			VI_LOCK(vp);
			vp->v_vnlock = lkp;
		}
		vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY, td);
		transferlockers(&vp->v_lock, vp->v_vnlock);
		lockmgr(&vp->v_lock, LK_RELEASE, NULL, td);
		/*
		 * Link it onto the active snapshot list.
		 */
		VI_LOCK(devvp);
		if (ip->i_nextsnap.tqe_prev != 0)
			panic("ffs_snapshot_mount: %d already on list",
			    ip->i_number);
		else
			TAILQ_INSERT_TAIL(snaphead, ip, i_nextsnap);
		vp->v_vflag |= VV_SYSTEM;
		VI_UNLOCK(devvp);
		VOP_UNLOCK(vp, 0, td);
	}
	/*
	 * No usable snapshots found.
	 */
	if (vp == NULL)
		return;
	/*
	 * Allocate the space for the block hints list. We always want to
	 * use the list from the newest snapshot.
	 */
	/* First read: just the list length stored at the end of the image. */
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	aiov.iov_base = (void *)&snaplistsize;
	aiov.iov_len = sizeof(snaplistsize);
	auio.uio_resid = aiov.iov_len;
	auio.uio_offset =
	    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag));
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_rw = UIO_READ;
	auio.uio_td = td;
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
	if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
		printf("ffs_snapshot_mount: read_1 failed %d\n", error);
		VOP_UNLOCK(vp, 0, td);
		return;
	}
	/*
	 * Second read: the whole list.  The offset is wound back over the
	 * length word, so the length is read again as the first element.
	 */
	MALLOC(snapblklist, void *, snaplistsize * sizeof(daddr_t),
	    M_UFSMNT, M_WAITOK);
	auio.uio_iovcnt = 1;
	aiov.iov_base = snapblklist;
	aiov.iov_len = snaplistsize * sizeof (daddr_t);
	auio.uio_resid = aiov.iov_len;
	auio.uio_offset -= sizeof(snaplistsize);
	if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
		printf("ffs_snapshot_mount: read_2 failed %d\n", error);
		VOP_UNLOCK(vp, 0, td);
		FREE(snapblklist, M_UFSMNT);
		return;
	}
	VOP_UNLOCK(vp, 0, td);
	/* Publish the list and enable copy-on-write on the device. */
	VI_LOCK(devvp);
	ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_mount");
	devvp->v_rdev->si_snaplistsize = snaplistsize;
	devvp->v_rdev->si_snapblklist = (daddr_t *)snapblklist;
	devvp->v_rdev->si_copyonwrite = ffs_copyonwrite;
	devvp->v_vflag |= VV_COPYONWRITE;
	VI_UNLOCK(devvp);
}

/*
 * Disassociate snapshot files when unmounting.
 *
 * Every snapshot is taken off the device's list and given back its
 * private vnode lock; the hint list and the shared lock are freed and
 * the copy-on-write hook is removed.
 */
void
ffs_snapshot_unmount(mp)
	struct mount *mp;
{
	struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
	struct snaphead *snaphead = &devvp->v_rdev->si_snapshots;
	struct lock *lkp = NULL;
	struct inode *xp;
	struct vnode *vp;

	VI_LOCK(devvp);
	while ((xp = TAILQ_FIRST(snaphead)) != 0) {
		vp = ITOV(xp);
		/* Remember the shared lock; it is destroyed after the loop. */
		lkp = vp->v_vnlock;
		vp->v_vnlock = &vp->v_lock;
		TAILQ_REMOVE(snaphead, xp, i_nextsnap);
		xp->i_nextsnap.tqe_prev = 0;
		if (xp->i_effnlink > 0) {
			/* vrele() is called with the interlock dropped. */
			VI_UNLOCK(devvp);
			vrele(vp);
			VI_LOCK(devvp);
		}
	}
	if (devvp->v_rdev->si_snapblklist != NULL) {
		FREE(devvp->v_rdev->si_snapblklist, M_UFSMNT);
		devvp->v_rdev->si_snapblklist = NULL;
		devvp->v_rdev->si_snaplistsize = 0;
	}
	if (lkp != NULL) {
		lockdestroy(lkp);
		FREE(lkp, M_UFSMNT);
	}
	ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_unmount");
	devvp->v_rdev->si_copyonwrite = 0;
	devvp->v_vflag &= ~VV_COPYONWRITE;
	VI_UNLOCK(devvp);
}

/*
 * Check for need to copy block that is about to be written,
 * copying the block if necessary.
 *
 * devvp is the device vnode carrying the snapshot list and bp the buffer
 * about to be written.  Returns 0 when nothing had to be copied or all
 * copies succeeded, otherwise the error from UFS_BALLOC() or readblock().
 * Panics if entered while TDP_COWINPROGRESS is set on the thread.
 */
static int
ffs_copyonwrite(devvp, bp)
	struct vnode *devvp;
	struct buf *bp;
{
	struct snaphead *snaphead;
	struct buf *ibp, *cbp, *savedcbp = 0;
	struct thread *td = curthread;
	struct fs *fs;
	struct inode *ip;
	struct vnode *vp = 0;
	ufs2_daddr_t lbn, blkno, *snapblklist;
	int lower, upper, mid, indiroff, snapshot_locked = 0, error = 0;

	if (td->td_pflags & TDP_COWINPROGRESS)
		panic("ffs_copyonwrite: recursive call");
	/*
	 * First check to see if it is in the preallocated list.
	 * By doing this check we avoid several potential deadlocks.
	 */
	VI_LOCK(devvp);
	snaphead = &devvp->v_rdev->si_snapshots;
	ip = TAILQ_FIRST(snaphead);
	fs = ip->i_fs;
	lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
	snapblklist = devvp->v_rdev->si_snapblklist;
	/*
	 * Binary search over entries 1 .. si_snaplistsize - 1; entry 0 is
	 * skipped (presumably it holds the list length -- TODO confirm).
	 */
	upper = devvp->v_rdev->si_snaplistsize - 1;
	lower = 1;
	while (lower <= upper) {
		mid = (lower + upper) / 2;
		if (snapblklist[mid] == lbn)
			break;
		if (snapblklist[mid] < lbn)
			lower = mid + 1;
		else
			upper = mid - 1;
	}
	/* The loop only exits with lower <= upper when lbn was found. */
	if (lower <= upper) {
		VI_UNLOCK(devvp);
		return (0);
	}
	/*
	 * Not in the precomputed list, so check the snapshots.
	 */
retry:
	TAILQ_FOREACH(ip, snaphead, i_nextsnap) {
		vp = ITOV(ip);
		/*
		 * We ensure that everything of our own that needs to be
		 * copied will be done at the time that ffs_snapshot is
		 * called. Thus we can skip the check here which can
		 * deadlock in doing the lookup in UFS_BALLOC.
		 */
		if (bp->b_vp == vp)
			continue;
		/*
		 * Check to see if block needs to be copied. We do not have
		 * to hold the snapshot lock while doing this lookup as it
		 * will never require any additional allocations for the
		 * snapshot inode.
		 */
		if (lbn < NDADDR) {
			blkno = DIP(ip, i_db[lbn]);
		} else {
			if (snapshot_locked == 0 &&
			    lockmgr(vp->v_vnlock,
			      LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
			      VI_MTX(devvp), td) != 0) {
				/* Re-take the interlock before rescanning. */
				VI_LOCK(devvp);
				goto retry;
			}
			snapshot_locked = 1;
			td->td_pflags |= TDP_COWINPROGRESS;
			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			   fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			td->td_pflags &= ~TDP_COWINPROGRESS;
			if (error)
				break;
			indiroff = (lbn - NDADDR) % NINDIR(fs);
			if (ip->i_ump->um_fstype == UFS1)
				blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff];
			else
				blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff];
			bqrelse(ibp);
		}
#ifdef DIAGNOSTIC
		if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
			panic("ffs_copyonwrite: bad copy block");
#endif
		/* A non-zero entry means no copy is needed for this snapshot. */
		if (blkno != 0)
			continue;
		/*
		 * Allocate the block into which to do the copy. Since
		 * multiple processes may all try to copy the same block,
		 * we have to recheck our need to do a copy if we sleep
		 * waiting for the lock.
		 *
		 * Because all snapshots on a filesystem share a single
		 * lock, we ensure that we will never be in competition
		 * with another process to allocate a block.
		 */
		if (snapshot_locked == 0 &&
		    lockmgr(vp->v_vnlock,
		      LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
		      VI_MTX(devvp), td) != 0) {
			VI_LOCK(devvp);
			goto retry;
		}
		snapshot_locked = 1;
		td->td_pflags |= TDP_COWINPROGRESS;
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &cbp);
		td->td_pflags &= ~TDP_COWINPROGRESS;
		if (error)
			break;
#ifdef DEBUG
		if (snapdebug) {
			printf("Copyonwrite: snapino %d lbn %jd for ",
			    ip->i_number, (intmax_t)lbn);
			if (bp->b_vp == devvp)
				printf("fs metadata");
			else
				printf("inum %d", VTOI(bp->b_vp)->i_number);
			printf(" lblkno %jd to blkno %jd\n",
			    (intmax_t)bp->b_lblkno, (intmax_t)cbp->b_blkno);
		}
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block. Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity.
		 */
		if (savedcbp != 0) {
			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			continue;
		}
		/*
		 * Otherwise, read the old block contents into the buffer.
		 */
		if ((error = readblock(cbp, lbn)) != 0) {
			/* Do not leave stale data in the copy block. */
			bzero(cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			break;
		}
		/* Keep the first copy around as the source for later ones. */
		savedcbp = cbp;
	}
	/*
	 * Note that we need to synchronously write snapshots that
	 * have not been unlinked, and hence will be visible after
	 * a crash, to ensure their integrity.
	 */
	if (savedcbp) {
		vp = savedcbp->b_vp;
		bawrite(savedcbp);
		if (dopersistence && VTOI(vp)->i_effnlink > 0)
			(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
	}
	if (snapshot_locked)
		VOP_UNLOCK(vp, 0, td);
	else
		VI_UNLOCK(devvp);
	return (error);
}

/*
 * Read the specified block into the given buffer.
* Much of this boiler-plate comes from bwrite(). */ static int readblock(bp, lbn) struct buf *bp; ufs2_daddr_t lbn; { struct uio auio; struct iovec aiov; struct thread *td = curthread; struct inode *ip = VTOI(bp->b_vp); aiov.iov_base = bp->b_data; aiov.iov_len = bp->b_bcount; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = dbtob(fsbtodb(ip->i_fs, blkstofrags(ip->i_fs, lbn))); auio.uio_resid = bp->b_bcount; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; return (physio(ip->i_devvp->v_rdev, &auio, 0)); } Index: head/sys/vm/vm_glue.c =================================================================== --- head/sys/vm/vm_glue.c (revision 130550) +++ head/sys/vm/vm_glue.c (revision 130551) @@ -1,1120 +1,1120 @@ /* * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_glue.c 8.6 (Berkeley) 1/5/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include "opt_kstack_pages.h" #include "opt_kstack_max_pages.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include extern int maxslp; /* * System initialization * * Note: proc0 from proc.h */ static void vm_init_limits(void *); SYSINIT(vm_limits, SI_SUB_VM_CONF, SI_ORDER_FIRST, vm_init_limits, &proc0) /* * THIS MUST BE THE LAST INITIALIZATION ITEM!!! * * Note: run scheduling should be divorced from the vm system. */ static void scheduler(void *); SYSINIT(scheduler, SI_SUB_RUN_SCHEDULER, SI_ORDER_ANY, scheduler, NULL) #ifndef NO_SWAPPING static void swapout(struct proc *); static void vm_proc_swapin(struct proc *p); static void vm_proc_swapout(struct proc *p); #endif /* * MPSAFE * * WARNING! This code calls vm_map_check_protection() which only checks * the associated vm_map_entry range. It does not determine whether the * contents of the memory is actually readable or writable. In most cases * just checking the vm_map_entry is sufficient within the kernel's address * space. */ int kernacc(addr, len, rw) void *addr; int len, rw; { boolean_t rv; vm_offset_t saddr, eaddr; vm_prot_t prot; KASSERT((rw & ~VM_PROT_ALL) == 0, ("illegal ``rw'' argument to kernacc (%x)\n", rw)); prot = rw; saddr = trunc_page((vm_offset_t)addr); eaddr = round_page((vm_offset_t)addr + len); vm_map_lock_read(kernel_map); rv = vm_map_check_protection(kernel_map, saddr, eaddr, prot); vm_map_unlock_read(kernel_map); return (rv == TRUE); } /* * MPSAFE * * WARNING! This code calls vm_map_check_protection() which only checks * the associated vm_map_entry range. It does not determine whether the * contents of the memory is actually readable or writable. 
vmapbuf(), * vm_fault_quick(), or copyin()/copout()/su*()/fu*() functions should be * used in conjuction with this call. */ int useracc(addr, len, rw) void *addr; int len, rw; { boolean_t rv; vm_prot_t prot; vm_map_t map; KASSERT((rw & ~VM_PROT_ALL) == 0, ("illegal ``rw'' argument to useracc (%x)\n", rw)); prot = rw; map = &curproc->p_vmspace->vm_map; if ((vm_offset_t)addr + len > vm_map_max(map) || (vm_offset_t)addr + len < (vm_offset_t)addr) { return (FALSE); } vm_map_lock_read(map); rv = vm_map_check_protection(map, trunc_page((vm_offset_t)addr), round_page((vm_offset_t)addr + len), prot); vm_map_unlock_read(map); return (rv == TRUE); } int vslock(void *addr, size_t len) { vm_offset_t end, last, start; vm_size_t npages; int error; last = (vm_offset_t)addr + len; start = trunc_page((vm_offset_t)addr); end = round_page(last); if (last < (vm_offset_t)addr || end < (vm_offset_t)addr) return (EINVAL); npages = atop(end - start); if (npages > vm_page_max_wired) return (ENOMEM); PROC_LOCK(curproc); if (ptoa(npages + pmap_wired_count(vm_map_pmap(&curproc->p_vmspace->vm_map))) > lim_cur(curproc, RLIMIT_MEMLOCK)) { PROC_UNLOCK(curproc); return (ENOMEM); } PROC_UNLOCK(curproc); #if 0 /* * XXX - not yet * * The limit for transient usage of wired pages should be * larger than for "permanent" wired pages (mlock()). * * Also, the sysctl code, which is the only present user * of vslock(), does a hard loop on EAGAIN. */ if (npages + cnt.v_wire_count > vm_page_max_wired) return (EAGAIN); #endif error = vm_map_wire(&curproc->p_vmspace->vm_map, start, end, VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES); /* * Return EFAULT on error to match copy{in,out}() behaviour * rather than returning ENOMEM like mlock() would. */ return (error == KERN_SUCCESS ? 0 : EFAULT); } void vsunlock(void *addr, size_t len) { /* Rely on the parameter sanity checks performed by vslock(). 
*/ (void)vm_map_unwire(&curproc->p_vmspace->vm_map, trunc_page((vm_offset_t)addr), round_page((vm_offset_t)addr + len), VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES); } /* * Create the U area for a new process. * This routine directly affects the fork perf for a process. */ void vm_proc_new(struct proc *p) { vm_page_t ma[UAREA_PAGES]; vm_object_t upobj; vm_offset_t up; vm_page_t m; u_int i; /* * Get a kernel virtual address for the U area for this process. */ up = kmem_alloc_nofault(kernel_map, UAREA_PAGES * PAGE_SIZE); if (up == 0) panic("vm_proc_new: upage allocation failed"); p->p_uarea = (struct user *)up; /* * Allocate object and page(s) for the U area. */ upobj = vm_object_allocate(OBJT_DEFAULT, UAREA_PAGES); p->p_upages_obj = upobj; VM_OBJECT_LOCK(upobj); for (i = 0; i < UAREA_PAGES; i++) { m = vm_page_grab(upobj, i, VM_ALLOC_NORMAL | VM_ALLOC_RETRY | VM_ALLOC_WIRED); ma[i] = m; vm_page_lock_queues(); vm_page_wakeup(m); m->valid = VM_PAGE_BITS_ALL; vm_page_unlock_queues(); } VM_OBJECT_UNLOCK(upobj); /* * Enter the pages into the kernel address space. */ pmap_qenter(up, ma, UAREA_PAGES); } /* * Dispose the U area for a process that has exited. * This routine directly impacts the exit perf of a process. * XXX proc_zone is marked UMA_ZONE_NOFREE, so this should never be called. */ void vm_proc_dispose(struct proc *p) { vm_object_t upobj; vm_offset_t up; vm_page_t m; upobj = p->p_upages_obj; VM_OBJECT_LOCK(upobj); if (upobj->resident_page_count != UAREA_PAGES) panic("vm_proc_dispose: incorrect number of pages in upobj"); vm_page_lock_queues(); while ((m = TAILQ_FIRST(&upobj->memq)) != NULL) { vm_page_busy(m); vm_page_unwire(m, 0); vm_page_free(m); } vm_page_unlock_queues(); VM_OBJECT_UNLOCK(upobj); up = (vm_offset_t)p->p_uarea; pmap_qremove(up, UAREA_PAGES); kmem_free(kernel_map, up, UAREA_PAGES * PAGE_SIZE); vm_object_deallocate(upobj); } #ifndef NO_SWAPPING /* * Allow the U area for a process to be prejudicially paged out. 
*/ static void vm_proc_swapout(struct proc *p) { vm_object_t upobj; vm_offset_t up; vm_page_t m; upobj = p->p_upages_obj; VM_OBJECT_LOCK(upobj); if (upobj->resident_page_count != UAREA_PAGES) panic("vm_proc_dispose: incorrect number of pages in upobj"); vm_page_lock_queues(); TAILQ_FOREACH(m, &upobj->memq, listq) { vm_page_dirty(m); vm_page_unwire(m, 0); } vm_page_unlock_queues(); VM_OBJECT_UNLOCK(upobj); up = (vm_offset_t)p->p_uarea; pmap_qremove(up, UAREA_PAGES); } /* * Bring the U area for a specified process back in. */ static void vm_proc_swapin(struct proc *p) { vm_page_t ma[UAREA_PAGES]; vm_object_t upobj; vm_offset_t up; vm_page_t m; int rv; int i; upobj = p->p_upages_obj; VM_OBJECT_LOCK(upobj); for (i = 0; i < UAREA_PAGES; i++) { m = vm_page_grab(upobj, i, VM_ALLOC_NORMAL | VM_ALLOC_RETRY); if (m->valid != VM_PAGE_BITS_ALL) { rv = vm_pager_get_pages(upobj, &m, 1, 0); if (rv != VM_PAGER_OK) panic("vm_proc_swapin: cannot get upage"); } ma[i] = m; } if (upobj->resident_page_count != UAREA_PAGES) panic("vm_proc_swapin: lost pages from upobj"); vm_page_lock_queues(); TAILQ_FOREACH(m, &upobj->memq, listq) { m->valid = VM_PAGE_BITS_ALL; vm_page_wire(m); vm_page_wakeup(m); } vm_page_unlock_queues(); VM_OBJECT_UNLOCK(upobj); up = (vm_offset_t)p->p_uarea; pmap_qenter(up, ma, UAREA_PAGES); } /* * Swap in the UAREAs of all processes swapped out to the given device. * The pages in the UAREA are marked dirty and their swap metadata is freed. 
*/ void vm_proc_swapin_all(struct swdevt *devidx) { struct proc *p; vm_object_t object; vm_page_t m; retry: sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); object = p->p_upages_obj; if (object != NULL) { VM_OBJECT_LOCK(object); if (swap_pager_isswapped(object, devidx)) { VM_OBJECT_UNLOCK(object); sx_sunlock(&allproc_lock); faultin(p); PROC_UNLOCK(p); VM_OBJECT_LOCK(object); vm_page_lock_queues(); TAILQ_FOREACH(m, &object->memq, listq) vm_page_dirty(m); vm_page_unlock_queues(); swap_pager_freespace(object, 0, object->un_pager.swp.swp_bcount); VM_OBJECT_UNLOCK(object); goto retry; } VM_OBJECT_UNLOCK(object); } PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); } #endif #ifndef KSTACK_MAX_PAGES #define KSTACK_MAX_PAGES 32 #endif /* * Create the kernel stack (including pcb for i386) for a new thread. * This routine directly affects the fork perf for a process and * create performance for a thread. */ void vm_thread_new(struct thread *td, int pages) { vm_object_t ksobj; vm_offset_t ks; vm_page_t m, ma[KSTACK_MAX_PAGES]; int i; /* Bounds check */ if (pages <= 1) pages = KSTACK_PAGES; else if (pages > KSTACK_MAX_PAGES) pages = KSTACK_MAX_PAGES; /* * Allocate an object for the kstack. */ ksobj = vm_object_allocate(OBJT_DEFAULT, pages); td->td_kstack_obj = ksobj; /* * Get a kernel virtual address for this thread's kstack. */ ks = kmem_alloc_nofault(kernel_map, (pages + KSTACK_GUARD_PAGES) * PAGE_SIZE); if (ks == 0) panic("vm_thread_new: kstack allocation failed"); if (KSTACK_GUARD_PAGES != 0) { pmap_qremove(ks, KSTACK_GUARD_PAGES); ks += KSTACK_GUARD_PAGES * PAGE_SIZE; } td->td_kstack = ks; /* * Knowing the number of pages allocated is useful when you * want to deallocate them. */ td->td_kstack_pages = pages; /* * For the length of the stack, link in a real page of ram for each * page of stack. */ VM_OBJECT_LOCK(ksobj); for (i = 0; i < pages; i++) { /* * Get a kernel stack page. 
*/ m = vm_page_grab(ksobj, i, VM_ALLOC_NORMAL | VM_ALLOC_RETRY | VM_ALLOC_WIRED); ma[i] = m; vm_page_lock_queues(); vm_page_wakeup(m); m->valid = VM_PAGE_BITS_ALL; vm_page_unlock_queues(); } VM_OBJECT_UNLOCK(ksobj); pmap_qenter(ks, ma, pages); } /* * Dispose of a thread's kernel stack. */ void vm_thread_dispose(struct thread *td) { vm_object_t ksobj; vm_offset_t ks; vm_page_t m; int i, pages; pages = td->td_kstack_pages; ksobj = td->td_kstack_obj; ks = td->td_kstack; pmap_qremove(ks, pages); VM_OBJECT_LOCK(ksobj); for (i = 0; i < pages; i++) { m = vm_page_lookup(ksobj, i); if (m == NULL) panic("vm_thread_dispose: kstack already missing?"); vm_page_lock_queues(); vm_page_busy(m); vm_page_unwire(m, 0); vm_page_free(m); vm_page_unlock_queues(); } VM_OBJECT_UNLOCK(ksobj); vm_object_deallocate(ksobj); kmem_free(kernel_map, ks - (KSTACK_GUARD_PAGES * PAGE_SIZE), (pages + KSTACK_GUARD_PAGES) * PAGE_SIZE); } /* * Allow a thread's kernel stack to be paged out. */ void vm_thread_swapout(struct thread *td) { vm_object_t ksobj; vm_page_t m; int i, pages; cpu_thread_swapout(td); pages = td->td_kstack_pages; ksobj = td->td_kstack_obj; pmap_qremove(td->td_kstack, pages); VM_OBJECT_LOCK(ksobj); for (i = 0; i < pages; i++) { m = vm_page_lookup(ksobj, i); if (m == NULL) panic("vm_thread_swapout: kstack already missing?"); vm_page_lock_queues(); vm_page_dirty(m); vm_page_unwire(m, 0); vm_page_unlock_queues(); } VM_OBJECT_UNLOCK(ksobj); } /* * Bring the kernel stack for a specified thread back in. 
*/ void vm_thread_swapin(struct thread *td) { vm_object_t ksobj; vm_page_t m, ma[KSTACK_MAX_PAGES]; int i, pages, rv; pages = td->td_kstack_pages; ksobj = td->td_kstack_obj; VM_OBJECT_LOCK(ksobj); for (i = 0; i < pages; i++) { m = vm_page_grab(ksobj, i, VM_ALLOC_NORMAL | VM_ALLOC_RETRY); if (m->valid != VM_PAGE_BITS_ALL) { rv = vm_pager_get_pages(ksobj, &m, 1, 0); if (rv != VM_PAGER_OK) panic("vm_thread_swapin: cannot get kstack for proc: %d", td->td_proc->p_pid); m = vm_page_lookup(ksobj, i); m->valid = VM_PAGE_BITS_ALL; } ma[i] = m; vm_page_lock_queues(); vm_page_wire(m); vm_page_wakeup(m); vm_page_unlock_queues(); } VM_OBJECT_UNLOCK(ksobj); pmap_qenter(td->td_kstack, ma, pages); cpu_thread_swapin(td); } /* * Set up a variable-sized alternate kstack. */ void vm_thread_new_altkstack(struct thread *td, int pages) { td->td_altkstack = td->td_kstack; td->td_altkstack_obj = td->td_kstack_obj; td->td_altkstack_pages = td->td_kstack_pages; vm_thread_new(td, pages); } /* * Restore the original kstack. */ void vm_thread_dispose_altkstack(struct thread *td) { vm_thread_dispose(td); td->td_kstack = td->td_altkstack; td->td_kstack_obj = td->td_altkstack_obj; td->td_kstack_pages = td->td_altkstack_pages; td->td_altkstack = 0; td->td_altkstack_obj = NULL; td->td_altkstack_pages = 0; } /* * Implement fork's actions on an address space. * Here we arrange for the address space to be copied or referenced, * allocate a user struct (pcb and kernel stack), then call the * machine-dependent layer to fill those in and make the new process * ready to run. The new process is set up so that it returns directly * to user mode to avoid stack copying and relocation problems. */ void vm_forkproc(td, p2, td2, flags) struct thread *td; struct proc *p2; struct thread *td2; int flags; { struct proc *p1 = td->td_proc; struct user *up; GIANT_REQUIRED; if ((flags & RFPROC) == 0) { /* * Divorce the memory, if it is shared, essentially * this changes shared memory amongst threads, into * COW locally. 
*/ if ((flags & RFMEM) == 0) { if (p1->p_vmspace->vm_refcnt > 1) { vmspace_unshare(p1); } } cpu_fork(td, p2, td2, flags); return; } if (flags & RFMEM) { p2->p_vmspace = p1->p_vmspace; p1->p_vmspace->vm_refcnt++; } while (vm_page_count_severe()) { VM_WAIT; } if ((flags & RFMEM) == 0) { p2->p_vmspace = vmspace_fork(p1->p_vmspace); if (p1->p_vmspace->vm_shm) shmfork(p1, p2); } /* XXXKSE this is unsatisfactory but should be adequate */ up = p2->p_uarea; MPASS(p2->p_sigacts != NULL); /* * p_stats currently points at fields in the user struct * but not at &u, instead at p_addr. Copy parts of * p_stats; zero the rest of p_stats (statistics). */ p2->p_stats = &up->u_stats; bzero(&up->u_stats.pstat_startzero, (unsigned) ((caddr_t) &up->u_stats.pstat_endzero - (caddr_t) &up->u_stats.pstat_startzero)); bcopy(&p1->p_stats->pstat_startcopy, &up->u_stats.pstat_startcopy, ((caddr_t) &up->u_stats.pstat_endcopy - (caddr_t) &up->u_stats.pstat_startcopy)); /* * cpu_fork will copy and update the pcb, set up the kernel stack, * and make the child ready to run. */ cpu_fork(td, p2, td2, flags); } /* * Called after process has been wait(2)'ed apon and is being reaped. * The idea is to reclaim resources that we could not reclaim while * the process was still executing. */ void vm_waitproc(p) struct proc *p; { GIANT_REQUIRED; vmspace_exitfree(p); /* and clean-out the vmspace */ } /* * Set default limits for VM system. * Called for proc 0, and then inherited by all others. * * XXX should probably act directly on proc0. */ static void vm_init_limits(udata) void *udata; { struct proc *p = udata; struct plimit *limp; int rss_limit; /* * Set up the initial limits on process VM. Set the maximum resident * set size to be half of (reasonably) available memory. Since this * is a soft limit, it comes into effect only when the system is out * of memory - half of main memory helps to favor smaller processes, * and reduces thrashing of the object cache. 
*/ limp = p->p_limit; limp->pl_rlimit[RLIMIT_STACK].rlim_cur = dflssiz; limp->pl_rlimit[RLIMIT_STACK].rlim_max = maxssiz; limp->pl_rlimit[RLIMIT_DATA].rlim_cur = dfldsiz; limp->pl_rlimit[RLIMIT_DATA].rlim_max = maxdsiz; /* limit the limit to no less than 2MB */ rss_limit = max(cnt.v_free_count, 512); limp->pl_rlimit[RLIMIT_RSS].rlim_cur = ptoa(rss_limit); limp->pl_rlimit[RLIMIT_RSS].rlim_max = RLIM_INFINITY; } void faultin(p) struct proc *p; { #ifdef NO_SWAPPING PROC_LOCK_ASSERT(p, MA_OWNED); if ((p->p_sflag & PS_INMEM) == 0) panic("faultin: proc swapped out with NO_SWAPPING!"); #else /* !NO_SWAPPING */ struct thread *td; GIANT_REQUIRED; PROC_LOCK_ASSERT(p, MA_OWNED); /* * If another process is swapping in this process, * just wait until it finishes. */ if (p->p_sflag & PS_SWAPPINGIN) msleep(&p->p_sflag, &p->p_mtx, PVM, "faultin", 0); else if ((p->p_sflag & PS_INMEM) == 0) { /* * Don't let another thread swap process p out while we are * busy swapping it in. */ ++p->p_lock; mtx_lock_spin(&sched_lock); p->p_sflag |= PS_SWAPPINGIN; mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); vm_proc_swapin(p); FOREACH_THREAD_IN_PROC(p, td) vm_thread_swapin(td); PROC_LOCK(p); mtx_lock_spin(&sched_lock); p->p_sflag &= ~PS_SWAPPINGIN; p->p_sflag |= PS_INMEM; FOREACH_THREAD_IN_PROC(p, td) { TD_CLR_SWAPPED(td); if (TD_CAN_RUN(td)) setrunnable(td); } mtx_unlock_spin(&sched_lock); wakeup(&p->p_sflag); /* Allow other threads to swap p out now. */ --p->p_lock; } #endif /* NO_SWAPPING */ } /* * This swapin algorithm attempts to swap-in processes only if there * is enough space for them. Of course, if a process waits for a long * time, it will be swapped in anyway. * * XXXKSE - process with the thread with highest priority counts.. * * Giant is still held at this point, to be released in tsleep. 
*/
/* ARGSUSED*/
/*
 * scheduler: endless loop that picks the swapped-out process with the
 * highest computed priority and faults it back in.  The dummy argument
 * is unused.  This function never returns.
 */
static void
scheduler(dummy)
	void *dummy;
{
	struct proc *p;
	struct thread *td;
	int pri;
	struct proc *pp;	/* best candidate found so far */
	int ppri;		/* priority of that candidate */

	mtx_assert(&Giant, MA_OWNED | MA_NOTRECURSED);
	/* GIANT_REQUIRED */

loop:
	/* Wait while free pages are below the minimum. */
	if (vm_page_count_min()) {
		VM_WAIT;
		goto loop;
	}

	pp = NULL;
	ppri = INT_MIN;
	sx_slock(&allproc_lock);
	FOREACH_PROC_IN_SYSTEM(p) {
		struct ksegrp *kg;
		/* Skip processes that are resident or mid-transition. */
		if (p->p_sflag & (PS_INMEM | PS_SWAPPINGOUT | PS_SWAPPINGIN)) {
			continue;
		}
		mtx_lock_spin(&sched_lock);
		FOREACH_THREAD_IN_PROC(p, td) {
			/*
			 * An otherwise runnable thread of a process
			 * swapped out has only the TDI_SWAPPED bit set.
			 *
			 */
			if (td->td_inhibitors == TDI_SWAPPED) {
				kg = td->td_ksegrp;
				pri = p->p_swtime + kg->kg_slptime;
				if ((p->p_sflag & PS_SWAPINREQ) == 0) {
					/*
					 * NOTE(review): the next two lines
					 * carry unified-diff markers (old
					 * kg_nice line, new p_nice line).
					 */
-					pri -= kg->kg_nice * 8;
+					pri -= p->p_nice * 8;
				}

				/*
				 * if this ksegrp is higher priority
				 * and there is enough space, then select
				 * this process instead of the previous
				 * selection.
				 */
				if (pri > ppri) {
					pp = p;
					ppri = pri;
				}
			}
		}
		mtx_unlock_spin(&sched_lock);
	}
	sx_sunlock(&allproc_lock);

	/*
	 * Nothing to do, back to sleep.
	 */
	if ((p = pp) == NULL) {
		tsleep(&proc0, PVM, "sched", maxslp * hz / 2);
		goto loop;
	}
	PROC_LOCK(p);

	/*
	 * Another process may be bringing or may have already
	 * brought this process in while we traverse all threads.
	 * Or, this process may even be being swapped out again.
	 */
	if (p->p_sflag & (PS_INMEM | PS_SWAPPINGOUT | PS_SWAPPINGIN)) {
		PROC_UNLOCK(p);
		goto loop;
	}

	mtx_lock_spin(&sched_lock);
	p->p_sflag &= ~PS_SWAPINREQ;
	mtx_unlock_spin(&sched_lock);

	/*
	 * We would like to bring someone in. (only if there is space).
	 * [What checks the space? ]
	 */
	faultin(p);
	PROC_UNLOCK(p);
	/* Reset the swap time counter now that the process is back in. */
	mtx_lock_spin(&sched_lock);
	p->p_swtime = 0;
	mtx_unlock_spin(&sched_lock);
	goto loop;
}

#ifndef NO_SWAPPING

/*
 * Swap_idle_threshold1 is the guaranteed swapped in time for a process
 */
static int swap_idle_threshold1 = 2;
SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold1, CTLFLAG_RW,
    &swap_idle_threshold1, 0, "Guaranteed swapped in time for a process");

/*
 * Swap_idle_threshold2 is the time that a process can be idle before
 * it will be swapped out, if idle swapping is enabled.
 */
static int swap_idle_threshold2 = 10;
SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold2, CTLFLAG_RW,
    &swap_idle_threshold2, 0, "Time before a process will be swapped out");

/*
 * Swapout is driven by the pageout daemon.  Very simple, we find eligible
 * procs and unwire their u-areas.  We try to always "swap" at least one
 * process in case we need the room for a swapin.
 * If any procs have been sleeping/stopped for at least maxslp seconds,
 * they are swapped.  Else, we swap the longest-sleeping or stopped process,
 * if any, otherwise the longest-resident process.
 *
 * action is a mask of VM_SWAP_NORMAL and VM_SWAP_IDLE.  After each
 * successful swapout every lock is dropped and the process list is
 * rescanned from the start.
 */
void
swapout_procs(action)
	int action;
{
	struct proc *p;
	struct thread *td;
	struct ksegrp *kg;
	int didswap = 0;

	GIANT_REQUIRED;

retry:
	sx_slock(&allproc_lock);
	FOREACH_PROC_IN_SYSTEM(p) {
		struct vmspace *vm;
		int minslptime = 100000;

		/*
		 * Watch out for a process in
		 * creation.  It may have no
		 * address space or lock yet.
		 */
		mtx_lock_spin(&sched_lock);
		if (p->p_state == PRS_NEW) {
			mtx_unlock_spin(&sched_lock);
			continue;
		}
		mtx_unlock_spin(&sched_lock);

		/*
		 * An aio daemon switches its
		 * address space while running.
		 * Perform a quick check whether
		 * a process has P_SYSTEM.
		 */
		if ((p->p_flag & P_SYSTEM) != 0)
			continue;

		/*
		 * Do not swapout a process that
		 * is waiting for VM data
		 * structures as there is a possible
		 * deadlock.  Test this first as
		 * this may block.
		 *
		 * Lock the map until swapout
		 * finishes, or a thread of this
		 * process may attempt to alter
		 * the map.
		 */
		PROC_LOCK(p);
		vm = p->p_vmspace;
		KASSERT(vm != NULL,
			("swapout_procs: a process has no address space"));
		/* Hold a vmspace reference; released at nextproc1. */
		++vm->vm_refcnt;
		PROC_UNLOCK(p);
		if (!vm_map_trylock(&vm->vm_map))
			goto nextproc1;

		PROC_LOCK(p);
		if (p->p_lock != 0 ||
		    (p->p_flag & (P_STOPPED_SINGLE|P_TRACED|P_SYSTEM|P_WEXIT)
		    ) != 0) {
			goto nextproc2;
		}
		/*
		 * only aiod changes vmspace, however it will be
		 * skipped because of the if statement above checking
		 * for P_SYSTEM
		 */
		if ((p->p_sflag & (PS_INMEM|PS_SWAPPINGOUT|PS_SWAPPINGIN)) != PS_INMEM)
			goto nextproc2;

		switch (p->p_state) {
		default:
			/* Don't swap out processes in any sort
			 * of 'special' state. */
			break;
		case PRS_NORMAL:
			mtx_lock_spin(&sched_lock);
			/*
			 * do not swapout a realtime process
			 * Check all the thread groups..
			 */
			FOREACH_KSEGRP_IN_PROC(p, kg) {
				if (PRI_IS_REALTIME(kg->kg_pri_class))
					goto nextproc;

				/*
				 * Guarantee swap_idle_threshold1
				 * time in memory.
				 */
				if (kg->kg_slptime < swap_idle_threshold1)
					goto nextproc;

				/*
				 * Do not swapout a process if it is
				 * waiting on a critical event of some
				 * kind or there is a thread whose
				 * pageable memory may be accessed.
				 *
				 * This could be refined to support
				 * swapping out a thread.
				 */
				FOREACH_THREAD_IN_GROUP(kg, td) {
					if ((td->td_priority) < PSOCK ||
					    !thread_safetoswapout(td))
						goto nextproc;
				}
				/*
				 * If the system is under memory stress,
				 * or if we are swapping
				 * idle processes >= swap_idle_threshold2,
				 * then swap the process out.
				 */
				if (((action & VM_SWAP_NORMAL) == 0) &&
				    (((action & VM_SWAP_IDLE) == 0) ||
				    (kg->kg_slptime < swap_idle_threshold2)))
					goto nextproc;

				/* Track the shortest sleep time seen. */
				if (minslptime > kg->kg_slptime)
					minslptime = kg->kg_slptime;
			}

			/*
			 * If the process has been asleep for awhile and had
			 * most of its pages taken away already, swap it out.
			 */
			if ((action & VM_SWAP_NORMAL) ||
				((action & VM_SWAP_IDLE) &&
				 (minslptime > swap_idle_threshold2))) {
				swapout(p);
				didswap++;
				/* Release everything, then rescan. */
				mtx_unlock_spin(&sched_lock);
				PROC_UNLOCK(p);
				vm_map_unlock(&vm->vm_map);
				vmspace_free(vm);
				sx_sunlock(&allproc_lock);
				goto retry;
			}
nextproc:
			/* Reached with sched_lock held. */
			mtx_unlock_spin(&sched_lock);
		}
nextproc2:
		/* Reached with the process lock and the map lock held. */
		PROC_UNLOCK(p);
		vm_map_unlock(&vm->vm_map);
nextproc1:
		/* Reached holding only the vmspace reference. */
		vmspace_free(vm);
		continue;
	}
	sx_sunlock(&allproc_lock);
	/*
	 * If we swapped something out, and another process needed memory,
	 * then wakeup the sched process.
	 */
	if (didswap)
		wakeup(&proc0);
}

/*
 * swapout: swap process p out.
 *
 * Entered with the process lock and sched_lock held (both asserted
 * below).  Both are dropped while the U area and thread stacks are
 * swapped out, and both are held again on return.
 */
static void
swapout(p)
	struct proc *p;
{
	struct thread *td;

	PROC_LOCK_ASSERT(p, MA_OWNED);
	mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED);
#if defined(SWAP_DEBUG)
	printf("swapping out %d\n", p->p_pid);
#endif

	/*
	 * The states of this process and its threads may have changed
	 * by now.  Assuming that there is only one pageout daemon thread,
	 * this process should still be in memory.
	 */
	KASSERT((p->p_sflag & (PS_INMEM|PS_SWAPPINGOUT|PS_SWAPPINGIN)) == PS_INMEM,
		("swapout: lost a swapout race?"));

#if defined(INVARIANTS)
	/*
	 * Make sure that all threads are safe to be swapped out.
	 *
	 * Alternatively, we could swap out only safe threads.
	 */
	FOREACH_THREAD_IN_PROC(p, td) {
		KASSERT(thread_safetoswapout(td),
			("swapout: there is a thread not safe for swapout"));
	}
#endif /* INVARIANTS */

	++p->p_stats->p_ru.ru_nswap;
	/*
	 * remember the process resident count
	 */
	p->p_vmspace->vm_swrss = vmspace_resident_count(p->p_vmspace);

	/* Flag the process as leaving memory before dropping the locks. */
	p->p_sflag &= ~PS_INMEM;
	p->p_sflag |= PS_SWAPPINGOUT;
	PROC_UNLOCK(p);
	FOREACH_THREAD_IN_PROC(p, td)
		TD_SET_SWAPPED(td);
	mtx_unlock_spin(&sched_lock);

	vm_proc_swapout(p);
	FOREACH_THREAD_IN_PROC(p, td)
		vm_thread_swapout(td);

	/* Retake both locks and clear the in-progress flag. */
	PROC_LOCK(p);
	mtx_lock_spin(&sched_lock);
	p->p_sflag &= ~PS_SWAPPINGOUT;
	p->p_swtime = 0;
}
#endif /* !NO_SWAPPING */
Index: head/sys/vm/vm_pageout.c
===================================================================
--- head/sys/vm/vm_pageout.c (revision 130550)
+++ head/sys/vm/vm_pageout.c (revision 130551)
@@ -1,1573 +1,1570 @@
/*
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 * Copyright (c) 1994 David Greenman
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 * must display the following acknowledgement:
 * This product includes software developed by the University of
 * California, Berkeley and its contributors.
 * 4. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_pageout.c 7.4 (Berkeley) 5/7/91 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 
* * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * The proverbial page-out daemon. */ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * System initialization */ /* the kernel process "vm_pageout"*/ static void vm_pageout(void); static int vm_pageout_clean(vm_page_t); static void vm_pageout_pmap_collect(void); static void vm_pageout_scan(int pass); struct proc *pageproc; static struct kproc_desc page_kp = { "pagedaemon", vm_pageout, &pageproc }; SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start, &page_kp) #if !defined(NO_SWAPPING) /* the kernel process "vm_daemon"*/ static void vm_daemon(void); static struct proc *vmproc; static struct kproc_desc vm_kp = { "vmdaemon", vm_daemon, &vmproc }; SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp) #endif int vm_pages_needed; /* Event on which pageout daemon sleeps */ int vm_pageout_deficit; /* Estimated number of pages deficit */ int vm_pageout_pages_needed; /* flag saying that the pageout daemon needs pages */ #if !defined(NO_SWAPPING) static int vm_pageout_req_swapout; /* XXX */ static int vm_daemon_needed; #endif static int vm_max_launder = 32; static int vm_pageout_stats_max=0, vm_pageout_stats_interval = 0; static int vm_pageout_full_stats_interval = 0; static int vm_pageout_stats_free_max=0, vm_pageout_algorithm=0; static int defer_swap_pageouts=0; static int disable_swap_pageouts=0; #if defined(NO_SWAPPING) static int vm_swap_enabled=0; 
static int vm_swap_idle_enabled=0; #else static int vm_swap_enabled=1; static int vm_swap_idle_enabled=0; #endif SYSCTL_INT(_vm, VM_PAGEOUT_ALGORITHM, pageout_algorithm, CTLFLAG_RW, &vm_pageout_algorithm, 0, "LRU page mgmt"); SYSCTL_INT(_vm, OID_AUTO, max_launder, CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout"); SYSCTL_INT(_vm, OID_AUTO, pageout_stats_max, CTLFLAG_RW, &vm_pageout_stats_max, 0, "Max pageout stats scan length"); SYSCTL_INT(_vm, OID_AUTO, pageout_full_stats_interval, CTLFLAG_RW, &vm_pageout_full_stats_interval, 0, "Interval for full stats scan"); SYSCTL_INT(_vm, OID_AUTO, pageout_stats_interval, CTLFLAG_RW, &vm_pageout_stats_interval, 0, "Interval for partial stats scan"); SYSCTL_INT(_vm, OID_AUTO, pageout_stats_free_max, CTLFLAG_RW, &vm_pageout_stats_free_max, 0, "Not implemented"); #if defined(NO_SWAPPING) SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, CTLFLAG_RD, &vm_swap_enabled, 0, ""); SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, CTLFLAG_RD, &vm_swap_idle_enabled, 0, ""); #else SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout"); SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria"); #endif SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts, CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem"); SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts, CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages"); static int pageout_lock_miss; SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss, CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout"); #define VM_PAGEOUT_PAGE_COUNT 16 int vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT; int vm_page_max_wired; /* XXX max # of wired pages system-wide */ #if !defined(NO_SWAPPING) static void vm_pageout_map_deactivate_pages(vm_map_t, long); static void vm_pageout_object_deactivate_pages(pmap_t, vm_object_t, long); 
static void vm_req_vmdaemon(void); #endif static void vm_pageout_page_stats(void); /* * vm_pageout_clean: * * Clean the page and remove it from the laundry. * * We set the busy bit to cause potential page faults on this page to * block. Note the careful timing, however, the busy bit isn't set till * late and we cannot do anything that will mess with the page. */ static int vm_pageout_clean(m) vm_page_t m; { vm_object_t object; vm_page_t mc[2*vm_pageout_page_count]; int pageout_count; int ib, is, page_base; vm_pindex_t pindex = m->pindex; mtx_assert(&vm_page_queue_mtx, MA_OWNED); VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); /* * It doesn't cost us anything to pageout OBJT_DEFAULT or OBJT_SWAP * with the new swapper, but we could have serious problems paging * out other object types if there is insufficient memory. * * Unfortunately, checking free memory here is far too late, so the * check has been moved up a procedural level. */ /* * Don't mess with the page if it's busy, held, or special */ if ((m->hold_count != 0) || ((m->busy != 0) || (m->flags & (PG_BUSY|PG_UNMANAGED)))) { return 0; } mc[vm_pageout_page_count] = m; pageout_count = 1; page_base = vm_pageout_page_count; ib = 1; is = 1; /* * Scan object for clusterable pages. * * We can cluster ONLY if: ->> the page is NOT * clean, wired, busy, held, or mapped into a * buffer, and one of the following: * 1) The page is inactive, or a seldom used * active page. * -or- * 2) we force the issue. * * During heavy mmap/modification loads the pageout * daemon can really fragment the underlying file * due to flushing pages out of order and not trying * align the clusters (which leave sporatic out-of-order * holes). To solve this problem we do the reverse scan * first and attempt to align our cluster, then do a * forward scan if room remains. 
*/ object = m->object; more: while (ib && pageout_count < vm_pageout_page_count) { vm_page_t p; if (ib > pindex) { ib = 0; break; } if ((p = vm_page_lookup(object, pindex - ib)) == NULL) { ib = 0; break; } if (((p->queue - p->pc) == PQ_CACHE) || (p->flags & (PG_BUSY|PG_UNMANAGED)) || p->busy) { ib = 0; break; } vm_page_test_dirty(p); if ((p->dirty & p->valid) == 0 || p->queue != PQ_INACTIVE || p->wire_count != 0 || /* may be held by buf cache */ p->hold_count != 0) { /* may be undergoing I/O */ ib = 0; break; } mc[--page_base] = p; ++pageout_count; ++ib; /* * alignment boundry, stop here and switch directions. Do * not clear ib. */ if ((pindex - (ib - 1)) % vm_pageout_page_count == 0) break; } while (pageout_count < vm_pageout_page_count && pindex + is < object->size) { vm_page_t p; if ((p = vm_page_lookup(object, pindex + is)) == NULL) break; if (((p->queue - p->pc) == PQ_CACHE) || (p->flags & (PG_BUSY|PG_UNMANAGED)) || p->busy) { break; } vm_page_test_dirty(p); if ((p->dirty & p->valid) == 0 || p->queue != PQ_INACTIVE || p->wire_count != 0 || /* may be held by buf cache */ p->hold_count != 0) { /* may be undergoing I/O */ break; } mc[page_base + pageout_count] = p; ++pageout_count; ++is; } /* * If we exhausted our forward scan, continue with the reverse scan * when possible, even past a page boundry. This catches boundry * conditions. */ if (ib && pageout_count < vm_pageout_page_count) goto more; /* * we allow reads during pageouts... */ return (vm_pageout_flush(&mc[page_base], pageout_count, 0)); } /* * vm_pageout_flush() - launder the given pages * * The given pages are laundered. Note that we setup for the start of * I/O ( i.e. busy the page ), mark it read-only, and bump the object * reference count all in here rather then in the parent. If we want * the parent to do more sophisticated things we may have to change * the ordering. 
*/
/*
 * Arguments: mc is an array of count pages; the object is taken from
 * mc[0].  flags is OR-ed into the flags passed to vm_pager_put_pages().
 * Returns the number of pages whose pager status was VM_PAGER_OK or
 * VM_PAGER_PEND.
 *
 * The page queue mutex and the object lock must be held on entry (both are
 * asserted).  The page queue mutex is dropped around the pager call and
 * re-taken before the status of each page is examined.
 */
int
vm_pageout_flush(vm_page_t *mc, int count, int flags)
{
	vm_object_t object = mc[0]->object;
	int pageout_status[count];
	int numpagedout = 0;
	int i;

	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
	/*
	 * Initiate I/O.  Bump the vm_page_t->busy counter and
	 * mark the pages read-only.
	 *
	 * We do not have to fixup the clean/dirty bits here... we can
	 * allow the pager to do it after the I/O completes.
	 *
	 * NOTE! mc[i]->dirty may be partial or fragmented due to an
	 * edge case with file fragments.
	 */
	for (i = 0; i < count; i++) {
		KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL, ("vm_pageout_flush: partially invalid page %p index %d/%d", mc[i], i, count));
		vm_page_io_start(mc[i]);
		pmap_page_protect(mc[i], VM_PROT_READ);
	}
	vm_page_unlock_queues();
	vm_object_pip_add(object, count);

	/* Pageouts of kernel_object are forced to be synchronous. */
	vm_pager_put_pages(object, mc, count,
	    (flags | ((object == kernel_object) ? VM_PAGER_PUT_SYNC : 0)),
	    pageout_status);

	vm_page_lock_queues();
	for (i = 0; i < count; i++) {
		vm_page_t mt = mc[i];

		KASSERT((mt->flags & PG_WRITEABLE) == 0, ("vm_pageout_flush: page %p is not write protected", mt));
		switch (pageout_status[i]) {
		case VM_PAGER_OK:
		case VM_PAGER_PEND:
			numpagedout++;
			break;
		case VM_PAGER_BAD:
			/*
			 * Page outside of range of object. Right now we
			 * essentially lose the changes by pretending it
			 * worked.
			 */
			pmap_clear_modify(mt);
			vm_page_undirty(mt);
			break;
		case VM_PAGER_ERROR:
		case VM_PAGER_FAIL:
			/*
			 * If page couldn't be paged out, then reactivate the
			 * page so it doesn't clog the inactive list.  (We
			 * will try paging it out again later).
			 */
			vm_page_activate(mt);
			break;
		case VM_PAGER_AGAIN:
			break;
		}

		/*
		 * If the operation is still going, leave the page busy to
		 * block all other accesses. Also, leave the paging in
		 * progress indicator set so that we don't attempt an object
		 * collapse.
		 */
		if (pageout_status[i] != VM_PAGER_PEND) {
			vm_object_pip_wakeup(object);
			vm_page_io_finish(mt);
			if (vm_page_count_severe())
				vm_page_try_to_cache(mt);
		}
	}
	return numpagedout;
}

#if !defined(NO_SWAPPING)
/*
 *	vm_pageout_object_deactivate_pages
 *
 *	deactivate enough pages to satisfy the inactive target
 *	requirements or if vm_page_proc_limit is set, then
 *	deactivate all of the pages in the object and its
 *	backing_objects.
 *
 *	The object and map must be locked.
 *
 *	The scan stops as soon as pmap_resident_count(pmap) drops to
 *	"desired" or below, or when an object with paging in progress is
 *	reached.  first_object stays locked on return; any backing object
 *	locked along the way is unlocked before returning.
 */
static void
vm_pageout_object_deactivate_pages(pmap, first_object, desired)
	pmap_t pmap;
	vm_object_t first_object;
	long desired;
{
	vm_object_t backing_object, object;
	vm_page_t p, next;
	int actcount, rcount, remove_mode;

	VM_OBJECT_LOCK_ASSERT(first_object, MA_OWNED);
	if (first_object->type == OBJT_DEVICE || first_object->type == OBJT_PHYS)
		return;
	for (object = first_object;; object = backing_object) {
		if (pmap_resident_count(pmap) <= desired)
			goto unlock_return;
		if (object->paging_in_progress)
			goto unlock_return;

		/*
		 * With more than one shadow, pages of this object are never
		 * deactivated below, only requeued.
		 */
		remove_mode = 0;
		if (object->shadow_count > 1)
			remove_mode = 1;
		/*
		 * scan the object's entire memory queue
		 */
		rcount = object->resident_page_count;
		p = TAILQ_FIRST(&object->memq);
		vm_page_lock_queues();
		while (p && (rcount-- > 0)) {
			if (pmap_resident_count(pmap) <= desired) {
				vm_page_unlock_queues();
				goto unlock_return;
			}
			next = TAILQ_NEXT(p, listq);
			cnt.v_pdpages++;
			/*
			 * Skip pages that are wired, held, busy, unmanaged,
			 * or not mapped by this pmap.
			 */
			if (p->wire_count != 0 ||
			    p->hold_count != 0 ||
			    p->busy != 0 ||
			    (p->flags & (PG_BUSY|PG_UNMANAGED)) ||
			    !pmap_page_exists_quick(pmap, p)) {
				p = next;
				continue;
			}
			actcount = pmap_ts_referenced(p);
			if (actcount) {
				vm_page_flag_set(p, PG_REFERENCED);
			} else if (p->flags & PG_REFERENCED) {
				actcount = 1;
			}
			if ((p->queue != PQ_ACTIVE) &&
				(p->flags & PG_REFERENCED)) {
				vm_page_activate(p);
				p->act_count += actcount;
				vm_page_flag_clear(p, PG_REFERENCED);
			} else if (p->queue == PQ_ACTIVE) {
				if ((p->flags & PG_REFERENCED) == 0) {
					p->act_count -= min(p->act_count, ACT_DECLINE);
					if (!remove_mode && (vm_pageout_algorithm || (p->act_count == 0))) {
						pmap_remove_all(p);
						vm_page_deactivate(p);
					} else {
						vm_pageq_requeue(p);
					}
				} else {
					vm_page_activate(p);
					vm_page_flag_clear(p, PG_REFERENCED);
					if (p->act_count < (ACT_MAX - ACT_ADVANCE))
						p->act_count += ACT_ADVANCE;
					vm_pageq_requeue(p);
				}
			} else if (p->queue == PQ_INACTIVE) {
				pmap_remove_all(p);
			}
			p = next;
		}
		vm_page_unlock_queues();
		if ((backing_object = object->backing_object) == NULL)
			goto unlock_return;
		/*
		 * Lock the backing object before releasing the current one;
		 * first_object is never unlocked here.
		 */
		VM_OBJECT_LOCK(backing_object);
		if (object != first_object)
			VM_OBJECT_UNLOCK(object);
	}
unlock_return:
	if (object != first_object)
		VM_OBJECT_UNLOCK(object);
}

/*
 * deactivate some number of pages in a map, try to do it fairly, but
 * that is really hard to do.
 *
 * Returns without doing anything if the map lock cannot be taken
 * immediately.
 */
static void
vm_pageout_map_deactivate_pages(map, desired)
	vm_map_t map;
	long desired;
{
	vm_map_entry_t tmpe;
	vm_object_t obj, bigobj;
	int nothingwired;

	if (!vm_map_trylock(map))
		return;

	bigobj = NULL;
	nothingwired = TRUE;

	/*
	 * first, search out the biggest object, and try to free pages from
	 * that.  Objects whose lock cannot be taken immediately are not
	 * considered; bigobj is kept locked while it is the candidate.
	 */
	tmpe = map->header.next;
	while (tmpe != &map->header) {
		if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
			obj = tmpe->object.vm_object;
			if (obj != NULL && VM_OBJECT_TRYLOCK(obj)) {
				if (obj->shadow_count <= 1 &&
				    (bigobj == NULL ||
				     bigobj->resident_page_count < obj->resident_page_count)) {
					if (bigobj != NULL)
						VM_OBJECT_UNLOCK(bigobj);
					bigobj = obj;
				} else
					VM_OBJECT_UNLOCK(obj);
			}
		}
		if (tmpe->wired_count > 0)
			nothingwired = FALSE;
		tmpe = tmpe->next;
	}

	if (bigobj != NULL) {
		vm_pageout_object_deactivate_pages(map->pmap, bigobj, desired);
		VM_OBJECT_UNLOCK(bigobj);
	}
	/*
	 * Next, hunt around for other pages to deactivate.  We actually
	 * do this search sort of wrong -- .text first is not the best idea.
	 */
	tmpe = map->header.next;
	while (tmpe != &map->header) {
		if (pmap_resident_count(vm_map_pmap(map)) <= desired)
			break;
		if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
			obj = tmpe->object.vm_object;
			if (obj != NULL) {
				VM_OBJECT_LOCK(obj);
				vm_pageout_object_deactivate_pages(map->pmap, obj, desired);
				VM_OBJECT_UNLOCK(obj);
			}
		}
		tmpe = tmpe->next;
	}

	/*
	 * Remove all mappings if a process is swapped out, this will free page
	 * table pages.
	 */
	if (desired == 0 && nothingwired) {
		GIANT_REQUIRED;
		vm_page_lock_queues();
		pmap_remove(vm_map_pmap(map), vm_map_min(map),
		    vm_map_max(map));
		vm_page_unlock_queues();
	}
	vm_map_unlock(map);
}
#endif		/* !defined(NO_SWAPPING) */

/*
 * This routine is very drastic, but can save the system
 * in a pinch.
 *
 * It does nothing unless pmap_pagedaemon_waken is set.  Otherwise it calls
 * pmap_remove_all() on every page in vm_page_array that is not wired, held,
 * busy or unmanaged, then clears pmap_pagedaemon_waken.  The console
 * warning is printed at most five times.
 */
static void
vm_pageout_pmap_collect(void)
{
	int i;
	vm_page_t m;
	static int warningdone;

	if (pmap_pagedaemon_waken == 0)
		return;
	if (warningdone < 5) {
		printf("collecting pv entries -- suggest increasing PMAP_SHPGPERPROC\n");
		warningdone++;
	}
	vm_page_lock_queues();
	for (i = 0; i < vm_page_array_size; i++) {
		m = &vm_page_array[i];
		if (m->wire_count || m->hold_count || m->busy ||
		    (m->flags & (PG_BUSY | PG_UNMANAGED)))
			continue;
		pmap_remove_all(m);
	}
	vm_page_unlock_queues();
	pmap_pagedaemon_waken = 0;
}

/*
 *	vm_pageout_scan does the dirty work for the pageout daemon.
 *
 *	"pass" is 0 on the first attempt.  A non-zero pass raises the
 *	laundering limit to 10000, skips the two-trips-through-the-queue
 *	deferral for dirty pages, and permits the out-of-swap process kill
 *	at the end.  Giant is held for the whole call.
 */
static void
vm_pageout_scan(int pass)
{
	vm_page_t m, next;
	struct vm_page marker;
	int page_shortage, maxscan, pcount;
	int addl_page_shortage, addl_page_shortage_init;
	struct proc *p, *bigproc;
	struct thread *td;
	vm_offset_t size, bigsize;
	vm_object_t object;
	int actcount;
	int vnodes_skipped = 0;
	int maxlaunder;
	int s;

	mtx_lock(&Giant);
	/*
	 * Decrease registered cache sizes.
	 */
	EVENTHANDLER_INVOKE(vm_lowmem, 0);
	/*
	 * We do this explicitly after the caches have been drained above.
	 */
	uma_reclaim();

	/*
	 * Do whatever cleanup that the pmap code can.
	 */
	vm_pageout_pmap_collect();

	addl_page_shortage_init = atomic_readandclear_int(&vm_pageout_deficit);

	/*
	 * Calculate the number of pages we want to either free or move
	 * to the cache.
	 */
	page_shortage = vm_paging_target() + addl_page_shortage_init;

	/*
	 * Initialize our marker
	 */
	bzero(&marker, sizeof(marker));
	marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
	marker.queue = PQ_INACTIVE;
	marker.wire_count = 1;

	/*
	 * Start scanning the inactive queue for pages we can move to the
	 * cache or free.  The scan will stop when the target is reached or
	 * we have scanned the entire inactive queue.  Note that m->act_count
	 * is not used to form decisions for the inactive queue, only for the
	 * active queue.
	 *
	 * maxlaunder limits the number of dirty pages we flush per scan.
	 * For most systems a smaller value (16 or 32) is more robust under
	 * extreme memory and disk pressure because any unnecessary writes
	 * to disk can result in extreme performance degradation.  However,
	 * systems with excessive dirty pages (especially when MAP_NOSYNC is
	 * used) will die horribly with limited laundering.  If the pageout
	 * daemon cannot clean enough pages in the first pass, we let it go
	 * all out in succeeding passes.
	 */
	if ((maxlaunder = vm_max_launder) <= 1)
		maxlaunder = 1;
	if (pass)
		maxlaunder = 10000;
	vm_page_lock_queues();
rescan0:
	addl_page_shortage = addl_page_shortage_init;
	maxscan = cnt.v_inactive_count;

	for (m = TAILQ_FIRST(&vm_page_queues[PQ_INACTIVE].pl);
	     m != NULL && maxscan-- > 0 && page_shortage > 0;
	     m = next) {

		cnt.v_pdpages++;

		/*
		 * The page we were about to visit has left the inactive
		 * queue, so restart the scan from the head.
		 */
		if (m->queue != PQ_INACTIVE) {
			goto rescan0;
		}

		next = TAILQ_NEXT(m, pageq);

		/*
		 * skip marker pages
		 */
		if (m->flags & PG_MARKER)
			continue;

		/*
		 * A held page may be undergoing I/O, so skip it.
		 */
		if (m->hold_count) {
			vm_pageq_requeue(m);
			addl_page_shortage++;
			continue;
		}
		/*
		 * Don't mess with busy pages, keep in the front of the
		 * queue, most likely are being paged out.
		 */
		if (m->busy || (m->flags & PG_BUSY)) {
			addl_page_shortage++;
			continue;
		}

		/*
		 * If the object is not being used, we ignore previous
		 * references.
		 */
		if (m->object->ref_count == 0) {
			vm_page_flag_clear(m, PG_REFERENCED);
			pmap_clear_reference(m);

		/*
		 * Otherwise, if the page has been referenced while in the
		 * inactive queue, we bump the "activation count" upwards,
		 * making it less likely that the page will be added back to
		 * the inactive queue prematurely again.  Here we check the
		 * page tables (or emulated bits, if any), given the upper
		 * level VM system not knowing anything about existing
		 * references.
		 */
		} else if (((m->flags & PG_REFERENCED) == 0) &&
			(actcount = pmap_ts_referenced(m))) {
			vm_page_activate(m);
			m->act_count += (actcount + ACT_ADVANCE);
			continue;
		}

		/*
		 * If the upper level VM system knows about any page
		 * references, we activate the page.  We also set the
		 * "activation count" higher than normal so that we will less
		 * likely place pages back onto the inactive queue again.
		 */
		if ((m->flags & PG_REFERENCED) != 0) {
			vm_page_flag_clear(m, PG_REFERENCED);
			actcount = pmap_ts_referenced(m);
			vm_page_activate(m);
			m->act_count += (actcount + ACT_ADVANCE + 1);
			continue;
		}

		/*
		 * If the upper level VM system doesn't know anything about
		 * the page being dirty, we have to check for it again.  As
		 * far as the VM code knows, any partially dirty pages are
		 * fully dirty.
		 */
		if (m->dirty == 0 && !pmap_is_modified(m)) {
			/*
			 * Avoid a race condition: Unless write access is
			 * removed from the page, another processor could
			 * modify it before all access is removed by the call
			 * to vm_page_cache() below.  If vm_page_cache() finds
			 * that the page has been modified when it removes all
			 * access, it panics because it cannot cache dirty
			 * pages.  In principle, we could eliminate just write
			 * access here rather than all access.  In the expected
			 * case, when there are no last instant modifications
			 * to the page, removing all access will be cheaper
			 * overall.
			 */
			if ((m->flags & PG_WRITEABLE) != 0)
				pmap_remove_all(m);
		} else {
			vm_page_dirty(m);
		}

		/* Skip the page if its object lock is not immediately free. */
		object = m->object;
		if (!VM_OBJECT_TRYLOCK(object))
			continue;
		if (m->valid == 0) {
			/*
			 * Invalid pages can be easily freed
			 */
			vm_page_busy(m);
			pmap_remove_all(m);
			vm_page_free(m);
			cnt.v_dfree++;
			--page_shortage;
		} else if (m->dirty == 0) {
			/*
			 * Clean pages can be placed onto the cache queue.
			 * This effectively frees them.
			 */
			vm_page_cache(m);
			--page_shortage;
		} else if ((m->flags & PG_WINATCFLS) == 0 && pass == 0) {
			/*
			 * Dirty pages need to be paged out, but flushing
			 * a page is extremely expensive versus freeing
			 * a clean page.  Rather than artificially limiting
			 * the number of pages we can flush, we instead give
			 * dirty pages extra priority on the inactive queue
			 * by forcing them to be cycled through the queue
			 * twice before being flushed, after which the
			 * (now clean) page will cycle through once more
			 * before being freed.  This significantly extends
			 * the thrash point for a heavily loaded machine.
			 */
			vm_page_flag_set(m, PG_WINATCFLS);
			vm_pageq_requeue(m);
		} else if (maxlaunder > 0) {
			/*
			 * We always want to try to flush some dirty pages if
			 * we encounter them, to keep the system stable.
			 * Normally this number is small, but under extreme
			 * pressure where there are insufficient clean pages
			 * on the inactive queue, we may have to go all out.
			 */
			int swap_pageouts_ok;
			struct vnode *vp = NULL;
			struct mount *mp;

			if ((object->type != OBJT_SWAP) && (object->type != OBJT_DEFAULT)) {
				swap_pageouts_ok = 1;
			} else {
				swap_pageouts_ok = !(defer_swap_pageouts || disable_swap_pageouts);
				swap_pageouts_ok |= (!disable_swap_pageouts && defer_swap_pageouts &&
				vm_page_count_min());

			}

			/*
			 * We don't bother paging objects that are "dead".
			 * Those objects are in a "rundown" state.
			 */
			if (!swap_pageouts_ok || (object->flags & OBJ_DEAD)) {
				VM_OBJECT_UNLOCK(object);
				vm_pageq_requeue(m);
				continue;
			}

			/*
			 * The object is already known NOT to be dead.   It
			 * is possible for the vget() to block the whole
			 * pageout daemon, but the new low-memory handling
			 * code should prevent it.
			 *
			 * The previous code skipped locked vnodes and, worse,
			 * reordered pages in the queue.  This results in
			 * completely non-deterministic operation and, on a
			 * busy system, can lead to extremely non-optimal
			 * pageouts.  For example, it can cause clean pages
			 * to be freed and dirty pages to be moved to the end
			 * of the queue.  Since dirty pages are also moved to
			 * the end of the queue once-cleaned, this gives
			 * way too large a weighting to deferring the freeing
			 * of dirty pages.
			 *
			 * We can't wait forever for the vnode lock, we might
			 * deadlock due to a vn_read() getting stuck in
			 * vm_wait while holding this vnode.  We skip the
			 * vnode if we can't get it in a reasonable amount
			 * of time.
			 */
			if (object->type == OBJT_VNODE) {
				vp = object->handle;
				mp = NULL;
				if (vp->v_type == VREG)
					vn_start_write(vp, &mp, V_NOWAIT);
				/*
				 * Both the page queue lock and the object
				 * lock are released before vget() is called.
				 */
				vm_page_unlock_queues();
				VI_LOCK(vp);
				VM_OBJECT_UNLOCK(object);
				if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK |
				    LK_TIMELOCK, curthread)) {
					VM_OBJECT_LOCK(object);
					vm_page_lock_queues();
					++pageout_lock_miss;
					vn_finished_write(mp);
					if (object->flags & OBJ_MIGHTBEDIRTY)
						vnodes_skipped++;
					VM_OBJECT_UNLOCK(object);
					continue;
				}
				VM_OBJECT_LOCK(object);
				vm_page_lock_queues();
				/*
				 * The page might have been moved to another
				 * queue during potential blocking in vget()
				 * above.  The page might have been freed and
				 * reused for another vnode.  The object might
				 * have been reused for another vnode.
				 */
				if (m->queue != PQ_INACTIVE ||
				    m->object != object ||
				    object->handle != vp) {
					if (object->flags & OBJ_MIGHTBEDIRTY)
						vnodes_skipped++;
					goto unlock_and_continue;
				}

				/*
				 * The page may have been busied during the
				 * blocking in vput();  We don't move the
				 * page back onto the end of the queue so that
				 * statistics are more correct if we don't.
				 */
				if (m->busy || (m->flags & PG_BUSY)) {
					goto unlock_and_continue;
				}

				/*
				 * If the page has become held it might
				 * be undergoing I/O, so skip it
				 */
				if (m->hold_count) {
					vm_pageq_requeue(m);
					if (object->flags & OBJ_MIGHTBEDIRTY)
						vnodes_skipped++;
					goto unlock_and_continue;
				}
			}

			/*
			 * If a page is dirty, then it is either being washed
			 * (but not yet cleaned) or it is still in the
			 * laundry.  If it is still in the laundry, then we
			 * start the cleaning operation.
			 *
			 * This operation may cluster, invalidating the 'next'
			 * pointer.  To prevent an inordinate number of
			 * restarts we use our marker to remember our place.
			 *
			 * decrement page_shortage on success to account for
			 * the (future) cleaned page.  Otherwise we could wind
			 * up laundering or cleaning too many pages.
			 */
			s = splvm();
			TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE].pl, m, &marker, pageq);
			splx(s);
			if (vm_pageout_clean(m) != 0) {
				--page_shortage;
				--maxlaunder;
			}
			s = splvm();
			next = TAILQ_NEXT(&marker, pageq);
			TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, &marker, pageq);
			splx(s);
unlock_and_continue:
			VM_OBJECT_UNLOCK(object);
			/*
			 * vp is non-NULL only on the OBJT_VNODE path, where
			 * vget() succeeded; the queue lock is dropped around
			 * the vput().
			 */
			if (vp) {
				vm_page_unlock_queues();
				vput(vp);
				vn_finished_write(mp);
				vm_page_lock_queues();
			}
			continue;
		}
		VM_OBJECT_UNLOCK(object);
	}

	/*
	 * Compute the number of pages we want to try to move from the
	 * active queue to the inactive queue.
	 */
	page_shortage = vm_paging_target() +
		cnt.v_inactive_target - cnt.v_inactive_count;
	page_shortage += addl_page_shortage;

	/*
	 * Scan the active queue for things we can deactivate. We nominally
	 * track the per-page activity counter and use it to locate
	 * deactivation candidates.
	 */
	pcount = cnt.v_active_count;
	m = TAILQ_FIRST(&vm_page_queues[PQ_ACTIVE].pl);

	while ((m != NULL) && (pcount-- > 0) && (page_shortage > 0)) {

		KASSERT(m->queue == PQ_ACTIVE,
		    ("vm_pageout_scan: page %p isn't active", m));

		next = TAILQ_NEXT(m, pageq);
		/*
		 * Don't deactivate pages that are busy.
		 */
		if ((m->busy != 0) ||
		    (m->flags & PG_BUSY) ||
		    (m->hold_count != 0)) {
			vm_pageq_requeue(m);
			m = next;
			continue;
		}

		/*
		 * The count for pagedaemon pages is done after checking the
		 * page for eligibility...
		 */
		cnt.v_pdpages++;

		/*
		 * Check to see "how much" the page has been used.
		 */
		actcount = 0;
		if (m->object->ref_count != 0) {
			if (m->flags & PG_REFERENCED) {
				actcount += 1;
			}
			actcount += pmap_ts_referenced(m);
			if (actcount) {
				m->act_count += ACT_ADVANCE + actcount;
				if (m->act_count > ACT_MAX)
					m->act_count = ACT_MAX;
			}
		}

		/*
		 * Since we have "tested" this bit, we need to clear it now.
		 */
		vm_page_flag_clear(m, PG_REFERENCED);

		/*
		 * Only if an object is currently being used, do we use the
		 * page activation count stats.
		 */
		if (actcount && (m->object->ref_count != 0)) {
			vm_pageq_requeue(m);
		} else {
			m->act_count -= min(m->act_count, ACT_DECLINE);
			if (vm_pageout_algorithm ||
			    m->object->ref_count == 0 ||
			    m->act_count == 0) {
				page_shortage--;
				if (m->object->ref_count == 0) {
					pmap_remove_all(m);
					if (m->dirty == 0)
						vm_page_cache(m);
					else
						vm_page_deactivate(m);
				} else {
					vm_page_deactivate(m);
				}
			} else {
				vm_pageq_requeue(m);
			}
		}
		m = next;
	}
	s = splvm();

	/*
	 * We try to maintain some *really* free pages, this allows interrupt
	 * code to be guaranteed space.  Since both cache and free queues
	 * are considered basically 'free', moving pages from cache to free
	 * does not affect other calculations.
	 */
	while (cnt.v_free_count < cnt.v_free_reserved) {
		static int cache_rover = 0;

		if ((m = vm_page_select_cache(cache_rover)) == NULL)
			break;
		cache_rover = (m->pc + PQ_PRIME2) & PQ_L2_MASK;
		object = m->object;
		/*
		 * The assertion below expects the selected page's object to
		 * be locked already; it is unlocked here once the page has
		 * been freed.
		 */
		VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
		vm_page_busy(m);
		vm_page_free(m);
		VM_OBJECT_UNLOCK(object);
		cnt.v_dfree++;
	}
	splx(s);
	vm_page_unlock_queues();
#if !defined(NO_SWAPPING)
	/*
	 * Idle process swapout -- run once per second.
	 */
	if (vm_swap_idle_enabled) {
		static long lsec;
		if (time_second != lsec) {
			vm_pageout_req_swapout |= VM_SWAP_IDLE;
			vm_req_vmdaemon();
			lsec = time_second;
		}
	}
#endif

	/*
	 * If we didn't get enough free pages, and we have skipped a vnode
	 * in a writeable object, wakeup the sync daemon.  And kick swapout
	 * if we did not get enough free pages.
	 */
	if (vm_paging_target() > 0) {
		if (vnodes_skipped && vm_page_count_min())
			(void) speedup_syncer();
#if !defined(NO_SWAPPING)
		if (vm_swap_enabled && vm_page_count_target()) {
			vm_req_vmdaemon();
			vm_pageout_req_swapout |= VM_SWAP_NORMAL;
		}
#endif
	}

	/*
	 * If we are critically low on one of RAM or swap and low on
	 * the other, kill the largest process.  However, we avoid
	 * doing this on the first pass in order to give ourselves a
	 * chance to flush out dirty vnode-backed pages and to allow
	 * active pages to be moved to the inactive queue and reclaimed.
	 *
	 * We keep the process bigproc locked once we find it to keep anyone
	 * from messing with it; however, there is a possibility of
	 * deadlock if process B is bigproc and one of its child processes
	 * attempts to propagate a signal to B while we are waiting for A's
	 * lock while walking this list.  To avoid this, we don't block on
	 * the process lock but just skip a process if it is already locked.
	 */
	if (pass != 0 &&
	    ((swap_pager_avail < 64 && vm_page_count_min()) ||
	     (swap_pager_full && vm_paging_target() > 0))) {
		bigproc = NULL;
		bigsize = 0;
		sx_slock(&allproc_lock);
		FOREACH_PROC_IN_SYSTEM(p) {
			int breakout;

			if (PROC_TRYLOCK(p) == 0)
				continue;
			/*
			 * If this is a system or protected process, skip it.
			 */
			if ((p->p_flag & P_SYSTEM) || (p->p_pid == 1) ||
			    (p->p_flag & P_PROTECTED) ||
			    ((p->p_pid < 48) && (swap_pager_avail != 0))) {
				PROC_UNLOCK(p);
				continue;
			}
			/*
			 * If the process is in a non-running type state,
			 * don't touch it.  Check all the threads individually.
			 */
			mtx_lock_spin(&sched_lock);
			breakout = 0;
			FOREACH_THREAD_IN_PROC(p, td) {
				if (!TD_ON_RUNQ(td) &&
				    !TD_IS_RUNNING(td) &&
				    !TD_IS_SLEEPING(td)) {
					breakout = 1;
					break;
				}
			}
			if (breakout) {
				mtx_unlock_spin(&sched_lock);
				PROC_UNLOCK(p);
				continue;
			}
			mtx_unlock_spin(&sched_lock);
			/*
			 * get the process size
			 */
			if (!vm_map_trylock_read(&p->p_vmspace->vm_map)) {
				PROC_UNLOCK(p);
				continue;
			}
			size = vmspace_swap_count(p->p_vmspace);
			vm_map_unlock_read(&p->p_vmspace->vm_map);
			size += vmspace_resident_count(p->p_vmspace);
			/*
			 * if this process is bigger than the biggest one
			 * remember it.
			 */
			if (size > bigsize) {
				if (bigproc != NULL)
					PROC_UNLOCK(bigproc);
				bigproc = p;
				bigsize = size;
			} else
				PROC_UNLOCK(p);
		}
		sx_sunlock(&allproc_lock);
		if (bigproc != NULL) {
			/*
			 * NOTE(review): the lines below that begin with '-'
			 * or '+' look like unified-diff markers carried in
			 * this text (removed and added lines of the change),
			 * not C operators -- confirm before compiling.
			 */
-			struct ksegrp *kg;
			killproc(bigproc, "out of swap space");
			mtx_lock_spin(&sched_lock);
-			FOREACH_KSEGRP_IN_PROC(bigproc, kg) {
-				sched_nice(kg, PRIO_MIN); /* XXXKSE ??? */
-			}
+			sched_nice(bigproc, PRIO_MIN);
			mtx_unlock_spin(&sched_lock);
			PROC_UNLOCK(bigproc);
			wakeup(&cnt.v_free_count);
		}
	}
	mtx_unlock(&Giant);
}

/*
 * This routine tries to maintain the pseudo LRU active queue,
 * so that during long periods of time where there is no paging,
 * that some statistic accumulation still occurs.  This code
 * helps the situation where paging just starts to occur.
*/ static void vm_pageout_page_stats() { vm_page_t m,next; int pcount,tpcount; /* Number of pages to check */ static int fullintervalcount = 0; int page_shortage; int s0; page_shortage = (cnt.v_inactive_target + cnt.v_cache_max + cnt.v_free_min) - (cnt.v_free_count + cnt.v_inactive_count + cnt.v_cache_count); if (page_shortage <= 0) return; s0 = splvm(); vm_page_lock_queues(); pcount = cnt.v_active_count; fullintervalcount += vm_pageout_stats_interval; if (fullintervalcount < vm_pageout_full_stats_interval) { tpcount = (vm_pageout_stats_max * cnt.v_active_count) / cnt.v_page_count; if (pcount > tpcount) pcount = tpcount; } else { fullintervalcount = 0; } m = TAILQ_FIRST(&vm_page_queues[PQ_ACTIVE].pl); while ((m != NULL) && (pcount-- > 0)) { int actcount; KASSERT(m->queue == PQ_ACTIVE, ("vm_pageout_page_stats: page %p isn't active", m)); next = TAILQ_NEXT(m, pageq); /* * Don't deactivate pages that are busy. */ if ((m->busy != 0) || (m->flags & PG_BUSY) || (m->hold_count != 0)) { vm_pageq_requeue(m); m = next; continue; } actcount = 0; if (m->flags & PG_REFERENCED) { vm_page_flag_clear(m, PG_REFERENCED); actcount += 1; } actcount += pmap_ts_referenced(m); if (actcount) { m->act_count += ACT_ADVANCE + actcount; if (m->act_count > ACT_MAX) m->act_count = ACT_MAX; vm_pageq_requeue(m); } else { if (m->act_count == 0) { /* * We turn off page access, so that we have * more accurate RSS stats. We don't do this * in the normal page deactivation when the * system is loaded VM wise, because the * cost of the large number of page protect * operations would be higher than the value * of doing the operation. */ pmap_remove_all(m); vm_page_deactivate(m); } else { m->act_count -= min(m->act_count, ACT_DECLINE); vm_pageq_requeue(m); } } m = next; } vm_page_unlock_queues(); splx(s0); } /* * vm_pageout is the high level pageout daemon. */ static void vm_pageout() { int error, pass, s; /* * Initialize some paging parameters. 
*/ cnt.v_interrupt_free_min = 2; if (cnt.v_page_count < 2000) vm_pageout_page_count = 8; /* * v_free_reserved needs to include enough for the largest * swap pager structures plus enough for any pv_entry structs * when paging. */ if (cnt.v_page_count > 1024) cnt.v_free_min = 4 + (cnt.v_page_count - 1024) / 200; else cnt.v_free_min = 4; cnt.v_pageout_free_min = (2*MAXBSIZE)/PAGE_SIZE + cnt.v_interrupt_free_min; cnt.v_free_reserved = vm_pageout_page_count + cnt.v_pageout_free_min + (cnt.v_page_count / 768) + PQ_L2_SIZE; cnt.v_free_severe = cnt.v_free_min / 2; cnt.v_free_min += cnt.v_free_reserved; cnt.v_free_severe += cnt.v_free_reserved; /* * v_free_target and v_cache_min control pageout hysteresis. Note * that these are more a measure of the VM cache queue hysteresis * then the VM free queue. Specifically, v_free_target is the * high water mark (free+cache pages). * * v_free_reserved + v_cache_min (mostly means v_cache_min) is the * low water mark, while v_free_min is the stop. v_cache_min must * be big enough to handle memory needs while the pageout daemon * is signalled and run to free more pages. */ if (cnt.v_free_count > 6144) cnt.v_free_target = 4 * cnt.v_free_min + cnt.v_free_reserved; else cnt.v_free_target = 2 * cnt.v_free_min + cnt.v_free_reserved; if (cnt.v_free_count > 2048) { cnt.v_cache_min = cnt.v_free_target; cnt.v_cache_max = 2 * cnt.v_cache_min; cnt.v_inactive_target = (3 * cnt.v_free_target) / 2; } else { cnt.v_cache_min = 0; cnt.v_cache_max = 0; cnt.v_inactive_target = cnt.v_free_count / 4; } if (cnt.v_inactive_target > cnt.v_free_count / 3) cnt.v_inactive_target = cnt.v_free_count / 3; /* XXX does not really belong here */ if (vm_page_max_wired == 0) vm_page_max_wired = cnt.v_free_count / 3; if (vm_pageout_stats_max == 0) vm_pageout_stats_max = cnt.v_free_target; /* * Set interval in seconds for stats scan. 
*/ if (vm_pageout_stats_interval == 0) vm_pageout_stats_interval = 5; if (vm_pageout_full_stats_interval == 0) vm_pageout_full_stats_interval = vm_pageout_stats_interval * 4; /* * Set maximum free per pass */ if (vm_pageout_stats_free_max == 0) vm_pageout_stats_free_max = 5; swap_pager_swap_init(); pass = 0; /* * The pageout daemon is never done, so loop forever. */ while (TRUE) { s = splvm(); vm_page_lock_queues(); /* * If we have enough free memory, wakeup waiters. Do * not clear vm_pages_needed until we reach our target, * otherwise we may be woken up over and over again and * waste a lot of cpu. */ if (vm_pages_needed && !vm_page_count_min()) { if (!vm_paging_needed()) vm_pages_needed = 0; wakeup(&cnt.v_free_count); } if (vm_pages_needed) { /* * Still not done, take a second pass without waiting * (unlimited dirty cleaning), otherwise sleep a bit * and try again. */ ++pass; if (pass > 1) msleep(&vm_pages_needed, &vm_page_queue_mtx, PVM, "psleep", hz/2); } else { /* * Good enough, sleep & handle stats. Prime the pass * for the next run. */ if (pass > 1) pass = 1; else pass = 0; error = msleep(&vm_pages_needed, &vm_page_queue_mtx, PVM, "psleep", vm_pageout_stats_interval * hz); if (error && !vm_pages_needed) { vm_page_unlock_queues(); splx(s); pass = 0; vm_pageout_page_stats(); continue; } } if (vm_pages_needed) cnt.v_pdwakeups++; vm_page_unlock_queues(); splx(s); vm_pageout_scan(pass); } } /* * Unless the page queue lock is held by the caller, this function * should be regarded as advisory. Specifically, the caller should * not msleep() on &cnt.v_free_count following this function unless * the page queue lock is held until the msleep() is performed. 
*/ void pagedaemon_wakeup() { if (!vm_pages_needed && curthread->td_proc != pageproc) { vm_pages_needed = 1; wakeup(&vm_pages_needed); } } #if !defined(NO_SWAPPING) static void vm_req_vmdaemon() { static int lastrun = 0; if ((ticks > (lastrun + hz)) || (ticks < lastrun)) { wakeup(&vm_daemon_needed); lastrun = ticks; } } static void vm_daemon() { struct rlimit rsslim; struct proc *p; struct thread *td; int breakout; mtx_lock(&Giant); while (TRUE) { tsleep(&vm_daemon_needed, PPAUSE, "psleep", 0); if (vm_pageout_req_swapout) { swapout_procs(vm_pageout_req_swapout); vm_pageout_req_swapout = 0; } /* * scan the processes for exceeding their rlimits or if * process is swapped out -- deactivate pages */ sx_slock(&allproc_lock); LIST_FOREACH(p, &allproc, p_list) { vm_pindex_t limit, size; /* * if this is a system process or if we have already * looked at this process, skip it. */ PROC_LOCK(p); if (p->p_flag & (P_SYSTEM | P_WEXIT)) { PROC_UNLOCK(p); continue; } /* * if the process is in a non-running type state, * don't touch it. */ mtx_lock_spin(&sched_lock); breakout = 0; FOREACH_THREAD_IN_PROC(p, td) { if (!TD_ON_RUNQ(td) && !TD_IS_RUNNING(td) && !TD_IS_SLEEPING(td)) { breakout = 1; break; } } mtx_unlock_spin(&sched_lock); if (breakout) { PROC_UNLOCK(p); continue; } /* * get a limit */ lim_rlimit(p, RLIMIT_RSS, &rsslim); limit = OFF_TO_IDX( qmin(rsslim.rlim_cur, rsslim.rlim_max)); /* * let processes that are swapped out really be * swapped out set the limit to nothing (will force a * swap-out.) */ if ((p->p_sflag & PS_INMEM) == 0) limit = 0; /* XXX */ PROC_UNLOCK(p); size = vmspace_resident_count(p->p_vmspace); if (limit >= 0 && size >= limit) { vm_pageout_map_deactivate_pages( &p->p_vmspace->vm_map, limit); } } sx_sunlock(&allproc_lock); } } #endif /* !defined(NO_SWAPPING) */