Index: stable/7/sys/contrib/pf =================================================================== --- stable/7/sys/contrib/pf (revision 191812) +++ stable/7/sys/contrib/pf (revision 191813) Property changes on: stable/7/sys/contrib/pf ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/sys/contrib/pf:r189878 Index: stable/7/sys/dev/ath/ath_hal =================================================================== --- stable/7/sys/dev/ath/ath_hal (revision 191812) +++ stable/7/sys/dev/ath/ath_hal (revision 191813) Property changes on: stable/7/sys/dev/ath/ath_hal ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/sys/dev/ath/ath_hal:r189878 Index: stable/7/sys/dev/cxgb =================================================================== --- stable/7/sys/dev/cxgb (revision 191812) +++ stable/7/sys/dev/cxgb (revision 191813) Property changes on: stable/7/sys/dev/cxgb ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/sys/dev/cxgb:r189878 Index: stable/7/sys/gnu/fs/xfs/FreeBSD/xfs_buf.c =================================================================== --- stable/7/sys/gnu/fs/xfs/FreeBSD/xfs_buf.c (revision 191812) +++ stable/7/sys/gnu/fs/xfs/FreeBSD/xfs_buf.c (revision 191813) @@ -1,336 +1,336 @@ /* * Copyright (c) 2001,2005 Russell Cattelan * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include "xfs.h" #include "xfs_types.h" #include "xfs_inum.h" #include "xfs_log.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_dir.h" #include "xfs_dir2.h" #include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_clnt.h" #include "xfs_mountops.h" #include #include xfs_buf_t * xfs_buf_read_flags(xfs_buftarg_t *target, xfs_daddr_t blkno, size_t len, int flags) { struct buf *bp; KASSERT((target != NULL), ("got NULL buftarg_t")); if (bread(target->specvp, blkno, BBTOB(len), NOCRED, &bp)) { printf("bread failed specvp %p blkno %qd BBTOB(len) %ld\n", target->specvp, blkno, (long)BBTOB(len)); bp = NULL; } /* not really sure what B_MANAGED really does for us * maybe we should drop this and just stick with a locked buf */ if (flags & B_MANAGED) bp->b_flags |= B_MANAGED; xfs_buf_set_target(bp, target); return (bp); } xfs_buf_t * xfs_buf_get_flags(xfs_buftarg_t *target, xfs_daddr_t blkno, size_t len, int flags) { struct buf *bp = NULL; KASSERT((target != NULL), ("got NULL buftarg_t")); bp = getblk(target->specvp, blkno, BBTOB(len), 0, 0, 0); if (bp != NULL) xfs_buf_set_target(bp, target); return (bp); } xfs_buf_t* xfs_buf_get_empty(size_t size, xfs_buftarg_t *target) { struct buf *bp; - bp = geteblk(0); + bp = geteblk(0, 0); if (bp != NULL) { bp->b_bufsize = size; bp->b_bcount = size; KASSERT(BUF_REFCNT(bp) == 1, ("xfs_buf_get_empty: bp %p not locked",bp)); xfs_buf_set_target(bp, target); } return (bp); } xfs_buf_t* xfs_buf_get_noaddr(size_t len, xfs_buftarg_t *target) { struct buf *bp; if (len >= MAXPHYS) return (NULL); - bp = geteblk(len); + bp = geteblk(len, 0); if (bp != NULL) { KASSERT(BUF_REFCNT(bp) == 1, ("xfs_buf_get_empty: bp %p not locked",bp)); xfs_buf_set_target(bp, target); } return (bp); } void xfs_buf_free(xfs_buf_t *bp) { bp->b_flags |= B_INVAL; BUF_KERNPROC(bp); /* ugly hack #1 */ if (bp->b_kvasize == 0) { bp->b_saveaddr = bp->b_kvabase; /* ugly hack #2 */ bp->b_data = bp->b_saveaddr; bp->b_bcount = 0; bp->b_bufsize = 0; } brelse(bp); } void xfs_buf_readahead( xfs_buftarg_t *target, xfs_daddr_t ioff, size_t isize, xfs_buf_flags_t flags) { daddr_t rablkno; int rabsize; rablkno = ioff; rabsize = BBTOB(isize); breada(target->specvp, &rablkno, &rabsize, 1, NOCRED); } void xfs_buf_set_target(xfs_buf_t *bp, xfs_buftarg_t *targ) { bp->b_bufobj = &targ->specvp->v_bufobj; bp->b_caller1 = targ; } xfs_buftarg_t * xfs_buf_get_target(xfs_buf_t *bp) { return (xfs_buftarg_t *)bp->b_caller1; } int XFS_bwrite(xfs_buf_t *bp) { int error; if (bp->b_vp == NULL) { error = xfs_buf_iorequest(bp); if ((bp->b_flags & B_ASYNC) == 0) { error = bufwait(bp); #if 0 if (BUF_REFCNT(bp) > 1) BUF_UNLOCK(bp); else brelse(bp); #endif brelse(bp); } return (error); } error = bwrite(bp); return (error); } void xfs_buf_pin(xfs_buf_t *bp) { bpin(bp); } void xfs_buf_unpin(xfs_buf_t *bp) { bunpin(bp); } int xfs_buf_ispin(xfs_buf_t *bp) { return bp->b_pin_count; } #if 0 void xfs_buf_wait_unpin( xfs_buf_t *bp) { bunpin_wait(bp); } #endif /* * Move data into or out of a buffer. */ void xfs_buf_iomove( xfs_buf_t *bp, /* buffer to process */ size_t boff, /* starting buffer offset */ size_t bsize, /* length to copy */ caddr_t data, /* data address */ xfs_buf_rw_t mode) /* read/write/zero flag */ { printf("xfs_buf_iomove NI\n"); #ifdef RMC size_t bend, cpoff, csize; struct page *page; bend = boff + bsize; while (boff < bend) { page = bp->b_pages[xfs_buf_btoct(boff + bp->b_offset)]; cpoff = xfs_buf_poff(boff + bp->b_offset); csize = min_t(size_t, PAGE_CACHE_SIZE-cpoff, bp->b_count_desired-boff); ASSERT(((csize + cpoff) <= PAGE_CACHE_SIZE)); switch (mode) { case XBRW_ZERO: memset(page_address(page) + cpoff, 0, csize); break; case XBRW_READ: memcpy(data, page_address(page) + cpoff, csize); break; case XBRW_WRITE: memcpy(page_address(page) + cpoff, data, csize); } boff += csize; data += csize; } #endif } /* * Handling of buffer targets (buftargs). */ /* * Wait for any bufs with callbacks that have been submitted but * have not yet returned... walk the hash list for the target. */ void xfs_wait_buftarg( xfs_buftarg_t *bp) { printf("xfs_wait_buftarg(%p) NI\n", bp); } int xfs_flush_buftarg( xfs_buftarg_t *btp, int wait) { int error = 0; error = vinvalbuf(btp->specvp, V_SAVE|V_NORMAL, curthread, 0, 0); return error; } void xfs_free_buftarg( xfs_buftarg_t *btp, int external) { xfs_flush_buftarg(btp, /* wait */ 0); kmem_free(btp, sizeof(*btp)); } int xfs_readonly_buftarg( xfs_buftarg_t *btp) { struct g_consumer *cp; KASSERT(btp->specvp->v_bufobj.bo_ops == &xfs_bo_ops, ("Bogus xfs_buftarg_t pointer")); cp = btp->specvp->v_bufobj.bo_private; return (cp->acw == 0); } #if 0 void xfs_relse_buftarg( xfs_buftarg_t *btp) { printf("xfs_relse_buftargNI %p\n",btp); } #endif unsigned int xfs_getsize_buftarg( xfs_buftarg_t *btp) { struct g_consumer *cp; cp = btp->specvp->v_bufobj.bo_private; return (cp->provider->sectorsize); } int xfs_setsize_buftarg( xfs_buftarg_t *btp, unsigned int blocksize, unsigned int sectorsize) { printf("xfs_setsize_buftarg NI %p\n",btp); return 0; } xfs_buftarg_t * xfs_alloc_buftarg( struct vnode *bdev, int external) { xfs_buftarg_t *btp; btp = kmem_zalloc(sizeof(*btp), KM_SLEEP); btp->dev = bdev->v_rdev; btp->specvp = bdev; return btp; } Index: stable/7/sys/kern/vfs_bio.c =================================================================== --- stable/7/sys/kern/vfs_bio.c (revision 191812) +++ stable/7/sys/kern/vfs_bio.c (revision 191813) @@ -1,3982 +1,4055 @@ /*- * Copyright (c) 2004 Poul-Henning Kamp * Copyright (c) 1994,1997 John S. Dyson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * this file contains a new buffer I/O scheme implementing a coherent * VM object and buffer cache scheme. Pains have been taken to make * sure that the performance degradation associated with schemes such * as this is not realized. * * Author: John S. Dyson * Significant help during the development and debugging phases * had been provided by David Greenman, also of the FreeBSD core team. * * see man buf(9) for more info. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "opt_directio.h" #include "opt_swap.h" static MALLOC_DEFINE(M_BIOBUF, "biobuf", "BIO buffer"); struct bio_ops bioops; /* I/O operation notification */ struct buf_ops buf_ops_bio = { .bop_name = "buf_ops_bio", .bop_write = bufwrite, .bop_strategy = bufstrategy, .bop_sync = bufsync, .bop_bdflush = bufbdflush, }; /* * XXX buf is global because kern_shutdown.c and ffs_checkoverlap has * carnal knowledge of buffers. This knowledge should be moved to vfs_bio.c. */ struct buf *buf; /* buffer header pool */ static struct proc *bufdaemonproc; static int inmem(struct vnode *vp, daddr_t blkno); static void vm_hold_free_pages(struct buf *bp, vm_offset_t from, vm_offset_t to); static void vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to); static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m); static void vfs_clean_pages(struct buf *bp); static void vfs_setdirty(struct buf *bp); static void vfs_setdirty_locked_object(struct buf *bp); static void vfs_vmio_release(struct buf *bp); static int vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno); -static int flushbufqueues(int, int); +static int buf_do_flush(struct vnode *vp); +static int flushbufqueues(struct vnode *, int, int); static void buf_daemon(void); static void bremfreel(struct buf *bp); int vmiodirenable = TRUE; SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0, "Use the VM system for directory writes"); int runningbufspace; SYSCTL_INT(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0, "Amount of presently outstanding async buffer io"); static int bufspace; SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0, "Virtual memory used for buffers"); static int maxbufspace; SYSCTL_INT(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD, &maxbufspace, 0, "Maximum allowed value of bufspace (including buf_daemon)"); static int bufmallocspace; SYSCTL_INT(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0, "Amount of malloced memory for buffers"); static int maxbufmallocspace; SYSCTL_INT(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, 0, "Maximum amount of malloced memory for buffers"); static int lobufspace; SYSCTL_INT(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD, &lobufspace, 0, "Minimum amount of buffers we want to have"); int hibufspace; SYSCTL_INT(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD, &hibufspace, 0, "Maximum allowed value of bufspace (excluding buf_daemon)"); static int bufreusecnt; SYSCTL_INT(_vfs, OID_AUTO, bufreusecnt, CTLFLAG_RW, &bufreusecnt, 0, "Number of times we have reused a buffer"); static int buffreekvacnt; SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, 0, "Number of times we have freed the KVA space from some buffer"); static int bufdefragcnt; SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, 0, "Number of times we have had to repeat buffer allocation to defragment"); static int lorunningspace; SYSCTL_INT(_vfs, OID_AUTO, lorunningspace, CTLFLAG_RW, &lorunningspace, 0, "Minimum preferred space used for in-progress I/O"); static int hirunningspace; SYSCTL_INT(_vfs, OID_AUTO, hirunningspace, CTLFLAG_RW, &hirunningspace, 0, "Maximum amount of space to use for in-progress I/O"); int dirtybufferflushes; SYSCTL_INT(_vfs, OID_AUTO, dirtybufferflushes, CTLFLAG_RW, &dirtybufferflushes, 0, "Number of bdwrite to bawrite conversions to limit dirty buffers"); int bdwriteskip; SYSCTL_INT(_vfs, OID_AUTO, bdwriteskip, CTLFLAG_RW, &bdwriteskip, 0, "Number of buffers supplied to bdwrite with snapshot deadlock risk"); int altbufferflushes; SYSCTL_INT(_vfs, OID_AUTO, altbufferflushes, CTLFLAG_RW, &altbufferflushes, 0, "Number of fsync flushes to limit dirty buffers"); static int recursiveflushes; SYSCTL_INT(_vfs, OID_AUTO, recursiveflushes, CTLFLAG_RW, &recursiveflushes, 0, "Number of flushes skipped due to being recursive"); static int numdirtybuffers; SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD, &numdirtybuffers, 0, "Number of buffers that are dirty (has unwritten changes) at the moment"); static int lodirtybuffers; SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW, &lodirtybuffers, 0, "How many buffers we want to have free before bufdaemon can sleep"); static int hidirtybuffers; SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW, &hidirtybuffers, 0, "When the number of dirty buffers is considered severe"); int dirtybufthresh; SYSCTL_INT(_vfs, OID_AUTO, dirtybufthresh, CTLFLAG_RW, &dirtybufthresh, 0, "Number of bdwrite to bawrite conversions to clear dirty buffers"); static int numfreebuffers; SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0, "Number of free buffers"); static int lofreebuffers; SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, &lofreebuffers, 0, "XXX Unused"); static int hifreebuffers; SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0, "XXX Complicatedly unused"); static int getnewbufcalls; SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0, "Number of calls to getnewbuf"); static int getnewbufrestarts; SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0, "Number of times getnewbuf has had to restart a buffer aquisition"); +static int flushbufqtarget = -1; +SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0, + "Amount of work to do in flushbufqueues when helping bufdaemon"); /* * Wakeup point for bufdaemon, as well as indicator of whether it is already * active. Set to 1 when the bufdaemon is already "on" the queue, 0 when it * is idling. */ static int bd_request; /* * This lock synchronizes access to bd_request. */ static struct mtx bdlock; /* * bogus page -- for I/O to/from partially complete buffers * this is a temporary solution to the problem, but it is not * really that bad. it would be better to split the buffer * for input in the case of buffers partially already in memory, * but the code is intricate enough already. */ vm_page_t bogus_page; /* * Synchronization (sleep/wakeup) variable for active buffer space requests. * Set when wait starts, cleared prior to wakeup(). * Used in runningbufwakeup() and waitrunningbufspace(). */ static int runningbufreq; /* * This lock protects the runningbufreq and synchronizes runningbufwakeup and * waitrunningbufspace(). */ static struct mtx rbreqlock; /* * Synchronization (sleep/wakeup) variable for buffer requests. * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done * by and/or. * Used in numdirtywakeup(), bufspacewakeup(), bufcountwakeup(), bwillwrite(), * getnewbuf(), and getblk(). */ static int needsbuffer; /* * Lock that protects needsbuffer and the sleeps/wakeups surrounding it. */ static struct mtx nblock; /* * Lock that protects against bwait()/bdone()/B_DONE races. */ static struct mtx bdonelock; /* * Lock that protects against bwait()/bdone()/B_DONE races. */ static struct mtx bpinlock; /* * Definitions for the buffer free lists. */ #define BUFFER_QUEUES 6 /* number of free buffer queues */ #define QUEUE_NONE 0 /* on no queue */ #define QUEUE_CLEAN 1 /* non-B_DELWRI buffers */ #define QUEUE_DIRTY 2 /* B_DELWRI buffers */ #define QUEUE_DIRTY_GIANT 3 /* B_DELWRI buffers that need giant */ #define QUEUE_EMPTYKVA 4 /* empty buffer headers w/KVA assignment */ #define QUEUE_EMPTY 5 /* empty buffer headers */ +#define QUEUE_SENTINEL 1024 /* not an queue index, but mark for sentinel */ /* Queues for free buffers with various properties */ static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } }; /* Lock for the bufqueues */ static struct mtx bqlock; /* * Single global constant for BUF_WMESG, to avoid getting multiple references. * buf_wmesg is referred from macros. */ const char *buf_wmesg = BUF_WMESG; #define VFS_BIO_NEED_ANY 0x01 /* any freeable buffer */ #define VFS_BIO_NEED_DIRTYFLUSH 0x02 /* waiting for dirty buffer flush */ #define VFS_BIO_NEED_FREE 0x04 /* wait for free bufs, hi hysteresis */ #define VFS_BIO_NEED_BUFSPACE 0x08 /* wait for buf space, lo hysteresis */ #ifdef DIRECTIO extern void ffs_rawread_setup(void); #endif /* DIRECTIO */ /* * numdirtywakeup: * * If someone is blocked due to there being too many dirty buffers, * and numdirtybuffers is now reasonable, wake them up. */ static __inline void numdirtywakeup(int level) { if (numdirtybuffers <= level) { mtx_lock(&nblock); if (needsbuffer & VFS_BIO_NEED_DIRTYFLUSH) { needsbuffer &= ~VFS_BIO_NEED_DIRTYFLUSH; wakeup(&needsbuffer); } mtx_unlock(&nblock); } } /* * bufspacewakeup: * * Called when buffer space is potentially available for recovery. * getnewbuf() will block on this flag when it is unable to free * sufficient buffer space. Buffer space becomes recoverable when * bp's get placed back in the queues. */ static __inline void bufspacewakeup(void) { /* * If someone is waiting for BUF space, wake them up. Even * though we haven't freed the kva space yet, the waiting * process will be able to now. */ mtx_lock(&nblock); if (needsbuffer & VFS_BIO_NEED_BUFSPACE) { needsbuffer &= ~VFS_BIO_NEED_BUFSPACE; wakeup(&needsbuffer); } mtx_unlock(&nblock); } /* * runningbufwakeup() - in-progress I/O accounting. * */ void runningbufwakeup(struct buf *bp) { if (bp->b_runningbufspace) { atomic_subtract_int(&runningbufspace, bp->b_runningbufspace); bp->b_runningbufspace = 0; mtx_lock(&rbreqlock); if (runningbufreq && runningbufspace <= lorunningspace) { runningbufreq = 0; wakeup(&runningbufreq); } mtx_unlock(&rbreqlock); } } /* * bufcountwakeup: * * Called when a buffer has been added to one of the free queues to * account for the buffer and to wakeup anyone waiting for free buffers. * This typically occurs when large amounts of metadata are being handled * by the buffer cache ( else buffer space runs out first, usually ). */ static __inline void bufcountwakeup(void) { atomic_add_int(&numfreebuffers, 1); mtx_lock(&nblock); if (needsbuffer) { needsbuffer &= ~VFS_BIO_NEED_ANY; if (numfreebuffers >= hifreebuffers) needsbuffer &= ~VFS_BIO_NEED_FREE; wakeup(&needsbuffer); } mtx_unlock(&nblock); } /* * waitrunningbufspace() * * runningbufspace is a measure of the amount of I/O currently * running. This routine is used in async-write situations to * prevent creating huge backups of pending writes to a device. * Only asynchronous writes are governed by this function. * * Reads will adjust runningbufspace, but will not block based on it. * The read load has a side effect of reducing the allowed write load. * * This does NOT turn an async write into a sync write. It waits * for earlier writes to complete and generally returns before the * caller's write has reached the device. */ void waitrunningbufspace(void) { mtx_lock(&rbreqlock); while (runningbufspace > hirunningspace) { ++runningbufreq; msleep(&runningbufreq, &rbreqlock, PVM, "wdrain", 0); } mtx_unlock(&rbreqlock); } /* * vfs_buf_test_cache: * * Called when a buffer is extended. This function clears the B_CACHE * bit if the newly extended portion of the buffer does not contain * valid data. */ static __inline void vfs_buf_test_cache(struct buf *bp, vm_ooffset_t foff, vm_offset_t off, vm_offset_t size, vm_page_t m) { VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); if (bp->b_flags & B_CACHE) { int base = (foff + off) & PAGE_MASK; if (vm_page_is_valid(m, base, size) == 0) bp->b_flags &= ~B_CACHE; } } /* Wake up the buffer daemon if necessary */ static __inline void bd_wakeup(int dirtybuflevel) { mtx_lock(&bdlock); if (bd_request == 0 && numdirtybuffers >= dirtybuflevel) { bd_request = 1; wakeup(&bd_request); } mtx_unlock(&bdlock); } /* * bd_speedup - speedup the buffer cache flushing code */ static __inline void bd_speedup(void) { bd_wakeup(1); } /* * Calculating buffer cache scaling values and reserve space for buffer * headers. This is called during low level kernel initialization and * may be called more then once. We CANNOT write to the memory area * being reserved at this time. */ caddr_t kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est) { int maxbuf; /* * physmem_est is in pages. Convert it to kilobytes (assumes * PAGE_SIZE is >= 1K) */ physmem_est = physmem_est * (PAGE_SIZE / 1024); /* * The nominal buffer size (and minimum KVA allocation) is BKVASIZE. * For the first 64MB of ram nominally allocate sufficient buffers to * cover 1/4 of our ram. Beyond the first 64MB allocate additional * buffers to cover 1/10 of our ram over 64MB. When auto-sizing * the buffer cache we limit the eventual kva reservation to * maxbcache bytes. * * factor represents the 1/4 x ram conversion. */ if (nbuf == 0) { int factor = 4 * BKVASIZE / 1024; nbuf = 50; if (physmem_est > 4096) nbuf += min((physmem_est - 4096) / factor, 65536 / factor); if (physmem_est > 65536) nbuf += (physmem_est - 65536) * 2 / (factor * 5); if (maxbcache && nbuf > maxbcache / BKVASIZE) nbuf = maxbcache / BKVASIZE; /* XXX Avoid integer overflows later on with maxbufspace. */ maxbuf = (INT_MAX / 3) / BKVASIZE; if (nbuf > maxbuf) nbuf = maxbuf; } #if 0 /* * Do not allow the buffer_map to be more then 1/2 the size of the * kernel_map. */ if (nbuf > (kernel_map->max_offset - kernel_map->min_offset) / (BKVASIZE * 2)) { nbuf = (kernel_map->max_offset - kernel_map->min_offset) / (BKVASIZE * 2); printf("Warning: nbufs capped at %d\n", nbuf); } #endif /* * swbufs are used as temporary holders for I/O, such as paging I/O. * We have no less then 16 and no more then 256. */ nswbuf = max(min(nbuf/4, 256), 16); #ifdef NSWBUF_MIN if (nswbuf < NSWBUF_MIN) nswbuf = NSWBUF_MIN; #endif #ifdef DIRECTIO ffs_rawread_setup(); #endif /* * Reserve space for the buffer cache buffers */ swbuf = (void *)v; v = (caddr_t)(swbuf + nswbuf); buf = (void *)v; v = (caddr_t)(buf + nbuf); return(v); } /* Initialize the buffer subsystem. Called before use of any buffers. */ void bufinit(void) { struct buf *bp; int i; mtx_init(&bqlock, "buf queue lock", NULL, MTX_DEF); mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF); mtx_init(&nblock, "needsbuffer lock", NULL, MTX_DEF); mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF); mtx_init(&bdonelock, "bdone lock", NULL, MTX_DEF); mtx_init(&bpinlock, "bpin lock", NULL, MTX_DEF); /* next, make a null set of free lists */ for (i = 0; i < BUFFER_QUEUES; i++) TAILQ_INIT(&bufqueues[i]); /* finally, initialize each buffer header and stick on empty q */ for (i = 0; i < nbuf; i++) { bp = &buf[i]; bzero(bp, sizeof *bp); bp->b_flags = B_INVAL; /* we're just an empty header */ bp->b_rcred = NOCRED; bp->b_wcred = NOCRED; bp->b_qindex = QUEUE_EMPTY; bp->b_vflags = 0; bp->b_xflags = 0; LIST_INIT(&bp->b_dep); BUF_LOCKINIT(bp); TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist); } /* * maxbufspace is the absolute maximum amount of buffer space we are * allowed to reserve in KVM and in real terms. The absolute maximum * is nominally used by buf_daemon. hibufspace is the nominal maximum * used by most other processes. The differential is required to * ensure that buf_daemon is able to run when other processes might * be blocked waiting for buffer space. * * maxbufspace is based on BKVASIZE. Allocating buffers larger then * this may result in KVM fragmentation which is not handled optimally * by the system. */ maxbufspace = nbuf * BKVASIZE; hibufspace = imax(3 * maxbufspace / 4, maxbufspace - MAXBSIZE * 10); lobufspace = hibufspace - MAXBSIZE; lorunningspace = 512 * 1024; hirunningspace = 1024 * 1024; /* * Limit the amount of malloc memory since it is wired permanently into * the kernel space. Even though this is accounted for in the buffer * allocation, we don't want the malloced region to grow uncontrolled. * The malloc scheme improves memory utilization significantly on average * (small) directories. */ maxbufmallocspace = hibufspace / 20; /* * Reduce the chance of a deadlock occuring by limiting the number * of delayed-write dirty buffers we allow to stack up. */ hidirtybuffers = nbuf / 4 + 20; dirtybufthresh = hidirtybuffers * 9 / 10; numdirtybuffers = 0; /* * To support extreme low-memory systems, make sure hidirtybuffers cannot * eat up all available buffer space. This occurs when our minimum cannot * be met. We try to size hidirtybuffers to 3/4 our buffer space assuming * BKVASIZE'd (8K) buffers. */ while (hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) { hidirtybuffers >>= 1; } lodirtybuffers = hidirtybuffers / 2; /* * Try to keep the number of free buffers in the specified range, * and give special processes (e.g. like buf_daemon) access to an * emergency reserve. */ lofreebuffers = nbuf / 18 + 5; hifreebuffers = 2 * lofreebuffers; numfreebuffers = nbuf; /* * Maximum number of async ops initiated per buf_daemon loop. This is * somewhat of a hack at the moment, we really need to limit ourselves * based on the number of bytes of I/O in-transit that were initiated * from buf_daemon. */ bogus_page = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL | VM_ALLOC_WIRED); } /* * bfreekva() - free the kva allocation for a buffer. * * Since this call frees up buffer space, we call bufspacewakeup(). */ static void bfreekva(struct buf *bp) { if (bp->b_kvasize) { atomic_add_int(&buffreekvacnt, 1); atomic_subtract_int(&bufspace, bp->b_kvasize); vm_map_remove(buffer_map, (vm_offset_t) bp->b_kvabase, (vm_offset_t) bp->b_kvabase + bp->b_kvasize); bp->b_kvasize = 0; bufspacewakeup(); } } /* * bremfree: * * Mark the buffer for removal from the appropriate free list in brelse. * */ void bremfree(struct buf *bp) { CTR3(KTR_BUF, "bremfree(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(BUF_REFCNT(bp), ("bremfree: buf must be locked.")); KASSERT((bp->b_flags & B_REMFREE) == 0, ("bremfree: buffer %p already marked for delayed removal.", bp)); KASSERT(bp->b_qindex != QUEUE_NONE, ("bremfree: buffer %p not on a queue.", bp)); bp->b_flags |= B_REMFREE; /* Fixup numfreebuffers count. */ if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) atomic_subtract_int(&numfreebuffers, 1); } /* * bremfreef: * * Force an immediate removal from a free list. Used only in nfs when * it abuses the b_freelist pointer. */ void bremfreef(struct buf *bp) { mtx_lock(&bqlock); bremfreel(bp); mtx_unlock(&bqlock); } /* * bremfreel: * * Removes a buffer from the free list, must be called with the * bqlock held. */ static void bremfreel(struct buf *bp) { CTR3(KTR_BUF, "bremfreel(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(BUF_REFCNT(bp), ("bremfreel: buffer %p not locked.", bp)); KASSERT(bp->b_qindex != QUEUE_NONE, ("bremfreel: buffer %p not on a queue.", bp)); mtx_assert(&bqlock, MA_OWNED); TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist); bp->b_qindex = QUEUE_NONE; /* * If this was a delayed bremfree() we only need to remove the buffer * from the queue and return the stats are already done. */ if (bp->b_flags & B_REMFREE) { bp->b_flags &= ~B_REMFREE; return; } /* * Fixup numfreebuffers count. If the buffer is invalid or not * delayed-write, the buffer was free and we must decrement * numfreebuffers. */ if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) atomic_subtract_int(&numfreebuffers, 1); } /* * Get a buffer with the specified data. Look in the cache first. We * must clear BIO_ERROR and B_INVAL prior to initiating I/O. If B_CACHE * is set, the buffer is valid and we do not have to do anything ( see * getblk() ). This is really just a special case of breadn(). */ int bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred, struct buf **bpp) { return (breadn(vp, blkno, size, 0, 0, 0, cred, bpp)); } /* * Attempt to initiate asynchronous I/O on read-ahead blocks. We must * clear BIO_ERROR and B_INVAL prior to initiating I/O . If B_CACHE is set, * the buffer is valid and we do not have to do anything. */ void breada(struct vnode * vp, daddr_t * rablkno, int * rabsize, int cnt, struct ucred * cred) { struct buf *rabp; int i; for (i = 0; i < cnt; i++, rablkno++, rabsize++) { if (inmem(vp, *rablkno)) continue; rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0); if ((rabp->b_flags & B_CACHE) == 0) { if (!TD_IS_IDLETHREAD(curthread)) curthread->td_ru.ru_inblock++; rabp->b_flags |= B_ASYNC; rabp->b_flags &= ~B_INVAL; rabp->b_ioflags &= ~BIO_ERROR; rabp->b_iocmd = BIO_READ; if (rabp->b_rcred == NOCRED && cred != NOCRED) rabp->b_rcred = crhold(cred); vfs_busy_pages(rabp, 0); BUF_KERNPROC(rabp); rabp->b_iooffset = dbtob(rabp->b_blkno); bstrategy(rabp); } else { brelse(rabp); } } } /* * Operates like bread, but also starts asynchronous I/O on * read-ahead blocks. */ int breadn(struct vnode * vp, daddr_t blkno, int size, daddr_t * rablkno, int *rabsize, int cnt, struct ucred * cred, struct buf **bpp) { struct buf *bp; int rv = 0, readwait = 0; CTR3(KTR_BUF, "breadn(%p, %jd, %d)", vp, blkno, size); *bpp = bp = getblk(vp, blkno, size, 0, 0, 0); /* if not found in cache, do some I/O */ if ((bp->b_flags & B_CACHE) == 0) { if (!TD_IS_IDLETHREAD(curthread)) curthread->td_ru.ru_inblock++; bp->b_iocmd = BIO_READ; bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; if (bp->b_rcred == NOCRED && cred != NOCRED) bp->b_rcred = crhold(cred); vfs_busy_pages(bp, 0); bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); ++readwait; } breada(vp, rablkno, rabsize, cnt, cred); if (readwait) { rv = bufwait(bp); } return (rv); } /* * Write, release buffer on completion. (Done by iodone * if async). Do not bother writing anything if the buffer * is invalid. * * Note that we set B_CACHE here, indicating that buffer is * fully valid and thus cacheable. This is true even of NFS * now so we set it generally. This could be set either here * or in biodone() since the I/O is synchronous. We put it * here. */ int bufwrite(struct buf *bp) { int oldflags; struct vnode *vp; int vp_md; CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); if (bp->b_flags & B_INVAL) { brelse(bp); return (0); } oldflags = bp->b_flags; if (BUF_REFCNT(bp) == 0) panic("bufwrite: buffer is not busy???"); if (bp->b_pin_count > 0) bunpin_wait(bp); KASSERT(!(bp->b_vflags & BV_BKGRDINPROG), ("FFS background buffer should not get here %p", bp)); vp = bp->b_vp; if (vp) vp_md = vp->v_vflag & VV_MD; else vp_md = 0; /* Mark the buffer clean */ bundirty(bp); bp->b_flags &= ~B_DONE; bp->b_ioflags &= ~BIO_ERROR; bp->b_flags |= B_CACHE; bp->b_iocmd = BIO_WRITE; bufobj_wref(bp->b_bufobj); vfs_busy_pages(bp, 1); /* * Normal bwrites pipeline writes */ bp->b_runningbufspace = bp->b_bufsize; atomic_add_int(&runningbufspace, bp->b_runningbufspace); if (!TD_IS_IDLETHREAD(curthread)) curthread->td_ru.ru_oublock++; if (oldflags & B_ASYNC) BUF_KERNPROC(bp); bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); if ((oldflags & B_ASYNC) == 0) { int rtval = bufwait(bp); brelse(bp); return (rtval); } else { /* * don't allow the async write to saturate the I/O * system. We will not deadlock here because * we are blocking waiting for I/O that is already in-progress * to complete. We do not block here if it is the update * or syncer daemon trying to clean up as that can lead * to deadlock. */ if ((curthread->td_pflags & TDP_NORUNNINGBUF) == 0 && !vp_md) waitrunningbufspace(); } return (0); } void bufbdflush(struct bufobj *bo, struct buf *bp) { struct buf *nbp; if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10) { (void) VOP_FSYNC(bp->b_vp, MNT_NOWAIT, curthread); altbufferflushes++; } else if (bo->bo_dirty.bv_cnt > dirtybufthresh) { BO_LOCK(bo); /* * Try to find a buffer to flush. */ TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) { if ((nbp->b_vflags & BV_BKGRDINPROG) || BUF_LOCK(nbp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) continue; if (bp == nbp) panic("bdwrite: found ourselves"); BO_UNLOCK(bo); /* Don't countdeps with the bo lock held. */ if (buf_countdeps(nbp, 0)) { BO_LOCK(bo); BUF_UNLOCK(nbp); continue; } if (nbp->b_flags & B_CLUSTEROK) { vfs_bio_awrite(nbp); } else { bremfree(nbp); bawrite(nbp); } dirtybufferflushes++; break; } if (nbp == NULL) BO_UNLOCK(bo); } } /* * Delayed write. (Buffer is marked dirty). Do not bother writing * anything if the buffer is marked invalid. * * Note that since the buffer must be completely valid, we can safely * set B_CACHE. In fact, we have to set B_CACHE here rather then in * biodone() in order to prevent getblk from writing the buffer * out synchronously. */ void bdwrite(struct buf *bp) { struct thread *td = curthread; struct vnode *vp; struct bufobj *bo; CTR3(KTR_BUF, "bdwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); KASSERT(BUF_REFCNT(bp) != 0, ("bdwrite: buffer is not busy")); if (bp->b_flags & B_INVAL) { brelse(bp); return; } /* * If we have too many dirty buffers, don't create any more. * If we are wildly over our limit, then force a complete * cleanup. Otherwise, just keep the situation from getting * out of control. Note that we have to avoid a recursive * disaster and not try to clean up after our own cleanup! */ vp = bp->b_vp; bo = bp->b_bufobj; if ((td->td_pflags & (TDP_COWINPROGRESS|TDP_INBDFLUSH)) == 0) { td->td_pflags |= TDP_INBDFLUSH; BO_BDFLUSH(bo, bp); td->td_pflags &= ~TDP_INBDFLUSH; } else recursiveflushes++; bdirty(bp); /* * Set B_CACHE, indicating that the buffer is fully valid. This is * true even of NFS now. */ bp->b_flags |= B_CACHE; /* * This bmap keeps the system from needing to do the bmap later, * perhaps when the system is attempting to do a sync. Since it * is likely that the indirect block -- or whatever other datastructure * that the filesystem needs is still in memory now, it is a good * thing to do this. Note also, that if the pageout daemon is * requesting a sync -- there might not be enough memory to do * the bmap then... So, this is important to do. */ if (vp->v_type != VCHR && bp->b_lblkno == bp->b_blkno) { VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); } /* * Set the *dirty* buffer range based upon the VM system dirty pages. */ vfs_setdirty(bp); /* * We need to do this here to satisfy the vnode_pager and the * pageout daemon, so that it thinks that the pages have been * "cleaned". Note that since the pages are in a delayed write * buffer -- the VFS layer "will" see that the pages get written * out on the next sync, or perhaps the cluster will be completed. */ vfs_clean_pages(bp); bqrelse(bp); /* * Wakeup the buffer flushing daemon if we have a lot of dirty * buffers (midpoint between our recovery point and our stall * point). */ bd_wakeup((lodirtybuffers + hidirtybuffers) / 2); /* * note: we cannot initiate I/O from a bdwrite even if we wanted to, * due to the softdep code. */ } /* * bdirty: * * Turn buffer into delayed write request. We must clear BIO_READ and * B_RELBUF, and we must set B_DELWRI. We reassign the buffer to * itself to properly update it in the dirty/clean lists. We mark it * B_DONE to ensure that any asynchronization of the buffer properly * clears B_DONE ( else a panic will occur later ). * * bdirty() is kinda like bdwrite() - we have to clear B_INVAL which * might have been set pre-getblk(). Unlike bwrite/bdwrite, bdirty() * should only be called if the buffer is known-good. * * Since the buffer is not on a queue, we do not update the numfreebuffers * count. * * The buffer must be on QUEUE_NONE. */ void bdirty(struct buf *bp) { CTR3(KTR_BUF, "bdirty(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(BUF_REFCNT(bp) == 1, ("bdirty: bp %p not locked",bp)); KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE, ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex)); bp->b_flags &= ~(B_RELBUF); bp->b_iocmd = BIO_WRITE; if ((bp->b_flags & B_DELWRI) == 0) { bp->b_flags |= /* XXX B_DONE | */ B_DELWRI; reassignbuf(bp); atomic_add_int(&numdirtybuffers, 1); bd_wakeup((lodirtybuffers + hidirtybuffers) / 2); } } /* * bundirty: * * Clear B_DELWRI for buffer. * * Since the buffer is not on a queue, we do not update the numfreebuffers * count. * * The buffer must be on QUEUE_NONE. */ void bundirty(struct buf *bp) { CTR3(KTR_BUF, "bundirty(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE, ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex)); KASSERT(BUF_REFCNT(bp) == 1, ("bundirty: bp %p not locked",bp)); if (bp->b_flags & B_DELWRI) { bp->b_flags &= ~B_DELWRI; reassignbuf(bp); atomic_subtract_int(&numdirtybuffers, 1); numdirtywakeup(lodirtybuffers); } /* * Since it is now being written, we can clear its deferred write flag. */ bp->b_flags &= ~B_DEFERRED; } /* * bawrite: * * Asynchronous write. Start output on a buffer, but do not wait for * it to complete. The buffer is released when the output completes. * * bwrite() ( or the VOP routine anyway ) is responsible for handling * B_INVAL buffers. Not us. */ void bawrite(struct buf *bp) { bp->b_flags |= B_ASYNC; (void) bwrite(bp); } /* * bwillwrite: * * Called prior to the locking of any vnodes when we are expecting to * write. We do not want to starve the buffer cache with too many * dirty buffers so we block here. By blocking prior to the locking * of any vnodes we attempt to avoid the situation where a locked vnode * prevents the various system daemons from flushing related buffers. */ void bwillwrite(void) { if (numdirtybuffers >= hidirtybuffers) { mtx_lock(&nblock); while (numdirtybuffers >= hidirtybuffers) { bd_wakeup(1); needsbuffer |= VFS_BIO_NEED_DIRTYFLUSH; msleep(&needsbuffer, &nblock, (PRIBIO + 4), "flswai", 0); } mtx_unlock(&nblock); } } /* * Return true if we have too many dirty buffers. */ int buf_dirty_count_severe(void) { return(numdirtybuffers >= hidirtybuffers); } /* * brelse: * * Release a busy buffer and, if requested, free its resources. The * buffer will be stashed in the appropriate bufqueue[] allowing it * to be accessed later as a cache entity or reused for other purposes. */ void brelse(struct buf *bp) { CTR3(KTR_BUF, "brelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); if (bp->b_flags & B_MANAGED) { bqrelse(bp); return; } if (bp->b_iocmd == BIO_WRITE && (bp->b_ioflags & BIO_ERROR) && bp->b_error != ENXIO && !(bp->b_flags & B_INVAL)) { /* * Failed write, redirty. Must clear BIO_ERROR to prevent * pages from being scrapped. If B_INVAL is set then * this case is not run and the next case is run to * destroy the buffer. B_INVAL can occur if the buffer * is outside the range supported by the underlying device. * If the error is that the device went away (ENXIO), we * shouldn't redirty the buffer either, but discard the * data too. */ bp->b_ioflags &= ~BIO_ERROR; bdirty(bp); } else if ((bp->b_flags & (B_NOCACHE | B_INVAL)) || (bp->b_ioflags & BIO_ERROR) || (bp->b_bufsize <= 0)) { /* * Either a failed I/O or we were asked to free or not * cache the buffer. */ bp->b_flags |= B_INVAL; if (!LIST_EMPTY(&bp->b_dep)) buf_deallocate(bp); if (bp->b_flags & B_DELWRI) { atomic_subtract_int(&numdirtybuffers, 1); numdirtywakeup(lodirtybuffers); } bp->b_flags &= ~(B_DELWRI | B_CACHE); if ((bp->b_flags & B_VMIO) == 0) { if (bp->b_bufsize) allocbuf(bp, 0); if (bp->b_vp) brelvp(bp); } } /* * We must clear B_RELBUF if B_DELWRI is set. If vfs_vmio_release() * is called with B_DELWRI set, the underlying pages may wind up * getting freed causing a previous write (bdwrite()) to get 'lost' * because pages associated with a B_DELWRI bp are marked clean. * * We still allow the B_INVAL case to call vfs_vmio_release(), even * if B_DELWRI is set. * * If B_DELWRI is not set we may have to set B_RELBUF if we are low * on pages to return pages to the VM page queues. */ if (bp->b_flags & B_DELWRI) bp->b_flags &= ~B_RELBUF; else if (vm_page_count_severe()) { /* * XXX This lock may not be necessary since BKGRDINPROG * cannot be set while we hold the buf lock, it can only be * cleared if it is already pending. */ if (bp->b_vp) { BO_LOCK(bp->b_bufobj); if (!(bp->b_vflags & BV_BKGRDINPROG)) bp->b_flags |= B_RELBUF; BO_UNLOCK(bp->b_bufobj); } else bp->b_flags |= B_RELBUF; } /* * VMIO buffer rundown. It is not very necessary to keep a VMIO buffer * constituted, not even NFS buffers now. Two flags effect this. If * B_INVAL, the struct buf is invalidated but the VM object is kept * around ( i.e. so it is trivial to reconstitute the buffer later ). * * If BIO_ERROR or B_NOCACHE is set, pages in the VM object will be * invalidated. BIO_ERROR cannot be set for a failed write unless the * buffer is also B_INVAL because it hits the re-dirtying code above. * * Normally we can do this whether a buffer is B_DELWRI or not. If * the buffer is an NFS buffer, it is tracking piecemeal writes or * the commit state and we cannot afford to lose the buffer. If the * buffer has a background write in progress, we need to keep it * around to prevent it from being reconstituted and starting a second * background write. */ if ((bp->b_flags & B_VMIO) && !(bp->b_vp->v_mount != NULL && (bp->b_vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 && !vn_isdisk(bp->b_vp, NULL) && (bp->b_flags & B_DELWRI)) ) { int i, j, resid; vm_page_t m; off_t foff; vm_pindex_t poff; vm_object_t obj; obj = bp->b_bufobj->bo_object; /* * Get the base offset and length of the buffer. Note that * in the VMIO case if the buffer block size is not * page-aligned then b_data pointer may not be page-aligned. * But our b_pages[] array *IS* page aligned. * * block sizes less then DEV_BSIZE (usually 512) are not * supported due to the page granularity bits (m->valid, * m->dirty, etc...). * * See man buf(9) for more information */ resid = bp->b_bufsize; foff = bp->b_offset; VM_OBJECT_LOCK(obj); for (i = 0; i < bp->b_npages; i++) { int had_bogus = 0; m = bp->b_pages[i]; /* * If we hit a bogus page, fixup *all* the bogus pages * now. */ if (m == bogus_page) { poff = OFF_TO_IDX(bp->b_offset); had_bogus = 1; for (j = i; j < bp->b_npages; j++) { vm_page_t mtmp; mtmp = bp->b_pages[j]; if (mtmp == bogus_page) { mtmp = vm_page_lookup(obj, poff + j); if (!mtmp) { panic("brelse: page missing\n"); } bp->b_pages[j] = mtmp; } } if ((bp->b_flags & B_INVAL) == 0) { pmap_qenter( trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } m = bp->b_pages[i]; } if ((bp->b_flags & B_NOCACHE) || (bp->b_ioflags & BIO_ERROR)) { int poffset = foff & PAGE_MASK; int presid = resid > (PAGE_SIZE - poffset) ? (PAGE_SIZE - poffset) : resid; KASSERT(presid >= 0, ("brelse: extra page")); vm_page_lock_queues(); vm_page_set_invalid(m, poffset, presid); vm_page_unlock_queues(); if (had_bogus) printf("avoided corruption bug in bogus_page/brelse code\n"); } resid -= PAGE_SIZE - (foff & PAGE_MASK); foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; } VM_OBJECT_UNLOCK(obj); if (bp->b_flags & (B_INVAL | B_RELBUF)) vfs_vmio_release(bp); } else if (bp->b_flags & B_VMIO) { if (bp->b_flags & (B_INVAL | B_RELBUF)) { vfs_vmio_release(bp); } } else if ((bp->b_flags & (B_INVAL | B_RELBUF)) != 0) { if (bp->b_bufsize != 0) allocbuf(bp, 0); if (bp->b_vp != NULL) brelvp(bp); } if (BUF_REFCNT(bp) > 1) { /* do not release to free list */ BUF_UNLOCK(bp); return; } /* enqueue */ mtx_lock(&bqlock); /* Handle delayed bremfree() processing. */ if (bp->b_flags & B_REMFREE) bremfreel(bp); if (bp->b_qindex != QUEUE_NONE) panic("brelse: free buffer onto another queue???"); /* buffers with no memory */ if (bp->b_bufsize == 0) { bp->b_flags |= B_INVAL; bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA); if (bp->b_vflags & BV_BKGRDINPROG) panic("losing buffer 1"); if (bp->b_kvasize) { bp->b_qindex = QUEUE_EMPTYKVA; } else { bp->b_qindex = QUEUE_EMPTY; } TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist); /* buffers with junk contents */ } else if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) || (bp->b_ioflags & BIO_ERROR)) { bp->b_flags |= B_INVAL; bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA); if (bp->b_vflags & BV_BKGRDINPROG) panic("losing buffer 2"); bp->b_qindex = QUEUE_CLEAN; TAILQ_INSERT_HEAD(&bufqueues[QUEUE_CLEAN], bp, b_freelist); /* remaining buffers */ } else { if ((bp->b_flags & (B_DELWRI|B_NEEDSGIANT)) == (B_DELWRI|B_NEEDSGIANT)) bp->b_qindex = QUEUE_DIRTY_GIANT; else if (bp->b_flags & B_DELWRI) bp->b_qindex = QUEUE_DIRTY; else bp->b_qindex = QUEUE_CLEAN; if (bp->b_flags & B_AGE) TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist); else TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist); } mtx_unlock(&bqlock); /* * If B_INVAL and B_DELWRI is set, clear B_DELWRI. We have already * placed the buffer on the correct queue. We must also disassociate * the device and vnode for a B_INVAL buffer so gbincore() doesn't * find it. */ if (bp->b_flags & B_INVAL) { if (bp->b_flags & B_DELWRI) bundirty(bp); if (bp->b_vp) brelvp(bp); } /* * Fixup numfreebuffers count. The bp is on an appropriate queue * unless locked. We then bump numfreebuffers if it is not B_DELWRI. * We've already handled the B_INVAL case ( B_DELWRI will be clear * if B_INVAL is set ). */ if (!(bp->b_flags & B_DELWRI)) bufcountwakeup(); /* * Something we can maybe free or reuse */ if (bp->b_bufsize || bp->b_kvasize) bufspacewakeup(); bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | B_DIRECT); if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY)) panic("brelse: not dirty"); /* unlock */ BUF_UNLOCK(bp); } /* * Release a buffer back to the appropriate queue but do not try to free * it. The buffer is expected to be used again soon. * * bqrelse() is used by bdwrite() to requeue a delayed write, and used by * biodone() to requeue an async I/O on completion. It is also used when * known good buffers need to be requeued but we think we may need the data * again soon. * * XXX we should be able to leave the B_RELBUF hint set on completion. */ void bqrelse(struct buf *bp) { CTR3(KTR_BUF, "bqrelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); if (BUF_REFCNT(bp) > 1) { /* do not release to free list */ BUF_UNLOCK(bp); return; } if (bp->b_flags & B_MANAGED) { if (bp->b_flags & B_REMFREE) { mtx_lock(&bqlock); bremfreel(bp); mtx_unlock(&bqlock); } bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); BUF_UNLOCK(bp); return; } mtx_lock(&bqlock); /* Handle delayed bremfree() processing. */ if (bp->b_flags & B_REMFREE) bremfreel(bp); if (bp->b_qindex != QUEUE_NONE) panic("bqrelse: free buffer onto another queue???"); /* buffers with stale but valid contents */ if (bp->b_flags & B_DELWRI) { if (bp->b_flags & B_NEEDSGIANT) bp->b_qindex = QUEUE_DIRTY_GIANT; else bp->b_qindex = QUEUE_DIRTY; TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist); } else { /* * XXX This lock may not be necessary since BKGRDINPROG * cannot be set while we hold the buf lock, it can only be * cleared if it is already pending. */ BO_LOCK(bp->b_bufobj); if (!vm_page_count_severe() || bp->b_vflags & BV_BKGRDINPROG) { BO_UNLOCK(bp->b_bufobj); bp->b_qindex = QUEUE_CLEAN; TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp, b_freelist); } else { /* * We are too low on memory, we have to try to free * the buffer (most importantly: the wired pages * making up its backing store) *now*. */ BO_UNLOCK(bp->b_bufobj); mtx_unlock(&bqlock); brelse(bp); return; } } mtx_unlock(&bqlock); if ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI)) bufcountwakeup(); /* * Something we can maybe free or reuse. */ if (bp->b_bufsize && !(bp->b_flags & B_DELWRI)) bufspacewakeup(); bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY)) panic("bqrelse: not dirty"); /* unlock */ BUF_UNLOCK(bp); } /* Give pages used by the bp back to the VM system (where possible) */ static void vfs_vmio_release(struct buf *bp) { int i; vm_page_t m; VM_OBJECT_LOCK(bp->b_bufobj->bo_object); vm_page_lock_queues(); for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; bp->b_pages[i] = NULL; /* * In order to keep page LRU ordering consistent, put * everything on the inactive queue. */ vm_page_unwire(m, 0); /* * We don't mess with busy pages, it is * the responsibility of the process that * busied the pages to deal with them. */ if ((m->oflags & VPO_BUSY) || (m->busy != 0)) continue; if (m->wire_count == 0) { /* * Might as well free the page if we can and it has * no valid data. We also free the page if the * buffer was used for direct I/O */ if ((bp->b_flags & B_ASYNC) == 0 && !m->valid && m->hold_count == 0) { vm_page_free(m); } else if (bp->b_flags & B_DIRECT) { vm_page_try_to_free(m); } else if (vm_page_count_severe()) { vm_page_try_to_cache(m); } } } vm_page_unlock_queues(); VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object); pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages); if (bp->b_bufsize) { bufspacewakeup(); bp->b_bufsize = 0; } bp->b_npages = 0; bp->b_flags &= ~B_VMIO; if (bp->b_vp) brelvp(bp); } /* * Check to see if a block at a particular lbn is available for a clustered * write. */ static int vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno) { struct buf *bpa; int match; match = 0; /* If the buf isn't in core skip it */ if ((bpa = gbincore(&vp->v_bufobj, lblkno)) == NULL) return (0); /* If the buf is busy we don't want to wait for it */ if (BUF_LOCK(bpa, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) return (0); /* Only cluster with valid clusterable delayed write buffers */ if ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) != (B_DELWRI | B_CLUSTEROK)) goto done; if (bpa->b_bufsize != size) goto done; /* * Check to see if it is in the expected place on disk and that the * block has been mapped. */ if ((bpa->b_blkno != bpa->b_lblkno) && (bpa->b_blkno == blkno)) match = 1; done: BUF_UNLOCK(bpa); return (match); } /* * vfs_bio_awrite: * * Implement clustered async writes for clearing out B_DELWRI buffers. * This is much better then the old way of writing only one buffer at * a time. Note that we may not be presented with the buffers in the * correct order, so we search for the cluster in both directions. */ int vfs_bio_awrite(struct buf *bp) { int i; int j; daddr_t lblkno = bp->b_lblkno; struct vnode *vp = bp->b_vp; int ncl; int nwritten; int size; int maxcl; /* * right now we support clustered writing only to regular files. If * we find a clusterable block we could be in the middle of a cluster * rather then at the beginning. */ if ((vp->v_type == VREG) && (vp->v_mount != 0) && /* Only on nodes that have the size info */ (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) { size = vp->v_mount->mnt_stat.f_iosize; maxcl = MAXPHYS / size; VI_LOCK(vp); for (i = 1; i < maxcl; i++) if (vfs_bio_clcheck(vp, size, lblkno + i, bp->b_blkno + ((i * size) >> DEV_BSHIFT)) == 0) break; for (j = 1; i + j <= maxcl && j <= lblkno; j++) if (vfs_bio_clcheck(vp, size, lblkno - j, bp->b_blkno - ((j * size) >> DEV_BSHIFT)) == 0) break; VI_UNLOCK(vp); --j; ncl = i + j; /* * this is a possible cluster write */ if (ncl != 1) { BUF_UNLOCK(bp); nwritten = cluster_wbuild(vp, size, lblkno - j, ncl); return nwritten; } } bremfree(bp); bp->b_flags |= B_ASYNC; /* * default (old) behavior, writing out only one block * * XXX returns b_bufsize instead of b_bcount for nwritten? */ nwritten = bp->b_bufsize; (void) bwrite(bp); return nwritten; } /* * getnewbuf: * * Find and initialize a new buffer header, freeing up existing buffers * in the bufqueues as necessary. The new buffer is returned locked. * * Important: B_INVAL is not set. If the caller wishes to throw the * buffer away, the caller must set B_INVAL prior to calling brelse(). * * We block if: * We have insufficient buffer headers * We have insufficient buffer space * buffer_map is too fragmented ( space reservation fails ) * If we have to flush dirty buffers ( but we try to avoid this ) * * To avoid VFS layer recursion we do not flush dirty buffers ourselves. * Instead we ask the buf daemon to do it for us. We attempt to * avoid piecemeal wakeups of the pageout daemon. */ static struct buf * -getnewbuf(int slpflag, int slptimeo, int size, int maxsize) +getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int size, int maxsize, + int gbflags) { + struct thread *td; struct buf *bp; struct buf *nbp; int defrag = 0; int nqindex; static int flushingbufs; + td = curthread; /* * We can't afford to block since we might be holding a vnode lock, * which may prevent system daemons from running. We deal with * low-memory situations by proactively returning memory and running * async I/O rather then sync I/O. */ - atomic_add_int(&getnewbufcalls, 1); atomic_subtract_int(&getnewbufrestarts, 1); restart: atomic_add_int(&getnewbufrestarts, 1); /* * Setup for scan. If we do not have enough free buffers, * we setup a degenerate case that immediately fails. Note * that if we are specially marked process, we are allowed to * dip into our reserves. * * The scanning sequence is nominally: EMPTY->EMPTYKVA->CLEAN * * We start with EMPTYKVA. If the list is empty we backup to EMPTY. * However, there are a number of cases (defragging, reusing, ...) * where we cannot backup. */ mtx_lock(&bqlock); nqindex = QUEUE_EMPTYKVA; nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]); if (nbp == NULL) { /* * If no EMPTYKVA buffers and we are either * defragging or reusing, locate a CLEAN buffer * to free or reuse. If bufspace useage is low * skip this step so we can allocate a new buffer. */ if (defrag || bufspace >= lobufspace) { nqindex = QUEUE_CLEAN; nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]); } /* * If we could not find or were not allowed to reuse a * CLEAN buffer, check to see if it is ok to use an EMPTY * buffer. We can only use an EMPTY buffer if allocating * its KVA would not otherwise run us out of buffer space. */ if (nbp == NULL && defrag == 0 && bufspace + maxsize < hibufspace) { nqindex = QUEUE_EMPTY; nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]); } } /* * Run scan, possibly freeing data and/or kva mappings on the fly * depending. */ while ((bp = nbp) != NULL) { int qindex = nqindex; /* * Calculate next bp ( we can only use it if we do not block * or do other fancy things ). */ if ((nbp = TAILQ_NEXT(bp, b_freelist)) == NULL) { switch(qindex) { case QUEUE_EMPTY: nqindex = QUEUE_EMPTYKVA; if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]))) break; /* FALLTHROUGH */ case QUEUE_EMPTYKVA: nqindex = QUEUE_CLEAN; if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]))) break; /* FALLTHROUGH */ case QUEUE_CLEAN: /* * nbp is NULL. */ break; } } /* * If we are defragging then we need a buffer with * b_kvasize != 0. XXX this situation should no longer * occur, if defrag is non-zero the buffer's b_kvasize * should also be non-zero at this point. XXX */ if (defrag && bp->b_kvasize == 0) { printf("Warning: defrag empty buffer %p\n", bp); continue; } /* * Start freeing the bp. This is somewhat involved. nbp * remains valid only for QUEUE_EMPTY[KVA] bp's. */ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) continue; if (bp->b_vp) { BO_LOCK(bp->b_bufobj); if (bp->b_vflags & BV_BKGRDINPROG) { BO_UNLOCK(bp->b_bufobj); BUF_UNLOCK(bp); continue; } BO_UNLOCK(bp->b_bufobj); } CTR6(KTR_BUF, "getnewbuf(%p) vp %p flags %X kvasize %d bufsize %d " "queue %d (recycling)", bp, bp->b_vp, bp->b_flags, bp->b_kvasize, bp->b_bufsize, qindex); /* * Sanity Checks */ KASSERT(bp->b_qindex == qindex, ("getnewbuf: inconsistant queue %d bp %p", qindex, bp)); /* * Note: we no longer distinguish between VMIO and non-VMIO * buffers. */ KASSERT((bp->b_flags & B_DELWRI) == 0, ("delwri buffer %p found in queue %d", bp, qindex)); bremfreel(bp); mtx_unlock(&bqlock); if (qindex == QUEUE_CLEAN) { if (bp->b_flags & B_VMIO) { bp->b_flags &= ~B_ASYNC; vfs_vmio_release(bp); } if (bp->b_vp) brelvp(bp); } /* * NOTE: nbp is now entirely invalid. We can only restart * the scan from this point on. * * Get the rest of the buffer freed up. b_kva* is still * valid after this operation. */ if (bp->b_rcred != NOCRED) { crfree(bp->b_rcred); bp->b_rcred = NOCRED; } if (bp->b_wcred != NOCRED) { crfree(bp->b_wcred); bp->b_wcred = NOCRED; } if (!LIST_EMPTY(&bp->b_dep)) buf_deallocate(bp); if (bp->b_vflags & BV_BKGRDINPROG) panic("losing buffer 3"); KASSERT(bp->b_vp == NULL, ("bp: %p still has vnode %p. qindex: %d", bp, bp->b_vp, qindex)); KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0, ("bp: %p still on a buffer list. xflags %X", bp, bp->b_xflags)); if (bp->b_bufsize) allocbuf(bp, 0); bp->b_flags = 0; bp->b_ioflags = 0; bp->b_xflags = 0; bp->b_vflags = 0; bp->b_vp = NULL; bp->b_blkno = bp->b_lblkno = 0; bp->b_offset = NOOFFSET; bp->b_iodone = 0; bp->b_error = 0; bp->b_resid = 0; bp->b_bcount = 0; bp->b_npages = 0; bp->b_dirtyoff = bp->b_dirtyend = 0; bp->b_bufobj = NULL; bp->b_pin_count = 0; bp->b_fsprivate1 = NULL; bp->b_fsprivate2 = NULL; bp->b_fsprivate3 = NULL; LIST_INIT(&bp->b_dep); /* * If we are defragging then free the buffer. */ if (defrag) { bp->b_flags |= B_INVAL; bfreekva(bp); brelse(bp); defrag = 0; goto restart; } /* * Notify any waiters for the buffer lock about * identity change by freeing the buffer. */ if (qindex == QUEUE_CLEAN && BUF_LOCKWAITERS(bp) > 0) { bp->b_flags |= B_INVAL; bfreekva(bp); brelse(bp); goto restart; } /* * If we are overcomitted then recover the buffer and its * KVM space. This occurs in rare situations when multiple * processes are blocked in getnewbuf() or allocbuf(). */ if (bufspace >= hibufspace) flushingbufs = 1; if (flushingbufs && bp->b_kvasize != 0) { bp->b_flags |= B_INVAL; bfreekva(bp); brelse(bp); goto restart; } if (bufspace < lobufspace) flushingbufs = 0; break; } /* * If we exhausted our list, sleep as appropriate. We may have to * wakeup various daemons and write out some dirty buffers. * * Generally we are sleeping due to insufficient buffer space. */ if (bp == NULL) { - int flags; + int flags, norunbuf; char *waitmsg; + int fl; if (defrag) { flags = VFS_BIO_NEED_BUFSPACE; waitmsg = "nbufkv"; } else if (bufspace >= hibufspace) { waitmsg = "nbufbs"; flags = VFS_BIO_NEED_BUFSPACE; } else { waitmsg = "newbuf"; flags = VFS_BIO_NEED_ANY; } mtx_lock(&nblock); needsbuffer |= flags; mtx_unlock(&nblock); mtx_unlock(&bqlock); bd_speedup(); /* heeeelp */ + if (gbflags & GB_NOWAIT_BD) + return (NULL); mtx_lock(&nblock); while (needsbuffer & flags) { + if (vp != NULL && (td->td_pflags & TDP_BUFNEED) == 0) { + mtx_unlock(&nblock); + /* + * getblk() is called with a vnode + * locked, and some majority of the + * dirty buffers may as well belong to + * the vnode. Flushing the buffers + * there would make a progress that + * cannot be achieved by the + * buf_daemon, that cannot lock the + * vnode. + */ + norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) | + (td->td_pflags & TDP_NORUNNINGBUF); + /* play bufdaemon */ + td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF; + fl = buf_do_flush(vp); + td->td_pflags &= norunbuf; + mtx_lock(&nblock); + if (fl != 0) + continue; + if ((needsbuffer & flags) == 0) + break; + } if (msleep(&needsbuffer, &nblock, (PRIBIO + 4) | slpflag, waitmsg, slptimeo)) { mtx_unlock(&nblock); return (NULL); } } mtx_unlock(&nblock); } else { /* * We finally have a valid bp. We aren't quite out of the * woods, we still have to reserve kva space. In order * to keep fragmentation sane we only allocate kva in * BKVASIZE chunks. */ maxsize = (maxsize + BKVAMASK) & ~BKVAMASK; if (maxsize != bp->b_kvasize) { vm_offset_t addr = 0; bfreekva(bp); vm_map_lock(buffer_map); if (vm_map_findspace(buffer_map, vm_map_min(buffer_map), maxsize, &addr)) { /* * Uh oh. Buffer map is to fragmented. We * must defragment the map. */ atomic_add_int(&bufdefragcnt, 1); vm_map_unlock(buffer_map); defrag = 1; bp->b_flags |= B_INVAL; brelse(bp); goto restart; } if (addr) { vm_map_insert(buffer_map, NULL, 0, addr, addr + maxsize, VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT); bp->b_kvabase = (caddr_t) addr; bp->b_kvasize = maxsize; atomic_add_int(&bufspace, bp->b_kvasize); atomic_add_int(&bufreusecnt, 1); } vm_map_unlock(buffer_map); } bp->b_saveaddr = bp->b_kvabase; bp->b_data = bp->b_saveaddr; } return(bp); } /* * buf_daemon: * * buffer flushing daemon. Buffers are normally flushed by the * update daemon but if it cannot keep up this process starts to * take the load in an attempt to prevent getnewbuf() from blocking. */ static struct kproc_desc buf_kp = { "bufdaemon", buf_daemon, &bufdaemonproc }; SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp); +static int +buf_do_flush(struct vnode *vp) +{ + int flushed; + + flushed = flushbufqueues(vp, QUEUE_DIRTY, 0); + /* The list empty check here is slightly racy */ + if (!TAILQ_EMPTY(&bufqueues[QUEUE_DIRTY_GIANT])) { + mtx_lock(&Giant); + flushed += flushbufqueues(vp, QUEUE_DIRTY_GIANT, 0); + mtx_unlock(&Giant); + } + if (flushed == 0) { + /* + * Could not find any buffers without rollback + * dependencies, so just write the first one + * in the hopes of eventually making progress. + */ + flushbufqueues(vp, QUEUE_DIRTY, 1); + if (!TAILQ_EMPTY( + &bufqueues[QUEUE_DIRTY_GIANT])) { + mtx_lock(&Giant); + flushbufqueues(vp, QUEUE_DIRTY_GIANT, 1); + mtx_unlock(&Giant); + } + } + return (flushed); +} + static void buf_daemon() { /* * This process needs to be suspended prior to shutdown sync. */ EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, bufdaemonproc, SHUTDOWN_PRI_LAST); /* * This process is allowed to take the buffer cache to the limit */ - curthread->td_pflags |= TDP_NORUNNINGBUF; + curthread->td_pflags |= TDP_NORUNNINGBUF | TDP_BUFNEED; mtx_lock(&bdlock); for (;;) { bd_request = 0; mtx_unlock(&bdlock); kthread_suspend_check(bufdaemonproc); /* * Do the flush. Limit the amount of in-transit I/O we * allow to build up, otherwise we would completely saturate * the I/O system. Wakeup any waiting processes before we * normally would so they can run in parallel with our drain. */ while (numdirtybuffers > lodirtybuffers) { - int flushed; - - flushed = flushbufqueues(QUEUE_DIRTY, 0); - /* The list empty check here is slightly racy */ - if (!TAILQ_EMPTY(&bufqueues[QUEUE_DIRTY_GIANT])) { - mtx_lock(&Giant); - flushed += flushbufqueues(QUEUE_DIRTY_GIANT, 0); - mtx_unlock(&Giant); - } - if (flushed == 0) { - /* - * Could not find any buffers without rollback - * dependencies, so just write the first one - * in the hopes of eventually making progress. - */ - flushbufqueues(QUEUE_DIRTY, 1); - if (!TAILQ_EMPTY( - &bufqueues[QUEUE_DIRTY_GIANT])) { - mtx_lock(&Giant); - flushbufqueues(QUEUE_DIRTY_GIANT, 1); - mtx_unlock(&Giant); - } + if (buf_do_flush(NULL) == 0) break; - } uio_yield(); } /* * Only clear bd_request if we have reached our low water * mark. The buf_daemon normally waits 1 second and * then incrementally flushes any dirty buffers that have * built up, within reason. * * If we were unable to hit our low water mark and couldn't * find any flushable buffers, we sleep half a second. * Otherwise we loop immediately. */ mtx_lock(&bdlock); if (numdirtybuffers <= lodirtybuffers) { /* * We reached our low water mark, reset the * request and sleep until we are needed again. * The sleep is just so the suspend code works. */ bd_request = 0; msleep(&bd_request, &bdlock, PVM, "psleep", hz); } else { /* * We couldn't find any flushable dirty buffers but * still have too many dirty buffers, we * have to sleep and try again. (rare) */ msleep(&bd_request, &bdlock, PVM, "qsleep", hz / 10); } } } /* * flushbufqueues: * * Try to flush a buffer in the dirty queue. We must be careful to * free up B_INVAL buffers instead of write them, which NFS is * particularly sensitive to. */ static int flushwithdeps = 0; SYSCTL_INT(_vfs, OID_AUTO, flushwithdeps, CTLFLAG_RW, &flushwithdeps, 0, "Number of buffers flushed with dependecies that require rollbacks"); static int -flushbufqueues(int queue, int flushdeps) +flushbufqueues(struct vnode *lvp, int queue, int flushdeps) { struct thread *td = curthread; struct buf sentinel; struct vnode *vp; struct mount *mp; struct buf *bp; int hasdeps; int flushed; int target; - target = numdirtybuffers - lodirtybuffers; - if (flushdeps && target > 2) - target /= 2; + if (lvp == NULL) { + target = numdirtybuffers - lodirtybuffers; + if (flushdeps && target > 2) + target /= 2; + } else + target = flushbufqtarget; flushed = 0; bp = NULL; + sentinel.b_qindex = QUEUE_SENTINEL; mtx_lock(&bqlock); - TAILQ_INSERT_TAIL(&bufqueues[queue], &sentinel, b_freelist); + TAILQ_INSERT_HEAD(&bufqueues[queue], &sentinel, b_freelist); while (flushed != target) { - bp = TAILQ_FIRST(&bufqueues[queue]); - if (bp == &sentinel) + bp = TAILQ_NEXT(&sentinel, b_freelist); + if (bp != NULL) { + TAILQ_REMOVE(&bufqueues[queue], &sentinel, b_freelist); + TAILQ_INSERT_AFTER(&bufqueues[queue], bp, &sentinel, + b_freelist); + } else break; - TAILQ_REMOVE(&bufqueues[queue], bp, b_freelist); - TAILQ_INSERT_TAIL(&bufqueues[queue], bp, b_freelist); - + /* + * Skip sentinels inserted by other invocations of the + * flushbufqueues(), taking care to not reorder them. + */ + if (bp->b_qindex == QUEUE_SENTINEL) + continue; + /* + * Only flush the buffers that belong to the + * vnode locked by the curthread. + */ + if (lvp != NULL && bp->b_vp != lvp) + continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) continue; if (bp->b_pin_count > 0) { BUF_UNLOCK(bp); continue; } BO_LOCK(bp->b_bufobj); if ((bp->b_vflags & BV_BKGRDINPROG) != 0 || (bp->b_flags & B_DELWRI) == 0) { BO_UNLOCK(bp->b_bufobj); BUF_UNLOCK(bp); continue; } BO_UNLOCK(bp->b_bufobj); if (bp->b_flags & B_INVAL) { bremfreel(bp); mtx_unlock(&bqlock); brelse(bp); flushed++; numdirtywakeup((lodirtybuffers + hidirtybuffers) / 2); mtx_lock(&bqlock); continue; } if (!LIST_EMPTY(&bp->b_dep) && buf_countdeps(bp, 0)) { if (flushdeps == 0) { BUF_UNLOCK(bp); continue; } hasdeps = 1; } else hasdeps = 0; /* * We must hold the lock on a vnode before writing * one of its buffers. Otherwise we may confuse, or * in the case of a snapshot vnode, deadlock the * system. * * The lock order here is the reverse of the normal * of vnode followed by buf lock. This is ok because * the NOWAIT will prevent deadlock. */ vp = bp->b_vp; if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { BUF_UNLOCK(bp); continue; } - if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT, td) == 0) { + if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_CANRECURSE, + td) == 0) { mtx_unlock(&bqlock); CTR3(KTR_BUF, "flushbufqueue(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); - vfs_bio_awrite(bp); + if (curproc == bufdaemonproc) + vfs_bio_awrite(bp); + else { + bremfree(bp); + bwrite(bp); + } vn_finished_write(mp); VOP_UNLOCK(vp, 0, td); flushwithdeps += hasdeps; flushed++; - waitrunningbufspace(); + + /* + * Sleeping on runningbufspace while holding + * vnode lock leads to deadlock. + */ + if (curproc == bufdaemonproc) + waitrunningbufspace(); numdirtywakeup((lodirtybuffers + hidirtybuffers) / 2); mtx_lock(&bqlock); continue; } vn_finished_write(mp); BUF_UNLOCK(bp); } TAILQ_REMOVE(&bufqueues[queue], &sentinel, b_freelist); mtx_unlock(&bqlock); return (flushed); } /* * Check to see if a block is currently memory resident. */ struct buf * incore(struct bufobj *bo, daddr_t blkno) { struct buf *bp; BO_LOCK(bo); bp = gbincore(bo, blkno); BO_UNLOCK(bo); return (bp); } /* * Returns true if no I/O is needed to access the * associated VM object. This is like incore except * it also hunts around in the VM system for the data. */ static int inmem(struct vnode * vp, daddr_t blkno) { vm_object_t obj; vm_offset_t toff, tinc, size; vm_page_t m; vm_ooffset_t off; ASSERT_VOP_LOCKED(vp, "inmem"); if (incore(&vp->v_bufobj, blkno)) return 1; if (vp->v_mount == NULL) return 0; obj = vp->v_object; if (obj == NULL) return (0); size = PAGE_SIZE; if (size > vp->v_mount->mnt_stat.f_iosize) size = vp->v_mount->mnt_stat.f_iosize; off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize; VM_OBJECT_LOCK(obj); for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) { m = vm_page_lookup(obj, OFF_TO_IDX(off + toff)); if (!m) goto notinmem; tinc = size; if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK)) tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK); if (vm_page_is_valid(m, (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0) goto notinmem; } VM_OBJECT_UNLOCK(obj); return 1; notinmem: VM_OBJECT_UNLOCK(obj); return (0); } /* * vfs_setdirty: * * Sets the dirty range for a buffer based on the status of the dirty * bits in the pages comprising the buffer. * * The range is limited to the size of the buffer. * * This routine is primarily used by NFS, but is generalized for the * B_VMIO case. */ static void vfs_setdirty(struct buf *bp) { /* * Degenerate case - empty buffer */ if (bp->b_bufsize == 0) return; /* * We qualify the scan for modified pages on whether the * object has been flushed yet. */ if ((bp->b_flags & B_VMIO) == 0) return; VM_OBJECT_LOCK(bp->b_bufobj->bo_object); vfs_setdirty_locked_object(bp); VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object); } static void vfs_setdirty_locked_object(struct buf *bp) { vm_object_t object; int i; object = bp->b_bufobj->bo_object; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); if (object->flags & (OBJ_MIGHTBEDIRTY|OBJ_CLEANING)) { vm_offset_t boffset; vm_offset_t eoffset; vm_page_lock_queues(); /* * test the pages to see if they have been modified directly * by users through the VM system. */ for (i = 0; i < bp->b_npages; i++) vm_page_test_dirty(bp->b_pages[i]); /* * Calculate the encompassing dirty range, boffset and eoffset, * (eoffset - boffset) bytes. */ for (i = 0; i < bp->b_npages; i++) { if (bp->b_pages[i]->dirty) break; } boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); for (i = bp->b_npages - 1; i >= 0; --i) { if (bp->b_pages[i]->dirty) { break; } } eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); vm_page_unlock_queues(); /* * Fit it to the buffer. */ if (eoffset > bp->b_bcount) eoffset = bp->b_bcount; /* * If we have a good dirty range, merge with the existing * dirty range. */ if (boffset < eoffset) { if (bp->b_dirtyoff > boffset) bp->b_dirtyoff = boffset; if (bp->b_dirtyend < eoffset) bp->b_dirtyend = eoffset; } } } /* * getblk: * * Get a block given a specified block and offset into a file/device. * The buffers B_DONE bit will be cleared on return, making it almost * ready for an I/O initiation. B_INVAL may or may not be set on * return. The caller should clear B_INVAL prior to initiating a * READ. * * For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for * an existing buffer. * * For a VMIO buffer, B_CACHE is modified according to the backing VM. * If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set * and then cleared based on the backing VM. If the previous buffer is * non-0-sized but invalid, B_CACHE will be cleared. * * If getblk() must create a new buffer, the new buffer is returned with * both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which * case it is returned with B_INVAL clear and B_CACHE set based on the * backing VM. * * getblk() also forces a bwrite() for any B_DELWRI buffer whos * B_CACHE bit is clear. * * What this means, basically, is that the caller should use B_CACHE to * determine whether the buffer is fully valid or not and should clear * B_INVAL prior to issuing a read. If the caller intends to validate * the buffer by loading its data area with something, the caller needs * to clear B_INVAL. If the caller does this without issuing an I/O, * the caller should set B_CACHE ( as an optimization ), else the caller * should issue the I/O and biodone() will set B_CACHE if the I/O was * a write attempt or if it was a successfull read. If the caller * intends to issue a READ, the caller must clear B_INVAL and BIO_ERROR * prior to issuing the READ. biodone() will *not* clear B_INVAL. */ struct buf * getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo, int flags) { struct buf *bp; struct bufobj *bo; int error; CTR3(KTR_BUF, "getblk(%p, %ld, %d)", vp, (long)blkno, size); ASSERT_VOP_LOCKED(vp, "getblk"); if (size > MAXBSIZE) panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE); bo = &vp->v_bufobj; loop: /* * Block if we are low on buffers. Certain processes are allowed * to completely exhaust the buffer cache. * * If this check ever becomes a bottleneck it may be better to * move it into the else, when gbincore() fails. At the moment * it isn't a problem. * * XXX remove if 0 sections (clean this up after its proven) */ if (numfreebuffers == 0) { if (TD_IS_IDLETHREAD(curthread)) return NULL; mtx_lock(&nblock); needsbuffer |= VFS_BIO_NEED_ANY; mtx_unlock(&nblock); } BO_LOCK(bo); bp = gbincore(bo, blkno); if (bp != NULL) { int lockflags; /* * Buffer is in-core. If the buffer is not busy, it must * be on a queue. */ lockflags = LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK; if (flags & GB_LOCK_NOWAIT) lockflags |= LK_NOWAIT; error = BUF_TIMELOCK(bp, lockflags, VI_MTX(vp), "getblk", slpflag, slptimeo); /* * If we slept and got the lock we have to restart in case * the buffer changed identities. */ if (error == ENOLCK) goto loop; /* We timed out or were interrupted. */ else if (error) return (NULL); /* * The buffer is locked. B_CACHE is cleared if the buffer is * invalid. Otherwise, for a non-VMIO buffer, B_CACHE is set * and for a VMIO buffer B_CACHE is adjusted according to the * backing VM cache. */ if (bp->b_flags & B_INVAL) bp->b_flags &= ~B_CACHE; else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0) bp->b_flags |= B_CACHE; bremfree(bp); /* * check for size inconsistancies for non-VMIO case. */ if (bp->b_bcount != size) { if ((bp->b_flags & B_VMIO) == 0 || (size > bp->b_kvasize)) { if (bp->b_flags & B_DELWRI) { /* * If buffer is pinned and caller does * not want sleep waiting for it to be * unpinned, bail out * */ if (bp->b_pin_count > 0) { if (flags & GB_LOCK_NOWAIT) { bqrelse(bp); return (NULL); } else { bunpin_wait(bp); } } bp->b_flags |= B_NOCACHE; bwrite(bp); } else { if (LIST_EMPTY(&bp->b_dep)) { bp->b_flags |= B_RELBUF; brelse(bp); } else { bp->b_flags |= B_NOCACHE; bwrite(bp); } } goto loop; } } /* * If the size is inconsistant in the VMIO case, we can resize * the buffer. This might lead to B_CACHE getting set or * cleared. If the size has not changed, B_CACHE remains * unchanged from its previous state. */ if (bp->b_bcount != size) allocbuf(bp, size); KASSERT(bp->b_offset != NOOFFSET, ("getblk: no buffer offset")); /* * A buffer with B_DELWRI set and B_CACHE clear must * be committed before we can return the buffer in * order to prevent the caller from issuing a read * ( due to B_CACHE not being set ) and overwriting * it. * * Most callers, including NFS and FFS, need this to * operate properly either because they assume they * can issue a read if B_CACHE is not set, or because * ( for example ) an uncached B_DELWRI might loop due * to softupdates re-dirtying the buffer. In the latter * case, B_CACHE is set after the first write completes, * preventing further loops. * NOTE! b*write() sets B_CACHE. If we cleared B_CACHE * above while extending the buffer, we cannot allow the * buffer to remain with B_CACHE set after the write * completes or it will represent a corrupt state. To * deal with this we set B_NOCACHE to scrap the buffer * after the write. * * We might be able to do something fancy, like setting * B_CACHE in bwrite() except if B_DELWRI is already set, * so the below call doesn't set B_CACHE, but that gets real * confusing. This is much easier. */ if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) { bp->b_flags |= B_NOCACHE; bwrite(bp); goto loop; } bp->b_flags &= ~B_DONE; } else { int bsize, maxsize, vmio; off_t offset; /* * Buffer is not in-core, create new buffer. The buffer * returned by getnewbuf() is locked. Note that the returned * buffer is also considered valid (not marked B_INVAL). */ BO_UNLOCK(bo); /* * If the user does not want us to create the buffer, bail out * here. */ if (flags & GB_NOCREAT) return NULL; bsize = bo->bo_bsize; offset = blkno * bsize; vmio = vp->v_object != NULL; maxsize = vmio ? size + (offset & PAGE_MASK) : size; maxsize = imax(maxsize, bsize); - bp = getnewbuf(slpflag, slptimeo, size, maxsize); + bp = getnewbuf(vp, slpflag, slptimeo, size, maxsize, flags); if (bp == NULL) { if (slpflag || slptimeo) return NULL; goto loop; } /* * This code is used to make sure that a buffer is not * created while the getnewbuf routine is blocked. * This can be a problem whether the vnode is locked or not. * If the buffer is created out from under us, we have to * throw away the one we just created. * * Note: this must occur before we associate the buffer * with the vp especially considering limitations in * the splay tree implementation when dealing with duplicate * lblkno's. */ BO_LOCK(bo); if (gbincore(bo, blkno)) { BO_UNLOCK(bo); bp->b_flags |= B_INVAL; brelse(bp); goto loop; } /* * Insert the buffer into the hash, so that it can * be found by incore. */ bp->b_blkno = bp->b_lblkno = blkno; bp->b_offset = offset; bgetvp(vp, bp); BO_UNLOCK(bo); /* * set B_VMIO bit. allocbuf() the buffer bigger. Since the * buffer size starts out as 0, B_CACHE will be set by * allocbuf() for the VMIO case prior to it testing the * backing store for validity. */ if (vmio) { bp->b_flags |= B_VMIO; #if defined(VFS_BIO_DEBUG) if (vn_canvmio(vp) != TRUE) printf("getblk: VMIO on vnode type %d\n", vp->v_type); #endif KASSERT(vp->v_object == bp->b_bufobj->bo_object, ("ARGH! different b_bufobj->bo_object %p %p %p\n", bp, vp->v_object, bp->b_bufobj->bo_object)); } else { bp->b_flags &= ~B_VMIO; KASSERT(bp->b_bufobj->bo_object == NULL, ("ARGH! has b_bufobj->bo_object %p %p\n", bp, bp->b_bufobj->bo_object)); } allocbuf(bp, size); bp->b_flags &= ~B_DONE; } CTR4(KTR_BUF, "getblk(%p, %ld, %d) = %p", vp, (long)blkno, size, bp); KASSERT(BUF_REFCNT(bp) == 1, ("getblk: bp %p not locked",bp)); KASSERT(bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); return (bp); } /* * Get an empty, disassociated buffer of given size. The buffer is initially * set to B_INVAL. */ struct buf * -geteblk(int size) +geteblk(int size, int flags) { struct buf *bp; int maxsize; maxsize = (size + BKVAMASK) & ~BKVAMASK; - while ((bp = getnewbuf(0, 0, size, maxsize)) == 0) - continue; + while ((bp = getnewbuf(NULL, 0, 0, size, maxsize, flags)) == NULL) { + if ((flags & GB_NOWAIT_BD) && + (curthread->td_pflags & TDP_BUFNEED) != 0) + return (NULL); + } allocbuf(bp, size); bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */ KASSERT(BUF_REFCNT(bp) == 1, ("geteblk: bp %p not locked",bp)); return (bp); } /* * This code constitutes the buffer memory from either anonymous system * memory (in the case of non-VMIO operations) or from an associated * VM object (in the case of VMIO operations). This code is able to * resize a buffer up or down. * * Note that this code is tricky, and has many complications to resolve * deadlock or inconsistant data situations. Tread lightly!!! * There are B_CACHE and B_DELWRI interactions that must be dealt with by * the caller. Calling this code willy nilly can result in the loss of data. * * allocbuf() only adjusts B_CACHE for VMIO buffers. getblk() deals with * B_CACHE for the non-VMIO case. */ int allocbuf(struct buf *bp, int size) { int newbsize, mbsize; int i; if (BUF_REFCNT(bp) == 0) panic("allocbuf: buffer not busy"); if (bp->b_kvasize < size) panic("allocbuf: buffer too small"); if ((bp->b_flags & B_VMIO) == 0) { caddr_t origbuf; int origbufsize; /* * Just get anonymous memory from the kernel. Don't * mess with B_CACHE. */ mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); if (bp->b_flags & B_MALLOC) newbsize = mbsize; else newbsize = round_page(size); if (newbsize < bp->b_bufsize) { /* * malloced buffers are not shrunk */ if (bp->b_flags & B_MALLOC) { if (newbsize) { bp->b_bcount = size; } else { free(bp->b_data, M_BIOBUF); if (bp->b_bufsize) { atomic_subtract_int( &bufmallocspace, bp->b_bufsize); bufspacewakeup(); bp->b_bufsize = 0; } bp->b_saveaddr = bp->b_kvabase; bp->b_data = bp->b_saveaddr; bp->b_bcount = 0; bp->b_flags &= ~B_MALLOC; } return 1; } vm_hold_free_pages( bp, (vm_offset_t) bp->b_data + newbsize, (vm_offset_t) bp->b_data + bp->b_bufsize); } else if (newbsize > bp->b_bufsize) { /* * We only use malloced memory on the first allocation. * and revert to page-allocated memory when the buffer * grows. */ /* * There is a potential smp race here that could lead * to bufmallocspace slightly passing the max. It * is probably extremely rare and not worth worrying * over. */ if ( (bufmallocspace < maxbufmallocspace) && (bp->b_bufsize == 0) && (mbsize <= PAGE_SIZE/2)) { bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK); bp->b_bufsize = mbsize; bp->b_bcount = size; bp->b_flags |= B_MALLOC; atomic_add_int(&bufmallocspace, mbsize); return 1; } origbuf = NULL; origbufsize = 0; /* * If the buffer is growing on its other-than-first allocation, * then we revert to the page-allocation scheme. */ if (bp->b_flags & B_MALLOC) { origbuf = bp->b_data; origbufsize = bp->b_bufsize; bp->b_data = bp->b_kvabase; if (bp->b_bufsize) { atomic_subtract_int(&bufmallocspace, bp->b_bufsize); bufspacewakeup(); bp->b_bufsize = 0; } bp->b_flags &= ~B_MALLOC; newbsize = round_page(newbsize); } vm_hold_load_pages( bp, (vm_offset_t) bp->b_data + bp->b_bufsize, (vm_offset_t) bp->b_data + newbsize); if (origbuf) { bcopy(origbuf, bp->b_data, origbufsize); free(origbuf, M_BIOBUF); } } } else { int desiredpages; newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); desiredpages = (size == 0) ? 0 : num_pages((bp->b_offset & PAGE_MASK) + newbsize); if (bp->b_flags & B_MALLOC) panic("allocbuf: VMIO buffer can't be malloced"); /* * Set B_CACHE initially if buffer is 0 length or will become * 0-length. */ if (size == 0 || bp->b_bufsize == 0) bp->b_flags |= B_CACHE; if (newbsize < bp->b_bufsize) { /* * DEV_BSIZE aligned new buffer size is less then the * DEV_BSIZE aligned existing buffer size. Figure out * if we have to remove any pages. */ if (desiredpages < bp->b_npages) { vm_page_t m; VM_OBJECT_LOCK(bp->b_bufobj->bo_object); vm_page_lock_queues(); for (i = desiredpages; i < bp->b_npages; i++) { /* * the page is not freed here -- it * is the responsibility of * vnode_pager_setsize */ m = bp->b_pages[i]; KASSERT(m != bogus_page, ("allocbuf: bogus page found")); while (vm_page_sleep_if_busy(m, TRUE, "biodep")) vm_page_lock_queues(); bp->b_pages[i] = NULL; vm_page_unwire(m, 0); } vm_page_unlock_queues(); VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object); pmap_qremove((vm_offset_t) trunc_page((vm_offset_t)bp->b_data) + (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages)); bp->b_npages = desiredpages; } } else if (size > bp->b_bcount) { /* * We are growing the buffer, possibly in a * byte-granular fashion. */ struct vnode *vp; vm_object_t obj; vm_offset_t toff; vm_offset_t tinc; /* * Step 1, bring in the VM pages from the object, * allocating them if necessary. We must clear * B_CACHE if these pages are not valid for the * range covered by the buffer. */ vp = bp->b_vp; obj = bp->b_bufobj->bo_object; VM_OBJECT_LOCK(obj); while (bp->b_npages < desiredpages) { vm_page_t m; vm_pindex_t pi; pi = OFF_TO_IDX(bp->b_offset) + bp->b_npages; if ((m = vm_page_lookup(obj, pi)) == NULL) { /* * note: must allocate system pages * since blocking here could intefere * with paging I/O, no matter which * process we are. */ m = vm_page_alloc(obj, pi, VM_ALLOC_NOBUSY | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED); if (m == NULL) { atomic_add_int(&vm_pageout_deficit, desiredpages - bp->b_npages); VM_OBJECT_UNLOCK(obj); VM_WAIT; VM_OBJECT_LOCK(obj); } else { if (m->valid == 0) bp->b_flags &= ~B_CACHE; bp->b_pages[bp->b_npages] = m; ++bp->b_npages; } continue; } /* * We found a page. If we have to sleep on it, * retry because it might have gotten freed out * from under us. * * We can only test VPO_BUSY here. Blocking on * m->busy might lead to a deadlock: * * vm_fault->getpages->cluster_read->allocbuf * */ if (vm_page_sleep_if_busy(m, FALSE, "pgtblk")) continue; /* * We have a good page. */ vm_page_lock_queues(); vm_page_wire(m); vm_page_unlock_queues(); bp->b_pages[bp->b_npages] = m; ++bp->b_npages; } /* * Step 2. We've loaded the pages into the buffer, * we have to figure out if we can still have B_CACHE * set. Note that B_CACHE is set according to the * byte-granular range ( bcount and size ), new the * aligned range ( newbsize ). * * The VM test is against m->valid, which is DEV_BSIZE * aligned. Needless to say, the validity of the data * needs to also be DEV_BSIZE aligned. Note that this * fails with NFS if the server or some other client * extends the file's EOF. If our buffer is resized, * B_CACHE may remain set! XXX */ toff = bp->b_bcount; tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK); while ((bp->b_flags & B_CACHE) && toff < size) { vm_pindex_t pi; if (tinc > (size - toff)) tinc = size - toff; pi = ((bp->b_offset & PAGE_MASK) + toff) >> PAGE_SHIFT; vfs_buf_test_cache( bp, bp->b_offset, toff, tinc, bp->b_pages[pi] ); toff += tinc; tinc = PAGE_SIZE; } VM_OBJECT_UNLOCK(obj); /* * Step 3, fixup the KVM pmap. Remember that * bp->b_data is relative to bp->b_offset, but * bp->b_offset may be offset into the first page. */ bp->b_data = (caddr_t) trunc_page((vm_offset_t)bp->b_data); pmap_qenter( (vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages ); bp->b_data = (caddr_t)((vm_offset_t)bp->b_data | (vm_offset_t)(bp->b_offset & PAGE_MASK)); } } if (newbsize < bp->b_bufsize) bufspacewakeup(); bp->b_bufsize = newbsize; /* actual buffer allocation */ bp->b_bcount = size; /* requested buffer size */ return 1; } void biodone(struct bio *bp) { void (*done)(struct bio *); mtx_lock(&bdonelock); bp->bio_flags |= BIO_DONE; done = bp->bio_done; if (done == NULL) wakeup(bp); mtx_unlock(&bdonelock); if (done != NULL) done(bp); } /* * Wait for a BIO to finish. * * XXX: resort to a timeout for now. The optimal locking (if any) for this * case is not yet clear. */ int biowait(struct bio *bp, const char *wchan) { mtx_lock(&bdonelock); while ((bp->bio_flags & BIO_DONE) == 0) msleep(bp, &bdonelock, PRIBIO, wchan, hz / 10); mtx_unlock(&bdonelock); if (bp->bio_error != 0) return (bp->bio_error); if (!(bp->bio_flags & BIO_ERROR)) return (0); return (EIO); } void biofinish(struct bio *bp, struct devstat *stat, int error) { if (error) { bp->bio_error = error; bp->bio_flags |= BIO_ERROR; } if (stat != NULL) devstat_end_transaction_bio(stat, bp); biodone(bp); } /* * bufwait: * * Wait for buffer I/O completion, returning error status. The buffer * is left locked and B_DONE on return. B_EINTR is converted into an EINTR * error and cleared. */ int bufwait(struct buf *bp) { if (bp->b_iocmd == BIO_READ) bwait(bp, PRIBIO, "biord"); else bwait(bp, PRIBIO, "biowr"); if (bp->b_flags & B_EINTR) { bp->b_flags &= ~B_EINTR; return (EINTR); } if (bp->b_ioflags & BIO_ERROR) { return (bp->b_error ? bp->b_error : EIO); } else { return (0); } } /* * Call back function from struct bio back up to struct buf. */ static void bufdonebio(struct bio *bip) { struct buf *bp; bp = bip->bio_caller2; bp->b_resid = bp->b_bcount - bip->bio_completed; bp->b_resid = bip->bio_resid; /* XXX: remove */ bp->b_ioflags = bip->bio_flags; bp->b_error = bip->bio_error; if (bp->b_error) bp->b_ioflags |= BIO_ERROR; bufdone(bp); g_destroy_bio(bip); } void dev_strategy(struct cdev *dev, struct buf *bp) { struct cdevsw *csw; struct bio *bip; if ((!bp->b_iocmd) || (bp->b_iocmd & (bp->b_iocmd - 1))) panic("b_iocmd botch"); for (;;) { bip = g_new_bio(); if (bip != NULL) break; /* Try again later */ tsleep(&bp, PRIBIO, "dev_strat", hz/10); } bip->bio_cmd = bp->b_iocmd; bip->bio_offset = bp->b_iooffset; bip->bio_length = bp->b_bcount; bip->bio_bcount = bp->b_bcount; /* XXX: remove */ bip->bio_data = bp->b_data; bip->bio_done = bufdonebio; bip->bio_caller2 = bp; bip->bio_dev = dev; KASSERT(dev->si_refcount > 0, ("dev_strategy on un-referenced struct cdev *(%s)", devtoname(dev))); csw = dev_refthread(dev); if (csw == NULL) { g_destroy_bio(bip); bp->b_error = ENXIO; bp->b_ioflags = BIO_ERROR; bufdone(bp); return; } (*csw->d_strategy)(bip); dev_relthread(dev); } /* * bufdone: * * Finish I/O on a buffer, optionally calling a completion function. * This is usually called from an interrupt so process blocking is * not allowed. * * biodone is also responsible for setting B_CACHE in a B_VMIO bp. * In a non-VMIO bp, B_CACHE will be set on the next getblk() * assuming B_INVAL is clear. * * For the VMIO case, we set B_CACHE if the op was a read and no * read error occured, or if the op was a write. B_CACHE is never * set if the buffer is invalid or otherwise uncacheable. * * biodone does not mess with B_INVAL, allowing the I/O routine or the * initiator to leave B_INVAL set to brelse the buffer out of existance * in the biodone routine. */ void bufdone(struct buf *bp) { struct bufobj *dropobj; void (*biodone)(struct buf *); CTR3(KTR_BUF, "bufdone(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); dropobj = NULL; KASSERT(BUF_REFCNT(bp) > 0, ("biodone: bp %p not busy %d", bp, BUF_REFCNT(bp))); KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp)); runningbufwakeup(bp); if (bp->b_iocmd == BIO_WRITE) dropobj = bp->b_bufobj; /* call optional completion function if requested */ if (bp->b_iodone != NULL) { biodone = bp->b_iodone; bp->b_iodone = NULL; (*biodone) (bp); if (dropobj) bufobj_wdrop(dropobj); return; } bufdone_finish(bp); if (dropobj) bufobj_wdrop(dropobj); } void bufdone_finish(struct buf *bp) { KASSERT(BUF_REFCNT(bp) > 0, ("biodone: bp %p not busy %d", bp, BUF_REFCNT(bp))); if (!LIST_EMPTY(&bp->b_dep)) buf_complete(bp); if (bp->b_flags & B_VMIO) { int i; vm_ooffset_t foff; vm_page_t m; vm_object_t obj; int iosize; struct vnode *vp = bp->b_vp; boolean_t are_queues_locked; obj = bp->b_bufobj->bo_object; #if defined(VFS_BIO_DEBUG) mp_fixme("usecount and vflag accessed without locks."); if (vp->v_usecount == 0) { panic("biodone: zero vnode ref count"); } KASSERT(vp->v_object != NULL, ("biodone: vnode %p has no vm_object", vp)); #endif foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("biodone: no buffer offset")); VM_OBJECT_LOCK(obj); #if defined(VFS_BIO_DEBUG) if (obj->paging_in_progress < bp->b_npages) { printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n", obj->paging_in_progress, bp->b_npages); } #endif /* * Set B_CACHE if the op was a normal read and no error * occured. B_CACHE is set for writes in the b*write() * routines. */ iosize = bp->b_bcount - bp->b_resid; if (bp->b_iocmd == BIO_READ && !(bp->b_flags & (B_INVAL|B_NOCACHE)) && !(bp->b_ioflags & BIO_ERROR)) { bp->b_flags |= B_CACHE; } if (bp->b_iocmd == BIO_READ) { vm_page_lock_queues(); are_queues_locked = TRUE; } else are_queues_locked = FALSE; for (i = 0; i < bp->b_npages; i++) { int bogusflag = 0; int resid; resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff; if (resid > iosize) resid = iosize; /* * cleanup bogus pages, restoring the originals */ m = bp->b_pages[i]; if (m == bogus_page) { bogusflag = 1; m = vm_page_lookup(obj, OFF_TO_IDX(foff)); if (m == NULL) panic("biodone: page disappeared!"); bp->b_pages[i] = m; pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } #if defined(VFS_BIO_DEBUG) if (OFF_TO_IDX(foff) != m->pindex) { printf( "biodone: foff(%jd)/m->pindex(%ju) mismatch\n", (intmax_t)foff, (uintmax_t)m->pindex); } #endif /* * In the write case, the valid and clean bits are * already changed correctly ( see bdwrite() ), so we * only need to do this here in the read case. */ if ((bp->b_iocmd == BIO_READ) && !bogusflag && resid > 0) { vfs_page_set_valid(bp, foff, i, m); } /* * when debugging new filesystems or buffer I/O methods, this * is the most common error that pops up. if you see this, you * have not set the page busy flag correctly!!! */ if (m->busy == 0) { printf("biodone: page busy < 0, " "pindex: %d, foff: 0x(%x,%x), " "resid: %d, index: %d\n", (int) m->pindex, (int)(foff >> 32), (int) foff & 0xffffffff, resid, i); if (!vn_isdisk(vp, NULL)) printf(" iosize: %jd, lblkno: %jd, flags: 0x%x, npages: %d\n", (intmax_t)bp->b_vp->v_mount->mnt_stat.f_iosize, (intmax_t) bp->b_lblkno, bp->b_flags, bp->b_npages); else printf(" VDEV, lblkno: %jd, flags: 0x%x, npages: %d\n", (intmax_t) bp->b_lblkno, bp->b_flags, bp->b_npages); printf(" valid: 0x%lx, dirty: 0x%lx, wired: %d\n", (u_long)m->valid, (u_long)m->dirty, m->wire_count); panic("biodone: page busy < 0\n"); } vm_page_io_finish(m); vm_object_pip_subtract(obj, 1); foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; iosize -= resid; } if (are_queues_locked) vm_page_unlock_queues(); vm_object_pip_wakeupn(obj, 0); VM_OBJECT_UNLOCK(obj); } /* * For asynchronous completions, release the buffer now. The brelse * will do a wakeup there if necessary - so no need to do a wakeup * here in the async case. The sync case always needs to do a wakeup. */ if (bp->b_flags & B_ASYNC) { if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_RELBUF)) || (bp->b_ioflags & BIO_ERROR)) brelse(bp); else bqrelse(bp); } else bdone(bp); } /* * This routine is called in lieu of iodone in the case of * incomplete I/O. This keeps the busy status for pages * consistant. */ void vfs_unbusy_pages(struct buf *bp) { int i; vm_object_t obj; vm_page_t m; runningbufwakeup(bp); if (!(bp->b_flags & B_VMIO)) return; obj = bp->b_bufobj->bo_object; VM_OBJECT_LOCK(obj); for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if (m == bogus_page) { m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i); if (!m) panic("vfs_unbusy_pages: page missing\n"); bp->b_pages[i] = m; pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } vm_object_pip_subtract(obj, 1); vm_page_io_finish(m); } vm_object_pip_wakeupn(obj, 0); VM_OBJECT_UNLOCK(obj); } /* * vfs_page_set_valid: * * Set the valid bits in a page based on the supplied offset. The * range is restricted to the buffer's size. * * This routine is typically called after a read completes. */ static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m) { vm_ooffset_t soff, eoff; mtx_assert(&vm_page_queue_mtx, MA_OWNED); /* * Start and end offsets in buffer. eoff - soff may not cross a * page boundry or cross the end of the buffer. The end of the * buffer, in this case, is our file EOF, not the allocation size * of the buffer. */ soff = off; eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK; if (eoff > bp->b_offset + bp->b_bcount) eoff = bp->b_offset + bp->b_bcount; /* * Set valid range. This is typically the entire buffer and thus the * entire page. */ if (eoff > soff) { vm_page_set_validclean( m, (vm_offset_t) (soff & PAGE_MASK), (vm_offset_t) (eoff - soff) ); } } /* * This routine is called before a device strategy routine. * It is used to tell the VM system that paging I/O is in * progress, and treat the pages associated with the buffer * almost as being VPO_BUSY. Also the object paging_in_progress * flag is handled to make sure that the object doesn't become * inconsistant. * * Since I/O has not been initiated yet, certain buffer flags * such as BIO_ERROR or B_INVAL may be in an inconsistant state * and should be ignored. */ void vfs_busy_pages(struct buf *bp, int clear_modify) { int i, bogus; vm_object_t obj; vm_ooffset_t foff; vm_page_t m; if (!(bp->b_flags & B_VMIO)) return; obj = bp->b_bufobj->bo_object; foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("vfs_busy_pages: no buffer offset")); VM_OBJECT_LOCK(obj); if (bp->b_bufsize != 0) vfs_setdirty_locked_object(bp); retry: for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if (vm_page_sleep_if_busy(m, FALSE, "vbpage")) goto retry; } bogus = 0; vm_page_lock_queues(); for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if ((bp->b_flags & B_CLUSTER) == 0) { vm_object_pip_add(obj, 1); vm_page_io_start(m); } /* * When readying a buffer for a read ( i.e * clear_modify == 0 ), it is important to do * bogus_page replacement for valid pages in * partially instantiated buffers. Partially * instantiated buffers can, in turn, occur when * reconstituting a buffer from its VM backing store * base. We only have to do this if B_CACHE is * clear ( which causes the I/O to occur in the * first place ). The replacement prevents the read * I/O from overwriting potentially dirty VM-backed * pages. XXX bogus page replacement is, uh, bogus. * It may not work properly with small-block devices. * We need to find a better way. */ pmap_remove_all(m); if (clear_modify) vfs_page_set_valid(bp, foff, i, m); else if (m->valid == VM_PAGE_BITS_ALL && (bp->b_flags & B_CACHE) == 0) { bp->b_pages[i] = bogus_page; bogus++; } foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; } vm_page_unlock_queues(); VM_OBJECT_UNLOCK(obj); if (bogus) pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } /* * Tell the VM system that the pages associated with this buffer * are clean. This is used for delayed writes where the data is * going to go to disk eventually without additional VM intevention. * * Note that while we only really need to clean through to b_bcount, we * just go ahead and clean through to b_bufsize. */ static void vfs_clean_pages(struct buf *bp) { int i; vm_ooffset_t foff, noff, eoff; vm_page_t m; if (!(bp->b_flags & B_VMIO)) return; foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("vfs_clean_pages: no buffer offset")); VM_OBJECT_LOCK(bp->b_bufobj->bo_object); vm_page_lock_queues(); for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; eoff = noff; if (eoff > bp->b_offset + bp->b_bufsize) eoff = bp->b_offset + bp->b_bufsize; vfs_page_set_valid(bp, foff, i, m); /* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */ foff = noff; } vm_page_unlock_queues(); VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object); } /* * vfs_bio_set_validclean: * * Set the range within the buffer to valid and clean. The range is * relative to the beginning of the buffer, b_offset. Note that b_offset * itself may be offset from the beginning of the first page. * */ void vfs_bio_set_validclean(struct buf *bp, int base, int size) { int i, n; vm_page_t m; if (!(bp->b_flags & B_VMIO)) return; /* * Fixup base to be relative to beginning of first page. * Set initial n to be the maximum number of bytes in the * first page that can be validated. */ base += (bp->b_offset & PAGE_MASK); n = PAGE_SIZE - (base & PAGE_MASK); VM_OBJECT_LOCK(bp->b_bufobj->bo_object); vm_page_lock_queues(); for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) { m = bp->b_pages[i]; if (n > size) n = size; vm_page_set_validclean(m, base & PAGE_MASK, n); base += n; size -= n; n = PAGE_SIZE; } vm_page_unlock_queues(); VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object); } /* * vfs_bio_clrbuf: * * clear a buffer. This routine essentially fakes an I/O, so we need * to clear BIO_ERROR and B_INVAL. * * Note that while we only theoretically need to clear through b_bcount, * we go ahead and clear through b_bufsize. */ void vfs_bio_clrbuf(struct buf *bp) { int i, j, mask = 0; caddr_t sa, ea; if ((bp->b_flags & (B_VMIO | B_MALLOC)) != B_VMIO) { clrbuf(bp); return; } bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; VM_OBJECT_LOCK(bp->b_bufobj->bo_object); if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) && (bp->b_offset & PAGE_MASK) == 0) { if (bp->b_pages[0] == bogus_page) goto unlock; mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1; VM_OBJECT_LOCK_ASSERT(bp->b_pages[0]->object, MA_OWNED); if ((bp->b_pages[0]->valid & mask) == mask) goto unlock; if (((bp->b_pages[0]->flags & PG_ZERO) == 0) && ((bp->b_pages[0]->valid & mask) == 0)) { bzero(bp->b_data, bp->b_bufsize); bp->b_pages[0]->valid |= mask; goto unlock; } } ea = sa = bp->b_data; for(i = 0; i < bp->b_npages; i++, sa = ea) { ea = (caddr_t)trunc_page((vm_offset_t)sa + PAGE_SIZE); ea = (caddr_t)(vm_offset_t)ulmin( (u_long)(vm_offset_t)ea, (u_long)(vm_offset_t)bp->b_data + bp->b_bufsize); if (bp->b_pages[i] == bogus_page) continue; j = ((vm_offset_t)sa & PAGE_MASK) / DEV_BSIZE; mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j; VM_OBJECT_LOCK_ASSERT(bp->b_pages[i]->object, MA_OWNED); if ((bp->b_pages[i]->valid & mask) == mask) continue; if ((bp->b_pages[i]->valid & mask) == 0) { if ((bp->b_pages[i]->flags & PG_ZERO) == 0) bzero(sa, ea - sa); } else { for (; sa < ea; sa += DEV_BSIZE, j++) { if (((bp->b_pages[i]->flags & PG_ZERO) == 0) && (bp->b_pages[i]->valid & (1 << j)) == 0) bzero(sa, DEV_BSIZE); } } bp->b_pages[i]->valid |= mask; } unlock: VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object); bp->b_resid = 0; } /* * vm_hold_load_pages and vm_hold_free_pages get pages into * a buffers address space. The pages are anonymous and are * not associated with a file object. */ static void vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to) { vm_offset_t pg; vm_page_t p; int index; to = round_page(to); from = round_page(from); index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; VM_OBJECT_LOCK(kernel_object); for (pg = from; pg < to; pg += PAGE_SIZE, index++) { tryagain: /* * note: must allocate system pages since blocking here * could intefere with paging I/O, no matter which * process we are. */ p = vm_page_alloc(kernel_object, ((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT), VM_ALLOC_NOBUSY | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED); if (!p) { atomic_add_int(&vm_pageout_deficit, (to - pg) >> PAGE_SHIFT); VM_OBJECT_UNLOCK(kernel_object); VM_WAIT; VM_OBJECT_LOCK(kernel_object); goto tryagain; } p->valid = VM_PAGE_BITS_ALL; pmap_qenter(pg, &p, 1); bp->b_pages[index] = p; } VM_OBJECT_UNLOCK(kernel_object); bp->b_npages = index; } /* Return pages associated with this buf to the vm system */ static void vm_hold_free_pages(struct buf *bp, vm_offset_t from, vm_offset_t to) { vm_offset_t pg; vm_page_t p; int index, newnpages; from = round_page(from); to = round_page(to); newnpages = index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; VM_OBJECT_LOCK(kernel_object); for (pg = from; pg < to; pg += PAGE_SIZE, index++) { p = bp->b_pages[index]; if (p && (index < bp->b_npages)) { if (p->busy) { printf( "vm_hold_free_pages: blkno: %jd, lblkno: %jd\n", (intmax_t)bp->b_blkno, (intmax_t)bp->b_lblkno); } bp->b_pages[index] = NULL; pmap_qremove(pg, 1); vm_page_lock_queues(); vm_page_unwire(p, 0); vm_page_free(p); vm_page_unlock_queues(); } } VM_OBJECT_UNLOCK(kernel_object); bp->b_npages = newnpages; } /* * Map an IO request into kernel virtual address space. * * All requests are (re)mapped into kernel VA space. * Notice that we use b_bufsize for the size of the buffer * to be mapped. b_bcount might be modified by the driver. * * Note that even if the caller determines that the address space should * be valid, a race or a smaller-file mapped into a larger space may * actually cause vmapbuf() to fail, so all callers of vmapbuf() MUST * check the return value. */ int vmapbuf(struct buf *bp) { caddr_t addr, kva; vm_prot_t prot; int pidx, i; struct vm_page *m; struct pmap *pmap = &curproc->p_vmspace->vm_pmap; if (bp->b_bufsize < 0) return (-1); prot = VM_PROT_READ; if (bp->b_iocmd == BIO_READ) prot |= VM_PROT_WRITE; /* Less backwards than it looks */ for (addr = (caddr_t)trunc_page((vm_offset_t)bp->b_data), pidx = 0; addr < bp->b_data + bp->b_bufsize; addr += PAGE_SIZE, pidx++) { /* * Do the vm_fault if needed; do the copy-on-write thing * when reading stuff off device into memory. * * NOTE! Must use pmap_extract() because addr may be in * the userland address space, and kextract is only guarenteed * to work for the kernland address space (see: sparc64 port). */ retry: if (vm_fault_quick(addr >= bp->b_data ? addr : bp->b_data, prot) < 0) { vm_page_lock_queues(); for (i = 0; i < pidx; ++i) { vm_page_unhold(bp->b_pages[i]); bp->b_pages[i] = NULL; } vm_page_unlock_queues(); return(-1); } m = pmap_extract_and_hold(pmap, (vm_offset_t)addr, prot); if (m == NULL) goto retry; bp->b_pages[pidx] = m; } if (pidx > btoc(MAXPHYS)) panic("vmapbuf: mapped more than MAXPHYS"); pmap_qenter((vm_offset_t)bp->b_saveaddr, bp->b_pages, pidx); kva = bp->b_saveaddr; bp->b_npages = pidx; bp->b_saveaddr = bp->b_data; bp->b_data = kva + (((vm_offset_t) bp->b_data) & PAGE_MASK); return(0); } /* * Free the io map PTEs associated with this IO operation. * We also invalidate the TLB entries and restore the original b_addr. */ void vunmapbuf(struct buf *bp) { int pidx; int npages; npages = bp->b_npages; pmap_qremove(trunc_page((vm_offset_t)bp->b_data), npages); vm_page_lock_queues(); for (pidx = 0; pidx < npages; pidx++) vm_page_unhold(bp->b_pages[pidx]); vm_page_unlock_queues(); bp->b_data = bp->b_saveaddr; } void bdone(struct buf *bp) { mtx_lock(&bdonelock); bp->b_flags |= B_DONE; wakeup(bp); mtx_unlock(&bdonelock); } void bwait(struct buf *bp, u_char pri, const char *wchan) { mtx_lock(&bdonelock); while ((bp->b_flags & B_DONE) == 0) msleep(bp, &bdonelock, pri, wchan, 0); mtx_unlock(&bdonelock); } int bufsync(struct bufobj *bo, int waitfor, struct thread *td) { return (VOP_FSYNC(bo->__bo_vnode, waitfor, td)); } void bufstrategy(struct bufobj *bo, struct buf *bp) { int i = 0; struct vnode *vp; vp = bp->b_vp; KASSERT(vp == bo->bo_private, ("Inconsistent vnode bufstrategy")); KASSERT(vp->v_type != VCHR && vp->v_type != VBLK, ("Wrong vnode in bufstrategy(bp=%p, vp=%p)", bp, vp)); i = VOP_STRATEGY(vp, bp); KASSERT(i == 0, ("VOP_STRATEGY failed bp=%p vp=%p", bp, bp->b_vp)); } void bufobj_wrefl(struct bufobj *bo) { KASSERT(bo != NULL, ("NULL bo in bufobj_wref")); ASSERT_BO_LOCKED(bo); bo->bo_numoutput++; } void bufobj_wref(struct bufobj *bo) { KASSERT(bo != NULL, ("NULL bo in bufobj_wref")); BO_LOCK(bo); bo->bo_numoutput++; BO_UNLOCK(bo); } void bufobj_wdrop(struct bufobj *bo) { KASSERT(bo != NULL, ("NULL bo in bufobj_wdrop")); BO_LOCK(bo); KASSERT(bo->bo_numoutput > 0, ("bufobj_wdrop non-positive count")); if ((--bo->bo_numoutput == 0) && (bo->bo_flag & BO_WWAIT)) { bo->bo_flag &= ~BO_WWAIT; wakeup(&bo->bo_numoutput); } BO_UNLOCK(bo); } int bufobj_wwait(struct bufobj *bo, int slpflag, int timeo) { int error; KASSERT(bo != NULL, ("NULL bo in bufobj_wwait")); ASSERT_BO_LOCKED(bo); error = 0; while (bo->bo_numoutput) { bo->bo_flag |= BO_WWAIT; error = msleep(&bo->bo_numoutput, BO_MTX(bo), slpflag | (PRIBIO + 1), "bo_wwait", timeo); if (error) break; } return (error); } void bpin(struct buf *bp) { mtx_lock(&bpinlock); bp->b_pin_count++; mtx_unlock(&bpinlock); } void bunpin(struct buf *bp) { mtx_lock(&bpinlock); if (--bp->b_pin_count == 0) wakeup(bp); mtx_unlock(&bpinlock); } void bunpin_wait(struct buf *bp) { mtx_lock(&bpinlock); while (bp->b_pin_count > 0) msleep(bp, &bpinlock, PRIBIO, "bwunpin", 0); mtx_unlock(&bpinlock); } #include "opt_ddb.h" #ifdef DDB #include /* DDB command to show buffer data */ DB_SHOW_COMMAND(buffer, db_show_buffer) { /* get args */ struct buf *bp = (struct buf *)addr; if (!have_addr) { db_printf("usage: show buffer \n"); return; } db_printf("buf at %p\n", bp); db_printf("b_flags = 0x%b\n", (u_int)bp->b_flags, PRINT_BUF_FLAGS); db_printf( "b_error = %d, b_bufsize = %ld, b_bcount = %ld, b_resid = %ld\n" "b_bufobj = (%p), b_data = %p, b_blkno = %jd, b_dep = %p\n", bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid, bp->b_bufobj, bp->b_data, (intmax_t)bp->b_blkno, bp->b_dep.lh_first); if (bp->b_npages) { int i; db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages); for (i = 0; i < bp->b_npages; i++) { vm_page_t m; m = bp->b_pages[i]; db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object, (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m)); if ((i + 1) < bp->b_npages) db_printf(","); } db_printf("\n"); } lockmgr_printinfo(&bp->b_lock); } DB_SHOW_COMMAND(lockedbufs, lockedbufs) { struct buf *bp; int i; for (i = 0; i < nbuf; i++) { bp = &buf[i]; if (lockcount(&bp->b_lock)) { db_show_buffer((uintptr_t)bp, 1, 0, NULL); db_printf("\n"); } } } DB_SHOW_COMMAND(vnodebufs, db_show_vnodebufs) { struct vnode *vp; struct buf *bp; if (!have_addr) { db_printf("usage: show vnodebufs \n"); return; } vp = (struct vnode *)addr; db_printf("Clean buffers:\n"); TAILQ_FOREACH(bp, &vp->v_bufobj.bo_clean.bv_hd, b_bobufs) { db_show_buffer((uintptr_t)bp, 1, 0, NULL); db_printf("\n"); } db_printf("Dirty buffers:\n"); TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs) { db_show_buffer((uintptr_t)bp, 1, 0, NULL); db_printf("\n"); } } #endif /* DDB */ Index: stable/7/sys/sys/buf.h =================================================================== --- stable/7/sys/sys/buf.h (revision 191812) +++ stable/7/sys/sys/buf.h (revision 191813) @@ -1,556 +1,557 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)buf.h 8.9 (Berkeley) 3/30/95 * $FreeBSD$ */ #ifndef _SYS_BUF_H_ #define _SYS_BUF_H_ #include #include #include #include struct bio; struct buf; struct bufobj; struct mount; struct vnode; struct uio; /* * To avoid including */ LIST_HEAD(workhead, worklist); /* * These are currently used only by the soft dependency code, hence * are stored once in a global variable. If other subsystems wanted * to use these hooks, a pointer to a set of bio_ops could be added * to each buffer. */ extern struct bio_ops { void (*io_start)(struct buf *); void (*io_complete)(struct buf *); void (*io_deallocate)(struct buf *); int (*io_countdeps)(struct buf *, int); } bioops; struct vm_object; typedef unsigned char b_xflags_t; /* * The buffer header describes an I/O operation in the kernel. * * NOTES: * b_bufsize, b_bcount. b_bufsize is the allocation size of the * buffer, either DEV_BSIZE or PAGE_SIZE aligned. b_bcount is the * originally requested buffer size and can serve as a bounds check * against EOF. For most, but not all uses, b_bcount == b_bufsize. * * b_dirtyoff, b_dirtyend. Buffers support piecemeal, unaligned * ranges of dirty data that need to be written to backing store. * The range is typically clipped at b_bcount ( not b_bufsize ). * * b_resid. Number of bytes remaining in I/O. After an I/O operation * completes, b_resid is usually 0 indicating 100% success. * * All fields are protected by the buffer lock except those marked: * V - Protected by owning vnode lock * Q - Protected by the buf queue lock * D - Protected by an dependency implementation specific lock */ struct buf { struct bufobj *b_bufobj; long b_bcount; void *b_caller1; caddr_t b_data; int b_error; uint8_t b_iocmd; uint8_t b_ioflags; off_t b_iooffset; long b_resid; void (*b_iodone)(struct buf *); daddr_t b_blkno; /* Underlying physical block number. */ off_t b_offset; /* Offset into file. */ TAILQ_ENTRY(buf) b_bobufs; /* (V) Buffer's associated vnode. */ struct buf *b_left; /* (V) splay tree link */ struct buf *b_right; /* (V) splay tree link */ uint32_t b_vflags; /* (V) BV_* flags */ TAILQ_ENTRY(buf) b_freelist; /* (Q) Free list position inactive. */ unsigned short b_qindex; /* (Q) buffer queue index */ uint32_t b_flags; /* B_* flags. */ b_xflags_t b_xflags; /* extra flags */ struct lock b_lock; /* Buffer lock */ long b_bufsize; /* Allocated buffer size. */ long b_runningbufspace; /* when I/O is running, pipelining */ caddr_t b_kvabase; /* base kva for buffer */ int b_kvasize; /* size of kva for buffer */ daddr_t b_lblkno; /* Logical block number. */ struct vnode *b_vp; /* Device vnode. */ int b_dirtyoff; /* Offset in buffer of dirty region. */ int b_dirtyend; /* Offset of end of dirty region. */ struct ucred *b_rcred; /* Read credentials reference. */ struct ucred *b_wcred; /* Write credentials reference. */ void *b_saveaddr; /* Original b_addr for physio. */ union pager_info { int pg_reqpage; } b_pager; union cluster_info { TAILQ_HEAD(cluster_list_head, buf) cluster_head; TAILQ_ENTRY(buf) cluster_entry; } b_cluster; struct vm_page *b_pages[btoc(MAXPHYS)]; int b_npages; struct workhead b_dep; /* (D) List of filesystem dependencies. */ void *b_fsprivate1; void *b_fsprivate2; void *b_fsprivate3; int b_pin_count; }; #define b_object b_bufobj->bo_object /* * These flags are kept in b_flags. * * Notes: * * B_ASYNC VOP calls on bp's are usually async whether or not * B_ASYNC is set, but some subsystems, such as NFS, like * to know what is best for the caller so they can * optimize the I/O. * * B_PAGING Indicates that bp is being used by the paging system or * some paging system and that the bp is not linked into * the b_vp's clean/dirty linked lists or ref counts. * Buffer vp reassignments are illegal in this case. * * B_CACHE This may only be set if the buffer is entirely valid. * The situation where B_DELWRI is set and B_CACHE is * clear MUST be committed to disk by getblk() so * B_DELWRI can also be cleared. See the comments for * getblk() in kern/vfs_bio.c. If B_CACHE is clear, * the caller is expected to clear BIO_ERROR and B_INVAL, * set BIO_READ, and initiate an I/O. * * The 'entire buffer' is defined to be the range from * 0 through b_bcount. * * B_MALLOC Request that the buffer be allocated from the malloc * pool, DEV_BSIZE aligned instead of PAGE_SIZE aligned. * * B_CLUSTEROK This flag is typically set for B_DELWRI buffers * by filesystems that allow clustering when the buffer * is fully dirty and indicates that it may be clustered * with other adjacent dirty buffers. Note the clustering * may not be used with the stage 1 data write under NFS * but may be used for the commit rpc portion. * * B_VMIO Indicates that the buffer is tied into an VM object. * The buffer's data is always PAGE_SIZE aligned even * if b_bufsize and b_bcount are not. ( b_bufsize is * always at least DEV_BSIZE aligned, though ). * * B_DIRECT Hint that we should attempt to completely free * the pages underlying the buffer. B_DIRECT is * sticky until the buffer is released and typically * only has an effect when B_RELBUF is also set. * */ #define B_AGE 0x00000001 /* Move to age queue when I/O done. */ #define B_NEEDCOMMIT 0x00000002 /* Append-write in progress. */ #define B_ASYNC 0x00000004 /* Start I/O, do not wait. */ #define B_DIRECT 0x00000008 /* direct I/O flag (pls free vmio) */ #define B_DEFERRED 0x00000010 /* Skipped over for cleaning */ #define B_CACHE 0x00000020 /* Bread found us in the cache. */ #define B_VALIDSUSPWRT 0x00000040 /* Valid write during suspension. */ #define B_DELWRI 0x00000080 /* Delay I/O until buffer reused. */ #define B_PERSISTENT 0x00000100 /* Perm. ref'ed while EXT2FS mounted. */ #define B_DONE 0x00000200 /* I/O completed. */ #define B_EINTR 0x00000400 /* I/O was interrupted */ #define B_00000800 0x00000800 /* Available flag. */ #define B_00001000 0x00001000 /* Available flag. */ #define B_INVAL 0x00002000 /* Does not contain valid info. */ #define B_00004000 0x00004000 /* Available flag. */ #define B_NOCACHE 0x00008000 /* Do not cache block after use. */ #define B_MALLOC 0x00010000 /* malloced b_data */ #define B_CLUSTEROK 0x00020000 /* Pagein op, so swap() can count it. */ #define B_000400000 0x00040000 /* Available flag. */ #define B_000800000 0x00080000 /* Available flag. */ #define B_00100000 0x00100000 /* Available flag. */ #define B_DIRTY 0x00200000 /* Needs writing later (in EXT2FS). */ #define B_RELBUF 0x00400000 /* Release VMIO buffer. */ #define B_00800000 0x00800000 /* Available flag. */ #define B_01000000 0x01000000 /* Available flag. */ #define B_NEEDSGIANT 0x02000000 /* Buffer's vnode needs giant. */ #define B_PAGING 0x04000000 /* volatile paging I/O -- bypass VMIO */ #define B_MANAGED 0x08000000 /* Managed by FS. */ #define B_RAM 0x10000000 /* Read ahead mark (flag) */ #define B_VMIO 0x20000000 /* VMIO flag */ #define B_CLUSTER 0x40000000 /* pagein op, so swap() can count it */ #define B_REMFREE 0x80000000 /* Delayed bremfree */ #define PRINT_BUF_FLAGS "\20\40remfree\37cluster\36vmio\35ram\34b27" \ "\33paging\32b25\31b24\30b23\27relbuf\26dirty\25b20" \ "\24b19\23b18\22clusterok\21malloc\20nocache\17b14\16inval" \ "\15b12\14b11\13eintr\12done\11persist\10delwri\7validsuspwrt" \ "\6cache\5deferred\4direct\3async\2needcommit\1age" /* * These flags are kept in b_xflags. */ #define BX_VNDIRTY 0x00000001 /* On vnode dirty list */ #define BX_VNCLEAN 0x00000002 /* On vnode clean list */ #define BX_BKGRDWRITE 0x00000010 /* Do writes in background */ #define BX_BKGRDMARKER 0x00000020 /* Mark buffer for splay tree */ #define BX_ALTDATA 0x00000040 /* Holds extended data */ #define NOOFFSET (-1LL) /* No buffer offset calculated yet */ /* * These flags are kept in b_vflags. */ #define BV_SCANNED 0x00000001 /* VOP_FSYNC funcs mark written bufs */ #define BV_BKGRDINPROG 0x00000002 /* Background write in progress */ #define BV_BKGRDWAIT 0x00000004 /* Background write waiting */ #ifdef _KERNEL /* * Buffer locking */ extern const char *buf_wmesg; /* Default buffer lock message */ #define BUF_WMESG "bufwait" #include /* XXX for curthread */ #include /* * Initialize a lock. */ #define BUF_LOCKINIT(bp) \ lockinit(&(bp)->b_lock, PRIBIO + 4, buf_wmesg, 0, 0) /* * * Get a lock sleeping non-interruptably until it becomes available. */ static __inline int BUF_LOCK(struct buf *, int, struct mtx *); static __inline int BUF_LOCK(struct buf *bp, int locktype, struct mtx *interlock) { int s, ret; s = splbio(); mtx_lock(bp->b_lock.lk_interlock); locktype |= LK_INTERNAL; bp->b_lock.lk_wmesg = buf_wmesg; bp->b_lock.lk_prio = PRIBIO + 4; ret = lockmgr(&(bp)->b_lock, locktype, interlock, curthread); splx(s); return ret; } /* * Get a lock sleeping with specified interruptably and timeout. */ static __inline int BUF_TIMELOCK(struct buf *, int, struct mtx *, char *, int, int); static __inline int BUF_TIMELOCK(struct buf *bp, int locktype, struct mtx *interlock, char *wmesg, int catch, int timo) { int s, ret; s = splbio(); mtx_lock(bp->b_lock.lk_interlock); locktype |= LK_INTERNAL | LK_TIMELOCK; bp->b_lock.lk_wmesg = wmesg; bp->b_lock.lk_prio = (PRIBIO + 4) | catch; bp->b_lock.lk_timo = timo; ret = lockmgr(&(bp)->b_lock, (locktype), interlock, curthread); splx(s); return ret; } /* * Release a lock. Only the acquiring process may free the lock unless * it has been handed off to biodone. */ static __inline void BUF_UNLOCK(struct buf *); static __inline void BUF_UNLOCK(struct buf *bp) { int s; s = splbio(); KASSERT((bp->b_flags & B_REMFREE) == 0, ("BUF_UNLOCK %p while B_REMFREE is still set.", bp)); lockmgr(&(bp)->b_lock, LK_RELEASE, NULL, curthread); splx(s); } /* * Free a buffer lock. */ #define BUF_LOCKFREE(bp) \ do { \ if (BUF_REFCNT(bp) > 0) \ panic("free locked buf"); \ lockdestroy(&(bp)->b_lock); \ } while (0) #ifdef _SYS_PROC_H_ /* Avoid #include pollution */ /* * When initiating asynchronous I/O, change ownership of the lock to the * kernel. Once done, the lock may legally released by biodone. The * original owning process can no longer acquire it recursively, but must * wait until the I/O is completed and the lock has been freed by biodone. */ static __inline void BUF_KERNPROC(struct buf *); static __inline void BUF_KERNPROC(struct buf *bp) { struct thread *td = curthread; if (!TD_IS_IDLETHREAD(td) && bp->b_lock.lk_lockholder == td) td->td_locks--; bp->b_lock.lk_lockholder = LK_KERNPROC; } #endif /* * Find out the number of references to a lock. */ static __inline int BUF_REFCNT(struct buf *); static __inline int BUF_REFCNT(struct buf *bp) { int s, ret; /* * When the system is panicing, the lock manager grants all lock * requests whether or not the lock is available. To avoid "unlocked * buffer" panics after a crash, we just claim that all buffers * are locked when cleaning up after a system panic. */ if (panicstr != NULL) return (1); s = splbio(); ret = lockcount(&(bp)->b_lock); splx(s); return ret; } /* * Find out the number of waiters on a lock. */ static __inline int BUF_LOCKWAITERS(struct buf *); static __inline int BUF_LOCKWAITERS(struct buf *bp) { return (lockwaiters(&bp->b_lock)); } #endif /* _KERNEL */ struct buf_queue_head { TAILQ_HEAD(buf_queue, buf) queue; daddr_t last_pblkno; struct buf *insert_point; struct buf *switch_point; }; /* * This structure describes a clustered I/O. It is stored in the b_saveaddr * field of the buffer on which I/O is done. At I/O completion, cluster * callback uses the structure to parcel I/O's to individual buffers, and * then free's this structure. */ struct cluster_save { long bs_bcount; /* Saved b_bcount. */ long bs_bufsize; /* Saved b_bufsize. */ void *bs_saveaddr; /* Saved b_addr. */ int bs_nchildren; /* Number of associated buffers. */ struct buf **bs_children; /* List of associated buffers. */ }; #ifdef _KERNEL static __inline int bwrite(struct buf *bp) { KASSERT(bp->b_bufobj != NULL, ("bwrite: no bufobj bp=%p", bp)); KASSERT(bp->b_bufobj->bo_ops != NULL, ("bwrite: no bo_ops bp=%p", bp)); KASSERT(bp->b_bufobj->bo_ops->bop_write != NULL, ("bwrite: no bop_write bp=%p", bp)); return (BO_WRITE(bp->b_bufobj, bp)); } static __inline void bstrategy(struct buf *bp) { KASSERT(bp->b_bufobj != NULL, ("bstrategy: no bufobj bp=%p", bp)); KASSERT(bp->b_bufobj->bo_ops != NULL, ("bstrategy: no bo_ops bp=%p", bp)); KASSERT(bp->b_bufobj->bo_ops->bop_strategy != NULL, ("bstrategy: no bop_strategy bp=%p", bp)); BO_STRATEGY(bp->b_bufobj, bp); } static __inline void buf_start(struct buf *bp) { if (bioops.io_start) (*bioops.io_start)(bp); } static __inline void buf_complete(struct buf *bp) { if (bioops.io_complete) (*bioops.io_complete)(bp); } static __inline void buf_deallocate(struct buf *bp) { if (bioops.io_deallocate) (*bioops.io_deallocate)(bp); BUF_LOCKFREE(bp); } static __inline int buf_countdeps(struct buf *bp, int i) { if (bioops.io_countdeps) return ((*bioops.io_countdeps)(bp, i)); else return (0); } #endif /* _KERNEL */ /* * Zero out the buffer's data area. */ #define clrbuf(bp) { \ bzero((bp)->b_data, (u_int)(bp)->b_bcount); \ (bp)->b_resid = 0; \ } /* * Flags for getblk's last parameter. */ #define GB_LOCK_NOWAIT 0x0001 /* Fail if we block on a buf lock. */ #define GB_NOCREAT 0x0002 /* Don't create a buf if not found. */ +#define GB_NOWAIT_BD 0x0004 /* Do not wait for bufdaemon */ #ifdef _KERNEL extern int nbuf; /* The number of buffer headers */ extern int maxswzone; /* Max KVA for swap structures */ extern int maxbcache; /* Max KVA for buffer cache */ extern int runningbufspace; extern int hibufspace; extern int dirtybufthresh; extern int bdwriteskip; extern int dirtybufferflushes; extern int altbufferflushes; extern int buf_maxio; /* nominal maximum I/O for buffer */ extern struct buf *buf; /* The buffer headers. */ extern char *buffers; /* The buffer contents. */ extern int bufpages; /* Number of memory pages in the buffer pool. */ extern struct buf *swbuf; /* Swap I/O buffer headers. */ extern int nswbuf; /* Number of swap I/O buffer headers. */ extern int cluster_pbuf_freecnt; /* Number of pbufs for clusters */ extern int vnode_pbuf_freecnt; /* Number of pbufs for vnode pager */ void runningbufwakeup(struct buf *); void waitrunningbufspace(void); caddr_t kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est); void bufinit(void); void bwillwrite(void); int buf_dirty_count_severe(void); void bremfree(struct buf *); void bremfreef(struct buf *); /* XXX Force bremfree, only for nfs. */ int bread(struct vnode *, daddr_t, int, struct ucred *, struct buf **); void breada(struct vnode *, daddr_t *, int *, int, struct ucred *); int breadn(struct vnode *, daddr_t, int, daddr_t *, int *, int, struct ucred *, struct buf **); void bdwrite(struct buf *); void bawrite(struct buf *); void bdirty(struct buf *); void bundirty(struct buf *); void bufstrategy(struct bufobj *, struct buf *); void brelse(struct buf *); void bqrelse(struct buf *); int vfs_bio_awrite(struct buf *); struct buf * getpbuf(int *); struct buf *incore(struct bufobj *, daddr_t); struct buf *gbincore(struct bufobj *, daddr_t); struct buf *getblk(struct vnode *, daddr_t, int, int, int, int); -struct buf *geteblk(int); +struct buf *geteblk(int, int); int bufwait(struct buf *); int bufwrite(struct buf *); void bufdone(struct buf *); void bufdone_finish(struct buf *); int cluster_read(struct vnode *, u_quad_t, daddr_t, long, struct ucred *, long, int, struct buf **); int cluster_wbuild(struct vnode *, long, daddr_t, int); void cluster_write(struct vnode *, struct buf *, u_quad_t, int); void vfs_bio_set_validclean(struct buf *, int base, int size); void vfs_bio_clrbuf(struct buf *); void vfs_busy_pages(struct buf *, int clear_modify); void vfs_unbusy_pages(struct buf *); int vmapbuf(struct buf *); void vunmapbuf(struct buf *); void relpbuf(struct buf *, int *); void brelvp(struct buf *); void bgetvp(struct vnode *, struct buf *); void pbgetbo(struct bufobj *bo, struct buf *bp); void pbgetvp(struct vnode *, struct buf *); void pbrelbo(struct buf *); void pbrelvp(struct buf *); int allocbuf(struct buf *bp, int size); void reassignbuf(struct buf *); struct buf *trypbuf(int *); void bwait(struct buf *, u_char, const char *); void bdone(struct buf *); void bpin(struct buf *); void bunpin(struct buf *); void bunpin_wait(struct buf *); #endif /* _KERNEL */ #endif /* !_SYS_BUF_H_ */ Index: stable/7/sys/sys/proc.h =================================================================== --- stable/7/sys/sys/proc.h (revision 191812) +++ stable/7/sys/sys/proc.h (revision 191813) @@ -1,929 +1,930 @@ /*- * Copyright (c) 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)proc.h 8.15 (Berkeley) 5/19/95 * $FreeBSD$ */ #ifndef _SYS_PROC_H_ #define _SYS_PROC_H_ #include /* For struct callout. */ #include /* For struct klist. */ #ifndef _KERNEL #include #endif #include #include #include #include #include /* XXX. */ #include #include #include #include #include #ifndef _KERNEL #include /* For structs itimerval, timeval. */ #else #include #endif #include #include #include /* Machine-dependent proc substruct. */ /* * One structure allocated per session. * * List of locks * (m) locked by s_mtx mtx * (e) locked by proctree_lock sx * (c) const until freeing */ struct session { int s_count; /* (m) Ref cnt; pgrps in session. */ struct proc *s_leader; /* (m + e) Session leader. */ struct vnode *s_ttyvp; /* (m) Vnode of controlling tty. */ struct tty *s_ttyp; /* (m) Controlling tty. */ pid_t s_sid; /* (c) Session ID. */ /* (m) Setlogin() name: */ char s_login[roundup(MAXLOGNAME, sizeof(long))]; struct mtx s_mtx; /* Mutex to protect members. */ }; /* * One structure allocated per process group. * * List of locks * (m) locked by pg_mtx mtx * (e) locked by proctree_lock sx * (c) const until freeing */ struct pgrp { LIST_ENTRY(pgrp) pg_hash; /* (e) Hash chain. */ LIST_HEAD(, proc) pg_members; /* (m + e) Pointer to pgrp members. */ struct session *pg_session; /* (c) Pointer to session. */ struct sigiolst pg_sigiolst; /* (m) List of sigio sources. */ pid_t pg_id; /* (c) Process group id. */ int pg_jobc; /* (m) Job control process count. */ struct mtx pg_mtx; /* Mutex to protect members */ }; /* * pargs, used to hold a copy of the command line, if it had a sane length. */ struct pargs { u_int ar_ref; /* Reference count. */ u_int ar_length; /* Length. */ u_char ar_args[1]; /* Arguments. */ }; /*- * Description of a process. * * This structure contains the information needed to manage a thread of * control, known in UN*X as a process; it has references to substructures * containing descriptions of things that the process uses, but may share * with related processes. The process structure and the substructures * are always addressable except for those marked "(CPU)" below, * which might be addressable only on a processor on which the process * is running. * * Below is a key of locks used to protect each member of struct proc. The * lock is indicated by a reference to a specific character in parens in the * associated comment. * * - not yet protected * a - only touched by curproc or parent during fork/wait * b - created at fork, never changes * (exception aiods switch vmspaces, but they are also * marked 'P_SYSTEM' so hopefully it will be left alone) * c - locked by proc mtx * d - locked by allproc_lock lock * e - locked by proctree_lock lock * f - session mtx * g - process group mtx * h - callout_lock mtx * i - by curproc or the master session mtx * j - locked by proc slock * k - only accessed by curthread * k*- only accessed by curthread and from an interrupt * l - the attaching proc or attaching proc parent * m - Giant * n - not locked, lazy * o - ktrace lock * p - select lock (sellock) * q - td_contested lock * r - p_peers lock * t - thread lock * x - created at fork, only changes during single threading in exec * z - zombie threads lock * * If the locking key specifies two identifiers (for example, p_pptr) then * either lock is sufficient for read access, but both locks must be held * for write access. */ struct kaudit_record; struct td_sched; struct nlminfo; struct kaioinfo; struct p_sched; struct proc; struct sleepqueue; struct thread; struct trapframe; struct turnstile; struct mqueue_notifier; struct cpuset; struct kdtrace_proc; struct kdtrace_thread; /* * Here we define the two structures used for process information. * * The first is the thread. It might be thought of as a "Kernel * Schedulable Entity Context". * This structure contains all the information as to where a thread of * execution is now, or was when it was suspended, why it was suspended, * and anything else that will be needed to restart it when it is * rescheduled. It includes a scheduler specific substructure that is different * for each scheduler. * * M:N notes. * It is important to remember that when using M:N threading, * a particular thread structure may only exist as long as * the system call or kernel entrance (e.g. by pagefault) * which it is currently executing. It should therefore NEVER be referenced * by pointers in long lived structures that live longer than a single * request. If several threads complete their work at the same time, * they will all rewind their stacks to the user boundary, report their * completion state, and all but one will be freed. That last one will * be kept to provide a kernel stack and pcb for the NEXT syscall or kernel * entrance (basically to save freeing and then re-allocating it). The existing * thread keeps a cached spare thread available to allow it to quickly * get one when it needs a new one. There is also a system * cache of free threads. Threads have priority and partake in priority * inheritance schemes. * * The second is the proc (process) which owns all the resources of a process * other than CPU cycles, which are parceled out to the threads. */ /* * Kernel runnable context (thread). * This is what is put to sleep and reactivated. * Thread context. Processes may have multiple threads. */ struct thread { struct mtx *volatile td_lock; /* replaces sched lock */ struct proc *td_proc; /* (*) Associated process. */ TAILQ_ENTRY(thread) td_plist; /* (*) All threads in this proc. */ /* The two queues below should someday be merged. */ TAILQ_ENTRY(thread) td_slpq; /* (t) Sleep queue. */ TAILQ_ENTRY(thread) td_lockq; /* (t) Lock queue. */ TAILQ_HEAD(, selinfo) td_selq; /* (p) List of selinfos. */ struct sleepqueue *td_sleepqueue; /* (k) Associated sleep queue. */ struct turnstile *td_turnstile; /* (k) Associated turnstile. */ struct umtx_q *td_umtxq; /* (c?) Link for when we're blocked. */ lwpid_t td_tid; /* (b) Thread ID. */ sigqueue_t td_sigqueue; /* (c) Sigs arrived, not delivered. */ #define td_siglist td_sigqueue.sq_signals /* Cleared during fork1() or thread_schedule_upcall(). */ #define td_startzero td_flags int td_flags; /* (t) TDF_* flags. */ int td_inhibitors; /* (t) Why can not run. */ int td_pflags; /* (k) Private thread (TDP_*) flags. */ int td_dupfd; /* (k) Ret value from fdopen. XXX */ int td_sqqueue; /* (t) Sleepqueue queue blocked on. */ void *td_wchan; /* (t) Sleep address. */ const char *td_wmesg; /* (t) Reason for sleep. */ u_char td_lastcpu; /* (t) Last cpu we were on. */ u_char td_oncpu; /* (t) Which cpu we are on. */ volatile u_char td_owepreempt; /* (k*) Preempt on last critical_exit */ short td_locks; /* (k) Count of non-spin locks. */ u_char td_tsqueue; /* (t) Turnstile queue blocked on. */ struct turnstile *td_blocked; /* (t) Lock thread is blocked on. */ const char *td_lockname; /* (t) Name of lock blocked on. */ LIST_HEAD(, turnstile) td_contested; /* (q) Contested locks. */ struct lock_list_entry *td_sleeplocks; /* (k) Held sleep locks. */ int td_intr_nesting_level; /* (k) Interrupt recursion. */ int td_pinned; /* (k) Temporary cpu pin count. */ struct kse_thr_mailbox *td_mailbox; /* (*) Userland mailbox address. */ struct ucred *td_ucred; /* (k) Reference to credentials. */ struct thread *td_standin; /* (k + a) Use this for an upcall. */ struct kse_upcall *td_upcall; /* (k + t) Upcall structure. */ u_int td_estcpu; /* (t) estimated cpu utilization */ u_int td_slptick; /* (t) Time at sleep. */ struct rusage td_ru; /* (t) rusage information */ uint64_t td_runtime; /* (t) How many cpu ticks we've run. */ u_int td_pticks; /* (t) Statclock hits for profiling */ u_int td_sticks; /* (t) Statclock hits in system mode. */ u_int td_iticks; /* (t) Statclock hits in intr mode. */ u_int td_uticks; /* (t) Statclock hits in user mode. */ u_int td_uuticks; /* (k) Statclock hits (usr), for UTS. */ u_int td_usticks; /* (k) Statclock hits (sys), for UTS. */ int td_intrval; /* (t) Return value of TDF_INTERRUPT. */ sigset_t td_oldsigmask; /* (k) Saved mask from pre sigpause. */ sigset_t td_sigmask; /* (c) Current signal mask. */ volatile u_int td_generation; /* (k) For detection of preemption */ stack_t td_sigstk; /* (k) Stack ptr and on-stack flag. */ int td_kflags; /* (c) Flags for KSE threading. */ int td_xsig; /* (c) Signal for ptrace */ u_long td_profil_addr; /* (k) Temporary addr until AST. */ u_int td_profil_ticks; /* (k) Temporary ticks until AST. */ char td_name[MAXCOMLEN + 1]; /* (*) Thread name. */ #define td_endzero td_base_pri /* Copied during fork1() or thread_sched_upcall(). */ #define td_startcopy td_endzero u_char td_base_pri; /* (t) Thread base kernel priority. */ u_char td_priority; /* (t) Thread active priority. */ u_char td_pri_class; /* (t) Scheduling class. */ u_char td_user_pri; /* (t) User pri from estcpu and nice. */ u_char td_base_user_pri; /* (t) Base user pri */ #define td_endcopy td_pcb /* * Fields that must be manually set in fork1() or thread_sched_upcall() * or already have been set in the allocator, constructor, etc. */ struct pcb *td_pcb; /* (k) Kernel VA of pcb and kstack. */ enum { TDS_INACTIVE = 0x0, TDS_INHIBITED, TDS_CAN_RUN, TDS_RUNQ, TDS_RUNNING } td_state; /* (t) thread state */ register_t td_retval[2]; /* (k) Syscall aux returns. */ struct callout td_slpcallout; /* (h) Callout for sleep. */ struct trapframe *td_frame; /* (k) */ struct vm_object *td_kstack_obj;/* (a) Kstack object. */ vm_offset_t td_kstack; /* (a) Kernel VA of kstack. */ int td_kstack_pages; /* (a) Size of the kstack. */ struct vm_object *td_altkstack_obj;/* (a) Alternate kstack object. */ vm_offset_t td_altkstack; /* (a) Kernel VA of alternate kstack. */ int td_altkstack_pages; /* (a) Size of alternate kstack. */ volatile u_int td_critnest; /* (k*) Critical section nest level. */ struct mdthread td_md; /* (k) Any machine-dependent fields. */ struct td_sched *td_sched; /* (*) Scheduler-specific data. */ struct kaudit_record *td_ar; /* (k) Active audit record, if any. */ int td_syscalls; /* per-thread syscall count (used by NFS :)) */ uint64_t td_incruntime; /* (t) Cpu ticks to transfer to proc. */ struct cpuset *td_cpuset; /* (t) CPU affinity mask. */ struct file *td_fpop; /* (k) file referencing cdev under op */ struct kdtrace_thread *td_dtrace; /* (*) DTrace-specific data. */ int td_errno; /* Error returned by last syscall. */ }; struct mtx *thread_lock_block(struct thread *); void thread_lock_unblock(struct thread *, struct mtx *); void thread_lock_set(struct thread *, struct mtx *); #define THREAD_LOCK_ASSERT(td, type) \ do { \ struct mtx *__m = (td)->td_lock; \ if (__m != &blocked_lock) \ mtx_assert(__m, (type)); \ } while (0) /* * Flags kept in td_flags: * To change these you MUST have the scheduler lock. */ #define TDF_BORROWING 0x00000001 /* Thread is borrowing pri from another. */ #define TDF_INPANIC 0x00000002 /* Caused a panic, let it drive crashdump. */ #define TDF_INMEM 0x00000004 /* Thread's stack is in memory. */ #define TDF_SINTR 0x00000008 /* Sleep is interruptible. */ #define TDF_TIMEOUT 0x00000010 /* Timing out during sleep. */ #define TDF_IDLETD 0x00000020 /* This is a per-CPU idle thread. */ #define TDF_SELECT 0x00000040 /* Selecting; wakeup/waiting danger. */ #define TDF_SLEEPABORT 0x00000080 /* sleepq_abort was called. */ #define TDF_UNUSEDx100 0x00000100 /* --available-- */ #define TDF_UBORROWING 0x00000200 /* Thread is borrowing user pri. */ #define TDF_BOUNDARY 0x00000400 /* Thread suspended at user boundary */ #define TDF_ASTPENDING 0x00000800 /* Thread has some asynchronous events. */ #define TDF_TIMOFAIL 0x00001000 /* Timeout from sleep after we were awake. */ #define TDF_INTERRUPT 0x00002000 /* Thread is marked as interrupted. */ #define TDF_UPIBLOCKED 0x00004000 /* Thread blocked on user PI mutex. */ #define TDF_UNUSED15 0x00008000 /* --available-- */ #define TDF_NEEDRESCHED 0x00010000 /* Thread needs to yield. */ #define TDF_NEEDSIGCHK 0x00020000 /* Thread may need signal delivery. */ #define TDF_XSIG 0x00040000 /* Thread is exchanging signal under trace */ #define TDF_UNUSED19 0x00080000 /* Thread is sleeping on a umtx. */ #define TDF_THRWAKEUP 0x00100000 /* Libthr thread must not suspend itself. */ #define TDF_DBSUSPEND 0x00200000 /* Thread is suspended by debugger */ #define TDF_SWAPINREQ 0x00400000 /* Swapin request due to wakeup. */ #define TDF_UNUSED23 0x00800000 /* --available-- */ #define TDF_SCHED0 0x01000000 /* Reserved for scheduler private use */ #define TDF_SCHED1 0x02000000 /* Reserved for scheduler private use */ #define TDF_SCHED2 0x04000000 /* Reserved for scheduler private use */ #define TDF_SCHED3 0x08000000 /* Reserved for scheduler private use */ #define TDF_ALRMPEND 0x10000000 /* Pending SIGVTALRM needs to be posted. */ #define TDF_PROFPEND 0x20000000 /* Pending SIGPROF needs to be posted. */ #define TDF_MACPEND 0x40000000 /* AST-based MAC event pending. */ /* * "Private" flags kept in td_pflags: * These are only accessed by curthread and thus need no locking. */ #define TDP_OLDMASK 0x00000001 /* Need to restore mask after suspend. */ #define TDP_INKTR 0x00000002 /* Thread is currently in KTR code. */ #define TDP_INKTRACE 0x00000004 /* Thread is currently in KTRACE code. */ #define TDP_UPCALLING 0x00000008 /* This thread is doing an upcall. */ #define TDP_COWINPROGRESS 0x00000010 /* Snapshot copy-on-write in progress. */ #define TDP_ALTSTACK 0x00000020 /* Have alternate signal stack. */ #define TDP_DEADLKTREAT 0x00000040 /* Lock aquisition - deadlock treatment. */ #define TDP_SA 0x00000080 /* A scheduler activation based thread. */ #define TDP_NOSLEEPING 0x00000100 /* Thread is not allowed to sleep on a sq. */ #define TDP_OWEUPC 0x00000200 /* Call addupc() at next AST. */ #define TDP_ITHREAD 0x00000400 /* Thread is an interrupt thread. */ #define TDP_CAN_UNBIND 0x00000800 /* Only temporarily bound. */ #define TDP_SCHED1 0x00001000 /* Reserved for scheduler private use */ #define TDP_SCHED2 0x00002000 /* Reserved for scheduler private use */ #define TDP_SCHED3 0x00004000 /* Reserved for scheduler private use */ #define TDP_SCHED4 0x00008000 /* Reserved for scheduler private use */ #define TDP_GEOM 0x00010000 /* Settle GEOM before finishing syscall */ #define TDP_SOFTDEP 0x00020000 /* Stuck processing softdep worklist */ #define TDP_NORUNNINGBUF 0x00040000 /* Ignore runningbufspace check */ #define TDP_WAKEUP 0x00080000 /* Don't sleep in umtx cond_wait */ #define TDP_INBDFLUSH 0x00100000 /* Already in BO_BDFLUSH, do not recurse */ #define TDP_IGNSUSP 0x00800000 /* Permission to ignore the MNTK_SUSPEND* */ #define TDP_AUDITREC 0x01000000 /* Audit record pending on thread */ +#define TDP_BUFNEED 0x02000000 /* Do not recurse into the buf flush */ /* * Reasons that the current thread can not be run yet. * More than one may apply. */ #define TDI_SUSPENDED 0x0001 /* On suspension queue. */ #define TDI_SLEEPING 0x0002 /* Actually asleep! (tricky). */ #define TDI_SWAPPED 0x0004 /* Stack not in mem. Bad juju if run. */ #define TDI_LOCK 0x0008 /* Stopped on a lock. */ #define TDI_IWAIT 0x0010 /* Awaiting interrupt. */ /* * flags (in kflags) related to M:N threading. */ #define TDK_KSEREL 0x0001 /* Blocked in msleep on p->p_completed. */ #define TDK_KSERELSIG 0x0002 /* Blocked in msleep on p->p_siglist. */ #define TDK_WAKEUP 0x0004 /* Thread has been woken by kse_wakeup. */ #define TD_CAN_UNBIND(td) \ (((td)->td_pflags & TDP_CAN_UNBIND) && \ ((td)->td_upcall != NULL)) #define TD_IS_SLEEPING(td) ((td)->td_inhibitors & TDI_SLEEPING) #define TD_ON_SLEEPQ(td) ((td)->td_wchan != NULL) #define TD_IS_SUSPENDED(td) ((td)->td_inhibitors & TDI_SUSPENDED) #define TD_IS_SWAPPED(td) ((td)->td_inhibitors & TDI_SWAPPED) #define TD_ON_LOCK(td) ((td)->td_inhibitors & TDI_LOCK) #define TD_AWAITING_INTR(td) ((td)->td_inhibitors & TDI_IWAIT) #define TD_IS_RUNNING(td) ((td)->td_state == TDS_RUNNING) #define TD_ON_RUNQ(td) ((td)->td_state == TDS_RUNQ) #define TD_CAN_RUN(td) ((td)->td_state == TDS_CAN_RUN) #define TD_IS_INHIBITED(td) ((td)->td_state == TDS_INHIBITED) #define TD_ON_UPILOCK(td) ((td)->td_flags & TDF_UPIBLOCKED) #if 0 #define TD_IS_IDLETHREAD(td) ((td) == pcpu(idlethread)) #else #define TD_IS_IDLETHREAD(td) ((td)->td_flags & TDF_IDLETD) #endif #define TD_SET_INHIB(td, inhib) do { \ (td)->td_state = TDS_INHIBITED; \ (td)->td_inhibitors |= (inhib); \ } while (0) #define TD_CLR_INHIB(td, inhib) do { \ if (((td)->td_inhibitors & (inhib)) && \ (((td)->td_inhibitors &= ~(inhib)) == 0)) \ (td)->td_state = TDS_CAN_RUN; \ } while (0) #define TD_SET_SLEEPING(td) TD_SET_INHIB((td), TDI_SLEEPING) #define TD_SET_SWAPPED(td) TD_SET_INHIB((td), TDI_SWAPPED) #define TD_SET_LOCK(td) TD_SET_INHIB((td), TDI_LOCK) #define TD_SET_SUSPENDED(td) TD_SET_INHIB((td), TDI_SUSPENDED) #define TD_SET_IWAIT(td) TD_SET_INHIB((td), TDI_IWAIT) #define TD_SET_EXITING(td) TD_SET_INHIB((td), TDI_EXITING) #define TD_CLR_SLEEPING(td) TD_CLR_INHIB((td), TDI_SLEEPING) #define TD_CLR_SWAPPED(td) TD_CLR_INHIB((td), TDI_SWAPPED) #define TD_CLR_LOCK(td) TD_CLR_INHIB((td), TDI_LOCK) #define TD_CLR_SUSPENDED(td) TD_CLR_INHIB((td), TDI_SUSPENDED) #define TD_CLR_IWAIT(td) TD_CLR_INHIB((td), TDI_IWAIT) #define TD_SET_RUNNING(td) (td)->td_state = TDS_RUNNING #define TD_SET_RUNQ(td) (td)->td_state = TDS_RUNQ #define TD_SET_CAN_RUN(td) (td)->td_state = TDS_CAN_RUN /* * An upcall is used when returning to userland. If a thread does not have * an upcall on return to userland the thread exports its context and exits. */ struct kse_upcall { TAILQ_ENTRY(kse_upcall) ku_link; /* List of upcalls in proc. */ struct proc *ku_proc; /* Associated proc. */ struct thread *ku_owner; /* Owning thread. */ int ku_flags; /* KUF_* flags. */ struct kse_mailbox *ku_mailbox; /* Userland mailbox address. */ stack_t ku_stack; /* Userland upcall stack. */ void *ku_func; /* Userland upcall function. */ unsigned int ku_mflags; /* Cached upcall mbox flags. */ }; #define KUF_DOUPCALL 0x00001 /* Do upcall now; don't wait. */ #define KUF_EXITING 0x00002 /* Upcall structure is exiting. */ /* * XXX: Does this belong in resource.h or resourcevar.h instead? * Resource usage extension. The times in rusage structs in the kernel are * never up to date. The actual times are kept as runtimes and tick counts * (with control info in the "previous" times), and are converted when * userland asks for rusage info. Backwards compatibility prevents putting * this directly in the user-visible rusage struct. * * Locking: (cj) means (j) for p_rux and (c) for p_crux. */ struct rusage_ext { u_int64_t rux_runtime; /* (cj) Real time. */ u_int64_t rux_uticks; /* (cj) Statclock hits in user mode. */ u_int64_t rux_sticks; /* (cj) Statclock hits in sys mode. */ u_int64_t rux_iticks; /* (cj) Statclock hits in intr mode. */ u_int64_t rux_uu; /* (c) Previous user time in usec. */ u_int64_t rux_su; /* (c) Previous sys time in usec. */ u_int64_t rux_tu; /* (c) Previous total time in usec. */ }; /* * The old fashionned process. May have multiple threads. * Starts off with a single embedded THREAD. */ struct proc { LIST_ENTRY(proc) p_list; /* (d) List of all processes. */ TAILQ_HEAD(, thread) p_threads; /* (j) all threads. */ TAILQ_HEAD(, kse_upcall) p_upcalls; /* (j) All upcalls in the proc. */ struct mtx p_slock; /* process spin lock */ struct ucred *p_ucred; /* (c) Process owner's identity. */ struct filedesc *p_fd; /* (b) Open files. */ struct filedesc_to_leader *p_fdtol; /* (b) Tracking node */ /* Accumulated stats for all threads? */ struct pstats *p_stats; /* (b) Accounting/statistics (CPU). */ struct plimit *p_limit; /* (c) Process limits. */ struct callout p_limco; /* (c) Limit callout handle */ struct sigacts *p_sigacts; /* (x) Signal actions, state (CPU). */ /* * The following don't make too much sense. * See the td_ or ke_ versions of the same flags. */ int p_flag; /* (c) P_* flags. */ enum { PRS_NEW = 0, /* In creation */ PRS_NORMAL, /* threads can be run. */ PRS_ZOMBIE } p_state; /* (j/c) S* process status. */ pid_t p_pid; /* (b) Process identifier. */ LIST_ENTRY(proc) p_hash; /* (d) Hash chain. */ LIST_ENTRY(proc) p_pglist; /* (g + e) List of processes in pgrp. */ struct proc *p_pptr; /* (c + e) Pointer to parent process. */ LIST_ENTRY(proc) p_sibling; /* (e) List of sibling processes. */ LIST_HEAD(, proc) p_children; /* (e) Pointer to list of children. */ struct mtx p_mtx; /* (n) Lock for this struct. */ struct ksiginfo *p_ksi; /* Locked by parent proc lock */ sigqueue_t p_sigqueue; /* (c) Sigs not delivered to a td. */ #define p_siglist p_sigqueue.sq_signals /* The following fields are all zeroed upon creation in fork. */ #define p_startzero p_oppid pid_t p_oppid; /* (c + e) Save ppid in ptrace. XXX */ struct vmspace *p_vmspace; /* (b) Address space. */ u_int p_swtick; /* (j) Tick when swapped in or out. */ struct itimerval p_realtimer; /* (c) Alarm timer. */ struct rusage p_ru; /* (a) Exit information. */ struct rusage_ext p_rux; /* (cj) Internal resource usage. */ struct rusage_ext p_crux; /* (c) Internal child resource usage. */ int p_profthreads; /* (c) Num threads in addupc_task. */ volatile int p_exitthreads; /* (j) Number of threads exiting */ int p_traceflag; /* (o) Kernel trace points. */ struct vnode *p_tracevp; /* (c + o) Trace to vnode. */ struct ucred *p_tracecred; /* (o) Credentials to trace with. */ struct vnode *p_textvp; /* (b) Vnode of executable. */ char p_lock; /* (c) Proclock (prevent swap) count. */ struct sigiolst p_sigiolst; /* (c) List of sigio sources. */ int p_sigparent; /* (c) Signal to parent on exit. */ int p_sig; /* (n) For core dump/debugger XXX. */ u_long p_code; /* (n) For core dump/debugger XXX. */ u_int p_stops; /* (c) Stop event bitmask. */ u_int p_stype; /* (c) Stop event type. */ char p_step; /* (c) Process is stopped. */ u_char p_pfsflags; /* (c) Procfs flags. */ struct nlminfo *p_nlminfo; /* (?) Only used by/for lockd. */ struct kaioinfo *p_aioinfo; /* (c) ASYNC I/O info. */ struct thread *p_singlethread;/* (c + j) If single threading this is it */ int p_suspcount; /* (j) Num threads in suspended mode. */ struct thread *p_xthread; /* (c) Trap thread */ int p_boundary_count;/* (c) Num threads at user boundary */ int p_pendingcnt; /* how many signals are pending */ struct itimers *p_itimers; /* (c) POSIX interval timers. */ int p_numupcalls; /* (j) Num upcalls. */ int p_upsleeps; /* (c) Num threads in kse_release(). */ struct kse_thr_mailbox *p_completed; /* (c) Completed thread mboxes. */ int p_nextupcall; /* (n) Next upcall time. */ int p_upquantum; /* (n) Quantum to schedule an upcall. */ /* End area that is zeroed on creation. */ #define p_endzero p_magic /* The following fields are all copied upon creation in fork. */ #define p_startcopy p_endzero u_int p_magic; /* (b) Magic number. */ int p_osrel; /* (x) osreldate for the binary (from ELF note, if any) */ char p_comm[MAXCOMLEN + 1]; /* (b) Process name. */ struct pgrp *p_pgrp; /* (c + e) Pointer to process group. */ struct sysentvec *p_sysent; /* (b) Syscall dispatch info. */ struct pargs *p_args; /* (c) Process arguments. */ rlim_t p_cpulimit; /* (c) Current CPU limit in seconds. */ signed char p_nice; /* (c + j) Process "nice" value. */ int p_fibnum; /* in this routing domain XXX MRT */ /* End area that is copied on creation. */ #define p_endcopy p_xstat u_short p_xstat; /* (c) Exit status; also stop sig. */ struct knlist p_klist; /* (c) Knotes attached to this proc. */ int p_numthreads; /* (j) Number of threads. */ struct mdproc p_md; /* Any machine-dependent fields. */ struct callout p_itcallout; /* (h + c) Interval timer callout. */ u_short p_acflag; /* (c) Accounting flags. */ struct proc *p_peers; /* (r) */ struct proc *p_leader; /* (b) */ void *p_emuldata; /* (c) Emulator state data. */ struct label *p_label; /* (*) Proc (not subject) MAC label. */ struct p_sched *p_sched; /* (*) Scheduler-specific data. */ STAILQ_HEAD(, ktr_request) p_ktr; /* (o) KTR event queue. */ LIST_HEAD(, mqueue_notifier) p_mqnotifier; /* (c) mqueue notifiers.*/ struct kdtrace_proc *p_dtrace; /* (*) DTrace-specific data. */ }; #define p_session p_pgrp->pg_session #define p_pgid p_pgrp->pg_id #define NOCPU 0xff /* For when we aren't on a CPU. */ #define PROC_SLOCK(p) mtx_lock_spin(&(p)->p_slock) #define PROC_SUNLOCK(p) mtx_unlock_spin(&(p)->p_slock) #define PROC_SLOCK_ASSERT(p, type) mtx_assert(&(p)->p_slock, (type)) /* These flags are kept in p_flag. */ #define P_ADVLOCK 0x00001 /* Process may hold a POSIX advisory lock. */ #define P_CONTROLT 0x00002 /* Has a controlling terminal. */ #define P_KTHREAD 0x00004 /* Kernel thread (*). */ #define P_NOLOAD 0x00008 /* Ignore during load avg calculations. */ #define P_PPWAIT 0x00010 /* Parent is waiting for child to exec/exit. */ #define P_PROFIL 0x00020 /* Has started profiling. */ #define P_STOPPROF 0x00040 /* Has thread requesting to stop profiling. */ #define P_HADTHREADS 0x00080 /* Has had threads (no cleanup shortcuts) */ #define P_SUGID 0x00100 /* Had set id privileges since last exec. */ #define P_SYSTEM 0x00200 /* System proc: no sigs, stats or swapping. */ #define P_SINGLE_EXIT 0x00400 /* Threads suspending should exit, not wait. */ #define P_TRACED 0x00800 /* Debugged process being traced. */ #define P_WAITED 0x01000 /* Someone is waiting for us. */ #define P_WEXIT 0x02000 /* Working on exiting. */ #define P_EXEC 0x04000 /* Process called exec. */ #define P_SA 0x08000 /* Using scheduler activations. */ #define P_CONTINUED 0x10000 /* Proc has continued from a stopped state. */ #define P_STOPPED_SIG 0x20000 /* Stopped due to SIGSTOP/SIGTSTP. */ #define P_STOPPED_TRACE 0x40000 /* Stopped because of tracing. */ #define P_STOPPED_SINGLE 0x80000 /* Only 1 thread can continue (not to user). */ #define P_PROTECTED 0x100000 /* Do not kill on memory overcommit. */ #define P_SIGEVENT 0x200000 /* Process pending signals changed. */ #define P_SINGLE_BOUNDARY 0x400000 /* Threads should suspend at user boundary. */ #define P_HWPMC 0x800000 /* Process is using HWPMCs */ #define P_JAILED 0x1000000 /* Process is in jail. */ #define P_INEXEC 0x4000000 /* Process is in execve(). */ #define P_STATCHILD 0x8000000 /* Child process stopped or exited. */ #define P_INMEM 0x10000000 /* Loaded into memory. */ #define P_SWAPPINGOUT 0x20000000 /* Process is being swapped out. */ #define P_SWAPPINGIN 0x40000000 /* Process is being swapped in. */ #define P_STOPPED (P_STOPPED_SIG|P_STOPPED_SINGLE|P_STOPPED_TRACE) #define P_SHOULDSTOP(p) ((p)->p_flag & P_STOPPED) /* * These were process status values (p_stat), now they are only used in * legacy conversion code. */ #define SIDL 1 /* Process being created by fork. */ #define SRUN 2 /* Currently runnable. */ #define SSLEEP 3 /* Sleeping on an address. */ #define SSTOP 4 /* Process debugging or suspension. */ #define SZOMB 5 /* Awaiting collection by parent. */ #define SWAIT 6 /* Waiting for interrupt. */ #define SLOCK 7 /* Blocked on a lock. */ #define P_MAGIC 0xbeefface #ifdef _KERNEL /* Flags for mi_switch(). */ #define SW_VOL 0x0001 /* Voluntary switch. */ #define SW_INVOL 0x0002 /* Involuntary switch. */ #define SW_PREEMPT 0x0004 /* The invol switch is a preemption */ /* How values for thread_single(). */ #define SINGLE_NO_EXIT 0 #define SINGLE_EXIT 1 #define SINGLE_BOUNDARY 2 /* XXXKSE: Missing values for thread_suspend_check(). */ #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_PARGS); MALLOC_DECLARE(M_PGRP); MALLOC_DECLARE(M_SESSION); MALLOC_DECLARE(M_SUBPROC); MALLOC_DECLARE(M_ZOMBIE); #endif #define FOREACH_PROC_IN_SYSTEM(p) \ LIST_FOREACH((p), &allproc, p_list) #define FOREACH_THREAD_IN_PROC(p, td) \ TAILQ_FOREACH((td), &(p)->p_threads, td_plist) #define FOREACH_UPCALL_IN_PROC(p, ku) \ TAILQ_FOREACH((ku), &(p)->p_upcalls, ku_link) /* XXXKSE the following lines should probably only be used in 1:1 code: */ #define FIRST_THREAD_IN_PROC(p) TAILQ_FIRST(&(p)->p_threads) /* * We use process IDs <= PID_MAX; PID_MAX + 1 must also fit in a pid_t, * as it is used to represent "no process group". */ #define PID_MAX 99999 #define NO_PID 100000 #define SESS_LEADER(p) ((p)->p_session->s_leader == (p)) #define SESSHOLD(s) ((s)->s_count++) #define SESSRELE(s) sessrele(s) #define STOPEVENT(p, e, v) do { \ if ((p)->p_stops & (e)) { \ PROC_LOCK(p); \ stopevent((p), (e), (v)); \ PROC_UNLOCK(p); \ } \ } while (0) #define _STOPEVENT(p, e, v) do { \ PROC_LOCK_ASSERT(p, MA_OWNED); \ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, &p->p_mtx.lock_object, \ "checking stopevent %d", (e)); \ if ((p)->p_stops & (e)) \ stopevent((p), (e), (v)); \ } while (0) /* Lock and unlock a process. */ #define PROC_LOCK(p) mtx_lock(&(p)->p_mtx) #define PROC_TRYLOCK(p) mtx_trylock(&(p)->p_mtx) #define PROC_UNLOCK(p) mtx_unlock(&(p)->p_mtx) #define PROC_LOCKED(p) mtx_owned(&(p)->p_mtx) #define PROC_LOCK_ASSERT(p, type) mtx_assert(&(p)->p_mtx, (type)) /* Lock and unlock a process group. */ #define PGRP_LOCK(pg) mtx_lock(&(pg)->pg_mtx) #define PGRP_UNLOCK(pg) mtx_unlock(&(pg)->pg_mtx) #define PGRP_LOCKED(pg) mtx_owned(&(pg)->pg_mtx) #define PGRP_LOCK_ASSERT(pg, type) mtx_assert(&(pg)->pg_mtx, (type)) #define PGRP_LOCK_PGSIGNAL(pg) do { \ if ((pg) != NULL) \ PGRP_LOCK(pg); \ } while (0) #define PGRP_UNLOCK_PGSIGNAL(pg) do { \ if ((pg) != NULL) \ PGRP_UNLOCK(pg); \ } while (0) /* Lock and unlock a session. */ #define SESS_LOCK(s) mtx_lock(&(s)->s_mtx) #define SESS_UNLOCK(s) mtx_unlock(&(s)->s_mtx) #define SESS_LOCKED(s) mtx_owned(&(s)->s_mtx) #define SESS_LOCK_ASSERT(s, type) mtx_assert(&(s)->s_mtx, (type)) /* Hold process U-area in memory, normally for ptrace/procfs work. */ #define PHOLD(p) do { \ PROC_LOCK(p); \ _PHOLD(p); \ PROC_UNLOCK(p); \ } while (0) #define _PHOLD(p) do { \ PROC_LOCK_ASSERT((p), MA_OWNED); \ KASSERT(!((p)->p_flag & P_WEXIT) || (p) == curproc, \ ("PHOLD of exiting process")); \ (p)->p_lock++; \ if (((p)->p_flag & P_INMEM) == 0) \ faultin((p)); \ } while (0) #define PROC_ASSERT_HELD(p) do { \ KASSERT((p)->p_lock > 0, ("process not held")); \ } while (0) #define PRELE(p) do { \ PROC_LOCK((p)); \ _PRELE((p)); \ PROC_UNLOCK((p)); \ } while (0) #define _PRELE(p) do { \ PROC_LOCK_ASSERT((p), MA_OWNED); \ (--(p)->p_lock); \ if (((p)->p_flag & P_WEXIT) && (p)->p_lock == 0) \ wakeup(&(p)->p_lock); \ } while (0) #define PROC_ASSERT_NOT_HELD(p) do { \ KASSERT((p)->p_lock == 0, ("process held")); \ } while (0) /* Check whether a thread is safe to be swapped out. */ #define thread_safetoswapout(td) (TD_IS_SLEEPING(td) || TD_IS_SUSPENDED(td)) /* Control whether or not it is safe for curthread to sleep. */ #define THREAD_NO_SLEEPING() do { \ KASSERT(!(curthread->td_pflags & TDP_NOSLEEPING), \ ("nested no sleeping")); \ curthread->td_pflags |= TDP_NOSLEEPING; \ } while (0) #define THREAD_SLEEPING_OK() do { \ KASSERT((curthread->td_pflags & TDP_NOSLEEPING), \ ("nested sleeping ok")); \ curthread->td_pflags &= ~TDP_NOSLEEPING; \ } while (0) #define PIDHASH(pid) (&pidhashtbl[(pid) & pidhash]) extern LIST_HEAD(pidhashhead, proc) *pidhashtbl; extern u_long pidhash; #define PGRPHASH(pgid) (&pgrphashtbl[(pgid) & pgrphash]) extern LIST_HEAD(pgrphashhead, pgrp) *pgrphashtbl; extern u_long pgrphash; extern struct sx allproc_lock; extern struct sx proctree_lock; extern struct mtx ppeers_lock; extern struct proc proc0; /* Process slot for swapper. */ extern struct thread thread0; /* Primary thread in proc0. */ extern struct vmspace vmspace0; /* VM space for proc0. */ extern int hogticks; /* Limit on kernel cpu hogs. */ extern int lastpid; extern int nprocs, maxproc; /* Current and max number of procs. */ extern int maxprocperuid; /* Max procs per uid. */ extern u_long ps_arg_cache_limit; LIST_HEAD(proclist, proc); TAILQ_HEAD(procqueue, proc); TAILQ_HEAD(threadqueue, thread); extern struct proclist allproc; /* List of all processes. */ extern struct proclist zombproc; /* List of zombie processes. */ extern struct proc *initproc, *pageproc; /* Process slots for init, pager. */ extern struct uma_zone *proc_zone; struct proc *pfind(pid_t); /* Find process by id. */ struct pgrp *pgfind(pid_t); /* Find process group by id. */ struct proc *zpfind(pid_t); /* Find zombie process by id. */ void ast(struct trapframe *framep); struct thread *choosethread(void); int cr_cansignal(struct ucred *cred, struct proc *proc, int signum); int enterpgrp(struct proc *p, pid_t pgid, struct pgrp *pgrp, struct session *sess); int enterthispgrp(struct proc *p, struct pgrp *pgrp); void faultin(struct proc *p); void fixjobc(struct proc *p, struct pgrp *pgrp, int entering); int fork1(struct thread *, int, int, struct proc **); void fork_exit(void (*)(void *, struct trapframe *), void *, struct trapframe *); void fork_return(struct thread *, struct trapframe *); int inferior(struct proc *p); void kick_proc0(void); int leavepgrp(struct proc *p); int maybe_preempt(struct thread *td); void mi_switch(int flags, struct thread *newtd); int p_candebug(struct thread *td, struct proc *p); int p_cansee(struct thread *td, struct proc *p); int p_cansched(struct thread *td, struct proc *p); int p_cansignal(struct thread *td, struct proc *p, int signum); int p_canwait(struct thread *td, struct proc *p); struct pargs *pargs_alloc(int len); void pargs_drop(struct pargs *pa); void pargs_hold(struct pargs *pa); void procinit(void); void proc_linkup0(struct proc *p, struct thread *td); void proc_linkup(struct proc *p, struct thread *td); void proc_reparent(struct proc *child, struct proc *newparent); struct pstats *pstats_alloc(void); void pstats_fork(struct pstats *src, struct pstats *dst); void pstats_free(struct pstats *ps); int securelevel_ge(struct ucred *cr, int level); int securelevel_gt(struct ucred *cr, int level); void sessrele(struct session *); int setrunnable(struct thread *); void setsugid(struct proc *p); int sigonstack(size_t sp); void sleepinit(void); void stopevent(struct proc *, u_int, u_int); void threadinit(void); void cpu_idle(void); extern void (*cpu_idle_hook)(void); /* Hook to machdep CPU idler. */ void cpu_switch(struct thread *, struct thread *, struct mtx *); void cpu_throw(struct thread *, struct thread *) __dead2; void unsleep(struct thread *); void userret(struct thread *, struct trapframe *); void cpu_exit(struct thread *); void exit1(struct thread *, int) __dead2; void cpu_fork(struct thread *, struct proc *, struct thread *, int); void cpu_set_fork_handler(struct thread *, void (*)(void *), void *); /* New in KSE. */ #ifdef KSE void kse_unlink(struct thread *); void kseinit(void); void upcall_reap(void); void upcall_remove(struct thread *td); #endif void cpu_set_upcall(struct thread *td, struct thread *td0); void cpu_set_upcall_kse(struct thread *, void (*)(void *), void *, stack_t *); int cpu_set_user_tls(struct thread *, void *tls_base); void cpu_thread_alloc(struct thread *); void cpu_thread_clean(struct thread *); void cpu_thread_exit(struct thread *); void cpu_thread_free(struct thread *); void cpu_thread_swapin(struct thread *); void cpu_thread_swapout(struct thread *); struct thread *thread_alloc(void); void thread_continued(struct proc *p); void thread_exit(void) __dead2; int thread_export_context(struct thread *td, int willexit); void thread_free(struct thread *td); void thread_link(struct thread *td, struct proc *p); void thread_reap(void); void thread_signal_add(struct thread *td, ksiginfo_t *); int thread_single(int how); void thread_single_end(void); void thread_stash(struct thread *td); int thread_statclock(int user); void thread_stopped(struct proc *p); void childproc_stopped(struct proc *child, int reason); void childproc_continued(struct proc *child); void childproc_exited(struct proc *child); int thread_suspend_check(int how); void thread_suspend_switch(struct thread *); void thread_suspend_one(struct thread *td); struct thread *thread_switchout(struct thread *td, int flags, struct thread *newtd); void thread_unlink(struct thread *td); void thread_unsuspend(struct proc *p); int thread_unsuspend_one(struct thread *td); void thread_unthread(struct thread *td); int thread_userret(struct thread *td, struct trapframe *frame); void thread_user_enter(struct thread *td); void thread_wait(struct proc *p); struct thread *thread_find(struct proc *p, lwpid_t tid); void thr_exit1(void); #endif /* _KERNEL */ #endif /* !_SYS_PROC_H_ */ Index: stable/7/sys/ufs/ffs/ffs_vfsops.c =================================================================== --- stable/7/sys/ufs/ffs/ffs_vfsops.c (revision 191812) +++ stable/7/sys/ufs/ffs/ffs_vfsops.c (revision 191813) @@ -1,1978 +1,1981 @@ /*- * Copyright (c) 1989, 1991, 1993, 1994 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_vfsops.c 8.31 (Berkeley) 5/20/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_mac.h" #include "opt_quota.h" #include "opt_ufs.h" #include "opt_ffs.h" #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static uma_zone_t uma_inode, uma_ufs1, uma_ufs2; static int ffs_reload(struct mount *, struct thread *); static int ffs_mountfs(struct vnode *, struct mount *, struct thread *); static void ffs_oldfscompat_read(struct fs *, struct ufsmount *, ufs2_daddr_t); static void ffs_oldfscompat_write(struct fs *, struct ufsmount *); static void ffs_ifree(struct ufsmount *ump, struct inode *ip); static vfs_init_t ffs_init; static vfs_uninit_t ffs_uninit; static vfs_extattrctl_t ffs_extattrctl; static vfs_cmount_t ffs_cmount; static vfs_unmount_t ffs_unmount; static vfs_mount_t ffs_mount; static vfs_statfs_t ffs_statfs; static vfs_fhtovp_t ffs_fhtovp; static vfs_sync_t ffs_sync; static struct vfsops ufs_vfsops = { .vfs_extattrctl = ffs_extattrctl, .vfs_fhtovp = ffs_fhtovp, .vfs_init = ffs_init, .vfs_mount = ffs_mount, .vfs_cmount = ffs_cmount, .vfs_quotactl = ufs_quotactl, .vfs_root = ufs_root, .vfs_statfs = ffs_statfs, .vfs_sync = ffs_sync, .vfs_uninit = ffs_uninit, .vfs_unmount = ffs_unmount, .vfs_vget = ffs_vget, .vfs_susp_clean = process_deferred_inactive, }; VFS_SET(ufs_vfsops, ufs, 0); MODULE_VERSION(ufs, 1); static b_strategy_t ffs_geom_strategy; static b_write_t ffs_bufwrite; static struct buf_ops ffs_ops = { .bop_name = "FFS", .bop_write = ffs_bufwrite, .bop_strategy = ffs_geom_strategy, .bop_sync = bufsync, #ifdef NO_FFS_SNAPSHOT .bop_bdflush = bufbdflush, #else .bop_bdflush = ffs_bdflush, #endif }; static const char *ffs_opts[] = { "acls", "async", "noatime", "noclusterr", "noclusterw", "noexec", "export", "force", "from", "multilabel", "snapshot", "nosuid", "suiddir", "nosymfollow", "sync", "union", NULL }; static int ffs_mount(struct mount *mp, struct thread *td) { struct vnode *devvp; struct ufsmount *ump = 0; struct fs *fs; int error, flags; u_int mntorflags, mntandnotflags; mode_t accessmode; struct nameidata ndp; char *fspec; if (vfs_filteropt(mp->mnt_optnew, ffs_opts)) return (EINVAL); if (uma_inode == NULL) { uma_inode = uma_zcreate("FFS inode", sizeof(struct inode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_ufs1 = uma_zcreate("FFS1 dinode", sizeof(struct ufs1_dinode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_ufs2 = uma_zcreate("FFS2 dinode", sizeof(struct ufs2_dinode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); } fspec = vfs_getopts(mp->mnt_optnew, "from", &error); if (error) return (error); mntorflags = 0; mntandnotflags = 0; if (vfs_getopt(mp->mnt_optnew, "acls", NULL, NULL) == 0) mntorflags |= MNT_ACLS; if (vfs_getopt(mp->mnt_optnew, "snapshot", NULL, NULL) == 0) { mntorflags |= MNT_SNAPSHOT; /* * Once we have set the MNT_SNAPSHOT flag, do not * persist "snapshot" in the options list. */ vfs_deleteopt(mp->mnt_optnew, "snapshot"); vfs_deleteopt(mp->mnt_opt, "snapshot"); } MNT_ILOCK(mp); mp->mnt_flag = (mp->mnt_flag | mntorflags) & ~mntandnotflags; MNT_IUNLOCK(mp); /* * If updating, check whether changing from read-only to * read/write; if there is no device name, that's all we do. */ if (mp->mnt_flag & MNT_UPDATE) { ump = VFSTOUFS(mp); fs = ump->um_fs; devvp = ump->um_devvp; if (fs->fs_ronly == 0 && vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) { /* * Flush any dirty data and suspend filesystem. */ if ((error = vn_start_write(NULL, &mp, V_WAIT)) != 0) return (error); for (;;) { vn_finished_write(mp); if ((error = vfs_write_suspend(mp)) != 0) return (error); MNT_ILOCK(mp); if (mp->mnt_kern_flag & MNTK_SUSPENDED) { /* * Allow the secondary writes * to proceed. */ mp->mnt_kern_flag &= ~(MNTK_SUSPENDED | MNTK_SUSPEND2); wakeup(&mp->mnt_flag); MNT_IUNLOCK(mp); /* * Allow the curthread to * ignore the suspension to * synchronize on-disk state. */ curthread->td_pflags |= TDP_IGNSUSP; break; } MNT_IUNLOCK(mp); vn_start_write(NULL, &mp, V_WAIT); } /* * Check for and optionally get rid of files open * for writing. */ flags = WRITECLOSE; if (mp->mnt_flag & MNT_FORCE) flags |= FORCECLOSE; if (mp->mnt_flag & MNT_SOFTDEP) { error = softdep_flushfiles(mp, flags, td); } else { error = ffs_flushfiles(mp, flags, td); } if (error) { vfs_write_resume(mp); return (error); } if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { printf("%s: %s: blocks %jd files %d\n", fs->fs_fsmnt, "update error", (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0) fs->fs_clean = 1; if ((error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) { fs->fs_ronly = 0; fs->fs_clean = 0; vfs_write_resume(mp); return (error); } DROP_GIANT(); g_topology_lock(); g_access(ump->um_cp, 0, -1, 0); g_topology_unlock(); PICKUP_GIANT(); fs->fs_ronly = 1; MNT_ILOCK(mp); mp->mnt_flag |= MNT_RDONLY; MNT_IUNLOCK(mp); /* * Allow the writers to note that filesystem * is ro now. */ vfs_write_resume(mp); } if ((mp->mnt_flag & MNT_RELOAD) && (error = ffs_reload(mp, td)) != 0) return (error); if (fs->fs_ronly && !vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) { /* * If upgrade to read-write by non-root, then verify * that user has necessary permissions on the device. */ vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td); error = VOP_ACCESS(devvp, VREAD | VWRITE, td->td_ucred, td); if (error) error = priv_check(td, PRIV_VFS_MOUNT_PERM); if (error) { VOP_UNLOCK(devvp, 0, td); return (error); } VOP_UNLOCK(devvp, 0, td); fs->fs_flags &= ~FS_UNCLEAN; if (fs->fs_clean == 0) { fs->fs_flags |= FS_UNCLEAN; if ((mp->mnt_flag & MNT_FORCE) || ((fs->fs_flags & FS_NEEDSFSCK) == 0 && (fs->fs_flags & FS_DOSOFTDEP))) { printf("WARNING: %s was not %s\n", fs->fs_fsmnt, "properly dismounted"); } else { printf( "WARNING: R/W mount of %s denied. Filesystem is not clean - run fsck\n", fs->fs_fsmnt); return (EPERM); } } DROP_GIANT(); g_topology_lock(); /* * If we're the root device, we may not have an E count * yet, get it now. */ if (ump->um_cp->ace == 0) error = g_access(ump->um_cp, 0, 1, 1); else error = g_access(ump->um_cp, 0, 1, 0); g_topology_unlock(); PICKUP_GIANT(); if (error) return (error); if ((error = vn_start_write(NULL, &mp, V_WAIT)) != 0) return (error); fs->fs_ronly = 0; MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_RDONLY; MNT_IUNLOCK(mp); fs->fs_clean = 0; if ((error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) { vn_finished_write(mp); return (error); } /* check to see if we need to start softdep */ if ((fs->fs_flags & FS_DOSOFTDEP) && (error = softdep_mount(devvp, mp, fs, td->td_ucred))){ vn_finished_write(mp); return (error); } if (fs->fs_snapinum[0] != 0) ffs_snapshot_mount(mp); vn_finished_write(mp); } /* * Soft updates is incompatible with "async", * so if we are doing softupdates stop the user * from setting the async flag in an update. * Softdep_mount() clears it in an initial mount * or ro->rw remount. */ if (mp->mnt_flag & MNT_SOFTDEP) { /* XXX: Reset too late ? */ MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_ASYNC; MNT_IUNLOCK(mp); } /* * Keep MNT_ACLS flag if it is stored in superblock. */ if ((fs->fs_flags & FS_ACLS) != 0) { /* XXX: Set too late ? */ MNT_ILOCK(mp); mp->mnt_flag |= MNT_ACLS; MNT_IUNLOCK(mp); } /* * If this is a snapshot request, take the snapshot. */ if (mp->mnt_flag & MNT_SNAPSHOT) return (ffs_snapshot(mp, fspec)); } /* * Not an update, or updating the name: look up the name * and verify that it refers to a sensible disk device. */ NDINIT(&ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspec, td); if ((error = namei(&ndp)) != 0) return (error); NDFREE(&ndp, NDF_ONLY_PNBUF); devvp = ndp.ni_vp; if (!vn_isdisk(devvp, &error)) { vput(devvp); return (error); } /* * If mount by non-root, then verify that user has necessary * permissions on the device. */ accessmode = VREAD; if ((mp->mnt_flag & MNT_RDONLY) == 0) accessmode |= VWRITE; error = VOP_ACCESS(devvp, accessmode, td->td_ucred, td); if (error) error = priv_check(td, PRIV_VFS_MOUNT_PERM); if (error) { vput(devvp); return (error); } if (mp->mnt_flag & MNT_UPDATE) { /* * Update only * * If it's not the same vnode, or at least the same device * then it's not correct. */ if (devvp->v_rdev != ump->um_devvp->v_rdev) error = EINVAL; /* needs translation */ vput(devvp); if (error) return (error); } else { /* * New mount * * We need the name for the mount point (also used for * "last mounted on") copied in. If an error occurs, * the mount point is discarded by the upper level code. * Note that vfs_mount() populates f_mntonname for us. */ if ((error = ffs_mountfs(devvp, mp, td)) != 0) { vrele(devvp); return (error); } } vfs_mountedfrom(mp, fspec); return (0); } /* * Compatibility with old mount system call. */ static int ffs_cmount(struct mntarg *ma, void *data, int flags, struct thread *td) { struct ufs_args args; int error; if (data == NULL) return (EINVAL); error = copyin(data, &args, sizeof args); if (error) return (error); ma = mount_argsu(ma, "from", args.fspec, MAXPATHLEN); ma = mount_arg(ma, "export", &args.export, sizeof args.export); error = kernel_mount(ma, flags); return (error); } /* * Reload all incore data for a filesystem (used after running fsck on * the root filesystem and finding things to fix). The filesystem must * be mounted read-only. * * Things to do to update the mount: * 1) invalidate all cached meta-data. * 2) re-read superblock from disk. * 3) re-read summary information from disk. * 4) invalidate all inactive vnodes. * 5) invalidate all cached file data. * 6) re-read inode data for all active vnodes. */ static int ffs_reload(struct mount *mp, struct thread *td) { struct vnode *vp, *mvp, *devvp; struct inode *ip; void *space; struct buf *bp; struct fs *fs, *newfs; struct ufsmount *ump; ufs2_daddr_t sblockloc; int i, blks, size, error; int32_t *lp; if ((mp->mnt_flag & MNT_RDONLY) == 0) return (EINVAL); ump = VFSTOUFS(mp); /* * Step 1: invalidate all cached meta-data. */ devvp = VFSTOUFS(mp)->um_devvp; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td); if (vinvalbuf(devvp, 0, td, 0, 0) != 0) panic("ffs_reload: dirty1"); VOP_UNLOCK(devvp, 0, td); /* * Step 2: re-read superblock from disk. */ fs = VFSTOUFS(mp)->um_fs; if ((error = bread(devvp, btodb(fs->fs_sblockloc), fs->fs_sbsize, NOCRED, &bp)) != 0) return (error); newfs = (struct fs *)bp->b_data; if ((newfs->fs_magic != FS_UFS1_MAGIC && newfs->fs_magic != FS_UFS2_MAGIC) || newfs->fs_bsize > MAXBSIZE || newfs->fs_bsize < sizeof(struct fs)) { brelse(bp); return (EIO); /* XXX needs translation */ } /* * Copy pointer fields back into superblock before copying in XXX * new superblock. These should really be in the ufsmount. XXX * Note that important parameters (eg fs_ncg) are unchanged. */ newfs->fs_csp = fs->fs_csp; newfs->fs_maxcluster = fs->fs_maxcluster; newfs->fs_contigdirs = fs->fs_contigdirs; newfs->fs_active = fs->fs_active; /* The file system is still read-only. */ newfs->fs_ronly = 1; sblockloc = fs->fs_sblockloc; bcopy(newfs, fs, (u_int)fs->fs_sbsize); brelse(bp); mp->mnt_maxsymlinklen = fs->fs_maxsymlinklen; ffs_oldfscompat_read(fs, VFSTOUFS(mp), sblockloc); UFS_LOCK(ump); if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { printf("%s: reload pending error: blocks %jd files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } UFS_UNLOCK(ump); /* * Step 3: re-read summary information from disk. */ blks = howmany(fs->fs_cssize, fs->fs_fsize); space = fs->fs_csp; for (i = 0; i < blks; i += fs->fs_frag) { size = fs->fs_bsize; if (i + fs->fs_frag > blks) size = (blks - i) * fs->fs_fsize; error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), size, NOCRED, &bp); if (error) return (error); bcopy(bp->b_data, space, (u_int)size); space = (char *)space + size; brelse(bp); } /* * We no longer know anything about clusters per cylinder group. */ if (fs->fs_contigsumsize > 0) { lp = fs->fs_maxcluster; for (i = 0; i < fs->fs_ncg; i++) *lp++ = fs->fs_contigsumsize; } loop: MNT_ILOCK(mp); MNT_VNODE_FOREACH(vp, mp, mvp) { VI_LOCK(vp); if (vp->v_iflag & VI_DOOMED) { VI_UNLOCK(vp); continue; } MNT_IUNLOCK(mp); /* * Step 4: invalidate all cached file data. */ if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) { MNT_VNODE_FOREACH_ABORT(mp, mvp); goto loop; } if (vinvalbuf(vp, 0, td, 0, 0)) panic("ffs_reload: dirty2"); /* * Step 5: re-read inode data for all active vnodes. */ ip = VTOI(vp); error = bread(devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), (int)fs->fs_bsize, NOCRED, &bp); if (error) { VOP_UNLOCK(vp, 0, td); vrele(vp); MNT_VNODE_FOREACH_ABORT(mp, mvp); return (error); } ffs_load_inode(bp, ip, fs, ip->i_number); ip->i_effnlink = ip->i_nlink; brelse(bp); VOP_UNLOCK(vp, 0, td); vrele(vp); MNT_ILOCK(mp); } MNT_IUNLOCK(mp); return (0); } /* * Possible superblock locations ordered from most to least likely. */ static int sblock_try[] = SBLOCKSEARCH; /* * Common code for mount and mountroot */ static int ffs_mountfs(devvp, mp, td) struct vnode *devvp; struct mount *mp; struct thread *td; { struct ufsmount *ump; struct buf *bp; struct fs *fs; struct cdev *dev; void *space; ufs2_daddr_t sblockloc; int error, i, blks, size, ronly; int32_t *lp; struct ucred *cred; struct g_consumer *cp; struct mount *nmp; bp = NULL; ump = NULL; cred = td ? td->td_ucred : NOCRED; ronly = (mp->mnt_flag & MNT_RDONLY) != 0; dev = devvp->v_rdev; dev_ref(dev); DROP_GIANT(); g_topology_lock(); error = g_vfs_open(devvp, &cp, "ffs", ronly ? 0 : 1); /* * If we are a root mount, drop the E flag so fsck can do its magic. * We will pick it up again when we remount R/W. */ if (error == 0 && ronly && (mp->mnt_flag & MNT_ROOTFS)) error = g_access(cp, 0, 0, -1); g_topology_unlock(); PICKUP_GIANT(); VOP_UNLOCK(devvp, 0, td); if (error) goto out; if (devvp->v_rdev->si_iosize_max != 0) mp->mnt_iosize_max = devvp->v_rdev->si_iosize_max; if (mp->mnt_iosize_max > MAXPHYS) mp->mnt_iosize_max = MAXPHYS; devvp->v_bufobj.bo_private = cp; devvp->v_bufobj.bo_ops = &ffs_ops; fs = NULL; sblockloc = 0; /* * Try reading the superblock in each of its possible locations. */ for (i = 0; sblock_try[i] != -1; i++) { if ((SBLOCKSIZE % cp->provider->sectorsize) != 0) { error = EINVAL; vfs_mount_error(mp, "Invalid sectorsize %d for superblock size %d", cp->provider->sectorsize, SBLOCKSIZE); goto out; } if ((error = bread(devvp, btodb(sblock_try[i]), SBLOCKSIZE, cred, &bp)) != 0) goto out; fs = (struct fs *)bp->b_data; sblockloc = sblock_try[i]; if ((fs->fs_magic == FS_UFS1_MAGIC || (fs->fs_magic == FS_UFS2_MAGIC && (fs->fs_sblockloc == sblockloc || (fs->fs_old_flags & FS_FLAGS_UPDATED) == 0))) && fs->fs_bsize <= MAXBSIZE && fs->fs_bsize >= sizeof(struct fs)) break; brelse(bp); bp = NULL; } if (sblock_try[i] == -1) { error = EINVAL; /* XXX needs translation */ goto out; } fs->fs_fmod = 0; fs->fs_flags &= ~FS_INDEXDIRS; /* no support for directory indicies */ fs->fs_flags &= ~FS_UNCLEAN; if (fs->fs_clean == 0) { fs->fs_flags |= FS_UNCLEAN; if (ronly || (mp->mnt_flag & MNT_FORCE) || ((fs->fs_flags & FS_NEEDSFSCK) == 0 && (fs->fs_flags & FS_DOSOFTDEP))) { printf( "WARNING: %s was not properly dismounted\n", fs->fs_fsmnt); } else { printf( "WARNING: R/W mount of %s denied. Filesystem is not clean - run fsck\n", fs->fs_fsmnt); error = EPERM; goto out; } if ((fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) && (mp->mnt_flag & MNT_FORCE)) { printf("%s: lost blocks %jd files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } } if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { printf("%s: mount pending error: blocks %jd files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } if ((fs->fs_flags & FS_GJOURNAL) != 0) { #ifdef UFS_GJOURNAL /* * Get journal provider name. */ size = 1024; mp->mnt_gjprovider = malloc(size, M_UFSMNT, M_WAITOK); if (g_io_getattr("GJOURNAL::provider", cp, &size, mp->mnt_gjprovider) == 0) { mp->mnt_gjprovider = realloc(mp->mnt_gjprovider, size, M_UFSMNT, M_WAITOK); MNT_ILOCK(mp); mp->mnt_flag |= MNT_GJOURNAL; MNT_IUNLOCK(mp); } else { printf( "WARNING: %s: GJOURNAL flag on fs but no gjournal provider below\n", mp->mnt_stat.f_mntonname); free(mp->mnt_gjprovider, M_UFSMNT); mp->mnt_gjprovider = NULL; } #else printf( "WARNING: %s: GJOURNAL flag on fs but no UFS_GJOURNAL support\n", mp->mnt_stat.f_mntonname); #endif } else { mp->mnt_gjprovider = NULL; } ump = malloc(sizeof *ump, M_UFSMNT, M_WAITOK | M_ZERO); ump->um_cp = cp; ump->um_bo = &devvp->v_bufobj; ump->um_fs = malloc((u_long)fs->fs_sbsize, M_UFSMNT, M_WAITOK); if (fs->fs_magic == FS_UFS1_MAGIC) { ump->um_fstype = UFS1; ump->um_balloc = ffs_balloc_ufs1; } else { ump->um_fstype = UFS2; ump->um_balloc = ffs_balloc_ufs2; } ump->um_blkatoff = ffs_blkatoff; ump->um_truncate = ffs_truncate; ump->um_update = ffs_update; ump->um_valloc = ffs_valloc; ump->um_vfree = ffs_vfree; ump->um_ifree = ffs_ifree; ump->um_rdonly = ffs_rdonly; mtx_init(UFS_MTX(ump), "FFS", "FFS Lock", MTX_DEF); bcopy(bp->b_data, ump->um_fs, (u_int)fs->fs_sbsize); if (fs->fs_sbsize < SBLOCKSIZE) bp->b_flags |= B_INVAL | B_NOCACHE; brelse(bp); bp = NULL; fs = ump->um_fs; ffs_oldfscompat_read(fs, ump, sblockloc); fs->fs_ronly = ronly; size = fs->fs_cssize; blks = howmany(size, fs->fs_fsize); if (fs->fs_contigsumsize > 0) size += fs->fs_ncg * sizeof(int32_t); size += fs->fs_ncg * sizeof(u_int8_t); space = malloc((u_long)size, M_UFSMNT, M_WAITOK); fs->fs_csp = space; for (i = 0; i < blks; i += fs->fs_frag) { size = fs->fs_bsize; if (i + fs->fs_frag > blks) size = (blks - i) * fs->fs_fsize; if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), size, cred, &bp)) != 0) { free(fs->fs_csp, M_UFSMNT); goto out; } bcopy(bp->b_data, space, (u_int)size); space = (char *)space + size; brelse(bp); bp = NULL; } if (fs->fs_contigsumsize > 0) { fs->fs_maxcluster = lp = space; for (i = 0; i < fs->fs_ncg; i++) *lp++ = fs->fs_contigsumsize; space = lp; } size = fs->fs_ncg * sizeof(u_int8_t); fs->fs_contigdirs = (u_int8_t *)space; bzero(fs->fs_contigdirs, size); fs->fs_active = NULL; mp->mnt_data = (qaddr_t)ump; mp->mnt_stat.f_fsid.val[0] = fs->fs_id[0]; mp->mnt_stat.f_fsid.val[1] = fs->fs_id[1]; nmp = NULL; if (fs->fs_id[0] == 0 || fs->fs_id[1] == 0 || (nmp = vfs_getvfs(&mp->mnt_stat.f_fsid))) { if (nmp) vfs_rel(nmp); vfs_getnewfsid(mp); } mp->mnt_maxsymlinklen = fs->fs_maxsymlinklen; MNT_ILOCK(mp); mp->mnt_flag |= MNT_LOCAL; MNT_IUNLOCK(mp); if ((fs->fs_flags & FS_MULTILABEL) != 0) { #ifdef MAC MNT_ILOCK(mp); mp->mnt_flag |= MNT_MULTILABEL; MNT_IUNLOCK(mp); #else printf( "WARNING: %s: multilabel flag on fs but no MAC support\n", mp->mnt_stat.f_mntonname); #endif } if ((fs->fs_flags & FS_ACLS) != 0) { #ifdef UFS_ACL MNT_ILOCK(mp); mp->mnt_flag |= MNT_ACLS; MNT_IUNLOCK(mp); #else printf( "WARNING: %s: ACLs flag on fs but no ACLs support\n", mp->mnt_stat.f_mntonname); #endif } ump->um_mountp = mp; ump->um_dev = dev; ump->um_devvp = devvp; ump->um_nindir = fs->fs_nindir; ump->um_bptrtodb = fs->fs_fsbtodb; ump->um_seqinc = fs->fs_frag; for (i = 0; i < MAXQUOTAS; i++) ump->um_quotas[i] = NULLVP; #ifdef UFS_EXTATTR ufs_extattr_uepm_init(&ump->um_extattr); #endif /* * Set FS local "last mounted on" information (NULL pad) */ bzero(fs->fs_fsmnt, MAXMNTLEN); strlcpy(fs->fs_fsmnt, mp->mnt_stat.f_mntonname, MAXMNTLEN); if( mp->mnt_flag & MNT_ROOTFS) { /* * Root mount; update timestamp in mount structure. * this will be used by the common root mount code * to update the system clock. */ mp->mnt_time = fs->fs_time; } if (ronly == 0) { if ((fs->fs_flags & FS_DOSOFTDEP) && (error = softdep_mount(devvp, mp, fs, cred)) != 0) { free(fs->fs_csp, M_UFSMNT); goto out; } if (fs->fs_snapinum[0] != 0) ffs_snapshot_mount(mp); fs->fs_fmod = 1; fs->fs_clean = 0; (void) ffs_sbupdate(ump, MNT_WAIT, 0); } /* * Initialize filesystem stat information in mount struct. */ MNT_ILOCK(mp); mp->mnt_kern_flag |= MNTK_MPSAFE | MNTK_LOOKUP_SHARED; MNT_IUNLOCK(mp); #ifdef UFS_EXTATTR #ifdef UFS_EXTATTR_AUTOSTART /* * * Auto-starting does the following: * - check for /.attribute in the fs, and extattr_start if so * - for each file in .attribute, enable that file with * an attribute of the same name. * Not clear how to report errors -- probably eat them. * This would all happen while the filesystem was busy/not * available, so would effectively be "atomic". */ mp->mnt_stat.f_iosize = fs->fs_bsize; (void) ufs_extattr_autostart(mp, td); #endif /* !UFS_EXTATTR_AUTOSTART */ #endif /* !UFS_EXTATTR */ return (0); out: if (bp) brelse(bp); if (cp != NULL) { DROP_GIANT(); g_topology_lock(); g_vfs_close(cp, td); g_topology_unlock(); PICKUP_GIANT(); } if (ump) { mtx_destroy(UFS_MTX(ump)); if (mp->mnt_gjprovider != NULL) { free(mp->mnt_gjprovider, M_UFSMNT); mp->mnt_gjprovider = NULL; } free(ump->um_fs, M_UFSMNT); free(ump, M_UFSMNT); mp->mnt_data = (qaddr_t)0; } dev_rel(dev); return (error); } #include static int bigcgs = 0; SYSCTL_INT(_debug, OID_AUTO, bigcgs, CTLFLAG_RW, &bigcgs, 0, ""); /* * Sanity checks for loading old filesystem superblocks. * See ffs_oldfscompat_write below for unwound actions. * * XXX - Parts get retired eventually. * Unfortunately new bits get added. */ static void ffs_oldfscompat_read(fs, ump, sblockloc) struct fs *fs; struct ufsmount *ump; ufs2_daddr_t sblockloc; { off_t maxfilesize; /* * If not yet done, update fs_flags location and value of fs_sblockloc. */ if ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0) { fs->fs_flags = fs->fs_old_flags; fs->fs_old_flags |= FS_FLAGS_UPDATED; fs->fs_sblockloc = sblockloc; } /* * If not yet done, update UFS1 superblock with new wider fields. */ if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_maxbsize != fs->fs_bsize) { fs->fs_maxbsize = fs->fs_bsize; fs->fs_time = fs->fs_old_time; fs->fs_size = fs->fs_old_size; fs->fs_dsize = fs->fs_old_dsize; fs->fs_csaddr = fs->fs_old_csaddr; fs->fs_cstotal.cs_ndir = fs->fs_old_cstotal.cs_ndir; fs->fs_cstotal.cs_nbfree = fs->fs_old_cstotal.cs_nbfree; fs->fs_cstotal.cs_nifree = fs->fs_old_cstotal.cs_nifree; fs->fs_cstotal.cs_nffree = fs->fs_old_cstotal.cs_nffree; } if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_old_inodefmt < FS_44INODEFMT) { fs->fs_maxfilesize = ((uint64_t)1 << 31) - 1; fs->fs_qbmask = ~fs->fs_bmask; fs->fs_qfmask = ~fs->fs_fmask; } if (fs->fs_magic == FS_UFS1_MAGIC) { ump->um_savedmaxfilesize = fs->fs_maxfilesize; maxfilesize = (uint64_t)0x80000000 * fs->fs_bsize - 1; if (fs->fs_maxfilesize > maxfilesize) fs->fs_maxfilesize = maxfilesize; } /* Compatibility for old filesystems */ if (fs->fs_avgfilesize <= 0) fs->fs_avgfilesize = AVFILESIZ; if (fs->fs_avgfpdir <= 0) fs->fs_avgfpdir = AFPDIR; if (bigcgs) { fs->fs_save_cgsize = fs->fs_cgsize; fs->fs_cgsize = fs->fs_bsize; } } /* * Unwinding superblock updates for old filesystems. * See ffs_oldfscompat_read above for details. * * XXX - Parts get retired eventually. * Unfortunately new bits get added. */ static void ffs_oldfscompat_write(fs, ump) struct fs *fs; struct ufsmount *ump; { /* * Copy back UFS2 updated fields that UFS1 inspects. */ if (fs->fs_magic == FS_UFS1_MAGIC) { fs->fs_old_time = fs->fs_time; fs->fs_old_cstotal.cs_ndir = fs->fs_cstotal.cs_ndir; fs->fs_old_cstotal.cs_nbfree = fs->fs_cstotal.cs_nbfree; fs->fs_old_cstotal.cs_nifree = fs->fs_cstotal.cs_nifree; fs->fs_old_cstotal.cs_nffree = fs->fs_cstotal.cs_nffree; fs->fs_maxfilesize = ump->um_savedmaxfilesize; } if (bigcgs) { fs->fs_cgsize = fs->fs_save_cgsize; fs->fs_save_cgsize = 0; } } /* * unmount system call */ static int ffs_unmount(mp, mntflags, td) struct mount *mp; int mntflags; struct thread *td; { struct ufsmount *ump = VFSTOUFS(mp); struct fs *fs; int error, flags, susp; #ifdef UFS_EXTATTR int e_restart; #endif flags = 0; fs = ump->um_fs; if (mntflags & MNT_FORCE) { flags |= FORCECLOSE; susp = fs->fs_ronly != 0; } else susp = 0; #ifdef UFS_EXTATTR if ((error = ufs_extattr_stop(mp, td))) { if (error != EOPNOTSUPP) printf("ffs_unmount: ufs_extattr_stop returned %d\n", error); e_restart = 0; } else { ufs_extattr_uepm_destroy(&ump->um_extattr); e_restart = 1; } #endif if (susp) { /* * dounmount already called vn_start_write(). */ for (;;) { vn_finished_write(mp); if ((error = vfs_write_suspend(mp)) != 0) return (error); MNT_ILOCK(mp); if (mp->mnt_kern_flag & MNTK_SUSPENDED) { mp->mnt_kern_flag &= ~(MNTK_SUSPENDED | MNTK_SUSPEND2); wakeup(&mp->mnt_flag); MNT_IUNLOCK(mp); curthread->td_pflags |= TDP_IGNSUSP; break; } MNT_IUNLOCK(mp); vn_start_write(NULL, &mp, V_WAIT); } } if (mp->mnt_flag & MNT_SOFTDEP) { if ((error = softdep_flushfiles(mp, flags, td)) != 0) goto fail; } else { if ((error = ffs_flushfiles(mp, flags, td)) != 0) goto fail; } UFS_LOCK(ump); if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { printf("%s: unmount pending error: blocks %jd files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } UFS_UNLOCK(ump); if (fs->fs_ronly == 0) { fs->fs_clean = fs->fs_flags & (FS_UNCLEAN|FS_NEEDSFSCK) ? 0 : 1; error = ffs_sbupdate(ump, MNT_WAIT, 0); if (error) { fs->fs_clean = 0; goto fail; } } if (susp) { vfs_write_resume(mp); vn_start_write(NULL, &mp, V_WAIT); } DROP_GIANT(); g_topology_lock(); g_vfs_close(ump->um_cp, td); g_topology_unlock(); PICKUP_GIANT(); vrele(ump->um_devvp); dev_rel(ump->um_dev); mtx_destroy(UFS_MTX(ump)); if (mp->mnt_gjprovider != NULL) { free(mp->mnt_gjprovider, M_UFSMNT); mp->mnt_gjprovider = NULL; } free(fs->fs_csp, M_UFSMNT); free(fs, M_UFSMNT); free(ump, M_UFSMNT); mp->mnt_data = (qaddr_t)0; MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_LOCAL; MNT_IUNLOCK(mp); return (error); fail: if (susp) { vfs_write_resume(mp); vn_start_write(NULL, &mp, V_WAIT); } #ifdef UFS_EXTATTR if (e_restart) { ufs_extattr_uepm_init(&ump->um_extattr); #ifdef UFS_EXTATTR_AUTOSTART (void) ufs_extattr_autostart(mp, td); #endif } #endif return (error); } /* * Flush out all the files in a filesystem. */ int ffs_flushfiles(mp, flags, td) struct mount *mp; int flags; struct thread *td; { struct ufsmount *ump; int error; ump = VFSTOUFS(mp); #ifdef QUOTA if (mp->mnt_flag & MNT_QUOTA) { int i; error = vflush(mp, 0, SKIPSYSTEM|flags, td); if (error) return (error); for (i = 0; i < MAXQUOTAS; i++) { quotaoff(td, mp, i); } /* * Here we fall through to vflush again to ensure * that we have gotten rid of all the system vnodes. */ } #endif ASSERT_VOP_LOCKED(ump->um_devvp, "ffs_flushfiles"); if (ump->um_devvp->v_vflag & VV_COPYONWRITE) { if ((error = vflush(mp, 0, SKIPSYSTEM | flags, td)) != 0) return (error); ffs_snapshot_unmount(mp); flags |= FORCECLOSE; /* * Here we fall through to vflush again to ensure * that we have gotten rid of all the system vnodes. */ } /* * Flush all the files. */ if ((error = vflush(mp, 0, flags, td)) != 0) return (error); /* * Flush filesystem metadata. */ vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY, td); error = VOP_FSYNC(ump->um_devvp, MNT_WAIT, td); VOP_UNLOCK(ump->um_devvp, 0, td); return (error); } /* * Get filesystem statistics. */ static int ffs_statfs(mp, sbp, td) struct mount *mp; struct statfs *sbp; struct thread *td; { struct ufsmount *ump; struct fs *fs; ump = VFSTOUFS(mp); fs = ump->um_fs; if (fs->fs_magic != FS_UFS1_MAGIC && fs->fs_magic != FS_UFS2_MAGIC) panic("ffs_statfs"); sbp->f_version = STATFS_VERSION; sbp->f_bsize = fs->fs_fsize; sbp->f_iosize = fs->fs_bsize; sbp->f_blocks = fs->fs_dsize; UFS_LOCK(ump); sbp->f_bfree = fs->fs_cstotal.cs_nbfree * fs->fs_frag + fs->fs_cstotal.cs_nffree + dbtofsb(fs, fs->fs_pendingblocks); sbp->f_bavail = freespace(fs, fs->fs_minfree) + dbtofsb(fs, fs->fs_pendingblocks); sbp->f_files = fs->fs_ncg * fs->fs_ipg - ROOTINO; sbp->f_ffree = fs->fs_cstotal.cs_nifree + fs->fs_pendinginodes; UFS_UNLOCK(ump); sbp->f_namemax = NAME_MAX; return (0); } /* * Go through the disk queues to initiate sandbagged IO; * go through the inodes to write those that have been modified; * initiate the writing of the super block if it has been modified. * * Note: we are always called with the filesystem marked `MPBUSY'. */ static int ffs_sync(mp, waitfor, td) struct mount *mp; int waitfor; struct thread *td; { struct vnode *mvp, *vp, *devvp; struct inode *ip; struct ufsmount *ump = VFSTOUFS(mp); struct fs *fs; int error, count, wait, lockreq, allerror = 0; int suspend; int suspended; int secondary_writes; int secondary_accwrites; int softdep_deps; int softdep_accdeps; struct bufobj *bo; fs = ump->um_fs; if (fs->fs_fmod != 0 && fs->fs_ronly != 0) { /* XXX */ printf("fs = %s\n", fs->fs_fsmnt); panic("ffs_sync: rofs mod"); } /* * Write back each (modified) inode. */ wait = 0; suspend = 0; suspended = 0; lockreq = LK_EXCLUSIVE | LK_NOWAIT; if (waitfor == MNT_SUSPEND) { suspend = 1; waitfor = MNT_WAIT; } if (waitfor == MNT_WAIT) { wait = 1; lockreq = LK_EXCLUSIVE; } lockreq |= LK_INTERLOCK | LK_SLEEPFAIL; MNT_ILOCK(mp); loop: /* Grab snapshot of secondary write counts */ secondary_writes = mp->mnt_secondary_writes; secondary_accwrites = mp->mnt_secondary_accwrites; /* Grab snapshot of softdep dependency counts */ MNT_IUNLOCK(mp); softdep_get_depcounts(mp, &softdep_deps, &softdep_accdeps); MNT_ILOCK(mp); MNT_VNODE_FOREACH(vp, mp, mvp) { /* * Depend on the mntvnode_slock to keep things stable enough * for a quick test. Since there might be hundreds of * thousands of vnodes, we cannot afford even a subroutine * call unless there's a good chance that we have work to do. */ VI_LOCK(vp); if (vp->v_iflag & VI_DOOMED) { VI_UNLOCK(vp); continue; } ip = VTOI(vp); if (vp->v_type == VNON || ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0 && vp->v_bufobj.bo_dirty.bv_cnt == 0)) { VI_UNLOCK(vp); continue; } MNT_IUNLOCK(mp); if ((error = vget(vp, lockreq, td)) != 0) { MNT_ILOCK(mp); if (error == ENOENT || error == ENOLCK) { MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp); goto loop; } continue; } if ((error = ffs_syncvnode(vp, waitfor)) != 0) allerror = error; vput(vp); MNT_ILOCK(mp); } MNT_IUNLOCK(mp); /* * Force stale filesystem control information to be flushed. */ if (waitfor == MNT_WAIT) { if ((error = softdep_flushworklist(ump->um_mountp, &count, td))) allerror = error; /* Flushed work items may create new vnodes to clean */ if (allerror == 0 && count) { MNT_ILOCK(mp); goto loop; } } #ifdef QUOTA qsync(mp); #endif devvp = ump->um_devvp; VI_LOCK(devvp); bo = &devvp->v_bufobj; if (waitfor != MNT_LAZY && (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)) { vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK, td); if ((error = VOP_FSYNC(devvp, waitfor, td)) != 0) allerror = error; VOP_UNLOCK(devvp, 0, td); if (allerror == 0 && waitfor == MNT_WAIT) { MNT_ILOCK(mp); goto loop; } } else if (suspend != 0) { if (softdep_check_suspend(mp, devvp, softdep_deps, softdep_accdeps, secondary_writes, secondary_accwrites) != 0) goto loop; /* More work needed */ mtx_assert(MNT_MTX(mp), MA_OWNED); mp->mnt_kern_flag |= MNTK_SUSPEND2 | MNTK_SUSPENDED; MNT_IUNLOCK(mp); suspended = 1; } else VI_UNLOCK(devvp); /* * Write back modified superblock. */ if (fs->fs_fmod != 0 && (error = ffs_sbupdate(ump, waitfor, suspended)) != 0) allerror = error; return (allerror); } int ffs_vget(mp, ino, flags, vpp) struct mount *mp; ino_t ino; int flags; struct vnode **vpp; { return (ffs_vgetf(mp, ino, flags, vpp, 0)); } int ffs_vgetf(mp, ino, flags, vpp, ffs_flags) struct mount *mp; ino_t ino; int flags; struct vnode **vpp; int ffs_flags; { struct fs *fs; struct inode *ip; struct ufsmount *ump; struct buf *bp; struct vnode *vp; struct cdev *dev; int error; struct thread *td; error = vfs_hash_get(mp, ino, flags, curthread, vpp, NULL, NULL); if (error || *vpp != NULL) return (error); /* * We must promote to an exclusive lock for vnode creation. This * can happen if lookup is passed LOCKSHARED. */ if ((flags & LK_TYPE_MASK) == LK_SHARED) { flags &= ~LK_TYPE_MASK; flags |= LK_EXCLUSIVE; } /* * We do not lock vnode creation as it is believed to be too * expensive for such rare case as simultaneous creation of vnode * for same ino by different processes. We just allow them to race * and check later to decide who wins. Let the race begin! */ ump = VFSTOUFS(mp); dev = ump->um_dev; fs = ump->um_fs; /* * If this MALLOC() is performed after the getnewvnode() * it might block, leaving a vnode with a NULL v_data to be * found by ffs_sync() if a sync happens to fire right then, * which will cause a panic because ffs_sync() blindly * dereferences vp->v_data (as well it should). */ ip = uma_zalloc(uma_inode, M_WAITOK | M_ZERO); /* Allocate a new vnode/inode. */ if (fs->fs_magic == FS_UFS1_MAGIC) error = getnewvnode("ufs", mp, &ffs_vnodeops1, &vp); else error = getnewvnode("ufs", mp, &ffs_vnodeops2, &vp); if (error) { *vpp = NULL; uma_zfree(uma_inode, ip); return (error); } /* * FFS supports recursive and shared locking. */ vp->v_vnlock->lk_flags |= LK_CANRECURSE; vp->v_vnlock->lk_flags &= ~LK_NOSHARE; vp->v_data = ip; vp->v_bufobj.bo_bsize = fs->fs_bsize; ip->i_vnode = vp; ip->i_ump = ump; ip->i_fs = fs; ip->i_dev = dev; ip->i_number = ino; ip->i_ea_refs = 0; #ifdef QUOTA { int i; for (i = 0; i < MAXQUOTAS; i++) ip->i_dquot[i] = NODQUOT; } #endif td = curthread; lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL, td); if (ffs_flags & FFSV_FORCEINSMQ) vp->v_vflag |= VV_FORCEINSMQ; error = insmntque(vp, mp); if (error != 0) { *vpp = NULL; return (error); } vp->v_vflag &= ~VV_FORCEINSMQ; error = vfs_hash_insert(vp, ino, flags, curthread, vpp, NULL, NULL); if (error || *vpp != NULL) return (error); /* Read in the disk contents for the inode, copy into the inode. */ error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ino)), (int)fs->fs_bsize, NOCRED, &bp); if (error) { /* * The inode does not contain anything useful, so it would * be misleading to leave it on its hash chain. With mode * still zero, it will be unlinked and returned to the free * list by vput(). */ brelse(bp); vput(vp); *vpp = NULL; return (error); } if (ip->i_ump->um_fstype == UFS1) ip->i_din1 = uma_zalloc(uma_ufs1, M_WAITOK); else ip->i_din2 = uma_zalloc(uma_ufs2, M_WAITOK); ffs_load_inode(bp, ip, fs, ino); if (DOINGSOFTDEP(vp)) softdep_load_inodeblock(ip); else ip->i_effnlink = ip->i_nlink; bqrelse(bp); /* * Initialize the vnode from the inode, check for aliases. * Note that the underlying vnode may have changed. */ if (ip->i_ump->um_fstype == UFS1) error = ufs_vinit(mp, &ffs_fifoops1, &vp); else error = ufs_vinit(mp, &ffs_fifoops2, &vp); if (error) { vput(vp); *vpp = NULL; return (error); } /* * Finish inode initialization. */ /* * Set up a generation number for this inode if it does not * already have one. This should only happen on old filesystems. */ if (ip->i_gen == 0) { ip->i_gen = arc4random() / 2 + 1; if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { ip->i_flag |= IN_MODIFIED; DIP_SET(ip, i_gen, ip->i_gen); } } /* * Ensure that uid and gid are correct. This is a temporary * fix until fsck has been changed to do the update. */ if (fs->fs_magic == FS_UFS1_MAGIC && /* XXX */ fs->fs_old_inodefmt < FS_44INODEFMT) { /* XXX */ ip->i_uid = ip->i_din1->di_ouid; /* XXX */ ip->i_gid = ip->i_din1->di_ogid; /* XXX */ } /* XXX */ #ifdef MAC if ((mp->mnt_flag & MNT_MULTILABEL) && ip->i_mode) { /* * If this vnode is already allocated, and we're running * multi-label, attempt to perform a label association * from the extended attributes on the inode. */ error = mac_associate_vnode_extattr(mp, vp); if (error) { /* ufs_inactive will release ip->i_devvp ref. */ vput(vp); *vpp = NULL; return (error); } } #endif *vpp = vp; return (0); } /* * File handle to vnode * * Have to be really careful about stale file handles: * - check that the inode number is valid * - call ffs_vget() to get the locked inode * - check for an unallocated inode (i_mode == 0) * - check that the given client host has export rights and return * those rights via. exflagsp and credanonp */ static int ffs_fhtovp(mp, fhp, vpp) struct mount *mp; struct fid *fhp; struct vnode **vpp; { struct ufid *ufhp; struct fs *fs; ufhp = (struct ufid *)fhp; fs = VFSTOUFS(mp)->um_fs; if (ufhp->ufid_ino < ROOTINO || ufhp->ufid_ino >= fs->fs_ncg * fs->fs_ipg) return (ESTALE); return (ufs_fhtovp(mp, ufhp, vpp)); } /* * Initialize the filesystem. */ static int ffs_init(vfsp) struct vfsconf *vfsp; { softdep_initialize(); return (ufs_init(vfsp)); } /* * Undo the work of ffs_init(). */ static int ffs_uninit(vfsp) struct vfsconf *vfsp; { int ret; ret = ufs_uninit(vfsp); softdep_uninitialize(); return (ret); } /* * Write a superblock and associated information back to disk. */ int ffs_sbupdate(mp, waitfor, suspended) struct ufsmount *mp; int waitfor; int suspended; { struct fs *fs = mp->um_fs; struct buf *sbbp; struct buf *bp; int blks; void *space; int i, size, error, allerror = 0; if (fs->fs_ronly == 1 && (mp->um_mountp->mnt_flag & (MNT_RDONLY | MNT_UPDATE)) != (MNT_RDONLY | MNT_UPDATE)) panic("ffs_sbupdate: write read-only filesystem"); /* * We use the superblock's buf to serialize calls to ffs_sbupdate(). */ sbbp = getblk(mp->um_devvp, btodb(fs->fs_sblockloc), (int)fs->fs_sbsize, 0, 0, 0); /* * First write back the summary information. */ blks = howmany(fs->fs_cssize, fs->fs_fsize); space = fs->fs_csp; for (i = 0; i < blks; i += fs->fs_frag) { size = fs->fs_bsize; if (i + fs->fs_frag > blks) size = (blks - i) * fs->fs_fsize; bp = getblk(mp->um_devvp, fsbtodb(fs, fs->fs_csaddr + i), size, 0, 0, 0); bcopy(space, bp->b_data, (u_int)size); space = (char *)space + size; if (suspended) bp->b_flags |= B_VALIDSUSPWRT; if (waitfor != MNT_WAIT) bawrite(bp); else if ((error = bwrite(bp)) != 0) allerror = error; } /* * Now write back the superblock itself. If any errors occurred * up to this point, then fail so that the superblock avoids * being written out as clean. */ if (allerror) { brelse(sbbp); return (allerror); } bp = sbbp; if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_sblockloc != SBLOCK_UFS1 && (fs->fs_flags & FS_FLAGS_UPDATED) == 0) { printf("%s: correcting fs_sblockloc from %jd to %d\n", fs->fs_fsmnt, fs->fs_sblockloc, SBLOCK_UFS1); fs->fs_sblockloc = SBLOCK_UFS1; } if (fs->fs_magic == FS_UFS2_MAGIC && fs->fs_sblockloc != SBLOCK_UFS2 && (fs->fs_flags & FS_FLAGS_UPDATED) == 0) { printf("%s: correcting fs_sblockloc from %jd to %d\n", fs->fs_fsmnt, fs->fs_sblockloc, SBLOCK_UFS2); fs->fs_sblockloc = SBLOCK_UFS2; } fs->fs_fmod = 0; fs->fs_time = time_second; bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize); ffs_oldfscompat_write((struct fs *)bp->b_data, mp); if (suspended) bp->b_flags |= B_VALIDSUSPWRT; if (waitfor != MNT_WAIT) bawrite(bp); else if ((error = bwrite(bp)) != 0) allerror = error; return (allerror); } static int ffs_extattrctl(struct mount *mp, int cmd, struct vnode *filename_vp, int attrnamespace, const char *attrname, struct thread *td) { #ifdef UFS_EXTATTR return (ufs_extattrctl(mp, cmd, filename_vp, attrnamespace, attrname, td)); #else return (vfs_stdextattrctl(mp, cmd, filename_vp, attrnamespace, attrname, td)); #endif } static void ffs_ifree(struct ufsmount *ump, struct inode *ip) { if (ump->um_fstype == UFS1 && ip->i_din1 != NULL) uma_zfree(uma_ufs1, ip->i_din1); else if (ip->i_din2 != NULL) uma_zfree(uma_ufs2, ip->i_din2); uma_zfree(uma_inode, ip); } static int dobkgrdwrite = 1; SYSCTL_INT(_debug, OID_AUTO, dobkgrdwrite, CTLFLAG_RW, &dobkgrdwrite, 0, "Do background writes (honoring the BV_BKGRDWRITE flag)?"); /* * Complete a background write started from bwrite. */ static void ffs_backgroundwritedone(struct buf *bp) { struct bufobj *bufobj; struct buf *origbp; /* * Find the original buffer that we are writing. */ bufobj = bp->b_bufobj; BO_LOCK(bufobj); if ((origbp = gbincore(bp->b_bufobj, bp->b_lblkno)) == NULL) panic("backgroundwritedone: lost buffer"); /* Grab an extra reference to be dropped by the bufdone() below. */ bufobj_wrefl(bufobj); BO_UNLOCK(bufobj); /* * Process dependencies then return any unfinished ones. */ if (!LIST_EMPTY(&bp->b_dep)) buf_complete(bp); #ifdef SOFTUPDATES if (!LIST_EMPTY(&bp->b_dep)) softdep_move_dependencies(bp, origbp); #endif /* * This buffer is marked B_NOCACHE so when it is released * by biodone it will be tossed. */ bp->b_flags |= B_NOCACHE; bp->b_flags &= ~B_CACHE; bufdone(bp); BO_LOCK(bufobj); /* * Clear the BV_BKGRDINPROG flag in the original buffer * and awaken it if it is waiting for the write to complete. * If BV_BKGRDINPROG is not set in the original buffer it must * have been released and re-instantiated - which is not legal. */ KASSERT((origbp->b_vflags & BV_BKGRDINPROG), ("backgroundwritedone: lost buffer2")); origbp->b_vflags &= ~BV_BKGRDINPROG; if (origbp->b_vflags & BV_BKGRDWAIT) { origbp->b_vflags &= ~BV_BKGRDWAIT; wakeup(&origbp->b_xflags); } BO_UNLOCK(bufobj); } /* * Write, release buffer on completion. (Done by iodone * if async). Do not bother writing anything if the buffer * is invalid. * * Note that we set B_CACHE here, indicating that buffer is * fully valid and thus cacheable. This is true even of NFS * now so we set it generally. This could be set either here * or in biodone() since the I/O is synchronous. We put it * here. */ static int ffs_bufwrite(struct buf *bp) { int oldflags, s; struct buf *newbp; CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); if (bp->b_flags & B_INVAL) { brelse(bp); return (0); } oldflags = bp->b_flags; if (BUF_REFCNT(bp) == 0) panic("bufwrite: buffer is not busy???"); s = splbio(); /* * If a background write is already in progress, delay * writing this block if it is asynchronous. Otherwise * wait for the background write to complete. */ BO_LOCK(bp->b_bufobj); if (bp->b_vflags & BV_BKGRDINPROG) { if (bp->b_flags & B_ASYNC) { BO_UNLOCK(bp->b_bufobj); splx(s); bdwrite(bp); return (0); } bp->b_vflags |= BV_BKGRDWAIT; msleep(&bp->b_xflags, BO_MTX(bp->b_bufobj), PRIBIO, "bwrbg", 0); if (bp->b_vflags & BV_BKGRDINPROG) panic("bufwrite: still writing"); } BO_UNLOCK(bp->b_bufobj); /* Mark the buffer clean */ bundirty(bp); /* * If this buffer is marked for background writing and we * do not have to wait for it, make a copy and write the * copy so as to leave this buffer ready for further use. * * This optimization eats a lot of memory. If we have a page * or buffer shortfall we can't do it. */ if (dobkgrdwrite && (bp->b_xflags & BX_BKGRDWRITE) && (bp->b_flags & B_ASYNC) && !vm_page_count_severe() && !buf_dirty_count_severe()) { KASSERT(bp->b_iodone == NULL, ("bufwrite: needs chained iodone (%p)", bp->b_iodone)); /* get a new block */ - newbp = geteblk(bp->b_bufsize); + newbp = geteblk(bp->b_bufsize, GB_NOWAIT_BD); + if (newbp == NULL) + goto normal_write; /* * set it to be identical to the old block. We have to * set b_lblkno and BKGRDMARKER before calling bgetvp() * to avoid confusing the splay tree and gbincore(). */ memcpy(newbp->b_data, bp->b_data, bp->b_bufsize); newbp->b_lblkno = bp->b_lblkno; newbp->b_xflags |= BX_BKGRDMARKER; BO_LOCK(bp->b_bufobj); bp->b_vflags |= BV_BKGRDINPROG; bgetvp(bp->b_vp, newbp); BO_UNLOCK(bp->b_bufobj); newbp->b_bufobj = &bp->b_vp->v_bufobj; newbp->b_blkno = bp->b_blkno; newbp->b_offset = bp->b_offset; newbp->b_iodone = ffs_backgroundwritedone; newbp->b_flags |= B_ASYNC; newbp->b_flags &= ~B_INVAL; #ifdef SOFTUPDATES /* move over the dependencies */ if (!LIST_EMPTY(&bp->b_dep)) softdep_move_dependencies(bp, newbp); #endif /* * Initiate write on the copy, release the original to * the B_LOCKED queue so that it cannot go away until * the background write completes. If not locked it could go * away and then be reconstituted while it was being written. * If the reconstituted buffer were written, we could end up * with two background copies being written at the same time. */ bqrelse(bp); bp = newbp; } /* Let the normal bufwrite do the rest for us */ +normal_write: return (bufwrite(bp)); } static void ffs_geom_strategy(struct bufobj *bo, struct buf *bp) { struct vnode *vp; int error; struct buf *tbp; vp = bo->__bo_vnode; if (bp->b_iocmd == BIO_WRITE) { if ((bp->b_flags & B_VALIDSUSPWRT) == 0 && bp->b_vp != NULL && bp->b_vp->v_mount != NULL && (bp->b_vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED) != 0) panic("ffs_geom_strategy: bad I/O"); bp->b_flags &= ~B_VALIDSUSPWRT; if ((vp->v_vflag & VV_COPYONWRITE) && vp->v_rdev->si_snapdata != NULL) { if ((bp->b_flags & B_CLUSTER) != 0) { runningbufwakeup(bp); TAILQ_FOREACH(tbp, &bp->b_cluster.cluster_head, b_cluster.cluster_entry) { error = ffs_copyonwrite(vp, tbp); if (error != 0 && error != EOPNOTSUPP) { bp->b_error = error; bp->b_ioflags |= BIO_ERROR; bufdone(bp); return; } } bp->b_runningbufspace = bp->b_bufsize; atomic_add_int(&runningbufspace, bp->b_runningbufspace); } else { error = ffs_copyonwrite(vp, bp); if (error != 0 && error != EOPNOTSUPP) { bp->b_error = error; bp->b_ioflags |= BIO_ERROR; bufdone(bp); return; } } } #ifdef SOFTUPDATES if ((bp->b_flags & B_CLUSTER) != 0) { TAILQ_FOREACH(tbp, &bp->b_cluster.cluster_head, b_cluster.cluster_entry) { if (!LIST_EMPTY(&tbp->b_dep)) buf_start(tbp); } } else { if (!LIST_EMPTY(&bp->b_dep)) buf_start(bp); } #endif } g_vfs_strategy(bo, bp); } #ifdef DDB static void db_print_ffs(struct ufsmount *ump) { db_printf("mp %p %s devvp %p fs %p su_wl %d su_wl_in %d su_deps %d " "su_req %d\n", ump->um_mountp, ump->um_mountp->mnt_stat.f_mntonname, ump->um_devvp, ump->um_fs, ump->softdep_on_worklist, ump->softdep_on_worklist_inprogress, ump->softdep_deps, ump->softdep_req); } DB_SHOW_COMMAND(ffs, db_show_ffs) { struct mount *mp; struct ufsmount *ump; if (have_addr) { ump = VFSTOUFS((struct mount *)addr); db_print_ffs(ump); return; } TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (!strcmp(mp->mnt_stat.f_fstypename, ufs_vfsconf.vfc_name)) db_print_ffs(VFSTOUFS(mp)); } } #endif /* DDB */ Index: stable/7/sys =================================================================== --- stable/7/sys (revision 191812) +++ stable/7/sys (revision 191813) Property changes on: stable/7/sys ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/sys:r189878