diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c index c15e2c99fead..c7c890dd83bd 100644 --- a/sys/kern/vfs_bio.c +++ b/sys/kern/vfs_bio.c @@ -1,5634 +1,5643 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2004 Poul-Henning Kamp * Copyright (c) 1994,1997 John S. Dyson * Copyright (c) 2013 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by Konstantin Belousov * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * this file contains a new buffer I/O scheme implementing a coherent * VM object and buffer cache scheme. Pains have been taken to make * sure that the performance degradation associated with schemes such * as this is not realized. * * Author: John S. Dyson * Significant help during the development and debugging phases * had been provided by David Greenman, also of the FreeBSD core team. * * see man buf(9) for more info. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_BIOBUF, "biobuf", "BIO buffer"); struct bio_ops bioops; /* I/O operation notification */ struct buf_ops buf_ops_bio = { .bop_name = "buf_ops_bio", .bop_write = bufwrite, .bop_strategy = bufstrategy, .bop_sync = bufsync, .bop_bdflush = bufbdflush, }; struct bufqueue { struct mtx_padalign bq_lock; TAILQ_HEAD(, buf) bq_queue; uint8_t bq_index; uint16_t bq_subqueue; int bq_len; } __aligned(CACHE_LINE_SIZE); #define BQ_LOCKPTR(bq) (&(bq)->bq_lock) #define BQ_LOCK(bq) mtx_lock(BQ_LOCKPTR((bq))) #define BQ_UNLOCK(bq) mtx_unlock(BQ_LOCKPTR((bq))) #define BQ_ASSERT_LOCKED(bq) mtx_assert(BQ_LOCKPTR((bq)), MA_OWNED) struct bufdomain { struct bufqueue *bd_subq; struct bufqueue bd_dirtyq; struct bufqueue *bd_cleanq; struct mtx_padalign bd_run_lock; /* Constants */ long bd_maxbufspace; long bd_hibufspace; long bd_lobufspace; long bd_bufspacethresh; int bd_hifreebuffers; int bd_lofreebuffers; int bd_hidirtybuffers; int bd_lodirtybuffers; int bd_dirtybufthresh; int bd_lim; /* atomics */ int bd_wanted; bool bd_shutdown; int __aligned(CACHE_LINE_SIZE) bd_numdirtybuffers; int __aligned(CACHE_LINE_SIZE) bd_running; long __aligned(CACHE_LINE_SIZE) bd_bufspace; int __aligned(CACHE_LINE_SIZE) bd_freebuffers; } __aligned(CACHE_LINE_SIZE); #define BD_LOCKPTR(bd) (&(bd)->bd_cleanq->bq_lock) #define BD_LOCK(bd) mtx_lock(BD_LOCKPTR((bd))) #define BD_UNLOCK(bd) mtx_unlock(BD_LOCKPTR((bd))) #define BD_ASSERT_LOCKED(bd) mtx_assert(BD_LOCKPTR((bd)), MA_OWNED) #define BD_RUN_LOCKPTR(bd) (&(bd)->bd_run_lock) #define BD_RUN_LOCK(bd) mtx_lock(BD_RUN_LOCKPTR((bd))) #define BD_RUN_UNLOCK(bd) mtx_unlock(BD_RUN_LOCKPTR((bd))) #define BD_DOMAIN(bd) (bd - bdomain) static char *buf; /* buffer header pool */ static struct buf * nbufp(unsigned i) { return ((struct buf *)(buf + (sizeof(struct buf) + sizeof(vm_page_t) * atop(maxbcachebuf)) * i)); } caddr_t __read_mostly unmapped_buf; /* Used below and for softdep flushing threads in ufs/ffs/ffs_softdep.c */ struct proc *bufdaemonproc; static void vm_hold_free_pages(struct buf *bp, int newbsize); static void vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to); static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m); static void vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off, vm_page_t m); static void vfs_clean_pages_dirty_buf(struct buf *bp); static void vfs_setdirty_range(struct buf *bp); static void vfs_vmio_invalidate(struct buf *bp); static void vfs_vmio_truncate(struct buf *bp, int npages); static void vfs_vmio_extend(struct buf *bp, int npages, int size); static int vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno); static void breada(struct vnode *, daddr_t *, int *, int, struct ucred *, int, void (*)(struct buf *)); static int buf_flush(struct vnode *vp, struct bufdomain *, int); static int flushbufqueues(struct vnode *, struct bufdomain *, int, int); static void buf_daemon(void); static __inline void bd_wakeup(void); static int sysctl_runningspace(SYSCTL_HANDLER_ARGS); static void bufkva_reclaim(vmem_t *, int); static void bufkva_free(struct buf *); static int buf_import(void *, void **, int, int, int); static void buf_release(void *, void **, int); static void maxbcachebuf_adjust(void); static inline struct bufdomain *bufdomain(struct buf *); static void bq_remove(struct bufqueue *bq, struct buf *bp); static void bq_insert(struct bufqueue *bq, struct buf *bp, bool unlock); static int buf_recycle(struct bufdomain *, bool kva); static void bq_init(struct bufqueue *bq, int qindex, int cpu, const char *lockname); static void bd_init(struct bufdomain *bd); static int bd_flushall(struct bufdomain *bd); static int sysctl_bufdomain_long(SYSCTL_HANDLER_ARGS); static int sysctl_bufdomain_int(SYSCTL_HANDLER_ARGS); static int sysctl_bufspace(SYSCTL_HANDLER_ARGS); int vmiodirenable = TRUE; SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0, "Use the VM system for directory writes"); -long runningbufspace; +static long runningbufspace; SYSCTL_LONG(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0, "Amount of presently outstanding async buffer io"); SYSCTL_PROC(_vfs, OID_AUTO, bufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RD, NULL, 0, sysctl_bufspace, "L", "Physical memory used for buffers"); static counter_u64_t bufkvaspace; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, bufkvaspace, CTLFLAG_RD, &bufkvaspace, "Kernel virtual memory used for buffers"); static long maxbufspace; SYSCTL_PROC(_vfs, OID_AUTO, maxbufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &maxbufspace, __offsetof(struct bufdomain, bd_maxbufspace), sysctl_bufdomain_long, "L", "Maximum allowed value of bufspace (including metadata)"); static long bufmallocspace; SYSCTL_LONG(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0, "Amount of malloced memory for buffers"); static long maxbufmallocspace; SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, 0, "Maximum amount of malloced memory for buffers"); static long lobufspace; SYSCTL_PROC(_vfs, OID_AUTO, lobufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &lobufspace, __offsetof(struct bufdomain, bd_lobufspace), sysctl_bufdomain_long, "L", "Minimum amount of buffers we want to have"); long hibufspace; SYSCTL_PROC(_vfs, OID_AUTO, hibufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &hibufspace, __offsetof(struct bufdomain, bd_hibufspace), sysctl_bufdomain_long, "L", "Maximum allowed value of bufspace (excluding metadata)"); long bufspacethresh; SYSCTL_PROC(_vfs, OID_AUTO, bufspacethresh, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &bufspacethresh, __offsetof(struct bufdomain, bd_bufspacethresh), sysctl_bufdomain_long, "L", "Bufspace consumed before waking the daemon to free some"); static counter_u64_t buffreekvacnt; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, "Number of times we have freed the KVA space from some buffer"); static counter_u64_t bufdefragcnt; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, "Number of times we have had to repeat buffer allocation to defragment"); static long lorunningspace; SYSCTL_PROC(_vfs, OID_AUTO, lorunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE | CTLFLAG_RW, &lorunningspace, 0, sysctl_runningspace, "L", "Minimum preferred space used for in-progress I/O"); static long hirunningspace; SYSCTL_PROC(_vfs, OID_AUTO, hirunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE | CTLFLAG_RW, &hirunningspace, 0, sysctl_runningspace, "L", "Maximum amount of space to use for in-progress I/O"); int dirtybufferflushes; SYSCTL_INT(_vfs, OID_AUTO, dirtybufferflushes, CTLFLAG_RW, &dirtybufferflushes, 0, "Number of bdwrite to bawrite conversions to limit dirty buffers"); int bdwriteskip; SYSCTL_INT(_vfs, OID_AUTO, bdwriteskip, CTLFLAG_RW, &bdwriteskip, 0, "Number of buffers supplied to bdwrite with snapshot deadlock risk"); int altbufferflushes; SYSCTL_INT(_vfs, OID_AUTO, altbufferflushes, CTLFLAG_RW | CTLFLAG_STATS, &altbufferflushes, 0, "Number of fsync flushes to limit dirty buffers"); static int recursiveflushes; SYSCTL_INT(_vfs, OID_AUTO, recursiveflushes, CTLFLAG_RW | CTLFLAG_STATS, &recursiveflushes, 0, "Number of flushes skipped due to being recursive"); static int sysctl_numdirtybuffers(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_vfs, OID_AUTO, numdirtybuffers, CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RD, NULL, 0, sysctl_numdirtybuffers, "I", "Number of buffers that are dirty (has unwritten changes) at the moment"); static int lodirtybuffers; SYSCTL_PROC(_vfs, OID_AUTO, lodirtybuffers, CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &lodirtybuffers, __offsetof(struct bufdomain, bd_lodirtybuffers), sysctl_bufdomain_int, "I", "How many buffers we want to have free before bufdaemon can sleep"); static int hidirtybuffers; SYSCTL_PROC(_vfs, OID_AUTO, hidirtybuffers, CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &hidirtybuffers, __offsetof(struct bufdomain, bd_hidirtybuffers), sysctl_bufdomain_int, "I", "When the number of dirty buffers is considered severe"); int dirtybufthresh; SYSCTL_PROC(_vfs, OID_AUTO, dirtybufthresh, CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &dirtybufthresh, __offsetof(struct bufdomain, bd_dirtybufthresh), sysctl_bufdomain_int, "I", "Number of bdwrite to bawrite conversions to clear dirty buffers"); static int numfreebuffers; SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0, "Number of free buffers"); static int lofreebuffers; SYSCTL_PROC(_vfs, OID_AUTO, lofreebuffers, CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &lofreebuffers, __offsetof(struct bufdomain, bd_lofreebuffers), sysctl_bufdomain_int, "I", "Target number of free buffers"); static int hifreebuffers; SYSCTL_PROC(_vfs, OID_AUTO, hifreebuffers, CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &hifreebuffers, __offsetof(struct bufdomain, bd_hifreebuffers), sysctl_bufdomain_int, "I", "Threshold for clean buffer recycling"); static counter_u64_t getnewbufcalls; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RD, &getnewbufcalls, "Number of calls to getnewbuf"); static counter_u64_t getnewbufrestarts; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RD, &getnewbufrestarts, "Number of times getnewbuf has had to restart a buffer acquisition"); static counter_u64_t mappingrestarts; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RD, &mappingrestarts, "Number of times getblk has had to restart a buffer mapping for " "unmapped buffer"); static counter_u64_t numbufallocfails; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, numbufallocfails, CTLFLAG_RW, &numbufallocfails, "Number of times buffer allocations failed"); static int flushbufqtarget = 100; SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0, "Amount of work to do in flushbufqueues when helping bufdaemon"); static counter_u64_t notbufdflushes; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, ¬bufdflushes, "Number of dirty buffer flushes done by the bufdaemon helpers"); static long barrierwrites; SYSCTL_LONG(_vfs, OID_AUTO, barrierwrites, CTLFLAG_RW | CTLFLAG_STATS, &barrierwrites, 0, "Number of barrier writes"); SYSCTL_INT(_vfs, OID_AUTO, unmapped_buf_allowed, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &unmapped_buf_allowed, 0, "Permit the use of the unmapped i/o"); int maxbcachebuf = MAXBCACHEBUF; SYSCTL_INT(_vfs, OID_AUTO, maxbcachebuf, CTLFLAG_RDTUN, &maxbcachebuf, 0, "Maximum size of a buffer cache block"); /* * This lock synchronizes access to bd_request. */ static struct mtx_padalign __exclusive_cache_line bdlock; /* * This lock protects the runningbufreq and synchronizes runningbufwakeup and * waitrunningbufspace(). */ static struct mtx_padalign __exclusive_cache_line rbreqlock; /* * Lock that protects bdirtywait. */ static struct mtx_padalign __exclusive_cache_line bdirtylock; /* * bufdaemon shutdown request and sleep channel. */ static bool bd_shutdown; /* * Wakeup point for bufdaemon, as well as indicator of whether it is already * active. Set to 1 when the bufdaemon is already "on" the queue, 0 when it * is idling. */ static int bd_request; /* * Request for the buf daemon to write more buffers than is indicated by * lodirtybuf. This may be necessary to push out excess dependencies or * defragment the address space where a simple count of the number of dirty * buffers is insufficient to characterize the demand for flushing them. */ static int bd_speedupreq; /* * Synchronization (sleep/wakeup) variable for active buffer space requests. * Set when wait starts, cleared prior to wakeup(). * Used in runningbufwakeup() and waitrunningbufspace(). */ static int runningbufreq; /* * Synchronization for bwillwrite() waiters. */ static int bdirtywait; /* * Definitions for the buffer free lists. */ #define QUEUE_NONE 0 /* on no queue */ #define QUEUE_EMPTY 1 /* empty buffer headers */ #define QUEUE_DIRTY 2 /* B_DELWRI buffers */ #define QUEUE_CLEAN 3 /* non-B_DELWRI buffers */ #define QUEUE_SENTINEL 4 /* not an queue index, but mark for sentinel */ /* Maximum number of buffer domains. */ #define BUF_DOMAINS 8 struct bufdomainset bdlodirty; /* Domains > lodirty */ struct bufdomainset bdhidirty; /* Domains > hidirty */ /* Configured number of clean queues. */ static int __read_mostly buf_domains; BITSET_DEFINE(bufdomainset, BUF_DOMAINS); struct bufdomain __exclusive_cache_line bdomain[BUF_DOMAINS]; struct bufqueue __exclusive_cache_line bqempty; /* * per-cpu empty buffer cache. */ uma_zone_t buf_zone; static int sysctl_runningspace(SYSCTL_HANDLER_ARGS) { long value; int error; value = *(long *)arg1; error = sysctl_handle_long(oidp, &value, 0, req); if (error != 0 || req->newptr == NULL) return (error); mtx_lock(&rbreqlock); if (arg1 == &hirunningspace) { if (value < lorunningspace) error = EINVAL; else hirunningspace = value; } else { KASSERT(arg1 == &lorunningspace, ("%s: unknown arg1", __func__)); if (value > hirunningspace) error = EINVAL; else lorunningspace = value; } mtx_unlock(&rbreqlock); return (error); } static int sysctl_bufdomain_int(SYSCTL_HANDLER_ARGS) { int error; int value; int i; value = *(int *)arg1; error = sysctl_handle_int(oidp, &value, 0, req); if (error != 0 || req->newptr == NULL) return (error); *(int *)arg1 = value; for (i = 0; i < buf_domains; i++) *(int *)(uintptr_t)(((uintptr_t)&bdomain[i]) + arg2) = value / buf_domains; return (error); } static int sysctl_bufdomain_long(SYSCTL_HANDLER_ARGS) { long value; int error; int i; value = *(long *)arg1; error = sysctl_handle_long(oidp, &value, 0, req); if (error != 0 || req->newptr == NULL) return (error); *(long *)arg1 = value; for (i = 0; i < buf_domains; i++) *(long *)(uintptr_t)(((uintptr_t)&bdomain[i]) + arg2) = value / buf_domains; return (error); } #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) static int sysctl_bufspace(SYSCTL_HANDLER_ARGS) { long lvalue; int ivalue; int i; lvalue = 0; for (i = 0; i < buf_domains; i++) lvalue += bdomain[i].bd_bufspace; if (sizeof(int) == sizeof(long) || req->oldlen >= sizeof(long)) return (sysctl_handle_long(oidp, &lvalue, 0, req)); if (lvalue > INT_MAX) /* On overflow, still write out a long to trigger ENOMEM. */ return (sysctl_handle_long(oidp, &lvalue, 0, req)); ivalue = lvalue; return (sysctl_handle_int(oidp, &ivalue, 0, req)); } #else static int sysctl_bufspace(SYSCTL_HANDLER_ARGS) { long lvalue; int i; lvalue = 0; for (i = 0; i < buf_domains; i++) lvalue += bdomain[i].bd_bufspace; return (sysctl_handle_long(oidp, &lvalue, 0, req)); } #endif static int sysctl_numdirtybuffers(SYSCTL_HANDLER_ARGS) { int value; int i; value = 0; for (i = 0; i < buf_domains; i++) value += bdomain[i].bd_numdirtybuffers; return (sysctl_handle_int(oidp, &value, 0, req)); } /* * bdirtywakeup: * * Wakeup any bwillwrite() waiters. */ static void bdirtywakeup(void) { mtx_lock(&bdirtylock); if (bdirtywait) { bdirtywait = 0; wakeup(&bdirtywait); } mtx_unlock(&bdirtylock); } /* * bd_clear: * * Clear a domain from the appropriate bitsets when dirtybuffers * is decremented. */ static void bd_clear(struct bufdomain *bd) { mtx_lock(&bdirtylock); if (bd->bd_numdirtybuffers <= bd->bd_lodirtybuffers) BIT_CLR(BUF_DOMAINS, BD_DOMAIN(bd), &bdlodirty); if (bd->bd_numdirtybuffers <= bd->bd_hidirtybuffers) BIT_CLR(BUF_DOMAINS, BD_DOMAIN(bd), &bdhidirty); mtx_unlock(&bdirtylock); } /* * bd_set: * * Set a domain in the appropriate bitsets when dirtybuffers * is incremented. */ static void bd_set(struct bufdomain *bd) { mtx_lock(&bdirtylock); if (bd->bd_numdirtybuffers > bd->bd_lodirtybuffers) BIT_SET(BUF_DOMAINS, BD_DOMAIN(bd), &bdlodirty); if (bd->bd_numdirtybuffers > bd->bd_hidirtybuffers) BIT_SET(BUF_DOMAINS, BD_DOMAIN(bd), &bdhidirty); mtx_unlock(&bdirtylock); } /* * bdirtysub: * * Decrement the numdirtybuffers count by one and wakeup any * threads blocked in bwillwrite(). */ static void bdirtysub(struct buf *bp) { struct bufdomain *bd; int num; bd = bufdomain(bp); num = atomic_fetchadd_int(&bd->bd_numdirtybuffers, -1); if (num == (bd->bd_lodirtybuffers + bd->bd_hidirtybuffers) / 2) bdirtywakeup(); if (num == bd->bd_lodirtybuffers || num == bd->bd_hidirtybuffers) bd_clear(bd); } /* * bdirtyadd: * * Increment the numdirtybuffers count by one and wakeup the buf * daemon if needed. */ static void bdirtyadd(struct buf *bp) { struct bufdomain *bd; int num; /* * Only do the wakeup once as we cross the boundary. The * buf daemon will keep running until the condition clears. */ bd = bufdomain(bp); num = atomic_fetchadd_int(&bd->bd_numdirtybuffers, 1); if (num == (bd->bd_lodirtybuffers + bd->bd_hidirtybuffers) / 2) bd_wakeup(); if (num == bd->bd_lodirtybuffers || num == bd->bd_hidirtybuffers) bd_set(bd); } /* * bufspace_daemon_wakeup: * * Wakeup the daemons responsible for freeing clean bufs. */ static void bufspace_daemon_wakeup(struct bufdomain *bd) { /* * avoid the lock if the daemon is running. */ if (atomic_fetchadd_int(&bd->bd_running, 1) == 0) { BD_RUN_LOCK(bd); atomic_store_int(&bd->bd_running, 1); wakeup(&bd->bd_running); BD_RUN_UNLOCK(bd); } } /* * bufspace_adjust: * * Adjust the reported bufspace for a KVA managed buffer, possibly * waking any waiters. */ static void bufspace_adjust(struct buf *bp, int bufsize) { struct bufdomain *bd; long space; int diff; KASSERT((bp->b_flags & B_MALLOC) == 0, ("bufspace_adjust: malloc buf %p", bp)); bd = bufdomain(bp); diff = bufsize - bp->b_bufsize; if (diff < 0) { atomic_subtract_long(&bd->bd_bufspace, -diff); } else if (diff > 0) { space = atomic_fetchadd_long(&bd->bd_bufspace, diff); /* Wake up the daemon on the transition. */ if (space < bd->bd_bufspacethresh && space + diff >= bd->bd_bufspacethresh) bufspace_daemon_wakeup(bd); } bp->b_bufsize = bufsize; } /* * bufspace_reserve: * * Reserve bufspace before calling allocbuf(). metadata has a * different space limit than data. */ static int bufspace_reserve(struct bufdomain *bd, int size, bool metadata) { long limit, new; long space; if (metadata) limit = bd->bd_maxbufspace; else limit = bd->bd_hibufspace; space = atomic_fetchadd_long(&bd->bd_bufspace, size); new = space + size; if (new > limit) { atomic_subtract_long(&bd->bd_bufspace, size); return (ENOSPC); } /* Wake up the daemon on the transition. */ if (space < bd->bd_bufspacethresh && new >= bd->bd_bufspacethresh) bufspace_daemon_wakeup(bd); return (0); } /* * bufspace_release: * * Release reserved bufspace after bufspace_adjust() has consumed it. */ static void bufspace_release(struct bufdomain *bd, int size) { atomic_subtract_long(&bd->bd_bufspace, size); } /* * bufspace_wait: * * Wait for bufspace, acting as the buf daemon if a locked vnode is * supplied. bd_wanted must be set prior to polling for space. The * operation must be re-tried on return. */ static void bufspace_wait(struct bufdomain *bd, struct vnode *vp, int gbflags, int slpflag, int slptimeo) { struct thread *td; int error, fl, norunbuf; if ((gbflags & GB_NOWAIT_BD) != 0) return; td = curthread; BD_LOCK(bd); while (bd->bd_wanted) { if (vp != NULL && vp->v_type != VCHR && (td->td_pflags & TDP_BUFNEED) == 0) { BD_UNLOCK(bd); /* * getblk() is called with a vnode locked, and * some majority of the dirty buffers may as * well belong to the vnode. Flushing the * buffers there would make a progress that * cannot be achieved by the buf_daemon, that * cannot lock the vnode. */ norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) | (td->td_pflags & TDP_NORUNNINGBUF); /* * Play bufdaemon. The getnewbuf() function * may be called while the thread owns lock * for another dirty buffer for the same * vnode, which makes it impossible to use * VOP_FSYNC() there, due to the buffer lock * recursion. */ td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF; fl = buf_flush(vp, bd, flushbufqtarget); td->td_pflags &= norunbuf; BD_LOCK(bd); if (fl != 0) continue; if (bd->bd_wanted == 0) break; } error = msleep(&bd->bd_wanted, BD_LOCKPTR(bd), (PRIBIO + 4) | slpflag, "newbuf", slptimeo); if (error != 0) break; } BD_UNLOCK(bd); } static void bufspace_daemon_shutdown(void *arg, int howto __unused) { struct bufdomain *bd = arg; int error; if (KERNEL_PANICKED()) return; BD_RUN_LOCK(bd); bd->bd_shutdown = true; wakeup(&bd->bd_running); error = msleep(&bd->bd_shutdown, BD_RUN_LOCKPTR(bd), 0, "bufspace_shutdown", 60 * hz); BD_RUN_UNLOCK(bd); if (error != 0) printf("bufspacedaemon wait error: %d\n", error); } /* * bufspace_daemon: * * buffer space management daemon. Tries to maintain some marginal * amount of free buffer space so that requesting processes neither * block nor work to reclaim buffers. */ static void bufspace_daemon(void *arg) { struct bufdomain *bd = arg; EVENTHANDLER_REGISTER(shutdown_pre_sync, bufspace_daemon_shutdown, bd, SHUTDOWN_PRI_LAST + 100); BD_RUN_LOCK(bd); while (!bd->bd_shutdown) { BD_RUN_UNLOCK(bd); /* * Free buffers from the clean queue until we meet our * targets. * * Theory of operation: The buffer cache is most efficient * when some free buffer headers and space are always * available to getnewbuf(). This daemon attempts to prevent * the excessive blocking and synchronization associated * with shortfall. It goes through three phases according * demand: * * 1) The daemon wakes up voluntarily once per-second * during idle periods when the counters are below * the wakeup thresholds (bufspacethresh, lofreebuffers). * * 2) The daemon wakes up as we cross the thresholds * ahead of any potential blocking. This may bounce * slightly according to the rate of consumption and * release. * * 3) The daemon and consumers are starved for working * clean buffers. This is the 'bufspace' sleep below * which will inefficiently trade bufs with bqrelse * until we return to condition 2. */ while (bd->bd_bufspace > bd->bd_lobufspace || bd->bd_freebuffers < bd->bd_hifreebuffers) { if (buf_recycle(bd, false) != 0) { if (bd_flushall(bd)) continue; /* * Speedup dirty if we've run out of clean * buffers. This is possible in particular * because softdep may held many bufs locked * pending writes to other bufs which are * marked for delayed write, exhausting * clean space until they are written. */ bd_speedup(); BD_LOCK(bd); if (bd->bd_wanted) { msleep(&bd->bd_wanted, BD_LOCKPTR(bd), PRIBIO|PDROP, "bufspace", hz/10); } else BD_UNLOCK(bd); } maybe_yield(); } /* * Re-check our limits and sleep. bd_running must be * cleared prior to checking the limits to avoid missed * wakeups. The waker will adjust one of bufspace or * freebuffers prior to checking bd_running. */ BD_RUN_LOCK(bd); if (bd->bd_shutdown) break; atomic_store_int(&bd->bd_running, 0); if (bd->bd_bufspace < bd->bd_bufspacethresh && bd->bd_freebuffers > bd->bd_lofreebuffers) { msleep(&bd->bd_running, BD_RUN_LOCKPTR(bd), PRIBIO, "-", hz); } else { /* Avoid spurious wakeups while running. */ atomic_store_int(&bd->bd_running, 1); } } wakeup(&bd->bd_shutdown); BD_RUN_UNLOCK(bd); kthread_exit(); } /* * bufmallocadjust: * * Adjust the reported bufspace for a malloc managed buffer, possibly * waking any waiters. */ static void bufmallocadjust(struct buf *bp, int bufsize) { int diff; KASSERT((bp->b_flags & B_MALLOC) != 0, ("bufmallocadjust: non-malloc buf %p", bp)); diff = bufsize - bp->b_bufsize; if (diff < 0) atomic_subtract_long(&bufmallocspace, -diff); else atomic_add_long(&bufmallocspace, diff); bp->b_bufsize = bufsize; } /* * runningwakeup: * * Wake up processes that are waiting on asynchronous writes to fall * below lorunningspace. */ static void runningwakeup(void) { mtx_lock(&rbreqlock); if (runningbufreq) { runningbufreq = 0; wakeup(&runningbufreq); } mtx_unlock(&rbreqlock); } /* * runningbufwakeup: * * Decrement the outstanding write count according. */ void runningbufwakeup(struct buf *bp) { long space, bspace; bspace = bp->b_runningbufspace; if (bspace == 0) return; space = atomic_fetchadd_long(&runningbufspace, -bspace); KASSERT(space >= bspace, ("runningbufspace underflow %ld %ld", space, bspace)); bp->b_runningbufspace = 0; /* * Only acquire the lock and wakeup on the transition from exceeding * the threshold to falling below it. */ if (space < lorunningspace) return; if (space - bspace > lorunningspace) return; runningwakeup(); } +long +runningbufclaim(struct buf *bp, int space) +{ + long old; + + old = atomic_fetchadd_long(&runningbufspace, space); + bp->b_runningbufspace = space; + return (old); +} + /* * waitrunningbufspace() * * runningbufspace is a measure of the amount of I/O currently * running. This routine is used in async-write situations to * prevent creating huge backups of pending writes to a device. * Only asynchronous writes are governed by this function. * * This does NOT turn an async write into a sync write. It waits * for earlier writes to complete and generally returns before the * caller's write has reached the device. */ void waitrunningbufspace(void) { mtx_lock(&rbreqlock); while (runningbufspace > hirunningspace) { runningbufreq = 1; msleep(&runningbufreq, &rbreqlock, PVM, "wdrain", 0); } mtx_unlock(&rbreqlock); } /* * vfs_buf_test_cache: * * Called when a buffer is extended. This function clears the B_CACHE * bit if the newly extended portion of the buffer does not contain * valid data. */ static __inline void vfs_buf_test_cache(struct buf *bp, vm_ooffset_t foff, vm_offset_t off, vm_offset_t size, vm_page_t m) { /* * This function and its results are protected by higher level * synchronization requiring vnode and buf locks to page in and * validate pages. */ if (bp->b_flags & B_CACHE) { int base = (foff + off) & PAGE_MASK; if (vm_page_is_valid(m, base, size) == 0) bp->b_flags &= ~B_CACHE; } } /* Wake up the buffer daemon if necessary */ static void bd_wakeup(void) { mtx_lock(&bdlock); if (bd_request == 0) { bd_request = 1; wakeup(&bd_request); } mtx_unlock(&bdlock); } /* * Adjust the maxbcachbuf tunable. */ static void maxbcachebuf_adjust(void) { int i; /* * maxbcachebuf must be a power of 2 >= MAXBSIZE. */ i = 2; while (i * 2 <= maxbcachebuf) i *= 2; maxbcachebuf = i; if (maxbcachebuf < MAXBSIZE) maxbcachebuf = MAXBSIZE; if (maxbcachebuf > maxphys) maxbcachebuf = maxphys; if (bootverbose != 0 && maxbcachebuf != MAXBCACHEBUF) printf("maxbcachebuf=%d\n", maxbcachebuf); } /* * bd_speedup - speedup the buffer cache flushing code */ void bd_speedup(void) { int needwake; mtx_lock(&bdlock); needwake = 0; if (bd_speedupreq == 0 || bd_request == 0) needwake = 1; bd_speedupreq = 1; bd_request = 1; if (needwake) wakeup(&bd_request); mtx_unlock(&bdlock); } #ifdef __i386__ #define TRANSIENT_DENOM 5 #else #define TRANSIENT_DENOM 10 #endif /* * Calculating buffer cache scaling values and reserve space for buffer * headers. This is called during low level kernel initialization and * may be called more then once. We CANNOT write to the memory area * being reserved at this time. */ caddr_t kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est) { int tuned_nbuf; long maxbuf, maxbuf_sz, buf_sz, biotmap_sz; /* * With KASAN or KMSAN enabled, the kernel map is shadowed. Account for * this when sizing maps based on the amount of physical memory * available. */ #if defined(KASAN) physmem_est = (physmem_est * KASAN_SHADOW_SCALE) / (KASAN_SHADOW_SCALE + 1); #elif defined(KMSAN) physmem_est /= 3; /* * KMSAN cannot reliably determine whether buffer data is initialized * unless it is updated through a KVA mapping. */ unmapped_buf_allowed = 0; #endif /* * physmem_est is in pages. Convert it to kilobytes (assumes * PAGE_SIZE is >= 1K) */ physmem_est = physmem_est * (PAGE_SIZE / 1024); maxbcachebuf_adjust(); /* * The nominal buffer size (and minimum KVA allocation) is BKVASIZE. * For the first 64MB of ram nominally allocate sufficient buffers to * cover 1/4 of our ram. Beyond the first 64MB allocate additional * buffers to cover 1/10 of our ram over 64MB. When auto-sizing * the buffer cache we limit the eventual kva reservation to * maxbcache bytes. * * factor represents the 1/4 x ram conversion. */ if (nbuf == 0) { int factor = 4 * BKVASIZE / 1024; nbuf = 50; if (physmem_est > 4096) nbuf += min((physmem_est - 4096) / factor, 65536 / factor); if (physmem_est > 65536) nbuf += min((physmem_est - 65536) * 2 / (factor * 5), 32 * 1024 * 1024 / (factor * 5)); if (maxbcache && nbuf > maxbcache / BKVASIZE) nbuf = maxbcache / BKVASIZE; tuned_nbuf = 1; } else tuned_nbuf = 0; /* XXX Avoid unsigned long overflows later on with maxbufspace. */ maxbuf = (LONG_MAX / 3) / BKVASIZE; if (nbuf > maxbuf) { if (!tuned_nbuf) printf("Warning: nbufs lowered from %d to %ld\n", nbuf, maxbuf); nbuf = maxbuf; } /* * Ideal allocation size for the transient bio submap is 10% * of the maximal space buffer map. This roughly corresponds * to the amount of the buffer mapped for typical UFS load. * * Clip the buffer map to reserve space for the transient * BIOs, if its extent is bigger than 90% (80% on i386) of the * maximum buffer map extent on the platform. * * The fall-back to the maxbuf in case of maxbcache unset, * allows to not trim the buffer KVA for the architectures * with ample KVA space. */ if (bio_transient_maxcnt == 0 && unmapped_buf_allowed) { maxbuf_sz = maxbcache != 0 ? maxbcache : maxbuf * BKVASIZE; buf_sz = (long)nbuf * BKVASIZE; if (buf_sz < maxbuf_sz / TRANSIENT_DENOM * (TRANSIENT_DENOM - 1)) { /* * There is more KVA than memory. Do not * adjust buffer map size, and assign the rest * of maxbuf to transient map. */ biotmap_sz = maxbuf_sz - buf_sz; } else { /* * Buffer map spans all KVA we could afford on * this platform. Give 10% (20% on i386) of * the buffer map to the transient bio map. */ biotmap_sz = buf_sz / TRANSIENT_DENOM; buf_sz -= biotmap_sz; } if (biotmap_sz / INT_MAX > maxphys) bio_transient_maxcnt = INT_MAX; else bio_transient_maxcnt = biotmap_sz / maxphys; /* * Artificially limit to 1024 simultaneous in-flight I/Os * using the transient mapping. */ if (bio_transient_maxcnt > 1024) bio_transient_maxcnt = 1024; if (tuned_nbuf) nbuf = buf_sz / BKVASIZE; } if (nswbuf == 0) { /* * Pager buffers are allocated for short periods, so scale the * number of reserved buffers based on the number of CPUs rather * than amount of memory. */ nswbuf = min(nbuf / 4, 32 * mp_ncpus); if (nswbuf < NSWBUF_MIN) nswbuf = NSWBUF_MIN; } /* * Reserve space for the buffer cache buffers */ buf = (char *)v; v = (caddr_t)buf + (sizeof(struct buf) + sizeof(vm_page_t) * atop(maxbcachebuf)) * nbuf; return (v); } /* * Single global constant for BUF_WMESG, to avoid getting multiple * references. */ static const char buf_wmesg[] = "bufwait"; /* Initialize the buffer subsystem. Called before use of any buffers. */ void bufinit(void) { struct buf *bp; int i; TSENTER(); KASSERT(maxbcachebuf >= MAXBSIZE, ("maxbcachebuf (%d) must be >= MAXBSIZE (%d)\n", maxbcachebuf, MAXBSIZE)); bq_init(&bqempty, QUEUE_EMPTY, -1, "bufq empty lock"); mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF); mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF); mtx_init(&bdirtylock, "dirty buf lock", NULL, MTX_DEF); unmapped_buf = (caddr_t)kva_alloc(maxphys); /* finally, initialize each buffer header and stick on empty q */ for (i = 0; i < nbuf; i++) { bp = nbufp(i); bzero(bp, sizeof(*bp) + sizeof(vm_page_t) * atop(maxbcachebuf)); bp->b_flags = B_INVAL; bp->b_rcred = NOCRED; bp->b_wcred = NOCRED; bp->b_qindex = QUEUE_NONE; bp->b_domain = -1; bp->b_subqueue = mp_maxid + 1; bp->b_xflags = 0; bp->b_data = bp->b_kvabase = unmapped_buf; LIST_INIT(&bp->b_dep); BUF_LOCKINIT(bp, buf_wmesg); bq_insert(&bqempty, bp, false); } /* * maxbufspace is the absolute maximum amount of buffer space we are * allowed to reserve in KVM and in real terms. The absolute maximum * is nominally used by metadata. hibufspace is the nominal maximum * used by most other requests. The differential is required to * ensure that metadata deadlocks don't occur. * * maxbufspace is based on BKVASIZE. Allocating buffers larger then * this may result in KVM fragmentation which is not handled optimally * by the system. XXX This is less true with vmem. We could use * PAGE_SIZE. */ maxbufspace = (long)nbuf * BKVASIZE; hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - maxbcachebuf * 10); lobufspace = (hibufspace / 20) * 19; /* 95% */ bufspacethresh = lobufspace + (hibufspace - lobufspace) / 2; /* * Note: The 16 MiB upper limit for hirunningspace was chosen * arbitrarily and may need further tuning. It corresponds to * 128 outstanding write IO requests (if IO size is 128 KiB), * which fits with many RAID controllers' tagged queuing limits. * The lower 1 MiB limit is the historical upper limit for * hirunningspace. */ hirunningspace = lmax(lmin(roundup(hibufspace / 64, maxbcachebuf), 16 * 1024 * 1024), 1024 * 1024); lorunningspace = roundup((hirunningspace * 2) / 3, maxbcachebuf); /* * Limit the amount of malloc memory since it is wired permanently into * the kernel space. Even though this is accounted for in the buffer * allocation, we don't want the malloced region to grow uncontrolled. * The malloc scheme improves memory utilization significantly on * average (small) directories. */ maxbufmallocspace = hibufspace / 20; /* * Reduce the chance of a deadlock occurring by limiting the number * of delayed-write dirty buffers we allow to stack up. */ hidirtybuffers = nbuf / 4 + 20; dirtybufthresh = hidirtybuffers * 9 / 10; /* * To support extreme low-memory systems, make sure hidirtybuffers * cannot eat up all available buffer space. This occurs when our * minimum cannot be met. We try to size hidirtybuffers to 3/4 our * buffer space assuming BKVASIZE'd buffers. */ while ((long)hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) { hidirtybuffers >>= 1; } lodirtybuffers = hidirtybuffers / 2; /* * lofreebuffers should be sufficient to avoid stalling waiting on * buf headers under heavy utilization. The bufs in per-cpu caches * are counted as free but will be unavailable to threads executing * on other cpus. * * hifreebuffers is the free target for the bufspace daemon. This * should be set appropriately to limit work per-iteration. */ lofreebuffers = MIN((nbuf / 25) + (20 * mp_ncpus), 128 * mp_ncpus); hifreebuffers = (3 * lofreebuffers) / 2; numfreebuffers = nbuf; /* Setup the kva and free list allocators. */ vmem_set_reclaim(buffer_arena, bufkva_reclaim); buf_zone = uma_zcache_create("buf free cache", sizeof(struct buf) + sizeof(vm_page_t) * atop(maxbcachebuf), NULL, NULL, NULL, NULL, buf_import, buf_release, NULL, 0); /* * Size the clean queue according to the amount of buffer space. * One queue per-256mb up to the max. More queues gives better * concurrency but less accurate LRU. */ buf_domains = MIN(howmany(maxbufspace, 256*1024*1024), BUF_DOMAINS); for (i = 0 ; i < buf_domains; i++) { struct bufdomain *bd; bd = &bdomain[i]; bd_init(bd); bd->bd_freebuffers = nbuf / buf_domains; bd->bd_hifreebuffers = hifreebuffers / buf_domains; bd->bd_lofreebuffers = lofreebuffers / buf_domains; bd->bd_bufspace = 0; bd->bd_maxbufspace = maxbufspace / buf_domains; bd->bd_hibufspace = hibufspace / buf_domains; bd->bd_lobufspace = lobufspace / buf_domains; bd->bd_bufspacethresh = bufspacethresh / buf_domains; bd->bd_numdirtybuffers = 0; bd->bd_hidirtybuffers = hidirtybuffers / buf_domains; bd->bd_lodirtybuffers = lodirtybuffers / buf_domains; bd->bd_dirtybufthresh = dirtybufthresh / buf_domains; /* Don't allow more than 2% of bufs in the per-cpu caches. */ bd->bd_lim = nbuf / buf_domains / 50 / mp_ncpus; } getnewbufcalls = counter_u64_alloc(M_WAITOK); getnewbufrestarts = counter_u64_alloc(M_WAITOK); mappingrestarts = counter_u64_alloc(M_WAITOK); numbufallocfails = counter_u64_alloc(M_WAITOK); notbufdflushes = counter_u64_alloc(M_WAITOK); buffreekvacnt = counter_u64_alloc(M_WAITOK); bufdefragcnt = counter_u64_alloc(M_WAITOK); bufkvaspace = counter_u64_alloc(M_WAITOK); TSEXIT(); } #ifdef INVARIANTS static inline void vfs_buf_check_mapped(struct buf *bp) { KASSERT(bp->b_kvabase != unmapped_buf, ("mapped buf: b_kvabase was not updated %p", bp)); KASSERT(bp->b_data != unmapped_buf, ("mapped buf: b_data was not updated %p", bp)); KASSERT(bp->b_data < unmapped_buf || bp->b_data >= unmapped_buf + maxphys, ("b_data + b_offset unmapped %p", bp)); } static inline void vfs_buf_check_unmapped(struct buf *bp) { KASSERT(bp->b_data == unmapped_buf, ("unmapped buf: corrupted b_data %p", bp)); } #define BUF_CHECK_MAPPED(bp) vfs_buf_check_mapped(bp) #define BUF_CHECK_UNMAPPED(bp) vfs_buf_check_unmapped(bp) #else #define BUF_CHECK_MAPPED(bp) do {} while (0) #define BUF_CHECK_UNMAPPED(bp) do {} while (0) #endif static int isbufbusy(struct buf *bp) { if (((bp->b_flags & B_INVAL) == 0 && BUF_ISLOCKED(bp)) || ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI)) return (1); return (0); } /* * Shutdown the system cleanly to prepare for reboot, halt, or power off. */ void bufshutdown(int show_busybufs) { static int first_buf_printf = 1; struct buf *bp; int i, iter, nbusy, pbusy; #ifndef PREEMPTION int subiter; #endif /* * Sync filesystems for shutdown */ wdog_kern_pat(WD_LASTVAL); kern_sync(curthread); /* * With soft updates, some buffers that are * written will be remarked as dirty until other * buffers are written. */ for (iter = pbusy = 0; iter < 20; iter++) { nbusy = 0; for (i = nbuf - 1; i >= 0; i--) { bp = nbufp(i); if (isbufbusy(bp)) nbusy++; } if (nbusy == 0) { if (first_buf_printf) printf("All buffers synced."); break; } if (first_buf_printf) { printf("Syncing disks, buffers remaining... "); first_buf_printf = 0; } printf("%d ", nbusy); if (nbusy < pbusy) iter = 0; pbusy = nbusy; wdog_kern_pat(WD_LASTVAL); kern_sync(curthread); #ifdef PREEMPTION /* * Spin for a while to allow interrupt threads to run. */ DELAY(50000 * iter); #else /* * Context switch several times to allow interrupt * threads to run. */ for (subiter = 0; subiter < 50 * iter; subiter++) { sched_relinquish(curthread); DELAY(1000); } #endif } printf("\n"); /* * Count only busy local buffers to prevent forcing * a fsck if we're just a client of a wedged NFS server */ nbusy = 0; for (i = nbuf - 1; i >= 0; i--) { bp = nbufp(i); if (isbufbusy(bp)) { #if 0 /* XXX: This is bogus. We should probably have a BO_REMOTE flag instead */ if (bp->b_dev == NULL) { TAILQ_REMOVE(&mountlist, bp->b_vp->v_mount, mnt_list); continue; } #endif nbusy++; if (show_busybufs > 0) { printf( "%d: buf:%p, vnode:%p, flags:%0x, blkno:%jd, lblkno:%jd, buflock:", nbusy, bp, bp->b_vp, bp->b_flags, (intmax_t)bp->b_blkno, (intmax_t)bp->b_lblkno); BUF_LOCKPRINTINFO(bp); if (show_busybufs > 1) vn_printf(bp->b_vp, "vnode content: "); } } } if (nbusy) { /* * Failed to sync all blocks. Indicate this and don't * unmount filesystems (thus forcing an fsck on reboot). */ BOOTTRACE("shutdown failed to sync buffers"); printf("Giving up on %d buffers\n", nbusy); DELAY(5000000); /* 5 seconds */ swapoff_all(); } else { BOOTTRACE("shutdown sync complete"); if (!first_buf_printf) printf("Final sync complete\n"); /* * Unmount filesystems and perform swapoff, to quiesce * the system as much as possible. In particular, no * I/O should be initiated from top levels since it * might be abruptly terminated by reset, or otherwise * erronously handled because other parts of the * system are disabled. * * Swapoff before unmount, because file-backed swap is * non-operational after unmount of the underlying * filesystem. */ if (!KERNEL_PANICKED()) { swapoff_all(); vfs_unmountall(); } BOOTTRACE("shutdown unmounted all filesystems"); } DELAY(100000); /* wait for console output to finish */ } static void bpmap_qenter(struct buf *bp) { BUF_CHECK_MAPPED(bp); /* * bp->b_data is relative to bp->b_offset, but * bp->b_offset may be offset into the first page. */ bp->b_data = (caddr_t)trunc_page((vm_offset_t)bp->b_data); pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages); bp->b_data = (caddr_t)((vm_offset_t)bp->b_data | (vm_offset_t)(bp->b_offset & PAGE_MASK)); } static inline struct bufdomain * bufdomain(struct buf *bp) { return (&bdomain[bp->b_domain]); } static struct bufqueue * bufqueue(struct buf *bp) { switch (bp->b_qindex) { case QUEUE_NONE: /* FALLTHROUGH */ case QUEUE_SENTINEL: return (NULL); case QUEUE_EMPTY: return (&bqempty); case QUEUE_DIRTY: return (&bufdomain(bp)->bd_dirtyq); case QUEUE_CLEAN: return (&bufdomain(bp)->bd_subq[bp->b_subqueue]); default: break; } panic("bufqueue(%p): Unhandled type %d\n", bp, bp->b_qindex); } /* * Return the locked bufqueue that bp is a member of. */ static struct bufqueue * bufqueue_acquire(struct buf *bp) { struct bufqueue *bq, *nbq; /* * bp can be pushed from a per-cpu queue to the * cleanq while we're waiting on the lock. Retry * if the queues don't match. */ bq = bufqueue(bp); BQ_LOCK(bq); for (;;) { nbq = bufqueue(bp); if (bq == nbq) break; BQ_UNLOCK(bq); BQ_LOCK(nbq); bq = nbq; } return (bq); } /* * binsfree: * * Insert the buffer into the appropriate free list. Requires a * locked buffer on entry and buffer is unlocked before return. */ static void binsfree(struct buf *bp, int qindex) { struct bufdomain *bd; struct bufqueue *bq; KASSERT(qindex == QUEUE_CLEAN || qindex == QUEUE_DIRTY, ("binsfree: Invalid qindex %d", qindex)); BUF_ASSERT_XLOCKED(bp); /* * Handle delayed bremfree() processing. */ if (bp->b_flags & B_REMFREE) { if (bp->b_qindex == qindex) { bp->b_flags |= B_REUSE; bp->b_flags &= ~B_REMFREE; BUF_UNLOCK(bp); return; } bq = bufqueue_acquire(bp); bq_remove(bq, bp); BQ_UNLOCK(bq); } bd = bufdomain(bp); if (qindex == QUEUE_CLEAN) { if (bd->bd_lim != 0) bq = &bd->bd_subq[PCPU_GET(cpuid)]; else bq = bd->bd_cleanq; } else bq = &bd->bd_dirtyq; bq_insert(bq, bp, true); } /* * buf_free: * * Free a buffer to the buf zone once it no longer has valid contents. */ static void buf_free(struct buf *bp) { if (bp->b_flags & B_REMFREE) bremfreef(bp); if (bp->b_vflags & BV_BKGRDINPROG) panic("losing buffer 1"); if (bp->b_rcred != NOCRED) { crfree(bp->b_rcred); bp->b_rcred = NOCRED; } if (bp->b_wcred != NOCRED) { crfree(bp->b_wcred); bp->b_wcred = NOCRED; } if (!LIST_EMPTY(&bp->b_dep)) buf_deallocate(bp); bufkva_free(bp); atomic_add_int(&bufdomain(bp)->bd_freebuffers, 1); MPASS((bp->b_flags & B_MAXPHYS) == 0); BUF_UNLOCK(bp); uma_zfree(buf_zone, bp); } /* * buf_import: * * Import bufs into the uma cache from the buf list. The system still * expects a static array of bufs and much of the synchronization * around bufs assumes type stable storage. As a result, UMA is used * only as a per-cpu cache of bufs still maintained on a global list. */ static int buf_import(void *arg, void **store, int cnt, int domain, int flags) { struct buf *bp; int i; BQ_LOCK(&bqempty); for (i = 0; i < cnt; i++) { bp = TAILQ_FIRST(&bqempty.bq_queue); if (bp == NULL) break; bq_remove(&bqempty, bp); store[i] = bp; } BQ_UNLOCK(&bqempty); return (i); } /* * buf_release: * * Release bufs from the uma cache back to the buffer queues. */ static void buf_release(void *arg, void **store, int cnt) { struct bufqueue *bq; struct buf *bp; int i; bq = &bqempty; BQ_LOCK(bq); for (i = 0; i < cnt; i++) { bp = store[i]; /* Inline bq_insert() to batch locking. */ TAILQ_INSERT_TAIL(&bq->bq_queue, bp, b_freelist); bp->b_flags &= ~(B_AGE | B_REUSE); bq->bq_len++; bp->b_qindex = bq->bq_index; } BQ_UNLOCK(bq); } /* * buf_alloc: * * Allocate an empty buffer header. */ static struct buf * buf_alloc(struct bufdomain *bd) { struct buf *bp; int freebufs, error; /* * We can only run out of bufs in the buf zone if the average buf * is less than BKVASIZE. In this case the actual wait/block will * come from buf_reycle() failing to flush one of these small bufs. */ bp = NULL; freebufs = atomic_fetchadd_int(&bd->bd_freebuffers, -1); if (freebufs > 0) bp = uma_zalloc(buf_zone, M_NOWAIT); if (bp == NULL) { atomic_add_int(&bd->bd_freebuffers, 1); bufspace_daemon_wakeup(bd); counter_u64_add(numbufallocfails, 1); return (NULL); } /* * Wake-up the bufspace daemon on transition below threshold. */ if (freebufs == bd->bd_lofreebuffers) bufspace_daemon_wakeup(bd); error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWITNESS, NULL); KASSERT(error == 0, ("%s: BUF_LOCK on free buf %p: %d.", __func__, bp, error)); (void)error; KASSERT(bp->b_vp == NULL, ("bp: %p still has vnode %p.", bp, bp->b_vp)); KASSERT((bp->b_flags & (B_DELWRI | B_NOREUSE)) == 0, ("invalid buffer %p flags %#x", bp, bp->b_flags)); KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0, ("bp: %p still on a buffer list. xflags %X", bp, bp->b_xflags)); KASSERT(bp->b_npages == 0, ("bp: %p still has %d vm pages\n", bp, bp->b_npages)); KASSERT(bp->b_kvasize == 0, ("bp: %p still has kva\n", bp)); KASSERT(bp->b_bufsize == 0, ("bp: %p still has bufspace\n", bp)); MPASS((bp->b_flags & B_MAXPHYS) == 0); bp->b_domain = BD_DOMAIN(bd); bp->b_flags = 0; bp->b_ioflags = 0; bp->b_xflags = 0; bp->b_vflags = 0; bp->b_vp = NULL; bp->b_blkno = bp->b_lblkno = 0; bp->b_offset = NOOFFSET; bp->b_iodone = 0; bp->b_error = 0; bp->b_resid = 0; bp->b_bcount = 0; bp->b_npages = 0; bp->b_dirtyoff = bp->b_dirtyend = 0; bp->b_bufobj = NULL; bp->b_data = bp->b_kvabase = unmapped_buf; bp->b_fsprivate1 = NULL; bp->b_fsprivate2 = NULL; bp->b_fsprivate3 = NULL; LIST_INIT(&bp->b_dep); return (bp); } /* * buf_recycle: * * Free a buffer from the given bufqueue. kva controls whether the * freed buf must own some kva resources. This is used for * defragmenting. */ static int buf_recycle(struct bufdomain *bd, bool kva) { struct bufqueue *bq; struct buf *bp, *nbp; if (kva) counter_u64_add(bufdefragcnt, 1); nbp = NULL; bq = bd->bd_cleanq; BQ_LOCK(bq); KASSERT(BQ_LOCKPTR(bq) == BD_LOCKPTR(bd), ("buf_recycle: Locks don't match")); nbp = TAILQ_FIRST(&bq->bq_queue); /* * Run scan, possibly freeing data and/or kva mappings on the fly * depending. */ while ((bp = nbp) != NULL) { /* * Calculate next bp (we can only use it if we do not * release the bqlock). */ nbp = TAILQ_NEXT(bp, b_freelist); /* * If we are defragging then we need a buffer with * some kva to reclaim. */ if (kva && bp->b_kvasize == 0) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) continue; /* * Implement a second chance algorithm for frequently * accessed buffers. */ if ((bp->b_flags & B_REUSE) != 0) { TAILQ_REMOVE(&bq->bq_queue, bp, b_freelist); TAILQ_INSERT_TAIL(&bq->bq_queue, bp, b_freelist); bp->b_flags &= ~B_REUSE; BUF_UNLOCK(bp); continue; } /* * Skip buffers with background writes in progress. */ if ((bp->b_vflags & BV_BKGRDINPROG) != 0) { BUF_UNLOCK(bp); continue; } KASSERT(bp->b_qindex == QUEUE_CLEAN, ("buf_recycle: inconsistent queue %d bp %p", bp->b_qindex, bp)); KASSERT(bp->b_domain == BD_DOMAIN(bd), ("getnewbuf: queue domain %d doesn't match request %d", bp->b_domain, (int)BD_DOMAIN(bd))); /* * NOTE: nbp is now entirely invalid. We can only restart * the scan from this point on. */ bq_remove(bq, bp); BQ_UNLOCK(bq); /* * Requeue the background write buffer with error and * restart the scan. */ if ((bp->b_vflags & BV_BKGRDERR) != 0) { bqrelse(bp); BQ_LOCK(bq); nbp = TAILQ_FIRST(&bq->bq_queue); continue; } bp->b_flags |= B_INVAL; brelse(bp); return (0); } bd->bd_wanted = 1; BQ_UNLOCK(bq); return (ENOBUFS); } /* * bremfree: * * Mark the buffer for removal from the appropriate free list. * */ void bremfree(struct buf *bp) { CTR3(KTR_BUF, "bremfree(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT((bp->b_flags & B_REMFREE) == 0, ("bremfree: buffer %p already marked for delayed removal.", bp)); KASSERT(bp->b_qindex != QUEUE_NONE, ("bremfree: buffer %p not on a queue.", bp)); BUF_ASSERT_XLOCKED(bp); bp->b_flags |= B_REMFREE; } /* * bremfreef: * * Force an immediate removal from a free list. Used only in nfs when * it abuses the b_freelist pointer. */ void bremfreef(struct buf *bp) { struct bufqueue *bq; bq = bufqueue_acquire(bp); bq_remove(bq, bp); BQ_UNLOCK(bq); } static void bq_init(struct bufqueue *bq, int qindex, int subqueue, const char *lockname) { mtx_init(&bq->bq_lock, lockname, NULL, MTX_DEF); TAILQ_INIT(&bq->bq_queue); bq->bq_len = 0; bq->bq_index = qindex; bq->bq_subqueue = subqueue; } static void bd_init(struct bufdomain *bd) { int i; /* Per-CPU clean buf queues, plus one global queue. */ bd->bd_subq = mallocarray(mp_maxid + 2, sizeof(struct bufqueue), M_BIOBUF, M_WAITOK | M_ZERO); bd->bd_cleanq = &bd->bd_subq[mp_maxid + 1]; bq_init(bd->bd_cleanq, QUEUE_CLEAN, mp_maxid + 1, "bufq clean lock"); bq_init(&bd->bd_dirtyq, QUEUE_DIRTY, -1, "bufq dirty lock"); for (i = 0; i <= mp_maxid; i++) bq_init(&bd->bd_subq[i], QUEUE_CLEAN, i, "bufq clean subqueue lock"); mtx_init(&bd->bd_run_lock, "bufspace daemon run lock", NULL, MTX_DEF); } /* * bq_remove: * * Removes a buffer from the free list, must be called with the * correct qlock held. */ static void bq_remove(struct bufqueue *bq, struct buf *bp) { CTR3(KTR_BUF, "bq_remove(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_qindex != QUEUE_NONE, ("bq_remove: buffer %p not on a queue.", bp)); KASSERT(bufqueue(bp) == bq, ("bq_remove: Remove buffer %p from wrong queue.", bp)); BQ_ASSERT_LOCKED(bq); if (bp->b_qindex != QUEUE_EMPTY) { BUF_ASSERT_XLOCKED(bp); } KASSERT(bq->bq_len >= 1, ("queue %d underflow", bp->b_qindex)); TAILQ_REMOVE(&bq->bq_queue, bp, b_freelist); bq->bq_len--; bp->b_qindex = QUEUE_NONE; bp->b_flags &= ~(B_REMFREE | B_REUSE); } static void bd_flush(struct bufdomain *bd, struct bufqueue *bq) { struct buf *bp; BQ_ASSERT_LOCKED(bq); if (bq != bd->bd_cleanq) { BD_LOCK(bd); while ((bp = TAILQ_FIRST(&bq->bq_queue)) != NULL) { TAILQ_REMOVE(&bq->bq_queue, bp, b_freelist); TAILQ_INSERT_TAIL(&bd->bd_cleanq->bq_queue, bp, b_freelist); bp->b_subqueue = bd->bd_cleanq->bq_subqueue; } bd->bd_cleanq->bq_len += bq->bq_len; bq->bq_len = 0; } if (bd->bd_wanted) { bd->bd_wanted = 0; wakeup(&bd->bd_wanted); } if (bq != bd->bd_cleanq) BD_UNLOCK(bd); } static int bd_flushall(struct bufdomain *bd) { struct bufqueue *bq; int flushed; int i; if (bd->bd_lim == 0) return (0); flushed = 0; for (i = 0; i <= mp_maxid; i++) { bq = &bd->bd_subq[i]; if (bq->bq_len == 0) continue; BQ_LOCK(bq); bd_flush(bd, bq); BQ_UNLOCK(bq); flushed++; } return (flushed); } static void bq_insert(struct bufqueue *bq, struct buf *bp, bool unlock) { struct bufdomain *bd; if (bp->b_qindex != QUEUE_NONE) panic("bq_insert: free buffer %p onto another queue?", bp); bd = bufdomain(bp); if (bp->b_flags & B_AGE) { /* Place this buf directly on the real queue. */ if (bq->bq_index == QUEUE_CLEAN) bq = bd->bd_cleanq; BQ_LOCK(bq); TAILQ_INSERT_HEAD(&bq->bq_queue, bp, b_freelist); } else { BQ_LOCK(bq); TAILQ_INSERT_TAIL(&bq->bq_queue, bp, b_freelist); } bp->b_flags &= ~(B_AGE | B_REUSE); bq->bq_len++; bp->b_qindex = bq->bq_index; bp->b_subqueue = bq->bq_subqueue; /* * Unlock before we notify so that we don't wakeup a waiter that * fails a trylock on the buf and sleeps again. */ if (unlock) BUF_UNLOCK(bp); if (bp->b_qindex == QUEUE_CLEAN) { /* * Flush the per-cpu queue and notify any waiters. */ if (bd->bd_wanted || (bq != bd->bd_cleanq && bq->bq_len >= bd->bd_lim)) bd_flush(bd, bq); } BQ_UNLOCK(bq); } /* * bufkva_free: * * Free the kva allocation for a buffer. * */ static void bufkva_free(struct buf *bp) { #ifdef INVARIANTS if (bp->b_kvasize == 0) { KASSERT(bp->b_kvabase == unmapped_buf && bp->b_data == unmapped_buf, ("Leaked KVA space on %p", bp)); } else if (buf_mapped(bp)) BUF_CHECK_MAPPED(bp); else BUF_CHECK_UNMAPPED(bp); #endif if (bp->b_kvasize == 0) return; vmem_free(buffer_arena, (vm_offset_t)bp->b_kvabase, bp->b_kvasize); counter_u64_add(bufkvaspace, -bp->b_kvasize); counter_u64_add(buffreekvacnt, 1); bp->b_data = bp->b_kvabase = unmapped_buf; bp->b_kvasize = 0; } /* * bufkva_alloc: * * Allocate the buffer KVA and set b_kvasize and b_kvabase. */ static int bufkva_alloc(struct buf *bp, int maxsize, int gbflags) { vm_offset_t addr; int error; KASSERT((gbflags & GB_UNMAPPED) == 0 || (gbflags & GB_KVAALLOC) != 0, ("Invalid gbflags 0x%x in %s", gbflags, __func__)); MPASS((bp->b_flags & B_MAXPHYS) == 0); KASSERT(maxsize <= maxbcachebuf, ("bufkva_alloc kva too large %d %u", maxsize, maxbcachebuf)); bufkva_free(bp); addr = 0; error = vmem_alloc(buffer_arena, maxsize, M_BESTFIT | M_NOWAIT, &addr); if (error != 0) { /* * Buffer map is too fragmented. Request the caller * to defragment the map. */ return (error); } bp->b_kvabase = (caddr_t)addr; bp->b_kvasize = maxsize; counter_u64_add(bufkvaspace, bp->b_kvasize); if ((gbflags & GB_UNMAPPED) != 0) { bp->b_data = unmapped_buf; BUF_CHECK_UNMAPPED(bp); } else { bp->b_data = bp->b_kvabase; BUF_CHECK_MAPPED(bp); } return (0); } /* * bufkva_reclaim: * * Reclaim buffer kva by freeing buffers holding kva. This is a vmem * callback that fires to avoid returning failure. */ static void bufkva_reclaim(vmem_t *vmem, int flags) { bool done; int q; int i; done = false; for (i = 0; i < 5; i++) { for (q = 0; q < buf_domains; q++) if (buf_recycle(&bdomain[q], true) != 0) done = true; if (done) break; } return; } /* * Attempt to initiate asynchronous I/O on read-ahead blocks. We must * clear BIO_ERROR and B_INVAL prior to initiating I/O . If B_CACHE is set, * the buffer is valid and we do not have to do anything. */ static void breada(struct vnode * vp, daddr_t * rablkno, int * rabsize, int cnt, struct ucred * cred, int flags, void (*ckhashfunc)(struct buf *)) { struct buf *rabp; struct thread *td; int i; td = curthread; for (i = 0; i < cnt; i++, rablkno++, rabsize++) { if (inmem(vp, *rablkno)) continue; rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0); if ((rabp->b_flags & B_CACHE) != 0) { brelse(rabp); continue; } #ifdef RACCT if (racct_enable) { PROC_LOCK(curproc); racct_add_buf(curproc, rabp, 0); PROC_UNLOCK(curproc); } #endif /* RACCT */ td->td_ru.ru_inblock++; rabp->b_flags |= B_ASYNC; rabp->b_flags &= ~B_INVAL; if ((flags & GB_CKHASH) != 0) { rabp->b_flags |= B_CKHASH; rabp->b_ckhashcalc = ckhashfunc; } rabp->b_ioflags &= ~BIO_ERROR; rabp->b_iocmd = BIO_READ; if (rabp->b_rcred == NOCRED && cred != NOCRED) rabp->b_rcred = crhold(cred); vfs_busy_pages(rabp, 0); BUF_KERNPROC(rabp); rabp->b_iooffset = dbtob(rabp->b_blkno); bstrategy(rabp); } } /* * Entry point for bread() and breadn() via #defines in sys/buf.h. * * Get a buffer with the specified data. Look in the cache first. We * must clear BIO_ERROR and B_INVAL prior to initiating I/O. If B_CACHE * is set, the buffer is valid and we do not have to do anything, see * getblk(). Also starts asynchronous I/O on read-ahead blocks. * * Always return a NULL buffer pointer (in bpp) when returning an error. * * The blkno parameter is the logical block being requested. Normally * the mapping of logical block number to disk block address is done * by calling VOP_BMAP(). However, if the mapping is already known, the * disk block address can be passed using the dblkno parameter. If the * disk block address is not known, then the same value should be passed * for blkno and dblkno. */ int breadn_flags(struct vnode *vp, daddr_t blkno, daddr_t dblkno, int size, daddr_t *rablkno, int *rabsize, int cnt, struct ucred *cred, int flags, void (*ckhashfunc)(struct buf *), struct buf **bpp) { struct buf *bp; struct thread *td; int error, readwait, rv; CTR3(KTR_BUF, "breadn(%p, %jd, %d)", vp, blkno, size); td = curthread; /* * Can only return NULL if GB_LOCK_NOWAIT or GB_SPARSE flags * are specified. */ error = getblkx(vp, blkno, dblkno, size, 0, 0, flags, &bp); if (error != 0) { *bpp = NULL; return (error); } KASSERT(blkno == bp->b_lblkno, ("getblkx returned buffer for blkno %jd instead of blkno %jd", (intmax_t)bp->b_lblkno, (intmax_t)blkno)); flags &= ~GB_NOSPARSE; *bpp = bp; /* * If not found in cache, do some I/O */ readwait = 0; if ((bp->b_flags & B_CACHE) == 0) { #ifdef RACCT if (racct_enable) { PROC_LOCK(td->td_proc); racct_add_buf(td->td_proc, bp, 0); PROC_UNLOCK(td->td_proc); } #endif /* RACCT */ td->td_ru.ru_inblock++; bp->b_iocmd = BIO_READ; bp->b_flags &= ~B_INVAL; if ((flags & GB_CKHASH) != 0) { bp->b_flags |= B_CKHASH; bp->b_ckhashcalc = ckhashfunc; } if ((flags & GB_CVTENXIO) != 0) bp->b_xflags |= BX_CVTENXIO; bp->b_ioflags &= ~BIO_ERROR; if (bp->b_rcred == NOCRED && cred != NOCRED) bp->b_rcred = crhold(cred); vfs_busy_pages(bp, 0); bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); ++readwait; } /* * Attempt to initiate asynchronous I/O on read-ahead blocks. */ breada(vp, rablkno, rabsize, cnt, cred, flags, ckhashfunc); rv = 0; if (readwait) { rv = bufwait(bp); if (rv != 0) { brelse(bp); *bpp = NULL; } } return (rv); } /* * Write, release buffer on completion. (Done by iodone * if async). Do not bother writing anything if the buffer * is invalid. * * Note that we set B_CACHE here, indicating that buffer is * fully valid and thus cacheable. This is true even of NFS * now so we set it generally. This could be set either here * or in biodone() since the I/O is synchronous. We put it * here. */ int bufwrite(struct buf *bp) { struct vnode *vp; long space; int oldflags, retval; bool vp_md; CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); if ((bp->b_bufobj->bo_flag & BO_DEAD) != 0) { bp->b_flags |= B_INVAL | B_RELBUF; bp->b_flags &= ~B_CACHE; brelse(bp); return (ENXIO); } if ((bp->b_flags & B_INVAL) != 0) { brelse(bp); return (0); } if ((bp->b_flags & B_BARRIER) != 0) atomic_add_long(&barrierwrites, 1); oldflags = bp->b_flags; KASSERT((bp->b_vflags & BV_BKGRDINPROG) == 0, ("FFS background buffer should not get here %p", bp)); vp = bp->b_vp; vp_md = vp != NULL && (vp->v_vflag & VV_MD) != 0; /* * Mark the buffer clean. Increment the bufobj write count * before bundirty() call, to prevent other thread from seeing * empty dirty list and zero counter for writes in progress, * falsely indicating that the bufobj is clean. */ bufobj_wref(bp->b_bufobj); bundirty(bp); bp->b_flags &= ~B_DONE; bp->b_ioflags &= ~BIO_ERROR; bp->b_flags |= B_CACHE; bp->b_iocmd = BIO_WRITE; vfs_busy_pages(bp, 1); /* * Normal bwrites pipeline writes */ - bp->b_runningbufspace = bp->b_bufsize; - space = atomic_fetchadd_long(&runningbufspace, bp->b_runningbufspace); + space = runningbufclaim(bp, bp->b_bufsize); #ifdef RACCT if (racct_enable) { PROC_LOCK(curproc); racct_add_buf(curproc, bp, 1); PROC_UNLOCK(curproc); } #endif /* RACCT */ curthread->td_ru.ru_oublock++; if ((oldflags & B_ASYNC) != 0) BUF_KERNPROC(bp); bp->b_iooffset = dbtob(bp->b_blkno); buf_track(bp, __func__); bstrategy(bp); if ((oldflags & B_ASYNC) == 0) { retval = bufwait(bp); brelse(bp); return (retval); } else if (space > hirunningspace) { /* * Don't allow the async write to saturate the I/O * system. We do not block here if it is the update * or syncer daemon trying to clean up as that can * lead to deadlock. */ if ((curthread->td_pflags & TDP_NORUNNINGBUF) == 0 && !vp_md) waitrunningbufspace(); } return (0); } void bufbdflush(struct bufobj *bo, struct buf *bp) { struct buf *nbp; struct bufdomain *bd; bd = &bdomain[bo->bo_domain]; if (bo->bo_dirty.bv_cnt > bd->bd_dirtybufthresh + 10) { (void) VOP_FSYNC(bp->b_vp, MNT_NOWAIT, curthread); altbufferflushes++; } else if (bo->bo_dirty.bv_cnt > bd->bd_dirtybufthresh) { BO_LOCK(bo); /* * Try to find a buffer to flush. */ TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) { if ((nbp->b_vflags & BV_BKGRDINPROG) || BUF_LOCK(nbp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) continue; if (bp == nbp) panic("bdwrite: found ourselves"); BO_UNLOCK(bo); /* Don't countdeps with the bo lock held. */ if (buf_countdeps(nbp, 0)) { BO_LOCK(bo); BUF_UNLOCK(nbp); continue; } if (nbp->b_flags & B_CLUSTEROK) { vfs_bio_awrite(nbp); } else { bremfree(nbp); bawrite(nbp); } dirtybufferflushes++; break; } if (nbp == NULL) BO_UNLOCK(bo); } } /* * Delayed write. (Buffer is marked dirty). Do not bother writing * anything if the buffer is marked invalid. * * Note that since the buffer must be completely valid, we can safely * set B_CACHE. In fact, we have to set B_CACHE here rather then in * biodone() in order to prevent getblk from writing the buffer * out synchronously. */ void bdwrite(struct buf *bp) { struct thread *td = curthread; struct vnode *vp; struct bufobj *bo; CTR3(KTR_BUF, "bdwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); KASSERT((bp->b_flags & B_BARRIER) == 0, ("Barrier request in delayed write %p", bp)); if (bp->b_flags & B_INVAL) { brelse(bp); return; } /* * If we have too many dirty buffers, don't create any more. * If we are wildly over our limit, then force a complete * cleanup. Otherwise, just keep the situation from getting * out of control. Note that we have to avoid a recursive * disaster and not try to clean up after our own cleanup! */ vp = bp->b_vp; bo = bp->b_bufobj; if ((td->td_pflags & (TDP_COWINPROGRESS|TDP_INBDFLUSH)) == 0) { td->td_pflags |= TDP_INBDFLUSH; BO_BDFLUSH(bo, bp); td->td_pflags &= ~TDP_INBDFLUSH; } else recursiveflushes++; bdirty(bp); /* * Set B_CACHE, indicating that the buffer is fully valid. This is * true even of NFS now. */ bp->b_flags |= B_CACHE; /* * This bmap keeps the system from needing to do the bmap later, * perhaps when the system is attempting to do a sync. Since it * is likely that the indirect block -- or whatever other datastructure * that the filesystem needs is still in memory now, it is a good * thing to do this. Note also, that if the pageout daemon is * requesting a sync -- there might not be enough memory to do * the bmap then... So, this is important to do. */ if (vp->v_type != VCHR && bp->b_lblkno == bp->b_blkno) { VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); } buf_track(bp, __func__); /* * Set the *dirty* buffer range based upon the VM system dirty * pages. * * Mark the buffer pages as clean. We need to do this here to * satisfy the vnode_pager and the pageout daemon, so that it * thinks that the pages have been "cleaned". Note that since * the pages are in a delayed write buffer -- the VFS layer * "will" see that the pages get written out on the next sync, * or perhaps the cluster will be completed. */ vfs_clean_pages_dirty_buf(bp); bqrelse(bp); /* * note: we cannot initiate I/O from a bdwrite even if we wanted to, * due to the softdep code. */ } /* * bdirty: * * Turn buffer into delayed write request. We must clear BIO_READ and * B_RELBUF, and we must set B_DELWRI. We reassign the buffer to * itself to properly update it in the dirty/clean lists. We mark it * B_DONE to ensure that any asynchronization of the buffer properly * clears B_DONE ( else a panic will occur later ). * * bdirty() is kinda like bdwrite() - we have to clear B_INVAL which * might have been set pre-getblk(). Unlike bwrite/bdwrite, bdirty() * should only be called if the buffer is known-good. * * Since the buffer is not on a queue, we do not update the numfreebuffers * count. * * The buffer must be on QUEUE_NONE. */ void bdirty(struct buf *bp) { CTR3(KTR_BUF, "bdirty(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE, ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex)); bp->b_flags &= ~(B_RELBUF); bp->b_iocmd = BIO_WRITE; if ((bp->b_flags & B_DELWRI) == 0) { bp->b_flags |= /* XXX B_DONE | */ B_DELWRI; reassignbuf(bp); bdirtyadd(bp); } } /* * bundirty: * * Clear B_DELWRI for buffer. * * Since the buffer is not on a queue, we do not update the numfreebuffers * count. * * The buffer must be on QUEUE_NONE. */ void bundirty(struct buf *bp) { CTR3(KTR_BUF, "bundirty(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE, ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex)); if (bp->b_flags & B_DELWRI) { bp->b_flags &= ~B_DELWRI; reassignbuf(bp); bdirtysub(bp); } /* * Since it is now being written, we can clear its deferred write flag. */ bp->b_flags &= ~B_DEFERRED; } /* * bawrite: * * Asynchronous write. Start output on a buffer, but do not wait for * it to complete. The buffer is released when the output completes. * * bwrite() ( or the VOP routine anyway ) is responsible for handling * B_INVAL buffers. Not us. */ void bawrite(struct buf *bp) { bp->b_flags |= B_ASYNC; (void) bwrite(bp); } /* * babarrierwrite: * * Asynchronous barrier write. Start output on a buffer, but do not * wait for it to complete. Place a write barrier after this write so * that this buffer and all buffers written before it are committed to * the disk before any buffers written after this write are committed * to the disk. The buffer is released when the output completes. */ void babarrierwrite(struct buf *bp) { bp->b_flags |= B_ASYNC | B_BARRIER; (void) bwrite(bp); } /* * bbarrierwrite: * * Synchronous barrier write. Start output on a buffer and wait for * it to complete. Place a write barrier after this write so that * this buffer and all buffers written before it are committed to * the disk before any buffers written after this write are committed * to the disk. The buffer is released when the output completes. */ int bbarrierwrite(struct buf *bp) { bp->b_flags |= B_BARRIER; return (bwrite(bp)); } /* * bwillwrite: * * Called prior to the locking of any vnodes when we are expecting to * write. We do not want to starve the buffer cache with too many * dirty buffers so we block here. By blocking prior to the locking * of any vnodes we attempt to avoid the situation where a locked vnode * prevents the various system daemons from flushing related buffers. */ void bwillwrite(void) { if (buf_dirty_count_severe()) { mtx_lock(&bdirtylock); while (buf_dirty_count_severe()) { bdirtywait = 1; msleep(&bdirtywait, &bdirtylock, (PRIBIO + 4), "flswai", 0); } mtx_unlock(&bdirtylock); } } /* * Return true if we have too many dirty buffers. */ int buf_dirty_count_severe(void) { return (!BIT_EMPTY(BUF_DOMAINS, &bdhidirty)); } /* * brelse: * * Release a busy buffer and, if requested, free its resources. The * buffer will be stashed in the appropriate bufqueue[] allowing it * to be accessed later as a cache entity or reused for other purposes. */ void brelse(struct buf *bp) { struct mount *v_mnt; int qindex; /* * Many functions erroneously call brelse with a NULL bp under rare * error conditions. Simply return when called with a NULL bp. */ if (bp == NULL) return; CTR3(KTR_BUF, "brelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); KASSERT((bp->b_flags & B_VMIO) != 0 || (bp->b_flags & B_NOREUSE) == 0, ("brelse: non-VMIO buffer marked NOREUSE")); if (BUF_LOCKRECURSED(bp)) { /* * Do not process, in particular, do not handle the * B_INVAL/B_RELBUF and do not release to free list. */ BUF_UNLOCK(bp); return; } if (bp->b_flags & B_MANAGED) { bqrelse(bp); return; } if (LIST_EMPTY(&bp->b_dep)) { bp->b_flags &= ~B_IOSTARTED; } else { KASSERT((bp->b_flags & B_IOSTARTED) == 0, ("brelse: SU io not finished bp %p", bp)); } if ((bp->b_vflags & (BV_BKGRDINPROG | BV_BKGRDERR)) == BV_BKGRDERR) { BO_LOCK(bp->b_bufobj); bp->b_vflags &= ~BV_BKGRDERR; BO_UNLOCK(bp->b_bufobj); bdirty(bp); } if (bp->b_iocmd == BIO_WRITE && (bp->b_ioflags & BIO_ERROR) && (bp->b_flags & B_INVALONERR)) { /* * Forced invalidation of dirty buffer contents, to be used * after a failed write in the rare case that the loss of the * contents is acceptable. The buffer is invalidated and * freed. */ bp->b_flags |= B_INVAL | B_RELBUF | B_NOCACHE; bp->b_flags &= ~(B_ASYNC | B_CACHE); } if (bp->b_iocmd == BIO_WRITE && (bp->b_ioflags & BIO_ERROR) && (bp->b_error != ENXIO || !LIST_EMPTY(&bp->b_dep)) && !(bp->b_flags & B_INVAL)) { /* * Failed write, redirty. All errors except ENXIO (which * means the device is gone) are treated as being * transient. * * XXX Treating EIO as transient is not correct; the * contract with the local storage device drivers is that * they will only return EIO once the I/O is no longer * retriable. Network I/O also respects this through the * guarantees of TCP and/or the internal retries of NFS. * ENOMEM might be transient, but we also have no way of * knowing when its ok to retry/reschedule. In general, * this entire case should be made obsolete through better * error handling/recovery and resource scheduling. * * Do this also for buffers that failed with ENXIO, but have * non-empty dependencies - the soft updates code might need * to access the buffer to untangle them. * * Must clear BIO_ERROR to prevent pages from being scrapped. */ bp->b_ioflags &= ~BIO_ERROR; bdirty(bp); } else if ((bp->b_flags & (B_NOCACHE | B_INVAL)) || (bp->b_ioflags & BIO_ERROR) || (bp->b_bufsize <= 0)) { /* * Either a failed read I/O, or we were asked to free or not * cache the buffer, or we failed to write to a device that's * no longer present. */ bp->b_flags |= B_INVAL; if (!LIST_EMPTY(&bp->b_dep)) buf_deallocate(bp); if (bp->b_flags & B_DELWRI) bdirtysub(bp); bp->b_flags &= ~(B_DELWRI | B_CACHE); if ((bp->b_flags & B_VMIO) == 0) { allocbuf(bp, 0); if (bp->b_vp) brelvp(bp); } } /* * We must clear B_RELBUF if B_DELWRI is set. If vfs_vmio_truncate() * is called with B_DELWRI set, the underlying pages may wind up * getting freed causing a previous write (bdwrite()) to get 'lost' * because pages associated with a B_DELWRI bp are marked clean. * * We still allow the B_INVAL case to call vfs_vmio_truncate(), even * if B_DELWRI is set. */ if (bp->b_flags & B_DELWRI) bp->b_flags &= ~B_RELBUF; /* * VMIO buffer rundown. It is not very necessary to keep a VMIO buffer * constituted, not even NFS buffers now. Two flags effect this. If * B_INVAL, the struct buf is invalidated but the VM object is kept * around ( i.e. so it is trivial to reconstitute the buffer later ). * * If BIO_ERROR or B_NOCACHE is set, pages in the VM object will be * invalidated. BIO_ERROR cannot be set for a failed write unless the * buffer is also B_INVAL because it hits the re-dirtying code above. * * Normally we can do this whether a buffer is B_DELWRI or not. If * the buffer is an NFS buffer, it is tracking piecemeal writes or * the commit state and we cannot afford to lose the buffer. If the * buffer has a background write in progress, we need to keep it * around to prevent it from being reconstituted and starting a second * background write. */ v_mnt = bp->b_vp != NULL ? bp->b_vp->v_mount : NULL; if ((bp->b_flags & B_VMIO) && (bp->b_flags & B_NOCACHE || (bp->b_ioflags & BIO_ERROR && bp->b_iocmd == BIO_READ)) && (v_mnt == NULL || (v_mnt->mnt_vfc->vfc_flags & VFCF_NETWORK) == 0 || vn_isdisk(bp->b_vp) || (bp->b_flags & B_DELWRI) == 0)) { vfs_vmio_invalidate(bp); allocbuf(bp, 0); } if ((bp->b_flags & (B_INVAL | B_RELBUF)) != 0 || (bp->b_flags & (B_DELWRI | B_NOREUSE)) == B_NOREUSE) { allocbuf(bp, 0); bp->b_flags &= ~B_NOREUSE; if (bp->b_vp != NULL) brelvp(bp); } /* * If the buffer has junk contents signal it and eventually * clean up B_DELWRI and diassociate the vnode so that gbincore() * doesn't find it. */ if (bp->b_bufsize == 0 || (bp->b_ioflags & BIO_ERROR) != 0 || (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF)) != 0) bp->b_flags |= B_INVAL; if (bp->b_flags & B_INVAL) { if (bp->b_flags & B_DELWRI) bundirty(bp); if (bp->b_vp) brelvp(bp); } buf_track(bp, __func__); /* buffers with no memory */ if (bp->b_bufsize == 0) { buf_free(bp); return; } /* buffers with junk contents */ if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) || (bp->b_ioflags & BIO_ERROR)) { bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA); if (bp->b_vflags & BV_BKGRDINPROG) panic("losing buffer 2"); qindex = QUEUE_CLEAN; bp->b_flags |= B_AGE; /* remaining buffers */ } else if (bp->b_flags & B_DELWRI) qindex = QUEUE_DIRTY; else qindex = QUEUE_CLEAN; if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY)) panic("brelse: not dirty"); bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_RELBUF | B_DIRECT); bp->b_xflags &= ~(BX_CVTENXIO); /* binsfree unlocks bp. */ binsfree(bp, qindex); } /* * Release a buffer back to the appropriate queue but do not try to free * it. The buffer is expected to be used again soon. * * bqrelse() is used by bdwrite() to requeue a delayed write, and used by * biodone() to requeue an async I/O on completion. It is also used when * known good buffers need to be requeued but we think we may need the data * again soon. * * XXX we should be able to leave the B_RELBUF hint set on completion. */ void bqrelse(struct buf *bp) { int qindex; CTR3(KTR_BUF, "bqrelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); qindex = QUEUE_NONE; if (BUF_LOCKRECURSED(bp)) { /* do not release to free list */ BUF_UNLOCK(bp); return; } bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); bp->b_xflags &= ~(BX_CVTENXIO); if (LIST_EMPTY(&bp->b_dep)) { bp->b_flags &= ~B_IOSTARTED; } else { KASSERT((bp->b_flags & B_IOSTARTED) == 0, ("bqrelse: SU io not finished bp %p", bp)); } if (bp->b_flags & B_MANAGED) { if (bp->b_flags & B_REMFREE) bremfreef(bp); goto out; } /* buffers with stale but valid contents */ if ((bp->b_flags & B_DELWRI) != 0 || (bp->b_vflags & (BV_BKGRDINPROG | BV_BKGRDERR)) == BV_BKGRDERR) { BO_LOCK(bp->b_bufobj); bp->b_vflags &= ~BV_BKGRDERR; BO_UNLOCK(bp->b_bufobj); qindex = QUEUE_DIRTY; } else { if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY)) panic("bqrelse: not dirty"); if ((bp->b_flags & B_NOREUSE) != 0) { brelse(bp); return; } qindex = QUEUE_CLEAN; } buf_track(bp, __func__); /* binsfree unlocks bp. */ binsfree(bp, qindex); return; out: buf_track(bp, __func__); /* unlock */ BUF_UNLOCK(bp); } /* * Complete I/O to a VMIO backed page. Validate the pages as appropriate, * restore bogus pages. */ static void vfs_vmio_iodone(struct buf *bp) { vm_ooffset_t foff; vm_page_t m; vm_object_t obj; struct vnode *vp __unused; int i, iosize, resid; bool bogus; obj = bp->b_bufobj->bo_object; KASSERT(blockcount_read(&obj->paging_in_progress) >= bp->b_npages, ("vfs_vmio_iodone: paging in progress(%d) < b_npages(%d)", blockcount_read(&obj->paging_in_progress), bp->b_npages)); vp = bp->b_vp; VNPASS(vp->v_holdcnt > 0, vp); VNPASS(vp->v_object != NULL, vp); foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("vfs_vmio_iodone: bp %p has no buffer offset", bp)); bogus = false; iosize = bp->b_bcount - bp->b_resid; for (i = 0; i < bp->b_npages; i++) { resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff; if (resid > iosize) resid = iosize; /* * cleanup bogus pages, restoring the originals */ m = bp->b_pages[i]; if (m == bogus_page) { bogus = true; m = vm_page_relookup(obj, OFF_TO_IDX(foff)); if (m == NULL) panic("biodone: page disappeared!"); bp->b_pages[i] = m; } else if ((bp->b_iocmd == BIO_READ) && resid > 0) { /* * In the write case, the valid and clean bits are * already changed correctly ( see bdwrite() ), so we * only need to do this here in the read case. */ KASSERT((m->dirty & vm_page_bits(foff & PAGE_MASK, resid)) == 0, ("vfs_vmio_iodone: page %p " "has unexpected dirty bits", m)); vfs_page_set_valid(bp, foff, m); } KASSERT(OFF_TO_IDX(foff) == m->pindex, ("vfs_vmio_iodone: foff(%jd)/pindex(%ju) mismatch", (intmax_t)foff, (uintmax_t)m->pindex)); vm_page_sunbusy(m); foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; iosize -= resid; } vm_object_pip_wakeupn(obj, bp->b_npages); if (bogus && buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } } /* * Perform page invalidation when a buffer is released. The fully invalid * pages will be reclaimed later in vfs_vmio_truncate(). */ static void vfs_vmio_invalidate(struct buf *bp) { vm_object_t obj; vm_page_t m; int flags, i, resid, poffset, presid; if (buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qremove(trunc_page((vm_offset_t)bp->b_data), bp->b_npages); } else BUF_CHECK_UNMAPPED(bp); /* * Get the base offset and length of the buffer. Note that * in the VMIO case if the buffer block size is not * page-aligned then b_data pointer may not be page-aligned. * But our b_pages[] array *IS* page aligned. * * block sizes less then DEV_BSIZE (usually 512) are not * supported due to the page granularity bits (m->valid, * m->dirty, etc...). * * See man buf(9) for more information */ flags = (bp->b_flags & B_NOREUSE) != 0 ? VPR_NOREUSE : 0; obj = bp->b_bufobj->bo_object; resid = bp->b_bufsize; poffset = bp->b_offset & PAGE_MASK; VM_OBJECT_WLOCK(obj); for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if (m == bogus_page) panic("vfs_vmio_invalidate: Unexpected bogus page."); bp->b_pages[i] = NULL; presid = resid > (PAGE_SIZE - poffset) ? (PAGE_SIZE - poffset) : resid; KASSERT(presid >= 0, ("brelse: extra page")); vm_page_busy_acquire(m, VM_ALLOC_SBUSY); if (pmap_page_wired_mappings(m) == 0) vm_page_set_invalid(m, poffset, presid); vm_page_sunbusy(m); vm_page_release_locked(m, flags); resid -= presid; poffset = 0; } VM_OBJECT_WUNLOCK(obj); bp->b_npages = 0; } /* * Page-granular truncation of an existing VMIO buffer. */ static void vfs_vmio_truncate(struct buf *bp, int desiredpages) { vm_object_t obj; vm_page_t m; int flags, i; if (bp->b_npages == desiredpages) return; if (buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qremove((vm_offset_t)trunc_page((vm_offset_t)bp->b_data) + (desiredpages << PAGE_SHIFT), bp->b_npages - desiredpages); } else BUF_CHECK_UNMAPPED(bp); /* * The object lock is needed only if we will attempt to free pages. */ flags = (bp->b_flags & B_NOREUSE) != 0 ? VPR_NOREUSE : 0; if ((bp->b_flags & B_DIRECT) != 0) { flags |= VPR_TRYFREE; obj = bp->b_bufobj->bo_object; VM_OBJECT_WLOCK(obj); } else { obj = NULL; } for (i = desiredpages; i < bp->b_npages; i++) { m = bp->b_pages[i]; KASSERT(m != bogus_page, ("allocbuf: bogus page found")); bp->b_pages[i] = NULL; if (obj != NULL) vm_page_release_locked(m, flags); else vm_page_release(m, flags); } if (obj != NULL) VM_OBJECT_WUNLOCK(obj); bp->b_npages = desiredpages; } /* * Byte granular extension of VMIO buffers. */ static void vfs_vmio_extend(struct buf *bp, int desiredpages, int size) { /* * We are growing the buffer, possibly in a * byte-granular fashion. */ vm_object_t obj; vm_offset_t toff; vm_offset_t tinc; vm_page_t m; /* * Step 1, bring in the VM pages from the object, allocating * them if necessary. We must clear B_CACHE if these pages * are not valid for the range covered by the buffer. */ obj = bp->b_bufobj->bo_object; if (bp->b_npages < desiredpages) { KASSERT(desiredpages <= atop(maxbcachebuf), ("vfs_vmio_extend past maxbcachebuf %p %d %u", bp, desiredpages, maxbcachebuf)); /* * We must allocate system pages since blocking * here could interfere with paging I/O, no * matter which process we are. * * Only exclusive busy can be tested here. * Blocking on shared busy might lead to * deadlocks once allocbuf() is called after * pages are vfs_busy_pages(). */ (void)vm_page_grab_pages_unlocked(obj, OFF_TO_IDX(bp->b_offset) + bp->b_npages, VM_ALLOC_SYSTEM | VM_ALLOC_IGN_SBUSY | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED, &bp->b_pages[bp->b_npages], desiredpages - bp->b_npages); bp->b_npages = desiredpages; } /* * Step 2. We've loaded the pages into the buffer, * we have to figure out if we can still have B_CACHE * set. Note that B_CACHE is set according to the * byte-granular range ( bcount and size ), not the * aligned range ( newbsize ). * * The VM test is against m->valid, which is DEV_BSIZE * aligned. Needless to say, the validity of the data * needs to also be DEV_BSIZE aligned. Note that this * fails with NFS if the server or some other client * extends the file's EOF. If our buffer is resized, * B_CACHE may remain set! XXX */ toff = bp->b_bcount; tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK); while ((bp->b_flags & B_CACHE) && toff < size) { vm_pindex_t pi; if (tinc > (size - toff)) tinc = size - toff; pi = ((bp->b_offset & PAGE_MASK) + toff) >> PAGE_SHIFT; m = bp->b_pages[pi]; vfs_buf_test_cache(bp, bp->b_offset, toff, tinc, m); toff += tinc; tinc = PAGE_SIZE; } /* * Step 3, fixup the KVA pmap. */ if (buf_mapped(bp)) bpmap_qenter(bp); else BUF_CHECK_UNMAPPED(bp); } /* * Check to see if a block at a particular lbn is available for a clustered * write. */ static int vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno) { struct buf *bpa; int match; match = 0; /* If the buf isn't in core skip it */ if ((bpa = gbincore(&vp->v_bufobj, lblkno)) == NULL) return (0); /* If the buf is busy we don't want to wait for it */ if (BUF_LOCK(bpa, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) return (0); /* Only cluster with valid clusterable delayed write buffers */ if ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) != (B_DELWRI | B_CLUSTEROK)) goto done; if (bpa->b_bufsize != size) goto done; /* * Check to see if it is in the expected place on disk and that the * block has been mapped. */ if ((bpa->b_blkno != bpa->b_lblkno) && (bpa->b_blkno == blkno)) match = 1; done: BUF_UNLOCK(bpa); return (match); } /* * vfs_bio_awrite: * * Implement clustered async writes for clearing out B_DELWRI buffers. * This is much better then the old way of writing only one buffer at * a time. Note that we may not be presented with the buffers in the * correct order, so we search for the cluster in both directions. */ int vfs_bio_awrite(struct buf *bp) { struct bufobj *bo; int i; int j; daddr_t lblkno = bp->b_lblkno; struct vnode *vp = bp->b_vp; int ncl; int nwritten; int size; int maxcl; int gbflags; bo = &vp->v_bufobj; gbflags = (bp->b_data == unmapped_buf) ? GB_UNMAPPED : 0; /* * right now we support clustered writing only to regular files. If * we find a clusterable block we could be in the middle of a cluster * rather then at the beginning. */ if ((vp->v_type == VREG) && (vp->v_mount != 0) && /* Only on nodes that have the size info */ (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) { size = vp->v_mount->mnt_stat.f_iosize; maxcl = maxphys / size; BO_RLOCK(bo); for (i = 1; i < maxcl; i++) if (vfs_bio_clcheck(vp, size, lblkno + i, bp->b_blkno + ((i * size) >> DEV_BSHIFT)) == 0) break; for (j = 1; i + j <= maxcl && j <= lblkno; j++) if (vfs_bio_clcheck(vp, size, lblkno - j, bp->b_blkno - ((j * size) >> DEV_BSHIFT)) == 0) break; BO_RUNLOCK(bo); --j; ncl = i + j; /* * this is a possible cluster write */ if (ncl != 1) { BUF_UNLOCK(bp); nwritten = cluster_wbuild(vp, size, lblkno - j, ncl, gbflags); return (nwritten); } } bremfree(bp); bp->b_flags |= B_ASYNC; /* * default (old) behavior, writing out only one block * * XXX returns b_bufsize instead of b_bcount for nwritten? */ nwritten = bp->b_bufsize; (void) bwrite(bp); return (nwritten); } /* * getnewbuf_kva: * * Allocate KVA for an empty buf header according to gbflags. */ static int getnewbuf_kva(struct buf *bp, int gbflags, int maxsize) { if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_UNMAPPED) { /* * In order to keep fragmentation sane we only allocate kva * in BKVASIZE chunks. XXX with vmem we can do page size. */ maxsize = (maxsize + BKVAMASK) & ~BKVAMASK; if (maxsize != bp->b_kvasize && bufkva_alloc(bp, maxsize, gbflags)) return (ENOSPC); } return (0); } /* * getnewbuf: * * Find and initialize a new buffer header, freeing up existing buffers * in the bufqueues as necessary. The new buffer is returned locked. * * We block if: * We have insufficient buffer headers * We have insufficient buffer space * buffer_arena is too fragmented ( space reservation fails ) * If we have to flush dirty buffers ( but we try to avoid this ) * * The caller is responsible for releasing the reserved bufspace after * allocbuf() is called. */ static struct buf * getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int maxsize, int gbflags) { struct bufdomain *bd; struct buf *bp; bool metadata, reserved; bp = NULL; KASSERT((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC, ("GB_KVAALLOC only makes sense with GB_UNMAPPED")); if (!unmapped_buf_allowed) gbflags &= ~(GB_UNMAPPED | GB_KVAALLOC); if (vp == NULL || (vp->v_vflag & (VV_MD | VV_SYSTEM)) != 0 || vp->v_type == VCHR) metadata = true; else metadata = false; if (vp == NULL) bd = &bdomain[0]; else bd = &bdomain[vp->v_bufobj.bo_domain]; counter_u64_add(getnewbufcalls, 1); reserved = false; do { if (reserved == false && bufspace_reserve(bd, maxsize, metadata) != 0) { counter_u64_add(getnewbufrestarts, 1); continue; } reserved = true; if ((bp = buf_alloc(bd)) == NULL) { counter_u64_add(getnewbufrestarts, 1); continue; } if (getnewbuf_kva(bp, gbflags, maxsize) == 0) return (bp); break; } while (buf_recycle(bd, false) == 0); if (reserved) bufspace_release(bd, maxsize); if (bp != NULL) { bp->b_flags |= B_INVAL; brelse(bp); } bufspace_wait(bd, vp, gbflags, slpflag, slptimeo); return (NULL); } /* * buf_daemon: * * buffer flushing daemon. Buffers are normally flushed by the * update daemon but if it cannot keep up this process starts to * take the load in an attempt to prevent getnewbuf() from blocking. */ static struct kproc_desc buf_kp = { "bufdaemon", buf_daemon, &bufdaemonproc }; SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp); static int buf_flush(struct vnode *vp, struct bufdomain *bd, int target) { int flushed; flushed = flushbufqueues(vp, bd, target, 0); if (flushed == 0) { /* * Could not find any buffers without rollback * dependencies, so just write the first one * in the hopes of eventually making progress. */ if (vp != NULL && target > 2) target /= 2; flushbufqueues(vp, bd, target, 1); } return (flushed); } static void buf_daemon_shutdown(void *arg __unused, int howto __unused) { int error; if (KERNEL_PANICKED()) return; mtx_lock(&bdlock); bd_shutdown = true; wakeup(&bd_request); error = msleep(&bd_shutdown, &bdlock, 0, "buf_daemon_shutdown", 60 * hz); mtx_unlock(&bdlock); if (error != 0) printf("bufdaemon wait error: %d\n", error); } static void buf_daemon(void) { struct bufdomain *bd; int speedupreq; int lodirty; int i; /* * This process needs to be suspended prior to shutdown sync. */ EVENTHANDLER_REGISTER(shutdown_pre_sync, buf_daemon_shutdown, NULL, SHUTDOWN_PRI_LAST + 100); /* * Start the buf clean daemons as children threads. */ for (i = 0 ; i < buf_domains; i++) { int error; error = kthread_add((void (*)(void *))bufspace_daemon, &bdomain[i], curproc, NULL, 0, 0, "bufspacedaemon-%d", i); if (error) panic("error %d spawning bufspace daemon", error); } /* * This process is allowed to take the buffer cache to the limit */ curthread->td_pflags |= TDP_NORUNNINGBUF | TDP_BUFNEED; mtx_lock(&bdlock); while (!bd_shutdown) { bd_request = 0; mtx_unlock(&bdlock); /* * Save speedupreq for this pass and reset to capture new * requests. */ speedupreq = bd_speedupreq; bd_speedupreq = 0; /* * Flush each domain sequentially according to its level and * the speedup request. */ for (i = 0; i < buf_domains; i++) { bd = &bdomain[i]; if (speedupreq) lodirty = bd->bd_numdirtybuffers / 2; else lodirty = bd->bd_lodirtybuffers; while (bd->bd_numdirtybuffers > lodirty) { if (buf_flush(NULL, bd, bd->bd_numdirtybuffers - lodirty) == 0) break; kern_yield(PRI_USER); } } /* * Only clear bd_request if we have reached our low water * mark. The buf_daemon normally waits 1 second and * then incrementally flushes any dirty buffers that have * built up, within reason. * * If we were unable to hit our low water mark and couldn't * find any flushable buffers, we sleep for a short period * to avoid endless loops on unlockable buffers. */ mtx_lock(&bdlock); if (bd_shutdown) break; if (BIT_EMPTY(BUF_DOMAINS, &bdlodirty)) { /* * We reached our low water mark, reset the * request and sleep until we are needed again. * The sleep is just so the suspend code works. */ bd_request = 0; /* * Do an extra wakeup in case dirty threshold * changed via sysctl and the explicit transition * out of shortfall was missed. */ bdirtywakeup(); if (runningbufspace <= lorunningspace) runningwakeup(); msleep(&bd_request, &bdlock, PVM, "psleep", hz); } else { /* * We couldn't find any flushable dirty buffers but * still have too many dirty buffers, we * have to sleep and try again. (rare) */ msleep(&bd_request, &bdlock, PVM, "qsleep", hz / 10); } } wakeup(&bd_shutdown); mtx_unlock(&bdlock); kthread_exit(); } /* * flushbufqueues: * * Try to flush a buffer in the dirty queue. We must be careful to * free up B_INVAL buffers instead of write them, which NFS is * particularly sensitive to. */ static int flushwithdeps = 0; SYSCTL_INT(_vfs, OID_AUTO, flushwithdeps, CTLFLAG_RW | CTLFLAG_STATS, &flushwithdeps, 0, "Number of buffers flushed with dependencies that require rollbacks"); static int flushbufqueues(struct vnode *lvp, struct bufdomain *bd, int target, int flushdeps) { struct bufqueue *bq; struct buf *sentinel; struct vnode *vp; struct mount *mp; struct buf *bp; int hasdeps; int flushed; int error; bool unlock; flushed = 0; bq = &bd->bd_dirtyq; bp = NULL; sentinel = malloc(sizeof(struct buf), M_TEMP, M_WAITOK | M_ZERO); sentinel->b_qindex = QUEUE_SENTINEL; BQ_LOCK(bq); TAILQ_INSERT_HEAD(&bq->bq_queue, sentinel, b_freelist); BQ_UNLOCK(bq); while (flushed != target) { maybe_yield(); BQ_LOCK(bq); bp = TAILQ_NEXT(sentinel, b_freelist); if (bp != NULL) { TAILQ_REMOVE(&bq->bq_queue, sentinel, b_freelist); TAILQ_INSERT_AFTER(&bq->bq_queue, bp, sentinel, b_freelist); } else { BQ_UNLOCK(bq); break; } /* * Skip sentinels inserted by other invocations of the * flushbufqueues(), taking care to not reorder them. * * Only flush the buffers that belong to the * vnode locked by the curthread. */ if (bp->b_qindex == QUEUE_SENTINEL || (lvp != NULL && bp->b_vp != lvp)) { BQ_UNLOCK(bq); continue; } error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL); BQ_UNLOCK(bq); if (error != 0) continue; /* * BKGRDINPROG can only be set with the buf and bufobj * locks both held. We tolerate a race to clear it here. */ if ((bp->b_vflags & BV_BKGRDINPROG) != 0 || (bp->b_flags & B_DELWRI) == 0) { BUF_UNLOCK(bp); continue; } if (bp->b_flags & B_INVAL) { bremfreef(bp); brelse(bp); flushed++; continue; } if (!LIST_EMPTY(&bp->b_dep) && buf_countdeps(bp, 0)) { if (flushdeps == 0) { BUF_UNLOCK(bp); continue; } hasdeps = 1; } else hasdeps = 0; /* * We must hold the lock on a vnode before writing * one of its buffers. Otherwise we may confuse, or * in the case of a snapshot vnode, deadlock the * system. * * The lock order here is the reverse of the normal * of vnode followed by buf lock. This is ok because * the NOWAIT will prevent deadlock. */ vp = bp->b_vp; if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { BUF_UNLOCK(bp); continue; } if (lvp == NULL) { unlock = true; error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT); } else { ASSERT_VOP_LOCKED(vp, "getbuf"); unlock = false; error = VOP_ISLOCKED(vp) == LK_EXCLUSIVE ? 0 : vn_lock(vp, LK_TRYUPGRADE); } if (error == 0) { CTR3(KTR_BUF, "flushbufqueue(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); if (curproc == bufdaemonproc) { vfs_bio_awrite(bp); } else { bremfree(bp); bwrite(bp); counter_u64_add(notbufdflushes, 1); } vn_finished_write(mp); if (unlock) VOP_UNLOCK(vp); flushwithdeps += hasdeps; flushed++; /* * Sleeping on runningbufspace while holding * vnode lock leads to deadlock. */ if (curproc == bufdaemonproc && runningbufspace > hirunningspace) waitrunningbufspace(); continue; } vn_finished_write(mp); BUF_UNLOCK(bp); } BQ_LOCK(bq); TAILQ_REMOVE(&bq->bq_queue, sentinel, b_freelist); BQ_UNLOCK(bq); free(sentinel, M_TEMP); return (flushed); } /* * Check to see if a block is currently memory resident. */ struct buf * incore(struct bufobj *bo, daddr_t blkno) { return (gbincore_unlocked(bo, blkno)); } /* * Returns true if no I/O is needed to access the * associated VM object. This is like incore except * it also hunts around in the VM system for the data. */ bool inmem(struct vnode * vp, daddr_t blkno) { vm_object_t obj; vm_offset_t toff, tinc, size; vm_page_t m, n; vm_ooffset_t off; int valid; ASSERT_VOP_LOCKED(vp, "inmem"); if (incore(&vp->v_bufobj, blkno)) return (true); if (vp->v_mount == NULL) return (false); obj = vp->v_object; if (obj == NULL) return (false); size = PAGE_SIZE; if (size > vp->v_mount->mnt_stat.f_iosize) size = vp->v_mount->mnt_stat.f_iosize; off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize; for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) { m = vm_page_lookup_unlocked(obj, OFF_TO_IDX(off + toff)); recheck: if (m == NULL) return (false); tinc = size; if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK)) tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK); /* * Consider page validity only if page mapping didn't change * during the check. */ valid = vm_page_is_valid(m, (vm_offset_t)((toff + off) & PAGE_MASK), tinc); n = vm_page_lookup_unlocked(obj, OFF_TO_IDX(off + toff)); if (m != n) { m = n; goto recheck; } if (!valid) return (false); } return (true); } /* * Set the dirty range for a buffer based on the status of the dirty * bits in the pages comprising the buffer. The range is limited * to the size of the buffer. * * Tell the VM system that the pages associated with this buffer * are clean. This is used for delayed writes where the data is * going to go to disk eventually without additional VM intevention. * * Note that while we only really need to clean through to b_bcount, we * just go ahead and clean through to b_bufsize. */ static void vfs_clean_pages_dirty_buf(struct buf *bp) { vm_ooffset_t foff, noff, eoff; vm_page_t m; int i; if ((bp->b_flags & B_VMIO) == 0 || bp->b_bufsize == 0) return; foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("vfs_clean_pages_dirty_buf: no buffer offset")); vfs_busy_pages_acquire(bp); vfs_setdirty_range(bp); for (i = 0; i < bp->b_npages; i++) { noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; eoff = noff; if (eoff > bp->b_offset + bp->b_bufsize) eoff = bp->b_offset + bp->b_bufsize; m = bp->b_pages[i]; vfs_page_set_validclean(bp, foff, m); /* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */ foff = noff; } vfs_busy_pages_release(bp); } static void vfs_setdirty_range(struct buf *bp) { vm_offset_t boffset; vm_offset_t eoffset; int i; /* * test the pages to see if they have been modified directly * by users through the VM system. */ for (i = 0; i < bp->b_npages; i++) vm_page_test_dirty(bp->b_pages[i]); /* * Calculate the encompassing dirty range, boffset and eoffset, * (eoffset - boffset) bytes. */ for (i = 0; i < bp->b_npages; i++) { if (bp->b_pages[i]->dirty) break; } boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); for (i = bp->b_npages - 1; i >= 0; --i) { if (bp->b_pages[i]->dirty) { break; } } eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); /* * Fit it to the buffer. */ if (eoffset > bp->b_bcount) eoffset = bp->b_bcount; /* * If we have a good dirty range, merge with the existing * dirty range. */ if (boffset < eoffset) { if (bp->b_dirtyoff > boffset) bp->b_dirtyoff = boffset; if (bp->b_dirtyend < eoffset) bp->b_dirtyend = eoffset; } } /* * Allocate the KVA mapping for an existing buffer. * If an unmapped buffer is provided but a mapped buffer is requested, take * also care to properly setup mappings between pages and KVA. */ static void bp_unmapped_get_kva(struct buf *bp, daddr_t blkno, int size, int gbflags) { int bsize, maxsize, need_mapping, need_kva; off_t offset; need_mapping = bp->b_data == unmapped_buf && (gbflags & GB_UNMAPPED) == 0; need_kva = bp->b_kvabase == unmapped_buf && bp->b_data == unmapped_buf && (gbflags & GB_KVAALLOC) != 0; if (!need_mapping && !need_kva) return; BUF_CHECK_UNMAPPED(bp); if (need_mapping && bp->b_kvabase != unmapped_buf) { /* * Buffer is not mapped, but the KVA was already * reserved at the time of the instantiation. Use the * allocated space. */ goto has_addr; } /* * Calculate the amount of the address space we would reserve * if the buffer was mapped. */ bsize = vn_isdisk(bp->b_vp) ? DEV_BSIZE : bp->b_bufobj->bo_bsize; KASSERT(bsize != 0, ("bsize == 0, check bo->bo_bsize")); offset = blkno * bsize; maxsize = size + (offset & PAGE_MASK); maxsize = imax(maxsize, bsize); while (bufkva_alloc(bp, maxsize, gbflags) != 0) { if ((gbflags & GB_NOWAIT_BD) != 0) { /* * XXXKIB: defragmentation cannot * succeed, not sure what else to do. */ panic("GB_NOWAIT_BD and GB_UNMAPPED %p", bp); } counter_u64_add(mappingrestarts, 1); bufspace_wait(bufdomain(bp), bp->b_vp, gbflags, 0, 0); } has_addr: if (need_mapping) { /* b_offset is handled by bpmap_qenter. */ bp->b_data = bp->b_kvabase; BUF_CHECK_MAPPED(bp); bpmap_qenter(bp); } } struct buf * getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo, int flags) { struct buf *bp; int error; error = getblkx(vp, blkno, blkno, size, slpflag, slptimeo, flags, &bp); if (error != 0) return (NULL); return (bp); } /* * getblkx: * * Get a block given a specified block and offset into a file/device. * The buffers B_DONE bit will be cleared on return, making it almost * ready for an I/O initiation. B_INVAL may or may not be set on * return. The caller should clear B_INVAL prior to initiating a * READ. * * For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for * an existing buffer. * * For a VMIO buffer, B_CACHE is modified according to the backing VM. * If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set * and then cleared based on the backing VM. If the previous buffer is * non-0-sized but invalid, B_CACHE will be cleared. * * If getblk() must create a new buffer, the new buffer is returned with * both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which * case it is returned with B_INVAL clear and B_CACHE set based on the * backing VM. * * getblk() also forces a bwrite() for any B_DELWRI buffer whose * B_CACHE bit is clear. * * What this means, basically, is that the caller should use B_CACHE to * determine whether the buffer is fully valid or not and should clear * B_INVAL prior to issuing a read. If the caller intends to validate * the buffer by loading its data area with something, the caller needs * to clear B_INVAL. If the caller does this without issuing an I/O, * the caller should set B_CACHE ( as an optimization ), else the caller * should issue the I/O and biodone() will set B_CACHE if the I/O was * a write attempt or if it was a successful read. If the caller * intends to issue a READ, the caller must clear B_INVAL and BIO_ERROR * prior to issuing the READ. biodone() will *not* clear B_INVAL. * * The blkno parameter is the logical block being requested. Normally * the mapping of logical block number to disk block address is done * by calling VOP_BMAP(). However, if the mapping is already known, the * disk block address can be passed using the dblkno parameter. If the * disk block address is not known, then the same value should be passed * for blkno and dblkno. */ int getblkx(struct vnode *vp, daddr_t blkno, daddr_t dblkno, int size, int slpflag, int slptimeo, int flags, struct buf **bpp) { struct buf *bp; struct bufobj *bo; daddr_t d_blkno; int bsize, error, maxsize, vmio; off_t offset; CTR3(KTR_BUF, "getblk(%p, %ld, %d)", vp, (long)blkno, size); KASSERT((flags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC, ("GB_KVAALLOC only makes sense with GB_UNMAPPED")); if (vp->v_type != VCHR) ASSERT_VOP_LOCKED(vp, "getblk"); if (size > maxbcachebuf) { printf("getblkx: size(%d) > maxbcachebuf(%d)\n", size, maxbcachebuf); return (EIO); } if (!unmapped_buf_allowed) flags &= ~(GB_UNMAPPED | GB_KVAALLOC); bo = &vp->v_bufobj; d_blkno = dblkno; /* Attempt lockless lookup first. */ bp = gbincore_unlocked(bo, blkno); if (bp == NULL) { /* * With GB_NOCREAT we must be sure about not finding the buffer * as it may have been reassigned during unlocked lookup. */ if ((flags & GB_NOCREAT) != 0) goto loop; goto newbuf_unlocked; } error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL, "getblku", 0, 0); if (error != 0) goto loop; /* Verify buf identify has not changed since lookup. */ if (bp->b_bufobj == bo && bp->b_lblkno == blkno) goto foundbuf_fastpath; /* It changed, fallback to locked lookup. */ BUF_UNLOCK_RAW(bp); loop: BO_RLOCK(bo); bp = gbincore(bo, blkno); if (bp != NULL) { int lockflags; /* * Buffer is in-core. If the buffer is not busy nor managed, * it must be on a queue. */ lockflags = LK_EXCLUSIVE | LK_INTERLOCK | ((flags & GB_LOCK_NOWAIT) != 0 ? LK_NOWAIT : LK_SLEEPFAIL); #ifdef WITNESS lockflags |= (flags & GB_NOWITNESS) != 0 ? LK_NOWITNESS : 0; #endif error = BUF_TIMELOCK(bp, lockflags, BO_LOCKPTR(bo), "getblk", slpflag, slptimeo); /* * If we slept and got the lock we have to restart in case * the buffer changed identities. */ if (error == ENOLCK) goto loop; /* We timed out or were interrupted. */ else if (error != 0) return (error); foundbuf_fastpath: /* If recursed, assume caller knows the rules. */ if (BUF_LOCKRECURSED(bp)) goto end; /* * The buffer is locked. B_CACHE is cleared if the buffer is * invalid. Otherwise, for a non-VMIO buffer, B_CACHE is set * and for a VMIO buffer B_CACHE is adjusted according to the * backing VM cache. */ if (bp->b_flags & B_INVAL) bp->b_flags &= ~B_CACHE; else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0) bp->b_flags |= B_CACHE; if (bp->b_flags & B_MANAGED) MPASS(bp->b_qindex == QUEUE_NONE); else bremfree(bp); /* * check for size inconsistencies for non-VMIO case. */ if (bp->b_bcount != size) { if ((bp->b_flags & B_VMIO) == 0 || (size > bp->b_kvasize)) { if (bp->b_flags & B_DELWRI) { bp->b_flags |= B_NOCACHE; bwrite(bp); } else { if (LIST_EMPTY(&bp->b_dep)) { bp->b_flags |= B_RELBUF; brelse(bp); } else { bp->b_flags |= B_NOCACHE; bwrite(bp); } } goto loop; } } /* * Handle the case of unmapped buffer which should * become mapped, or the buffer for which KVA * reservation is requested. */ bp_unmapped_get_kva(bp, blkno, size, flags); /* * If the size is inconsistent in the VMIO case, we can resize * the buffer. This might lead to B_CACHE getting set or * cleared. If the size has not changed, B_CACHE remains * unchanged from its previous state. */ allocbuf(bp, size); KASSERT(bp->b_offset != NOOFFSET, ("getblk: no buffer offset")); /* * A buffer with B_DELWRI set and B_CACHE clear must * be committed before we can return the buffer in * order to prevent the caller from issuing a read * ( due to B_CACHE not being set ) and overwriting * it. * * Most callers, including NFS and FFS, need this to * operate properly either because they assume they * can issue a read if B_CACHE is not set, or because * ( for example ) an uncached B_DELWRI might loop due * to softupdates re-dirtying the buffer. In the latter * case, B_CACHE is set after the first write completes, * preventing further loops. * NOTE! b*write() sets B_CACHE. If we cleared B_CACHE * above while extending the buffer, we cannot allow the * buffer to remain with B_CACHE set after the write * completes or it will represent a corrupt state. To * deal with this we set B_NOCACHE to scrap the buffer * after the write. * * We might be able to do something fancy, like setting * B_CACHE in bwrite() except if B_DELWRI is already set, * so the below call doesn't set B_CACHE, but that gets real * confusing. This is much easier. */ if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) { bp->b_flags |= B_NOCACHE; bwrite(bp); goto loop; } bp->b_flags &= ~B_DONE; } else { /* * Buffer is not in-core, create new buffer. The buffer * returned by getnewbuf() is locked. Note that the returned * buffer is also considered valid (not marked B_INVAL). */ BO_RUNLOCK(bo); newbuf_unlocked: /* * If the user does not want us to create the buffer, bail out * here. */ if (flags & GB_NOCREAT) return (EEXIST); bsize = vn_isdisk(vp) ? DEV_BSIZE : bo->bo_bsize; KASSERT(bsize != 0, ("bsize == 0, check bo->bo_bsize")); offset = blkno * bsize; vmio = vp->v_object != NULL; if (vmio) { maxsize = size + (offset & PAGE_MASK); if (maxsize > maxbcachebuf) { printf( "getblkx: maxsize(%d) > maxbcachebuf(%d)\n", maxsize, maxbcachebuf); return (EIO); } } else { maxsize = size; /* Do not allow non-VMIO notmapped buffers. */ flags &= ~(GB_UNMAPPED | GB_KVAALLOC); } maxsize = imax(maxsize, bsize); if ((flags & GB_NOSPARSE) != 0 && vmio && !vn_isdisk(vp)) { error = VOP_BMAP(vp, blkno, NULL, &d_blkno, 0, 0); KASSERT(error != EOPNOTSUPP, ("GB_NOSPARSE from fs not supporting bmap, vp %p", vp)); if (error != 0) return (error); if (d_blkno == -1) return (EJUSTRETURN); } bp = getnewbuf(vp, slpflag, slptimeo, maxsize, flags); if (bp == NULL) { if (slpflag || slptimeo) return (ETIMEDOUT); /* * XXX This is here until the sleep path is diagnosed * enough to work under very low memory conditions. * * There's an issue on low memory, 4BSD+non-preempt * systems (eg MIPS routers with 32MB RAM) where buffer * exhaustion occurs without sleeping for buffer * reclaimation. This just sticks in a loop and * constantly attempts to allocate a buffer, which * hits exhaustion and tries to wakeup bufdaemon. * This never happens because we never yield. * * The real solution is to identify and fix these cases * so we aren't effectively busy-waiting in a loop * until the reclaimation path has cycles to run. */ kern_yield(PRI_USER); goto loop; } /* * This code is used to make sure that a buffer is not * created while the getnewbuf routine is blocked. * This can be a problem whether the vnode is locked or not. * If the buffer is created out from under us, we have to * throw away the one we just created. * * Note: this must occur before we associate the buffer * with the vp especially considering limitations in * the splay tree implementation when dealing with duplicate * lblkno's. */ BO_LOCK(bo); if (gbincore(bo, blkno)) { BO_UNLOCK(bo); bp->b_flags |= B_INVAL; bufspace_release(bufdomain(bp), maxsize); brelse(bp); goto loop; } /* * Insert the buffer into the hash, so that it can * be found by incore. */ bp->b_lblkno = blkno; bp->b_blkno = d_blkno; bp->b_offset = offset; bgetvp(vp, bp); BO_UNLOCK(bo); /* * set B_VMIO bit. allocbuf() the buffer bigger. Since the * buffer size starts out as 0, B_CACHE will be set by * allocbuf() for the VMIO case prior to it testing the * backing store for validity. */ if (vmio) { bp->b_flags |= B_VMIO; KASSERT(vp->v_object == bp->b_bufobj->bo_object, ("ARGH! different b_bufobj->bo_object %p %p %p\n", bp, vp->v_object, bp->b_bufobj->bo_object)); } else { bp->b_flags &= ~B_VMIO; KASSERT(bp->b_bufobj->bo_object == NULL, ("ARGH! has b_bufobj->bo_object %p %p\n", bp, bp->b_bufobj->bo_object)); BUF_CHECK_MAPPED(bp); } allocbuf(bp, size); bufspace_release(bufdomain(bp), maxsize); bp->b_flags &= ~B_DONE; } CTR4(KTR_BUF, "getblk(%p, %ld, %d) = %p", vp, (long)blkno, size, bp); end: buf_track(bp, __func__); KASSERT(bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); *bpp = bp; return (0); } /* * Get an empty, disassociated buffer of given size. The buffer is initially * set to B_INVAL. */ struct buf * geteblk(int size, int flags) { struct buf *bp; int maxsize; maxsize = (size + BKVAMASK) & ~BKVAMASK; while ((bp = getnewbuf(NULL, 0, 0, maxsize, flags)) == NULL) { if ((flags & GB_NOWAIT_BD) && (curthread->td_pflags & TDP_BUFNEED) != 0) return (NULL); } allocbuf(bp, size); bufspace_release(bufdomain(bp), maxsize); bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */ return (bp); } /* * Truncate the backing store for a non-vmio buffer. */ static void vfs_nonvmio_truncate(struct buf *bp, int newbsize) { if (bp->b_flags & B_MALLOC) { /* * malloced buffers are not shrunk */ if (newbsize == 0) { bufmallocadjust(bp, 0); free(bp->b_data, M_BIOBUF); bp->b_data = bp->b_kvabase; bp->b_flags &= ~B_MALLOC; } return; } vm_hold_free_pages(bp, newbsize); bufspace_adjust(bp, newbsize); } /* * Extend the backing for a non-VMIO buffer. */ static void vfs_nonvmio_extend(struct buf *bp, int newbsize) { caddr_t origbuf; int origbufsize; /* * We only use malloced memory on the first allocation. * and revert to page-allocated memory when the buffer * grows. * * There is a potential smp race here that could lead * to bufmallocspace slightly passing the max. It * is probably extremely rare and not worth worrying * over. */ if (bp->b_bufsize == 0 && newbsize <= PAGE_SIZE/2 && bufmallocspace < maxbufmallocspace) { bp->b_data = malloc(newbsize, M_BIOBUF, M_WAITOK); bp->b_flags |= B_MALLOC; bufmallocadjust(bp, newbsize); return; } /* * If the buffer is growing on its other-than-first * allocation then we revert to the page-allocation * scheme. */ origbuf = NULL; origbufsize = 0; if (bp->b_flags & B_MALLOC) { origbuf = bp->b_data; origbufsize = bp->b_bufsize; bp->b_data = bp->b_kvabase; bufmallocadjust(bp, 0); bp->b_flags &= ~B_MALLOC; newbsize = round_page(newbsize); } vm_hold_load_pages(bp, (vm_offset_t) bp->b_data + bp->b_bufsize, (vm_offset_t) bp->b_data + newbsize); if (origbuf != NULL) { bcopy(origbuf, bp->b_data, origbufsize); free(origbuf, M_BIOBUF); } bufspace_adjust(bp, newbsize); } /* * This code constitutes the buffer memory from either anonymous system * memory (in the case of non-VMIO operations) or from an associated * VM object (in the case of VMIO operations). This code is able to * resize a buffer up or down. * * Note that this code is tricky, and has many complications to resolve * deadlock or inconsistent data situations. Tread lightly!!! * There are B_CACHE and B_DELWRI interactions that must be dealt with by * the caller. Calling this code willy nilly can result in the loss of data. * * allocbuf() only adjusts B_CACHE for VMIO buffers. getblk() deals with * B_CACHE for the non-VMIO case. */ int allocbuf(struct buf *bp, int size) { int newbsize; if (bp->b_bcount == size) return (1); KASSERT(bp->b_kvasize == 0 || bp->b_kvasize >= size, ("allocbuf: buffer too small %p %#x %#x", bp, bp->b_kvasize, size)); newbsize = roundup2(size, DEV_BSIZE); if ((bp->b_flags & B_VMIO) == 0) { if ((bp->b_flags & B_MALLOC) == 0) newbsize = round_page(newbsize); /* * Just get anonymous memory from the kernel. Don't * mess with B_CACHE. */ if (newbsize < bp->b_bufsize) vfs_nonvmio_truncate(bp, newbsize); else if (newbsize > bp->b_bufsize) vfs_nonvmio_extend(bp, newbsize); } else { int desiredpages; desiredpages = size == 0 ? 0 : num_pages((bp->b_offset & PAGE_MASK) + newbsize); KASSERT((bp->b_flags & B_MALLOC) == 0, ("allocbuf: VMIO buffer can't be malloced %p", bp)); /* * Set B_CACHE initially if buffer is 0 length or will become * 0-length. */ if (size == 0 || bp->b_bufsize == 0) bp->b_flags |= B_CACHE; if (newbsize < bp->b_bufsize) vfs_vmio_truncate(bp, desiredpages); /* XXX This looks as if it should be newbsize > b_bufsize */ else if (size > bp->b_bcount) vfs_vmio_extend(bp, desiredpages, size); bufspace_adjust(bp, newbsize); } bp->b_bcount = size; /* requested buffer size. */ return (1); } extern int inflight_transient_maps; static struct bio_queue nondump_bios; void biodone(struct bio *bp) { struct mtx *mtxp; void (*done)(struct bio *); vm_offset_t start, end; biotrack(bp, __func__); /* * Avoid completing I/O when dumping after a panic since that may * result in a deadlock in the filesystem or pager code. Note that * this doesn't affect dumps that were started manually since we aim * to keep the system usable after it has been resumed. */ if (__predict_false(dumping && SCHEDULER_STOPPED())) { TAILQ_INSERT_HEAD(&nondump_bios, bp, bio_queue); return; } if ((bp->bio_flags & BIO_TRANSIENT_MAPPING) != 0) { bp->bio_flags &= ~BIO_TRANSIENT_MAPPING; bp->bio_flags |= BIO_UNMAPPED; start = trunc_page((vm_offset_t)bp->bio_data); end = round_page((vm_offset_t)bp->bio_data + bp->bio_length); bp->bio_data = unmapped_buf; pmap_qremove(start, atop(end - start)); vmem_free(transient_arena, start, end - start); atomic_add_int(&inflight_transient_maps, -1); } done = bp->bio_done; /* * The check for done == biodone is to allow biodone to be * used as a bio_done routine. */ if (done == NULL || done == biodone) { mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); bp->bio_flags |= BIO_DONE; wakeup(bp); mtx_unlock(mtxp); } else done(bp); } /* * Wait for a BIO to finish. */ int biowait(struct bio *bp, const char *wmesg) { struct mtx *mtxp; mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); while ((bp->bio_flags & BIO_DONE) == 0) msleep(bp, mtxp, PRIBIO, wmesg, 0); mtx_unlock(mtxp); if (bp->bio_error != 0) return (bp->bio_error); if (!(bp->bio_flags & BIO_ERROR)) return (0); return (EIO); } void biofinish(struct bio *bp, struct devstat *stat, int error) { if (error) { bp->bio_error = error; bp->bio_flags |= BIO_ERROR; } if (stat != NULL) devstat_end_transaction_bio(stat, bp); biodone(bp); } #if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING) void biotrack_buf(struct bio *bp, const char *location) { buf_track(bp->bio_track_bp, location); } #endif /* * bufwait: * * Wait for buffer I/O completion, returning error status. The buffer * is left locked and B_DONE on return. B_EINTR is converted into an EINTR * error and cleared. */ int bufwait(struct buf *bp) { if (bp->b_iocmd == BIO_READ) bwait(bp, PRIBIO, "biord"); else bwait(bp, PRIBIO, "biowr"); if (bp->b_flags & B_EINTR) { bp->b_flags &= ~B_EINTR; return (EINTR); } if (bp->b_ioflags & BIO_ERROR) { return (bp->b_error ? bp->b_error : EIO); } else { return (0); } } /* * bufdone: * * Finish I/O on a buffer, optionally calling a completion function. * This is usually called from an interrupt so process blocking is * not allowed. * * biodone is also responsible for setting B_CACHE in a B_VMIO bp. * In a non-VMIO bp, B_CACHE will be set on the next getblk() * assuming B_INVAL is clear. * * For the VMIO case, we set B_CACHE if the op was a read and no * read error occurred, or if the op was a write. B_CACHE is never * set if the buffer is invalid or otherwise uncacheable. * * bufdone does not mess with B_INVAL, allowing the I/O routine or the * initiator to leave B_INVAL set to brelse the buffer out of existence * in the biodone routine. */ void bufdone(struct buf *bp) { struct bufobj *dropobj; void (*biodone)(struct buf *); buf_track(bp, __func__); CTR3(KTR_BUF, "bufdone(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); dropobj = NULL; KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp)); runningbufwakeup(bp); if (bp->b_iocmd == BIO_WRITE) dropobj = bp->b_bufobj; /* call optional completion function if requested */ if (bp->b_iodone != NULL) { biodone = bp->b_iodone; bp->b_iodone = NULL; (*biodone) (bp); if (dropobj) bufobj_wdrop(dropobj); return; } if (bp->b_flags & B_VMIO) { /* * Set B_CACHE if the op was a normal read and no error * occurred. B_CACHE is set for writes in the b*write() * routines. */ if (bp->b_iocmd == BIO_READ && !(bp->b_flags & (B_INVAL|B_NOCACHE)) && !(bp->b_ioflags & BIO_ERROR)) bp->b_flags |= B_CACHE; vfs_vmio_iodone(bp); } if (!LIST_EMPTY(&bp->b_dep)) buf_complete(bp); if ((bp->b_flags & B_CKHASH) != 0) { KASSERT(bp->b_iocmd == BIO_READ, ("bufdone: b_iocmd %d not BIO_READ", bp->b_iocmd)); KASSERT(buf_mapped(bp), ("bufdone: bp %p not mapped", bp)); (*bp->b_ckhashcalc)(bp); } /* * For asynchronous completions, release the buffer now. The brelse * will do a wakeup there if necessary - so no need to do a wakeup * here in the async case. The sync case always needs to do a wakeup. */ if (bp->b_flags & B_ASYNC) { if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_RELBUF)) || (bp->b_ioflags & BIO_ERROR)) brelse(bp); else bqrelse(bp); } else bdone(bp); if (dropobj) bufobj_wdrop(dropobj); } /* * This routine is called in lieu of iodone in the case of * incomplete I/O. This keeps the busy status for pages * consistent. */ void vfs_unbusy_pages(struct buf *bp) { int i; vm_object_t obj; vm_page_t m; runningbufwakeup(bp); if (!(bp->b_flags & B_VMIO)) return; obj = bp->b_bufobj->bo_object; for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if (m == bogus_page) { m = vm_page_relookup(obj, OFF_TO_IDX(bp->b_offset) + i); if (!m) panic("vfs_unbusy_pages: page missing\n"); bp->b_pages[i] = m; if (buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } else BUF_CHECK_UNMAPPED(bp); } vm_page_sunbusy(m); } vm_object_pip_wakeupn(obj, bp->b_npages); } /* * vfs_page_set_valid: * * Set the valid bits in a page based on the supplied offset. The * range is restricted to the buffer's size. * * This routine is typically called after a read completes. */ static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m) { vm_ooffset_t eoff; /* * Compute the end offset, eoff, such that [off, eoff) does not span a * page boundary and eoff is not greater than the end of the buffer. * The end of the buffer, in this case, is our file EOF, not the * allocation size of the buffer. */ eoff = (off + PAGE_SIZE) & ~(vm_ooffset_t)PAGE_MASK; if (eoff > bp->b_offset + bp->b_bcount) eoff = bp->b_offset + bp->b_bcount; /* * Set valid range. This is typically the entire buffer and thus the * entire page. */ if (eoff > off) vm_page_set_valid_range(m, off & PAGE_MASK, eoff - off); } /* * vfs_page_set_validclean: * * Set the valid bits and clear the dirty bits in a page based on the * supplied offset. The range is restricted to the buffer's size. */ static void vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off, vm_page_t m) { vm_ooffset_t soff, eoff; /* * Start and end offsets in buffer. eoff - soff may not cross a * page boundary or cross the end of the buffer. The end of the * buffer, in this case, is our file EOF, not the allocation size * of the buffer. */ soff = off; eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK; if (eoff > bp->b_offset + bp->b_bcount) eoff = bp->b_offset + bp->b_bcount; /* * Set valid range. This is typically the entire buffer and thus the * entire page. */ if (eoff > soff) { vm_page_set_validclean( m, (vm_offset_t) (soff & PAGE_MASK), (vm_offset_t) (eoff - soff) ); } } /* * Acquire a shared busy on all pages in the buf. */ void vfs_busy_pages_acquire(struct buf *bp) { int i; for (i = 0; i < bp->b_npages; i++) vm_page_busy_acquire(bp->b_pages[i], VM_ALLOC_SBUSY); } void vfs_busy_pages_release(struct buf *bp) { int i; for (i = 0; i < bp->b_npages; i++) vm_page_sunbusy(bp->b_pages[i]); } /* * This routine is called before a device strategy routine. * It is used to tell the VM system that paging I/O is in * progress, and treat the pages associated with the buffer * almost as being exclusive busy. Also the object paging_in_progress * flag is handled to make sure that the object doesn't become * inconsistent. * * Since I/O has not been initiated yet, certain buffer flags * such as BIO_ERROR or B_INVAL may be in an inconsistent state * and should be ignored. */ void vfs_busy_pages(struct buf *bp, int clear_modify) { vm_object_t obj; vm_ooffset_t foff; vm_page_t m; int i; bool bogus; if (!(bp->b_flags & B_VMIO)) return; obj = bp->b_bufobj->bo_object; foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("vfs_busy_pages: no buffer offset")); if ((bp->b_flags & B_CLUSTER) == 0) { vm_object_pip_add(obj, bp->b_npages); vfs_busy_pages_acquire(bp); } if (bp->b_bufsize != 0) vfs_setdirty_range(bp); bogus = false; for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; vm_page_assert_sbusied(m); /* * When readying a buffer for a read ( i.e * clear_modify == 0 ), it is important to do * bogus_page replacement for valid pages in * partially instantiated buffers. Partially * instantiated buffers can, in turn, occur when * reconstituting a buffer from its VM backing store * base. We only have to do this if B_CACHE is * clear ( which causes the I/O to occur in the * first place ). The replacement prevents the read * I/O from overwriting potentially dirty VM-backed * pages. XXX bogus page replacement is, uh, bogus. * It may not work properly with small-block devices. * We need to find a better way. */ if (clear_modify) { pmap_remove_write(m); vfs_page_set_validclean(bp, foff, m); } else if (vm_page_all_valid(m) && (bp->b_flags & B_CACHE) == 0) { bp->b_pages[i] = bogus_page; bogus = true; } foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; } if (bogus && buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } } /* * vfs_bio_set_valid: * * Set the range within the buffer to valid. The range is * relative to the beginning of the buffer, b_offset. Note that * b_offset itself may be offset from the beginning of the first * page. */ void vfs_bio_set_valid(struct buf *bp, int base, int size) { int i, n; vm_page_t m; if (!(bp->b_flags & B_VMIO)) return; /* * Fixup base to be relative to beginning of first page. * Set initial n to be the maximum number of bytes in the * first page that can be validated. */ base += (bp->b_offset & PAGE_MASK); n = PAGE_SIZE - (base & PAGE_MASK); /* * Busy may not be strictly necessary here because the pages are * unlikely to be fully valid and the vnode lock will synchronize * their access via getpages. It is grabbed for consistency with * other page validation. */ vfs_busy_pages_acquire(bp); for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) { m = bp->b_pages[i]; if (n > size) n = size; vm_page_set_valid_range(m, base & PAGE_MASK, n); base += n; size -= n; n = PAGE_SIZE; } vfs_busy_pages_release(bp); } /* * vfs_bio_clrbuf: * * If the specified buffer is a non-VMIO buffer, clear the entire * buffer. If the specified buffer is a VMIO buffer, clear and * validate only the previously invalid portions of the buffer. * This routine essentially fakes an I/O, so we need to clear * BIO_ERROR and B_INVAL. * * Note that while we only theoretically need to clear through b_bcount, * we go ahead and clear through b_bufsize. */ void vfs_bio_clrbuf(struct buf *bp) { int i, j, sa, ea, slide, zbits; vm_page_bits_t mask; if ((bp->b_flags & (B_VMIO | B_MALLOC)) != B_VMIO) { clrbuf(bp); return; } bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; vfs_busy_pages_acquire(bp); sa = bp->b_offset & PAGE_MASK; slide = 0; for (i = 0; i < bp->b_npages; i++, sa = 0) { slide = imin(slide + PAGE_SIZE, bp->b_offset + bp->b_bufsize); ea = slide & PAGE_MASK; if (ea == 0) ea = PAGE_SIZE; if (bp->b_pages[i] == bogus_page) continue; j = sa / DEV_BSIZE; zbits = (sizeof(vm_page_bits_t) * NBBY) - (ea - sa) / DEV_BSIZE; mask = (VM_PAGE_BITS_ALL >> zbits) << j; if ((bp->b_pages[i]->valid & mask) == mask) continue; if ((bp->b_pages[i]->valid & mask) == 0) pmap_zero_page_area(bp->b_pages[i], sa, ea - sa); else { for (; sa < ea; sa += DEV_BSIZE, j++) { if ((bp->b_pages[i]->valid & (1 << j)) == 0) { pmap_zero_page_area(bp->b_pages[i], sa, DEV_BSIZE); } } } vm_page_set_valid_range(bp->b_pages[i], j * DEV_BSIZE, roundup2(ea - sa, DEV_BSIZE)); } vfs_busy_pages_release(bp); bp->b_resid = 0; } void vfs_bio_bzero_buf(struct buf *bp, int base, int size) { vm_page_t m; int i, n; if (buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); bzero(bp->b_data + base, size); } else { BUF_CHECK_UNMAPPED(bp); n = PAGE_SIZE - (base & PAGE_MASK); for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) { m = bp->b_pages[i]; if (n > size) n = size; pmap_zero_page_area(m, base & PAGE_MASK, n); base += n; size -= n; n = PAGE_SIZE; } } } /* * Update buffer flags based on I/O request parameters, optionally releasing the * buffer. If it's VMIO or direct I/O, the buffer pages are released to the VM, * where they may be placed on a page queue (VMIO) or freed immediately (direct * I/O). Otherwise the buffer is released to the cache. */ static void b_io_dismiss(struct buf *bp, int ioflag, bool release) { KASSERT((ioflag & IO_NOREUSE) == 0 || (ioflag & IO_VMIO) != 0, ("buf %p non-VMIO noreuse", bp)); if ((ioflag & IO_DIRECT) != 0) bp->b_flags |= B_DIRECT; if ((ioflag & IO_EXT) != 0) bp->b_xflags |= BX_ALTDATA; if ((ioflag & (IO_VMIO | IO_DIRECT)) != 0 && LIST_EMPTY(&bp->b_dep)) { bp->b_flags |= B_RELBUF; if ((ioflag & IO_NOREUSE) != 0) bp->b_flags |= B_NOREUSE; if (release) brelse(bp); } else if (release) bqrelse(bp); } void vfs_bio_brelse(struct buf *bp, int ioflag) { b_io_dismiss(bp, ioflag, true); } void vfs_bio_set_flags(struct buf *bp, int ioflag) { b_io_dismiss(bp, ioflag, false); } /* * vm_hold_load_pages and vm_hold_free_pages get pages into * a buffers address space. The pages are anonymous and are * not associated with a file object. */ static void vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to) { vm_offset_t pg; vm_page_t p; int index; BUF_CHECK_MAPPED(bp); to = round_page(to); from = round_page(from); index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; MPASS((bp->b_flags & B_MAXPHYS) == 0); KASSERT(to - from <= maxbcachebuf, ("vm_hold_load_pages too large %p %#jx %#jx %u", bp, (uintmax_t)from, (uintmax_t)to, maxbcachebuf)); for (pg = from; pg < to; pg += PAGE_SIZE, index++) { /* * note: must allocate system pages since blocking here * could interfere with paging I/O, no matter which * process we are. */ p = vm_page_alloc_noobj(VM_ALLOC_SYSTEM | VM_ALLOC_WIRED | VM_ALLOC_COUNT((to - pg) >> PAGE_SHIFT) | VM_ALLOC_WAITOK); pmap_qenter(pg, &p, 1); bp->b_pages[index] = p; } bp->b_npages = index; } /* Return pages associated with this buf to the vm system */ static void vm_hold_free_pages(struct buf *bp, int newbsize) { vm_offset_t from; vm_page_t p; int index, newnpages; BUF_CHECK_MAPPED(bp); from = round_page((vm_offset_t)bp->b_data + newbsize); newnpages = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; if (bp->b_npages > newnpages) pmap_qremove(from, bp->b_npages - newnpages); for (index = newnpages; index < bp->b_npages; index++) { p = bp->b_pages[index]; bp->b_pages[index] = NULL; vm_page_unwire_noq(p); vm_page_free(p); } bp->b_npages = newnpages; } /* * Map an IO request into kernel virtual address space. * * All requests are (re)mapped into kernel VA space. * Notice that we use b_bufsize for the size of the buffer * to be mapped. b_bcount might be modified by the driver. * * Note that even if the caller determines that the address space should * be valid, a race or a smaller-file mapped into a larger space may * actually cause vmapbuf() to fail, so all callers of vmapbuf() MUST * check the return value. * * This function only works with pager buffers. */ int vmapbuf(struct buf *bp, void *uaddr, size_t len, int mapbuf) { vm_prot_t prot; int pidx; MPASS((bp->b_flags & B_MAXPHYS) != 0); prot = VM_PROT_READ; if (bp->b_iocmd == BIO_READ) prot |= VM_PROT_WRITE; /* Less backwards than it looks */ pidx = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map, (vm_offset_t)uaddr, len, prot, bp->b_pages, PBUF_PAGES); if (pidx < 0) return (-1); bp->b_bufsize = len; bp->b_npages = pidx; bp->b_offset = ((vm_offset_t)uaddr) & PAGE_MASK; if (mapbuf || !unmapped_buf_allowed) { pmap_qenter((vm_offset_t)bp->b_kvabase, bp->b_pages, pidx); bp->b_data = bp->b_kvabase + bp->b_offset; } else bp->b_data = unmapped_buf; return (0); } /* * Free the io map PTEs associated with this IO operation. * We also invalidate the TLB entries and restore the original b_addr. * * This function only works with pager buffers. */ void vunmapbuf(struct buf *bp) { int npages; npages = bp->b_npages; if (buf_mapped(bp)) pmap_qremove(trunc_page((vm_offset_t)bp->b_data), npages); vm_page_unhold_pages(bp->b_pages, npages); bp->b_data = unmapped_buf; } void bdone(struct buf *bp) { struct mtx *mtxp; mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); bp->b_flags |= B_DONE; wakeup(bp); mtx_unlock(mtxp); } void bwait(struct buf *bp, u_char pri, const char *wchan) { struct mtx *mtxp; mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); while ((bp->b_flags & B_DONE) == 0) msleep(bp, mtxp, pri, wchan, 0); mtx_unlock(mtxp); } int bufsync(struct bufobj *bo, int waitfor) { return (VOP_FSYNC(bo2vnode(bo), waitfor, curthread)); } void bufstrategy(struct bufobj *bo, struct buf *bp) { int i __unused; struct vnode *vp; vp = bp->b_vp; KASSERT(vp == bo->bo_private, ("Inconsistent vnode bufstrategy")); KASSERT(vp->v_type != VCHR && vp->v_type != VBLK, ("Wrong vnode in bufstrategy(bp=%p, vp=%p)", bp, vp)); i = VOP_STRATEGY(vp, bp); KASSERT(i == 0, ("VOP_STRATEGY failed bp=%p vp=%p", bp, bp->b_vp)); } /* * Initialize a struct bufobj before use. Memory is assumed zero filled. */ void bufobj_init(struct bufobj *bo, void *private) { static volatile int bufobj_cleanq; bo->bo_domain = atomic_fetchadd_int(&bufobj_cleanq, 1) % buf_domains; rw_init(BO_LOCKPTR(bo), "bufobj interlock"); bo->bo_private = private; TAILQ_INIT(&bo->bo_clean.bv_hd); pctrie_init(&bo->bo_clean.bv_root); TAILQ_INIT(&bo->bo_dirty.bv_hd); pctrie_init(&bo->bo_dirty.bv_root); } void bufobj_wrefl(struct bufobj *bo) { KASSERT(bo != NULL, ("NULL bo in bufobj_wref")); ASSERT_BO_WLOCKED(bo); bo->bo_numoutput++; } void bufobj_wref(struct bufobj *bo) { KASSERT(bo != NULL, ("NULL bo in bufobj_wref")); BO_LOCK(bo); bo->bo_numoutput++; BO_UNLOCK(bo); } void bufobj_wdrop(struct bufobj *bo) { KASSERT(bo != NULL, ("NULL bo in bufobj_wdrop")); BO_LOCK(bo); KASSERT(bo->bo_numoutput > 0, ("bufobj_wdrop non-positive count")); if ((--bo->bo_numoutput == 0) && (bo->bo_flag & BO_WWAIT)) { bo->bo_flag &= ~BO_WWAIT; wakeup(&bo->bo_numoutput); } BO_UNLOCK(bo); } int bufobj_wwait(struct bufobj *bo, int slpflag, int timeo) { int error; KASSERT(bo != NULL, ("NULL bo in bufobj_wwait")); ASSERT_BO_WLOCKED(bo); error = 0; while (bo->bo_numoutput) { bo->bo_flag |= BO_WWAIT; error = msleep(&bo->bo_numoutput, BO_LOCKPTR(bo), slpflag | (PRIBIO + 1), "bo_wwait", timeo); if (error) break; } return (error); } /* * Set bio_data or bio_ma for struct bio from the struct buf. */ void bdata2bio(struct buf *bp, struct bio *bip) { if (!buf_mapped(bp)) { KASSERT(unmapped_buf_allowed, ("unmapped")); bip->bio_ma = bp->b_pages; bip->bio_ma_n = bp->b_npages; bip->bio_data = unmapped_buf; bip->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK; bip->bio_flags |= BIO_UNMAPPED; KASSERT(round_page(bip->bio_ma_offset + bip->bio_length) / PAGE_SIZE == bp->b_npages, ("Buffer %p too short: %d %lld %d", bp, bip->bio_ma_offset, (long long)bip->bio_length, bip->bio_ma_n)); } else { bip->bio_data = bp->b_data; bip->bio_ma = NULL; } } struct memdesc memdesc_bio(struct bio *bio) { if ((bio->bio_flags & BIO_VLIST) != 0) return (memdesc_vlist((struct bus_dma_segment *)bio->bio_data, bio->bio_ma_n)); if ((bio->bio_flags & BIO_UNMAPPED) != 0) return (memdesc_vmpages(bio->bio_ma, bio->bio_bcount, bio->bio_ma_offset)); return (memdesc_vaddr(bio->bio_data, bio->bio_bcount)); } static int buf_pager_relbuf; SYSCTL_INT(_vfs, OID_AUTO, buf_pager_relbuf, CTLFLAG_RWTUN, &buf_pager_relbuf, 0, "Make buffer pager release buffers after reading"); /* * The buffer pager. It uses buffer reads to validate pages. * * In contrast to the generic local pager from vm/vnode_pager.c, this * pager correctly and easily handles volumes where the underlying * device block size is greater than the machine page size. The * buffer cache transparently extends the requested page run to be * aligned at the block boundary, and does the necessary bogus page * replacements in the addends to avoid obliterating already valid * pages. * * The only non-trivial issue is that the exclusive busy state for * pages, which is assumed by the vm_pager_getpages() interface, is * incompatible with the VMIO buffer cache's desire to share-busy the * pages. This function performs a trivial downgrade of the pages' * state before reading buffers, and a less trivial upgrade from the * shared-busy to excl-busy state after the read. */ int vfs_bio_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind, int *rahead, vbg_get_lblkno_t get_lblkno, vbg_get_blksize_t get_blksize) { vm_page_t m; vm_object_t object; struct buf *bp; struct mount *mp; daddr_t lbn, lbnp; vm_ooffset_t la, lb, poff, poffe; long bo_bs, bsize; int br_flags, error, i, pgsin, pgsin_a, pgsin_b; bool redo, lpart; object = vp->v_object; mp = vp->v_mount; error = 0; la = IDX_TO_OFF(ma[count - 1]->pindex); if (la >= object->un_pager.vnp.vnp_size) return (VM_PAGER_BAD); /* * Change the meaning of la from where the last requested page starts * to where it ends, because that's the end of the requested region * and the start of the potential read-ahead region. */ la += PAGE_SIZE; lpart = la > object->un_pager.vnp.vnp_size; error = get_blksize(vp, get_lblkno(vp, IDX_TO_OFF(ma[0]->pindex)), &bo_bs); if (error != 0) return (VM_PAGER_ERROR); /* * Calculate read-ahead, behind and total pages. */ pgsin = count; lb = IDX_TO_OFF(ma[0]->pindex); pgsin_b = OFF_TO_IDX(lb - rounddown2(lb, bo_bs)); pgsin += pgsin_b; if (rbehind != NULL) *rbehind = pgsin_b; pgsin_a = OFF_TO_IDX(roundup2(la, bo_bs) - la); if (la + IDX_TO_OFF(pgsin_a) >= object->un_pager.vnp.vnp_size) pgsin_a = OFF_TO_IDX(roundup2(object->un_pager.vnp.vnp_size, PAGE_SIZE) - la); pgsin += pgsin_a; if (rahead != NULL) *rahead = pgsin_a; VM_CNT_INC(v_vnodein); VM_CNT_ADD(v_vnodepgsin, pgsin); br_flags = (mp != NULL && (mp->mnt_kern_flag & MNTK_UNMAPPED_BUFS) != 0) ? GB_UNMAPPED : 0; again: for (i = 0; i < count; i++) { if (ma[i] != bogus_page) vm_page_busy_downgrade(ma[i]); } lbnp = -1; for (i = 0; i < count; i++) { m = ma[i]; if (m == bogus_page) continue; /* * Pages are shared busy and the object lock is not * owned, which together allow for the pages' * invalidation. The racy test for validity avoids * useless creation of the buffer for the most typical * case when invalidation is not used in redo or for * parallel read. The shared->excl upgrade loop at * the end of the function catches the race in a * reliable way (protected by the object lock). */ if (vm_page_all_valid(m)) continue; poff = IDX_TO_OFF(m->pindex); poffe = MIN(poff + PAGE_SIZE, object->un_pager.vnp.vnp_size); for (; poff < poffe; poff += bsize) { lbn = get_lblkno(vp, poff); if (lbn == lbnp) goto next_page; lbnp = lbn; error = get_blksize(vp, lbn, &bsize); if (error == 0) error = bread_gb(vp, lbn, bsize, curthread->td_ucred, br_flags, &bp); if (error != 0) goto end_pages; if (bp->b_rcred == curthread->td_ucred) { crfree(bp->b_rcred); bp->b_rcred = NOCRED; } if (LIST_EMPTY(&bp->b_dep)) { /* * Invalidation clears m->valid, but * may leave B_CACHE flag if the * buffer existed at the invalidation * time. In this case, recycle the * buffer to do real read on next * bread() after redo. * * Otherwise B_RELBUF is not strictly * necessary, enable to reduce buf * cache pressure. */ if (buf_pager_relbuf || !vm_page_all_valid(m)) bp->b_flags |= B_RELBUF; bp->b_flags &= ~B_NOCACHE; brelse(bp); } else { bqrelse(bp); } } KASSERT(1 /* racy, enable for debugging */ || vm_page_all_valid(m) || i == count - 1, ("buf %d %p invalid", i, m)); if (i == count - 1 && lpart) { if (!vm_page_none_valid(m) && !vm_page_all_valid(m)) vm_page_zero_invalid(m, TRUE); } next_page:; } end_pages: redo = false; for (i = 0; i < count; i++) { if (ma[i] == bogus_page) continue; if (vm_page_busy_tryupgrade(ma[i]) == 0) { vm_page_sunbusy(ma[i]); ma[i] = vm_page_grab_unlocked(object, ma[i]->pindex, VM_ALLOC_NORMAL); } /* * Since the pages were only sbusy while neither the * buffer nor the object lock was held by us, or * reallocated while vm_page_grab() slept for busy * relinguish, they could have been invalidated. * Recheck the valid bits and re-read as needed. * * Note that the last page is made fully valid in the * read loop, and partial validity for the page at * index count - 1 could mean that the page was * invalidated or removed, so we must restart for * safety as well. */ if (!vm_page_all_valid(ma[i])) redo = true; } if (redo && error == 0) goto again; return (error != 0 ? VM_PAGER_ERROR : VM_PAGER_OK); } #include "opt_ddb.h" #ifdef DDB #include /* DDB command to show buffer data */ DB_SHOW_COMMAND(buffer, db_show_buffer) { /* get args */ struct buf *bp = (struct buf *)addr; #ifdef FULL_BUF_TRACKING uint32_t i, j; #endif if (!have_addr) { db_printf("usage: show buffer \n"); return; } db_printf("buf at %p\n", bp); db_printf("b_flags = 0x%b, b_xflags=0x%b\n", (u_int)bp->b_flags, PRINT_BUF_FLAGS, (u_int)bp->b_xflags, PRINT_BUF_XFLAGS); db_printf("b_vflags=0x%b b_ioflags0x%b\n", (u_int)bp->b_vflags, PRINT_BUF_VFLAGS, (u_int)bp->b_ioflags, PRINT_BIO_FLAGS); db_printf( "b_error = %d, b_bufsize = %ld, b_bcount = %ld, b_resid = %ld\n" "b_bufobj = (%p), b_data = %p\n, b_blkno = %jd, b_lblkno = %jd, " "b_vp = %p, b_dep = %p\n", bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid, bp->b_bufobj, bp->b_data, (intmax_t)bp->b_blkno, (intmax_t)bp->b_lblkno, bp->b_vp, bp->b_dep.lh_first); db_printf("b_kvabase = %p, b_kvasize = %d\n", bp->b_kvabase, bp->b_kvasize); if (bp->b_npages) { int i; db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages); for (i = 0; i < bp->b_npages; i++) { vm_page_t m; m = bp->b_pages[i]; if (m != NULL) db_printf("(%p, 0x%lx, 0x%lx)", m->object, (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m)); else db_printf("( ??? )"); if ((i + 1) < bp->b_npages) db_printf(","); } db_printf("\n"); } BUF_LOCKPRINTINFO(bp); #if defined(FULL_BUF_TRACKING) db_printf("b_io_tracking: b_io_tcnt = %u\n", bp->b_io_tcnt); i = bp->b_io_tcnt % BUF_TRACKING_SIZE; for (j = 1; j <= BUF_TRACKING_SIZE; j++) { if (bp->b_io_tracking[BUF_TRACKING_ENTRY(i - j)] == NULL) continue; db_printf(" %2u: %s\n", j, bp->b_io_tracking[BUF_TRACKING_ENTRY(i - j)]); } #elif defined(BUF_TRACKING) db_printf("b_io_tracking: %s\n", bp->b_io_tracking); #endif db_printf(" "); } DB_SHOW_COMMAND_FLAGS(bufqueues, bufqueues, DB_CMD_MEMSAFE) { struct bufdomain *bd; struct buf *bp; long total; int i, j, cnt; db_printf("bqempty: %d\n", bqempty.bq_len); for (i = 0; i < buf_domains; i++) { bd = &bdomain[i]; db_printf("Buf domain %d\n", i); db_printf("\tfreebufs\t%d\n", bd->bd_freebuffers); db_printf("\tlofreebufs\t%d\n", bd->bd_lofreebuffers); db_printf("\thifreebufs\t%d\n", bd->bd_hifreebuffers); db_printf("\n"); db_printf("\tbufspace\t%ld\n", bd->bd_bufspace); db_printf("\tmaxbufspace\t%ld\n", bd->bd_maxbufspace); db_printf("\thibufspace\t%ld\n", bd->bd_hibufspace); db_printf("\tlobufspace\t%ld\n", bd->bd_lobufspace); db_printf("\tbufspacethresh\t%ld\n", bd->bd_bufspacethresh); db_printf("\n"); db_printf("\tnumdirtybuffers\t%d\n", bd->bd_numdirtybuffers); db_printf("\tlodirtybuffers\t%d\n", bd->bd_lodirtybuffers); db_printf("\thidirtybuffers\t%d\n", bd->bd_hidirtybuffers); db_printf("\tdirtybufthresh\t%d\n", bd->bd_dirtybufthresh); db_printf("\n"); total = 0; TAILQ_FOREACH(bp, &bd->bd_cleanq->bq_queue, b_freelist) total += bp->b_bufsize; db_printf("\tcleanq count\t%d (%ld)\n", bd->bd_cleanq->bq_len, total); total = 0; TAILQ_FOREACH(bp, &bd->bd_dirtyq.bq_queue, b_freelist) total += bp->b_bufsize; db_printf("\tdirtyq count\t%d (%ld)\n", bd->bd_dirtyq.bq_len, total); db_printf("\twakeup\t\t%d\n", bd->bd_wanted); db_printf("\tlim\t\t%d\n", bd->bd_lim); db_printf("\tCPU "); for (j = 0; j <= mp_maxid; j++) db_printf("%d, ", bd->bd_subq[j].bq_len); db_printf("\n"); cnt = 0; total = 0; for (j = 0; j < nbuf; j++) { bp = nbufp(j); if (bp->b_domain == i && BUF_ISLOCKED(bp)) { cnt++; total += bp->b_bufsize; } } db_printf("\tLocked buffers: %d space %ld\n", cnt, total); cnt = 0; total = 0; for (j = 0; j < nbuf; j++) { bp = nbufp(j); if (bp->b_domain == i) { cnt++; total += bp->b_bufsize; } } db_printf("\tTotal buffers: %d space %ld\n", cnt, total); } } DB_SHOW_COMMAND_FLAGS(lockedbufs, lockedbufs, DB_CMD_MEMSAFE) { struct buf *bp; int i; for (i = 0; i < nbuf; i++) { bp = nbufp(i); if (BUF_ISLOCKED(bp)) { db_show_buffer((uintptr_t)bp, 1, 0, NULL); db_printf("\n"); if (db_pager_quit) break; } } } DB_SHOW_COMMAND(vnodebufs, db_show_vnodebufs) { struct vnode *vp; struct buf *bp; if (!have_addr) { db_printf("usage: show vnodebufs \n"); return; } vp = (struct vnode *)addr; db_printf("Clean buffers:\n"); TAILQ_FOREACH(bp, &vp->v_bufobj.bo_clean.bv_hd, b_bobufs) { db_show_buffer((uintptr_t)bp, 1, 0, NULL); db_printf("\n"); } db_printf("Dirty buffers:\n"); TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs) { db_show_buffer((uintptr_t)bp, 1, 0, NULL); db_printf("\n"); } } DB_COMMAND_FLAGS(countfreebufs, db_coundfreebufs, DB_CMD_MEMSAFE) { struct buf *bp; int i, used = 0, nfree = 0; if (have_addr) { db_printf("usage: countfreebufs\n"); return; } for (i = 0; i < nbuf; i++) { bp = nbufp(i); if (bp->b_qindex == QUEUE_EMPTY) nfree++; else used++; } db_printf("Counted %d free, %d used (%d tot)\n", nfree, used, nfree + used); db_printf("numfreebuffers is %d\n", numfreebuffers); } #endif /* DDB */ diff --git a/sys/sys/buf.h b/sys/sys/buf.h index 7fffb52f84f2..7c0ea9605e28 100644 --- a/sys/sys/buf.h +++ b/sys/sys/buf.h @@ -1,619 +1,619 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)buf.h 8.9 (Berkeley) 3/30/95 */ #ifndef _SYS_BUF_H_ #define _SYS_BUF_H_ #include #include #include #include #include struct bio; struct buf; struct bufobj; struct mount; struct vnode; struct uio; /* * To avoid including */ LIST_HEAD(workhead, worklist); /* * These are currently used only by the soft dependency code, hence * are stored once in a global variable. If other subsystems wanted * to use these hooks, a pointer to a set of bio_ops could be added * to each buffer. */ extern struct bio_ops { void (*io_start)(struct buf *); void (*io_complete)(struct buf *); void (*io_deallocate)(struct buf *); int (*io_countdeps)(struct buf *, int); } bioops; struct vm_object; struct vm_page; typedef uint32_t b_xflags_t; /* * The buffer header describes an I/O operation in the kernel. * * NOTES: * b_bufsize, b_bcount. b_bufsize is the allocation size of the * buffer, either DEV_BSIZE or PAGE_SIZE aligned. b_bcount is the * originally requested buffer size and can serve as a bounds check * against EOF. For most, but not all uses, b_bcount == b_bufsize. * * b_dirtyoff, b_dirtyend. Buffers support piecemeal, unaligned * ranges of dirty data that need to be written to backing store. * The range is typically clipped at b_bcount ( not b_bufsize ). * * b_resid. Number of bytes remaining in I/O. After an I/O operation * completes, b_resid is usually 0 indicating 100% success. * * All fields are protected by the buffer lock except those marked: * V - Protected by owning bufobj lock * Q - Protected by the buf queue lock * D - Protected by an dependency implementation specific lock */ struct buf { struct bufobj *b_bufobj; long b_bcount; void *b_caller1; caddr_t b_data; int b_error; uint16_t b_iocmd; /* BIO_* bio_cmd from bio.h */ uint16_t b_ioflags; /* BIO_* bio_flags from bio.h */ off_t b_iooffset; long b_resid; void (*b_iodone)(struct buf *); void (*b_ckhashcalc)(struct buf *); uint64_t b_ckhash; /* B_CKHASH requested check-hash */ daddr_t b_blkno; /* Underlying physical block number. */ off_t b_offset; /* Offset into file. */ TAILQ_ENTRY(buf) b_bobufs; /* (V) Buffer's associated vnode. */ uint32_t b_vflags; /* (V) BV_* flags */ uint8_t b_qindex; /* (Q) buffer queue index */ uint8_t b_domain; /* (Q) buf domain this resides in */ uint16_t b_subqueue; /* (Q) per-cpu q if any */ uint32_t b_flags; /* B_* flags. */ b_xflags_t b_xflags; /* extra flags */ struct lock b_lock; /* Buffer lock */ long b_bufsize; /* Allocated buffer size. */ int b_runningbufspace; /* when I/O is running, pipelining */ int b_kvasize; /* size of kva for buffer */ int b_dirtyoff; /* Offset in buffer of dirty region. */ int b_dirtyend; /* Offset of end of dirty region. */ caddr_t b_kvabase; /* base kva for buffer */ daddr_t b_lblkno; /* Logical block number. */ struct vnode *b_vp; /* Device vnode. */ struct ucred *b_rcred; /* Read credentials reference. */ struct ucred *b_wcred; /* Write credentials reference. */ union { TAILQ_ENTRY(buf) b_freelist; /* (Q) */ struct { void (*b_pgiodone)(void *, struct vm_page **, int, int); int b_pgbefore; int b_pgafter; }; }; union cluster_info { TAILQ_HEAD(cluster_list_head, buf) cluster_head; TAILQ_ENTRY(buf) cluster_entry; } b_cluster; int b_npages; struct workhead b_dep; /* (D) List of filesystem dependencies. */ void *b_fsprivate1; void *b_fsprivate2; void *b_fsprivate3; #if defined(FULL_BUF_TRACKING) #define BUF_TRACKING_SIZE 32 #define BUF_TRACKING_ENTRY(x) ((x) & (BUF_TRACKING_SIZE - 1)) const char *b_io_tracking[BUF_TRACKING_SIZE]; uint32_t b_io_tcnt; #elif defined(BUF_TRACKING) const char *b_io_tracking; #endif struct vm_page *b_pages[]; }; #define b_object b_bufobj->bo_object /* * These flags are kept in b_flags. * * Notes: * * B_ASYNC VOP calls on bp's are usually async whether or not * B_ASYNC is set, but some subsystems, such as NFS, like * to know what is best for the caller so they can * optimize the I/O. * * B_PAGING Indicates that bp is being used by the paging system or * some paging system and that the bp is not linked into * the b_vp's clean/dirty linked lists or ref counts. * Buffer vp reassignments are illegal in this case. * * B_CACHE This may only be set if the buffer is entirely valid. * The situation where B_DELWRI is set and B_CACHE is * clear MUST be committed to disk by getblk() so * B_DELWRI can also be cleared. See the comments for * getblk() in kern/vfs_bio.c. If B_CACHE is clear, * the caller is expected to clear BIO_ERROR and B_INVAL, * set BIO_READ, and initiate an I/O. * * The 'entire buffer' is defined to be the range from * 0 through b_bcount. * * B_MALLOC Request that the buffer be allocated from the malloc * pool, DEV_BSIZE aligned instead of PAGE_SIZE aligned. * * B_CLUSTEROK This flag is typically set for B_DELWRI buffers * by filesystems that allow clustering when the buffer * is fully dirty and indicates that it may be clustered * with other adjacent dirty buffers. Note the clustering * may not be used with the stage 1 data write under NFS * but may be used for the commit rpc portion. * * B_INVALONERR This flag is set on dirty buffers. It specifies that a * write error should forcibly invalidate the buffer * contents. This flag should be used with caution, as it * discards data. It is incompatible with B_ASYNC. * * B_VMIO Indicates that the buffer is tied into an VM object. * The buffer's data is always PAGE_SIZE aligned even * if b_bufsize and b_bcount are not. ( b_bufsize is * always at least DEV_BSIZE aligned, though ). * * B_DIRECT Hint that we should attempt to completely free * the pages underlying the buffer. B_DIRECT is * sticky until the buffer is released and typically * only has an effect when B_RELBUF is also set. * */ #define B_AGE 0x00000001 /* Move to age queue when I/O done. */ #define B_NEEDCOMMIT 0x00000002 /* Append-write in progress. */ #define B_ASYNC 0x00000004 /* Start I/O, do not wait. */ #define B_DIRECT 0x00000008 /* direct I/O flag (pls free vmio) */ #define B_DEFERRED 0x00000010 /* Skipped over for cleaning */ #define B_CACHE 0x00000020 /* Bread found us in the cache. */ #define B_VALIDSUSPWRT 0x00000040 /* Valid write during suspension. */ #define B_DELWRI 0x00000080 /* Delay I/O until buffer reused. */ #define B_CKHASH 0x00000100 /* checksum hash calculated on read */ #define B_DONE 0x00000200 /* I/O completed. */ #define B_EINTR 0x00000400 /* I/O was interrupted */ #define B_NOREUSE 0x00000800 /* Contents not reused once released. */ #define B_REUSE 0x00001000 /* Contents reused, second chance. */ #define B_INVAL 0x00002000 /* Does not contain valid info. */ #define B_BARRIER 0x00004000 /* Write this and all preceding first. */ #define B_NOCACHE 0x00008000 /* Do not cache block after use. */ #define B_MALLOC 0x00010000 /* malloced b_data */ #define B_CLUSTEROK 0x00020000 /* Pagein op, so swap() can count it. */ #define B_INVALONERR 0x00040000 /* Invalidate on write error. */ #define B_IOSTARTED 0x00080000 /* buf_start() called */ #define B_00100000 0x00100000 /* Available flag. */ #define B_MAXPHYS 0x00200000 /* nitems(b_pages[]) = atop(MAXPHYS). */ #define B_RELBUF 0x00400000 /* Release VMIO buffer. */ #define B_FS_FLAG1 0x00800000 /* Available flag for FS use. */ #define B_NOCOPY 0x01000000 /* Don't copy-on-write this buf. */ #define B_INFREECNT 0x02000000 /* buf is counted in numfreebufs */ #define B_PAGING 0x04000000 /* volatile paging I/O -- bypass VMIO */ #define B_MANAGED 0x08000000 /* Managed by FS. */ #define B_RAM 0x10000000 /* Read ahead mark (flag) */ #define B_VMIO 0x20000000 /* VMIO flag */ #define B_CLUSTER 0x40000000 /* pagein op, so swap() can count it */ #define B_REMFREE 0x80000000 /* Delayed bremfree */ #define PRINT_BUF_FLAGS "\20\40remfree\37cluster\36vmio\35ram\34managed" \ "\33paging\32infreecnt\31nocopy\30b23\27relbuf\26maxphys\25b20" \ "\24iostarted\23invalonerr\22clusterok\21malloc\20nocache\17b14" \ "\16inval\15reuse\14noreuse\13eintr\12done\11b8\10delwri" \ "\7validsuspwrt\6cache\5deferred\4direct\3async\2needcommit\1age" /* * These flags are kept in b_xflags. * * BX_FSPRIV reserves a set of eight flags that may be used by individual * filesystems for their own purpose. Their specific definitions are * found in the header files for each filesystem that uses them. */ #define BX_VNDIRTY 0x00000001 /* On vnode dirty list */ #define BX_VNCLEAN 0x00000002 /* On vnode clean list */ #define BX_CVTENXIO 0x00000004 /* Convert errors to ENXIO */ #define BX_BKGRDWRITE 0x00000010 /* Do writes in background */ #define BX_BKGRDMARKER 0x00000020 /* Mark buffer for splay tree */ #define BX_ALTDATA 0x00000040 /* Holds extended data */ #define BX_FSPRIV 0x00FF0000 /* Filesystem-specific flags mask */ #define PRINT_BUF_XFLAGS "\20\7altdata\6bkgrdmarker\5bkgrdwrite\3cvtenxio" \ "\2clean\1dirty" #define NOOFFSET (-1LL) /* No buffer offset calculated yet */ /* * These flags are kept in b_vflags. */ #define BV_SCANNED 0x00000001 /* VOP_FSYNC funcs mark written bufs */ #define BV_BKGRDINPROG 0x00000002 /* Background write in progress */ #define BV_BKGRDWAIT 0x00000004 /* Background write waiting */ #define BV_BKGRDERR 0x00000008 /* Error from background write */ #define PRINT_BUF_VFLAGS "\20\4bkgrderr\3bkgrdwait\2bkgrdinprog\1scanned" #ifdef _KERNEL #ifndef NSWBUF_MIN #define NSWBUF_MIN 16 #endif /* * Buffer locking */ #include /* XXX for curthread */ #include /* * Initialize a lock. */ #define BUF_LOCKINIT(bp, wmesg) \ lockinit(&(bp)->b_lock, PRIBIO + 4, wmesg, 0, LK_NEW) /* * * Get a lock sleeping non-interruptably until it becomes available. */ #define BUF_LOCK(bp, locktype, interlock) \ _lockmgr_args_rw(&(bp)->b_lock, (locktype), (interlock), \ LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT, \ LOCK_FILE, LOCK_LINE) /* * Get a lock sleeping with specified interruptably and timeout. */ #define BUF_TIMELOCK(bp, locktype, interlock, wmesg, catch, timo) \ _lockmgr_args_rw(&(bp)->b_lock, (locktype) | LK_TIMELOCK, \ (interlock), (wmesg), (PRIBIO + 4) | (catch), (timo), \ LOCK_FILE, LOCK_LINE) /* * Release a lock. Only the acquiring process may free the lock unless * it has been handed off to biodone. */ #define BUF_UNLOCK(bp) do { \ KASSERT(((bp)->b_flags & B_REMFREE) == 0, \ ("BUF_UNLOCK %p while B_REMFREE is still set.", (bp))); \ \ BUF_UNLOCK_RAW((bp)); \ } while (0) #define BUF_UNLOCK_RAW(bp) do { \ (void)_lockmgr_args(&(bp)->b_lock, LK_RELEASE, NULL, \ LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT, \ LOCK_FILE, LOCK_LINE); \ } while (0) /* * Check if a buffer lock is recursed. */ #define BUF_LOCKRECURSED(bp) \ lockmgr_recursed(&(bp)->b_lock) /* * Check if a buffer lock is currently held. */ #define BUF_ISLOCKED(bp) \ lockstatus(&(bp)->b_lock) /* * Free a buffer lock. */ #define BUF_LOCKFREE(bp) \ lockdestroy(&(bp)->b_lock) /* * Print informations on a buffer lock. */ #define BUF_LOCKPRINTINFO(bp) \ lockmgr_printinfo(&(bp)->b_lock) /* * Buffer lock assertions. */ #if defined(INVARIANTS) && defined(INVARIANT_SUPPORT) #define BUF_ASSERT_LOCKED(bp) \ _lockmgr_assert(&(bp)->b_lock, KA_LOCKED, LOCK_FILE, LOCK_LINE) #define BUF_ASSERT_SLOCKED(bp) \ _lockmgr_assert(&(bp)->b_lock, KA_SLOCKED, LOCK_FILE, LOCK_LINE) #define BUF_ASSERT_XLOCKED(bp) \ _lockmgr_assert(&(bp)->b_lock, KA_XLOCKED, LOCK_FILE, LOCK_LINE) #define BUF_ASSERT_UNLOCKED(bp) \ _lockmgr_assert(&(bp)->b_lock, KA_UNLOCKED, LOCK_FILE, LOCK_LINE) #else #define BUF_ASSERT_LOCKED(bp) #define BUF_ASSERT_SLOCKED(bp) #define BUF_ASSERT_XLOCKED(bp) #define BUF_ASSERT_UNLOCKED(bp) #endif #ifdef _SYS_PROC_H_ /* Avoid #include pollution */ /* * When initiating asynchronous I/O, change ownership of the lock to the * kernel. Once done, the lock may legally released by biodone. The * original owning process can no longer acquire it recursively, but must * wait until the I/O is completed and the lock has been freed by biodone. */ #define BUF_KERNPROC(bp) \ _lockmgr_disown(&(bp)->b_lock, LOCK_FILE, LOCK_LINE) #endif #endif /* _KERNEL */ struct buf_queue_head { TAILQ_HEAD(buf_queue, buf) queue; daddr_t last_pblkno; struct buf *insert_point; struct buf *switch_point; }; /* * This structure describes a clustered I/O. */ struct cluster_save { long bs_bcount; /* Saved b_bcount. */ long bs_bufsize; /* Saved b_bufsize. */ int bs_nchildren; /* Number of associated buffers. */ struct buf **bs_children; /* List of associated buffers. */ }; /* * Vnode clustering tracker */ struct vn_clusterw { daddr_t v_cstart; /* v start block of cluster */ daddr_t v_lasta; /* v last allocation */ daddr_t v_lastw; /* v last write */ int v_clen; /* v length of cur. cluster */ }; #ifdef _KERNEL static __inline int bwrite(struct buf *bp) { KASSERT(bp->b_bufobj != NULL, ("bwrite: no bufobj bp=%p", bp)); KASSERT(bp->b_bufobj->bo_ops != NULL, ("bwrite: no bo_ops bp=%p", bp)); KASSERT(bp->b_bufobj->bo_ops->bop_write != NULL, ("bwrite: no bop_write bp=%p", bp)); return (BO_WRITE(bp->b_bufobj, bp)); } static __inline void bstrategy(struct buf *bp) { KASSERT(bp->b_bufobj != NULL, ("bstrategy: no bufobj bp=%p", bp)); KASSERT(bp->b_bufobj->bo_ops != NULL, ("bstrategy: no bo_ops bp=%p", bp)); KASSERT(bp->b_bufobj->bo_ops->bop_strategy != NULL, ("bstrategy: no bop_strategy bp=%p", bp)); BO_STRATEGY(bp->b_bufobj, bp); } static __inline void buf_start(struct buf *bp) { KASSERT((bp->b_flags & B_IOSTARTED) == 0, ("recursed buf_start %p", bp)); bp->b_flags |= B_IOSTARTED; if (bioops.io_start) (*bioops.io_start)(bp); } static __inline void buf_complete(struct buf *bp) { if ((bp->b_flags & B_IOSTARTED) != 0) { bp->b_flags &= ~B_IOSTARTED; if (bioops.io_complete) (*bioops.io_complete)(bp); } } static __inline void buf_deallocate(struct buf *bp) { if (bioops.io_deallocate) (*bioops.io_deallocate)(bp); } static __inline int buf_countdeps(struct buf *bp, int i) { if (bioops.io_countdeps) return ((*bioops.io_countdeps)(bp, i)); else return (0); } static __inline void buf_track(struct buf *bp __unused, const char *location __unused) { #if defined(FULL_BUF_TRACKING) bp->b_io_tracking[BUF_TRACKING_ENTRY(bp->b_io_tcnt++)] = location; #elif defined(BUF_TRACKING) bp->b_io_tracking = location; #endif } #endif /* _KERNEL */ /* * Zero out the buffer's data area. */ #define clrbuf(bp) { \ bzero((bp)->b_data, (u_int)(bp)->b_bcount); \ (bp)->b_resid = 0; \ } /* * Flags for getblk's last parameter. */ #define GB_LOCK_NOWAIT 0x0001 /* Fail if we block on a buf lock. */ #define GB_NOCREAT 0x0002 /* Don't create a buf if not found. */ #define GB_NOWAIT_BD 0x0004 /* Do not wait for bufdaemon. */ #define GB_UNMAPPED 0x0008 /* Do not mmap buffer pages. */ #define GB_KVAALLOC 0x0010 /* But allocate KVA. */ #define GB_CKHASH 0x0020 /* If reading, calc checksum hash */ #define GB_NOSPARSE 0x0040 /* Do not instantiate holes */ #define GB_CVTENXIO 0x0080 /* Convert errors to ENXIO */ #define GB_NOWITNESS 0x0100 /* Do not record for WITNESS */ #ifdef _KERNEL extern int nbuf; /* The number of buffer headers */ extern u_long maxswzone; /* Max KVA for swap structures */ extern u_long maxbcache; /* Max KVA for buffer cache */ extern int maxbcachebuf; /* Max buffer cache block size */ -extern long runningbufspace; extern long hibufspace; extern int dirtybufthresh; extern int bdwriteskip; extern int dirtybufferflushes; extern int altbufferflushes; extern int nswbuf; /* Number of swap I/O buffer headers. */ extern caddr_t __read_mostly unmapped_buf; /* Data address for unmapped buffers. */ static inline int buf_mapped(struct buf *bp) { return (bp->b_data != unmapped_buf); } +long runningbufclaim(struct buf *, int); void runningbufwakeup(struct buf *); void waitrunningbufspace(void); caddr_t kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est); void bufinit(void); void bufshutdown(int); void bdata2bio(struct buf *bp, struct bio *bip); void bwillwrite(void); int buf_dirty_count_severe(void); void bremfree(struct buf *); void bremfreef(struct buf *); /* XXX Force bremfree, only for nfs. */ #define bread(vp, blkno, size, cred, bpp) \ breadn_flags(vp, blkno, blkno, size, NULL, NULL, 0, cred, 0, \ NULL, bpp) #define bread_gb(vp, blkno, size, cred, gbflags, bpp) \ breadn_flags(vp, blkno, blkno, size, NULL, NULL, 0, cred, \ gbflags, NULL, bpp) #define breadn(vp, blkno, size, rablkno, rabsize, cnt, cred, bpp) \ breadn_flags(vp, blkno, blkno, size, rablkno, rabsize, cnt, cred, \ 0, NULL, bpp) int breadn_flags(struct vnode *, daddr_t, daddr_t, int, daddr_t *, int *, int, struct ucred *, int, void (*)(struct buf *), struct buf **); void bdwrite(struct buf *); void bawrite(struct buf *); void babarrierwrite(struct buf *); int bbarrierwrite(struct buf *); void bdirty(struct buf *); void bundirty(struct buf *); void bufstrategy(struct bufobj *, struct buf *); void brelse(struct buf *); void bqrelse(struct buf *); int vfs_bio_awrite(struct buf *); void vfs_busy_pages_acquire(struct buf *bp); void vfs_busy_pages_release(struct buf *bp); struct buf *incore(struct bufobj *, daddr_t); bool inmem(struct vnode *, daddr_t); struct buf *gbincore(struct bufobj *, daddr_t); struct buf *gbincore_unlocked(struct bufobj *, daddr_t); struct buf *getblk(struct vnode *, daddr_t, int, int, int, int); int getblkx(struct vnode *vp, daddr_t blkno, daddr_t dblkno, int size, int slpflag, int slptimeo, int flags, struct buf **bpp); struct buf *geteblk(int, int); int bufwait(struct buf *); int bufwrite(struct buf *); void bufdone(struct buf *); void bd_speedup(void); extern uma_zone_t pbuf_zone; uma_zone_t pbuf_zsecond_create(const char *name, int max); struct vn_clusterw; void cluster_init_vn(struct vn_clusterw *vnc); int cluster_read(struct vnode *, u_quad_t, daddr_t, long, struct ucred *, long, int, int, struct buf **); int cluster_wbuild(struct vnode *, long, daddr_t, int, int); void cluster_write(struct vnode *, struct vn_clusterw *, struct buf *, u_quad_t, int, int); void vfs_bio_brelse(struct buf *bp, int ioflags); void vfs_bio_bzero_buf(struct buf *bp, int base, int size); void vfs_bio_clrbuf(struct buf *); void vfs_bio_set_flags(struct buf *bp, int ioflags); void vfs_bio_set_valid(struct buf *, int base, int size); void vfs_busy_pages(struct buf *, int clear_modify); void vfs_unbusy_pages(struct buf *); int vmapbuf(struct buf *, void *, size_t, int); void vunmapbuf(struct buf *); void brelvp(struct buf *); void bgetvp(struct vnode *, struct buf *); void pbgetbo(struct bufobj *bo, struct buf *bp); void pbgetvp(struct vnode *, struct buf *); void pbrelbo(struct buf *); void pbrelvp(struct buf *); int allocbuf(struct buf *bp, int size); void reassignbuf(struct buf *); void bwait(struct buf *, u_char, const char *); void bdone(struct buf *); typedef daddr_t (vbg_get_lblkno_t)(struct vnode *, vm_ooffset_t); typedef int (vbg_get_blksize_t)(struct vnode *, daddr_t, long *); int vfs_bio_getpages(struct vnode *vp, struct vm_page **ma, int count, int *rbehind, int *rahead, vbg_get_lblkno_t get_lblkno, vbg_get_blksize_t get_blksize); #endif /* _KERNEL */ #endif /* !_SYS_BUF_H_ */ diff --git a/sys/ufs/ffs/ffs_snapshot.c b/sys/ufs/ffs/ffs_snapshot.c index fedac50e38fb..ae3b444a19e9 100644 --- a/sys/ufs/ffs/ffs_snapshot.c +++ b/sys/ufs/ffs/ffs_snapshot.c @@ -1,2762 +1,2759 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved. * * Further information about snapshots can be obtained from: * * Marshall Kirk McKusick http://www.mckusick.com/softdep/ * 1614 Oxford Street mckusick@mckusick.com * Berkeley, CA 94709-1608 +1-510-843-9542 * USA * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_snapshot.c 8.11 (McKusick) 7/23/00 */ #include #include "opt_quota.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define KERNCRED thread0.td_ucred #include "opt_ffs.h" #ifdef NO_FFS_SNAPSHOT int ffs_snapshot(struct mount *mp, char *snapfile) { return (EINVAL); } int ffs_snapblkfree(struct fs *fs, struct vnode *devvp, ufs2_daddr_t bno, long size, ino_t inum, __enum_uint8(vtype) vtype, struct workhead *wkhd) { return (EINVAL); } void ffs_snapremove(struct vnode *vp) { } void ffs_snapshot_mount(struct mount *mp) { } void ffs_snapshot_unmount(struct mount *mp) { } void ffs_snapgone(struct inode *ip) { } int ffs_copyonwrite(struct vnode *devvp, struct buf *bp) { return (EINVAL); } void ffs_sync_snap(struct mount *mp, int waitfor) { } #else FEATURE(ffs_snapshot, "FFS snapshot support"); LIST_HEAD(, snapdata) snapfree; static struct mtx snapfree_lock; MTX_SYSINIT(ffs_snapfree, &snapfree_lock, "snapdata free list", MTX_DEF); static int cgaccount(int, struct vnode *, struct buf *, int); static int expunge_ufs1(struct vnode *, struct inode *, struct fs *, int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int), int, int); static int indiracct_ufs1(struct vnode *, struct vnode *, int, ufs1_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *, int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int), int); static int fullacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int); static int snapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int); static int mapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int); static int expunge_ufs2(struct vnode *, struct inode *, struct fs *, int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int), int, int); static int indiracct_ufs2(struct vnode *, struct vnode *, int, ufs2_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *, int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int), int); static int fullacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int); static int snapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int); static int mapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int); static int readblock(struct vnode *vp, struct buf *, ufs2_daddr_t); static void try_free_snapdata(struct vnode *devvp); static void revert_snaplock(struct vnode *, struct vnode *, struct snapdata *); static struct snapdata *ffs_snapdata_acquire(struct vnode *devvp); static int ffs_bp_snapblk(struct vnode *, struct buf *); /* * To ensure the consistency of snapshots across crashes, we must * synchronously write out copied blocks before allowing the * originals to be modified. Because of the rather severe speed * penalty that this imposes, the code normally only ensures * persistence for the filesystem metadata contained within a * snapshot. Setting the following flag allows this crash * persistence to be enabled for file contents. */ int dopersistence = 0; #ifdef DIAGNOSTIC #include SYSCTL_INT(_debug, OID_AUTO, dopersistence, CTLFLAG_RW, &dopersistence, 0, ""); static int snapdebug = 0; SYSCTL_INT(_debug, OID_AUTO, snapdebug, CTLFLAG_RW, &snapdebug, 0, ""); int collectsnapstats = 0; SYSCTL_INT(_debug, OID_AUTO, collectsnapstats, CTLFLAG_RW, &collectsnapstats, 0, ""); #endif /* DIAGNOSTIC */ /* * Create a snapshot file and initialize it for the filesystem. */ int ffs_snapshot(struct mount *mp, char *snapfile) { ufs2_daddr_t numblks, blkno, *blkp, *snapblklist; int error, cg, snaploc; int i, size, len, loc; ufs2_daddr_t blockno; uint64_t flag; char saved_nice = 0; #ifdef DIAGNOSTIC long redo = 0; #endif long snaplistsize = 0; int32_t *lp; void *space; struct fs *copy_fs = NULL, *fs, *bpfs; struct thread *td = curthread; struct inode *ip, *xp; struct buf *bp, *nbp, *ibp; struct nameidata nd; struct mount *wrtmp; struct vattr vat; struct vnode *vp, *xvp, *mvp, *devvp; struct uio auio; struct iovec aiov; struct snapdata *sn; struct ufsmount *ump; #ifdef DIAGNOSTIC struct timespec starttime = {0, 0}, endtime; #endif ump = VFSTOUFS(mp); fs = ump->um_fs; sn = NULL; /* * At the moment, filesystems using gjournal cannot support * taking snapshots. */ if ((mp->mnt_flag & MNT_GJOURNAL) != 0) { vfs_mount_error(mp, "%s: Snapshots are not yet supported when " "using gjournal", fs->fs_fsmnt); return (EOPNOTSUPP); } MNT_ILOCK(mp); flag = mp->mnt_flag; MNT_IUNLOCK(mp); /* * Need to serialize access to snapshot code per filesystem. */ /* * Assign a snapshot slot in the superblock. */ UFS_LOCK(ump); for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) if (fs->fs_snapinum[snaploc] == 0) break; UFS_UNLOCK(ump); if (snaploc == FSMAXSNAP) return (ENOSPC); /* * Create the snapshot file. */ restart: NDINIT(&nd, CREATE, LOCKPARENT | LOCKLEAF | NOCACHE, UIO_SYSSPACE, snapfile); if ((error = namei(&nd)) != 0) return (error); if (nd.ni_vp != NULL) { vput(nd.ni_vp); error = EEXIST; } if (nd.ni_dvp->v_mount != mp) error = EXDEV; if (error) { NDFREE_PNBUF(&nd); if (nd.ni_dvp == nd.ni_vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); return (error); } VATTR_NULL(&vat); vat.va_type = VREG; vat.va_mode = S_IRUSR; vat.va_vaflags |= VA_EXCLUSIVE; if (VOP_GETWRITEMOUNT(nd.ni_dvp, &wrtmp)) wrtmp = NULL; if (wrtmp != mp) panic("ffs_snapshot: mount mismatch"); vfs_rel(wrtmp); if (vn_start_write(NULL, &wrtmp, V_NOWAIT) != 0) { NDFREE_PNBUF(&nd); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &wrtmp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vat); if (error) { VOP_VPUT_PAIR(nd.ni_dvp, NULL, true); NDFREE_PNBUF(&nd); vn_finished_write(wrtmp); if (error == ERELOOKUP) goto restart; return (error); } vp = nd.ni_vp; vref(nd.ni_dvp); VOP_VPUT_PAIR(nd.ni_dvp, &vp, false); if (VN_IS_DOOMED(vp)) { error = EBADF; goto out; } vnode_create_vobject(nd.ni_vp, fs->fs_size, td); vp->v_vflag |= VV_SYSTEM; ip = VTOI(vp); devvp = ITODEVVP(ip); /* * Calculate the size of the filesystem then allocate the block * immediately following the last block of the filesystem that * will contain the snapshot list. This operation allows us to * set the size of the snapshot. */ numblks = howmany(fs->fs_size, fs->fs_frag); error = UFS_BALLOC(vp, lblktosize(fs, (off_t)numblks), fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp); if (error) goto out; bawrite(bp); ip->i_size = lblktosize(fs, (off_t)(numblks + 1)); vnode_pager_setsize(vp, ip->i_size); DIP_SET(ip, i_size, ip->i_size); UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE | IN_UPDATE); /* * Preallocate critical data structures so that we can copy * them in without further allocation after we suspend all * operations on the filesystem. We would like to just release * the allocated buffers without writing them since they will * be filled in below once we are ready to go, but this upsets * the soft update code, so we go ahead and write the new buffers. * * Allocate all indirect blocks and mark all of them as not * needing to be copied. */ for (blkno = UFS_NDADDR; blkno < numblks; blkno += NINDIR(fs)) { error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno), fs->fs_bsize, td->td_ucred, BA_METAONLY, &ibp); if (error) goto out; bawrite(ibp); } /* * Allocate copies for the superblock and its summary information. */ error = UFS_BALLOC(vp, fs->fs_sblockloc, fs->fs_sbsize, KERNCRED, 0, &nbp); if (error) goto out; bawrite(nbp); blkno = fragstoblks(fs, fs->fs_csaddr); len = howmany(fs->fs_cssize, fs->fs_bsize); for (loc = 0; loc < len; loc++) { error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)), fs->fs_bsize, KERNCRED, 0, &nbp); if (error) goto out; bawrite(nbp); } /* * Allocate all cylinder group blocks. */ for (cg = 0; cg < fs->fs_ncg; cg++) { error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)), fs->fs_bsize, KERNCRED, 0, &nbp); if (error) goto out; bawrite(nbp); if (cg % 10 == 0) { error = ffs_syncvnode(vp, MNT_WAIT, 0); /* vp possibly reclaimed if unlocked */ if (error != 0) goto out; } } /* * Change inode to snapshot type file. Before setting its block * pointers to BLK_SNAP and BLK_NOCOPY in cgaccount, we have to * set its type to SF_SNAPSHOT so that VOP_REMOVE will know that * they need to be rolled back before attempting deletion. */ ip->i_flags |= SF_SNAPSHOT; DIP_SET(ip, i_flags, ip->i_flags); UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE); /* * Copy all the cylinder group maps. Although the * filesystem is still active, we hope that only a few * cylinder groups will change between now and when we * suspend operations. Thus, we will be able to quickly * touch up the few cylinder groups that changed during * the suspension period. */ len = roundup2(howmany(fs->fs_ncg, NBBY), sizeof(uint64_t)); space = malloc(len, M_DEVBUF, M_WAITOK | M_ZERO); UFS_LOCK(ump); fs->fs_active = space; UFS_UNLOCK(ump); for (cg = 0; cg < fs->fs_ncg; cg++) { error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)), fs->fs_bsize, KERNCRED, 0, &nbp); if (error) goto out; error = cgaccount(cg, vp, nbp, 1); bawrite(nbp); if (cg % 10 == 0 && error == 0) error = ffs_syncvnode(vp, MNT_WAIT, 0); if (error) goto out; } /* * Ensure that the snapshot is completely on disk. * Since we have marked it as a snapshot it is safe to * unlock it as no process will be allowed to write to it. */ if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0) goto out; VOP_UNLOCK(vp); /* * All allocations are done, so we can now snapshot the system. * * Recind nice scheduling while running with the filesystem suspended. */ if (td->td_proc->p_nice > 0) { struct proc *p; p = td->td_proc; PROC_LOCK(p); saved_nice = p->p_nice; sched_nice(p, 0); PROC_UNLOCK(p); } /* * Suspend operation on filesystem. */ for (;;) { vn_finished_write(wrtmp); if ((error = vfs_write_suspend(vp->v_mount, 0)) != 0) { vn_start_write(NULL, &wrtmp, V_WAIT); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); goto out; } if (mp->mnt_kern_flag & MNTK_SUSPENDED) break; vn_start_write(NULL, &wrtmp, V_WAIT); } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (ip->i_effnlink == 0) { error = ENOENT; /* Snapshot file unlinked */ goto resumefs; } #ifdef DIAGNOSTIC if (collectsnapstats) nanotime(&starttime); #endif /* * First, copy all the cylinder group maps that have changed. */ for (cg = 0; cg < fs->fs_ncg; cg++) { if ((ACTIVECGNUM(fs, cg) & ACTIVECGOFF(cg)) != 0) continue; #ifdef DIAGNOSTIC redo++; #endif error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)), fs->fs_bsize, KERNCRED, 0, &nbp); if (error) goto resumefs; error = cgaccount(cg, vp, nbp, 2); bawrite(nbp); if (error) goto resumefs; } /* * Grab a copy of the superblock and its summary information. * We delay writing it until the suspension is released below. */ copy_fs = malloc((uint64_t)fs->fs_bsize, M_UFSMNT, M_WAITOK); bcopy(fs, copy_fs, fs->fs_sbsize); copy_fs->fs_si = malloc(sizeof(struct fs_summary_info), M_UFSMNT, M_ZERO | M_WAITOK); if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0) copy_fs->fs_clean = 1; size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE; if (fs->fs_sbsize < size) bzero(&((char *)copy_fs)[fs->fs_sbsize], size - fs->fs_sbsize); size = blkroundup(fs, fs->fs_cssize); if (fs->fs_contigsumsize > 0) size += fs->fs_ncg * sizeof(int32_t); space = malloc((uint64_t)size, M_UFSMNT, M_WAITOK); copy_fs->fs_csp = space; bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize); space = (char *)space + fs->fs_cssize; loc = howmany(fs->fs_cssize, fs->fs_fsize); i = fs->fs_frag - loc % fs->fs_frag; len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize; if (len > 0) { if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc), len, KERNCRED, &bp)) != 0) { brelse(bp); goto resumefs; } bcopy(bp->b_data, space, (uint64_t)len); space = (char *)space + len; bp->b_flags |= B_INVAL | B_NOCACHE; brelse(bp); } if (fs->fs_contigsumsize > 0) { copy_fs->fs_maxcluster = lp = space; for (i = 0; i < fs->fs_ncg; i++) *lp++ = fs->fs_contigsumsize; } /* * We must check for active files that have been unlinked * (e.g., with a zero link count). We have to expunge all * trace of these files from the snapshot so that they are * not reclaimed prematurely by fsck or unnecessarily dumped. * We turn off the MNTK_SUSPENDED flag to avoid a panic from * spec_strategy about writing on a suspended filesystem. * Note that we skip unlinked snapshot files as they will * be handled separately below. * * We also calculate the size needed for the snapshot list. * Initial number of entries is composed of: * - one for each cylinder group map * - one for each block used by superblock summary table * - one for each snapshot inode block * - one for the superblock * - one for the snapshot list * The direct block entries in the snapshot are always * copied (see reason below). Note that the superblock and * the first cylinder group will almost always be allocated * in the direct blocks, but we add the slop for them in case * they do not end up there. The snapshot list size may get * expanded by one because of an update of an inode block for * an unlinked but still open file when it is expunged. * * Because the direct block pointers are always copied, they * are not added to the list. Instead ffs_copyonwrite() * explicitly checks for them before checking the snapshot list. */ snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) + FSMAXSNAP + /* superblock */ 1 + /* snaplist */ 1; MNT_ILOCK(mp); mp->mnt_kern_flag &= ~MNTK_SUSPENDED; MNT_IUNLOCK(mp); loop: MNT_VNODE_FOREACH_ALL(xvp, mp, mvp) { if ((xvp->v_usecount == 0 && (xvp->v_iflag & (VI_OWEINACT | VI_DOINGINACT)) == 0) || xvp->v_type == VNON || IS_SNAPSHOT(VTOI(xvp))) { VI_UNLOCK(xvp); continue; } /* * We can skip parent directory vnode because it must have * this snapshot file in it. */ if (xvp == nd.ni_dvp) { VI_UNLOCK(xvp); continue; } vholdl(xvp); if (vn_lock(xvp, LK_EXCLUSIVE | LK_INTERLOCK) != 0) { MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); vdrop(xvp); goto loop; } VI_LOCK(xvp); if (xvp->v_usecount == 0 && (xvp->v_iflag & (VI_OWEINACT | VI_DOINGINACT)) == 0) { VI_UNLOCK(xvp); VOP_UNLOCK(xvp); vdrop(xvp); continue; } VI_UNLOCK(xvp); #ifdef DIAGNOSTIC if (snapdebug) vn_printf(xvp, "ffs_snapshot: busy vnode "); #endif if (VOP_GETATTR(xvp, &vat, td->td_ucred) == 0 && vat.va_nlink > 0) { VOP_UNLOCK(xvp); vdrop(xvp); continue; } xp = VTOI(xvp); if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) { VOP_UNLOCK(xvp); vdrop(xvp); continue; } /* * If there is a fragment, clear it here. */ blkno = 0; loc = howmany(xp->i_size, fs->fs_bsize) - 1; if (loc < UFS_NDADDR) { len = fragroundup(fs, blkoff(fs, xp->i_size)); if (len != 0 && len < fs->fs_bsize) { ffs_blkfree(ump, copy_fs, vp, DIP(xp, i_db[loc]), len, xp->i_number, xvp->v_type, NULL, SINGLETON_KEY); blkno = DIP(xp, i_db[loc]); DIP_SET(xp, i_db[loc], 0); } } snaplistsize += 1; if (I_IS_UFS1(xp)) error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1, BLK_NOCOPY, 1); else error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2, BLK_NOCOPY, 1); if (blkno) DIP_SET(xp, i_db[loc], blkno); if (!error) error = ffs_freefile(ump, copy_fs, vp, xp->i_number, xp->i_mode, NULL); VOP_UNLOCK(xvp); vdrop(xvp); if (error) { MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); goto resumefs; } } /* * Erase the journal file from the snapshot. */ if (fs->fs_flags & FS_SUJ) { error = softdep_journal_lookup(mp, &xvp); if (error) goto resumefs; xp = VTOI(xvp); if (I_IS_UFS1(xp)) error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1, BLK_NOCOPY, 0); else error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2, BLK_NOCOPY, 0); vput(xvp); } /* * Preallocate all the direct blocks in the snapshot inode so * that we never have to write the inode itself to commit an * update to the contents of the snapshot. Note that once * created, the size of the snapshot will never change, so * there will never be a need to write the inode except to * update the non-integrity-critical time fields and * allocated-block count. */ for (blockno = 0; blockno < UFS_NDADDR; blockno++) { if (DIP(ip, i_db[blockno]) != 0) continue; error = UFS_BALLOC(vp, lblktosize(fs, blockno), fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp); if (error) goto resumefs; error = readblock(vp, bp, blockno); bawrite(bp); if (error != 0) goto resumefs; } /* * Acquire a lock on the snapdata structure, creating it if necessary. */ sn = ffs_snapdata_acquire(devvp); /* * Change vnode to use shared snapshot lock instead of the original * private lock. */ vp->v_vnlock = &sn->sn_lock; lockmgr(&vp->v_lock, LK_RELEASE, NULL); xp = TAILQ_FIRST(&sn->sn_head); /* * If this is the first snapshot on this filesystem, then we need * to allocate the space for the list of preallocated snapshot blocks. * This list will be refined below, but this preliminary one will * keep us out of deadlock until the full one is ready. */ if (xp == NULL) { snapblklist = malloc(snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK); blkp = &snapblklist[1]; *blkp++ = lblkno(fs, fs->fs_sblockloc); blkno = fragstoblks(fs, fs->fs_csaddr); for (cg = 0; cg < fs->fs_ncg; cg++) { if (fragstoblks(fs, cgtod(fs, cg)) > blkno) break; *blkp++ = fragstoblks(fs, cgtod(fs, cg)); } len = howmany(fs->fs_cssize, fs->fs_bsize); for (loc = 0; loc < len; loc++) *blkp++ = blkno + loc; for (; cg < fs->fs_ncg; cg++) *blkp++ = fragstoblks(fs, cgtod(fs, cg)); snapblklist[0] = blkp - snapblklist; VI_LOCK(devvp); if (sn->sn_blklist != NULL) panic("ffs_snapshot: non-empty list"); sn->sn_blklist = snapblklist; sn->sn_listsize = blkp - snapblklist; VI_UNLOCK(devvp); } /* * Record snapshot inode. Since this is the newest snapshot, * it must be placed at the end of the list. */ VI_LOCK(devvp); fs->fs_snapinum[snaploc] = ip->i_number; if (ip->i_nextsnap.tqe_prev != 0) panic("ffs_snapshot: %ju already on list", (uintmax_t)ip->i_number); TAILQ_INSERT_TAIL(&sn->sn_head, ip, i_nextsnap); devvp->v_vflag |= VV_COPYONWRITE; VI_UNLOCK(devvp); resumefs: ASSERT_VOP_LOCKED(vp, "ffs_snapshot vp"); if (error != 0 && copy_fs != NULL) { free(copy_fs->fs_csp, M_UFSMNT); free(copy_fs->fs_si, M_UFSMNT); free(copy_fs, M_UFSMNT); copy_fs = NULL; } KASSERT(error != 0 || (sn != NULL && copy_fs != NULL), ("missing snapshot setup parameters")); /* * Resume operation on filesystem. */ vfs_write_resume(vp->v_mount, VR_START_WRITE | VR_NO_SUSPCLR); #ifdef DIAGNOSTIC if (collectsnapstats && starttime.tv_sec > 0) { nanotime(&endtime); timespecsub(&endtime, &starttime, &endtime); printf("%s: suspended %ld.%03ld sec, redo %ld of %d\n", vp->v_mount->mnt_stat.f_mntonname, (long)endtime.tv_sec, endtime.tv_nsec / 1000000, redo, fs->fs_ncg); } #endif if (copy_fs == NULL) goto out; /* * Copy allocation information from all the snapshots in * this snapshot and then expunge them from its view. */ TAILQ_FOREACH(xp, &sn->sn_head, i_nextsnap) { if (xp == ip) break; if (I_IS_UFS1(xp)) error = expunge_ufs1(vp, xp, fs, snapacct_ufs1, BLK_SNAP, 0); else error = expunge_ufs2(vp, xp, fs, snapacct_ufs2, BLK_SNAP, 0); if (error == 0 && xp->i_effnlink == 0) { error = ffs_freefile(ump, copy_fs, vp, xp->i_number, xp->i_mode, NULL); } if (error) { fs->fs_snapinum[snaploc] = 0; goto done; } } /* * Allocate space for the full list of preallocated snapshot blocks. */ snapblklist = malloc(snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK); ip->i_snapblklist = &snapblklist[1]; /* * Expunge the blocks used by the snapshots from the set of * blocks marked as used in the snapshot bitmaps. Also, collect * the list of allocated blocks in i_snapblklist. */ if (I_IS_UFS1(ip)) error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1, BLK_SNAP, 0); else error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2, BLK_SNAP, 0); if (error) { fs->fs_snapinum[snaploc] = 0; free(snapblklist, M_UFSMNT); goto done; } if (snaplistsize < ip->i_snapblklist - snapblklist) panic("ffs_snapshot: list too small"); snaplistsize = ip->i_snapblklist - snapblklist; snapblklist[0] = snaplistsize; ip->i_snapblklist = 0; /* * Write out the list of allocated blocks to the end of the snapshot. */ auio.uio_iov = &aiov; auio.uio_iovcnt = 1; aiov.iov_base = (void *)snapblklist; aiov.iov_len = snaplistsize * sizeof(daddr_t); auio.uio_resid = aiov.iov_len; auio.uio_offset = lblktosize(fs, (off_t)numblks); auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_WRITE; auio.uio_td = td; if ((error = VOP_WRITE(vp, &auio, IO_UNIT, td->td_ucred)) != 0) { fs->fs_snapinum[snaploc] = 0; free(snapblklist, M_UFSMNT); goto done; } /* * Write the superblock and its summary information * to the snapshot. */ blkno = fragstoblks(fs, fs->fs_csaddr); len = howmany(fs->fs_cssize, fs->fs_bsize); space = copy_fs->fs_csp; for (loc = 0; loc < len; loc++) { error = bread(vp, blkno + loc, fs->fs_bsize, KERNCRED, &nbp); if (error) { fs->fs_snapinum[snaploc] = 0; free(snapblklist, M_UFSMNT); goto done; } bcopy(space, nbp->b_data, fs->fs_bsize); space = (char *)space + fs->fs_bsize; bawrite(nbp); } error = bread(vp, lblkno(fs, fs->fs_sblockloc), fs->fs_bsize, KERNCRED, &nbp); if (error) { brelse(nbp); } else { loc = blkoff(fs, fs->fs_sblockloc); copy_fs->fs_fmod = 0; bpfs = (struct fs *)&nbp->b_data[loc]; bcopy((caddr_t)copy_fs, (caddr_t)bpfs, (uint64_t)fs->fs_sbsize); ffs_oldfscompat_write(bpfs, ump); bpfs->fs_ckhash = ffs_calc_sbhash(bpfs); bawrite(nbp); } /* * As this is the newest list, it is the most inclusive, so * should replace the previous list. */ VI_LOCK(devvp); space = sn->sn_blklist; sn->sn_blklist = snapblklist; sn->sn_listsize = snaplistsize; VI_UNLOCK(devvp); if (space != NULL) free(space, M_UFSMNT); done: free(copy_fs->fs_csp, M_UFSMNT); free(copy_fs->fs_si, M_UFSMNT); free(copy_fs, M_UFSMNT); copy_fs = NULL; out: if (saved_nice > 0) { struct proc *p; p = td->td_proc; PROC_LOCK(p); sched_nice(td->td_proc, saved_nice); PROC_UNLOCK(td->td_proc); } UFS_LOCK(ump); if (fs->fs_active != 0) { free(fs->fs_active, M_DEVBUF); fs->fs_active = 0; } UFS_UNLOCK(ump); MNT_ILOCK(mp); mp->mnt_flag = (mp->mnt_flag & MNT_QUOTA) | (flag & ~MNT_QUOTA); MNT_IUNLOCK(mp); NDFREE_PNBUF(&nd); vrele(nd.ni_dvp); if (error == 0) { (void) ffs_syncvnode(vp, MNT_WAIT, 0); VOP_UNLOCK(vp); } else if (VN_IS_DOOMED(vp)) { vput(vp); } else { int rmerr; /* Remove snapshot as its creation has failed. */ vput(vp); NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF, UIO_SYSSPACE, snapfile); if ((rmerr = namei(&nd)) != 0 || (rmerr = VOP_REMOVE(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd)) != 0) printf("Delete of %s failed with error %d\n", nd.ni_dirp, rmerr); NDFREE_PNBUF(&nd); if (nd.ni_dvp != NULL) vput(nd.ni_dvp); if (nd.ni_vp != NULL) vput(nd.ni_vp); } vn_finished_write(wrtmp); process_deferred_inactive(mp); return (error); } /* * Copy a cylinder group map. All the unallocated blocks are marked * BLK_NOCOPY so that the snapshot knows that it need not copy them * if they are later written. If passno is one, then this is a first * pass, so only setting needs to be done. If passno is 2, then this * is a revision to a previous pass which must be undone as the * replacement pass is done. */ static int cgaccount(int cg, struct vnode *vp, struct buf *nbp, int passno) { struct buf *bp, *ibp; struct inode *ip; struct cg *cgp; struct fs *fs; ufs2_daddr_t base, numblks; int error, len, loc, indiroff; ip = VTOI(vp); fs = ITOFS(ip); if ((error = ffs_getcg(fs, ITODEVVP(ip), cg, 0, &bp, &cgp)) != 0) return (error); UFS_LOCK(ITOUMP(ip)); ACTIVESET(fs, cg); /* * Recomputation of summary information might not have been performed * at mount time. Sync up summary information for current cylinder * group while data is in memory to ensure that result of background * fsck is slightly more consistent. */ fs->fs_cs(fs, cg) = cgp->cg_cs; UFS_UNLOCK(ITOUMP(ip)); bcopy(bp->b_data, nbp->b_data, fs->fs_cgsize); if (fs->fs_cgsize < fs->fs_bsize) bzero(&nbp->b_data[fs->fs_cgsize], fs->fs_bsize - fs->fs_cgsize); cgp = (struct cg *)nbp->b_data; bqrelse(bp); if (passno == 2) nbp->b_flags |= B_VALIDSUSPWRT; numblks = howmany(fs->fs_size, fs->fs_frag); len = howmany(fs->fs_fpg, fs->fs_frag); base = cgbase(fs, cg) / fs->fs_frag; if (base + len >= numblks) len = numblks - base - 1; loc = 0; if (base < UFS_NDADDR) { for ( ; loc < UFS_NDADDR; loc++) { if (ffs_isblock(fs, cg_blksfree(cgp), loc)) DIP_SET(ip, i_db[loc], BLK_NOCOPY); else if (passno == 2 && DIP(ip, i_db[loc])== BLK_NOCOPY) DIP_SET(ip, i_db[loc], 0); else if (passno == 1 && DIP(ip, i_db[loc])== BLK_NOCOPY) panic("ffs_snapshot: lost direct block"); } } error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); if (error) { goto out; } indiroff = (base + loc - UFS_NDADDR) % NINDIR(fs); for ( ; loc < len; loc++, indiroff++) { if (indiroff >= NINDIR(fs)) { if (passno == 2) ibp->b_flags |= B_VALIDSUSPWRT; bawrite(ibp); error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); if (error) { goto out; } indiroff = 0; } if (I_IS_UFS1(ip)) { if (ffs_isblock(fs, cg_blksfree(cgp), loc)) ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY; else if (passno == 2 && ((ufs1_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY) ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = 0; else if (passno == 1 && ((ufs1_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY) panic("ffs_snapshot: lost indirect block"); continue; } if (ffs_isblock(fs, cg_blksfree(cgp), loc)) ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY; else if (passno == 2 && ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY) ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = 0; else if (passno == 1 && ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY) panic("ffs_snapshot: lost indirect block"); } if (passno == 2) ibp->b_flags |= B_VALIDSUSPWRT; bdwrite(ibp); out: /* * We have to calculate the crc32c here rather than just setting the * BX_CYLGRP b_xflags because the allocation of the block for the * the cylinder group map will always be a full size block (fs_bsize) * even though the cylinder group may be smaller (fs_cgsize). The * crc32c must be computed only over fs_cgsize whereas the BX_CYLGRP * flag causes it to be computed over the size of the buffer. */ if ((fs->fs_metackhash & CK_CYLGRP) != 0) { ((struct cg *)nbp->b_data)->cg_ckhash = 0; ((struct cg *)nbp->b_data)->cg_ckhash = calculate_crc32c(~0L, nbp->b_data, fs->fs_cgsize); } return (error); } /* * Before expunging a snapshot inode, note all the * blocks that it claims with BLK_SNAP so that fsck will * be able to account for those blocks properly and so * that this snapshot knows that it need not copy them * if the other snapshot holding them is freed. This code * is reproduced once each for UFS1 and UFS2. */ static int expunge_ufs1(struct vnode *snapvp, struct inode *cancelip, struct fs *fs, int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int), int expungetype, int clearmode) { int i, error, indiroff; ufs_lbn_t lbn, rlbn; ufs2_daddr_t len, blkno, numblks, blksperindir; struct ufs1_dinode *dip; struct thread *td = curthread; struct buf *bp; /* * Prepare to expunge the inode. If its inode block has not * yet been copied, then allocate and fill the copy. */ lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number)); blkno = 0; if (lbn < UFS_NDADDR) { blkno = VTOI(snapvp)->i_din1->di_db[lbn]; } else { if (DOINGSOFTDEP(snapvp)) softdep_prealloc(snapvp, MNT_WAIT); td->td_pflags |= TDP_COWINPROGRESS; error = ffs_balloc_ufs1(snapvp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, BA_METAONLY, &bp); td->td_pflags &= ~TDP_COWINPROGRESS; if (error) return (error); indiroff = (lbn - UFS_NDADDR) % NINDIR(fs); blkno = ((ufs1_daddr_t *)(bp->b_data))[indiroff]; bqrelse(bp); } if (blkno != 0) { if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp))) return (error); } else { error = ffs_balloc_ufs1(snapvp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, 0, &bp); if (error) return (error); if ((error = readblock(snapvp, bp, lbn)) != 0) return (error); } /* * Set a snapshot inode to be a zero length file, regular files * or unlinked snapshots to be completely unallocated. */ dip = (struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, cancelip->i_number); if (clearmode || cancelip->i_effnlink == 0) dip->di_mode = 0; dip->di_size = 0; dip->di_blocks = 0; dip->di_flags &= ~SF_SNAPSHOT; bzero(dip->di_db, UFS_NDADDR * sizeof(ufs1_daddr_t)); bzero(dip->di_ib, UFS_NIADDR * sizeof(ufs1_daddr_t)); bdwrite(bp); /* * Now go through and expunge all the blocks in the file * using the function requested. */ numblks = howmany(cancelip->i_size, fs->fs_bsize); if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_db[0], &cancelip->i_din1->di_db[UFS_NDADDR], fs, 0, expungetype))) return (error); if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_ib[0], &cancelip->i_din1->di_ib[UFS_NIADDR], fs, -1, expungetype))) return (error); blksperindir = 1; lbn = -UFS_NDADDR; len = numblks - UFS_NDADDR; rlbn = UFS_NDADDR; for (i = 0; len > 0 && i < UFS_NIADDR; i++) { error = indiracct_ufs1(snapvp, ITOV(cancelip), i, cancelip->i_din1->di_ib[i], lbn, rlbn, len, blksperindir, fs, acctfunc, expungetype); if (error) return (error); blksperindir *= NINDIR(fs); lbn -= blksperindir + 1; len -= blksperindir; rlbn += blksperindir; } return (0); } /* * Descend an indirect block chain for vnode cancelvp accounting for all * its indirect blocks in snapvp. */ static int indiracct_ufs1(struct vnode *snapvp, struct vnode *cancelvp, int level, ufs1_daddr_t blkno, ufs_lbn_t lbn, ufs_lbn_t rlbn, ufs_lbn_t remblks, ufs_lbn_t blksperindir, struct fs *fs, int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int), int expungetype) { int error, num, i; ufs_lbn_t subblksperindir; struct indir indirs[UFS_NIADDR + 2]; ufs1_daddr_t last, *bap; struct buf *bp; if (blkno == 0) { if (expungetype == BLK_NOCOPY) return (0); panic("indiracct_ufs1: missing indir"); } if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0) return (error); if (lbn != indirs[num - 1 - level].in_lbn || num < 2) panic("indiracct_ufs1: botched params"); /* * We have to expand bread here since it will deadlock looking * up the block number for any blocks that are not in the cache. */ bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0); bp->b_blkno = fsbtodb(fs, blkno); if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 && (error = readblock(cancelvp, bp, fragstoblks(fs, blkno)))) { brelse(bp); return (error); } /* * Account for the block pointers in this indirect block. */ last = howmany(remblks, blksperindir); if (last > NINDIR(fs)) last = NINDIR(fs); bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK); bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize); bqrelse(bp); error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs, level == 0 ? rlbn : -1, expungetype); if (error || level == 0) goto out; /* * Account for the block pointers in each of the indirect blocks * in the levels below us. */ subblksperindir = blksperindir / NINDIR(fs); for (lbn++, level--, i = 0; i < last; i++) { error = indiracct_ufs1(snapvp, cancelvp, level, bap[i], lbn, rlbn, remblks, subblksperindir, fs, acctfunc, expungetype); if (error) goto out; rlbn += blksperindir; lbn -= blksperindir; remblks -= blksperindir; } out: free(bap, M_DEVBUF); return (error); } /* * Do both snap accounting and map accounting. */ static int fullacct_ufs1(struct vnode *vp, ufs1_daddr_t *oldblkp, ufs1_daddr_t *lastblkp, struct fs *fs, ufs_lbn_t lblkno, int exptype) /* BLK_SNAP or BLK_NOCOPY */ { int error; if ((error = snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype))) return (error); return (mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)); } /* * Identify a set of blocks allocated in a snapshot inode. */ static int snapacct_ufs1(struct vnode *vp, ufs1_daddr_t *oldblkp, ufs1_daddr_t *lastblkp, struct fs *fs, ufs_lbn_t lblkno, int expungetype) /* BLK_SNAP or BLK_NOCOPY */ { struct inode *ip = VTOI(vp); ufs1_daddr_t blkno, *blkp; ufs_lbn_t lbn; struct buf *ibp; int error; for ( ; oldblkp < lastblkp; oldblkp++) { blkno = *oldblkp; if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP) continue; lbn = fragstoblks(fs, blkno); if (lbn < UFS_NDADDR) { blkp = &ip->i_din1->di_db[lbn]; UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE); } else { error = ffs_balloc_ufs1(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); if (error) return (error); blkp = &((ufs1_daddr_t *)(ibp->b_data)) [(lbn - UFS_NDADDR) % NINDIR(fs)]; } /* * If we are expunging a snapshot vnode and we * find a block marked BLK_NOCOPY, then it is * one that has been allocated to this snapshot after * we took our current snapshot and can be ignored. */ if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) { if (lbn >= UFS_NDADDR) brelse(ibp); } else { if (*blkp != 0) panic("snapacct_ufs1: bad block"); *blkp = expungetype; if (lbn >= UFS_NDADDR) bdwrite(ibp); } } return (0); } /* * Account for a set of blocks allocated in a snapshot inode. */ static int mapacct_ufs1(struct vnode *vp, ufs1_daddr_t *oldblkp, ufs1_daddr_t *lastblkp, struct fs *fs, ufs_lbn_t lblkno, int expungetype) { ufs1_daddr_t blkno; struct inode *ip; ino_t inum; int acctit; ip = VTOI(vp); inum = ip->i_number; if (lblkno == -1) acctit = 0; else acctit = 1; for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) { blkno = *oldblkp; if (blkno == 0 || blkno == BLK_NOCOPY) continue; if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP) *ip->i_snapblklist++ = lblkno; if (blkno == BLK_SNAP) blkno = blkstofrags(fs, lblkno); ffs_blkfree(ITOUMP(ip), fs, vp, blkno, fs->fs_bsize, inum, vp->v_type, NULL, SINGLETON_KEY); } return (0); } /* * Before expunging a snapshot inode, note all the * blocks that it claims with BLK_SNAP so that fsck will * be able to account for those blocks properly and so * that this snapshot knows that it need not copy them * if the other snapshot holding them is freed. This code * is reproduced once each for UFS1 and UFS2. */ static int expunge_ufs2(struct vnode *snapvp, struct inode *cancelip, struct fs *fs, int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int), int expungetype, int clearmode) { int i, error, indiroff; ufs_lbn_t lbn, rlbn; ufs2_daddr_t len, blkno, numblks, blksperindir; struct ufs2_dinode *dip; struct thread *td = curthread; struct buf *bp; /* * Prepare to expunge the inode. If its inode block has not * yet been copied, then allocate and fill the copy. */ lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number)); blkno = 0; if (lbn < UFS_NDADDR) { blkno = VTOI(snapvp)->i_din2->di_db[lbn]; } else { if (DOINGSOFTDEP(snapvp)) softdep_prealloc(snapvp, MNT_WAIT); td->td_pflags |= TDP_COWINPROGRESS; error = ffs_balloc_ufs2(snapvp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, BA_METAONLY, &bp); td->td_pflags &= ~TDP_COWINPROGRESS; if (error) return (error); indiroff = (lbn - UFS_NDADDR) % NINDIR(fs); blkno = ((ufs2_daddr_t *)(bp->b_data))[indiroff]; bqrelse(bp); } if (blkno != 0) { if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp))) return (error); } else { error = ffs_balloc_ufs2(snapvp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, 0, &bp); if (error) return (error); if ((error = readblock(snapvp, bp, lbn)) != 0) return (error); } /* * Set a snapshot inode to be a zero length file, regular files * to be completely unallocated. */ dip = (struct ufs2_dinode *)bp->b_data + ino_to_fsbo(fs, cancelip->i_number); dip->di_size = 0; dip->di_blocks = 0; dip->di_flags &= ~SF_SNAPSHOT; bzero(dip->di_db, UFS_NDADDR * sizeof(ufs2_daddr_t)); bzero(dip->di_ib, UFS_NIADDR * sizeof(ufs2_daddr_t)); if (clearmode || cancelip->i_effnlink == 0) dip->di_mode = 0; else ffs_update_dinode_ckhash(fs, dip); bdwrite(bp); /* * Now go through and expunge all the blocks in the file * using the function requested. */ numblks = howmany(cancelip->i_size, fs->fs_bsize); if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_db[0], &cancelip->i_din2->di_db[UFS_NDADDR], fs, 0, expungetype))) return (error); if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_ib[0], &cancelip->i_din2->di_ib[UFS_NIADDR], fs, -1, expungetype))) return (error); blksperindir = 1; lbn = -UFS_NDADDR; len = numblks - UFS_NDADDR; rlbn = UFS_NDADDR; for (i = 0; len > 0 && i < UFS_NIADDR; i++) { error = indiracct_ufs2(snapvp, ITOV(cancelip), i, cancelip->i_din2->di_ib[i], lbn, rlbn, len, blksperindir, fs, acctfunc, expungetype); if (error) return (error); blksperindir *= NINDIR(fs); lbn -= blksperindir + 1; len -= blksperindir; rlbn += blksperindir; } return (0); } /* * Descend an indirect block chain for vnode cancelvp accounting for all * its indirect blocks in snapvp. */ static int indiracct_ufs2(struct vnode *snapvp, struct vnode *cancelvp, int level, ufs2_daddr_t blkno, ufs_lbn_t lbn, ufs_lbn_t rlbn, ufs_lbn_t remblks, ufs_lbn_t blksperindir, struct fs *fs, int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int), int expungetype) { int error, num, i; ufs_lbn_t subblksperindir; struct indir indirs[UFS_NIADDR + 2]; ufs2_daddr_t last, *bap; struct buf *bp; if (blkno == 0) { if (expungetype == BLK_NOCOPY) return (0); panic("indiracct_ufs2: missing indir"); } if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0) return (error); if (lbn != indirs[num - 1 - level].in_lbn || num < 2) panic("indiracct_ufs2: botched params"); /* * We have to expand bread here since it will deadlock looking * up the block number for any blocks that are not in the cache. */ bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0); bp->b_blkno = fsbtodb(fs, blkno); if ((bp->b_flags & B_CACHE) == 0 && (error = readblock(cancelvp, bp, fragstoblks(fs, blkno)))) { brelse(bp); return (error); } /* * Account for the block pointers in this indirect block. */ last = howmany(remblks, blksperindir); if (last > NINDIR(fs)) last = NINDIR(fs); bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK); bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize); bqrelse(bp); error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs, level == 0 ? rlbn : -1, expungetype); if (error || level == 0) goto out; /* * Account for the block pointers in each of the indirect blocks * in the levels below us. */ subblksperindir = blksperindir / NINDIR(fs); for (lbn++, level--, i = 0; i < last; i++) { error = indiracct_ufs2(snapvp, cancelvp, level, bap[i], lbn, rlbn, remblks, subblksperindir, fs, acctfunc, expungetype); if (error) goto out; rlbn += blksperindir; lbn -= blksperindir; remblks -= blksperindir; } out: free(bap, M_DEVBUF); return (error); } /* * Do both snap accounting and map accounting. */ static int fullacct_ufs2(struct vnode *vp, ufs2_daddr_t *oldblkp, ufs2_daddr_t *lastblkp, struct fs *fs, ufs_lbn_t lblkno, int exptype) /* BLK_SNAP or BLK_NOCOPY */ { int error; if ((error = snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype))) return (error); return (mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)); } /* * Identify a set of blocks allocated in a snapshot inode. */ static int snapacct_ufs2(struct vnode *vp, ufs2_daddr_t *oldblkp, ufs2_daddr_t *lastblkp, struct fs *fs, ufs_lbn_t lblkno, int expungetype) /* BLK_SNAP or BLK_NOCOPY */ { struct inode *ip = VTOI(vp); ufs2_daddr_t blkno, *blkp; ufs_lbn_t lbn; struct buf *ibp; int error; for ( ; oldblkp < lastblkp; oldblkp++) { blkno = *oldblkp; if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP) continue; lbn = fragstoblks(fs, blkno); if (lbn < UFS_NDADDR) { blkp = &ip->i_din2->di_db[lbn]; UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE); } else { error = ffs_balloc_ufs2(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); if (error) return (error); blkp = &((ufs2_daddr_t *)(ibp->b_data)) [(lbn - UFS_NDADDR) % NINDIR(fs)]; } /* * If we are expunging a snapshot vnode and we * find a block marked BLK_NOCOPY, then it is * one that has been allocated to this snapshot after * we took our current snapshot and can be ignored. */ if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) { if (lbn >= UFS_NDADDR) brelse(ibp); } else { if (*blkp != 0) panic("snapacct_ufs2: bad block"); *blkp = expungetype; if (lbn >= UFS_NDADDR) bdwrite(ibp); } } return (0); } /* * Account for a set of blocks allocated in a snapshot inode. */ static int mapacct_ufs2(struct vnode *vp, ufs2_daddr_t *oldblkp, ufs2_daddr_t *lastblkp, struct fs *fs, ufs_lbn_t lblkno, int expungetype) { ufs2_daddr_t blkno; struct inode *ip; ino_t inum; int acctit; ip = VTOI(vp); inum = ip->i_number; if (lblkno == -1) acctit = 0; else acctit = 1; for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) { blkno = *oldblkp; if (blkno == 0 || blkno == BLK_NOCOPY) continue; if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP && lblkno >= UFS_NDADDR) *ip->i_snapblklist++ = lblkno; if (blkno == BLK_SNAP) blkno = blkstofrags(fs, lblkno); ffs_blkfree(ITOUMP(ip), fs, vp, blkno, fs->fs_bsize, inum, vp->v_type, NULL, SINGLETON_KEY); } return (0); } /* * Decrement extra reference on snapshot when last name is removed. * It will not be freed until the last open reference goes away. */ void ffs_snapgone(struct inode *ip) { struct inode *xp; struct fs *fs; int snaploc; struct snapdata *sn; struct ufsmount *ump; /* * Find snapshot in incore list. */ xp = NULL; sn = ITODEVVP(ip)->v_rdev->si_snapdata; if (sn != NULL) TAILQ_FOREACH(xp, &sn->sn_head, i_nextsnap) if (xp == ip) break; if (xp != NULL) vrele(ITOV(ip)); #ifdef DIAGNOSTIC else if (snapdebug) printf("ffs_snapgone: lost snapshot vnode %ju\n", (uintmax_t)ip->i_number); #endif /* * Delete snapshot inode from superblock. Keep list dense. */ ump = ITOUMP(ip); fs = ump->um_fs; UFS_LOCK(ump); for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) if (fs->fs_snapinum[snaploc] == ip->i_number) break; if (snaploc < FSMAXSNAP) { for (snaploc++; snaploc < FSMAXSNAP; snaploc++) { if (fs->fs_snapinum[snaploc] == 0) break; fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc]; } fs->fs_snapinum[snaploc - 1] = 0; } UFS_UNLOCK(ump); } /* * Prepare a snapshot file for being removed. */ void ffs_snapremove(struct vnode *vp) { struct inode *ip; struct vnode *devvp; struct buf *ibp; struct fs *fs; ufs2_daddr_t numblks, blkno, dblk; int error, last, loc; struct snapdata *sn; ip = VTOI(vp); fs = ITOFS(ip); devvp = ITODEVVP(ip); /* * If active, delete from incore list (this snapshot may * already have been in the process of being deleted, so * would not have been active). * * Clear copy-on-write flag if last snapshot. */ VI_LOCK(devvp); if (ip->i_nextsnap.tqe_prev != 0) { sn = devvp->v_rdev->si_snapdata; TAILQ_REMOVE(&sn->sn_head, ip, i_nextsnap); ip->i_nextsnap.tqe_prev = 0; revert_snaplock(vp, devvp, sn); try_free_snapdata(devvp); } VI_UNLOCK(devvp); /* * Clear all BLK_NOCOPY fields. Pass any block claims to other * snapshots that want them (see ffs_snapblkfree below). */ for (blkno = 1; blkno < UFS_NDADDR; blkno++) { dblk = DIP(ip, i_db[blkno]); if (dblk == 0) continue; if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) DIP_SET(ip, i_db[blkno], 0); else if ((dblk == blkstofrags(fs, blkno) && ffs_snapblkfree(fs, ITODEVVP(ip), dblk, fs->fs_bsize, ip->i_number, vp->v_type, NULL))) { DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - btodb(fs->fs_bsize)); DIP_SET(ip, i_db[blkno], 0); } } numblks = howmany(ip->i_size, fs->fs_bsize); for (blkno = UFS_NDADDR; blkno < numblks; blkno += NINDIR(fs)) { error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); if (error) continue; if (fs->fs_size - blkno > NINDIR(fs)) last = NINDIR(fs); else last = fs->fs_size - blkno; for (loc = 0; loc < last; loc++) { if (I_IS_UFS1(ip)) { dblk = ((ufs1_daddr_t *)(ibp->b_data))[loc]; if (dblk == 0) continue; if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) ((ufs1_daddr_t *)(ibp->b_data))[loc]= 0; else if ((dblk == blkstofrags(fs, blkno) && ffs_snapblkfree(fs, ITODEVVP(ip), dblk, fs->fs_bsize, ip->i_number, vp->v_type, NULL))) { ip->i_din1->di_blocks -= btodb(fs->fs_bsize); ((ufs1_daddr_t *)(ibp->b_data))[loc]= 0; } continue; } dblk = ((ufs2_daddr_t *)(ibp->b_data))[loc]; if (dblk == 0) continue; if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) ((ufs2_daddr_t *)(ibp->b_data))[loc] = 0; else if ((dblk == blkstofrags(fs, blkno) && ffs_snapblkfree(fs, ITODEVVP(ip), dblk, fs->fs_bsize, ip->i_number, vp->v_type, NULL))) { ip->i_din2->di_blocks -= btodb(fs->fs_bsize); ((ufs2_daddr_t *)(ibp->b_data))[loc] = 0; } } bawrite(ibp); } /* * Clear snapshot flag and drop reference. */ ip->i_flags &= ~SF_SNAPSHOT; DIP_SET(ip, i_flags, ip->i_flags); UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE); /* * The dirtied indirects must be written out before * softdep_setup_freeblocks() is called. Otherwise indir_trunc() * may find indirect pointers using the magic BLK_* values. */ if (DOINGSOFTDEP(vp)) ffs_syncvnode(vp, MNT_WAIT, 0); #ifdef QUOTA /* * Reenable disk quotas for ex-snapshot file. */ if (!getinoquota(ip)) (void) chkdq(ip, DIP(ip, i_blocks), KERNCRED, FORCE); #endif } /* * Notification that a block is being freed. Return zero if the free * should be allowed to proceed. Return non-zero if the snapshot file * wants to claim the block. The block will be claimed if it is an * uncopied part of one of the snapshots. It will be freed if it is * either a BLK_NOCOPY or has already been copied in all of the snapshots. * If a fragment is being freed, then all snapshots that care about * it must make a copy since a snapshot file can only claim full sized * blocks. Note that if more than one snapshot file maps the block, * we can pick one at random to claim it. Since none of the snapshots * can change, we are assurred that they will all see the same unmodified * image. When deleting a snapshot file (see ffs_snapremove above), we * must push any of these claimed blocks to one of the other snapshots * that maps it. These claimed blocks are easily identified as they will * have a block number equal to their logical block number within the * snapshot. A copied block can never have this property because they * must always have been allocated from a BLK_NOCOPY location. */ int ffs_snapblkfree(struct fs *fs, struct vnode *devvp, ufs2_daddr_t bno, long size, ino_t inum, __enum_uint8(vtype) vtype, struct workhead *wkhd) { struct buf *ibp, *cbp, *savedcbp = NULL; struct thread *td = curthread; struct inode *ip; struct vnode *vp = NULL; ufs_lbn_t lbn; ufs2_daddr_t blkno; int indiroff = 0, error = 0, claimedblk = 0; struct snapdata *sn; lbn = fragstoblks(fs, bno); retry: VI_LOCK(devvp); sn = devvp->v_rdev->si_snapdata; if (sn == NULL) { VI_UNLOCK(devvp); return (0); } /* * Use LK_SLEEPFAIL because sn might be freed under us while * both devvp interlock and snaplk are not owned. */ if (lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL, VI_MTX(devvp)) != 0) goto retry; TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) { vp = ITOV(ip); if (DOINGSOFTDEP(vp)) softdep_prealloc(vp, MNT_WAIT); /* * Lookup block being written. */ if (lbn < UFS_NDADDR) { blkno = DIP(ip, i_db[lbn]); } else { td->td_pflags |= TDP_COWINPROGRESS; error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); td->td_pflags &= ~TDP_COWINPROGRESS; if (error) break; indiroff = (lbn - UFS_NDADDR) % NINDIR(fs); if (I_IS_UFS1(ip)) blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff]; else blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff]; } /* * Check to see if block needs to be copied. */ if (blkno == 0) { /* * A block that we map is being freed. If it has not * been claimed yet, we will claim or copy it (below). */ claimedblk = 1; } else if (blkno == BLK_SNAP) { /* * No previous snapshot claimed the block, * so it will be freed and become a BLK_NOCOPY * (don't care) for us. */ if (claimedblk) panic("snapblkfree: inconsistent block type"); if (lbn < UFS_NDADDR) { DIP_SET(ip, i_db[lbn], BLK_NOCOPY); UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE); } else if (I_IS_UFS1(ip)) { ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY; bdwrite(ibp); } else { ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY; bdwrite(ibp); } continue; } else /* BLK_NOCOPY or default */ { /* * If the snapshot has already copied the block * (default), or does not care about the block, * it is not needed. */ if (lbn >= UFS_NDADDR) bqrelse(ibp); continue; } /* * If this is a full size block, we will just grab it * and assign it to the snapshot inode. Otherwise we * will proceed to copy it. See explanation for this * routine as to why only a single snapshot needs to * claim this block. */ if (size == fs->fs_bsize) { #ifdef DIAGNOSTIC if (snapdebug) printf("%s %ju lbn %jd from inum %ju\n", "Grabonremove: snapino", (uintmax_t)ip->i_number, (intmax_t)lbn, (uintmax_t)inum); #endif /* * If journaling is tracking this write we must add * the work to the inode or indirect being written. */ if (wkhd != NULL) { if (lbn < UFS_NDADDR) softdep_inode_append(ip, curthread->td_ucred, wkhd); else softdep_buf_append(ibp, wkhd); } if (lbn < UFS_NDADDR) { DIP_SET(ip, i_db[lbn], bno); } else if (I_IS_UFS1(ip)) { ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = bno; bdwrite(ibp); } else { ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = bno; bdwrite(ibp); } DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + btodb(size)); UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE); lockmgr(vp->v_vnlock, LK_RELEASE, NULL); return (1); } if (lbn >= UFS_NDADDR) bqrelse(ibp); /* * Allocate the block into which to do the copy. Note that this * allocation will never require any additional allocations for * the snapshot inode. */ td->td_pflags |= TDP_COWINPROGRESS; error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, 0, &cbp); td->td_pflags &= ~TDP_COWINPROGRESS; if (error) break; #ifdef DIAGNOSTIC if (snapdebug) printf("%s%ju lbn %jd %s %ju size %ld to blkno %jd\n", "Copyonremove: snapino ", (uintmax_t)ip->i_number, (intmax_t)lbn, "for inum", (uintmax_t)inum, size, (intmax_t)cbp->b_blkno); #endif /* * If we have already read the old block contents, then * simply copy them to the new block. Note that we need * to synchronously write snapshots that have not been * unlinked, and hence will be visible after a crash, * to ensure their integrity. At a minimum we ensure the * integrity of the filesystem metadata, but use the * dopersistence sysctl-setable flag to decide on the * persistence needed for file content data. */ if (savedcbp != NULL) { bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize); bawrite(cbp); if ((vtype == VDIR || dopersistence) && ip->i_effnlink > 0) (void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT); continue; } /* * Otherwise, read the old block contents into the buffer. */ if ((error = readblock(vp, cbp, lbn)) != 0) { bzero(cbp->b_data, fs->fs_bsize); bawrite(cbp); if ((vtype == VDIR || dopersistence) && ip->i_effnlink > 0) (void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT); break; } savedcbp = cbp; } /* * Note that we need to synchronously write snapshots that * have not been unlinked, and hence will be visible after * a crash, to ensure their integrity. At a minimum we * ensure the integrity of the filesystem metadata, but * use the dopersistence sysctl-setable flag to decide on * the persistence needed for file content data. */ if (savedcbp) { vp = savedcbp->b_vp; bawrite(savedcbp); if ((vtype == VDIR || dopersistence) && VTOI(vp)->i_effnlink > 0) (void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT); } /* * If we have been unable to allocate a block in which to do * the copy, then return non-zero so that the fragment will * not be freed. Although space will be lost, the snapshot * will stay consistent. */ if (error != 0 && wkhd != NULL) softdep_freework(wkhd); lockmgr(&sn->sn_lock, LK_RELEASE, NULL); return (error); } /* * Associate snapshot files when mounting. */ void ffs_snapshot_mount(struct mount *mp) { struct ufsmount *ump = VFSTOUFS(mp); struct vnode *devvp = ump->um_devvp; struct fs *fs = ump->um_fs; struct thread *td = curthread; struct snapdata *sn; struct vnode *vp; struct vnode *lastvp; struct inode *ip; struct uio auio; struct iovec aiov; void *snapblklist; char *reason; daddr_t snaplistsize; int error, snaploc, loc; /* * XXX The following needs to be set before ffs_truncate or * VOP_READ can be called. */ mp->mnt_stat.f_iosize = fs->fs_bsize; /* * Process each snapshot listed in the superblock. */ vp = NULL; lastvp = NULL; sn = NULL; for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) { if (fs->fs_snapinum[snaploc] == 0) break; if ((error = ffs_vget(mp, fs->fs_snapinum[snaploc], LK_EXCLUSIVE, &vp)) != 0){ printf("ffs_snapshot_mount: vget failed %d\n", error); continue; } ip = VTOI(vp); if (vp->v_type != VREG) { reason = "non-file snapshot"; } else if (!IS_SNAPSHOT(ip)) { reason = "non-snapshot"; } else if (ip->i_size == lblktosize(fs, howmany(fs->fs_size, fs->fs_frag))) { reason = "old format snapshot"; (void)ffs_truncate(vp, (off_t)0, 0, NOCRED); (void)ffs_syncvnode(vp, MNT_WAIT, 0); } else { reason = NULL; } if (reason != NULL) { printf("ffs_snapshot_mount: %s inode %d\n", reason, fs->fs_snapinum[snaploc]); vput(vp); vp = NULL; for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) { if (fs->fs_snapinum[loc] == 0) break; fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc]; } fs->fs_snapinum[loc - 1] = 0; snaploc--; continue; } /* * Acquire a lock on the snapdata structure, creating it if * necessary. */ sn = ffs_snapdata_acquire(devvp); /* * Change vnode to use shared snapshot lock instead of the * original private lock. */ vp->v_vnlock = &sn->sn_lock; lockmgr(&vp->v_lock, LK_RELEASE, NULL); /* * Link it onto the active snapshot list. */ VI_LOCK(devvp); if (ip->i_nextsnap.tqe_prev != 0) panic("ffs_snapshot_mount: %ju already on list", (uintmax_t)ip->i_number); else TAILQ_INSERT_TAIL(&sn->sn_head, ip, i_nextsnap); vp->v_vflag |= VV_SYSTEM; VI_UNLOCK(devvp); VOP_UNLOCK(vp); lastvp = vp; } vp = lastvp; /* * No usable snapshots found. */ if (sn == NULL || vp == NULL) return; /* * Allocate the space for the block hints list. We always want to * use the list from the newest snapshot. */ auio.uio_iov = &aiov; auio.uio_iovcnt = 1; aiov.iov_base = (void *)&snaplistsize; aiov.iov_len = sizeof(snaplistsize); auio.uio_resid = aiov.iov_len; auio.uio_offset = lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)); auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_READ; auio.uio_td = td; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) { printf("ffs_snapshot_mount: read_1 failed %d\n", error); VOP_UNLOCK(vp); return; } snapblklist = malloc(snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK); auio.uio_iovcnt = 1; aiov.iov_base = snapblklist; aiov.iov_len = snaplistsize * sizeof (daddr_t); auio.uio_resid = aiov.iov_len; auio.uio_offset -= sizeof(snaplistsize); if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) { printf("ffs_snapshot_mount: read_2 failed %d\n", error); VOP_UNLOCK(vp); free(snapblklist, M_UFSMNT); return; } VOP_UNLOCK(vp); VI_LOCK(devvp); sn->sn_listsize = snaplistsize; sn->sn_blklist = (daddr_t *)snapblklist; devvp->v_vflag |= VV_COPYONWRITE; VI_UNLOCK(devvp); } /* * Disassociate snapshot files when unmounting. */ void ffs_snapshot_unmount(struct mount *mp) { struct vnode *devvp = VFSTOUFS(mp)->um_devvp; struct snapdata *sn; struct inode *xp; struct vnode *vp; VI_LOCK(devvp); sn = devvp->v_rdev->si_snapdata; while (sn != NULL && (xp = TAILQ_FIRST(&sn->sn_head)) != NULL) { vp = ITOV(xp); TAILQ_REMOVE(&sn->sn_head, xp, i_nextsnap); xp->i_nextsnap.tqe_prev = 0; lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE, VI_MTX(devvp)); VI_LOCK(devvp); revert_snaplock(vp, devvp, sn); lockmgr(&vp->v_lock, LK_RELEASE, NULL); if (xp->i_effnlink > 0) { VI_UNLOCK(devvp); vrele(vp); VI_LOCK(devvp); } sn = devvp->v_rdev->si_snapdata; } try_free_snapdata(devvp); VI_UNLOCK(devvp); } /* * Check the buffer block to be belong to device buffer that shall be * locked after snaplk. devvp shall be locked on entry, and will be * leaved locked upon exit. */ static int ffs_bp_snapblk(struct vnode *devvp, struct buf *bp) { struct snapdata *sn; struct fs *fs; ufs2_daddr_t lbn, *snapblklist; int lower, upper, mid; ASSERT_VI_LOCKED(devvp, "ffs_bp_snapblk"); KASSERT(devvp->v_type == VCHR, ("Not a device %p", devvp)); sn = devvp->v_rdev->si_snapdata; if (sn == NULL || TAILQ_FIRST(&sn->sn_head) == NULL) return (0); fs = ITOFS(TAILQ_FIRST(&sn->sn_head)); lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno)); snapblklist = sn->sn_blklist; upper = sn->sn_listsize - 1; lower = 1; while (lower <= upper) { mid = (lower + upper) / 2; if (snapblklist[mid] == lbn) break; if (snapblklist[mid] < lbn) lower = mid + 1; else upper = mid - 1; } if (lower <= upper) return (1); return (0); } void ffs_bdflush(struct bufobj *bo, struct buf *bp) { struct thread *td; struct vnode *vp, *devvp; struct buf *nbp; int bp_bdskip; if (bo->bo_dirty.bv_cnt <= dirtybufthresh) return; td = curthread; vp = bp->b_vp; devvp = bo2vnode(bo); KASSERT(vp == devvp, ("devvp != vp %p %p", bo, bp)); VI_LOCK(devvp); bp_bdskip = ffs_bp_snapblk(devvp, bp); if (bp_bdskip) bdwriteskip++; VI_UNLOCK(devvp); if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10 && !bp_bdskip) { (void) VOP_FSYNC(vp, MNT_NOWAIT, td); altbufferflushes++; } else { BO_LOCK(bo); /* * Try to find a buffer to flush. */ TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) { if ((nbp->b_vflags & BV_BKGRDINPROG) || BUF_LOCK(nbp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) continue; if (bp == nbp) panic("bdwrite: found ourselves"); BO_UNLOCK(bo); /* * Don't countdeps with the bo lock * held. */ if (buf_countdeps(nbp, 0)) { BO_LOCK(bo); BUF_UNLOCK(nbp); continue; } if (bp_bdskip) { VI_LOCK(devvp); if (!ffs_bp_snapblk(vp, nbp)) { VI_UNLOCK(devvp); BO_LOCK(bo); BUF_UNLOCK(nbp); continue; } VI_UNLOCK(devvp); } if (nbp->b_flags & B_CLUSTEROK) { vfs_bio_awrite(nbp); } else { bremfree(nbp); bawrite(nbp); } dirtybufferflushes++; break; } if (nbp == NULL) BO_UNLOCK(bo); } } /* * Check for need to copy block that is about to be written, * copying the block if necessary. */ int ffs_copyonwrite(struct vnode *devvp, struct buf *bp) { struct snapdata *sn; struct buf *ibp, *cbp, *savedcbp = NULL; struct thread *td = curthread; struct fs *fs; struct inode *ip; struct vnode *vp = NULL; ufs2_daddr_t lbn, blkno, *snapblklist; int lower, upper, mid, indiroff, error = 0; int launched_async_io, prev_norunningbuf; long saved_runningbufspace; if (devvp != bp->b_vp && IS_SNAPSHOT(VTOI(bp->b_vp))) return (0); /* Update on a snapshot file */ if (td->td_pflags & TDP_COWINPROGRESS) panic("ffs_copyonwrite: recursive call"); /* * First check to see if it is in the preallocated list. * By doing this check we avoid several potential deadlocks. */ VI_LOCK(devvp); sn = devvp->v_rdev->si_snapdata; if (sn == NULL || TAILQ_EMPTY(&sn->sn_head)) { VI_UNLOCK(devvp); return (0); /* No snapshot */ } ip = TAILQ_FIRST(&sn->sn_head); fs = ITOFS(ip); lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno)); if (lbn < UFS_NDADDR) { VI_UNLOCK(devvp); return (0); /* Direct blocks are always copied */ } snapblklist = sn->sn_blklist; upper = sn->sn_listsize - 1; lower = 1; while (lower <= upper) { mid = (lower + upper) / 2; if (snapblklist[mid] == lbn) break; if (snapblklist[mid] < lbn) lower = mid + 1; else upper = mid - 1; } if (lower <= upper) { VI_UNLOCK(devvp); return (0); } launched_async_io = 0; prev_norunningbuf = td->td_pflags & TDP_NORUNNINGBUF; /* * Since I/O on bp isn't yet in progress and it may be blocked * for a long time waiting on snaplk, back it out of * runningbufspace, possibly waking other threads waiting for space. */ saved_runningbufspace = bp->b_runningbufspace; if (saved_runningbufspace != 0) runningbufwakeup(bp); /* * Not in the precomputed list, so check the snapshots. */ while (lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL, VI_MTX(devvp)) != 0) { VI_LOCK(devvp); sn = devvp->v_rdev->si_snapdata; if (sn == NULL || TAILQ_EMPTY(&sn->sn_head)) { VI_UNLOCK(devvp); if (saved_runningbufspace != 0) { - bp->b_runningbufspace = saved_runningbufspace; - atomic_add_long(&runningbufspace, - bp->b_runningbufspace); + (void)runningbufclaim(bp, + saved_runningbufspace); } return (0); /* Snapshot gone */ } } TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) { vp = ITOV(ip); if (DOINGSOFTDEP(vp)) softdep_prealloc(vp, MNT_WAIT); /* * We ensure that everything of our own that needs to be * copied will be done at the time that ffs_snapshot is * called. Thus we can skip the check here which can * deadlock in doing the lookup in UFS_BALLOC. */ if (bp->b_vp == vp) continue; /* * Check to see if block needs to be copied. We do not have * to hold the snapshot lock while doing this lookup as it * will never require any additional allocations for the * snapshot inode. */ if (lbn < UFS_NDADDR) { blkno = DIP(ip, i_db[lbn]); } else { td->td_pflags |= TDP_COWINPROGRESS | TDP_NORUNNINGBUF; error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); td->td_pflags &= ~TDP_COWINPROGRESS; if (error) break; indiroff = (lbn - UFS_NDADDR) % NINDIR(fs); if (I_IS_UFS1(ip)) blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff]; else blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff]; bqrelse(ibp); } #ifdef INVARIANTS if (blkno == BLK_SNAP && bp->b_lblkno >= 0) panic("ffs_copyonwrite: bad copy block"); #endif if (blkno != 0) continue; /* * Allocate the block into which to do the copy. Since * multiple processes may all try to copy the same block, * we have to recheck our need to do a copy if we sleep * waiting for the lock. * * Because all snapshots on a filesystem share a single * lock, we ensure that we will never be in competition * with another process to allocate a block. */ td->td_pflags |= TDP_COWINPROGRESS | TDP_NORUNNINGBUF; error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, 0, &cbp); td->td_pflags &= ~TDP_COWINPROGRESS; if (error) break; #ifdef DIAGNOSTIC if (snapdebug) { printf("Copyonwrite: snapino %ju lbn %jd for ", (uintmax_t)ip->i_number, (intmax_t)lbn); if (bp->b_vp == devvp) printf("fs metadata"); else printf("inum %ju", (uintmax_t)VTOI(bp->b_vp)->i_number); printf(" lblkno %jd to blkno %jd\n", (intmax_t)bp->b_lblkno, (intmax_t)cbp->b_blkno); } #endif /* * If we have already read the old block contents, then * simply copy them to the new block. Note that we need * to synchronously write snapshots that have not been * unlinked, and hence will be visible after a crash, * to ensure their integrity. At a minimum we ensure the * integrity of the filesystem metadata, but use the * dopersistence sysctl-setable flag to decide on the * persistence needed for file content data. */ if (savedcbp != NULL) { bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize); bawrite(cbp); if ((devvp == bp->b_vp || bp->b_vp->v_type == VDIR || dopersistence) && ip->i_effnlink > 0) (void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT); else launched_async_io = 1; continue; } /* * Otherwise, read the old block contents into the buffer. */ if ((error = readblock(vp, cbp, lbn)) != 0) { bzero(cbp->b_data, fs->fs_bsize); bawrite(cbp); if ((devvp == bp->b_vp || bp->b_vp->v_type == VDIR || dopersistence) && ip->i_effnlink > 0) (void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT); else launched_async_io = 1; break; } savedcbp = cbp; } /* * Note that we need to synchronously write snapshots that * have not been unlinked, and hence will be visible after * a crash, to ensure their integrity. At a minimum we * ensure the integrity of the filesystem metadata, but * use the dopersistence sysctl-setable flag to decide on * the persistence needed for file content data. */ if (savedcbp) { vp = savedcbp->b_vp; bawrite(savedcbp); if ((devvp == bp->b_vp || bp->b_vp->v_type == VDIR || dopersistence) && VTOI(vp)->i_effnlink > 0) (void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT); else launched_async_io = 1; } lockmgr(vp->v_vnlock, LK_RELEASE, NULL); td->td_pflags = (td->td_pflags & ~TDP_NORUNNINGBUF) | prev_norunningbuf; if (launched_async_io && (td->td_pflags & TDP_NORUNNINGBUF) == 0) waitrunningbufspace(); /* * I/O on bp will now be started, so count it in runningbufspace. */ - if (saved_runningbufspace != 0) { - bp->b_runningbufspace = saved_runningbufspace; - atomic_add_long(&runningbufspace, bp->b_runningbufspace); - } + if (saved_runningbufspace != 0) + (void)runningbufclaim(bp, saved_runningbufspace); return (error); } /* * sync snapshots to force freework records waiting on snapshots to claim * blocks to free. */ void ffs_sync_snap(struct mount *mp, int waitfor) { struct snapdata *sn; struct vnode *devvp; struct vnode *vp; struct inode *ip; devvp = VFSTOUFS(mp)->um_devvp; if ((devvp->v_vflag & VV_COPYONWRITE) == 0) return; for (;;) { VI_LOCK(devvp); sn = devvp->v_rdev->si_snapdata; if (sn == NULL) { VI_UNLOCK(devvp); return; } if (lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL, VI_MTX(devvp)) == 0) break; } TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) { vp = ITOV(ip); ffs_syncvnode(vp, waitfor, NO_INO_UPDT); } lockmgr(&sn->sn_lock, LK_RELEASE, NULL); } /* * Read the specified block into the given buffer. * Much of this boiler-plate comes from bwrite(). */ static int readblock(struct vnode *vp, struct buf *bp, ufs2_daddr_t lbn) { struct inode *ip; struct fs *fs; ip = VTOI(vp); fs = ITOFS(ip); bp->b_iocmd = BIO_READ; bp->b_iooffset = dbtob(fsbtodb(fs, blkstofrags(fs, lbn))); bp->b_iodone = bdone; g_vfs_strategy(&ITODEVVP(ip)->v_bufobj, bp); bufwait(bp); return (bp->b_error); } #endif /* * Process file deletes that were deferred by ufs_inactive() due to * the file system being suspended. Transfer IN_LAZYACCESS into * IN_MODIFIED for vnodes that were accessed during suspension. */ void process_deferred_inactive(struct mount *mp) { struct vnode *vp, *mvp; struct inode *ip; int error; (void) vn_start_secondary_write(NULL, &mp, V_WAIT); loop: MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { /* * IN_LAZYACCESS is checked here without holding any * vnode lock, but this flag is set only while holding * vnode interlock. */ if (vp->v_type == VNON || ((VTOI(vp)->i_flag & IN_LAZYACCESS) == 0 && ((vp->v_iflag & VI_OWEINACT) == 0 || vp->v_usecount > 0))) { VI_UNLOCK(vp); continue; } vholdl(vp); retry_vnode: error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK); if (error != 0) { vdrop(vp); if (error == ENOENT) continue; /* vnode recycled */ MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); goto loop; } ip = VTOI(vp); if ((ip->i_flag & IN_LAZYACCESS) != 0) { ip->i_flag &= ~IN_LAZYACCESS; UFS_INODE_SET_FLAG(ip, IN_MODIFIED); } VI_LOCK(vp); error = vinactive(vp); if (error == ERELOOKUP && vp->v_usecount == 0) { VI_UNLOCK(vp); VOP_UNLOCK(vp); goto retry_vnode; } VI_UNLOCK(vp); VOP_UNLOCK(vp); vdrop(vp); } vn_finished_secondary_write(mp); } #ifndef NO_FFS_SNAPSHOT static struct snapdata * ffs_snapdata_alloc(void) { struct snapdata *sn; /* * Fetch a snapdata from the free list if there is one available. */ mtx_lock(&snapfree_lock); sn = LIST_FIRST(&snapfree); if (sn != NULL) LIST_REMOVE(sn, sn_link); mtx_unlock(&snapfree_lock); if (sn != NULL) return (sn); /* * If there were no free snapdatas allocate one. */ sn = malloc(sizeof *sn, M_UFSMNT, M_WAITOK | M_ZERO); TAILQ_INIT(&sn->sn_head); lockinit(&sn->sn_lock, PVFS, "snaplk", VLKTIMEOUT, LK_CANRECURSE | LK_NOSHARE); return (sn); } /* * The snapdata is never freed because we can not be certain that * there are no threads sleeping on the snap lock. Persisting * them permanently avoids costly synchronization in ffs_lock(). */ static void ffs_snapdata_free(struct snapdata *sn) { mtx_lock(&snapfree_lock); LIST_INSERT_HEAD(&snapfree, sn, sn_link); mtx_unlock(&snapfree_lock); } /* Try to free snapdata associated with devvp */ static void try_free_snapdata(struct vnode *devvp) { struct snapdata *sn; ufs2_daddr_t *snapblklist; ASSERT_VI_LOCKED(devvp, "try_free_snapdata"); sn = devvp->v_rdev->si_snapdata; if (sn == NULL || TAILQ_FIRST(&sn->sn_head) != NULL || (devvp->v_vflag & VV_COPYONWRITE) == 0) return; devvp->v_rdev->si_snapdata = NULL; devvp->v_vflag &= ~VV_COPYONWRITE; lockmgr(&sn->sn_lock, LK_DRAIN|LK_INTERLOCK, VI_MTX(devvp)); snapblklist = sn->sn_blklist; sn->sn_blklist = NULL; sn->sn_listsize = 0; lockmgr(&sn->sn_lock, LK_RELEASE, NULL); if (snapblklist != NULL) free(snapblklist, M_UFSMNT); ffs_snapdata_free(sn); VI_LOCK(devvp); } /* * Revert a vnode lock from using the snapshot lock back to its own lock. * * Aquire a lock on the vnode's own lock and release the lock on the * snapshot lock. If there are any recursions on the snapshot lock * get the same number of recursions on the vnode's own lock. */ static void revert_snaplock(struct vnode *vp, struct vnode *devvp, struct snapdata *sn) { int i; ASSERT_VI_LOCKED(devvp, "revert_snaplock"); /* * Avoid LOR with snapshot lock. The LK_NOWAIT should * never fail as the lock is currently unused. Rather than * panic, we recover by doing the blocking lock. */ for (i = 0; i <= sn->sn_lock.lk_recurse; i++) { if (lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, VI_MTX(devvp)) != 0) { printf("revert_snaplock: Unexpected LK_NOWAIT " "failure\n"); lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_INTERLOCK, VI_MTX(devvp)); } VI_LOCK(devvp); } KASSERT(vp->v_vnlock == &sn->sn_lock, ("revert_snaplock: lost lock mutation")); vp->v_vnlock = &vp->v_lock; while (sn->sn_lock.lk_recurse > 0) lockmgr(&sn->sn_lock, LK_RELEASE, NULL); lockmgr(&sn->sn_lock, LK_RELEASE, NULL); } static struct snapdata * ffs_snapdata_acquire(struct vnode *devvp) { struct snapdata *nsn, *sn; int error; /* * Allocate a free snapdata. This is done before acquiring the * devvp lock to avoid allocation while the devvp interlock is * held. */ nsn = ffs_snapdata_alloc(); for (;;) { VI_LOCK(devvp); sn = devvp->v_rdev->si_snapdata; if (sn == NULL) { /* * This is the first snapshot on this * filesystem and we use our pre-allocated * snapdata. Publish sn with the sn_lock * owned by us, to avoid the race. */ error = lockmgr(&nsn->sn_lock, LK_EXCLUSIVE | LK_NOWAIT, NULL); if (error != 0) panic("leaked sn, lockmgr error %d", error); sn = devvp->v_rdev->si_snapdata = nsn; VI_UNLOCK(devvp); nsn = NULL; break; } /* * There is a snapshots which already exists on this * filesystem, grab a reference to the common lock. */ error = lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL, VI_MTX(devvp)); if (error == 0) break; } /* * Free any unused snapdata. */ if (nsn != NULL) ffs_snapdata_free(nsn); return (sn); } #endif diff --git a/sys/ufs/ffs/ffs_vfsops.c b/sys/ufs/ffs/ffs_vfsops.c index 14046de1e7b3..c56603752a50 100644 --- a/sys/ufs/ffs/ffs_vfsops.c +++ b/sys/ufs/ffs/ffs_vfsops.c @@ -1,2613 +1,2611 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1991, 1993, 1994 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_vfsops.c 8.31 (Berkeley) 5/20/95 */ #include #include "opt_quota.h" #include "opt_ufs.h" #include "opt_ffs.h" #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static uma_zone_t uma_inode, uma_ufs1, uma_ufs2; VFS_SMR_DECLARE; static int ffs_mountfs(struct vnode *, struct mount *, struct thread *); static void ffs_oldfscompat_read(struct fs *, struct ufsmount *, ufs2_daddr_t); static void ffs_ifree(struct ufsmount *ump, struct inode *ip); static int ffs_sync_lazy(struct mount *mp); static int ffs_use_bread(void *devfd, off_t loc, void **bufp, int size); static int ffs_use_bwrite(void *devfd, off_t loc, void *buf, int size); static vfs_init_t ffs_init; static vfs_uninit_t ffs_uninit; static vfs_extattrctl_t ffs_extattrctl; static vfs_cmount_t ffs_cmount; static vfs_unmount_t ffs_unmount; static vfs_mount_t ffs_mount; static vfs_statfs_t ffs_statfs; static vfs_fhtovp_t ffs_fhtovp; static vfs_sync_t ffs_sync; static struct vfsops ufs_vfsops = { .vfs_extattrctl = ffs_extattrctl, .vfs_fhtovp = ffs_fhtovp, .vfs_init = ffs_init, .vfs_mount = ffs_mount, .vfs_cmount = ffs_cmount, .vfs_quotactl = ufs_quotactl, .vfs_root = vfs_cache_root, .vfs_cachedroot = ufs_root, .vfs_statfs = ffs_statfs, .vfs_sync = ffs_sync, .vfs_uninit = ffs_uninit, .vfs_unmount = ffs_unmount, .vfs_vget = ffs_vget, .vfs_susp_clean = process_deferred_inactive, }; VFS_SET(ufs_vfsops, ufs, 0); MODULE_VERSION(ufs, 1); static b_strategy_t ffs_geom_strategy; static b_write_t ffs_bufwrite; static struct buf_ops ffs_ops = { .bop_name = "FFS", .bop_write = ffs_bufwrite, .bop_strategy = ffs_geom_strategy, .bop_sync = bufsync, #ifdef NO_FFS_SNAPSHOT .bop_bdflush = bufbdflush, #else .bop_bdflush = ffs_bdflush, #endif }; /* * Note that userquota and groupquota options are not currently used * by UFS/FFS code and generally mount(8) does not pass those options * from userland, but they can be passed by loader(8) via * vfs.root.mountfrom.options. */ static const char *ffs_opts[] = { "acls", "async", "noatime", "noclusterr", "noclusterw", "noexec", "export", "force", "from", "groupquota", "multilabel", "nfsv4acls", "snapshot", "nosuid", "suiddir", "nosymfollow", "sync", "union", "userquota", "untrusted", NULL }; static int ffs_enxio_enable = 1; SYSCTL_DECL(_vfs_ffs); SYSCTL_INT(_vfs_ffs, OID_AUTO, enxio_enable, CTLFLAG_RWTUN, &ffs_enxio_enable, 0, "enable mapping of other disk I/O errors to ENXIO"); /* * Return buffer with the contents of block "offset" from the beginning of * directory "ip". If "res" is non-zero, fill it in with a pointer to the * remaining space in the directory. */ static int ffs_blkatoff(struct vnode *vp, off_t offset, char **res, struct buf **bpp) { struct inode *ip; struct fs *fs; struct buf *bp; ufs_lbn_t lbn; int bsize, error; ip = VTOI(vp); fs = ITOFS(ip); lbn = lblkno(fs, offset); bsize = blksize(fs, ip, lbn); *bpp = NULL; error = bread(vp, lbn, bsize, NOCRED, &bp); if (error) { return (error); } if (res) *res = (char *)bp->b_data + blkoff(fs, offset); *bpp = bp; return (0); } /* * Load up the contents of an inode and copy the appropriate pieces * to the incore copy. */ static int ffs_load_inode(struct buf *bp, struct inode *ip, struct fs *fs, ino_t ino) { struct ufs1_dinode *dip1; struct ufs2_dinode *dip2; int error; if (I_IS_UFS1(ip)) { dip1 = ip->i_din1; *dip1 = *((struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, ino)); ip->i_mode = dip1->di_mode; ip->i_nlink = dip1->di_nlink; ip->i_effnlink = dip1->di_nlink; ip->i_size = dip1->di_size; ip->i_flags = dip1->di_flags; ip->i_gen = dip1->di_gen; ip->i_uid = dip1->di_uid; ip->i_gid = dip1->di_gid; return (0); } dip2 = ((struct ufs2_dinode *)bp->b_data + ino_to_fsbo(fs, ino)); if ((error = ffs_verify_dinode_ckhash(fs, dip2)) != 0 && !ffs_fsfail_cleanup(ITOUMP(ip), error)) { printf("%s: inode %jd: check-hash failed\n", fs->fs_fsmnt, (intmax_t)ino); return (error); } *ip->i_din2 = *dip2; dip2 = ip->i_din2; ip->i_mode = dip2->di_mode; ip->i_nlink = dip2->di_nlink; ip->i_effnlink = dip2->di_nlink; ip->i_size = dip2->di_size; ip->i_flags = dip2->di_flags; ip->i_gen = dip2->di_gen; ip->i_uid = dip2->di_uid; ip->i_gid = dip2->di_gid; return (0); } /* * Verify that a filesystem block number is a valid data block. * This routine is only called on untrusted filesystems. */ static int ffs_check_blkno(struct mount *mp, ino_t inum, ufs2_daddr_t daddr, int blksize) { struct fs *fs; struct ufsmount *ump; ufs2_daddr_t end_daddr; int cg, havemtx; KASSERT((mp->mnt_flag & MNT_UNTRUSTED) != 0, ("ffs_check_blkno called on a trusted file system")); ump = VFSTOUFS(mp); fs = ump->um_fs; cg = dtog(fs, daddr); end_daddr = daddr + numfrags(fs, blksize); /* * Verify that the block number is a valid data block. Also check * that it does not point to an inode block or a superblock. Accept * blocks that are unalloacted (0) or part of snapshot metadata * (BLK_NOCOPY or BLK_SNAP). * * Thus, the block must be in a valid range for the filesystem and * either in the space before a backup superblock (except the first * cylinder group where that space is used by the bootstrap code) or * after the inode blocks and before the end of the cylinder group. */ if ((uint64_t)daddr <= BLK_SNAP || ((uint64_t)end_daddr <= fs->fs_size && ((cg > 0 && end_daddr <= cgsblock(fs, cg)) || (daddr >= cgdmin(fs, cg) && end_daddr <= cgbase(fs, cg) + fs->fs_fpg)))) return (0); if ((havemtx = mtx_owned(UFS_MTX(ump))) == 0) UFS_LOCK(ump); if (ppsratecheck(&ump->um_last_integritymsg, &ump->um_secs_integritymsg, 1)) { UFS_UNLOCK(ump); uprintf("\n%s: inode %jd, out-of-range indirect block " "number %jd\n", mp->mnt_stat.f_mntonname, inum, daddr); if (havemtx) UFS_LOCK(ump); } else if (!havemtx) UFS_UNLOCK(ump); return (EINTEGRITY); } /* * On first ENXIO error, initiate an asynchronous forcible unmount. * Used to unmount filesystems whose underlying media has gone away. * * Return true if a cleanup is in progress. */ int ffs_fsfail_cleanup(struct ufsmount *ump, int error) { int retval; UFS_LOCK(ump); retval = ffs_fsfail_cleanup_locked(ump, error); UFS_UNLOCK(ump); return (retval); } int ffs_fsfail_cleanup_locked(struct ufsmount *ump, int error) { mtx_assert(UFS_MTX(ump), MA_OWNED); if (error == ENXIO && (ump->um_flags & UM_FSFAIL_CLEANUP) == 0) { ump->um_flags |= UM_FSFAIL_CLEANUP; if (ump->um_mountp == rootvnode->v_mount) panic("UFS: root fs would be forcibly unmounted"); /* * Queue an async forced unmount. */ vfs_ref(ump->um_mountp); dounmount(ump->um_mountp, MNT_FORCE | MNT_RECURSE | MNT_DEFERRED, curthread); printf("UFS: forcibly unmounting %s from %s\n", ump->um_mountp->mnt_stat.f_mntfromname, ump->um_mountp->mnt_stat.f_mntonname); } return ((ump->um_flags & UM_FSFAIL_CLEANUP) != 0); } /* * Wrapper used during ENXIO cleanup to allocate empty buffers when * the kernel is unable to read the real one. They are needed so that * the soft updates code can use them to unwind its dependencies. */ int ffs_breadz(struct ufsmount *ump, struct vnode *vp, daddr_t lblkno, daddr_t dblkno, int size, daddr_t *rablkno, int *rabsize, int cnt, struct ucred *cred, int flags, void (*ckhashfunc)(struct buf *), struct buf **bpp) { int error; flags |= GB_CVTENXIO; error = breadn_flags(vp, lblkno, dblkno, size, rablkno, rabsize, cnt, cred, flags, ckhashfunc, bpp); if (error != 0 && ffs_fsfail_cleanup(ump, error)) { error = getblkx(vp, lblkno, dblkno, size, 0, 0, flags, bpp); KASSERT(error == 0, ("getblkx failed")); vfs_bio_bzero_buf(*bpp, 0, size); } return (error); } static int ffs_mount(struct mount *mp) { struct vnode *devvp, *odevvp; struct thread *td; struct ufsmount *ump = NULL; struct fs *fs; int error, flags; int error1 __diagused; uint64_t mntorflags, saved_mnt_flag; accmode_t accmode; struct nameidata ndp; char *fspec; bool mounted_softdep; td = curthread; if (vfs_filteropt(mp->mnt_optnew, ffs_opts)) return (EINVAL); if (uma_inode == NULL) { uma_inode = uma_zcreate("FFS inode", sizeof(struct inode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_ufs1 = uma_zcreate("FFS1 dinode", sizeof(struct ufs1_dinode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_ufs2 = uma_zcreate("FFS2 dinode", sizeof(struct ufs2_dinode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); VFS_SMR_ZONE_SET(uma_inode); } vfs_deleteopt(mp->mnt_optnew, "groupquota"); vfs_deleteopt(mp->mnt_optnew, "userquota"); fspec = vfs_getopts(mp->mnt_optnew, "from", &error); if (error) return (error); mntorflags = 0; if (vfs_getopt(mp->mnt_optnew, "untrusted", NULL, NULL) == 0) mntorflags |= MNT_UNTRUSTED; if (vfs_getopt(mp->mnt_optnew, "acls", NULL, NULL) == 0) mntorflags |= MNT_ACLS; if (vfs_getopt(mp->mnt_optnew, "snapshot", NULL, NULL) == 0) { mntorflags |= MNT_SNAPSHOT; /* * Once we have set the MNT_SNAPSHOT flag, do not * persist "snapshot" in the options list. */ vfs_deleteopt(mp->mnt_optnew, "snapshot"); vfs_deleteopt(mp->mnt_opt, "snapshot"); } if (vfs_getopt(mp->mnt_optnew, "nfsv4acls", NULL, NULL) == 0) { if (mntorflags & MNT_ACLS) { vfs_mount_error(mp, "\"acls\" and \"nfsv4acls\" options " "are mutually exclusive"); return (EINVAL); } mntorflags |= MNT_NFS4ACLS; } MNT_ILOCK(mp); mp->mnt_kern_flag &= ~MNTK_FPLOOKUP; mp->mnt_flag |= mntorflags; MNT_IUNLOCK(mp); /* * If this is a snapshot request, take the snapshot. */ if (mp->mnt_flag & MNT_SNAPSHOT) { if ((mp->mnt_flag & MNT_UPDATE) == 0) return (EINVAL); return (ffs_snapshot(mp, fspec)); } /* * Must not call namei() while owning busy ref. */ if (mp->mnt_flag & MNT_UPDATE) vfs_unbusy(mp); /* * Not an update, or updating the name: look up the name * and verify that it refers to a sensible disk device. */ NDINIT(&ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspec); error = namei(&ndp); if ((mp->mnt_flag & MNT_UPDATE) != 0) { /* * Unmount does not start if MNT_UPDATE is set. Mount * update busies mp before setting MNT_UPDATE. We * must be able to retain our busy ref successfully, * without sleep. */ error1 = vfs_busy(mp, MBF_NOWAIT); MPASS(error1 == 0); } if (error != 0) return (error); NDFREE_PNBUF(&ndp); if (!vn_isdisk_error(ndp.ni_vp, &error)) { vput(ndp.ni_vp); return (error); } /* * If mount by non-root, then verify that user has necessary * permissions on the device. */ accmode = VREAD; if ((mp->mnt_flag & MNT_RDONLY) == 0) accmode |= VWRITE; error = VOP_ACCESS(ndp.ni_vp, accmode, td->td_ucred, td); if (error) error = priv_check(td, PRIV_VFS_MOUNT_PERM); if (error) { vput(ndp.ni_vp); return (error); } /* * New mount * * We need the name for the mount point (also used for * "last mounted on") copied in. If an error occurs, * the mount point is discarded by the upper level code. * Note that vfs_mount_alloc() populates f_mntonname for us. */ if ((mp->mnt_flag & MNT_UPDATE) == 0) { if ((error = ffs_mountfs(ndp.ni_vp, mp, td)) != 0) { vrele(ndp.ni_vp); return (error); } } else { /* * When updating, check whether changing from read-only to * read/write; if there is no device name, that's all we do. */ ump = VFSTOUFS(mp); fs = ump->um_fs; odevvp = ump->um_odevvp; devvp = ump->um_devvp; /* * If it's not the same vnode, or at least the same device * then it's not correct. */ if (ndp.ni_vp->v_rdev != ump->um_odevvp->v_rdev) error = EINVAL; /* needs translation */ vput(ndp.ni_vp); if (error) return (error); if (fs->fs_ronly == 0 && vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) { /* * Flush any dirty data and suspend filesystem. */ if ((error = vn_start_write(NULL, &mp, V_WAIT)) != 0) return (error); error = vfs_write_suspend_umnt(mp); if (error != 0) return (error); fs->fs_ronly = 1; if (MOUNTEDSOFTDEP(mp)) { MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_SOFTDEP; MNT_IUNLOCK(mp); mounted_softdep = true; } else mounted_softdep = false; /* * Check for and optionally get rid of files open * for writing. */ flags = WRITECLOSE; if (mp->mnt_flag & MNT_FORCE) flags |= FORCECLOSE; if (mounted_softdep) { error = softdep_flushfiles(mp, flags, td); } else { error = ffs_flushfiles(mp, flags, td); } if (error) { fs->fs_ronly = 0; if (mounted_softdep) { MNT_ILOCK(mp); mp->mnt_flag |= MNT_SOFTDEP; MNT_IUNLOCK(mp); } vfs_write_resume(mp, 0); return (error); } if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { printf("WARNING: %s Update error: blocks %jd " "files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0) fs->fs_clean = 1; if ((error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) { fs->fs_ronly = 0; fs->fs_clean = 0; if (mounted_softdep) { MNT_ILOCK(mp); mp->mnt_flag |= MNT_SOFTDEP; MNT_IUNLOCK(mp); } vfs_write_resume(mp, 0); return (error); } if (mounted_softdep) softdep_unmount(mp); g_topology_lock(); /* * Drop our write and exclusive access. */ g_access(ump->um_cp, 0, -1, -1); g_topology_unlock(); MNT_ILOCK(mp); mp->mnt_flag |= MNT_RDONLY; MNT_IUNLOCK(mp); /* * Allow the writers to note that filesystem * is ro now. */ vfs_write_resume(mp, 0); } if ((mp->mnt_flag & MNT_RELOAD) && (error = ffs_reload(mp, 0)) != 0) return (error); if (fs->fs_ronly && !vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) { /* * If upgrade to read-write by non-root, then verify * that user has necessary permissions on the device. */ vn_lock(odevvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_ACCESS(odevvp, VREAD | VWRITE, td->td_ucred, td); if (error) error = priv_check(td, PRIV_VFS_MOUNT_PERM); VOP_UNLOCK(odevvp); if (error) { return (error); } fs->fs_flags &= ~FS_UNCLEAN; if (fs->fs_clean == 0) { fs->fs_flags |= FS_UNCLEAN; if ((mp->mnt_flag & MNT_FORCE) || ((fs->fs_flags & (FS_SUJ | FS_NEEDSFSCK)) == 0 && (fs->fs_flags & FS_DOSOFTDEP))) { printf("WARNING: %s was not properly " "dismounted\n", mp->mnt_stat.f_mntonname); } else { vfs_mount_error(mp, "R/W mount of %s denied. %s.%s", mp->mnt_stat.f_mntonname, "Filesystem is not clean - run fsck", (fs->fs_flags & FS_SUJ) == 0 ? "" : " Forced mount will invalidate" " journal contents"); return (EPERM); } } g_topology_lock(); /* * Request exclusive write access. */ error = g_access(ump->um_cp, 0, 1, 1); g_topology_unlock(); if (error) return (error); if ((error = vn_start_write(NULL, &mp, V_WAIT)) != 0) return (error); error = vfs_write_suspend_umnt(mp); if (error != 0) return (error); fs->fs_ronly = 0; MNT_ILOCK(mp); saved_mnt_flag = MNT_RDONLY; if (MOUNTEDSOFTDEP(mp) && (mp->mnt_flag & MNT_ASYNC) != 0) saved_mnt_flag |= MNT_ASYNC; mp->mnt_flag &= ~saved_mnt_flag; MNT_IUNLOCK(mp); fs->fs_mtime = time_second; /* check to see if we need to start softdep */ if ((fs->fs_flags & FS_DOSOFTDEP) && (error = softdep_mount(devvp, mp, fs, td->td_ucred))){ fs->fs_ronly = 1; MNT_ILOCK(mp); mp->mnt_flag |= saved_mnt_flag; MNT_IUNLOCK(mp); vfs_write_resume(mp, 0); return (error); } fs->fs_clean = 0; if ((error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) { fs->fs_ronly = 1; if ((fs->fs_flags & FS_DOSOFTDEP) != 0) softdep_unmount(mp); MNT_ILOCK(mp); mp->mnt_flag |= saved_mnt_flag; MNT_IUNLOCK(mp); vfs_write_resume(mp, 0); return (error); } if (fs->fs_snapinum[0] != 0) ffs_snapshot_mount(mp); vfs_write_resume(mp, 0); } /* * Soft updates is incompatible with "async", * so if we are doing softupdates stop the user * from setting the async flag in an update. * Softdep_mount() clears it in an initial mount * or ro->rw remount. */ if (MOUNTEDSOFTDEP(mp)) { /* XXX: Reset too late ? */ MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_ASYNC; MNT_IUNLOCK(mp); } /* * Keep MNT_ACLS flag if it is stored in superblock. */ if ((fs->fs_flags & FS_ACLS) != 0) { /* XXX: Set too late ? */ MNT_ILOCK(mp); mp->mnt_flag |= MNT_ACLS; MNT_IUNLOCK(mp); } if ((fs->fs_flags & FS_NFS4ACLS) != 0) { /* XXX: Set too late ? */ MNT_ILOCK(mp); mp->mnt_flag |= MNT_NFS4ACLS; MNT_IUNLOCK(mp); } } MNT_ILOCK(mp); /* * This is racy versus lookup, see ufs_fplookup_vexec for details. */ if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) != 0) panic("MNTK_FPLOOKUP set on mount %p when it should not be", mp); if ((mp->mnt_flag & (MNT_ACLS | MNT_NFS4ACLS | MNT_UNION)) == 0) mp->mnt_kern_flag |= MNTK_FPLOOKUP; MNT_IUNLOCK(mp); vfs_mountedfrom(mp, fspec); return (0); } /* * Compatibility with old mount system call. */ static int ffs_cmount(struct mntarg *ma, void *data, uint64_t flags) { struct ufs_args args; int error; if (data == NULL) return (EINVAL); error = copyin(data, &args, sizeof args); if (error) return (error); ma = mount_argsu(ma, "from", args.fspec, MAXPATHLEN); ma = mount_arg(ma, "export", &args.export, sizeof(args.export)); error = kernel_mount(ma, flags); return (error); } /* * Reload all incore data for a filesystem (used after running fsck on * the root filesystem and finding things to fix). If the 'force' flag * is 0, the filesystem must be mounted read-only. * * Things to do to update the mount: * 1) invalidate all cached meta-data. * 2) re-read superblock from disk. * 3) re-read summary information from disk. * 4) invalidate all inactive vnodes. * 5) clear MNTK_SUSPEND2 and MNTK_SUSPENDED flags, allowing secondary * writers, if requested. * 6) invalidate all cached file data. * 7) re-read inode data for all active vnodes. */ int ffs_reload(struct mount *mp, int flags) { struct vnode *vp, *mvp, *devvp; struct inode *ip; void *space; struct buf *bp; struct fs *fs, *newfs; struct ufsmount *ump; ufs2_daddr_t sblockloc; int i, blks, error; uint64_t size; int32_t *lp; ump = VFSTOUFS(mp); MNT_ILOCK(mp); if ((mp->mnt_flag & MNT_RDONLY) == 0 && (flags & FFSR_FORCE) == 0) { MNT_IUNLOCK(mp); return (EINVAL); } MNT_IUNLOCK(mp); /* * Step 1: invalidate all cached meta-data. */ devvp = VFSTOUFS(mp)->um_devvp; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); if (vinvalbuf(devvp, 0, 0, 0) != 0) panic("ffs_reload: dirty1"); VOP_UNLOCK(devvp); /* * Step 2: re-read superblock from disk. */ fs = VFSTOUFS(mp)->um_fs; if ((error = bread(devvp, btodb(fs->fs_sblockloc), fs->fs_sbsize, NOCRED, &bp)) != 0) return (error); newfs = (struct fs *)bp->b_data; if ((newfs->fs_magic != FS_UFS1_MAGIC && newfs->fs_magic != FS_UFS2_MAGIC) || newfs->fs_bsize > MAXBSIZE || newfs->fs_bsize < sizeof(struct fs)) { brelse(bp); return (EINTEGRITY); } /* * Preserve the summary information, read-only status, and * superblock location by copying these fields into our new * superblock before using it to update the existing superblock. */ newfs->fs_si = fs->fs_si; newfs->fs_ronly = fs->fs_ronly; sblockloc = fs->fs_sblockloc; bcopy(newfs, fs, (uint64_t)fs->fs_sbsize); brelse(bp); ump->um_bsize = fs->fs_bsize; ump->um_maxsymlinklen = fs->fs_maxsymlinklen; ffs_oldfscompat_read(fs, VFSTOUFS(mp), sblockloc); UFS_LOCK(ump); if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { printf("WARNING: %s: reload pending error: blocks %jd " "files %d\n", mp->mnt_stat.f_mntonname, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } UFS_UNLOCK(ump); /* * Step 3: re-read summary information from disk. */ size = fs->fs_cssize; blks = howmany(size, fs->fs_fsize); if (fs->fs_contigsumsize > 0) size += fs->fs_ncg * sizeof(int32_t); size += fs->fs_ncg * sizeof(uint8_t); free(fs->fs_csp, M_UFSMNT); space = malloc(size, M_UFSMNT, M_WAITOK); fs->fs_csp = space; for (i = 0; i < blks; i += fs->fs_frag) { size = fs->fs_bsize; if (i + fs->fs_frag > blks) size = (blks - i) * fs->fs_fsize; error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), size, NOCRED, &bp); if (error) return (error); bcopy(bp->b_data, space, (uint64_t)size); space = (char *)space + size; brelse(bp); } /* * We no longer know anything about clusters per cylinder group. */ if (fs->fs_contigsumsize > 0) { fs->fs_maxcluster = lp = space; for (i = 0; i < fs->fs_ncg; i++) *lp++ = fs->fs_contigsumsize; space = lp; } size = fs->fs_ncg * sizeof(uint8_t); fs->fs_contigdirs = (uint8_t *)space; bzero(fs->fs_contigdirs, size); if ((flags & FFSR_UNSUSPEND) != 0) { MNT_ILOCK(mp); mp->mnt_kern_flag &= ~(MNTK_SUSPENDED | MNTK_SUSPEND2); wakeup(&mp->mnt_flag); MNT_IUNLOCK(mp); } loop: MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { /* * Skip syncer vnode. */ if (vp->v_type == VNON) { VI_UNLOCK(vp); continue; } /* * Step 4: invalidate all cached file data. */ if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK)) { MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); goto loop; } if (vinvalbuf(vp, 0, 0, 0)) panic("ffs_reload: dirty2"); /* * Step 5: re-read inode data for all active vnodes. */ ip = VTOI(vp); error = bread(devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), (int)fs->fs_bsize, NOCRED, &bp); if (error) { vput(vp); MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); return (error); } if ((error = ffs_load_inode(bp, ip, fs, ip->i_number)) != 0) { brelse(bp); vput(vp); MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); return (error); } ip->i_effnlink = ip->i_nlink; brelse(bp); vput(vp); } return (0); } /* * Common code for mount and mountroot */ static int ffs_mountfs(struct vnode *odevvp, struct mount *mp, struct thread *td) { struct ufsmount *ump; struct fs *fs; struct cdev *dev; int error, i, len, ronly; struct ucred *cred; struct g_consumer *cp; struct mount *nmp; struct vnode *devvp; int candelete, canspeedup; fs = NULL; ump = NULL; cred = td ? td->td_ucred : NOCRED; ronly = (mp->mnt_flag & MNT_RDONLY) != 0; devvp = mntfs_allocvp(mp, odevvp); KASSERT(devvp->v_type == VCHR, ("reclaimed devvp")); dev = devvp->v_rdev; KASSERT(dev->si_snapdata == NULL, ("non-NULL snapshot data")); if (atomic_cmpset_acq_ptr((uintptr_t *)&dev->si_mountpt, 0, (uintptr_t)mp) == 0) { mntfs_freevp(devvp); return (EBUSY); } g_topology_lock(); error = g_vfs_open(devvp, &cp, "ffs", ronly ? 0 : 1); g_topology_unlock(); if (error != 0) { atomic_store_rel_ptr((uintptr_t *)&dev->si_mountpt, 0); mntfs_freevp(devvp); return (error); } dev_ref(dev); devvp->v_bufobj.bo_ops = &ffs_ops; BO_LOCK(&odevvp->v_bufobj); odevvp->v_bufobj.bo_flag |= BO_NOBUFS; BO_UNLOCK(&odevvp->v_bufobj); VOP_UNLOCK(devvp); if (dev->si_iosize_max != 0) mp->mnt_iosize_max = dev->si_iosize_max; if (mp->mnt_iosize_max > maxphys) mp->mnt_iosize_max = maxphys; if ((SBLOCKSIZE % cp->provider->sectorsize) != 0) { error = EINVAL; vfs_mount_error(mp, "Invalid sectorsize %d for superblock size %d", cp->provider->sectorsize, SBLOCKSIZE); goto out; } /* fetch the superblock and summary information */ if ((mp->mnt_flag & (MNT_ROOTFS | MNT_FORCE)) != 0) error = ffs_sbsearch(devvp, &fs, 0, M_UFSMNT, ffs_use_bread); else error = ffs_sbget(devvp, &fs, UFS_STDSB, 0, M_UFSMNT, ffs_use_bread); if (error != 0) goto out; fs->fs_flags &= ~FS_UNCLEAN; if (fs->fs_clean == 0) { fs->fs_flags |= FS_UNCLEAN; if (ronly || (mp->mnt_flag & MNT_FORCE) || ((fs->fs_flags & (FS_SUJ | FS_NEEDSFSCK)) == 0 && (fs->fs_flags & FS_DOSOFTDEP))) { printf("WARNING: %s was not properly dismounted\n", mp->mnt_stat.f_mntonname); } else { vfs_mount_error(mp, "R/W mount on %s denied. " "Filesystem is not clean - run fsck.%s", mp->mnt_stat.f_mntonname, (fs->fs_flags & FS_SUJ) == 0 ? "" : " Forced mount will invalidate journal contents"); error = EPERM; goto out; } if ((fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) && (mp->mnt_flag & MNT_FORCE)) { printf("WARNING: %s: lost blocks %jd files %d\n", mp->mnt_stat.f_mntonname, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } } if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { printf("WARNING: %s: mount pending error: blocks %jd " "files %d\n", mp->mnt_stat.f_mntonname, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } if ((fs->fs_flags & FS_GJOURNAL) != 0) { #ifdef UFS_GJOURNAL /* * Get journal provider name. */ len = 1024; mp->mnt_gjprovider = malloc((uint64_t)len, M_UFSMNT, M_WAITOK); if (g_io_getattr("GJOURNAL::provider", cp, &len, mp->mnt_gjprovider) == 0) { mp->mnt_gjprovider = realloc(mp->mnt_gjprovider, len, M_UFSMNT, M_WAITOK); MNT_ILOCK(mp); mp->mnt_flag |= MNT_GJOURNAL; MNT_IUNLOCK(mp); } else { if ((mp->mnt_flag & MNT_RDONLY) == 0) printf("WARNING: %s: GJOURNAL flag on fs " "but no gjournal provider below\n", mp->mnt_stat.f_mntonname); free(mp->mnt_gjprovider, M_UFSMNT); mp->mnt_gjprovider = NULL; } #else printf("WARNING: %s: GJOURNAL flag on fs but no " "UFS_GJOURNAL support\n", mp->mnt_stat.f_mntonname); #endif } else { mp->mnt_gjprovider = NULL; } ump = malloc(sizeof *ump, M_UFSMNT, M_WAITOK | M_ZERO); ump->um_cp = cp; ump->um_bo = &devvp->v_bufobj; ump->um_fs = fs; if (fs->fs_magic == FS_UFS1_MAGIC) { ump->um_fstype = UFS1; ump->um_balloc = ffs_balloc_ufs1; } else { ump->um_fstype = UFS2; ump->um_balloc = ffs_balloc_ufs2; } ump->um_blkatoff = ffs_blkatoff; ump->um_truncate = ffs_truncate; ump->um_update = ffs_update; ump->um_valloc = ffs_valloc; ump->um_vfree = ffs_vfree; ump->um_ifree = ffs_ifree; ump->um_rdonly = ffs_rdonly; ump->um_snapgone = ffs_snapgone; if ((mp->mnt_flag & MNT_UNTRUSTED) != 0) ump->um_check_blkno = ffs_check_blkno; else ump->um_check_blkno = NULL; mtx_init(UFS_MTX(ump), "FFS", "FFS Lock", MTX_DEF); sx_init(&ump->um_checkpath_lock, "uchpth"); ffs_oldfscompat_read(fs, ump, fs->fs_sblockloc); fs->fs_ronly = ronly; fs->fs_active = NULL; mp->mnt_data = ump; mp->mnt_stat.f_fsid.val[0] = fs->fs_id[0]; mp->mnt_stat.f_fsid.val[1] = fs->fs_id[1]; nmp = NULL; if (fs->fs_id[0] == 0 || fs->fs_id[1] == 0 || (nmp = vfs_getvfs(&mp->mnt_stat.f_fsid))) { if (nmp) vfs_rel(nmp); vfs_getnewfsid(mp); } ump->um_bsize = fs->fs_bsize; ump->um_maxsymlinklen = fs->fs_maxsymlinklen; MNT_ILOCK(mp); mp->mnt_flag |= MNT_LOCAL; MNT_IUNLOCK(mp); if ((fs->fs_flags & FS_MULTILABEL) != 0) { #ifdef MAC MNT_ILOCK(mp); mp->mnt_flag |= MNT_MULTILABEL; MNT_IUNLOCK(mp); #else printf("WARNING: %s: multilabel flag on fs but " "no MAC support\n", mp->mnt_stat.f_mntonname); #endif } if ((fs->fs_flags & FS_ACLS) != 0) { #ifdef UFS_ACL MNT_ILOCK(mp); if (mp->mnt_flag & MNT_NFS4ACLS) printf("WARNING: %s: ACLs flag on fs conflicts with " "\"nfsv4acls\" mount option; option ignored\n", mp->mnt_stat.f_mntonname); mp->mnt_flag &= ~MNT_NFS4ACLS; mp->mnt_flag |= MNT_ACLS; MNT_IUNLOCK(mp); #else printf("WARNING: %s: ACLs flag on fs but no ACLs support\n", mp->mnt_stat.f_mntonname); #endif } if ((fs->fs_flags & FS_NFS4ACLS) != 0) { #ifdef UFS_ACL MNT_ILOCK(mp); if (mp->mnt_flag & MNT_ACLS) printf("WARNING: %s: NFSv4 ACLs flag on fs conflicts " "with \"acls\" mount option; option ignored\n", mp->mnt_stat.f_mntonname); mp->mnt_flag &= ~MNT_ACLS; mp->mnt_flag |= MNT_NFS4ACLS; MNT_IUNLOCK(mp); #else printf("WARNING: %s: NFSv4 ACLs flag on fs but no " "ACLs support\n", mp->mnt_stat.f_mntonname); #endif } if ((fs->fs_flags & FS_TRIM) != 0) { len = sizeof(int); if (g_io_getattr("GEOM::candelete", cp, &len, &candelete) == 0) { if (candelete) ump->um_flags |= UM_CANDELETE; else printf("WARNING: %s: TRIM flag on fs but disk " "does not support TRIM\n", mp->mnt_stat.f_mntonname); } else { printf("WARNING: %s: TRIM flag on fs but disk does " "not confirm that it supports TRIM\n", mp->mnt_stat.f_mntonname); } if (((ump->um_flags) & UM_CANDELETE) != 0) { ump->um_trim_tq = taskqueue_create("trim", M_WAITOK, taskqueue_thread_enqueue, &ump->um_trim_tq); taskqueue_start_threads(&ump->um_trim_tq, 1, PVFS, "%s trim", mp->mnt_stat.f_mntonname); ump->um_trimhash = hashinit(MAXTRIMIO, M_TRIM, &ump->um_trimlisthashsize); } } len = sizeof(int); if (g_io_getattr("GEOM::canspeedup", cp, &len, &canspeedup) == 0) { if (canspeedup) ump->um_flags |= UM_CANSPEEDUP; } ump->um_mountp = mp; ump->um_dev = dev; ump->um_devvp = devvp; ump->um_odevvp = odevvp; ump->um_nindir = fs->fs_nindir; ump->um_bptrtodb = fs->fs_fsbtodb; ump->um_seqinc = fs->fs_frag; for (i = 0; i < MAXQUOTAS; i++) ump->um_quotas[i] = NULLVP; #ifdef UFS_EXTATTR ufs_extattr_uepm_init(&ump->um_extattr); #endif /* * Set FS local "last mounted on" information (NULL pad) */ bzero(fs->fs_fsmnt, MAXMNTLEN); strlcpy(fs->fs_fsmnt, mp->mnt_stat.f_mntonname, MAXMNTLEN); mp->mnt_stat.f_iosize = fs->fs_bsize; if (mp->mnt_flag & MNT_ROOTFS) { /* * Root mount; update timestamp in mount structure. * this will be used by the common root mount code * to update the system clock. */ mp->mnt_time = fs->fs_time; } if (ronly == 0) { fs->fs_mtime = time_second; if ((fs->fs_flags & FS_DOSOFTDEP) && (error = softdep_mount(devvp, mp, fs, cred)) != 0) { ffs_flushfiles(mp, FORCECLOSE, td); goto out; } if (fs->fs_snapinum[0] != 0) ffs_snapshot_mount(mp); fs->fs_fmod = 1; fs->fs_clean = 0; (void) ffs_sbupdate(ump, MNT_WAIT, 0); } /* * Initialize filesystem state information in mount struct. */ MNT_ILOCK(mp); mp->mnt_kern_flag |= MNTK_LOOKUP_SHARED | MNTK_EXTENDED_SHARED | MNTK_NO_IOPF | MNTK_UNMAPPED_BUFS | MNTK_USES_BCACHE; MNT_IUNLOCK(mp); #ifdef UFS_EXTATTR #ifdef UFS_EXTATTR_AUTOSTART /* * * Auto-starting does the following: * - check for /.attribute in the fs, and extattr_start if so * - for each file in .attribute, enable that file with * an attribute of the same name. * Not clear how to report errors -- probably eat them. * This would all happen while the filesystem was busy/not * available, so would effectively be "atomic". */ (void) ufs_extattr_autostart(mp, td); #endif /* !UFS_EXTATTR_AUTOSTART */ #endif /* !UFS_EXTATTR */ return (0); out: if (fs != NULL) { free(fs->fs_csp, M_UFSMNT); free(fs->fs_si, M_UFSMNT); free(fs, M_UFSMNT); } if (cp != NULL) { g_topology_lock(); g_vfs_close(cp); g_topology_unlock(); } if (ump != NULL) { mtx_destroy(UFS_MTX(ump)); sx_destroy(&ump->um_checkpath_lock); if (mp->mnt_gjprovider != NULL) { free(mp->mnt_gjprovider, M_UFSMNT); mp->mnt_gjprovider = NULL; } MPASS(ump->um_softdep == NULL); free(ump, M_UFSMNT); mp->mnt_data = NULL; } BO_LOCK(&odevvp->v_bufobj); odevvp->v_bufobj.bo_flag &= ~BO_NOBUFS; BO_UNLOCK(&odevvp->v_bufobj); atomic_store_rel_ptr((uintptr_t *)&dev->si_mountpt, 0); vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); mntfs_freevp(devvp); dev_rel(dev); return (error); } /* * A read function for use by filesystem-layer routines. */ static int ffs_use_bread(void *devfd, off_t loc, void **bufp, int size) { struct buf *bp; int error; KASSERT(*bufp == NULL, ("ffs_use_bread: non-NULL *bufp %p\n", *bufp)); *bufp = malloc(size, M_UFSMNT, M_WAITOK); if ((error = bread((struct vnode *)devfd, btodb(loc), size, NOCRED, &bp)) != 0) return (error); bcopy(bp->b_data, *bufp, size); bp->b_flags |= B_INVAL | B_NOCACHE; brelse(bp); return (0); } static int bigcgs = 0; SYSCTL_INT(_debug, OID_AUTO, bigcgs, CTLFLAG_RW, &bigcgs, 0, ""); /* * Sanity checks for loading old filesystem superblocks. * See ffs_oldfscompat_write below for unwound actions. * * XXX - Parts get retired eventually. * Unfortunately new bits get added. */ static void ffs_oldfscompat_read(struct fs *fs, struct ufsmount *ump, ufs2_daddr_t sblockloc) { off_t maxfilesize; /* * If not yet done, update fs_flags location and value of fs_sblockloc. */ if ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0) { fs->fs_flags = fs->fs_old_flags; fs->fs_old_flags |= FS_FLAGS_UPDATED; fs->fs_sblockloc = sblockloc; } /* * If not yet done, update UFS1 superblock with new wider fields. */ if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_maxbsize != fs->fs_bsize) { fs->fs_maxbsize = fs->fs_bsize; fs->fs_time = fs->fs_old_time; fs->fs_size = fs->fs_old_size; fs->fs_dsize = fs->fs_old_dsize; fs->fs_csaddr = fs->fs_old_csaddr; fs->fs_cstotal.cs_ndir = fs->fs_old_cstotal.cs_ndir; fs->fs_cstotal.cs_nbfree = fs->fs_old_cstotal.cs_nbfree; fs->fs_cstotal.cs_nifree = fs->fs_old_cstotal.cs_nifree; fs->fs_cstotal.cs_nffree = fs->fs_old_cstotal.cs_nffree; } if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_old_inodefmt < FS_44INODEFMT) { fs->fs_maxfilesize = ((uint64_t)1 << 31) - 1; fs->fs_qbmask = ~fs->fs_bmask; fs->fs_qfmask = ~fs->fs_fmask; } if (fs->fs_magic == FS_UFS1_MAGIC) { ump->um_savedmaxfilesize = fs->fs_maxfilesize; maxfilesize = (uint64_t)0x80000000 * fs->fs_bsize - 1; if (fs->fs_maxfilesize > maxfilesize) fs->fs_maxfilesize = maxfilesize; } /* Compatibility for old filesystems */ if (fs->fs_avgfilesize <= 0) fs->fs_avgfilesize = AVFILESIZ; if (fs->fs_avgfpdir <= 0) fs->fs_avgfpdir = AFPDIR; if (bigcgs) { fs->fs_save_cgsize = fs->fs_cgsize; fs->fs_cgsize = fs->fs_bsize; } } /* * Unwinding superblock updates for old filesystems. * See ffs_oldfscompat_read above for details. * * XXX - Parts get retired eventually. * Unfortunately new bits get added. */ void ffs_oldfscompat_write(struct fs *fs, struct ufsmount *ump) { /* * Copy back UFS2 updated fields that UFS1 inspects. */ if (fs->fs_magic == FS_UFS1_MAGIC) { fs->fs_old_time = fs->fs_time; fs->fs_old_cstotal.cs_ndir = fs->fs_cstotal.cs_ndir; fs->fs_old_cstotal.cs_nbfree = fs->fs_cstotal.cs_nbfree; fs->fs_old_cstotal.cs_nifree = fs->fs_cstotal.cs_nifree; fs->fs_old_cstotal.cs_nffree = fs->fs_cstotal.cs_nffree; fs->fs_maxfilesize = ump->um_savedmaxfilesize; } if (bigcgs) { fs->fs_cgsize = fs->fs_save_cgsize; fs->fs_save_cgsize = 0; } } /* * unmount system call */ static int ffs_unmount(struct mount *mp, int mntflags) { struct thread *td; struct ufsmount *ump = VFSTOUFS(mp); struct fs *fs; int error, flags, susp; #ifdef UFS_EXTATTR int e_restart; #endif flags = 0; td = curthread; fs = ump->um_fs; if (mntflags & MNT_FORCE) flags |= FORCECLOSE; susp = fs->fs_ronly == 0; #ifdef UFS_EXTATTR if ((error = ufs_extattr_stop(mp, td))) { if (error != EOPNOTSUPP) printf("WARNING: unmount %s: ufs_extattr_stop " "returned errno %d\n", mp->mnt_stat.f_mntonname, error); e_restart = 0; } else { ufs_extattr_uepm_destroy(&ump->um_extattr); e_restart = 1; } #endif if (susp) { error = vfs_write_suspend_umnt(mp); if (error != 0) goto fail1; } if (MOUNTEDSOFTDEP(mp)) error = softdep_flushfiles(mp, flags, td); else error = ffs_flushfiles(mp, flags, td); if (error != 0 && !ffs_fsfail_cleanup(ump, error)) goto fail; UFS_LOCK(ump); if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { printf("WARNING: unmount %s: pending error: blocks %jd " "files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } UFS_UNLOCK(ump); if (MOUNTEDSOFTDEP(mp)) softdep_unmount(mp); MPASS(ump->um_softdep == NULL); if (fs->fs_ronly == 0) { fs->fs_clean = fs->fs_flags & (FS_UNCLEAN|FS_NEEDSFSCK) ? 0 : 1; error = ffs_sbupdate(ump, MNT_WAIT, 0); if (ffs_fsfail_cleanup(ump, error)) error = 0; if (error != 0 && !ffs_fsfail_cleanup(ump, error)) { fs->fs_clean = 0; goto fail; } } if (susp) vfs_write_resume(mp, VR_START_WRITE); if (ump->um_trim_tq != NULL) { MPASS(ump->um_trim_inflight == 0); taskqueue_free(ump->um_trim_tq); free (ump->um_trimhash, M_TRIM); } vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY); g_topology_lock(); g_vfs_close(ump->um_cp); g_topology_unlock(); BO_LOCK(&ump->um_odevvp->v_bufobj); ump->um_odevvp->v_bufobj.bo_flag &= ~BO_NOBUFS; BO_UNLOCK(&ump->um_odevvp->v_bufobj); atomic_store_rel_ptr((uintptr_t *)&ump->um_dev->si_mountpt, 0); mntfs_freevp(ump->um_devvp); vrele(ump->um_odevvp); dev_rel(ump->um_dev); mtx_destroy(UFS_MTX(ump)); sx_destroy(&ump->um_checkpath_lock); if (mp->mnt_gjprovider != NULL) { free(mp->mnt_gjprovider, M_UFSMNT); mp->mnt_gjprovider = NULL; } free(fs->fs_csp, M_UFSMNT); free(fs->fs_si, M_UFSMNT); free(fs, M_UFSMNT); free(ump, M_UFSMNT); mp->mnt_data = NULL; if (td->td_su == mp) { td->td_su = NULL; vfs_rel(mp); } return (error); fail: if (susp) vfs_write_resume(mp, VR_START_WRITE); fail1: #ifdef UFS_EXTATTR if (e_restart) { ufs_extattr_uepm_init(&ump->um_extattr); #ifdef UFS_EXTATTR_AUTOSTART (void) ufs_extattr_autostart(mp, td); #endif } #endif return (error); } /* * Flush out all the files in a filesystem. */ int ffs_flushfiles(struct mount *mp, int flags, struct thread *td) { struct ufsmount *ump; int qerror, error; ump = VFSTOUFS(mp); qerror = 0; #ifdef QUOTA if (mp->mnt_flag & MNT_QUOTA) { int i; error = vflush(mp, 0, SKIPSYSTEM|flags, td); if (error) return (error); for (i = 0; i < MAXQUOTAS; i++) { error = quotaoff(td, mp, i); if (error != 0) { if ((flags & EARLYFLUSH) == 0) return (error); else qerror = error; } } /* * Here we fall through to vflush again to ensure that * we have gotten rid of all the system vnodes, unless * quotas must not be closed. */ } #endif /* devvp is not locked there */ if (ump->um_devvp->v_vflag & VV_COPYONWRITE) { if ((error = vflush(mp, 0, SKIPSYSTEM | flags, td)) != 0) return (error); ffs_snapshot_unmount(mp); flags |= FORCECLOSE; /* * Here we fall through to vflush again to ensure * that we have gotten rid of all the system vnodes. */ } /* * Do not close system files if quotas were not closed, to be * able to sync the remaining dquots. The freeblks softupdate * workitems might hold a reference on a dquot, preventing * quotaoff() from completing. Next round of * softdep_flushworklist() iteration should process the * blockers, allowing the next run of quotaoff() to finally * flush held dquots. * * Otherwise, flush all the files. */ if (qerror == 0 && (error = vflush(mp, 0, flags, td)) != 0) return (error); /* * If this is a forcible unmount and there were any files that * were unlinked but still open, then vflush() will have * truncated and freed those files, which might have started * some trim work. Wait here for any trims to complete * and process the blkfrees which follow the trims. * This may create more dirty devvp buffers and softdep deps. */ if (ump->um_trim_tq != NULL) { while (ump->um_trim_inflight != 0) pause("ufsutr", hz); taskqueue_drain_all(ump->um_trim_tq); } /* * Flush filesystem metadata. */ vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_FSYNC(ump->um_devvp, MNT_WAIT, td); VOP_UNLOCK(ump->um_devvp); return (error); } /* * Get filesystem statistics. */ static int ffs_statfs(struct mount *mp, struct statfs *sbp) { struct ufsmount *ump; struct fs *fs; ump = VFSTOUFS(mp); fs = ump->um_fs; if (fs->fs_magic != FS_UFS1_MAGIC && fs->fs_magic != FS_UFS2_MAGIC) panic("ffs_statfs"); sbp->f_version = STATFS_VERSION; sbp->f_bsize = fs->fs_fsize; sbp->f_iosize = fs->fs_bsize; sbp->f_blocks = fs->fs_dsize; UFS_LOCK(ump); sbp->f_bfree = fs->fs_cstotal.cs_nbfree * fs->fs_frag + fs->fs_cstotal.cs_nffree + dbtofsb(fs, fs->fs_pendingblocks); sbp->f_bavail = freespace(fs, fs->fs_minfree) + dbtofsb(fs, fs->fs_pendingblocks); sbp->f_files = fs->fs_ncg * fs->fs_ipg - UFS_ROOTINO; sbp->f_ffree = fs->fs_cstotal.cs_nifree + fs->fs_pendinginodes; UFS_UNLOCK(ump); sbp->f_namemax = UFS_MAXNAMLEN; return (0); } static bool sync_doupdate(struct inode *ip) { return ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) != 0); } static int ffs_sync_lazy_filter(struct vnode *vp, void *arg __unused) { struct inode *ip; /* * Flags are safe to access because ->v_data invalidation * is held off by listmtx. */ if (vp->v_type == VNON) return (false); ip = VTOI(vp); if (!sync_doupdate(ip) && (vp->v_iflag & VI_OWEINACT) == 0) return (false); return (true); } /* * For a lazy sync, we only care about access times, quotas and the * superblock. Other filesystem changes are already converted to * cylinder group blocks or inode blocks updates and are written to * disk by syncer. */ static int ffs_sync_lazy(struct mount *mp) { struct vnode *mvp, *vp; struct inode *ip; int allerror, error; allerror = 0; if ((mp->mnt_flag & MNT_NOATIME) != 0) { #ifdef QUOTA qsync(mp); #endif goto sbupdate; } MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, ffs_sync_lazy_filter, NULL) { if (vp->v_type == VNON) { VI_UNLOCK(vp); continue; } ip = VTOI(vp); /* * The IN_ACCESS flag is converted to IN_MODIFIED by * ufs_close() and ufs_getattr() by the calls to * ufs_itimes_locked(), without subsequent UFS_UPDATE(). * Test also all the other timestamp flags too, to pick up * any other cases that could be missed. */ if (!sync_doupdate(ip) && (vp->v_iflag & VI_OWEINACT) == 0) { VI_UNLOCK(vp); continue; } if ((error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK)) != 0) continue; #ifdef QUOTA qsyncvp(vp); #endif if (sync_doupdate(ip)) error = ffs_update(vp, 0); if (error != 0) allerror = error; vput(vp); } sbupdate: if (VFSTOUFS(mp)->um_fs->fs_fmod != 0 && (error = ffs_sbupdate(VFSTOUFS(mp), MNT_LAZY, 0)) != 0) allerror = error; return (allerror); } /* * Go through the disk queues to initiate sandbagged IO; * go through the inodes to write those that have been modified; * initiate the writing of the super block if it has been modified. * * Note: we are always called with the filesystem marked busy using * vfs_busy(). */ static int ffs_sync(struct mount *mp, int waitfor) { struct vnode *mvp, *vp, *devvp; struct thread *td; struct inode *ip; struct ufsmount *ump = VFSTOUFS(mp); struct fs *fs; int error, count, lockreq, allerror = 0; int suspend; int suspended; int secondary_writes; int secondary_accwrites; int softdep_deps; int softdep_accdeps; struct bufobj *bo; suspend = 0; suspended = 0; td = curthread; fs = ump->um_fs; if (fs->fs_fmod != 0 && fs->fs_ronly != 0) panic("%s: ffs_sync: modification on read-only filesystem", fs->fs_fsmnt); if (waitfor == MNT_LAZY) { if (!rebooting) return (ffs_sync_lazy(mp)); waitfor = MNT_NOWAIT; } /* * Write back each (modified) inode. */ lockreq = LK_EXCLUSIVE | LK_NOWAIT; if (waitfor == MNT_SUSPEND) { suspend = 1; waitfor = MNT_WAIT; } if (waitfor == MNT_WAIT) lockreq = LK_EXCLUSIVE; lockreq |= LK_INTERLOCK; loop: /* Grab snapshot of secondary write counts */ MNT_ILOCK(mp); secondary_writes = mp->mnt_secondary_writes; secondary_accwrites = mp->mnt_secondary_accwrites; MNT_IUNLOCK(mp); /* Grab snapshot of softdep dependency counts */ softdep_get_depcounts(mp, &softdep_deps, &softdep_accdeps); MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { /* * Depend on the vnode interlock to keep things stable enough * for a quick test. Since there might be hundreds of * thousands of vnodes, we cannot afford even a subroutine * call unless there's a good chance that we have work to do. */ if (vp->v_type == VNON) { VI_UNLOCK(vp); continue; } ip = VTOI(vp); if ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0 && vp->v_bufobj.bo_dirty.bv_cnt == 0) { VI_UNLOCK(vp); continue; } if ((error = vget(vp, lockreq)) != 0) { if (error == ENOENT) { MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); goto loop; } continue; } #ifdef QUOTA qsyncvp(vp); #endif for (;;) { error = ffs_syncvnode(vp, waitfor, 0); if (error == ERELOOKUP) continue; if (error != 0) allerror = error; break; } vput(vp); } /* * Force stale filesystem control information to be flushed. */ if (waitfor == MNT_WAIT || rebooting) { if ((error = softdep_flushworklist(ump->um_mountp, &count, td))) allerror = error; if (ffs_fsfail_cleanup(ump, allerror)) allerror = 0; /* Flushed work items may create new vnodes to clean */ if (allerror == 0 && count) goto loop; } devvp = ump->um_devvp; bo = &devvp->v_bufobj; BO_LOCK(bo); if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) { BO_UNLOCK(bo); vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_FSYNC(devvp, waitfor, td); VOP_UNLOCK(devvp); if (MOUNTEDSOFTDEP(mp) && (error == 0 || error == EAGAIN)) error = ffs_sbupdate(ump, waitfor, 0); if (error != 0) allerror = error; if (ffs_fsfail_cleanup(ump, allerror)) allerror = 0; if (allerror == 0 && waitfor == MNT_WAIT) goto loop; } else if (suspend != 0) { if (softdep_check_suspend(mp, devvp, softdep_deps, softdep_accdeps, secondary_writes, secondary_accwrites) != 0) { MNT_IUNLOCK(mp); goto loop; /* More work needed */ } mtx_assert(MNT_MTX(mp), MA_OWNED); mp->mnt_kern_flag |= MNTK_SUSPEND2 | MNTK_SUSPENDED; MNT_IUNLOCK(mp); suspended = 1; } else BO_UNLOCK(bo); /* * Write back modified superblock. */ if (fs->fs_fmod != 0 && (error = ffs_sbupdate(ump, waitfor, suspended)) != 0) allerror = error; if (ffs_fsfail_cleanup(ump, allerror)) allerror = 0; return (allerror); } int ffs_vget(struct mount *mp, ino_t ino, int flags, struct vnode **vpp) { return (ffs_vgetf(mp, ino, flags, vpp, 0)); } int ffs_vgetf(struct mount *mp, ino_t ino, int flags, struct vnode **vpp, int ffs_flags) { struct fs *fs; struct inode *ip; struct ufsmount *ump; struct buf *bp; struct vnode *vp; daddr_t dbn; int error; MPASS((ffs_flags & (FFSV_REPLACE | FFSV_REPLACE_DOOMED)) == 0 || (flags & LK_EXCLUSIVE) != 0); error = vfs_hash_get(mp, ino, flags, curthread, vpp, NULL, NULL); if (error != 0) return (error); if (*vpp != NULL) { if ((ffs_flags & FFSV_REPLACE) == 0 || ((ffs_flags & FFSV_REPLACE_DOOMED) == 0 || !VN_IS_DOOMED(*vpp))) return (0); vgone(*vpp); vput(*vpp); } /* * We must promote to an exclusive lock for vnode creation. This * can happen if lookup is passed LOCKSHARED. */ if ((flags & LK_TYPE_MASK) == LK_SHARED) { flags &= ~LK_TYPE_MASK; flags |= LK_EXCLUSIVE; } /* * We do not lock vnode creation as it is believed to be too * expensive for such rare case as simultaneous creation of vnode * for same ino by different processes. We just allow them to race * and check later to decide who wins. Let the race begin! */ ump = VFSTOUFS(mp); fs = ump->um_fs; ip = uma_zalloc_smr(uma_inode, M_WAITOK | M_ZERO); /* Allocate a new vnode/inode. */ error = getnewvnode("ufs", mp, fs->fs_magic == FS_UFS1_MAGIC ? &ffs_vnodeops1 : &ffs_vnodeops2, &vp); if (error) { *vpp = NULL; uma_zfree_smr(uma_inode, ip); return (error); } /* * FFS supports recursive locking. */ lockmgr(vp->v_vnlock, LK_EXCLUSIVE | LK_NOWITNESS, NULL); VN_LOCK_AREC(vp); vp->v_data = ip; vp->v_bufobj.bo_bsize = fs->fs_bsize; ip->i_vnode = vp; ip->i_ump = ump; ip->i_number = ino; ip->i_ea_refs = 0; ip->i_nextclustercg = -1; ip->i_flag = fs->fs_magic == FS_UFS1_MAGIC ? 0 : IN_UFS2; ip->i_mode = 0; /* ensure error cases below throw away vnode */ cluster_init_vn(&ip->i_clusterw); #ifdef DIAGNOSTIC ufs_init_trackers(ip); #endif #ifdef QUOTA { int i; for (i = 0; i < MAXQUOTAS; i++) ip->i_dquot[i] = NODQUOT; } #endif if (ffs_flags & FFSV_FORCEINSMQ) vp->v_vflag |= VV_FORCEINSMQ; error = insmntque(vp, mp); if (error != 0) { uma_zfree_smr(uma_inode, ip); *vpp = NULL; return (error); } vp->v_vflag &= ~VV_FORCEINSMQ; error = vfs_hash_insert(vp, ino, flags, curthread, vpp, NULL, NULL); if (error != 0) return (error); if (*vpp != NULL) { /* * Calls from ffs_valloc() (i.e. FFSV_REPLACE set) * operate on empty inode, which must not be found by * other threads until fully filled. Vnode for empty * inode must be not re-inserted on the hash by other * thread, after removal by us at the beginning. */ MPASS((ffs_flags & FFSV_REPLACE) == 0); return (0); } if (I_IS_UFS1(ip)) ip->i_din1 = uma_zalloc(uma_ufs1, M_WAITOK); else ip->i_din2 = uma_zalloc(uma_ufs2, M_WAITOK); if ((ffs_flags & FFSV_NEWINODE) != 0) { /* New inode, just zero out its contents. */ if (I_IS_UFS1(ip)) memset(ip->i_din1, 0, sizeof(struct ufs1_dinode)); else memset(ip->i_din2, 0, sizeof(struct ufs2_dinode)); } else { /* Read the disk contents for the inode, copy into the inode. */ dbn = fsbtodb(fs, ino_to_fsba(fs, ino)); error = ffs_breadz(ump, ump->um_devvp, dbn, dbn, (int)fs->fs_bsize, NULL, NULL, 0, NOCRED, 0, NULL, &bp); if (error != 0) { /* * The inode does not contain anything useful, so it * would be misleading to leave it on its hash chain. * With mode still zero, it will be unlinked and * returned to the free list by vput(). */ vgone(vp); vput(vp); *vpp = NULL; return (error); } if ((error = ffs_load_inode(bp, ip, fs, ino)) != 0) { bqrelse(bp); vgone(vp); vput(vp); *vpp = NULL; return (error); } bqrelse(bp); } if (DOINGSOFTDEP(vp) && (!fs->fs_ronly || (ffs_flags & FFSV_FORCEINODEDEP) != 0)) softdep_load_inodeblock(ip); else ip->i_effnlink = ip->i_nlink; /* * Initialize the vnode from the inode, check for aliases. * Note that the underlying vnode may have changed. */ error = ufs_vinit(mp, I_IS_UFS1(ip) ? &ffs_fifoops1 : &ffs_fifoops2, &vp); if (error) { vgone(vp); vput(vp); *vpp = NULL; return (error); } /* * Finish inode initialization. */ if (vp->v_type != VFIFO) { /* FFS supports shared locking for all files except fifos. */ VN_LOCK_ASHARE(vp); } /* * Set up a generation number for this inode if it does not * already have one. This should only happen on old filesystems. */ if (ip->i_gen == 0) { while (ip->i_gen == 0) ip->i_gen = arc4random(); if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { UFS_INODE_SET_FLAG(ip, IN_MODIFIED); DIP_SET(ip, i_gen, ip->i_gen); } } #ifdef MAC if ((mp->mnt_flag & MNT_MULTILABEL) && ip->i_mode) { /* * If this vnode is already allocated, and we're running * multi-label, attempt to perform a label association * from the extended attributes on the inode. */ error = mac_vnode_associate_extattr(mp, vp); if (error) { /* ufs_inactive will release ip->i_devvp ref. */ vgone(vp); vput(vp); *vpp = NULL; return (error); } } #endif vn_set_state(vp, VSTATE_CONSTRUCTED); *vpp = vp; return (0); } /* * File handle to vnode * * Have to be really careful about stale file handles: * - check that the inode number is valid * - for UFS2 check that the inode number is initialized * - call ffs_vget() to get the locked inode * - check for an unallocated inode (i_mode == 0) * - check that the given client host has export rights and return * those rights via. exflagsp and credanonp */ static int ffs_fhtovp(struct mount *mp, struct fid *fhp, int flags, struct vnode **vpp) { struct ufid *ufhp; ufhp = (struct ufid *)fhp; return (ffs_inotovp(mp, ufhp->ufid_ino, ufhp->ufid_gen, flags, vpp, 0)); } /* * Return a vnode from a mounted filesystem for inode with specified * generation number. Return ESTALE if the inode with given generation * number no longer exists on that filesystem. */ int ffs_inotovp(struct mount *mp, ino_t ino, uint64_t gen, int lflags, struct vnode **vpp, int ffs_flags) { struct ufsmount *ump; struct vnode *nvp; struct inode *ip; struct fs *fs; struct cg *cgp; struct buf *bp; uint64_t cg; ump = VFSTOUFS(mp); fs = ump->um_fs; *vpp = NULL; if (ino < UFS_ROOTINO || ino >= fs->fs_ncg * fs->fs_ipg) return (ESTALE); /* * Need to check if inode is initialized because UFS2 does lazy * initialization and nfs_fhtovp can offer arbitrary inode numbers. */ if (fs->fs_magic == FS_UFS2_MAGIC) { cg = ino_to_cg(fs, ino); if (ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp) != 0) return (ESTALE); if (ino >= cg * fs->fs_ipg + cgp->cg_initediblk) { brelse(bp); return (ESTALE); } brelse(bp); } if (ffs_vgetf(mp, ino, lflags, &nvp, ffs_flags) != 0) return (ESTALE); ip = VTOI(nvp); if (ip->i_mode == 0 || ip->i_gen != gen || ip->i_effnlink <= 0) { if (ip->i_mode == 0) vgone(nvp); vput(nvp); return (ESTALE); } vnode_create_vobject(nvp, DIP(ip, i_size), curthread); *vpp = nvp; return (0); } /* * Initialize the filesystem. */ static int ffs_init(struct vfsconf *vfsp) { ffs_susp_initialize(); softdep_initialize(); return (ufs_init(vfsp)); } /* * Undo the work of ffs_init(). */ static int ffs_uninit(struct vfsconf *vfsp) { int ret; ret = ufs_uninit(vfsp); softdep_uninitialize(); ffs_susp_uninitialize(); taskqueue_drain_all(taskqueue_thread); return (ret); } /* * Structure used to pass information from ffs_sbupdate to its * helper routine ffs_use_bwrite. */ struct devfd { struct ufsmount *ump; struct buf *sbbp; int waitfor; int suspended; int error; }; /* * Write a superblock and associated information back to disk. */ int ffs_sbupdate(struct ufsmount *ump, int waitfor, int suspended) { struct fs *fs; struct buf *sbbp; struct devfd devfd; fs = ump->um_fs; if (fs->fs_ronly == 1 && (ump->um_mountp->mnt_flag & (MNT_RDONLY | MNT_UPDATE)) != (MNT_RDONLY | MNT_UPDATE)) panic("ffs_sbupdate: write read-only filesystem"); /* * We use the superblock's buf to serialize calls to ffs_sbupdate(). */ sbbp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc), (int)fs->fs_sbsize, 0, 0, 0); /* * Initialize info needed for write function. */ devfd.ump = ump; devfd.sbbp = sbbp; devfd.waitfor = waitfor; devfd.suspended = suspended; devfd.error = 0; return (ffs_sbput(&devfd, fs, fs->fs_sblockloc, ffs_use_bwrite)); } /* * Write function for use by filesystem-layer routines. */ static int ffs_use_bwrite(void *devfd, off_t loc, void *buf, int size) { struct devfd *devfdp; struct ufsmount *ump; struct buf *bp; struct fs *fs; int error; devfdp = devfd; ump = devfdp->ump; fs = ump->um_fs; /* * Writing the superblock summary information. */ if (loc != fs->fs_sblockloc) { bp = getblk(ump->um_devvp, btodb(loc), size, 0, 0, 0); bcopy(buf, bp->b_data, (uint64_t)size); if (devfdp->suspended) bp->b_flags |= B_VALIDSUSPWRT; if (devfdp->waitfor != MNT_WAIT) bawrite(bp); else if ((error = bwrite(bp)) != 0) devfdp->error = error; return (0); } /* * Writing the superblock itself. We need to do special checks for it. */ bp = devfdp->sbbp; if (ffs_fsfail_cleanup(ump, devfdp->error)) devfdp->error = 0; if (devfdp->error != 0) { brelse(bp); return (devfdp->error); } if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_sblockloc != SBLOCK_UFS1 && (fs->fs_old_flags & FS_FLAGS_UPDATED) == 0) { printf("WARNING: %s: correcting fs_sblockloc from %jd to %d\n", fs->fs_fsmnt, fs->fs_sblockloc, SBLOCK_UFS1); fs->fs_sblockloc = SBLOCK_UFS1; } if (fs->fs_magic == FS_UFS2_MAGIC && fs->fs_sblockloc != SBLOCK_UFS2 && (fs->fs_old_flags & FS_FLAGS_UPDATED) == 0) { printf("WARNING: %s: correcting fs_sblockloc from %jd to %d\n", fs->fs_fsmnt, fs->fs_sblockloc, SBLOCK_UFS2); fs->fs_sblockloc = SBLOCK_UFS2; } if (MOUNTEDSOFTDEP(ump->um_mountp)) softdep_setup_sbupdate(ump, (struct fs *)bp->b_data, bp); UFS_LOCK(ump); bcopy((caddr_t)fs, bp->b_data, (uint64_t)fs->fs_sbsize); UFS_UNLOCK(ump); fs = (struct fs *)bp->b_data; fs->fs_fmod = 0; ffs_oldfscompat_write(fs, ump); fs->fs_si = NULL; /* Recalculate the superblock hash */ fs->fs_ckhash = ffs_calc_sbhash(fs); if (devfdp->suspended) bp->b_flags |= B_VALIDSUSPWRT; if (devfdp->waitfor != MNT_WAIT) bawrite(bp); else if ((error = bwrite(bp)) != 0) devfdp->error = error; return (devfdp->error); } static int ffs_extattrctl(struct mount *mp, int cmd, struct vnode *filename_vp, int attrnamespace, const char *attrname) { #ifdef UFS_EXTATTR return (ufs_extattrctl(mp, cmd, filename_vp, attrnamespace, attrname)); #else return (vfs_stdextattrctl(mp, cmd, filename_vp, attrnamespace, attrname)); #endif } static void ffs_ifree(struct ufsmount *ump, struct inode *ip) { if (ump->um_fstype == UFS1 && ip->i_din1 != NULL) uma_zfree(uma_ufs1, ip->i_din1); else if (ip->i_din2 != NULL) uma_zfree(uma_ufs2, ip->i_din2); uma_zfree_smr(uma_inode, ip); } static int dobkgrdwrite = 1; SYSCTL_INT(_debug, OID_AUTO, dobkgrdwrite, CTLFLAG_RW, &dobkgrdwrite, 0, "Do background writes (honoring the BV_BKGRDWRITE flag)?"); /* * Complete a background write started from bwrite. */ static void ffs_backgroundwritedone(struct buf *bp) { struct bufobj *bufobj; struct buf *origbp; #ifdef SOFTUPDATES if (!LIST_EMPTY(&bp->b_dep) && (bp->b_ioflags & BIO_ERROR) != 0) softdep_handle_error(bp); #endif /* * Find the original buffer that we are writing. */ bufobj = bp->b_bufobj; BO_LOCK(bufobj); if ((origbp = gbincore(bp->b_bufobj, bp->b_lblkno)) == NULL) panic("backgroundwritedone: lost buffer"); /* * We should mark the cylinder group buffer origbp as * dirty, to not lose the failed write. */ if ((bp->b_ioflags & BIO_ERROR) != 0) origbp->b_vflags |= BV_BKGRDERR; BO_UNLOCK(bufobj); /* * Process dependencies then return any unfinished ones. */ if (!LIST_EMPTY(&bp->b_dep) && (bp->b_ioflags & BIO_ERROR) == 0) buf_complete(bp); #ifdef SOFTUPDATES if (!LIST_EMPTY(&bp->b_dep)) softdep_move_dependencies(bp, origbp); #endif /* * This buffer is marked B_NOCACHE so when it is released * by biodone it will be tossed. Clear B_IOSTARTED in case of error. */ bp->b_flags |= B_NOCACHE; bp->b_flags &= ~(B_CACHE | B_IOSTARTED); pbrelvp(bp); /* * Prevent brelse() from trying to keep and re-dirtying bp on * errors. It causes b_bufobj dereference in * bdirty()/reassignbuf(), and b_bufobj was cleared in * pbrelvp() above. */ if ((bp->b_ioflags & BIO_ERROR) != 0) bp->b_flags |= B_INVAL; bufdone(bp); BO_LOCK(bufobj); /* * Clear the BV_BKGRDINPROG flag in the original buffer * and awaken it if it is waiting for the write to complete. * If BV_BKGRDINPROG is not set in the original buffer it must * have been released and re-instantiated - which is not legal. */ KASSERT((origbp->b_vflags & BV_BKGRDINPROG), ("backgroundwritedone: lost buffer2")); origbp->b_vflags &= ~BV_BKGRDINPROG; if (origbp->b_vflags & BV_BKGRDWAIT) { origbp->b_vflags &= ~BV_BKGRDWAIT; wakeup(&origbp->b_xflags); } BO_UNLOCK(bufobj); } /* * Write, release buffer on completion. (Done by iodone * if async). Do not bother writing anything if the buffer * is invalid. * * Note that we set B_CACHE here, indicating that buffer is * fully valid and thus cacheable. This is true even of NFS * now so we set it generally. This could be set either here * or in biodone() since the I/O is synchronous. We put it * here. */ static int ffs_bufwrite(struct buf *bp) { struct buf *newbp; struct cg *cgp; CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); if (bp->b_flags & B_INVAL) { brelse(bp); return (0); } if (!BUF_ISLOCKED(bp)) panic("bufwrite: buffer is not busy???"); /* * If a background write is already in progress, delay * writing this block if it is asynchronous. Otherwise * wait for the background write to complete. */ BO_LOCK(bp->b_bufobj); if (bp->b_vflags & BV_BKGRDINPROG) { if (bp->b_flags & B_ASYNC) { BO_UNLOCK(bp->b_bufobj); bdwrite(bp); return (0); } bp->b_vflags |= BV_BKGRDWAIT; msleep(&bp->b_xflags, BO_LOCKPTR(bp->b_bufobj), PRIBIO, "bwrbg", 0); if (bp->b_vflags & BV_BKGRDINPROG) panic("bufwrite: still writing"); } bp->b_vflags &= ~BV_BKGRDERR; BO_UNLOCK(bp->b_bufobj); /* * If this buffer is marked for background writing and we * do not have to wait for it, make a copy and write the * copy so as to leave this buffer ready for further use. * * This optimization eats a lot of memory. If we have a page * or buffer shortfall we can't do it. */ if (dobkgrdwrite && (bp->b_xflags & BX_BKGRDWRITE) && (bp->b_flags & B_ASYNC) && !vm_page_count_severe() && !buf_dirty_count_severe()) { KASSERT(bp->b_iodone == NULL, ("bufwrite: needs chained iodone (%p)", bp->b_iodone)); /* get a new block */ newbp = geteblk(bp->b_bufsize, GB_NOWAIT_BD); if (newbp == NULL) goto normal_write; KASSERT(buf_mapped(bp), ("Unmapped cg")); memcpy(newbp->b_data, bp->b_data, bp->b_bufsize); BO_LOCK(bp->b_bufobj); bp->b_vflags |= BV_BKGRDINPROG; BO_UNLOCK(bp->b_bufobj); newbp->b_xflags |= (bp->b_xflags & BX_FSPRIV) | BX_BKGRDMARKER; newbp->b_lblkno = bp->b_lblkno; newbp->b_blkno = bp->b_blkno; newbp->b_offset = bp->b_offset; newbp->b_iodone = ffs_backgroundwritedone; newbp->b_flags |= B_ASYNC; newbp->b_flags &= ~B_INVAL; pbgetvp(bp->b_vp, newbp); #ifdef SOFTUPDATES /* * Move over the dependencies. If there are rollbacks, * leave the parent buffer dirtied as it will need to * be written again. */ if (LIST_EMPTY(&bp->b_dep) || softdep_move_dependencies(bp, newbp) == 0) bundirty(bp); #else bundirty(bp); #endif /* * Initiate write on the copy, release the original. The * BKGRDINPROG flag prevents it from going away until * the background write completes. We have to recalculate * its check hash in case the buffer gets freed and then * reconstituted from the buffer cache during a later read. */ if ((bp->b_xflags & BX_CYLGRP) != 0) { cgp = (struct cg *)bp->b_data; cgp->cg_ckhash = 0; cgp->cg_ckhash = calculate_crc32c(~0L, bp->b_data, bp->b_bcount); } bqrelse(bp); bp = newbp; } else /* Mark the buffer clean */ bundirty(bp); /* Let the normal bufwrite do the rest for us */ normal_write: /* * If we are writing a cylinder group, update its time. */ if ((bp->b_xflags & BX_CYLGRP) != 0) { cgp = (struct cg *)bp->b_data; cgp->cg_old_time = cgp->cg_time = time_second; } return (bufwrite(bp)); } static void ffs_geom_strategy(struct bufobj *bo, struct buf *bp) { struct vnode *vp; struct buf *tbp; int error, nocopy; /* * This is the bufobj strategy for the private VCHR vnodes * used by FFS to access the underlying storage device. * We override the default bufobj strategy and thus bypass * VOP_STRATEGY() for these vnodes. */ vp = bo2vnode(bo); KASSERT(bp->b_vp == NULL || bp->b_vp->v_type != VCHR || bp->b_vp->v_rdev == NULL || bp->b_vp->v_rdev->si_mountpt == NULL || VFSTOUFS(bp->b_vp->v_rdev->si_mountpt) == NULL || vp == VFSTOUFS(bp->b_vp->v_rdev->si_mountpt)->um_devvp, ("ffs_geom_strategy() with wrong vp")); if (bp->b_iocmd == BIO_WRITE) { if ((bp->b_flags & B_VALIDSUSPWRT) == 0 && bp->b_vp != NULL && bp->b_vp->v_mount != NULL && (bp->b_vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED) != 0) panic("ffs_geom_strategy: bad I/O"); nocopy = bp->b_flags & B_NOCOPY; bp->b_flags &= ~(B_VALIDSUSPWRT | B_NOCOPY); if ((vp->v_vflag & VV_COPYONWRITE) && nocopy == 0 && vp->v_rdev->si_snapdata != NULL) { if ((bp->b_flags & B_CLUSTER) != 0) { runningbufwakeup(bp); TAILQ_FOREACH(tbp, &bp->b_cluster.cluster_head, b_cluster.cluster_entry) { error = ffs_copyonwrite(vp, tbp); if (error != 0 && error != EOPNOTSUPP) { bp->b_error = error; bp->b_ioflags |= BIO_ERROR; bp->b_flags &= ~B_BARRIER; bufdone(bp); return; } } - bp->b_runningbufspace = bp->b_bufsize; - atomic_add_long(&runningbufspace, - bp->b_runningbufspace); + (void)runningbufclaim(bp, bp->b_bufsize); } else { error = ffs_copyonwrite(vp, bp); if (error != 0 && error != EOPNOTSUPP) { bp->b_error = error; bp->b_ioflags |= BIO_ERROR; bp->b_flags &= ~B_BARRIER; bufdone(bp); return; } } } #ifdef SOFTUPDATES if ((bp->b_flags & B_CLUSTER) != 0) { TAILQ_FOREACH(tbp, &bp->b_cluster.cluster_head, b_cluster.cluster_entry) { if (!LIST_EMPTY(&tbp->b_dep)) buf_start(tbp); } } else { if (!LIST_EMPTY(&bp->b_dep)) buf_start(bp); } #endif /* * Check for metadata that needs check-hashes and update them. */ switch (bp->b_xflags & BX_FSPRIV) { case BX_CYLGRP: ((struct cg *)bp->b_data)->cg_ckhash = 0; ((struct cg *)bp->b_data)->cg_ckhash = calculate_crc32c(~0L, bp->b_data, bp->b_bcount); break; case BX_SUPERBLOCK: case BX_INODE: case BX_INDIR: case BX_DIR: printf("Check-hash write is unimplemented!!!\n"); break; case 0: break; default: printf("multiple buffer types 0x%b\n", (bp->b_xflags & BX_FSPRIV), PRINT_UFS_BUF_XFLAGS); break; } } if (bp->b_iocmd != BIO_READ && ffs_enxio_enable) bp->b_xflags |= BX_CVTENXIO; g_vfs_strategy(bo, bp); } int ffs_own_mount(const struct mount *mp) { if (mp->mnt_op == &ufs_vfsops) return (1); return (0); } #ifdef DDB #ifdef SOFTUPDATES /* defined in ffs_softdep.c */ extern void db_print_ffs(struct ufsmount *ump); DB_SHOW_COMMAND(ffs, db_show_ffs) { struct mount *mp; struct ufsmount *ump; if (have_addr) { ump = VFSTOUFS((struct mount *)addr); db_print_ffs(ump); return; } TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (!strcmp(mp->mnt_stat.f_fstypename, ufs_vfsconf.vfc_name)) db_print_ffs(VFSTOUFS(mp)); } } #endif /* SOFTUPDATES */ #endif /* DDB */ diff --git a/sys/vm/vnode_pager.c b/sys/vm/vnode_pager.c index a690e48e07ee..496c3586c321 100644 --- a/sys/vm/vnode_pager.c +++ b/sys/vm/vnode_pager.c @@ -1,1721 +1,1721 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 1990 University of Utah. * Copyright (c) 1991 The Regents of the University of California. * All rights reserved. * Copyright (c) 1993, 1994 John S. Dyson * Copyright (c) 1995, David Greenman * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vnode_pager.c 7.5 (Berkeley) 4/20/91 */ /* * Page to/from files (vnodes). */ /* * TODO: * Implement VOP_GETPAGES/PUTPAGES interface for filesystems. Will * greatly re-simplify the vnode_pager. */ #include #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int vnode_pager_addr(struct vnode *vp, vm_ooffset_t address, daddr_t *rtaddress, int *run); static int vnode_pager_input_smlfs(vm_object_t object, vm_page_t m); static int vnode_pager_input_old(vm_object_t object, vm_page_t m); static void vnode_pager_dealloc(vm_object_t); static int vnode_pager_getpages(vm_object_t, vm_page_t *, int, int *, int *); static int vnode_pager_getpages_async(vm_object_t, vm_page_t *, int, int *, int *, vop_getpages_iodone_t, void *); static void vnode_pager_putpages(vm_object_t, vm_page_t *, int, int, int *); static boolean_t vnode_pager_haspage(vm_object_t, vm_pindex_t, int *, int *); static vm_object_t vnode_pager_alloc(void *, vm_ooffset_t, vm_prot_t, vm_ooffset_t, struct ucred *cred); static int vnode_pager_generic_getpages_done(struct buf *); static void vnode_pager_generic_getpages_done_async(struct buf *); static void vnode_pager_update_writecount(vm_object_t, vm_offset_t, vm_offset_t); static void vnode_pager_release_writecount(vm_object_t, vm_offset_t, vm_offset_t); static void vnode_pager_getvp(vm_object_t, struct vnode **, bool *); const struct pagerops vnodepagerops = { .pgo_kvme_type = KVME_TYPE_VNODE, .pgo_alloc = vnode_pager_alloc, .pgo_dealloc = vnode_pager_dealloc, .pgo_getpages = vnode_pager_getpages, .pgo_getpages_async = vnode_pager_getpages_async, .pgo_putpages = vnode_pager_putpages, .pgo_haspage = vnode_pager_haspage, .pgo_update_writecount = vnode_pager_update_writecount, .pgo_release_writecount = vnode_pager_release_writecount, .pgo_set_writeable_dirty = vm_object_set_writeable_dirty_, .pgo_mightbedirty = vm_object_mightbedirty_, .pgo_getvp = vnode_pager_getvp, }; static struct domainset *vnode_domainset = NULL; SYSCTL_PROC(_debug, OID_AUTO, vnode_domainset, CTLTYPE_STRING | CTLFLAG_MPSAFE | CTLFLAG_RW, &vnode_domainset, 0, sysctl_handle_domainset, "A", "Default vnode NUMA policy"); static int nvnpbufs; SYSCTL_INT(_vm, OID_AUTO, vnode_pbufs, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &nvnpbufs, 0, "number of physical buffers allocated for vnode pager"); static uma_zone_t vnode_pbuf_zone; static void vnode_pager_init(void *dummy) { #ifdef __LP64__ nvnpbufs = nswbuf * 2; #else nvnpbufs = nswbuf / 2; #endif TUNABLE_INT_FETCH("vm.vnode_pbufs", &nvnpbufs); vnode_pbuf_zone = pbuf_zsecond_create("vnpbuf", nvnpbufs); } SYSINIT(vnode_pager, SI_SUB_CPU, SI_ORDER_ANY, vnode_pager_init, NULL); /* Create the VM system backing object for this vnode */ int vnode_create_vobject(struct vnode *vp, off_t isize, struct thread *td) { vm_object_t object; vm_ooffset_t size = isize; bool last; if (!vn_isdisk(vp) && vn_canvmio(vp) == FALSE) return (0); object = vp->v_object; if (object != NULL) return (0); if (size == 0) { if (vn_isdisk(vp)) { size = IDX_TO_OFF(INT_MAX); } else { if (vn_getsize_locked(vp, &size, td->td_ucred) != 0) return (0); } } object = vnode_pager_alloc(vp, size, 0, 0, td->td_ucred); /* * Dereference the reference we just created. This assumes * that the object is associated with the vp. We still have * to serialize with vnode_pager_dealloc() for the last * potential reference. */ VM_OBJECT_RLOCK(object); last = refcount_release(&object->ref_count); VM_OBJECT_RUNLOCK(object); if (last) vrele(vp); KASSERT(vp->v_object != NULL, ("vnode_create_vobject: NULL object")); return (0); } void vnode_destroy_vobject(struct vnode *vp) { struct vm_object *obj; obj = vp->v_object; if (obj == NULL || obj->handle != vp) return; ASSERT_VOP_ELOCKED(vp, "vnode_destroy_vobject"); VM_OBJECT_WLOCK(obj); MPASS(obj->type == OBJT_VNODE); umtx_shm_object_terminated(obj); if (obj->ref_count == 0) { KASSERT((obj->flags & OBJ_DEAD) == 0, ("vnode_destroy_vobject: Terminating dead object")); vm_object_set_flag(obj, OBJ_DEAD); /* * Clean pages and flush buffers. */ vm_object_page_clean(obj, 0, 0, OBJPC_SYNC); VM_OBJECT_WUNLOCK(obj); vinvalbuf(vp, V_SAVE, 0, 0); BO_LOCK(&vp->v_bufobj); vp->v_bufobj.bo_flag |= BO_DEAD; BO_UNLOCK(&vp->v_bufobj); VM_OBJECT_WLOCK(obj); vm_object_terminate(obj); } else { /* * Woe to the process that tries to page now :-). */ vm_pager_deallocate(obj); VM_OBJECT_WUNLOCK(obj); } KASSERT(vp->v_object == NULL, ("vp %p obj %p", vp, vp->v_object)); } /* * Allocate (or lookup) pager for a vnode. * Handle is a vnode pointer. */ vm_object_t vnode_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t offset, struct ucred *cred) { vm_object_t object; struct vnode *vp; /* * Pageout to vnode, no can do yet. */ if (handle == NULL) return (NULL); vp = (struct vnode *)handle; ASSERT_VOP_LOCKED(vp, "vnode_pager_alloc"); VNPASS(vp->v_usecount > 0, vp); retry: object = vp->v_object; if (object == NULL) { /* * Add an object of the appropriate size */ object = vm_object_allocate(OBJT_VNODE, OFF_TO_IDX(round_page(size))); object->un_pager.vnp.vnp_size = size; object->un_pager.vnp.writemappings = 0; object->domain.dr_policy = vnode_domainset; object->handle = handle; if ((vp->v_vflag & VV_VMSIZEVNLOCK) != 0) { VM_OBJECT_WLOCK(object); vm_object_set_flag(object, OBJ_SIZEVNLOCK); VM_OBJECT_WUNLOCK(object); } VI_LOCK(vp); if (vp->v_object != NULL) { /* * Object has been created while we were allocating. */ VI_UNLOCK(vp); VM_OBJECT_WLOCK(object); KASSERT(object->ref_count == 1, ("leaked ref %p %d", object, object->ref_count)); object->type = OBJT_DEAD; refcount_init(&object->ref_count, 0); VM_OBJECT_WUNLOCK(object); vm_object_destroy(object); goto retry; } vp->v_object = object; VI_UNLOCK(vp); vrefact(vp); } else { vm_object_reference(object); #if VM_NRESERVLEVEL > 0 if ((object->flags & OBJ_COLORED) == 0) { VM_OBJECT_WLOCK(object); vm_object_color(object, 0); VM_OBJECT_WUNLOCK(object); } #endif } return (object); } /* * The object must be locked. */ static void vnode_pager_dealloc(vm_object_t object) { struct vnode *vp; int refs; vp = object->handle; if (vp == NULL) panic("vnode_pager_dealloc: pager already dealloced"); VM_OBJECT_ASSERT_WLOCKED(object); vm_object_pip_wait(object, "vnpdea"); refs = object->ref_count; object->handle = NULL; object->type = OBJT_DEAD; ASSERT_VOP_ELOCKED(vp, "vnode_pager_dealloc"); if (object->un_pager.vnp.writemappings > 0) { object->un_pager.vnp.writemappings = 0; VOP_ADD_WRITECOUNT_CHECKED(vp, -1); CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d", __func__, vp, vp->v_writecount); } vp->v_object = NULL; VI_LOCK(vp); /* * vm_map_entry_set_vnode_text() cannot reach this vnode by * following object->handle. Clear all text references now. * This also clears the transient references from * kern_execve(), which is fine because dead_vnodeops uses nop * for VOP_UNSET_TEXT(). */ if (vp->v_writecount < 0) vp->v_writecount = 0; VI_UNLOCK(vp); VM_OBJECT_WUNLOCK(object); if (refs > 0) vunref(vp); VM_OBJECT_WLOCK(object); } static boolean_t vnode_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after) { struct vnode *vp = object->handle; daddr_t bn; uintptr_t lockstate; int err; daddr_t reqblock; int poff; int bsize; int pagesperblock, blocksperpage; VM_OBJECT_ASSERT_LOCKED(object); /* * If no vp or vp is doomed or marked transparent to VM, we do not * have the page. */ if (vp == NULL || VN_IS_DOOMED(vp)) return FALSE; /* * If the offset is beyond end of file we do * not have the page. */ if (IDX_TO_OFF(pindex) >= object->un_pager.vnp.vnp_size) return FALSE; bsize = vp->v_mount->mnt_stat.f_iosize; pagesperblock = bsize / PAGE_SIZE; blocksperpage = 0; if (pagesperblock > 0) { reqblock = pindex / pagesperblock; } else { blocksperpage = (PAGE_SIZE / bsize); reqblock = pindex * blocksperpage; } lockstate = VM_OBJECT_DROP(object); err = VOP_BMAP(vp, reqblock, NULL, &bn, after, before); VM_OBJECT_PICKUP(object, lockstate); if (err) return TRUE; if (bn == -1) return FALSE; if (pagesperblock > 0) { poff = pindex - (reqblock * pagesperblock); if (before) { *before *= pagesperblock; *before += poff; } if (after) { /* * The BMAP vop can report a partial block in the * 'after', but must not report blocks after EOF. * Assert the latter, and truncate 'after' in case * of the former. */ KASSERT((reqblock + *after) * pagesperblock < roundup2(object->size, pagesperblock), ("%s: reqblock %jd after %d size %ju", __func__, (intmax_t )reqblock, *after, (uintmax_t )object->size)); *after *= pagesperblock; *after += pagesperblock - (poff + 1); if (pindex + *after >= object->size) *after = object->size - 1 - pindex; } } else { if (before) { *before /= blocksperpage; } if (after) { *after /= blocksperpage; } } return TRUE; } /* * Internal routine clearing partial-page content */ static void vnode_pager_subpage_purge(struct vm_page *m, int base, int end) { int size; KASSERT(end > base && end <= PAGE_SIZE, ("%s: start %d end %d", __func__, base, end)); size = end - base; /* * Clear out partial-page garbage in case * the page has been mapped. */ pmap_zero_page_area(m, base, size); /* * Update the valid bits to reflect the blocks * that have been zeroed. Some of these valid * bits may have already been set. */ vm_page_set_valid_range(m, base, size); /* * Round up "base" to the next block boundary so * that the dirty bit for a partially zeroed * block is not cleared. */ base = roundup2(base, DEV_BSIZE); end = rounddown2(end, DEV_BSIZE); if (end > base) { /* * Clear out partial-page dirty bits. * * note that we do not clear out the * valid bits. This would prevent * bogus_page replacement from working * properly. */ vm_page_clear_dirty(m, base, end - base); } } /* * Lets the VM system know about a change in size for a file. * We adjust our own internal size and flush any cached pages in * the associated object that are affected by the size change. * * Note: this routine may be invoked as a result of a pager put * operation (possibly at object termination time), so we must be careful. */ void vnode_pager_setsize(struct vnode *vp, vm_ooffset_t nsize) { vm_object_t object; vm_page_t m; vm_pindex_t nobjsize; if ((object = vp->v_object) == NULL) return; #ifdef DEBUG_VFS_LOCKS { struct mount *mp; mp = vp->v_mount; if (mp != NULL && (mp->mnt_kern_flag & MNTK_VMSETSIZE_BUG) == 0) assert_vop_elocked(vp, "vnode_pager_setsize and not locked vnode"); } #endif VM_OBJECT_WLOCK(object); if (object->type == OBJT_DEAD) { VM_OBJECT_WUNLOCK(object); return; } KASSERT(object->type == OBJT_VNODE, ("not vnode-backed object %p", object)); if (nsize == object->un_pager.vnp.vnp_size) { /* * Hasn't changed size */ VM_OBJECT_WUNLOCK(object); return; } nobjsize = OFF_TO_IDX(nsize + PAGE_MASK); if (nsize < object->un_pager.vnp.vnp_size) { /* * File has shrunk. Toss any cached pages beyond the new EOF. */ if (nobjsize < object->size) vm_object_page_remove(object, nobjsize, object->size, 0); /* * this gets rid of garbage at the end of a page that is now * only partially backed by the vnode. * * XXX for some reason (I don't know yet), if we take a * completely invalid page and mark it partially valid * it can screw up NFS reads, so we don't allow the case. */ if (!(nsize & PAGE_MASK)) goto out; m = vm_page_grab(object, OFF_TO_IDX(nsize), VM_ALLOC_NOCREAT); if (m == NULL) goto out; if (!vm_page_none_valid(m)) vnode_pager_subpage_purge(m, (int)nsize & PAGE_MASK, PAGE_SIZE); vm_page_xunbusy(m); } out: #if defined(__powerpc__) && !defined(__powerpc64__) object->un_pager.vnp.vnp_size = nsize; #else atomic_store_64(&object->un_pager.vnp.vnp_size, nsize); #endif object->size = nobjsize; VM_OBJECT_WUNLOCK(object); } /* * Lets the VM system know about the purged range for a file. We toss away any * cached pages in the associated object that are affected by the purge * operation. Partial-page area not aligned to page boundaries will be zeroed * and the dirty blocks in DEV_BSIZE unit within a page will not be flushed. */ void vnode_pager_purge_range(struct vnode *vp, vm_ooffset_t start, vm_ooffset_t end) { struct vm_page *m; struct vm_object *object; vm_pindex_t pi, pistart, piend; bool same_page; int base, pend; ASSERT_VOP_LOCKED(vp, "vnode_pager_purge_range"); object = vp->v_object; pi = start + PAGE_MASK < start ? OBJ_MAX_SIZE : OFF_TO_IDX(start + PAGE_MASK); pistart = OFF_TO_IDX(start); piend = end == 0 ? OBJ_MAX_SIZE : OFF_TO_IDX(end); same_page = pistart == piend; if ((end != 0 && end <= start) || object == NULL) return; VM_OBJECT_WLOCK(object); if (pi < piend) vm_object_page_remove(object, pi, piend, 0); if ((start & PAGE_MASK) != 0) { base = (int)start & PAGE_MASK; pend = same_page ? (int)end & PAGE_MASK : PAGE_SIZE; m = vm_page_grab(object, pistart, VM_ALLOC_NOCREAT); if (m != NULL) { if (!vm_page_none_valid(m)) vnode_pager_subpage_purge(m, base, pend); vm_page_xunbusy(m); } if (same_page) goto out; } if ((end & PAGE_MASK) != 0) { base = same_page ? (int)start & PAGE_MASK : 0 ; pend = (int)end & PAGE_MASK; m = vm_page_grab(object, piend, VM_ALLOC_NOCREAT); if (m != NULL) { if (!vm_page_none_valid(m)) vnode_pager_subpage_purge(m, base, pend); vm_page_xunbusy(m); } } out: VM_OBJECT_WUNLOCK(object); } /* * calculate the linear (byte) disk address of specified virtual * file address */ static int vnode_pager_addr(struct vnode *vp, vm_ooffset_t address, daddr_t *rtaddress, int *run) { int bsize; int err; daddr_t vblock; daddr_t voffset; if (VN_IS_DOOMED(vp)) return -1; bsize = vp->v_mount->mnt_stat.f_iosize; vblock = address / bsize; voffset = address % bsize; err = VOP_BMAP(vp, vblock, NULL, rtaddress, run, NULL); if (err == 0) { if (*rtaddress != -1) *rtaddress += voffset / DEV_BSIZE; if (run) { *run += 1; *run *= bsize / PAGE_SIZE; *run -= voffset / PAGE_SIZE; } } return (err); } static void vnode_pager_input_bdone(struct buf *bp) { runningbufwakeup(bp); bdone(bp); } /* * small block filesystem vnode pager input */ static int vnode_pager_input_smlfs(vm_object_t object, vm_page_t m) { struct vnode *vp; struct bufobj *bo; struct buf *bp; struct sf_buf *sf; daddr_t fileaddr; vm_offset_t bsize; vm_page_bits_t bits; int error, i; error = 0; vp = object->handle; if (VN_IS_DOOMED(vp)) return VM_PAGER_BAD; bsize = vp->v_mount->mnt_stat.f_iosize; VOP_BMAP(vp, 0, &bo, 0, NULL, NULL); sf = sf_buf_alloc(m, 0); for (i = 0; i < PAGE_SIZE / bsize; i++) { vm_ooffset_t address; bits = vm_page_bits(i * bsize, bsize); if (m->valid & bits) continue; address = IDX_TO_OFF(m->pindex) + i * bsize; if (address >= object->un_pager.vnp.vnp_size) { fileaddr = -1; } else { error = vnode_pager_addr(vp, address, &fileaddr, NULL); if (error) break; } if (fileaddr != -1) { bp = uma_zalloc(vnode_pbuf_zone, M_WAITOK); /* build a minimal buffer header */ bp->b_iocmd = BIO_READ; bp->b_iodone = vnode_pager_input_bdone; KASSERT(bp->b_rcred == NOCRED, ("leaking read ucred")); KASSERT(bp->b_wcred == NOCRED, ("leaking write ucred")); bp->b_rcred = crhold(curthread->td_ucred); bp->b_wcred = crhold(curthread->td_ucred); bp->b_data = (caddr_t)sf_buf_kva(sf) + i * bsize; bp->b_blkno = fileaddr; pbgetbo(bo, bp); bp->b_vp = vp; bp->b_bcount = bsize; bp->b_bufsize = bsize; - bp->b_runningbufspace = bp->b_bufsize; - atomic_add_long(&runningbufspace, bp->b_runningbufspace); + (void)runningbufclaim(bp, bp->b_bufsize); /* do the input */ bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); bwait(bp, PVM, "vnsrd"); if ((bp->b_ioflags & BIO_ERROR) != 0) { KASSERT(bp->b_error != 0, ("%s: buf error but b_error == 0\n", __func__)); error = bp->b_error; } /* * free the buffer header back to the swap buffer pool */ bp->b_vp = NULL; pbrelbo(bp); uma_zfree(vnode_pbuf_zone, bp); if (error) break; } else bzero((caddr_t)sf_buf_kva(sf) + i * bsize, bsize); KASSERT((m->dirty & bits) == 0, ("vnode_pager_input_smlfs: page %p is dirty", m)); vm_page_bits_set(m, &m->valid, bits); } sf_buf_free(sf); if (error) { return VM_PAGER_ERROR; } return VM_PAGER_OK; } /* * old style vnode pager input routine */ static int vnode_pager_input_old(vm_object_t object, vm_page_t m) { struct uio auio; struct iovec aiov; int error; int size; struct sf_buf *sf; struct vnode *vp; VM_OBJECT_ASSERT_WLOCKED(object); error = 0; /* * Return failure if beyond current EOF */ if (IDX_TO_OFF(m->pindex) >= object->un_pager.vnp.vnp_size) { return VM_PAGER_BAD; } else { size = PAGE_SIZE; if (IDX_TO_OFF(m->pindex) + size > object->un_pager.vnp.vnp_size) size = object->un_pager.vnp.vnp_size - IDX_TO_OFF(m->pindex); vp = object->handle; VM_OBJECT_WUNLOCK(object); /* * Allocate a kernel virtual address and initialize so that * we can use VOP_READ/WRITE routines. */ sf = sf_buf_alloc(m, 0); aiov.iov_base = (caddr_t)sf_buf_kva(sf); aiov.iov_len = size; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = IDX_TO_OFF(m->pindex); auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_READ; auio.uio_resid = size; auio.uio_td = curthread; error = VOP_READ(vp, &auio, 0, curthread->td_ucred); if (!error) { int count = size - auio.uio_resid; if (count == 0) error = EINVAL; else if (count != PAGE_SIZE) bzero((caddr_t)sf_buf_kva(sf) + count, PAGE_SIZE - count); } sf_buf_free(sf); VM_OBJECT_WLOCK(object); } KASSERT(m->dirty == 0, ("vnode_pager_input_old: page %p is dirty", m)); if (!error) vm_page_valid(m); return error ? VM_PAGER_ERROR : VM_PAGER_OK; } /* * generic vnode pager input routine */ /* * Local media VFS's that do not implement their own VOP_GETPAGES * should have their VOP_GETPAGES call to vnode_pager_generic_getpages() * to implement the previous behaviour. * * All other FS's should use the bypass to get to the local media * backing vp's VOP_GETPAGES. */ static int vnode_pager_getpages(vm_object_t object, vm_page_t *m, int count, int *rbehind, int *rahead) { struct vnode *vp; int rtval; /* Handle is stable with paging in progress. */ vp = object->handle; rtval = VOP_GETPAGES(vp, m, count, rbehind, rahead); KASSERT(rtval != EOPNOTSUPP, ("vnode_pager: FS getpages not implemented\n")); return rtval; } static int vnode_pager_getpages_async(vm_object_t object, vm_page_t *m, int count, int *rbehind, int *rahead, vop_getpages_iodone_t iodone, void *arg) { struct vnode *vp; int rtval; vp = object->handle; rtval = VOP_GETPAGES_ASYNC(vp, m, count, rbehind, rahead, iodone, arg); KASSERT(rtval != EOPNOTSUPP, ("vnode_pager: FS getpages_async not implemented\n")); return (rtval); } /* * The implementation of VOP_GETPAGES() and VOP_GETPAGES_ASYNC() for * local filesystems, where partially valid pages can only occur at * the end of file. */ int vnode_pager_local_getpages(struct vop_getpages_args *ap) { return (vnode_pager_generic_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind, ap->a_rahead, NULL, NULL)); } int vnode_pager_local_getpages_async(struct vop_getpages_async_args *ap) { int error; error = vnode_pager_generic_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind, ap->a_rahead, ap->a_iodone, ap->a_arg); if (error != 0 && ap->a_iodone != NULL) ap->a_iodone(ap->a_arg, ap->a_m, ap->a_count, error); return (error); } /* * This is now called from local media FS's to operate against their * own vnodes if they fail to implement VOP_GETPAGES. */ int vnode_pager_generic_getpages(struct vnode *vp, vm_page_t *m, int count, int *a_rbehind, int *a_rahead, vop_getpages_iodone_t iodone, void *arg) { vm_object_t object; struct bufobj *bo; struct buf *bp; off_t foff; #ifdef INVARIANTS off_t blkno0; #endif int bsize, pagesperblock; int error, before, after, rbehind, rahead, poff, i; int bytecount, secmask; KASSERT(vp->v_type != VCHR && vp->v_type != VBLK, ("%s does not support devices", __func__)); if (VN_IS_DOOMED(vp)) return (VM_PAGER_BAD); object = vp->v_object; foff = IDX_TO_OFF(m[0]->pindex); bsize = vp->v_mount->mnt_stat.f_iosize; pagesperblock = bsize / PAGE_SIZE; KASSERT(foff < object->un_pager.vnp.vnp_size, ("%s: page %p offset beyond vp %p size", __func__, m[0], vp)); KASSERT(count <= atop(maxphys), ("%s: requested %d pages", __func__, count)); /* * The last page has valid blocks. Invalid part can only * exist at the end of file, and the page is made fully valid * by zeroing in vm_pager_get_pages(). */ if (!vm_page_none_valid(m[count - 1]) && --count == 0) { if (iodone != NULL) iodone(arg, m, 1, 0); return (VM_PAGER_OK); } bp = uma_zalloc(vnode_pbuf_zone, M_WAITOK); MPASS((bp->b_flags & B_MAXPHYS) != 0); /* * Get the underlying device blocks for the file with VOP_BMAP(). * If the file system doesn't support VOP_BMAP, use old way of * getting pages via VOP_READ. */ error = VOP_BMAP(vp, foff / bsize, &bo, &bp->b_blkno, &after, &before); if (error == EOPNOTSUPP) { uma_zfree(vnode_pbuf_zone, bp); VM_OBJECT_WLOCK(object); for (i = 0; i < count; i++) { VM_CNT_INC(v_vnodein); VM_CNT_INC(v_vnodepgsin); error = vnode_pager_input_old(object, m[i]); if (error) break; } VM_OBJECT_WUNLOCK(object); return (error); } else if (error != 0) { uma_zfree(vnode_pbuf_zone, bp); return (VM_PAGER_ERROR); } /* * If the file system supports BMAP, but blocksize is smaller * than a page size, then use special small filesystem code. */ if (pagesperblock == 0) { uma_zfree(vnode_pbuf_zone, bp); for (i = 0; i < count; i++) { VM_CNT_INC(v_vnodein); VM_CNT_INC(v_vnodepgsin); error = vnode_pager_input_smlfs(object, m[i]); if (error) break; } return (error); } /* * A sparse file can be encountered only for a single page request, * which may not be preceded by call to vm_pager_haspage(). */ if (bp->b_blkno == -1) { KASSERT(count == 1, ("%s: array[%d] request to a sparse file %p", __func__, count, vp)); uma_zfree(vnode_pbuf_zone, bp); pmap_zero_page(m[0]); KASSERT(m[0]->dirty == 0, ("%s: page %p is dirty", __func__, m[0])); vm_page_valid(m[0]); return (VM_PAGER_OK); } #ifdef INVARIANTS blkno0 = bp->b_blkno; #endif bp->b_blkno += (foff % bsize) / DEV_BSIZE; /* Recalculate blocks available after/before to pages. */ poff = (foff % bsize) / PAGE_SIZE; before *= pagesperblock; before += poff; after *= pagesperblock; after += pagesperblock - (poff + 1); if (m[0]->pindex + after >= object->size) after = object->size - 1 - m[0]->pindex; KASSERT(count <= after + 1, ("%s: %d pages asked, can do only %d", __func__, count, after + 1)); after -= count - 1; /* Trim requested rbehind/rahead to possible values. */ rbehind = a_rbehind ? *a_rbehind : 0; rahead = a_rahead ? *a_rahead : 0; rbehind = min(rbehind, before); rbehind = min(rbehind, m[0]->pindex); rahead = min(rahead, after); rahead = min(rahead, object->size - m[count - 1]->pindex); /* * Check that total amount of pages fit into buf. Trim rbehind and * rahead evenly if not. */ if (rbehind + rahead + count > atop(maxphys)) { int trim, sum; trim = rbehind + rahead + count - atop(maxphys) + 1; sum = rbehind + rahead; if (rbehind == before) { /* Roundup rbehind trim to block size. */ rbehind -= roundup(trim * rbehind / sum, pagesperblock); if (rbehind < 0) rbehind = 0; } else rbehind -= trim * rbehind / sum; rahead -= trim * rahead / sum; } KASSERT(rbehind + rahead + count <= atop(maxphys), ("%s: behind %d ahead %d count %d maxphys %lu", __func__, rbehind, rahead, count, maxphys)); /* * Fill in the bp->b_pages[] array with requested and optional * read behind or read ahead pages. Read behind pages are looked * up in a backward direction, down to a first cached page. Same * for read ahead pages, but there is no need to shift the array * in case of encountering a cached page. */ i = bp->b_npages = 0; if (rbehind) { vm_pindex_t startpindex, tpindex; vm_page_t p; VM_OBJECT_WLOCK(object); startpindex = m[0]->pindex - rbehind; if ((p = TAILQ_PREV(m[0], pglist, listq)) != NULL && p->pindex >= startpindex) startpindex = p->pindex + 1; /* tpindex is unsigned; beware of numeric underflow. */ for (tpindex = m[0]->pindex - 1; tpindex >= startpindex && tpindex < m[0]->pindex; tpindex--, i++) { p = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL); if (p == NULL) { /* Shift the array. */ for (int j = 0; j < i; j++) bp->b_pages[j] = bp->b_pages[j + tpindex + 1 - startpindex]; break; } bp->b_pages[tpindex - startpindex] = p; } bp->b_pgbefore = i; bp->b_npages += i; bp->b_blkno -= IDX_TO_OFF(i) / DEV_BSIZE; } else bp->b_pgbefore = 0; /* Requested pages. */ for (int j = 0; j < count; j++, i++) bp->b_pages[i] = m[j]; bp->b_npages += count; if (rahead) { vm_pindex_t endpindex, tpindex; vm_page_t p; if (!VM_OBJECT_WOWNED(object)) VM_OBJECT_WLOCK(object); endpindex = m[count - 1]->pindex + rahead + 1; if ((p = TAILQ_NEXT(m[count - 1], listq)) != NULL && p->pindex < endpindex) endpindex = p->pindex; if (endpindex > object->size) endpindex = object->size; for (tpindex = m[count - 1]->pindex + 1; tpindex < endpindex; i++, tpindex++) { p = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL); if (p == NULL) break; bp->b_pages[i] = p; } bp->b_pgafter = i - bp->b_npages; bp->b_npages = i; } else bp->b_pgafter = 0; if (VM_OBJECT_WOWNED(object)) VM_OBJECT_WUNLOCK(object); /* Report back actual behind/ahead read. */ if (a_rbehind) *a_rbehind = bp->b_pgbefore; if (a_rahead) *a_rahead = bp->b_pgafter; #ifdef INVARIANTS KASSERT(bp->b_npages <= atop(maxphys), ("%s: buf %p overflowed", __func__, bp)); for (int j = 1, prev = 0; j < bp->b_npages; j++) { if (bp->b_pages[j] == bogus_page) continue; KASSERT(bp->b_pages[j]->pindex - bp->b_pages[prev]->pindex == j - prev, ("%s: pages array not consecutive, bp %p", __func__, bp)); prev = j; } #endif /* * Recalculate first offset and bytecount with regards to read behind. * Truncate bytecount to vnode real size and round up physical size * for real devices. */ foff = IDX_TO_OFF(bp->b_pages[0]->pindex); bytecount = bp->b_npages << PAGE_SHIFT; if ((foff + bytecount) > object->un_pager.vnp.vnp_size) bytecount = object->un_pager.vnp.vnp_size - foff; secmask = bo->bo_bsize - 1; KASSERT(secmask < PAGE_SIZE && secmask > 0, ("%s: sector size %d too large", __func__, secmask + 1)); bytecount = (bytecount + secmask) & ~secmask; /* * And map the pages to be read into the kva, if the filesystem * requires mapped buffers. */ if ((vp->v_mount->mnt_kern_flag & MNTK_UNMAPPED_BUFS) != 0 && unmapped_buf_allowed) { bp->b_data = unmapped_buf; bp->b_offset = 0; } else { bp->b_data = bp->b_kvabase; pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages); } /* Build a minimal buffer header. */ bp->b_iocmd = BIO_READ; KASSERT(bp->b_rcred == NOCRED, ("leaking read ucred")); KASSERT(bp->b_wcred == NOCRED, ("leaking write ucred")); bp->b_rcred = crhold(curthread->td_ucred); bp->b_wcred = crhold(curthread->td_ucred); pbgetbo(bo, bp); bp->b_vp = vp; - bp->b_bcount = bp->b_bufsize = bp->b_runningbufspace = bytecount; + bp->b_bcount = bp->b_bufsize = bytecount; bp->b_iooffset = dbtob(bp->b_blkno); KASSERT(IDX_TO_OFF(m[0]->pindex - bp->b_pages[0]->pindex) == (blkno0 - bp->b_blkno) * DEV_BSIZE + IDX_TO_OFF(m[0]->pindex) % bsize, ("wrong offsets bsize %d m[0] %ju b_pages[0] %ju " "blkno0 %ju b_blkno %ju", bsize, (uintmax_t)m[0]->pindex, (uintmax_t)bp->b_pages[0]->pindex, (uintmax_t)blkno0, (uintmax_t)bp->b_blkno)); - atomic_add_long(&runningbufspace, bp->b_runningbufspace); + (void)runningbufclaim(bp, bp->b_bufsize); + VM_CNT_INC(v_vnodein); VM_CNT_ADD(v_vnodepgsin, bp->b_npages); if (iodone != NULL) { /* async */ bp->b_pgiodone = iodone; bp->b_caller1 = arg; bp->b_iodone = vnode_pager_generic_getpages_done_async; bp->b_flags |= B_ASYNC; BUF_KERNPROC(bp); bstrategy(bp); return (VM_PAGER_OK); } else { bp->b_iodone = bdone; bstrategy(bp); bwait(bp, PVM, "vnread"); error = vnode_pager_generic_getpages_done(bp); for (i = 0; i < bp->b_npages; i++) bp->b_pages[i] = NULL; bp->b_vp = NULL; pbrelbo(bp); uma_zfree(vnode_pbuf_zone, bp); return (error != 0 ? VM_PAGER_ERROR : VM_PAGER_OK); } } static void vnode_pager_generic_getpages_done_async(struct buf *bp) { int error; error = vnode_pager_generic_getpages_done(bp); /* Run the iodone upon the requested range. */ bp->b_pgiodone(bp->b_caller1, bp->b_pages + bp->b_pgbefore, bp->b_npages - bp->b_pgbefore - bp->b_pgafter, error); for (int i = 0; i < bp->b_npages; i++) bp->b_pages[i] = NULL; bp->b_vp = NULL; pbrelbo(bp); uma_zfree(vnode_pbuf_zone, bp); } static int vnode_pager_generic_getpages_done(struct buf *bp) { vm_object_t object; off_t tfoff, nextoff; int i, error; KASSERT((bp->b_ioflags & BIO_ERROR) == 0 || bp->b_error != 0, ("%s: buf error but b_error == 0\n", __func__)); error = (bp->b_ioflags & BIO_ERROR) != 0 ? bp->b_error : 0; object = bp->b_vp->v_object; runningbufwakeup(bp); if (error == 0 && bp->b_bcount != bp->b_npages * PAGE_SIZE) { if (!buf_mapped(bp)) { bp->b_data = bp->b_kvabase; pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages); } bzero(bp->b_data + bp->b_bcount, PAGE_SIZE * bp->b_npages - bp->b_bcount); } if (buf_mapped(bp)) { pmap_qremove((vm_offset_t)bp->b_data, bp->b_npages); bp->b_data = unmapped_buf; } /* * If the read failed, we must free any read ahead/behind pages here. * The requested pages are freed by the caller (for sync requests) * or by the bp->b_pgiodone callback (for async requests). */ if (error != 0) { VM_OBJECT_WLOCK(object); for (i = 0; i < bp->b_pgbefore; i++) vm_page_free_invalid(bp->b_pages[i]); for (i = bp->b_npages - bp->b_pgafter; i < bp->b_npages; i++) vm_page_free_invalid(bp->b_pages[i]); VM_OBJECT_WUNLOCK(object); return (error); } /* Read lock to protect size. */ VM_OBJECT_RLOCK(object); for (i = 0, tfoff = IDX_TO_OFF(bp->b_pages[0]->pindex); i < bp->b_npages; i++, tfoff = nextoff) { vm_page_t mt; nextoff = tfoff + PAGE_SIZE; mt = bp->b_pages[i]; if (mt == bogus_page) continue; if (nextoff <= object->un_pager.vnp.vnp_size) { /* * Read filled up entire page. */ vm_page_valid(mt); KASSERT(mt->dirty == 0, ("%s: page %p is dirty", __func__, mt)); KASSERT(!pmap_page_is_mapped(mt), ("%s: page %p is mapped", __func__, mt)); } else { /* * Read did not fill up entire page. * * Currently we do not set the entire page valid, * we just try to clear the piece that we couldn't * read. */ vm_page_set_valid_range(mt, 0, object->un_pager.vnp.vnp_size - tfoff); KASSERT((mt->dirty & vm_page_bits(0, object->un_pager.vnp.vnp_size - tfoff)) == 0, ("%s: page %p is dirty", __func__, mt)); } if (i < bp->b_pgbefore || i >= bp->b_npages - bp->b_pgafter) vm_page_readahead_finish(mt); } VM_OBJECT_RUNLOCK(object); return (error); } /* * EOPNOTSUPP is no longer legal. For local media VFS's that do not * implement their own VOP_PUTPAGES, their VOP_PUTPAGES should call to * vnode_pager_generic_putpages() to implement the previous behaviour. * * All other FS's should use the bypass to get to the local media * backing vp's VOP_PUTPAGES. */ static void vnode_pager_putpages(vm_object_t object, vm_page_t *m, int count, int flags, int *rtvals) { int rtval __diagused; struct vnode *vp; int bytes = count * PAGE_SIZE; /* * Force synchronous operation if we are extremely low on memory * to prevent a low-memory deadlock. VOP operations often need to * allocate more memory to initiate the I/O ( i.e. do a BMAP * operation ). The swapper handles the case by limiting the amount * of asynchronous I/O, but that sort of solution doesn't scale well * for the vnode pager without a lot of work. * * Also, the backing vnode's iodone routine may not wake the pageout * daemon up. This should be probably be addressed XXX. */ if (vm_page_count_min()) flags |= VM_PAGER_PUT_SYNC; /* * Call device-specific putpages function */ vp = object->handle; VM_OBJECT_WUNLOCK(object); rtval = VOP_PUTPAGES(vp, m, bytes, flags, rtvals); KASSERT(rtval != EOPNOTSUPP, ("vnode_pager: stale FS putpages\n")); VM_OBJECT_WLOCK(object); } static int vn_off2bidx(vm_ooffset_t offset) { return ((offset & PAGE_MASK) / DEV_BSIZE); } static bool vn_dirty_blk(vm_page_t m, vm_ooffset_t offset) { KASSERT(IDX_TO_OFF(m->pindex) <= offset && offset < IDX_TO_OFF(m->pindex + 1), ("page %p pidx %ju offset %ju", m, (uintmax_t)m->pindex, (uintmax_t)offset)); return ((m->dirty & ((vm_page_bits_t)1 << vn_off2bidx(offset))) != 0); } /* * This is now called from local media FS's to operate against their * own vnodes if they fail to implement VOP_PUTPAGES. * * This is typically called indirectly via the pageout daemon and * clustering has already typically occurred, so in general we ask the * underlying filesystem to write the data out asynchronously rather * then delayed. */ int vnode_pager_generic_putpages(struct vnode *vp, vm_page_t *ma, int bytecount, int flags, int *rtvals) { vm_object_t object; vm_page_t m; vm_ooffset_t max_offset, next_offset, poffset, prev_offset; struct uio auio; struct iovec aiov; off_t prev_resid, wrsz; int count, error, i, maxsize, ncount, pgoff, ppscheck; bool in_hole; static struct timeval lastfail; static int curfail; object = vp->v_object; count = bytecount / PAGE_SIZE; for (i = 0; i < count; i++) rtvals[i] = VM_PAGER_ERROR; if ((int64_t)ma[0]->pindex < 0) { printf("vnode_pager_generic_putpages: " "attempt to write meta-data 0x%jx(%lx)\n", (uintmax_t)ma[0]->pindex, (u_long)ma[0]->dirty); rtvals[0] = VM_PAGER_BAD; return (VM_PAGER_BAD); } maxsize = count * PAGE_SIZE; ncount = count; poffset = IDX_TO_OFF(ma[0]->pindex); /* * If the page-aligned write is larger then the actual file we * have to invalidate pages occurring beyond the file EOF. However, * there is an edge case where a file may not be page-aligned where * the last page is partially invalid. In this case the filesystem * may not properly clear the dirty bits for the entire page (which * could be VM_PAGE_BITS_ALL due to the page having been mmap()d). * With the page busied we are free to fix up the dirty bits here. * * We do not under any circumstances truncate the valid bits, as * this will screw up bogus page replacement. */ VM_OBJECT_RLOCK(object); if (maxsize + poffset > object->un_pager.vnp.vnp_size) { if (object->un_pager.vnp.vnp_size > poffset) { maxsize = object->un_pager.vnp.vnp_size - poffset; ncount = btoc(maxsize); if ((pgoff = (int)maxsize & PAGE_MASK) != 0) { pgoff = roundup2(pgoff, DEV_BSIZE); /* * If the page is busy and the following * conditions hold, then the page's dirty * field cannot be concurrently changed by a * pmap operation. */ m = ma[ncount - 1]; vm_page_assert_sbusied(m); KASSERT(!pmap_page_is_write_mapped(m), ("vnode_pager_generic_putpages: page %p is not read-only", m)); MPASS(m->dirty != 0); vm_page_clear_dirty(m, pgoff, PAGE_SIZE - pgoff); } } else { maxsize = 0; ncount = 0; } for (i = ncount; i < count; i++) rtvals[i] = VM_PAGER_BAD; } VM_OBJECT_RUNLOCK(object); auio.uio_iov = &aiov; auio.uio_segflg = UIO_NOCOPY; auio.uio_rw = UIO_WRITE; auio.uio_td = NULL; max_offset = roundup2(poffset + maxsize, DEV_BSIZE); for (prev_offset = poffset; prev_offset < max_offset;) { /* Skip clean blocks. */ for (in_hole = true; in_hole && prev_offset < max_offset;) { m = ma[OFF_TO_IDX(prev_offset - poffset)]; for (i = vn_off2bidx(prev_offset); i < sizeof(vm_page_bits_t) * NBBY && prev_offset < max_offset; i++) { if (vn_dirty_blk(m, prev_offset)) { in_hole = false; break; } prev_offset += DEV_BSIZE; } } if (in_hole) goto write_done; /* Find longest run of dirty blocks. */ for (next_offset = prev_offset; next_offset < max_offset;) { m = ma[OFF_TO_IDX(next_offset - poffset)]; for (i = vn_off2bidx(next_offset); i < sizeof(vm_page_bits_t) * NBBY && next_offset < max_offset; i++) { if (!vn_dirty_blk(m, next_offset)) goto start_write; next_offset += DEV_BSIZE; } } start_write: if (next_offset > poffset + maxsize) next_offset = poffset + maxsize; if (prev_offset == next_offset) goto write_done; /* * Getting here requires finding a dirty block in the * 'skip clean blocks' loop. */ aiov.iov_base = NULL; auio.uio_iovcnt = 1; auio.uio_offset = prev_offset; prev_resid = auio.uio_resid = aiov.iov_len = next_offset - prev_offset; error = VOP_WRITE(vp, &auio, vnode_pager_putpages_ioflags(flags), curthread->td_ucred); wrsz = prev_resid - auio.uio_resid; if (wrsz == 0) { if (ppsratecheck(&lastfail, &curfail, 1) != 0) { vn_printf(vp, "vnode_pager_putpages: " "zero-length write at %ju resid %zd\n", auio.uio_offset, auio.uio_resid); } break; } /* Adjust the starting offset for next iteration. */ prev_offset += wrsz; MPASS(auio.uio_offset == prev_offset); ppscheck = 0; if (error != 0 && (ppscheck = ppsratecheck(&lastfail, &curfail, 1)) != 0) vn_printf(vp, "vnode_pager_putpages: I/O error %d\n", error); if (auio.uio_resid != 0 && (ppscheck != 0 || ppsratecheck(&lastfail, &curfail, 1) != 0)) vn_printf(vp, "vnode_pager_putpages: residual I/O %zd " "at %ju\n", auio.uio_resid, (uintmax_t)ma[0]->pindex); if (error != 0 || auio.uio_resid != 0) break; } write_done: /* Mark completely processed pages. */ for (i = 0; i < OFF_TO_IDX(prev_offset - poffset); i++) rtvals[i] = VM_PAGER_OK; /* Mark partial EOF page. */ if (prev_offset == poffset + maxsize && (prev_offset & PAGE_MASK) != 0) rtvals[i++] = VM_PAGER_OK; /* Unwritten pages in range, free bonus if the page is clean. */ for (; i < ncount; i++) rtvals[i] = ma[i]->dirty == 0 ? VM_PAGER_OK : VM_PAGER_ERROR; VM_CNT_ADD(v_vnodepgsout, i); VM_CNT_INC(v_vnodeout); return (rtvals[0]); } int vnode_pager_putpages_ioflags(int pager_flags) { int ioflags; /* * Pageouts are already clustered, use IO_ASYNC to force a * bawrite() rather then a bdwrite() to prevent paging I/O * from saturating the buffer cache. Dummy-up the sequential * heuristic to cause large ranges to cluster. If neither * IO_SYNC or IO_ASYNC is set, the system decides how to * cluster. */ ioflags = IO_VMIO; if ((pager_flags & (VM_PAGER_PUT_SYNC | VM_PAGER_PUT_INVAL)) != 0) ioflags |= IO_SYNC; else if ((pager_flags & VM_PAGER_CLUSTER_OK) == 0) ioflags |= IO_ASYNC; ioflags |= (pager_flags & VM_PAGER_PUT_INVAL) != 0 ? IO_INVAL: 0; ioflags |= (pager_flags & VM_PAGER_PUT_NOREUSE) != 0 ? IO_NOREUSE : 0; ioflags |= IO_SEQMAX << IO_SEQSHIFT; return (ioflags); } /* * vnode_pager_undirty_pages(). * * A helper to mark pages as clean after pageout that was possibly * done with a short write. The lpos argument specifies the page run * length in bytes, and the written argument specifies how many bytes * were actually written. eof is the offset past the last valid byte * in the vnode using the absolute file position of the first byte in * the run as the base from which it is computed. */ void vnode_pager_undirty_pages(vm_page_t *ma, int *rtvals, int written, off_t eof, int lpos) { int i, pos, pos_devb; if (written == 0 && eof >= lpos) return; for (i = 0, pos = 0; pos < written; i++, pos += PAGE_SIZE) { if (pos < trunc_page(written)) { rtvals[i] = VM_PAGER_OK; vm_page_undirty(ma[i]); } else { /* Partially written page. */ rtvals[i] = VM_PAGER_AGAIN; vm_page_clear_dirty(ma[i], 0, written & PAGE_MASK); } } if (eof >= lpos) /* avoid truncation */ return; for (pos = eof, i = OFF_TO_IDX(trunc_page(pos)); pos < lpos; i++) { if (pos != trunc_page(pos)) { /* * The page contains the last valid byte in * the vnode, mark the rest of the page as * clean, potentially making the whole page * clean. */ pos_devb = roundup2(pos & PAGE_MASK, DEV_BSIZE); vm_page_clear_dirty(ma[i], pos_devb, PAGE_SIZE - pos_devb); /* * If the page was cleaned, report the pageout * on it as successful. msync() no longer * needs to write out the page, endlessly * creating write requests and dirty buffers. */ if (ma[i]->dirty == 0) rtvals[i] = VM_PAGER_OK; pos = round_page(pos); } else { /* vm_pageout_flush() clears dirty */ rtvals[i] = VM_PAGER_BAD; pos += PAGE_SIZE; } } } static void vnode_pager_update_writecount(vm_object_t object, vm_offset_t start, vm_offset_t end) { struct vnode *vp; vm_ooffset_t old_wm; VM_OBJECT_WLOCK(object); if (object->type != OBJT_VNODE) { VM_OBJECT_WUNLOCK(object); return; } old_wm = object->un_pager.vnp.writemappings; object->un_pager.vnp.writemappings += (vm_ooffset_t)end - start; vp = object->handle; if (old_wm == 0 && object->un_pager.vnp.writemappings != 0) { ASSERT_VOP_LOCKED(vp, "v_writecount inc"); VOP_ADD_WRITECOUNT_CHECKED(vp, 1); CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d", __func__, vp, vp->v_writecount); } else if (old_wm != 0 && object->un_pager.vnp.writemappings == 0) { ASSERT_VOP_LOCKED(vp, "v_writecount dec"); VOP_ADD_WRITECOUNT_CHECKED(vp, -1); CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d", __func__, vp, vp->v_writecount); } VM_OBJECT_WUNLOCK(object); } static void vnode_pager_release_writecount(vm_object_t object, vm_offset_t start, vm_offset_t end) { struct vnode *vp; struct mount *mp; vm_offset_t inc; VM_OBJECT_WLOCK(object); /* * First, recheck the object type to account for the race when * the vnode is reclaimed. */ if (object->type != OBJT_VNODE) { VM_OBJECT_WUNLOCK(object); return; } /* * Optimize for the case when writemappings is not going to * zero. */ inc = end - start; if (object->un_pager.vnp.writemappings != inc) { object->un_pager.vnp.writemappings -= inc; VM_OBJECT_WUNLOCK(object); return; } vp = object->handle; vhold(vp); VM_OBJECT_WUNLOCK(object); mp = NULL; vn_start_write(vp, &mp, V_WAIT); vn_lock(vp, LK_SHARED | LK_RETRY); /* * Decrement the object's writemappings, by swapping the start * and end arguments for vnode_pager_update_writecount(). If * there was not a race with vnode reclaimation, then the * vnode's v_writecount is decremented. */ vnode_pager_update_writecount(object, end, start); VOP_UNLOCK(vp); vdrop(vp); if (mp != NULL) vn_finished_write(mp); } static void vnode_pager_getvp(vm_object_t object, struct vnode **vpp, bool *vp_heldp) { *vpp = object->handle; } static void vnode_pager_clean1(struct vnode *vp, int sync_flags) { struct vm_object *obj; ASSERT_VOP_LOCKED(vp, "needs lock for writes"); obj = vp->v_object; if (obj == NULL) return; VM_OBJECT_WLOCK(obj); vm_object_page_clean(obj, 0, 0, sync_flags); VM_OBJECT_WUNLOCK(obj); } void vnode_pager_clean_sync(struct vnode *vp) { vnode_pager_clean1(vp, OBJPC_SYNC); } void vnode_pager_clean_async(struct vnode *vp) { vnode_pager_clean1(vp, 0); }