Index: head/sys/kern/vfs_bio.c =================================================================== --- head/sys/kern/vfs_bio.c +++ head/sys/kern/vfs_bio.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include #include @@ -105,7 +106,6 @@ /* Used below and for softdep flushing threads in ufs/ffs/ffs_softdep.c */ struct proc *bufdaemonproc; -struct proc *bufspacedaemonproc; static int inmem(struct vnode *vp, daddr_t blkno); static void vm_hold_free_pages(struct buf *bp, int newbsize); @@ -124,11 +124,8 @@ static void breada(struct vnode *, daddr_t *, int *, int, struct ucred *, int, void (*)(struct buf *)); static int buf_flush(struct vnode *vp, int); -static int buf_recycle(bool); -static int buf_scan(bool); static int flushbufqueues(struct vnode *, int, int); static void buf_daemon(void); -static void bremfreel(struct buf *bp); static __inline void bd_wakeup(void); static int sysctl_runningspace(SYSCTL_HANDLER_ARGS); static void bufkva_reclaim(vmem_t *, int); @@ -137,28 +134,17 @@ static void buf_release(void *, void **, int); static void maxbcachebuf_adjust(void); -#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ - defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) static int sysctl_bufspace(SYSCTL_HANDLER_ARGS); -#endif - int vmiodirenable = TRUE; SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0, "Use the VM system for directory writes"); long runningbufspace; SYSCTL_LONG(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0, "Amount of presently outstanding async buffer io"); -static long bufspace; -#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ - defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) SYSCTL_PROC(_vfs, OID_AUTO, bufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RD, - &bufspace, 0, sysctl_bufspace, "L", "Virtual memory used for buffers"); -#else -SYSCTL_LONG(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0, - "Physical memory used for buffers"); -#endif -static long bufkvaspace; -SYSCTL_LONG(_vfs, OID_AUTO, bufkvaspace, CTLFLAG_RD, &bufkvaspace, 0, + NULL, 0, sysctl_bufspace, "L", "Physical memory used for buffers"); +static counter_u64_t bufkvaspace; +SYSCTL_COUNTER_U64(_vfs, OID_AUTO, bufkvaspace, CTLFLAG_RD, &bufkvaspace, "Kernel virtual memory used for buffers"); static long maxbufspace; SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RW, &maxbufspace, 0, @@ -178,11 +164,11 @@ long bufspacethresh; SYSCTL_LONG(_vfs, OID_AUTO, bufspacethresh, CTLFLAG_RW, &bufspacethresh, 0, "Bufspace consumed before waking the daemon to free some"); -static int buffreekvacnt; -SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, 0, +static counter_u64_t buffreekvacnt; +SYSCTL_COUNTER_U64(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, "Number of times we have freed the KVA space from some buffer"); -static int bufdefragcnt; -SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, 0, +static counter_u64_t bufdefragcnt; +SYSCTL_COUNTER_U64(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, "Number of times we have had to repeat buffer allocation to defragment"); static long lorunningspace; SYSCTL_PROC(_vfs, OID_AUTO, lorunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE | @@ -225,24 +211,26 @@ static int hifreebuffers; SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0, "Threshold for clean buffer recycling"); -static int getnewbufcalls; -SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0, - "Number of calls to getnewbuf"); 
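The hunks above convert hot-path statistics (getnewbufcalls, bufkvaspace, buffreekvacnt, bufdefragcnt, and the rest) from plain ints/longs updated with atomics into counter(9) counters, which bump a per-CPU slot and never write a shared cache line; the cost is that reads (the sysctl) must sum all CPUs, and the storage has to be allocated at run time, which this patch does in bufinit(). A minimal sketch of the pattern with a hypothetical vfs.foo_count statistic (foo_count, foo_init() and foo_happened() are stand-ins, not part of the patch):

    #include <sys/param.h>
    #include <sys/counter.h>
    #include <sys/malloc.h>
    #include <sys/sysctl.h>

    static counter_u64_t foo_count;
    SYSCTL_COUNTER_U64(_vfs, OID_AUTO, foo_count, CTLFLAG_RD,
        &foo_count, "Number of times foo happened");

    static void
    foo_init(void)
    {
            /* Per-CPU storage must be allocated before the first add. */
            foo_count = counter_u64_alloc(M_WAITOK);
    }

    static void
    foo_happened(void)
    {
            /* Per-CPU increment; no shared cache line is dirtied. */
            counter_u64_add(foo_count, 1);
    }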
-static int getnewbufrestarts; -SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0, +static counter_u64_t getnewbufcalls; +SYSCTL_COUNTER_U64(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RD, + &getnewbufcalls, "Number of calls to getnewbuf"); +static counter_u64_t getnewbufrestarts; +SYSCTL_COUNTER_U64(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RD, + &getnewbufrestarts, "Number of times getnewbuf has had to restart a buffer acquisition"); -static int mappingrestarts; -SYSCTL_INT(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RW, &mappingrestarts, 0, +static counter_u64_t mappingrestarts; +SYSCTL_COUNTER_U64(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RD, + &mappingrestarts, "Number of times getblk has had to restart a buffer mapping for " "unmapped buffer"); -static int numbufallocfails; -SYSCTL_INT(_vfs, OID_AUTO, numbufallocfails, CTLFLAG_RW, &numbufallocfails, 0, - "Number of times buffer allocations failed"); +static counter_u64_t numbufallocfails; +SYSCTL_COUNTER_U64(_vfs, OID_AUTO, numbufallocfails, CTLFLAG_RW, + &numbufallocfails, "Number of times buffer allocations failed"); static int flushbufqtarget = 100; SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0, "Amount of work to do in flushbufqueues when helping bufdaemon"); -static long notbufdflushes; -SYSCTL_LONG(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, ¬bufdflushes, 0, +static counter_u64_t notbufdflushes; +SYSCTL_COUNTER_U64(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, ¬bufdflushes, "Number of dirty buffer flushes done by the bufdaemon helpers"); static long barrierwrites; SYSCTL_LONG(_vfs, OID_AUTO, barrierwrites, CTLFLAG_RW, &barrierwrites, 0, @@ -266,11 +254,6 @@ static struct mtx_padalign __exclusive_cache_line rbreqlock; /* - * Lock that protects needsbuffer and the sleeps/wakeups surrounding it. - */ -static struct rwlock_padalign __exclusive_cache_line nblock; - -/* * Lock that protects bdirtywait. */ static struct mtx_padalign __exclusive_cache_line bdirtylock; @@ -283,11 +266,6 @@ static int bd_request; /* - * Request/wakeup point for the bufspace daemon. - */ -static int bufspace_request; - -/* * Request for the buf daemon to write more buffers than is indicated by * lodirtybuf. This may be necessary to push out excess dependencies or * defragment the address space where a simple count of the number of dirty @@ -302,15 +280,6 @@ */ static int runningbufreq; -/* - * Synchronization (sleep/wakeup) variable for buffer requests. - * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done - * by and/or. - * Used in numdirtywakeup(), bufspace_wakeup(), bwillwrite(), - * getnewbuf(), and getblk(). - */ -static volatile int needsbuffer; - /* * Synchronization for bwillwrite() waiters. */ @@ -323,29 +292,69 @@ #define QUEUE_EMPTY 1 /* empty buffer headers */ #define QUEUE_DIRTY 2 /* B_DELWRI buffers */ #define QUEUE_CLEAN 3 /* non-B_DELWRI buffers */ -#define QUEUE_SENTINEL 1024 /* not an queue index, but mark for sentinel */ +#define QUEUE_SENTINEL 4 /* not an queue index, but mark for sentinel */ -/* Maximum number of clean buffer queues. 
*/ -#define CLEAN_QUEUES 16 +struct bufqueue { + struct mtx_padalign bq_lock; + TAILQ_HEAD(, buf) bq_queue; + uint8_t bq_index; + uint16_t bq_subqueue; + int bq_len; +} __aligned(CACHE_LINE_SIZE); +#define BQ_LOCKPTR(bq) (&(bq)->bq_lock) +#define BQ_LOCK(bq) mtx_lock(BQ_LOCKPTR((bq))) +#define BQ_UNLOCK(bq) mtx_unlock(BQ_LOCKPTR((bq))) +#define BQ_ASSERT_LOCKED(bq) mtx_assert(BQ_LOCKPTR((bq)), MA_OWNED) + +struct bufqueue __exclusive_cache_line bqempty; +struct bufqueue __exclusive_cache_line bqdirty; + +struct bufdomain { + struct bufqueue bd_subq[MAXCPU + 1]; /* Per-cpu sub queues + global */ + struct bufqueue *bd_cleanq; + struct mtx_padalign bd_run_lock; + /* Constants */ + long bd_maxbufspace; + long bd_hibufspace; + long bd_lobufspace; + long bd_bufspacethresh; + int bd_hifreebuffers; + int bd_lofreebuffers; + int bd_lim; + /* atomics */ + int bd_wanted; + int __aligned(CACHE_LINE_SIZE) bd_running; + long __aligned(CACHE_LINE_SIZE) bd_bufspace; + int __aligned(CACHE_LINE_SIZE) bd_freebuffers; +} __aligned(CACHE_LINE_SIZE); + +#define BD_LOCKPTR(bd) (&(bd)->bd_cleanq->bq_lock) +#define BD_LOCK(bd) mtx_lock(BD_LOCKPTR((bd))) +#define BD_UNLOCK(bd) mtx_unlock(BD_LOCKPTR((bd))) +#define BD_ASSERT_LOCKED(bd) mtx_assert(BD_LOCKPTR((bd)), MA_OWNED) +#define BD_RUN_LOCKPTR(bd) (&(bd)->bd_run_lock) +#define BD_RUN_LOCK(bd) mtx_lock(BD_RUN_LOCKPTR((bd))) +#define BD_RUN_UNLOCK(bd) mtx_unlock(BD_RUN_LOCKPTR((bd))) +#define BD_DOMAIN(bd) (bd - bdclean) + +/* Maximum number of clean buffer domains. */ +#define CLEAN_DOMAINS 8 + /* Configured number of clean queues. */ -static int clean_queues; +static int __read_mostly clean_domains; -/* Maximum number of buffer queues. */ -#define BUFFER_QUEUES (QUEUE_CLEAN + CLEAN_QUEUES) +struct bufdomain __exclusive_cache_line bdclean[CLEAN_DOMAINS]; -/* Queues for free buffers with various properties */ -static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } }; -#ifdef INVARIANTS -static int bq_len[BUFFER_QUEUES]; -#endif +static void bq_remove(struct bufqueue *bq, struct buf *bp); +static void bq_insert(struct bufqueue *bq, struct buf *bp, bool unlock); +static int buf_recycle(struct bufdomain *, bool kva); +static void bq_init(struct bufqueue *bq, int qindex, int cpu, + const char *lockname); +static void bd_init(struct bufdomain *bd); +static int bd_flushall(struct bufdomain *bd); /* - * Lock for each bufqueue - */ -static struct mtx_padalign __exclusive_cache_line bqlocks[BUFFER_QUEUES]; - -/* * per-cpu empty buffer cache. */ uma_zone_t buf_zone; @@ -391,46 +400,34 @@ { long lvalue; int ivalue; + int i; + lvalue = 0; + for (i = 0; i < clean_domains; i++) + lvalue += bdclean[i].bd_bufspace; if (sizeof(int) == sizeof(long) || req->oldlen >= sizeof(long)) - return (sysctl_handle_long(oidp, arg1, arg2, req)); - lvalue = *(long *)arg1; + return (sysctl_handle_long(oidp, &lvalue, 0, req)); if (lvalue > INT_MAX) /* On overflow, still write out a long to trigger ENOMEM. 
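With space accounting split per domain, vfs.bufspace becomes a computed value: the handler sums bdclean[i].bd_bufspace over all clean domains on every read (the COMPAT variant above and the plain variant that follows both do this). Nothing changes for consumers; the OID still reports one number. A small userland sketch (hypothetical consumer, not part of the patch):

    #include <sys/types.h>
    #include <sys/sysctl.h>
    #include <err.h>
    #include <stdio.h>

    int
    main(void)
    {
            long bufspace = 0;
            size_t len = sizeof(bufspace);

            /* The kernel sums the per-domain bd_bufspace fields on demand. */
            if (sysctlbyname("vfs.bufspace", &bufspace, &len, NULL, 0) == -1)
                    err(1, "sysctlbyname(vfs.bufspace)");
            printf("buffer cache data space: %ld bytes\n", bufspace);
            return (0);
    }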
*/ return (sysctl_handle_long(oidp, &lvalue, 0, req)); ivalue = lvalue; return (sysctl_handle_int(oidp, &ivalue, 0, req)); } -#endif - +#else static int -bqcleanq(void) +sysctl_bufspace(SYSCTL_HANDLER_ARGS) { - static int nextq; + long lvalue; + int i; - return ((atomic_fetchadd_int(&nextq, 1) % clean_queues) + QUEUE_CLEAN); + lvalue = 0; + for (i = 0; i < clean_domains; i++) + lvalue += bdclean[i].bd_bufspace; + return (sysctl_handle_int(oidp, &lvalue, 0, req)); } +#endif -static int -bqisclean(int qindex) -{ - - return (qindex >= QUEUE_CLEAN && qindex < QUEUE_CLEAN + CLEAN_QUEUES); -} - /* - * bqlock: - * - * Return the appropriate queue lock based on the index. - */ -static inline struct mtx * -bqlock(int qindex) -{ - - return (struct mtx *)&bqlocks[qindex]; -} - -/* * bdirtywakeup: * * Wakeup any bwillwrite() waiters. @@ -481,47 +478,50 @@ } /* - * bufspace_wakeup: + * bufspace_daemon_wakeup: * - * Called when buffer space is potentially available for recovery. - * getnewbuf() will block on this flag when it is unable to free - * sufficient buffer space. Buffer space becomes recoverable when - * bp's get placed back in the queues. + * Wakeup the daemons responsible for freeing clean bufs. */ static void -bufspace_wakeup(void) +bufspace_daemon_wakeup(struct bufdomain *bd) { /* - * If someone is waiting for bufspace, wake them up. - * - * Since needsbuffer is set prior to doing an additional queue - * scan it is safe to check for the flag prior to acquiring the - * lock. The thread that is preparing to scan again before - * blocking would discover the buf we released. + * avoid the lock if the daemon is running. */ - if (needsbuffer) { - rw_rlock(&nblock); - if (atomic_cmpset_int(&needsbuffer, 1, 0) == 1) - wakeup(__DEVOLATILE(void *, &needsbuffer)); - rw_runlock(&nblock); + if (atomic_fetchadd_int(&bd->bd_running, 1) == 0) { + BD_RUN_LOCK(bd); + atomic_store_int(&bd->bd_running, 1); + wakeup(&bd->bd_running); + BD_RUN_UNLOCK(bd); } } /* - * bufspace_daemonwakeup: + * bufspace_daemon_wait: * - * Wakeup the daemon responsible for freeing clean bufs. + * Sleep until the domain falls below a limit or one second passes. */ static void -bufspace_daemonwakeup(void) +bufspace_daemon_wait(struct bufdomain *bd) { - rw_rlock(&nblock); - if (bufspace_request == 0) { - bufspace_request = 1; - wakeup(&bufspace_request); + /* + * Re-check our limits and sleep. bd_running must be + * cleared prior to checking the limits to avoid missed + * wakeups. The waker will adjust one of bufspace or + * freebuffers prior to checking bd_running. + */ + BD_RUN_LOCK(bd); + atomic_store_int(&bd->bd_running, 0); + if (bd->bd_bufspace < bd->bd_bufspacethresh && + bd->bd_freebuffers > bd->bd_lofreebuffers) { + msleep(&bd->bd_running, BD_RUN_LOCKPTR(bd), PRIBIO|PDROP, + "-", hz); + } else { + /* Avoid spurious wakeups while running. */ + atomic_store_int(&bd->bd_running, 1); + BD_RUN_UNLOCK(bd); } - rw_runlock(&nblock); } /* @@ -533,20 +533,22 @@ static void bufspace_adjust(struct buf *bp, int bufsize) { + struct bufdomain *bd; long space; int diff; KASSERT((bp->b_flags & B_MALLOC) == 0, ("bufspace_adjust: malloc buf %p", bp)); + bd = &bdclean[bp->b_domain]; diff = bufsize - bp->b_bufsize; if (diff < 0) { - atomic_subtract_long(&bufspace, -diff); - bufspace_wakeup(); + atomic_subtract_long(&bd->bd_bufspace, -diff); } else { - space = atomic_fetchadd_long(&bufspace, diff); + space = atomic_fetchadd_long(&bd->bd_bufspace, diff); /* Wake up the daemon on the transition. 
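bufspace_daemon_wakeup() and bufspace_daemon_wait() above replace the global needsbuffer/nblock protocol with a per-domain handshake on bd_running: the waker takes the run lock only on the 0 to 1 transition, and the sleeper clears bd_running under that lock before re-checking its limits, so a wakeup racing with the re-check cannot be lost. The same shape on a hypothetical worker object (sketch only; the real condition is the bd_bufspace/bd_freebuffers check above):

    #include <sys/param.h>
    #include <sys/systm.h>
    #include <sys/lock.h>
    #include <sys/mutex.h>
    #include <sys/proc.h>

    struct worker {
            struct mtx      w_lock;
            int             w_running;      /* updated with atomics */
    };

    static void
    worker_wakeup(struct worker *w)
    {
            /* Common case: worker already awake, no lock taken. */
            if (atomic_fetchadd_int(&w->w_running, 1) == 0) {
                    mtx_lock(&w->w_lock);
                    atomic_store_int(&w->w_running, 1);
                    wakeup(&w->w_running);
                    mtx_unlock(&w->w_lock);
            }
    }

    static void
    worker_wait(struct worker *w, bool (*work_pending)(void *), void *arg)
    {
            mtx_lock(&w->w_lock);
            /* Clear the flag before re-checking so a racing wakeup is seen. */
            atomic_store_int(&w->w_running, 0);
            if (!work_pending(arg)) {
                    /* PDROP: msleep() releases w_lock for us. */
                    msleep(&w->w_running, &w->w_lock, PRIBIO | PDROP, "-", hz);
            } else {
                    atomic_store_int(&w->w_running, 1);
                    mtx_unlock(&w->w_lock);
            }
    }

The one-second timeout bounds how long the daemon stays asleep if a wakeup is somehow missed, and PDROP saves an unlock on the sleep path.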
*/ - if (space < bufspacethresh && space + diff >= bufspacethresh) - bufspace_daemonwakeup(); + if (space < bd->bd_bufspacethresh && + space + diff >= bd->bd_bufspacethresh) + bufspace_daemon_wakeup(bd); } bp->b_bufsize = bufsize; } @@ -558,24 +560,25 @@ * different space limit than data. */ static int -bufspace_reserve(int size, bool metadata) +bufspace_reserve(struct bufdomain *bd, int size, bool metadata) { - long limit; + long limit, new; long space; if (metadata) - limit = maxbufspace; + limit = bd->bd_maxbufspace; else - limit = hibufspace; - do { - space = bufspace; - if (space + size > limit) - return (ENOSPC); - } while (atomic_cmpset_long(&bufspace, space, space + size) == 0); + limit = bd->bd_hibufspace; + space = atomic_fetchadd_long(&bd->bd_bufspace, size); + new = space + size; + if (new > limit) { + atomic_subtract_long(&bd->bd_bufspace, size); + return (ENOSPC); + } /* Wake up the daemon on the transition. */ - if (space < bufspacethresh && space + size >= bufspacethresh) - bufspace_daemonwakeup(); + if (space < bd->bd_bufspacethresh && new >= bd->bd_bufspacethresh) + bufspace_daemon_wakeup(bd); return (0); } @@ -586,21 +589,22 @@ * Release reserved bufspace after bufspace_adjust() has consumed it. */ static void -bufspace_release(int size) +bufspace_release(struct bufdomain *bd, int size) { - atomic_subtract_long(&bufspace, size); - bufspace_wakeup(); + + atomic_subtract_long(&bd->bd_bufspace, size); } /* * bufspace_wait: * * Wait for bufspace, acting as the buf daemon if a locked vnode is - * supplied. needsbuffer must be set in a safe fashion prior to - * polling for space. The operation must be re-tried on return. + * supplied. bd_wanted must be set prior to polling for space. The + * operation must be re-tried on return. */ static void -bufspace_wait(struct vnode *vp, int gbflags, int slpflag, int slptimeo) +bufspace_wait(struct bufdomain *bd, struct vnode *vp, int gbflags, + int slpflag, int slptimeo) { struct thread *td; int error, fl, norunbuf; @@ -609,11 +613,11 @@ return; td = curthread; - rw_wlock(&nblock); - while (needsbuffer != 0) { + BD_LOCK(bd); + while (bd->bd_wanted) { if (vp != NULL && vp->v_type != VCHR && (td->td_pflags & TDP_BUFNEED) == 0) { - rw_wunlock(&nblock); + BD_UNLOCK(bd); /* * getblk() is called with a vnode locked, and * some majority of the dirty buffers may as @@ -636,18 +640,18 @@ td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF; fl = buf_flush(vp, flushbufqtarget); td->td_pflags &= norunbuf; - rw_wlock(&nblock); + BD_LOCK(bd); if (fl != 0) continue; - if (needsbuffer == 0) + if (bd->bd_wanted == 0) break; } - error = rw_sleep(__DEVOLATILE(void *, &needsbuffer), &nblock, + error = msleep(&bd->bd_wanted, BD_LOCKPTR(bd), (PRIBIO + 4) | slpflag, "newbuf", slptimeo); if (error != 0) break; } - rw_wunlock(&nblock); + BD_UNLOCK(bd); } @@ -659,10 +663,13 @@ * block nor work to reclaim buffers. */ static void -bufspace_daemon(void) +bufspace_daemon(void *arg) { + struct bufdomain *bd; + + bd = arg; for (;;) { - kproc_suspend_check(bufspacedaemonproc); + kproc_suspend_check(curproc); /* * Free buffers from the clean queue until we meet our @@ -689,46 +696,25 @@ * which will inefficiently trade bufs with bqrelse * until we return to condition 2. 
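bufspace_reserve() above trades the old compare-and-set retry loop for one speculative fetchadd that is rolled back if the limit was overshot, so contention costs at most one extra atomic rather than an unbounded number of retries. The bare pattern, with a stand-alone counter standing in for bd_bufspace (sketch, not the patch's code):

    #include <sys/param.h>
    #include <sys/systm.h>
    #include <machine/atomic.h>

    /* Reserve 'size' out of a budget of 'limit'; 0 on success, else ENOSPC. */
    static int
    space_reserve(volatile u_long *space, u_long limit, u_long size)
    {
            u_long old;

            old = atomic_fetchadd_long(space, size);
            if (old + size > limit) {
                    /* Overshot: back our addition out and fail. */
                    atomic_subtract_long(space, size);
                    return (ENOSPC);
            }
            return (0);
    }

The accounted value can briefly exceed the limit between the add and the rollback; that is tolerable here because the limit is advisory and the bufspace daemon reconciles usage anyway.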
*/ - while (bufspace > lobufspace || - numfreebuffers < hifreebuffers) { - if (buf_recycle(false) != 0) { - atomic_set_int(&needsbuffer, 1); - if (buf_recycle(false) != 0) { - rw_wlock(&nblock); - if (needsbuffer) - rw_sleep(__DEVOLATILE(void *, - &needsbuffer), &nblock, - PRIBIO|PDROP, "bufspace", - hz/10); - else - rw_wunlock(&nblock); - } + do { + if (buf_recycle(bd, false) != 0) { + if (bd_flushall(bd)) + continue; + BD_LOCK(bd); + if (bd->bd_wanted) { + msleep(&bd->bd_wanted, BD_LOCKPTR(bd), + PRIBIO|PDROP, "bufspace", hz/10); + } else + BD_UNLOCK(bd); } maybe_yield(); - } + } while (bd->bd_bufspace > bd->bd_lobufspace || + bd->bd_freebuffers < bd->bd_hifreebuffers); - /* - * Re-check our limits under the exclusive nblock. - */ - rw_wlock(&nblock); - if (bufspace < bufspacethresh && - numfreebuffers > lofreebuffers) { - bufspace_request = 0; - rw_sleep(&bufspace_request, &nblock, PRIBIO|PDROP, - "-", hz); - } else - rw_wunlock(&nblock); + bufspace_daemon_wait(bd); } } -static struct kproc_desc bufspace_kp = { - "bufspacedaemon", - bufspace_daemon, - &bufspacedaemonproc -}; -SYSINIT(bufspacedaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, - &bufspace_kp); - /* * bufmallocadjust: * @@ -842,7 +828,7 @@ } /* Wake up the buffer daemon if necessary */ -static __inline void +static void bd_wakeup(void) { @@ -1038,19 +1024,12 @@ KASSERT(maxbcachebuf >= MAXBSIZE, ("maxbcachebuf (%d) must be >= MAXBSIZE (%d)\n", maxbcachebuf, MAXBSIZE)); - mtx_init(&bqlocks[QUEUE_DIRTY], "bufq dirty lock", NULL, MTX_DEF); - mtx_init(&bqlocks[QUEUE_EMPTY], "bufq empty lock", NULL, MTX_DEF); - for (i = QUEUE_CLEAN; i < QUEUE_CLEAN + CLEAN_QUEUES; i++) - mtx_init(&bqlocks[i], "bufq clean lock", NULL, MTX_DEF); + bq_init(&bqempty, QUEUE_EMPTY, -1, "bufq empty lock"); + bq_init(&bqdirty, QUEUE_DIRTY, -1, "bufq dirty lock"); mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF); - rw_init(&nblock, "needsbuffer lock"); mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF); mtx_init(&bdirtylock, "dirty buf lock", NULL, MTX_DEF); - /* next, make a null set of free lists */ - for (i = 0; i < BUFFER_QUEUES; i++) - TAILQ_INIT(&bufqueues[i]); - unmapped_buf = (caddr_t)kva_alloc(MAXPHYS); /* finally, initialize each buffer header and stick on empty q */ @@ -1060,15 +1039,14 @@ bp->b_flags = B_INVAL; bp->b_rcred = NOCRED; bp->b_wcred = NOCRED; - bp->b_qindex = QUEUE_EMPTY; + bp->b_qindex = QUEUE_NONE; + bp->b_domain = -1; + bp->b_subqueue = mp_ncpus; bp->b_xflags = 0; bp->b_data = bp->b_kvabase = unmapped_buf; LIST_INIT(&bp->b_dep); BUF_LOCKINIT(bp); - TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist); -#ifdef INVARIANTS - bq_len[QUEUE_EMPTY]++; -#endif + bq_insert(&bqempty, bp, false); } /* @@ -1150,8 +1128,31 @@ * One queue per-256mb up to the max. More queues gives better * concurrency but less accurate LRU. */ - clean_queues = MIN(howmany(maxbufspace, 256*1024*1024), CLEAN_QUEUES); + clean_domains = MIN(howmany(maxbufspace, 256*1024*1024), CLEAN_DOMAINS); + for (i = 0 ; i < clean_domains; i++) { + struct bufdomain *bd; + bd = &bdclean[i]; + bd_init(bd); + bd->bd_freebuffers = nbuf / clean_domains; + bd->bd_hifreebuffers = hifreebuffers / clean_domains; + bd->bd_lofreebuffers = lofreebuffers / clean_domains; + bd->bd_bufspace = 0; + bd->bd_maxbufspace = maxbufspace / clean_domains; + bd->bd_hibufspace = hibufspace / clean_domains; + bd->bd_lobufspace = lobufspace / clean_domains; + bd->bd_bufspacethresh = bufspacethresh / clean_domains; + /* Don't allow more than 2% of bufs in the per-cpu caches. 
*/ + bd->bd_lim = nbuf / clean_domains / 50 / mp_ncpus; + } + getnewbufcalls = counter_u64_alloc(M_WAITOK); + getnewbufrestarts = counter_u64_alloc(M_WAITOK); + mappingrestarts = counter_u64_alloc(M_WAITOK); + numbufallocfails = counter_u64_alloc(M_WAITOK); + notbufdflushes = counter_u64_alloc(M_WAITOK); + buffreekvacnt = counter_u64_alloc(M_WAITOK); + bufdefragcnt = counter_u64_alloc(M_WAITOK); + bufkvaspace = counter_u64_alloc(M_WAITOK); } #ifdef INVARIANTS @@ -1326,58 +1327,92 @@ (vm_offset_t)(bp->b_offset & PAGE_MASK)); } +static struct bufqueue * +bufqueue(struct buf *bp) +{ + + switch (bp->b_qindex) { + case QUEUE_NONE: + /* FALLTHROUGH */ + case QUEUE_SENTINEL: + return (NULL); + case QUEUE_EMPTY: + return (&bqempty); + case QUEUE_DIRTY: + return (&bqdirty); + case QUEUE_CLEAN: + return (&bdclean[bp->b_domain].bd_subq[bp->b_subqueue]); + default: + break; + } + panic("bufqueue(%p): Unhandled type %d\n", bp, bp->b_qindex); +} + /* + * Return the locked bufqueue that bp is a member of. + */ +static struct bufqueue * +bufqueue_acquire(struct buf *bp) +{ + struct bufqueue *bq, *nbq; + + /* + * bp can be pushed from a per-cpu queue to the + * cleanq while we're waiting on the lock. Retry + * if the queues don't match. + */ + bq = bufqueue(bp); + BQ_LOCK(bq); + for (;;) { + nbq = bufqueue(bp); + if (bq == nbq) + break; + BQ_UNLOCK(bq); + BQ_LOCK(nbq); + bq = nbq; + } + return (bq); +} + +/* * binsfree: * - * Insert the buffer into the appropriate free list. + * Insert the buffer into the appropriate free list. Requires a + * locked buffer on entry and buffer is unlocked before return. */ static void binsfree(struct buf *bp, int qindex) { - struct mtx *olock, *nlock; + struct bufdomain *bd; + struct bufqueue *bq; - if (qindex != QUEUE_EMPTY) { - BUF_ASSERT_XLOCKED(bp); - } + KASSERT(qindex == QUEUE_CLEAN || qindex == QUEUE_DIRTY, + ("binsfree: Invalid qindex %d", qindex)); + BUF_ASSERT_XLOCKED(bp); /* - * Stick to the same clean queue for the lifetime of the buf to - * limit locking below. Otherwise pick ont sequentially. - */ - if (qindex == QUEUE_CLEAN) { - if (bqisclean(bp->b_qindex)) - qindex = bp->b_qindex; - else - qindex = bqcleanq(); - } - - /* * Handle delayed bremfree() processing. 
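The loop above simply partitions the existing global tunables evenly across the clean domains, and bd_lim caps each per-CPU queue at roughly 2% of a domain's buffers (that is what the /50 is). A worked example with assumed values, nbuf = 100000, clean_domains = 4 and mp_ncpus = 8 (illustrative arithmetic only):

    bd_freebuffers = 100000 / 4;            /* 25000 buffers per domain      */
    bd_lim         = 100000 / 4 / 50 / 8;   /* 62 buffers per per-CPU queue  */
    /* 8 CPUs x 62 buffers = 496, i.e. about 2% of the domain's 25000 bufs. */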
*/ - nlock = bqlock(qindex); if (bp->b_flags & B_REMFREE) { - olock = bqlock(bp->b_qindex); - mtx_lock(olock); - bremfreel(bp); - if (olock != nlock) { - mtx_unlock(olock); - mtx_lock(nlock); + if (bp->b_qindex == qindex) { + bp->b_flags |= B_REUSE; + bp->b_flags &= ~B_REMFREE; + BUF_UNLOCK(bp); + return; } + bq = bufqueue_acquire(bp); + bq_remove(bq, bp); + BQ_UNLOCK(bq); + } + if (qindex == QUEUE_CLEAN) { + bd = &bdclean[bp->b_domain]; + if (bd->bd_lim != 0) + bq = &bd->bd_subq[PCPU_GET(cpuid)]; + else + bq = bd->bd_cleanq; } else - mtx_lock(nlock); - - if (bp->b_qindex != QUEUE_NONE) - panic("binsfree: free buffer onto another queue???"); - - bp->b_qindex = qindex; - if (bp->b_flags & B_AGE) - TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist); - else - TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist); -#ifdef INVARIANTS - bq_len[bp->b_qindex]++; -#endif - mtx_unlock(nlock); + bq = &bqdirty; + bq_insert(bq, bp, true); } /* @@ -1404,10 +1439,9 @@ if (!LIST_EMPTY(&bp->b_dep)) buf_deallocate(bp); bufkva_free(bp); + atomic_add_int(&bdclean[bp->b_domain].bd_freebuffers, 1); BUF_UNLOCK(bp); uma_zfree(buf_zone, bp); - atomic_add_int(&numfreebuffers, 1); - bufspace_wakeup(); } /* @@ -1424,15 +1458,15 @@ struct buf *bp; int i; - mtx_lock(&bqlocks[QUEUE_EMPTY]); + BQ_LOCK(&bqempty); for (i = 0; i < cnt; i++) { - bp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]); + bp = TAILQ_FIRST(&bqempty.bq_queue); if (bp == NULL) break; - bremfreel(bp); + bq_remove(&bqempty, bp); store[i] = bp; } - mtx_unlock(&bqlocks[QUEUE_EMPTY]); + BQ_UNLOCK(&bqempty); return (i); } @@ -1445,10 +1479,21 @@ static void buf_release(void *arg, void **store, int cnt) { + struct bufqueue *bq; + struct buf *bp; int i; - for (i = 0; i < cnt; i++) - binsfree(store[i], QUEUE_EMPTY); + bq = &bqempty; + BQ_LOCK(bq); + for (i = 0; i < cnt; i++) { + bp = store[i]; + /* Inline bq_insert() to batch locking. */ + TAILQ_INSERT_TAIL(&bq->bq_queue, bp, b_freelist); + bp->b_flags &= ~(B_AGE | B_REUSE); + bq->bq_len++; + bp->b_qindex = bq->bq_index; + } + BQ_UNLOCK(bq); } /* @@ -1457,22 +1502,31 @@ * Allocate an empty buffer header. */ static struct buf * -buf_alloc(void) +buf_alloc(struct bufdomain *bd) { struct buf *bp; + int freebufs; - bp = uma_zalloc(buf_zone, M_NOWAIT); + /* + * We can only run out of bufs in the buf zone if the average buf + * is less than BKVASIZE. In this case the actual wait/block will + * come from buf_reycle() failing to flush one of these small bufs. + */ + bp = NULL; + freebufs = atomic_fetchadd_int(&bd->bd_freebuffers, -1); + if (freebufs > 0) + bp = uma_zalloc(buf_zone, M_NOWAIT); if (bp == NULL) { - bufspace_daemonwakeup(); - atomic_add_int(&numbufallocfails, 1); + atomic_fetchadd_int(&bd->bd_freebuffers, 1); + bufspace_daemon_wakeup(bd); + counter_u64_add(numbufallocfails, 1); return (NULL); } - /* - * Wake-up the bufspace daemon on transition. + * Wake-up the bufspace daemon on transition below threshold. 
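binsfree() above is where the per-CPU caching becomes visible: a clean buffer normally lands on the current CPU's subqueue and only goes straight to the domain's shared clean queue when per-CPU caching is disabled (bd_lim == 0, which the initialization arithmetic can yield on small configurations). The selection pulled out into a hypothetical helper, assuming the patch's struct bufdomain (sketch; the patch open-codes this):

    #include <sys/param.h>
    #include <sys/systm.h>
    #include <sys/pcpu.h>

    /*
     * Pick the insertion queue for a clean buffer in domain 'bd'.
     * bd_subq[0 .. mp_maxid] are the per-CPU queues; bd_cleanq points at
     * the shared queue that buf_recycle() scans.
     */
    static struct bufqueue *
    bufqueue_select(struct bufdomain *bd)
    {

            if (bd->bd_lim != 0)
                    return (&bd->bd_subq[PCPU_GET(cpuid)]);
            return (bd->bd_cleanq);
    }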
*/ - if (atomic_fetchadd_int(&numfreebuffers, -1) == lofreebuffers) - bufspace_daemonwakeup(); + if (freebufs == bd->bd_lofreebuffers) + bufspace_daemon_wakeup(bd); if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) panic("getnewbuf_empty: Locked buf %p on free queue.", bp); @@ -1488,6 +1542,7 @@ KASSERT(bp->b_kvasize == 0, ("bp: %p still has kva\n", bp)); KASSERT(bp->b_bufsize == 0, ("bp: %p still has bufspace\n", bp)); + bp->b_domain = BD_DOMAIN(bd); bp->b_flags = 0; bp->b_ioflags = 0; bp->b_xflags = 0; @@ -1512,22 +1567,26 @@ } /* - * buf_qrecycle: + * buf_recycle: * * Free a buffer from the given bufqueue. kva controls whether the * freed buf must own some kva resources. This is used for * defragmenting. */ static int -buf_qrecycle(int qindex, bool kva) +buf_recycle(struct bufdomain *bd, bool kva) { + struct bufqueue *bq; struct buf *bp, *nbp; if (kva) - atomic_add_int(&bufdefragcnt, 1); + counter_u64_add(bufdefragcnt, 1); nbp = NULL; - mtx_lock(&bqlocks[qindex]); - nbp = TAILQ_FIRST(&bufqueues[qindex]); + bq = bd->bd_cleanq; + BQ_LOCK(bq); + KASSERT(BQ_LOCKPTR(bq) == BD_LOCKPTR(bd), + ("buf_recycle: Locks don't match")); + nbp = TAILQ_FIRST(&bq->bq_queue); /* * Run scan, possibly freeing data and/or kva mappings on the fly @@ -1551,6 +1610,18 @@ continue; /* + * Implement a second chance algorithm for frequently + * accessed buffers. + */ + if ((bp->b_flags & B_REUSE) != 0) { + TAILQ_REMOVE(&bq->bq_queue, bp, b_freelist); + TAILQ_INSERT_TAIL(&bq->bq_queue, bp, b_freelist); + bp->b_flags &= ~B_REUSE; + BUF_UNLOCK(bp); + continue; + } + + /* * Skip buffers with background writes in progress. */ if ((bp->b_vflags & BV_BKGRDINPROG) != 0) { @@ -1558,14 +1629,18 @@ continue; } - KASSERT(bp->b_qindex == qindex, - ("getnewbuf: inconsistent queue %d bp %p", qindex, bp)); + KASSERT(bp->b_qindex == QUEUE_CLEAN, + ("buf_recycle: inconsistent queue %d bp %p", + bp->b_qindex, bp)); + KASSERT(bp->b_domain == BD_DOMAIN(bd), + ("getnewbuf: queue domain %d doesn't match request %d", + bp->b_domain, (int)BD_DOMAIN(bd))); /* * NOTE: nbp is now entirely invalid. We can only restart * the scan from this point on. */ - bremfreel(bp); - mtx_unlock(&bqlocks[qindex]); + bq_remove(bq, bp); + BQ_UNLOCK(bq); /* * Requeue the background write buffer with error and @@ -1573,70 +1648,21 @@ */ if ((bp->b_vflags & BV_BKGRDERR) != 0) { bqrelse(bp); - mtx_lock(&bqlocks[qindex]); - nbp = TAILQ_FIRST(&bufqueues[qindex]); + BQ_LOCK(bq); + nbp = TAILQ_FIRST(&bq->bq_queue); continue; } bp->b_flags |= B_INVAL; brelse(bp); return (0); } - mtx_unlock(&bqlocks[qindex]); + bd->bd_wanted = 1; + BQ_UNLOCK(bq); return (ENOBUFS); } /* - * buf_recycle: - * - * Iterate through all clean queues until we find a buf to recycle or - * exhaust the search. - */ -static int -buf_recycle(bool kva) -{ - int qindex, first_qindex; - - qindex = first_qindex = bqcleanq(); - do { - if (buf_qrecycle(qindex, kva) == 0) - return (0); - if (++qindex == QUEUE_CLEAN + clean_queues) - qindex = QUEUE_CLEAN; - } while (qindex != first_qindex); - - return (ENOBUFS); -} - -/* - * buf_scan: - * - * Scan the clean queues looking for a buffer to recycle. needsbuffer - * is set on failure so that the caller may optionally bufspace_wait() - * in a race-free fashion. - */ -static int -buf_scan(bool defrag) -{ - int error; - - /* - * To avoid heavy synchronization and wakeup races we set - * needsbuffer and re-poll before failing. This ensures that - * no frees can be missed between an unsuccessful poll and - * going to sleep in a synchronized fashion. 
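The B_REUSE test above adds a second-chance (clock-style) policy to buf_recycle(): binsfree() marks a buffer that is re-queued onto the queue it already occupies, and the recycling scan then grants such a buffer one more trip around the list instead of evicting it immediately. The generic shape, on a hypothetical list of entries with a referenced bit and with all of the real code's buffer locking omitted (sketch):

    #include <sys/queue.h>

    struct entry {
            TAILQ_ENTRY(entry) e_link;
            int     e_referenced;
    };
    TAILQ_HEAD(entryq, entry);

    static struct entry *
    second_chance_evict(struct entryq *q)
    {
            struct entry *e;

            while ((e = TAILQ_FIRST(q)) != NULL) {
                    if (e->e_referenced) {
                            /* Recently used: clear the bit, move to tail. */
                            e->e_referenced = 0;
                            TAILQ_REMOVE(q, e, e_link);
                            TAILQ_INSERT_TAIL(q, e, e_link);
                            continue;
                    }
                    /* Not referenced since its last pass: evict it. */
                    TAILQ_REMOVE(q, e, e_link);
                    break;
            }
            return (e);
    }

At most one full pass is made: every referenced entry is cleared as it is skipped, so the scan terminates at the first entry it has already demoted.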
- */ - if ((error = buf_recycle(defrag)) != 0) { - atomic_set_int(&needsbuffer, 1); - bufspace_daemonwakeup(); - error = buf_recycle(defrag); - } - if (error == 0) - atomic_add_int(&getnewbufrestarts, 1); - return (error); -} - -/* * bremfree: * * Mark the buffer for removal from the appropriate free list. @@ -1665,43 +1691,158 @@ void bremfreef(struct buf *bp) { - struct mtx *qlock; + struct bufqueue *bq; - qlock = bqlock(bp->b_qindex); - mtx_lock(qlock); - bremfreel(bp); - mtx_unlock(qlock); + bq = bufqueue_acquire(bp); + bq_remove(bq, bp); + BQ_UNLOCK(bq); } +static void +bq_init(struct bufqueue *bq, int qindex, int subqueue, const char *lockname) +{ + + mtx_init(&bq->bq_lock, lockname, NULL, MTX_DEF); + TAILQ_INIT(&bq->bq_queue); + bq->bq_len = 0; + bq->bq_index = qindex; + bq->bq_subqueue = subqueue; +} + +static void +bd_init(struct bufdomain *bd) +{ + int domain; + int i; + + domain = bd - bdclean; + bd->bd_cleanq = &bd->bd_subq[mp_ncpus]; + bq_init(bd->bd_cleanq, QUEUE_CLEAN, -1, "bufq clean lock"); + for (i = 0; i <= mp_maxid; i++) + bq_init(&bd->bd_subq[i], QUEUE_CLEAN, i, + "bufq clean subqueue lock"); + mtx_init(&bd->bd_run_lock, "bufspace daemon run lock", NULL, MTX_DEF); +} + /* - * bremfreel: + * bq_remove: * * Removes a buffer from the free list, must be called with the * correct qlock held. */ static void -bremfreel(struct buf *bp) +bq_remove(struct bufqueue *bq, struct buf *bp) { - CTR3(KTR_BUF, "bremfreel(%p) vp %p flags %X", + CTR3(KTR_BUF, "bq_remove(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_qindex != QUEUE_NONE, - ("bremfreel: buffer %p not on a queue.", bp)); + ("bq_remove: buffer %p not on a queue.", bp)); + KASSERT(bufqueue(bp) == bq, + ("bq_remove: Remove buffer %p from wrong queue.", bp)); + + BQ_ASSERT_LOCKED(bq); if (bp->b_qindex != QUEUE_EMPTY) { BUF_ASSERT_XLOCKED(bp); } - mtx_assert(bqlock(bp->b_qindex), MA_OWNED); - - TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist); -#ifdef INVARIANTS - KASSERT(bq_len[bp->b_qindex] >= 1, ("queue %d underflow", - bp->b_qindex)); - bq_len[bp->b_qindex]--; -#endif + KASSERT(bq->bq_len >= 1, + ("queue %d underflow", bp->b_qindex)); + TAILQ_REMOVE(&bq->bq_queue, bp, b_freelist); + bq->bq_len--; bp->b_qindex = QUEUE_NONE; - bp->b_flags &= ~B_REMFREE; + bp->b_flags &= ~(B_REMFREE | B_REUSE); } +static void +bd_flush(struct bufdomain *bd, struct bufqueue *bq) +{ + struct buf *bp; + + BQ_ASSERT_LOCKED(bq); + if (bq != bd->bd_cleanq) { + BD_LOCK(bd); + while ((bp = TAILQ_FIRST(&bq->bq_queue)) != NULL) { + TAILQ_REMOVE(&bq->bq_queue, bp, b_freelist); + TAILQ_INSERT_TAIL(&bd->bd_cleanq->bq_queue, bp, + b_freelist); + bp->b_subqueue = mp_ncpus; + } + bd->bd_cleanq->bq_len += bq->bq_len; + bq->bq_len = 0; + } + if (bd->bd_wanted) { + bd->bd_wanted = 0; + wakeup(&bd->bd_wanted); + } + if (bq != bd->bd_cleanq) + BD_UNLOCK(bd); +} + +static int +bd_flushall(struct bufdomain *bd) +{ + struct bufqueue *bq; + int flushed; + int i; + + if (bd->bd_lim == 0) + return (0); + flushed = 0; + for (i = 0; i < mp_maxid; i++) { + bq = &bd->bd_subq[i]; + if (bq->bq_len == 0) + continue; + BQ_LOCK(bq); + bd_flush(bd, bq); + BQ_UNLOCK(bq); + flushed++; + } + + return (flushed); +} + +static void +bq_insert(struct bufqueue *bq, struct buf *bp, bool unlock) +{ + struct bufdomain *bd; + + if (bp->b_qindex != QUEUE_NONE) + panic("bq_insert: free buffer %p onto another queue?", bp); + + bd = &bdclean[bp->b_domain]; + if (bp->b_flags & B_AGE) { + /* Place this buf directly on the real queue. 
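bd_flush() above drains a per-CPU subqueue into the domain's shared clean queue while holding both locks (the subqueue's BQ lock, then the domain's BD lock). It moves buffers one at a time because each buffer's b_subqueue has to be redirected to the shared queue (index mp_ncpus); if that back-pointer did not exist, the whole batch could be spliced in O(1). Illustrative alternative only, not what the patch does, assuming the patch's struct bufqueue and both locks held:

    #include <sys/queue.h>

    static void
    bq_concat(struct bufqueue *dst, struct bufqueue *src)
    {

            /* Splice every element of src onto the tail of dst in O(1). */
            TAILQ_CONCAT(&dst->bq_queue, &src->bq_queue, b_freelist);
            dst->bq_len += src->bq_len;
            src->bq_len = 0;
    }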
*/ + if (bq->bq_index == QUEUE_CLEAN) + bq = bd->bd_cleanq; + BQ_LOCK(bq); + TAILQ_INSERT_HEAD(&bq->bq_queue, bp, b_freelist); + } else { + BQ_LOCK(bq); + TAILQ_INSERT_TAIL(&bq->bq_queue, bp, b_freelist); + } + bp->b_flags &= ~(B_AGE | B_REUSE); + bq->bq_len++; + bp->b_qindex = bq->bq_index; + bp->b_subqueue = bq->bq_subqueue; + + /* + * Unlock before we notify so that we don't wakeup a waiter that + * fails a trylock on the buf and sleeps again. + */ + if (unlock) + BUF_UNLOCK(bp); + + if (bp->b_qindex == QUEUE_CLEAN) { + /* + * Flush the per-cpu queue and notify any waiters. + */ + if (bd->bd_wanted || (bq != bd->bd_cleanq && + bq->bq_len >= bd->bd_lim)) + bd_flush(bd, bq); + } + BQ_UNLOCK(bq); +} + /* * bufkva_free: * @@ -1726,8 +1867,8 @@ return; vmem_free(buffer_arena, (vm_offset_t)bp->b_kvabase, bp->b_kvasize); - atomic_subtract_long(&bufkvaspace, bp->b_kvasize); - atomic_add_int(&buffreekvacnt, 1); + counter_u64_add(bufkvaspace, -bp->b_kvasize); + counter_u64_add(buffreekvacnt, 1); bp->b_data = bp->b_kvabase = unmapped_buf; bp->b_kvasize = 0; } @@ -1759,7 +1900,7 @@ } bp->b_kvabase = (caddr_t)addr; bp->b_kvasize = maxsize; - atomic_add_long(&bufkvaspace, bp->b_kvasize); + counter_u64_add(bufkvaspace, bp->b_kvasize); if ((gbflags & GB_UNMAPPED) != 0) { bp->b_data = unmapped_buf; BUF_CHECK_UNMAPPED(bp); @@ -1779,11 +1920,18 @@ static void bufkva_reclaim(vmem_t *vmem, int flags) { + bool done; + int q; int i; - for (i = 0; i < 5; i++) - if (buf_scan(true) != 0) + done = false; + for (i = 0; i < 5; i++) { + for (q = 0; q < clean_domains; q++) + if (buf_recycle(&bdclean[q], true) != 0) + done = true; + if (done) break; + } return; } @@ -2465,15 +2613,12 @@ else qindex = QUEUE_CLEAN; - binsfree(bp, qindex); - - bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | B_DIRECT); if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY)) panic("brelse: not dirty"); - /* unlock */ - BUF_UNLOCK(bp); - if (qindex == QUEUE_CLEAN) - bufspace_wakeup(); + + bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_RELBUF | B_DIRECT); + /* binsfree unlocks bp. */ + binsfree(bp, qindex); } /* @@ -2527,14 +2672,15 @@ } qindex = QUEUE_CLEAN; } + buf_track(bp, __func__); + /* binsfree unlocks bp. 
*/ binsfree(bp, qindex); + return; out: buf_track(bp, __func__); /* unlock */ BUF_UNLOCK(bp); - if (qindex == QUEUE_CLEAN) - bufspace_wakeup(); } /* @@ -2973,6 +3119,7 @@ static struct buf * getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int maxsize, int gbflags) { + struct bufdomain *bd; struct buf *bp; bool metadata, reserved; @@ -2987,27 +3134,36 @@ metadata = true; else metadata = false; - atomic_add_int(&getnewbufcalls, 1); + if (vp == NULL) + bd = &bdclean[0]; + else + bd = &bdclean[vp->v_bufobj.bo_domain]; + + counter_u64_add(getnewbufcalls, 1); reserved = false; do { if (reserved == false && - bufspace_reserve(maxsize, metadata) != 0) + bufspace_reserve(bd, maxsize, metadata) != 0) { + counter_u64_add(getnewbufrestarts, 1); continue; + } reserved = true; - if ((bp = buf_alloc()) == NULL) + if ((bp = buf_alloc(bd)) == NULL) { + counter_u64_add(getnewbufrestarts, 1); continue; + } if (getnewbuf_kva(bp, gbflags, maxsize) == 0) return (bp); break; - } while(buf_scan(false) == 0); + } while (buf_recycle(bd, false) == 0); if (reserved) - atomic_subtract_long(&bufspace, maxsize); + bufspace_release(bd, maxsize); if (bp != NULL) { bp->b_flags |= B_INVAL; brelse(bp); } - bufspace_wait(vp, gbflags, slpflag, slptimeo); + bufspace_wait(bd, vp, gbflags, slpflag, slptimeo); return (NULL); } @@ -3049,6 +3205,7 @@ buf_daemon() { int lodirty; + int i; /* * This process needs to be suspended prior to shutdown sync. @@ -3057,6 +3214,18 @@ SHUTDOWN_PRI_LAST); /* + * Start the buf clean daemons as children threads. + */ + for (i = 0 ; i < clean_domains; i++) { + int error; + + error = kthread_add((void (*)(void *))bufspace_daemon, + &bdclean[i], curproc, NULL, 0, 0, "bufspacedaemon-%d", i); + if (error) + panic("error %d spawning bufspace daemon", error); + } + + /* * This process is allowed to take the buffer cache to the limit */ curthread->td_pflags |= TDP_NORUNNINGBUF | TDP_BUFNEED; @@ -3134,34 +3303,34 @@ static int flushbufqueues(struct vnode *lvp, int target, int flushdeps) { + struct bufqueue *bq; struct buf *sentinel; struct vnode *vp; struct mount *mp; struct buf *bp; int hasdeps; int flushed; - int queue; int error; bool unlock; flushed = 0; - queue = QUEUE_DIRTY; + bq = &bqdirty; bp = NULL; sentinel = malloc(sizeof(struct buf), M_TEMP, M_WAITOK | M_ZERO); sentinel->b_qindex = QUEUE_SENTINEL; - mtx_lock(&bqlocks[queue]); - TAILQ_INSERT_HEAD(&bufqueues[queue], sentinel, b_freelist); - mtx_unlock(&bqlocks[queue]); + BQ_LOCK(bq); + TAILQ_INSERT_HEAD(&bq->bq_queue, sentinel, b_freelist); + BQ_UNLOCK(bq); while (flushed != target) { maybe_yield(); - mtx_lock(&bqlocks[queue]); + BQ_LOCK(bq); bp = TAILQ_NEXT(sentinel, b_freelist); if (bp != NULL) { - TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist); - TAILQ_INSERT_AFTER(&bufqueues[queue], bp, sentinel, + TAILQ_REMOVE(&bq->bq_queue, sentinel, b_freelist); + TAILQ_INSERT_AFTER(&bq->bq_queue, bp, sentinel, b_freelist); } else { - mtx_unlock(&bqlocks[queue]); + BQ_UNLOCK(bq); break; } /* @@ -3173,11 +3342,11 @@ */ if (bp->b_qindex == QUEUE_SENTINEL || (lvp != NULL && bp->b_vp != lvp)) { - mtx_unlock(&bqlocks[queue]); + BQ_UNLOCK(bq); continue; } error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL); - mtx_unlock(&bqlocks[queue]); + BQ_UNLOCK(bq); if (error != 0) continue; @@ -3237,7 +3406,7 @@ } else { bremfree(bp); bwrite(bp); - notbufdflushes++; + counter_u64_add(notbufdflushes, 1); } vn_finished_write(mp); if (unlock) @@ -3257,9 +3426,9 @@ vn_finished_write(mp); BUF_UNLOCK(bp); } - mtx_lock(&bqlocks[queue]); - 
TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist); - mtx_unlock(&bqlocks[queue]); + BQ_LOCK(bq); + TAILQ_REMOVE(&bq->bq_queue, sentinel, b_freelist); + BQ_UNLOCK(bq); free(sentinel, M_TEMP); return (flushed); } @@ -3480,8 +3649,8 @@ */ panic("GB_NOWAIT_BD and GB_UNMAPPED %p", bp); } - atomic_add_int(&mappingrestarts, 1); - bufspace_wait(bp->b_vp, gbflags, 0, 0); + counter_u64_add(mappingrestarts, 1); + bufspace_wait(&bdclean[bp->b_domain], bp->b_vp, gbflags, 0, 0); } has_addr: if (need_mapping) { @@ -3679,7 +3848,8 @@ */ if (flags & GB_NOCREAT) return NULL; - if (numfreebuffers == 0 && TD_IS_IDLETHREAD(curthread)) + if (bdclean[bo->bo_domain].bd_freebuffers == 0 && + TD_IS_IDLETHREAD(curthread)) return NULL; bsize = vn_isdisk(vp, NULL) ? DEV_BSIZE : bo->bo_bsize; @@ -3735,8 +3905,8 @@ if (gbincore(bo, blkno)) { BO_UNLOCK(bo); bp->b_flags |= B_INVAL; + bufspace_release(&bdclean[bp->b_domain], maxsize); brelse(bp); - bufspace_release(maxsize); goto loop; } @@ -3770,7 +3940,7 @@ } allocbuf(bp, size); - bufspace_release(maxsize); + bufspace_release(&bdclean[bp->b_domain], maxsize); bp->b_flags &= ~B_DONE; } CTR4(KTR_BUF, "getblk(%p, %ld, %d) = %p", vp, (long)blkno, size, bp); @@ -3799,7 +3969,7 @@ return (NULL); } allocbuf(bp, size); - bufspace_release(maxsize); + bufspace_release(&bdclean[bp->b_domain], maxsize); bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */ BUF_ASSERT_HELD(bp); return (bp); @@ -4659,7 +4829,23 @@ KASSERT(i == 0, ("VOP_STRATEGY failed bp=%p vp=%p", bp, bp->b_vp)); } +/* + * Initialize a struct bufobj before use. Memory is assumed zero filled. + */ void +bufobj_init(struct bufobj *bo, void *private) +{ + static volatile int bufobj_cleanq; + + bo->bo_domain = + atomic_fetchadd_int(&bufobj_cleanq, 1) % clean_domains; + rw_init(BO_LOCKPTR(bo), "bufobj interlock"); + bo->bo_private = private; + TAILQ_INIT(&bo->bo_clean.bv_hd); + TAILQ_INIT(&bo->bo_dirty.bv_hd); +} + +void bufobj_wrefl(struct bufobj *bo) { @@ -4989,6 +5175,37 @@ #endif db_printf(" "); BUF_LOCKPRINTINFO(bp); +} + +DB_SHOW_COMMAND(bufqueues, bufqueues) +{ + struct bufdomain *bd; + int i, j; + + db_printf("bqempty: %d\n", bqempty.bq_len); + db_printf("bqdirty: %d\n", bqdirty.bq_len); + + for (i = 0; i < clean_domains; i++) { + bd = &bdclean[i]; + db_printf("Buf domain %d\n", i); + db_printf("\tfreebufs\t%d\n", bd->bd_freebuffers); + db_printf("\tlofreebufs\t%d\n", bd->bd_lofreebuffers); + db_printf("\thifreebufs\t%d\n", bd->bd_hifreebuffers); + db_printf("\n"); + db_printf("\tbufspace\t%ld\n", bd->bd_bufspace); + db_printf("\tmaxbufspace\t%ld\n", bd->bd_maxbufspace); + db_printf("\thibufspace\t%ld\n", bd->bd_hibufspace); + db_printf("\tlobufspace\t%ld\n", bd->bd_lobufspace); + db_printf("\tbufspacethresh\t%ld\n", bd->bd_bufspacethresh); + db_printf("\n"); + db_printf("\tcleanq count\t%d\n", bd->bd_cleanq->bq_len); + db_printf("\twakeup\t\t%d\n", bd->bd_wanted); + db_printf("\tlim\t\t%d\n", bd->bd_lim); + db_printf("\tCPU "); + for (j = 0; j < mp_maxid + 1; j++) + db_printf("%d, ", bd->bd_subq[j].bq_len); + db_printf("\n"); + } } DB_SHOW_COMMAND(lockedbufs, lockedbufs) Index: head/sys/kern/vfs_subr.c =================================================================== --- head/sys/kern/vfs_subr.c +++ head/sys/kern/vfs_subr.c @@ -362,7 +362,6 @@ vnode_init(void *mem, int size, int flags) { struct vnode *vp; - struct bufobj *bo; vp = mem; bzero(vp, size); @@ -379,11 +378,7 @@ /* * Initialize bufobj. 
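flushbufqueues(), whose conversion finishes above, keeps its place in the dirty queue across lock drops by threading a sentinel buf (b_qindex == QUEUE_SENTINEL) through the list: the sentinel is re-inserted after each buffer visited, so the walk resumes from it once the queue lock is retaken. The technique in isolation, using the patch's struct bufqueue, a hypothetical visit() callback, and with buffer locking and error handling omitted (sketch):

    #include <sys/param.h>
    #include <sys/systm.h>
    #include <sys/malloc.h>
    #include <sys/buf.h>

    static void
    scan_with_sentinel(struct bufqueue *bq, void (*visit)(struct buf *))
    {
            struct buf *sentinel, *bp;

            sentinel = malloc(sizeof(*sentinel), M_TEMP, M_WAITOK | M_ZERO);
            sentinel->b_qindex = QUEUE_SENTINEL;
            BQ_LOCK(bq);
            TAILQ_INSERT_HEAD(&bq->bq_queue, sentinel, b_freelist);
            while ((bp = TAILQ_NEXT(sentinel, b_freelist)) != NULL) {
                    /* Advance the sentinel past bp before dropping the lock. */
                    TAILQ_REMOVE(&bq->bq_queue, sentinel, b_freelist);
                    TAILQ_INSERT_AFTER(&bq->bq_queue, bp, sentinel, b_freelist);
                    if (bp->b_qindex == QUEUE_SENTINEL)
                            continue;       /* another scan's marker */
                    BQ_UNLOCK(bq);
                    visit(bp);              /* may sleep; queue may change */
                    BQ_LOCK(bq);
            }
            TAILQ_REMOVE(&bq->bq_queue, sentinel, b_freelist);
            BQ_UNLOCK(bq);
            free(sentinel, M_TEMP);
    }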
 	 */
-	bo = &vp->v_bufobj;
-	rw_init(BO_LOCKPTR(bo), "bufobj interlock");
-	bo->bo_private = vp;
-	TAILQ_INIT(&bo->bo_clean.bv_hd);
-	TAILQ_INIT(&bo->bo_dirty.bv_hd);
+	bufobj_init(&vp->v_bufobj, vp);
 	/*
 	 * Initialize namecache.
 	 */
Index: head/sys/sys/buf.h
===================================================================
--- head/sys/sys/buf.h
+++ head/sys/sys/buf.h
@@ -112,7 +112,9 @@
 	off_t	b_offset;		/* Offset into file. */
 	TAILQ_ENTRY(buf) b_bobufs;	/* (V) Buffer's associated vnode. */
 	uint32_t	b_vflags;	/* (V) BV_* flags */
-	unsigned short b_qindex;	/* (Q) buffer queue index */
+	uint8_t	b_qindex;		/* (Q) buffer queue index */
+	uint8_t	b_domain;		/* (Q) buf domain this resides in */
+	uint16_t b_subqueue;		/* (Q) per-cpu q if any */
 	uint32_t	b_flags;	/* B_* flags. */
 	b_xflags_t b_xflags;		/* extra flags */
 	struct lock b_lock;		/* Buffer lock */
@@ -217,7 +219,7 @@
 #define	B_DONE		0x00000200	/* I/O completed. */
 #define	B_EINTR		0x00000400	/* I/O was interrupted */
 #define	B_NOREUSE	0x00000800	/* Contents not reused once released. */
-#define	B_00001000	0x00001000	/* Available flag. */
+#define	B_REUSE		0x00001000	/* Contents reused, second chance. */
 #define	B_INVAL		0x00002000	/* Does not contain valid info. */
 #define	B_BARRIER	0x00004000	/* Write this and all preceding first. */
 #define	B_NOCACHE	0x00008000	/* Do not cache block after use. */
@@ -241,7 +243,7 @@
 #define	PRINT_BUF_FLAGS "\20\40remfree\37cluster\36vmio\35ram\34managed" \
 	"\33paging\32infreecnt\31nocopy\30b23\27relbuf\26b21\25b20" \
 	"\24b19\23b18\22clusterok\21malloc\20nocache\17b14\16inval" \
-	"\15b12\14noreuse\13eintr\12done\11b8\10delwri" \
+	"\15reuse\14noreuse\13eintr\12done\11b8\10delwri" \
 	"\7validsuspwrt\6cache\5deferred\4direct\3async\2needcommit\1age"

 /*
Index: head/sys/sys/bufobj.h
===================================================================
--- head/sys/sys/bufobj.h
+++ head/sys/sys/bufobj.h
@@ -106,6 +106,7 @@
 	struct bufv	bo_dirty;	/* i Dirty buffers */
 	long		bo_numoutput;	/* i Writes in progress */
 	u_int		bo_flag;	/* i Flags */
+	int		bo_domain;	/* - Clean queue affinity */
 	int		bo_bsize;	/* - Block size for i/o */
 };

@@ -126,6 +127,7 @@
 #define	ASSERT_BO_LOCKED(bo)	rw_assert(BO_LOCKPTR((bo)), RA_LOCKED)
 #define	ASSERT_BO_UNLOCKED(bo)	rw_assert(BO_LOCKPTR((bo)), RA_UNLOCKED)
+void	bufobj_init(struct bufobj *bo, void *private);
 void	bufobj_wdrop(struct bufobj *bo);
 void	bufobj_wref(struct bufobj *bo);
 void	bufobj_wrefl(struct bufobj *bo);
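The buf.h hunk above packs the queue bookkeeping into the same 32 bits that the old unsigned short b_qindex plus padding occupied, which is also why QUEUE_SENTINEL shrank from 1024 to 4 earlier in the patch (it now has to fit in a uint8_t), and B_REUSE simply claims the flag bit that B_00001000 had reserved. A stand-alone check of the packing (illustrative, not part of the patch):

    #include <stdint.h>

    struct qfields {
            uint8_t         b_qindex;       /* buffer queue index */
            uint8_t         b_domain;       /* buf domain this resides in */
            uint16_t        b_subqueue;     /* per-cpu queue, if any */
    };

    _Static_assert(sizeof(struct qfields) == 4,
        "queue fields fit in the former unsigned short plus padding");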