Index: head/sys/kern/vfs_bio.c
===================================================================
--- head/sys/kern/vfs_bio.c
+++ head/sys/kern/vfs_bio.c
@@ -50,6 +50,7 @@
 #include
 #include
 #include
+#include <sys/bitset.h>
 #include
 #include
 #include
@@ -100,6 +101,52 @@
 	.bop_bdflush	=	bufbdflush,
 };
 
+struct bufqueue {
+	struct mtx_padalign	bq_lock;
+	TAILQ_HEAD(, buf)	bq_queue;
+	uint8_t			bq_index;
+	uint16_t		bq_subqueue;
+	int			bq_len;
+} __aligned(CACHE_LINE_SIZE);
+
+#define	BQ_LOCKPTR(bq)		(&(bq)->bq_lock)
+#define	BQ_LOCK(bq)		mtx_lock(BQ_LOCKPTR((bq)))
+#define	BQ_UNLOCK(bq)		mtx_unlock(BQ_LOCKPTR((bq)))
+#define	BQ_ASSERT_LOCKED(bq)	mtx_assert(BQ_LOCKPTR((bq)), MA_OWNED)
+
+struct bufdomain {
+	struct bufqueue	bd_subq[MAXCPU + 1]; /* Per-cpu sub queues + global */
+	struct bufqueue bd_dirtyq;
+	struct bufqueue	*bd_cleanq;
+	struct mtx_padalign bd_run_lock;
+	/* Constants */
+	long		bd_maxbufspace;
+	long		bd_hibufspace;
+	long		bd_lobufspace;
+	long		bd_bufspacethresh;
+	int		bd_hifreebuffers;
+	int		bd_lofreebuffers;
+	int		bd_hidirtybuffers;
+	int		bd_lodirtybuffers;
+	int		bd_dirtybufthresh;
+	int		bd_lim;
+	/* atomics */
+	int		bd_wanted;
+	int __aligned(CACHE_LINE_SIZE)	bd_numdirtybuffers;
+	int __aligned(CACHE_LINE_SIZE)	bd_running;
+	long __aligned(CACHE_LINE_SIZE) bd_bufspace;
+	int __aligned(CACHE_LINE_SIZE)	bd_freebuffers;
+} __aligned(CACHE_LINE_SIZE);
+
+#define	BD_LOCKPTR(bd)		(&(bd)->bd_cleanq->bq_lock)
+#define	BD_LOCK(bd)		mtx_lock(BD_LOCKPTR((bd)))
+#define	BD_UNLOCK(bd)		mtx_unlock(BD_LOCKPTR((bd)))
+#define	BD_ASSERT_LOCKED(bd)	mtx_assert(BD_LOCKPTR((bd)), MA_OWNED)
+#define	BD_RUN_LOCKPTR(bd)	(&(bd)->bd_run_lock)
+#define	BD_RUN_LOCK(bd)		mtx_lock(BD_RUN_LOCKPTR((bd)))
+#define	BD_RUN_UNLOCK(bd)	mtx_unlock(BD_RUN_LOCKPTR((bd)))
+#define	BD_DOMAIN(bd)		(bd - bdomain)
+
 static struct buf *buf;		/* buffer header pool */
 extern struct buf *swbuf;	/* Swap buffer header pool. */
 caddr_t unmapped_buf;
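Both the new bufqueue and the hot counters in bufdomain are padded out to CACHE_LINE_SIZE so that fields written from different CPUs never share a cache line. A minimal userspace sketch of the same layout trick, with hypothetical struct names (the kernel's __aligned() macro is spelled as the compiler attribute here):

    #include <stdio.h>

    #define CACHE_LINE_SIZE 64	/* assumption: common 64-byte line */

    /* Two counters that different CPUs update independently. */
    struct shared_plain {
    	long a;
    	long b;		/* same cache line as 'a': false sharing */
    };

    struct shared_padded {
    	long a;
    	long __attribute__((aligned(CACHE_LINE_SIZE))) b; /* own line */
    } __attribute__((aligned(CACHE_LINE_SIZE)));

    int
    main(void)
    {
    	printf("plain: %zu bytes, padded: %zu bytes\n",
    	    sizeof(struct shared_plain), sizeof(struct shared_padded));
    	return (0);
    }

On a 64-byte line the padded variant occupies two full lines, which is the point: each independently updated field gets a line to itself, so stores from one CPU do not invalidate the other CPU's cached copy.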
@@ -123,8 +170,8 @@
 		    daddr_t lblkno, daddr_t blkno);
 static void breada(struct vnode *, daddr_t *, int *, int, struct ucred *,
 		int, void (*)(struct buf *));
-static int buf_flush(struct vnode *vp, int);
-static int flushbufqueues(struct vnode *, int, int);
+static int buf_flush(struct vnode *vp, struct bufdomain *, int);
+static int flushbufqueues(struct vnode *, struct bufdomain *, int, int);
 static void buf_daemon(void);
 static __inline void bd_wakeup(void);
 static int sysctl_runningspace(SYSCTL_HANDLER_ARGS);
@@ -133,6 +180,16 @@
 static int buf_import(void *, void **, int, int, int);
 static void buf_release(void *, void **, int);
 static void maxbcachebuf_adjust(void);
+static inline struct bufdomain *bufdomain(struct buf *);
+static void bq_remove(struct bufqueue *bq, struct buf *bp);
+static void bq_insert(struct bufqueue *bq, struct buf *bp, bool unlock);
+static int buf_recycle(struct bufdomain *, bool kva);
+static void bq_init(struct bufqueue *bq, int qindex, int cpu,
+	    const char *lockname);
+static void bd_init(struct bufdomain *bd);
+static int bd_flushall(struct bufdomain *bd);
+static int sysctl_bufdomain_long(SYSCTL_HANDLER_ARGS);
+static int sysctl_bufdomain_int(SYSCTL_HANDLER_ARGS);
 static int sysctl_bufspace(SYSCTL_HANDLER_ARGS);
 
 int vmiodirenable = TRUE;
@@ -147,7 +204,9 @@
 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, bufkvaspace, CTLFLAG_RD, &bufkvaspace,
     "Kernel virtual memory used for buffers");
 static long maxbufspace;
-SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RW, &maxbufspace, 0,
+SYSCTL_PROC(_vfs, OID_AUTO, maxbufspace,
+    CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &maxbufspace,
+    __offsetof(struct bufdomain, bd_maxbufspace), sysctl_bufdomain_long, "L",
     "Maximum allowed value of bufspace (including metadata)");
 static long bufmallocspace;
 SYSCTL_LONG(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0,
@@ -156,14 +215,20 @@
 SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW,
     &maxbufmallocspace, 0, "Maximum amount of malloced memory for buffers");
 static long lobufspace;
-SYSCTL_LONG(_vfs, OID_AUTO, lobufspace, CTLFLAG_RW, &lobufspace, 0,
+SYSCTL_PROC(_vfs, OID_AUTO, lobufspace,
+    CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &lobufspace,
+    __offsetof(struct bufdomain, bd_lobufspace), sysctl_bufdomain_long, "L",
     "Minimum amount of buffers we want to have");
 long hibufspace;
-SYSCTL_LONG(_vfs, OID_AUTO, hibufspace, CTLFLAG_RW, &hibufspace, 0,
+SYSCTL_PROC(_vfs, OID_AUTO, hibufspace,
+    CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &hibufspace,
+    __offsetof(struct bufdomain, bd_hibufspace), sysctl_bufdomain_long, "L",
     "Maximum allowed value of bufspace (excluding metadata)");
 long bufspacethresh;
-SYSCTL_LONG(_vfs, OID_AUTO, bufspacethresh, CTLFLAG_RW, &bufspacethresh,
-    0, "Bufspace consumed before waking the daemon to free some");
+SYSCTL_PROC(_vfs, OID_AUTO, bufspacethresh,
+    CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &bufspacethresh,
+    __offsetof(struct bufdomain, bd_bufspacethresh), sysctl_bufdomain_long, "L",
+    "Bufspace consumed before waking the daemon to free some");
 static counter_u64_t buffreekvacnt;
 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt,
     "Number of times we have freed the KVA space from some buffer");
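Each tunable converted above keeps the legacy global as the value userland reads, while __offsetof(struct bufdomain, bd_...) travels in arg2 so one handler can push an even share of any newly written value into the matching field of every domain. A self-contained model of that fan-out, with hypothetical names:

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    #define NDOMAINS 4

    struct domain {
    	long maxspace;
    	long lospace;
    };

    static struct domain domains[NDOMAINS];

    /*
     * Model of sysctl_bufdomain_long(): 'offset' selects which field of
     * every domain receives an equal share of the new global value.
     */
    static void
    set_global_long(long *global, size_t offset, long value)
    {
    	int i;

    	*global = value;
    	for (i = 0; i < NDOMAINS; i++)
    		*(long *)((uintptr_t)&domains[i] + offset) =
    		    value / NDOMAINS;
    }

    int
    main(void)
    {
    	long maxbufspace;

    	set_global_long(&maxbufspace, offsetof(struct domain, maxspace), 1000);
    	printf("global %ld, per-domain %ld\n", maxbufspace,
    	    domains[0].maxspace);
    	return (0);
    }

A write of 1000 through the model leaves the global at 1000 and each of the four domains at 250, mirroring the value / buf_domains split performed by the kernel handlers.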
@@ -190,26 +255,37 @@
 static int recursiveflushes;
 SYSCTL_INT(_vfs, OID_AUTO, recursiveflushes, CTLFLAG_RW, &recursiveflushes,
     0, "Number of flushes skipped due to being recursive");
-static int numdirtybuffers;
-SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD, &numdirtybuffers, 0,
+static int sysctl_numdirtybuffers(SYSCTL_HANDLER_ARGS);
+SYSCTL_PROC(_vfs, OID_AUTO, numdirtybuffers,
+    CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RD, NULL, 0, sysctl_numdirtybuffers, "I",
     "Number of buffers that are dirty (has unwritten changes) at the moment");
 static int lodirtybuffers;
-SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW, &lodirtybuffers, 0,
+SYSCTL_PROC(_vfs, OID_AUTO, lodirtybuffers,
+    CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &lodirtybuffers,
+    __offsetof(struct bufdomain, bd_lodirtybuffers), sysctl_bufdomain_int, "I",
     "How many buffers we want to have free before bufdaemon can sleep");
 static int hidirtybuffers;
-SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW, &hidirtybuffers, 0,
+SYSCTL_PROC(_vfs, OID_AUTO, hidirtybuffers,
+    CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &hidirtybuffers,
+    __offsetof(struct bufdomain, bd_hidirtybuffers), sysctl_bufdomain_int, "I",
     "When the number of dirty buffers is considered severe");
 int dirtybufthresh;
-SYSCTL_INT(_vfs, OID_AUTO, dirtybufthresh, CTLFLAG_RW, &dirtybufthresh,
-    0, "Number of bdwrite to bawrite conversions to clear dirty buffers");
+SYSCTL_PROC(_vfs, OID_AUTO, dirtybufthresh,
+    CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &dirtybufthresh,
+    __offsetof(struct bufdomain, bd_dirtybufthresh), sysctl_bufdomain_int, "I",
+    "Number of bdwrite to bawrite conversions to clear dirty buffers");
 static int numfreebuffers;
 SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0,
     "Number of free buffers");
 static int lofreebuffers;
-SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, &lofreebuffers, 0,
+SYSCTL_PROC(_vfs, OID_AUTO, lofreebuffers,
+    CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &lofreebuffers,
+    __offsetof(struct bufdomain, bd_lofreebuffers), sysctl_bufdomain_int, "I",
     "Target number of free buffers");
 static int hifreebuffers;
-SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0,
+SYSCTL_PROC(_vfs, OID_AUTO, hifreebuffers,
+    CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &hifreebuffers,
+    __offsetof(struct bufdomain, bd_hifreebuffers), sysctl_bufdomain_int, "I",
    "Threshold for clean buffer recycling");
 static counter_u64_t getnewbufcalls;
 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RD,
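vfs.numdirtybuffers likewise becomes a computed, read-only OID that sums bd_numdirtybuffers across the domains on every read. From userland it still reads as a single int; a sketch, assuming a FreeBSD kernel exposing this OID:

    #include <sys/types.h>
    #include <sys/sysctl.h>
    #include <stdio.h>

    int
    main(void)
    {
    	int dirty;
    	size_t len = sizeof(dirty);

    	/* The OID still reads as one int; the kernel sums the domains. */
    	if (sysctlbyname("vfs.numdirtybuffers", &dirty, &len, NULL, 0) == -1) {
    		perror("sysctlbyname");
    		return (1);
    	}
    	printf("dirty buffers: %d\n", dirty);
    	return (0);
    }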
@@ -294,66 +370,19 @@
 #define QUEUE_CLEAN	3	/* non-B_DELWRI buffers */
 #define QUEUE_SENTINEL	4	/* not an queue index, but mark for sentinel */
 
-struct bufqueue {
-	struct mtx_padalign	bq_lock;
-	TAILQ_HEAD(, buf)	bq_queue;
-	uint8_t			bq_index;
-	uint16_t		bq_subqueue;
-	int			bq_len;
-} __aligned(CACHE_LINE_SIZE);
+/* Maximum number of buffer domains. */
+#define	BUF_DOMAINS	8
 
-#define	BQ_LOCKPTR(bq)		(&(bq)->bq_lock)
-#define	BQ_LOCK(bq)		mtx_lock(BQ_LOCKPTR((bq)))
-#define	BQ_UNLOCK(bq)		mtx_unlock(BQ_LOCKPTR((bq)))
-#define	BQ_ASSERT_LOCKED(bq)	mtx_assert(BQ_LOCKPTR((bq)), MA_OWNED)
+struct bufdomainset bdlodirty;		/* Domains > lodirty */
+struct bufdomainset bdhidirty;		/* Domains > hidirty */
 
-struct bufqueue __exclusive_cache_line bqempty;
-struct bufqueue __exclusive_cache_line bqdirty;
-
-struct bufdomain {
-	struct bufqueue	bd_subq[MAXCPU + 1]; /* Per-cpu sub queues + global */
-	struct bufqueue	*bd_cleanq;
-	struct mtx_padalign bd_run_lock;
-	/* Constants */
-	long		bd_maxbufspace;
-	long		bd_hibufspace;
-	long		bd_lobufspace;
-	long		bd_bufspacethresh;
-	int		bd_hifreebuffers;
-	int		bd_lofreebuffers;
-	int		bd_lim;
-	/* atomics */
-	int		bd_wanted;
-	int __aligned(CACHE_LINE_SIZE)	bd_running;
-	long __aligned(CACHE_LINE_SIZE) bd_bufspace;
-	int __aligned(CACHE_LINE_SIZE)	bd_freebuffers;
-} __aligned(CACHE_LINE_SIZE);
-
-#define	BD_LOCKPTR(bd)		(&(bd)->bd_cleanq->bq_lock)
-#define	BD_LOCK(bd)		mtx_lock(BD_LOCKPTR((bd)))
-#define	BD_UNLOCK(bd)		mtx_unlock(BD_LOCKPTR((bd)))
-#define	BD_ASSERT_LOCKED(bd)	mtx_assert(BD_LOCKPTR((bd)), MA_OWNED)
-#define	BD_RUN_LOCKPTR(bd)	(&(bd)->bd_run_lock)
-#define	BD_RUN_LOCK(bd)		mtx_lock(BD_RUN_LOCKPTR((bd)))
-#define	BD_RUN_UNLOCK(bd)	mtx_unlock(BD_RUN_LOCKPTR((bd)))
-#define	BD_DOMAIN(bd)		(bd - bdclean)
-
-/* Maximum number of clean buffer domains. */
-#define	CLEAN_DOMAINS	8
-
 /* Configured number of clean queues. */
-static int __read_mostly clean_domains;
+static int __read_mostly buf_domains;
 
-struct bufdomain __exclusive_cache_line bdclean[CLEAN_DOMAINS];
+BITSET_DEFINE(bufdomainset, BUF_DOMAINS);
+struct bufdomain __exclusive_cache_line bdomain[BUF_DOMAINS];
+struct bufqueue __exclusive_cache_line bqempty;
 
-static void bq_remove(struct bufqueue *bq, struct buf *bp);
-static void bq_insert(struct bufqueue *bq, struct buf *bp, bool unlock);
-static int buf_recycle(struct bufdomain *, bool kva);
-static void bq_init(struct bufqueue *bq, int qindex, int cpu,
-    const char *lockname);
-static void bd_init(struct bufdomain *bd);
-static int bd_flushall(struct bufdomain *bd);
-
 /*
  * per-cpu empty buffer cache.
  */
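bdlodirty and bdhidirty track which domains currently exceed their low and high dirty watermarks, so a global question such as "is any domain severely dirty?" becomes a test for an empty set rather than a sum over all domains. A sketch that mimics the sys/bitset.h calls with a plain bitmask, hypothetical names throughout:

    #include <stdbool.h>
    #include <stdio.h>

    #define NDOMAINS 8

    static unsigned hidirty_mask;	/* bit i set: domain i above hidirty */

    static void
    domain_set_hidirty(int dom, bool above)
    {
    	if (above)
    		hidirty_mask |= 1u << dom;	/* BIT_SET() */
    	else
    		hidirty_mask &= ~(1u << dom);	/* BIT_CLR() */
    }

    /* Model of buf_dirty_count_severe(): any domain above its high mark? */
    static bool
    dirty_severe(void)
    {
    	return (hidirty_mask != 0);		/* !BIT_EMPTY() */
    }

    int
    main(void)
    {
    	domain_set_hidirty(3, true);
    	printf("severe: %d\n", dirty_severe());
    	domain_set_hidirty(3, false);
    	printf("severe: %d\n", dirty_severe());
    	return (0);
    }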
@@ -393,6 +422,44 @@
 	return (error);
 }
 
+static int
+sysctl_bufdomain_int(SYSCTL_HANDLER_ARGS)
+{
+	int error;
+	int value;
+	int i;
+
+	value = *(int *)arg1;
+	error = sysctl_handle_int(oidp, &value, 0, req);
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+	*(int *)arg1 = value;
+	for (i = 0; i < buf_domains; i++)
+		*(int *)(((uintptr_t)&bdomain[i]) + arg2) =
+		    value / buf_domains;
+
+	return (error);
+}
+
+static int
+sysctl_bufdomain_long(SYSCTL_HANDLER_ARGS)
+{
+	long value;
+	int error;
+	int i;
+
+	value = *(long *)arg1;
+	error = sysctl_handle_long(oidp, &value, 0, req);
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+	*(long *)arg1 = value;
+	for (i = 0; i < buf_domains; i++)
+		*(long *)(((uintptr_t)&bdomain[i]) + arg2) =
+		    value / buf_domains;
+
+	return (error);
+}
+
 #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
     defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
 static int
@@ -403,8 +470,8 @@
 	int i;
 
 	lvalue = 0;
-	for (i = 0; i < clean_domains; i++)
-		lvalue += bdclean[i].bd_bufspace;
+	for (i = 0; i < buf_domains; i++)
+		lvalue += bdomain[i].bd_bufspace;
 	if (sizeof(int) == sizeof(long) || req->oldlen >= sizeof(long))
 		return (sysctl_handle_long(oidp, &lvalue, 0, req));
 	if (lvalue > INT_MAX)
@@ -421,12 +488,24 @@
 	int i;
 
 	lvalue = 0;
-	for (i = 0; i < clean_domains; i++)
-		lvalue += bdclean[i].bd_bufspace;
+	for (i = 0; i < buf_domains; i++)
+		lvalue += bdomain[i].bd_bufspace;
 	return (sysctl_handle_long(oidp, &lvalue, 0, req));
 }
 #endif
 
+static int
+sysctl_numdirtybuffers(SYSCTL_HANDLER_ARGS)
+{
+	int value;
+	int i;
+
+	value = 0;
+	for (i = 0; i < buf_domains; i++)
+		value += bdomain[i].bd_numdirtybuffers;
+	return (sysctl_handle_int(oidp, &value, 0, req));
+}
+
 /*
  * bdirtywakeup:
 *
@@ -444,18 +523,59 @@
 }
 
 /*
+ * bd_clear:
+ *
+ *	Clear a domain from the appropriate bitsets when dirtybuffers
+ *	is decremented.
+ */
+static void
+bd_clear(struct bufdomain *bd)
+{
+
+	mtx_lock(&bdirtylock);
+	if (bd->bd_numdirtybuffers <= bd->bd_lodirtybuffers)
+		BIT_CLR(BUF_DOMAINS, BD_DOMAIN(bd), &bdlodirty);
+	if (bd->bd_numdirtybuffers <= bd->bd_hidirtybuffers)
+		BIT_CLR(BUF_DOMAINS, BD_DOMAIN(bd), &bdhidirty);
+	mtx_unlock(&bdirtylock);
+}
+
+/*
+ * bd_set:
+ *
+ *	Set a domain in the appropriate bitsets when dirtybuffers
+ *	is incremented.
+ */
+static void
+bd_set(struct bufdomain *bd)
+{
+
+	mtx_lock(&bdirtylock);
+	if (bd->bd_numdirtybuffers > bd->bd_lodirtybuffers)
+		BIT_SET(BUF_DOMAINS, BD_DOMAIN(bd), &bdlodirty);
+	if (bd->bd_numdirtybuffers > bd->bd_hidirtybuffers)
+		BIT_SET(BUF_DOMAINS, BD_DOMAIN(bd), &bdhidirty);
+	mtx_unlock(&bdirtylock);
+}
+
+/*
  * bdirtysub:
  *
  *	Decrement the numdirtybuffers count by one and wakeup any
  *	threads blocked in bwillwrite().
  */
 static void
-bdirtysub(void)
+bdirtysub(struct buf *bp)
 {
+	struct bufdomain *bd;
+	int num;
 
-	if (atomic_fetchadd_int(&numdirtybuffers, -1) ==
-	    (lodirtybuffers + hidirtybuffers) / 2)
+	bd = bufdomain(bp);
+	num = atomic_fetchadd_int(&bd->bd_numdirtybuffers, -1);
+	if (num == (bd->bd_lodirtybuffers + bd->bd_hidirtybuffers) / 2)
 		bdirtywakeup();
+	if (num == bd->bd_lodirtybuffers || num == bd->bd_hidirtybuffers)
+		bd_clear(bd);
 }
 
 /*
@@ -465,16 +585,21 @@
  *	daemon if needed.
  */
 static void
-bdirtyadd(void)
+bdirtyadd(struct buf *bp)
 {
+	struct bufdomain *bd;
+	int num;
 
 	/*
 	 * Only do the wakeup once as we cross the boundary.  The
 	 * buf daemon will keep running until the condition clears.
 	 */
-	if (atomic_fetchadd_int(&numdirtybuffers, 1) ==
-	    (lodirtybuffers + hidirtybuffers) / 2)
+	bd = bufdomain(bp);
+	num = atomic_fetchadd_int(&bd->bd_numdirtybuffers, 1);
+	if (num == (bd->bd_lodirtybuffers + bd->bd_hidirtybuffers) / 2)
 		bd_wakeup();
+	if (num == bd->bd_lodirtybuffers || num == bd->bd_hidirtybuffers)
+		bd_set(bd);
 }
 
 /*
@@ -539,11 +664,11 @@
 	KASSERT((bp->b_flags & B_MALLOC) == 0,
 	    ("bufspace_adjust: malloc buf %p", bp));
 
-	bd = &bdclean[bp->b_domain];
+	bd = bufdomain(bp);
 	diff = bufsize - bp->b_bufsize;
 	if (diff < 0) {
 		atomic_subtract_long(&bd->bd_bufspace, -diff);
-	} else {
+	} else if (diff > 0) {
 		space = atomic_fetchadd_long(&bd->bd_bufspace, diff);
 		/* Wake up the daemon on the transition. */
 		if (space < bd->bd_bufspacethresh &&
@@ -638,7 +763,7 @@
 	 * recursion.
 	 */
 	td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF;
-	fl = buf_flush(vp, flushbufqtarget);
+	fl = buf_flush(vp, bd, flushbufqtarget);
 	td->td_pflags &= norunbuf;
 	BD_LOCK(bd);
 	if (fl != 0)
@@ -700,6 +825,15 @@
 		if (buf_recycle(bd, false) != 0) {
 			if (bd_flushall(bd))
 				continue;
+			/*
+			 * Speed up dirty buffer flushing if we've run
+			 * out of clean buffers.  This is possible in
+			 * particular because softdep may hold many bufs
+			 * locked pending writes to other bufs which are
+			 * marked for delayed write, exhausting clean
+			 * space until they are written.
+			 */
+			bd_speedup();
 			BD_LOCK(bd);
 			if (bd->bd_wanted) {
 				msleep(&bd->bd_wanted, BD_LOCKPTR(bd),
@@ -1025,7 +1159,6 @@
 	    ("maxbcachebuf (%d) must be >= MAXBSIZE (%d)\n",
 	    maxbcachebuf, MAXBSIZE));
 	bq_init(&bqempty, QUEUE_EMPTY, -1, "bufq empty lock");
-	bq_init(&bqdirty, QUEUE_DIRTY, -1, "bufq dirty lock");
 	mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF);
 	mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF);
 	mtx_init(&bdirtylock, "dirty buf lock", NULL, MTX_DEF);
@@ -1093,7 +1226,6 @@
 	 */
 	hidirtybuffers = nbuf / 4 + 20;
 	dirtybufthresh = hidirtybuffers * 9 / 10;
-	numdirtybuffers = 0;
 	/*
 	 * To support extreme low-memory systems, make sure hidirtybuffers
 	 * cannot eat up all available buffer space.  This occurs when our
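bdirtyadd() and bdirtysub() lean on atomic_fetchadd_int() returning the value before the update, so exactly one thread observes each boundary: the daemon wakeup fires once at the midpoint crossing, and the bitset lock is taken only when the counter lands exactly on a watermark. A userspace model of the crossing test, using C11 atomics and made-up constants:

    #include <stdatomic.h>
    #include <stdio.h>

    #define LO 100
    #define HI 200

    static atomic_int ndirty;

    /*
     * Model of bdirtyadd(): atomic_fetch_add returns the old value, so
     * the daemon is woken exactly once as the count crosses the midpoint.
     */
    static void
    dirty_add(void)
    {
    	int old;

    	old = atomic_fetch_add(&ndirty, 1);
    	if (old == (LO + HI) / 2)
    		printf("wake buf daemon\n");
    	if (old == LO || old == HI)
    		printf("update domain bitsets\n");	/* bd_set() */
    }

    int
    main(void)
    {
    	for (int i = 0; i < HI + 1; i++)
    		dirty_add();
    	return (0);
    }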
@@ -1128,22 +1260,26 @@
 	 * One queue per-256mb up to the max.  More queues gives better
 	 * concurrency but less accurate LRU.
 	 */
-	clean_domains = MIN(howmany(maxbufspace, 256*1024*1024), CLEAN_DOMAINS);
-	for (i = 0 ; i < clean_domains; i++) {
+	buf_domains = MIN(howmany(maxbufspace, 256*1024*1024), BUF_DOMAINS);
+	for (i = 0 ; i < buf_domains; i++) {
 		struct bufdomain *bd;
 
-		bd = &bdclean[i];
+		bd = &bdomain[i];
 		bd_init(bd);
-		bd->bd_freebuffers = nbuf / clean_domains;
-		bd->bd_hifreebuffers = hifreebuffers / clean_domains;
-		bd->bd_lofreebuffers = lofreebuffers / clean_domains;
+		bd->bd_freebuffers = nbuf / buf_domains;
+		bd->bd_hifreebuffers = hifreebuffers / buf_domains;
+		bd->bd_lofreebuffers = lofreebuffers / buf_domains;
 		bd->bd_bufspace = 0;
-		bd->bd_maxbufspace = maxbufspace / clean_domains;
-		bd->bd_hibufspace = hibufspace / clean_domains;
-		bd->bd_lobufspace = lobufspace / clean_domains;
-		bd->bd_bufspacethresh = bufspacethresh / clean_domains;
+		bd->bd_maxbufspace = maxbufspace / buf_domains;
+		bd->bd_hibufspace = hibufspace / buf_domains;
+		bd->bd_lobufspace = lobufspace / buf_domains;
+		bd->bd_bufspacethresh = bufspacethresh / buf_domains;
+		bd->bd_numdirtybuffers = 0;
+		bd->bd_hidirtybuffers = hidirtybuffers / buf_domains;
+		bd->bd_lodirtybuffers = lodirtybuffers / buf_domains;
+		bd->bd_dirtybufthresh = dirtybufthresh / buf_domains;
 		/* Don't allow more than 2% of bufs in the per-cpu caches. */
-		bd->bd_lim = nbuf / clean_domains / 50 / mp_ncpus;
+		bd->bd_lim = nbuf / buf_domains / 50 / mp_ncpus;
 	}
 	getnewbufcalls = counter_u64_alloc(M_WAITOK);
 	getnewbufrestarts = counter_u64_alloc(M_WAITOK);
@@ -1327,6 +1463,13 @@
 	    (vm_offset_t)(bp->b_offset & PAGE_MASK));
 }
 
+static inline struct bufdomain *
+bufdomain(struct buf *bp)
+{
+
+	return (&bdomain[bp->b_domain]);
+}
+
 static struct bufqueue *
 bufqueue(struct buf *bp)
 {
@@ -1339,9 +1482,9 @@
 	case QUEUE_EMPTY:
 		return (&bqempty);
 	case QUEUE_DIRTY:
-		return (&bqdirty);
+		return (&bufdomain(bp)->bd_dirtyq);
 	case QUEUE_CLEAN:
-		return (&bdclean[bp->b_domain].bd_subq[bp->b_subqueue]);
+		return (&bufdomain(bp)->bd_subq[bp->b_subqueue]);
 	default:
 		break;
 	}
@@ -1404,14 +1547,14 @@
 		bq_remove(bq, bp);
 		BQ_UNLOCK(bq);
 	}
+	bd = bufdomain(bp);
 	if (qindex == QUEUE_CLEAN) {
-		bd = &bdclean[bp->b_domain];
 		if (bd->bd_lim != 0)
 			bq = &bd->bd_subq[PCPU_GET(cpuid)];
 		else
 			bq = bd->bd_cleanq;
 	} else
-		bq = &bqdirty;
+		bq = &bd->bd_dirtyq;
 	bq_insert(bq, bp, true);
 }
 
@@ -1439,7 +1582,7 @@
 	if (!LIST_EMPTY(&bp->b_dep))
 		buf_deallocate(bp);
 	bufkva_free(bp);
-	atomic_add_int(&bdclean[bp->b_domain].bd_freebuffers, 1);
+	atomic_add_int(&bufdomain(bp)->bd_freebuffers, 1);
 	BUF_UNLOCK(bp);
 	uma_zfree(buf_zone, bp);
 }
@@ -1715,9 +1858,10 @@
 	int domain;
 	int i;
 
-	domain = bd - bdclean;
+	domain = bd - bdomain;
 	bd->bd_cleanq = &bd->bd_subq[mp_maxid + 1];
 	bq_init(bd->bd_cleanq, QUEUE_CLEAN, mp_maxid + 1, "bufq clean lock");
+	bq_init(&bd->bd_dirtyq, QUEUE_DIRTY, -1, "bufq dirty lock");
 	for (i = 0; i <= mp_maxid; i++)
 		bq_init(&bd->bd_subq[i], QUEUE_CLEAN, i,
 		    "bufq clean subqueue lock");
@@ -1809,7 +1953,7 @@
 	if (bp->b_qindex != QUEUE_NONE)
 		panic("bq_insert: free buffer %p onto another queue?", bp);
 
-	bd = &bdclean[bp->b_domain];
+	bd = bufdomain(bp);
 	if (bp->b_flags & B_AGE) {
 		/* Place this buf directly on the real queue. */
 		if (bq->bq_index == QUEUE_CLEAN)
@@ -1926,8 +2070,8 @@
 
 	done = false;
 	for (i = 0; i < 5; i++) {
-		for (q = 0; q < clean_domains; q++)
-			if (buf_recycle(&bdclean[q], true) != 0)
+		for (q = 0; q < buf_domains; q++)
+			if (buf_recycle(&bdomain[q], true) != 0)
 				done = true;
 		if (done)
 			break;
@@ -2319,7 +2463,7 @@
 	if ((bp->b_flags & B_DELWRI) == 0) {
 		bp->b_flags |= /* XXX B_DONE | */ B_DELWRI;
 		reassignbuf(bp);
-		bdirtyadd();
+		bdirtyadd(bp);
 	}
 }
 
@@ -2347,7 +2491,7 @@
 	if (bp->b_flags & B_DELWRI) {
 		bp->b_flags &= ~B_DELWRI;
 		reassignbuf(bp);
-		bdirtysub();
+		bdirtysub(bp);
 	}
 	/*
 	 * Since it is now being written, we can clear its deferred write flag.
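bd_init() gives each domain mp_maxid + 2 queues: a clean subqueue per CPU, the shared clean queue in slot mp_maxid + 1 that bd_cleanq points at, and now the per-domain dirty queue. binsfree() only uses the current CPU's subqueue when per-CPU caching is enabled (bd_lim != 0). A toy model of that selection, hypothetical names:

    #include <stdio.h>

    #define MAXCPU 4

    struct queue {
    	int len;
    };

    struct domain {
    	struct queue subq[MAXCPU + 1];	/* per-CPU queues + global slot */
    	struct queue *cleanq;		/* points at subq[MAXCPU] */
    	int lim;			/* per-CPU cache limit; 0 disables */
    };

    /*
     * Model of the insertion policy in binsfree(): small configurations
     * (lim == 0) keep one global clean queue, others use the current CPU's.
     */
    static struct queue *
    pick_clean_queue(struct domain *bd, int cpuid)
    {
    	if (bd->lim != 0)
    		return (&bd->subq[cpuid]);
    	return (bd->cleanq);
    }

    int
    main(void)
    {
    	struct domain d = { .cleanq = &d.subq[MAXCPU], .lim = 1 };

    	printf("cpu2 -> subq %td\n", pick_clean_queue(&d, 2) - d.subq);
    	d.lim = 0;
    	printf("cpu2 -> subq %td\n", pick_clean_queue(&d, 2) - d.subq);
    	return (0);
    }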
@@ -2419,9 +2563,9 @@
 bwillwrite(void)
 {
 
-	if (numdirtybuffers >= hidirtybuffers) {
+	if (buf_dirty_count_severe()) {
 		mtx_lock(&bdirtylock);
-		while (numdirtybuffers >= hidirtybuffers) {
+		while (buf_dirty_count_severe()) {
 			bdirtywait = 1;
 			msleep(&bdirtywait, &bdirtylock, (PRIBIO + 4),
 			    "flswai", 0);
@@ -2437,7 +2581,7 @@
 buf_dirty_count_severe(void)
 {
 
-	return(numdirtybuffers >= hidirtybuffers);
+	return (!BIT_EMPTY(BUF_DOMAINS, &bdhidirty));
 }
 
 /*
@@ -2523,7 +2667,7 @@
 	if (!LIST_EMPTY(&bp->b_dep))
 		buf_deallocate(bp);
 	if (bp->b_flags & B_DELWRI)
-		bdirtysub();
+		bdirtysub(bp);
 	bp->b_flags &= ~(B_DELWRI | B_CACHE);
 	if ((bp->b_flags & B_VMIO) == 0) {
 		allocbuf(bp, 0);
@@ -3138,9 +3282,9 @@
 	else
 		metadata = false;
 	if (vp == NULL)
-		bd = &bdclean[0];
+		bd = &bdomain[0];
 	else
-		bd = &bdclean[vp->v_bufobj.bo_domain];
+		bd = &bdomain[vp->v_bufobj.bo_domain];
 
 	counter_u64_add(getnewbufcalls, 1);
 	reserved = false;
@@ -3186,11 +3330,11 @@
 SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp);
 
 static int
-buf_flush(struct vnode *vp, int target)
+buf_flush(struct vnode *vp, struct bufdomain *bd, int target)
 {
 	int flushed;
 
-	flushed = flushbufqueues(vp, target, 0);
+	flushed = flushbufqueues(vp, bd, target, 0);
 	if (flushed == 0) {
 		/*
 		 * Could not find any buffers without rollback
@@ -3199,7 +3343,7 @@
 		 */
 		if (vp != NULL && target > 2)
 			target /= 2;
-		flushbufqueues(vp, target, 1);
+		flushbufqueues(vp, bd, target, 1);
 	}
 	return (flushed);
 }
@@ -3207,6 +3351,8 @@
 static void
 buf_daemon()
 {
+	struct bufdomain *bd;
+	int speedupreq;
 	int lodirty;
 	int i;
 
@@ -3219,11 +3365,11 @@
 	/*
 	 * Start the buf clean daemons as children threads.
 	 */
-	for (i = 0 ; i < clean_domains; i++) {
+	for (i = 0 ; i < buf_domains; i++) {
 		int error;
 
 		error = kthread_add((void (*)(void *))bufspace_daemon,
-		    &bdclean[i], curproc, NULL, 0, 0, "bufspacedaemon-%d", i);
+		    &bdomain[i], curproc, NULL, 0, 0, "bufspacedaemon-%d", i);
 		if (error)
 			panic("error %d spawning bufspace daemon", error);
 	}
@@ -3238,20 +3384,30 @@
 		mtx_unlock(&bdlock);
 		kproc_suspend_check(bufdaemonproc);
 
-		lodirty = lodirtybuffers;
-		if (bd_speedupreq) {
-			lodirty = numdirtybuffers / 2;
-			bd_speedupreq = 0;
-		}
+
 		/*
-		 * Do the flush.  Limit the amount of in-transit I/O we
-		 * allow to build up, otherwise we would completely saturate
-		 * the I/O system.
+		 * Save speedupreq for this pass and reset to capture new
+		 * requests.
 		 */
-		while (numdirtybuffers > lodirty) {
-			if (buf_flush(NULL, numdirtybuffers - lodirty) == 0)
-				break;
-			kern_yield(PRI_USER);
+		speedupreq = bd_speedupreq;
+		bd_speedupreq = 0;
+
+		/*
+		 * Flush each domain sequentially according to its level and
+		 * the speedup request.
+		 */
+		for (i = 0; i < buf_domains; i++) {
+			bd = &bdomain[i];
+			if (speedupreq)
+				lodirty = bd->bd_numdirtybuffers / 2;
+			else
+				lodirty = bd->bd_lodirtybuffers;
+			while (bd->bd_numdirtybuffers > lodirty) {
+				if (buf_flush(NULL, bd,
+				    bd->bd_numdirtybuffers - lodirty) == 0)
+					break;
+				kern_yield(PRI_USER);
+			}
 		}
 
 		/*
@@ -3265,7 +3421,7 @@
 		 * to avoid endless loops on unlockable buffers.
 		 */
 		mtx_lock(&bdlock);
-		if (numdirtybuffers <= lodirtybuffers) {
+		if (BIT_EMPTY(BUF_DOMAINS, &bdlodirty)) {
 			/*
 			 * We reached our low water mark, reset the
 			 * request and sleep until we are needed again.
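buf_daemon() now latches bd_speedupreq once per pass, then flushes each domain toward its own target: half of the domain's current dirty count under a speedup request, bd_lodirtybuffers otherwise. A compact model of one pass, with a stand-in flusher:

    #include <stdbool.h>
    #include <stdio.h>

    #define NDOMAINS 2

    struct domain {
    	int ndirty;
    	int lodirty;
    };

    /* Stand-in for buf_flush(): pretend we can flush up to 10 bufs. */
    static int
    flush_some(struct domain *bd, int target)
    {
    	int n = target < 10 ? target : 10;

    	bd->ndirty -= n;
    	return (n);
    }

    /* Model of one buf_daemon pass: a speedup request halves the target. */
    static void
    daemon_pass(struct domain *domains, bool speedupreq)
    {
    	int i, lodirty;

    	for (i = 0; i < NDOMAINS; i++) {
    		struct domain *bd = &domains[i];

    		lodirty = speedupreq ? bd->ndirty / 2 : bd->lodirty;
    		while (bd->ndirty > lodirty) {
    			if (flush_some(bd, bd->ndirty - lodirty) == 0)
    				break;
    		}
    		printf("domain %d: %d dirty left\n", i, bd->ndirty);
    	}
    }

    int
    main(void)
    {
    	struct domain domains[NDOMAINS] = { { 40, 20 }, { 100, 20 } };

    	daemon_pass(domains, false);
    	return (0);
    }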
@@ -3304,7 +3460,8 @@
     0, "Number of buffers flushed with dependecies that require rollbacks");
 
 static int
-flushbufqueues(struct vnode *lvp, int target, int flushdeps)
+flushbufqueues(struct vnode *lvp, struct bufdomain *bd, int target,
+    int flushdeps)
 {
 	struct bufqueue *bq;
 	struct buf *sentinel;
@@ -3317,7 +3474,7 @@
 	bool unlock;
 
 	flushed = 0;
-	bq = &bqdirty;
+	bq = &bd->bd_dirtyq;
 	bp = NULL;
 	sentinel = malloc(sizeof(struct buf), M_TEMP, M_WAITOK | M_ZERO);
 	sentinel->b_qindex = QUEUE_SENTINEL;
@@ -3653,7 +3810,7 @@
 			panic("GB_NOWAIT_BD and GB_UNMAPPED %p", bp);
 		}
 		counter_u64_add(mappingrestarts, 1);
-		bufspace_wait(&bdclean[bp->b_domain], bp->b_vp, gbflags, 0, 0);
+		bufspace_wait(bufdomain(bp), bp->b_vp, gbflags, 0, 0);
 	}
 has_addr:
 	if (need_mapping) {
@@ -3851,7 +4008,7 @@
 	 */
 	if (flags & GB_NOCREAT)
 		return NULL;
-	if (bdclean[bo->bo_domain].bd_freebuffers == 0 &&
+	if (bdomain[bo->bo_domain].bd_freebuffers == 0 &&
 	    TD_IS_IDLETHREAD(curthread))
 		return NULL;
 
@@ -3908,7 +4065,7 @@
 		if (gbincore(bo, blkno)) {
 			BO_UNLOCK(bo);
 			bp->b_flags |= B_INVAL;
-			bufspace_release(&bdclean[bp->b_domain], maxsize);
+			bufspace_release(bufdomain(bp), maxsize);
 			brelse(bp);
 			goto loop;
 		}
@@ -3943,7 +4100,7 @@
 	}
 
 	allocbuf(bp, size);
-	bufspace_release(&bdclean[bp->b_domain], maxsize);
+	bufspace_release(bufdomain(bp), maxsize);
 	bp->b_flags &= ~B_DONE;
 }
 CTR4(KTR_BUF, "getblk(%p, %ld, %d) = %p", vp, (long)blkno, size, bp);
@@ -3972,7 +4129,7 @@
 		return (NULL);
 	}
 	allocbuf(bp, size);
-	bufspace_release(&bdclean[bp->b_domain], maxsize);
+	bufspace_release(bufdomain(bp), maxsize);
 	bp->b_flags |= B_INVAL;	/* b_dep cleared by getnewbuf() */
 	BUF_ASSERT_HELD(bp);
 	return (bp);
@@ -4841,7 +4998,7 @@
 	static volatile int bufobj_cleanq;
 
 	bo->bo_domain =
-	    atomic_fetchadd_int(&bufobj_cleanq, 1) % clean_domains;
+	    atomic_fetchadd_int(&bufobj_cleanq, 1) % buf_domains;
 	rw_init(BO_LOCKPTR(bo), "bufobj interlock");
 	bo->bo_private = private;
 	TAILQ_INIT(&bo->bo_clean.bv_hd);
@@ -5164,6 +5321,7 @@
 		}
 		db_printf("\n");
 	}
+	BUF_LOCKPRINTINFO(bp);
 #if defined(FULL_BUF_TRACKING)
 	db_printf("b_io_tracking: b_io_tcnt = %u\n", bp->b_io_tcnt);
@@ -5178,19 +5336,19 @@
 		db_printf("b_io_tracking: %s\n", bp->b_io_tracking);
 #endif
 	db_printf(" ");
-	BUF_LOCKPRINTINFO(bp);
 }
 
 DB_SHOW_COMMAND(bufqueues, bufqueues)
 {
 	struct bufdomain *bd;
-	int i, j;
+	struct buf *bp;
+	long total;
+	int i, j, cnt;
 
 	db_printf("bqempty: %d\n", bqempty.bq_len);
-	db_printf("bqdirty: %d\n", bqdirty.bq_len);
 
-	for (i = 0; i < clean_domains; i++) {
-		bd = &bdclean[i];
+	for (i = 0; i < buf_domains; i++) {
+		bd = &bdomain[i];
 		db_printf("Buf domain %d\n", i);
 		db_printf("\tfreebufs\t%d\n", bd->bd_freebuffers);
 		db_printf("\tlofreebufs\t%d\n", bd->bd_lofreebuffers);
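flushbufqueues() walks bd_dirtyq behind a sentinel buf whose b_qindex is QUEUE_SENTINEL, which preserves the scan position so the queue lock can be dropped while each buffer is flushed. A standalone model of the sentinel walk built on sys/queue.h (the kernel version malloc()s its sentinel and skips sentinels planted by other threads):

    #include <sys/queue.h>
    #include <stdio.h>

    struct node {
    	TAILQ_ENTRY(node) link;
    	int id;			/* -1 marks the sentinel */
    };

    TAILQ_HEAD(nodelist, node);

    /*
     * Model of the sentinel walk in flushbufqueues(): the sentinel keeps
     * our place so a lock could be dropped while each entry is worked on.
     */
    static void
    walk(struct nodelist *q)
    {
    	struct node sentinel = { .id = -1 };
    	struct node *np;

    	TAILQ_INSERT_HEAD(q, &sentinel, link);
    	for (;;) {
    		np = TAILQ_NEXT(&sentinel, link);
    		if (np == NULL)
    			break;
    		/* Advance the sentinel past the entry before "flushing". */
    		TAILQ_REMOVE(q, &sentinel, link);
    		TAILQ_INSERT_AFTER(q, np, &sentinel, link);
    		printf("flush %d\n", np->id);	/* lock droppable here */
    	}
    	TAILQ_REMOVE(q, &sentinel, link);
    }

    int
    main(void)
    {
    	struct nodelist q = TAILQ_HEAD_INITIALIZER(q);
    	struct node n[3] = { { .id = 0 }, { .id = 1 }, { .id = 2 } };

    	for (int i = 0; i < 3; i++)
    		TAILQ_INSERT_TAIL(&q, &n[i], link);
    	walk(&q);
    	return (0);
    }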
db_printf("\twakeup\t\t%d\n", bd->bd_wanted); db_printf("\tlim\t\t%d\n", bd->bd_lim); db_printf("\tCPU "); for (j = 0; j <= mp_maxid; j++) db_printf("%d, ", bd->bd_subq[j].bq_len); db_printf("\n"); + cnt = 0; + total = 0; + for (j = 0; j < nbuf; j++) + if (buf[j].b_domain == i && BUF_ISLOCKED(&buf[j])) { + cnt++; + total += buf[j].b_bufsize; + } + db_printf("\tLocked buffers: %d space %ld\n", cnt, total); + cnt = 0; + total = 0; + for (j = 0; j < nbuf; j++) + if (buf[j].b_domain == i) { + cnt++; + total += buf[j].b_bufsize; + } + db_printf("\tTotal buffers: %d space %ld\n", cnt, total); } }