diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c --- a/sys/kern/vfs_bio.c +++ b/sys/kern/vfs_bio.c @@ -4231,35 +4231,29 @@ } /* - * This code is used to make sure that a buffer is not - * created while the getnewbuf routine is blocked. - * This can be a problem whether the vnode is locked or not. - * If the buffer is created out from under us, we have to - * throw away the one we just created. * - * Note: this must occur before we associate the buffer - * with the vp especially considering limitations in - * the splay tree implementation when dealing with duplicate - * lblkno's. - */ - BO_LOCK(bo); - if (gbincore(bo, blkno)) { - BO_UNLOCK(bo); - bp->b_flags |= B_INVAL; - bufspace_release(bufdomain(bp), maxsize); - brelse(bp); - goto loop; - } - - /* * Insert the buffer into the hash, so that it can * be found by incore. + * + * We don't hold the bufobj interlock while allocating the new + * buffer. Consequently, we can race on buffer creation. This + * can be a problem whether the vnode is locked or not. If the + * buffer is created out from under us, we have to throw away + * the one we just created. */ bp->b_lblkno = blkno; bp->b_blkno = d_blkno; bp->b_offset = offset; - bgetvp(vp, bp); - BO_UNLOCK(bo); + error = bgetvp(vp, bp); + if (error != 0) { + KASSERT(error == EEXIST, + ("getblk: unexpected error %d from bgetvp", + error)); + bp->b_flags |= B_INVAL; + bufspace_release(bufdomain(bp), maxsize); + brelse(bp); + goto loop; + } /* * set B_VMIO bit. allocbuf() the buffer bigger. Since the diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -2697,12 +2697,12 @@ } /* - * Add the buffer to the sorted clean or dirty block list. - * - * NOTE: xflags is passed as a constant, optimizing this inline function! + * Add the buffer to the sorted clean or dirty block list. Return zero on + * success, EEXIST if a buffer with this identity already exists, or another + * error on allocation failure. */ -static void -buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags) +static inline int +buf_vlist_find_or_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags) { struct bufv *bv; struct buf *n; @@ -2713,30 +2713,69 @@ ("buf_vlist_add: bo %p does not allow bufs", bo)); KASSERT((xflags & BX_VNDIRTY) == 0 || (bo->bo_flag & BO_DEAD) == 0, ("dead bo %p", bo)); - KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, - ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags)); - bp->b_xflags |= xflags; + KASSERT((bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) == xflags, + ("buf_vlist_add: b_xflags %#x not set on bp %p", xflags, bp)); + if (xflags & BX_VNDIRTY) bv = &bo->bo_dirty; else bv = &bo->bo_clean; - /* - * Keep the list ordered. Optimize empty list insertion. Assume - * we tend to grow at the tail so lookup_le should usually be cheaper - * than _ge. - */ - if (bv->bv_cnt == 0 || - bp->b_lblkno > TAILQ_LAST(&bv->bv_hd, buflists)->b_lblkno) - TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs); - else if ((n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, bp->b_lblkno)) == NULL) + error = BUF_PCTRIE_INSERT_LOOKUP_LE(&bv->bv_root, bp, &n); + if (n == NULL) { + KASSERT(error != EEXIST, + ("buf_vlist_add: EEXIST but no existing buf found: bp %p", + bp)); + } else { + KASSERT((uint64_t)n->b_lblkno <= (uint64_t)bp->b_lblkno, + ("buf_vlist_add: out of order insert/lookup: bp %p n %p", + bp, n)); + KASSERT((n->b_lblkno == bp->b_lblkno) == (error == EEXIST), + ("buf_vlist_add: inconsistent result for existing buf: " + "error %d bp %p n %p", error, bp, n)); + } + if (error != 0) + return (error); + + /* Keep the list ordered. */ + if (n == NULL) { + KASSERT(TAILQ_EMPTY(&bv->bv_hd) || + (uint64_t)bp->b_lblkno < + (uint64_t)TAILQ_FIRST(&bv->bv_hd)->b_lblkno, + ("buf_vlist_add: queue order: " + "%p should be before first %p", + bp, TAILQ_FIRST(&bv->bv_hd))); TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs); - else + } else { + KASSERT(TAILQ_NEXT(n, b_bobufs) == NULL || + (uint64_t)bp->b_lblkno < + (uint64_t)TAILQ_NEXT(n, b_bobufs)->b_lblkno, + ("buf_vlist_add: queue order: " + "%p should be before next %p", + bp, TAILQ_NEXT(n, b_bobufs))); TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs); - error = BUF_PCTRIE_INSERT(&bv->bv_root, bp); - if (error) - panic("buf_vlist_add: Preallocated nodes insufficient."); + } + bv->bv_cnt++; + return (0); +} + +/* + * Add the buffer to the sorted clean or dirty block list. + * + * NOTE: xflags is passed as a constant, optimizing this inline function! + */ +static void +buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags) +{ + int error; + + KASSERT((bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) == 0, + ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags)); + bp->b_xflags |= xflags; + error = buf_vlist_find_or_add(bp, bo, xflags); + if (error) + panic("buf_vlist_add: error=%d", error); } /* @@ -2775,26 +2814,42 @@ /* * Associate a buffer with a vnode. */ -void +int bgetvp(struct vnode *vp, struct buf *bp) { struct bufobj *bo; + int error; bo = &vp->v_bufobj; - ASSERT_BO_WLOCKED(bo); + ASSERT_BO_UNLOCKED(bo); VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free")); CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags); VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp, ("bgetvp: bp already attached! %p", bp)); - vhold(vp); - bp->b_vp = vp; - bp->b_bufobj = bo; /* - * Insert onto list for new vnode. + * Add the buf to the vnode's clean list unless we lost a race and find + * an existing buf in either dirty or clean. */ - buf_vlist_add(bp, bo, BX_VNCLEAN); + bp->b_vp = vp; + bp->b_bufobj = bo; + bp->b_xflags |= BX_VNCLEAN; + error = EEXIST; + BO_LOCK(bo); + if (BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, bp->b_lblkno) == NULL) + error = buf_vlist_find_or_add(bp, bo, BX_VNCLEAN); + BO_UNLOCK(bo); + if (__predict_true(error == 0)) { + vhold(vp); + return (0); + } + if (error != EEXIST) + panic("bgetvp: buf_vlist_add error: %d", error); + bp->b_vp = NULL; + bp->b_bufobj = NULL; + bp->b_xflags &= ~BX_VNCLEAN; + return (error); } /* diff --git a/sys/sys/buf.h b/sys/sys/buf.h --- a/sys/sys/buf.h +++ b/sys/sys/buf.h @@ -603,7 +603,7 @@ int vmapbuf(struct buf *, void *, size_t, int); void vunmapbuf(struct buf *); void brelvp(struct buf *); -void bgetvp(struct vnode *, struct buf *); +int bgetvp(struct vnode *, struct buf *) __result_use_check; void pbgetbo(struct bufobj *bo, struct buf *bp); void pbgetvp(struct vnode *, struct buf *); void pbrelbo(struct buf *);