Index: sys/kern/vfs_bio.c =================================================================== --- sys/kern/vfs_bio.c +++ sys/kern/vfs_bio.c @@ -2135,30 +2135,37 @@ void (*ckhashfunc)(struct buf *), struct buf **bpp) { struct buf *bp; - int readwait, rv; + struct thread *td; + int error, readwait, rv; CTR3(KTR_BUF, "breadn(%p, %jd, %d)", vp, blkno, size); + td = curthread; /* - * Can only return NULL if GB_LOCK_NOWAIT flag is specified. + * Can only return NULL if GB_LOCK_NOWAIT or GB_NOSPARSE flags + * are specified. */ - *bpp = bp = getblk(vp, blkno, size, 0, 0, flags); - if (bp == NULL) - return (EBUSY); + error = getblkx(vp, blkno, size, 0, 0, flags, &bp); + if (error != 0) { + *bpp = NULL; + return (error); + } + flags &= ~GB_NOSPARSE; + *bpp = bp; /* * If not found in cache, do some I/O */ readwait = 0; if ((bp->b_flags & B_CACHE) == 0) { - if (!TD_IS_IDLETHREAD(curthread)) { + if (!TD_IS_IDLETHREAD(td)) { #ifdef RACCT if (racct_enable) { - PROC_LOCK(curproc); - racct_add_buf(curproc, bp, 0); - PROC_UNLOCK(curproc); + PROC_LOCK(td->td_proc); + racct_add_buf(td->td_proc, bp, 0); + PROC_UNLOCK(td->td_proc); } #endif /* RACCT */ - curthread->td_ru.ru_inblock++; + td->td_ru.ru_inblock++; } bp->b_iocmd = BIO_READ; bp->b_flags &= ~B_INVAL; @@ -3819,8 +3826,21 @@ } } +struct buf * +getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo, + int flags) +{ + struct buf *bp; + int error; + + error = getblkx(vp, blkno, size, slpflag, slptimeo, flags, &bp); + if (error != 0) + return (NULL); + return (bp); +} + /* - * getblk: + * getblkx: * * Get a block given a specified block and offset into a file/device. * The buffers B_DONE bit will be cleared on return, making it almost @@ -3855,12 +3875,13 @@ * intends to issue a READ, the caller must clear B_INVAL and BIO_ERROR * prior to issuing the READ. biodone() will *not* clear B_INVAL. 
*/ -struct buf * -getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo, - int flags) +int +getblkx(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo, + int flags, struct buf **bpp) { struct buf *bp; struct bufobj *bo; + daddr_t d_blkno; int bsize, error, maxsize, vmio; off_t offset; @@ -3875,6 +3896,7 @@ flags &= ~(GB_UNMAPPED | GB_KVAALLOC); bo = &vp->v_bufobj; + d_blkno = blkno; loop: BO_RLOCK(bo); bp = gbincore(bo, blkno); @@ -3886,7 +3908,7 @@ */ lockflags = LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK; - if (flags & GB_LOCK_NOWAIT) + if ((flags & GB_LOCK_NOWAIT) != 0) lockflags |= LK_NOWAIT; error = BUF_TIMELOCK(bp, lockflags, @@ -3899,8 +3921,8 @@ if (error == ENOLCK) goto loop; /* We timed out or were interrupted. */ - else if (error) - return (NULL); + else if (error != 0) + return (error); /* If recursed, assume caller knows the rules. */ else if (BUF_LOCKRECURSED(bp)) goto end; @@ -4005,10 +4027,10 @@ * here. */ if (flags & GB_NOCREAT) - return NULL; + return (EEXIST); if (bdomain[bo->bo_domain].bd_freebuffers == 0 && TD_IS_IDLETHREAD(curthread)) - return NULL; + return (EBUSY); bsize = vn_isdisk(vp, NULL) ? DEV_BSIZE : bo->bo_bsize; KASSERT(bsize != 0, ("bsize == 0, check bo->bo_bsize")); @@ -4022,11 +4044,22 @@ flags &= ~(GB_UNMAPPED | GB_KVAALLOC); } maxsize = imax(maxsize, bsize); + if ((flags & GB_NOSPARSE) != 0 && vmio && + !vn_isdisk(vp, NULL)) { + error = VOP_BMAP(vp, blkno, NULL, &d_blkno, 0, 0); + KASSERT(error != EOPNOTSUPP, + ("GB_NOSPARSE from fs not supporting bmap, vp %p", + vp)); + if (error != 0) + return (error); + if (d_blkno == -1) + return (EJUSTRETURN); + } bp = getnewbuf(vp, slpflag, slptimeo, maxsize, flags); if (bp == NULL) { if (slpflag || slptimeo) - return NULL; + return (ETIMEDOUT); /* * XXX This is here until the sleep path is diagnosed * enough to work under very low memory conditions. @@ -4072,7 +4105,8 @@ * Insert the buffer into the hash, so that it can * be found by incore. 
*/ - bp->b_blkno = bp->b_lblkno = blkno; + bp->b_lblkno = blkno; + bp->b_blkno = d_blkno; bp->b_offset = offset; bgetvp(vp, bp); BO_UNLOCK(bo); @@ -4107,7 +4141,8 @@ buf_track(bp, __func__); KASSERT(bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); - return (bp); + *bpp = bp; + return (0); } /* Index: sys/kern/vfs_cluster.c =================================================================== --- sys/kern/vfs_cluster.c +++ sys/kern/vfs_cluster.c @@ -94,12 +94,14 @@ { struct buf *bp, *rbp, *reqbp; struct bufobj *bo; + struct thread *td; daddr_t blkno, origblkno; int maxra, racluster; int error, ncontig; int i; error = 0; + td = curthread; bo = &vp->v_bufobj; if (!unmapped_buf_allowed) gbflags &= ~GB_UNMAPPED; @@ -118,10 +120,14 @@ /* * get the requested block */ - *bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0, gbflags); - if (bp == NULL) - return (EBUSY); + error = getblkx(vp, lblkno, size, 0, 0, gbflags, &bp); + if (error != 0) { + *bpp = NULL; + return (error); + } + gbflags &= ~GB_NOSPARSE; origblkno = lblkno; + *bpp = reqbp = bp; /* * if it is in the cache, then check to see if the reads have been @@ -243,12 +249,12 @@ bstrategy(bp); #ifdef RACCT if (racct_enable) { - PROC_LOCK(curproc); - racct_add_buf(curproc, bp, 0); - PROC_UNLOCK(curproc); + PROC_LOCK(td->td_proc); + racct_add_buf(td->td_proc, bp, 0); + PROC_UNLOCK(td->td_proc); } #endif /* RACCT */ - curthread->td_ru.ru_inblock++; + td->td_ru.ru_inblock++; } /* @@ -303,12 +309,12 @@ bstrategy(rbp); #ifdef RACCT if (racct_enable) { - PROC_LOCK(curproc); - racct_add_buf(curproc, rbp, 0); - PROC_UNLOCK(curproc); + PROC_LOCK(td->td_proc); + racct_add_buf(td->td_proc, rbp, 0); + PROC_UNLOCK(td->td_proc); } #endif /* RACCT */ - curthread->td_ru.ru_inblock++; + td->td_ru.ru_inblock++; } if (reqbp) { Index: sys/sys/buf.h =================================================================== --- sys/sys/buf.h +++ sys/sys/buf.h @@ -479,6 +479,7 @@ #define GB_UNMAPPED 0x0008 /* Do not 
mmap buffer pages. */ #define GB_KVAALLOC 0x0010 /* But allocate KVA. */ #define GB_CKHASH 0x0020 /* If reading, calc checksum hash */ +#define GB_NOSPARSE 0x0040 /* Do not instantiate holes */ #ifdef _KERNEL extern int nbuf; /* The number of buffer headers */ @@ -540,6 +541,8 @@ struct buf *incore(struct bufobj *, daddr_t); struct buf *gbincore(struct bufobj *, daddr_t); struct buf *getblk(struct vnode *, daddr_t, int, int, int, int); +int getblkx(struct vnode *vp, daddr_t blkno, int size, int slpflag, + int slptimeo, int flags, struct buf **bpp); struct buf *geteblk(int, int); int bufwait(struct buf *); int bufwrite(struct buf *); Index: sys/ufs/ffs/ffs_vnops.c =================================================================== --- sys/ufs/ffs/ffs_vnops.c +++ sys/ufs/ffs/ffs_vnops.c @@ -462,6 +462,26 @@ #endif } +static int +ffs_read_hole(struct uio *uio, long xfersize, long *size) +{ + ssize_t saved_resid, tlen; + int error; + + while (xfersize > 0) { + tlen = min(xfersize, ZERO_REGION_SIZE); + saved_resid = uio->uio_resid; + error = vn_io_fault_uiomove(__DECONST(void *, zero_region), + tlen, uio); + if (error != 0) + return (error); + tlen = saved_resid - uio->uio_resid; + xfersize -= tlen; + *size -= tlen; + } + return (0); +} + /* * Vnode op for reading. */ @@ -483,9 +503,7 @@ off_t bytesinfile; long size, xfersize, blkoffset; ssize_t orig_resid; - int error; - int seqcount; - int ioflag; + int bflag, error, ioflag, seqcount; vp = ap->a_vp; uio = ap->a_uio; @@ -529,6 +547,7 @@ uio->uio_offset >= fs->fs_maxfilesize) return (EOVERFLOW); + bflag = GB_UNMAPPED | (uio->uio_segflg == UIO_NOCOPY ? 0 : GB_NOSPARSE); for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0) break; @@ -565,8 +584,7 @@ /* * Don't do readahead if this is the end of the file. 
*/ - error = bread_gb(vp, lbn, size, NOCRED, - GB_UNMAPPED, &bp); + error = bread_gb(vp, lbn, size, NOCRED, bflag, &bp); } else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) { /* * Otherwise if we are allowed to cluster, @@ -577,7 +595,7 @@ */ error = cluster_read(vp, ip->i_size, lbn, size, NOCRED, blkoffset + uio->uio_resid, - seqcount, GB_UNMAPPED, &bp); + seqcount, bflag, &bp); } else if (seqcount > 1) { /* * If we are NOT allowed to cluster, then @@ -589,17 +607,21 @@ */ u_int nextsize = blksize(fs, ip, nextlbn); error = breadn_flags(vp, lbn, size, &nextlbn, - &nextsize, 1, NOCRED, GB_UNMAPPED, NULL, &bp); + &nextsize, 1, NOCRED, bflag, NULL, &bp); } else { /* * Failing all of the above, just read what the * user asked for. Interestingly, the same as * the first option above. */ - error = bread_gb(vp, lbn, size, NOCRED, - GB_UNMAPPED, &bp); + error = bread_gb(vp, lbn, size, NOCRED, bflag, &bp); } - if (error) { + if (error == EJUSTRETURN) { + error = ffs_read_hole(uio, xfersize, &size); + if (error == 0) + continue; + } + if (error != 0) { brelse(bp); bp = NULL; break;