diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c index 6e8d2018eb8b..8e8b9ad96e06 100644 --- a/sys/kern/vfs_bio.c +++ b/sys/kern/vfs_bio.c @@ -1,1501 +1,1521 @@ /* * Copyright (c) 1994 John S. Dyson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice immediately at the beginning of the file, without modification, * this list of conditions, and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Absolutely no warranty of function or purpose is made by the author * John S. Dyson. * 4. This work was done expressly for inclusion into FreeBSD. Other use * is allowed if this notation is included. * 5. Modifications may be freely made to this file if the above conditions * are met. * - * $Id: vfs_bio.c,v 1.59 1995/08/24 13:59:14 davidg Exp $ + * $Id: vfs_bio.c,v 1.60 1995/08/28 09:18:53 julian Exp $ */ /* * this file contains a new buffer I/O scheme implementing a coherent * VM object and buffer cache scheme. Pains have been taken to make * sure that the performance degradation associated with schemes such * as this is not realized. * * Author: John S. Dyson * Significant help during the development and debugging phases * had been provided by David Greenman, also of the FreeBSD core team. 
*/ #define VMIO #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * System initialization */ static void vfs_update __P((void)); struct proc *updateproc; static struct kproc_desc up_kp = { "update", vfs_update, &updateproc }; SYSINIT_KT(update, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, (caddr_t)&up_kp) struct buf *buf; /* buffer header pool */ struct swqueue bswlist; void vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to); void vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to); void vfs_clean_pages(struct buf * bp); static void vfs_setdirty(struct buf *bp); +static __inline struct buf * gbincore(struct vnode * vp, daddr_t blkno); int needsbuffer; /* * Internal update daemon, process 3 * The variable vfs_update_wakeup allows for internal syncs. */ int vfs_update_wakeup; /* * buffers base kva */ caddr_t buffers_kva; /* * bogus page -- for I/O to/from partially complete buffers * this is a temporary solution to the problem, but it is not * really that bad. it would be better to split the buffer * for input in the case of buffers partially already in memory, * but the code is intricate enough already. */ vm_page_t bogus_page; vm_offset_t bogus_offset; int bufspace, maxbufspace; /* * advisory minimum for size of LRU queue or VMIO queue */ int minbuf; struct bufhashhdr bufhashtbl[BUFHSZ], invalhash; struct bqueues bufqueues[BUFFER_QUEUES]; /* * Initialize buffer headers and related structures. 
*/ void bufinit() { struct buf *bp; int i; TAILQ_INIT(&bswlist); LIST_INIT(&invalhash); /* first, make a null hash table */ for (i = 0; i < BUFHSZ; i++) LIST_INIT(&bufhashtbl[i]); /* next, make a null set of free lists */ for (i = 0; i < BUFFER_QUEUES; i++) TAILQ_INIT(&bufqueues[i]); buffers_kva = (caddr_t) kmem_alloc_pageable(buffer_map, MAXBSIZE * nbuf); /* finally, initialize each buffer header and stick on empty q */ for (i = 0; i < nbuf; i++) { bp = &buf[i]; bzero(bp, sizeof *bp); bp->b_flags = B_INVAL; /* we're just an empty header */ bp->b_dev = NODEV; bp->b_rcred = NOCRED; bp->b_wcred = NOCRED; bp->b_qindex = QUEUE_EMPTY; bp->b_vnbufs.le_next = NOLIST; bp->b_data = buffers_kva + i * MAXBSIZE; TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist); LIST_INSERT_HEAD(&invalhash, bp, b_hash); } /* * maxbufspace is currently calculated to support all filesystem blocks * to be 8K. If you happen to use a 16K filesystem, the size of the buffer * cache is still the same as it would be for 8K filesystems. This * keeps the size of the buffer cache "in check" for big block filesystems. */ minbuf = nbuf / 3; maxbufspace = 2 * (nbuf + 8) * PAGE_SIZE; bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE); bogus_page = vm_page_alloc(kernel_object, bogus_offset - VM_MIN_KERNEL_ADDRESS, VM_ALLOC_NORMAL); } /* * remove the buffer from the appropriate free list */ void bremfree(struct buf * bp) { int s = splbio(); if (bp->b_qindex != QUEUE_NONE) { TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist); bp->b_qindex = QUEUE_NONE; } else { panic("bremfree: removing a buffer when not on a queue"); } splx(s); } /* * Get a buffer with the specified data. Look in the cache first. 
*/
int
bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
    struct buf ** bpp)
{
	struct buf *blk;

	blk = getblk(vp, blkno, size, 0, 0);
	*bpp = blk;

	/* contents already valid in the cache: no transfer needed */
	if (blk->b_flags & B_CACHE)
		return (0);

	/* charge the block input to the current process, if there is one */
	if (curproc != NULL)
		curproc->p_stats->p_ru.ru_inblock++;

	blk->b_flags |= B_READ;
	blk->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);

	/* record (and take a reference on) the read credential once */
	if (blk->b_rcred == NOCRED) {
		if (cred != NOCRED)
			crhold(cred);
		blk->b_rcred = cred;
	}

	vfs_busy_pages(blk, 0);
	VOP_STRATEGY(blk);
	return (biowait(blk));
}

/*
 * Same contract as bread for the requested block; in addition, an
 * asynchronous read is started for each of the cnt read-ahead blocks
 * (rablkno[k], rabsize[k]) that inmem() does not already report as
 * resident.  Only the requested block is waited for.
 */
int
breadn(struct vnode * vp, daddr_t blkno, int size,
    daddr_t * rablkno, int *rabsize,
    int cnt, struct ucred * cred, struct buf ** bpp)
{
	struct buf *blk, *ra;
	int k;
	int started = 0;

	blk = getblk(vp, blkno, size, 0, 0);
	*bpp = blk;

	/* start (but do not yet wait for) the read of the main block */
	if ((blk->b_flags & B_CACHE) == 0) {
		if (curproc != NULL)
			curproc->p_stats->p_ru.ru_inblock++;
		blk->b_flags |= B_READ;
		blk->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
		if (blk->b_rcred == NOCRED) {
			if (cred != NOCRED)
				crhold(cred);
			blk->b_rcred = cred;
		}
		vfs_busy_pages(blk, 0);
		VOP_STRATEGY(blk);
		started = 1;
	}

	for (k = 0; k < cnt; k++) {
		if (inmem(vp, rablkno[k]))
			continue;

		ra = getblk(vp, rablkno[k], rabsize[k], 0, 0);
		if (ra->b_flags & B_CACHE) {
			/* turned out to be cached after all; give it back */
			brelse(ra);
			continue;
		}

		if (curproc != NULL)
			curproc->p_stats->p_ru.ru_inblock++;
		ra->b_flags |= B_READ | B_ASYNC;
		ra->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
		if (ra->b_rcred == NOCRED) {
			if (cred != NOCRED)
				crhold(cred);
			ra->b_rcred = cred;
		}
		vfs_busy_pages(ra, 0);
		VOP_STRATEGY(ra);
	}

	/* nothing was started for the main block: it was a cache hit */
	if (!started)
		return (0);
	return (biowait(blk));
}

/*
 * Write, release buffer on completion.  (Done by iodone
 * if async.)
*/ int bwrite(struct buf * bp) { int oldflags = bp->b_flags; if (bp->b_flags & B_INVAL) { brelse(bp); return (0); } if (!(bp->b_flags & B_BUSY)) panic("bwrite: buffer is not busy???"); bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI); bp->b_flags |= B_WRITEINPROG; if ((oldflags & (B_ASYNC|B_DELWRI)) == (B_ASYNC|B_DELWRI)) { reassignbuf(bp, bp->b_vp); } bp->b_vp->v_numoutput++; vfs_busy_pages(bp, 1); if (curproc != NULL) curproc->p_stats->p_ru.ru_oublock++; VOP_STRATEGY(bp); if ((oldflags & B_ASYNC) == 0) { int rtval = biowait(bp); if (oldflags & B_DELWRI) { reassignbuf(bp, bp->b_vp); } brelse(bp); return (rtval); } return (0); } int vn_bwrite(ap) struct vop_bwrite_args *ap; { return (bwrite(ap->a_bp)); } /* * Delayed write. (Buffer is marked dirty). */ void bdwrite(struct buf * bp) { if ((bp->b_flags & B_BUSY) == 0) { panic("bdwrite: buffer is not busy"); } if (bp->b_flags & B_INVAL) { brelse(bp); return; } if (bp->b_flags & B_TAPE) { bawrite(bp); return; } bp->b_flags &= ~(B_READ|B_RELBUF); if ((bp->b_flags & B_DELWRI) == 0) { bp->b_flags |= B_DONE | B_DELWRI; reassignbuf(bp, bp->b_vp); } /* * This bmap keeps the system from needing to do the bmap later, * perhaps when the system is attempting to do a sync. Since it * is likely that the indirect block -- or whatever other datastructure * that the filesystem needs is still in memory now, it is a good * thing to do this. Note also, that if the pageout daemon is * requesting a sync -- there might not be enough memory to do * the bmap then... So, this is important to do. */ if( bp->b_lblkno == bp->b_blkno) { VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL); } /* * Set the *dirty* buffer range based upon the VM system dirty pages. */ vfs_setdirty(bp); /* * We need to do this here to satisfy the vnode_pager and the * pageout daemon, so that it thinks that the pages have been * "cleaned". 
Note that since the pages are in a delayed write * buffer -- the VFS layer "will" see that the pages get written * out on the next sync, or perhaps the cluster will be completed. */ vfs_clean_pages(bp); brelse(bp); return; } /* * Asynchronous write. * Start output on a buffer, but do not wait for it to complete. * The buffer is released when the output completes. */ void bawrite(struct buf * bp) { bp->b_flags |= B_ASYNC; (void) VOP_BWRITE(bp); } /* * Release a buffer. */ void brelse(struct buf * bp) { int s; if (bp->b_flags & B_CLUSTER) { relpbuf(bp); return; } /* anyone need a "free" block? */ s = splbio(); if (needsbuffer) { needsbuffer = 0; wakeup(&needsbuffer); } /* anyone need this block? */ if (bp->b_flags & B_WANTED) { bp->b_flags &= ~(B_WANTED | B_AGE); wakeup(bp); } else if (bp->b_flags & B_VMIO) { bp->b_flags &= ~B_WANTED; wakeup(bp); } if (bp->b_flags & B_LOCKED) bp->b_flags &= ~B_ERROR; if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR)) || (bp->b_bufsize <= 0)) { bp->b_flags |= B_INVAL; bp->b_flags &= ~(B_DELWRI | B_CACHE); if (((bp->b_flags & B_VMIO) == 0) && bp->b_vp) brelvp(bp); } /* * VMIO buffer rundown. It is not very necessary to keep a VMIO buffer * constituted, so the B_INVAL flag is used to *invalidate* the buffer, * but the VM object is kept around. The B_NOCACHE flag is used to * invalidate the pages in the VM object. */ if (bp->b_flags & B_VMIO) { vm_offset_t foff; vm_object_t obj; int i, resid; vm_page_t m; int iototal = bp->b_bufsize; foff = 0; obj = 0; if (bp->b_npages) { if (bp->b_vp && bp->b_vp->v_mount) { foff = bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno; } else { /* * vnode pointer has been ripped away -- * probably file gone... 
*/ foff = bp->b_pages[0]->offset; } } for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if (m == bogus_page) { /* NOTE(review): obj was set to 0 above and is never assigned before this lookup -- confirm which VM object is intended */ m = vm_page_lookup(obj, foff); if (!m) { panic("brelse: page missing\n"); } bp->b_pages[i] = m; pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages); } resid = (m->offset + PAGE_SIZE) - foff; if (resid > iototal) resid = iototal; if (resid > 0) { /* * Don't invalidate the page if the local machine has already * modified it. This is the lesser of two evils, and should * be fixed. */ if (bp->b_flags & (B_NOCACHE | B_ERROR)) { vm_page_test_dirty(m); if (m->dirty == 0) { vm_page_set_invalid(m, foff, resid); if (m->valid == 0) vm_page_protect(m, VM_PROT_NONE); } } } foff += resid; iototal -= resid; } if (bp->b_flags & (B_INVAL | B_RELBUF)) { /* drop this buffer's bmapped count on every page it holds */ for(i=0;i<bp->b_npages;i++) { m = bp->b_pages[i]; --m->bmapped; if (m->bmapped == 0) { if (m->flags & PG_WANTED) { wakeup(m); m->flags &= ~PG_WANTED; } vm_page_test_dirty(m); if ((m->dirty & m->valid) == 0 && (m->flags & PG_REFERENCED) == 0 && !pmap_is_referenced(VM_PAGE_TO_PHYS(m))) { vm_page_cache(m); } else if ((m->flags & PG_ACTIVE) == 0) { vm_page_activate(m); m->act_count = 0; } } } bufspace -= bp->b_bufsize; pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages); bp->b_npages = 0; bp->b_bufsize = 0; bp->b_flags &= ~B_VMIO; if (bp->b_vp) brelvp(bp); } } if (bp->b_qindex != QUEUE_NONE) panic("brelse: free buffer onto another queue???"); /* enqueue */ /* buffers with no memory */ if (bp->b_bufsize == 0) { bp->b_qindex = QUEUE_EMPTY; TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist); LIST_REMOVE(bp, b_hash); LIST_INSERT_HEAD(&invalhash, bp, b_hash); bp->b_dev = NODEV; /* buffers with junk contents */ } else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) { bp->b_qindex = QUEUE_AGE; TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist); LIST_REMOVE(bp, b_hash); LIST_INSERT_HEAD(&invalhash, bp, b_hash); bp->b_dev = NODEV; /* buffers that are locked */ } else if (bp->b_flags &
B_LOCKED) { bp->b_qindex = QUEUE_LOCKED; TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist); /* buffers with stale but valid contents */ } else if (bp->b_flags & B_AGE) { bp->b_qindex = QUEUE_AGE; TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist); /* buffers with valid and quite potentially reuseable contents */ } else { bp->b_qindex = QUEUE_LRU; TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist); } /* unlock */ bp->b_flags &= ~(B_WANTED | B_BUSY | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); splx(s); } +/* + * Check to see if a block is currently memory resident. + */ +static __inline struct buf * +gbincore(struct vnode * vp, daddr_t blkno) +{ + struct buf *bp; + struct bufhashhdr *bh; + + bh = BUFHASH(vp, blkno); + bp = bh->lh_first; + + /* Search hash chain */ + while (bp != NULL) { + /* hit */ + if (bp->b_vp == vp && bp->b_lblkno == blkno) { + break; + } + bp = bp->b_hash.le_next; + } + return (bp); +} + /* * this routine implements clustered async writes for * clearing out B_DELWRI buffers... This is much better * than the old way of writing only one buffer at a time. 
*/ void vfs_bio_awrite(struct buf * bp) { int i; daddr_t lblkno = bp->b_lblkno; struct vnode *vp = bp->b_vp; int s; int ncl; struct buf *bpa; s = splbio(); if (vp->v_mount && (vp->v_flag & VVMIO) && (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) { int size = vp->v_mount->mnt_stat.f_iosize; int maxcl = MAXPHYS / size; for (i = 1; i < maxcl; i++) { - if ((bpa = incore(vp, lblkno + i)) && + if ((bpa = gbincore(vp, lblkno + i)) && ((bpa->b_flags & (B_BUSY | B_DELWRI | B_CLUSTEROK | B_INVAL)) == (B_DELWRI | B_CLUSTEROK)) && (bpa->b_bufsize == size)) { if ((bpa->b_blkno == bpa->b_lblkno) || (bpa->b_blkno != bp->b_blkno + (i * size) / DEV_BSIZE)) break; } else { break; } } ncl = i; /* * this is a possible cluster write */ if (ncl != 1) { bremfree(bp); cluster_wbuild(vp, bp, size, lblkno, ncl, -1); splx(s); return; } } /* * default (old) behavior, writing out only one block */ bremfree(bp); bp->b_flags |= B_BUSY | B_ASYNC; (void) VOP_BWRITE(bp); splx(s); } /* * Find a buffer header which is available for use. */ static struct buf * getnewbuf(int slpflag, int slptimeo, int doingvmio) { struct buf *bp; int s; int firstbp = 1; s = splbio(); start: if (bufspace >= maxbufspace) goto trytofreespace; /* can we constitute a new buffer? */ if ((bp = bufqueues[QUEUE_EMPTY].tqh_first)) { if (bp->b_qindex != QUEUE_EMPTY) panic("getnewbuf: inconsistent EMPTY queue"); bremfree(bp); goto fillbuf; } trytofreespace: /* * We keep the file I/O from hogging metadata I/O * This is desirable because file data is cached in the * VM/Buffer cache even if a buffer is freed. 
*/ if ((bp = bufqueues[QUEUE_AGE].tqh_first)) { if (bp->b_qindex != QUEUE_AGE) panic("getnewbuf: inconsistent AGE queue"); } else if ((bp = bufqueues[QUEUE_LRU].tqh_first)) { if (bp->b_qindex != QUEUE_LRU) panic("getnewbuf: inconsistent LRU queue"); } if (!bp) { /* wait for a free buffer of any kind */ needsbuffer = 1; tsleep(&needsbuffer, PRIBIO | slpflag, "newbuf", slptimeo); splx(s); return (0); } /* if we are a delayed write, convert to an async write */ if ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) { vfs_bio_awrite(bp); if (!slpflag && !slptimeo) { splx(s); return (0); } goto start; } if (bp->b_flags & B_WANTED) { bp->b_flags &= ~B_WANTED; wakeup(bp); } bremfree(bp); if (bp->b_flags & B_VMIO) { bp->b_flags |= B_RELBUF | B_BUSY | B_DONE; brelse(bp); bremfree(bp); } if (bp->b_vp) brelvp(bp); /* we are not free, nor do we contain interesting data */ if (bp->b_rcred != NOCRED) crfree(bp->b_rcred); if (bp->b_wcred != NOCRED) crfree(bp->b_wcred); fillbuf: bp->b_flags |= B_BUSY; LIST_REMOVE(bp, b_hash); LIST_INSERT_HEAD(&invalhash, bp, b_hash); splx(s); if (bp->b_bufsize) { allocbuf(bp, 0); } bp->b_flags = B_BUSY; bp->b_dev = NODEV; bp->b_vp = NULL; bp->b_blkno = bp->b_lblkno = 0; bp->b_iodone = 0; bp->b_error = 0; bp->b_resid = 0; bp->b_bcount = 0; bp->b_npages = 0; bp->b_wcred = bp->b_rcred = NOCRED; bp->b_data = buffers_kva + (bp - buf) * MAXBSIZE; bp->b_dirtyoff = bp->b_dirtyend = 0; bp->b_validoff = bp->b_validend = 0; if (bufspace >= maxbufspace) { s = splbio(); bp->b_flags |= B_INVAL; brelse(bp); goto trytofreespace; } return (bp); } /* * Check to see if a block is currently memory resident. 
*/ struct buf * incore(struct vnode * vp, daddr_t blkno) { struct buf *bp; struct bufhashhdr *bh; int s = splbio(); bh = BUFHASH(vp, blkno); bp = bh->lh_first; /* Search hash chain */ while (bp != NULL) { /* hit */ if (bp->b_vp == vp && bp->b_lblkno == blkno && (bp->b_flags & B_INVAL) == 0) { - splx(s); - return (bp); + break; } bp = bp->b_hash.le_next; } splx(s); - - return (NULL); + return (bp); } /* * Returns true if no I/O is needed to access the * associated VM object. This is like incore except * it also hunts around in the VM system for the data. */ int inmem(struct vnode * vp, daddr_t blkno) { vm_object_t obj; vm_offset_t off, toff, tinc; vm_page_t m; if (incore(vp, blkno)) return 1; if (vp->v_mount == NULL) return 0; if ((vp->v_object == NULL) || (vp->v_flag & VVMIO) == 0) return 0; obj = vp->v_object; tinc = PAGE_SIZE; if (tinc > vp->v_mount->mnt_stat.f_iosize) tinc = vp->v_mount->mnt_stat.f_iosize; off = blkno * vp->v_mount->mnt_stat.f_iosize; for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) { int mask; m = vm_page_lookup(obj, trunc_page(toff + off)); if (!m) return 0; if (vm_page_is_valid(m, toff + off, tinc) == 0) return 0; } return 1; } /* * now we set the dirty range for the buffer -- * for NFS -- if the file is mapped and pages have * been written to, let it know. We want the * entire range of the buffer to be marked dirty if * any of the pages have been written to for consistancy * with the b_validoff, b_validend set in the nfs write * code, and used by the nfs read code. */ static void vfs_setdirty(struct buf *bp) { int i; vm_object_t object; vm_offset_t boffset, offset; /* * We qualify the scan for modified pages on whether the * object has been flushed yet. The OBJ_WRITEABLE flag * is not cleared simply by protecting pages off. */ if ((bp->b_flags & B_VMIO) && ((object = bp->b_pages[0]->object)->flags & OBJ_WRITEABLE)) { /* * test the pages to see if they have been modified directly * by users through the VM system. 
*/ for (i = 0; i < bp->b_npages; i++) vm_page_test_dirty(bp->b_pages[i]); /* * scan forwards for the first page modified */ for (i = 0; i < bp->b_npages; i++) { if (bp->b_pages[i]->dirty) { break; } } boffset = i * PAGE_SIZE; if (boffset < bp->b_dirtyoff) { bp->b_dirtyoff = boffset; } /* * scan backwards for the last page modified */ for (i = bp->b_npages - 1; i >= 0; --i) { if (bp->b_pages[i]->dirty) { break; } } boffset = (i + 1) * PAGE_SIZE; offset = boffset + bp->b_pages[0]->offset; if (offset >= object->size) { boffset = object->size - bp->b_pages[0]->offset; } if (bp->b_dirtyend < boffset) { bp->b_dirtyend = boffset; } } } /* * Get a block given a specified block and offset into a file/device. * Returns the buffer, or NULL when a sleep on a busy buffer fails or * when no new buffer could be obtained and slpflag/slptimeo was given. */ struct buf * getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo) { struct buf *bp; int s; struct bufhashhdr *bh; vm_offset_t off; int nleft; s = splbio(); loop: - if (bp = incore(vp, blkno)) { - if (bp->b_flags & B_BUSY) { + if (bp = gbincore(vp, blkno)) { + if (bp->b_flags & (B_BUSY|B_INVAL)) { bp->b_flags |= B_WANTED; if (!tsleep(bp, PRIBIO | slpflag, "getblk", slptimeo)) goto loop; splx(s); return (struct buf *) NULL; } bp->b_flags |= B_BUSY | B_CACHE; bremfree(bp); /* * check for size inconsistencies */ if (bp->b_bcount != size) { allocbuf(bp, size); } splx(s); return (bp); } else { vm_object_t obj; int doingvmio; if ((obj = vp->v_object) && (vp->v_flag & VVMIO)) { doingvmio = 1; } else { doingvmio = 0; } if ((bp = getnewbuf(slpflag, slptimeo, doingvmio)) == 0) { if (slpflag || slptimeo) { /* * restore the caller's interrupt level before giving up, * as the other early return above does */ splx(s); return NULL; } goto loop; } /* * This code is used to make sure that a buffer is not * created while the getnewbuf routine is blocked. * Normally the vnode is locked so this isn't a problem. * VBLK type I/O requests, however, don't lock the vnode.
*/ - if (!VOP_ISLOCKED(vp) && incore(vp, blkno)) { + if (!VOP_ISLOCKED(vp) && gbincore(vp, blkno)) { bp->b_flags |= B_INVAL; brelse(bp); goto loop; } /* * Insert the buffer into the hash, so that it can * be found by incore. */ bp->b_blkno = bp->b_lblkno = blkno; bgetvp(vp, bp); LIST_REMOVE(bp, b_hash); bh = BUFHASH(vp, blkno); LIST_INSERT_HEAD(bh, bp, b_hash); if (doingvmio) { bp->b_flags |= (B_VMIO | B_CACHE); #if defined(VFS_BIO_DEBUG) if (vp->v_type != VREG) printf("getblk: vmioing file type %d???\n", vp->v_type); #endif } else { bp->b_flags &= ~B_VMIO; } splx(s); allocbuf(bp, size); return (bp); } } /* * Get an empty, disassociated buffer of given size. */ struct buf * geteblk(int size) { struct buf *bp; while ((bp = getnewbuf(0, 0, 0)) == 0); allocbuf(bp, size); bp->b_flags |= B_INVAL; return (bp); } /* * This code constitutes the buffer memory from either anonymous system * memory (in the case of non-VMIO operations) or from an associated * VM object (in the case of VMIO operations). * * Note that this code is tricky, and has many complications to resolve * deadlock or inconsistant data situations. Tread lightly!!! * * Modify the length of a buffer's underlying buffer storage without * destroying information (unless, of course the buffer is shrinking). 
*/ int allocbuf(struct buf * bp, int size) { int s; - int newbsize; + int newbsize, mbsize; int i; if (!(bp->b_flags & B_BUSY)) panic("allocbuf: buffer not busy"); if ((bp->b_flags & B_VMIO) == 0) { /* * Just get anonymous memory from the kernel */ + mbsize = ((size + DEV_BSIZE - 1) / DEV_BSIZE) * DEV_BSIZE; newbsize = round_page(size); if (newbsize < bp->b_bufsize) { vm_hold_free_pages( bp, (vm_offset_t) bp->b_data + newbsize, (vm_offset_t) bp->b_data + bp->b_bufsize); } else if (newbsize > bp->b_bufsize) { vm_hold_load_pages( bp, (vm_offset_t) bp->b_data + bp->b_bufsize, (vm_offset_t) bp->b_data + newbsize); } } else { vm_page_t m; int desiredpages; newbsize = ((size + DEV_BSIZE - 1) / DEV_BSIZE) * DEV_BSIZE; desiredpages = round_page(newbsize) / PAGE_SIZE; if (newbsize < bp->b_bufsize) { if (desiredpages < bp->b_npages) { pmap_qremove((vm_offset_t) trunc_page(bp->b_data) + desiredpages * PAGE_SIZE, (bp->b_npages - desiredpages)); for (i = desiredpages; i < bp->b_npages; i++) { m = bp->b_pages[i]; s = splhigh(); while ((m->flags & PG_BUSY) || (m->busy != 0)) { m->flags |= PG_WANTED; tsleep(m, PVM, "biodep", 0); } splx(s); if (m->bmapped == 0) { printf("allocbuf: bmapped is zero for page %d\n", i); panic("allocbuf: error"); } --m->bmapped; if (m->bmapped == 0) { vm_page_protect(m, VM_PROT_NONE); vm_page_free(m); } bp->b_pages[i] = NULL; } bp->b_npages = desiredpages; } } else if (newbsize > bp->b_bufsize) { vm_object_t obj; vm_offset_t tinc, off, toff, objoff; int pageindex, curbpnpages; struct vnode *vp; int bsize; vp = bp->b_vp; bsize = vp->v_mount->mnt_stat.f_iosize; if (bp->b_npages < desiredpages) { obj = vp->v_object; tinc = PAGE_SIZE; if (tinc > bsize) tinc = bsize; off = bp->b_lblkno * bsize; doretry: curbpnpages = bp->b_npages; bp->b_flags |= B_CACHE; for (toff = 0; toff < newbsize; toff += tinc) { int mask; int bytesinpage; pageindex = toff / PAGE_SIZE; objoff = trunc_page(toff + off); if (pageindex < curbpnpages) { int pb; m = bp->b_pages[pageindex]; if 
(m->offset != objoff) panic("allocbuf: page changed offset??!!!?"); bytesinpage = tinc; if (tinc > (newbsize - toff)) bytesinpage = newbsize - toff; if (!vm_page_is_valid(m, toff + off, bytesinpage)) { bp->b_flags &= ~B_CACHE; } if ((m->flags & PG_ACTIVE) == 0) { vm_page_activate(m); m->act_count = 0; } continue; } m = vm_page_lookup(obj, objoff); if (!m) { m = vm_page_alloc(obj, objoff, VM_ALLOC_NORMAL); if (!m) { int j; for (j = bp->b_npages; j < pageindex; j++) { PAGE_WAKEUP(bp->b_pages[j]); } VM_WAIT; goto doretry; } vm_page_activate(m); m->act_count = 0; m->valid = 0; bp->b_flags &= ~B_CACHE; } else if (m->flags & PG_BUSY) { int j; for (j = bp->b_npages; j < pageindex; j++) { PAGE_WAKEUP(bp->b_pages[j]); } s = splbio(); m->flags |= PG_WANTED; tsleep(m, PRIBIO, "pgtblk", 0); splx(s); goto doretry; } else { int pb; if ((curproc != pageproc) && (m->flags & PG_CACHE) && (cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min) { pagedaemon_wakeup(); } bytesinpage = tinc; if (tinc > (newbsize - toff)) bytesinpage = newbsize - toff; if (!vm_page_is_valid(m, toff + off, bytesinpage)) { bp->b_flags &= ~B_CACHE; } if ((m->flags & PG_ACTIVE) == 0) { vm_page_activate(m); m->act_count = 0; } m->flags |= PG_BUSY; } bp->b_pages[pageindex] = m; curbpnpages = pageindex + 1; } for (i = bp->b_npages; i < curbpnpages; i++) { m = bp->b_pages[i]; m->bmapped++; PAGE_WAKEUP(m); } bp->b_npages = curbpnpages; bp->b_data = buffers_kva + (bp - buf) * MAXBSIZE; pmap_qenter((vm_offset_t) bp->b_data, bp->b_pages, bp->b_npages); bp->b_data += off % PAGE_SIZE; } } } bufspace += (newbsize - bp->b_bufsize); bp->b_bufsize = newbsize; bp->b_bcount = size; return 1; } /* * Wait for buffer I/O completion, returning error status. */ int biowait(register struct buf * bp) { int s; s = splbio(); while ((bp->b_flags & B_DONE) == 0) tsleep(bp, PRIBIO, "biowait", 0); splx(s); if (bp->b_flags & B_EINTR) { bp->b_flags &= ~B_EINTR; return (EINTR); } if (bp->b_flags & B_ERROR) { return (bp->b_error ? 
bp->b_error : EIO); } else { return (0); } } /* * Finish I/O on a buffer, calling an optional function. * This is usually called from interrupt level, so process blocking * is not *a good idea*. */ void biodone(register struct buf * bp) { int s; s = splbio(); if (!(bp->b_flags & B_BUSY)) panic("biodone: buffer not busy"); if (bp->b_flags & B_DONE) { splx(s); printf("biodone: buffer already done\n"); return; } bp->b_flags |= B_DONE; if ((bp->b_flags & B_READ) == 0) { struct vnode *vp = bp->b_vp; vwakeup(bp); } #ifdef BOUNCE_BUFFERS if (bp->b_flags & B_BOUNCE) vm_bounce_free(bp); #endif /* call optional completion function if requested */ if (bp->b_flags & B_CALL) { bp->b_flags &= ~B_CALL; (*bp->b_iodone) (bp); splx(s); return; } if (bp->b_flags & B_VMIO) { int i, resid; vm_offset_t foff; vm_page_t m; vm_object_t obj; int iosize; struct vnode *vp = bp->b_vp; foff = vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno; obj = vp->v_object; if (!obj) { panic("biodone: no object"); } #if defined(VFS_BIO_DEBUG) if (obj->paging_in_progress < bp->b_npages) { printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n", obj->paging_in_progress, bp->b_npages); } #endif iosize = bp->b_bufsize; for (i = 0; i < bp->b_npages; i++) { int bogusflag = 0; m = bp->b_pages[i]; if (m == bogus_page) { bogusflag = 1; m = vm_page_lookup(obj, foff); if (!m) { #if defined(VFS_BIO_DEBUG) printf("biodone: page disappeared\n"); #endif --obj->paging_in_progress; continue; } bp->b_pages[i] = m; pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages); } #if defined(VFS_BIO_DEBUG) if (trunc_page(foff) != m->offset) { printf("biodone: foff(%d)/m->offset(%d) mismatch\n", foff, m->offset); } #endif resid = (m->offset + PAGE_SIZE) - foff; if (resid > iosize) resid = iosize; /* * In the write case, the valid and clean bits are * already changed correctly, so we only need to do this * here in the read case. 
*/ if ((bp->b_flags & B_READ) && !bogusflag && resid > 0) { - vm_page_set_valid(m, foff & (PAGE_SIZE-1), resid); - vm_page_set_clean(m, foff & (PAGE_SIZE-1), resid); + vm_page_set_validclean(m, foff & (PAGE_SIZE-1), resid); } /* * when debugging new filesystems or buffer I/O methods, this * is the most common error that pops up. if you see this, you * have not set the page busy flag correctly!!! */ if (m->busy == 0) { printf("biodone: page busy < 0, " "off: %ld, foff: %ld, " "resid: %d, index: %d\n", m->offset, foff, resid, i); printf(" iosize: %ld, lblkno: %ld, flags: 0x%x, npages: %d\n", bp->b_vp->v_mount->mnt_stat.f_iosize, bp->b_lblkno, bp->b_flags, bp->b_npages); printf(" valid: 0x%x, dirty: 0x%x, mapped: %d\n", m->valid, m->dirty, m->bmapped); panic("biodone: page busy < 0\n"); } --m->busy; if ((m->busy == 0) && (m->flags & PG_WANTED)) { m->flags &= ~PG_WANTED; wakeup(m); } --obj->paging_in_progress; foff += resid; iosize -= resid; } if (obj && obj->paging_in_progress == 0 && (obj->flags & OBJ_PIPWNT)) { obj->flags &= ~OBJ_PIPWNT; wakeup(obj); } } /* * For asynchronous completions, release the buffer now. The brelse * checks for B_WANTED and will do the wakeup there if necessary - so * no need to do a wakeup here in the async case. */ if (bp->b_flags & B_ASYNC) { brelse(bp); } else { bp->b_flags &= ~B_WANTED; wakeup(bp); } splx(s); } int count_lock_queue() { int count; struct buf *bp; count = 0; for (bp = bufqueues[QUEUE_LOCKED].tqh_first; bp != NULL; bp = bp->b_freelist.tqe_next) count++; return (count); } int vfs_update_interval = 30; -static void +void vfs_update() { - (void) spl0(); /* XXX redundant? wrong place?*/ + (void) spl0(); while (1) { tsleep(&vfs_update_wakeup, PRIBIO, "update", hz * vfs_update_interval); vfs_update_wakeup = 0; sync(curproc, NULL, NULL); } } /* * This routine is called in lieu of iodone in the case of * incomplete I/O. This keeps the busy status for pages * consistant. 
*/
void
vfs_unbusy_pages(struct buf * bp)
{
	struct vnode *vn;
	vm_object_t object;
	vm_offset_t base;
	vm_page_t pg;
	int idx;

	/* nothing is done for buffers that are not VMIO */
	if ((bp->b_flags & B_VMIO) == 0)
		return;

	vn = bp->b_vp;
	object = vn->v_object;
	/* page-aligned object offset of the start of this buffer */
	base = trunc_page(vn->v_mount->mnt_stat.f_iosize * bp->b_lblkno);

	for (idx = 0; idx < bp->b_npages; idx++) {
		pg = bp->b_pages[idx];

		/*
		 * A bogus_page placeholder is swapped back for the real
		 * page found in the object at this offset, and the whole
		 * page array is re-entered into the buffer's mapping.
		 */
		if (pg == bogus_page) {
			pg = vm_page_lookup(object, base + idx * PAGE_SIZE);
			if (pg == NULL) {
				panic("vfs_unbusy_pages: page missing\n");
			}
			bp->b_pages[idx] = pg;
			pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
		}

		/* one paging_in_progress count and one busy count per page */
		--object->paging_in_progress;
		--pg->busy;

		/* last busy count gone: wake anyone waiting on the page */
		if (pg->busy == 0 && (pg->flags & PG_WANTED) != 0) {
			pg->flags &= ~PG_WANTED;
			wakeup(pg);
		}
	}

	/* no paging left on the object: wake anyone waiting for that */
	if (object->paging_in_progress == 0 &&
	    (object->flags & OBJ_PIPWNT) != 0) {
		object->flags &= ~OBJ_PIPWNT;
		wakeup(object);
	}
}

/*
 * This routine is called before a device strategy routine.
 * It is used to tell the VM system that paging I/O is in
 * progress, and treat the pages associated with the buffer
 * almost as being PG_BUSY.  Also the object paging_in_progress
 * flag is handled to make sure that the object doesn't become
 * inconsistent.
*/ void vfs_busy_pages(struct buf * bp, int clear_modify) { int i; if (bp->b_flags & B_VMIO) { vm_object_t obj = bp->b_vp->v_object; vm_offset_t foff = bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno; int iocount = bp->b_bufsize; vfs_setdirty(bp); for (i = 0; i < bp->b_npages; i++) { vm_page_t m = bp->b_pages[i]; int resid = (m->offset + PAGE_SIZE) - foff; if (resid > iocount) resid = iocount; - obj->paging_in_progress++; - m->busy++; + if ((bp->b_flags & B_CLUSTER) == 0) { + obj->paging_in_progress++; + m->busy++; + } if (clear_modify) { vm_page_protect(m, VM_PROT_READ); - vm_page_set_valid(m, - foff & (PAGE_SIZE-1), resid); - vm_page_set_clean(m, + vm_page_set_validclean(m, foff & (PAGE_SIZE-1), resid); } else if (bp->b_bcount >= PAGE_SIZE) { if (m->valid && (bp->b_flags & B_CACHE) == 0) { bp->b_pages[i] = bogus_page; pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages); } } foff += resid; iocount -= resid; } } } /* * Tell the VM system that the pages associated with this buffer * are clean. This is used for delayed writes where the data is * going to go to disk eventually without additional VM intevention. 
*/ void vfs_clean_pages(struct buf * bp) { int i; if (bp->b_flags & B_VMIO) { vm_offset_t foff = bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno; int iocount = bp->b_bufsize; for (i = 0; i < bp->b_npages; i++) { vm_page_t m = bp->b_pages[i]; int resid = (m->offset + PAGE_SIZE) - foff; if (resid > iocount) resid = iocount; if (resid > 0) { - vm_page_set_valid(m, - foff & (PAGE_SIZE-1), resid); - vm_page_set_clean(m, + vm_page_set_validclean(m, foff & (PAGE_SIZE-1), resid); } foff += resid; iocount -= resid; } } } /* * Zero the parts of a buffer that are not backed by valid page data and * clear b_resid. Non-VMIO buffers are handed to clrbuf(). */ void vfs_bio_clrbuf(struct buf *bp) { int i; if( bp->b_flags & B_VMIO) { if( (bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE)) { int j; if( bp->b_pages[0]->valid != VM_PAGE_BITS_ALL) { for(j=0; j < bp->b_bufsize / DEV_BSIZE;j++) { bzero(bp->b_data + j * DEV_BSIZE, DEV_BSIZE); } } bp->b_resid = 0; return; } for(i=0;i<bp->b_npages;i++) { if( bp->b_pages[i]->valid == VM_PAGE_BITS_ALL) continue; if( bp->b_pages[i]->valid == 0) { bzero(bp->b_data + i * PAGE_SIZE, PAGE_SIZE); } else { int j; /* NOTE(review): the bound and validity test of this loop were lost in this copy and are reconstructed, assuming one valid bit per DEV_BSIZE chunk of the page -- confirm against the tree */ for(j=0;j<PAGE_SIZE/DEV_BSIZE;j++) { if( (bp->b_pages[i]->valid & (1<<j)) == 0) bzero(bp->b_data + i * PAGE_SIZE + j * DEV_BSIZE, DEV_BSIZE); } } bp->b_pages[i]->valid = VM_PAGE_BITS_ALL; } bp->b_resid = 0; } else { clrbuf(bp); } } /* * vm_hold_load_pages and vm_hold_free_pages move pages into and out of * a buffer's address space. The pages are anonymous and are * not associated with a file object.
*/ void vm_hold_load_pages(struct buf * bp, vm_offset_t froma, vm_offset_t toa) { vm_offset_t pg; vm_page_t p; vm_offset_t from = round_page(froma); vm_offset_t to = round_page(toa); for (pg = from; pg < to; pg += PAGE_SIZE) { tryagain: p = vm_page_alloc(kernel_object, pg - VM_MIN_KERNEL_ADDRESS, VM_ALLOC_NORMAL); if (!p) { VM_WAIT; goto tryagain; } vm_page_wire(p); pmap_kenter(pg, VM_PAGE_TO_PHYS(p)); bp->b_pages[((caddr_t) pg - bp->b_data) / PAGE_SIZE] = p; PAGE_WAKEUP(p); bp->b_npages++; } } void vm_hold_free_pages(struct buf * bp, vm_offset_t froma, vm_offset_t toa) { vm_offset_t pg; vm_page_t p; vm_offset_t from = round_page(froma); vm_offset_t to = round_page(toa); for (pg = from; pg < to; pg += PAGE_SIZE) { p = bp->b_pages[((caddr_t) pg - bp->b_data) / PAGE_SIZE]; bp->b_pages[((caddr_t) pg - bp->b_data) / PAGE_SIZE] = 0; pmap_kremove(pg); vm_page_free(p); --bp->b_npages; } } diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c index e6bde8c6e831..3f55ff99f626 100644 --- a/sys/kern/vfs_cluster.c +++ b/sys/kern/vfs_cluster.c @@ -1,661 +1,740 @@ /*- * Copyright (c) 1993 * The Regents of the University of California. All rights reserved. * Modifications/enhancements: * Copyright (c) 1995 John S. Dyson. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_cluster.c 8.7 (Berkeley) 2/13/94 - * $Id: vfs_cluster.c,v 1.16 1995/05/30 08:06:30 rgrimes Exp $ + * $Id: vfs_cluster.c,v 1.17 1995/06/28 12:31:47 davidg Exp $ */ #include #include #include #include #include #include #include #include #include #include #include +#include +#include #ifdef DEBUG #include #include int doreallocblks = 0; struct ctldebug debug13 = {"doreallocblks", &doreallocblks}; #else /* XXX for cluster_write */ #define doreallocblks 0 #endif /* * Local declarations */ -struct buf *cluster_rbuild __P((struct vnode *, u_quad_t, struct buf *, - daddr_t, daddr_t, long, int, long)); +static struct buf *cluster_rbuild __P((struct vnode *, u_quad_t, + daddr_t, daddr_t, long, int)); struct cluster_save *cluster_collectbufs __P((struct vnode *, struct buf *)); int totreads; int totreadblocks; +extern vm_page_t bogus_page; #ifdef DIAGNOSTIC /* * Set to 1 if reads of block zero should cause readahead to be done. * Set to 0 treats a read of block zero as a non-sequential read. 
* * Setting to one assumes that most reads of block zero of files are due to * sequential passes over the files (e.g. cat, sum) where additional blocks * will soon be needed. Setting to zero assumes that the majority are * surgical strikes to get particular info (e.g. size, file) where readahead * blocks will not be used and, in fact, push out other potentially useful * blocks from the cache. The former seems intuitive, but some quick tests * showed that the latter performed better from a system-wide point of view. */ int doclusterraz = 0; #define ISSEQREAD(vp, blk) \ (((blk) != 0 || doclusterraz) && \ ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr)) #else #define ISSEQREAD(vp, blk) \ (/* (blk) != 0 && */ ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr)) #endif +/* + * allow for three entire read-aheads... The system will + * adjust downwards rapidly if needed... + */ +#define RA_MULTIPLE_FAST 2 +#define RA_MULTIPLE_SLOW 3 +#define RA_SHIFTDOWN 1 /* approx lg2(RA_MULTIPLE) */ /* * This replaces bread. If this is a bread at the beginning of a file and * lastr is 0, we assume this is the first read and we'll read up to two * blocks if they are sequential. After that, we'll do regular read ahead * in clustered chunks. * bp is the block requested. * rbp is the read-ahead block. * If either is NULL, then you don't have to do the I/O. */ int cluster_read(vp, filesize, lblkno, size, cred, bpp) struct vnode *vp; u_quad_t filesize; daddr_t lblkno; long size; struct ucred *cred; struct buf **bpp; { struct buf *bp, *rbp; daddr_t blkno, rablkno, origlblkno; long flags; int error, num_ra, alreadyincore; + int i; + int seq; - origlblkno = lblkno; error = 0; /* * get the requested block */ + origlblkno = lblkno; *bpp = bp = getblk(vp, lblkno, size, 0, 0); + seq = ISSEQREAD(vp, lblkno); /* * if it is in the cache, then check to see if the reads have been * sequential. If they have, then try some read-ahead, otherwise * back-off on prospective read-aheads. 
*/ if (bp->b_flags & B_CACHE) { - int i; - - if (!ISSEQREAD(vp, origlblkno)) { + if (!seq) { vp->v_maxra = bp->b_lblkno + bp->b_bcount / size; - vp->v_ralen >>= 1; + vp->v_ralen >>= RA_SHIFTDOWN; return 0; - } else if( vp->v_maxra >= origlblkno) { - if ((vp->v_ralen + 1) < (MAXPHYS / size)) - vp->v_ralen++; - if ( vp->v_maxra >= (origlblkno + vp->v_ralen)) + } else if( vp->v_maxra > lblkno) { + if ( (vp->v_maxra + (vp->v_ralen / RA_MULTIPLE_SLOW)) >= (lblkno + vp->v_ralen)) { + if ((vp->v_ralen + 1) < RA_MULTIPLE_FAST*(MAXPHYS / size)) + ++vp->v_ralen; return 0; + } lblkno = vp->v_maxra; + } else { + lblkno += 1; } bp = NULL; } else { /* * if it isn't in the cache, then get a chunk from disk if * sequential, otherwise just get the block. */ bp->b_flags |= B_READ; lblkno += 1; curproc->p_stats->p_ru.ru_inblock++; /* XXX */ + vp->v_ralen = 0; } - /* - * if ralen is "none", then try a little - */ - if (vp->v_ralen == 0) - vp->v_ralen = 1; /* * assume no read-ahead */ alreadyincore = 1; rablkno = lblkno; /* * if we have been doing sequential I/O, then do some read-ahead */ - if (ISSEQREAD(vp, origlblkno)) { - int i; + if (seq) { + /* + * bump ralen a bit... + */ + if ((vp->v_ralen + 1) < RA_MULTIPLE_SLOW*(MAXPHYS / size)) + ++vp->v_ralen; /* * this code makes sure that the stuff that we have read-ahead * is still in the cache. If it isn't, we have been reading * ahead too much, and we need to back-off, otherwise we might * try to read more. 
*/ for (i = 0; i < vp->v_ralen; i++) { rablkno = lblkno + i; alreadyincore = (int) incore(vp, rablkno); if (!alreadyincore) { + if (inmem(vp, rablkno)) { + struct buf *bpt; + if (vp->v_maxra < rablkno) + vp->v_maxra = rablkno + 1; + continue; + } if (rablkno < vp->v_maxra) { vp->v_maxra = rablkno; - vp->v_ralen >>= 1; + vp->v_ralen >>= RA_SHIFTDOWN; alreadyincore = 1; - } else { - if (inmem(vp, rablkno)) { - if( vp->v_maxra < rablkno) - vp->v_maxra = rablkno + 1; - continue; - } - if ((vp->v_ralen + 1) < MAXPHYS / size) - vp->v_ralen++; } break; - } else if( vp->v_maxra < rablkno) { + } else if (vp->v_maxra < rablkno) { vp->v_maxra = rablkno + 1; } } } /* * we now build the read-ahead buffer if it is desirable. */ rbp = NULL; if (!alreadyincore && (rablkno + 1) * size <= filesize && - !(error = VOP_BMAP(vp, rablkno, NULL, &blkno, &num_ra)) && + !(error = VOP_BMAP(vp, rablkno, NULL, &blkno, &num_ra, NULL)) && blkno != -1) { - if ((vp->v_ralen + 1) < MAXPHYS / size) - vp->v_ralen++; if (num_ra > vp->v_ralen) num_ra = vp->v_ralen; if (num_ra) { - rbp = cluster_rbuild(vp, filesize, - NULL, rablkno, blkno, size, num_ra, B_READ | B_ASYNC); + rbp = cluster_rbuild(vp, filesize, rablkno, blkno, size, + num_ra + 1); } else { rbp = getblk(vp, rablkno, size, 0, 0); rbp->b_flags |= B_READ | B_ASYNC; rbp->b_blkno = blkno; } } /* - * if the synchronous read is a cluster, handle it, otherwise do a - * simple, non-clustered read. 
+ * handle the synchronous read */ if (bp) { if (bp->b_flags & (B_DONE | B_DELWRI)) panic("cluster_read: DONE bp"); else { vfs_busy_pages(bp, 0); error = VOP_STRATEGY(bp); vp->v_maxra = bp->b_lblkno + bp->b_bcount / size; totreads++; totreadblocks += bp->b_bcount / size; curproc->p_stats->p_ru.ru_inblock++; } } /* * and if we have read-aheads, do them too */ if (rbp) { vp->v_maxra = rbp->b_lblkno + rbp->b_bcount / size; if (error || (rbp->b_flags & B_CACHE)) { rbp->b_flags &= ~(B_ASYNC | B_READ); brelse(rbp); } else { - vfs_busy_pages(rbp, 0); + if ((rbp->b_flags & B_CLUSTER) == 0) + vfs_busy_pages(rbp, 0); (void) VOP_STRATEGY(rbp); totreads++; totreadblocks += rbp->b_bcount / size; curproc->p_stats->p_ru.ru_inblock++; } } if (bp && ((bp->b_flags & B_ASYNC) == 0)) return (biowait(bp)); return (error); } /* * If blocks are contiguous on disk, use this to provide clustered * read ahead. We will read as many blocks as possible sequentially * and then parcel them up into logical blocks in the buffer hash table. 
*/ -struct buf * -cluster_rbuild(vp, filesize, bp, lbn, blkno, size, run, flags) +static struct buf * +cluster_rbuild(vp, filesize, lbn, blkno, size, run) struct vnode *vp; u_quad_t filesize; - struct buf *bp; daddr_t lbn; daddr_t blkno; long size; int run; - long flags; { struct cluster_save *b_save; - struct buf *tbp; + struct buf *bp, *tbp; daddr_t bn; int i, inc, j; #ifdef DIAGNOSTIC if (size != vp->v_mount->mnt_stat.f_iosize) panic("cluster_rbuild: size %d != filesize %d\n", size, vp->v_mount->mnt_stat.f_iosize); #endif if (size * (lbn + run + 1) > filesize) --run; - if (run == 0) { - if (!bp) { - bp = getblk(vp, lbn, size, 0, 0); - bp->b_blkno = blkno; - bp->b_flags |= flags; - } - return (bp); - } - tbp = bp; - if (!tbp) { - tbp = getblk(vp, lbn, size, 0, 0); - } - if (tbp->b_flags & B_CACHE) { - return (tbp); - } else if (bp == NULL) { - tbp->b_flags |= B_ASYNC; - } - bp = getpbuf(); - bp->b_flags = flags | B_CALL | B_BUSY | B_CLUSTER; + + tbp = getblk(vp, lbn, size, 0, 0); + if (tbp->b_flags & B_CACHE) + return tbp; + + tbp->b_blkno = blkno; + tbp->b_flags |= B_ASYNC | B_READ; + if( ((tbp->b_flags & B_VMIO) == 0) || (run <= 1) ) + return tbp; + + bp = trypbuf(); + if (bp == 0) + return tbp; + + (vm_offset_t) bp->b_data |= ((vm_offset_t) tbp->b_data) & PAGE_MASK; + bp->b_flags = B_ASYNC | B_READ | B_CALL | B_BUSY | B_CLUSTER | B_VMIO; bp->b_iodone = cluster_callback; bp->b_blkno = blkno; bp->b_lblkno = lbn; pbgetvp(vp, bp); - b_save = malloc(sizeof(struct buf *) * (run + 1) + sizeof(struct cluster_save), + b_save = malloc(sizeof(struct buf *) * run + sizeof(struct cluster_save), M_SEGMENT, M_WAITOK); b_save->bs_nchildren = 0; b_save->bs_children = (struct buf **) (b_save + 1); bp->b_saveaddr = b_save; bp->b_bcount = 0; bp->b_bufsize = 0; bp->b_npages = 0; - if (tbp->b_flags & B_VMIO) - bp->b_flags |= B_VMIO; - inc = btodb(size); - for (bn = blkno, i = 0; i <= run; ++i, bn += inc) { + for (bn = blkno, i = 0; i < run; ++i, bn += inc) { if (i != 0) { + if 
((bp->b_npages * PAGE_SIZE) + size > MAXPHYS) + break; + if (incore(vp, lbn + i)) + break; tbp = getblk(vp, lbn + i, size, 0, 0); + if ((tbp->b_flags & B_CACHE) || - (tbp->b_flags & B_VMIO) != (bp->b_flags & B_VMIO)) { + (tbp->b_flags & B_VMIO) == 0) { + brelse(tbp); + break; + } + + for (j=0;jb_npages;j++) { + if (tbp->b_pages[j]->valid) { + break; + } + } + + if (j != tbp->b_npages) { + brelse(tbp); + break; + } + + tbp->b_flags |= B_READ | B_ASYNC; + if( tbp->b_blkno == tbp->b_lblkno) { + tbp->b_blkno = bn; + } else if (tbp->b_blkno != bn) { brelse(tbp); break; } - tbp->b_blkno = bn; - tbp->b_flags |= flags | B_READ | B_ASYNC; - } else { - tbp->b_flags |= flags | B_READ; } ++b_save->bs_nchildren; b_save->bs_children[i] = tbp; for (j = 0; j < tbp->b_npages; j += 1) { - bp->b_pages[j + bp->b_npages] = tbp->b_pages[j]; + vm_page_t m; + m = tbp->b_pages[j]; + ++m->busy; + ++m->object->paging_in_progress; + if (m->valid == VM_PAGE_BITS_ALL) { + m = bogus_page; + } + if ((bp->b_npages == 0) || + (bp->b_pages[bp->b_npages - 1] != m)) { + bp->b_pages[bp->b_npages] = m; + bp->b_npages++; + } } - bp->b_npages += tbp->b_npages; - bp->b_bcount += size; - bp->b_bufsize += size; + bp->b_bcount += tbp->b_bcount; + bp->b_bufsize += tbp->b_bufsize; } - pmap_qenter((vm_offset_t) bp->b_data, (vm_page_t *)bp->b_pages, bp->b_npages); + pmap_qenter(trunc_page((vm_offset_t) bp->b_data), + (vm_page_t *)bp->b_pages, bp->b_npages); return (bp); } /* * Cleanup after a clustered read or write. * This is complicated by the fact that any of the buffers might have * extra memory (if there were no empty buffer headers at allocbuf time) * that we will need to shift around. */ void cluster_callback(bp) struct buf *bp; { struct cluster_save *b_save; struct buf **bpp, *tbp; caddr_t cp; int error = 0; /* * Must propogate errors to all the components. 
*/ if (bp->b_flags & B_ERROR) error = bp->b_error; b_save = (struct cluster_save *) (bp->b_saveaddr); - pmap_qremove((vm_offset_t) bp->b_data, bp->b_npages); + pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages); /* * Move memory from the large cluster buffer into the component * buffers and mark IO as done on these. */ for (bpp = b_save->bs_children; b_save->bs_nchildren--; ++bpp) { tbp = *bpp; if (error) { tbp->b_flags |= B_ERROR; tbp->b_error = error; } biodone(tbp); } free(b_save, M_SEGMENT); relpbuf(bp); } /* * Do clustered write for FFS. * * Three cases: * 1. Write is not sequential (write asynchronously) * Write is sequential: * 2. beginning of cluster - begin cluster * 3. middle of a cluster - add to cluster * 4. end of a cluster - asynchronously write cluster */ void cluster_write(bp, filesize) struct buf *bp; u_quad_t filesize; { struct vnode *vp; daddr_t lbn; int maxclen, cursize; int lblocksize; vp = bp->b_vp; lblocksize = vp->v_mount->mnt_stat.f_iosize; lbn = bp->b_lblkno; /* Initialize vnode to beginning of file. */ if (lbn == 0) vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0; if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 || (bp->b_blkno != vp->v_lasta + btodb(lblocksize))) { maxclen = MAXPHYS / lblocksize - 1; if (vp->v_clen != 0) { /* * Next block is not sequential. * * If we are not writing at end of file, the process * seeked to another point in the file since its last * write, or we have reached our maximum cluster size, * then push the previous cluster. Otherwise try * reallocating to make it sequential. 
*/ cursize = vp->v_lastw - vp->v_cstart + 1; - cluster_wbuild(vp, NULL, lblocksize, - vp->v_cstart, cursize, lbn); + if (!doreallocblks || + (lbn + 1) * lblocksize != filesize || + lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) { + cluster_wbuild(vp, NULL, lblocksize, + vp->v_cstart, cursize, lbn); + } else { + struct buf **bpp, **endbp; + struct cluster_save *buflist; + + buflist = cluster_collectbufs(vp, bp); + endbp = &buflist->bs_children + [buflist->bs_nchildren - 1]; + if (VOP_REALLOCBLKS(vp, buflist)) { + /* + * Failed, push the previous cluster. + */ + for (bpp = buflist->bs_children; + bpp < endbp; bpp++) + brelse(*bpp); + free(buflist, M_SEGMENT); + cluster_wbuild(vp, NULL, lblocksize, + vp->v_cstart, cursize, lbn); + } else { + /* + * Succeeded, keep building cluster. + */ + for (bpp = buflist->bs_children; + bpp <= endbp; bpp++) + bdwrite(*bpp); + free(buflist, M_SEGMENT); + vp->v_lastw = lbn; + vp->v_lasta = bp->b_blkno; + return; + } + } } /* * Consider beginning a cluster. If at end of file, make * cluster as large as possible, otherwise find size of * existing cluster. */ if ((lbn + 1) * lblocksize != filesize && (bp->b_blkno == bp->b_lblkno) && - (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen) || - bp->b_blkno == -1)) { + (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, NULL) || + bp->b_blkno == -1)) { bawrite(bp); vp->v_clen = 0; vp->v_lasta = bp->b_blkno; vp->v_cstart = lbn + 1; vp->v_lastw = lbn; return; } vp->v_clen = maxclen; if (maxclen == 0) { /* I/O not contiguous */ vp->v_cstart = lbn + 1; bawrite(bp); } else { /* Wait for rest of cluster */ vp->v_cstart = lbn; bdwrite(bp); } } else if (lbn == vp->v_cstart + vp->v_clen) { /* * At end of cluster, write it out. */ cluster_wbuild(vp, bp, bp->b_bcount, vp->v_cstart, vp->v_clen + 1, lbn); vp->v_clen = 0; vp->v_cstart = lbn + 1; } else /* * In the middle of a cluster, so just delay the I/O for now. 
*/ bdwrite(bp); vp->v_lastw = lbn; vp->v_lasta = bp->b_blkno; } /* * This is an awful lot like cluster_rbuild...wish they could be combined. * The last lbn argument is the current block on which I/O is being * performed. Check to see that it doesn't fall in the middle of * the current block (if last_bp == NULL). */ void cluster_wbuild(vp, last_bp, size, start_lbn, len, lbn) struct vnode *vp; struct buf *last_bp; long size; daddr_t start_lbn; int len; daddr_t lbn; { struct cluster_save *b_save; struct buf *bp, *tbp, *pb; caddr_t cp; int i, j, s; #ifdef DIAGNOSTIC if (size != vp->v_mount->mnt_stat.f_iosize) panic("cluster_wbuild: size %d != filesize %d\n", size, vp->v_mount->mnt_stat.f_iosize); #endif redo: if( (lbn != -1) || (last_bp == 0)) { while ((!(tbp = incore(vp, start_lbn)) || (tbp->b_flags & B_BUSY) || (start_lbn == lbn)) && len) { ++start_lbn; --len; } pb = trypbuf(); /* Get more memory for current buffer */ if (len <= 1 || pb == NULL) { if (pb != NULL) relpbuf(pb); if (last_bp) { bawrite(last_bp); } else if (len) { bp = getblk(vp, start_lbn, size, 0, 0); bawrite(bp); } return; } tbp = getblk(vp, start_lbn, size, 0, 0); } else { tbp = last_bp; if( tbp->b_flags & B_BUSY) { printf("vfs_cluster: warning: buffer already busy\n"); } tbp->b_flags |= B_BUSY; last_bp = 0; pb = trypbuf(); if (pb == NULL) { bawrite(tbp); return; } } if (!(tbp->b_flags & B_DELWRI)) { relpbuf(pb); ++start_lbn; --len; brelse(tbp); goto redo; } /* * Extra memory in the buffer, punt on this buffer. XXX we could * handle this in most cases, but we would have to push the extra * memory down to after our max possible cluster size and then * potentially pull it back up if the cluster was terminated * prematurely--too much hassle. 
*/ if (tbp->b_bcount != tbp->b_bufsize) { relpbuf(pb); ++start_lbn; --len; bawrite(tbp); goto redo; } bp = pb; b_save = malloc(sizeof(struct buf *) * (len + 1) + sizeof(struct cluster_save), M_SEGMENT, M_WAITOK); b_save->bs_nchildren = 0; b_save->bs_children = (struct buf **) (b_save + 1); bp->b_saveaddr = b_save; bp->b_bcount = 0; bp->b_bufsize = 0; bp->b_npages = 0; if (tbp->b_flags & B_VMIO) bp->b_flags |= B_VMIO; bp->b_blkno = tbp->b_blkno; bp->b_lblkno = tbp->b_lblkno; + (vm_offset_t) bp->b_data |= ((vm_offset_t) tbp->b_data) & PAGE_MASK; bp->b_flags |= B_CALL | B_BUSY | B_CLUSTER; bp->b_iodone = cluster_callback; pbgetvp(vp, bp); for (i = 0; i < len; ++i, ++start_lbn) { if (i != 0) { /* * Block is not in core or the non-sequential block * ending our cluster was part of the cluster (in * which case we don't want to write it twice). */ if (!(tbp = incore(vp, start_lbn)) || (last_bp == NULL && start_lbn == lbn)) break; if ((tbp->b_flags & (B_INVAL | B_CLUSTEROK)) != B_CLUSTEROK) break; if ((tbp->b_npages + bp->b_npages) > (MAXPHYS / PAGE_SIZE)) break; + if ( (tbp->b_blkno != tbp->b_lblkno) && + ((bp->b_blkno + btodb(size) * i) != tbp->b_blkno)) + break; + /* * Get the desired block buffer (unless it is the * final sequential block whose buffer was passed in * explictly as last_bp). 
*/ if (last_bp == NULL || start_lbn != lbn) { if( tbp->b_flags & B_BUSY) break; tbp = getblk(vp, start_lbn, size, 0, 0); if (!(tbp->b_flags & B_DELWRI) || ((tbp->b_flags & B_VMIO) != (bp->b_flags & B_VMIO))) { brelse(tbp); break; } } else tbp = last_bp; } for (j = 0; j < tbp->b_npages; j += 1) { - bp->b_pages[j + bp->b_npages] = tbp->b_pages[j]; + vm_page_t m; + m = tbp->b_pages[j]; + ++m->busy; + ++m->object->paging_in_progress; + if ((bp->b_npages == 0) || + (bp->b_pages[bp->b_npages - 1] != m)) { + bp->b_pages[bp->b_npages] = m; + bp->b_npages++; + } } - bp->b_npages += tbp->b_npages; bp->b_bcount += size; bp->b_bufsize += size; tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI); tbp->b_flags |= B_ASYNC; s = splbio(); reassignbuf(tbp, tbp->b_vp); /* put on clean list */ ++tbp->b_vp->v_numoutput; splx(s); b_save->bs_children[i] = tbp; } b_save->bs_nchildren = i; - pmap_qenter((vm_offset_t) bp->b_data, (vm_page_t *) bp->b_pages, bp->b_npages); + pmap_qenter(trunc_page((vm_offset_t) bp->b_data), + (vm_page_t *) bp->b_pages, bp->b_npages); bawrite(bp); if (i < len) { len -= i; goto redo; } } /* * Collect together all the buffers in a cluster. * Plus add one additional buffer. */ struct cluster_save * cluster_collectbufs(vp, last_bp) struct vnode *vp; struct buf *last_bp; { struct cluster_save *buflist; daddr_t lbn; int i, len; len = vp->v_lastw - vp->v_cstart + 1; buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist), M_SEGMENT, M_WAITOK); buflist->bs_nchildren = 0; buflist->bs_children = (struct buf **) (buflist + 1); for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++) (void) bread(vp, lbn, last_bp->b_bcount, NOCRED, &buflist->bs_children[i]); buflist->bs_children[i] = last_bp; buflist->bs_nchildren = i + 1; return (buflist); }