diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index 6e8d2018eb8b..8e8b9ad96e06 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -1,1501 +1,1521 @@
 /*
  * Copyright (c) 1994 John S. Dyson
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice immediately at the beginning of the file, without modification,
  *    this list of conditions, and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Absolutely no warranty of function or purpose is made by the author
  *    John S. Dyson.
  * 4. This work was done expressly for inclusion into FreeBSD.  Other use
  *    is allowed if this notation is included.
  * 5. Modifications may be freely made to this file if the above conditions
  *    are met.
  *
- * $Id: vfs_bio.c,v 1.59 1995/08/24 13:59:14 davidg Exp $
+ * $Id: vfs_bio.c,v 1.60 1995/08/28 09:18:53 julian Exp $
  */
 
 /*
  * this file contains a new buffer I/O scheme implementing a coherent
  * VM object and buffer cache scheme.  Pains have been taken to make
  * sure that the performance degradation associated with schemes such
  * as this is not realized.
  *
  * Author:  John S. Dyson
  * Significant help during the development and debugging phases
  * had been provided by David Greenman, also of the FreeBSD core team.
  */
 
 #define VMIO
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/proc.h>
 #include <sys/vnode.h>
 #include <vm/vm.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_page.h>
 #include <vm/vm_object.h>
 #include <sys/buf.h>
 #include <sys/mount.h>
 #include <sys/malloc.h>
 #include <sys/resourcevar.h>
 #include <sys/proc.h>
 
 #include <miscfs/specfs/specdev.h>
 
 /*
  * System initialization
  */
 
 static void vfs_update __P((void));
 struct	proc *updateproc;
 
 static struct kproc_desc up_kp = {
 	"update",
 	vfs_update,
 	&updateproc
 };
 SYSINIT_KT(update, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, (caddr_t)&up_kp)
 
 
 struct buf *buf;		/* buffer header pool */
 struct swqueue bswlist;
 
 void vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to);
 void vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to);
 void vfs_clean_pages(struct buf * bp);
 static void vfs_setdirty(struct buf *bp);
+static __inline struct buf * gbincore(struct vnode * vp, daddr_t blkno);
 
 int needsbuffer;
 
 /*
  * Internal update daemon, process 3
  *	The variable vfs_update_wakeup allows for internal syncs.
  */
 int vfs_update_wakeup;
 
 
 /*
  * buffers base kva
  */
 caddr_t buffers_kva;
 
 /*
  * bogus page -- for I/O to/from partially complete buffers
  * this is a temporary solution to the problem, but it is not
  * really that bad.  it would be better to split the buffer
  * for input in the case of buffers partially already in memory,
  * but the code is intricate enough already.
  */
 vm_page_t bogus_page;
 vm_offset_t bogus_offset;
 
 int bufspace, maxbufspace;
 
 /*
  * advisory minimum for size of LRU queue or VMIO queue
  */
 int minbuf;
 
 struct bufhashhdr bufhashtbl[BUFHSZ], invalhash;
 struct bqueues bufqueues[BUFFER_QUEUES];
 
 /*
  * Initialize buffer headers and related structures.
  */
 void
 bufinit()
 {
 	struct buf *bp;
 	int i;
 
 	TAILQ_INIT(&bswlist);
 	LIST_INIT(&invalhash);
 
 	/* first, make a null hash table */
 	for (i = 0; i < BUFHSZ; i++)
 		LIST_INIT(&bufhashtbl[i]);
 
 	/* next, make a null set of free lists */
 	for (i = 0; i < BUFFER_QUEUES; i++)
 		TAILQ_INIT(&bufqueues[i]);
 
 	buffers_kva = (caddr_t) kmem_alloc_pageable(buffer_map, MAXBSIZE * nbuf);
 	/* finally, initialize each buffer header and stick on empty q */
 	for (i = 0; i < nbuf; i++) {
 		bp = &buf[i];
 		bzero(bp, sizeof *bp);
 		bp->b_flags = B_INVAL;	/* we're just an empty header */
 		bp->b_dev = NODEV;
 		bp->b_rcred = NOCRED;
 		bp->b_wcred = NOCRED;
 		bp->b_qindex = QUEUE_EMPTY;
 		bp->b_vnbufs.le_next = NOLIST;
 		bp->b_data = buffers_kva + i * MAXBSIZE;
 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
 		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
 	}
 /*
  * maxbufspace is currently calculated to support all filesystem blocks
  * to be 8K.  If you happen to use a 16K filesystem, the size of the buffer
  * cache is still the same as it would be for 8K filesystems.  This
  * keeps the size of the buffer cache "in check" for big block filesystems.
  */
 	minbuf = nbuf / 3;
 	maxbufspace = 2 * (nbuf + 8) * PAGE_SIZE;
 
 	bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
 	bogus_page = vm_page_alloc(kernel_object,
 			bogus_offset - VM_MIN_KERNEL_ADDRESS, VM_ALLOC_NORMAL);
 
 }
 
 /*
  * remove the buffer from the appropriate free list
  */
 void
 bremfree(struct buf * bp)
 {
 	int s = splbio();
 
 	if (bp->b_qindex != QUEUE_NONE) {
 		TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
 		bp->b_qindex = QUEUE_NONE;
 	} else {
 		panic("bremfree: removing a buffer when not on a queue");
 	}
 	splx(s);
 }
 
 /*
  * Get a buffer with the specified data.  Look in the cache first.
  */
 int
 bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
     struct buf ** bpp)
 {
 	struct buf *bp;
 
 	bp = getblk(vp, blkno, size, 0, 0);
 	*bpp = bp;
 
 	/* if not found in cache, do some I/O */
 	if ((bp->b_flags & B_CACHE) == 0) {
 		if (curproc != NULL)
 			curproc->p_stats->p_ru.ru_inblock++;
 		bp->b_flags |= B_READ;
 		bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
 		if (bp->b_rcred == NOCRED) {
 			if (cred != NOCRED)
 				crhold(cred);
 			bp->b_rcred = cred;
 		}
 		vfs_busy_pages(bp, 0);
 		VOP_STRATEGY(bp);
 		return (biowait(bp));
 	}
 	return (0);
 }
 
 /*
  * Operates like bread, but also starts asynchronous I/O on
  * read-ahead blocks.
  */
 int
 breadn(struct vnode * vp, daddr_t blkno, int size,
     daddr_t * rablkno, int *rabsize,
     int cnt, struct ucred * cred, struct buf ** bpp)
 {
 	struct buf *bp, *rabp;
 	int i;
 	int rv = 0, readwait = 0;
 
 	*bpp = bp = getblk(vp, blkno, size, 0, 0);
 
 	/* if not found in cache, do some I/O */
 	if ((bp->b_flags & B_CACHE) == 0) {
 		if (curproc != NULL)
 			curproc->p_stats->p_ru.ru_inblock++;
 		bp->b_flags |= B_READ;
 		bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
 		if (bp->b_rcred == NOCRED) {
 			if (cred != NOCRED)
 				crhold(cred);
 			bp->b_rcred = cred;
 		}
 		vfs_busy_pages(bp, 0);
 		VOP_STRATEGY(bp);
 		++readwait;
 	}
 	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
 		if (inmem(vp, *rablkno))
 			continue;
 		rabp = getblk(vp, *rablkno, *rabsize, 0, 0);
 
 		if ((rabp->b_flags & B_CACHE) == 0) {
 			if (curproc != NULL)
 				curproc->p_stats->p_ru.ru_inblock++;
 			rabp->b_flags |= B_READ | B_ASYNC;
 			rabp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
 			if (rabp->b_rcred == NOCRED) {
 				if (cred != NOCRED)
 					crhold(cred);
 				rabp->b_rcred = cred;
 			}
 			vfs_busy_pages(rabp, 0);
 			VOP_STRATEGY(rabp);
 		} else {
 			brelse(rabp);
 		}
 	}
 
 	if (readwait) {
 		rv = biowait(bp);
 	}
 	return (rv);
 }
 
 /*
  * Write, release buffer on completion.  (Done by iodone
  * if async.)
  */
 int
 bwrite(struct buf * bp)
 {
 	int oldflags = bp->b_flags;
 
 	if (bp->b_flags & B_INVAL) {
 		brelse(bp);
 		return (0);
 	}
 	if (!(bp->b_flags & B_BUSY))
 		panic("bwrite: buffer is not busy???");
 
 	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
 	bp->b_flags |= B_WRITEINPROG;
 
 	if ((oldflags & (B_ASYNC|B_DELWRI)) == (B_ASYNC|B_DELWRI)) {
 		reassignbuf(bp, bp->b_vp);
 	}
 
 	bp->b_vp->v_numoutput++;
 	vfs_busy_pages(bp, 1);
 	if (curproc != NULL)
 		curproc->p_stats->p_ru.ru_oublock++;
 	VOP_STRATEGY(bp);
 
 	if ((oldflags & B_ASYNC) == 0) {
 		int rtval = biowait(bp);
 
 		if (oldflags & B_DELWRI) {
 			reassignbuf(bp, bp->b_vp);
 		}
 		brelse(bp);
 		return (rtval);
 	}
 	return (0);
 }
 
 int
 vn_bwrite(ap)
 	struct vop_bwrite_args *ap;
 {
 	return (bwrite(ap->a_bp));
 }
 
 /*
  * Delayed write. (Buffer is marked dirty).
  */
 void
 bdwrite(struct buf * bp)
 {
 
 	if ((bp->b_flags & B_BUSY) == 0) {
 		panic("bdwrite: buffer is not busy");
 	}
 	if (bp->b_flags & B_INVAL) {
 		brelse(bp);
 		return;
 	}
 	if (bp->b_flags & B_TAPE) {
 		bawrite(bp);
 		return;
 	}
 	bp->b_flags &= ~(B_READ|B_RELBUF);
 	if ((bp->b_flags & B_DELWRI) == 0) {
 		bp->b_flags |= B_DONE | B_DELWRI;
 		reassignbuf(bp, bp->b_vp);
 	}
 
 	/*
 	 * This bmap keeps the system from needing to do the bmap later,
 	 * perhaps when the system is attempting to do a sync.  Since it
 	 * is likely that the indirect block -- or whatever other datastructure
 	 * that the filesystem needs is still in memory now, it is a good
 	 * thing to do this.  Note also, that if the pageout daemon is
 	 * requesting a sync -- there might not be enough memory to do
 	 * the bmap then...  So, this is important to do.
 	 */
 	if( bp->b_lblkno == bp->b_blkno) {
 		VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL);
 	}
 
 	/*
 	 * Set the *dirty* buffer range based upon the VM system dirty pages.
 	 */
 	vfs_setdirty(bp);
 
 	/*
 	 * We need to do this here to satisfy the vnode_pager and the
 	 * pageout daemon, so that it thinks that the pages have been
 	 * "cleaned".  Note that since the pages are in a delayed write
 	 * buffer -- the VFS layer "will" see that the pages get written
 	 * out on the next sync, or perhaps the cluster will be completed.
 	 */
 	vfs_clean_pages(bp);
 	brelse(bp);
 	return;
 }
 
 /*
  * Asynchronous write.
  * Start output on a buffer, but do not wait for it to complete.
  * The buffer is released when the output completes.
  */
 void
 bawrite(struct buf * bp)
 {
 	bp->b_flags |= B_ASYNC;
 	(void) VOP_BWRITE(bp);
 }
 
 /*
  * Release a buffer.
  */
 void
 brelse(struct buf * bp)
 {
 	int s;
 
 	if (bp->b_flags & B_CLUSTER) {
 		relpbuf(bp);
 		return;
 	}
 	/* anyone need a "free" block? */
 	s = splbio();
 
 	if (needsbuffer) {
 		needsbuffer = 0;
 		wakeup(&needsbuffer);
 	}
 
 	/* anyone need this block? */
 	if (bp->b_flags & B_WANTED) {
 		bp->b_flags &= ~(B_WANTED | B_AGE);
 		wakeup(bp);
 	} else if (bp->b_flags & B_VMIO) {
 		bp->b_flags &= ~B_WANTED;
 		wakeup(bp);
 	}
 	if (bp->b_flags & B_LOCKED)
 		bp->b_flags &= ~B_ERROR;
 
 	if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR)) ||
 	    (bp->b_bufsize <= 0)) {
 		bp->b_flags |= B_INVAL;
 		bp->b_flags &= ~(B_DELWRI | B_CACHE);
 		if (((bp->b_flags & B_VMIO) == 0) && bp->b_vp)
 			brelvp(bp);
 	}
 
 	/*
 	 * VMIO buffer rundown.  It is not very necessary to keep a VMIO buffer
 	 * constituted, so the B_INVAL flag is used to *invalidate* the buffer,
 	 * but the VM object is kept around.  The B_NOCACHE flag is used to
 	 * invalidate the pages in the VM object.
 	 */
 	if (bp->b_flags & B_VMIO) {
-		vm_offset_t foff;
+		vm_offset_t foff = 0;
 		vm_object_t obj;
 		int i, resid;
 		vm_page_t m;
 		int iototal = bp->b_bufsize;
 
-		foff = 0;
-		obj = 0;
+		/* object that vm_page_lookup() searches for bogus_page slots */
+		obj = bp->b_vp ? bp->b_vp->v_object : NULL;
 		if (bp->b_npages) {
 			if (bp->b_vp && bp->b_vp->v_mount) {
 				foff = bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
 			} else {
 				/*
 				 * vnode pointer has been ripped away --
 				 * probably file gone...
 				 */
 				foff = bp->b_pages[0]->offset;
 			}
 		}
 		for (i = 0; i < bp->b_npages; i++) {
 			m = bp->b_pages[i];
 			if (m == bogus_page) {
 				m = vm_page_lookup(obj, foff);
 				if (!m) {
 					panic("brelse: page missing\n");
 				}
 				bp->b_pages[i] = m;
 				pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
 			}
 			resid = (m->offset + PAGE_SIZE) - foff;
 			if (resid > iototal)
 				resid = iototal;
 			if (resid > 0) {
 				/*
 				 * Don't invalidate the page if the local machine has already
 				 * modified it.  This is the lesser of two evils, and should
 				 * be fixed.
 				 */
 				if (bp->b_flags & (B_NOCACHE | B_ERROR)) {
 					vm_page_test_dirty(m);
 					if (m->dirty == 0) {
 						vm_page_set_invalid(m, foff, resid);
 						if (m->valid == 0)
 							vm_page_protect(m, VM_PROT_NONE);
 					}
 				}
 			}
 			foff += resid;
 			iototal -= resid;
 		}
 
 		if (bp->b_flags & (B_INVAL | B_RELBUF)) {
 			for(i=0;i<bp->b_npages;i++) {
 				m = bp->b_pages[i];
 				--m->bmapped;
 				if (m->bmapped == 0) {
 					if (m->flags & PG_WANTED) {
 						wakeup(m);
 						m->flags &= ~PG_WANTED;
 					}
 					vm_page_test_dirty(m);
 					if ((m->dirty & m->valid) == 0 &&
 						(m->flags & PG_REFERENCED) == 0 &&
 							!pmap_is_referenced(VM_PAGE_TO_PHYS(m))) {
 						vm_page_cache(m);
 					} else if ((m->flags & PG_ACTIVE) == 0) {
 						vm_page_activate(m);
 						m->act_count = 0;
 					}
 				}
 			}
 			bufspace -= bp->b_bufsize;
 			pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
 			bp->b_npages = 0;
 			bp->b_bufsize = 0;
 			bp->b_flags &= ~B_VMIO;
 			if (bp->b_vp)
 				brelvp(bp);
 		}
 	}
 	if (bp->b_qindex != QUEUE_NONE)
 		panic("brelse: free buffer onto another queue???");
 
 	/* enqueue */
 	/* buffers with no memory */
 	if (bp->b_bufsize == 0) {
 		bp->b_qindex = QUEUE_EMPTY;
 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
 		LIST_REMOVE(bp, b_hash);
 		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
 		bp->b_dev = NODEV;
 		/* buffers with junk contents */
 	} else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) {
 		bp->b_qindex = QUEUE_AGE;
 		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist);
 		LIST_REMOVE(bp, b_hash);
 		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
 		bp->b_dev = NODEV;
 		/* buffers that are locked */
 	} else if (bp->b_flags & B_LOCKED) {
 		bp->b_qindex = QUEUE_LOCKED;
 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
 		/* buffers with stale but valid contents */
 	} else if (bp->b_flags & B_AGE) {
 		bp->b_qindex = QUEUE_AGE;
 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist);
 		/* buffers with valid and quite potentially reuseable contents */
 	} else {
 		bp->b_qindex = QUEUE_LRU;
 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
 	}
 
 	/* unlock */
 	bp->b_flags &= ~(B_WANTED | B_BUSY | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
 	splx(s);
 }
 
+/*
+ * Hash lookup like incore(), but B_INVAL buffers match and spl is left alone.
+ */
+static __inline struct buf *
+gbincore(struct vnode * vp, daddr_t blkno)
+{
+	struct bufhashhdr *chain = BUFHASH(vp, blkno);
+	struct buf *cand;
+
+	/*
+	 * Walk the chain for (vp, blkno) and hand back the first header
+	 * whose vnode and logical block both match, or NULL at the end
+	 * of the chain.  b_flags is never examined and spl is not
+	 * touched here, so callers must handle both themselves.
+	 */
+	for (cand = chain->lh_first; cand != NULL;
+	    cand = cand->b_hash.le_next) {
+		if (cand->b_vp == vp && cand->b_lblkno == blkno)
+			return (cand);
+	}
+	return (NULL);
+}
+
 /*
  * this routine implements clustered async writes for
  * clearing out B_DELWRI buffers...  This is much better
  * than the old way of writing only one buffer at a time.
  */
 void
 vfs_bio_awrite(struct buf * bp)
 {
 	int i;
 	daddr_t lblkno = bp->b_lblkno;
 	struct vnode *vp = bp->b_vp;
 	int s;
 	int ncl;
 	struct buf *bpa;
 
 	s = splbio();
 	if (vp->v_mount && (vp->v_flag & VVMIO) &&
 	    (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
 		int size = vp->v_mount->mnt_stat.f_iosize;
 		int maxcl = MAXPHYS / size;
 
 		for (i = 1; i < maxcl; i++) {
-			if ((bpa = incore(vp, lblkno + i)) &&
+			if ((bpa = gbincore(vp, lblkno + i)) &&
 			    ((bpa->b_flags & (B_BUSY | B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
 			    (B_DELWRI | B_CLUSTEROK)) &&
 			    (bpa->b_bufsize == size)) {
 				if ((bpa->b_blkno == bpa->b_lblkno) ||
 				    (bpa->b_blkno != bp->b_blkno + (i * size) / DEV_BSIZE))
 					break;
 			} else {
 				break;
 			}
 		}
 		ncl = i;
 		/*
 		 * this is a possible cluster write
 		 */
 		if (ncl != 1) {
 			bremfree(bp);
 			cluster_wbuild(vp, bp, size, lblkno, ncl, -1);
 			splx(s);
 			return;
 		}
 	}
 	/*
 	 * default (old) behavior, writing out only one block
 	 */
 	bremfree(bp);
 	bp->b_flags |= B_BUSY | B_ASYNC;
 	(void) VOP_BWRITE(bp);
 	splx(s);
 }
 
 
 /*
  * Find a buffer header which is available for use.
  */
 static struct buf *
 getnewbuf(int slpflag, int slptimeo, int doingvmio)
 {
 	struct buf *bp;
 	int s;
 	int firstbp = 1;
 
 	s = splbio();
 start:
 	if (bufspace >= maxbufspace)
 		goto trytofreespace;
 
 	/* can we constitute a new buffer? */
 	if ((bp = bufqueues[QUEUE_EMPTY].tqh_first)) {
 		if (bp->b_qindex != QUEUE_EMPTY)
 			panic("getnewbuf: inconsistent EMPTY queue");
 		bremfree(bp);
 		goto fillbuf;
 	}
 trytofreespace:
 	/*
 	 * We keep the file I/O from hogging metadata I/O
 	 * This is desirable because file data is cached in the
 	 * VM/Buffer cache even if a buffer is freed.
 	 */
 	if ((bp = bufqueues[QUEUE_AGE].tqh_first)) {
 		if (bp->b_qindex != QUEUE_AGE)
 			panic("getnewbuf: inconsistent AGE queue");
 	} else if ((bp = bufqueues[QUEUE_LRU].tqh_first)) {
 		if (bp->b_qindex != QUEUE_LRU)
 			panic("getnewbuf: inconsistent LRU queue");
 	}
 	if (!bp) {
 		/* wait for a free buffer of any kind */
 		needsbuffer = 1;
 		tsleep(&needsbuffer, PRIBIO | slpflag, "newbuf", slptimeo);
 		splx(s);
 		return (0);
 	}
 
 	/* if we are a delayed write, convert to an async write */
 	if ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) {
 		vfs_bio_awrite(bp);
 		if (!slpflag && !slptimeo) {
 			splx(s);
 			return (0);
 		}
 		goto start;
 	}
 
 	if (bp->b_flags & B_WANTED) {
 		bp->b_flags &= ~B_WANTED;
 		wakeup(bp);
 	}
 	bremfree(bp);
 
 	if (bp->b_flags & B_VMIO) {
 		bp->b_flags |= B_RELBUF | B_BUSY | B_DONE;
 		brelse(bp);
 		bremfree(bp);
 	}
 
 	if (bp->b_vp)
 		brelvp(bp);
 
 	/* we are not free, nor do we contain interesting data */
 	if (bp->b_rcred != NOCRED)
 		crfree(bp->b_rcred);
 	if (bp->b_wcred != NOCRED)
 		crfree(bp->b_wcred);
 fillbuf:
 	bp->b_flags |= B_BUSY;
 	LIST_REMOVE(bp, b_hash);
 	LIST_INSERT_HEAD(&invalhash, bp, b_hash);
 	splx(s);
 	if (bp->b_bufsize) {
 		allocbuf(bp, 0);
 	}
 	bp->b_flags = B_BUSY;
 	bp->b_dev = NODEV;
 	bp->b_vp = NULL;
 	bp->b_blkno = bp->b_lblkno = 0;
 	bp->b_iodone = 0;
 	bp->b_error = 0;
 	bp->b_resid = 0;
 	bp->b_bcount = 0;
 	bp->b_npages = 0;
 	bp->b_wcred = bp->b_rcred = NOCRED;
 	bp->b_data = buffers_kva + (bp - buf) * MAXBSIZE;
 	bp->b_dirtyoff = bp->b_dirtyend = 0;
 	bp->b_validoff = bp->b_validend = 0;
 	if (bufspace >= maxbufspace) {
 		s = splbio();
 		bp->b_flags |= B_INVAL;
 		brelse(bp);
 		goto trytofreespace;
 	}
 	return (bp);
 }
 
 /*
  * Check to see if a block is currently memory resident.
  */
 struct buf *
 incore(struct vnode * vp, daddr_t blkno)
 {
 	struct buf *bp;
 	struct bufhashhdr *bh;
 
 	int s = splbio();
 
 	bh = BUFHASH(vp, blkno);
 	bp = bh->lh_first;
 
 	/* Search hash chain */
 	while (bp != NULL) {
 		/* hit */
 		if (bp->b_vp == vp && bp->b_lblkno == blkno &&
 		    (bp->b_flags & B_INVAL) == 0) {
-			splx(s);
-			return (bp);
+			break;
 		}
 		bp = bp->b_hash.le_next;
 	}
 	splx(s);
-
-	return (NULL);
+	return (bp);
 }
 
 /*
  * Returns true if no I/O is needed to access the
  * associated VM object.  This is like incore except
  * it also hunts around in the VM system for the data.
  */
 
 int
 inmem(struct vnode * vp, daddr_t blkno)
 {
 	vm_object_t obj;
 	vm_offset_t off, toff, tinc;
 	vm_page_t m;
 
 	if (incore(vp, blkno))
 		return 1;
 	if (vp->v_mount == NULL)
 		return 0;
 	if ((vp->v_object == NULL) || (vp->v_flag & VVMIO) == 0)
 		return 0;
 
 	obj = vp->v_object;
 	tinc = PAGE_SIZE;
 	if (tinc > vp->v_mount->mnt_stat.f_iosize)
 		tinc = vp->v_mount->mnt_stat.f_iosize;
 	off = blkno * vp->v_mount->mnt_stat.f_iosize;
 
 	for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
 		int mask;
 
 		m = vm_page_lookup(obj, trunc_page(toff + off));
 		if (!m)
 			return 0;
 		if (vm_page_is_valid(m, toff + off, tinc) == 0)
 			return 0;
 	}
 	return 1;
 }
 
 /*
  * now we set the dirty range for the buffer --
  * for NFS -- if the file is mapped and pages have
  * been written to, let it know.  We want the
  * entire range of the buffer to be marked dirty if
  * any of the pages have been written to for consistancy
  * with the b_validoff, b_validend set in the nfs write
  * code, and used by the nfs read code.
  */
 static void
 vfs_setdirty(struct buf *bp) {
 	int i;
 	vm_object_t object;
 	vm_offset_t boffset, offset;
 	/*
 	 * We qualify the scan for modified pages on whether the
 	 * object has been flushed yet.  The OBJ_WRITEABLE flag
 	 * is not cleared simply by protecting pages off.
 	 */
 	if ((bp->b_flags & B_VMIO) &&
 		((object = bp->b_pages[0]->object)->flags & OBJ_WRITEABLE)) {
 		/*
 		 * test the pages to see if they have been modified directly
 		 * by users through the VM system.
 		 */
 		for (i = 0; i < bp->b_npages; i++)
 			vm_page_test_dirty(bp->b_pages[i]);
 
 		/*
 		 * scan forwards for the first page modified
 		 */
 		for (i = 0; i < bp->b_npages; i++) {
 			if (bp->b_pages[i]->dirty) {
 				break;
 			}
 		}
 		boffset = i * PAGE_SIZE;
 		if (boffset < bp->b_dirtyoff) {
 			bp->b_dirtyoff = boffset;
 		}
 
 		/*
 		 * scan backwards for the last page modified
 		 */
 		for (i = bp->b_npages - 1; i >= 0; --i) {
 			if (bp->b_pages[i]->dirty) {
 				break;
 			}
 		}
 		boffset = (i + 1) * PAGE_SIZE;
 		offset = boffset + bp->b_pages[0]->offset;
 		if (offset >= object->size) {
 			boffset = object->size - bp->b_pages[0]->offset;
 		}
 		if (bp->b_dirtyend < boffset) {
 			bp->b_dirtyend = boffset;
 		}
 	}
 }
 
 /*
  * Get a block given a specified block and offset into a file/device.
  */
 struct buf *
 getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
 {
 	struct buf *bp;
 	int s;
 	struct bufhashhdr *bh;
-	vm_offset_t off;
-	int nleft;
 
 	s = splbio();
 loop:
-	if (bp = incore(vp, blkno)) {
-		if (bp->b_flags & B_BUSY) {
+	if ((bp = gbincore(vp, blkno))) {
+		if (bp->b_flags & (B_BUSY|B_INVAL)) {
 			bp->b_flags |= B_WANTED;
 			if (!tsleep(bp, PRIBIO | slpflag, "getblk", slptimeo))
 				goto loop;
 
 			splx(s);
 			return (struct buf *) NULL;
 		}
 		bp->b_flags |= B_BUSY | B_CACHE;
 		bremfree(bp);
 		/*
 		 * check for size inconsistancies
 		 */
 		if (bp->b_bcount != size) {
 			allocbuf(bp, size);
 		}
 		splx(s);
 		return (bp);
 	} else {
-		vm_object_t obj;
 		int doingvmio;
 
-		if ((obj = vp->v_object) && (vp->v_flag & VVMIO)) {
+		if (vp->v_object && (vp->v_flag & VVMIO)) {
 			doingvmio = 1;
 		} else {
 			doingvmio = 0;
 		}
 		if ((bp = getnewbuf(slpflag, slptimeo, doingvmio)) == 0) {
-			if (slpflag || slptimeo)
-				return NULL;
+			if (slpflag || slptimeo) {
+				/* getnewbuf() undid only its own splbio() */
+				splx(s);
+				return NULL;
+			}
 			goto loop;
 		}
 
 		/*
 		 * This code is used to make sure that a buffer is not
 		 * created while the getnewbuf routine is blocked.
 		 * Normally the vnode is locked so this isn't a problem.
 		 * VBLK type I/O requests, however, don't lock the vnode.
 		 */
-		if (!VOP_ISLOCKED(vp) && incore(vp, blkno)) {
+		if (!VOP_ISLOCKED(vp) && gbincore(vp, blkno)) {
 			bp->b_flags |= B_INVAL;
 			brelse(bp);
 			goto loop;
 		}
 
 		/*
 		 * Insert the buffer into the hash, so that it can
 		 * be found by incore.
 		 */
 		bp->b_blkno = bp->b_lblkno = blkno;
 		bgetvp(vp, bp);
 		LIST_REMOVE(bp, b_hash);
 		bh = BUFHASH(vp, blkno);
 		LIST_INSERT_HEAD(bh, bp, b_hash);
 
 		if (doingvmio) {
 			bp->b_flags |= (B_VMIO | B_CACHE);
 #if defined(VFS_BIO_DEBUG)
 			if (vp->v_type != VREG)
 				printf("getblk: vmioing file type %d???\n", vp->v_type);
 #endif
 		} else {
 			bp->b_flags &= ~B_VMIO;
 		}
 		splx(s);
 
 		allocbuf(bp, size);
 		return (bp);
 	}
 }
 
 /*
  * Get an empty, disassociated buffer of given size.
  */
 struct buf *
 geteblk(int size)
 {
 	struct buf *bp;
 
 	while ((bp = getnewbuf(0, 0, 0)) == 0);
 	allocbuf(bp, size);
 	bp->b_flags |= B_INVAL;
 	return (bp);
 }
 
 /*
  * This code constitutes the buffer memory from either anonymous system
  * memory (in the case of non-VMIO operations) or from an associated
  * VM object (in the case of VMIO operations).
  *
  * Note that this code is tricky, and has many complications to resolve
  * deadlock or inconsistant data situations.  Tread lightly!!!
  *
  * Modify the length of a buffer's underlying buffer storage without
  * destroying information (unless, of course the buffer is shrinking).
  */
 int
 allocbuf(struct buf * bp, int size)
 {
 
 	int s;
-	int newbsize;
+	int newbsize, mbsize;
 	int i;
 
 	if (!(bp->b_flags & B_BUSY))
 		panic("allocbuf: buffer not busy");
 
 	if ((bp->b_flags & B_VMIO) == 0) {
 		/*
 		 * Just get anonymous memory from the kernel
 		 */
+		mbsize = ((size + DEV_BSIZE - 1) / DEV_BSIZE) * DEV_BSIZE;
 		newbsize = round_page(size);
 
 		if (newbsize < bp->b_bufsize) {
 			vm_hold_free_pages(
 			    bp,
 			    (vm_offset_t) bp->b_data + newbsize,
 			    (vm_offset_t) bp->b_data + bp->b_bufsize);
 		} else if (newbsize > bp->b_bufsize) {
 			vm_hold_load_pages(
 			    bp,
 			    (vm_offset_t) bp->b_data + bp->b_bufsize,
 			    (vm_offset_t) bp->b_data + newbsize);
 		}
 	} else {
 		vm_page_t m;
 		int desiredpages;
 
 		newbsize = ((size + DEV_BSIZE - 1) / DEV_BSIZE) * DEV_BSIZE;
 		desiredpages = round_page(newbsize) / PAGE_SIZE;
 
 		if (newbsize < bp->b_bufsize) {
 			if (desiredpages < bp->b_npages) {
 				pmap_qremove((vm_offset_t) trunc_page(bp->b_data) +
 				    desiredpages * PAGE_SIZE, (bp->b_npages - desiredpages));
 				for (i = desiredpages; i < bp->b_npages; i++) {
 					m = bp->b_pages[i];
 					s = splhigh();
 					while ((m->flags & PG_BUSY) || (m->busy != 0)) {
 						m->flags |= PG_WANTED;
 						tsleep(m, PVM, "biodep", 0);
 					}
 					splx(s);
 
 					if (m->bmapped == 0) {
 						printf("allocbuf: bmapped is zero for page %d\n", i);
 						panic("allocbuf: error");
 					}
 					--m->bmapped;
 					if (m->bmapped == 0) {
 						vm_page_protect(m, VM_PROT_NONE);
 						vm_page_free(m);
 					}
 					bp->b_pages[i] = NULL;
 				}
 				bp->b_npages = desiredpages;
 			}
 		} else if (newbsize > bp->b_bufsize) {
 			vm_object_t obj;
 			vm_offset_t tinc, off, toff, objoff;
 			int pageindex, curbpnpages;
 			struct vnode *vp;
 			int bsize;
 
 			vp = bp->b_vp;
 			bsize = vp->v_mount->mnt_stat.f_iosize;
 
 			if (bp->b_npages < desiredpages) {
 				obj = vp->v_object;
 				tinc = PAGE_SIZE;
 				if (tinc > bsize)
 					tinc = bsize;
 				off = bp->b_lblkno * bsize;
 		doretry:
 				curbpnpages = bp->b_npages;
 				bp->b_flags |= B_CACHE;
 				for (toff = 0; toff < newbsize; toff += tinc) {
 					int mask;
 					int bytesinpage;
 
 					pageindex = toff / PAGE_SIZE;
 					objoff = trunc_page(toff + off);
 					if (pageindex < curbpnpages) {
 						int pb;
 
 						m = bp->b_pages[pageindex];
 						if (m->offset != objoff)
 							panic("allocbuf: page changed offset??!!!?");
 						bytesinpage = tinc;
 						if (tinc > (newbsize - toff))
 							bytesinpage = newbsize - toff;
 						if (!vm_page_is_valid(m, toff + off, bytesinpage)) {
 							bp->b_flags &= ~B_CACHE;
 						}
 						if ((m->flags & PG_ACTIVE) == 0) {
 							vm_page_activate(m);
 							m->act_count = 0;
 						}
 						continue;
 					}
 					m = vm_page_lookup(obj, objoff);
 					if (!m) {
 						m = vm_page_alloc(obj, objoff, VM_ALLOC_NORMAL);
 						if (!m) {
 							int j;
 
 							for (j = bp->b_npages; j < pageindex; j++) {
 								PAGE_WAKEUP(bp->b_pages[j]);
 							}
 							VM_WAIT;
 							goto doretry;
 						}
 						vm_page_activate(m);
 						m->act_count = 0;
 						m->valid = 0;
 						bp->b_flags &= ~B_CACHE;
 					} else if (m->flags & PG_BUSY) {
 						int j;
 
 						for (j = bp->b_npages; j < pageindex; j++) {
 							PAGE_WAKEUP(bp->b_pages[j]);
 						}
 
 						s = splbio();
 						m->flags |= PG_WANTED;
 						tsleep(m, PRIBIO, "pgtblk", 0);
 						splx(s);
 
 						goto doretry;
 					} else {
 						int pb;
 						if ((curproc != pageproc) &&
 							(m->flags & PG_CACHE) &&
 						    (cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min) {
 							pagedaemon_wakeup();
 						}
 						bytesinpage = tinc;
 						if (tinc > (newbsize - toff))
 							bytesinpage = newbsize - toff;
 						if (!vm_page_is_valid(m, toff + off, bytesinpage)) {
 							bp->b_flags &= ~B_CACHE;
 						}
 						if ((m->flags & PG_ACTIVE) == 0) {
 							vm_page_activate(m);
 							m->act_count = 0;
 						}
 						m->flags |= PG_BUSY;
 					}
 					bp->b_pages[pageindex] = m;
 					curbpnpages = pageindex + 1;
 				}
 				for (i = bp->b_npages; i < curbpnpages; i++) {
 					m = bp->b_pages[i];
 					m->bmapped++;
 					PAGE_WAKEUP(m);
 				}
 				bp->b_npages = curbpnpages;
 				bp->b_data = buffers_kva + (bp - buf) * MAXBSIZE;
 				pmap_qenter((vm_offset_t) bp->b_data, bp->b_pages, bp->b_npages);
 				bp->b_data += off % PAGE_SIZE;
 			}
 		}
 	}
 	bufspace += (newbsize - bp->b_bufsize);
 	bp->b_bufsize = newbsize;
 	bp->b_bcount = size;
 	return 1;
 }
 
 /*
  * Wait for buffer I/O completion, returning error status.
  */
 int
 biowait(register struct buf * bp)
 {
 	int s;
 
 	s = splbio();
 	while ((bp->b_flags & B_DONE) == 0)
 		tsleep(bp, PRIBIO, "biowait", 0);
 	splx(s);
 	if (bp->b_flags & B_EINTR) {
 		bp->b_flags &= ~B_EINTR;
 		return (EINTR);
 	}
 	if (bp->b_flags & B_ERROR) {
 		return (bp->b_error ? bp->b_error : EIO);
 	} else {
 		return (0);
 	}
 }
 
 /*
  * Finish I/O on a buffer, calling an optional function.
  * This is usually called from interrupt level, so process blocking
  * is not *a good idea*.
  */
 void
 biodone(register struct buf * bp)
 {
 	int s;
 
 	s = splbio();
 	if (!(bp->b_flags & B_BUSY))
 		panic("biodone: buffer not busy");
 
 	if (bp->b_flags & B_DONE) {
 		splx(s);
 		printf("biodone: buffer already done\n");
 		return;
 	}
 	bp->b_flags |= B_DONE;
 
 	if ((bp->b_flags & B_READ) == 0) {
 		struct vnode *vp = bp->b_vp;
 		vwakeup(bp);
 	}
 #ifdef BOUNCE_BUFFERS
 	if (bp->b_flags & B_BOUNCE)
 		vm_bounce_free(bp);
 #endif
 
 	/* call optional completion function if requested */
 	if (bp->b_flags & B_CALL) {
 		bp->b_flags &= ~B_CALL;
 		(*bp->b_iodone) (bp);
 		splx(s);
 		return;
 	}
 	if (bp->b_flags & B_VMIO) {
 		int i, resid;
 		vm_offset_t foff;
 		vm_page_t m;
 		vm_object_t obj;
 		int iosize;
 		struct vnode *vp = bp->b_vp;
 
 		foff = vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
 		obj = vp->v_object;
 		if (!obj) {
 			panic("biodone: no object");
 		}
 #if defined(VFS_BIO_DEBUG)
 		if (obj->paging_in_progress < bp->b_npages) {
 			printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n",
 			    obj->paging_in_progress, bp->b_npages);
 		}
 #endif
 		iosize = bp->b_bufsize;
 		for (i = 0; i < bp->b_npages; i++) {
 			int bogusflag = 0;
 			m = bp->b_pages[i];
 			if (m == bogus_page) {
 				bogusflag = 1;
 				m = vm_page_lookup(obj, foff);
 				if (!m) {
 #if defined(VFS_BIO_DEBUG)
 					printf("biodone: page disappeared\n");
 #endif
 					--obj->paging_in_progress;
 					continue;
 				}
 				bp->b_pages[i] = m;
 				pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
 			}
 #if defined(VFS_BIO_DEBUG)
 			if (trunc_page(foff) != m->offset) {
 				printf("biodone: foff(%d)/m->offset(%d) mismatch\n", foff, m->offset);
 			}
 #endif
 			resid = (m->offset + PAGE_SIZE) - foff;
 			if (resid > iosize)
 				resid = iosize;
 			/*
 			 * In the write case, the valid and clean bits are
 			 * already changed correctly, so we only need to do this
 			 * here in the read case.
 			 */
 			if ((bp->b_flags & B_READ) && !bogusflag && resid > 0) {
-				vm_page_set_valid(m, foff & (PAGE_SIZE-1), resid);
-				vm_page_set_clean(m, foff & (PAGE_SIZE-1), resid);
+				vm_page_set_validclean(m, foff & (PAGE_SIZE-1), resid);
 			}
 
 			/*
 			 * when debugging new filesystems or buffer I/O methods, this
 			 * is the most common error that pops up.  if you see this, you
 			 * have not set the page busy flag correctly!!!
 			 */
 			if (m->busy == 0) {
 				printf("biodone: page busy < 0, "
 				    "off: %ld, foff: %ld, "
 				    "resid: %d, index: %d\n",
 				    m->offset, foff, resid, i);
 				printf(" iosize: %ld, lblkno: %ld, flags: 0x%x, npages: %d\n",
 				    bp->b_vp->v_mount->mnt_stat.f_iosize,
 				    bp->b_lblkno, bp->b_flags, bp->b_npages);
 				printf(" valid: 0x%x, dirty: 0x%x, mapped: %d\n",
 				    m->valid, m->dirty, m->bmapped);
 				panic("biodone: page busy < 0\n");
 			}
 			--m->busy;
 			if ((m->busy == 0) && (m->flags & PG_WANTED)) {
 				m->flags &= ~PG_WANTED;
 				wakeup(m);
 			}
 			--obj->paging_in_progress;
 			foff += resid;
 			iosize -= resid;
 		}
 		if (obj && obj->paging_in_progress == 0 &&
 		    (obj->flags & OBJ_PIPWNT)) {
 			obj->flags &= ~OBJ_PIPWNT;
 			wakeup(obj);
 		}
 	}
 	/*
 	 * For asynchronous completions, release the buffer now. The brelse
 	 * checks for B_WANTED and will do the wakeup there if necessary - so
 	 * no need to do a wakeup here in the async case.
 	 */
 
 	if (bp->b_flags & B_ASYNC) {
 		brelse(bp);
 	} else {
 		bp->b_flags &= ~B_WANTED;
 		wakeup(bp);
 	}
 	splx(s);
 }
 
 int
 count_lock_queue()
 {
 	int count;
 	struct buf *bp;
 
 	count = 0;
 	for (bp = bufqueues[QUEUE_LOCKED].tqh_first;
 	    bp != NULL;
 	    bp = bp->b_freelist.tqe_next)
 		count++;
 	return (count);
 }
 
 int vfs_update_interval = 30;
 
-static void
+void
 vfs_update()
 {
-	(void) spl0();		/* XXX redundant?  wrong place?*/
+	(void) spl0();
 	while (1) {
 		tsleep(&vfs_update_wakeup, PRIBIO, "update",
 		    hz * vfs_update_interval);
 		vfs_update_wakeup = 0;
 		sync(curproc, NULL, NULL);
 	}
 }
 
 /*
  * This routine is called in lieu of iodone in the case of
  * incomplete I/O.  This keeps the busy status for pages
  * consistant.
  */
 void
 vfs_unbusy_pages(struct buf * bp)
 {
 	int i;
 
 	if (bp->b_flags & B_VMIO) {
 		struct vnode *vp = bp->b_vp;
 		vm_object_t obj = vp->v_object;
 		vm_offset_t foff;
 
 		foff = trunc_page(vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno);
 
 		for (i = 0; i < bp->b_npages; i++) {
 			vm_page_t m = bp->b_pages[i];
 
 			if (m == bogus_page) {
 				m = vm_page_lookup(obj, foff + i * PAGE_SIZE);
 				if (!m) {
 					panic("vfs_unbusy_pages: page missing\n");
 				}
 				bp->b_pages[i] = m;
 				pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
 			}
 			--obj->paging_in_progress;
 			--m->busy;
 			if ((m->busy == 0) && (m->flags & PG_WANTED)) {
 				m->flags &= ~PG_WANTED;
 				wakeup(m);
 			}
 		}
 		if (obj->paging_in_progress == 0 &&
 		    (obj->flags & OBJ_PIPWNT)) {
 			obj->flags &= ~OBJ_PIPWNT;
 			wakeup(obj);
 		}
 	}
 }
 
 /*
  * This routine is called before a device strategy routine.
  * It is used to tell the VM system that paging I/O is in
  * progress, and treat the pages associated with the buffer
  * almost as being PG_BUSY.  Also the object paging_in_progress
  * flag is handled to make sure that the object doesn't become
  * inconsistant.
  */
 void
 vfs_busy_pages(struct buf * bp, int clear_modify)
 {
 	int i;
 
 	if (bp->b_flags & B_VMIO) {
 		vm_object_t obj = bp->b_vp->v_object;
 		vm_offset_t foff = bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
 		int iocount = bp->b_bufsize;
 
 		vfs_setdirty(bp);
 		for (i = 0; i < bp->b_npages; i++) {
 			vm_page_t m = bp->b_pages[i];
 			int resid = (m->offset + PAGE_SIZE) - foff;
 
 			if (resid > iocount)
 				resid = iocount;
-			obj->paging_in_progress++;
-			m->busy++;
+			if ((bp->b_flags & B_CLUSTER) == 0) {
+				obj->paging_in_progress++;
+				m->busy++;
+			}
 			if (clear_modify) {
 				vm_page_protect(m, VM_PROT_READ);
-				vm_page_set_valid(m,
-					foff & (PAGE_SIZE-1), resid);
-				vm_page_set_clean(m,
+				vm_page_set_validclean(m,
 					foff & (PAGE_SIZE-1), resid);
 			} else if (bp->b_bcount >= PAGE_SIZE) {
 				if (m->valid && (bp->b_flags & B_CACHE) == 0) {
 					bp->b_pages[i] = bogus_page;
 					pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
 				}
 			}
 			foff += resid;
 			iocount -= resid;
 		}
 	}
 }
 
 /*
  * Tell the VM system that the pages associated with this buffer
  * are clean.  This is used for delayed writes where the data is
  * going to go to disk eventually without additional VM intevention.
  */
 void
 vfs_clean_pages(struct buf * bp)
 {
 	int i;
 
 	if (bp->b_flags & B_VMIO) {
 		vm_offset_t foff =
 			bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
 		int iocount = bp->b_bufsize;
 
 		for (i = 0; i < bp->b_npages; i++) {
 			vm_page_t m = bp->b_pages[i];
 			int resid = (m->offset + PAGE_SIZE) - foff;
 
 			if (resid > iocount)
 				resid = iocount;
 			if (resid > 0) {
-				vm_page_set_valid(m,
-					foff & (PAGE_SIZE-1), resid);
-				vm_page_set_clean(m,
+				vm_page_set_validclean(m,
 					foff & (PAGE_SIZE-1), resid);
 			}
 			foff += resid;
 			iocount -= resid;
 		}
 	}
 }
 
 void
 vfs_bio_clrbuf(struct buf *bp) {
 	int i;
 	if( bp->b_flags & B_VMIO) {
 		if( (bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE)) {
 			int j;
 			if( bp->b_pages[0]->valid != VM_PAGE_BITS_ALL) {
 				for(j=0; j < bp->b_bufsize / DEV_BSIZE;j++) {
 					bzero(bp->b_data + j * DEV_BSIZE, DEV_BSIZE);
 				}
 			}
 			bp->b_resid = 0;
 			return;
 		}
 		for(i=0;i<bp->b_npages;i++) {
 			if( bp->b_pages[i]->valid == VM_PAGE_BITS_ALL)
 				continue;
 			if( bp->b_pages[i]->valid == 0) {
 				bzero(bp->b_data + i * PAGE_SIZE, PAGE_SIZE);
 			} else {
 				int j;
 				for(j=0;j<PAGE_SIZE/DEV_BSIZE;j++) {
 					if( (bp->b_pages[i]->valid & (1<<j)) == 0)
 						bzero(bp->b_data + i * PAGE_SIZE + j * DEV_BSIZE, DEV_BSIZE);
 				}
 			}
 			bp->b_pages[i]->valid = VM_PAGE_BITS_ALL;
 		}
 		bp->b_resid = 0;
 	} else {
 		clrbuf(bp);
 	}
 }
 
 /*
  * vm_hold_load_pages and vm_hold_unload pages get pages into
  * a buffers address space.  The pages are anonymous and are
  * not associated with a file object.
  */
 void
 vm_hold_load_pages(struct buf * bp, vm_offset_t froma, vm_offset_t toa)
 {
 	vm_offset_t pg;
 	vm_page_t p;
 	vm_offset_t from = round_page(froma);
 	vm_offset_t to = round_page(toa);
 
 	for (pg = from; pg < to; pg += PAGE_SIZE) {
 
 tryagain:
 
 		p = vm_page_alloc(kernel_object, pg - VM_MIN_KERNEL_ADDRESS,
 		    VM_ALLOC_NORMAL);
 		if (!p) {
 			VM_WAIT;
 			goto tryagain;
 		}
 		vm_page_wire(p);
 		pmap_kenter(pg, VM_PAGE_TO_PHYS(p));
 		bp->b_pages[((caddr_t) pg - bp->b_data) / PAGE_SIZE] = p;
 		PAGE_WAKEUP(p);
 		bp->b_npages++;
 	}
 }
 
 void
 vm_hold_free_pages(struct buf * bp, vm_offset_t froma, vm_offset_t toa)
 {
 	vm_offset_t pg;
 	vm_page_t p;
 	vm_offset_t from = round_page(froma);
 	vm_offset_t to = round_page(toa);
 
 	for (pg = from; pg < to; pg += PAGE_SIZE) {
 		p = bp->b_pages[((caddr_t) pg - bp->b_data) / PAGE_SIZE];
 		bp->b_pages[((caddr_t) pg - bp->b_data) / PAGE_SIZE] = 0;
 		pmap_kremove(pg);
 		vm_page_free(p);
 		--bp->b_npages;
 	}
 }
diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c
index e6bde8c6e831..3f55ff99f626 100644
--- a/sys/kern/vfs_cluster.c
+++ b/sys/kern/vfs_cluster.c
@@ -1,661 +1,740 @@
 /*-
  * Copyright (c) 1993
  *	The Regents of the University of California.  All rights reserved.
  * Modifications/enhancements:
  * 	Copyright (c) 1995 John S. Dyson.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)vfs_cluster.c	8.7 (Berkeley) 2/13/94
- * $Id: vfs_cluster.c,v 1.16 1995/05/30 08:06:30 rgrimes Exp $
+ * $Id: vfs_cluster.c,v 1.17 1995/06/28 12:31:47 davidg Exp $
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/proc.h>
 #include <sys/buf.h>
 #include <sys/vnode.h>
 #include <sys/mount.h>
 #include <sys/malloc.h>
 #include <sys/resourcevar.h>
 #include <sys/vmmeter.h>
 #include <miscfs/specfs/specdev.h>
 #include <vm/vm.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
 
 #ifdef DEBUG
 #include <vm/vm.h>
 #include <sys/sysctl.h>
 int doreallocblks = 0;
 struct ctldebug debug13 = {"doreallocblks", &doreallocblks};
 
 #else
 /* XXX for cluster_write */
 #define doreallocblks 0
 #endif
 
 /*
  * Local declarations
  */
-struct buf *cluster_rbuild __P((struct vnode *, u_quad_t, struct buf *,
-    daddr_t, daddr_t, long, int, long));
+static struct buf *cluster_rbuild __P((struct vnode *, u_quad_t,
+    daddr_t, daddr_t, long, int));
 struct cluster_save *cluster_collectbufs __P((struct vnode *, struct buf *));
 
 int totreads;
 int totreadblocks;
+extern vm_page_t bogus_page;
 
 #ifdef DIAGNOSTIC
 /*
  * Set to 1 if reads of block zero should cause readahead to be done.
  * Set to 0 treats a read of block zero as a non-sequential read.
  *
  * Setting to one assumes that most reads of block zero of files are due to
  * sequential passes over the files (e.g. cat, sum) where additional blocks
  * will soon be needed.  Setting to zero assumes that the majority are
  * surgical strikes to get particular info (e.g. size, file) where readahead
  * blocks will not be used and, in fact, push out other potentially useful
  * blocks from the cache.  The former seems intuitive, but some quick tests
  * showed that the latter performed better from a system-wide point of view.
  */
 	int doclusterraz = 0;
 
 #define ISSEQREAD(vp, blk) \
 	(((blk) != 0 || doclusterraz) && \
 	 ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr))
 #else
 #define ISSEQREAD(vp, blk) \
 	(/* (blk) != 0 && */ ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr))
 #endif
 
+/*
+ * allow for three entire read-aheads...  The system will
+ * adjust downwards rapidly if needed...
+ */
+#define RA_MULTIPLE_FAST	2
+#define RA_MULTIPLE_SLOW	3
+#define RA_SHIFTDOWN	1	/* approx lg2(RA_MULTIPLE) */
 /*
  * This replaces bread.  If this is a bread at the beginning of a file and
  * lastr is 0, we assume this is the first read and we'll read up to two
  * blocks if they are sequential.  After that, we'll do regular read ahead
  * in clustered chunks.
  * 	bp is the block requested.
  *	rbp is the read-ahead block.
  *	If either is NULL, then you don't have to do the I/O.
  */
 int
 cluster_read(vp, filesize, lblkno, size, cred, bpp)
 	struct vnode *vp;
 	u_quad_t filesize;
 	daddr_t lblkno;
 	long size;
 	struct ucred *cred;
 	struct buf **bpp;
 {
 	struct buf *bp, *rbp;
 	daddr_t blkno, rablkno, origlblkno;
 	long flags;
 	int error, num_ra, alreadyincore;
+	int i;
+	int seq;
 
-	origlblkno = lblkno;
 	error = 0;
 	/*
 	 * get the requested block
 	 */
+	origlblkno = lblkno;
 	*bpp = bp = getblk(vp, lblkno, size, 0, 0);
+	seq = ISSEQREAD(vp, lblkno);
 	/*
 	 * if it is in the cache, then check to see if the reads have been
 	 * sequential.  If they have, then try some read-ahead, otherwise
 	 * back-off on prospective read-aheads.
 	 */
 	if (bp->b_flags & B_CACHE) {
-		int i;
-
-		if (!ISSEQREAD(vp, origlblkno)) {
+		if (!seq) {
 			vp->v_maxra = bp->b_lblkno + bp->b_bcount / size;
-			vp->v_ralen >>= 1;
+			vp->v_ralen >>= RA_SHIFTDOWN;
 			return 0;
-		} else if( vp->v_maxra >= origlblkno) {
-			if ((vp->v_ralen + 1) < (MAXPHYS / size))
-				vp->v_ralen++;
-			if ( vp->v_maxra >= (origlblkno + vp->v_ralen))
+		} else if( vp->v_maxra > lblkno) {
+			if ( (vp->v_maxra + (vp->v_ralen / RA_MULTIPLE_SLOW)) >= (lblkno + vp->v_ralen)) {
+				if ((vp->v_ralen + 1) < RA_MULTIPLE_FAST*(MAXPHYS / size))
+					++vp->v_ralen;
 				return 0;
+			}
 			lblkno = vp->v_maxra;
+		} else {
+			lblkno += 1;
 		}
 		bp = NULL;
 	} else {
 		/*
 		 * if it isn't in the cache, then get a chunk from disk if
 		 * sequential, otherwise just get the block.
 		 */
 		bp->b_flags |= B_READ;
 		lblkno += 1;
 		curproc->p_stats->p_ru.ru_inblock++;	/* XXX */
+		vp->v_ralen = 0;
 	}
-	/*
-	 * if ralen is "none", then try a little
-	 */
-	if (vp->v_ralen == 0)
-		vp->v_ralen = 1;
 	/*
 	 * assume no read-ahead
 	 */
 	alreadyincore = 1;
 	rablkno = lblkno;
 
 	/*
 	 * if we have been doing sequential I/O, then do some read-ahead
 	 */
-	if (ISSEQREAD(vp, origlblkno)) {
-		int i;
+	if (seq) {
 
+		/*
+		 * sequential read: grow ralen by one, staying below RA_MULTIPLE_SLOW * (MAXPHYS / size)
+		 */
+		if ((vp->v_ralen + 1) < RA_MULTIPLE_SLOW*(MAXPHYS / size))
+			++vp->v_ralen;
 		/*
 		 * this code makes sure that the stuff that we have read-ahead
 		 * is still in the cache.  If it isn't, we have been reading
 		 * ahead too much, and we need to back-off, otherwise we might
 		 * try to read more.
 		 */
 		for (i = 0; i < vp->v_ralen; i++) {
 			rablkno = lblkno + i;
 			alreadyincore = (int) incore(vp, rablkno);
 			if (!alreadyincore) {
+				if (inmem(vp, rablkno)) {
+					/* inmem() hit: skip this block, advancing v_maxra past it if needed */
+					if (vp->v_maxra < rablkno)
+						vp->v_maxra = rablkno + 1;
+					continue;
+				}
 				if (rablkno < vp->v_maxra) {
 					vp->v_maxra = rablkno;
-					vp->v_ralen >>= 1;
+					vp->v_ralen >>= RA_SHIFTDOWN;
 					alreadyincore = 1;
-				} else {
-					if (inmem(vp, rablkno)) {
-						if( vp->v_maxra < rablkno)
-							vp->v_maxra = rablkno + 1;
-						continue;
-					}
-					if ((vp->v_ralen + 1) < MAXPHYS / size)
-						vp->v_ralen++;
 				}
 				break;
-			} else if( vp->v_maxra < rablkno) {
+			} else if (vp->v_maxra < rablkno) {
 				vp->v_maxra = rablkno + 1;
 			}
 		}
 	}
 	/*
 	 * we now build the read-ahead buffer if it is desirable.
 	 */
 	rbp = NULL;
 	if (!alreadyincore &&
 	    (rablkno + 1) * size <= filesize &&
-	    !(error = VOP_BMAP(vp, rablkno, NULL, &blkno, &num_ra)) &&
+	    !(error = VOP_BMAP(vp, rablkno, NULL, &blkno, &num_ra, NULL)) &&
 	    blkno != -1) {
-		if ((vp->v_ralen + 1) < MAXPHYS / size)
-			vp->v_ralen++;
 		if (num_ra > vp->v_ralen)
 			num_ra = vp->v_ralen;
 
 		if (num_ra) {
-			rbp = cluster_rbuild(vp, filesize,
-			    NULL, rablkno, blkno, size, num_ra, B_READ | B_ASYNC);
+			rbp = cluster_rbuild(vp, filesize, rablkno, blkno, size,
+				num_ra + 1);
 		} else {
 			rbp = getblk(vp, rablkno, size, 0, 0);
 			rbp->b_flags |= B_READ | B_ASYNC;
 			rbp->b_blkno = blkno;
 		}
 	}
 
 	/*
-	 * if the synchronous read is a cluster, handle it, otherwise do a
-	 * simple, non-clustered read.
+	 * handle the synchronous read
 	 */
 	if (bp) {
 		if (bp->b_flags & (B_DONE | B_DELWRI))
 			panic("cluster_read: DONE bp");
 		else {
 			vfs_busy_pages(bp, 0);
 			error = VOP_STRATEGY(bp);
 			vp->v_maxra = bp->b_lblkno + bp->b_bcount / size;
 			totreads++;
 			totreadblocks += bp->b_bcount / size;
 			curproc->p_stats->p_ru.ru_inblock++;
 		}
 	}
 	/*
 	 * and if we have read-aheads, do them too
 	 */
 	if (rbp) {
 		vp->v_maxra = rbp->b_lblkno + rbp->b_bcount / size;
 		if (error || (rbp->b_flags & B_CACHE)) {
 			rbp->b_flags &= ~(B_ASYNC | B_READ);
 			brelse(rbp);
 		} else {
-			vfs_busy_pages(rbp, 0);
+			if ((rbp->b_flags & B_CLUSTER) == 0)
+				vfs_busy_pages(rbp, 0);
 			(void) VOP_STRATEGY(rbp);
 			totreads++;
 			totreadblocks += rbp->b_bcount / size;
 			curproc->p_stats->p_ru.ru_inblock++;
 		}
 	}
 	if (bp && ((bp->b_flags & B_ASYNC) == 0))
 		return (biowait(bp));
 	return (error);
 }
 
 /*
  * If blocks are contiguous on disk, use this to provide clustered
  * read ahead.  We will read as many blocks as possible sequentially
  * and then parcel them up into logical blocks in the buffer hash table.
  */
-struct buf *
-cluster_rbuild(vp, filesize, bp, lbn, blkno, size, run, flags)
+static struct buf *
+cluster_rbuild(vp, filesize, lbn, blkno, size, run)
 	struct vnode *vp;
 	u_quad_t filesize;
-	struct buf *bp;
 	daddr_t lbn;
 	daddr_t blkno;
 	long size;
 	int run;
-	long flags;
 {
 	struct cluster_save *b_save;
-	struct buf *tbp;
+	struct buf *bp, *tbp;
 	daddr_t bn;
 	int i, inc, j;
 
 #ifdef DIAGNOSTIC
 	if (size != vp->v_mount->mnt_stat.f_iosize)
 		panic("cluster_rbuild: size %d != filesize %d\n",
 		    size, vp->v_mount->mnt_stat.f_iosize);
 #endif
 	if (size * (lbn + run + 1) > filesize)
 		--run;
-	if (run == 0) {
-		if (!bp) {
-			bp = getblk(vp, lbn, size, 0, 0);
-			bp->b_blkno = blkno;
-			bp->b_flags |= flags;
-		}
-		return (bp);
-	}
-	tbp = bp;
-	if (!tbp) {
-		tbp = getblk(vp, lbn, size, 0, 0);
-	}
-	if (tbp->b_flags & B_CACHE) {
-		return (tbp);
-	} else if (bp == NULL) {
-		tbp->b_flags |= B_ASYNC;
-	}
-	bp = getpbuf();
-	bp->b_flags = flags | B_CALL | B_BUSY | B_CLUSTER;
+
+	tbp = getblk(vp, lbn, size, 0, 0);
+	if (tbp->b_flags & B_CACHE)
+		return tbp;
+	/* not cached: set the first block up as an async read at blkno */
+	tbp->b_blkno = blkno;
+	tbp->b_flags |= B_ASYNC | B_READ;
+	if( ((tbp->b_flags & B_VMIO) == 0) || (run <= 1) )
+		return tbp;
+	/* no pbuf to build a cluster with: just hand back the single block */
+	bp = trypbuf();
+	if (bp == NULL)
+		return tbp;
+	/* OR tbp's PAGE_MASK bits of b_data into the pbuf's b_data (plain assignment, no cast-as-lvalue) */
+	bp->b_data = (caddr_t) ((vm_offset_t) bp->b_data | (((vm_offset_t) tbp->b_data) & PAGE_MASK));
+	bp->b_flags = B_ASYNC | B_READ | B_CALL | B_BUSY | B_CLUSTER | B_VMIO;
 	bp->b_iodone = cluster_callback;
 	bp->b_blkno = blkno;
 	bp->b_lblkno = lbn;
 	pbgetvp(vp, bp);
 
-	b_save = malloc(sizeof(struct buf *) * (run + 1) + sizeof(struct cluster_save),
+	b_save = malloc(sizeof(struct buf *) * run + sizeof(struct cluster_save),
 	    M_SEGMENT, M_WAITOK);
 	b_save->bs_nchildren = 0;
 	b_save->bs_children = (struct buf **) (b_save + 1);
 	bp->b_saveaddr = b_save;
 
 	bp->b_bcount = 0;
 	bp->b_bufsize = 0;
 	bp->b_npages = 0;
 
-	if (tbp->b_flags & B_VMIO)
-		bp->b_flags |= B_VMIO;
-
 	inc = btodb(size);
-	for (bn = blkno, i = 0; i <= run; ++i, bn += inc) {
+	for (bn = blkno, i = 0; i < run; ++i, bn += inc) {
 		if (i != 0) {
+			if ((bp->b_npages * PAGE_SIZE) + size > MAXPHYS)
+				break;
+			if (incore(vp, lbn + i))
+				break;
 			tbp = getblk(vp, lbn + i, size, 0, 0);
+
 			if ((tbp->b_flags & B_CACHE) ||
-			    (tbp->b_flags & B_VMIO) != (bp->b_flags & B_VMIO)) {
+				(tbp->b_flags & B_VMIO) == 0) {
+				brelse(tbp);
+				break;
+			}
+
+			for (j=0;j<tbp->b_npages;j++) {
+				if (tbp->b_pages[j]->valid) {
+					break;
+				}
+			}
+
+			if (j != tbp->b_npages) {
+				brelse(tbp);
+				break;
+			}
+
+			tbp->b_flags |= B_READ | B_ASYNC;
+			if( tbp->b_blkno == tbp->b_lblkno) {
+				tbp->b_blkno = bn;
+			} else if (tbp->b_blkno != bn) {
 				brelse(tbp);
 				break;
 			}
-			tbp->b_blkno = bn;
-			tbp->b_flags |= flags | B_READ | B_ASYNC;
-		} else {
-			tbp->b_flags |= flags | B_READ;
 		}
 		++b_save->bs_nchildren;
 		b_save->bs_children[i] = tbp;
 		for (j = 0; j < tbp->b_npages; j += 1) {
-			bp->b_pages[j + bp->b_npages] = tbp->b_pages[j];
+			vm_page_t m;
+			m = tbp->b_pages[j];
+			++m->busy;
+			++m->object->paging_in_progress;
+			if (m->valid == VM_PAGE_BITS_ALL) {
+				m = bogus_page;
+			}
+			if ((bp->b_npages == 0) ||
+				(bp->b_pages[bp->b_npages - 1] != m)) {
+				bp->b_pages[bp->b_npages] = m;
+				bp->b_npages++;
+			}
 		}
-		bp->b_npages += tbp->b_npages;
-		bp->b_bcount += size;
-		bp->b_bufsize += size;
+		bp->b_bcount += tbp->b_bcount;
+		bp->b_bufsize += tbp->b_bufsize;
 	}
-	pmap_qenter((vm_offset_t) bp->b_data, (vm_page_t *)bp->b_pages, bp->b_npages);
+	pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
+		(vm_page_t *)bp->b_pages, bp->b_npages);
 	return (bp);
 }
 
 /*
  * Cleanup after a clustered read or write.
  * This is complicated by the fact that any of the buffers might have
  * extra memory (if there were no empty buffer headers at allocbuf time)
  * that we will need to shift around.
  */
 void
 cluster_callback(bp)
 	struct buf *bp;
 {
 	struct cluster_save *b_save;
 	struct buf **bpp, *tbp;
 	caddr_t cp;
 	int error = 0;
 
 	/*
 	 * Must propogate errors to all the components.
 	 */
 	if (bp->b_flags & B_ERROR)
 		error = bp->b_error;
 
 	b_save = (struct cluster_save *) (bp->b_saveaddr);
-	pmap_qremove((vm_offset_t) bp->b_data, bp->b_npages);
+	pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
 	/*
 	 * Move memory from the large cluster buffer into the component
 	 * buffers and mark IO as done on these.
 	 */
 	for (bpp = b_save->bs_children; b_save->bs_nchildren--; ++bpp) {
 		tbp = *bpp;
 		if (error) {
 			tbp->b_flags |= B_ERROR;
 			tbp->b_error = error;
 		}
 		biodone(tbp);
 	}
 	free(b_save, M_SEGMENT);
 	relpbuf(bp);
 }
 
 /*
  * Do clustered write for FFS.
  *
  * Three cases:
  *	1. Write is not sequential (write asynchronously)
  *	Write is sequential:
  *	2.	beginning of cluster - begin cluster
  *	3.	middle of a cluster - add to cluster
  *	4.	end of a cluster - asynchronously write cluster
  */
 void
 cluster_write(bp, filesize)
 	struct buf *bp;
 	u_quad_t filesize;
 {
 	struct vnode *vp;
 	daddr_t lbn;
 	int maxclen, cursize;
 	int lblocksize;
 
 	vp = bp->b_vp;
 	lblocksize = vp->v_mount->mnt_stat.f_iosize;
 	lbn = bp->b_lblkno;
 
 	/* Initialize vnode to beginning of file. */
 	if (lbn == 0)
 		vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;
 
 	if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 ||
 	    (bp->b_blkno != vp->v_lasta + btodb(lblocksize))) {
 		maxclen = MAXPHYS / lblocksize - 1;
 		if (vp->v_clen != 0) {
 			/*
 			 * Next block is not sequential.
 			 *
 			 * If we are not writing at end of file, the process
 			 * seeked to another point in the file since its last
 			 * write, or we have reached our maximum cluster size,
 			 * then push the previous cluster. Otherwise try
 			 * reallocating to make it sequential.
 			 */
 			cursize = vp->v_lastw - vp->v_cstart + 1;
-			cluster_wbuild(vp, NULL, lblocksize,
-			    vp->v_cstart, cursize, lbn);
+			if (!doreallocblks ||
+			    (lbn + 1) * lblocksize != filesize ||
+			    lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) {
+				cluster_wbuild(vp, NULL, lblocksize,
+				    vp->v_cstart, cursize, lbn);
+			} else {
+				struct buf **bpp, **endbp;
+				struct cluster_save *buflist;
+
+				buflist = cluster_collectbufs(vp, bp);
+				endbp = &buflist->bs_children
+				    [buflist->bs_nchildren - 1];
+				if (VOP_REALLOCBLKS(vp, buflist)) {
+					/*
+					 * Failed, push the previous cluster.
+					 */
+					for (bpp = buflist->bs_children;
+					     bpp < endbp; bpp++)
+						brelse(*bpp);
+					free(buflist, M_SEGMENT);
+					cluster_wbuild(vp, NULL, lblocksize,
+					    vp->v_cstart, cursize, lbn);
+				} else {
+					/*
+					 * Succeeded, keep building cluster.
+					 */
+					for (bpp = buflist->bs_children;
+					     bpp <= endbp; bpp++)
+						bdwrite(*bpp);
+					free(buflist, M_SEGMENT);
+					vp->v_lastw = lbn;
+					vp->v_lasta = bp->b_blkno;
+					return;
+				}
+			}
 		}
 		/*
 		 * Consider beginning a cluster. If at end of file, make
 		 * cluster as large as possible, otherwise find size of
 		 * existing cluster.
 		 */
 		if ((lbn + 1) * lblocksize != filesize &&
 		    (bp->b_blkno == bp->b_lblkno) &&
-		    (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen) ||
-			bp->b_blkno == -1)) {
+		    (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, NULL) ||
+		     bp->b_blkno == -1)) {
 			bawrite(bp);
 			vp->v_clen = 0;
 			vp->v_lasta = bp->b_blkno;
 			vp->v_cstart = lbn + 1;
 			vp->v_lastw = lbn;
 			return;
 		}
 		vp->v_clen = maxclen;
 		if (maxclen == 0) {	/* I/O not contiguous */
 			vp->v_cstart = lbn + 1;
 			bawrite(bp);
 		} else {	/* Wait for rest of cluster */
 			vp->v_cstart = lbn;
 			bdwrite(bp);
 		}
 	} else if (lbn == vp->v_cstart + vp->v_clen) {
 		/*
 		 * At end of cluster, write it out.
 		 */
 		cluster_wbuild(vp, bp, bp->b_bcount, vp->v_cstart,
 		    vp->v_clen + 1, lbn);
 		vp->v_clen = 0;
 		vp->v_cstart = lbn + 1;
 	} else
 		/*
 		 * In the middle of a cluster, so just delay the I/O for now.
 		 */
 		bdwrite(bp);
 	vp->v_lastw = lbn;
 	vp->v_lasta = bp->b_blkno;
 }
 
 
 /*
  * This is an awful lot like cluster_rbuild...wish they could be combined.
  * The last lbn argument is the current block on which I/O is being
  * performed.  Check to see that it doesn't fall in the middle of
  * the current block (if last_bp == NULL).
  */
 void
 cluster_wbuild(vp, last_bp, size, start_lbn, len, lbn)
 	struct vnode *vp;
 	struct buf *last_bp;
 	long size;
 	daddr_t start_lbn;
 	int len;
 	daddr_t lbn;
 {
 	struct cluster_save *b_save;
 	struct buf *bp, *tbp, *pb;
 	caddr_t cp;
 	int i, j, s;
 
 #ifdef DIAGNOSTIC
 	if (size != vp->v_mount->mnt_stat.f_iosize)
 		panic("cluster_wbuild: size %d != filesize %d\n",
 		    size, vp->v_mount->mnt_stat.f_iosize);
 #endif
 redo:
 	if( (lbn != -1) || (last_bp == 0)) {
 		while ((!(tbp = incore(vp, start_lbn)) || (tbp->b_flags & B_BUSY)
 			|| (start_lbn == lbn)) && len) {
 			++start_lbn;
 			--len;
 		}
 
 		pb = trypbuf();
 		/* Get more memory for current buffer */
 		if (len <= 1 || pb == NULL) {
 			if (pb != NULL)
 				relpbuf(pb);
 			if (last_bp) {
 				bawrite(last_bp);
 			} else if (len) {
 				bp = getblk(vp, start_lbn, size, 0, 0);
 				bawrite(bp);
 			}
 			return;
 		}
 		tbp = getblk(vp, start_lbn, size, 0, 0);
 	} else {
 		tbp = last_bp;
 		if( tbp->b_flags & B_BUSY) {
 			printf("vfs_cluster: warning: buffer already busy\n");
 		}
 		tbp->b_flags |= B_BUSY;
 		last_bp = 0;
 		pb = trypbuf();
 		if (pb == NULL) {
 			bawrite(tbp);
 			return;
 		}
 	}
 
 	if (!(tbp->b_flags & B_DELWRI)) {
 		relpbuf(pb);
 		++start_lbn;
 		--len;
 		brelse(tbp);
 		goto redo;
 	}
 	/*
 	 * Extra memory in the buffer, punt on this buffer. XXX we could
 	 * handle this in most cases, but we would have to push the extra
 	 * memory down to after our max possible cluster size and then
 	 * potentially pull it back up if the cluster was terminated
 	 * prematurely--too much hassle.
 	 */
 	if (tbp->b_bcount != tbp->b_bufsize) {
 		relpbuf(pb);
 		++start_lbn;
 		--len;
 		bawrite(tbp);
 		goto redo;
 	}
 	bp = pb;
 	b_save = malloc(sizeof(struct buf *) * (len + 1) + sizeof(struct cluster_save),
 	    M_SEGMENT, M_WAITOK);
 	b_save->bs_nchildren = 0;
 	b_save->bs_children = (struct buf **) (b_save + 1);
 	bp->b_saveaddr = b_save;
 	bp->b_bcount = 0;
 	bp->b_bufsize = 0;
 	bp->b_npages = 0;
 
 	if (tbp->b_flags & B_VMIO)
 		bp->b_flags |= B_VMIO;
 
 	bp->b_blkno = tbp->b_blkno;
 	bp->b_lblkno = tbp->b_lblkno;
+	bp->b_data = (caddr_t) ((vm_offset_t) bp->b_data | (((vm_offset_t) tbp->b_data) & PAGE_MASK));	/* copy tbp's PAGE_MASK bits */
 	bp->b_flags |= B_CALL | B_BUSY | B_CLUSTER;
 	bp->b_iodone = cluster_callback;
 	pbgetvp(vp, bp);
 
 	for (i = 0; i < len; ++i, ++start_lbn) {
 		if (i != 0) {
 			/*
 			 * Block is not in core or the non-sequential block
 			 * ending our cluster was part of the cluster (in
 			 * which case we don't want to write it twice).
 			 */
 			if (!(tbp = incore(vp, start_lbn)) ||
 			    (last_bp == NULL && start_lbn == lbn))
 				break;
 
 			if ((tbp->b_flags & (B_INVAL | B_CLUSTEROK)) != B_CLUSTEROK)
 				break;
 
 			if ((tbp->b_npages + bp->b_npages) > (MAXPHYS / PAGE_SIZE))
 				break;
 
+			if ( (tbp->b_blkno != tbp->b_lblkno) &&
+				((bp->b_blkno + btodb(size) * i) != tbp->b_blkno))
+				break;
+
 			/*
 			 * Get the desired block buffer (unless it is the
 			 * final sequential block whose buffer was passed in
 			 * explictly as last_bp).
 			 */
 			if (last_bp == NULL || start_lbn != lbn) {
 				if( tbp->b_flags & B_BUSY)
 					break;
 				tbp = getblk(vp, start_lbn, size, 0, 0);
 				if (!(tbp->b_flags & B_DELWRI) ||
 				    ((tbp->b_flags & B_VMIO) != (bp->b_flags & B_VMIO))) {
 					brelse(tbp);
 					break;
 				}
 			} else
 				tbp = last_bp;
 		}
 		for (j = 0; j < tbp->b_npages; j += 1) {
-			bp->b_pages[j + bp->b_npages] = tbp->b_pages[j];
+			vm_page_t m;
+			m = tbp->b_pages[j];
+			++m->busy;
+			++m->object->paging_in_progress;
+			if ((bp->b_npages == 0) ||
+				(bp->b_pages[bp->b_npages - 1] != m)) {
+				bp->b_pages[bp->b_npages] = m;
+				bp->b_npages++;
+			}
 		}
-		bp->b_npages += tbp->b_npages;
 		bp->b_bcount += size;
 		bp->b_bufsize += size;
 
 		tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
 		tbp->b_flags |= B_ASYNC;
 		s = splbio();
 		reassignbuf(tbp, tbp->b_vp);	/* put on clean list */
 		++tbp->b_vp->v_numoutput;
 		splx(s);
 		b_save->bs_children[i] = tbp;
 	}
 	b_save->bs_nchildren = i;
-	pmap_qenter((vm_offset_t) bp->b_data, (vm_page_t *) bp->b_pages, bp->b_npages);
+	pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
+		(vm_page_t *) bp->b_pages, bp->b_npages);
 	bawrite(bp);
 
 	if (i < len) {
 		len -= i;
 		goto redo;
 	}
 }
 
 /*
  * Collect together all the buffers in a cluster.
  * Plus add one additional buffer.
  */
 struct cluster_save *
 cluster_collectbufs(vp, last_bp)
 	struct vnode *vp;
 	struct buf *last_bp;
 {
 	struct cluster_save *buflist;
 	daddr_t lbn;
 	int i, len;
 
 	len = vp->v_lastw - vp->v_cstart + 1;
 	buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
 	    M_SEGMENT, M_WAITOK);
 	buflist->bs_nchildren = 0;
 	buflist->bs_children = (struct buf **) (buflist + 1);
 	for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++)
 		(void) bread(vp, lbn, last_bp->b_bcount, NOCRED,
 		    &buflist->bs_children[i]);
 	buflist->bs_children[i] = last_bp;
 	buflist->bs_nchildren = i + 1;
 	return (buflist);
 }