Index: stable/12/sys/sys/vnode.h
===================================================================
--- stable/12/sys/sys/vnode.h	(revision 354393)
+++ stable/12/sys/sys/vnode.h	(revision 354394)
@@ -1,924 +1,925 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)vnode.h	8.7 (Berkeley) 2/4/94
  * $FreeBSD$
  */
 
 #ifndef _SYS_VNODE_H_
 #define	_SYS_VNODE_H_
 
 #include <sys/bufobj.h>
 #include <sys/queue.h>
 #include <sys/lock.h>
 #include <sys/lockmgr.h>
 #include <sys/mutex.h>
 #include <sys/rangelock.h>
 #include <sys/selinfo.h>
 #include <sys/uio.h>
 #include <sys/acl.h>
 #include <sys/ktr.h>
 
 /*
  * The vnode is the focus of all file activity in UNIX.  There is a
  * unique vnode allocated for each active file, each current directory,
  * each mounted-on file, text file, and the root.
  */
 
 /*
  * Vnode types.  VNON means no type.
  */
 enum vtype	{ VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO, VBAD,
 		  VMARKER };
 
 /*
  * Each underlying filesystem allocates its own private area and hangs
  * it from v_data.  If non-null, this area is freed in getnewvnode().
  */
 
 struct namecache;
 
 struct vpollinfo {
 	struct	mtx vpi_lock;		/* lock to protect below */
 	struct	selinfo vpi_selinfo;	/* identity of poller(s) */
 	short	vpi_events;		/* what they are looking for */
 	short	vpi_revents;		/* what has happened */
 };
 
 /*
  * Reading or writing any of these items requires holding the appropriate lock.
  *
  * Lock reference:
  *	c - namecache mutex
  *	i - interlock
  *	l - mp mnt_listmtx or freelist mutex
  *	I - updated with atomics, 0->1 and 1->0 transitions with interlock held
  *	m - mount point interlock
  *	p - pollinfo lock
  *	u - Only a reference to the vnode is needed to read.
  *	v - vnode lock
  *
  * Vnodes may be found on many lists.  The general way to deal with operating
  * on a vnode that is on a list is:
  *	1) Lock the list and find the vnode.
  *	2) Lock interlock so that the vnode does not go away.
  *	3) Unlock the list to avoid lock order reversals.
  *	4) vget with LK_INTERLOCK and check for ENOENT, or
  *	5) Check for DOOMED if the vnode lock is not required.
  *	6) Perform your operation, then vput().
  */
 
 #if defined(_KERNEL) || defined(_KVM_VNODE)
 
 struct vnode {
 	/*
 	 * Fields which define the identity of the vnode.  These fields are
 	 * owned by the filesystem (XXX: and vgone() ?)
 	 */
 	const char *v_tag;			/* u type of underlying data */
 	struct	vop_vector *v_op;		/* u vnode operations vector */
 	void	*v_data;			/* u private data for fs */
 
 	/*
 	 * Filesystem instance stuff
 	 */
 	struct	mount *v_mount;			/* u ptr to vfs we are in */
 	TAILQ_ENTRY(vnode) v_nmntvnodes;	/* m vnodes for mount point */
 
 	/*
 	 * Type specific fields, only one applies to any given vnode.
 	 */
 	union {
 		struct mount	*v_mountedhere;	/* v ptr to mountpoint (VDIR) */
 		struct unpcb	*v_unpcb;	/* v unix domain net (VSOCK) */
 		struct cdev	*v_rdev; 	/* v device (VCHR, VBLK) */
 		struct fifoinfo	*v_fifoinfo;	/* v fifo (VFIFO) */
 	};
 
 	/*
 	 * vfs_hash: (mount + inode) -> vnode hash.  The hash value
 	 * itself is grouped with other int fields, to avoid padding.
 	 */
 	LIST_ENTRY(vnode)	v_hashlist;
 
 	/*
 	 * VFS_namecache stuff
 	 */
 	LIST_HEAD(, namecache) v_cache_src;	/* c Cache entries from us */
 	TAILQ_HEAD(, namecache) v_cache_dst;	/* c Cache entries to us */
 	struct namecache *v_cache_dd;		/* c Cache entry for .. vnode */
 
 	/*
 	 * Locking
 	 */
 	struct	lock v_lock;			/* u (if fs don't have one) */
 	struct	mtx v_interlock;		/* lock for "i" things */
 	struct	lock *v_vnlock;			/* u pointer to vnode lock */
 
 	/*
 	 * The machinery of being a vnode
 	 */
 	TAILQ_ENTRY(vnode) v_actfreelist;	/* l vnode active/free lists */
 	struct bufobj	v_bufobj;		/* * Buffer cache object */
 
 	/*
 	 * Hooks for various subsystems and features.
 	 */
 	struct vpollinfo *v_pollinfo;		/* i Poll events, p for *v_pi */
 	struct label *v_label;			/* MAC label for vnode */
 	struct lockf *v_lockf;		/* Byte-level advisory lock list */
 	struct rangelock v_rl;			/* Byte-range lock */
 
 	/*
 	 * clustering stuff
 	 */
 	daddr_t	v_cstart;			/* v start block of cluster */
 	daddr_t	v_lasta;			/* v last allocation  */
 	daddr_t	v_lastw;			/* v last write  */
 	int	v_clen;				/* v length of cur. cluster */
 
 	u_int	v_holdcnt;			/* I prevents recycling. */
 	u_int	v_usecount;			/* I ref count of users */
 	u_int	v_iflag;			/* i vnode flags (see below) */
 	u_int	v_vflag;			/* v vnode flags */
 	u_int	v_mflag;			/* l mnt-specific vnode flags */
 	int	v_writecount;			/* I ref count of writers or
 						   (negative) text users */
 	u_int	v_hash;
 	enum	vtype v_type;			/* u vnode type */
 };
 
 #endif /* defined(_KERNEL) || defined(_KVM_VNODE) */
 
 #define	bo2vnode(bo)	__containerof((bo), struct vnode, v_bufobj)
 
 /* XXX: These are temporary to avoid a source sweep at this time */
 #define v_object	v_bufobj.bo_object
 
 /*
  * Userland version of struct vnode, for sysctl.
  */
 struct xvnode {
 	size_t	xv_size;			/* sizeof(struct xvnode) */
 	void	*xv_vnode;			/* address of real vnode */
 	u_long	xv_flag;			/* vnode vflags */
 	int	xv_usecount;			/* reference count of users */
 	int	xv_writecount;			/* reference count of writers */
 	int	xv_holdcnt;			/* page & buffer references */
 	u_long	xv_id;				/* capability identifier */
 	void	*xv_mount;			/* address of parent mount */
 	long	xv_numoutput;			/* num of writes in progress */
 	enum	vtype xv_type;			/* vnode type */
 	union {
 		void	*xvu_socket;		/* unpcb, if VSOCK */
 		void	*xvu_fifo;		/* fifo, if VFIFO */
 		dev_t	xvu_rdev;		/* maj/min, if VBLK/VCHR */
 		struct {
 			dev_t	xvu_dev;	/* device, if VDIR/VREG/VLNK */
 			ino_t	xvu_ino;	/* id, if VDIR/VREG/VLNK */
 		} xv_uns;
 	} xv_un;
 };
 #define xv_socket	xv_un.xvu_socket
 #define xv_fifo		xv_un.xvu_fifo
 #define xv_rdev		xv_un.xvu_rdev
 #define xv_dev		xv_un.xv_uns.xvu_dev
 #define xv_ino		xv_un.xv_uns.xvu_ino
 
 /* We don't need to lock the knlist */
 #define	VN_KNLIST_EMPTY(vp) ((vp)->v_pollinfo == NULL ||	\
 	    KNLIST_EMPTY(&(vp)->v_pollinfo->vpi_selinfo.si_note))
 
 #define VN_KNOTE(vp, b, a)					\
 	do {							\
 		if (!VN_KNLIST_EMPTY(vp))			\
 			KNOTE(&vp->v_pollinfo->vpi_selinfo.si_note, (b), \
 			    (a) | KNF_NOKQLOCK);		\
 	} while (0)
 #define	VN_KNOTE_LOCKED(vp, b)		VN_KNOTE(vp, b, KNF_LISTLOCKED)
 #define	VN_KNOTE_UNLOCKED(vp, b)	VN_KNOTE(vp, b, 0)
 
 /*
  * Vnode flags.
  *	VI flags are protected by interlock and live in v_iflag
  *	VV flags are protected by the vnode lock and live in v_vflag
  *
  *	VI_DOOMED is doubly protected by the interlock and vnode lock.  Both
  *	are required for writing but the status may be checked with either.
  */
 #define	VI_TEXT_REF	0x0001	/* Text ref grabbed use ref */
 #define	VI_MOUNT	0x0020	/* Mount in progress */
 #define	VI_DOOMED	0x0080	/* This vnode is being recycled */
 #define	VI_FREE		0x0100	/* This vnode is on the freelist */
 #define	VI_ACTIVE	0x0200	/* This vnode is on the active list */
 #define	VI_DOINGINACT	0x0800	/* VOP_INACTIVE is in progress */
 #define	VI_OWEINACT	0x1000	/* Need to call inactive */
 
 #define	VV_ROOT		0x0001	/* root of its filesystem */
 #define	VV_ISTTY	0x0002	/* vnode represents a tty */
 #define	VV_NOSYNC	0x0004	/* unlinked, stop syncing */
 #define	VV_ETERNALDEV	0x0008	/* device that is never destroyed */
 #define	VV_CACHEDLABEL	0x0010	/* Vnode has valid cached MAC label */
+#define	VV_VMSIZEVNLOCK	0x0020	/* object size check requires vnode lock */
 #define	VV_COPYONWRITE	0x0040	/* vnode is doing copy-on-write */
 #define	VV_SYSTEM	0x0080	/* vnode being used by kernel */
 #define	VV_PROCDEP	0x0100	/* vnode is process dependent */
 #define	VV_NOKNOTE	0x0200	/* don't activate knotes on this vnode */
 #define	VV_DELETED	0x0400	/* should be removed */
 #define	VV_MD		0x0800	/* vnode backs the md device */
 #define	VV_FORCEINSMQ	0x1000	/* force the insmntque to succeed */
 #define	VV_READLINK	0x2000	/* fdescfs linux vnode */
 
 #define	VMP_TMPMNTFREELIST	0x0001	/* Vnode is on mnt's tmp free list */
 
 /*
  * Vnode attributes.  A field value of VNOVAL represents a field whose value
  * is unavailable (getattr) or which is not to be changed (setattr).
  */
 struct vattr {
 	enum vtype	va_type;	/* vnode type (for create) */
 	u_short		va_mode;	/* files access mode and type */
 	u_short		va_padding0;
 	uid_t		va_uid;		/* owner user id */
 	gid_t		va_gid;		/* owner group id */
 	nlink_t		va_nlink;	/* number of references to file */
 	dev_t		va_fsid;	/* filesystem id */
 	ino_t		va_fileid;	/* file id */
 	u_quad_t	va_size;	/* file size in bytes */
 	long		va_blocksize;	/* blocksize preferred for i/o */
 	struct timespec	va_atime;	/* time of last access */
 	struct timespec	va_mtime;	/* time of last modification */
 	struct timespec	va_ctime;	/* time file changed */
 	struct timespec	va_birthtime;	/* time file created */
 	u_long		va_gen;		/* generation number of file */
 	u_long		va_flags;	/* flags defined for file */
 	dev_t		va_rdev;	/* device the special file represents */
 	u_quad_t	va_bytes;	/* bytes of disk space held by file */
 	u_quad_t	va_filerev;	/* file modification number */
 	u_int		va_vaflags;	/* operations flags, see below */
 	long		va_spare;	/* remain quad aligned */
 };
 
 /*
  * Flags for va_vaflags.
  */
 #define	VA_UTIMES_NULL	0x01		/* utimes argument was NULL */
 #define	VA_EXCLUSIVE	0x02		/* exclusive create request */
 #define	VA_SYNC		0x04		/* O_SYNC truncation */
 
 /*
  * Flags for ioflag. (high 16 bits used to ask for read-ahead and
  * help with write clustering)
  * NB: IO_NDELAY and IO_DIRECT are linked to fcntl.h
  */
 #define	IO_UNIT		0x0001		/* do I/O as atomic unit */
 #define	IO_APPEND	0x0002		/* append write to end */
 #define	IO_NDELAY	0x0004		/* FNDELAY flag set in file table */
 #define	IO_NODELOCKED	0x0008		/* underlying node already locked */
 #define	IO_ASYNC	0x0010		/* bawrite rather then bdwrite */
 #define	IO_VMIO		0x0020		/* data already in VMIO space */
 #define	IO_INVAL	0x0040		/* invalidate after I/O */
 #define	IO_SYNC		0x0080		/* do I/O synchronously */
 #define	IO_DIRECT	0x0100		/* attempt to bypass buffer cache */
 #define	IO_NOREUSE	0x0200		/* VMIO data won't be reused */
 #define	IO_EXT		0x0400		/* operate on external attributes */
 #define	IO_NORMAL	0x0800		/* operate on regular data */
 #define	IO_NOMACCHECK	0x1000		/* MAC checks unnecessary */
 #define	IO_BUFLOCKED	0x2000		/* ffs flag; indir buf is locked */
 #define	IO_RANGELOCKED	0x4000		/* range locked */
 
 #define IO_SEQMAX	0x7F		/* seq heuristic max value */
 #define IO_SEQSHIFT	16		/* seq heuristic in upper 16 bits */
 
 /*
  * Flags for accmode_t.
  */
 #define	VEXEC			000000000100 /* execute/search permission */
 #define	VWRITE			000000000200 /* write permission */
 #define	VREAD			000000000400 /* read permission */
 #define	VADMIN			000000010000 /* being the file owner */
 #define	VAPPEND			000000040000 /* permission to write/append */
 /*
  * VEXPLICIT_DENY makes VOP_ACCESSX(9) return EPERM or EACCES only
  * if permission was denied explicitly, by a "deny" rule in NFSv4 ACL,
  * and 0 otherwise.  This never happens with ordinary unix access rights
  * or POSIX.1e ACLs.  Obviously, VEXPLICIT_DENY must be OR-ed with
  * some other V* constant.
  */
 #define	VEXPLICIT_DENY		000000100000
 #define	VREAD_NAMED_ATTRS 	000000200000 /* not used */
 #define	VWRITE_NAMED_ATTRS 	000000400000 /* not used */
 #define	VDELETE_CHILD	 	000001000000
 #define	VREAD_ATTRIBUTES 	000002000000 /* permission to stat(2) */
 #define	VWRITE_ATTRIBUTES 	000004000000 /* change {m,c,a}time */
 #define	VDELETE		 	000010000000
 #define	VREAD_ACL	 	000020000000 /* read ACL and file mode */
 #define	VWRITE_ACL	 	000040000000 /* change ACL and/or file mode */
 #define	VWRITE_OWNER	 	000100000000 /* change file owner */
 #define	VSYNCHRONIZE	 	000200000000 /* not used */
 #define	VCREAT			000400000000 /* creating new file */
 #define	VVERIFY			001000000000 /* verification required */
 
 /*
  * Permissions that were traditionally granted only to the file owner.
  */
 #define VADMIN_PERMS	(VADMIN | VWRITE_ATTRIBUTES | VWRITE_ACL | \
     VWRITE_OWNER)
 
 /*
  * Permissions that were traditionally granted to everyone.
  */
 #define VSTAT_PERMS	(VREAD_ATTRIBUTES | VREAD_ACL)
 
 /*
  * Permissions that allow to change the state of the file in any way.
  */
 #define VMODIFY_PERMS	(VWRITE | VAPPEND | VADMIN_PERMS | VDELETE_CHILD | \
     VDELETE)
 
 /*
  * Token indicating no attribute value yet assigned.
  */
 #define	VNOVAL	(-1)
 
 /*
  * LK_TIMELOCK timeout for vnode locks (used mainly by the pageout daemon)
  */
 #define VLKTIMEOUT	(hz / 20 + 1)
 
 #ifdef _KERNEL
 
 #ifdef MALLOC_DECLARE
 MALLOC_DECLARE(M_VNODE);
 #endif
 
 extern u_int ncsizefactor;
 
 /*
  * Convert between vnode types and inode formats (since POSIX.1
  * defines mode word of stat structure in terms of inode formats).
  */
 extern enum vtype	iftovt_tab[];
 extern int		vttoif_tab[];
 #define	IFTOVT(mode)	(iftovt_tab[((mode) & S_IFMT) >> 12])
 #define	VTTOIF(indx)	(vttoif_tab[(int)(indx)])
 #define	MAKEIMODE(indx, mode)	(int)(VTTOIF(indx) | (mode))
 
 /*
  * Flags to various vnode functions.
  */
 #define	SKIPSYSTEM	0x0001	/* vflush: skip vnodes marked VSYSTEM */
 #define	FORCECLOSE	0x0002	/* vflush: force file closure */
 #define	WRITECLOSE	0x0004	/* vflush: only close writable files */
 #define	EARLYFLUSH	0x0008	/* vflush: early call for ffs_flushfiles */
 #define	V_SAVE		0x0001	/* vinvalbuf: sync file first */
 #define	V_ALT		0x0002	/* vinvalbuf: invalidate only alternate bufs */
 #define	V_NORMAL	0x0004	/* vinvalbuf: invalidate only regular bufs */
 #define	V_CLEANONLY	0x0008	/* vinvalbuf: invalidate only clean bufs */
 #define	V_VMIO		0x0010	/* vinvalbuf: called during pageout */
 #define	V_ALLOWCLEAN	0x0020	/* vinvalbuf: allow clean buffers after flush */
 #define	REVOKEALL	0x0001	/* vop_revoke: revoke all aliases */
 #define	V_WAIT		0x0001	/* vn_start_write: sleep for suspend */
 #define	V_NOWAIT	0x0002	/* vn_start_write: don't sleep for suspend */
 #define	V_XSLEEP	0x0004	/* vn_start_write: just return after sleep */
 #define	V_MNTREF	0x0010	/* vn_start_write: mp is already ref-ed */
 
 #define	VR_START_WRITE	0x0001	/* vfs_write_resume: start write atomically */
 #define	VR_NO_SUSPCLR	0x0002	/* vfs_write_resume: do not clear suspension */
 
 #define	VS_SKIP_UNMOUNT	0x0001	/* vfs_write_suspend: fail if the
 				   filesystem is being unmounted */
 
 #define	VREF(vp)	vref(vp)
 
 #ifdef DIAGNOSTIC
 #define	VATTR_NULL(vap)	vattr_null(vap)
 #else
 #define	VATTR_NULL(vap)	(*(vap) = va_null)	/* initialize a vattr */
 #endif /* DIAGNOSTIC */
 
 #define	NULLVP	((struct vnode *)NULL)
 
 /*
  * Global vnode data.
  */
 extern	struct vnode *rootvnode;	/* root (i.e. "/") vnode */
 extern	struct mount *rootdevmp;	/* "/dev" mount */
 extern	int desiredvnodes;		/* number of vnodes desired */
 extern	struct uma_zone *namei_zone;
 extern	struct vattr va_null;		/* predefined null vattr structure */
 
 #define	VI_LOCK(vp)	mtx_lock(&(vp)->v_interlock)
 #define	VI_LOCK_FLAGS(vp, flags) mtx_lock_flags(&(vp)->v_interlock, (flags))
 #define	VI_TRYLOCK(vp)	mtx_trylock(&(vp)->v_interlock)
 #define	VI_UNLOCK(vp)	mtx_unlock(&(vp)->v_interlock)
 #define	VI_MTX(vp)	(&(vp)->v_interlock)
 
 #define	VN_LOCK_AREC(vp)	lockallowrecurse((vp)->v_vnlock)
 #define	VN_LOCK_ASHARE(vp)	lockallowshare((vp)->v_vnlock)
 #define	VN_LOCK_DSHARE(vp)	lockdisableshare((vp)->v_vnlock)
 
 #endif /* _KERNEL */
 
 /*
  * Mods for extensibility.
  */
 
 /*
  * Flags for vdesc_flags:
  */
 #define	VDESC_MAX_VPS		16
 /* Low order 16 flag bits are reserved for willrele flags for vp arguments. */
 #define	VDESC_VP0_WILLRELE	0x0001
 #define	VDESC_VP1_WILLRELE	0x0002
 #define	VDESC_VP2_WILLRELE	0x0004
 #define	VDESC_VP3_WILLRELE	0x0008
 #define	VDESC_NOMAP_VPP		0x0100
 #define	VDESC_VPP_WILLRELE	0x0200
 
 /*
  * A generic structure.
  * This can be used by bypass routines to identify generic arguments.
  */
 struct vop_generic_args {
 	struct vnodeop_desc *a_desc;
 	/* other random data follows, presumably */
 };
 
 typedef int vop_bypass_t(struct vop_generic_args *);
 
 /*
  * VDESC_NO_OFFSET is used to identify the end of the offset list
  * and in places where no such field exists.
  */
 #define VDESC_NO_OFFSET -1
 
 /*
  * This structure describes the vnode operation taking place.
  */
 struct vnodeop_desc {
 	char	*vdesc_name;		/* a readable name for debugging */
 	int	 vdesc_flags;		/* VDESC_* flags */
 	vop_bypass_t	*vdesc_call;	/* Function to call */
 
 	/*
 	 * These ops are used by bypass routines to map and locate arguments.
 	 * Creds and procs are not needed in bypass routines, but sometimes
 	 * they are useful to (for example) transport layers.
 	 * Nameidata is useful because it has a cred in it.
 	 */
 	int	*vdesc_vp_offsets;	/* list ended by VDESC_NO_OFFSET */
 	int	vdesc_vpp_offset;	/* return vpp location */
 	int	vdesc_cred_offset;	/* cred location, if any */
 	int	vdesc_thread_offset;	/* thread location, if any */
 	int	vdesc_componentname_offset; /* if any */
 };
 
 #ifdef _KERNEL
 /*
  * A list of all the operation descs.
  */
 extern struct vnodeop_desc *vnodeop_descs[];
 
 #define	VOPARG_OFFSETOF(s_type, field)	__offsetof(s_type, field)
 #define	VOPARG_OFFSETTO(s_type, s_offset, struct_p) \
     ((s_type)(((char*)(struct_p)) + (s_offset)))
 
 
 #ifdef DEBUG_VFS_LOCKS
 /*
  * Support code to aid in debugging VFS locking problems.  Not totally
  * reliable since if the thread sleeps between changing the lock
  * state and checking it with the assert, some other thread could
  * change the state.  They are good enough for debugging a single
  * filesystem using a single-threaded test.  Note that the unreliability is
  * limited to false negatives; efforts were made to ensure that false
  * positives cannot occur.
  */
 void	assert_vi_locked(struct vnode *vp, const char *str);
 void	assert_vi_unlocked(struct vnode *vp, const char *str);
 void	assert_vop_elocked(struct vnode *vp, const char *str);
 void	assert_vop_locked(struct vnode *vp, const char *str);
 void	assert_vop_unlocked(struct vnode *vp, const char *str);
 
 #define	ASSERT_VI_LOCKED(vp, str)	assert_vi_locked((vp), (str))
 #define	ASSERT_VI_UNLOCKED(vp, str)	assert_vi_unlocked((vp), (str))
 #define	ASSERT_VOP_ELOCKED(vp, str)	assert_vop_elocked((vp), (str))
 #define	ASSERT_VOP_LOCKED(vp, str)	assert_vop_locked((vp), (str))
 #define	ASSERT_VOP_UNLOCKED(vp, str)	assert_vop_unlocked((vp), (str))
 
 #else /* !DEBUG_VFS_LOCKS */
 
 #define	ASSERT_VI_LOCKED(vp, str)	((void)0)
 #define	ASSERT_VI_UNLOCKED(vp, str)	((void)0)
 #define	ASSERT_VOP_ELOCKED(vp, str)	((void)0)
 #define	ASSERT_VOP_LOCKED(vp, str)	((void)0)
 #define	ASSERT_VOP_UNLOCKED(vp, str)	((void)0)
 #endif /* DEBUG_VFS_LOCKS */
 
 
 /*
  * This call works for vnodes in the kernel.
  */
 #define VCALL(c) ((c)->a_desc->vdesc_call(c))
 
 #define DOINGASYNC(vp)	   					\
 	(((vp)->v_mount->mnt_kern_flag & MNTK_ASYNC) != 0 &&	\
 	 ((curthread->td_pflags & TDP_SYNCIO) == 0))
 
 /*
  * VMIO support inline
  */
 
 extern int vmiodirenable;
 
 static __inline int
 vn_canvmio(struct vnode *vp)
 {
       if (vp && (vp->v_type == VREG || (vmiodirenable && vp->v_type == VDIR)))
 		return(TRUE);
 	return(FALSE);
 }
 
 /*
  * Finally, include the default set of vnode operations.
  */
 typedef void vop_getpages_iodone_t(void *, vm_page_t *, int, int);
 #include "vnode_if.h"
 
 /* vn_open_flags */
 #define	VN_OPEN_NOAUDIT		0x00000001
 #define	VN_OPEN_NOCAPCHECK	0x00000002
 #define	VN_OPEN_NAMECACHE	0x00000004
 
 /*
  * Public vnode manipulation functions.
  */
 struct componentname;
 struct file;
 struct mount;
 struct nameidata;
 struct ostat;
 struct freebsd11_stat;
 struct thread;
 struct proc;
 struct stat;
 struct nstat;
 struct ucred;
 struct uio;
 struct vattr;
 struct vfsops;
 struct vnode;
 
 typedef int (*vn_get_ino_t)(struct mount *, void *, int, struct vnode **);
 
 int	bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn,
 	    daddr_t endn);
 /* cache_* may belong in namei.h. */
 void	cache_changesize(int newhashsize);
 #define	cache_enter(dvp, vp, cnp)					\
 	cache_enter_time(dvp, vp, cnp, NULL, NULL)
 void	cache_enter_time(struct vnode *dvp, struct vnode *vp,
 	    struct componentname *cnp, struct timespec *tsp,
 	    struct timespec *dtsp);
 int	cache_lookup(struct vnode *dvp, struct vnode **vpp,
 	    struct componentname *cnp, struct timespec *tsp, int *ticksp);
 void	cache_purge(struct vnode *vp);
 void	cache_purge_negative(struct vnode *vp);
 void	cache_purgevfs(struct mount *mp, bool force);
 int	change_dir(struct vnode *vp, struct thread *td);
 void	cvtstat(struct stat *st, struct ostat *ost);
 void	freebsd11_cvtnstat(struct stat *sb, struct nstat *nsb);
 int	freebsd11_cvtstat(struct stat *st, struct freebsd11_stat *ost);
 int	getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
 	    struct vnode **vpp);
 void	getnewvnode_reserve(u_int count);
 void	getnewvnode_drop_reserve(void);
 int	insmntque1(struct vnode *vp, struct mount *mp,
 	    void (*dtr)(struct vnode *, void *), void *dtr_arg);
 int	insmntque(struct vnode *vp, struct mount *mp);
 u_quad_t init_va_filerev(void);
 int	speedup_syncer(void);
 int	vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf,
 	    u_int *buflen);
 int	vn_fullpath(struct thread *td, struct vnode *vn,
 	    char **retbuf, char **freebuf);
 int	vn_fullpath_global(struct thread *td, struct vnode *vn,
 	    char **retbuf, char **freebuf);
 struct vnode *
 	vn_dir_dd_ino(struct vnode *vp);
 int	vn_commname(struct vnode *vn, char *buf, u_int buflen);
 int	vn_path_to_global_path(struct thread *td, struct vnode *vp,
 	    char *path, u_int pathlen);
 int	vaccess(enum vtype type, mode_t file_mode, uid_t file_uid,
 	    gid_t file_gid, accmode_t accmode, struct ucred *cred,
 	    int *privused);
 int	vaccess_acl_nfs4(enum vtype type, uid_t file_uid, gid_t file_gid,
 	    struct acl *aclp, accmode_t accmode, struct ucred *cred,
 	    int *privused);
 int	vaccess_acl_posix1e(enum vtype type, uid_t file_uid,
 	    gid_t file_gid, struct acl *acl, accmode_t accmode,
 	    struct ucred *cred, int *privused);
 void	vattr_null(struct vattr *vap);
 int	vcount(struct vnode *vp);
 #define	vdrop(vp)	_vdrop((vp), 0)
 #define	vdropl(vp)	_vdrop((vp), 1)
 void	_vdrop(struct vnode *, bool);
 int	vflush(struct mount *mp, int rootrefs, int flags, struct thread *td);
 int	vget(struct vnode *vp, int lockflag, struct thread *td);
 void	vgone(struct vnode *vp);
 #define	vhold(vp)	_vhold((vp), 0)
 #define	vholdl(vp)	_vhold((vp), 1)
 void	_vhold(struct vnode *, bool);
 void	vinactive(struct vnode *, struct thread *);
 int	vinvalbuf(struct vnode *vp, int save, int slpflag, int slptimeo);
 int	vtruncbuf(struct vnode *vp, off_t length, int blksize);
 void	v_inval_buf_range(struct vnode *vp, daddr_t startlbn, daddr_t endlbn,
 	    int blksize);
 void	vunref(struct vnode *);
 void	vn_printf(struct vnode *vp, const char *fmt, ...) __printflike(2,3);
 int	vrecycle(struct vnode *vp);
 int	vrecyclel(struct vnode *vp);
 int	vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off,
 	    struct ucred *cred);
 int	vn_close(struct vnode *vp,
 	    int flags, struct ucred *file_cred, struct thread *td);
 void	vn_finished_write(struct mount *mp);
 void	vn_finished_secondary_write(struct mount *mp);
 int	vn_fsync_buf(struct vnode *vp, int waitfor);
 int	vn_isdisk(struct vnode *vp, int *errp);
 int	_vn_lock(struct vnode *vp, int flags, char *file, int line);
 #define vn_lock(vp, flags) _vn_lock(vp, flags, __FILE__, __LINE__)
 int	vn_open(struct nameidata *ndp, int *flagp, int cmode, struct file *fp);
 int	vn_open_cred(struct nameidata *ndp, int *flagp, int cmode,
 	    u_int vn_open_flags, struct ucred *cred, struct file *fp);
 int	vn_open_vnode(struct vnode *vp, int fmode, struct ucred *cred,
 	    struct thread *td, struct file *fp);
 void	vn_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end);
 int	vn_pollrecord(struct vnode *vp, struct thread *p, int events);
 int	vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base,
 	    int len, off_t offset, enum uio_seg segflg, int ioflg,
 	    struct ucred *active_cred, struct ucred *file_cred, ssize_t *aresid,
 	    struct thread *td);
 int	vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, void *base,
 	    size_t len, off_t offset, enum uio_seg segflg, int ioflg,
 	    struct ucred *active_cred, struct ucred *file_cred, size_t *aresid,
 	    struct thread *td);
 int	vn_rlimit_fsize(const struct vnode *vn, const struct uio *uio,
 	    struct thread *td);
 int	vn_stat(struct vnode *vp, struct stat *sb, struct ucred *active_cred,
 	    struct ucred *file_cred, struct thread *td);
 int	vn_start_write(struct vnode *vp, struct mount **mpp, int flags);
 int	vn_start_secondary_write(struct vnode *vp, struct mount **mpp,
 	    int flags);
 int	vn_writechk(struct vnode *vp);
 int	vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
 	    const char *attrname, int *buflen, char *buf, struct thread *td);
 int	vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
 	    const char *attrname, int buflen, char *buf, struct thread *td);
 int	vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
 	    const char *attrname, struct thread *td);
 int	vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags,
 	    struct vnode **rvp);
 int	vn_vget_ino_gen(struct vnode *vp, vn_get_ino_t alloc,
 	    void *alloc_arg, int lkflags, struct vnode **rvp);
 int	vn_utimes_perm(struct vnode *vp, struct vattr *vap,
 	    struct ucred *cred, struct thread *td);
 
 int	vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio);
 int	vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize,
 	    struct uio *uio);
 
 #define	vn_rangelock_unlock(vp, cookie)					\
 	rangelock_unlock(&(vp)->v_rl, (cookie), VI_MTX(vp))
 #define	vn_rangelock_unlock_range(vp, cookie, start, end)		\
 	rangelock_unlock_range(&(vp)->v_rl, (cookie), (start), (end), 	\
 	    VI_MTX(vp))
 #define	vn_rangelock_rlock(vp, start, end)				\
 	rangelock_rlock(&(vp)->v_rl, (start), (end), VI_MTX(vp))
 #define	vn_rangelock_wlock(vp, start, end)				\
 	rangelock_wlock(&(vp)->v_rl, (start), (end), VI_MTX(vp))
 
 int	vfs_cache_lookup(struct vop_lookup_args *ap);
 void	vfs_timestamp(struct timespec *);
 void	vfs_write_resume(struct mount *mp, int flags);
 int	vfs_write_suspend(struct mount *mp, int flags);
 int	vfs_write_suspend_umnt(struct mount *mp);
 void	vnlru_free(int, struct vfsops *);
 int	vop_stdbmap(struct vop_bmap_args *);
 int	vop_stdfdatasync_buf(struct vop_fdatasync_args *);
 int	vop_stdfsync(struct vop_fsync_args *);
 int	vop_stdgetwritemount(struct vop_getwritemount_args *);
 int	vop_stdgetpages(struct vop_getpages_args *);
 int	vop_stdinactive(struct vop_inactive_args *);
 int	vop_stdislocked(struct vop_islocked_args *);
 int	vop_stdkqfilter(struct vop_kqfilter_args *);
 int	vop_stdlock(struct vop_lock1_args *);
 int	vop_stdputpages(struct vop_putpages_args *);
 int	vop_stdunlock(struct vop_unlock_args *);
 int	vop_nopoll(struct vop_poll_args *);
 int	vop_stdaccess(struct vop_access_args *ap);
 int	vop_stdaccessx(struct vop_accessx_args *ap);
 int	vop_stdadvise(struct vop_advise_args *ap);
 int	vop_stdadvlock(struct vop_advlock_args *ap);
 int	vop_stdadvlockasync(struct vop_advlockasync_args *ap);
 int	vop_stdadvlockpurge(struct vop_advlockpurge_args *ap);
 int	vop_stdallocate(struct vop_allocate_args *ap);
 int	vop_stdset_text(struct vop_set_text_args *ap);
 int	vop_stdpathconf(struct vop_pathconf_args *);
 int	vop_stdpoll(struct vop_poll_args *);
 int	vop_stdvptocnp(struct vop_vptocnp_args *ap);
 int	vop_stdvptofh(struct vop_vptofh_args *ap);
 int	vop_stdunp_bind(struct vop_unp_bind_args *ap);
 int	vop_stdunp_connect(struct vop_unp_connect_args *ap);
 int	vop_stdunp_detach(struct vop_unp_detach_args *ap);
 int	vop_eopnotsupp(struct vop_generic_args *ap);
 int	vop_ebadf(struct vop_generic_args *ap);
 int	vop_einval(struct vop_generic_args *ap);
 int	vop_enoent(struct vop_generic_args *ap);
 int	vop_enotty(struct vop_generic_args *ap);
 int	vop_null(struct vop_generic_args *ap);
 int	vop_panic(struct vop_generic_args *ap);
 int	dead_poll(struct vop_poll_args *ap);
 int	dead_read(struct vop_read_args *ap);
 int	dead_write(struct vop_write_args *ap);
 
 /* These are called from within the actual VOPS. */
 void	vop_close_post(void *a, int rc);
 void	vop_create_post(void *a, int rc);
 void	vop_deleteextattr_post(void *a, int rc);
 void	vop_link_post(void *a, int rc);
 void	vop_lookup_post(void *a, int rc);
 void	vop_lookup_pre(void *a);
 void	vop_mkdir_post(void *a, int rc);
 void	vop_mknod_post(void *a, int rc);
 void	vop_open_post(void *a, int rc);
 void	vop_read_post(void *a, int rc);
 void	vop_readdir_post(void *a, int rc);
 void	vop_reclaim_post(void *a, int rc);
 void	vop_remove_post(void *a, int rc);
 void	vop_rename_post(void *a, int rc);
 void	vop_rename_pre(void *a);
 void	vop_rmdir_post(void *a, int rc);
 void	vop_setattr_post(void *a, int rc);
 void	vop_setextattr_post(void *a, int rc);
 void	vop_symlink_post(void *a, int rc);
 
 #ifdef DEBUG_VFS_LOCKS
 void	vop_strategy_pre(void *a);
 void	vop_lock_pre(void *a);
 void	vop_lock_post(void *a, int rc);
 void	vop_unlock_post(void *a, int rc);
 void	vop_unlock_pre(void *a);
 #else
 #define	vop_strategy_pre(x)	do { } while (0)
 #define	vop_lock_pre(x)		do { } while (0)
 #define	vop_lock_post(x, y)	do { } while (0)
 #define	vop_unlock_post(x, y)	do { } while (0)
 #define	vop_unlock_pre(x)	do { } while (0)
 #endif
 
 void	vop_rename_fail(struct vop_rename_args *ap);
 
 #define	VOP_WRITE_PRE(ap)						\
 	struct vattr va;						\
 	int error;							\
 	off_t osize, ooffset, noffset;					\
 									\
 	osize = ooffset = noffset = 0;					\
 	if (!VN_KNLIST_EMPTY((ap)->a_vp)) {				\
 		error = VOP_GETATTR((ap)->a_vp, &va, (ap)->a_cred);	\
 		if (error)						\
 			return (error);					\
 		ooffset = (ap)->a_uio->uio_offset;			\
 		osize = (off_t)va.va_size;				\
 	}
 
 #define VOP_WRITE_POST(ap, ret)						\
 	noffset = (ap)->a_uio->uio_offset;				\
 	if (noffset > ooffset && !VN_KNLIST_EMPTY((ap)->a_vp)) {	\
 		VFS_KNOTE_LOCKED((ap)->a_vp, NOTE_WRITE			\
 		    | (noffset > osize ? NOTE_EXTEND : 0));		\
 	}
 
 #define VOP_LOCK(vp, flags) VOP_LOCK1(vp, flags, __FILE__, __LINE__)
 
 #ifdef INVARIANTS
 #define	VOP_ADD_WRITECOUNT_CHECKED(vp, cnt)				\
 do {									\
 	int error_;							\
 									\
 	error_ = VOP_ADD_WRITECOUNT((vp), (cnt));			\
 	VNASSERT(error_ == 0, (vp), ("VOP_ADD_WRITECOUNT returned %d",	\
 	    error_));							\
 } while (0)
 #define	VOP_SET_TEXT_CHECKED(vp)					\
 do {									\
 	int error_;							\
 									\
 	error_ = VOP_SET_TEXT((vp));					\
 	VNASSERT(error_ == 0, (vp), ("VOP_SET_TEXT returned %d",	\
 	    error_));							\
 } while (0)
 #define	VOP_UNSET_TEXT_CHECKED(vp)					\
 do {									\
 	int error_;							\
 									\
 	error_ = VOP_UNSET_TEXT((vp));					\
 	VNASSERT(error_ == 0, (vp), ("VOP_UNSET_TEXT returned %d",	\
 	    error_));							\
 } while (0)
 #else
 #define	VOP_ADD_WRITECOUNT_CHECKED(vp, cnt)	VOP_ADD_WRITECOUNT((vp), (cnt))
 #define	VOP_SET_TEXT_CHECKED(vp)		VOP_SET_TEXT((vp))
 #define	VOP_UNSET_TEXT_CHECKED(vp)		VOP_UNSET_TEXT((vp))
 #endif
 
 void	vput(struct vnode *vp);
 void	vrele(struct vnode *vp);
 void	vref(struct vnode *vp);
 void	vrefl(struct vnode *vp);
 void	vrefact(struct vnode *vp);
 int	vrefcnt(struct vnode *vp);
 void 	v_addpollinfo(struct vnode *vp);
 
 int vnode_create_vobject(struct vnode *vp, off_t size, struct thread *td);
 void vnode_destroy_vobject(struct vnode *vp);
 
 extern struct vop_vector fifo_specops;
 extern struct vop_vector dead_vnodeops;
 extern struct vop_vector default_vnodeops;
 
 #define VOP_PANIC	((void*)(uintptr_t)vop_panic)
 #define VOP_NULL	((void*)(uintptr_t)vop_null)
 #define VOP_EBADF	((void*)(uintptr_t)vop_ebadf)
 #define VOP_ENOTTY	((void*)(uintptr_t)vop_enotty)
 #define VOP_EINVAL	((void*)(uintptr_t)vop_einval)
 #define VOP_ENOENT	((void*)(uintptr_t)vop_enoent)
 #define VOP_EOPNOTSUPP	((void*)(uintptr_t)vop_eopnotsupp)
 
 /* fifo_vnops.c */
 int	fifo_printinfo(struct vnode *);
 
 /* vfs_hash.c */
 typedef int vfs_hash_cmp_t(struct vnode *vp, void *arg);
 
 void vfs_hash_changesize(int newhashsize);
 int vfs_hash_get(const struct mount *mp, u_int hash, int flags,
     struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg);
 u_int vfs_hash_index(struct vnode *vp);
 int vfs_hash_insert(struct vnode *vp, u_int hash, int flags, struct thread *td,
     struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg);
 void vfs_hash_ref(const struct mount *mp, u_int hash, struct thread *td,
     struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg);
 void vfs_hash_rehash(struct vnode *vp, u_int hash);
 void vfs_hash_remove(struct vnode *vp);
 
 int vfs_kqfilter(struct vop_kqfilter_args *);
 void vfs_mark_atime(struct vnode *vp, struct ucred *cred);
 struct dirent;
 int vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off);
 
 int vfs_unixify_accmode(accmode_t *accmode);
 
 void vfs_unp_reclaim(struct vnode *vp);
 
 int setfmode(struct thread *td, struct ucred *cred, struct vnode *vp, int mode);
 int setfown(struct thread *td, struct ucred *cred, struct vnode *vp, uid_t uid,
     gid_t gid);
 int vn_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
     struct thread *td);
 int vn_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
     struct thread *td);
 
 void vn_fsid(struct vnode *vp, struct vattr *va);
 
 #endif /* _KERNEL */
 
 #endif /* !_SYS_VNODE_H_ */
Index: stable/12/sys/vm/vm_fault.c
===================================================================
--- stable/12/sys/vm/vm_fault.c	(revision 354393)
+++ stable/12/sys/vm/vm_fault.c	(revision 354394)
@@ -1,1940 +1,1947 @@
 /*-
  * SPDX-License-Identifier: (BSD-4-Clause AND MIT-CMU)
  *
  * Copyright (c) 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * Copyright (c) 1994 John S. Dyson
  * All rights reserved.
  * Copyright (c) 1994 David Greenman
  * All rights reserved.
  *
  *
  * This code is derived from software contributed to Berkeley by
  * The Mach Operating System project at Carnegie-Mellon University.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)vm_fault.c	8.4 (Berkeley) 1/12/94
  *
  *
  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
  * All rights reserved.
  *
  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
  *
  * Permission to use, copy, modify and distribute this software and
  * its documentation is hereby granted, provided that both the copyright
  * notice and this permission notice appear in all copies of the
  * software, derivative works or modified versions, and any portions
  * thereof, and that both notices appear in supporting documentation.
  *
  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  *
  * Carnegie Mellon requests users of this software to return to
  *
  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  *  School of Computer Science
  *  Carnegie Mellon University
  *  Pittsburgh PA 15213-3890
  *
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  */
 
 /*
  *	Page fault handling module.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ktrace.h"
 #include "opt_vm.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mman.h>
 #include <sys/proc.h>
 #include <sys/racct.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
 #include <sys/signalvar.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
 #ifdef KTRACE
 #include <sys/ktrace.h>
 #endif
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_reserv.h>
 
 #define PFBAK 4
 #define PFFOR 4
 
 #define	VM_FAULT_READ_DEFAULT	(1 + VM_FAULT_READ_AHEAD_INIT)
 #define	VM_FAULT_READ_MAX	(1 + VM_FAULT_READ_AHEAD_MAX)
 
 #define	VM_FAULT_DONTNEED_MIN	1048576
 
 struct faultstate {
 	vm_page_t m;
 	vm_object_t object;
 	vm_pindex_t pindex;
 	vm_page_t first_m;
 	vm_object_t	first_object;
 	vm_pindex_t first_pindex;
 	vm_map_t map;
 	vm_map_entry_t entry;
 	int map_generation;
 	bool lookup_still_valid;
 	struct vnode *vp;
 };
 
 static void vm_fault_dontneed(const struct faultstate *fs, vm_offset_t vaddr,
 	    int ahead);
 static void vm_fault_prefault(const struct faultstate *fs, vm_offset_t addra,
 	    int backward, int forward, bool obj_locked);
 
 static int vm_pfault_oom_attempts = 3;
 SYSCTL_INT(_vm, OID_AUTO, pfault_oom_attempts, CTLFLAG_RWTUN,
     &vm_pfault_oom_attempts, 0,
     "Number of page allocation attempts in page fault handler before it "
     "triggers OOM handling");
 
 static int vm_pfault_oom_wait = 10;
 SYSCTL_INT(_vm, OID_AUTO, pfault_oom_wait, CTLFLAG_RWTUN,
     &vm_pfault_oom_wait, 0,
     "Number of seconds to wait for free pages before retrying "
     "the page fault handler");
 
 static inline void
 release_page(struct faultstate *fs)
 {
 
 	if (fs->m != NULL) {
 		vm_page_xunbusy(fs->m);
 		vm_page_lock(fs->m);
 		vm_page_deactivate(fs->m);
 		vm_page_unlock(fs->m);
 		fs->m = NULL;
 	}
 }
 
 static inline void
 unlock_map(struct faultstate *fs)
 {
 
 	if (fs->lookup_still_valid) {
 		vm_map_lookup_done(fs->map, fs->entry);
 		fs->lookup_still_valid = false;
 	}
 }
 
 static void
 unlock_vp(struct faultstate *fs)
 {
 
 	if (fs->vp != NULL) {
 		vput(fs->vp);
 		fs->vp = NULL;
 	}
 }
 
 static void
 unlock_and_deallocate(struct faultstate *fs)
 {
 
 	vm_object_pip_wakeup(fs->object);
 	VM_OBJECT_WUNLOCK(fs->object);
 	if (fs->object != fs->first_object) {
 		VM_OBJECT_WLOCK(fs->first_object);
 		vm_page_lock(fs->first_m);
 		vm_page_free(fs->first_m);
 		vm_page_unlock(fs->first_m);
 		vm_object_pip_wakeup(fs->first_object);
 		VM_OBJECT_WUNLOCK(fs->first_object);
 		fs->first_m = NULL;
 	}
 	vm_object_deallocate(fs->first_object);
 	unlock_map(fs);
 	unlock_vp(fs);
 }
 
 static void
 vm_fault_dirty(vm_map_entry_t entry, vm_page_t m, vm_prot_t prot,
     vm_prot_t fault_type, int fault_flags, bool set_wd)
 {
 	bool need_dirty;
 
 	if (((prot & VM_PROT_WRITE) == 0 &&
 	    (fault_flags & VM_FAULT_DIRTY) == 0) ||
 	    (m->oflags & VPO_UNMANAGED) != 0)
 		return;
 
 	VM_OBJECT_ASSERT_LOCKED(m->object);
 
 	need_dirty = ((fault_type & VM_PROT_WRITE) != 0 &&
 	    (fault_flags & VM_FAULT_WIRE) == 0) ||
 	    (fault_flags & VM_FAULT_DIRTY) != 0;
 
 	if (set_wd)
 		vm_object_set_writeable_dirty(m->object);
 	else
 		/*
 		 * If two callers of vm_fault_dirty() with set_wd ==
 		 * FALSE, one for the map entry with MAP_ENTRY_NOSYNC
 		 * flag set, other with flag clear, race, it is
 		 * possible for the no-NOSYNC thread to see m->dirty
 		 * != 0 and not clear VPO_NOSYNC.  Take vm_page lock
 		 * around manipulation of VPO_NOSYNC and
 		 * vm_page_dirty() call, to avoid the race and keep
 		 * m->oflags consistent.
 		 */
 		vm_page_lock(m);
 
 	/*
 	 * If this is a NOSYNC mmap we do not want to set VPO_NOSYNC
 	 * if the page is already dirty to prevent data written with
 	 * the expectation of being synced from not being synced.
 	 * Likewise if this entry does not request NOSYNC then make
 	 * sure the page isn't marked NOSYNC.  Applications sharing
 	 * data should use the same flags to avoid ping ponging.
 	 */
 	if ((entry->eflags & MAP_ENTRY_NOSYNC) != 0) {
 		if (m->dirty == 0) {
 			m->oflags |= VPO_NOSYNC;
 		}
 	} else {
 		m->oflags &= ~VPO_NOSYNC;
 	}
 
 	/*
 	 * If the fault is a write, we know that this page is being
 	 * written NOW so dirty it explicitly to save on
 	 * pmap_is_modified() calls later.
 	 *
 	 * Also, since the page is now dirty, we can possibly tell
 	 * the pager to release any swap backing the page.  Calling
 	 * the pager requires a write lock on the object.
 	 */
 	if (need_dirty)
 		vm_page_dirty(m);
 	if (!set_wd)
 		vm_page_unlock(m);
 	else if (need_dirty)
 		vm_pager_page_unswapped(m);
 }
 
 static void
 vm_fault_fill_hold(vm_page_t *m_hold, vm_page_t m)
 {
 
 	if (m_hold != NULL) {
 		*m_hold = m;
 		vm_page_lock(m);
 		vm_page_hold(m);
 		vm_page_unlock(m);
 	}
 }
 
 /*
  * Unlocks fs.first_object and fs.map on success.
  */
 static int
 vm_fault_soft_fast(struct faultstate *fs, vm_offset_t vaddr, vm_prot_t prot,
     int fault_type, int fault_flags, boolean_t wired, vm_page_t *m_hold)
 {
 	vm_page_t m, m_map;
 #if (defined(__aarch64__) || defined(__amd64__) || (defined(__arm__) && \
     __ARM_ARCH >= 6) || defined(__i386__) || defined(__riscv)) && \
     VM_NRESERVLEVEL > 0
 	vm_page_t m_super;
 	int flags;
 #endif
 	int psind, rv;
 
 	MPASS(fs->vp == NULL);
 	m = vm_page_lookup(fs->first_object, fs->first_pindex);
 	/* A busy page can be mapped for read|execute access. */
 	if (m == NULL || ((prot & VM_PROT_WRITE) != 0 &&
 	    vm_page_busied(m)) || m->valid != VM_PAGE_BITS_ALL)
 		return (KERN_FAILURE);
 	m_map = m;
 	psind = 0;
 #if (defined(__aarch64__) || defined(__amd64__) || (defined(__arm__) && \
     __ARM_ARCH >= 6) || defined(__i386__) || defined(__riscv)) && \
     VM_NRESERVLEVEL > 0
 	if ((m->flags & PG_FICTITIOUS) == 0 &&
 	    (m_super = vm_reserv_to_superpage(m)) != NULL &&
 	    rounddown2(vaddr, pagesizes[m_super->psind]) >= fs->entry->start &&
 	    roundup2(vaddr + 1, pagesizes[m_super->psind]) <= fs->entry->end &&
 	    (vaddr & (pagesizes[m_super->psind] - 1)) == (VM_PAGE_TO_PHYS(m) &
 	    (pagesizes[m_super->psind] - 1)) &&
 	    pmap_ps_enabled(fs->map->pmap)) {
 		flags = PS_ALL_VALID;
 		if ((prot & VM_PROT_WRITE) != 0) {
 			/*
 			 * Create a superpage mapping allowing write access
 			 * only if none of the constituent pages are busy and
 			 * all of them are already dirty (except possibly for
 			 * the page that was faulted on).
 			 */
 			flags |= PS_NONE_BUSY;
 			if ((fs->first_object->flags & OBJ_UNMANAGED) == 0)
 				flags |= PS_ALL_DIRTY;
 		}
 		if (vm_page_ps_test(m_super, flags, m)) {
 			m_map = m_super;
 			psind = m_super->psind;
 			vaddr = rounddown2(vaddr, pagesizes[psind]);
 			/* Preset the modified bit for dirty superpages. */
 			if ((flags & PS_ALL_DIRTY) != 0)
 				fault_type |= VM_PROT_WRITE;
 		}
 	}
 #endif
 	rv = pmap_enter(fs->map->pmap, vaddr, m_map, prot, fault_type |
 	    PMAP_ENTER_NOSLEEP | (wired ? PMAP_ENTER_WIRED : 0), psind);
 	if (rv != KERN_SUCCESS)
 		return (rv);
 	vm_fault_fill_hold(m_hold, m);
 	vm_fault_dirty(fs->entry, m, prot, fault_type, fault_flags, false);
 	if (psind == 0 && !wired)
 		vm_fault_prefault(fs, vaddr, PFBAK, PFFOR, true);
 	VM_OBJECT_RUNLOCK(fs->first_object);
 	vm_map_lookup_done(fs->map, fs->entry);
 	curthread->td_ru.ru_minflt++;
 	return (KERN_SUCCESS);
 }
 
 static void
 vm_fault_restore_map_lock(struct faultstate *fs)
 {
 
 	VM_OBJECT_ASSERT_WLOCKED(fs->first_object);
 	MPASS(fs->first_object->paging_in_progress > 0);
 
 	if (!vm_map_trylock_read(fs->map)) {
 		VM_OBJECT_WUNLOCK(fs->first_object);
 		vm_map_lock_read(fs->map);
 		VM_OBJECT_WLOCK(fs->first_object);
 	}
 	fs->lookup_still_valid = true;
 }
 
 static void
 vm_fault_populate_check_page(vm_page_t m)
 {
 
 	/*
 	 * Check each page to ensure that the pager is obeying the
 	 * interface: the page must be installed in the object, fully
 	 * valid, and exclusively busied.
 	 */
 	MPASS(m != NULL);
 	MPASS(m->valid == VM_PAGE_BITS_ALL);
 	MPASS(vm_page_xbusied(m));
 }
 
 static void
 vm_fault_populate_cleanup(vm_object_t object, vm_pindex_t first,
     vm_pindex_t last)
 {
 	vm_page_t m;
 	vm_pindex_t pidx;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	MPASS(first <= last);
 	for (pidx = first, m = vm_page_lookup(object, pidx);
 	    pidx <= last; pidx++, m = vm_page_next(m)) {
 		vm_fault_populate_check_page(m);
 		vm_page_lock(m);
 		vm_page_deactivate(m);
 		vm_page_unlock(m);
 		vm_page_xunbusy(m);
 	}
 }
 
 static int
 vm_fault_populate(struct faultstate *fs, vm_prot_t prot, int fault_type,
     int fault_flags, boolean_t wired, vm_page_t *m_hold)
 {
 	struct mtx *m_mtx;
 	vm_offset_t vaddr;
 	vm_page_t m;
 	vm_pindex_t map_first, map_last, pager_first, pager_last, pidx;
 	int i, npages, psind, rv;
 
 	MPASS(fs->object == fs->first_object);
 	VM_OBJECT_ASSERT_WLOCKED(fs->first_object);
 	MPASS(fs->first_object->paging_in_progress > 0);
 	MPASS(fs->first_object->backing_object == NULL);
 	MPASS(fs->lookup_still_valid);
 
 	pager_first = OFF_TO_IDX(fs->entry->offset);
 	pager_last = pager_first + atop(fs->entry->end - fs->entry->start) - 1;
 	unlock_map(fs);
 	unlock_vp(fs);
 
 	/*
 	 * Call the pager (driver) populate() method.
 	 *
 	 * There is no guarantee that the method will be called again
 	 * if the current fault is for read, and a future fault is
 	 * for write.  Report the entry's maximum allowed protection
 	 * to the driver.
 	 */
 	rv = vm_pager_populate(fs->first_object, fs->first_pindex,
 	    fault_type, fs->entry->max_protection, &pager_first, &pager_last);
 
 	VM_OBJECT_ASSERT_WLOCKED(fs->first_object);
 	if (rv == VM_PAGER_BAD) {
 		/*
 		 * VM_PAGER_BAD is the backdoor for a pager to request
 		 * normal fault handling.
 		 */
 		vm_fault_restore_map_lock(fs);
 		if (fs->map->timestamp != fs->map_generation)
 			return (KERN_RESOURCE_SHORTAGE); /* RetryFault */
 		return (KERN_NOT_RECEIVER);
 	}
 	if (rv != VM_PAGER_OK)
 		return (KERN_FAILURE); /* AKA SIGSEGV */
 
 	/* Ensure that the driver is obeying the interface. */
 	MPASS(pager_first <= pager_last);
 	MPASS(fs->first_pindex <= pager_last);
 	MPASS(fs->first_pindex >= pager_first);
 	MPASS(pager_last < fs->first_object->size);
 
 	vm_fault_restore_map_lock(fs);
 	if (fs->map->timestamp != fs->map_generation) {
 		vm_fault_populate_cleanup(fs->first_object, pager_first,
 		    pager_last);
 		return (KERN_RESOURCE_SHORTAGE); /* RetryFault */
 	}
 
 	/*
 	 * The map is unchanged after our last unlock.  Process the fault.
 	 *
 	 * The range [pager_first, pager_last] that is given to the
 	 * pager is only a hint.  The pager may populate any range
 	 * within the object that includes the requested page index.
 	 * In case the pager expanded the range, clip it to fit into
 	 * the map entry.
 	 */
 	map_first = OFF_TO_IDX(fs->entry->offset);
 	if (map_first > pager_first) {
 		vm_fault_populate_cleanup(fs->first_object, pager_first,
 		    map_first - 1);
 		pager_first = map_first;
 	}
 	map_last = map_first + atop(fs->entry->end - fs->entry->start) - 1;
 	if (map_last < pager_last) {
 		vm_fault_populate_cleanup(fs->first_object, map_last + 1,
 		    pager_last);
 		pager_last = map_last;
 	}
 	for (pidx = pager_first, m = vm_page_lookup(fs->first_object, pidx);
 	    pidx <= pager_last;
 	    pidx += npages, m = vm_page_next(&m[npages - 1])) {
 		vaddr = fs->entry->start + IDX_TO_OFF(pidx) - fs->entry->offset;
 #if defined(__aarch64__) || defined(__amd64__) || (defined(__arm__) && \
     __ARM_ARCH >= 6) || defined(__i386__) || defined(__riscv)
 		psind = m->psind;
 		if (psind > 0 && ((vaddr & (pagesizes[psind] - 1)) != 0 ||
 		    pidx + OFF_TO_IDX(pagesizes[psind]) - 1 > pager_last ||
 		    !pmap_ps_enabled(fs->map->pmap)))
 			psind = 0;
 #else
 		psind = 0;
 #endif		
 		npages = atop(pagesizes[psind]);
 		for (i = 0; i < npages; i++) {
 			vm_fault_populate_check_page(&m[i]);
 			vm_fault_dirty(fs->entry, &m[i], prot, fault_type,
 			    fault_flags, true);
 		}
 		VM_OBJECT_WUNLOCK(fs->first_object);
 		rv = pmap_enter(fs->map->pmap, vaddr, m, prot, fault_type |
 		    (wired ? PMAP_ENTER_WIRED : 0), psind);
 #if defined(__amd64__)
 		if (psind > 0 && rv == KERN_FAILURE) {
 			for (i = 0; i < npages; i++) {
 				rv = pmap_enter(fs->map->pmap, vaddr + ptoa(i),
 				    &m[i], prot, fault_type |
 				    (wired ? PMAP_ENTER_WIRED : 0), 0);
 				MPASS(rv == KERN_SUCCESS);
 			}
 		}
 #else
 		MPASS(rv == KERN_SUCCESS);
 #endif
 		VM_OBJECT_WLOCK(fs->first_object);
 		m_mtx = NULL;
 		for (i = 0; i < npages; i++) {
 			vm_page_change_lock(&m[i], &m_mtx);
 			if ((fault_flags & VM_FAULT_WIRE) != 0)
 				vm_page_wire(&m[i]);
 			else
 				vm_page_activate(&m[i]);
 			if (m_hold != NULL && m[i].pindex == fs->first_pindex) {
 				*m_hold = &m[i];
 				vm_page_hold(&m[i]);
 			}
 			vm_page_xunbusy_maybelocked(&m[i]);
 		}
 		if (m_mtx != NULL)
 			mtx_unlock(m_mtx);
 	}
 	curthread->td_ru.ru_majflt++;
 	return (KERN_SUCCESS);
 }
 
 static int prot_fault_translation;
 SYSCTL_INT(_machdep, OID_AUTO, prot_fault_translation, CTLFLAG_RWTUN,
     &prot_fault_translation, 0,
     "Control signal to deliver on protection fault");
 
 /* compat definition to keep common code for signal translation */
 #define	UCODE_PAGEFLT	12
 #ifdef T_PAGEFLT
 _Static_assert(UCODE_PAGEFLT == T_PAGEFLT, "T_PAGEFLT");
 #endif
 
 /*
  *	vm_fault_trap:
  *
  *	Handle a page fault occurring at the given address,
  *	requiring the given permissions, in the map specified.
  *	If successful, the page is inserted into the
  *	associated physical map.
  *
  *	NOTE: the given address should be truncated to the
  *	proper page address.
  *
  *	KERN_SUCCESS is returned if the page fault is handled; otherwise,
  *	a standard error specifying why the fault is fatal is returned.
  *
  *	The map in question must be referenced, and remains so.
  *	Caller may hold no locks.
  */
 int
 vm_fault_trap(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
     int fault_flags, int *signo, int *ucode)
 {
 	int result;
 
 	MPASS(signo == NULL || ucode != NULL);
 #ifdef KTRACE
 	if (map != kernel_map && KTRPOINT(curthread, KTR_FAULT))
 		ktrfault(vaddr, fault_type);
 #endif
 	result = vm_fault(map, trunc_page(vaddr), fault_type, fault_flags,
 	    NULL);
 	KASSERT(result == KERN_SUCCESS || result == KERN_FAILURE ||
 	    result == KERN_INVALID_ADDRESS ||
 	    result == KERN_RESOURCE_SHORTAGE ||
 	    result == KERN_PROTECTION_FAILURE ||
 	    result == KERN_OUT_OF_BOUNDS,
 	    ("Unexpected Mach error %d from vm_fault()", result));
 #ifdef KTRACE
 	if (map != kernel_map && KTRPOINT(curthread, KTR_FAULTEND))
 		ktrfaultend(result);
 #endif
 	if (result != KERN_SUCCESS && signo != NULL) {
 		switch (result) {
 		case KERN_FAILURE:
 		case KERN_INVALID_ADDRESS:
 			*signo = SIGSEGV;
 			*ucode = SEGV_MAPERR;
 			break;
 		case KERN_RESOURCE_SHORTAGE:
 			*signo = SIGBUS;
 			*ucode = BUS_OOMERR;
 			break;
 		case KERN_OUT_OF_BOUNDS:
 			*signo = SIGBUS;
 			*ucode = BUS_OBJERR;
 			break;
 		case KERN_PROTECTION_FAILURE:
 			if (prot_fault_translation == 0) {
 				/*
 				 * Autodetect.  This check also covers
 				 * the images without the ABI-tag ELF
 				 * note.
 				 */
 				if (SV_CURPROC_ABI() == SV_ABI_FREEBSD &&
 				    curproc->p_osrel >= P_OSREL_SIGSEGV) {
 					*signo = SIGSEGV;
 					*ucode = SEGV_ACCERR;
 				} else {
 					*signo = SIGBUS;
 					*ucode = UCODE_PAGEFLT;
 				}
 			} else if (prot_fault_translation == 1) {
 				/* Always compat mode. */
 				*signo = SIGBUS;
 				*ucode = UCODE_PAGEFLT;
 			} else {
 				/* Always SIGSEGV mode. */
 				*signo = SIGSEGV;
 				*ucode = SEGV_ACCERR;
 			}
 			break;
 		default:
 			KASSERT(0, ("Unexpected Mach error %d from vm_fault()",
 			    result));
 			break;
 		}
 	}
 	return (result);
 }
 
 static int
 vm_fault_lock_vnode(struct faultstate *fs)
 {
 	struct vnode *vp;
 	int error, locked;
 
 	if (fs->object->type != OBJT_VNODE)
 		return (KERN_SUCCESS);
 	vp = fs->object->handle;
 	if (vp == fs->vp) {
 		ASSERT_VOP_LOCKED(vp, "saved vnode is not locked");
 		return (KERN_SUCCESS);
 	}
 
 	/*
 	 * Perform an unlock in case the desired vnode changed while
 	 * the map was unlocked during a retry.
 	 */
 	unlock_vp(fs);
 
 	locked = VOP_ISLOCKED(vp);
 	if (locked != LK_EXCLUSIVE)
 		locked = LK_SHARED;
 
 	/*
 	 * We must not sleep acquiring the vnode lock while we have
 	 * the page exclusive busied or the object's
 	 * paging-in-progress count incremented.  Otherwise, we could
 	 * deadlock.
 	 */
 	error = vget(vp, locked | LK_CANRECURSE | LK_NOWAIT, curthread);
 	if (error == 0) {
 		fs->vp = vp;
 		return (KERN_SUCCESS);
 	}
 
 	vhold(vp);
 	release_page(fs);
 	unlock_and_deallocate(fs);
 	error = vget(vp, locked | LK_RETRY | LK_CANRECURSE, curthread);
 	vdrop(vp);
 	fs->vp = vp;
 	KASSERT(error == 0, ("vm_fault: vget failed %d", error));
 	return (KERN_RESOURCE_SHORTAGE);
 }
 
 int
 vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
     int fault_flags, vm_page_t *m_hold)
 {
 	struct faultstate fs;
 	struct domainset *dset;
 	vm_object_t next_object, retry_object;
 	vm_offset_t e_end, e_start;
 	vm_pindex_t retry_pindex;
 	vm_prot_t prot, retry_prot;
 	int ahead, alloc_req, behind, cluster_offset, era, faultcount;
 	int nera, oom, result, rv;
 	u_char behavior;
 	boolean_t wired;	/* Passed by reference. */
 	bool dead, hardfault, is_first_object_locked;
 
 	VM_CNT_INC(v_vm_faults);
 
 	if ((curthread->td_pflags & TDP_NOFAULTING) != 0)
 		return (KERN_PROTECTION_FAILURE);
 
 	fs.vp = NULL;
 	faultcount = 0;
 	nera = -1;
 	hardfault = false;
 
 RetryFault:
 	oom = 0;
 RetryFault_oom:
 
 	/*
 	 * Find the backing store object and offset into it to begin the
 	 * search.
 	 */
 	fs.map = map;
 	result = vm_map_lookup(&fs.map, vaddr, fault_type |
 	    VM_PROT_FAULT_LOOKUP, &fs.entry, &fs.first_object,
 	    &fs.first_pindex, &prot, &wired);
 	if (result != KERN_SUCCESS) {
 		unlock_vp(&fs);
 		return (result);
 	}
 
 	fs.map_generation = fs.map->timestamp;
 
 	if (fs.entry->eflags & MAP_ENTRY_NOFAULT) {
 		panic("%s: fault on nofault entry, addr: %#lx",
 		    __func__, (u_long)vaddr);
 	}
 
 	if (fs.entry->eflags & MAP_ENTRY_IN_TRANSITION &&
 	    fs.entry->wiring_thread != curthread) {
 		vm_map_unlock_read(fs.map);
 		vm_map_lock(fs.map);
 		if (vm_map_lookup_entry(fs.map, vaddr, &fs.entry) &&
 		    (fs.entry->eflags & MAP_ENTRY_IN_TRANSITION)) {
 			unlock_vp(&fs);
 			fs.entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
 			vm_map_unlock_and_wait(fs.map, 0);
 		} else
 			vm_map_unlock(fs.map);
 		goto RetryFault;
 	}
 
 	MPASS((fs.entry->eflags & MAP_ENTRY_GUARD) == 0);
 
 	if (wired)
 		fault_type = prot | (fault_type & VM_PROT_COPY);
 	else
 		KASSERT((fault_flags & VM_FAULT_WIRE) == 0,
 		    ("!wired && VM_FAULT_WIRE"));
 
 	/*
 	 * Try to avoid lock contention on the top-level object through
 	 * special-case handling of some types of page faults, specifically,
 	 * those that are both (1) mapping an existing page from the top-
 	 * level object and (2) not having to mark that object as containing
 	 * dirty pages.  Under these conditions, a read lock on the top-level
 	 * object suffices, allowing multiple page faults of a similar type to
 	 * run in parallel on the same top-level object.
 	 */
 	if (fs.vp == NULL /* avoid locked vnode leak */ &&
 	    (fault_flags & (VM_FAULT_WIRE | VM_FAULT_DIRTY)) == 0 &&
 	    /* avoid calling vm_object_set_writeable_dirty() */
 	    ((prot & VM_PROT_WRITE) == 0 ||
 	    (fs.first_object->type != OBJT_VNODE &&
 	    (fs.first_object->flags & OBJ_TMPFS_NODE) == 0) ||
 	    (fs.first_object->flags & OBJ_MIGHTBEDIRTY) != 0)) {
 		VM_OBJECT_RLOCK(fs.first_object);
 		if ((prot & VM_PROT_WRITE) == 0 ||
 		    (fs.first_object->type != OBJT_VNODE &&
 		    (fs.first_object->flags & OBJ_TMPFS_NODE) == 0) ||
 		    (fs.first_object->flags & OBJ_MIGHTBEDIRTY) != 0) {
 			rv = vm_fault_soft_fast(&fs, vaddr, prot, fault_type,
 			    fault_flags, wired, m_hold);
 			if (rv == KERN_SUCCESS)
 				return (rv);
 		}
 		if (!VM_OBJECT_TRYUPGRADE(fs.first_object)) {
 			VM_OBJECT_RUNLOCK(fs.first_object);
 			VM_OBJECT_WLOCK(fs.first_object);
 		}
 	} else {
 		VM_OBJECT_WLOCK(fs.first_object);
 	}
 
 	/*
 	 * Make a reference to this object to prevent its disposal while we
 	 * are messing with it.  Once we have the reference, the map is free
 	 * to be diddled.  Since objects reference their shadows (and copies),
 	 * they will stay around as well.
 	 *
 	 * Bump the paging-in-progress count to prevent size changes (e.g. 
 	 * truncation operations) during I/O.
 	 */
 	vm_object_reference_locked(fs.first_object);
 	vm_object_pip_add(fs.first_object, 1);
 
 	fs.lookup_still_valid = true;
 
 	fs.first_m = NULL;
 
 	/*
 	 * Search for the page at object/offset.
 	 */
 	fs.object = fs.first_object;
 	fs.pindex = fs.first_pindex;
 	while (TRUE) {
 		/*
 		 * If the object is marked for imminent termination,
 		 * we retry here, since the collapse pass has raced
 		 * with us.  Otherwise, if we see terminally dead
 		 * object, return fail.
 		 */
 		if ((fs.object->flags & OBJ_DEAD) != 0) {
 			dead = fs.object->type == OBJT_DEAD;
 			unlock_and_deallocate(&fs);
 			if (dead)
 				return (KERN_PROTECTION_FAILURE);
 			pause("vmf_de", 1);
 			goto RetryFault;
 		}
 
 		/*
 		 * See if page is resident
 		 */
 		fs.m = vm_page_lookup(fs.object, fs.pindex);
 		if (fs.m != NULL) {
 			/*
 			 * Wait/Retry if the page is busy.  We have to do this
 			 * if the page is either exclusive or shared busy
 			 * because the vm_pager may be using read busy for
 			 * pageouts (and even pageins if it is the vnode
 			 * pager), and we could end up trying to pagein and
 			 * pageout the same page simultaneously.
 			 *
 			 * We can theoretically allow the busy case on a read
 			 * fault if the page is marked valid, but since such
 			 * pages are typically already pmap'd, putting that
 			 * special case in might be more effort then it is 
 			 * worth.  We cannot under any circumstances mess
 			 * around with a shared busied page except, perhaps,
 			 * to pmap it.
 			 */
 			if (vm_page_busied(fs.m)) {
 				/*
 				 * Reference the page before unlocking and
 				 * sleeping so that the page daemon is less
 				 * likely to reclaim it.
 				 */
 				vm_page_aflag_set(fs.m, PGA_REFERENCED);
 				if (fs.object != fs.first_object) {
 					if (!VM_OBJECT_TRYWLOCK(
 					    fs.first_object)) {
 						VM_OBJECT_WUNLOCK(fs.object);
 						VM_OBJECT_WLOCK(fs.first_object);
 						VM_OBJECT_WLOCK(fs.object);
 					}
 					vm_page_lock(fs.first_m);
 					vm_page_free(fs.first_m);
 					vm_page_unlock(fs.first_m);
 					vm_object_pip_wakeup(fs.first_object);
 					VM_OBJECT_WUNLOCK(fs.first_object);
 					fs.first_m = NULL;
 				}
 				unlock_map(&fs);
 				if (fs.m == vm_page_lookup(fs.object,
 				    fs.pindex)) {
 					vm_page_sleep_if_busy(fs.m, "vmpfw");
 				}
 				vm_object_pip_wakeup(fs.object);
 				VM_OBJECT_WUNLOCK(fs.object);
 				VM_CNT_INC(v_intrans);
 				vm_object_deallocate(fs.first_object);
 				goto RetryFault;
 			}
 
 			/*
 			 * Mark page busy for other processes, and the 
 			 * pagedaemon.  If it still isn't completely valid
 			 * (readable), jump to readrest, else break-out ( we
 			 * found the page ).
 			 */
 			vm_page_xbusy(fs.m);
 			if (fs.m->valid != VM_PAGE_BITS_ALL)
 				goto readrest;
 			break; /* break to PAGE HAS BEEN FOUND */
 		}
 		KASSERT(fs.m == NULL, ("fs.m should be NULL, not %p", fs.m));
 
 		/*
 		 * Page is not resident.  If the pager might contain the page
 		 * or this is the beginning of the search, allocate a new
 		 * page.  (Default objects are zero-fill, so there is no real
 		 * pager for them.)
 		 */
 		if (fs.object->type != OBJT_DEFAULT ||
 		    fs.object == fs.first_object) {
+			if ((fs.object->flags & OBJ_SIZEVNLOCK) != 0) {
+				rv = vm_fault_lock_vnode(&fs);
+				MPASS(rv == KERN_SUCCESS ||
+				    rv == KERN_RESOURCE_SHORTAGE);
+				if (rv == KERN_RESOURCE_SHORTAGE)
+					goto RetryFault;
+			}
 			if (fs.pindex >= fs.object->size) {
 				unlock_and_deallocate(&fs);
 				return (KERN_OUT_OF_BOUNDS);
 			}
 
 			if (fs.object == fs.first_object &&
 			    (fs.first_object->flags & OBJ_POPULATE) != 0 &&
 			    fs.first_object->shadow_count == 0) {
 				rv = vm_fault_populate(&fs, prot, fault_type,
 				    fault_flags, wired, m_hold);
 				switch (rv) {
 				case KERN_SUCCESS:
 				case KERN_FAILURE:
 					unlock_and_deallocate(&fs);
 					return (rv);
 				case KERN_RESOURCE_SHORTAGE:
 					unlock_and_deallocate(&fs);
 					goto RetryFault;
 				case KERN_NOT_RECEIVER:
 					/*
 					 * Pager's populate() method
 					 * returned VM_PAGER_BAD.
 					 */
 					break;
 				default:
 					panic("inconsistent return codes");
 				}
 			}
 
 			/*
 			 * Allocate a new page for this object/offset pair.
 			 *
 			 * Unlocked read of the p_flag is harmless. At
 			 * worst, the P_KILLED might be not observed
 			 * there, and allocation can fail, causing
 			 * restart and new reading of the p_flag.
 			 */
 			dset = fs.object->domain.dr_policy;
 			if (dset == NULL)
 				dset = curthread->td_domain.dr_policy;
 			if (!vm_page_count_severe_set(&dset->ds_mask) ||
 			    P_KILLED(curproc)) {
 #if VM_NRESERVLEVEL > 0
 				vm_object_color(fs.object, atop(vaddr) -
 				    fs.pindex);
 #endif
 				alloc_req = P_KILLED(curproc) ?
 				    VM_ALLOC_SYSTEM : VM_ALLOC_NORMAL;
 				if (fs.object->type != OBJT_VNODE &&
 				    fs.object->backing_object == NULL)
 					alloc_req |= VM_ALLOC_ZERO;
 				fs.m = vm_page_alloc(fs.object, fs.pindex,
 				    alloc_req);
 			}
 			if (fs.m == NULL) {
 				unlock_and_deallocate(&fs);
 				if (vm_pfault_oom_attempts < 0 ||
 				    oom < vm_pfault_oom_attempts) {
 					oom++;
 					vm_waitpfault(dset,
 					    vm_pfault_oom_wait * hz);
 					goto RetryFault_oom;
 				}
 				if (bootverbose)
 					printf(
 	"proc %d (%s) failed to alloc page on fault, starting OOM\n",
 					    curproc->p_pid, curproc->p_comm);
 				vm_pageout_oom(VM_OOM_MEM_PF);
 				goto RetryFault;
 			}
 		}
 
 readrest:
 		/*
 		 * At this point, we have either allocated a new page or found
 		 * an existing page that is only partially valid.
 		 *
 		 * We hold a reference on the current object and the page is
 		 * exclusive busied.
 		 */
 
 		/*
 		 * If the pager for the current object might have the page,
 		 * then determine the number of additional pages to read and
 		 * potentially reprioritize previously read pages for earlier
 		 * reclamation.  These operations should only be performed
 		 * once per page fault.  Even if the current pager doesn't
 		 * have the page, the number of additional pages to read will
 		 * apply to subsequent objects in the shadow chain.
 		 */
 		if (fs.object->type != OBJT_DEFAULT && nera == -1 &&
 		    !P_KILLED(curproc)) {
 			KASSERT(fs.lookup_still_valid, ("map unlocked"));
 			era = fs.entry->read_ahead;
 			behavior = vm_map_entry_behavior(fs.entry);
 			if (behavior == MAP_ENTRY_BEHAV_RANDOM) {
 				nera = 0;
 			} else if (behavior == MAP_ENTRY_BEHAV_SEQUENTIAL) {
 				nera = VM_FAULT_READ_AHEAD_MAX;
 				if (vaddr == fs.entry->next_read)
 					vm_fault_dontneed(&fs, vaddr, nera);
 			} else if (vaddr == fs.entry->next_read) {
 				/*
 				 * This is a sequential fault.  Arithmetically
 				 * increase the requested number of pages in
 				 * the read-ahead window.  The requested
 				 * number of pages is "# of sequential faults
 				 * x (read ahead min + 1) + read ahead min"
 				 */
 				nera = VM_FAULT_READ_AHEAD_MIN;
 				if (era > 0) {
 					nera += era + 1;
 					if (nera > VM_FAULT_READ_AHEAD_MAX)
 						nera = VM_FAULT_READ_AHEAD_MAX;
 				}
 				if (era == VM_FAULT_READ_AHEAD_MAX)
 					vm_fault_dontneed(&fs, vaddr, nera);
 			} else {
 				/*
 				 * This is a non-sequential fault.
 				 */
 				nera = 0;
 			}
 			if (era != nera) {
 				/*
 				 * A read lock on the map suffices to update
 				 * the read ahead count safely.
 				 */
 				fs.entry->read_ahead = nera;
 			}
 
 			/*
 			 * Prepare for unlocking the map.  Save the map
 			 * entry's start and end addresses, which are used to
 			 * optimize the size of the pager operation below.
 			 * Even if the map entry's addresses change after
 			 * unlocking the map, using the saved addresses is
 			 * safe.
 			 */
 			e_start = fs.entry->start;
 			e_end = fs.entry->end;
 		}
 
 		/*
 		 * Call the pager to retrieve the page if there is a chance
 		 * that the pager has it, and potentially retrieve additional
 		 * pages at the same time.
 		 */
 		if (fs.object->type != OBJT_DEFAULT) {
 			/*
 			 * Release the map lock before locking the vnode or
 			 * sleeping in the pager.  (If the current object has
 			 * a shadow, then an earlier iteration of this loop
 			 * may have already unlocked the map.)
 			 */
 			unlock_map(&fs);
 
 			rv = vm_fault_lock_vnode(&fs);
 			MPASS(rv == KERN_SUCCESS ||
 			    rv == KERN_RESOURCE_SHORTAGE);
 			if (rv == KERN_RESOURCE_SHORTAGE)
 				goto RetryFault;
 			KASSERT(fs.vp == NULL || !fs.map->system_map,
 			    ("vm_fault: vnode-backed object mapped by system map"));
 
 			/*
 			 * Page in the requested page and hint the pager,
 			 * that it may bring up surrounding pages.
 			 */
 			if (nera == -1 || behavior == MAP_ENTRY_BEHAV_RANDOM ||
 			    P_KILLED(curproc)) {
 				behind = 0;
 				ahead = 0;
 			} else {
 				/* Is this a sequential fault? */
 				if (nera > 0) {
 					behind = 0;
 					ahead = nera;
 				} else {
 					/*
 					 * Request a cluster of pages that is
 					 * aligned to a VM_FAULT_READ_DEFAULT
 					 * page offset boundary within the
 					 * object.  Alignment to a page offset
 					 * boundary is more likely to coincide
 					 * with the underlying file system
 					 * block than alignment to a virtual
 					 * address boundary.
 					 */
 					cluster_offset = fs.pindex %
 					    VM_FAULT_READ_DEFAULT;
 					behind = ulmin(cluster_offset,
 					    atop(vaddr - e_start));
 					ahead = VM_FAULT_READ_DEFAULT - 1 -
 					    cluster_offset;
 				}
 				ahead = ulmin(ahead, atop(e_end - vaddr) - 1);
 			}
 			rv = vm_pager_get_pages(fs.object, &fs.m, 1,
 			    &behind, &ahead);
 			if (rv == VM_PAGER_OK) {
 				faultcount = behind + 1 + ahead;
 				hardfault = true;
 				break; /* break to PAGE HAS BEEN FOUND */
 			}
 			if (rv == VM_PAGER_ERROR)
 				printf("vm_fault: pager read error, pid %d (%s)\n",
 				    curproc->p_pid, curproc->p_comm);
 
 			/*
 			 * If an I/O error occurred or the requested page was
 			 * outside the range of the pager, clean up and return
 			 * an error.
 			 */
 			if (rv == VM_PAGER_ERROR || rv == VM_PAGER_BAD) {
 				vm_page_lock(fs.m);
 				if (!vm_page_wired(fs.m))
 					vm_page_free(fs.m);
 				else
 					vm_page_xunbusy_maybelocked(fs.m);
 				vm_page_unlock(fs.m);
 				fs.m = NULL;
 				unlock_and_deallocate(&fs);
 				return (KERN_OUT_OF_BOUNDS);
 			}
 
 			/*
 			 * The requested page does not exist at this object/
 			 * offset.  Remove the invalid page from the object,
 			 * waking up anyone waiting for it, and continue on to
 			 * the next object.  However, if this is the top-level
 			 * object, we must leave the busy page in place to
 			 * prevent another process from rushing past us, and
 			 * inserting the page in that object at the same time
 			 * that we are.
 			 */
 			if (fs.object != fs.first_object) {
 				vm_page_lock(fs.m);
 				if (!vm_page_wired(fs.m))
 					vm_page_free(fs.m);
 				else
 					vm_page_xunbusy_maybelocked(fs.m);
 				vm_page_unlock(fs.m);
 				fs.m = NULL;
 			}
 		}
 
 		/*
 		 * We get here if the object has default pager (or unwiring) 
 		 * or the pager doesn't have the page.
 		 */
 		if (fs.object == fs.first_object)
 			fs.first_m = fs.m;
 
 		/*
 		 * Move on to the next object.  Lock the next object before
 		 * unlocking the current one.
 		 */
 		next_object = fs.object->backing_object;
 		if (next_object == NULL) {
 			/*
 			 * If there's no object left, fill the page in the top
 			 * object with zeros.
 			 */
 			if (fs.object != fs.first_object) {
 				vm_object_pip_wakeup(fs.object);
 				VM_OBJECT_WUNLOCK(fs.object);
 
 				fs.object = fs.first_object;
 				fs.pindex = fs.first_pindex;
 				fs.m = fs.first_m;
 				VM_OBJECT_WLOCK(fs.object);
 			}
 			fs.first_m = NULL;
 
 			/*
 			 * Zero the page if necessary and mark it valid.
 			 */
 			if ((fs.m->flags & PG_ZERO) == 0) {
 				pmap_zero_page(fs.m);
 			} else {
 				VM_CNT_INC(v_ozfod);
 			}
 			VM_CNT_INC(v_zfod);
 			fs.m->valid = VM_PAGE_BITS_ALL;
 			/* Don't try to prefault neighboring pages. */
 			faultcount = 1;
 			break;	/* break to PAGE HAS BEEN FOUND */
 		} else {
 			KASSERT(fs.object != next_object,
 			    ("object loop %p", next_object));
 			VM_OBJECT_WLOCK(next_object);
 			vm_object_pip_add(next_object, 1);
 			if (fs.object != fs.first_object)
 				vm_object_pip_wakeup(fs.object);
 			fs.pindex +=
 			    OFF_TO_IDX(fs.object->backing_object_offset);
 			VM_OBJECT_WUNLOCK(fs.object);
 			fs.object = next_object;
 		}
 	}
 
 	vm_page_assert_xbusied(fs.m);
 
 	/*
 	 * PAGE HAS BEEN FOUND. [Loop invariant still holds -- the object lock
 	 * is held.]
 	 */
 
 	/*
 	 * If the page is being written, but isn't already owned by the
 	 * top-level object, we have to copy it into a new page owned by the
 	 * top-level object.
 	 */
 	if (fs.object != fs.first_object) {
 		/*
 		 * We only really need to copy if we want to write it.
 		 */
 		if ((fault_type & (VM_PROT_COPY | VM_PROT_WRITE)) != 0) {
 			/*
 			 * This allows pages to be virtually copied from a 
 			 * backing_object into the first_object, where the 
 			 * backing object has no other refs to it, and cannot
 			 * gain any more refs.  Instead of a bcopy, we just 
 			 * move the page from the backing object to the 
 			 * first object.  Note that we must mark the page 
 			 * dirty in the first object so that it will go out 
 			 * to swap when needed.
 			 */
 			is_first_object_locked = false;
 			if (
 				/*
 				 * Only one shadow object
 				 */
 				(fs.object->shadow_count == 1) &&
 				/*
 				 * No COW refs, except us
 				 */
 				(fs.object->ref_count == 1) &&
 				/*
 				 * No one else can look this object up
 				 */
 				(fs.object->handle == NULL) &&
 				/*
 				 * No other ways to look the object up
 				 */
 				((fs.object->type == OBJT_DEFAULT) ||
 				 (fs.object->type == OBJT_SWAP)) &&
 			    (is_first_object_locked = VM_OBJECT_TRYWLOCK(fs.first_object)) &&
 				/*
 				 * We don't chase down the shadow chain
 				 */
 			    fs.object == fs.first_object->backing_object) {
 				vm_page_lock(fs.m);
 				vm_page_dequeue(fs.m);
 				(void)vm_page_remove(fs.m);
 				vm_page_unlock(fs.m);
 				vm_page_lock(fs.first_m);
 				vm_page_replace_checked(fs.m, fs.first_object,
 				    fs.first_pindex, fs.first_m);
 				vm_page_free(fs.first_m);
 				vm_page_unlock(fs.first_m);
 				vm_page_dirty(fs.m);
 #if VM_NRESERVLEVEL > 0
 				/*
 				 * Rename the reservation.
 				 */
 				vm_reserv_rename(fs.m, fs.first_object,
 				    fs.object, OFF_TO_IDX(
 				    fs.first_object->backing_object_offset));
 #endif
 				/*
 				 * Removing the page from the backing object
 				 * unbusied it.
 				 */
 				vm_page_xbusy(fs.m);
 				fs.first_m = fs.m;
 				fs.m = NULL;
 				VM_CNT_INC(v_cow_optim);
 			} else {
 				/*
 				 * Oh, well, lets copy it.
 				 */
 				pmap_copy_page(fs.m, fs.first_m);
 				fs.first_m->valid = VM_PAGE_BITS_ALL;
 				if (wired && (fault_flags &
 				    VM_FAULT_WIRE) == 0) {
 					vm_page_lock(fs.first_m);
 					vm_page_wire(fs.first_m);
 					vm_page_unlock(fs.first_m);
 					
 					vm_page_lock(fs.m);
 					vm_page_unwire(fs.m, PQ_INACTIVE);
 					vm_page_unlock(fs.m);
 				}
 				/*
 				 * We no longer need the old page or object.
 				 */
 				release_page(&fs);
 			}
 			/*
 			 * fs.object != fs.first_object due to above 
 			 * conditional
 			 */
 			vm_object_pip_wakeup(fs.object);
 			VM_OBJECT_WUNLOCK(fs.object);
 			/*
 			 * Only use the new page below...
 			 */
 			fs.object = fs.first_object;
 			fs.pindex = fs.first_pindex;
 			fs.m = fs.first_m;
 			if (!is_first_object_locked)
 				VM_OBJECT_WLOCK(fs.object);
 			VM_CNT_INC(v_cow_faults);
 			curthread->td_cow++;
 		} else {
 			prot &= ~VM_PROT_WRITE;
 		}
 	}
 
 	/*
 	 * We must verify that the maps have not changed since our last
 	 * lookup.
 	 */
 	if (!fs.lookup_still_valid) {
 		if (!vm_map_trylock_read(fs.map)) {
 			release_page(&fs);
 			unlock_and_deallocate(&fs);
 			goto RetryFault;
 		}
 		fs.lookup_still_valid = true;
 		if (fs.map->timestamp != fs.map_generation) {
 			result = vm_map_lookup_locked(&fs.map, vaddr, fault_type,
 			    &fs.entry, &retry_object, &retry_pindex, &retry_prot, &wired);
 
 			/*
 			 * If we don't need the page any longer, put it on the inactive
 			 * list (the easiest thing to do here).  If no one needs it,
 			 * pageout will grab it eventually.
 			 */
 			if (result != KERN_SUCCESS) {
 				release_page(&fs);
 				unlock_and_deallocate(&fs);
 
 				/*
 				 * If retry of map lookup would have blocked then
 				 * retry fault from start.
 				 */
 				if (result == KERN_FAILURE)
 					goto RetryFault;
 				return (result);
 			}
 			if ((retry_object != fs.first_object) ||
 			    (retry_pindex != fs.first_pindex)) {
 				release_page(&fs);
 				unlock_and_deallocate(&fs);
 				goto RetryFault;
 			}
 
 			/*
 			 * Check whether the protection has changed or the object has
 			 * been copied while we left the map unlocked. Changing from
 			 * read to write permission is OK - we leave the page
 			 * write-protected, and catch the write fault. Changing from
 			 * write to read permission means that we can't mark the page
 			 * write-enabled after all.
 			 */
 			prot &= retry_prot;
 			fault_type &= retry_prot;
 			if (prot == 0) {
 				release_page(&fs);
 				unlock_and_deallocate(&fs);
 				goto RetryFault;
 			}
 
 			/* Reassert because wired may have changed. */
 			KASSERT(wired || (fault_flags & VM_FAULT_WIRE) == 0,
 			    ("!wired && VM_FAULT_WIRE"));
 		}
 	}
 
 	/*
 	 * If the page was filled by a pager, save the virtual address that
 	 * should be faulted on next under a sequential access pattern to the
 	 * map entry.  A read lock on the map suffices to update this address
 	 * safely.
 	 */
 	if (hardfault)
 		fs.entry->next_read = vaddr + ptoa(ahead) + PAGE_SIZE;
 
 	vm_fault_dirty(fs.entry, fs.m, prot, fault_type, fault_flags, true);
 	vm_page_assert_xbusied(fs.m);
 
 	/*
 	 * Page must be completely valid or it is not fit to
 	 * map into user space.  vm_pager_get_pages() ensures this.
 	 */
 	KASSERT(fs.m->valid == VM_PAGE_BITS_ALL,
 	    ("vm_fault: page %p partially invalid", fs.m));
 	VM_OBJECT_WUNLOCK(fs.object);
 
 	/*
 	 * Put this page into the physical map.  We had to do the unlock above
 	 * because pmap_enter() may sleep.  We don't put the page
 	 * back on the active queue until later so that the pageout daemon
 	 * won't find it (yet).
 	 */
 	pmap_enter(fs.map->pmap, vaddr, fs.m, prot,
 	    fault_type | (wired ? PMAP_ENTER_WIRED : 0), 0);
 	if (faultcount != 1 && (fault_flags & VM_FAULT_WIRE) == 0 &&
 	    wired == 0)
 		vm_fault_prefault(&fs, vaddr,
 		    faultcount > 0 ? behind : PFBAK,
 		    faultcount > 0 ? ahead : PFFOR, false);
 	VM_OBJECT_WLOCK(fs.object);
 	vm_page_lock(fs.m);
 
 	/*
 	 * If the page is not wired down, then put it where the pageout daemon
 	 * can find it.
 	 */
 	if ((fault_flags & VM_FAULT_WIRE) != 0)
 		vm_page_wire(fs.m);
 	else
 		vm_page_activate(fs.m);
 	if (m_hold != NULL) {
 		*m_hold = fs.m;
 		vm_page_hold(fs.m);
 	}
 	vm_page_unlock(fs.m);
 	vm_page_xunbusy(fs.m);
 
 	/*
 	 * Unlock everything, and return
 	 */
 	unlock_and_deallocate(&fs);
 	if (hardfault) {
 		VM_CNT_INC(v_io_faults);
 		curthread->td_ru.ru_majflt++;
 #ifdef RACCT
 		if (racct_enable && fs.object->type == OBJT_VNODE) {
 			PROC_LOCK(curproc);
 			if ((fault_type & (VM_PROT_COPY | VM_PROT_WRITE)) != 0) {
 				racct_add_force(curproc, RACCT_WRITEBPS,
 				    PAGE_SIZE + behind * PAGE_SIZE);
 				racct_add_force(curproc, RACCT_WRITEIOPS, 1);
 			} else {
 				racct_add_force(curproc, RACCT_READBPS,
 				    PAGE_SIZE + ahead * PAGE_SIZE);
 				racct_add_force(curproc, RACCT_READIOPS, 1);
 			}
 			PROC_UNLOCK(curproc);
 		}
 #endif
 	} else 
 		curthread->td_ru.ru_minflt++;
 
 	return (KERN_SUCCESS);
 }
 
 /*
  * Speed up the reclamation of pages that precede the faulting pindex within
  * the first object of the shadow chain.  Essentially, perform the equivalent
  * to madvise(..., MADV_DONTNEED) on a large cluster of pages that precedes
  * the faulting pindex by the cluster size when the pages read by vm_fault()
  * cross a cluster-size boundary.  The cluster size is the greater of the
  * smallest superpage size and VM_FAULT_DONTNEED_MIN.
  *
  * When "fs->first_object" is a shadow object, the pages in the backing object
  * that precede the faulting pindex are deactivated by vm_fault().  So, this
  * function must only be concerned with pages in the first object.
  */
 static void
 vm_fault_dontneed(const struct faultstate *fs, vm_offset_t vaddr, int ahead)
 {
 	vm_map_entry_t entry;
 	vm_object_t first_object, object;
 	vm_offset_t end, start;
 	vm_page_t m, m_next;
 	vm_pindex_t pend, pstart;
 	vm_size_t size;
 
 	object = fs->object;
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	first_object = fs->first_object;
 	if (first_object != object) {
 		if (!VM_OBJECT_TRYWLOCK(first_object)) {
 			VM_OBJECT_WUNLOCK(object);
 			VM_OBJECT_WLOCK(first_object);
 			VM_OBJECT_WLOCK(object);
 		}
 	}
 	/* Neither fictitious nor unmanaged pages can be reclaimed. */
 	if ((first_object->flags & (OBJ_FICTITIOUS | OBJ_UNMANAGED)) == 0) {
 		size = VM_FAULT_DONTNEED_MIN;
 		if (MAXPAGESIZES > 1 && size < pagesizes[1])
 			size = pagesizes[1];
 		end = rounddown2(vaddr, size);
 		if (vaddr - end >= size - PAGE_SIZE - ptoa(ahead) &&
 		    (entry = fs->entry)->start < end) {
 			if (end - entry->start < size)
 				start = entry->start;
 			else
 				start = end - size;
 			pmap_advise(fs->map->pmap, start, end, MADV_DONTNEED);
 			pstart = OFF_TO_IDX(entry->offset) + atop(start -
 			    entry->start);
 			m_next = vm_page_find_least(first_object, pstart);
 			pend = OFF_TO_IDX(entry->offset) + atop(end -
 			    entry->start);
 			while ((m = m_next) != NULL && m->pindex < pend) {
 				m_next = TAILQ_NEXT(m, listq);
 				if (m->valid != VM_PAGE_BITS_ALL ||
 				    vm_page_busied(m))
 					continue;
 
 				/*
 				 * Don't clear PGA_REFERENCED, since it would
 				 * likely represent a reference by a different
 				 * process.
 				 *
 				 * Typically, at this point, prefetched pages
 				 * are still in the inactive queue.  Only
 				 * pages that triggered page faults are in the
 				 * active queue.
 				 */
 				vm_page_lock(m);
 				if (!vm_page_inactive(m))
 					vm_page_deactivate(m);
 				vm_page_unlock(m);
 			}
 		}
 	}
 	if (first_object != object)
 		VM_OBJECT_WUNLOCK(first_object);
 }
 
 /*
  * vm_fault_prefault provides a quick way of clustering
  * pagefaults into a processes address space.  It is a "cousin"
  * of vm_map_pmap_enter, except it runs at page fault time instead
  * of mmap time.
  */
 static void
 vm_fault_prefault(const struct faultstate *fs, vm_offset_t addra,
     int backward, int forward, bool obj_locked)
 {
 	pmap_t pmap;
 	vm_map_entry_t entry;
 	vm_object_t backing_object, lobject;
 	vm_offset_t addr, starta;
 	vm_pindex_t pindex;
 	vm_page_t m;
 	int i;
 
 	pmap = fs->map->pmap;
 	if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace))
 		return;
 
 	entry = fs->entry;
 
 	if (addra < backward * PAGE_SIZE) {
 		starta = entry->start;
 	} else {
 		starta = addra - backward * PAGE_SIZE;
 		if (starta < entry->start)
 			starta = entry->start;
 	}
 
 	/*
 	 * Generate the sequence of virtual addresses that are candidates for
 	 * prefaulting in an outward spiral from the faulting virtual address,
 	 * "addra".  Specifically, the sequence is "addra - PAGE_SIZE", "addra
 	 * + PAGE_SIZE", "addra - 2 * PAGE_SIZE", "addra + 2 * PAGE_SIZE", ...
 	 * If the candidate address doesn't have a backing physical page, then
 	 * the loop immediately terminates.
 	 */
 	for (i = 0; i < 2 * imax(backward, forward); i++) {
 		addr = addra + ((i >> 1) + 1) * ((i & 1) == 0 ? -PAGE_SIZE :
 		    PAGE_SIZE);
 		if (addr > addra + forward * PAGE_SIZE)
 			addr = 0;
 
 		if (addr < starta || addr >= entry->end)
 			continue;
 
 		if (!pmap_is_prefaultable(pmap, addr))
 			continue;
 
 		pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT;
 		lobject = entry->object.vm_object;
 		if (!obj_locked)
 			VM_OBJECT_RLOCK(lobject);
 		while ((m = vm_page_lookup(lobject, pindex)) == NULL &&
 		    lobject->type == OBJT_DEFAULT &&
 		    (backing_object = lobject->backing_object) != NULL) {
 			KASSERT((lobject->backing_object_offset & PAGE_MASK) ==
 			    0, ("vm_fault_prefault: unaligned object offset"));
 			pindex += lobject->backing_object_offset >> PAGE_SHIFT;
 			VM_OBJECT_RLOCK(backing_object);
 			if (!obj_locked || lobject != entry->object.vm_object)
 				VM_OBJECT_RUNLOCK(lobject);
 			lobject = backing_object;
 		}
 		if (m == NULL) {
 			if (!obj_locked || lobject != entry->object.vm_object)
 				VM_OBJECT_RUNLOCK(lobject);
 			break;
 		}
 		if (m->valid == VM_PAGE_BITS_ALL &&
 		    (m->flags & PG_FICTITIOUS) == 0)
 			pmap_enter_quick(pmap, addr, m, entry->protection);
 		if (!obj_locked || lobject != entry->object.vm_object)
 			VM_OBJECT_RUNLOCK(lobject);
 	}
 }
 
 /*
  * Hold each of the physical pages that are mapped by the specified range of
  * virtual addresses, ["addr", "addr" + "len"), if those mappings are valid
  * and allow the specified types of access, "prot".  If all of the implied
  * pages are successfully held, then the number of held pages is returned
  * together with pointers to those pages in the array "ma".  However, if any
  * of the pages cannot be held, -1 is returned.
  */
 int
 vm_fault_quick_hold_pages(vm_map_t map, vm_offset_t addr, vm_size_t len,
     vm_prot_t prot, vm_page_t *ma, int max_count)
 {
 	vm_offset_t end, va;
 	vm_page_t *mp;
 	int count;
 	boolean_t pmap_failed;
 
 	if (len == 0)
 		return (0);
 	end = round_page(addr + len);
 	addr = trunc_page(addr);
 
 	/*
 	 * Check for illegal addresses.
 	 */
 	if (addr < vm_map_min(map) || addr > end || end > vm_map_max(map))
 		return (-1);
 
 	if (atop(end - addr) > max_count)
 		panic("vm_fault_quick_hold_pages: count > max_count");
 	count = atop(end - addr);
 
 	/*
 	 * Most likely, the physical pages are resident in the pmap, so it is
 	 * faster to try pmap_extract_and_hold() first.
 	 */
 	pmap_failed = FALSE;
 	for (mp = ma, va = addr; va < end; mp++, va += PAGE_SIZE) {
 		*mp = pmap_extract_and_hold(map->pmap, va, prot);
 		if (*mp == NULL)
 			pmap_failed = TRUE;
 		else if ((prot & VM_PROT_WRITE) != 0 &&
 		    (*mp)->dirty != VM_PAGE_BITS_ALL) {
 			/*
 			 * Explicitly dirty the physical page.  Otherwise, the
 			 * caller's changes may go unnoticed because they are
 			 * performed through an unmanaged mapping or by a DMA
 			 * operation.
 			 *
 			 * The object lock is not held here.
 			 * See vm_page_clear_dirty_mask().
 			 */
 			vm_page_dirty(*mp);
 		}
 	}
 	if (pmap_failed) {
 		/*
 		 * One or more pages could not be held by the pmap.  Either no
 		 * page was mapped at the specified virtual address or that
 		 * mapping had insufficient permissions.  Attempt to fault in
 		 * and hold these pages.
 		 *
 		 * If vm_fault_disable_pagefaults() was called,
 		 * i.e., TDP_NOFAULTING is set, we must not sleep nor
 		 * acquire MD VM locks, which means we must not call
 		 * vm_fault().  Some (out of tree) callers mark
 		 * too wide a code area with vm_fault_disable_pagefaults()
 		 * already, use the VM_PROT_QUICK_NOFAULT flag to request
 		 * the proper behaviour explicitly.
 		 */
 		if ((prot & VM_PROT_QUICK_NOFAULT) != 0 &&
 		    (curthread->td_pflags & TDP_NOFAULTING) != 0)
 			goto error;
 		for (mp = ma, va = addr; va < end; mp++, va += PAGE_SIZE)
 			if (*mp == NULL && vm_fault(map, va, prot,
 			    VM_FAULT_NORMAL, mp) != KERN_SUCCESS)
 				goto error;
 	}
 	return (count);
 error:	
 	for (mp = ma; mp < ma + count; mp++)
 		if (*mp != NULL) {
 			vm_page_lock(*mp);
 			vm_page_unhold(*mp);
 			vm_page_unlock(*mp);
 		}
 	return (-1);
 }
 
 /*
  *	Routine:
  *		vm_fault_copy_entry
  *	Function:
  *		Create new shadow object backing dst_entry with private copy of
  *		all underlying pages. When src_entry is equal to dst_entry,
  *		function implements COW for wired-down map entry. Otherwise,
  *		it forks wired entry into dst_map.
  *
  *	In/out conditions:
  *		The source and destination maps must be locked for write.
  *		The source map entry must be wired down (or be a sharing map
  *		entry corresponding to a main map entry that is wired down).
  */
 void
 vm_fault_copy_entry(vm_map_t dst_map, vm_map_t src_map,
     vm_map_entry_t dst_entry, vm_map_entry_t src_entry,
     vm_ooffset_t *fork_charge)
 {
 	vm_object_t backing_object, dst_object, object, src_object;
 	vm_pindex_t dst_pindex, pindex, src_pindex;
 	vm_prot_t access, prot;
 	vm_offset_t vaddr;
 	vm_page_t dst_m;
 	vm_page_t src_m;
 	boolean_t upgrade;
 
 #ifdef	lint
 	src_map++;
 #endif	/* lint */
 
 	upgrade = src_entry == dst_entry;
 	access = prot = dst_entry->protection;
 
 	src_object = src_entry->object.vm_object;
 	src_pindex = OFF_TO_IDX(src_entry->offset);
 
 	if (upgrade && (dst_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) {
 		dst_object = src_object;
 		vm_object_reference(dst_object);
 	} else {
 		/*
 		 * Create the top-level object for the destination entry. (Doesn't
 		 * actually shadow anything - we copy the pages directly.)
 		 */
 		dst_object = vm_object_allocate(OBJT_DEFAULT,
 		    atop(dst_entry->end - dst_entry->start));
 #if VM_NRESERVLEVEL > 0
 		dst_object->flags |= OBJ_COLORED;
 		dst_object->pg_color = atop(dst_entry->start);
 #endif
 		dst_object->domain = src_object->domain;
 		dst_object->charge = dst_entry->end - dst_entry->start;
 	}
 
 	VM_OBJECT_WLOCK(dst_object);
 	KASSERT(upgrade || dst_entry->object.vm_object == NULL,
 	    ("vm_fault_copy_entry: vm_object not NULL"));
 	if (src_object != dst_object) {
 		dst_entry->object.vm_object = dst_object;
 		dst_entry->offset = 0;
 		dst_entry->eflags &= ~MAP_ENTRY_VN_EXEC;
 	}
 	if (fork_charge != NULL) {
 		KASSERT(dst_entry->cred == NULL,
 		    ("vm_fault_copy_entry: leaked swp charge"));
 		dst_object->cred = curthread->td_ucred;
 		crhold(dst_object->cred);
 		*fork_charge += dst_object->charge;
 	} else if ((dst_object->type == OBJT_DEFAULT ||
 	    dst_object->type == OBJT_SWAP) &&
 	    dst_object->cred == NULL) {
 		KASSERT(dst_entry->cred != NULL, ("no cred for entry %p",
 		    dst_entry));
 		dst_object->cred = dst_entry->cred;
 		dst_entry->cred = NULL;
 	}
 
 	/*
 	 * If not an upgrade, then enter the mappings in the pmap as
 	 * read and/or execute accesses.  Otherwise, enter them as
 	 * write accesses.
 	 *
 	 * A writeable large page mapping is only created if all of
 	 * the constituent small page mappings are modified. Marking
 	 * PTEs as modified on inception allows promotion to happen
 	 * without taking potentially large number of soft faults.
 	 */
 	if (!upgrade)
 		access &= ~VM_PROT_WRITE;
 
 	/*
 	 * Loop through all of the virtual pages within the entry's
 	 * range, copying each page from the source object to the
 	 * destination object.  Since the source is wired, those pages
 	 * must exist.  In contrast, the destination is pageable.
 	 * Since the destination object doesn't share any backing storage
 	 * with the source object, all of its pages must be dirtied,
 	 * regardless of whether they can be written.
 	 */
 	for (vaddr = dst_entry->start, dst_pindex = 0;
 	    vaddr < dst_entry->end;
 	    vaddr += PAGE_SIZE, dst_pindex++) {
 again:
 		/*
 		 * Find the page in the source object, and copy it in.
 		 * Because the source is wired down, the page will be
 		 * in memory.
 		 */
 		if (src_object != dst_object)
 			VM_OBJECT_RLOCK(src_object);
 		object = src_object;
 		pindex = src_pindex + dst_pindex;
 		while ((src_m = vm_page_lookup(object, pindex)) == NULL &&
 		    (backing_object = object->backing_object) != NULL) {
 			/*
 			 * Unless the source mapping is read-only or
 			 * it is presently being upgraded from
 			 * read-only, the first object in the shadow
 			 * chain should provide all of the pages.  In
 			 * other words, this loop body should never be
 			 * executed when the source mapping is already
 			 * read/write.
 			 */
 			KASSERT((src_entry->protection & VM_PROT_WRITE) == 0 ||
 			    upgrade,
 			    ("vm_fault_copy_entry: main object missing page"));
 
 			VM_OBJECT_RLOCK(backing_object);
 			pindex += OFF_TO_IDX(object->backing_object_offset);
 			if (object != dst_object)
 				VM_OBJECT_RUNLOCK(object);
 			object = backing_object;
 		}
 		KASSERT(src_m != NULL, ("vm_fault_copy_entry: page missing"));
 
 		if (object != dst_object) {
 			/*
 			 * Allocate a page in the destination object.
 			 */
 			dst_m = vm_page_alloc(dst_object, (src_object ==
 			    dst_object ? src_pindex : 0) + dst_pindex,
 			    VM_ALLOC_NORMAL);
 			if (dst_m == NULL) {
 				VM_OBJECT_WUNLOCK(dst_object);
 				VM_OBJECT_RUNLOCK(object);
 				vm_wait(dst_object);
 				VM_OBJECT_WLOCK(dst_object);
 				goto again;
 			}
 			pmap_copy_page(src_m, dst_m);
 			VM_OBJECT_RUNLOCK(object);
 			dst_m->dirty = dst_m->valid = src_m->valid;
 		} else {
 			dst_m = src_m;
 			if (vm_page_sleep_if_busy(dst_m, "fltupg"))
 				goto again;
 			if (dst_m->pindex >= dst_object->size)
 				/*
 				 * We are upgrading.  Index can occur
 				 * out of bounds if the object type is
 				 * vnode and the file was truncated.
 				 */
 				break;
 			vm_page_xbusy(dst_m);
 		}
 		VM_OBJECT_WUNLOCK(dst_object);
 
 		/*
 		 * Enter it in the pmap. If a wired, copy-on-write
 		 * mapping is being replaced by a write-enabled
 		 * mapping, then wire that new mapping.
 		 *
 		 * The page can be invalid if the user called
 		 * msync(MS_INVALIDATE) or truncated the backing vnode
 		 * or shared memory object.  In this case, do not
 		 * insert it into pmap, but still do the copy so that
 		 * all copies of the wired map entry have similar
 		 * backing pages.
 		 */
 		if (dst_m->valid == VM_PAGE_BITS_ALL) {
 			pmap_enter(dst_map->pmap, vaddr, dst_m, prot,
 			    access | (upgrade ? PMAP_ENTER_WIRED : 0), 0);
 		}
 
 		/*
 		 * Mark it no longer busy, and put it on the active list.
 		 */
 		VM_OBJECT_WLOCK(dst_object);
 		
 		if (upgrade) {
 			if (src_m != dst_m) {
 				vm_page_lock(src_m);
 				vm_page_unwire(src_m, PQ_INACTIVE);
 				vm_page_unlock(src_m);
 				vm_page_lock(dst_m);
 				vm_page_wire(dst_m);
 				vm_page_unlock(dst_m);
 			} else {
 				KASSERT(vm_page_wired(dst_m),
 				    ("dst_m %p is not wired", dst_m));
 			}
 		} else {
 			vm_page_lock(dst_m);
 			vm_page_activate(dst_m);
 			vm_page_unlock(dst_m);
 		}
 		vm_page_xunbusy(dst_m);
 	}
 	VM_OBJECT_WUNLOCK(dst_object);
 	if (upgrade) {
 		dst_entry->eflags &= ~(MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY);
 		vm_object_deallocate(src_object);
 	}
 }
 
 /*
  * Block entry into the machine-independent layer's page fault handler by
  * the calling thread.  Subsequent calls to vm_fault() by that thread will
  * return KERN_PROTECTION_FAILURE.  Enable machine-dependent handling of
  * spurious page faults. 
  */
 int
 vm_fault_disable_pagefaults(void)
 {
 
 	return (curthread_pflags_set(TDP_NOFAULTING | TDP_RESETSPUR));
 }
 
 void
 vm_fault_enable_pagefaults(int save)
 {
 
 	curthread_pflags_restore(save);
 }
Index: stable/12/sys/vm/vm_object.h
===================================================================
--- stable/12/sys/vm/vm_object.h	(revision 354393)
+++ stable/12/sys/vm/vm_object.h	(revision 354394)
@@ -1,347 +1,348 @@
 /*-
  * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU)
  *
  * Copyright (c) 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * The Mach Operating System project at Carnegie-Mellon University.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)vm_object.h	8.3 (Berkeley) 1/12/94
  *
  *
  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
  * All rights reserved.
  *
  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
  *
  * Permission to use, copy, modify and distribute this software and
  * its documentation is hereby granted, provided that both the copyright
  * notice and this permission notice appear in all copies of the
  * software, derivative works or modified versions, and any portions
  * thereof, and that both notices appear in supporting documentation.
  *
  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  *
  * Carnegie Mellon requests users of this software to return to
  *
  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  *  School of Computer Science
  *  Carnegie Mellon University
  *  Pittsburgh PA 15213-3890
  *
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  *
  * $FreeBSD$
  */
 
 /*
  *	Virtual memory object module definitions.
  */
 
 #ifndef	_VM_OBJECT_
 #define	_VM_OBJECT_
 
 #include <sys/queue.h>
 #include <sys/_lock.h>
 #include <sys/_mutex.h>
 #include <sys/_pctrie.h>
 #include <sys/_rwlock.h>
 #include <sys/_domainset.h>
 
 #include <vm/_vm_radix.h>
 
 /*
  *	Types defined:
  *
  *	vm_object_t		Virtual memory object.
  *
  * List of locks
  *	(c)	const until freed
  *	(o)	per-object lock 
  *	(f)	free pages queue mutex
  *
  */
 
 #ifndef VM_PAGE_HAVE_PGLIST
 TAILQ_HEAD(pglist, vm_page);
 #define VM_PAGE_HAVE_PGLIST
 #endif
 
 struct vm_object {
 	struct rwlock lock;
 	TAILQ_ENTRY(vm_object) object_list; /* list of all objects */
 	LIST_HEAD(, vm_object) shadow_head; /* objects that this is a shadow for */
 	LIST_ENTRY(vm_object) shadow_list; /* chain of shadow objects */
 	struct pglist memq;		/* list of resident pages */
 	struct vm_radix rtree;		/* root of the resident page radix trie*/
 	vm_pindex_t size;		/* Object size */
 	struct domainset_ref domain;	/* NUMA policy. */
 	int generation;			/* generation ID */
 	int ref_count;			/* How many refs?? */
 	int shadow_count;		/* how many objects that this is a shadow for */
 	vm_memattr_t memattr;		/* default memory attribute for pages */
 	objtype_t type;			/* type of pager */
 	u_short flags;			/* see below */
 	u_short pg_color;		/* (c) color of first page in obj */
 	u_int paging_in_progress;	/* Paging (in or out) so don't collapse or destroy */
 	int resident_page_count;	/* number of resident pages */
 	struct vm_object *backing_object; /* object that I'm a shadow of */
 	vm_ooffset_t backing_object_offset;/* Offset in backing object */
 	TAILQ_ENTRY(vm_object) pager_object_list; /* list of all objects of this pager type */
 	LIST_HEAD(, vm_reserv) rvq;	/* list of reservations */
 	void *handle;
 	union {
 		/*
 		 * VNode pager
 		 *
 		 *	vnp_size - current size of file
 		 */
 		struct {
 			off_t vnp_size;
 			vm_ooffset_t writemappings;
 		} vnp;
 
 		/*
 		 * Device pager
 		 *
 		 *	devp_pglist - list of allocated pages
 		 */
 		struct {
 			TAILQ_HEAD(, vm_page) devp_pglist;
 			struct cdev_pager_ops *ops;
 			struct cdev *dev;
 		} devp;
 
 		/*
 		 * SG pager
 		 *
 		 *	sgp_pglist - list of allocated pages
 		 */
 		struct {
 			TAILQ_HEAD(, vm_page) sgp_pglist;
 		} sgp;
 
 		/*
 		 * Swap pager
 		 *
 		 *	swp_tmpfs - back-pointer to the tmpfs vnode,
 		 *		     if any, which uses the vm object
 		 *		     as backing store.  The handle
 		 *		     cannot be reused for linking,
 		 *		     because the vnode can be
 		 *		     reclaimed and recreated, making
 		 *		     the handle changed and hash-chain
 		 *		     invalid.
 		 *
 		 *	swp_blks -   pc-trie of the allocated swap blocks.
 		 *
 		 */
 		struct {
 			void *swp_tmpfs;
 			struct pctrie swp_blks;
 			vm_ooffset_t writemappings;
 		} swp;
 	} un_pager;
 	struct ucred *cred;
 	vm_ooffset_t charge;
 	void *umtx_data;
 };
 
 /*
  * Flags
  */
 #define	OBJ_FICTITIOUS	0x0001		/* (c) contains fictitious pages */
 #define	OBJ_UNMANAGED	0x0002		/* (c) contains unmanaged pages */
 #define	OBJ_POPULATE	0x0004		/* pager implements populate() */
 #define	OBJ_DEAD	0x0008		/* dead objects (during rundown) */
 #define	OBJ_NOSPLIT	0x0010		/* dont split this object */
 #define	OBJ_UMTXDEAD	0x0020		/* umtx pshared was terminated */
 #define	OBJ_PIPWNT	0x0040		/* paging in progress wanted */
 #define	OBJ_PG_DTOR	0x0080		/* dont reset object, leave that for dtor */
 #define	OBJ_MIGHTBEDIRTY 0x0100		/* object might be dirty, only for vnode */
 #define	OBJ_TMPFS_NODE	0x0200		/* object belongs to tmpfs VREG node */
 #define	OBJ_TMPFS_DIRTY	0x0400		/* dirty tmpfs obj */
+#define	OBJ_SIZEVNLOCK	0x0800		/* lock vnode to check obj size */
 #define	OBJ_COLORED	0x1000		/* pg_color is defined */
 #define	OBJ_ONEMAPPING	0x2000		/* One USE (a single, non-forked) mapping flag */
 #define	OBJ_DISCONNECTWNT 0x4000	/* disconnect from vnode wanted */
 #define	OBJ_TMPFS	0x8000		/* has tmpfs vnode allocated */
 
 /*
  * Helpers to perform conversion between vm_object page indexes and offsets.
  * IDX_TO_OFF() converts an index into an offset.
  * OFF_TO_IDX() converts an offset into an index.
  * OBJ_MAX_SIZE specifies the maximum page index corresponding to the
  *   maximum unsigned offset.
  */
 #define	IDX_TO_OFF(idx) (((vm_ooffset_t)(idx)) << PAGE_SHIFT)
 #define	OFF_TO_IDX(off) ((vm_pindex_t)(((vm_ooffset_t)(off)) >> PAGE_SHIFT))
 #define	UOFF_TO_IDX(off) OFF_TO_IDX(off)
 #define	OBJ_MAX_SIZE	(OFF_TO_IDX(UINT64_MAX) + 1)
 
 #ifdef	_KERNEL
 
 #define OBJPC_SYNC	0x1			/* sync I/O */
 #define OBJPC_INVAL	0x2			/* invalidate */
 #define OBJPC_NOSYNC	0x4			/* skip if VPO_NOSYNC */
 
 /*
  * The following options are supported by vm_object_page_remove().
  */
 #define	OBJPR_CLEANONLY	0x1		/* Don't remove dirty pages. */
 #define	OBJPR_NOTMAPPED	0x2		/* Don't unmap pages. */
 
 TAILQ_HEAD(object_q, vm_object);
 
 extern struct object_q vm_object_list;	/* list of allocated objects */
 extern struct mtx vm_object_list_mtx;	/* lock for object list and count */
 
 extern struct vm_object kernel_object_store;
 
 /* kernel and kmem are aliased for backwards KPI compat. */
 #define	kernel_object	(&kernel_object_store)
 #define	kmem_object	(&kernel_object_store)
 
 #define	VM_OBJECT_ASSERT_LOCKED(object)					\
 	rw_assert(&(object)->lock, RA_LOCKED)
 #define	VM_OBJECT_ASSERT_RLOCKED(object)				\
 	rw_assert(&(object)->lock, RA_RLOCKED)
 #define	VM_OBJECT_ASSERT_WLOCKED(object)				\
 	rw_assert(&(object)->lock, RA_WLOCKED)
 #define	VM_OBJECT_ASSERT_UNLOCKED(object)				\
 	rw_assert(&(object)->lock, RA_UNLOCKED)
 #define	VM_OBJECT_LOCK_DOWNGRADE(object)				\
 	rw_downgrade(&(object)->lock)
 #define	VM_OBJECT_RLOCK(object)						\
 	rw_rlock(&(object)->lock)
 #define	VM_OBJECT_RUNLOCK(object)					\
 	rw_runlock(&(object)->lock)
 #define	VM_OBJECT_SLEEP(object, wchan, pri, wmesg, timo)		\
 	rw_sleep((wchan), &(object)->lock, (pri), (wmesg), (timo))
 #define	VM_OBJECT_TRYRLOCK(object)					\
 	rw_try_rlock(&(object)->lock)
 #define	VM_OBJECT_TRYWLOCK(object)					\
 	rw_try_wlock(&(object)->lock)
 #define	VM_OBJECT_TRYUPGRADE(object)					\
 	rw_try_upgrade(&(object)->lock)
 #define	VM_OBJECT_WLOCK(object)						\
 	rw_wlock(&(object)->lock)
 #define	VM_OBJECT_WOWNED(object)					\
 	rw_wowned(&(object)->lock)
 #define	VM_OBJECT_WUNLOCK(object)					\
 	rw_wunlock(&(object)->lock)
 
 /*
  *	The object must be locked or thread private.
  */
 static __inline void
 vm_object_set_flag(vm_object_t object, u_short bits)
 {
 
 	object->flags |= bits;
 }
 
 /*
  *	Conditionally set the object's color, which (1) enables the allocation
  *	of physical memory reservations for anonymous objects and larger-than-
  *	superpage-sized named objects and (2) determines the first page offset
  *	within the object at which a reservation may be allocated.  In other
  *	words, the color determines the alignment of the object with respect
  *	to the largest superpage boundary.  When mapping named objects, like
  *	files or POSIX shared memory objects, the color should be set to zero
  *	before a virtual address is selected for the mapping.  In contrast,
  *	for anonymous objects, the color may be set after the virtual address
  *	is selected.
  *
  *	The object must be locked.
  */
 static __inline void
 vm_object_color(vm_object_t object, u_short color)
 {
 
 	if ((object->flags & OBJ_COLORED) == 0) {
 		object->pg_color = color;
 		object->flags |= OBJ_COLORED;
 	}
 }
 
 static __inline bool
 vm_object_reserv(vm_object_t object)
 {
 
 	if (object != NULL &&
 	    (object->flags & (OBJ_COLORED | OBJ_FICTITIOUS)) == OBJ_COLORED) {
 		return (true);
 	}
 	return (false);
 }
 
 void vm_object_clear_flag(vm_object_t object, u_short bits);
 void vm_object_pip_add(vm_object_t object, short i);
 void vm_object_pip_subtract(vm_object_t object, short i);
 void vm_object_pip_wakeup(vm_object_t object);
 void vm_object_pip_wakeupn(vm_object_t object, short i);
 void vm_object_pip_wait(vm_object_t object, char *waitid);
 
 void umtx_shm_object_init(vm_object_t object);
 void umtx_shm_object_terminated(vm_object_t object);
 extern int umtx_shm_vnobj_persistent;
 
 vm_object_t vm_object_allocate (objtype_t, vm_pindex_t);
 boolean_t vm_object_coalesce(vm_object_t, vm_ooffset_t, vm_size_t, vm_size_t,
    boolean_t);
 void vm_object_collapse (vm_object_t);
 void vm_object_deallocate (vm_object_t);
 void vm_object_destroy (vm_object_t);
 void vm_object_terminate (vm_object_t);
 void vm_object_set_writeable_dirty (vm_object_t);
 void vm_object_init (void);
 void vm_object_madvise(vm_object_t, vm_pindex_t, vm_pindex_t, int);
 boolean_t vm_object_page_clean(vm_object_t object, vm_ooffset_t start,
     vm_ooffset_t end, int flags);
 void vm_object_page_noreuse(vm_object_t object, vm_pindex_t start,
     vm_pindex_t end);
 void vm_object_page_remove(vm_object_t object, vm_pindex_t start,
     vm_pindex_t end, int options);
 boolean_t vm_object_populate(vm_object_t, vm_pindex_t, vm_pindex_t);
 void vm_object_print(long addr, boolean_t have_addr, long count, char *modif);
 void vm_object_reference (vm_object_t);
 void vm_object_reference_locked(vm_object_t);
 int  vm_object_set_memattr(vm_object_t object, vm_memattr_t memattr);
 void vm_object_shadow (vm_object_t *, vm_ooffset_t *, vm_size_t);
 void vm_object_split(vm_map_entry_t);
 boolean_t vm_object_sync(vm_object_t, vm_ooffset_t, vm_size_t, boolean_t,
     boolean_t);
 void vm_object_unwire(vm_object_t object, vm_ooffset_t offset,
     vm_size_t length, uint8_t queue);
 struct vnode *vm_object_vnode(vm_object_t object);
 #endif				/* _KERNEL */
 
 #endif				/* _VM_OBJECT_ */
Index: stable/12/sys/vm/vnode_pager.c
===================================================================
--- stable/12/sys/vm/vnode_pager.c	(revision 354393)
+++ stable/12/sys/vm/vnode_pager.c	(revision 354394)
@@ -1,1587 +1,1591 @@
 /*-
  * SPDX-License-Identifier: BSD-4-Clause
  *
  * Copyright (c) 1990 University of Utah.
  * Copyright (c) 1991 The Regents of the University of California.
  * All rights reserved.
  * Copyright (c) 1993, 1994 John S. Dyson
  * Copyright (c) 1995, David Greenman
  *
  * This code is derived from software contributed to Berkeley by
  * the Systems Programming Group of the University of Utah Computer
  * Science Department.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)vnode_pager.c	7.5 (Berkeley) 4/20/91
  */
 
 /*
  * Page to/from files (vnodes).
  */
 
 /*
  * TODO:
  *	Implement VOP_GETPAGES/PUTPAGES interface for filesystems. Will
  *	greatly re-simplify the vnode_pager.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_vm.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/sysctl.h>
 #include <sys/proc.h>
 #include <sys/vnode.h>
 #include <sys/mount.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/vmmeter.h>
 #include <sys/limits.h>
 #include <sys/conf.h>
 #include <sys/rwlock.h>
 #include <sys/sf_buf.h>
 #include <sys/domainset.h>
 
 #include <machine/atomic.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_map.h>
 #include <vm/vnode_pager.h>
 #include <vm/vm_extern.h>
 
 static int vnode_pager_addr(struct vnode *vp, vm_ooffset_t address,
     daddr_t *rtaddress, int *run);
 static int vnode_pager_input_smlfs(vm_object_t object, vm_page_t m);
 static int vnode_pager_input_old(vm_object_t object, vm_page_t m);
 static void vnode_pager_dealloc(vm_object_t);
 static int vnode_pager_getpages(vm_object_t, vm_page_t *, int, int *, int *);
 static int vnode_pager_getpages_async(vm_object_t, vm_page_t *, int, int *,
     int *, vop_getpages_iodone_t, void *);
 static void vnode_pager_putpages(vm_object_t, vm_page_t *, int, int, int *);
 static boolean_t vnode_pager_haspage(vm_object_t, vm_pindex_t, int *, int *);
 static vm_object_t vnode_pager_alloc(void *, vm_ooffset_t, vm_prot_t,
     vm_ooffset_t, struct ucred *cred);
 static int vnode_pager_generic_getpages_done(struct buf *);
 static void vnode_pager_generic_getpages_done_async(struct buf *);
 static void vnode_pager_update_writecount(vm_object_t, vm_offset_t,
     vm_offset_t);
 static void vnode_pager_release_writecount(vm_object_t, vm_offset_t,
     vm_offset_t);
 
 struct pagerops vnodepagerops = {
 	.pgo_alloc =	vnode_pager_alloc,
 	.pgo_dealloc =	vnode_pager_dealloc,
 	.pgo_getpages =	vnode_pager_getpages,
 	.pgo_getpages_async = vnode_pager_getpages_async,
 	.pgo_putpages =	vnode_pager_putpages,
 	.pgo_haspage =	vnode_pager_haspage,
 	.pgo_update_writecount = vnode_pager_update_writecount,
 	.pgo_release_writecount = vnode_pager_release_writecount,
 };
 
 int vnode_pbuf_freecnt;
 int vnode_async_pbuf_freecnt;
 
 static struct domainset *vnode_domainset = NULL;
 
 SYSCTL_PROC(_debug, OID_AUTO, vnode_domainset, CTLTYPE_STRING | CTLFLAG_RW,
     &vnode_domainset, 0, sysctl_handle_domainset, "A",
     "Default vnode NUMA policy");
 
 /* Create the VM system backing object for this vnode */
 int
 vnode_create_vobject(struct vnode *vp, off_t isize, struct thread *td)
 {
 	vm_object_t object;
 	vm_ooffset_t size = isize;
 	struct vattr va;
 
 	if (!vn_isdisk(vp, NULL) && vn_canvmio(vp) == FALSE)
 		return (0);
 
 	while ((object = vp->v_object) != NULL) {
 		VM_OBJECT_WLOCK(object);
 		if (!(object->flags & OBJ_DEAD)) {
 			VM_OBJECT_WUNLOCK(object);
 			return (0);
 		}
 		VOP_UNLOCK(vp, 0);
 		vm_object_set_flag(object, OBJ_DISCONNECTWNT);
 		VM_OBJECT_SLEEP(object, object, PDROP | PVM, "vodead", 0);
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	}
 
 	if (size == 0) {
 		if (vn_isdisk(vp, NULL)) {
 			size = IDX_TO_OFF(INT_MAX);
 		} else {
 			if (VOP_GETATTR(vp, &va, td->td_ucred))
 				return (0);
 			size = va.va_size;
 		}
 	}
 
 	object = vnode_pager_alloc(vp, size, 0, 0, td->td_ucred);
 	/*
 	 * Dereference the reference we just created.  This assumes
 	 * that the object is associated with the vp.
 	 */
 	VM_OBJECT_WLOCK(object);
 	object->ref_count--;
 	VM_OBJECT_WUNLOCK(object);
 	vrele(vp);
 
 	KASSERT(vp->v_object != NULL, ("vnode_create_vobject: NULL object"));
 
 	return (0);
 }
 
 void
 vnode_destroy_vobject(struct vnode *vp)
 {
 	struct vm_object *obj;
 
 	obj = vp->v_object;
 	if (obj == NULL)
 		return;
 	ASSERT_VOP_ELOCKED(vp, "vnode_destroy_vobject");
 	VM_OBJECT_WLOCK(obj);
 	umtx_shm_object_terminated(obj);
 	if (obj->ref_count == 0) {
 		/*
 		 * don't double-terminate the object
 		 */
 		if ((obj->flags & OBJ_DEAD) == 0) {
 			vm_object_terminate(obj);
 		} else {
 			/*
 			 * Waiters were already handled during object
 			 * termination.  The exclusive vnode lock hopefully
 			 * prevented new waiters from referencing the dying
 			 * object.
 			 */
 			KASSERT((obj->flags & OBJ_DISCONNECTWNT) == 0,
 			    ("OBJ_DISCONNECTWNT set obj %p flags %x",
 			    obj, obj->flags));
 			vp->v_object = NULL;
 			VM_OBJECT_WUNLOCK(obj);
 		}
 	} else {
 		/*
 		 * Woe to the process that tries to page now :-).
 		 */
 		vm_pager_deallocate(obj);
 		VM_OBJECT_WUNLOCK(obj);
 	}
 	KASSERT(vp->v_object == NULL, ("vp %p obj %p", vp, vp->v_object));
 }
 
 
 /*
  * Allocate (or lookup) pager for a vnode.
  * Handle is a vnode pointer.
  *
  * MPSAFE
  */
 vm_object_t
 vnode_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
     vm_ooffset_t offset, struct ucred *cred)
 {
 	vm_object_t object;
 	struct vnode *vp;
 
 	/*
 	 * Pageout to vnode, no can do yet.
 	 */
 	if (handle == NULL)
 		return (NULL);
 
 	vp = (struct vnode *) handle;
 
 	/*
 	 * If the object is being terminated, wait for it to
 	 * go away.
 	 */
 retry:
 	while ((object = vp->v_object) != NULL) {
 		VM_OBJECT_WLOCK(object);
 		if ((object->flags & OBJ_DEAD) == 0)
 			break;
 		vm_object_set_flag(object, OBJ_DISCONNECTWNT);
 		VM_OBJECT_SLEEP(object, object, PDROP | PVM, "vadead", 0);
 	}
 
 	KASSERT(vp->v_usecount != 0, ("vnode_pager_alloc: no vnode reference"));
 
 	if (object == NULL) {
 		/*
 		 * Add an object of the appropriate size
 		 */
 		object = vm_object_allocate(OBJT_VNODE, OFF_TO_IDX(round_page(size)));
 
 		object->un_pager.vnp.vnp_size = size;
 		object->un_pager.vnp.writemappings = 0;
 		object->domain.dr_policy = vnode_domainset;
-
 		object->handle = handle;
+		if ((vp->v_vflag & VV_VMSIZEVNLOCK) != 0) {
+			VM_OBJECT_WLOCK(object);
+			vm_object_set_flag(object, OBJ_SIZEVNLOCK);
+			VM_OBJECT_WUNLOCK(object);
+		}
 		VI_LOCK(vp);
 		if (vp->v_object != NULL) {
 			/*
 			 * Object has been created while we were sleeping
 			 */
 			VI_UNLOCK(vp);
 			VM_OBJECT_WLOCK(object);
 			KASSERT(object->ref_count == 1,
 			    ("leaked ref %p %d", object, object->ref_count));
 			object->type = OBJT_DEAD;
 			object->ref_count = 0;
 			VM_OBJECT_WUNLOCK(object);
 			vm_object_destroy(object);
 			goto retry;
 		}
 		vp->v_object = object;
 		VI_UNLOCK(vp);
 	} else {
 		object->ref_count++;
 #if VM_NRESERVLEVEL > 0
 		vm_object_color(object, 0);
 #endif
 		VM_OBJECT_WUNLOCK(object);
 	}
 	vrefact(vp);
 	return (object);
 }
 
 /*
  *	The object must be locked.
  */
 static void
 vnode_pager_dealloc(vm_object_t object)
 {
 	struct vnode *vp;
 	int refs;
 
 	vp = object->handle;
 	if (vp == NULL)
 		panic("vnode_pager_dealloc: pager already dealloced");
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	vm_object_pip_wait(object, "vnpdea");
 	refs = object->ref_count;
 
 	object->handle = NULL;
 	object->type = OBJT_DEAD;
 	if (object->flags & OBJ_DISCONNECTWNT) {
 		vm_object_clear_flag(object, OBJ_DISCONNECTWNT);
 		wakeup(object);
 	}
 	ASSERT_VOP_ELOCKED(vp, "vnode_pager_dealloc");
 	if (object->un_pager.vnp.writemappings > 0) {
 		object->un_pager.vnp.writemappings = 0;
 		VOP_ADD_WRITECOUNT_CHECKED(vp, -1);
 		CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d",
 		    __func__, vp, vp->v_writecount);
 	}
 	vp->v_object = NULL;
 	VI_LOCK(vp);
 
 	/*
 	 * vm_map_entry_set_vnode_text() cannot reach this vnode by
 	 * following object->handle.  Clear all text references now.
 	 * This also clears the transient references from
 	 * kern_execve(), which is fine because dead_vnodeops uses nop
 	 * for VOP_UNSET_TEXT().
 	 */
 	if (vp->v_writecount < 0)
 		vp->v_writecount = 0;
 	VI_UNLOCK(vp);
 	VM_OBJECT_WUNLOCK(object);
 	while (refs-- > 0)
 		vunref(vp);
 	VM_OBJECT_WLOCK(object);
 }
 
 static boolean_t
 vnode_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before,
     int *after)
 {
 	struct vnode *vp = object->handle;
 	daddr_t bn;
 	int err;
 	daddr_t reqblock;
 	int poff;
 	int bsize;
 	int pagesperblock, blocksperpage;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	/*
 	 * If no vp or vp is doomed or marked transparent to VM, we do not
 	 * have the page.
 	 */
 	if (vp == NULL || vp->v_iflag & VI_DOOMED)
 		return FALSE;
 	/*
 	 * If the offset is beyond end of file we do
 	 * not have the page.
 	 */
 	if (IDX_TO_OFF(pindex) >= object->un_pager.vnp.vnp_size)
 		return FALSE;
 
 	bsize = vp->v_mount->mnt_stat.f_iosize;
 	pagesperblock = bsize / PAGE_SIZE;
 	blocksperpage = 0;
 	if (pagesperblock > 0) {
 		reqblock = pindex / pagesperblock;
 	} else {
 		blocksperpage = (PAGE_SIZE / bsize);
 		reqblock = pindex * blocksperpage;
 	}
 	VM_OBJECT_WUNLOCK(object);
 	err = VOP_BMAP(vp, reqblock, NULL, &bn, after, before);
 	VM_OBJECT_WLOCK(object);
 	if (err)
 		return TRUE;
 	if (bn == -1)
 		return FALSE;
 	if (pagesperblock > 0) {
 		poff = pindex - (reqblock * pagesperblock);
 		if (before) {
 			*before *= pagesperblock;
 			*before += poff;
 		}
 		if (after) {
 			/*
 			 * The BMAP vop can report a partial block in the
 			 * 'after', but must not report blocks after EOF.
 			 * Assert the latter, and truncate 'after' in case
 			 * of the former.
 			 */
 			KASSERT((reqblock + *after) * pagesperblock <
 			    roundup2(object->size, pagesperblock),
 			    ("%s: reqblock %jd after %d size %ju", __func__,
 			    (intmax_t )reqblock, *after,
 			    (uintmax_t )object->size));
 			*after *= pagesperblock;
 			*after += pagesperblock - (poff + 1);
 			if (pindex + *after >= object->size)
 				*after = object->size - 1 - pindex;
 		}
 	} else {
 		if (before) {
 			*before /= blocksperpage;
 		}
 
 		if (after) {
 			*after /= blocksperpage;
 		}
 	}
 	return TRUE;
 }
 
 /*
  * Lets the VM system know about a change in size for a file.
  * We adjust our own internal size and flush any cached pages in
  * the associated object that are affected by the size change.
  *
  * Note: this routine may be invoked as a result of a pager put
  * operation (possibly at object termination time), so we must be careful.
  */
 void
 vnode_pager_setsize(struct vnode *vp, vm_ooffset_t nsize)
 {
 	vm_object_t object;
 	vm_page_t m;
 	vm_pindex_t nobjsize;
 
 	if ((object = vp->v_object) == NULL)
 		return;
 /* 	ASSERT_VOP_ELOCKED(vp, "vnode_pager_setsize and not locked vnode"); */
 	VM_OBJECT_WLOCK(object);
 	if (object->type == OBJT_DEAD) {
 		VM_OBJECT_WUNLOCK(object);
 		return;
 	}
 	KASSERT(object->type == OBJT_VNODE,
 	    ("not vnode-backed object %p", object));
 	if (nsize == object->un_pager.vnp.vnp_size) {
 		/*
 		 * Hasn't changed size
 		 */
 		VM_OBJECT_WUNLOCK(object);
 		return;
 	}
 	nobjsize = OFF_TO_IDX(nsize + PAGE_MASK);
 	if (nsize < object->un_pager.vnp.vnp_size) {
 		/*
 		 * File has shrunk. Toss any cached pages beyond the new EOF.
 		 */
 		if (nobjsize < object->size)
 			vm_object_page_remove(object, nobjsize, object->size,
 			    0);
 		/*
 		 * this gets rid of garbage at the end of a page that is now
 		 * only partially backed by the vnode.
 		 *
 		 * XXX for some reason (I don't know yet), if we take a
 		 * completely invalid page and mark it partially valid
 		 * it can screw up NFS reads, so we don't allow the case.
 		 */
 		if ((nsize & PAGE_MASK) &&
 		    (m = vm_page_lookup(object, OFF_TO_IDX(nsize))) != NULL &&
 		    m->valid != 0) {
 			int base = (int)nsize & PAGE_MASK;
 			int size = PAGE_SIZE - base;
 
 			/*
 			 * Clear out partial-page garbage in case
 			 * the page has been mapped.
 			 */
 			pmap_zero_page_area(m, base, size);
 
 			/*
 			 * Update the valid bits to reflect the blocks that
 			 * have been zeroed.  Some of these valid bits may
 			 * have already been set.
 			 */
 			vm_page_set_valid_range(m, base, size);
 
 			/*
 			 * Round "base" to the next block boundary so that the
 			 * dirty bit for a partially zeroed block is not
 			 * cleared.
 			 */
 			base = roundup2(base, DEV_BSIZE);
 
 			/*
 			 * Clear out partial-page dirty bits.
 			 *
 			 * note that we do not clear out the valid
 			 * bits.  This would prevent bogus_page
 			 * replacement from working properly.
 			 */
 			vm_page_clear_dirty(m, base, PAGE_SIZE - base);
 		}
 	}
 	object->un_pager.vnp.vnp_size = nsize;
 	object->size = nobjsize;
 	VM_OBJECT_WUNLOCK(object);
 }
 
 /*
  * calculate the linear (byte) disk address of specified virtual
  * file address
  */
 static int
 vnode_pager_addr(struct vnode *vp, vm_ooffset_t address, daddr_t *rtaddress,
     int *run)
 {
 	int bsize;
 	int err;
 	daddr_t vblock;
 	daddr_t voffset;
 
 	if (address < 0)
 		return -1;
 
 	if (vp->v_iflag & VI_DOOMED)
 		return -1;
 
 	bsize = vp->v_mount->mnt_stat.f_iosize;
 	vblock = address / bsize;
 	voffset = address % bsize;
 
 	err = VOP_BMAP(vp, vblock, NULL, rtaddress, run, NULL);
 	if (err == 0) {
 		if (*rtaddress != -1)
 			*rtaddress += voffset / DEV_BSIZE;
 		if (run) {
 			*run += 1;
 			*run *= bsize/PAGE_SIZE;
 			*run -= voffset/PAGE_SIZE;
 		}
 	}
 
 	return (err);
 }
 
 /*
  * small block filesystem vnode pager input
  */
 static int
 vnode_pager_input_smlfs(vm_object_t object, vm_page_t m)
 {
 	struct vnode *vp;
 	struct bufobj *bo;
 	struct buf *bp;
 	struct sf_buf *sf;
 	daddr_t fileaddr;
 	vm_offset_t bsize;
 	vm_page_bits_t bits;
 	int error, i;
 
 	error = 0;
 	vp = object->handle;
 	if (vp->v_iflag & VI_DOOMED)
 		return VM_PAGER_BAD;
 
 	bsize = vp->v_mount->mnt_stat.f_iosize;
 
 	VOP_BMAP(vp, 0, &bo, 0, NULL, NULL);
 
 	sf = sf_buf_alloc(m, 0);
 
 	for (i = 0; i < PAGE_SIZE / bsize; i++) {
 		vm_ooffset_t address;
 
 		bits = vm_page_bits(i * bsize, bsize);
 		if (m->valid & bits)
 			continue;
 
 		address = IDX_TO_OFF(m->pindex) + i * bsize;
 		if (address >= object->un_pager.vnp.vnp_size) {
 			fileaddr = -1;
 		} else {
 			error = vnode_pager_addr(vp, address, &fileaddr, NULL);
 			if (error)
 				break;
 		}
 		if (fileaddr != -1) {
 			bp = getpbuf(&vnode_pbuf_freecnt);
 
 			/* build a minimal buffer header */
 			bp->b_iocmd = BIO_READ;
 			bp->b_iodone = bdone;
 			KASSERT(bp->b_rcred == NOCRED, ("leaking read ucred"));
 			KASSERT(bp->b_wcred == NOCRED, ("leaking write ucred"));
 			bp->b_rcred = crhold(curthread->td_ucred);
 			bp->b_wcred = crhold(curthread->td_ucred);
 			bp->b_data = (caddr_t)sf_buf_kva(sf) + i * bsize;
 			bp->b_blkno = fileaddr;
 			pbgetbo(bo, bp);
 			bp->b_vp = vp;
 			bp->b_bcount = bsize;
 			bp->b_bufsize = bsize;
 			bp->b_runningbufspace = bp->b_bufsize;
 			atomic_add_long(&runningbufspace, bp->b_runningbufspace);
 
 			/* do the input */
 			bp->b_iooffset = dbtob(bp->b_blkno);
 			bstrategy(bp);
 
 			bwait(bp, PVM, "vnsrd");
 
 			if ((bp->b_ioflags & BIO_ERROR) != 0)
 				error = EIO;
 
 			/*
 			 * free the buffer header back to the swap buffer pool
 			 */
 			bp->b_vp = NULL;
 			pbrelbo(bp);
 			relpbuf(bp, &vnode_pbuf_freecnt);
 			if (error)
 				break;
 		} else
 			bzero((caddr_t)sf_buf_kva(sf) + i * bsize, bsize);
 		KASSERT((m->dirty & bits) == 0,
 		    ("vnode_pager_input_smlfs: page %p is dirty", m));
 		VM_OBJECT_WLOCK(object);
 		m->valid |= bits;
 		VM_OBJECT_WUNLOCK(object);
 	}
 	sf_buf_free(sf);
 	if (error) {
 		return VM_PAGER_ERROR;
 	}
 	return VM_PAGER_OK;
 }
 
 /*
  * old style vnode pager input routine
  */
 static int
 vnode_pager_input_old(vm_object_t object, vm_page_t m)
 {
 	struct uio auio;
 	struct iovec aiov;
 	int error;
 	int size;
 	struct sf_buf *sf;
 	struct vnode *vp;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	error = 0;
 
 	/*
 	 * Return failure if beyond current EOF
 	 */
 	if (IDX_TO_OFF(m->pindex) >= object->un_pager.vnp.vnp_size) {
 		return VM_PAGER_BAD;
 	} else {
 		size = PAGE_SIZE;
 		if (IDX_TO_OFF(m->pindex) + size > object->un_pager.vnp.vnp_size)
 			size = object->un_pager.vnp.vnp_size - IDX_TO_OFF(m->pindex);
 		vp = object->handle;
 		VM_OBJECT_WUNLOCK(object);
 
 		/*
 		 * Allocate a kernel virtual address and initialize so that
 		 * we can use VOP_READ/WRITE routines.
 		 */
 		sf = sf_buf_alloc(m, 0);
 
 		aiov.iov_base = (caddr_t)sf_buf_kva(sf);
 		aiov.iov_len = size;
 		auio.uio_iov = &aiov;
 		auio.uio_iovcnt = 1;
 		auio.uio_offset = IDX_TO_OFF(m->pindex);
 		auio.uio_segflg = UIO_SYSSPACE;
 		auio.uio_rw = UIO_READ;
 		auio.uio_resid = size;
 		auio.uio_td = curthread;
 
 		error = VOP_READ(vp, &auio, 0, curthread->td_ucred);
 		if (!error) {
 			int count = size - auio.uio_resid;
 
 			if (count == 0)
 				error = EINVAL;
 			else if (count != PAGE_SIZE)
 				bzero((caddr_t)sf_buf_kva(sf) + count,
 				    PAGE_SIZE - count);
 		}
 		sf_buf_free(sf);
 
 		VM_OBJECT_WLOCK(object);
 	}
 	KASSERT(m->dirty == 0, ("vnode_pager_input_old: page %p is dirty", m));
 	if (!error)
 		m->valid = VM_PAGE_BITS_ALL;
 	return error ? VM_PAGER_ERROR : VM_PAGER_OK;
 }
 
 /*
  * generic vnode pager input routine
  */
 
 /*
  * Local media VFS's that do not implement their own VOP_GETPAGES
  * should have their VOP_GETPAGES call to vnode_pager_generic_getpages()
  * to implement the previous behaviour.
  *
  * All other FS's should use the bypass to get to the local media
  * backing vp's VOP_GETPAGES.
  */
 static int
 vnode_pager_getpages(vm_object_t object, vm_page_t *m, int count, int *rbehind,
     int *rahead)
 {
 	struct vnode *vp;
 	int rtval;
 
 	vp = object->handle;
 	VM_OBJECT_WUNLOCK(object);
 	rtval = VOP_GETPAGES(vp, m, count, rbehind, rahead);
 	KASSERT(rtval != EOPNOTSUPP,
 	    ("vnode_pager: FS getpages not implemented\n"));
 	VM_OBJECT_WLOCK(object);
 	return rtval;
 }
 
 static int
 vnode_pager_getpages_async(vm_object_t object, vm_page_t *m, int count,
     int *rbehind, int *rahead, vop_getpages_iodone_t iodone, void *arg)
 {
 	struct vnode *vp;
 	int rtval;
 
 	vp = object->handle;
 	VM_OBJECT_WUNLOCK(object);
 	rtval = VOP_GETPAGES_ASYNC(vp, m, count, rbehind, rahead, iodone, arg);
 	KASSERT(rtval != EOPNOTSUPP,
 	    ("vnode_pager: FS getpages_async not implemented\n"));
 	VM_OBJECT_WLOCK(object);
 	return (rtval);
 }
 
 /*
  * The implementation of VOP_GETPAGES() and VOP_GETPAGES_ASYNC() for
  * local filesystems, where partially valid pages can only occur at
  * the end of file.
  */
 int
 vnode_pager_local_getpages(struct vop_getpages_args *ap)
 {
 
 	return (vnode_pager_generic_getpages(ap->a_vp, ap->a_m, ap->a_count,
 	    ap->a_rbehind, ap->a_rahead, NULL, NULL));
 }
 
 int
 vnode_pager_local_getpages_async(struct vop_getpages_async_args *ap)
 {
 
 	return (vnode_pager_generic_getpages(ap->a_vp, ap->a_m, ap->a_count,
 	    ap->a_rbehind, ap->a_rahead, ap->a_iodone, ap->a_arg));
 }
 
 /*
  * This is now called from local media FS's to operate against their
  * own vnodes if they fail to implement VOP_GETPAGES.
  */
 int
 vnode_pager_generic_getpages(struct vnode *vp, vm_page_t *m, int count,
     int *a_rbehind, int *a_rahead, vop_getpages_iodone_t iodone, void *arg)
 {
 	vm_object_t object;
 	struct bufobj *bo;
 	struct buf *bp;
 	off_t foff;
 #ifdef INVARIANTS
 	off_t blkno0;
 #endif
 	int bsize, pagesperblock, *freecnt;
 	int error, before, after, rbehind, rahead, poff, i;
 	int bytecount, secmask;
 
 	KASSERT(vp->v_type != VCHR && vp->v_type != VBLK,
 	    ("%s does not support devices", __func__));
 
 	if (vp->v_iflag & VI_DOOMED)
 		return (VM_PAGER_BAD);
 
 	object = vp->v_object;
 	foff = IDX_TO_OFF(m[0]->pindex);
 	bsize = vp->v_mount->mnt_stat.f_iosize;
 	pagesperblock = bsize / PAGE_SIZE;
 
 	KASSERT(foff < object->un_pager.vnp.vnp_size,
 	    ("%s: page %p offset beyond vp %p size", __func__, m[0], vp));
 	KASSERT(count <= nitems(bp->b_pages),
 	    ("%s: requested %d pages", __func__, count));
 
 	/*
 	 * The last page has valid blocks.  Invalid part can only
 	 * exist at the end of file, and the page is made fully valid
 	 * by zeroing in vm_pager_get_pages().
 	 */
 	if (m[count - 1]->valid != 0 && --count == 0) {
 		if (iodone != NULL)
 			iodone(arg, m, 1, 0);
 		return (VM_PAGER_OK);
 	}
 
 	/*
 	 * Synchronous and asynchronous paging operations use different
 	 * free pbuf counters.  This is done to avoid asynchronous requests
 	 * to consume all pbufs.
 	 * Allocate the pbuf at the very beginning of the function, so that
 	 * if we are low on certain kind of pbufs don't even proceed to BMAP,
 	 * but sleep.
 	 */
 	freecnt = iodone != NULL ?
 	    &vnode_async_pbuf_freecnt : &vnode_pbuf_freecnt;
 	bp = getpbuf(freecnt);
 
 	/*
 	 * Get the underlying device blocks for the file with VOP_BMAP().
 	 * If the file system doesn't support VOP_BMAP, use old way of
 	 * getting pages via VOP_READ.
 	 */
 	error = VOP_BMAP(vp, foff / bsize, &bo, &bp->b_blkno, &after, &before);
 	if (error == EOPNOTSUPP) {
 		relpbuf(bp, freecnt);
 		VM_OBJECT_WLOCK(object);
 		for (i = 0; i < count; i++) {
 			VM_CNT_INC(v_vnodein);
 			VM_CNT_INC(v_vnodepgsin);
 			error = vnode_pager_input_old(object, m[i]);
 			if (error)
 				break;
 		}
 		VM_OBJECT_WUNLOCK(object);
 		return (error);
 	} else if (error != 0) {
 		relpbuf(bp, freecnt);
 		return (VM_PAGER_ERROR);
 	}
 
 	/*
 	 * If the file system supports BMAP, but blocksize is smaller
 	 * than a page size, then use special small filesystem code.
 	 */
 	if (pagesperblock == 0) {
 		relpbuf(bp, freecnt);
 		for (i = 0; i < count; i++) {
 			VM_CNT_INC(v_vnodein);
 			VM_CNT_INC(v_vnodepgsin);
 			error = vnode_pager_input_smlfs(object, m[i]);
 			if (error)
 				break;
 		}
 		return (error);
 	}
 
 	/*
 	 * A sparse file can be encountered only for a single page request,
 	 * which may not be preceded by call to vm_pager_haspage().
 	 */
 	if (bp->b_blkno == -1) {
 		KASSERT(count == 1,
 		    ("%s: array[%d] request to a sparse file %p", __func__,
 		    count, vp));
 		relpbuf(bp, freecnt);
 		pmap_zero_page(m[0]);
 		KASSERT(m[0]->dirty == 0, ("%s: page %p is dirty",
 		    __func__, m[0]));
 		VM_OBJECT_WLOCK(object);
 		m[0]->valid = VM_PAGE_BITS_ALL;
 		VM_OBJECT_WUNLOCK(object);
 		return (VM_PAGER_OK);
 	}
 
 #ifdef INVARIANTS
 	blkno0 = bp->b_blkno;
 #endif
 	bp->b_blkno += (foff % bsize) / DEV_BSIZE;
 
 	/* Recalculate blocks available after/before to pages. */
 	poff = (foff % bsize) / PAGE_SIZE;
 	before *= pagesperblock;
 	before += poff;
 	after *= pagesperblock;
 	after += pagesperblock - (poff + 1);
 	if (m[0]->pindex + after >= object->size)
 		after = object->size - 1 - m[0]->pindex;
 	KASSERT(count <= after + 1, ("%s: %d pages asked, can do only %d",
 	    __func__, count, after + 1));
 	after -= count - 1;
 
 	/* Trim requested rbehind/rahead to possible values. */   
 	rbehind = a_rbehind ? *a_rbehind : 0;
 	rahead = a_rahead ? *a_rahead : 0;
 	rbehind = min(rbehind, before);
 	rbehind = min(rbehind, m[0]->pindex);
 	rahead = min(rahead, after);
 	rahead = min(rahead, object->size - m[count - 1]->pindex);
 	/*
 	 * Check that total amount of pages fit into buf.  Trim rbehind and
 	 * rahead evenly if not.
 	 */
 	if (rbehind + rahead + count > nitems(bp->b_pages)) {
 		int trim, sum;
 
 		trim = rbehind + rahead + count - nitems(bp->b_pages) + 1;
 		sum = rbehind + rahead;
 		if (rbehind == before) {
 			/* Roundup rbehind trim to block size. */
 			rbehind -= roundup(trim * rbehind / sum, pagesperblock);
 			if (rbehind < 0)
 				rbehind = 0;
 		} else
 			rbehind -= trim * rbehind / sum;
 		rahead -= trim * rahead / sum;
 	}
 	KASSERT(rbehind + rahead + count <= nitems(bp->b_pages),
 	    ("%s: behind %d ahead %d count %d", __func__,
 	    rbehind, rahead, count));
 
 	/*
 	 * Fill in the bp->b_pages[] array with requested and optional   
 	 * read behind or read ahead pages.  Read behind pages are looked
 	 * up in a backward direction, down to a first cached page.  Same
 	 * for read ahead pages, but there is no need to shift the array
 	 * in case of encountering a cached page.
 	 */
 	i = bp->b_npages = 0;
 	if (rbehind) {
 		vm_pindex_t startpindex, tpindex;
 		vm_page_t p;
 
 		VM_OBJECT_WLOCK(object);
 		startpindex = m[0]->pindex - rbehind;
 		if ((p = TAILQ_PREV(m[0], pglist, listq)) != NULL &&
 		    p->pindex >= startpindex)
 			startpindex = p->pindex + 1;
 
 		/* tpindex is unsigned; beware of numeric underflow. */
 		for (tpindex = m[0]->pindex - 1;
 		    tpindex >= startpindex && tpindex < m[0]->pindex;
 		    tpindex--, i++) {
 			p = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL);
 			if (p == NULL) {
 				/* Shift the array. */
 				for (int j = 0; j < i; j++)
 					bp->b_pages[j] = bp->b_pages[j + 
 					    tpindex + 1 - startpindex]; 
 				break;
 			}
 			bp->b_pages[tpindex - startpindex] = p;
 		}
 
 		bp->b_pgbefore = i;
 		bp->b_npages += i;
 		bp->b_blkno -= IDX_TO_OFF(i) / DEV_BSIZE;
 	} else
 		bp->b_pgbefore = 0;
 
 	/* Requested pages. */
 	for (int j = 0; j < count; j++, i++)
 		bp->b_pages[i] = m[j];
 	bp->b_npages += count;
 
 	if (rahead) {
 		vm_pindex_t endpindex, tpindex;
 		vm_page_t p;
 
 		if (!VM_OBJECT_WOWNED(object))
 			VM_OBJECT_WLOCK(object);
 		endpindex = m[count - 1]->pindex + rahead + 1;
 		if ((p = TAILQ_NEXT(m[count - 1], listq)) != NULL &&
 		    p->pindex < endpindex)
 			endpindex = p->pindex;
 		if (endpindex > object->size)
 			endpindex = object->size;
 
 		for (tpindex = m[count - 1]->pindex + 1;
 		    tpindex < endpindex; i++, tpindex++) {
 			p = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL);
 			if (p == NULL)
 				break;
 			bp->b_pages[i] = p;
 		}
 
 		bp->b_pgafter = i - bp->b_npages;
 		bp->b_npages = i;
 	} else
 		bp->b_pgafter = 0;
 
 	if (VM_OBJECT_WOWNED(object))
 		VM_OBJECT_WUNLOCK(object);
 
 	/* Report back actual behind/ahead read. */
 	if (a_rbehind)
 		*a_rbehind = bp->b_pgbefore;
 	if (a_rahead)
 		*a_rahead = bp->b_pgafter;
 
 #ifdef INVARIANTS
 	KASSERT(bp->b_npages <= nitems(bp->b_pages),
 	    ("%s: buf %p overflowed", __func__, bp));
 	for (int j = 1, prev = 0; j < bp->b_npages; j++) {
 		if (bp->b_pages[j] == bogus_page)
 			continue;
 		KASSERT(bp->b_pages[j]->pindex - bp->b_pages[prev]->pindex ==
 		    j - prev, ("%s: pages array not consecutive, bp %p",
 		     __func__, bp));
 		prev = j;
 	}
 #endif
 
 	/*
 	 * Recalculate first offset and bytecount with regards to read behind.
 	 * Truncate bytecount to vnode real size and round up physical size
 	 * for real devices.
 	 */
 	foff = IDX_TO_OFF(bp->b_pages[0]->pindex);
 	bytecount = bp->b_npages << PAGE_SHIFT;
 	if ((foff + bytecount) > object->un_pager.vnp.vnp_size)
 		bytecount = object->un_pager.vnp.vnp_size - foff;
 	secmask = bo->bo_bsize - 1;
 	KASSERT(secmask < PAGE_SIZE && secmask > 0,
 	    ("%s: sector size %d too large", __func__, secmask + 1));
 	bytecount = (bytecount + secmask) & ~secmask;
 
 	/*
 	 * And map the pages to be read into the kva, if the filesystem
 	 * requires mapped buffers.
 	 */
 	if ((vp->v_mount->mnt_kern_flag & MNTK_UNMAPPED_BUFS) != 0 &&
 	    unmapped_buf_allowed) {
 		bp->b_data = unmapped_buf;
 		bp->b_offset = 0;
 	} else {
 		bp->b_data = bp->b_kvabase;
 		pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages);
 	}
 
 	/* Build a minimal buffer header. */
 	bp->b_iocmd = BIO_READ;
 	KASSERT(bp->b_rcred == NOCRED, ("leaking read ucred"));
 	KASSERT(bp->b_wcred == NOCRED, ("leaking write ucred"));
 	bp->b_rcred = crhold(curthread->td_ucred);
 	bp->b_wcred = crhold(curthread->td_ucred);
 	pbgetbo(bo, bp);
 	bp->b_vp = vp;
 	bp->b_bcount = bp->b_bufsize = bp->b_runningbufspace = bytecount;
 	bp->b_iooffset = dbtob(bp->b_blkno);
 	KASSERT(IDX_TO_OFF(m[0]->pindex - bp->b_pages[0]->pindex) ==
 	    (blkno0 - bp->b_blkno) * DEV_BSIZE +
 	    IDX_TO_OFF(m[0]->pindex) % bsize,
 	    ("wrong offsets bsize %d m[0] %ju b_pages[0] %ju "
 	    "blkno0 %ju b_blkno %ju", bsize,
 	    (uintmax_t)m[0]->pindex, (uintmax_t)bp->b_pages[0]->pindex,
 	    (uintmax_t)blkno0, (uintmax_t)bp->b_blkno));
 
 	atomic_add_long(&runningbufspace, bp->b_runningbufspace);
 	VM_CNT_INC(v_vnodein);
 	VM_CNT_ADD(v_vnodepgsin, bp->b_npages);
 
 	if (iodone != NULL) { /* async */
 		bp->b_pgiodone = iodone;
 		bp->b_caller1 = arg;
 		bp->b_iodone = vnode_pager_generic_getpages_done_async;
 		bp->b_flags |= B_ASYNC;
 		BUF_KERNPROC(bp);
 		bstrategy(bp);
 		return (VM_PAGER_OK);
 	} else {
 		bp->b_iodone = bdone;
 		bstrategy(bp);
 		bwait(bp, PVM, "vnread");
 		error = vnode_pager_generic_getpages_done(bp);
 		for (i = 0; i < bp->b_npages; i++)
 			bp->b_pages[i] = NULL;
 		bp->b_vp = NULL;
 		pbrelbo(bp);
 		relpbuf(bp, &vnode_pbuf_freecnt);
 		return (error != 0 ? VM_PAGER_ERROR : VM_PAGER_OK);
 	}
 }
 
 static void
 vnode_pager_generic_getpages_done_async(struct buf *bp)
 {
 	int error;
 
 	error = vnode_pager_generic_getpages_done(bp);
 	/* Run the iodone upon the requested range. */
 	bp->b_pgiodone(bp->b_caller1, bp->b_pages + bp->b_pgbefore,
 	    bp->b_npages - bp->b_pgbefore - bp->b_pgafter, error);
 	for (int i = 0; i < bp->b_npages; i++)
 		bp->b_pages[i] = NULL;
 	bp->b_vp = NULL;
 	pbrelbo(bp);
 	relpbuf(bp, &vnode_async_pbuf_freecnt);
 }
 
 static int
 vnode_pager_generic_getpages_done(struct buf *bp)
 {
 	vm_object_t object;
 	off_t tfoff, nextoff;
 	int i, error;
 
 	error = (bp->b_ioflags & BIO_ERROR) != 0 ? EIO : 0;
 	object = bp->b_vp->v_object;
 
 	if (error == 0 && bp->b_bcount != bp->b_npages * PAGE_SIZE) {
 		if (!buf_mapped(bp)) {
 			bp->b_data = bp->b_kvabase;
 			pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages,
 			    bp->b_npages);
 		}
 		bzero(bp->b_data + bp->b_bcount,
 		    PAGE_SIZE * bp->b_npages - bp->b_bcount);
 	}
 	if (buf_mapped(bp)) {
 		pmap_qremove((vm_offset_t)bp->b_data, bp->b_npages);
 		bp->b_data = unmapped_buf;
 	}
 
 	VM_OBJECT_WLOCK(object);
 	for (i = 0, tfoff = IDX_TO_OFF(bp->b_pages[0]->pindex);
 	    i < bp->b_npages; i++, tfoff = nextoff) {
 		vm_page_t mt;
 
 		nextoff = tfoff + PAGE_SIZE;
 		mt = bp->b_pages[i];
 		if (mt == bogus_page)
 			continue;
 
 		if (nextoff <= object->un_pager.vnp.vnp_size) {
 			/*
 			 * Read filled up entire page.
 			 */
 			mt->valid = VM_PAGE_BITS_ALL;
 			KASSERT(mt->dirty == 0,
 			    ("%s: page %p is dirty", __func__, mt));
 			KASSERT(!pmap_page_is_mapped(mt),
 			    ("%s: page %p is mapped", __func__, mt));
 		} else {
 			/*
 			 * Read did not fill up entire page.
 			 *
 			 * Currently we do not set the entire page valid,
 			 * we just try to clear the piece that we couldn't
 			 * read.
 			 */
 			vm_page_set_valid_range(mt, 0,
 			    object->un_pager.vnp.vnp_size - tfoff);
 			KASSERT((mt->dirty & vm_page_bits(0,
 			    object->un_pager.vnp.vnp_size - tfoff)) == 0,
 			    ("%s: page %p is dirty", __func__, mt));
 		}
 
 		if (i < bp->b_pgbefore || i >= bp->b_npages - bp->b_pgafter)
 			vm_page_readahead_finish(mt);
 	}
 	VM_OBJECT_WUNLOCK(object);
 	if (error != 0)
 		printf("%s: I/O read error %d\n", __func__, error);
 
 	return (error);
 }
 
 /*
  * EOPNOTSUPP is no longer legal.  For local media VFS's that do not
  * implement their own VOP_PUTPAGES, their VOP_PUTPAGES should call to
  * vnode_pager_generic_putpages() to implement the previous behaviour.
  *
  * All other FS's should use the bypass to get to the local media
  * backing vp's VOP_PUTPAGES.
  */
 static void
 vnode_pager_putpages(vm_object_t object, vm_page_t *m, int count,
     int flags, int *rtvals)
 {
 	int rtval;
 	struct vnode *vp;
 	int bytes = count * PAGE_SIZE;
 
 	/*
 	 * Force synchronous operation if we are extremely low on memory
 	 * to prevent a low-memory deadlock.  VOP operations often need to
 	 * allocate more memory to initiate the I/O ( i.e. do a BMAP
 	 * operation ).  The swapper handles the case by limiting the amount
 	 * of asynchronous I/O, but that sort of solution doesn't scale well
 	 * for the vnode pager without a lot of work.
 	 *
 	 * Also, the backing vnode's iodone routine may not wake the pageout
 	 * daemon up.  This should be probably be addressed XXX.
 	 */
 
 	if (vm_page_count_min())
 		flags |= VM_PAGER_PUT_SYNC;
 
 	/*
 	 * Call device-specific putpages function
 	 */
 	vp = object->handle;
 	VM_OBJECT_WUNLOCK(object);
 	rtval = VOP_PUTPAGES(vp, m, bytes, flags, rtvals);
 	KASSERT(rtval != EOPNOTSUPP, 
 	    ("vnode_pager: stale FS putpages\n"));
 	VM_OBJECT_WLOCK(object);
 }
 
 static int
 vn_off2bidx(vm_ooffset_t offset)
 {
 
 	return ((offset & PAGE_MASK) / DEV_BSIZE);
 }
 
 static bool
 vn_dirty_blk(vm_page_t m, vm_ooffset_t offset)
 {
 
 	KASSERT(IDX_TO_OFF(m->pindex) <= offset &&
 	    offset < IDX_TO_OFF(m->pindex + 1),
 	    ("page %p pidx %ju offset %ju", m, (uintmax_t)m->pindex,
 	    (uintmax_t)offset));
 	return ((m->dirty & ((vm_page_bits_t)1 << vn_off2bidx(offset))) != 0);
 }
 
 /*
  * This is now called from local media FS's to operate against their
  * own vnodes if they fail to implement VOP_PUTPAGES.
  *
  * This is typically called indirectly via the pageout daemon and
  * clustering has already typically occurred, so in general we ask the
  * underlying filesystem to write the data out asynchronously rather
  * then delayed.
  */
 int
 vnode_pager_generic_putpages(struct vnode *vp, vm_page_t *ma, int bytecount,
     int flags, int *rtvals)
 {
 	vm_object_t object;
 	vm_page_t m;
 	vm_ooffset_t maxblksz, next_offset, poffset, prev_offset;
 	struct uio auio;
 	struct iovec aiov;
 	off_t prev_resid, wrsz;
 	int count, error, i, maxsize, ncount, pgoff, ppscheck;
 	bool in_hole;
 	static struct timeval lastfail;
 	static int curfail;
 
 	object = vp->v_object;
 	count = bytecount / PAGE_SIZE;
 
 	for (i = 0; i < count; i++)
 		rtvals[i] = VM_PAGER_ERROR;
 
 	if ((int64_t)ma[0]->pindex < 0) {
 		printf("vnode_pager_generic_putpages: "
 		    "attempt to write meta-data 0x%jx(%lx)\n",
 		    (uintmax_t)ma[0]->pindex, (u_long)ma[0]->dirty);
 		rtvals[0] = VM_PAGER_BAD;
 		return (VM_PAGER_BAD);
 	}
 
 	maxsize = count * PAGE_SIZE;
 	ncount = count;
 
 	poffset = IDX_TO_OFF(ma[0]->pindex);
 
 	/*
 	 * If the page-aligned write is larger then the actual file we
 	 * have to invalidate pages occurring beyond the file EOF.  However,
 	 * there is an edge case where a file may not be page-aligned where
 	 * the last page is partially invalid.  In this case the filesystem
 	 * may not properly clear the dirty bits for the entire page (which
 	 * could be VM_PAGE_BITS_ALL due to the page having been mmap()d).
 	 * With the page locked we are free to fix-up the dirty bits here.
 	 *
 	 * We do not under any circumstances truncate the valid bits, as
 	 * this will screw up bogus page replacement.
 	 */
 	VM_OBJECT_RLOCK(object);
 	if (maxsize + poffset > object->un_pager.vnp.vnp_size) {
 		if (!VM_OBJECT_TRYUPGRADE(object)) {
 			VM_OBJECT_RUNLOCK(object);
 			VM_OBJECT_WLOCK(object);
 			if (maxsize + poffset <= object->un_pager.vnp.vnp_size)
 				goto downgrade;
 		}
 		if (object->un_pager.vnp.vnp_size > poffset) {
 			maxsize = object->un_pager.vnp.vnp_size - poffset;
 			ncount = btoc(maxsize);
 			if ((pgoff = (int)maxsize & PAGE_MASK) != 0) {
 				pgoff = roundup2(pgoff, DEV_BSIZE);
 
 				/*
 				 * If the object is locked and the following
 				 * conditions hold, then the page's dirty
 				 * field cannot be concurrently changed by a
 				 * pmap operation.
 				 */
 				m = ma[ncount - 1];
 				vm_page_assert_sbusied(m);
 				KASSERT(!pmap_page_is_write_mapped(m),
 		("vnode_pager_generic_putpages: page %p is not read-only", m));
 				MPASS(m->dirty != 0);
 				vm_page_clear_dirty(m, pgoff, PAGE_SIZE -
 				    pgoff);
 			}
 		} else {
 			maxsize = 0;
 			ncount = 0;
 		}
 		for (i = ncount; i < count; i++)
 			rtvals[i] = VM_PAGER_BAD;
 downgrade:
 		VM_OBJECT_LOCK_DOWNGRADE(object);
 	}
 
 	auio.uio_iov = &aiov;
 	auio.uio_segflg = UIO_NOCOPY;
 	auio.uio_rw = UIO_WRITE;
 	auio.uio_td = NULL;
 	maxblksz = roundup2(poffset + maxsize, DEV_BSIZE);
 
 	for (prev_offset = poffset; prev_offset < maxblksz;) {
 		/* Skip clean blocks. */
 		for (in_hole = true; in_hole && prev_offset < maxblksz;) {
 			m = ma[OFF_TO_IDX(prev_offset - poffset)];
 			for (i = vn_off2bidx(prev_offset);
 			    i < sizeof(vm_page_bits_t) * NBBY &&
 			    prev_offset < maxblksz; i++) {
 				if (vn_dirty_blk(m, prev_offset)) {
 					in_hole = false;
 					break;
 				}
 				prev_offset += DEV_BSIZE;
 			}
 		}
 		if (in_hole)
 			goto write_done;
 
 		/* Find longest run of dirty blocks. */
 		for (next_offset = prev_offset; next_offset < maxblksz;) {
 			m = ma[OFF_TO_IDX(next_offset - poffset)];
 			for (i = vn_off2bidx(next_offset);
 			    i < sizeof(vm_page_bits_t) * NBBY &&
 			    next_offset < maxblksz; i++) {
 				if (!vn_dirty_blk(m, next_offset))
 					goto start_write;
 				next_offset += DEV_BSIZE;
 			}
 		}
 start_write:
 		if (next_offset > poffset + maxsize)
 			next_offset = poffset + maxsize;
 
 		/*
 		 * Getting here requires finding a dirty block in the
 		 * 'skip clean blocks' loop.
 		 */
 		MPASS(prev_offset < next_offset);
 
 		VM_OBJECT_RUNLOCK(object);
 		aiov.iov_base = NULL;
 		auio.uio_iovcnt = 1;
 		auio.uio_offset = prev_offset;
 		prev_resid = auio.uio_resid = aiov.iov_len = next_offset -
 		    prev_offset;
 		error = VOP_WRITE(vp, &auio,
 		    vnode_pager_putpages_ioflags(flags), curthread->td_ucred);
 
 		wrsz = prev_resid - auio.uio_resid;
 		if (wrsz == 0) {
 			if (ppsratecheck(&lastfail, &curfail, 1) != 0) {
 				vn_printf(vp, "vnode_pager_putpages: "
 				    "zero-length write at %ju resid %zd\n",
 				    auio.uio_offset, auio.uio_resid);
 			}
 			VM_OBJECT_RLOCK(object);
 			break;
 		}
 
 		/* Adjust the starting offset for next iteration. */
 		prev_offset += wrsz;
 		MPASS(auio.uio_offset == prev_offset);
 
 		ppscheck = 0;
 		if (error != 0 && (ppscheck = ppsratecheck(&lastfail,
 		    &curfail, 1)) != 0)
 			vn_printf(vp, "vnode_pager_putpages: I/O error %d\n",
 			    error);
 		if (auio.uio_resid != 0 && (ppscheck != 0 ||
 		    ppsratecheck(&lastfail, &curfail, 1) != 0))
 			vn_printf(vp, "vnode_pager_putpages: residual I/O %zd "
 			    "at %ju\n", auio.uio_resid,
 			    (uintmax_t)ma[0]->pindex);
 		VM_OBJECT_RLOCK(object);
 		if (error != 0 || auio.uio_resid != 0)
 			break;
 	}
 write_done:
 	/* Mark completely processed pages. */
 	for (i = 0; i < OFF_TO_IDX(prev_offset - poffset); i++)
 		rtvals[i] = VM_PAGER_OK;
 	/* Mark partial EOF page. */
 	if (prev_offset == poffset + maxsize && (prev_offset & PAGE_MASK) != 0)
 		rtvals[i++] = VM_PAGER_OK;
 	/* Unwritten pages in range, free bonus if the page is clean. */
 	for (; i < ncount; i++)
 		rtvals[i] = ma[i]->dirty == 0 ? VM_PAGER_OK : VM_PAGER_ERROR;
 	VM_OBJECT_RUNLOCK(object);
 	VM_CNT_ADD(v_vnodepgsout, i);
 	VM_CNT_INC(v_vnodeout);
 	return (rtvals[0]);
 }
 
 int
 vnode_pager_putpages_ioflags(int pager_flags)
 {
 	int ioflags;
 
 	/*
 	 * Pageouts are already clustered, use IO_ASYNC to force a
 	 * bawrite() rather then a bdwrite() to prevent paging I/O
 	 * from saturating the buffer cache.  Dummy-up the sequential
 	 * heuristic to cause large ranges to cluster.  If neither
 	 * IO_SYNC or IO_ASYNC is set, the system decides how to
 	 * cluster.
 	 */
 	ioflags = IO_VMIO;
 	if ((pager_flags & (VM_PAGER_PUT_SYNC | VM_PAGER_PUT_INVAL)) != 0)
 		ioflags |= IO_SYNC;
 	else if ((pager_flags & VM_PAGER_CLUSTER_OK) == 0)
 		ioflags |= IO_ASYNC;
 	ioflags |= (pager_flags & VM_PAGER_PUT_INVAL) != 0 ? IO_INVAL: 0;
 	ioflags |= (pager_flags & VM_PAGER_PUT_NOREUSE) != 0 ? IO_NOREUSE : 0;
 	ioflags |= IO_SEQMAX << IO_SEQSHIFT;
 	return (ioflags);
 }
 
 /*
  * vnode_pager_undirty_pages().
  *
  * A helper to mark pages as clean after pageout that was possibly
  * done with a short write.  The lpos argument specifies the page run
  * length in bytes, and the written argument specifies how many bytes
  * were actually written.  eof is the offset past the last valid byte
  * in the vnode using the absolute file position of the first byte in
  * the run as the base from which it is computed.
  */
 void
 vnode_pager_undirty_pages(vm_page_t *ma, int *rtvals, int written, off_t eof,
     int lpos)
 {
 	vm_object_t obj;
 	int i, pos, pos_devb;
 
 	if (written == 0 && eof >= lpos)
 		return;
 	obj = ma[0]->object;
 	VM_OBJECT_WLOCK(obj);
 	for (i = 0, pos = 0; pos < written; i++, pos += PAGE_SIZE) {
 		if (pos < trunc_page(written)) {
 			rtvals[i] = VM_PAGER_OK;
 			vm_page_undirty(ma[i]);
 		} else {
 			/* Partially written page. */
 			rtvals[i] = VM_PAGER_AGAIN;
 			vm_page_clear_dirty(ma[i], 0, written & PAGE_MASK);
 		}
 	}
 	if (eof >= lpos) /* avoid truncation */
 		goto done;
 	for (pos = eof, i = OFF_TO_IDX(trunc_page(pos)); pos < lpos; i++) {
 		if (pos != trunc_page(pos)) {
 			/*
 			 * The page contains the last valid byte in
 			 * the vnode, mark the rest of the page as
 			 * clean, potentially making the whole page
 			 * clean.
 			 */
 			pos_devb = roundup2(pos & PAGE_MASK, DEV_BSIZE);
 			vm_page_clear_dirty(ma[i], pos_devb, PAGE_SIZE -
 			    pos_devb);
 
 			/*
 			 * If the page was cleaned, report the pageout
 			 * on it as successful.  msync() no longer
 			 * needs to write out the page, endlessly
 			 * creating write requests and dirty buffers.
 			 */
 			if (ma[i]->dirty == 0)
 				rtvals[i] = VM_PAGER_OK;
 
 			pos = round_page(pos);
 		} else {
 			/* vm_pageout_flush() clears dirty */
 			rtvals[i] = VM_PAGER_BAD;
 			pos += PAGE_SIZE;
 		}
 	}
 done:
 	VM_OBJECT_WUNLOCK(obj);
 }
 
 static void
 vnode_pager_update_writecount(vm_object_t object, vm_offset_t start,
     vm_offset_t end)
 {
 	struct vnode *vp;
 	vm_ooffset_t old_wm;
 
 	VM_OBJECT_WLOCK(object);
 	if (object->type != OBJT_VNODE) {
 		VM_OBJECT_WUNLOCK(object);
 		return;
 	}
 	old_wm = object->un_pager.vnp.writemappings;
 	object->un_pager.vnp.writemappings += (vm_ooffset_t)end - start;
 	vp = object->handle;
 	if (old_wm == 0 && object->un_pager.vnp.writemappings != 0) {
 		ASSERT_VOP_LOCKED(vp, "v_writecount inc");
 		VOP_ADD_WRITECOUNT_CHECKED(vp, 1);
 		CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d",
 		    __func__, vp, vp->v_writecount);
 	} else if (old_wm != 0 && object->un_pager.vnp.writemappings == 0) {
 		ASSERT_VOP_LOCKED(vp, "v_writecount dec");
 		VOP_ADD_WRITECOUNT_CHECKED(vp, -1);
 		CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d",
 		    __func__, vp, vp->v_writecount);
 	}
 	VM_OBJECT_WUNLOCK(object);
 }
 
 static void
 vnode_pager_release_writecount(vm_object_t object, vm_offset_t start,
     vm_offset_t end)
 {
 	struct vnode *vp;
 	struct mount *mp;
 	vm_offset_t inc;
 
 	VM_OBJECT_WLOCK(object);
 
 	/*
 	 * First, recheck the object type to account for the race when
 	 * the vnode is reclaimed.
 	 */
 	if (object->type != OBJT_VNODE) {
 		VM_OBJECT_WUNLOCK(object);
 		return;
 	}
 
 	/*
 	 * Optimize for the case when writemappings is not going to
 	 * zero.
 	 */
 	inc = end - start;
 	if (object->un_pager.vnp.writemappings != inc) {
 		object->un_pager.vnp.writemappings -= inc;
 		VM_OBJECT_WUNLOCK(object);
 		return;
 	}
 
 	vp = object->handle;
 	vhold(vp);
 	VM_OBJECT_WUNLOCK(object);
 	mp = NULL;
 	vn_start_write(vp, &mp, V_WAIT);
 	vn_lock(vp, LK_SHARED | LK_RETRY);
 
 	/*
 	 * Decrement the object's writemappings, by swapping the start
 	 * and end arguments for vnode_pager_update_writecount().  If
 	 * there was not a race with vnode reclaimation, then the
 	 * vnode's v_writecount is decremented.
 	 */
 	vnode_pager_update_writecount(object, end, start);
 	VOP_UNLOCK(vp, 0);
 	vdrop(vp);
 	if (mp != NULL)
 		vn_finished_write(mp);
 }
Index: stable/12
===================================================================
--- stable/12	(revision 354393)
+++ stable/12	(revision 354394)

Property changes on: stable/12
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head:r353890