Index: head/sys/kern/kern_lockf.c =================================================================== --- head/sys/kern/kern_lockf.c (revision 302215) +++ head/sys/kern/kern_lockf.c (revision 302216) @@ -1,2554 +1,2556 @@ /*- * Copyright (c) 2008 Isilon Inc http://www.isilon.com/ * Authors: Doug Rabson * Developed with Red Inc: Alfred Perlstein * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Scooter Morris at Genentech Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)ufs_lockf.c 8.3 (Berkeley) 1/6/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_debug_lockf.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef LOCKF_DEBUG #include #include #include static int lockf_debug = 0; /* control debug output */ SYSCTL_INT(_debug, OID_AUTO, lockf_debug, CTLFLAG_RW, &lockf_debug, 0, ""); #endif static MALLOC_DEFINE(M_LOCKF, "lockf", "Byte-range locking structures"); struct owner_edge; struct owner_vertex; struct owner_vertex_list; struct owner_graph; #define NOLOCKF (struct lockf_entry *)0 #define SELF 0x1 #define OTHERS 0x2 static void lf_init(void *); static int lf_hash_owner(caddr_t, struct flock *, int); static int lf_owner_matches(struct lock_owner *, caddr_t, struct flock *, int); static struct lockf_entry * lf_alloc_lock(struct lock_owner *); static int lf_free_lock(struct lockf_entry *); static int lf_clearlock(struct lockf *, struct lockf_entry *); static int lf_overlaps(struct lockf_entry *, struct lockf_entry *); static int lf_blocks(struct lockf_entry *, struct lockf_entry *); static void lf_free_edge(struct lockf_edge *); static struct lockf_edge * lf_alloc_edge(void); static void lf_alloc_vertex(struct lockf_entry *); static int lf_add_edge(struct lockf_entry *, struct lockf_entry *); static void lf_remove_edge(struct lockf_edge *); static void lf_remove_outgoing(struct lockf_entry *); static void lf_remove_incoming(struct lockf_entry *); static int lf_add_outgoing(struct lockf *, struct lockf_entry *); static int lf_add_incoming(struct lockf *, struct lockf_entry *); static int lf_findoverlap(struct lockf_entry **, struct lockf_entry *, int); static struct lockf_entry * lf_getblock(struct lockf *, struct lockf_entry *); static int lf_getlock(struct lockf *, struct lockf_entry *, struct flock *); static void lf_insert_lock(struct lockf *, struct lockf_entry *); static void lf_wakeup_lock(struct lockf *, struct lockf_entry *); static void lf_update_dependancies(struct lockf *, struct lockf_entry *, int all, struct lockf_entry_list *); static void lf_set_start(struct lockf *, struct lockf_entry *, off_t, struct lockf_entry_list*); static void lf_set_end(struct lockf *, struct lockf_entry *, off_t, struct lockf_entry_list*); static int lf_setlock(struct lockf *, struct lockf_entry *, struct vnode *, void **cookiep); static int lf_cancel(struct lockf *, struct lockf_entry *, void *); static void lf_split(struct lockf *, struct lockf_entry *, struct lockf_entry *, struct lockf_entry_list *); #ifdef LOCKF_DEBUG static int graph_reaches(struct owner_vertex *x, struct owner_vertex *y, struct owner_vertex_list *path); static void graph_check(struct owner_graph *g, int checkorder); static void graph_print_vertices(struct owner_vertex_list *set); #endif static int graph_delta_forward(struct owner_graph *g, struct owner_vertex *x, struct owner_vertex *y, struct owner_vertex_list *delta); static int graph_delta_backward(struct owner_graph *g, struct owner_vertex *x, struct owner_vertex *y, struct owner_vertex_list *delta); static int graph_add_indices(int *indices, int n, struct owner_vertex_list *set); static int graph_assign_indices(struct owner_graph *g, int *indices, int nextunused, struct owner_vertex_list *set); static int graph_add_edge(struct owner_graph *g, struct owner_vertex *x, struct owner_vertex *y); static void graph_remove_edge(struct owner_graph *g, struct owner_vertex *x, struct owner_vertex *y); static struct owner_vertex 
*graph_alloc_vertex(struct owner_graph *g, struct lock_owner *lo); static void graph_free_vertex(struct owner_graph *g, struct owner_vertex *v); static struct owner_graph * graph_init(struct owner_graph *g); #ifdef LOCKF_DEBUG static void lf_print(char *, struct lockf_entry *); static void lf_printlist(char *, struct lockf_entry *); static void lf_print_owner(struct lock_owner *); #endif /* * This structure is used to keep track of both local and remote lock * owners. The lf_owner field of the struct lockf_entry points back at * the lock owner structure. Each possible lock owner (local proc for * POSIX fcntl locks, local file for BSD flock locks or * pair for remote locks) is represented by a unique instance of * struct lock_owner. * * If a lock owner has a lock that blocks some other lock or a lock * that is waiting for some other lock, it also has a vertex in the * owner_graph below. * * Locks: * (s) locked by state->ls_lock * (S) locked by lf_lock_states_lock * (l) locked by lf_lock_owners_lock * (g) locked by lf_owner_graph_lock * (c) const until freeing */ #define LOCK_OWNER_HASH_SIZE 256 struct lock_owner { LIST_ENTRY(lock_owner) lo_link; /* (l) hash chain */ int lo_refs; /* (l) Number of locks referring to this */ int lo_flags; /* (c) Flags passwd to lf_advlock */ caddr_t lo_id; /* (c) Id value passed to lf_advlock */ pid_t lo_pid; /* (c) Process Id of the lock owner */ int lo_sysid; /* (c) System Id of the lock owner */ struct owner_vertex *lo_vertex; /* (g) entry in deadlock graph */ }; LIST_HEAD(lock_owner_list, lock_owner); static struct sx lf_lock_states_lock; static struct lockf_list lf_lock_states; /* (S) */ static struct sx lf_lock_owners_lock; static struct lock_owner_list lf_lock_owners[LOCK_OWNER_HASH_SIZE]; /* (l) */ /* * Structures for deadlock detection. * * We have two types of directed graph, the first is the set of locks, * both active and pending on a vnode. Within this graph, active locks * are terminal nodes in the graph (i.e. have no out-going * edges). Pending locks have out-going edges to each blocking active * lock that prevents the lock from being granted and also to each * older pending lock that would block them if it was active. The * graph for each vnode is naturally acyclic; new edges are only ever * added to or from new nodes (either new pending locks which only add * out-going edges or new active locks which only add in-coming edges) * therefore they cannot create loops in the lock graph. * * The second graph is a global graph of lock owners. Each lock owner * is a vertex in that graph and an edge is added to the graph * whenever an edge is added to a vnode graph, with end points * corresponding to owner of the new pending lock and the owner of the * lock upon which it waits. In order to prevent deadlock, we only add * an edge to this graph if the new edge would not create a cycle. * * The lock owner graph is topologically sorted, i.e. if a node has * any outgoing edges, then it has an order strictly less than any * node to which it has an outgoing edge. We preserve this ordering * (and detect cycles) on edge insertion using Algorithm PK from the * paper "A Dynamic Topological Sort Algorithm for Directed Acyclic * Graphs" (ACM Journal of Experimental Algorithms, Vol 11, Article * No. 
1.7) */ struct owner_vertex; struct owner_edge { LIST_ENTRY(owner_edge) e_outlink; /* (g) link from's out-edge list */ LIST_ENTRY(owner_edge) e_inlink; /* (g) link to's in-edge list */ int e_refs; /* (g) number of times added */ struct owner_vertex *e_from; /* (c) out-going from here */ struct owner_vertex *e_to; /* (c) in-coming to here */ }; LIST_HEAD(owner_edge_list, owner_edge); struct owner_vertex { TAILQ_ENTRY(owner_vertex) v_link; /* (g) workspace for edge insertion */ uint32_t v_gen; /* (g) workspace for edge insertion */ int v_order; /* (g) order of vertex in graph */ struct owner_edge_list v_outedges;/* (g) list of out-edges */ struct owner_edge_list v_inedges; /* (g) list of in-edges */ struct lock_owner *v_owner; /* (c) corresponding lock owner */ }; TAILQ_HEAD(owner_vertex_list, owner_vertex); struct owner_graph { struct owner_vertex** g_vertices; /* (g) pointers to vertices */ int g_size; /* (g) number of vertices */ int g_space; /* (g) space allocated for vertices */ int *g_indexbuf; /* (g) workspace for loop detection */ uint32_t g_gen; /* (g) increment when re-ordering */ }; static struct sx lf_owner_graph_lock; static struct owner_graph lf_owner_graph; /* * Initialise various structures and locks. */ static void lf_init(void *dummy) { int i; sx_init(&lf_lock_states_lock, "lock states lock"); LIST_INIT(&lf_lock_states); sx_init(&lf_lock_owners_lock, "lock owners lock"); for (i = 0; i < LOCK_OWNER_HASH_SIZE; i++) LIST_INIT(&lf_lock_owners[i]); sx_init(&lf_owner_graph_lock, "owner graph lock"); graph_init(&lf_owner_graph); } SYSINIT(lf_init, SI_SUB_LOCK, SI_ORDER_FIRST, lf_init, NULL); /* * Generate a hash value for a lock owner. */ static int lf_hash_owner(caddr_t id, struct flock *fl, int flags) { uint32_t h; if (flags & F_REMOTE) { h = HASHSTEP(0, fl->l_pid); h = HASHSTEP(h, fl->l_sysid); } else if (flags & F_FLOCK) { h = ((uintptr_t) id) >> 7; } else { struct proc *p = (struct proc *) id; h = HASHSTEP(0, p->p_pid); h = HASHSTEP(h, 0); } return (h % LOCK_OWNER_HASH_SIZE); } /* * Return true if a lock owner matches the details passed to * lf_advlock. */ static int lf_owner_matches(struct lock_owner *lo, caddr_t id, struct flock *fl, int flags) { if (flags & F_REMOTE) { return lo->lo_pid == fl->l_pid && lo->lo_sysid == fl->l_sysid; } else { return lo->lo_id == id; } } static struct lockf_entry * lf_alloc_lock(struct lock_owner *lo) { struct lockf_entry *lf; lf = malloc(sizeof(struct lockf_entry), M_LOCKF, M_WAITOK|M_ZERO); #ifdef LOCKF_DEBUG if (lockf_debug & 4) printf("Allocated lock %p\n", lf); #endif if (lo) { sx_xlock(&lf_lock_owners_lock); lo->lo_refs++; sx_xunlock(&lf_lock_owners_lock); lf->lf_owner = lo; } return (lf); } static int lf_free_lock(struct lockf_entry *lock) { KASSERT(lock->lf_refs > 0, ("lockf_entry negative ref count %p", lock)); if (--lock->lf_refs > 0) return (0); /* * Adjust the lock_owner reference count and * reclaim the entry if this is the last lock * for that owner. 
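A minimal userspace sketch of the owner hashing above, for illustration only: hashstep() stands in for the HASHSTEP() macro from <sys/hash.h> (its exact definition is an assumption here), while the bucketing of a POSIX owner by pid mirrors lf_hash_owner().

#include <stdint.h>
#include <stdio.h>

#define LOCK_OWNER_HASH_SIZE 256

static uint32_t
hashstep(uint32_t h, uint32_t c)
{
        return ((h << 5) + h + c);              /* assumed h * 33 + c step */
}

static int
hash_posix_owner(int pid)
{
        uint32_t h;

        h = hashstep(0, (uint32_t)pid);         /* POSIX locks hash the owning pid */
        h = hashstep(h, 0);
        return (h % LOCK_OWNER_HASH_SIZE);
}

int
main(void)
{
        printf("pid 100 -> bucket %d\n", hash_posix_owner(100));
        printf("pid 101 -> bucket %d\n", hash_posix_owner(101));
        return (0);
}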
*/ struct lock_owner *lo = lock->lf_owner; if (lo) { KASSERT(LIST_EMPTY(&lock->lf_outedges), ("freeing lock with dependencies")); KASSERT(LIST_EMPTY(&lock->lf_inedges), ("freeing lock with dependants")); sx_xlock(&lf_lock_owners_lock); KASSERT(lo->lo_refs > 0, ("lock owner refcount")); lo->lo_refs--; if (lo->lo_refs == 0) { #ifdef LOCKF_DEBUG if (lockf_debug & 1) printf("lf_free_lock: freeing lock owner %p\n", lo); #endif if (lo->lo_vertex) { sx_xlock(&lf_owner_graph_lock); graph_free_vertex(&lf_owner_graph, lo->lo_vertex); sx_xunlock(&lf_owner_graph_lock); } LIST_REMOVE(lo, lo_link); free(lo, M_LOCKF); #ifdef LOCKF_DEBUG if (lockf_debug & 4) printf("Freed lock owner %p\n", lo); #endif } sx_unlock(&lf_lock_owners_lock); } if ((lock->lf_flags & F_REMOTE) && lock->lf_vnode) { vrele(lock->lf_vnode); lock->lf_vnode = NULL; } #ifdef LOCKF_DEBUG if (lockf_debug & 4) printf("Freed lock %p\n", lock); #endif free(lock, M_LOCKF); return (1); } /* * Advisory record locking support */ int lf_advlockasync(struct vop_advlockasync_args *ap, struct lockf **statep, u_quad_t size) { struct lockf *state, *freestate = NULL; struct flock *fl = ap->a_fl; struct lockf_entry *lock; struct vnode *vp = ap->a_vp; caddr_t id = ap->a_id; int flags = ap->a_flags; int hash; struct lock_owner *lo; off_t start, end, oadd; int error; /* * Handle the F_UNLKSYS case first - no need to mess about * creating a lock owner for this one. */ if (ap->a_op == F_UNLCKSYS) { lf_clearremotesys(fl->l_sysid); return (0); } /* * Convert the flock structure into a start and end. */ switch (fl->l_whence) { case SEEK_SET: case SEEK_CUR: /* * Caller is responsible for adding any necessary offset * when SEEK_CUR is used. */ start = fl->l_start; break; case SEEK_END: if (size > OFF_MAX || (fl->l_start > 0 && size > OFF_MAX - fl->l_start)) return (EOVERFLOW); start = size + fl->l_start; break; default: return (EINVAL); } if (start < 0) return (EINVAL); if (fl->l_len < 0) { if (start == 0) return (EINVAL); end = start - 1; start += fl->l_len; if (start < 0) return (EINVAL); } else if (fl->l_len == 0) { end = OFF_MAX; } else { oadd = fl->l_len - 1; if (oadd > OFF_MAX - start) return (EOVERFLOW); end = start + oadd; } retry_setlock: /* * Avoid the common case of unlocking when inode has no locks. */ VI_LOCK(vp); if ((*statep) == NULL) { if (ap->a_op != F_SETLK) { fl->l_type = F_UNLCK; VI_UNLOCK(vp); return (0); } } VI_UNLOCK(vp); /* * Map our arguments to an existing lock owner or create one * if this is the first time we have seen this owner. */ hash = lf_hash_owner(id, fl, flags); sx_xlock(&lf_lock_owners_lock); LIST_FOREACH(lo, &lf_lock_owners[hash], lo_link) if (lf_owner_matches(lo, id, fl, flags)) break; if (!lo) { /* * We initialise the lock with a reference * count which matches the new lockf_entry * structure created below. 
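The flock-to-range conversion above is easy to get wrong around negative and zero l_len values; the following self-contained sketch (plain int64_t stand-ins, OFFMAX in place of OFF_MAX) reproduces only that arithmetic for the SEEK_SET case.

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

#define OFFMAX INT64_MAX                        /* stand-in for OFF_MAX */

static int
flock_range(int64_t l_start, int64_t l_len, int64_t *startp, int64_t *endp)
{
        int64_t start, end, oadd;

        start = l_start;                        /* SEEK_END would add the file size first */
        if (start < 0)
                return (EINVAL);
        if (l_len < 0) {
                /* A negative length locks the l_len bytes before l_start. */
                if (start == 0)
                        return (EINVAL);
                end = start - 1;
                start += l_len;
                if (start < 0)
                        return (EINVAL);
        } else if (l_len == 0) {
                end = OFFMAX;                   /* zero length means "to end of file" */
        } else {
                oadd = l_len - 1;
                if (oadd > OFFMAX - start)
                        return (EOVERFLOW);
                end = start + oadd;
        }
        *startp = start;
        *endp = end;
        return (0);
}

int
main(void)
{
        int64_t s, e;

        flock_range(100, -10, &s, &e);          /* yields the range [90, 99] */
        printf("start %lld end %lld\n", (long long)s, (long long)e);
        return (0);
}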
*/ lo = malloc(sizeof(struct lock_owner), M_LOCKF, M_WAITOK|M_ZERO); #ifdef LOCKF_DEBUG if (lockf_debug & 4) printf("Allocated lock owner %p\n", lo); #endif lo->lo_refs = 1; lo->lo_flags = flags; lo->lo_id = id; if (flags & F_REMOTE) { lo->lo_pid = fl->l_pid; lo->lo_sysid = fl->l_sysid; } else if (flags & F_FLOCK) { lo->lo_pid = -1; lo->lo_sysid = 0; } else { struct proc *p = (struct proc *) id; lo->lo_pid = p->p_pid; lo->lo_sysid = 0; } lo->lo_vertex = NULL; #ifdef LOCKF_DEBUG if (lockf_debug & 1) { printf("lf_advlockasync: new lock owner %p ", lo); lf_print_owner(lo); printf("\n"); } #endif LIST_INSERT_HEAD(&lf_lock_owners[hash], lo, lo_link); } else { /* * We have seen this lock owner before, increase its * reference count to account for the new lockf_entry * structure we create below. */ lo->lo_refs++; } sx_xunlock(&lf_lock_owners_lock); /* * Create the lockf structure. We initialise the lf_owner * field here instead of in lf_alloc_lock() to avoid paying * the lf_lock_owners_lock tax twice. */ lock = lf_alloc_lock(NULL); lock->lf_refs = 1; lock->lf_start = start; lock->lf_end = end; lock->lf_owner = lo; lock->lf_vnode = vp; if (flags & F_REMOTE) { /* * For remote locks, the caller may release its ref to * the vnode at any time - we have to ref it here to * prevent it from being recycled unexpectedly. */ vref(vp); } /* * XXX The problem is that VTOI is ufs specific, so it will * break LOCKF_DEBUG for all other FS's other than UFS because * it casts the vnode->data ptr to struct inode *. */ /* lock->lf_inode = VTOI(ap->a_vp); */ lock->lf_inode = (struct inode *)0; lock->lf_type = fl->l_type; LIST_INIT(&lock->lf_outedges); LIST_INIT(&lock->lf_inedges); lock->lf_async_task = ap->a_task; lock->lf_flags = ap->a_flags; /* * Do the requested operation. First find our state structure * and create a new one if necessary - the caller's *statep * variable and the state's ls_threads count is protected by * the vnode interlock. */ VI_LOCK(vp); if (vp->v_iflag & VI_DOOMED) { VI_UNLOCK(vp); lf_free_lock(lock); return (ENOENT); } /* * Allocate a state structure if necessary. */ state = *statep; if (state == NULL) { struct lockf *ls; VI_UNLOCK(vp); ls = malloc(sizeof(struct lockf), M_LOCKF, M_WAITOK|M_ZERO); sx_init(&ls->ls_lock, "ls_lock"); LIST_INIT(&ls->ls_active); LIST_INIT(&ls->ls_pending); ls->ls_threads = 1; sx_xlock(&lf_lock_states_lock); LIST_INSERT_HEAD(&lf_lock_states, ls, ls_link); sx_xunlock(&lf_lock_states_lock); /* * Cope if we lost a race with some other thread while * trying to allocate memory. */ VI_LOCK(vp); if (vp->v_iflag & VI_DOOMED) { VI_UNLOCK(vp); sx_xlock(&lf_lock_states_lock); LIST_REMOVE(ls, ls_link); sx_xunlock(&lf_lock_states_lock); sx_destroy(&ls->ls_lock); free(ls, M_LOCKF); lf_free_lock(lock); return (ENOENT); } if ((*statep) == NULL) { state = *statep = ls; VI_UNLOCK(vp); } else { state = *statep; state->ls_threads++; VI_UNLOCK(vp); sx_xlock(&lf_lock_states_lock); LIST_REMOVE(ls, ls_link); sx_xunlock(&lf_lock_states_lock); sx_destroy(&ls->ls_lock); free(ls, M_LOCKF); } } else { state->ls_threads++; VI_UNLOCK(vp); } sx_xlock(&state->ls_lock); /* * Recheck the doomed vnode after state->ls_lock is * locked. lf_purgelocks() requires that no new threads add * pending locks when vnode is marked by VI_DOOMED flag. 
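The state allocation above follows a common pattern: drop the interlock before a sleeping allocation, then re-check under the lock and throw the new object away if another thread installed one first. A minimal pthread sketch of the same shape (names are illustrative, not kernel APIs):

#include <pthread.h>
#include <stdlib.h>

struct state { int dummy; };

static pthread_mutex_t interlock = PTHREAD_MUTEX_INITIALIZER;
static struct state *statep;                    /* protected by interlock */

static struct state *
get_state(void)
{
        struct state *ls, *ret;

        pthread_mutex_lock(&interlock);
        if (statep != NULL) {
                ret = statep;
                pthread_mutex_unlock(&interlock);
                return (ret);
        }
        pthread_mutex_unlock(&interlock);

        /* May block, so done without the lock (M_WAITOK allocations never fail). */
        ls = calloc(1, sizeof(*ls));

        pthread_mutex_lock(&interlock);
        if (statep == NULL)
                statep = ls;                    /* we won the race */
        else
                free(ls);                       /* lost the race; use the winner's state */
        ret = statep;
        pthread_mutex_unlock(&interlock);
        return (ret);
}

int
main(void)
{
        return (get_state() != NULL ? 0 : 1);
}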
*/ VI_LOCK(vp); if (vp->v_iflag & VI_DOOMED) { state->ls_threads--; wakeup(state); VI_UNLOCK(vp); sx_xunlock(&state->ls_lock); lf_free_lock(lock); return (ENOENT); } VI_UNLOCK(vp); switch (ap->a_op) { case F_SETLK: error = lf_setlock(state, lock, vp, ap->a_cookiep); break; case F_UNLCK: error = lf_clearlock(state, lock); lf_free_lock(lock); break; case F_GETLK: error = lf_getlock(state, lock, fl); lf_free_lock(lock); break; case F_CANCEL: if (ap->a_cookiep) error = lf_cancel(state, lock, *ap->a_cookiep); else error = EINVAL; lf_free_lock(lock); break; default: lf_free_lock(lock); error = EINVAL; break; } #ifdef INVARIANTS /* * Check for some can't happen stuff. In this case, the active * lock list becoming disordered or containing mutually * blocking locks. We also check the pending list for locks * which should be active (i.e. have no out-going edges). */ LIST_FOREACH(lock, &state->ls_active, lf_link) { struct lockf_entry *lf; if (LIST_NEXT(lock, lf_link)) KASSERT((lock->lf_start <= LIST_NEXT(lock, lf_link)->lf_start), ("locks disordered")); LIST_FOREACH(lf, &state->ls_active, lf_link) { if (lock == lf) break; KASSERT(!lf_blocks(lock, lf), ("two conflicting active locks")); if (lock->lf_owner == lf->lf_owner) KASSERT(!lf_overlaps(lock, lf), ("two overlapping locks from same owner")); } } LIST_FOREACH(lock, &state->ls_pending, lf_link) { KASSERT(!LIST_EMPTY(&lock->lf_outedges), ("pending lock which should be active")); } #endif sx_xunlock(&state->ls_lock); /* * If we have removed the last active lock on the vnode and * this is the last thread that was in-progress, we can free * the state structure. We update the caller's pointer inside * the vnode interlock but call free outside. * * XXX alternatively, keep the state structure around until * the filesystem recycles - requires a callback from the * filesystem. */ VI_LOCK(vp); state->ls_threads--; wakeup(state); if (LIST_EMPTY(&state->ls_active) && state->ls_threads == 0) { KASSERT(LIST_EMPTY(&state->ls_pending), ("freeing state with pending locks")); freestate = state; *statep = NULL; } VI_UNLOCK(vp); if (freestate != NULL) { sx_xlock(&lf_lock_states_lock); LIST_REMOVE(freestate, ls_link); sx_xunlock(&lf_lock_states_lock); sx_destroy(&freestate->ls_lock); free(freestate, M_LOCKF); freestate = NULL; } if (error == EDOOFUS) { KASSERT(ap->a_op == F_SETLK, ("EDOOFUS")); goto retry_setlock; } return (error); } int lf_advlock(struct vop_advlock_args *ap, struct lockf **statep, u_quad_t size) { struct vop_advlockasync_args a; a.a_vp = ap->a_vp; a.a_id = ap->a_id; a.a_op = ap->a_op; a.a_fl = ap->a_fl; a.a_flags = ap->a_flags; a.a_task = NULL; a.a_cookiep = NULL; return (lf_advlockasync(&a, statep, size)); } void lf_purgelocks(struct vnode *vp, struct lockf **statep) { struct lockf *state; struct lockf_entry *lock, *nlock; /* * For this to work correctly, the caller must ensure that no * other threads enter the locking system for this vnode, * e.g. by checking VI_DOOMED. We wake up any threads that are * sleeping waiting for locks on this vnode and then free all * the remaining locks. 
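For context, a hypothetical filesystem caller might delegate its advisory-locking VOP to lf_advlock() roughly as follows. This is a non-compilable fragment for illustration only; examplefs_filesize() is an assumed helper and none of this is part of the commit.

static int
examplefs_advlock(struct vop_advlock_args *ap)
{
        struct vnode *vp = ap->a_vp;
        u_quad_t size;

        size = examplefs_filesize(vp);          /* assumed helper returning the file size */
        return (lf_advlock(ap, &vp->v_lockf, size));
}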
*/ VI_LOCK(vp); KASSERT(vp->v_iflag & VI_DOOMED, ("lf_purgelocks: vp %p has not vgone yet", vp)); state = *statep; if (state) { *statep = NULL; state->ls_threads++; VI_UNLOCK(vp); sx_xlock(&state->ls_lock); sx_xlock(&lf_owner_graph_lock); LIST_FOREACH_SAFE(lock, &state->ls_pending, lf_link, nlock) { LIST_REMOVE(lock, lf_link); lf_remove_outgoing(lock); lf_remove_incoming(lock); /* * If its an async lock, we can just free it * here, otherwise we let the sleeping thread * free it. */ if (lock->lf_async_task) { lf_free_lock(lock); } else { lock->lf_flags |= F_INTR; wakeup(lock); } } sx_xunlock(&lf_owner_graph_lock); sx_xunlock(&state->ls_lock); /* * Wait for all other threads, sleeping and otherwise * to leave. */ VI_LOCK(vp); while (state->ls_threads > 1) msleep(state, VI_MTX(vp), 0, "purgelocks", 0); VI_UNLOCK(vp); /* * We can just free all the active locks since they * will have no dependencies (we removed them all * above). We don't need to bother locking since we * are the last thread using this state structure. */ KASSERT(LIST_EMPTY(&state->ls_pending), ("lock pending for %p", state)); LIST_FOREACH_SAFE(lock, &state->ls_active, lf_link, nlock) { LIST_REMOVE(lock, lf_link); lf_free_lock(lock); } sx_xlock(&lf_lock_states_lock); LIST_REMOVE(state, ls_link); sx_xunlock(&lf_lock_states_lock); sx_destroy(&state->ls_lock); free(state, M_LOCKF); } else { VI_UNLOCK(vp); } } /* * Return non-zero if locks 'x' and 'y' overlap. */ static int lf_overlaps(struct lockf_entry *x, struct lockf_entry *y) { return (x->lf_start <= y->lf_end && x->lf_end >= y->lf_start); } /* * Return non-zero if lock 'x' is blocked by lock 'y' (or vice versa). */ static int lf_blocks(struct lockf_entry *x, struct lockf_entry *y) { return x->lf_owner != y->lf_owner && (x->lf_type == F_WRLCK || y->lf_type == F_WRLCK) && lf_overlaps(x, y); } /* * Allocate a lock edge from the free list */ static struct lockf_edge * lf_alloc_edge(void) { return (malloc(sizeof(struct lockf_edge), M_LOCKF, M_WAITOK|M_ZERO)); } /* * Free a lock edge. */ static void lf_free_edge(struct lockf_edge *e) { free(e, M_LOCKF); } /* * Ensure that the lock's owner has a corresponding vertex in the * owner graph. */ static void lf_alloc_vertex(struct lockf_entry *lock) { struct owner_graph *g = &lf_owner_graph; if (!lock->lf_owner->lo_vertex) lock->lf_owner->lo_vertex = graph_alloc_vertex(g, lock->lf_owner); } /* * Attempt to record an edge from lock x to lock y. Return EDEADLK if * the new edge would cause a cycle in the owner graph. */ static int lf_add_edge(struct lockf_entry *x, struct lockf_entry *y) { struct owner_graph *g = &lf_owner_graph; struct lockf_edge *e; int error; #ifdef INVARIANTS LIST_FOREACH(e, &x->lf_outedges, le_outlink) KASSERT(e->le_to != y, ("adding lock edge twice")); #endif /* * Make sure the two owners have entries in the owner graph. */ lf_alloc_vertex(x); lf_alloc_vertex(y); error = graph_add_edge(g, x->lf_owner->lo_vertex, y->lf_owner->lo_vertex); if (error) return (error); e = lf_alloc_edge(); LIST_INSERT_HEAD(&x->lf_outedges, e, le_outlink); LIST_INSERT_HEAD(&y->lf_inedges, e, le_inlink); e->le_from = x; e->le_to = y; return (0); } /* * Remove an edge from the lock graph. 
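A self-contained sketch of the two predicates above: ranges are inclusive [lf_start, lf_end], and two locks conflict only when they have different owners, at least one of them is exclusive, and their ranges overlap.

#include <assert.h>
#include <stdint.h>

struct range_lock {
        int64_t start, end;                     /* inclusive byte range */
        int     owner;
        int     exclusive;                      /* F_WRLCK-style lock */
};

static int
overlaps(const struct range_lock *x, const struct range_lock *y)
{
        return (x->start <= y->end && x->end >= y->start);
}

static int
blocks(const struct range_lock *x, const struct range_lock *y)
{
        return (x->owner != y->owner &&
            (x->exclusive || y->exclusive) &&
            overlaps(x, y));
}

int
main(void)
{
        struct range_lock a = { 0, 9, 1, 0 };   /* shared [0,9], owner 1 */
        struct range_lock b = { 5, 5, 2, 1 };   /* exclusive [5,5], owner 2 */
        struct range_lock c = { 5, 5, 1, 1 };   /* exclusive, but same owner as a */

        assert(blocks(&a, &b));                 /* shared vs exclusive, overlapping */
        assert(!blocks(&a, &c));                /* an owner never blocks itself */
        return (0);
}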
*/ static void lf_remove_edge(struct lockf_edge *e) { struct owner_graph *g = &lf_owner_graph; struct lockf_entry *x = e->le_from; struct lockf_entry *y = e->le_to; graph_remove_edge(g, x->lf_owner->lo_vertex, y->lf_owner->lo_vertex); LIST_REMOVE(e, le_outlink); LIST_REMOVE(e, le_inlink); e->le_from = NULL; e->le_to = NULL; lf_free_edge(e); } /* * Remove all out-going edges from lock x. */ static void lf_remove_outgoing(struct lockf_entry *x) { struct lockf_edge *e; while ((e = LIST_FIRST(&x->lf_outedges)) != NULL) { lf_remove_edge(e); } } /* * Remove all in-coming edges from lock x. */ static void lf_remove_incoming(struct lockf_entry *x) { struct lockf_edge *e; while ((e = LIST_FIRST(&x->lf_inedges)) != NULL) { lf_remove_edge(e); } } /* * Walk the list of locks for the file and create an out-going edge * from lock to each blocking lock. */ static int lf_add_outgoing(struct lockf *state, struct lockf_entry *lock) { struct lockf_entry *overlap; int error; LIST_FOREACH(overlap, &state->ls_active, lf_link) { /* * We may assume that the active list is sorted by * lf_start. */ if (overlap->lf_start > lock->lf_end) break; if (!lf_blocks(lock, overlap)) continue; /* * We've found a blocking lock. Add the corresponding * edge to the graphs and see if it would cause a * deadlock. */ error = lf_add_edge(lock, overlap); /* * The only error that lf_add_edge returns is EDEADLK. * Remove any edges we added and return the error. */ if (error) { lf_remove_outgoing(lock); return (error); } } /* * We also need to add edges to sleeping locks that block * us. This ensures that lf_wakeup_lock cannot grant two * mutually blocking locks simultaneously and also enforces a * 'first come, first served' fairness model. Note that this * only happens if we are blocked by at least one active lock * due to the call to lf_getblock in lf_setlock below. */ LIST_FOREACH(overlap, &state->ls_pending, lf_link) { if (!lf_blocks(lock, overlap)) continue; /* * We've found a blocking lock. Add the corresponding * edge to the graphs and see if it would cause a * deadlock. */ error = lf_add_edge(lock, overlap); /* * The only error that lf_add_edge returns is EDEADLK. * Remove any edges we added and return the error. */ if (error) { lf_remove_outgoing(lock); return (error); } } return (0); } /* * Walk the list of pending locks for the file and create an in-coming * edge from lock to each blocking lock. */ static int lf_add_incoming(struct lockf *state, struct lockf_entry *lock) { struct lockf_entry *overlap; int error; LIST_FOREACH(overlap, &state->ls_pending, lf_link) { if (!lf_blocks(lock, overlap)) continue; /* * We've found a blocking lock. Add the corresponding * edge to the graphs and see if it would cause a * deadlock. */ error = lf_add_edge(overlap, lock); /* * The only error that lf_add_edge returns is EDEADLK. * Remove any edges we added and return the error. */ if (error) { lf_remove_incoming(lock); return (error); } } return (0); } /* * Insert lock into the active list, keeping list entries ordered by * increasing values of lf_start. 
*/ static void lf_insert_lock(struct lockf *state, struct lockf_entry *lock) { struct lockf_entry *lf, *lfprev; if (LIST_EMPTY(&state->ls_active)) { LIST_INSERT_HEAD(&state->ls_active, lock, lf_link); return; } lfprev = NULL; LIST_FOREACH(lf, &state->ls_active, lf_link) { if (lf->lf_start > lock->lf_start) { LIST_INSERT_BEFORE(lf, lock, lf_link); return; } lfprev = lf; } LIST_INSERT_AFTER(lfprev, lock, lf_link); } /* * Wake up a sleeping lock and remove it from the pending list now * that all its dependencies have been resolved. The caller should * arrange for the lock to be added to the active list, adjusting any * existing locks for the same owner as needed. */ static void lf_wakeup_lock(struct lockf *state, struct lockf_entry *wakelock) { /* * Remove from ls_pending list and wake up the caller * or start the async notification, as appropriate. */ LIST_REMOVE(wakelock, lf_link); #ifdef LOCKF_DEBUG if (lockf_debug & 1) lf_print("lf_wakeup_lock: awakening", wakelock); #endif /* LOCKF_DEBUG */ if (wakelock->lf_async_task) { taskqueue_enqueue(taskqueue_thread, wakelock->lf_async_task); } else { wakeup(wakelock); } } /* * Re-check all dependent locks and remove edges to locks that we no * longer block. If 'all' is non-zero, the lock has been removed and * we must remove all the dependencies, otherwise it has simply been * reduced but remains active. Any pending locks which have been been * unblocked are added to 'granted' */ static void lf_update_dependancies(struct lockf *state, struct lockf_entry *lock, int all, struct lockf_entry_list *granted) { struct lockf_edge *e, *ne; struct lockf_entry *deplock; LIST_FOREACH_SAFE(e, &lock->lf_inedges, le_inlink, ne) { deplock = e->le_from; if (all || !lf_blocks(lock, deplock)) { sx_xlock(&lf_owner_graph_lock); lf_remove_edge(e); sx_xunlock(&lf_owner_graph_lock); if (LIST_EMPTY(&deplock->lf_outedges)) { lf_wakeup_lock(state, deplock); LIST_INSERT_HEAD(granted, deplock, lf_link); } } } } /* * Set the start of an existing active lock, updating dependencies and * adding any newly woken locks to 'granted'. */ static void lf_set_start(struct lockf *state, struct lockf_entry *lock, off_t new_start, struct lockf_entry_list *granted) { KASSERT(new_start >= lock->lf_start, ("can't increase lock")); lock->lf_start = new_start; LIST_REMOVE(lock, lf_link); lf_insert_lock(state, lock); lf_update_dependancies(state, lock, FALSE, granted); } /* * Set the end of an existing active lock, updating dependencies and * adding any newly woken locks to 'granted'. */ static void lf_set_end(struct lockf *state, struct lockf_entry *lock, off_t new_end, struct lockf_entry_list *granted) { KASSERT(new_end <= lock->lf_end, ("can't increase lock")); lock->lf_end = new_end; lf_update_dependancies(state, lock, FALSE, granted); } /* * Add a lock to the active list, updating or removing any current * locks owned by the same owner and processing any pending locks that * become unblocked as a result. This code is also used for unlock * since the logic for updating existing locks is identical. * * As a result of processing the new lock, we may unblock existing * pending locks as a result of downgrading/unlocking. We simply * activate the newly granted locks by looping. * * Since the new lock already has its dependencies set up, we always * add it to the list (unless its an unlock request). This may * fragment the lock list in some pathological cases but its probably * not a real problem. 
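lf_insert_lock above keeps the active list sorted by ascending lf_start, inserting before the first entry with a larger start (so equal starts keep arrival order). A minimal userspace sketch of the same insertion:

#include <stdio.h>
#include <stdlib.h>

struct node {
        long            start;
        struct node     *next;
};

static void
insert_sorted(struct node **head, struct node *n)
{
        struct node **pp;

        for (pp = head; *pp != NULL && (*pp)->start <= n->start;
            pp = &(*pp)->next)
                ;
        n->next = *pp;
        *pp = n;
}

int
main(void)
{
        struct node *head = NULL, *n;
        long starts[] = { 30, 10, 20 };
        int i;

        for (i = 0; i < 3; i++) {
                n = calloc(1, sizeof(*n));
                n->start = starts[i];
                insert_sorted(&head, n);
        }
        for (n = head; n != NULL; n = n->next)
                printf("%ld ", n->start);       /* prints 10 20 30 */
        printf("\n");
        return (0);
}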
*/ static void lf_activate_lock(struct lockf *state, struct lockf_entry *lock) { struct lockf_entry *overlap, *lf; struct lockf_entry_list granted; int ovcase; LIST_INIT(&granted); LIST_INSERT_HEAD(&granted, lock, lf_link); while (!LIST_EMPTY(&granted)) { lock = LIST_FIRST(&granted); LIST_REMOVE(lock, lf_link); /* * Skip over locks owned by other processes. Handle * any locks that overlap and are owned by ourselves. */ overlap = LIST_FIRST(&state->ls_active); for (;;) { ovcase = lf_findoverlap(&overlap, lock, SELF); #ifdef LOCKF_DEBUG if (ovcase && (lockf_debug & 2)) { printf("lf_setlock: overlap %d", ovcase); lf_print("", overlap); } #endif /* * Six cases: * 0) no overlap * 1) overlap == lock * 2) overlap contains lock * 3) lock contains overlap * 4) overlap starts before lock * 5) overlap ends after lock */ switch (ovcase) { case 0: /* no overlap */ break; case 1: /* overlap == lock */ /* * We have already setup the * dependants for the new lock, taking * into account a possible downgrade * or unlock. Remove the old lock. */ LIST_REMOVE(overlap, lf_link); lf_update_dependancies(state, overlap, TRUE, &granted); lf_free_lock(overlap); break; case 2: /* overlap contains lock */ /* * Just split the existing lock. */ lf_split(state, overlap, lock, &granted); break; case 3: /* lock contains overlap */ /* * Delete the overlap and advance to * the next entry in the list. */ lf = LIST_NEXT(overlap, lf_link); LIST_REMOVE(overlap, lf_link); lf_update_dependancies(state, overlap, TRUE, &granted); lf_free_lock(overlap); overlap = lf; continue; case 4: /* overlap starts before lock */ /* * Just update the overlap end and * move on. */ lf_set_end(state, overlap, lock->lf_start - 1, &granted); overlap = LIST_NEXT(overlap, lf_link); continue; case 5: /* overlap ends after lock */ /* * Change the start of overlap and * re-insert. */ lf_set_start(state, overlap, lock->lf_end + 1, &granted); break; } break; } #ifdef LOCKF_DEBUG if (lockf_debug & 1) { if (lock->lf_type != F_UNLCK) lf_print("lf_activate_lock: activated", lock); else lf_print("lf_activate_lock: unlocked", lock); lf_printlist("lf_activate_lock", lock); } #endif /* LOCKF_DEBUG */ if (lock->lf_type != F_UNLCK) lf_insert_lock(state, lock); } } /* * Cancel a pending lock request, either as a result of a signal or a * cancel request for an async lock. */ static void lf_cancel_lock(struct lockf *state, struct lockf_entry *lock) { struct lockf_entry_list granted; /* * Note it is theoretically possible that cancelling this lock * may allow some other pending lock to become * active. Consider this case: * * Owner Action Result Dependencies * * A: lock [0..0] succeeds * B: lock [2..2] succeeds * C: lock [1..2] blocked C->B * D: lock [0..1] blocked C->B,D->A,D->C * A: unlock [0..0] C->B,D->C * C: cancel [1..2] */ LIST_REMOVE(lock, lf_link); /* * Removing out-going edges is simple. */ sx_xlock(&lf_owner_graph_lock); lf_remove_outgoing(lock); sx_xunlock(&lf_owner_graph_lock); /* * Removing in-coming edges may allow some other lock to * become active - we use lf_update_dependancies to figure * this out. */ LIST_INIT(&granted); lf_update_dependancies(state, lock, TRUE, &granted); lf_free_lock(lock); /* * Feed any newly active locks to lf_activate_lock. */ while (!LIST_EMPTY(&granted)) { lock = LIST_FIRST(&granted); LIST_REMOVE(lock, lf_link); lf_activate_lock(state, lock); } } /* * Set a byte-range lock. 
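lf_activate_lock and lf_cancel_lock above both drain a "granted" work list: handling one entry may unblock and append further entries, so the loop simply runs until the list is empty. A tiny sketch of that shape:

#include <stdio.h>

#define MAXWORK 16

static int work[MAXWORK];
static int nwork;

static void
push(int v)
{
        if (nwork < MAXWORK)
                work[nwork++] = v;
}

static void
process(int v)
{
        printf("activating %d\n", v);
        if (v > 0)
                push(v - 1);                    /* finishing one item may unblock another */
}

int
main(void)
{
        push(3);
        while (nwork > 0)
                process(work[--nwork]);
        return (0);
}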
*/ static int lf_setlock(struct lockf *state, struct lockf_entry *lock, struct vnode *vp, void **cookiep) { static char lockstr[] = "lockf"; - int priority, error; + int error, priority, stops_deferred; #ifdef LOCKF_DEBUG if (lockf_debug & 1) lf_print("lf_setlock", lock); #endif /* LOCKF_DEBUG */ /* * Set the priority */ priority = PLOCK; if (lock->lf_type == F_WRLCK) priority += 4; if (!(lock->lf_flags & F_NOINTR)) priority |= PCATCH; /* * Scan lock list for this file looking for locks that would block us. */ if (lf_getblock(state, lock)) { /* * Free the structure and return if nonblocking. */ if ((lock->lf_flags & F_WAIT) == 0 && lock->lf_async_task == NULL) { lf_free_lock(lock); error = EAGAIN; goto out; } /* * For flock type locks, we must first remove * any shared locks that we hold before we sleep * waiting for an exclusive lock. */ if ((lock->lf_flags & F_FLOCK) && lock->lf_type == F_WRLCK) { lock->lf_type = F_UNLCK; lf_activate_lock(state, lock); lock->lf_type = F_WRLCK; } /* * We are blocked. Create edges to each blocking lock, * checking for deadlock using the owner graph. For * simplicity, we run deadlock detection for all * locks, posix and otherwise. */ sx_xlock(&lf_owner_graph_lock); error = lf_add_outgoing(state, lock); sx_xunlock(&lf_owner_graph_lock); if (error) { #ifdef LOCKF_DEBUG if (lockf_debug & 1) lf_print("lf_setlock: deadlock", lock); #endif lf_free_lock(lock); goto out; } /* * We have added edges to everything that blocks * us. Sleep until they all go away. */ LIST_INSERT_HEAD(&state->ls_pending, lock, lf_link); #ifdef LOCKF_DEBUG if (lockf_debug & 1) { struct lockf_edge *e; LIST_FOREACH(e, &lock->lf_outedges, le_outlink) { lf_print("lf_setlock: blocking on", e->le_to); lf_printlist("lf_setlock", e->le_to); } } #endif /* LOCKF_DEBUG */ if ((lock->lf_flags & F_WAIT) == 0) { /* * The caller requested async notification - * this callback happens when the blocking * lock is released, allowing the caller to * make another attempt to take the lock. */ *cookiep = (void *) lock; error = EINPROGRESS; goto out; } lock->lf_refs++; + stops_deferred = sigdeferstop(SIGDEFERSTOP_ERESTART); error = sx_sleep(lock, &state->ls_lock, priority, lockstr, 0); + sigallowstop(stops_deferred); if (lf_free_lock(lock)) { error = EDOOFUS; goto out; } /* * We may have been awakened by a signal and/or by a * debugger continuing us (in which cases we must * remove our lock graph edges) and/or by another * process releasing a lock (in which case our edges * have already been removed and we have been moved to * the active list). We may also have been woken by * lf_purgelocks which we report to the caller as * EINTR. In that case, lf_purgelocks will have * removed our lock graph edges. * * Note that it is possible to receive a signal after * we were successfully woken (and moved to the active * list) but before we resumed execution. In this * case, our lf_outedges list will be clear. We * pretend there was no error. * * Note also, if we have been sleeping long enough, we * may now have incoming edges from some newer lock * which is waiting behind us in the queue. */ if (lock->lf_flags & F_INTR) { error = EINTR; lf_free_lock(lock); goto out; } if (LIST_EMPTY(&lock->lf_outedges)) { error = 0; } else { lf_cancel_lock(state, lock); goto out; } #ifdef LOCKF_DEBUG if (lockf_debug & 1) { lf_print("lf_setlock: granted", lock); } #endif goto out; } /* * It looks like we are going to grant the lock. First add * edges from any currently pending lock that the new lock * would block. 
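The functional change in this revision is the sigdeferstop(SIGDEFERSTOP_ERESTART)/sigallowstop() pair wrapped around the sx_sleep() above, which, as the names suggest, defers stop signals while the thread waits for the range lock and then restores the previous deferral state from the saved token. A loose userspace analogy of that save/block/restore bracket, using pthread_sigmask() (an analogy only, not the kernel API):

#include <pthread.h>
#include <signal.h>
#include <unistd.h>

static void
do_blocking_wait(void)
{
        sigset_t block, saved;

        sigemptyset(&block);
        sigaddset(&block, SIGTSTP);                     /* defer a stop-like signal */
        pthread_sigmask(SIG_BLOCK, &block, &saved);     /* save the previous state */

        sleep(1);                                       /* stands in for sx_sleep() */

        pthread_sigmask(SIG_SETMASK, &saved, NULL);     /* restore, like sigallowstop() */
}

int
main(void)
{
        do_blocking_wait();
        return (0);
}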
*/ sx_xlock(&lf_owner_graph_lock); error = lf_add_incoming(state, lock); sx_xunlock(&lf_owner_graph_lock); if (error) { #ifdef LOCKF_DEBUG if (lockf_debug & 1) lf_print("lf_setlock: deadlock", lock); #endif lf_free_lock(lock); goto out; } /* * No blocks!! Add the lock. Note that we will * downgrade or upgrade any overlapping locks this * process already owns. */ lf_activate_lock(state, lock); error = 0; out: return (error); } /* * Remove a byte-range lock on an inode. * * Generally, find the lock (or an overlap to that lock) * and remove it (or shrink it), then wakeup anyone we can. */ static int lf_clearlock(struct lockf *state, struct lockf_entry *unlock) { struct lockf_entry *overlap; overlap = LIST_FIRST(&state->ls_active); if (overlap == NOLOCKF) return (0); #ifdef LOCKF_DEBUG if (unlock->lf_type != F_UNLCK) panic("lf_clearlock: bad type"); if (lockf_debug & 1) lf_print("lf_clearlock", unlock); #endif /* LOCKF_DEBUG */ lf_activate_lock(state, unlock); return (0); } /* * Check whether there is a blocking lock, and if so return its * details in '*fl'. */ static int lf_getlock(struct lockf *state, struct lockf_entry *lock, struct flock *fl) { struct lockf_entry *block; #ifdef LOCKF_DEBUG if (lockf_debug & 1) lf_print("lf_getlock", lock); #endif /* LOCKF_DEBUG */ if ((block = lf_getblock(state, lock))) { fl->l_type = block->lf_type; fl->l_whence = SEEK_SET; fl->l_start = block->lf_start; if (block->lf_end == OFF_MAX) fl->l_len = 0; else fl->l_len = block->lf_end - block->lf_start + 1; fl->l_pid = block->lf_owner->lo_pid; fl->l_sysid = block->lf_owner->lo_sysid; } else { fl->l_type = F_UNLCK; } return (0); } /* * Cancel an async lock request. */ static int lf_cancel(struct lockf *state, struct lockf_entry *lock, void *cookie) { struct lockf_entry *reallock; /* * We need to match this request with an existing lock * request. */ LIST_FOREACH(reallock, &state->ls_pending, lf_link) { if ((void *) reallock == cookie) { /* * Double-check that this lock looks right * (maybe use a rolling ID for the cancel * cookie instead?) */ if (!(reallock->lf_vnode == lock->lf_vnode && reallock->lf_start == lock->lf_start && reallock->lf_end == lock->lf_end)) { return (ENOENT); } /* * Make sure this lock was async and then just * remove it from its wait lists. */ if (!reallock->lf_async_task) { return (ENOENT); } /* * Note that since any other thread must take * state->ls_lock before it can possibly * trigger the async callback, we are safe * from a race with lf_wakeup_lock, i.e. we * can free the lock (actually our caller does * this). */ lf_cancel_lock(state, reallock); return (0); } } /* * We didn't find a matching lock - not much we can do here. */ return (ENOENT); } /* * Walk the list of locks for an inode and * return the first blocking lock. */ static struct lockf_entry * lf_getblock(struct lockf *state, struct lockf_entry *lock) { struct lockf_entry *overlap; LIST_FOREACH(overlap, &state->ls_active, lf_link) { /* * We may assume that the active list is sorted by * lf_start. */ if (overlap->lf_start > lock->lf_end) break; if (!lf_blocks(lock, overlap)) continue; return (overlap); } return (NOLOCKF); } /* * Walk the list of locks for an inode to find an overlapping lock (if * any) and return a classification of that overlap. 
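lf_findoverlap's six-way classification can be captured by a small pure function; the sketch below uses inclusive ranges, an existing lock [os, oe] and the lock being processed [s, e], and performs the checks in the same order as the code that follows.

#include <assert.h>
#include <stdint.h>

static int
classify(int64_t os, int64_t oe, int64_t s, int64_t e)
{
        if (os > e || oe < s)
                return (0);                     /* no overlap */
        if (os == s && oe == e)
                return (1);                     /* overlap == lock */
        if (os <= s && oe >= e)
                return (2);                     /* overlap contains lock */
        if (s <= os && e >= oe)
                return (3);                     /* lock contains overlap */
        if (os < s && oe >= s)
                return (4);                     /* overlap starts before lock */
        return (5);                             /* overlap ends after lock */
}

int
main(void)
{
        assert(classify(3, 9, 3, 9) == 1);
        assert(classify(0, 9, 3, 5) == 2);
        assert(classify(3, 5, 0, 9) == 3);
        assert(classify(0, 4, 3, 9) == 4);
        assert(classify(5, 12, 3, 9) == 5);
        assert(classify(20, 30, 3, 9) == 0);
        return (0);
}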
* * Arguments: * *overlap The place in the lock list to start looking * lock The lock which is being tested * type Pass 'SELF' to test only locks with the same * owner as lock, or 'OTHER' to test only locks * with a different owner * * Returns one of six values: * 0) no overlap * 1) overlap == lock * 2) overlap contains lock * 3) lock contains overlap * 4) overlap starts before lock * 5) overlap ends after lock * * If there is an overlapping lock, '*overlap' is set to point at the * overlapping lock. * * NOTE: this returns only the FIRST overlapping lock. There * may be more than one. */ static int lf_findoverlap(struct lockf_entry **overlap, struct lockf_entry *lock, int type) { struct lockf_entry *lf; off_t start, end; int res; if ((*overlap) == NOLOCKF) { return (0); } #ifdef LOCKF_DEBUG if (lockf_debug & 2) lf_print("lf_findoverlap: looking for overlap in", lock); #endif /* LOCKF_DEBUG */ start = lock->lf_start; end = lock->lf_end; res = 0; while (*overlap) { lf = *overlap; if (lf->lf_start > end) break; if (((type & SELF) && lf->lf_owner != lock->lf_owner) || ((type & OTHERS) && lf->lf_owner == lock->lf_owner)) { *overlap = LIST_NEXT(lf, lf_link); continue; } #ifdef LOCKF_DEBUG if (lockf_debug & 2) lf_print("\tchecking", lf); #endif /* LOCKF_DEBUG */ /* * OK, check for overlap * * Six cases: * 0) no overlap * 1) overlap == lock * 2) overlap contains lock * 3) lock contains overlap * 4) overlap starts before lock * 5) overlap ends after lock */ if (start > lf->lf_end) { /* Case 0 */ #ifdef LOCKF_DEBUG if (lockf_debug & 2) printf("no overlap\n"); #endif /* LOCKF_DEBUG */ *overlap = LIST_NEXT(lf, lf_link); continue; } if (lf->lf_start == start && lf->lf_end == end) { /* Case 1 */ #ifdef LOCKF_DEBUG if (lockf_debug & 2) printf("overlap == lock\n"); #endif /* LOCKF_DEBUG */ res = 1; break; } if (lf->lf_start <= start && lf->lf_end >= end) { /* Case 2 */ #ifdef LOCKF_DEBUG if (lockf_debug & 2) printf("overlap contains lock\n"); #endif /* LOCKF_DEBUG */ res = 2; break; } if (start <= lf->lf_start && end >= lf->lf_end) { /* Case 3 */ #ifdef LOCKF_DEBUG if (lockf_debug & 2) printf("lock contains overlap\n"); #endif /* LOCKF_DEBUG */ res = 3; break; } if (lf->lf_start < start && lf->lf_end >= start) { /* Case 4 */ #ifdef LOCKF_DEBUG if (lockf_debug & 2) printf("overlap starts before lock\n"); #endif /* LOCKF_DEBUG */ res = 4; break; } if (lf->lf_start > start && lf->lf_end > end) { /* Case 5 */ #ifdef LOCKF_DEBUG if (lockf_debug & 2) printf("overlap ends after lock\n"); #endif /* LOCKF_DEBUG */ res = 5; break; } panic("lf_findoverlap: default"); } return (res); } /* * Split an the existing 'lock1', based on the extent of the lock * described by 'lock2'. The existing lock should cover 'lock2' * entirely. * * Any pending locks which have been been unblocked are added to * 'granted' */ static void lf_split(struct lockf *state, struct lockf_entry *lock1, struct lockf_entry *lock2, struct lockf_entry_list *granted) { struct lockf_entry *splitlock; #ifdef LOCKF_DEBUG if (lockf_debug & 2) { lf_print("lf_split", lock1); lf_print("splitting from", lock2); } #endif /* LOCKF_DEBUG */ /* * Check to see if we don't need to split at all. */ if (lock1->lf_start == lock2->lf_start) { lf_set_start(state, lock1, lock2->lf_end + 1, granted); return; } if (lock1->lf_end == lock2->lf_end) { lf_set_end(state, lock1, lock2->lf_start - 1, granted); return; } /* * Make a new lock consisting of the last part of * the encompassing lock. 
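The arithmetic performed by lf_split when the existing lock strictly contains the new one is simple but worth seeing in isolation: the original lock keeps the front piece and a new entry takes the tail. A minimal sketch:

#include <assert.h>
#include <stdint.h>

struct range { int64_t start, end; };           /* inclusive byte range */

static void
split(struct range *lock1, const struct range *lock2, struct range *tail)
{
        tail->start = lock2->end + 1;           /* new lock: the piece after lock2 */
        tail->end = lock1->end;
        lock1->end = lock2->start - 1;          /* original lock keeps the front piece */
}

int
main(void)
{
        struct range lock1 = { 0, 99 };
        struct range lock2 = { 40, 59 };        /* e.g. an unlock of [40, 59] */
        struct range tail;

        split(&lock1, &lock2, &tail);
        assert(lock1.start == 0 && lock1.end == 39);
        assert(tail.start == 60 && tail.end == 99);
        return (0);
}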
*/ splitlock = lf_alloc_lock(lock1->lf_owner); memcpy(splitlock, lock1, sizeof *splitlock); splitlock->lf_refs = 1; if (splitlock->lf_flags & F_REMOTE) vref(splitlock->lf_vnode); /* * This cannot cause a deadlock since any edges we would add * to splitlock already exist in lock1. We must be sure to add * necessary dependencies to splitlock before we reduce lock1 * otherwise we may accidentally grant a pending lock that * was blocked by the tail end of lock1. */ splitlock->lf_start = lock2->lf_end + 1; LIST_INIT(&splitlock->lf_outedges); LIST_INIT(&splitlock->lf_inedges); sx_xlock(&lf_owner_graph_lock); lf_add_incoming(state, splitlock); sx_xunlock(&lf_owner_graph_lock); lf_set_end(state, lock1, lock2->lf_start - 1, granted); /* * OK, now link it in */ lf_insert_lock(state, splitlock); } struct lockdesc { STAILQ_ENTRY(lockdesc) link; struct vnode *vp; struct flock fl; }; STAILQ_HEAD(lockdesclist, lockdesc); int lf_iteratelocks_sysid(int sysid, lf_iterator *fn, void *arg) { struct lockf *ls; struct lockf_entry *lf; struct lockdesc *ldesc; struct lockdesclist locks; int error; /* * In order to keep the locking simple, we iterate over the * active lock lists to build a list of locks that need * releasing. We then call the iterator for each one in turn. * * We take an extra reference to the vnode for the duration to * make sure it doesn't go away before we are finished. */ STAILQ_INIT(&locks); sx_xlock(&lf_lock_states_lock); LIST_FOREACH(ls, &lf_lock_states, ls_link) { sx_xlock(&ls->ls_lock); LIST_FOREACH(lf, &ls->ls_active, lf_link) { if (lf->lf_owner->lo_sysid != sysid) continue; ldesc = malloc(sizeof(struct lockdesc), M_LOCKF, M_WAITOK); ldesc->vp = lf->lf_vnode; vref(ldesc->vp); ldesc->fl.l_start = lf->lf_start; if (lf->lf_end == OFF_MAX) ldesc->fl.l_len = 0; else ldesc->fl.l_len = lf->lf_end - lf->lf_start + 1; ldesc->fl.l_whence = SEEK_SET; ldesc->fl.l_type = F_UNLCK; ldesc->fl.l_pid = lf->lf_owner->lo_pid; ldesc->fl.l_sysid = sysid; STAILQ_INSERT_TAIL(&locks, ldesc, link); } sx_xunlock(&ls->ls_lock); } sx_xunlock(&lf_lock_states_lock); /* * Call the iterator function for each lock in turn. If the * iterator returns an error code, just free the rest of the * lockdesc structures. */ error = 0; while ((ldesc = STAILQ_FIRST(&locks)) != NULL) { STAILQ_REMOVE_HEAD(&locks, link); if (!error) error = fn(ldesc->vp, &ldesc->fl, arg); vrele(ldesc->vp); free(ldesc, M_LOCKF); } return (error); } int lf_iteratelocks_vnode(struct vnode *vp, lf_iterator *fn, void *arg) { struct lockf *ls; struct lockf_entry *lf; struct lockdesc *ldesc; struct lockdesclist locks; int error; /* * In order to keep the locking simple, we iterate over the * active lock lists to build a list of locks that need * releasing. We then call the iterator for each one in turn. * * We take an extra reference to the vnode for the duration to * make sure it doesn't go away before we are finished. 
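lf_iteratelocks_sysid above and lf_iteratelocks_vnode below share one shape: snapshot the interesting entries while holding the lock, drop the lock, then run the possibly slow callback over the snapshot. A compact userspace sketch of that pattern (names illustrative):

#include <pthread.h>
#include <stdio.h>
#include <string.h>

#define NLOCKS 4

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static int active[NLOCKS] = { 1, 2, 3, 4 };     /* protected by list_lock */

typedef int (*iterator)(int item, void *arg);

static int
iterate(iterator fn, void *arg)
{
        int snapshot[NLOCKS], n, i, error;

        pthread_mutex_lock(&list_lock);
        memcpy(snapshot, active, sizeof(snapshot));
        n = NLOCKS;
        pthread_mutex_unlock(&list_lock);

        error = 0;
        for (i = 0; i < n && error == 0; i++)
                error = fn(snapshot[i], arg);   /* callback runs without the lock held */
        return (error);
}

static int
print_item(int item, void *arg)
{
        (void)arg;
        printf("%d\n", item);
        return (0);
}

int
main(void)
{
        return (iterate(print_item, NULL));
}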
*/ STAILQ_INIT(&locks); VI_LOCK(vp); ls = vp->v_lockf; if (!ls) { VI_UNLOCK(vp); return (0); } ls->ls_threads++; VI_UNLOCK(vp); sx_xlock(&ls->ls_lock); LIST_FOREACH(lf, &ls->ls_active, lf_link) { ldesc = malloc(sizeof(struct lockdesc), M_LOCKF, M_WAITOK); ldesc->vp = lf->lf_vnode; vref(ldesc->vp); ldesc->fl.l_start = lf->lf_start; if (lf->lf_end == OFF_MAX) ldesc->fl.l_len = 0; else ldesc->fl.l_len = lf->lf_end - lf->lf_start + 1; ldesc->fl.l_whence = SEEK_SET; ldesc->fl.l_type = F_UNLCK; ldesc->fl.l_pid = lf->lf_owner->lo_pid; ldesc->fl.l_sysid = lf->lf_owner->lo_sysid; STAILQ_INSERT_TAIL(&locks, ldesc, link); } sx_xunlock(&ls->ls_lock); VI_LOCK(vp); ls->ls_threads--; wakeup(ls); VI_UNLOCK(vp); /* * Call the iterator function for each lock in turn. If the * iterator returns an error code, just free the rest of the * lockdesc structures. */ error = 0; while ((ldesc = STAILQ_FIRST(&locks)) != NULL) { STAILQ_REMOVE_HEAD(&locks, link); if (!error) error = fn(ldesc->vp, &ldesc->fl, arg); vrele(ldesc->vp); free(ldesc, M_LOCKF); } return (error); } static int lf_clearremotesys_iterator(struct vnode *vp, struct flock *fl, void *arg) { VOP_ADVLOCK(vp, 0, F_UNLCK, fl, F_REMOTE); return (0); } void lf_clearremotesys(int sysid) { KASSERT(sysid != 0, ("Can't clear local locks with F_UNLCKSYS")); lf_iteratelocks_sysid(sysid, lf_clearremotesys_iterator, NULL); } int lf_countlocks(int sysid) { int i; struct lock_owner *lo; int count; count = 0; sx_xlock(&lf_lock_owners_lock); for (i = 0; i < LOCK_OWNER_HASH_SIZE; i++) LIST_FOREACH(lo, &lf_lock_owners[i], lo_link) if (lo->lo_sysid == sysid) count += lo->lo_refs; sx_xunlock(&lf_lock_owners_lock); return (count); } #ifdef LOCKF_DEBUG /* * Return non-zero if y is reachable from x using a brute force * search. If reachable and path is non-null, return the route taken * in path. */ static int graph_reaches(struct owner_vertex *x, struct owner_vertex *y, struct owner_vertex_list *path) { struct owner_edge *e; if (x == y) { if (path) TAILQ_INSERT_HEAD(path, x, v_link); return 1; } LIST_FOREACH(e, &x->v_outedges, e_outlink) { if (graph_reaches(e->e_to, y, path)) { if (path) TAILQ_INSERT_HEAD(path, x, v_link); return 1; } } return 0; } /* * Perform consistency checks on the graph. Make sure the values of * v_order are correct. If checkorder is non-zero, check no vertex can * reach any other vertex with a smaller order. */ static void graph_check(struct owner_graph *g, int checkorder) { int i, j; for (i = 0; i < g->g_size; i++) { if (!g->g_vertices[i]->v_owner) continue; KASSERT(g->g_vertices[i]->v_order == i, ("lock graph vertices disordered")); if (checkorder) { for (j = 0; j < i; j++) { if (!g->g_vertices[j]->v_owner) continue; KASSERT(!graph_reaches(g->g_vertices[i], g->g_vertices[j], NULL), ("lock graph vertices disordered")); } } } } static void graph_print_vertices(struct owner_vertex_list *set) { struct owner_vertex *v; printf("{ "); TAILQ_FOREACH(v, set, v_link) { printf("%d:", v->v_order); lf_print_owner(v->v_owner); if (TAILQ_NEXT(v, v_link)) printf(", "); } printf(" }\n"); } #endif /* * Calculate the sub-set of vertices v from the affected region [y..x] * where v is reachable from y. Return -1 if a loop was detected * (i.e. x is reachable from y, otherwise the number of vertices in * this subset. */ static int graph_delta_forward(struct owner_graph *g, struct owner_vertex *x, struct owner_vertex *y, struct owner_vertex_list *delta) { uint32_t gen; struct owner_vertex *v; struct owner_edge *e; int n; /* * We start with a set containing just y. 
Then for each vertex * v in the set so far unprocessed, we add each vertex that v * has an out-edge to and that is within the affected region * [y..x]. If we see the vertex x on our travels, stop * immediately. */ TAILQ_INIT(delta); TAILQ_INSERT_TAIL(delta, y, v_link); v = y; n = 1; gen = g->g_gen; while (v) { LIST_FOREACH(e, &v->v_outedges, e_outlink) { if (e->e_to == x) return -1; if (e->e_to->v_order < x->v_order && e->e_to->v_gen != gen) { e->e_to->v_gen = gen; TAILQ_INSERT_TAIL(delta, e->e_to, v_link); n++; } } v = TAILQ_NEXT(v, v_link); } return (n); } /* * Calculate the sub-set of vertices v from the affected region [y..x] * where v reaches x. Return the number of vertices in this subset. */ static int graph_delta_backward(struct owner_graph *g, struct owner_vertex *x, struct owner_vertex *y, struct owner_vertex_list *delta) { uint32_t gen; struct owner_vertex *v; struct owner_edge *e; int n; /* * We start with a set containing just x. Then for each vertex * v in the set so far unprocessed, we add each vertex that v * has an in-edge from and that is within the affected region * [y..x]. */ TAILQ_INIT(delta); TAILQ_INSERT_TAIL(delta, x, v_link); v = x; n = 1; gen = g->g_gen; while (v) { LIST_FOREACH(e, &v->v_inedges, e_inlink) { if (e->e_from->v_order > y->v_order && e->e_from->v_gen != gen) { e->e_from->v_gen = gen; TAILQ_INSERT_HEAD(delta, e->e_from, v_link); n++; } } v = TAILQ_PREV(v, owner_vertex_list, v_link); } return (n); } static int graph_add_indices(int *indices, int n, struct owner_vertex_list *set) { struct owner_vertex *v; int i, j; TAILQ_FOREACH(v, set, v_link) { for (i = n; i > 0 && indices[i - 1] > v->v_order; i--) ; for (j = n - 1; j >= i; j--) indices[j + 1] = indices[j]; indices[i] = v->v_order; n++; } return (n); } static int graph_assign_indices(struct owner_graph *g, int *indices, int nextunused, struct owner_vertex_list *set) { struct owner_vertex *v, *vlowest; while (!TAILQ_EMPTY(set)) { vlowest = NULL; TAILQ_FOREACH(v, set, v_link) { if (!vlowest || v->v_order < vlowest->v_order) vlowest = v; } TAILQ_REMOVE(set, vlowest, v_link); vlowest->v_order = indices[nextunused]; g->g_vertices[vlowest->v_order] = vlowest; nextunused++; } return (nextunused); } static int graph_add_edge(struct owner_graph *g, struct owner_vertex *x, struct owner_vertex *y) { struct owner_edge *e; struct owner_vertex_list deltaF, deltaB; int nF, nB, n, vi, i; int *indices; sx_assert(&lf_owner_graph_lock, SX_XLOCKED); LIST_FOREACH(e, &x->v_outedges, e_outlink) { if (e->e_to == y) { e->e_refs++; return (0); } } #ifdef LOCKF_DEBUG if (lockf_debug & 8) { printf("adding edge %d:", x->v_order); lf_print_owner(x->v_owner); printf(" -> %d:", y->v_order); lf_print_owner(y->v_owner); printf("\n"); } #endif if (y->v_order < x->v_order) { /* * The new edge violates the order. First find the set * of affected vertices reachable from y (deltaF) and * the set of affect vertices affected that reach x * (deltaB), using the graph generation number to * detect whether we have visited a given vertex * already. We re-order the graph so that each vertex * in deltaB appears before each vertex in deltaF. * * If x is a member of deltaF, then the new edge would * create a cycle. Otherwise, we may assume that * deltaF and deltaB are disjoint. */ g->g_gen++; if (g->g_gen == 0) { /* * Generation wrap. 
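The v_gen/g_gen fields above implement the usual generation-number trick: rather than clearing a visited flag on every vertex before each traversal, bump a global generation and treat a vertex as visited when its stored generation matches, resetting everything only on the rare wrap. A small sketch:

#include <assert.h>
#include <stdint.h>

#define NVERT 8

static uint32_t v_gen[NVERT];
static uint32_t g_gen;

static void
start_traversal(void)
{
        int i;

        g_gen++;
        if (g_gen == 0) {                       /* generation wrap: reset once */
                for (i = 0; i < NVERT; i++)
                        v_gen[i] = 0;
                g_gen++;
        }
}

static int
visit(int v)
{
        if (v_gen[v] == g_gen)
                return (0);                     /* already seen in this traversal */
        v_gen[v] = g_gen;
        return (1);
}

int
main(void)
{
        start_traversal();
        assert(visit(3) == 1);
        assert(visit(3) == 0);
        start_traversal();                      /* new traversal: 3 looks unvisited again */
        assert(visit(3) == 1);
        return (0);
}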
*/ for (vi = 0; vi < g->g_size; vi++) { g->g_vertices[vi]->v_gen = 0; } g->g_gen++; } nF = graph_delta_forward(g, x, y, &deltaF); if (nF < 0) { #ifdef LOCKF_DEBUG if (lockf_debug & 8) { struct owner_vertex_list path; printf("deadlock: "); TAILQ_INIT(&path); graph_reaches(y, x, &path); graph_print_vertices(&path); } #endif return (EDEADLK); } #ifdef LOCKF_DEBUG if (lockf_debug & 8) { printf("re-ordering graph vertices\n"); printf("deltaF = "); graph_print_vertices(&deltaF); } #endif nB = graph_delta_backward(g, x, y, &deltaB); #ifdef LOCKF_DEBUG if (lockf_debug & 8) { printf("deltaB = "); graph_print_vertices(&deltaB); } #endif /* * We first build a set of vertex indices (vertex * order values) that we may use, then we re-assign * orders first to those vertices in deltaB, then to * deltaF. Note that the contents of deltaF and deltaB * may be partially disordered - we perform an * insertion sort while building our index set. */ indices = g->g_indexbuf; n = graph_add_indices(indices, 0, &deltaF); graph_add_indices(indices, n, &deltaB); /* * We must also be sure to maintain the relative * ordering of deltaF and deltaB when re-assigning * vertices. We do this by iteratively removing the * lowest ordered element from the set and assigning * it the next value from our new ordering. */ i = graph_assign_indices(g, indices, 0, &deltaB); graph_assign_indices(g, indices, i, &deltaF); #ifdef LOCKF_DEBUG if (lockf_debug & 8) { struct owner_vertex_list set; TAILQ_INIT(&set); for (i = 0; i < nB + nF; i++) TAILQ_INSERT_TAIL(&set, g->g_vertices[indices[i]], v_link); printf("new ordering = "); graph_print_vertices(&set); } #endif } KASSERT(x->v_order < y->v_order, ("Failed to re-order graph")); #ifdef LOCKF_DEBUG if (lockf_debug & 8) { graph_check(g, TRUE); } #endif e = malloc(sizeof(struct owner_edge), M_LOCKF, M_WAITOK); LIST_INSERT_HEAD(&x->v_outedges, e, e_outlink); LIST_INSERT_HEAD(&y->v_inedges, e, e_inlink); e->e_refs = 1; e->e_from = x; e->e_to = y; return (0); } /* * Remove an edge x->y from the graph. */ static void graph_remove_edge(struct owner_graph *g, struct owner_vertex *x, struct owner_vertex *y) { struct owner_edge *e; sx_assert(&lf_owner_graph_lock, SX_XLOCKED); LIST_FOREACH(e, &x->v_outedges, e_outlink) { if (e->e_to == y) break; } KASSERT(e, ("Removing non-existent edge from deadlock graph")); e->e_refs--; if (e->e_refs == 0) { #ifdef LOCKF_DEBUG if (lockf_debug & 8) { printf("removing edge %d:", x->v_order); lf_print_owner(x->v_owner); printf(" -> %d:", y->v_order); lf_print_owner(y->v_owner); printf("\n"); } #endif LIST_REMOVE(e, e_outlink); LIST_REMOVE(e, e_inlink); free(e, M_LOCKF); } } /* * Allocate a vertex from the free list. Return ENOMEM if there are * none. 
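The re-ordering step above can be illustrated with concrete numbers: pool the order values currently held by deltaB and deltaF, sort them, then hand them back out with deltaB taking the lowest values, so everything that reaches x ends up ordered before everything reachable from y while each set keeps its internal order. A small sketch of that bookkeeping (the graph itself is omitted):

#include <stdio.h>
#include <stdlib.h>

static int
cmp_int(const void *a, const void *b)
{
        return (*(const int *)a - *(const int *)b);
}

int
main(void)
{
        int deltaB[] = { 5, 7 };                /* orders of vertices that reach x */
        int deltaF[] = { 2, 3 };                /* orders of vertices reachable from y */
        int indices[4], i, n = 0;

        for (i = 0; i < 2; i++)
                indices[n++] = deltaF[i];
        for (i = 0; i < 2; i++)
                indices[n++] = deltaB[i];
        qsort(indices, n, sizeof(int), cmp_int);        /* pool of usable order values */

        /* deltaB consumes the lowest orders, deltaF takes the rest. */
        printf("deltaB -> %d %d, deltaF -> %d %d\n",
            indices[0], indices[1], indices[2], indices[3]);
        return (0);
}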
*/ static struct owner_vertex * graph_alloc_vertex(struct owner_graph *g, struct lock_owner *lo) { struct owner_vertex *v; sx_assert(&lf_owner_graph_lock, SX_XLOCKED); v = malloc(sizeof(struct owner_vertex), M_LOCKF, M_WAITOK); if (g->g_size == g->g_space) { g->g_vertices = realloc(g->g_vertices, 2 * g->g_space * sizeof(struct owner_vertex *), M_LOCKF, M_WAITOK); free(g->g_indexbuf, M_LOCKF); g->g_indexbuf = malloc(2 * g->g_space * sizeof(int), M_LOCKF, M_WAITOK); g->g_space = 2 * g->g_space; } v->v_order = g->g_size; v->v_gen = g->g_gen; g->g_vertices[g->g_size] = v; g->g_size++; LIST_INIT(&v->v_outedges); LIST_INIT(&v->v_inedges); v->v_owner = lo; return (v); } static void graph_free_vertex(struct owner_graph *g, struct owner_vertex *v) { struct owner_vertex *w; int i; sx_assert(&lf_owner_graph_lock, SX_XLOCKED); KASSERT(LIST_EMPTY(&v->v_outedges), ("Freeing vertex with edges")); KASSERT(LIST_EMPTY(&v->v_inedges), ("Freeing vertex with edges")); /* * Remove from the graph's array and close up the gap, * renumbering the other vertices. */ for (i = v->v_order + 1; i < g->g_size; i++) { w = g->g_vertices[i]; w->v_order--; g->g_vertices[i - 1] = w; } g->g_size--; free(v, M_LOCKF); } static struct owner_graph * graph_init(struct owner_graph *g) { g->g_vertices = malloc(10 * sizeof(struct owner_vertex *), M_LOCKF, M_WAITOK); g->g_size = 0; g->g_space = 10; g->g_indexbuf = malloc(g->g_space * sizeof(int), M_LOCKF, M_WAITOK); g->g_gen = 0; return (g); } #ifdef LOCKF_DEBUG /* * Print description of a lock owner */ static void lf_print_owner(struct lock_owner *lo) { if (lo->lo_flags & F_REMOTE) { printf("remote pid %d, system %d", lo->lo_pid, lo->lo_sysid); } else if (lo->lo_flags & F_FLOCK) { printf("file %p", lo->lo_id); } else { printf("local pid %d", lo->lo_pid); } } /* * Print out a lock. */ static void lf_print(char *tag, struct lockf_entry *lock) { printf("%s: lock %p for ", tag, (void *)lock); lf_print_owner(lock->lf_owner); if (lock->lf_inode != (struct inode *)0) printf(" in ino %ju on dev <%s>,", (uintmax_t)lock->lf_inode->i_number, devtoname(lock->lf_inode->i_dev)); printf(" %s, start %jd, end ", lock->lf_type == F_RDLCK ? "shared" : lock->lf_type == F_WRLCK ? "exclusive" : lock->lf_type == F_UNLCK ? "unlock" : "unknown", (intmax_t)lock->lf_start); if (lock->lf_end == OFF_MAX) printf("EOF"); else printf("%jd", (intmax_t)lock->lf_end); if (!LIST_EMPTY(&lock->lf_outedges)) printf(" block %p\n", (void *)LIST_FIRST(&lock->lf_outedges)->le_to); else printf("\n"); } static void lf_printlist(char *tag, struct lockf_entry *lock) { struct lockf_entry *lf, *blk; struct lockf_edge *e; if (lock->lf_inode == (struct inode *)0) return; printf("%s: Lock list for ino %ju on dev <%s>:\n", tag, (uintmax_t)lock->lf_inode->i_number, devtoname(lock->lf_inode->i_dev)); LIST_FOREACH(lf, &lock->lf_vnode->v_lockf->ls_active, lf_link) { printf("\tlock %p for ",(void *)lf); lf_print_owner(lock->lf_owner); printf(", %s, start %jd, end %jd", lf->lf_type == F_RDLCK ? "shared" : lf->lf_type == F_WRLCK ? "exclusive" : lf->lf_type == F_UNLCK ? "unlock" : "unknown", (intmax_t)lf->lf_start, (intmax_t)lf->lf_end); LIST_FOREACH(e, &lf->lf_outedges, le_outlink) { blk = e->le_to; printf("\n\t\tlock request %p for ", (void *)blk); lf_print_owner(blk->lf_owner); printf(", %s, start %jd, end %jd", blk->lf_type == F_RDLCK ? "shared" : blk->lf_type == F_WRLCK ? "exclusive" : blk->lf_type == F_UNLCK ? 
"unlock" : "unknown", (intmax_t)blk->lf_start, (intmax_t)blk->lf_end); if (!LIST_EMPTY(&blk->lf_inedges)) panic("lf_printlist: bad list"); } printf("\n"); } } #endif /* LOCKF_DEBUG */ Index: head/sys/nlm/nlm_advlock.c =================================================================== --- head/sys/nlm/nlm_advlock.c (revision 302215) +++ head/sys/nlm/nlm_advlock.c (revision 302216) @@ -1,1267 +1,1273 @@ /*- * Copyright (c) 2008 Isilon Inc http://www.isilon.com/ * Authors: Doug Rabson * Developed with Red Inc: Alfred Perlstein * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * We need to keep track of the svid values used for F_FLOCK locks. 
*/ struct nlm_file_svid { int ns_refs; /* thread count + 1 if active */ int ns_svid; /* on-the-wire SVID for this file */ struct ucred *ns_ucred; /* creds to use for lock recovery */ void *ns_id; /* local struct file pointer */ bool_t ns_active; /* TRUE if we own a lock */ LIST_ENTRY(nlm_file_svid) ns_link; }; LIST_HEAD(nlm_file_svid_list, nlm_file_svid); #define NLM_SVID_HASH_SIZE 256 struct nlm_file_svid_list nlm_file_svids[NLM_SVID_HASH_SIZE]; struct mtx nlm_svid_lock; static struct unrhdr *nlm_svid_allocator; static volatile u_int nlm_xid = 1; static int nlm_setlock(struct nlm_host *host, struct rpc_callextra *ext, rpcvers_t vers, struct timeval *timo, int retries, struct vnode *vp, int op, struct flock *fl, int flags, int svid, size_t fhlen, void *fh, off_t size, bool_t reclaim); static int nlm_clearlock(struct nlm_host *host, struct rpc_callextra *ext, rpcvers_t vers, struct timeval *timo, int retries, struct vnode *vp, int op, struct flock *fl, int flags, int svid, size_t fhlen, void *fh, off_t size); static int nlm_getlock(struct nlm_host *host, struct rpc_callextra *ext, rpcvers_t vers, struct timeval *timo, int retries, struct vnode *vp, int op, struct flock *fl, int flags, int svid, size_t fhlen, void *fh, off_t size); static int nlm_map_status(nlm4_stats stat); static struct nlm_file_svid *nlm_find_svid(void *id); static void nlm_free_svid(struct nlm_file_svid *nf); static int nlm_init_lock(struct flock *fl, int flags, int svid, rpcvers_t vers, size_t fhlen, void *fh, off_t size, struct nlm4_lock *lock, char oh_space[32]); static void nlm_client_init(void *dummy) { int i; mtx_init(&nlm_svid_lock, "NLM svid lock", NULL, MTX_DEF); /* pid_max cannot be greater than PID_MAX */ nlm_svid_allocator = new_unrhdr(PID_MAX + 2, INT_MAX, &nlm_svid_lock); for (i = 0; i < NLM_SVID_HASH_SIZE; i++) LIST_INIT(&nlm_file_svids[i]); } SYSINIT(nlm_client_init, SI_SUB_LOCK, SI_ORDER_FIRST, nlm_client_init, NULL); static int nlm_msg(struct thread *td, const char *server, const char *msg, int error) { struct proc *p; p = td ? 
td->td_proc : NULL; if (error) { tprintf(p, LOG_INFO, "nfs server %s: %s, error %d\n", server, msg, error); } else { tprintf(p, LOG_INFO, "nfs server %s: %s\n", server, msg); } return (0); } struct nlm_feedback_arg { bool_t nf_printed; struct nfsmount *nf_nmp; }; static void nlm_down(struct nlm_feedback_arg *nf, struct thread *td, const char *msg, int error) { struct nfsmount *nmp = nf->nf_nmp; if (nmp == NULL) return; mtx_lock(&nmp->nm_mtx); if (!(nmp->nm_state & NFSSTA_LOCKTIMEO)) { nmp->nm_state |= NFSSTA_LOCKTIMEO; mtx_unlock(&nmp->nm_mtx); vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid, VQ_NOTRESPLOCK, 0); } else { mtx_unlock(&nmp->nm_mtx); } nf->nf_printed = TRUE; nlm_msg(td, nmp->nm_mountp->mnt_stat.f_mntfromname, msg, error); } static void nlm_up(struct nlm_feedback_arg *nf, struct thread *td, const char *msg) { struct nfsmount *nmp = nf->nf_nmp; if (!nf->nf_printed) return; nlm_msg(td, nmp->nm_mountp->mnt_stat.f_mntfromname, msg, 0); mtx_lock(&nmp->nm_mtx); if (nmp->nm_state & NFSSTA_LOCKTIMEO) { nmp->nm_state &= ~NFSSTA_LOCKTIMEO; mtx_unlock(&nmp->nm_mtx); vfs_event_signal(&nmp->nm_mountp->mnt_stat.f_fsid, VQ_NOTRESPLOCK, 1); } else { mtx_unlock(&nmp->nm_mtx); } } static void nlm_feedback(int type, int proc, void *arg) { struct thread *td = curthread; struct nlm_feedback_arg *nf = (struct nlm_feedback_arg *) arg; switch (type) { case FEEDBACK_REXMIT2: case FEEDBACK_RECONNECT: nlm_down(nf, td, "lockd not responding", 0); break; case FEEDBACK_OK: nlm_up(nf, td, "lockd is alive again"); break; } } /* * nlm_advlock -- * NFS advisory byte-level locks. */ static int nlm_advlock_internal(struct vnode *vp, void *id, int op, struct flock *fl, int flags, bool_t reclaim, bool_t unlock_vp) { struct thread *td = curthread; struct nfsmount *nmp; off_t size; size_t fhlen; union nfsfh fh; struct sockaddr *sa; struct sockaddr_storage ss; char servername[MNAMELEN]; struct timeval timo; int retries; rpcvers_t vers; struct nlm_host *host; struct rpc_callextra ext; struct nlm_feedback_arg nf; AUTH *auth; struct ucred *cred, *cred1; struct nlm_file_svid *ns; int svid; int error; int is_v3; ASSERT_VOP_LOCKED(vp, "nlm_advlock_1"); nmp = VFSTONFS(vp->v_mount); /* * Push any pending writes to the server and flush our cache * so that if we are contending with another machine for a * file, we get whatever they wrote and vice-versa. */ if (op == F_SETLK || op == F_UNLCK) nmp->nm_vinvalbuf(vp, V_SAVE, td, 1); strcpy(servername, nmp->nm_hostname); nmp->nm_getinfo(vp, fh.fh_bytes, &fhlen, &ss, &is_v3, &size, &timo); sa = (struct sockaddr *) &ss; if (is_v3 != 0) vers = NLM_VERS4; else vers = NLM_VERS; if (nmp->nm_flag & NFSMNT_SOFT) retries = nmp->nm_retry; else retries = INT_MAX; /* * We need to switch to mount-point creds so that we can send * packets from a privileged port. Reference mnt_cred and * switch to them before unlocking the vnode, since mount * point could be unmounted right after unlock. 
*/ cred = td->td_ucred; td->td_ucred = vp->v_mount->mnt_cred; crhold(td->td_ucred); if (unlock_vp) VOP_UNLOCK(vp, 0); host = nlm_find_host_by_name(servername, sa, vers); auth = authunix_create(cred); memset(&ext, 0, sizeof(ext)); nf.nf_printed = FALSE; nf.nf_nmp = nmp; ext.rc_auth = auth; ext.rc_feedback = nlm_feedback; ext.rc_feedback_arg = &nf; ext.rc_timers = NULL; ns = NULL; if (flags & F_FLOCK) { ns = nlm_find_svid(id); KASSERT(fl->l_start == 0 && fl->l_len == 0, ("F_FLOCK lock requests must be whole-file locks")); if (!ns->ns_ucred) { /* * Remember the creds used for locking in case * we need to recover the lock later. */ ns->ns_ucred = crdup(cred); } svid = ns->ns_svid; } else if (flags & F_REMOTE) { /* * If we are recovering after a server restart or * trashing locks on a force unmount, use the same * svid as last time. */ svid = fl->l_pid; } else { svid = ((struct proc *) id)->p_pid; } switch(op) { case F_SETLK: if ((flags & (F_FLOCK|F_WAIT)) == (F_FLOCK|F_WAIT) && fl->l_type == F_WRLCK) { /* * The semantics for flock(2) require that any * shared lock on the file must be released * before an exclusive lock is granted. The * local locking code interprets this by * unlocking the file before sleeping on a * blocked exclusive lock request. We * approximate this by first attempting * non-blocking and if that fails, we unlock * the file and block. */ error = nlm_setlock(host, &ext, vers, &timo, retries, vp, F_SETLK, fl, flags & ~F_WAIT, svid, fhlen, &fh.fh_bytes, size, reclaim); if (error == EAGAIN) { fl->l_type = F_UNLCK; error = nlm_clearlock(host, &ext, vers, &timo, retries, vp, F_UNLCK, fl, flags, svid, fhlen, &fh.fh_bytes, size); fl->l_type = F_WRLCK; if (!error) { mtx_lock(&nlm_svid_lock); if (ns->ns_active) { ns->ns_refs--; ns->ns_active = FALSE; } mtx_unlock(&nlm_svid_lock); flags |= F_WAIT; error = nlm_setlock(host, &ext, vers, &timo, retries, vp, F_SETLK, fl, flags, svid, fhlen, &fh.fh_bytes, size, reclaim); } } } else { error = nlm_setlock(host, &ext, vers, &timo, retries, vp, op, fl, flags, svid, fhlen, &fh.fh_bytes, size, reclaim); } if (!error && ns) { mtx_lock(&nlm_svid_lock); if (!ns->ns_active) { /* * Add one to the reference count to * hold onto the SVID for the lifetime * of the lock. Note that since * F_FLOCK only supports whole-file * locks, there can only be one active * lock for this SVID. */ ns->ns_refs++; ns->ns_active = TRUE; } mtx_unlock(&nlm_svid_lock); } break; case F_UNLCK: error = nlm_clearlock(host, &ext, vers, &timo, retries, vp, op, fl, flags, svid, fhlen, &fh.fh_bytes, size); if (!error && ns) { mtx_lock(&nlm_svid_lock); if (ns->ns_active) { ns->ns_refs--; ns->ns_active = FALSE; } mtx_unlock(&nlm_svid_lock); } break; case F_GETLK: error = nlm_getlock(host, &ext, vers, &timo, retries, vp, op, fl, flags, svid, fhlen, &fh.fh_bytes, size); break; default: error = EINVAL; break; } if (ns) nlm_free_svid(ns); cred1 = td->td_ucred; td->td_ucred = cred; crfree(cred1); AUTH_DESTROY(auth); nlm_host_release(host); return (error); } int nlm_advlock(struct vop_advlock_args *ap) { return (nlm_advlock_internal(ap->a_vp, ap->a_id, ap->a_op, ap->a_fl, ap->a_flags, FALSE, TRUE)); } /* * Set the creds of td to the creds of the given lock's owner. The new * creds reference count will be incremented via crhold. The caller is * responsible for calling crfree and restoring td's original creds. 
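/*
 * Runnable userland sketch of the F_SETLK strategy described above for
 * F_FLOCK|F_WAIT exclusive requests: try a non-blocking exclusive lock
 * first and, if that is refused, drop whatever we hold and only then
 * issue the blocking request.  Plain fcntl(2) byte-range locks on a
 * local file are used here instead of NLM RPCs; the path name is made
 * up for the sketch.
 */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int
upgrade_to_exclusive(int fd)
{
	struct flock fl;

	memset(&fl, 0, sizeof(fl));
	fl.l_type = F_WRLCK;		/* whole-file exclusive lock */
	fl.l_whence = SEEK_SET;
	fl.l_start = 0;
	fl.l_len = 0;

	/* First attempt: non-blocking. */
	if (fcntl(fd, F_SETLK, &fl) == 0)
		return (0);
	if (errno != EAGAIN && errno != EACCES)
		return (-1);

	/* Contended: release our lock, then block for the exclusive one. */
	fl.l_type = F_UNLCK;
	if (fcntl(fd, F_SETLK, &fl) == -1)
		return (-1);
	fl.l_type = F_WRLCK;
	return (fcntl(fd, F_SETLKW, &fl));
}

int
main(void)
{
	int fd = open("/tmp/upgrade-demo.lock", O_RDWR | O_CREAT, 0644);

	if (fd == -1 || upgrade_to_exclusive(fd) == -1) {
		perror("upgrade");
		return (1);
	}
	printf("exclusive lock acquired\n");
	close(fd);
	return (0);
}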
*/ static void nlm_set_creds_for_lock(struct thread *td, struct flock *fl) { int i; struct nlm_file_svid *ns; struct proc *p; struct ucred *cred; cred = NULL; if (fl->l_pid > PID_MAX) { /* * If this was originally a F_FLOCK-style lock, we * recorded the creds used when it was originally * locked in the nlm_file_svid structure. */ mtx_lock(&nlm_svid_lock); for (i = 0; i < NLM_SVID_HASH_SIZE; i++) { for (ns = LIST_FIRST(&nlm_file_svids[i]); ns; ns = LIST_NEXT(ns, ns_link)) { if (ns->ns_svid == fl->l_pid) { cred = crhold(ns->ns_ucred); break; } } } mtx_unlock(&nlm_svid_lock); } else { /* * This lock is owned by a process. Get a reference to * the process creds. */ p = pfind(fl->l_pid); if (p) { cred = crhold(p->p_ucred); PROC_UNLOCK(p); } } /* * If we can't find a cred, fall back on the recovery * thread's cred. */ if (!cred) { cred = crhold(td->td_ucred); } td->td_ucred = cred; } static int nlm_reclaim_free_lock(struct vnode *vp, struct flock *fl, void *arg) { struct flock newfl; struct thread *td = curthread; struct ucred *oldcred; int error; newfl = *fl; newfl.l_type = F_UNLCK; oldcred = td->td_ucred; nlm_set_creds_for_lock(td, &newfl); error = nlm_advlock_internal(vp, NULL, F_UNLCK, &newfl, F_REMOTE, FALSE, FALSE); crfree(td->td_ucred); td->td_ucred = oldcred; return (error); } int nlm_reclaim(struct vop_reclaim_args *ap) { nlm_cancel_wait(ap->a_vp); lf_iteratelocks_vnode(ap->a_vp, nlm_reclaim_free_lock, NULL); return (0); } struct nlm_recovery_context { struct nlm_host *nr_host; /* host we are recovering */ int nr_state; /* remote NSM state for recovery */ }; static int nlm_client_recover_lock(struct vnode *vp, struct flock *fl, void *arg) { struct nlm_recovery_context *nr = (struct nlm_recovery_context *) arg; struct thread *td = curthread; struct ucred *oldcred; int state, error; /* * If the remote NSM state changes during recovery, the host * must have rebooted a second time. In that case, we must * restart the recovery. 
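/*
 * The restart protocol described in the comment above, reduced to a
 * userland walk: if the observed server state changes while the walk is
 * in progress, the per-item handler asks for a restart and the caller
 * begins the whole pass again (compare nlm_client_recovery() below).
 * WALK_RESTART stands in for the kernel-only ERESTART value; the item
 * list and get_remote_state() are invented for the sketch.
 */
#include <stdio.h>

#define WALK_RESTART	(-1)

static int remote_state = 7;		/* pretend NSM state number */

static int
get_remote_state(void)
{
	return (remote_state);
}

static int
recover_one(int item, int expected_state)
{
	/* A second reboot mid-recovery invalidates the pass so far. */
	if (get_remote_state() != expected_state)
		return (WALK_RESTART);
	printf("re-sent lock %d under state %d\n", item, expected_state);
	return (0);
}

int
main(void)
{
	int items[] = { 1, 2, 3 };
	int error, i, state;

	do {
		error = 0;
		state = get_remote_state();
		for (i = 0; i < 3 && error == 0; i++)
			error = recover_one(items[i], state);
	} while (error == WALK_RESTART);
	return (error != 0);
}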
*/ state = nlm_host_get_state(nr->nr_host); if (nr->nr_state != state) return (ERESTART); error = vn_lock(vp, LK_SHARED); if (error) return (error); oldcred = td->td_ucred; nlm_set_creds_for_lock(td, fl); error = nlm_advlock_internal(vp, NULL, F_SETLK, fl, F_REMOTE, TRUE, TRUE); crfree(td->td_ucred); td->td_ucred = oldcred; return (error); } void nlm_client_recovery(struct nlm_host *host) { struct nlm_recovery_context nr; int sysid, error; sysid = NLM_SYSID_CLIENT | nlm_host_get_sysid(host); do { nr.nr_host = host; nr.nr_state = nlm_host_get_state(host); error = lf_iteratelocks_sysid(sysid, nlm_client_recover_lock, &nr); } while (error == ERESTART); } static void nlm_convert_to_nlm_lock(struct nlm_lock *dst, struct nlm4_lock *src) { dst->caller_name = src->caller_name; dst->fh = src->fh; dst->oh = src->oh; dst->svid = src->svid; dst->l_offset = src->l_offset; dst->l_len = src->l_len; } static void nlm_convert_to_nlm4_holder(struct nlm4_holder *dst, struct nlm_holder *src) { dst->exclusive = src->exclusive; dst->svid = src->svid; dst->oh = src->oh; dst->l_offset = src->l_offset; dst->l_len = src->l_len; } static void nlm_convert_to_nlm4_res(struct nlm4_res *dst, struct nlm_res *src) { dst->cookie = src->cookie; dst->stat.stat = (enum nlm4_stats) src->stat.stat; } static enum clnt_stat nlm_test_rpc(rpcvers_t vers, nlm4_testargs *args, nlm4_testres *res, CLIENT *client, struct rpc_callextra *ext, struct timeval timo) { if (vers == NLM_VERS4) { return nlm4_test_4(args, res, client, ext, timo); } else { nlm_testargs args1; nlm_testres res1; enum clnt_stat stat; args1.cookie = args->cookie; args1.exclusive = args->exclusive; nlm_convert_to_nlm_lock(&args1.alock, &args->alock); memset(&res1, 0, sizeof(res1)); stat = nlm_test_1(&args1, &res1, client, ext, timo); if (stat == RPC_SUCCESS) { res->cookie = res1.cookie; res->stat.stat = (enum nlm4_stats) res1.stat.stat; if (res1.stat.stat == nlm_denied) nlm_convert_to_nlm4_holder( &res->stat.nlm4_testrply_u.holder, &res1.stat.nlm_testrply_u.holder); } return (stat); } } static enum clnt_stat nlm_lock_rpc(rpcvers_t vers, nlm4_lockargs *args, nlm4_res *res, CLIENT *client, struct rpc_callextra *ext, struct timeval timo) { if (vers == NLM_VERS4) { return nlm4_lock_4(args, res, client, ext, timo); } else { nlm_lockargs args1; nlm_res res1; enum clnt_stat stat; args1.cookie = args->cookie; args1.block = args->block; args1.exclusive = args->exclusive; nlm_convert_to_nlm_lock(&args1.alock, &args->alock); args1.reclaim = args->reclaim; args1.state = args->state; memset(&res1, 0, sizeof(res1)); stat = nlm_lock_1(&args1, &res1, client, ext, timo); if (stat == RPC_SUCCESS) { nlm_convert_to_nlm4_res(res, &res1); } return (stat); } } static enum clnt_stat nlm_cancel_rpc(rpcvers_t vers, nlm4_cancargs *args, nlm4_res *res, CLIENT *client, struct rpc_callextra *ext, struct timeval timo) { if (vers == NLM_VERS4) { return nlm4_cancel_4(args, res, client, ext, timo); } else { nlm_cancargs args1; nlm_res res1; enum clnt_stat stat; args1.cookie = args->cookie; args1.block = args->block; args1.exclusive = args->exclusive; nlm_convert_to_nlm_lock(&args1.alock, &args->alock); memset(&res1, 0, sizeof(res1)); stat = nlm_cancel_1(&args1, &res1, client, ext, timo); if (stat == RPC_SUCCESS) { nlm_convert_to_nlm4_res(res, &res1); } return (stat); } } static enum clnt_stat nlm_unlock_rpc(rpcvers_t vers, nlm4_unlockargs *args, nlm4_res *res, CLIENT *client, struct rpc_callextra *ext, struct timeval timo) { if (vers == NLM_VERS4) { return nlm4_unlock_4(args, res, client, ext, timo); } 
else { nlm_unlockargs args1; nlm_res res1; enum clnt_stat stat; args1.cookie = args->cookie; nlm_convert_to_nlm_lock(&args1.alock, &args->alock); memset(&res1, 0, sizeof(res1)); stat = nlm_unlock_1(&args1, &res1, client, ext, timo); if (stat == RPC_SUCCESS) { nlm_convert_to_nlm4_res(res, &res1); } return (stat); } } /* * Called after a lock request (set or clear) succeeded. We record the * details in the local lock manager. Note that since the remote * server has granted the lock, we can be sure that it doesn't * conflict with any other locks we have in the local lock manager. * * Since it is possible that host may also make NLM client requests to * our NLM server, we use a different sysid value to record our own * client locks. * * Note that since it is possible for us to receive replies from the * server in a different order than the locks were granted (e.g. if * many local threads are contending for the same lock), we must use a * blocking operation when registering with the local lock manager. * We expect that any actual wait will be rare and short hence we * ignore signals for this. */ static void nlm_record_lock(struct vnode *vp, int op, struct flock *fl, int svid, int sysid, off_t size) { struct vop_advlockasync_args a; struct flock newfl; - int error; + struct proc *p; + int error, stops_deferred; a.a_vp = vp; a.a_id = NULL; a.a_op = op; a.a_fl = &newfl; a.a_flags = F_REMOTE|F_WAIT|F_NOINTR; a.a_task = NULL; a.a_cookiep = NULL; newfl.l_start = fl->l_start; newfl.l_len = fl->l_len; newfl.l_type = fl->l_type; newfl.l_whence = fl->l_whence; newfl.l_pid = svid; newfl.l_sysid = NLM_SYSID_CLIENT | sysid; for (;;) { error = lf_advlockasync(&a, &vp->v_lockf, size); if (error == EDEADLK) { /* * Locks are associated with the processes and * not with threads. Suppose we have two * threads A1 A2 in one process, A1 locked * file f1, A2 is locking file f2, and A1 is * unlocking f1. Then remote server may * already unlocked f1, while local still not * yet scheduled A1 to make the call to local * advlock manager. The process B owns lock on * f2 and issued the lock on f1. Remote would * grant B the request on f1, but local would * return EDEADLK. */ pause("nlmdlk", 1); - /* XXXKIB allow suspend */ + p = curproc; + stops_deferred = sigdeferstop(SIGDEFERSTOP_OFF); + PROC_LOCK(p); + thread_suspend_check(0); + PROC_UNLOCK(p); + sigallowstop(stops_deferred); } else if (error == EINTR) { /* * lf_purgelocks() might wake up the lock * waiter and removed our lock graph edges. * There is no sense in re-trying recording * the lock to the local manager after * reclaim. */ error = 0; break; } else break; } KASSERT(error == 0 || error == ENOENT, ("Failed to register NFS lock locally - error=%d", error)); } static int nlm_setlock(struct nlm_host *host, struct rpc_callextra *ext, rpcvers_t vers, struct timeval *timo, int retries, struct vnode *vp, int op, struct flock *fl, int flags, int svid, size_t fhlen, void *fh, off_t size, bool_t reclaim) { struct nlm4_lockargs args; char oh_space[32]; struct nlm4_res res; u_int xid; CLIENT *client; enum clnt_stat stat; int retry, block, exclusive; void *wait_handle = NULL; int error; memset(&args, 0, sizeof(args)); memset(&res, 0, sizeof(res)); block = (flags & F_WAIT) ? 
TRUE : FALSE; exclusive = (fl->l_type == F_WRLCK); error = nlm_init_lock(fl, flags, svid, vers, fhlen, fh, size, &args.alock, oh_space); if (error) return (error); args.block = block; args.exclusive = exclusive; args.reclaim = reclaim; args.state = nlm_nsm_state; retry = 5*hz; for (;;) { client = nlm_host_get_rpc(host, FALSE); if (!client) return (ENOLCK); /* XXX retry? */ if (block) wait_handle = nlm_register_wait_lock(&args.alock, vp); xid = atomic_fetchadd_int(&nlm_xid, 1); args.cookie.n_len = sizeof(xid); args.cookie.n_bytes = (char*) &xid; stat = nlm_lock_rpc(vers, &args, &res, client, ext, *timo); CLNT_RELEASE(client); if (stat != RPC_SUCCESS) { if (block) nlm_deregister_wait_lock(wait_handle); if (retries) { retries--; continue; } return (EINVAL); } /* * Free res.cookie. */ xdr_free((xdrproc_t) xdr_nlm4_res, &res); if (block && res.stat.stat != nlm4_blocked) nlm_deregister_wait_lock(wait_handle); if (res.stat.stat == nlm4_denied_grace_period) { /* * The server has recently rebooted and is * giving old clients a change to reclaim * their locks. Wait for a few seconds and try * again. */ error = tsleep(&args, PCATCH, "nlmgrace", retry); if (error && error != EWOULDBLOCK) return (error); retry = 2*retry; if (retry > 30*hz) retry = 30*hz; continue; } if (block && res.stat.stat == nlm4_blocked) { /* * The server should call us back with a * granted message when the lock succeeds. In * order to deal with broken servers, lost * granted messages and server reboots, we * will also re-try every few seconds. */ error = nlm_wait_lock(wait_handle, retry); if (error == EWOULDBLOCK) { retry = 2*retry; if (retry > 30*hz) retry = 30*hz; continue; } if (error) { /* * We need to call the server to * cancel our lock request. */ nlm4_cancargs cancel; memset(&cancel, 0, sizeof(cancel)); xid = atomic_fetchadd_int(&nlm_xid, 1); cancel.cookie.n_len = sizeof(xid); cancel.cookie.n_bytes = (char*) &xid; cancel.block = block; cancel.exclusive = exclusive; cancel.alock = args.alock; do { client = nlm_host_get_rpc(host, FALSE); if (!client) /* XXX retry? */ return (ENOLCK); stat = nlm_cancel_rpc(vers, &cancel, &res, client, ext, *timo); CLNT_RELEASE(client); if (stat != RPC_SUCCESS) { /* * We need to cope * with temporary * network partitions * as well as server * reboots. This means * we have to keep * trying to cancel * until the server * wakes up again. */ pause("nlmcancel", 10*hz); } } while (stat != RPC_SUCCESS); /* * Free res.cookie. */ xdr_free((xdrproc_t) xdr_nlm4_res, &res); switch (res.stat.stat) { case nlm_denied: /* * There was nothing * to cancel. We are * going to go ahead * and assume we got * the lock. */ error = 0; break; case nlm4_denied_grace_period: /* * The server has * recently rebooted - * treat this as a * successful * cancellation. */ break; case nlm4_granted: /* * We managed to * cancel. */ break; default: /* * Broken server * implementation - * can't really do * anything here. 
*/ break; } } } else { error = nlm_map_status(res.stat.stat); } if (!error && !reclaim) { nlm_record_lock(vp, op, fl, args.alock.svid, nlm_host_get_sysid(host), size); nlm_host_monitor(host, 0); } return (error); } } static int nlm_clearlock(struct nlm_host *host, struct rpc_callextra *ext, rpcvers_t vers, struct timeval *timo, int retries, struct vnode *vp, int op, struct flock *fl, int flags, int svid, size_t fhlen, void *fh, off_t size) { struct nlm4_unlockargs args; char oh_space[32]; struct nlm4_res res; u_int xid; CLIENT *client; enum clnt_stat stat; int error; memset(&args, 0, sizeof(args)); memset(&res, 0, sizeof(res)); error = nlm_init_lock(fl, flags, svid, vers, fhlen, fh, size, &args.alock, oh_space); if (error) return (error); for (;;) { client = nlm_host_get_rpc(host, FALSE); if (!client) return (ENOLCK); /* XXX retry? */ xid = atomic_fetchadd_int(&nlm_xid, 1); args.cookie.n_len = sizeof(xid); args.cookie.n_bytes = (char*) &xid; stat = nlm_unlock_rpc(vers, &args, &res, client, ext, *timo); CLNT_RELEASE(client); if (stat != RPC_SUCCESS) { if (retries) { retries--; continue; } return (EINVAL); } /* * Free res.cookie. */ xdr_free((xdrproc_t) xdr_nlm4_res, &res); if (res.stat.stat == nlm4_denied_grace_period) { /* * The server has recently rebooted and is * giving old clients a change to reclaim * their locks. Wait for a few seconds and try * again. */ error = tsleep(&args, PCATCH, "nlmgrace", 5*hz); if (error && error != EWOULDBLOCK) return (error); continue; } /* * If we are being called via nlm_reclaim (which will * use the F_REMOTE flag), don't record the lock * operation in the local lock manager since the vnode * is going away. */ if (!(flags & F_REMOTE)) nlm_record_lock(vp, op, fl, args.alock.svid, nlm_host_get_sysid(host), size); return (0); } } static int nlm_getlock(struct nlm_host *host, struct rpc_callextra *ext, rpcvers_t vers, struct timeval *timo, int retries, struct vnode *vp, int op, struct flock *fl, int flags, int svid, size_t fhlen, void *fh, off_t size) { struct nlm4_testargs args; char oh_space[32]; struct nlm4_testres res; u_int xid; CLIENT *client; enum clnt_stat stat; int exclusive; int error; KASSERT(!(flags & F_FLOCK), ("unexpected F_FLOCK for F_GETLK")); memset(&args, 0, sizeof(args)); memset(&res, 0, sizeof(res)); exclusive = (fl->l_type == F_WRLCK); error = nlm_init_lock(fl, flags, svid, vers, fhlen, fh, size, &args.alock, oh_space); if (error) return (error); args.exclusive = exclusive; for (;;) { client = nlm_host_get_rpc(host, FALSE); if (!client) return (ENOLCK); /* XXX retry? */ xid = atomic_fetchadd_int(&nlm_xid, 1); args.cookie.n_len = sizeof(xid); args.cookie.n_bytes = (char*) &xid; stat = nlm_test_rpc(vers, &args, &res, client, ext, *timo); CLNT_RELEASE(client); if (stat != RPC_SUCCESS) { if (retries) { retries--; continue; } return (EINVAL); } if (res.stat.stat == nlm4_denied_grace_period) { /* * The server has recently rebooted and is * giving old clients a change to reclaim * their locks. Wait for a few seconds and try * again. 
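/*
 * nlm_setlock() above paces its grace-period and lost-GRANTED retries
 * by doubling a wait that starts at five seconds and is capped at
 * thirty; the fixed five-second sleeps in nlm_clearlock()/nlm_getlock()
 * are the simpler variant.  The userland sketch below just prints that
 * schedule; HZ is a stand-in constant for the kernel's ticks-per-second.
 */
#include <stdio.h>

#define HZ	1000			/* pretend ticks per second */

static int
next_retry(int retry)
{
	retry = 2 * retry;
	if (retry > 30 * HZ)
		retry = 30 * HZ;
	return (retry);
}

int
main(void)
{
	int retry = 5 * HZ;
	int attempt;

	for (attempt = 1; attempt <= 6; attempt++) {
		printf("attempt %d: wait up to %d ticks (%d s)\n",
		    attempt, retry, retry / HZ);
		retry = next_retry(retry);
	}
	/* Prints 5, 10, 20, 30, 30, 30 seconds. */
	return (0);
}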
*/ xdr_free((xdrproc_t) xdr_nlm4_testres, &res); error = tsleep(&args, PCATCH, "nlmgrace", 5*hz); if (error && error != EWOULDBLOCK) return (error); continue; } if (res.stat.stat == nlm4_denied) { struct nlm4_holder *h = &res.stat.nlm4_testrply_u.holder; fl->l_start = h->l_offset; fl->l_len = h->l_len; fl->l_pid = h->svid; if (h->exclusive) fl->l_type = F_WRLCK; else fl->l_type = F_RDLCK; fl->l_whence = SEEK_SET; fl->l_sysid = 0; } else { fl->l_type = F_UNLCK; } xdr_free((xdrproc_t) xdr_nlm4_testres, &res); return (0); } } static int nlm_map_status(nlm4_stats stat) { switch (stat) { case nlm4_granted: return (0); case nlm4_denied: return (EAGAIN); case nlm4_denied_nolocks: return (ENOLCK); case nlm4_deadlck: return (EDEADLK); case nlm4_rofs: return (EROFS); case nlm4_stale_fh: return (ESTALE); case nlm4_fbig: return (EFBIG); case nlm4_failed: return (EACCES); default: return (EINVAL); } } static struct nlm_file_svid * nlm_find_svid(void *id) { struct nlm_file_svid *ns, *newns; int h; h = (((uintptr_t) id) >> 7) % NLM_SVID_HASH_SIZE; mtx_lock(&nlm_svid_lock); LIST_FOREACH(ns, &nlm_file_svids[h], ns_link) { if (ns->ns_id == id) { ns->ns_refs++; break; } } mtx_unlock(&nlm_svid_lock); if (!ns) { int svid = alloc_unr(nlm_svid_allocator); newns = malloc(sizeof(struct nlm_file_svid), M_NLM, M_WAITOK); newns->ns_refs = 1; newns->ns_id = id; newns->ns_svid = svid; newns->ns_ucred = NULL; newns->ns_active = FALSE; /* * We need to check for a race with some other * thread allocating a svid for this file. */ mtx_lock(&nlm_svid_lock); LIST_FOREACH(ns, &nlm_file_svids[h], ns_link) { if (ns->ns_id == id) { ns->ns_refs++; break; } } if (ns) { mtx_unlock(&nlm_svid_lock); free_unr(nlm_svid_allocator, newns->ns_svid); free(newns, M_NLM); } else { LIST_INSERT_HEAD(&nlm_file_svids[h], newns, ns_link); ns = newns; mtx_unlock(&nlm_svid_lock); } } return (ns); } static void nlm_free_svid(struct nlm_file_svid *ns) { mtx_lock(&nlm_svid_lock); ns->ns_refs--; if (!ns->ns_refs) { KASSERT(!ns->ns_active, ("Freeing active SVID")); LIST_REMOVE(ns, ns_link); mtx_unlock(&nlm_svid_lock); free_unr(nlm_svid_allocator, ns->ns_svid); if (ns->ns_ucred) crfree(ns->ns_ucred); free(ns, M_NLM); } else { mtx_unlock(&nlm_svid_lock); } } static int nlm_init_lock(struct flock *fl, int flags, int svid, rpcvers_t vers, size_t fhlen, void *fh, off_t size, struct nlm4_lock *lock, char oh_space[32]) { size_t oh_len; off_t start, len; if (fl->l_whence == SEEK_END) { if (size > OFF_MAX || (fl->l_start > 0 && size > OFF_MAX - fl->l_start)) return (EOVERFLOW); start = size + fl->l_start; } else if (fl->l_whence == SEEK_SET || fl->l_whence == SEEK_CUR) { start = fl->l_start; } else { return (EINVAL); } if (start < 0) return (EINVAL); if (fl->l_len < 0) { len = -fl->l_len; start -= len; if (start < 0) return (EINVAL); } else { len = fl->l_len; } if (vers == NLM_VERS) { /* * Enforce range limits on V1 locks */ if (start > 0xffffffffLL || len > 0xffffffffLL) return (EOVERFLOW); } snprintf(oh_space, 32, "%d@", svid); oh_len = strlen(oh_space); getcredhostname(NULL, oh_space + oh_len, 32 - oh_len); oh_len = strlen(oh_space); memset(lock, 0, sizeof(*lock)); lock->caller_name = prison0.pr_hostname; lock->fh.n_len = fhlen; lock->fh.n_bytes = fh; lock->oh.n_len = oh_len; lock->oh.n_bytes = oh_space; lock->svid = svid; lock->l_offset = start; lock->l_len = len; return (0); } Index: head/sys/nlm/nlm_prot_impl.c =================================================================== --- head/sys/nlm/nlm_prot_impl.c (revision 302215) +++ 
head/sys/nlm/nlm_prot_impl.c (revision 302216) @@ -1,2432 +1,2435 @@ /*- * Copyright (c) 2008 Isilon Inc http://www.isilon.com/ * Authors: Doug Rabson * Developed with Red Inc: Alfred Perlstein * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "opt_inet6.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #if __FreeBSD_version >= 700000 #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_NLM, "NLM", "Network Lock Manager"); /* * If a host is inactive (and holds no locks) for this amount of * seconds, we consider it idle and stop tracking it. */ #define NLM_IDLE_TIMEOUT 30 /* * We check the host list for idle every few seconds. */ #define NLM_IDLE_PERIOD 5 /* * We only look for GRANTED_RES messages for a little while. */ #define NLM_EXPIRE_TIMEOUT 10 /* * Support for sysctl vfs.nlm.sysid */ static SYSCTL_NODE(_vfs, OID_AUTO, nlm, CTLFLAG_RW, NULL, "Network Lock Manager"); static SYSCTL_NODE(_vfs_nlm, OID_AUTO, sysid, CTLFLAG_RW, NULL, ""); /* * Syscall hooks */ static int nlm_syscall_offset = SYS_nlm_syscall; static struct sysent nlm_syscall_prev_sysent; #if __FreeBSD_version < 700000 static struct sysent nlm_syscall_sysent = { (sizeof(struct nlm_syscall_args) / sizeof(register_t)) | SYF_MPSAFE, (sy_call_t *) nlm_syscall }; #else MAKE_SYSENT(nlm_syscall); #endif static bool_t nlm_syscall_registered = FALSE; /* * Debug level passed in from userland. We also support a sysctl hook * so that it can be changed on a live system. */ static int nlm_debug_level; SYSCTL_INT(_debug, OID_AUTO, nlm_debug, CTLFLAG_RW, &nlm_debug_level, 0, ""); #define NLM_DEBUG(_level, args...) \ do { \ if (nlm_debug_level >= (_level)) \ log(LOG_DEBUG, args); \ } while(0) #define NLM_ERR(args...) \ do { \ log(LOG_ERR, args); \ } while(0) /* * Grace period handling. The value of nlm_grace_threshold is the * value of time_uptime after which we are serving requests normally. */ static time_t nlm_grace_threshold; /* * We check for idle hosts if time_uptime is greater than * nlm_next_idle_check, */ static time_t nlm_next_idle_check; /* * A flag to indicate the server is already running. 
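/*
 * Userland rendering of the NLM_DEBUG()/NLM_ERR() macros defined
 * earlier in this file: a single integer (a sysctl in the kernel, a
 * plain global here) gates increasingly verbose diagnostics, while
 * errors are always emitted.  fprintf() stands in for the kernel log().
 */
#include <stdio.h>

static int nlm_debug_level = 1;

#define NLM_DEBUG(_level, ...)					\
	do {							\
		if (nlm_debug_level >= (_level))		\
			fprintf(stderr, __VA_ARGS__);		\
	} while (0)

#define NLM_ERR(...)						\
	do {							\
		fprintf(stderr, __VA_ARGS__);			\
	} while (0)

int
main(void)
{
	NLM_ERR("always printed: lost contact with rpc.statd\n");
	NLM_DEBUG(1, "printed at level >= 1: monitoring host\n");
	NLM_DEBUG(2, "suppressed at level 1: per-lock chatter\n");
	return (0);
}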
*/ static int nlm_is_running; /* * A socket to use for RPC - shared by all IPv4 RPC clients. */ static struct socket *nlm_socket; #ifdef INET6 /* * A socket to use for RPC - shared by all IPv6 RPC clients. */ static struct socket *nlm_socket6; #endif /* * An RPC client handle that can be used to communicate with the local * NSM. */ static CLIENT *nlm_nsm; /* * An AUTH handle for the server's creds. */ static AUTH *nlm_auth; /* * A zero timeval for sending async RPC messages. */ struct timeval nlm_zero_tv = { 0, 0 }; /* * The local NSM state number */ int nlm_nsm_state; /* * A lock to protect the host list and waiting lock list. */ static struct mtx nlm_global_lock; /* * Locks: * (l) locked by nh_lock * (s) only accessed via server RPC which is single threaded * (g) locked by nlm_global_lock * (c) const until freeing * (a) modified using atomic ops */ /* * A pending client-side lock request, stored on the nlm_waiting_locks * list. */ struct nlm_waiting_lock { TAILQ_ENTRY(nlm_waiting_lock) nw_link; /* (g) */ bool_t nw_waiting; /* (g) */ nlm4_lock nw_lock; /* (c) */ union nfsfh nw_fh; /* (c) */ struct vnode *nw_vp; /* (c) */ }; TAILQ_HEAD(nlm_waiting_lock_list, nlm_waiting_lock); struct nlm_waiting_lock_list nlm_waiting_locks; /* (g) */ /* * A pending server-side asynchronous lock request, stored on the * nh_pending list of the NLM host. */ struct nlm_async_lock { TAILQ_ENTRY(nlm_async_lock) af_link; /* (l) host's list of locks */ struct task af_task; /* (c) async callback details */ void *af_cookie; /* (l) lock manager cancel token */ struct vnode *af_vp; /* (l) vnode to lock */ struct flock af_fl; /* (c) lock details */ struct nlm_host *af_host; /* (c) host which is locking */ CLIENT *af_rpc; /* (c) rpc client to send message */ nlm4_testargs af_granted; /* (c) notification details */ time_t af_expiretime; /* (c) notification time */ }; TAILQ_HEAD(nlm_async_lock_list, nlm_async_lock); /* * NLM host. */ enum nlm_host_state { NLM_UNMONITORED, NLM_MONITORED, NLM_MONITOR_FAILED, NLM_RECOVERING }; struct nlm_rpc { CLIENT *nr_client; /* (l) RPC client handle */ time_t nr_create_time; /* (l) when client was created */ }; struct nlm_host { struct mtx nh_lock; volatile u_int nh_refs; /* (a) reference count */ TAILQ_ENTRY(nlm_host) nh_link; /* (g) global list of hosts */ char nh_caller_name[MAXNAMELEN]; /* (c) printable name of host */ uint32_t nh_sysid; /* (c) our allocaed system ID */ char nh_sysid_string[10]; /* (c) string rep. 
of sysid */ struct sockaddr_storage nh_addr; /* (s) remote address of host */ struct nlm_rpc nh_srvrpc; /* (l) RPC for server replies */ struct nlm_rpc nh_clntrpc; /* (l) RPC for client requests */ rpcvers_t nh_vers; /* (s) NLM version of host */ int nh_state; /* (s) last seen NSM state of host */ enum nlm_host_state nh_monstate; /* (l) local NSM monitoring state */ time_t nh_idle_timeout; /* (s) Time at which host is idle */ struct sysctl_ctx_list nh_sysctl; /* (c) vfs.nlm.sysid nodes */ uint32_t nh_grantcookie; /* (l) grant cookie counter */ struct nlm_async_lock_list nh_pending; /* (l) pending async locks */ struct nlm_async_lock_list nh_granted; /* (l) granted locks */ struct nlm_async_lock_list nh_finished; /* (l) finished async locks */ }; TAILQ_HEAD(nlm_host_list, nlm_host); static struct nlm_host_list nlm_hosts; /* (g) */ static uint32_t nlm_next_sysid = 1; /* (g) */ static void nlm_host_unmonitor(struct nlm_host *); struct nlm_grantcookie { uint32_t ng_sysid; uint32_t ng_cookie; }; static inline uint32_t ng_sysid(struct netobj *src) { return ((struct nlm_grantcookie *)src->n_bytes)->ng_sysid; } static inline uint32_t ng_cookie(struct netobj *src) { return ((struct nlm_grantcookie *)src->n_bytes)->ng_cookie; } /**********************************************************************/ /* * Initialise NLM globals. */ static void nlm_init(void *dummy) { int error; mtx_init(&nlm_global_lock, "nlm_global_lock", NULL, MTX_DEF); TAILQ_INIT(&nlm_waiting_locks); TAILQ_INIT(&nlm_hosts); error = syscall_register(&nlm_syscall_offset, &nlm_syscall_sysent, &nlm_syscall_prev_sysent, SY_THR_STATIC_KLD); if (error) NLM_ERR("Can't register NLM syscall\n"); else nlm_syscall_registered = TRUE; } SYSINIT(nlm_init, SI_SUB_LOCK, SI_ORDER_FIRST, nlm_init, NULL); static void nlm_uninit(void *dummy) { if (nlm_syscall_registered) syscall_deregister(&nlm_syscall_offset, &nlm_syscall_prev_sysent); } SYSUNINIT(nlm_uninit, SI_SUB_LOCK, SI_ORDER_FIRST, nlm_uninit, NULL); /* * Create a netobj from an arbitrary source. */ void nlm_make_netobj(struct netobj *dst, caddr_t src, size_t srcsize, struct malloc_type *type) { dst->n_len = srcsize; dst->n_bytes = malloc(srcsize, type, M_WAITOK); memcpy(dst->n_bytes, src, srcsize); } /* * Copy a struct netobj. */ void nlm_copy_netobj(struct netobj *dst, struct netobj *src, struct malloc_type *type) { nlm_make_netobj(dst, src->n_bytes, src->n_len, type); } /* * Create an RPC client handle for the given (address,prog,vers) * triple using UDP. */ static CLIENT * nlm_get_rpc(struct sockaddr *sa, rpcprog_t prog, rpcvers_t vers) { char *wchan = "nlmrcv"; const char* protofmly; struct sockaddr_storage ss; struct socket *so; CLIENT *rpcb; struct timeval timo; RPCB parms; char *uaddr; enum clnt_stat stat = RPC_SUCCESS; int rpcvers = RPCBVERS4; bool_t do_tcp = FALSE; bool_t tryagain = FALSE; struct portmap mapping; u_short port = 0; /* * First we need to contact the remote RPCBIND service to find * the right port. */ memcpy(&ss, sa, sa->sa_len); switch (ss.ss_family) { case AF_INET: ((struct sockaddr_in *)&ss)->sin_port = htons(111); protofmly = "inet"; so = nlm_socket; break; #ifdef INET6 case AF_INET6: ((struct sockaddr_in6 *)&ss)->sin6_port = htons(111); protofmly = "inet6"; so = nlm_socket6; break; #endif default: /* * Unsupported address family - fail. 
*/ return (NULL); } rpcb = clnt_dg_create(so, (struct sockaddr *)&ss, RPCBPROG, rpcvers, 0, 0); if (!rpcb) return (NULL); try_tcp: parms.r_prog = prog; parms.r_vers = vers; if (do_tcp) parms.r_netid = "tcp"; else parms.r_netid = "udp"; parms.r_addr = ""; parms.r_owner = ""; /* * Use the default timeout. */ timo.tv_sec = 25; timo.tv_usec = 0; again: switch (rpcvers) { case RPCBVERS4: case RPCBVERS: /* * Try RPCBIND 4 then 3. */ uaddr = NULL; stat = CLNT_CALL(rpcb, (rpcprog_t) RPCBPROC_GETADDR, (xdrproc_t) xdr_rpcb, &parms, (xdrproc_t) xdr_wrapstring, &uaddr, timo); if (stat == RPC_SUCCESS) { /* * We have a reply from the remote RPCBIND - turn it * into an appropriate address and make a new client * that can talk to the remote NLM. * * XXX fixup IPv6 scope ID. */ struct netbuf *a; a = __rpc_uaddr2taddr_af(ss.ss_family, uaddr); if (!a) { tryagain = TRUE; } else { tryagain = FALSE; memcpy(&ss, a->buf, a->len); free(a->buf, M_RPC); free(a, M_RPC); xdr_free((xdrproc_t) xdr_wrapstring, &uaddr); } } if (tryagain || stat == RPC_PROGVERSMISMATCH) { if (rpcvers == RPCBVERS4) rpcvers = RPCBVERS; else if (rpcvers == RPCBVERS) rpcvers = PMAPVERS; CLNT_CONTROL(rpcb, CLSET_VERS, &rpcvers); goto again; } break; case PMAPVERS: /* * Try portmap. */ mapping.pm_prog = parms.r_prog; mapping.pm_vers = parms.r_vers; mapping.pm_prot = do_tcp ? IPPROTO_TCP : IPPROTO_UDP; mapping.pm_port = 0; stat = CLNT_CALL(rpcb, (rpcprog_t) PMAPPROC_GETPORT, (xdrproc_t) xdr_portmap, &mapping, (xdrproc_t) xdr_u_short, &port, timo); if (stat == RPC_SUCCESS) { switch (ss.ss_family) { case AF_INET: ((struct sockaddr_in *)&ss)->sin_port = htons(port); break; #ifdef INET6 case AF_INET6: ((struct sockaddr_in6 *)&ss)->sin6_port = htons(port); break; #endif } } break; default: panic("invalid rpcvers %d", rpcvers); } /* * We may have a positive response from the portmapper, but the NLM * service was not found. Make sure we received a valid port. */ switch (ss.ss_family) { case AF_INET: port = ((struct sockaddr_in *)&ss)->sin_port; break; #ifdef INET6 case AF_INET6: port = ((struct sockaddr_in6 *)&ss)->sin6_port; break; #endif } if (stat != RPC_SUCCESS || !port) { /* * If we were able to talk to rpcbind or portmap, but the udp * variant wasn't available, ask about tcp. * * XXX - We could also check for a TCP portmapper, but * if the host is running a portmapper at all, we should be able * to hail it over UDP. */ if (stat == RPC_SUCCESS && !do_tcp) { do_tcp = TRUE; goto try_tcp; } /* Otherwise, bad news. */ NLM_ERR("NLM: failed to contact remote rpcbind, " "stat = %d, port = %d\n", (int) stat, port); CLNT_DESTROY(rpcb); return (NULL); } if (do_tcp) { /* * Destroy the UDP client we used to speak to rpcbind and * recreate as a TCP client. */ struct netconfig *nconf = NULL; CLNT_DESTROY(rpcb); switch (ss.ss_family) { case AF_INET: nconf = getnetconfigent("tcp"); break; #ifdef INET6 case AF_INET6: nconf = getnetconfigent("tcp6"); break; #endif } rpcb = clnt_reconnect_create(nconf, (struct sockaddr *)&ss, prog, vers, 0, 0); CLNT_CONTROL(rpcb, CLSET_WAITCHAN, wchan); rpcb->cl_auth = nlm_auth; } else { /* * Re-use the client we used to speak to rpcbind. */ CLNT_CONTROL(rpcb, CLSET_SVC_ADDR, &ss); CLNT_CONTROL(rpcb, CLSET_PROG, &prog); CLNT_CONTROL(rpcb, CLSET_VERS, &vers); CLNT_CONTROL(rpcb, CLSET_WAITCHAN, wchan); rpcb->cl_auth = nlm_auth; } return (rpcb); } /* * This async callback after when an async lock request has been * granted. We notify the host which initiated the request. 
*/ static void nlm_lock_callback(void *arg, int pending) { struct nlm_async_lock *af = (struct nlm_async_lock *) arg; struct rpc_callextra ext; NLM_DEBUG(2, "NLM: async lock %p for %s (sysid %d) granted," " cookie %d:%d\n", af, af->af_host->nh_caller_name, af->af_host->nh_sysid, ng_sysid(&af->af_granted.cookie), ng_cookie(&af->af_granted.cookie)); /* * Send the results back to the host. * * Note: there is a possible race here with nlm_host_notify * destroying the RPC client. To avoid problems, the first * thing nlm_host_notify does is to cancel pending async lock * requests. */ memset(&ext, 0, sizeof(ext)); ext.rc_auth = nlm_auth; if (af->af_host->nh_vers == NLM_VERS4) { nlm4_granted_msg_4(&af->af_granted, NULL, af->af_rpc, &ext, nlm_zero_tv); } else { /* * Back-convert to legacy protocol */ nlm_testargs granted; granted.cookie = af->af_granted.cookie; granted.exclusive = af->af_granted.exclusive; granted.alock.caller_name = af->af_granted.alock.caller_name; granted.alock.fh = af->af_granted.alock.fh; granted.alock.oh = af->af_granted.alock.oh; granted.alock.svid = af->af_granted.alock.svid; granted.alock.l_offset = af->af_granted.alock.l_offset; granted.alock.l_len = af->af_granted.alock.l_len; nlm_granted_msg_1(&granted, NULL, af->af_rpc, &ext, nlm_zero_tv); } /* * Move this entry to the nh_granted list. */ af->af_expiretime = time_uptime + NLM_EXPIRE_TIMEOUT; mtx_lock(&af->af_host->nh_lock); TAILQ_REMOVE(&af->af_host->nh_pending, af, af_link); TAILQ_INSERT_TAIL(&af->af_host->nh_granted, af, af_link); mtx_unlock(&af->af_host->nh_lock); } /* * Free an async lock request. The request must have been removed from * any list. */ static void nlm_free_async_lock(struct nlm_async_lock *af) { /* * Free an async lock. */ if (af->af_rpc) CLNT_RELEASE(af->af_rpc); xdr_free((xdrproc_t) xdr_nlm4_testargs, &af->af_granted); if (af->af_vp) vrele(af->af_vp); free(af, M_NLM); } /* * Cancel our async request - this must be called with * af->nh_host->nh_lock held. This is slightly complicated by a * potential race with our own callback. If we fail to cancel the * lock, it must already have been granted - we make sure our async * task has completed by calling taskqueue_drain in this case. */ static int nlm_cancel_async_lock(struct nlm_async_lock *af) { struct nlm_host *host = af->af_host; int error; mtx_assert(&host->nh_lock, MA_OWNED); mtx_unlock(&host->nh_lock); error = VOP_ADVLOCKASYNC(af->af_vp, NULL, F_CANCEL, &af->af_fl, F_REMOTE, NULL, &af->af_cookie); if (error) { /* * We failed to cancel - make sure our callback has * completed before we continue. */ taskqueue_drain(taskqueue_thread, &af->af_task); } mtx_lock(&host->nh_lock); if (!error) { NLM_DEBUG(2, "NLM: async lock %p for %s (sysid %d) " "cancelled\n", af, host->nh_caller_name, host->nh_sysid); /* * Remove from the nh_pending list and free now that * we are safe from the callback. 
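/*
 * The cancel-versus-callback race handled by nlm_cancel_async_lock()
 * above, reduced to a userland skeleton: the cancel path either marks
 * the request cancelled before its callback starts, or - if it lost the
 * race - waits for the callback to finish (taskqueue_drain() in the
 * kernel, pthread_join() here) before the caller frees the request.
 * All of the names are invented for this sketch.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

struct async_req {
	pthread_mutex_t ar_lock;
	int ar_started;			/* callback has begun its work */
	int ar_cancelled;		/* cancel won the race */
	pthread_t ar_thread;
};

static void *
callback(void *arg)
{
	struct async_req *ar = arg;

	pthread_mutex_lock(&ar->ar_lock);
	if (ar->ar_cancelled) {		/* cancelled before we got going */
		pthread_mutex_unlock(&ar->ar_lock);
		return (NULL);
	}
	ar->ar_started = 1;
	pthread_mutex_unlock(&ar->ar_lock);
	usleep(10000);			/* pretend to deliver GRANTED */
	return (NULL);
}

/* Returns 0 if the request was cancelled before its callback ran. */
static int
cancel_async(struct async_req *ar)
{
	int lost_race;

	pthread_mutex_lock(&ar->ar_lock);
	if (!ar->ar_started)
		ar->ar_cancelled = 1;
	lost_race = ar->ar_started;
	pthread_mutex_unlock(&ar->ar_lock);

	/*
	 * Either way, wait for the thread; when we lost the race this is
	 * the "drain" step that makes freeing the request safe.
	 */
	pthread_join(ar->ar_thread, NULL);
	return (lost_race ? -1 : 0);
}

int
main(void)
{
	struct async_req *ar = calloc(1, sizeof(*ar));

	if (ar == NULL)
		return (1);
	pthread_mutex_init(&ar->ar_lock, NULL);
	pthread_create(&ar->ar_thread, NULL, callback, ar);
	printf("cancel %s\n",
	    cancel_async(ar) == 0 ? "succeeded" : "lost the race");
	pthread_mutex_destroy(&ar->ar_lock);
	free(ar);			/* safe: callback has finished */
	return (0);
}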
*/ TAILQ_REMOVE(&host->nh_pending, af, af_link); mtx_unlock(&host->nh_lock); nlm_free_async_lock(af); mtx_lock(&host->nh_lock); } return (error); } static void nlm_check_expired_locks(struct nlm_host *host) { struct nlm_async_lock *af; time_t uptime = time_uptime; mtx_lock(&host->nh_lock); while ((af = TAILQ_FIRST(&host->nh_granted)) != NULL && uptime >= af->af_expiretime) { NLM_DEBUG(2, "NLM: async lock %p for %s (sysid %d) expired," " cookie %d:%d\n", af, af->af_host->nh_caller_name, af->af_host->nh_sysid, ng_sysid(&af->af_granted.cookie), ng_cookie(&af->af_granted.cookie)); TAILQ_REMOVE(&host->nh_granted, af, af_link); mtx_unlock(&host->nh_lock); nlm_free_async_lock(af); mtx_lock(&host->nh_lock); } while ((af = TAILQ_FIRST(&host->nh_finished)) != NULL) { TAILQ_REMOVE(&host->nh_finished, af, af_link); mtx_unlock(&host->nh_lock); nlm_free_async_lock(af); mtx_lock(&host->nh_lock); } mtx_unlock(&host->nh_lock); } /* * Free resources used by a host. This is called after the reference * count has reached zero so it doesn't need to worry about locks. */ static void nlm_host_destroy(struct nlm_host *host) { mtx_lock(&nlm_global_lock); TAILQ_REMOVE(&nlm_hosts, host, nh_link); mtx_unlock(&nlm_global_lock); if (host->nh_srvrpc.nr_client) CLNT_RELEASE(host->nh_srvrpc.nr_client); if (host->nh_clntrpc.nr_client) CLNT_RELEASE(host->nh_clntrpc.nr_client); mtx_destroy(&host->nh_lock); sysctl_ctx_free(&host->nh_sysctl); free(host, M_NLM); } /* * Thread start callback for client lock recovery */ static void nlm_client_recovery_start(void *arg) { struct nlm_host *host = (struct nlm_host *) arg; NLM_DEBUG(1, "NLM: client lock recovery for %s started\n", host->nh_caller_name); nlm_client_recovery(host); NLM_DEBUG(1, "NLM: client lock recovery for %s completed\n", host->nh_caller_name); host->nh_monstate = NLM_MONITORED; nlm_host_release(host); kthread_exit(); } /* * This is called when we receive a host state change notification. We * unlock any active locks owned by the host. When rpc.lockd is * shutting down, this function is called with newstate set to zero * which allows us to cancel any pending async locks and clear the * locking state. */ static void nlm_host_notify(struct nlm_host *host, int newstate) { struct nlm_async_lock *af; if (newstate) { NLM_DEBUG(1, "NLM: host %s (sysid %d) rebooted, new " "state is %d\n", host->nh_caller_name, host->nh_sysid, newstate); } /* * Cancel any pending async locks for this host. */ mtx_lock(&host->nh_lock); while ((af = TAILQ_FIRST(&host->nh_pending)) != NULL) { /* * nlm_cancel_async_lock will remove the entry from * nh_pending and free it. */ nlm_cancel_async_lock(af); } mtx_unlock(&host->nh_lock); nlm_check_expired_locks(host); /* * The host just rebooted - trash its locks. */ lf_clearremotesys(host->nh_sysid); host->nh_state = newstate; /* * If we have any remote locks for this host (i.e. it * represents a remote NFS server that our local NFS client * has locks for), start a recovery thread. */ if (newstate != 0 && host->nh_monstate != NLM_RECOVERING && lf_countlocks(NLM_SYSID_CLIENT | host->nh_sysid) > 0) { struct thread *td; host->nh_monstate = NLM_RECOVERING; refcount_acquire(&host->nh_refs); kthread_add(nlm_client_recovery_start, host, curproc, &td, 0, 0, "NFS lock recovery for %s", host->nh_caller_name); } } /* * Sysctl handler to count the number of locks for a sysid. 
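/*
 * The list-draining idiom used by nlm_check_expired_locks() and
 * nlm_host_notify() above, shown with a userland mutex and TAILQ: each
 * entry is unlinked while the lock is held, but the lock is dropped
 * around the (potentially sleeping) tear-down and re-taken before the
 * list is examined again.  Names are invented for the sketch and the
 * expiry check is omitted.
 */
#include <sys/queue.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct item {
	int it_id;
	TAILQ_ENTRY(item) it_link;
};

static TAILQ_HEAD(, item) itemq = TAILQ_HEAD_INITIALIZER(itemq);
static pthread_mutex_t itemq_lock = PTHREAD_MUTEX_INITIALIZER;

static void
drain_items(void)
{
	struct item *it;

	pthread_mutex_lock(&itemq_lock);
	while ((it = TAILQ_FIRST(&itemq)) != NULL) {
		TAILQ_REMOVE(&itemq, it, it_link);
		pthread_mutex_unlock(&itemq_lock);
		/* Expensive/sleeping cleanup happens without the lock. */
		printf("freeing item %d\n", it->it_id);
		free(it);
		pthread_mutex_lock(&itemq_lock);
	}
	pthread_mutex_unlock(&itemq_lock);
}

int
main(void)
{
	int i;

	for (i = 0; i < 3; i++) {
		struct item *it = malloc(sizeof(*it));

		if (it == NULL)
			return (1);
		it->it_id = i;
		TAILQ_INSERT_TAIL(&itemq, it, it_link);
	}
	drain_items();
	return (0);
}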
*/ static int nlm_host_lock_count_sysctl(SYSCTL_HANDLER_ARGS) { struct nlm_host *host; int count; host = oidp->oid_arg1; count = lf_countlocks(host->nh_sysid); return sysctl_handle_int(oidp, &count, 0, req); } /* * Sysctl handler to count the number of client locks for a sysid. */ static int nlm_host_client_lock_count_sysctl(SYSCTL_HANDLER_ARGS) { struct nlm_host *host; int count; host = oidp->oid_arg1; count = lf_countlocks(NLM_SYSID_CLIENT | host->nh_sysid); return sysctl_handle_int(oidp, &count, 0, req); } /* * Create a new NLM host. */ static struct nlm_host * nlm_create_host(const char* caller_name) { struct nlm_host *host; struct sysctl_oid *oid; mtx_assert(&nlm_global_lock, MA_OWNED); NLM_DEBUG(1, "NLM: new host %s (sysid %d)\n", caller_name, nlm_next_sysid); host = malloc(sizeof(struct nlm_host), M_NLM, M_NOWAIT|M_ZERO); if (!host) return (NULL); mtx_init(&host->nh_lock, "nh_lock", NULL, MTX_DEF); host->nh_refs = 1; strlcpy(host->nh_caller_name, caller_name, MAXNAMELEN); host->nh_sysid = nlm_next_sysid++; snprintf(host->nh_sysid_string, sizeof(host->nh_sysid_string), "%d", host->nh_sysid); host->nh_vers = 0; host->nh_state = 0; host->nh_monstate = NLM_UNMONITORED; host->nh_grantcookie = 1; TAILQ_INIT(&host->nh_pending); TAILQ_INIT(&host->nh_granted); TAILQ_INIT(&host->nh_finished); TAILQ_INSERT_TAIL(&nlm_hosts, host, nh_link); mtx_unlock(&nlm_global_lock); sysctl_ctx_init(&host->nh_sysctl); oid = SYSCTL_ADD_NODE(&host->nh_sysctl, SYSCTL_STATIC_CHILDREN(_vfs_nlm_sysid), OID_AUTO, host->nh_sysid_string, CTLFLAG_RD, NULL, ""); SYSCTL_ADD_STRING(&host->nh_sysctl, SYSCTL_CHILDREN(oid), OID_AUTO, "hostname", CTLFLAG_RD, host->nh_caller_name, 0, ""); SYSCTL_ADD_UINT(&host->nh_sysctl, SYSCTL_CHILDREN(oid), OID_AUTO, "version", CTLFLAG_RD, &host->nh_vers, 0, ""); SYSCTL_ADD_UINT(&host->nh_sysctl, SYSCTL_CHILDREN(oid), OID_AUTO, "monitored", CTLFLAG_RD, &host->nh_monstate, 0, ""); SYSCTL_ADD_PROC(&host->nh_sysctl, SYSCTL_CHILDREN(oid), OID_AUTO, "lock_count", CTLTYPE_INT | CTLFLAG_RD, host, 0, nlm_host_lock_count_sysctl, "I", ""); SYSCTL_ADD_PROC(&host->nh_sysctl, SYSCTL_CHILDREN(oid), OID_AUTO, "client_lock_count", CTLTYPE_INT | CTLFLAG_RD, host, 0, nlm_host_client_lock_count_sysctl, "I", ""); mtx_lock(&nlm_global_lock); return (host); } /* * Acquire the next sysid for remote locks not handled by the NLM. */ uint32_t nlm_acquire_next_sysid(void) { uint32_t next_sysid; mtx_lock(&nlm_global_lock); next_sysid = nlm_next_sysid++; mtx_unlock(&nlm_global_lock); return (next_sysid); } /* * Return non-zero if the address parts of the two sockaddrs are the * same. */ static int nlm_compare_addr(const struct sockaddr *a, const struct sockaddr *b) { const struct sockaddr_in *a4, *b4; #ifdef INET6 const struct sockaddr_in6 *a6, *b6; #endif if (a->sa_family != b->sa_family) return (FALSE); switch (a->sa_family) { case AF_INET: a4 = (const struct sockaddr_in *) a; b4 = (const struct sockaddr_in *) b; return !memcmp(&a4->sin_addr, &b4->sin_addr, sizeof(a4->sin_addr)); #ifdef INET6 case AF_INET6: a6 = (const struct sockaddr_in6 *) a; b6 = (const struct sockaddr_in6 *) b; return !memcmp(&a6->sin6_addr, &b6->sin6_addr, sizeof(a6->sin6_addr)); #endif } return (0); } /* * Check for idle hosts and stop monitoring them. We could also free * the host structure here, possibly after a larger timeout but that * would require some care to avoid races with * e.g. nlm_host_lock_count_sysctl. 
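/*
 * nlm_compare_addr() above deliberately ignores ports and compares only
 * the per-family address bytes.  The standalone version below shows two
 * sockaddrs for the same host but different ports comparing equal;
 * addr_only_equal() is an invented name for the sketch.
 */
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>

static int
addr_only_equal(const struct sockaddr *a, const struct sockaddr *b)
{
	const struct sockaddr_in *a4, *b4;
	const struct sockaddr_in6 *a6, *b6;

	if (a->sa_family != b->sa_family)
		return (0);
	switch (a->sa_family) {
	case AF_INET:
		a4 = (const struct sockaddr_in *)a;
		b4 = (const struct sockaddr_in *)b;
		return (memcmp(&a4->sin_addr, &b4->sin_addr,
		    sizeof(a4->sin_addr)) == 0);
	case AF_INET6:
		a6 = (const struct sockaddr_in6 *)a;
		b6 = (const struct sockaddr_in6 *)b;
		return (memcmp(&a6->sin6_addr, &b6->sin6_addr,
		    sizeof(a6->sin6_addr)) == 0);
	}
	return (0);
}

int
main(void)
{
	struct sockaddr_in x, y;

	memset(&x, 0, sizeof(x));
	x.sin_family = AF_INET;
	x.sin_port = htons(111);
	inet_pton(AF_INET, "192.0.2.1", &x.sin_addr);

	y = x;
	y.sin_port = htons(2049);	/* different port, same host */

	printf("same host: %s\n",
	    addr_only_equal((struct sockaddr *)&x, (struct sockaddr *)&y) ?
	    "yes" : "no");
	return (0);
}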
*/ static void nlm_check_idle(void) { struct nlm_host *host; mtx_assert(&nlm_global_lock, MA_OWNED); if (time_uptime <= nlm_next_idle_check) return; nlm_next_idle_check = time_uptime + NLM_IDLE_PERIOD; TAILQ_FOREACH(host, &nlm_hosts, nh_link) { if (host->nh_monstate == NLM_MONITORED && time_uptime > host->nh_idle_timeout) { mtx_unlock(&nlm_global_lock); if (lf_countlocks(host->nh_sysid) > 0 || lf_countlocks(NLM_SYSID_CLIENT + host->nh_sysid)) { host->nh_idle_timeout = time_uptime + NLM_IDLE_TIMEOUT; mtx_lock(&nlm_global_lock); continue; } nlm_host_unmonitor(host); mtx_lock(&nlm_global_lock); } } } /* * Search for an existing NLM host that matches the given name * (typically the caller_name element of an nlm4_lock). If none is * found, create a new host. If 'addr' is non-NULL, record the remote * address of the host so that we can call it back for async * responses. If 'vers' is greater than zero then record the NLM * program version to use to communicate with this client. */ struct nlm_host * nlm_find_host_by_name(const char *name, const struct sockaddr *addr, rpcvers_t vers) { struct nlm_host *host; mtx_lock(&nlm_global_lock); /* * The remote host is determined by caller_name. */ TAILQ_FOREACH(host, &nlm_hosts, nh_link) { if (!strcmp(host->nh_caller_name, name)) break; } if (!host) { host = nlm_create_host(name); if (!host) { mtx_unlock(&nlm_global_lock); return (NULL); } } refcount_acquire(&host->nh_refs); host->nh_idle_timeout = time_uptime + NLM_IDLE_TIMEOUT; /* * If we have an address for the host, record it so that we * can send async replies etc. */ if (addr) { KASSERT(addr->sa_len < sizeof(struct sockaddr_storage), ("Strange remote transport address length")); /* * If we have seen an address before and we currently * have an RPC client handle, make sure the address is * the same, otherwise discard the client handle. */ if (host->nh_addr.ss_len && host->nh_srvrpc.nr_client) { if (!nlm_compare_addr( (struct sockaddr *) &host->nh_addr, addr) || host->nh_vers != vers) { CLIENT *client; mtx_lock(&host->nh_lock); client = host->nh_srvrpc.nr_client; host->nh_srvrpc.nr_client = NULL; mtx_unlock(&host->nh_lock); if (client) { CLNT_RELEASE(client); } } } memcpy(&host->nh_addr, addr, addr->sa_len); host->nh_vers = vers; } nlm_check_idle(); mtx_unlock(&nlm_global_lock); return (host); } /* * Search for an existing NLM host that matches the given remote * address. If none is found, create a new host with the requested * address and remember 'vers' as the NLM protocol version to use for * that host. */ struct nlm_host * nlm_find_host_by_addr(const struct sockaddr *addr, int vers) { /* * Fake up a name using inet_ntop. This buffer is * large enough for an IPv6 address. */ char tmp[sizeof "ffff:ffff:ffff:ffff:ffff:ffff:255.255.255.255"]; struct nlm_host *host; switch (addr->sa_family) { case AF_INET: inet_ntop(AF_INET, &((const struct sockaddr_in *) addr)->sin_addr, tmp, sizeof tmp); break; #ifdef INET6 case AF_INET6: inet_ntop(AF_INET6, &((const struct sockaddr_in6 *) addr)->sin6_addr, tmp, sizeof tmp); break; #endif default: strlcpy(tmp, "", sizeof(tmp)); } mtx_lock(&nlm_global_lock); /* * The remote host is determined by caller_name. 
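/*
 * nlm_find_host_by_addr() above fakes up a printable caller name with
 * inet_ntop() when all it has is a transport address, sizing the buffer
 * from the longest possible IPv6 text form.  A standalone version of
 * just that step:
 */
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <stdio.h>

int
main(void)
{
	char tmp[sizeof "ffff:ffff:ffff:ffff:ffff:ffff:255.255.255.255"];
	struct in_addr a4;
	struct in6_addr a6;

	inet_pton(AF_INET, "192.0.2.7", &a4);
	inet_ntop(AF_INET, &a4, tmp, sizeof(tmp));
	printf("v4 name: %s\n", tmp);

	inet_pton(AF_INET6, "2001:db8::1", &a6);
	inet_ntop(AF_INET6, &a6, tmp, sizeof(tmp));
	printf("v6 name: %s\n", tmp);
	return (0);
}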
*/ TAILQ_FOREACH(host, &nlm_hosts, nh_link) { if (nlm_compare_addr(addr, (const struct sockaddr *) &host->nh_addr)) break; } if (!host) { host = nlm_create_host(tmp); if (!host) { mtx_unlock(&nlm_global_lock); return (NULL); } memcpy(&host->nh_addr, addr, addr->sa_len); host->nh_vers = vers; } refcount_acquire(&host->nh_refs); host->nh_idle_timeout = time_uptime + NLM_IDLE_TIMEOUT; nlm_check_idle(); mtx_unlock(&nlm_global_lock); return (host); } /* * Find the NLM host that matches the value of 'sysid'. If none * exists, return NULL. */ static struct nlm_host * nlm_find_host_by_sysid(int sysid) { struct nlm_host *host; TAILQ_FOREACH(host, &nlm_hosts, nh_link) { if (host->nh_sysid == sysid) { refcount_acquire(&host->nh_refs); return (host); } } return (NULL); } void nlm_host_release(struct nlm_host *host) { if (refcount_release(&host->nh_refs)) { /* * Free the host */ nlm_host_destroy(host); } } /* * Unregister this NLM host with the local NSM due to idleness. */ static void nlm_host_unmonitor(struct nlm_host *host) { mon_id smmonid; sm_stat_res smstat; struct timeval timo; enum clnt_stat stat; NLM_DEBUG(1, "NLM: unmonitoring %s (sysid %d)\n", host->nh_caller_name, host->nh_sysid); /* * We put our assigned system ID value in the priv field to * make it simpler to find the host if we are notified of a * host restart. */ smmonid.mon_name = host->nh_caller_name; smmonid.my_id.my_name = "localhost"; smmonid.my_id.my_prog = NLM_PROG; smmonid.my_id.my_vers = NLM_SM; smmonid.my_id.my_proc = NLM_SM_NOTIFY; timo.tv_sec = 25; timo.tv_usec = 0; stat = CLNT_CALL(nlm_nsm, SM_UNMON, (xdrproc_t) xdr_mon, &smmonid, (xdrproc_t) xdr_sm_stat, &smstat, timo); if (stat != RPC_SUCCESS) { NLM_ERR("Failed to contact local NSM - rpc error %d\n", stat); return; } if (smstat.res_stat == stat_fail) { NLM_ERR("Local NSM refuses to unmonitor %s\n", host->nh_caller_name); return; } host->nh_monstate = NLM_UNMONITORED; } /* * Register this NLM host with the local NSM so that we can be * notified if it reboots. */ void nlm_host_monitor(struct nlm_host *host, int state) { mon smmon; sm_stat_res smstat; struct timeval timo; enum clnt_stat stat; if (state && !host->nh_state) { /* * This is the first time we have seen an NSM state * value for this host. We record it here to help * detect host reboots. */ host->nh_state = state; NLM_DEBUG(1, "NLM: host %s (sysid %d) has NSM state %d\n", host->nh_caller_name, host->nh_sysid, state); } mtx_lock(&host->nh_lock); if (host->nh_monstate != NLM_UNMONITORED) { mtx_unlock(&host->nh_lock); return; } host->nh_monstate = NLM_MONITORED; mtx_unlock(&host->nh_lock); NLM_DEBUG(1, "NLM: monitoring %s (sysid %d)\n", host->nh_caller_name, host->nh_sysid); /* * We put our assigned system ID value in the priv field to * make it simpler to find the host if we are notified of a * host restart. 
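 *
 * (Editorial note: only sizeof(host->nh_sysid) bytes of the opaque
 * priv area are used.  When the NSM later calls back into
 * nlm_sm_notify(), those bytes are copied back out and handed to
 * nlm_find_host_by_sysid() to locate this host again.)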
	 */
	smmon.mon_id.mon_name = host->nh_caller_name;
	smmon.mon_id.my_id.my_name = "localhost";
	smmon.mon_id.my_id.my_prog = NLM_PROG;
	smmon.mon_id.my_id.my_vers = NLM_SM;
	smmon.mon_id.my_id.my_proc = NLM_SM_NOTIFY;
	memcpy(smmon.priv, &host->nh_sysid, sizeof(host->nh_sysid));

	timo.tv_sec = 25;
	timo.tv_usec = 0;
	stat = CLNT_CALL(nlm_nsm, SM_MON,
	    (xdrproc_t) xdr_mon, &smmon,
	    (xdrproc_t) xdr_sm_stat, &smstat, timo);

	if (stat != RPC_SUCCESS) {
		NLM_ERR("Failed to contact local NSM - rpc error %d\n", stat);
		return;
	}
	if (smstat.res_stat == stat_fail) {
		NLM_ERR("Local NSM refuses to monitor %s\n",
		    host->nh_caller_name);
		mtx_lock(&host->nh_lock);
		host->nh_monstate = NLM_MONITOR_FAILED;
		mtx_unlock(&host->nh_lock);
		return;
	}

	host->nh_monstate = NLM_MONITORED;
}

/*
 * Return an RPC client handle that can be used to talk to the NLM
 * running on the given host.
 */
CLIENT *
nlm_host_get_rpc(struct nlm_host *host, bool_t isserver)
{
	struct nlm_rpc *rpc;
	CLIENT *client;

	mtx_lock(&host->nh_lock);

	if (isserver)
		rpc = &host->nh_srvrpc;
	else
		rpc = &host->nh_clntrpc;

	/*
	 * We can't hold onto RPC handles for too long - the async
	 * call/reply protocol used by some NLM clients makes it hard
	 * to tell when they change port numbers (e.g. after a
	 * reboot). Note that if a client reboots while it isn't
	 * holding any locks, it won't bother to notify us. We
	 * expire the RPC handles after two minutes.
	 */
	if (rpc->nr_client && time_uptime > rpc->nr_create_time + 2*60) {
		client = rpc->nr_client;
		rpc->nr_client = NULL;
		mtx_unlock(&host->nh_lock);
		CLNT_RELEASE(client);
		mtx_lock(&host->nh_lock);
	}

	if (!rpc->nr_client) {
		mtx_unlock(&host->nh_lock);
		client = nlm_get_rpc((struct sockaddr *)&host->nh_addr,
		    NLM_PROG, host->nh_vers);
		mtx_lock(&host->nh_lock);

		if (client) {
			if (rpc->nr_client) {
				mtx_unlock(&host->nh_lock);
				CLNT_DESTROY(client);
				mtx_lock(&host->nh_lock);
			} else {
				rpc->nr_client = client;
				rpc->nr_create_time = time_uptime;
			}
		}
	}

	client = rpc->nr_client;
	if (client)
		CLNT_ACQUIRE(client);
	mtx_unlock(&host->nh_lock);

	return (client);
}

int
nlm_host_get_sysid(struct nlm_host *host)
{

	return (host->nh_sysid);
}

int
nlm_host_get_state(struct nlm_host *host)
{

	return (host->nh_state);
}

void *
nlm_register_wait_lock(struct nlm4_lock *lock, struct vnode *vp)
{
	struct nlm_waiting_lock *nw;

	nw = malloc(sizeof(struct nlm_waiting_lock), M_NLM, M_WAITOK);
	nw->nw_lock = *lock;
	memcpy(&nw->nw_fh.fh_bytes, nw->nw_lock.fh.n_bytes,
	    nw->nw_lock.fh.n_len);
	nw->nw_lock.fh.n_bytes = nw->nw_fh.fh_bytes;
	nw->nw_waiting = TRUE;
	nw->nw_vp = vp;
	mtx_lock(&nlm_global_lock);
	TAILQ_INSERT_TAIL(&nlm_waiting_locks, nw, nw_link);
	mtx_unlock(&nlm_global_lock);

	return nw;
}

void
nlm_deregister_wait_lock(void *handle)
{
	struct nlm_waiting_lock *nw = handle;

	mtx_lock(&nlm_global_lock);
	TAILQ_REMOVE(&nlm_waiting_locks, nw, nw_link);
	mtx_unlock(&nlm_global_lock);

	free(nw, M_NLM);
}

int
nlm_wait_lock(void *handle, int timo)
{
	struct nlm_waiting_lock *nw = handle;
-	int error;
+	int error, stops_deferred;

	/*
	 * If the granted message arrived before we got here,
	 * nw->nw_waiting will be FALSE - in that case, don't sleep.
	 */
	mtx_lock(&nlm_global_lock);
	error = 0;
-	if (nw->nw_waiting)
+	if (nw->nw_waiting) {
+		stops_deferred = sigdeferstop(SIGDEFERSTOP_ERESTART);
		error = msleep(nw, &nlm_global_lock, PCATCH, "nlmlock", timo);
+		sigallowstop(stops_deferred);
+	}
	TAILQ_REMOVE(&nlm_waiting_locks, nw, nw_link);

	if (error) {
		/*
		 * The granted message may arrive after the
		 * interrupt/timeout but before we manage to lock the
		 * mutex. Detect this by examining nw_lock.
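		 *
		 * (Editorial summary of the cases handled here, plus
		 * my reading of the sigdeferstop(9) calls added above;
		 * neither is stated in the original source:
		 *
		 *   granted before we slept   - nw_waiting already
		 *                               FALSE, return 0
		 *   sleep interrupted or timed
		 *   out, but the grant raced
		 *   in                        - nw_waiting FALSE,
		 *                               error reset to 0
		 *   nlm_cancel_wait() woke us - msleep() returned 0
		 *                               yet nw_waiting is
		 *                               still TRUE, reported
		 *                               as EINTR
		 *
		 * The sigdeferstop(SIGDEFERSTOP_ERESTART) and
		 * sigallowstop() pair keeps a stop signal from
		 * suspending the thread while it sleeps on the wait
		 * channel; a pending stop instead ends the sleep with
		 * ERESTART, which then takes the same error path as an
		 * interrupt or timeout.)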
*/ if (!nw->nw_waiting) error = 0; } else { /* * If nlm_cancel_wait is called, then error will be * zero but nw_waiting will still be TRUE. We * translate this into EINTR. */ if (nw->nw_waiting) error = EINTR; } mtx_unlock(&nlm_global_lock); free(nw, M_NLM); return (error); } void nlm_cancel_wait(struct vnode *vp) { struct nlm_waiting_lock *nw; mtx_lock(&nlm_global_lock); TAILQ_FOREACH(nw, &nlm_waiting_locks, nw_link) { if (nw->nw_vp == vp) { wakeup(nw); } } mtx_unlock(&nlm_global_lock); } /**********************************************************************/ /* * Syscall interface with userland. */ extern void nlm_prog_0(struct svc_req *rqstp, SVCXPRT *transp); extern void nlm_prog_1(struct svc_req *rqstp, SVCXPRT *transp); extern void nlm_prog_3(struct svc_req *rqstp, SVCXPRT *transp); extern void nlm_prog_4(struct svc_req *rqstp, SVCXPRT *transp); static int nlm_register_services(SVCPOOL *pool, int addr_count, char **addrs) { static rpcvers_t versions[] = { NLM_SM, NLM_VERS, NLM_VERSX, NLM_VERS4 }; static void (*dispatchers[])(struct svc_req *, SVCXPRT *) = { nlm_prog_0, nlm_prog_1, nlm_prog_3, nlm_prog_4 }; SVCXPRT **xprts; char netid[16]; char uaddr[128]; struct netconfig *nconf; int i, j, error; if (!addr_count) { NLM_ERR("NLM: no service addresses given - can't start server"); return (EINVAL); } if (addr_count < 0 || addr_count > 256 ) { NLM_ERR("NLM: too many service addresses (%d) given, " "max 256 - can't start server\n", addr_count); return (EINVAL); } xprts = malloc(addr_count * sizeof(SVCXPRT *), M_NLM, M_WAITOK|M_ZERO); for (i = 0; i < nitems(versions); i++) { for (j = 0; j < addr_count; j++) { /* * Create transports for the first version and * then just register everything else to the * same transports. */ if (i == 0) { char *up; error = copyin(&addrs[2*j], &up, sizeof(char*)); if (error) goto out; error = copyinstr(up, netid, sizeof(netid), NULL); if (error) goto out; error = copyin(&addrs[2*j+1], &up, sizeof(char*)); if (error) goto out; error = copyinstr(up, uaddr, sizeof(uaddr), NULL); if (error) goto out; nconf = getnetconfigent(netid); if (!nconf) { NLM_ERR("Can't lookup netid %s\n", netid); error = EINVAL; goto out; } xprts[j] = svc_tp_create(pool, dispatchers[i], NLM_PROG, versions[i], uaddr, nconf); if (!xprts[j]) { NLM_ERR("NLM: unable to create " "(NLM_PROG, %d).\n", versions[i]); error = EINVAL; goto out; } freenetconfigent(nconf); } else { nconf = getnetconfigent(xprts[j]->xp_netid); rpcb_unset(NLM_PROG, versions[i], nconf); if (!svc_reg(xprts[j], NLM_PROG, versions[i], dispatchers[i], nconf)) { NLM_ERR("NLM: can't register " "(NLM_PROG, %d)\n", versions[i]); error = EINVAL; goto out; } } } } error = 0; out: for (j = 0; j < addr_count; j++) { if (xprts[j]) SVC_RELEASE(xprts[j]); } free(xprts, M_NLM); return (error); } /* * Main server entry point. Contacts the local NSM to get its current * state and send SM_UNMON_ALL. Registers the NLM services and then * services requests. Does not return until the server is interrupted * by a signal. 
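 *
 * (Editorial note: 'addrs' is laid out as addr_count pairs of
 * user-space strings - addrs[2*j] is a netid such as "udp" or "tcp6"
 * and addrs[2*j+1] the matching universal address - which is how the
 * copyin() loop in nlm_register_services() above consumes it.)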
*/ static int nlm_server_main(int addr_count, char **addrs) { struct thread *td = curthread; int error; SVCPOOL *pool = NULL; struct sockopt opt; int portlow; #ifdef INET6 struct sockaddr_in6 sin6; #endif struct sockaddr_in sin; my_id id; sm_stat smstat; struct timeval timo; enum clnt_stat stat; struct nlm_host *host, *nhost; struct nlm_waiting_lock *nw; vop_advlock_t *old_nfs_advlock; vop_reclaim_t *old_nfs_reclaim; if (nlm_is_running != 0) { NLM_ERR("NLM: can't start server - " "it appears to be running already\n"); return (EPERM); } if (nlm_socket == NULL) { memset(&opt, 0, sizeof(opt)); error = socreate(AF_INET, &nlm_socket, SOCK_DGRAM, 0, td->td_ucred, td); if (error) { NLM_ERR("NLM: can't create IPv4 socket - error %d\n", error); return (error); } opt.sopt_dir = SOPT_SET; opt.sopt_level = IPPROTO_IP; opt.sopt_name = IP_PORTRANGE; portlow = IP_PORTRANGE_LOW; opt.sopt_val = &portlow; opt.sopt_valsize = sizeof(portlow); sosetopt(nlm_socket, &opt); #ifdef INET6 nlm_socket6 = NULL; error = socreate(AF_INET6, &nlm_socket6, SOCK_DGRAM, 0, td->td_ucred, td); if (error) { NLM_ERR("NLM: can't create IPv6 socket - error %d\n", error); soclose(nlm_socket); nlm_socket = NULL; return (error); } opt.sopt_dir = SOPT_SET; opt.sopt_level = IPPROTO_IPV6; opt.sopt_name = IPV6_PORTRANGE; portlow = IPV6_PORTRANGE_LOW; opt.sopt_val = &portlow; opt.sopt_valsize = sizeof(portlow); sosetopt(nlm_socket6, &opt); #endif } nlm_auth = authunix_create(curthread->td_ucred); #ifdef INET6 memset(&sin6, 0, sizeof(sin6)); sin6.sin6_len = sizeof(sin6); sin6.sin6_family = AF_INET6; sin6.sin6_addr = in6addr_loopback; nlm_nsm = nlm_get_rpc((struct sockaddr *) &sin6, SM_PROG, SM_VERS); if (!nlm_nsm) { #endif memset(&sin, 0, sizeof(sin)); sin.sin_len = sizeof(sin); sin.sin_family = AF_INET; sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK); nlm_nsm = nlm_get_rpc((struct sockaddr *) &sin, SM_PROG, SM_VERS); #ifdef INET6 } #endif if (!nlm_nsm) { NLM_ERR("Can't start NLM - unable to contact NSM\n"); error = EINVAL; goto out; } pool = svcpool_create("NLM", NULL); error = nlm_register_services(pool, addr_count, addrs); if (error) goto out; memset(&id, 0, sizeof(id)); id.my_name = "NFS NLM"; timo.tv_sec = 25; timo.tv_usec = 0; stat = CLNT_CALL(nlm_nsm, SM_UNMON_ALL, (xdrproc_t) xdr_my_id, &id, (xdrproc_t) xdr_sm_stat, &smstat, timo); if (stat != RPC_SUCCESS) { struct rpc_err err; CLNT_GETERR(nlm_nsm, &err); NLM_ERR("NLM: unexpected error contacting NSM, " "stat=%d, errno=%d\n", stat, err.re_errno); error = EINVAL; goto out; } nlm_is_running = 1; NLM_DEBUG(1, "NLM: local NSM state is %d\n", smstat.state); nlm_nsm_state = smstat.state; old_nfs_advlock = nfs_advlock_p; nfs_advlock_p = nlm_advlock; old_nfs_reclaim = nfs_reclaim_p; nfs_reclaim_p = nlm_reclaim; svc_run(pool); error = 0; nfs_advlock_p = old_nfs_advlock; nfs_reclaim_p = old_nfs_reclaim; out: nlm_is_running = 0; if (pool) svcpool_destroy(pool); /* * We are finished communicating with the NSM. */ if (nlm_nsm) { CLNT_RELEASE(nlm_nsm); nlm_nsm = NULL; } /* * Trash all the existing state so that if the server * restarts, it gets a clean slate. This is complicated by the * possibility that there may be other threads trying to make * client locking requests. * * First we fake a client reboot notification which will * cancel any pending async locks and purge remote lock state * from the local lock manager. We release the reference from * nlm_hosts to the host (which may remove it from the list * and free it). 
After this phase, the only entries in the * nlm_host list should be from other threads performing * client lock requests. */ mtx_lock(&nlm_global_lock); TAILQ_FOREACH(nw, &nlm_waiting_locks, nw_link) { wakeup(nw); } TAILQ_FOREACH_SAFE(host, &nlm_hosts, nh_link, nhost) { mtx_unlock(&nlm_global_lock); nlm_host_notify(host, 0); nlm_host_release(host); mtx_lock(&nlm_global_lock); } mtx_unlock(&nlm_global_lock); AUTH_DESTROY(nlm_auth); return (error); } int sys_nlm_syscall(struct thread *td, struct nlm_syscall_args *uap) { int error; #if __FreeBSD_version >= 700000 error = priv_check(td, PRIV_NFS_LOCKD); #else error = suser(td); #endif if (error) return (error); nlm_debug_level = uap->debug_level; nlm_grace_threshold = time_uptime + uap->grace_period; nlm_next_idle_check = time_uptime + NLM_IDLE_PERIOD; return nlm_server_main(uap->addr_count, uap->addrs); } /**********************************************************************/ /* * NLM implementation details, called from the RPC stubs. */ void nlm_sm_notify(struct nlm_sm_status *argp) { uint32_t sysid; struct nlm_host *host; NLM_DEBUG(3, "nlm_sm_notify(): mon_name = %s\n", argp->mon_name); memcpy(&sysid, &argp->priv, sizeof(sysid)); host = nlm_find_host_by_sysid(sysid); if (host) { nlm_host_notify(host, argp->state); nlm_host_release(host); } } static void nlm_convert_to_fhandle_t(fhandle_t *fhp, struct netobj *p) { memcpy(fhp, p->n_bytes, sizeof(fhandle_t)); } struct vfs_state { struct mount *vs_mp; struct vnode *vs_vp; int vs_vnlocked; }; static int nlm_get_vfs_state(struct nlm_host *host, struct svc_req *rqstp, fhandle_t *fhp, struct vfs_state *vs, accmode_t accmode) { int error, exflags; struct ucred *cred = NULL, *credanon = NULL; memset(vs, 0, sizeof(*vs)); vs->vs_mp = vfs_getvfs(&fhp->fh_fsid); if (!vs->vs_mp) { return (ESTALE); } /* accmode == 0 means don't check, since it is an unlock. */ if (accmode != 0) { error = VFS_CHECKEXP(vs->vs_mp, (struct sockaddr *)&host->nh_addr, &exflags, &credanon, NULL, NULL); if (error) goto out; if (exflags & MNT_EXRDONLY || (vs->vs_mp->mnt_flag & MNT_RDONLY)) { error = EROFS; goto out; } } error = VFS_FHTOVP(vs->vs_mp, &fhp->fh_fid, LK_EXCLUSIVE, &vs->vs_vp); if (error) goto out; vs->vs_vnlocked = TRUE; if (accmode != 0) { if (!svc_getcred(rqstp, &cred, NULL)) { error = EINVAL; goto out; } if (cred->cr_uid == 0 || (exflags & MNT_EXPORTANON)) { crfree(cred); cred = credanon; credanon = NULL; } /* * Check cred. */ error = VOP_ACCESS(vs->vs_vp, accmode, cred, curthread); /* * If this failed and accmode != VWRITE, try again with * VWRITE to maintain backwards compatibility with the * old code that always used VWRITE. 
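 *
 * (Editorial note: the callers map requests onto accmode as follows -
 * lock and test requests check VREAD for shared and VWRITE for
 * exclusive locks, while cancel and unlock pass 0 so no access check
 * is made at all.)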
*/ if (error != 0 && accmode != VWRITE) error = VOP_ACCESS(vs->vs_vp, VWRITE, cred, curthread); if (error) goto out; } #if __FreeBSD_version < 800011 VOP_UNLOCK(vs->vs_vp, 0, curthread); #else VOP_UNLOCK(vs->vs_vp, 0); #endif vs->vs_vnlocked = FALSE; out: if (cred) crfree(cred); if (credanon) crfree(credanon); return (error); } static void nlm_release_vfs_state(struct vfs_state *vs) { if (vs->vs_vp) { if (vs->vs_vnlocked) vput(vs->vs_vp); else vrele(vs->vs_vp); } if (vs->vs_mp) vfs_rel(vs->vs_mp); } static nlm4_stats nlm_convert_error(int error) { if (error == ESTALE) return nlm4_stale_fh; else if (error == EROFS) return nlm4_rofs; else return nlm4_failed; } int nlm_do_test(nlm4_testargs *argp, nlm4_testres *result, struct svc_req *rqstp, CLIENT **rpcp) { fhandle_t fh; struct vfs_state vs; struct nlm_host *host, *bhost; int error, sysid; struct flock fl; accmode_t accmode; memset(result, 0, sizeof(*result)); memset(&vs, 0, sizeof(vs)); host = nlm_find_host_by_name(argp->alock.caller_name, svc_getrpccaller(rqstp), rqstp->rq_vers); if (!host) { result->stat.stat = nlm4_denied_nolocks; return (ENOMEM); } NLM_DEBUG(3, "nlm_do_test(): caller_name = %s (sysid = %d)\n", host->nh_caller_name, host->nh_sysid); nlm_check_expired_locks(host); sysid = host->nh_sysid; nlm_convert_to_fhandle_t(&fh, &argp->alock.fh); nlm_copy_netobj(&result->cookie, &argp->cookie, M_RPC); if (time_uptime < nlm_grace_threshold) { result->stat.stat = nlm4_denied_grace_period; goto out; } accmode = argp->exclusive ? VWRITE : VREAD; error = nlm_get_vfs_state(host, rqstp, &fh, &vs, accmode); if (error) { result->stat.stat = nlm_convert_error(error); goto out; } fl.l_start = argp->alock.l_offset; fl.l_len = argp->alock.l_len; fl.l_pid = argp->alock.svid; fl.l_sysid = sysid; fl.l_whence = SEEK_SET; if (argp->exclusive) fl.l_type = F_WRLCK; else fl.l_type = F_RDLCK; error = VOP_ADVLOCK(vs.vs_vp, NULL, F_GETLK, &fl, F_REMOTE); if (error) { result->stat.stat = nlm4_failed; goto out; } if (fl.l_type == F_UNLCK) { result->stat.stat = nlm4_granted; } else { result->stat.stat = nlm4_denied; result->stat.nlm4_testrply_u.holder.exclusive = (fl.l_type == F_WRLCK); result->stat.nlm4_testrply_u.holder.svid = fl.l_pid; bhost = nlm_find_host_by_sysid(fl.l_sysid); if (bhost) { /* * We don't have any useful way of recording * the value of oh used in the original lock * request. Ideally, the test reply would have * a space for the owning host's name allowing * our caller's NLM to keep track. * * As far as I can see, Solaris uses an eight * byte structure for oh which contains a four * byte pid encoded in local byte order and * the first four bytes of the host * name. Linux uses a variable length string * 'pid@hostname' in ascii but doesn't even * return that in test replies. * * For the moment, return nothing in oh * (already zero'ed above). 
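 *
 * (Editorial sketch of the Solaris layout described above, shown only
 * for illustration - nothing here depends on it:
 *
 *	byte 0..3   pid of the lock owner, local byte order
 *	byte 4..7   first four bytes of the owning host's name)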
*/ nlm_host_release(bhost); } result->stat.nlm4_testrply_u.holder.l_offset = fl.l_start; result->stat.nlm4_testrply_u.holder.l_len = fl.l_len; } out: nlm_release_vfs_state(&vs); if (rpcp) *rpcp = nlm_host_get_rpc(host, TRUE); nlm_host_release(host); return (0); } int nlm_do_lock(nlm4_lockargs *argp, nlm4_res *result, struct svc_req *rqstp, bool_t monitor, CLIENT **rpcp) { fhandle_t fh; struct vfs_state vs; struct nlm_host *host; int error, sysid; struct flock fl; accmode_t accmode; memset(result, 0, sizeof(*result)); memset(&vs, 0, sizeof(vs)); host = nlm_find_host_by_name(argp->alock.caller_name, svc_getrpccaller(rqstp), rqstp->rq_vers); if (!host) { result->stat.stat = nlm4_denied_nolocks; return (ENOMEM); } NLM_DEBUG(3, "nlm_do_lock(): caller_name = %s (sysid = %d)\n", host->nh_caller_name, host->nh_sysid); if (monitor && host->nh_state && argp->state && host->nh_state != argp->state) { /* * The host rebooted without telling us. Trash its * locks. */ nlm_host_notify(host, argp->state); } nlm_check_expired_locks(host); sysid = host->nh_sysid; nlm_convert_to_fhandle_t(&fh, &argp->alock.fh); nlm_copy_netobj(&result->cookie, &argp->cookie, M_RPC); if (time_uptime < nlm_grace_threshold && !argp->reclaim) { result->stat.stat = nlm4_denied_grace_period; goto out; } accmode = argp->exclusive ? VWRITE : VREAD; error = nlm_get_vfs_state(host, rqstp, &fh, &vs, accmode); if (error) { result->stat.stat = nlm_convert_error(error); goto out; } fl.l_start = argp->alock.l_offset; fl.l_len = argp->alock.l_len; fl.l_pid = argp->alock.svid; fl.l_sysid = sysid; fl.l_whence = SEEK_SET; if (argp->exclusive) fl.l_type = F_WRLCK; else fl.l_type = F_RDLCK; if (argp->block) { struct nlm_async_lock *af; CLIENT *client; struct nlm_grantcookie cookie; /* * First, make sure we can contact the host's NLM. */ client = nlm_host_get_rpc(host, TRUE); if (!client) { result->stat.stat = nlm4_failed; goto out; } /* * First we need to check and see if there is an * existing blocked lock that matches. This could be a * badly behaved client or an RPC re-send. If we find * one, just return nlm4_blocked. */ mtx_lock(&host->nh_lock); TAILQ_FOREACH(af, &host->nh_pending, af_link) { if (af->af_fl.l_start == fl.l_start && af->af_fl.l_len == fl.l_len && af->af_fl.l_pid == fl.l_pid && af->af_fl.l_type == fl.l_type) { break; } } if (!af) { cookie.ng_sysid = host->nh_sysid; cookie.ng_cookie = host->nh_grantcookie++; } mtx_unlock(&host->nh_lock); if (af) { CLNT_RELEASE(client); result->stat.stat = nlm4_blocked; goto out; } af = malloc(sizeof(struct nlm_async_lock), M_NLM, M_WAITOK|M_ZERO); TASK_INIT(&af->af_task, 0, nlm_lock_callback, af); af->af_vp = vs.vs_vp; af->af_fl = fl; af->af_host = host; af->af_rpc = client; /* * We use M_RPC here so that we can xdr_free the thing * later. */ nlm_make_netobj(&af->af_granted.cookie, (caddr_t)&cookie, sizeof(cookie), M_RPC); af->af_granted.exclusive = argp->exclusive; af->af_granted.alock.caller_name = strdup(argp->alock.caller_name, M_RPC); nlm_copy_netobj(&af->af_granted.alock.fh, &argp->alock.fh, M_RPC); nlm_copy_netobj(&af->af_granted.alock.oh, &argp->alock.oh, M_RPC); af->af_granted.alock.svid = argp->alock.svid; af->af_granted.alock.l_offset = argp->alock.l_offset; af->af_granted.alock.l_len = argp->alock.l_len; /* * Put the entry on the pending list before calling * VOP_ADVLOCKASYNC. We do this in case the lock * request was blocked (returning EINPROGRESS) but * then granted before we manage to run again. 
The * client may receive the granted message before we * send our blocked reply but thats their problem. */ mtx_lock(&host->nh_lock); TAILQ_INSERT_TAIL(&host->nh_pending, af, af_link); mtx_unlock(&host->nh_lock); error = VOP_ADVLOCKASYNC(vs.vs_vp, NULL, F_SETLK, &fl, F_REMOTE, &af->af_task, &af->af_cookie); /* * If the lock completed synchronously, just free the * tracking structure now. */ if (error != EINPROGRESS) { CLNT_RELEASE(af->af_rpc); mtx_lock(&host->nh_lock); TAILQ_REMOVE(&host->nh_pending, af, af_link); mtx_unlock(&host->nh_lock); xdr_free((xdrproc_t) xdr_nlm4_testargs, &af->af_granted); free(af, M_NLM); } else { NLM_DEBUG(2, "NLM: pending async lock %p for %s " "(sysid %d)\n", af, host->nh_caller_name, sysid); /* * Don't vrele the vnode just yet - this must * wait until either the async callback * happens or the lock is cancelled. */ vs.vs_vp = NULL; } } else { error = VOP_ADVLOCK(vs.vs_vp, NULL, F_SETLK, &fl, F_REMOTE); } if (error) { if (error == EINPROGRESS) { result->stat.stat = nlm4_blocked; } else if (error == EDEADLK) { result->stat.stat = nlm4_deadlck; } else if (error == EAGAIN) { result->stat.stat = nlm4_denied; } else { result->stat.stat = nlm4_failed; } } else { if (monitor) nlm_host_monitor(host, argp->state); result->stat.stat = nlm4_granted; } out: nlm_release_vfs_state(&vs); if (rpcp) *rpcp = nlm_host_get_rpc(host, TRUE); nlm_host_release(host); return (0); } int nlm_do_cancel(nlm4_cancargs *argp, nlm4_res *result, struct svc_req *rqstp, CLIENT **rpcp) { fhandle_t fh; struct vfs_state vs; struct nlm_host *host; int error, sysid; struct flock fl; struct nlm_async_lock *af; memset(result, 0, sizeof(*result)); memset(&vs, 0, sizeof(vs)); host = nlm_find_host_by_name(argp->alock.caller_name, svc_getrpccaller(rqstp), rqstp->rq_vers); if (!host) { result->stat.stat = nlm4_denied_nolocks; return (ENOMEM); } NLM_DEBUG(3, "nlm_do_cancel(): caller_name = %s (sysid = %d)\n", host->nh_caller_name, host->nh_sysid); nlm_check_expired_locks(host); sysid = host->nh_sysid; nlm_convert_to_fhandle_t(&fh, &argp->alock.fh); nlm_copy_netobj(&result->cookie, &argp->cookie, M_RPC); if (time_uptime < nlm_grace_threshold) { result->stat.stat = nlm4_denied_grace_period; goto out; } error = nlm_get_vfs_state(host, rqstp, &fh, &vs, (accmode_t)0); if (error) { result->stat.stat = nlm_convert_error(error); goto out; } fl.l_start = argp->alock.l_offset; fl.l_len = argp->alock.l_len; fl.l_pid = argp->alock.svid; fl.l_sysid = sysid; fl.l_whence = SEEK_SET; if (argp->exclusive) fl.l_type = F_WRLCK; else fl.l_type = F_RDLCK; /* * First we need to try and find the async lock request - if * there isn't one, we give up and return nlm4_denied. 
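 *
 * (Editorial note: the pending entry is matched on the same four
 * flock fields - l_start, l_len, l_pid and l_type - that
 * nlm_do_lock() uses for its duplicate-request check when the entry
 * is first queued on nh_pending.)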
*/ mtx_lock(&host->nh_lock); TAILQ_FOREACH(af, &host->nh_pending, af_link) { if (af->af_fl.l_start == fl.l_start && af->af_fl.l_len == fl.l_len && af->af_fl.l_pid == fl.l_pid && af->af_fl.l_type == fl.l_type) { break; } } if (!af) { mtx_unlock(&host->nh_lock); result->stat.stat = nlm4_denied; goto out; } error = nlm_cancel_async_lock(af); if (error) { result->stat.stat = nlm4_denied; } else { result->stat.stat = nlm4_granted; } mtx_unlock(&host->nh_lock); out: nlm_release_vfs_state(&vs); if (rpcp) *rpcp = nlm_host_get_rpc(host, TRUE); nlm_host_release(host); return (0); } int nlm_do_unlock(nlm4_unlockargs *argp, nlm4_res *result, struct svc_req *rqstp, CLIENT **rpcp) { fhandle_t fh; struct vfs_state vs; struct nlm_host *host; int error, sysid; struct flock fl; memset(result, 0, sizeof(*result)); memset(&vs, 0, sizeof(vs)); host = nlm_find_host_by_name(argp->alock.caller_name, svc_getrpccaller(rqstp), rqstp->rq_vers); if (!host) { result->stat.stat = nlm4_denied_nolocks; return (ENOMEM); } NLM_DEBUG(3, "nlm_do_unlock(): caller_name = %s (sysid = %d)\n", host->nh_caller_name, host->nh_sysid); nlm_check_expired_locks(host); sysid = host->nh_sysid; nlm_convert_to_fhandle_t(&fh, &argp->alock.fh); nlm_copy_netobj(&result->cookie, &argp->cookie, M_RPC); if (time_uptime < nlm_grace_threshold) { result->stat.stat = nlm4_denied_grace_period; goto out; } error = nlm_get_vfs_state(host, rqstp, &fh, &vs, (accmode_t)0); if (error) { result->stat.stat = nlm_convert_error(error); goto out; } fl.l_start = argp->alock.l_offset; fl.l_len = argp->alock.l_len; fl.l_pid = argp->alock.svid; fl.l_sysid = sysid; fl.l_whence = SEEK_SET; fl.l_type = F_UNLCK; error = VOP_ADVLOCK(vs.vs_vp, NULL, F_UNLCK, &fl, F_REMOTE); /* * Ignore the error - there is no result code for failure, * only for grace period. 
*/ result->stat.stat = nlm4_granted; out: nlm_release_vfs_state(&vs); if (rpcp) *rpcp = nlm_host_get_rpc(host, TRUE); nlm_host_release(host); return (0); } int nlm_do_granted(nlm4_testargs *argp, nlm4_res *result, struct svc_req *rqstp, CLIENT **rpcp) { struct nlm_host *host; struct nlm_waiting_lock *nw; memset(result, 0, sizeof(*result)); host = nlm_find_host_by_addr(svc_getrpccaller(rqstp), rqstp->rq_vers); if (!host) { result->stat.stat = nlm4_denied_nolocks; return (ENOMEM); } nlm_copy_netobj(&result->cookie, &argp->cookie, M_RPC); result->stat.stat = nlm4_denied; KFAIL_POINT_CODE(DEBUG_FP, nlm_deny_grant, goto out); mtx_lock(&nlm_global_lock); TAILQ_FOREACH(nw, &nlm_waiting_locks, nw_link) { if (!nw->nw_waiting) continue; if (argp->alock.svid == nw->nw_lock.svid && argp->alock.l_offset == nw->nw_lock.l_offset && argp->alock.l_len == nw->nw_lock.l_len && argp->alock.fh.n_len == nw->nw_lock.fh.n_len && !memcmp(argp->alock.fh.n_bytes, nw->nw_lock.fh.n_bytes, nw->nw_lock.fh.n_len)) { nw->nw_waiting = FALSE; wakeup(nw); result->stat.stat = nlm4_granted; break; } } mtx_unlock(&nlm_global_lock); out: if (rpcp) *rpcp = nlm_host_get_rpc(host, TRUE); nlm_host_release(host); return (0); } void nlm_do_granted_res(nlm4_res *argp, struct svc_req *rqstp) { struct nlm_host *host = NULL; struct nlm_async_lock *af = NULL; int error; if (argp->cookie.n_len != sizeof(struct nlm_grantcookie)) { NLM_DEBUG(1, "NLM: bogus grant cookie"); goto out; } host = nlm_find_host_by_sysid(ng_sysid(&argp->cookie)); if (!host) { NLM_DEBUG(1, "NLM: Unknown host rejected our grant"); goto out; } mtx_lock(&host->nh_lock); TAILQ_FOREACH(af, &host->nh_granted, af_link) if (ng_cookie(&argp->cookie) == ng_cookie(&af->af_granted.cookie)) break; if (af) TAILQ_REMOVE(&host->nh_granted, af, af_link); mtx_unlock(&host->nh_lock); if (!af) { NLM_DEBUG(1, "NLM: host %s (sysid %d) replied to our grant " "with unrecognized cookie %d:%d", host->nh_caller_name, host->nh_sysid, ng_sysid(&argp->cookie), ng_cookie(&argp->cookie)); goto out; } if (argp->stat.stat != nlm4_granted) { af->af_fl.l_type = F_UNLCK; error = VOP_ADVLOCK(af->af_vp, NULL, F_UNLCK, &af->af_fl, F_REMOTE); if (error) { NLM_DEBUG(1, "NLM: host %s (sysid %d) rejected our grant " "and we failed to unlock (%d)", host->nh_caller_name, host->nh_sysid, error); goto out; } NLM_DEBUG(5, "NLM: async lock %p rejected by host %s (sysid %d)", af, host->nh_caller_name, host->nh_sysid); } else { NLM_DEBUG(5, "NLM: async lock %p accepted by host %s (sysid %d)", af, host->nh_caller_name, host->nh_sysid); } out: if (af) nlm_free_async_lock(af); if (host) nlm_host_release(host); } void nlm_do_free_all(nlm4_notify *argp) { struct nlm_host *host, *thost; TAILQ_FOREACH_SAFE(host, &nlm_hosts, nh_link, thost) { if (!strcmp(host->nh_caller_name, argp->name)) nlm_host_notify(host, argp->state); } } /* * Kernel module glue */ static int nfslockd_modevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: return (0); case MOD_UNLOAD: /* The NLM module cannot be safely unloaded. */ /* FALLTHROUGH */ default: return (EOPNOTSUPP); } } static moduledata_t nfslockd_mod = { "nfslockd", nfslockd_modevent, NULL, }; DECLARE_MODULE(nfslockd, nfslockd_mod, SI_SUB_VFS, SI_ORDER_ANY); /* So that loader and kldload(2) can find us, wherever we are.. */ MODULE_DEPEND(nfslockd, krpc, 1, 1, 1); MODULE_DEPEND(nfslockd, nfslock, 1, 1, 1); MODULE_VERSION(nfslockd, 1);
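
/*
 * (Editorial usage note, not part of the original source: the module
 * is normally loaded with "kldload nfslockd" or pulled in via its
 * dependencies, and the nlm_syscall() entry point above is expected
 * to be driven by rpc.lockd(8), which supplies the debug level, the
 * grace period and the netid/uaddr pairs for the transports it has
 * bound.  This is my reading of the interface, not something stated
 * in this file.)
 */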