Index: head/lib/libnetbsd/sys/time.h
===================================================================
--- head/lib/libnetbsd/sys/time.h	(revision 336913)
+++ head/lib/libnetbsd/sys/time.h	(nonexistent)
@@ -1,65 +0,0 @@
-/* $FreeBSD$ */
-
-/*
- * Copyright (c) 1982, 1986, 1993
- *	The Regents of the University of California.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 3. Neither the name of the University nor the names of its contributors
- *    may be used to endorse or promote products derived from this software
- *    without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- *	@(#)time.h	8.5 (Berkeley) 5/4/95
- */
-
-#ifndef _LIBNETBSD_SYS_TIME_H_
-#define	_LIBNETBSD_SYS_TIME_H_
-
-#include_next <sys/time.h>
-
-/* Operations on timespecs. */
-#define	timespecclear(tsp)	(tsp)->tv_sec = (time_t)((tsp)->tv_nsec = 0L)
-#define	timespecisset(tsp)	((tsp)->tv_sec || (tsp)->tv_nsec)
-#define	timespeccmp(tsp, usp, cmp)					\
-	(((tsp)->tv_sec == (usp)->tv_sec) ?				\
-	    ((tsp)->tv_nsec cmp (usp)->tv_nsec) :			\
-	    ((tsp)->tv_sec cmp (usp)->tv_sec))
-#define	timespecadd(tsp, usp, vsp)					\
-	do {								\
-		(vsp)->tv_sec = (tsp)->tv_sec + (usp)->tv_sec;		\
-		(vsp)->tv_nsec = (tsp)->tv_nsec + (usp)->tv_nsec;	\
-		if ((vsp)->tv_nsec >= 1000000000L) {			\
-			(vsp)->tv_sec++;				\
-			(vsp)->tv_nsec -= 1000000000L;			\
-		}							\
-	} while (/* CONSTCOND */ 0)
-#define	timespecsub(tsp, usp, vsp)					\
-	do {								\
-		(vsp)->tv_sec = (tsp)->tv_sec - (usp)->tv_sec;		\
-		(vsp)->tv_nsec = (tsp)->tv_nsec - (usp)->tv_nsec;	\
-		if ((vsp)->tv_nsec < 0) {				\
-			(vsp)->tv_sec--;				\
-			(vsp)->tv_nsec += 1000000000L;			\
-		}							\
-	} while (/* CONSTCOND */ 0)
-
-#endif

Property changes on: head/lib/libnetbsd/sys/time.h
___________________________________________________________________
Deleted: svn:eol-style
## -1 +0,0 ##
-native
\ No newline at end of property
Deleted: svn:keywords
## -1 +0,0 ##
-FreeBSD=%H
\ No newline at end of property
Deleted: svn:mime-type
## -1 +0,0 ##
-text/plain
\ No newline at end of property
Index: head/sbin/fsck_ffs/fsutil.c
===================================================================
--- head/sbin/fsck_ffs/fsutil.c	(revision 336913)
+++ head/sbin/fsck_ffs/fsutil.c	(revision 336914)
@@ -1,1059 +1,1038 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1980, 1986, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #if 0
 #ifndef lint
 static const char sccsid[] = "@(#)utilities.c	8.6 (Berkeley) 5/19/95";
 #endif /* not lint */
 #endif
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/time.h>
 #include <sys/types.h>
 #include <sys/sysctl.h>
 #include <sys/disk.h>
 #include <sys/disklabel.h>
 #include <sys/ioctl.h>
 #include <sys/stat.h>
 
 #include <ufs/ufs/dinode.h>
 #include <ufs/ufs/dir.h>
 #include <ufs/ffs/fs.h>
 
 #include <err.h>
 #include <errno.h>
 #include <string.h>
 #include <ctype.h>
 #include <fstab.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <time.h>
 #include <unistd.h>
 #include <libufs.h>
 
 #include "fsck.h"
 
 static void slowio_start(void);
 static void slowio_end(void);
 static void printIOstats(void);
 
 static long diskreads, totaldiskreads, totalreads; /* Disk cache statistics */
 static struct timespec startpass, finishpass;
 struct timeval slowio_starttime;
 int slowio_delay_usec = 10000;	/* Initial IO delay for background fsck */
 int slowio_pollcnt;
 static struct bufarea cgblk;	/* backup buffer for cylinder group blocks */
 static TAILQ_HEAD(buflist, bufarea) bufhead;	/* head of buffer cache list */
 static int numbufs;				/* size of buffer cache */
 static char *buftype[BT_NUMBUFTYPES] = BT_NAMES;
 static struct bufarea *cgbufs;	/* header for cylinder group cache */
 static int flushtries;		/* number of tries to reclaim memory */
 
 void
 fsutilinit(void)
 {
 	diskreads = totaldiskreads = totalreads = 0;
 	bzero(&startpass, sizeof(struct timespec));
 	bzero(&finishpass, sizeof(struct timespec));
 	bzero(&slowio_starttime, sizeof(struct timeval));
 	slowio_delay_usec = 10000;
 	slowio_pollcnt = 0;
 	bzero(&cgblk, sizeof(struct bufarea));
 	TAILQ_INIT(&bufhead);
 	numbufs = 0;
 	/* buftype ? */
 	cgbufs = NULL;
 	flushtries = 0;
 }
 
 int
 ftypeok(union dinode *dp)
 {
 	switch (DIP(dp, di_mode) & IFMT) {
 
 	case IFDIR:
 	case IFREG:
 	case IFBLK:
 	case IFCHR:
 	case IFLNK:
 	case IFSOCK:
 	case IFIFO:
 		return (1);
 
 	default:
 		if (debug)
 			printf("bad file type 0%o\n", DIP(dp, di_mode));
 		return (0);
 	}
 }
 
 int
 reply(const char *question)
 {
 	int persevere;
-	char c;
+	int c;
 
 	if (preen)
 		pfatal("INTERNAL ERROR: GOT TO reply()");
 	persevere = !strcmp(question, "CONTINUE");
 	printf("\n");
 	if (!persevere && (nflag || (fswritefd < 0 && bkgrdflag == 0))) {
 		printf("%s? no\n\n", question);
 		resolved = 0;
 		return (0);
 	}
 	if (yflag || (persevere && nflag)) {
 		printf("%s? yes\n\n", question);
 		return (1);
 	}
 	do	{
 		printf("%s? [yn] ", question);
 		(void) fflush(stdout);
 		c = getc(stdin);
 		while (c != '\n' && getc(stdin) != '\n') {
-			if (feof(stdin)) {
+			if (feof(stdin) || ferror(stdin)) {
 				resolved = 0;
 				return (0);
 			}
 		}
 	} while (c != 'y' && c != 'Y' && c != 'n' && c != 'N');
 	printf("\n");
 	if (c == 'y' || c == 'Y')
 		return (1);
 	resolved = 0;
 	return (0);
 }
 
 /*
  * Look up state information for an inode.
  */
 struct inostat *
 inoinfo(ino_t inum)
 {
 	static struct inostat unallocated = { USTATE, 0, 0 };
 	struct inostatlist *ilp;
 	int iloff;
 
 	if (inum > maxino)
 		errx(EEXIT, "inoinfo: inumber %ju out of range",
 		    (uintmax_t)inum);
 	ilp = &inostathead[inum / sblock.fs_ipg];
 	iloff = inum % sblock.fs_ipg;
 	if (iloff >= ilp->il_numalloced)
 		return (&unallocated);
 	return (&ilp->il_stat[iloff]);
 }
 
 /*
  * Malloc buffers and set up cache.
  */
 void
 bufinit(void)
 {
 	struct bufarea *bp;
 	long bufcnt, i;
 	char *bufp;
 
 	pbp = pdirbp = (struct bufarea *)0;
 	bufp = Malloc((unsigned int)sblock.fs_bsize);
 	if (bufp == NULL)
 		errx(EEXIT, "cannot allocate buffer pool");
 	cgblk.b_un.b_buf = bufp;
 	initbarea(&cgblk, BT_CYLGRP);
 	TAILQ_INIT(&bufhead);
 	bufcnt = MAXBUFS;
 	if (bufcnt < MINBUFS)
 		bufcnt = MINBUFS;
 	for (i = 0; i < bufcnt; i++) {
 		bp = (struct bufarea *)Malloc(sizeof(struct bufarea));
 		bufp = Malloc((unsigned int)sblock.fs_bsize);
 		if (bp == NULL || bufp == NULL) {
 			if (i >= MINBUFS)
 				break;
 			errx(EEXIT, "cannot allocate buffer pool");
 		}
 		bp->b_un.b_buf = bufp;
 		TAILQ_INSERT_HEAD(&bufhead, bp, b_list);
 		initbarea(bp, BT_UNKNOWN);
 	}
 	numbufs = i;	/* save number of buffers */
 	for (i = 0; i < BT_NUMBUFTYPES; i++) {
 		readtime[i].tv_sec = totalreadtime[i].tv_sec = 0;
 		readtime[i].tv_nsec = totalreadtime[i].tv_nsec = 0;
 		readcnt[i] = totalreadcnt[i] = 0;
 	}
 }
 
 /*
  * Manage cylinder group buffers.
  */
 static struct bufarea *cgbufs;	/* header for cylinder group cache */
 static int flushtries;		/* number of tries to reclaim memory */
 
 struct bufarea *
 cglookup(int cg)
 {
 	struct bufarea *cgbp;
 	struct cg *cgp;
 
 	if (cgbufs == NULL) {
 		cgbufs = calloc(sblock.fs_ncg, sizeof(struct bufarea));
 		if (cgbufs == NULL)
 			errx(EEXIT, "cannot allocate cylinder group buffers");
 	}
 	cgbp = &cgbufs[cg];
 	if (cgbp->b_un.b_cg != NULL)
 		return (cgbp);
 	cgp = NULL;
 	if (flushtries == 0)
 		cgp = malloc((unsigned int)sblock.fs_cgsize);
 	if (cgp == NULL) {
 		getblk(&cgblk, cgtod(&sblock, cg), sblock.fs_cgsize);
 		return (&cgblk);
 	}
 	cgbp->b_un.b_cg = cgp;
 	initbarea(cgbp, BT_CYLGRP);
 	getblk(cgbp, cgtod(&sblock, cg), sblock.fs_cgsize);
 	return (cgbp);
 }
 
 /*
  * Attempt to flush a cylinder group cache entry.
  * Return whether the flush was successful.
  */
 int
 flushentry(void)
 {
 	struct bufarea *cgbp;
 
 	if (flushtries == sblock.fs_ncg || cgbufs == NULL)
 		return (0);
 	cgbp = &cgbufs[flushtries++];
 	if (cgbp->b_un.b_cg == NULL)
 		return (0);
 	flush(fswritefd, cgbp);
 	free(cgbp->b_un.b_buf);
 	cgbp->b_un.b_buf = NULL;
 	return (1);
 }
 
 /*
  * Manage a cache of directory blocks.
  */
 struct bufarea *
 getdatablk(ufs2_daddr_t blkno, long size, int type)
 {
 	struct bufarea *bp;
 
 	TAILQ_FOREACH(bp, &bufhead, b_list)
 		if (bp->b_bno == fsbtodb(&sblock, blkno))
 			goto foundit;
 	TAILQ_FOREACH_REVERSE(bp, &bufhead, buflist, b_list)
 		if ((bp->b_flags & B_INUSE) == 0)
 			break;
 	if (bp == NULL)
 		errx(EEXIT, "deadlocked buffer pool");
 	bp->b_type = type;
 	getblk(bp, blkno, size);
 	/* fall through */
 foundit:
 	if (debug && bp->b_type != type)
 		printf("Buffer type changed from %s to %s\n",
 		    buftype[bp->b_type], buftype[type]);
 	TAILQ_REMOVE(&bufhead, bp, b_list);
 	TAILQ_INSERT_HEAD(&bufhead, bp, b_list);
 	bp->b_flags |= B_INUSE;
 	return (bp);
 }
 
-/*
- * Timespec operations (from <sys/time.h>).
- */
-#define	timespecsub(vvp, uvp)						\
-	do {								\
-		(vvp)->tv_sec -= (uvp)->tv_sec;				\
-		(vvp)->tv_nsec -= (uvp)->tv_nsec;			\
-		if ((vvp)->tv_nsec < 0) {				\
-			(vvp)->tv_sec--;				\
-			(vvp)->tv_nsec += 1000000000;			\
-		}							\
-	} while (0)
-#define	timespecadd(vvp, uvp)						\
-	do {								\
-		(vvp)->tv_sec += (uvp)->tv_sec;				\
-		(vvp)->tv_nsec += (uvp)->tv_nsec;			\
-		if ((vvp)->tv_nsec >= 1000000000) {			\
-			(vvp)->tv_sec++;				\
-			(vvp)->tv_nsec -= 1000000000;			\
-		}							\
-	} while (0)
-
 void
 getblk(struct bufarea *bp, ufs2_daddr_t blk, long size)
 {
 	ufs2_daddr_t dblk;
 	struct timespec start, finish;
 
 	dblk = fsbtodb(&sblock, blk);
 	if (bp->b_bno == dblk) {
 		totalreads++;
 	} else {
 		flush(fswritefd, bp);
 		if (debug) {
 			readcnt[bp->b_type]++;
 			clock_gettime(CLOCK_REALTIME_PRECISE, &start);
 		}
 		bp->b_errs = blread(fsreadfd, bp->b_un.b_buf, dblk, size);
 		if (debug) {
 			clock_gettime(CLOCK_REALTIME_PRECISE, &finish);
-			timespecsub(&finish, &start);
-			timespecadd(&readtime[bp->b_type], &finish);
+			timespecsub(&finish, &start, &finish);
+			timespecadd(&readtime[bp->b_type], &finish,
+			    &readtime[bp->b_type]);
 		}
 		bp->b_bno = dblk;
 		bp->b_size = size;
 	}
 }
 
 void
 flush(int fd, struct bufarea *bp)
 {
 
 	if (!bp->b_dirty)
 		return;
 	bp->b_dirty = 0;
 	if (fswritefd < 0) {
 		pfatal("WRITING IN READ_ONLY MODE.\n");
 		return;
 	}
 	if (bp->b_errs != 0)
 		pfatal("WRITING %sZERO'ED BLOCK %lld TO DISK\n",
 		    (bp->b_errs == bp->b_size / dev_bsize) ? "" : "PARTIALLY ",
 		    (long long)bp->b_bno);
 	bp->b_errs = 0;
 	/*
 	 * Write using the appropriate function.
 	 */
 	switch (bp->b_type) {
 	case BT_SUPERBLK:
 		if (bp != &sblk)
 			pfatal("BUFFER %p DOES NOT MATCH SBLK %p\n",
 			    bp, &sblk);
 		if (sbput(fd, (struct fs *)bp->b_un.b_buf, 0) == 0)
 			fsmodified = 1;
 		break;
 	case BT_CYLGRP:
 		if (cgput(&disk, (struct cg *)bp->b_un.b_buf) == 0)
 			fsmodified = 1;
 		break;
 	default:
 		blwrite(fd, bp->b_un.b_buf, bp->b_bno, bp->b_size);
 		break;
 	}
 }
 
 void
 rwerror(const char *mesg, ufs2_daddr_t blk)
 {
 
 	if (bkgrdcheck)
 		exit(EEXIT);
 	if (preen == 0)
 		printf("\n");
-	pfatal("CANNOT %s: %ld", mesg, (long)blk);
+	pfatal("CANNOT %s: %jd", mesg, (intmax_t)blk);
 	if (reply("CONTINUE") == 0)
 		exit(EEXIT);
 }
 
 void
 ckfini(int markclean)
 {
 	struct bufarea *bp, *nbp;
 	int ofsmodified, cnt;
 
 	if (bkgrdflag) {
 		unlink(snapname);
 		if ((!(sblock.fs_flags & FS_UNCLEAN)) != markclean) {
 			cmd.value = FS_UNCLEAN;
 			cmd.size = markclean ? -1 : 1;
 			if (sysctlbyname("vfs.ffs.setflags", 0, 0,
 			    &cmd, sizeof cmd) == -1)
 				rwerror("SET FILE SYSTEM FLAGS", FS_UNCLEAN);
 			if (!preen) {
 				printf("\n***** FILE SYSTEM MARKED %s *****\n",
 				    markclean ? "CLEAN" : "DIRTY");
 				if (!markclean)
 					rerun = 1;
 			}
 		} else if (!preen && !markclean) {
 			printf("\n***** FILE SYSTEM STILL DIRTY *****\n");
 			rerun = 1;
 		}
 	}
 	if (debug && totalreads > 0)
 		printf("cache with %d buffers missed %ld of %ld (%d%%)\n",
 		    numbufs, totaldiskreads, totalreads,
 		    (int)(totaldiskreads * 100 / totalreads));
 	if (fswritefd < 0) {
 		(void)close(fsreadfd);
 		return;
 	}
 	flush(fswritefd, &sblk);
 	if (havesb && cursnapshot == 0 && sblock.fs_magic == FS_UFS2_MAGIC &&
 	    sblk.b_bno != sblock.fs_sblockloc / dev_bsize &&
 	    !preen && reply("UPDATE STANDARD SUPERBLOCK")) {
 		/* Change the write destination to standard superblock */
 		sblock.fs_sblockactualloc = sblock.fs_sblockloc;
 		sblk.b_bno = sblock.fs_sblockloc / dev_bsize;
 		sbdirty();
 		flush(fswritefd, &sblk);
 	}
 	flush(fswritefd, &cgblk);
 	free(cgblk.b_un.b_buf);
 	cnt = 0;
 	TAILQ_FOREACH_REVERSE_SAFE(bp, &bufhead, buflist, b_list, nbp) {
 		TAILQ_REMOVE(&bufhead, bp, b_list);
 		cnt++;
 		flush(fswritefd, bp);
 		free(bp->b_un.b_buf);
 		free((char *)bp);
 	}
 	if (numbufs != cnt)
 		errx(EEXIT, "panic: lost %d buffers", numbufs - cnt);
 	if (cgbufs != NULL) {
 		for (cnt = 0; cnt < sblock.fs_ncg; cnt++) {
 			if (cgbufs[cnt].b_un.b_cg == NULL)
 				continue;
 			flush(fswritefd, &cgbufs[cnt]);
 			free(cgbufs[cnt].b_un.b_cg);
 		}
 		free(cgbufs);
 	}
 	pbp = pdirbp = (struct bufarea *)0;
 	if (cursnapshot == 0 && sblock.fs_clean != markclean) {
 		if ((sblock.fs_clean = markclean) != 0) {
 			sblock.fs_flags &= ~(FS_UNCLEAN | FS_NEEDSFSCK);
 			sblock.fs_pendingblocks = 0;
 			sblock.fs_pendinginodes = 0;
 		}
 		sbdirty();
 		ofsmodified = fsmodified;
 		flush(fswritefd, &sblk);
 		fsmodified = ofsmodified;
 		if (!preen) {
 			printf("\n***** FILE SYSTEM MARKED %s *****\n",
 			    markclean ? "CLEAN" : "DIRTY");
 			if (!markclean)
 				rerun = 1;
 		}
 	} else if (!preen) {
 		if (markclean) {
 			printf("\n***** FILE SYSTEM IS CLEAN *****\n");
 		} else {
 			printf("\n***** FILE SYSTEM STILL DIRTY *****\n");
 			rerun = 1;
 		}
 	}
 	(void)close(fsreadfd);
 	(void)close(fswritefd);
 }
 
 /*
  * Print out I/O statistics.
  */
 void
 IOstats(char *what)
 {
 	int i;
 
 	if (debug == 0)
 		return;
 	if (diskreads == 0) {
 		printf("%s: no I/O\n\n", what);
 		return;
 	}
 	if (startpass.tv_sec == 0)
 		startpass = startprog;
 	printf("%s: I/O statistics\n", what);
 	printIOstats();
 	totaldiskreads += diskreads;
 	diskreads = 0;
 	for (i = 0; i < BT_NUMBUFTYPES; i++) {
-		timespecadd(&totalreadtime[i], &readtime[i]);
+		timespecadd(&totalreadtime[i], &readtime[i], &totalreadtime[i]);
 		totalreadcnt[i] += readcnt[i];
 		readtime[i].tv_sec = readtime[i].tv_nsec = 0;
 		readcnt[i] = 0;
 	}
 	clock_gettime(CLOCK_REALTIME_PRECISE, &startpass);
 }
 
 void
 finalIOstats(void)
 {
 	int i;
 
 	if (debug == 0)
 		return;
 	printf("Final I/O statistics\n");
 	totaldiskreads += diskreads;
 	diskreads = totaldiskreads;
 	startpass = startprog;
 	for (i = 0; i < BT_NUMBUFTYPES; i++) {
-		timespecadd(&totalreadtime[i], &readtime[i]);
+		timespecadd(&totalreadtime[i], &readtime[i], &totalreadtime[i]);
 		totalreadcnt[i] += readcnt[i];
 		readtime[i] = totalreadtime[i];
 		readcnt[i] = totalreadcnt[i];
 	}
 	printIOstats();
 }
 
 static void printIOstats(void)
 {
 	long long msec, totalmsec;
 	int i;
 
 	clock_gettime(CLOCK_REALTIME_PRECISE, &finishpass);
-	timespecsub(&finishpass, &startpass);
+	timespecsub(&finishpass, &startpass, &finishpass);
 	printf("Running time: %jd.%03ld sec\n",
 		(intmax_t)finishpass.tv_sec, finishpass.tv_nsec / 1000000);
 	printf("buffer reads by type:\n");
 	for (totalmsec = 0, i = 0; i < BT_NUMBUFTYPES; i++)
 		totalmsec += readtime[i].tv_sec * 1000 +
 		    readtime[i].tv_nsec / 1000000;
 	if (totalmsec == 0)
 		totalmsec = 1;
 	for (i = 0; i < BT_NUMBUFTYPES; i++) {
 		if (readcnt[i] == 0)
 			continue;
 		msec =
 		    readtime[i].tv_sec * 1000 + readtime[i].tv_nsec / 1000000;
 		printf("%21s:%8ld %2ld.%ld%% %4jd.%03ld sec %2lld.%lld%%\n",
 		    buftype[i], readcnt[i], readcnt[i] * 100 / diskreads,
 		    (readcnt[i] * 1000 / diskreads) % 10,
 		    (intmax_t)readtime[i].tv_sec, readtime[i].tv_nsec / 1000000,
 		    msec * 100 / totalmsec, (msec * 1000 / totalmsec) % 10);
 	}
 	printf("\n");
 }
 
 int
 blread(int fd, char *buf, ufs2_daddr_t blk, long size)
 {
 	char *cp;
 	int i, errs;
 	off_t offset;
 
 	offset = blk;
 	offset *= dev_bsize;
 	if (bkgrdflag)
 		slowio_start();
 	totalreads++;
 	diskreads++;
 	if (lseek(fd, offset, 0) < 0)
 		rwerror("SEEK BLK", blk);
 	else if (read(fd, buf, (int)size) == size) {
 		if (bkgrdflag)
 			slowio_end();
 		return (0);
 	}
 
 	/*
 	 * This is handled specially here instead of in rwerror because
 	 * rwerror is used for all sorts of errors, not just true read/write
 	 * errors.  It should be refactored and fixed.
 	 */
 	if (surrender) {
-		pfatal("CANNOT READ_BLK: %ld", (long)blk);
+		pfatal("CANNOT READ_BLK: %jd", (intmax_t)blk);
 		errx(EEXIT, "ABORTING DUE TO READ ERRORS");
 	} else
 		rwerror("READ BLK", blk);
 
 	if (lseek(fd, offset, 0) < 0)
 		rwerror("SEEK BLK", blk);
 	errs = 0;
 	memset(buf, 0, (size_t)size);
 	printf("THE FOLLOWING DISK SECTORS COULD NOT BE READ:");
 	for (cp = buf, i = 0; i < size; i += secsize, cp += secsize) {
 		if (read(fd, cp, (int)secsize) != secsize) {
 			(void)lseek(fd, offset + i + secsize, 0);
 			if (secsize != dev_bsize && dev_bsize != 1)
 				printf(" %jd (%jd),",
 				    (intmax_t)(blk * dev_bsize + i) / secsize,
 				    (intmax_t)blk + i / dev_bsize);
 			else
 				printf(" %jd,", (intmax_t)blk + i / dev_bsize);
 			errs++;
 		}
 	}
 	printf("\n");
 	if (errs)
 		resolved = 0;
 	return (errs);
 }
 
 void
 blwrite(int fd, char *buf, ufs2_daddr_t blk, ssize_t size)
 {
 	int i;
 	char *cp;
 	off_t offset;
 
 	if (fd < 0)
 		return;
 	offset = blk;
 	offset *= dev_bsize;
 	if (lseek(fd, offset, 0) < 0)
 		rwerror("SEEK BLK", blk);
 	else if (write(fd, buf, size) == size) {
 		fsmodified = 1;
 		return;
 	}
 	resolved = 0;
 	rwerror("WRITE BLK", blk);
 	if (lseek(fd, offset, 0) < 0)
 		rwerror("SEEK BLK", blk);
 	printf("THE FOLLOWING SECTORS COULD NOT BE WRITTEN:");
 	for (cp = buf, i = 0; i < size; i += dev_bsize, cp += dev_bsize)
 		if (write(fd, cp, dev_bsize) != dev_bsize) {
 			(void)lseek(fd, offset + i + dev_bsize, 0);
 			printf(" %jd,", (intmax_t)blk + i / dev_bsize);
 		}
 	printf("\n");
 	return;
 }
 
 void
 blerase(int fd, ufs2_daddr_t blk, long size)
 {
 	off_t ioarg[2];
 
 	if (fd < 0)
 		return;
 	ioarg[0] = blk * dev_bsize;
 	ioarg[1] = size;
 	ioctl(fd, DIOCGDELETE, ioarg);
 	/* we don't really care if we succeed or not */
 	return;
 }
 
 /*
  * Fill a contiguous region with all-zeroes.  Note ZEROBUFSIZE is by
  * definition a multiple of dev_bsize.
  */
 void
 blzero(int fd, ufs2_daddr_t blk, long size)
 {
 	static char *zero;
 	off_t offset, len;
 
 	if (fd < 0)
 		return;
 	if (zero == NULL) {
 		zero = calloc(ZEROBUFSIZE, 1);
 		if (zero == NULL)
 			errx(EEXIT, "cannot allocate buffer pool");
 	}
 	offset = blk * dev_bsize;
 	if (lseek(fd, offset, 0) < 0)
 		rwerror("SEEK BLK", blk);
 	while (size > 0) {
 		len = MIN(ZEROBUFSIZE, size);
 		if (write(fd, zero, len) != len)
 			rwerror("WRITE BLK", blk);
 		blk += len / dev_bsize;
 		size -= len;
 	}
 }
 
 /*
  * Verify cylinder group's magic number and other parameters.  If the
  * test fails, offer an option to rebuild the whole cylinder group.
  */
 int
 check_cgmagic(int cg, struct bufarea *cgbp)
 {
 	struct cg *cgp = cgbp->b_un.b_cg;
 
 	/*
 	 * Extended cylinder group checks.
 	 */
 	if (cg_chkmagic(cgp) &&
 	    ((sblock.fs_magic == FS_UFS1_MAGIC &&
 	      cgp->cg_old_niblk == sblock.fs_ipg &&
 	      cgp->cg_ndblk <= sblock.fs_fpg &&
 	      cgp->cg_old_ncyl <= sblock.fs_old_cpg) ||
 	     (sblock.fs_magic == FS_UFS2_MAGIC &&
 	      cgp->cg_niblk == sblock.fs_ipg &&
 	      cgp->cg_ndblk <= sblock.fs_fpg &&
 	      cgp->cg_initediblk <= sblock.fs_ipg))) {
 		return (1);
 	}
 	pfatal("CYLINDER GROUP %d: BAD MAGIC NUMBER", cg);
 	if (!reply("REBUILD CYLINDER GROUP")) {
 		printf("YOU WILL NEED TO RERUN FSCK.\n");
 		rerun = 1;
 		return (1);
 	}
 	/*
 	 * Zero out the cylinder group and then initialize critical fields.
 	 * Bit maps and summaries will be recalculated by later passes.
 	 */
 	memset(cgp, 0, (size_t)sblock.fs_cgsize);
 	cgp->cg_magic = CG_MAGIC;
 	cgp->cg_cgx = cg;
 	cgp->cg_niblk = sblock.fs_ipg;
 	cgp->cg_initediblk = MIN(sblock.fs_ipg, 2 * INOPB(&sblock));
 	if (cgbase(&sblock, cg) + sblock.fs_fpg < sblock.fs_size)
 		cgp->cg_ndblk = sblock.fs_fpg;
 	else
 		cgp->cg_ndblk = sblock.fs_size - cgbase(&sblock, cg);
 	cgp->cg_iusedoff = &cgp->cg_space[0] - (u_char *)(&cgp->cg_firstfield);
 	if (sblock.fs_magic == FS_UFS1_MAGIC) {
 		cgp->cg_niblk = 0;
 		cgp->cg_initediblk = 0;
 		cgp->cg_old_ncyl = sblock.fs_old_cpg;
 		cgp->cg_old_niblk = sblock.fs_ipg;
 		cgp->cg_old_btotoff = cgp->cg_iusedoff;
 		cgp->cg_old_boff = cgp->cg_old_btotoff +
 		    sblock.fs_old_cpg * sizeof(int32_t);
 		cgp->cg_iusedoff = cgp->cg_old_boff +
 		    sblock.fs_old_cpg * sizeof(u_int16_t);
 	}
 	cgp->cg_freeoff = cgp->cg_iusedoff + howmany(sblock.fs_ipg, CHAR_BIT);
 	cgp->cg_nextfreeoff = cgp->cg_freeoff + howmany(sblock.fs_fpg,CHAR_BIT);
 	if (sblock.fs_contigsumsize > 0) {
 		cgp->cg_nclusterblks = cgp->cg_ndblk / sblock.fs_frag;
 		cgp->cg_clustersumoff =
 		    roundup(cgp->cg_nextfreeoff, sizeof(u_int32_t));
 		cgp->cg_clustersumoff -= sizeof(u_int32_t);
 		cgp->cg_clusteroff = cgp->cg_clustersumoff +
 		    (sblock.fs_contigsumsize + 1) * sizeof(u_int32_t);
 		cgp->cg_nextfreeoff = cgp->cg_clusteroff +
 		    howmany(fragstoblks(&sblock, sblock.fs_fpg), CHAR_BIT);
 	}
 	dirty(cgbp);
 	return (0);
 }
 
 /*
  * allocate a data block with the specified number of fragments
  */
 ufs2_daddr_t
 allocblk(long frags)
 {
 	int i, j, k, cg, baseblk;
 	struct bufarea *cgbp;
 	struct cg *cgp;
 
 	if (frags <= 0 || frags > sblock.fs_frag)
 		return (0);
 	for (i = 0; i < maxfsblock - sblock.fs_frag; i += sblock.fs_frag) {
 		for (j = 0; j <= sblock.fs_frag - frags; j++) {
 			if (testbmap(i + j))
 				continue;
 			for (k = 1; k < frags; k++)
 				if (testbmap(i + j + k))
 					break;
 			if (k < frags) {
 				j += k;
 				continue;
 			}
 			cg = dtog(&sblock, i + j);
 			cgbp = cglookup(cg);
 			cgp = cgbp->b_un.b_cg;
 			if (!check_cgmagic(cg, cgbp))
 				return (0);
 			baseblk = dtogd(&sblock, i + j);
 			for (k = 0; k < frags; k++) {
 				setbmap(i + j + k);
 				clrbit(cg_blksfree(cgp), baseblk + k);
 			}
 			n_blks += frags;
 			if (frags == sblock.fs_frag)
 				cgp->cg_cs.cs_nbfree--;
 			else
 				cgp->cg_cs.cs_nffree -= frags;
 			dirty(cgbp);
 			return (i + j);
 		}
 	}
 	return (0);
 }
 
 /*
  * Free a previously allocated block
  */
 void
 freeblk(ufs2_daddr_t blkno, long frags)
 {
 	struct inodesc idesc;
 
 	idesc.id_blkno = blkno;
 	idesc.id_numfrags = frags;
 	(void)pass4check(&idesc);
 }
 
 /* Slow down IO so as to leave some disk bandwidth for other processes */
 void
 slowio_start()
 {
 
 	/* Delay one in every 8 operations */
 	slowio_pollcnt = (slowio_pollcnt + 1) & 7;
 	if (slowio_pollcnt == 0) {
 		gettimeofday(&slowio_starttime, NULL);
 	}
 }
 
 void
 slowio_end()
 {
 	struct timeval tv;
 	int delay_usec;
 
 	if (slowio_pollcnt != 0)
 		return;
 
 	/* Update the slowdown interval. */
 	gettimeofday(&tv, NULL);
 	delay_usec = (tv.tv_sec - slowio_starttime.tv_sec) * 1000000 +
 	    (tv.tv_usec - slowio_starttime.tv_usec);
 	if (delay_usec < 64)
 		delay_usec = 64;
 	if (delay_usec > 2500000)
 		delay_usec = 2500000;
 	slowio_delay_usec = (slowio_delay_usec * 63 + delay_usec) >> 6;
 	/* delay by 8 times the average IO delay */
 	if (slowio_delay_usec > 64)
 		usleep(slowio_delay_usec * 8);
 }
 
 /*
  * Find a pathname
  */
 void
 getpathname(char *namebuf, ino_t curdir, ino_t ino)
 {
 	int len;
 	char *cp;
 	struct inodesc idesc;
 	static int busy = 0;
 
 	if (curdir == ino && ino == UFS_ROOTINO) {
 		(void)strcpy(namebuf, "/");
 		return;
 	}
 	if (busy || !INO_IS_DVALID(curdir)) {
 		(void)strcpy(namebuf, "?");
 		return;
 	}
 	busy = 1;
 	memset(&idesc, 0, sizeof(struct inodesc));
 	idesc.id_type = DATA;
 	idesc.id_fix = IGNORE;
 	cp = &namebuf[MAXPATHLEN - 1];
 	*cp = '\0';
 	if (curdir != ino) {
 		idesc.id_parent = curdir;
 		goto namelookup;
 	}
 	while (ino != UFS_ROOTINO) {
 		idesc.id_number = ino;
 		idesc.id_func = findino;
 		idesc.id_name = strdup("..");
 		if ((ckinode(ginode(ino), &idesc) & FOUND) == 0)
 			break;
 	namelookup:
 		idesc.id_number = idesc.id_parent;
 		idesc.id_parent = ino;
 		idesc.id_func = findname;
 		idesc.id_name = namebuf;
 		if ((ckinode(ginode(idesc.id_number), &idesc)&FOUND) == 0)
 			break;
 		len = strlen(namebuf);
 		cp -= len;
 		memmove(cp, namebuf, (size_t)len);
 		*--cp = '/';
 		if (cp < &namebuf[UFS_MAXNAMLEN])
 			break;
 		ino = idesc.id_number;
 	}
 	busy = 0;
 	if (ino != UFS_ROOTINO)
 		*--cp = '?';
 	memmove(namebuf, cp, (size_t)(&namebuf[MAXPATHLEN] - cp));
 }
 
 void
 catch(int sig __unused)
 {
 
 	ckfini(0);
 	exit(12);
 }
 
 /*
  * When preening, allow a single quit to signal
  * a special exit after file system checks complete
  * so that reboot sequence may be interrupted.
  */
 void
 catchquit(int sig __unused)
 {
 	printf("returning to single-user after file system check\n");
 	returntosingle = 1;
 	(void)signal(SIGQUIT, SIG_DFL);
 }
 
 /*
  * determine whether an inode should be fixed.
  */
 int
 dofix(struct inodesc *idesc, const char *msg)
 {
 
 	switch (idesc->id_fix) {
 
 	case DONTKNOW:
 		if (idesc->id_type == DATA)
 			direrror(idesc->id_number, msg);
 		else
 			pwarn("%s", msg);
 		if (preen) {
 			printf(" (SALVAGED)\n");
 			idesc->id_fix = FIX;
 			return (ALTERED);
 		}
 		if (reply("SALVAGE") == 0) {
 			idesc->id_fix = NOFIX;
 			return (0);
 		}
 		idesc->id_fix = FIX;
 		return (ALTERED);
 
 	case FIX:
 		return (ALTERED);
 
 	case NOFIX:
 	case IGNORE:
 		return (0);
 
 	default:
 		errx(EEXIT, "UNKNOWN INODESC FIX MODE %d", idesc->id_fix);
 	}
 	/* NOTREACHED */
 	return (0);
 }
 
 #include <stdarg.h>
 
 /*
  * An unexpected inconsistency occurred.
  * Die if preening or file system is running with soft dependency protocol,
  * otherwise just print message and continue.
  */
 void
 pfatal(const char *fmt, ...)
 {
 	va_list ap;
 	va_start(ap, fmt);
 	if (!preen) {
 		(void)vfprintf(stdout, fmt, ap);
 		va_end(ap);
 		if (usedsoftdep)
 			(void)fprintf(stdout,
 			    "\nUNEXPECTED SOFT UPDATE INCONSISTENCY\n");
 		/*
 		 * Force foreground fsck to clean up inconsistency.
 		 */
 		if (bkgrdflag) {
 			cmd.value = FS_NEEDSFSCK;
 			cmd.size = 1;
 			if (sysctlbyname("vfs.ffs.setflags", 0, 0,
 			    &cmd, sizeof cmd) == -1)
 				pwarn("CANNOT SET FS_NEEDSFSCK FLAG\n");
 			fprintf(stdout, "CANNOT RUN IN BACKGROUND\n");
 			ckfini(0);
 			exit(EEXIT);
 		}
 		return;
 	}
 	if (cdevname == NULL)
 		cdevname = strdup("fsck");
 	(void)fprintf(stdout, "%s: ", cdevname);
 	(void)vfprintf(stdout, fmt, ap);
 	(void)fprintf(stdout,
 	    "\n%s: UNEXPECTED%sINCONSISTENCY; RUN fsck MANUALLY.\n",
 	    cdevname, usedsoftdep ? " SOFT UPDATE " : " ");
 	/*
 	 * Force foreground fsck to clean up inconsistency.
 	 */
 	if (bkgrdflag) {
 		cmd.value = FS_NEEDSFSCK;
 		cmd.size = 1;
 		if (sysctlbyname("vfs.ffs.setflags", 0, 0,
 		    &cmd, sizeof cmd) == -1)
 			pwarn("CANNOT SET FS_NEEDSFSCK FLAG\n");
 	}
 	ckfini(0);
 	exit(EEXIT);
 }
 
 /*
  * Pwarn just prints a message when not preening or running soft dependency
  * protocol, or a warning (preceded by filename) when preening.
  */
 void
 pwarn(const char *fmt, ...)
 {
 	va_list ap;
 	va_start(ap, fmt);
 	if (preen)
 		(void)fprintf(stdout, "%s: ", cdevname);
 	(void)vfprintf(stdout, fmt, ap);
 	va_end(ap);
 }
 
 /*
  * Stub for routines from kernel.
  */
 void
 panic(const char *fmt, ...)
 {
 	va_list ap;
 	va_start(ap, fmt);
 	pfatal("INTERNAL INCONSISTENCY:");
 	(void)vfprintf(stdout, fmt, ap);
 	va_end(ap);
 	exit(EEXIT);
 }
Index: head/share/man/man3/Makefile
===================================================================
--- head/share/man/man3/Makefile	(revision 336913)
+++ head/share/man/man3/Makefile	(revision 336914)
@@ -1,341 +1,346 @@
 #	@(#)Makefile	8.2 (Berkeley) 12/13/93
 # $FreeBSD$
 
 .include <src.opts.mk>
 
 PACKAGE=runtime-manuals
 
 MAN=		assert.3 \
 		ATOMIC_VAR_INIT.3 \
 		bitstring.3 \
 		end.3 \
 		fpgetround.3 \
 		intro.3 \
 		makedev.3 \
 		offsetof.3 \
 		${PTHREAD_MAN} \
 		queue.3 \
 		sigevent.3 \
 		siginfo.3 \
 		stdarg.3 \
 		sysexits.3 \
 		tgmath.3 \
 		timeradd.3 \
 		tree.3
 
 MLINKS=		ATOMIC_VAR_INIT.3 atomic_compare_exchange_strong.3 \
 		ATOMIC_VAR_INIT.3 atomic_compare_exchange_strong_explicit.3 \
 		ATOMIC_VAR_INIT.3 atomic_compare_exchange_weak.3 \
 		ATOMIC_VAR_INIT.3 atomic_compare_exchange_weak_explicit.3 \
 		ATOMIC_VAR_INIT.3 atomic_exchange.3 \
 		ATOMIC_VAR_INIT.3 atomic_exchange_explicit.3 \
 		ATOMIC_VAR_INIT.3 atomic_fetch_add.3 \
 		ATOMIC_VAR_INIT.3 atomic_fetch_add_explicit.3 \
 		ATOMIC_VAR_INIT.3 atomic_fetch_and.3 \
 		ATOMIC_VAR_INIT.3 atomic_fetch_and_explicit.3 \
 		ATOMIC_VAR_INIT.3 atomic_fetch_or.3 \
 		ATOMIC_VAR_INIT.3 atomic_fetch_or_explicit.3 \
 		ATOMIC_VAR_INIT.3 atomic_fetch_sub.3 \
 		ATOMIC_VAR_INIT.3 atomic_fetch_sub_explicit.3 \
 		ATOMIC_VAR_INIT.3 atomic_fetch_xor.3 \
 		ATOMIC_VAR_INIT.3 atomic_fetch_xor_explicit.3 \
 		ATOMIC_VAR_INIT.3 atomic_init.3 \
 		ATOMIC_VAR_INIT.3 atomic_is_lock_free.3 \
 		ATOMIC_VAR_INIT.3 atomic_load.3 \
 		ATOMIC_VAR_INIT.3 atomic_load_explicit.3 \
 		ATOMIC_VAR_INIT.3 atomic_store.3 \
 		ATOMIC_VAR_INIT.3 atomic_store_explicit.3
 MLINKS+=	bitstring.3 bit_alloc.3 \
 		bitstring.3 bit_clear.3 \
 		bitstring.3 bit_decl.3 \
 		bitstring.3 bit_ffc.3 \
 		bitstring.3 bit_ffc_at.3 \
 		bitstring.3 bit_ffs.3 \
 		bitstring.3 bit_ffs_at.3 \
 		bitstring.3 bit_nclear.3 \
 		bitstring.3 bit_nset.3 \
 		bitstring.3 bit_set.3 \
 		bitstring.3 bitstr_size.3 \
 		bitstring.3 bit_test.3
 MLINKS+=	end.3 edata.3 \
 		end.3 etext.3
 MLINKS+=	fpgetround.3 fpgetmask.3 \
 		fpgetround.3 fpgetprec.3 \
 		fpgetround.3 fpgetsticky.3 \
 		fpgetround.3 fpresetsticky.3 \
 		fpgetround.3 fpsetmask.3 \
 		fpgetround.3 fpsetprec.3 \
 		fpgetround.3 fpsetround.3
 MLINKS+=	makedev.3 major.3 \
 		makedev.3 minor.3
 MLINKS+=	${PTHREAD_MLINKS}
 MLINKS+=	queue.3 LIST_CLASS_ENTRY.3 \
 		queue.3 LIST_CLASS_HEAD.3 \
 		queue.3 LIST_EMPTY.3 \
 		queue.3 LIST_ENTRY.3 \
 		queue.3 LIST_FIRST.3 \
 		queue.3 LIST_FOREACH.3 \
 		queue.3 LIST_FOREACH_FROM.3 \
 		queue.3 LIST_FOREACH_FROM_SAFE.3 \
 		queue.3 LIST_FOREACH_SAFE.3 \
 		queue.3 LIST_HEAD.3 \
 		queue.3 LIST_HEAD_INITIALIZER.3 \
 		queue.3 LIST_INIT.3 \
 		queue.3 LIST_INSERT_AFTER.3 \
 		queue.3 LIST_INSERT_BEFORE.3 \
 		queue.3 LIST_INSERT_HEAD.3 \
 		queue.3 LIST_NEXT.3 \
 		queue.3 LIST_PREV.3 \
 		queue.3 LIST_REMOVE.3 \
 		queue.3 LIST_SWAP.3 \
 		queue.3 SLIST_CLASS_ENTRY.3 \
 		queue.3 SLIST_CLASS_HEAD.3 \
 		queue.3 SLIST_EMPTY.3 \
 		queue.3 SLIST_ENTRY.3 \
 		queue.3 SLIST_FIRST.3 \
 		queue.3 SLIST_FOREACH.3 \
 		queue.3 SLIST_FOREACH_FROM.3 \
 		queue.3 SLIST_FOREACH_FROM_SAFE.3 \
 		queue.3 SLIST_FOREACH_SAFE.3 \
 		queue.3 SLIST_HEAD.3 \
 		queue.3 SLIST_HEAD_INITIALIZER.3 \
 		queue.3 SLIST_INIT.3 \
 		queue.3 SLIST_INSERT_AFTER.3 \
 		queue.3 SLIST_INSERT_HEAD.3 \
 		queue.3 SLIST_NEXT.3 \
 		queue.3 SLIST_REMOVE.3 \
 		queue.3 SLIST_REMOVE_AFTER.3 \
 		queue.3 SLIST_REMOVE_HEAD.3 \
 		queue.3 SLIST_REMOVE_PREVPTR.3 \
 		queue.3 SLIST_SWAP.3 \
 		queue.3 STAILQ_CLASS_ENTRY.3 \
 		queue.3 STAILQ_CLASS_HEAD.3 \
 		queue.3 STAILQ_CONCAT.3 \
 		queue.3 STAILQ_EMPTY.3 \
 		queue.3 STAILQ_ENTRY.3 \
 		queue.3 STAILQ_FIRST.3 \
 		queue.3 STAILQ_FOREACH.3 \
 		queue.3 STAILQ_FOREACH_FROM.3 \
 		queue.3 STAILQ_FOREACH_FROM_SAFE.3 \
 		queue.3 STAILQ_FOREACH_SAFE.3 \
 		queue.3 STAILQ_HEAD.3 \
 		queue.3 STAILQ_HEAD_INITIALIZER.3 \
 		queue.3 STAILQ_INIT.3 \
 		queue.3 STAILQ_INSERT_AFTER.3 \
 		queue.3 STAILQ_INSERT_HEAD.3 \
 		queue.3 STAILQ_INSERT_TAIL.3 \
 		queue.3 STAILQ_LAST.3 \
 		queue.3 STAILQ_NEXT.3 \
 		queue.3 STAILQ_REMOVE.3 \
 		queue.3 STAILQ_REMOVE_AFTER.3 \
 		queue.3 STAILQ_REMOVE_HEAD.3 \
 		queue.3 STAILQ_SWAP.3 \
 		queue.3 TAILQ_CLASS_ENTRY.3 \
 		queue.3 TAILQ_CLASS_HEAD.3 \
 		queue.3 TAILQ_CONCAT.3 \
 		queue.3 TAILQ_EMPTY.3 \
 		queue.3 TAILQ_ENTRY.3 \
 		queue.3 TAILQ_FIRST.3 \
 		queue.3 TAILQ_FOREACH.3 \
 		queue.3 TAILQ_FOREACH_FROM.3 \
 		queue.3 TAILQ_FOREACH_FROM_SAFE.3 \
 		queue.3 TAILQ_FOREACH_REVERSE.3 \
 		queue.3 TAILQ_FOREACH_REVERSE_FROM.3 \
 		queue.3 TAILQ_FOREACH_REVERSE_FROM_SAFE.3 \
 		queue.3 TAILQ_FOREACH_REVERSE_SAFE.3 \
 		queue.3 TAILQ_FOREACH_SAFE.3 \
 		queue.3 TAILQ_HEAD.3 \
 		queue.3 TAILQ_HEAD_INITIALIZER.3 \
 		queue.3 TAILQ_INIT.3 \
 		queue.3 TAILQ_INSERT_AFTER.3 \
 		queue.3 TAILQ_INSERT_BEFORE.3 \
 		queue.3 TAILQ_INSERT_HEAD.3 \
 		queue.3 TAILQ_INSERT_TAIL.3 \
 		queue.3 TAILQ_LAST.3 \
 		queue.3 TAILQ_NEXT.3 \
 		queue.3 TAILQ_PREV.3 \
 		queue.3 TAILQ_REMOVE.3 \
 		queue.3 TAILQ_SWAP.3
 MLINKS+=	stdarg.3 va_arg.3 \
 		stdarg.3 va_copy.3 \
 		stdarg.3 va_end.3 \
 		stdarg.3 varargs.3 \
 		stdarg.3 va_start.3
 MLINKS+=	timeradd.3 timerclear.3 \
 		timeradd.3 timercmp.3 \
 		timeradd.3 timerisset.3 \
-		timeradd.3 timersub.3
+		timeradd.3 timersub.3 \
+		timeradd.3 timespecadd.3 \
+		timeradd.3 timespecsub.3 \
+		timeradd.3 timespecclear.3 \
+		timeradd.3 timespecisset.3 \
+		timeradd.3 timespeccmp.3
 MLINKS+=	tree.3 RB_EMPTY.3 \
 		tree.3 RB_ENTRY.3 \
 		tree.3 RB_FIND.3 \
 		tree.3 RB_FOREACH.3 \
 		tree.3 RB_FOREACH_REVERSE.3 \
 		tree.3 RB_GENERATE.3 \
 		tree.3 RB_GENERATE_STATIC.3 \
 		tree.3 RB_HEAD.3 \
 		tree.3 RB_INIT.3 \
 		tree.3 RB_INITIALIZER.3 \
 		tree.3 RB_INSERT.3 \
 		tree.3 RB_LEFT.3 \
 		tree.3 RB_MAX.3 \
 		tree.3 RB_MIN.3 \
 		tree.3 RB_NEXT.3 \
 		tree.3 RB_NFIND.3 \
 		tree.3 RB_PARENT.3 \
 		tree.3 RB_PREV.3 \
 		tree.3 RB_PROTOTYPE.3 \
 		tree.3 RB_PROTOTYPE_STATIC.3 \
 		tree.3 RB_REMOVE.3 \
 		tree.3 RB_RIGHT.3 \
 		tree.3 RB_ROOT.3 \
 		tree.3 SPLAY_EMPTY.3 \
 		tree.3 SPLAY_ENTRY.3 \
 		tree.3 SPLAY_FIND.3 \
 		tree.3 SPLAY_FOREACH.3 \
 		tree.3 SPLAY_GENERATE.3 \
 		tree.3 SPLAY_HEAD.3 \
 		tree.3 SPLAY_INIT.3 \
 		tree.3 SPLAY_INITIALIZER.3 \
 		tree.3 SPLAY_INSERT.3 \
 		tree.3 SPLAY_LEFT.3 \
 		tree.3 SPLAY_MAX.3 \
 		tree.3 SPLAY_MIN.3 \
 		tree.3 SPLAY_NEXT.3 \
 		tree.3 SPLAY_PROTOTYPE.3 \
 		tree.3 SPLAY_REMOVE.3 \
 		tree.3 SPLAY_RIGHT.3 \
 		tree.3 SPLAY_ROOT.3
 
 .if ${MK_LIBTHR} != "no"
 PTHREAD_MAN=	pthread.3 \
 		pthread_affinity_np.3 \
 		pthread_atfork.3 \
 		pthread_attr.3 \
 		pthread_attr_affinity_np.3 \
 		pthread_attr_get_np.3 \
 		pthread_attr_setcreatesuspend_np.3 \
 		pthread_barrierattr.3 \
 		pthread_barrier_destroy.3 \
 		pthread_cancel.3 \
 		pthread_cleanup_pop.3 \
 		pthread_cleanup_push.3 \
 		pthread_condattr.3 \
 		pthread_cond_broadcast.3 \
 		pthread_cond_destroy.3 \
 		pthread_cond_init.3 \
 		pthread_cond_signal.3 \
 		pthread_cond_timedwait.3 \
 		pthread_cond_wait.3 \
 		pthread_create.3 \
 		pthread_detach.3 \
 		pthread_equal.3 \
 		pthread_exit.3 \
 		pthread_getconcurrency.3 \
 		pthread_getcpuclockid.3 \
 		pthread_getspecific.3 \
 		pthread_getthreadid_np.3 \
 		pthread_join.3 \
 		pthread_key_create.3 \
 		pthread_key_delete.3 \
 		pthread_kill.3 \
 		pthread_main_np.3 \
 		pthread_multi_np.3 \
 		pthread_mutexattr.3 \
 		pthread_mutexattr_getkind_np.3 \
 		pthread_mutex_consistent.3 \
 		pthread_mutex_destroy.3 \
 		pthread_mutex_init.3 \
 		pthread_mutex_lock.3 \
 		pthread_mutex_timedlock.3 \
 		pthread_mutex_trylock.3 \
 		pthread_mutex_unlock.3 \
 		pthread_once.3 \
 		pthread_resume_all_np.3 \
 		pthread_resume_np.3 \
 		pthread_rwlockattr_destroy.3 \
 		pthread_rwlockattr_getpshared.3 \
 		pthread_rwlockattr_init.3 \
 		pthread_rwlockattr_setpshared.3 \
 		pthread_rwlock_destroy.3 \
 		pthread_rwlock_init.3 \
 		pthread_rwlock_rdlock.3 \
 		pthread_rwlock_timedrdlock.3 \
 		pthread_rwlock_timedwrlock.3 \
 		pthread_rwlock_unlock.3 \
 		pthread_rwlock_wrlock.3 \
 		pthread_schedparam.3 \
 		pthread_self.3 \
 		pthread_set_name_np.3 \
 		pthread_setspecific.3 \
 		pthread_sigmask.3 \
 		pthread_spin_init.3 \
 		pthread_spin_lock.3 \
 		pthread_suspend_all_np.3 \
 		pthread_suspend_np.3 \
 		pthread_switch_add_np.3 \
 		pthread_testcancel.3 \
 		pthread_yield.3
 
 PTHREAD_MLINKS=	pthread_affinity_np.3 pthread_getaffinity_np.3 \
 		pthread_affinity_np.3 pthread_setaffinity_np.3
 PTHREAD_MLINKS+=pthread_attr.3 pthread_attr_destroy.3 \
 		pthread_attr.3 pthread_attr_getdetachstate.3 \
 		pthread_attr.3 pthread_attr_getguardsize.3 \
 		pthread_attr.3 pthread_attr_getinheritsched.3 \
 		pthread_attr.3 pthread_attr_getschedparam.3 \
 		pthread_attr.3 pthread_attr_getschedpolicy.3 \
 		pthread_attr.3 pthread_attr_getscope.3 \
 		pthread_attr.3 pthread_attr_getstack.3 \
 		pthread_attr.3 pthread_attr_getstackaddr.3 \
 		pthread_attr.3 pthread_attr_getstacksize.3 \
 		pthread_attr.3 pthread_attr_init.3 \
 		pthread_attr.3 pthread_attr_setdetachstate.3 \
 		pthread_attr.3 pthread_attr_setguardsize.3 \
 		pthread_attr.3 pthread_attr_setinheritsched.3 \
 		pthread_attr.3 pthread_attr_setschedparam.3 \
 		pthread_attr.3 pthread_attr_setschedpolicy.3 \
 		pthread_attr.3 pthread_attr_setscope.3 \
 		pthread_attr.3 pthread_attr_setstack.3 \
 		pthread_attr.3 pthread_attr_setstackaddr.3 \
 		pthread_attr.3 pthread_attr_setstacksize.3
 PTHREAD_MLINKS+=pthread_attr_affinity_np.3 pthread_attr_getaffinity_np.3 \
 		pthread_attr_affinity_np.3 pthread_attr_setaffinity_np.3
 PTHREAD_MLINKS+=pthread_barrierattr.3 pthread_barrierattr_destroy.3 \
 		pthread_barrierattr.3 pthread_barrierattr_getpshared.3 \
 		pthread_barrierattr.3 pthread_barrierattr_init.3 \
 		pthread_barrierattr.3 pthread_barrierattr_setpshared.3
 PTHREAD_MLINKS+=pthread_barrier_destroy.3 pthread_barrier_init.3 \
 		pthread_barrier_destroy.3 pthread_barrier_wait.3
 PTHREAD_MLINKS+=pthread_condattr.3 pthread_condattr_destroy.3 \
 		pthread_condattr.3 pthread_condattr_init.3 \
 		pthread_condattr.3 pthread_condattr_getclock.3 \
 		pthread_condattr.3 pthread_condattr_setclock.3 \
 		pthread_condattr.3 pthread_condattr_getpshared.3 \
 		pthread_condattr.3 pthread_condattr_setpshared.3
 PTHREAD_MLINKS+=pthread_getconcurrency.3 pthread_setconcurrency.3
 PTHREAD_MLINKS+=pthread_multi_np.3 pthread_single_np.3
 PTHREAD_MLINKS+=pthread_mutexattr.3 pthread_mutexattr_destroy.3 \
 		pthread_mutexattr.3 pthread_mutexattr_getprioceiling.3 \
 		pthread_mutexattr.3 pthread_mutexattr_getprotocol.3 \
 		pthread_mutexattr.3 pthread_mutexattr_getrobust.3 \
 		pthread_mutexattr.3 pthread_mutexattr_gettype.3 \
 		pthread_mutexattr.3 pthread_mutexattr_init.3 \
 		pthread_mutexattr.3 pthread_mutexattr_setprioceiling.3 \
 		pthread_mutexattr.3 pthread_mutexattr_setprotocol.3 \
 		pthread_mutexattr.3 pthread_mutexattr_setrobust.3 \
 		pthread_mutexattr.3 pthread_mutexattr_settype.3
 PTHREAD_MLINKS+=pthread_mutexattr_getkind_np.3 pthread_mutexattr_setkind_np.3
 PTHREAD_MLINKS+=pthread_rwlock_rdlock.3 pthread_rwlock_tryrdlock.3
 PTHREAD_MLINKS+=pthread_rwlock_wrlock.3 pthread_rwlock_trywrlock.3
 PTHREAD_MLINKS+=pthread_schedparam.3 pthread_getschedparam.3 \
 		pthread_schedparam.3 pthread_setschedparam.3
 PTHREAD_MLINKS+=pthread_spin_init.3 pthread_spin_destroy.3 \
 		pthread_spin_lock.3 pthread_spin_trylock.3 \
 		pthread_spin_lock.3 pthread_spin_unlock.3
 PTHREAD_MLINKS+=pthread_switch_add_np.3 pthread_switch_delete_np.3
 PTHREAD_MLINKS+=pthread_testcancel.3 pthread_setcancelstate.3 \
 		pthread_testcancel.3 pthread_setcanceltype.3
 PTHREAD_MLINKS+=pthread_join.3 pthread_timedjoin_np.3
 .endif
 
 .include <bsd.prog.mk>
Index: head/share/man/man3/timeradd.3
===================================================================
--- head/share/man/man3/timeradd.3	(revision 336913)
+++ head/share/man/man3/timeradd.3	(revision 336914)
@@ -1,119 +1,165 @@
 .\" Copyright (c) 1999 Kelly Yancey <kbyanc@posi.net>
 .\" All rights reserved.
 .\"
 .\" Redistribution and use in source and binary forms, with or without
 .\" modification, are permitted provided that the following conditions
 .\" are met:
 .\" 1. Redistributions of source code must retain the above copyright
 .\"    notice, this list of conditions and the following disclaimer.
 .\" 2. Redistributions in binary form must reproduce the above copyright
 .\"    notice, this list of conditions and the following disclaimer in the
 .\"    documentation and/or other materials provided with the distribution.
 .\" 3. Neither the name of the author nor the names of any co-contributors
 .\"    may be used to endorse or promote products derived from this software
 .\"    without specific prior written permission.
 .\"
 .\" THIS SOFTWARE IS PROVIDED BY JOHN BIRRELL AND CONTRIBUTORS ``AS IS'' AND
 .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 .\" ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 .\" SUCH DAMAGE.
 .\"
 .\" $FreeBSD$
 .\"
-.Dd August 11, 1999
+.Dd July 30, 2018
 .Dt TIMERADD 3
 .Os
 .Sh NAME
 .Nm timeradd ,
 .Nm timersub ,
 .Nm timerclear ,
 .Nm timerisset ,
-.Nm timercmp
-.Nd operations on timevals
+.Nm timercmp ,
+.Nm timespecadd ,
+.Nm timespecsub ,
+.Nm timespecclear ,
+.Nm timespecisset ,
+.Nm timespeccmp
+.Nd operations on timevals and timespecs
 .Sh SYNOPSIS
 .In sys/time.h
 .Ft void
 .Fn timeradd "struct timeval *a" "struct timeval *b" "struct timeval *res"
 .Ft void
 .Fn timersub "struct timeval *a" "struct timeval *b" "struct timeval *res"
 .Ft void
 .Fn timerclear "struct timeval *tvp"
 .Ft int
 .Fn timerisset "struct timeval *tvp"
 .Ft int
 .Fn timercmp "struct timeval *a" "struct timeval *b" CMP
+.Ft void
+.Fn timespecadd "struct timespec *a" "struct timespec *b" "struct timespec *res"
+.Ft void
+.Fn timespecsub "struct timespec *a" "struct timespec *b" "struct timespec *res"
+.Ft void
+.Fn timespecclear "struct timespec *ts"
+.Ft int
+.Fn timespecisset "struct timespec *ts"
+.Ft int
+.Fn timespeccmp "struct timespec *a" "struct timespec *b" CMP
 .Sh DESCRIPTION
 These macros are provided for manipulating
 .Fa timeval
+and
+.Fa timespec
 structures for use with the
+.Xr clock_gettime 2 ,
+.Xr clock_settime 2 ,
 .Xr gettimeofday 2
 and
 .Xr settimeofday 2
 calls.
-The structure is defined in
+The
+.Fa timeval
+structure is defined in
 .In sys/time.h
 as:
 .Bd -literal
 struct timeval {
 	long	tv_sec;		/* seconds since Jan. 1, 1970 */
 	long	tv_usec;	/* and microseconds */
 };
 .Ed
+And the
+.Fa timespec
+structure is defined in
+.In time.h
+as:
+.Bd -literal
+struct timespec {
+	time_t tv_sec;		/* seconds */
+	long   tv_nsec;		/* and nanoseconds */
+};
+.Ed
 .Pp
 .Fn timeradd
-adds the time information stored in
+and
+.Fn timespecadd
+add the time information stored in
 .Fa a
 to
 .Fa b
-and stores the resulting
-.Vt timeval
-in
+and store the result in
 .Fa res .
 The results are simplified such that the value of
 .Fa res->tv_usec
-is always less than 1,000,000 (1 second).
+or
+.Fa res->tv_nsec
+is always less than 1 second.
 .Pp
 .Fn timersub
-subtracts the time information stored in
+and
+.Fn timespecsub
+subtract the time information stored in
 .Fa b
 from
 .Fa a
-and stores the resulting
-.Vt timeval
+and store the result
 in
 .Fa res .
 .Pp
 .Fn timerclear
-initializes
-.Fa tvp
-to midnight (0 hour) January 1st, 1970 (the Epoch).
+and
+.Fn timespecclear
+initialize their argument to midnight (0 hour) January 1st, 1970 (the Epoch).
 .Pp
 .Fn timerisset
-returns true if
-.Fa tvp
-is set to any time value other than the Epoch.
+and
+.Fn timespecisset
+return true if their argument is set to any time value other than the Epoch.
 .Pp
 .Fn timercmp
-compares
+and
+.Fn timespeccmp
+compare
 .Fa a
 to
 .Fa b
 using the comparison operator given in
 .Fa CMP ,
-and returns the result of that comparison.
+and return the result of that comparison.
 .Sh SEE ALSO
-.Xr gettimeofday 2
+.Xr gettimeofday 2 ,
+.Xr clock_gettime 2
 .Sh HISTORY
 The
 .Fn timeradd
 family of macros were imported from
 .Nx 1.1 ,
 and appeared in
 .Fx 2.2.6 .
+The
+.Fn timespecadd
+family of macros were imported from
+.Nx 1.3
+into
+.Fx 3.0 ,
+though they were not exposed to userland until
+.Fx 12.0 .
Index: head/sys/compat/linux/linux_event.c
===================================================================
--- head/sys/compat/linux/linux_event.c	(revision 336913)
+++ head/sys/compat/linux/linux_event.c	(revision 336914)
@@ -1,1328 +1,1328 @@
 /*-
  * Copyright (c) 2007 Roman Divacky
  * Copyright (c) 2014 Dmitry Chagin
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/imgact.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/callout.h>
 #include <sys/capsicum.h>
 #include <sys/types.h>
 #include <sys/user.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/filio.h>
 #include <sys/errno.h>
 #include <sys/event.h>
 #include <sys/poll.h>
 #include <sys/proc.h>
 #include <sys/selinfo.h>
 #include <sys/sx.h>
 #include <sys/syscallsubr.h>
 #include <sys/timespec.h>
 
 #ifdef COMPAT_LINUX32
 #include <machine/../linux32/linux.h>
 #include <machine/../linux32/linux32_proto.h>
 #else
 #include <machine/../linux/linux.h>
 #include <machine/../linux/linux_proto.h>
 #endif
 
 #include <compat/linux/linux_emul.h>
 #include <compat/linux/linux_event.h>
 #include <compat/linux/linux_file.h>
 #include <compat/linux/linux_timer.h>
 #include <compat/linux/linux_util.h>
 
 /*
  * epoll defines 'struct epoll_event' with the field 'data' as 64 bits
  * on all architectures. But on 32 bit architectures BSD 'struct kevent' only
  * has 32 bit opaque pointer as 'udata' field. So we can't pass epoll supplied
  * data verbatim. Therefore we allocate 64-bit memory block to pass
  * user supplied data for every file descriptor.
  */
 
 typedef uint64_t	epoll_udata_t;
 
 /*
  * Table of epoll user data hung off the process emuldata (pem->epoll).
  * epoll_fd_install() stores an entry at udata[fd] and grows the table
  * when fd exceeds fdc; epoll_kev_copyout() reads the entry back using the
  * kevent ident as the index.
  */
 struct epoll_emuldata {
 	uint32_t	fdc;		/* epoll udata max index */
 	epoll_udata_t	udata[1];	/* epoll user data vector */
 };
 
 /*
  * EPOLL_DEF_SZ is the index handed to epoll_fd_install() when an epoll
  * descriptor is created.  EPOLL_SIZE(fdn) is the allocation size, in bytes,
  * of an epoll_emuldata whose udata vector can be indexed from 0 to fdn
  * inclusive (the structure itself already contains one element).
  */
 #define	EPOLL_DEF_SZ		16
 #define	EPOLL_SIZE(fdn)			\
 	(sizeof(struct epoll_emuldata)+(fdn) * sizeof(epoll_udata_t))
 
 /*
  * Event record exchanged with the Linux process: copied in by
  * linux_epoll_ctl() and copied out by epoll_kev_copyout().
  * NOTE(review): the structure is packed on amd64 only, presumably to match
  * the Linux layout on that architecture -- confirm.
  */
 struct epoll_event {
 	uint32_t	events;		/* LINUX_EPOLL* event bits */
 	epoll_udata_t	data;		/* opaque user data for the fd */
 }
 #if defined(__amd64__)
 __attribute__((packed))
 #endif
 ;
 
 #define	LINUX_MAX_EVENTS	(INT_MAX / sizeof(struct epoll_event))
 
 static void	epoll_fd_install(struct thread *td, int fd, epoll_udata_t udata);
 static int	epoll_to_kevent(struct thread *td, struct file *epfp,
 		    int fd, struct epoll_event *l_event, int *kev_flags,
 		    struct kevent *kevent, int *nkevents);
 static void	kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event);
 static int	epoll_kev_copyout(void *arg, struct kevent *kevp, int count);
 static int	epoll_kev_copyin(void *arg, struct kevent *kevp, int count);
 static int	epoll_delete_event(struct thread *td, struct file *epfp,
 		    int fd, int filter);
 static int	epoll_delete_all_events(struct thread *td, struct file *epfp,
 		    int fd);
 
 /* Cursor handed to epoll_kev_copyin() through struct kevent_copyops. */
 struct epoll_copyin_args {
 	struct kevent	*changelist;	/* next kevent to copy; advanced per call */
 };
 
 /* State handed to epoll_kev_copyout() through struct kevent_copyops. */
 struct epoll_copyout_args {
 	struct epoll_event	*leventlist;	/* next user slot; advanced per copyout */
 	struct proc		*p;		/* process whose emuldata is looked up */
 	uint32_t		count;		/* events copied out so far */
 	int			error;		/* first copyout error, 0 if none */
 };
 
 /* eventfd */
 typedef uint64_t	eventfd_t;
 
 static fo_rdwr_t	eventfd_read;
 static fo_rdwr_t	eventfd_write;
 static fo_ioctl_t	eventfd_ioctl;
 static fo_poll_t	eventfd_poll;
 static fo_kqfilter_t	eventfd_kqfilter;
 static fo_stat_t	eventfd_stat;
 static fo_close_t	eventfd_close;
 static fo_fill_kinfo_t	eventfd_fill_kinfo;
 
 /*
  * File operations for eventfd descriptors.  Truncate, chmod, chown and
  * sendfile are routed to the generic invfo_* handlers; everything else is
  * implemented by the eventfd_* functions declared above.
  */
 static struct fileops eventfdops = {
 	.fo_read = eventfd_read,
 	.fo_write = eventfd_write,
 	.fo_truncate = invfo_truncate,
 	.fo_ioctl = eventfd_ioctl,
 	.fo_poll = eventfd_poll,
 	.fo_kqfilter = eventfd_kqfilter,
 	.fo_stat = eventfd_stat,
 	.fo_close = eventfd_close,
 	.fo_chmod = invfo_chmod,
 	.fo_chown = invfo_chown,
 	.fo_sendfile = invfo_sendfile,
 	.fo_fill_kinfo = eventfd_fill_kinfo,
 	.fo_flags = DFLAG_PASSABLE
 };
 
 static void	filt_eventfddetach(struct knote *kn);
 static int	filt_eventfdread(struct knote *kn, long hint);
 static int	filt_eventfdwrite(struct knote *kn, long hint);
 
 /*
  * kqueue filter operations for eventfd: one table for the read side and
  * one for the write side.  Both share filt_eventfddetach().
  */
 static struct filterops eventfd_rfiltops = {
 	.f_isfd = 1,
 	.f_detach = filt_eventfddetach,
 	.f_event = filt_eventfdread
 };
 static struct filterops eventfd_wfiltops = {
 	.f_isfd = 1,
 	.f_detach = filt_eventfddetach,
 	.f_event = filt_eventfdwrite
 };
 
 /* timerfd */
 typedef uint64_t	timerfd_t;
 
 static fo_rdwr_t	timerfd_read;
 static fo_poll_t	timerfd_poll;
 static fo_kqfilter_t	timerfd_kqfilter;
 static fo_stat_t	timerfd_stat;
 static fo_close_t	timerfd_close;
 static fo_fill_kinfo_t	timerfd_fill_kinfo;
 
 /*
  * File operations for timerfd descriptors.  Writes are routed to
  * invfo_rdwr, and truncate, chmod, chown and sendfile to the other generic
  * invfo_* handlers.  The ioctl handler is shared with eventfd
  * (eventfd_ioctl).
  */
 static struct fileops timerfdops = {
 	.fo_read = timerfd_read,
 	.fo_write = invfo_rdwr,
 	.fo_truncate = invfo_truncate,
 	.fo_ioctl = eventfd_ioctl,
 	.fo_poll = timerfd_poll,
 	.fo_kqfilter = timerfd_kqfilter,
 	.fo_stat = timerfd_stat,
 	.fo_close = timerfd_close,
 	.fo_chmod = invfo_chmod,
 	.fo_chown = invfo_chown,
 	.fo_sendfile = invfo_sendfile,
 	.fo_fill_kinfo = timerfd_fill_kinfo,
 	.fo_flags = DFLAG_PASSABLE
 };
 
 static void	filt_timerfddetach(struct knote *kn);
 static int	filt_timerfdread(struct knote *kn, long hint);
 
 /* kqueue read-filter operations for timerfd; no write-side table is defined here. */
 static struct filterops timerfd_rfiltops = {
 	.f_isfd = 1,
 	.f_detach = filt_timerfddetach,
 	.f_event = filt_timerfdread
 };
 
 /*
  * Per-descriptor eventfd state.
  * NOTE(review): field roles below are inferred from names and types only;
  * confirm against the eventfd_* implementations.
  */
 struct eventfd {
 	eventfd_t	efd_count;	/* presumably the 64-bit counter value */
 	uint32_t	efd_flags;	/* presumably creation flags */
 	struct selinfo	efd_sel;	/* presumably for poll/kqueue waiters */
 	struct mtx	efd_lock;	/* presumably protects the fields above */
 };
 
 /*
  * Per-descriptor timerfd state.
  * NOTE(review): field roles below are inferred from names and types only;
  * confirm against the timerfd_* implementations.
  */
 struct timerfd {
 	clockid_t	tfd_clockid;	/* presumably the clock the timer runs on */
 	struct itimerspec tfd_time;	/* presumably the programmed value/interval */
 	struct callout	tfd_callout;	/* presumably drives linux_timerfd_expire() */
 	timerfd_t	tfd_count;	/* presumably the expiration counter */
 	bool		tfd_canceled;	/* presumably set when the timer is canceled */
 	struct selinfo	tfd_sel;	/* presumably for poll/kqueue waiters */
 	struct mtx	tfd_lock;	/* presumably protects the fields above */
 };
 
 static int	eventfd_create(struct thread *td, uint32_t initval, int flags);
 static void	linux_timerfd_expire(void *);
 static void	linux_timerfd_curval(struct timerfd *, struct itimerspec *);
 
 
 /*
  * Record udata as the epoll user data of descriptor fd in the calling
  * process' emuldata, allocating or growing the table as needed.
  */
 static void
 epoll_fd_install(struct thread *td, int fd, epoll_udata_t udata)
 {
 	struct linux_pemuldata *pem;
 	struct epoll_emuldata *table;
 
 	pem = pem_find(td->td_proc);
 	KASSERT(pem != NULL, ("epoll proc emuldata not found.\n"));
 
 	LINUX_PEM_XLOCK(pem);
 	table = pem->epoll;
 	if (table == NULL) {
 		/* First use: size the table so that index fd is valid. */
 		table = malloc(EPOLL_SIZE(fd), M_EPOLL, M_WAITOK);
 		table->fdc = fd;
 		pem->epoll = table;
 	} else if (fd > table->fdc) {
 		/* Existing table is too short for this descriptor. */
 		table = realloc(table, EPOLL_SIZE(fd), M_EPOLL, M_WAITOK);
 		table->fdc = fd;
 		pem->epoll = table;
 	}
 	table->udata[fd] = udata;
 	LINUX_PEM_XUNLOCK(pem);
 }
 
 /*
  * Create the kqueue backing an epoll descriptor and make sure the
  * per-process user data table exists.  Returns 0 or the kern_kqueue() error.
  */
 static int
 epoll_create_common(struct thread *td, int flags)
 {
 	int error;
 
 	error = kern_kqueue(td, flags, NULL);
 	if (error == 0)
 		epoll_fd_install(td, EPOLL_DEF_SZ, 0);
 
 	return (error);
 }
 
 #ifdef LINUX_LEGACY_SYSCALLS
 int
 linux_epoll_create(struct thread *td, struct linux_epoll_create_args *args)
 {
 
 	/*
 	 * The size argument is only validated; it has no influence on
 	 * the object that gets created.
 	 */
 	if (args->size > 0)
 		return (epoll_create_common(td, 0));
 
 	return (EINVAL);
 }
 #endif
 
 int
 linux_epoll_create1(struct thread *td, struct linux_epoll_create1_args *args)
 {
 	int kqflags = 0;
 
 	/* LINUX_O_CLOEXEC is the only flag accepted. */
 	if ((args->flags & ~(LINUX_O_CLOEXEC)) != 0)
 		return (EINVAL);
 
 	/* Translate the Linux close-on-exec bit to the native one. */
 	if ((args->flags & LINUX_O_CLOEXEC) != 0)
 		kqflags |= O_CLOEXEC;
 
 	return (epoll_create_common(td, kqflags));
 }
 
 /* Structure converting function from epoll to kevent. */
 static int
 epoll_to_kevent(struct thread *td, struct file *epfp,
     int fd, struct epoll_event *l_event, int *kev_flags,
     struct kevent *kevent, int *nkevents)
 {
 	struct linux_pemuldata *pem;
 	uint32_t requested;
 	int first_report;
 
 	requested = l_event->events;
 
 	/* Bits that modify how the event is registered. */
 	if ((requested & LINUX_EPOLLONESHOT) != 0)
 		*kev_flags |= EV_ONESHOT;
 	if ((requested & LINUX_EPOLLET) != 0)
 		*kev_flags |= EV_CLEAR;
 	if ((requested & LINUX_EPOLLERR) != 0)
 		*kev_flags |= EV_ERROR;
 	if ((requested & LINUX_EPOLLRDHUP) != 0)
 		*kev_flags |= EV_EOF;
 
 	/* Bits that select which filters are registered. */
 	if ((requested & LINUX_EPOLL_EVRD) != 0) {
 		EV_SET(kevent++, fd, EVFILT_READ, *kev_flags, 0, 0, 0);
 		++(*nkevents);
 	}
 	if ((requested & LINUX_EPOLL_EVWR) != 0) {
 		EV_SET(kevent++, fd, EVFILT_WRITE, *kev_flags, 0, 0, 0);
 		++(*nkevents);
 	}
 
 	/* Everything requested is supported: done. */
 	if ((requested & ~(LINUX_EPOLL_EVSUP)) == 0)
 		return (0);
 
 	/*
 	 * Unsupported bits were requested.  Complain once per process,
 	 * using LINUX_XUNSUP_EPOLL as the "already reported" marker, and
 	 * reject the request.
 	 */
 	pem = pem_find(td->td_proc);
 	KASSERT(pem != NULL, ("epoll proc emuldata not found.\n"));
 	KASSERT(pem->epoll != NULL, ("epoll proc epolldata not found.\n"));
 
 	LINUX_PEM_XLOCK(pem);
 	first_report = (pem->flags & LINUX_XUNSUP_EPOLL) == 0;
 	if (first_report)
 		pem->flags |= LINUX_XUNSUP_EPOLL;
 	LINUX_PEM_XUNLOCK(pem);
 
 	/* The message is emitted with the emuldata lock dropped. */
 	if (first_report)
 		linux_msg(td, "epoll_ctl unsupported flags: 0x%x\n",
 		    requested);
 
 	return (EINVAL);
 }
 
 /*
  * Structure converting function from kevent to epoll. In a case
  * this is called on error in registration we store the error in
  * event->data and pick it up later in linux_epoll_ctl().
  */
 static void
 kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event)
 {
 
 	if ((kevent->flags & EV_ERROR) != 0) {
 		l_event->events = LINUX_EPOLLERR;
 		return;
 	}
 
 	/* XXX EPOLLPRI, EPOLLHUP */
 	switch (kevent->filter) {
 	case EVFILT_READ:
 		l_event->events = LINUX_EPOLLIN;
 		if ((kevent->flags & EV_EOF) != 0)
 			l_event->events |= LINUX_EPOLLRDHUP;
 	break;
 	case EVFILT_WRITE:
 		l_event->events = LINUX_EPOLLOUT;
 	break;
 	}
 }
 
 /*
  * Copyout callback used by kevent. This converts kevent
  * events to epoll events and copies them back to the
  * userspace. This is also called on error on registering
  * of the filter.
  */
 static int
 epoll_kev_copyout(void *arg, struct kevent *kevp, int count)
 {
 	struct epoll_copyout_args *args;
 	struct linux_pemuldata *pem;
 	struct epoll_emuldata *emd;
 	struct epoll_event *eep;
 	int error, fd, i;
 
 	args = (struct epoll_copyout_args*) arg;
 	eep = malloc(sizeof(*eep) * count, M_EPOLL, M_WAITOK | M_ZERO);
 
 	pem = pem_find(args->p);
 	KASSERT(pem != NULL, ("epoll proc emuldata not found.\n"));
 	LINUX_PEM_SLOCK(pem);
 	emd = pem->epoll;
 	KASSERT(emd != NULL, ("epoll proc epolldata not found.\n"));
 
 	for (i = 0; i < count; i++) {
 		kevent_to_epoll(&kevp[i], &eep[i]);
 
 		fd = kevp[i].ident;
 		KASSERT(fd <= emd->fdc, ("epoll user data vector"
 						    " is too small.\n"));
 		eep[i].data = emd->udata[fd];
 	}
 	LINUX_PEM_SUNLOCK(pem);
 
 	error = copyout(eep, args->leventlist, count * sizeof(*eep));
 	if (error == 0) {
 		args->leventlist += count;
 		args->count += count;
 	} else if (args->error == 0)
 		args->error = error;
 
 	free(eep, M_EPOLL);
 	return (error);
 }
 
 /*
  * Copyin callback used by kevent. This copies already
  * converted filters from kernel memory to the kevent
  * internal kernel memory. Hence the memcpy instead of
  * copyin.
  */
 static int
 epoll_kev_copyin(void *arg, struct kevent *kevp, int count)
 {
 	struct epoll_copyin_args *args;
 
 	args = (struct epoll_copyin_args*) arg;
 
 	memcpy(kevp, args->changelist, count * sizeof(*kevp));
 	args->changelist += count;
 
 	return (0);
 }
 
 /*
  * Load epoll filter, convert it to kevent filter
  * and load it into kevent subsystem.
  */
 int
 linux_epoll_ctl(struct thread *td, struct linux_epoll_ctl_args *args)
 {
 	struct file *epfp, *fp;
 	struct epoll_copyin_args ciargs;
 	struct kevent kev[2];		/* at most one read and one write filter */
 	struct kevent_copyops k_ops = { &ciargs,
 					NULL,
 					epoll_kev_copyin};
 	struct epoll_event le;
 	cap_rights_t rights;
 	int kev_flags;
 	int nchanges = 0;
 	int error;
 
 	/* The event argument is not read for EPOLL_CTL_DEL. */
 	if (args->op != LINUX_EPOLL_CTL_DEL) {
 		error = copyin(args->event, &le, sizeof(le));
 		if (error != 0)
 			return (error);
 	}
 
 	/* The epoll descriptor must be backed by a kqueue. */
 	error = fget(td, args->epfd,
 	    cap_rights_init(&rights, CAP_KQUEUE_CHANGE), &epfp);
 	if (error != 0)
 		return (error);
 	if (epfp->f_type != DTYPE_KQUEUE) {
 		error = EINVAL;
 		goto leave1;
 	}
 
 	 /* Protect user data vector from incorrectly supplied fd. */
 	error = fget(td, args->fd, cap_rights_init(&rights, CAP_POLL_EVENT), &fp);
 	if (error != 0)
 		goto leave1;
 
 	/* Linux disallows an epoll descriptor watching itself */
 	if (epfp == fp) {
 		error = EINVAL;
 		goto leave0;
 	}
 
 	ciargs.changelist = kev;
 
 	/* For ADD and MOD, translate the epoll event into kev[]. */
 	if (args->op != LINUX_EPOLL_CTL_DEL) {
 		kev_flags = EV_ADD | EV_ENABLE;
 		error = epoll_to_kevent(td, epfp, args->fd, &le,
 		    &kev_flags, kev, &nchanges);
 		if (error != 0)
 			goto leave0;
 	}
 
 	switch (args->op) {
 	case LINUX_EPOLL_CTL_MOD:
 		/* Drop the existing registrations, then re-add below. */
 		error = epoll_delete_all_events(td, epfp, args->fd);
 		if (error != 0)
 			goto leave0;
 		break;
 
 	case LINUX_EPOLL_CTL_ADD:
 		/*
 		 * kqueue_register() returns ENOENT if the event does not
 		 * exist and the EV_ADD flag is not set.  Any other result
 		 * of the probe is reported as EEXIST.
 		 *
 		 * NOTE(review): kev[0] is read here even when
 		 * epoll_to_kevent() produced no kevents (nchanges == 0,
 		 * i.e. neither read nor write bits were requested), in
 		 * which case it looks uninitialized -- confirm.
 		 */
 		kev[0].flags &= ~EV_ADD;
 		error = kqfd_register(args->epfd, &kev[0], td, 1);
 		if (error != ENOENT) {
 			error = EEXIST;
 			goto leave0;
 		}
 		error = 0;
 		kev[0].flags |= EV_ADD;
 		break;
 
 	case LINUX_EPOLL_CTL_DEL:
 		/* CTL_DEL means unregister this fd with this epoll */
 		error = epoll_delete_all_events(td, epfp, args->fd);
 		goto leave0;
 
 	default:
 		error = EINVAL;
 		goto leave0;
 	}
 
 	/* Remember the user data, then register the prepared kevents. */
 	epoll_fd_install(td, args->fd, le.data);
 
 	error = kern_kevent_fp(td, epfp, nchanges, 0, &k_ops, NULL);
 
 leave0:
 	/* Release the watched descriptor, then fall through. */
 	fdrop(fp, td);
 
 leave1:
 	fdrop(epfp, td);
 	return (error);
 }
 
 /*
  * Wait for a filter to be triggered on the epoll file descriptor.
  */
 static int
 linux_epoll_wait_common(struct thread *td, int epfd, struct epoll_event *events,
     int maxevents, int timeout, sigset_t *uset)
 {
 	struct epoll_copyout_args coargs;
 	struct kevent_copyops k_ops = { &coargs,
 					epoll_kev_copyout,
 					NULL};
 	struct timespec ts, *tsp;
 	cap_rights_t rights;
 	struct file *epfp;
 	sigset_t omask;
 	int error, serror;
 
 	if (maxevents <= 0 || maxevents > LINUX_MAX_EVENTS)
 		return (EINVAL);
 
 	/* The epoll descriptor must be backed by a kqueue. */
 	error = fget(td, epfd,
 	    cap_rights_init(&rights, CAP_KQUEUE_EVENT), &epfp);
 	if (error != 0)
 		return (error);
 	if (epfp->f_type != DTYPE_KQUEUE) {
 		error = EINVAL;
 		goto leave1;
 	}
 	/* Install the caller-supplied signal mask, saving the old one. */
 	if (uset != NULL) {
 		error = kern_sigprocmask(td, SIG_SETMASK, uset,
 		    &omask, 0);
 		if (error != 0)
 			goto leave1;
 		td->td_pflags |= TDP_OLDMASK;
 		/*
 		 * Make sure that ast() is called on return to
 		 * usermode and TDP_OLDMASK is cleared, restoring old
 		 * sigmask.
 		 */
 		thread_lock(td);
 		td->td_flags |= TDF_ASTPENDING;
 		thread_unlock(td);
 	}
 
 
 	coargs.leventlist = events;
 	coargs.p = td->td_proc;
 	coargs.count = 0;
 	coargs.error = 0;
 
 	/* A timeout of -1 means no timeout; other negatives are invalid. */
 	if (timeout != -1) {
 		if (timeout < 0) {
 			error = EINVAL;
 			goto leave0;
 		}
 		/* Convert from milliseconds to timespec. */
 		ts.tv_sec = timeout / 1000;
 		ts.tv_nsec = (timeout % 1000) * 1000000;
 		tsp = &ts;
 	} else {
 		tsp = NULL;
 	}
 
 	error = kern_kevent_fp(td, epfp, 0, maxevents, &k_ops, tsp);
 	if (error == 0 && coargs.error != 0)
 		error = coargs.error;
 
 	/*
 	 * kern_kevent might return ENOMEM which is not expected from epoll_wait.
 	 * Maybe we should translate that but I don't think it matters at all.
 	 */
 	if (error == 0)
 		td->td_retval[0] = coargs.count;
 
 leave0:
 	/*
 	 * Put the saved signal mask back.  An error already recorded above
 	 * takes precedence: the result of restoring the mask must not
 	 * overwrite it, otherwise a failed wait would be reported as success.
 	 */
 	if (uset != NULL) {
 		serror = kern_sigprocmask(td, SIG_SETMASK, &omask,
 		    NULL, 0);
 		if (error == 0)
 			error = serror;
 	}
 leave1:
 	fdrop(epfp, td);
 	return (error);
 }
 
 #ifdef LINUX_LEGACY_SYSCALLS
 /*
  * epoll_wait(2) entry point: the common wait path with no temporary
  * signal mask.
  */
 int
 linux_epoll_wait(struct thread *td, struct linux_epoll_wait_args *args)
 {
 	int error;
 
 	error = linux_epoll_wait_common(td, args->epfd, args->events,
 	    args->maxevents, args->timeout, NULL);
 	return (error);
 }
 #endif
 
 /*
  * epoll_pwait(2) entry point.  When a mask is supplied it must be exactly
  * sizeof(l_sigset_t) bytes (EINVAL otherwise); it is copied in, converted
  * to a native sigset_t and handed to the common wait path.  Without a mask
  * this behaves like a plain wait.
  */
 int
 linux_epoll_pwait(struct thread *td, struct linux_epoll_pwait_args *args)
 {
 	sigset_t mask, *pmask;
 	l_sigset_t lmask;
 	int error;
 
 	pmask = NULL;
 	if (args->mask != NULL) {
 		if (args->sigsetsize != sizeof(l_sigset_t))
 			return (EINVAL);
 		error = copyin(args->mask, &lmask, sizeof(l_sigset_t));
 		if (error != 0)
 			return (error);
 		linux_to_bsd_sigset(&lmask, &mask);
 		pmask = &mask;
 	}
 
 	return (linux_epoll_wait_common(td, args->epfd, args->events,
 	    args->maxevents, args->timeout, pmask));
 }
 
 static int
 epoll_delete_event(struct thread *td, struct file *epfp, int fd, int filter)
 {
 	struct epoll_copyin_args ciargs;
 	struct kevent kev;
 	struct kevent_copyops k_ops = { &ciargs,
 					NULL,
 					epoll_kev_copyin};
 
 	ciargs.changelist = &kev;
 	EV_SET(&kev, fd, filter, EV_DELETE | EV_DISABLE, 0, 0, 0);
 
 	return (kern_kevent_fp(td, epfp, 1, 0, &k_ops, NULL));
 }
 
 /*
  * Remove both the read and the write filter for fd from the kqueue behind
  * epfp.  Both deletions are always attempted.  The call succeeds when the
  * read deletion succeeds; otherwise the result of the write deletion is
  * returned.
  */
 static int
 epoll_delete_all_events(struct thread *td, struct file *epfp, int fd)
 {
 	int rerror, werror;
 
 	rerror = epoll_delete_event(td, epfp, fd, EVFILT_READ);
 	werror = epoll_delete_event(td, epfp, fd, EVFILT_WRITE);
 
 	if (rerror == 0)
 		return (0);
 	return (werror);
 }
 
 /*
  * Create a new Linux eventfd file for the calling thread.
  *
  * initval seeds the counter.  flags is the Linux flag word: LINUX_O_CLOEXEC
  * and LINUX_O_NONBLOCK are translated to their native counterparts here,
  * and the whole word is kept in efd_flags.
  *
  * On success the new descriptor number is stored in td_retval[0] and 0 is
  * returned; a falloc() failure is returned unchanged.
  */
 static int
 eventfd_create(struct thread *td, uint32_t initval, int flags)
 {
 	struct eventfd *efd;
 	struct file *fp;
 	int fflags, fd, error;
 
 	/* Descriptor-level flags for falloc(). */
 	fflags = 0;
 	if ((flags & LINUX_O_CLOEXEC) != 0)
 		fflags |= O_CLOEXEC;
 
 	error = falloc(td, &fp, &fd, fflags);
 	if (error != 0)
 		return (error);
 
 	efd = malloc(sizeof(*efd), M_EPOLL, M_WAITOK | M_ZERO);
 	efd->efd_flags = flags;
 	efd->efd_count = initval;
 	mtx_init(&efd->efd_lock, "eventfd", NULL, MTX_DEF);
 
 	knlist_init_mtx(&efd->efd_sel.si_note, &efd->efd_lock);
 
 	/* File-level flags for finit(). */
 	fflags = FREAD | FWRITE;
 	if ((flags & LINUX_O_NONBLOCK) != 0)
 		fflags |= FNONBLOCK;
 
 	finit(fp, fflags, DTYPE_LINUXEFD, efd, &eventfdops);
 	fdrop(fp, td);
 
 	td->td_retval[0] = fd;
 	return (error);
 }
 
 #ifdef LINUX_LEGACY_SYSCALLS
 /*
  * eventfd(2) entry point: create an eventfd with no flags set.
  */
 int
 linux_eventfd(struct thread *td, struct linux_eventfd_args *args)
 {
 	int error;
 
 	error = eventfd_create(td, args->initval, 0);
 	return (error);
 }
 #endif
 
 /*
  * eventfd2(2) entry point: reject any flag other than LINUX_O_CLOEXEC,
  * LINUX_O_NONBLOCK and LINUX_EFD_SEMAPHORE with EINVAL, then create the
  * eventfd with the supplied flags.
  */
 int
 linux_eventfd2(struct thread *td, struct linux_eventfd2_args *args)
 {
 
 	if ((args->flags & ~(LINUX_O_CLOEXEC | LINUX_O_NONBLOCK |
 	    LINUX_EFD_SEMAPHORE)) != 0)
 		return (EINVAL);
 
 	return (eventfd_create(td, args->initval, args->flags));
 }
 
 /*
  * Close handler for a Linux eventfd file: tear down the select/knote state
  * and release the eventfd structure attached to fp.
  *
  * Returns EINVAL when fp is not an eventfd or carries no data, 0 otherwise.
  */
 static int
 eventfd_close(struct file *fp, struct thread *td)
 {
 	struct eventfd *efd;
 
 	if (fp->f_type != DTYPE_LINUXEFD)
 		return (EINVAL);
 	efd = fp->f_data;
 	if (efd == NULL)
 		return (EINVAL);
 
 	seldrain(&efd->efd_sel);
 	knlist_destroy(&efd->efd_sel.si_note);
 
 	/* Switch the file to badfileops before its backing state is freed. */
 	fp->f_ops = &badfileops;
 	mtx_destroy(&efd->efd_lock);
 	free(efd, M_EPOLL);
 
 	return (0);
 }
 
 /*
  * Read handler for a Linux eventfd file.
  *
  * The caller's buffer must hold at least sizeof(eventfd_t) bytes (EINVAL
  * otherwise).  While the counter is zero a non-blocking file gets EAGAIN
  * and a blocking one sleeps on the counter until woken.  Once the counter
  * is non-zero, semaphore mode (LINUX_EFD_SEMAPHORE) hands back 1 and
  * decrements the counter; otherwise the whole counter value is handed back
  * and the counter is reset to zero.
  */
 static int
 eventfd_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
     int flags, struct thread *td)
 {
 	struct eventfd *efd;
 	eventfd_t count;
 	int error;
 
 	efd = fp->f_data;
 	if (fp->f_type != DTYPE_LINUXEFD || efd == NULL)
 		return (EINVAL);
 
 	if (uio->uio_resid < sizeof(eventfd_t))
 		return (EINVAL);
 
 	error = 0;
 	mtx_lock(&efd->efd_lock);
 retry:
 	if (efd->efd_count == 0) {
 		if ((fp->f_flag & FNONBLOCK) != 0) {
 			mtx_unlock(&efd->efd_lock);
 			return (EAGAIN);
 		}
 		/* Re-check the counter after every successful sleep. */
 		error = mtx_sleep(&efd->efd_count, &efd->efd_lock, PCATCH, "lefdrd", 0);
 		if (error == 0)
 			goto retry;
 	}
 	if (error == 0) {
 		if ((efd->efd_flags & LINUX_EFD_SEMAPHORE) != 0) {
 			count = 1;
 			--efd->efd_count;
 		} else {
 			count = efd->efd_count;
 			efd->efd_count = 0;
 		}
 		/* The counter changed: notify knotes, pollers and sleepers. */
 		KNOTE_LOCKED(&efd->efd_sel.si_note, 0);
 		selwakeup(&efd->efd_sel);
 		wakeup(&efd->efd_count);
 		mtx_unlock(&efd->efd_lock);
 		/* Copy the value out only after the lock has been dropped. */
 		error = uiomove(&count, sizeof(eventfd_t), uio);
 	} else
 		mtx_unlock(&efd->efd_lock);
 
 	return (error);
 }
 
 /*
  * Write handler for a Linux eventfd file.
  *
  * The caller must supply at least sizeof(eventfd_t) bytes (EINVAL
  * otherwise); the value UINT64_MAX is rejected with EINVAL.  The value is
  * added to the counter.  While the addition would not leave the counter
  * below UINT64_MAX, a non-blocking file gets EAGAIN and a blocking one
  * sleeps on the counter until woken.
  */
 static int
 eventfd_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
      int flags, struct thread *td)
 {
 	struct eventfd *efd;
 	eventfd_t count;
 	int error;
 
 	efd = fp->f_data;
 	if (fp->f_type != DTYPE_LINUXEFD || efd == NULL)
 		return (EINVAL);
 
 	if (uio->uio_resid < sizeof(eventfd_t))
 		return (EINVAL);
 
 	error = uiomove(&count, sizeof(eventfd_t), uio);
 	if (error != 0)
 		return (error);
 	if (count == UINT64_MAX)
 		return (EINVAL);
 
 	mtx_lock(&efd->efd_lock);
 retry:
 	/* True when efd_count + count would reach UINT64_MAX or overflow. */
 	if (UINT64_MAX - efd->efd_count <= count) {
 		if ((fp->f_flag & FNONBLOCK) != 0) {
 			mtx_unlock(&efd->efd_lock);
 			/* Do not return the number of bytes written */
 			uio->uio_resid += sizeof(eventfd_t);
 			return (EAGAIN);
 		}
 		/* Re-check the counter after every successful sleep. */
 		error = mtx_sleep(&efd->efd_count, &efd->efd_lock,
 		    PCATCH, "lefdwr", 0);
 		if (error == 0)
 			goto retry;
 	}
 	if (error == 0) {
 		efd->efd_count += count;
 		/* The counter changed: notify knotes, pollers and sleepers. */
 		KNOTE_LOCKED(&efd->efd_sel.si_note, 0);
 		selwakeup(&efd->efd_sel);
 		wakeup(&efd->efd_count);
 	}
 	mtx_unlock(&efd->efd_lock);
 
 	return (error);
 }
 
 /*
  * Poll handler for a Linux eventfd file.  The file is readable while the
  * counter is non-zero and writable while the counter is below
  * UINT64_MAX - 1.  When none of the requested events is ready the thread
  * is recorded for a later selwakeup().
  *
  * Returns POLLERR when fp is not an eventfd or carries no data.
  */
 static int
 eventfd_poll(struct file *fp, int events, struct ucred *active_cred,
     struct thread *td)
 {
 	struct eventfd *efd;
 	int rdmask, wrmask, revents;
 
 	if (fp->f_type != DTYPE_LINUXEFD)
 		return (POLLERR);
 	efd = fp->f_data;
 	if (efd == NULL)
 		return (POLLERR);
 
 	rdmask = events & (POLLIN | POLLRDNORM);
 	wrmask = events & (POLLOUT | POLLWRNORM);
 	revents = 0;
 
 	mtx_lock(&efd->efd_lock);
 	if (rdmask != 0 && efd->efd_count > 0)
 		revents |= rdmask;
 	if (wrmask != 0 && efd->efd_count < UINT64_MAX - 1)
 		revents |= wrmask;
 	if (revents == 0)
 		selrecord(td, &efd->efd_sel);
 	mtx_unlock(&efd->efd_lock);
 
 	return (revents);
 }
 
 /*
  * Attach a knote to a Linux eventfd file.  EVFILT_READ and EVFILT_WRITE
  * are supported; any other filter, or a file that is not an eventfd,
  * yields EINVAL.
  */
 /*ARGSUSED*/
 static int
 eventfd_kqfilter(struct file *fp, struct knote *kn)
 {
 	struct eventfd *efd;
 
 	if (fp->f_type != DTYPE_LINUXEFD)
 		return (EINVAL);
 	efd = fp->f_data;
 	if (efd == NULL)
 		return (EINVAL);
 
 	mtx_lock(&efd->efd_lock);
 	if (kn->kn_filter == EVFILT_READ) {
 		kn->kn_fop = &eventfd_rfiltops;
 	} else if (kn->kn_filter == EVFILT_WRITE) {
 		kn->kn_fop = &eventfd_wfiltops;
 	} else {
 		mtx_unlock(&efd->efd_lock);
 		return (EINVAL);
 	}
 
 	/* The filter callbacks find the eventfd through kn_hook. */
 	kn->kn_hook = efd;
 	knlist_add(&efd->efd_sel.si_note, kn, 1);
 	mtx_unlock(&efd->efd_lock);
 
 	return (0);
 }
 
 /*
  * Detach a knote from the eventfd stored in its kn_hook, holding the
  * eventfd lock around the knlist removal.
  */
 static void
 filt_eventfddetach(struct knote *kn)
 {
 	struct eventfd *efd;
 
 	efd = kn->kn_hook;
 
 	mtx_lock(&efd->efd_lock);
 	knlist_remove(&efd->efd_sel.si_note, kn, 1);
 	mtx_unlock(&efd->efd_lock);
 }
 
 /*
  * Read filter for an eventfd knote: active while the counter is non-zero.
  * Must be called with the eventfd lock held.
  */
 /*ARGSUSED*/
 static int
 filt_eventfdread(struct knote *kn, long hint)
 {
 	struct eventfd *efd;
 
 	efd = kn->kn_hook;
 	mtx_assert(&efd->efd_lock, MA_OWNED);
 
 	return (efd->efd_count > 0);
 }
 
 /*
  * Write filter for an eventfd knote: active while the counter is below
  * UINT64_MAX - 1.  Must be called with the eventfd lock held.
  */
 /*ARGSUSED*/
 static int
 filt_eventfdwrite(struct knote *kn, long hint)
 {
 	struct eventfd *efd;
 
 	efd = kn->kn_hook;
 	mtx_assert(&efd->efd_lock, MA_OWNED);
 
 	return (efd->efd_count < UINT64_MAX - 1);
 }
 
 /*
  * ioctl handler.  The type check accepts both DTYPE_LINUXEFD and
  * DTYPE_LINUXTFD files (presumably because timerfd files share this
  * handler -- confirm against the timerfdops table).
  *
  * FIONBIO sets or clears FNONBLOCK on the file; FIOASYNC is accepted and
  * ignored.  Both return 0.  Every other command returns ENXIO.
  */
 /*ARGSUSED*/
 static int
 eventfd_ioctl(struct file *fp, u_long cmd, void *data,
     struct ucred *active_cred, struct thread *td)
 {
 
 	if (fp->f_data == NULL || (fp->f_type != DTYPE_LINUXEFD &&
 	    fp->f_type != DTYPE_LINUXTFD))
 		return (EINVAL);
 
 	switch (cmd)
 	{
 	case FIONBIO:
 		if ((*(int *)data))
 			atomic_set_int(&fp->f_flag, FNONBLOCK);
 		else
 			atomic_clear_int(&fp->f_flag, FNONBLOCK);
 		/* FALLTHROUGH */
 	case FIOASYNC:
 		return (0);
 	default:
 		return (ENXIO);
 	}
 }
 
 /*
  * stat handler for eventfd files: not supported, always returns ENXIO.
  */
 /*ARGSUSED*/
 static int
 eventfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return (ENXIO);
 }
 
 /*
  * kinfo handler for eventfd files: reports the file type as
  * KF_TYPE_UNKNOWN and fills in nothing else.
  */
 /*ARGSUSED*/
 static int
 eventfd_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
 {
 
 	kif->kf_type = KF_TYPE_UNKNOWN;
 	return (0);
 }
 
 int
 linux_timerfd_create(struct thread *td, struct linux_timerfd_create_args *args)
 {
 	struct filedesc *fdp;
 	struct timerfd *tfd;
 	struct file *fp;
 	clockid_t clockid;
 	int fflags, fd, error;
 
 	if ((args->flags & ~LINUX_TFD_CREATE_FLAGS) != 0)
 		return (EINVAL);
 
 	error = linux_to_native_clockid(&clockid, args->clockid);
 	if (error != 0)
 		return (error);
 	if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC)
 		return (EINVAL);
 
 	fflags = 0;
 	if ((args->flags & LINUX_TFD_CLOEXEC) != 0)
 		fflags |= O_CLOEXEC;
 
 	fdp = td->td_proc->p_fd;
 	error = falloc(td, &fp, &fd, fflags);
 	if (error != 0)
 		return (error);
 
 	tfd = malloc(sizeof(*tfd), M_EPOLL, M_WAITOK | M_ZERO);
 	tfd->tfd_clockid = clockid;
 	mtx_init(&tfd->tfd_lock, "timerfd", NULL, MTX_DEF);
 
 	callout_init_mtx(&tfd->tfd_callout, &tfd->tfd_lock, 0);
 	knlist_init_mtx(&tfd->tfd_sel.si_note, &tfd->tfd_lock);
 
 	fflags = FREAD;
 	if ((args->flags & LINUX_O_NONBLOCK) != 0)
 		fflags |= FNONBLOCK;
 
 	finit(fp, fflags, DTYPE_LINUXTFD, tfd, &timerfdops);
 	fdrop(fp, td);
 
 	td->td_retval[0] = fd;
 	return (error);
 }
 
 /*
  * Close handler for a Linux timerfd file: disarm the timer, wait for the
  * expiry callout to finish, then release the timerfd state.
  *
  * Returns EINVAL when fp is not a timerfd or carries no data, 0 otherwise.
  */
 static int
 timerfd_close(struct file *fp, struct thread *td)
 {
 	struct timerfd *tfd;
 
 	tfd = fp->f_data;
 	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
 		return (EINVAL);
 
 	timespecclear(&tfd->tfd_time.it_value);
 	timespecclear(&tfd->tfd_time.it_interval);
 
 	/*
 	 * The callout was initialised with callout_init_mtx() on tfd_lock,
 	 * so its handler runs with that mutex held.  callout_drain() waits
 	 * for a running handler and must therefore be called without
 	 * tfd_lock held, or the two would deadlock.
 	 */
 	callout_drain(&tfd->tfd_callout);
 
 	seldrain(&tfd->tfd_sel);
 	knlist_destroy(&tfd->tfd_sel.si_note);
 
 	/* Switch the file to badfileops before its backing state is freed. */
 	fp->f_ops = &badfileops;
 	mtx_destroy(&tfd->tfd_lock);
 	free(tfd, M_EPOLL);
 
 	return (0);
 }
 
 /*
  * Read handler for a Linux timerfd file.
  *
  * The caller's buffer must hold at least sizeof(timerfd_t) bytes (EINVAL
  * otherwise).  If the timer is marked canceled the expiry count is reset
  * and ECANCELED is returned.  While the count is zero a non-blocking file
  * gets EAGAIN and a blocking one sleeps on the count until woken.  On
  * success the accumulated count is copied out and reset to zero.
  */
 static int
 timerfd_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
     int flags, struct thread *td)
 {
 	struct timerfd *tfd;
 	timerfd_t count;
 	int error;
 
 	tfd = fp->f_data;
 	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
 		return (EINVAL);
 
 	if (uio->uio_resid < sizeof(timerfd_t))
 		return (EINVAL);
 
 	error = 0;
 	mtx_lock(&tfd->tfd_lock);
 retry:
 	if (tfd->tfd_canceled) {
 		tfd->tfd_count = 0;
 		mtx_unlock(&tfd->tfd_lock);
 		return (ECANCELED);
 	}
 	if (tfd->tfd_count == 0) {
 		if ((fp->f_flag & FNONBLOCK) != 0) {
 			mtx_unlock(&tfd->tfd_lock);
 			return (EAGAIN);
 		}
 		/* Re-check both conditions after every successful sleep. */
 		error = mtx_sleep(&tfd->tfd_count, &tfd->tfd_lock, PCATCH, "ltfdrd", 0);
 		if (error == 0)
 			goto retry;
 	}
 	if (error == 0) {
 		count = tfd->tfd_count;
 		tfd->tfd_count = 0;
 		mtx_unlock(&tfd->tfd_lock);
 		/* Copy the value out only after the lock has been dropped. */
 		error = uiomove(&count, sizeof(timerfd_t), uio);
 	} else
 		mtx_unlock(&tfd->tfd_lock);
 
 	return (error);
 }
 
 /*
  * Poll handler for a Linux timerfd file.  The file is readable while the
  * expiry count is non-zero; when nothing requested is ready the thread is
  * recorded for a later selwakeup().
  *
  * Returns POLLERR when fp is not a timerfd or carries no data.
  */
 static int
 timerfd_poll(struct file *fp, int events, struct ucred *active_cred,
     struct thread *td)
 {
 	struct timerfd *tfd;
 	int rdmask, revents;
 
 	if (fp->f_type != DTYPE_LINUXTFD)
 		return (POLLERR);
 	tfd = fp->f_data;
 	if (tfd == NULL)
 		return (POLLERR);
 
 	rdmask = events & (POLLIN | POLLRDNORM);
 	revents = 0;
 
 	mtx_lock(&tfd->tfd_lock);
 	if (rdmask != 0 && tfd->tfd_count > 0)
 		revents = rdmask;
 	if (revents == 0)
 		selrecord(td, &tfd->tfd_sel);
 	mtx_unlock(&tfd->tfd_lock);
 
 	return (revents);
 }
 
 /*
  * Attach a knote to a Linux timerfd file.  Only EVFILT_READ is supported;
  * any other filter, or a file that is not a timerfd, yields EINVAL.
  */
 /*ARGSUSED*/
 static int
 timerfd_kqfilter(struct file *fp, struct knote *kn)
 {
 	struct timerfd *tfd;
 
 	if (fp->f_type != DTYPE_LINUXTFD)
 		return (EINVAL);
 	tfd = fp->f_data;
 	if (tfd == NULL)
 		return (EINVAL);
 
 	if (kn->kn_filter != EVFILT_READ)
 		return (EINVAL);
 	kn->kn_fop = &timerfd_rfiltops;
 
 	/* The filter callbacks find the timerfd through kn_hook. */
 	kn->kn_hook = tfd;
 	knlist_add(&tfd->tfd_sel.si_note, kn, 0);
 
 	return (0);
 }
 
 /*
  * Detach a knote from the timerfd stored in its kn_hook, holding the
  * timerfd lock around the knlist removal.
  */
 static void
 filt_timerfddetach(struct knote *kn)
 {
 	struct timerfd *tfd;
 
 	tfd = kn->kn_hook;
 
 	mtx_lock(&tfd->tfd_lock);
 	knlist_remove(&tfd->tfd_sel.si_note, kn, 1);
 	mtx_unlock(&tfd->tfd_lock);
 }
 
 /*
  * Read filter for a timerfd knote: active while the expiry count is
  * non-zero.
  */
 /*ARGSUSED*/
 static int
 filt_timerfdread(struct knote *kn, long hint)
 {
 	struct timerfd *tfd;
 
 	tfd = kn->kn_hook;
 
 	return (tfd->tfd_count > 0);
 }
 
 /*
  * stat handler for timerfd files: not supported, always returns ENXIO.
  */
 /*ARGSUSED*/
 static int
 timerfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return (ENXIO);
 }
 
 /*
  * kinfo handler for timerfd files: reports the file type as
  * KF_TYPE_UNKNOWN and fills in nothing else.
  */
 /*ARGSUSED*/
 static int
 timerfd_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
 {
 
 	kif->kf_type = KF_TYPE_UNKNOWN;
 	return (0);
 }
 
 /*
  * Store the current time of the timerfd's clock in *ts: getnanotime() for
  * CLOCK_REALTIME, getnanouptime() for everything else.
  */
 static void
 linux_timerfd_clocktime(struct timerfd *tfd, struct timespec *ts)
 {
 
 	switch (tfd->tfd_clockid) {
 	case CLOCK_REALTIME:
 		getnanotime(ts);
 		break;
 	default:	/* CLOCK_MONOTONIC */
 		getnanouptime(ts);
 		break;
 	}
 }
 
 /*
  * Report the timer's current setting in *ots.  The stored setting is
  * copied and, when the timer is armed (it_value non-zero), it_value is
  * converted to the time remaining relative to the timerfd's clock.  A
  * remainder that is negative in seconds or exactly zero is reported as
  * one nanosecond.
  */
 static void
 linux_timerfd_curval(struct timerfd *tfd, struct itimerspec *ots)
 {
 	struct timespec cts;
 
 	linux_timerfd_clocktime(tfd, &cts);
 	*ots = tfd->tfd_time;
 	if (ots->it_value.tv_sec != 0 || ots->it_value.tv_nsec != 0) {
-		timespecsub(&ots->it_value, &cts);
+		timespecsub(&ots->it_value, &cts, &ots->it_value);
 		if (ots->it_value.tv_sec < 0 ||
 		    (ots->it_value.tv_sec == 0 &&
 		     ots->it_value.tv_nsec == 0)) {
 			ots->it_value.tv_sec  = 0;
 			ots->it_value.tv_nsec = 1;
 		}
 	}
 }
 
 /*
  * timerfd_gettime(2) entry point: look up the timerfd behind args->fd,
  * take a snapshot of its current setting under the timerfd lock, convert
  * it to the Linux layout and copy it out to args->old_value.
  *
  * Returns EINVAL when the descriptor is not a timerfd.
  */
 int
 linux_timerfd_gettime(struct thread *td, struct linux_timerfd_gettime_args *args)
 {
 	struct l_itimerspec lots;
 	struct itimerspec ots;
 	struct timerfd *tfd;
 	struct file *fp;
 	int error;
 
 	error = fget(td, args->fd, &cap_read_rights, &fp);
 	if (error != 0)
 		return (error);
 
 	tfd = fp->f_data;
 	if (fp->f_type == DTYPE_LINUXTFD && tfd != NULL) {
 		mtx_lock(&tfd->tfd_lock);
 		linux_timerfd_curval(tfd, &ots);
 		mtx_unlock(&tfd->tfd_lock);
 
 		error = native_to_linux_itimerspec(&lots, &ots);
 		if (error == 0)
 			error = copyout(&lots, args->old_value, sizeof(lots));
 	} else {
 		error = EINVAL;
 	}
 
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * timerfd_settime(2) entry point.
  *
  * Flags outside LINUX_TFD_SETTIME_FLAGS are rejected with EINVAL.  The new
  * setting is copied in and converted before the descriptor is looked up.
  * Under the timerfd lock the previous setting is captured (if the caller
  * asked for it), the new one is stored and the callout is either re-armed
  * or stopped.  The previous setting is copied out after the lock has been
  * dropped.
  */
 int
 linux_timerfd_settime(struct thread *td, struct linux_timerfd_settime_args *args)
 {
 	struct l_itimerspec lots;
 	struct itimerspec nts, ots;
 	struct timespec cts, ts;
 	struct timerfd *tfd;
 	struct timeval tv;
 	struct file *fp;
 	int error;
 
 	if ((args->flags & ~LINUX_TFD_SETTIME_FLAGS) != 0)
 		return (EINVAL);
 
 	error = copyin(args->new_value, &lots, sizeof(lots));
 	if (error != 0)
 		return (error);
 	error = linux_to_native_itimerspec(&nts, &lots);
 	if (error != 0)
 		return (error);
 
 	error = fget(td, args->fd, &cap_write_rights, &fp);
 	if (error != 0)
 		return (error);
 	tfd = fp->f_data;
 	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) {
 		error = EINVAL;
 		goto out;
 	}
 
 	mtx_lock(&tfd->tfd_lock);
 	/* A zero it_value disarms the timer, so the interval is dropped too. */
 	if (!timespecisset(&nts.it_value))
 		timespecclear(&nts.it_interval);
 	if (args->old_value != NULL)
 		linux_timerfd_curval(tfd, &ots);
 
 	tfd->tfd_time = nts;
 	if (timespecisset(&nts.it_value)) {
 		linux_timerfd_clocktime(tfd, &cts);
 		/*
 		 * ts is the delay handed to the callout.  For a relative
 		 * timer it is the requested value as given and the stored
 		 * it_value is made absolute by adding the current time; for
 		 * an absolute timer ts is the requested time minus now.
 		 */
 		ts = nts.it_value;
 		if ((args->flags & LINUX_TFD_TIMER_ABSTIME) == 0) {
-			timespecadd(&tfd->tfd_time.it_value, &cts);
+			timespecadd(&tfd->tfd_time.it_value, &cts,
+				&tfd->tfd_time.it_value);
 		} else {
-			timespecsub(&ts, &cts);
+			timespecsub(&ts, &cts, &ts);
 		}
 		TIMESPEC_TO_TIMEVAL(&tv, &ts);
 		callout_reset(&tfd->tfd_callout, tvtohz(&tv),
 			linux_timerfd_expire, tfd);
 		tfd->tfd_canceled = false;
 	} else {
 		tfd->tfd_canceled = true;
 		callout_stop(&tfd->tfd_callout);
 	}
 	mtx_unlock(&tfd->tfd_lock);
 
 	/* ots was filled in above only when old_value is not NULL. */
 	if (args->old_value != NULL) {
 		error = native_to_linux_itimerspec(&lots, &ots);
 		if (error == 0)
 			error = copyout(&lots, args->old_value, sizeof(lots));
 	}
 
 out:
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Callout handler for a timerfd; arg is the struct timerfd.
  *
  * If the timerfd's clock has reached the stored expiry time, the expiry is
  * advanced by the interval (periodic timer) or cleared (single shot), the
  * callout is re-armed for the time remaining when an expiry is still set,
  * the expiry count is bumped and waiters are notified.  If the expiry time
  * has not been reached yet and the timer is still armed, the callout is
  * simply re-armed for the remaining time.
  *
  * NOTE(review): KNOTE_LOCKED is used here, so tfd_lock is presumably held
  * on entry via callout_init_mtx() -- confirm.
  */
 static void
 linux_timerfd_expire(void *arg)
 {
 	struct timespec cts, ts;
 	struct timeval tv;
 	struct timerfd *tfd;
 
 	tfd = (struct timerfd *)arg;
 
 	linux_timerfd_clocktime(tfd, &cts);
 	if (timespeccmp(&cts, &tfd->tfd_time.it_value, >=)) {
 		if (timespecisset(&tfd->tfd_time.it_interval))
 			timespecadd(&tfd->tfd_time.it_value,
-				    &tfd->tfd_time.it_interval);
+				    &tfd->tfd_time.it_interval,
+				    &tfd->tfd_time.it_value);
 		else
 			/* single shot timer */
 			timespecclear(&tfd->tfd_time.it_value);
 		if (timespecisset(&tfd->tfd_time.it_value)) {
-			ts = tfd->tfd_time.it_value;
-			timespecsub(&ts, &cts);
+			timespecsub(&tfd->tfd_time.it_value, &cts, &ts);
 			TIMESPEC_TO_TIMEVAL(&tv, &ts);
 			callout_reset(&tfd->tfd_callout, tvtohz(&tv),
 				linux_timerfd_expire, tfd);
 		}
 		tfd->tfd_count++;
 		KNOTE_LOCKED(&tfd->tfd_sel.si_note, 0);
 		selwakeup(&tfd->tfd_sel);
 		wakeup(&tfd->tfd_count);
 	} else if (timespecisset(&tfd->tfd_time.it_value)) {
 		/* Fired before the expiry time: wait for the remainder. */
-		ts = tfd->tfd_time.it_value;
-		timespecsub(&ts, &cts);
+		timespecsub(&tfd->tfd_time.it_value, &cts, &ts);
 		TIMESPEC_TO_TIMEVAL(&tv, &ts);
 		callout_reset(&tfd->tfd_callout, tvtohz(&tv),
 		    linux_timerfd_expire, tfd);
 	}
 }
Index: head/sys/compat/linux/linux_futex.c
===================================================================
--- head/sys/compat/linux/linux_futex.c	(revision 336913)
+++ head/sys/compat/linux/linux_futex.c	(revision 336914)
@@ -1,1335 +1,1335 @@
 /*	$NetBSD: linux_futex.c,v 1.7 2006/07/24 19:01:49 manu Exp $ */
 
 /*-
  * SPDX-License-Identifier: BSD-4-Clause
  *
  * Copyright (c) 2009-2016 Dmitry Chagin
  * Copyright (c) 2005 Emmanuel Dreyfus
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by Emmanuel Dreyfus
  * 4. The name of the author may not be used to endorse or promote
  *    products derived from this software without specific prior written
  *    permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE THE AUTHOR AND CONTRIBUTORS ``AS IS''
  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 #if 0
 __KERNEL_RCSID(1, "$NetBSD: linux_futex.c,v 1.7 2006/07/24 19:01:49 manu Exp $");
 #endif
 
 #include "opt_compat.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/imgact.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/queue.h>
 #include <sys/sched.h>
 #include <sys/sdt.h>
 #include <sys/umtx.h>
 
 #include <vm/vm_extern.h>
 
 #ifdef COMPAT_LINUX32
 #include <machine/../linux32/linux.h>
 #include <machine/../linux32/linux32_proto.h>
 #else
 #include <machine/../linux/linux.h>
 #include <machine/../linux/linux_proto.h>
 #endif
 #include <compat/linux/linux_dtrace.h>
 #include <compat/linux/linux_emul.h>
 #include <compat/linux/linux_futex.h>
 #include <compat/linux/linux_timer.h>
 #include <compat/linux/linux_util.h>
 
 /* DTrace init */
 LIN_SDT_PROVIDER_DECLARE(LINUX_DTRACE);
 
 /**
  * Futex part for the special DTrace module "locks".
  */
 LIN_SDT_PROBE_DEFINE1(locks, futex_mtx, locked, "struct mtx *");
 LIN_SDT_PROBE_DEFINE1(locks, futex_mtx, unlock, "struct mtx *");
 
 /**
  * Per futex probes.
  */
 LIN_SDT_PROBE_DEFINE1(futex, futex, create, "struct sx *");
 LIN_SDT_PROBE_DEFINE1(futex, futex, destroy, "struct sx *");
 
 /**
  * DTrace probes in this module.
  */
 LIN_SDT_PROBE_DEFINE2(futex, futex_put, entry, "struct futex *",
     "struct waiting_proc *");
 LIN_SDT_PROBE_DEFINE3(futex, futex_put, destroy, "uint32_t *", "uint32_t",
     "int");
 LIN_SDT_PROBE_DEFINE3(futex, futex_put, unlock, "uint32_t *", "uint32_t",
     "int");
 LIN_SDT_PROBE_DEFINE0(futex, futex_put, return);
 LIN_SDT_PROBE_DEFINE3(futex, futex_get0, entry, "uint32_t *", "struct futex **",
     "uint32_t");
 LIN_SDT_PROBE_DEFINE1(futex, futex_get0, umtx_key_get_error, "int");
 LIN_SDT_PROBE_DEFINE3(futex, futex_get0, shared, "uint32_t *", "uint32_t",
     "int");
 LIN_SDT_PROBE_DEFINE1(futex, futex_get0, null, "uint32_t *");
 LIN_SDT_PROBE_DEFINE3(futex, futex_get0, new, "uint32_t *", "uint32_t", "int");
 LIN_SDT_PROBE_DEFINE1(futex, futex_get0, return, "int");
 LIN_SDT_PROBE_DEFINE3(futex, futex_get, entry, "uint32_t *",
     "struct waiting_proc **", "struct futex **");
 LIN_SDT_PROBE_DEFINE0(futex, futex_get, error);
 LIN_SDT_PROBE_DEFINE1(futex, futex_get, return, "int");
 LIN_SDT_PROBE_DEFINE3(futex, futex_sleep, entry, "struct futex *",
     "struct waiting_proc **", "struct timespec *");
 LIN_SDT_PROBE_DEFINE5(futex, futex_sleep, requeue_error, "int", "uint32_t *",
     "struct waiting_proc *", "uint32_t *", "uint32_t");
 LIN_SDT_PROBE_DEFINE3(futex, futex_sleep, sleep_error, "int", "uint32_t *",
     "struct waiting_proc *");
 LIN_SDT_PROBE_DEFINE1(futex, futex_sleep, return, "int");
 LIN_SDT_PROBE_DEFINE3(futex, futex_wake, entry, "struct futex *", "int",
     "uint32_t");
 LIN_SDT_PROBE_DEFINE3(futex, futex_wake, iterate, "uint32_t",
     "struct waiting_proc *", "uint32_t");
 LIN_SDT_PROBE_DEFINE1(futex, futex_wake, wakeup, "struct waiting_proc *");
 LIN_SDT_PROBE_DEFINE1(futex, futex_wake, return, "int");
 LIN_SDT_PROBE_DEFINE4(futex, futex_requeue, entry, "struct futex *", "int",
     "struct futex *", "int");
 LIN_SDT_PROBE_DEFINE1(futex, futex_requeue, wakeup, "struct waiting_proc *");
 LIN_SDT_PROBE_DEFINE3(futex, futex_requeue, requeue, "uint32_t *",
     "struct waiting_proc *", "uint32_t");
 LIN_SDT_PROBE_DEFINE1(futex, futex_requeue, return, "int");
 LIN_SDT_PROBE_DEFINE4(futex, futex_wait, entry, "struct futex *",
     "struct waiting_proc **", "struct timespec *", "uint32_t");
 LIN_SDT_PROBE_DEFINE1(futex, futex_wait, sleep_error, "int");
 LIN_SDT_PROBE_DEFINE1(futex, futex_wait, return, "int");
 LIN_SDT_PROBE_DEFINE3(futex, futex_atomic_op, entry, "struct thread *",
     "int", "uint32_t");
 LIN_SDT_PROBE_DEFINE4(futex, futex_atomic_op, decoded_op, "int", "int", "int",
     "int");
 LIN_SDT_PROBE_DEFINE0(futex, futex_atomic_op, missing_access_check);
 LIN_SDT_PROBE_DEFINE1(futex, futex_atomic_op, unimplemented_op, "int");
 LIN_SDT_PROBE_DEFINE1(futex, futex_atomic_op, unimplemented_cmp, "int");
 LIN_SDT_PROBE_DEFINE1(futex, futex_atomic_op, return, "int");
 LIN_SDT_PROBE_DEFINE2(futex, linux_sys_futex, entry, "struct thread *",
     "struct linux_sys_futex_args *");
 LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unimplemented_clockswitch);
 LIN_SDT_PROBE_DEFINE1(futex, linux_sys_futex, copyin_error, "int");
 LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, invalid_cmp_requeue_use);
 LIN_SDT_PROBE_DEFINE3(futex, linux_sys_futex, debug_wait, "uint32_t *",
     "uint32_t", "uint32_t");
 LIN_SDT_PROBE_DEFINE4(futex, linux_sys_futex, debug_wait_value_neq,
     "uint32_t *", "uint32_t", "int", "uint32_t");
 LIN_SDT_PROBE_DEFINE3(futex, linux_sys_futex, debug_wake, "uint32_t *",
     "uint32_t", "uint32_t");
 LIN_SDT_PROBE_DEFINE5(futex, linux_sys_futex, debug_cmp_requeue, "uint32_t *",
     "uint32_t", "uint32_t", "uint32_t *", "struct l_timespec *");
 LIN_SDT_PROBE_DEFINE2(futex, linux_sys_futex, debug_cmp_requeue_value_neq,
     "uint32_t", "int");
 LIN_SDT_PROBE_DEFINE5(futex, linux_sys_futex, debug_wake_op, "uint32_t *",
     "int", "uint32_t", "uint32_t *", "uint32_t");
 LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unhandled_efault);
 LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unimplemented_lock_pi);
 LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unimplemented_unlock_pi);
 LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unimplemented_trylock_pi);
 LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, deprecated_requeue);
 LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unimplemented_wait_requeue_pi);
 LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unimplemented_cmp_requeue_pi);
 LIN_SDT_PROBE_DEFINE1(futex, linux_sys_futex, unknown_operation, "int");
 LIN_SDT_PROBE_DEFINE1(futex, linux_sys_futex, return, "int");
 LIN_SDT_PROBE_DEFINE2(futex, linux_set_robust_list, entry, "struct thread *",
     "struct linux_set_robust_list_args *");
 LIN_SDT_PROBE_DEFINE0(futex, linux_set_robust_list, size_error);
 LIN_SDT_PROBE_DEFINE1(futex, linux_set_robust_list, return, "int");
 LIN_SDT_PROBE_DEFINE2(futex, linux_get_robust_list, entry, "struct thread *",
     "struct linux_get_robust_list_args *");
 LIN_SDT_PROBE_DEFINE1(futex, linux_get_robust_list, copyout_error, "int");
 LIN_SDT_PROBE_DEFINE1(futex, linux_get_robust_list, return, "int");
 LIN_SDT_PROBE_DEFINE3(futex, handle_futex_death, entry,
     "struct linux_emuldata *", "uint32_t *", "unsigned int");
 LIN_SDT_PROBE_DEFINE1(futex, handle_futex_death, copyin_error, "int");
 LIN_SDT_PROBE_DEFINE1(futex, handle_futex_death, return, "int");
 LIN_SDT_PROBE_DEFINE3(futex, fetch_robust_entry, entry,
     "struct linux_robust_list **", "struct linux_robust_list **",
     "unsigned int *");
 LIN_SDT_PROBE_DEFINE1(futex, fetch_robust_entry, copyin_error, "int");
 LIN_SDT_PROBE_DEFINE1(futex, fetch_robust_entry, return, "int");
 LIN_SDT_PROBE_DEFINE2(futex, release_futexes, entry, "struct thread *",
     "struct linux_emuldata *");
 LIN_SDT_PROBE_DEFINE1(futex, release_futexes, copyin_error, "int");
 LIN_SDT_PROBE_DEFINE0(futex, release_futexes, return);
 
 struct futex;
 
 /* One waiter queued on a futex. */
 struct waiting_proc {
 	uint32_t	wp_flags;	/* FUTEX_WP_* flags */
 	struct futex	*wp_futex;	/* presumably the futex waited on -- confirm */
 	TAILQ_ENTRY(waiting_proc) wp_list;	/* linkage on f_waiting_proc */
 };
 
 /* Kernel-side state of one futex, linked on the global futex_list. */
 struct futex {
 	struct mtx	f_lck;		/* mutex behind the FUTEX_LOCK macros */
 	uint32_t	*f_uaddr;	/* user-supplied value, for debug */
 	struct umtx_key	f_key;		/* key the futex is looked up by */
 	uint32_t	f_refcount;	/* changed under FUTEXES_LOCK */
 	uint32_t	f_bitset;
 	LIST_ENTRY(futex) f_list;	/* linkage on futex_list */
 	TAILQ_HEAD(lf_waiting_proc, waiting_proc) f_waiting_proc;
 };
 
 struct futex_list futex_list;
 
 /*
  * Helpers operating on the per-futex mutex f_lck.  FUTEX_INIT and
  * FUTEX_DESTROY also fire the futex create/destroy DTrace probes.
  */
 #define FUTEX_LOCK(f)		mtx_lock(&(f)->f_lck)
 #define FUTEX_LOCKED(f)		mtx_owned(&(f)->f_lck)
 #define FUTEX_UNLOCK(f)		mtx_unlock(&(f)->f_lck)
 #define FUTEX_INIT(f)		do { \
 				    mtx_init(&(f)->f_lck, "ftlk", NULL, \
 					MTX_DUPOK); \
 				    LIN_SDT_PROBE1(futex, futex, create, \
 					&(f)->f_lck); \
 				} while (0)
 #define FUTEX_DESTROY(f)	do { \
 				    LIN_SDT_PROBE1(futex, futex, destroy, \
 					&(f)->f_lck); \
 				    mtx_destroy(&(f)->f_lck); \
 				} while (0)
 #define FUTEX_ASSERT_LOCKED(f)	mtx_assert(&(f)->f_lck, MA_OWNED)
 #define FUTEX_ASSERT_UNLOCKED(f) mtx_assert(&(f)->f_lck, MA_NOTOWNED)
 
 /*
  * Global mutex for the futex list.  The lock/unlock helpers fire the
  * "locks" DTrace probes while the mutex is held.
  */
 struct mtx futex_mtx;			/* protects the futex list */
 #define FUTEXES_LOCK		do { \
 				    mtx_lock(&futex_mtx); \
 				    LIN_SDT_PROBE1(locks, futex_mtx, \
 					locked, &futex_mtx); \
 				} while (0)
 #define FUTEXES_UNLOCK		do { \
 				    LIN_SDT_PROBE1(locks, futex_mtx, \
 					unlock, &futex_mtx); \
 				    mtx_unlock(&futex_mtx); \
 				} while (0)
 
 /* flags for futex_get() */
 #define FUTEX_CREATE_WP		0x1	/* create a waiting_proc */
 #define FUTEX_DONTCREATE	0x2	/* don't create the futex if it does not exist */
 #define FUTEX_DONTEXISTS	0x4	/* return EINVAL if the futex exists */
 #define	FUTEX_SHARED		0x8	/* shared futex */
 #define	FUTEX_DONTLOCK		0x10	/* don't lock the futex */
 
 /* wp_flags */
 #define FUTEX_WP_REQUEUED	0x1	/* wp requeued - wp was moved from the
 					 * wp_list of the futex the thread sleeps
 					 * on to the wp_list of another futex.
 					 */
 #define FUTEX_WP_REMOVED	0x2	/* wp was woken up and removed from the
 					 * futex wp_list to prevent a double wakeup.
 					 */
 
 static void futex_put(struct futex *, struct waiting_proc *);
 static int futex_get0(uint32_t *, struct futex **f, uint32_t);
 static int futex_get(uint32_t *, struct waiting_proc **, struct futex **,
     uint32_t);
 static int futex_sleep(struct futex *, struct waiting_proc *, struct timespec *);
 static int futex_wake(struct futex *, int, uint32_t);
 static int futex_requeue(struct futex *, int, struct futex *, int);
 static int futex_copyin_timeout(int, struct l_timespec *, int,
     struct timespec *);
 static int futex_wait(struct futex *, struct waiting_proc *, struct timespec *,
     uint32_t);
 static void futex_lock(struct futex *);
 static void futex_unlock(struct futex *);
 static int futex_atomic_op(struct thread *, int, uint32_t *);
 static int handle_futex_death(struct linux_emuldata *, uint32_t *,
     unsigned int);
 static int fetch_robust_entry(struct linux_robust_list **,
     struct linux_robust_list **, unsigned int *);
 
 /*
  * Copy a Linux timespec in from user address luts and convert it to a
  * native timespec in *ts.
  *
  * When clockrt is set the current nanotime() is subtracted from the value;
  * otherwise, for LINUX_FUTEX_WAIT_BITSET, the current nanouptime() is
  * subtracted.  For any other op the converted value is returned as is.
  *
  * Returns 0 or the error from copyin()/linux_to_native_timespec().
  */
 static int
 futex_copyin_timeout(int op, struct l_timespec *luts, int clockrt,
     struct timespec *ts)
 {
 	struct l_timespec lts;
 	struct timespec kts;
 	int error;
 
 	error = copyin(luts, &lts, sizeof(lts));
 	if (error)
 		return (error);
 
 	error = linux_to_native_timespec(ts, &lts);
 	if (error)
 		return (error);
 	if (clockrt) {
 		nanotime(&kts);
-		timespecsub(ts, &kts);
+		timespecsub(ts, &kts, ts);
 	} else if (op == LINUX_FUTEX_WAIT_BITSET) {
 		nanouptime(&kts);
-		timespecsub(ts, &kts);
+		timespecsub(ts, &kts, ts);
 	}
 	return (error);
 }
 
 /*
  * Drop one reference on futex f and, when wp is not NULL, release that
  * waiting_proc.
  *
  * wp is unlinked from the futex's waiter list unless it is already marked
  * FUTEX_WP_REMOVED, and is then freed.  When the last reference goes away
  * the futex is taken off the global list, its umtx key is released and the
  * futex itself is destroyed and freed.  In both cases the futex is
  * unlocked first if the caller holds its lock.
  */
 static void
 futex_put(struct futex *f, struct waiting_proc *wp)
 {
 	LIN_SDT_PROBE2(futex, futex_put, entry, f, wp);
 
 	if (wp != NULL) {
 		if ((wp->wp_flags & FUTEX_WP_REMOVED) == 0)
 			TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list);
 		free(wp, M_FUTEX_WP);
 	}
 
 	FUTEXES_LOCK;
 	if (--f->f_refcount == 0) {
 		/* Last reference: unlink under the list lock, then tear down. */
 		LIST_REMOVE(f, f_list);
 		FUTEXES_UNLOCK;
 		if (FUTEX_LOCKED(f))
 			futex_unlock(f);
 
 		LIN_SDT_PROBE3(futex, futex_put, destroy, f->f_uaddr,
 		    f->f_refcount, f->f_key.shared);
 		LINUX_CTR3(sys_futex, "futex_put destroy uaddr %p ref %d "
 		    "shared %d", f->f_uaddr, f->f_refcount, f->f_key.shared);
 		umtx_key_release(&f->f_key);
 		FUTEX_DESTROY(f);
 		free(f, M_FUTEX);
 
 		LIN_SDT_PROBE0(futex, futex_put, return);
 		return;
 	}
 
 	/* Other references remain: only report and unlock. */
 	LIN_SDT_PROBE3(futex, futex_put, unlock, f->f_uaddr, f->f_refcount,
 	    f->f_key.shared);
 	LINUX_CTR3(sys_futex, "futex_put uaddr %p ref %d shared %d",
 	    f->f_uaddr, f->f_refcount, f->f_key.shared);
 	FUTEXES_UNLOCK;
 	if (FUTEX_LOCKED(f))
 		futex_unlock(f);
 
 	LIN_SDT_PROBE0(futex, futex_put, return);
 }
 
 /*
  * Look up, and by default create, the futex for user address uaddr.
  *
  * flags is a mask of the futex_get() flags:
  *   FUTEX_SHARED     - obtain the umtx key with AUTO_SHARE rather than
  *                      THREAD_SHARE;
  *   FUTEX_DONTEXISTS - fail with EINVAL if the futex already exists;
  *   FUTEX_DONTCREATE - do not create a missing futex; 0 is returned with
  *                      *newf left NULL;
  *   FUTEX_DONTLOCK   - do not lock the futex that is returned.
  *
  * On success *newf points to the futex, which holds a reference for the
  * caller.  Returns 0, EINVAL (see above) or the error from umtx_key_get().
  */
 static int
 futex_get0(uint32_t *uaddr, struct futex **newf, uint32_t flags)
 {
 	struct futex *f, *tmpf;
 	struct umtx_key key;
 	int error;
 
 	LIN_SDT_PROBE3(futex, futex_get0, entry, uaddr, newf, flags);
 
 	*newf = tmpf = NULL;
 
 	error = umtx_key_get(uaddr, TYPE_FUTEX, (flags & FUTEX_SHARED) ?
 	    AUTO_SHARE : THREAD_SHARE, &key);
 	if (error) {
 		LIN_SDT_PROBE1(futex, futex_get0, umtx_key_get_error, error);
 		LIN_SDT_PROBE1(futex, futex_get0, return, error);
 		return (error);
 	}
 retry:
 	FUTEXES_LOCK;
 	LIST_FOREACH(f, &futex_list, f_list) {
 		if (umtx_key_match(&f->f_key, &key)) {
 			/*
 			 * A matching futex is on the list.  Discard the one
 			 * allocated on a previous pass, if any.
 			 */
 			if (tmpf != NULL) {
 				if (FUTEX_LOCKED(tmpf))
 					futex_unlock(tmpf);
 				FUTEX_DESTROY(tmpf);
 				free(tmpf, M_FUTEX);
 			}
 			if (flags & FUTEX_DONTEXISTS) {
 				FUTEXES_UNLOCK;
 				umtx_key_release(&key);
 
 				LIN_SDT_PROBE1(futex, futex_get0, return,
 				    EINVAL);
 				return (EINVAL);
 			}
 
 			/*
 			 * Increment refcount of the found futex to
 			 * prevent it from deallocation before FUTEX_LOCK()
 			 */
 			++f->f_refcount;
 			FUTEXES_UNLOCK;
 			umtx_key_release(&key);
 
 			if ((flags & FUTEX_DONTLOCK) == 0)
 				futex_lock(f);
 			*newf = f;
 			LIN_SDT_PROBE3(futex, futex_get0, shared, uaddr,
 			    f->f_refcount, f->f_key.shared);
 			LINUX_CTR3(sys_futex, "futex_get uaddr %p ref %d shared %d",
 			    uaddr, f->f_refcount, f->f_key.shared);
 
 			LIN_SDT_PROBE1(futex, futex_get0, return, 0);
 			return (0);
 		}
 	}
 
 	/* No match on the list. */
 	if (flags & FUTEX_DONTCREATE) {
 		FUTEXES_UNLOCK;
 		umtx_key_release(&key);
 		LIN_SDT_PROBE1(futex, futex_get0, null, uaddr);
 		LINUX_CTR1(sys_futex, "futex_get uaddr %p null", uaddr);
 
 		LIN_SDT_PROBE1(futex, futex_get0, return, 0);
 		return (0);
 	}
 
 	if (tmpf == NULL) {
 		/*
 		 * First pass: drop the list lock, allocate and initialise a
 		 * new futex, then search the list again before inserting it.
 		 */
 		FUTEXES_UNLOCK;
 		tmpf = malloc(sizeof(*tmpf), M_FUTEX, M_WAITOK | M_ZERO);
 		tmpf->f_uaddr = uaddr;
 		tmpf->f_key = key;
 		tmpf->f_refcount = 1;
 		tmpf->f_bitset = FUTEX_BITSET_MATCH_ANY;
 		FUTEX_INIT(tmpf);
 		TAILQ_INIT(&tmpf->f_waiting_proc);
 
 		/*
 		 * Lock the new futex before an insert into the futex_list
 		 * to prevent futex usage by other.
 		 */
 		if ((flags & FUTEX_DONTLOCK) == 0)
 			futex_lock(tmpf);
 		goto retry;
 	}
 
 	/* Second pass and still no match: publish the new futex. */
 	LIST_INSERT_HEAD(&futex_list, tmpf, f_list);
 	FUTEXES_UNLOCK;
 
 	LIN_SDT_PROBE3(futex, futex_get0, new, uaddr, tmpf->f_refcount,
 	    tmpf->f_key.shared);
 	LINUX_CTR3(sys_futex, "futex_get uaddr %p ref %d shared %d new",
 	    uaddr, tmpf->f_refcount, tmpf->f_key.shared);
 	*newf = tmpf;
 
 	LIN_SDT_PROBE1(futex, futex_get0, return, 0);
 	return (0);
 }
 
 /*
  * Obtain the futex for uaddr through futex_get0() and, when
  * FUTEX_CREATE_WP is set in flags, also allocate a waiter record, store it
  * in *wp and queue it at the head of the futex's waiter list.
  *
  * wp is only touched when FUTEX_CREATE_WP is requested.  If futex_get0()
  * fails, the freshly allocated waiter record is freed again and the error
  * is returned; otherwise the result is 0.
  */
 static int
 futex_get(uint32_t *uaddr, struct waiting_proc **wp, struct futex **f,
     uint32_t flags)
 {
 	struct waiting_proc *nwp;
 	int rc;
 
 	LIN_SDT_PROBE3(futex, futex_get, entry, uaddr, wp, f);
 
 	/* Allocate the waiter up front, before any futex is looked up. */
 	if ((flags & FUTEX_CREATE_WP) != 0) {
 		nwp = malloc(sizeof(*nwp), M_FUTEX_WP, M_WAITOK);
 		nwp->wp_flags = 0;
 		*wp = nwp;
 	}
 
 	rc = futex_get0(uaddr, f, flags);
 	if (rc != 0) {
 		LIN_SDT_PROBE0(futex, futex_get, error);
 
 		if ((flags & FUTEX_CREATE_WP) != 0)
 			free(*wp, M_FUTEX_WP);
 
 		LIN_SDT_PROBE1(futex, futex_get, return, rc);
 		return (rc);
 	}
 
 	/* Bind the waiter to the futex it is going to sleep on. */
 	if ((flags & FUTEX_CREATE_WP) != 0) {
 		nwp = *wp;
 		TAILQ_INSERT_HEAD(&(*f)->f_waiting_proc, nwp, wp_list);
 		nwp->wp_futex = *f;
 	}
 
 	LIN_SDT_PROBE1(futex, futex_get, return, rc);
 	return (rc);
 }
 
 /*
  * Take the per-futex lock of f.  A trace record is emitted first, then
  * FUTEX_ASSERT_UNLOCKED() is applied to f before FUTEX_LOCK() is called.
  */
 static inline void
 futex_lock(struct futex *f)
 {
 
 	LINUX_CTR3(sys_futex,
 	    "futex_lock uaddr %p ref %d shared %d",
 	    f->f_uaddr,
 	    f->f_refcount,
 	    f->f_key.shared);
 
 	FUTEX_ASSERT_UNLOCKED(f);
 	FUTEX_LOCK(f);
 }
 
 /*
  * Release the per-futex lock of f.  A trace record is emitted first, then
  * FUTEX_ASSERT_LOCKED() is applied to f before FUTEX_UNLOCK() is called.
  */
 static inline void
 futex_unlock(struct futex *f)
 {
 
 	LINUX_CTR3(sys_futex,
 	    "futex_unlock uaddr %p ref %d shared %d",
 	    f->f_uaddr,
 	    f->f_refcount,
 	    f->f_key.shared);
 
 	FUTEX_ASSERT_LOCKED(f);
 	FUTEX_UNLOCK(f);
 }
 
 /*
  * Put the calling thread to sleep on waiter record wp, using the lock of
  * futex f as the sleep interlock, with an optional timeout ts.
  *
  * f must be locked on entry.  After the sleep ends the waiter record and
  * one futex reference are always released through futex_put().  If the
  * waiter was moved to another futex while asleep (FUTEX_WP_REQUEUED), the
  * reference on the original futex is dropped first and the release is
  * done on wp->wp_futex instead.
  *
  * Returns the value returned by msleep_sbt().
  */
 static int
 futex_sleep(struct futex *f, struct waiting_proc *wp, struct timespec *ts)
 {
 	struct timespec uts;
 	sbintime_t sbt, prec, tmp;
 	time_t over;
 	int error;
 
 	FUTEX_ASSERT_LOCKED(f);
 	if (ts != NULL) {
 		uts = *ts;
 		/* Clamp the seconds part to at most INT32_MAX / 2. */
 		if (uts.tv_sec > INT32_MAX / 2) {
 			over = uts.tv_sec - INT32_MAX / 2;
 			uts.tv_sec -= over;
 		}
 		tmp = tstosbt(uts);
 		/*
 		 * NOTE(review): TIMESEL presumably stores the current time
 		 * in sbt; the relative timeout tmp is added on top so that
 		 * sbt is the absolute deadline passed with C_ABSOLUTE below.
 		 */
 		if (TIMESEL(&sbt, tmp))
 			sbt += tc_tick_sbt;
 		sbt += tmp;
 		/* Allowed timer slop: the timeout scaled down by tc_precexp. */
 		prec = tmp;
 		prec >>= tc_precexp;
 	} else {
 		/* No timeout supplied: sbt of 0 is passed to msleep_sbt(). */
 		sbt = 0;
 		prec = 0;
 	}
 	LIN_SDT_PROBE3(futex, futex_sleep, entry, f, wp, sbt);
 	LINUX_CTR4(sys_futex, "futex_sleep enter uaddr %p wp %p timo %ld ref %d",
 	    f->f_uaddr, wp, sbt, f->f_refcount);
 
 	error = msleep_sbt(wp, &f->f_lck, PCATCH, "futex", sbt, prec, C_ABSOLUTE);
 	if (wp->wp_flags & FUTEX_WP_REQUEUED) {
 		KASSERT(f != wp->wp_futex, ("futex != wp_futex"));
 
 		if (error) {
 			LIN_SDT_PROBE5(futex, futex_sleep, requeue_error, error,
 			    f->f_uaddr, wp, wp->wp_futex->f_uaddr,
 			    wp->wp_futex->f_refcount);
 		}
 
 		LINUX_CTR5(sys_futex, "futex_sleep out error %d uaddr %p wp"
 		    " %p requeued uaddr %p ref %d",
 		    error, f->f_uaddr, wp, wp->wp_futex->f_uaddr,
 		    wp->wp_futex->f_refcount);
 		/*
 		 * Give up the original futex and continue with the one the
 		 * waiter was requeued to; futex_requeue() took a reference
 		 * on it on this waiter's behalf.
 		 */
 		futex_put(f, NULL);
 		f = wp->wp_futex;
 		futex_lock(f);
 	} else {
 		if (error) {
 			LIN_SDT_PROBE3(futex, futex_sleep, sleep_error, error,
 			    f->f_uaddr, wp);
 		}
 		LINUX_CTR3(sys_futex, "futex_sleep out error %d uaddr %p wp %p",
 		    error, f->f_uaddr, wp);
 	}
 
 	/* Frees wp, drops the futex reference and unlocks the futex. */
 	futex_put(f, wp);
 
 	LIN_SDT_PROBE1(futex, futex_sleep, return, error);
 	return (error);
 }
 
 /*
  * Wake waiters queued on futex f whose futex bitset shares at least one
  * bit with bitset, stopping as soon as n waiters have been woken.
  *
  * f must be locked by the caller.  Each woken waiter is marked
  * FUTEX_WP_REMOVED and unlinked from the queue before wakeup_one().
  *
  * Returns the number of waiters woken.  A zero bitset is rejected by
  * returning EINVAL through the same integer result.
  */
 static int
 futex_wake(struct futex *f, int n, uint32_t bitset)
 {
 	struct waiting_proc *cur, *next;
 	int woken;
 
 	LIN_SDT_PROBE3(futex, futex_wake, entry, f, n, bitset);
 
 	if (bitset == 0) {
 		LIN_SDT_PROBE1(futex, futex_wake, return, EINVAL);
 		return (EINVAL);
 	}
 
 	FUTEX_ASSERT_LOCKED(f);
 
 	woken = 0;
 	TAILQ_FOREACH_SAFE(cur, &f->f_waiting_proc, wp_list, next) {
 		LIN_SDT_PROBE3(futex, futex_wake, iterate, f->f_uaddr, cur,
 		    f->f_refcount);
 		LINUX_CTR3(sys_futex, "futex_wake uaddr %p wp %p ref %d",
 		    f->f_uaddr, cur, f->f_refcount);
 
 		/* Only waiters with a matching bit in the bitset qualify. */
 		if ((cur->wp_futex->f_bitset & bitset) != 0) {
 			cur->wp_flags |= FUTEX_WP_REMOVED;
 			TAILQ_REMOVE(&f->f_waiting_proc, cur, wp_list);
 			LIN_SDT_PROBE1(futex, futex_wake, wakeup, cur);
 			wakeup_one(cur);
 
 			woken++;
 			if (woken == n)
 				break;
 		}
 	}
 
 	LIN_SDT_PROBE1(futex, futex_wake, return, woken);
 	return (woken);
 }
 
 /*
  * Wake the first n waiters queued on futex f and move following waiters
  * to futex f2, stopping once n2 waiters have been moved.
  *
  * Both f and f2 must be locked by the caller.  A moved waiter is flagged
  * FUTEX_WP_REQUEUED, re-pointed at f2 and accounted for with an extra
  * reference on f2.
  *
  * Returns the number of waiters processed (woken plus requeued).
  *
  * NOTE(review): the n2 limit is tested only after a waiter has been moved,
  * so one waiter is requeued even when n2 is 0 -- confirm this is intended.
  */
 static int
 futex_requeue(struct futex *f, int n, struct futex *f2, int n2)
 {
 	struct waiting_proc *wp, *wpt;
 	int count = 0;
 
 	LIN_SDT_PROBE4(futex, futex_requeue, entry, f, n, f2, n2);
 
 	FUTEX_ASSERT_LOCKED(f);
 	FUTEX_ASSERT_LOCKED(f2);
 
 	TAILQ_FOREACH_SAFE(wp, &f->f_waiting_proc, wp_list, wpt) {
 		if (++count <= n) {
 			/* Still within the wake quota: wake this waiter. */
 			LINUX_CTR2(sys_futex, "futex_req_wake uaddr %p wp %p",
 			    f->f_uaddr, wp);
 			wp->wp_flags |= FUTEX_WP_REMOVED;
 			TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list);
 			LIN_SDT_PROBE1(futex, futex_requeue, wakeup, wp);
 			wakeup_one(wp);
 		} else {
 			LIN_SDT_PROBE3(futex, futex_requeue, requeue,
 			    f->f_uaddr, wp, f2->f_uaddr);
 			LINUX_CTR3(sys_futex, "futex_requeue uaddr %p wp %p to %p",
 			    f->f_uaddr, wp, f2->f_uaddr);
 			wp->wp_flags |= FUTEX_WP_REQUEUED;
 			/* Move wp to wp_list of f2 futex */
 			TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list);
 			TAILQ_INSERT_HEAD(&f2->f_waiting_proc, wp, wp_list);
 
 			/*
 			 * Thread which sleeps on wp after waking should
 			 * acquire f2 lock, so increment refcount of f2 to
 			 * prevent it from premature deallocation.
 			 */
 			wp->wp_futex = f2;
 			FUTEXES_LOCK;
 			++f2->f_refcount;
 			FUTEXES_UNLOCK;
 			/* count - n is the number of waiters moved so far. */
 			if (count - n >= n2)
 				break;
 		}
 	}
 
 	LIN_SDT_PROBE1(futex, futex_requeue, return, count);
 	return (count);
 }
 
 /*
  * Block the calling thread on futex f using waiter record wp until it is
  * woken, the optional timeout ts expires or the sleep is interrupted.
  *
  * The caller passes in f locked and wp queued on it.  This function
  * consumes both on every path: futex_sleep() releases them after the
  * sleep, and the early EINVAL exit releases them explicitly.
  *
  * bitset is stored in f->f_bitset and later matched against the bitset of
  * wakeups; it must not be zero.
  *
  * Returns 0 when woken, EINVAL for a zero bitset, ETIMEDOUT when the sleep
  * reported EWOULDBLOCK, or any other error returned by futex_sleep().
  */
 static int
 futex_wait(struct futex *f, struct waiting_proc *wp, struct timespec *ts,
     uint32_t bitset)
 {
 	int error;
 
 	LIN_SDT_PROBE4(futex, futex_wait, entry, f, wp, ts, bitset);
 
 	if (bitset == 0) {
 		/*
 		 * The caller does not clean up after us, so release the
 		 * waiter record, the futex reference and the futex lock
 		 * here instead of leaking them.
 		 */
 		futex_put(f, wp);
 
 		LIN_SDT_PROBE1(futex, futex_wait, return, EINVAL);
 		return (EINVAL);
 	}
 
 	f->f_bitset = bitset;
 	error = futex_sleep(f, wp, ts);
 	if (error)
 		LIN_SDT_PROBE1(futex, futex_wait, sleep_error, error);
 	/* Linux reports an expired timeout as ETIMEDOUT. */
 	if (error == EWOULDBLOCK)
 		error = ETIMEDOUT;
 
 	LIN_SDT_PROBE1(futex, futex_wait, return, error);
 	return (error);
 }
 
 /*
  * Execute the operation encoded in a FUTEX_WAKE_OP argument on the user
  * word at uaddr and evaluate the encoded comparison against the word's
  * previous value.
  *
  * Layout of encoded_op, from the most significant bit down:
  *   bit  31      FUTEX_OP_OPARG_SHIFT: use (1 << oparg) as the operand
  *   bits 28..30  operation (FUTEX_OP_*)
  *   bits 24..27  comparison (FUTEX_OP_CMP_*)
  *   bits 12..23  oparg, signed 12-bit value
  *   bits  0..11  cmparg, signed 12-bit value
  *
  * The fields are decoded with unsigned arithmetic because left-shifting a
  * signed int into or past the sign bit is undefined.  With the shift flag
  * set, the shift count is reduced modulo 32 because oparg comes from
  * userland and a count outside 0..31 is undefined as well.
  *
  * Returns the result of the comparison (0 or 1) on success.  A non-zero
  * result of the futex_*l() helper is returned unchanged, and an unknown
  * operation or comparison yields -ENOSYS.
  */
 static int
 futex_atomic_op(struct thread *td, int encoded_op, uint32_t *uaddr)
 {
 	uint32_t enc = (uint32_t)encoded_op;
 	int op = (int)((enc >> 28) & 7);
 	int cmp = (int)((enc >> 24) & 15);
 	/* (x ^ 0x800) - 0x800 sign-extends a 12-bit field. */
 	int oparg = (int)(((enc >> 12) & 0xfff) ^ 0x800) - 0x800;
 	int cmparg = (int)((enc & 0xfff) ^ 0x800) - 0x800;
 	int oldval = 0, ret;
 
 	LIN_SDT_PROBE3(futex, futex_atomic_op, entry, td, encoded_op, uaddr);
 
 	if (enc & ((uint32_t)FUTEX_OP_OPARG_SHIFT << 28))
 		oparg = (int)(1U << (oparg & 31));
 
 	LIN_SDT_PROBE4(futex, futex_atomic_op, decoded_op, op, cmp, oparg,
 	    cmparg);
 
 	/* XXX: Linux verifies access here and returns EFAULT */
 	LIN_SDT_PROBE0(futex, futex_atomic_op, missing_access_check);
 
 	switch (op) {
 	case FUTEX_OP_SET:
 		ret = futex_xchgl(oparg, uaddr, &oldval);
 		break;
 	case FUTEX_OP_ADD:
 		ret = futex_addl(oparg, uaddr, &oldval);
 		break;
 	case FUTEX_OP_OR:
 		ret = futex_orl(oparg, uaddr, &oldval);
 		break;
 	case FUTEX_OP_ANDN:
 		ret = futex_andl(~oparg, uaddr, &oldval);
 		break;
 	case FUTEX_OP_XOR:
 		ret = futex_xorl(oparg, uaddr, &oldval);
 		break;
 	default:
 		LIN_SDT_PROBE1(futex, futex_atomic_op, unimplemented_op, op);
 		ret = -ENOSYS;
 		break;
 	}
 
 	if (ret) {
 		LIN_SDT_PROBE1(futex, futex_atomic_op, return, ret);
 		return (ret);
 	}
 
 	switch (cmp) {
 	case FUTEX_OP_CMP_EQ:
 		ret = (oldval == cmparg);
 		break;
 	case FUTEX_OP_CMP_NE:
 		ret = (oldval != cmparg);
 		break;
 	case FUTEX_OP_CMP_LT:
 		ret = (oldval < cmparg);
 		break;
 	case FUTEX_OP_CMP_GE:
 		ret = (oldval >= cmparg);
 		break;
 	case FUTEX_OP_CMP_LE:
 		ret = (oldval <= cmparg);
 		break;
 	case FUTEX_OP_CMP_GT:
 		ret = (oldval > cmparg);
 		break;
 	default:
 		LIN_SDT_PROBE1(futex, futex_atomic_op, unimplemented_cmp, cmp);
 		ret = -ENOSYS;
 	}
 
 	LIN_SDT_PROBE1(futex, futex_atomic_op, return, ret);
 	return (ret);
 }
 
 int
 linux_sys_futex(struct thread *td, struct linux_sys_futex_args *args)
 {
 	int clockrt, nrwake, op_ret, ret;
 	struct linux_pemuldata *pem;
 	struct waiting_proc *wp;
 	struct futex *f, *f2;
 	struct timespec uts, *ts;
 	int error, save;
 	uint32_t flags, val;
 
 	LIN_SDT_PROBE2(futex, linux_sys_futex, entry, td, args);
 
 	if (args->op & LINUX_FUTEX_PRIVATE_FLAG) {
 		flags = 0;
 		args->op &= ~LINUX_FUTEX_PRIVATE_FLAG;
 	} else
 		flags = FUTEX_SHARED;
 
 	/*
 	 * Currently support for switching between CLOCK_MONOTONIC and
 	 * CLOCK_REALTIME is not present. However Linux forbids the use of
 	 * FUTEX_CLOCK_REALTIME with any op except FUTEX_WAIT_BITSET and
 	 * FUTEX_WAIT_REQUEUE_PI.
 	 */
 	clockrt = args->op & LINUX_FUTEX_CLOCK_REALTIME;
 	args->op = args->op & ~LINUX_FUTEX_CLOCK_REALTIME;
 	if (clockrt && args->op != LINUX_FUTEX_WAIT_BITSET &&
 		args->op != LINUX_FUTEX_WAIT_REQUEUE_PI) {
 		LIN_SDT_PROBE0(futex, linux_sys_futex,
 		    unimplemented_clockswitch);
 		LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS);
 		return (ENOSYS);
 	}
 
 	error = 0;
 	f = f2 = NULL;
 
 	switch (args->op) {
 	case LINUX_FUTEX_WAIT:
 		args->val3 = FUTEX_BITSET_MATCH_ANY;
 		/* FALLTHROUGH */
 
 	case LINUX_FUTEX_WAIT_BITSET:
 		LIN_SDT_PROBE3(futex, linux_sys_futex, debug_wait, args->uaddr,
 		    args->val, args->val3);
 		LINUX_CTR3(sys_futex, "WAIT uaddr %p val 0x%x bitset 0x%x",
 		    args->uaddr, args->val, args->val3);
 
 		if (args->timeout != NULL) {
 			error = futex_copyin_timeout(args->op, args->timeout,
 			    clockrt, &uts);
 			if (error) {
 				LIN_SDT_PROBE1(futex, linux_sys_futex, copyin_error,
 				    error);
 				LIN_SDT_PROBE1(futex, linux_sys_futex, return, error);
 				return (error);
 			}
 			ts = &uts;
 		} else
 			ts = NULL;
 
 retry0:
 		error = futex_get(args->uaddr, &wp, &f,
 		    flags | FUTEX_CREATE_WP);
 		if (error) {
 			LIN_SDT_PROBE1(futex, linux_sys_futex, return, error);
 			return (error);
 		}
 
 		error = copyin_nofault(args->uaddr, &val, sizeof(val));
 		if (error) {
 			futex_put(f, wp);
 			error = copyin(args->uaddr, &val, sizeof(val));
 			if (error == 0)
 				goto retry0;
 			LIN_SDT_PROBE1(futex, linux_sys_futex, copyin_error,
 			    error);
 			LINUX_CTR1(sys_futex, "WAIT copyin failed %d",
 			    error);
 			LIN_SDT_PROBE1(futex, linux_sys_futex, return, error);
 			return (error);
 		}
 		if (val != args->val) {
 			LIN_SDT_PROBE4(futex, linux_sys_futex,
 			    debug_wait_value_neq, args->uaddr, args->val, val,
 			    args->val3);
 			LINUX_CTR3(sys_futex,
 			    "WAIT uaddr %p val 0x%x != uval 0x%x",
 			    args->uaddr, args->val, val);
 			futex_put(f, wp);
 
 			LIN_SDT_PROBE1(futex, linux_sys_futex, return,
 			    EWOULDBLOCK);
 			return (EWOULDBLOCK);
 		}
 
 		error = futex_wait(f, wp, ts, args->val3);
 		break;
 
 	case LINUX_FUTEX_WAKE:
 		args->val3 = FUTEX_BITSET_MATCH_ANY;
 		/* FALLTHROUGH */
 
 	case LINUX_FUTEX_WAKE_BITSET:
 		LIN_SDT_PROBE3(futex, linux_sys_futex, debug_wake, args->uaddr,
 		    args->val, args->val3);
 		LINUX_CTR3(sys_futex, "WAKE uaddr %p nrwake 0x%x bitset 0x%x",
 		    args->uaddr, args->val, args->val3);
 
 		error = futex_get(args->uaddr, NULL, &f,
 		    flags | FUTEX_DONTCREATE);
 		if (error) {
 			LIN_SDT_PROBE1(futex, linux_sys_futex, return, error);
 			return (error);
 		}
 
 		if (f == NULL) {
 			td->td_retval[0] = 0;
 
 			LIN_SDT_PROBE1(futex, linux_sys_futex, return, error);
 			return (error);
 		}
 		td->td_retval[0] = futex_wake(f, args->val, args->val3);
 		futex_put(f, NULL);
 		break;
 
 	case LINUX_FUTEX_CMP_REQUEUE:
 		LIN_SDT_PROBE5(futex, linux_sys_futex, debug_cmp_requeue,
 		    args->uaddr, args->val, args->val3, args->uaddr2,
 		    args->timeout);
 		LINUX_CTR5(sys_futex, "CMP_REQUEUE uaddr %p "
 		    "nrwake 0x%x uval 0x%x uaddr2 %p nrequeue 0x%x",
 		    args->uaddr, args->val, args->val3, args->uaddr2,
 		    args->timeout);
 
 		/*
 		 * Linux allows this, we would not, it is an incorrect
 		 * usage of declared ABI, so return EINVAL.
 		 */
 		if (args->uaddr == args->uaddr2) {
 			LIN_SDT_PROBE0(futex, linux_sys_futex,
 			    invalid_cmp_requeue_use);
 			LIN_SDT_PROBE1(futex, linux_sys_futex, return, EINVAL);
 			return (EINVAL);
 		}
 
 retry1:
 		error = futex_get(args->uaddr, NULL, &f, flags | FUTEX_DONTLOCK);
 		if (error) {
 			LIN_SDT_PROBE1(futex, linux_sys_futex, return, error);
 			return (error);
 		}
 
 		/*
 		 * To avoid deadlocks return EINVAL if second futex
 		 * exists at this time.
 		 *
 		 * Glibc fall back to FUTEX_WAKE in case of any error
 		 * returned by FUTEX_CMP_REQUEUE.
 		 */
 		error = futex_get(args->uaddr2, NULL, &f2,
 		    flags | FUTEX_DONTEXISTS | FUTEX_DONTLOCK);
 		if (error) {
 			futex_put(f, NULL);
 
 			LIN_SDT_PROBE1(futex, linux_sys_futex, return, error);
 			return (error);
 		}
 		futex_lock(f);
 		futex_lock(f2);
 		error = copyin_nofault(args->uaddr, &val, sizeof(val));
 		if (error) {
 			futex_put(f2, NULL);
 			futex_put(f, NULL);
 			error = copyin(args->uaddr, &val, sizeof(val));
 			if (error == 0)
 				goto retry1;
 			LIN_SDT_PROBE1(futex, linux_sys_futex, copyin_error,
 			    error);
 			LINUX_CTR1(sys_futex, "CMP_REQUEUE copyin failed %d",
 			    error);
 			LIN_SDT_PROBE1(futex, linux_sys_futex, return, error);
 			return (error);
 		}
 		if (val != args->val3) {
 			LIN_SDT_PROBE2(futex, linux_sys_futex,
 			    debug_cmp_requeue_value_neq, args->val, val);
 			LINUX_CTR2(sys_futex, "CMP_REQUEUE val 0x%x != uval 0x%x",
 			    args->val, val);
 			futex_put(f2, NULL);
 			futex_put(f, NULL);
 
 			LIN_SDT_PROBE1(futex, linux_sys_futex, return, EAGAIN);
 			return (EAGAIN);
 		}
 
 		nrwake = (int)(unsigned long)args->timeout;
 		td->td_retval[0] = futex_requeue(f, args->val, f2, nrwake);
 		futex_put(f2, NULL);
 		futex_put(f, NULL);
 		break;
 
 	case LINUX_FUTEX_WAKE_OP:
 		LIN_SDT_PROBE5(futex, linux_sys_futex, debug_wake_op,
 		    args->uaddr, args->op, args->val, args->uaddr2, args->val3);
 		LINUX_CTR5(sys_futex, "WAKE_OP "
 		    "uaddr %p nrwake 0x%x uaddr2 %p op 0x%x nrwake2 0x%x",
 		    args->uaddr, args->val, args->uaddr2, args->val3,
 		    args->timeout);
 
 		if (args->uaddr == args->uaddr2) {
 			LIN_SDT_PROBE1(futex, linux_sys_futex, return, EINVAL);
 			return (EINVAL);
 		}
 
 retry2:
 		error = futex_get(args->uaddr, NULL, &f, flags | FUTEX_DONTLOCK);
 		if (error) {
 			LIN_SDT_PROBE1(futex, linux_sys_futex, return, error);
 			return (error);
 		}
 
 		error = futex_get(args->uaddr2, NULL, &f2, flags | FUTEX_DONTLOCK);
 		if (error) {
 			futex_put(f, NULL);
 
 			LIN_SDT_PROBE1(futex, linux_sys_futex, return, error);
 			return (error);
 		}
 		futex_lock(f);
 		futex_lock(f2);
 
 		/*
 		 * This function returns positive number as results and
 		 * negative as errors
 		 */
 		save = vm_fault_disable_pagefaults();
 		op_ret = futex_atomic_op(td, args->val3, args->uaddr2);
 		vm_fault_enable_pagefaults(save);
 
 		LINUX_CTR2(sys_futex, "WAKE_OP atomic_op uaddr %p ret 0x%x",
 		    args->uaddr, op_ret);
 
 		if (op_ret < 0) {
 			if (f2 != NULL)
 				futex_put(f2, NULL);
 			futex_put(f, NULL);
 			error = copyin(args->uaddr2, &val, sizeof(val));
 			if (error == 0)
 				goto retry2;
 			LIN_SDT_PROBE1(futex, linux_sys_futex, return, error);
 			return (error);
 		}
 
 		ret = futex_wake(f, args->val, args->val3);
 
 		if (op_ret > 0) {
 			op_ret = 0;
 			nrwake = (int)(unsigned long)args->timeout;
 
 			if (f2 != NULL)
 				op_ret += futex_wake(f2, nrwake, args->val3);
 			else
 				op_ret += futex_wake(f, nrwake, args->val3);
 			ret += op_ret;
 
 		}
 		if (f2 != NULL)
 			futex_put(f2, NULL);
 		futex_put(f, NULL);
 		td->td_retval[0] = ret;
 		break;
 
 	case LINUX_FUTEX_LOCK_PI:
 		/* not yet implemented */
 		pem = pem_find(td->td_proc);
 		if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) {
 			linux_msg(td,
 				  "linux_sys_futex: "
 				  "unsupported futex_pi op\n");
 			pem->flags |= LINUX_XUNSUP_FUTEXPIOP;
 			LIN_SDT_PROBE0(futex, linux_sys_futex,
 			    unimplemented_lock_pi);
 		}
 		LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS);
 		return (ENOSYS);
 
 	case LINUX_FUTEX_UNLOCK_PI:
 		/* not yet implemented */
 		pem = pem_find(td->td_proc);
 		if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) {
 			linux_msg(td,
 				  "linux_sys_futex: "
 				  "unsupported futex_pi op\n");
 			pem->flags |= LINUX_XUNSUP_FUTEXPIOP;
 			LIN_SDT_PROBE0(futex, linux_sys_futex,
 			    unimplemented_unlock_pi);
 		}
 		LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS);
 		return (ENOSYS);
 
 	case LINUX_FUTEX_TRYLOCK_PI:
 		/* not yet implemented */
 		pem = pem_find(td->td_proc);
 		if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) {
 			linux_msg(td,
 				  "linux_sys_futex: "
 				  "unsupported futex_pi op\n");
 			pem->flags |= LINUX_XUNSUP_FUTEXPIOP;
 			LIN_SDT_PROBE0(futex, linux_sys_futex,
 			    unimplemented_trylock_pi);
 		}
 		LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS);
 		return (ENOSYS);
 
 	case LINUX_FUTEX_REQUEUE:
 		/*
 		 * Glibc does not use this operation since version 2.3.3,
 		 * as it is racy and replaced by FUTEX_CMP_REQUEUE operation.
 		 * Glibc versions prior to 2.3.3 fall back to FUTEX_WAKE when
 		 * FUTEX_REQUEUE returned EINVAL.
 		 */
 		pem = pem_find(td->td_proc);
 		if ((pem->flags & LINUX_XDEPR_REQUEUEOP) == 0) {
 			linux_msg(td,
 				  "linux_sys_futex: "
 				  "unsupported futex_requeue op\n");
 			pem->flags |= LINUX_XDEPR_REQUEUEOP;
 			LIN_SDT_PROBE0(futex, linux_sys_futex,
 			    deprecated_requeue);
 		}
 
 		LIN_SDT_PROBE1(futex, linux_sys_futex, return, EINVAL);
 		return (EINVAL);
 
 	case LINUX_FUTEX_WAIT_REQUEUE_PI:
 		/* not yet implemented */
 		pem = pem_find(td->td_proc);
 		if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) {
 			linux_msg(td,
 				  "linux_sys_futex: "
 				  "unsupported futex_pi op\n");
 			pem->flags |= LINUX_XUNSUP_FUTEXPIOP;
 			LIN_SDT_PROBE0(futex, linux_sys_futex,
 			    unimplemented_wait_requeue_pi);
 		}
 		LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS);
 		return (ENOSYS);
 
 	case LINUX_FUTEX_CMP_REQUEUE_PI:
 		/* not yet implemented */
 		pem = pem_find(td->td_proc);
 		if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) {
 			linux_msg(td,
 				  "linux_sys_futex: "
 				  "unsupported futex_pi op\n");
 			pem->flags |= LINUX_XUNSUP_FUTEXPIOP;
 			LIN_SDT_PROBE0(futex, linux_sys_futex,
 			    unimplemented_cmp_requeue_pi);
 		}
 		LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS);
 		return (ENOSYS);
 
 	default:
 		linux_msg(td,
 			  "linux_sys_futex: unknown op %d\n", args->op);
 		LIN_SDT_PROBE1(futex, linux_sys_futex, unknown_operation,
 		    args->op);
 		LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS);
 		return (ENOSYS);
 	}
 
 	LIN_SDT_PROBE1(futex, linux_sys_futex, return, error);
 	return (error);
 }
 
 int
 linux_set_robust_list(struct thread *td, struct linux_set_robust_list_args *args)
 {
 	struct linux_emuldata *em;
 
 	LIN_SDT_PROBE2(futex, linux_set_robust_list, entry, td, args);
 
 	if (args->len != sizeof(struct linux_robust_list_head)) {
 		LIN_SDT_PROBE0(futex, linux_set_robust_list, size_error);
 		LIN_SDT_PROBE1(futex, linux_set_robust_list, return, EINVAL);
 		return (EINVAL);
 	}
 
 	em = em_find(td);
 	em->robust_futexes = args->head;
 
 	LIN_SDT_PROBE1(futex, linux_set_robust_list, return, 0);
 	return (0);
 }
 
 /*
  * Linux get_robust_list(2): copy out the robust futex list head registered
  * for a thread, together with the size of the list head structure.
  *
  * args->pid == 0 selects the calling thread; otherwise the thread is
  * looked up with tdfind().  For another thread the call fails with ESRCH
  * when the thread is not found and with EPERM when its process is not a
  * Linux ABI process or when one of the privilege/debug checks fails.
  *
  * The length is copied to args->len and the head pointer to args->head.
  * Returns 0 on success, EFAULT when copying out the length fails, or the
  * copyout() error when copying out the head pointer fails.
  */
 int
 linux_get_robust_list(struct thread *td, struct linux_get_robust_list_args *args)
 {
 	struct linux_emuldata *em;
 	struct linux_robust_list_head *head;
 	l_size_t len = sizeof(struct linux_robust_list_head);
 	struct thread *td2;
 	int error = 0;
 
 	LIN_SDT_PROBE2(futex, linux_get_robust_list, entry, td, args);
 
 	if (!args->pid) {
 		em = em_find(td);
 		KASSERT(em != NULL, ("get_robust_list: emuldata notfound.\n"));
 		head = em->robust_futexes;
 	} else {
 		/*
 		 * NOTE(review): tdfind() presumably returns with the target
 		 * process locked; every exit below pairs it with
 		 * PROC_UNLOCK().
 		 */
 		td2 = tdfind(args->pid, -1);
 		if (td2 == NULL) {
 			LIN_SDT_PROBE1(futex, linux_get_robust_list, return,
 			    ESRCH);
 			return (ESRCH);
 		}
 		/* Only Linux ABI processes carry robust futex state. */
 		if (SV_PROC_ABI(td2->td_proc) != SV_ABI_LINUX) {
 			LIN_SDT_PROBE1(futex, linux_get_robust_list, return,
 			    EPERM);
 			PROC_UNLOCK(td2->td_proc);
 			return (EPERM);
 		}
 
 		em = em_find(td2);
 		KASSERT(em != NULL, ("get_robust_list: emuldata notfound.\n"));
 		/* XXX: ptrace? */
 		if (priv_check(td, PRIV_CRED_SETUID) ||
 		    priv_check(td, PRIV_CRED_SETEUID) ||
 		    p_candebug(td, td2->td_proc)) {
 			PROC_UNLOCK(td2->td_proc);
 
 			LIN_SDT_PROBE1(futex, linux_get_robust_list, return,
 			    EPERM);
 			return (EPERM);
 		}
 		head = em->robust_futexes;
 
 		PROC_UNLOCK(td2->td_proc);
 	}
 
 	error = copyout(&len, args->len, sizeof(l_size_t));
 	if (error) {
 		LIN_SDT_PROBE1(futex, linux_get_robust_list, copyout_error,
 		    error);
 		LIN_SDT_PROBE1(futex, linux_get_robust_list, return, EFAULT);
 		return (EFAULT);
 	}
 
 	error = copyout(&head, args->head, sizeof(head));
 	if (error) {
 		LIN_SDT_PROBE1(futex, linux_get_robust_list, copyout_error,
 		    error);
 	}
 
 	LIN_SDT_PROBE1(futex, linux_get_robust_list, return, error);
 	return (error);
 }
 
 /*
  * Process one robust futex at user address uaddr on behalf of the thread
  * described by em.
  *
  * If the TID bits of the futex word equal em->em_tid, the word is replaced
  * atomically by FUTEX_OWNER_DIED, keeping only the FUTEX_WAITERS bit of the
  * old value.  When pi is zero and FUTEX_WAITERS was set, one waiter of the
  * futex is woken.  Futex words with a different TID are left untouched.
  *
  * Returns 0 on success, EFAULT when the futex word cannot be read or
  * updated, or the error returned by futex_get().
  */
 static int
 handle_futex_death(struct linux_emuldata *em, uint32_t *uaddr,
     unsigned int pi)
 {
 	uint32_t uval, nval, mval;
 	struct futex *f;
 	int error;
 
 	LIN_SDT_PROBE3(futex, handle_futex_death, entry, em, uaddr, pi);
 
 retry:
 	error = copyin(uaddr, &uval, 4);
 	if (error) {
 		LIN_SDT_PROBE1(futex, handle_futex_death, copyin_error, error);
 		LIN_SDT_PROBE1(futex, handle_futex_death, return, EFAULT);
 		return (EFAULT);
 	}
 	if ((uval & FUTEX_TID_MASK) == em->em_tid) {
 		mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
 		nval = casuword32(uaddr, uval, mval);
 
 		/* A result of -1 is treated as a fault on the user word. */
 		if (nval == -1) {
 			LIN_SDT_PROBE1(futex, handle_futex_death, return,
 			    EFAULT);
 			return (EFAULT);
 		}
 
 		/* The word changed under us: re-read it and try again. */
 		if (nval != uval)
 			goto retry;
 
 		if (!pi && (uval & FUTEX_WAITERS)) {
 			error = futex_get(uaddr, NULL, &f,
 			    FUTEX_DONTCREATE | FUTEX_SHARED);
 			if (error) {
 				LIN_SDT_PROBE1(futex, handle_futex_death,
 				    return, error);
 				return (error);
 			}
 			/* f is NULL when no kernel futex exists for uaddr. */
 			if (f != NULL) {
 				futex_wake(f, 1, FUTEX_BITSET_MATCH_ANY);
 				futex_put(f, NULL);
 			}
 		}
 	}
 
 	LIN_SDT_PROBE1(futex, handle_futex_death, return, 0);
 	return (0);
 }
 
 /*
  * Read one robust list link from user memory at head and split it into
  * its two parts: the entry pointer, with bit 0 cleared, is stored in
  * *entry, and bit 0 itself is stored in *pi.
  *
  * Returns 0 on success or EFAULT when the copyin() fails; the outputs are
  * not written on failure.
  */
 static int
 fetch_robust_entry(struct linux_robust_list **entry,
     struct linux_robust_list **head, unsigned int *pi)
 {
 	l_ulong raw;
 	int rc;
 
 	LIN_SDT_PROBE3(futex, fetch_robust_entry, entry, entry, head, pi);
 
 	rc = copyin((const void *)head, &raw, sizeof(raw));
 	if (rc != 0) {
 		LIN_SDT_PROBE1(futex, fetch_robust_entry, copyin_error, rc);
 		LIN_SDT_PROBE1(futex, fetch_robust_entry, return, EFAULT);
 		return (EFAULT);
 	}
 
 	/* The low bit is a flag; the remaining bits are the address. */
 	*pi = raw & 1;
 	*entry = (void *)(raw & ~1UL);
 
 	LIN_SDT_PROBE1(futex, fetch_robust_entry, return, 0);
 	return (0);
 }
 
 /*
  * This walks the list of robust futexes releasing them.
  *
  * The list head registered in em->robust_futexes lives in user memory, so
  * every link is fetched with copyin().  Each entry other than the pending
  * one is passed to handle_futex_death(), using the futex_offset read from
  * the list head to locate the futex word.  The pending entry, if any, is
  * handled last.  The walk is abandoned silently on any copyin or
  * handle_futex_death() failure and visits at most 2048 entries.
  */
 void
 release_futexes(struct thread *td, struct linux_emuldata *em)
 {
 	struct linux_robust_list_head *head = NULL;
 	struct linux_robust_list *entry, *next_entry, *pending;
 	unsigned int limit = 2048, pi, next_pi, pip;
 	l_long futex_offset;
 	int rc, error;
 
 	LIN_SDT_PROBE2(futex, release_futexes, entry, td, em);
 
 	head = em->robust_futexes;
 
 	/* No robust list was registered for this thread. */
 	if (head == NULL) {
 		LIN_SDT_PROBE0(futex, release_futexes, return);
 		return;
 	}
 
 	if (fetch_robust_entry(&entry, PTRIN(&head->list.next), &pi)) {
 		LIN_SDT_PROBE0(futex, release_futexes, return);
 		return;
 	}
 
 	error = copyin(&head->futex_offset, &futex_offset,
 	    sizeof(futex_offset));
 	if (error) {
 		LIN_SDT_PROBE1(futex, release_futexes, copyin_error, error);
 		LIN_SDT_PROBE0(futex, release_futexes, return);
 		return;
 	}
 
 	if (fetch_robust_entry(&pending, PTRIN(&head->pending_list), &pip)) {
 		LIN_SDT_PROBE0(futex, release_futexes, return);
 		return;
 	}
 
 	/* The list is circular: it ends when it points back at the head. */
 	while (entry != &head->list) {
 		/*
 		 * Fetch the next link before the current entry is handled;
 		 * a fetch failure is only acted upon afterwards.
 		 */
 		rc = fetch_robust_entry(&next_entry, PTRIN(&entry->next), &next_pi);
 
 		/* The pending entry is handled once, after the loop. */
 		if (entry != pending)
 			if (handle_futex_death(em,
 			    (uint32_t *)((caddr_t)entry + futex_offset), pi)) {
 				LIN_SDT_PROBE0(futex, release_futexes, return);
 				return;
 			}
 		if (rc) {
 			LIN_SDT_PROBE0(futex, release_futexes, return);
 			return;
 		}
 
 		entry = next_entry;
 		pi = next_pi;
 
 		/* Bound the walk in case the user-space list never ends. */
 		if (!--limit)
 			break;
 
 		sched_relinquish(curthread);
 	}
 
 	if (pending)
 		handle_futex_death(em, (uint32_t *)((caddr_t)pending + futex_offset), pip);
 
 	LIN_SDT_PROBE0(futex, release_futexes, return);
 }
Index: head/sys/compat/linux/linux_misc.c
===================================================================
--- head/sys/compat/linux/linux_misc.c	(revision 336913)
+++ head/sys/compat/linux/linux_misc.c	(revision 336914)
@@ -1,2593 +1,2593 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2002 Doug Rabson
  * Copyright (c) 1994-1995 Søren Schmidt
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer
  *    in this position and unchanged.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 
 #include <sys/param.h>
 #include <sys/blist.h>
 #include <sys/fcntl.h>
 #if defined(__i386__)
 #include <sys/imgact_aout.h>
 #endif
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mman.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/reboot.h>
 #include <sys/racct.h>
 #include <sys/random.h>
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
 #include <sys/sdt.h>
 #include <sys/signalvar.h>
 #include <sys/stat.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/systm.h>
 #include <sys/time.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
 #include <sys/wait.h>
 #include <sys/cpuset.h>
 #include <sys/uio.h>
 
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_map.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_object.h>
 #include <vm/swap_pager.h>
 
 #ifdef COMPAT_LINUX32
 #include <machine/../linux32/linux.h>
 #include <machine/../linux32/linux32_proto.h>
 #else
 #include <machine/../linux/linux.h>
 #include <machine/../linux/linux_proto.h>
 #endif
 
 #include <compat/linux/linux_dtrace.h>
 #include <compat/linux/linux_file.h>
 #include <compat/linux/linux_mib.h>
 #include <compat/linux/linux_signal.h>
 #include <compat/linux/linux_timer.h>
 #include <compat/linux/linux_util.h>
 #include <compat/linux/linux_sysproto.h>
 #include <compat/linux/linux_emul.h>
 #include <compat/linux/linux_misc.h>
 
 /**
  * Special DTrace provider for the linuxulator.
  *
  * In this file we define the provider for the entire linuxulator. All
  * modules (= files of the linuxulator) use it.
  *
  * We define a different name depending on the emulated bitsize, see
  * ../../<ARCH>/linux{,32}/linux.h, e.g.:
  *      native bitsize          = linuxulator
  *      amd64, 32bit emulation  = linuxulator32
  */
 LIN_SDT_PROVIDER_DEFINE(LINUX_DTRACE);
 
 /* Used as the tick rate by the CONVNTCK() conversion in this file. */
 int stclohz;				/* Statistics clock frequency */
 
 /*
  * Table of FreeBSD RLIMIT_* identifiers with LINUX_RLIM_NLIMITS slots.
  * NOTE(review): presumably indexed by the Linux resource number; the users
  * of this table are outside this part of the file -- confirm.
  */
 static unsigned int linux_to_bsd_resource[LINUX_RLIM_NLIMITS] = {
 	RLIMIT_CPU, RLIMIT_FSIZE, RLIMIT_DATA, RLIMIT_STACK,
 	RLIMIT_CORE, RLIMIT_RSS, RLIMIT_NPROC, RLIMIT_NOFILE,
 	RLIMIT_MEMLOCK, RLIMIT_AS
 };
 
 /*
  * Buffer filled in and copied out to user space by linux_sysinfo().
  * The field order and sizes are part of the user-visible layout.
  */
 struct l_sysinfo {
 	l_long		uptime;		/* Seconds since boot */
 	l_ulong		loads[3];	/* 1, 5, and 15 minute load averages */
 /* Fixed-point scale linux_sysinfo() applies to the loads[] values. */
 #define LINUX_SYSINFO_LOADS_SCALE 65536
 	l_ulong		totalram;	/* Total usable main memory size */
 	l_ulong		freeram;	/* Available memory size */
 	l_ulong		sharedram;	/* Amount of shared memory */
 	l_ulong		bufferram;	/* Memory used by buffers */
 	l_ulong		totalswap;	/* Total swap space size */
 	l_ulong		freeswap;	/* swap space still available */
 	l_ushort	procs;		/* Number of current processes */
 	l_ushort	pads;		/* never assigned; stays zero from bzero() */
 	l_ulong		totalbig;	/* linux_sysinfo() always reports 0 */
 	l_ulong		freebig;	/* linux_sysinfo() always reports 0 */
 	l_uint		mem_unit;	/* linux_sysinfo() always reports 1 */
 	char		_f[20-2*sizeof(l_long)-sizeof(l_int)];	/* padding */
 };
 
 /*
  * Pointer/length pair.
  * NOTE(review): judging by the name this is the packed final argument of
  * pselect6() (signal set address and its size); the consumer is outside
  * this part of the file -- confirm.
  */
 struct l_pselect6arg {
 	l_uintptr_t	ss;		/* user address (presumably the sigset) */
 	l_size_t	ss_len;		/* length that goes with ss */
 };
 
 static int	linux_utimensat_nsec_valid(l_long);
 
 
 int
 linux_sysinfo(struct thread *td, struct linux_sysinfo_args *args)
 {
 	struct l_sysinfo sysinfo;
 	vm_object_t object;
 	int i, j;
 	struct timespec ts;
 
 	bzero(&sysinfo, sizeof(sysinfo));
 	getnanouptime(&ts);
 	if (ts.tv_nsec != 0)
 		ts.tv_sec++;
 	sysinfo.uptime = ts.tv_sec;
 
 	/* Use the information from the mib to get our load averages */
 	for (i = 0; i < 3; i++)
 		sysinfo.loads[i] = averunnable.ldavg[i] *
 		    LINUX_SYSINFO_LOADS_SCALE / averunnable.fscale;
 
 	sysinfo.totalram = physmem * PAGE_SIZE;
 	sysinfo.freeram = sysinfo.totalram - vm_wire_count() * PAGE_SIZE;
 
 	sysinfo.sharedram = 0;
 	mtx_lock(&vm_object_list_mtx);
 	TAILQ_FOREACH(object, &vm_object_list, object_list)
 		if (object->shadow_count > 1)
 			sysinfo.sharedram += object->resident_page_count;
 	mtx_unlock(&vm_object_list_mtx);
 
 	sysinfo.sharedram *= PAGE_SIZE;
 	sysinfo.bufferram = 0;
 
 	swap_pager_status(&i, &j);
 	sysinfo.totalswap = i * PAGE_SIZE;
 	sysinfo.freeswap = (i - j) * PAGE_SIZE;
 
 	sysinfo.procs = nprocs;
 
 	/* The following are only present in newer Linux kernels. */
 	sysinfo.totalbig = 0;
 	sysinfo.freebig = 0;
 	sysinfo.mem_unit = 1;
 
 	return (copyout(&sysinfo, args->info, sizeof(sysinfo)));
 }
 
 #ifdef LINUX_LEGACY_SYSCALLS
 int
 linux_alarm(struct thread *td, struct linux_alarm_args *args)
 {
 	struct itimerval it, old_it;
 	u_int secs;
 	int error;
 
 #ifdef DEBUG
 	if (ldebug(alarm))
 		printf(ARGS(alarm, "%u"), args->secs);
 #endif
 	secs = args->secs;
 	/*
 	 * Linux alarm() is always successful. Limit secs to INT32_MAX / 2
 	 * to match kern_setitimer()'s limit to avoid error from it.
 	 *
 	 * XXX. Linux limit secs to INT_MAX on 32 and does not limit on 64-bit
 	 * platforms.
 	 */
 	if (secs > INT32_MAX / 2)
 		secs = INT32_MAX / 2;
 
 	it.it_value.tv_sec = secs;
 	it.it_value.tv_usec = 0;
 	timevalclear(&it.it_interval);
 	error = kern_setitimer(td, ITIMER_REAL, &it, &old_it);
 	KASSERT(error == 0, ("kern_setitimer returns %d", error));
 
 	if ((old_it.it_value.tv_sec == 0 && old_it.it_value.tv_usec > 0) ||
 	    old_it.it_value.tv_usec >= 500000)
 		old_it.it_value.tv_sec++;
 	td->td_retval[0] = old_it.it_value.tv_sec;
 	return (0);
 }
 #endif
 
 int
 linux_brk(struct thread *td, struct linux_brk_args *args)
 {
 	struct vmspace *vm = td->td_proc->p_vmspace;
 	uintptr_t new, old;
 
 #ifdef DEBUG
 	if (ldebug(brk))
 		printf(ARGS(brk, "%p"), (void *)(uintptr_t)args->dsend);
 #endif
 	old = (uintptr_t)vm->vm_daddr + ctob(vm->vm_dsize);
 	new = (uintptr_t)args->dsend;
 	if ((caddr_t)new > vm->vm_daddr && !kern_break(td, &new))
 		td->td_retval[0] = (register_t)new;
 	else
 		td->td_retval[0] = (register_t)old;
 
 	return (0);
 }
 
 #if defined(__i386__)
 /* XXX: what about amd64/linux32? */
 
 /*
  * Load the a.out image named by args->library into the calling process.
  *
  * The file is looked up and permission-checked, its first page is mapped
  * into exec_map to read the a.out header, and text+data are then either
  * read (non page-aligned file offset) or mapped (page-aligned file offset)
  * at trunc_page(a_entry), followed by an anonymous mapping for the BSS.
  *
  * Returns 0 on success or an errno value.  All exits after the lookup go
  * through the cleanup label, which drops the vnode lock if still held and
  * releases the temporary header mapping.
  */
 int
 linux_uselib(struct thread *td, struct linux_uselib_args *args)
 {
 	struct nameidata ni;
 	struct vnode *vp;
 	struct exec *a_out;
 	struct vattr attr;
 	vm_offset_t vmaddr;
 	unsigned long file_offset;
 	unsigned long bss_size;
 	char *library;
 	ssize_t aresid;
 	int error, locked, writecount;
 
 	LCONVPATHEXIST(td, args->library, &library);
 
 #ifdef DEBUG
 	if (ldebug(uselib))
 		printf(ARGS(uselib, "%s"), library);
 #endif
 
 	/* State inspected by the cleanup code; must be valid on every path. */
 	a_out = NULL;
 	locked = 0;
 	vp = NULL;
 
 	NDINIT(&ni, LOOKUP, ISOPEN | FOLLOW | LOCKLEAF | AUDITVNODE1,
 	    UIO_SYSSPACE, library, td);
 	error = namei(&ni);
 	/* The converted path is no longer needed once namei() has run. */
 	LFREEPATH(library);
 	if (error)
 		goto cleanup;
 
 	vp = ni.ni_vp;
 	NDFREE(&ni, NDF_ONLY_PNBUF);
 
 	/*
 	 * From here on down, we have a locked vnode that must be unlocked.
 	 * XXX: The code below largely duplicates exec_check_permissions().
 	 */
 	locked = 1;
 
 	/* Writable? */
 	error = VOP_GET_WRITECOUNT(vp, &writecount);
 	if (error != 0)
 		goto cleanup;
 	if (writecount != 0) {
 		error = ETXTBSY;
 		goto cleanup;
 	}
 
 	/* Executable? */
 	error = VOP_GETATTR(vp, &attr, td->td_ucred);
 	if (error)
 		goto cleanup;
 
 	if ((vp->v_mount->mnt_flag & MNT_NOEXEC) ||
 	    ((attr.va_mode & 0111) == 0) || (attr.va_type != VREG)) {
 		/*
 		 * EACCESS is what exec(2) returns.
 		 * NOTE(review): the code nevertheless reports ENOEXEC here.
 		 */
 		error = ENOEXEC;
 		goto cleanup;
 	}
 
 	/* Sensible size? */
 	if (attr.va_size == 0) {
 		error = ENOEXEC;
 		goto cleanup;
 	}
 
 	/* Can we access it? */
 	error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td);
 	if (error)
 		goto cleanup;
 
 	/*
 	 * XXX: This should use vn_open() so that it is properly authorized,
 	 * and to reduce code redundancy all over the place here.
 	 * XXX: Not really, it duplicates far more of exec_check_permissions()
 	 * than vn_open().
 	 */
 #ifdef MAC
 	error = mac_vnode_check_open(td->td_ucred, vp, VREAD);
 	if (error)
 		goto cleanup;
 #endif
 	error = VOP_OPEN(vp, FREAD, td->td_ucred, td, NULL);
 	if (error)
 		goto cleanup;
 
 	/* Pull in executable header into exec_map */
 	error = vm_mmap(exec_map, (vm_offset_t *)&a_out, PAGE_SIZE,
 	    VM_PROT_READ, VM_PROT_READ, 0, OBJT_VNODE, vp, 0);
 	if (error)
 		goto cleanup;
 
 	/* Is it a Linux binary ? */
 	if (((a_out->a_magic >> 16) & 0xff) != 0x64) {
 		error = ENOEXEC;
 		goto cleanup;
 	}
 
 	/*
 	 * While we are here, we should REALLY do some more checks
 	 */
 
 	/* Set file/virtual offset based on a.out variant. */
 	switch ((int)(a_out->a_magic & 0xffff)) {
 	case 0413:			/* ZMAGIC */
 		file_offset = 1024;
 		break;
 	case 0314:			/* QMAGIC */
 		file_offset = 0;
 		break;
 	default:
 		error = ENOEXEC;
 		goto cleanup;
 	}
 
 	bss_size = round_page(a_out->a_bss);
 
 	/* Check various fields in header for validity/bounds. */
 	if (a_out->a_text & PAGE_MASK || a_out->a_data & PAGE_MASK) {
 		error = ENOEXEC;
 		goto cleanup;
 	}
 
 	/*
 	 * text + data can't exceed file size
 	 * NOTE(review): the sum is not checked for wrap-around -- confirm
 	 * whether the header field width makes that possible.
 	 */
 	if (a_out->a_data + a_out->a_text > attr.va_size) {
 		error = EFAULT;
 		goto cleanup;
 	}
 
 	/*
 	 * text/data/bss must not exceed limits
 	 * XXX - this is not complete. it should check current usage PLUS
 	 * the resources needed by this library.
 	 */
 	PROC_LOCK(td->td_proc);
 	if (a_out->a_text > maxtsiz ||
 	    a_out->a_data + bss_size > lim_cur_proc(td->td_proc, RLIMIT_DATA) ||
 	    racct_set(td->td_proc, RACCT_DATA, a_out->a_data +
 	    bss_size) != 0) {
 		PROC_UNLOCK(td->td_proc);
 		error = ENOMEM;
 		goto cleanup;
 	}
 	PROC_UNLOCK(td->td_proc);
 
 	/*
 	 * Prevent more writers.
 	 * XXX: Note that if any of the VM operations fail below we don't
 	 * clear this flag.
 	 */
 	VOP_SET_TEXT(vp);
 
 	/*
 	 * Lock no longer needed
 	 */
 	locked = 0;
 	VOP_UNLOCK(vp, 0);
 
 	/*
 	 * Check if file_offset page aligned. Currently we cannot handle
 	 * misalinged file offsets, and so we read in the entire image
 	 * (what a waste).
 	 */
 	if (file_offset & PAGE_MASK) {
 #ifdef DEBUG
 		printf("uselib: Non page aligned binary %lu\n", file_offset);
 #endif
 		/* Map text+data read/write/execute */
 
 		/* a_entry is the load address and is page aligned */
 		vmaddr = trunc_page(a_out->a_entry);
 
 		/* get anon user mapping, read+write+execute */
 		error = vm_map_find(&td->td_proc->p_vmspace->vm_map, NULL, 0,
 		    &vmaddr, a_out->a_text + a_out->a_data, 0, VMFS_NO_SPACE,
 		    VM_PROT_ALL, VM_PROT_ALL, 0);
 		if (error)
 			goto cleanup;
 
 		/* Copy text+data from the file into the new user mapping. */
 		error = vn_rdwr(UIO_READ, vp, (void *)vmaddr, file_offset,
 		    a_out->a_text + a_out->a_data, UIO_USERSPACE, 0,
 		    td->td_ucred, NOCRED, &aresid, td);
 		if (error != 0)
 			goto cleanup;
 		/* A short read means the file is smaller than its header claims. */
 		if (aresid != 0) {
 			error = ENOEXEC;
 			goto cleanup;
 		}
 	} else {
 #ifdef DEBUG
 		printf("uselib: Page aligned binary %lu\n", file_offset);
 #endif
 		/*
 		 * for QMAGIC, a_entry is 20 bytes beyond the load address
 		 * to skip the executable header
 		 */
 		vmaddr = trunc_page(a_out->a_entry);
 
 		/*
 		 * Map it all into the process's space as a single
 		 * copy-on-write "data" segment.
 		 */
 		error = vm_mmap(&td->td_proc->p_vmspace->vm_map, &vmaddr,
 		    a_out->a_text + a_out->a_data, VM_PROT_ALL, VM_PROT_ALL,
 		    MAP_PRIVATE | MAP_FIXED, OBJT_VNODE, vp, file_offset);
 		if (error)
 			goto cleanup;
 	}
 #ifdef DEBUG
 	printf("mem=%08lx = %08lx %08lx\n", (long)vmaddr, ((long *)vmaddr)[0],
 	    ((long *)vmaddr)[1]);
 #endif
 	if (bss_size != 0) {
 		/* Calculate BSS start address */
 		vmaddr = trunc_page(a_out->a_entry) + a_out->a_text +
 		    a_out->a_data;
 
 		/* allocate some 'anon' space */
 		error = vm_map_find(&td->td_proc->p_vmspace->vm_map, NULL, 0,
 		    &vmaddr, bss_size, 0, VMFS_NO_SPACE, VM_PROT_ALL,
 		    VM_PROT_ALL, 0);
 		if (error)
 			goto cleanup;
 	}
 
 cleanup:
 	/* Unlock vnode if needed */
 	if (locked)
 		VOP_UNLOCK(vp, 0);
 
 	/* Release the temporary mapping. */
 	if (a_out)
 		kmap_free_wakeup(exec_map, (vm_offset_t)a_out, PAGE_SIZE);
 
 	return (error);
 }
 
 #endif	/* __i386__ */
 
 #ifdef LINUX_LEGACY_SYSCALLS
 /*
  * select() wrapper around kern_select() that also writes the remaining
  * time back to args->timeout after a successful call.
  *
  * An out-of-range timeout is normalised instead of being rejected.
  * Returns 0 or an errno value; every exit after the first copyin goes
  * through select_out so that the debug trace sees the final error.
  */
 int
 linux_select(struct thread *td, struct linux_select_args *args)
 {
 	l_timeval ltv;
 	struct timeval tv0, tv1, utv, *tvp;
 	int error;
 
 #ifdef DEBUG
 	if (ldebug(select))
 		printf(ARGS(select, "%d, %p, %p, %p, %p"), args->nfds,
 		    (void *)args->readfds, (void *)args->writefds,
 		    (void *)args->exceptfds, (void *)args->timeout);
 #endif
 
 	/*
 	 * Store current time for computation of the amount of
 	 * time left.
 	 */
 	if (args->timeout) {
 		if ((error = copyin(args->timeout, &ltv, sizeof(ltv))))
 			goto select_out;
 		/* Convert the Linux timeval into the native representation. */
 		utv.tv_sec = ltv.tv_sec;
 		utv.tv_usec = ltv.tv_usec;
 #ifdef DEBUG
 		if (ldebug(select))
 			printf(LMSG("incoming timeout (%jd/%ld)"),
 			    (intmax_t)utv.tv_sec, utv.tv_usec);
 #endif
 
 		if (itimerfix(&utv)) {
 			/*
 			 * The timeval was invalid.  Convert it to something
 			 * valid that will act as it does under Linux.
 			 */
 			/* Fold whole seconds out of tv_usec ... */
 			utv.tv_sec += utv.tv_usec / 1000000;
 			utv.tv_usec %= 1000000;
 			/* ... borrow a second if the remainder is negative ... */
 			if (utv.tv_usec < 0) {
 				utv.tv_sec -= 1;
 				utv.tv_usec += 1000000;
 			}
 			/* ... and treat a negative total as a zero timeout. */
 			if (utv.tv_sec < 0)
 				timevalclear(&utv);
 		}
 		/* tv0 is the start time used for the time-left computation. */
 		microtime(&tv0);
 		tvp = &utv;
 	} else
 		tvp = NULL;
 
 	error = kern_select(td, args->nfds, args->readfds, args->writefds,
 	    args->exceptfds, tvp, LINUX_NFDBITS);
 
 #ifdef DEBUG
 	if (ldebug(select))
 		printf(LMSG("real select returns %d"), error);
 #endif
 	if (error)
 		goto select_out;
 
 	if (args->timeout) {
 		if (td->td_retval[0]) {
 			/*
 			 * Compute how much time was left of the timeout,
 			 * by subtracting the current time and the time
 			 * before we started the call, and subtracting
 			 * that result from the user-supplied value.
 			 */
 			microtime(&tv1);
 			timevalsub(&tv1, &tv0);
 			timevalsub(&utv, &tv1);
 			if (utv.tv_sec < 0)
 				timevalclear(&utv);
 		} else
 			/* Nothing became ready: report no time left. */
 			timevalclear(&utv);
 #ifdef DEBUG
 		if (ldebug(select))
 			printf(LMSG("outgoing timeout (%jd/%ld)"),
 			    (intmax_t)utv.tv_sec, utv.tv_usec);
 #endif
 		/* Convert back and update the caller's timeval. */
 		ltv.tv_sec = utv.tv_sec;
 		ltv.tv_usec = utv.tv_usec;
 		if ((error = copyout(&ltv, args->timeout, sizeof(ltv))))
 			goto select_out;
 	}
 
 select_out:
 #ifdef DEBUG
 	if (ldebug(select))
 		printf(LMSG("select_out -> %d"), error);
 #endif
 	return (error);
 }
 #endif
 
 int
 linux_mremap(struct thread *td, struct linux_mremap_args *args)
 {
 	uintptr_t addr;
 	size_t len;
 	int error = 0;
 
 #ifdef DEBUG
 	if (ldebug(mremap))
 		printf(ARGS(mremap, "%p, %08lx, %08lx, %08lx"),
 		    (void *)(uintptr_t)args->addr,
 		    (unsigned long)args->old_len,
 		    (unsigned long)args->new_len,
 		    (unsigned long)args->flags);
 #endif
 
 	if (args->flags & ~(LINUX_MREMAP_FIXED | LINUX_MREMAP_MAYMOVE)) {
 		td->td_retval[0] = 0;
 		return (EINVAL);
 	}
 
 	/*
 	 * Check for the page alignment.
 	 * Linux defines PAGE_MASK to be FreeBSD ~PAGE_MASK.
 	 */
 	if (args->addr & PAGE_MASK) {
 		td->td_retval[0] = 0;
 		return (EINVAL);
 	}
 
 	args->new_len = round_page(args->new_len);
 	args->old_len = round_page(args->old_len);
 
 	if (args->new_len > args->old_len) {
 		td->td_retval[0] = 0;
 		return (ENOMEM);
 	}
 
 	if (args->new_len < args->old_len) {
 		addr = args->addr + args->new_len;
 		len = args->old_len - args->new_len;
 		error = kern_munmap(td, addr, len);
 	}
 
 	td->td_retval[0] = error ? 0 : (uintptr_t)args->addr;
 	return (error);
 }
 
 #define LINUX_MS_ASYNC       0x0001
 #define LINUX_MS_INVALIDATE  0x0002
 #define LINUX_MS_SYNC        0x0004
 
 /*
  * msync() wrapper: the LINUX_MS_SYNC bit is cleared from the flags and
  * everything else is handed to kern_msync() unchanged.
  */
 int
 linux_msync(struct thread *td, struct linux_msync_args *args)
 {
 	int bsd_flags;
 
 	bsd_flags = args->fl & ~LINUX_MS_SYNC;
 	return (kern_msync(td, args->addr, args->len, bsd_flags));
 }
 
 #ifdef LINUX_LEGACY_SYSCALLS
 int
 linux_time(struct thread *td, struct linux_time_args *args)
 {
 	struct timeval tv;
 	l_time_t tm;
 	int error;
 
 #ifdef DEBUG
 	if (ldebug(time))
 		printf(ARGS(time, "*"));
 #endif
 
 	microtime(&tv);
 	tm = tv.tv_sec;
 	if (args->tm && (error = copyout(&tm, args->tm, sizeof(tm))))
 		return (error);
 	td->td_retval[0] = tm;
 	return (0);
 }
 #endif
 
 /*
  * Buffer copied out by linux_times().  All values are tick counts
  * produced by CONVTCK().
  */
 struct l_times_argv {
 	l_clock_t	tms_utime;	/* process user time (calcru) */
 	l_clock_t	tms_stime;	/* process system time (calcru) */
 	l_clock_t	tms_cutime;	/* children user time (calccru) */
 	l_clock_t	tms_cstime;	/* children system time (calccru) */
 };
 
 
 /*
  * Glibc versions prior to 2.2.1 always use hard-coded CLK_TCK value.
  * Since 2.2.1 Glibc uses value exported from kernel via AT_CLKTCK
  * auxiliary vector entry.
  *
  * CONVOTCK() converts a timeval to ticks at the fixed CLK_TCK rate and
  * CONVNTCK() at the stclohz rate.  CONVTCK() chooses between the two
  * from the emulated kernel version and expects a variable named td to
  * be in scope at the point of use.
  *
  * The argument is parenthesised so that any struct-valued expression
  * (for example *ptr) can be passed; it is still evaluated twice, so it
  * must not have side effects.
  */
 #define	CLK_TCK		100
 
 #define	CONVOTCK(r)	((r).tv_sec * CLK_TCK + (r).tv_usec / (1000000 / CLK_TCK))
 #define	CONVNTCK(r)	((r).tv_sec * stclohz + (r).tv_usec / (1000000 / stclohz))
 
 #define	CONVTCK(r)	(linux_kernver(td) >= LINUX_KERNVER_2004000 ?		\
 			    CONVNTCK(r) : CONVOTCK(r))
 
 int
 linux_times(struct thread *td, struct linux_times_args *args)
 {
 	struct timeval tv, utime, stime, cutime, cstime;
 	struct l_times_argv tms;
 	struct proc *p;
 	int error;
 
 #ifdef DEBUG
 	if (ldebug(times))
 		printf(ARGS(times, "*"));
 #endif
 
 	if (args->buf != NULL) {
 		p = td->td_proc;
 		PROC_LOCK(p);
 		PROC_STATLOCK(p);
 		calcru(p, &utime, &stime);
 		PROC_STATUNLOCK(p);
 		calccru(p, &cutime, &cstime);
 		PROC_UNLOCK(p);
 
 		tms.tms_utime = CONVTCK(utime);
 		tms.tms_stime = CONVTCK(stime);
 
 		tms.tms_cutime = CONVTCK(cutime);
 		tms.tms_cstime = CONVTCK(cstime);
 
 		if ((error = copyout(&tms, args->buf, sizeof(tms))))
 			return (error);
 	}
 
 	microuptime(&tv);
 	td->td_retval[0] = (int)CONVTCK(tv);
 	return (0);
 }
 
 /*
  * uname(): build a struct l_new_utsname from the emulated OS name and
  * release, the credential's host and domain names, the kernel version
  * string (first line only) and linux_kplatform, then copy it out to
  * args->buf.
  */
 int
 linux_newuname(struct thread *td, struct linux_newuname_args *args)
 {
 	struct l_new_utsname utsname;
 	char sysname_buf[LINUX_MAX_UTSNAME];
 	char release_buf[LINUX_MAX_UTSNAME];
 	char *cp;
 
 #ifdef DEBUG
 	if (ldebug(newuname))
 		printf(ARGS(newuname, "*"));
 #endif
 
 	linux_get_osname(td, sysname_buf);
 	linux_get_osrelease(td, release_buf);
 
 	/* Start from all zeroes so unused tail bytes are well defined. */
 	bzero(&utsname, sizeof(utsname));
 
 	strlcpy(utsname.sysname, sysname_buf, LINUX_MAX_UTSNAME);
 	strlcpy(utsname.release, release_buf, LINUX_MAX_UTSNAME);
 	strlcpy(utsname.machine, linux_kplatform, LINUX_MAX_UTSNAME);
 	getcredhostname(td->td_ucred, utsname.nodename, LINUX_MAX_UTSNAME);
 	getcreddomainname(td->td_ucred, utsname.domainname, LINUX_MAX_UTSNAME);
 
 	/* Keep only the first line of the version string. */
 	strlcpy(utsname.version, version, LINUX_MAX_UTSNAME);
 	cp = utsname.version;
 	while (*cp != '\0' && *cp != '\n')
 		cp++;
 	if (*cp == '\n')
 		*cp = '\0';
 
 	return (copyout(&utsname, args->buf, sizeof(utsname)));
 }
 
 /*
  * Time pair copied in from user space by linux_utime(); both values are
  * whole seconds (linux_utime() sets the microsecond parts to 0).
  */
 struct l_utimbuf {
 	l_time_t l_actime;	/* becomes tv[0], passed to kern_utimesat() */
 	l_time_t l_modtime;	/* becomes tv[1], passed to kern_utimesat() */
 };
 
 #ifdef LINUX_LEGACY_SYSCALLS
 int
 linux_utime(struct thread *td, struct linux_utime_args *args)
 {
 	struct timeval tv[2], *tvp;
 	struct l_utimbuf lut;
 	char *fname;
 	int error;
 
 	LCONVPATHEXIST(td, args->fname, &fname);
 
 #ifdef DEBUG
 	if (ldebug(utime))
 		printf(ARGS(utime, "%s, *"), fname);
 #endif
 
 	if (args->times) {
 		if ((error = copyin(args->times, &lut, sizeof lut))) {
 			LFREEPATH(fname);
 			return (error);
 		}
 		tv[0].tv_sec = lut.l_actime;
 		tv[0].tv_usec = 0;
 		tv[1].tv_sec = lut.l_modtime;
 		tv[1].tv_usec = 0;
 		tvp = tv;
 	} else
 		tvp = NULL;
 
 	error = kern_utimesat(td, AT_FDCWD, fname, UIO_SYSSPACE, tvp,
 	    UIO_SYSSPACE);
 	LFREEPATH(fname);
 	return (error);
 }
 #endif
 
 #ifdef LINUX_LEGACY_SYSCALLS
 int
 linux_utimes(struct thread *td, struct linux_utimes_args *args)
 {
 	l_timeval ltv[2];
 	struct timeval tv[2], *tvp = NULL;
 	char *fname;
 	int error;
 
 	LCONVPATHEXIST(td, args->fname, &fname);
 
 #ifdef DEBUG
 	if (ldebug(utimes))
 		printf(ARGS(utimes, "%s, *"), fname);
 #endif
 
 	if (args->tptr != NULL) {
 		if ((error = copyin(args->tptr, ltv, sizeof ltv))) {
 			LFREEPATH(fname);
 			return (error);
 		}
 		tv[0].tv_sec = ltv[0].tv_sec;
 		tv[0].tv_usec = ltv[0].tv_usec;
 		tv[1].tv_sec = ltv[1].tv_sec;
 		tv[1].tv_usec = ltv[1].tv_usec;
 		tvp = tv;
 	}
 
 	error = kern_utimesat(td, AT_FDCWD, fname, UIO_SYSSPACE,
 	    tvp, UIO_SYSSPACE);
 	LFREEPATH(fname);
 	return (error);
 }
 #endif
 
 /*
  * Check a tv_nsec value supplied to utimensat().
  *
  * Note the inverted sense: returns 0 when nsec is acceptable (one of the
  * LINUX_UTIME_OMIT / LINUX_UTIME_NOW markers, or within 0..999999999)
  * and 1 when it is not.
  */
 static int
 linux_utimensat_nsec_valid(l_long nsec)
 {
 
 	if (nsec == LINUX_UTIME_OMIT || nsec == LINUX_UTIME_NOW ||
 	    (nsec >= 0 && nsec <= 999999999))
 		return (0);
 	return (1);
 }
 
 int
 linux_utimensat(struct thread *td, struct linux_utimensat_args *args)
 {
 	struct l_timespec l_times[2];
 	struct timespec times[2], *timesp = NULL;
 	char *path = NULL;
 	int error, dfd, flags = 0;
 
 	dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd;
 
 #ifdef DEBUG
 	if (ldebug(utimensat))
 		printf(ARGS(utimensat, "%d, *"), dfd);
 #endif
 
 	if (args->flags & ~LINUX_AT_SYMLINK_NOFOLLOW)
 		return (EINVAL);
 
 	if (args->times != NULL) {
 		error = copyin(args->times, l_times, sizeof(l_times));
 		if (error != 0)
 			return (error);
 
 		if (linux_utimensat_nsec_valid(l_times[0].tv_nsec) != 0 ||
 		    linux_utimensat_nsec_valid(l_times[1].tv_nsec) != 0)
 			return (EINVAL);
 
 		times[0].tv_sec = l_times[0].tv_sec;
 		switch (l_times[0].tv_nsec)
 		{
 		case LINUX_UTIME_OMIT:
 			times[0].tv_nsec = UTIME_OMIT;
 			break;
 		case LINUX_UTIME_NOW:
 			times[0].tv_nsec = UTIME_NOW;
 			break;
 		default:
 			times[0].tv_nsec = l_times[0].tv_nsec;
 		}
 
 		times[1].tv_sec = l_times[1].tv_sec;
 		switch (l_times[1].tv_nsec)
 		{
 		case LINUX_UTIME_OMIT:
 			times[1].tv_nsec = UTIME_OMIT;
 			break;
 		case LINUX_UTIME_NOW:
 			times[1].tv_nsec = UTIME_NOW;
 			break;
 		default:
 			times[1].tv_nsec = l_times[1].tv_nsec;
 			break;
 		}
 		timesp = times;
 
 		/* This breaks POSIX, but is what the Linux kernel does
 		 * _on purpose_ (documented in the man page for utimensat(2)),
 		 * so we must follow that behaviour. */
 		if (times[0].tv_nsec == UTIME_OMIT &&
 		    times[1].tv_nsec == UTIME_OMIT)
 			return (0);
 	}
 
 	if (args->pathname != NULL)
 		LCONVPATHEXIST_AT(td, args->pathname, &path, dfd);
 	else if (args->flags != 0)
 		return (EINVAL);
 
 	if (args->flags & LINUX_AT_SYMLINK_NOFOLLOW)
 		flags |= AT_SYMLINK_NOFOLLOW;
 
 	if (path == NULL)
 		error = kern_futimens(td, dfd, timesp, UIO_SYSSPACE);
 	else {
 		error = kern_utimensat(td, dfd, path, UIO_SYSSPACE, timesp,
 			UIO_SYSSPACE, flags);
 		LFREEPATH(path);
 	}
 
 	return (error);
 }
 
 #ifdef LINUX_LEGACY_SYSCALLS
 int
 linux_futimesat(struct thread *td, struct linux_futimesat_args *args)
 {
 	l_timeval ltv[2];
 	struct timeval tv[2], *tvp = NULL;
 	char *fname;
 	int error, dfd;
 
 	dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd;
 	LCONVPATHEXIST_AT(td, args->filename, &fname, dfd);
 
 #ifdef DEBUG
 	if (ldebug(futimesat))
 		printf(ARGS(futimesat, "%s, *"), fname);
 #endif
 
 	if (args->utimes != NULL) {
 		if ((error = copyin(args->utimes, ltv, sizeof ltv))) {
 			LFREEPATH(fname);
 			return (error);
 		}
 		tv[0].tv_sec = ltv[0].tv_sec;
 		tv[0].tv_usec = ltv[0].tv_usec;
 		tv[1].tv_sec = ltv[1].tv_sec;
 		tv[1].tv_usec = ltv[1].tv_usec;
 		tvp = tv;
 	}
 
 	error = kern_utimesat(td, dfd, fname, UIO_SYSSPACE, tvp, UIO_SYSSPACE);
 	LFREEPATH(fname);
 	return (error);
 }
 #endif
 
 int
 linux_common_wait(struct thread *td, int pid, int *status,
     int options, struct rusage *ru)
 {
 	int error, tmpstat;
 
 	error = kern_wait(td, pid, &tmpstat, options, ru);
 	if (error)
 		return (error);
 
 	if (status) {
 		tmpstat &= 0xffff;
 		if (WIFSIGNALED(tmpstat))
 			tmpstat = (tmpstat & 0xffffff80) |
 			    bsd_to_linux_signal(WTERMSIG(tmpstat));
 		else if (WIFSTOPPED(tmpstat))
 			tmpstat = (tmpstat & 0xffff00ff) |
 			    (bsd_to_linux_signal(WSTOPSIG(tmpstat)) << 8);
 		else if (WIFCONTINUED(tmpstat))
 			tmpstat = 0xffff;
 		error = copyout(&tmpstat, status, sizeof(int));
 	}
 
 	return (error);
 }
 
 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
 int
 linux_waitpid(struct thread *td, struct linux_waitpid_args *args)
 {
 	struct linux_wait4_args wait4_args;
 
 #ifdef DEBUG
 	if (ldebug(waitpid))
 		printf(ARGS(waitpid, "%d, %p, %d"),
 		    args->pid, (void *)args->status, args->options);
 #endif
 
 	wait4_args.pid = args->pid;
 	wait4_args.status = args->status;
 	wait4_args.options = args->options;
 	wait4_args.rusage = NULL;
 
 	return (linux_wait4(td, &wait4_args));
 }
 #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
 
 int
 linux_wait4(struct thread *td, struct linux_wait4_args *args)
 {
 	int error, options;
 	struct rusage ru, *rup;
 
 #ifdef DEBUG
 	if (ldebug(wait4))
 		printf(ARGS(wait4, "%d, %p, %d, %p"),
 		    args->pid, (void *)args->status, args->options,
 		    (void *)args->rusage);
 #endif
 	if (args->options & ~(LINUX_WUNTRACED | LINUX_WNOHANG |
 	    LINUX_WCONTINUED | __WCLONE | __WNOTHREAD | __WALL))
 		return (EINVAL);
 
 	options = WEXITED;
 	linux_to_bsd_waitopts(args->options, &options);
 
 	if (args->rusage != NULL)
 		rup = &ru;
 	else
 		rup = NULL;
 	error = linux_common_wait(td, args->pid, args->status, options, rup);
 	if (error != 0)
 		return (error);
 	if (args->rusage != NULL)
 		error = linux_copyout_rusage(&ru, args->rusage);
 	return (error);
 }
 
 /*
  * waitid(): translate the Linux options and id type, wait through
  * kern_wait6(), and copy out the optional rusage and siginfo results.
  *
  * td_retval[0] is always reset to 0 on the success path.  Returns 0 or
  * an errno value.
  */
 int
 linux_waitid(struct thread *td, struct linux_waitid_args *args)
 {
 	int status, options, sig;
 	struct __wrusage wru;
 	siginfo_t siginfo;
 	l_siginfo_t lsi;
 	idtype_t idtype;
 	int error;
 
 	options = 0;
 	linux_to_bsd_waitopts(args->options, &options);
 
 	/* Only these options are supported, and one event type is required. */
 	if (options & ~(WNOHANG | WNOWAIT | WEXITED | WUNTRACED | WCONTINUED))
 		return (EINVAL);
 	if (!(options & (WEXITED | WUNTRACED | WCONTINUED)))
 		return (EINVAL);
 
 	switch (args->idtype) {
 	case LINUX_P_ALL:
 		idtype = P_ALL;
 		break;
 	case LINUX_P_PID:
 		if (args->id <= 0)
 			return (EINVAL);
 		idtype = P_PID;
 		break;
 	case LINUX_P_PGID:
 		if (args->id <= 0)
 			return (EINVAL);
 		idtype = P_PGID;
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	error = kern_wait6(td, idtype, args->id, &status, options,
 	    &wru, &siginfo);
 	if (error != 0)
 		return (error);
 	if (args->rusage != NULL) {
 		error = linux_copyout_rusage(&wru.wru_children,
 		    args->rusage);
 		if (error != 0)
 			return (error);
 	}
 	if (args->info != NULL) {
 		/*
 		 * Always start from a zeroed structure: the whole of lsi is
 		 * copied out below, so any field the conversion does not
 		 * set must not carry stale kernel stack contents.
 		 */
 		bzero(&lsi, sizeof(lsi));
 		/* A zero td_retval[0] means there is nothing to convert. */
 		if (td->td_retval[0] != 0) {
 			sig = bsd_to_linux_signal(siginfo.si_signo);
 			siginfo_to_lsiginfo(&siginfo, &lsi, sig);
 		}
 		error = copyout(&lsi, args->info, sizeof(lsi));
 	}
 	td->td_retval[0] = 0;
 
 	return (error);
 }
 
 #ifdef LINUX_LEGACY_SYSCALLS
 /*
  * mknod(): create a filesystem node at args->path, dispatching on the
  * file type bits of args->mode.
  *
  * FIFO and socket types go to kern_mkfifoat(), character and block
  * devices to kern_mknodat(), directories fail with EPERM, and regular
  * files (or no type at all) are created by opening with O_CREAT and
  * closing again.  Any other type gives EINVAL.
  */
 int
 linux_mknod(struct thread *td, struct linux_mknod_args *args)
 {
 	char *path;
 	int error, fmt;
 
 	LCONVPATHCREAT(td, args->path, &path);
 
 #ifdef DEBUG
 	if (ldebug(mknod))
 		printf(ARGS(mknod, "%s, %d, %ju"), path, args->mode,
 		    (uintmax_t)args->dev);
 #endif
 
 	fmt = args->mode & S_IFMT;
 	if (fmt == S_IFIFO || fmt == S_IFSOCK) {
 		error = kern_mkfifoat(td, AT_FDCWD, path, UIO_SYSSPACE,
 		    args->mode);
 	} else if (fmt == S_IFCHR || fmt == S_IFBLK) {
 		error = kern_mknodat(td, AT_FDCWD, path, UIO_SYSSPACE,
 		    args->mode, args->dev);
 	} else if (fmt == S_IFDIR) {
 		error = EPERM;
 	} else if (fmt == 0 || fmt == S_IFREG) {
 		/* A missing type means a regular file. */
 		if (fmt == 0)
 			args->mode |= S_IFREG;
 		error = kern_openat(td, AT_FDCWD, path, UIO_SYSSPACE,
 		    O_WRONLY | O_CREAT | O_TRUNC, args->mode);
 		/* The descriptor was only needed to create the file. */
 		if (error == 0)
 			kern_close(td, td->td_retval[0]);
 	} else {
 		error = EINVAL;
 	}
 
 	LFREEPATH(path);
 	return (error);
 }
 #endif
 
 /*
  * mknodat(): same dispatch as linux_mknod(), but args->filename is
  * resolved relative to args->dfd (LINUX_AT_FDCWD is mapped to
  * AT_FDCWD).
  */
 int
 linux_mknodat(struct thread *td, struct linux_mknodat_args *args)
 {
 	char *path;
 	int dfd, error, fmt;
 
 	dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd;
 	LCONVPATHCREAT_AT(td, args->filename, &path, dfd);
 
 #ifdef DEBUG
 	if (ldebug(mknodat))
 		printf(ARGS(mknodat, "%s, %d, %d"), path, args->mode, args->dev);
 #endif
 
 	fmt = args->mode & S_IFMT;
 	if (fmt == S_IFIFO || fmt == S_IFSOCK) {
 		error = kern_mkfifoat(td, dfd, path, UIO_SYSSPACE, args->mode);
 	} else if (fmt == S_IFCHR || fmt == S_IFBLK) {
 		error = kern_mknodat(td, dfd, path, UIO_SYSSPACE, args->mode,
 		    args->dev);
 	} else if (fmt == S_IFDIR) {
 		error = EPERM;
 	} else if (fmt == 0 || fmt == S_IFREG) {
 		/* A missing type means a regular file. */
 		if (fmt == 0)
 			args->mode |= S_IFREG;
 		error = kern_openat(td, dfd, path, UIO_SYSSPACE,
 		    O_WRONLY | O_CREAT | O_TRUNC, args->mode);
 		/* The descriptor was only needed to create the file. */
 		if (error == 0)
 			kern_close(td, td->td_retval[0]);
 	} else {
 		error = EINVAL;
 	}
 
 	LFREEPATH(path);
 	return (error);
 }
 
 /*
  * UGH! This is just about the dumbest idea I've ever heard!!
  *
  * personality(): return the process's current persona in td_retval[0]
  * and, unless args->per is 0xffffffff (query only), replace it with
  * args->per.  Always returns 0.
  */
 int
 linux_personality(struct thread *td, struct linux_personality_args *args)
 {
 	struct linux_pemuldata *pem;
 	struct proc *proc;
 	uint32_t previous;
 
 #ifdef DEBUG
 	if (ldebug(personality))
 		printf(ARGS(personality, "%u"), args->per);
 #endif
 
 	proc = td->td_proc;
 
 	/* Read and update the persona under the process lock. */
 	PROC_LOCK(proc);
 	pem = pem_find(proc);
 	previous = pem->persona;
 	if (args->per != 0xffffffff)
 		pem->persona = args->per;
 	PROC_UNLOCK(proc);
 
 	td->td_retval[0] = previous;
 	return (0);
 }
 
 /*
  * Interval timer value built from two Linux timevals; copied in and out
  * by linux_setitimer() and linux_getitimer().
  */
 struct l_itimerval {
 	l_timeval it_interval;	/* copied to/from itimerval.it_interval */
 	l_timeval it_value;	/* copied to/from itimerval.it_value */
 };
 
 /*
  * Copy the four itimerval fields from *lip into *bip.
  *
  * Despite the name, the macro is direction-neutral: it is used both for
  * Linux-to-native and native-to-Linux conversion.  bip is the
  * destination and lip the source; each is evaluated four times.
  *
  * The body is wrapped in do { } while (0) so the macro expands to a
  * single statement and stays correct under an unbraced if/else.
  */
 #define	B2L_ITIMERVAL(bip, lip) do {					\
 	(bip)->it_interval.tv_sec = (lip)->it_interval.tv_sec;		\
 	(bip)->it_interval.tv_usec = (lip)->it_interval.tv_usec;	\
 	(bip)->it_value.tv_sec = (lip)->it_value.tv_sec;		\
 	(bip)->it_value.tv_usec = (lip)->it_value.tv_usec;		\
 } while (0)
 
 int
 linux_setitimer(struct thread *td, struct linux_setitimer_args *uap)
 {
 	int error;
 	struct l_itimerval ls;
 	struct itimerval aitv, oitv;
 
 #ifdef DEBUG
 	if (ldebug(setitimer))
 		printf(ARGS(setitimer, "%p, %p"),
 		    (void *)uap->itv, (void *)uap->oitv);
 #endif
 
 	if (uap->itv == NULL) {
 		uap->itv = uap->oitv;
 		return (linux_getitimer(td, (struct linux_getitimer_args *)uap));
 	}
 
 	error = copyin(uap->itv, &ls, sizeof(ls));
 	if (error != 0)
 		return (error);
 	B2L_ITIMERVAL(&aitv, &ls);
 #ifdef DEBUG
 	if (ldebug(setitimer)) {
 		printf("setitimer: value: sec: %jd, usec: %ld\n",
 		    (intmax_t)aitv.it_value.tv_sec, aitv.it_value.tv_usec);
 		printf("setitimer: interval: sec: %jd, usec: %ld\n",
 		    (intmax_t)aitv.it_interval.tv_sec, aitv.it_interval.tv_usec);
 	}
 #endif
 	error = kern_setitimer(td, uap->which, &aitv, &oitv);
 	if (error != 0 || uap->oitv == NULL)
 		return (error);
 	B2L_ITIMERVAL(&ls, &oitv);
 
 	return (copyout(&ls, uap->oitv, sizeof(ls)));
 }
 
 int
 linux_getitimer(struct thread *td, struct linux_getitimer_args *uap)
 {
 	int error;
 	struct l_itimerval ls;
 	struct itimerval aitv;
 
 #ifdef DEBUG
 	if (ldebug(getitimer))
 		printf(ARGS(getitimer, "%p"), (void *)uap->itv);
 #endif
 	error = kern_getitimer(td, uap->which, &aitv);
 	if (error != 0)
 		return (error);
 	B2L_ITIMERVAL(&ls, &aitv);
 	return (copyout(&ls, uap->itv, sizeof(ls)));
 }
 
 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
 /*
  * Linux nice(2), implemented on top of sys_setpriority() for the
  * current process.  Note that args->inc is handed over as the priority
  * value itself.
  */
 int
 linux_nice(struct thread *td, struct linux_nice_args *args)
 {
 	struct setpriority_args sp_args;
 
 	sp_args.prio = args->inc;
 	sp_args.who = 0;		/* current process */
 	sp_args.which = PRIO_PROCESS;
 	return (sys_setpriority(td, &sp_args));
 }
 #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
 
 /*
  * Linux setgroups(2): replace the supplementary group list of the
  * calling process with the args->gidsetsize entries at args->grouplist.
  *
  * Returns EINVAL when gidsetsize is negative or larger than ngroups_max,
  * the copyin() error when the list cannot be read, the
  * priv_check_cred() error when PRIV_CRED_SETGROUPS is not granted, and
  * 0 on success.
  */
 int
 linux_setgroups(struct thread *td, struct linux_setgroups_args *args)
 {
 	struct ucred *newcred, *oldcred;
 	l_gid_t *linux_gidset;
 	gid_t *bsd_gidset;
 	int ngrp, error;
 	struct proc *p;
 
 	ngrp = args->gidsetsize;
 	if (ngrp < 0 || ngrp >= ngroups_max + 1)
 		return (EINVAL);
 	linux_gidset = malloc(ngrp * sizeof(*linux_gidset), M_LINUX, M_WAITOK);
 	error = copyin(args->grouplist, linux_gidset, ngrp * sizeof(l_gid_t));
 	if (error)
 		goto out;
 	/* Room for the supplied groups plus cr_groups[0]. */
 	newcred = crget();
 	crextend(newcred, ngrp + 1);
 	p = td->td_proc;
 	PROC_LOCK(p);
 	oldcred = p->p_ucred;
 	crcopy(newcred, oldcred);
 
 	/*
 	 * cr_groups[0] holds egid. Setting the whole set from
 	 * the supplied set will cause egid to be changed too.
 	 * Keep cr_groups[0] unchanged to prevent that.
 	 */
 
 	if ((error = priv_check_cred(oldcred, PRIV_CRED_SETGROUPS, 0)) != 0) {
 		PROC_UNLOCK(p);
 		crfree(newcred);
 		goto out;
 	}
 
 	if (ngrp > 0) {
 		newcred->cr_ngroups = ngrp + 1;
 
 		/* Copy back to front, shifting each entry up by one slot. */
 		bsd_gidset = newcred->cr_groups;
 		ngrp--;
 		while (ngrp >= 0) {
 			bsd_gidset[ngrp + 1] = linux_gidset[ngrp];
 			ngrp--;
 		}
 	} else
 		newcred->cr_ngroups = 1;
 
 	setsugid(p);
 	proc_set_cred(p, newcred);
 	PROC_UNLOCK(p);
 	crfree(oldcred);
 	error = 0;
 out:
 	/* Common exit: the temporary Linux gid array is always released. */
 	free(linux_gidset, M_LINUX);
 	return (error);
 }
 
 int
 linux_getgroups(struct thread *td, struct linux_getgroups_args *args)
 {
 	struct ucred *cred;
 	l_gid_t *linux_gidset;
 	gid_t *bsd_gidset;
 	int bsd_gidsetsz, ngrp, error;
 
 	cred = td->td_ucred;
 	bsd_gidset = cred->cr_groups;
 	bsd_gidsetsz = cred->cr_ngroups - 1;
 
 	/*
 	 * cr_groups[0] holds egid. Returning the whole set
 	 * here will cause a duplicate. Exclude cr_groups[0]
 	 * to prevent that.
 	 */
 
 	if ((ngrp = args->gidsetsize) == 0) {
 		td->td_retval[0] = bsd_gidsetsz;
 		return (0);
 	}
 
 	if (ngrp < bsd_gidsetsz)
 		return (EINVAL);
 
 	ngrp = 0;
 	linux_gidset = malloc(bsd_gidsetsz * sizeof(*linux_gidset),
 	    M_LINUX, M_WAITOK);
 	while (ngrp < bsd_gidsetsz) {
 		linux_gidset[ngrp] = bsd_gidset[ngrp + 1];
 		ngrp++;
 	}
 
 	error = copyout(linux_gidset, args->grouplist, ngrp * sizeof(l_gid_t));
 	free(linux_gidset, M_LINUX);
 	if (error)
 		return (error);
 
 	td->td_retval[0] = ngrp;
 	return (0);
 }
 
 int
 linux_setrlimit(struct thread *td, struct linux_setrlimit_args *args)
 {
 	struct rlimit bsd_rlim;
 	struct l_rlimit rlim;
 	u_int which;
 	int error;
 
 #ifdef DEBUG
 	if (ldebug(setrlimit))
 		printf(ARGS(setrlimit, "%d, %p"),
 		    args->resource, (void *)args->rlim);
 #endif
 
 	if (args->resource >= LINUX_RLIM_NLIMITS)
 		return (EINVAL);
 
 	which = linux_to_bsd_resource[args->resource];
 	if (which == -1)
 		return (EINVAL);
 
 	error = copyin(args->rlim, &rlim, sizeof(rlim));
 	if (error)
 		return (error);
 
 	bsd_rlim.rlim_cur = (rlim_t)rlim.rlim_cur;
 	bsd_rlim.rlim_max = (rlim_t)rlim.rlim_max;
 	return (kern_setrlimit(td, which, &bsd_rlim));
 }
 
 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
 /*
  * Old Linux getrlimit(2): like linux_getrlimit(), except that a limit
  * which truncates to the all-ones unsigned value is reported as the
  * largest signed value instead.
  */
 int
 linux_old_getrlimit(struct thread *td, struct linux_old_getrlimit_args *args)
 {
 	struct rlimit brl;
 	struct l_rlimit lrl;
 	u_int which;
 
 #ifdef DEBUG
 	if (ldebug(old_getrlimit))
 		printf(ARGS(old_getrlimit, "%d, %p"),
 		    args->resource, (void *)args->rlim);
 #endif
 
 	if (args->resource >= LINUX_RLIM_NLIMITS)
 		return (EINVAL);
 
 	/* A table entry of -1 marks a resource without a native equivalent. */
 	if ((which = linux_to_bsd_resource[args->resource]) == -1)
 		return (EINVAL);
 
 	lim_rlimit(td, which, &brl);
 
 #ifdef COMPAT_LINUX32
 	lrl.rlim_max = (unsigned int)brl.rlim_max;
 	if (lrl.rlim_max == UINT_MAX)
 		lrl.rlim_max = INT_MAX;
 	lrl.rlim_cur = (unsigned int)brl.rlim_cur;
 	if (lrl.rlim_cur == UINT_MAX)
 		lrl.rlim_cur = INT_MAX;
 #else
 	lrl.rlim_max = (unsigned long)brl.rlim_max;
 	if (lrl.rlim_max == ULONG_MAX)
 		lrl.rlim_max = LONG_MAX;
 	lrl.rlim_cur = (unsigned long)brl.rlim_cur;
 	if (lrl.rlim_cur == ULONG_MAX)
 		lrl.rlim_cur = LONG_MAX;
 #endif
 	return (copyout(&lrl, args->rlim, sizeof(lrl)));
 }
 #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
 
 int
 linux_getrlimit(struct thread *td, struct linux_getrlimit_args *args)
 {
 	struct l_rlimit rlim;
 	struct rlimit bsd_rlim;
 	u_int which;
 
 #ifdef DEBUG
 	if (ldebug(getrlimit))
 		printf(ARGS(getrlimit, "%d, %p"),
 		    args->resource, (void *)args->rlim);
 #endif
 
 	if (args->resource >= LINUX_RLIM_NLIMITS)
 		return (EINVAL);
 
 	which = linux_to_bsd_resource[args->resource];
 	if (which == -1)
 		return (EINVAL);
 
 	lim_rlimit(td, which, &bsd_rlim);
 
 	rlim.rlim_cur = (l_ulong)bsd_rlim.rlim_cur;
 	rlim.rlim_max = (l_ulong)bsd_rlim.rlim_max;
 	return (copyout(&rlim, args->rlim, sizeof(rlim)));
 }
 
 int
 linux_sched_setscheduler(struct thread *td,
     struct linux_sched_setscheduler_args *args)
 {
 	struct sched_param sched_param;
 	struct thread *tdt;
 	int error, policy;
 
 #ifdef DEBUG
 	if (ldebug(sched_setscheduler))
 		printf(ARGS(sched_setscheduler, "%d, %d, %p"),
 		    args->pid, args->policy, (const void *)args->param);
 #endif
 
 	switch (args->policy) {
 	case LINUX_SCHED_OTHER:
 		policy = SCHED_OTHER;
 		break;
 	case LINUX_SCHED_FIFO:
 		policy = SCHED_FIFO;
 		break;
 	case LINUX_SCHED_RR:
 		policy = SCHED_RR;
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	error = copyin(args->param, &sched_param, sizeof(sched_param));
 	if (error)
 		return (error);
 
 	tdt = linux_tdfind(td, args->pid, -1);
 	if (tdt == NULL)
 		return (ESRCH);
 
 	error = kern_sched_setscheduler(td, tdt, policy, &sched_param);
 	PROC_UNLOCK(tdt->td_proc);
 	return (error);
 }
 
 /*
  * Linux sched_getscheduler(2): return the scheduling policy of the
  * thread identified by args->pid, translated to the Linux constant.
  *
  * Returns ESRCH when the thread cannot be found and the error from
  * kern_sched_getscheduler() when the lookup of the policy fails.
  */
 int
 linux_sched_getscheduler(struct thread *td,
     struct linux_sched_getscheduler_args *args)
 {
 	struct thread *tdt;
 	int error, policy;
 
 #ifdef DEBUG
 	if (ldebug(sched_getscheduler))
 		printf(ARGS(sched_getscheduler, "%d"), args->pid);
 #endif
 
 	tdt = linux_tdfind(td, args->pid, -1);
 	if (tdt == NULL)
 		return (ESRCH);
 
 	error = kern_sched_getscheduler(td, tdt, &policy);
 	PROC_UNLOCK(tdt->td_proc);
 
 	/*
 	 * On failure 'policy' may not have been written; bail out before
 	 * it is examined instead of switching on an indeterminate value.
 	 */
 	if (error != 0)
 		return (error);
 
 	switch (policy) {
 	case SCHED_OTHER:
 		td->td_retval[0] = LINUX_SCHED_OTHER;
 		break;
 	case SCHED_FIFO:
 		td->td_retval[0] = LINUX_SCHED_FIFO;
 		break;
 	case SCHED_RR:
 		td->td_retval[0] = LINUX_SCHED_RR;
 		break;
 	}
 	return (0);
 }
 
 int
 linux_sched_get_priority_max(struct thread *td,
     struct linux_sched_get_priority_max_args *args)
 {
 	struct sched_get_priority_max_args bsd;
 
 #ifdef DEBUG
 	if (ldebug(sched_get_priority_max))
 		printf(ARGS(sched_get_priority_max, "%d"), args->policy);
 #endif
 
 	switch (args->policy) {
 	case LINUX_SCHED_OTHER:
 		bsd.policy = SCHED_OTHER;
 		break;
 	case LINUX_SCHED_FIFO:
 		bsd.policy = SCHED_FIFO;
 		break;
 	case LINUX_SCHED_RR:
 		bsd.policy = SCHED_RR;
 		break;
 	default:
 		return (EINVAL);
 	}
 	return (sys_sched_get_priority_max(td, &bsd));
 }
 
 int
 linux_sched_get_priority_min(struct thread *td,
     struct linux_sched_get_priority_min_args *args)
 {
 	struct sched_get_priority_min_args bsd;
 
 #ifdef DEBUG
 	if (ldebug(sched_get_priority_min))
 		printf(ARGS(sched_get_priority_min, "%d"), args->policy);
 #endif
 
 	switch (args->policy) {
 	case LINUX_SCHED_OTHER:
 		bsd.policy = SCHED_OTHER;
 		break;
 	case LINUX_SCHED_FIFO:
 		bsd.policy = SCHED_FIFO;
 		break;
 	case LINUX_SCHED_RR:
 		bsd.policy = SCHED_RR;
 		break;
 	default:
 		return (EINVAL);
 	}
 	return (sys_sched_get_priority_min(td, &bsd));
 }
 
 #define REBOOT_CAD_ON	0x89abcdef
 #define REBOOT_CAD_OFF	0
 #define REBOOT_HALT	0xcdef0123
 #define REBOOT_RESTART	0x01234567
 #define REBOOT_RESTART2	0xA1B2C3D4
 #define REBOOT_POWEROFF	0x4321FEDC
 #define REBOOT_MAGIC1	0xfee1dead
 #define REBOOT_MAGIC2	0x28121969
 #define REBOOT_MAGIC2A	0x05121996
 #define REBOOT_MAGIC2B	0x16041998
 
 int
 linux_reboot(struct thread *td, struct linux_reboot_args *args)
 {
 	struct reboot_args bsd_args;
 
 #ifdef DEBUG
 	if (ldebug(reboot))
 		printf(ARGS(reboot, "0x%x"), args->cmd);
 #endif
 
 	if (args->magic1 != REBOOT_MAGIC1)
 		return (EINVAL);
 
 	switch (args->magic2) {
 	case REBOOT_MAGIC2:
 	case REBOOT_MAGIC2A:
 	case REBOOT_MAGIC2B:
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	switch (args->cmd) {
 	case REBOOT_CAD_ON:
 	case REBOOT_CAD_OFF:
 		return (priv_check(td, PRIV_REBOOT));
 	case REBOOT_HALT:
 		bsd_args.opt = RB_HALT;
 		break;
 	case REBOOT_RESTART:
 	case REBOOT_RESTART2:
 		bsd_args.opt = 0;
 		break;
 	case REBOOT_POWEROFF:
 		bsd_args.opt = RB_POWEROFF;
 		break;
 	default:
 		return (EINVAL);
 	}
 	return (sys_reboot(td, &bsd_args));
 }
 
 
 /*
  * The FreeBSD native getpid(2), getgid(2) and getuid(2) also modify
  * td->td_retval[1] when COMPAT_43 is defined. This clobbers registers that
  * are assumed to be preserved. The following lightweight syscalls fix
  * this. See also linux_getgid16() and linux_getuid16() in linux_uid16.c
  *
  * linux_getpid() - MP SAFE
  * linux_getgid() - MP SAFE
  * linux_getuid() - MP SAFE
  */
 
 int
 linux_getpid(struct thread *td, struct linux_getpid_args *args)
 {
 
 #ifdef DEBUG
 	if (ldebug(getpid))
 		printf(ARGS(getpid, ""));
 #endif
 	td->td_retval[0] = td->td_proc->p_pid;
 
 	return (0);
 }
 
 /*
  * Linux gettid(2): return the em_tid recorded in the calling thread's
  * Linux emulation data.
  */
 int
 linux_gettid(struct thread *td, struct linux_gettid_args *args)
 {
 	struct linux_emuldata *emd;
 
 #ifdef DEBUG
 	if (ldebug(gettid))
 		printf(ARGS(gettid, ""));
 #endif
 
 	emd = em_find(td);
 	KASSERT(emd != NULL, ("gettid: emuldata not found.\n"));
 	td->td_retval[0] = emd->em_tid;
 	return (0);
 }
 
 
 /*
  * Linux getppid(2): store the value returned by kern_getppid() for the
  * calling thread in td_retval[0].  Always returns 0.
  */
 int
 linux_getppid(struct thread *td, struct linux_getppid_args *args)
 {
 
 #ifdef DEBUG
 	if (ldebug(getppid))
 		printf(ARGS(getppid, ""));
 #endif
 
 	td->td_retval[0] = kern_getppid(td);
 	return (0);
 }
 
 /*
  * Linux getgid(2): store the real group id (cr_rgid) of the calling
  * thread's credential in td_retval[0].  Always returns 0.
  */
 int
 linux_getgid(struct thread *td, struct linux_getgid_args *args)
 {
 
 #ifdef DEBUG
 	if (ldebug(getgid))
 		printf(ARGS(getgid, ""));
 #endif
 
 	td->td_retval[0] = td->td_ucred->cr_rgid;
 	return (0);
 }
 
 /*
  * Linux getuid(2): store the real user id (cr_ruid) of the calling
  * thread's credential in td_retval[0].  Always returns 0.
  */
 int
 linux_getuid(struct thread *td, struct linux_getuid_args *args)
 {
 
 #ifdef DEBUG
 	if (ldebug(getuid))
 		printf(ARGS(getuid, ""));
 #endif
 
 	td->td_retval[0] = td->td_ucred->cr_ruid;
 	return (0);
 }
 
 
 int
 linux_getsid(struct thread *td, struct linux_getsid_args *args)
 {
 	struct getsid_args bsd;
 
 #ifdef DEBUG
 	if (ldebug(getsid))
 		printf(ARGS(getsid, "%i"), args->pid);
 #endif
 
 	bsd.pid = args->pid;
 	return (sys_getsid(td, &bsd));
 }
 
 /*
  * Handler that unconditionally fails with ENOSYS; both arguments are
  * ignored.
  */
 int
 linux_nosys(struct thread *td, struct nosys_args *ignore)
 {
 
 	return (ENOSYS);
 }
 
 int
 linux_getpriority(struct thread *td, struct linux_getpriority_args *args)
 {
 	struct getpriority_args bsd_args;
 	int error;
 
 #ifdef DEBUG
 	if (ldebug(getpriority))
 		printf(ARGS(getpriority, "%i, %i"), args->which, args->who);
 #endif
 
 	bsd_args.which = args->which;
 	bsd_args.who = args->who;
 	error = sys_getpriority(td, &bsd_args);
 	td->td_retval[0] = 20 - td->td_retval[0];
 	return (error);
 }
 
 /*
  * Linux sethostname(2): write args->len bytes from args->hostname to
  * the kern.hostname sysctl on behalf of the calling thread.
  */
 int
 linux_sethostname(struct thread *td, struct linux_sethostname_args *args)
 {
 	int mib[2] = { CTL_KERN, KERN_HOSTNAME };
 
 #ifdef DEBUG
 	if (ldebug(sethostname))
 		printf(ARGS(sethostname, "*, %i"), args->len);
 #endif
 
 	return (userland_sysctl(td, mib, 2, 0, 0, 0, args->hostname,
 	    args->len, 0, 0));
 }
 
 /*
  * Linux setdomainname(2): write args->len bytes from args->name to the
  * kern.nisdomainname sysctl on behalf of the calling thread.
  */
 int
 linux_setdomainname(struct thread *td, struct linux_setdomainname_args *args)
 {
 	int mib[2] = { CTL_KERN, KERN_NISDOMAINNAME };
 
 #ifdef DEBUG
 	if (ldebug(setdomainname))
 		printf(ARGS(setdomainname, "*, %i"), args->len);
 #endif
 
 	return (userland_sysctl(td, mib, 2, 0, 0, 0, args->name,
 	    args->len, 0, 0));
 }
 
 /*
  * Linux exit_group(2): terminate via exit1() with args->error_code as
  * the exit status.  There is no return statement; the call to exit1()
  * is marked NOTREACHED.
  */
 int
 linux_exit_group(struct thread *td, struct linux_exit_group_args *args)
 {
 
 #ifdef DEBUG
 	if (ldebug(exit_group))
 		printf(ARGS(exit_group, "%i"), args->error_code);
 #endif
 
 	LINUX_CTR2(exit_group, "thread(%d) (%d)", td->td_tid,
 	    args->error_code);
 
 	/*
 	 * XXX: we should send a signal to the parent if
 	 * SIGNAL_EXIT_GROUP is set. We ignore that (temporarily?)
 	 * as it doesn't occur often.
 	 */
 	exit1(td, args->error_code, 0);
 		/* NOTREACHED */
 }
 
 /*
  * Capability ABI versions accepted in l_user_cap_header.version by
  * linux_capget()/linux_capset().  Version 1 uses one l_user_cap_data
  * element; versions 2 and 3 use two.
  */
 #define _LINUX_CAPABILITY_VERSION_1  0x19980330
 #define _LINUX_CAPABILITY_VERSION_2  0x20071026
 #define _LINUX_CAPABILITY_VERSION_3  0x20080522
 
 /* Header copied in from (and, on a bad version, back out to) uap->hdrp. */
 struct l_user_cap_header {
 	l_int	version;	/* one of _LINUX_CAPABILITY_VERSION_* */
 	l_int	pid;		/* any non-zero value is rejected with EPERM */
 };
 
 /* One element of the capability data array exchanged via uap->datap. */
 struct l_user_cap_data {
 	l_int	effective;
 	l_int	permitted;
 	l_int	inheritable;
 };
 
 int
 linux_capget(struct thread *td, struct linux_capget_args *uap)
 {
 	struct l_user_cap_header luch;
 	struct l_user_cap_data lucd[2];
 	int error, u32s;
 
 	if (uap->hdrp == NULL)
 		return (EFAULT);
 
 	error = copyin(uap->hdrp, &luch, sizeof(luch));
 	if (error != 0)
 		return (error);
 
 	switch (luch.version) {
 	case _LINUX_CAPABILITY_VERSION_1:
 		u32s = 1;
 		break;
 	case _LINUX_CAPABILITY_VERSION_2:
 	case _LINUX_CAPABILITY_VERSION_3:
 		u32s = 2;
 		break;
 	default:
 #ifdef DEBUG
 		if (ldebug(capget))
 			printf(LMSG("invalid capget capability version 0x%x"),
 			    luch.version);
 #endif
 		luch.version = _LINUX_CAPABILITY_VERSION_1;
 		error = copyout(&luch, uap->hdrp, sizeof(luch));
 		if (error)
 			return (error);
 		return (EINVAL);
 	}
 
 	if (luch.pid)
 		return (EPERM);
 
 	if (uap->datap) {
 		/*
 		 * The current implementation doesn't support setting
 		 * a capability (it's essentially a stub) so indicate
 		 * that no capabilities are currently set or available
 		 * to request.
 		 */
 		memset(&lucd, 0, u32s * sizeof(lucd[0]));
 		error = copyout(&lucd, uap->datap, u32s * sizeof(lucd[0]));
 	}
 
 	return (error);
 }
 
 int
 linux_capset(struct thread *td, struct linux_capset_args *uap)
 {
 	struct l_user_cap_header luch;
 	struct l_user_cap_data lucd[2];
 	int error, i, u32s;
 
 	if (uap->hdrp == NULL || uap->datap == NULL)
 		return (EFAULT);
 
 	error = copyin(uap->hdrp, &luch, sizeof(luch));
 	if (error != 0)
 		return (error);
 
 	switch (luch.version) {
 	case _LINUX_CAPABILITY_VERSION_1:
 		u32s = 1;
 		break;
 	case _LINUX_CAPABILITY_VERSION_2:
 	case _LINUX_CAPABILITY_VERSION_3:
 		u32s = 2;
 		break;
 	default:
 #ifdef DEBUG
 		if (ldebug(capset))
 			printf(LMSG("invalid capset capability version 0x%x"),
 			    luch.version);
 #endif
 		luch.version = _LINUX_CAPABILITY_VERSION_1;
 		error = copyout(&luch, uap->hdrp, sizeof(luch));
 		if (error)
 			return (error);
 		return (EINVAL);
 	}
 
 	if (luch.pid)
 		return (EPERM);
 
 	error = copyin(uap->datap, &lucd, u32s * sizeof(lucd[0]));
 	if (error != 0)
 		return (error);
 
 	/* We currently don't support setting any capabilities. */
 	for (i = 0; i < u32s; i++) {
 		if (lucd[i].effective || lucd[i].permitted ||
 		    lucd[i].inheritable) {
 			linux_msg(td,
 			    "capset[%d] effective=0x%x, permitted=0x%x, "
 			    "inheritable=0x%x is not implemented", i,
 			    (int)lucd[i].effective, (int)lucd[i].permitted,
 			    (int)lucd[i].inheritable);
 			return (EPERM);
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Linux prctl(2).  Handles the options below; any other option fails
  * with EINVAL.
  *
  *   LINUX_PR_SET_PDEATHSIG / LINUX_PR_GET_PDEATHSIG
  *	store/fetch pdeath_signal in the thread's Linux emulation data
  *   LINUX_PR_GET_KEEPCAPS / LINUX_PR_SET_KEEPCAPS
  *	report 0 / silently accept
  *   LINUX_PR_SET_NAME / LINUX_PR_GET_NAME
  *	write/read the process name p->p_comm
  */
 int
 linux_prctl(struct thread *td, struct linux_prctl_args *args)
 {
 	int error = 0, max_size;
 	struct proc *p = td->td_proc;
 	char comm[LINUX_MAX_COMM_LEN];
 	struct linux_emuldata *em;
 	int pdeath_signal;
 
 #ifdef DEBUG
 	if (ldebug(prctl))
 		printf(ARGS(prctl, "%d, %ju, %ju, %ju, %ju"), args->option,
 		    (uintmax_t)args->arg2, (uintmax_t)args->arg3,
 		    (uintmax_t)args->arg4, (uintmax_t)args->arg5);
 #endif
 
 	switch (args->option) {
 	case LINUX_PR_SET_PDEATHSIG:
 		if (!LINUX_SIG_VALID(args->arg2))
 			return (EINVAL);
 		em = em_find(td);
 		KASSERT(em != NULL, ("prctl: emuldata not found.\n"));
 		em->pdeath_signal = args->arg2;
 		break;
 	case LINUX_PR_GET_PDEATHSIG:
 		em = em_find(td);
 		KASSERT(em != NULL, ("prctl: emuldata not found.\n"));
 		/* arg2 is the user address that receives the signal number. */
 		pdeath_signal = em->pdeath_signal;
 		error = copyout(&pdeath_signal,
 		    (void *)(register_t)args->arg2,
 		    sizeof(pdeath_signal));
 		break;
 	case LINUX_PR_GET_KEEPCAPS:
 		/*
 		 * Indicate that we always clear the effective and
 		 * permitted capability sets when the user id becomes
 		 * non-zero (actually the capability sets are simply
 		 * always zero in the current implementation).
 		 */
 		td->td_retval[0] = 0;
 		break;
 	case LINUX_PR_SET_KEEPCAPS:
 		/*
 		 * Ignore requests to keep the effective and permitted
 		 * capability sets when the user id becomes non-zero.
 		 */
 		break;
 	case LINUX_PR_SET_NAME:
 		/*
 		 * To be on the safe side we need to make sure to not
 		 * overflow the size a Linux program expects. We already
 		 * do this here in the copyin, so that we don't need to
 		 * check on copyout.
 		 */
 		max_size = MIN(sizeof(comm), sizeof(p->p_comm));
 		error = copyinstr((void *)(register_t)args->arg2, comm,
 		    max_size, NULL);
 
 		/* Linux silently truncates the name if it is too long. */
 		if (error == ENAMETOOLONG) {
 			/*
 			 * XXX: copyinstr() isn't documented to populate the
 			 * array completely, so do a copyin() to be on the
 			 * safe side. This should be changed in case
 			 * copyinstr() is changed to guarantee this.
 			 */
 			error = copyin((void *)(register_t)args->arg2, comm,
 			    max_size - 1);
 			comm[max_size - 1] = '\0';
 		}
 		if (error)
 			return (error);
 
 		PROC_LOCK(p);
 		strlcpy(p->p_comm, comm, sizeof(p->p_comm));
 		PROC_UNLOCK(p);
 		break;
 	case LINUX_PR_GET_NAME:
 		/* Snapshot the name under the process lock, copy out after. */
 		PROC_LOCK(p);
 		strlcpy(comm, p->p_comm, sizeof(comm));
 		PROC_UNLOCK(p);
 		error = copyout(comm, (void *)(register_t)args->arg2,
 		    strlen(comm) + 1);
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 
 	return (error);
 }
 
 int
 linux_sched_setparam(struct thread *td,
     struct linux_sched_setparam_args *uap)
 {
 	struct sched_param sched_param;
 	struct thread *tdt;
 	int error;
 
 #ifdef DEBUG
 	if (ldebug(sched_setparam))
 		printf(ARGS(sched_setparam, "%d, *"), uap->pid);
 #endif
 
 	error = copyin(uap->param, &sched_param, sizeof(sched_param));
 	if (error)
 		return (error);
 
 	tdt = linux_tdfind(td, uap->pid, -1);
 	if (tdt == NULL)
 		return (ESRCH);
 
 	error = kern_sched_setparam(td, tdt, &sched_param);
 	PROC_UNLOCK(tdt->td_proc);
 	return (error);
 }
 
 int
 linux_sched_getparam(struct thread *td,
     struct linux_sched_getparam_args *uap)
 {
 	struct sched_param sched_param;
 	struct thread *tdt;
 	int error;
 
 #ifdef DEBUG
 	if (ldebug(sched_getparam))
 		printf(ARGS(sched_getparam, "%d, *"), uap->pid);
 #endif
 
 	tdt = linux_tdfind(td, uap->pid, -1);
 	if (tdt == NULL)
 		return (ESRCH);
 
 	error = kern_sched_getparam(td, tdt, &sched_param);
 	PROC_UNLOCK(tdt->td_proc);
 	if (error == 0)
 		error = copyout(&sched_param, uap->param,
 		    sizeof(sched_param));
 	return (error);
 }
 
 /*
  * Get affinity of a process.
  */
 int
 linux_sched_getaffinity(struct thread *td,
     struct linux_sched_getaffinity_args *args)
 {
 	int error;
 	struct thread *tdt;
 
 #ifdef DEBUG
 	if (ldebug(sched_getaffinity))
 		printf(ARGS(sched_getaffinity, "%d, %d, *"), args->pid,
 		    args->len);
 #endif
 	if (args->len < sizeof(cpuset_t))
 		return (EINVAL);
 
 	tdt = linux_tdfind(td, args->pid, -1);
 	if (tdt == NULL)
 		return (ESRCH);
 
 	PROC_UNLOCK(tdt->td_proc);
 
 	error = kern_cpuset_getaffinity(td, CPU_LEVEL_WHICH, CPU_WHICH_TID,
 	    tdt->td_tid, sizeof(cpuset_t), (cpuset_t *)args->user_mask_ptr);
 	if (error == 0)
 		td->td_retval[0] = sizeof(cpuset_t);
 
 	return (error);
 }
 
 /*
  *  Set affinity of a process.
  */
 int
 linux_sched_setaffinity(struct thread *td,
     struct linux_sched_setaffinity_args *args)
 {
 	struct thread *tdt;
 
 #ifdef DEBUG
 	if (ldebug(sched_setaffinity))
 		printf(ARGS(sched_setaffinity, "%d, %d, *"), args->pid,
 		    args->len);
 #endif
 	if (args->len < sizeof(cpuset_t))
 		return (EINVAL);
 
 	tdt = linux_tdfind(td, args->pid, -1);
 	if (tdt == NULL)
 		return (ESRCH);
 
 	PROC_UNLOCK(tdt->td_proc);
 
 	return (kern_cpuset_setaffinity(td, CPU_LEVEL_WHICH, CPU_WHICH_TID,
 	    tdt->td_tid, sizeof(cpuset_t), (cpuset_t *) args->user_mask_ptr));
 }
 
 /*
  * Resource limit pair with unsigned 64-bit members, as copied out to
  * args->old by linux_prlimit64().
  */
 struct linux_rlimit64 {
 	uint64_t	rlim_cur;	/* filled from rlimit.rlim_cur */
 	uint64_t	rlim_max;	/* filled from rlimit.rlim_max */
 };
 
 /*
  * Linux prlimit64(2): optionally report the current limit of
  * args->resource for process args->pid through args->old and/or
  * install the limit read from args->new.
  *
  * Unknown or unmapped resources give EINVAL.  The target process is
  * looked up with pget() and held (PGET_HOLD) until the PRELE() at the
  * end; PGET_CANDEBUG is requested when a new limit is supplied,
  * PGET_CANSEE otherwise.
  */
 int
 linux_prlimit64(struct thread *td, struct linux_prlimit64_args *args)
 {
 	struct rlimit rlim, nrlim;
 	struct linux_rlimit64 lrlim;
 	struct proc *p;
 	u_int which;
 	int flags;
 	int error;
 
 #ifdef DEBUG
 	if (ldebug(prlimit64))
 		printf(ARGS(prlimit64, "%d, %d, %p, %p"), args->pid,
 		    args->resource, (void *)args->new, (void *)args->old);
 #endif
 
 	if (args->resource >= LINUX_RLIM_NLIMITS)
 		return (EINVAL);
 
 	which = linux_to_bsd_resource[args->resource];
 	if (which == -1)
 		return (EINVAL);
 
 	if (args->new != NULL) {
 		/*
 		 * Note. Unlike FreeBSD where rlim is signed 64-bit Linux
 		 * rlim is unsigned 64-bit. FreeBSD treats negative limits
 		 * as INFINITY so we do not need a conversion even.
 		 */
 		error = copyin(args->new, &nrlim, sizeof(nrlim));
 		if (error != 0)
 			return (error);
 	}
 
 	flags = PGET_HOLD | PGET_NOTWEXIT;
 	if (args->new != NULL)
 		flags |= PGET_CANDEBUG;
 	else
 		flags |= PGET_CANSEE;
 	error = pget(args->pid, flags, &p);
 	if (error != 0)
 		return (error);
 
 	/* The old limit is read and copied out before a new one is set. */
 	if (args->old != NULL) {
 		PROC_LOCK(p);
 		lim_rlimit_proc(p, which, &rlim);
 		PROC_UNLOCK(p);
 		if (rlim.rlim_cur == RLIM_INFINITY)
 			lrlim.rlim_cur = LINUX_RLIM_INFINITY;
 		else
 			lrlim.rlim_cur = rlim.rlim_cur;
 		if (rlim.rlim_max == RLIM_INFINITY)
 			lrlim.rlim_max = LINUX_RLIM_INFINITY;
 		else
 			lrlim.rlim_max = rlim.rlim_max;
 		error = copyout(&lrlim, args->old, sizeof(lrlim));
 		if (error != 0)
 			goto out;
 	}
 
 	if (args->new != NULL)
 		error = kern_proc_setrlimit(td, p, which, &nrlim);
 
  out:
 	/* Drop the hold taken by pget(PGET_HOLD). */
 	PRELE(p);
 	return (error);
 }
 
 /*
  * Linux pselect6(2): run kern_pselect() with an optional signal mask
  * (taken from the l_pselect6arg at args->sig) and an optional timeout
  * (args->tsp).  When a timeout was supplied and the call succeeds, the
  * time remaining is written back to args->tsp.
  *
  * Returns EINVAL when the signal set length differs from
  * sizeof(l_sigset_t) or when itimerfix() rejects the timeout.
  */
 int
 linux_pselect6(struct thread *td, struct linux_pselect6_args *args)
 {
 	struct timeval utv, tv0, tv1, *tvp;
 	struct l_pselect6arg lpse6;
 	struct l_timespec lts;
 	struct timespec uts;
 	l_sigset_t l_ss;
 	sigset_t *ssp;
 	sigset_t ss;
 	int error;
 
 	ssp = NULL;
 	if (args->sig != NULL) {
 		error = copyin(args->sig, &lpse6, sizeof(lpse6));
 		if (error != 0)
 			return (error);
 		if (lpse6.ss_len != sizeof(l_ss))
 			return (EINVAL);
 		if (lpse6.ss != 0) {
 			error = copyin(PTRIN(lpse6.ss), &l_ss,
 			    sizeof(l_ss));
 			if (error != 0)
 				return (error);
 			linux_to_bsd_sigset(&l_ss, &ss);
 			ssp = &ss;
 		}
 	}
 
 	/*
 	 * Currently glibc changes nanosecond number to microsecond.
 	 * This mean losing precision but for now it is hardly seen.
 	 */
 	if (args->tsp != NULL) {
 		error = copyin(args->tsp, &lts, sizeof(lts));
 		if (error != 0)
 			return (error);
 		error = linux_to_native_timespec(&uts, &lts);
 		if (error != 0)
 			return (error);
 
 		TIMESPEC_TO_TIMEVAL(&utv, &uts);
 		if (itimerfix(&utv))
 			return (EINVAL);
 
 		/* Remember the start time for the remaining-time update. */
 		microtime(&tv0);
 		tvp = &utv;
 	} else
 		tvp = NULL;
 
 	error = kern_pselect(td, args->nfds, args->readfds, args->writefds,
 	    args->exceptfds, tvp, ssp, LINUX_NFDBITS);
 
 	if (error == 0 && args->tsp != NULL) {
 		if (td->td_retval[0] != 0) {
 			/*
 			 * Compute how much time was left of the timeout,
 			 * by subtracting the current time and the time
 			 * before we started the call, and subtracting
 			 * that result from the user-supplied value.
 			 */
 
 			microtime(&tv1);
 			timevalsub(&tv1, &tv0);
 			timevalsub(&utv, &tv1);
 			if (utv.tv_sec < 0)
 				timevalclear(&utv);
 		} else
 			timevalclear(&utv);
 
 		TIMEVAL_TO_TIMESPEC(&utv, &uts);
 
 		error = native_to_linux_timespec(&lts, &uts);
 		if (error == 0)
 			error = copyout(&lts, args->tsp, sizeof(lts));
 	}
 
 	return (error);
 }
 
 /*
  * Linux ppoll(2): run kern_poll() with an optional signal mask
  * (args->sset, whose size must equal sizeof(l_sigset_t)) and an
  * optional timeout (args->tsp).  When a timeout was supplied and the
  * call succeeds, the time remaining is written back to args->tsp.
  */
 int
 linux_ppoll(struct thread *td, struct linux_ppoll_args *args)
 {
 	struct timespec ts0, ts1;
 	struct l_timespec lts;
 	struct timespec uts, *tsp;
 	l_sigset_t l_ss;
 	sigset_t *ssp;
 	sigset_t ss;
 	int error;
 
 	if (args->sset != NULL) {
 		if (args->ssize != sizeof(l_ss))
 			return (EINVAL);
 		error = copyin(args->sset, &l_ss, sizeof(l_ss));
 		if (error)
 			return (error);
 		linux_to_bsd_sigset(&l_ss, &ss);
 		ssp = &ss;
 	} else
 		ssp = NULL;
 	if (args->tsp != NULL) {
 		error = copyin(args->tsp, &lts, sizeof(lts));
 		if (error)
 			return (error);
 		error = linux_to_native_timespec(&uts, &lts);
 		if (error != 0)
 			return (error);
 
 		/* Remember the start time for the remaining-time update. */
 		nanotime(&ts0);
 		tsp = &uts;
 	} else
 		tsp = NULL;
 
 	error = kern_poll(td, args->fds, args->nfds, tsp, ssp);
 
 	if (error == 0 && args->tsp != NULL) {
 		if (td->td_retval[0]) {
 			/* Remaining = requested - (now - start), floored at 0. */
 			nanotime(&ts1);
-			timespecsub(&ts1, &ts0);
-			timespecsub(&uts, &ts1);
+			timespecsub(&ts1, &ts0, &ts1);
+			timespecsub(&uts, &ts1, &uts);
 			if (uts.tv_sec < 0)
 				timespecclear(&uts);
 		} else
 			timespecclear(&uts);
 
 		error = native_to_linux_timespec(&lts, &uts);
 		if (error == 0)
 			error = copyout(&lts, args->tsp, sizeof(lts));
 	}
 
 	return (error);
 }
 
 #if defined(DEBUG) || defined(KTR)
 /* XXX: can be removed when every ldebug(...) and KTR stuff are removed. */
 
 #ifdef COMPAT_LINUX32
 #define	L_MAXSYSCALL	LINUX32_SYS_MAXSYSCALL
 #else
 #define	L_MAXSYSCALL	LINUX_SYS_MAXSYSCALL
 #endif
 
 /* Per-syscall debug flags, manipulated with setbit()/clrbit(). */
 u_char linux_debug_map[howmany(L_MAXSYSCALL, sizeof(u_char))];
 
 /*
  * Update the debug map.  With 'global' set, every byte of the map is
  * filled with 0 (toggle non-zero) or 0xff (toggle zero).  Otherwise the
  * bit for 'syscall' is cleared (toggle non-zero) or set (toggle zero);
  * a syscall number outside [0, L_MAXSYSCALL) gives EINVAL.
  */
 static int
 linux_debug(int syscall, int toggle, int global)
 {
 
 	if (global != 0) {
 		memset(linux_debug_map, toggle != 0 ? 0 : 0xff,
 		    sizeof(linux_debug_map));
 		return (0);
 	}
 	if (syscall < 0 || syscall >= L_MAXSYSCALL)
 		return (EINVAL);
 	if (toggle == 0)
 		setbit(linux_debug_map, syscall);
 	else
 		clrbit(linux_debug_map, syscall);
 	return (0);
 }
 #undef L_MAXSYSCALL
 
 /*
  * Usage: sysctl linux.debug=<syscall_nr>.<0/1>
  *
  *    E.g.: sysctl linux.debug=21.0
  *
  * As a special case, syscall "all" will apply to all syscalls globally.
  */
 #define LINUX_MAX_DEBUGSTR	16
 int
 linux_sysctl_debug(SYSCTL_HANDLER_ARGS)
 {
 	char value[LINUX_MAX_DEBUGSTR], *p;
 	int error, sysc, toggle;
 	int global = 0;
 
 	value[0] = '\0';
 	error = sysctl_handle_string(oidp, value, LINUX_MAX_DEBUGSTR, req);
 	if (error || req->newptr == NULL)
 		return (error);
 	for (p = value; *p != '\0' && *p != '.'; p++);
 	if (*p == '\0')
 		return (EINVAL);
 	*p++ = '\0';
 	sysc = strtol(value, NULL, 0);
 	toggle = strtol(p, NULL, 0);
 	if (strcmp(value, "all") == 0)
 		global = 1;
 	error = linux_debug(sysc, toggle, global);
 	return (error);
 }
 
 #endif /* DEBUG || KTR */
 
 int
 linux_sched_rr_get_interval(struct thread *td,
     struct linux_sched_rr_get_interval_args *uap)
 {
 	struct timespec ts;
 	struct l_timespec lts;
 	struct thread *tdt;
 	int error;
 
 	/*
 	 * According to man in case the invalid pid specified
 	 * EINVAL should be returned.
 	 */
 	if (uap->pid < 0)
 		return (EINVAL);
 
 	tdt = linux_tdfind(td, uap->pid, -1);
 	if (tdt == NULL)
 		return (ESRCH);
 
 	error = kern_sched_rr_get_interval_td(td, tdt, &ts);
 	PROC_UNLOCK(tdt->td_proc);
 	if (error != 0)
 		return (error);
 	error = native_to_linux_timespec(&lts, &ts);
 	if (error != 0)
 		return (error);
 	return (copyout(&lts, uap->interval, sizeof(lts)));
 }
 
 /*
  * In case when the Linux thread is the initial thread in
  * the thread group thread id is equal to the process id.
  * Glibc depends on this magic (assert in pthread_getattr_np.c).
  *
  * Look up the thread for Linux thread id 'tid':
  *  - tid 0 or the caller's own td_tid yields 'td' itself;
  *  - a tid above PID_MAX is resolved with tdfind(tid, pid);
  *  - otherwise tid is treated as a process id and the threads of that
  *    process are searched for a matching em_tid.
  *
  * Returns NULL when nothing matches or the process is not a Linux ABI
  * process.  The callers in this file PROC_UNLOCK() the returned
  * thread's process, i.e. a non-NULL result is expected to come back
  * with that process locked (presumably tdfind() locks it as well;
  * verify against its definition).
  */
 struct thread *
 linux_tdfind(struct thread *td, lwpid_t tid, pid_t pid)
 {
 	struct linux_emuldata *em;
 	struct thread *tdt;
 	struct proc *p;
 
 	tdt = NULL;
 	if (tid == 0 || tid == td->td_tid) {
 		tdt = td;
 		PROC_LOCK(tdt->td_proc);
 	} else if (tid > PID_MAX)
 		tdt = tdfind(tid, pid);
 	else {
 		/*
 		 * Initial thread where the tid equal to the pid.
 		 */
 		p = pfind(tid);
 		if (p != NULL) {
 			if (SV_PROC_ABI(p) != SV_ABI_LINUX) {
 				/*
 				 * p is not a Linuxulator process.
 				 */
 				PROC_UNLOCK(p);
 				return (NULL);
 			}
 			FOREACH_THREAD_IN_PROC(p, tdt) {
 				em = em_find(tdt);
 				/* Match: return without unlocking p. */
 				if (tid == em->em_tid)
 					return (tdt);
 			}
 			PROC_UNLOCK(p);
 		}
 		return (NULL);
 	}
 
 	return (tdt);
 }
 
 /*
  * OR into *bsdopts the native wait option for every Linux wait option
  * bit present in 'options'.  Bits already set in *bsdopts are kept.
  */
 void
 linux_to_bsd_waitopts(int options, int *bsdopts)
 {
 
 	if ((options & LINUX_WNOHANG) != 0)
 		*bsdopts |= WNOHANG;
 	if ((options & LINUX_WUNTRACED) != 0)
 		*bsdopts |= WUNTRACED;
 	if ((options & LINUX_WEXITED) != 0)
 		*bsdopts |= WEXITED;
 	if ((options & LINUX_WCONTINUED) != 0)
 		*bsdopts |= WCONTINUED;
 	if ((options & LINUX_WNOWAIT) != 0)
 		*bsdopts |= WNOWAIT;
 
 	/* The clone flag has no LINUX_ prefixed name. */
 	if ((options & __WCLONE) != 0)
 		*bsdopts |= WLINUXCLONE;
 }
 
 int
 linux_getrandom(struct thread *td, struct linux_getrandom_args *args)
 {
 	struct uio uio;
 	struct iovec iov;
 	int error;
 
 	if (args->flags & ~(LINUX_GRND_NONBLOCK|LINUX_GRND_RANDOM))
 		return (EINVAL);
 	if (args->count > INT_MAX)
 		args->count = INT_MAX;
 
 	iov.iov_base = args->buf;
 	iov.iov_len = args->count;
 
 	uio.uio_iov = &iov;
 	uio.uio_iovcnt = 1;
 	uio.uio_resid = iov.iov_len;
 	uio.uio_segflg = UIO_USERSPACE;
 	uio.uio_rw = UIO_READ;
 	uio.uio_td = td;
 
 	error = read_random_uio(&uio, args->flags & LINUX_GRND_NONBLOCK);
 	if (error == 0)
 		td->td_retval[0] = args->count - uio.uio_resid;
 	return (error);
 }
 
 /*
  * Linux mincore(2): reject a start address with any PAGE_MASK bit set
  * with EINVAL, otherwise defer to kern_mincore().
  */
 int
 linux_mincore(struct thread *td, struct linux_mincore_args *args)
 {
 
 	/* Needs to be page-aligned */
 	if ((args->start & PAGE_MASK) != 0)
 		return (EINVAL);
 
 	return (kern_mincore(td, args->start, args->len, args->vec));
 }
Index: head/sys/compat/linux/linux_socket.c
===================================================================
--- head/sys/compat/linux/linux_socket.c	(revision 336913)
+++ head/sys/compat/linux/linux_socket.c	(revision 336914)
@@ -1,1764 +1,1764 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 1995 Søren Schmidt
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /* XXX we use functions that might not exist. */
 #include "opt_compat.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/proc.h>
 #include <sys/systm.h>
 #include <sys/sysproto.h>
 #include <sys/capsicum.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/syscallsubr.h>
 #include <sys/uio.h>
 #include <sys/syslog.h>
 #include <sys/un.h>
 
 #include <net/if.h>
 #include <net/vnet.h>
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/tcp.h>
 #ifdef INET6
 #include <netinet/ip6.h>
 #include <netinet6/ip6_var.h>
 #endif
 
 #ifdef COMPAT_LINUX32
 #include <machine/../linux32/linux.h>
 #include <machine/../linux32/linux32_proto.h>
 #else
 #include <machine/../linux/linux.h>
 #include <machine/../linux/linux_proto.h>
 #endif
 #include <compat/linux/linux_file.h>
 #include <compat/linux/linux_socket.h>
 #include <compat/linux/linux_timer.h>
 #include <compat/linux/linux_util.h>
 
 static int linux_to_bsd_domain(int);
 static int linux_sendmsg_common(struct thread *, l_int, struct l_msghdr *,
 					l_uint);
 static int linux_recvmsg_common(struct thread *, l_int, struct l_msghdr *,
 					l_uint, struct msghdr *);
 static int linux_set_socket_flags(int, int *);
 
 /*
  * Reads a Linux sockaddr and does any necessary translation.
  * Linux sockaddrs don't have a length field, only a family.
  * Copy the osockaddr structure pointed to by osa to kernel, adjust
  * family and convert to sockaddr.
  */
 static int
 linux_getsockaddr(struct sockaddr **sap, const struct osockaddr *osa, int salen)
 {
 	struct sockaddr *sa;
 	struct osockaddr *kosa;
 #ifdef INET6
 	struct sockaddr_in6 *sin6;
 	int oldv6size;
 #endif
 	char *name;
 	int bdom, error, hdrlen, namelen;
 
 	if (salen < 2 || salen > UCHAR_MAX || !osa)
 		return (EINVAL);
 
 #ifdef INET6
 	oldv6size = 0;
 	/*
 	 * Check for old (pre-RFC2553) sockaddr_in6. We may accept it
 	 * if it's a v4-mapped address, so reserve the proper space
 	 * for it.
 	 */
 	if (salen == sizeof(struct sockaddr_in6) - sizeof(uint32_t)) {
 		salen += sizeof(uint32_t);
 		oldv6size = 1;
 	}
 #endif
 
 	kosa = malloc(salen, M_SONAME, M_WAITOK);
 
 	if ((error = copyin(osa, kosa, salen)))
 		goto out;
 
 	bdom = linux_to_bsd_domain(kosa->sa_family);
 	if (bdom == -1) {
 		error = EAFNOSUPPORT;
 		goto out;
 	}
 
 #ifdef INET6
 	/*
 	 * Older Linux IPv6 code uses obsolete RFC2133 struct sockaddr_in6,
 	 * which lacks the scope id compared with RFC2553 one. If we detect
 	 * the situation, reject the address and write a message to system log.
 	 *
 	 * Still accept addresses for which the scope id is not used.
 	 */
 	if (oldv6size) {
 		if (bdom == AF_INET6) {
 			sin6 = (struct sockaddr_in6 *)kosa;
 			if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr) ||
 			    (!IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr) &&
 			     !IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr) &&
 			     !IN6_IS_ADDR_V4COMPAT(&sin6->sin6_addr) &&
 			     !IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) &&
 			     !IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr))) {
 				sin6->sin6_scope_id = 0;
 			} else {
 				log(LOG_DEBUG,
 				    "obsolete pre-RFC2553 sockaddr_in6 rejected\n");
 				error = EINVAL;
 				goto out;
 			}
 		} else
 			salen -= sizeof(uint32_t);
 	}
 #endif
 	if (bdom == AF_INET) {
 		if (salen < sizeof(struct sockaddr_in)) {
 			error = EINVAL;
 			goto out;
 		}
 		salen = sizeof(struct sockaddr_in);
 	}
 
 	if (bdom == AF_LOCAL && salen > sizeof(struct sockaddr_un)) {
 		hdrlen = offsetof(struct sockaddr_un, sun_path);
 		name = ((struct sockaddr_un *)kosa)->sun_path;
 		if (*name == '\0') {
 			/*
 			 * Linux abstract namespace starts with a NULL byte.
 			 * XXX We do not support abstract namespace yet.
 			 */
 			namelen = strnlen(name + 1, salen - hdrlen - 1) + 1;
 		} else
 			namelen = strnlen(name, salen - hdrlen);
 		salen = hdrlen + namelen;
 		if (salen > sizeof(struct sockaddr_un)) {
 			error = ENAMETOOLONG;
 			goto out;
 		}
 	}
 
 	sa = (struct sockaddr *)kosa;
 	sa->sa_family = bdom;
 	sa->sa_len = salen;
 
 	*sap = sa;
 	return (0);
 
 out:
 	free(kosa, M_SONAME);
 	return (error);
 }
 
 /*
  * Map a Linux address family number to the native one.
  * Returns -1 when the family has no entry in the table.
  */
 static int
 linux_to_bsd_domain(int domain)
 {
 	static const struct {
 		int	laf;	/* Linux family */
 		int	baf;	/* native family */
 	} afmap[] = {
 		{ LINUX_AF_UNSPEC,	AF_UNSPEC },
 		{ LINUX_AF_UNIX,	AF_LOCAL },
 		{ LINUX_AF_INET,	AF_INET },
 		{ LINUX_AF_INET6,	AF_INET6 },
 		{ LINUX_AF_AX25,	AF_CCITT },
 		{ LINUX_AF_IPX,		AF_IPX },
 		{ LINUX_AF_APPLETALK,	AF_APPLETALK },
 	};
 	size_t i;
 
 	for (i = 0; i < sizeof(afmap) / sizeof(afmap[0]); i++) {
 		if (afmap[i].laf == domain)
 			return (afmap[i].baf);
 	}
 	return (-1);
 }
 
 /*
  * Map a native address family number to the Linux one.
  * Returns -1 when the family has no entry in the table.
  */
 static int
 bsd_to_linux_domain(int domain)
 {
 	static const struct {
 		int	baf;	/* native family */
 		int	laf;	/* Linux family */
 	} afmap[] = {
 		{ AF_UNSPEC,	LINUX_AF_UNSPEC },
 		{ AF_LOCAL,	LINUX_AF_UNIX },
 		{ AF_INET,	LINUX_AF_INET },
 		{ AF_INET6,	LINUX_AF_INET6 },
 		{ AF_CCITT,	LINUX_AF_AX25 },
 		{ AF_IPX,	LINUX_AF_IPX },
 		{ AF_APPLETALK,	LINUX_AF_APPLETALK },
 	};
 	size_t i;
 
 	for (i = 0; i < sizeof(afmap) / sizeof(afmap[0]); i++) {
 		if (afmap[i].baf == domain)
 			return (afmap[i].laf);
 	}
 	return (-1);
 }
 
 /*
  * Convert a Linux socket option level to the native value.  Only
  * LINUX_SOL_SOCKET is remapped; any other level is returned unchanged.
  */
 static int
 linux_to_bsd_sockopt_level(int level)
 {
 
 	return (level == LINUX_SOL_SOCKET ? SOL_SOCKET : level);
 }
 
 /*
  * Convert a native socket option level to the Linux value.  Only
  * SOL_SOCKET is remapped; any other level is returned unchanged.
  */
 static int
 bsd_to_linux_sockopt_level(int level)
 {
 
 	return (level == SOL_SOCKET ? LINUX_SOL_SOCKET : level);
 }
 
 /*
  * Map a Linux IP-level socket option name to the native one.
  * Returns -1 for options that are not in the table.
  */
 static int
 linux_to_bsd_ip_sockopt(int opt)
 {
 	static const struct {
 		int	lopt;	/* Linux option */
 		int	bopt;	/* native option */
 	} ipmap[] = {
 		{ LINUX_IP_TOS,			IP_TOS },
 		{ LINUX_IP_TTL,			IP_TTL },
 		{ LINUX_IP_OPTIONS,		IP_OPTIONS },
 		{ LINUX_IP_MULTICAST_IF,	IP_MULTICAST_IF },
 		{ LINUX_IP_MULTICAST_TTL,	IP_MULTICAST_TTL },
 		{ LINUX_IP_MULTICAST_LOOP,	IP_MULTICAST_LOOP },
 		{ LINUX_IP_ADD_MEMBERSHIP,	IP_ADD_MEMBERSHIP },
 		{ LINUX_IP_DROP_MEMBERSHIP,	IP_DROP_MEMBERSHIP },
 		{ LINUX_IP_HDRINCL,		IP_HDRINCL },
 	};
 	size_t i;
 
 	for (i = 0; i < sizeof(ipmap) / sizeof(ipmap[0]); i++) {
 		if (ipmap[i].lopt == opt)
 			return (ipmap[i].bopt);
 	}
 	return (-1);
 }
 
 /*
  * Map a Linux IPv6-level socket option name to the native one.
  * Returns -1 for options that are not in the table.  The entries under
  * "#if 0" are compiled out and therefore also yield -1.
  */
 static int
 linux_to_bsd_ip6_sockopt(int opt)
 {
 	static const struct {
 		int	lopt;	/* Linux option */
 		int	bopt;	/* native option */
 	} ip6map[] = {
 		{ LINUX_IPV6_NEXTHOP,		IPV6_NEXTHOP },
 		{ LINUX_IPV6_UNICAST_HOPS,	IPV6_UNICAST_HOPS },
 		{ LINUX_IPV6_MULTICAST_IF,	IPV6_MULTICAST_IF },
 		{ LINUX_IPV6_MULTICAST_HOPS,	IPV6_MULTICAST_HOPS },
 		{ LINUX_IPV6_MULTICAST_LOOP,	IPV6_MULTICAST_LOOP },
 		{ LINUX_IPV6_ADD_MEMBERSHIP,	IPV6_JOIN_GROUP },
 		{ LINUX_IPV6_DROP_MEMBERSHIP,	IPV6_LEAVE_GROUP },
 		{ LINUX_IPV6_V6ONLY,		IPV6_V6ONLY },
 		{ LINUX_IPV6_DONTFRAG,		IPV6_DONTFRAG },
 #if 0
 		{ LINUX_IPV6_CHECKSUM,		IPV6_CHECKSUM },
 		{ LINUX_IPV6_RECVPKTINFO,	IPV6_RECVPKTINFO },
 		{ LINUX_IPV6_PKTINFO,		IPV6_PKTINFO },
 		{ LINUX_IPV6_RECVHOPLIMIT,	IPV6_RECVHOPLIMIT },
 		{ LINUX_IPV6_HOPLIMIT,		IPV6_HOPLIMIT },
 		{ LINUX_IPV6_RECVHOPOPTS,	IPV6_RECVHOPOPTS },
 		{ LINUX_IPV6_HOPOPTS,		IPV6_HOPOPTS },
 		{ LINUX_IPV6_RTHDRDSTOPTS,	IPV6_RTHDRDSTOPTS },
 		{ LINUX_IPV6_RECVRTHDR,		IPV6_RECVRTHDR },
 		{ LINUX_IPV6_RTHDR,		IPV6_RTHDR },
 		{ LINUX_IPV6_RECVDSTOPTS,	IPV6_RECVDSTOPTS },
 		{ LINUX_IPV6_DSTOPTS,		IPV6_DSTOPTS },
 		{ LINUX_IPV6_RECVPATHMTU,	IPV6_RECVPATHMTU },
 		{ LINUX_IPV6_PATHMTU,		IPV6_PATHMTU },
 #endif
 	};
 	size_t i;
 
 	for (i = 0; i < sizeof(ip6map) / sizeof(ip6map[0]); i++) {
 		if (ip6map[i].lopt == opt)
 			return (ip6map[i].bopt);
 	}
 	return (-1);
 }
 
 /*
  * Map a Linux SOL_SOCKET option name to the native one.
  * Returns -1 for options that are not in the table.  Note that
  * LINUX_SO_PEERCRED maps to LOCAL_PEERCRED rather than to an SO_*
  * option.
  */
 static int
 linux_to_bsd_so_sockopt(int opt)
 {
 	static const struct {
 		int	lopt;	/* Linux option */
 		int	bopt;	/* native option */
 	} somap[] = {
 		{ LINUX_SO_DEBUG,	SO_DEBUG },
 		{ LINUX_SO_REUSEADDR,	SO_REUSEADDR },
 		{ LINUX_SO_TYPE,	SO_TYPE },
 		{ LINUX_SO_ERROR,	SO_ERROR },
 		{ LINUX_SO_DONTROUTE,	SO_DONTROUTE },
 		{ LINUX_SO_BROADCAST,	SO_BROADCAST },
 		{ LINUX_SO_SNDBUF,	SO_SNDBUF },
 		{ LINUX_SO_RCVBUF,	SO_RCVBUF },
 		{ LINUX_SO_KEEPALIVE,	SO_KEEPALIVE },
 		{ LINUX_SO_OOBINLINE,	SO_OOBINLINE },
 		{ LINUX_SO_LINGER,	SO_LINGER },
 		{ LINUX_SO_PEERCRED,	LOCAL_PEERCRED },
 		{ LINUX_SO_RCVLOWAT,	SO_RCVLOWAT },
 		{ LINUX_SO_SNDLOWAT,	SO_SNDLOWAT },
 		{ LINUX_SO_RCVTIMEO,	SO_RCVTIMEO },
 		{ LINUX_SO_SNDTIMEO,	SO_SNDTIMEO },
 		{ LINUX_SO_TIMESTAMP,	SO_TIMESTAMP },
 		{ LINUX_SO_ACCEPTCONN,	SO_ACCEPTCONN },
 	};
 	size_t i;
 
 	for (i = 0; i < sizeof(somap) / sizeof(somap[0]); i++) {
 		if (somap[i].lopt == opt)
 			return (somap[i].bopt);
 	}
 	return (-1);
 }
 
 /*
  * Map a Linux TCP-level socket option name to the native one.
  * Returns -1 for options that are not in the table.
  */
 static int
 linux_to_bsd_tcp_sockopt(int opt)
 {
 	static const struct {
 		int	lopt;	/* Linux option */
 		int	bopt;	/* native option */
 	} tcpmap[] = {
 		{ LINUX_TCP_NODELAY,	TCP_NODELAY },
 		{ LINUX_TCP_MAXSEG,	TCP_MAXSEG },
 		{ LINUX_TCP_KEEPIDLE,	TCP_KEEPIDLE },
 		{ LINUX_TCP_KEEPINTVL,	TCP_KEEPINTVL },
 		{ LINUX_TCP_KEEPCNT,	TCP_KEEPCNT },
 		{ LINUX_TCP_MD5SIG,	TCP_MD5SIG },
 	};
 	size_t i;
 
 	for (i = 0; i < sizeof(tcpmap) / sizeof(tcpmap[0]); i++) {
 		if (tcpmap[i].lopt == opt)
 			return (tcpmap[i].bopt);
 	}
 	return (-1);
 }
 
 /*
  * Translate Linux MSG_* flag bits into native MSG_* bits.
  *
  * Returns the OR of the native equivalents of every recognized bit set
  * in 'flags'.  LINUX_MSG_PROXY, LINUX_MSG_FIN, LINUX_MSG_SYN,
  * LINUX_MSG_CONFIRM, LINUX_MSG_RST and LINUX_MSG_ERRQUEUE are not
  * handled and, like any other unknown bit, are dropped.
  */
 static int
 linux_to_bsd_msg_flags(int flags)
 {
 	static const struct {
 		int	lflag;	/* Linux flag */
 		int	bflag;	/* native flag */
 	} msgmap[] = {
 		{ LINUX_MSG_OOB,	MSG_OOB },
 		{ LINUX_MSG_PEEK,	MSG_PEEK },
 		{ LINUX_MSG_DONTROUTE,	MSG_DONTROUTE },
 		{ LINUX_MSG_CTRUNC,	MSG_CTRUNC },
 		{ LINUX_MSG_TRUNC,	MSG_TRUNC },
 		{ LINUX_MSG_DONTWAIT,	MSG_DONTWAIT },
 		{ LINUX_MSG_EOR,	MSG_EOR },
 		{ LINUX_MSG_WAITALL,	MSG_WAITALL },
 		{ LINUX_MSG_NOSIGNAL,	MSG_NOSIGNAL },
 	};
 	size_t i;
 	int bflags;
 
 	bflags = 0;
 	for (i = 0; i < sizeof(msgmap) / sizeof(msgmap[0]); i++) {
 		if ((flags & msgmap[i].lflag) != 0)
 			bflags |= msgmap[i].bflag;
 	}
 	return (bflags);
 }
 
 /*
  * If bsd_to_linux_sockaddr() or linux_to_bsd_sockaddr() faults, then the
  * native syscall will fault as well.  Thus, we don't really need to check
  * the return values of these functions.
  */
 
 /*
  * Rewrite, in place, the header of the native socket address stored at
  * the userspace pointer 'arg' so that it starts with a Linux family
  * value.
  *
  * Returns 0 on success, the copyin()/copyout() error on a fault, or
  * EAFNOSUPPORT when bsd_to_linux_domain() has no mapping for the family.
  */
 static int
 bsd_to_linux_sockaddr(struct sockaddr *arg)
 {
 	struct sockaddr sa;
 	size_t sa_len = sizeof(struct sockaddr);
 	int error, bdom;
 
 	/*
 	 * A fixed sizeof(struct sockaddr) bytes are transferred in both
 	 * directions, regardless of how long the user's address really is.
 	 */
 	if ((error = copyin(arg, &sa, sa_len)))
 		return (error);
 
 	/* Despite its name, bdom holds the Linux family from here on. */
 	bdom = bsd_to_linux_domain(sa.sa_family);
 	if (bdom == -1)
 		return (EAFNOSUPPORT);
 
 	/*
 	 * Store the Linux family over the leading u_short of the
 	 * structure, i.e. over the storage that began with sa_len and
 	 * sa_family.
 	 */
 	*(u_short *)&sa = bdom;
 	return (copyout(&sa, arg, sa_len));
 }
 
 /*
  * Rewrite, in place, the header of the Linux socket address stored at
  * the userspace pointer 'arg' so that it carries native sa_family and
  * sa_len fields.  'len' is stored as the new sa_len.
  *
  * Returns 0 on success, the copyin()/copyout() error on a fault, or
  * EAFNOSUPPORT when linux_to_bsd_domain() has no mapping for the family.
  */
 static int
 linux_to_bsd_sockaddr(struct sockaddr *arg, int len)
 {
 	struct sockaddr sa;
 	size_t sa_len = sizeof(struct sockaddr);
 	int error, bdom;
 
 	/*
 	 * A fixed sizeof(struct sockaddr) bytes are transferred in both
 	 * directions, regardless of 'len'.
 	 */
 	if ((error = copyin(arg, &sa, sa_len)))
 		return (error);
 
 	/* The Linux family is read from the start of the structure. */
 	bdom = linux_to_bsd_domain(*(sa_family_t *)&sa);
 	if (bdom == -1)
 		return (EAFNOSUPPORT);
 
 	sa.sa_family = bdom;
 	sa.sa_len = len;
 	return (copyout(&sa, arg, sa_len));
 }
 
 static int
 linux_sa_put(struct osockaddr *osa)
 {
 	struct osockaddr sa;
 	int error, bdom;
 
 	/*
 	 * Only read/write the osockaddr family part, the rest is
 	 * not changed.
 	 */
 	error = copyin(osa, &sa, sizeof(sa.sa_family));
 	if (error != 0)
 		return (error);
 
 	bdom = bsd_to_linux_domain(sa.sa_family);
 	if (bdom == -1)
 		return (EINVAL);
 
 	sa.sa_family = bdom;
 	return (copyout(&sa, osa, sizeof(sa.sa_family)));
 }
 
 /*
  * Map a Linux control message type to the native one.
  * Returns -1 for anything other than LINUX_SCM_RIGHTS and
  * LINUX_SCM_CREDENTIALS.
  */
 static int
 linux_to_bsd_cmsg_type(int cmsg_type)
 {
 
 	if (cmsg_type == LINUX_SCM_RIGHTS)
 		return (SCM_RIGHTS);
 	if (cmsg_type == LINUX_SCM_CREDENTIALS)
 		return (SCM_CREDS);
 	return (-1);
 }
 
 /*
  * Map a native control message type to the Linux one.
  * Returns -1 for anything other than SCM_RIGHTS, SCM_CREDS and
  * SCM_TIMESTAMP.
  */
 static int
 bsd_to_linux_cmsg_type(int cmsg_type)
 {
 
 	if (cmsg_type == SCM_RIGHTS)
 		return (LINUX_SCM_RIGHTS);
 	if (cmsg_type == SCM_CREDS)
 		return (LINUX_SCM_CREDENTIALS);
 	if (cmsg_type == SCM_TIMESTAMP)
 		return (LINUX_SCM_TIMESTAMP);
 	return (-1);
 }
 
 /*
  * Fill a native msghdr from a Linux one that has already been copied
  * into the kernel.
  *
  * Returns ENOBUFS, without touching *bhdr, if the Linux msg_controllen
  * exceeds INT_MAX; otherwise returns 0.
  *
  * bhdr->msg_controllen is deliberately NOT set: Linux and native
  * control messages may differ in size (the credentials structure used
  * by SCM_CREDS, for example), so the caller sets it, if needed, once
  * the control messages have been converted.
  */
 static int
 linux_to_bsd_msghdr(struct msghdr *bhdr, const struct l_msghdr *lhdr)
 {
 
 	if (lhdr->msg_controllen > INT_MAX)
 		return (ENOBUFS);
 
 	/* Pointer members go through PTRIN(). */
 	bhdr->msg_name = PTRIN(lhdr->msg_name);
 	bhdr->msg_iov = PTRIN(lhdr->msg_iov);
 	bhdr->msg_control = PTRIN(lhdr->msg_control);
 
 	/* Scalar members are copied, flags are translated. */
 	bhdr->msg_namelen = lhdr->msg_namelen;
 	bhdr->msg_iovlen = lhdr->msg_iovlen;
 	bhdr->msg_flags = linux_to_bsd_msg_flags(lhdr->msg_flags);
 
 	return (0);
 }
 
 /*
  * Fill a Linux msghdr from a native one.  Always returns 0.
  *
  * Neither msg_controllen nor msg_flags is written.  msg_controllen is
  * left to the caller because Linux and native control messages may
  * differ in size (the credentials structure used by SCM_CREDS, for
  * example); it can be set after the control messages are converted.
  */
 static int
 bsd_to_linux_msghdr(const struct msghdr *bhdr, struct l_msghdr *lhdr)
 {
 
 	/* Pointer members go through PTROUT(). */
 	lhdr->msg_name = PTROUT(bhdr->msg_name);
 	lhdr->msg_iov = PTROUT(bhdr->msg_iov);
 	lhdr->msg_control = PTROUT(bhdr->msg_control);
 
 	lhdr->msg_namelen = bhdr->msg_namelen;
 	lhdr->msg_iovlen = bhdr->msg_iovlen;
 
 	return (0);
 }
 
 /*
  * Translate the Linux SOCK_NONBLOCK/SOCK_CLOEXEC creation flags.
  *
  * lflags: Linux flag bits (the socket type bits already masked off).
  * flags:  native value that the matching SOCK_NONBLOCK/SOCK_CLOEXEC
  *         bits are OR-ed into.
  *
  * Returns EINVAL, leaving *flags untouched, if any other bit is set in
  * lflags; otherwise returns 0.
  */
 static int
 linux_set_socket_flags(int lflags, int *flags)
 {
 	int bflags;
 
 	if ((lflags & ~(LINUX_SOCK_CLOEXEC | LINUX_SOCK_NONBLOCK)) != 0)
 		return (EINVAL);
 
 	bflags = 0;
 	if ((lflags & LINUX_SOCK_NONBLOCK) != 0)
 		bflags |= SOCK_NONBLOCK;
 	if ((lflags & LINUX_SOCK_CLOEXEC) != 0)
 		bflags |= SOCK_CLOEXEC;
 	*flags |= bflags;
 
 	return (0);
 }
 
 static int
 linux_sendit(struct thread *td, int s, struct msghdr *mp, int flags,
     struct mbuf *control, enum uio_seg segflg)
 {
 	struct sockaddr *to;
 	int error;
 
 	if (mp->msg_name != NULL) {
 		error = linux_getsockaddr(&to, mp->msg_name, mp->msg_namelen);
 		if (error != 0)
 			return (error);
 		mp->msg_name = to;
 	} else
 		to = NULL;
 
 	error = kern_sendit(td, s, mp, linux_to_bsd_msg_flags(flags), control,
 	    segflg);
 
 	if (to)
 		free(to, M_SONAME);
 	return (error);
 }
 
 /*
  * Query IP_HDRINCL on socket 's'.
  *
  * Returns 0 if the option is set.  A non-zero return means either that
  * the option is clear (1) or that kern_getsockopt() failed (its error
  * code); callers only distinguish zero from non-zero.
  */
 static int
 linux_check_hdrincl(struct thread *td, int s)
 {
 	socklen_t optlen;
 	int error, hdrincl;
 
 	optlen = sizeof(hdrincl);
 	error = kern_getsockopt(td, s, IPPROTO_IP, IP_HDRINCL,
 	    &hdrincl, UIO_SYSSPACE, &optlen);
 	if (error != 0)
 		return (error);
 
 	return (hdrincl == 0 ? 1 : 0);
 }
 
 /*
  * Updated sendto() when IP_HDRINCL is set:
  * tweak endian-dependent fields in the IP packet.
  */
 static int
 linux_sendto_hdrincl(struct thread *td, struct linux_sendto_args *linux_args)
 {
 /*
  * linux_ip_copysize defines how many bytes we should copy
  * from the beginning of the IP packet before we customize it for BSD.
  * It should include all the fields we modify (ip_len and ip_off).
  */
 #define linux_ip_copysize	8
 
 	struct ip *packet;
 	struct msghdr msg;
 	struct iovec aiov[1];
 	int error;
 
 	/* Check that the packet isn't too big or too small. */
 	if (linux_args->len < linux_ip_copysize ||
 	    linux_args->len > IP_MAXPACKET)
 		return (EINVAL);
 
 	packet = (struct ip *)malloc(linux_args->len, M_LINUX, M_WAITOK);
 
 	/* Make kernel copy of the packet to be sent */
 	if ((error = copyin(PTRIN(linux_args->msg), packet,
 	    linux_args->len)))
 		goto goout;
 
 	/* Convert fields from Linux to BSD raw IP socket format */
 	packet->ip_len = linux_args->len;
 	packet->ip_off = ntohs(packet->ip_off);
 
 	/* Prepare the msghdr and iovec structures describing the new packet */
 	msg.msg_name = PTRIN(linux_args->to);
 	msg.msg_namelen = linux_args->tolen;
 	msg.msg_iov = aiov;
 	msg.msg_iovlen = 1;
 	msg.msg_control = NULL;
 	msg.msg_flags = 0;
 	aiov[0].iov_base = (char *)packet;
 	aiov[0].iov_len = linux_args->len;
 	error = linux_sendit(td, linux_args->s, &msg, linux_args->flags,
 	    NULL, UIO_SYSSPACE);
 goout:
 	free(packet, M_LINUX);
 	return (error);
 }
 
 int
 linux_socket(struct thread *td, struct linux_socket_args *args)
 {
 	int domain, retval_socket, type;
 
 	type = args->type & LINUX_SOCK_TYPE_MASK;
 	if (type < 0 || type > LINUX_SOCK_MAX)
 		return (EINVAL);
 	retval_socket = linux_set_socket_flags(args->type & ~LINUX_SOCK_TYPE_MASK,
 		&type);
 	if (retval_socket != 0)
 		return (retval_socket);
 	domain = linux_to_bsd_domain(args->domain);
 	if (domain == -1)
 		return (EAFNOSUPPORT);
 
 	retval_socket = kern_socket(td, domain, type, args->protocol);
 	if (retval_socket)
 		return (retval_socket);
 
 	if (type == SOCK_RAW
 	    && (args->protocol == IPPROTO_RAW || args->protocol == 0)
 	    && domain == PF_INET) {
 		/* It's a raw IP socket: set the IP_HDRINCL option. */
 		int hdrincl;
 
 		hdrincl = 1;
 		/* We ignore any error returned by kern_setsockopt() */
 		kern_setsockopt(td, td->td_retval[0], IPPROTO_IP, IP_HDRINCL,
 		    &hdrincl, UIO_SYSSPACE, sizeof(hdrincl));
 	}
 #ifdef INET6
 	/*
 	 * Linux AF_INET6 socket has IPV6_V6ONLY setsockopt set to 0 by default
 	 * and some apps depend on this. So, set V6ONLY to 0 for Linux apps.
 	 * For simplicity we do this unconditionally of the net.inet6.ip6.v6only
 	 * sysctl value.
 	 */
 	if (domain == PF_INET6) {
 		int v6only;
 
 		v6only = 0;
 		/* We ignore any error returned by setsockopt() */
 		kern_setsockopt(td, td->td_retval[0], IPPROTO_IPV6, IPV6_V6ONLY,
 		    &v6only, UIO_SYSSPACE, sizeof(v6only));
 	}
 #endif
 
 	return (retval_socket);
 }
 
 int
 linux_bind(struct thread *td, struct linux_bind_args *args)
 {
 	struct sockaddr *sa;
 	int error;
 
 	error = linux_getsockaddr(&sa, PTRIN(args->name),
 	    args->namelen);
 	if (error != 0)
 		return (error);
 
 	error = kern_bindat(td, AT_FDCWD, args->s, sa);
 	free(sa, M_SONAME);
 	if (error == EADDRNOTAVAIL && args->namelen != sizeof(struct sockaddr_in))
 		return (EINVAL);
 	return (error);
 }
 
 int
 linux_connect(struct thread *td, struct linux_connect_args *args)
 {
 	struct socket *so;
 	struct sockaddr *sa;
 	struct file *fp;
 	u_int fflag;
 	int error;
 
 	error = linux_getsockaddr(&sa, (struct osockaddr *)PTRIN(args->name),
 	    args->namelen);
 	if (error != 0)
 		return (error);
 
 	error = kern_connectat(td, AT_FDCWD, args->s, sa);
 	free(sa, M_SONAME);
 	if (error != EISCONN)
 		return (error);
 
 	/*
 	 * Linux doesn't return EISCONN the first time it occurs,
 	 * when on a non-blocking socket. Instead it returns the
 	 * error getsockopt(SOL_SOCKET, SO_ERROR) would return on BSD.
 	 */
 	error = getsock_cap(td, args->s, &cap_connect_rights,
 	    &fp, &fflag, NULL);
 	if (error != 0)
 		return (error);
 
 	error = EISCONN;
 	so = fp->f_data;
 	if (fflag & FNONBLOCK) {
 		SOCK_LOCK(so);
 		if (so->so_emuldata == 0)
 			error = so->so_error;
 		so->so_emuldata = (void *)1;
 		SOCK_UNLOCK(so);
 	}
 	fdrop(fp, td);
 
 	return (error);
 }
 
 int
 linux_listen(struct thread *td, struct linux_listen_args *args)
 {
 
 	return (kern_listen(td, args->s, args->backlog));
 }
 
 /*
  * Shared implementation of Linux accept() and accept4().
  *
  * s:       listening socket descriptor.
  * addr:    userspace address of the buffer for the peer address (may
  *          be 0).
  * namelen: userspace address of the in/out address length.
  * flags:   Linux SOCK_NONBLOCK/SOCK_CLOEXEC flags for the new socket.
  *
  * Returns 0 with the new descriptor in td->td_retval[0], or an errno
  * value after the Linux-specific remapping done below.
  */
 static int
 linux_accept_common(struct thread *td, int s, l_uintptr_t addr,
     l_uintptr_t namelen, int flags)
 {
 	struct accept4_args /* {
 		int	s;
 		struct sockaddr * __restrict name;
 		socklen_t * __restrict anamelen;
 		int	flags;
 	} */ bsd_args;
 	struct socket *so;
 	struct file *fp;
 	int error, error1;
 
 	bsd_args.s = s;
 	bsd_args.name = (struct sockaddr * __restrict)PTRIN(addr);
 	bsd_args.anamelen = PTRIN(namelen);
 	bsd_args.flags = 0;
 	error = linux_set_socket_flags(flags, &bsd_args.flags);
 	if (error != 0)
 		return (error);
 	error = sys_accept4(td, &bsd_args);
 	/*
 	 * The address header is converted unconditionally (even if the
 	 * accept failed or no address buffer was given) and the result
 	 * is ignored.
 	 */
 	bsd_to_linux_sockaddr((struct sockaddr *)bsd_args.name);
 	if (error != 0) {
 		/*
 		 * NOTE(review): 'namelen' is the user address of the
 		 * length, not the length itself, so this compares a
 		 * pointer value with a size -- confirm intent.
 		 */
 		if (error == EFAULT && namelen != sizeof(struct sockaddr_in))
 			return (EINVAL);
 		if (error == EINVAL) {
 			/* EINVAL on a datagram socket becomes EOPNOTSUPP. */
 			error1 = getsock_cap(td, s, &cap_accept_rights, &fp, NULL, NULL);
 			if (error1 != 0)
 				return (error1);
 			so = fp->f_data;
 			if (so->so_type == SOCK_DGRAM) {
 				fdrop(fp, td);
 				return (EOPNOTSUPP);
 			}
 			fdrop(fp, td);
 		}
 		return (error);
 	}
 	/*
 	 * NOTE(review): the family was already translated by
 	 * bsd_to_linux_sockaddr() above; linux_sa_put() runs
 	 * bsd_to_linux_domain() on it a second time -- confirm this is
 	 * harmless for every supported family.
 	 */
 	if (addr)
 		error = linux_sa_put(PTRIN(addr));
 	if (error != 0) {
 		/* Do not leak the freshly accepted descriptor. */
 		(void)kern_close(td, td->td_retval[0]);
 		td->td_retval[0] = 0;
 	}
 	return (error);
 }
 
 int
 linux_accept(struct thread *td, struct linux_accept_args *args)
 {
 
 	return (linux_accept_common(td, args->s, args->addr,
 	    args->namelen, 0));
 }
 
 int
 linux_accept4(struct thread *td, struct linux_accept4_args *args)
 {
 
 	return (linux_accept_common(td, args->s, args->addr,
 	    args->namelen, args->flags));
 }
 
 int
 linux_getsockname(struct thread *td, struct linux_getsockname_args *args)
 {
 	struct getsockname_args /* {
 		int	fdes;
 		struct sockaddr * __restrict asa;
 		socklen_t * __restrict alen;
 	} */ bsd_args;
 	int error;
 
 	bsd_args.fdes = args->s;
 	bsd_args.asa = (struct sockaddr * __restrict)PTRIN(args->addr);
 	bsd_args.alen = PTRIN(args->namelen);
 	error = sys_getsockname(td, &bsd_args);
 	bsd_to_linux_sockaddr((struct sockaddr *)bsd_args.asa);
 	if (error != 0)
 		return (error);
 	return (linux_sa_put(PTRIN(args->addr)));
 }
 
 int
 linux_getpeername(struct thread *td, struct linux_getpeername_args *args)
 {
 	struct getpeername_args /* {
 		int fdes;
 		caddr_t asa;
 		int *alen;
 	} */ bsd_args;
 	int error;
 
 	bsd_args.fdes = args->s;
 	bsd_args.asa = (struct sockaddr *)PTRIN(args->addr);
 	bsd_args.alen = (socklen_t *)PTRIN(args->namelen);
 	error = sys_getpeername(td, &bsd_args);
 	bsd_to_linux_sockaddr((struct sockaddr *)bsd_args.asa);
 	if (error != 0)
 		return (error);
 	return (linux_sa_put(PTRIN(args->addr)));
 }
 
 int
 linux_socketpair(struct thread *td, struct linux_socketpair_args *args)
 {
 	struct socketpair_args /* {
 		int domain;
 		int type;
 		int protocol;
 		int *rsv;
 	} */ bsd_args;
 	int error;
 
 	bsd_args.domain = linux_to_bsd_domain(args->domain);
 	if (bsd_args.domain != PF_LOCAL)
 		return (EAFNOSUPPORT);
 	bsd_args.type = args->type & LINUX_SOCK_TYPE_MASK;
 	if (bsd_args.type < 0 || bsd_args.type > LINUX_SOCK_MAX)
 		return (EINVAL);
 	error = linux_set_socket_flags(args->type & ~LINUX_SOCK_TYPE_MASK,
 		&bsd_args.type);
 	if (error != 0)
 		return (error);
 	if (args->protocol != 0 && args->protocol != PF_UNIX)
 
 		/*
 		 * Use of PF_UNIX as protocol argument is not right,
 		 * but Linux does it.
 		 * Do not map PF_UNIX as its Linux value is identical
 		 * to FreeBSD one.
 		 */
 		return (EPROTONOSUPPORT);
 	else
 		bsd_args.protocol = 0;
 	bsd_args.rsv = (int *)PTRIN(args->rsv);
 	return (sys_socketpair(td, &bsd_args));
 }
 
 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
 /* Arguments of the Linux send() call as unpacked by socketcall. */
 struct linux_send_args {
 	register_t s;
 	register_t msg;
 	register_t len;
 	register_t flags;
 };
 
 /*
  * Linux send(): implemented as sendto() with no destination address.
  *
  * The Linux MSG_* flags are translated with linux_to_bsd_msg_flags()
  * before being handed to sys_sendto(), just as linux_recv() and
  * linux_sendit() do; the numeric values of the two flag sets are not
  * assumed to coincide.
  */
 static int
 linux_send(struct thread *td, struct linux_send_args *args)
 {
 	struct sendto_args /* {
 		int s;
 		caddr_t buf;
 		int len;
 		int flags;
 		caddr_t to;
 		int tolen;
 	} */ bsd_args;
 
 	bsd_args.s = args->s;
 	bsd_args.buf = (caddr_t)PTRIN(args->msg);
 	bsd_args.len = args->len;
 	bsd_args.flags = linux_to_bsd_msg_flags(args->flags);
 	bsd_args.to = NULL;
 	bsd_args.tolen = 0;
 	return (sys_sendto(td, &bsd_args));
 }
 
 struct linux_recv_args {
 	register_t s;
 	register_t msg;
 	register_t len;
 	register_t flags;
 };
 
 static int
 linux_recv(struct thread *td, struct linux_recv_args *args)
 {
 	struct recvfrom_args /* {
 		int s;
 		caddr_t buf;
 		int len;
 		int flags;
 		struct sockaddr *from;
 		socklen_t fromlenaddr;
 	} */ bsd_args;
 
 	bsd_args.s = args->s;
 	bsd_args.buf = (caddr_t)PTRIN(args->msg);
 	bsd_args.len = args->len;
 	bsd_args.flags = linux_to_bsd_msg_flags(args->flags);
 	bsd_args.from = NULL;
 	bsd_args.fromlenaddr = 0;
 	return (sys_recvfrom(td, &bsd_args));
 }
 #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
 
 /*
  * Linux sendto() entry point.
  *
  * When linux_check_hdrincl() returns 0 for the socket, the request is
  * handed to linux_sendto_hdrincl().  Otherwise the user buffer is
  * described by a single iovec and sent through linux_sendit().
  */
 int
 linux_sendto(struct thread *td, struct linux_sendto_args *args)
 {
 	struct iovec iov;
 	struct msghdr bmsg;
 
 	/* IP_HDRINCL set, tweak the packet before sending */
 	if (linux_check_hdrincl(td, args->s) == 0)
 		return (linux_sendto_hdrincl(td, args));
 
 	iov.iov_base = PTRIN(args->msg);
 	iov.iov_len = args->len;
 
 	bmsg.msg_name = PTRIN(args->to);
 	bmsg.msg_namelen = args->tolen;
 	bmsg.msg_iov = &iov;
 	bmsg.msg_iovlen = 1;
 	bmsg.msg_control = NULL;
 	bmsg.msg_flags = 0;
 
 	return (linux_sendit(td, args->s, &bmsg, args->flags, NULL,
 	    UIO_USERSPACE));
 }
 
 /*
  * Linux recvfrom() entry point.
  *
  * Reads the caller's address length (if a length pointer was given),
  * receives into the user buffer with kern_recvit(), converts the
  * returned source address to Linux format and writes the updated
  * address length back.
  *
  * Returns EINVAL for a negative address length, otherwise the first
  * copyin()/kern_recvit()/conversion error or the result of the final
  * copyout().
  */
 int
 linux_recvfrom(struct thread *td, struct linux_recvfrom_args *args)
 {
 	struct msghdr msg;
 	struct iovec aiov;
 	int error, fromlen;
 
 	if (PTRIN(args->fromlen) != NULL) {
 		error = copyin(PTRIN(args->fromlen), &fromlen,
 		    sizeof(fromlen));
 		if (error != 0)
 			return (error);
 		if (fromlen < 0)
 			return (EINVAL);
 		msg.msg_namelen = fromlen;
 	} else
 		msg.msg_namelen = 0;
 
 	msg.msg_name = (struct sockaddr * __restrict)PTRIN(args->from);
 	msg.msg_iov = &aiov;
 	msg.msg_iovlen = 1;
 	aiov.iov_base = PTRIN(args->buf);
 	aiov.iov_len = args->len;
 	msg.msg_control = 0;
 	msg.msg_flags = linux_to_bsd_msg_flags(args->flags);
 
 	/* No control data is requested (last argument is NULL). */
 	error = kern_recvit(td, args->s, &msg, UIO_USERSPACE, NULL);
 	if (error != 0)
 		return (error);
 
 	if (PTRIN(args->from) != NULL) {
 		error = bsd_to_linux_sockaddr((struct sockaddr *)
 		    PTRIN(args->from));
 		if (error != 0)
 			return (error);
 
 		/*
 		 * NOTE(review): the family was already translated just
 		 * above; linux_sa_put() runs bsd_to_linux_domain() on it
 		 * a second time -- confirm this is intended.
 		 */
 		error = linux_sa_put((struct osockaddr *)
 		    PTRIN(args->from));
 	}
 
 	/*
 	 * NOTE(review): when a length pointer is supplied, the result of
 	 * linux_sa_put() above is overwritten by this copyout().
 	 */
 	if (PTRIN(args->fromlen) != NULL)
 		error = copyout(&msg.msg_namelen, PTRIN(args->fromlen),
 		    sizeof(msg.msg_namelen));
 
 	return (error);
 }
 
 /*
  * Shared implementation of Linux sendmsg() and sendmmsg().
  *
  * td:     calling thread.
  * s:      socket descriptor.
  * msghdr: userspace pointer to the Linux msghdr.
  * flags:  Linux MSG_* flags (translated later by linux_sendit()).
  *
  * Copies in the Linux msghdr and iovec array, converts each Linux
  * control message into a native one collected in an MT_CONTROL mbuf,
  * and sends the result through linux_sendit().  Returns 0 or an errno
  * value.
  */
 static int
 linux_sendmsg_common(struct thread *td, l_int s, struct l_msghdr *msghdr,
     l_uint flags)
 {
 	struct cmsghdr *cmsg;
 	struct cmsgcred cmcred;
 	struct mbuf *control;
 	struct msghdr msg;
 	struct l_cmsghdr linux_cmsg;
 	struct l_cmsghdr *ptr_cmsg;
 	struct l_msghdr linux_msg;
 	struct iovec *iov;
 	socklen_t datalen;
 	struct sockaddr *sa;
 	sa_family_t sa_family;
 	void *data;
 	int error;
 
 	error = copyin(msghdr, &linux_msg, sizeof(linux_msg));
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Some Linux applications (ping) define a non-NULL control data
 	 * pointer, but a msg_controllen of 0, which is not allowed in the
 	 * FreeBSD system call interface.  NULL the msg_control pointer in
 	 * order to handle this case.  This should be checked, but allows the
 	 * Linux ping to work.
 	 */
 	if (PTRIN(linux_msg.msg_control) != NULL && linux_msg.msg_controllen == 0)
 		linux_msg.msg_control = PTROUT(NULL);
 
 	error = linux_to_bsd_msghdr(&msg, &linux_msg);
 	if (error != 0)
 		return (error);
 
 	/* Bring the iovec array into the kernel; freed at 'bad' below. */
 #ifdef COMPAT_LINUX32
 	error = linux32_copyiniov(PTRIN(msg.msg_iov), msg.msg_iovlen,
 	    &iov, EMSGSIZE);
 #else
 	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
 #endif
 	if (error != 0)
 		return (error);
 
 	control = NULL;
 	cmsg = NULL;
 
 	if ((ptr_cmsg = LINUX_CMSG_FIRSTHDR(&linux_msg)) != NULL) {
 		/*
 		 * The socket's own address family decides below whether
 		 * control data is forwarded at all.
 		 */
 		error = kern_getsockname(td, s, &sa, &datalen);
 		if (error != 0)
 			goto bad;
 		sa_family = sa->sa_family;
 		free(sa, M_SONAME);
 
 		error = ENOBUFS;
 		/* One scratch native header is reused for every message. */
 		cmsg = malloc(CMSG_HDRSZ, M_LINUX, M_WAITOK|M_ZERO);
 		control = m_get(M_WAITOK, MT_CONTROL);
 
 		do {
 			/* ptr_cmsg is a userspace address; fetch the header. */
 			error = copyin(ptr_cmsg, &linux_cmsg,
 			    sizeof(struct l_cmsghdr));
 			if (error != 0)
 				goto bad;
 
 			error = EINVAL;
 			if (linux_cmsg.cmsg_len < sizeof(struct l_cmsghdr))
 				goto bad;
 
 			/*
 			 * Now we support only SCM_RIGHTS and SCM_CRED,
 			 * so return EINVAL in any other cmsg_type
 			 */
 			cmsg->cmsg_type =
 			    linux_to_bsd_cmsg_type(linux_cmsg.cmsg_type);
 			cmsg->cmsg_level =
 			    linux_to_bsd_sockopt_level(linux_cmsg.cmsg_level);
 			if (cmsg->cmsg_type == -1
 			    || cmsg->cmsg_level != SOL_SOCKET)
 				goto bad;
 
 			/*
 			 * Some applications (e.g. pulseaudio) attempt to
 			 * send ancillary data even if the underlying protocol
 			 * doesn't support it which is not allowed in the
 			 * FreeBSD system call interface.
 			 */
 			if (sa_family != AF_UNIX)
 				continue;
 
 			/*
 			 * NOTE(review): 'data' is derived from the userspace
 			 * pointer ptr_cmsg and, for SCM_RIGHTS, is handed to
 			 * m_append() below without a copyin() -- confirm.
 			 * NOTE(review): the length check above uses
 			 * sizeof(struct l_cmsghdr) while this subtraction
 			 * uses L_CMSG_HDRSZ -- confirm they cannot differ.
 			 */
 			data = LINUX_CMSG_DATA(ptr_cmsg);
 			datalen = linux_cmsg.cmsg_len - L_CMSG_HDRSZ;
 
 			switch (cmsg->cmsg_type)
 			{
 			case SCM_RIGHTS:
 				break;
 
 			case SCM_CREDS:
 				/* The caller's payload is replaced by a zeroed cmsgcred. */
 				data = &cmcred;
 				datalen = sizeof(cmcred);
 
 				/*
 				 * The lower levels will fill in the structure
 				 */
 				bzero(data, datalen);
 				break;
 			}
 
 			cmsg->cmsg_len = CMSG_LEN(datalen);
 
 			/* Append the native header followed by its payload. */
 			error = ENOBUFS;
 			if (!m_append(control, CMSG_HDRSZ, (c_caddr_t)cmsg))
 				goto bad;
 			if (!m_append(control, datalen, (c_caddr_t)data))
 				goto bad;
 		} while ((ptr_cmsg = LINUX_CMSG_NXTHDR(&linux_msg, ptr_cmsg)));
 
 		/* Nothing was appended (e.g. non-AF_UNIX socket): send without control. */
 		if (m_length(control, NULL) == 0) {
 			m_freem(control);
 			control = NULL;
 		}
 	}
 
 	msg.msg_iov = iov;
 	msg.msg_flags = 0;
 	error = linux_sendit(td, s, &msg, flags, control, UIO_USERSPACE);
 	/*
 	 * Presumably the send path took ownership of the control mbuf;
 	 * clear the pointer so it is not freed again below -- TODO confirm.
 	 */
 	control = NULL;
 
 bad:
 	m_freem(control);
 	free(iov, M_IOV);
 	if (cmsg)
 		free(cmsg, M_LINUX);
 	return (error);
 }
 
 /*
  * Linux sendmsg() entry point: a thin wrapper around
  * linux_sendmsg_common().
  */
 int
 linux_sendmsg(struct thread *td, struct linux_sendmsg_args *args)
 {
 	struct l_msghdr *umsg;
 
 	umsg = PTRIN(args->msg);
 	return (linux_sendmsg_common(td, args->s, umsg, args->flags));
 }
 
 /*
  * Linux sendmmsg() entry point.
  *
  * Sends up to args->vlen messages (clamped to UIO_MAXIOV) from the
  * userspace l_mmsghdr array, one linux_sendmsg_common() call each, and
  * stores the byte count of every message in its msg_len member.
  *
  * Returns 0 with the number of messages sent in td->td_retval[0], or
  * the error from the first failing send or copyout().  A vlen of zero
  * succeeds with a count of zero.
  */
 int
 linux_sendmmsg(struct thread *td, struct linux_sendmmsg_args *args)
 {
 	struct l_mmsghdr *msg;
 	l_uint retval;
 	int error, datagrams;
 
 	if (args->vlen > UIO_MAXIOV)
 		args->vlen = UIO_MAXIOV;
 
 	msg = PTRIN(args->msg);
 	datagrams = 0;
 	/* The loop body may never run (vlen == 0); start from success. */
 	error = 0;
 	while (datagrams < args->vlen) {
 		error = linux_sendmsg_common(td, args->s, &msg->msg_hdr,
 		    args->flags);
 		if (error != 0)
 			break;
 
 		/* Report this message's byte count back to userspace. */
 		retval = td->td_retval[0];
 		error = copyout(&retval, &msg->msg_len, sizeof(msg->msg_len));
 		if (error != 0)
 			break;
 		++msg;
 		++datagrams;
 	}
 	if (error == 0)
 		td->td_retval[0] = datagrams;
 	return (error);
 }
 
 static int
 linux_recvmsg_common(struct thread *td, l_int s, struct l_msghdr *msghdr,
     l_uint flags, struct msghdr *msg)
 {
 	struct cmsghdr *cm;
 	struct cmsgcred *cmcred;
 	struct l_cmsghdr *linux_cmsg = NULL;
 	struct l_ucred linux_ucred;
 	socklen_t datalen, outlen;
 	struct l_msghdr linux_msg;
 	struct iovec *iov, *uiov;
 	struct mbuf *control = NULL;
 	struct mbuf **controlp;
 	struct timeval *ftmvl;
 	l_timeval ltmvl;
 	caddr_t outbuf;
 	void *data;
 	int error, i, fd, fds, *fdp;
 
 	error = copyin(msghdr, &linux_msg, sizeof(linux_msg));
 	if (error != 0)
 		return (error);
 
 	error = linux_to_bsd_msghdr(msg, &linux_msg);
 	if (error != 0)
 		return (error);
 
 #ifdef COMPAT_LINUX32
 	error = linux32_copyiniov(PTRIN(msg->msg_iov), msg->msg_iovlen,
 	    &iov, EMSGSIZE);
 #else
 	error = copyiniov(msg->msg_iov, msg->msg_iovlen, &iov, EMSGSIZE);
 #endif
 	if (error != 0)
 		return (error);
 
 	if (msg->msg_name) {
 		error = linux_to_bsd_sockaddr((struct sockaddr *)msg->msg_name,
 		    msg->msg_namelen);
 		if (error != 0)
 			goto bad;
 	}
 
 	uiov = msg->msg_iov;
 	msg->msg_iov = iov;
 	controlp = (msg->msg_control != NULL) ? &control : NULL;
 	error = kern_recvit(td, s, msg, UIO_USERSPACE, controlp);
 	msg->msg_iov = uiov;
 	if (error != 0)
 		goto bad;
 
 	error = bsd_to_linux_msghdr(msg, &linux_msg);
 	if (error != 0)
 		goto bad;
 
 	if (linux_msg.msg_name) {
 		error = bsd_to_linux_sockaddr((struct sockaddr *)
 		    PTRIN(linux_msg.msg_name));
 		if (error != 0)
 			goto bad;
 	}
 	if (linux_msg.msg_name && linux_msg.msg_namelen > 2) {
 		error = linux_sa_put(PTRIN(linux_msg.msg_name));
 		if (error != 0)
 			goto bad;
 	}
 
 	outbuf = PTRIN(linux_msg.msg_control);
 	outlen = 0;
 
 	if (control) {
 		linux_cmsg = malloc(L_CMSG_HDRSZ, M_LINUX, M_WAITOK | M_ZERO);
 
 		msg->msg_control = mtod(control, struct cmsghdr *);
 		msg->msg_controllen = control->m_len;
 
 		cm = CMSG_FIRSTHDR(msg);
 
 		while (cm != NULL) {
 			linux_cmsg->cmsg_type =
 			    bsd_to_linux_cmsg_type(cm->cmsg_type);
 			linux_cmsg->cmsg_level =
 			    bsd_to_linux_sockopt_level(cm->cmsg_level);
 			if (linux_cmsg->cmsg_type == -1
 			    || cm->cmsg_level != SOL_SOCKET)
 			{
 				error = EINVAL;
 				goto bad;
 			}
 
 			data = CMSG_DATA(cm);
 			datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data;
 
 			switch (cm->cmsg_type)
 			{
 			case SCM_RIGHTS:
 				if (flags & LINUX_MSG_CMSG_CLOEXEC) {
 					fds = datalen / sizeof(int);
 					fdp = data;
 					for (i = 0; i < fds; i++) {
 						fd = *fdp++;
 						(void)kern_fcntl(td, fd,
 						    F_SETFD, FD_CLOEXEC);
 					}
 				}
 				break;
 
 			case SCM_CREDS:
 				/*
 				 * Currently LOCAL_CREDS is never in
 				 * effect for Linux so no need to worry
 				 * about sockcred
 				 */
 				if (datalen != sizeof(*cmcred)) {
 					error = EMSGSIZE;
 					goto bad;
 				}
 				cmcred = (struct cmsgcred *)data;
 				bzero(&linux_ucred, sizeof(linux_ucred));
 				linux_ucred.pid = cmcred->cmcred_pid;
 				linux_ucred.uid = cmcred->cmcred_uid;
 				linux_ucred.gid = cmcred->cmcred_gid;
 				data = &linux_ucred;
 				datalen = sizeof(linux_ucred);
 				break;
 
 			case SCM_TIMESTAMP:
 				if (datalen != sizeof(struct timeval)) {
 					error = EMSGSIZE;
 					goto bad;
 				}
 				ftmvl = (struct timeval *)data;
 				ltmvl.tv_sec = ftmvl->tv_sec;
 				ltmvl.tv_usec = ftmvl->tv_usec;
 				data = &ltmvl;
 				datalen = sizeof(ltmvl);
 				break;
 			}
 
 			if (outlen + LINUX_CMSG_LEN(datalen) >
 			    linux_msg.msg_controllen) {
 				if (outlen == 0) {
 					error = EMSGSIZE;
 					goto bad;
 				} else {
 					linux_msg.msg_flags |=
 					    LINUX_MSG_CTRUNC;
 					goto out;
 				}
 			}
 
 			linux_cmsg->cmsg_len = LINUX_CMSG_LEN(datalen);
 
 			error = copyout(linux_cmsg, outbuf, L_CMSG_HDRSZ);
 			if (error != 0)
 				goto bad;
 			outbuf += L_CMSG_HDRSZ;
 
 			error = copyout(data, outbuf, datalen);
 			if (error != 0)
 				goto bad;
 
 			outbuf += LINUX_CMSG_ALIGN(datalen);
 			outlen += LINUX_CMSG_LEN(datalen);
 
 			cm = CMSG_NXTHDR(msg, cm);
 		}
 	}
 
 out:
 	linux_msg.msg_controllen = outlen;
 	error = copyout(&linux_msg, msghdr, sizeof(linux_msg));
 
 bad:
 	free(iov, M_IOV);
 	m_freem(control);
 	free(linux_cmsg, M_LINUX);
 
 	return (error);
 }
 
 int
 linux_recvmsg(struct thread *td, struct linux_recvmsg_args *args)
 {
 	struct msghdr bsd_msg;
 
 	return (linux_recvmsg_common(td, args->s, PTRIN(args->msg),
 	    args->flags, &bsd_msg));
 }
 
 int
 linux_recvmmsg(struct thread *td, struct linux_recvmmsg_args *args)
 {
 	struct l_mmsghdr *msg;
 	struct msghdr bsd_msg;
 	struct l_timespec lts;
 	struct timespec ts, tts;
 	l_uint retval;
 	int error, datagrams;
 
 	if (args->timeout) {
 		error = copyin(args->timeout, &lts, sizeof(struct l_timespec));
 		if (error != 0)
 			return (error);
 		error = linux_to_native_timespec(&ts, &lts);
 		if (error != 0)
 			return (error);
 		getnanotime(&tts);
-		timespecadd(&tts, &ts);
+		timespecadd(&tts, &ts, &tts);
 	}
 
 	msg = PTRIN(args->msg);
 	datagrams = 0;
 	while (datagrams < args->vlen) {
 		error = linux_recvmsg_common(td, args->s, &msg->msg_hdr,
 		    args->flags & ~LINUX_MSG_WAITFORONE, &bsd_msg);
 		if (error != 0)
 			break;
 
 		retval = td->td_retval[0];
 		error = copyout(&retval, &msg->msg_len, sizeof(msg->msg_len));
 		if (error != 0)
 			break;
 		++msg;
 		++datagrams;
 
 		/*
 		 * MSG_WAITFORONE turns on MSG_DONTWAIT after one packet.
 		 */
 		if (args->flags & LINUX_MSG_WAITFORONE)
 			args->flags |= LINUX_MSG_DONTWAIT;
 
 		/*
 		 * See BUGS section of recvmmsg(2).
 		 */
 		if (args->timeout) {
 			getnanotime(&ts);
-			timespecsub(&ts, &tts);
+			timespecsub(&ts, &tts, &ts);
 			if (!timespecisset(&ts) || ts.tv_sec > 0)
 				break;
 		}
 		/* Out of band data, return right away. */
 		if (bsd_msg.msg_flags & MSG_OOB)
 			break;
 	}
 	if (error == 0)
 		td->td_retval[0] = datagrams;
 	return (error);
 }
 
 int
 linux_shutdown(struct thread *td, struct linux_shutdown_args *args)
 {
 
 	return (kern_shutdown(td, args->s, args->how));
 }
 
 int
 linux_setsockopt(struct thread *td, struct linux_setsockopt_args *args)
 {
 	struct setsockopt_args /* {
 		int s;
 		int level;
 		int name;
 		caddr_t val;
 		int valsize;
 	} */ bsd_args;
 	l_timeval linux_tv;
 	struct timeval tv;
 	int error, name;
 
 	bsd_args.s = args->s;
 	bsd_args.level = linux_to_bsd_sockopt_level(args->level);
 	switch (bsd_args.level) {
 	case SOL_SOCKET:
 		name = linux_to_bsd_so_sockopt(args->optname);
 		switch (name) {
 		case SO_RCVTIMEO:
 			/* FALLTHROUGH */
 		case SO_SNDTIMEO:
 			error = copyin(PTRIN(args->optval), &linux_tv,
 			    sizeof(linux_tv));
 			if (error != 0)
 				return (error);
 			tv.tv_sec = linux_tv.tv_sec;
 			tv.tv_usec = linux_tv.tv_usec;
 			return (kern_setsockopt(td, args->s, bsd_args.level,
 			    name, &tv, UIO_SYSSPACE, sizeof(tv)));
 			/* NOTREACHED */
 		default:
 			break;
 		}
 		break;
 	case IPPROTO_IP:
 		name = linux_to_bsd_ip_sockopt(args->optname);
 		break;
 	case IPPROTO_IPV6:
 		name = linux_to_bsd_ip6_sockopt(args->optname);
 		break;
 	case IPPROTO_TCP:
 		name = linux_to_bsd_tcp_sockopt(args->optname);
 		break;
 	default:
 		name = -1;
 		break;
 	}
 	if (name == -1)
 		return (ENOPROTOOPT);
 
 	bsd_args.name = name;
 	bsd_args.val = PTRIN(args->optval);
 	bsd_args.valsize = args->optlen;
 
 	if (name == IPV6_NEXTHOP) {
 		linux_to_bsd_sockaddr((struct sockaddr *)bsd_args.val,
 			bsd_args.valsize);
 		error = sys_setsockopt(td, &bsd_args);
 		bsd_to_linux_sockaddr((struct sockaddr *)bsd_args.val);
 	} else
 		error = sys_setsockopt(td, &bsd_args);
 
 	return (error);
 }
 
 int
 linux_getsockopt(struct thread *td, struct linux_getsockopt_args *args)
 {
 	struct getsockopt_args /* {
 		int s;
 		int level;
 		int name;
 		caddr_t val;
 		int *avalsize;
 	} */ bsd_args;
 	l_timeval linux_tv;
 	struct timeval tv;
 	socklen_t tv_len, xulen, len;
 	struct xucred xu;
 	struct l_ucred lxu;
 	int error, name, newval;
 
 	bsd_args.s = args->s;
 	bsd_args.level = linux_to_bsd_sockopt_level(args->level);
 	switch (bsd_args.level) {
 	case SOL_SOCKET:
 		name = linux_to_bsd_so_sockopt(args->optname);
 		switch (name) {
 		case SO_RCVTIMEO:
 			/* FALLTHROUGH */
 		case SO_SNDTIMEO:
 			tv_len = sizeof(tv);
 			error = kern_getsockopt(td, args->s, bsd_args.level,
 			    name, &tv, UIO_SYSSPACE, &tv_len);
 			if (error != 0)
 				return (error);
 			linux_tv.tv_sec = tv.tv_sec;
 			linux_tv.tv_usec = tv.tv_usec;
 			return (copyout(&linux_tv, PTRIN(args->optval),
 			    sizeof(linux_tv)));
 			/* NOTREACHED */
 		case LOCAL_PEERCRED:
 			if (args->optlen < sizeof(lxu))
 				return (EINVAL);
 			xulen = sizeof(xu);
 			error = kern_getsockopt(td, args->s, bsd_args.level,
 			    name, &xu, UIO_SYSSPACE, &xulen);
 			if (error != 0)
 				return (error);
 			/*
 			 * XXX Use 0 for pid as the FreeBSD does not cache peer pid.
 			 */
 			lxu.pid = 0;
 			lxu.uid = xu.cr_uid;
 			lxu.gid = xu.cr_gid;
 			return (copyout(&lxu, PTRIN(args->optval), sizeof(lxu)));
 			/* NOTREACHED */
 		case SO_ERROR:
 			len = sizeof(newval);
 			error = kern_getsockopt(td, args->s, bsd_args.level,
 			    name, &newval, UIO_SYSSPACE, &len);
 			if (error != 0)
 				return (error);
 			newval = -SV_ABI_ERRNO(td->td_proc, newval);
 			return (copyout(&newval, PTRIN(args->optval), len));
 			/* NOTREACHED */
 		default:
 			break;
 		}
 		break;
 	case IPPROTO_IP:
 		name = linux_to_bsd_ip_sockopt(args->optname);
 		break;
 	case IPPROTO_IPV6:
 		name = linux_to_bsd_ip6_sockopt(args->optname);
 		break;
 	case IPPROTO_TCP:
 		name = linux_to_bsd_tcp_sockopt(args->optname);
 		break;
 	default:
 		name = -1;
 		break;
 	}
 	if (name == -1)
 		return (EINVAL);
 
 	bsd_args.name = name;
 	bsd_args.val = PTRIN(args->optval);
 	bsd_args.avalsize = PTRIN(args->optlen);
 
 	if (name == IPV6_NEXTHOP) {
 		error = sys_getsockopt(td, &bsd_args);
 		bsd_to_linux_sockaddr((struct sockaddr *)bsd_args.val);
 	} else
 		error = sys_getsockopt(td, &bsd_args);
 
 	return (error);
 }
 
 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
 
 /* Argument list sizes for linux_socketcall */
 static const unsigned char lxs_args_cnt[] = {
 	0 /* unused*/,		3 /* socket */,
 	3 /* bind */,		3 /* connect */,
 	2 /* listen */,		3 /* accept */,
 	3 /* getsockname */,	3 /* getpeername */,
 	4 /* socketpair */,	4 /* send */,
 	4 /* recv */,		6 /* sendto */,
 	6 /* recvfrom */,	2 /* shutdown */,
 	5 /* setsockopt */,	5 /* getsockopt */,
 	3 /* sendmsg */,	3 /* recvmsg */,
 	4 /* accept4 */,	5 /* recvmmsg */,
 	4 /* sendmmsg */
 };
 #define	LINUX_ARGS_CNT		(nitems(lxs_args_cnt) - 1)
 #define	LINUX_ARG_SIZE(x)	(lxs_args_cnt[x] * sizeof(l_ulong))
 
 int
 linux_socketcall(struct thread *td, struct linux_socketcall_args *args)
 {
 	l_ulong a[6];
 #if defined(__amd64__) && defined(COMPAT_LINUX32)
 	register_t l_args[6];
 #endif
 	void *arg;
 	int error;
 
 	if (args->what < LINUX_SOCKET || args->what > LINUX_ARGS_CNT)
 		return (EINVAL);
 	error = copyin(PTRIN(args->args), a, LINUX_ARG_SIZE(args->what));
 	if (error != 0)
 		return (error);
 
 #if defined(__amd64__) && defined(COMPAT_LINUX32)
 	for (int i = 0; i < lxs_args_cnt[args->what]; ++i)
 		l_args[i] = a[i];
 	arg = l_args;
 #else
 	arg = a;
 #endif
 	switch (args->what) {
 	case LINUX_SOCKET:
 		return (linux_socket(td, arg));
 	case LINUX_BIND:
 		return (linux_bind(td, arg));
 	case LINUX_CONNECT:
 		return (linux_connect(td, arg));
 	case LINUX_LISTEN:
 		return (linux_listen(td, arg));
 	case LINUX_ACCEPT:
 		return (linux_accept(td, arg));
 	case LINUX_GETSOCKNAME:
 		return (linux_getsockname(td, arg));
 	case LINUX_GETPEERNAME:
 		return (linux_getpeername(td, arg));
 	case LINUX_SOCKETPAIR:
 		return (linux_socketpair(td, arg));
 	case LINUX_SEND:
 		return (linux_send(td, arg));
 	case LINUX_RECV:
 		return (linux_recv(td, arg));
 	case LINUX_SENDTO:
 		return (linux_sendto(td, arg));
 	case LINUX_RECVFROM:
 		return (linux_recvfrom(td, arg));
 	case LINUX_SHUTDOWN:
 		return (linux_shutdown(td, arg));
 	case LINUX_SETSOCKOPT:
 		return (linux_setsockopt(td, arg));
 	case LINUX_GETSOCKOPT:
 		return (linux_getsockopt(td, arg));
 	case LINUX_SENDMSG:
 		return (linux_sendmsg(td, arg));
 	case LINUX_RECVMSG:
 		return (linux_recvmsg(td, arg));
 	case LINUX_ACCEPT4:
 		return (linux_accept4(td, arg));
 	case LINUX_RECVMMSG:
 		return (linux_recvmmsg(td, arg));
 	case LINUX_SENDMMSG:
 		return (linux_sendmmsg(td, arg));
 	}
 
 	uprintf("LINUX: 'socket' typ=%d not implemented\n", args->what);
 	return (ENOSYS);
 }
 #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
Index: head/sys/compat/linuxkpi/common/include/linux/time.h
===================================================================
--- head/sys/compat/linuxkpi/common/include/linux/time.h	(revision 336913)
+++ head/sys/compat/linuxkpi/common/include/linux/time.h	(revision 336914)
@@ -1,137 +1,135 @@
 /*-
  * Copyright (c) 2014-2015 François Tigeot
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 #ifndef _LINUX_TIME_H_
 #define	_LINUX_TIME_H_
 
 #define	NSEC_PER_USEC	1000L
 #define	NSEC_PER_MSEC	1000000L
 #define	NSEC_PER_SEC	1000000000L
 
 #define	USEC_PER_MSEC	1000L
 #define	USEC_PER_SEC	1000000L
 
 #define	timespec64 timespec
 
 #include <sys/time.h>
 #include <sys/stdint.h>
 
 static inline struct timeval
 ns_to_timeval(const int64_t nsec)
 {
 	struct timeval tv;
 	long rem;
 
 	if (nsec == 0) {
 		tv.tv_sec = 0;
 		tv.tv_usec = 0;
 		return (tv);
 	}
 
 	tv.tv_sec = nsec / NSEC_PER_SEC;
 	rem = nsec % NSEC_PER_SEC;
 	if (rem < 0) {
 		tv.tv_sec--;
 		rem += NSEC_PER_SEC;
 	}
 	tv.tv_usec = rem / 1000;
 	return (tv);
 }
 
 static inline int64_t
 timeval_to_ns(const struct timeval *tv)
 {
 	return ((int64_t)tv->tv_sec * NSEC_PER_SEC) +
 		tv->tv_usec * NSEC_PER_USEC;
 }
 
 #define getrawmonotonic(ts)	nanouptime(ts)
 
 static inline struct timespec
 timespec_sub(struct timespec lhs, struct timespec rhs)
 {
 	struct timespec ts;
 
-	ts.tv_sec = lhs.tv_sec;
-	ts.tv_nsec = lhs.tv_nsec;
-	timespecsub(&ts, &rhs);
+	timespecsub(&lhs, &rhs, &ts);
 
 	return ts;
 }
 
 static inline void
 set_normalized_timespec(struct timespec *ts, time_t sec, int64_t nsec)
 {
 	/* XXX: this doesn't actually normalize anything */
 	ts->tv_sec = sec;
 	ts->tv_nsec = nsec;
 }
 
 static inline int64_t
 timespec_to_ns(const struct timespec *ts)
 {
 	return ((ts->tv_sec * NSEC_PER_SEC) + ts->tv_nsec);
 }
 
 static inline struct timespec
 ns_to_timespec(const int64_t nsec)
 {
 	struct timespec ts;
 	int32_t rem;
 
 	if (nsec == 0) {
 		ts.tv_sec = 0;
 		ts.tv_nsec = 0;
 		return (ts);
 	}
 
 	ts.tv_sec = nsec / NSEC_PER_SEC;
 	rem = nsec % NSEC_PER_SEC;
 	if (rem < 0) {
 		ts.tv_sec--;
 		rem += NSEC_PER_SEC;
 	}
 	ts.tv_nsec = rem;
 	return (ts);
 }
 
 static inline int
 timespec_valid(const struct timespec *ts)
 {
 	if (ts->tv_sec < 0 || ts->tv_sec > 100000000 ||
 	    ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000)
 		return (0);
 	return (1);
 }
 
 static inline unsigned long
 get_seconds(void)
 {
 	return time_uptime;
 }
 
 #endif /* _LINUX_TIME_H_ */
Index: head/sys/dev/acpica/acpi_cmbat.c
===================================================================
--- head/sys/dev/acpica/acpi_cmbat.c	(revision 336913)
+++ head/sys/dev/acpica/acpi_cmbat.c	(revision 336914)
@@ -1,507 +1,507 @@
 /*-
  * Copyright (c) 2005 Nate Lawson
  * Copyright (c) 2000 Munehiro Matsuda
  * Copyright (c) 2000 Takanori Watanabe
  * Copyright (c) 2000 Mitsuru IWASAKI <iwasaki@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_acpi.h"
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/bus.h>
 #include <sys/ioccom.h>
 
 #include <machine/bus.h>
 #include <sys/rman.h>
 #include <sys/malloc.h>
 
 #include <contrib/dev/acpica/include/acpi.h>
 
 #include <dev/acpica/acpivar.h>
 #include <dev/acpica/acpiio.h>
 
 static MALLOC_DEFINE(M_ACPICMBAT, "acpicmbat",
     "ACPI control method battery data");
 
 /* Number of times to retry initialization before giving up. */
 #define ACPI_CMBAT_RETRY_MAX	6
 
 /* Check the battery once a minute. */
 #define	CMBAT_POLLRATE		(60 * hz)
 
 /* Hooks for the ACPI CA debugging infrastructure */
 #define	_COMPONENT	ACPI_BATTERY
 ACPI_MODULE_NAME("BATTERY")
 
 #define	ACPI_BATTERY_BST_CHANGE	0x80
 #define	ACPI_BATTERY_BIF_CHANGE	0x81
 
 struct acpi_cmbat_softc {
     device_t	    dev;
     int		    flags;
 
     struct acpi_bif bif;
     struct acpi_bst bst;
     struct timespec bst_lastupdated;
 };
 
 ACPI_SERIAL_DECL(cmbat, "ACPI cmbat");
 
 static int		acpi_cmbat_probe(device_t dev);
 static int		acpi_cmbat_attach(device_t dev);
 static int		acpi_cmbat_detach(device_t dev);
 static int		acpi_cmbat_resume(device_t dev);
 static void		acpi_cmbat_notify_handler(ACPI_HANDLE h, UINT32 notify,
 			    void *context);
 static int		acpi_cmbat_info_expired(struct timespec *lastupdated);
 static void		acpi_cmbat_info_updated(struct timespec *lastupdated);
 static void		acpi_cmbat_get_bst(void *arg);
 static void		acpi_cmbat_get_bif_task(void *arg);
 static void		acpi_cmbat_get_bif(void *arg);
 static int		acpi_cmbat_bst(device_t dev, struct acpi_bst *bstp);
 static int		acpi_cmbat_bif(device_t dev, struct acpi_bif *bifp);
 static void		acpi_cmbat_init_battery(void *arg);
 
 static device_method_t acpi_cmbat_methods[] = {
     /* Device interface */
     DEVMETHOD(device_probe,	acpi_cmbat_probe),
     DEVMETHOD(device_attach,	acpi_cmbat_attach),
     DEVMETHOD(device_detach,	acpi_cmbat_detach),
     DEVMETHOD(device_resume,	acpi_cmbat_resume),
 
     /* ACPI battery interface */
     DEVMETHOD(acpi_batt_get_info, acpi_cmbat_bif),
     DEVMETHOD(acpi_batt_get_status, acpi_cmbat_bst),
 
     DEVMETHOD_END
 };
 
 static driver_t acpi_cmbat_driver = {
     "battery",
     acpi_cmbat_methods,
     sizeof(struct acpi_cmbat_softc),
 };
 
 static devclass_t acpi_cmbat_devclass;
 DRIVER_MODULE(acpi_cmbat, acpi, acpi_cmbat_driver, acpi_cmbat_devclass, 0, 0);
 MODULE_DEPEND(acpi_cmbat, acpi, 1, 1, 1);
 
 static int
 acpi_cmbat_probe(device_t dev)
 {
     static char *cmbat_ids[] = { "PNP0C0A", NULL };
 
     if (acpi_disabled("cmbat") ||
 	ACPI_ID_PROBE(device_get_parent(dev), dev, cmbat_ids) == NULL)
 	return (ENXIO);
 
     device_set_desc(dev, "ACPI Control Method Battery");
     return (0);
 }
 
 static int
 acpi_cmbat_attach(device_t dev)
 {
     int		error;
     ACPI_HANDLE	handle;
     struct acpi_cmbat_softc *sc;
 
     sc = device_get_softc(dev);
     handle = acpi_get_handle(dev);
     sc->dev = dev;
 
     timespecclear(&sc->bst_lastupdated);
 
     error = acpi_battery_register(dev);
     if (error != 0) {
     	device_printf(dev, "registering battery failed\n");
 	return (error);
     }
 
     /*
      * Install a system notify handler in addition to the device notify.
      * Toshiba notebook uses this alternate notify for its battery.
      */
     AcpiInstallNotifyHandler(handle, ACPI_ALL_NOTIFY,
 	acpi_cmbat_notify_handler, dev);
 
     AcpiOsExecute(OSL_NOTIFY_HANDLER, acpi_cmbat_init_battery, dev);
 
     return (0);
 }
 
 static int
 acpi_cmbat_detach(device_t dev)
 {
     ACPI_HANDLE	handle;
 
     handle = acpi_get_handle(dev);
     AcpiRemoveNotifyHandler(handle, ACPI_ALL_NOTIFY, acpi_cmbat_notify_handler);
     acpi_battery_remove(dev);
 
     /*
      * Force any pending notification handler calls to complete by
      * requesting cmbat serialisation while freeing and clearing the
      * softc pointer:
      */
     ACPI_SERIAL_BEGIN(cmbat);
     device_set_softc(dev, NULL);
     ACPI_SERIAL_END(cmbat);
 
     return (0);
 }
 
 static int
 acpi_cmbat_resume(device_t dev)
 {
 
     AcpiOsExecute(OSL_NOTIFY_HANDLER, acpi_cmbat_init_battery, dev);
     return (0);
 }
 
 static void
 acpi_cmbat_notify_handler(ACPI_HANDLE h, UINT32 notify, void *context)
 {
     struct acpi_cmbat_softc *sc;
     device_t dev;
 
     dev = (device_t)context;
     sc = device_get_softc(dev);
 
     switch (notify) {
     case ACPI_NOTIFY_DEVICE_CHECK:
     case ACPI_BATTERY_BST_CHANGE:
 	/*
 	 * Clear the last updated time.  The next call to retrieve the
 	 * battery status will get the new value for us.
 	 */
 	timespecclear(&sc->bst_lastupdated);
 	break;
     case ACPI_NOTIFY_BUS_CHECK:
     case ACPI_BATTERY_BIF_CHANGE:
 	/*
 	 * Queue a callback to get the current battery info from thread
 	 * context.  It's not safe to block in a notify handler.
 	 */
 	AcpiOsExecute(OSL_NOTIFY_HANDLER, acpi_cmbat_get_bif_task, dev);
 	break;
     }
 
     acpi_UserNotify("CMBAT", h, notify);
 }
 
 static int
 acpi_cmbat_info_expired(struct timespec *lastupdated)
 {
     struct timespec	curtime;
 
     ACPI_SERIAL_ASSERT(cmbat);
 
     if (lastupdated == NULL)
 	return (TRUE);
     if (!timespecisset(lastupdated))
 	return (TRUE);
 
     getnanotime(&curtime);
-    timespecsub(&curtime, lastupdated);
+    timespecsub(&curtime, lastupdated, &curtime);
     return (curtime.tv_sec < 0 ||
 	    curtime.tv_sec > acpi_battery_get_info_expire());
 }
 
 static void
 acpi_cmbat_info_updated(struct timespec *lastupdated)
 {
 
     ACPI_SERIAL_ASSERT(cmbat);
 
     if (lastupdated != NULL)
 	getnanotime(lastupdated);
 }
 
 static void
 acpi_cmbat_get_bst(void *arg)
 {
     struct acpi_cmbat_softc *sc;
     ACPI_STATUS	as;
     ACPI_OBJECT	*res;
     ACPI_HANDLE	h;
     ACPI_BUFFER	bst_buffer;
     device_t dev;
 
     ACPI_SERIAL_ASSERT(cmbat);
 
     dev = arg;
     sc = device_get_softc(dev);
     h = acpi_get_handle(dev);
     bst_buffer.Pointer = NULL;
     bst_buffer.Length = ACPI_ALLOCATE_BUFFER;
 
     if (!acpi_cmbat_info_expired(&sc->bst_lastupdated))
 	goto end;
 
     as = AcpiEvaluateObject(h, "_BST", NULL, &bst_buffer);
     if (ACPI_FAILURE(as)) {
 	ACPI_VPRINT(dev, acpi_device_get_parent_softc(dev),
 		    "error fetching current battery status -- %s\n",
 		    AcpiFormatException(as));
 	goto end;
     }
 
     res = (ACPI_OBJECT *)bst_buffer.Pointer;
     if (!ACPI_PKG_VALID(res, 4)) {
 	ACPI_VPRINT(dev, acpi_device_get_parent_softc(dev),
 		    "battery status corrupted\n");
 	goto end;
     }
 
     if (acpi_PkgInt32(res, 0, &sc->bst.state) != 0)
 	goto end;
     if (acpi_PkgInt32(res, 1, &sc->bst.rate) != 0)
 	goto end;
     if (acpi_PkgInt32(res, 2, &sc->bst.cap) != 0)
 	goto end;
     if (acpi_PkgInt32(res, 3, &sc->bst.volt) != 0)
 	goto end;
     acpi_cmbat_info_updated(&sc->bst_lastupdated);
 
     /* Clear out undefined/extended bits that might be set by hardware. */
     sc->bst.state &= ACPI_BATT_STAT_BST_MASK;
     if ((sc->bst.state & ACPI_BATT_STAT_INVALID) == ACPI_BATT_STAT_INVALID)
 	ACPI_VPRINT(dev, acpi_device_get_parent_softc(dev),
 	    "battery reports simultaneous charging and discharging\n");
 
     /* XXX If all batteries are critical, perhaps we should suspend. */
     if (sc->bst.state & ACPI_BATT_STAT_CRITICAL) {
     	if ((sc->flags & ACPI_BATT_STAT_CRITICAL) == 0) {
 	    sc->flags |= ACPI_BATT_STAT_CRITICAL;
 	    device_printf(dev, "critically low charge!\n");
 	}
     } else
 	sc->flags &= ~ACPI_BATT_STAT_CRITICAL;
 
 end:
     if (bst_buffer.Pointer != NULL)
 	AcpiOsFree(bst_buffer.Pointer);
 }
 
 /* XXX There should be a cleaner way to do this locking. */
 static void
 acpi_cmbat_get_bif_task(void *arg)
 {
 
     ACPI_SERIAL_BEGIN(cmbat);
     acpi_cmbat_get_bif(arg);
     ACPI_SERIAL_END(cmbat);
 }
 
 static void
 acpi_cmbat_get_bif(void *arg)
 {
     struct acpi_cmbat_softc *sc;
     ACPI_STATUS	as;
     ACPI_OBJECT	*res;
     ACPI_HANDLE	h;
     ACPI_BUFFER	bif_buffer;
     device_t dev;
 
     ACPI_SERIAL_ASSERT(cmbat);
 
     dev = arg;
     sc = device_get_softc(dev);
     h = acpi_get_handle(dev);
     bif_buffer.Pointer = NULL;
     bif_buffer.Length = ACPI_ALLOCATE_BUFFER;
 
     as = AcpiEvaluateObject(h, "_BIF", NULL, &bif_buffer);
     if (ACPI_FAILURE(as)) {
 	ACPI_VPRINT(dev, acpi_device_get_parent_softc(dev),
 		    "error fetching current battery info -- %s\n",
 		    AcpiFormatException(as));
 	goto end;
     }
 
     res = (ACPI_OBJECT *)bif_buffer.Pointer;
     if (!ACPI_PKG_VALID(res, 13)) {
 	ACPI_VPRINT(dev, acpi_device_get_parent_softc(dev),
 		    "battery info corrupted\n");
 	goto end;
     }
 
     if (acpi_PkgInt32(res, 0, &sc->bif.units) != 0)
 	goto end;
     if (acpi_PkgInt32(res, 1, &sc->bif.dcap) != 0)
 	goto end;
     if (acpi_PkgInt32(res, 2, &sc->bif.lfcap) != 0)
 	goto end;
     if (acpi_PkgInt32(res, 3, &sc->bif.btech) != 0)
 	goto end;
     if (acpi_PkgInt32(res, 4, &sc->bif.dvol) != 0)
 	goto end;
     if (acpi_PkgInt32(res, 5, &sc->bif.wcap) != 0)
 	goto end;
     if (acpi_PkgInt32(res, 6, &sc->bif.lcap) != 0)
 	goto end;
     if (acpi_PkgInt32(res, 7, &sc->bif.gra1) != 0)
 	goto end;
     if (acpi_PkgInt32(res, 8, &sc->bif.gra2) != 0)
 	goto end;
     if (acpi_PkgStr(res,  9, sc->bif.model, ACPI_CMBAT_MAXSTRLEN) != 0)
 	goto end;
     if (acpi_PkgStr(res, 10, sc->bif.serial, ACPI_CMBAT_MAXSTRLEN) != 0)
 	goto end;
     if (acpi_PkgStr(res, 11, sc->bif.type, ACPI_CMBAT_MAXSTRLEN) != 0)
 	goto end;
     if (acpi_PkgStr(res, 12, sc->bif.oeminfo, ACPI_CMBAT_MAXSTRLEN) != 0)
 	goto end;
 
 end:
     if (bif_buffer.Pointer != NULL)
 	AcpiOsFree(bif_buffer.Pointer);
 }
 
 static int
 acpi_cmbat_bif(device_t dev, struct acpi_bif *bifp)
 {
     struct acpi_cmbat_softc *sc;
 
     sc = device_get_softc(dev);
 
     /*
      * Just copy the data.  The only value that should change is the
      * last-full capacity, so we only update when we get a notify that says
      * the info has changed.  Many systems apparently take a long time to
      * process a _BIF call so we avoid it if possible.
      */
     ACPI_SERIAL_BEGIN(cmbat);
     bifp->units = sc->bif.units;
     bifp->dcap = sc->bif.dcap;
     bifp->lfcap = sc->bif.lfcap;
     bifp->btech = sc->bif.btech;
     bifp->dvol = sc->bif.dvol;
     bifp->wcap = sc->bif.wcap;
     bifp->lcap = sc->bif.lcap;
     bifp->gra1 = sc->bif.gra1;
     bifp->gra2 = sc->bif.gra2;
     strncpy(bifp->model, sc->bif.model, sizeof(sc->bif.model));
     strncpy(bifp->serial, sc->bif.serial, sizeof(sc->bif.serial));
     strncpy(bifp->type, sc->bif.type, sizeof(sc->bif.type));
     strncpy(bifp->oeminfo, sc->bif.oeminfo, sizeof(sc->bif.oeminfo));
     ACPI_SERIAL_END(cmbat);
 
     return (0);
 }
 
 static int
 acpi_cmbat_bst(device_t dev, struct acpi_bst *bstp)
 {
     struct acpi_cmbat_softc *sc;
 
     sc = device_get_softc(dev);
 
     ACPI_SERIAL_BEGIN(cmbat);
     if (acpi_BatteryIsPresent(dev)) {
 	acpi_cmbat_get_bst(dev);
 	bstp->state = sc->bst.state;
 	bstp->rate = sc->bst.rate;
 	bstp->cap = sc->bst.cap;
 	bstp->volt = sc->bst.volt;
     } else
 	bstp->state = ACPI_BATT_STAT_NOT_PRESENT;
     ACPI_SERIAL_END(cmbat);
 
     return (0);
 }
 
 static void
 acpi_cmbat_init_battery(void *arg)
 {
     struct acpi_cmbat_softc *sc;
     int		retry, valid;
     device_t	dev;
 
     dev = (device_t)arg;
     ACPI_VPRINT(dev, acpi_device_get_parent_softc(dev),
 		"battery initialization start\n");
 
     /*
      * Try repeatedly to get valid data from the battery.  Since the
      * embedded controller isn't always ready just after boot, we may have
      * to wait a while.
      */
     for (retry = 0; retry < ACPI_CMBAT_RETRY_MAX; retry++, AcpiOsSleep(10000)) {
 	/*
 	 * Batteries on DOCK can be ejected w/ DOCK during retrying.
 	 *
 	 * If there is a valid softc pointer the device may be in
 	 * attaching, attached or detaching state. If the state is
 	 * different from attached retry getting the device state
 	 * until it becomes stable. This solves a race if the ACPI
 	 * notification handler is called during attach, because
 	 * device_is_attached() doesn't return non-zero until after
 	 * the attach code has been executed.
 	 */
 	ACPI_SERIAL_BEGIN(cmbat);
 	sc = device_get_softc(dev);
 	if (sc == NULL) {
 	    ACPI_SERIAL_END(cmbat);
 	    return;
 	}
 
 	if (!acpi_BatteryIsPresent(dev) || !device_is_attached(dev)) {
 	    ACPI_SERIAL_END(cmbat);
 	    continue;
 	}
 
 	/*
 	 * Only query the battery if this is the first try or the specific
 	 * type of info is still invalid.
 	 */
 	if (retry == 0 || !acpi_battery_bst_valid(&sc->bst)) {
 	    timespecclear(&sc->bst_lastupdated);
 	    acpi_cmbat_get_bst(dev);
 	}
 	if (retry == 0 || !acpi_battery_bif_valid(&sc->bif))
 	    acpi_cmbat_get_bif(dev);
 
 	valid = acpi_battery_bst_valid(&sc->bst) &&
 	    acpi_battery_bif_valid(&sc->bif);
 	ACPI_SERIAL_END(cmbat);
 
 	if (valid)
 	    break;
     }
 
     if (retry == ACPI_CMBAT_RETRY_MAX) {
 	ACPI_VPRINT(dev, acpi_device_get_parent_softc(dev),
 		    "battery initialization failed, giving up\n");
     } else {
 	ACPI_VPRINT(dev, acpi_device_get_parent_softc(dev),
 		    "battery initialization done, tried %d times\n", retry + 1);
     }
 }
Index: head/sys/dev/acpica/acpi_smbat.c
===================================================================
--- head/sys/dev/acpica/acpi_smbat.c	(revision 336913)
+++ head/sys/dev/acpica/acpi_smbat.c	(revision 336914)
@@ -1,490 +1,490 @@
 /*-
  * Copyright (c) 2005 Hans Petter Selasky
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_acpi.h"
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/bus.h>
 
 #include <contrib/dev/acpica/include/acpi.h>
 
 #include <dev/acpica/acpivar.h>
 #include <dev/acpica/acpiio.h>
 #include <dev/acpica/acpi_smbus.h>
 
 /* Transactions have failed after 500 ms. */
 #define SMBUS_TIMEOUT	50
 
 struct acpi_smbat_softc {
 	uint8_t		sb_base_addr;
 	device_t	ec_dev;
 
 	struct acpi_bif	bif;
 	struct acpi_bst	bst;
 	struct timespec	bif_lastupdated;
 	struct timespec	bst_lastupdated;
 };
 
 static int	acpi_smbat_probe(device_t dev);
 static int	acpi_smbat_attach(device_t dev);
 static int	acpi_smbat_shutdown(device_t dev);
 static int	acpi_smbat_info_expired(struct timespec *lastupdated);
 static void	acpi_smbat_info_updated(struct timespec *lastupdated);
 static int	acpi_smbat_get_bif(device_t dev, struct acpi_bif *bif);
 static int	acpi_smbat_get_bst(device_t dev, struct acpi_bst *bst);
 
 ACPI_SERIAL_DECL(smbat, "ACPI Smart Battery");
 
 static SYSCTL_NODE(_debug_acpi, OID_AUTO, batt, CTLFLAG_RD, NULL,
     "Battery debugging");
 
 /* On some laptops with smart batteries, enabling battery monitoring
  * software causes keystrokes from atkbd to be lost.  This has also been
  * reported on Linux, and is apparently due to the keyboard and I2C line
  * for the battery being routed through the same chip.  Whether that's
  * accurate or not, adding extra sleeps to the status checking code
  * causes the problem to go away.
  *
  * If you experience that problem, try a value of 10ms and move up
  * from there.
  */
 static int      batt_sleep_ms;
 SYSCTL_INT(_debug_acpi_batt, OID_AUTO, batt_sleep_ms, CTLFLAG_RW, &batt_sleep_ms, 0,
     "Sleep during battery status updates to prevent keystroke loss.");
 
 static device_method_t acpi_smbat_methods[] = {
 	/* device interface */
 	DEVMETHOD(device_probe, acpi_smbat_probe),
 	DEVMETHOD(device_attach, acpi_smbat_attach),
 	DEVMETHOD(device_shutdown, acpi_smbat_shutdown),
 
 	/* ACPI battery interface */
 	DEVMETHOD(acpi_batt_get_status, acpi_smbat_get_bst),
 	DEVMETHOD(acpi_batt_get_info, acpi_smbat_get_bif),
 
 	DEVMETHOD_END
 };
 
 static driver_t	acpi_smbat_driver = {
 	"battery",
 	acpi_smbat_methods,
 	sizeof(struct acpi_smbat_softc),
 };
 
 static devclass_t acpi_smbat_devclass;
 DRIVER_MODULE(acpi_smbat, acpi, acpi_smbat_driver, acpi_smbat_devclass, 0, 0);
 MODULE_DEPEND(acpi_smbat, acpi, 1, 1, 1);
 
 static int
 acpi_smbat_probe(device_t dev)
 {
 	static char *smbat_ids[] = {"ACPI0001", "ACPI0005", NULL};
 	ACPI_STATUS status;
 
 	if (acpi_disabled("smbat") ||
 	    ACPI_ID_PROBE(device_get_parent(dev), dev, smbat_ids) == NULL)
 		return (ENXIO);
 	status = AcpiEvaluateObject(acpi_get_handle(dev), "_EC", NULL, NULL);
 	if (ACPI_FAILURE(status))
 		return (ENXIO);
 
 	device_set_desc(dev, "ACPI Smart Battery");
 	return (0);
 }
 
 static int
 acpi_smbat_attach(device_t dev)
 {
 	struct acpi_smbat_softc *sc;
 	uint32_t base;
 
 	sc = device_get_softc(dev);
 	if (ACPI_FAILURE(acpi_GetInteger(acpi_get_handle(dev), "_EC", &base))) {
 		device_printf(dev, "cannot get EC base address\n");
 		return (ENXIO);
 	}
 	sc->sb_base_addr = (base >> 8) & 0xff;
 
 	/* XXX Only works with one EC, but nearly all systems only have one. */
 	sc->ec_dev = devclass_get_device(devclass_find("acpi_ec"), 0);
 	if (sc->ec_dev == NULL) {
 		device_printf(dev, "cannot find EC device\n");
 		return (ENXIO);
 	}
 
 	timespecclear(&sc->bif_lastupdated);
 	timespecclear(&sc->bst_lastupdated);
 
 	if (acpi_battery_register(dev) != 0) {
 		device_printf(dev, "cannot register battery\n");
 		return (ENXIO);
 	}
 	return (0);
 }
 
 static int
 acpi_smbat_shutdown(device_t dev)
 {
 
 	acpi_battery_remove(dev);
 	return (0);
 }
 
 static int
 acpi_smbat_info_expired(struct timespec *lastupdated)
 {
 	struct timespec	curtime;
 
 	ACPI_SERIAL_ASSERT(smbat);
 
 	if (lastupdated == NULL)
 		return (TRUE);
 	if (!timespecisset(lastupdated))
 		return (TRUE);
 
 	getnanotime(&curtime);
-	timespecsub(&curtime, lastupdated);
+	timespecsub(&curtime, lastupdated, &curtime);
 	return (curtime.tv_sec < 0 ||
 	    curtime.tv_sec > acpi_battery_get_info_expire());
 }
 
 static void
 acpi_smbat_info_updated(struct timespec *lastupdated)
 {
 
 	ACPI_SERIAL_ASSERT(smbat);
 
 	if (lastupdated != NULL)
 		getnanotime(lastupdated);
 }
 
 static int
 acpi_smbus_read_2(struct acpi_smbat_softc *sc, uint8_t addr, uint8_t cmd,
     uint16_t *ptr)
 {
 	int error, to;
 	UINT64 val;
 
 	ACPI_SERIAL_ASSERT(smbat);
 
 	if (batt_sleep_ms)
 	    AcpiOsSleep(batt_sleep_ms);
 
 	val = addr;
 	error = ACPI_EC_WRITE(sc->ec_dev, sc->sb_base_addr + SMBUS_ADDR,
 	    val, 1);
 	if (error)
 		goto out;
 
 	val = cmd;
 	error = ACPI_EC_WRITE(sc->ec_dev, sc->sb_base_addr + SMBUS_CMD,
 	    val, 1);
 	if (error)
 		goto out;
 
 	val = 0x09; /* | 0x80 if PEC */
 	error = ACPI_EC_WRITE(sc->ec_dev, sc->sb_base_addr + SMBUS_PRTCL,
 	    val, 1);
 	if (error)
 		goto out;
 
 	if (batt_sleep_ms)
 	    AcpiOsSleep(batt_sleep_ms);
 
 	for (to = SMBUS_TIMEOUT; to != 0; to--) {
 		error = ACPI_EC_READ(sc->ec_dev, sc->sb_base_addr + SMBUS_PRTCL,
 		    &val, 1);
 		if (error)
 			goto out;
 		if (val == 0)
 			break;
 		AcpiOsSleep(10);
 	}
 	if (to == 0) {
 		error = ETIMEDOUT;
 		goto out;
 	}
 
 	error = ACPI_EC_READ(sc->ec_dev, sc->sb_base_addr + SMBUS_STS, &val, 1);
 	if (error)
 		goto out;
 	if (val & SMBUS_STS_MASK) {
 		printf("%s: AE_ERROR 0x%x\n",
 		       __FUNCTION__, (int)(val & SMBUS_STS_MASK));
 		error = EIO;
 		goto out;
 	}
 
 	error = ACPI_EC_READ(sc->ec_dev, sc->sb_base_addr + SMBUS_DATA,
 	    &val, 2);
 	if (error)
 		goto out;
 
 	*ptr = val;
 
 out:
 	return (error);
 }
 
 static int
 acpi_smbus_read_multi_1(struct acpi_smbat_softc *sc, uint8_t addr, uint8_t cmd,
     uint8_t *ptr, uint16_t len)
 {
 	UINT64 val;
 	uint8_t	to;
 	int error;
 
 	ACPI_SERIAL_ASSERT(smbat);
 
 	if (batt_sleep_ms)
 	    AcpiOsSleep(batt_sleep_ms);
 
 	val = addr;
 	error = ACPI_EC_WRITE(sc->ec_dev, sc->sb_base_addr + SMBUS_ADDR,
 	    val, 1);
 	if (error)
 		goto out;
 
 	val = cmd;
 	error = ACPI_EC_WRITE(sc->ec_dev, sc->sb_base_addr + SMBUS_CMD,
 	    val, 1);
 	if (error)
 		goto out;
 
 	val = 0x0B /* | 0x80 if PEC */ ;
 	error = ACPI_EC_WRITE(sc->ec_dev, sc->sb_base_addr + SMBUS_PRTCL,
 	    val, 1);
 	if (error)
 		goto out;
 
 	if (batt_sleep_ms)
 	    AcpiOsSleep(batt_sleep_ms);
 
 	for (to = SMBUS_TIMEOUT; to != 0; to--) {
 		error = ACPI_EC_READ(sc->ec_dev, sc->sb_base_addr + SMBUS_PRTCL,
 		    &val, 1);
 		if (error)
 			goto out;
 		if (val == 0)
 			break;
 		AcpiOsSleep(10);
 	}
 	if (to == 0) {
 		error = ETIMEDOUT;
 		goto out;
 	}
 
 	error = ACPI_EC_READ(sc->ec_dev, sc->sb_base_addr + SMBUS_STS, &val, 1);
 	if (error)
 		goto out;
 	if (val & SMBUS_STS_MASK) {
 		printf("%s: AE_ERROR 0x%x\n",
 		       __FUNCTION__, (int)(val & SMBUS_STS_MASK));
 		error = EIO;
 		goto out;
 	}
 
 	/* get length */
 	error = ACPI_EC_READ(sc->ec_dev, sc->sb_base_addr + SMBUS_BCNT,
 	    &val, 1);
 	if (error)
 		goto out;
 	val = (val & 0x1f) + 1;
 
 	bzero(ptr, len);
 	if (len > val)
 		len = val;
 
 	if (batt_sleep_ms)
 	    AcpiOsSleep(batt_sleep_ms);
 
 	while (len--) {
 		error = ACPI_EC_READ(sc->ec_dev, sc->sb_base_addr + SMBUS_DATA
 		    + len, &val, 1);
 		if (error)
 			goto out;
 
 		ptr[len] = val;
 		if (batt_sleep_ms)
 		    AcpiOsSleep(batt_sleep_ms);
 	}
 
 out:
 	return (error);
 }
 
 static int
 acpi_smbat_get_bst(device_t dev, struct acpi_bst *bst)
 {
 	struct acpi_smbat_softc *sc;
 	int error;
 	uint32_t factor;
 	int16_t val;
 	uint8_t	addr;
 
 	ACPI_SERIAL_BEGIN(smbat);
 
 	addr = SMBATT_ADDRESS;
 	error = ENXIO;
 	sc = device_get_softc(dev);
 
 	if (!acpi_smbat_info_expired(&sc->bst_lastupdated)) {
 		error = 0;
 		goto out;
 	}
 
 	if (acpi_smbus_read_2(sc, addr, SMBATT_CMD_BATTERY_MODE, &val))
 		goto out;
 	if (val & SMBATT_BM_CAPACITY_MODE)
 		factor = 10;
 	else
 		factor = 1;
 
 	/* get battery status */
 	if (acpi_smbus_read_2(sc, addr, SMBATT_CMD_BATTERY_STATUS, &val))
 		goto out;
 
 	sc->bst.state = 0;
 	if (val & SMBATT_BS_DISCHARGING)
 		sc->bst.state |= ACPI_BATT_STAT_DISCHARG;
 
 	if (val & SMBATT_BS_REMAINING_CAPACITY_ALARM)
 		sc->bst.state |= ACPI_BATT_STAT_CRITICAL;
 
 	/*
 	 * If the rate is negative, it is discharging.  Otherwise,
 	 * it is charging.
 	 */
 	if (acpi_smbus_read_2(sc, addr, SMBATT_CMD_CURRENT, &val))
 		goto out;
 
 	if (val > 0) {
 		sc->bst.rate = val * factor;
 		sc->bst.state &= ~SMBATT_BS_DISCHARGING;
 		sc->bst.state |= ACPI_BATT_STAT_CHARGING;
 	} else if (val < 0)
 		sc->bst.rate = (-val) * factor;
 	else
 		sc->bst.rate = 0;
 
 	if (acpi_smbus_read_2(sc, addr, SMBATT_CMD_REMAINING_CAPACITY, &val))
 		goto out;
 	sc->bst.cap = val * factor;
 
 	if (acpi_smbus_read_2(sc, addr, SMBATT_CMD_VOLTAGE, &val))
 		goto out;
 	sc->bst.volt = val;
 
 	acpi_smbat_info_updated(&sc->bst_lastupdated);
 	error = 0;
 
 out:
 	if (error == 0)
 		memcpy(bst, &sc->bst, sizeof(sc->bst));
 	ACPI_SERIAL_END(smbat);
 	return (error);
 }
 
 static int
 acpi_smbat_get_bif(device_t dev, struct acpi_bif *bif)
 {
 	struct acpi_smbat_softc *sc;
 	int error;
 	uint32_t factor;
 	uint16_t val;
 	uint8_t addr;
 
 	ACPI_SERIAL_BEGIN(smbat);
 
 	addr = SMBATT_ADDRESS;
 	error = ENXIO;
 	sc = device_get_softc(dev);
 
 	if (!acpi_smbat_info_expired(&sc->bif_lastupdated)) {
 		error = 0;
 		goto out;
 	}
 
 	if (acpi_smbus_read_2(sc, addr, SMBATT_CMD_BATTERY_MODE, &val))
 		goto out;
 	if (val & SMBATT_BM_CAPACITY_MODE) {
 		factor = 10;
 		sc->bif.units = ACPI_BIF_UNITS_MW;
 	} else {
 		factor = 1;
 		sc->bif.units = ACPI_BIF_UNITS_MA;
 	}
 
 	if (acpi_smbus_read_2(sc, addr, SMBATT_CMD_DESIGN_CAPACITY, &val))
 		goto out;
 	sc->bif.dcap = val * factor;
 
 	if (acpi_smbus_read_2(sc, addr, SMBATT_CMD_FULL_CHARGE_CAPACITY, &val))
 		goto out;
 	sc->bif.lfcap = val * factor;
 	sc->bif.btech = 1;		/* secondary (rechargeable) */
 
 	if (acpi_smbus_read_2(sc, addr, SMBATT_CMD_DESIGN_VOLTAGE, &val))
 		goto out;
 	sc->bif.dvol = val;
 
 	sc->bif.wcap = sc->bif.dcap / 10;
 	sc->bif.lcap = sc->bif.dcap / 10;
 
 	sc->bif.gra1 = factor;	/* not supported */
 	sc->bif.gra2 = factor;	/* not supported */
 
 	if (acpi_smbus_read_multi_1(sc, addr, SMBATT_CMD_DEVICE_NAME,
 	    sc->bif.model, sizeof(sc->bif.model)))
 		goto out;
 
 	if (acpi_smbus_read_2(sc, addr, SMBATT_CMD_SERIAL_NUMBER, &val))
 		goto out;
 	snprintf(sc->bif.serial, sizeof(sc->bif.serial), "0x%04x", val);
 
 	if (acpi_smbus_read_multi_1(sc, addr, SMBATT_CMD_DEVICE_CHEMISTRY,
 	    sc->bif.type, sizeof(sc->bif.type)))
 		goto out;
 
 	if (acpi_smbus_read_multi_1(sc, addr, SMBATT_CMD_MANUFACTURER_DATA,
 	    sc->bif.oeminfo, sizeof(sc->bif.oeminfo)))
 		goto out;
 
 	/* XXX check if device was replugged during read? */
 
 	acpi_smbat_info_updated(&sc->bif_lastupdated);
 	error = 0;
 
 out:
 	if (error == 0)
 		memcpy(bif, &sc->bif, sizeof(sc->bif));
 	ACPI_SERIAL_END(smbat);
 	return (error);
 }
Index: head/sys/dev/acpica/acpi_thermal.c
===================================================================
--- head/sys/dev/acpica/acpi_thermal.c	(revision 336913)
+++ head/sys/dev/acpica/acpi_thermal.c	(revision 336914)
@@ -1,1225 +1,1225 @@
 /*-
  * Copyright (c) 2000, 2001 Michael Smith
  * Copyright (c) 2000 BSDi
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_acpi.h"
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/bus.h>
 #include <sys/cpu.h>
 #include <sys/kthread.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/proc.h>
 #include <sys/reboot.h>
 #include <sys/sysctl.h>
 #include <sys/unistd.h>
 #include <sys/power.h>
 
 #include "cpufreq_if.h"
 
 #include <contrib/dev/acpica/include/acpi.h>
 #include <contrib/dev/acpica/include/accommon.h>
 
 #include <dev/acpica/acpivar.h>
 
 /* Hooks for the ACPI CA debugging infrastructure */
 #define _COMPONENT	ACPI_THERMAL
 ACPI_MODULE_NAME("THERMAL")
 
 #define TZ_ZEROC	2731
 #define TZ_KELVTOC(x)	(((x) - TZ_ZEROC) / 10), abs(((x) - TZ_ZEROC) % 10)
 
 #define TZ_NOTIFY_TEMPERATURE	0x80 /* Temperature changed. */
 #define TZ_NOTIFY_LEVELS	0x81 /* Cooling levels changed. */
 #define TZ_NOTIFY_DEVICES	0x82 /* Device lists changed. */
 #define TZ_NOTIFY_CRITICAL	0xcc /* Fake notify that _CRT/_HOT reached. */
 
 /* Check for temperature changes every 10 seconds by default */
 #define TZ_POLLRATE	10
 
 /* Make sure the reported temperature is valid for this number of polls. */
 #define TZ_VALIDCHECKS	3
 
 /* Notify the user we will be shutting down in one more poll cycle. */
 #define TZ_NOTIFYCOUNT	(TZ_VALIDCHECKS - 1)
 
 /* ACPI spec defines this */
 #define TZ_NUMLEVELS	10
 struct acpi_tz_zone {
     int		ac[TZ_NUMLEVELS];
     ACPI_BUFFER	al[TZ_NUMLEVELS];
     int		crt;
     int		hot;
     ACPI_BUFFER	psl;
     int		psv;
     int		tc1;
     int		tc2;
     int		tsp;
     int		tzp;
 };
 
 struct acpi_tz_softc {
     device_t			tz_dev;
     ACPI_HANDLE			tz_handle;	/*Thermal zone handle*/
     int				tz_temperature;	/*Current temperature*/
     int				tz_active;	/*Current active cooling*/
 #define TZ_ACTIVE_NONE		-1
 #define TZ_ACTIVE_UNKNOWN	-2
     int				tz_requested;	/*Minimum active cooling*/
     int				tz_thflags;	/*Current temp-related flags*/
 #define TZ_THFLAG_NONE		0
 #define TZ_THFLAG_PSV		(1<<0)
 #define TZ_THFLAG_HOT		(1<<2)
 #define TZ_THFLAG_CRT		(1<<3)
     int				tz_flags;
 #define TZ_FLAG_NO_SCP		(1<<0)		/*No _SCP method*/
 #define TZ_FLAG_GETPROFILE	(1<<1)		/*Get power_profile in timeout*/
 #define TZ_FLAG_GETSETTINGS	(1<<2)		/*Get devs/setpoints*/
     struct timespec		tz_cooling_started;
 					/*Current cooling starting time*/
 
     struct sysctl_ctx_list	tz_sysctl_ctx;
     struct sysctl_oid		*tz_sysctl_tree;
     eventhandler_tag		tz_event;
 
     struct acpi_tz_zone 	tz_zone;	/*Thermal zone parameters*/
     int				tz_validchecks;
     int				tz_insane_tmp_notified;
 
     /* passive cooling */
     struct proc			*tz_cooling_proc;
     int				tz_cooling_proc_running;
     int				tz_cooling_enabled;
     int				tz_cooling_active;
     int				tz_cooling_updated;
     int				tz_cooling_saved_freq;
 };
 
 #define	TZ_ACTIVE_LEVEL(act)	((act) >= 0 ? (act) : TZ_NUMLEVELS)
 
 #define CPUFREQ_MAX_LEVELS	64 /* XXX cpufreq should export this */
 
 static int	acpi_tz_probe(device_t dev);
 static int	acpi_tz_attach(device_t dev);
 static int	acpi_tz_establish(struct acpi_tz_softc *sc);
 static void	acpi_tz_monitor(void *Context);
 static void	acpi_tz_switch_cooler_off(ACPI_OBJECT *obj, void *arg);
 static void	acpi_tz_switch_cooler_on(ACPI_OBJECT *obj, void *arg);
 static void	acpi_tz_getparam(struct acpi_tz_softc *sc, char *node,
 				 int *data);
 static void	acpi_tz_sanity(struct acpi_tz_softc *sc, int *val, char *what);
 static int	acpi_tz_active_sysctl(SYSCTL_HANDLER_ARGS);
 static int	acpi_tz_cooling_sysctl(SYSCTL_HANDLER_ARGS);
 static int	acpi_tz_temp_sysctl(SYSCTL_HANDLER_ARGS);
 static int	acpi_tz_passive_sysctl(SYSCTL_HANDLER_ARGS);
 static void	acpi_tz_notify_handler(ACPI_HANDLE h, UINT32 notify,
 				       void *context);
 static void	acpi_tz_signal(struct acpi_tz_softc *sc, int flags);
 static void	acpi_tz_timeout(struct acpi_tz_softc *sc, int flags);
 static void	acpi_tz_power_profile(void *arg);
 static void	acpi_tz_thread(void *arg);
 static int	acpi_tz_cooling_is_available(struct acpi_tz_softc *sc);
 static int	acpi_tz_cooling_thread_start(struct acpi_tz_softc *sc);
 
 static device_method_t acpi_tz_methods[] = {
     /* Device interface */
     DEVMETHOD(device_probe,	acpi_tz_probe),
     DEVMETHOD(device_attach,	acpi_tz_attach),
 
     DEVMETHOD_END
 };
 
 static driver_t acpi_tz_driver = {
     "acpi_tz",
     acpi_tz_methods,
     sizeof(struct acpi_tz_softc),
 };
 
 static char *acpi_tz_tmp_name = "_TMP";
 
 static devclass_t acpi_tz_devclass;
 DRIVER_MODULE(acpi_tz, acpi, acpi_tz_driver, acpi_tz_devclass, 0, 0);
 MODULE_DEPEND(acpi_tz, acpi, 1, 1, 1);
 
 static struct sysctl_ctx_list	acpi_tz_sysctl_ctx;
 static struct sysctl_oid	*acpi_tz_sysctl_tree;
 
 /* Minimum cooling run time */
 static int			acpi_tz_min_runtime;
 static int			acpi_tz_polling_rate = TZ_POLLRATE;
 static int			acpi_tz_override;
 
 /* Timezone polling thread */
 static struct proc		*acpi_tz_proc;
 ACPI_LOCK_DECL(thermal, "ACPI thermal zone");
 
 static int			acpi_tz_cooling_unit = -1;
 
 static int
 acpi_tz_probe(device_t dev)
 {
     int		result;
 
     if (acpi_get_type(dev) == ACPI_TYPE_THERMAL && !acpi_disabled("thermal")) {
 	device_set_desc(dev, "Thermal Zone");
 	result = -10;
     } else
 	result = ENXIO;
     return (result);
 }
 
 static int
 acpi_tz_attach(device_t dev)
 {
     struct acpi_tz_softc	*sc;
     struct acpi_softc		*acpi_sc;
     int				error;
     char			oidname[8];
 
     ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__);
 
     sc = device_get_softc(dev);
     sc->tz_dev = dev;
     sc->tz_handle = acpi_get_handle(dev);
     sc->tz_requested = TZ_ACTIVE_NONE;
     sc->tz_active = TZ_ACTIVE_UNKNOWN;
     sc->tz_thflags = TZ_THFLAG_NONE;
     sc->tz_cooling_proc = NULL;
     sc->tz_cooling_proc_running = FALSE;
     sc->tz_cooling_active = FALSE;
     sc->tz_cooling_updated = FALSE;
     sc->tz_cooling_enabled = FALSE;
 
     /*
      * Parse the current state of the thermal zone and build control
      * structures.  We don't need to worry about interference with the
      * control thread since we haven't fully attached this device yet.
      */
     if ((error = acpi_tz_establish(sc)) != 0)
 	return (error);
 
     /*
      * Register for any Notify events sent to this zone.
      */
     AcpiInstallNotifyHandler(sc->tz_handle, ACPI_DEVICE_NOTIFY,
 			     acpi_tz_notify_handler, sc);
 
     /*
      * Create our sysctl nodes.
      *
      * XXX we need a mechanism for adding nodes under ACPI.
      */
     if (device_get_unit(dev) == 0) {
 	acpi_sc = acpi_device_get_parent_softc(dev);
 	sysctl_ctx_init(&acpi_tz_sysctl_ctx);
 	acpi_tz_sysctl_tree = SYSCTL_ADD_NODE(&acpi_tz_sysctl_ctx,
 			      SYSCTL_CHILDREN(acpi_sc->acpi_sysctl_tree),
 			      OID_AUTO, "thermal", CTLFLAG_RD, 0, "");
 	SYSCTL_ADD_INT(&acpi_tz_sysctl_ctx,
 		       SYSCTL_CHILDREN(acpi_tz_sysctl_tree),
 		       OID_AUTO, "min_runtime", CTLFLAG_RW,
 		       &acpi_tz_min_runtime, 0,
 		       "minimum cooling run time in sec");
 	SYSCTL_ADD_INT(&acpi_tz_sysctl_ctx,
 		       SYSCTL_CHILDREN(acpi_tz_sysctl_tree),
 		       OID_AUTO, "polling_rate", CTLFLAG_RW,
 		       &acpi_tz_polling_rate, 0, "monitor polling interval in seconds");
 	SYSCTL_ADD_INT(&acpi_tz_sysctl_ctx,
 		       SYSCTL_CHILDREN(acpi_tz_sysctl_tree), OID_AUTO,
 		       "user_override", CTLFLAG_RW, &acpi_tz_override, 0,
 		       "allow override of thermal settings");
     }
     sysctl_ctx_init(&sc->tz_sysctl_ctx);
     sprintf(oidname, "tz%d", device_get_unit(dev));
     sc->tz_sysctl_tree = SYSCTL_ADD_NODE_WITH_LABEL(&sc->tz_sysctl_ctx,
 			 SYSCTL_CHILDREN(acpi_tz_sysctl_tree),
 			 OID_AUTO, oidname, CTLFLAG_RD, 0, "", "thermal_zone");
     SYSCTL_ADD_PROC(&sc->tz_sysctl_ctx, SYSCTL_CHILDREN(sc->tz_sysctl_tree),
 		    OID_AUTO, "temperature", CTLTYPE_INT | CTLFLAG_RD,
 		    &sc->tz_temperature, 0, sysctl_handle_int,
 		    "IK", "current thermal zone temperature");
     SYSCTL_ADD_PROC(&sc->tz_sysctl_ctx, SYSCTL_CHILDREN(sc->tz_sysctl_tree),
 		    OID_AUTO, "active", CTLTYPE_INT | CTLFLAG_RW,
 		    sc, 0, acpi_tz_active_sysctl, "I", "cooling is active");
     SYSCTL_ADD_PROC(&sc->tz_sysctl_ctx, SYSCTL_CHILDREN(sc->tz_sysctl_tree),
 		    OID_AUTO, "passive_cooling", CTLTYPE_INT | CTLFLAG_RW,
 		    sc, 0, acpi_tz_cooling_sysctl, "I",
 		    "enable passive (speed reduction) cooling");
 
     SYSCTL_ADD_INT(&sc->tz_sysctl_ctx, SYSCTL_CHILDREN(sc->tz_sysctl_tree),
 		   OID_AUTO, "thermal_flags", CTLFLAG_RD,
 		   &sc->tz_thflags, 0, "thermal zone flags");
     SYSCTL_ADD_PROC(&sc->tz_sysctl_ctx, SYSCTL_CHILDREN(sc->tz_sysctl_tree),
 		    OID_AUTO, "_PSV", CTLTYPE_INT | CTLFLAG_RW,
 		    sc, offsetof(struct acpi_tz_softc, tz_zone.psv),
 		    acpi_tz_temp_sysctl, "IK", "passive cooling temp setpoint");
     SYSCTL_ADD_PROC(&sc->tz_sysctl_ctx, SYSCTL_CHILDREN(sc->tz_sysctl_tree),
 		    OID_AUTO, "_HOT", CTLTYPE_INT | CTLFLAG_RW,
 		    sc, offsetof(struct acpi_tz_softc, tz_zone.hot),
 		    acpi_tz_temp_sysctl, "IK",
 		    "too hot temp setpoint (suspend now)");
     SYSCTL_ADD_PROC(&sc->tz_sysctl_ctx, SYSCTL_CHILDREN(sc->tz_sysctl_tree),
 		    OID_AUTO, "_CRT", CTLTYPE_INT | CTLFLAG_RW,
 		    sc, offsetof(struct acpi_tz_softc, tz_zone.crt),
 		    acpi_tz_temp_sysctl, "IK",
 		    "critical temp setpoint (shutdown now)");
     SYSCTL_ADD_PROC(&sc->tz_sysctl_ctx, SYSCTL_CHILDREN(sc->tz_sysctl_tree),
 		    OID_AUTO, "_ACx", CTLTYPE_INT | CTLFLAG_RD,
 		    &sc->tz_zone.ac, sizeof(sc->tz_zone.ac),
 		    sysctl_handle_opaque, "IK", "");
     SYSCTL_ADD_PROC(&sc->tz_sysctl_ctx, SYSCTL_CHILDREN(sc->tz_sysctl_tree),
 		    OID_AUTO, "_TC1", CTLTYPE_INT | CTLFLAG_RW,
 		    sc, offsetof(struct acpi_tz_softc, tz_zone.tc1),
 		    acpi_tz_passive_sysctl, "I",
 		    "thermal constant 1 for passive cooling");
     SYSCTL_ADD_PROC(&sc->tz_sysctl_ctx, SYSCTL_CHILDREN(sc->tz_sysctl_tree),
 		    OID_AUTO, "_TC2", CTLTYPE_INT | CTLFLAG_RW,
 		    sc, offsetof(struct acpi_tz_softc, tz_zone.tc2),
 		    acpi_tz_passive_sysctl, "I",
 		    "thermal constant 2 for passive cooling");
     SYSCTL_ADD_PROC(&sc->tz_sysctl_ctx, SYSCTL_CHILDREN(sc->tz_sysctl_tree),
 		    OID_AUTO, "_TSP", CTLTYPE_INT | CTLFLAG_RW,
 		    sc, offsetof(struct acpi_tz_softc, tz_zone.tsp),
 		    acpi_tz_passive_sysctl, "I",
 		    "thermal sampling period for passive cooling");
 
     /*
      * Register our power profile event handler.
      */
     sc->tz_event = EVENTHANDLER_REGISTER(power_profile_change,
 	acpi_tz_power_profile, sc, 0);
 
     /*
      * Flag the event handler for a manual invocation by our timeout.
      * We defer it like this so that the rest of the subsystem has time
      * to come up.  Don't bother evaluating/printing the temperature at
      * this point; on many systems it'll be bogus until the EC is running.
      */
     sc->tz_flags |= TZ_FLAG_GETPROFILE;
 
     return_VALUE (0);
 }
 
 static void
 acpi_tz_startup(void *arg __unused)
 {
     struct acpi_tz_softc *sc;
     device_t *devs;
     int devcount, error, i;
 
     devclass_get_devices(acpi_tz_devclass, &devs, &devcount);
     if (devcount == 0) {
 	free(devs, M_TEMP);
 	return;
     }
 
     /*
      * Create thread to service all of the thermal zones.
      */
     error = kproc_create(acpi_tz_thread, NULL, &acpi_tz_proc, RFHIGHPID, 0,
 	"acpi_thermal");
     if (error != 0)
 	printf("acpi_tz: could not create thread - %d", error);
 
     /*
      * Create a thread to handle passive cooling for 1st zone which
      * has _PSV, _TSP, _TC1 and _TC2.  Users can enable it for other
      * zones manually for now.
      *
      * XXX We enable only one zone to avoid multiple zones conflict
      * with each other since cpufreq currently sets all CPUs to the
      * given frequency whereas it's possible for different thermal
      * zones to specify independent settings for multiple CPUs.
      */
     for (i = 0; i < devcount; i++) {
 	sc = device_get_softc(devs[i]);
 	if (acpi_tz_cooling_is_available(sc)) {
 	    sc->tz_cooling_enabled = TRUE;
 	    error = acpi_tz_cooling_thread_start(sc);
 	    if (error != 0) {
 		sc->tz_cooling_enabled = FALSE;
 		break;
 	    }
 	    acpi_tz_cooling_unit = device_get_unit(devs[i]);
 	    break;
 	}
     }
     free(devs, M_TEMP);
 }
 SYSINIT(acpi_tz, SI_SUB_KICK_SCHEDULER, SI_ORDER_ANY, acpi_tz_startup, NULL);
 
 /*
  * Parse the current state of this thermal zone and set up to use it.
  *
  * Note that we may have previous state, which will have to be discarded.
  */
 static int
 acpi_tz_establish(struct acpi_tz_softc *sc)
 {
     ACPI_OBJECT	*obj;
     int		i;
     char	nbuf[8];
 
     ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__);
 
     /* Erase any existing state. */
     for (i = 0; i < TZ_NUMLEVELS; i++)
 	if (sc->tz_zone.al[i].Pointer != NULL)
 	    AcpiOsFree(sc->tz_zone.al[i].Pointer);
     if (sc->tz_zone.psl.Pointer != NULL)
 	AcpiOsFree(sc->tz_zone.psl.Pointer);
 
     /*
      * XXX: We initialize only ACPI_BUFFER to avoid race condition
      * with passive cooling thread which refers psv, tc1, tc2 and tsp.
      */
     bzero(sc->tz_zone.ac, sizeof(sc->tz_zone.ac));
     bzero(sc->tz_zone.al, sizeof(sc->tz_zone.al));
     bzero(&sc->tz_zone.psl, sizeof(sc->tz_zone.psl));
 
     /* Evaluate thermal zone parameters. */
     for (i = 0; i < TZ_NUMLEVELS; i++) {
 	sprintf(nbuf, "_AC%d", i);
 	acpi_tz_getparam(sc, nbuf, &sc->tz_zone.ac[i]);
 	sprintf(nbuf, "_AL%d", i);
 	sc->tz_zone.al[i].Length = ACPI_ALLOCATE_BUFFER;
 	sc->tz_zone.al[i].Pointer = NULL;
 	AcpiEvaluateObject(sc->tz_handle, nbuf, NULL, &sc->tz_zone.al[i]);
 	obj = (ACPI_OBJECT *)sc->tz_zone.al[i].Pointer;
 	if (obj != NULL) {
 	    /* Should be a package containing a list of power objects */
 	    if (obj->Type != ACPI_TYPE_PACKAGE) {
 		device_printf(sc->tz_dev, "%s has unknown type %d, rejecting\n",
 			      nbuf, obj->Type);
 		return_VALUE (ENXIO);
 	    }
 	}
     }
     acpi_tz_getparam(sc, "_CRT", &sc->tz_zone.crt);
     acpi_tz_getparam(sc, "_HOT", &sc->tz_zone.hot);
     sc->tz_zone.psl.Length = ACPI_ALLOCATE_BUFFER;
     sc->tz_zone.psl.Pointer = NULL;
     AcpiEvaluateObject(sc->tz_handle, "_PSL", NULL, &sc->tz_zone.psl);
     acpi_tz_getparam(sc, "_PSV", &sc->tz_zone.psv);
     acpi_tz_getparam(sc, "_TC1", &sc->tz_zone.tc1);
     acpi_tz_getparam(sc, "_TC2", &sc->tz_zone.tc2);
     acpi_tz_getparam(sc, "_TSP", &sc->tz_zone.tsp);
     acpi_tz_getparam(sc, "_TZP", &sc->tz_zone.tzp);
 
     /*
      * Sanity-check the values we've been given.
      *
      * XXX what do we do about systems that give us the same value for
      *     more than one of these setpoints?
      */
     acpi_tz_sanity(sc, &sc->tz_zone.crt, "_CRT");
     acpi_tz_sanity(sc, &sc->tz_zone.hot, "_HOT");
     acpi_tz_sanity(sc, &sc->tz_zone.psv, "_PSV");
     for (i = 0; i < TZ_NUMLEVELS; i++)
 	acpi_tz_sanity(sc, &sc->tz_zone.ac[i], "_ACx");
 
     return_VALUE (0);
 }
 
 static char *aclevel_string[] = {
     "NONE", "_AC0", "_AC1", "_AC2", "_AC3", "_AC4",
     "_AC5", "_AC6", "_AC7", "_AC8", "_AC9"
 };
 
 static __inline const char *
 acpi_tz_aclevel_string(int active)
 {
     if (active < -1 || active >= TZ_NUMLEVELS)
 	return (aclevel_string[0]);
 
     return (aclevel_string[active + 1]);
 }
 
 /*
  * Get the current temperature.
  */
 static int
 acpi_tz_get_temperature(struct acpi_tz_softc *sc)
 {
     int		temp;
     ACPI_STATUS	status;
 
     ACPI_FUNCTION_NAME ("acpi_tz_get_temperature");
 
     /* Evaluate the thermal zone's _TMP method. */
     status = acpi_GetInteger(sc->tz_handle, acpi_tz_tmp_name, &temp);
     if (ACPI_FAILURE(status)) {
 	ACPI_VPRINT(sc->tz_dev, acpi_device_get_parent_softc(sc->tz_dev),
 	    "error fetching current temperature -- %s\n",
 	     AcpiFormatException(status));
 	return (FALSE);
     }
 
     /* Check it for validity. */
     acpi_tz_sanity(sc, &temp, acpi_tz_tmp_name);
     if (temp == -1)
 	return (FALSE);
 
     ACPI_DEBUG_PRINT((ACPI_DB_VALUES, "got %d.%dC\n", TZ_KELVTOC(temp)));
     sc->tz_temperature = temp;
     return (TRUE);
 }
 
 /*
  * Evaluate the condition of a thermal zone, take appropriate actions.
  *
  * Context is the zone's softc (struct acpi_tz_softc *).  One pass reads the
  * current temperature, selects the active cooling level to use, switches
  * cooling devices to match, recomputes the PSV/HOT/CRT flags and counts
  * consecutive polls at or above _HOT/_CRT, requesting a power-off once
  * TZ_VALIDCHECKS such polls have been seen.
  */
 static void
 acpi_tz_monitor(void *Context)
 {
     struct acpi_tz_softc *sc;
     struct	timespec curtime;
     int		temp;
     int		i;
     int		newactive, newflags;
 
     ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__);
 
     sc = (struct acpi_tz_softc *)Context;
 
     /* Get the current temperature. */
     if (!acpi_tz_get_temperature(sc)) {
 	/* XXX disable zone? go to max cooling? */
 	return_VOID;
     }
     temp = sc->tz_temperature;
 
     /*
      * Work out what we ought to be doing right now.
      *
      * Note that the _ACx levels sort from hot to cold.
      *
      * The scan runs from the highest index down to 0 and overwrites
      * newactive on every match, so the lowest matching index wins.
      * Levels whose setpoint is -1 are skipped.
      */
     newactive = TZ_ACTIVE_NONE;
     for (i = TZ_NUMLEVELS - 1; i >= 0; i--) {
 	if (sc->tz_zone.ac[i] != -1 && temp >= sc->tz_zone.ac[i])
 	    newactive = i;
     }
 
     /*
      * We are going to get _ACx level down (colder side), but give a guaranteed
      * minimum cooling run time if requested.
      */
     if (acpi_tz_min_runtime > 0 && sc->tz_active != TZ_ACTIVE_NONE &&
 	sc->tz_active != TZ_ACTIVE_UNKNOWN &&
 	(newactive == TZ_ACTIVE_NONE || newactive > sc->tz_active)) {
 
 	/*
 	 * curtime is turned into the time elapsed since tz_cooling_started,
 	 * which is stamped below whenever the active level changes.
 	 */
 	getnanotime(&curtime);
-	timespecsub(&curtime, &sc->tz_cooling_started);
+	timespecsub(&curtime, &sc->tz_cooling_started, &curtime);
 	/* Not run long enough yet: stay at the current level. */
 	if (curtime.tv_sec < acpi_tz_min_runtime)
 	    newactive = sc->tz_active;
     }
 
     /* Handle user override of active mode */
     if (sc->tz_requested != TZ_ACTIVE_NONE && (newactive == TZ_ACTIVE_NONE
         || sc->tz_requested < newactive))
 	newactive = sc->tz_requested;
 
     /* update temperature-related flags */
     newflags = TZ_THFLAG_NONE;
     if (sc->tz_zone.psv != -1 && temp >= sc->tz_zone.psv)
 	newflags |= TZ_THFLAG_PSV;
     if (sc->tz_zone.hot != -1 && temp >= sc->tz_zone.hot)
 	newflags |= TZ_THFLAG_HOT;
     if (sc->tz_zone.crt != -1 && temp >= sc->tz_zone.crt)
 	newflags |= TZ_THFLAG_CRT;
 
     /* If the active cooling state has changed, we have to switch things. */
     if (sc->tz_active == TZ_ACTIVE_UNKNOWN) {
 	/*
 	 * We don't know which cooling device is on or off,
 	 * so stop them all, because we now know which
 	 * should be on (if any).
 	 */
 	for (i = 0; i < TZ_NUMLEVELS; i++) {
 	    if (sc->tz_zone.al[i].Pointer != NULL) {
 		acpi_ForeachPackageObject(
 		    (ACPI_OBJECT *)sc->tz_zone.al[i].Pointer,
 		    acpi_tz_switch_cooler_off, sc);
 	    }
 	}
 	/* now we know that all devices are off */
 	sc->tz_active = TZ_ACTIVE_NONE;
     }
 
     if (newactive != sc->tz_active) {
 	/* Turn off unneeded cooling devices that are on, if any are */
 	for (i = TZ_ACTIVE_LEVEL(sc->tz_active);
 	     i < TZ_ACTIVE_LEVEL(newactive); i++) {
 	    acpi_ForeachPackageObject(
 		(ACPI_OBJECT *)sc->tz_zone.al[i].Pointer,
 		acpi_tz_switch_cooler_off, sc);
 	}
 	/* Turn on cooling devices that are required, if any are */
 	for (i = TZ_ACTIVE_LEVEL(sc->tz_active) - 1;
 	     i >= TZ_ACTIVE_LEVEL(newactive); i--) {
 	    acpi_ForeachPackageObject(
 		(ACPI_OBJECT *)sc->tz_zone.al[i].Pointer,
 		acpi_tz_switch_cooler_on, sc);
 	}
 
 	ACPI_VPRINT(sc->tz_dev, acpi_device_get_parent_softc(sc->tz_dev),
 		    "switched from %s to %s: %d.%dC\n",
 		    acpi_tz_aclevel_string(sc->tz_active),
 		    acpi_tz_aclevel_string(newactive), TZ_KELVTOC(temp));
 	sc->tz_active = newactive;
 	/* Remember when this level started, for the minimum-runtime check. */
 	getnanotime(&sc->tz_cooling_started);
     }
 
     /* XXX (de)activate any passive cooling that may be required. */
 
     /*
      * If the temperature is at _HOT or _CRT, increment our event count.
      * If it has occurred enough times, shutdown the system.  This is
      * needed because some systems will report an invalid high temperature
      * for one poll cycle.  It is suspected this is due to the embedded
      * controller timing out.  A typical value is 138C for one cycle on
      * a system that is otherwise 65C.
      *
      * If we're almost at that threshold, notify the user through devd(8).
      */
     if ((newflags & (TZ_THFLAG_HOT | TZ_THFLAG_CRT)) != 0) {
 	sc->tz_validchecks++;
 	if (sc->tz_validchecks == TZ_VALIDCHECKS) {
 	    device_printf(sc->tz_dev,
 		"WARNING - current temperature (%d.%dC) exceeds safe limits\n",
 		TZ_KELVTOC(sc->tz_temperature));
 	    shutdown_nice(RB_POWEROFF);
 	} else if (sc->tz_validchecks == TZ_NOTIFYCOUNT)
 	    acpi_UserNotify("Thermal", sc->tz_handle, TZ_NOTIFY_CRITICAL);
     } else {
 	/* A poll below both limits resets the consecutive-event counter. */
 	sc->tz_validchecks = 0;
     }
     sc->tz_thflags = newflags;
 
     return_VOID;
 }
 
 /*
  * Per-object callback: resolve obj to a device handle and, if that works,
  * ask the power code to put the device into D3.  arg is not used here.
  */
 static void
 acpi_tz_switch_cooler_off(ACPI_OBJECT *obj, void *arg)
 {
     ACPI_HANDLE			handle;
 
     ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__);
 
     handle = acpi_GetReference(NULL, obj);
     if (handle != NULL) {
 	ACPI_DEBUG_PRINT((ACPI_DB_OBJECTS, "called to turn %s off\n",
 			 acpi_name(handle)));
 	acpi_pwr_switch_consumer(handle, ACPI_STATE_D3);
     } else {
 	/* Nothing to switch when the reference cannot be resolved. */
 	ACPI_DEBUG_PRINT((ACPI_DB_OBJECTS, "can't get handle\n"));
     }
 
     return_VOID;
 }
 
 /*
  * Per-object callback: resolve obj to a device handle and, if that works,
  * ask the power code to put the device into D0.  arg is the zone softc and
  * is only used to report a failed switch.
  *
  * XXX replication of off/on function code is bad.
  */
 static void
 acpi_tz_switch_cooler_on(ACPI_OBJECT *obj, void *arg)
 {
     struct acpi_tz_softc	*zone = (struct acpi_tz_softc *)arg;
     ACPI_HANDLE			handle;
     ACPI_STATUS			rc;
 
     ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__);
 
     handle = acpi_GetReference(NULL, obj);
     if (handle != NULL) {
 	ACPI_DEBUG_PRINT((ACPI_DB_OBJECTS, "called to turn %s on\n",
 			 acpi_name(handle)));
 	rc = acpi_pwr_switch_consumer(handle, ACPI_STATE_D0);
 	if (ACPI_FAILURE(rc)) {
 	    ACPI_VPRINT(zone->tz_dev,
 			acpi_device_get_parent_softc(zone->tz_dev),
 			"failed to activate %s - %s\n", acpi_name(handle),
 			AcpiFormatException(rc));
 	}
     } else {
 	/* Nothing to switch when the reference cannot be resolved. */
 	ACPI_DEBUG_PRINT((ACPI_DB_OBJECTS, "can't get handle\n"));
     }
 
     return_VOID;
 }
 
 /*
  * Fetch the integer named by node from the zone's handle into *data.
  * On failure *data is set to -1; on success the value is debug-printed.
  */
 static void
 acpi_tz_getparam(struct acpi_tz_softc *sc, char *node, int *data)
 {
 
     ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__);
 
     if (!ACPI_FAILURE(acpi_GetInteger(sc->tz_handle, node, data))) {
 	ACPI_DEBUG_PRINT((ACPI_DB_VALUES, "%s.%s = %d\n",
 			 acpi_name(sc->tz_handle), node, *data));
 	return_VOID;
     }
 
     /* The lookup failed: fall back to the "not available" marker. */
     *data = -1;
 
     return_VOID;
 }
 
 /*
  * Range-check a temperature value.  Anything other than -1 must lie
  * between TZ_ZEROC and TZ_ZEROC + 2000 (0C to 200C); a value outside
  * that window is reported and replaced with -1.
  *
  * For _TMP (what == acpi_tz_tmp_name) the warning is printed only once per
  * run of bad readings, tracked in sc->tz_insane_tmp_notified; every other
  * value warns each time.
  */
 static void
 acpi_tz_sanity(struct acpi_tz_softc *sc, int *val, char *what)
 {
     int is_tmp, absurd;
 
     is_tmp = (what == acpi_tz_tmp_name);
     absurd = (*val != -1 && (*val < TZ_ZEROC || *val > TZ_ZEROC + 2000));
 
     if (!absurd) {
 	/* A usable _TMP reading re-arms the one-shot warning. */
 	if (is_tmp)
 	    sc->tz_insane_tmp_notified = 0;
 	return;
     }
 
     if (!is_tmp || !sc->tz_insane_tmp_notified) {
 	device_printf(sc->tz_dev, "%s value is absurd, ignored (%d.%dC)\n",
 		      what, TZ_KELVTOC(*val));
 
 	/* Stay quiet about _TMP until a sane reading shows up again. */
 	if (is_tmp)
 	    sc->tz_insane_tmp_notified = 1;
     }
     *val = -1;
 }
 
 /*
  * Sysctl handler for the active state node.  Reads report sc->tz_active;
  * writes accept a level from -1 up to TZ_NUMLEVELS - 1, store it in
  * sc->tz_requested and wake the monitor.  Returns EINVAL for other values.
  */
 static int
 acpi_tz_active_sysctl(SYSCTL_HANDLER_ARGS)
 {
     struct acpi_tz_softc	*sc;
     int				level;
     int				rc;
 
     sc = (struct acpi_tz_softc *)oidp->oid_arg1;
     level = sc->tz_active;
     rc = sysctl_handle_int(oidp, &level, 0, req);
 
     /* Handler failed. */
     if (rc != 0)
 	return (rc);
     /* Pure read: nothing more to do. */
     if (req->newptr == NULL)
 	return (0);
 
     if (!(level >= -1 && level < TZ_NUMLEVELS))
 	return (EINVAL);
 
     /* Record the preferred level and have the zone re-evaluated. */
     sc->tz_requested = level;
     acpi_tz_signal(sc, 0);
     return (0);
 }
 
 /*
  * Sysctl handler for the passive-cooling enable switch.  Only TRUE and
  * FALSE are accepted.  Enabling starts the cooling thread when the zone has
  * the needed parameters, otherwise fails with ENODEV; on any enable failure
  * the stored setting becomes FALSE.
  */
 static int
 acpi_tz_cooling_sysctl(SYSCTL_HANDLER_ARGS)
 {
     struct acpi_tz_softc *sc;
     int want, rc;
 
     sc = (struct acpi_tz_softc *)oidp->oid_arg1;
     want = sc->tz_cooling_enabled;
     rc = sysctl_handle_int(oidp, &want, 0, req);
 
     /* Error or no new value */
     if (rc != 0 || req->newptr == NULL)
 	return (rc);
     if (!(want == TRUE || want == FALSE))
 	return (EINVAL);
 
     if (want) {
 	rc = acpi_tz_cooling_is_available(sc) ?
 	    acpi_tz_cooling_thread_start(sc) : ENODEV;
 	if (rc != 0)
 	    want = FALSE;
     }
     sc->tz_cooling_enabled = want;
     return (rc);
 }
 
 /*
  * Sysctl handler for a temperature field of the softc.  oid_arg2 is the
  * byte offset of the int inside the softc.  Writes need acpi_tz_override
  * to be set (EPERM otherwise) and must pass acpi_tz_sanity() (EINVAL
  * otherwise).
  */
 static int
 acpi_tz_temp_sysctl(SYSCTL_HANDLER_ARGS)
 {
     struct acpi_tz_softc	*sc;
     int				*field;
     int				value;
     int				rc;
 
     sc = oidp->oid_arg1;
     field = (int *)(void *)(uintptr_t)((uintptr_t)sc + oidp->oid_arg2);
     value = *field;
     rc = sysctl_handle_int(oidp, &value, 0, req);
 
     /* Handler failed. */
     if (rc != 0)
 	return (rc);
     /* Pure read: nothing more to do. */
     if (req->newptr == NULL)
 	return (0);
 
     /* Settings are read-only unless the override knob is set. */
     if (!acpi_tz_override)
 	return (EPERM);
 
     /* Reject values the sanity check turns into -1. */
     acpi_tz_sanity(sc, &value, "user-supplied temp");
     if (value == -1)
 	return (EINVAL);
 
     *field = value;
     return (0);
 }
 
 /*
  * Sysctl handler for an int field of the softc located oid_arg2 bytes into
  * it.  Writes are stored unchecked, but only when acpi_tz_override is set
  * (EPERM otherwise).
  */
 static int
 acpi_tz_passive_sysctl(SYSCTL_HANDLER_ARGS)
 {
     struct acpi_tz_softc	*sc;
     int				*field;
     int				value;
     int				rc;
 
     sc = oidp->oid_arg1;
     field = (int *)(void *)(uintptr_t)((uintptr_t)sc + oidp->oid_arg2);
     value = *field;
     rc = sysctl_handle_int(oidp, &value, 0, req);
 
     /* Handler failed. */
     if (rc != 0)
 	return (rc);
     /* Pure read: nothing more to do. */
     if (req->newptr == NULL)
 	return (0);
 
     /* Settings are read-only unless the override knob is set. */
     if (!acpi_tz_override)
 	return (EPERM);
 
     *field = value;
     return (0);
 }
 
 /*
  * Notify handler for a thermal zone.  A temperature notify wakes the
  * monitor; a devices or levels notify wakes it with a request to re-read
  * the zone settings; anything else is logged.  Every notify is also
  * forwarded through acpi_UserNotify().
  */
 static void
 acpi_tz_notify_handler(ACPI_HANDLE h, UINT32 notify, void *context)
 {
     struct acpi_tz_softc	*zone = (struct acpi_tz_softc *)context;
 
     ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__);
 
     if (notify == TZ_NOTIFY_TEMPERATURE) {
 	/* Temperature change occurred */
 	acpi_tz_signal(zone, 0);
     } else if (notify == TZ_NOTIFY_DEVICES || notify == TZ_NOTIFY_LEVELS) {
 	/* Zone devices/setpoints changed */
 	acpi_tz_signal(zone, TZ_FLAG_GETSETTINGS);
     } else {
 	ACPI_VPRINT(zone->tz_dev, acpi_device_get_parent_softc(zone->tz_dev),
 		    "unknown Notify event 0x%x\n", notify);
     }
 
     acpi_UserNotify("Thermal", h, notify);
 
     return_VOID;
 }
 
 /*
  * Merge flags into the zone's pending-request flags under the thermal lock,
  * then wake whoever sleeps on acpi_tz_proc.
  */
 static void
 acpi_tz_signal(struct acpi_tz_softc *sc, int flags)
 {
 
     ACPI_LOCK(thermal);
     sc->tz_flags = sc->tz_flags | flags;
     ACPI_UNLOCK(thermal);
 
     wakeup(&acpi_tz_proc);
 }
 
 /*
  * Service one zone for the monitor thread.
  *
  * Notifies can arrive asynchronously, but have also been seen as a side
  * effect of other thermal methods: one system sends 0x81 when the fan is
  * switched, another when _SCP is called.  Because of that the zone is
  * always checked with acpi_tz_monitor() first, before any setpoint or
  * cooling-policy change is evaluated.
  */
 static void
 acpi_tz_timeout(struct acpi_tz_softc *sc, int flags)
 {
     int want_profile, want_settings;
 
     want_profile = (flags & TZ_FLAG_GETPROFILE) != 0;
     want_settings = (flags & TZ_FLAG_GETSETTINGS) != 0;
 
     /* Check the current temperature and take action based on it */
     acpi_tz_monitor(sc);
 
     /* If requested, get the power profile settings. */
     if (want_profile)
 	acpi_tz_power_profile(sc);
 
     /*
      * If requested, look for new devices/setpoints, then re-run the
      * monitor so fans are switched according to the new values.
      */
     if (want_settings) {
 	acpi_tz_establish(sc);
 	acpi_tz_monitor(sc);
     }
 
     /* XXX passive cooling actions? */
 }
 
 /*
  * System power profile may have changed; fetch it and tell the thermal
  * zone through _SCP (0 for performance, 1 for economy).
  *
  * Since this can be called from an arbitrary eventhandler, it needs
  * to get the ACPI lock itself.
  *
  * A failed _SCP evaluation sets TZ_FLAG_NO_SCP so it is not tried again;
  * a successful one requests a re-read of the zone settings.
  */
 static void
 acpi_tz_power_profile(void *arg)
 {
     struct acpi_tz_softc	*zone = (struct acpi_tz_softc *)arg;
     ACPI_STATUS			rc;
     int				profile;
 
     profile = power_profile_get_state();
     if (!(profile == POWER_PROFILE_PERFORMANCE ||
 	profile == POWER_PROFILE_ECONOMY))
 	return;
 
     /* We already decided there's no usable _SCP method. */
     if ((zone->tz_flags & TZ_FLAG_NO_SCP) != 0)
 	return;
 
     /* Call _SCP to set the new profile */
     rc = acpi_SetInteger(zone->tz_handle, "_SCP",
 	(profile == POWER_PROFILE_PERFORMANCE) ? 0 : 1);
     if (!ACPI_FAILURE(rc)) {
 	/* We have to re-evaluate the entire zone now */
 	acpi_tz_signal(zone, TZ_FLAG_GETSETTINGS);
 	return;
     }
 
     /* A missing method is not worth a message; other errors are. */
     if (rc != AE_NOT_FOUND) {
 	ACPI_VPRINT(zone->tz_dev,
 		    acpi_device_get_parent_softc(zone->tz_dev),
 		    "can't evaluate %s._SCP - %s\n",
 		    acpi_name(zone->tz_handle),
 		    AcpiFormatException(rc));
     }
     zone->tz_flags |= TZ_FLAG_NO_SCP;
 }
 
 /*
  * Thermal zone monitor thread.
  *
  * Loops forever: keeps a cached array of softc pointers for every device in
  * acpi_tz_devclass, services each zone with the request flags that were
  * pending for it, and then sleeps on acpi_tz_proc for up to
  * hz * acpi_tz_polling_rate ticks unless new requests arrived meanwhile.
  * arg is not used.
  */
 static void
 acpi_tz_thread(void *arg)
 {
     device_t	*devs;
     int		devcount, i;
     int		flags;
     struct acpi_tz_softc **sc;
 
     ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__);
 
     devs = NULL;
     devcount = 0;
     sc = NULL;
 
     for (;;) {
 	/* If the number of devices has changed, re-evaluate. */
 	if (devclass_get_count(acpi_tz_devclass) != devcount) {
 	    /* Drop the previous snapshot before building a new one. */
 	    if (devs != NULL) {
 		free(devs, M_TEMP);
 		free(sc, M_TEMP);
 	    }
 	    /* NOTE(review): the return value is not checked here. */
 	    devclass_get_devices(acpi_tz_devclass, &devs, &devcount);
 	    sc = malloc(sizeof(struct acpi_tz_softc *) * devcount, M_TEMP,
 			M_WAITOK | M_ZERO);
 	    for (i = 0; i < devcount; i++)
 		sc[i] = device_get_softc(devs[i]);
 	}
 
 	/* Check for temperature events and act on them. */
 	for (i = 0; i < devcount; i++) {
 	    /*
 	     * Take a snapshot of the pending flags under the lock and clear
 	     * everything except TZ_FLAG_NO_SCP, then do the work unlocked.
 	     */
 	    ACPI_LOCK(thermal);
 	    flags = sc[i]->tz_flags;
 	    sc[i]->tz_flags &= TZ_FLAG_NO_SCP;
 	    ACPI_UNLOCK(thermal);
 	    acpi_tz_timeout(sc[i], flags);
 	}
 
 	/* If more work to do, don't go to sleep yet. */
 	ACPI_LOCK(thermal);
 	for (i = 0; i < devcount; i++) {
 	    /* Any flag other than NO_SCP means a request came in meanwhile. */
 	    if (sc[i]->tz_flags & ~TZ_FLAG_NO_SCP)
 		break;
 	}
 
 	/*
 	 * If we have no more work, sleep for a while, setting PDROP so that
 	 * the mutex will not be reacquired.  Otherwise, drop the mutex and
 	 * loop to handle more events.
 	 */
 	if (i == devcount)
 	    msleep(&acpi_tz_proc, &thermal_mutex, PZERO | PDROP, "tzpoll",
 		hz * acpi_tz_polling_rate);
 	else
 	    ACPI_UNLOCK(thermal);
     }
 }
 
 /*
  * Undo a passive-cooling frequency change.  Does nothing (returns 0) when
  * no change is recorded in sc->tz_cooling_updated.  Otherwise calls
  * CPUFREQ_SET() on cpufreq unit 0 with a NULL level and clears the flag on
  * success.  Returns ENXIO when that device is missing, or the
  * CPUFREQ_SET() error.
  */
 static int
 acpi_tz_cpufreq_restore(struct acpi_tz_softc *sc)
 {
     device_t cf_dev;
     int rc;
 
     if (!sc->tz_cooling_updated)
 	return (0);
 
     cf_dev = devclass_get_device(devclass_find("cpufreq"), 0);
     if (cf_dev == NULL)
 	return (ENXIO);
 
     ACPI_VPRINT(sc->tz_dev, acpi_device_get_parent_softc(sc->tz_dev),
 	"temperature %d.%dC: resuming previous clock speed (%d MHz)\n",
 	TZ_KELVTOC(sc->tz_temperature), sc->tz_cooling_saved_freq);
 
     rc = CPUFREQ_SET(cf_dev, NULL, CPUFREQ_PRIO_KERN);
     if (rc != 0)
 	return (rc);
 
     sc->tz_cooling_updated = FALSE;
     return (0);
 }
 
 /*
  * Move the CPU frequency by req percentage points for passive cooling.
  *
  * The current frequency is expressed as a percentage of
  * levels[0].total_set.freq, req is subtracted, and the result (clamped to
  * 0..100) is turned back into a target frequency.  A lower target picks the
  * nearest level at or below it; a higher target is only honoured after an
  * earlier decrease and is capped at the frequency saved at that time.
  *
  * Returns 0 on success, ENOMEM/ENXIO for allocation or device lookup
  * failures, or the error from the CPUFREQ_* call that failed.
  *
  * NOTE(review): the search loops assume levels[] is ordered from highest to
  * lowest frequency -- confirm against CPUFREQ_LEVELS().
  */
 static int
 acpi_tz_cpufreq_update(struct acpi_tz_softc *sc, int req)
 {
     device_t dev;
     struct cf_level *levels;
     int num_levels, error, freq, desired_freq, perf, i;
 
     levels = malloc(CPUFREQ_MAX_LEVELS * sizeof(*levels), M_TEMP, M_NOWAIT);
     if (levels == NULL)
 	return (ENOMEM);
 
     /*
      * Find the main device, cpufreq0.  We don't yet support independent
      * CPU frequency control on SMP.
      */
     if ((dev = devclass_get_device(devclass_find("cpufreq"), 0)) == NULL) {
 	error = ENXIO;
 	goto out;
     }
 
     /* Get the current frequency. */
     /* levels[0] is only scratch space here; it is overwritten below. */
     error = CPUFREQ_GET(dev, &levels[0]);
     if (error)
 	goto out;
     freq = levels[0].total_set.freq;
 
     /* Get the current available frequency levels. */
     num_levels = CPUFREQ_MAX_LEVELS;
     error = CPUFREQ_LEVELS(dev, levels, &num_levels);
     if (error) {
 	if (error == E2BIG)
 	    printf("cpufreq: need to increase CPUFREQ_MAX_LEVELS\n");
 	goto out;
     }
 
     /* Calculate the desired frequency as a percent of the max frequency. */
     perf = 100 * freq / levels[0].total_set.freq - req;
     if (perf < 0)
 	perf = 0;
     else if (perf > 100)
 	perf = 100;
     desired_freq = levels[0].total_set.freq * perf / 100;
 
     if (desired_freq < freq) {
 	/* Find the closest available frequency, rounding down. */
 	for (i = 0; i < num_levels; i++)
 	    if (levels[i].total_set.freq <= desired_freq)
 		break;
 
 	/* If we didn't find a relevant setting, use the lowest. */
 	if (i == num_levels)
 	    i--;
     } else {
 	/* If we didn't decrease frequency yet, don't increase it. */
 	if (!sc->tz_cooling_updated) {
 	    sc->tz_cooling_active = FALSE;
 	    goto out;
 	}
 
 	/* Use saved cpu frequency as maximum value. */
 	if (desired_freq > sc->tz_cooling_saved_freq)
 	    desired_freq = sc->tz_cooling_saved_freq;
 
 	/* Find the closest available frequency, rounding up. */
 	for (i = num_levels - 1; i >= 0; i--)
 	    if (levels[i].total_set.freq >= desired_freq)
 		break;
 
 	/* If we didn't find a relevant setting, use the highest. */
 	if (i == -1)
 	    i++;
 
 	/* If we're going to the highest frequency, restore the old setting. */
 	if (i == 0 || desired_freq == sc->tz_cooling_saved_freq) {
 	    error = acpi_tz_cpufreq_restore(sc);
 	    if (error == 0)
 		sc->tz_cooling_active = FALSE;
 	    goto out;
 	}
     }
 
     /* If we are going to a new frequency, activate it. */
     if (levels[i].total_set.freq != freq) {
 	ACPI_VPRINT(sc->tz_dev, acpi_device_get_parent_softc(sc->tz_dev),
 	    "temperature %d.%dC: %screasing clock speed "
 	    "from %d MHz to %d MHz\n",
 	    TZ_KELVTOC(sc->tz_temperature),
 	    (freq > levels[i].total_set.freq) ? "de" : "in",
 	    freq, levels[i].total_set.freq);
 	error = CPUFREQ_SET(dev, &levels[i], CPUFREQ_PRIO_KERN);
 	/* Only the first successful change records the frequency to restore. */
 	if (error == 0 && !sc->tz_cooling_updated) {
 	    sc->tz_cooling_saved_freq = freq;
 	    sc->tz_cooling_updated = TRUE;
 	}
     }
 
 out:
     /* Common exit: the scratch level array is released on every path. */
     if (levels)
 	free(levels, M_TEMP);
     return (error);
 }
 
 /*
  * Passive cooling thread; monitors current temperature according to the
  * cooling interval and calculates whether to scale back CPU frequency.
  *
  * arg is the zone's softc.  The loop runs while sc->tz_cooling_enabled is
  * set; on exit any frequency change is restored, the thread bookkeeping in
  * the softc is cleared and the kernel process exits.
  */
 static void
 acpi_tz_cooling_thread(void *arg)
 {
     struct acpi_tz_softc *sc;
     int error, perf, curr_temp, prev_temp;
 
     ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__);
 
     sc = (struct acpi_tz_softc *)arg;
 
     prev_temp = sc->tz_temperature;
     while (sc->tz_cooling_enabled) {
 	/* While cooling is active, refresh the reading on every pass. */
 	if (sc->tz_cooling_active)
 	    (void)acpi_tz_get_temperature(sc);
 	curr_temp = sc->tz_temperature;
 	/* Reaching the passive setpoint turns cooling on. */
 	if (curr_temp >= sc->tz_zone.psv)
 	    sc->tz_cooling_active = TRUE;
 	if (sc->tz_cooling_active) {
 	    /*
 	     * Requested change: tc1 weights the temperature change since
 	     * the previous pass, tc2 the distance from the passive
 	     * setpoint; the sum is scaled down by 10.
 	     */
 	    perf = sc->tz_zone.tc1 * (curr_temp - prev_temp) +
 		   sc->tz_zone.tc2 * (curr_temp - sc->tz_zone.psv);
 	    perf /= 10;
 
 	    if (perf != 0) {
 		error = acpi_tz_cpufreq_update(sc, perf);
 
 		/*
 		 * If error and not simply a higher priority setting was
 		 * active, disable cooling.
 		 */
 		if (error != 0 && error != EPERM) {
 		    device_printf(sc->tz_dev,
 			"failed to set new freq, disabling passive cooling\n");
 		    sc->tz_cooling_enabled = FALSE;
 		}
 	    }
 	}
 	prev_temp = curr_temp;
 	/* Sleep for hz * tsp / 10 ticks before the next sample. */
 	tsleep(&sc->tz_cooling_proc, PZERO, "cooling",
 	    hz * sc->tz_zone.tsp / 10);
     }
     /* Cooling was disabled: put the CPU frequency back if we changed it. */
     if (sc->tz_cooling_active) {
 	acpi_tz_cpufreq_restore(sc);
 	sc->tz_cooling_active = FALSE;
     }
     /* Clear the proc pointer first, then the running flag under the lock. */
     sc->tz_cooling_proc = NULL;
     ACPI_LOCK(thermal);
     sc->tz_cooling_proc_running = FALSE;
     ACPI_UNLOCK(thermal);
     kproc_exit(0);
 }
 
 /*
  * Report whether the zone has everything passive cooling needs: tc1, tc2,
  * tsp and psv must all be set (not -1) and tsp must be non-zero.
  * Returns 1 when that holds, 0 otherwise.
  *
  * TODO: We ignore _PSL (list of cooling devices) since cpufreq enumerates
  * all CPUs for us.  However, it's possible in the future _PSL will
  * reference non-CPU devices so we may want to support it then.
  */
 static int
 acpi_tz_cooling_is_available(struct acpi_tz_softc *sc)
 {
     if (sc->tz_zone.tc1 == -1 || sc->tz_zone.tc2 == -1)
 	return (0);
     if (sc->tz_zone.tsp == -1 || sc->tz_zone.tsp == 0)
 	return (0);
     if (sc->tz_zone.psv == -1)
 	return (0);
     return (1);
 }
 
 /*
  * Start the passive cooling thread for a zone unless one is already marked
  * as running.
  *
  * sc->tz_cooling_proc_running is tested and set under the thermal lock so
  * that only one caller goes on to create the thread.  If kproc_create()
  * fails, the flag is cleared again and the error is returned; otherwise the
  * result is 0.
  */
 static int
 acpi_tz_cooling_thread_start(struct acpi_tz_softc *sc)
 {
     int error;
 
     ACPI_LOCK(thermal);
     if (sc->tz_cooling_proc_running) {
 	ACPI_UNLOCK(thermal);
 	return (0);
     }
     sc->tz_cooling_proc_running = TRUE;
     ACPI_UNLOCK(thermal);
     error = 0;
     if (sc->tz_cooling_proc == NULL) {
 	error = kproc_create(acpi_tz_cooling_thread, sc,
 	    &sc->tz_cooling_proc, RFHIGHPID, 0, "acpi_cooling%d",
 	    device_get_unit(sc->tz_dev));
 	if (error != 0) {
 	    /* Terminate the line so later console output starts fresh. */
 	    device_printf(sc->tz_dev, "could not create thread - %d\n",
 		error);
 	    /* Creation failed: allow a later attempt to try again. */
 	    ACPI_LOCK(thermal);
 	    sc->tz_cooling_proc_running = FALSE;
 	    ACPI_UNLOCK(thermal);
 	}
     }
     return (error);
 }
Index: head/sys/dev/drm2/i915/i915_gem.c
===================================================================
--- head/sys/dev/drm2/i915/i915_gem.c	(revision 336913)
+++ head/sys/dev/drm2/i915/i915_gem.c	(revision 336914)
@@ -1,4767 +1,4767 @@
 /*
  * Copyright © 2008 Intel Corporation
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
  * to deal in the Software without restriction, including without limitation
  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  * and/or sell copies of the Software, and to permit persons to whom the
  * Software is furnished to do so, subject to the following conditions:
  *
  * The above copyright notice and this permission notice (including the next
  * paragraph) shall be included in all copies or substantial portions of the
  * Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  * IN THE SOFTWARE.
  *
  * Authors:
  *    Eric Anholt <eric@anholt.net>
  *
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2011 The FreeBSD Foundation
  * All rights reserved.
  *
  * This software was developed by Konstantin Belousov under sponsorship from
  * the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <dev/drm2/drmP.h>
 #include <dev/drm2/i915/i915_drm.h>
 #include <dev/drm2/i915/i915_drv.h>
 #include <dev/drm2/i915/intel_drv.h>
 
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
 #include <sys/sf_buf.h>
 
 #include <vm/vm.h>
 #include <vm/vm_pageout.h>
 
 #include <machine/md_var.h>
 
 /* Prototypes for file-local (static) helpers used before their definition. */
 static void i915_gem_object_flush_gtt_write_domain(struct drm_i915_gem_object *obj);
 static void i915_gem_object_flush_cpu_write_domain(struct drm_i915_gem_object *obj);
 static __must_check int i915_gem_object_bind_to_gtt(struct drm_i915_gem_object *obj,
 						    unsigned alignment,
 						    bool map_and_fenceable,
 						    bool nonblocking);
 static int i915_gem_phys_pwrite(struct drm_device *dev,
 				struct drm_i915_gem_object *obj,
 				struct drm_i915_gem_pwrite *args,
 				struct drm_file *file);
 
 /* Fence register helpers. */
 static void i915_gem_write_fence(struct drm_device *dev, int reg,
 				 struct drm_i915_gem_object *obj);
 static void i915_gem_object_update_fence(struct drm_i915_gem_object *obj,
 					 struct drm_i915_fence_reg *fence,
 					 bool enable);
 
 /* Shrinking, purging and truncation helpers. */
 static void i915_gem_inactive_shrink(void *);
 static long i915_gem_purge(struct drm_i915_private *dev_priv, long target);
 static void i915_gem_shrink_all(struct drm_i915_private *dev_priv);
 static void i915_gem_object_truncate(struct drm_i915_gem_object *obj);
 
 static int i915_gem_object_get_pages_range(struct drm_i915_gem_object *obj,
     off_t start, off_t end);
 
 static vm_page_t i915_gem_wire_page(vm_object_t object, vm_pindex_t pindex,
     bool *fresh);
 
 /* malloc(9) type used for allocations made by the i915 GEM code. */
 MALLOC_DEFINE(DRM_I915_GEM, "i915gem", "Allocations from i915 gem");
 /* NOTE(review): presumably a count of pages wired by GEM -- confirm at the users. */
 long i915_gem_wired_pages_cnt;
 
 /*
  * Update an object's state after it has lost its fence register: tiled
  * objects get their mmap released, and the fence bookkeeping is reset.
  */
 static inline void i915_gem_object_fence_lost(struct drm_i915_gem_object *obj)
 {
 	if (obj->tiling_mode != 0) {
 		i915_gem_release_mmap(obj);
 	}
 
 	/* As we do not have an associated fence register, we will force
 	 * a tiling change if we ever need to acquire one.
 	 */
 	obj->fence_reg = I915_FENCE_REG_NONE;
 	obj->fence_dirty = false;
 }
 
 /* Bookkeeping: account for one more object of the given size. */
 static void i915_gem_info_add_obj(struct drm_i915_private *dev_priv,
 				  size_t size)
 {
 	dev_priv->mm.object_memory += size;
 	dev_priv->mm.object_count += 1;
 }
 
 /* Bookkeeping: account for one object of the given size going away. */
 static void i915_gem_info_remove_obj(struct drm_i915_private *dev_priv,
 				     size_t size)
 {
 	dev_priv->mm.object_memory -= size;
 	dev_priv->mm.object_count -= 1;
 }
 
 /*
  * If the GPU is marked wedged, wait (interruptibly, at most 10 seconds) for
  * the error completion to be signalled.
  *
  * Returns 0 when the GPU was not wedged or the wait finished, -EIO on
  * timeout, or the negative value returned by the interrupted wait.
  */
 static int
 i915_gem_wait_for_error(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	struct completion *reset_done = &dev_priv->error_completion;
 	int rc;
 
 	if (atomic_read(&dev_priv->mm.wedged) == 0)
 		return 0;
 
 	/*
 	 * Only wait 10 seconds for the gpu reset to complete to avoid hanging
 	 * userspace. If it takes that long something really bad is going on and
 	 * we should simply try to bail out and fail as gracefully as possible.
 	 */
 	rc = wait_for_completion_interruptible_timeout(reset_done, 10*HZ);
 	if (rc < 0)
 		return rc;
 	if (rc == 0) {
 		DRM_ERROR("Timed out waiting for the gpu reset to complete\n");
 		return -EIO;
 	}
 
 	if (atomic_read(&dev_priv->mm.wedged) != 0) {
 		/* GPU is hung, bump the completion count to account for
 		 * the token we just consumed so that we never hit zero and
 		 * end up waiting upon a subsequent completion event that
 		 * will never happen.
 		 */
 		mtx_lock(&reset_done->lock);
 		reset_done->done += 1;
 		mtx_unlock(&reset_done->lock);
 	}
 	return 0;
 }
 
 /*
  * Wait out any pending GPU error, then take dev->dev_struct_lock
  * exclusively with a signal-interruptible acquire.
  *
  * Returns 0 with the lock held, the error from i915_gem_wait_for_error(),
  * or -EINTR when the lock acquire fails.
  */
 int i915_mutex_lock_interruptible(struct drm_device *dev)
 {
 	int err;
 
 	err = i915_gem_wait_for_error(dev);
 	if (err != 0)
 		return err;
 
 	/*
 	 * interruptible shall it be. might indeed be if dev_lock is
 	 * changed to sx
 	 */
 	if (sx_xlock_sig(&dev->dev_struct_lock) != 0)
 		return -EINTR;
 
 	WARN_ON(i915_verify_lists(dev));
 	return 0;
 }
 
 static inline bool
 i915_gem_object_is_inactive(struct drm_i915_gem_object *obj)
 {
 	return obj->gtt_space && !obj->active;
 }
 
 /*
  * GEM init ioctl: set up the global GTT for the range given in data
  * (struct drm_i915_gem_init).
  *
  * Returns -ENODEV when the driver uses modesetting or the hardware is
  * gen 5 or later, -EINVAL when the range is empty, inverted or not page
  * aligned, and 0 otherwise.
  */
 int
 i915_gem_init_ioctl(struct drm_device *dev, void *data,
 		    struct drm_file *file)
 {
 	struct drm_i915_gem_init *init = data;
 
 	if (drm_core_check_feature(dev, DRIVER_MODESET))
 		return -ENODEV;
 
 	/* The range must be non-empty and both ends page aligned. */
 	if (init->gtt_start >= init->gtt_end)
 		return -EINVAL;
 	if (((init->gtt_end | init->gtt_start) & (PAGE_SIZE - 1)) != 0)
 		return -EINVAL;
 
 	/* GEM with user mode setting was never supported on ilk and later. */
 	if (INTEL_INFO(dev)->gen >= 5)
 		return -ENODEV;
 
 	/*
 	 * XXXKIB. The second-time initialization should be guarded
 	 * against.
 	 */
 	DRM_LOCK(dev);
 	i915_gem_init_global_gtt(dev, init->gtt_start,
 				 init->gtt_end, init->gtt_end);
 	DRM_UNLOCK(dev);
 
 	return 0;
 }
 
 int
 i915_gem_get_aperture_ioctl(struct drm_device *dev, void *data,
 			    struct drm_file *file)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	struct drm_i915_gem_get_aperture *args = data;
 	struct drm_i915_gem_object *obj;
 	size_t pinned;
 
 	pinned = 0;
 	DRM_LOCK(dev);
 	list_for_each_entry(obj, &dev_priv->mm.bound_list, gtt_list)
 		if (obj->pin_count)
 			pinned += obj->gtt_space->size;
 	DRM_UNLOCK(dev);
 
 	args->aper_size = dev_priv->mm.gtt_total;
 	args->aper_available_size = args->aper_size - pinned;
 
 	return 0;
 }
 
 /*
  * Allocate a GEM object of size bytes (rounded up to a page multiple) and
  * create a handle for it in file, stored through handle_p.
  *
  * Returns 0 on success, -EINVAL when the rounded size is 0, -ENOMEM when
  * the object cannot be allocated, or the error from handle creation (in
  * which case the object is torn down again).
  */
 static int
 i915_gem_create(struct drm_file *file,
 		struct drm_device *dev,
 		uint64_t size,
 		uint32_t *handle_p)
 {
 	struct drm_i915_gem_object *obj;
 	u32 new_handle;
 	int err;
 
 	size = roundup(size, PAGE_SIZE);
 	if (size == 0)
 		return -EINVAL;
 
 	/* Allocate the new object */
 	obj = i915_gem_alloc_object(dev, size);
 	if (obj == NULL)
 		return -ENOMEM;
 
 	err = drm_gem_handle_create(file, &obj->base, &new_handle);
 	if (err == 0) {
 		/* drop reference from allocate - handle holds it now */
 		drm_gem_object_unreference(&obj->base);
 		CTR2(KTR_DRM, "object_create %p %x", obj, size);
 
 		*handle_p = new_handle;
 		return 0;
 	}
 
 	/* No handle was created: undo the allocation by hand. */
 	drm_gem_object_release(&obj->base);
 	i915_gem_info_remove_obj(dev->dev_private, obj->base.size);
 	free(obj, DRM_I915_GEM);
 	return err;
 }
 
 /*
  * Create a "dumb" buffer: derive pitch and size from the requested
  * width/height/bpp, store them back into args, and allocate a GEM object of
  * that size whose handle is returned in args->handle.
  *
  * The pitch is the byte width of one row rounded up to a multiple of 64.
  * Returns whatever i915_gem_create() returns.
  */
 int
 i915_gem_dumb_create(struct drm_file *file,
 		     struct drm_device *dev,
 		     struct drm_mode_create_dumb *args)
 {
 	/* have to work out size/pitch and return them */
 	args->pitch = roundup2(args->width * ((args->bpp + 7) / 8), 64);
 	/*
 	 * Widen before multiplying so that pitch * height is computed in
 	 * 64 bits instead of wrapping at the width of the operands.
 	 */
 	args->size = (uint64_t)args->pitch * args->height;
 	return i915_gem_create(file, dev,
 			       args->size, &args->handle);
 }
 
 /*
  * Destroy a dumb buffer by deleting its handle from file; the result of
  * drm_gem_handle_delete() is passed straight back.  dev is not used.
  */
 int i915_gem_dumb_destroy(struct drm_file *file,
 			  struct drm_device *dev,
 			  uint32_t handle)
 {
 	int rc;
 
 	rc = drm_gem_handle_delete(file, handle);
 	return rc;
 }
 
 /**
  * Creates a new mm object and returns a handle to it.
  *
  * data is a struct drm_i915_gem_create: its size field is the requested
  * size and its handle field receives the new handle.
  */
 int
 i915_gem_create_ioctl(struct drm_device *dev, void *data,
 		      struct drm_file *file)
 {
 	struct drm_i915_gem_create *req = data;
 	int rc;
 
 	rc = i915_gem_create(file, dev, req->size, &req->handle);
 	return rc;
 }
 
 /*
  * Returns 1 when the device uses the 9/10/17 bit-6 swizzle mode for X
  * tiling and the object is tiled, 0 otherwise.
  */
 static int i915_gem_object_needs_bit17_swizzle(struct drm_i915_gem_object *obj)
 {
 	drm_i915_private_t *i915 = obj->base.dev->dev_private;
 
 	if (i915->mm.bit_6_swizzle_x != I915_BIT_6_SWIZZLE_9_10_17)
 		return 0;
 	if (obj->tiling_mode == I915_TILING_NONE)
 		return 0;
 	return 1;
 }
 
 /*
  * Copy length bytes starting at gpu_vaddr + gpu_offset out to cpu_vaddr.
  * The copy is done in pieces that never cross a 64-byte boundary, and each
  * piece is read from the source offset with bit 6 flipped (offset ^ 64).
  *
  * Returns 0 on success.  If a piece fails, returns the copy routine's
  * result plus the number of bytes that were still outstanding.
  */
 static inline int
 __copy_to_user_swizzled(char __user *cpu_vaddr,
 			const char *gpu_vaddr, int gpu_offset,
 			int length)
 {
 	int done, chunk, rc;
 
 	for (done = 0; length > 0;
 	     done += chunk, gpu_offset += chunk, length -= chunk) {
 		/* Stop at the end of the current 64-byte line. */
 		chunk = min(roundup2(gpu_offset + 1, 64) - gpu_offset, length);
 
 		rc = __copy_to_user(cpu_vaddr + done,
 				    gpu_vaddr + (gpu_offset ^ 64),
 				    chunk);
 		if (rc)
 			return rc + length;
 	}
 
 	return 0;
 }
 
 /*
  * Copy length bytes from cpu_vaddr into gpu_vaddr starting at gpu_offset.
  * The copy is done in pieces that never cross a 64-byte boundary, and each
  * piece is written at the destination offset with bit 6 flipped
  * (offset ^ 64).
  *
  * Returns 0 on success.  If a piece fails, returns the copy routine's
  * result plus the number of bytes that were still outstanding.
  */
 static inline int
 __copy_from_user_swizzled(char *gpu_vaddr, int gpu_offset,
 			  const char __user *cpu_vaddr,
 			  int length)
 {
 	int done, chunk, rc;
 
 	for (done = 0; length > 0;
 	     done += chunk, gpu_offset += chunk, length -= chunk) {
 		/* Stop at the end of the current 64-byte line. */
 		chunk = min(roundup2(gpu_offset + 1, 64) - gpu_offset, length);
 
 		rc = __copy_from_user(gpu_vaddr + (gpu_offset ^ 64),
 				      cpu_vaddr + done,
 				      chunk);
 		if (rc)
 			return rc + length;
 	}
 
 	return 0;
 }
 
 /* Per-page copy function for the shmem pread fastpath.
  * Flushes invalid cachelines before reading the target if
  * needs_clflush is set.
  *
  * Pages that need bit-17 swizzling are refused with -EINVAL.  The page is
  * mapped with a non-sleeping, CPU-private sf_buf while the thread is
  * pinned; -EFAULT is returned when no sf_buf is available or the copy to
  * user space does not complete, 0 on success. */
 static int
 shmem_pread_fast(vm_page_t page, int shmem_page_offset, int page_length,
 		 char __user *user_data,
 		 bool page_do_bit17_swizzling, bool needs_clflush)
 {
 	struct sf_buf *map;
 	char *src;
 	int left;
 
 	if (unlikely(page_do_bit17_swizzling))
 		return -EINVAL;
 
 	sched_pin();
 	map = sf_buf_alloc(page, SFB_NOWAIT | SFB_CPUPRIVATE);
 	if (map == NULL) {
 		sched_unpin();
 		return (-EFAULT);
 	}
 
 	src = (char *)sf_buf_kva(map) + shmem_page_offset;
 	if (needs_clflush)
 		drm_clflush_virt_range(src, page_length);
 	left = __copy_to_user_inatomic(user_data, src, page_length);
 
 	sf_buf_free(map);
 	sched_unpin();
 
 	if (left)
 		return -EFAULT;
 	return 0;
 }
 
 /*
  * Flush the cache lines covering [addr, addr + length).  When swizzled is
  * set the range is first widened outward to 128-byte boundaries.
  */
 static void
 shmem_clflush_swizzled_range(char *addr, unsigned long length,
 			     bool swizzled)
 {
 	unsigned long lo, hi;
 
 	if (unlikely(swizzled)) {
 		/* For swizzling simply ensure that we always flush both
 		 * channels. Lame, but simple and it works. Swizzled
 		 * pwrite/pread is far from a hotpath - current userspace
 		 * doesn't use it at all. */
 		lo = round_down((unsigned long) addr, 128);
 		hi = round_up((unsigned long) addr + length, 128);
 
 		drm_clflush_virt_range((void *)lo, hi - lo);
 		return;
 	}
 
 	drm_clflush_virt_range(addr, length);
 }
 
 /* Only difference to the fast-path function is that this can handle bit17
  * and uses non-atomic copy and kmap functions.
  *
  * Returns 0 on success and -EFAULT when the copy to user space does not
  * complete. */
 static int
 shmem_pread_slow(vm_page_t page, int shmem_page_offset, int page_length,
 		 char __user *user_data,
 		 bool page_do_bit17_swizzling, bool needs_clflush)
 {
 	struct sf_buf *map;
 	char *base;
 	int left;
 
 	map = sf_buf_alloc(page, 0);
 	base = (char *)sf_buf_kva(map);
 
 	if (needs_clflush) {
 		shmem_clflush_swizzled_range(base + shmem_page_offset,
 					     page_length,
 					     page_do_bit17_swizzling);
 	}
 
 	if (!page_do_bit17_swizzling) {
 		left = __copy_to_user(user_data,
 				      base + shmem_page_offset,
 				      page_length);
 	} else {
 		left = __copy_to_user_swizzled(user_data,
 					       base, shmem_page_offset,
 					       page_length);
 	}
 	sf_buf_free(map);
 
 	if (left)
 		return -EFAULT;
 	return 0;
 }
 
 /*
  * Copy args->size bytes, starting at args->offset within the object, from
  * the object's backing pages to the user buffer at args->data_ptr.
  *
  * The copy proceeds page by page. Each page is first tried with
  * shmem_pread_fast(); if that fails the DRM lock is dropped and the page
  * is copied with shmem_pread_slow() instead.
  *
  * Returns 0 on success or a negative errno.
  */
 static int
 i915_gem_shmem_pread(struct drm_device *dev,
 		     struct drm_i915_gem_object *obj,
 		     struct drm_i915_gem_pread *args,
 		     struct drm_file *file)
 {
 	char __user *user_data;
 	ssize_t remain;
 	off_t offset;
 	int shmem_page_offset, page_length, ret = 0;
 	int obj_do_bit17_swizzling, page_do_bit17_swizzling;
 	int hit_slowpath = 0;
 	int prefaulted = 0;
 	int needs_clflush = 0;
 
 	user_data = to_user_ptr(args->data_ptr);
 	remain = args->size;
 
 	obj_do_bit17_swizzling = i915_gem_object_needs_bit17_swizzle(obj);
 
 	if (!(obj->base.read_domains & I915_GEM_DOMAIN_CPU)) {
 		/* If we're not in the cpu read domain, set ourself into the gtt
 		 * read domain and manually flush cachelines (if required). This
 		 * optimizes for the case when the gpu will dirty the data
 		 * anyway again before the next pread happens. */
 		if (obj->cache_level == I915_CACHE_NONE)
 			needs_clflush = 1;
 		if (obj->gtt_space) {
 			ret = i915_gem_object_set_to_gtt_domain(obj, false);
 			if (ret)
 				return ret;
 		}
 	}
 
 	ret = i915_gem_object_get_pages(obj);
 	if (ret)
 		return ret;
 
 	/* Balanced by i915_gem_object_unpin_pages() at the "out" label. */
 	i915_gem_object_pin_pages(obj);
 
 	offset = args->offset;
 
 	/*
 	 * The VM object lock is held only across the page lookup
 	 * (vm_page_find_least() / vm_page_next()): it is released at the top
 	 * of every iteration and re-acquired at the bottom, just before the
 	 * for-statement advances to the next page.
 	 */
 	VM_OBJECT_WLOCK(obj->base.vm_obj);
 	for (vm_page_t page = vm_page_find_least(obj->base.vm_obj,
 	    OFF_TO_IDX(offset));; page = vm_page_next(page)) {
 		VM_OBJECT_WUNLOCK(obj->base.vm_obj);
 
 		/* Loop exit test lives here so the object lock is dropped
 		 * before leaving the loop. */
 		if (remain <= 0)
 			break;
 
 		/* Operation in this page
 		 *
 		 * shmem_page_offset = offset within page in shmem file
 		 * page_length = bytes to copy for this page
 		 */
 		shmem_page_offset = offset_in_page(offset);
 		page_length = remain;
 		if ((shmem_page_offset + page_length) > PAGE_SIZE)
 			page_length = PAGE_SIZE - shmem_page_offset;
 
 		/* Swizzling applies per page: only when the object needs it
 		 * and bit 17 of the page's physical address is set. */
 		page_do_bit17_swizzling = obj_do_bit17_swizzling &&
 			(page_to_phys(page) & (1 << 17)) != 0;
 
 		ret = shmem_pread_fast(page, shmem_page_offset, page_length,
 				       user_data, page_do_bit17_swizzling,
 				       needs_clflush);
 		if (ret == 0)
 			goto next_page;
 
 		/* Fast path failed: retry this page with the slow path,
 		 * without holding the DRM lock. */
 		hit_slowpath = 1;
 		DRM_UNLOCK(dev);
 
 		if (!prefaulted) {
 			ret = fault_in_multipages_writeable(user_data, remain);
 			/* Userspace is tricking us, but we've already clobbered
 			 * its pages with the prefault and promised to write the
 			 * data up to the first fault. Hence ignore any errors
 			 * and just continue. */
 			(void)ret;
 			prefaulted = 1;
 		}
 
 		ret = shmem_pread_slow(page, shmem_page_offset, page_length,
 				       user_data, page_do_bit17_swizzling,
 				       needs_clflush);
 
 		DRM_LOCK(dev);
 
 next_page:
 		vm_page_reference(page);
 
 		/* The object lock is not held here, so jumping out is safe. */
 		if (ret)
 			goto out;
 
 		remain -= page_length;
 		user_data += page_length;
 		offset += page_length;
 		VM_OBJECT_WLOCK(obj->base.vm_obj);
 	}
 
 out:
 	i915_gem_object_unpin_pages(obj);
 
 	if (hit_slowpath) {
 		/* Fixup: Kill any reinstated backing storage pages */
 		if (obj->madv == __I915_MADV_PURGED)
 			i915_gem_object_truncate(obj);
 	}
 
 	return ret;
 }
 
 /**
  * Reads data from the object referenced by handle.
  *
  * @dev: DRM device
  * @data: ioctl argument, a struct drm_i915_gem_pread
  * @file: DRM file the handle is looked up in
  *
  * A zero-sized request succeeds immediately. Otherwise the user buffer is
  * checked for write access, the object is looked up under the DRM lock,
  * the requested range is bounds-checked against the object size, and the
  * copy is performed by i915_gem_shmem_pread().
  *
  * Returns 0 on success or a negative errno (-EFAULT, -ENOENT, -EINVAL, or
  * whatever the lock / copy helpers return).
  *
  * On error, the contents of *data are undefined.
  */
 int
 i915_gem_pread_ioctl(struct drm_device *dev, void *data,
 		     struct drm_file *file)
 {
 	struct drm_i915_gem_pread *args = data;
 	struct drm_i915_gem_object *obj;
 	int ret = 0;
 
 	if (args->size == 0)
 		return 0;
 
 	if (!useracc(to_user_ptr(args->data_ptr), args->size, VM_PROT_WRITE))
 		return -EFAULT;
 
 	ret = i915_mutex_lock_interruptible(dev);
 	if (ret)
 		return ret;
 
 	obj = to_intel_bo(drm_gem_object_lookup(dev, file, args->handle));
 	/* Lookup-failure check is done on the embedded base object.
 	 * NOTE(review): presumably relies on base being at offset 0 of
 	 * struct drm_i915_gem_object - confirm. */
 	if (&obj->base == NULL) {
 		ret = -ENOENT;
 		goto unlock;
 	}
 
 	/* Bounds check source.  */
 	if (args->offset > obj->base.size ||
 	    args->size > obj->base.size - args->offset) {
 		ret = -EINVAL;
 		goto out;
 	}
 
 #ifdef FREEBSD_WIP
 	/* prime objects have no backing filp to GEM pread/pwrite
 	 * pages from.
 	 */
 	if (!obj->base.filp) {
 		ret = -EINVAL;
 		goto out;
 	}
 #endif /* FREEBSD_WIP */
 
 	CTR3(KTR_DRM, "pread %p %jx %jx", obj, args->offset, args->size);
 
 	ret = i915_gem_shmem_pread(dev, obj, args, file);
 
 out:
 	/* Drop the reference taken by the lookup. */
 	drm_gem_object_unreference(&obj->base);
 unlock:
 	DRM_UNLOCK(dev);
 	return ret;
 }
 
 /*
  * Fast write path; it cannot handle page faults in the source data.
  *
  * Temporarily maps `length` bytes at mapping_addr + page_base as
  * write-combining, copies `length` bytes from user_data to page_offset
  * within that mapping, and removes the mapping again.
  *
  * Returns the value reported by the copy routine (non-zero when the copy
  * did not complete).
  */
 static inline int
 fast_user_write(vm_paddr_t mapping_addr,
 		off_t page_base, int page_offset,
 		char __user *user_data,
 		int length)
 {
 	void __iomem *window;
 	void *dst;
 	unsigned long left;
 
 	window = pmap_mapdev_attr(mapping_addr + page_base,
 	    length, PAT_WRITE_COMBINING);
 
 	/* We can use the cpu mem copy function because this is X86. */
 	dst = (char __force*)window + page_offset;
 	left = __copy_from_user_inatomic_nocache(dst, user_data, length);
 
 	pmap_unmapdev((vm_offset_t)window, length);
 
 	return left;
 }
 
 /**
  * This is the fast pwrite path, where we copy the data directly from the
  * user into the GTT, uncached.
  *
  * The object is pinned, moved to the GTT write domain and has its fence
  * released; the data is then written through the aperture one page at a
  * time with fast_user_write(). The object is unpinned again before
  * returning.
  *
  * Returns 0 on success, -EFAULT if a page could not be copied, or the
  * negative errno of a failed setup step.
  */
 static int
 i915_gem_gtt_pwrite_fast(struct drm_device *dev,
 			 struct drm_i915_gem_object *obj,
 			 struct drm_i915_gem_pwrite *args,
 			 struct drm_file *file)
 {
 	drm_i915_private_t *dev_priv = dev->dev_private;
 	char __user *user_data;
 	off_t offset, page_base;
 	ssize_t remain;
 	int page_offset, page_length, ret;
 
 	ret = i915_gem_object_pin(obj, 0, true, true);
 	if (ret)
 		return ret;
 
 	ret = i915_gem_object_set_to_gtt_domain(obj, true);
 	if (ret == 0)
 		ret = i915_gem_object_put_fence(obj);
 	if (ret)
 		goto unpin;
 
 	user_data = to_user_ptr(args->data_ptr);
 	remain = args->size;
 	offset = obj->gtt_offset + args->offset;
 
 	while (remain > 0) {
 		/*
 		 * page_base   = page offset within aperture
 		 * page_offset = offset within page
 		 * page_length = bytes to copy for this page
 		 */
 		page_base = offset & ~PAGE_MASK;
 		page_offset = offset_in_page(offset);
 		if ((page_offset + remain) > PAGE_SIZE)
 			page_length = PAGE_SIZE - page_offset;
 		else
 			page_length = remain;
 
 		/*
 		 * A fault while copying (presumably) means our source page
 		 * isn't available. Report it; the caller retries in the
 		 * slow path.
 		 */
 		if (fast_user_write(dev_priv->mm.gtt_base_addr, page_base,
 				    page_offset, user_data, page_length)) {
 			ret = -EFAULT;
 			break;
 		}
 
 		offset += page_length;
 		user_data += page_length;
 		remain -= page_length;
 	}
 
 unpin:
 	i915_gem_object_unpin(obj);
 	return ret;
 }
 
 /*
  * Fast-path copy of part of one page for shmem pwrite.
  *
  * Between sched_pin() and sched_unpin(), the page is mapped through a
  * non-sleeping, CPU-private sf_buf. The destination range is clflushed
  * before the copy when needs_clflush_before is set and again after the
  * copy when needs_clflush_after is set.
  *
  * Returns 0 on success, -EINVAL for a bit17-swizzled page (not handled
  * here), or -EFAULT if the mapping could not be obtained or the copy
  * did not complete.
  */
 static int
 shmem_pwrite_fast(vm_page_t page, int shmem_page_offset, int page_length,
 		  char __user *user_data,
 		  bool page_do_bit17_swizzling,
 		  bool needs_clflush_before,
 		  bool needs_clflush_after)
 {
 	struct sf_buf *mapping;
 	char *dst;
 	int uncopied;
 
 	if (unlikely(page_do_bit17_swizzling))
 		return -EINVAL;
 
 	sched_pin();
 	mapping = sf_buf_alloc(page, SFB_NOWAIT | SFB_CPUPRIVATE);
 	if (mapping == NULL) {
 		sched_unpin();
 		return (-EFAULT);
 	}
 
 	dst = (char *)sf_buf_kva(mapping) + shmem_page_offset;
 	if (needs_clflush_before)
 		drm_clflush_virt_range(dst, page_length);
 	uncopied = __copy_from_user_inatomic_nocache(dst, user_data,
 						     page_length);
 	if (needs_clflush_after)
 		drm_clflush_virt_range(dst, page_length);
 
 	sf_buf_free(mapping);
 	sched_unpin();
 
 	if (uncopied)
 		return -EFAULT;
 	return 0;
 }
 
 /*
  * Slow-path counterpart of shmem_pwrite_fast(): it can handle bit17
  * swizzled pages and uses the non-atomic copy routines together with a
  * default (flags == 0) sf_buf mapping.
  *
  * The destination range is flushed before the copy when either
  * needs_clflush_before or page_do_bit17_swizzling is set, and after the
  * copy when needs_clflush_after is set.
  *
  * Returns 0 on success or -EFAULT if the copy did not complete.
  */
 static int
 shmem_pwrite_slow(vm_page_t page, int shmem_page_offset, int page_length,
 		  char __user *user_data,
 		  bool page_do_bit17_swizzling,
 		  bool needs_clflush_before,
 		  bool needs_clflush_after)
 {
 	struct sf_buf *mapping;
 	char *kva;
 	int uncopied;
 
 	mapping = sf_buf_alloc(page, 0);
 	kva = (char *)sf_buf_kva(mapping);
 
 	if (unlikely(needs_clflush_before || page_do_bit17_swizzling))
 		shmem_clflush_swizzled_range(kva + shmem_page_offset,
 					     page_length,
 					     page_do_bit17_swizzling);
 
 	if (!page_do_bit17_swizzling)
 		uncopied = __copy_from_user(kva + shmem_page_offset,
 					    user_data,
 					    page_length);
 	else
 		uncopied = __copy_from_user_swizzled(kva, shmem_page_offset,
 						     user_data,
 						     page_length);
 
 	if (needs_clflush_after)
 		shmem_clflush_swizzled_range(kva + shmem_page_offset,
 					     page_length,
 					     page_do_bit17_swizzling);
 
 	sf_buf_free(mapping);
 
 	if (uncopied)
 		return -EFAULT;
 	return 0;
 }
 
 /*
  * Copy args->size bytes from the user buffer at args->data_ptr into the
  * object's backing pages, starting at args->offset within the object.
  *
  * The copy proceeds page by page. Each page is first tried with
  * shmem_pwrite_fast(); if that fails the DRM lock is dropped and the page
  * is copied with shmem_pwrite_slow() instead. Every page visited is marked
  * dirty and referenced.
  *
  * Returns 0 on success or a negative errno.
  */
 static int
 i915_gem_shmem_pwrite(struct drm_device *dev,
 		      struct drm_i915_gem_object *obj,
 		      struct drm_i915_gem_pwrite *args,
 		      struct drm_file *file)
 {
 	ssize_t remain;
 	off_t offset;
 	char __user *user_data;
 	int shmem_page_offset, page_length, ret = 0;
 	int obj_do_bit17_swizzling, page_do_bit17_swizzling;
 	int hit_slowpath = 0;
 	int needs_clflush_after = 0;
 	int needs_clflush_before = 0;
 
 	user_data = to_user_ptr(args->data_ptr);
 	remain = args->size;
 
 	obj_do_bit17_swizzling = i915_gem_object_needs_bit17_swizzle(obj);
 
 	if (obj->base.write_domain != I915_GEM_DOMAIN_CPU) {
 		/* If we're not in the cpu write domain, set ourself into the gtt
 		 * write domain and manually flush cachelines (if required). This
 		 * optimizes for the case when the gpu will use the data
 		 * right away and we therefore have to clflush anyway. */
 		if (obj->cache_level == I915_CACHE_NONE)
 			needs_clflush_after = 1;
 		if (obj->gtt_space) {
 			ret = i915_gem_object_set_to_gtt_domain(obj, true);
 			if (ret)
 				return ret;
 		}
 	}
 	/* Same trick applies for invalidate partially written cachelines before
 	 * writing.  */
 	if (!(obj->base.read_domains & I915_GEM_DOMAIN_CPU)
 	    && obj->cache_level == I915_CACHE_NONE)
 		needs_clflush_before = 1;
 
 	ret = i915_gem_object_get_pages(obj);
 	if (ret)
 		return ret;
 
 	/* Balanced by i915_gem_object_unpin_pages() at the "out" label. */
 	i915_gem_object_pin_pages(obj);
 
 	offset = args->offset;
 	obj->dirty = 1;
 
 	/*
 	 * The VM object lock is held only across the page lookup
 	 * (vm_page_find_least() / vm_page_next()): it is released at the top
 	 * of every iteration and re-acquired at the bottom, just before the
 	 * for-statement advances to the next page.
 	 */
 	VM_OBJECT_WLOCK(obj->base.vm_obj);
 	for (vm_page_t page = vm_page_find_least(obj->base.vm_obj,
 	    OFF_TO_IDX(offset));; page = vm_page_next(page)) {
 		VM_OBJECT_WUNLOCK(obj->base.vm_obj);
 		int partial_cacheline_write;
 
 		/* Loop exit test lives here so the object lock is dropped
 		 * before leaving the loop. */
 		if (remain <= 0)
 			break;
 
 		/* Operation in this page
 		 *
 		 * shmem_page_offset = offset within page in shmem file
 		 * page_length = bytes to copy for this page
 		 */
 		shmem_page_offset = offset_in_page(offset);
 
 		page_length = remain;
 		if ((shmem_page_offset + page_length) > PAGE_SIZE)
 			page_length = PAGE_SIZE - shmem_page_offset;
 
 		/* If we don't overwrite a cacheline completely we need to be
 		 * careful to have up-to-date data by first clflushing. Don't
 		 * overcomplicate things and flush the entire patch. */
 		partial_cacheline_write = needs_clflush_before &&
 			((shmem_page_offset | page_length)
 				& (cpu_clflush_line_size - 1));
 
 		/* Swizzling applies per page: only when the object needs it
 		 * and bit 17 of the page's physical address is set. */
 		page_do_bit17_swizzling = obj_do_bit17_swizzling &&
 			(page_to_phys(page) & (1 << 17)) != 0;
 
 		ret = shmem_pwrite_fast(page, shmem_page_offset, page_length,
 					user_data, page_do_bit17_swizzling,
 					partial_cacheline_write,
 					needs_clflush_after);
 		if (ret == 0)
 			goto next_page;
 
 		/* Fast path failed: retry this page with the slow path,
 		 * without holding the DRM lock. */
 		hit_slowpath = 1;
 		DRM_UNLOCK(dev);
 		ret = shmem_pwrite_slow(page, shmem_page_offset, page_length,
 					user_data, page_do_bit17_swizzling,
 					partial_cacheline_write,
 					needs_clflush_after);
 
 		DRM_LOCK(dev);
 
 next_page:
 		/* Done regardless of ret: the page may have been partially
 		 * written even when the copy reported an error. */
 		vm_page_dirty(page);
 		vm_page_reference(page);
 
 		/* The object lock is not held here, so jumping out is safe. */
 		if (ret)
 			goto out;
 
 		remain -= page_length;
 		user_data += page_length;
 		offset += page_length;
 		VM_OBJECT_WLOCK(obj->base.vm_obj);
 	}
 
 out:
 	i915_gem_object_unpin_pages(obj);
 
 	if (hit_slowpath) {
 		/* Fixup: Kill any reinstated backing storage pages */
 		if (obj->madv == __I915_MADV_PURGED)
 			i915_gem_object_truncate(obj);
 		/* and flush dirty cachelines in case the object isn't in the cpu write
 		 * domain anymore. */
 		if (obj->base.write_domain != I915_GEM_DOMAIN_CPU) {
 			i915_gem_clflush_object(obj);
 			i915_gem_chipset_flush(dev);
 		}
 	}
 
 	if (needs_clflush_after)
 		i915_gem_chipset_flush(dev);
 
 	return ret;
 }
 
 /**
  * Writes data to the object referenced by handle.
  *
  * @dev: DRM device
  * @data: ioctl argument, a struct drm_i915_gem_pwrite
  * @file: DRM file the handle is looked up in
  *
  * A zero-sized request succeeds immediately. Otherwise the user buffer is
  * checked for read access and prefaulted, the object is looked up under
  * the DRM lock and the requested range is bounds-checked. The write is
  * then dispatched to, in order of preference: the phys-object path, the
  * GTT fast path (uncached, untiled objects outside the CPU write domain),
  * or the shmem path.
  *
  * Returns 0 on success or a negative errno.
  *
  * On error, the contents of the buffer that were to be modified are undefined.
  */
 int
 i915_gem_pwrite_ioctl(struct drm_device *dev, void *data,
 		      struct drm_file *file)
 {
 	struct drm_i915_gem_pwrite *args = data;
 	struct drm_i915_gem_object *obj;
 	int ret;
 
 	if (args->size == 0)
 		return 0;
 
 	if (!useracc(to_user_ptr(args->data_ptr), args->size, VM_PROT_READ))
 		return -EFAULT;
 
 	ret = fault_in_multipages_readable(to_user_ptr(args->data_ptr),
 					   args->size);
 	if (ret)
 		return -EFAULT;
 
 	ret = i915_mutex_lock_interruptible(dev);
 	if (ret)
 		return ret;
 
 	obj = to_intel_bo(drm_gem_object_lookup(dev, file, args->handle));
 	/* Lookup-failure check is done on the embedded base object.
 	 * NOTE(review): presumably relies on base being at offset 0 of
 	 * struct drm_i915_gem_object - confirm. */
 	if (&obj->base == NULL) {
 		ret = -ENOENT;
 		goto unlock;
 	}
 
 	/* Bounds check destination. */
 	if (args->offset > obj->base.size ||
 	    args->size > obj->base.size - args->offset) {
 		ret = -EINVAL;
 		goto out;
 	}
 
 #ifdef FREEBSD_WIP
 	/* prime objects have no backing filp to GEM pread/pwrite
 	 * pages from.
 	 */
 	if (!obj->base.filp) {
 		ret = -EINVAL;
 		goto out;
 	}
 #endif /* FREEBSD_WIP */
 
 	CTR3(KTR_DRM, "pwrite %p %jx %jx", obj, args->offset, args->size);
 
 	/* Start from -EFAULT so that the shmem fallback below runs whenever
 	 * the GTT fast path is not attempted at all. */
 	ret = -EFAULT;
 	/* We can only do the GTT pwrite on untiled buffers, as otherwise
 	 * it would end up going through the fenced access, and we'll get
 	 * different detiling behavior between reading and writing.
 	 * pread/pwrite currently are reading and writing from the CPU
 	 * perspective, requiring manual detiling by the client.
 	 */
 	if (obj->phys_obj) {
 		ret = i915_gem_phys_pwrite(dev, obj, args, file);
 		goto out;
 	}
 
 	if (obj->cache_level == I915_CACHE_NONE &&
 	    obj->tiling_mode == I915_TILING_NONE &&
 	    obj->base.write_domain != I915_GEM_DOMAIN_CPU) {
 		ret = i915_gem_gtt_pwrite_fast(dev, obj, args, file);
 		/* Note that the gtt paths might fail with non-page-backed user
 		 * pointers (e.g. gtt mappings when moving data between
 		 * textures). Fallback to the shmem path in that case. */
 	}
 
 	/* Fall back to the shmem path if the GTT path was skipped, faulted,
 	 * or reported -ENOSPC. */
 	if (ret == -EFAULT || ret == -ENOSPC)
 		ret = i915_gem_shmem_pwrite(dev, obj, args, file);
 
 out:
 	/* Drop the reference taken by the lookup. */
 	drm_gem_object_unreference(&obj->base);
 unlock:
 	DRM_UNLOCK(dev);
 	return ret;
 }
 
 /*
  * Report whether the GPU is currently marked wedged.
  *
  * Returns 0 when mm.wedged is clear. Otherwise returns -EIO for
  * non-interruptible callers (they can't handle -EAGAIN) and for the case
  * where the error completion has already fired (recovery complete but
  * still wedged means the reset failed); returns -EAGAIN in the remaining
  * case.
  */
 int
 i915_gem_check_wedge(struct drm_i915_private *dev_priv,
 		     bool interruptible)
 {
 	struct completion *recovery;
 	bool recovered;
 
 	if (!atomic_read(&dev_priv->mm.wedged))
 		return 0;
 
 	/* Give the error handler a chance to run. */
 	recovery = &dev_priv->error_completion;
 	mtx_lock(&recovery->lock);
 	recovered = recovery->done > 0;
 	mtx_unlock(&recovery->lock);
 
 	if (!interruptible || recovered)
 		return -EIO;
 
 	return -EAGAIN;
 }
 
 /*
  * If seqno is the ring's outstanding lazy request, emit a request for it
  * now and return the result of i915_add_request(); otherwise there is
  * nothing to do and 0 is returned. The DRM lock must be held.
  */
 static int
 i915_gem_check_olr(struct intel_ring_buffer *ring, u32 seqno)
 {
 
 	DRM_LOCK_ASSERT(ring->dev);
 
 	if (seqno != ring->outstanding_lazy_request)
 		return 0;
 
 	return i915_add_request(ring, NULL, NULL);
 }
 
 /**
  * __wait_seqno - wait until execution of seqno has finished
  * @ring: the ring expected to report seqno
  * @seqno: the sequence number to wait for
  * @interruptible: do an interruptible wait (normally yes)
  * @timeout: in - how long to wait (NULL forever); out - how much time remaining
  *
  * Returns 0 if the seqno was found within the allotted time. Else returns the
  * errno with remaining time filled in timeout argument.
  */
 static int __wait_seqno(struct intel_ring_buffer *ring, u32 seqno,
 			bool interruptible, struct timespec *timeout)
 {
 	drm_i915_private_t *dev_priv = ring->dev->dev_private;
 	/* wait_time defaults to one second; it is the length of each sleep
 	 * slice when no timeout was supplied. */
 	struct timespec before, now, wait_time={1,0};
 	sbintime_t timeout_sbt;
 	long end;
 	bool wait_forever = true;
 	int ret, flags;
 
 	/* Already passed: nothing to wait for. */
 	if (i915_seqno_passed(ring->get_seqno(ring, true), seqno))
 		return 0;
 
 	CTR2(KTR_DRM, "request_wait_begin %s %d", ring->name, seqno);
 
 	if (timeout != NULL) {
 		wait_time = *timeout;
 		wait_forever = false;
 	}
 
 	timeout_sbt = tstosbt(wait_time);
 
 	if (WARN_ON(!ring->irq_get(ring)))
 		return -ENODEV;
 
 	/* Record current time in case interrupted by signal, or wedged */
 	getrawmonotonic(&before);
 
 /* Wake-up condition: seqno has passed, or the GPU has been marked wedged. */
 #define EXIT_COND \
 	(i915_seqno_passed(ring->get_seqno(ring, false), seqno) || \
 	atomic_read(&dev_priv->mm.wedged))
 	flags = interruptible ? PCATCH : 0;
 	mtx_lock(&dev_priv->irq_lock);
 	do {
 		if (EXIT_COND) {
 			end = 1;
 		} else {
 			/* The sleep result is negated so it can be compared
 			 * against the negative errno values used below. */
 			ret = -msleep_sbt(&ring->irq_queue, &dev_priv->irq_lock, flags,
 			    "915gwr", timeout_sbt, 0, 0);
 
 			/*
 			 * NOTE Linux<->FreeBSD: Convert msleep_sbt() return
 			 * value to something close to wait_event*_timeout()
 			 * functions used on Linux.
 			 *
 			 * >0 -> condition is true (end = time remaining)
 			 * =0 -> sleep timed out
 			 * <0 -> error (interrupted)
 			 *
 			 * We fake the remaining time by returning 1. We
 			 * compute a proper value later.
 			 */
 			if (EXIT_COND)
 				/* We fake a remaining time of 1 tick. */
 				end = 1;
 			else if (ret == -EINTR || ret == -ERESTART)
 				/* Interrupted. */
 				end = -ERESTARTSYS;
 			else
 				/* Timeout. */
 				end = 0;
 		}
 
 		/* A wedged GPU overrides whatever the sleep reported. */
 		ret = i915_gem_check_wedge(dev_priv, interruptible);
 		if (ret)
 			end = ret;
 		/* Without a caller timeout, a timed-out slice simply sleeps
 		 * again. */
 	} while (end == 0 && wait_forever);
 	mtx_unlock(&dev_priv->irq_lock);
 
 	getrawmonotonic(&now);
 
 	ring->irq_put(ring);
 	CTR3(KTR_DRM, "request_wait_end %s %d %d", ring->name, seqno, end);
 #undef EXIT_COND
 
 	/* Report the remaining time: elapsed = now - before, then the
 	 * elapsed time is subtracted from *timeout. */
 	if (timeout) {
-		timespecsub(&now, &before);
-		timespecsub(timeout, &now);
+		timespecsub(&now, &before, &now);
+		timespecsub(timeout, &now, timeout);
 	}
 
 	switch (end) {
 	case -EIO:
 	case -EAGAIN: /* Wedged */
 	case -ERESTARTSYS: /* Signal */
 	case -ETIMEDOUT: /* Timeout */
 		return (int)end;
 	case 0: /* Timeout */
 		return -ETIMEDOUT;
 	default: /* Completed */
 		WARN_ON(end < 0); /* We're not aware of other errors */
 		return 0;
 	}
 }
 
 /**
  * Waits for a sequence number to be signaled, and cleans up the
  * request and object lists appropriately for that event.
  *
  * The DRM lock must be held and seqno must be non-zero. The wait is
  * interruptible according to dev_priv->mm.interruptible and has no
  * timeout. Returns 0 on success or a negative errno from the wedge
  * check, the lazy-request check or the wait itself.
  */
 int
 i915_wait_seqno(struct intel_ring_buffer *ring, uint32_t seqno)
 {
 	struct drm_device *dev = ring->dev;
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	bool interruptible = dev_priv->mm.interruptible;
 	int ret;
 
 	DRM_LOCK_ASSERT(dev);
 	BUG_ON(seqno == 0);
 
 	ret = i915_gem_check_wedge(dev_priv, interruptible);
 	if (ret == 0)
 		ret = i915_gem_check_olr(ring, seqno);
 	if (ret == 0)
 		ret = __wait_seqno(ring, seqno, interruptible, NULL);
 
 	return ret;
 }
 
 /**
  * Ensures that all rendering to the object has completed and the object is
  * safe to unbind from the GTT or access from the CPU.
  *
  * With readonly set only the last write seqno is waited for; otherwise
  * the last read seqno is. Returns 0 on success (including when there is
  * nothing to wait for) or the negative errno from i915_wait_seqno().
  */
 static __must_check int
 i915_gem_object_wait_rendering(struct drm_i915_gem_object *obj,
 			       bool readonly)
 {
 	struct intel_ring_buffer *ring = obj->ring;
 	u32 wait_for;
 	int err;
 
 	if (readonly)
 		wait_for = obj->last_write_seqno;
 	else
 		wait_for = obj->last_read_seqno;
 	if (wait_for == 0)
 		return 0;
 
 	err = i915_wait_seqno(ring, wait_for);
 	if (err != 0)
 		return err;
 
 	i915_gem_retire_requests_ring(ring);
 
 	/*
 	 * The buffer may not have been retired yet, so the write flush is
 	 * managed by hand here.
 	 */
 	if (obj->last_write_seqno != 0 &&
 	    i915_seqno_passed(wait_for, obj->last_write_seqno)) {
 		obj->last_write_seqno = 0;
 		obj->base.write_domain &= ~I915_GEM_GPU_DOMAINS;
 	}
 
 	return 0;
 }
 
 /*
  * Variant of the rendering wait that drops the DRM lock while sleeping.
  * This is a highly dangerous routine as the object state may change
  * during this call.
  *
  * Must be called with the DRM lock held and mm.interruptible set. Returns
  * 0 on success (including when there is nothing to wait for) or a
  * negative errno.
  */
 static __must_check int
 i915_gem_object_wait_rendering__nonblocking(struct drm_i915_gem_object *obj,
 					    bool readonly)
 {
 	struct drm_device *dev = obj->base.dev;
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	struct intel_ring_buffer *ring = obj->ring;
 	u32 wait_for;
 	int err;
 
 	DRM_LOCK_ASSERT(dev);
 	BUG_ON(!dev_priv->mm.interruptible);
 
 	if (readonly)
 		wait_for = obj->last_write_seqno;
 	else
 		wait_for = obj->last_read_seqno;
 	if (wait_for == 0)
 		return 0;
 
 	err = i915_gem_check_wedge(dev_priv, true);
 	if (err != 0)
 		return err;
 
 	err = i915_gem_check_olr(ring, wait_for);
 	if (err != 0)
 		return err;
 
 	/* Sleep without the DRM lock, then take it back. */
 	DRM_UNLOCK(dev);
 	err = __wait_seqno(ring, wait_for, true, NULL);
 	DRM_LOCK(dev);
 
 	i915_gem_retire_requests_ring(ring);
 
 	/*
 	 * The buffer may not have been retired yet, so the write flush is
 	 * managed by hand here - but only if the wait succeeded.
 	 */
 	if (err == 0 &&
 	    obj->last_write_seqno != 0 &&
 	    i915_seqno_passed(wait_for, obj->last_write_seqno)) {
 		obj->last_write_seqno = 0;
 		obj->base.write_domain &= ~I915_GEM_GPU_DOMAINS;
 	}
 
 	return err;
 }
 
 /**
  * Called when user space prepares to use an object with the CPU, either
  * through the mmap ioctl's mapping or a GTT mapping.
  */
 int
 i915_gem_set_domain_ioctl(struct drm_device *dev, void *data,
 			  struct drm_file *file)
 {
 	struct drm_i915_gem_set_domain *args = data;
 	struct drm_i915_gem_object *obj;
 	uint32_t read_domains = args->read_domains;
 	uint32_t write_domain = args->write_domain;
 	int ret;
 
 	/* Only handle setting domains to types used by the CPU. */
 	if (write_domain & I915_GEM_GPU_DOMAINS)
 		return -EINVAL;
 
 	if (read_domains & I915_GEM_GPU_DOMAINS)
 		return -EINVAL;
 
 	/* Having something in the write domain implies it's in the read
 	 * domain, and only that read domain.  Enforce that in the request.
 	 */
 	if (write_domain != 0 && read_domains != write_domain)
 		return -EINVAL;
 
 	ret = i915_mutex_lock_interruptible(dev);
 	if (ret)
 		return ret;
 
 	obj = to_intel_bo(drm_gem_object_lookup(dev, file, args->handle));
 	if (&obj->base == NULL) {
 		ret = -ENOENT;
 		goto unlock;
 	}
 
 	/* Try to flush the object off the GPU without holding the lock.
 	 * We will repeat the flush holding the lock in the normal manner
 	 * to catch cases where we are gazumped.
 	 */
 	ret = i915_gem_object_wait_rendering__nonblocking(obj, !write_domain);
 	if (ret)
 		goto unref;
 
 	if (read_domains & I915_GEM_DOMAIN_GTT) {
 		ret = i915_gem_object_set_to_gtt_domain(obj, write_domain != 0);
 
 		/* Silently promote "you're not bound, there was nothing to do"
 		 * to success, since the client was just asking us to
 		 * make sure everything was done.
 		 */
 		if (ret == -EINVAL)
 			ret = 0;
 	} else {
 		ret = i915_gem_object_set_to_cpu_domain(obj, write_domain != 0);
 	}
 
 unref:
 	drm_gem_object_unreference(&obj->base);
 unlock:
 	DRM_UNLOCK(dev);
 	return ret;
 }
 
 /**
  * Called when user space has done writes to this buffer.
  *
  * If the object is pinned its CPU write domain is flushed, since pinned
  * buffers may be scanout. Returns 0 on success, -ENOENT for an unknown
  * handle, or the error from taking the DRM lock.
  */
 int
 i915_gem_sw_finish_ioctl(struct drm_device *dev, void *data,
 			 struct drm_file *file)
 {
 	struct drm_i915_gem_sw_finish *args = data;
 	struct drm_i915_gem_object *obj;
 	int ret;
 
 	ret = i915_mutex_lock_interruptible(dev);
 	if (ret)
 		return ret;
 
 	obj = to_intel_bo(drm_gem_object_lookup(dev, file, args->handle));
 	if (&obj->base != NULL) {
 		/* Pinned buffers may be scanout, so flush the cache */
 		if (obj->pin_count)
 			i915_gem_object_flush_cpu_write_domain(obj);
 
 		drm_gem_object_unreference(&obj->base);
 	} else {
 		ret = -ENOENT;
 	}
 
 	DRM_UNLOCK(dev);
 	return ret;
 }
 
 /**
  * Maps the contents of an object, returning the address it is mapped
  * into.
  *
  * @dev: DRM device
  * @data: ioctl argument, a struct drm_i915_gem_mmap; on success
  *        args->addr_ptr receives the address chosen by vm_map_find()
  * @file: DRM file the handle is looked up in
  *
  * A zero-sized request succeeds without creating a mapping. Returns 0 on
  * success, -ENOENT for an unknown handle, -ENOMEM if the mapping would
  * exceed the process' RLIMIT_VMEM, or the negated errno derived from the
  * vm_map_find() result.
  *
  * While the mapping holds a reference on the contents of the object, it doesn't
  * imply a ref on the object itself.
  */
 int
 i915_gem_mmap_ioctl(struct drm_device *dev, void *data,
 		    struct drm_file *file)
 {
 	struct drm_i915_gem_mmap *args = data;
 	struct drm_gem_object *obj;
 	struct proc *p;
 	vm_map_t map;
 	vm_offset_t addr;
 	vm_size_t size;
 	int error, rv;
 
 	obj = drm_gem_object_lookup(dev, file, args->handle);
 	if (obj == NULL)
 		return -ENOENT;
 
 #ifdef FREEBSD_WIP
 	/* prime objects have no backing filp to GEM mmap
 	 * pages from.
 	 */
 	if (!obj->filp) {
 		drm_gem_object_unreference_unlocked(obj);
 		return -EINVAL;
 	}
 #endif /* FREEBSD_WIP */
 
 	error = 0;
 	if (args->size == 0)
 		goto out;
 	p = curproc;
 	map = &p->p_vmspace->vm_map;
 	size = round_page(args->size);
 	/* Refuse the mapping if it would push the process past its
 	 * virtual-memory resource limit. */
 	PROC_LOCK(p);
 	if (map->size + size > lim_cur_proc(p, RLIMIT_VMEM)) {
 		PROC_UNLOCK(p);
 		error = -ENOMEM;
 		goto out;
 	}
 	PROC_UNLOCK(p);
 
 	/* addr == 0 is only the starting hint for vm_map_find(). */
 	addr = 0;
 	/* The map entry takes its own reference on the VM object; it is
 	 * dropped again below if the mapping fails. */
 	vm_object_reference(obj->vm_obj);
 	/* NOTE(review): the limit check above uses the page-rounded size,
 	 * while the raw args->size is passed here - confirm intended. */
 	rv = vm_map_find(map, obj->vm_obj, args->offset, &addr, args->size, 0,
 	    VMFS_OPTIMAL_SPACE, VM_PROT_READ | VM_PROT_WRITE,
 	    VM_PROT_READ | VM_PROT_WRITE, MAP_INHERIT_SHARE);
 	if (rv != KERN_SUCCESS) {
 		vm_object_deallocate(obj->vm_obj);
 		error = -vm_mmap_to_errno(rv);
 	} else {
 		args->addr_ptr = (uint64_t)addr;
 	}
 out:
 	/* Drop the GEM reference taken by the lookup. */
 	drm_gem_object_unreference_unlocked(obj);
 	return (error);
 }
 
 /*
  * cdev pager constructor for GEM objects: sets *color to 0 and returns 0.
  * No reference is taken here; see the note in the body.
  */
 static int
 i915_gem_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot,
     vm_ooffset_t foff, struct ucred *cred, u_short *color)
 {
 
 	/*
 	 * NOTE Linux<->FreeBSD: drm_gem_mmap_single() takes care of
 	 * calling drm_gem_object_reference(). That's why we don't
 	 * do this here. i915_gem_pager_dtor(), below, will call
 	 * drm_gem_object_unreference().
 	 *
 	 * On Linux, drm_gem_vm_open() references the object because
 	 * it's called when the mapping is copied. drm_gem_vm_open() is
 	 * not called when the mapping is created. So the possible
 	 * sequences are:
 	 *     1. drm_gem_mmap():     ref++
 	 *     2. drm_gem_vm_close(): ref--
 	 *
 	 *     1. drm_gem_mmap():     ref++
 	 *     2. drm_gem_vm_open():  ref++ (for the copied vma)
 	 *     3. drm_gem_vm_close(): ref-- (for the copied vma)
 	 *     4. drm_gem_vm_close(): ref-- (for the initial vma)
 	 *
 	 * On FreeBSD, i915_gem_pager_ctor() is called once during the
 	 * creation of the mapping. No callback is called when the
 	 * mapping is shared during a fork(). i915_gem_pager_dtor() is
 	 * called when the last reference to the mapping is dropped. So
 	 * the only sequence is:
 	 *     1. drm_gem_mmap_single(): ref++
 	 *     2. i915_gem_pager_ctor(): <noop>
 	 *     3. i915_gem_pager_dtor(): ref--
 	 */
 
 	*color = 0; /* XXXKIB */
 	return (0);
 }
 
 /**
  * i915_gem_fault - fault a page into the GTT
  * vma: VMA in question
  * vmf: fault info
  *
  * The fault handler is set up by drm_gem_mmap() when a object is GTT mapped
  * from userspace.  The fault handler takes care of binding the object to
  * the GTT (if needed), allocating and programming a fence register (again,
  * only if needed based on whether the old reg is still valid or the object
  * is tiled) and inserting a new PTE into the faulting process.
  *
  * Note that the faulting process may involve evicting existing objects
  * from the GTT and/or fence registers to make room.  So performance may
  * suffer if the GTT working set is large or there are few fence registers
  * left.
  */
 
 /* When non-zero, the pager populate handler takes the DRM lock
  * interruptibly instead of with a plain DRM_LOCK(). */
 int i915_intr_pf;
 
 /*
  * cdev pager populate handler: provide the page at index pidx of a
  * GTT-mapped GEM object.
  *
  * Entered with vm_obj write-locked; the lock is dropped immediately and
  * is held again on every return. On success the page is exclusive-busied,
  * *first and *last are both set to pidx and VM_PAGER_OK is returned. On
  * failure VM_PAGER_ERROR is returned; -EAGAIN, -EIO and -EINTR are not
  * reported but retried after yielding.
  */
 static int
 i915_gem_pager_populate(vm_object_t vm_obj, vm_pindex_t pidx, int fault_type,
     vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last)
 {
 	struct drm_gem_object *gem_obj = vm_obj->handle;
 	struct drm_i915_gem_object *obj = to_intel_bo(gem_obj);
 	struct drm_device *dev = obj->base.dev;
 	drm_i915_private_t *dev_priv = dev->dev_private;
 	vm_page_t page;
 	int ret = 0;
 	bool write = (max_prot & VM_PROT_WRITE) != 0;
 	bool pinned;
 
 	/* The DRM lock is taken below with the object lock released. */
 	VM_OBJECT_WUNLOCK(vm_obj);
 retry:
 	/* Every retry starts from a clean state with neither lock held. */
 	ret = 0;
 	pinned = 0;
 	page = NULL;
 
 	if (i915_intr_pf) {
 		ret = i915_mutex_lock_interruptible(dev);
 		if (ret != 0)
 			goto out;
 	} else
 		DRM_LOCK(dev);
 
 	/*
 	 * Since the object lock was dropped, other thread might have
 	 * faulted on the same GTT address and instantiated the
 	 * mapping for the page.  Recheck.
 	 */
 	VM_OBJECT_WLOCK(vm_obj);
 	page = vm_page_lookup(vm_obj, pidx);
 	if (page != NULL) {
 		if (vm_page_busied(page)) {
 			/* Release both locks, wait for the page to become
 			 * unbusy, and start over. */
 			DRM_UNLOCK(dev);
 			vm_page_lock(page);
 			VM_OBJECT_WUNLOCK(vm_obj);
 			vm_page_busy_sleep(page, "915pee", false);
 			goto retry;
 		}
 		/* Reached with the object lock still held. */
 		goto have_page;
 	} else
 		VM_OBJECT_WUNLOCK(vm_obj);
 
 	/* Now bind it into the GTT if needed */
 	ret = i915_gem_object_pin(obj, 0, true, false);
 	if (ret)
 		goto unlock;
 	pinned = 1;
 
 	ret = i915_gem_object_set_to_gtt_domain(obj, write);
 	if (ret)
 		goto unpin;
 
 	ret = i915_gem_object_get_fence(obj);
 	if (ret)
 		goto unpin;
 
 	obj->fault_mappable = true;
 
 	/* Look up the vm_page for the aperture address backing this index. */
 	page = PHYS_TO_VM_PAGE(dev_priv->mm.gtt_base_addr + obj->gtt_offset +
 	    IDX_TO_OFF(pidx));
 	if (page == NULL) {
 		ret = -EFAULT;
 		goto unpin;
 	}
 	KASSERT((page->flags & PG_FICTITIOUS) != 0,
 	    ("physical address %#jx not fictitious, page %p",
 	    (uintmax_t)(dev_priv->mm.gtt_base_addr + obj->gtt_offset +
 	    IDX_TO_OFF(pidx)), page));
 	KASSERT(page->wire_count == 1, ("wire_count not 1 %p", page));
 
 	VM_OBJECT_WLOCK(vm_obj);
 	if (vm_page_busied(page)) {
 		/* Undo the pin and release both locks before sleeping on
 		 * the busy page, then start over. */
 		i915_gem_object_unpin(obj);
 		DRM_UNLOCK(dev);
 		vm_page_lock(page);
 		VM_OBJECT_WUNLOCK(vm_obj);
 		vm_page_busy_sleep(page, "915pbs", false);
 		goto retry;
 	}
 	if (vm_page_insert(page, vm_obj, pidx)) {
 		/* Insertion failed: undo the pin, release both locks, wait
 		 * via vm_wait() and start over. */
 		i915_gem_object_unpin(obj);
 		DRM_UNLOCK(dev);
 		VM_OBJECT_WUNLOCK(vm_obj);
 		vm_wait(vm_obj);
 		goto retry;
 	}
 	page->valid = VM_PAGE_BITS_ALL;
 have_page:
 	/* Both entry paths arrive here with the object lock held. */
 	vm_page_xbusy(page);
 
 	CTR4(KTR_DRM, "fault %p %jx %x phys %x", gem_obj, pidx, fault_type,
 	    page->phys_addr);
 	if (pinned) {
 		/*
 		 * We may have not pinned the object if the page was
 		 * found by the call to vm_page_lookup().
 		 */
 		i915_gem_object_unpin(obj);
 	}
 	DRM_UNLOCK(dev);
 	*first = *last = pidx;
 	return (VM_PAGER_OK);
 
 unpin:
 	i915_gem_object_unpin(obj);
 unlock:
 	DRM_UNLOCK(dev);
 out:
 	KASSERT(ret != 0, ("i915_gem_pager_fault: wrong return"));
 	CTR4(KTR_DRM, "fault_fail %p %jx %x err %d", gem_obj, pidx, fault_type,
 	    -ret);
 	if (ret == -ERESTARTSYS) {
 		/*
 		 * NOTE Linux<->FreeBSD: Convert Linux' -ERESTARTSYS to
 		 * the more common -EINTR, so the page fault is retried.
 		 */
 		ret = -EINTR;
 	}
 	/* These errors are retried after yielding the CPU rather than
 	 * being reported to the caller. */
 	if (ret == -EAGAIN || ret == -EIO || ret == -EINTR) {
 		kern_yield(PRI_USER);
 		goto retry;
 	}
 	/* Re-take the object lock before returning the error. */
 	VM_OBJECT_WLOCK(vm_obj);
 	return (VM_PAGER_ERROR);
 }
 
 /*
  * cdev pager destructor: drops one reference on the GEM object passed as
  * handle, under the DRM lock.
  */
 static void
 i915_gem_pager_dtor(void *handle)
 {
 	struct drm_gem_object *gem_obj;
 	struct drm_device *drm_dev;
 
 	gem_obj = handle;
 	drm_dev = gem_obj->dev;
 
 	DRM_LOCK(drm_dev);
 	drm_gem_object_unreference(gem_obj);
 	DRM_UNLOCK(drm_dev);
 }
 
 /* cdev pager callbacks (populate, constructor, destructor) for GEM
  * objects. */
 struct cdev_pager_ops i915_gem_pager_ops = {
 	.cdev_pg_populate	= i915_gem_pager_populate,
 	.cdev_pg_ctor		= i915_gem_pager_ctor,
 	.cdev_pg_dtor		= i915_gem_pager_dtor,
 };
 
 /**
  * i915_gem_release_mmap - remove physical page mappings
  * @obj: obj in question
  *
  * Preserve the reservation of the mmapping with the DRM core code, but
  * relinquish ownership of the pages back to the system.
  *
  * It is vital that we remove the page mapping if we have mapped a tiled
  * object through the GTT and then lose the fence register due to
  * resource pressure. Similarly if the object has been moved out of the
  * aperture, then pages mapped into userspace must be revoked. Removing the
  * mapping will then trigger a page fault on the next user access, allowing
  * fixup by i915_gem_fault().
  */
 void
 i915_gem_release_mmap(struct drm_i915_gem_object *obj)
 {
 	vm_object_t devobj;
 	vm_page_t m;
 	int idx, npages;
 
 	/* Nothing to do unless the object is flagged as fault-mappable. */
 	if (!obj->fault_mappable)
 		return;
 
 	CTR3(KTR_DRM, "release_mmap %p %x %x", obj, obj->gtt_offset,
 	    OFF_TO_IDX(obj->base.size));
 	devobj = cdev_pager_lookup(obj);
 	if (devobj != NULL) {
 		npages = OFF_TO_IDX(obj->base.size);
 
 		VM_OBJECT_WLOCK(devobj);
 		idx = 0;
 		while (idx < npages) {
 			m = vm_page_lookup(devobj, idx);
 			if (m != NULL) {
 				/*
 				 * If we slept on a busy page, start the scan
 				 * over from the first page index.
 				 */
 				if (vm_page_sleep_if_busy(m, "915unm")) {
 					idx = 0;
 					continue;
 				}
 				cdev_pager_free_page(devobj, m);
 			}
 			idx++;
 		}
 		VM_OBJECT_WUNLOCK(devobj);
 		/* Done with the VM object returned by cdev_pager_lookup(). */
 		vm_object_deallocate(devobj);
 	}
 
 	obj->fault_mappable = false;
 }
 
 /*
  * Return the GTT size to use for an object of `size' bytes.  On gen >= 4,
  * or for untiled objects, that is the object size itself; otherwise it is
  * the smallest power-of-two multiple of the minimum fence region size
  * (1MB on gen 3, 512KB otherwise) that is not smaller than `size'.
  */
 static uint32_t
 i915_gem_get_gtt_size(struct drm_device *dev, uint32_t size, int tiling_mode)
 {
 	uint32_t fence_size;
 
 	if (INTEL_INFO(dev)->gen >= 4 ||
 	    tiling_mode == I915_TILING_NONE)
 		return size;
 
 	/* Previous chips need a power-of-two fence region when tiling */
 	fence_size = (INTEL_INFO(dev)->gen == 3) ? 1024*1024 : 512*1024;
 
 	/* Double the region until the object fits. */
 	for (; fence_size < size; fence_size <<= 1)
 		continue;
 
 	return fence_size;
 }
 
 /**
  * i915_gem_get_gtt_alignment - return required GTT alignment for an object
  * @dev: the device
  * @size: size of the object
  * @tiling_mode: tiling mode of the object
  *
  * Return the required GTT alignment for an object, taking into account
  * potential fence register mapping.
  */
 static uint32_t
 i915_gem_get_gtt_alignment(struct drm_device *dev,
 			   uint32_t size,
 			   int tiling_mode)
 {
 	/*
 	 * Pre-gen4 chips with a tiled object need to be aligned to the size
 	 * of the smallest fence register that can contain the object.
 	 */
 	if (INTEL_INFO(dev)->gen < 4 &&
 	    tiling_mode != I915_TILING_NONE)
 		return i915_gem_get_gtt_size(dev, size, tiling_mode);
 
 	/* Everything else only needs the 4k minimum (GTT page size). */
 	return 4096;
 }
 
 /**
  * i915_gem_get_unfenced_gtt_alignment - return required GTT alignment for an
  *					 unfenced object
  * @dev: the device
  * @size: size of the object
  * @tiling_mode: tiling mode of the object
  *
  * Return the required GTT alignment for an object, only taking into account
  * unfenced tiled surface requirements.
  */
 uint32_t
 i915_gem_get_unfenced_gtt_alignment(struct drm_device *dev,
 				    uint32_t size,
 				    int tiling_mode)
 {
 	/*
 	 * Older hardware (pre-gen4 and not G33) with a tiled object needs to
 	 * be aligned to a power-of-two tile height.  The simplest method for
 	 * determining this is to reuse the fence region size computation.
 	 */
 	if (INTEL_INFO(dev)->gen < 4 && !IS_G33(dev) &&
 	    tiling_mode != I915_TILING_NONE)
 		return i915_gem_get_gtt_size(dev, size, tiling_mode);
 
 	/* Minimum alignment is 4k (GTT page size) for sane hw. */
 	return 4096;
 }
 
 /*
  * Create the mmap offset for an object unless it is already on the map.
  * When drm_gem_create_mmap_offset() reports -ENOSPC, retry after purging
  * purgeable objects and, failing that, after shrinking everything.
  * Returns 0 or the error of the last drm_gem_create_mmap_offset() call.
  */
 static int i915_gem_object_create_mmap_offset(struct drm_i915_gem_object *obj)
 {
 	struct drm_i915_private *dev_priv = obj->base.dev->dev_private;
 	int err;
 
 	if (obj->base.on_map)
 		return 0;
 
 	/* Flag is held set for the duration of the attempts below. */
 	dev_priv->mm.shrinker_no_lock_stealing = true;
 
 	err = drm_gem_create_mmap_offset(&obj->base);
 	if (err == -ENOSPC) {
 		/* Badly fragmented mmap space? The only way we can recover
 		 * space is by destroying unwanted objects. We can't randomly
 		 * release mmap_offsets as userspace expects them to be
 		 * persistent for the lifetime of the objects. The closest we
 		 * can is to release the offsets on purgeable objects by
 		 * truncating it and marking it purged, which prevents
 		 * userspace from ever using that object again.
 		 */
 		i915_gem_purge(dev_priv, obj->base.size >> PAGE_SHIFT);
 		err = drm_gem_create_mmap_offset(&obj->base);
 		if (err == -ENOSPC) {
 			/* Last resort: shrink everything and try once more. */
 			i915_gem_shrink_all(dev_priv);
 			err = drm_gem_create_mmap_offset(&obj->base);
 		}
 	}
 
 	dev_priv->mm.shrinker_no_lock_stealing = false;
 
 	return err;
 }
 
 /* Release the object's mmap offset, if it is currently on the map. */
 static void i915_gem_object_free_mmap_offset(struct drm_i915_gem_object *obj)
 {
 	if (obj->base.on_map)
 		drm_gem_free_mmap_offset(&obj->base);
 }
 
 /*
  * Look up the GEM object named by `handle' and store its fake mmap offset
  * in *offset.
  *
  * Returns 0 on success, the error from i915_mutex_lock_interruptible(),
  * -ENOENT when the handle does not resolve, -E2BIG when the object is
  * larger than the mappable GTT end, -EINVAL when the object is not marked
  * I915_MADV_WILLNEED, or the error from creating the mmap offset.
  */
 int
 i915_gem_mmap_gtt(struct drm_file *file,
 		  struct drm_device *dev,
 		  uint32_t handle,
 		  uint64_t *offset)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	struct drm_i915_gem_object *obj;
 	int err;
 
 	err = i915_mutex_lock_interruptible(dev);
 	if (err != 0)
 		return err;
 
 	obj = to_intel_bo(drm_gem_object_lookup(dev, file, handle));
 	if (&obj->base == NULL) {
 		err = -ENOENT;
 		goto unlock;
 	}
 
 	if (obj->base.size > dev_priv->mm.gtt_mappable_end) {
 		err = -E2BIG;
 	} else if (obj->madv != I915_MADV_WILLNEED) {
 		DRM_ERROR("Attempting to mmap a purgeable buffer\n");
 		err = -EINVAL;
 	} else {
 		err = i915_gem_object_create_mmap_offset(obj);
 		if (err == 0) {
 			*offset = DRM_GEM_MAPPING_OFF(obj->base.map_list.key) |
 			    DRM_GEM_MAPPING_KEY;
 		}
 	}
 
 	/* Drop the reference taken by the lookup. */
 	drm_gem_object_unreference(&obj->base);
 unlock:
 	DRM_UNLOCK(dev);
 	return err;
 }
 
 /**
  * i915_gem_mmap_gtt_ioctl - prepare an object for GTT mmap'ing
  * @dev: DRM device
  * @data: GTT mapping ioctl data
  * @file: GEM object info
  *
  * Simply returns the fake offset to userspace so it can mmap it.
  * The mmap call will end up in drm_gem_mmap(), which will set things
  * up so we can get faults in the handler above.
  *
  * The fault handler will take care of binding the object into the GTT
  * (since it may have been evicted to make room for something), allocating
  * a fence register, and mapping the appropriate aperture address into
  * userspace.
  */
 int
 i915_gem_mmap_gtt_ioctl(struct drm_device *dev, void *data,
 			struct drm_file *file)
 {
 	struct drm_i915_gem_mmap_gtt *req;
 
 	/* The ioctl payload carries the handle in and the offset out. */
 	req = data;
 
 	return i915_gem_mmap_gtt(file, dev, req->handle, &req->offset);
 }
 
 /* Immediately discard the backing storage */
 static void
 i915_gem_object_truncate(struct drm_i915_gem_object *obj)
 {
 	vm_object_t backing;
 
 	backing = obj->base.vm_obj;
 
 	/* Remove the pages of the backing VM object under its write lock. */
 	VM_OBJECT_WLOCK(backing);
 	vm_object_page_remove(backing, 0, 0, false);
 	VM_OBJECT_WUNLOCK(backing);
 
 	i915_gem_object_free_mmap_offset(obj);
 
 	/* Mark the object as purged. */
 	obj->madv = __I915_MADV_PURGED;
 }
 
 /* Return non-zero iff the object is marked I915_MADV_DONTNEED. */
 static inline int
 i915_gem_object_is_purgeable(struct drm_i915_gem_object *obj)
 {
 	if (obj->madv == I915_MADV_DONTNEED)
 		return 1;
 	return 0;
 }
 
 /*
  * Unwire the pages with indices [si, ei) of the object's backing VM
  * object, which the caller must hold locked.  The global wired page
  * counter is decremented for every page vm_page_unwire() reports true for.
  */
 static void
 i915_gem_object_put_pages_range_locked(struct drm_i915_gem_object *obj,
     vm_pindex_t si, vm_pindex_t ei)
 {
 	vm_object_t vm_obj;
 	vm_page_t m;
 	vm_pindex_t idx;
 
 	vm_obj = obj->base.vm_obj;
 	VM_OBJECT_ASSERT_LOCKED(vm_obj);
 
 	idx = si;
 	m = vm_page_lookup(vm_obj, idx);
 	while (idx < ei) {
 		/* The pages are expected to be contiguous in the object. */
 		KASSERT(m->pindex == idx, ("pindex %jx %jx",
 		    (uintmax_t)m->pindex, (uintmax_t)idx));
 		vm_page_lock(m);
 		if (vm_page_unwire(m, PQ_INACTIVE))
 			atomic_add_long(&i915_gem_wired_pages_cnt, -1);
 		vm_page_unlock(m);
 
 		m = vm_page_next(m);
 		idx++;
 	}
 }
 
 #define	GEM_PARANOID_CHECK_GTT 0
 #if GEM_PARANOID_CHECK_GTT
 /*
  * Debugging aid (only built when GEM_PARANOID_CHECK_GTT is non-zero):
  * walk every GTT PTE between mm.gtt_start and mm.gtt_end and panic if any
  * of them points at the physical address of one of the `page_count' pages
  * in `ma'.
  */
 static void
 i915_gem_assert_pages_not_mapped(struct drm_device *dev, vm_page_t *ma,
     int page_count)
 {
 	struct drm_i915_private *dev_priv;
 	vm_paddr_t pa;
 	unsigned long start, end;
 	u_int i;
 	int j;
 
 	dev_priv = dev->dev_private;
 	start = OFF_TO_IDX(dev_priv->mm.gtt_start);
 	end = OFF_TO_IDX(dev_priv->mm.gtt_end);
 	for (i = start; i < end; i++) {
 		pa = intel_gtt_read_pte_paddr(i);
 		for (j = 0; j < page_count; j++) {
 			if (pa == VM_PAGE_TO_PHYS(ma[j])) {
 				/*
 				 * Report the offending page, ma[j]; `i' is
 				 * the PTE index and must not be used to
 				 * index the page array.
 				 */
 				panic("Page %p in GTT pte index %u pte %x",
 				    ma[j], i, intel_gtt_read_pte(i));
 			}
 		}
 	}
 }
 #endif
 
 static void
 i915_gem_object_put_pages_gtt(struct drm_i915_gem_object *obj)
 {
 	int page_count = obj->base.size / PAGE_SIZE;
 	int ret, i;
 
 	BUG_ON(obj->madv == __I915_MADV_PURGED);
 
 	ret = i915_gem_object_set_to_cpu_domain(obj, true);
 	if (ret) {
 		/* In the event of a disaster, abandon all caches and
 		 * hope for the best.
 		 */
 		WARN_ON(ret != -EIO);
 		i915_gem_clflush_object(obj);
 		obj->base.read_domains = obj->base.write_domain = I915_GEM_DOMAIN_CPU;
 	}
 
 	if (i915_gem_object_needs_bit17_swizzle(obj))
 		i915_gem_object_save_bit_17_swizzle(obj);
 
 	if (obj->madv == I915_MADV_DONTNEED)
 		obj->dirty = 0;
 
 	VM_OBJECT_WLOCK(obj->base.vm_obj);
 #if GEM_PARANOID_CHECK_GTT
 	i915_gem_assert_pages_not_mapped(obj->base.dev, obj->pages, page_count);
 #endif
 	for (i = 0; i < page_count; i++) {
 		vm_page_t page = obj->pages[i];
 
 		if (obj->dirty)
 			vm_page_dirty(page);
 
 		if (obj->madv == I915_MADV_WILLNEED)
 			vm_page_reference(page);
 
 		vm_page_lock(page);
 		vm_page_unwire(obj->pages[i], PQ_ACTIVE);
 		vm_page_unlock(page);
 		atomic_add_long(&i915_gem_wired_pages_cnt, -1);
 	}
 	VM_OBJECT_WUNLOCK(obj->base.vm_obj);
 	obj->dirty = 0;
 
 	free(obj->pages, DRM_I915_GEM);
 	obj->pages = NULL;
 }
 
 /*
  * Release the object's backing pages through its ops->put_pages hook.
  * Returns 0 when there are no pages or they were released, -EBUSY when
  * the pages are still pinned.  Purgeable objects are truncated afterwards.
  */
 static int
 i915_gem_object_put_pages(struct drm_i915_gem_object *obj)
 {
 	const struct drm_i915_gem_object_ops *ops = obj->ops;
 
 	if (obj->pages == NULL)
 		return 0;
 
 	/* The object must no longer be bound into the GTT. */
 	BUG_ON(obj->gtt_space);
 
 	if (obj->pages_pin_count != 0)
 		return -EBUSY;
 
 	/* ->put_pages might need to allocate memory for the bit17 swizzle
 	 * array, hence protect them from being reaped by removing them from gtt
 	 * lists early. */
 	list_del(&obj->gtt_list);
 
 	ops->put_pages(obj);
 	obj->pages = NULL;
 
 	if (i915_gem_object_is_purgeable(obj)) {
 		i915_gem_object_truncate(obj);
 	}
 
 	return 0;
 }
 
 /*
  * Release backing pages, first from objects on the unbound list and then
  * from objects on the inactive list (unbinding those first), until at
  * least `target' pages have been released; a target of -1 means no limit.
  * With `purgeable_only' set, only purgeable objects are considered.
  * Returns the number of pages released.
  */
 static long
 __i915_gem_shrink(struct drm_i915_private *dev_priv, long target,
 		  bool purgeable_only)
 {
 	struct drm_i915_gem_object *obj, *tmp;
 	long freed = 0;
 
 	list_for_each_entry_safe(obj, tmp,
 				 &dev_priv->mm.unbound_list,
 				 gtt_list) {
 		if (purgeable_only && !i915_gem_object_is_purgeable(obj))
 			continue;
 		if (i915_gem_object_put_pages(obj) != 0)
 			continue;
 
 		freed += obj->base.size >> PAGE_SHIFT;
 		if (target != -1 && freed >= target)
 			return freed;
 	}
 
 	list_for_each_entry_safe(obj, tmp,
 				 &dev_priv->mm.inactive_list,
 				 mm_list) {
 		if (purgeable_only && !i915_gem_object_is_purgeable(obj))
 			continue;
 		/* Bound objects must leave the GTT before losing pages. */
 		if (i915_gem_object_unbind(obj) != 0)
 			continue;
 		if (i915_gem_object_put_pages(obj) != 0)
 			continue;
 
 		freed += obj->base.size >> PAGE_SHIFT;
 		if (target != -1 && freed >= target)
 			return freed;
 	}
 
 	return freed;
 }
 
 /* Shrink purgeable objects only; see __i915_gem_shrink(). */
 static long
 i915_gem_purge(struct drm_i915_private *dev_priv, long target)
 {
 	long freed;
 
 	freed = __i915_gem_shrink(dev_priv, target, true);
 	return freed;
 }
 
 static void
 i915_gem_shrink_all(struct drm_i915_private *dev_priv)
 {
 	struct drm_i915_gem_object *obj, *next;
 
 	i915_gem_evict_everything(dev_priv->dev);
 
 	list_for_each_entry_safe(obj, next, &dev_priv->mm.unbound_list, gtt_list)
 		i915_gem_object_put_pages(obj);
 }
 
 /*
  * Wire the backing pages covering the byte range [start, end) of the
  * object, applying the bit-17 swizzle to pages reported as fresh when the
  * object needs it.  Returns 0 on success; on failure the pages wired so
  * far are released again and -EIO is returned.
  */
 static int
 i915_gem_object_get_pages_range(struct drm_i915_gem_object *obj,
     off_t start, off_t end)
 {
 	vm_object_t vm_obj;
 	vm_page_t m;
 	vm_pindex_t first, limit, idx;
 	bool swizzle, fresh;
 
 	swizzle = i915_gem_object_needs_bit17_swizzle(obj) != 0;
 	vm_obj = obj->base.vm_obj;
 	first = OFF_TO_IDX(trunc_page(start));
 	limit = OFF_TO_IDX(round_page(end));
 
 	VM_OBJECT_WLOCK(vm_obj);
 	for (idx = first; idx < limit; idx++) {
 		m = i915_gem_wire_page(vm_obj, idx, &fresh);
 		if (m == NULL) {
 			/* Undo the pages [first, idx) wired so far. */
 			i915_gem_object_put_pages_range_locked(obj, first,
 			    idx);
 			VM_OBJECT_WUNLOCK(vm_obj);
 			return (-EIO);
 		}
 		if (swizzle && fresh)
 			i915_gem_object_do_bit_17_swizzle_page(obj, m);
 	}
 	VM_OBJECT_WUNLOCK(vm_obj);
 
 	return (0);
 }
 
 static int
 i915_gem_object_get_pages_gtt(struct drm_i915_gem_object *obj)
 {
 	vm_object_t vm_obj;
 	vm_page_t page;
 	vm_pindex_t i, page_count;
 	int res;
 
 	/* Assert that the object is not currently in any GPU domain. As it
 	 * wasn't in the GTT, there shouldn't be any way it could have been in
 	 * a GPU cache
 	 */
 	BUG_ON(obj->base.read_domains & I915_GEM_GPU_DOMAINS);
 	BUG_ON(obj->base.write_domain & I915_GEM_GPU_DOMAINS);
 	KASSERT(obj->pages == NULL, ("Obj already has pages"));
 
 	page_count = OFF_TO_IDX(obj->base.size);
 	obj->pages = malloc(page_count * sizeof(vm_page_t), DRM_I915_GEM,
 	    M_WAITOK);
 	res = i915_gem_object_get_pages_range(obj, 0, obj->base.size);
 	if (res != 0) {
 		free(obj->pages, DRM_I915_GEM);
 		obj->pages = NULL;
 		return (res);
 	}
 	vm_obj = obj->base.vm_obj;
 	VM_OBJECT_WLOCK(vm_obj);
 	for (i = 0, page = vm_page_lookup(vm_obj, 0); i < page_count;
 	    i++, page = vm_page_next(page)) {
 		KASSERT(page->pindex == i, ("pindex %jx %jx",
 		    (uintmax_t)page->pindex, (uintmax_t)i));
 		obj->pages[i] = page;
 	}
 	VM_OBJECT_WUNLOCK(vm_obj);
 	return (0);
 }
 
 /* Ensure that the associated pages are gathered from the backing storage
  * and pinned into our object. i915_gem_object_get_pages() may be called
  * multiple times before they are released by a single call to
  * i915_gem_object_put_pages() - once the pages are no longer referenced
  * either as a result of memory pressure (reaping pages under the shrinker)
  * or as the object is itself released.
  */
 int
 i915_gem_object_get_pages(struct drm_i915_gem_object *obj)
 {
 	struct drm_i915_private *dev_priv = obj->base.dev->dev_private;
 	const struct drm_i915_gem_object_ops *ops = obj->ops;
 	int ret;
 
 	if (obj->pages)
 		return 0;
 
 	BUG_ON(obj->pages_pin_count);
 
 	ret = ops->get_pages(obj);
 	if (ret)
 		return ret;
 
 	list_add_tail(&obj->gtt_list, &dev_priv->mm.unbound_list);
 	return 0;
 }
 
 /*
  * Mark an object as active on `ring': record the ring and its current
  * seqno on the object, take a reference if the object was not already
  * active, and move it to the tail of the device and ring active lists.
  * If the object has fenced GPU access, the fenced seqno is updated too and
  * its fence register (if any) is moved to the tail of the fence LRU.
  *
  * `ring' must not be NULL.
  */
 void
 i915_gem_object_move_to_active(struct drm_i915_gem_object *obj,
 			       struct intel_ring_buffer *ring)
 {
 	struct drm_device *dev = obj->base.dev;
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	u32 seqno;
 
 	/*
 	 * Check the precondition before the ring is handed to any helper;
 	 * asserting after intel_ring_get_seqno(ring) would be too late.
 	 */
 	BUG_ON(ring == NULL);
 	seqno = intel_ring_get_seqno(ring);
 	obj->ring = ring;
 
 	/* Add a reference if we're newly entering the active list. */
 	if (!obj->active) {
 		drm_gem_object_reference(&obj->base);
 		obj->active = 1;
 	}
 
 	/* Move from whatever list we were on to the tail of execution. */
 	list_move_tail(&obj->mm_list, &dev_priv->mm.active_list);
 	list_move_tail(&obj->ring_list, &ring->active_list);
 
 	obj->last_read_seqno = seqno;
 
 	if (obj->fenced_gpu_access) {
 		obj->last_fenced_seqno = seqno;
 
 		/* Bump MRU to take account of the delayed flush */
 		if (obj->fence_reg != I915_FENCE_REG_NONE) {
 			struct drm_i915_fence_reg *reg;
 
 			reg = &dev_priv->fence_regs[obj->fence_reg];
 			list_move_tail(&reg->lru_list,
 				       &dev_priv->mm.fence_list);
 		}
 	}
 }
 
 /*
  * Move an active object to the inactive list: detach it from its ring,
  * clear its seqno, write-domain and fence tracking state, and drop the
  * reference that was held while it was active.
  */
 static void
 i915_gem_object_move_to_inactive(struct drm_i915_gem_object *obj)
 {
 	struct drm_device *dev;
 	struct drm_i915_private *dev_priv;
 
 	dev = obj->base.dev;
 	dev_priv = dev->dev_private;
 
 	BUG_ON(obj->base.write_domain & ~I915_GEM_GPU_DOMAINS);
 	BUG_ON(!obj->active);
 
 	list_move_tail(&obj->mm_list, &dev_priv->mm.inactive_list);
 
 	/* Detach from the ring. */
 	list_del_init(&obj->ring_list);
 	obj->ring = NULL;
 
 	/* Reset seqno and domain tracking. */
 	obj->last_write_seqno = 0;
 	obj->last_read_seqno = 0;
 	obj->base.write_domain = 0;
 
 	/* Reset fence tracking. */
 	obj->fenced_gpu_access = false;
 	obj->last_fenced_seqno = 0;
 
 	obj->active = 0;
 	drm_gem_object_unreference(&obj->base);
 
 	WARN_ON(i915_verify_lists(dev));
 }
 
 /*
  * Prepare for a seqno wrap.  If any ring has a non-zero sync_seqno entry,
  * idle the GPU, retire outstanding requests and zero all sync_seqno
  * entries.  Returns 0 on success or the error from i915_gpu_idle().
  */
 static int
 i915_gem_handle_seqno_wrap(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	struct intel_ring_buffer *ring;
 	int err, i, j, in_use;
 
 	/* The hardware uses various monotonic 32-bit counters, if we
 	 * detect that they will wraparound we need to idle the GPU
 	 * and reset those counters.
 	 */
 	in_use = 0;
 	for_each_ring(ring, dev_priv, i) {
 		for (j = 0; j < ARRAY_SIZE(ring->sync_seqno); j++) {
 			if (ring->sync_seqno[j] != 0)
 				in_use = 1;
 		}
 	}
 	if (!in_use)
 		return 0;
 
 	err = i915_gpu_idle(dev);
 	if (err != 0)
 		return err;
 
 	i915_gem_retire_requests(dev);
 	for_each_ring(ring, dev_priv, i) {
 		for (j = 0; j < ARRAY_SIZE(ring->sync_seqno); j++) {
 			ring->sync_seqno[j] = 0;
 		}
 	}
 
 	return 0;
 }
 
 /*
  * Hand out the next sequence number in *seqno.  Zero is never returned as
  * a seqno; when the counter reaches it, the wrap is handled first and the
  * counter restarts at 1.  Returns 0 on success or the error from
  * i915_gem_handle_seqno_wrap().
  */
 int
 i915_gem_get_seqno(struct drm_device *dev, u32 *seqno)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	int err;
 
 	/* reserve 0 for non-seqno */
 	if (dev_priv->next_seqno == 0) {
 		err = i915_gem_handle_seqno_wrap(dev);
 		if (err != 0)
 			return err;
 
 		dev_priv->next_seqno = 1;
 	}
 
 	*seqno = dev_priv->next_seqno;
 	dev_priv->next_seqno++;
 	return 0;
 }
 
 /*
  * Emit a request on `ring' and track it.
  *
  * Flushes the ring caches, allocates a request record, emits the request
  * through ring->add_request() and queues the record on the ring's request
  * list (and on the client's list when `file' is given).  Unless the device
  * is suspended, the hangcheck timer is (re)armed if hangcheck is enabled,
  * and the retire task is scheduled when the ring's request list was
  * previously empty.
  *
  * If `out_seqno' is not NULL it receives the seqno of the new request.
  * Returns 0 on success, -ENOMEM when the request record cannot be
  * allocated, or the error from the cache flush or ring->add_request().
  */
 int
 i915_add_request(struct intel_ring_buffer *ring,
 		 struct drm_file *file,
 		 u32 *out_seqno)
 {
 	drm_i915_private_t *dev_priv = ring->dev->dev_private;
 	struct drm_i915_gem_request *request;
 	u32 request_ring_position;
 	int was_empty;
 	int ret;
 
 	/*
 	 * Emit any outstanding flushes - execbuf can fail to emit the flush
 	 * after having emitted the batchbuffer command. Hence we need to fix
 	 * things up similar to emitting the lazy request. The difference here
 	 * is that the flush _must_ happen before the next request, no matter
 	 * what.
 	 */
 	ret = intel_ring_flush_all_caches(ring);
 	if (ret)
 		return ret;
 
 	/* Non-sleeping allocation; failure is reported to the caller. */
 	request = malloc(sizeof(*request), DRM_I915_GEM, M_NOWAIT);
 	if (request == NULL)
 		return -ENOMEM;
 
 
 	/* Record the position of the start of the request so that
 	 * should we detect the updated seqno part-way through the
 	 * GPU processing the request, we never over-estimate the
 	 * position of the head.
 	 */
 	request_ring_position = intel_ring_get_tail(ring);
 
 	ret = ring->add_request(ring);
 	if (ret) {
 		free(request, DRM_I915_GEM);
 		return ret;
 	}
 
 	request->seqno = intel_ring_get_seqno(ring);
 	request->ring = ring;
 	request->tail = request_ring_position;
 	request->emitted_jiffies = jiffies;
 	/* Sample emptiness before queueing; used to kick the retire task. */
 	was_empty = list_empty(&ring->request_list);
 	list_add_tail(&request->list, &ring->request_list);
 	request->file_priv = NULL;
 
 	if (file) {
 		struct drm_i915_file_private *file_priv = file->driver_priv;
 
 		/* The per-client request list is guarded by mm.lock. */
 		mtx_lock(&file_priv->mm.lock);
 		request->file_priv = file_priv;
 		list_add_tail(&request->client_list,
 			      &file_priv->mm.request_list);
 		mtx_unlock(&file_priv->mm.lock);
 	}
 
 	CTR2(KTR_DRM, "request_add %s %d", ring->name, request->seqno);
 	ring->outstanding_lazy_request = 0;
 
 	if (!dev_priv->mm.suspended) {
 		if (i915_enable_hangcheck) {
 			callout_schedule(&dev_priv->hangcheck_timer,
 			    DRM_I915_HANGCHECK_PERIOD);
 		}
 		if (was_empty) {
 			taskqueue_enqueue_timeout(dev_priv->wq,
 			    &dev_priv->mm.retire_work, hz);
 			intel_mark_busy(dev_priv->dev);
 		}
 	}
 
 	if (out_seqno)
 		*out_seqno = request->seqno;
 	return 0;
 }
 
 /*
  * Unlink a request from the request list of the client that submitted it,
  * if any, and clear the request's file_priv back-pointer.
  */
 static inline void
 i915_gem_request_remove_from_client(struct drm_i915_gem_request *request)
 {
 	struct drm_i915_file_private *client;
 
 	client = request->file_priv;
 	if (client == NULL)
 		return;
 
 	mtx_lock(&client->mm.lock);
 	/* Re-check under the lock before unlinking. */
 	if (request->file_priv != NULL) {
 		list_del(&request->client_list);
 		request->file_priv = NULL;
 	}
 	mtx_unlock(&client->mm.lock);
 }
 
 /*
  * Throw away all tracking state of a ring: free every queued request and
  * move every object on the ring's active list to the inactive list.
  */
 static void i915_gem_reset_ring_lists(struct drm_i915_private *dev_priv,
 				      struct intel_ring_buffer *ring)
 {
 	struct drm_i915_gem_request *req;
 	struct drm_i915_gem_object *obj;
 
 	if (ring->dev != NULL)
 		DRM_LOCK_ASSERT(ring->dev);
 
 	for (;;) {
 		if (list_empty(&ring->request_list))
 			break;
 
 		req = list_first_entry(&ring->request_list,
 				       struct drm_i915_gem_request,
 				       list);
 
 		list_del(&req->list);
 		i915_gem_request_remove_from_client(req);
 		free(req, DRM_I915_GEM);
 	}
 
 	for (;;) {
 		if (list_empty(&ring->active_list))
 			break;
 
 		obj = list_first_entry(&ring->active_list,
 				       struct drm_i915_gem_object,
 				       ring_list);
 
 		/* Takes the object off ring->active_list. */
 		i915_gem_object_move_to_inactive(obj);
 	}
 }
 
 /*
  * Clear every fence register and the software state that tracks it, then
  * reinitialize the fence LRU list.
  */
 static void i915_gem_reset_fences(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	struct drm_i915_fence_reg *fence;
 	int n;
 
 	for (n = 0; n < dev_priv->num_fence_regs; n++) {
 		fence = &dev_priv->fence_regs[n];
 
 		/* A NULL object clears the register. */
 		i915_gem_write_fence(dev, n, NULL);
 
 		if (fence->obj != NULL)
 			i915_gem_object_fence_lost(fence->obj);
 
 		fence->obj = NULL;
 		fence->pin_count = 0;
 		INIT_LIST_HEAD(&fence->lru_list);
 	}
 
 	INIT_LIST_HEAD(&dev_priv->mm.fence_list);
 }
 
 /*
  * Reset GEM tracking state: drop all per-ring request and active lists,
  * strip the GPU read domains from every inactive object and clear the
  * fence registers.
  */
 void i915_gem_reset(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	struct intel_ring_buffer *ring;
 	struct drm_i915_gem_object *obj;
 	int i;
 
 	for_each_ring(ring, dev_priv, i) {
 		i915_gem_reset_ring_lists(dev_priv, ring);
 	}
 
 	/* Move everything out of the GPU domains to ensure we do any
 	 * necessary invalidation upon reuse.
 	 */
 	list_for_each_entry(obj, &dev_priv->mm.inactive_list, mm_list) {
 		obj->base.read_domains &= ~I915_GEM_GPU_DOMAINS;
 	}
 
 	/* The fence registers are invalidated so clear them out */
 	i915_gem_reset_fences(dev);
 }
 
 /**
  * This function clears the request list as sequence numbers are passed.
  */
 void
 i915_gem_retire_requests_ring(struct intel_ring_buffer *ring)
 {
 	struct drm_i915_gem_request *req;
 	struct drm_i915_gem_object *obj;
 	uint32_t seqno;
 
 	if (list_empty(&ring->request_list))
 		return;
 
 	WARN_ON(i915_verify_lists(ring->dev));
 
 	seqno = ring->get_seqno(ring, true);
 	CTR2(KTR_DRM, "retire_request_ring %s %d", ring->name, seqno);
 
 	/* Free requests, oldest first, until one has not been passed yet. */
 	for (;;) {
 		if (list_empty(&ring->request_list))
 			break;
 
 		req = list_first_entry(&ring->request_list,
 				       struct drm_i915_gem_request,
 				       list);
 		if (!i915_seqno_passed(seqno, req->seqno))
 			break;
 
 		CTR2(KTR_DRM, "retire_request_seqno_passed %s %d",
 		    ring->name, seqno);
 		/* We know the GPU must have read the request to have
 		 * sent us the seqno + interrupt, so use the position
 		 * of tail of the request to update the last known position
 		 * of the GPU head.
 		 */
 		ring->last_retired_head = req->tail;
 
 		list_del(&req->list);
 		i915_gem_request_remove_from_client(req);
 		free(req, DRM_I915_GEM);
 	}
 
 	/* Move any buffers on the active list that are no longer referenced
 	 * by the ringbuffer to the flushing/inactive lists as appropriate.
 	 */
 	for (;;) {
 		if (list_empty(&ring->active_list))
 			break;
 
 		obj = list_first_entry(&ring->active_list,
 				       struct drm_i915_gem_object,
 				       ring_list);
 		if (!i915_seqno_passed(seqno, obj->last_read_seqno))
 			break;
 
 		i915_gem_object_move_to_inactive(obj);
 	}
 
 	/* Release the trace irq reference once its seqno has passed. */
 	if (unlikely(ring->trace_irq_seqno &&
 		     i915_seqno_passed(seqno, ring->trace_irq_seqno))) {
 		ring->irq_put(ring);
 		ring->trace_irq_seqno = 0;
 	}
 
 	WARN_ON(i915_verify_lists(ring->dev));
 }
 
 void
 i915_gem_retire_requests(struct drm_device *dev)
 {
 	drm_i915_private_t *dev_priv = dev->dev_private;
 	struct intel_ring_buffer *ring;
 	int i;
 
 	for_each_ring(ring, dev_priv, i)
 		i915_gem_retire_requests_ring(ring);
 }
 
 /*
  * Taskqueue handler that periodically retires requests.  `arg' is the
  * driver private structure.  The handler reschedules itself while any
  * ring still has requests queued and the device is not suspended, and
  * marks the device idle once all request lists are empty.
  */
 static void
 i915_gem_retire_work_handler(void *arg, int pending)
 {
 	drm_i915_private_t *dev_priv = arg;
 	struct drm_device *dev = dev_priv->dev;
 	struct intel_ring_buffer *ring;
 	bool idle;
 	int i;
 
 	/* Come back later if the device is busy... */
 	if (!sx_try_xlock(&dev->dev_struct_lock)) {
 		taskqueue_enqueue_timeout(dev_priv->wq,
 		    &dev_priv->mm.retire_work, hz);
 		return;
 	}
 
 	CTR0(KTR_DRM, "retire_task");
 
 	i915_gem_retire_requests(dev);
 
 	/* Send a periodic flush down the ring so we don't hold onto GEM
 	 * objects indefinitely.
 	 */
 	idle = true;
 	for_each_ring(ring, dev_priv, i) {
 		if (ring->gpu_caches_dirty)
 			i915_add_request(ring, NULL, NULL);
 
 		if (!list_empty(&ring->request_list))
 			idle = false;
 	}
 
 	if (idle) {
 		intel_mark_idle(dev);
 	} else if (!dev_priv->mm.suspended) {
 		taskqueue_enqueue_timeout(dev_priv->wq,
 		    &dev_priv->mm.retire_work, hz);
 	}
 
 	DRM_UNLOCK(dev);
 }
 
 /**
  * Ensures that an object will eventually get non-busy by flushing any required
  * write domains, emitting any outstanding lazy request and retiring any
  * completed requests.
  */
 static int
 i915_gem_object_flush_active(struct drm_i915_gem_object *obj)
 {
 	int err;
 
 	/* Inactive objects need no work. */
 	if (!obj->active)
 		return 0;
 
 	err = i915_gem_check_olr(obj->ring, obj->last_read_seqno);
 	if (err != 0)
 		return err;
 
 	i915_gem_retire_requests_ring(obj->ring);
 	return 0;
 }
 
 /**
  * i915_gem_wait_ioctl - implements DRM_IOCTL_I915_GEM_WAIT
  * @DRM_IOCTL_ARGS: standard ioctl arguments
  *
  * Returns 0 if successful, else an error is returned with the remaining time in
  * the timeout parameter.
  *  -ETIME: object is still busy after timeout
  *  -ERESTARTSYS: signal interrupted the wait
  *  -ENOENT: object doesn't exist
  * Also possible, but rare:
  *  -EAGAIN: GPU wedged
  *  -ENOMEM: damn
  *  -ENODEV: Internal IRQ fail
  *  -E?: The add request failed
  *
  * The wait ioctl with a timeout of 0 reimplements the busy ioctl. With any
  * non-zero timeout parameter the wait ioctl will wait for the given number of
  * nanoseconds on an object becoming unbusy. Since the wait itself does so
  * without holding struct_mutex the object may become re-busied before this
  * function completes. A similar but shorter race condition exists in the busy
  * ioctl.
  */
 int
 i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 {
 	struct drm_i915_gem_wait *args = data;
 	struct drm_i915_gem_object *obj;
 	struct intel_ring_buffer *ring = NULL;
 	struct timespec timeout_stack, *timeout = NULL;
 	u32 seqno = 0;
 	int ret = 0;
 
 	if (args->timeout_ns >= 0) {
 		timeout_stack.tv_sec = args->timeout_ns / 1000000;
 		timeout_stack.tv_nsec = args->timeout_ns % 1000000;
 		timeout = &timeout_stack;
 	}
 
 	ret = i915_mutex_lock_interruptible(dev);
 	if (ret)
 		return ret;
 
 	obj = to_intel_bo(drm_gem_object_lookup(dev, file, args->bo_handle));
 	if (&obj->base == NULL) {
 		DRM_UNLOCK(dev);
 		return -ENOENT;
 	}
 
 	/* Need to make sure the object gets inactive eventually. */
 	ret = i915_gem_object_flush_active(obj);
 	if (ret)
 		goto out;
 
 	if (obj->active) {
 		seqno = obj->last_read_seqno;
 		ring = obj->ring;
 	}
 
 	if (seqno == 0)
 		 goto out;
 
 	/* Do this after OLR check to make sure we make forward progress polling
 	 * on this IOCTL with a 0 timeout (like busy ioctl)
 	 */
 	if (!args->timeout_ns) {
 		ret = -ETIMEDOUT;
 		goto out;
 	}
 
 	drm_gem_object_unreference(&obj->base);
 	DRM_UNLOCK(dev);
 
 	ret = __wait_seqno(ring, seqno, true, timeout);
 	if (timeout) {
 		args->timeout_ns = timeout->tv_sec * 1000000 + timeout->tv_nsec;
 	}
 	return ret;
 
 out:
 	drm_gem_object_unreference(&obj->base);
 	DRM_UNLOCK(dev);
 	return ret;
 }
 
 /**
  * i915_gem_object_sync - sync an object to a ring.
  *
  * @obj: object which may be in use on another ring.
  * @to: ring we wish to use the object on. May be NULL.
  *
  * This code is meant to abstract object synchronization with the GPU.
  * Calling with NULL implies synchronizing the object with the CPU
  * rather than a particular GPU ring.
  *
  * Returns 0 if successful, else propagates up the lower layer error.
  */
 int
 i915_gem_object_sync(struct drm_i915_gem_object *obj,
 		     struct intel_ring_buffer *to)
 {
 	struct intel_ring_buffer *from;
 	u32 seqno;
 	int err, idx;
 
 	from = obj->ring;
 
 	/* Not on any ring, or already on the target ring: nothing to do. */
 	if (from == NULL || to == from)
 		return 0;
 
 	/* No target ring, or semaphores disabled: wait for rendering. */
 	if (to == NULL || !i915_semaphore_is_enabled(obj->base.dev))
 		return i915_gem_object_wait_rendering(obj, false);
 
 	idx = intel_ring_sync_index(from, to);
 
 	/* Skip if this seqno is already covered by the recorded sync point. */
 	seqno = obj->last_read_seqno;
 	if (seqno <= from->sync_seqno[idx])
 		return 0;
 
 	err = i915_gem_check_olr(obj->ring, seqno);
 	if (err != 0)
 		return err;
 
 	err = to->sync_to(to, from, seqno);
 	if (err == 0) {
 		/* We use last_read_seqno because sync_to()
 		 * might have just caused seqno wrap under
 		 * the radar.
 		 */
 		from->sync_seqno[idx] = obj->last_read_seqno;
 	}
 
 	return err;
 }
 
 /*
  * Finish GTT access to an object: issue a memory barrier, revoke its
  * userspace mappings and remove the GTT domain from its read and write
  * domains.
  */
 static void i915_gem_object_finish_gtt(struct drm_i915_gem_object *obj)
 {
 	u32 prev_read, prev_write;
 
 	/* Act as a barrier for all accesses through the GTT */
 	mb();
 
 	/* Force a pagefault for domain tracking on next user access */
 	i915_gem_release_mmap(obj);
 
 	if ((obj->base.read_domains & I915_GEM_DOMAIN_GTT) != 0) {
 		/* Remember the previous domains for the trace record. */
 		prev_read = obj->base.read_domains;
 		prev_write = obj->base.write_domain;
 
 		obj->base.read_domains &= ~I915_GEM_DOMAIN_GTT;
 		obj->base.write_domain &= ~I915_GEM_DOMAIN_GTT;
 
 		CTR3(KTR_DRM, "object_change_domain finish gtt %p %x %x",
 		    obj, prev_read, prev_write);
 	}
 }
 
 /**
  * Unbinds an object from the GTT aperture.
  */
 int
 i915_gem_object_unbind(struct drm_i915_gem_object *obj)
 {
 	drm_i915_private_t *dev_priv = obj->base.dev->dev_private;
 	int ret = 0;
 
 	/* Not bound: nothing to do. */
 	if (obj->gtt_space == NULL)
 		return 0;
 
 	/* A pinned object cannot be unbound. */
 	if (obj->pin_count)
 		return -EBUSY;
 
 	BUG_ON(obj->pages == NULL);
 
 	ret = i915_gem_object_finish_gpu(obj);
 	if (ret)
 		return ret;
 	/* Continue on if we fail due to EIO, the GPU is hung so we
 	 * should be safe and we need to cleanup or else we might
 	 * cause memory corruption through use-after-free.
 	 */
 	/*
 	 * NOTE(review): the check above returns on any non-zero result,
 	 * including -EIO, so the "continue on EIO" comment does not
 	 * describe what the code does -- confirm which is intended.
 	 */
 
 	i915_gem_object_finish_gtt(obj);
 
 	/* release the fence reg _after_ flushing */
 	ret = i915_gem_object_put_fence(obj);
 	if (ret)
 		return ret;
 
 	/* Tear down the global GTT and aliasing PPGTT mappings, if any. */
 	if (obj->has_global_gtt_mapping)
 		i915_gem_gtt_unbind_object(obj);
 	if (obj->has_aliasing_ppgtt_mapping) {
 		i915_ppgtt_unbind_object(dev_priv->mm.aliasing_ppgtt, obj);
 		obj->has_aliasing_ppgtt_mapping = 0;
 	}
 	i915_gem_gtt_finish_object(obj);
 
 	/* The object still has pages, so it moves to the unbound list. */
 	list_del(&obj->mm_list);
 	list_move_tail(&obj->gtt_list, &dev_priv->mm.unbound_list);
 	/* Avoid an unnecessary call to unbind on rebind. */
 	obj->map_and_fenceable = true;
 
 	/* Give the GTT address range back and forget the binding. */
 	drm_mm_put_block(obj->gtt_space);
 	obj->gtt_space = NULL;
 	obj->gtt_offset = 0;
 
 	return 0;
 }
 
 /*
  * Idle the GPU: for each ring, switch to the default context and wait for
  * the ring to become idle.  Returns 0 on success or the first error
  * encountered.
  */
 int i915_gpu_idle(struct drm_device *dev)
 {
 	drm_i915_private_t *dev_priv = dev->dev_private;
 	struct intel_ring_buffer *ring;
 	int err, i;
 
 	/* Flush everything onto the inactive list. */
 	for_each_ring(ring, dev_priv, i) {
 		err = i915_switch_context(ring, NULL, DEFAULT_CONTEXT_ID);
 		if (err == 0)
 			err = intel_ring_idle(ring);
 		if (err != 0)
 			return err;
 	}
 
 	return 0;
 }
 
 static void sandybridge_write_fence_reg(struct drm_device *dev, int reg,
 					struct drm_i915_gem_object *obj)
 {
 	drm_i915_private_t *dev_priv = dev->dev_private;
 	uint64_t val;
 
 	if (obj) {
 		u32 size = obj->gtt_space->size;
 
 		val = (uint64_t)((obj->gtt_offset + size - 4096) &
 				 0xfffff000) << 32;
 		val |= obj->gtt_offset & 0xfffff000;
 		val |= (uint64_t)((obj->stride / 128) - 1) <<
 			SANDYBRIDGE_FENCE_PITCH_SHIFT;
 
 		if (obj->tiling_mode == I915_TILING_Y)
 			val |= 1 << I965_FENCE_TILING_Y_SHIFT;
 		val |= I965_FENCE_REG_VALID;
 	} else
 		val = 0;
 
 	I915_WRITE64(FENCE_REG_SANDYBRIDGE_0 + reg * 8, val);
 	POSTING_READ(FENCE_REG_SANDYBRIDGE_0 + reg * 8);
 }
 
 static void i965_write_fence_reg(struct drm_device *dev, int reg,
 				 struct drm_i915_gem_object *obj)
 {
 	drm_i915_private_t *dev_priv = dev->dev_private;
 	uint64_t val;
 
 	if (obj) {
 		u32 size = obj->gtt_space->size;
 
 		val = (uint64_t)((obj->gtt_offset + size - 4096) &
 				 0xfffff000) << 32;
 		val |= obj->gtt_offset & 0xfffff000;
 		val |= ((obj->stride / 128) - 1) << I965_FENCE_PITCH_SHIFT;
 		if (obj->tiling_mode == I915_TILING_Y)
 			val |= 1 << I965_FENCE_TILING_Y_SHIFT;
 		val |= I965_FENCE_REG_VALID;
 	} else
 		val = 0;
 
 	I915_WRITE64(FENCE_REG_965_0 + reg * 8, val);
 	POSTING_READ(FENCE_REG_965_0 + reg * 8);
 }
 
 static void i915_write_fence_reg(struct drm_device *dev, int reg,
 				 struct drm_i915_gem_object *obj)
 {
 	drm_i915_private_t *dev_priv = dev->dev_private;
 	u32 val;
 
 	if (obj) {
 		u32 size = obj->gtt_space->size;
 		int pitch_val;
 		int tile_width;
 
 		WARN((obj->gtt_offset & ~I915_FENCE_START_MASK) ||
 		     (size & -size) != size ||
 		     (obj->gtt_offset & (size - 1)),
 		     "object 0x%08x [fenceable? %d] not 1M or pot-size (0x%08x) aligned\n",
 		     obj->gtt_offset, obj->map_and_fenceable, size);
 
 		if (obj->tiling_mode == I915_TILING_Y && HAS_128_BYTE_Y_TILING(dev))
 			tile_width = 128;
 		else
 			tile_width = 512;
 
 		/* Note: pitch better be a power of two tile widths */
 		pitch_val = obj->stride / tile_width;
 		pitch_val = ffs(pitch_val) - 1;
 
 		val = obj->gtt_offset;
 		if (obj->tiling_mode == I915_TILING_Y)
 			val |= 1 << I830_FENCE_TILING_Y_SHIFT;
 		val |= I915_FENCE_SIZE_BITS(size);
 		val |= pitch_val << I830_FENCE_PITCH_SHIFT;
 		val |= I830_FENCE_REG_VALID;
 	} else
 		val = 0;
 
 	if (reg < 8)
 		reg = FENCE_REG_830_0 + reg * 4;
 	else
 		reg = FENCE_REG_945_8 + (reg - 8) * 4;
 
 	I915_WRITE(reg, val);
 	POSTING_READ(reg);
 }
 
 static void i830_write_fence_reg(struct drm_device *dev, int reg,
 				struct drm_i915_gem_object *obj)
 {
 	drm_i915_private_t *dev_priv = dev->dev_private;
 	uint32_t val;
 
 	if (obj) {
 		u32 size = obj->gtt_space->size;
 		uint32_t pitch_val;
 
 		WARN((obj->gtt_offset & ~I830_FENCE_START_MASK) ||
 		     (size & -size) != size ||
 		     (obj->gtt_offset & (size - 1)),
 		     "object 0x%08x not 512K or pot-size 0x%08x aligned\n",
 		     obj->gtt_offset, size);
 
 		pitch_val = obj->stride / 128;
 		pitch_val = ffs(pitch_val) - 1;
 
 		val = obj->gtt_offset;
 		if (obj->tiling_mode == I915_TILING_Y)
 			val |= 1 << I830_FENCE_TILING_Y_SHIFT;
 		val |= I830_FENCE_SIZE_BITS(size);
 		val |= pitch_val << I830_FENCE_PITCH_SHIFT;
 		val |= I830_FENCE_REG_VALID;
 	} else
 		val = 0;
 
 	I915_WRITE(FENCE_REG_830_0 + reg * 4, val);
 	POSTING_READ(FENCE_REG_830_0 + reg * 4);
 }
 
 /*
  * Write fence register @reg for @obj (or clear it when @obj is NULL) using
  * the register layout that matches the device generation.  Generations
  * other than 2-7 are silently ignored.
  */
 static void i915_gem_write_fence(struct drm_device *dev, int reg,
 				 struct drm_i915_gem_object *obj)
 {
 	const int gen = INTEL_INFO(dev)->gen;
 
 	if (gen == 6 || gen == 7)
 		sandybridge_write_fence_reg(dev, reg, obj);
 	else if (gen == 4 || gen == 5)
 		i965_write_fence_reg(dev, reg, obj);
 	else if (gen == 3)
 		i915_write_fence_reg(dev, reg, obj);
 	else if (gen == 2)
 		i830_write_fence_reg(dev, reg, obj);
 }
 
 static inline int fence_number(struct drm_i915_private *dev_priv,
 			       struct drm_i915_fence_reg *fence)
 {
 	return fence - dev_priv->fence_regs;
 }
 
 /*
  * Per-CPU callback handed to on_each_cpu() before a fence register update:
  * issues wbinvd() on the CPU it runs on.  @data is not used.
  */
 static void i915_gem_write_fence__ipi(void *data)
 {
 	(void)data;
 	wbinvd();
 }
 
 /*
  * Install (@enable true) or remove (@enable false) @fence for @obj and
  * update the software bookkeeping to match.
  *
  * On enable the object records the fence index, the fence records the
  * object and the fence moves to the tail of the fence LRU.  On disable
  * both links are cleared and the fence is taken off the LRU.
  */
 static void i915_gem_object_update_fence(struct drm_i915_gem_object *obj,
 					 struct drm_i915_fence_reg *fence,
 					 bool enable)
 {
 	struct drm_device *dev = obj->base.dev;
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	int fence_reg = fence_number(dev_priv, fence);
 
 	/*
 	 * Serialising access to the fenced region against the register
 	 * update needs extra care on SNB+.  In theory the register write
 	 * itself flushes earlier memory transactions and, together with
 	 * the mb() around it, orders all memory operations against the
 	 * tiler change.  On SNB+ that is not sufficient, so every CPU
 	 * executes wbinvd() to flush outstanding transactions by hand
 	 * before the fence register is touched.
 	 */
 	if (HAS_LLC(dev))
 		on_each_cpu(i915_gem_write_fence__ipi, NULL, 1);
 
 	if (!enable) {
 		i915_gem_write_fence(dev, fence_reg, NULL);
 
 		obj->fence_reg = I915_FENCE_REG_NONE;
 		fence->obj = NULL;
 		list_del_init(&fence->lru_list);
 		return;
 	}
 
 	i915_gem_write_fence(dev, fence_reg, obj);
 
 	obj->fence_reg = fence_reg;
 	fence->obj = obj;
 	list_move_tail(&fence->lru_list, &dev_priv->mm.fence_list);
 }
 
 static int
 i915_gem_object_flush_fence(struct drm_i915_gem_object *obj)
 {
 	if (obj->last_fenced_seqno) {
 		int ret = i915_wait_seqno(obj->ring, obj->last_fenced_seqno);
 		if (ret)
 			return ret;
 
 		obj->last_fenced_seqno = 0;
 	}
 
 	/* Ensure that all CPU reads are completed before installing a fence
 	 * and all writes before removing the fence.
 	 */
 	if (obj->base.read_domains & I915_GEM_DOMAIN_GTT)
 		mb();
 
 	obj->fenced_gpu_access = false;
 	return 0;
 }
 
 /*
  * Release the fence register held by @obj, if it holds one.
  *
  * Fenced access is flushed first; the flush happens even when the object
  * holds no fence.  Returns 0 on success or the error from the flush.
  */
 int
 i915_gem_object_put_fence(struct drm_i915_gem_object *obj)
 {
 	struct drm_i915_private *dev_priv = obj->base.dev->dev_private;
 	struct drm_i915_fence_reg *fence;
 	int ret;
 
 	ret = i915_gem_object_flush_fence(obj);
 	if (ret != 0)
 		return ret;
 
 	if (obj->fence_reg != I915_FENCE_REG_NONE) {
 		fence = &dev_priv->fence_regs[obj->fence_reg];
 		i915_gem_object_update_fence(obj, fence, false);
 		i915_gem_object_fence_lost(obj);
 	}
 
 	return 0;
 }
 
 static struct drm_i915_fence_reg *
 i915_find_fence_reg(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	struct drm_i915_fence_reg *reg, *avail;
 	int i;
 
 	/* First try to find a free reg */
 	avail = NULL;
 	for (i = dev_priv->fence_reg_start; i < dev_priv->num_fence_regs; i++) {
 		reg = &dev_priv->fence_regs[i];
 		if (!reg->obj)
 			return reg;
 
 		if (!reg->pin_count)
 			avail = reg;
 	}
 
 	if (avail == NULL)
 		return NULL;
 
 	/* None available, try to steal one or wait for a user to finish */
 	list_for_each_entry(reg, &dev_priv->mm.fence_list, lru_list) {
 		if (reg->pin_count)
 			continue;
 
 		return reg;
 	}
 
 	return NULL;
 }
 
 /**
  * i915_gem_object_get_fence - set up fencing for an object
  * @obj: object to map through a fence reg
  *
  * Userspace writing to a tiled object through the GTT should not have to
  * care about swizzling.  To make that work this function finds a fence
  * register for @obj, taking a free one when possible and stealing one
  * otherwise, and programs it from the object's address, pitch and tiling
  * format.
  *
  * If the object already owns a fence and its tiling parameters have not
  * changed, only its position in the fence LRU is refreshed.  For an
  * untiled object any existing fence is removed.
  *
  * Returns 0 on success, -EDEADLK if no fence register can be obtained, or
  * the error from flushing fenced access.
  */
 int
 i915_gem_object_get_fence(struct drm_i915_gem_object *obj)
 {
 	struct drm_device *dev = obj->base.dev;
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	const bool tiled = obj->tiling_mode != I915_TILING_NONE;
 	struct drm_i915_fence_reg *reg;
 	int ret;
 
 	/*
 	 * Changed tiling parameters mean the upcoming register write has
 	 * to be serialised against outstanding fenced access.
 	 */
 	if (obj->fence_dirty) {
 		ret = i915_gem_object_flush_fence(obj);
 		if (ret != 0)
 			return ret;
 	}
 
 	if (obj->fence_reg == I915_FENCE_REG_NONE) {
 		struct drm_i915_gem_object *victim;
 
 		/* Untiled and unfenced: nothing to do. */
 		if (!tiled)
 			return 0;
 
 		reg = i915_find_fence_reg(dev);
 		if (reg == NULL)
 			return -EDEADLK;
 
 		/* Evict the current owner of a stolen register. */
 		victim = reg->obj;
 		if (victim != NULL) {
 			ret = i915_gem_object_flush_fence(victim);
 			if (ret != 0)
 				return ret;
 
 			i915_gem_object_fence_lost(victim);
 		}
 	} else {
 		reg = &dev_priv->fence_regs[obj->fence_reg];
 
 		/* Fence is reused as-is: just bump it in the LRU. */
 		if (!obj->fence_dirty) {
 			list_move_tail(&reg->lru_list,
 				       &dev_priv->mm.fence_list);
 			return 0;
 		}
 	}
 
 	i915_gem_object_update_fence(obj, reg, tiled);
 	obj->fence_dirty = false;
 
 	return 0;
 }
 
 /*
  * Check whether @gtt_space may carry @cache_level given its neighbours in
  * the drm_mm node list.
  *
  * Always true on LLC machines, for a NULL node, and for a node that is not
  * linked into a list.  Otherwise false when an allocated neighbour of a
  * different colour touches the node with no hole in between.
  */
 static bool i915_gem_valid_gtt_space(struct drm_device *dev,
 				     struct drm_mm_node *gtt_space,
 				     unsigned long cache_level)
 {
 	struct drm_mm_node *before, *after;
 
 	/*
 	 * Without LLC, differing kinds of snoopable memory must not sit
 	 * next to each other, or the prefetcher may cross memory domains
 	 * and die.
 	 */
 	if (HAS_LLC(dev) || gtt_space == NULL ||
 	    list_empty(&gtt_space->node_list))
 		return true;
 
 	before = list_entry(gtt_space->node_list.prev, struct drm_mm_node, node_list);
 	if (before->allocated && !before->hole_follows &&
 	    before->color != cache_level)
 		return false;
 
 	after = list_entry(gtt_space->node_list.next, struct drm_mm_node, node_list);
 	if (after->allocated && !gtt_space->hole_follows &&
 	    after->color != cache_level)
 		return false;
 
 	return true;
 }
 
 /*
  * Debug-only consistency check of the objects on mm.gtt_list.
  *
  * The body is compiled only when WATCH_GTT is non-zero; otherwise this
  * function does nothing.  Each object is checked for a reserved GTT node,
  * for a node colour equal to its cache level, and for validity of the node
  * placement per i915_gem_valid_gtt_space().  Every violation is logged and
  * a warning fires if any were found.
  */
 static void i915_gem_verify_gtt(struct drm_device *dev)
 {
 #if WATCH_GTT
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	struct drm_i915_gem_object *obj;
 	int err = 0;	/* number of inconsistencies found */
 
 	list_for_each_entry(obj, &dev_priv->mm.gtt_list, gtt_list) {
 		if (obj->gtt_space == NULL) {
 			DRM_ERROR("object found on GTT list with no space reserved\n");
 			err++;
 			continue;
 		}
 
 		if (obj->cache_level != obj->gtt_space->color) {
 			DRM_ERROR("object reserved space [%08lx, %08lx] with wrong color, cache_level=%x, color=%lx\n",
 			       obj->gtt_space->start,
 			       obj->gtt_space->start + obj->gtt_space->size,
 			       obj->cache_level,
 			       obj->gtt_space->color);
 			err++;
 			continue;
 		}
 
 		if (!i915_gem_valid_gtt_space(dev,
 					      obj->gtt_space,
 					      obj->cache_level)) {
 			DRM_ERROR("invalid GTT space found at [%08lx, %08lx] - color=%x\n",
 			       obj->gtt_space->start,
 			       obj->gtt_space->start + obj->gtt_space->size,
 			       obj->cache_level);
 			err++;
 			continue;
 		}
 	}
 
 	WARN_ON(err);
 #endif
 }
 
 /**
  * Finds free space in the GTT aperture and binds the object there.
  *
  * @obj:               object to bind; must be marked I915_MADV_WILLNEED
  * @alignment:         required GTT alignment, or 0 to derive one from the
  *                     object's size and tiling mode
  * @map_and_fenceable: place the object in the mappable range using the
  *                     fence size and fence alignment
  * @nonblocking:       passed through to i915_gem_evict_something()
  *
  * On success the object owns a drm_mm node, gtt_offset is set, the object
  * is on the bound and inactive lists, and obj->map_and_fenceable reflects
  * the placement actually obtained.
  *
  * Returns 0 on success; -EINVAL for a purgeable object, an alignment that
  * is incompatible with fencing, or an invalid placement; -E2BIG if the
  * object is larger than the usable aperture; -ENOMEM if the node cannot be
  * allocated; otherwise the error from page acquisition, eviction or GTT
  * preparation.
  */
 static int
 i915_gem_object_bind_to_gtt(struct drm_i915_gem_object *obj,
 			    unsigned alignment,
 			    bool map_and_fenceable,
 			    bool nonblocking)
 {
 	struct drm_device *dev = obj->base.dev;
 	drm_i915_private_t *dev_priv = dev->dev_private;
 	struct drm_mm_node *node;
 	u32 size, fence_size, fence_alignment, unfenced_alignment;
 	bool mappable, fenceable;
 	int ret;
 
 	if (obj->madv != I915_MADV_WILLNEED) {
 		DRM_ERROR("Attempting to bind a purgeable object\n");
 		return -EINVAL;
 	}
 
 	fence_size = i915_gem_get_gtt_size(dev,
 					   obj->base.size,
 					   obj->tiling_mode);
 	fence_alignment = i915_gem_get_gtt_alignment(dev,
 						     obj->base.size,
 						     obj->tiling_mode);
 	unfenced_alignment =
 		i915_gem_get_unfenced_gtt_alignment(dev,
 						    obj->base.size,
 						    obj->tiling_mode);
 
 	/* No explicit request: use the alignment that suits the binding. */
 	if (alignment == 0)
 		alignment = map_and_fenceable ? fence_alignment :
 						unfenced_alignment;
 	if (map_and_fenceable && alignment & (fence_alignment - 1)) {
 		DRM_ERROR("Invalid object alignment requested %u\n", alignment);
 		return -EINVAL;
 	}
 
 	size = map_and_fenceable ? fence_size : obj->base.size;
 
 	/* If the object is bigger than the entire aperture, reject it early
 	 * before evicting everything in a vain attempt to find space.
 	 */
 	if (obj->base.size >
 	    (map_and_fenceable ? dev_priv->mm.gtt_mappable_end : dev_priv->mm.gtt_total)) {
 		DRM_ERROR("Attempting to bind an object larger than the aperture\n");
 		return -E2BIG;
 	}
 
 	ret = i915_gem_object_get_pages(obj);
 	if (ret)
 		return ret;
 
 	/* Hold the pages for the duration of the bind; dropped on every exit. */
 	i915_gem_object_pin_pages(obj);
 
 	node = malloc(sizeof(*node), DRM_MEM_MM, M_NOWAIT | M_ZERO);
 	if (node == NULL) {
 		i915_gem_object_unpin_pages(obj);
 		return -ENOMEM;
 	}
 
 	/* Retry point: re-attempt the insertion after a successful eviction. */
  search_free:
 	if (map_and_fenceable)
 		ret = drm_mm_insert_node_in_range_generic(&dev_priv->mm.gtt_space, node,
 							  size, alignment, obj->cache_level,
 							  0, dev_priv->mm.gtt_mappable_end);
 	else
 		ret = drm_mm_insert_node_generic(&dev_priv->mm.gtt_space, node,
 						 size, alignment, obj->cache_level);
 	if (ret) {
 		ret = i915_gem_evict_something(dev, size, alignment,
 					       obj->cache_level,
 					       map_and_fenceable,
 					       nonblocking);
 		if (ret == 0)
 			goto search_free;
 
 		/* The node was never inserted, so it is simply freed. */
 		i915_gem_object_unpin_pages(obj);
 		free(node, DRM_MEM_MM);
 		return ret;
 	}
 	/* From here on the node is inserted and is released via drm_mm_put_block(). */
 	if (WARN_ON(!i915_gem_valid_gtt_space(dev, node, obj->cache_level))) {
 		i915_gem_object_unpin_pages(obj);
 		drm_mm_put_block(node);
 		return -EINVAL;
 	}
 
 	ret = i915_gem_gtt_prepare_object(obj);
 	if (ret) {
 		i915_gem_object_unpin_pages(obj);
 		drm_mm_put_block(node);
 		return ret;
 	}
 
 	list_move_tail(&obj->gtt_list, &dev_priv->mm.bound_list);
 	list_add_tail(&obj->mm_list, &dev_priv->mm.inactive_list);
 
 	obj->gtt_space = node;
 	obj->gtt_offset = node->start;
 
 	/* Fenceable only if the node has the fence size and fence alignment. */
 	fenceable =
 		node->size == fence_size &&
 		(node->start & (fence_alignment - 1)) == 0;
 
 	/* Mappable only if the whole object lies below the mappable limit. */
 	mappable =
 		obj->gtt_offset + obj->base.size <= dev_priv->mm.gtt_mappable_end;
 
 	obj->map_and_fenceable = mappable && fenceable;
 
 	i915_gem_object_unpin_pages(obj);
 	CTR4(KTR_DRM, "object_bind %p %x %x %d", obj, obj->gtt_offset,
 	    obj->base.size, map_and_fenceable);
 	i915_gem_verify_gtt(dev);
 	return 0;
 }
 
 /*
  * Flush the CPU cache lines covering @obj's backing pages.
  *
  * Nothing is done in two cases:
  *  - the object has no page list, which means it is not pinned to the GPU
  *    and the flush will happen again at bind time anyway;
  *  - the object's cache level is not I915_CACHE_NONE.  The GPU then snoops
  *    the CPU cache, although only when the render cache is flushed or
  *    invalidated; as those operations are always emitted when moving into
  *    and out of the RENDER domain, domain tracking alone yields correct
  *    snooping.
  */
 void
 i915_gem_clflush_object(struct drm_i915_gem_object *obj)
 {
 	if (obj->pages == NULL || obj->cache_level != I915_CACHE_NONE)
 		return;
 
 	CTR1(KTR_DRM, "object_clflush %p", obj);
 
 	drm_clflush_pages(obj->pages, obj->base.size / PAGE_SIZE);
 }
 
 /** Leaves the GTT write domain if the object is currently in it. */
 static void
 i915_gem_object_flush_gtt_write_domain(struct drm_i915_gem_object *obj)
 {
 	if (obj->base.write_domain == I915_GEM_DOMAIN_GTT) {
 		uint32_t old_write_domain;
 
 		/*
 		 * As far as is known, GTT writes reach main memory
 		 * immediately and never land in the render cache, so no
 		 * chipset flush is needed.  Ordering still matters: every
 		 * write through the GTT has to land before any write to
 		 * the device, such as an update of the GATT itself.
 		 */
 		wmb();
 
 		old_write_domain = obj->base.write_domain;
 		obj->base.write_domain = 0;
 
 		CTR3(KTR_DRM, "object_change_domain flush gtt_write %p %x %x", obj,
 		    obj->base.read_domains, old_write_domain);
 	}
 }
 
 /**
  * Leaves the CPU write domain if the object is currently in it: flushes
  * the object's cache lines, flushes the chipset, then clears write_domain.
  */
 static void
 i915_gem_object_flush_cpu_write_domain(struct drm_i915_gem_object *obj)
 {
 	if (obj->base.write_domain == I915_GEM_DOMAIN_CPU) {
 		uint32_t old_write_domain;
 
 		i915_gem_clflush_object(obj);
 		i915_gem_chipset_flush(obj->base.dev);
 
 		old_write_domain = obj->base.write_domain;
 		obj->base.write_domain = 0;
 
 		CTR3(KTR_DRM, "object_change_domain flush_cpu_write %p %x %x", obj,
 		    obj->base.read_domains, old_write_domain);
 	}
 }
 
 /**
  * Moves a single object to the GTT read domain and, when @write is set,
  * to the GTT write domain as well.
  *
  * The call returns once the move is complete, which includes waiting for
  * outstanding rendering and flushing CPU writes.  Returns 0 on success,
  * -EINVAL for an object that is not bound into the GTT, or the error from
  * waiting on rendering.
  */
 int
 i915_gem_object_set_to_gtt_domain(struct drm_i915_gem_object *obj, bool write)
 {
 	drm_i915_private_t *dev_priv = obj->base.dev->dev_private;
 	uint32_t prev_write, prev_read;
 	int error;
 
 	/* Only bound objects can be moved to the GTT domain. */
 	if (obj->gtt_space == NULL)
 		return -EINVAL;
 
 	/* Already the GTT writer: nothing to change. */
 	if (obj->base.write_domain == I915_GEM_DOMAIN_GTT)
 		return 0;
 
 	error = i915_gem_object_wait_rendering(obj, !write);
 	if (error != 0)
 		return error;
 
 	i915_gem_object_flush_cpu_write_domain(obj);
 
 	prev_write = obj->base.write_domain;
 	prev_read = obj->base.read_domains;
 
 	/* No other write domain may remain at this point. */
 	BUG_ON((obj->base.write_domain & ~I915_GEM_DOMAIN_GTT) != 0);
 
 	if (write) {
 		obj->base.read_domains = I915_GEM_DOMAIN_GTT;
 		obj->base.write_domain = I915_GEM_DOMAIN_GTT;
 		obj->dirty = 1;
 	} else {
 		obj->base.read_domains |= I915_GEM_DOMAIN_GTT;
 	}
 
 	CTR3(KTR_DRM, "object_change_domain set_to_gtt %p %x %x", obj,
 	    prev_read, prev_write);
 
 	/* Count this access in the inactive LRU. */
 	if (i915_gem_object_is_inactive(obj))
 		list_move_tail(&obj->mm_list, &dev_priv->mm.inactive_list);
 
 	return 0;
 }
 
 /*
  * Change the cache level of @obj to @cache_level.
  *
  * A no-op when the level is unchanged; refused with -EBUSY for a pinned
  * object.  If the current GTT node could not legally carry the new level
  * the object is unbound.  An object that remains bound is idled on the GPU,
  * has its GTT access finished, gives up its fence on pre-gen6 hardware and
  * has its global GTT / aliasing PPGTT mappings rewritten with the new
  * level.  Moving to I915_CACHE_NONE forces the object into the CPU read and
  * write domains.
  *
  * Returns 0 on success, -EBUSY for a pinned object, or the error from the
  * unbind, GPU-finish or fence-release step.
  */
 int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
 				    enum i915_cache_level cache_level)
 {
 	struct drm_device *dev = obj->base.dev;
 	drm_i915_private_t *dev_priv = dev->dev_private;
 	int ret;
 
 	if (obj->cache_level == cache_level)
 		return 0;
 
 	if (obj->pin_count) {
 		DRM_DEBUG("can not change the cache level of pinned objects\n");
 		return -EBUSY;
 	}
 
 	/* The node's neighbours may forbid the new colour; if so, unbind. */
 	if (!i915_gem_valid_gtt_space(dev, obj->gtt_space, cache_level)) {
 		ret = i915_gem_object_unbind(obj);
 		if (ret)
 			return ret;
 	}
 
 	/* Still bound: rewrite the existing mappings with the new level. */
 	if (obj->gtt_space) {
 		ret = i915_gem_object_finish_gpu(obj);
 		if (ret)
 			return ret;
 
 		i915_gem_object_finish_gtt(obj);
 
 		/* Before SandyBridge, you could not use tiling or fence
 		 * registers with snooped memory, so relinquish any fences
 		 * currently pointing to our region in the aperture.
 		 */
 		if (INTEL_INFO(dev)->gen < 6) {
 			ret = i915_gem_object_put_fence(obj);
 			if (ret)
 				return ret;
 		}
 
 		if (obj->has_global_gtt_mapping)
 			i915_gem_gtt_bind_object(obj, cache_level);
 		if (obj->has_aliasing_ppgtt_mapping)
 			i915_ppgtt_bind_object(dev_priv->mm.aliasing_ppgtt,
 					       obj, cache_level);
 
 		/* Keep the drm_mm colour in step with the cache level. */
 		obj->gtt_space->color = cache_level;
 	}
 
 	if (cache_level == I915_CACHE_NONE) {
 		u32 old_read_domains, old_write_domain;
 
 		/* If we're coming from LLC cached, then we haven't
 		 * actually been tracking whether the data is in the
 		 * CPU cache or not, since we only allow one bit set
 		 * in obj->write_domain and have been skipping the clflushes.
 		 * Just set it to the CPU cache for now.
 		 */
 		WARN_ON(obj->base.write_domain & ~I915_GEM_DOMAIN_CPU);
 		WARN_ON(obj->base.read_domains & ~I915_GEM_DOMAIN_CPU);
 
 		old_read_domains = obj->base.read_domains;
 		old_write_domain = obj->base.write_domain;
 
 		obj->base.read_domains = I915_GEM_DOMAIN_CPU;
 		obj->base.write_domain = I915_GEM_DOMAIN_CPU;
 
 		CTR3(KTR_DRM, "object_change_domain set_cache_level %p %x %x",
 		    obj, old_read_domains, old_write_domain);
 	}
 
 	obj->cache_level = cache_level;
 	i915_gem_verify_gtt(dev);
 	return 0;
 }
 
 /*
  * GET_CACHING ioctl: report whether the object named by args->handle uses
  * a cache level other than I915_CACHE_NONE.
  *
  * @dev:  DRM device
  * @data: struct drm_i915_gem_caching; handle is read, caching is written
  * @file: DRM file the handle is looked up in
  *
  * Returns 0 on success, the error from i915_mutex_lock_interruptible(), or
  * -ENOENT if the handle does not name an object.
  */
 int i915_gem_get_caching_ioctl(struct drm_device *dev, void *data,
 			       struct drm_file *file)
 {
 	struct drm_i915_gem_caching *args = data;
 	struct drm_gem_object *gem_obj;
 	struct drm_i915_gem_object *obj;
 	int ret;
 
 	ret = i915_mutex_lock_interruptible(dev);
 	if (ret)
 		return ret;
 
 	/*
 	 * Test the lookup result itself.  Comparing &obj->base against
 	 * NULL after the conversion only works while base sits at offset
 	 * zero and the compiler does not fold the comparison away.
 	 */
 	gem_obj = drm_gem_object_lookup(dev, file, args->handle);
 	if (gem_obj == NULL) {
 		ret = -ENOENT;
 		goto unlock;
 	}
 	obj = to_intel_bo(gem_obj);
 
 	args->caching = obj->cache_level != I915_CACHE_NONE;
 
 	drm_gem_object_unreference(&obj->base);
 unlock:
 	DRM_UNLOCK(dev);
 	return ret;
 }
 
 /*
  * SET_CACHING ioctl: change the cache level of the object named by
  * args->handle.
  *
  * @dev:  DRM device
  * @data: struct drm_i915_gem_caching; handle and caching are read
  * @file: DRM file the handle is looked up in
  *
  * I915_CACHING_NONE selects I915_CACHE_NONE and I915_CACHING_CACHED
  * selects I915_CACHE_LLC; any other value is rejected before the lock is
  * taken.
  *
  * Returns 0 on success, -EINVAL for an unknown caching mode, the error
  * from i915_mutex_lock_interruptible(), -ENOENT if the handle does not
  * name an object, or the error from i915_gem_object_set_cache_level().
  */
 int i915_gem_set_caching_ioctl(struct drm_device *dev, void *data,
 			       struct drm_file *file)
 {
 	struct drm_i915_gem_caching *args = data;
 	struct drm_gem_object *gem_obj;
 	struct drm_i915_gem_object *obj;
 	enum i915_cache_level level;
 	int ret;
 
 	switch (args->caching) {
 	case I915_CACHING_NONE:
 		level = I915_CACHE_NONE;
 		break;
 	case I915_CACHING_CACHED:
 		level = I915_CACHE_LLC;
 		break;
 	default:
 		return -EINVAL;
 	}
 
 	ret = i915_mutex_lock_interruptible(dev);
 	if (ret)
 		return ret;
 
 	/*
 	 * Test the lookup result itself.  Comparing &obj->base against
 	 * NULL after the conversion only works while base sits at offset
 	 * zero and the compiler does not fold the comparison away.
 	 */
 	gem_obj = drm_gem_object_lookup(dev, file, args->handle);
 	if (gem_obj == NULL) {
 		ret = -ENOENT;
 		goto unlock;
 	}
 	obj = to_intel_bo(gem_obj);
 
 	ret = i915_gem_object_set_cache_level(obj, level);
 
 	drm_gem_object_unreference(&obj->base);
 unlock:
 	DRM_UNLOCK(dev);
 	return ret;
 }
 
 /*
  * Report whether @obj is still pinned on behalf of the display engine.
  */
 static bool is_pin_display(struct drm_i915_gem_object *obj)
 {
 	/*
 	 * Objects get pinned by three parties:
 	 *   1. the display engine (scanouts, sprites, cursors);
 	 *   2. execbuffer reservations;
 	 *   3. the user.
 	 *
 	 * Reservations can be left out: struct_mutex is held and this is
 	 * only called outside the reservation path.  The user contributes
 	 * at most one to pin_count, so whatever is left after discounting
 	 * that possible pin has to belong to the display engine.
 	 */
 	int user_pins = obj->user_pin_count ? 1 : 0;
 
 	return (obj->pin_count - user_pins) != 0;
 }
 
 /*
  * Get a buffer ready for use by a display plane (scanout, cursor, ...).
  *
  * May be called from an uninterruptible phase such as modesetting, and
  * lets pending flushes be pipelined on @pipelined (for page flips).
  *
  * Returns 0 with the object pinned, or a negative error with pin_display
  * recomputed from the remaining pins.
  */
 int
 i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj,
 				     u32 alignment,
 				     struct intel_ring_buffer *pipelined)
 {
 	u32 prev_read, prev_write;
 	int ret;
 
 	if (pipelined != obj->ring) {
 		ret = i915_gem_object_sync(obj, pipelined);
 		if (ret != 0)
 			return ret;
 	}
 
 	/*
 	 * Flag the display pin up front so that display coherency is taken
 	 * into account while the cache domains are being set up.
 	 */
 	obj->pin_display = true;
 
 	/*
 	 * On gen6 the display engine is not coherent with the LLC, so the
 	 * upcoming pin must use uncached PTEs; that is the lowest common
 	 * denominator across chipsets.
 	 *
 	 * gen6+ could do better with the GFDT bit instead of uncaching:
 	 * all LLC-cached data carrying that PTE bit could then be flushed
 	 * to main memory with a single PIPE_CONTROL.
 	 */
 	ret = i915_gem_object_set_cache_level(obj, I915_CACHE_NONE);
 
 	/*
 	 * The user may map the buffer once it is pinned for display (e.g.
 	 * libkms for the boot splash), so scanout buffers are always pinned
 	 * map_and_fenceable.
 	 */
 	if (ret == 0)
 		ret = i915_gem_object_pin(obj, alignment, true, false);
 
 	if (ret != 0) {
 		obj->pin_display = is_pin_display(obj);
 		return ret;
 	}
 
 	i915_gem_object_flush_cpu_write_domain(obj);
 
 	prev_write = obj->base.write_domain;
 	prev_read = obj->base.read_domains;
 
 	/* No write domain remains; record the GTT read domain. */
 	obj->base.write_domain = 0;
 	obj->base.read_domains |= I915_GEM_DOMAIN_GTT;
 
 	CTR3(KTR_DRM, "object_change_domain pin_to_display_plan %p %x %x",
 	    obj, prev_read, prev_write);
 
 	return 0;
 }
 
 void
 i915_gem_object_unpin_from_display_plane(struct drm_i915_gem_object *obj)
 {
 	i915_gem_object_unpin(obj);
 	obj->pin_display = is_pin_display(obj);
 }
 
 int
 i915_gem_object_finish_gpu(struct drm_i915_gem_object *obj)
 {
 	int ret;
 
 	if ((obj->base.read_domains & I915_GEM_GPU_DOMAINS) == 0)
 		return 0;
 
 	ret = i915_gem_object_wait_rendering(obj, false);
 	if (ret)
 		return ret;
 
 	/* Ensure that we invalidate the GPU's caches and TLBs. */
 	obj->base.read_domains &= ~I915_GEM_GPU_DOMAINS;
 	return 0;
 }
 
 /**
  * Moves a single object to the CPU read domain and, when @write is set,
  * to the CPU write domain as well.
  *
  * The call returns once the move is complete, which includes waiting for
  * outstanding rendering and flushing GTT writes.  Returns 0 on success or
  * the error from waiting on rendering.
  */
 int
 i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write)
 {
 	uint32_t prev_write, prev_read;
 	int error;
 
 	/* Already the CPU writer: nothing to change. */
 	if (obj->base.write_domain == I915_GEM_DOMAIN_CPU)
 		return 0;
 
 	error = i915_gem_object_wait_rendering(obj, !write);
 	if (error != 0)
 		return error;
 
 	i915_gem_object_flush_gtt_write_domain(obj);
 
 	prev_write = obj->base.write_domain;
 	prev_read = obj->base.read_domains;
 
 	/* The CPU cache is stale unless the object was already CPU-readable. */
 	if (!(obj->base.read_domains & I915_GEM_DOMAIN_CPU)) {
 		i915_gem_clflush_object(obj);
 
 		obj->base.read_domains |= I915_GEM_DOMAIN_CPU;
 	}
 
 	/* No other write domain may remain at this point. */
 	BUG_ON((obj->base.write_domain & ~I915_GEM_DOMAIN_CPU) != 0);
 
 	/*
 	 * A CPU writer drops every other read domain, so GPU read domains
 	 * get invalidated at their next use.
 	 */
 	if (write) {
 		obj->base.read_domains = I915_GEM_DOMAIN_CPU;
 		obj->base.write_domain = I915_GEM_DOMAIN_CPU;
 	}
 
 	CTR3(KTR_DRM, "object_change_domain set_to_cpu %p %x %x", obj,
 	    prev_read, prev_write);
 
 	return 0;
 }
 
 /*
  * Throttle the caller's rendering: wait until the ring has completed the
  * requests this file emitted more than 20 msec ago.
  *
  * The cut-off time is sampled once on entry.  Re-reading jiffies on every
  * pass would mean never leaving with frames outstanding whenever a frame
  * takes longer than 20ms to render.
  *
  * The aim is reasonable CPU/GPU parallelism together with fairly low
  * latency when blocking on one particular request.
  *
  * Returns 0 if there is nothing to wait for or the wait succeeded, -EIO if
  * the GPU is wedged, or the error from __wait_seqno().
  */
 static int
 i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	struct drm_i915_file_private *file_priv = file->driver_priv;
 	unsigned long recent_enough = jiffies - msecs_to_jiffies(20);
 	struct drm_i915_gem_request *request;
 	struct intel_ring_buffer *ring = NULL;
 	u32 seqno = 0;
 	int ret;
 
 	if (atomic_read(&dev_priv->mm.wedged))
 		return -EIO;
 
 	/* Remember the last request emitted before the cut-off. */
 	mtx_lock(&file_priv->mm.lock);
 	list_for_each_entry(request, &file_priv->mm.request_list, client_list) {
 		if (time_after_eq(request->emitted_jiffies, recent_enough))
 			break;
 
 		seqno = request->seqno;
 		ring = request->ring;
 	}
 	mtx_unlock(&file_priv->mm.lock);
 
 	if (seqno == 0)
 		return 0;
 
 	ret = __wait_seqno(ring, seqno, true, NULL);
 	if (ret != 0)
 		return ret;
 
 	/* The wait succeeded: queue the retire work right away. */
 	taskqueue_enqueue_timeout(dev_priv->wq,
 	    &dev_priv->mm.retire_work, 0);
 
 	return 0;
 }
 
 /*
  * Pin @obj into the GTT, binding it first if necessary.
  *
  * @alignment:         required GTT alignment, 0 for no constraint
  * @map_and_fenceable: require a mappable and fenceable placement
  * @nonblocking:       passed through to the bind step
  *
  * An existing binding that does not satisfy the request is torn down and
  * redone; a warning fires if the object was pinned at that point.
  *
  * Returns 0 on success, -EBUSY if pin_count is already at its maximum, or
  * the error from unbinding or binding.
  */
 int
 i915_gem_object_pin(struct drm_i915_gem_object *obj,
 		    uint32_t alignment,
 		    bool map_and_fenceable,
 		    bool nonblocking)
 {
 	int ret;
 
 	if (WARN_ON(obj->pin_count == DRM_I915_GEM_OBJECT_MAX_PIN_COUNT))
 		return -EBUSY;
 
 	if (obj->gtt_space != NULL) {
 		bool misaligned = alignment != 0 &&
 		    (obj->gtt_offset & (alignment - 1)) != 0;
 		bool unfenceable = map_and_fenceable &&
 		    !obj->map_and_fenceable;
 
 		if (misaligned || unfenceable) {
 			WARN(obj->pin_count,
 			     "bo is already pinned with incorrect alignment:"
 			     " offset=%x, req.alignment=%x, req.map_and_fenceable=%d,"
 			     " obj->map_and_fenceable=%d\n",
 			     obj->gtt_offset, alignment,
 			     map_and_fenceable,
 			     obj->map_and_fenceable);
 			ret = i915_gem_object_unbind(obj);
 			if (ret != 0)
 				return ret;
 		}
 	}
 
 	if (obj->gtt_space == NULL) {
 		struct drm_i915_private *dev_priv = obj->base.dev->dev_private;
 
 		ret = i915_gem_object_bind_to_gtt(obj, alignment,
 						  map_and_fenceable,
 						  nonblocking);
 		if (ret != 0)
 			return ret;
 
 		/* Without an aliasing PPGTT, map into the global GTT now. */
 		if (dev_priv->mm.aliasing_ppgtt == NULL)
 			i915_gem_gtt_bind_object(obj, obj->cache_level);
 	}
 
 	/* A mappable pin needs a global GTT mapping. */
 	if (map_and_fenceable && !obj->has_global_gtt_mapping)
 		i915_gem_gtt_bind_object(obj, obj->cache_level);
 
 	obj->pin_count++;
 	obj->pin_mappable |= map_and_fenceable;
 
 	return 0;
 }
 
 /*
  * Drop one pin from @obj.  The object must be pinned and bound.  When the
  * last pin goes away pin_mappable is cleared.
  */
 void
 i915_gem_object_unpin(struct drm_i915_gem_object *obj)
 {
 	BUG_ON(obj->pin_count == 0);
 	BUG_ON(obj->gtt_space == NULL);
 
 	obj->pin_count--;
 	if (obj->pin_count == 0)
 		obj->pin_mappable = false;
 }
 
 /*
  * PIN ioctl: pin the object named by args->handle on behalf of @file and
  * report its GTT offset in args->offset.
  *
  * @dev:  DRM device
  * @data: struct drm_i915_gem_pin; handle and alignment are read, offset
  *        is written
  * @file: DRM file doing the pinning
  *
  * Only the first user pin actually pins the object; later calls by the
  * same file just bump user_pin_count.  An object that another file has
  * pinned, or one that is not I915_MADV_WILLNEED, is refused.
  *
  * Returns 0 on success, the error from i915_mutex_lock_interruptible(),
  * -ENOENT if the handle does not name an object, -EINVAL for the refusals
  * above, or the error from i915_gem_object_pin().
  */
 int
 i915_gem_pin_ioctl(struct drm_device *dev, void *data,
 		   struct drm_file *file)
 {
 	struct drm_i915_gem_pin *args = data;
 	struct drm_gem_object *gem_obj;
 	struct drm_i915_gem_object *obj;
 	int ret;
 
 	ret = i915_mutex_lock_interruptible(dev);
 	if (ret)
 		return ret;
 
 	/*
 	 * Test the lookup result itself.  Comparing &obj->base against
 	 * NULL after the conversion only works while base sits at offset
 	 * zero and the compiler does not fold the comparison away.
 	 */
 	gem_obj = drm_gem_object_lookup(dev, file, args->handle);
 	if (gem_obj == NULL) {
 		ret = -ENOENT;
 		goto unlock;
 	}
 	obj = to_intel_bo(gem_obj);
 
 	if (obj->madv != I915_MADV_WILLNEED) {
 		DRM_ERROR("Attempting to pin a purgeable buffer\n");
 		ret = -EINVAL;
 		goto out;
 	}
 
 	if (obj->pin_filp != NULL && obj->pin_filp != file) {
 		DRM_ERROR("Already pinned in i915_gem_pin_ioctl(): %d\n",
 			  args->handle);
 		ret = -EINVAL;
 		goto out;
 	}
 
 	if (obj->user_pin_count == 0) {
 		ret = i915_gem_object_pin(obj, args->alignment, true, false);
 		if (ret)
 			goto out;
 	}
 
 	/* NOTE(review): user_pin_count is not guarded against wrap-around. */
 	obj->user_pin_count++;
 	obj->pin_filp = file;
 
 	/* XXX - flush the CPU caches for pinned objects
 	 * as the X server doesn't manage domains yet
 	 */
 	i915_gem_object_flush_cpu_write_domain(obj);
 	args->offset = obj->gtt_offset;
 out:
 	drm_gem_object_unreference(&obj->base);
 unlock:
 	DRM_UNLOCK(dev);
 	return ret;
 }
 
 /*
  * UNPIN ioctl: drop one user pin that @file holds on the object named by
  * args->handle.
  *
  * @dev:  DRM device
  * @data: struct drm_i915_gem_pin; only handle is read
  * @file: DRM file that owns the pin
  *
  * The object itself is unpinned, and pin_filp cleared, when the user pin
  * count reaches zero.
  *
  * Returns 0 on success, the error from i915_mutex_lock_interruptible(),
  * -ENOENT if the handle does not name an object, or -EINVAL if the object
  * is not pinned by @file.
  */
 int
 i915_gem_unpin_ioctl(struct drm_device *dev, void *data,
 		     struct drm_file *file)
 {
 	struct drm_i915_gem_pin *args = data;
 	struct drm_gem_object *gem_obj;
 	struct drm_i915_gem_object *obj;
 	int ret;
 
 	ret = i915_mutex_lock_interruptible(dev);
 	if (ret)
 		return ret;
 
 	/*
 	 * Test the lookup result itself.  Comparing &obj->base against
 	 * NULL after the conversion only works while base sits at offset
 	 * zero and the compiler does not fold the comparison away.
 	 */
 	gem_obj = drm_gem_object_lookup(dev, file, args->handle);
 	if (gem_obj == NULL) {
 		ret = -ENOENT;
 		goto unlock;
 	}
 	obj = to_intel_bo(gem_obj);
 
 	if (obj->pin_filp != file) {
 		DRM_ERROR("Not pinned by caller in i915_gem_pin_ioctl(): %d\n",
 			  args->handle);
 		ret = -EINVAL;
 		goto out;
 	}
 	obj->user_pin_count--;
 	if (obj->user_pin_count == 0) {
 		obj->pin_filp = NULL;
 		i915_gem_object_unpin(obj);
 	}
 
 out:
 	drm_gem_object_unreference(&obj->base);
 unlock:
 	DRM_UNLOCK(dev);
 	return ret;
 }
 
 /*
  * BUSY ioctl: report whether the object named by args->handle is active.
  *
  * @dev:  DRM device
  * @data: struct drm_i915_gem_busy; handle is read, busy is written
  * @file: DRM file the handle is looked up in
  *
  * args->busy receives obj->active; when the object has a ring, that ring's
  * flag is OR-ed in shifted left by 16.
  *
  * Returns the error from i915_mutex_lock_interruptible(), -ENOENT if the
  * handle does not name an object, or otherwise the result of
  * i915_gem_object_flush_active(); args->busy is filled in either way once
  * the object has been found.
  */
 int
 i915_gem_busy_ioctl(struct drm_device *dev, void *data,
 		    struct drm_file *file)
 {
 	struct drm_i915_gem_busy *args = data;
 	struct drm_gem_object *gem_obj;
 	struct drm_i915_gem_object *obj;
 	int ret;
 
 	ret = i915_mutex_lock_interruptible(dev);
 	if (ret)
 		return ret;
 
 	/*
 	 * Test the lookup result itself.  Comparing &obj->base against
 	 * NULL after the conversion only works while base sits at offset
 	 * zero and the compiler does not fold the comparison away.
 	 */
 	gem_obj = drm_gem_object_lookup(dev, file, args->handle);
 	if (gem_obj == NULL) {
 		ret = -ENOENT;
 		goto unlock;
 	}
 	obj = to_intel_bo(gem_obj);
 
 	/* Count all active objects as busy, even if they are currently not used
 	 * by the gpu. Users of this interface expect objects to eventually
 	 * become non-busy without any further actions, therefore emit any
 	 * necessary flushes here.
 	 */
 	ret = i915_gem_object_flush_active(obj);
 
 	args->busy = obj->active;
 	if (obj->ring) {
 		/* The ring flag is reported in the bits above bit 15. */
 		BUILD_BUG_ON(I915_NUM_RINGS > 16);
 		args->busy |= intel_ring_flag(obj->ring) << 16;
 	}
 
 	drm_gem_object_unreference(&obj->base);
 unlock:
 	DRM_UNLOCK(dev);
 	return ret;
 }
 
 /*
  * THROTTLE ioctl: thin wrapper that throttles the calling file's rendering
  * via i915_gem_ring_throttle().  @data is not used.
  */
 int
 i915_gem_throttle_ioctl(struct drm_device *dev, void *data,
 			struct drm_file *file_priv)
 {
 	int ret;
 
 	ret = i915_gem_ring_throttle(dev, file_priv);
 	return ret;
 }
 
 /*
  * MADVISE ioctl: record the caller's advice about whether the backing
  * storage of the object named by args->handle is still needed.
  *
  * @dev:       DRM device
  * @data:      struct drm_i915_gem_madvise; handle and madv are read,
  *             retained is written
  * @file_priv: DRM file the handle is looked up in
  *
  * The advice is stored unless the object has already been purged.  An
  * object that ends up purgeable and has no pages attached is truncated at
  * once.  args->retained reports whether the object is still not purged.
  *
  * Returns 0 on success, -EINVAL for an unknown advice value or a pinned
  * object, the error from i915_mutex_lock_interruptible(), or -ENOENT if
  * the handle does not name an object.
  */
 int
 i915_gem_madvise_ioctl(struct drm_device *dev, void *data,
 		       struct drm_file *file_priv)
 {
 	struct drm_i915_gem_madvise *args = data;
 	struct drm_gem_object *gem_obj;
 	struct drm_i915_gem_object *obj;
 	int ret;
 
 	switch (args->madv) {
 	case I915_MADV_DONTNEED:
 	case I915_MADV_WILLNEED:
 		break;
 	default:
 		return -EINVAL;
 	}
 
 	ret = i915_mutex_lock_interruptible(dev);
 	if (ret)
 		return ret;
 
 	/*
 	 * Test the lookup result itself.  Comparing &obj->base against
 	 * NULL after the conversion only works while base sits at offset
 	 * zero and the compiler does not fold the comparison away.
 	 */
 	gem_obj = drm_gem_object_lookup(dev, file_priv, args->handle);
 	if (gem_obj == NULL) {
 		ret = -ENOENT;
 		goto unlock;
 	}
 	obj = to_intel_bo(gem_obj);
 
 	if (obj->pin_count) {
 		ret = -EINVAL;
 		goto out;
 	}
 
 	/* A purged object stays purged whatever the new advice is. */
 	if (obj->madv != __I915_MADV_PURGED)
 		obj->madv = args->madv;
 
 	/* if the object is no longer attached, discard its backing storage */
 	if (i915_gem_object_is_purgeable(obj) && obj->pages == NULL)
 		i915_gem_object_truncate(obj);
 
 	args->retained = obj->madv != __I915_MADV_PURGED;
 
 out:
 	drm_gem_object_unreference(&obj->base);
 unlock:
 	DRM_UNLOCK(dev);
 	return ret;
 }
 
 /*
  * Initialise the i915-specific part of a freshly created GEM object:
  * empty list links, the given @ops table, no fence, WILLNEED advice, and
  * account the object's size in the device statistics.
  */
 void i915_gem_object_init(struct drm_i915_gem_object *obj,
 			  const struct drm_i915_gem_object_ops *ops)
 {
 	obj->ops = ops;
 
 	INIT_LIST_HEAD(&obj->mm_list);
 	INIT_LIST_HEAD(&obj->gtt_list);
 	INIT_LIST_HEAD(&obj->ring_list);
 	INIT_LIST_HEAD(&obj->exec_list);
 
 	obj->madv = I915_MADV_WILLNEED;
 	obj->fence_reg = I915_FENCE_REG_NONE;
 
 	/* Starting out as true saves a needless unbind on the first bind. */
 	obj->map_and_fenceable = true;
 
 	i915_gem_info_add_obj(obj->base.dev->dev_private, obj->base.size);
 }
 
 /*
  * Page get/put operations handed to i915_gem_object_init() for objects
  * created by i915_gem_alloc_object().
  */
 static const struct drm_i915_gem_object_ops i915_gem_object_ops = {
 	.get_pages = i915_gem_object_get_pages_gtt,
 	.put_pages = i915_gem_object_put_pages_gtt,
 };
 
 /*
  * Allocate and initialise a GEM object of @size bytes on @dev.
  *
  * The new object starts in the CPU read and write domains.  Its cache
  * level is I915_CACHE_LLC on devices with an LLC and I915_CACHE_NONE
  * otherwise.
  *
  * Returns the new object, or NULL if allocation or drm_gem_object_init()
  * fails.
  */
 struct drm_i915_gem_object *i915_gem_alloc_object(struct drm_device *dev,
 						  size_t size)
 {
 	struct drm_i915_gem_object *obj;
 
 	obj = malloc(sizeof(*obj), DRM_I915_GEM, M_WAITOK | M_ZERO);
 	if (obj == NULL)
 		return NULL;
 
 	if (drm_gem_object_init(dev, &obj->base, size) != 0) {
 		free(obj, DRM_I915_GEM);
 		return NULL;
 	}
 
 	/*
 	 * Not compiled unless FREEBSD_WIP is defined; the region refers to
 	 * 'mask' and 'mapping', which this function does not declare.
 	 */
 #ifdef FREEBSD_WIP
 	mask = GFP_HIGHUSER | __GFP_RECLAIMABLE;
 	if (IS_CRESTLINE(dev) || IS_BROADWATER(dev)) {
 		/* 965gm cannot relocate objects above 4GiB. */
 		mask &= ~__GFP_HIGHMEM;
 		mask |= __GFP_DMA32;
 	}
 
 	mapping = obj->base.filp->f_path.dentry->d_inode->i_mapping;
 	mapping_set_gfp_mask(mapping, mask);
 #endif /* FREEBSD_WIP */
 
 	i915_gem_object_init(obj, &i915_gem_object_ops);
 
 	obj->base.write_domain = I915_GEM_DOMAIN_CPU;
 	obj->base.read_domains = I915_GEM_DOMAIN_CPU;
 
 	if (HAS_LLC(dev)) {
 		/* On some devices, we can have the GPU use the LLC (the CPU
 		 * cache) for about a 10% performance improvement
 		 * compared to uncached.  Graphics requests other than
 		 * display scanout are coherent with the CPU in
 		 * accessing this cache.  This means in this mode we
 		 * don't need to clflush on the CPU side, and on the
 		 * GPU side we only need to flush internal caches to
 		 * get data visible to the CPU.
 		 *
 		 * However, we maintain the display planes as UC, and so
 		 * need to rebind when first used as such.
 		 */
 		obj->cache_level = I915_CACHE_LLC;
 	} else
 		obj->cache_level = I915_CACHE_NONE;
 
 	return obj;
 }
 
 /*
  * GEM init-object hook.  It performs no initialisation: it only logs that
  * it was called and reports success.
  */
 int i915_gem_init_object(struct drm_gem_object *obj)
 {
 	printf("i915_gem_init_object called\n");
 
 	return 0;
 }
 
 /*
  * Destroy a GEM object: detach any physical object, unbind it, drop its
  * backing pages and mmap offset, release the base GEM object, remove it
  * from the device accounting and free its memory.
  */
 void i915_gem_free_object(struct drm_gem_object *gem_obj)
 {
 	struct drm_i915_gem_object *obj = to_intel_bo(gem_obj);
 	struct drm_device *dev = obj->base.dev;
 	drm_i915_private_t *dev_priv = dev->dev_private;
 
 	CTR1(KTR_DRM, "object_destroy_tail %p", obj);
 
 	if (obj->phys_obj)
 		i915_gem_detach_phys_object(dev, obj);
 
 	/* Force the pin count to zero before unbinding. */
 	obj->pin_count = 0;
 	if (WARN_ON(i915_gem_object_unbind(obj) == -ERESTARTSYS)) {
 		bool was_interruptible;
 
 		/*
 		 * The unbind returned -ERESTARTSYS; retry once with
 		 * mm.interruptible cleared, then restore the old setting.
 		 */
 		was_interruptible = dev_priv->mm.interruptible;
 		dev_priv->mm.interruptible = false;
 
 		WARN_ON(i915_gem_object_unbind(obj));
 
 		dev_priv->mm.interruptible = was_interruptible;
 	}
 
 	/* Force the page pin count to zero before releasing the pages. */
 	obj->pages_pin_count = 0;
 	i915_gem_object_put_pages(obj);
 	i915_gem_object_free_mmap_offset(obj);
 
 	/* The object must hold no pages at this point. */
 	BUG_ON(obj->pages);
 
 	/* PRIME import teardown is not ported (compiled out). */
 #ifdef FREEBSD_WIP
 	if (obj->base.import_attach)
 		drm_prime_gem_destroy(&obj->base, NULL);
 #endif /* FREEBSD_WIP */
 
 	drm_gem_object_release(&obj->base);
 	i915_gem_info_remove_obj(dev_priv, obj->base.size);
 
 	free(obj->bit_17, DRM_I915_GEM);
 	free(obj, DRM_I915_GEM);
 }
 
 /*
  * Quiesce GEM: idle the GPU, retire outstanding requests, reset fences,
  * mark GEM as suspended and tear down the ring buffers.
  *
  * Returns 0 on success or if GEM was already suspended, otherwise the error
  * from i915_gpu_idle().  The DRM lock is taken and released internally.
  */
 int
 i915_gem_idle(struct drm_device *dev)
 {
 	drm_i915_private_t *dev_priv = dev->dev_private;
 	int ret;
 
 	DRM_LOCK(dev);
 
 	/* Nothing to do if GEM is already suspended. */
 	if (dev_priv->mm.suspended) {
 		DRM_UNLOCK(dev);
 		return 0;
 	}
 
 	ret = i915_gpu_idle(dev);
 	if (ret) {
 		DRM_UNLOCK(dev);
 		return ret;
 	}
 	i915_gem_retire_requests(dev);
 
 	/* Under UMS, be paranoid and evict. */
 	if (!drm_core_check_feature(dev, DRIVER_MODESET))
 		i915_gem_evict_everything(dev);
 
 	i915_gem_reset_fences(dev);
 
 	/* Hack!  Don't let anybody do execbuf while we don't control the chip.
 	 * We need to replace this with a semaphore, or something.
 	 * And not confound mm.suspended!
 	 */
 	dev_priv->mm.suspended = 1;
 	callout_stop(&dev_priv->hangcheck_timer);
 
 	i915_kernel_lost_context(dev);
 	i915_gem_cleanup_ringbuffer(dev);
 
 	DRM_UNLOCK(dev);
 
 	/* Cancel the retire work handler, which should be idle now. */
 	taskqueue_cancel_timeout(dev_priv->wq, &dev_priv->mm.retire_work, NULL);
 
 	return 0;
 }
 
 /*
  * Program the GEN7 L3 log registers from dev_priv->l3_parity.remap_info.
  *
  * Does nothing when the device has no L3 GPU cache or when no remap
  * information has been recorded.  DOP clock gating is turned off for the
  * duration of the register writes and the original MISCCPCTL value is
  * restored afterwards.
  */
 void i915_gem_l3_remap(struct drm_device *dev)
 {
 	drm_i915_private_t *dev_priv = dev->dev_private;
 	u32 misccpctl;
 	int i;
 
 	if (!HAS_L3_GPU_CACHE(dev))
 		return;
 
 	if (!dev_priv->l3_parity.remap_info)
 		return;
 
 	/* Save MISCCPCTL and clear the DOP clock-gate enable bit. */
 	misccpctl = I915_READ(GEN7_MISCCPCTL);
 	I915_WRITE(GEN7_MISCCPCTL, misccpctl & ~GEN7_DOP_CLOCK_GATE_ENABLE);
 	POSTING_READ(GEN7_MISCCPCTL);
 
 	/* One 32-bit register per remap_info entry (byte offset i, index i/4). */
 	for (i = 0; i < GEN7_L3LOG_SIZE; i += 4) {
 		u32 remap = I915_READ(GEN7_L3LOG_BASE + i);
 		if (remap && remap != dev_priv->l3_parity.remap_info[i/4])
 			DRM_DEBUG("0x%x was already programmed to %x\n",
 				  GEN7_L3LOG_BASE + i, remap);
 		if (remap && !dev_priv->l3_parity.remap_info[i/4])
 			DRM_DEBUG_DRIVER("Clearing remapped register\n");
 		I915_WRITE(GEN7_L3LOG_BASE + i, dev_priv->l3_parity.remap_info[i/4]);
 	}
 
 	/* Make sure all the writes land before disabling dop clock gating */
 	POSTING_READ(GEN7_L3LOG_BASE);
 
 	/* Restore the MISCCPCTL value saved above. */
 	I915_WRITE(GEN7_MISCCPCTL, misccpctl);
 }
 
 /*
  * Enable tiled-surface swizzling in the display arbiter and, on gen6 and
  * later, in TILECTL and ARB_MODE.
  *
  * Skipped entirely on hardware older than gen5 and when bit-6 swizzling was
  * detected as I915_BIT_6_SWIZZLE_NONE.
  */
 void i915_gem_init_swizzling(struct drm_device *dev)
 {
 	drm_i915_private_t *dev_priv = dev->dev_private;
 
 	if (INTEL_INFO(dev)->gen < 5 ||
 	    dev_priv->mm.bit_6_swizzle_x == I915_BIT_6_SWIZZLE_NONE)
 		return;
 
 	I915_WRITE(DISP_ARB_CTL, I915_READ(DISP_ARB_CTL) |
 				 DISP_TILE_SURFACE_SWIZZLING);
 
 	/* Gen5 only needs the display arbiter setting. */
 	if (IS_GEN5(dev))
 		return;
 
 	I915_WRITE(TILECTL, I915_READ(TILECTL) | TILECTL_SWZCTL);
 	/* ARB_MODE uses a different swizzle bit on gen6 than on later parts. */
 	if (IS_GEN6(dev))
 		I915_WRITE(ARB_MODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_SNB));
 	else
 		I915_WRITE(ARB_MODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_IVB));
 }
 
 /*
  * Decide whether the blitter ring should be used on this device.
  * Returns false when the device has no BLT ring, or when it is a gen6 part
  * with a PCI revision below 8.
  */
 static bool
 intel_enable_blt(struct drm_device *dev)
 {
 	bool usable = false;
 
 	if (HAS_BLT(dev)) {
 		/* The blitter was dysfunctional on early prototypes */
 		if (IS_GEN6(dev) && pci_get_revid(dev->dev) < 8) {
 			DRM_INFO("BLT not supported on this pre-production hardware;"
 				 " graphics performance will be degraded.\n");
 		} else {
 			usable = true;
 		}
 	}
 
 	return usable;
 }
 
 /*
  * Bring up the GEM hardware state: apply the L3 remap and swizzling setup,
  * initialise the render ring and (when present/enabled) the BSD and BLT
  * rings, reset the sequence number, and initialise contexts and PPGTT.
  *
  * Returns 0 on success.  On a ring initialisation failure, the rings set up
  * so far are cleaned up and the error is returned.
  */
 int
 i915_gem_init_hw(struct drm_device *dev)
 {
 	drm_i915_private_t *dev_priv = dev->dev_private;
 	int ret;
 
 #ifdef FREEBSD_WIP
 	if (INTEL_INFO(dev)->gen < 6 && !intel_enable_gtt())
 		return -EIO;
 #endif /* FREEBSD_WIP */
 
 	/* NOTE(review): raw register offsets; purpose not documented here. */
 	if (IS_HASWELL(dev) && (I915_READ(0x120010) == 1))
 		I915_WRITE(0x9008, I915_READ(0x9008) | 0xf0000);
 
 	i915_gem_l3_remap(dev);
 
 	i915_gem_init_swizzling(dev);
 
 	ret = intel_init_render_ring_buffer(dev);
 	if (ret)
 		return ret;
 
 	if (HAS_BSD(dev)) {
 		ret = intel_init_bsd_ring_buffer(dev);
 		if (ret)
 			goto cleanup_render_ring;
 	}
 
 	if (intel_enable_blt(dev)) {
 		ret = intel_init_blt_ring_buffer(dev);
 		if (ret)
 			goto cleanup_bsd_ring;
 	}
 
 	dev_priv->next_seqno = 1;
 
 	/*
 	 * XXX: There was some w/a described somewhere suggesting loading
 	 * contexts before PPGTT.
 	 */
 	i915_gem_context_init(dev);
 	i915_gem_init_ppgtt(dev);
 
 	return 0;
 
 	/* Unwind in reverse order of initialisation. */
 cleanup_bsd_ring:
 	intel_cleanup_ring_buffer(&dev_priv->ring[VCS]);
 cleanup_render_ring:
 	intel_cleanup_ring_buffer(&dev_priv->ring[RCS]);
 	return ret;
 }
 
 /*
  * Decide whether PPGTT should be used.  A non-negative i915_enable_ppgtt
  * setting is returned as-is; otherwise the default is true, except (when
  * built with CONFIG_INTEL_IOMMU) on gen6 with the graphics IOMMU mapping
  * active.
  */
 static bool
 intel_enable_ppgtt(struct drm_device *dev)
 {
 	if (i915_enable_ppgtt >= 0)
 		return i915_enable_ppgtt;
 
 #ifdef CONFIG_INTEL_IOMMU
 	/* Disable ppgtt on SNB if VT-d is on. */
 	if (INTEL_INFO(dev)->gen == 6 && intel_iommu_gfx_mapped)
 		return false;
 #endif
 
 	return true;
 }
 
 /*
  * Set up the global GTT (and the aliasing PPGTT when enabled and
  * supported), then initialise the GEM hardware state.
  *
  * Returns 0 on success or the error from aliasing PPGTT setup or
  * i915_gem_init_hw().  The DRM lock is taken and released internally.
  */
 int i915_gem_init(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	unsigned long gtt_size, mappable_size;
 	int ret;
 
 	/* Convert GTT entry counts to byte sizes. */
 	gtt_size = dev_priv->mm.gtt->gtt_total_entries << PAGE_SHIFT;
 	mappable_size = dev_priv->mm.gtt->gtt_mappable_entries << PAGE_SHIFT;
 
 	DRM_LOCK(dev);
 	if (intel_enable_ppgtt(dev) && HAS_ALIASING_PPGTT(dev)) {
 		/* PPGTT pdes are stolen from global gtt ptes, so shrink the
 		 * aperture accordingly when using aliasing ppgtt. */
 		gtt_size -= I915_PPGTT_PD_ENTRIES*PAGE_SIZE;
 
 		i915_gem_init_global_gtt(dev, 0, mappable_size, gtt_size);
 
 		ret = i915_gem_init_aliasing_ppgtt(dev);
 		if (ret) {
 			DRM_UNLOCK(dev);
 			return ret;
 		}
 	} else {
 		/* Let GEM Manage all of the aperture.
 		 *
 		 * However, leave one page at the end still bound to the scratch
 		 * page.  There are a number of places where the hardware
 		 * apparently prefetches past the end of the object, and we've
 		 * seen multiple hangs with the GPU head pointer stuck in a
 		 * batchbuffer bound at the last page of the aperture.  One page
 		 * should be enough to keep any prefetching inside of the
 		 * aperture.
 		 */
 		/* NOTE(review): the full gtt_size is passed here; presumably
 		 * the guard page is reserved by the callee - confirm. */
 		i915_gem_init_global_gtt(dev, 0, mappable_size,
 					 gtt_size);
 	}
 
 	ret = i915_gem_init_hw(dev);
 	DRM_UNLOCK(dev);
 	if (ret) {
 		i915_gem_cleanup_aliasing_ppgtt(dev);
 		return ret;
 	}
 
 	/* Allow hardware batchbuffers unless told otherwise, but not for KMS. */
 	if (!drm_core_check_feature(dev, DRIVER_MODESET))
 		dev_priv->dri1.allow_batchbuffer = 1;
 	return 0;
 }
 
 /*
  * Call intel_cleanup_ring_buffer() on every ring visited by
  * for_each_ring().
  */
 void
 i915_gem_cleanup_ringbuffer(struct drm_device *dev)
 {
 	drm_i915_private_t *dev_priv = dev->dev_private;
 	struct intel_ring_buffer *ring;
 	int i;
 
 	for_each_ring(ring, dev_priv, i)
 		intel_cleanup_ring_buffer(ring);
 }
 
 /*
  * Enter-VT ioctl (non-modesetting only): clear the suspended and wedged
  * state, re-initialise the GEM hardware state and install the IRQ handler.
  *
  * Returns 0 immediately when DRIVER_MODESET is set.  Otherwise returns 0 on
  * success or the error from i915_gem_init_hw() / drm_irq_install(); if IRQ
  * installation fails, the rings are cleaned up and GEM is marked suspended
  * again.
  */
 int
 i915_gem_entervt_ioctl(struct drm_device *dev, void *data,
 		       struct drm_file *file_priv)
 {
 	drm_i915_private_t *dev_priv = dev->dev_private;
 	int ret;
 
 	if (drm_core_check_feature(dev, DRIVER_MODESET))
 		return 0;
 
 	if (atomic_read(&dev_priv->mm.wedged)) {
 		DRM_ERROR("Reenabling wedged hardware, good luck\n");
 		atomic_set(&dev_priv->mm.wedged, 0);
 	}
 
 	DRM_LOCK(dev);
 	dev_priv->mm.suspended = 0;
 
 	ret = i915_gem_init_hw(dev);
 	if (ret != 0) {
 		DRM_UNLOCK(dev);
 		return ret;
 	}
 
 	/* No object may be active right after hardware initialisation. */
 	BUG_ON(!list_empty(&dev_priv->mm.active_list));
 	DRM_UNLOCK(dev);
 
 	ret = drm_irq_install(dev);
 	if (ret)
 		goto cleanup_ringbuffer;
 
 	return 0;
 
 cleanup_ringbuffer:
 	DRM_LOCK(dev);
 	i915_gem_cleanup_ringbuffer(dev);
 	dev_priv->mm.suspended = 1;
 	DRM_UNLOCK(dev);
 
 	return ret;
 }
 
 /*
  * Leave-VT ioctl: a no-op (returns 0) when DRIVER_MODESET is set; otherwise
  * uninstalls the IRQ handler and returns the result of i915_gem_idle().
  */
 int
 i915_gem_leavevt_ioctl(struct drm_device *dev, void *data,
 		       struct drm_file *file_priv)
 {
 	int status = 0;
 
 	if (!drm_core_check_feature(dev, DRIVER_MODESET)) {
 		drm_irq_uninstall(dev);
 		status = i915_gem_idle(dev);
 	}
 
 	return status;
 }
 
 /*
  * Last-close hook: when DRIVER_MODESET is not set, idle GEM and log an
  * error if that fails.  Does nothing under modesetting.
  */
 void
 i915_gem_lastclose(struct drm_device *dev)
 {
 	if (!drm_core_check_feature(dev, DRIVER_MODESET)) {
 		int err = i915_gem_idle(dev);
 
 		if (err != 0)
 			DRM_ERROR("failed to idle hardware: %d\n", err);
 	}
 }
 
 /* Initialise a ring's active-object and request lists as empty. */
 static void
 init_ring_lists(struct intel_ring_buffer *ring)
 {
 	INIT_LIST_HEAD(&ring->active_list);
 	INIT_LIST_HEAD(&ring->request_list);
 }
 
 /*
  * One-time GEM software setup: initialise the memory-manager, ring and
  * fence lists, the retire-work task and error completion, choose the number
  * of fence registers, reset the fences, detect bit-6 swizzling and register
  * the low-memory event handler.
  */
 void
 i915_gem_load(struct drm_device *dev)
 {
 	int i;
 	drm_i915_private_t *dev_priv = dev->dev_private;
 
 	INIT_LIST_HEAD(&dev_priv->mm.active_list);
 	INIT_LIST_HEAD(&dev_priv->mm.inactive_list);
 	INIT_LIST_HEAD(&dev_priv->mm.unbound_list);
 	INIT_LIST_HEAD(&dev_priv->mm.bound_list);
 	INIT_LIST_HEAD(&dev_priv->mm.fence_list);
 	for (i = 0; i < I915_NUM_RINGS; i++)
 		init_ring_lists(&dev_priv->ring[i]);
 	for (i = 0; i < I915_MAX_NUM_FENCES; i++)
 		INIT_LIST_HEAD(&dev_priv->fence_regs[i].lru_list);
 	TIMEOUT_TASK_INIT(dev_priv->wq, &dev_priv->mm.retire_work, 0,
 	    i915_gem_retire_work_handler, dev_priv);
 	init_completion(&dev_priv->error_completion);
 
 	/* On GEN3 we really need to make sure the ARB C3 LP bit is set */
 	if (IS_GEN3(dev)) {
 		I915_WRITE(MI_ARB_STATE,
 			   _MASKED_BIT_ENABLE(MI_ARB_C3_LP_WRITE_ENABLE));
 	}
 
 	dev_priv->relative_constants_mode = I915_EXEC_CONSTANTS_REL_GENERAL;
 
 	/* Old X drivers will take 0-2 for front, back, depth buffers */
 	if (!drm_core_check_feature(dev, DRIVER_MODESET))
 		dev_priv->fence_reg_start = 3;
 
 	/* 16 fence registers on gen4+, 945G/GM and G33; 8 otherwise. */
 	if (INTEL_INFO(dev)->gen >= 4 || IS_I945G(dev) || IS_I945GM(dev) || IS_G33(dev))
 		dev_priv->num_fence_regs = 16;
 	else
 		dev_priv->num_fence_regs = 8;
 
 	/* Initialize fence registers to zero */
 	i915_gem_reset_fences(dev);
 
 	i915_gem_detect_bit_6_swizzle(dev);
 	DRM_INIT_WAITQUEUE(&dev_priv->pending_flip_queue);
 
 	dev_priv->mm.interruptible = true;
 
 	/* Call i915_gem_inactive_shrink() on vm_lowmem events. */
 	dev_priv->mm.inactive_shrinker = EVENTHANDLER_REGISTER(vm_lowmem,
 	    i915_gem_inactive_shrink, dev, EVENTHANDLER_PRI_ANY);
 }
 
 /*
  * Create a physically contiguous memory object for this object
  * e.g. for cursor + overlay regs
  *
  * The slot is dev_priv->mm.phys_objs[id - 1].  Nothing is done (and 0 is
  * returned) when the slot is already populated or size is zero.
  *
  * Returns 0 on success, -ENOMEM when the descriptor or the contiguous
  * buffer cannot be allocated.
  */
 static int i915_gem_init_phys_object(struct drm_device *dev,
 				     int id, int size, int align)
 {
 	drm_i915_private_t *dev_priv = dev->dev_private;
 	struct drm_i915_gem_phys_object *phys_obj;
 	int ret;
 
 	if (dev_priv->mm.phys_objs[id - 1] || !size)
 		return 0;
 
 	phys_obj = malloc(sizeof(struct drm_i915_gem_phys_object),
 	    DRM_I915_GEM, M_WAITOK | M_ZERO);
 	if (!phys_obj)
 		return -ENOMEM;
 
 	phys_obj->id = id;
 
 	phys_obj->handle = drm_pci_alloc(dev, size, align, BUS_SPACE_MAXADDR);
 	if (!phys_obj->handle) {
 		ret = -ENOMEM;
 		goto kfree_obj;
 	}
 #ifdef CONFIG_X86
 	/*
 	 * pmap_change_attr() takes the length of the range in bytes, not
 	 * a page count, so pass the full size to make the whole buffer
 	 * write-combining rather than just its first page.
 	 */
 	pmap_change_attr((vm_offset_t)phys_obj->handle->vaddr,
 	    size, PAT_WRITE_COMBINING);
 #endif
 
 	dev_priv->mm.phys_objs[id - 1] = phys_obj;
 
 	return 0;
 kfree_obj:
 	free(phys_obj, DRM_I915_GEM);
 	return ret;
 }
 
 /*
  * Release the physical object in slot id (dev_priv->mm.phys_objs[id - 1]),
  * first detaching any GEM object currently bound to it.  Does nothing when
  * the slot is empty.
  */
 static void i915_gem_free_phys_object(struct drm_device *dev, int id)
 {
 	drm_i915_private_t *dev_priv = dev->dev_private;
 	struct drm_i915_gem_phys_object *phys_obj;
 
 	if (!dev_priv->mm.phys_objs[id - 1])
 		return;
 
 	phys_obj = dev_priv->mm.phys_objs[id - 1];
 	if (phys_obj->cur_obj) {
 		i915_gem_detach_phys_object(dev, phys_obj->cur_obj);
 	}
 
 	/* Restoring the write-back memory attribute is not ported yet. */
 #ifdef FREEBSD_WIP
 #ifdef CONFIG_X86
 	set_memory_wb((unsigned long)phys_obj->handle->vaddr, phys_obj->handle->size / PAGE_SIZE);
 #endif
 #endif /* FREEBSD_WIP */
 
 	drm_pci_free(dev, phys_obj->handle);
 	free(phys_obj, DRM_I915_GEM);
 	dev_priv->mm.phys_objs[id - 1] = NULL;
 }
 
 /*
  * Free every physical object slot, from I915_GEM_PHYS_CURSOR_0 up to and
  * including I915_MAX_PHYS_OBJECT.
  */
 void i915_gem_free_all_phys_object(struct drm_device *dev)
 {
 	int id = I915_GEM_PHYS_CURSOR_0;
 
 	while (id <= I915_MAX_PHYS_OBJECT) {
 		i915_gem_free_phys_object(dev, id);
 		id++;
 	}
 }
 
 /*
  * Detach a GEM object from its physical object: copy the contents of the
  * contiguous buffer back into the object's VM pages, flush, and clear the
  * links in both directions.  Does nothing if no physical object is
  * attached.
  */
 void i915_gem_detach_phys_object(struct drm_device *dev,
 				 struct drm_i915_gem_object *obj)
 {
 	struct sf_buf *sf;
 	char *vaddr;
 	char *dst;
 	int i;
 	int page_count;
 
 	if (!obj->phys_obj)
 		return;
 	vaddr = obj->phys_obj->handle->vaddr;
 
 	page_count = obj->base.size / PAGE_SIZE;
 	VM_OBJECT_WLOCK(obj->base.vm_obj);
 	for (i = 0; i < page_count; i++) {
 		/* Pages that cannot be wired are skipped (contents not copied). */
 		vm_page_t page = i915_gem_wire_page(obj->base.vm_obj, i, NULL);
 		if (page == NULL)
 			continue; /* XXX */
 
 		/* Drop the object lock while mapping and copying the page. */
 		VM_OBJECT_WUNLOCK(obj->base.vm_obj);
 		sf = sf_buf_alloc(page, 0);
 		if (sf != NULL) {
 			dst = (char *)sf_buf_kva(sf);
 			memcpy(dst, vaddr + IDX_TO_OFF(i), PAGE_SIZE);
 			sf_buf_free(sf);
 		}
 		drm_clflush_pages(&page, 1);
 
 		/* Retake the lock, mark the page dirty and drop the wiring. */
 		VM_OBJECT_WLOCK(obj->base.vm_obj);
 		vm_page_reference(page);
 		vm_page_lock(page);
 		vm_page_dirty(page);
 		vm_page_unwire(page, PQ_INACTIVE);
 		vm_page_unlock(page);
 		/* Undo the count taken by i915_gem_wire_page(). */
 		atomic_add_long(&i915_gem_wired_pages_cnt, -1);
 	}
 	VM_OBJECT_WUNLOCK(obj->base.vm_obj);
 	i915_gem_chipset_flush(dev);
 
 	obj->phys_obj->cur_obj = NULL;
 	obj->phys_obj = NULL;
 }
 
 /*
  * Attach a GEM object to the physical object in slot id, creating the
  * physical object on first use, and copy the object's current page contents
  * into the contiguous buffer.
  *
  * Returns 0 on success (including when the object is already attached to
  * this id), -EINVAL when id exceeds I915_MAX_PHYS_OBJECT, the error from
  * i915_gem_init_phys_object(), or -EIO when a page cannot be wired.
  *
  * NOTE(review): only the upper bound of id is checked; an id below 1 would
  * index phys_objs[] out of range - confirm callers never pass one.
  */
 int
 i915_gem_attach_phys_object(struct drm_device *dev,
 			    struct drm_i915_gem_object *obj,
 			    int id,
 			    int align)
 {
 	drm_i915_private_t *dev_priv = dev->dev_private;
 	struct sf_buf *sf;
 	char *dst, *src;
 	int ret = 0;
 	int page_count;
 	int i;
 
 	if (id > I915_MAX_PHYS_OBJECT)
 		return -EINVAL;
 
 	/* Already attached: nothing to do for the same id, else detach first. */
 	if (obj->phys_obj) {
 		if (obj->phys_obj->id == id)
 			return 0;
 		i915_gem_detach_phys_object(dev, obj);
 	}
 
 	/* create a new object */
 	if (!dev_priv->mm.phys_objs[id - 1]) {
 		ret = i915_gem_init_phys_object(dev, id,
 						obj->base.size, align);
 		if (ret) {
 			DRM_ERROR("failed to init phys object %d size: %zu\n",
 				  id, obj->base.size);
 			return ret;
 		}
 	}
 
 	/* bind to the object */
 	obj->phys_obj = dev_priv->mm.phys_objs[id - 1];
 	obj->phys_obj->cur_obj = obj;
 
 	page_count = obj->base.size / PAGE_SIZE;
 
 	VM_OBJECT_WLOCK(obj->base.vm_obj);
 	for (i = 0; i < page_count; i++) {
 		vm_page_t page = i915_gem_wire_page(obj->base.vm_obj, i, NULL);
 		if (page == NULL) {
 			/* NOTE(review): the object stays bound on this error path. */
 			ret = -EIO;
 			break;
 		}
 		/* Drop the object lock while mapping and copying the page. */
 		VM_OBJECT_WUNLOCK(obj->base.vm_obj);
 		/* NOTE(review): sf is used unchecked here, unlike in the
 		 * detach path - confirm sf_buf_alloc() cannot return NULL
 		 * with flags 0. */
 		sf = sf_buf_alloc(page, 0);
 		src = (char *)sf_buf_kva(sf);
 		dst = (char *)obj->phys_obj->handle->vaddr + IDX_TO_OFF(i);
 		memcpy(dst, src, PAGE_SIZE);
 		sf_buf_free(sf);
 
 		VM_OBJECT_WLOCK(obj->base.vm_obj);
 
 		vm_page_reference(page);
 		vm_page_lock(page);
 		vm_page_unwire(page, PQ_INACTIVE);
 		vm_page_unlock(page);
 		/* Undo the count taken by i915_gem_wire_page(). */
 		atomic_add_long(&i915_gem_wired_pages_cnt, -1);
 	}
 	VM_OBJECT_WUNLOCK(obj->base.vm_obj);
 
 	return ret;
 }
 
 /*
  * pwrite into an object backed by a physical object: copy args->size bytes
  * from user space to the contiguous buffer at args->offset, then flush the
  * chipset.
  *
  * The non-faulting copy is tried first with the DRM lock held; if it
  * reports a failure, the lock is dropped around a regular copy_from_user().
  * Returns 0 on success or -EFAULT if bytes remain uncopied.
  *
  * NOTE(review): args->offset and args->size are not range-checked here;
  * presumably the caller validates them - confirm.
  */
 static int
 i915_gem_phys_pwrite(struct drm_device *dev,
 		     struct drm_i915_gem_object *obj,
 		     struct drm_i915_gem_pwrite *args,
 		     struct drm_file *file_priv)
 {
 	void *vaddr = (char *)obj->phys_obj->handle->vaddr + args->offset;
 	char __user *user_data = to_user_ptr(args->data_ptr);
 
 	if (__copy_from_user_inatomic_nocache(vaddr, user_data, args->size)) {
 		unsigned long unwritten;
 
 		/* The physical object once assigned is fixed for the lifetime
 		 * of the obj, so we can safely drop the lock and continue
 		 * to access vaddr.
 		 */
 		DRM_UNLOCK(dev);
 		unwritten = copy_from_user(vaddr, user_data, args->size);
 		DRM_LOCK(dev);
 		if (unwritten)
 			return -EFAULT;
 	}
 
 	i915_gem_chipset_flush(dev);
 	return 0;
 }
 
 /*
  * Per-file release hook: unlink every outstanding request from the closing
  * client's request list and clear its back-pointer to the file private data.
  */
 void i915_gem_release(struct drm_device *dev, struct drm_file *file)
 {
 	struct drm_i915_file_private *file_priv = file->driver_priv;
 	struct drm_i915_gem_request *req;
 
 	/* Clean up our request list when the client is going away, so that
 	 * later retire_requests won't dereference our soon-to-be-gone
 	 * file_priv.
 	 */
 	mtx_lock(&file_priv->mm.lock);
 	for (;;) {
 		if (list_empty(&file_priv->mm.request_list))
 			break;
 
 		/* Always take the head; it is unlinked on every iteration. */
 		req = list_first_entry(&file_priv->mm.request_list,
 				       struct drm_i915_gem_request,
 				       client_list);
 		list_del(&req->client_list);
 		req->file_priv = NULL;
 	}
 	mtx_unlock(&file_priv->mm.lock);
 }
 
 /*
  * Low-memory event handler (registered for vm_lowmem in i915_gem_load()).
  * arg is the drm_device.  The handler gives up immediately if the device
  * struct lock cannot be taken without blocking.
  */
 static void
 i915_gem_inactive_shrink(void *arg)
 {
 	struct drm_device *dev = arg;
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	int pass1, pass2;
 
 	if (!sx_try_xlock(&dev->dev_struct_lock)) {
 		return;
 	}
 
 	CTR0(KTR_DRM, "gem_lowmem");
 
 	/* First purge, then shrink. */
 	pass1 = i915_gem_purge(dev_priv, -1);
 	pass2 = __i915_gem_shrink(dev_priv, -1, false);
 
 	/* Fall back to shrinking everything if the second pass yielded
 	 * no more than 1% of the first. */
 	if (pass2 <= pass1 / 100)
 		i915_gem_shrink_all(dev_priv);
 
 	/* Releases the lock taken by sx_try_xlock() above. */
 	DRM_UNLOCK(dev);
 }
 
 /*
  * Grab and wire the page at pindex in the given VM object, making sure its
  * contents are valid.
  *
  * A page that is not fully valid is read in through the pager when the
  * pager has it, and zero-filled otherwise.  If fresh is non-NULL it is set
  * to true when the page was read from the pager and to false in every other
  * successful case.
  *
  * The caller must hold the object's write lock (asserted).  Returns the
  * wired page and increments i915_gem_wired_pages_cnt, or returns NULL when
  * the pager read fails (the page is then unwired and freed).
  */
 static vm_page_t
 i915_gem_wire_page(vm_object_t object, vm_pindex_t pindex, bool *fresh)
 {
 	vm_page_t page;
 	int rv;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	page = vm_page_grab(object, pindex, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY |
 	    VM_ALLOC_WIRED);
 	if (page->valid != VM_PAGE_BITS_ALL) {
 		/* Exclusive-busy the page while its contents are filled in. */
 		vm_page_xbusy(page);
 		if (vm_pager_has_page(object, pindex, NULL, NULL)) {
 			rv = vm_pager_get_pages(object, &page, 1, NULL, NULL);
 			if (rv != VM_PAGER_OK) {
 				vm_page_lock(page);
 				vm_page_unwire(page, PQ_NONE);
 				vm_page_free(page);
 				vm_page_unlock(page);
 				return (NULL);
 			}
 			if (fresh != NULL)
 				*fresh = true;
 		} else {
 			/* No backing copy: hand out a clean, zeroed page. */
 			pmap_zero_page(page);
 			page->valid = VM_PAGE_BITS_ALL;
 			page->dirty = 0;
 			if (fresh != NULL)
 				*fresh = false;
 		}
 		vm_page_xunbusy(page);
 	} else if (fresh != NULL)
 		*fresh = false;
 	atomic_add_long(&i915_gem_wired_pages_cnt, 1);
 	return (page);
 }
Index: head/sys/dev/drm2/i915/intel_pm.c
===================================================================
--- head/sys/dev/drm2/i915/intel_pm.c	(revision 336913)
+++ head/sys/dev/drm2/i915/intel_pm.c	(revision 336914)
@@ -1,4510 +1,4509 @@
 /*
  * Copyright © 2012 Intel Corporation
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
  * to deal in the Software without restriction, including without limitation
  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  * and/or sell copies of the Software, and to permit persons to whom the
  * Software is furnished to do so, subject to the following conditions:
  *
  * The above copyright notice and this permission notice (including the next
  * paragraph) shall be included in all copies or substantial portions of the
  * Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  * IN THE SOFTWARE.
  *
  * Authors:
  *    Eugeni Dodonov <eugeni.dodonov@intel.com>
  *
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <dev/drm2/drmP.h>
 #include <dev/drm2/i915/i915_drv.h>
 #include <dev/drm2/i915/intel_drv.h>
 #include <sys/kdb.h>
 #include <machine/clock.h>
 
 #define FORCEWAKE_ACK_TIMEOUT_MS 2
 
 /* FBC, or Frame Buffer Compression, is a technique employed to compress the
  * framebuffer contents in-memory, aiming at reducing the required bandwidth
  * during in-memory transfers and, therefore, reduce the power packet.
  *
  * The benefits of FBC are mostly visible with solid backgrounds and
  * variation-less patterns.
  *
  * FBC-related functionality can be enabled by the means of the
  * i915.i915_enable_fbc parameter
  */
 
 static bool intel_crtc_active(struct drm_crtc *crtc)
 {
 	/* Be paranoid as we can arrive here with only partial
 	 * state retrieved from the hardware during setup.
 	 */
 	return to_intel_crtc(crtc)->active && crtc->fb && crtc->mode.clock;
 }
 
 /*
  * Disable frame buffer compression through FBC_CONTROL and wait for the
  * compressor to go idle.  Returns early if FBC is not enabled.
  */
 static void i8xx_disable_fbc(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	u32 fbc_ctl;
 
 	/* Disable compression */
 	fbc_ctl = I915_READ(FBC_CONTROL);
 	if ((fbc_ctl & FBC_CTL_EN) == 0)
 		return;
 
 	fbc_ctl &= ~FBC_CTL_EN;
 	I915_WRITE(FBC_CONTROL, fbc_ctl);
 
 	/* Wait for compressing bit to clear */
 	if (wait_for((I915_READ(FBC_STATUS) & FBC_STAT_COMPRESSING) == 0, 10)) {
 		DRM_DEBUG_KMS("FBC idle timed out\n");
 		return;
 	}
 
 	DRM_DEBUG_KMS("disabled FBC\n");
 }
 
 /*
  * Enable frame buffer compression for the given CRTC through the
  * FBC_CONTROL / FBC_CONTROL2 registers.
  *
  * The compressed pitch is the smaller of (cfb_size / FBC_LL_SIZE) and the
  * framebuffer pitch, expressed in 64-byte units minus one.  interval is
  * placed in the FBC_CONTROL interval field.
  */
 static void i8xx_enable_fbc(struct drm_crtc *crtc, unsigned long interval)
 {
 	struct drm_device *dev = crtc->dev;
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	struct drm_framebuffer *fb = crtc->fb;
 	struct intel_framebuffer *intel_fb = to_intel_framebuffer(fb);
 	struct drm_i915_gem_object *obj = intel_fb->obj;
 	struct intel_crtc *intel_crtc = to_intel_crtc(crtc);
 	int cfb_pitch;
 	int plane, i;
 	u32 fbc_ctl, fbc_ctl2;
 
 	cfb_pitch = dev_priv->cfb_size / FBC_LL_SIZE;
 	if (fb->pitches[0] < cfb_pitch)
 		cfb_pitch = fb->pitches[0];
 
 	/* FBC_CTL wants 64B units */
 	cfb_pitch = (cfb_pitch / 64) - 1;
 	plane = intel_crtc->plane == 0 ? FBC_CTL_PLANEA : FBC_CTL_PLANEB;
 
 	/* Clear old tags */
 	for (i = 0; i < (FBC_LL_SIZE / 32) + 1; i++)
 		I915_WRITE(FBC_TAG + (i * 4), 0);
 
 	/* Set it up... */
 	fbc_ctl2 = FBC_CTL_FENCE_DBL | FBC_CTL_IDLE_IMM | FBC_CTL_CPU_FENCE;
 	fbc_ctl2 |= plane;
 	I915_WRITE(FBC_CONTROL2, fbc_ctl2);
 	I915_WRITE(FBC_FENCE_OFF, crtc->y);
 
 	/* enable it... */
 	fbc_ctl = FBC_CTL_EN | FBC_CTL_PERIODIC;
 	if (IS_I945GM(dev))
 		fbc_ctl |= FBC_CTL_C3_IDLE; /* 945 needs special SR handling */
 	fbc_ctl |= (cfb_pitch & 0xff) << FBC_CTL_STRIDE_SHIFT;
 	/* NOTE(review): 0x2fff is not a contiguous bit mask (bit 12 is
 	 * excluded); possibly meant to be 0x3fff - confirm against the
 	 * register definition. */
 	fbc_ctl |= (interval & 0x2fff) << FBC_CTL_INTERVAL_SHIFT;
 	fbc_ctl |= obj->fence_reg;
 	I915_WRITE(FBC_CONTROL, fbc_ctl);
 
 	DRM_DEBUG_KMS("enabled FBC, pitch %d, yoff %d, plane %d, ",
 		      cfb_pitch, crtc->y, intel_crtc->plane);
 }
 
 static bool i8xx_fbc_enabled(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 
 	return I915_READ(FBC_CONTROL) & FBC_CTL_EN;
 }
 
 /*
  * Enable frame buffer compression for the given CRTC through the DPFC
  * registers.
  *
  * interval is programmed as the recompression timer count.  The control
  * word selects the plane, the 1x compression limit and the object's fence
  * register.
  */
 static void g4x_enable_fbc(struct drm_crtc *crtc, unsigned long interval)
 {
 	struct drm_device *dev = crtc->dev;
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	struct drm_framebuffer *fb = crtc->fb;
 	struct intel_framebuffer *intel_fb = to_intel_framebuffer(fb);
 	struct drm_i915_gem_object *obj = intel_fb->obj;
 	struct intel_crtc *intel_crtc = to_intel_crtc(crtc);
 	int plane = intel_crtc->plane == 0 ? DPFC_CTL_PLANEA : DPFC_CTL_PLANEB;
 	unsigned long stall_watermark = 200;
 	u32 dpfc_ctl;
 
 	dpfc_ctl = plane | DPFC_SR_EN | DPFC_CTL_LIMIT_1X;
 	dpfc_ctl |= DPFC_CTL_FENCE_EN | obj->fence_reg;
 	I915_WRITE(DPFC_CHICKEN, DPFC_HT_MODIFY);
 
 	I915_WRITE(DPFC_RECOMP_CTL, DPFC_RECOMP_STALL_EN |
 		   (stall_watermark << DPFC_RECOMP_STALL_WM_SHIFT) |
 		   (interval << DPFC_RECOMP_TIMER_COUNT_SHIFT));
 	I915_WRITE(DPFC_FENCE_YOFF, crtc->y);
 
 	/*
 	 * enable it...  Write the control word assembled above so that the
 	 * plane, limit and fence selection actually reach the hardware,
 	 * instead of only setting the enable bit on top of whatever the
 	 * register currently holds.
 	 */
 	I915_WRITE(DPFC_CONTROL, dpfc_ctl | DPFC_CTL_EN);
 
 	DRM_DEBUG_KMS("enabled fbc on plane %d\n", intel_crtc->plane);
 }
 
 /*
  * Clear the enable bit in DPFC_CONTROL.  Nothing is written when
  * compression is already off.
  */
 static void g4x_disable_fbc(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	u32 dpfc_ctl = I915_READ(DPFC_CONTROL);
 
 	/* Already disabled: leave the register alone. */
 	if ((dpfc_ctl & DPFC_CTL_EN) == 0)
 		return;
 
 	/* Disable compression */
 	I915_WRITE(DPFC_CONTROL, dpfc_ctl & ~DPFC_CTL_EN);
 
 	DRM_DEBUG_KMS("disabled FBC\n");
 }
 
 static bool g4x_fbc_enabled(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 
 	return I915_READ(DPFC_CONTROL) & DPFC_CTL_EN;
 }
 
 /*
  * Set the FBC-notify bit in GEN6_BLITTER_ECOSKPD.  The write is done in
  * three steps (set the bit shifted by GEN6_BLITTER_LOCK_SHIFT, set the bit
  * itself, clear the shifted bit again) while a GT force-wake reference is
  * held.
  */
 static void sandybridge_blit_fbc_update(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	u32 blt_ecoskpd;
 
 	/* Make sure blitter notifies FBC of writes */
 	gen6_gt_force_wake_get(dev_priv);
 	blt_ecoskpd = I915_READ(GEN6_BLITTER_ECOSKPD);
 	blt_ecoskpd |= GEN6_BLITTER_FBC_NOTIFY <<
 		GEN6_BLITTER_LOCK_SHIFT;
 	I915_WRITE(GEN6_BLITTER_ECOSKPD, blt_ecoskpd);
 	blt_ecoskpd |= GEN6_BLITTER_FBC_NOTIFY;
 	I915_WRITE(GEN6_BLITTER_ECOSKPD, blt_ecoskpd);
 	blt_ecoskpd &= ~(GEN6_BLITTER_FBC_NOTIFY <<
 			 GEN6_BLITTER_LOCK_SHIFT);
 	I915_WRITE(GEN6_BLITTER_ECOSKPD, blt_ecoskpd);
 	POSTING_READ(GEN6_BLITTER_ECOSKPD);
 	gen6_gt_force_wake_put(dev_priv);
 }
 
 /*
  * Enable frame buffer compression for the given CRTC through the ILK_DPFC
  * registers.  On gen6 the CPU fence registers are programmed as well and
  * the blitter is told to notify FBC of writes.
  *
  * interval is programmed as the recompression timer count.
  */
 static void ironlake_enable_fbc(struct drm_crtc *crtc, unsigned long interval)
 {
 	struct drm_device *dev = crtc->dev;
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	struct drm_framebuffer *fb = crtc->fb;
 	struct intel_framebuffer *intel_fb = to_intel_framebuffer(fb);
 	struct drm_i915_gem_object *obj = intel_fb->obj;
 	struct intel_crtc *intel_crtc = to_intel_crtc(crtc);
 	int plane = intel_crtc->plane == 0 ? DPFC_CTL_PLANEA : DPFC_CTL_PLANEB;
 	unsigned long stall_watermark = 200;
 	u32 dpfc_ctl;
 
 	/* Keep only the reserved bits of the current control value. */
 	dpfc_ctl = I915_READ(ILK_DPFC_CONTROL);
 	dpfc_ctl &= DPFC_RESERVED;
 	dpfc_ctl |= (plane | DPFC_CTL_LIMIT_1X);
 	/* Set persistent mode for front-buffer rendering, ala X. */
 	dpfc_ctl |= DPFC_CTL_PERSISTENT_MODE;
 	dpfc_ctl |= (DPFC_CTL_FENCE_EN | obj->fence_reg);
 	I915_WRITE(ILK_DPFC_CHICKEN, DPFC_HT_MODIFY);
 
 	I915_WRITE(ILK_DPFC_RECOMP_CTL, DPFC_RECOMP_STALL_EN |
 		   (stall_watermark << DPFC_RECOMP_STALL_WM_SHIFT) |
 		   (interval << DPFC_RECOMP_TIMER_COUNT_SHIFT));
 	I915_WRITE(ILK_DPFC_FENCE_YOFF, crtc->y);
 	I915_WRITE(ILK_FBC_RT_BASE, obj->gtt_offset | ILK_FBC_RT_VALID);
 	/* enable it... */
 	I915_WRITE(ILK_DPFC_CONTROL, dpfc_ctl | DPFC_CTL_EN);
 
 	if (IS_GEN6(dev)) {
 		I915_WRITE(SNB_DPFC_CTL_SA,
 			   SNB_CPU_FENCE_ENABLE | obj->fence_reg);
 		I915_WRITE(DPFC_CPU_FENCE_OFFSET, crtc->y);
 		sandybridge_blit_fbc_update(dev);
 	}
 
 	DRM_DEBUG_KMS("enabled fbc on plane %d\n", intel_crtc->plane);
 }
 
 /*
  * Clear the enable bit in ILK_DPFC_CONTROL.  Nothing is written when
  * compression is already off.
  */
 static void ironlake_disable_fbc(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	u32 dpfc_ctl = I915_READ(ILK_DPFC_CONTROL);
 
 	/* Already disabled: leave the register alone. */
 	if ((dpfc_ctl & DPFC_CTL_EN) == 0)
 		return;
 
 	/* Disable compression */
 	I915_WRITE(ILK_DPFC_CONTROL, dpfc_ctl & ~DPFC_CTL_EN);
 
 	DRM_DEBUG_KMS("disabled FBC\n");
 }
 
 static bool ironlake_fbc_enabled(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 
 	return I915_READ(ILK_DPFC_CONTROL) & DPFC_CTL_EN;
 }
 
 /*
  * Ask the per-platform hook whether FBC is enabled.  Returns false when the
  * platform provides no fbc_enabled hook.
  */
 bool intel_fbc_enabled(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 
 	if (dev_priv->display.fbc_enabled)
 		return dev_priv->display.fbc_enabled(dev);
 
 	return false;
 }
 
 /*
  * Deferred FBC enable task.  arg is the intel_fbc_work queued by
  * intel_enable_fbc(); it is freed here in all cases.
  *
  * FBC is enabled only if this work item is still the current one
  * (dev_priv->fbc_work) and the CRTC still scans out the framebuffer
  * recorded when the work was queued.
  */
 static void intel_fbc_work_fn(void *arg, int pending)
 {
 	struct intel_fbc_work *work = arg;
 	struct drm_device *dev = work->crtc->dev;
 	struct drm_i915_private *dev_priv = dev->dev_private;
 
 	DRM_LOCK(dev);
 	if (work == dev_priv->fbc_work) {
 		/* Double check that we haven't switched fb without cancelling
 		 * the prior work.
 		 */
 		if (work->crtc->fb == work->fb) {
 			dev_priv->display.enable_fbc(work->crtc,
 						     work->interval);
 
 			/* Record what FBC was enabled for. */
 			dev_priv->cfb_plane = to_intel_crtc(work->crtc)->plane;
 			dev_priv->cfb_fb = work->crtc->fb->base.id;
 			dev_priv->cfb_y = work->crtc->y;
 		}
 
 		dev_priv->fbc_work = NULL;
 	}
 	DRM_UNLOCK(dev);
 
 	free(work, DRM_MEM_KMS);
 }
 
 /*
  * Cancel a pending deferred FBC enable, if any.  The work item is freed
  * here only when the cancellation reports success (return value 0);
  * otherwise the task function frees it.  dev_priv->fbc_work is cleared in
  * both cases.
  */
 static void intel_cancel_fbc_work(struct drm_i915_private *dev_priv)
 {
 	if (dev_priv->fbc_work == NULL)
 		return;
 
 	DRM_DEBUG_KMS("cancelling pending FBC enable\n");
 
 	/* Synchronisation is provided by struct_mutex and checking of
 	 * dev_priv->fbc_work, so we can perform the cancellation
 	 * entirely asynchronously.
 	 */
 	if (taskqueue_cancel_timeout(dev_priv->wq, &dev_priv->fbc_work->work,
 	    NULL) == 0)
 		/* tasklet was killed before being run, clean up */
 		free(dev_priv->fbc_work, DRM_MEM_KMS);
 
 	/* Mark the work as no longer wanted so that if it does
 	 * wake-up (because the work was already running and waiting
 	 * for our mutex), it will discover that is no longer
 	 * necessary to run.
 	 */
 	dev_priv->fbc_work = NULL;
 }
 
 /*
  * Schedule FBC to be enabled on the given CRTC after a 50 ms delay.
  *
  * Any previously scheduled enable is cancelled first.  Does nothing when
  * the platform has no enable_fbc hook.  If the work item cannot be
  * allocated, FBC is enabled immediately instead.
  */
 void intel_enable_fbc(struct drm_crtc *crtc, unsigned long interval)
 {
 	struct intel_fbc_work *work;
 	struct drm_device *dev = crtc->dev;
 	struct drm_i915_private *dev_priv = dev->dev_private;
 
 	if (!dev_priv->display.enable_fbc)
 		return;
 
 	intel_cancel_fbc_work(dev_priv);
 
 	work = malloc(sizeof *work, DRM_MEM_KMS, M_WAITOK | M_ZERO);
 	if (work == NULL) {
 		dev_priv->display.enable_fbc(crtc, interval);
 		return;
 	}
 
 	/* Remember the CRTC and framebuffer so the task can detect a change. */
 	work->crtc = crtc;
 	work->fb = crtc->fb;
 	work->interval = interval;
 	TIMEOUT_TASK_INIT(dev_priv->wq, &work->work, 0, intel_fbc_work_fn,
 	    work);
 
 	dev_priv->fbc_work = work;
 
 	DRM_DEBUG_KMS("scheduling delayed FBC enable\n");
 
 	/* Delay the actual enabling to let pageflipping cease and the
 	 * display to settle before starting the compression. Note that
 	 * this delay also serves a second purpose: it allows for a
 	 * vblank to pass after disabling the FBC before we attempt
 	 * to modify the control registers.
 	 *
 	 * A more complicated solution would involve tracking vblanks
 	 * following the termination of the page-flipping sequence
 	 * and indeed performing the enable as a co-routine and not
 	 * waiting synchronously upon the vblank.
 	 */
 	taskqueue_enqueue_timeout(dev_priv->wq, &work->work,
 	    msecs_to_jiffies(50));
 }
 
 /*
  * Cancel any pending deferred enable and, when the platform provides a
  * disable_fbc hook, disable FBC and mark no plane as compressed.
  */
 void intel_disable_fbc(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 
 	/* The pending work is cancelled even without a disable hook. */
 	intel_cancel_fbc_work(dev_priv);
 
 	if (dev_priv->display.disable_fbc) {
 		dev_priv->display.disable_fbc(dev);
 		dev_priv->cfb_plane = -1;
 	}
 }
 
 /**
  * intel_update_fbc - enable/disable FBC as needed
  * @dev: the drm_device
  *
  * Set up the framebuffer compression hardware at mode set time.  We
  * enable it if possible:
  *   - plane A only (on pre-965)
  *   - no pixel multiply/line duplication
  *   - no alpha buffer discard
  *   - no dual wide
  *   - framebuffer <= 2048 in width, 1536 in height
  *
  * We can't assume that any compression will take place (worst case),
  * so the compressed buffer has to be the same size as the uncompressed
  * one.  It also must reside (along with the line length buffer) in
  * stolen memory.
  *
  * We need to enable/disable FBC on a global basis.
  */
 void intel_update_fbc(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	struct drm_crtc *crtc = NULL, *tmp_crtc;
 	struct intel_crtc *intel_crtc;
 	struct drm_framebuffer *fb;
 	struct intel_framebuffer *intel_fb;
 	struct drm_i915_gem_object *obj;
 	int enable_fbc;
 
 	if (!i915_powersave)
 		return;
 
 	if (!I915_HAS_FBC(dev))
 		return;
 
 	/*
 	 * If FBC is already on, we just have to verify that we can
 	 * keep it that way...
 	 * Need to disable if:
 	 *   - more than one pipe is active
 	 *   - changing FBC params (stride, fence, mode)
 	 *   - new fb is too large to fit in compressed buffer
 	 *   - going to an unsupported config (interlace, pixel multiply, etc.)
 	 */
 	list_for_each_entry(tmp_crtc, &dev->mode_config.crtc_list, head) {
 		if (intel_crtc_active(tmp_crtc) &&
 		    !to_intel_crtc(tmp_crtc)->primary_disabled) {
 			if (crtc) {
 				DRM_DEBUG_KMS("more than one pipe active, disabling compression\n");
 				dev_priv->no_fbc_reason = FBC_MULTIPLE_PIPES;
 				goto out_disable;
 			}
 			crtc = tmp_crtc;
 		}
 	}
 
 	if (!crtc || crtc->fb == NULL) {
 		DRM_DEBUG_KMS("no output, disabling\n");
 		dev_priv->no_fbc_reason = FBC_NO_OUTPUT;
 		goto out_disable;
 	}
 
 	intel_crtc = to_intel_crtc(crtc);
 	fb = crtc->fb;
 	intel_fb = to_intel_framebuffer(fb);
 	obj = intel_fb->obj;
 
 	enable_fbc = i915_enable_fbc;
 	if (enable_fbc < 0) {
 		DRM_DEBUG_KMS("fbc set to per-chip default\n");
 		enable_fbc = 1;
 		if (INTEL_INFO(dev)->gen <= 6)
 			enable_fbc = 0;
 	}
 	if (!enable_fbc) {
 		DRM_DEBUG_KMS("fbc disabled per module param\n");
 		dev_priv->no_fbc_reason = FBC_MODULE_PARAM;
 		goto out_disable;
 	}
 	if (intel_fb->obj->base.size > dev_priv->cfb_size) {
 		DRM_DEBUG_KMS("framebuffer too large, disabling "
 			      "compression\n");
 		dev_priv->no_fbc_reason = FBC_STOLEN_TOO_SMALL;
 		goto out_disable;
 	}
 	if ((crtc->mode.flags & DRM_MODE_FLAG_INTERLACE) ||
 	    (crtc->mode.flags & DRM_MODE_FLAG_DBLSCAN)) {
 		DRM_DEBUG_KMS("mode incompatible with compression, "
 			      "disabling\n");
 		dev_priv->no_fbc_reason = FBC_UNSUPPORTED_MODE;
 		goto out_disable;
 	}
 	if ((crtc->mode.hdisplay > 2048) ||
 	    (crtc->mode.vdisplay > 1536)) {
 		DRM_DEBUG_KMS("mode too large for compression, disabling\n");
 		dev_priv->no_fbc_reason = FBC_MODE_TOO_LARGE;
 		goto out_disable;
 	}
 	if ((IS_I915GM(dev) || IS_I945GM(dev)) && intel_crtc->plane != 0) {
 		DRM_DEBUG_KMS("plane not 0, disabling compression\n");
 		dev_priv->no_fbc_reason = FBC_BAD_PLANE;
 		goto out_disable;
 	}
 
 	/* The use of a CPU fence is mandatory in order to detect writes
 	 * by the CPU to the scanout and trigger updates to the FBC.
 	 */
 	if (obj->tiling_mode != I915_TILING_X ||
 	    obj->fence_reg == I915_FENCE_REG_NONE) {
 		DRM_DEBUG_KMS("framebuffer not tiled or fenced, disabling compression\n");
 		dev_priv->no_fbc_reason = FBC_NOT_TILED;
 		goto out_disable;
 	}
 
 	/* If the kernel debugger is active, always disable compression */
 	if (kdb_active)
 		goto out_disable;
 
 	/* If the scanout has not changed, don't modify the FBC settings.
 	 * Note that we make the fundamental assumption that the fb->obj
 	 * cannot be unpinned (and have its GTT offset and fence revoked)
 	 * without first being decoupled from the scanout and FBC disabled.
 	 */
 	if (dev_priv->cfb_plane == intel_crtc->plane &&
 	    dev_priv->cfb_fb == fb->base.id &&
 	    dev_priv->cfb_y == crtc->y)
 		return;
 
 	if (intel_fbc_enabled(dev)) {
 		/* We update FBC along two paths, after changing fb/crtc
 		 * configuration (modeswitching) and after page-flipping
 		 * finishes. For the latter, we know that not only did
 		 * we disable the FBC at the start of the page-flip
 		 * sequence, but also more than one vblank has passed.
 		 *
 		 * For the former case of modeswitching, it is possible
 		 * to switch between two FBC valid configurations
 		 * instantaneously so we do need to disable the FBC
 		 * before we can modify its control registers. We also
 		 * have to wait for the next vblank for that to take
 		 * effect. However, since we delay enabling FBC we can
 		 * assume that a vblank has passed since disabling and
 		 * that we can safely alter the registers in the deferred
 		 * callback.
 		 *
 		 * In the scenario that we go from a valid to invalid
 		 * and then back to valid FBC configuration we have
 		 * no strict enforcement that a vblank occurred since
 		 * disabling the FBC. However, along all current pipe
 		 * disabling paths we do need to wait for a vblank at
 		 * some point. And we wait before enabling FBC anyway.
 		 */
 		DRM_DEBUG_KMS("disabling active FBC for update\n");
 		intel_disable_fbc(dev);
 	}
 
 	intel_enable_fbc(crtc, 500);
 	return;
 
 out_disable:
 	/* Multiple disables should be harmless */
 	if (intel_fbc_enabled(dev)) {
 		DRM_DEBUG_KMS("unsupported config, disabling FBC\n");
 		intel_disable_fbc(dev);
 	}
 }
 
 /*
  * Decode the CLKCFG register into dev_priv->fsb_freq and
  * dev_priv->mem_freq, then record in dev_priv->is_ddr3 whether the DDR3
  * bit of CSHRDDR3CTL is set.  An FSB or memory encoding that matches no
  * known case leaves the corresponding field untouched.
  */
 static void i915_pineview_get_mem_freq(struct drm_device *dev)
 {
 	drm_i915_private_t *dev_priv = dev->dev_private;
 	u32 clkcfg, ddr3ctl;
 
 	clkcfg = I915_READ(CLKCFG);
 
 	switch (clkcfg & CLKCFG_FSB_MASK) {
 	case CLKCFG_FSB_400:
 		dev_priv->fsb_freq = 400; /* 100*4 */
 		break;
 	case CLKCFG_FSB_533:
 		dev_priv->fsb_freq = 533; /* 133*4 */
 		break;
 	case CLKCFG_FSB_667:
 		dev_priv->fsb_freq = 667; /* 167*4 */
 		break;
 	case CLKCFG_FSB_800:
 		dev_priv->fsb_freq = 800; /* 200*4 */
 		break;
 	default:
 		/* unknown encoding: keep the current value */
 		break;
 	}
 
 	switch (clkcfg & CLKCFG_MEM_MASK) {
 	case CLKCFG_MEM_533:
 		dev_priv->mem_freq = 533;
 		break;
 	case CLKCFG_MEM_667:
 		dev_priv->mem_freq = 667;
 		break;
 	case CLKCFG_MEM_800:
 		dev_priv->mem_freq = 800;
 		break;
 	default:
 		/* unknown encoding: keep the current value */
 		break;
 	}
 
 	/* detect pineview DDR3 setting */
 	ddr3ctl = I915_READ(CSHRDDR3CTL);
 	if (ddr3ctl & CSHRDDR3CTL_DDR3)
 		dev_priv->is_ddr3 = 1;
 	else
 		dev_priv->is_ddr3 = 0;
 }
 
 /*
  * Read DDRMPLL1 and CSIPLL0 and translate them into dev_priv->mem_freq
  * and dev_priv->fsb_freq (0 when the encoding is not recognised).  Also
  * seeds two IPS fields: ips.r_t is set to the memory frequency and
  * ips.c_m to 0, 1 or 2 depending on the FSB frequency band.
  */
 static void i915_ironlake_get_mem_freq(struct drm_device *dev)
 {
 	drm_i915_private_t *dev_priv = dev->dev_private;
 	u16 ddrpll, csipll;
 	int mem_sel, fsb_sel;
 
 	ddrpll = I915_READ16(DDRMPLL1);
 	csipll = I915_READ16(CSIPLL0);
 
 	/* Only the low 8 bits select the memory frequency. */
 	mem_sel = ddrpll & 0xff;
 	switch (mem_sel) {
 	case 0xc:
 		dev_priv->mem_freq = 800;
 		break;
 	case 0x10:
 		dev_priv->mem_freq = 1066;
 		break;
 	case 0x14:
 		dev_priv->mem_freq = 1333;
 		break;
 	case 0x18:
 		dev_priv->mem_freq = 1600;
 		break;
 	default:
 		DRM_DEBUG_DRIVER("unknown memory frequency 0x%02x\n",
 				 mem_sel);
 		dev_priv->mem_freq = 0;
 		break;
 	}
 
 	dev_priv->ips.r_t = dev_priv->mem_freq;
 
 	/* Only the low 10 bits select the FSB frequency. */
 	fsb_sel = csipll & 0x3ff;
 	switch (fsb_sel) {
 	case 0x00c:
 		dev_priv->fsb_freq = 3200;
 		break;
 	case 0x00e:
 		dev_priv->fsb_freq = 3733;
 		break;
 	case 0x010:
 		dev_priv->fsb_freq = 4266;
 		break;
 	case 0x012:
 		dev_priv->fsb_freq = 4800;
 		break;
 	case 0x014:
 		dev_priv->fsb_freq = 5333;
 		break;
 	case 0x016:
 		dev_priv->fsb_freq = 5866;
 		break;
 	case 0x018:
 		dev_priv->fsb_freq = 6400;
 		break;
 	default:
 		DRM_DEBUG_DRIVER("unknown fsb frequency 0x%04x\n",
 				 fsb_sel);
 		dev_priv->fsb_freq = 0;
 		break;
 	}
 
 	/*
 	 * Band the FSB frequency: exactly 3200 -> 0, (3200, 4800] -> 1,
 	 * everything else (including the "unknown" value 0) -> 2.
 	 */
 	if (dev_priv->fsb_freq == 3200)
 		dev_priv->ips.c_m = 0;
 	else if (dev_priv->fsb_freq > 3200 && dev_priv->fsb_freq <= 4800)
 		dev_priv->ips.c_m = 1;
 	else
 		dev_priv->ips.c_m = 2;
 }
 
 /*
  * CxSR latency lookup table, searched by intel_get_cxsr_latency() on
  * (is_desktop, is_ddr3, fsb_freq, mem_freq).  The initializers are
  * positional; the first four columns are presumably those lookup keys and
  * the last four the display_sr, cursor_sr, display_hpll_disable and
  * cursor_hpll_disable latencies -- confirm against struct cxsr_latency,
  * which is declared elsewhere.
  *
  * NOTE(review): in every row but one the sixth and eighth columns equal
  * the fifth and seventh plus 30000.  The first row of the
  * {0, 0, 667, ...} group has 4103 / 34106, which breaks that pattern and
  * may be a typo for 34103 -- verify against the hardware documentation
  * before changing it.
  */
 static const struct cxsr_latency cxsr_latency_table[] = {
 	{1, 0, 800, 400, 3382, 33382, 3983, 33983},    /* DDR2-400 SC */
 	{1, 0, 800, 667, 3354, 33354, 3807, 33807},    /* DDR2-667 SC */
 	{1, 0, 800, 800, 3347, 33347, 3763, 33763},    /* DDR2-800 SC */
 	{1, 1, 800, 667, 6420, 36420, 6873, 36873},    /* DDR3-667 SC */
 	{1, 1, 800, 800, 5902, 35902, 6318, 36318},    /* DDR3-800 SC */
 
 	{1, 0, 667, 400, 3400, 33400, 4021, 34021},    /* DDR2-400 SC */
 	{1, 0, 667, 667, 3372, 33372, 3845, 33845},    /* DDR2-667 SC */
 	{1, 0, 667, 800, 3386, 33386, 3822, 33822},    /* DDR2-800 SC */
 	{1, 1, 667, 667, 6438, 36438, 6911, 36911},    /* DDR3-667 SC */
 	{1, 1, 667, 800, 5941, 35941, 6377, 36377},    /* DDR3-800 SC */
 
 	{1, 0, 400, 400, 3472, 33472, 4173, 34173},    /* DDR2-400 SC */
 	{1, 0, 400, 667, 3443, 33443, 3996, 33996},    /* DDR2-667 SC */
 	{1, 0, 400, 800, 3430, 33430, 3946, 33946},    /* DDR2-800 SC */
 	{1, 1, 400, 667, 6509, 36509, 7062, 37062},    /* DDR3-667 SC */
 	{1, 1, 400, 800, 5985, 35985, 6501, 36501},    /* DDR3-800 SC */
 
 	{0, 0, 800, 400, 3438, 33438, 4065, 34065},    /* DDR2-400 SC */
 	{0, 0, 800, 667, 3410, 33410, 3889, 33889},    /* DDR2-667 SC */
 	{0, 0, 800, 800, 3403, 33403, 3845, 33845},    /* DDR2-800 SC */
 	{0, 1, 800, 667, 6476, 36476, 6955, 36955},    /* DDR3-667 SC */
 	{0, 1, 800, 800, 5958, 35958, 6400, 36400},    /* DDR3-800 SC */
 
 	{0, 0, 667, 400, 3456, 33456, 4103, 34106},    /* DDR2-400 SC */
 	{0, 0, 667, 667, 3428, 33428, 3927, 33927},    /* DDR2-667 SC */
 	{0, 0, 667, 800, 3443, 33443, 3905, 33905},    /* DDR2-800 SC */
 	{0, 1, 667, 667, 6494, 36494, 6993, 36993},    /* DDR3-667 SC */
 	{0, 1, 667, 800, 5998, 35998, 6460, 36460},    /* DDR3-800 SC */
 
 	{0, 0, 400, 400, 3528, 33528, 4255, 34255},    /* DDR2-400 SC */
 	{0, 0, 400, 667, 3500, 33500, 4079, 34079},    /* DDR2-667 SC */
 	{0, 0, 400, 800, 3487, 33487, 4029, 34029},    /* DDR2-800 SC */
 	{0, 1, 400, 667, 6566, 36566, 7145, 37145},    /* DDR3-667 SC */
 	{0, 1, 400, 800, 6042, 36042, 6584, 36584},    /* DDR3-800 SC */
 };
 
 /*
  * Look up the cxsr_latency_table row matching the given platform kind,
  * memory type and FSB/memory frequencies.
  *
  * Returns a pointer to the matching table entry, or NULL when either
  * frequency is zero or no row matches (the latter also logs a debug
  * message).
  */
 static const struct cxsr_latency *intel_get_cxsr_latency(int is_desktop,
 							 int is_ddr3,
 							 int fsb,
 							 int mem)
 {
 	const struct cxsr_latency *entry;
 	int idx;
 
 	/* Reject zero frequencies up front; no table row uses them. */
 	if (fsb == 0 || mem == 0)
 		return NULL;
 
 	for (idx = 0; idx < ARRAY_SIZE(cxsr_latency_table); idx++) {
 		entry = &cxsr_latency_table[idx];
 
 		if (is_desktop != entry->is_desktop)
 			continue;
 		if (is_ddr3 != entry->is_ddr3)
 			continue;
 		if (fsb != entry->fsb_freq)
 			continue;
 		if (mem != entry->mem_freq)
 			continue;
 
 		return entry;
 	}
 
 	DRM_DEBUG_KMS("Unknown FSB/MEM found, disable CxSR\n");
 
 	return NULL;
 }
 
 static void pineview_disable_cxsr(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 
 	/* deactivate cxsr */
 	I915_WRITE(DSPFW3, I915_READ(DSPFW3) & ~PINEVIEW_SELF_REFRESH_EN);
 }
 
 /*
  * Latency for FIFO fetches is dependent on several factors:
  *   - memory configuration (speed, channels)
  *   - chipset
  *   - current MCH state
  * It can be fairly high in some situations, so here we assume a fairly
  * pessimal value.  It's a tradeoff between extra memory fetches (if we
  * set this value too high, the FIFO will fetch frequently to stay full)
  * and power consumption (set it too low to save power and we might see
  * FIFO underruns and display "flicker").
  *
  * A value of 5us seems to be a good balance; safe for very low end
  * platforms but not overly aggressive on lower latency configs.
  *
  * The value is expressed in nanoseconds (5000 ns == 5 us) and is passed
  * as the display and cursor latency to the watermark helpers below.
  */
 static const int latency_ns = 5000;
 
 /*
  * Derive a plane's FIFO size from the DSPARB register.
  *
  * Plane 0 gets the low 7 bits; any other plane gets the 7-bit field at
  * DSPARB_CSTART_SHIFT minus plane 0's value.  The result is logged and
  * returned.
  */
 static int i9xx_get_fifo_size(struct drm_device *dev, int plane)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	uint32_t dsparb = I915_READ(DSPARB);
 	int plane_a_size = dsparb & 0x7f;
 	int size = plane_a_size;
 
 	if (plane != 0)
 		size = ((dsparb >> DSPARB_CSTART_SHIFT) & 0x7f) - plane_a_size;
 
 	DRM_DEBUG_KMS("FIFO size - (0x%08x) %s: %d\n", dsparb,
 		      plane ? "B" : "A", size);
 
 	return size;
 }
 
 /*
  * Derive a plane's FIFO size from the DSPARB register.
  *
  * Plane 0 gets the low 9 bits; any other plane gets the 9-bit field at
  * DSPARB_BEND_SHIFT minus plane 0's value.  The value is then halved to
  * convert it to cachelines, logged and returned.
  */
 static int i85x_get_fifo_size(struct drm_device *dev, int plane)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	uint32_t dsparb = I915_READ(DSPARB);
 	int plane_a_size = dsparb & 0x1ff;
 	int size = plane_a_size;
 
 	if (plane != 0)
 		size = ((dsparb >> DSPARB_BEND_SHIFT) & 0x1ff) - plane_a_size;
 
 	/* Convert to cachelines */
 	size = size >> 1;
 
 	DRM_DEBUG_KMS("FIFO size - (0x%08x) %s: %d\n", dsparb,
 		      plane ? "B" : "A", size);
 
 	return size;
 }
 
 /*
  * Return the FIFO size from the low 7 bits of DSPARB, divided by four to
  * convert to cachelines.  @plane only affects the label in the debug
  * message; the returned size is the same for every plane.
  */
 static int i845_get_fifo_size(struct drm_device *dev, int plane)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	uint32_t dsparb = I915_READ(DSPARB);
 	int raw = dsparb & 0x7f;
 	int size = raw >> 2;	/* Convert to cachelines */
 
 	DRM_DEBUG_KMS("FIFO size - (0x%08x) %s: %d\n", dsparb,
 		      plane ? "B" : "A",
 		      size);
 
 	return size;
 }
 
 /*
  * Return the FIFO size from the low 7 bits of DSPARB, halved to convert
  * to cachelines.  @plane only affects the label in the debug message; the
  * returned size is the same for every plane.
  */
 static int i830_get_fifo_size(struct drm_device *dev, int plane)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	uint32_t dsparb = I915_READ(DSPARB);
 	int raw = dsparb & 0x7f;
 	int size = raw >> 1;	/* Convert to cachelines */
 
 	DRM_DEBUG_KMS("FIFO size - (0x%08x) %s: %d\n", dsparb,
 		      plane ? "B" : "A", size);
 
 	return size;
 }
 
 /* Pineview has different values for various configs */
 /*
  * All intel_watermark_params tables use positional initializers.  The
  * members read elsewhere in this file are fifo_size, max_wm, default_wm,
  * guard_size and cacheline_size; the values are presumably listed in that
  * order -- confirm against the struct declaration, which is not in this
  * part of the file.
  */
 static const struct intel_watermark_params pineview_display_wm = {
 	PINEVIEW_DISPLAY_FIFO,
 	PINEVIEW_MAX_WM,
 	PINEVIEW_DFT_WM,
 	PINEVIEW_GUARD_WM,
 	PINEVIEW_FIFO_LINE_SIZE
 };
 /* Same as pineview_display_wm except for the third value (HPLL-off default). */
 static const struct intel_watermark_params pineview_display_hplloff_wm = {
 	PINEVIEW_DISPLAY_FIFO,
 	PINEVIEW_MAX_WM,
 	PINEVIEW_DFT_HPLLOFF_WM,
 	PINEVIEW_GUARD_WM,
 	PINEVIEW_FIFO_LINE_SIZE
 };
 static const struct intel_watermark_params pineview_cursor_wm = {
 	PINEVIEW_CURSOR_FIFO,
 	PINEVIEW_CURSOR_MAX_WM,
 	PINEVIEW_CURSOR_DFT_WM,
 	PINEVIEW_CURSOR_GUARD_WM,
 	PINEVIEW_FIFO_LINE_SIZE,
 };
 /* Currently value-for-value identical to pineview_cursor_wm. */
 static const struct intel_watermark_params pineview_cursor_hplloff_wm = {
 	PINEVIEW_CURSOR_FIFO,
 	PINEVIEW_CURSOR_MAX_WM,
 	PINEVIEW_CURSOR_DFT_WM,
 	PINEVIEW_CURSOR_GUARD_WM,
 	PINEVIEW_FIFO_LINE_SIZE
 };
 /*
  * Watermark parameter tables for G4x, Valleyview, i965 and the older
  * i9xx/i8xx parts.  Initializers are positional; see the struct
  * intel_watermark_params declaration for the member order.
  */
 /* G4x plane: the second and third values are both G4X_MAX_WM. */
 static const struct intel_watermark_params g4x_wm_info = {
 	G4X_FIFO_SIZE,
 	G4X_MAX_WM,
 	G4X_MAX_WM,
 	2,
 	G4X_FIFO_LINE_SIZE,
 };
 /* G4x cursor: reuses the I965 cursor constants with the G4x line size. */
 static const struct intel_watermark_params g4x_cursor_wm_info = {
 	I965_CURSOR_FIFO,
 	I965_CURSOR_MAX_WM,
 	I965_CURSOR_DFT_WM,
 	2,
 	G4X_FIFO_LINE_SIZE,
 };
 /* Valleyview plane: uses the G4x FIFO line size. */
 static const struct intel_watermark_params valleyview_wm_info = {
 	VALLEYVIEW_FIFO_SIZE,
 	VALLEYVIEW_MAX_WM,
 	VALLEYVIEW_MAX_WM,
 	2,
 	G4X_FIFO_LINE_SIZE,
 };
 /* Valleyview cursor: I965 cursor FIFO/default with a Valleyview maximum. */
 static const struct intel_watermark_params valleyview_cursor_wm_info = {
 	I965_CURSOR_FIFO,
 	VALLEYVIEW_CURSOR_MAX_WM,
 	I965_CURSOR_DFT_WM,
 	2,
 	G4X_FIFO_LINE_SIZE,
 };
 /* i965 cursor: read field-by-field by i965_update_wm(). */
 static const struct intel_watermark_params i965_cursor_wm_info = {
 	I965_CURSOR_FIFO,
 	I965_CURSOR_MAX_WM,
 	I965_CURSOR_DFT_WM,
 	2,
 	I915_FIFO_LINE_SIZE,
 };
 /* The four tables below differ only in FIFO size and FIFO line size. */
 static const struct intel_watermark_params i945_wm_info = {
 	I945_FIFO_SIZE,
 	I915_MAX_WM,
 	1,
 	2,
 	I915_FIFO_LINE_SIZE
 };
 static const struct intel_watermark_params i915_wm_info = {
 	I915_FIFO_SIZE,
 	I915_MAX_WM,
 	1,
 	2,
 	I915_FIFO_LINE_SIZE
 };
 static const struct intel_watermark_params i855_wm_info = {
 	I855GM_FIFO_SIZE,
 	I915_MAX_WM,
 	1,
 	2,
 	I830_FIFO_LINE_SIZE
 };
 static const struct intel_watermark_params i830_wm_info = {
 	I830_FIFO_SIZE,
 	I915_MAX_WM,
 	1,
 	2,
 	I830_FIFO_LINE_SIZE
 };
 
 /*
  * Ironlake watermark parameters: one table each for the display plane and
  * cursor, plus "srwm" variants (presumably the self-refresh watermarks --
  * confirm against their users).  Initializers are positional; all four
  * share a fourth value of 2 and ILK_FIFO_LINE_SIZE.
  */
 static const struct intel_watermark_params ironlake_display_wm_info = {
 	ILK_DISPLAY_FIFO,
 	ILK_DISPLAY_MAXWM,
 	ILK_DISPLAY_DFTWM,
 	2,
 	ILK_FIFO_LINE_SIZE
 };
 static const struct intel_watermark_params ironlake_cursor_wm_info = {
 	ILK_CURSOR_FIFO,
 	ILK_CURSOR_MAXWM,
 	ILK_CURSOR_DFTWM,
 	2,
 	ILK_FIFO_LINE_SIZE
 };
 static const struct intel_watermark_params ironlake_display_srwm_info = {
 	ILK_DISPLAY_SR_FIFO,
 	ILK_DISPLAY_MAX_SRWM,
 	ILK_DISPLAY_DFT_SRWM,
 	2,
 	ILK_FIFO_LINE_SIZE
 };
 static const struct intel_watermark_params ironlake_cursor_srwm_info = {
 	ILK_CURSOR_SR_FIFO,
 	ILK_CURSOR_MAX_SRWM,
 	ILK_CURSOR_DFT_SRWM,
 	2,
 	ILK_FIFO_LINE_SIZE
 };
 
 /*
  * Sandybridge watermark parameters, laid out like the Ironlake tables:
  * display plane, cursor, and their "srwm" variants (presumably the
  * self-refresh watermarks -- confirm against their users).  Initializers
  * are positional; all four share a fourth value of 2 and
  * SNB_FIFO_LINE_SIZE.
  */
 static const struct intel_watermark_params sandybridge_display_wm_info = {
 	SNB_DISPLAY_FIFO,
 	SNB_DISPLAY_MAXWM,
 	SNB_DISPLAY_DFTWM,
 	2,
 	SNB_FIFO_LINE_SIZE
 };
 static const struct intel_watermark_params sandybridge_cursor_wm_info = {
 	SNB_CURSOR_FIFO,
 	SNB_CURSOR_MAXWM,
 	SNB_CURSOR_DFTWM,
 	2,
 	SNB_FIFO_LINE_SIZE
 };
 static const struct intel_watermark_params sandybridge_display_srwm_info = {
 	SNB_DISPLAY_SR_FIFO,
 	SNB_DISPLAY_MAX_SRWM,
 	SNB_DISPLAY_DFT_SRWM,
 	2,
 	SNB_FIFO_LINE_SIZE
 };
 static const struct intel_watermark_params sandybridge_cursor_srwm_info = {
 	SNB_CURSOR_SR_FIFO,
 	SNB_CURSOR_MAX_SRWM,
 	SNB_CURSOR_DFT_SRWM,
 	2,
 	SNB_FIFO_LINE_SIZE
 };
 
 
 /**
  * intel_calculate_wm - calculate watermark level
  * @clock_in_khz: pixel clock
  * @wm: chip FIFO params
  * @fifo_size: size of the FIFO available to the plane
  * @pixel_size: display pixel size
  * @latency_ns: memory latency for the platform
  *
  * Work out how many FIFO entries are drained during @latency_ns at the
  * given pixel clock and pixel size, rounded up to whole cachelines, and
  * place the watermark that many entries (plus the guard size from @wm)
  * below the top of the FIFO.
  *
  * The result is capped at wm->max_wm; a result that is zero or negative
  * is replaced by wm->default_wm.  The caller must pass the
  * intel_watermark_params that match the chip, since FIFO size and
  * allocation differ between chips.
  *
  * If the FIFO drains completely, a FIFO underrun will occur, and a
  * display engine hang could result.
  */
 static unsigned long intel_calculate_wm(unsigned long clock_in_khz,
 					const struct intel_watermark_params *wm,
 					int fifo_size,
 					int pixel_size,
 					unsigned long latency_ns)
 {
 	unsigned long drained;
 	long entries_required;
 	long wm_size;
 
 	/*
 	 * Scale the clock down before multiplying so the product does not
 	 * overflow: clocks range from a few thousand to several hundred
 	 * thousand, and latency is usually a few thousand.
 	 */
 	drained = (clock_in_khz / 1000) * pixel_size * latency_ns;
 	entries_required = drained / 1000;
 	entries_required = DIV_ROUND_UP(entries_required, wm->cacheline_size);
 
 	DRM_DEBUG_KMS("FIFO entries required for mode: %ld\n", entries_required);
 
 	wm_size = fifo_size - (entries_required + wm->guard_size);
 
 	DRM_DEBUG_KMS("FIFO watermark level: %ld\n", wm_size);
 
 	/* Compare as signed so a negative wm_size is not seen as huge. */
 	if (wm_size > (long)wm->max_wm)
 		wm_size = wm->max_wm;
 
 	/* Not enough FIFO for this mode: fall back to the chip default. */
 	if (wm_size <= 0)
 		wm_size = wm->default_wm;
 
 	return wm_size;
 }
 
 /*
  * Return the one active CRTC on the device, or NULL when zero or more
  * than one CRTC is active.
  */
 static struct drm_crtc *single_enabled_crtc(struct drm_device *dev)
 {
 	struct drm_crtc *iter;
 	struct drm_crtc *found = NULL;
 
 	list_for_each_entry(iter, &dev->mode_config.crtc_list, head) {
 		if (!intel_crtc_active(iter))
 			continue;
 
 		/* A second active CRTC disqualifies the result. */
 		if (found != NULL)
 			return NULL;
 
 		found = iter;
 	}
 
 	return found;
 }
 
 static void pineview_update_wm(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	struct drm_crtc *crtc;
 	const struct cxsr_latency *latency;
 	u32 reg;
 	unsigned long wm;
 
 	latency = intel_get_cxsr_latency(IS_PINEVIEW_G(dev), dev_priv->is_ddr3,
 					 dev_priv->fsb_freq, dev_priv->mem_freq);
 	if (!latency) {
 		DRM_DEBUG_KMS("Unknown FSB/MEM found, disable CxSR\n");
 		pineview_disable_cxsr(dev);
 		return;
 	}
 
 	crtc = single_enabled_crtc(dev);
 	if (crtc) {
 		int clock = crtc->mode.clock;
 		int pixel_size = crtc->fb->bits_per_pixel / 8;
 
 		/* Display SR */
 		wm = intel_calculate_wm(clock, &pineview_display_wm,
 					pineview_display_wm.fifo_size,
 					pixel_size, latency->display_sr);
 		reg = I915_READ(DSPFW1);
 		reg &= ~DSPFW_SR_MASK;
 		reg |= wm << DSPFW_SR_SHIFT;
 		I915_WRITE(DSPFW1, reg);
 		DRM_DEBUG_KMS("DSPFW1 register is %x\n", reg);
 
 		/* cursor SR */
 		wm = intel_calculate_wm(clock, &pineview_cursor_wm,
 					pineview_display_wm.fifo_size,
 					pixel_size, latency->cursor_sr);
 		reg = I915_READ(DSPFW3);
 		reg &= ~DSPFW_CURSOR_SR_MASK;
 		reg |= (wm & 0x3f) << DSPFW_CURSOR_SR_SHIFT;
 		I915_WRITE(DSPFW3, reg);
 
 		/* Display HPLL off SR */
 		wm = intel_calculate_wm(clock, &pineview_display_hplloff_wm,
 					pineview_display_hplloff_wm.fifo_size,
 					pixel_size, latency->display_hpll_disable);
 		reg = I915_READ(DSPFW3);
 		reg &= ~DSPFW_HPLL_SR_MASK;
 		reg |= wm & DSPFW_HPLL_SR_MASK;
 		I915_WRITE(DSPFW3, reg);
 
 		/* cursor HPLL off SR */
 		wm = intel_calculate_wm(clock, &pineview_cursor_hplloff_wm,
 					pineview_display_hplloff_wm.fifo_size,
 					pixel_size, latency->cursor_hpll_disable);
 		reg = I915_READ(DSPFW3);
 		reg &= ~DSPFW_HPLL_CURSOR_MASK;
 		reg |= (wm & 0x3f) << DSPFW_HPLL_CURSOR_SHIFT;
 		I915_WRITE(DSPFW3, reg);
 		DRM_DEBUG_KMS("DSPFW3 register is %x\n", reg);
 
 		/* activate cxsr */
 		I915_WRITE(DSPFW3,
 			   I915_READ(DSPFW3) | PINEVIEW_SELF_REFRESH_EN);
 		DRM_DEBUG_KMS("Self-refresh is enabled\n");
 	} else {
 		pineview_disable_cxsr(dev);
 		DRM_DEBUG_KMS("Self-refresh is disabled\n");
 	}
 }
 
 /*
  * Compute the plane and cursor watermarks for one display plane.
  *
  * @plane selects the CRTC via intel_get_crtc_for_plane().  @display and
  * @cursor are the chip's watermark parameters; the two latencies are in
  * nanoseconds.  Results are stored through @plane_wm and @cursor_wm, each
  * capped at the corresponding max_wm.
  *
  * Returns false (with both outputs set to the respective guard_size) when
  * the CRTC is not active, true otherwise.
  */
 static bool g4x_compute_wm0(struct drm_device *dev,
 			    int plane,
 			    const struct intel_watermark_params *display,
 			    int display_latency_ns,
 			    const struct intel_watermark_params *cursor,
 			    int cursor_latency_ns,
 			    int *plane_wm,
 			    int *cursor_wm)
 {
 	struct drm_crtc *crtc;
 	int htotal, hdisplay, clock, pixel_size;
 	int line_time_us, line_count;
 	int entries, tlb_miss;
 
 	crtc = intel_get_crtc_for_plane(dev, plane);
 	if (!intel_crtc_active(crtc)) {
 		*cursor_wm = cursor->guard_size;
 		*plane_wm = display->guard_size;
 		return false;
 	}
 
 	htotal = crtc->mode.htotal;
 	hdisplay = crtc->mode.hdisplay;
 	clock = crtc->mode.clock;
 	pixel_size = crtc->fb->bits_per_pixel / 8;
 
 	/* Use the small buffer method to calculate plane watermark */
 	entries = ((clock * pixel_size / 1000) * display_latency_ns) / 1000;
 	tlb_miss = display->fifo_size*display->cacheline_size - hdisplay * 8;
 	if (tlb_miss > 0)
 		entries += tlb_miss;
 	entries = DIV_ROUND_UP(entries, display->cacheline_size);
 	*plane_wm = entries + display->guard_size;
 	if (*plane_wm > (int)display->max_wm)
 		*plane_wm = display->max_wm;
 
 	/* Use the large buffer method to calculate cursor watermark */
 	line_time_us = ((htotal * 1000) / clock);
 	/*
 	 * The integer division truncates to 0 when clock exceeds
 	 * htotal * 1000; clamp so the division below cannot trap.
 	 */
 	if (line_time_us < 1)
 		line_time_us = 1;
 	line_count = (cursor_latency_ns / line_time_us + 1000) / 1000;
 	entries = line_count * 64 * pixel_size;
 	tlb_miss = cursor->fifo_size*cursor->cacheline_size - hdisplay * 8;
 	if (tlb_miss > 0)
 		entries += tlb_miss;
 	entries = DIV_ROUND_UP(entries, cursor->cacheline_size);
 	*cursor_wm = entries + cursor->guard_size;
 	if (*cursor_wm > (int)cursor->max_wm)
 		*cursor_wm = (int)cursor->max_wm;
 
 	return true;
 }
 
 /*
  * Validate a pair of self-refresh watermarks.
  *
  * A watermark larger than the maximum that can be programmed into its
  * register cannot be used, so self-refresh must be disabled in that
  * case.  The same applies when both watermarks are zero.
  *
  * Returns true when the values are usable, false otherwise.
  */
 static bool g4x_check_srwm(struct drm_device *dev,
 			   int display_wm, int cursor_wm,
 			   const struct intel_watermark_params *display,
 			   const struct intel_watermark_params *cursor)
 {
 	bool usable = true;
 
 	DRM_DEBUG_KMS("SR watermark: display plane %d, cursor %d\n",
 		      display_wm, cursor_wm);
 
 	if (display_wm > display->max_wm) {
 		DRM_DEBUG_KMS("display watermark is too large(%d/%ld), disabling\n",
 			      display_wm, display->max_wm);
 		usable = false;
 	} else if (cursor_wm > cursor->max_wm) {
 		DRM_DEBUG_KMS("cursor watermark is too large(%d/%ld), disabling\n",
 			      cursor_wm, cursor->max_wm);
 		usable = false;
 	} else if (display_wm == 0 && cursor_wm == 0) {
 		DRM_DEBUG_KMS("SR latency is 0, disabling\n");
 		usable = false;
 	}
 
 	return usable;
 }
 
 /*
  * Compute the self-refresh display and cursor watermarks for the CRTC
  * driving @plane.
  *
  * @latency_ns is the self-refresh latency in nanoseconds; a value of 0
  * zeroes both outputs and returns false.  @display and @cursor supply the
  * cacheline and guard sizes.  The results are stored through @display_wm
  * and @cursor_wm and then validated with g4x_check_srwm(), whose verdict
  * is returned.
  *
  * Unlike g4x_compute_wm0() this does not check that the CRTC is active.
  */
 static bool g4x_compute_srwm(struct drm_device *dev,
 			     int plane,
 			     int latency_ns,
 			     const struct intel_watermark_params *display,
 			     const struct intel_watermark_params *cursor,
 			     int *display_wm, int *cursor_wm)
 {
 	struct drm_crtc *crtc;
 	int hdisplay, htotal, pixel_size, clock;
 	unsigned long line_time_us;
 	int line_time;
 	int line_count, line_size;
 	int small, large;
 	int entries;
 
 	if (!latency_ns) {
 		*display_wm = *cursor_wm = 0;
 		return false;
 	}
 
 	crtc = intel_get_crtc_for_plane(dev, plane);
 	hdisplay = crtc->mode.hdisplay;
 	htotal = crtc->mode.htotal;
 	clock = crtc->mode.clock;
 	pixel_size = crtc->fb->bits_per_pixel / 8;
 
 	/*
 	 * The integer division truncates to 0 when clock exceeds
 	 * htotal * 1000; clamp so the division below cannot trap.
 	 */
 	line_time = (htotal * 1000) / clock;
 	if (line_time < 1)
 		line_time = 1;
 	line_time_us = line_time;
 	line_count = (latency_ns / line_time_us + 1000) / 1000;
 	line_size = hdisplay * pixel_size;
 
 	/* Use the minimum of the small and large buffer method for primary */
 	small = ((clock * pixel_size / 1000) * latency_ns) / 1000;
 	large = line_count * line_size;
 
 	entries = DIV_ROUND_UP(min(small, large), display->cacheline_size);
 	*display_wm = entries + display->guard_size;
 
 	/* calculate the self-refresh watermark for display cursor */
 	entries = line_count * pixel_size * 64;
 	entries = DIV_ROUND_UP(entries, cursor->cacheline_size);
 	*cursor_wm = entries + cursor->guard_size;
 
 	return g4x_check_srwm(dev,
 			      *display_wm, *cursor_wm,
 			      display, cursor);
 }
 
 /*
  * Compute the drain latency and its precision multiplier for the plane
  * and cursor of the CRTC driving @plane.
  *
  * The precision multiplier is DRAIN_LATENCY_PRECISION_32 when the
  * per-microsecond data rate ((clock / 1000) * bytes per pixel) exceeds
  * 256, and DRAIN_LATENCY_PRECISION_16 otherwise.  The drain latency is
  * 64 * multiplier * 4 divided by that data rate.
  *
  * Returns false, leaving the outputs untouched, when the CRTC is not
  * active or when the data rate is zero (pixel clock below 1000 kHz or a
  * framebuffer with fewer than 8 bits per pixel), which would otherwise
  * divide by zero.  Returns true when all four outputs were written.
  */
 static bool vlv_compute_drain_latency(struct drm_device *dev,
 				     int plane,
 				     int *plane_prec_mult,
 				     int *plane_dl,
 				     int *cursor_prec_mult,
 				     int *cursor_dl)
 {
 	struct drm_crtc *crtc;
 	int clock, pixel_size;
 	int entries;
 
 	crtc = intel_get_crtc_for_plane(dev, plane);
 	if (!intel_crtc_active(crtc))
 		return false;
 
 	clock = crtc->mode.clock;	/* VESA DOT Clock */
 	pixel_size = crtc->fb->bits_per_pixel / 8;	/* BPP */
 
 	entries = (clock / 1000) * pixel_size;
 	/*
 	 * entries is the divisor below.  It is zero exactly when
 	 * clock / 1000 or pixel_size is zero, and clock / 1000 == 0 would
 	 * also zero the cursor divisor, so one check covers both.
 	 */
 	if (entries == 0)
 		return false;
 
 	*plane_prec_mult = (entries > 256) ?
 		DRAIN_LATENCY_PRECISION_32 : DRAIN_LATENCY_PRECISION_16;
 	*plane_dl = (64 * (*plane_prec_mult) * 4) / entries;
 
 	entries = (clock / 1000) * 4;	/* BPP is always 4 for cursor */
 	*cursor_prec_mult = (entries > 256) ?
 		DRAIN_LATENCY_PRECISION_32 : DRAIN_LATENCY_PRECISION_16;
 	*cursor_dl = (64 * (*cursor_prec_mult) * 4) / entries;
 
 	return true;
 }
 
 /*
  * Update drain latency registers of memory arbiter
  *
  * Valleyview SoC has a new memory arbiter and needs drain latency registers
  * to be programmed. Each plane has a drain latency multiplier and a drain
  * latency value.
  *
  * VLV_DDL1 carries the plane A / cursor A values and VLV_DDL2 the
  * plane B / cursor B values.  A register is only written when
  * vlv_compute_drain_latency() succeeds for that plane.
  */
 
 static void vlv_update_drain_latency(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	int plane_mult, cursor_mult;	/* precision multiplier: 16 or 32 */
 	int plane_dl, cursor_dl;
 	int plane_prec, cursor_prec;
 
 	/* For plane A, Cursor A */
 	if (vlv_compute_drain_latency(dev, 0, &plane_mult, &plane_dl,
 				      &cursor_mult, &cursor_dl)) {
 		if (cursor_mult == DRAIN_LATENCY_PRECISION_32)
 			cursor_prec = DDL_CURSORA_PRECISION_32;
 		else
 			cursor_prec = DDL_CURSORA_PRECISION_16;
 
 		if (plane_mult == DRAIN_LATENCY_PRECISION_32)
 			plane_prec = DDL_PLANEA_PRECISION_32;
 		else
 			plane_prec = DDL_PLANEA_PRECISION_16;
 
 		I915_WRITE(VLV_DDL1, cursor_prec |
 				(cursor_dl << DDL_CURSORA_SHIFT) |
 				plane_prec | plane_dl);
 	}
 
 	/* For plane B, Cursor B */
 	if (vlv_compute_drain_latency(dev, 1, &plane_mult, &plane_dl,
 				      &cursor_mult, &cursor_dl)) {
 		if (cursor_mult == DRAIN_LATENCY_PRECISION_32)
 			cursor_prec = DDL_CURSORB_PRECISION_32;
 		else
 			cursor_prec = DDL_CURSORB_PRECISION_16;
 
 		if (plane_mult == DRAIN_LATENCY_PRECISION_32)
 			plane_prec = DDL_PLANEB_PRECISION_32;
 		else
 			plane_prec = DDL_PLANEB_PRECISION_16;
 
 		I915_WRITE(VLV_DDL2, cursor_prec |
 				(cursor_dl << DDL_CURSORB_SHIFT) |
 				plane_prec | plane_dl);
 	}
 }
 
 /*
  * Non-zero when the plane bitmask has exactly one bit set (assuming
  * powerof2() is the usual "at most one bit set" test, which is why zero
  * is excluded explicitly -- confirm against its definition).  The
  * argument is expanded more than once, so it must be free of side
  * effects.
  */
 #define single_plane_enabled(mask) ((mask) != 0 && powerof2(mask))
 
 static void valleyview_update_wm(struct drm_device *dev)
 {
 	static const int sr_latency_ns = 12000;
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	int planea_wm, planeb_wm, cursora_wm, cursorb_wm;
 	int plane_sr, cursor_sr;
 	int ignore_plane_sr, ignore_cursor_sr;
 	unsigned int enabled = 0;
 
 	vlv_update_drain_latency(dev);
 
 	if (g4x_compute_wm0(dev, 0,
 			    &valleyview_wm_info, latency_ns,
 			    &valleyview_cursor_wm_info, latency_ns,
 			    &planea_wm, &cursora_wm))
 		enabled |= 1;
 
 	if (g4x_compute_wm0(dev, 1,
 			    &valleyview_wm_info, latency_ns,
 			    &valleyview_cursor_wm_info, latency_ns,
 			    &planeb_wm, &cursorb_wm))
 		enabled |= 2;
 
 	if (single_plane_enabled(enabled) &&
 	    g4x_compute_srwm(dev, ffs(enabled) - 1,
 			     sr_latency_ns,
 			     &valleyview_wm_info,
 			     &valleyview_cursor_wm_info,
 			     &plane_sr, &ignore_cursor_sr) &&
 	    g4x_compute_srwm(dev, ffs(enabled) - 1,
 			     2*sr_latency_ns,
 			     &valleyview_wm_info,
 			     &valleyview_cursor_wm_info,
 			     &ignore_plane_sr, &cursor_sr)) {
 		I915_WRITE(FW_BLC_SELF_VLV, FW_CSPWRDWNEN);
 	} else {
 		I915_WRITE(FW_BLC_SELF_VLV,
 			   I915_READ(FW_BLC_SELF_VLV) & ~FW_CSPWRDWNEN);
 		plane_sr = cursor_sr = 0;
 	}
 
 	DRM_DEBUG_KMS("Setting FIFO watermarks - A: plane=%d, cursor=%d, B: plane=%d, cursor=%d, SR: plane=%d, cursor=%d\n",
 		      planea_wm, cursora_wm,
 		      planeb_wm, cursorb_wm,
 		      plane_sr, cursor_sr);
 
 	I915_WRITE(DSPFW1,
 		   (plane_sr << DSPFW_SR_SHIFT) |
 		   (cursorb_wm << DSPFW_CURSORB_SHIFT) |
 		   (planeb_wm << DSPFW_PLANEB_SHIFT) |
 		   planea_wm);
 	I915_WRITE(DSPFW2,
 		   (I915_READ(DSPFW2) & ~DSPFW_CURSORA_MASK) |
 		   (cursora_wm << DSPFW_CURSORA_SHIFT));
 	I915_WRITE(DSPFW3,
 		   (I915_READ(DSPFW3) & ~DSPFW_CURSOR_SR_MASK) |
 		   (cursor_sr << DSPFW_CURSOR_SR_SHIFT));
 }
 
 static void g4x_update_wm(struct drm_device *dev)
 {
 	static const int sr_latency_ns = 12000;
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	int planea_wm, planeb_wm, cursora_wm, cursorb_wm;
 	int plane_sr, cursor_sr;
 	unsigned int enabled = 0;
 
 	if (g4x_compute_wm0(dev, 0,
 			    &g4x_wm_info, latency_ns,
 			    &g4x_cursor_wm_info, latency_ns,
 			    &planea_wm, &cursora_wm))
 		enabled |= 1;
 
 	if (g4x_compute_wm0(dev, 1,
 			    &g4x_wm_info, latency_ns,
 			    &g4x_cursor_wm_info, latency_ns,
 			    &planeb_wm, &cursorb_wm))
 		enabled |= 2;
 
 	if (single_plane_enabled(enabled) &&
 	    g4x_compute_srwm(dev, ffs(enabled) - 1,
 			     sr_latency_ns,
 			     &g4x_wm_info,
 			     &g4x_cursor_wm_info,
 			     &plane_sr, &cursor_sr)) {
 		I915_WRITE(FW_BLC_SELF, FW_BLC_SELF_EN);
 	} else {
 		I915_WRITE(FW_BLC_SELF,
 			   I915_READ(FW_BLC_SELF) & ~FW_BLC_SELF_EN);
 		plane_sr = cursor_sr = 0;
 	}
 
 	DRM_DEBUG_KMS("Setting FIFO watermarks - A: plane=%d, cursor=%d, B: plane=%d, cursor=%d, SR: plane=%d, cursor=%d\n",
 		      planea_wm, cursora_wm,
 		      planeb_wm, cursorb_wm,
 		      plane_sr, cursor_sr);
 
 	I915_WRITE(DSPFW1,
 		   (plane_sr << DSPFW_SR_SHIFT) |
 		   (cursorb_wm << DSPFW_CURSORB_SHIFT) |
 		   (planeb_wm << DSPFW_PLANEB_SHIFT) |
 		   planea_wm);
 	I915_WRITE(DSPFW2,
 		   (I915_READ(DSPFW2) & ~DSPFW_CURSORA_MASK) |
 		   (cursora_wm << DSPFW_CURSORA_SHIFT));
 	/* HPLL off in SR has some issues on G4x... disable it */
 	I915_WRITE(DSPFW3,
 		   (I915_READ(DSPFW3) & ~(DSPFW_HPLL_SR_EN | DSPFW_CURSOR_SR_MASK)) |
 		   (cursor_sr << DSPFW_CURSOR_SR_SHIFT));
 }
 
 /*
  * Program the i965 FIFO watermarks.
  *
  * The plane watermarks are fixed at 8.  Only the self-refresh display and
  * cursor watermarks are computed, and only when exactly one CRTC is
  * active; otherwise the defaults (1 and 16) are written.  On Crestline
  * the FW_BLC_SELF_EN bit is also set or cleared to match.
  */
 static void i965_update_wm(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	struct drm_crtc *crtc;
 	int srwm = 1;
 	int cursor_sr = 16;
 
 	/* Calc sr entries for one plane configs */
 	crtc = single_enabled_crtc(dev);
 	if (crtc) {
 		/* self-refresh has much higher latency */
 		static const int sr_latency_ns = 12000;
 		int clock = crtc->mode.clock;
 		int htotal = crtc->mode.htotal;
 		int hdisplay = crtc->mode.hdisplay;
 		int pixel_size = crtc->fb->bits_per_pixel / 8;
 		unsigned long line_time_us;
 		int line_time;
 		int entries;
 
 		/*
 		 * The integer division truncates to 0 when clock exceeds
 		 * htotal * 1000; clamp so the divisions below cannot trap.
 		 */
 		line_time = (htotal * 1000) / clock;
 		if (line_time < 1)
 			line_time = 1;
 		line_time_us = line_time;
 
 		/* Use ns/us then divide to preserve precision */
 		entries = (((sr_latency_ns / line_time_us) + 1000) / 1000) *
 			pixel_size * hdisplay;
 		entries = DIV_ROUND_UP(entries, I915_FIFO_LINE_SIZE);
 		srwm = I965_FIFO_SIZE - entries;
 		if (srwm < 0)
 			srwm = 1;
 		srwm &= 0x1ff;
 		DRM_DEBUG_KMS("self-refresh entries: %d, wm: %d\n",
 			      entries, srwm);
 
 		entries = (((sr_latency_ns / line_time_us) + 1000) / 1000) *
 			pixel_size * 64;
 		entries = DIV_ROUND_UP(entries,
 					  i965_cursor_wm_info.cacheline_size);
 		cursor_sr = i965_cursor_wm_info.fifo_size -
 			(entries + i965_cursor_wm_info.guard_size);
 
 		if (cursor_sr > i965_cursor_wm_info.max_wm)
 			cursor_sr = i965_cursor_wm_info.max_wm;
 
 		DRM_DEBUG_KMS("self-refresh watermark: display plane %d "
 			      "cursor %d\n", srwm, cursor_sr);
 
 		if (IS_CRESTLINE(dev))
 			I915_WRITE(FW_BLC_SELF, FW_BLC_SELF_EN);
 	} else {
 		/* Turn off self refresh if both pipes are enabled */
 		if (IS_CRESTLINE(dev))
 			I915_WRITE(FW_BLC_SELF, I915_READ(FW_BLC_SELF)
 				   & ~FW_BLC_SELF_EN);
 	}
 
 	DRM_DEBUG_KMS("Setting FIFO watermarks - A: 8, B: 8, C: 8, SR %d\n",
 		      srwm);
 
 	/* 965 has limitations... */
 	I915_WRITE(DSPFW1, (srwm << DSPFW_SR_SHIFT) |
 		   (8 << 16) | (8 << 8) | (8 << 0));
 	I915_WRITE(DSPFW2, (8 << 8) | (8 << 0));
 	/* update cursor SR watermark */
 	I915_WRITE(DSPFW3, (cursor_sr << DSPFW_CURSOR_SR_SHIFT));
 }
 
 /*
  * Program the FIFO watermarks for plane A, plane B and the overlay (FW_BLC,
  * FW_BLC2), plus the self-refresh watermark (FW_BLC_SELF) when it applies.
  *
  * On I945G/GM and I915GM self-refresh is switched off before the watermarks
  * are rewritten.  It is switched back on at the end only when HAS_FW_BLC()
  * is true and exactly one of the two planes is driven by an active CRTC.
  */
 static void i9xx_update_wm(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	const struct intel_watermark_params *wm_info;
 	uint32_t fwater_lo;
 	uint32_t fwater_hi;
 	int cwm, srwm = 1;
 	int fifo_size;
 	int planea_wm, planeb_wm;
 	/* "enabled" ends up non-NULL only if exactly one plane is active. */
 	struct drm_crtc *crtc, *enabled = NULL;
 
 	/* Select the watermark parameter table for this chipset. */
 	if (IS_I945GM(dev))
 		wm_info = &i945_wm_info;
 	else if (!IS_GEN2(dev))
 		wm_info = &i915_wm_info;
 	else
 		wm_info = &i855_wm_info;
 
 	/* Plane A */
 	fifo_size = dev_priv->display.get_fifo_size(dev, 0);
 	crtc = intel_get_crtc_for_plane(dev, 0);
 	if (intel_crtc_active(crtc)) {
 		int cpp = crtc->fb->bits_per_pixel / 8;
 		/* Gen2 always computes with 4 bytes per pixel. */
 		if (IS_GEN2(dev))
 			cpp = 4;
 
 		planea_wm = intel_calculate_wm(crtc->mode.clock,
 					       wm_info, fifo_size, cpp,
 					       latency_ns);
 		enabled = crtc;
 	} else
 		/* Inactive plane: FIFO size minus the guard entries. */
 		planea_wm = fifo_size - wm_info->guard_size;
 
 	/* Plane B, same calculation as plane A */
 	fifo_size = dev_priv->display.get_fifo_size(dev, 1);
 	crtc = intel_get_crtc_for_plane(dev, 1);
 	if (intel_crtc_active(crtc)) {
 		int cpp = crtc->fb->bits_per_pixel / 8;
 		if (IS_GEN2(dev))
 			cpp = 4;
 
 		planeb_wm = intel_calculate_wm(crtc->mode.clock,
 					       wm_info, fifo_size, cpp,
 					       latency_ns);
 		/* Second active plane cancels the single-plane candidate. */
 		if (enabled == NULL)
 			enabled = crtc;
 		else
 			enabled = NULL;
 	} else
 		planeb_wm = fifo_size - wm_info->guard_size;
 
 	DRM_DEBUG_KMS("FIFO watermarks - A: %d, B: %d\n", planea_wm, planeb_wm);
 
 	/*
 	 * Overlay gets an aggressive default since video jitter is bad.
 	 */
 	cwm = 2;
 
 	/* Play safe and disable self-refresh before adjusting watermarks. */
 	if (IS_I945G(dev) || IS_I945GM(dev))
 		I915_WRITE(FW_BLC_SELF, FW_BLC_SELF_EN_MASK | 0);
 	else if (IS_I915GM(dev))
 		I915_WRITE(INSTPM, I915_READ(INSTPM) & ~INSTPM_SELF_EN);
 
 	/* Calc sr entries for one plane configs */
 	if (HAS_FW_BLC(dev) && enabled) {
 		/* self-refresh has much higher latency */
 		static const int sr_latency_ns = 6000;
 		int clock = enabled->mode.clock;
 		int htotal = enabled->mode.htotal;
 		int hdisplay = enabled->mode.hdisplay;
 		int pixel_size = enabled->fb->bits_per_pixel / 8;
 		unsigned long line_time_us;
 		int entries;
 
 		/*
 		 * NOTE(review): line_time_us is a divisor below and is 0
 		 * whenever htotal * 1000 < clock; confirm that cannot happen
 		 * for the modes that reach this point.
 		 */
 		line_time_us = (htotal * 1000) / clock;
 
 		/* Use ns/us then divide to preserve precision */
 		entries = (((sr_latency_ns / line_time_us) + 1000) / 1000) *
 			pixel_size * hdisplay;
 		entries = DIV_ROUND_UP(entries, wm_info->cacheline_size);
 		DRM_DEBUG_KMS("self-refresh entries: %d\n", entries);
 		srwm = wm_info->fifo_size - entries;
 		/* A negative result falls back to a watermark of 1. */
 		if (srwm < 0)
 			srwm = 1;
 
 		if (IS_I945G(dev) || IS_I945GM(dev))
 			I915_WRITE(FW_BLC_SELF,
 				   FW_BLC_SELF_FIFO_MASK | (srwm & 0xff));
 		else if (IS_I915GM(dev))
 			I915_WRITE(FW_BLC_SELF, srwm & 0x3f);
 	}
 
 	DRM_DEBUG_KMS("Setting FIFO watermarks - A: %d, B: %d, C: %d, SR %d\n",
 		      planea_wm, planeb_wm, cwm, srwm);
 
 	/* Plane B goes in bits 16+, plane A in the low bits, 6 bits each. */
 	fwater_lo = ((planeb_wm & 0x3f) << 16) | (planea_wm & 0x3f);
 	fwater_hi = (cwm & 0x1f);
 
 	/* Set request length to 8 cachelines per fetch */
 	fwater_lo = fwater_lo | (1 << 24) | (1 << 8);
 	fwater_hi = fwater_hi | (1 << 8);
 
 	I915_WRITE(FW_BLC, fwater_lo);
 	I915_WRITE(FW_BLC2, fwater_hi);
 
 	/* Re-enable self-refresh now that the watermarks are in place. */
 	if (HAS_FW_BLC(dev)) {
 		if (enabled) {
 			if (IS_I945G(dev) || IS_I945GM(dev))
 				I915_WRITE(FW_BLC_SELF,
 					   FW_BLC_SELF_EN_MASK | FW_BLC_SELF_EN);
 			else if (IS_I915GM(dev))
 				I915_WRITE(INSTPM, I915_READ(INSTPM) | INSTPM_SELF_EN);
 			DRM_DEBUG_KMS("memory self refresh enabled\n");
 		} else
 			DRM_DEBUG_KMS("memory self refresh disabled\n");
 	}
 }
 
 /*
  * Program the plane A FIFO watermark in FW_BLC.  Nothing is written unless
  * single_enabled_crtc() returns a CRTC.
  */
 static void i830_update_wm(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	struct drm_crtc *active;
 	uint32_t blc;
 	int wm_a;
 
 	active = single_enabled_crtc(dev);
 	if (active == NULL)
 		return;
 
 	/* The calculation is always done with 4 bytes per pixel. */
 	wm_a = intel_calculate_wm(active->mode.clock, &i830_wm_info,
 				  dev_priv->display.get_fifo_size(dev, 0),
 				  4, latency_ns);
 
 	/* Replace the low 12 bits, keep the rest of the register. */
 	blc = I915_READ(FW_BLC) & ~0xfff;
 	blc |= (3<<8) | wm_a;
 
 	DRM_DEBUG_KMS("Setting FIFO watermarks - A: %d\n", wm_a);
 
 	I915_WRITE(FW_BLC, blc);
 }
 
 /*
  * Latency values handed to g4x_compute_wm0() for the Ironlake WM0 plane and
  * cursor watermarks.
  * NOTE(review): presumably nanoseconds; confirm against g4x_compute_wm0().
  */
 #define ILK_LP0_PLANE_LATENCY		700
 #define ILK_LP0_CURSOR_LATENCY		1300
 
 /*
  * Check the wm result.
  *
  * If any calculated watermark value is larger than the maximum value that
  * can be programmed into the associated watermark register, that watermark
  * must be disabled.
  *
  * Returns true when all three values are usable, false when the level (and
  * every level above it) must stay disabled.  An FBC value above
  * SNB_FBC_MAX_SRWM additionally sets DISP_FBC_WM_DIS in DISP_ARB_CTL.
  */
 static bool ironlake_check_srwm(struct drm_device *dev, int level,
 				int fbc_wm, int display_wm, int cursor_wm,
 				const struct intel_watermark_params *display,
 				const struct intel_watermark_params *cursor)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 
 	DRM_DEBUG_KMS("watermark %d: display plane %d, fbc lines %d,"
 		      " cursor %d\n", level, display_wm, fbc_wm, cursor_wm);
 
 	if (fbc_wm > SNB_FBC_MAX_SRWM) {
 		DRM_DEBUG_KMS("fbc watermark(%d) is too large(%d), disabling wm%d+\n",
 			      fbc_wm, SNB_FBC_MAX_SRWM, level);
 
 		/* fbc has its own way to disable FBC WM */
 		I915_WRITE(DISP_ARB_CTL,
 			   I915_READ(DISP_ARB_CTL) | DISP_FBC_WM_DIS);
 		return false;
 	}
 
 	/*
 	 * Report the limit that was actually compared against; the parameter
 	 * table passed in is not necessarily the SNB one.
 	 */
 	if (display_wm > display->max_wm) {
 		DRM_DEBUG_KMS("display watermark(%d) is too large(%d), disabling wm%d+\n",
 			      display_wm, (int)display->max_wm, level);
 		return false;
 	}
 
 	if (cursor_wm > cursor->max_wm) {
 		DRM_DEBUG_KMS("cursor watermark(%d) is too large(%d), disabling wm%d+\n",
 			      cursor_wm, (int)cursor->max_wm, level);
 		return false;
 	}
 
 	/* All three values being zero also disables the level. */
 	if (!(fbc_wm || display_wm || cursor_wm)) {
 		DRM_DEBUG_KMS("latency %d is 0, disabling wm%d+\n", level, level);
 		return false;
 	}
 
 	return true;
 }
 
 /*
  * Compute watermark values of WM[1-3].
  *
  * Fills *fbc_wm, *display_wm and *cursor_wm for the given level and plane
  * and returns the verdict of ironlake_check_srwm() on them.
  *
  * A zero latency, pixel clock, line time or line size stores 0 in all three
  * outputs and returns false, so that no division by zero can occur.
  */
 static bool ironlake_compute_srwm(struct drm_device *dev, int level, int plane,
 				  int latency_ns,
 				  const struct intel_watermark_params *display,
 				  const struct intel_watermark_params *cursor,
 				  int *fbc_wm, int *display_wm, int *cursor_wm)
 {
 	struct drm_crtc *crtc;
 	unsigned long line_time_us;
 	int hdisplay, htotal, pixel_size, clock;
 	int line_count, line_size;
 	int small, large;
 	int entries;
 
 	if (!latency_ns)
 		goto no_wm;
 
 	crtc = intel_get_crtc_for_plane(dev, plane);
 	hdisplay = crtc->mode.hdisplay;
 	htotal = crtc->mode.htotal;
 	clock = crtc->mode.clock;
 	pixel_size = crtc->fb->bits_per_pixel / 8;
 	line_size = hdisplay * pixel_size;
 
 	/* clock and line_size are both used as divisors below. */
 	if (!clock || !line_size)
 		goto no_wm;
 
 	line_time_us = (htotal * 1000) / clock;
 	if (!line_time_us)
 		goto no_wm;
 
 	line_count = (latency_ns / line_time_us + 1000) / 1000;
 
 	/* Use the minimum of the small and large buffer method for primary */
 	small = ((clock * pixel_size / 1000) * latency_ns) / 1000;
 	large = line_count * line_size;
 
 	entries = DIV_ROUND_UP(min(small, large), display->cacheline_size);
 	*display_wm = entries + display->guard_size;
 
 	/*
 	 * Spec says:
 	 * FBC WM = ((Final Primary WM * 64) / number of bytes per line) + 2
 	 */
 	*fbc_wm = DIV_ROUND_UP(*display_wm * 64, line_size) + 2;
 
 	/* calculate the self-refresh watermark for display cursor */
 	entries = line_count * pixel_size * 64;
 	entries = DIV_ROUND_UP(entries, cursor->cacheline_size);
 	*cursor_wm = entries + cursor->guard_size;
 
 	return ironlake_check_srwm(dev, level,
 				   *fbc_wm, *display_wm, *cursor_wm,
 				   display, cursor);
 
 no_wm:
 	*fbc_wm = *display_wm = *cursor_wm = 0;
 	return false;
 }
 
 /*
  * Program the Ironlake watermarks: WM0 for each pipe that g4x_compute_wm0()
  * reports as usable, then the LP1 and LP2 self-refresh watermarks when
  * exactly one pipe is in use.
  *
  * The LP registers are always cleared first (WM3, WM2, WM1) and a level is
  * written back only if every lower level was computed successfully.
  */
 static void ironlake_update_wm(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	int fbc_wm, plane_wm, cursor_wm;
 	/* Bitmask of pipes with a valid WM0: bit 0 = pipe A, bit 1 = pipe B. */
 	unsigned int enabled;
 
 	enabled = 0;
 	if (g4x_compute_wm0(dev, 0,
 			    &ironlake_display_wm_info,
 			    ILK_LP0_PLANE_LATENCY,
 			    &ironlake_cursor_wm_info,
 			    ILK_LP0_CURSOR_LATENCY,
 			    &plane_wm, &cursor_wm)) {
 		I915_WRITE(WM0_PIPEA_ILK,
 			   (plane_wm << WM0_PIPE_PLANE_SHIFT) | cursor_wm);
 		DRM_DEBUG_KMS("FIFO watermarks For pipe A -"
 			      " plane %d, " "cursor: %d\n",
 			      plane_wm, cursor_wm);
 		enabled |= 1;
 	}
 
 	if (g4x_compute_wm0(dev, 1,
 			    &ironlake_display_wm_info,
 			    ILK_LP0_PLANE_LATENCY,
 			    &ironlake_cursor_wm_info,
 			    ILK_LP0_CURSOR_LATENCY,
 			    &plane_wm, &cursor_wm)) {
 		I915_WRITE(WM0_PIPEB_ILK,
 			   (plane_wm << WM0_PIPE_PLANE_SHIFT) | cursor_wm);
 		DRM_DEBUG_KMS("FIFO watermarks For pipe B -"
 			      " plane %d, cursor: %d\n",
 			      plane_wm, cursor_wm);
 		enabled |= 2;
 	}
 
 	/*
 	 * Calculate and update the self-refresh watermark only when one
 	 * display plane is used.
 	 */
 	I915_WRITE(WM3_LP_ILK, 0);
 	I915_WRITE(WM2_LP_ILK, 0);
 	I915_WRITE(WM1_LP_ILK, 0);
 
 	if (!single_plane_enabled(enabled))
 		return;
 	/* Turn the single set bit back into a plane index. */
 	enabled = ffs(enabled) - 1;
 
 	/* WM1 */
 	if (!ironlake_compute_srwm(dev, 1, enabled,
 				   ILK_READ_WM1_LATENCY() * 500,
 				   &ironlake_display_srwm_info,
 				   &ironlake_cursor_srwm_info,
 				   &fbc_wm, &plane_wm, &cursor_wm))
 		return;
 
 	I915_WRITE(WM1_LP_ILK,
 		   WM1_LP_SR_EN |
 		   (ILK_READ_WM1_LATENCY() << WM1_LP_LATENCY_SHIFT) |
 		   (fbc_wm << WM1_LP_FBC_SHIFT) |
 		   (plane_wm << WM1_LP_SR_SHIFT) |
 		   cursor_wm);
 
 	/* WM2 */
 	if (!ironlake_compute_srwm(dev, 2, enabled,
 				   ILK_READ_WM2_LATENCY() * 500,
 				   &ironlake_display_srwm_info,
 				   &ironlake_cursor_srwm_info,
 				   &fbc_wm, &plane_wm, &cursor_wm))
 		return;
 
 	/*
 	 * NOTE(review): WM2 is packed with the WM1_LP_* shift macros;
 	 * presumably the LP registers share one field layout, to be confirmed.
 	 */
 	I915_WRITE(WM2_LP_ILK,
 		   WM2_LP_EN |
 		   (ILK_READ_WM2_LATENCY() << WM1_LP_LATENCY_SHIFT) |
 		   (fbc_wm << WM1_LP_FBC_SHIFT) |
 		   (plane_wm << WM1_LP_SR_SHIFT) |
 		   cursor_wm);
 
 	/*
 	 * WM3 is unsupported on ILK, probably because we don't have latency
 	 * data for that power state
 	 */
 }
 
 /*
  * Program the Sandybridge watermarks: WM0 for pipes A and B, then the LP1,
  * LP2 and LP3 self-refresh watermarks when exactly one pipe is in use and
  * sprite scaling is off.
  *
  * The LP registers are always cleared first (WM3, WM2, WM1) and a level is
  * written back only if every lower level was computed successfully.
  */
 static void sandybridge_update_wm(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	int latency = SNB_READ_WM0_LATENCY() * 100;	/* In unit 0.1us */
 	u32 val;
 	int fbc_wm, plane_wm, cursor_wm;
 	/* Bitmask of pipes with a valid WM0: bit 0 = pipe A, bit 1 = pipe B. */
 	unsigned int enabled;
 
 	enabled = 0;
 	if (g4x_compute_wm0(dev, 0,
 			    &sandybridge_display_wm_info, latency,
 			    &sandybridge_cursor_wm_info, latency,
 			    &plane_wm, &cursor_wm)) {
 		/* Replace only the plane and cursor fields of WM0. */
 		val = I915_READ(WM0_PIPEA_ILK);
 		val &= ~(WM0_PIPE_PLANE_MASK | WM0_PIPE_CURSOR_MASK);
 		I915_WRITE(WM0_PIPEA_ILK, val |
 			   ((plane_wm << WM0_PIPE_PLANE_SHIFT) | cursor_wm));
 		DRM_DEBUG_KMS("FIFO watermarks For pipe A -"
 			      " plane %d, " "cursor: %d\n",
 			      plane_wm, cursor_wm);
 		enabled |= 1;
 	}
 
 	if (g4x_compute_wm0(dev, 1,
 			    &sandybridge_display_wm_info, latency,
 			    &sandybridge_cursor_wm_info, latency,
 			    &plane_wm, &cursor_wm)) {
 		val = I915_READ(WM0_PIPEB_ILK);
 		val &= ~(WM0_PIPE_PLANE_MASK | WM0_PIPE_CURSOR_MASK);
 		I915_WRITE(WM0_PIPEB_ILK, val |
 			   ((plane_wm << WM0_PIPE_PLANE_SHIFT) | cursor_wm));
 		DRM_DEBUG_KMS("FIFO watermarks For pipe B -"
 			      " plane %d, cursor: %d\n",
 			      plane_wm, cursor_wm);
 		enabled |= 2;
 	}
 
 	/*
 	 * Calculate and update the self-refresh watermark only when one
 	 * display plane is used.
 	 *
 	 * SNB supports 3 levels of watermark.
 	 *
 	 * WM1/WM2/WM3 watermarks have to be enabled in the ascending order,
 	 * and disabled in the descending order
 	 *
 	 */
 	I915_WRITE(WM3_LP_ILK, 0);
 	I915_WRITE(WM2_LP_ILK, 0);
 	I915_WRITE(WM1_LP_ILK, 0);
 
 	if (!single_plane_enabled(enabled) ||
 	    dev_priv->sprite_scaling_enabled)
 		return;
 	/* Turn the single set bit back into a plane index. */
 	enabled = ffs(enabled) - 1;
 
 	/* WM1 */
 	if (!ironlake_compute_srwm(dev, 1, enabled,
 				   SNB_READ_WM1_LATENCY() * 500,
 				   &sandybridge_display_srwm_info,
 				   &sandybridge_cursor_srwm_info,
 				   &fbc_wm, &plane_wm, &cursor_wm))
 		return;
 
 	I915_WRITE(WM1_LP_ILK,
 		   WM1_LP_SR_EN |
 		   (SNB_READ_WM1_LATENCY() << WM1_LP_LATENCY_SHIFT) |
 		   (fbc_wm << WM1_LP_FBC_SHIFT) |
 		   (plane_wm << WM1_LP_SR_SHIFT) |
 		   cursor_wm);
 
 	/* WM2 */
 	if (!ironlake_compute_srwm(dev, 2, enabled,
 				   SNB_READ_WM2_LATENCY() * 500,
 				   &sandybridge_display_srwm_info,
 				   &sandybridge_cursor_srwm_info,
 				   &fbc_wm, &plane_wm, &cursor_wm))
 		return;
 
 	I915_WRITE(WM2_LP_ILK,
 		   WM2_LP_EN |
 		   (SNB_READ_WM2_LATENCY() << WM1_LP_LATENCY_SHIFT) |
 		   (fbc_wm << WM1_LP_FBC_SHIFT) |
 		   (plane_wm << WM1_LP_SR_SHIFT) |
 		   cursor_wm);
 
 	/* WM3 */
 	if (!ironlake_compute_srwm(dev, 3, enabled,
 				   SNB_READ_WM3_LATENCY() * 500,
 				   &sandybridge_display_srwm_info,
 				   &sandybridge_cursor_srwm_info,
 				   &fbc_wm, &plane_wm, &cursor_wm))
 		return;
 
 	I915_WRITE(WM3_LP_ILK,
 		   WM3_LP_EN |
 		   (SNB_READ_WM3_LATENCY() << WM1_LP_LATENCY_SHIFT) |
 		   (fbc_wm << WM1_LP_FBC_SHIFT) |
 		   (plane_wm << WM1_LP_SR_SHIFT) |
 		   cursor_wm);
 }
 
 /*
  * Program the Ivybridge watermarks: WM0 for pipes A, B and C, then the LP1,
  * LP2 and LP3 self-refresh watermarks when exactly one pipe is in use and
  * sprite scaling is off.
  *
  * The LP registers are always cleared first (WM3, WM2, WM1) and a level is
  * written back only if every lower level was computed successfully.
  */
 static void ivybridge_update_wm(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	int latency = SNB_READ_WM0_LATENCY() * 100;	/* In unit 0.1us */
 	u32 val;
 	int fbc_wm, plane_wm, cursor_wm;
 	int ignore_fbc_wm, ignore_plane_wm, ignore_cursor_wm;
 	/*
 	 * Bitmask of pipes with a valid WM0, one bit per pipe (bit n is
 	 * pipe n) so that ffs() below can recover the pipe index.
 	 */
 	unsigned int enabled;
 
 	enabled = 0;
 	if (g4x_compute_wm0(dev, 0,
 			    &sandybridge_display_wm_info, latency,
 			    &sandybridge_cursor_wm_info, latency,
 			    &plane_wm, &cursor_wm)) {
 		/* Replace only the plane and cursor fields of WM0. */
 		val = I915_READ(WM0_PIPEA_ILK);
 		val &= ~(WM0_PIPE_PLANE_MASK | WM0_PIPE_CURSOR_MASK);
 		I915_WRITE(WM0_PIPEA_ILK, val |
 			   ((plane_wm << WM0_PIPE_PLANE_SHIFT) | cursor_wm));
 		DRM_DEBUG_KMS("FIFO watermarks For pipe A -"
 			      " plane %d, " "cursor: %d\n",
 			      plane_wm, cursor_wm);
 		enabled |= 1 << 0;
 	}
 
 	if (g4x_compute_wm0(dev, 1,
 			    &sandybridge_display_wm_info, latency,
 			    &sandybridge_cursor_wm_info, latency,
 			    &plane_wm, &cursor_wm)) {
 		val = I915_READ(WM0_PIPEB_ILK);
 		val &= ~(WM0_PIPE_PLANE_MASK | WM0_PIPE_CURSOR_MASK);
 		I915_WRITE(WM0_PIPEB_ILK, val |
 			   ((plane_wm << WM0_PIPE_PLANE_SHIFT) | cursor_wm));
 		DRM_DEBUG_KMS("FIFO watermarks For pipe B -"
 			      " plane %d, cursor: %d\n",
 			      plane_wm, cursor_wm);
 		enabled |= 1 << 1;
 	}
 
 	if (g4x_compute_wm0(dev, 2,
 			    &sandybridge_display_wm_info, latency,
 			    &sandybridge_cursor_wm_info, latency,
 			    &plane_wm, &cursor_wm)) {
 		val = I915_READ(WM0_PIPEC_IVB);
 		val &= ~(WM0_PIPE_PLANE_MASK | WM0_PIPE_CURSOR_MASK);
 		I915_WRITE(WM0_PIPEC_IVB, val |
 			   ((plane_wm << WM0_PIPE_PLANE_SHIFT) | cursor_wm));
 		DRM_DEBUG_KMS("FIFO watermarks For pipe C -"
 			      " plane %d, cursor: %d\n",
 			      plane_wm, cursor_wm);
 		/*
 		 * Pipe C is bit 2.  Setting bits 0 and 1 instead would make
 		 * a lone pipe C look like pipes A and B both being active.
 		 */
 		enabled |= 1 << 2;
 	}
 
 	/*
 	 * Calculate and update the self-refresh watermark only when one
 	 * display plane is used.
 	 *
 	 * IVB supports 3 levels of watermark.
 	 *
 	 * WM1/WM2/WM3 watermarks have to be enabled in the ascending order,
 	 * and disabled in the descending order
 	 *
 	 */
 	I915_WRITE(WM3_LP_ILK, 0);
 	I915_WRITE(WM2_LP_ILK, 0);
 	I915_WRITE(WM1_LP_ILK, 0);
 
 	if (!single_plane_enabled(enabled) ||
 	    dev_priv->sprite_scaling_enabled)
 		return;
 	/* Turn the single set bit back into a plane index. */
 	enabled = ffs(enabled) - 1;
 
 	/* WM1 */
 	if (!ironlake_compute_srwm(dev, 1, enabled,
 				   SNB_READ_WM1_LATENCY() * 500,
 				   &sandybridge_display_srwm_info,
 				   &sandybridge_cursor_srwm_info,
 				   &fbc_wm, &plane_wm, &cursor_wm))
 		return;
 
 	I915_WRITE(WM1_LP_ILK,
 		   WM1_LP_SR_EN |
 		   (SNB_READ_WM1_LATENCY() << WM1_LP_LATENCY_SHIFT) |
 		   (fbc_wm << WM1_LP_FBC_SHIFT) |
 		   (plane_wm << WM1_LP_SR_SHIFT) |
 		   cursor_wm);
 
 	/* WM2 */
 	if (!ironlake_compute_srwm(dev, 2, enabled,
 				   SNB_READ_WM2_LATENCY() * 500,
 				   &sandybridge_display_srwm_info,
 				   &sandybridge_cursor_srwm_info,
 				   &fbc_wm, &plane_wm, &cursor_wm))
 		return;
 
 	I915_WRITE(WM2_LP_ILK,
 		   WM2_LP_EN |
 		   (SNB_READ_WM2_LATENCY() << WM1_LP_LATENCY_SHIFT) |
 		   (fbc_wm << WM1_LP_FBC_SHIFT) |
 		   (plane_wm << WM1_LP_SR_SHIFT) |
 		   cursor_wm);
 
 	/*
 	 * WM3, note we have to correct the cursor latency: the FBC and plane
 	 * values come from the first call, the cursor value from a second
 	 * call made with twice the latency.
 	 */
 	if (!ironlake_compute_srwm(dev, 3, enabled,
 				   SNB_READ_WM3_LATENCY() * 500,
 				   &sandybridge_display_srwm_info,
 				   &sandybridge_cursor_srwm_info,
 				   &fbc_wm, &plane_wm, &ignore_cursor_wm) ||
 	    !ironlake_compute_srwm(dev, 3, enabled,
 				   2 * SNB_READ_WM3_LATENCY() * 500,
 				   &sandybridge_display_srwm_info,
 				   &sandybridge_cursor_srwm_info,
 				   &ignore_fbc_wm, &ignore_plane_wm, &cursor_wm))
 		return;
 
 	I915_WRITE(WM3_LP_ILK,
 		   WM3_LP_EN |
 		   (SNB_READ_WM3_LATENCY() << WM1_LP_LATENCY_SHIFT) |
 		   (fbc_wm << WM1_LP_FBC_SHIFT) |
 		   (plane_wm << WM1_LP_SR_SHIFT) |
 		   cursor_wm);
 }
 
 /*
  * Update the line time field of PIPE_WM_LINETIME(pipe) from the given mode,
  * leaving the other bits of the register as they were.
  */
 static void
 haswell_update_linetime_wm(struct drm_device *dev, int pipe,
 				 struct drm_display_mode *mode)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	u32 linetime_reg;
 
 	/* Start from the current register value with the old field cleared. */
 	linetime_reg = I915_READ(PIPE_WM_LINETIME(pipe));
 	linetime_reg &= ~PIPE_WM_LINETIME_MASK;
 
 	/*
 	 * The programmed value is the time needed to fill one row at the
 	 * mode's clock, times 8.
 	 */
 	linetime_reg |= PIPE_WM_LINETIME_TIME(
 		((mode->crtc_hdisplay * 1000) / mode->clock) * 8);
 
 	/*
 	 * IPS watermarks are not computed here; this spot is reserved for
 	 * that calculation.  They apply to pipe A only and use the CD clock
 	 * frequency in MHz rather than the pixel rate as the divisor.
 	 */
 
 	I915_WRITE(PIPE_WM_LINETIME(pipe), linetime_reg);
 }
 
 /*
  * Compute the WM0 sprite watermark for the given plane.
  *
  * Stores the result in *sprite_wm, clamped to display->max_wm, and returns
  * true.  When the plane's CRTC is not active, display->guard_size is stored
  * and false is returned.
  */
 static bool
 sandybridge_compute_sprite_wm(struct drm_device *dev, int plane,
 			      uint32_t sprite_width, int pixel_size,
 			      const struct intel_watermark_params *display,
 			      int display_latency_ns, int *sprite_wm)
 {
 	struct drm_crtc *sprite_crtc = intel_get_crtc_for_plane(dev, plane);
 	int dotclock;
 	int fifo_entries, tlb_penalty;
 	int wm;
 
 	if (!intel_crtc_active(sprite_crtc)) {
 		*sprite_wm = display->guard_size;
 		return false;
 	}
 
 	dotclock = sprite_crtc->mode.clock;
 
 	/* Small buffer method */
 	fifo_entries = ((dotclock * pixel_size / 1000) * display_latency_ns) / 1000;
 
 	/* Only a positive TLB miss allowance is added on top. */
 	tlb_penalty = display->fifo_size*display->cacheline_size -
 		sprite_width * 8;
 	if (tlb_penalty > 0)
 		fifo_entries += tlb_penalty;
 
 	fifo_entries = DIV_ROUND_UP(fifo_entries, display->cacheline_size);
 
 	wm = fifo_entries + display->guard_size;
 	if (wm > (int)display->max_wm)
 		wm = display->max_wm;
 	*sprite_wm = wm;
 
 	return true;
 }
 
 /*
  * Compute a self-refresh (LP) sprite watermark for the given plane.
  *
  * Stores the result in *sprite_wm and returns true when it does not exceed
  * 0x3ff.  A zero latency, zero pixel clock or zero line time stores 0 and
  * returns false.
  */
 static bool
 sandybridge_compute_sprite_srwm(struct drm_device *dev, int plane,
 				uint32_t sprite_width, int pixel_size,
 				const struct intel_watermark_params *display,
 				int latency_ns, int *sprite_wm)
 {
 	struct drm_crtc *sprite_crtc;
 	unsigned long us_per_line;
 	int dotclock;
 	int nlines, bytes_per_line;
 	int small_buf, large_buf;
 	int fifo_entries;
 
 	if (latency_ns == 0)
 		goto no_wm;
 
 	sprite_crtc = intel_get_crtc_for_plane(dev, plane);
 	dotclock = sprite_crtc->mode.clock;
 	if (dotclock == 0)
 		goto no_wm;
 
 	us_per_line = (sprite_width * 1000) / dotclock;
 	if (us_per_line == 0)
 		goto no_wm;
 
 	nlines = (latency_ns / us_per_line + 1000) / 1000;
 	bytes_per_line = sprite_width * pixel_size;
 
 	/* Take the smaller of the small and large buffer results. */
 	small_buf = ((dotclock * pixel_size / 1000) * latency_ns) / 1000;
 	large_buf = nlines * bytes_per_line;
 
 	fifo_entries = DIV_ROUND_UP(min(small_buf, large_buf),
 				    display->cacheline_size);
 	*sprite_wm = fifo_entries + display->guard_size;
 
 	return *sprite_wm <= 0x3ff;
 
 no_wm:
 	*sprite_wm = 0;
 	return false;
 }
 
 /*
  * Program the sprite watermarks for a pipe: the sprite field of the pipe's
  * WM0 register, then WM1S_LP_ILK, and on Ivybridge also WM2S_LP_IVB and
  * WM3S_LP_IVB.  The first level that fails to compute ends the sequence.
  */
 static void sandybridge_update_sprite_wm(struct drm_device *dev, int pipe,
 					 uint32_t sprite_width, int pixel_size)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	int latency = SNB_READ_WM0_LATENCY() * 100;	/* In unit 0.1us */
 	u32 wm0;
 	int wm, wm0_reg;
 	int ok;
 
 	/* Map the pipe to its WM0 register; any other pipe is ignored. */
 	if (pipe == 0)
 		wm0_reg = WM0_PIPEA_ILK;
 	else if (pipe == 1)
 		wm0_reg = WM0_PIPEB_ILK;
 	else if (pipe == 2)
 		wm0_reg = WM0_PIPEC_IVB;
 	else
 		return; /* bad pipe */
 
 	/* WM0: replace only the sprite field. */
 	ok = sandybridge_compute_sprite_wm(dev, pipe, sprite_width, pixel_size,
 					   &sandybridge_display_wm_info,
 					   latency, &wm);
 	if (!ok) {
 		DRM_DEBUG_KMS("failed to compute sprite wm for pipe %d\n",
 			      pipe);
 		return;
 	}
 
 	wm0 = I915_READ(wm0_reg);
 	wm0 &= ~WM0_PIPE_SPRITE_MASK;
 	I915_WRITE(wm0_reg, wm0 | (wm << WM0_PIPE_SPRITE_SHIFT));
 	DRM_DEBUG_KMS("sprite watermarks For pipe %d - %d\n", pipe, wm);
 
 	/* LP1 */
 	ok = sandybridge_compute_sprite_srwm(dev, pipe, sprite_width,
 					     pixel_size,
 					     &sandybridge_display_srwm_info,
 					     SNB_READ_WM1_LATENCY() * 500,
 					     &wm);
 	if (!ok) {
 		DRM_DEBUG_KMS("failed to compute sprite lp1 wm on pipe %d\n",
 			      pipe);
 		return;
 	}
 	I915_WRITE(WM1S_LP_ILK, wm);
 
 	/* LP2 and LP3 sprite watermarks exist on IVB only. */
 	if (!IS_IVYBRIDGE(dev))
 		return;
 
 	/* LP2 */
 	ok = sandybridge_compute_sprite_srwm(dev, pipe, sprite_width,
 					     pixel_size,
 					     &sandybridge_display_srwm_info,
 					     SNB_READ_WM2_LATENCY() * 500,
 					     &wm);
 	if (!ok) {
 		DRM_DEBUG_KMS("failed to compute sprite lp2 wm on pipe %d\n",
 			      pipe);
 		return;
 	}
 	I915_WRITE(WM2S_LP_IVB, wm);
 
 	/* LP3 */
 	ok = sandybridge_compute_sprite_srwm(dev, pipe, sprite_width,
 					     pixel_size,
 					     &sandybridge_display_srwm_info,
 					     SNB_READ_WM3_LATENCY() * 500,
 					     &wm);
 	if (!ok) {
 		DRM_DEBUG_KMS("failed to compute sprite lp3 wm on pipe %d\n",
 			      pipe);
 		return;
 	}
 	I915_WRITE(WM3S_LP_IVB, wm);
 }
 
 /**
  * intel_update_watermarks - update FIFO watermark values based on current modes
  *
  * Calculate watermark values for the various WM regs based on current mode
  * and plane configuration.
  *
  * There are several cases to deal with here:
  *   - normal (i.e. non-self-refresh)
  *   - self-refresh (SR) mode
  *   - lines are large relative to FIFO size (buffer can hold up to 2)
  *   - lines are small relative to FIFO size (buffer can hold more than 2
  *     lines), so need to account for TLB latency
  *
  *   The normal calculation is:
  *     watermark = dotclock * bytes per pixel * latency
  *   where latency is platform & configuration dependent (we assume pessimal
  *   values here).
  *
  *   The SR calculation is:
  *     watermark = (trunc(latency/line time)+1) * surface width *
  *       bytes per pixel
  *   where
  *     line time = htotal / dotclock
  *     surface width = hdisplay for normal plane and 64 for cursor
  *   and latency is assumed to be high, as above.
  *
  * The final value programmed to the register should always be rounded up,
  * and include an extra 2 entries to account for clock crossings.
  *
  * We don't use the sprite, so we can ignore that.  And on Crestline we have
  * to set the non-SR watermarks to 8.
  */
 void intel_update_watermarks(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 
 	if (dev_priv->display.update_wm)
 		dev_priv->display.update_wm(dev);
 }
 
 void intel_update_linetime_watermarks(struct drm_device *dev,
 		int pipe, struct drm_display_mode *mode)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 
 	if (dev_priv->display.update_linetime_wm)
 		dev_priv->display.update_linetime_wm(dev, pipe, mode);
 }
 
 /*
  * Forward a sprite watermark update for the given pipe to the
  * update_sprite_wm display hook, if one is installed.
  */
 void intel_update_sprite_watermarks(struct drm_device *dev, int pipe,
 				    uint32_t sprite_width, int pixel_size)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 
 	if (dev_priv->display.update_sprite_wm == NULL)
 		return;
 
 	dev_priv->display.update_sprite_wm(dev, pipe, sprite_width,
 					   pixel_size);
 }
 
 /*
  * Allocate a 4096-byte GEM object, pin it and move it to the GTT domain.
  *
  * Must be called with the DRM lock held (asserted on entry).  The lock is
  * neither taken nor released here, so it is still held by the caller on
  * every return path, success or failure.
  *
  * Returns the object, or NULL on failure.  On failure any pin and reference
  * taken here have been dropped again.
  */
 static struct drm_i915_gem_object *
 intel_alloc_context_page(struct drm_device *dev)
 {
 	struct drm_i915_gem_object *ctx;
 	int ret;
 
 	DRM_LOCK_ASSERT(dev);
 
 	ctx = i915_gem_alloc_object(dev, 4096);
 	if (!ctx) {
 		DRM_DEBUG("failed to alloc power context, RC6 disabled\n");
 		return NULL;
 	}
 
 	ret = i915_gem_object_pin(ctx, 4096, true, false);
 	if (ret) {
 		DRM_ERROR("failed to pin power context: %d\n", ret);
 		goto err_unref;
 	}
 
 	ret = i915_gem_object_set_to_gtt_domain(ctx, 1);
 	if (ret) {
 		DRM_ERROR("failed to set-domain on power context: %d\n", ret);
 		goto err_unpin;
 	}
 
 	return ctx;
 
 err_unpin:
 	i915_gem_object_unpin(ctx);
 err_unref:
 	drm_gem_object_unreference(&ctx->base);
 	/*
 	 * Do not drop the DRM lock here: this function never acquired it,
 	 * and the other NULL return above leaves it held as well.
 	 */
 	return NULL;
 }
 
 /**
  * Lock protecting IPS related data structures.
  *
  * Set up through MTX_SYSINIT as an MTX_DEF mutex named "mchdev".
  */
 struct mtx mchdev_lock;
 MTX_SYSINIT(mchdev, &mchdev_lock, "mchdev", MTX_DEF);
 
 /* Global for IPS driver to get at the current i915 device. Protected by
  * mchdev_lock. */
 static struct drm_i915_private *i915_mch_dev;
 
 /*
  * Issue a frequency change command for "val" through MEMSWCTL.
  *
  * The caller must hold mchdev_lock.  Returns false without writing anything
  * when MEMCTL_CMD_STS is still set in MEMSWCTL, true once the command has
  * been written.
  */
 bool ironlake_set_drps(struct drm_device *dev, u8 val)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	u16 swctl;
 
 	mtx_assert(&mchdev_lock, MA_OWNED);
 
 	swctl = I915_READ16(MEMSWCTL);
 	if (swctl & MEMCTL_CMD_STS) {
 		DRM_DEBUG("gpu busy, RCS change rejected\n");
 		return false; /* still busy with another command */
 	}
 
 	/* First write the command word itself ... */
 	swctl = (MEMCTL_CMD_CHFREQ << MEMCTL_CMD_SHIFT) |
 		(val << MEMCTL_FREQ_SHIFT) | MEMCTL_SFCAVM;
 	I915_WRITE16(MEMSWCTL, swctl);
 	POSTING_READ16(MEMSWCTL);
 
 	/* ... then write it again with the status bit set. */
 	swctl |= MEMCTL_CMD_STS;
 	I915_WRITE16(MEMSWCTL, swctl);
 
 	return true;
 }
 
 /*
  * Set up Ironlake DRPS: program the evaluation intervals and thresholds,
  * record the fmax/fmin/fstart values read from MEMMODECTL in dev_priv->ips,
  * switch MEMMODECTL to software mode, request the start frequency and take
  * the initial counter and time samples.
  *
  * Everything after the initial MEMMODECTL read runs under mchdev_lock.
  */
 static void ironlake_enable_drps(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	u32 rgvmodectl = I915_READ(MEMMODECTL);
 	u8 fmax, fmin, fstart, vstart;
 
 	mtx_lock(&mchdev_lock);
 
 	/* Enable temp reporting */
 	/*
 	 * NOTE(review): both lines feed a 32-bit I915_READ into a 16-bit
 	 * I915_WRITE16; confirm the mixed access widths are intended.
 	 */
 	I915_WRITE16(PMMISC, I915_READ(PMMISC) | MCPPCE_EN);
 	I915_WRITE16(TSC1, I915_READ(TSC1) | TSE);
 
 	/* 100ms RC evaluation intervals */
 	I915_WRITE(RCUPEI, 100000);
 	I915_WRITE(RCDNEI, 100000);
 
 	/* Set max/min thresholds to 90ms and 80ms respectively */
 	I915_WRITE(RCBMAXAVG, 90000);
 	I915_WRITE(RCBMINAVG, 80000);
 
 	I915_WRITE(MEMIHYST, 1);
 
 	/* Set up min, max, and cur for interrupt handling */
 	fmax = (rgvmodectl & MEMMODE_FMAX_MASK) >> MEMMODE_FMAX_SHIFT;
 	/* fmin is taken from the masked value without a shift. */
 	fmin = (rgvmodectl & MEMMODE_FMIN_MASK);
 	fstart = (rgvmodectl & MEMMODE_FSTART_MASK) >>
 		MEMMODE_FSTART_SHIFT;
 
 	/* Look up the PX field of the PXVFREQ entry indexed by fstart. */
 	vstart = (I915_READ(PXVFREQ_BASE + (fstart * 4)) & PXVFREQ_PX_MASK) >>
 		PXVFREQ_PX_SHIFT;
 
 	dev_priv->ips.fmax = fmax; /* IPS callback will increase this */
 	dev_priv->ips.fstart = fstart;
 
 	/* max_delay and cur_delay both start out at fstart. */
 	dev_priv->ips.max_delay = fstart;
 	dev_priv->ips.min_delay = fmin;
 	dev_priv->ips.cur_delay = fstart;
 
 	DRM_DEBUG_DRIVER("fmax: %d, fmin: %d, fstart: %d\n",
 			 fmax, fmin, fstart);
 
 	I915_WRITE(MEMINTREN, MEMINT_CX_SUPR_EN | MEMINT_EVAL_CHG_EN);
 
 	/*
 	 * Interrupts will be enabled in ironlake_irq_postinstall
 	 */
 
 	I915_WRITE(VIDSTART, vstart);
 	POSTING_READ(VIDSTART);
 
 	rgvmodectl |= MEMMODE_SWMODE_EN;
 	I915_WRITE(MEMMODECTL, rgvmodectl);
 
 	/* Wait for MEMCTL_CMD_STS to clear before issuing a command. */
 	if (wait_for_atomic((I915_READ(MEMSWCTL) & MEMCTL_CMD_STS) == 0, 10))
 		DRM_ERROR("stuck trying to change perf mode\n");
 	mdelay(1);
 
 	ironlake_set_drps(dev, fstart);
 
 	/* Initial counter and timestamp samples stored in dev_priv->ips. */
 	dev_priv->ips.last_count1 = I915_READ(0x112e4) + I915_READ(0x112e8) +
 		I915_READ(0x112e0);
 	dev_priv->ips.last_time1 = jiffies_to_msecs(jiffies);
 	dev_priv->ips.last_count2 = I915_READ(0x112f4);
 	getrawmonotonic(&dev_priv->ips.last_time2);
 
 	mtx_unlock(&mchdev_lock);
 }
 
 /*
  * Shut down Ironlake DRPS: mask and acknowledge the related interrupts,
  * request the starting frequency again and set MEMCTL_CMD_STS in MEMSWCTL.
  * Runs entirely under mchdev_lock.
  */
 static void ironlake_disable_drps(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	u16 rgvswctl;
 
 	mtx_lock(&mchdev_lock);
 
 	/* Sampled before ironlake_set_drps() below rewrites the register. */
 	rgvswctl = I915_READ16(MEMSWCTL);
 
 	/* Ack interrupts, disable EFC interrupt */
 	I915_WRITE(MEMINTREN, I915_READ(MEMINTREN) & ~MEMINT_EVAL_CHG_EN);
 	I915_WRITE(MEMINTRSTS, MEMINT_EVAL_CHG);
 	I915_WRITE(DEIER, I915_READ(DEIER) & ~DE_PCU_EVENT);
 	I915_WRITE(DEIIR, DE_PCU_EVENT);
 	I915_WRITE(DEIMR, I915_READ(DEIMR) | DE_PCU_EVENT);
 
 	/* Go back to the starting frequency */
 	ironlake_set_drps(dev, dev_priv->ips.fstart);
 	mdelay(1);
 	rgvswctl |= MEMCTL_CMD_STS;
 	/*
 	 * NOTE(review): this is a 32-bit I915_WRITE, while MEMSWCTL is read
 	 * with I915_READ16 above; confirm the access width is intended.
 	 */
 	I915_WRITE(MEMSWCTL, rgvswctl);
 	mdelay(1);
 
 	mtx_unlock(&mchdev_lock);
 }
 
 /*
  * Build the value for GEN6_RP_INTERRUPT_LIMITS and clamp *val to the
  * [min_delay, max_delay] range while doing so.
  *
  * The hardware returns all 0 when GEN6_RP_INTERRUPT_LIMITS is read, so the
  * value is always computed from scratch here; a read-modify-write cycle
  * could clear every limit and leave the GPU stuck at its current frequency.
  */
 static u32 gen6_rps_limits(struct drm_i915_private *dev_priv, u8 *val)
 {
 	/* The upper limit is always present. */
 	u32 limits = dev_priv->rps.max_delay << 24;
 
 	if (*val >= dev_priv->rps.max_delay)
 		*val = dev_priv->rps.max_delay;
 
 	/*
 	 * The down limit is set only once the lowest level has been reached,
 	 * which avoids further interrupts; otherwise it stays clear.  This
 	 * sidesteps a hw race when leaving rc6: for a short window the hw
 	 * runs at the minimal clock before selecting the desired frequency,
 	 * and a down threshold expiring inside that window would not raise
 	 * a down interrupt.
 	 */
 	if (*val <= dev_priv->rps.min_delay) {
 		*val = dev_priv->rps.min_delay;
 		limits |= dev_priv->rps.min_delay << 16;
 	}
 
 	return limits;
 }
 
 /*
  * Request RPS frequency "val", clamped to the min/max delay range, and
  * reprogram the interrupt limits to match.  Nothing is written when the
  * clamped value equals the current delay.  rps.hw_lock must be held
  * exclusively.
  */
 void gen6_set_rps(struct drm_device *dev, u8 val)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	u32 irq_limits = gen6_rps_limits(dev_priv, &val);
 
 	sx_assert(&dev_priv->rps.hw_lock, SA_XLOCKED);
 	WARN_ON(val > dev_priv->rps.max_delay);
 	WARN_ON(val < dev_priv->rps.min_delay);
 
 	if (val != dev_priv->rps.cur_delay) {
 		I915_WRITE(GEN6_RPNSWREQ,
 			   GEN6_FREQUENCY(val) |
 			   GEN6_OFFSET(0) |
 			   GEN6_AGGRESSIVE_TURBO);
 
 		/*
 		 * Keep interrupts coming until the minimum or maximum
 		 * frequency has been reached.
 		 */
 		I915_WRITE(GEN6_RP_INTERRUPT_LIMITS, irq_limits);
 
 		POSTING_READ(GEN6_RPNSWREQ);
 
 		dev_priv->rps.cur_delay = val;
 	}
 }
 
 /*
  * Turn off RC states and RPS: clear GEN6_RC_CONTROL, write bit 31 to
  * GEN6_RPNSWREQ, mask and disable the PM interrupts, drop the pending
  * rps.pm_iir bits and acknowledge whatever is latched in GEN6_PMIIR.
  */
 static void gen6_disable_rps(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 
 	I915_WRITE(GEN6_RC_CONTROL, 0);
 	I915_WRITE(GEN6_RPNSWREQ, 1 << 31);
 	I915_WRITE(GEN6_PMINTRMSK, 0xffffffff);
 	I915_WRITE(GEN6_PMIER, 0);
 	/* Complete PM interrupt masking here doesn't race with the rps work
 	 * item again unmasking PM interrupts because that is using a different
 	 * register (PMIMR) to mask PM interrupts. The only risk is in leaving
 	 * stale bits in PMIIR and PMIMR which gen6_enable_rps will clean up. */
 
 	/* pm_iir is cleared under rps.lock. */
 	mtx_lock(&dev_priv->rps.lock);
 	dev_priv->rps.pm_iir = 0;
 	mtx_unlock(&dev_priv->rps.lock);
 
 	/* Write the latched bits back to GEN6_PMIIR. */
 	I915_WRITE(GEN6_PMIIR, I915_READ(GEN6_PMIIR));
 }
 
 /*
  * Decide which RC6 states to enable, as a mask of INTEL_RC6*_ENABLE bits.
  *
  * A non-negative i915_enable_rc6 setting is returned as is.  Otherwise the
  * result is 0 on gen5, RC6 only on Haswell and gen6, and RC6 plus RC6p on
  * everything else.
  */
 int intel_enable_rc6(const struct drm_device *dev)
 {
 	int rc6_mode;
 
 	/* An explicit setting takes precedence over the defaults. */
 	if (i915_enable_rc6 >= 0)
 		return i915_enable_rc6;
 
 	if (INTEL_INFO(dev)->gen == 5) {
 		/* Disable RC6 on Ironlake */
 		rc6_mode = 0;
 	} else if (IS_HASWELL(dev)) {
 		DRM_DEBUG_DRIVER("Haswell: only RC6 available\n");
 		rc6_mode = INTEL_RC6_ENABLE;
 	} else if (INTEL_INFO(dev)->gen == 6) {
 		/* snb/ivb have more than one rc6 state. */
 		DRM_DEBUG_DRIVER("Sandybridge: deep RC6 disabled\n");
 		rc6_mode = INTEL_RC6_ENABLE;
 	} else {
 		DRM_DEBUG_DRIVER("RC6 and deep RC6 enabled\n");
 		rc6_mode = (INTEL_RC6_ENABLE | INTEL_RC6p_ENABLE);
 	}
 
 	return rc6_mode;
 }
 
 static void gen6_enable_rps(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	struct intel_ring_buffer *ring;
 	u32 rp_state_cap;
 	u32 gt_perf_status;
 	u32 rc6vids, pcu_mbox, rc6_mask = 0;
 	u32 gtfifodbg;
 	int rc6_mode;
 	int i, ret;
 
 	sx_assert(&dev_priv->rps.hw_lock, SA_XLOCKED);
 
 	/* Here begins a magic sequence of register writes to enable
 	 * auto-downclocking.
 	 *
 	 * Perhaps there might be some value in exposing these to
 	 * userspace...
 	 */
 	I915_WRITE(GEN6_RC_STATE, 0);
 
 	/* Clear the DBG now so we don't confuse earlier errors */
 	if ((gtfifodbg = I915_READ(GTFIFODBG))) {
 		DRM_ERROR("GT fifo had a previous error %x\n", gtfifodbg);
 		I915_WRITE(GTFIFODBG, gtfifodbg);
 	}
 
 	gen6_gt_force_wake_get(dev_priv);
 
 	rp_state_cap = I915_READ(GEN6_RP_STATE_CAP);
 	gt_perf_status = I915_READ(GEN6_GT_PERF_STATUS);
 
 	/* In units of 100MHz */
 	dev_priv->rps.max_delay = rp_state_cap & 0xff;
 	dev_priv->rps.min_delay = (rp_state_cap & 0xff0000) >> 16;
 	dev_priv->rps.cur_delay = 0;
 
 	/* disable the counters and set deterministic thresholds */
 	I915_WRITE(GEN6_RC_CONTROL, 0);
 
 	I915_WRITE(GEN6_RC1_WAKE_RATE_LIMIT, 1000 << 16);
 	I915_WRITE(GEN6_RC6_WAKE_RATE_LIMIT, 40 << 16 | 30);
 	I915_WRITE(GEN6_RC6pp_WAKE_RATE_LIMIT, 30);
 	I915_WRITE(GEN6_RC_EVALUATION_INTERVAL, 125000);
 	I915_WRITE(GEN6_RC_IDLE_HYSTERSIS, 25);
 
 	for_each_ring(ring, dev_priv, i)
 		I915_WRITE(RING_MAX_IDLE(ring->mmio_base), 10);
 
 	I915_WRITE(GEN6_RC_SLEEP, 0);
 	I915_WRITE(GEN6_RC1e_THRESHOLD, 1000);
 	I915_WRITE(GEN6_RC6_THRESHOLD, 50000);
 	I915_WRITE(GEN6_RC6p_THRESHOLD, 150000);
 	I915_WRITE(GEN6_RC6pp_THRESHOLD, 64000); /* unused */
 
 	/* Check if we are enabling RC6 */
 	rc6_mode = intel_enable_rc6(dev_priv->dev);
 	if (rc6_mode & INTEL_RC6_ENABLE)
 		rc6_mask |= GEN6_RC_CTL_RC6_ENABLE;
 
 	/* We don't use those on Haswell */
 	if (!IS_HASWELL(dev)) {
 		if (rc6_mode & INTEL_RC6p_ENABLE)
 			rc6_mask |= GEN6_RC_CTL_RC6p_ENABLE;
 
 		if (rc6_mode & INTEL_RC6pp_ENABLE)
 			rc6_mask |= GEN6_RC_CTL_RC6pp_ENABLE;
 	}
 
 	DRM_INFO("Enabling RC6 states: RC6 %s, RC6p %s, RC6pp %s\n",
 			(rc6_mask & GEN6_RC_CTL_RC6_ENABLE) ? "on" : "off",
 			(rc6_mask & GEN6_RC_CTL_RC6p_ENABLE) ? "on" : "off",
 			(rc6_mask & GEN6_RC_CTL_RC6pp_ENABLE) ? "on" : "off");
 
 	I915_WRITE(GEN6_RC_CONTROL,
 		   rc6_mask |
 		   GEN6_RC_CTL_EI_MODE(1) |
 		   GEN6_RC_CTL_HW_ENABLE);
 
 	I915_WRITE(GEN6_RPNSWREQ,
 		   GEN6_FREQUENCY(10) |
 		   GEN6_OFFSET(0) |
 		   GEN6_AGGRESSIVE_TURBO);
 	I915_WRITE(GEN6_RC_VIDEO_FREQ,
 		   GEN6_FREQUENCY(12));
 
 	I915_WRITE(GEN6_RP_DOWN_TIMEOUT, 1000000);
 	I915_WRITE(GEN6_RP_INTERRUPT_LIMITS,
 		   dev_priv->rps.max_delay << 24 |
 		   dev_priv->rps.min_delay << 16);
 
 	I915_WRITE(GEN6_RP_UP_THRESHOLD, 59400);
 	I915_WRITE(GEN6_RP_DOWN_THRESHOLD, 245000);
 	I915_WRITE(GEN6_RP_UP_EI, 66000);
 	I915_WRITE(GEN6_RP_DOWN_EI, 350000);
 
 	I915_WRITE(GEN6_RP_IDLE_HYSTERSIS, 10);
 	I915_WRITE(GEN6_RP_CONTROL,
 		   GEN6_RP_MEDIA_TURBO |
 		   GEN6_RP_MEDIA_HW_NORMAL_MODE |
 		   GEN6_RP_MEDIA_IS_GFX |
 		   GEN6_RP_ENABLE |
 		   GEN6_RP_UP_BUSY_AVG |
 		   (IS_HASWELL(dev) ? GEN7_RP_DOWN_IDLE_AVG : GEN6_RP_DOWN_IDLE_CONT));
 
 	ret = sandybridge_pcode_write(dev_priv, GEN6_PCODE_WRITE_MIN_FREQ_TABLE, 0);
 	if (!ret) {
 		pcu_mbox = 0;
 		ret = sandybridge_pcode_read(dev_priv, GEN6_READ_OC_PARAMS, &pcu_mbox);
 		if (ret && pcu_mbox & (1<<31)) { /* OC supported */
 			dev_priv->rps.max_delay = pcu_mbox & 0xff;
 			DRM_DEBUG_DRIVER("overclocking supported, adjusting frequency max to %dMHz\n", pcu_mbox * 50);
 		}
 	} else {
 		DRM_DEBUG_DRIVER("Failed to set the min frequency\n");
 	}
 
 	gen6_set_rps(dev_priv->dev, (gt_perf_status & 0xff00) >> 8);
 
 	/* requires MSI enabled */
 	I915_WRITE(GEN6_PMIER, GEN6_PM_DEFERRED_EVENTS);
 	mtx_lock(&dev_priv->rps.lock);
 	WARN_ON(dev_priv->rps.pm_iir != 0);
 	I915_WRITE(GEN6_PMIMR, 0);
 	mtx_unlock(&dev_priv->rps.lock);
 	/* enable all PM interrupts */
 	I915_WRITE(GEN6_PMINTRMSK, 0);
 
 	rc6vids = 0;
 	ret = sandybridge_pcode_read(dev_priv, GEN6_PCODE_READ_RC6VIDS, &rc6vids);
 	if (IS_GEN6(dev) && ret) {
 		DRM_DEBUG_DRIVER("Couldn't check for BIOS workaround\n");
 	} else if (IS_GEN6(dev) && (GEN6_DECODE_RC6_VID(rc6vids & 0xff) < 450)) {
 		DRM_DEBUG_DRIVER("You should update your BIOS. Correcting minimum rc6 voltage (%dmV->%dmV)\n",
 			  GEN6_DECODE_RC6_VID(rc6vids & 0xff), 450);
 		rc6vids &= 0xffff00;
 		rc6vids |= GEN6_ENCODE_RC6_VID(450);
 		ret = sandybridge_pcode_write(dev_priv, GEN6_PCODE_WRITE_RC6VIDS, rc6vids);
 		if (ret)
 			DRM_ERROR("Couldn't fix incorrect rc6 voltage\n");
 	}
 
 	gen6_gt_force_wake_put(dev_priv);
 }
 
 /*
  * Fill the PCU min-frequency table: for every GPU frequency step from
  * rps.max_delay down to rps.min_delay, tell the PCU which IA frequency it
  * should use as the reference when choosing the ring frequency.
  *
  * The caller must hold rps.hw_lock exclusively.
  */
 static void gen6_update_ring_freq(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	const int ring_floor = 15;
 	const int step_scale = 180;
 	unsigned int ia_freq, max_ia_freq;
 	int gpu_freq;
 
 	sx_assert(&dev_priv->rps.hw_lock, SA_XLOCKED);
 
 #ifdef FREEBSD_WIP
 	max_ia_freq = cpufreq_quick_get_max(0);
 	/*
 	 * Default to measured freq if none found, PCU will ensure we don't go
 	 * over
 	 */
 	if (!max_ia_freq)
 		max_ia_freq = tsc_khz;
 #else
 	/* Use the TSC frequency, scaled down by 1000, as the kHz value. */
 	max_ia_freq = atomic_load_acq_64(&tsc_freq) / 1000;
 #endif /* FREEBSD_WIP */
 
 	/* Convert from kHz to MHz */
 	max_ia_freq /= 1000;
 
 	/*
 	 * Walk the GPU frequencies from highest to lowest and load the
 	 * matching IA reference frequency for each one.
 	 */
 	for (gpu_freq = dev_priv->rps.max_delay;
 	     gpu_freq >= dev_priv->rps.min_delay; gpu_freq--) {
 		int steps_down = dev_priv->rps.max_delay - gpu_freq;
 
 		/*
 		 * Below the floor (15 steps, i.e. 750MHz) just use the
 		 * lowest ring frequency; otherwise scale down from the
 		 * maximum IA frequency.
 		 */
 		ia_freq = (gpu_freq < ring_floor) ?
 		    800 : max_ia_freq - ((steps_down * step_scale) / 2);
 		ia_freq = DIV_ROUND_CLOSEST(ia_freq, 100);
 		ia_freq <<= GEN6_PCODE_FREQ_IA_RATIO_SHIFT;
 
 		sandybridge_pcode_write(dev_priv,
 					GEN6_PCODE_WRITE_MIN_FREQ_TABLE,
 					ia_freq | gpu_freq);
 	}
 }
 
 /*
  * Release the RC6 context pages, if any: unpin each object, drop its GEM
  * reference and clear the cached pointer so a later setup allocates anew.
  */
 void ironlake_teardown_rc6(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 
 	/* Render context page. */
 	if (dev_priv->ips.renderctx != NULL) {
 		i915_gem_object_unpin(dev_priv->ips.renderctx);
 		drm_gem_object_unreference(&dev_priv->ips.renderctx->base);
 		dev_priv->ips.renderctx = NULL;
 	}
 
 	/* Power context page. */
 	if (dev_priv->ips.pwrctx != NULL) {
 		i915_gem_object_unpin(dev_priv->ips.pwrctx);
 		drm_gem_object_unreference(&dev_priv->ips.pwrctx->base);
 		dev_priv->ips.pwrctx = NULL;
 	}
 }
 
 /*
  * Turn RC6 off on Ironlake.  Nothing is done when PWRCTXA reads back as
  * zero.
  */
 static void ironlake_disable_rc6(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 
 	if (!I915_READ(PWRCTXA))
 		return;
 
 	/* Wake the GPU and wait (up to 50) for it to report "on". */
 	I915_WRITE(RSTDBYCTL, I915_READ(RSTDBYCTL) | RCX_SW_EXIT);
 	wait_for(((I915_READ(RSTDBYCTL) & RSX_STATUS_MASK) == RSX_STATUS_ON),
 		 50);
 
 	/* Prevent RC6 by clearing the power context register. */
 	I915_WRITE(PWRCTXA, 0);
 	POSTING_READ(PWRCTXA);
 
 	/* Restore RSTDBYCTL. */
 	I915_WRITE(RSTDBYCTL, I915_READ(RSTDBYCTL) & ~RCX_SW_EXIT);
 	POSTING_READ(RSTDBYCTL);
 }
 
 /*
  * Make sure both RC6 context pages exist, allocating whichever is missing.
  *
  * Returns 0 on success or -ENOMEM if an allocation fails.  When the power
  * context allocation fails, any pages already held are torn down.
  */
 static int ironlake_setup_rc6(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 
 	if (dev_priv->ips.renderctx == NULL) {
 		dev_priv->ips.renderctx = intel_alloc_context_page(dev);
 		if (dev_priv->ips.renderctx == NULL)
 			return -ENOMEM;
 	}
 
 	if (dev_priv->ips.pwrctx == NULL) {
 		dev_priv->ips.pwrctx = intel_alloc_context_page(dev);
 		if (dev_priv->ips.pwrctx == NULL) {
 			ironlake_teardown_rc6(dev);
 			return -ENOMEM;
 		}
 	}
 
 	return 0;
 }
 
 /*
  * Enable RC6 on Ironlake: allocate the context pages, have the render ring
  * execute an MI_SET_CONTEXT pointing at the render context page, wait for
  * the ring to idle, then program the power context register.
  *
  * Any failure along the way releases the context pages and leaves RC6 off.
  */
 static void ironlake_enable_rc6(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	struct intel_ring_buffer *ring = &dev_priv->ring[RCS];
 	bool saved_interruptible;
 	int err;
 
 	/* rc6 disabled by default due to repeated reports of hanging during
 	 * boot and resume.
 	 */
 	if (!intel_enable_rc6(dev))
 		return;
 
 	DRM_LOCK_ASSERT(dev);
 
 	if (ironlake_setup_rc6(dev) != 0)
 		return;
 
 	/* Ring waits below run with mm.interruptible cleared. */
 	saved_interruptible = dev_priv->mm.interruptible;
 	dev_priv->mm.interruptible = false;
 
 	/*
 	 * GPU can automatically power down the render unit if given a page
 	 * to save state.
 	 */
 	err = intel_ring_begin(ring, 6);
 	if (err != 0) {
 		ironlake_teardown_rc6(dev);
 		dev_priv->mm.interruptible = saved_interruptible;
 		return;
 	}
 
 	intel_ring_emit(ring, MI_SUSPEND_FLUSH | MI_SUSPEND_FLUSH_EN);
 	intel_ring_emit(ring, MI_SET_CONTEXT);
 	intel_ring_emit(ring, dev_priv->ips.renderctx->gtt_offset |
 			MI_MM_SPACE_GTT |
 			MI_SAVE_EXT_STATE_EN |
 			MI_RESTORE_EXT_STATE_EN |
 			MI_RESTORE_INHIBIT);
 	intel_ring_emit(ring, MI_SUSPEND_FLUSH);
 	intel_ring_emit(ring, MI_NOOP);
 	intel_ring_emit(ring, MI_FLUSH);
 	intel_ring_advance(ring);
 
 	/*
 	 * Wait for the command parser to advance past MI_SET_CONTEXT. The HW
 	 * does an implicit flush, combined with MI_FLUSH above, it should be
 	 * safe to assume that renderctx is valid
 	 */
 	err = intel_ring_idle(ring);
 	dev_priv->mm.interruptible = saved_interruptible;
 	if (err != 0) {
 		DRM_ERROR("failed to enable ironlake power power savings\n");
 		ironlake_teardown_rc6(dev);
 		return;
 	}
 
 	I915_WRITE(PWRCTXA, dev_priv->ips.pwrctx->gtt_offset | PWRCTX_EN);
 	I915_WRITE(RSTDBYCTL, I915_READ(RSTDBYCTL) & ~RCX_SW_EXIT);
 }
 
 /*
  * Decode a PXVFREQ register value into a frequency.
  *
  * The divider is taken from bits 21:16, the post-divider exponent from
  * bits 13:12 and the pre-divider from bits 2:0.  The result is
  * div * 133333 / ((1 << post) * pre), or 0 when the pre-divider is zero.
  */
 static unsigned long intel_pxfreq(u32 vidfreq)
 {
 	int divider = (vidfreq & 0x3f0000) >> 16;
 	int post_shift = (vidfreq & 0x3000) >> 12;
 	int pre_div = (vidfreq & 0x7);
 
 	/* A zero pre-divider would divide by zero below. */
 	if (pre_div == 0)
 		return 0;
 
 	return ((divider * 133333) / ((1 << post_shift) * pre_div));
 }
 
 /*
  * Coefficient table consulted by __i915_chipset_val().  The row whose i
  * and t fields equal ips.c_m and ips.r_t supplies the multiplier (m) and
  * the additive constant (c) applied to the measured counter rate.
  */
 static const struct cparams {
 	u16 i;	/* matched against ips.c_m */
 	u16 t;	/* matched against ips.r_t */
 	u16 m;	/* multiplier applied to the counter rate */
 	u16 c;	/* constant added after the multiplication */
 } cparams[] = {
 	{ 1, 1333, 301, 28664 },
 	{ 1, 1066, 294, 24460 },
 	{ 1, 800, 294, 25192 },
 	{ 0, 1333, 276, 27605 },
 	{ 0, 1066, 276, 27605 },
 	{ 0, 800, 231, 23784 },
 };
 
 /*
  * Compute the chipset power estimate from the DMIEC, DDREC and CSIEC
  * counters.
  *
  * The summed counter delta since the previous sample is divided by the
  * elapsed milliseconds, run through the matching cparams[] row
  * (m * rate + c) and divided by 10.  The result is cached in
  * ips.chipset_power; when called again within 10ms the cached value is
  * returned without touching the hardware.
  *
  * The caller must hold mchdev_lock.
  */
 static unsigned long __i915_chipset_val(struct drm_i915_private *dev_priv)
 {
 	u64 sum, delta, power;
 	u32 dmi, ddr, csi, slope = 0, offset = 0;
 	unsigned long now_ms = jiffies_to_msecs(jiffies), elapsed_ms;
 	int idx;
 
 	mtx_assert(&mchdev_lock, MA_OWNED);
 
 	elapsed_ms = now_ms - dev_priv->ips.last_time1;
 
 	/* Prevent division-by-zero if we are asking too fast.
 	 * Also, we don't get interesting results if we are polling
 	 * faster than once in 10ms, so just return the saved value
 	 * in such cases.
 	 */
 	if (elapsed_ms <= 10)
 		return dev_priv->ips.chipset_power;
 
 	dmi = I915_READ(DMIEC);
 	ddr = I915_READ(DDREC);
 	csi = I915_READ(CSIEC);
 
 	sum = dmi + ddr + csi;
 
 	/* FIXME: handle per-counter overflow */
 	if (sum >= dev_priv->ips.last_count1) {
 		delta = sum - dev_priv->ips.last_count1;
 	} else {
 		/* The sum went backwards: treat it as a wrap. */
 		delta = ~0UL - dev_priv->ips.last_count1;
 		delta += sum;
 	}
 
 	/* Pick the coefficients for this c_m / r_t pair (0, 0 if none). */
 	for (idx = 0; idx < ARRAY_SIZE(cparams); idx++) {
 		if (cparams[idx].i != dev_priv->ips.c_m ||
 		    cparams[idx].t != dev_priv->ips.r_t)
 			continue;
 		slope = cparams[idx].m;
 		offset = cparams[idx].c;
 		break;
 	}
 
 	delta = div_u64(delta, elapsed_ms);
 	power = ((slope * delta) + offset);
 	power = div_u64(power, 10);
 
 	dev_priv->ips.last_count1 = sum;
 	dev_priv->ips.last_time1 = now_ms;
 
 	dev_priv->ips.chipset_power = power;
 
 	return power;
 }
 
 /*
  * Locked wrapper around __i915_chipset_val().  Returns 0 on anything other
  * than gen 5 hardware.
  */
 unsigned long i915_chipset_val(struct drm_i915_private *dev_priv)
 {
 	unsigned long power = 0;
 
 	if (dev_priv->info->gen == 5) {
 		mtx_lock(&mchdev_lock);
 		power = __i915_chipset_val(dev_priv);
 		mtx_unlock(&mchdev_lock);
 	}
 
 	return power;
 }
 
 /*
  * Combine the slope and intercept fields of TSFS with the byte read from
  * I915_TR1: the result is (slope * reading) / 127 - intercept.
  */
 unsigned long i915_mch_val(struct drm_i915_private *dev_priv)
 {
 	unsigned long slope, reading, intercept;
 	u32 tsfs;
 
 	tsfs = I915_READ(TSFS);
 
 	slope = ((tsfs & TSFS_SLOPE_MASK) >> TSFS_SLOPE_SHIFT);
 	reading = I915_READ8(I915_TR1);
 
 	intercept = tsfs & TSFS_INTR_MASK;
 
 	return ((slope * reading) / 127) - intercept;
 }
 
 static u16 pvid_to_extvid(struct drm_i915_private *dev_priv, u8 pxvid)
 {
 	static const struct v_table {
 		u16 vd; /* in .1 mil */
 		u16 vm; /* in .1 mil */
 	} v_table[] = {
 		{ 0, 0, },
 		{ 375, 0, },
 		{ 500, 0, },
 		{ 625, 0, },
 		{ 750, 0, },
 		{ 875, 0, },
 		{ 1000, 0, },
 		{ 1125, 0, },
 		{ 4125, 3000, },
 		{ 4125, 3000, },
 		{ 4125, 3000, },
 		{ 4125, 3000, },
 		{ 4125, 3000, },
 		{ 4125, 3000, },
 		{ 4125, 3000, },
 		{ 4125, 3000, },
 		{ 4125, 3000, },
 		{ 4125, 3000, },
 		{ 4125, 3000, },
 		{ 4125, 3000, },
 		{ 4125, 3000, },
 		{ 4125, 3000, },
 		{ 4125, 3000, },
 		{ 4125, 3000, },
 		{ 4125, 3000, },
 		{ 4125, 3000, },
 		{ 4125, 3000, },
 		{ 4125, 3000, },
 		{ 4125, 3000, },
 		{ 4125, 3000, },
 		{ 4125, 3000, },
 		{ 4125, 3000, },
 		{ 4250, 3125, },
 		{ 4375, 3250, },
 		{ 4500, 3375, },
 		{ 4625, 3500, },
 		{ 4750, 3625, },
 		{ 4875, 3750, },
 		{ 5000, 3875, },
 		{ 5125, 4000, },
 		{ 5250, 4125, },
 		{ 5375, 4250, },
 		{ 5500, 4375, },
 		{ 5625, 4500, },
 		{ 5750, 4625, },
 		{ 5875, 4750, },
 		{ 6000, 4875, },
 		{ 6125, 5000, },
 		{ 6250, 5125, },
 		{ 6375, 5250, },
 		{ 6500, 5375, },
 		{ 6625, 5500, },
 		{ 6750, 5625, },
 		{ 6875, 5750, },
 		{ 7000, 5875, },
 		{ 7125, 6000, },
 		{ 7250, 6125, },
 		{ 7375, 6250, },
 		{ 7500, 6375, },
 		{ 7625, 6500, },
 		{ 7750, 6625, },
 		{ 7875, 6750, },
 		{ 8000, 6875, },
 		{ 8125, 7000, },
 		{ 8250, 7125, },
 		{ 8375, 7250, },
 		{ 8500, 7375, },
 		{ 8625, 7500, },
 		{ 8750, 7625, },
 		{ 8875, 7750, },
 		{ 9000, 7875, },
 		{ 9125, 8000, },
 		{ 9250, 8125, },
 		{ 9375, 8250, },
 		{ 9500, 8375, },
 		{ 9625, 8500, },
 		{ 9750, 8625, },
 		{ 9875, 8750, },
 		{ 10000, 8875, },
 		{ 10125, 9000, },
 		{ 10250, 9125, },
 		{ 10375, 9250, },
 		{ 10500, 9375, },
 		{ 10625, 9500, },
 		{ 10750, 9625, },
 		{ 10875, 9750, },
 		{ 11000, 9875, },
 		{ 11125, 10000, },
 		{ 11250, 10125, },
 		{ 11375, 10250, },
 		{ 11500, 10375, },
 		{ 11625, 10500, },
 		{ 11750, 10625, },
 		{ 11875, 10750, },
 		{ 12000, 10875, },
 		{ 12125, 11000, },
 		{ 12250, 11125, },
 		{ 12375, 11250, },
 		{ 12500, 11375, },
 		{ 12625, 11500, },
 		{ 12750, 11625, },
 		{ 12875, 11750, },
 		{ 13000, 11875, },
 		{ 13125, 12000, },
 		{ 13250, 12125, },
 		{ 13375, 12250, },
 		{ 13500, 12375, },
 		{ 13625, 12500, },
 		{ 13750, 12625, },
 		{ 13875, 12750, },
 		{ 14000, 12875, },
 		{ 14125, 13000, },
 		{ 14250, 13125, },
 		{ 14375, 13250, },
 		{ 14500, 13375, },
 		{ 14625, 13500, },
 		{ 14750, 13625, },
 		{ 14875, 13750, },
 		{ 15000, 13875, },
 		{ 15125, 14000, },
 		{ 15250, 14125, },
 		{ 15375, 14250, },
 		{ 15500, 14375, },
 		{ 15625, 14500, },
 		{ 15750, 14625, },
 		{ 15875, 14750, },
 		{ 16000, 14875, },
 		{ 16125, 15000, },
 	};
 	if (dev_priv->info->is_mobile)
 		return v_table[pxvid].vm;
 	else
 		return v_table[pxvid].vd;
 }
 
 /*
  * Refresh ips.gfx_power from the GFXEC counter.
  *
  * The counter delta since the previous sample is scaled by 1181 and divided
  * by ten times the elapsed milliseconds.  ips.last_count2 and
  * ips.last_time2 are updated to the new sample.  Nothing is changed when
  * less than one millisecond has passed since the previous sample.
  *
  * The caller must hold mchdev_lock.
  */
 static void __i915_update_gfx_val(struct drm_i915_private *dev_priv)
 {
 	struct timespec now, diff1;
 	u64 diff;
 	unsigned long diffms;
 	u32 count;
 
 	mtx_assert(&mchdev_lock, MA_OWNED);
 
 	/* diff1 = time elapsed since the previous sample. */
 	nanotime(&now);
-	diff1 = now;
-	timespecsub(&diff1, &dev_priv->ips.last_time2);
+	timespecsub(&now, &dev_priv->ips.last_time2, &diff1);
 
 	/* Don't divide by 0 */
 	diffms = diff1.tv_sec * 1000 + diff1.tv_nsec / 1000000;
 	if (!diffms)
 		return;
 
 	count = I915_READ(GFXEC);
 
 	/* A reading below the previous one is treated as a counter wrap. */
 	if (count < dev_priv->ips.last_count2) {
 		diff = ~0UL - dev_priv->ips.last_count2;
 		diff += count;
 	} else {
 		diff = count - dev_priv->ips.last_count2;
 	}
 
 	dev_priv->ips.last_count2 = count;
 	dev_priv->ips.last_time2 = now;
 
 	/* More magic constants... */
 	diff = diff * 1181;
 	diff = div_u64(diff, diffms * 10);
 	dev_priv->ips.gfx_power = diff;
 }
 
 /*
  * Locked wrapper around __i915_update_gfx_val().  Does nothing on anything
  * other than gen 5 hardware.
  */
 void i915_update_gfx_val(struct drm_i915_private *dev_priv)
 {
 	if (dev_priv->info->gen == 5) {
 		mtx_lock(&mchdev_lock);
 		__i915_update_gfx_val(dev_priv);
 		mtx_unlock(&mchdev_lock);
 	}
 }
 
 /*
  * Compute the graphics power estimate.
  *
  * The PXVID code of the current P-state (rps.cur_delay) is translated with
  * pvid_to_extvid(), a correction factor is chosen from the i915_mch_val()
  * reading, and the resulting term is added to ips.gfx_power after that
  * value has been refreshed.
  *
  * The caller must hold mchdev_lock.
  */
 static unsigned long __i915_gfx_val(struct drm_i915_private *dev_priv)
 {
 	unsigned long reading, factor, volt, scaled, extra;
 	u32 vid;
 
 	mtx_assert(&mchdev_lock, MA_OWNED);
 
 	/* The PXVID code is in bits 30:24 of the P-state register. */
 	vid = I915_READ(PXVFREQ_BASE + (dev_priv->rps.cur_delay * 4));
 	vid = (vid >> 24) & 0x7f;
 	volt = pvid_to_extvid(dev_priv, vid);
 
 	reading = i915_mch_val(dev_priv);
 
 	/* Revel in the empirically derived constants */
 
 	/* Correction factor in 1/100000 units */
 	if (reading < 50)
 		factor = ((reading * 301) + 1004);
 	else if (reading <= 80)
 		factor = ((reading * 964) + 29317);
 	else /* > 80 */
 		factor = ((reading * 2349) + 135940);
 
 	factor = factor * ((150142 * volt) / 10000 - 78642);
 	factor /= 100000;
 	scaled = (factor * dev_priv->ips.corr);
 
 	extra = (scaled * volt) / 10000;
 	extra /= 100; /* convert to mW */
 
 	__i915_update_gfx_val(dev_priv);
 
 	return dev_priv->ips.gfx_power + extra;
 }
 
 /*
  * Locked wrapper around __i915_gfx_val().  Returns 0 on anything other
  * than gen 5 hardware.
  */
 unsigned long i915_gfx_val(struct drm_i915_private *dev_priv)
 {
 	unsigned long power = 0;
 
 	if (dev_priv->info->gen == 5) {
 		mtx_lock(&mchdev_lock);
 		power = __i915_gfx_val(dev_priv);
 		mtx_unlock(&mchdev_lock);
 	}
 
 	return power;
 }
 
 /**
  * i915_read_mch_val - return value for IPS use
  *
  * Calculate and return a value for the IPS driver to use when deciding whether
  * we have thermal and power headroom to increase CPU or GPU power budget.
  */
 unsigned long i915_read_mch_val(void)
 {
 	struct drm_i915_private *dev_priv;
 	unsigned long chipset_val, graphics_val, ret = 0;
 
 	mtx_lock(&mchdev_lock);
 	if (!i915_mch_dev)
 		goto out_unlock;
 	dev_priv = i915_mch_dev;
 
 	chipset_val = __i915_chipset_val(dev_priv);
 	graphics_val = __i915_gfx_val(dev_priv);
 
 	ret = chipset_val + graphics_val;
 
 out_unlock:
 	mtx_unlock(&mchdev_lock);
 
 	return ret;
 }
 EXPORT_SYMBOL_GPL(i915_read_mch_val);
 
 /**
  * i915_gpu_raise - raise GPU frequency limit
  *
  * Raise the limit; IPS indicates we have thermal headroom.
  */
 bool i915_gpu_raise(void)
 {
 	struct drm_i915_private *dev_priv;
 	bool ret = true;
 
 	mtx_lock(&mchdev_lock);
 	if (!i915_mch_dev) {
 		ret = false;
 		goto out_unlock;
 	}
 	dev_priv = i915_mch_dev;
 
 	if (dev_priv->ips.max_delay > dev_priv->ips.fmax)
 		dev_priv->ips.max_delay--;
 
 out_unlock:
 	mtx_unlock(&mchdev_lock);
 
 	return ret;
 }
 EXPORT_SYMBOL_GPL(i915_gpu_raise);
 
 /**
  * i915_gpu_lower - lower GPU frequency limit
  *
  * IPS indicates we're close to a thermal limit, so throttle back the GPU
  * frequency maximum.
  */
 bool i915_gpu_lower(void)
 {
 	struct drm_i915_private *dev_priv;
 	bool ret = true;
 
 	mtx_lock(&mchdev_lock);
 	if (!i915_mch_dev) {
 		ret = false;
 		goto out_unlock;
 	}
 	dev_priv = i915_mch_dev;
 
 	if (dev_priv->ips.max_delay < dev_priv->ips.min_delay)
 		dev_priv->ips.max_delay++;
 
 out_unlock:
 	mtx_unlock(&mchdev_lock);
 
 	return ret;
 }
 EXPORT_SYMBOL_GPL(i915_gpu_lower);
 
 /**
  * i915_gpu_busy - indicate GPU business to IPS
  *
  * Tell the IPS driver whether or not the GPU is busy.
  */
 bool i915_gpu_busy(void)
 {
 	struct drm_i915_private *dev_priv;
 	struct intel_ring_buffer *ring;
 	bool ret = false;
 	int i;
 
 	mtx_lock(&mchdev_lock);
 	if (!i915_mch_dev)
 		goto out_unlock;
 	dev_priv = i915_mch_dev;
 
 	for_each_ring(ring, dev_priv, i)
 		ret |= !list_empty(&ring->request_list);
 
 out_unlock:
 	mtx_unlock(&mchdev_lock);
 
 	return ret;
 }
 EXPORT_SYMBOL_GPL(i915_gpu_busy);
 
 /**
  * i915_gpu_turbo_disable - disable graphics turbo
  *
  * Disable graphics turbo by resetting the max frequency and setting the
  * current frequency to the default.
  */
 bool i915_gpu_turbo_disable(void)
 {
 	struct drm_i915_private *dev_priv;
 	bool ret = true;
 
 	mtx_lock(&mchdev_lock);
 	if (!i915_mch_dev) {
 		ret = false;
 		goto out_unlock;
 	}
 	dev_priv = i915_mch_dev;
 
 	dev_priv->ips.max_delay = dev_priv->ips.fstart;
 
 	if (!ironlake_set_drps(dev_priv->dev, dev_priv->ips.fstart))
 		ret = false;
 
 out_unlock:
 	mtx_unlock(&mchdev_lock);
 
 	return ret;
 }
 EXPORT_SYMBOL_GPL(i915_gpu_turbo_disable);
 
 #ifdef FREEBSD_WIP
 /**
  * Notify the intel_ips driver, if it was loaded first, that the i915
  * driver is now present.
  *
  * The hook is looked up by symbol at run time so that neither module has
  * to depend on the other for IPS to communicate GPU turbo limits to i915.
  */
 static void
 ips_ping_for_i915_load(void)
 {
 	void (*link)(void);
 
 	link = symbol_get(ips_link_to_i915_driver);
 	if (!link)
 		return;
 
 	link();
 	symbol_put(ips_link_to_i915_driver);
 }
 #endif /* FREEBSD_WIP */
 
 /*
  * Publish dev_priv as the device used by the IPS entry points by storing
  * it in i915_mch_dev under mchdev_lock.
  */
 void intel_gpu_ips_init(struct drm_i915_private *dev_priv)
 {
 	/* We only register the i915 ips part with intel-ips once everything is
 	 * set up, to avoid intel-ips sneaking in and reading bogus values. */
 	mtx_lock(&mchdev_lock);
 	i915_mch_dev = dev_priv;
 	mtx_unlock(&mchdev_lock);
 
 	/* The notification to intel-ips is only built under FREEBSD_WIP. */
 #ifdef FREEBSD_WIP
 	ips_ping_for_i915_load();
 #endif /* FREEBSD_WIP */
 }
 
 /*
  * Unregister the device from the IPS entry points by clearing i915_mch_dev
  * under mchdev_lock; those entry points check for NULL before using it.
  */
 void intel_gpu_ips_teardown(void)
 {
 	mtx_lock(&mchdev_lock);
 	i915_mch_dev = NULL;
 	mtx_unlock(&mchdev_lock);
 }
 static void intel_init_emon(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	u32 lcfuse;
 	u8 pxw[16];
 	int i;
 
 	/* Disable to program */
 	I915_WRITE(ECR, 0);
 	POSTING_READ(ECR);
 
 	/* Program energy weights for various events */
 	I915_WRITE(SDEW, 0x15040d00);
 	I915_WRITE(CSIEW0, 0x007f0000);
 	I915_WRITE(CSIEW1, 0x1e220004);
 	I915_WRITE(CSIEW2, 0x04000004);
 
 	for (i = 0; i < 5; i++)
 		I915_WRITE(PEW + (i * 4), 0);
 	for (i = 0; i < 3; i++)
 		I915_WRITE(DEW + (i * 4), 0);
 
 	/* Program P-state weights to account for frequency power adjustment */
 	for (i = 0; i < 16; i++) {
 		u32 pxvidfreq = I915_READ(PXVFREQ_BASE + (i * 4));
 		unsigned long freq = intel_pxfreq(pxvidfreq);
 		unsigned long vid = (pxvidfreq & PXVFREQ_PX_MASK) >>
 			PXVFREQ_PX_SHIFT;
 		unsigned long val;
 
 		val = vid * vid;
 		val *= (freq / 1000);
 		val *= 255;
 		val /= (127*127*900);
 		if (val > 0xff)
 			DRM_ERROR("bad pxval: %ld\n", val);
 		pxw[i] = val;
 	}
 	/* Render standby states get 0 weight */
 	pxw[14] = 0;
 	pxw[15] = 0;
 
 	for (i = 0; i < 4; i++) {
 		u32 val = (pxw[i*4] << 24) | (pxw[(i*4)+1] << 16) |
 			(pxw[(i*4)+2] << 8) | (pxw[(i*4)+3]);
 		I915_WRITE(PXW + (i * 4), val);
 	}
 
 	/* Adjust magic regs to magic values (more experimental results) */
 	I915_WRITE(OGW0, 0);
 	I915_WRITE(OGW1, 0);
 	I915_WRITE(EG0, 0x00007f00);
 	I915_WRITE(EG1, 0x0000000e);
 	I915_WRITE(EG2, 0x000e0000);
 	I915_WRITE(EG3, 0x68000300);
 	I915_WRITE(EG4, 0x42000000);
 	I915_WRITE(EG5, 0x00140031);
 	I915_WRITE(EG6, 0);
 	I915_WRITE(EG7, 0);
 
 	for (i = 0; i < 8; i++)
 		I915_WRITE(PXWL + (i * 4), 0);
 
 	/* Enable PMON + select events */
 	I915_WRITE(ECR, 0x80000019);
 
 	lcfuse = I915_READ(LCFUSE02);
 
 	dev_priv->ips.corr = (lcfuse & LCFUSE_HIV_MASK);
 }
 
 /*
  * Turn GT power saving off.  Ironlake-M disables DRPS and RC6; gen6+
  * (other than Valleyview) cancels any pending delayed resume work and
  * disables RPS under rps.hw_lock.  Other hardware is left untouched.
  */
 void intel_disable_gt_powersave(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 
 	if (IS_IRONLAKE_M(dev)) {
 		ironlake_disable_drps(dev);
 		ironlake_disable_rc6(dev);
 		return;
 	}
 
 	if (INTEL_INFO(dev)->gen < 6 || IS_VALLEYVIEW(dev))
 		return;
 
 	taskqueue_cancel_timeout(dev_priv->wq, &dev_priv->rps.delayed_resume_work, NULL);
 	sx_xlock(&dev_priv->rps.hw_lock);
 	gen6_disable_rps(dev);
 	sx_xunlock(&dev_priv->rps.hw_lock);
 }
 
 /*
  * Task handler: enable RPS and load the ring frequency table while
  * holding rps.hw_lock.  arg is the drm_i915_private; pending is unused.
  */
 static void intel_gen6_powersave_work(void *arg, int pending)
 {
 	struct drm_i915_private *dev_priv = arg;
 
 	sx_xlock(&dev_priv->rps.hw_lock);
 	gen6_enable_rps(dev_priv->dev);
 	gen6_update_ring_freq(dev_priv->dev);
 	sx_xunlock(&dev_priv->rps.hw_lock);
 }
 
 /*
  * Turn GT power saving on.  Ironlake-M is set up synchronously; gen6 and
  * gen7 (other than Valleyview) defer the work to the driver task queue.
  * Other hardware is left untouched.
  */
 void intel_enable_gt_powersave(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 
 	if (IS_IRONLAKE_M(dev)) {
 		ironlake_enable_drps(dev);
 		ironlake_enable_rc6(dev);
 		intel_init_emon(dev);
 		return;
 	}
 
 	if ((!IS_GEN6(dev) && !IS_GEN7(dev)) || IS_VALLEYVIEW(dev))
 		return;
 
 	/*
 	 * PCU communication is slow and this doesn't need to be
 	 * done at any specific time, so do this out of our fast path
 	 * to make resume and init faster.
 	 */
 	taskqueue_enqueue_timeout(dev_priv->wq, &dev_priv->rps.delayed_resume_work,
 	    round_jiffies_up_relative(HZ));
 }
 
 /*
  * PCH clock gating setup: writes SOUTH_DSPCLK_GATE_D with only the
  * PCH_DPLSUNIT_CLOCK_GATE_DISABLE bit set.
  */
 static void ibx_init_clock_gating(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 
 	/*
 	 * On Ibex Peak and Cougar Point, we need to disable clock
 	 * gating for the panel power sequencer or it will fail to
 	 * start up when no ports are active.
 	 */
 	I915_WRITE(SOUTH_DSPCLK_GATE_D, PCH_DPLSUNIT_CLOCK_GATE_DISABLE);
 }
 
 /*
  * Ironlake clock gating and workaround setup.
  *
  * Builds the ILK_DSPCLK_GATE_D value in dspclk_gate while programming the
  * 3D clock gating, display chicken, arbiter and watermark registers, then
  * writes it and finishes with the PCH setup in ibx_init_clock_gating().
  */
 static void ironlake_init_clock_gating(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	uint32_t dspclk_gate = ILK_VRHUNIT_CLOCK_GATE_DISABLE;
 
 	/* Required for FBC */
 	dspclk_gate |= ILK_DPFCRUNIT_CLOCK_GATE_DISABLE |
 		   ILK_DPFCUNIT_CLOCK_GATE_DISABLE |
 		   ILK_DPFDUNIT_CLOCK_GATE_ENABLE;
 
 	I915_WRITE(PCH_3DCGDIS0,
 		   MARIUNIT_CLOCK_GATE_DISABLE |
 		   SVSMUNIT_CLOCK_GATE_DISABLE);
 	I915_WRITE(PCH_3DCGDIS1,
 		   VFMUNIT_CLOCK_GATE_DISABLE);
 
 	/*
 	 * According to the spec the following bits should be set in
 	 * order to enable memory self-refresh
 	 * The bit 22/21 of 0x42004
 	 * The bit 5 of 0x42020
 	 * The bit 15 of 0x45000
 	 */
 	I915_WRITE(ILK_DISPLAY_CHICKEN2,
 		   (I915_READ(ILK_DISPLAY_CHICKEN2) |
 		    ILK_DPARB_GATE | ILK_VSDPFD_FULL));
 	dspclk_gate |= ILK_DPARBUNIT_CLOCK_GATE_ENABLE;
 	I915_WRITE(DISP_ARB_CTL,
 		   (I915_READ(DISP_ARB_CTL) |
 		    DISP_FBC_WM_DIS));
 	/* Zero the three LP watermark registers. */
 	I915_WRITE(WM3_LP_ILK, 0);
 	I915_WRITE(WM2_LP_ILK, 0);
 	I915_WRITE(WM1_LP_ILK, 0);
 
 	/*
 	 * Based on the document from hardware guys the following bits
 	 * should be set unconditionally in order to enable FBC.
 	 * The bit 22 of 0x42000
 	 * The bit 22 of 0x42004
 	 * The bit 7,8,9 of 0x42020.
 	 */
 	if (IS_IRONLAKE_M(dev)) {
 		I915_WRITE(ILK_DISPLAY_CHICKEN1,
 			   I915_READ(ILK_DISPLAY_CHICKEN1) |
 			   ILK_FBCQ_DIS);
 		I915_WRITE(ILK_DISPLAY_CHICKEN2,
 			   I915_READ(ILK_DISPLAY_CHICKEN2) |
 			   ILK_DPARB_GATE);
 	}
 
 	/* Commit the gate-disable bits accumulated above. */
 	I915_WRITE(ILK_DSPCLK_GATE_D, dspclk_gate);
 
 	I915_WRITE(ILK_DISPLAY_CHICKEN2,
 		   I915_READ(ILK_DISPLAY_CHICKEN2) |
 		   ILK_ELPIN_409_SELECT);
 	/* The upper 16 bits repeat the bit being set in the lower 16. */
 	I915_WRITE(_3D_CHICKEN2,
 		   _3D_CHICKEN2_WM_READ_PIPELINED << 16 |
 		   _3D_CHICKEN2_WM_READ_PIPELINED);
 
 	/* WaDisableRenderCachePipelinedFlush */
 	I915_WRITE(CACHE_MODE_0,
 		   _MASKED_BIT_ENABLE(CM0_PIPELINED_RENDER_FLUSH_DISABLE));
 
 	ibx_init_clock_gating(dev);
 }
 
 static void cpt_init_clock_gating(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	int pipe;
 	uint32_t val;
 
 	/*
 	 * On Ibex Peak and Cougar Point, we need to disable clock
 	 * gating for the panel power sequencer or it will fail to
 	 * start up when no ports are active.
 	 */
 	I915_WRITE(SOUTH_DSPCLK_GATE_D, PCH_DPLSUNIT_CLOCK_GATE_DISABLE);
 	I915_WRITE(SOUTH_CHICKEN2, I915_READ(SOUTH_CHICKEN2) |
 		   DPLS_EDP_PPS_FIX_DIS);
 	/* The below fixes the weird display corruption, a few pixels shifted
 	 * downward, on (only) LVDS of some HP laptops with IVY.
 	 */
 	for_each_pipe(pipe) {
 		val = TRANS_CHICKEN2_TIMING_OVERRIDE;
 		if (dev_priv->fdi_rx_polarity_inverted)
 			val |= TRANS_CHICKEN2_FDI_POLARITY_REVERSED;
 		I915_WRITE(TRANS_CHICKEN2(pipe), val);
 	}
 	/* WADP0ClockGatingDisable */
 	for_each_pipe(pipe) {
 		I915_WRITE(TRANS_CHICKEN1(pipe),
 			   TRANS_CHICKEN1_DP0UNIT_GC_DISABLE);
 	}
 }
 
 /*
  * Gen6 clock gating and workaround setup.
  *
  * Programs the display clock gate, the 3D/GT workaround registers, the
  * unit clock gating controls and the display chicken bits, disables
  * trickle feed on every pipe, then finishes with the PCH setup in
  * cpt_init_clock_gating().  The GEN6_MBCTL write is compiled only when
  * FREEBSD_WIP is defined.
  */
 static void gen6_init_clock_gating(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	int pipe;
 	uint32_t dspclk_gate = ILK_VRHUNIT_CLOCK_GATE_DISABLE;
 
 	I915_WRITE(ILK_DSPCLK_GATE_D, dspclk_gate);
 
 	I915_WRITE(ILK_DISPLAY_CHICKEN2,
 		   I915_READ(ILK_DISPLAY_CHICKEN2) |
 		   ILK_ELPIN_409_SELECT);
 
 	/* WaDisableHiZPlanesWhenMSAAEnabled */
 	I915_WRITE(_3D_CHICKEN,
 		   _MASKED_BIT_ENABLE(_3D_CHICKEN_HIZ_PLANE_DISABLE_MSAA_4X_SNB));
 
 	/* WaSetupGtModeTdRowDispatch */
 	if (IS_SNB_GT1(dev))
 		I915_WRITE(GEN6_GT_MODE,
 			   _MASKED_BIT_ENABLE(GEN6_TD_FOUR_ROW_DISPATCH_DISABLE));
 
 	/* Zero the three LP watermark registers. */
 	I915_WRITE(WM3_LP_ILK, 0);
 	I915_WRITE(WM2_LP_ILK, 0);
 	I915_WRITE(WM1_LP_ILK, 0);
 
 	I915_WRITE(CACHE_MODE_0,
 		   _MASKED_BIT_DISABLE(CM0_STC_EVICT_DISABLE_LRA_SNB));
 
 	I915_WRITE(GEN6_UCGCTL1,
 		   I915_READ(GEN6_UCGCTL1) |
 		   GEN6_BLBUNIT_CLOCK_GATE_DISABLE |
 		   GEN6_CSUNIT_CLOCK_GATE_DISABLE);
 
 	/* According to the BSpec vol1g, bit 12 (RCPBUNIT) clock
 	 * gating disable must be set.  Failure to set it results in
 	 * flickering pixels due to Z write ordering failures after
 	 * some amount of runtime in the Mesa "fire" demo, and Unigine
 	 * Sanctuary and Tropics, and apparently anything else with
 	 * alpha test or pixel discard.
 	 *
 	 * According to the spec, bit 11 (RCCUNIT) must also be set,
 	 * but we didn't debug actual testcases to find it out.
 	 *
 	 * Also apply WaDisableVDSUnitClockGating and
 	 * WaDisableRCPBUnitClockGating.
 	 */
 	I915_WRITE(GEN6_UCGCTL2,
 		   GEN7_VDSUNIT_CLOCK_GATE_DISABLE |
 		   GEN6_RCPBUNIT_CLOCK_GATE_DISABLE |
 		   GEN6_RCCUNIT_CLOCK_GATE_DISABLE);
 
 	/* Bspec says we need to always set all mask bits. */
 	I915_WRITE(_3D_CHICKEN3, (0xFFFF << 16) |
 		   _3D_CHICKEN3_SF_DISABLE_FASTCLIP_CULL);
 
 	/*
 	 * According to the spec the following bits should be
 	 * set in order to enable memory self-refresh and fbc:
 	 * The bit21 and bit22 of 0x42000
 	 * The bit21 and bit22 of 0x42004
 	 * The bit5 and bit7 of 0x42020
 	 * The bit14 of 0x70180
 	 * The bit14 of 0x71180
 	 */
 	I915_WRITE(ILK_DISPLAY_CHICKEN1,
 		   I915_READ(ILK_DISPLAY_CHICKEN1) |
 		   ILK_FBCQ_DIS | ILK_PABSTRETCH_DIS);
 	I915_WRITE(ILK_DISPLAY_CHICKEN2,
 		   I915_READ(ILK_DISPLAY_CHICKEN2) |
 		   ILK_DPARB_GATE | ILK_VSDPFD_FULL);
 	I915_WRITE(ILK_DSPCLK_GATE_D,
 		   I915_READ(ILK_DSPCLK_GATE_D) |
 		   ILK_DPARBUNIT_CLOCK_GATE_ENABLE  |
 		   ILK_DPFDUNIT_CLOCK_GATE_ENABLE);
 
 
 #ifdef FREEBSD_WIP
 	/* NOTE Linux<->FreeBSD: Disable GEN6_MBCTL write.
 	 *
 	 * This arrived in Linux 3.6 in commit
 	 * b4ae3f22d238617ca11610b29fde16cf8c0bc6e0 and causes significantly
 	 * increased power consumption after kldloading i915kms.ko on FreeBSD
 	 * on (some) Sandy Bridge laptops. A Thinkpad X220 reported about 11W
 	 * after booting while idle at the vt(4) console and about double that
 	 * after loading the driver.
 	 *
 	 * There were reports in Linux of increased consumption after a suspend
 	 * and resume cycle due to that change.
 	 *
 	 * Linux bug reports:
 	 * https://bugs.freedesktop.org/show_bug.cgi?id=54089
 	 * https://bugzilla.kernel.org/show_bug.cgi?id=58971
 	 *
 	 * This suspend and resume issue is reportedly fixed in Linux with
 	 * commits 7dcd2677ea912573d9ed4bcd629b0023b2d11505 and
 	 * 7dcd2677ea912573d9ed4bcd629b0023b2d11505 (Linux 3.11). However, I
 	 * found that those changes did not help on FreeBSD, where increased
 	 * power consumption is observed after loading i915kms.ko without
 	 * suspending and resuming.
 	 *
 	 * NOTE(review): the two commit hashes above are identical; one of
 	 * them is presumably a copy/paste slip -- confirm against the Linux
 	 * history.
 	 *
 	 * This workaround should be removed after updating to a future Linux
 	 * i915 version and verifying normal power consumption on Sandy Bridge.
 	 */
 
 	/* WaMbcDriverBootEnable */
 	I915_WRITE(GEN6_MBCTL, I915_READ(GEN6_MBCTL) |
 		   GEN6_MBCTL_ENABLE_BOOT_FETCH);
 #endif /* FREEBSD_WIP */
 
 	/* Disable trickle feed on every display plane and flush the plane. */
 	for_each_pipe(pipe) {
 		I915_WRITE(DSPCNTR(pipe),
 			   I915_READ(DSPCNTR(pipe)) |
 			   DISPPLANE_TRICKLE_FEED_DISABLE);
 		intel_flush_display_plane(dev_priv, pipe);
 	}
 
 	/* The default value should be 0x200 according to docs, but the two
 	 * platforms I checked have a 0 for this. (Maybe BIOS overrides?) */
 	I915_WRITE(GEN6_GT_MODE, _MASKED_BIT_DISABLE(0xffff));
 	I915_WRITE(GEN6_GT_MODE, _MASKED_BIT_ENABLE(GEN6_GT_MODE_HI));
 
 	cpt_init_clock_gating(dev);
 }
 
 /*
  * Read-modify-write GEN7_FF_THREAD_MODE: clear the scheduling field and
  * select the HW setting for the TS, VS and DS stages.
  */
 static void gen7_setup_fixed_func_scheduler(struct drm_i915_private *dev_priv)
 {
 	uint32_t mode = I915_READ(GEN7_FF_THREAD_MODE);
 
 	mode = (mode & ~GEN7_FF_SCHED_MASK) |
 	    GEN7_FF_TS_SCHED_HW |
 	    GEN7_FF_VS_SCHED_HW |
 	    GEN7_FF_DS_SCHED_HW;
 
 	I915_WRITE(GEN7_FF_THREAD_MODE, mode);
 }
 
 /*
  * PCH clock gating setup: when pch_id is INTEL_PCH_LPT_LP_DEVICE_ID_TYPE,
  * set PCH_LP_PARTITION_LEVEL_DISABLE in SOUTH_DSPCLK_GATE_D.  Other PCH
  * ids are left untouched.
  */
 static void lpt_init_clock_gating(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 
 	if (dev_priv->pch_id != INTEL_PCH_LPT_LP_DEVICE_ID_TYPE)
 		return;
 
 	/*
 	 * TODO: this bit should only be enabled when really needed, then
 	 * disabled when not needed anymore in order to save power.
 	 */
 	I915_WRITE(SOUTH_DSPCLK_GATE_D,
 		   I915_READ(SOUTH_DSPCLK_GATE_D) |
 		   PCH_LP_PARTITION_LEVEL_DISABLE);
 }
 
 /*
  * Haswell clock gating and workaround setup.
  *
  * Zeroes the LP watermarks, applies the GT workaround register writes,
  * disables trickle feed on every pipe, sets up the fixed-function
  * scheduler and finishes with the PCH setup in lpt_init_clock_gating().
  */
 static void haswell_init_clock_gating(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	int pipe;
 
 	I915_WRITE(WM3_LP_ILK, 0);
 	I915_WRITE(WM2_LP_ILK, 0);
 	I915_WRITE(WM1_LP_ILK, 0);
 
 	/* According to the spec, bit 13 (RCZUNIT) must be set on IVB.
 	 * This implements the WaDisableRCZUnitClockGating workaround.
 	 */
 	I915_WRITE(GEN6_UCGCTL2, GEN6_RCZUNIT_CLOCK_GATE_DISABLE);
 
 	/* Apply the WaDisableRHWOOptimizationForRenderHang workaround. */
 	I915_WRITE(GEN7_COMMON_SLICE_CHICKEN1,
 		   GEN7_CSC1_RHWO_OPT_DISABLE_IN_RCC);
 
 	/* WaApplyL3ControlAndL3ChickenMode requires those two on Ivy Bridge */
 	I915_WRITE(GEN7_L3CNTLREG1,
 			GEN7_WA_FOR_GEN7_L3_CONTROL);
 	I915_WRITE(GEN7_L3_CHICKEN_MODE_REGISTER,
 			GEN7_WA_L3_CHICKEN_MODE);
 
 	/* This is required by WaCatErrorRejectionIssue */
 	I915_WRITE(GEN7_SQ_CHICKEN_MBCUNIT_CONFIG,
 			I915_READ(GEN7_SQ_CHICKEN_MBCUNIT_CONFIG) |
 			GEN7_SQ_CHICKEN_MBCUNIT_SQINTMOB);
 
 	/* Disable trickle feed on every display plane and flush the plane. */
 	for_each_pipe(pipe) {
 		I915_WRITE(DSPCNTR(pipe),
 			   I915_READ(DSPCNTR(pipe)) |
 			   DISPPLANE_TRICKLE_FEED_DISABLE);
 		intel_flush_display_plane(dev_priv, pipe);
 	}
 
 	gen7_setup_fixed_func_scheduler(dev_priv);
 
 	/* WaDisable4x2SubspanOptimization */
 	I915_WRITE(CACHE_MODE_1,
 		   _MASKED_BIT_ENABLE(PIXEL_SUBSPAN_COLLECT_OPT_DISABLE));
 
 	/* WaMbcDriverBootEnable */
 	I915_WRITE(GEN6_MBCTL, I915_READ(GEN6_MBCTL) |
 		   GEN6_MBCTL_ENABLE_BOOT_FETCH);
 
 	/* XXX: This is a workaround for early silicon revisions and should be
 	 * removed later.
 	 */
 	I915_WRITE(WM_DBG,
 			I915_READ(WM_DBG) |
 			WM_DBG_DISALLOW_MULTIPLE_LP |
 			WM_DBG_DISALLOW_SPRITE |
 			WM_DBG_DISALLOW_MAXFIFO);
 
 	lpt_init_clock_gating(dev);
 }
 
 /*
  * Ivy Bridge clock-gating setup: zeroes WM{1,2,3}_LP_ILK, performs the
  * workaround register writes below (some of which pick a GT1 or GT2
  * register based on IS_IVB_GT1()), sets DISPPLANE_TRICKLE_FEED_DISABLE in
  * every pipe's DSPCNTR, programs the GEN6_MBCUNIT_SNPCR field, and finally
  * calls cpt_init_clock_gating().
  */
 static void ivybridge_init_clock_gating(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	int pipe;
 	uint32_t snpcr;
 
 	I915_WRITE(WM3_LP_ILK, 0);
 	I915_WRITE(WM2_LP_ILK, 0);
 	I915_WRITE(WM1_LP_ILK, 0);
 
 	I915_WRITE(ILK_DSPCLK_GATE_D, ILK_VRHUNIT_CLOCK_GATE_DISABLE);
 
 	/* WaDisableEarlyCull */
 	I915_WRITE(_3D_CHICKEN3,
 		   _MASKED_BIT_ENABLE(_3D_CHICKEN_SF_DISABLE_OBJEND_CULL));
 
 	/* WaDisableBackToBackFlipFix */
 	I915_WRITE(IVB_CHICKEN3,
 		   CHICKEN3_DGMG_REQ_OUT_FIX_DISABLE |
 		   CHICKEN3_DGMG_DONE_FIX_DISABLE);
 
 	/* WaDisablePSDDualDispatchEnable */
 	if (IS_IVB_GT1(dev))
 		I915_WRITE(GEN7_HALF_SLICE_CHICKEN1,
 			   _MASKED_BIT_ENABLE(GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE));
 	else
 		I915_WRITE(GEN7_HALF_SLICE_CHICKEN1_GT2,
 			   _MASKED_BIT_ENABLE(GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE));
 
 	/* Apply the WaDisableRHWOOptimizationForRenderHang workaround. */
 	I915_WRITE(GEN7_COMMON_SLICE_CHICKEN1,
 		   GEN7_CSC1_RHWO_OPT_DISABLE_IN_RCC);
 
 	/* WaApplyL3ControlAndL3ChickenMode requires those two on Ivy Bridge */
 	I915_WRITE(GEN7_L3CNTLREG1,
 			GEN7_WA_FOR_GEN7_L3_CONTROL);
 	I915_WRITE(GEN7_L3_CHICKEN_MODE_REGISTER,
 		   GEN7_WA_L3_CHICKEN_MODE);
 	/* Set DOP_CLOCK_GATING_DISABLE in the GT1 or GT2 row-chicken register. */
 	if (IS_IVB_GT1(dev))
 		I915_WRITE(GEN7_ROW_CHICKEN2,
 			   _MASKED_BIT_ENABLE(DOP_CLOCK_GATING_DISABLE));
 	else
 		I915_WRITE(GEN7_ROW_CHICKEN2_GT2,
 			   _MASKED_BIT_ENABLE(DOP_CLOCK_GATING_DISABLE));
 
 
 	/* WaForceL3Serialization */
 	I915_WRITE(GEN7_L3SQCREG4, I915_READ(GEN7_L3SQCREG4) &
 		   ~L3SQ_URB_READ_CAM_MATCH_DISABLE);
 
 	/* According to the BSpec vol1g, bit 12 (RCPBUNIT) clock
 	 * gating disable must be set.  Failure to set it results in
 	 * flickering pixels due to Z write ordering failures after
 	 * some amount of runtime in the Mesa "fire" demo, and Unigine
 	 * Sanctuary and Tropics, and apparently anything else with
 	 * alpha test or pixel discard.
 	 *
 	 * According to the spec, bit 11 (RCCUNIT) must also be set,
 	 * but we didn't debug actual testcases to find it out.
 	 *
 	 * According to the spec, bit 13 (RCZUNIT) must be set on IVB.
 	 * This implements the WaDisableRCZUnitClockGating workaround.
 	 *
 	 * NOTE(review): the write below sets only the RCZUNIT and RCCUNIT
 	 * disable bits; no RCPBUNIT bit is included despite the first
 	 * paragraph above.  Confirm against the spec.
 	 */
 	I915_WRITE(GEN6_UCGCTL2,
 		   GEN6_RCZUNIT_CLOCK_GATE_DISABLE |
 		   GEN6_RCCUNIT_CLOCK_GATE_DISABLE);
 
 	/* This is required by WaCatErrorRejectionIssue */
 	I915_WRITE(GEN7_SQ_CHICKEN_MBCUNIT_CONFIG,
 			I915_READ(GEN7_SQ_CHICKEN_MBCUNIT_CONFIG) |
 			GEN7_SQ_CHICKEN_MBCUNIT_SQINTMOB);
 
 	/*
 	 * Read-modify-write each pipe's plane control register, then flush
 	 * the plane via intel_flush_display_plane().
 	 */
 	for_each_pipe(pipe) {
 		I915_WRITE(DSPCNTR(pipe),
 			   I915_READ(DSPCNTR(pipe)) |
 			   DISPPLANE_TRICKLE_FEED_DISABLE);
 		intel_flush_display_plane(dev_priv, pipe);
 	}
 
 	/* WaMbcDriverBootEnable */
 	I915_WRITE(GEN6_MBCTL, I915_READ(GEN6_MBCTL) |
 		   GEN6_MBCTL_ENABLE_BOOT_FETCH);
 
 	gen7_setup_fixed_func_scheduler(dev_priv);
 
 	/* WaDisable4x2SubspanOptimization */
 	I915_WRITE(CACHE_MODE_1,
 		   _MASKED_BIT_ENABLE(PIXEL_SUBSPAN_COLLECT_OPT_DISABLE));
 
 	/* Replace the GEN6_MBC_SNPCR_MASK field with GEN6_MBC_SNPCR_MED. */
 	snpcr = I915_READ(GEN6_MBCUNIT_SNPCR);
 	snpcr &= ~GEN6_MBC_SNPCR_MASK;
 	snpcr |= GEN6_MBC_SNPCR_MED;
 	I915_WRITE(GEN6_MBCUNIT_SNPCR, snpcr);
 
 	/* Continue with cpt_init_clock_gating() as the last step. */
 	cpt_init_clock_gating(dev);
 }
 
 /*
  * ValleyView clock-gating setup: zeroes WM{1,2,3}_LP_ILK, performs the
  * workaround register writes below, sets DISPPLANE_TRICKLE_FEED_DISABLE in
  * every pipe's DSPCNTR, writes the interrupt-enable bits listed below to
  * VLV_DPFLIPSTAT, and writes GCFG_DIS to VLV_GUNIT_CLOCK_GATE.
  */
 static void valleyview_init_clock_gating(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	int pipe;
 
 	I915_WRITE(WM3_LP_ILK, 0);
 	I915_WRITE(WM2_LP_ILK, 0);
 	I915_WRITE(WM1_LP_ILK, 0);
 
 	I915_WRITE(ILK_DSPCLK_GATE_D, ILK_VRHUNIT_CLOCK_GATE_DISABLE);
 
 	/* WaDisableEarlyCull */
 	I915_WRITE(_3D_CHICKEN3,
 		   _MASKED_BIT_ENABLE(_3D_CHICKEN_SF_DISABLE_OBJEND_CULL));
 
 	/* WaDisableBackToBackFlipFix */
 	I915_WRITE(IVB_CHICKEN3,
 		   CHICKEN3_DGMG_REQ_OUT_FIX_DISABLE |
 		   CHICKEN3_DGMG_DONE_FIX_DISABLE);
 
 	I915_WRITE(GEN7_HALF_SLICE_CHICKEN1,
 		   _MASKED_BIT_ENABLE(GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE));
 
 	/* Apply the WaDisableRHWOOptimizationForRenderHang workaround. */
 	I915_WRITE(GEN7_COMMON_SLICE_CHICKEN1,
 		   GEN7_CSC1_RHWO_OPT_DISABLE_IN_RCC);
 
 	/* WaApplyL3ControlAndL3ChickenMode requires those two on Ivy Bridge */
 	I915_WRITE(GEN7_L3CNTLREG1, I915_READ(GEN7_L3CNTLREG1) | GEN7_L3AGDIS);
 	I915_WRITE(GEN7_L3_CHICKEN_MODE_REGISTER, GEN7_WA_L3_CHICKEN_MODE);
 
 	/* WaForceL3Serialization */
 	I915_WRITE(GEN7_L3SQCREG4, I915_READ(GEN7_L3SQCREG4) &
 		   ~L3SQ_URB_READ_CAM_MATCH_DISABLE);
 
 	/* WaDisableDopClockGating */
 	I915_WRITE(GEN7_ROW_CHICKEN2,
 		   _MASKED_BIT_ENABLE(DOP_CLOCK_GATING_DISABLE));
 
 	/* WaForceL3Serialization
 	 *
 	 * NOTE(review): this repeats the identical read-modify-write done a
 	 * few statements earlier; it looks redundant, but confirm before
 	 * removing.
 	 */
 	I915_WRITE(GEN7_L3SQCREG4, I915_READ(GEN7_L3SQCREG4) &
 		   ~L3SQ_URB_READ_CAM_MATCH_DISABLE);
 
 	/* This is required by WaCatErrorRejectionIssue */
 	I915_WRITE(GEN7_SQ_CHICKEN_MBCUNIT_CONFIG,
 		   I915_READ(GEN7_SQ_CHICKEN_MBCUNIT_CONFIG) |
 		   GEN7_SQ_CHICKEN_MBCUNIT_SQINTMOB);
 
 	/* WaMbcDriverBootEnable */
 	I915_WRITE(GEN6_MBCTL, I915_READ(GEN6_MBCTL) |
 		   GEN6_MBCTL_ENABLE_BOOT_FETCH);
 
 
 	/* According to the BSpec vol1g, bit 12 (RCPBUNIT) clock
 	 * gating disable must be set.  Failure to set it results in
 	 * flickering pixels due to Z write ordering failures after
 	 * some amount of runtime in the Mesa "fire" demo, and Unigine
 	 * Sanctuary and Tropics, and apparently anything else with
 	 * alpha test or pixel discard.
 	 *
 	 * According to the spec, bit 11 (RCCUNIT) must also be set,
 	 * but we didn't debug actual testcases to find it out.
 	 *
 	 * According to the spec, bit 13 (RCZUNIT) must be set on IVB.
 	 * This implements the WaDisableRCZUnitClockGating workaround.
 	 *
 	 * Also apply WaDisableVDSUnitClockGating and
 	 * WaDisableRCPBUnitClockGating.
 	 */
 	I915_WRITE(GEN6_UCGCTL2,
 		   GEN7_VDSUNIT_CLOCK_GATE_DISABLE |
 		   GEN7_TDLUNIT_CLOCK_GATE_DISABLE |
 		   GEN6_RCZUNIT_CLOCK_GATE_DISABLE |
 		   GEN6_RCPBUNIT_CLOCK_GATE_DISABLE |
 		   GEN6_RCCUNIT_CLOCK_GATE_DISABLE);
 
 	I915_WRITE(GEN7_UCGCTL4, GEN7_L3BANK2X_CLOCK_GATE_DISABLE);
 
 	/*
 	 * Read-modify-write each pipe's plane control register, then flush
 	 * the plane via intel_flush_display_plane().
 	 */
 	for_each_pipe(pipe) {
 		I915_WRITE(DSPCNTR(pipe),
 			   I915_READ(DSPCNTR(pipe)) |
 			   DISPPLANE_TRICKLE_FEED_DISABLE);
 		intel_flush_display_plane(dev_priv, pipe);
 	}
 
 	I915_WRITE(CACHE_MODE_1,
 		   _MASKED_BIT_ENABLE(PIXEL_SUBSPAN_COLLECT_OPT_DISABLE));
 
 	/*
 	 * On ValleyView, the GUnit needs to signal the GT
 	 * when flip and other events complete.  So enable
 	 * all the GUnit->GT interrupts here
 	 */
 	I915_WRITE(VLV_DPFLIPSTAT, PIPEB_LINE_COMPARE_INT_EN |
 		   PIPEB_HLINE_INT_EN | PIPEB_VBLANK_INT_EN |
 		   SPRITED_FLIPDONE_INT_EN | SPRITEC_FLIPDONE_INT_EN |
 		   PLANEB_FLIPDONE_INT_EN | PIPEA_LINE_COMPARE_INT_EN |
 		   PIPEA_HLINE_INT_EN | PIPEA_VBLANK_INT_EN |
 		   SPRITEB_FLIPDONE_INT_EN | SPRITEA_FLIPDONE_INT_EN |
 		   PLANEA_FLIPDONE_INT_EN);
 
 	/*
 	 * WaDisableVLVClockGating_VBIIssue
 	 * Disable clock gating on the GCFG unit to prevent a delay
 	 * in the reporting of vblank events.
 	 */
 	I915_WRITE(VLV_GUNIT_CLOCK_GATE, GCFG_DIS);
 }
 
 /*
  * G4X clock-gating setup.  The register values are computed first, then
  * written in the order RENCLK_GATE_D1, RENCLK_GATE_D2, RAMCLK_GATE_D,
  * DSPCLK_GATE_D, CACHE_MODE_0.
  */
 static void g4x_init_clock_gating(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	uint32_t render_gate_d2;
 	uint32_t display_gate;
 
 	render_gate_d2 = VF_UNIT_CLOCK_GATE_DISABLE;
 	render_gate_d2 |= GS_UNIT_CLOCK_GATE_DISABLE;
 	render_gate_d2 |= CL_UNIT_CLOCK_GATE_DISABLE;
 
 	display_gate = VRHUNIT_CLOCK_GATE_DISABLE;
 	display_gate |= OVRUNIT_CLOCK_GATE_DISABLE;
 	display_gate |= OVCUNIT_CLOCK_GATE_DISABLE;
 	/* GM45 additionally gets the DSSUNIT disable bit. */
 	if (IS_GM45(dev)) {
 		display_gate |= DSSUNIT_CLOCK_GATE_DISABLE;
 	}
 
 	I915_WRITE(RENCLK_GATE_D1, 0);
 	I915_WRITE(RENCLK_GATE_D2, render_gate_d2);
 	I915_WRITE(RAMCLK_GATE_D, 0);
 	I915_WRITE(DSPCLK_GATE_D, display_gate);
 
 	/* WaDisableRenderCachePipelinedFlush */
 	I915_WRITE(CACHE_MODE_0,
 	    _MASKED_BIT_ENABLE(CM0_PIPELINED_RENDER_FLUSH_DISABLE));
 }
 
 /*
  * Crestline clock-gating setup: writes I965_RCC_CLOCK_GATE_DISABLE to
  * RENCLK_GATE_D1 and zero to the remaining gate registers.
  */
 static void crestline_init_clock_gating(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 
 	I915_WRITE(RENCLK_GATE_D1, I965_RCC_CLOCK_GATE_DISABLE);
 	I915_WRITE(RENCLK_GATE_D2, 0);
 	I915_WRITE(DSPCLK_GATE_D, 0);
 	I915_WRITE(RAMCLK_GATE_D, 0);
 	/* DEUC is written through the 16-bit accessor, unlike the others. */
 	I915_WRITE16(DEUC, 0);
 }
 
 static void broadwater_init_clock_gating(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 
 	I915_WRITE(RENCLK_GATE_D1, I965_RCZ_CLOCK_GATE_DISABLE |
 		   I965_RCC_CLOCK_GATE_DISABLE |
 		   I965_RCPB_CLOCK_GATE_DISABLE |
 		   I965_ISC_CLOCK_GATE_DISABLE |
 		   I965_FBC_CLOCK_GATE_DISABLE);
 	I915_WRITE(RENCLK_GATE_D2, 0);
 }
 
 /*
  * Gen3 clock-gating setup: ORs three DSTATE_* bits into D_STATE, then
  * programs ECOSKPD (an extra masked write happens first on Pineview).
  */
 static void gen3_init_clock_gating(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	u32 d_state;
 
 	d_state = I915_READ(D_STATE);
 	d_state |= DSTATE_PLL_D3_OFF;
 	d_state |= DSTATE_GFX_CLOCK_GATING;
 	d_state |= DSTATE_DOT_CLOCK_GATING;
 	I915_WRITE(D_STATE, d_state);
 
 	if (IS_PINEVIEW(dev)) {
 		I915_WRITE(ECOSKPD, _MASKED_BIT_ENABLE(ECO_GATING_CX_ONLY));
 	}
 
 	/* IIR "flip pending" means done if this bit is set */
 	I915_WRITE(ECOSKPD, _MASKED_BIT_DISABLE(ECO_FLIP_DONE));
 }
 
 /* i85x clock-gating setup: writes SV_CLOCK_GATE_DISABLE to RENCLK_GATE_D1. */
 static void i85x_init_clock_gating(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 
 	I915_WRITE(RENCLK_GATE_D1, SV_CLOCK_GATE_DISABLE);
 }
 
 /* i830 clock-gating setup: writes OVRUNIT_CLOCK_GATE_DISABLE to DSPCLK_GATE_D. */
 static void i830_init_clock_gating(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 
 	I915_WRITE(DSPCLK_GATE_D, OVRUNIT_CLOCK_GATE_DISABLE);
 }
 
 void intel_init_clock_gating(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 
 	dev_priv->display.init_clock_gating(dev);
 }
 
 /* Starting with Haswell, we have different power wells for
  * different parts of the GPU. This attempts to enable them all.
  */
 void intel_init_power_wells(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	unsigned long power_wells[] = {
 		HSW_PWR_WELL_CTL1,
 		HSW_PWR_WELL_CTL2,
 		HSW_PWR_WELL_CTL4
 	};
 	int i;
 
 	if (!IS_HASWELL(dev))
 		return;
 
 	DRM_LOCK(dev);
 
 	for (i = 0; i < ARRAY_SIZE(power_wells); i++) {
 		int well = I915_READ(power_wells[i]);
 
 		if ((well & HSW_PWR_WELL_STATE) == 0) {
 			I915_WRITE(power_wells[i], well & HSW_PWR_WELL_ENABLE);
 			if (wait_for((I915_READ(power_wells[i]) & HSW_PWR_WELL_STATE), 20))
 				DRM_ERROR("Error enabling power well %lx\n", power_wells[i]);
 		}
 	}
 
 	DRM_UNLOCK(dev);
 }
 
 /* Set up chip specific power management-related functions
  *
  * Fills in the dev_priv->display callbacks in three steps:
  *   1. FBC callbacks (fbc_enabled / enable_fbc / disable_fbc) when
  *      I915_HAS_FBC() is true;
  *   2. memory-frequency probing on Pineview and gen5;
  *   3. watermark callbacks (update_wm, update_sprite_wm,
  *      update_linetime_wm, get_fifo_size) and the init_clock_gating hook,
  *      chosen by chip.
  *
  * NOTE(review): init_clock_gating is not assigned here on every path (the
  * final else of the PCH-split branch, and gen4 that is neither Crestline
  * nor Broadwater); whether it is set elsewhere cannot be seen from here.
  */
 void intel_init_pm(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 
 	if (I915_HAS_FBC(dev)) {
 		if (HAS_PCH_SPLIT(dev)) {
 			dev_priv->display.fbc_enabled = ironlake_fbc_enabled;
 			dev_priv->display.enable_fbc = ironlake_enable_fbc;
 			dev_priv->display.disable_fbc = ironlake_disable_fbc;
 		} else if (IS_GM45(dev)) {
 			dev_priv->display.fbc_enabled = g4x_fbc_enabled;
 			dev_priv->display.enable_fbc = g4x_enable_fbc;
 			dev_priv->display.disable_fbc = g4x_disable_fbc;
 		} else if (IS_CRESTLINE(dev)) {
 			dev_priv->display.fbc_enabled = i8xx_fbc_enabled;
 			dev_priv->display.enable_fbc = i8xx_enable_fbc;
 			dev_priv->display.disable_fbc = i8xx_disable_fbc;
 		}
 		/* 855GM needs testing */
 	}
 
 	/* For cxsr */
 	if (IS_PINEVIEW(dev))
 		i915_pineview_get_mem_freq(dev);
 	else if (IS_GEN5(dev))
 		i915_ironlake_get_mem_freq(dev);
 
 	/* For FIFO watermark updates */
 	if (HAS_PCH_SPLIT(dev)) {
 		/*
 		 * In each generation branch below, update_wm is set to NULL
 		 * when the latency read comes back zero.
 		 */
 		if (IS_GEN5(dev)) {
 			if (I915_READ(MLTR_ILK) & ILK_SRLT_MASK)
 				dev_priv->display.update_wm = ironlake_update_wm;
 			else {
 				DRM_DEBUG_KMS("Failed to get proper latency. "
 					      "Disable CxSR\n");
 				dev_priv->display.update_wm = NULL;
 			}
 			dev_priv->display.init_clock_gating = ironlake_init_clock_gating;
 		} else if (IS_GEN6(dev)) {
 			if (SNB_READ_WM0_LATENCY()) {
 				dev_priv->display.update_wm = sandybridge_update_wm;
 				dev_priv->display.update_sprite_wm = sandybridge_update_sprite_wm;
 			} else {
 				DRM_DEBUG_KMS("Failed to read display plane latency. "
 					      "Disable CxSR\n");
 				dev_priv->display.update_wm = NULL;
 			}
 			dev_priv->display.init_clock_gating = gen6_init_clock_gating;
 		} else if (IS_IVYBRIDGE(dev)) {
 			/* FIXME: detect B0+ stepping and use auto training */
 			if (SNB_READ_WM0_LATENCY()) {
 				dev_priv->display.update_wm = ivybridge_update_wm;
 				dev_priv->display.update_sprite_wm = sandybridge_update_sprite_wm;
 			} else {
 				DRM_DEBUG_KMS("Failed to read display plane latency. "
 					      "Disable CxSR\n");
 				dev_priv->display.update_wm = NULL;
 			}
 			dev_priv->display.init_clock_gating = ivybridge_init_clock_gating;
 		} else if (IS_HASWELL(dev)) {
 			if (SNB_READ_WM0_LATENCY()) {
 				dev_priv->display.update_wm = sandybridge_update_wm;
 				dev_priv->display.update_sprite_wm = sandybridge_update_sprite_wm;
 				dev_priv->display.update_linetime_wm = haswell_update_linetime_wm;
 			} else {
 				DRM_DEBUG_KMS("Failed to read display plane latency. "
 					      "Disable CxSR\n");
 				dev_priv->display.update_wm = NULL;
 			}
 			dev_priv->display.init_clock_gating = haswell_init_clock_gating;
 		} else
 			dev_priv->display.update_wm = NULL;
 	} else if (IS_VALLEYVIEW(dev)) {
 		dev_priv->display.update_wm = valleyview_update_wm;
 		dev_priv->display.init_clock_gating =
 			valleyview_init_clock_gating;
 	} else if (IS_PINEVIEW(dev)) {
 		/* No CxSR latency entry for this configuration: turn CxSR off. */
 		if (!intel_get_cxsr_latency(IS_PINEVIEW_G(dev),
 					    dev_priv->is_ddr3,
 					    dev_priv->fsb_freq,
 					    dev_priv->mem_freq)) {
 			DRM_INFO("failed to find known CxSR latency "
 				 "(found ddr%s fsb freq %d, mem freq %d), "
 				 "disabling CxSR\n",
 				 (dev_priv->is_ddr3 == 1) ? "3" : "2",
 				 dev_priv->fsb_freq, dev_priv->mem_freq);
 			/* Disable CxSR and never update its watermark again */
 			pineview_disable_cxsr(dev);
 			dev_priv->display.update_wm = NULL;
 		} else
 			dev_priv->display.update_wm = pineview_update_wm;
 		dev_priv->display.init_clock_gating = gen3_init_clock_gating;
 	} else if (IS_G4X(dev)) {
 		dev_priv->display.update_wm = g4x_update_wm;
 		dev_priv->display.init_clock_gating = g4x_init_clock_gating;
 	} else if (IS_GEN4(dev)) {
 		dev_priv->display.update_wm = i965_update_wm;
 		if (IS_CRESTLINE(dev))
 			dev_priv->display.init_clock_gating = crestline_init_clock_gating;
 		else if (IS_BROADWATER(dev))
 			dev_priv->display.init_clock_gating = broadwater_init_clock_gating;
 	} else if (IS_GEN3(dev)) {
 		dev_priv->display.update_wm = i9xx_update_wm;
 		dev_priv->display.get_fifo_size = i9xx_get_fifo_size;
 		dev_priv->display.init_clock_gating = gen3_init_clock_gating;
 	} else if (IS_I865G(dev)) {
 		dev_priv->display.update_wm = i830_update_wm;
 		dev_priv->display.init_clock_gating = i85x_init_clock_gating;
 		dev_priv->display.get_fifo_size = i830_get_fifo_size;
 	} else if (IS_I85X(dev)) {
 		dev_priv->display.update_wm = i9xx_update_wm;
 		dev_priv->display.get_fifo_size = i85x_get_fifo_size;
 		dev_priv->display.init_clock_gating = i85x_init_clock_gating;
 	} else {
 		/* Everything else gets the i830 hooks; only the FIFO-size hook varies. */
 		dev_priv->display.update_wm = i830_update_wm;
 		dev_priv->display.init_clock_gating = i830_init_clock_gating;
 		if (IS_845G(dev))
 			dev_priv->display.get_fifo_size = i845_get_fifo_size;
 		else
 			dev_priv->display.get_fifo_size = i830_get_fifo_size;
 	}
 }
 
 /*
  * Poll GEN6_GT_THREAD_STATUS_REG until the core-status bits selected by the
  * chip-specific mask read as zero, giving up (and logging) after 500us.
  */
 static void __gen6_gt_wait_for_thread_c0(struct drm_i915_private *dev_priv)
 {
 	const u32 core_mask = IS_HASWELL(dev_priv->dev) ?
 	    GEN6_GT_THREAD_STATUS_CORE_MASK_HSW :
 	    GEN6_GT_THREAD_STATUS_CORE_MASK;
 
 	/* w/a for a sporadic read returning 0 by waiting for the GT
 	 * thread to wake up.
 	 */
 	if (wait_for_atomic_us((I915_READ_NOTRACE(GEN6_GT_THREAD_STATUS_REG) & core_mask) == 0, 500)) {
 		DRM_ERROR("GT thread status wait timed out\n");
 	}
 }
 
 /* Write zero to FORCEWAKE, then issue a posting read of ECOBUS. */
 static void __gen6_gt_force_wake_reset(struct drm_i915_private *dev_priv)
 {
 	I915_WRITE_NOTRACE(FORCEWAKE, 0);
 	POSTING_READ(ECOBUS); /* something from same cacheline, but !FORCEWAKE */
 }
 
 /*
  * Forcewake acquire via the FORCEWAKE register: wait for bit 0 of the ack
  * register to read clear, write FORCEWAKE_KERNEL, wait for bit 0 to read
  * set, then wait for the GT thread status.  Timeouts are logged only.
  */
 static void __gen6_gt_force_wake_get(struct drm_i915_private *dev_priv)
 {
 	const u32 ack_reg = IS_HASWELL(dev_priv->dev) ?
 	    FORCEWAKE_ACK_HSW : FORCEWAKE_ACK;
 
 	if (wait_for_atomic((I915_READ_NOTRACE(ack_reg) & 1) == 0,
 	    FORCEWAKE_ACK_TIMEOUT_MS)) {
 		DRM_ERROR("Timed out waiting for forcewake old ack to clear.\n");
 	}
 
 	I915_WRITE_NOTRACE(FORCEWAKE, FORCEWAKE_KERNEL);
 	/* something from same cacheline, but !FORCEWAKE */
 	POSTING_READ(ECOBUS);
 
 	if (wait_for_atomic((I915_READ_NOTRACE(ack_reg) & 1),
 	    FORCEWAKE_ACK_TIMEOUT_MS)) {
 		DRM_ERROR("Timed out waiting for forcewake to ack request.\n");
 	}
 
 	__gen6_gt_wait_for_thread_c0(dev_priv);
 }
 
 /*
  * Write _MASKED_BIT_DISABLE(0xffff) to FORCEWAKE_MT, then issue a posting
  * read of ECOBUS.
  */
 static void __gen6_gt_force_wake_mt_reset(struct drm_i915_private *dev_priv)
 {
 	I915_WRITE_NOTRACE(FORCEWAKE_MT, _MASKED_BIT_DISABLE(0xffff));
 	/* something from same cacheline, but !FORCEWAKE_MT */
 	POSTING_READ(ECOBUS);
 }
 
 /*
  * Forcewake acquire via the FORCEWAKE_MT register: wait for bit 0 of the
  * ack register to read clear, write the masked FORCEWAKE_KERNEL enable,
  * wait for bit 0 to read set, then wait for the GT thread status.
  * Timeouts are logged only.
  */
 static void __gen6_gt_force_wake_mt_get(struct drm_i915_private *dev_priv)
 {
 	const u32 ack_reg = IS_HASWELL(dev_priv->dev) ?
 	    FORCEWAKE_ACK_HSW : FORCEWAKE_MT_ACK;
 
 	if (wait_for_atomic((I915_READ_NOTRACE(ack_reg) & 1) == 0,
 	    FORCEWAKE_ACK_TIMEOUT_MS)) {
 		DRM_ERROR("Timed out waiting for forcewake old ack to clear.\n");
 	}
 
 	I915_WRITE_NOTRACE(FORCEWAKE_MT, _MASKED_BIT_ENABLE(FORCEWAKE_KERNEL));
 	/* something from same cacheline, but !FORCEWAKE_MT */
 	POSTING_READ(ECOBUS);
 
 	if (wait_for_atomic((I915_READ_NOTRACE(ack_reg) & 1),
 	    FORCEWAKE_ACK_TIMEOUT_MS)) {
 		DRM_ERROR("Timed out waiting for forcewake to ack request.\n");
 	}
 
 	__gen6_gt_wait_for_thread_c0(dev_priv);
 }
 
 /*
  * Take a forcewake reference.  The register read function normally does
  * this implicitly; call it explicitly at the start of a sequence that needs
  * the GT to stay powered, and pair it with gen6_gt_force_wake_put() at the
  * end.  Only the transition from zero references invokes the chip-specific
  * force_wake_get hook; the count is protected by gt_lock.
  */
 void gen6_gt_force_wake_get(struct drm_i915_private *dev_priv)
 {
 	int first_ref;
 
 	mtx_lock(&dev_priv->gt_lock);
 	first_ref = (dev_priv->forcewake_count == 0);
 	dev_priv->forcewake_count++;
 	if (first_ref) {
 		dev_priv->gt.force_wake_get(dev_priv);
 	}
 	mtx_unlock(&dev_priv->gt_lock);
 }
 
 /*
  * Read GTFIFODBG; if any GT_FIFO_CPU_ERROR_MASK bit is set, warn and write
  * the mask back to the register.
  */
 void gen6_gt_check_fifodbg(struct drm_i915_private *dev_priv)
 {
 	u32 gtfifodbg;
 	gtfifodbg = I915_READ_NOTRACE(GTFIFODBG);
 	/* presumably the write-back clears the error bits; confirm against the register spec */
 	if (WARN(gtfifodbg & GT_FIFO_CPU_ERROR_MASK,
 	     "MMIO read or write has been dropped %x\n", gtfifodbg))
 		I915_WRITE_NOTRACE(GTFIFODBG, GT_FIFO_CPU_ERROR_MASK);
 }
 
 /*
  * Forcewake release via the FORCEWAKE register: write zero, issue a posting
  * read of ECOBUS, then check GTFIFODBG for dropped accesses.
  */
 static void __gen6_gt_force_wake_put(struct drm_i915_private *dev_priv)
 {
 	I915_WRITE_NOTRACE(FORCEWAKE, 0);
 	/* something from same cacheline, but !FORCEWAKE */
 	POSTING_READ(ECOBUS);
 	gen6_gt_check_fifodbg(dev_priv);
 }
 
 /*
  * Forcewake release via the FORCEWAKE_MT register: write the masked
  * FORCEWAKE_KERNEL disable, issue a posting read of ECOBUS, then check
  * GTFIFODBG for dropped accesses.
  */
 static void __gen6_gt_force_wake_mt_put(struct drm_i915_private *dev_priv)
 {
 	I915_WRITE_NOTRACE(FORCEWAKE_MT, _MASKED_BIT_DISABLE(FORCEWAKE_KERNEL));
 	/* something from same cacheline, but !FORCEWAKE_MT */
 	POSTING_READ(ECOBUS);
 	gen6_gt_check_fifodbg(dev_priv);
 }
 
 /*
  * Drop a forcewake reference (counterpart of gen6_gt_force_wake_get()).
  * The chip-specific force_wake_put hook runs only when the count reaches
  * zero; the count is protected by gt_lock.
  */
 void gen6_gt_force_wake_put(struct drm_i915_private *dev_priv)
 {
 
 	mtx_lock(&dev_priv->gt_lock);
 	dev_priv->forcewake_count--;
 	if (dev_priv->forcewake_count == 0) {
 		dev_priv->gt.force_wake_put(dev_priv);
 	}
 	mtx_unlock(&dev_priv->gt_lock);
 }
 
 /*
  * Consume one entry from the cached GT FIFO free count, refreshing the
  * cache from GT_FIFO_FREE_ENTRIES when it has dropped below
  * GT_FIFO_NUM_RESERVED_ENTRIES.
  *
  * Returns 0 normally, or 1 if the hardware count was still at or below the
  * reserved threshold after the polling loop ran out.  The cached count is
  * decremented in either case.
  */
 int __gen6_gt_wait_for_fifo(struct drm_i915_private *dev_priv)
 {
 	int ret = 0;
 
 	if (dev_priv->gt_fifo_count < GT_FIFO_NUM_RESERVED_ENTRIES) {
 		/* Up to 500 polls, 10us apart. */
 		int loop = 500;
 		u32 fifo = I915_READ_NOTRACE(GT_FIFO_FREE_ENTRIES);
 		while (fifo <= GT_FIFO_NUM_RESERVED_ENTRIES && loop--) {
 			udelay(10);
 			fifo = I915_READ_NOTRACE(GT_FIFO_FREE_ENTRIES);
 		}
 		/* loop is -1 only when the post-decrement test itself ended the loop. */
 		if (WARN_ON(loop < 0 && fifo <= GT_FIFO_NUM_RESERVED_ENTRIES))
 			++ret;
 		dev_priv->gt_fifo_count = fifo;
 	}
 	dev_priv->gt_fifo_count--;
 
 	return ret;
 }
 
 /*
  * Write _MASKED_BIT_DISABLE(0xffff) to FORCEWAKE_VLV, then issue a posting
  * read of FORCEWAKE_ACK_VLV.
  */
 static void vlv_force_wake_reset(struct drm_i915_private *dev_priv)
 {
 	I915_WRITE_NOTRACE(FORCEWAKE_VLV, _MASKED_BIT_DISABLE(0xffff));
 	/* something from same cacheline, but !FORCEWAKE_VLV */
 	POSTING_READ(FORCEWAKE_ACK_VLV);
 }
 
 /*
  * ValleyView forcewake acquire: wait for bit 0 of FORCEWAKE_ACK_VLV to read
  * clear, write the masked FORCEWAKE_KERNEL enable to FORCEWAKE_VLV, wait
  * for bit 0 to read set, then wait for the GT thread status.  Timeouts are
  * logged only.
  */
 static void vlv_force_wake_get(struct drm_i915_private *dev_priv)
 {
 	if (wait_for_atomic((I915_READ_NOTRACE(FORCEWAKE_ACK_VLV) & 1) == 0,
 			    FORCEWAKE_ACK_TIMEOUT_MS))
 		DRM_ERROR("Timed out waiting for forcewake old ack to clear.\n");
 
 	/* Unlike the gen6 variants, no posting read follows this write. */
 	I915_WRITE_NOTRACE(FORCEWAKE_VLV, _MASKED_BIT_ENABLE(FORCEWAKE_KERNEL));
 
 	if (wait_for_atomic((I915_READ_NOTRACE(FORCEWAKE_ACK_VLV) & 1),
 			    FORCEWAKE_ACK_TIMEOUT_MS))
 		DRM_ERROR("Timed out waiting for forcewake to ack request.\n");
 
 	__gen6_gt_wait_for_thread_c0(dev_priv);
 }
 
 /*
  * ValleyView forcewake release: write the masked FORCEWAKE_KERNEL disable
  * to FORCEWAKE_VLV, issue a posting read of FORCEWAKE_ACK_VLV, then check
  * GTFIFODBG for dropped accesses.
  */
 static void vlv_force_wake_put(struct drm_i915_private *dev_priv)
 {
 	I915_WRITE_NOTRACE(FORCEWAKE_VLV, _MASKED_BIT_DISABLE(FORCEWAKE_KERNEL));
 	/* something from same cacheline, but !FORCEWAKE_VLV */
 	POSTING_READ(FORCEWAKE_ACK_VLV);
 	gen6_gt_check_fifodbg(dev_priv);
 }
 
 /*
  * Reset the forcewake registers for the detected chip: the VLV variant on
  * ValleyView; otherwise, for gen >= 6, the FORCEWAKE reset followed by the
  * FORCEWAKE_MT reset on Ivy Bridge and Haswell.  Older chips are left alone.
  */
 void intel_gt_reset(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 
 	if (IS_VALLEYVIEW(dev)) {
 		vlv_force_wake_reset(dev_priv);
 		return;
 	}
 
 	if (INTEL_INFO(dev)->gen < 6)
 		return;
 
 	__gen6_gt_force_wake_reset(dev_priv);
 	if (IS_IVYBRIDGE(dev) || IS_HASWELL(dev))
 		__gen6_gt_force_wake_mt_reset(dev_priv);
 }
 
 /*
  * One-time GT setup: initialise gt_lock, reset the forcewake registers,
  * select the chip-specific force_wake_get/put hooks, and initialise the
  * rps.delayed_resume_work timeout task.
  */
 void intel_gt_init(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 
 	mtx_init(&dev_priv->gt_lock, "i915_gt_lock", NULL, MTX_DEF);
 
 	intel_gt_reset(dev);
 
 	/* No hooks are assigned when none of the three cases below matches. */
 	if (IS_VALLEYVIEW(dev)) {
 		dev_priv->gt.force_wake_get = vlv_force_wake_get;
 		dev_priv->gt.force_wake_put = vlv_force_wake_put;
 	} else if (IS_IVYBRIDGE(dev) || IS_HASWELL(dev)) {
 		dev_priv->gt.force_wake_get = __gen6_gt_force_wake_mt_get;
 		dev_priv->gt.force_wake_put = __gen6_gt_force_wake_mt_put;
 	} else if (IS_GEN6(dev)) {
 		dev_priv->gt.force_wake_get = __gen6_gt_force_wake_get;
 		dev_priv->gt.force_wake_put = __gen6_gt_force_wake_put;
 	}
 	/* The task runs intel_gen6_powersave_work with dev_priv as its argument. */
 	TIMEOUT_TASK_INIT(dev_priv->wq, &dev_priv->rps.delayed_resume_work, 0,
 	    intel_gen6_powersave_work, dev_priv);
 }
 
 /*
  * Issue a pcode mailbox read.
  *
  * The current *val is first written to GEN6_PCODE_DATA, then the mailbox
  * command (GEN6_PCODE_READY | mbox) is posted.  On completion *val is
  * replaced with the contents of GEN6_PCODE_DATA and the data register is
  * zeroed.  The caller must hold rps.hw_lock exclusively (asserted).
  *
  * Returns 0 on success, -EAGAIN if the mailbox is already busy, or
  * -ETIMEDOUT if GEN6_PCODE_READY does not clear within the 500 passed to
  * wait_for().
  */
 int sandybridge_pcode_read(struct drm_i915_private *dev_priv, u8 mbox, u32 *val)
 {
 	sx_assert(&dev_priv->rps.hw_lock, SA_XLOCKED);
 
 	if (I915_READ(GEN6_PCODE_MAILBOX) & GEN6_PCODE_READY) {
 		DRM_DEBUG_DRIVER("warning: pcode (read) mailbox access failed\n");
 		return -EAGAIN;
 	}
 
 	I915_WRITE(GEN6_PCODE_DATA, *val);
 	I915_WRITE(GEN6_PCODE_MAILBOX, GEN6_PCODE_READY | mbox);
 
 	if (wait_for((I915_READ(GEN6_PCODE_MAILBOX) & GEN6_PCODE_READY) == 0,
 		     500)) {
 		DRM_ERROR("timeout waiting for pcode read (%d) to finish\n", mbox);
 		return -ETIMEDOUT;
 	}
 
 	*val = I915_READ(GEN6_PCODE_DATA);
 	I915_WRITE(GEN6_PCODE_DATA, 0);
 
 	return 0;
 }
 
 /*
  * Issue a pcode mailbox write.
  *
  * val is written to GEN6_PCODE_DATA, then the mailbox command
  * (GEN6_PCODE_READY | mbox) is posted; on completion the data register is
  * zeroed.  The caller must hold rps.hw_lock exclusively (asserted).
  *
  * Returns 0 on success, -EAGAIN if the mailbox is already busy, or
  * -ETIMEDOUT if GEN6_PCODE_READY does not clear within the 500 passed to
  * wait_for().
  */
 int sandybridge_pcode_write(struct drm_i915_private *dev_priv, u8 mbox, u32 val)
 {
 	sx_assert(&dev_priv->rps.hw_lock, SA_XLOCKED);
 
 	if (I915_READ(GEN6_PCODE_MAILBOX) & GEN6_PCODE_READY) {
 		DRM_DEBUG_DRIVER("warning: pcode (write) mailbox access failed\n");
 		return -EAGAIN;
 	}
 
 	I915_WRITE(GEN6_PCODE_DATA, val);
 	I915_WRITE(GEN6_PCODE_MAILBOX, GEN6_PCODE_READY | mbox);
 
 	if (wait_for((I915_READ(GEN6_PCODE_MAILBOX) & GEN6_PCODE_READY) == 0,
 		     500)) {
 		DRM_ERROR("timeout waiting for pcode write (%d) to finish\n", mbox);
 		return -ETIMEDOUT;
 	}
 
 	I915_WRITE(GEN6_PCODE_DATA, 0);
 
 	return 0;
 }
Index: head/sys/dev/efidev/efirtc.c
===================================================================
--- head/sys/dev/efidev/efirtc.c	(revision 336913)
+++ head/sys/dev/efidev/efirtc.c	(revision 336914)
@@ -1,206 +1,206 @@
 /*-
  * Copyright (c) 2017 Andrew Turner
  * All rights reserved.
  *
  * This software was developed by SRI International and the University of
  * Cambridge Computer Laboratory under DARPA/AFRL contract FA8750-10-C-0237
  * ("CTSRD"), as part of the DARPA CRASH research programme.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/clock.h>
 #include <sys/efi.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 
 #include "clock_if.h"
 
 /* Set in efirtc_attach() from the tc_stz capability; read in efirtc_settime(). */
 static bool efirtc_zeroes_subseconds;
 /* Half of the clock resolution; only tv_nsec is assigned, in efirtc_attach(). */
 static struct timespec efirtc_resadj;
 
 /* Unit-conversion constants. */
 static const u_int us_per_s  = 1000000;
 static const u_int ns_per_s  = 1000000000;
 static const u_int ns_per_us = 1000;
 
 /*
  * Identify method: add a single "efirtc" child under the parent when EFI
  * runtime services are usable and no such child exists yet.
  */
 static void
 efirtc_identify(driver_t *driver, device_t parent)
 {
 
 	/* Don't add the driver unless we have working runtime services. */
 	if (efi_rt_ok() != 0)
 		return;
 	/* An "efirtc" child is already present; do not add another. */
 	if (device_find_child(parent, "efirtc", -1) != NULL)
 		return;
 	if (BUS_ADD_CHILD(parent, 0, "efirtc", -1) == NULL)
 		device_printf(parent, "add child failed\n");
 }
 
 /*
  * Probe method.  Returns BUS_PROBE_DEFAULT when the EFI clock can be read,
  * otherwise the error from efi_get_time().
  */
 static int
 efirtc_probe(device_t dev)
 {
 	struct efi_tm tm;
 	int error;
 
 	/*
 	 * A trial read keeps us from attaching when EFI runtime support
 	 * exists but the gettime function is unimplemented, e.g. on some
 	 * builds of U-Boot.
 	 */
 	error = efi_get_time(&tm);
 	if (error == 0) {
 		device_set_desc(dev, "EFI Realtime Clock");
 		return (BUS_PROBE_DEFAULT);
 	}
 
 	if (bootverbose)
 		device_printf(dev, "cannot read EFI realtime clock\n");
 	return (error);
 }
 
 /*
  * Attach method.  Queries the EFI time capabilities, derives the clock
  * resolution and the rounding adjustment used by efirtc_settime(), and
  * registers the device as a clock.
  *
  * Returns 0 on success, or the error from efi_get_time_capabilities().
  */
 static int
 efirtc_attach(device_t dev)
 {
 	struct efi_tmcap tmcap;
 	long res;
 	int error;
 
 	bzero(&tmcap, sizeof(tmcap));
 	if ((error = efi_get_time_capabilities(&tmcap)) != 0) {
 		/* Terminate the message so it does not run into the next line of output. */
 		device_printf(dev, "cannot get EFI time capabilities\n");
 		return (error);
 	}
 
 	/* Translate resolution in Hz to tick length in usec. */
 	if (tmcap.tc_res == 0)
 		res = us_per_s; /* 0 is insane, assume 1 Hz. */
 	else if (tmcap.tc_res > us_per_s)
 		res = 1; /* 1us is the best we can represent */
 	else
 		res = us_per_s / tmcap.tc_res;
 
 	/* Clock rounding adjustment is 1/2 of resolution, in nsec. */
 	efirtc_resadj.tv_nsec = (res * ns_per_us) / 2;
 
 	/* Does the clock zero the subseconds when time is set? */
 	efirtc_zeroes_subseconds = tmcap.tc_stz;
 
 	/*
 	 * Register.  If the clock zeroes out the subseconds when it's set,
 	 * schedule the SetTime calls to happen just before top-of-second.
 	 */
 	clock_register_flags(dev, res, CLOCKF_SETTIME_NO_ADJ);
 	if (efirtc_zeroes_subseconds)
 		clock_schedule(dev, ns_per_s - ns_per_us);
 
 	return (0);
 }
 
 /* Detach method: unregister the clock; always returns 0. */
 static int
 efirtc_detach(device_t dev)
 {
 
 	clock_unregister(dev);
 	return (0);
 }
 
 /*
  * clock_gettime method: read the EFI time, copy it field by field into a
  * struct clocktime, and convert that to a timespec.
  *
  * Returns the error from efi_get_time() if the read fails, otherwise the
  * result of clock_ct_to_ts().
  */
 static int
 efirtc_gettime(device_t dev, struct timespec *ts)
 {
 	struct clocktime ct;
 	struct efi_tm tm;
 	int error;
 
 	error = efi_get_time(&tm);
 	if (error != 0)
 		return (error);
 
 	/* Only these seven fields of ct are assigned here. */
 	ct.sec = tm.tm_sec;
 	ct.min = tm.tm_min;
 	ct.hour = tm.tm_hour;
 	ct.day = tm.tm_mday;
 	ct.mon = tm.tm_mon;
 	ct.year = tm.tm_year;
 	ct.nsec = tm.tm_nsec;
 
 	clock_dbgprint_ct(dev, CLOCK_DBG_READ, &ct);
 	return (clock_ct_to_ts(&ct, ts));
 }
 
 /*
  * clock_settime method: adjust the timespec in place (subtract
  * utc_offset(), and add the rounding adjustment unless the clock zeroes
  * subseconds on set), convert it to a struct clocktime, copy that into a
  * zeroed struct efi_tm, and hand it to efi_set_time().
  *
  * Returns the result of efi_set_time().
  */
 static int
 efirtc_settime(device_t dev, struct timespec *ts)
 {
 	struct clocktime ct;
 	struct efi_tm tm;
 
 	/*
 	 * We request a timespec with no resolution-adjustment so that we can
 	 * apply it ourselves based on whether or not the clock zeroes the
 	 * sub-second part of the time when setting the time.
 	 */
 	ts->tv_sec -= utc_offset();
 	if (!efirtc_zeroes_subseconds)
-		timespecadd(ts, &efirtc_resadj);
+		timespecadd(ts, &efirtc_resadj, ts);
 	
 	clock_ts_to_ct(ts, &ct);
 	clock_dbgprint_ct(dev, CLOCK_DBG_WRITE, &ct);
 
 	bzero(&tm, sizeof(tm));
 	tm.tm_sec = ct.sec;
 	tm.tm_min = ct.min;
 	tm.tm_hour = ct.hour;
 	tm.tm_mday = ct.day;
 	tm.tm_mon = ct.mon;
 	tm.tm_year = ct.year;
 	tm.tm_nsec = ct.nsec;
 
 	return (efi_set_time(&tm));
 }
 
 /* Method table binding the device and clock interfaces to this driver's functions. */
 static device_method_t efirtc_methods[] = {
 	/* Device interface */
 	DEVMETHOD(device_identify,	efirtc_identify),
 	DEVMETHOD(device_probe,		efirtc_probe),
 	DEVMETHOD(device_attach,	efirtc_attach),
 	DEVMETHOD(device_detach,	efirtc_detach),
 
 	/* Clock interface */
 	DEVMETHOD(clock_gettime,	efirtc_gettime),
 	DEVMETHOD(clock_settime,	efirtc_settime),
 
 	DEVMETHOD_END
 };
 
 static devclass_t efirtc_devclass;
 /* Driver description: name, method table, and a third field of 0. */
 static driver_t efirtc_driver = {
 	"efirtc",
 	efirtc_methods,
 	0
 };
 
 /* Register the driver under nexus, declare its version and its dependency on efirt. */
 DRIVER_MODULE(efirtc, nexus, efirtc_driver, efirtc_devclass, 0, 0);
 MODULE_VERSION(efirtc, 1);
 MODULE_DEPEND(efirtc, efirt, 1, 1, 1);
Index: head/sys/dev/isp/isp_freebsd.c
===================================================================
--- head/sys/dev/isp/isp_freebsd.c	(revision 336913)
+++ head/sys/dev/isp/isp_freebsd.c	(revision 336914)
@@ -1,4313 +1,4314 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2009-2017 Alexander Motin <mav@FreeBSD.org>
  * Copyright (c) 1997-2009 by Matthew Jacob
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice immediately at the beginning of the file, without modification,
  *    this list of conditions, and the following disclaimer.
  * 2. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * Platform (FreeBSD) dependent common attachment code for Qlogic adapters.
  */
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <dev/isp/isp_freebsd.h>
 #include <sys/unistd.h>
 #include <sys/kthread.h>
 #include <sys/conf.h>
 #include <sys/module.h>
 #include <sys/ioccom.h>
 #include <dev/isp/isp_ioctl.h>
 #include <sys/devicestat.h>
 #include <cam/cam_periph.h>
 #include <cam/cam_xpt_periph.h>
 
 /* Module version and its dependency on version 1 of "cam". */
 MODULE_VERSION(isp, 1);
 MODULE_DEPEND(isp, cam, 1, 1, 1);
 /* Driver-wide settings; not static, so visible outside this file. */
 int isp_announced = 0;
 int isp_loop_down_limit = 60;	/* default loop down limit */
 int isp_quickboot_time = 7;	/* don't wait more than N secs for loop up */
 int isp_gone_device_time = 30;	/* grace time before reporting device lost */
 /* Format string taking a channel, an index, a port ID and a reason string. */
 static const char prom3[] = "Chan %d [%u] PortID 0x%06x Departed because of %s";
 
 /* Forward declarations of file-local routines and the timer tick count. */
 static void isp_freeze_loopdown(ispsoftc_t *, int);
 static void isp_loop_changed(ispsoftc_t *isp, int chan);
 static d_ioctl_t ispioctl;
 static void isp_cam_async(void *, uint32_t, struct cam_path *, void *);
 static void isp_poll(struct cam_sim *);
 static timeout_t isp_watchdog;
 static timeout_t isp_gdt;
 static task_fn_t isp_gdt_task;
 static void isp_kthread(void *);
 static void isp_action(struct cam_sim *, union ccb *);
 static int isp_timer_count;
 static void isp_timer(void *);
 
 /* Character device switch: only an ioctl entry point (ispioctl) is set. */
 static struct cdevsw isp_cdevsw = {
 	.d_version =	D_VERSION,
 	.d_ioctl =	ispioctl,
 	.d_name =	"isp",
 };
 
 static int
 isp_role_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	ispsoftc_t *isp = (ispsoftc_t *)arg1;
 	int chan = arg2;
 	int error, old, value;
 
 	value = FCPARAM(isp, chan)->role;
 
 	error = sysctl_handle_int(oidp, &value, 0, req);
 	if ((error != 0) || (req->newptr == NULL))
 		return (error);
 
 	if (value < ISP_ROLE_NONE || value > ISP_ROLE_BOTH)
 		return (EINVAL);
 
 	ISP_LOCK(isp);
 	old = FCPARAM(isp, chan)->role;
 
 	/* We don't allow target mode switch from here. */
 	value = (old & ISP_ROLE_TARGET) | (value & ISP_ROLE_INITIATOR);
 
 	/* If nothing has changed -- we are done. */
 	if (value == old) {
 		ISP_UNLOCK(isp);
 		return (0);
 	}
 
 	/* Actually change the role. */
 	error = isp_control(isp, ISPCTL_CHANGE_ROLE, chan, value);
 	ISP_UNLOCK(isp);
 	return (error);
 }
 
 /*
  * Set up one channel: allocate and register a CAM SIM, create a wildcard
  * path, enable the AC_LOST_DEVICE async callback, and record the SIM and
  * path in the per-channel SPI or FC state.  For FC channels this also
  * initializes the fc->gdt callout and fc->gtask task, starts the channel
  * kernel process and adds the per-channel sysctl nodes.
  *
  * Returns 0 on success, or ENOMEM, EIO or ENXIO after releasing the SIM
  * resources acquired so far.
  */
 static int
 isp_attach_chan(ispsoftc_t *isp, struct cam_devq *devq, int chan)
 {
 	struct ccb_setasync csa;
 	struct cam_sim *sim;
 	struct cam_path *path;
 #ifdef	ISP_TARGET_MODE
 	int i;
 #endif
 
 	sim = cam_sim_alloc(isp_action, isp_poll, "isp", isp,
 	    device_get_unit(isp->isp_dev), &isp->isp_lock,
 	    isp->isp_maxcmds, isp->isp_maxcmds, devq);
 	if (sim == NULL)
 		return (ENOMEM);
 
 	/* Bus (de)registration is done with the ISP lock held. */
 	ISP_LOCK(isp);
 	if (xpt_bus_register(sim, isp->isp_dev, chan) != CAM_SUCCESS) {
 		ISP_UNLOCK(isp);
 		cam_sim_free(sim, FALSE);
 		return (EIO);
 	}
 	ISP_UNLOCK(isp);
 	if (xpt_create_path(&path, NULL, cam_sim_path(sim), CAM_TARGET_WILDCARD, CAM_LUN_WILDCARD) != CAM_REQ_CMP) {
 		ISP_LOCK(isp);
 		xpt_bus_deregister(cam_sim_path(sim));
 		ISP_UNLOCK(isp);
 		cam_sim_free(sim, FALSE);
 		return (ENXIO);
 	}
 	/* Ask for isp_cam_async() to be called on AC_LOST_DEVICE events. */
 	xpt_setup_ccb(&csa.ccb_h, path, 5);
 	csa.ccb_h.func_code = XPT_SASYNC_CB;
 	csa.event_enable = AC_LOST_DEVICE;
 	csa.callback = isp_cam_async;
 	csa.callback_arg = sim;
 
 	ISP_LOCK(isp);
 	xpt_action((union ccb *)&csa);
 	ISP_UNLOCK(isp);
 
 	if (IS_SCSI(isp)) {
 		struct isp_spi *spi = ISP_SPI_PC(isp, chan);
 		spi->sim = sim;
 		spi->path = path;
 #ifdef	ISP_TARGET_MODE
 		/* Put every pool entry on its free list; empty the hash. */
 		TAILQ_INIT(&spi->waitq);
 		STAILQ_INIT(&spi->ntfree);
 		for (i = 0; i < ATPDPSIZE; i++)
 			STAILQ_INSERT_TAIL(&spi->ntfree, &spi->ntpool[i], next);
 		LIST_INIT(&spi->atfree);
 		for (i = ATPDPSIZE-1; i >= 0; i--)
 			LIST_INSERT_HEAD(&spi->atfree, &spi->atpool[i], next);
 		for (i = 0; i < ATPDPHASHSIZE; i++)
 			LIST_INIT(&spi->atused[i]);
 #endif
 	} else {
 		fcparam *fcp = FCPARAM(isp, chan);
 		struct isp_fc *fc = ISP_FC_PC(isp, chan);
 		struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(isp->isp_osinfo.dev);
 		struct sysctl_oid *tree = device_get_sysctl_tree(isp->isp_osinfo.dev);
 		char name[16];
 
 		ISP_LOCK(isp);
 		fc->sim = sim;
 		fc->path = path;
 		fc->isp = isp;
 		fc->ready = 1;
 		/* Both fabric-scan options default to enabled. */
 		fcp->isp_use_gft_id = 1;
 		fcp->isp_use_gff_id = 1;
 
 		callout_init_mtx(&fc->gdt, &isp->isp_lock, 0);
 		TASK_INIT(&fc->gtask, 1, isp_gdt_task, fc);
 #ifdef	ISP_TARGET_MODE
 		/* Put every pool entry on its free list; empty the hash. */
 		TAILQ_INIT(&fc->waitq);
 		STAILQ_INIT(&fc->ntfree);
 		for (i = 0; i < ATPDPSIZE; i++)
 			STAILQ_INSERT_TAIL(&fc->ntfree, &fc->ntpool[i], next);
 		LIST_INIT(&fc->atfree);
 		for (i = ATPDPSIZE-1; i >= 0; i--)
 			LIST_INSERT_HEAD(&fc->atfree, &fc->atpool[i], next);
 		for (i = 0; i < ATPDPHASHSIZE; i++)
 			LIST_INIT(&fc->atused[i]);
 #endif
 		isp_loop_changed(isp, chan);
 		ISP_UNLOCK(isp);
 		/*
 		 * Start the per-channel kernel process; if that fails,
 		 * release the path, the bus registration and the SIM.
 		 */
 		if (kproc_create(isp_kthread, fc, &fc->kproc, 0, 0,
 		    "%s_%d", device_get_nameunit(isp->isp_osinfo.dev), chan)) {
 			xpt_free_path(fc->path);
 			ISP_LOCK(isp);
 			xpt_bus_deregister(cam_sim_path(fc->sim));
 			ISP_UNLOCK(isp);
 			cam_sim_free(fc->sim, FALSE);
 			return (ENOMEM);
 		}
 		fc->num_threads += 1;
 		/* Channels other than 0 get their own "chanN" sysctl subtree. */
 		if (chan > 0) {
 			snprintf(name, sizeof(name), "chan%d", chan);
 			tree = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(tree),
 			    OID_AUTO, name, CTLFLAG_RW, 0, "Virtual channel");
 		}
 		SYSCTL_ADD_QUAD(ctx, SYSCTL_CHILDREN(tree), OID_AUTO,
 		    "wwnn", CTLFLAG_RD, &fcp->isp_wwnn,
 		    "World Wide Node Name");
 		SYSCTL_ADD_QUAD(ctx, SYSCTL_CHILDREN(tree), OID_AUTO,
 		    "wwpn", CTLFLAG_RD, &fcp->isp_wwpn,
 		    "World Wide Port Name");
 		SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(tree), OID_AUTO,
 		    "loop_down_limit", CTLFLAG_RW, &fc->loop_down_limit, 0,
 		    "Loop Down Limit");
 		SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(tree), OID_AUTO,
 		    "gone_device_time", CTLFLAG_RW, &fc->gone_device_time, 0,
 		    "Gone Device Time");
 #if defined(ISP_TARGET_MODE) && defined(DEBUG)
 		SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(tree), OID_AUTO,
 		    "inject_lost_data_frame", CTLFLAG_RW, &fc->inject_lost_data_frame, 0,
 		    "Cause a Lost Frame on a Read");
 #endif
 		/* "role" goes through isp_role_sysctl() rather than a raw int. */
 		SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(tree), OID_AUTO,
 		    "role", CTLTYPE_INT | CTLFLAG_RW, isp, chan,
 		    isp_role_sysctl, "I", "Current role");
 		SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(tree), OID_AUTO,
 		    "speed", CTLFLAG_RD, &fcp->isp_gbspeed, 0,
 		    "Connection speed in gigabits");
 		SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(tree), OID_AUTO,
 		    "linkstate", CTLFLAG_RD, &fcp->isp_linkstate, 0,
 		    "Link state");
 		SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(tree), OID_AUTO,
 		    "fwstate", CTLFLAG_RD, &fcp->isp_fwstate, 0,
 		    "Firmware state");
 		SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(tree), OID_AUTO,
 		    "loopstate", CTLFLAG_RD, &fcp->isp_loopstate, 0,
 		    "Loop state");
 		SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(tree), OID_AUTO,
 		    "topo", CTLFLAG_RD, &fcp->isp_topo, 0,
 		    "Connection topology");
 		SYSCTL_ADD_INT(ctx, SYSCTL_CHILDREN(tree), OID_AUTO,
 		    "use_gft_id", CTLFLAG_RWTUN, &fcp->isp_use_gft_id, 0,
 		    "Use GFT_ID during fabric scan");
 		SYSCTL_ADD_INT(ctx, SYSCTL_CHILDREN(tree), OID_AUTO,
 		    "use_gff_id", CTLFLAG_RWTUN, &fcp->isp_use_gff_id, 0,
 		    "Use GFF_ID during fabric scan");
 	}
 	return (0);
 }
 
 /*
  * Tear down one channel: clear the async callback registration
  * (event_enable = 0), free the path, deregister and free the SIM, then
  * wake the channel's thread(s) and sleep until *num_threads reaches zero.
  *
  * isp_detach() calls this with the ISP lock held; mtx_sleep() below drops
  * and reacquires isp->isp_lock while waiting.
  */
 static void
 isp_detach_chan(ispsoftc_t *isp, int chan)
 {
 	struct cam_sim *sim;
 	struct cam_path *path;
 	struct ccb_setasync csa;
 	int *num_threads;
 
 	ISP_GET_PC(isp, chan, sim, sim);
 	ISP_GET_PC(isp, chan, path, path);
 	ISP_GET_PC_ADDR(isp, chan, num_threads, num_threads);
 
 	xpt_setup_ccb(&csa.ccb_h, path, 5);
 	csa.ccb_h.func_code = XPT_SASYNC_CB;
 	csa.event_enable = 0;
 	csa.callback = isp_cam_async;
 	csa.callback_arg = sim;
 	xpt_action((union ccb *)&csa);
 	xpt_free_path(path);
 	xpt_bus_deregister(cam_sim_path(sim));
 	cam_sim_free(sim, FALSE);
 
 	/* Wait for the channel's spawned threads to exit. */
 	wakeup(isp->isp_osinfo.pc.ptr);
 	/* Re-check the count at least every 100 ticks. */
 	while (*num_threads != 0)
 		mtx_sleep(isp, &isp->isp_lock, PRIBIO, "isp_reap", 100);
 }
 
 /*
  * Attach the adapter to CAM: allocate the SIM queue, attach every channel,
  * start the periodic isp_timer() callout and create the control device
  * node.
  *
  * Returns 0 on success, EIO if the SIM queue cannot be allocated, or -1
  * after releasing the channels that had already been attached.
  */
 int
 isp_attach(ispsoftc_t *isp)
 {
 	const char *nu = device_get_nameunit(isp->isp_osinfo.dev);
 	int du = device_get_unit(isp->isp_dev);
 	int chan;
 
 	/*
 	 * Create the device queue for our SIM(s).
 	 */
 	isp->isp_osinfo.devq = cam_simq_alloc(isp->isp_maxcmds);
 	if (isp->isp_osinfo.devq == NULL) {
 		return (EIO);
 	}
 
 	for (chan = 0; chan < isp->isp_nchan; chan++) {
 		if (isp_attach_chan(isp, isp->isp_osinfo.devq, chan)) {
 			goto unwind;
 		}
 	}
 
 	/* The timer fires every hz / 4 ticks. */
 	callout_init_mtx(&isp->isp_osinfo.tmo, &isp->isp_lock, 0);
 	isp_timer_count = hz >> 2;
 	callout_reset(&isp->isp_osinfo.tmo, isp_timer_count, isp_timer, isp);
 
 	/* A NULL result from make_dev() is tolerated; attach still succeeds. */
 	isp->isp_osinfo.cdev = make_dev(&isp_cdevsw, du, UID_ROOT, GID_OPERATOR, 0600, "%s", nu);
 	if (isp->isp_osinfo.cdev) {
 		isp->isp_osinfo.cdev->si_drv1 = isp;
 	}
 	return (0);
 
 unwind:
 	/* chan is the channel that failed; release the ones before it. */
 	while (--chan >= 0) {
 		struct cam_sim *sim;
 		struct cam_path *path;
 
 		ISP_GET_PC(isp, chan, sim, sim);
 		ISP_GET_PC(isp, chan, path, path);
 		xpt_free_path(path);
 		ISP_LOCK(isp);
 		xpt_bus_deregister(cam_sim_path(sim));
 		ISP_UNLOCK(isp);
 		cam_sim_free(sim, FALSE);
 	}
 	cam_simq_free(isp->isp_osinfo.devq);
 	isp->isp_osinfo.devq = NULL;
 	return (-1);
 }
 
 /*
  * Detach the adapter from CAM: destroy the control device node, mark the
  * softc as exiting, detach the channels from the highest index down to 0,
  * drain the timer callout and free the SIM queue.  Always returns 0.
  */
 int
 isp_detach(ispsoftc_t *isp)
 {
 	int chan;
 
 	if (isp->isp_osinfo.cdev) {
 		destroy_dev(isp->isp_osinfo.cdev);
 		isp->isp_osinfo.cdev = NULL;
 	}
 	ISP_LOCK(isp);
 	/* Tell spawned threads that we're exiting. */
 	isp->isp_osinfo.is_exiting = 1;
 	for (chan = isp->isp_nchan - 1; chan >= 0; chan -= 1)
 		isp_detach_chan(isp, chan);
 	ISP_UNLOCK(isp);
 	/* The callout is drained only after the ISP lock has been released. */
 	callout_drain(&isp->isp_osinfo.tmo);
 	cam_simq_free(isp->isp_osinfo.devq);
 	return (0);
 }
 
 /*
  * Record a loop-down freeze on an FC channel.  If the SIM queue is not
  * frozen for any reason yet, hold boot and freeze the queue; otherwise
  * only add SIMQFRZ_LOOPDOWN to the set of freeze reasons.  Does nothing
  * when the channel has no SIM.
  */
 static void
 isp_freeze_loopdown(ispsoftc_t *isp, int chan)
 {
 	struct isp_fc *fc = ISP_FC_PC(isp, chan);
 
 	if (fc->sim == NULL)
 		return;
 
 	if (fc->simqfrozen != 0) {
 		/* Already frozen for another reason: just note ours. */
 		isp_prt(isp, ISP_LOGDEBUG0,
 		    "Chan %d Mark simq frozen (loopdown)", chan);
 		fc->simqfrozen |= SIMQFRZ_LOOPDOWN;
 		return;
 	}
 
 	isp_prt(isp, ISP_LOGDEBUG0,
 	    "Chan %d Freeze simq (loopdown)", chan);
 	fc->simqfrozen = SIMQFRZ_LOOPDOWN;
 	xpt_hold_boot();
 	xpt_freeze_simq(fc->sim, 1);
 }
 
 /*
  * Drop the loop-down freeze reason from an FC channel.  The SIM queue and
  * boot hold are released only when loop-down was set and no other freeze
  * reason remains.  Does nothing when the channel has no SIM.
  */
 static void
 isp_unfreeze_loopdown(ispsoftc_t *isp, int chan)
 {
 	struct isp_fc *fc = ISP_FC_PC(isp, chan);
 	int had_loopdown;
 
 	if (fc->sim == NULL)
 		return;
 
 	had_loopdown = fc->simqfrozen & SIMQFRZ_LOOPDOWN;
 	fc->simqfrozen &= ~SIMQFRZ_LOOPDOWN;
 	if (!had_loopdown || fc->simqfrozen != 0)
 		return;
 
 	isp_prt(isp, ISP_LOGDEBUG0,
 	    "Chan %d Release simq", chan);
 	xpt_release_simq(fc->sim, 1);
 	xpt_release_boot();
 }
 
 /*
  * ioctl entry point for the isp control device.
  *
  * `addr' is the ioctl argument buffer; its layout depends on the command
  * `c'.  Returns 0 on success or a positive errno value.  Commands that are
  * unknown, or that do not apply to this adapter type, return ENOTTY.
  *
  * Channel numbers supplied by the caller are range-checked against
  * isp->isp_nchan before they are used to index per-channel state; an
  * out-of-range channel yields ENXIO.
  */
 static int
 ispioctl(struct cdev *dev, u_long c, caddr_t addr, int flags, struct thread *td)
 {
 	ispsoftc_t *isp;
 	int nr, chan, retval = ENOTTY;
 
 	isp = dev->si_drv1;
 
 	switch (c) {
 	case ISP_SDBLEV:
 	{
 		/* Install the new debug level and hand back the old one. */
 		int olddblev = isp->isp_dblev;
 		isp->isp_dblev = *(int *)addr;
 		*(int *)addr = olddblev;
 		retval = 0;
 		break;
 	}
 	case ISP_GETROLE:
 		chan = *(int *)addr;
 		if (chan < 0 || chan >= isp->isp_nchan) {
 			retval = ENXIO;
 			break;
 		}
 		if (IS_FC(isp)) {
 			*(int *)addr = FCPARAM(isp, chan)->role;
 		} else {
 			*(int *)addr = ISP_ROLE_INITIATOR;
 		}
 		retval = 0;
 		break;
 	case ISP_SETROLE:
 		if (IS_SCSI(isp))
 			break;
 		/* The argument packs the channel above the low 8 role bits. */
 		nr = *(int *)addr;
 		chan = nr >> 8;
 		if (chan < 0 || chan >= isp->isp_nchan) {
 			retval = ENXIO;
 			break;
 		}
 		nr &= 0xff;
 		if (nr & ~(ISP_ROLE_INITIATOR|ISP_ROLE_TARGET)) {
 			retval = EINVAL;
 			break;
 		}
 		ISP_LOCK(isp);
 		/* Hand the previous role back to the caller. */
 		*(int *)addr = FCPARAM(isp, chan)->role;
 		/* Report a failed role change instead of discarding it. */
 		if (isp_control(isp, ISPCTL_CHANGE_ROLE, chan, nr)) {
 			retval = EIO;
 		} else {
 			retval = 0;
 		}
 		ISP_UNLOCK(isp);
 		break;
 
 	case ISP_RESETHBA:
 		ISP_LOCK(isp);
 		isp_reinit(isp, 0);
 		ISP_UNLOCK(isp);
 		retval = 0;
 		break;
 
 	case ISP_RESCAN:
 		if (IS_FC(isp)) {
 			chan = *(intptr_t *)addr;
 			if (chan < 0 || chan >= isp->isp_nchan) {
 				retval = ENXIO;
 				break;
 			}
 			ISP_LOCK(isp);
 			if (isp_fc_runstate(isp, chan, 5 * 1000000) != LOOP_READY) {
 				retval = EIO;
 			} else {
 				retval = 0;
 			}
 			ISP_UNLOCK(isp);
 		}
 		break;
 
 	case ISP_FC_LIP:
 		if (IS_FC(isp)) {
 			chan = *(intptr_t *)addr;
 			if (chan < 0 || chan >= isp->isp_nchan) {
 				retval = ENXIO;
 				break;
 			}
 			ISP_LOCK(isp);
 			if (isp_control(isp, ISPCTL_SEND_LIP, chan)) {
 				retval = EIO;
 			} else {
 				retval = 0;
 			}
 			ISP_UNLOCK(isp);
 		}
 		break;
 	case ISP_FC_GETDINFO:
 	{
 		struct isp_fc_device *ifc = (struct isp_fc_device *) addr;
 		fcportdb_t *lp;
 
 		if (IS_SCSI(isp)) {
 			break;
 		}
 		if (ifc->loopid >= MAX_FC_TARG) {
 			retval = EINVAL;
 			break;
 		}
 		/* Validate the caller-supplied channel before indexing with it. */
 		chan = ifc->chan;
 		if (chan < 0 || chan >= isp->isp_nchan) {
 			retval = ENXIO;
 			break;
 		}
 		lp = &FCPARAM(isp, chan)->portdb[ifc->loopid];
 		if (lp->state != FC_PORTDB_STATE_NIL) {
 			ifc->role = (lp->prli_word3 & SVC3_ROLE_MASK) >> SVC3_ROLE_SHIFT;
 			ifc->loopid = lp->handle;
 			ifc->portid = lp->portid;
 			ifc->node_wwn = lp->node_wwn;
 			ifc->port_wwn = lp->port_wwn;
 			retval = 0;
 		} else {
 			retval = ENODEV;
 		}
 		break;
 	}
 	case ISP_FC_GETHINFO:
 	{
 		struct isp_hba_device *hba = (struct isp_hba_device *) addr;
 
 		chan = hba->fc_channel;
 		if (chan < 0 || chan >= isp->isp_nchan) {
 			retval = ENXIO;
 			break;
 		}
 		hba->fc_fw_major = ISP_FW_MAJORX(isp->isp_fwrev);
 		hba->fc_fw_minor = ISP_FW_MINORX(isp->isp_fwrev);
 		hba->fc_fw_micro = ISP_FW_MICROX(isp->isp_fwrev);
 		hba->fc_nchannels = isp->isp_nchan;
 		if (IS_FC(isp)) {
 			hba->fc_nports = MAX_FC_TARG;
 			hba->fc_speed = FCPARAM(isp, hba->fc_channel)->isp_gbspeed;
 			hba->fc_topology = FCPARAM(isp, chan)->isp_topo + 1;
 			hba->fc_loopid = FCPARAM(isp, chan)->isp_loopid;
 			hba->nvram_node_wwn = FCPARAM(isp, chan)->isp_wwnn_nvram;
 			hba->nvram_port_wwn = FCPARAM(isp, chan)->isp_wwpn_nvram;
 			hba->active_node_wwn = FCPARAM(isp, chan)->isp_wwnn;
 			hba->active_port_wwn = FCPARAM(isp, chan)->isp_wwpn;
 		} else {
 			hba->fc_nports = MAX_TARGETS;
 			hba->fc_speed = 0;
 			hba->fc_topology = 0;
 			hba->nvram_node_wwn = 0ull;
 			hba->nvram_port_wwn = 0ull;
 			hba->active_node_wwn = 0ull;
 			hba->active_port_wwn = 0ull;
 		}
 		retval = 0;
 		break;
 	}
 	case ISP_TSK_MGMT:
 	{
 		int needmarker;
 		struct isp_fc_tsk_mgmt *fct = (struct isp_fc_tsk_mgmt *) addr;
 		uint16_t nphdl;
 		mbreg_t mbs;
 
 		if (IS_SCSI(isp)) {
 			break;
 		}
 
 		chan = fct->chan;
 		if (chan < 0 || chan >= isp->isp_nchan) {
 			retval = ENXIO;
 			break;
 		}
 
 		needmarker = retval = 0;
 		nphdl = fct->loopid;
 		ISP_LOCK(isp);
 		if (IS_24XX(isp)) {
 			void *reqp;
 			uint8_t resp[QENTRY_LEN];
 			isp24xx_tmf_t tmf;
 			isp24xx_statusreq_t sp;
 			fcparam *fcp = FCPARAM(isp, chan);
 			fcportdb_t *lp;
 			int i;
 
 			/* Find the port database entry with this handle. */
 			for (i = 0; i < MAX_FC_TARG; i++) {
 				lp = &fcp->portdb[i];
 				if (lp->handle == nphdl) {
 					break;
 				}
 			}
 			if (i == MAX_FC_TARG) {
 				retval = ENXIO;
 				ISP_UNLOCK(isp);
 				break;
 			}
 			ISP_MEMZERO(&tmf, sizeof(tmf));
 			tmf.tmf_header.rqs_entry_type = RQSTYPE_TSK_MGMT;
 			tmf.tmf_header.rqs_entry_count = 1;
 			tmf.tmf_nphdl = lp->handle;
 			tmf.tmf_delay = 2;
 			tmf.tmf_timeout = 4;
 			tmf.tmf_tidlo = lp->portid;
 			tmf.tmf_tidhi = lp->portid >> 16;
 			tmf.tmf_vpidx = ISP_GET_VPIDX(isp, chan);
 			tmf.tmf_lun[1] = fct->lun & 0xff;
 			if (fct->lun >= 256) {
 				tmf.tmf_lun[0] = 0x40 | (fct->lun >> 8);
 			}
 			switch (fct->action) {
 			case IPT_CLEAR_ACA:
 				tmf.tmf_flags = ISP24XX_TMF_CLEAR_ACA;
 				break;
 			case IPT_TARGET_RESET:
 				tmf.tmf_flags = ISP24XX_TMF_TARGET_RESET;
 				needmarker = 1;
 				break;
 			case IPT_LUN_RESET:
 				tmf.tmf_flags = ISP24XX_TMF_LUN_RESET;
 				needmarker = 1;
 				break;
 			case IPT_CLEAR_TASK_SET:
 				tmf.tmf_flags = ISP24XX_TMF_CLEAR_TASK_SET;
 				needmarker = 1;
 				break;
 			case IPT_ABORT_TASK_SET:
 				tmf.tmf_flags = ISP24XX_TMF_ABORT_TASK_SET;
 				needmarker = 1;
 				break;
 			default:
 				retval = EINVAL;
 				break;
 			}
 			if (retval) {
 				ISP_UNLOCK(isp);
 				break;
 			}
 
 			/* Prepare space for response in memory */
 			memset(resp, 0xff, sizeof(resp));
 			tmf.tmf_handle = isp_allocate_handle(isp, resp,
 			    ISP_HANDLE_CTRL);
 			if (tmf.tmf_handle == 0) {
 				isp_prt(isp, ISP_LOGERR,
 				    "%s: TMF of Chan %d out of handles",
 				    __func__, chan);
 				ISP_UNLOCK(isp);
 				retval = ENOMEM;
 				break;
 			}
 
 			/* Send request and wait for response. */
 			reqp = isp_getrqentry(isp);
 			if (reqp == NULL) {
 				isp_prt(isp, ISP_LOGERR,
 				    "%s: TMF of Chan %d out of rqent",
 				    __func__, chan);
 				isp_destroy_handle(isp, tmf.tmf_handle);
 				ISP_UNLOCK(isp);
 				retval = EIO;
 				break;
 			}
 			isp_put_24xx_tmf(isp, &tmf, (isp24xx_tmf_t *)reqp);
 			if (isp->isp_dblev & ISP_LOGDEBUG1)
 				isp_print_bytes(isp, "IOCB TMF", QENTRY_LEN, reqp);
 			ISP_SYNC_REQUEST(isp);
 			/* Sleep on the response buffer for at most 5 seconds. */
 			if (msleep(resp, &isp->isp_lock, 0, "TMF", 5*hz) == EWOULDBLOCK) {
 				isp_prt(isp, ISP_LOGERR,
 				    "%s: TMF of Chan %d timed out",
 				    __func__, chan);
 				isp_destroy_handle(isp, tmf.tmf_handle);
 				ISP_UNLOCK(isp);
 				retval = EIO;
 				break;
 			}
 			if (isp->isp_dblev & ISP_LOGDEBUG1)
 				isp_print_bytes(isp, "IOCB TMF response", QENTRY_LEN, resp);
 			isp_get_24xx_response(isp, (isp24xx_statusreq_t *)resp, &sp);
 
 			if (sp.req_completion_status != 0)
 				retval = EIO;
 			else if (needmarker)
 				fcp->sendmarker = 1;
 		} else {
 			MBSINIT(&mbs, 0, MBLOGALL, 0);
 			/* Without 2K-login capability the handle goes in the high byte. */
 			if (ISP_CAP_2KLOGIN(isp) == 0) {
 				nphdl <<= 8;
 			}
 			switch (fct->action) {
 			case IPT_CLEAR_ACA:
 				mbs.param[0] = MBOX_CLEAR_ACA;
 				mbs.param[1] = nphdl;
 				mbs.param[2] = fct->lun;
 				break;
 			case IPT_TARGET_RESET:
 				mbs.param[0] = MBOX_TARGET_RESET;
 				mbs.param[1] = nphdl;
 				needmarker = 1;
 				break;
 			case IPT_LUN_RESET:
 				mbs.param[0] = MBOX_LUN_RESET;
 				mbs.param[1] = nphdl;
 				mbs.param[2] = fct->lun;
 				needmarker = 1;
 				break;
 			case IPT_CLEAR_TASK_SET:
 				mbs.param[0] = MBOX_CLEAR_TASK_SET;
 				mbs.param[1] = nphdl;
 				mbs.param[2] = fct->lun;
 				needmarker = 1;
 				break;
 			case IPT_ABORT_TASK_SET:
 				mbs.param[0] = MBOX_ABORT_TASK_SET;
 				mbs.param[1] = nphdl;
 				mbs.param[2] = fct->lun;
 				needmarker = 1;
 				break;
 			default:
 				retval = EINVAL;
 				break;
 			}
 			if (retval == 0) {
 				if (needmarker) {
 					FCPARAM(isp, chan)->sendmarker = 1;
 				}
 				retval = isp_control(isp, ISPCTL_RUN_MBOXCMD, &mbs);
 				if (retval) {
 					retval = EIO;
 				}
 			}
 		}
 		ISP_UNLOCK(isp);
 		break;
 	}
 	default:
 		break;
 	}
 	return (retval);
 }
 
 /*
  * Local Inlines
  */
 
 static ISP_INLINE int isp_get_pcmd(ispsoftc_t *, union ccb *);
 static ISP_INLINE void isp_free_pcmd(ispsoftc_t *, union ccb *);
 
 /*
  * Take the first entry off the softc's free list of per-command data and
  * store it in the CCB's ISP_PCMD slot.  Returns 0 on success, or -1 when
  * the free list is empty (the slot is then left NULL).
  */
 static ISP_INLINE int
 isp_get_pcmd(ispsoftc_t *isp, union ccb *ccb)
 {
 	ISP_PCMD(ccb) = isp->isp_osinfo.pcmd_free;
 	if (ISP_PCMD(ccb) != NULL) {
 		/* Advance the free-list head past the entry just taken. */
 		isp->isp_osinfo.pcmd_free =
 		    ((struct isp_pcmd *)ISP_PCMD(ccb))->next;
 		return (0);
 	}
 	return (-1);
 }
 
 /*
  * Return the CCB's per-command data, if it has any, to the head of the
  * softc's free list and clear the CCB's ISP_PCMD slot.
  */
 static ISP_INLINE void
 isp_free_pcmd(ispsoftc_t *isp, union ccb *ccb)
 {
 	if (ISP_PCMD(ccb) == NULL)
 		return;
 
 #ifdef	ISP_TARGET_MODE
 	PISP_PCMD(ccb)->datalen = 0;
 #endif
 	PISP_PCMD(ccb)->next = isp->isp_osinfo.pcmd_free;
 	isp->isp_osinfo.pcmd_free = ISP_PCMD(ccb);
 	ISP_PCMD(ccb) = NULL;
 }
 
 /*
  * Put the target mode functions here, because some are inlines
  */
 #ifdef	ISP_TARGET_MODE
 static ISP_INLINE tstate_t *get_lun_statep(ispsoftc_t *, int, lun_id_t);
 static atio_private_data_t *isp_get_atpd(ispsoftc_t *, int, uint32_t);
 static atio_private_data_t *isp_find_atpd(ispsoftc_t *, int, uint32_t);
 static void isp_put_atpd(ispsoftc_t *, int, atio_private_data_t *);
 static inot_private_data_t *isp_get_ntpd(ispsoftc_t *, int);
 static inot_private_data_t *isp_find_ntpd(ispsoftc_t *, int, uint32_t, uint32_t);
 static void isp_put_ntpd(ispsoftc_t *, int, inot_private_data_t *);
 static cam_status create_lun_state(ispsoftc_t *, int, struct cam_path *, tstate_t **);
 static void destroy_lun_state(ispsoftc_t *, int, tstate_t *);
 static void isp_enable_lun(ispsoftc_t *, union ccb *);
 static void isp_disable_lun(ispsoftc_t *, union ccb *);
 static timeout_t isp_refire_putback_atio;
 static timeout_t isp_refire_notify_ack;
 static void isp_complete_ctio(union ccb *);
 static void isp_target_putback_atio(union ccb *);
 enum Start_Ctio_How { FROM_CAM, FROM_TIMER, FROM_SRR, FROM_CTIO_DONE };
 static void isp_target_start_ctio(ispsoftc_t *, union ccb *, enum Start_Ctio_How);
 static void isp_handle_platform_atio2(ispsoftc_t *, at2_entry_t *);
 static void isp_handle_platform_atio7(ispsoftc_t *, at7_entry_t *);
 static void isp_handle_platform_ctio(ispsoftc_t *, void *);
 static int isp_handle_platform_target_notify_ack(ispsoftc_t *, isp_notify_t *, uint32_t rsp);
 static void isp_handle_platform_target_tmf(ispsoftc_t *, isp_notify_t *);
 static void isp_target_mark_aborted_early(ispsoftc_t *, int chan, tstate_t *, uint32_t);
 
 /*
  * Look up the LUN state for `lun' on channel `bus'.  Returns the matching
  * tstate_t, or NULL when bus is not below isp->isp_nchan or no entry in
  * the LUN's hash chain has that LUN.
  */
 static ISP_INLINE tstate_t *
 get_lun_statep(ispsoftc_t *isp, int bus, lun_id_t lun)
 {
 	struct tslist *bucket;
 	tstate_t *ts;
 
 	if (bus >= isp->isp_nchan)
 		return (NULL);
 
 	ISP_GET_PC_ADDR(isp, bus, lun_hash[LUN_HASH_FUNC(lun)], bucket);
 	/* SLIST_FOREACH leaves ts NULL when the chain is exhausted. */
 	SLIST_FOREACH(ts, bucket, next) {
 		if (ts->ts_lun == lun)
 			break;
 	}
 	return (ts);
 }
 
 /*
  * Replay the entries queued on tptr->restart_queue.
  *
  * The queue is first moved to a local list.  Each entry is then handed to
  * isp_handle_platform_atio7() (24XX) or isp_handle_platform_atio2() and
  * returned to the free pool.  Processing stops as soon as
  * tptr->restart_queue is found non-empty again; any unprocessed local
  * entries are then put back at the front of it.
  *
  * Returns non-zero if tptr->restart_queue is non-empty on exit.
  */
 static int
 isp_atio_restart(ispsoftc_t *isp, int bus, tstate_t *tptr)
 {
 	inot_private_data_t *ntp;
 	struct ntpdlist rq;
 
 	if (STAILQ_EMPTY(&tptr->restart_queue))
 		return (0);
 	/* Work from a private copy so new entries can be detected below. */
 	STAILQ_INIT(&rq);
 	STAILQ_CONCAT(&rq, &tptr->restart_queue);
 	while ((ntp = STAILQ_FIRST(&rq)) != NULL) {
 		STAILQ_REMOVE_HEAD(&rq, next);
 		if (IS_24XX(isp)) {
 			isp_prt(isp, ISP_LOGTDEBUG0,
 			    "%s: restarting resrc deprived %x", __func__,
 			    ((at7_entry_t *)ntp->data)->at_rxid);
 			isp_handle_platform_atio7(isp, (at7_entry_t *) ntp->data);
 		} else {
 			isp_prt(isp, ISP_LOGTDEBUG0,
 			    "%s: restarting resrc deprived %x", __func__,
 			    ((at2_entry_t *)ntp->data)->at_rxid);
 			isp_handle_platform_atio2(isp, (at2_entry_t *) ntp->data);
 		}
 		isp_put_ntpd(isp, bus, ntp);
 		/* Something was queued again while replaying: stop here. */
 		if (!STAILQ_EMPTY(&tptr->restart_queue))
 			break;
 	}
 	if (!STAILQ_EMPTY(&rq)) {
 		/*
 		 * Append the newly queued entries after the leftovers, then
 		 * move the combined list back, so the leftovers stay first.
 		 */
 		STAILQ_CONCAT(&rq, &tptr->restart_queue);
 		STAILQ_CONCAT(&tptr->restart_queue, &rq);
 	}
 	return (!STAILQ_EMPTY(&tptr->restart_queue));
 }
 
 /*
  * For every channel, run isp_atio_restart() on each LUN state in the LUN
  * hash, then take the first CCB (if any) off the channel's waitq and
  * resubmit it through isp_target_start_ctio() with FROM_TIMER.
  */
 static void
 isp_tmcmd_restart(ispsoftc_t *isp)
 {
 	tstate_t *tptr;
 	union ccb *ccb;
 	struct tslist *lhp;
 	struct isp_ccbq *waitq;
 	int bus, i;
 
 	for (bus = 0; bus < isp->isp_nchan; bus++) {
 		for (i = 0; i < LUN_HASH_SIZE; i++) {
 			ISP_GET_PC_ADDR(isp, bus, lun_hash[i], lhp);
 			SLIST_FOREACH(tptr, lhp, next)
 				isp_atio_restart(isp, bus, tptr);
 		}
 
 		/*
 		 * We only need to do this once per channel.
 		 */
 		ISP_GET_PC_ADDR(isp, bus, waitq, waitq);
 		ccb = (union ccb *)TAILQ_FIRST(waitq);
 		if (ccb != NULL) {
 			/* Dequeue it here; isp_target_start_ctio() re-inserts it. */
 			TAILQ_REMOVE(waitq, &ccb->ccb_h, sim_links.tqe);
 			isp_target_start_ctio(isp, ccb, FROM_TIMER);
 		}
 	}
 }
 
 /*
  * Allocate an ATIO private-data entry for `tag' on channel `chan': take
  * the first entry from the channel's free list, stamp it with the tag and
  * insert it into the used hash.  Returns NULL when the free list is empty.
  */
 static atio_private_data_t *
 isp_get_atpd(ispsoftc_t *isp, int chan, uint32_t tag)
 {
 	struct atpdlist *freelist;
 	struct atpdlist *usedhash;
 	atio_private_data_t *atp;
 
 	ISP_GET_PC_ADDR(isp, chan, atfree, freelist);
 	atp = LIST_FIRST(freelist);
 	if (atp == NULL)
 		return (NULL);
 
 	LIST_REMOVE(atp, next);
 	atp->tag = tag;
 	ISP_GET_PC(isp, chan, atused, usedhash);
 	LIST_INSERT_HEAD(&usedhash[ATPDPHASH(tag)], atp, next);
 	return (atp);
 }
 
 /*
  * Find the in-use ATIO private-data entry with the given tag on channel
  * `chan' by walking the tag's hash chain.  Returns NULL if none matches.
  */
 static atio_private_data_t *
 isp_find_atpd(ispsoftc_t *isp, int chan, uint32_t tag)
 {
 	struct atpdlist *usedhash;
 	atio_private_data_t *atp;
 
 	ISP_GET_PC(isp, chan, atused, usedhash);
 	/* LIST_FOREACH leaves atp NULL when the chain is exhausted. */
 	LIST_FOREACH(atp, &usedhash[ATPDPHASH(tag)], next) {
 		if (atp->tag == tag)
 			break;
 	}
 	return (atp);
 }
 
 /*
  * Release an ATIO private-data entry: give back its ests buffer (if any),
  * unlink it from the list it is on, zero it and put it at the head of the
  * channel's free list.
  */
 static void
 isp_put_atpd(ispsoftc_t *isp, int chan, atio_private_data_t *atp)
 {
 	struct atpdlist *freelist;
 
 	if (atp->ests)
 		isp_put_ecmd(isp, atp->ests);
 
 	/* Unlink before wiping: the memset also clears the list linkage. */
 	LIST_REMOVE(atp, next);
 	memset(atp, 0, sizeof(*atp));
 
 	ISP_GET_PC_ADDR(isp, chan, atfree, freelist);
 	LIST_INSERT_HEAD(freelist, atp, next);
 }
 
 /*
  * Log every ATIO private-data entry of channel `chan' whose state is not
  * ATPD_STATE_FREE.  The low three bits of the state select the printed
  * state name; indices 6 and 7 have no name and print as "?6" and "?7".
  */
 static void
 isp_dump_atpd(ispsoftc_t *isp, int chan)
 {
 	atio_private_data_t *atp, *atpool;
 	static const char * const states[8] = { "Free", "ATIO", "CAM", "CTIO", "LAST_CTIO", "PDON", "?6", "?7" };
 
 	ISP_GET_PC(isp, chan, atpool, atpool);
 	for (atp = atpool; atp < &atpool[ATPDPSIZE]; atp++) {
 		if (atp->state == ATPD_STATE_FREE)
 			continue;
 		isp_prt(isp, ISP_LOGALL, "Chan %d ATP [0x%x] origdlen %u bytes_xfrd %u lun %jx nphdl 0x%04x s_id 0x%06x d_id 0x%06x oxid 0x%04x state %s",
 		    chan, atp->tag, atp->orig_datalen, atp->bytes_xfered, (uintmax_t)atp->lun, atp->nphdl, atp->sid, atp->did, atp->oxid, states[atp->state & 0x7]);
 	}
 }
 
 /*
  * Take the first notify private-data entry off channel `chan's free list.
  * Returns NULL when the list is empty.
  */
 static inot_private_data_t *
 isp_get_ntpd(ispsoftc_t *isp, int chan)
 {
 	struct ntpdlist *freelist;
 	inot_private_data_t *ntp;
 
 	ISP_GET_PC_ADDR(isp, chan, ntfree, freelist);
 	ntp = STAILQ_FIRST(freelist);
 	if (ntp == NULL)
 		return (NULL);
 	STAILQ_REMOVE_HEAD(freelist, next);
 	return (ntp);
 }
 
 /*
  * Scan channel `chan's notify pool (ATPDPSIZE entries) for the entry whose
  * tag_id and seq_id both match.  Returns NULL if there is none.
  */
 static inot_private_data_t *
 isp_find_ntpd(ispsoftc_t *isp, int chan, uint32_t tag_id, uint32_t seq_id)
 {
 	inot_private_data_t *cur, *end;
 
 	ISP_GET_PC(isp, chan, ntpool, cur);
 	ISP_GET_PC_ADDR(isp, chan, ntpool[ATPDPSIZE], end);
 	while (cur < end) {
 		if (cur->tag_id == tag_id && cur->seq_id == seq_id)
 			return (cur);
 		cur++;
 	}
 	return (NULL);
 }
 
 /*
  * Clear a notify private-data entry's tag_id and seq_id and put it back at
  * the head of channel `chan's free list.
  */
 static void
 isp_put_ntpd(ispsoftc_t *isp, int chan, inot_private_data_t *ntp)
 {
 	struct ntpdlist *freelist;
 
 	ntp->seq_id = 0;
 	ntp->tag_id = 0;
 	ISP_GET_PC_ADDR(isp, chan, ntfree, freelist);
 	STAILQ_INSERT_HEAD(freelist, ntp, next);
 }
 
 /*
  * Allocate a zeroed LUN state for the LUN named by `path', insert it into
  * channel `bus's LUN hash and return it through *rslt.
  *
  * Returns CAM_REQ_CMP on success, CAM_LUN_INVALID when a non-wildcard LUN
  * is at or above a positive ISP_MAX_LUNS(isp), or CAM_RESRC_UNAVAIL when
  * the non-sleeping allocation fails (*rslt is then left untouched).
  */
 static cam_status
 create_lun_state(ispsoftc_t *isp, int bus, struct cam_path *path, tstate_t **rslt)
 {
 	lun_id_t lun = xpt_path_lun_id(path);
 	struct tslist *bucket;
 	tstate_t *ts;
 
 	if (lun != CAM_LUN_WILDCARD && ISP_MAX_LUNS(isp) > 0 &&
 	    lun >= ISP_MAX_LUNS(isp))
 		return (CAM_LUN_INVALID);
 
 	ts = malloc(sizeof(*ts), M_DEVBUF, M_NOWAIT | M_ZERO);
 	if (ts == NULL)
 		return (CAM_RESRC_UNAVAIL);
 
 	ts->ts_lun = lun;
 	SLIST_INIT(&ts->atios);
 	SLIST_INIT(&ts->inots);
 	STAILQ_INIT(&ts->restart_queue);
 
 	ISP_GET_PC_ADDR(isp, bus, lun_hash[LUN_HASH_FUNC(lun)], bucket);
 	SLIST_INSERT_HEAD(bucket, ts, next);
 	*rslt = ts;
 	ISP_PATH_PRT(isp, ISP_LOGTDEBUG0, path, "created tstate\n");
 	return (CAM_REQ_CMP);
 }
 
 /*
  * Tear down a LUN state: complete every queued ATIO and INOT CCB with
  * CAM_REQ_ABORTED, answer each entry on the restart queue with
  * SCSI_STATUS_BUSY and return it to the free pool, then unlink the state
  * from channel `bus's LUN hash and free it.
  */
 static void
 destroy_lun_state(ispsoftc_t *isp, int bus, tstate_t *tptr)
 {
 	struct tslist *bucket;
 	inot_private_data_t *ntp;
 	union ccb *ccb;
 
 	while (!SLIST_EMPTY(&tptr->atios)) {
 		ccb = (union ccb *)SLIST_FIRST(&tptr->atios);
 		SLIST_REMOVE_HEAD(&tptr->atios, sim_links.sle);
 		ccb->ccb_h.status = CAM_REQ_ABORTED;
 		xpt_done(ccb);
 	}
 	while (!SLIST_EMPTY(&tptr->inots)) {
 		ccb = (union ccb *)SLIST_FIRST(&tptr->inots);
 		SLIST_REMOVE_HEAD(&tptr->inots, sim_links.sle);
 		ccb->ccb_h.status = CAM_REQ_ABORTED;
 		xpt_done(ccb);
 	}
 	while (!STAILQ_EMPTY(&tptr->restart_queue)) {
 		ntp = STAILQ_FIRST(&tptr->restart_queue);
 		/* Answer the command before the entry leaves the queue. */
 		isp_endcmd(isp, ntp->data, NIL_HANDLE, bus, SCSI_STATUS_BUSY, 0);
 		STAILQ_REMOVE_HEAD(&tptr->restart_queue, next);
 		isp_put_ntpd(isp, bus, ntp);
 	}
 
 	ISP_GET_PC_ADDR(isp, bus, lun_hash[LUN_HASH_FUNC(tptr->ts_lun)], bucket);
 	SLIST_REMOVE(bucket, tptr, tstate, next);
 	free(tptr, M_DEVBUF);
 }
 
 /*
  * Handle a request to enable a LUN for target mode.
  *
  * The CCB is always completed with xpt_done(); its status is one of:
  *   CAM_FUNC_NOTAVAIL  - adapter is not FC or lacks the target-mode or
  *                        SCCFW capability
  *   CAM_LUN_INVALID    - exactly one of target and lun is a wildcard
  *   CAM_LUN_ALRDY_ENA  - a state for this LUN already exists
  *   otherwise whatever create_lun_state() returns.
  */
 static void
 isp_enable_lun(ispsoftc_t *isp, union ccb *ccb)
 {
 	tstate_t *tptr;
 	target_id_t target;
 	lun_id_t lun;
 	int bus;
 
 	if (!(IS_FC(isp) && ISP_CAP_TMODE(isp) && ISP_CAP_SCCFW(isp))) {
 		xpt_print(ccb->ccb_h.path, "Target mode is not supported\n");
 		ccb->ccb_h.status = CAM_FUNC_NOTAVAIL;
 		xpt_done(ccb);
 		return;
 	}
 
 	bus = XS_CHANNEL(ccb);
 	target = ccb->ccb_h.target_id;
 	lun = ccb->ccb_h.target_lun;
 	ISP_PATH_PRT(isp, ISP_LOGTDEBUG0|ISP_LOGCONFIG, ccb->ccb_h.path,
 	    "enabling lun %jx\n", (uintmax_t)lun);
 
 	if ((target == CAM_TARGET_WILDCARD) != (lun == CAM_LUN_WILDCARD)) {
 		/* Target and lun must be both wildcard or both specific. */
 		ccb->ccb_h.status = CAM_LUN_INVALID;
 	} else if (get_lun_statep(isp, bus, lun) != NULL) {
 		/* The state must not already exist. */
 		ccb->ccb_h.status = CAM_LUN_ALRDY_ENA;
 	} else {
 		ccb->ccb_h.status = create_lun_state(isp, bus,
 		    ccb->ccb_h.path, &tptr);
 	}
 	xpt_done(ccb);
 }
 
 /*
  * Handle a request to disable a LUN for target mode.
  *
  * The CCB is always completed with xpt_done(); its status is
  * CAM_LUN_INVALID when exactly one of target and lun is a wildcard,
  * CAM_PATH_INVALID when no state exists for the LUN, and CAM_REQ_CMP once
  * the state has been destroyed.
  */
 static void
 isp_disable_lun(ispsoftc_t *isp, union ccb *ccb)
 {
 	tstate_t *tptr;
 	target_id_t target;
 	lun_id_t lun;
 	int bus;
 
 	bus = XS_CHANNEL(ccb);
 	target = ccb->ccb_h.target_id;
 	lun = ccb->ccb_h.target_lun;
 	ISP_PATH_PRT(isp, ISP_LOGTDEBUG0|ISP_LOGCONFIG, ccb->ccb_h.path,
 	    "disabling lun %jx\n", (uintmax_t)lun);
 
 	if ((target == CAM_TARGET_WILDCARD) != (lun == CAM_LUN_WILDCARD)) {
 		ccb->ccb_h.status = CAM_LUN_INVALID;
 	} else if ((tptr = get_lun_statep(isp, bus, lun)) == NULL) {
 		ccb->ccb_h.status = CAM_PATH_INVALID;
 	} else {
 		destroy_lun_state(isp, bus, tptr);
 		ccb->ccb_h.status = CAM_REQ_CMP;
 	}
 	xpt_done(ccb);
 }
 
 /*
  * Queue a CTIO CCB on the channel's wait queue and then push as many
  * waiting CTIOs to the firmware as resources allow.
  *
  * 'how' selects the queue position of the incoming CCB: FROM_CAM appends
  * it at the tail, while FROM_TIMER, FROM_SRR and FROM_CTIO_DONE put it
  * back at the head so that it is retried first.
  *
  * For each CCB taken off the queue a CTIO7 (24XX) or CTIO2 request is
  * built in a local buffer, a PCMD and a handle are allocated, and the
  * request is handed to ISP_DMASETUP.  CCBs that fail validation are
  * completed with an error status via xpt_done().  When a resource
  * (ecmd, PCMD, handle, or CMD_EAGAIN from DMA setup) is unavailable the
  * CCB is re-inserted at the head of the queue and processing stops.
  */
 static void
 isp_target_start_ctio(ispsoftc_t *isp, union ccb *ccb, enum Start_Ctio_How how)
 {
 	int fctape, sendstatus, resid;
 	fcparam *fcp;
 	atio_private_data_t *atp;
 	struct ccb_scsiio *cso;
 	struct isp_ccbq *waitq;
 	uint32_t dmaresult, handle, xfrlen, sense_length, tmp;
 	uint8_t local[QENTRY_LEN];
 
 	isp_prt(isp, ISP_LOGTDEBUG0, "%s: ENTRY[0x%x] how %u xfrlen %u sendstatus %d sense_len %u", __func__, ccb->csio.tag_id, how, ccb->csio.dxfer_len,
 	    (ccb->ccb_h.flags & CAM_SEND_STATUS) != 0, ((ccb->ccb_h.flags & CAM_SEND_SENSE)? ccb->csio.sense_len : 0));
 
 	ISP_GET_PC_ADDR(isp, XS_CHANNEL(ccb), waitq, waitq);
 	switch (how) {
 	case FROM_CAM:
 		/*
 		 * Insert at the tail of the list, if any, waiting CTIO CCBs
 		 */
 		TAILQ_INSERT_TAIL(waitq, &ccb->ccb_h, sim_links.tqe);
 		break;
 	case FROM_TIMER:
 	case FROM_SRR:
 	case FROM_CTIO_DONE:
 		/* Retries go back to the front of the queue. */
 		TAILQ_INSERT_HEAD(waitq, &ccb->ccb_h, sim_links.tqe);
 		break;
 	}
 
 	while ((ccb = (union ccb *) TAILQ_FIRST(waitq)) != NULL) {
 		TAILQ_REMOVE(waitq, &ccb->ccb_h, sim_links.tqe);
 
 		cso = &ccb->csio;
 		xfrlen = cso->dxfer_len;
 		if (xfrlen == 0) {
 			if ((ccb->ccb_h.flags & CAM_SEND_STATUS) == 0) {
 				ISP_PATH_PRT(isp, ISP_LOGERR, ccb->ccb_h.path, "a data transfer length of zero but no status to send is wrong\n");
 				ccb->ccb_h.status = CAM_REQ_INVALID;
 				xpt_done(ccb);
 				continue;
 			}
 		}
 
 		atp = isp_find_atpd(isp, XS_CHANNEL(ccb), cso->tag_id);
 		if (atp == NULL) {
 			isp_prt(isp, ISP_LOGERR, "%s: [0x%x] cannot find private data adjunct in %s", __func__, cso->tag_id, __func__);
 			isp_dump_atpd(isp, XS_CHANNEL(ccb));
 			ccb->ccb_h.status = CAM_REQ_CMP_ERR;
 			xpt_done(ccb);
 			continue;
 		}
 
 		/*
 		 * Is this command a dead duck?
 		 */
 		if (atp->dead) {
 			isp_prt(isp, ISP_LOGERR, "%s: [0x%x] not sending a CTIO for a dead command", __func__, cso->tag_id);
 			ccb->ccb_h.status = CAM_REQ_ABORTED;
 			xpt_done(ccb);
 			continue;
 		}
 
 		/*
 		 * Check to make sure we're still in target mode.
 		 */
 		fcp = FCPARAM(isp, XS_CHANNEL(ccb));
 		if ((fcp->role & ISP_ROLE_TARGET) == 0) {
 			isp_prt(isp, ISP_LOGERR, "%s: [0x%x] stopping sending a CTIO because we're no longer in target mode", __func__, cso->tag_id);
 			ccb->ccb_h.status = CAM_PROVIDE_FAIL;
 			xpt_done(ccb);
 			continue;
 		}
 
 		/*
 		 * We're only handling ATPD_CCB_OUTSTANDING outstanding CCB at a time (one of which
 		 * could be split into two CTIOs to split data and status).
 		 */
 		if (atp->ctcnt >= ATPD_CCB_OUTSTANDING) {
 			isp_prt(isp, ISP_LOGTINFO, "[0x%x] handling only %d CCBs at a time (flags for this ccb: 0x%x)", cso->tag_id, ATPD_CCB_OUTSTANDING, ccb->ccb_h.flags);
 			TAILQ_INSERT_HEAD(waitq, &ccb->ccb_h, sim_links.tqe);
 			break;
 		}
 
 		/*
 		 * Does the initiator expect FC-Tape style responses?
 		 */
 		if ((atp->word3 & PRLI_WD3_RETRY) && fcp->fctape_enabled) {
 			fctape = 1;
 		} else {
 			fctape = 0;
 		}
 
 		/*
 		 * If we already did the data xfer portion of a CTIO that sends data
 		 * and status, don't do it again and do the status portion now.
 		 *
 		 * NOTE(review): sendst is cleared here, before any resource is
 		 * acquired.  If this CCB is requeued further down the marker is
 		 * gone on the retry - confirm that this is intended.
 		 */
 		if (atp->sendst) {
 			isp_prt(isp, ISP_LOGTDEBUG0, "[0x%x] now sending synthesized status orig_dl=%u xfered=%u bit=%u",
 			    cso->tag_id, atp->orig_datalen, atp->bytes_xfered, atp->bytes_in_transit);
 			xfrlen = 0;	/* we already did the data transfer */
 			atp->sendst = 0;
 		}
 		if (ccb->ccb_h.flags & CAM_SEND_STATUS) {
 			sendstatus = 1;
 		} else {
 			sendstatus = 0;
 		}
 
 		if (ccb->ccb_h.flags & CAM_SEND_SENSE) {
 			KASSERT((sendstatus != 0), ("how can you have CAM_SEND_SENSE w/o CAM_SEND_STATUS?"));
 			/*
 			 * Sense length is not the entire sense data structure size. Periph
 			 * drivers don't seem to be setting sense_len to reflect the actual
 			 * size. We'll peek inside to get the right amount.
 			 */
 			sense_length = cso->sense_len;
 
 			/*
 			 * This 'cannot' happen
 			 */
 			if (sense_length > (XCMD_SIZE - MIN_FCP_RESPONSE_SIZE)) {
 				sense_length = XCMD_SIZE - MIN_FCP_RESPONSE_SIZE;
 			}
 		} else {
 			sense_length = 0;
 		}
 
 		memset(local, 0, QENTRY_LEN);
 
 		/*
 		 * Check for overflow
 		 */
 		tmp = atp->bytes_xfered + atp->bytes_in_transit;
 		if (xfrlen > 0 && tmp > atp->orig_datalen) {
 			isp_prt(isp, ISP_LOGERR,
 			    "%s: [0x%x] data overflow by %u bytes", __func__,
 			    cso->tag_id, tmp + xfrlen - atp->orig_datalen);
 			ccb->ccb_h.status = CAM_DATA_RUN_ERR;
 			xpt_done(ccb);
 			continue;
 		}
 		/* Clamp the transfer to what is left of the original data length. */
 		if (xfrlen > atp->orig_datalen - tmp) {
 			xfrlen = atp->orig_datalen - tmp;
 			if (xfrlen == 0 && !sendstatus) {
 				cso->resid = cso->dxfer_len;
 				ccb->ccb_h.status = CAM_REQ_CMP;
 				xpt_done(ccb);
 				continue;
 			}
 		}
 
 		if (IS_24XX(isp)) {
 			ct7_entry_t *cto = (ct7_entry_t *) local;
 
 			cto->ct_header.rqs_entry_type = RQSTYPE_CTIO7;
 			cto->ct_header.rqs_entry_count = 1;
 			cto->ct_header.rqs_seqno |= ATPD_SEQ_NOTIFY_CAM;
 			ATPD_SET_SEQNO(cto, atp);
 			cto->ct_nphdl = atp->nphdl;
 			cto->ct_rxid = atp->tag;
 			cto->ct_iid_lo = atp->sid;
 			cto->ct_iid_hi = atp->sid >> 16;
 			cto->ct_oxid = atp->oxid;
 			cto->ct_vpidx = ISP_GET_VPIDX(isp, XS_CHANNEL(ccb));
 			cto->ct_timeout = XS_TIME(ccb);
 			cto->ct_flags = atp->tattr << CT7_TASK_ATTR_SHIFT;
 
 			/*
 			 * Mode 1, status, no data. Only possible when we are sending status, have
 			 * no data to transfer, and any sense data can fit into a ct7_entry_t.
 			 *
 			 * Mode 2, status, no data. We have to use this in the case that
 			 * the sense data won't fit into a ct7_entry_t.
 			 *
 			 */
 			if (sendstatus && xfrlen == 0) {
 				cto->ct_flags |= CT7_SENDSTATUS | CT7_NO_DATA;
 				resid = atp->orig_datalen - atp->bytes_xfered - atp->bytes_in_transit;
 				if (sense_length <= MAXRESPLEN_24XX) {
 					cto->ct_flags |= CT7_FLAG_MODE1;
 					cto->ct_scsi_status = cso->scsi_status;
 					if (resid < 0) {
 						cto->ct_resid = -resid;
 						cto->ct_scsi_status |= (FCP_RESID_OVERFLOW << 8);
 					} else if (resid > 0) {
 						cto->ct_resid = resid;
 						cto->ct_scsi_status |= (FCP_RESID_UNDERFLOW << 8);
 					}
 					if (fctape) {
 						cto->ct_flags |= CT7_CONFIRM|CT7_EXPLCT_CONF;
 					}
 					if (sense_length) {
 						cto->ct_scsi_status |= (FCP_SNSLEN_VALID << 8);
 						cto->rsp.m1.ct_resplen = cto->ct_senselen = sense_length;
 						memcpy(cto->rsp.m1.ct_resp, &cso->sense_data, sense_length);
 					}
 				} else {
 					bus_addr_t addr;
 					char buf[XCMD_SIZE];
 					fcp_rsp_iu_t *rp;
 
 					/* Mode 2 needs an external command buffer for the FCP RSP IU. */
 					if (atp->ests == NULL) {
 						atp->ests = isp_get_ecmd(isp);
 						if (atp->ests == NULL) {
 							TAILQ_INSERT_HEAD(waitq, &ccb->ccb_h, sim_links.tqe);
 							break;
 						}
 					}
 					memset(buf, 0, sizeof (buf));
 					rp = (fcp_rsp_iu_t *)buf;
 					if (fctape) {
 						cto->ct_flags |= CT7_CONFIRM|CT7_EXPLCT_CONF;
 						rp->fcp_rsp_bits |= FCP_CONF_REQ;
 					}
 					cto->ct_flags |= CT7_FLAG_MODE2;
 					rp->fcp_rsp_scsi_status = cso->scsi_status;
 					if (resid < 0) {
 						rp->fcp_rsp_resid = -resid;
 						rp->fcp_rsp_bits |= FCP_RESID_OVERFLOW;
 					} else if (resid > 0) {
 						rp->fcp_rsp_resid = resid;
 						rp->fcp_rsp_bits |= FCP_RESID_UNDERFLOW;
 					}
 					if (sense_length) {
 						rp->fcp_rsp_snslen = sense_length;
 						cto->ct_senselen = sense_length;
 						rp->fcp_rsp_bits |= FCP_SNSLEN_VALID;
 						isp_put_fcp_rsp_iu(isp, rp, atp->ests);
 						memcpy(((fcp_rsp_iu_t *)atp->ests)->fcp_rsp_extra, &cso->sense_data, sense_length);
 					} else {
 						isp_put_fcp_rsp_iu(isp, rp, atp->ests);
 					}
 					if (isp->isp_dblev & ISP_LOGTDEBUG1) {
 						isp_print_bytes(isp, "FCP Response Frame After Swizzling", MIN_FCP_RESPONSE_SIZE + sense_length, atp->ests);
 					}
 					/* Bus address of this ecmd slot: DMA base plus slot index times XCMD_SIZE. */
 					addr = isp->isp_osinfo.ecmd_dma;
 					addr += ((((isp_ecmd_t *)atp->ests) - isp->isp_osinfo.ecmd_base) * XCMD_SIZE);
 					isp_prt(isp, ISP_LOGTDEBUG0, "%s: ests base %p vaddr %p ecmd_dma %jx addr %jx len %u", __func__, isp->isp_osinfo.ecmd_base, atp->ests,
 					    (uintmax_t) isp->isp_osinfo.ecmd_dma, (uintmax_t)addr, MIN_FCP_RESPONSE_SIZE + sense_length);
 					cto->rsp.m2.ct_datalen = MIN_FCP_RESPONSE_SIZE + sense_length;
 					cto->rsp.m2.ct_fcp_rsp_iudata.ds_base = DMA_LO32(addr);
 					cto->rsp.m2.ct_fcp_rsp_iudata.ds_basehi = DMA_HI32(addr);
 					cto->rsp.m2.ct_fcp_rsp_iudata.ds_count = MIN_FCP_RESPONSE_SIZE + sense_length;
 				}
 				if (sense_length) {
 					isp_prt(isp, ISP_LOGTDEBUG0, "%s: CTIO7[0x%x] seq %u nc %d CDB0=%x sstatus=0x%x flags=0x%x resid=%d slen %u sense: %x %x/%x/%x", __func__,
 					    cto->ct_rxid, ATPD_GET_SEQNO(cto), ATPD_GET_NCAM(cto), atp->cdb0, cto->ct_scsi_status, cto->ct_flags, cto->ct_resid, sense_length,
 					    cso->sense_data.error_code, cso->sense_data.sense_buf[1], cso->sense_data.sense_buf[11], cso->sense_data.sense_buf[12]);
 				} else {
 					/* Same target-mode debug level as every other CTIO trace in this function. */
 					isp_prt(isp, ISP_LOGTDEBUG0, "%s: CTIO7[0x%x] seq %u nc %d CDB0=%x sstatus=0x%x flags=0x%x resid=%d", __func__,
 					    cto->ct_rxid, ATPD_GET_SEQNO(cto), ATPD_GET_NCAM(cto), atp->cdb0, cto->ct_scsi_status, cto->ct_flags, cto->ct_resid);
 				}
 				atp->state = ATPD_STATE_LAST_CTIO;
 			}
 
 			/*
 			 * Mode 0 data transfers, *possibly* with status.
 			 */
 			if (xfrlen != 0) {
 				cto->ct_flags |= CT7_FLAG_MODE0;
 				if ((cso->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_IN) {
 					cto->ct_flags |= CT7_DATA_IN;
 				} else {
 					cto->ct_flags |= CT7_DATA_OUT;
 				}
 
 				cto->rsp.m0.reloff = atp->bytes_xfered + atp->bytes_in_transit;
 				cto->rsp.m0.ct_xfrlen = xfrlen;
 
 #ifdef	DEBUG
 				if (ISP_FC_PC(isp, XS_CHANNEL(ccb))->inject_lost_data_frame && xfrlen > ISP_FC_PC(isp, XS_CHANNEL(ccb))->inject_lost_data_frame) {
 					isp_prt(isp, ISP_LOGWARN, "%s: truncating data frame with xfrlen %d to %d", __func__, xfrlen, xfrlen - (xfrlen >> 2));
 					ISP_FC_PC(isp, XS_CHANNEL(ccb))->inject_lost_data_frame = 0;
 					cto->rsp.m0.ct_xfrlen -= xfrlen >> 2;
 				}
 #endif
 				if (sendstatus) {
 					resid = atp->orig_datalen - atp->bytes_xfered - xfrlen;
 					if (cso->scsi_status == SCSI_STATUS_OK && resid == 0 /* && fctape == 0 */) {
 						cto->ct_flags |= CT7_SENDSTATUS;
 						atp->state = ATPD_STATE_LAST_CTIO;
 						if (fctape) {
 							cto->ct_flags |= CT7_CONFIRM|CT7_EXPLCT_CONF;
 						}
 					} else {
 						atp->sendst = 1;	/* send status later */
 						cto->ct_header.rqs_seqno &= ~ATPD_SEQ_NOTIFY_CAM;
 						atp->state = ATPD_STATE_CTIO;
 					}
 				} else {
 					atp->state = ATPD_STATE_CTIO;
 				}
 				isp_prt(isp, ISP_LOGTDEBUG0, "%s: CTIO7[0x%x] seq %u nc %d CDB0=%x sstatus=0x%x flags=0x%x xfrlen=%u off=%u", __func__,
 				    cto->ct_rxid, ATPD_GET_SEQNO(cto), ATPD_GET_NCAM(cto), atp->cdb0, cto->ct_scsi_status, cto->ct_flags, xfrlen, atp->bytes_xfered);
 			}
 		} else {
 			ct2_entry_t *cto = (ct2_entry_t *) local;
 
 			cto->ct_header.rqs_entry_type = RQSTYPE_CTIO2;
 			cto->ct_header.rqs_entry_count = 1;
 			cto->ct_header.rqs_seqno |= ATPD_SEQ_NOTIFY_CAM;
 			ATPD_SET_SEQNO(cto, atp);
 			if (ISP_CAP_2KLOGIN(isp)) {
 				((ct2e_entry_t *)cto)->ct_iid = atp->nphdl;
 			} else {
 				cto->ct_iid = atp->nphdl;
 				if (ISP_CAP_SCCFW(isp) == 0) {
 					cto->ct_lun = ccb->ccb_h.target_lun;
 				}
 			}
 			cto->ct_timeout = XS_TIME(ccb);
 			cto->ct_rxid = cso->tag_id;
 
 			/*
 			 * Mode 1, status, no data. Only possible when we are sending status, have
 			 * no data to transfer, and the sense length can fit in the ct2_entry_t.
 			 *
 			 * Mode 2, status, no data. We have to use this in the case the response
 			 * length won't fit into a ct2_entry_t.
 			 *
 			 * We'll fill out this structure with information as if this were a
 			 * Mode 1. The hardware layer will create the Mode 2 FCP RSP IU as
 			 * needed based upon this.
 			 */
 			if (sendstatus && xfrlen == 0) {
 				cto->ct_flags |= CT2_SENDSTATUS | CT2_NO_DATA;
 				resid = atp->orig_datalen - atp->bytes_xfered - atp->bytes_in_transit;
 				if (sense_length <= MAXRESPLEN) {
 					if (resid < 0) {
 						cto->ct_resid = -resid;
 					} else if (resid > 0) {
 						cto->ct_resid = resid;
 					}
 					cto->ct_flags |= CT2_FLAG_MODE1;
 					cto->rsp.m1.ct_scsi_status = cso->scsi_status;
 					if (resid < 0) {
 						cto->rsp.m1.ct_scsi_status |= CT2_DATA_OVER;
 					} else if (resid > 0) {
 						cto->rsp.m1.ct_scsi_status |= CT2_DATA_UNDER;
 					}
 					if (fctape) {
 						cto->ct_flags |= CT2_CONFIRM;
 					}
 					if (sense_length) {
 						cto->rsp.m1.ct_scsi_status |= CT2_SNSLEN_VALID;
 						cto->rsp.m1.ct_resplen = cto->rsp.m1.ct_senselen = sense_length;
 						memcpy(cto->rsp.m1.ct_resp, &cso->sense_data, sense_length);
 					}
 				} else {
 					bus_addr_t addr;
 					char buf[XCMD_SIZE];
 					fcp_rsp_iu_t *rp;
 
 					/* Mode 2 needs an external command buffer for the FCP RSP IU. */
 					if (atp->ests == NULL) {
 						atp->ests = isp_get_ecmd(isp);
 						if (atp->ests == NULL) {
 							TAILQ_INSERT_HEAD(waitq, &ccb->ccb_h, sim_links.tqe);
 							break;
 						}
 					}
 					memset(buf, 0, sizeof (buf));
 					rp = (fcp_rsp_iu_t *)buf;
 					if (fctape) {
 						cto->ct_flags |= CT2_CONFIRM;
 						rp->fcp_rsp_bits |= FCP_CONF_REQ;
 					}
 					cto->ct_flags |= CT2_FLAG_MODE2;
 					rp->fcp_rsp_scsi_status = cso->scsi_status;
 					if (resid < 0) {
 						rp->fcp_rsp_resid = -resid;
 						rp->fcp_rsp_bits |= FCP_RESID_OVERFLOW;
 					} else if (resid > 0) {
 						rp->fcp_rsp_resid = resid;
 						rp->fcp_rsp_bits |= FCP_RESID_UNDERFLOW;
 					}
 					if (sense_length) {
 						rp->fcp_rsp_snslen = sense_length;
 						rp->fcp_rsp_bits |= FCP_SNSLEN_VALID;
 						isp_put_fcp_rsp_iu(isp, rp, atp->ests);
 						memcpy(((fcp_rsp_iu_t *)atp->ests)->fcp_rsp_extra, &cso->sense_data, sense_length);
 					} else {
 						isp_put_fcp_rsp_iu(isp, rp, atp->ests);
 					}
 					if (isp->isp_dblev & ISP_LOGTDEBUG1) {
 						isp_print_bytes(isp, "FCP Response Frame After Swizzling", MIN_FCP_RESPONSE_SIZE + sense_length, atp->ests);
 					}
 					/* Bus address of this ecmd slot: DMA base plus slot index times XCMD_SIZE. */
 					addr = isp->isp_osinfo.ecmd_dma;
 					addr += ((((isp_ecmd_t *)atp->ests) - isp->isp_osinfo.ecmd_base) * XCMD_SIZE);
 					isp_prt(isp, ISP_LOGTDEBUG0, "%s: ests base %p vaddr %p ecmd_dma %jx addr %jx len %u", __func__, isp->isp_osinfo.ecmd_base, atp->ests,
 					    (uintmax_t) isp->isp_osinfo.ecmd_dma, (uintmax_t)addr, MIN_FCP_RESPONSE_SIZE + sense_length);
 					cto->rsp.m2.ct_datalen = MIN_FCP_RESPONSE_SIZE + sense_length;
 					cto->rsp.m2.u.ct_fcp_rsp_iudata_32.ds_base = DMA_LO32(addr);
 					cto->rsp.m2.u.ct_fcp_rsp_iudata_32.ds_count = MIN_FCP_RESPONSE_SIZE + sense_length;
 				}
 				if (sense_length) {
 					isp_prt(isp, ISP_LOGTDEBUG0, "%s: CTIO2[0x%x] seq %u nc %d CDB0=%x sstatus=0x%x flags=0x%x resid=%d sense: %x %x/%x/%x", __func__,
 					    cto->ct_rxid, ATPD_GET_SEQNO(cto), ATPD_GET_NCAM(cto), atp->cdb0, cso->scsi_status, cto->ct_flags, cto->ct_resid,
 					    cso->sense_data.error_code, cso->sense_data.sense_buf[1], cso->sense_data.sense_buf[11], cso->sense_data.sense_buf[12]);
 				} else {
 					isp_prt(isp, ISP_LOGTDEBUG0, "%s: CTIO2[0x%x] seq %u nc %d CDB0=%x sstatus=0x%x flags=0x%x resid=%d", __func__, cto->ct_rxid,
 					    ATPD_GET_SEQNO(cto), ATPD_GET_NCAM(cto), atp->cdb0, cso->scsi_status, cto->ct_flags, cto->ct_resid);
 				}
 				atp->state = ATPD_STATE_LAST_CTIO;
 			}
 
 			/*
 			 * Mode 0 data transfers, *possibly* with status.
 			 */
 			if (xfrlen != 0) {
 				cto->ct_flags |= CT2_FLAG_MODE0;
 				if ((cso->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_IN) {
 					cto->ct_flags |= CT2_DATA_IN;
 				} else {
 					cto->ct_flags |= CT2_DATA_OUT;
 				}
 
 				cto->ct_reloff = atp->bytes_xfered + atp->bytes_in_transit;
 				cto->rsp.m0.ct_xfrlen = xfrlen;
 
 				if (sendstatus) {
 					resid = atp->orig_datalen - atp->bytes_xfered - xfrlen;
 					if (cso->scsi_status == SCSI_STATUS_OK && resid == 0 /*&& fctape == 0*/) {
 						cto->ct_flags |= CT2_SENDSTATUS;
 						atp->state = ATPD_STATE_LAST_CTIO;
 						if (fctape) {
 							cto->ct_flags |= CT2_CONFIRM;
 						}
 					} else {
 						atp->sendst = 1;	/* send status later */
 						cto->ct_header.rqs_seqno &= ~ATPD_SEQ_NOTIFY_CAM;
 						atp->state = ATPD_STATE_CTIO;
 					}
 				} else {
 					atp->state = ATPD_STATE_CTIO;
 				}
 			}
 			isp_prt(isp, ISP_LOGTDEBUG0, "%s: CTIO2[%x] seq %u nc %d CDB0=%x scsi status %x flags %x resid %d xfrlen %u offset %u", __func__, cto->ct_rxid,
 			    ATPD_GET_SEQNO(cto), ATPD_GET_NCAM(cto), atp->cdb0, cso->scsi_status, cto->ct_flags, cto->ct_resid, cso->dxfer_len, atp->bytes_xfered);
 		}
 
 		if (isp_get_pcmd(isp, ccb)) {
 			ISP_PATH_PRT(isp, ISP_LOGWARN, ccb->ccb_h.path, "out of PCMDs\n");
 			TAILQ_INSERT_HEAD(waitq, &ccb->ccb_h, sim_links.tqe);
 			break;
 		}
 		handle = isp_allocate_handle(isp, ccb, ISP_HANDLE_TARGET);
 		if (handle == 0) {
 			ISP_PATH_PRT(isp, ISP_LOGWARN, ccb->ccb_h.path, "No XFLIST pointers for %s\n", __func__);
 			TAILQ_INSERT_HEAD(waitq, &ccb->ccb_h, sim_links.tqe);
 			isp_free_pcmd(isp, ccb);
 			break;
 		}
 		atp->bytes_in_transit += xfrlen;
 		PISP_PCMD(ccb)->datalen = xfrlen;
 
 
 		/*
 		 * Call the dma setup routines for this entry (and any subsequent
 		 * CTIOs) if there's data to move, and then tell the f/w it's got
 		 * new things to play with. As with isp_start's usage of DMA setup,
 		 * any swizzling is done in the machine dependent layer. Because
 		 * of this, we put the request onto the queue area first in native
 		 * format.
 		 */
 
 		if (IS_24XX(isp)) {
 			ct7_entry_t *cto = (ct7_entry_t *) local;
 			cto->ct_syshandle = handle;
 		} else {
 			ct2_entry_t *cto = (ct2_entry_t *) local;
 			cto->ct_syshandle = handle;
 		}
 
 		dmaresult = ISP_DMASETUP(isp, cso, (ispreq_t *) local);
 		if (dmaresult != CMD_QUEUED) {
 			/*
 			 * The CTIO never reached the firmware, so no completion
 			 * will arrive to give these bytes back; undo the
 			 * in-transit accounting done above before retrying or
 			 * failing the CCB.
 			 */
 			atp->bytes_in_transit -= xfrlen;
 			isp_destroy_handle(isp, handle);
 			isp_free_pcmd(isp, ccb);
 			if (dmaresult == CMD_EAGAIN) {
 				TAILQ_INSERT_HEAD(waitq, &ccb->ccb_h, sim_links.tqe);
 				break;
 			}
 			ccb->ccb_h.status = CAM_REQ_CMP_ERR;
 			xpt_done(ccb);
 			continue;
 		}
 		ccb->ccb_h.status = CAM_REQ_INPROG | CAM_SIM_QUEUED;
 		/* Record the starting data offset of this CCB; ~0 marks "no data moved". */
 		if (xfrlen) {
 			ccb->ccb_h.spriv_field0 = atp->bytes_xfered;
 		} else {
 			ccb->ccb_h.spriv_field0 = ~0;
 		}
 		atp->ctcnt++;
 		atp->seqno++;
 	}
 }
 
 /*
  * Callout handler armed by isp_target_putback_atio(): retry the ATIO
  * putback for the CCB passed as the callout argument.
  */
 static void
 isp_refire_putback_atio(void *arg)
 {
 	union ccb *ccb;
 
 	ccb = (union ccb *)arg;
 	ISP_ASSERT_LOCKED((ispsoftc_t *)XS_ISP(ccb));
 	isp_target_putback_atio(ccb);
 }
 
 /*
  * Callout handler that retries a notify acknowledgement.  'arg' is an
  * isp_tna_t; once isp_notify_ack() returns zero the isp_tna_t is freed,
  * otherwise the callout is scheduled again.
  */
 static void
 isp_refire_notify_ack(void *arg)
 {
 	isp_tna_t *tna;
 	ispsoftc_t *isp;
 
 	tna = arg;
 	isp = tna->isp;
 	ISP_ASSERT_LOCKED(isp);
 	if (isp_notify_ack(isp, tna->not) == 0) {
 		/* Ack went out; nothing left to retry. */
 		free(tna, M_DEVBUF);
 		return;
 	}
 	callout_schedule(&tna->timer, 5);
 }
 
 
 /*
  * Build an ATIO2 entry from the CCB (lun, tag id, initiator id) and hand
  * it to isp_target_put_entry().  If that call reports failure, a callout
  * on the CCB's PCMD watchdog retries via isp_refire_putback_atio();
  * otherwise the CTIO is completed.
  */
 static void
 isp_target_putback_atio(union ccb *ccb)
 {
 	ispsoftc_t *isp;
 	at2_entry_t at;
 
 	isp = XS_ISP(ccb);
 	ISP_MEMZERO(&at, sizeof (at2_entry_t));
 	at.at_header.rqs_entry_type = RQSTYPE_ATIO2;
 	at.at_header.rqs_entry_count = 1;
 	/* The lun field used depends on the firmware flavour. */
 	if (ISP_CAP_SCCFW(isp)) {
 		at.at_scclun = (uint16_t) ccb->ccb_h.target_lun;
 	} else {
 		at.at_lun = (uint8_t) ccb->ccb_h.target_lun;
 	}
 	at.at_status = CT_OK;
 	at.at_rxid = ccb->csio.tag_id;
 	at.at_iid = ccb->csio.init_id;
 
 	if (isp_target_put_entry(isp, &at) == 0) {
 		isp_complete_ctio(ccb);
 		return;
 	}
 	/* Could not queue the entry right now; try again shortly. */
 	callout_reset(&PISP_PCMD(ccb)->wdog, 10,
 	    isp_refire_putback_atio, ccb);
 }
 
 /*
  * Complete a CTIO CCB back to CAM unless its status is still
  * CAM_REQ_INPROG.  CAM_SIM_QUEUED is cleared before xpt_done().
  */
 static void
 isp_complete_ctio(union ccb *ccb)
 {
 	/* Still in progress: leave it alone. */
 	if ((ccb->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_INPROG)
 		return;
 
 	ccb->ccb_h.status &= ~CAM_SIM_QUEUED;
 	xpt_done(ccb);
 }
 
 /*
  * Handle an incoming ATIO2 (always on channel 0): pick the lun state,
  * take a free ATIO CCB and a private data adjunct, fill both in from
  * the firmware entry and hand the ATIO to CAM with xpt_done().
  *
  * When no ATIO or adjunct is available (or earlier queued commands are
  * still pending) the entry is copied onto the lun's restart queue; if
  * even that cannot be done the command is ended with BUSY status.
  */
 static void
 isp_handle_platform_atio2(ispsoftc_t *isp, at2_entry_t *aep)
 {
 	fcparam *fcp;
 	lun_id_t lun;
 	fcportdb_t *lp;
 	tstate_t *tptr;
 	struct ccb_accept_tio *atiop;
 	uint16_t nphdl;
 	atio_private_data_t *atp;
 	inot_private_data_t *ntp;
 
 	/*
 	 * The firmware status (except for the QLTM_SVALID bit)
 	 * indicates why this ATIO was sent to us.
 	 *
 	 * If QLTM_SVALID is set, the firmware has recommended Sense Data.
 	 */
 	if ((aep->at_status & ~QLTM_SVALID) != AT_CDB) {
 		isp_prt(isp, ISP_LOGWARN, "bogus atio (0x%x) leaked to platform", aep->at_status);
 		isp_endcmd(isp, aep, NIL_HANDLE, 0, SCSI_STATUS_BUSY, 0);
 		return;
 	}
 
 	fcp = FCPARAM(isp, 0);
 	if (ISP_CAP_SCCFW(isp)) {
 		lun = aep->at_scclun;
 	} else {
 		lun = aep->at_lun;
 	}
 	if (ISP_CAP_2KLOGIN(isp)) {
 		nphdl = ((at2e_entry_t *)aep)->at_iid;
 	} else {
 		nphdl = aep->at_iid;
 	}
 	/* Prefer the state for this exact lun, fall back to the wildcard one. */
 	tptr = get_lun_statep(isp, 0, lun);
 	if (tptr == NULL) {
 		tptr = get_lun_statep(isp, 0, CAM_LUN_WILDCARD);
 		if (tptr == NULL) {
 			isp_prt(isp, ISP_LOGWARN, "%s: [0x%x] no state pointer for lun %jx or wildcard", __func__, aep->at_rxid, (uintmax_t)lun);
 			if (lun == 0) {
 				isp_endcmd(isp, aep, nphdl, 0, SCSI_STATUS_BUSY, 0);
 			} else {
 				isp_endcmd(isp, aep, nphdl, 0, SCSI_STATUS_CHECK_COND | ECMD_SVALID | (0x5 << 12) | (0x25 << 16), 0);
 			}
 			return;
 		}
 	}
 
 	/*
 	 * Start any commands pending resources first.
 	 */
 	if (isp_atio_restart(isp, 0, tptr))
 		goto noresrc;
 
 	atiop = (struct ccb_accept_tio *) SLIST_FIRST(&tptr->atios);
 	if (atiop == NULL) {
 		goto noresrc;
 	}
 
 	/*
 	 * Resolve the initiator before taking the ATIO off the free list
 	 * or allocating an adjunct, so that terminating the exchange for
 	 * an unknown port does not leak either of them.
 	 */
 	lp = NULL;
 	if (!IS_2100(isp) && isp_find_pdb_by_handle(isp, 0, nphdl, &lp) == 0) {
 		isp_prt(isp, ISP_LOGTINFO, "%s: port %x isn't in PDB",
 		    __func__, nphdl);
 		isp_dump_portdb(isp, 0);
 		isp_endcmd(isp, aep, NIL_HANDLE, 0, ECMD_TERMINATE, 0);
 		return;
 	}
 
 	atp = isp_get_atpd(isp, 0, aep->at_rxid);
 	if (atp == NULL) {
 		goto noresrc;
 	}
 
 	atp->state = ATPD_STATE_ATIO;
 	SLIST_REMOVE_HEAD(&tptr->atios, sim_links.sle);
 	ISP_PATH_PRT(isp, ISP_LOGTDEBUG2, atiop->ccb_h.path, "Take FREE ATIO\n");
 	atiop->ccb_h.target_id = ISP_MAX_TARGETS(isp);
 	atiop->ccb_h.target_lun = lun;
 
 	/*
 	 * We don't get 'suggested' sense data as we do with SCSI cards.
 	 */
 	atiop->sense_len = 0;
 
 	/*
 	 * On a 2100 the handle itself is used as the initiator id; otherwise
 	 * use the target id of the port database entry found above.
 	 */
 	if (IS_2100(isp))
 		atiop->init_id = nphdl;
 	else
 		atiop->init_id = FC_PORTDB_TGT(isp, 0, lp);
 	atiop->cdb_len = ATIO2_CDBLEN;
 	ISP_MEMCPY(atiop->cdb_io.cdb_bytes, aep->at_cdb, ATIO2_CDBLEN);
 	atiop->ccb_h.status = CAM_CDB_RECVD;
 	atiop->tag_id = atp->tag;
 	switch (aep->at_taskflags & ATIO2_TC_ATTR_MASK) {
 	case ATIO2_TC_ATTR_SIMPLEQ:
 		atiop->ccb_h.flags |= CAM_TAG_ACTION_VALID;
 		atiop->tag_action = MSG_SIMPLE_Q_TAG;
 		break;
 	case ATIO2_TC_ATTR_HEADOFQ:
 		atiop->ccb_h.flags |= CAM_TAG_ACTION_VALID;
 		atiop->tag_action = MSG_HEAD_OF_Q_TAG;
 		break;
 	case ATIO2_TC_ATTR_ORDERED:
 		atiop->ccb_h.flags |= CAM_TAG_ACTION_VALID;
 		atiop->tag_action = MSG_ORDERED_Q_TAG;
 		break;
 	case ATIO2_TC_ATTR_ACAQ:		/* ?? */
 	case ATIO2_TC_ATTR_UNTAGGED:
 	default:
 		atiop->tag_action = 0;
 		break;
 	}
 
 	atp->orig_datalen = aep->at_datalen;
 	atp->bytes_xfered = 0;
 	atp->lun = lun;
 	atp->nphdl = nphdl;
 	atp->sid = PORT_ANY;
 	atp->oxid = aep->at_oxid;
 	atp->cdb0 = aep->at_cdb[0];
 	atp->tattr = aep->at_taskflags & ATIO2_TC_ATTR_MASK;
 	atp->state = ATPD_STATE_CAM;
 	xpt_done((union ccb *)atiop);
 	isp_prt(isp, ISP_LOGTDEBUG0, "ATIO2[0x%x] CDB=0x%x lun %jx datalen %u", aep->at_rxid, atp->cdb0, (uintmax_t)lun, atp->orig_datalen);
 	return;
 noresrc:
 	/* Park a copy of the entry on the restart queue, or report BUSY. */
 	ntp = isp_get_ntpd(isp, 0);
 	if (ntp == NULL) {
 		isp_endcmd(isp, aep, nphdl, 0, SCSI_STATUS_BUSY, 0);
 		return;
 	}
 	memcpy(ntp->data, aep, QENTRY_LEN);
 	STAILQ_INSERT_TAIL(&tptr->restart_queue, ntp, next);
 }
 
 /*
  * Handle an incoming ATIO7: derive the channel from the D_ID, look up
  * the initiator in the port database by S_ID, pick the lun state, take
  * a free ATIO CCB and a private data adjunct, fill both in from the
  * firmware entry and hand the ATIO to CAM with xpt_done().
  *
  * Exchanges whose D_ID matches no channel or whose initiator is not in
  * the port database are terminated.  When resources are short the entry
  * is copied onto the lun's restart queue, or ended with BUSY status if
  * no restart slot is available.
  */
 static void
 isp_handle_platform_atio7(ispsoftc_t *isp, at7_entry_t *aep)
 {
 	int cdbxlen;
 	lun_id_t lun;
 	uint16_t chan, nphdl = NIL_HANDLE;
 	uint32_t did, sid;
 	fcportdb_t *lp;
 	tstate_t *tptr;
 	struct ccb_accept_tio *atiop;
 	atio_private_data_t *atp = NULL;
 	atio_private_data_t *oatp;
 	inot_private_data_t *ntp;
 
 	/* Assemble the 24-bit destination and source port ids from the frame header bytes. */
 	did = (aep->at_hdr.d_id[0] << 16) | (aep->at_hdr.d_id[1] << 8) | aep->at_hdr.d_id[2];
 	sid = (aep->at_hdr.s_id[0] << 16) | (aep->at_hdr.s_id[1] << 8) | aep->at_hdr.s_id[2];
 	lun = CAM_EXTLUN_BYTE_SWIZZLE(be64dec(aep->at_cmnd.fcp_cmnd_lun));
 
 	if (ISP_CAP_MULTI_ID(isp) && isp->isp_nchan > 1) {
 		/* Channel has to be derived from D_ID */
 		isp_find_chan_by_did(isp, did, &chan);
 		if (chan == ISP_NOCHAN) {
 			isp_prt(isp, ISP_LOGWARN,
 			    "%s: [RX_ID 0x%x] D_ID %x not found on any channel",
 			    __func__, aep->at_rxid, did);
 			isp_endcmd(isp, aep, NIL_HANDLE, ISP_NOCHAN,
 			    ECMD_TERMINATE, 0);
 			return;
 		}
 	} else {
 		chan = 0;
 	}
 
 	/*
 	 * Find the PDB entry for this initiator
 	 */
 	if (isp_find_pdb_by_portid(isp, chan, sid, &lp) == 0) {
 		/*
 		 * If we're not in the port database terminate the exchange.
 		 */
 		isp_prt(isp, ISP_LOGTINFO, "%s: [RX_ID 0x%x] D_ID 0x%06x found on Chan %d for S_ID 0x%06x wasn't in PDB already",
 		    __func__, aep->at_rxid, did, chan, sid);
 		isp_dump_portdb(isp, chan);
 		isp_endcmd(isp, aep, NIL_HANDLE, chan, ECMD_TERMINATE, 0);
 		return;
 	}
 	nphdl = lp->handle;
 
 	/*
 	 * Get the tstate pointer
 	 */
 	tptr = get_lun_statep(isp, chan, lun);
 	if (tptr == NULL) {
 		/* No state for this exact lun; fall back to the wildcard lun. */
 		tptr = get_lun_statep(isp, chan, CAM_LUN_WILDCARD);
 		if (tptr == NULL) {
 			isp_prt(isp, ISP_LOGWARN,
 			    "%s: [0x%x] no state pointer for lun %jx or wildcard",
 			    __func__, aep->at_rxid, (uintmax_t)lun);
 			if (lun == 0) {
 				isp_endcmd(isp, aep, nphdl, chan, SCSI_STATUS_BUSY, 0);
 			} else {
 				isp_endcmd(isp, aep, nphdl, chan, SCSI_STATUS_CHECK_COND | ECMD_SVALID | (0x5 << 12) | (0x25 << 16), 0);
 			}
 			return;
 		}
 	}
 
 	/*
 	 * Start any commands pending resources first.
 	 */
 	if (isp_atio_restart(isp, chan, tptr))
 		goto noresrc;
 
 	/*
 	 * If the f/w is out of resources, just send a BUSY status back.
 	 */
 	if (aep->at_rxid == AT7_NORESRC_RXID) {
 		isp_endcmd(isp, aep, nphdl, chan, SCSI_BUSY, 0);
 		return;
 	}
 
 	/*
 	 * If we're out of resources, just send a BUSY status back.
 	 */
 	atiop = (struct ccb_accept_tio *) SLIST_FIRST(&tptr->atios);
 	if (atiop == NULL) {
 		isp_prt(isp, ISP_LOGTDEBUG0, "[0x%x] out of atios", aep->at_rxid);
 		goto noresrc;
 	}
 
 	/* An adjunct that already exists for this RX_ID means the tag is still in use. */
 	oatp = isp_find_atpd(isp, chan, aep->at_rxid);
 	if (oatp) {
 		isp_prt(isp, ISP_LOGTDEBUG0, "[0x%x] tag wraparound in isp_handle_platforms_atio7 (N-Port Handle 0x%04x S_ID 0x%04x OX_ID 0x%04x) oatp state %d",
 		    aep->at_rxid, nphdl, sid, aep->at_hdr.ox_id, oatp->state);
 		/*
 		 * It's not a "no resource" condition- but we can treat it like one
 		 */
 		goto noresrc;
 	}
 	atp = isp_get_atpd(isp, chan, aep->at_rxid);
 	if (atp == NULL) {
 		isp_prt(isp, ISP_LOGTDEBUG0, "[0x%x] out of atps", aep->at_rxid);
 		goto noresrc;
 	}
 	atp->word3 = lp->prli_word3;
 	atp->state = ATPD_STATE_ATIO;
 	SLIST_REMOVE_HEAD(&tptr->atios, sim_links.sle);
 	ISP_PATH_PRT(isp, ISP_LOGTDEBUG2, atiop->ccb_h.path, "Take FREE ATIO\n");
 	atiop->init_id = FC_PORTDB_TGT(isp, chan, lp);
 	atiop->ccb_h.target_id = ISP_MAX_TARGETS(isp);
 	atiop->ccb_h.target_lun = lun;
 	atiop->sense_len = 0;
 	/* Any additional CDB length is only reported; the fixed-size CDB is what gets copied. */
 	cdbxlen = aep->at_cmnd.fcp_cmnd_alen_datadir >> FCP_CMND_ADDTL_CDBLEN_SHIFT;
 	if (cdbxlen) {
 		isp_prt(isp, ISP_LOGWARN, "additional CDBLEN ignored");
 	}
 	cdbxlen = sizeof (aep->at_cmnd.cdb_dl.sf.fcp_cmnd_cdb);
 	ISP_MEMCPY(atiop->cdb_io.cdb_bytes, aep->at_cmnd.cdb_dl.sf.fcp_cmnd_cdb, cdbxlen);
 	atiop->cdb_len = cdbxlen;
 	atiop->ccb_h.status = CAM_CDB_RECVD;
 	atiop->tag_id = atp->tag;
 	/* Translate the FCP task attribute into a CAM tag action. */
 	switch (aep->at_cmnd.fcp_cmnd_task_attribute & FCP_CMND_TASK_ATTR_MASK) {
 	case FCP_CMND_TASK_ATTR_SIMPLE:
 		atiop->ccb_h.flags |= CAM_TAG_ACTION_VALID;
 		atiop->tag_action = MSG_SIMPLE_Q_TAG;
 		break;
 	case FCP_CMND_TASK_ATTR_HEAD:
 		atiop->ccb_h.flags |= CAM_TAG_ACTION_VALID;
 		atiop->tag_action = MSG_HEAD_OF_Q_TAG;
 		break;
 	case FCP_CMND_TASK_ATTR_ORDERED:
 		atiop->ccb_h.flags |= CAM_TAG_ACTION_VALID;
 		atiop->tag_action = MSG_ORDERED_Q_TAG;
 		break;
 	default:
 		/* FALLTHROUGH */
 	case FCP_CMND_TASK_ATTR_ACA:
 	case FCP_CMND_TASK_ATTR_UNTAGGED:
 		atiop->tag_action = 0;
 		break;
 	}
 	/* Record the exchange details in the adjunct before handing the ATIO to CAM. */
 	atp->orig_datalen = aep->at_cmnd.cdb_dl.sf.fcp_cmnd_dl;
 	atp->bytes_xfered = 0;
 	atp->lun = lun;
 	atp->nphdl = nphdl;
 	atp->sid = sid;
 	atp->did = did;
 	atp->oxid = aep->at_hdr.ox_id;
 	atp->rxid = aep->at_hdr.rx_id;
 	atp->cdb0 = atiop->cdb_io.cdb_bytes[0];
 	atp->tattr = aep->at_cmnd.fcp_cmnd_task_attribute & FCP_CMND_TASK_ATTR_MASK;
 	atp->state = ATPD_STATE_CAM;
 	isp_prt(isp, ISP_LOGTDEBUG0, "ATIO7[0x%x] CDB=0x%x lun %jx datalen %u",
 	    aep->at_rxid, atp->cdb0, (uintmax_t)lun, atp->orig_datalen);
 	xpt_done((union ccb *)atiop);
 	return;
 noresrc:
 	/* Release any adjunct taken above, then park a copy of the entry or report BUSY. */
 	if (atp)
 		isp_put_atpd(isp, chan, atp);
 	ntp = isp_get_ntpd(isp, chan);
 	if (ntp == NULL) {
 		isp_endcmd(isp, aep, nphdl, chan, SCSI_STATUS_BUSY, 0);
 		return;
 	}
 	memcpy(ntp->data, aep, QENTRY_LEN);
 	STAILQ_INSERT_TAIL(&tptr->restart_queue, ntp, next);
 }
 
 
 /*
  * Handle starting an SRR (sequence retransmit request).
  * We get here when we've gotten the immediate notify
  * and the return of all outstanding CTIOs for this
  * transaction.
  *
  * The saved notify (atp->srr) supplies the relative offset and the
  * kind of information unit to retransmit; atp->srr_ccb is the CCB
  * whose CTIO came back with the SRR status and is consumed here.
  *
  * Outcomes:
  *  - the request is acked and the CCB restarted with FROM_SRR;
  *  - the request is acked and the CCB is completed with
  *    CAM_MESSAGE_RECV and a synthesized "modify data pointer"
  *    message carrying the SRR offset (label mdp);
  *  - the request is rejected and the CCB, if any, is completed with
  *    CAM_REQ_CMP_ERR (label fail).
  */
 static void
 isp_handle_srr_start(ispsoftc_t *isp, atio_private_data_t *atp)
 {
 	in_fcentry_24xx_t *inot;
 	uint32_t srr_off, ccb_off, ccb_len, ccb_end;
 	union ccb *ccb;
 
 	inot = (in_fcentry_24xx_t *)atp->srr;
 	/* Widen before shifting so the high half cannot overflow a signed int. */
 	srr_off = inot->in_srr_reloff_lo | ((uint32_t)inot->in_srr_reloff_hi << 16);
 	ccb = atp->srr_ccb;
 	atp->srr_ccb = NULL;
 	atp->nsrr++;
 	if (ccb == NULL) {
 		isp_prt(isp, ISP_LOGWARN, "SRR[0x%x] null ccb", atp->tag);
 		goto fail;
 	}
 
 	/* spriv_field0 holds the CCB's starting data offset, or ~0 if it moved no data. */
 	ccb_off = ccb->ccb_h.spriv_field0;
 	ccb_len = ccb->csio.dxfer_len;
 	ccb_end = (ccb_off == ~0)? ~0 : ccb_off + ccb_len;
 
 	switch (inot->in_srr_iu) {
 	case R_CTL_INFO_SOLICITED_DATA:
 		/*
 		 * We have to restart a FCP_DATA data out transaction
 		 */
 		atp->sendst = 0;
 		atp->bytes_xfered = srr_off;
 		if (ccb_len == 0) {
 			isp_prt(isp, ISP_LOGWARN, "SRR[0x%x] SRR offset 0x%x but current CCB doesn't transfer data", atp->tag, srr_off);
 			goto mdp;
 		}
 		/* The SRR offset must fall inside this CCB's data range. */
 		if (srr_off < ccb_off || srr_off > ccb_off + ccb_len) {
 			isp_prt(isp, ISP_LOGWARN, "SRR[0x%x] SRR offset 0x%x not covered by current CCB data range [0x%x..0x%x]", atp->tag, srr_off, ccb_off, ccb_end);
 			goto mdp;
 		}
 		isp_prt(isp, ISP_LOGWARN, "SRR[0x%x] SRR offset 0x%x covered by current CCB data range [0x%x..0x%x]", atp->tag, srr_off, ccb_off, ccb_end);
 		break;
 	case R_CTL_INFO_COMMAND_STATUS:
 		isp_prt(isp, ISP_LOGTINFO, "SRR[0x%x] Got an FCP RSP SRR- resending status", atp->tag);
 		atp->sendst = 1;
 		/*
 		 * We have to restart a FCP_RSP IU transaction
 		 */
 		break;
 	case R_CTL_INFO_DATA_DESCRIPTOR:
 		/*
 		 * We have to restart an FCP DATA in transaction
 		 */
 		isp_prt(isp, ISP_LOGWARN, "Got an FCP DATA IN SRR- dropping");
 		goto fail;
 
 	default:
 		isp_prt(isp, ISP_LOGWARN, "Got an unknown information (%x) SRR- dropping", inot->in_srr_iu);
 		goto fail;
 	}
 
 	/*
 	 * We can't do anything until this is acked, so we might as well start it now.
 	 * We aren't going to do the usual asynchronous ack issue because we need
 	 * to make sure this gets on the wire first.
 	 */
 	if (isp_notify_ack(isp, inot)) {
 		isp_prt(isp, ISP_LOGWARN, "could not push positive ack for SRR- you lose");
 		goto fail;
 	}
 	isp_target_start_ctio(isp, ccb, FROM_SRR);
 	return;
 fail:
 	/* NOTE(review): in_reserved = 1 presumably marks the ack as a reject - confirm. */
 	inot->in_reserved = 1;
 	isp_async(isp, ISPASYNC_TARGET_NOTIFY_ACK, inot);
 	/* The "null ccb" case above lands here too, so there may be nothing to complete. */
 	if (ccb != NULL) {
 		ccb->ccb_h.status &= ~CAM_STATUS_MASK;
 		ccb->ccb_h.status |= CAM_REQ_CMP_ERR;
 		isp_complete_ctio(ccb);
 	}
 	return;
 mdp:
 	if (isp_notify_ack(isp, inot)) {
 		isp_prt(isp, ISP_LOGWARN, "could not push positive ack for SRR- you lose");
 		goto fail;
 	}
 	ccb->ccb_h.status &= ~CAM_STATUS_MASK;
 	ccb->ccb_h.status = CAM_MESSAGE_RECV;
 	/*
 	 * This is not a strict interpretation of MDP, but it's close
 	 */
 	ccb->csio.msg_ptr = &ccb->csio.sense_data.sense_buf[SSD_FULL_SIZE - 16];
 	ccb->csio.msg_len = 7;
 	ccb->csio.msg_ptr[0] = MSG_EXTENDED;
 	ccb->csio.msg_ptr[1] = 5;
 	ccb->csio.msg_ptr[2] = 0;	/* modify data pointer */
 	/* SRR offset, most significant byte first. */
 	ccb->csio.msg_ptr[3] = srr_off >> 24;
 	ccb->csio.msg_ptr[4] = srr_off >> 16;
 	ccb->csio.msg_ptr[5] = srr_off >> 8;
 	ccb->csio.msg_ptr[6] = srr_off;
 	isp_complete_ctio(ccb);
 }
 
 
 /*
  * Handle an SRR immediate notify: find the private data adjunct for the
  * notify's tag, record that the notify arrived and save a copy of it in
  * the adjunct.  If the CCB for the SRR has already been parked on the
  * adjunct, start the SRR right away; if no adjunct exists the notify is
  * simply acknowledged.
  */
 static void
 isp_handle_platform_srr(ispsoftc_t *isp, isp_notify_t *notify)
 {
 	in_fcentry_24xx_t *inot;
 	atio_private_data_t *atp;
 	uint32_t tag, reloff;
 
 	inot = notify->nt_lreserved;
 	tag = notify->nt_tagval & 0xffffffff;
 
 	atp = isp_find_atpd(isp, notify->nt_channel, tag);
 	if (atp == NULL) {
 		isp_prt(isp, ISP_LOGERR, "%s: cannot find adjunct for %x in SRR Notify",
 		    __func__, tag);
 		isp_async(isp, ISPASYNC_TARGET_NOTIFY_ACK, inot);
 		return;
 	}
 
 	atp->srr_notify_rcvd = 1;
 	memcpy(atp->srr, inot, sizeof (atp->srr));
 
 	/* Relative offset is delivered as two 16-bit halves. */
 	reloff = ((uint32_t)inot->in_srr_reloff_hi << 16) | inot->in_srr_reloff_lo;
 	isp_prt(isp, ISP_LOGTINFO, "SRR[0x%x] flags 0x%x srr_iu %x reloff 0x%x",
 	    inot->in_rxid, inot->in_flags, inot->in_srr_iu, reloff);
 
 	if (atp->srr_ccb != NULL)
 		isp_handle_srr_start(isp, atp);
 }
 
 /*
  * Process a completed CTIO queue entry (CTIO7 on 24XX chips, CTIO2
  * otherwise).
  *
  * 'arg' points at the queue entry.  The originating CCB is looked up via
  * ct_syshandle; the handle and the per-command data are released, the
  * ATIO adjunct (atp) accounting is updated and the CCB status is derived
  * from the entry's completion code.  Depending on the outcome the CCB is
  * completed, another CTIO is started to send status, or an ATIO putback
  * is issued.
  */
 static void
 isp_handle_platform_ctio(ispsoftc_t *isp, void *arg)
 {
 	union ccb *ccb;
 	int sentstatus = 0, ok = 0, notify_cam = 0, failure = 0;
 	atio_private_data_t *atp = NULL;
 	int bus;
 	uint32_t handle, data_requested, resid;
 
 	handle = ((ct2_entry_t *)arg)->ct_syshandle;
 	ccb = isp_find_xs(isp, handle);
 	if (ccb == NULL) {
 		isp_print_bytes(isp, "null ccb in isp_handle_platform_ctio", QENTRY_LEN, arg);
 		return;
 	}
 	isp_destroy_handle(isp, handle);
 	/*
 	 * Start with resid == data_requested, i.e. "nothing transferred",
 	 * until the entry reports an actual residual below.
 	 */
 	resid = data_requested = PISP_PCMD(ccb)->datalen;
 	isp_free_pcmd(isp, ccb);
 
 	bus = XS_CHANNEL(ccb);
 	if (IS_24XX(isp)) {
 		atp = isp_find_atpd(isp, bus, ((ct7_entry_t *)arg)->ct_rxid);
 	} else {
 		atp = isp_find_atpd(isp, bus, ((ct2_entry_t *)arg)->ct_rxid);
 	}
 	if (atp == NULL) {
 		/*
 		 * XXX: isp_clear_commands() generates fake CTIO with zero
 		 * ct_rxid value, filling only ct_syshandle.  Workaround
 		 * that using tag_id from the CCB, pointed by ct_syshandle.
 		 */
 		atp = isp_find_atpd(isp, bus, ccb->csio.tag_id);
 	}
 	if (atp == NULL) {
 		isp_prt(isp, ISP_LOGERR, "%s: cannot find adjunct for %x after I/O", __func__, ccb->csio.tag_id);
 		return;
 	}
 	KASSERT((atp->ctcnt > 0), ("ctio count not greater than zero"));
 	/* This CTIO is no longer outstanding against the adjunct. */
 	atp->bytes_in_transit -= data_requested;
 	atp->ctcnt -= 1;
 	ccb->ccb_h.status &= ~CAM_STATUS_MASK;
 
 	if (IS_24XX(isp)) {
 		ct7_entry_t *ct = arg;
 
 		if (ct->ct_nphdl == CT7_SRR) {
 			/*
 			 * Park the CCB on the adjunct; SRR handling starts
 			 * once the matching SRR notify has been seen as well.
 			 */
 			atp->srr_ccb = ccb;
 			if (atp->srr_notify_rcvd)
 				isp_handle_srr_start(isp, atp);
 			return;
 		}
 		if (ct->ct_nphdl == CT_HBA_RESET) {
 			sentstatus = (ccb->ccb_h.flags & CAM_SEND_STATUS) &&
 			    (atp->sendst == 0);
 			failure = CAM_UNREC_HBA_ERROR;
 		} else {
 			sentstatus = ct->ct_flags & CT7_SENDSTATUS;
 			ok = (ct->ct_nphdl == CT7_OK);
 			notify_cam = (ct->ct_header.rqs_seqno & ATPD_SEQ_NOTIFY_CAM) != 0;
 			/* Only entries that carried data report a residual. */
 			if ((ct->ct_flags & CT7_DATAMASK) != CT7_NO_DATA)
 				resid = ct->ct_resid;
 		}
 		isp_prt(isp, ok? ISP_LOGTDEBUG0 : ISP_LOGWARN, "%s: CTIO7[%x] seq %u nc %d sts 0x%x flg 0x%x sns %d resid %d %s", __func__, ct->ct_rxid, ATPD_GET_SEQNO(ct),
 		   notify_cam, ct->ct_nphdl, ct->ct_flags, (ccb->ccb_h.status & CAM_SENT_SENSE) != 0, resid, sentstatus? "FIN" : "MID");
 	} else {
 		ct2_entry_t *ct = arg;
 		if (ct->ct_status == CT_SRR) {
 			/* Same parking scheme as above, plus an ATIO putback. */
 			atp->srr_ccb = ccb;
 			if (atp->srr_notify_rcvd)
 				isp_handle_srr_start(isp, atp);
 			isp_target_putback_atio(ccb);
 			return;
 		}
 		if (ct->ct_status == CT_HBA_RESET) {
 			sentstatus = (ccb->ccb_h.flags & CAM_SEND_STATUS) &&
 			    (atp->sendst == 0);
 			failure = CAM_UNREC_HBA_ERROR;
 		} else {
 			sentstatus = ct->ct_flags & CT2_SENDSTATUS;
 			ok = (ct->ct_status & ~QLTM_SVALID) == CT_OK;
 			notify_cam = (ct->ct_header.rqs_seqno & ATPD_SEQ_NOTIFY_CAM) != 0;
 			if ((ct->ct_flags & CT2_DATAMASK) != CT2_NO_DATA)
 				resid = ct->ct_resid;
 		}
 		isp_prt(isp, ok? ISP_LOGTDEBUG0 : ISP_LOGWARN, "%s: CTIO2[%x] seq %u nc %d sts 0x%x flg 0x%x sns %d resid %d %s", __func__, ct->ct_rxid, ATPD_GET_SEQNO(ct),
 		    notify_cam, ct->ct_status, ct->ct_flags, (ccb->ccb_h.status & CAM_SENT_SENSE) != 0, resid, sentstatus? "FIN" : "MID");
 	}
 	if (ok) {
 		if (data_requested > 0) {
 			/* Account for what this CTIO actually moved. */
 			atp->bytes_xfered += data_requested - resid;
 			ccb->csio.resid = ccb->csio.dxfer_len -
 			    (data_requested - resid);
 		}
 		if (sentstatus && (ccb->ccb_h.flags & CAM_SEND_SENSE))
 			ccb->ccb_h.status |= CAM_SENT_SENSE;
 		ccb->ccb_h.status |= CAM_REQ_CMP;
 	} else {
 		/* Errors are always reported to CAM. */
 		notify_cam = 1;
 		if (failure == CAM_UNREC_HBA_ERROR)
 			ccb->ccb_h.status |= CAM_UNREC_HBA_ERROR;
 		else
 			ccb->ccb_h.status |= CAM_REQ_CMP_ERR;
 	}
 	atp->state = ATPD_STATE_PDON;
 
 	/*
 	 * We never *not* notify CAM when there has been any error (ok == 0),
 	 * so we never need to do an ATIO putback if we're not notifying CAM.
 	 */
 	isp_prt(isp, ISP_LOGTDEBUG0, "%s CTIO[0x%x] done (ok=%d nc=%d nowsendstatus=%d ccb ss=%d)",
 	    (sentstatus)? "  FINAL " : "MIDTERM ", atp->tag, ok, notify_cam, atp->sendst, (ccb->ccb_h.flags & CAM_SEND_STATUS) != 0);
 	if (notify_cam == 0) {
 		/* Intermediate CTIO: continue with the status phase if pending. */
 		if (atp->sendst) {
 			isp_target_start_ctio(isp, ccb, FROM_CTIO_DONE);
 		}
 		return;
 	}
 
 	/*
 	 * We are done with this ATIO if we successfully sent status.
 	 * In all other cases expect either another CTIO or XPT_ABORT.
 	 */
 	if (ok && sentstatus)
 		isp_put_atpd(isp, bus, atp);
 
 	/*
 	 * We're telling CAM we're done with this CTIO transaction.
 	 *
 	 * 24XX cards never need an ATIO put back.
 	 *
 	 * Other cards need one put back only on error.
 	 * In the latter case, a timeout will re-fire
 	 * and try again in case we didn't have
 	 * queue resources to do so at first. In any case,
 	 * once the putback is done we do the completion
 	 * call.
 	 */
 	if (ok || IS_24XX(isp)) {
 		isp_complete_ctio(ccb);
 	} else {
 		isp_target_putback_atio(ccb);
 	}
 }
 
 /*
  * Acknowledge a notify described by 'mp'.
  *
  * On 24XX chips two kinds of saved queue entries get special treatment:
  * an ATIO (a task management function) is answered with a status-only
  * CTIO7 carrying 'rsp' as response data when it is non-zero, and a
  * received ABTS is answered by optionally terminating the exchange and
  * then acking the ABTS.  Everything else goes through the general
  * notify ack when nt_need_ack is set.
  *
  * Returns 0 on success or when nothing had to be sent, otherwise the
  * non-zero result of the send attempt (ENOMEM in the ABTS case).
  */
 static int
 isp_handle_platform_target_notify_ack(ispsoftc_t *isp, isp_notify_t *mp, uint32_t rsp)
 {
 	isphdr_t *qe_hdr;
 
 	if (isp->isp_state != ISP_RUNSTATE) {
 		isp_prt(isp, ISP_LOGTINFO, "Notify Code 0x%x (qevalid=%d) acked- h/w not ready (dropping)", mp->nt_ncode, mp->nt_lreserved != NULL);
 		return (0);
 	}
 
 	/* The saved queue entry is only examined by type on 24XX chips. */
 	qe_hdr = IS_24XX(isp) ? (isphdr_t *)mp->nt_lreserved : NULL;
 
 	/*
 	 * A Task Management Function shows up as an ATIO7 entry.
 	 */
 	if (qe_hdr != NULL && qe_hdr->rqs_entry_type == RQSTYPE_ATIO) {
 		at7_entry_t *aep = (at7_entry_t *)mp->nt_lreserved;
 		ct7_entry_t cto;
 		fcportdb_t *lp;
 		uint32_t sid;
 		int i;
 
 		sid = (aep->at_hdr.s_id[0] << 16) | (aep->at_hdr.s_id[1] << 8) | aep->at_hdr.s_id[2];
 
 		ISP_MEMZERO(&cto, sizeof (cto));
 		cto.ct_header.rqs_entry_type = RQSTYPE_CTIO7;
 		cto.ct_header.rqs_entry_count = 1;
 		/* Use the port database handle when the initiator is known. */
 		if (isp_find_pdb_by_portid(isp, mp->nt_channel, sid, &lp)) {
 			cto.ct_nphdl = lp->handle;
 		} else {
 			cto.ct_nphdl = NIL_HANDLE;
 		}
 		cto.ct_rxid = aep->at_rxid;
 		cto.ct_vpidx = mp->nt_channel;
 		cto.ct_iid_lo = sid;
 		cto.ct_iid_hi = sid >> 16;
 		cto.ct_oxid = aep->at_hdr.ox_id;
 		cto.ct_flags = CT7_SENDSTATUS|CT7_NOACK|CT7_NO_DATA|CT7_FLAG_MODE1;
 		cto.ct_flags |= (aep->at_ta_len >> 12) << CT7_TASK_ATTR_SHIFT;
 		if (rsp != 0) {
 			cto.ct_scsi_status |= (FCP_RSPLEN_VALID << 8);
 			cto.rsp.m1.ct_resplen = 4;
 			ISP_MEMZERO(cto.rsp.m1.ct_resp, sizeof (cto.rsp.m1.ct_resp));
 			/* Response code, least significant byte first. */
 			for (i = 0; i < 4; i++) {
 				cto.rsp.m1.ct_resp[i] = (rsp >> (8 * i)) & 0xff;
 			}
 		}
 		return (isp_target_put_entry(isp, &cto));
 	}
 
 	/*
 	 * Responding to an ABTS frame.
 	 */
 	if (qe_hdr != NULL && qe_hdr->rqs_entry_type == RQSTYPE_ABTS_RCVD) {
 		/*
 		 * nt_need_ack is overloaded here: it marks that the
 		 * associated command has not been terminated yet.
 		 */
 		if (mp->nt_need_ack) {
 			uint8_t storage[QENTRY_LEN];
 			ct7_entry_t *term = (ct7_entry_t *) storage;
 			abts_t *abts = (abts_t *)mp->nt_lreserved;
 
 			ISP_MEMZERO(term, sizeof (ct7_entry_t));
 			isp_prt(isp, ISP_LOGTDEBUG0, "%s: [%x] terminating after ABTS received", __func__, abts->abts_rxid_task);
 			term->ct_header.rqs_entry_type = RQSTYPE_CTIO7;
 			term->ct_header.rqs_entry_count = 1;
 			term->ct_nphdl = mp->nt_nphdl;
 			term->ct_rxid = abts->abts_rxid_task;
 			term->ct_iid_lo = mp->nt_sid;
 			term->ct_iid_hi = mp->nt_sid >> 16;
 			term->ct_oxid = abts->abts_ox_id;
 			term->ct_vpidx = mp->nt_channel;
 			term->ct_flags = CT7_NOACK|CT7_TERMINATE;
 			if (isp_target_put_entry(isp, term)) {
 				return (ENOMEM);
 			}
 			mp->nt_need_ack = 0;
 		}
 		return ((isp_acknak_abts(isp, mp->nt_lreserved, 0) == ENOMEM) ? ENOMEM : 0);
 	}
 
 	/*
 	 * Logout notifications drop the affected WWN entries first.
 	 */
 	if (mp->nt_ncode == NT_GLOBAL_LOGOUT) {
 		isp_del_all_wwn_entries(isp, mp->nt_channel);
 	}
 	if (mp->nt_ncode == NT_LOGOUT && !IS_2100(isp) && IS_FC(isp)) {
 		isp_del_wwn_entries(isp, mp);
 	}
 
 	/*
 	 * General purpose acknowledgement
 	 */
 	if (!mp->nt_need_ack) {
 		return (0);
 	}
 	isp_prt(isp, ISP_LOGTINFO, "Notify Code 0x%x (qevalid=%d) being acked", mp->nt_ncode, mp->nt_lreserved != NULL);
 	/*
 	 * No guaranteed send is needed here because the caller can retry.
 	 */
 	return (isp_notify_ack(isp, mp->nt_lreserved));
 }
 
 /*
  * Handle task management functions.
  *
  * We show up here with a notify structure filled out.
  *
  * The nt_lreserved tag points to the original queue entry
  *
  * A free immediate-notify CCB is taken from the LUN's (or, failing that,
  * the wildcard LUN's) state, filled in from the notify and completed
  * with CAM_MESSAGE_RECV.  A private copy of the notify (and of the queue
  * entry, when there is one) is kept for the later acknowledge.  If any
  * resource is missing or the TMF code is unknown, the notify is
  * acknowledged immediately instead.
  */
 static void
 isp_handle_platform_target_tmf(ispsoftc_t *isp, isp_notify_t *notify)
 {
 	tstate_t *tptr;
 	fcportdb_t *lp;
 	struct ccb_immediate_notify *inot;
 	inot_private_data_t *ntp = NULL;
 	atio_private_data_t *atp;
 	lun_id_t lun;
 
 	/* Cast nt_lun for %jx, as every other %jx user in this function does. */
 	isp_prt(isp, ISP_LOGTDEBUG0, "%s: code 0x%x sid  0x%x tagval 0x%016llx chan %d lun %jx", __func__, notify->nt_ncode,
 	    notify->nt_sid, (unsigned long long) notify->nt_tagval, notify->nt_channel, (uintmax_t)notify->nt_lun);
 
 	/*
 	 * Work out which LUN this applies to.  When the notify does not
 	 * name one, borrow it from the adjunct matching the tag, if any.
 	 */
 	if (notify->nt_lun == LUN_ANY) {
 		if (notify->nt_tagval == TAG_ANY) {
 			lun = CAM_LUN_WILDCARD;
 		} else {
 			atp = isp_find_atpd(isp, notify->nt_channel,
 			    notify->nt_tagval & 0xffffffff);
 			lun = atp ? atp->lun : CAM_LUN_WILDCARD;
 		}
 	} else {
 		lun = notify->nt_lun;
 	}
 	tptr = get_lun_statep(isp, notify->nt_channel, lun);
 	if (tptr == NULL) {
 		tptr = get_lun_statep(isp, notify->nt_channel, CAM_LUN_WILDCARD);
 		if (tptr == NULL) {
 			isp_prt(isp, ISP_LOGWARN, "%s: no state pointer found for chan %d lun %#jx", __func__, notify->nt_channel, (uintmax_t)lun);
 			goto bad;
 		}
 	}
 	/* Peek at the first free INOT; it is only unlinked once we commit. */
 	inot = (struct ccb_immediate_notify *) SLIST_FIRST(&tptr->inots);
 	if (inot == NULL) {
 		isp_prt(isp, ISP_LOGWARN, "%s: out of immediate notify structures for chan %d lun %#jx", __func__, notify->nt_channel, (uintmax_t)lun);
 		goto bad;
 	}
 
 	inot->ccb_h.target_id = ISP_MAX_TARGETS(isp);
 	inot->ccb_h.target_lun = lun;
 	/* Identify the initiator by port ID first, then by N-port handle. */
 	if (isp_find_pdb_by_portid(isp, notify->nt_channel, notify->nt_sid, &lp) == 0 &&
 	    isp_find_pdb_by_handle(isp, notify->nt_channel, notify->nt_nphdl, &lp) == 0) {
 		inot->initiator_id = CAM_TARGET_WILDCARD;
 	} else {
 		inot->initiator_id = FC_PORTDB_TGT(isp, notify->nt_channel, lp);
 	}
 	/* The 64-bit tag value is split: low half is seq_id, high half tag_id. */
 	inot->seq_id = notify->nt_tagval;
 	inot->tag_id = notify->nt_tagval >> 32;
 
 	switch (notify->nt_ncode) {
 	case NT_ABORT_TASK:
 		isp_target_mark_aborted_early(isp, notify->nt_channel, tptr, inot->tag_id);
 		inot->arg = MSG_ABORT_TASK;
 		break;
 	case NT_ABORT_TASK_SET:
 		isp_target_mark_aborted_early(isp, notify->nt_channel, tptr, TAG_ANY);
 		inot->arg = MSG_ABORT_TASK_SET;
 		break;
 	case NT_CLEAR_ACA:
 		inot->arg = MSG_CLEAR_ACA;
 		break;
 	case NT_CLEAR_TASK_SET:
 		inot->arg = MSG_CLEAR_TASK_SET;
 		break;
 	case NT_LUN_RESET:
 		inot->arg = MSG_LOGICAL_UNIT_RESET;
 		break;
 	case NT_TARGET_RESET:
 		inot->arg = MSG_TARGET_RESET;
 		break;
 	case NT_QUERY_TASK_SET:
 		inot->arg = MSG_QUERY_TASK_SET;
 		break;
 	case NT_QUERY_ASYNC_EVENT:
 		inot->arg = MSG_QUERY_ASYNC_EVENT;
 		break;
 	default:
 		isp_prt(isp, ISP_LOGWARN, "%s: unknown TMF code 0x%x for chan %d lun %#jx", __func__, notify->nt_ncode, notify->nt_channel, (uintmax_t)lun);
 		goto bad;
 	}
 
 	ntp = isp_get_ntpd(isp, notify->nt_channel);
 	if (ntp == NULL) {
 		isp_prt(isp, ISP_LOGWARN, "%s: out of inotify private structures", __func__);
 		goto bad;
 	}
 	/*
 	 * Keep a private copy of the notify; when a queue entry is attached,
 	 * copy it too and repoint nt_lreserved at the private copy.
 	 */
 	ISP_MEMCPY(&ntp->nt, notify, sizeof (isp_notify_t));
 	if (notify->nt_lreserved) {
 		ISP_MEMCPY(&ntp->data, notify->nt_lreserved, QENTRY_LEN);
 		ntp->nt.nt_lreserved = &ntp->data;
 	}
 	ntp->seq_id = notify->nt_tagval;
 	ntp->tag_id = notify->nt_tagval >> 32;
 
 	SLIST_REMOVE_HEAD(&tptr->inots, sim_links.sle);
 	ISP_PATH_PRT(isp, ISP_LOGTDEBUG2, inot->ccb_h.path, "Take FREE INOT\n");
 	inot->ccb_h.status = CAM_MESSAGE_RECV;
 	xpt_done((union ccb *)inot);
 	return;
 bad:
 	if (notify->nt_need_ack) {
 		/*
 		 * nt_lreserved is treated as optional elsewhere in this
 		 * function, so do not look at the entry type unless there
 		 * really is an entry.
 		 */
 		if (notify->nt_lreserved != NULL &&
 		    ((isphdr_t *)notify->nt_lreserved)->rqs_entry_type == RQSTYPE_ABTS_RCVD) {
 			if (isp_acknak_abts(isp, notify->nt_lreserved, ENOMEM)) {
 				isp_prt(isp, ISP_LOGWARN, "you lose- unable to send an ACKNAK");
 			}
 		} else {
 			isp_async(isp, ISPASYNC_TARGET_NOTIFY_ACK, notify->nt_lreserved);
 		}
 	}
 }
 
 /*
  * Abort commands that have not completed yet for the LUN described by
  * 'tptr' on channel 'chan'.
  *
  * 'tag_id' selects a single command by its RX_ID; TAG_ANY selects all.
  * Commands still waiting on the restart queue are terminated and their
  * private data released.  Matching active adjuncts are only flagged
  * dead.
  */
 static void
 isp_target_mark_aborted_early(ispsoftc_t *isp, int chan, tstate_t *tptr, uint32_t tag_id)
 {
 	atio_private_data_t *atp, *atpool;
 	inot_private_data_t *ntp, *tmp;
 	uint32_t this_tag_id;
 
 	/*
 	 * First, clean any commands pending restart
 	 */
 	STAILQ_FOREACH_SAFE(ntp, &tptr->restart_queue, next, tmp) {
 		if (IS_24XX(isp))
 			this_tag_id = ((at7_entry_t *)ntp->data)->at_rxid;
 		else
 			this_tag_id = ((at2_entry_t *)ntp->data)->at_rxid;
 		if ((uint64_t)tag_id == TAG_ANY || tag_id == this_tag_id) {
 			isp_endcmd(isp, ntp->data, NIL_HANDLE, chan,
 			    ECMD_TERMINATE, 0);
 			/*
 			 * Unlink the entry before handing it back, so that
 			 * whatever isp_put_ntpd() does with it cannot
 			 * disturb the restart_queue linkage.
 			 */
 			STAILQ_REMOVE(&tptr->restart_queue, ntp,
 			    inot_private_data, next);
 			isp_put_ntpd(isp, chan, ntp);
 		}
 	}
 
 	/*
 	 * Now mark other ones dead as well.
 	 */
 	ISP_GET_PC(isp, chan, atpool, atpool);
 	for (atp = atpool; atp < &atpool[ATPDPSIZE]; atp++) {
 		if (atp->lun != tptr->ts_lun)
 			continue;
 		if ((uint64_t)tag_id == TAG_ANY || atp->tag == tag_id)
 			atp->dead = 1;
 	}
 }
 #endif
 
 /*
  * CAM async event callback; 'cbarg' is the SIM the callback belongs to.
  *
  * Only AC_LOST_DEVICE is acted upon, and only on parallel SCSI adapters
  * for a non-negative target id: the target's goal flags are temporarily
  * replaced by a safe subset of its NVRAM flags, a parameter update is
  * requested, and the previous goal flags are restored.  Any other code
  * is logged.
  */
 static void
 isp_cam_async(void *cbarg, uint32_t code, struct cam_path *path, void *arg)
 {
 	struct cam_sim *sim = (struct cam_sim *)cbarg;
 	ispsoftc_t *isp = (ispsoftc_t *) cam_sim_softc(sim);
 	int bus = cam_sim_bus(sim);
 	int tgt = xpt_path_target_id(path);
 	uint16_t saved_goal, safe_goal;
 	sdparam *sdp;
 
 	if (code != AC_LOST_DEVICE) {
 		isp_prt(isp, ISP_LOGWARN, "isp_cam_async: Code 0x%x", code);
 		return;
 	}
 	if (!IS_SCSI(isp) || tgt < 0) {
 		return;
 	}
 
 	sdp = SDPARAM(isp, bus);
 
 	/* Build the safe flag set from what NVRAM says about the target. */
 	safe_goal = sdp->isp_devparam[tgt].nvrm_flags & DPARM_SAFE_DFLT;
 	if (isp->isp_loaded_fw) {
 		safe_goal |= DPARM_NARROW | DPARM_ASYNC;
 	}
 
 	/* Push the safe flags, then put the old goal back. */
 	saved_goal = sdp->isp_devparam[tgt].goal_flags;
 	sdp->isp_devparam[tgt].goal_flags = safe_goal;
 	sdp->isp_devparam[tgt].dev_update = 1;
 	sdp->update = 1;
 	(void) isp_control(isp, ISPCTL_UPDATE_PARAMS, bus);
 	sdp->isp_devparam[tgt].goal_flags = saved_goal;
 }
 
 /*
  * CAM polling entry point: run the interrupt service code for the
  * adapter behind 'sim'.
  */
 static void
 isp_poll(struct cam_sim *sim)
 {
 	ispsoftc_t *isp;
 
 	isp = cam_sim_softc(sim);
 	ISP_RUN_ISR(isp);
 }
 
 
 /*
  * Per-command timeout handler; 'arg' is the SCSI I/O CCB that timed out.
  *
  * The interrupt code is run once by hand in case the completion is just
  * sitting in the queue.  If the command still owns a handle afterwards,
  * an abort is attempted; when that fails and the command still exists,
  * its resources are released here and it is completed with
  * CAM_CMD_TIMEOUT.
  */
 static void
 isp_watchdog(void *arg)
 {
 	struct ccb_scsiio *xs = arg;
 	ispsoftc_t *isp = XS_ISP(xs);
 	uint32_t seen_handle = ISP_HANDLE_FREE;
 	uint32_t handle;
 
 	/*
 	 * Hand crank the interrupt code just to be sure the command isn't
 	 * stuck somewhere, then look the command up again.
 	 */
 	handle = isp_find_handle(isp, xs);
 	if (handle != ISP_HANDLE_FREE) {
 		ISP_RUN_ISR(isp);
 		seen_handle = handle;
 		handle = isp_find_handle(isp, xs);
 	}
 
 	/* Nothing left to time out: just say how we got here. */
 	if (handle == ISP_HANDLE_FREE) {
 		if (seen_handle == ISP_HANDLE_FREE) {
 			isp_prt(isp, ISP_LOGWARN, "%s: timeout for handle already free", __func__);
 		} else {
 			isp_prt(isp, ISP_LOGWARN, "%s: timeout for handle 0x%x, recovered during interrupt", __func__, seen_handle);
 		}
 		return;
 	}
 
 	/*
 	 * Make sure the command is really dead before the handle (and DMA
 	 * resources) are released for reuse.  A successful abort means the
 	 * command comes back to us separately, so there is nothing more to
 	 * do here.
 	 */
 	if (isp_control(isp, ISPCTL_ABORT_CMD, xs) == 0) {
 		return;
 	}
 
 	/*
 	 * The abort attempt may have let the command complete after all; in
 	 * that case the handle no longer maps to a command.
 	 */
 	xs = isp_find_xs(isp, handle);
 	if (xs == NULL) {
 		return;
 	}
 
 	/*
 	 * From here on the command is really dead.
 	 */
 	if (XS_XFRLEN(xs)) {
 		ISP_DMAFREE(isp, xs, handle);
 	}
 	isp_destroy_handle(isp, handle);
 	isp_prt(isp, ISP_LOGERR, "%s: timeout for handle 0x%x", __func__, handle);
 	XS_SETERR(xs, CAM_CMD_TIMEOUT);
 	isp_done(xs);
 }
 
 /*
  * Announce target 'tgt' on channel 'chan' by scheduling a rescan of a
  * wildcard-LUN path to it.  'fcp' is not used here.
  */
 static void
 isp_make_here(ispsoftc_t *isp, fcportdb_t *fcp, int chan, int tgt)
 {
 	struct isp_fc *fc = ISP_FC_PC(isp, chan);
 	union ccb *rescan_ccb;
 
 	rescan_ccb = xpt_alloc_ccb_nowait();
 	if (rescan_ccb == NULL) {
 		isp_prt(isp, ISP_LOGWARN, "Chan %d unable to alloc CCB for rescan", chan);
 		return;
 	}
 
 	/* With a path in hand the CCB is passed on to the rescan. */
 	if (xpt_create_path(&rescan_ccb->ccb_h.path, NULL,
 	    cam_sim_path(fc->sim), tgt, CAM_LUN_WILDCARD) == CAM_REQ_CMP) {
 		xpt_rescan(rescan_ccb);
 		return;
 	}
 
 	isp_prt(isp, ISP_LOGWARN, "unable to create path for rescan");
 	xpt_free_ccb(rescan_ccb);
 }
 
 /*
  * Report target 'tgt' on channel 'chan' as lost by sending
  * AC_LOST_DEVICE on a temporary wildcard-LUN path.  Nothing is done if
  * the path cannot be created.  'fcp' is not used here.
  */
 static void
 isp_make_gone(ispsoftc_t *isp, fcportdb_t *fcp, int chan, int tgt)
 {
 	struct isp_fc *fc = ISP_FC_PC(isp, chan);
 	struct cam_path *lost_path;
 
 	if (xpt_create_path(&lost_path, NULL, cam_sim_path(fc->sim), tgt,
 	    CAM_LUN_WILDCARD) != CAM_REQ_CMP) {
 		return;
 	}
 	xpt_async(AC_LOST_DEVICE, lost_path, NULL);
 	xpt_free_path(lost_path);
 }
 
 /*
  * Gone Device Timer Function- once we have decided that a device has
  * gone away, we wait for a while before telling the OS about it.
  *
  * This is the callout handler for that timer.  It does no work itself;
  * it only queues the channel's gone-device task (isp_gdt_task) on
  * taskqueue_thread.  That task scans the port database for zombie
  * entries, counts their gone timers down and reports the ones that
  * reach zero.
  */
 static void
 isp_gdt(void *arg)
 {
 	struct isp_fc *fc;
 
 	fc = arg;
 	taskqueue_enqueue(taskqueue_thread, &fc->gtask);
 }
 
 /*
  * Gone-device task; 'arg' is the per-channel isp_fc, 'pending' is unused.
  *
  * With the softc lock held, every port database entry in the ZOMBIE
  * state is visited.  Entries whose gone timer is still running are
  * counted down; entries whose timer is at zero are reported as departed
  * (as target and/or initiator) and set to NIL.  While the channel is
  * ready, the timer is re-armed for one more second if any entry is still
  * counting, otherwise it is deactivated.
  */
 static void
 isp_gdt_task(void *arg, int pending)
 {
 	struct isp_fc *fc = arg;
 	ispsoftc_t *isp = fc->isp;
 	int chan = fc - isp->isp_osinfo.pc.fc;
 	struct ac_device_changed *adc;
 	struct ac_contract ac;
 	fcportdb_t *lp;
 	int still_counting = 0;
 	int idx;
 
 	ISP_LOCK(isp);
 	isp_prt(isp, ISP_LOGDEBUG0, "Chan %d GDT timer expired", chan);
 
 	for (idx = 0; idx < MAX_FC_TARG; idx++) {
 		lp = &FCPARAM(isp, chan)->portdb[idx];
 		if (lp->state != FC_PORTDB_STATE_ZOMBIE) {
 			continue;
 		}
 
 		/* Not expired yet: count down and look again next tick. */
 		if (lp->gone_timer != 0) {
 			lp->gone_timer -= 1;
 			still_counting += 1;
 			continue;
 		}
 
 		isp_prt(isp, ISP_LOGCONFIG, prom3, chan, idx, lp->portid, "Gone Device Timeout");
 		if (lp->is_target) {
 			lp->is_target = 0;
 			isp_make_gone(isp, lp, chan, idx);
 		}
 		if (lp->is_initiator) {
 			lp->is_initiator = 0;
 			/* Tell contract listeners this initiator has left. */
 			ac.contract_number = AC_CONTRACT_DEV_CHG;
 			adc = (struct ac_device_changed *) ac.contract_data;
 			adc->wwpn = lp->port_wwn;
 			adc->port = lp->portid;
 			adc->target = idx;
 			adc->arrived = 0;
 			xpt_async(AC_CONTRACT, fc->path, &ac);
 		}
 		lp->state = FC_PORTDB_STATE_NIL;
 	}
 
 	if (fc->ready && still_counting) {
 		callout_reset(&fc->gdt, hz, isp_gdt, fc);
 	} else if (fc->ready) {
 		callout_deactivate(&fc->gdt);
 		isp_prt(isp, ISP_LOG_SANCFG, "Chan %d Stopping Gone Device Timer @ %lu", chan, (unsigned long) time_uptime);
 	}
 	ISP_UNLOCK(isp);
 }
 
 /*
  * Note that the loop state on channel 'chan' has changed.
  *
  * The first time this happens while the loop is not already considered
  * down, the time is recorded, the CAM command queue is frozen when we
  * act as an initiator, and anyone sleeping on the channel structure is
  * woken up so the loop can be probed again.  If probing keeps failing,
  * the OS is eventually told that the devices have gone away and the
  * freeze is dropped.
  *
  * The port database is deliberately left alone here: when the loop
  * comes back, real cleanup with the chip (implicit PLOGO, e.g.) is
  * needed to get the chip's port database state right.
  */
 static void
 isp_loop_changed(ispsoftc_t *isp, int chan)
 {
 	struct isp_fc *fc = ISP_FC_PC(isp, chan);
 	fcparam *fcp = FCPARAM(isp, chan);
 
 	/* Already tracking a loop-down period. */
 	if (fc->loop_down_time != 0) {
 		return;
 	}
 
 	isp_prt(isp, ISP_LOG_SANCFG|ISP_LOGDEBUG0, "Chan %d Loop changed", chan);
 	if ((fcp->role & ISP_ROLE_INITIATOR) != 0) {
 		isp_freeze_loopdown(isp, chan);
 	}
 	fc->loop_down_time = time_uptime;
 	wakeup(fc);
 }
 
 /*
  * The loop on channel 'chan' is up: clear the loop-down timestamp,
  * remember that the loop has been seen at least once, and release the
  * loop-down freeze.
  */
 static void
 isp_loop_up(ispsoftc_t *isp, int chan)
 {
 	struct isp_fc *fc;
 
 	fc = ISP_FC_PC(isp, chan);
 	isp_prt(isp, ISP_LOG_SANCFG|ISP_LOGDEBUG0, "Chan %d Loop is up", chan);
 	fc->loop_down_time = 0;
 	fc->loop_seen_once = 1;
 	isp_unfreeze_loopdown(isp, chan);
 }
 
 /*
  * Give up on the loop of channel 'chan'.
  *
  * For every port database entry that is not NIL, initiator commands
  * still outstanding against that target are aborted, or torn down and
  * completed with HBA_BUSRESET when the abort fails.  The entry is then
  * reported to the OS as departed (as target and/or initiator).
  * Finally the loop-down freeze is released and the loop-down timestamp
  * is cleared.
  */
 static void
 isp_loop_dead(ispsoftc_t *isp, int chan)
 {
 	fcparam *fcp = FCPARAM(isp, chan);
 	struct isp_fc *fc = ISP_FC_PC(isp, chan);
 	fcportdb_t *lp;
 	struct ac_contract ac;
 	struct ac_device_changed *adc;
 	int dbidx, i;
 
 	isp_prt(isp, ISP_LOG_SANCFG|ISP_LOGDEBUG0, "Chan %d Loop is dead", chan);
 
 	/*
 	 * Notify to the OS all targets who we now consider have departed.
 	 */
 	for (dbidx = 0; dbidx < MAX_FC_TARG; dbidx++) {
 		lp = &fcp->portdb[dbidx];
 
 		if (lp->state == FC_PORTDB_STATE_NIL)
 			continue;
 
 		for (i = 0; i < isp->isp_maxcmds; i++) {
 			struct ccb_scsiio *xs;
 			uint32_t handle;
 
 			/* Only initiator commands aimed at this target matter. */
 			if (ISP_H2HT(isp->isp_xflist[i].handle) != ISP_HANDLE_INITIATOR) {
 				continue;
 			}
 			if ((xs = isp->isp_xflist[i].cmd) == NULL) {
 				continue;
 			}
 			if (dbidx != XS_TGT(xs)) {
 				continue;
 			}
 			isp_prt(isp, ISP_LOGWARN, "command handle 0x%x for %d.%d.%jx orphaned by loop down timeout",
 			    isp->isp_xflist[i].handle, chan, XS_TGT(xs),
 			    (uintmax_t)XS_LUN(xs));
 
 			/*
 			 * Just like in isp_watchdog, abort the outstanding
 			 * command or immediately free its resources if it is
 			 * not active
 			 */
 			if (isp_control(isp, ISPCTL_ABORT_CMD, xs) == 0) {
 				continue;
 			}
 
 			/*
 			 * Take a copy of the handle before the slot is given
 			 * back, so the message below reports the handle that
 			 * was destroyed and not whatever the released slot
 			 * holds afterwards (isp_watchdog keeps its handle in
 			 * a local for the same purpose).
 			 */
 			handle = isp->isp_xflist[i].handle;
 			if (XS_XFRLEN(xs)) {
 				ISP_DMAFREE(isp, xs, handle);
 			}
 			isp_destroy_handle(isp, handle);
 			isp_prt(isp, ISP_LOGWARN, "command handle 0x%x for %d.%d.%jx could not be aborted and was destroyed",
 			    handle, chan, XS_TGT(xs),
 			    (uintmax_t)XS_LUN(xs));
 			XS_SETERR(xs, HBA_BUSRESET);
 			isp_done(xs);
 		}
 
 		isp_prt(isp, ISP_LOGCONFIG, prom3, chan, dbidx, lp->portid, "Loop Down Timeout");
 		if (lp->is_target) {
 			lp->is_target = 0;
 			isp_make_gone(isp, lp, chan, dbidx);
 		}
 		if (lp->is_initiator) {
 			lp->is_initiator = 0;
 			/* Tell contract listeners this initiator has left. */
 			ac.contract_number = AC_CONTRACT_DEV_CHG;
 			adc = (struct ac_device_changed *) ac.contract_data;
 			adc->wwpn = lp->port_wwn;
 			adc->port = lp->portid;
 			adc->target = dbidx;
 			adc->arrived = 0;
 			xpt_async(AC_CONTRACT, fc->path, &ac);
 		}
 	}
 
 	isp_unfreeze_loopdown(isp, chan);
 	fc->loop_down_time = 0;
 }
 
 /*
  * Per-channel FC state thread; 'arg' is the channel's isp_fc.
  *
  * Until the driver is marked as exiting, the thread keeps evaluating
  * the FC loop state with the softc lock held, decides whether the loop
  * is up, dead, or worth another look later, and then sleeps on the
  * channel structure for the chosen number of seconds.  A sleep time of
  * zero leaves msleep() without a timeout.
  */
 static void
 isp_kthread(void *arg)
 {
 	struct isp_fc *fc = arg;
 	ispsoftc_t *isp = fc->isp;
 	int chan = fc - isp->isp_osinfo.pc.fc;
 	int lb, lim, d;
 	int slp = 0;
 
 	ISP_LOCK(isp);
 	while (isp->isp_osinfo.is_exiting == 0) {
 		isp_prt(isp, ISP_LOG_SANCFG|ISP_LOGDEBUG0,
 		    "Chan %d Checking FC state", chan);
 		lb = isp_fc_runstate(isp, chan, 250000);
 		isp_prt(isp, ISP_LOG_SANCFG|ISP_LOGDEBUG0,
 		    "Chan %d FC got to %s state", chan,
 		    isp_fc_loop_statename(lb));
 
 		/*
 		 * A ready loop or a negative result is acted on at once.
 		 * Otherwise the retry delay grows with the time the loop
 		 * has been down.  Once that time reaches the limit
 		 * (quickboot time if the loop was never seen up, the
 		 * loop-down limit if it was), we give up and go to sleep
 		 * until loop comes up.
 		 */
 		if (lb == LOOP_READY || lb < 0) {
 			slp = 0;
 		} else {
 			lim = (fc->loop_seen_once == 0) ?
 			    isp_quickboot_time : fc->loop_down_limit;
 			d = time_uptime - fc->loop_down_time;
 			if (d >= lim) {
 				slp = 0;
 			} else if (d < 10) {
 				slp = 1;
 			} else if (d < 30) {
 				slp = 5;
 			} else if (d < 60) {
 				slp = 10;
 			} else if (d < 120) {
 				slp = 20;
 			} else {
 				slp = 30;
 			}
 		}
 
 		/* No retry planned: the loop is either up or written off. */
 		if (slp == 0 && lb == LOOP_READY) {
 			isp_loop_up(isp, chan);
 		} else if (slp == 0) {
 			isp_loop_dead(isp, chan);
 		}
 
 		isp_prt(isp, ISP_LOG_SANCFG|ISP_LOGDEBUG0,
 		    "Chan %d sleep for %d seconds", chan, slp);
 		msleep(fc, &isp->isp_lock, PRIBIO, "ispf", slp * hz);
 	}
 	fc->num_threads--;
 	ISP_UNLOCK(isp);
 	kthread_exit();
 }
 
 #ifdef	ISP_TARGET_MODE
 /*
  * XPT_ABORT of an ATIO CCB; the CCB to abort is ccb->cab.abort_ccb.
  *
  * If the ATIO is still sitting on the LUN's free list it is removed and
  * completed with CAM_REQ_ABORTED.  Otherwise, if an adjunct exists for
  * its tag, the exchange is terminated in firmware (24XX only, and only
  * if the adjunct is not already dead) and the adjunct is released.  The
  * abort CCB's status is CAM_REQ_CMP in both cases and CAM_UA_ABORT when
  * nothing was found.
  */
 static void
 isp_abort_atio(ispsoftc_t *isp, union ccb *ccb)
 {
 	union ccb *accb = ccb->cab.abort_ccb;
 	atio_private_data_t *atp;
 	struct ccb_hdr *queued;
 	tstate_t *tptr;
 
 	/* Look through the queued (free) ATIOs first. */
 	tptr = get_lun_statep(isp, XS_CHANNEL(accb), XS_LUN(accb));
 	if (tptr != NULL) {
 		SLIST_FOREACH(queued, &tptr->atios, sim_links.sle) {
 			if (queued == &accb->ccb_h) {
 				SLIST_REMOVE(&tptr->atios, queued, ccb_hdr, sim_links.sle);
 				ISP_PATH_PRT(isp, ISP_LOGTDEBUG2, queued->path,
 				    "Abort FREE ATIO\n");
 				accb->ccb_h.status = CAM_REQ_ABORTED;
 				xpt_done(accb);
 				ccb->ccb_h.status = CAM_REQ_CMP;
 				return;
 			}
 		}
 	}
 
 	/* Then among the running ones. */
 	atp = isp_find_atpd(isp, XS_CHANNEL(accb), accb->atio.tag_id);
 	if (atp == NULL) {
 		ccb->ccb_h.status = CAM_UA_ABORT;
 		return;
 	}
 
 	if (!atp->dead && IS_24XX(isp)) {
 		/* Ask the firmware to terminate the exchange. */
 		uint8_t storage[QENTRY_LEN];
 		ct7_entry_t *term = (ct7_entry_t *) storage;
 
 		ISP_MEMZERO(term, sizeof (ct7_entry_t));
 		term->ct_header.rqs_entry_type = RQSTYPE_CTIO7;
 		term->ct_header.rqs_entry_count = 1;
 		term->ct_nphdl = atp->nphdl;
 		term->ct_rxid = atp->tag;
 		term->ct_iid_lo = atp->sid;
 		term->ct_iid_hi = atp->sid >> 16;
 		term->ct_oxid = atp->oxid;
 		term->ct_vpidx = XS_CHANNEL(accb);
 		term->ct_flags = CT7_NOACK|CT7_TERMINATE;
 		isp_target_put_entry(isp, term);
 	}
 	isp_put_atpd(isp, XS_CHANNEL(accb), atp);
 	ccb->ccb_h.status = CAM_REQ_CMP;
 }
 
 /*
  * XPT_ABORT of an immediate notify CCB; the CCB to abort is
  * ccb->cab.abort_ccb.
  *
  * If the INOT is still sitting on the LUN's free list it is removed and
  * completed with CAM_REQ_ABORTED.  Otherwise, if private notify data
  * exists for its tag/sequence pair, any outstanding acknowledge is sent
  * and the private data is released.  The abort CCB's status is
  * CAM_REQ_CMP in both cases and CAM_UA_ABORT when nothing was found.
  */
 static void
 isp_abort_inot(ispsoftc_t *isp, union ccb *ccb)
 {
 	union ccb *accb = ccb->cab.abort_ccb;
 	inot_private_data_t *ntp;
 	struct ccb_hdr *queued;
 	tstate_t *tptr;
 
 	/* Look through the queued (free) INOTs first. */
 	tptr = get_lun_statep(isp, XS_CHANNEL(accb), XS_LUN(accb));
 	if (tptr != NULL) {
 		SLIST_FOREACH(queued, &tptr->inots, sim_links.sle) {
 			if (queued == &accb->ccb_h) {
 				SLIST_REMOVE(&tptr->inots, queued, ccb_hdr, sim_links.sle);
 				ISP_PATH_PRT(isp, ISP_LOGTDEBUG2, queued->path,
 				    "Abort FREE INOT\n");
 				accb->ccb_h.status = CAM_REQ_ABORTED;
 				xpt_done(accb);
 				ccb->ccb_h.status = CAM_REQ_CMP;
 				return;
 			}
 		}
 	}
 
 	/* Then among the running ones. */
 	ntp = isp_find_ntpd(isp, XS_CHANNEL(accb), accb->cin1.tag_id, accb->cin1.seq_id);
 	if (ntp == NULL) {
 		ccb->ccb_h.status = CAM_UA_ABORT;
 		return;
 	}
 
 	if (ntp->nt.nt_need_ack) {
 		isp_async(isp, ISPASYNC_TARGET_NOTIFY_ACK,
 		    ntp->nt.nt_lreserved);
 	}
 	isp_put_ntpd(isp, XS_CHANNEL(accb), ntp);
 	ccb->ccb_h.status = CAM_REQ_CMP;
 }
 #endif
 
 static void
 isp_action(struct cam_sim *sim, union ccb *ccb)
 {
 	int bus, tgt, error;
 	ispsoftc_t *isp;
 	struct ccb_trans_settings *cts;
 	sbintime_t ts;
 
 	CAM_DEBUG(ccb->ccb_h.path, CAM_DEBUG_TRACE, ("isp_action\n"));
 
 	isp = (ispsoftc_t *)cam_sim_softc(sim);
 	ISP_ASSERT_LOCKED(isp);
 	bus = cam_sim_bus(sim);
 	isp_prt(isp, ISP_LOGDEBUG2, "isp_action code %x", ccb->ccb_h.func_code);
 	ISP_PCMD(ccb) = NULL;
 
 	switch (ccb->ccb_h.func_code) {
 	case XPT_SCSI_IO:	/* Execute the requested I/O operation */
 		/*
 		 * Do a couple of preliminary checks...
 		 */
 		if ((ccb->ccb_h.flags & CAM_CDB_POINTER) != 0) {
 			if ((ccb->ccb_h.flags & CAM_CDB_PHYS) != 0) {
 				ccb->ccb_h.status = CAM_REQ_INVALID;
 				isp_done((struct ccb_scsiio *) ccb);
 				break;
 			}
 		}
 		ccb->csio.req_map = NULL;
 #ifdef	DIAGNOSTIC
 		if (ccb->ccb_h.target_id >= ISP_MAX_TARGETS(isp)) {
 			xpt_print(ccb->ccb_h.path, "invalid target\n");
 			ccb->ccb_h.status = CAM_PATH_INVALID;
 		} else if (ISP_MAX_LUNS(isp) > 0 &&
 		    ccb->ccb_h.target_lun >= ISP_MAX_LUNS(isp)) {
 			xpt_print(ccb->ccb_h.path, "invalid lun\n");
 			ccb->ccb_h.status = CAM_PATH_INVALID;
 		}
 		if (ccb->ccb_h.status == CAM_PATH_INVALID) {
 			xpt_done(ccb);
 			break;
 		}
 #endif
 		ccb->csio.scsi_status = SCSI_STATUS_OK;
 		if (isp_get_pcmd(isp, ccb)) {
 			isp_prt(isp, ISP_LOGWARN, "out of PCMDs");
 			cam_freeze_devq(ccb->ccb_h.path);
 			cam_release_devq(ccb->ccb_h.path, RELSIM_RELEASE_AFTER_TIMEOUT, 0, 250, 0);
 			ccb->ccb_h.status = CAM_REQUEUE_REQ;
 			xpt_done(ccb);
 			break;
 		}
 		error = isp_start((XS_T *) ccb);
 		switch (error) {
 		case CMD_QUEUED:
 			ccb->ccb_h.status |= CAM_SIM_QUEUED;
 			if (ccb->ccb_h.timeout == CAM_TIME_INFINITY)
 				break;
 			/* Give firmware extra 10s to handle timeout. */
 			ts = SBT_1MS * ccb->ccb_h.timeout + 10 * SBT_1S;
 			callout_reset_sbt(&PISP_PCMD(ccb)->wdog, ts, 0,
 			    isp_watchdog, ccb, 0);
 			break;
 		case CMD_RQLATER:
 			isp_prt(isp, ISP_LOGDEBUG0, "%d.%jx retry later",
 			    XS_TGT(ccb), (uintmax_t)XS_LUN(ccb));
 			cam_freeze_devq(ccb->ccb_h.path);
 			cam_release_devq(ccb->ccb_h.path, RELSIM_RELEASE_AFTER_TIMEOUT, 0, 1000, 0);
 			ccb->ccb_h.status = CAM_REQUEUE_REQ;
 			isp_free_pcmd(isp, ccb);
 			xpt_done(ccb);
 			break;
 		case CMD_EAGAIN:
 			isp_free_pcmd(isp, ccb);
 			cam_freeze_devq(ccb->ccb_h.path);
 			cam_release_devq(ccb->ccb_h.path, RELSIM_RELEASE_AFTER_TIMEOUT, 0, 100, 0);
 			ccb->ccb_h.status = CAM_REQUEUE_REQ;
 			xpt_done(ccb);
 			break;
 		case CMD_COMPLETE:
 			isp_done((struct ccb_scsiio *) ccb);
 			break;
 		default:
 			isp_prt(isp, ISP_LOGERR, "What's this? 0x%x at %d in file %s", error, __LINE__, __FILE__);
 			ccb->ccb_h.status = CAM_REQUEUE_REQ;
 			isp_free_pcmd(isp, ccb);
 			xpt_done(ccb);
 		}
 		break;
 
 #ifdef	ISP_TARGET_MODE
 	case XPT_EN_LUN:		/* Enable/Disable LUN as a target */
 		if (ccb->cel.enable) {
 			isp_enable_lun(isp, ccb);
 		} else {
 			isp_disable_lun(isp, ccb);
 		}
 		break;
 	case XPT_IMMEDIATE_NOTIFY:	/* Add Immediate Notify Resource */
 	case XPT_ACCEPT_TARGET_IO:	/* Add Accept Target IO Resource */
 	{
 		tstate_t *tptr = get_lun_statep(isp, XS_CHANNEL(ccb), ccb->ccb_h.target_lun);
 		if (tptr == NULL) {
 			const char *str;
 
 			if (ccb->ccb_h.func_code == XPT_IMMEDIATE_NOTIFY)
 				str = "XPT_IMMEDIATE_NOTIFY";
 			else
 				str = "XPT_ACCEPT_TARGET_IO";
 			ISP_PATH_PRT(isp, ISP_LOGWARN, ccb->ccb_h.path,
 			    "%s: no state pointer found for %s\n",
 			    __func__, str);
 			ccb->ccb_h.status = CAM_DEV_NOT_THERE;
 			xpt_done(ccb);
 			break;
 		}
 		ccb->ccb_h.spriv_field0 = 0;
 		ccb->ccb_h.spriv_ptr1 = isp;
 
 		if (ccb->ccb_h.func_code == XPT_ACCEPT_TARGET_IO) {
 			ccb->atio.tag_id = 0;
 			SLIST_INSERT_HEAD(&tptr->atios, &ccb->ccb_h, sim_links.sle);
 			ISP_PATH_PRT(isp, ISP_LOGTDEBUG2, ccb->ccb_h.path,
 			    "Put FREE ATIO\n");
 		} else if (ccb->ccb_h.func_code == XPT_IMMEDIATE_NOTIFY) {
 			ccb->cin1.seq_id = ccb->cin1.tag_id = 0;
 			SLIST_INSERT_HEAD(&tptr->inots, &ccb->ccb_h, sim_links.sle);
 			ISP_PATH_PRT(isp, ISP_LOGTDEBUG2, ccb->ccb_h.path,
 			    "Put FREE INOT\n");
 		}
 		ccb->ccb_h.status = CAM_REQ_INPROG;
 		break;
 	}
 	case XPT_NOTIFY_ACKNOWLEDGE:		/* notify ack */
 	{
 		inot_private_data_t *ntp;
 
 		/*
 		 * XXX: Because we cannot guarantee that the path information in the notify acknowledge ccb
 		 * XXX: matches that for the immediate notify, we have to *search* for the notify structure
 		 */
 		/*
 		 * All the relevant path information is in the associated immediate notify
 		 */
 		ISP_PATH_PRT(isp, ISP_LOGTDEBUG0, ccb->ccb_h.path, "%s: [0x%x] NOTIFY ACKNOWLEDGE for 0x%x seen\n", __func__, ccb->cna2.tag_id, ccb->cna2.seq_id);
 		ntp = isp_find_ntpd(isp, XS_CHANNEL(ccb), ccb->cna2.tag_id, ccb->cna2.seq_id);
 		if (ntp == NULL) {
 			ISP_PATH_PRT(isp, ISP_LOGWARN, ccb->ccb_h.path, "%s: [0x%x] XPT_NOTIFY_ACKNOWLEDGE of 0x%x cannot find ntp private data\n", __func__,
 			     ccb->cna2.tag_id, ccb->cna2.seq_id);
 			ccb->ccb_h.status = CAM_DEV_NOT_THERE;
 			xpt_done(ccb);
 			break;
 		}
 		if (isp_handle_platform_target_notify_ack(isp, &ntp->nt,
 		    (ccb->ccb_h.flags & CAM_SEND_STATUS) ? ccb->cna2.arg : 0)) {
 			cam_freeze_devq(ccb->ccb_h.path);
 			cam_release_devq(ccb->ccb_h.path, RELSIM_RELEASE_AFTER_TIMEOUT, 0, 1000, 0);
 			ccb->ccb_h.status &= ~CAM_STATUS_MASK;
 			ccb->ccb_h.status |= CAM_REQUEUE_REQ;
 			break;
 		}
 		isp_put_ntpd(isp, XS_CHANNEL(ccb), ntp);
 		ccb->ccb_h.status = CAM_REQ_CMP;
 		ISP_PATH_PRT(isp, ISP_LOGTDEBUG0, ccb->ccb_h.path, "%s: [0x%x] calling xpt_done for tag 0x%x\n", __func__, ccb->cna2.tag_id, ccb->cna2.seq_id);
 		xpt_done(ccb);
 		break;
 	}
 	case XPT_CONT_TARGET_IO:
 		isp_target_start_ctio(isp, ccb, FROM_CAM);
 		break;
 #endif
 	case XPT_RESET_DEV:		/* BDR the specified SCSI device */
 		tgt = ccb->ccb_h.target_id;
 		tgt |= (bus << 16);
 
 		error = isp_control(isp, ISPCTL_RESET_DEV, bus, tgt);
 		if (error) {
 			ccb->ccb_h.status = CAM_REQ_CMP_ERR;
 		} else {
 			/*
 			 * If we have a FC device, reset the Command
 			 * Reference Number, because the target will expect
 			 * that we re-start the CRN at 1 after a reset.
 			 */
 			if (IS_FC(isp))
 				isp_fcp_reset_crn(isp, bus, tgt, /*tgt_set*/ 1);
 
 			ccb->ccb_h.status = CAM_REQ_CMP;
 		}
 		xpt_done(ccb);
 		break;
 	case XPT_ABORT:			/* Abort the specified CCB */
 	{
 		union ccb *accb = ccb->cab.abort_ccb;
 		switch (accb->ccb_h.func_code) {
 #ifdef	ISP_TARGET_MODE
 		case XPT_ACCEPT_TARGET_IO:
 			isp_abort_atio(isp, ccb);
 			break;
 		case XPT_IMMEDIATE_NOTIFY:
 			isp_abort_inot(isp, ccb);
 			break;
 #endif
 		case XPT_SCSI_IO:
 			error = isp_control(isp, ISPCTL_ABORT_CMD, accb);
 			if (error) {
 				ccb->ccb_h.status = CAM_UA_ABORT;
 			} else {
 				ccb->ccb_h.status = CAM_REQ_CMP;
 			}
 			break;
 		default:
 			ccb->ccb_h.status = CAM_REQ_INVALID;
 			break;
 		}
 		/*
 		 * This is not a queued CCB, so the caller expects it to be
 		 * complete when control is returned.
 		 */
 		break;
 	}
 #define	IS_CURRENT_SETTINGS(c)	(c->type == CTS_TYPE_CURRENT_SETTINGS)
 	case XPT_SET_TRAN_SETTINGS:	/* Nexus Settings */
 		cts = &ccb->cts;
 		if (!IS_CURRENT_SETTINGS(cts)) {
 			ccb->ccb_h.status = CAM_REQ_INVALID;
 			xpt_done(ccb);
 			break;
 		}
 		tgt = cts->ccb_h.target_id;
 		if (IS_SCSI(isp)) {
 			struct ccb_trans_settings_scsi *scsi = &cts->proto_specific.scsi;
 			struct ccb_trans_settings_spi *spi = &cts->xport_specific.spi;
 			sdparam *sdp = SDPARAM(isp, bus);
 			uint16_t *dptr;
 
 			if (spi->valid == 0 && scsi->valid == 0) {
 				ccb->ccb_h.status = CAM_REQ_CMP;
 				xpt_done(ccb);
 				break;
 			}
 
 			/*
 			 * We always update (internally) from goal_flags
 			 * so any request to change settings just gets
 			 * vectored to that location.
 			 */
 			dptr = &sdp->isp_devparam[tgt].goal_flags;
 
 			if ((spi->valid & CTS_SPI_VALID_DISC) != 0) {
 				if ((spi->flags & CTS_SPI_FLAGS_DISC_ENB) != 0)
 					*dptr |= DPARM_DISC;
 				else
 					*dptr &= ~DPARM_DISC;
 			}
 
 			if ((scsi->valid & CTS_SCSI_VALID_TQ) != 0) {
 				if ((scsi->flags & CTS_SCSI_FLAGS_TAG_ENB) != 0)
 					*dptr |= DPARM_TQING;
 				else
 					*dptr &= ~DPARM_TQING;
 			}
 
 			if ((spi->valid & CTS_SPI_VALID_BUS_WIDTH) != 0) {
 				if (spi->bus_width == MSG_EXT_WDTR_BUS_16_BIT)
 					*dptr |= DPARM_WIDE;
 				else
 					*dptr &= ~DPARM_WIDE;
 			}
 
 			/*
 			 * XXX: FIX ME
 			 */
 			if ((spi->valid & CTS_SPI_VALID_SYNC_OFFSET) && (spi->valid & CTS_SPI_VALID_SYNC_RATE) && (spi->sync_period && spi->sync_offset)) {
 				*dptr |= DPARM_SYNC;
 				/*
 				 * XXX: CHECK FOR LEGALITY
 				 */
 				sdp->isp_devparam[tgt].goal_period = spi->sync_period;
 				sdp->isp_devparam[tgt].goal_offset = spi->sync_offset;
 			} else {
 				*dptr &= ~DPARM_SYNC;
 			}
 			isp_prt(isp, ISP_LOGDEBUG0, "SET (%d.%d.%jx) to flags %x off %x per %x", bus, tgt, (uintmax_t)cts->ccb_h.target_lun, sdp->isp_devparam[tgt].goal_flags,
 			    sdp->isp_devparam[tgt].goal_offset, sdp->isp_devparam[tgt].goal_period);
 			sdp->isp_devparam[tgt].dev_update = 1;
 			sdp->update = 1;
 		}
 		ccb->ccb_h.status = CAM_REQ_CMP;
 		xpt_done(ccb);
 		break;
 	case XPT_GET_TRAN_SETTINGS:
 		cts = &ccb->cts;
 		tgt = cts->ccb_h.target_id;
 		if (IS_FC(isp)) {
 			fcparam *fcp = FCPARAM(isp, bus);
 			struct ccb_trans_settings_scsi *scsi = &cts->proto_specific.scsi;
 			struct ccb_trans_settings_fc *fc = &cts->xport_specific.fc;
 
 			cts->protocol = PROTO_SCSI;
 			cts->protocol_version = SCSI_REV_2;
 			cts->transport = XPORT_FC;
 			cts->transport_version = 0;
 
 			scsi->valid = CTS_SCSI_VALID_TQ;
 			scsi->flags = CTS_SCSI_FLAGS_TAG_ENB;
 			fc->valid = CTS_FC_VALID_SPEED;
 			fc->bitrate = 100000;
 			fc->bitrate *= fcp->isp_gbspeed;
 			if (tgt < MAX_FC_TARG) {
 				fcportdb_t *lp = &fcp->portdb[tgt];
 				fc->wwnn = lp->node_wwn;
 				fc->wwpn = lp->port_wwn;
 				fc->port = lp->portid;
 				fc->valid |= CTS_FC_VALID_WWNN | CTS_FC_VALID_WWPN | CTS_FC_VALID_PORT;
 			}
 		} else {
 			struct ccb_trans_settings_scsi *scsi = &cts->proto_specific.scsi;
 			struct ccb_trans_settings_spi *spi = &cts->xport_specific.spi;
 			sdparam *sdp = SDPARAM(isp, bus);
 			uint16_t dval, pval, oval;
 
 			if (IS_CURRENT_SETTINGS(cts)) {
 				sdp->isp_devparam[tgt].dev_refresh = 1;
 				sdp->update = 1;
 				(void) isp_control(isp, ISPCTL_UPDATE_PARAMS, bus);
 				dval = sdp->isp_devparam[tgt].actv_flags;
 				oval = sdp->isp_devparam[tgt].actv_offset;
 				pval = sdp->isp_devparam[tgt].actv_period;
 			} else {
 				dval = sdp->isp_devparam[tgt].nvrm_flags;
 				oval = sdp->isp_devparam[tgt].nvrm_offset;
 				pval = sdp->isp_devparam[tgt].nvrm_period;
 			}
 
 			cts->protocol = PROTO_SCSI;
 			cts->protocol_version = SCSI_REV_2;
 			cts->transport = XPORT_SPI;
 			cts->transport_version = 2;
 
 			spi->valid = 0;
 			scsi->valid = 0;
 			spi->flags = 0;
 			scsi->flags = 0;
 			if (dval & DPARM_DISC) {
 				spi->flags |= CTS_SPI_FLAGS_DISC_ENB;
 			}
 			if ((dval & DPARM_SYNC) && oval && pval) {
 				spi->sync_offset = oval;
 				spi->sync_period = pval;
 			} else {
 				spi->sync_offset = 0;
 				spi->sync_period = 0;
 			}
 			spi->valid |= CTS_SPI_VALID_SYNC_OFFSET;
 			spi->valid |= CTS_SPI_VALID_SYNC_RATE;
 			spi->valid |= CTS_SPI_VALID_BUS_WIDTH;
 			if (dval & DPARM_WIDE) {
 				spi->bus_width = MSG_EXT_WDTR_BUS_16_BIT;
 			} else {
 				spi->bus_width = MSG_EXT_WDTR_BUS_8_BIT;
 			}
 			if (cts->ccb_h.target_lun != CAM_LUN_WILDCARD) {
 				scsi->valid = CTS_SCSI_VALID_TQ;
 				if (dval & DPARM_TQING) {
 					scsi->flags |= CTS_SCSI_FLAGS_TAG_ENB;
 				}
 				spi->valid |= CTS_SPI_VALID_DISC;
 			}
 			isp_prt(isp, ISP_LOGDEBUG0, "GET %s (%d.%d.%jx) to flags %x off %x per %x", IS_CURRENT_SETTINGS(cts)? "ACTIVE" : "NVRAM",
 			    bus, tgt, (uintmax_t)cts->ccb_h.target_lun, dval, oval, pval);
 		}
 		ccb->ccb_h.status = CAM_REQ_CMP;
 		xpt_done(ccb);
 		break;
 
 	case XPT_CALC_GEOMETRY:
 		cam_calc_geometry(&ccb->ccg, 1);
 		xpt_done(ccb);
 		break;
 
 	case XPT_RESET_BUS:		/* Reset the specified bus */
 		error = isp_control(isp, ISPCTL_RESET_BUS, bus);
 		if (error) {
 			ccb->ccb_h.status = CAM_REQ_CMP_ERR;
 			xpt_done(ccb);
 			break;
 		}
 		if (bootverbose) {
 			xpt_print(ccb->ccb_h.path, "reset bus on channel %d\n", bus);
 		}
 		if (IS_FC(isp)) {
 			xpt_async(AC_BUS_RESET, ISP_FC_PC(isp, bus)->path, 0);
 		} else {
 			xpt_async(AC_BUS_RESET, ISP_SPI_PC(isp, bus)->path, 0);
 		}
 		ccb->ccb_h.status = CAM_REQ_CMP;
 		xpt_done(ccb);
 		break;
 
 	case XPT_TERM_IO:		/* Terminate the I/O process */
 		ccb->ccb_h.status = CAM_REQ_INVALID;
 		xpt_done(ccb);
 		break;
 
 	case XPT_SET_SIM_KNOB:		/* Set SIM knobs */
 	{
 		struct ccb_sim_knob *kp = &ccb->knob;
 		fcparam *fcp;
 
 		if (!IS_FC(isp)) {
 			ccb->ccb_h.status = CAM_REQ_INVALID;
 			xpt_done(ccb);
 			break;
 		}
 
 		fcp = FCPARAM(isp, bus);
 
 		if (kp->xport_specific.fc.valid & KNOB_VALID_ADDRESS) {
 			fcp->isp_wwnn = ISP_FC_PC(isp, bus)->def_wwnn = kp->xport_specific.fc.wwnn;
 			fcp->isp_wwpn = ISP_FC_PC(isp, bus)->def_wwpn = kp->xport_specific.fc.wwpn;
 			isp_prt(isp, ISP_LOGALL, "Setting Channel %d wwns to 0x%jx 0x%jx", bus, fcp->isp_wwnn, fcp->isp_wwpn);
 		}
 		ccb->ccb_h.status = CAM_REQ_CMP;
 		if (kp->xport_specific.fc.valid & KNOB_VALID_ROLE) {
 			int rchange = 0;
 			int newrole = 0;
 
 			switch (kp->xport_specific.fc.role) {
 			case KNOB_ROLE_NONE:
 				if (fcp->role != ISP_ROLE_NONE) {
 					rchange = 1;
 					newrole = ISP_ROLE_NONE;
 				}
 				break;
 			case KNOB_ROLE_TARGET:
 				if (fcp->role != ISP_ROLE_TARGET) {
 					rchange = 1;
 					newrole = ISP_ROLE_TARGET;
 				}
 				break;
 			case KNOB_ROLE_INITIATOR:
 				if (fcp->role != ISP_ROLE_INITIATOR) {
 					rchange = 1;
 					newrole = ISP_ROLE_INITIATOR;
 				}
 				break;
 			case KNOB_ROLE_BOTH:
 				if (fcp->role != ISP_ROLE_BOTH) {
 					rchange = 1;
 					newrole = ISP_ROLE_BOTH;
 				}
 				break;
 			}
 			if (rchange) {
 				ISP_PATH_PRT(isp, ISP_LOGCONFIG, ccb->ccb_h.path, "changing role on from %d to %d\n", fcp->role, newrole);
 				if (isp_control(isp, ISPCTL_CHANGE_ROLE,
 				    bus, newrole) != 0) {
 					ccb->ccb_h.status = CAM_REQ_CMP_ERR;
 					xpt_done(ccb);
 					break;
 				}
 			}
 		}
 		xpt_done(ccb);
 		break;
 	}
 	case XPT_GET_SIM_KNOB_OLD:	/* Get SIM knobs -- compat value */
 	case XPT_GET_SIM_KNOB:		/* Get SIM knobs */
 	{
 		struct ccb_sim_knob *kp = &ccb->knob;
 
 		if (IS_FC(isp)) {
 			fcparam *fcp;
 
 			fcp = FCPARAM(isp, bus);
 
 			kp->xport_specific.fc.wwnn = fcp->isp_wwnn;
 			kp->xport_specific.fc.wwpn = fcp->isp_wwpn;
 			switch (fcp->role) {
 			case ISP_ROLE_NONE:
 				kp->xport_specific.fc.role = KNOB_ROLE_NONE;
 				break;
 			case ISP_ROLE_TARGET:
 				kp->xport_specific.fc.role = KNOB_ROLE_TARGET;
 				break;
 			case ISP_ROLE_INITIATOR:
 				kp->xport_specific.fc.role = KNOB_ROLE_INITIATOR;
 				break;
 			case ISP_ROLE_BOTH:
 				kp->xport_specific.fc.role = KNOB_ROLE_BOTH;
 				break;
 			}
 			kp->xport_specific.fc.valid = KNOB_VALID_ADDRESS | KNOB_VALID_ROLE;
 			ccb->ccb_h.status = CAM_REQ_CMP;
 		} else {
 			ccb->ccb_h.status = CAM_REQ_INVALID;
 		}
 		xpt_done(ccb);
 		break;
 	}
 	case XPT_PATH_INQ:		/* Path routing inquiry */
 	{
 		struct ccb_pathinq *cpi = &ccb->cpi;
 
 		cpi->version_num = 1;
 #ifdef	ISP_TARGET_MODE
 		if (IS_FC(isp) && ISP_CAP_TMODE(isp) && ISP_CAP_SCCFW(isp))
 			cpi->target_sprt = PIT_PROCESSOR | PIT_DISCONNECT | PIT_TERM_IO;
 		else
 #endif
 			cpi->target_sprt = 0;
 		cpi->hba_eng_cnt = 0;
 		cpi->max_target = ISP_MAX_TARGETS(isp) - 1;
 		cpi->max_lun = ISP_MAX_LUNS(isp) == 0 ?
 		    255 : ISP_MAX_LUNS(isp) - 1;
 		cpi->bus_id = cam_sim_bus(sim);
 		if (sizeof (bus_size_t) > 4)
 			cpi->maxio = (ISP_NSEG64_MAX - 1) * PAGE_SIZE;
 		else
 			cpi->maxio = (ISP_NSEG_MAX - 1) * PAGE_SIZE;
 
 		if (IS_FC(isp)) {
 			fcparam *fcp = FCPARAM(isp, bus);
 
 			cpi->hba_misc = PIM_NOBUSRESET | PIM_UNMAPPED;
 			cpi->hba_misc |= PIM_EXTLUNS | PIM_NOSCAN;
 
 			/*
 			 * Because our loop ID can shift from time to time,
 			 * make our initiator ID out of range of our bus.
 			 */
 			cpi->initiator_id = cpi->max_target + 1;
 
 			/*
 			 * Set base transfer capabilities for Fibre Channel, for this HBA.
 			 */
 			if (IS_25XX(isp)) {
 				cpi->base_transfer_speed = 8000000;
 			} else if (IS_24XX(isp)) {
 				cpi->base_transfer_speed = 4000000;
 			} else if (IS_23XX(isp)) {
 				cpi->base_transfer_speed = 2000000;
 			} else {
 				cpi->base_transfer_speed = 1000000;
 			}
 			cpi->hba_inquiry = PI_TAG_ABLE;
 			cpi->transport = XPORT_FC;
 			cpi->transport_version = 0;
 			cpi->xport_specific.fc.wwnn = fcp->isp_wwnn;
 			cpi->xport_specific.fc.wwpn = fcp->isp_wwpn;
 			cpi->xport_specific.fc.port = fcp->isp_portid;
 			cpi->xport_specific.fc.bitrate = fcp->isp_gbspeed * 1000;
 		} else {
 			sdparam *sdp = SDPARAM(isp, bus);
 			cpi->hba_inquiry = PI_SDTR_ABLE|PI_TAG_ABLE|PI_WIDE_16;
 			cpi->hba_misc = PIM_UNMAPPED;
 			cpi->initiator_id = sdp->isp_initiator_id;
 			cpi->base_transfer_speed = 3300;
 			cpi->transport = XPORT_SPI;
 			cpi->transport_version = 2;
 		}
 		cpi->protocol = PROTO_SCSI;
 		cpi->protocol_version = SCSI_REV_2;
 		strlcpy(cpi->sim_vid, "FreeBSD", SIM_IDLEN);
 		strlcpy(cpi->hba_vid, "Qlogic", HBA_IDLEN);
 		strlcpy(cpi->dev_name, cam_sim_name(sim), DEV_IDLEN);
 		cpi->unit_number = cam_sim_unit(sim);
 		cpi->ccb_h.status = CAM_REQ_CMP;
 		xpt_done(ccb);
 		break;
 	}
 	default:
 		ccb->ccb_h.status = CAM_REQ_INVALID;
 		xpt_done(ccb);
 		break;
 	}
 }
 
 void
 isp_done(XS_T *sccb)
 {
 	ispsoftc_t *isp = XS_ISP(sccb);
 	uint32_t status;
 
 	if (XS_NOERR(sccb))
 		XS_SETERR(sccb, CAM_REQ_CMP);
 
 	if ((sccb->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP && (sccb->scsi_status != SCSI_STATUS_OK)) {
 		sccb->ccb_h.status &= ~CAM_STATUS_MASK;
 		if ((sccb->scsi_status == SCSI_STATUS_CHECK_COND) && (sccb->ccb_h.status & CAM_AUTOSNS_VALID) == 0) {
 			sccb->ccb_h.status |= CAM_AUTOSENSE_FAIL;
 		} else {
 			sccb->ccb_h.status |= CAM_SCSI_STATUS_ERROR;
 		}
 	}
 
 	sccb->ccb_h.status &= ~CAM_SIM_QUEUED;
 	status = sccb->ccb_h.status & CAM_STATUS_MASK;
 	if (status != CAM_REQ_CMP &&
 	    (sccb->ccb_h.status & CAM_DEV_QFRZN) == 0) {
 		sccb->ccb_h.status |= CAM_DEV_QFRZN;
 		xpt_freeze_devq(sccb->ccb_h.path, 1);
 	}
 
 	if (ISP_PCMD(sccb)) {
 		if (callout_active(&PISP_PCMD(sccb)->wdog))
 			callout_stop(&PISP_PCMD(sccb)->wdog);
 		isp_free_pcmd(isp, (union ccb *) sccb);
 	}
 	xpt_done((union ccb *) sccb);
 }
 
 void
 isp_async(ispsoftc_t *isp, ispasync_t cmd, ...)
 {
 	int bus;
 	static const char prom[] = "Chan %d [%d] WWPN 0x%16jx PortID 0x%06x handle 0x%x %s %s";
 	char buf[64];
 	char *msg = NULL;
 	target_id_t tgt = 0;
 	fcportdb_t *lp;
 	struct isp_fc *fc;
 	struct cam_path *tmppath;
 	struct ac_contract ac;
 	struct ac_device_changed *adc;
 	va_list ap;
 
 	switch (cmd) {
 	case ISPASYNC_NEW_TGT_PARAMS:
 	{
 		struct ccb_trans_settings_scsi *scsi;
 		struct ccb_trans_settings_spi *spi;
 		int flags, tgt;
 		sdparam *sdp;
 		struct ccb_trans_settings cts;
 
 		memset(&cts, 0, sizeof (struct ccb_trans_settings));
 
 		va_start(ap, cmd);
 		bus = va_arg(ap, int);
 		tgt = va_arg(ap, int);
 		va_end(ap);
 		sdp = SDPARAM(isp, bus);
 
 		if (xpt_create_path(&tmppath, NULL, cam_sim_path(ISP_SPI_PC(isp, bus)->sim), tgt, CAM_LUN_WILDCARD) != CAM_REQ_CMP) {
 			isp_prt(isp, ISP_LOGWARN, "isp_async cannot make temp path for %d.%d", tgt, bus);
 			break;
 		}
 		flags = sdp->isp_devparam[tgt].actv_flags;
 		cts.type = CTS_TYPE_CURRENT_SETTINGS;
 		cts.protocol = PROTO_SCSI;
 		cts.transport = XPORT_SPI;
 
 		scsi = &cts.proto_specific.scsi;
 		spi = &cts.xport_specific.spi;
 
 		if (flags & DPARM_TQING) {
 			scsi->valid |= CTS_SCSI_VALID_TQ;
 			scsi->flags |= CTS_SCSI_FLAGS_TAG_ENB;
 		}
 
 		if (flags & DPARM_DISC) {
 			spi->valid |= CTS_SPI_VALID_DISC;
 			spi->flags |= CTS_SPI_FLAGS_DISC_ENB;
 		}
 		spi->flags |= CTS_SPI_VALID_BUS_WIDTH;
 		if (flags & DPARM_WIDE) {
 			spi->bus_width = MSG_EXT_WDTR_BUS_16_BIT;
 		} else {
 			spi->bus_width = MSG_EXT_WDTR_BUS_8_BIT;
 		}
 		if (flags & DPARM_SYNC) {
 			spi->valid |= CTS_SPI_VALID_SYNC_RATE;
 			spi->valid |= CTS_SPI_VALID_SYNC_OFFSET;
 			spi->sync_period = sdp->isp_devparam[tgt].actv_period;
 			spi->sync_offset = sdp->isp_devparam[tgt].actv_offset;
 		}
 		isp_prt(isp, ISP_LOGDEBUG2, "NEW_TGT_PARAMS bus %d tgt %d period %x offset %x flags %x", bus, tgt, sdp->isp_devparam[tgt].actv_period, sdp->isp_devparam[tgt].actv_offset, flags);
 		xpt_setup_ccb(&cts.ccb_h, tmppath, 1);
 		xpt_async(AC_TRANSFER_NEG, tmppath, &cts);
 		xpt_free_path(tmppath);
 		break;
 	}
 	case ISPASYNC_BUS_RESET:
 	{
 		va_start(ap, cmd);
 		bus = va_arg(ap, int);
 		va_end(ap);
 		isp_prt(isp, ISP_LOGINFO, "SCSI bus reset on bus %d detected", bus);
 		if (IS_FC(isp)) {
 			xpt_async(AC_BUS_RESET, ISP_FC_PC(isp, bus)->path, NULL);
 		} else {
 			xpt_async(AC_BUS_RESET, ISP_SPI_PC(isp, bus)->path, NULL);
 		}
 		break;
 	}
 	case ISPASYNC_LOOP_RESET:
 	{
 		uint16_t lipp;
 		fcparam *fcp;
 		va_start(ap, cmd);
 		bus = va_arg(ap, int);
 		va_end(ap);
 
 		lipp = ISP_READ(isp, OUTMAILBOX1);
 		fcp = FCPARAM(isp, bus);
 		
 		isp_prt(isp, ISP_LOGINFO, "Chan %d LOOP Reset, LIP primitive %x", bus, lipp);
 		/* 
 		 * Per FCP-4, a Reset LIP should result in a CRN reset. Other
 		 * LIPs and loop up/down events should never reset the CRN. For
 		 * an as of yet unknown reason, 24xx series cards (and
 		 * potentially others) can interrupt with a LIP Reset status
 		 * when no LIP reset came down the wire. Additionally, the LIP
 		 * primitive accompanying this status would not be a valid LIP
 		 * Reset primitive, but some variation of an invalid AL_PA
 		 * LIP. As a result, we have to verify the AL_PD in the LIP
 		 * addresses our port before blindly resetting.
 		*/
 		if (FCP_IS_DEST_ALPD(fcp, (lipp & 0x00FF)))
 			isp_fcp_reset_crn(isp, bus, /*tgt*/0, /*tgt_set*/ 0);
 		isp_loop_changed(isp, bus);
 		break;
 	}
 	case ISPASYNC_LIP:
 		if (msg == NULL)
 			msg = "LIP Received";
 		/* FALLTHROUGH */
 	case ISPASYNC_LOOP_DOWN:
 		if (msg == NULL)
 			msg = "LOOP Down";
 		/* FALLTHROUGH */
 	case ISPASYNC_LOOP_UP:
 		if (msg == NULL)
 			msg = "LOOP Up";
 		va_start(ap, cmd);
 		bus = va_arg(ap, int);
 		va_end(ap);
 		isp_loop_changed(isp, bus);
 		isp_prt(isp, ISP_LOGINFO, "Chan %d %s", bus, msg);
 		break;
 	case ISPASYNC_DEV_ARRIVED:
 		va_start(ap, cmd);
 		bus = va_arg(ap, int);
 		lp = va_arg(ap, fcportdb_t *);
 		va_end(ap);
 		fc = ISP_FC_PC(isp, bus);
 		tgt = FC_PORTDB_TGT(isp, bus, lp);
 		isp_gen_role_str(buf, sizeof (buf), lp->prli_word3);
 		isp_prt(isp, ISP_LOGCONFIG, prom, bus, tgt, lp->port_wwn, lp->portid, lp->handle, buf, "arrived");
 		if ((FCPARAM(isp, bus)->role & ISP_ROLE_INITIATOR) &&
 		    (lp->prli_word3 & PRLI_WD3_TARGET_FUNCTION)) {
 			lp->is_target = 1;
 			isp_fcp_reset_crn(isp, bus, tgt, /*tgt_set*/ 1);
 			isp_make_here(isp, lp, bus, tgt);
 		}
 		if ((FCPARAM(isp, bus)->role & ISP_ROLE_TARGET) &&
 		    (lp->prli_word3 & PRLI_WD3_INITIATOR_FUNCTION)) {
 			lp->is_initiator = 1;
 			ac.contract_number = AC_CONTRACT_DEV_CHG;
 			adc = (struct ac_device_changed *) ac.contract_data;
 			adc->wwpn = lp->port_wwn;
 			adc->port = lp->portid;
 			adc->target = tgt;
 			adc->arrived = 1;
 			xpt_async(AC_CONTRACT, fc->path, &ac);
 		}
 		break;
 	case ISPASYNC_DEV_CHANGED:
 	case ISPASYNC_DEV_STAYED:		
 		va_start(ap, cmd);
 		bus = va_arg(ap, int);
 		lp = va_arg(ap, fcportdb_t *);
 		va_end(ap);
 		fc = ISP_FC_PC(isp, bus);
 		tgt = FC_PORTDB_TGT(isp, bus, lp);
 		isp_gen_role_str(buf, sizeof (buf), lp->new_prli_word3);
 		if (cmd == ISPASYNC_DEV_CHANGED)
 			isp_prt(isp, ISP_LOGCONFIG, prom, bus, tgt, lp->port_wwn, lp->new_portid, lp->handle, buf, "changed");
 		else
 			isp_prt(isp, ISP_LOGCONFIG, prom, bus, tgt, lp->port_wwn, lp->portid, lp->handle, buf, "stayed");			
 
 		if (lp->is_target !=
 		    ((FCPARAM(isp, bus)->role & ISP_ROLE_INITIATOR) &&
 		     (lp->new_prli_word3 & PRLI_WD3_TARGET_FUNCTION))) {
 			lp->is_target = !lp->is_target;
 			if (lp->is_target) {
 				if (cmd == ISPASYNC_DEV_CHANGED)
 					isp_fcp_reset_crn(isp, bus, tgt, /*tgt_set*/ 1);
 				isp_make_here(isp, lp, bus, tgt);
 			} else {
 				isp_make_gone(isp, lp, bus, tgt);
 				if (cmd == ISPASYNC_DEV_CHANGED)
 					isp_fcp_reset_crn(isp, bus, tgt, /*tgt_set*/ 1);
 			}
 		}
 		if (lp->is_initiator !=
 		    ((FCPARAM(isp, bus)->role & ISP_ROLE_TARGET) &&
 		     (lp->new_prli_word3 & PRLI_WD3_INITIATOR_FUNCTION))) {
 			lp->is_initiator = !lp->is_initiator;
 			ac.contract_number = AC_CONTRACT_DEV_CHG;
 			adc = (struct ac_device_changed *) ac.contract_data;
 			adc->wwpn = lp->port_wwn;
 			adc->port = lp->portid;
 			adc->target = tgt;
 			adc->arrived = lp->is_initiator;
 			xpt_async(AC_CONTRACT, fc->path, &ac);
 		}
 		break;
 	case ISPASYNC_DEV_GONE:
 		va_start(ap, cmd);
 		bus = va_arg(ap, int);
 		lp = va_arg(ap, fcportdb_t *);
 		va_end(ap);
 		fc = ISP_FC_PC(isp, bus);
 		tgt = FC_PORTDB_TGT(isp, bus, lp);
 		/*
 		 * If this has a virtual target or initiator set the isp_gdt
 		 * timer running on it to delay its departure.
 		 */
 		isp_gen_role_str(buf, sizeof (buf), lp->prli_word3);
 		if (lp->is_target || lp->is_initiator) {
 			lp->state = FC_PORTDB_STATE_ZOMBIE;
 			lp->gone_timer = fc->gone_device_time;
 			isp_prt(isp, ISP_LOGCONFIG, prom, bus, tgt, lp->port_wwn, lp->portid, lp->handle, buf, "gone zombie");
 			if (fc->ready && !callout_active(&fc->gdt)) {
 				isp_prt(isp, ISP_LOG_SANCFG|ISP_LOGDEBUG0, "Chan %d Starting Gone Device Timer with %u seconds time now %lu", bus, lp->gone_timer, (unsigned long)time_uptime);
 				callout_reset(&fc->gdt, hz, isp_gdt, fc);
 			}
 			break;
 		}
 		isp_prt(isp, ISP_LOGCONFIG, prom, bus, tgt, lp->port_wwn, lp->portid, lp->handle, buf, "gone");
 		break;
 	case ISPASYNC_CHANGE_NOTIFY:
 	{
 		char *msg;
 		int evt, nphdl, nlstate, portid, reason;
 
 		va_start(ap, cmd);
 		bus = va_arg(ap, int);
 		evt = va_arg(ap, int);
 		if (evt == ISPASYNC_CHANGE_PDB) {
 			nphdl = va_arg(ap, int);
 			nlstate = va_arg(ap, int);
 			reason = va_arg(ap, int);
 		} else if (evt == ISPASYNC_CHANGE_SNS) {
 			portid = va_arg(ap, int);
 		} else {
 			nphdl = NIL_HANDLE;
 			nlstate = reason = 0;
 		}
 		va_end(ap);
 
 		if (evt == ISPASYNC_CHANGE_PDB) {
 			int tgt_set = 0;
 			msg = "Port Database Changed";
 			isp_prt(isp, ISP_LOGINFO,
 			    "Chan %d %s (nphdl 0x%x state 0x%x reason 0x%x)",
 			    bus, msg, nphdl, nlstate, reason);
 			/*
 			 * Port database syncs are not sufficient for
 			 * determining that logins or logouts are done on the
 			 * loop, but this information is directly available from
 			 * the reason code from the incoming mbox. We must reset
 			 * the fcp crn on these events according to FCP-4
 			 */
 			switch (reason) {
 			case PDB24XX_AE_IMPL_LOGO_1:
 			case PDB24XX_AE_IMPL_LOGO_2:
 			case PDB24XX_AE_IMPL_LOGO_3:
 			case PDB24XX_AE_PLOGI_RCVD:
 			case PDB24XX_AE_PRLI_RCVD:
 			case PDB24XX_AE_PRLO_RCVD:
 			case PDB24XX_AE_LOGO_RCVD:
 			case PDB24XX_AE_PLOGI_DONE:
 			case PDB24XX_AE_PRLI_DONE:
 				/*
 				 * If the event is not global, twiddle tgt and
 				 * tgt_set to nominate only the target
 				 * associated with the nphdl.
 				 */
 				if (nphdl != PDB24XX_AE_GLOBAL) {
 					/* Break if we don't yet have the pdb */
 					if (!isp_find_pdb_by_handle(isp, bus, nphdl, &lp))
 						break;
 					tgt = FC_PORTDB_TGT(isp, bus, lp);
 					tgt_set = 1;
 				}
 				isp_fcp_reset_crn(isp, bus, tgt, tgt_set);
 				break;
 			default:
 				break; /* NOP */
 			}
 		} else if (evt == ISPASYNC_CHANGE_SNS) {
 			msg = "Name Server Database Changed";
 			isp_prt(isp, ISP_LOGINFO, "Chan %d %s (PortID 0x%06x)",
 			    bus, msg, portid);
 		} else {
 			msg = "Other Change Notify";
 			isp_prt(isp, ISP_LOGINFO, "Chan %d %s", bus, msg);
 		}
 		isp_loop_changed(isp, bus);
 		break;
 	}
 #ifdef	ISP_TARGET_MODE
 	case ISPASYNC_TARGET_NOTIFY:
 	{
 		isp_notify_t *notify;
 		va_start(ap, cmd);
 		notify = va_arg(ap, isp_notify_t *);
 		va_end(ap);
 		switch (notify->nt_ncode) {
 		case NT_ABORT_TASK:
 		case NT_ABORT_TASK_SET:
 		case NT_CLEAR_ACA:
 		case NT_CLEAR_TASK_SET:
 		case NT_LUN_RESET:
 		case NT_TARGET_RESET:
 		case NT_QUERY_TASK_SET:
 		case NT_QUERY_ASYNC_EVENT:
 			/*
 			 * These are task management functions.
 			 */
 			isp_handle_platform_target_tmf(isp, notify);
 			break;
 		case NT_BUS_RESET:
 		case NT_LIP_RESET:
 		case NT_LINK_UP:
 		case NT_LINK_DOWN:
 		case NT_HBA_RESET:
 			/*
 			 * No action need be taken here.
 			 */
 			break;
 		case NT_GLOBAL_LOGOUT:
 		case NT_LOGOUT:
 			/*
 			 * This is device arrival/departure notification
 			 */
 			isp_handle_platform_target_notify_ack(isp, notify, 0);
 			break;
 		case NT_SRR:
 			isp_handle_platform_srr(isp, notify);
 			break;
 		default:
 			isp_prt(isp, ISP_LOGALL, "target notify code 0x%x", notify->nt_ncode);
 			isp_handle_platform_target_notify_ack(isp, notify, 0);
 			break;
 		}
 		break;
 	}
 	case ISPASYNC_TARGET_NOTIFY_ACK:
 	{
 		void *inot;
 		va_start(ap, cmd);
 		inot = va_arg(ap, void *);
 		va_end(ap);
 		if (isp_notify_ack(isp, inot)) {
 			isp_tna_t *tp = malloc(sizeof (*tp), M_DEVBUF, M_NOWAIT);
 			if (tp) {
 				tp->isp = isp;
 				memcpy(tp->data, inot, sizeof (tp->data));
 				tp->not = tp->data;
 				callout_init_mtx(&tp->timer, &isp->isp_lock, 0);
 				callout_reset(&tp->timer, 5,
 				    isp_refire_notify_ack, tp);
 			} else {
 				isp_prt(isp, ISP_LOGERR, "you lose- cannot allocate a notify refire");
 			}
 		}
 		break;
 	}
 	case ISPASYNC_TARGET_ACTION:
 	{
 		isphdr_t *hp;
 
 		va_start(ap, cmd);
 		hp = va_arg(ap, isphdr_t *);
 		va_end(ap);
 		switch (hp->rqs_entry_type) {
 		case RQSTYPE_ATIO:
 			isp_handle_platform_atio7(isp, (at7_entry_t *) hp);
 			break;
 		case RQSTYPE_ATIO2:
 			isp_handle_platform_atio2(isp, (at2_entry_t *) hp);
 			break;
 		case RQSTYPE_CTIO7:
 		case RQSTYPE_CTIO3:
 		case RQSTYPE_CTIO2:
 		case RQSTYPE_CTIO:
 			isp_handle_platform_ctio(isp, hp);
 			break;
 		default:
 			isp_prt(isp, ISP_LOGWARN, "%s: unhandled target action 0x%x",
 			    __func__, hp->rqs_entry_type);
 			break;
 		}
 		break;
 	}
 #endif
 	case ISPASYNC_FW_CRASH:
 	{
 		uint16_t mbox1, mbox6;
 		mbox1 = ISP_READ(isp, OUTMAILBOX1);
 		if (IS_DUALBUS(isp)) { 
 			mbox6 = ISP_READ(isp, OUTMAILBOX6);
 		} else {
 			mbox6 = 0;
 		}
 		isp_prt(isp, ISP_LOGERR, "Internal Firmware Error on bus %d @ RISC Address 0x%x", mbox6, mbox1);
 #if 0
 		mbox1 = isp->isp_osinfo.mbox_sleep_ok;
 		isp->isp_osinfo.mbox_sleep_ok = 0;
 		isp_reinit(isp, 1);
 		isp->isp_osinfo.mbox_sleep_ok = mbox1;
 		isp_async(isp, ISPASYNC_FW_RESTARTED, NULL);
 #endif
 		break;
 	}
 	default:
 		isp_prt(isp, ISP_LOGERR, "unknown isp_async event %d", cmd);
 		break;
 	}
 }
 
 uint64_t
 isp_default_wwn(ispsoftc_t * isp, int chan, int isactive, int iswwnn)
 {
 	uint64_t seed;
 	struct isp_fc *fc = ISP_FC_PC(isp, chan);
 
 	/* First try to use explicitly configured WWNs. */
 	seed = iswwnn ? fc->def_wwnn : fc->def_wwpn;
 	if (seed)
 		return (seed);
 
 	/* Otherwise try to use WWNs from NVRAM. */
 	if (isactive) {
 		seed = iswwnn ? FCPARAM(isp, chan)->isp_wwnn_nvram :
 		    FCPARAM(isp, chan)->isp_wwpn_nvram;
 		if (seed)
 			return (seed);
 	}
 
 	/* If still no WWNs, try to steal them from the first channel. */
 	if (chan > 0) {
 		seed = iswwnn ? ISP_FC_PC(isp, 0)->def_wwnn :
 		    ISP_FC_PC(isp, 0)->def_wwpn;
 		if (seed == 0) {
 			seed = iswwnn ? FCPARAM(isp, 0)->isp_wwnn_nvram :
 			    FCPARAM(isp, 0)->isp_wwpn_nvram;
 		}
 	}
 
 	/* If still nothing -- improvise. */
 	if (seed == 0) {
 		seed = 0x400000007F000000ull + device_get_unit(isp->isp_dev);
 		if (!iswwnn)
 			seed ^= 0x0100000000000000ULL;
 	}
 
 	/* For additional channels we have to improvise even more. */
 	if (!iswwnn && chan > 0) {
 		/*
 		 * We'll stick our channel number plus one first into bits
 		 * 57..59 and thence into bits 52..55 which allows for 8 bits
 		 * of channel which is enough for our maximum of 255 channels.
 		 */
 		seed ^= 0x0100000000000000ULL;
 		seed ^= ((uint64_t) (chan + 1) & 0xf) << 56;
 		seed ^= ((uint64_t) ((chan + 1) >> 4) & 0xf) << 52;
 	}
 	return (seed);
 }
 
 void
 isp_prt(ispsoftc_t *isp, int level, const char *fmt, ...)
 {
 	int loc;
 	char lbuf[200];
 	va_list ap;
 
 	if (level != ISP_LOGALL && (level & isp->isp_dblev) == 0) {
 		return;
 	}
 	snprintf(lbuf, sizeof (lbuf), "%s: ", device_get_nameunit(isp->isp_dev));
 	loc = strlen(lbuf);
 	va_start(ap, fmt);
 	vsnprintf(&lbuf[loc], sizeof (lbuf) - loc - 1, fmt, ap); 
 	va_end(ap);
 	printf("%s\n", lbuf);
 }
 
 void
 isp_xs_prt(ispsoftc_t *isp, XS_T *xs, int level, const char *fmt, ...)
 {
 	va_list ap;
 	if (level != ISP_LOGALL && (level & isp->isp_dblev) == 0) {
 		return;
 	}
 	xpt_print_path(xs->ccb_h.path);
 	va_start(ap, fmt);
 	vprintf(fmt, ap);
 	va_end(ap);
 	printf("\n");
 }
 
 uint64_t
 isp_nanotime_sub(struct timespec *b, struct timespec *a)
 {
 	uint64_t elapsed;
-	struct timespec x = *b;
-	timespecsub(&x, a);
+	struct timespec x;
+
+	timespecsub(b, a, &x);
 	elapsed = GET_NANOSEC(&x);
 	if (elapsed == 0)
 		elapsed++;
 	return (elapsed);
 }
 
 int
 isp_mbox_acquire(ispsoftc_t *isp)
 {
 	if (isp->isp_osinfo.mboxbsy) {
 		return (1);
 	} else {
 		isp->isp_osinfo.mboxcmd_done = 0;
 		isp->isp_osinfo.mboxbsy = 1;
 		return (0);
 	}
 }
 
 void
 isp_mbox_wait_complete(ispsoftc_t *isp, mbreg_t *mbp)
 {
 	u_int t, to;
 
 	to = (mbp->timeout == 0) ? MBCMD_DEFAULT_TIMEOUT : mbp->timeout;
 	if (isp->isp_osinfo.mbox_sleep_ok) {
 		isp->isp_osinfo.mbox_sleep_ok = 0;
 		isp->isp_osinfo.mbox_sleeping = 1;
 		msleep_sbt(&isp->isp_osinfo.mboxcmd_done, &isp->isp_lock,
 		    PRIBIO, "ispmbx_sleep", to * SBT_1US, 0, 0);
 		isp->isp_osinfo.mbox_sleep_ok = 1;
 		isp->isp_osinfo.mbox_sleeping = 0;
 	} else {
 		for (t = 0; t < to; t += 100) {
 			if (isp->isp_osinfo.mboxcmd_done)
 				break;
 			ISP_RUN_ISR(isp);
 			if (isp->isp_osinfo.mboxcmd_done)
 				break;
 			ISP_DELAY(100);
 		}
 	}
 	if (isp->isp_osinfo.mboxcmd_done == 0) {
 		isp_prt(isp, ISP_LOGWARN, "%s Mailbox Command (0x%x) Timeout (%uus) (%s:%d)",
 		    isp->isp_osinfo.mbox_sleep_ok? "Interrupting" : "Polled",
 		    isp->isp_lastmbxcmd, to, mbp->func, mbp->lineno);
 		mbp->param[0] = MBOX_TIMEOUT;
 		isp->isp_osinfo.mboxcmd_done = 1;
 	}
 }
 
 void
 isp_mbox_notify_done(ispsoftc_t *isp)
 {
 	isp->isp_osinfo.mboxcmd_done = 1;
 	if (isp->isp_osinfo.mbox_sleeping)
 		wakeup(&isp->isp_osinfo.mboxcmd_done);
 }
 
 void
 isp_mbox_release(ispsoftc_t *isp)
 {
 	isp->isp_osinfo.mboxbsy = 0;
 }
 
 int
 isp_fc_scratch_acquire(ispsoftc_t *isp, int chan)
 {
 	int ret = 0;
 	if (isp->isp_osinfo.pc.fc[chan].fcbsy) {
 		ret = -1;
 	} else {
 		isp->isp_osinfo.pc.fc[chan].fcbsy = 1;
 	}
 	return (ret);
 }
 
 void
 isp_platform_intr(void *arg)
 {
 	ispsoftc_t *isp = arg;
 
 	ISP_LOCK(isp);
 	ISP_RUN_ISR(isp);
 	ISP_UNLOCK(isp);
 }
 
 void
 isp_platform_intr_resp(void *arg)
 {
 	ispsoftc_t *isp = arg;
 
 	ISP_LOCK(isp);
 	isp_intr_respq(isp);
 	ISP_UNLOCK(isp);
 
 	/* We have handshake enabled, so explicitly complete interrupt */
 	ISP_WRITE(isp, BIU2400_HCCR, HCCR_2400_CMD_CLEAR_RISC_INT);
 }
 
 void
 isp_platform_intr_atio(void *arg)
 {
 	ispsoftc_t *isp = arg;
 
 	ISP_LOCK(isp);
 #ifdef	ISP_TARGET_MODE
 	isp_intr_atioq(isp);
 #endif
 	ISP_UNLOCK(isp);
 
 	/* We have handshake enabled, so explicitly complete interrupt */
 	ISP_WRITE(isp, BIU2400_HCCR, HCCR_2400_CMD_CLEAR_RISC_INT);
 }
 
 void
 isp_common_dmateardown(ispsoftc_t *isp, struct ccb_scsiio *csio, uint32_t hdl)
 {
 	if ((csio->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_IN) {
 		bus_dmamap_sync(isp->isp_osinfo.dmat, PISP_PCMD(csio)->dmap, BUS_DMASYNC_POSTREAD);
 	} else {
 		bus_dmamap_sync(isp->isp_osinfo.dmat, PISP_PCMD(csio)->dmap, BUS_DMASYNC_POSTWRITE);
 	}
 	bus_dmamap_unload(isp->isp_osinfo.dmat, PISP_PCMD(csio)->dmap);
 }
 
 /*
  * Reset the command reference number for all LUNs on a specific target
  * (needed when a target arrives again) or for all targets on a port
  * (needed for events like a LIP).
  */
 void
 isp_fcp_reset_crn(ispsoftc_t *isp, int chan, uint32_t tgt, int tgt_set)
 {
 	struct isp_fc *fc = ISP_FC_PC(isp, chan);
 	struct isp_nexus *nxp;
 	int i;
 
 	if (tgt_set == 0)
 		isp_prt(isp, ISP_LOGDEBUG0,
 		    "Chan %d resetting CRN on all targets", chan);
 	else
 		isp_prt(isp, ISP_LOGDEBUG0,
 		    "Chan %d resetting CRN on target %u", chan, tgt);
 
 	for (i = 0; i < NEXUS_HASH_WIDTH; i++) {
 		for (nxp = fc->nexus_hash[i]; nxp != NULL; nxp = nxp->next) {
 			if (tgt_set == 0 || tgt == nxp->tgt)
 				nxp->crnseed = 0;
 		}
 	}
 }
 
 int
 isp_fcp_next_crn(ispsoftc_t *isp, uint8_t *crnp, XS_T *cmd)
 {
 	lun_id_t lun;
 	uint32_t chan, tgt;
 	struct isp_fc *fc;
 	struct isp_nexus *nxp;
 	int idx;
 
 	if (IS_2100(isp))
 		return (0);
 
 	chan = XS_CHANNEL(cmd);
 	tgt = XS_TGT(cmd);
 	lun = XS_LUN(cmd);
 	fc = &isp->isp_osinfo.pc.fc[chan];
 	idx = NEXUS_HASH(tgt, lun);
 	nxp = fc->nexus_hash[idx];
 
 	while (nxp) {
 		if (nxp->tgt == tgt && nxp->lun == lun)
 			break;
 		nxp = nxp->next;
 	}
 	if (nxp == NULL) {
 		nxp = fc->nexus_free_list;
 		if (nxp == NULL) {
 			nxp = malloc(sizeof (struct isp_nexus), M_DEVBUF, M_ZERO|M_NOWAIT);
 			if (nxp == NULL) {
 				return (-1);
 			}
 		} else {
 			fc->nexus_free_list = nxp->next;
 		}
 		nxp->tgt = tgt;
 		nxp->lun = lun;
 		nxp->next = fc->nexus_hash[idx];
 		fc->nexus_hash[idx] = nxp;
 	}
 	if (nxp->crnseed == 0)
 		nxp->crnseed = 1;
 	*crnp = nxp->crnseed++;
 	return (0);
 }
 
 /*
  * We enter with the lock held
  */
 void
 isp_timer(void *arg)
 {
 	ispsoftc_t *isp = arg;
 #ifdef	ISP_TARGET_MODE
 	isp_tmcmd_restart(isp);
 #endif
 	callout_reset(&isp->isp_osinfo.tmo, isp_timer_count, isp_timer, isp);
 }
 
 isp_ecmd_t *
 isp_get_ecmd(ispsoftc_t *isp)
 {
 	isp_ecmd_t *ecmd = isp->isp_osinfo.ecmd_free;
 	if (ecmd) {
 		isp->isp_osinfo.ecmd_free = ecmd->next;
 	}
 	return (ecmd);
 }
 
 void
 isp_put_ecmd(ispsoftc_t *isp, isp_ecmd_t *ecmd)
 {
 	ecmd->next = isp->isp_osinfo.ecmd_free;
 	isp->isp_osinfo.ecmd_free = ecmd;
 }
Index: head/sys/dev/joy/joy.c
===================================================================
--- head/sys/dev/joy/joy.c	(revision 336913)
+++ head/sys/dev/joy/joy.c	(revision 336914)
@@ -1,251 +1,251 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1995 Jean-Marc Zucconi
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer
  *    in this position and unchanged.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/conf.h>
 #include <sys/uio.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/bus.h>
 #include <machine/bus.h>
 #include <machine/resource.h>
 #include <sys/rman.h>
 #include <sys/time.h>
 #include <sys/joystick.h>
 #include <dev/joy/joyvar.h>
 
 /* The game port can manage 4 buttons and 4 variable resistors (usually 2
  * joysticks, each with 2 buttons and 2 pots.) via the port at address 0x201.
  * Getting the state of the buttons is done by reading the game port:
  * buttons 1-4 correspond to bits 4-7 and resistors 1-4 (X1, Y1, X2, Y2)
  * to bits 0-3.
  * if button 1 (resp 2, 3, 4) is pressed, the bit 4 (resp 5, 6, 7) is set to 0
  * to get the value of a resistor, write the value 0xff at port and
  * wait until the corresponding bit returns to 0.
  */
 
 #define joypart(d) (dev2unit(d)&1)
 #ifndef JOY_TIMEOUT
 #define JOY_TIMEOUT   2000 /* 2 milliseconds */
 #endif
 
 static	d_open_t	joyopen;
 static	d_close_t	joyclose;
 static	d_read_t	joyread;
 static	d_ioctl_t	joyioctl;
 
 static struct cdevsw joy_cdevsw = {
 	.d_version =	D_VERSION,
 	.d_flags =	D_NEEDGIANT,
 	.d_open =	joyopen,
 	.d_close =	joyclose,
 	.d_read =	joyread,
 	.d_ioctl =	joyioctl,
 	.d_name =	"joy",
 };
 
 devclass_t joy_devclass;
 
 int
 joy_probe(device_t dev)
 {
 #ifdef WANT_JOYSTICK_CONNECTED
 #ifdef notyet
 	outb(dev->id_iobase, 0xff);
 	DELAY(10000); /*  10 ms delay */
 	return (inb(dev->id_iobase) & 0x0f) != 0x0f;
 #else
 	return (0);
 #endif
 #else
 	return (0);
 #endif
 }
 
 int
 joy_attach(device_t dev)
 {
 	int	unit = device_get_unit(dev);
 	struct joy_softc *joy = device_get_softc(dev);
 
 	joy->rid = 0;
 	joy->res = bus_alloc_resource_any(dev, SYS_RES_IOPORT, &joy->rid,
 	    RF_ACTIVE|RF_SHAREABLE);
 	if (joy->res == NULL)
 		return ENXIO;
 	joy->bt = rman_get_bustag(joy->res);
 	joy->port = rman_get_bushandle(joy->res);
 	joy->timeout[0] = joy->timeout[1] = 0;
 	joy->d = make_dev(&joy_cdevsw, unit, 0, 0, 0600, "joy%d", unit);
 	joy->d->si_drv1 = joy;
 	gone_in_dev(dev, 12, "joy(4) driver");
 
 	return (0);
 }
 
 int
 joy_detach(device_t dev)
 {
 	struct joy_softc *joy = device_get_softc(dev);
 
 	if (joy->res != NULL)
 		bus_release_resource(dev, SYS_RES_IOPORT, joy->rid, joy->res);
 	if (joy->d)
 		destroy_dev(joy->d);
 	return (0);
 }
 
 
 static int
 joyopen(struct cdev *dev, int flags, int fmt, struct thread *td)
 {
 	int i = joypart (dev);
 	struct joy_softc *joy = dev->si_drv1;
 
 	if (joy->timeout[i])
 		return (EBUSY);
 	joy->x_off[i] = joy->y_off[i] = 0;
 	joy->timeout[i] = JOY_TIMEOUT;
 	return (0);
 }
 
 static int
 joyclose(struct cdev *dev, int flags, int fmt, struct thread *td)
 {
 	int i = joypart (dev);
 	struct joy_softc *joy = dev->si_drv1;
 
 	joy->timeout[i] = 0;
 	return (0);
 }
 
 static int
 joyread(struct cdev *dev, struct uio *uio, int flag)
 {
 	struct joy_softc *joy = dev->si_drv1;
 	bus_space_handle_t port = joy->port;
 	bus_space_tag_t bt = joy->bt;
 	struct timespec t, start, end;
 	int state = 0;
 	struct timespec x, y;
 	struct joystick c;
 #ifndef __i386__
 	int s;
 
 	s = splhigh();
 #else
 	disable_intr ();
 #endif
 	nanotime(&t);
 	end.tv_sec = 0;
 	end.tv_nsec = joy->timeout[joypart(dev)] * 1000;
-	timespecadd(&end, &t);
+	timespecadd(&end, &t, &end);
 	for (; timespeccmp(&t, &end, <) && (bus_space_read_1(bt, port, 0) & 0x0f); nanotime(&t))
 		;	/* nothing */
 	bus_space_write_1 (bt, port, 0, 0xff);
 	nanotime(&start);
 	end.tv_sec = 0;
 	end.tv_nsec = joy->timeout[joypart(dev)] * 1000;
-	timespecadd(&end, &start);
+	timespecadd(&end, &start, &end);
 	t = start;
 	timespecclear(&x);
 	timespecclear(&y);
 	while (timespeccmp(&t, &end, <)) {
 		state = bus_space_read_1 (bt, port, 0);
 		if (joypart(dev) == 1)
 			state >>= 2;
 		nanotime(&t);
 		if (!timespecisset(&x) && !(state & 0x01))
 			x = t;
 		if (!timespecisset(&y) && !(state & 0x02))
 			y = t;
 		if (timespecisset(&x) && timespecisset(&y))
 			break;
 	}
 #ifndef __i386__
 	splx(s);
 #else
 	enable_intr ();
 #endif
 	if (timespecisset(&x)) {
-		timespecsub(&x, &start);
+		timespecsub(&x, &start, &x);
 		c.x = joy->x_off[joypart(dev)] + x.tv_nsec / 1000;
 	} else
 		c.x = 0x80000000;
 	if (timespecisset(&y)) {
-		timespecsub(&y, &start);
+		timespecsub(&y, &start, &y);
 		c.y = joy->y_off[joypart(dev)] + y.tv_nsec / 1000;
 	} else
 		c.y = 0x80000000;
 	state >>= 4;
 	c.b1 = ~state & 1;
 	c.b2 = ~(state >> 1) & 1;
 	return (uiomove((caddr_t)&c, sizeof(struct joystick), uio));
 }
 
 static int
 joyioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag, struct thread *td)
 {
 	struct joy_softc *joy = dev->si_drv1;
 	int i = joypart (dev);
 	int x;
 
 	switch (cmd) {
 	case JOY_SETTIMEOUT:
 		x = *(int *) data;
 		if (x < 1 || x > 10000) /* 10ms maximum! */
 			return EINVAL;
 		joy->timeout[i] = x;
 		break;
 	case JOY_GETTIMEOUT:
 		*(int *) data = joy->timeout[i];
 		break;
 	case JOY_SET_X_OFFSET:
 		joy->x_off[i] = *(int *) data;
 		break;
 	case JOY_SET_Y_OFFSET:
 		joy->y_off[i] = *(int *) data;
 		break;
 	case JOY_GET_X_OFFSET:
 		*(int *) data = joy->x_off[i];
 		break;
 	case JOY_GET_Y_OFFSET:
 		*(int *) data = joy->y_off[i];
 		break;
 	default:
 		return (ENOTTY);
 	}
 	return (0);
 }
Index: head/sys/dev/xen/timer/timer.c
===================================================================
--- head/sys/dev/xen/timer/timer.c	(revision 336913)
+++ head/sys/dev/xen/timer/timer.c	(revision 336914)
@@ -1,559 +1,559 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2009 Adrian Chadd
  * Copyright (c) 2012 Spectra Logic Corporation
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 
 /**
  * \file dev/xen/timer/timer.c
  * \brief A timer driver for the Xen hypervisor's PV clock.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/time.h>
 #include <sys/timetc.h>
 #include <sys/timeet.h>
 #include <sys/smp.h>
 #include <sys/limits.h>
 #include <sys/clock.h>
 #include <sys/proc.h>
 
 #include <xen/xen-os.h>
 #include <xen/features.h>
 #include <xen/xen_intr.h>
 #include <xen/hypervisor.h>
 #include <xen/interface/io/xenbus.h>
 #include <xen/interface/vcpu.h>
 #include <xen/error.h>
 
 #include <machine/cpu.h>
 #include <machine/cpufunc.h>
 #include <machine/clock.h>
 #include <machine/_inttypes.h>
 #include <machine/smp.h>
 #include <machine/pvclock.h>
 
 #include <dev/xen/timer/timer.h>
 
 #include "clock_if.h"
 
 static devclass_t xentimer_devclass;
 
 #define	NSEC_IN_SEC	1000000000ULL
 #define	NSEC_IN_USEC	1000ULL
 /* 18446744073 = int(2^64 / NSEC_IN_SC) = 1 ns in 64-bit fractions */
 #define	FRAC_IN_NSEC	18446744073LL
 
 /* Xen timers may fire up to 100us off */
 #define	XENTIMER_MIN_PERIOD_IN_NSEC	100*NSEC_IN_USEC
 
 /*
  * The real resolution of the PV clock is 1ns, but the highest
  * resolution that FreeBSD supports is 1us, so just use that.
  */
 #define	XENCLOCK_RESOLUTION		1
 
 #define	XENTIMER_QUALITY	950
 
 struct xentimer_pcpu_data {
 	uint64_t timer;
 	uint64_t last_processed;
 	void *irq_handle;
 };
 
 DPCPU_DEFINE(struct xentimer_pcpu_data, xentimer_pcpu);
 
 DPCPU_DECLARE(struct vcpu_info *, vcpu_info);
 
 struct xentimer_softc {
 	device_t dev;
 	struct timecounter tc;
 	struct eventtimer et;
 };
 
 static void
 xentimer_identify(driver_t *driver, device_t parent)
 {
 	if (!xen_domain())
 		return;
 
 	/* Handle all Xen PV timers in one device instance. */
 	if (devclass_get_device(xentimer_devclass, 0))
 		return;
 
 	BUS_ADD_CHILD(parent, 0, "xen_et", 0);
 }
 
 static int
 xentimer_probe(device_t dev)
 {
 	KASSERT((xen_domain()), ("Trying to use Xen timer on bare metal"));
 	/*
 	 * In order to attach, this driver requires the following:
 	 * - Vector callback support by the hypervisor, in order to deliver
 	 *   timer interrupts to the correct CPU for CPUs other than 0.
 	 * - Access to the hypervisor shared info page, in order to look up
 	 *   each VCPU's timer information and the Xen wallclock time.
 	 * - The hypervisor must say its PV clock is "safe" to use.
 	 * - The hypervisor must support VCPUOP hypercalls.
 	 * - The maximum number of CPUs supported by FreeBSD must not exceed
 	 *   the number of VCPUs supported by the hypervisor.
 	 */
 #define	XTREQUIRES(condition, reason...)	\
 	if (!(condition)) {			\
 		device_printf(dev, ## reason);	\
 		device_detach(dev);		\
 		return (ENXIO);			\
 	}
 
 	if (xen_hvm_domain()) {
 		XTREQUIRES(xen_vector_callback_enabled,
 		           "vector callbacks unavailable\n");
 		XTREQUIRES(xen_feature(XENFEAT_hvm_safe_pvclock),
 		           "HVM safe pvclock unavailable\n");
 	}
 	XTREQUIRES(HYPERVISOR_shared_info != NULL,
 	           "shared info page unavailable\n");
 	XTREQUIRES(HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, 0, NULL) == 0,
 	           "VCPUOPs interface unavailable\n");
 #undef XTREQUIRES
 	device_set_desc(dev, "Xen PV Clock");
 	return (BUS_PROBE_NOWILDCARD);
 }
 
 /**
  * \brief Get the current time, in nanoseconds, since the hypervisor booted.
  *
  * \param vcpu		vcpu_info structure to fetch the time from.
  *
  */
 static uint64_t
 xen_fetch_vcpu_time(struct vcpu_info *vcpu)
 {
 	struct pvclock_vcpu_time_info *time;
 
 	time = (struct pvclock_vcpu_time_info *) &vcpu->time;
 
 	return (pvclock_get_timecount(time));
 }
 
 static uint32_t
 xentimer_get_timecount(struct timecounter *tc)
 {
 	uint64_t vcpu_time;
 
 	/*
 	 * We don't disable preemption here because the worst that can
 	 * happen is reading the vcpu_info area of a different CPU than
 	 * the one we are currently running on, but that would also
 	 * return a valid tc (and we avoid the overhead of
 	 * critical_{enter/exit} calls).
 	 */
 	vcpu_time = xen_fetch_vcpu_time(DPCPU_GET(vcpu_info));
 
 	return (vcpu_time & UINT32_MAX);
 }
 
 /**
  * \brief Fetch the hypervisor boot time, known as the "Xen wallclock".
  *
  * \param ts		Timespec to store the current stable value.
  * \param version	Pointer to store the corresponding wallclock version.
  *
  * \note This value is updated when Domain-0 shifts its clock to follow
  *       clock drift, e.g. as detected by NTP.
  */
 static void
 xen_fetch_wallclock(struct timespec *ts)
 {
 	shared_info_t *src = HYPERVISOR_shared_info;
 	struct pvclock_wall_clock *wc;
 
 	wc = (struct pvclock_wall_clock *) &src->wc_version;
 
 	pvclock_get_wallclock(wc, ts);
 }
 
 static void
 xen_fetch_uptime(struct timespec *ts)
 {
 	uint64_t uptime;
 
 	uptime = xen_fetch_vcpu_time(DPCPU_GET(vcpu_info));
 
 	ts->tv_sec = uptime / NSEC_IN_SEC;
 	ts->tv_nsec = uptime % NSEC_IN_SEC;
 }
 
 static int
 xentimer_settime(device_t dev __unused, struct timespec *ts)
 {
 	struct xen_platform_op settime;
 	int ret;
 
 	/*
 	 * Don't return EINVAL here; just silently fail if the domain isn't
 	 * privileged enough to set the TOD.
 	 */
 	if (!xen_initial_domain())
 		return (0);
 
 	settime.cmd = XENPF_settime64;
 	settime.u.settime64.mbz = 0;
 	settime.u.settime64.secs = ts->tv_sec;
 	settime.u.settime64.nsecs = ts->tv_nsec;
 	settime.u.settime64.system_time =
 		xen_fetch_vcpu_time(DPCPU_GET(vcpu_info));
 
 	ret = HYPERVISOR_platform_op(&settime);
 	ret = ret != 0 ? xen_translate_error(ret) : 0;
 	if (ret != 0 && bootverbose)
 		device_printf(dev, "failed to set Xen PV clock: %d\n", ret);
 
 	return (ret);
 }
 
 /**
  * \brief Return current time according to the Xen Hypervisor wallclock.
  *
  * \param dev	Xentimer device.
  * \param ts	Pointer to store the wallclock time.
  *
  * \note  The Xen time structures document the hypervisor start time and the
  *        uptime-since-hypervisor-start (in nsec.) They need to be combined
  *        in order to calculate a TOD clock.
  */
 static int
 xentimer_gettime(device_t dev, struct timespec *ts)
 {
 	struct timespec u_ts;
 
 	timespecclear(ts);
 	xen_fetch_wallclock(ts);
 	xen_fetch_uptime(&u_ts);
-	timespecadd(ts, &u_ts);
+	timespecadd(ts, &u_ts, ts);
 
 	return (0);
 }
 
 /**
  * \brief Handle a timer interrupt for the Xen PV timer driver.
  *
  * \param arg	Xen timer driver softc that is expecting the interrupt.
  */
 static int
 xentimer_intr(void *arg)
 {
 	struct xentimer_softc *sc = (struct xentimer_softc *)arg;
 	struct xentimer_pcpu_data *pcpu = DPCPU_PTR(xentimer_pcpu);
 
 	pcpu->last_processed = xen_fetch_vcpu_time(DPCPU_GET(vcpu_info));
 	if (pcpu->timer != 0 && sc->et.et_active)
 		sc->et.et_event_cb(&sc->et, sc->et.et_arg);
 
 	return (FILTER_HANDLED);
 }
 
 static int
 xentimer_vcpu_start_timer(int vcpu, uint64_t next_time)
 {
 	struct vcpu_set_singleshot_timer single;
 
 	single.timeout_abs_ns = next_time;
 	/* Get an event anyway, even if the timeout is already expired */
 	single.flags          = 0;
 	return (HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, vcpu, &single));
 }
 
 static int
 xentimer_vcpu_stop_timer(int vcpu)
 {
 
 	return (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, vcpu, NULL));
 }
 
 /**
  * \brief Set the next oneshot time for the current CPU.
  *
  * \param et	Xen timer driver event timer to schedule on.
  * \param first	Delta to the next time to schedule the interrupt for.
  * \param period Not used.
  *
  * \note See eventtimers(9) for more information.
  * \note 
  *
  * \returns 0
  */
 static int
 xentimer_et_start(struct eventtimer *et,
     sbintime_t first, sbintime_t period)
 {
 	int error;
 	struct xentimer_softc *sc = et->et_priv;
 	int cpu = PCPU_GET(vcpu_id);
 	struct xentimer_pcpu_data *pcpu = DPCPU_PTR(xentimer_pcpu);
 	struct vcpu_info *vcpu = DPCPU_GET(vcpu_info);
 	uint64_t first_in_ns, next_time;
 #ifdef INVARIANTS
 	struct thread *td = curthread;
 #endif
 
 	KASSERT(td->td_critnest != 0,
 	    ("xentimer_et_start called without preemption disabled"));
 
 	/* See sbttots() for this formula. */
 	first_in_ns = (((first >> 32) * NSEC_IN_SEC) +
 	               (((uint64_t)NSEC_IN_SEC * (uint32_t)first) >> 32));
 
 	next_time = xen_fetch_vcpu_time(vcpu) + first_in_ns;
 	error = xentimer_vcpu_start_timer(cpu, next_time);
 	if (error)
 		panic("%s: Error %d setting singleshot timer to %"PRIu64"\n",
 		    device_get_nameunit(sc->dev), error, next_time);
 
 	pcpu->timer = next_time;
 	return (error);
 }
 
 /**
  * \brief Cancel the event timer's currently running timer, if any.
  */
 static int
 xentimer_et_stop(struct eventtimer *et)
 {
 	int cpu = PCPU_GET(vcpu_id);
 	struct xentimer_pcpu_data *pcpu = DPCPU_PTR(xentimer_pcpu);
 
 	pcpu->timer = 0;
 	return (xentimer_vcpu_stop_timer(cpu));
 }
 
 /**
  * \brief Attach a Xen PV timer driver instance.
  * 
  * \param dev	Bus device object to attach.
  *
  * \note
  * \returns EINVAL 
  */
 static int
 xentimer_attach(device_t dev)
 {
 	struct xentimer_softc *sc = device_get_softc(dev);
 	int error, i;
 
 	sc->dev = dev;
 
 	/* Bind an event channel to a VIRQ on each VCPU. */
 	CPU_FOREACH(i) {
 		struct xentimer_pcpu_data *pcpu;
 
 		pcpu = DPCPU_ID_PTR(i, xentimer_pcpu);
 		error = HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, i, NULL);
 		if (error) {
 			device_printf(dev, "Error disabling Xen periodic timer "
 			                   "on CPU %d\n", i);
 			return (error);
 		}
 
 		error = xen_intr_bind_virq(dev, VIRQ_TIMER, i, xentimer_intr,
 		    NULL, sc, INTR_TYPE_CLK, &pcpu->irq_handle);
 		if (error) {
 			device_printf(dev, "Error %d binding VIRQ_TIMER "
 			    "to VCPU %d\n", error, i);
 			return (error);
 		}
 		xen_intr_describe(pcpu->irq_handle, "c%d", i);
 	}
 
 	/* Register the event timer. */
 	sc->et.et_name = "XENTIMER";
 	sc->et.et_quality = XENTIMER_QUALITY;
 	sc->et.et_flags = ET_FLAGS_ONESHOT | ET_FLAGS_PERCPU;
 	sc->et.et_frequency = NSEC_IN_SEC;
 	/* See tstosbt() for this formula */
 	sc->et.et_min_period = (XENTIMER_MIN_PERIOD_IN_NSEC *
 	                        (((uint64_t)1 << 63) / 500000000) >> 32);
 	sc->et.et_max_period = ((sbintime_t)4 << 32);
 	sc->et.et_start = xentimer_et_start;
 	sc->et.et_stop = xentimer_et_stop;
 	sc->et.et_priv = sc;
 	et_register(&sc->et);
 
 	/* Register the timecounter. */
 	sc->tc.tc_name = "XENTIMER";
 	sc->tc.tc_quality = XENTIMER_QUALITY;
 	/*
 	 * FIXME: due to the lack of ordering during resume, FreeBSD cannot
 	 * guarantee that the Xen PV timer is resumed before any other device
 	 * attempts to make use of it, so mark it as not safe for suspension
 	 * (ie: remove the TC_FLAGS_SUSPEND_SAFE flag).
 	 *
 	 * NB: This was not a problem in previous FreeBSD versions because the
 	 * timer was directly attached to the nexus, but it is an issue now
 	 * that the timer is attached to the xenpv bus, and thus resumed
 	 * later.
 	 *
 	 * sc->tc.tc_flags = TC_FLAGS_SUSPEND_SAFE;
 	 */
     	/*
 	 * The underlying resolution is in nanoseconds, since the timer info
 	 * scales TSC frequencies using a fraction that represents time in
 	 * terms of nanoseconds.
 	 */
 	sc->tc.tc_frequency = NSEC_IN_SEC;
 	sc->tc.tc_counter_mask = ~0u;
 	sc->tc.tc_get_timecount = xentimer_get_timecount;
 	sc->tc.tc_priv = sc;
 	tc_init(&sc->tc);
 
 	/* Register the Hypervisor wall clock */
 	clock_register(dev, XENCLOCK_RESOLUTION);
 
 	return (0);
 }
 
 static int
 xentimer_detach(device_t dev)
 {
 
 	/* Implement Xen PV clock teardown - XXX see hpet_detach ? */
 	/* If possible:
 	 * 1. need to deregister timecounter
 	 * 2. need to deregister event timer
 	 * 3. need to deregister virtual IRQ event channels
 	 */
 	return (EBUSY);
 }
 
 static void
 xentimer_percpu_resume(void *arg)
 {
 	device_t dev = (device_t) arg;
 	struct xentimer_softc *sc = device_get_softc(dev);
 
 	xentimer_et_start(&sc->et, sc->et.et_min_period, 0);
 }
 
 static int
 xentimer_resume(device_t dev)
 {
 	int error;
 	int i;
 
 	/* Disable the periodic timer */
 	CPU_FOREACH(i) {
 		error = HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, i, NULL);
 		if (error != 0) {
 			device_printf(dev,
 			    "Error disabling Xen periodic timer on CPU %d\n",
 			    i);
 			return (error);
 		}
 	}
 
 	/* Reset the last uptime value */
 	pvclock_resume();
 
 	/* Reset the RTC clock */
 	inittodr(time_second);
 
 	/* Kick the timers on all CPUs */
 	smp_rendezvous(NULL, xentimer_percpu_resume, NULL, dev);
 
 	if (bootverbose)
 		device_printf(dev, "resumed operation after suspension\n");
 
 	return (0);
 }
 
 static int
 xentimer_suspend(device_t dev)
 {
 	return (0);
 }
 
 /*
  * Xen early clock init
  */
 void
 xen_clock_init(void)
 {
 }
 
 /*
  * Xen PV DELAY function
  *
  * When running on PVH mode we don't have an emulated i8524, so
  * make use of the Xen time info in order to code a simple DELAY
  * function that can be used during early boot.
  */
 void
 xen_delay(int n)
 {
 	struct vcpu_info *vcpu = &HYPERVISOR_shared_info->vcpu_info[0];
 	uint64_t end_ns;
 	uint64_t current;
 
 	end_ns = xen_fetch_vcpu_time(vcpu);
 	end_ns += n * NSEC_IN_USEC;
 
 	for (;;) {
 		current = xen_fetch_vcpu_time(vcpu);
 		if (current >= end_ns)
 			break;
 	}
 }
 
 static device_method_t xentimer_methods[] = {
 	DEVMETHOD(device_identify, xentimer_identify),
 	DEVMETHOD(device_probe, xentimer_probe),
 	DEVMETHOD(device_attach, xentimer_attach),
 	DEVMETHOD(device_detach, xentimer_detach),
 	DEVMETHOD(device_suspend, xentimer_suspend),
 	DEVMETHOD(device_resume, xentimer_resume),
 	/* clock interface */
 	DEVMETHOD(clock_gettime, xentimer_gettime),
 	DEVMETHOD(clock_settime, xentimer_settime),
 	DEVMETHOD_END
 };
 
 static driver_t xentimer_driver = {
 	"xen_et",
 	xentimer_methods,
 	sizeof(struct xentimer_softc),
 };
 
 DRIVER_MODULE(xentimer, xenpv, xentimer_driver, xentimer_devclass, 0, 0);
 MODULE_DEPEND(xentimer, xenpv, 1, 1, 1);
Index: head/sys/kern/kern_sig.c
===================================================================
--- head/sys/kern/kern_sig.c	(revision 336913)
+++ head/sys/kern/kern_sig.c	(revision 336914)
@@ -1,3816 +1,3814 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1989, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kern_sig.c	8.7 (Berkeley) 4/18/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ktrace.h"
 
 #include <sys/param.h>
 #include <sys/ctype.h>
 #include <sys/systm.h>
 #include <sys/signalvar.h>
 #include <sys/vnode.h>
 #include <sys/acct.h>
 #include <sys/bus.h>
 #include <sys/capsicum.h>
 #include <sys/compressor.h>
 #include <sys/condvar.h>
 #include <sys/event.h>
 #include <sys/fcntl.h>
 #include <sys/imgact.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/ktrace.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/refcount.h>
 #include <sys/namei.h>
 #include <sys/proc.h>
 #include <sys/procdesc.h>
 #include <sys/posix4.h>
 #include <sys/pioctl.h>
 #include <sys/racct.h>
 #include <sys/resourcevar.h>
 #include <sys/sdt.h>
 #include <sys/sbuf.h>
 #include <sys/sleepqueue.h>
 #include <sys/smp.h>
 #include <sys/stat.h>
 #include <sys/sx.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/syslog.h>
 #include <sys/sysproto.h>
 #include <sys/timers.h>
 #include <sys/unistd.h>
 #include <sys/wait.h>
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/uma.h>
 
 #include <sys/jail.h>
 
 #include <machine/cpu.h>
 
 #include <security/audit/audit.h>
 
 #define	ONSIG	32		/* NSIG for osig* syscalls.  XXX. */
 
 SDT_PROVIDER_DECLARE(proc);
 SDT_PROBE_DEFINE3(proc, , , signal__send,
     "struct thread *", "struct proc *", "int");
 SDT_PROBE_DEFINE2(proc, , , signal__clear,
     "int", "ksiginfo_t *");
 SDT_PROBE_DEFINE3(proc, , , signal__discard,
     "struct thread *", "struct proc *", "int");
 
 static int	coredump(struct thread *);
 static int	killpg1(struct thread *td, int sig, int pgid, int all,
 		    ksiginfo_t *ksi);
 static int	issignal(struct thread *td);
 static int	sigprop(int sig);
 static void	tdsigwakeup(struct thread *, int, sig_t, int);
 static int	sig_suspend_threads(struct thread *, struct proc *, int);
 static int	filt_sigattach(struct knote *kn);
 static void	filt_sigdetach(struct knote *kn);
 static int	filt_signal(struct knote *kn, long hint);
 static struct thread *sigtd(struct proc *p, int sig, int prop);
 static void	sigqueue_start(void);
 
 static uma_zone_t	ksiginfo_zone = NULL;
 struct filterops sig_filtops = {
 	.f_isfd = 0,
 	.f_attach = filt_sigattach,
 	.f_detach = filt_sigdetach,
 	.f_event = filt_signal,
 };
 
 static int	kern_logsigexit = 1;
 SYSCTL_INT(_kern, KERN_LOGSIGEXIT, logsigexit, CTLFLAG_RW,
     &kern_logsigexit, 0,
     "Log processes quitting on abnormal signals to syslog(3)");
 
 static int	kern_forcesigexit = 1;
 SYSCTL_INT(_kern, OID_AUTO, forcesigexit, CTLFLAG_RW,
     &kern_forcesigexit, 0, "Force trap signal to be handled");
 
 static SYSCTL_NODE(_kern, OID_AUTO, sigqueue, CTLFLAG_RW, 0,
     "POSIX real time signal");
 
 static int	max_pending_per_proc = 128;
 SYSCTL_INT(_kern_sigqueue, OID_AUTO, max_pending_per_proc, CTLFLAG_RW,
     &max_pending_per_proc, 0, "Max pending signals per proc");
 
 static int	preallocate_siginfo = 1024;
 SYSCTL_INT(_kern_sigqueue, OID_AUTO, preallocate, CTLFLAG_RDTUN,
     &preallocate_siginfo, 0, "Preallocated signal memory size");
 
 static int	signal_overflow = 0;
 SYSCTL_INT(_kern_sigqueue, OID_AUTO, overflow, CTLFLAG_RD,
     &signal_overflow, 0, "Number of signals overflew");
 
 static int	signal_alloc_fail = 0;
 SYSCTL_INT(_kern_sigqueue, OID_AUTO, alloc_fail, CTLFLAG_RD,
     &signal_alloc_fail, 0, "signals failed to be allocated");
 
 static int	kern_lognosys = 0;
 SYSCTL_INT(_kern, OID_AUTO, lognosys, CTLFLAG_RWTUN, &kern_lognosys, 0,
     "Log invalid syscalls");
 
 SYSINIT(signal, SI_SUB_P1003_1B, SI_ORDER_FIRST+3, sigqueue_start, NULL);
 
 /*
  * Policy -- Can ucred cr1 send SIGIO to process cr2?
  * Should use cr_cansignal() once cr_cansignal() allows SIGIO and SIGURG
  * in the right situations.
  */
 #define CANSIGIO(cr1, cr2) \
 	((cr1)->cr_uid == 0 || \
 	    (cr1)->cr_ruid == (cr2)->cr_ruid || \
 	    (cr1)->cr_uid == (cr2)->cr_ruid || \
 	    (cr1)->cr_ruid == (cr2)->cr_uid || \
 	    (cr1)->cr_uid == (cr2)->cr_uid)
 
 static int	sugid_coredump;
 SYSCTL_INT(_kern, OID_AUTO, sugid_coredump, CTLFLAG_RWTUN,
     &sugid_coredump, 0, "Allow setuid and setgid processes to dump core");
 
 static int	capmode_coredump;
 SYSCTL_INT(_kern, OID_AUTO, capmode_coredump, CTLFLAG_RWTUN,
     &capmode_coredump, 0, "Allow processes in capability mode to dump core");
 
 static int	do_coredump = 1;
 SYSCTL_INT(_kern, OID_AUTO, coredump, CTLFLAG_RW,
 	&do_coredump, 0, "Enable/Disable coredumps");
 
 static int	set_core_nodump_flag = 0;
 SYSCTL_INT(_kern, OID_AUTO, nodump_coredump, CTLFLAG_RW, &set_core_nodump_flag,
 	0, "Enable setting the NODUMP flag on coredump files");
 
 static int	coredump_devctl = 0;
 SYSCTL_INT(_kern, OID_AUTO, coredump_devctl, CTLFLAG_RW, &coredump_devctl,
 	0, "Generate a devctl notification when processes coredump");
 
 /*
  * Signal properties and actions.
  * The array below categorizes the signals and their default actions
  * according to the following properties:
  */
 #define	SIGPROP_KILL		0x01	/* terminates process by default */
 #define	SIGPROP_CORE		0x02	/* ditto and coredumps */
 #define	SIGPROP_STOP		0x04	/* suspend process */
 #define	SIGPROP_TTYSTOP		0x08	/* ditto, from tty */
 #define	SIGPROP_IGNORE		0x10	/* ignore by default */
 #define	SIGPROP_CONT		0x20	/* continue if suspended */
 #define	SIGPROP_CANTMASK	0x40	/* non-maskable, catchable */
 
 static int sigproptbl[NSIG] = {
 	[SIGHUP] =	SIGPROP_KILL,
 	[SIGINT] =	SIGPROP_KILL,
 	[SIGQUIT] =	SIGPROP_KILL | SIGPROP_CORE,
 	[SIGILL] =	SIGPROP_KILL | SIGPROP_CORE,
 	[SIGTRAP] =	SIGPROP_KILL | SIGPROP_CORE,
 	[SIGABRT] =	SIGPROP_KILL | SIGPROP_CORE,
 	[SIGEMT] =	SIGPROP_KILL | SIGPROP_CORE,
 	[SIGFPE] =	SIGPROP_KILL | SIGPROP_CORE,
 	[SIGKILL] =	SIGPROP_KILL,
 	[SIGBUS] =	SIGPROP_KILL | SIGPROP_CORE,
 	[SIGSEGV] =	SIGPROP_KILL | SIGPROP_CORE,
 	[SIGSYS] =	SIGPROP_KILL | SIGPROP_CORE,
 	[SIGPIPE] =	SIGPROP_KILL,
 	[SIGALRM] =	SIGPROP_KILL,
 	[SIGTERM] =	SIGPROP_KILL,
 	[SIGURG] =	SIGPROP_IGNORE,
 	[SIGSTOP] =	SIGPROP_STOP,
 	[SIGTSTP] =	SIGPROP_STOP | SIGPROP_TTYSTOP,
 	[SIGCONT] =	SIGPROP_IGNORE | SIGPROP_CONT,
 	[SIGCHLD] =	SIGPROP_IGNORE,
 	[SIGTTIN] =	SIGPROP_STOP | SIGPROP_TTYSTOP,
 	[SIGTTOU] =	SIGPROP_STOP | SIGPROP_TTYSTOP,
 	[SIGIO] =	SIGPROP_IGNORE,
 	[SIGXCPU] =	SIGPROP_KILL,
 	[SIGXFSZ] =	SIGPROP_KILL,
 	[SIGVTALRM] =	SIGPROP_KILL,
 	[SIGPROF] =	SIGPROP_KILL,
 	[SIGWINCH] =	SIGPROP_IGNORE,
 	[SIGINFO] =	SIGPROP_IGNORE,
 	[SIGUSR1] =	SIGPROP_KILL,
 	[SIGUSR2] =	SIGPROP_KILL,
 };
 
 static void reschedule_signals(struct proc *p, sigset_t block, int flags);
 
 static void
 sigqueue_start(void)
 {
 	ksiginfo_zone = uma_zcreate("ksiginfo", sizeof(ksiginfo_t),
 		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	uma_prealloc(ksiginfo_zone, preallocate_siginfo);
 	p31b_setcfg(CTL_P1003_1B_REALTIME_SIGNALS, _POSIX_REALTIME_SIGNALS);
 	p31b_setcfg(CTL_P1003_1B_RTSIG_MAX, SIGRTMAX - SIGRTMIN + 1);
 	p31b_setcfg(CTL_P1003_1B_SIGQUEUE_MAX, max_pending_per_proc);
 }
 
 /*
  * Allocate a zero-filled ksiginfo from ksiginfo_zone.  When 'wait' is
  * zero M_NOWAIT is added to the allocation flags.  Returns NULL if the
  * zone has not been created; otherwise returns whatever uma_zalloc()
  * yields.
  */
 ksiginfo_t *
 ksiginfo_alloc(int wait)
 {
 	int mflags;
 
 	if (ksiginfo_zone == NULL)
 		return (NULL);
 	mflags = (wait != 0) ? M_ZERO : (M_ZERO | M_NOWAIT);
 	return ((ksiginfo_t *)uma_zalloc(ksiginfo_zone, mflags));
 }
 
 /*
  * Return a ksiginfo to ksiginfo_zone unconditionally (no KSI_EXT check;
  * compare ksiginfo_tryfree()).
  */
 void
 ksiginfo_free(ksiginfo_t *ksi)
 {
 	uma_zfree(ksiginfo_zone, ksi);
 }
 
 /*
  * Free a ksiginfo back to ksiginfo_zone unless it is flagged KSI_EXT.
  * Returns 1 if the item was freed, 0 if it was left alone.
  */
 static __inline int
 ksiginfo_tryfree(ksiginfo_t *ksi)
 {
 	if ((ksi->ksi_flags & KSI_EXT) != 0)
 		return (0);
 	uma_zfree(ksiginfo_zone, ksi);
 	return (1);
 }
 
 /*
  * Set up an empty signal queue: no pending bits, no queued ksiginfo
  * entries.  'p' is the process charged for queued entries and may be
  * NULL.
  */
 void
 sigqueue_init(sigqueue_t *list, struct proc *p)
 {
 	list->sq_proc = p;
 	TAILQ_INIT(&list->sq_list);
 	SIGEMPTYSET(list->sq_ptrace);
 	SIGEMPTYSET(list->sq_kill);
 	SIGEMPTYSET(list->sq_signals);
 	list->sq_flags = SQ_INIT;
 }
 
 /*
  * Get a signal's ksiginfo.
  *
  * Consumes one pending instance of 'signo' from 'sq' and describes it in
  * '*si'.  An instance may be recorded as a bit in sq_ptrace, a bit in
  * sq_kill, or an entry on sq_list; they are consumed in that order of
  * preference.  The summary bit in sq_signals is cleared only when no
  * further instance of the signal was seen.
  *
  * Return:
  *	0	-	signal not found
  *	others	-	signal number
  */
 static int
 sigqueue_get(sigqueue_t *sq, int signo, ksiginfo_t *si)
 {
 	struct proc *p = sq->sq_proc;
 	struct ksiginfo *ksi, *next;
 	/* Number of pending instances of signo seen so far (capped at 2). */
 	int count = 0;
 
 	KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited"));
 
 	if (!SIGISMEMBER(sq->sq_signals, signo))
 		return (0);
 
 	if (SIGISMEMBER(sq->sq_ptrace, signo)) {
 		count++;
 		SIGDELSET(sq->sq_ptrace, signo);
 		si->ksi_flags |= KSI_PTRACE;
 	}
 	if (SIGISMEMBER(sq->sq_kill, signo)) {
 		count++;
 		/*
 		 * Consume the kill bit only if the ptrace bit was not
 		 * already consumed by this call.
 		 */
 		if (count == 1)
 			SIGDELSET(sq->sq_kill, signo);
 	}
 
 	TAILQ_FOREACH_SAFE(ksi, &sq->sq_list, ksi_link, next) {
 		if (ksi->ksi_signo == signo) {
 			/*
 			 * Dequeue and copy out the first matching entry only
 			 * when neither bit above supplied the instance.
 			 */
 			if (count == 0) {
 				TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link);
 				ksi->ksi_sigq = NULL;
 				ksiginfo_copy(ksi, si);
 				if (ksiginfo_tryfree(ksi) && p != NULL)
 					p->p_pendingcnt--;
 			}
 			/* A second instance exists; no need to look further. */
 			if (++count > 1)
 				break;
 		}
 	}
 
 	/* Nothing else pending for signo: drop the summary bit. */
 	if (count <= 1)
 		SIGDELSET(sq->sq_signals, signo);
 	si->ksi_signo = signo;
 	return (signo);
 }
 
 /*
  * Unlink 'ksi' from the signal queue it is currently on, if any.  The
  * owning process's pending count is decremented unless the entry is
  * flagged KSI_EXT.  The queue's sq_signals bit for the signal is cleared
  * when no other queued entry, sq_kill bit or sq_ptrace bit still refers
  * to it.  The ksiginfo itself is not freed.
  */
 void
 sigqueue_take(ksiginfo_t *ksi)
 {
 	struct ksiginfo *other;
 	struct proc *p;
 	sigqueue_t *sq;
 	int signo;
 
 	if (ksi == NULL)
 		return;
 	sq = ksi->ksi_sigq;
 	if (sq == NULL)
 		return;
 
 	p = sq->sq_proc;
 	TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link);
 	ksi->ksi_sigq = NULL;
 	if ((ksi->ksi_flags & KSI_EXT) == 0 && p != NULL)
 		p->p_pendingcnt--;
 
 	/* Is the same signal still pending in some other form? */
 	signo = ksi->ksi_signo;
 	TAILQ_FOREACH(other, &sq->sq_list, ksi_link) {
 		if (other->ksi_signo == signo)
 			return;
 	}
 	if (SIGISMEMBER(sq->sq_kill, signo) ||
 	    SIGISMEMBER(sq->sq_ptrace, signo))
 		return;
 	SIGDELSET(sq->sq_signals, signo);
 }
 
 /*
  * Record signal 'signo' as pending on queue 'sq'.
  *
  * When 'si' is supplied, it is either linked into the queue directly
  * (KSI_INS) or copied into a freshly allocated ksiginfo.  When no
  * ksiginfo can be queued, the signal is still recorded as a bare bit in
  * sq_ptrace or sq_kill unless it was flagged KSI_SIGQ without KSI_TRAP or
  * KSI_PTRACE; in that one case EAGAIN is returned and nothing is
  * recorded.
  *
  * Returns 0 on success or EAGAIN as described above.
  */
 static int
 sigqueue_add(sigqueue_t *sq, int signo, ksiginfo_t *si)
 {
 	struct proc *p = sq->sq_proc;
 	struct ksiginfo *ksi;
 	int ret = 0;
 
 	KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited"));
 
 	/*
 	 * SIGKILL/SIGSTOP cannot be caught or masked, so take the fast path
 	 * for these signals.  The same path is taken when the caller
 	 * supplied no siginfo at all.
 	 */
 	if (signo == SIGKILL || signo == SIGSTOP || si == NULL) {
 		SIGADDSET(sq->sq_kill, signo);
 		goto out_set_bit;
 	}
 
 	/* directly insert the ksi, don't copy it */
 	if (si->ksi_flags & KSI_INS) {
 		if (si->ksi_flags & KSI_HEAD)
 			TAILQ_INSERT_HEAD(&sq->sq_list, si, ksi_link);
 		else
 			TAILQ_INSERT_TAIL(&sq->sq_list, si, ksi_link);
 		si->ksi_sigq = sq;
 		goto out_set_bit;
 	}
 
 	/* The allocation zone does not exist yet: record a bare bit. */
 	if (__predict_false(ksiginfo_zone == NULL)) {
 		SIGADDSET(sq->sq_kill, signo);
 		goto out_set_bit;
 	}
 
 	if (p != NULL && p->p_pendingcnt >= max_pending_per_proc) {
 		/* The per-process limit on queued entries has been reached. */
 		signal_overflow++;
 		ret = EAGAIN;
 	} else if ((ksi = ksiginfo_alloc(0)) == NULL) {
 		signal_alloc_fail++;
 		ret = EAGAIN;
 	} else {
 		if (p != NULL)
 			p->p_pendingcnt++;
 		ksiginfo_copy(si, ksi);
 		ksi->ksi_signo = signo;
 		if (si->ksi_flags & KSI_HEAD)
 			TAILQ_INSERT_HEAD(&sq->sq_list, ksi, ksi_link);
 		else
 			TAILQ_INSERT_TAIL(&sq->sq_list, ksi, ksi_link);
 		ksi->ksi_sigq = sq;
 	}
 
 	if (ret != 0) {
 		/*
 		 * Queueing failed.  Fall back to a bare pending bit where
 		 * that is allowed, otherwise report the failure.
 		 */
 		if ((si->ksi_flags & KSI_PTRACE) != 0) {
 			SIGADDSET(sq->sq_ptrace, signo);
 			ret = 0;
 			goto out_set_bit;
 		} else if ((si->ksi_flags & KSI_TRAP) != 0 ||
 		    (si->ksi_flags & KSI_SIGQ) == 0) {
 			SIGADDSET(sq->sq_kill, signo);
 			ret = 0;
 			goto out_set_bit;
 		}
 		return (ret);
 	}
 
 out_set_bit:
 	SIGADDSET(sq->sq_signals, signo);
 	return (ret);
 }
 
 /*
  * Discard everything pending on 'sq': unlink every queued ksiginfo
  * (freeing those that ksiginfo_tryfree() accepts and uncharging the
  * owning process for them) and clear all three pending-bit sets.
  */
 void
 sigqueue_flush(sigqueue_t *sq)
 {
 	struct proc *p;
 	ksiginfo_t *ksi, *ksi_next;
 
 	KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited"));
 
 	p = sq->sq_proc;
 	if (p != NULL)
 		PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	TAILQ_FOREACH_SAFE(ksi, &sq->sq_list, ksi_link, ksi_next) {
 		TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link);
 		ksi->ksi_sigq = NULL;
 		if (ksiginfo_tryfree(ksi) && p != NULL)
 			p->p_pendingcnt--;
 	}
 
 	SIGEMPTYSET(sq->sq_ptrace);
 	SIGEMPTYSET(sq->sq_kill);
 	SIGEMPTYSET(sq->sq_signals);
 }
 
 /*
  * Move every pending instance of the signals in '*set' from queue 'src'
  * to queue 'dst'.  Queued ksiginfo entries are appended to dst in their
  * original relative order, and the per-process pending counts of the two
  * owners (either of which may be NULL) are adjusted to match.  The
  * sq_kill, sq_ptrace and sq_signals bits for those signals are
  * transferred as well.
  */
 static void
 sigqueue_move_set(sigqueue_t *src, sigqueue_t *dst, const sigset_t *set)
 {
 	sigset_t tmp;
 	struct proc *p1, *p2;
 	ksiginfo_t *ksi, *next;
 
 	KASSERT(src->sq_flags & SQ_INIT, ("src sigqueue not inited"));
 	KASSERT(dst->sq_flags & SQ_INIT, ("dst sigqueue not inited"));
 	p1 = src->sq_proc;
 	p2 = dst->sq_proc;
 	/* Move siginfo to target list */
 	TAILQ_FOREACH_SAFE(ksi, &src->sq_list, ksi_link, next) {
 		if (SIGISMEMBER(*set, ksi->ksi_signo)) {
 			TAILQ_REMOVE(&src->sq_list, ksi, ksi_link);
 			if (p1 != NULL)
 				p1->p_pendingcnt--;
 			TAILQ_INSERT_TAIL(&dst->sq_list, ksi, ksi_link);
 			ksi->ksi_sigq = dst;
 			if (p2 != NULL)
 				p2->p_pendingcnt++;
 		}
 	}
 
 	/*
 	 * Move pending bits to target list.  For each bit set: tmp holds
 	 * the bits of src that are also in *set; they are OR-ed into dst
 	 * and then cleared from src.
 	 */
 	tmp = src->sq_kill;
 	SIGSETAND(tmp, *set);
 	SIGSETOR(dst->sq_kill, tmp);
 	SIGSETNAND(src->sq_kill, tmp);
 
 	tmp = src->sq_ptrace;
 	SIGSETAND(tmp, *set);
 	SIGSETOR(dst->sq_ptrace, tmp);
 	SIGSETNAND(src->sq_ptrace, tmp);
 
 	tmp = src->sq_signals;
 	SIGSETAND(tmp, *set);
 	SIGSETOR(dst->sq_signals, tmp);
 	SIGSETNAND(src->sq_signals, tmp);
 }
 
 #if 0
 /*
  * Compiled out.  Single-signal convenience wrapper around
  * sigqueue_move_set(): moves all pending instances of 'signo' from
  * 'src' to 'dst'.
  */
 static void
 sigqueue_move(sigqueue_t *src, sigqueue_t *dst, int signo)
 {
 	sigset_t set;
 
 	SIGEMPTYSET(set);
 	SIGADDSET(set, signo);
 	sigqueue_move_set(src, dst, &set);
 }
 #endif
 
 /*
  * Drop every pending instance of the signals in '*set' from 'sq':
  * queued ksiginfo entries are unlinked (and freed when
  * ksiginfo_tryfree() allows it), and the matching bits are cleared from
  * all three pending sets.
  */
 static void
 sigqueue_delete_set(sigqueue_t *sq, const sigset_t *set)
 {
 	struct proc *p;
 	ksiginfo_t *ksi, *ksi_next;
 
 	KASSERT(sq->sq_flags & SQ_INIT, ("src sigqueue not inited"));
 
 	p = sq->sq_proc;
 
 	/* Remove matching entries from the siginfo queue. */
 	TAILQ_FOREACH_SAFE(ksi, &sq->sq_list, ksi_link, ksi_next) {
 		if (!SIGISMEMBER(*set, ksi->ksi_signo))
 			continue;
 		TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link);
 		ksi->ksi_sigq = NULL;
 		if (ksiginfo_tryfree(ksi) && p != NULL)
 			p->p_pendingcnt--;
 	}
 
 	SIGSETNAND(sq->sq_signals, *set);
 	SIGSETNAND(sq->sq_ptrace, *set);
 	SIGSETNAND(sq->sq_kill, *set);
 }
 
 /*
  * Drop every pending instance of the single signal 'signo' from 'sq';
  * a one-element-set wrapper around sigqueue_delete_set().
  */
 void
 sigqueue_delete(sigqueue_t *sq, int signo)
 {
 	sigset_t set;
 
 	SIGEMPTYSET(set);
 	SIGADDSET(set, signo);
 	sigqueue_delete_set(sq, &set);
 }
 
 /* Remove a set of signals for a process */
 static void
 sigqueue_delete_set_proc(struct proc *p, const sigset_t *set)
 {
 	sigqueue_t worklist;
 	struct thread *td0;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	sigqueue_init(&worklist, NULL);
 	sigqueue_move_set(&p->p_sigqueue, &worklist, set);
 
 	FOREACH_THREAD_IN_PROC(p, td0)
 		sigqueue_move_set(&td0->td_sigqueue, &worklist, set);
 
 	sigqueue_flush(&worklist);
 }
 
 /*
  * Remove every pending instance of 'signo' from process 'p' and all of
  * its threads; a one-element-set wrapper around
  * sigqueue_delete_set_proc(), which asserts that the process lock is
  * held.
  */
 void
 sigqueue_delete_proc(struct proc *p, int signo)
 {
 	sigset_t set;
 
 	SIGEMPTYSET(set);
 	SIGADDSET(set, signo);
 	sigqueue_delete_set_proc(p, &set);
 }
 
 /*
  * Remove all pending stop signals (SIGSTOP, SIGTSTP, SIGTTIN, SIGTTOU)
  * from process 'p' and all of its threads.
  */
 static void
 sigqueue_delete_stopmask_proc(struct proc *p)
 {
 	sigset_t set;
 
 	SIGEMPTYSET(set);
 	SIGADDSET(set, SIGSTOP);
 	SIGADDSET(set, SIGTSTP);
 	SIGADDSET(set, SIGTTIN);
 	SIGADDSET(set, SIGTTOU);
 	sigqueue_delete_set_proc(p, &set);
 }
 
 /*
  * Determine signal that should be delivered to thread td, the current
  * thread, 0 if none.  If there is a pending stop signal with default
  * action, the process stops in issignal().
  *
  * The process lock and the sigacts mutex must be held; the thread lock
  * must not be.
  */
 int
 cursig(struct thread *td)
 {
 	PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
 	mtx_assert(&td->td_proc->p_sigacts->ps_mtx, MA_OWNED);
 	THREAD_LOCK_ASSERT(td, MA_NOTOWNED);
 
 	if (!SIGPENDING(td))
 		return (0);
 	return (issignal(td));
 }
 
 /*
  * Arrange for ast() to handle unmasked pending signals on return to user
  * mode.  This must be called whenever a signal is added to td_sigqueue or
  * unmasked in td_sigmask.  Does nothing when SIGPENDING(td) is false.
  */
 void
 signotify(struct thread *td)
 {
 
 	PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
 
 	if (!SIGPENDING(td))
 		return;
 
 	thread_lock(td);
 	td->td_flags |= TDF_NEEDSIGCHK | TDF_ASTPENDING;
 	thread_unlock(td);
 }
 
 /*
  * Report whether stack pointer 'sp' lies on the current thread's
  * alternate signal stack.  Returns 0 when no alternate stack is enabled
  * (TDP_ALTSTACK clear).  Under COMPAT_43 a stack of size zero is judged
  * by the SS_ONSTACK bit in ss_flags instead of by a range check.
  */
 int
 sigonstack(size_t sp)
 {
 	struct thread *td = curthread;
 
 	if ((td->td_pflags & TDP_ALTSTACK) == 0)
 		return (0);
 #if defined(COMPAT_43)
 	if (td->td_sigstk.ss_size == 0)
 		return (td->td_sigstk.ss_flags & SS_ONSTACK);
 #endif
 	/* Unsigned wrap-around makes this a single-compare range check. */
 	return ((sp - (size_t)td->td_sigstk.ss_sp) < td->td_sigstk.ss_size);
 }
 
 /*
  * Look up the SIGPROP_* flags for 'sig' in sigproptbl.  Signal numbers
  * outside the table (including 0 and negatives) have no properties.
  */
 static __inline int
 sigprop(int sig)
 {
 
 	if (sig <= 0 || sig >= nitems(sigproptbl))
 		return (0);
 	return (sigproptbl[sig]);
 }
 
 /*
  * Find the lowest set bit in '*set'.  Returns its 1-based position
  * counted across all words of the set (32 bits per word), or 0 if the
  * set is empty.
  */
 int
 sig_ffs(sigset_t *set)
 {
 	int word;
 
 	for (word = 0; word < _SIG_WORDS; word++) {
 		if (set->__bits[word] == 0)
 			continue;
 		return (ffs(set->__bits[word]) + (word * 32));
 	}
 	return (0);
 }
 
 static bool
 sigact_flag_test(const struct sigaction *act, int flag)
 {
 
 	/*
 	 * SA_SIGINFO is reset when signal disposition is set to
 	 * ignore or default.  Other flags are kept according to user
 	 * settings.
 	 */
 	return ((act->sa_flags & flag) != 0 && (flag != SA_SIGINFO ||
 	    ((__sighandler_t *)act->sa_sigaction != SIG_IGN &&
 	    (__sighandler_t *)act->sa_sigaction != SIG_DFL)));
 }
 
 /*
  * kern_sigaction
  *
  * Common implementation behind sigaction, freebsd4_sigaction and
  * osigaction: report and/or change the disposition of signal 'sig' for
  * td's process.
  *
  * If 'oact' is non-NULL it receives the current disposition.  If 'act' is
  * non-NULL the disposition is replaced by it.  'flags' may carry
  * KSA_FREEBSD4 or KSA_OSIGSET, which are recorded per signal in
  * ps_freebsd4 / ps_osigset when the corresponding COMPAT option is
  * compiled in.
  *
  * Returns 0 on success, or EINVAL when 'sig' is not a valid signal,
  * when a real handler is installed with unsupported sa_flags bits, or
  * when a non-default action is requested for SIGKILL or SIGSTOP.
  *
  * The process lock and the sigacts mutex are taken and released here.
  */
 int
 kern_sigaction(struct thread *td, int sig, const struct sigaction *act,
     struct sigaction *oact, int flags)
 {
 	struct sigacts *ps;
 	struct proc *p = td->td_proc;
 
 	if (!_SIG_VALID(sig))
 		return (EINVAL);
 	/* Unknown sa_flags bits are rejected only for real handlers. */
 	if (act != NULL && act->sa_handler != SIG_DFL &&
 	    act->sa_handler != SIG_IGN && (act->sa_flags & ~(SA_ONSTACK |
 	    SA_RESTART | SA_RESETHAND | SA_NOCLDSTOP | SA_NODEFER |
 	    SA_NOCLDWAIT | SA_SIGINFO)) != 0)
 		return (EINVAL);
 
 	PROC_LOCK(p);
 	ps = p->p_sigacts;
 	mtx_lock(&ps->ps_mtx);
 	if (oact) {
 		/* Rebuild a struct sigaction from the per-signal state. */
 		memset(oact, 0, sizeof(*oact));
 		oact->sa_mask = ps->ps_catchmask[_SIG_IDX(sig)];
 		if (SIGISMEMBER(ps->ps_sigonstack, sig))
 			oact->sa_flags |= SA_ONSTACK;
 		/* ps_sigintr has the opposite sense of SA_RESTART. */
 		if (!SIGISMEMBER(ps->ps_sigintr, sig))
 			oact->sa_flags |= SA_RESTART;
 		if (SIGISMEMBER(ps->ps_sigreset, sig))
 			oact->sa_flags |= SA_RESETHAND;
 		if (SIGISMEMBER(ps->ps_signodefer, sig))
 			oact->sa_flags |= SA_NODEFER;
 		if (SIGISMEMBER(ps->ps_siginfo, sig)) {
 			oact->sa_flags |= SA_SIGINFO;
 			oact->sa_sigaction =
 			    (__siginfohandler_t *)ps->ps_sigact[_SIG_IDX(sig)];
 		} else
 			oact->sa_handler = ps->ps_sigact[_SIG_IDX(sig)];
 		if (sig == SIGCHLD && ps->ps_flag & PS_NOCLDSTOP)
 			oact->sa_flags |= SA_NOCLDSTOP;
 		if (sig == SIGCHLD && ps->ps_flag & PS_NOCLDWAIT)
 			oact->sa_flags |= SA_NOCLDWAIT;
 	}
 	if (act) {
 		/* SIGKILL and SIGSTOP only accept the default action. */
 		if ((sig == SIGKILL || sig == SIGSTOP) &&
 		    act->sa_handler != SIG_DFL) {
 			mtx_unlock(&ps->ps_mtx);
 			PROC_UNLOCK(p);
 			return (EINVAL);
 		}
 
 		/*
 		 * Change setting atomically.
 		 */
 
 		ps->ps_catchmask[_SIG_IDX(sig)] = act->sa_mask;
 		SIG_CANTMASK(ps->ps_catchmask[_SIG_IDX(sig)]);
 		if (sigact_flag_test(act, SA_SIGINFO)) {
 			ps->ps_sigact[_SIG_IDX(sig)] =
 			    (__sighandler_t *)act->sa_sigaction;
 			SIGADDSET(ps->ps_siginfo, sig);
 		} else {
 			ps->ps_sigact[_SIG_IDX(sig)] = act->sa_handler;
 			SIGDELSET(ps->ps_siginfo, sig);
 		}
 		if (!sigact_flag_test(act, SA_RESTART))
 			SIGADDSET(ps->ps_sigintr, sig);
 		else
 			SIGDELSET(ps->ps_sigintr, sig);
 		if (sigact_flag_test(act, SA_ONSTACK))
 			SIGADDSET(ps->ps_sigonstack, sig);
 		else
 			SIGDELSET(ps->ps_sigonstack, sig);
 		if (sigact_flag_test(act, SA_RESETHAND))
 			SIGADDSET(ps->ps_sigreset, sig);
 		else
 			SIGDELSET(ps->ps_sigreset, sig);
 		if (sigact_flag_test(act, SA_NODEFER))
 			SIGADDSET(ps->ps_signodefer, sig);
 		else
 			SIGDELSET(ps->ps_signodefer, sig);
 		if (sig == SIGCHLD) {
 			if (act->sa_flags & SA_NOCLDSTOP)
 				ps->ps_flag |= PS_NOCLDSTOP;
 			else
 				ps->ps_flag &= ~PS_NOCLDSTOP;
 			if (act->sa_flags & SA_NOCLDWAIT) {
 				/*
 				 * Paranoia: since SA_NOCLDWAIT is implemented
 				 * by reparenting the dying child to PID 1 (and
 				 * trust it to reap the zombie), PID 1 itself
 				 * is forbidden to set SA_NOCLDWAIT.
 				 */
 				if (p->p_pid == 1)
 					ps->ps_flag &= ~PS_NOCLDWAIT;
 				else
 					ps->ps_flag |= PS_NOCLDWAIT;
 			} else
 				ps->ps_flag &= ~PS_NOCLDWAIT;
 			if (ps->ps_sigact[_SIG_IDX(SIGCHLD)] == SIG_IGN)
 				ps->ps_flag |= PS_CLDSIGIGN;
 			else
 				ps->ps_flag &= ~PS_CLDSIGIGN;
 		}
 		/*
 		 * Set bit in ps_sigignore for signals that are set to SIG_IGN,
 		 * and for signals set to SIG_DFL where the default is to
 		 * ignore. However, don't put SIGCONT in ps_sigignore, as we
 		 * have to restart the process.
 		 */
 		if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN ||
 		    (sigprop(sig) & SIGPROP_IGNORE &&
 		     ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL)) {
 			/* never to be seen again */
 			sigqueue_delete_proc(p, sig);
 			if (sig != SIGCONT)
 				/* easier in psignal */
 				SIGADDSET(ps->ps_sigignore, sig);
 			SIGDELSET(ps->ps_sigcatch, sig);
 		} else {
 			SIGDELSET(ps->ps_sigignore, sig);
 			if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL)
 				SIGDELSET(ps->ps_sigcatch, sig);
 			else
 				SIGADDSET(ps->ps_sigcatch, sig);
 		}
 #ifdef COMPAT_FREEBSD4
 		/* Remember whether a real handler was set by the old ABI. */
 		if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN ||
 		    ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL ||
 		    (flags & KSA_FREEBSD4) == 0)
 			SIGDELSET(ps->ps_freebsd4, sig);
 		else
 			SIGADDSET(ps->ps_freebsd4, sig);
 #endif
 #ifdef COMPAT_43
 		/* Likewise for handlers installed via the 4.3BSD calls. */
 		if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN ||
 		    ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL ||
 		    (flags & KSA_OSIGSET) == 0)
 			SIGDELSET(ps->ps_osigset, sig);
 		else
 			SIGADDSET(ps->ps_osigset, sig);
 #endif
 	}
 	mtx_unlock(&ps->ps_mtx);
 	PROC_UNLOCK(p);
 	return (0);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct sigaction_args {
 	int	sig;
 	struct	sigaction *act;
 	struct	sigaction *oact;
 };
 #endif
 int
 sys_sigaction(struct thread *td, struct sigaction_args *uap)
 {
 	struct sigaction act, oact;
 	struct sigaction *actp, *oactp;
 	int error;
 
 	actp = (uap->act != NULL) ? &act : NULL;
 	oactp = (uap->oact != NULL) ? &oact : NULL;
 	if (actp) {
 		error = copyin(uap->act, actp, sizeof(act));
 		if (error)
 			return (error);
 	}
 	error = kern_sigaction(td, uap->sig, actp, oactp, 0);
 	if (oactp && !error)
 		error = copyout(oactp, uap->oact, sizeof(oact));
 	return (error);
 }
 
 #ifdef COMPAT_FREEBSD4
 #ifndef _SYS_SYSPROTO_H_
 struct freebsd4_sigaction_args {
 	int	sig;
 	struct	sigaction *act;
 	struct	sigaction *oact;
 };
 #endif
 /*
  * FreeBSD 4 compatible sigaction: identical to sys_sigaction() except
  * that KSA_FREEBSD4 is passed to kern_sigaction().
  */
 int
 freebsd4_sigaction(struct thread *td, struct freebsd4_sigaction_args *uap)
 {
 	struct sigaction newact, oldact;
 	struct sigaction *newp, *oldp;
 	int error;
 
 	newp = NULL;
 	oldp = NULL;
 	if (uap->oact != NULL)
 		oldp = &oldact;
 	if (uap->act != NULL) {
 		newp = &newact;
 		error = copyin(uap->act, newp, sizeof(newact));
 		if (error != 0)
 			return (error);
 	}
 
 	error = kern_sigaction(td, uap->sig, newp, oldp, KSA_FREEBSD4);
 	if (error == 0 && oldp != NULL)
 		error = copyout(oldp, uap->oact, sizeof(oldact));
 	return (error);
 }
 #endif	/* COMPAT_FREEBSD4 */
 
 #ifdef COMPAT_43	/* XXX - COMPAT_FBSD3 */
 #ifndef _SYS_SYSPROTO_H_
 struct osigaction_args {
 	int	signum;
 	struct	osigaction *nsa;
 	struct	osigaction *osa;
 };
 #endif
 int
 osigaction(struct thread *td, struct osigaction_args *uap)
 {
 	struct osigaction sa;
 	struct sigaction nsa, osa;
 	struct sigaction *nsap, *osap;
 	int error;
 
 	if (uap->signum <= 0 || uap->signum >= ONSIG)
 		return (EINVAL);
 
 	nsap = (uap->nsa != NULL) ? &nsa : NULL;
 	osap = (uap->osa != NULL) ? &osa : NULL;
 
 	if (nsap) {
 		error = copyin(uap->nsa, &sa, sizeof(sa));
 		if (error)
 			return (error);
 		nsap->sa_handler = sa.sa_handler;
 		nsap->sa_flags = sa.sa_flags;
 		OSIG2SIG(sa.sa_mask, nsap->sa_mask);
 	}
 	error = kern_sigaction(td, uap->signum, nsap, osap, KSA_OSIGSET);
 	if (osap && !error) {
 		sa.sa_handler = osap->sa_handler;
 		sa.sa_flags = osap->sa_flags;
 		SIG2OSIG(osap->sa_mask, sa.sa_mask);
 		error = copyout(&sa, uap->osa, sizeof(sa));
 	}
 	return (error);
 }
 
 #if !defined(__i386__)
 /*
  * Avoid replicating the same stub everywhere: on everything except i386
  * this syscall is simply forwarded to nosys().
  */
 int
 osigreturn(struct thread *td, struct osigreturn_args *uap)
 {
 
 	return (nosys(td, (struct nosys_args *)uap));
 }
 #endif
 #endif /* COMPAT_43 */
 
 /*
  * Initialize signal state for process 0;
  * set to ignore signals that are ignored by default.
  * SIGCONT is deliberately left out of ps_sigignore.
  */
 void
 siginit(struct proc *p)
 {
 	struct sigacts *ps;
 	int sig;
 
 	PROC_LOCK(p);
 	ps = p->p_sigacts;
 	mtx_lock(&ps->ps_mtx);
 	for (sig = 1; sig <= NSIG; sig++) {
 		if (sig == SIGCONT || (sigprop(sig) & SIGPROP_IGNORE) == 0)
 			continue;
 		SIGADDSET(ps->ps_sigignore, sig);
 	}
 	mtx_unlock(&ps->ps_mtx);
 	PROC_UNLOCK(p);
 }
 
 /*
  * Reset specified signal to the default disposition: no handler, not
  * caught, no SA_SIGINFO, and marked ignored when the default action is
  * to ignore (except for SIGCONT).  The sigacts mutex must be held.
  */
 static void
 sigdflt(struct sigacts *ps, int sig)
 {
 
 	mtx_assert(&ps->ps_mtx, MA_OWNED);
 
 	ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL;
 	SIGDELSET(ps->ps_sigcatch, sig);
 	SIGDELSET(ps->ps_siginfo, sig);
 	if (sig != SIGCONT && (sigprop(sig) & SIGPROP_IGNORE) != 0)
 		SIGADDSET(ps->ps_sigignore, sig);
 }
 
 /*
  * Reset signals for an exec of the specified process.
  *
  * Every caught signal goes back to its default disposition, the
  * current thread's alternate signal stack is disabled, and the
  * PS_NOCLDWAIT / PS_CLDSIGIGN flags are cleared.  The process lock must
  * be held by the caller; the sigacts mutex is taken here.
  */
 void
 execsigs(struct proc *p)
 {
 	sigset_t osigignore;
 	struct sigacts *ps;
 	int sig;
 	struct thread *td;
 
 	/*
 	 * Reset caught signals.  Held signals remain held
 	 * through td_sigmask (unless they were caught,
 	 * and are now ignored by default).
 	 */
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	ps = p->p_sigacts;
 	mtx_lock(&ps->ps_mtx);
 	/* sigdflt() removes sig from ps_sigcatch, so this terminates. */
 	while (SIGNOTEMPTY(ps->ps_sigcatch)) {
 		sig = sig_ffs(&ps->ps_sigcatch);
 		sigdflt(ps, sig);
 		/* Now ignored by default: discard pending instances. */
 		if ((sigprop(sig) & SIGPROP_IGNORE) != 0)
 			sigqueue_delete_proc(p, sig);
 	}
 
 	/*
 	 * As CloudABI processes cannot modify signal handlers, fully
 	 * reset all signals to their default behavior. Do ignore
 	 * SIGPIPE, as it would otherwise be impossible to recover from
 	 * writes to broken pipes and sockets.
 	 */
 	if (SV_PROC_ABI(p) == SV_ABI_CLOUDABI) {
 		/* Iterate over a copy of the ignore set. */
 		osigignore = ps->ps_sigignore;
 		while (SIGNOTEMPTY(osigignore)) {
 			sig = sig_ffs(&osigignore);
 			SIGDELSET(osigignore, sig);
 			if (sig != SIGPIPE)
 				sigdflt(ps, sig);
 		}
 		SIGADDSET(ps->ps_sigignore, SIGPIPE);
 	}
 
 	/*
 	 * Reset stack state to the user stack.
 	 * Clear set of signals caught on the signal stack.
 	 */
 	td = curthread;
 	MPASS(td->td_proc == p);
 	td->td_sigstk.ss_flags = SS_DISABLE;
 	td->td_sigstk.ss_size = 0;
 	td->td_sigstk.ss_sp = 0;
 	td->td_pflags &= ~TDP_ALTSTACK;
 	/*
 	 * Reset no zombies if child dies flag as Solaris does.
 	 */
 	ps->ps_flag &= ~(PS_NOCLDWAIT | PS_CLDSIGIGN);
 	if (ps->ps_sigact[_SIG_IDX(SIGCHLD)] == SIG_IGN)
 		ps->ps_sigact[_SIG_IDX(SIGCHLD)] = SIG_DFL;
 	mtx_unlock(&ps->ps_mtx);
 }
 
 /*
  * kern_sigprocmask()
  *
  *	Manipulate signal mask.
  *
  *	If 'oset' is non-NULL it receives td's mask as it was on entry.
  *	If 'set' is non-NULL the mask is changed according to 'how'
  *	(SIG_BLOCK, SIG_UNBLOCK or SIG_SETMASK); any other 'how' yields
  *	EINVAL.  Note that '*set' itself is modified by SIG_CANTMASK()
  *	for SIG_BLOCK and SIG_SETMASK.
  *
  *	'flags' bits used here:
  *	  SIGPROCMASK_PROC_LOCKED - caller already holds the process lock
  *	  SIGPROCMASK_PS_LOCKED   - caller already holds the sigacts mutex
  *	  SIGPROCMASK_OLD         - SIG_SETMASK replaces the mask using
  *	                            SIGSETLO() instead of a full assignment
  */
 int
 kern_sigprocmask(struct thread *td, int how, sigset_t *set, sigset_t *oset,
     int flags)
 {
 	sigset_t new_block, oset1;
 	struct proc *p;
 	int error;
 
 	p = td->td_proc;
 	if ((flags & SIGPROCMASK_PROC_LOCKED) != 0)
 		PROC_LOCK_ASSERT(p, MA_OWNED);
 	else
 		PROC_LOCK(p);
 	mtx_assert(&p->p_sigacts->ps_mtx, (flags & SIGPROCMASK_PS_LOCKED) != 0
 	    ? MA_OWNED : MA_NOTOWNED);
 	if (oset != NULL)
 		*oset = td->td_sigmask;
 
 	error = 0;
 	if (set != NULL) {
 		switch (how) {
 		case SIG_BLOCK:
 			SIG_CANTMASK(*set);
 			oset1 = td->td_sigmask;
 			SIGSETOR(td->td_sigmask, *set);
 			/* new_block = bits in the new mask but not the old. */
 			new_block = td->td_sigmask;
 			SIGSETNAND(new_block, oset1);
 			break;
 		case SIG_UNBLOCK:
 			SIGSETNAND(td->td_sigmask, *set);
 			signotify(td);
 			/* Nothing newly blocked, so no rescheduling needed. */
 			goto out;
 		case SIG_SETMASK:
 			SIG_CANTMASK(*set);
 			oset1 = td->td_sigmask;
 			if (flags & SIGPROCMASK_OLD)
 				SIGSETLO(td->td_sigmask, *set);
 			else
 				td->td_sigmask = *set;
 			new_block = td->td_sigmask;
 			SIGSETNAND(new_block, oset1);
 			signotify(td);
 			break;
 		default:
 			error = EINVAL;
 			goto out;
 		}
 
 		/*
 		 * The new_block set contains signals that were not previously
 		 * blocked, but are blocked now.
 		 *
 		 * In case we block any signal that was not previously blocked
 		 * for td, and process has the signal pending, try to schedule
 		 * signal delivery to some thread that does not block the
 		 * signal, possibly waking it up.
 		 */
 		if (p->p_numthreads != 1)
 			reschedule_signals(p, new_block, flags);
 	}
 
 out:
 	/* Drop the process lock only if it was acquired here. */
 	if (!(flags & SIGPROCMASK_PROC_LOCKED))
 		PROC_UNLOCK(p);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct sigprocmask_args {
 	int	how;
 	const sigset_t *set;
 	sigset_t *oset;
 };
 #endif
 int
 sys_sigprocmask(struct thread *td, struct sigprocmask_args *uap)
 {
 	sigset_t set, oset;
 	sigset_t *setp, *osetp;
 	int error;
 
 	setp = (uap->set != NULL) ? &set : NULL;
 	osetp = (uap->oset != NULL) ? &oset : NULL;
 	if (setp) {
 		error = copyin(uap->set, setp, sizeof(set));
 		if (error)
 			return (error);
 	}
 	error = kern_sigprocmask(td, uap->how, setp, osetp, 0);
 	if (osetp && !error) {
 		error = copyout(osetp, uap->oset, sizeof(oset));
 	}
 	return (error);
 }
 
 #ifdef COMPAT_43	/* XXX - COMPAT_FBSD3 */
 #ifndef _SYS_SYSPROTO_H_
 struct osigprocmask_args {
 	int	how;
 	osigset_t mask;
 };
 #endif
 /*
  * Old-style sigprocmask: the mask is passed by value in the old
  * single-word format and the previous mask is returned, in the same
  * format, as the syscall's return value.
  *
  * SIGPROCMASK_OLD makes kern_sigprocmask() apply SIG_SETMASK with
  * SIGSETLO() rather than assigning the whole set.
  */
 int
 osigprocmask(struct thread *td, struct osigprocmask_args *uap)
 {
 	sigset_t set, oset;
 	int error;
 
 	OSIG2SIG(uap->mask, set);
 	error = kern_sigprocmask(td, uap->how, &set, &oset, SIGPROCMASK_OLD);
 	/* oset is filled in by kern_sigprocmask() even on error. */
 	SIG2OSIG(oset, td->td_retval[0]);
 	return (error);
 }
 #endif /* COMPAT_43 */
 
 int
 sys_sigwait(struct thread *td, struct sigwait_args *uap)
 {
 	ksiginfo_t ksi;
 	sigset_t set;
 	int error;
 
 	error = copyin(uap->set, &set, sizeof(set));
 	if (error) {
 		td->td_retval[0] = error;
 		return (0);
 	}
 
 	error = kern_sigtimedwait(td, set, &ksi, NULL);
 	if (error) {
 		if (error == EINTR && td->td_proc->p_osrel < P_OSREL_SIGWAIT)
 			error = ERESTART;
 		if (error == ERESTART)
 			return (error);
 		td->td_retval[0] = error;
 		return (0);
 	}
 
 	error = copyout(&ksi.ksi_signo, uap->sig, sizeof(ksi.ksi_signo));
 	td->td_retval[0] = error;
 	return (0);
 }
 
 int
 sys_sigtimedwait(struct thread *td, struct sigtimedwait_args *uap)
 {
 	struct timespec ts;
 	struct timespec *timeout;
 	sigset_t set;
 	ksiginfo_t ksi;
 	int error;
 
 	if (uap->timeout) {
 		error = copyin(uap->timeout, &ts, sizeof(ts));
 		if (error)
 			return (error);
 
 		timeout = &ts;
 	} else
 		timeout = NULL;
 
 	error = copyin(uap->set, &set, sizeof(set));
 	if (error)
 		return (error);
 
 	error = kern_sigtimedwait(td, set, &ksi, timeout);
 	if (error)
 		return (error);
 
 	if (uap->info)
 		error = copyout(&ksi.ksi_info, uap->info, sizeof(siginfo_t));
 
 	if (error == 0)
 		td->td_retval[0] = ksi.ksi_signo;
 	return (error);
 }
 
 int
 sys_sigwaitinfo(struct thread *td, struct sigwaitinfo_args *uap)
 {
 	ksiginfo_t ksi;
 	sigset_t set;
 	int error;
 
 	error = copyin(uap->set, &set, sizeof(set));
 	if (error)
 		return (error);
 
 	error = kern_sigtimedwait(td, set, &ksi, NULL);
 	if (error)
 		return (error);
 
 	if (uap->info)
 		error = copyout(&ksi.ksi_info, uap->info, sizeof(siginfo_t));
 
 	if (error == 0)
 		td->td_retval[0] = ksi.ksi_signo;
 	return (error);
 }
 
 /*
  * Store '*si' in td's td_si and clear td_si.si_signo in every other
  * thread of the same process.
  */
 static void
 proc_td_siginfo_capture(struct thread *td, siginfo_t *si)
 {
 	struct thread *other;
 
 	FOREACH_THREAD_IN_PROC(td->td_proc, other) {
 		if (other != td) {
 			other->td_si.si_signo = 0;
 			continue;
 		}
 		other->td_si = *si;
 	}
 }
 
 /*
  * Wait until one of the signals in 'waitset' is pending for td, dequeue
  * it and describe it in '*ksi'.  'timeout', if non-NULL, bounds the wait.
  *
  * Returns 0 with *ksi filled in; EINVAL if the timeout's tv_nsec is out
  * of range (reported only after one check for already-pending signals);
  * EAGAIN if the timeout expired; or the error from msleep(), with
  * ERESTART mapped to EINTR when a timeout was supplied.
  *
  * The thread's signal mask is temporarily opened for 'waitset' and
  * restored before returning.
  */
 int
 kern_sigtimedwait(struct thread *td, sigset_t waitset, ksiginfo_t *ksi,
 	struct timespec *timeout)
 {
 	struct sigacts *ps;
 	sigset_t saved_mask, new_block;
 	struct proc *p;
 	int error, sig, timo, timevalid = 0;
 	struct timespec rts, ets, ts;
 	struct timeval tv;
 
 	p = td->td_proc;
 	error = 0;
 	ets.tv_sec = 0;
 	ets.tv_nsec = 0;
 
 	if (timeout != NULL) {
 		if (timeout->tv_nsec >= 0 && timeout->tv_nsec < 1000000000) {
 			timevalid = 1;
 			/* ets = absolute uptime at which the wait expires. */
 			getnanouptime(&rts);
-			ets = rts;
-			timespecadd(&ets, timeout);
+			timespecadd(&rts, timeout, &ets);
 		}
 	}
 	ksiginfo_init(ksi);
 	/* Some signals can not be waited for. */
 	SIG_CANTMASK(waitset);
 	ps = p->p_sigacts;
 	PROC_LOCK(p);
 	saved_mask = td->td_sigmask;
 	/* Unblock the awaited signals for the duration of the wait. */
 	SIGSETNAND(td->td_sigmask, waitset);
 	for (;;) {
 		mtx_lock(&ps->ps_mtx);
 		sig = cursig(td);
 		mtx_unlock(&ps->ps_mtx);
 		KASSERT(sig >= 0, ("sig %d", sig));
 		if (sig != 0 && SIGISMEMBER(waitset, sig)) {
 			/* Try the thread queue first, then the process queue. */
 			if (sigqueue_get(&td->td_sigqueue, sig, ksi) != 0 ||
 			    sigqueue_get(&p->p_sigqueue, sig, ksi) != 0) {
 				error = 0;
 				break;
 			}
 		}
 
 		/* An error left over from the previous msleep() ends the wait. */
 		if (error != 0)
 			break;
 
 		/*
 		 * POSIX says this must be checked after looking for pending
 		 * signals.
 		 */
 		if (timeout != NULL) {
 			if (!timevalid) {
 				error = EINVAL;
 				break;
 			}
 			getnanouptime(&rts);
 			if (timespeccmp(&rts, &ets, >=)) {
 				error = EAGAIN;
 				break;
 			}
 			/* ts = time remaining, converted to ticks below. */
-			ts = ets;
-			timespecsub(&ts, &rts);
+			timespecsub(&ets, &rts, &ts);
 			TIMESPEC_TO_TIMEVAL(&tv, &ts);
 			timo = tvtohz(&tv);
 		} else {
 			timo = 0;
 		}
 
 		error = msleep(ps, &p->p_mtx, PPAUSE|PCATCH, "sigwait", timo);
 
 		if (timeout != NULL) {
 			if (error == ERESTART) {
 				/* Timeout can not be restarted. */
 				error = EINTR;
 			} else if (error == EAGAIN) {
 				/* We will calculate timeout by ourself. */
 				error = 0;
 			}
 		}
 	}
 
 	/* new_block = signals blocked again by restoring the saved mask. */
 	new_block = saved_mask;
 	SIGSETNAND(new_block, td->td_sigmask);
 	td->td_sigmask = saved_mask;
 	/*
 	 * Fewer signals can be delivered to us, reschedule signal
 	 * notification.
 	 */
 	if (p->p_numthreads != 1)
 		reschedule_signals(p, new_block, 0);
 
 	if (error == 0) {
 		SDT_PROBE2(proc, , , signal__clear, sig, ksi);
 
 		if (ksi->ksi_code == SI_TIMER)
 			itimer_accept(p, ksi->ksi_timerid, ksi);
 
 #ifdef KTRACE
 		if (KTRPOINT(td, KTR_PSIG)) {
 			sig_t action;
 
 			mtx_lock(&ps->ps_mtx);
 			action = ps->ps_sigact[_SIG_IDX(sig)];
 			mtx_unlock(&ps->ps_mtx);
 			ktrpsig(sig, action, &td->td_sigmask, ksi->ksi_code);
 		}
 #endif
 		if (sig == SIGKILL) {
 			proc_td_siginfo_capture(td, &ksi->ksi_info);
 			sigexit(td, sig);
 		}
 	}
 	PROC_UNLOCK(p);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct sigpending_args {
 	sigset_t	*set;
 };
 #endif
 /*
  * sigpending system call: copy out the union of the signals pending on
  * the calling thread and on its process.
  */
 int
 sys_sigpending(struct thread *td, struct sigpending_args *uap)
 {
 	struct proc *p;
 	sigset_t pend;
 
 	p = td->td_proc;
 	PROC_LOCK(p);
 	pend = td->td_sigqueue.sq_signals;
 	SIGSETOR(pend, p->p_sigqueue.sq_signals);
 	PROC_UNLOCK(p);
 
 	return (copyout(&pend, uap->set, sizeof(sigset_t)));
 }
 
 #ifdef COMPAT_43	/* XXX - COMPAT_FBSD3 */
 #ifndef _SYS_SYSPROTO_H_
 struct osigpending_args {
 	int	dummy;
 };
 #endif
 /*
  * Old-style sigpending: the union of thread and process pending signals
  * is returned in the old mask format as the syscall's return value.
  */
 int
 osigpending(struct thread *td, struct osigpending_args *uap)
 {
 	struct proc *p;
 	sigset_t pend;
 
 	p = td->td_proc;
 	PROC_LOCK(p);
 	pend = td->td_sigqueue.sq_signals;
 	SIGSETOR(pend, p->p_sigqueue.sq_signals);
 	PROC_UNLOCK(p);
 
 	SIG2OSIG(pend, td->td_retval[0]);
 	return (0);
 }
 #endif /* COMPAT_43 */
 
 #if defined(COMPAT_43)
 /*
  * Generalized interface signal handler, 4.3-compatible.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct osigvec_args {
 	int	signum;
 	struct	sigvec *nsv;
 	struct	sigvec *osv;
 };
 #endif
 /* ARGSUSED */
 int
 osigvec(struct thread *td, struct osigvec_args *uap)
 {
 	struct sigvec vec;
 	struct sigaction nsa, osa;
 	struct sigaction *nsap, *osap;
 	int error;
 
 	if (uap->signum <= 0 || uap->signum >= ONSIG)
 		return (EINVAL);
 	nsap = (uap->nsv != NULL) ? &nsa : NULL;
 	osap = (uap->osv != NULL) ? &osa : NULL;
 	if (nsap) {
 		error = copyin(uap->nsv, &vec, sizeof(vec));
 		if (error)
 			return (error);
 		nsap->sa_handler = vec.sv_handler;
 		OSIG2SIG(vec.sv_mask, nsap->sa_mask);
 		nsap->sa_flags = vec.sv_flags;
 		nsap->sa_flags ^= SA_RESTART;	/* opposite of SV_INTERRUPT */
 	}
 	error = kern_sigaction(td, uap->signum, nsap, osap, KSA_OSIGSET);
 	if (osap && !error) {
 		vec.sv_handler = osap->sa_handler;
 		SIG2OSIG(osap->sa_mask, vec.sv_mask);
 		vec.sv_flags = osap->sa_flags;
 		vec.sv_flags &= ~SA_NOCLDWAIT;
 		vec.sv_flags ^= SA_RESTART;
 		error = copyout(&vec, uap->osv, sizeof(vec));
 	}
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct osigblock_args {
 	int	mask;
 };
 #endif
 /*
  * Old-style sigblock: add the signals in the old-format 'mask' to the
  * thread's blocked set and return the previous mask, in the old format,
  * as the syscall's return value.  Always succeeds.
  */
 int
 osigblock(struct thread *td, struct osigblock_args *uap)
 {
 	sigset_t toblock, prev;
 
 	OSIG2SIG(uap->mask, toblock);
 	(void)kern_sigprocmask(td, SIG_BLOCK, &toblock, &prev, 0);
 	SIG2OSIG(prev, td->td_retval[0]);
 	return (0);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct osigsetmask_args {
 	int	mask;
 };
 #endif
 /*
  * Old-style sigsetmask: replace the thread's blocked set with the
  * old-format 'mask' and return the previous mask, in the old format, as
  * the syscall's return value.  Always succeeds.
  */
 int
 osigsetmask(struct thread *td, struct osigsetmask_args *uap)
 {
 	sigset_t newmask, prev;
 
 	OSIG2SIG(uap->mask, newmask);
 	(void)kern_sigprocmask(td, SIG_SETMASK, &newmask, &prev, 0);
 	SIG2OSIG(prev, td->td_retval[0]);
 	return (0);
 }
 #endif /* COMPAT_43 */
 
 /*
  * Suspend calling thread until signal, providing mask to be set in the
  * meantime.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct sigsuspend_args {
 	const sigset_t *sigmask;
 };
 #endif
 /* ARGSUSED */
 int
 sys_sigsuspend(struct thread *td, struct sigsuspend_args *uap)
 {
 	sigset_t mask;
 	int error;
 
 	error = copyin(uap->sigmask, &mask, sizeof(mask));
 	if (error)
 		return (error);
 	return (kern_sigsuspend(td, mask));
 }
 
 /*
  * Install 'mask' as td's signal mask and sleep until at least one
  * signal has been posted to this thread by postsig().  The previous
  * mask is saved in td_oldsigmask and TDP_OLDMASK is set.
  *
  * Always returns EJUSTRETURN; EINTR is recorded separately through
  * sv_set_syscall_retval(), td_errno and TDP_NERRNO.
  */
 int
 kern_sigsuspend(struct thread *td, sigset_t mask)
 {
 	struct proc *p = td->td_proc;
 	int has_sig, sig;
 
 	/*
 	 * When returning from sigsuspend, we want
 	 * the old mask to be restored after the
 	 * signal handler has finished.  Thus, we
 	 * save it here and mark the sigacts structure
 	 * to indicate this.
 	 */
 	PROC_LOCK(p);
 	kern_sigprocmask(td, SIG_SETMASK, &mask, &td->td_oldsigmask,
 	    SIGPROCMASK_PROC_LOCKED);
 	td->td_pflags |= TDP_OLDMASK;
 
 	/*
 	 * Process signals now. Otherwise, we can get spurious wakeup
 	 * due to signal entered process queue, but delivered to other
 	 * thread. But sigsuspend should return only on signal
 	 * delivery.
 	 */
 	(p->p_sysent->sv_set_syscall_retval)(td, EINTR);
 	for (has_sig = 0; !has_sig;) {
 		/* Keep sleeping while msleep() returns 0. */
 		while (msleep(&p->p_sigacts, &p->p_mtx, PPAUSE|PCATCH, "pause",
 			0) == 0)
 			/* void */;
 		thread_suspend_check(0);
 		mtx_lock(&p->p_sigacts->ps_mtx);
 		/* Post every deliverable signal; has_sig sums postsig()'s results. */
 		while ((sig = cursig(td)) != 0) {
 			KASSERT(sig >= 0, ("sig %d", sig));
 			has_sig += postsig(sig);
 		}
 		mtx_unlock(&p->p_sigacts->ps_mtx);
 	}
 	PROC_UNLOCK(p);
 	td->td_errno = EINTR;
 	td->td_pflags |= TDP_NERRNO;
 	return (EJUSTRETURN);
 }
 
 #ifdef COMPAT_43	/* XXX - COMPAT_FBSD3 */
 /*
  * Compatibility sigsuspend call for old binaries.  Note nonstandard calling
  * convention: libc stub passes mask, not pointer, to save a copyin.
  * The old-format mask is converted to a sigset_t and handed to
  * kern_sigsuspend().
  */
 #ifndef _SYS_SYSPROTO_H_
 struct osigsuspend_args {
 	osigset_t mask;
 };
 #endif
 /* ARGSUSED */
 int
 osigsuspend(struct thread *td, struct osigsuspend_args *uap)
 {
 	sigset_t mask;
 
 	OSIG2SIG(uap->mask, mask);
 	return (kern_sigsuspend(td, mask));
 }
 #endif /* COMPAT_43 */
 
 #if defined(COMPAT_43)
 #ifndef _SYS_SYSPROTO_H_
 struct osigstack_args {
 	struct	sigstack *nss;
 	struct	sigstack *oss;
 };
 #endif
 /*
  * Old-style sigstack: optionally install a new signal stack pointer
  * (recorded with size 0) and optionally report the previous stack
  * pointer together with whether the thread is currently running on it.
  */
 /* ARGSUSED */
 int
 osigstack(struct thread *td, struct osigstack_args *uap)
 {
 	struct sigstack newstk, oldstk;
 	const bool setnew = (uap->nss != NULL);
 	int error;
 
 	error = 0;
 	if (setnew) {
 		error = copyin(uap->nss, &newstk, sizeof(newstk));
 		if (error != 0)
 			return (error);
 	}
 
 	/* Capture the old state before it is overwritten below. */
 	oldstk.ss_sp = td->td_sigstk.ss_sp;
 	oldstk.ss_onstack = sigonstack(cpu_getstack(td));
 
 	if (setnew) {
 		td->td_sigstk.ss_sp = newstk.ss_sp;
 		td->td_sigstk.ss_size = 0;
 		td->td_sigstk.ss_flags |= newstk.ss_onstack & SS_ONSTACK;
 		td->td_pflags |= TDP_ALTSTACK;
 	}
 
 	if (uap->oss != NULL)
 		error = copyout(&oldstk, uap->oss, sizeof(oldstk));
 	return (error);
 }
 #endif /* COMPAT_43 */
 
 #ifndef _SYS_SYSPROTO_H_
 struct sigaltstack_args {
 	stack_t	*ss;
 	stack_t	*oss;
 };
 #endif
 /* ARGSUSED */
 int
 sys_sigaltstack(struct thread *td, struct sigaltstack_args *uap)
 {
 	stack_t ss, oss;
 	int error;
 
 	if (uap->ss != NULL) {
 		error = copyin(uap->ss, &ss, sizeof(ss));
 		if (error)
 			return (error);
 	}
 	error = kern_sigaltstack(td, (uap->ss != NULL) ? &ss : NULL,
 	    (uap->oss != NULL) ? &oss : NULL);
 	if (error)
 		return (error);
 	if (uap->oss != NULL)
 		error = copyout(&oss, uap->oss, sizeof(stack_t));
 	return (error);
 }
 
 /*
  * In-kernel part of sigaltstack.  If oss is not NULL it receives the
  * thread's current alternate stack, with ss_flags recomputed from the
  * thread state.  If ss is not NULL it is validated and then either
  * installed or, with SS_DISABLE, used to turn the alternate stack off.
  *
  * Returns 0 on success, EPERM when the thread is currently running on
  * the alternate stack, EINVAL for unknown flag bits and ENOMEM when the
  * requested stack is smaller than the ABI minimum.
  */
 int
 kern_sigaltstack(struct thread *td, stack_t *ss, stack_t *oss)
 {
 	struct proc *p;
 	int on_altstack;
 
 	p = td->td_proc;
 	on_altstack = sigonstack(cpu_getstack(td));
 
 	if (oss != NULL) {
 		*oss = td->td_sigstk;
 		if ((td->td_pflags & TDP_ALTSTACK) == 0)
 			oss->ss_flags = SS_DISABLE;
 		else if (on_altstack)
 			oss->ss_flags = SS_ONSTACK;
 		else
 			oss->ss_flags = 0;
 	}
 
 	if (ss == NULL)
 		return (0);
 	if (on_altstack)
 		return (EPERM);
 	if ((ss->ss_flags & ~SS_DISABLE) != 0)
 		return (EINVAL);
 
 	if ((ss->ss_flags & SS_DISABLE) != 0) {
 		td->td_pflags &= ~TDP_ALTSTACK;
 		return (0);
 	}
 
 	if (ss->ss_size < p->p_sysent->sv_minsigstksz)
 		return (ENOMEM);
 	td->td_sigstk = *ss;
 	td->td_pflags |= TDP_ALTSTACK;
 	return (0);
 }
 
 /*
  * Common code for kill process group/broadcast kill.
  *
  * td is the calling thread; it is the subject of every p_cansignal()
  * check.  When "all" is non-zero the signal is broadcast to every process
  * in the system except the caller's own process, pids <= 1, P_SYSTEM
  * processes and processes still in PRS_NEW.  Otherwise the members of
  * process group pgid are targeted (pgid 0 selects the caller's own
  * group), again skipping pids <= 1, P_SYSTEM and PRS_NEW processes.
  * A sig of 0 only performs the permission checks.
  *
  * Returns 0 if at least one candidate passed p_cansignal(), otherwise an
  * error reported by p_cansignal(), or ESRCH when there was no candidate
  * or no such process group.
  */
 static int
 killpg1(struct thread *td, int sig, int pgid, int all, ksiginfo_t *ksi)
 {
 	struct proc *p;
 	struct pgrp *pgrp;
 	int err;
 	int ret;
 
 	/* Stays ESRCH until some candidate has been examined. */
 	ret = ESRCH;
 	if (all) {
 		/*
 		 * broadcast
 		 */
 		sx_slock(&allproc_lock);
 		FOREACH_PROC_IN_SYSTEM(p) {
 			if (p->p_pid <= 1 || p->p_flag & P_SYSTEM ||
 			    p == td->td_proc || p->p_state == PRS_NEW) {
 				continue;
 			}
 			PROC_LOCK(p);
 			err = p_cansignal(td, p, sig);
 			/*
 			 * A success always wins; a failure is recorded only
 			 * while ret still holds the initial ESRCH.
 			 */
 			if (err == 0) {
 				if (sig)
 					pksignal(p, sig, ksi);
 				ret = err;
 			}
 			else if (ret == ESRCH)
 				ret = err;
 			PROC_UNLOCK(p);
 		}
 		sx_sunlock(&allproc_lock);
 	} else {
 		sx_slock(&proctree_lock);
 		if (pgid == 0) {
 			/*
 			 * zero pgid means send to my process group.
 			 */
 			pgrp = td->td_proc->p_pgrp;
 			PGRP_LOCK(pgrp);
 		} else {
 			/*
 			 * NOTE(review): pgfind() presumably returns the
 			 * group locked, matching the PGRP_UNLOCK() at the
 			 * end of this branch -- confirm.
 			 */
 			pgrp = pgfind(pgid);
 			if (pgrp == NULL) {
 				sx_sunlock(&proctree_lock);
 				return (ESRCH);
 			}
 		}
 		sx_sunlock(&proctree_lock);
 		LIST_FOREACH(p, &pgrp->pg_members, p_pglist) {
 			PROC_LOCK(p);
 			if (p->p_pid <= 1 || p->p_flag & P_SYSTEM ||
 			    p->p_state == PRS_NEW) {
 				PROC_UNLOCK(p);
 				continue;
 			}
 			err = p_cansignal(td, p, sig);
 			/* Same result bookkeeping as the broadcast case. */
 			if (err == 0) {
 				if (sig)
 					pksignal(p, sig, ksi);
 				ret = err;
 			}
 			else if (ret == ESRCH)
 				ret = err;
 			PROC_UNLOCK(p);
 		}
 		PGRP_UNLOCK(pgrp);
 	}
 	return (ret);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct kill_args {
 	int	pid;
 	int	signum;
 };
 #endif
 /* ARGSUSED */
 int
 sys_kill(struct thread *td, struct kill_args *uap)
 {
 	ksiginfo_t ksi;
 	struct proc *p;
 	int error;
 
 	/*
 	 * A process in capability mode can send signals only to himself.
 	 * The main rationale behind this is that abort(3) is implemented as
 	 * kill(getpid(), SIGABRT).
 	 */
 	if (IN_CAPABILITY_MODE(td) && uap->pid != td->td_proc->p_pid)
 		return (ECAPMODE);
 
 	AUDIT_ARG_SIGNUM(uap->signum);
 	AUDIT_ARG_PID(uap->pid);
 	if ((u_int)uap->signum > _SIG_MAXSIG)
 		return (EINVAL);
 
 	ksiginfo_init(&ksi);
 	ksi.ksi_signo = uap->signum;
 	ksi.ksi_code = SI_USER;
 	ksi.ksi_pid = td->td_proc->p_pid;
 	ksi.ksi_uid = td->td_ucred->cr_ruid;
 
 	if (uap->pid > 0) {
 		/* kill single process */
 		if ((p = pfind_any(uap->pid)) == NULL)
 			return (ESRCH);
 		AUDIT_ARG_PROCESS(p);
 		error = p_cansignal(td, p, uap->signum);
 		if (error == 0 && uap->signum)
 			pksignal(p, uap->signum, &ksi);
 		PROC_UNLOCK(p);
 		return (error);
 	}
 	switch (uap->pid) {
 	case -1:		/* broadcast signal */
 		return (killpg1(td, uap->signum, 0, 1, &ksi));
 	case 0:			/* signal own process group */
 		return (killpg1(td, uap->signum, 0, 0, &ksi));
 	default:		/* negative explicit process group */
 		return (killpg1(td, uap->signum, -uap->pid, 0, &ksi));
 	}
 	/* NOTREACHED */
 }
 
 int
 sys_pdkill(struct thread *td, struct pdkill_args *uap)
 {
 	struct proc *p;
 	int error;
 
 	AUDIT_ARG_SIGNUM(uap->signum);
 	AUDIT_ARG_FD(uap->fd);
 	if ((u_int)uap->signum > _SIG_MAXSIG)
 		return (EINVAL);
 
 	error = procdesc_find(td, uap->fd, &cap_pdkill_rights, &p);
 	if (error)
 		return (error);
 	AUDIT_ARG_PROCESS(p);
 	error = p_cansignal(td, p, uap->signum);
 	if (error == 0 && uap->signum)
 		kern_psignal(p, uap->signum);
 	PROC_UNLOCK(p);
 	return (error);
 }
 
 #if defined(COMPAT_43)
 #ifndef _SYS_SYSPROTO_H_
 struct okillpg_args {
 	int	pgid;
 	int	signum;
 };
 #endif
 /*
  * Compatibility killpg call: validate the signal number, describe the
  * sender in a ksiginfo and let killpg1() signal process group uap->pgid.
  */
 /* ARGSUSED */
 int
 okillpg(struct thread *td, struct okillpg_args *uap)
 {
 	ksiginfo_t info;
 
 	AUDIT_ARG_SIGNUM(uap->signum);
 	AUDIT_ARG_PID(uap->pgid);
 
 	if ((u_int)uap->signum > _SIG_MAXSIG)
 		return (EINVAL);
 
 	ksiginfo_init(&info);
 	info.ksi_signo = uap->signum;
 	info.ksi_code = SI_USER;
 	info.ksi_uid = td->td_ucred->cr_ruid;
 	info.ksi_pid = td->td_proc->p_pid;
 
 	return (killpg1(td, uap->signum, uap->pgid, 0, &info));
 }
 #endif /* COMPAT_43 */
 
 #ifndef _SYS_SYSPROTO_H_
 struct sigqueue_args {
 	pid_t pid;
 	int signum;
 	/* union sigval */ void *value;
 };
 #endif
 int
 sys_sigqueue(struct thread *td, struct sigqueue_args *uap)
 {
 	union sigval sv;
 
 	sv.sival_ptr = uap->value;
 
 	return (kern_sigqueue(td, uap->pid, uap->signum, &sv));
 }
 
 /*
  * Queue signal signum with the accompanying value to the single process
  * pid on behalf of td.  A signum of 0 performs only the permission
  * check.
  *
  * Returns EINVAL for an out-of-range signal number or a non-positive
  * pid, ESRCH when the process cannot be found, the p_cansignal() error
  * when signalling is not permitted, and otherwise the pksignal() result.
  */
 int
 kern_sigqueue(struct thread *td, pid_t pid, int signum, union sigval *value)
 {
 	ksiginfo_t ksi;
 	struct proc *target;
 	int error;
 
 	/*
 	 * Specification says sigqueue can only send signal to
 	 * single process, so group and broadcast pids are rejected
 	 * together with invalid signal numbers.
 	 */
 	if ((u_int)signum > _SIG_MAXSIG || pid <= 0)
 		return (EINVAL);
 
 	target = pfind_any(pid);
 	if (target == NULL)
 		return (ESRCH);
 
 	error = p_cansignal(td, target, signum);
 	if (error != 0 || signum == 0)
 		goto done;
 
 	ksiginfo_init(&ksi);
 	ksi.ksi_flags = KSI_SIGQ;
 	ksi.ksi_signo = signum;
 	ksi.ksi_code = SI_QUEUE;
 	ksi.ksi_pid = td->td_proc->p_pid;
 	ksi.ksi_uid = td->td_ucred->cr_ruid;
 	ksi.ksi_value = *value;
 	error = pksignal(target, signum, &ksi);
 done:
 	PROC_UNLOCK(target);
 	return (error);
 }
 
 /*
  * Send a signal to the process group identified by pgid.  A pgid of 0
  * or an unknown group makes this a no-op.
  */
 void
 gsignal(int pgid, int sig, ksiginfo_t *ksi)
 {
 	struct pgrp *pg;
 
 	if (pgid == 0)
 		return;
 
 	sx_slock(&proctree_lock);
 	pg = pgfind(pgid);
 	sx_sunlock(&proctree_lock);
 	if (pg == NULL)
 		return;
 
 	pgsignal(pg, sig, 0, ksi);
 	PGRP_UNLOCK(pg);
 }
 
 /*
  * Send a signal to a process group.  If checkctty is non-zero,
  * limit to members which have a controlling terminal.  Only members in
  * the PRS_NORMAL state are signalled.  A NULL pgrp is accepted and
  * ignored; otherwise the group lock must be held by the caller.
  */
 void
 pgsignal(struct pgrp *pgrp, int sig, int checkctty, ksiginfo_t *ksi)
 {
 	struct proc *member;
 
 	if (pgrp == NULL)
 		return;
 
 	PGRP_LOCK_ASSERT(pgrp, MA_OWNED);
 	LIST_FOREACH(member, &pgrp->pg_members, p_pglist) {
 		PROC_LOCK(member);
 		if (member->p_state == PRS_NORMAL &&
 		    (checkctty == 0 || (member->p_flag & P_CONTROLT) != 0))
 			pksignal(member, sig, ksi);
 		PROC_UNLOCK(member);
 	}
 }
 
 
 /*
  * Finish delivery of a caught signal once the usermode frame has been
  * built: count the signal, block the handler's catch mask (plus the
  * signal itself unless it is marked no-defer) and reset the disposition
  * when the signal is in ps_sigreset.  This must run after the
  * machine-specific sysent->sv_sendsig() routine, which needs the
  * ps_siginfo and signal mask as they were before these updates.
  *
  * The caller holds ps->ps_mtx (asserted) and, per the flags passed to
  * kern_sigprocmask(), the process lock.
  */
 static void
 postsig_done(int sig, struct thread *td, struct sigacts *ps)
 {
 	sigset_t blocked;
 
 	mtx_assert(&ps->ps_mtx, MA_OWNED);
 
 	td->td_ru.ru_nsignals++;
 
 	blocked = ps->ps_catchmask[_SIG_IDX(sig)];
 	if (!SIGISMEMBER(ps->ps_signodefer, sig))
 		SIGADDSET(blocked, sig);
 	kern_sigprocmask(td, SIG_BLOCK, &blocked, NULL,
 	    SIGPROCMASK_PROC_LOCKED | SIGPROCMASK_PS_LOCKED);
 
 	if (SIGISMEMBER(ps->ps_sigreset, sig))
 		sigdflt(ps, sig);
 }
 
 
 /*
  * Send a signal caused by a trap to the current thread.  If it will be
  * caught immediately, deliver it with correct code.  Otherwise, post it
  * normally.
  *
  * td is the thread that took the trap and ksi describes the signal
  * (ksi_signo must be a valid signal number, ksi_code is the trap code).
  * The process lock and the sigacts mutex are taken and released here.
  */
 void
 trapsignal(struct thread *td, ksiginfo_t *ksi)
 {
 	struct sigacts *ps;
 	struct proc *p;
 	int sig;
 	int code;
 
 	p = td->td_proc;
 	sig = ksi->ksi_signo;
 	code = ksi->ksi_code;
 	KASSERT(_SIG_VALID(sig), ("invalid signal"));
 
 	PROC_LOCK(p);
 	ps = p->p_sigacts;
 	mtx_lock(&ps->ps_mtx);
 	/*
 	 * Fast path: the process is not traced, has a handler installed and
 	 * the thread does not mask the signal, so the handler frame is built
 	 * right away.
 	 */
 	if ((p->p_flag & P_TRACED) == 0 && SIGISMEMBER(ps->ps_sigcatch, sig) &&
 	    !SIGISMEMBER(td->td_sigmask, sig)) {
 #ifdef KTRACE
 		if (KTRPOINT(curthread, KTR_PSIG))
 			ktrpsig(sig, ps->ps_sigact[_SIG_IDX(sig)],
 			    &td->td_sigmask, code);
 #endif
 		(*p->p_sysent->sv_sendsig)(ps->ps_sigact[_SIG_IDX(sig)],
 				ksi, &td->td_sigmask);
 		/* Mask and disposition updates must follow sv_sendsig(). */
 		postsig_done(sig, td, ps);
 		mtx_unlock(&ps->ps_mtx);
 	} else {
 		/*
 		 * Avoid a possible infinite loop if the thread
 		 * masking the signal or process is ignoring the
 		 * signal.
 		 */
 		if (kern_forcesigexit &&
 		    (SIGISMEMBER(td->td_sigmask, sig) ||
 		     ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN)) {
 			/* Force the default action for this signal. */
 			SIGDELSET(td->td_sigmask, sig);
 			SIGDELSET(ps->ps_sigcatch, sig);
 			SIGDELSET(ps->ps_sigignore, sig);
 			ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL;
 		}
 		mtx_unlock(&ps->ps_mtx);
 		p->p_code = code;	/* XXX for core dump/debugger */
 		p->p_sig = sig;		/* XXX to verify code */
 		/* Slow path: queue the signal to this thread. */
 		tdsendsignal(p, td, sig, ksi);
 	}
 	PROC_UNLOCK(p);
 }
 
 /*
  * Choose the thread of process p that should be notified about signal
  * sig.  The current thread is preferred when it belongs to p and does
  * not mask the signal; otherwise the first thread found that does not
  * mask it is used, and if every thread masks it the first thread of the
  * process is returned.  The prop argument is not consulted.  The process
  * lock must be held.
  */
 static struct thread *
 sigtd(struct proc *p, int sig, int prop)
 {
 	struct thread *cand;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	/*
 	 * Check if current thread can handle the signal without
 	 * switching context to another thread.
 	 */
 	if (curproc == p && !SIGISMEMBER(curthread->td_sigmask, sig))
 		return (curthread);
 
 	FOREACH_THREAD_IN_PROC(p, cand) {
 		if (!SIGISMEMBER(cand->td_sigmask, sig))
 			return (cand);
 	}
 
 	/* Every thread masks the signal; fall back to the first one. */
 	return (FIRST_THREAD_IN_PROC(p));
 }
 
 /*
  * Send the signal to the process.  If the signal has an action, the action
  * is usually performed by the target process rather than the caller; we add
  * the signal to the set of pending signals for the process.
  *
  * Exceptions:
  *   o When a stop signal is sent to a sleeping process that takes the
  *     default action, the process is stopped without awakening it.
  *   o SIGCONT restarts stopped processes (or puts them back to sleep)
  *     regardless of the signal action (eg, blocked or ignored).
  *
  * Other ignored signals are discarded immediately.
  *
  * This wrapper builds a kernel-originated (SI_KERNEL) ksiginfo and
  * ignores the result of tdsendsignal().
  *
  * NB: This function may be entered from the debugger via the "kill" DDB
  * command.  There is little that can be done to mitigate the possibly messy
  * side effects of this unwise possibility.
  */
 void
 kern_psignal(struct proc *p, int sig)
 {
 	ksiginfo_t info;
 
 	ksiginfo_init(&info);
 	info.ksi_code = SI_KERNEL;
 	info.ksi_signo = sig;
 
 	(void)tdsendsignal(p, NULL, sig, &info);
 }
 
 /*
  * Send sig with caller-supplied siginfo to process p, leaving the choice
  * of thread to tdsendsignal(), whose result is returned.
  */
 int
 pksignal(struct proc *p, int sig, ksiginfo_t *ksi)
 {
 	int error;
 
 	error = tdsendsignal(p, NULL, sig, ksi);
 	return (error);
 }
 
 /*
  * Utility function for finding a thread to send signal event to.
  *
  * For SIGEV_THREAD_ID notifications the thread is looked up with
  * tdfind() and stored in *ttd; ESRCH is returned if it does not exist.
  * For every other notification type *ttd is set to NULL and the process
  * is locked here.  Returns 0 on success.
  *
  * NOTE(review): tdfind() presumably returns with the process locked, so
  * that both success paths leave p locked -- confirm.
  */
 int
 sigev_findtd(struct proc *p, struct sigevent *sigev, struct thread **ttd)
 {
 	struct thread *target;
 
 	if (sigev->sigev_notify != SIGEV_THREAD_ID) {
 		*ttd = NULL;
 		PROC_LOCK(p);
 		return (0);
 	}
 
 	target = tdfind(sigev->sigev_notify_thread_id, p->p_pid);
 	if (target == NULL)
 		return (ESRCH);
 	*ttd = target;
 	return (0);
 }
 
 /*
  * Send kernel-originated (SI_KERNEL) signal sig to the specific thread
  * td.  The result of tdsendsignal() is ignored.
  */
 void
 tdsignal(struct thread *td, int sig)
 {
 	ksiginfo_t info;
 
 	ksiginfo_init(&info);
 	info.ksi_code = SI_KERNEL;
 	info.ksi_signo = sig;
 
 	(void)tdsendsignal(td->td_proc, td, sig, &info);
 }
 
 /*
  * Send sig with caller-supplied siginfo to the specific thread td.  The
  * result of tdsendsignal() is ignored.
  */
 void
 tdksignal(struct thread *td, int sig, ksiginfo_t *ksi)
 {
 	struct proc *p;
 
 	p = td->td_proc;
 	(void)tdsendsignal(p, td, sig, ksi);
 }
 
 /*
  * Post signal sig to process p, or to the specific thread td when td is
  * not NULL, and carry out the immediate stop/continue/wakeup side
  * effects.
  *
  * The caller must hold the process lock (asserted below).  ksi, when not
  * NULL, carries the siginfo to queue and must not already be on a queue.
  * An invalid signal number panics.
  *
  * Returns 0 when the signal was queued or deliberately discarded, or the
  * non-zero value returned by sigqueue_add().
  */
 int
 tdsendsignal(struct proc *p, struct thread *td, int sig, ksiginfo_t *ksi)
 {
 	sig_t action;
 	sigqueue_t *sigqueue;
 	int prop;
 	struct sigacts *ps;
 	int intrval;
 	int ret = 0;
 	int wakeup_swapper;
 
 	MPASS(td == NULL || p == td->td_proc);
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	if (!_SIG_VALID(sig))
 		panic("%s(): invalid signal %d", __func__, sig);
 
 	KASSERT(ksi == NULL || !KSI_ONQ(ksi), ("%s: ksi on queue", __func__));
 
 	/*
 	 * IEEE Std 1003.1-2001: return success when killing a zombie.
 	 */
 	if (p->p_state == PRS_ZOMBIE) {
 		if (ksi && (ksi->ksi_flags & KSI_INS))
 			ksiginfo_tryfree(ksi);
 		return (ret);
 	}
 
 	ps = p->p_sigacts;
 	KNOTE_LOCKED(p->p_klist, NOTE_SIGNAL | sig);
 	prop = sigprop(sig);
 
 	/*
 	 * Without an explicit target thread the signal goes on the
 	 * process-wide queue and sigtd() picks the thread to notify;
 	 * otherwise it goes on that thread's private queue.
 	 */
 	if (td == NULL) {
 		td = sigtd(p, sig, prop);
 		sigqueue = &p->p_sigqueue;
 	} else
 		sigqueue = &td->td_sigqueue;
 
 	SDT_PROBE3(proc, , , signal__send, td, p, sig);
 
 	/*
 	 * If the signal is being ignored,
 	 * then we forget about it immediately.
 	 * (Note: we don't set SIGCONT in ps_sigignore,
 	 * and if it is set to SIG_IGN,
 	 * action will be SIG_DFL here.)
 	 */
 	mtx_lock(&ps->ps_mtx);
 	if (SIGISMEMBER(ps->ps_sigignore, sig)) {
 		SDT_PROBE3(proc, , , signal__discard, td, p, sig);
 
 		mtx_unlock(&ps->ps_mtx);
 		if (ksi && (ksi->ksi_flags & KSI_INS))
 			ksiginfo_tryfree(ksi);
 		return (ret);
 	}
 	/* Classify the disposition as seen by the chosen thread. */
 	if (SIGISMEMBER(td->td_sigmask, sig))
 		action = SIG_HOLD;
 	else if (SIGISMEMBER(ps->ps_sigcatch, sig))
 		action = SIG_CATCH;
 	else
 		action = SIG_DFL;
 	/* Value handed to sleepq_abort() if a sleep gets interrupted. */
 	if (SIGISMEMBER(ps->ps_sigintr, sig))
 		intrval = EINTR;
 	else
 		intrval = ERESTART;
 	mtx_unlock(&ps->ps_mtx);
 
 	if (prop & SIGPROP_CONT)
 		sigqueue_delete_stopmask_proc(p);
 	else if (prop & SIGPROP_STOP) {
 		/*
 		 * If sending a tty stop signal to a member of an orphaned
 		 * process group, discard the signal here if the action
 		 * is default; don't stop the process below if sleeping,
 		 * and don't clear any pending SIGCONT.
 		 */
 		if ((prop & SIGPROP_TTYSTOP) &&
 		    (p->p_pgrp->pg_jobc == 0) &&
 		    (action == SIG_DFL)) {
 			if (ksi && (ksi->ksi_flags & KSI_INS))
 				ksiginfo_tryfree(ksi);
 			return (ret);
 		}
 		sigqueue_delete_proc(p, SIGCONT);
 		if (p->p_flag & P_CONTINUED) {
 			p->p_flag &= ~P_CONTINUED;
 			PROC_LOCK(p->p_pptr);
 			sigqueue_take(p->p_ksi);
 			PROC_UNLOCK(p->p_pptr);
 		}
 	}
 
 	/* Queue the signal; a queueing failure is reported to the caller. */
 	ret = sigqueue_add(sigqueue, sig, ksi);
 	if (ret != 0)
 		return (ret);
 	signotify(td);
 	/*
 	 * Defer further processing for signals which are held,
 	 * except that stopped processes must be continued by SIGCONT.
 	 */
 	if (action == SIG_HOLD &&
 	    !((prop & SIGPROP_CONT) && (p->p_flag & P_STOPPED_SIG)))
 		return (ret);
 
 	/* SIGKILL: Remove procfs STOPEVENTs. */
 	if (sig == SIGKILL) {
 		/* from procfs_ioctl.c: PIOCBIC */
 		p->p_stops = 0;
 		/* from procfs_ioctl.c: PIOCCONT */
 		p->p_step = 0;
 		wakeup(&p->p_step);
 	}
 	/*
 	 * Some signals have a process-wide effect and a per-thread
 	 * component.  Most processing occurs when the process next
 	 * tries to cross the user boundary, however there are some
 	 * times when processing needs to be done immediately, such as
 	 * waking up threads so that they can cross the user boundary.
 	 * We try to do the per-process part here.
 	 */
 	if (P_SHOULDSTOP(p)) {
 		KASSERT(!(p->p_flag & P_WEXIT),
 		    ("signal to stopped but exiting process"));
 		if (sig == SIGKILL) {
 			/*
 			 * If traced process is already stopped,
 			 * then no further action is necessary.
 			 */
 			if (p->p_flag & P_TRACED)
 				goto out;
 			/*
 			 * SIGKILL sets process running.
 			 * It will die elsewhere.
 			 * All threads must be restarted.
 			 */
 			p->p_flag &= ~P_STOPPED_SIG;
 			goto runfast;
 		}
 
 		if (prop & SIGPROP_CONT) {
 			/*
 			 * If traced process is already stopped,
 			 * then no further action is necessary.
 			 */
 			if (p->p_flag & P_TRACED)
 				goto out;
 			/*
 			 * If SIGCONT is default (or ignored), we continue the
 			 * process but don't leave the signal in sigqueue as
 			 * it has no further action.  If SIGCONT is held, we
 			 * continue the process and leave the signal in
 			 * sigqueue.  If the process catches SIGCONT, let it
 			 * handle the signal itself.  If it isn't waiting on
 			 * an event, it goes back to run state.
 			 * Otherwise, process goes back to sleep state.
 			 */
 			p->p_flag &= ~P_STOPPED_SIG;
 			PROC_SLOCK(p);
 			if (p->p_numthreads == p->p_suspcount) {
 				/*
 				 * The proc slock is dropped around the
 				 * parent notification and retaken after it.
 				 */
 				PROC_SUNLOCK(p);
 				p->p_flag |= P_CONTINUED;
 				p->p_xsig = SIGCONT;
 				PROC_LOCK(p->p_pptr);
 				childproc_continued(p);
 				PROC_UNLOCK(p->p_pptr);
 				PROC_SLOCK(p);
 			}
 			if (action == SIG_DFL) {
 				thread_unsuspend(p);
 				PROC_SUNLOCK(p);
 				sigqueue_delete(sigqueue, sig);
 				goto out;
 			}
 			if (action == SIG_CATCH) {
 				/*
 				 * The process wants to catch it so it needs
 				 * to run at least one thread, but which one?
 				 */
 				PROC_SUNLOCK(p);
 				goto runfast;
 			}
 			/*
 			 * The signal is not ignored or caught.
 			 */
 			thread_unsuspend(p);
 			PROC_SUNLOCK(p);
 			goto out;
 		}
 
 		if (prop & SIGPROP_STOP) {
 			/*
 			 * If traced process is already stopped,
 			 * then no further action is necessary.
 			 */
 			if (p->p_flag & P_TRACED)
 				goto out;
 			/*
 			 * Already stopped, don't need to stop again
 			 * (If we did the shell could get confused).
 			 * Just make sure the signal STOP bit set.
 			 */
 			p->p_flag |= P_STOPPED_SIG;
 			sigqueue_delete(sigqueue, sig);
 			goto out;
 		}
 
 		/*
 		 * All other kinds of signals:
 		 * If a thread is sleeping interruptibly, simulate a
 		 * wakeup so that when it is continued it will be made
 		 * runnable and can look at the signal.  However, don't make
 		 * the PROCESS runnable, leave it stopped.
 		 * It may run a bit until it hits a thread_suspend_check().
 		 */
 		wakeup_swapper = 0;
 		PROC_SLOCK(p);
 		thread_lock(td);
 		if (TD_ON_SLEEPQ(td) && (td->td_flags & TDF_SINTR))
 			wakeup_swapper = sleepq_abort(td, intrval);
 		thread_unlock(td);
 		PROC_SUNLOCK(p);
 		if (wakeup_swapper)
 			kick_proc0();
 		goto out;
 		/*
 		 * Mutexes are short lived. Threads waiting on them will
 		 * hit thread_suspend_check() soon.
 		 */
 	} else if (p->p_state == PRS_NORMAL) {
 		if (p->p_flag & P_TRACED || action == SIG_CATCH) {
 			tdsigwakeup(td, sig, action, intrval);
 			goto out;
 		}
 
 		MPASS(action == SIG_DFL);
 
 		if (prop & SIGPROP_STOP) {
 			/* Stops are not acted on here for these processes. */
 			if (p->p_flag & (P_PPWAIT|P_WEXIT))
 				goto out;
 			p->p_flag |= P_STOPPED_SIG;
 			p->p_xsig = sig;
 			PROC_SLOCK(p);
 			wakeup_swapper = sig_suspend_threads(td, p, 1);
 			if (p->p_numthreads == p->p_suspcount) {
 				/*
 				 * only thread sending signal to another
 				 * process can reach here, if thread is sending
 				 * signal to its process, because thread does
 				 * not suspend itself here, p_numthreads
 				 * should never be equal to p_suspcount.
 				 */
 				thread_stopped(p);
 				PROC_SUNLOCK(p);
 				sigqueue_delete_proc(p, p->p_xsig);
 			} else
 				PROC_SUNLOCK(p);
 			if (wakeup_swapper)
 				kick_proc0();
 			goto out;
 		}
 	} else {
 		/* Not in "NORMAL" state. discard the signal. */
 		sigqueue_delete(sigqueue, sig);
 		goto out;
 	}
 
 	/*
 	 * The process is not stopped so we need to apply the signal to all the
 	 * running threads.
 	 */
 runfast:
 	tdsigwakeup(td, sig, action, intrval);
 	PROC_SLOCK(p);
 	thread_unsuspend(p);
 	PROC_SUNLOCK(p);
 out:
 	/* If we jump here, proc slock should not be owned. */
 	PROC_SLOCK_ASSERT(p, MA_NOTOWNED);
 	return (ret);
 }
 
 /*
  * The force of a signal has been directed against a single
  * thread.  We need to see what we can do about knocking it
  * out of any sleep it may be in etc.
  *
  * td is the target thread, sig the signal, action its disposition as
  * classified by the caller (SIG_DFL, SIG_CATCH, ...) and intrval the
  * value passed to sleepq_abort() when an interruptible sleep is broken.
  * The process lock must be held; the proc slock and the thread lock are
  * taken here and both are released again on every return path.
  */
 static void
 tdsigwakeup(struct thread *td, int sig, sig_t action, int intrval)
 {
 	struct proc *p = td->td_proc;
 	int prop;
 	int wakeup_swapper;
 
 	wakeup_swapper = 0;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	prop = sigprop(sig);
 
 	PROC_SLOCK(p);
 	thread_lock(td);
 	/*
 	 * Bring the priority of a thread up if we want it to get
 	 * killed in this lifetime.  Be careful to avoid bumping the
 	 * priority of the idle thread, since we still allow to signal
 	 * kernel processes.
 	 */
 	if (action == SIG_DFL && (prop & SIGPROP_KILL) != 0 &&
 	    td->td_priority > PUSER && !TD_IS_IDLETHREAD(td))
 		sched_prio(td, PUSER);
 	if (TD_ON_SLEEPQ(td)) {
 		/*
 		 * If thread is sleeping uninterruptibly
 		 * we can't interrupt the sleep... the signal will
 		 * be noticed when the process returns through
 		 * trap() or syscall().
 		 */
 		if ((td->td_flags & TDF_SINTR) == 0)
 			goto out;
 		/*
 		 * If SIGCONT is default (or ignored) and process is
 		 * asleep, we are finished; the process should not
 		 * be awakened.
 		 */
 		if ((prop & SIGPROP_CONT) && action == SIG_DFL) {
 			/* Both locks are dropped before the queues are edited. */
 			thread_unlock(td);
 			PROC_SUNLOCK(p);
 			sigqueue_delete(&p->p_sigqueue, sig);
 			/*
 			 * It may be on either list in this state.
 			 * Remove from both for now.
 			 */
 			sigqueue_delete(&td->td_sigqueue, sig);
 			return;
 		}
 
 		/*
 		 * Don't awaken a sleeping thread for SIGSTOP if the
 		 * STOP signal is deferred.
 		 */
 		if ((prop & SIGPROP_STOP) != 0 && (td->td_flags & (TDF_SBDRY |
 		    TDF_SERESTART | TDF_SEINTR)) == TDF_SBDRY)
 			goto out;
 
 		/*
 		 * Give low priority threads a better chance to run.
 		 */
 		if (td->td_priority > PUSER && !TD_IS_IDLETHREAD(td))
 			sched_prio(td, PUSER);
 
 		/* Break the interruptible sleep with the caller's errno. */
 		wakeup_swapper = sleepq_abort(td, intrval);
 	} else {
 		/*
 		 * Other states do nothing with the signal immediately,
 		 * other than kicking ourselves if we are running.
 		 * It will either never be noticed, or noticed very soon.
 		 */
 #ifdef SMP
 		if (TD_IS_RUNNING(td) && td != curthread)
 			forward_signal(td);
 #endif
 	}
 out:
 	PROC_SUNLOCK(p);
 	thread_unlock(td);
 	/* Only sleepq_abort() above can have requested this. */
 	if (wakeup_swapper)
 		kick_proc0();
 }
 
 /*
  * Flag every thread of process p so that it notices a pending stop, and
  * suspend or interrupt those that are in an interruptible sleep.
  *
  * td is the thread on whose behalf this is done; "sending" is non-zero
  * when the caller need not be td itself (otherwise td must be
  * curthread, see the MPASS below).  Both the process lock and the proc
  * slock must be held.
  *
  * Returns the accumulated sleepq_abort() results; the caller in this
  * file calls kick_proc0() when the value is non-zero.
  */
 static int
 sig_suspend_threads(struct thread *td, struct proc *p, int sending)
 {
 	struct thread *td2;
 	int wakeup_swapper;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	PROC_SLOCK_ASSERT(p, MA_OWNED);
 	MPASS(sending || td == curthread);
 
 	wakeup_swapper = 0;
 	FOREACH_THREAD_IN_PROC(p, td2) {
 		thread_lock(td2);
 		td2->td_flags |= TDF_ASTPENDING | TDF_NEEDSUSPCHK;
 		if ((TD_IS_SLEEPING(td2) || TD_IS_SWAPPED(td2)) &&
 		    (td2->td_flags & TDF_SINTR)) {
 			if (td2->td_flags & TDF_SBDRY) {
 				/*
 				 * Once a thread is asleep with
 				 * TDF_SBDRY and without TDF_SERESTART
 				 * or TDF_SEINTR set, it should never
 				 * become suspended due to this check.
 				 */
 				KASSERT(!TD_IS_SUSPENDED(td2),
 				    ("thread with deferred stops suspended"));
 				if (TD_SBDRY_INTR(td2))
 					wakeup_swapper |= sleepq_abort(td2,
 					    TD_SBDRY_ERRNO(td2));
 			} else if (!TD_IS_SUSPENDED(td2)) {
 				thread_suspend_one(td2);
 			}
 		} else if (!TD_IS_SUSPENDED(td2)) {
 			/*
 			 * NOTE(review): TDF_ASTPENDING was already set
 			 * unconditionally at the top of the loop body, so
 			 * this assignment looks redundant -- confirm.
 			 */
 			if (sending || td != td2)
 				td2->td_flags |= TDF_ASTPENDING;
 #ifdef SMP
 			if (TD_IS_RUNNING(td2) && td2 != td)
 				forward_signal(td2);
 #endif
 		}
 		thread_unlock(td2);
 	}
 	return (wakeup_swapper);
 }
 
 /*
  * Stop the process for an event deemed interesting to the debugger. If si is
  * non-NULL, this is a signal exchange; the new signal requested by the
  * debugger will be returned for handling. If si is NULL, this is some other
  * type of interesting event. The debugger may request a signal be delivered in
  * that case as well, however it will be deferred until it can be handled.
  *
  * The process lock must be held on entry.  Returns td->td_xsig as left
  * after the stop, or 0 when the stop was skipped because the process is
  * in single-threaded exit, or when a replacement signal was directed at
  * a different thread.
  */
 int
 ptracestop(struct thread *td, int sig, ksiginfo_t *si)
 {
 	struct proc *p = td->td_proc;
 	struct thread *td2;
 	ksiginfo_t ksi;
 	int prop;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	KASSERT(!(p->p_flag & P_WEXIT), ("Stopping exiting process"));
 	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK,
 	    &p->p_mtx.lock_object, "Stopping for traced signal");
 
 	td->td_xsig = sig;
 
 	/* Signals that were injected via ptrace (KSI_PTRACE) do not stop. */
 	if (si == NULL || (si->ksi_flags & KSI_PTRACE) == 0) {
 		td->td_dbgflags |= TDB_XSIG;
 		CTR4(KTR_PTRACE, "ptracestop: tid %d (pid %d) flags %#x sig %d",
 		    td->td_tid, p->p_pid, td->td_dbgflags, sig);
 		PROC_SLOCK(p);
 		while ((p->p_flag & P_TRACED) && (td->td_dbgflags & TDB_XSIG)) {
 			if (P_KILLED(p)) {
 				/*
 				 * Ensure that, if we've been PT_KILLed, the
 				 * exit status reflects that. Another thread
 				 * may also be in ptracestop(), having just
 				 * received the SIGKILL, but this thread was
 				 * unsuspended first.
 				 */
 				td->td_dbgflags &= ~TDB_XSIG;
 				td->td_xsig = SIGKILL;
 				p->p_ptevents = 0;
 				break;
 			}
 			if (p->p_flag & P_SINGLE_EXIT &&
 			    !(td->td_dbgflags & TDB_EXIT)) {
 				/*
 				 * Ignore ptrace stops except for thread exit
 				 * events when the process exits.
 				 */
 				td->td_dbgflags &= ~TDB_XSIG;
 				PROC_SUNLOCK(p);
 				return (0);
 			}
 
 			/*
 			 * Make wait(2) work.  Ensure that right after the
 			 * attach, the thread which was decided to become the
 			 * leader of attach gets reported to the waiter.
 			 * Otherwise, just avoid overwriting another thread's
 			 * assignment to p_xthread.  If another thread has
 			 * already set p_xthread, the current thread will get
 			 * a chance to report itself upon the next iteration.
 			 */
 			if ((td->td_dbgflags & TDB_FSTP) != 0 ||
 			    ((p->p_flag2 & P2_PTRACE_FSTP) == 0 &&
 			    p->p_xthread == NULL)) {
 				p->p_xsig = sig;
 				p->p_xthread = td;
 				td->td_dbgflags &= ~TDB_FSTP;
 				p->p_flag2 &= ~P2_PTRACE_FSTP;
 				p->p_flag |= P_STOPPED_SIG | P_STOPPED_TRACE;
 				sig_suspend_threads(td, p, 0);
 			}
 			if ((td->td_dbgflags & TDB_STOPATFORK) != 0) {
 				td->td_dbgflags &= ~TDB_STOPATFORK;
 			}
 stopme:
 			thread_suspend_switch(td, p);
 			if (p->p_xthread == td)
 				p->p_xthread = NULL;
 			if (!(p->p_flag & P_TRACED))
 				break;
 			/*
 			 * While TDB_SUSPEND is set the thread suspends
 			 * again, unless the process is exiting.
 			 */
 			if (td->td_dbgflags & TDB_SUSPEND) {
 				if (p->p_flag & P_SINGLE_EXIT)
 					break;
 				goto stopme;
 			}
 		}
 		PROC_SUNLOCK(p);
 	}
 
 	if (si != NULL && sig == td->td_xsig) {
 		/* Parent wants us to take the original signal unchanged. */
 		si->ksi_flags |= KSI_HEAD;
 		if (sigqueue_add(&td->td_sigqueue, sig, si) != 0)
 			si->ksi_signo = 0;
 	} else if (td->td_xsig != 0) {
 		/*
 		 * If parent wants us to take a new signal, then it will leave
 		 * it in td->td_xsig; otherwise we just look for signals again.
 		 */
 		ksiginfo_init(&ksi);
 		ksi.ksi_signo = td->td_xsig;
 		ksi.ksi_flags |= KSI_PTRACE;
 		prop = sigprop(td->td_xsig);
 		td2 = sigtd(p, td->td_xsig, prop);
 		tdsendsignal(p, td2, td->td_xsig, &ksi);
 		/* Another thread was chosen; nothing for this one to take. */
 		if (td != td2)
 			return (0);
 	}
 
 	return (td->td_xsig);
 }
 
 /*
  * For every signal in "block" that is also pending in p->p_siglist,
  * choose a thread with sigtd(), notify it, and wake it when the process
  * is traced or the signal is caught and not masked by that thread.
  *
  * The process lock must be held.  SIGPROCMASK_PS_LOCKED in flags states
  * that the caller already holds the sigacts mutex; without it the mutex
  * must not be held and is taken around each wakeup decision.
  */
 static void
 reschedule_signals(struct proc *p, sigset_t block, int flags)
 {
 	struct sigacts *ps;
 	struct thread *td;
 	int sig;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	ps = p->p_sigacts;
 	mtx_assert(&ps->ps_mtx, (flags & SIGPROCMASK_PS_LOCKED) != 0 ?
 	    MA_OWNED : MA_NOTOWNED);
 	if (SIGISEMPTY(p->p_siglist))
 		return;
 	/* Restrict the set to signals that are actually pending. */
 	SIGSETAND(block, p->p_siglist);
 	while ((sig = sig_ffs(&block)) != 0) {
 		SIGDELSET(block, sig);
 		td = sigtd(p, sig, 0);
 		signotify(td);
 		if (!(flags & SIGPROCMASK_PS_LOCKED))
 			mtx_lock(&ps->ps_mtx);
 		if (p->p_flag & P_TRACED ||
 		    (SIGISMEMBER(ps->ps_sigcatch, sig) &&
 		    !SIGISMEMBER(td->td_sigmask, sig)))
 			tdsigwakeup(td, sig, SIG_CATCH,
 			    (SIGISMEMBER(ps->ps_sigintr, sig) ? EINTR :
 			     ERESTART));
 		if (!(flags & SIGPROCMASK_PS_LOCKED))
 			mtx_unlock(&ps->ps_mtx);
 	}
 }
 
 /*
  * Signal-related cleanup for a thread that will no longer handle
  * signals: flush its private signal queue and, if other threads remain
  * in the process, block everything in its mask and let
  * reschedule_signals() redirect the signals it previously left
  * unblocked.  The process lock must be held.
  */
 void
 tdsigcleanup(struct thread *td)
 {
 	struct proc *p;
 	sigset_t was_open;
 
 	p = td->td_proc;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	sigqueue_flush(&td->td_sigqueue);
 
 	/* Nobody else is left to take over when this is the only thread. */
 	if (p->p_numthreads == 1)
 		return;
 
 	/*
 	 * Since we cannot handle signals, notify signal post code
 	 * about this by filling the sigmask.
 	 *
 	 * Also, if needed, wake up thread(s) that do not block the
 	 * same signals as the exiting thread, since the thread might
 	 * have been selected for delivery and woken up.
 	 */
 	SIGFILLSET(was_open);
 	SIGSETNAND(was_open, td->td_sigmask);
 	SIGFILLSET(td->td_sigmask);
 
 	reschedule_signals(p, was_open, 0);
 }
 
 /*
  * Extract the stop-deferral bits (TDF_SBDRY, TDF_SEINTR, TDF_SERESTART)
  * from a td_flags value.  TDF_SEINTR and TDF_SERESTART are only valid
  * together with TDF_SBDRY, which is asserted.
  */
 static int
 sigdeferstop_curr_flags(int cflags)
 {
 	int deferbits;
 
 	MPASS((cflags & (TDF_SEINTR | TDF_SERESTART)) == 0 ||
 	    (cflags & TDF_SBDRY) != 0);
 
 	deferbits = TDF_SBDRY | TDF_SEINTR | TDF_SERESTART;
 	return (cflags & deferbits);
 }
 
 /*
  * Defer the delivery of SIGSTOP for the current thread, according to
  * the requested mode.  Returns previous flags, which must be restored
  * by sigallowstop(), or SIGDEFERSTOP_VAL_NCHG when the requested mode
  * is already in effect and nothing was changed.  An unknown mode
  * panics.
  *
  * TDF_SBDRY, TDF_SEINTR, and TDF_SERESTART flags are only set and
  * cleared by the current thread, which allow the lock-less read-only
  * accesses below.
  */
 int
 sigdeferstop_impl(int mode)
 {
 	struct thread *td;
 	int cur, want;
 
 	td = curthread;
 	cur = sigdeferstop_curr_flags(td->td_flags);
 
 	switch (mode) {
 	case SIGDEFERSTOP_NOP:
 		want = cur;
 		break;
 	case SIGDEFERSTOP_OFF:
 		want = 0;
 		break;
 	case SIGDEFERSTOP_SILENT:
 		want = (cur & ~(TDF_SEINTR | TDF_SERESTART)) | TDF_SBDRY;
 		break;
 	case SIGDEFERSTOP_EINTR:
 		want = (cur & ~TDF_SERESTART) | TDF_SBDRY | TDF_SEINTR;
 		break;
 	case SIGDEFERSTOP_ERESTART:
 		want = (cur & ~TDF_SEINTR) | TDF_SBDRY | TDF_SERESTART;
 		break;
 	default:
 		panic("sigdeferstop: invalid mode %x", mode);
 		break;
 	}
 
 	if (want == cur)
 		return (SIGDEFERSTOP_VAL_NCHG);
 
 	/* Replace the old deferral bits with the new ones. */
 	thread_lock(td);
 	td->td_flags = (td->td_flags & ~cur) | want;
 	thread_unlock(td);
 
 	return (cur);
 }
 
 /*
  * Restores the STOP handling mode, typically permitting the delivery
  * of SIGSTOP for the current thread.  This does not immediately
  * suspend if a stop was posted.  Instead, the thread will suspend
  * either via ast() or a subsequent interruptible sleep.
  *
  * prev is the value previously returned by sigdeferstop_impl(); it must
  * not be SIGDEFERSTOP_VAL_NCHG and may only contain the deferral bits.
  */
 void
 sigallowstop_impl(int prev)
 {
 	struct thread *td;
 	int cur;
 
 	KASSERT(prev != SIGDEFERSTOP_VAL_NCHG, ("failed sigallowstop"));
 	KASSERT((prev & ~(TDF_SBDRY | TDF_SEINTR | TDF_SERESTART)) == 0,
 	    ("sigallowstop: incorrect previous mode %x", prev));
 
 	td = curthread;
 	cur = sigdeferstop_curr_flags(td->td_flags);
 	if (cur == prev)
 		return;
 
 	thread_lock(td);
 	td->td_flags = (td->td_flags & ~cur) | prev;
 	thread_unlock(td);
 }
 
 /*
  * If the current process has received a signal (should be caught or cause
  * termination, should interrupt current syscall), return the signal number.
  * Stop signals with default action are processed immediately, then cleared;
  * they aren't returned.  This is checked after each entry to the system for
  * a syscall or trap (though this can usually be done without calling issignal
  * by checking the pending signal masks in cursig.) The normal call
  * sequence is
  *
  *	while (sig = cursig(curthread))
  *		postsig(sig);
  */
 static int
 issignal(struct thread *td)
 {
 	struct proc *p;
 	struct sigacts *ps;
 	struct sigqueue *queue;
 	sigset_t sigpending;
 	ksiginfo_t ksi;
 	int prop, sig, traced;
 
 	p = td->td_proc;
 	ps = p->p_sigacts;
 	mtx_assert(&ps->ps_mtx, MA_OWNED);
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	for (;;) {
 		traced = (p->p_flag & P_TRACED) || (p->p_stops & S_SIG);
 
 		sigpending = td->td_sigqueue.sq_signals;
 		SIGSETOR(sigpending, p->p_sigqueue.sq_signals);
 		SIGSETNAND(sigpending, td->td_sigmask);
 
 		if ((p->p_flag & P_PPWAIT) != 0 || (td->td_flags &
 		    (TDF_SBDRY | TDF_SERESTART | TDF_SEINTR)) == TDF_SBDRY)
 			SIG_STOPSIGMASK(sigpending);
 		if (SIGISEMPTY(sigpending))	/* no signal to send */
 			return (0);
 		if ((p->p_flag & (P_TRACED | P_PPTRACE)) == P_TRACED &&
 		    (p->p_flag2 & P2_PTRACE_FSTP) != 0 &&
 		    SIGISMEMBER(sigpending, SIGSTOP)) {
 			/*
 			 * If debugger just attached, always consume
 			 * SIGSTOP from ptrace(PT_ATTACH) first, to
 			 * execute the debugger attach ritual in
 			 * order.
 			 */
 			sig = SIGSTOP;
 			td->td_dbgflags |= TDB_FSTP;
 		} else {
 			sig = sig_ffs(&sigpending);
 		}
 
 		if (p->p_stops & S_SIG) {
 			mtx_unlock(&ps->ps_mtx);
 			stopevent(p, S_SIG, sig);
 			mtx_lock(&ps->ps_mtx);
 		}
 
 		/*
 		 * We should see pending but ignored signals
 		 * only if P_TRACED was on when they were posted.
 		 */
 		if (SIGISMEMBER(ps->ps_sigignore, sig) && (traced == 0)) {
 			sigqueue_delete(&td->td_sigqueue, sig);
 			sigqueue_delete(&p->p_sigqueue, sig);
 			continue;
 		}
 		if ((p->p_flag & (P_TRACED | P_PPTRACE)) == P_TRACED) {
 			/*
 			 * If traced, always stop.
 			 * Remove old signal from queue before the stop.
 			 * XXX shrug off debugger, it causes siginfo to
 			 * be thrown away.
 			 */
 			queue = &td->td_sigqueue;
 			ksiginfo_init(&ksi);
 			if (sigqueue_get(queue, sig, &ksi) == 0) {
 				queue = &p->p_sigqueue;
 				sigqueue_get(queue, sig, &ksi);
 			}
 			td->td_si = ksi.ksi_info;
 
 			mtx_unlock(&ps->ps_mtx);
 			sig = ptracestop(td, sig, &ksi);
 			mtx_lock(&ps->ps_mtx);
 
 			/* 
 			 * Keep looking if the debugger discarded or
 			 * replaced the signal.
 			 */
 			if (sig == 0)
 				continue;
 
 			/*
 			 * If the signal became masked, re-queue it.
 			 */
 			if (SIGISMEMBER(td->td_sigmask, sig)) {
 				ksi.ksi_flags |= KSI_HEAD;
 				sigqueue_add(&p->p_sigqueue, sig, &ksi);
 				continue;
 			}
 
 			/*
 			 * If the traced bit got turned off, requeue
 			 * the signal and go back up to the top to
 			 * rescan signals.  This ensures that p_sig*
 			 * and p_sigact are consistent.
 			 */
 			if ((p->p_flag & P_TRACED) == 0) {
 				ksi.ksi_flags |= KSI_HEAD;
 				sigqueue_add(queue, sig, &ksi);
 				continue;
 			}
 		}
 
 		prop = sigprop(sig);
 
 		/*
 		 * Decide whether the signal should be returned.
 		 * Return the signal's number, or fall through
 		 * to clear it from the pending mask.
 		 */
 		switch ((intptr_t)p->p_sigacts->ps_sigact[_SIG_IDX(sig)]) {
 
 		case (intptr_t)SIG_DFL:
 			/*
 			 * Don't take default actions on system processes.
 			 */
 			if (p->p_pid <= 1) {
 #ifdef DIAGNOSTIC
 				/*
 				 * Are you sure you want to ignore SIGSEGV
 				 * in init? XXX
 				 */
 				printf("Process (pid %lu) got signal %d\n",
 					(u_long)p->p_pid, sig);
 #endif
 				break;		/* == ignore */
 			}
 			/*
 			 * If there is a pending stop signal to process with
 			 * default action, stop here, then clear the signal.
 			 * Traced or exiting processes should ignore stops.
 			 * Additionally, a member of an orphaned process group
 			 * should ignore tty stops.
 			 */
 			if (prop & SIGPROP_STOP) {
 				if (p->p_flag &
 				    (P_TRACED | P_WEXIT | P_SINGLE_EXIT) ||
 				    (p->p_pgrp->pg_jobc == 0 &&
 				     prop & SIGPROP_TTYSTOP))
 					break;	/* == ignore */
 				if (TD_SBDRY_INTR(td)) {
 					KASSERT((td->td_flags & TDF_SBDRY) != 0,
 					    ("lost TDF_SBDRY"));
 					return (-1);
 				}
 				mtx_unlock(&ps->ps_mtx);
 				WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK,
 				    &p->p_mtx.lock_object, "Catching SIGSTOP");
 				sigqueue_delete(&td->td_sigqueue, sig);
 				sigqueue_delete(&p->p_sigqueue, sig);
 				p->p_flag |= P_STOPPED_SIG;
 				p->p_xsig = sig;
 				PROC_SLOCK(p);
 				sig_suspend_threads(td, p, 0);
 				thread_suspend_switch(td, p);
 				PROC_SUNLOCK(p);
 				mtx_lock(&ps->ps_mtx);
 				goto next;
 			} else if (prop & SIGPROP_IGNORE) {
 				/*
 				 * Except for SIGCONT, shouldn't get here.
 				 * Default action is to ignore; drop it.
 				 */
 				break;		/* == ignore */
 			} else
 				return (sig);
 			/*NOTREACHED*/
 
 		case (intptr_t)SIG_IGN:
 			/*
 			 * Masking above should prevent us ever trying
 			 * to take action on an ignored signal other
 			 * than SIGCONT, unless process is traced.
 			 */
 			if ((prop & SIGPROP_CONT) == 0 &&
 			    (p->p_flag & P_TRACED) == 0)
 				printf("issignal\n");
 			break;		/* == ignore */
 
 		default:
 			/*
 			 * This signal has an action, let
 			 * postsig() process it.
 			 */
 			return (sig);
 		}
 		sigqueue_delete(&td->td_sigqueue, sig);	/* take the signal! */
 		sigqueue_delete(&p->p_sigqueue, sig);
 next:;
 	}
 	/* NOTREACHED */
 }
 
 /*
  * If the process is stopped by a signal and every one of its threads is
  * accounted for as suspended, report the stop to the parent through
  * childproc_stopped().
  *
  * The caller must hold both the proc lock and the proc spin lock of p; the
  * spin lock is released around the parent notification and re-acquired
  * before returning.
  */
 void
 thread_stopped(struct proc *p)
 {
 	int n;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	PROC_SLOCK_ASSERT(p, MA_OWNED);
 	n = p->p_suspcount;
 	/*
 	 * When called from a thread of p itself, count that thread too
 	 * (presumably it is not yet included in p_suspcount).
 	 */
 	if (p == curproc)
 		n++;
 	if ((p->p_flag & P_STOPPED_SIG) && (n == p->p_numthreads)) {
 		/* Drop the spin lock before taking the parent's proc lock. */
 		PROC_SUNLOCK(p);
 		p->p_flag &= ~P_WAITED;
 		PROC_LOCK(p->p_pptr);
 		/* A traced process reports CLD_TRAPPED rather than CLD_STOPPED. */
 		childproc_stopped(p, (p->p_flag & P_TRACED) ?
 			CLD_TRAPPED : CLD_STOPPED);
 		PROC_UNLOCK(p->p_pptr);
 		PROC_SLOCK(p);
 	}
 }
 
 /*
  * Take the action for the specified signal
  * from the current set of pending signals.
  *
  * Must be called with the proc lock and the sigacts mutex of the current
  * process held.  Returns 0 when the signal was found on neither the thread
  * nor the process queue, and 1 after a caught signal has been handed to the
  * ABI's sv_sendsig routine.  When the action is SIG_DFL the process is
  * terminated through sigexit() and this function does not return.
  */
 int
 postsig(int sig)
 {
 	struct thread *td;
 	struct proc *p;
 	struct sigacts *ps;
 	sig_t action;
 	ksiginfo_t ksi;
 	sigset_t returnmask;
 
 	KASSERT(sig != 0, ("postsig"));
 
 	td = curthread;
 	p = td->td_proc;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	ps = p->p_sigacts;
 	mtx_assert(&ps->ps_mtx, MA_OWNED);
 	ksiginfo_init(&ksi);
 	/*
 	 * Dequeue the signal, trying the thread queue first and the
 	 * process queue second; give up if neither one holds it.
 	 */
 	if (sigqueue_get(&td->td_sigqueue, sig, &ksi) == 0 &&
 	    sigqueue_get(&p->p_sigqueue, sig, &ksi) == 0)
 		return (0);
 	ksi.ksi_signo = sig;
 	/* Timer-generated signals are passed through itimer_accept(). */
 	if (ksi.ksi_code == SI_TIMER)
 		itimer_accept(p, ksi.ksi_timerid, &ksi);
 	action = ps->ps_sigact[_SIG_IDX(sig)];
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_PSIG))
 		ktrpsig(sig, action, td->td_pflags & TDP_OLDMASK ?
 		    &td->td_oldsigmask : &td->td_sigmask, ksi.ksi_code);
 #endif
 	if ((p->p_stops & S_SIG) != 0) {
 		/* The sigacts mutex is not held across stopevent(). */
 		mtx_unlock(&ps->ps_mtx);
 		stopevent(p, S_SIG, sig);
 		mtx_lock(&ps->ps_mtx);
 	}
 
 	if (action == SIG_DFL) {
 		/*
 		 * Default action, where the default is to kill
 		 * the process.  (Other cases were ignored above.)
 		 */
 		mtx_unlock(&ps->ps_mtx);
 		proc_td_siginfo_capture(td, &ksi.ksi_info);
 		sigexit(td, sig);
 		/* NOTREACHED */
 	} else {
 		/*
 		 * If we get here, the signal must be caught.
 		 */
 		KASSERT(action != SIG_IGN, ("postsig action %p", action));
 		KASSERT(!SIGISMEMBER(td->td_sigmask, sig),
 		    ("postsig action: blocked sig %d", sig));
 
 		/*
 		 * Set the new mask value and also defer further
 		 * occurrences of this signal.
 		 *
 		 * Special case: user has done a sigsuspend.  Here the
 		 * current mask is not of interest, but rather the
 		 * mask from before the sigsuspend is what we want
 		 * restored after the signal processing is completed.
 		 */
 		if (td->td_pflags & TDP_OLDMASK) {
 			returnmask = td->td_oldsigmask;
 			td->td_pflags &= ~TDP_OLDMASK;
 		} else
 			returnmask = td->td_sigmask;
 
 		/* Clear the recorded p_sig/p_code if they refer to this signal. */
 		if (p->p_sig == sig) {
 			p->p_code = 0;
 			p->p_sig = 0;
 		}
 		/* Let the ABI-specific routine set up delivery to the handler. */
 		(*p->p_sysent->sv_sendsig)(action, &ksi, &returnmask);
 		postsig_done(sig, td, ps);
 	}
 	return (1);
 }
 
 /*
  * Kill the specified process for the stated reason: trace and log the
  * event (pid, command name, uid and the reason string), mark the process
  * with P_WKILLED and post SIGKILL to it.  The caller must hold the proc
  * lock of p.
  */
 void
 killproc(struct proc *p, char *why)
 {
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	CTR3(KTR_PROC, "killproc: proc %p (pid %d, %s)", p, p->p_pid,
 	    p->p_comm);
 	/* A process without credentials is logged with uid -1. */
 	log(LOG_ERR, "pid %d (%s), uid %d, was killed: %s\n", p->p_pid,
 	    p->p_comm, p->p_ucred ? p->p_ucred->cr_uid : -1, why);
 	p->p_flag |= P_WKILLED;
 	kern_psignal(p, SIGKILL);
 }
 
 /*
  * Force the current process to exit with the specified signal, dumping core
  * if appropriate.  We bypass the normal tests for masked and caught signals,
  * allowing unrecoverable failures to terminate the process without changing
  * signal state.  Mark the accounting record with the signal termination.
  * If dumping core, save the signal number for the debugger.  Calls exit and
  * does not return.
  */
 void
 sigexit(struct thread *td, int sig)
 {
 	struct proc *p = td->td_proc;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	/* Record in the accounting flags that the process died from a signal. */
 	p->p_acflag |= AXSIG;
 	/*
 	 * We must be single-threading to generate a core dump.  This
 	 * ensures that the registers in the core file are up-to-date.
 	 * Also, the ELF dump handler assumes that the thread list doesn't
 	 * change out from under it.
 	 *
 	 * XXX If another thread attempts to single-thread before us
 	 *     (e.g. via fork()), we won't get a dump at all.
 	 */
 	if ((sigprop(sig) & SIGPROP_CORE) &&
 	    thread_single(p, SINGLE_NO_EXIT) == 0) {
 		p->p_sig = sig;
 		/*
 		 * Log signals which would cause core dumps
 		 * (Log as LOG_INFO to appease those who don't want
 		 * these messages.)
 		 * XXX : Todo, as well as euid, write out ruid too
 		 * Note that coredump() drops proc lock.
 		 */
 		/* WCOREFLAG is folded into the exit status on a successful dump. */
 		if (coredump(td) == 0)
 			sig |= WCOREFLAG;
 		if (kern_logsigexit)
 			log(LOG_INFO,
 			    "pid %d (%s), uid %d: exited on signal %d%s\n",
 			    p->p_pid, p->p_comm,
 			    td->td_ucred ? td->td_ucred->cr_uid : -1,
 			    sig &~ WCOREFLAG,
 			    sig & WCOREFLAG ? " (core dumped)" : "");
 	} else
 		/* No dump attempted, so the proc lock is released here instead. */
 		PROC_UNLOCK(p);
 	exit1(td, 0, sig);
 	/* NOTREACHED */
 }
 
 /*
  * Send queued SIGCHLD to parent when child process's state
  * is changed.
  */
 static void
 sigparent(struct proc *p, int reason, int status)
 {
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	PROC_LOCK_ASSERT(p->p_pptr, MA_OWNED);
 
 	if (p->p_ksi != NULL) {
 		p->p_ksi->ksi_signo  = SIGCHLD;
 		p->p_ksi->ksi_code   = reason;
 		p->p_ksi->ksi_status = status;
 		p->p_ksi->ksi_pid    = p->p_pid;
 		p->p_ksi->ksi_uid    = p->p_ucred->cr_ruid;
 		if (KSI_ONQ(p->p_ksi))
 			return;
 	}
 	pksignal(p->p_pptr, SIGCHLD, p->p_ksi);
 }
 
 /*
  * Report a job-control state change (stop or continue) of child p to its
  * parent.  Both the child's and the parent's proc locks must be held.
  */
 static void
 childproc_jobstate(struct proc *p, int reason, int sig)
 {
 	struct proc *parent;
 	struct sigacts *pacts;
 	int want_sigchld;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	PROC_LOCK_ASSERT(p->p_pptr, MA_OWNED);
 
 	/*
 	 * Wake up a parent sleeping in kern_wait().  SIGCHLD is sent as
 	 * well, but it alone does not guarantee that the parent wakes up,
 	 * because the parent may have masked the signal.
 	 */
 	parent = p->p_pptr;
 	parent->p_flag |= P_STATCHILD;
 	wakeup(parent);
 
 	/* Sample PS_NOCLDSTOP under the parent's sigacts mutex. */
 	pacts = parent->p_sigacts;
 	mtx_lock(&pacts->ps_mtx);
 	want_sigchld = (pacts->ps_flag & PS_NOCLDSTOP) == 0;
 	mtx_unlock(&pacts->ps_mtx);
 
 	if (want_sigchld)
 		sigparent(p, reason, sig);
 }
 
 /*
  * Tell the parent that child p has stopped; the signal number reported is
  * taken from p->p_xsig and 'reason' is passed through as the SIGCHLD code.
  */
 void
 childproc_stopped(struct proc *p, int reason)
 {
 
 	childproc_jobstate(p, reason, p->p_xsig);
 }
 
 /*
  * Tell the parent that child p has been continued (CLD_CONTINUED, with
  * SIGCONT reported as the signal).
  */
 void
 childproc_continued(struct proc *p)
 {
 	childproc_jobstate(p, CLD_CONTINUED, SIGCONT);
 }
 
 /*
  * Send SIGCHLD to the parent of an exiting child.  The code/status pair is
  * derived from the child's exit information: CLD_DUMPED or CLD_KILLED with
  * the terminating signal when p_xsig says the child died from a signal,
  * otherwise CLD_EXITED with the exit code from p_xexit.
  */
 void
 childproc_exited(struct proc *p)
 {
 	int code, value;
 
 	if (!WCOREDUMP(p->p_xsig) && !WIFSIGNALED(p->p_xsig)) {
 		code = CLD_EXITED;
 		value = p->p_xexit;
 	} else {
 		code = WCOREDUMP(p->p_xsig) ? CLD_DUMPED : CLD_KILLED;
 		value = WTERMSIG(p->p_xsig);
 	}
 
 	/*
 	 * XXX avoid calling wakeup(p->p_pptr), the work is
 	 * done in exit1().
 	 */
 	sigparent(p, code, value);
 }
 
 /*
  * Bounds and default for the number of indexed ("%I") core files.
  * NUM_CORE_FILES may be overridden at build time; the compile-time
  * assertion keeps the default within [0, MAX_NUM_CORE_FILES].
  */
 #define	MAX_NUM_CORE_FILES 100000
 #ifndef NUM_CORE_FILES
 #define	NUM_CORE_FILES 5
 #endif
 CTASSERT(NUM_CORE_FILES >= 0 && NUM_CORE_FILES <= MAX_NUM_CORE_FILES);
 /* Current limit; adjustable at run time through the debug.ncores sysctl. */
 static int num_cores = NUM_CORE_FILES;
 
 /*
  * Handler for the debug.ncores sysctl.  Reads report the current value of
  * num_cores; writes are clamped to the range [0, MAX_NUM_CORE_FILES]
  * before being stored.
  */
 static int
 sysctl_debug_num_cores_check (SYSCTL_HANDLER_ARGS)
 {
 	int requested, rc;
 
 	requested = num_cores;
 	rc = sysctl_handle_int(oidp, &requested, 0, req);
 	/* Nothing more to do on failure or for a read-only request. */
 	if (rc != 0 || req->newptr == NULL)
 		return (rc);
 
 	if (requested < 0)
 		requested = 0;
 	else if (requested > MAX_NUM_CORE_FILES)
 		requested = MAX_NUM_CORE_FILES;
 	num_cores = requested;
 	return (0);
 }
 SYSCTL_PROC(_debug, OID_AUTO, ncores, CTLTYPE_INT|CTLFLAG_RW,
 	    0, sizeof(int), sysctl_debug_num_cores_check, "I",
 	    "Maximum number of generated process corefiles while using index format");
 
 /* File name suffixes appended to the core name for compressed dumps. */
 #define	GZIP_SUFFIX	".gz"
 #define	ZSTD_SUFFIX	".zst"
 
 /*
  * Selected core compression method; 0 means no compression.  Set through
  * the kern.compress_user_cores sysctl.
  */
 int compress_user_cores = 0;
 
 /*
  * Handler for the kern.compress_user_cores sysctl.  A new non-zero value
  * is accepted only if compressor_avail() reports the requested method as
  * available; otherwise the write fails with EINVAL.
  */
 static int
 sysctl_compress_user_cores(SYSCTL_HANDLER_ARGS)
 {
 	int method, rc;
 
 	method = compress_user_cores;
 	rc = sysctl_handle_int(oidp, &method, 0, req);
 	if (rc != 0 || req->newptr == NULL)
 		return (rc);
 
 	/* Zero (compression off) is always acceptable. */
 	if (method != 0 && !compressor_avail(method))
 		return (EINVAL);
 
 	compress_user_cores = method;
 	return (0);
 }
 SYSCTL_PROC(_kern, OID_AUTO, compress_user_cores, CTLTYPE_INT | CTLFLAG_RWTUN,
     0, sizeof(int), sysctl_compress_user_cores, "I",
     "Enable compression of user corefiles ("
     __XSTRING(COMPRESS_GZIP) " = gzip, "
     __XSTRING(COMPRESS_ZSTD) " = zstd)");
 
 /* Compression level used for core files; exported as a sysctl/tunable. */
 int compress_user_cores_level = 6;
 SYSCTL_INT(_kern, OID_AUTO, compress_user_cores_level, CTLFLAG_RWTUN,
     &compress_user_cores_level, 0,
     "Corefile compression level");
 
 /*
  * Protect the access to corefilename[] by allproc_lock.
  */
 #define	corefilename_lock	allproc_lock
 
 /*
  * Core file name template (see corefile_open() for the format specifiers).
  * The default is "%N.core"; it can be preset with the kern.corefile tunable.
  */
 static char corefilename[MAXPATHLEN] = {"%N.core"};
 TUNABLE_STR("kern.corefile", corefilename, sizeof(corefilename));
 
 /*
  * Handler for the kern.corefile sysctl: read or replace the core file name
  * template while holding corefilename_lock exclusively.
  */
 static int
 sysctl_kern_corefile(SYSCTL_HANDLER_ARGS)
 {
 	int rc;
 
 	sx_xlock(&corefilename_lock);
 	rc = sysctl_handle_string(oidp, corefilename, sizeof(corefilename),
 	    req);
 	sx_xunlock(&corefilename_lock);
 	return (rc);
 }
 SYSCTL_PROC(_kern, OID_AUTO, corefile, CTLTYPE_STRING | CTLFLAG_RW |
     CTLFLAG_MPSAFE, 0, 0, sysctl_kern_corefile, "A",
     "Process corefile name format string");
 
 /*
  * Unlock a vnode and then close it (it is closed as opened for writing,
  * using the thread's credentials).  The vn_close() return value is ignored.
  * NOTE(review): the name implies vp is locked on entry -- confirm at callers.
  */
 static void
 vnode_close_locked(struct thread *td, struct vnode *vp)
 {
 
 	VOP_UNLOCK(vp, 0);
 	vn_close(vp, FWRITE, td->td_ucred, td);
 }
 
 /*
  * If the core format has a %I in it, then we need to check
  * for existing corefiles before defining a name.
  * To do this we iterate over 0..ncores to find a
  * non-existing core file name to use. If all core files are
  * already used we choose the oldest one.
  */
 static int
 corefile_open_last(struct thread *td, char *name, int indexpos,
     int indexlen, int ncores, struct vnode **vpp)
 {
 	struct vnode *oldvp, *nextvp, *vp;
 	struct vattr vattr;
 	struct nameidata nd;
 	int error, i, flags, oflags, cmode;
 	char ch;
 	struct timespec lasttime;
 
 	nextvp = oldvp = NULL;
 	cmode = S_IRUSR | S_IWUSR;
 	oflags = VN_OPEN_NOAUDIT | VN_OPEN_NAMECACHE |
 	    (capmode_coredump ? VN_OPEN_NOCAPCHECK : 0);
 
 	for (i = 0; i < ncores; i++) {
 		flags = O_CREAT | FWRITE | O_NOFOLLOW;
 
 		ch = name[indexpos + indexlen];
 		(void)snprintf(name + indexpos, indexlen + 1, "%.*u", indexlen,
 		    i);
 		name[indexpos + indexlen] = ch;
 
 		NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name, td);
 		error = vn_open_cred(&nd, &flags, cmode, oflags, td->td_ucred,
 		    NULL);
 		if (error != 0)
 			break;
 
 		vp = nd.ni_vp;
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		if ((flags & O_CREAT) == O_CREAT) {
 			nextvp = vp;
 			break;
 		}
 
 		error = VOP_GETATTR(vp, &vattr, td->td_ucred);
 		if (error != 0) {
 			vnode_close_locked(td, vp);
 			break;
 		}
 
 		if (oldvp == NULL ||
 		    lasttime.tv_sec > vattr.va_mtime.tv_sec ||
 		    (lasttime.tv_sec == vattr.va_mtime.tv_sec &&
 		    lasttime.tv_nsec >= vattr.va_mtime.tv_nsec)) {
 			if (oldvp != NULL)
 				vnode_close_locked(td, oldvp);
 			oldvp = vp;
 			lasttime = vattr.va_mtime;
 		} else {
 			vnode_close_locked(td, vp);
 		}
 	}
 
 	if (oldvp != NULL) {
 		if (nextvp == NULL)
 			nextvp = oldvp;
 		else
 			vnode_close_locked(td, oldvp);
 	}
 	if (error != 0) {
 		if (nextvp != NULL)
 			vnode_close_locked(td, oldvp);
 	} else {
 		*vpp = nextvp;
 	}
 
 	return (error);
 }
 
 /*
  * corefile_open(comm, uid, pid, td, compress, vpp, namep)
  * Expand the name described in corefilename, using name, uid, and pid
  * and open/create core file.
  * corefilename is a printf-like string, with the following format
  * specifiers:
  *	%%	a literal '%'
  *	%H	host name, as returned by getcredhostname()
  *	%I	index of the core file, chosen by corefile_open_last()
  *		(only the first %I is expanded; later ones are emitted
  *		literally as "%I")
  *	%N	name of process ("name")
  *	%P	process id (pid)
  *	%U	user id (uid)
  * Unknown specifiers and a lone '%' at the end of the format are logged
  * and otherwise ignored.
  * For example, "%N.core" is the default; they can be disabled completely
  * by using "/dev/null", or all core files can be stored in "/cores/%U/%N-%P".
  * This is controlled by the sysctl variable kern.corefile (see above).
  *
  * When 'compress' selects gzip or zstd the matching suffix is appended.
  * Returns 0 with *vpp set to the opened vnode and *namep set to the
  * expanded path, which the caller must release with free(..., M_TEMP).
  * On failure an errno value is returned (ENOMEM if the expanded name does
  * not fit in MAXPATHLEN) and neither output is set.
  */
 static int
 corefile_open(const char *comm, uid_t uid, pid_t pid, struct thread *td,
     int compress, struct vnode **vpp, char **namep)
 {
 	struct sbuf sb;
 	struct nameidata nd;
 	const char *format;
 	char *hostname, *name;
 	int cmode, error, flags, i, indexpos, indexlen, oflags, ncores;
 
 	hostname = NULL;
 	format = corefilename;
 	name = malloc(MAXPATHLEN, M_TEMP, M_WAITOK | M_ZERO);
 	indexlen = 0;
 	indexpos = -1;
 	ncores = num_cores;
 	/* The sbuf writes straight into 'name' and cannot grow beyond it. */
 	(void)sbuf_new(&sb, name, MAXPATHLEN, SBUF_FIXEDLEN);
 	sx_slock(&corefilename_lock);
 	for (i = 0; format[i] != '\0'; i++) {
 		switch (format[i]) {
 		case '%':	/* Format character */
 			i++;
 			switch (format[i]) {
 			case '\0':
 				/*
 				 * Lone '%' at the end of the format.  Step
 				 * back so that the loop increment lands on
 				 * the terminator instead of running past it.
 				 */
 				log(LOG_ERR,
 				    "Incomplete format specifier at end of "
 				    "corename `%s'\n", format);
 				i--;
 				break;
 			case '%':
 				sbuf_putc(&sb, '%');
 				break;
 			case 'H':	/* hostname */
 				if (hostname == NULL) {
 					hostname = malloc(MAXHOSTNAMELEN,
 					    M_TEMP, M_WAITOK);
 				}
 				getcredhostname(td->td_ucred, hostname,
 				    MAXHOSTNAMELEN);
 				sbuf_printf(&sb, "%s", hostname);
 				break;
 			case 'I':	/* autoincrementing index */
 				if (indexpos != -1) {
 					sbuf_printf(&sb, "%%I");
 					break;
 				}
 
 				/*
 				 * Reserve room for the widest index; the
 				 * digits are filled in by
 				 * corefile_open_last().
 				 */
 				indexpos = sbuf_len(&sb);
 				sbuf_printf(&sb, "%u", ncores - 1);
 				indexlen = sbuf_len(&sb) - indexpos;
 				break;
 			case 'N':	/* process name */
 				sbuf_printf(&sb, "%s", comm);
 				break;
 			case 'P':	/* process id */
 				sbuf_printf(&sb, "%u", pid);
 				break;
 			case 'U':	/* user id */
 				sbuf_printf(&sb, "%u", uid);
 				break;
 			default:
 				log(LOG_ERR,
 				    "Unknown format character %c in "
 				    "corename `%s'\n", format[i], format);
 				break;
 			}
 			break;
 		default:
 			sbuf_putc(&sb, format[i]);
 			break;
 		}
 	}
 	sx_sunlock(&corefilename_lock);
 	free(hostname, M_TEMP);
 	if (compress == COMPRESS_GZIP)
 		sbuf_printf(&sb, GZIP_SUFFIX);
 	else if (compress == COMPRESS_ZSTD)
 		sbuf_printf(&sb, ZSTD_SUFFIX);
 	/* Any overflow of the fixed-length buffer shows up here. */
 	if (sbuf_error(&sb) != 0) {
 		log(LOG_ERR, "pid %ld (%s), uid (%lu): corename is too "
 		    "long\n", (long)pid, comm, (u_long)uid);
 		sbuf_delete(&sb);
 		free(name, M_TEMP);
 		return (ENOMEM);
 	}
 	sbuf_finish(&sb);
 	sbuf_delete(&sb);
 
 	if (indexpos != -1) {
 		/* Indexed name: probe the slots for a free or oldest file. */
 		error = corefile_open_last(td, name, indexpos, indexlen, ncores,
 		    vpp);
 		if (error != 0) {
 			log(LOG_ERR,
 			    "pid %d (%s), uid (%u):  Path `%s' failed "
 			    "on initial open test, error = %d\n",
 			    pid, comm, uid, name, error);
 		}
 	} else {
 		cmode = S_IRUSR | S_IWUSR;
 		oflags = VN_OPEN_NOAUDIT | VN_OPEN_NAMECACHE |
 		    (capmode_coredump ? VN_OPEN_NOCAPCHECK : 0);
 		flags = O_CREAT | FWRITE | O_NOFOLLOW;
 
 		NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name, td);
 		error = vn_open_cred(&nd, &flags, cmode, oflags, td->td_ucred,
 		    NULL);
 		if (error == 0) {
 			*vpp = nd.ni_vp;
 			NDFREE(&nd, NDF_ONLY_PNBUF);
 		}
 	}
 
 	if (error != 0) {
 #ifdef AUDIT
 		audit_proc_coredump(td, name, error);
 #endif
 		free(name, M_TEMP);
 		return (error);
 	}
 	*namep = name;
 	return (0);
 }
 
 /*
  * Dump a process' core.  The main routine does some
  * policy checking, and creates the name of the coredump;
  * then it passes on a vnode and a size limit to the process-specific
  * coredump routine if there is one; if there _is not_ one, it returns
  * ENOSYS; otherwise it returns the error from the process-specific routine.
  *
  * Called with the proc lock held; the lock is released on every return
  * path.  Other return values are EFAULT when dumping is not permitted or
  * the target file is unsuitable, EFBIG when the core size limit is zero,
  * and any error from opening or closing the core file.
  */
 
 static int
 coredump(struct thread *td)
 {
 	struct proc *p = td->td_proc;
 	struct ucred *cred = td->td_ucred;
 	struct vnode *vp;
 	struct flock lf;
 	struct vattr vattr;
 	int error, error1, locked;
 	char *name;			/* name of corefile */
 	void *rl_cookie;
 	off_t limit;
 	char *fullpath, *freepath = NULL;
 	struct sbuf *sb;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	MPASS((p->p_flag & P_HADTHREADS) == 0 || p->p_singlethread == td);
 	_STOPEVENT(p, S_CORE, 0);
 
 	/*
 	 * Refuse when core dumps are disabled globally, when the process is
 	 * flagged P_SUGID and sugid dumps are not enabled, or when the
 	 * process is marked P2_NOTRACE.
 	 */
 	if (!do_coredump || (!sugid_coredump && (p->p_flag & P_SUGID) != 0) ||
 	    (p->p_flag2 & P2_NOTRACE) != 0) {
 		PROC_UNLOCK(p);
 		return (EFAULT);
 	}
 
 	/*
 	 * Note that the bulk of limit checking is done after
 	 * the corefile is created.  The exception is if the limit
 	 * for corefiles is 0, in which case we don't bother
 	 * creating the corefile at all.  This layout means that
 	 * a corefile is truncated instead of not being created,
 	 * if it is larger than the limit.
 	 */
 	limit = (off_t)lim_cur(td, RLIMIT_CORE);
 	if (limit == 0 || racct_get_available(p, RACCT_CORE) == 0) {
 		PROC_UNLOCK(p);
 		return (EFBIG);
 	}
 	PROC_UNLOCK(p);
 
 	/* On success 'name' is allocated and must be freed below. */
 	error = corefile_open(p->p_comm, cred->cr_uid, p->p_pid, td,
 	    compress_user_cores, &vp, &name);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Don't dump to non-regular files or files with links.
 	 * Do not dump into system files.
 	 */
 	if (vp->v_type != VREG || VOP_GETATTR(vp, &vattr, cred) != 0 ||
 	    vattr.va_nlink != 1 || (vp->v_vflag & VV_SYSTEM) != 0) {
 		VOP_UNLOCK(vp, 0);
 		error = EFAULT;
 		goto out;
 	}
 
 	VOP_UNLOCK(vp, 0);
 
 	/* Postpone other writers, including core dumps of other processes. */
 	rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
 
 	/*
 	 * Try to take a whole-file advisory write lock as well; failure to
 	 * get it is tolerated and only remembered so it is not released.
 	 */
 	lf.l_whence = SEEK_SET;
 	lf.l_start = 0;
 	lf.l_len = 0;
 	lf.l_type = F_WRLCK;
 	locked = (VOP_ADVLOCK(vp, (caddr_t)p, F_SETLK, &lf, F_FLOCK) == 0);
 
 	/* Truncate the file and, if configured, flag it UF_NODUMP. */
 	VATTR_NULL(&vattr);
 	vattr.va_size = 0;
 	if (set_core_nodump_flag)
 		vattr.va_flags = UF_NODUMP;
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	VOP_SETATTR(vp, &vattr, cred);
 	VOP_UNLOCK(vp, 0);
 	/* Note in the accounting flags that a core dump was attempted. */
 	PROC_LOCK(p);
 	p->p_acflag |= ACORE;
 	PROC_UNLOCK(p);
 
 	/* Hand the actual dump over to the ABI-specific routine, if any. */
 	if (p->p_sysent->sv_coredump != NULL) {
 		error = p->p_sysent->sv_coredump(td, vp, limit, 0);
 	} else {
 		error = ENOSYS;
 	}
 
 	if (locked) {
 		lf.l_type = F_UNLCK;
 		VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK);
 	}
 	vn_rangelock_unlock(vp, rl_cookie);
 
 	/*
 	 * Notify the userland helper that a process triggered a core dump.
 	 * This allows the helper to run an automated debugging session.
 	 */
 	if (error != 0 || coredump_devctl == 0)
 		goto out;
 	sb = sbuf_new_auto();
 	/* comm= carries the full path of the executable that dumped core. */
 	if (vn_fullpath_global(td, p->p_textvp, &fullpath, &freepath) != 0)
 		goto out2;
 	sbuf_printf(sb, "comm=\"");
 	devctl_safe_quote_sb(sb, fullpath);
 	free(freepath, M_TEMP);
 	sbuf_printf(sb, "\" core=\"");
 
 	/*
 	 * We can't lookup core file vp directly. When we're replacing a core, and
 	 * other random times, we flush the name cache, so it will fail. Instead,
 	 * if the path of the core is relative, add the current dir in front of it.
 	 */
 	if (name[0] != '/') {
 		fullpath = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
 		if (kern___getcwd(td, fullpath, UIO_SYSSPACE, MAXPATHLEN, MAXPATHLEN) != 0) {
 			free(fullpath, M_TEMP);
 			goto out2;
 		}
 		devctl_safe_quote_sb(sb, fullpath);
 		free(fullpath, M_TEMP);
 		sbuf_putc(sb, '/');
 	}
 	devctl_safe_quote_sb(sb, name);
 	sbuf_printf(sb, "\"");
 	/* The notification is sent only if the message was built intact. */
 	if (sbuf_finish(sb) == 0)
 		devctl_notify("kernel", "signal", "coredump", sbuf_data(sb));
 out2:
 	sbuf_delete(sb);
 out:
 	/* A close failure is reported only if nothing failed earlier. */
 	error1 = vn_close(vp, FWRITE, cred, td);
 	if (error == 0)
 		error = error1;
 #ifdef AUDIT
 	audit_proc_coredump(td, name, error);
 #endif
 	free(name, M_TEMP);
 	return (error);
 }
 
 /*
  * Nonexistent system call-- signal process (may want to handle it).  Flag
  * error in case process won't see signal immediately (blocked or ignored).
  */
 #ifndef _SYS_SYSPROTO_H_
 struct nosys_args {
 	int	dummy;
 };
 #endif
 /* ARGSUSED */
 /*
  * Handler for nonexistent system calls: post SIGSYS to the calling thread
  * and fail with ENOSYS.  Depending on kern_lognosys the event is also
  * reported through uprintf() (1), through printf() (2), or both (3).
  */
 int
 nosys(struct thread *td, struct nosys_args *args)
 {
 	struct proc *p = td->td_proc;
 
 	PROC_LOCK(p);
 	tdsignal(td, SIGSYS);
 	PROC_UNLOCK(p);
 
 	switch (kern_lognosys) {
 	case 1:
 		uprintf("pid %d comm %s: nosys %d\n", p->p_pid, p->p_comm,
 		    td->td_sa.code);
 		break;
 	case 2:
 		printf("pid %d comm %s: nosys %d\n", p->p_pid, p->p_comm,
 		    td->td_sa.code);
 		break;
 	case 3:
 		uprintf("pid %d comm %s: nosys %d\n", p->p_pid, p->p_comm,
 		    td->td_sa.code);
 		printf("pid %d comm %s: nosys %d\n", p->p_pid, p->p_comm,
 		    td->td_sa.code);
 		break;
 	default:
 		break;
 	}
 	return (ENOSYS);
 }
 
 /*
  * Send a SIGIO or SIGURG signal to a process or process group using stored
  * credentials rather than those of the current process.
  *
  * *sigiop is sampled under the SIGIO lock; nothing happens if it is NULL.
  * A positive sio_pgid addresses a single process, a negative one a process
  * group.  Each target is signalled only if CANSIGIO() allows it for the
  * stored credentials.  For process groups the member must also be in state
  * PRS_NORMAL and, when 'checkctty' is non-zero, have P_CONTROLT set.
  */
 void
 pgsigio(struct sigio **sigiop, int sig, int checkctty)
 {
 	struct sigio *sigio;
 
 	SIGIO_LOCK();
 	sigio = *sigiop;
 	if (sigio == NULL) {
 		SIGIO_UNLOCK();
 		return;
 	}
 	if (sigio->sio_pgid > 0) {
 		/* Single-process target. */
 		PROC_LOCK(sigio->sio_proc);
 		if (CANSIGIO(sigio->sio_ucred, sigio->sio_proc->p_ucred))
 			kern_psignal(sigio->sio_proc, sig);
 		PROC_UNLOCK(sigio->sio_proc);
 	} else if (sigio->sio_pgid < 0) {
 		struct proc *p;
 
 		/* Process-group target: visit every member of the group. */
 		PGRP_LOCK(sigio->sio_pgrp);
 		LIST_FOREACH(p, &sigio->sio_pgrp->pg_members, p_pglist) {
 			PROC_LOCK(p);
 			if (p->p_state == PRS_NORMAL &&
 			    CANSIGIO(sigio->sio_ucred, p->p_ucred) &&
 			    (checkctty == 0 || (p->p_flag & P_CONTROLT)))
 				kern_psignal(p, sig);
 			PROC_UNLOCK(p);
 		}
 		PGRP_UNLOCK(sigio->sio_pgrp);
 	}
 	SIGIO_UNLOCK();
 }
 
 /*
  * Attach a signal knote to the current process: remember the process in
  * the knote, force EV_CLEAR and add the knote to the process' knote list.
  * Always returns 0.
  */
 static int
 filt_sigattach(struct knote *kn)
 {
 	struct proc *self;
 
 	self = curproc;
 	kn->kn_flags |= EV_CLEAR;		/* automatically set */
 	kn->kn_ptr.p_proc = self;
 	knlist_add(self->p_klist, kn, 0);
 	return (0);
 }
 
 /*
  * Detach a signal knote: remove it from the knote list of the process that
  * was recorded in the knote.
  */
 static void
 filt_sigdetach(struct knote *kn)
 {
 
 	knlist_remove(kn->kn_ptr.p_proc->p_klist, kn, 0);
 }
 
 /*
  * Event filter for signal knotes.  Signal knotes live on the same list as
  * proc knotes, so signal hints carry the NOTE_SIGNAL bit to tell them
  * apart from process hints.  (A dedicated signal knote list would avoid
  * the mask, but is probably not worth the trouble.)
  *
  * A hint with NOTE_SIGNAL set whose remaining bits equal the knote's id
  * bumps the knote's counter.  The knote is reported active whenever the
  * counter is non-zero.
  */
 static int
 filt_signal(struct knote *kn, long hint)
 {
 
 	if ((hint & NOTE_SIGNAL) != 0 &&
 	    kn->kn_id == (hint & ~NOTE_SIGNAL))
 		kn->kn_data++;
 	return (kn->kn_data != 0);
 }
 
 /*
  * Allocate a zeroed sigacts structure holding a single reference and an
  * initialized mutex.  The allocation may sleep (M_WAITOK).
  */
 struct sigacts *
 sigacts_alloc(void)
 {
 	struct sigacts *acts;
 
 	acts = malloc(sizeof(*acts), M_SUBPROC, M_WAITOK | M_ZERO);
 	refcount_init(&acts->ps_refcnt, 1);
 	mtx_init(&acts->ps_mtx, "sigacts", NULL, MTX_DEF);
 	return (acts);
 }
 
 /*
  * Drop one reference to a sigacts structure; when refcount_release()
  * reports that this was the final reference, destroy the mutex and free
  * the memory.
  */
 void
 sigacts_free(struct sigacts *ps)
 {
 
 	if (refcount_release(&ps->ps_refcnt) != 0) {
 		mtx_destroy(&ps->ps_mtx);
 		free(ps, M_SUBPROC);
 	}
 }
 
 /*
  * Take an additional reference on a sigacts structure and return the same
  * pointer for the caller's convenience.
  */
 struct sigacts *
 sigacts_hold(struct sigacts *ps)
 {
 
 	refcount_acquire(&ps->ps_refcnt);
 	return (ps);
 }
 
 /*
  * Copy the signal state of 'src' into 'dest'.  'dest' must not be shared
  * (exactly one reference).  Only the part of the structure that precedes
  * ps_refcnt is copied, so the destination's reference count and whatever
  * follows it are left alone.  The copy is made under src's mutex.
  */
 void
 sigacts_copy(struct sigacts *dest, struct sigacts *src)
 {
 
 	KASSERT(dest->ps_refcnt == 1, ("sigacts_copy to shared dest"));
 	mtx_lock(&src->ps_mtx);
 	bcopy(src, dest, offsetof(struct sigacts, ps_refcnt));
 	mtx_unlock(&src->ps_mtx);
 }
 
 /*
  * Return non-zero if the sigacts structure has more than one reference.
  * The count is read without taking any lock here.
  */
 int
 sigacts_shared(struct sigacts *ps)
 {
 
 	return (ps->ps_refcnt > 1);
 }
Index: head/sys/kern/kern_tc.c
===================================================================
--- head/sys/kern/kern_tc.c	(revision 336913)
+++ head/sys/kern/kern_tc.c	(revision 336914)
@@ -1,2202 +1,2202 @@
 /*-
  * SPDX-License-Identifier: Beerware
  *
  * ----------------------------------------------------------------------------
  * "THE BEER-WARE LICENSE" (Revision 42):
  * <phk@FreeBSD.ORG> wrote this file.  As long as you retain this notice you
  * can do whatever you want with this stuff. If we meet some day, and you think
  * this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
  * ----------------------------------------------------------------------------
  *
  * Copyright (c) 2011, 2015, 2016 The FreeBSD Foundation
  * All rights reserved.
  *
  * Portions of this software were developed by Julien Ridoux at the University
  * of Melbourne under sponsorship from the FreeBSD Foundation.
  *
  * Portions of this software were developed by Konstantin Belousov
  * under sponsorship from the FreeBSD Foundation.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ntp.h"
 #include "opt_ffclock.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/sbuf.h>
 #include <sys/sleepqueue.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/systm.h>
 #include <sys/timeffc.h>
 #include <sys/timepps.h>
 #include <sys/timetc.h>
 #include <sys/timex.h>
 #include <sys/vdso.h>
 
 /*
  * A large step happens on boot.  This constant detects such steps.
  * It is relatively small so that ntp_update_second gets called enough
  * in the typical 'missed a couple of seconds' case, but doesn't loop
  * forever when the time step is large.
  */
 #define LARGE_STEP	200
 
 /*
  * Implement a dummy timecounter which we can use until we get a real one
  * in the air.  This allows the console and other early stuff to use
  * time services.
  */
 
 /*
  * Counter read routine of the dummy timecounter: every call advances a
  * private static counter by one and returns the new value.
  */
 static u_int
 dummy_get_timecount(struct timecounter *tc)
 {
 	static u_int ticks_seen;
 
 	ticks_seen++;
 	return (ticks_seen);
 }
 
 /*
  * The dummy timecounter itself.  The initializer is positional, so which
  * member receives which value depends on the layout of struct timecounter,
  * declared elsewhere.
  * NOTE(review): presumably read routine, PPS poll routine (none), counter
  * mask, frequency, name and quality, in that order -- confirm against
  * <sys/timetc.h>.
  */
 static struct timecounter dummy_timecounter = {
 	dummy_get_timecount, 0, ~0u, 1000000, "dummy", -1000000
 };
 
 /*
  * One snapshot of the timekeeping state.  Snapshots are linked into a ring
  * through th_next; readers in this file copy fields out of the snapshot
  * that the global 'timehands' pointer refers to and use th_generation to
  * detect a concurrent update (they retry while it is zero or has changed).
  */
 struct timehands {
 	/* These fields must be initialized by the driver. */
 	struct timecounter	*th_counter;	/* counter read by tc_delta() */
 	int64_t			th_adjustment;
 	uint64_t		th_scale;	/* multiplied by the counter delta */
 	u_int	 		th_offset_count; /* counter value the delta is taken from */
 	struct bintime		th_offset;	/* base of the *uptime() readers */
 	struct bintime		th_bintime;	/* base of the *bintime() readers */
 	struct timeval		th_microtime;	/* returned by the getmicrotime reader */
 	struct timespec		th_nanotime;	/* returned by the getnanotime reader */
 	struct bintime		th_boottime;
 	/* Fields not to be copied in tc_windup start with th_generation. */
 	u_int			th_generation;
 	struct timehands	*th_next;
 };
 
 /*
  * The two statically allocated timehands, linked into a two-element ring
  * (th0 -> th1 -> th0).  th0 is declared first so that th1 can point at it.
  * th0 starts out on the dummy timecounter with generation 1 and an offset
  * of one second; th1 is otherwise zero-initialized, so its generation is 0.
  */
 static struct timehands th0;
 static struct timehands th1 = {
 	.th_next = &th0
 };
 static struct timehands th0 = {
 	.th_counter = &dummy_timecounter,
 	.th_scale = (uint64_t)-1 / 1000000,
 	.th_offset = { .sec = 1 },
 	.th_generation = 1,
 	.th_next = &th1
 };
 
 /* Snapshot currently used by the time readers; starts at th0. */
 static struct timehands *volatile timehands = &th0;
 /* Both start out referring to the dummy timecounter. */
 struct timecounter *timecounter = &dummy_timecounter;
 static struct timecounter *timecounters = &dummy_timecounter;
 
 int tc_min_ticktock_freq = 1;
 
 /* Coarse second counters, both initialized to 1. */
 volatile time_t time_second = 1;
 volatile time_t time_uptime = 1;
 
 /* kern.boottime: read-only, served by sysctl_kern_boottime(). */
 static int sysctl_kern_boottime(SYSCTL_HANDLER_ARGS);
 SYSCTL_PROC(_kern, KERN_BOOTTIME, boottime, CTLTYPE_STRUCT|CTLFLAG_RD,
     NULL, 0, sysctl_kern_boottime, "S,timeval", "System boottime");
 
 /* Parent nodes kern.timecounter and kern.timecounter.tc. */
 SYSCTL_NODE(_kern, OID_AUTO, timecounter, CTLFLAG_RW, 0, "");
 static SYSCTL_NODE(_kern_timecounter, OID_AUTO, tc, CTLFLAG_RW, 0, "");
 
 static int timestepwarnings;
 SYSCTL_INT(_kern_timecounter, OID_AUTO, stepwarnings, CTLFLAG_RW,
     &timestepwarnings, 0, "Log time steps");
 
 /*
  * Precision-related values.
  * NOTE(review): presumably all derived from tc_timepercentage by the
  * alloweddeviation handler below -- that code is outside this chunk.
  */
 struct bintime bt_timethreshold;
 struct bintime bt_tickthreshold;
 sbintime_t sbt_timethreshold;
 sbintime_t sbt_tickthreshold;
 struct bintime tc_tick_bt;
 sbintime_t tc_tick_sbt;
 int tc_precexp;
 int tc_timepercentage = TC_DEFAULTPERC;
 static int sysctl_kern_timecounter_adjprecision(SYSCTL_HANDLER_ARGS);
 SYSCTL_PROC(_kern_timecounter, OID_AUTO, alloweddeviation,
     CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, 0,
     sysctl_kern_timecounter_adjprecision, "I",
     "Allowed time interval deviation in percents");
 
 volatile int rtc_generation = 1;
 
 static int tc_chosen;	/* Non-zero if a specific tc was chosen via sysctl. */
 
 /* Forward declarations of routines defined later in the file. */
 static void tc_windup(struct bintime *new_boottimebin);
 static void cpu_tick_calibrate(int);
 
 void dtrace_getnanotime(struct timespec *tsp);
 
 /*
  * Handler for the kern.boottime sysctl: copy out the boot time obtained
  * from getboottime() as a struct timeval.  Where SCTL_MASK32 is defined
  * (and the platform is not mips), requests flagged SCTL_MASK32 receive the
  * seconds and microseconds as two ints instead of the native structure.
  */
 static int
 sysctl_kern_boottime(SYSCTL_HANDLER_ARGS)
 {
 	struct timeval boottime;
 
 	getboottime(&boottime);
 
 #ifndef __mips__
 #ifdef SCTL_MASK32
 	int tv[2];
 
 	if (req->flags & SCTL_MASK32) {
 		/* Narrow both fields to int for the 32-bit layout. */
 		tv[0] = boottime.tv_sec;
 		tv[1] = boottime.tv_usec;
 		return (SYSCTL_OUT(req, tv, sizeof(tv)));
 	}
 #endif
 #endif
 	return (SYSCTL_OUT(req, &boottime, sizeof(boottime)));
 }
 
 /*
  * Sysctl handler that reports the current raw count of the timecounter
  * passed in arg1, obtained by calling its tc_get_timecount routine.
  */
 static int
 sysctl_kern_timecounter_get(SYSCTL_HANDLER_ARGS)
 {
 	struct timecounter *counter;
 	u_int reading;
 
 	counter = arg1;
 	reading = counter->tc_get_timecount(counter);
 	return (sysctl_handle_int(oidp, &reading, 0, req));
 }
 
 static int
 sysctl_kern_timecounter_freq(SYSCTL_HANDLER_ARGS)
 {
 	uint64_t freq;
 	struct timecounter *tc = arg1;
 
 	freq = tc->tc_frequency;
 	return (sysctl_handle_64(oidp, &freq, 0, req));
 }
 
 /*
  * Return the difference between the timehands' counter value now and what
  * was when we copied it to the timehands' offset_count.
  */
 static __inline u_int
 tc_delta(struct timehands *th)
 {
 	struct timecounter *tc;
 
 	tc = th->th_counter;
 	return ((tc->tc_get_timecount(tc) - th->th_offset_count) &
 	    tc->tc_counter_mask);
 }
 
 /*
  * Functions for reading the time.  We have to loop until we are sure that
  * the timehands that we operated on was not updated under our feet.  See
  * the comment in <sys/time.h> for a description of these 12 functions.
  */
 
 #ifdef FFCLOCK
 /*
  * Store in *bt the timehands offset plus the scaled counter delta since
  * that offset was recorded.
  */
 void
 fbclock_binuptime(struct bintime *bt)
 {
 	struct timehands *hands;
 	unsigned int seen;
 
 	/*
 	 * Retry until the generation read before the copy is non-zero and
 	 * still the same after it.
 	 */
 	for (;;) {
 		hands = timehands;
 		seen = atomic_load_acq_int(&hands->th_generation);
 		*bt = hands->th_offset;
 		bintime_addx(bt, hands->th_scale * tc_delta(hands));
 		atomic_thread_fence_acq();
 		if (seen != 0 && seen == hands->th_generation)
 			break;
 	}
 }
 
 void
 fbclock_nanouptime(struct timespec *tsp)
 {
 	struct bintime bt;
 
 	fbclock_binuptime(&bt);
 	bintime2timespec(&bt, tsp);
 }
 
 void
 fbclock_microuptime(struct timeval *tvp)
 {
 	struct bintime bt;
 
 	fbclock_binuptime(&bt);
 	bintime2timeval(&bt, tvp);
 }
 
 /*
  * Store in *bt the timehands' th_bintime plus the scaled counter delta
  * since the snapshot was taken.
  */
 void
 fbclock_bintime(struct bintime *bt)
 {
 	struct timehands *hands;
 	unsigned int seen;
 
 	/*
 	 * Retry until the generation read before the copy is non-zero and
 	 * still the same after it.
 	 */
 	for (;;) {
 		hands = timehands;
 		seen = atomic_load_acq_int(&hands->th_generation);
 		*bt = hands->th_bintime;
 		bintime_addx(bt, hands->th_scale * tc_delta(hands));
 		atomic_thread_fence_acq();
 		if (seen != 0 && seen == hands->th_generation)
 			break;
 	}
 }
 
 void
 fbclock_nanotime(struct timespec *tsp)
 {
 	struct bintime bt;
 
 	fbclock_bintime(&bt);
 	bintime2timespec(&bt, tsp);
 }
 
 void
 fbclock_microtime(struct timeval *tvp)
 {
 	struct bintime bt;
 
 	fbclock_bintime(&bt);
 	bintime2timeval(&bt, tvp);
 }
 
 /*
  * Feedback clock: copy the th_offset recorded at the last windup into *bt.
  * The hardware counter is not read.
  */
 void
 fbclock_getbinuptime(struct bintime *bt)
 {
 	struct timehands *th;
 	unsigned int gen;
 
 	do {
 		th = timehands;
 		gen = atomic_load_acq_int(&th->th_generation);
 		*bt = th->th_offset;
 		atomic_thread_fence_acq();
 		/* Retry if th was being updated (0) or was reused meanwhile. */
 	} while (gen == 0 || gen != th->th_generation);
 }
 
 /*
  * Feedback clock: convert the th_offset recorded at the last windup to a
  * timespec.  The hardware counter is not read.
  */
 void
 fbclock_getnanouptime(struct timespec *tsp)
 {
 	struct timehands *th;
 	unsigned int gen;
 
 	do {
 		th = timehands;
 		gen = atomic_load_acq_int(&th->th_generation);
 		bintime2timespec(&th->th_offset, tsp);
 		atomic_thread_fence_acq();
 		/* Retry if th was being updated (0) or was reused meanwhile. */
 	} while (gen == 0 || gen != th->th_generation);
 }
 
 /*
  * Feedback clock: convert the th_offset recorded at the last windup to a
  * timeval.  The hardware counter is not read.
  */
 void
 fbclock_getmicrouptime(struct timeval *tvp)
 {
 	struct timehands *th;
 	unsigned int gen;
 
 	do {
 		th = timehands;
 		gen = atomic_load_acq_int(&th->th_generation);
 		bintime2timeval(&th->th_offset, tvp);
 		atomic_thread_fence_acq();
 		/* Retry if th was being updated (0) or was reused meanwhile. */
 	} while (gen == 0 || gen != th->th_generation);
 }
 
 /*
  * Feedback clock: copy the th_bintime timestamp recorded at the last windup
  * into *bt.  The hardware counter is not read.
  */
 void
 fbclock_getbintime(struct bintime *bt)
 {
 	struct timehands *th;
 	unsigned int gen;
 
 	do {
 		th = timehands;
 		gen = atomic_load_acq_int(&th->th_generation);
 		*bt = th->th_bintime;
 		atomic_thread_fence_acq();
 		/* Retry if th was being updated (0) or was reused meanwhile. */
 	} while (gen == 0 || gen != th->th_generation);
 }
 
 /*
  * Feedback clock: copy the th_nanotime timestamp recorded at the last windup
  * into *tsp.  The hardware counter is not read.
  */
 void
 fbclock_getnanotime(struct timespec *tsp)
 {
 	struct timehands *th;
 	unsigned int gen;
 
 	do {
 		th = timehands;
 		gen = atomic_load_acq_int(&th->th_generation);
 		*tsp = th->th_nanotime;
 		atomic_thread_fence_acq();
 		/* Retry if th was being updated (0) or was reused meanwhile. */
 	} while (gen == 0 || gen != th->th_generation);
 }
 
 /*
  * Feedback clock: copy the th_microtime timestamp recorded at the last windup
  * into *tvp.  The hardware counter is not read.
  */
 void
 fbclock_getmicrotime(struct timeval *tvp)
 {
 	struct timehands *th;
 	unsigned int gen;
 
 	do {
 		th = timehands;
 		gen = atomic_load_acq_int(&th->th_generation);
 		*tvp = th->th_microtime;
 		atomic_thread_fence_acq();
 		/* Retry if th was being updated (0) or was reused meanwhile. */
 	} while (gen == 0 || gen != th->th_generation);
 }
 #else /* !FFCLOCK */
 /*
  * Store the time since boot in *bt, including the counter ticks elapsed
  * since the current timehands was set up.
  */
 void
 binuptime(struct bintime *bt)
 {
 	struct timehands *th;
 	u_int gen;
 
 	do {
 		th = timehands;
 		/* Sample the generation before reading any timehands field. */
 		gen = atomic_load_acq_int(&th->th_generation);
 		*bt = th->th_offset;
 		bintime_addx(bt, th->th_scale * tc_delta(th));
 		atomic_thread_fence_acq();
 		/* Retry if th was being updated (0) or was reused meanwhile. */
 	} while (gen == 0 || gen != th->th_generation);
 }
 
 void
 nanouptime(struct timespec *tsp)
 {
 	struct bintime bt;
 
 	binuptime(&bt);
 	bintime2timespec(&bt, tsp);
 }
 
 void
 microuptime(struct timeval *tvp)
 {
 	struct bintime bt;
 
 	binuptime(&bt);
 	bintime2timeval(&bt, tvp);
 }
 
 /*
  * Store th_bintime (the UTC timestamp of the last windup) plus the counter
  * ticks elapsed since then in *bt.
  */
 void
 bintime(struct bintime *bt)
 {
 	struct timehands *th;
 	u_int gen;
 
 	do {
 		th = timehands;
 		/* Sample the generation before reading any timehands field. */
 		gen = atomic_load_acq_int(&th->th_generation);
 		*bt = th->th_bintime;
 		bintime_addx(bt, th->th_scale * tc_delta(th));
 		atomic_thread_fence_acq();
 		/* Retry if th was being updated (0) or was reused meanwhile. */
 	} while (gen == 0 || gen != th->th_generation);
 }
 
 void
 nanotime(struct timespec *tsp)
 {
 	struct bintime bt;
 
 	bintime(&bt);
 	bintime2timespec(&bt, tsp);
 }
 
 void
 microtime(struct timeval *tvp)
 {
 	struct bintime bt;
 
 	bintime(&bt);
 	bintime2timeval(&bt, tvp);
 }
 
 /*
  * Copy the th_offset recorded at the last windup into *bt.  The hardware
  * counter is not read.
  */
 void
 getbinuptime(struct bintime *bt)
 {
 	struct timehands *th;
 	u_int gen;
 
 	do {
 		th = timehands;
 		gen = atomic_load_acq_int(&th->th_generation);
 		*bt = th->th_offset;
 		atomic_thread_fence_acq();
 		/* Retry if th was being updated (0) or was reused meanwhile. */
 	} while (gen == 0 || gen != th->th_generation);
 }
 
 /*
  * Convert the th_offset recorded at the last windup to a timespec.  The
  * hardware counter is not read.
  */
 void
 getnanouptime(struct timespec *tsp)
 {
 	struct timehands *th;
 	u_int gen;
 
 	do {
 		th = timehands;
 		gen = atomic_load_acq_int(&th->th_generation);
 		bintime2timespec(&th->th_offset, tsp);
 		atomic_thread_fence_acq();
 		/* Retry if th was being updated (0) or was reused meanwhile. */
 	} while (gen == 0 || gen != th->th_generation);
 }
 
 /*
  * Convert the th_offset recorded at the last windup to a timeval.  The
  * hardware counter is not read.
  */
 void
 getmicrouptime(struct timeval *tvp)
 {
 	struct timehands *th;
 	u_int gen;
 
 	do {
 		th = timehands;
 		gen = atomic_load_acq_int(&th->th_generation);
 		bintime2timeval(&th->th_offset, tvp);
 		atomic_thread_fence_acq();
 		/* Retry if th was being updated (0) or was reused meanwhile. */
 	} while (gen == 0 || gen != th->th_generation);
 }
 
 /*
  * Copy the th_bintime timestamp recorded at the last windup into *bt.  The
  * hardware counter is not read.
  */
 void
 getbintime(struct bintime *bt)
 {
 	struct timehands *th;
 	u_int gen;
 
 	do {
 		th = timehands;
 		gen = atomic_load_acq_int(&th->th_generation);
 		*bt = th->th_bintime;
 		atomic_thread_fence_acq();
 		/* Retry if th was being updated (0) or was reused meanwhile. */
 	} while (gen == 0 || gen != th->th_generation);
 }
 
 /*
  * Copy the th_nanotime timestamp recorded at the last windup into *tsp.  The
  * hardware counter is not read.
  */
 void
 getnanotime(struct timespec *tsp)
 {
 	struct timehands *th;
 	u_int gen;
 
 	do {
 		th = timehands;
 		gen = atomic_load_acq_int(&th->th_generation);
 		*tsp = th->th_nanotime;
 		atomic_thread_fence_acq();
 		/* Retry if th was being updated (0) or was reused meanwhile. */
 	} while (gen == 0 || gen != th->th_generation);
 }
 
 /*
  * Copy the th_microtime timestamp recorded at the last windup into *tvp.  The
  * hardware counter is not read.
  */
 void
 getmicrotime(struct timeval *tvp)
 {
 	struct timehands *th;
 	u_int gen;
 
 	do {
 		th = timehands;
 		gen = atomic_load_acq_int(&th->th_generation);
 		*tvp = th->th_microtime;
 		atomic_thread_fence_acq();
 		/* Retry if th was being updated (0) or was reused meanwhile. */
 	} while (gen == 0 || gen != th->th_generation);
 }
 #endif /* FFCLOCK */
 
 void
 getboottime(struct timeval *boottime)
 {
 	struct bintime boottimebin;
 
 	getboottimebin(&boottimebin);
 	bintime2timeval(&boottimebin, boottime);
 }
 
 /*
  * Copy the th_boottime value of the current timehands into *boottimebin,
  * using the same generation check as the other timehands readers.
  */
 void
 getboottimebin(struct bintime *boottimebin)
 {
 	struct timehands *th;
 	u_int gen;
 
 	do {
 		th = timehands;
 		gen = atomic_load_acq_int(&th->th_generation);
 		*boottimebin = th->th_boottime;
 		atomic_thread_fence_acq();
 		/* Retry if th was being updated (0) or was reused meanwhile. */
 	} while (gen == 0 || gen != th->th_generation);
 }
 
 #ifdef FFCLOCK
 /*
  * Support for feed-forward synchronization algorithms. This is heavily inspired
  * by the timehands mechanism but kept independent from it. *_windup() functions
  * have some connection to avoid accessing the timecounter hardware more than
  * necessary.
  */
 
 /* Feed-forward clock estimates kept updated by the synchronization daemon. */
 struct ffclock_estimate ffclock_estimate;
 struct bintime ffclock_boottime;	/* Feed-forward boot time estimate. */
 uint32_t ffclock_status;		/* Feed-forward clock status. */
 /*
  * Positive while ffclock_estimate holds data that ffclock_windup() has not
  * yet made current; ffclock_reset_clock() sets it to INT8_MAX and
  * ffclock_windup() clears it once the estimates have been consumed.
  */
 int8_t ffclock_updated;			/* New estimates are available. */
 struct mtx ffclock_mtx;			/* Mutex on ffclock_estimate. */
 
 /*
  * Per-tick feed-forward clock state.  Entries are linked into a ring via
  * 'next'; writers zero 'gen' while filling an entry and readers retry when
  * they observe gen == 0 or a changed gen.
  */
 struct fftimehands {
 	struct ffclock_estimate	cest;		/* Estimates in use at this tick. */
 	struct bintime		tick_time;	/* Time at tick from cest. */
 	struct bintime		tick_time_lerp;	/* Interpolated time at tick. */
 	ffcounter		tick_ffcount;	/* Counter value at tick. */
 	uint64_t		period_lerp;	/* Period used for interpolation. */
 	volatile uint8_t	gen;		/* Generation, 0 = updating. */
 	struct fftimehands	*next;		/* Next entry in the ring. */
 };
 
 /* Number of elements in a true array (not valid on a pointer). */
 #define	NUM_ELEMENTS(x) (sizeof(x) / sizeof((x)[0]))
 
 /* Backing storage for the fftimehands ring, linked up by ffclock_init(). */
 static struct fftimehands ffth[10];
 /* Entry currently published to readers; advanced by the windup code. */
 static struct fftimehands *volatile fftimehands = ffth;
 
 static void
 ffclock_init(void)
 {
 	struct fftimehands *cur;
 	struct fftimehands *last;
 
 	memset(ffth, 0, sizeof(ffth));
 
 	last = ffth + NUM_ELEMENTS(ffth) - 1;
 	for (cur = ffth; cur < last; cur++)
 		cur->next = cur + 1;
 	last->next = ffth;
 
 	ffclock_updated = 0;
 	ffclock_status = FFCLOCK_STA_UNSYNC;
 	mtx_init(&ffclock_mtx, "ffclock lock", NULL, MTX_DEF);
 }
 
 /*
  * Reset the feed-forward clock estimates. Called from inittodr() to get things
  * kick started and uses the timecounter nominal frequency as a first period
  * estimate. Note: this function may be called several time just after boot.
  * Note: this is the only function that sets the value of boot time for the
  * monotonic (i.e. uptime) version of the feed-forward clock.
  */
 void
 ffclock_reset_clock(struct timespec *ts)
 {
 	struct timecounter *tc;
 	struct ffclock_estimate cest;
 
 	tc = timehands->th_counter;
 	memset(&cest, 0, sizeof(struct ffclock_estimate));
 
 	timespec2bintime(ts, &ffclock_boottime);
 	timespec2bintime(ts, &(cest.update_time));
 	ffclock_read_counter(&cest.update_ffcount);
 	cest.leapsec_next = 0;
 	cest.period = ((1ULL << 63) / tc->tc_frequency) << 1;
 	cest.errb_abs = 0;
 	cest.errb_rate = 0;
 	cest.status = FFCLOCK_STA_UNSYNC;
 	cest.leapsec_total = 0;
 	cest.leapsec = 0;
 
 	mtx_lock(&ffclock_mtx);
 	bcopy(&cest, &ffclock_estimate, sizeof(struct ffclock_estimate));
 	ffclock_updated = INT8_MAX;
 	mtx_unlock(&ffclock_mtx);
 
 	printf("ffclock reset: %s (%llu Hz), time = %ld.%09lu\n", tc->tc_name,
 	    (unsigned long long)tc->tc_frequency, (long)ts->tv_sec,
 	    (unsigned long)ts->tv_nsec);
 }
 
 /*
  * Convert an interval of ffdelta RAW counter units into seconds, stored in
  * *bt, where period is the length of one counter unit as a bintime fraction.
  * bintime_mul() only accepts an unsigned int multiplier while ffdelta may be
  * wider, so the interval is consumed in chunks of at most UINT_MAX units.
  */
 static void
 ffclock_convert_delta(ffcounter ffdelta, uint64_t period, struct bintime *bt)
 {
 	struct bintime part;
 	ffcounter chunk, chunk_max;
 
 	chunk_max = (1ULL << (8 * sizeof(unsigned int))) - 1;
 	bintime_clear(bt);
 	do {
 		chunk = (ffdelta > chunk_max) ? chunk_max : ffdelta;
 
 		/* part = period * chunk, accumulated into the result. */
 		part.sec = 0;
 		part.frac = period;
 		bintime_mul(&part, (unsigned int)chunk);
 		bintime_add(bt, &part);
 
 		ffdelta -= chunk;
 	} while (ffdelta > 0);
 }
 
 /*
  * Update the fftimehands.
  * Push the tick ffcount and time(s) forward based on current clock estimate.
  * The conversion from ffcounter to bintime relies on the difference clock
  * principle, whose accuracy relies on computing small time intervals. If a new
  * clock estimate has been passed by the synchronisation daemon, make it
  * current, and compute the linear interpolation for monotonic time if needed.
  *
  * delta is the number of counter units elapsed since the previous tick, as
  * measured by tc_windup().
  */
 static void
 ffclock_windup(unsigned int delta)
 {
 	struct ffclock_estimate *cest;
 	struct fftimehands *ffth;
 	struct bintime bt, gap_lerp;
 	ffcounter ffdelta;
 	uint64_t frac;
 	unsigned int polling;
 	uint8_t forward_jump, ogen;
 
 	/*
 	 * Pick the next timehand, copy current ffclock estimates and move tick
 	 * times and counter forward.  The generation is zeroed while the entry
 	 * is being filled so that readers retry.
 	 */
 	forward_jump = 0;
 	ffth = fftimehands->next;
 	ogen = ffth->gen;
 	ffth->gen = 0;
 	cest = &ffth->cest;
 	bcopy(&fftimehands->cest, cest, sizeof(struct ffclock_estimate));
 	ffdelta = (ffcounter)delta;
 	ffth->period_lerp = fftimehands->period_lerp;
 
 	ffth->tick_time = fftimehands->tick_time;
 	ffclock_convert_delta(ffdelta, cest->period, &bt);
 	bintime_add(&ffth->tick_time, &bt);
 
 	ffth->tick_time_lerp = fftimehands->tick_time_lerp;
 	ffclock_convert_delta(ffdelta, ffth->period_lerp, &bt);
 	bintime_add(&ffth->tick_time_lerp, &bt);
 
 	ffth->tick_ffcount = fftimehands->tick_ffcount + ffdelta;
 
 	/*
 	 * Assess the status of the clock, if the last update is too old, it is
 	 * likely the synchronisation daemon is dead and the clock is free
 	 * running.
 	 */
 	if (ffclock_updated == 0) {
 		ffdelta = ffth->tick_ffcount - cest->update_ffcount;
 		ffclock_convert_delta(ffdelta, cest->period, &bt);
 		if (bt.sec > 2 * FFCLOCK_SKM_SCALE)
 			ffclock_status |= FFCLOCK_STA_UNSYNC;
 	}
 
 	/*
 	 * If available, grab updated clock estimates and make them current.
 	 * Recompute time at this tick using the updated estimates. The clock
 	 * estimates passed by the feed-forward synchronisation daemon may
 	 * result in time conversion that is not monotonically increasing (just
 	 * after the update). time_lerp is a particular linear interpolation
 	 * over the synchronisation algo polling period that ensures
 	 * monotonicity for the clock ids requesting it.
 	 */
 	if (ffclock_updated > 0) {
 		bcopy(&ffclock_estimate, cest, sizeof(struct ffclock_estimate));
 		ffdelta = ffth->tick_ffcount - cest->update_ffcount;
 		ffth->tick_time = cest->update_time;
 		ffclock_convert_delta(ffdelta, cest->period, &bt);
 		bintime_add(&ffth->tick_time, &bt);
 
 		/* ffclock_reset sets ffclock_updated to INT8_MAX */
 		if (ffclock_updated == INT8_MAX)
 			ffth->tick_time_lerp = ffth->tick_time;
 
 		if (bintime_cmp(&ffth->tick_time, &ffth->tick_time_lerp, >))
 			forward_jump = 1;
 		else
 			forward_jump = 0;
 
 		/* gap_lerp = |tick_time - tick_time_lerp|. */
 		bintime_clear(&gap_lerp);
 		if (forward_jump) {
 			gap_lerp = ffth->tick_time;
 			bintime_sub(&gap_lerp, &ffth->tick_time_lerp);
 		} else {
 			gap_lerp = ffth->tick_time_lerp;
 			bintime_sub(&gap_lerp, &ffth->tick_time);
 		}
 
 		/*
 		 * The reset from the RTC clock may be far from accurate, and
 		 * reducing the gap between real time and interpolated time
 		 * could take a very long time if the interpolated clock insists
 		 * on strict monotonicity. The clock is reset under very strict
 		 * conditions (kernel time is known to be wrong and
 		 * synchronization daemon has been restarted recently).
 		 * ffclock_boottime absorbs the jump to ensure boot time is
 		 * correct and uptime functions stay consistent.
 		 */
 		if (((ffclock_status & FFCLOCK_STA_UNSYNC) == FFCLOCK_STA_UNSYNC) &&
 		    ((cest->status & FFCLOCK_STA_UNSYNC) == 0) &&
 		    ((cest->status & FFCLOCK_STA_WARMUP) == FFCLOCK_STA_WARMUP)) {
 			if (forward_jump)
 				bintime_add(&ffclock_boottime, &gap_lerp);
 			else
 				bintime_sub(&ffclock_boottime, &gap_lerp);
 			ffth->tick_time_lerp = ffth->tick_time;
 			bintime_clear(&gap_lerp);
 		}
 
 		ffclock_status = cest->status;
 		ffth->period_lerp = cest->period;
 
 		/*
 		 * Compute corrected period used for the linear interpolation of
 		 * time. The rate of linear interpolation is capped to 5000PPM
 		 * (5ms/s).
 		 */
 		if (bintime_isset(&gap_lerp)) {
 			/* Counter units between the previous and new updates. */
 			ffdelta = cest->update_ffcount;
 			ffdelta -= fftimehands->cest.update_ffcount;
 			ffclock_convert_delta(ffdelta, cest->period, &bt);
 			polling = bt.sec;
 			bt.sec = 0;
 			bt.frac = 5000000 * (uint64_t)18446744073LL;
 			bintime_mul(&bt, polling);
 			if (bintime_cmp(&gap_lerp, &bt, >))
 				gap_lerp = bt;
 
 			/*
 			 * Spread the gap over ffdelta counter units.  If the
 			 * new estimate carries the same update_ffcount as the
 			 * current one, ffdelta is 0 and there is no interval to
 			 * spread over: leave the period uncorrected instead of
 			 * dividing by zero.
 			 */
 			frac = 0;
 			if (ffdelta != 0) {
 				/*
 				 * Approximate 1 sec by 1-(1/2^64) to ease
 				 * arithmetic.
 				 */
 				if (gap_lerp.sec > 0) {
 					frac -= 1;
 					frac /= ffdelta / gap_lerp.sec;
 				}
 				frac += gap_lerp.frac / ffdelta;
 			}
 
 			if (forward_jump)
 				ffth->period_lerp += frac;
 			else
 				ffth->period_lerp -= frac;
 		}
 
 		ffclock_updated = 0;
 	}
 	/* Publish the entry with a new, non-zero generation. */
 	if (++ogen == 0)
 		ogen = 1;
 	ffth->gen = ogen;
 	fftimehands = ffth;
 }
 
 /*
  * Adjust the fftimehands when the timecounter is changed. Stating the obvious,
  * the old and new hardware counter cannot be read simultaneously. tc_windup()
  * does read the two counters 'back to back', but a few cycles are effectively
  * lost, and not accumulated in tick_ffcount. This is a fairly radical
  * operation for a feed-forward synchronization daemon, and it is its job not
  * to push irrelevant data to the kernel. Because there is no locking here,
  * simply force the pending or next update to be ignored, to give the daemon
  * a chance to realize the counter has changed.
  *
  * th is the timehands that already refers to the new timecounter.
  */
 static void
 ffclock_change_tc(struct timehands *th)
 {
 	struct fftimehands *ffth;
 	struct ffclock_estimate *cest;
 	struct timecounter *tc;
 	uint8_t ogen;
 
 	tc = th->th_counter;
 	/* Fill the next ring entry; gen == 0 makes readers retry meanwhile. */
 	ffth = fftimehands->next;
 	ogen = ffth->gen;
 	ffth->gen = 0;
 
 	/* Keep the current estimates but restart from the nominal period. */
 	cest = &ffth->cest;
 	bcopy(&(fftimehands->cest), cest, sizeof(struct ffclock_estimate));
 	cest->period = ((1ULL << 63) / tc->tc_frequency ) << 1;
 	cest->errb_abs = 0;
 	cest->errb_rate = 0;
 	cest->status |= FFCLOCK_STA_UNSYNC;
 
 	/* Tick count and times carry over unchanged from the current entry. */
 	ffth->tick_ffcount = fftimehands->tick_ffcount;
 	ffth->tick_time_lerp = fftimehands->tick_time_lerp;
 	ffth->tick_time = fftimehands->tick_time;
 	ffth->period_lerp = cest->period;
 
 	/* Do not lock but ignore next update from synchronization daemon. */
 	ffclock_updated--;
 
 	/* Publish the entry with a new, non-zero generation. */
 	if (++ogen == 0)
 		ogen = 1;
 	ffth->gen = ogen;
 	fftimehands = ffth;
 }
 
 /*
  * Retrieve feed-forward counter and time of last kernel tick.
  *
  * *ffcount receives tick_ffcount; *bt receives tick_time_lerp when
  * FFCLOCK_LERP is set in flags and tick_time otherwise.
  */
 void
 ffclock_last_tick(ffcounter *ffcount, struct bintime *bt, uint32_t flags)
 {
 	struct fftimehands *ffth;
 	uint8_t gen;
 
 	/*
 	 * No locking but check generation has not changed.
 	 * NOTE(review): unlike the timehands readers in this file, no acquire
 	 * fences are used here; gen is only declared volatile.  Confirm this
 	 * is sufficient on weakly ordered machines.
 	 */
 	do {
 		ffth = fftimehands;
 		gen = ffth->gen;
 		if ((flags & FFCLOCK_LERP) == FFCLOCK_LERP)
 			*bt = ffth->tick_time_lerp;
 		else
 			*bt = ffth->tick_time;
 		*ffcount = ffth->tick_ffcount;
 	} while (gen == 0 || gen != ffth->gen);
 }
 
 /*
  * Absolute clock conversion. Low level function to convert ffcounter to
  * bintime. The ffcounter is converted using the current ffclock period estimate
  * or the "interpolated period" to ensure monotonicity.
  * NOTE: this conversion may have been deferred, and the clock updated since the
  * hardware counter has been read.
  *
  * ffcount is the counter value to convert, *bt receives the result, and
  * FFCLOCK_LERP in flags selects the interpolated tick time and period.
  */
 void
 ffclock_convert_abs(ffcounter ffcount, struct bintime *bt, uint32_t flags)
 {
 	struct fftimehands *ffth;
 	struct bintime bt2;
 	ffcounter ffdelta;
 	uint8_t gen;
 
 	/*
 	 * No locking but check generation has not changed. ffdelta must stay
 	 * positive, so the distance to tick_ffcount is taken as an absolute
 	 * value and then added to or subtracted from the tick time depending
 	 * on whether ffcount lies after or before the tick.
 	 */
 	do {
 		ffth = fftimehands;
 		gen = ffth->gen;
 		if (ffcount > ffth->tick_ffcount)
 			ffdelta = ffcount - ffth->tick_ffcount;
 		else
 			ffdelta = ffth->tick_ffcount - ffcount;
 
 		if ((flags & FFCLOCK_LERP) == FFCLOCK_LERP) {
 			*bt = ffth->tick_time_lerp;
 			ffclock_convert_delta(ffdelta, ffth->period_lerp, &bt2);
 		} else {
 			*bt = ffth->tick_time;
 			ffclock_convert_delta(ffdelta, ffth->cest.period, &bt2);
 		}
 
 		if (ffcount > ffth->tick_ffcount)
 			bintime_add(bt, &bt2);
 		else
 			bintime_sub(bt, &bt2);
 	} while (gen == 0 || gen != ffth->gen);
 }
 
 /*
  * Difference clock conversion.
  * Low level function to convert a time interval measured in RAW counter units
  * into bintime. The difference clock allows measuring small intervals much more
  * reliably than the absolute clock.
  *
  * ffdelta is the interval in counter units; *bt receives its duration
  * computed with the period of the current clock estimate.
  */
 void
 ffclock_convert_diff(ffcounter ffdelta, struct bintime *bt)
 {
 	struct fftimehands *ffth;
 	uint8_t gen;
 
 	/* No locking but check generation has not changed. */
 	do {
 		ffth = fftimehands;
 		gen = ffth->gen;
 		ffclock_convert_delta(ffdelta, ffth->cest.period, bt);
 	} while (gen == 0 || gen != ffth->gen);
 }
 
 /*
  * Access to current ffcounter value.
  *
  * *ffcount receives the counter value at the last tick plus the hardware
  * counter delta accumulated since that tick.
  */
 void
 ffclock_read_counter(ffcounter *ffcount)
 {
 	struct timehands *th;
 	struct fftimehands *ffth;
 	unsigned int gen, delta;
 
 	/*
 	 * ffclock_windup() called from tc_windup(), safe to rely on
 	 * th->th_generation only, for correct delta and ffcounter.
 	 */
 	do {
 		th = timehands;
 		gen = atomic_load_acq_int(&th->th_generation);
 		ffth = fftimehands;
 		delta = tc_delta(th);
 		*ffcount = ffth->tick_ffcount;
 		atomic_thread_fence_acq();
 	} while (gen == 0 || gen != th->th_generation);
 
 	/* Only add the delta once a consistent pair has been read. */
 	*ffcount += delta;
 }
 
 /* Dispatch to binuptime_fromclock() using the active system clock. */
 void
 binuptime(struct bintime *bt)
 {
 	binuptime_fromclock(bt, sysclock_active);
 }
 
 /* Dispatch to nanouptime_fromclock() using the active system clock. */
 void
 nanouptime(struct timespec *tsp)
 {
 	nanouptime_fromclock(tsp, sysclock_active);
 }
 
 /* Dispatch to microuptime_fromclock() using the active system clock. */
 void
 microuptime(struct timeval *tvp)
 {
 	microuptime_fromclock(tvp, sysclock_active);
 }
 
 /* Dispatch to bintime_fromclock() using the active system clock. */
 void
 bintime(struct bintime *bt)
 {
 	bintime_fromclock(bt, sysclock_active);
 }
 
 /* Dispatch to nanotime_fromclock() using the active system clock. */
 void
 nanotime(struct timespec *tsp)
 {
 	nanotime_fromclock(tsp, sysclock_active);
 }
 
 /* Dispatch to microtime_fromclock() using the active system clock. */
 void
 microtime(struct timeval *tvp)
 {
 	microtime_fromclock(tvp, sysclock_active);
 }
 
 /* Dispatch to getbinuptime_fromclock() using the active system clock. */
 void
 getbinuptime(struct bintime *bt)
 {
 	getbinuptime_fromclock(bt, sysclock_active);
 }
 
 /* Dispatch to getnanouptime_fromclock() using the active system clock. */
 void
 getnanouptime(struct timespec *tsp)
 {
 	getnanouptime_fromclock(tsp, sysclock_active);
 }
 
 /* Dispatch to getmicrouptime_fromclock() using the active system clock. */
 void
 getmicrouptime(struct timeval *tvp)
 {
 	getmicrouptime_fromclock(tvp, sysclock_active);
 }
 
 /* Dispatch to getbintime_fromclock() using the active system clock. */
 void
 getbintime(struct bintime *bt)
 {
 	getbintime_fromclock(bt, sysclock_active);
 }
 
 /* Dispatch to getnanotime_fromclock() using the active system clock. */
 void
 getnanotime(struct timespec *tsp)
 {
 	getnanotime_fromclock(tsp, sysclock_active);
 }
 
 /*
  * Dispatch to getmicrotime_fromclock() using the active system clock.
  * This must be the "time" variant, matching every other wrapper in this
  * group; the "uptime" variant (getmicrouptime_fromclock) would hand callers
  * time since boot instead of the current time.
  */
 void
 getmicrotime(struct timeval *tvp)
 {
 
 	getmicrotime_fromclock(tvp, sysclock_active);
 }
 
 #endif /* FFCLOCK */
 
 /*
  * This is a clone of getnanotime and used for walltimestamps.
  * The dtrace_ prefix prevents fbt from creating probes for
  * it so walltimestamp can be safely used in all fbt probes.
  *
  * It always reads the timehands' th_nanotime directly.
  */
 void
 dtrace_getnanotime(struct timespec *tsp)
 {
 	struct timehands *th;
 	u_int gen;
 
 	do {
 		th = timehands;
 		gen = atomic_load_acq_int(&th->th_generation);
 		*tsp = th->th_nanotime;
 		atomic_thread_fence_acq();
 		/* Retry if th was being updated (0) or was reused meanwhile. */
 	} while (gen == 0 || gen != th->th_generation);
 }
 
 /*
  * System clock currently providing time to the system. Modifiable via sysctl
  * when the FFCLOCK option is defined.  Defaults to the feedback clock.
  */
 int sysclock_active = SYSCLOCK_FBCK;
 
 /*
  * Internal NTP status and error estimates.  Defined outside this part of
  * the file; sysclock_getsnapshot() records them as the feedback clock's
  * status and error.
  */
 extern int time_status;
 extern long time_esterror;
 
 /*
  * Take a snapshot of sysclock data which can be used to compare system clocks
  * and generate timestamps after the fact.
  *
  * clock_snap receives the snapshot.  When fast is non-zero the hardware
  * counter is not read: the recorded delta is 0 and the snapshot refers to the
  * last tick rather than to the moment of the call.
  */
 void
 sysclock_getsnapshot(struct sysclock_snap *clock_snap, int fast)
 {
 	struct fbclock_info *fbi;
 	struct timehands *th;
 	struct bintime bt;
 	unsigned int delta, gen;
 #ifdef FFCLOCK
 	ffcounter ffcount;
 	struct fftimehands *ffth;
 	struct ffclock_info *ffi;
 	struct ffclock_estimate cest;
 
 	ffi = &clock_snap->ff_info;
 #endif
 
 	fbi = &clock_snap->fb_info;
 	delta = 0;
 
 	/* Capture both clocks' tick state under one timehands generation. */
 	do {
 		th = timehands;
 		gen = atomic_load_acq_int(&th->th_generation);
 		fbi->th_scale = th->th_scale;
 		fbi->tick_time = th->th_offset;
 #ifdef FFCLOCK
 		ffth = fftimehands;
 		/*
 		 * Keep the plain and interpolated tick times distinct:
 		 * sysclock_snap2bintime() picks between them based on
 		 * FFCLOCK_LERP.
 		 */
 		ffi->tick_time = ffth->tick_time;
 		ffi->tick_time_lerp = ffth->tick_time_lerp;
 		ffi->period = ffth->cest.period;
 		ffi->period_lerp = ffth->period_lerp;
 		clock_snap->ffcount = ffth->tick_ffcount;
 		cest = ffth->cest;
 #endif
 		if (!fast)
 			delta = tc_delta(th);
 		atomic_thread_fence_acq();
 	} while (gen == 0 || gen != th->th_generation);
 
 	clock_snap->delta = delta;
 	clock_snap->sysclock_active = sysclock_active;
 
 	/* Record feedback clock status and error. */
 	clock_snap->fb_info.status = time_status;
 	/*
 	 * XXX: Very crude estimate of feedback clock error.
 	 * time_esterror is treated as microseconds: whole seconds go to
 	 * bt.sec and the remaining microseconds are scaled to a binary
 	 * fraction with 18446744073709 = int(2^64 / 1e6).
 	 */
 	bt.sec = time_esterror / 1000000;
 	bt.frac = (uint64_t)(time_esterror % 1000000) *
 	    (uint64_t)18446744073709ULL;
 	clock_snap->fb_info.error = bt;
 
 #ifdef FFCLOCK
 	if (!fast)
 		clock_snap->ffcount += delta;
 
 	/* Record feed-forward clock leap second adjustment. */
 	ffi->leapsec_adjustment = cest.leapsec_total;
 	if (clock_snap->ffcount > cest.leapsec_next)
 		ffi->leapsec_adjustment -= cest.leapsec;
 
 	/* Record feed-forward clock status and error. */
 	clock_snap->ff_info.status = cest.status;
 	ffcount = clock_snap->ffcount - cest.update_ffcount;
 	ffclock_convert_delta(ffcount, cest.period, &bt);
 	/*
 	 * err_bound_rate is in [ps/s].
 	 * NOTE(review): 18446744073709 is int(2^64 / 1e6), not 2^64 / 1e12,
 	 * and elsewhere in this file bintime_mul() is only given unsigned
 	 * int multipliers; confirm the intended scaling of this product.
 	 */
 	bintime_mul(&bt, cest.errb_rate * (uint64_t)18446744073709ULL);
 	/* 18446744073 = int(2^64 / 1e9), since err_abs in [ns]. */
 	bintime_addx(&bt, cest.errb_abs * (uint64_t)18446744073ULL);
 	clock_snap->ff_info.error = bt;
 #endif
 }
 
 /*
  * Turn a sysclock snapshot into a struct bintime.
  *
  * cs is the snapshot, *bt receives the result, whichclock selects the
  * feedback (SYSCLOCK_FBCK) or feed-forward (SYSCLOCK_FFWD) clock and flags
  * tune the conversion.  Returns 0 on success or EINVAL when whichclock is
  * not a clock this kernel supports.
  */
 int
 sysclock_snap2bintime(struct sysclock_snap *cs, struct bintime *bt,
     int whichclock, uint32_t flags)
 {
 	struct bintime boot;
 #ifdef FFCLOCK
 	struct bintime elapsed;
 	uint64_t period;
 #endif
 
 	switch (whichclock) {
 	case SYSCLOCK_FBCK:
 		*bt = cs->fb_info.tick_time;
 
 		/* A non-fast snapshot carries the ticks since the last windup. */
 		if (cs->delta > 0)
 			bintime_addx(bt, cs->fb_info.th_scale * cs->delta);
 
 		/* tick_time is uptime; add the boot time unless uptime is wanted. */
 		if ((flags & FBCLOCK_UPTIME) == 0) {
 			getboottimebin(&boot);
 			bintime_add(bt, &boot);
 		}
 		return (0);
 #ifdef FFCLOCK
 	case SYSCLOCK_FFWD:
 		if ((flags & FFCLOCK_LERP) != 0) {
 			*bt = cs->ff_info.tick_time_lerp;
 			period = cs->ff_info.period_lerp;
 		} else {
 			*bt = cs->ff_info.tick_time;
 			period = cs->ff_info.period;
 		}
 
 		/* A non-fast snapshot carries the ticks since the last windup. */
 		if (cs->delta > 0) {
 			ffclock_convert_delta(cs->delta, period, &elapsed);
 			bintime_add(bt, &elapsed);
 		}
 
 		/* Leap second adjustment. */
 		if ((flags & FFCLOCK_LEAPSEC) != 0)
 			bt->sec -= cs->ff_info.leapsec_adjustment;
 
 		/* Boot time adjustment, for uptime/monotonic clocks. */
 		if ((flags & FFCLOCK_UPTIME) != 0)
 			bintime_sub(bt, &ffclock_boottime);
 		return (0);
 #endif
 	default:
 		return (EINVAL);
 	}
 }
 
 /*
  * Register a new timecounter: sanity check its wrap rate against hz, add it
  * to the list of known timecounters, export it through sysctl and, if it is
  * at least as good as the current one, make it the active timecounter.
  */
 void
 tc_init(struct timecounter *tc)
 {
 	struct sysctl_oid *tc_root;
 	u_int min_hz;
 
 	/* XXX: We need some margin here, 10% is a guess */
 	min_hz = tc->tc_frequency / tc->tc_counter_mask;
 	min_hz = min_hz * 11 / 10;
 
 	if (min_hz > hz && tc->tc_quality >= 0) {
 		/* The counter would wrap between ticks: never auto-select it. */
 		tc->tc_quality = -2000;
 		if (bootverbose) {
 			printf("Timecounter \"%s\" frequency %ju Hz",
 			    tc->tc_name, (uintmax_t)tc->tc_frequency);
 			printf(" -- Insufficient hz, needs at least %u\n",
 			    min_hz);
 		}
 	} else if (tc->tc_quality >= 0 || bootverbose) {
 		printf("Timecounter \"%s\" frequency %ju Hz quality %d\n",
 		    tc->tc_name, (uintmax_t)tc->tc_frequency,
 		    tc->tc_quality);
 	}
 
 	/* Push onto the head of the list of known timecounters. */
 	tc->tc_next = timecounters;
 	timecounters = tc;
 
 	/* Create the sysctl subtree describing this counter. */
 	tc_root = SYSCTL_ADD_NODE_WITH_LABEL(NULL,
 	    SYSCTL_STATIC_CHILDREN(_kern_timecounter_tc), OID_AUTO, tc->tc_name,
 	    CTLFLAG_RW, 0, "timecounter description", "timecounter");
 	SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(tc_root), OID_AUTO,
 	    "mask", CTLFLAG_RD, &(tc->tc_counter_mask), 0,
 	    "mask for implemented bits");
 	SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(tc_root), OID_AUTO,
 	    "counter", CTLTYPE_UINT | CTLFLAG_RD, tc, sizeof(*tc),
 	    sysctl_kern_timecounter_get, "IU", "current timecounter value");
 	SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(tc_root), OID_AUTO,
 	    "frequency", CTLTYPE_U64 | CTLFLAG_RD, tc, sizeof(*tc),
 	    sysctl_kern_timecounter_freq, "QU", "timecounter frequency");
 	SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(tc_root), OID_AUTO,
 	    "quality", CTLFLAG_RD, &(tc->tc_quality), 0,
 	    "goodness of time counter");
 
 	/*
 	 * Keep the current timecounter if it was explicitly chosen, if the new
 	 * one has negative quality, or if the new one is of lower quality.
 	 * Even though we may be running on the dummy counter, switching here
 	 * could be worse since this timecounter may not be monotonic.
 	 */
 	if (tc_chosen || tc->tc_quality < 0 ||
 	    tc->tc_quality < timecounter->tc_quality)
 		return;
 	/* On equal quality, the higher frequency wins. */
 	if (tc->tc_quality == timecounter->tc_quality &&
 	    tc->tc_frequency < timecounter->tc_frequency)
 		return;
 
 	/* Read the counter twice before switching to it. */
 	(void)tc->tc_get_timecount(tc);
 	(void)tc->tc_get_timecount(tc);
 	timecounter = tc;
 }
 
 /* Report the frequency of the current timecounter. */
 uint64_t
 tc_getfrequency(void)
 {
 
 	return (timehands->th_counter->tc_frequency);
 }
 
 /*
  * Sleepqueue matching callback used by tc_setclock(): report whether td
  * recorded an RTC generation that is no longer current and, if so, reset
  * td_rtcgen to zero.
  */
 static bool
 sleeping_on_old_rtc(struct thread *td)
 {
 
 	/*
 	 * td_rtcgen is modified by curthread when it is running,
 	 * and by other threads in this function.  By finding the thread
 	 * on a sleepqueue and holding the lock on the sleepqueue
 	 * chain, we guarantee that the thread is not running and that
 	 * modifying td_rtcgen is safe.  Setting td_rtcgen to zero informs
 	 * the thread that it was woken due to a real-time clock adjustment.
 	 * (The declaration of td_rtcgen refers to this comment.)
 	 */
 	if (td->td_rtcgen == 0 || td->td_rtcgen == rtc_generation)
 		return (false);
 
 	td->td_rtcgen = 0;
 	return (true);
 }
 
 /* Spin mutex held by tc_setclock() around its call to tc_windup(). */
 static struct mtx tc_setclock_mtx;
 MTX_SYSINIT(tc_setclock_init, &tc_setclock_mtx, "tcsetc", MTX_SPIN);
 
 /*
  * Step our concept of UTC.  This is done by modifying our estimate of
  * when we booted.
  */
 void
 tc_setclock(struct timespec *ts)
 {
 	struct timespec tbef, taft;
 	struct bintime bt, bt2;
 
 	timespec2bintime(ts, &bt);
 	nanotime(&tbef);
 	mtx_lock_spin(&tc_setclock_mtx);
 	cpu_tick_calibrate(1);
 	binuptime(&bt2);
 	bintime_sub(&bt, &bt2);
 
 	/* XXX fiddle all the little crinkly bits around the fiords... */
 	tc_windup(&bt);
 	mtx_unlock_spin(&tc_setclock_mtx);
 
 	/* Avoid rtc_generation == 0, since td_rtcgen == 0 is special. */
 	atomic_add_rel_int(&rtc_generation, 2);
 	sleepq_chains_remove_matching(sleeping_on_old_rtc);
 	if (timestepwarnings) {
 		nanotime(&taft);
 		log(LOG_INFO,
 		    "Time stepped from %jd.%09ld to %jd.%09ld (%jd.%09ld)\n",
 		    (intmax_t)tbef.tv_sec, tbef.tv_nsec,
 		    (intmax_t)taft.tv_sec, taft.tv_nsec,
 		    (intmax_t)ts->tv_sec, ts->tv_nsec);
 	}
 }
 
 /*
  * Initialize the next struct timehands in the ring and make
  * it the active timehands.  Along the way we might switch to a different
  * timecounter and/or do seconds processing in NTP.  Slightly magic.
  *
  * If new_boottimebin is not NULL it replaces the boot time stored in the new
  * timehands; tc_setclock() uses this to step the clock.
  */
 static void
 tc_windup(struct bintime *new_boottimebin)
 {
 	struct bintime bt;
 	struct timehands *th, *tho;
 	uint64_t scale;
 	u_int delta, ncount, ogen;
 	int i;
 	time_t t;
 
 	/*
 	 * Make the next timehands a copy of the current one, but do
 	 * not overwrite the generation or next pointer.  While we
 	 * update the contents, the generation must be zero.  We need
 	 * to ensure that the zero generation is visible before the
 	 * data updates become visible, which requires release fence.
 	 * For similar reasons, re-reading of the generation after the
 	 * data is read should use acquire fence.
 	 */
 	tho = timehands;
 	th = tho->th_next;
 	ogen = th->th_generation;
 	th->th_generation = 0;
 	atomic_thread_fence_rel();
 	/* Copy only the fields laid out before th_generation. */
 	memcpy(th, tho, offsetof(struct timehands, th_generation));
 	if (new_boottimebin != NULL)
 		th->th_boottime = *new_boottimebin;
 
 	/*
 	 * Capture a timecounter delta on the current timecounter and if
 	 * changing timecounters, a counter value from the new timecounter.
 	 * Update the offset fields accordingly.
 	 */
 	delta = tc_delta(th);
 	if (th->th_counter != timecounter)
 		ncount = timecounter->tc_get_timecount(timecounter);
 	else
 		ncount = 0;
 #ifdef FFCLOCK
 	/* Advance the feed-forward clock by the same number of ticks. */
 	ffclock_windup(delta);
 #endif
 	th->th_offset_count += delta;
 	th->th_offset_count &= th->th_counter->tc_counter_mask;
 	while (delta > th->th_counter->tc_frequency) {
 		/* Eat complete unadjusted seconds. */
 		delta -= th->th_counter->tc_frequency;
 		th->th_offset.sec++;
 	}
 	if ((delta > th->th_counter->tc_frequency / 2) &&
 	    (th->th_scale * delta < ((uint64_t)1 << 63))) {
 		/* The product th_scale * delta just barely overflows. */
 		th->th_offset.sec++;
 	}
 	bintime_addx(&th->th_offset, th->th_scale * delta);
 
 	/*
 	 * Hardware latching timecounters may not generate interrupts on
 	 * PPS events, so instead we poll them.  There is a finite risk that
 	 * the hardware might capture a count which is later than the one we
 	 * got above, and therefore possibly in the next NTP second which might
 	 * have a different rate than the current NTP second.  It doesn't
 	 * matter in practice.
 	 */
 	if (tho->th_counter->tc_poll_pps)
 		tho->th_counter->tc_poll_pps(tho->th_counter);
 
 	/*
 	 * Deal with NTP second processing.  The for loop normally
 	 * iterates at most once, but in extreme situations it might
 	 * keep NTP sane if timeouts are not run for several seconds.
 	 * At boot, the time step can be large when the TOD hardware
 	 * has been read, so on really large steps, we call
 	 * ntp_update_second only twice.  We need to call it twice in
 	 * case we missed a leap second.
 	 */
 	bt = th->th_offset;
 	bintime_add(&bt, &th->th_boottime);
 	/* Whole seconds elapsed since the previous timehands' timestamp. */
 	i = bt.sec - tho->th_microtime.tv_sec;
 	if (i > LARGE_STEP)
 		i = 2;
 	for (; i > 0; i--) {
 		t = bt.sec;
 		ntp_update_second(&th->th_adjustment, &bt.sec);
 		/* Fold any change NTP made to the second into the boot time. */
 		if (bt.sec != t)
 			th->th_boottime.sec += bt.sec - t;
 	}
 	/* Update the UTC timestamps used by the get*() functions. */
 	th->th_bintime = bt;
 	bintime2timeval(&bt, &th->th_microtime);
 	bintime2timespec(&bt, &th->th_nanotime);
 
 	/* Now is a good time to change timecounters. */
 	if (th->th_counter != timecounter) {
 #ifndef __arm__
 		/* Move the C2-sleep inhibit count from the old to the new tc. */
 		if ((timecounter->tc_flags & TC_FLAGS_C2STOP) != 0)
 			cpu_disable_c2_sleep++;
 		if ((th->th_counter->tc_flags & TC_FLAGS_C2STOP) != 0)
 			cpu_disable_c2_sleep--;
 #endif
 		th->th_counter = timecounter;
 		th->th_offset_count = ncount;
 		tc_min_ticktock_freq = max(1, timecounter->tc_frequency /
 		    (((uint64_t)timecounter->tc_counter_mask + 1) / 3));
 #ifdef FFCLOCK
 		ffclock_change_tc(th);
 #endif
 	}
 
 	/*-
 	 * Recalculate the scaling factor.  We want the number of 1/2^64
 	 * fractions of a second per period of the hardware counter, taking
 	 * into account the th_adjustment factor which the NTP PLL/adjtime(2)
 	 * processing provides us with.
 	 *
 	 * The th_adjustment is nanoseconds per second with 32 bit binary
 	 * fraction and we want 64 bit binary fraction of second:
 	 *
 	 *	 x = a * 2^32 / 10^9 = a * 4.294967296
 	 *
 	 * The range of th_adjustment is +/- 5000PPM so inside a 64bit int
 	 * we can only multiply by about 850 without overflowing, that
 	 * leaves no suitably precise fractions for multiply before divide.
 	 *
 	 * Divide before multiply with a fraction of 2199/512 results in a
 	 * systematic undercompensation of 10PPM of th_adjustment.  On a
 	 * 5000PPM adjustment this is a 0.05PPM error.  This is acceptable.
 	 *
 	 * We happily sacrifice the lowest of the 64 bits of our result
 	 * to the goddess of code clarity.
 	 *
 	 */
 	scale = (uint64_t)1 << 63;
 	scale += (th->th_adjustment / 1024) * 2199;
 	scale /= th->th_counter->tc_frequency;
 	th->th_scale = scale * 2;
 
 	/*
 	 * Now that the struct timehands is again consistent, set the new
 	 * generation number, making sure to not make it zero.
 	 */
 	if (++ogen == 0)
 		ogen = 1;
 	atomic_store_rel_int(&th->th_generation, ogen);
 
 	/* Go live with the new struct timehands. */
 #ifdef FFCLOCK
 	/* time_second/time_uptime follow whichever clock is active. */
 	switch (sysclock_active) {
 	case SYSCLOCK_FBCK:
 #endif
 		time_second = th->th_microtime.tv_sec;
 		time_uptime = th->th_offset.sec;
 #ifdef FFCLOCK
 		break;
 	case SYSCLOCK_FFWD:
 		time_second = fftimehands->tick_time_lerp.sec;
 		time_uptime = fftimehands->tick_time_lerp.sec - ffclock_boottime.sec;
 		break;
 	}
 #endif
 
 	timehands = th;
 	timekeep_push_vdso();
 }
 
 /*
  * Sysctl handler for kern.timecounter.hardware.
  *
  * On read, reports the name of the timecounter currently selected.  On
  * write, selects the registered timecounter whose name matches the string
  * supplied by the caller.
  *
  * Returns 0 on success (including a write that names the counter already
  * in use), the error from sysctl_handle_string() if that fails, or EINVAL
  * when no registered timecounter has the requested name.
  */
 static int
 sysctl_kern_timecounter_hardware(SYSCTL_HANDLER_ARGS)
 {
 	struct timecounter *cur, *cand;
 	char name[32];
 	int rv;
 
 	cur = timecounter;
 	strlcpy(name, cur->tc_name, sizeof(name));
 
 	rv = sysctl_handle_string(oidp, &name[0], sizeof(name), req);
 	if (rv != 0 || req->newptr == NULL)
 		return (rv);
 
 	/* Any write, even one that changes nothing, marks the tc as chosen. */
 	tc_chosen = 1;
 	if (strcmp(name, cur->tc_name) == 0)
 		return (0);
 
 	/* Search the list of registered timecounters for the new name. */
 	cand = timecounters;
 	while (cand != NULL && strcmp(name, cand->tc_name) != 0)
 		cand = cand->tc_next;
 	if (cand == NULL)
 		return (EINVAL);
 
 	/* Warm up the new timecounter by reading it twice. */
 	(void)cand->tc_get_timecount(cand);
 	(void)cand->tc_get_timecount(cand);
 
 	timecounter = cand;
 
 	/*
 	 * The vdso timehands are not pushed from here; that is left to the
 	 * next 'tc_windup()'.
 	 *
 	 * 'timekeep_push_vdso()' takes no locks and may be reached from
 	 * hard interrupt context via 'tc_windup()', so deferring is the
 	 * prudent choice.
 	 */
 	return (0);
 }
 
 SYSCTL_PROC(_kern_timecounter, OID_AUTO, hardware, CTLTYPE_STRING | CTLFLAG_RW,
     0, 0, sysctl_kern_timecounter_hardware, "A",
     "Timecounter hardware selected");
 
 
 /*
  * Sysctl handler for kern.timecounter.choice.
  *
  * Emits every registered timecounter as "name(quality)", with a single
  * space between consecutive entries.  Returns the result of sbuf_finish().
  */
 static int
 sysctl_kern_timecounter_choice(SYSCTL_HANDLER_ARGS)
 {
 	struct timecounter *tc;
 	struct sbuf sb;
 	int error;
 
 	sbuf_new_for_sysctl(&sb, NULL, 0, req);
 	tc = timecounters;
 	while (tc != NULL) {
 		/* Every entry except the list head is preceded by a space. */
 		if (tc != timecounters)
 			sbuf_putc(&sb, ' ');
 		sbuf_printf(&sb, "%s(%d)", tc->tc_name, tc->tc_quality);
 		tc = tc->tc_next;
 	}
 	error = sbuf_finish(&sb);
 	sbuf_delete(&sb);
 	return (error);
 }
 
 SYSCTL_PROC(_kern_timecounter, OID_AUTO, choice, CTLTYPE_STRING | CTLFLAG_RD,
     0, 0, sysctl_kern_timecounter_choice, "A", "Timecounter hardware detected");
 
 /*
  * RFC 2783 PPS-API implementation.
  */
 
 /*
  *  Tell whether the driver behind 'pps' understands the abi version
  *  extensions of struct pps_state (KCMODE_ABIFLAG set in kcmode) and
  *  declares a driver_abi of at least 'vers'.  Returns non-zero if so.
  */
 static inline int
 abi_aware(struct pps_state *pps, int vers)
 {
 
 	if ((pps->kcmode & KCMODE_ABIFLAG) == 0)
 		return (0);
 	return (pps->driver_abi >= vers);
 }
 
 /*
  * Service a PPS fetch request: optionally wait for the next assert or
  * clear event, then copy the most recent capture data into
  * fapi->pps_info_buf.
  *
  * Returns 0 on success, EINVAL when tsformat is non-zero and not
  * PPS_TSFMT_TSPEC, ETIMEDOUT when a finite timeout expires without a new
  * event, or the error returned by the sleep primitive.
  */
 static int
 pps_fetch(struct pps_fetch_args *fapi, struct pps_state *pps)
 {
 	int err, timo;
 	pps_seq_t aseq, cseq;
 	struct timeval tv;
 
 	if (fapi->tsformat && fapi->tsformat != PPS_TSFMT_TSPEC)
 		return (EINVAL);
 
 	/*
 	 * If no timeout is requested, immediately return whatever values were
 	 * most recently captured.  If timeout seconds is -1, that's a request
 	 * to block without a timeout.  WITNESS won't let us sleep forever
 	 * without a lock (we really don't need a lock), so just repeatedly
 	 * sleep a long time.
 	 */
 	if (fapi->timeout.tv_sec || fapi->timeout.tv_nsec) {
 		if (fapi->timeout.tv_sec == -1)
 			timo = 0x7fffffff;
 		else {
 			/* Convert the timespec timeout to ticks via a timeval. */
 			tv.tv_sec = fapi->timeout.tv_sec;
 			tv.tv_usec = fapi->timeout.tv_nsec / 1000;
 			timo = tvtohz(&tv);
 		}
 		/*
 		 * Remember the sequence numbers seen on entry; the wait ends
 		 * as soon as either of them changes.
 		 */
 		aseq = atomic_load_int(&pps->ppsinfo.assert_sequence);
 		cseq = atomic_load_int(&pps->ppsinfo.clear_sequence);
 		while (aseq == atomic_load_int(&pps->ppsinfo.assert_sequence) &&
 		    cseq == atomic_load_int(&pps->ppsinfo.clear_sequence)) {
 			/*
 			 * Sleep on the driver's mutex when an abi-aware driver
 			 * supplied one (spin or regular, per PPSFLAG_MTX_SPIN);
 			 * otherwise sleep without a lock.
 			 */
 			if (abi_aware(pps, 1) && pps->driver_mtx != NULL) {
 				if (pps->flags & PPSFLAG_MTX_SPIN) {
 					err = msleep_spin(pps, pps->driver_mtx,
 					    "ppsfch", timo);
 				} else {
 					err = msleep(pps, pps->driver_mtx, PCATCH,
 					    "ppsfch", timo);
 				}
 			} else {
 				err = tsleep(pps, PCATCH, "ppsfch", timo);
 			}
 			if (err == EWOULDBLOCK) {
 				/* Sleep timed out: retry if blocking forever. */
 				if (fapi->timeout.tv_sec == -1) {
 					continue;
 				} else {
 					return (ETIMEDOUT);
 				}
 			} else if (err != 0) {
 				return (err);
 			}
 		}
 	}
 
 	/* Report the current mode along with the captured data. */
 	pps->ppsinfo.current_mode = pps->ppsparam.mode;
 	fapi->pps_info_buf = pps->ppsinfo;
 
 	return (0);
 }
 
 /*
  * Handle the PPS-API ioctl 'cmd' for the given pps_state; 'data' points at
  * the command-specific argument structure.
  *
  * Returns 0 on success, EINVAL for rejected arguments, EOPNOTSUPP for
  * requests that are not supported in this configuration, and ENOIOCTL for
  * commands that are not handled here.
  */
 int
 pps_ioctl(u_long cmd, caddr_t data, struct pps_state *pps)
 {
 	pps_params_t *app;
 	struct pps_fetch_args *fapi;
 #ifdef FFCLOCK
 	struct pps_fetch_ffc_args *fapi_ffc;
 #endif
 #ifdef PPS_SYNC
 	struct pps_kcbind_args *kapi;
 #endif
 
 	KASSERT(pps != NULL, ("NULL pps pointer in pps_ioctl"));
 	switch (cmd) {
 	case PPS_IOC_CREATE:
 		return (0);
 	case PPS_IOC_DESTROY:
 		return (0);
 	case PPS_IOC_SETPARAMS:
 		app = (pps_params_t *)data;
 		/* Refuse any mode bit the device does not advertise. */
 		if (app->mode & ~pps->ppscap)
 			return (EINVAL);
 #ifdef FFCLOCK
 		/* Ensure only a single clock is selected for ffc timestamp. */
 		if ((app->mode & PPS_TSCLK_MASK) == PPS_TSCLK_MASK)
 			return (EINVAL);
 #endif
 		pps->ppsparam = *app;
 		return (0);
 	case PPS_IOC_GETPARAMS:
 		app = (pps_params_t *)data;
 		*app = pps->ppsparam;
 		app->api_version = PPS_API_VERS_1;
 		return (0);
 	case PPS_IOC_GETCAP:
 		*(int*)data = pps->ppscap;
 		return (0);
 	case PPS_IOC_FETCH:
 		fapi = (struct pps_fetch_args *)data;
 		return (pps_fetch(fapi, pps));
 #ifdef FFCLOCK
 	case PPS_IOC_FETCH_FFCOUNTER:
 		fapi_ffc = (struct pps_fetch_ffc_args *)data;
 		if (fapi_ffc->tsformat && fapi_ffc->tsformat !=
 		    PPS_TSFMT_TSPEC)
 			return (EINVAL);
 		/* Unlike PPS_IOC_FETCH, waiting is not supported here. */
 		if (fapi_ffc->timeout.tv_sec || fapi_ffc->timeout.tv_nsec)
 			return (EOPNOTSUPP);
 		pps->ppsinfo_ffc.current_mode = pps->ppsparam.mode;
 		fapi_ffc->pps_info_buf_ffc = pps->ppsinfo_ffc;
 		/* Overwrite timestamps if feedback clock selected. */
 		switch (pps->ppsparam.mode & PPS_TSCLK_MASK) {
 		case PPS_TSCLK_FBCK:
 			fapi_ffc->pps_info_buf_ffc.assert_timestamp =
 			    pps->ppsinfo.assert_timestamp;
 			fapi_ffc->pps_info_buf_ffc.clear_timestamp =
 			    pps->ppsinfo.clear_timestamp;
 			break;
 		case PPS_TSCLK_FFWD:
 			break;
 		default:
 			break;
 		}
 		return (0);
 #endif /* FFCLOCK */
 	case PPS_IOC_KCBIND:
 #ifdef PPS_SYNC
 		kapi = (struct pps_kcbind_args *)data;
 		/* XXX Only root should be able to do this */
 		if (kapi->tsformat && kapi->tsformat != PPS_TSFMT_TSPEC)
 			return (EINVAL);
 		if (kapi->kernel_consumer != PPS_KC_HARDPPS)
 			return (EINVAL);
 		if (kapi->edge & ~pps->ppscap)
 			return (EINVAL);
 		/* Replace the edge selection but keep the ABI flag as is. */
 		pps->kcmode = (kapi->edge & KCMODE_EDGEMASK) |
 		    (pps->kcmode & KCMODE_ABIFLAG);
 		return (0);
 #else
 		return (EOPNOTSUPP);
 #endif
 	default:
 		return (ENOIOCTL);
 	}
 }
 
 /*
  * Complete the capability mask of a pps_state and clear its ABI flag.
  *
  * PPS_TSFMT_TSPEC and PPS_CANWAIT are always advertised; each capture
  * capability that is set also enables the matching offset capability.
  */
 void
 pps_init(struct pps_state *pps)
 {
 
 	pps->ppscap |= PPS_TSFMT_TSPEC | PPS_CANWAIT;
 	if ((pps->ppscap & PPS_CAPTUREASSERT) != 0) {
 		pps->ppscap |= PPS_OFFSETASSERT;
 	}
 	if ((pps->ppscap & PPS_CAPTURECLEAR) != 0) {
 		pps->ppscap |= PPS_OFFSETCLEAR;
 	}
 #ifdef FFCLOCK
 	pps->ppscap |= PPS_TSCLK_MASK;
 #endif
 	/* Callers of this entry point are treated as not ABI-aware. */
 	pps->kcmode &= ~KCMODE_ABIFLAG;
 }
 
 /*
  * ABI-aware variant of pps_init(): after the common initialization, a
  * driver that declares a positive driver_abi gets KCMODE_ABIFLAG set and
  * is told the kernel's PPS_ABI_VERSION through kernel_abi.
  */
 void
 pps_init_abi(struct pps_state *pps)
 {
 
 	pps_init(pps);
 	if (pps->driver_abi <= 0)
 		return;
 	pps->kcmode |= KCMODE_ABIFLAG;
 	pps->kernel_abi = PPS_ABI_VERSION;
 }
 
 /*
  * Snapshot the current timehands pointer, its generation and the raw
  * counter value into 'pps' for later processing by pps_event().
  *
  * If the generation changed while the counter was being read, capgen is
  * set to 0, which pps_event() treats as an unusable capture.
  */
 void
 pps_capture(struct pps_state *pps)
 {
 	struct timehands *th;
 
 	KASSERT(pps != NULL, ("NULL pps pointer in pps_capture"));
 	th = timehands;
 	pps->capgen = atomic_load_acq_int(&th->th_generation);
 	pps->capth = th;
 #ifdef FFCLOCK
 	pps->capffth = fftimehands;
 #endif
 	pps->capcount = th->th_counter->tc_get_timecount(th->th_counter);
 	/* Re-check the generation only after the counter read is done. */
 	atomic_thread_fence_acq();
 	if (pps->capgen != th->th_generation)
 		pps->capgen = 0;
 }
 
 /*
  * Turn the counter value recorded by pps_capture() into a timestamp for
  * the given event (PPS_CAPTUREASSERT, otherwise treated as clear), store
  * it in the pps_state, bump the matching sequence number and wake up any
  * thread sleeping in pps_fetch().
  *
  * The event is silently dropped when the mode does not request it, when
  * the captured timehands generation is no longer current, or when the
  * timecounter differs from the one used for the previous event.
  */
 void
 pps_event(struct pps_state *pps, int event)
 {
 	struct bintime bt;
 	struct timespec ts, *tsp, *osp;
 	u_int tcount, *pcount;
 	int foff;
 	pps_seq_t *pseq;
 #ifdef FFCLOCK
 	struct timespec *tsp_ffc;
 	pps_seq_t *pseq_ffc;
 	ffcounter *ffcount;
 #endif
 #ifdef PPS_SYNC
 	int fhard;
 #endif
 
 	KASSERT(pps != NULL, ("NULL pps pointer in pps_event"));
 	/* Nothing to do if not currently set to capture this event type. */
 	if ((event & pps->ppsparam.mode) == 0)
 		return;
 	/* If the timecounter was wound up underneath us, bail out. */
 	if (pps->capgen == 0 || pps->capgen !=
 	    atomic_load_acq_int(&pps->capth->th_generation))
 		return;
 
 	/* Things would be easier with arrays. */
 	/* Select the assert or clear set of destination fields. */
 	if (event == PPS_CAPTUREASSERT) {
 		tsp = &pps->ppsinfo.assert_timestamp;
 		osp = &pps->ppsparam.assert_offset;
 		foff = pps->ppsparam.mode & PPS_OFFSETASSERT;
 #ifdef PPS_SYNC
 		fhard = pps->kcmode & PPS_CAPTUREASSERT;
 #endif
 		pcount = &pps->ppscount[0];
 		pseq = &pps->ppsinfo.assert_sequence;
 #ifdef FFCLOCK
 		ffcount = &pps->ppsinfo_ffc.assert_ffcount;
 		tsp_ffc = &pps->ppsinfo_ffc.assert_timestamp;
 		pseq_ffc = &pps->ppsinfo_ffc.assert_sequence;
 #endif
 	} else {
 		tsp = &pps->ppsinfo.clear_timestamp;
 		osp = &pps->ppsparam.clear_offset;
 		foff = pps->ppsparam.mode & PPS_OFFSETCLEAR;
 #ifdef PPS_SYNC
 		fhard = pps->kcmode & PPS_CAPTURECLEAR;
 #endif
 		pcount = &pps->ppscount[1];
 		pseq = &pps->ppsinfo.clear_sequence;
 #ifdef FFCLOCK
 		ffcount = &pps->ppsinfo_ffc.clear_ffcount;
 		tsp_ffc = &pps->ppsinfo_ffc.clear_timestamp;
 		pseq_ffc = &pps->ppsinfo_ffc.clear_sequence;
 #endif
 	}
 
 	/*
 	 * If the timecounter changed, we cannot compare the count values, so
 	 * we have to drop the rest of the PPS-stuff until the next event.
 	 */
 	if (pps->ppstc != pps->capth->th_counter) {
 		pps->ppstc = pps->capth->th_counter;
 		*pcount = pps->capcount;
 		pps->ppscount[2] = pps->capcount;
 		return;
 	}
 
 	/* Convert the count to a timespec. */
 	tcount = pps->capcount - pps->capth->th_offset_count;
 	tcount &= pps->capth->th_counter->tc_counter_mask;
 	bt = pps->capth->th_bintime;
 	bintime_addx(&bt, pps->capth->th_scale * tcount);
 	bintime2timespec(&bt, &ts);
 
 	/* If the timecounter was wound up underneath us, bail out. */
 	atomic_thread_fence_acq();
 	if (pps->capgen != pps->capth->th_generation)
 		return;
 
 	/* The capture is valid: publish count, sequence and timestamp. */
 	*pcount = pps->capcount;
 	(*pseq)++;
 	*tsp = ts;
 
 	/* Apply the configured offset and renormalize a negative tv_nsec. */
 	if (foff) {
-		timespecadd(tsp, osp);
+		timespecadd(tsp, osp, tsp);
 		if (tsp->tv_nsec < 0) {
 			tsp->tv_nsec += 1000000000;
 			tsp->tv_sec -= 1;
 		}
 	}
 
 #ifdef FFCLOCK
 	/* Record the same event against the feed-forward clock state. */
 	*ffcount = pps->capffth->tick_ffcount + tcount;
 	bt = pps->capffth->tick_time;
 	ffclock_convert_delta(tcount, pps->capffth->cest.period, &bt);
 	bintime_add(&bt, &pps->capffth->tick_time);
 	bintime2timespec(&bt, &ts);
 	(*pseq_ffc)++;
 	*tsp_ffc = ts;
 #endif
 
 #ifdef PPS_SYNC
 	if (fhard) {
 		uint64_t scale;
 
 		/*
 		 * Feed the NTP PLL/FLL.
 		 * The FLL wants to know how many (hardware) nanoseconds
 		 * elapsed since the previous event.
 		 */
 		tcount = pps->capcount - pps->ppscount[2];
 		pps->ppscount[2] = pps->capcount;
 		tcount &= pps->capth->th_counter->tc_counter_mask;
 		/* Unadjusted scale: 2^64 / tc_frequency, computed as 2^63/f*2. */
 		scale = (uint64_t)1 << 63;
 		scale /= pps->capth->th_counter->tc_frequency;
 		scale *= 2;
 		bt.sec = 0;
 		bt.frac = 0;
 		bintime_addx(&bt, scale * tcount);
 		bintime2timespec(&bt, &ts);
 		hardpps(tsp, ts.tv_nsec + 1000000000 * ts.tv_sec);
 	}
 #endif
 
 	/* Wakeup anyone sleeping in pps_fetch().  */
 	wakeup(pps);
 }
 
 /*
  * Timecounters need to be updated every so often to prevent the hardware
  * counter from overflowing.  Updating also recalculates the cached values
  * used by the get*() family of functions, so their precision depends on
  * the update frequency.
  */
 
 /* Number of accumulated ticks between two calls to tc_windup(). */
 static int tc_tick;
 SYSCTL_INT(_kern_timecounter, OID_AUTO, tick, CTLFLAG_RD, &tc_tick, 0,
     "Approximate number of hardclock ticks in a millisecond");
 
 /*
  * Account for 'cnt' elapsed ticks and wind the timecounter up once at
  * least tc_tick of them have accumulated.  If tc_setclock_mtx cannot be
  * taken without waiting, the call does nothing at all.
  */
 void
 tc_ticktock(int cnt)
 {
 	static int pending;
 
 	if (!mtx_trylock_spin(&tc_setclock_mtx))
 		return;
 
 	pending += cnt;
 	if (pending >= tc_tick) {
 		pending = 0;
 		tc_windup(NULL);
 	}
 	mtx_unlock_spin(&tc_setclock_mtx);
 }
 
 /*
  * Recompute tc_precexp and the bintime/sbintime thresholds from
  * tc_timepercentage.  A non-positive percentage selects the largest
  * representable thresholds.
  */
 static void __inline
 tc_adjprecision(void)
 {
 	int q;
 
 	if (tc_timepercentage <= 0) {
 		tc_precexp = 31;
 		bt_timethreshold.sec = INT_MAX;
 		bt_timethreshold.frac = ~(uint64_t)0;
 		bt_tickthreshold = bt_timethreshold;
 	} else {
 		/* q is 100 / tc_timepercentage, rounded up. */
 		q = (99 + tc_timepercentage) / tc_timepercentage;
 		tc_precexp = fls(q + (q >> 1)) - 1;
 		FREQ2BT(hz / tc_tick, &bt_timethreshold);
 		FREQ2BT(hz, &bt_tickthreshold);
 		bintime_shift(&bt_timethreshold, tc_precexp);
 		bintime_shift(&bt_tickthreshold, tc_precexp);
 	}
 
 	/* Keep the sbintime copies in step with the bintime values. */
 	sbt_timethreshold = bttosbt(bt_timethreshold);
 	sbt_tickthreshold = bttosbt(bt_tickthreshold);
 }
 
 /*
  * Sysctl handler that reports tc_timepercentage and, on write, stores the
  * new value and recomputes the derived thresholds.  While the system is
  * still cold only the value is stored.
  */
 static int
 sysctl_kern_timecounter_adjprecision(SYSCTL_HANDLER_ARGS)
 {
 	int pct, rv;
 
 	pct = tc_timepercentage;
 	rv = sysctl_handle_int(oidp, &pct, 0, req);
 	if (rv != 0 || req->newptr == NULL)
 		return (rv);
 
 	tc_timepercentage = pct;
 	if (!cold)
 		tc_adjprecision();
 	return (0);
 }
 
 static void
 inittimecounter(void *dummy)
 {
 	u_int p;
 	int tick_rate;
 
 	/*
 	 * Set the initial timeout to
 	 * max(1, <approx. number of hardclock ticks in a millisecond>).
 	 * People should probably not use the sysctl to set the timeout
 	 * to smaller than its initial value, since that value is the
 	 * smallest reasonable one.  If they want better timestamps they
 	 * should use the non-"get"* functions.
 	 */
 	if (hz > 1000)
 		tc_tick = (hz + 500) / 1000;
 	else
 		tc_tick = 1;
 	tc_adjprecision();
 	FREQ2BT(hz, &tick_bt);
 	tick_sbt = bttosbt(tick_bt);
 	tick_rate = hz / tc_tick;
 	FREQ2BT(tick_rate, &tc_tick_bt);
 	tc_tick_sbt = bttosbt(tc_tick_bt);
 	p = (tc_tick * 1000000) / hz;
 	printf("Timecounters tick every %d.%03u msec\n", p / 1000, p % 1000);
 
 #ifdef FFCLOCK
 	ffclock_init();
 #endif
 	/* warm up new timecounter (again) and get rolling. */
 	(void)timecounter->tc_get_timecount(timecounter);
 	(void)timecounter->tc_get_timecount(timecounter);
 	mtx_lock_spin(&tc_setclock_mtx);
 	tc_windup(NULL);
 	mtx_unlock_spin(&tc_setclock_mtx);
 }
 
 SYSINIT(timecounter, SI_SUB_CLOCKS, SI_ORDER_SECOND, inittimecounter, NULL);
 
 /* Cpu tick handling -------------------------------------------------*/
 
 /* Non-zero when the cputick rate may vary; see cpu_tick_calibrate(). */
 static int cpu_tick_variable;
 /* Rate in Hz reported by cpu_tickrate() for a non-default cputicker. */
 static uint64_t	cpu_tick_frequency;
 
 /*
  * State used by tc_cpu_ticks(), kept in DPCPU storage (presumably one
  * instance per CPU - confirm against the DPCPU definitions): the
  * accumulated wrap-around base and the last masked counter value seen.
  */
 DPCPU_DEFINE_STATIC(uint64_t, tc_cpu_ticks_base);
 DPCPU_DEFINE_STATIC(unsigned, tc_cpu_ticks_last);
 
 /*
  * Default cputicker: extends the current timecounter to a 64 bit tick
  * count by adding one full counter period to the base every time the
  * masked reading is smaller than the previous one.
  */
 static uint64_t
 tc_cpu_ticks(void)
 {
 	struct timecounter *tc;
 	uint64_t res, *base;
 	unsigned u, *last;
 
 	critical_enter();
 	base = DPCPU_PTR(tc_cpu_ticks_base);
 	last = DPCPU_PTR(tc_cpu_ticks_last);
 	tc = timehands->th_counter;
 	u = tc->tc_get_timecount(tc) & tc->tc_counter_mask;
 	/* A smaller value than last time means the counter wrapped. */
 	if (u < *last)
 		*base += (uint64_t)tc->tc_counter_mask + 1;
 	*last = u;
 	res = u + *base;
 	critical_exit();
 	return (res);
 }
 
 /*
  * Run cpu_tick_calibrate(0) at most once per value of time_uptime, and
  * only when the low four bits of time_uptime are all zero.
  */
 void
 cpu_tick_calibration(void)
 {
 	static time_t last_calib;
 
 	if (time_uptime == last_calib || (time_uptime & 0xf) != 0)
 		return;
 	cpu_tick_calibrate(0);
 	last_calib = time_uptime;
 }
 
 /*
  * This function gets called every 16 seconds on only one designated
  * CPU in the system from hardclock() via cpu_tick_calibration().
  *
  * Whenever the real time clock is stepped we get called with reset=1
  * to make sure we handle suspend/resume and similar events correctly.
  */
 
 /*
  * Estimate the cputick frequency from the cputicks and uptime elapsed
  * since the previous call, and raise cpu_tick_frequency if the estimate
  * is higher than the current value.  The stored frequency is never
  * lowered here.  A non-zero 'reset' only discards the previous sample.
  */
 static void
 cpu_tick_calibrate(int reset)
 {
 	static uint64_t c_last;
 	uint64_t c_this, c_delta;
 	static struct bintime  t_last;
 	struct bintime t_this, t_delta;
 	uint32_t divi;
 
 	if (reset) {
 		/* The clock was stepped, abort & reset */
 		t_last.sec = 0;
 		return;
 	}
 
 	/* we don't calibrate fixed rate cputicks */
 	if (!cpu_tick_variable)
 		return;
 
 	getbinuptime(&t_this);
 	c_this = cpu_ticks();
 	/* t_last.sec == 0 means there is no previous sample to compare to. */
 	if (t_last.sec != 0) {
 		c_delta = c_this - c_last;
 		t_delta = t_this;
 		bintime_sub(&t_delta, &t_last);
 		/*
 		 * Headroom:
 		 * 	2^(64-20) / 16[s] =
 		 * 	2^(44) / 16[s] =
 		 * 	17.592.186.044.416 / 16 =
 		 * 	1.099.511.627.776 [Hz]
 		 */
 		/*
 		 * divi is the elapsed time in units of 2^-20 seconds; with
 		 * c_delta scaled by 2^20 as well, the quotient is in Hz.
 		 */
 		divi = t_delta.sec << 20;
 		divi |= t_delta.frac >> (64 - 20);
 		c_delta <<= 20;
 		c_delta /= divi;
 		if (c_delta > cpu_tick_frequency) {
 			/* The "0 &&" keeps this message permanently disabled. */
 			if (0 && bootverbose)
 				printf("cpu_tick increased to %ju Hz\n",
 				    c_delta);
 			cpu_tick_frequency = c_delta;
 		}
 	}
 	c_last = c_this;
 	t_last = t_this;
 }
 
 /*
  * Install 'func' as the cputick source together with its frequency 'freq'
  * and variability flag 'var'.  A NULL 'func' reverts to tc_cpu_ticks and
  * leaves the stored frequency and variability flag untouched.
  */
 void
 set_cputicker(cpu_tick_f *func, uint64_t freq, unsigned var)
 {
 
 	if (func == NULL) {
 		cpu_ticks = tc_cpu_ticks;
 		return;
 	}
 	cpu_tick_frequency = freq;
 	cpu_tick_variable = var;
 	cpu_ticks = func;
 }
 
 /*
  * Return the rate of the active cputick source in Hz: the timecounter
  * frequency when tc_cpu_ticks is in use, cpu_tick_frequency otherwise.
  */
 uint64_t
 cpu_tickrate(void)
 {
 
 	return (cpu_ticks == tc_cpu_ticks ? tc_getfrequency() :
 	    cpu_tick_frequency);
 }
 
 /*
  * Convert a count of cputicks into microseconds.
  *
  * We need to be slightly careful here.  There is plenty of margin in
  * 64 bits of microseconds (half a million years) and in 64 bits at 4 GHz
  * (146 years), but a plain multiply before divide (tick * 10^6 / rate)
  * overflows once tick exceeds 2^64 / 10^6, which at 4 GHz is reached
  * after little more than an hour.  Scaling the rate down before dividing
  * avoids the overflow, but truncates the divisor and divides by zero
  * whenever the rate is smaller than the scaling factor.
  *
  * Instead, split the tick count into whole seconds and a sub-second
  * remainder.  The remainder is always smaller than the rate, so
  * multiplying it by 10^6 cannot overflow for any rate below
  * 2^64 / 10^6 Hz, and no significant bits are lost regardless of the
  * cputick rate or the length of the interval.
  */
 
 uint64_t
 cputick2usec(uint64_t tick)
 {
 	uint64_t rate;
 
 	rate = cpu_tickrate();
 	return ((tick / rate) * 1000000ULL +
 	    ((tick % rate) * 1000000ULL) / rate);
 }
 
 /*
  * Active cputick source.  Starts out as tc_cpu_ticks and is replaced
  * through set_cputicker().
  */
 cpu_tick_f	*cpu_ticks = tc_cpu_ticks;
 
 /* When zero, the tc_fill_vdso_timehands*() functions report "disabled". */
 static int vdso_th_enable = 1;
 
 /*
  * Sysctl handler for kern.timecounter.fast_gettime: exposes
  * vdso_th_enable through a local copy that is written back once
  * sysctl_handle_int() has succeeded.
  */
 static int
 sysctl_fast_gettime(SYSCTL_HANDLER_ARGS)
 {
 	int val, rv;
 
 	val = vdso_th_enable;
 	rv = sysctl_handle_int(oidp, &val, 0, req);
 	if (rv == 0)
 		vdso_th_enable = val;
 	return (rv);
 }
 SYSCTL_PROC(_kern_timecounter, OID_AUTO, fast_gettime,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     NULL, 0, sysctl_fast_gettime, "I", "Enable fast time of day");
 
 /*
  * Copy the fields of the current timehands that the vdso needs into
  * 'vdso_th' and let the timecounter add its own data if it provides a
  * tc_fill_vdso_timehands method.
  *
  * Returns the value reported by that method, or 0 when the timecounter
  * has no such method or vdso_th_enable is zero.
  */
 uint32_t
 tc_fill_vdso_timehands(struct vdso_timehands *vdso_th)
 {
 	struct timehands *th;
 	uint32_t enabled;
 
 	th = timehands;
 
 	vdso_th->th_scale = th->th_scale;
 	vdso_th->th_offset_count = th->th_offset_count;
 	vdso_th->th_counter_mask = th->th_counter->tc_counter_mask;
 	vdso_th->th_offset = th->th_offset;
 	vdso_th->th_boottime = th->th_boottime;
 
 	enabled = 0;
 	if (th->th_counter->tc_fill_vdso_timehands != NULL)
 		enabled = th->th_counter->tc_fill_vdso_timehands(vdso_th,
 		    th->th_counter);
 
 	/* The administrative switch overrides whatever the driver said. */
 	return (vdso_th_enable ? enabled : 0);
 }
 
 #ifdef COMPAT_FREEBSD32
 /*
  * COMPAT_FREEBSD32 counterpart of tc_fill_vdso_timehands().  The 64 bit
  * scale and fraction values are stored through uint64_t pointers aimed at
  * element 0 of the corresponding array members of the 32 bit layout.
  *
  * Returns the value reported by the timecounter's
  * tc_fill_vdso_timehands32 method, or 0 when there is no such method or
  * vdso_th_enable is zero.
  */
 uint32_t
 tc_fill_vdso_timehands32(struct vdso_timehands32 *vdso_th32)
 {
 	struct timehands *th;
 	uint32_t enabled;
 
 	th = timehands;
 
 	*(uint64_t *)&vdso_th32->th_scale[0] = th->th_scale;
 	vdso_th32->th_offset_count = th->th_offset_count;
 	vdso_th32->th_counter_mask = th->th_counter->tc_counter_mask;
 	vdso_th32->th_offset.sec = th->th_offset.sec;
 	*(uint64_t *)&vdso_th32->th_offset.frac[0] = th->th_offset.frac;
 	vdso_th32->th_boottime.sec = th->th_boottime.sec;
 	*(uint64_t *)&vdso_th32->th_boottime.frac[0] = th->th_boottime.frac;
 
 	enabled = 0;
 	if (th->th_counter->tc_fill_vdso_timehands32 != NULL)
 		enabled = th->th_counter->tc_fill_vdso_timehands32(vdso_th32,
 		    th->th_counter);
 
 	/* The administrative switch overrides whatever the driver said. */
 	return (vdso_th_enable ? enabled : 0);
 }
 #endif
Index: head/sys/kern/kern_time.c
===================================================================
--- head/sys/kern/kern_time.c	(revision 336913)
+++ head/sys/kern/kern_time.c	(revision 336914)
@@ -1,1763 +1,1765 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kern_time.c	8.1 (Berkeley) 6/10/93
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ktrace.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/limits.h>
 #include <sys/clock.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/sysproto.h>
 #include <sys/eventhandler.h>
 #include <sys/resourcevar.h>
 #include <sys/signalvar.h>
 #include <sys/kernel.h>
 #include <sys/sleepqueue.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/posix4.h>
 #include <sys/time.h>
 #include <sys/timers.h>
 #include <sys/timetc.h>
 #include <sys/vnode.h>
 #ifdef KTRACE
 #include <sys/ktrace.h>
 #endif
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 
 /* Number of entries in posix_clocks[]. */
 #define MAX_CLOCKS 	(CLOCK_MONOTONIC+1)
 /*
  * Encoding of CPU-time clock ids.  CPUCLOCK_BIT marks the id as a CPU-time
  * clock (it makes the id negative when viewed as an int, which is how
  * kern_clock_gettime() and kern_clock_getres() recognize it),
  * CPUCLOCK_PROCESS_BIT selects a process clock rather than a thread
  * clock, and the remaining bits carry the pid or tid.
  */
 #define CPUCLOCK_BIT		0x80000000
 #define CPUCLOCK_PROCESS_BIT	0x40000000
 #define CPUCLOCK_ID_MASK	(~(CPUCLOCK_BIT|CPUCLOCK_PROCESS_BIT))
 #define MAKE_THREAD_CPUCLOCK(tid)	(CPUCLOCK_BIT|(tid))
 #define MAKE_PROCESS_CPUCLOCK(pid)	\
 	(CPUCLOCK_BIT|CPUCLOCK_PROCESS_BIT|(pid))
 
 /* Per-clock method table, indexed by clock id through CLOCK_CALL(). */
 static struct kclock	posix_clocks[MAX_CLOCKS];
 /* UMA zone for struct itimer - presumably set up by itimer_start(); confirm. */
 static uma_zone_t	itimer_zone = NULL;
 
 /*
  * Time of day and interval timer support.
  *
  * These routines provide the kernel entry points to get and set
  * the time-of-day and per-process interval timers.  Subroutines
  * here provide support for adding and subtracting timeval structures
  * and decrementing interval timers, optionally reloading the interval
  * timers when they expire.
  */
 
 static int	settime(struct thread *, struct timeval *);
 static void	timevalfix(struct timeval *);
 static int	user_clock_nanosleep(struct thread *td, clockid_t clock_id,
 		    int flags, const struct timespec *ua_rqtp,
 		    struct timespec *ua_rmtp);
 
 static void	itimer_start(void);
 static int	itimer_init(void *, int, int);
 static void	itimer_fini(void *, int);
 static void	itimer_enter(struct itimer *);
 static void	itimer_leave(struct itimer *);
 static struct itimer *itimer_find(struct proc *, int);
 static void	itimers_alloc(struct proc *);
 static void	itimers_event_hook_exec(void *arg, struct proc *p, struct image_params *imgp);
 static void	itimers_event_hook_exit(void *arg, struct proc *p);
 static int	realtimer_create(struct itimer *);
 static int	realtimer_gettime(struct itimer *, struct itimerspec *);
 static int	realtimer_settime(struct itimer *, int,
 			struct itimerspec *, struct itimerspec *);
 static int	realtimer_delete(struct itimer *);
 static void	realtimer_clocktime(clockid_t, struct timespec *);
 static void	realtimer_expire(void *);
 
 int		register_posix_clock(int, struct kclock *);
 void		itimer_fire(struct itimer *it);
 int		itimespecfix(struct timespec *ts);
 
 #define CLOCK_CALL(clock, call, arglist)		\
 	((*posix_clocks[clock].call) arglist)
 
 SYSINIT(posix_timer, SI_SUB_P1003_1B, SI_ORDER_FIRST+4, itimer_start, NULL);
 
 
 /*
  * Set the system time of day to *tv and update the time-of-day hardware
  * via resettodr().
  *
  * When securelevel_gt(td->td_ucred, 1) is non-zero the request is
  * restricted as described below: *tv may be clamped in place, and EPERM
  * is returned for a second forward step within the same second.
  * Returns 0 otherwise.
  */
 static int
 settime(struct thread *td, struct timeval *tv)
 {
 	struct timeval delta, tv1, tv2;
 	static struct timeval maxtime, laststep;
 	struct timespec ts;
 
 	/* delta = requested time - current time. */
 	microtime(&tv1);
 	delta = *tv;
 	timevalsub(&delta, &tv1);
 
 	/*
 	 * If the system is secure, we do not allow the time to be 
 	 * set to a value earlier than 1 second less than the highest
 	 * time we have yet seen. The worst a miscreant can do in
 	 * this circumstance is "freeze" time. He couldn't go
 	 * back to the past.
 	 *
 	 * We similarly do not allow the clock to be stepped more
 	 * than one second, nor more than once per second. This allows
 	 * a miscreant to make the clock march double-time, but no worse.
 	 */
 	if (securelevel_gt(td->td_ucred, 1) != 0) {
 		if (delta.tv_sec < 0 || delta.tv_usec < 0) {
 			/*
 			 * Update maxtime to latest time we've seen.
 			 */
 			if (tv1.tv_sec > maxtime.tv_sec)
 				maxtime = tv1;
 			tv2 = *tv;
 			timevalsub(&tv2, &maxtime);
 			if (tv2.tv_sec < -1) {
 				tv->tv_sec = maxtime.tv_sec - 1;
 				printf("Time adjustment clamped to -1 second\n");
 			}
 		} else {
 			/* Only one forward step per second is permitted. */
 			if (tv1.tv_sec == laststep.tv_sec)
 				return (EPERM);
 			if (delta.tv_sec > 1) {
 				tv->tv_sec = tv1.tv_sec + 1;
 				printf("Time adjustment clamped to +1 second\n");
 			}
 			laststep = *tv;
 		}
 	}
 
 	/* Hand the (possibly clamped) time to the timecounter code. */
 	ts.tv_sec = tv->tv_sec;
 	ts.tv_nsec = tv->tv_usec * 1000;
 	tc_setclock(&ts);
 	resettodr();
 	return (0);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct clock_getcpuclockid2_args {
 	id_t id;
 	int which,
 	clockid_t *clock_id;
 };
 #endif
 /*
  * System call entry point: builds the CPU-time clock id with
  * kern_clock_getcpuclockid2() and copies it out to uap->clock_id.
  * Returns the first error encountered, or 0.
  */
 /* ARGSUSED */
 int
 sys_clock_getcpuclockid2(struct thread *td, struct clock_getcpuclockid2_args *uap)
 {
 	clockid_t clk_id;
 	int error;
 
 	error = kern_clock_getcpuclockid2(td, uap->id, uap->which, &clk_id);
 	if (error != 0)
 		return (error);
 	return (copyout(&clk_id, uap->clock_id, sizeof(clockid_t)));
 }
 
 /*
  * Build the CPU-time clock id for a process (CPUCLOCK_WHICH_PID) or a
  * thread (CPUCLOCK_WHICH_TID) and store it in *clk_id.  An 'id' of 0
  * refers to the calling process or thread.
  *
  * A non-zero pid is looked up with pget(); its error is returned if the
  * lookup fails.  A non-zero tid is used as given.  Any other value of
  * 'which' yields EINVAL.
  */
 int
 kern_clock_getcpuclockid2(struct thread *td, id_t id, int which,
     clockid_t *clk_id)
 {
 	struct proc *p;
 	pid_t pid;
 	lwpid_t tid;
 	int error;
 
 	if (which == CPUCLOCK_WHICH_TID) {
 		tid = id == 0 ? td->td_tid : id;
 		*clk_id = MAKE_THREAD_CPUCLOCK(tid);
 		return (0);
 	}
 	if (which != CPUCLOCK_WHICH_PID)
 		return (EINVAL);
 
 	if (id == 0) {
 		pid = td->td_proc->p_pid;
 	} else {
 		/* The lookup only validates the pid; drop the lock at once. */
 		error = pget(id, PGET_CANSEE | PGET_NOTID, &p);
 		if (error != 0)
 			return (error);
 		PROC_UNLOCK(p);
 		pid = id;
 	}
 	*clk_id = MAKE_PROCESS_CPUCLOCK(pid);
 	return (0);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct clock_gettime_args {
 	clockid_t clock_id;
 	struct	timespec *tp;
 };
 #endif
 /*
  * System call entry point: reads the requested clock with
  * kern_clock_gettime() and copies the result out to uap->tp.
  * Returns the first error encountered, or 0.
  */
 /* ARGSUSED */
 int
 sys_clock_gettime(struct thread *td, struct clock_gettime_args *uap)
 {
 	struct timespec ats;
 	int error;
 
 	error = kern_clock_gettime(td, uap->clock_id, &ats);
 	if (error != 0)
 		return (error);
 	return (copyout(&ats, uap->tp, sizeof(ats)));
 }
 
 /*
  * Convert a cputick count to a timespec.  The conversion goes through
  * cputick2usec(), so the result has microsecond granularity.
  */
 static inline void 
 cputick2timespec(uint64_t runtime, struct timespec *ats)
 {
 	uint64_t usec;
 
 	usec = cputick2usec(runtime);
 	ats->tv_sec = usec / 1000000;
 	ats->tv_nsec = (usec % 1000000) * 1000;
 }
 
 static void
 get_thread_cputime(struct thread *targettd, struct timespec *ats)
 {
 	uint64_t runtime, curtime, switchtime;
 
 	if (targettd == NULL) { /* current thread */
 		critical_enter();
 		switchtime = PCPU_GET(switchtime);
 		curtime = cpu_ticks();
 		runtime = curthread->td_runtime;
 		critical_exit();
 		runtime += curtime - switchtime;
 	} else {
 		thread_lock(targettd);
 		runtime = targettd->td_runtime;
 		thread_unlock(targettd);
 	}
 	cputick2timespec(runtime, ats);
 }
 
 /*
  * Store the CPU time consumed by process 'targetp' in *ats.  When the
  * caller belongs to that process, the ticks elapsed since the last
  * recorded switch time are included as well.
  */
 static void
 get_process_cputime(struct proc *targetp, struct timespec *ats)
 {
 	struct rusage ru;
 	uint64_t ticks;
 
 	PROC_STATLOCK(targetp);
 	/*
 	 * 'ru' itself is never read here; rufetch() is presumably called
 	 * for its effect on targetp->p_rux - confirm against rufetch().
 	 */
 	rufetch(targetp, &ru);
 	ticks = targetp->p_rux.rux_runtime;
 	if (targetp == curthread->td_proc)
 		ticks += cpu_ticks() - PCPU_GET(switchtime);
 	PROC_STATUNLOCK(targetp);
 	cputick2timespec(ticks, ats);
 }
 
 /*
  * Resolve a CPU-time clock id built by MAKE_THREAD_CPUCLOCK() or
  * MAKE_PROCESS_CPUCLOCK() and store the corresponding CPU time in *ats.
  *
  * Thread ids are looked up within the caller's own process only.
  * Returns 0 on success and EINVAL when the thread or process cannot be
  * found.
  */
 static int
 get_cputime(struct thread *td, clockid_t clock_id, struct timespec *ats)
 {
 	struct proc *p, *p2;
 	struct thread *td2;
 	lwpid_t tid;
 	pid_t pid;
 
 	p = td->td_proc;
 	if ((clock_id & CPUCLOCK_PROCESS_BIT) != 0) {
 		pid = clock_id & CPUCLOCK_ID_MASK;
 		/* Any lookup failure is reported as EINVAL. */
 		if (pget(pid, PGET_CANSEE, &p2) != 0)
 			return (EINVAL);
 		get_process_cputime(p2, ats);
 		PROC_UNLOCK(p2);
 		return (0);
 	}
 
 	tid = clock_id & CPUCLOCK_ID_MASK;
 	td2 = tdfind(tid, p->p_pid);
 	if (td2 == NULL)
 		return (EINVAL);
 	get_thread_cputime(td2, ats);
 	PROC_UNLOCK(td2->td_proc);
 	return (0);
 }
 
 /*
  * Read the clock identified by 'clock_id' into *ats.
  *
  * Returns 0 on success.  An unknown clock id that is non-negative when
  * viewed as an int yields EINVAL; a negative one is treated as a CPU-time
  * clock id and handed to get_cputime().
  */
 int
 kern_clock_gettime(struct thread *td, clockid_t clock_id, struct timespec *ats)
 {
 	struct timeval sys, user;
 	struct proc *p;
 
 	p = td->td_proc;
 	switch (clock_id) {
 	case CLOCK_REALTIME:		/* Default to precise. */
 	case CLOCK_REALTIME_PRECISE:
 		nanotime(ats);
 		break;
 	case CLOCK_REALTIME_FAST:
 		getnanotime(ats);
 		break;
 	case CLOCK_VIRTUAL:
 		/* Only the user time computed by calcru() is reported. */
 		PROC_LOCK(p);
 		PROC_STATLOCK(p);
 		calcru(p, &user, &sys);
 		PROC_STATUNLOCK(p);
 		PROC_UNLOCK(p);
 		TIMEVAL_TO_TIMESPEC(&user, ats);
 		break;
 	case CLOCK_PROF:
 		/* The sum of user and system time is reported. */
 		PROC_LOCK(p);
 		PROC_STATLOCK(p);
 		calcru(p, &user, &sys);
 		PROC_STATUNLOCK(p);
 		PROC_UNLOCK(p);
 		timevaladd(&user, &sys);
 		TIMEVAL_TO_TIMESPEC(&user, ats);
 		break;
 	case CLOCK_MONOTONIC:		/* Default to precise. */
 	case CLOCK_MONOTONIC_PRECISE:
 	case CLOCK_UPTIME:
 	case CLOCK_UPTIME_PRECISE:
 		nanouptime(ats);
 		break;
 	case CLOCK_UPTIME_FAST:
 	case CLOCK_MONOTONIC_FAST:
 		getnanouptime(ats);
 		break;
 	case CLOCK_SECOND:
 		ats->tv_sec = time_second;
 		ats->tv_nsec = 0;
 		break;
 	case CLOCK_THREAD_CPUTIME_ID:
 		/* NULL selects the calling thread. */
 		get_thread_cputime(NULL, ats);
 		break;
 	case CLOCK_PROCESS_CPUTIME_ID:
 		PROC_LOCK(p);
 		get_process_cputime(p, ats);
 		PROC_UNLOCK(p);
 		break;
 	default:
 		/* Ids with CPUCLOCK_BIT set are negative as an int. */
 		if ((int)clock_id >= 0)
 			return (EINVAL);
 		return (get_cputime(td, clock_id, ats));
 	}
 	return (0);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct clock_settime_args {
 	clockid_t clock_id;
 	const struct	timespec *tp;
 };
 #endif
 /* ARGSUSED */
 int
 sys_clock_settime(struct thread *td, struct clock_settime_args *uap)
 {
 	struct timespec ats;
 	int error;
 
 	if ((error = copyin(uap->tp, &ats, sizeof(ats))) != 0)
 		return (error);
 	return (kern_clock_settime(td, uap->clock_id, &ats));
 }
 
 /* When non-zero, the upper bound check on tv_sec below is skipped. */
 static int allow_insane_settime = 0;
 SYSCTL_INT(_debug, OID_AUTO, allow_insane_settime, CTLFLAG_RWTUN,
     &allow_insane_settime, 0,
     "do not perform possibly restrictive checks on settime(2) args");
 
 /*
  * Set CLOCK_REALTIME to *ats on behalf of thread 'td'.
  *
  * Returns the priv_check() error when the caller lacks
  * PRIV_CLOCK_SETTIME, EINVAL for any other clock id, for a negative or
  * out-of-range timespec and, unless allow_insane_settime is set, for a
  * tv_sec beyond 8000 * 365 days.  Otherwise returns the result of
  * settime().
  */
 int
 kern_clock_settime(struct thread *td, clockid_t clock_id, struct timespec *ats)
 {
 	struct timeval atv;
 	int error;
 
 	error = priv_check(td, PRIV_CLOCK_SETTIME);
 	if (error != 0)
 		return (error);
 	if (clock_id != CLOCK_REALTIME)
 		return (EINVAL);
 	if (ats->tv_sec < 0 || ats->tv_nsec < 0 ||
 	    ats->tv_nsec >= 1000000000)
 		return (EINVAL);
 	if (!allow_insane_settime && ats->tv_sec > 8000ULL * 365 * 24 * 60 * 60)
 		return (EINVAL);
 
 	/* XXX Don't convert nsec->usec and back */
 	TIMESPEC_TO_TIMEVAL(&atv, ats);
 	return (settime(td, &atv));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct clock_getres_args {
 	clockid_t clock_id;
 	struct	timespec *tp;
 };
 #endif
 /*
  * System call entry point: reports the resolution of uap->clock_id
  * through uap->tp.  A NULL uap->tp succeeds immediately, without the
  * clock id being examined.
  */
 int
 sys_clock_getres(struct thread *td, struct clock_getres_args *uap)
 {
 	struct timespec ts;
 	int error;
 
 	if (uap->tp == NULL)
 		return (0);
 
 	error = kern_clock_getres(td, uap->clock_id, &ts);
 	if (error != 0)
 		return (error);
 	return (copyout(&ts, uap->tp, sizeof(ts)));
 }
 
 /*
  * Report the resolution of clock_id in *ts.
  *
  * Returns 0 on success, or EINVAL for a non-negative clock id that has no
  * case below.  Ids that are negative when viewed as int share the
  * CPU-time clock path.  ts->tv_sec is zeroed even when EINVAL is returned.
  */
 int
 kern_clock_getres(struct thread *td, clockid_t clock_id, struct timespec *ts)
 {
 
 	ts->tv_sec = 0;
 	switch (clock_id) {
 	case CLOCK_SECOND:
 		ts->tv_sec = 1;
 		ts->tv_nsec = 0;
 		break;
 	case CLOCK_VIRTUAL:
 	case CLOCK_PROF:
 		/* Accurately round up here because we can do so cheaply. */
 		ts->tv_nsec = howmany(1000000000, hz);
 		break;
 	case CLOCK_REALTIME:
 	case CLOCK_REALTIME_FAST:
 	case CLOCK_REALTIME_PRECISE:
 	case CLOCK_MONOTONIC:
 	case CLOCK_MONOTONIC_FAST:
 	case CLOCK_MONOTONIC_PRECISE:
 	case CLOCK_UPTIME:
 	case CLOCK_UPTIME_FAST:
 	case CLOCK_UPTIME_PRECISE:
 		/*
 		 * Adding 1 is a cheap way of rounding the quotient up, which
 		 * matters most when truncation would yield 0.  Exact rounding
 		 * is not needed.
 		 */
 		ts->tv_nsec = 1000000000 / tc_getfrequency() + 1;
 		break;
 	default:
 		if ((int)clock_id >= 0)
 			return (EINVAL);
 		/* FALLTHROUGH */
 	case CLOCK_THREAD_CPUTIME_ID:
 	case CLOCK_PROCESS_CPUTIME_ID:
 		/* sync with cputick2usec */
 		ts->tv_nsec = 1000000 / cpu_tickrate();
 		if (ts->tv_nsec == 0)
 			ts->tv_nsec = 1000;
 		break;
 	}
 	return (0);
 }
 
 /*
  * Relative sleep on CLOCK_REALTIME; a thin front end for
  * kern_clock_nanosleep().
  */
 int
 kern_nanosleep(struct thread *td, struct timespec *rqt, struct timespec *rmt)
 {
 	int rv;
 
 	rv = kern_clock_nanosleep(td, CLOCK_REALTIME, TIMER_RELTIME, rqt, rmt);
 	return (rv);
 }
 
 /*
  * Wait channels for kern_clock_nanosleep(): each sleeper passes the
  * address of the element indexed by curcpu to tsleep_sbt().
  */
 static uint8_t nanowait[MAXCPU];
 
 /*
  * Sleep on clock_id for the interval in *rqt, or until the absolute time
  * in *rqt when TIMER_ABSTIME is set in flags.
  *
  * Returns EINVAL for an out-of-range tv_nsec, unknown flag bits,
  * CLOCK_THREAD_CPUTIME_ID or an unrecognised clock, and ENOTSUP for
  * CLOCK_VIRTUAL, CLOCK_PROF and CLOCK_PROCESS_CPUTIME_ID.  Returns 0 when
  * the requested time has passed.  Otherwise the tsleep_sbt() result is
  * returned with ERESTART mapped to EINTR; for a relative sleep with a
  * non-NULL rmt the time still outstanding is stored in *rmt.
  */
 int
 kern_clock_nanosleep(struct thread *td, clockid_t clock_id, int flags,
     const struct timespec *rqt, struct timespec *rmt)
 {
 	struct timespec ts, now;
 	sbintime_t sbt, sbtt, prec, tmp;
 	time_t over;
 	int error;
 	bool is_abs_real;
 
 	if (rqt->tv_nsec < 0 || rqt->tv_nsec >= 1000000000)
 		return (EINVAL);
 	if ((flags & ~TIMER_ABSTIME) != 0)
 		return (EINVAL);
 	/* is_abs_real is true only for an absolute sleep on a wall clock. */
 	switch (clock_id) {
 	case CLOCK_REALTIME:
 	case CLOCK_REALTIME_PRECISE:
 	case CLOCK_REALTIME_FAST:
 	case CLOCK_SECOND:
 		is_abs_real = (flags & TIMER_ABSTIME) != 0;
 		break;
 	case CLOCK_MONOTONIC:
 	case CLOCK_MONOTONIC_PRECISE:
 	case CLOCK_MONOTONIC_FAST:
 	case CLOCK_UPTIME:
 	case CLOCK_UPTIME_PRECISE:
 	case CLOCK_UPTIME_FAST:
 		is_abs_real = false;
 		break;
 	case CLOCK_VIRTUAL:
 	case CLOCK_PROF:
 	case CLOCK_PROCESS_CPUTIME_ID:
 		return (ENOTSUP);
 	case CLOCK_THREAD_CPUTIME_ID:
 	default:
 		return (EINVAL);
 	}
 	do {
 		ts = *rqt;
 		if ((flags & TIMER_ABSTIME) != 0) {
 			/*
 			 * Record the current RTC generation before sampling
 			 * the clock.  NOTE(review): td_rtcgen is presumably
 			 * zeroed elsewhere when the real-time clock is
 			 * stepped, which is what makes the loop retry --
 			 * confirm.
 			 */
 			if (is_abs_real)
 				td->td_rtcgen =
 				    atomic_load_acq_int(&rtc_generation);
 			error = kern_clock_gettime(td, clock_id, &now);
 			KASSERT(error == 0, ("kern_clock_gettime: %d", error));
 			/* Turn the absolute deadline into a relative delay. */
-			timespecsub(&ts, &now);
+			timespecsub(&ts, &now, &ts);
 		}
 		/* Nothing left to wait for: treat as an expired sleep. */
 		if (ts.tv_sec < 0 || (ts.tv_sec == 0 && ts.tv_nsec == 0)) {
 			error = EWOULDBLOCK;
 			break;
 		}
 		/*
 		 * Cap a single sleep at INT32_MAX / 2 seconds; the excess is
 		 * kept in 'over' and added back when the remaining time is
 		 * computed below.
 		 */
 		if (ts.tv_sec > INT32_MAX / 2) {
 			over = ts.tv_sec - INT32_MAX / 2;
 			ts.tv_sec -= over;
 		} else
 			over = 0;
 		tmp = tstosbt(ts);
 		prec = tmp;
 		prec >>= tc_precexp;
 		/* Build the absolute wakeup time sbt = current time + tmp. */
 		if (TIMESEL(&sbt, tmp))
 			sbt += tc_tick_sbt;
 		sbt += tmp;
 		error = tsleep_sbt(&nanowait[curcpu], PWAIT | PCATCH, "nanslp",
 		    sbt, prec, C_ABSOLUTE);
 	} while (error == 0 && is_abs_real && td->td_rtcgen == 0);
 	td->td_rtcgen = 0;
 	if (error != EWOULDBLOCK) {
 		/* Woken early: sample the time again to see what is left. */
 		if (TIMESEL(&sbtt, tmp))
 			sbtt += tc_tick_sbt;
 		if (sbtt >= sbt)
 			return (0);
 		if (error == ERESTART)
 			error = EINTR;
 		if ((flags & TIMER_ABSTIME) == 0 && rmt != NULL) {
 			ts = sbttots(sbt - sbtt);
 			ts.tv_sec += over;
 			if (ts.tv_sec < 0)
 				timespecclear(&ts);
 			*rmt = ts;
 		}
 		return (error);
 	}
 	return (0);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct nanosleep_args {
 	struct	timespec *rqtp;
 	struct	timespec *rmtp;
 };
 #endif
 /*
  * nanosleep(2) entry point: a relative CLOCK_REALTIME sleep using the
  * user-space request and remainder pointers.
  */
 /* ARGSUSED */
 int
 sys_nanosleep(struct thread *td, struct nanosleep_args *uap)
 {
 	int rv;
 
 	rv = user_clock_nanosleep(td, CLOCK_REALTIME, TIMER_RELTIME,
 	    uap->rqtp, uap->rmtp);
 	return (rv);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct clock_nanosleep_args {
 	clockid_t clock_id;
 	int 	  flags;
 	struct	timespec *rqtp;
 	struct	timespec *rmtp;
 };
 #endif
 /*
  * clock_nanosleep(2) entry point.  The result of user_clock_nanosleep()
  * is passed through kern_posix_error() before being returned.
  */
 /* ARGSUSED */
 int
 sys_clock_nanosleep(struct thread *td, struct clock_nanosleep_args *uap)
 {
 
 	return (kern_posix_error(td, user_clock_nanosleep(td, uap->clock_id,
 	    uap->flags, uap->rqtp, uap->rmtp)));
 }
 
 /*
  * Common body of nanosleep(2) and clock_nanosleep(2): copy the request in
  * from user space, sleep, and on EINTR copy the remaining time out.
  *
  * The remainder is only reported for relative sleeps with a non-NULL
  * ua_rmtp; in that case the destination is checked for writability before
  * sleeping and EFAULT is returned if it is not writable.  A copyout()
  * failure replaces the EINTR result.
  */
 static int
 user_clock_nanosleep(struct thread *td, clockid_t clock_id, int flags,
     const struct timespec *ua_rqtp, struct timespec *ua_rmtp)
 {
 	struct timespec remaining, request;
 	bool want_rmt;
 	int cprv, rv;
 
 	rv = copyin(ua_rqtp, &request, sizeof(request));
 	if (rv != 0)
 		return (rv);
 	want_rmt = ua_rmtp != NULL && (flags & TIMER_ABSTIME) == 0;
 	if (want_rmt && !useracc(ua_rmtp, sizeof(remaining), VM_PROT_WRITE))
 		return (EFAULT);
 	rv = kern_clock_nanosleep(td, clock_id, flags, &request, &remaining);
 	if (rv != EINTR || !want_rmt)
 		return (rv);
 	cprv = copyout(&remaining, ua_rmtp, sizeof(remaining));
 	return (cprv != 0 ? cprv : rv);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct gettimeofday_args {
 	struct	timeval *tp;
 	struct	timezone *tzp;
 };
 #endif
 /* ARGSUSED */
 int
 sys_gettimeofday(struct thread *td, struct gettimeofday_args *uap)
 {
 	struct timeval atv;
 	struct timezone rtz;
 	int error = 0;
 
 	if (uap->tp) {
 		microtime(&atv);
 		error = copyout(&atv, uap->tp, sizeof (atv));
 	}
 	if (error == 0 && uap->tzp != NULL) {
 		rtz.tz_minuteswest = tz_minuteswest;
 		rtz.tz_dsttime = tz_dsttime;
 		error = copyout(&rtz, uap->tzp, sizeof (rtz));
 	}
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct settimeofday_args {
 	struct	timeval *tv;
 	struct	timezone *tzp;
 };
 #endif
 /* ARGSUSED */
 int
 sys_settimeofday(struct thread *td, struct settimeofday_args *uap)
 {
 	struct timeval atv, *tvp;
 	struct timezone atz, *tzp;
 	int error;
 
 	if (uap->tv) {
 		error = copyin(uap->tv, &atv, sizeof(atv));
 		if (error)
 			return (error);
 		tvp = &atv;
 	} else
 		tvp = NULL;
 	if (uap->tzp) {
 		error = copyin(uap->tzp, &atz, sizeof(atz));
 		if (error)
 			return (error);
 		tzp = &atz;
 	} else
 		tzp = NULL;
 	return (kern_settimeofday(td, tvp, tzp));
 }
 
 int
 kern_settimeofday(struct thread *td, struct timeval *tv, struct timezone *tzp)
 {
 	int error;
 
 	error = priv_check(td, PRIV_SETTIMEOFDAY);
 	if (error)
 		return (error);
 	/* Verify all parameters before changing time. */
 	if (tv) {
 		if (tv->tv_usec < 0 || tv->tv_usec >= 1000000 ||
 		    tv->tv_sec < 0)
 			return (EINVAL);
 		error = settime(td, tv);
 	}
 	if (tzp && error == 0) {
 		tz_minuteswest = tzp->tz_minuteswest;
 		tz_dsttime = tzp->tz_dsttime;
 	}
 	return (error);
 }
 
 /*
  * Get value of an interval timer.  The process virtual and profiling virtual
  * time timers are kept in the p_stats area, since they can be swapped out.
  * These are kept internally in the way they are specified externally: in
  * time until they expire.
  *
  * The real time interval timer is kept in the process table slot for the
  * process, and its value (it_value) is kept as an absolute time rather than
  * as a delta, so that it is easy to keep periodic real-time signals from
  * drifting.
  *
  * Virtual time timers are processed in the hardclock() routine of
  * kern_clock.c.  The real time timer is processed by a timeout routine,
  * called from the softclock() routine.  Since a callout may be delayed in
  * real time due to interrupt processing in the system, it is possible for
  * the real time timeout routine (realitexpire, given below), to be delayed
  * in real time past when it is supposed to occur.  It does not suffice,
  * therefore, to reload the real timer .it_value from the real time timers
  * .it_interval.  Rather, we compute the next time in absolute time the timer
  * should go off.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct getitimer_args {
 	u_int	which;
 	struct	itimerval *itv;
 };
 #endif
 int
 sys_getitimer(struct thread *td, struct getitimer_args *uap)
 {
 	struct itimerval aitv;
 	int error;
 
 	error = kern_getitimer(td, uap->which, &aitv);
 	if (error != 0)
 		return (error);
 	return (copyout(&aitv, uap->itv, sizeof (struct itimerval)));
 }
 
 /*
  * Copy the current value of interval timer 'which' into *aitv.
  * Returns EINVAL when 'which' is greater than ITIMER_PROF, else 0.
  */
 int
 kern_getitimer(struct thread *td, u_int which, struct itimerval *aitv)
 {
 	struct timeval now;
 	struct proc *p;
 
 	if (which > ITIMER_PROF)
 		return (EINVAL);
 
 	p = td->td_proc;
 	if (which != ITIMER_REAL) {
 		PROC_ITIMLOCK(p);
 		*aitv = p->p_stats->p_timer[which];
 		PROC_ITIMUNLOCK(p);
 	} else {
 		PROC_LOCK(p);
 		*aitv = p->p_realtimer;
 		PROC_UNLOCK(p);
 		/*
 		 * The real timer stores an absolute expiry in .it_value.
 		 * Report it relative to the current uptime, or as zero if
 		 * it has already passed.
 		 */
 		if (timevalisset(&aitv->it_value)) {
 			microuptime(&now);
 			if (timevalcmp(&aitv->it_value, &now, <))
 				timevalclear(&aitv->it_value);
 			else
 				timevalsub(&aitv->it_value, &now);
 		}
 	}
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_STRUCT))
 		ktritimerval(aitv);
 #endif
 	return (0);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct setitimer_args {
 	u_int	which;
 	struct	itimerval *itv, *oitv;
 };
 #endif
 int
 sys_setitimer(struct thread *td, struct setitimer_args *uap)
 {
 	struct itimerval aitv, oitv;
 	int error;
 
 	if (uap->itv == NULL) {
 		uap->itv = uap->oitv;
 		return (sys_getitimer(td, (struct getitimer_args *)uap));
 	}
 
 	if ((error = copyin(uap->itv, &aitv, sizeof(struct itimerval))))
 		return (error);
 	error = kern_setitimer(td, uap->which, &aitv, &oitv);
 	if (error != 0 || uap->oitv == NULL)
 		return (error);
 	return (copyout(&oitv, uap->oitv, sizeof(struct itimerval)));
 }
 
 /*
  * Install *aitv as interval timer 'which' for td's process and return the
  * previous setting in *oitv.
  *
  * A NULL aitv turns the call into kern_getitimer().  Returns EINVAL when
  * 'which' is greater than ITIMER_PROF, when itimerfix() rejects a value,
  * or when a tv_sec exceeds INT32_MAX / 2; otherwise 0.  *aitv is modified:
  * small values are rounded up and, for ITIMER_REAL, it_value is converted
  * to an absolute uptime.
  */
 int
 kern_setitimer(struct thread *td, u_int which, struct itimerval *aitv,
     struct itimerval *oitv)
 {
 	struct proc *p = td->td_proc;
 	struct timeval ctv;
 	sbintime_t sbt, pr;
 
 	if (aitv == NULL)
 		return (kern_getitimer(td, which, oitv));
 
 	if (which > ITIMER_PROF)
 		return (EINVAL);
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_STRUCT))
 		ktritimerval(aitv);
 #endif
 	if (itimerfix(&aitv->it_value) ||
 	    aitv->it_value.tv_sec > INT32_MAX / 2)
 		return (EINVAL);
 	/* A zero it_value disarms the timer, so the interval is dropped too. */
 	if (!timevalisset(&aitv->it_value))
 		timevalclear(&aitv->it_interval);
 	else if (itimerfix(&aitv->it_interval) ||
 	    aitv->it_interval.tv_sec > INT32_MAX / 2)
 		return (EINVAL);
 
 	if (which == ITIMER_REAL) {
 		PROC_LOCK(p);
 		/* Cancel the callout of a previously armed timer. */
 		if (timevalisset(&p->p_realtimer.it_value))
 			callout_stop(&p->p_itcallout);
 		microuptime(&ctv);
 		if (timevalisset(&aitv->it_value)) {
 			/* Precision is derived from the relative delay. */
 			pr = tvtosbt(aitv->it_value) >> tc_precexp;
 			/* Store the expiry as an absolute uptime. */
 			timevaladd(&aitv->it_value, &ctv);
 			sbt = tvtosbt(aitv->it_value);
 			callout_reset_sbt(&p->p_itcallout, sbt, pr,
 			    realitexpire, p, C_ABSOLUTE);
 		}
 		*oitv = p->p_realtimer;
 		p->p_realtimer = *aitv;
 		PROC_UNLOCK(p);
 		/* Convert the old absolute expiry back to a relative value. */
 		if (timevalisset(&oitv->it_value)) {
 			if (timevalcmp(&oitv->it_value, &ctv, <))
 				timevalclear(&oitv->it_value);
 			else
 				timevalsub(&oitv->it_value, &ctv);
 		}
 	} else {
 		/* Raise non-zero sub-tick values to one tick. */
 		if (aitv->it_interval.tv_sec == 0 &&
 		    aitv->it_interval.tv_usec != 0 &&
 		    aitv->it_interval.tv_usec < tick)
 			aitv->it_interval.tv_usec = tick;
 		if (aitv->it_value.tv_sec == 0 &&
 		    aitv->it_value.tv_usec != 0 &&
 		    aitv->it_value.tv_usec < tick)
 			aitv->it_value.tv_usec = tick;
 		PROC_ITIMLOCK(p);
 		*oitv = p->p_stats->p_timer[which];
 		p->p_stats->p_timer[which] = *aitv;
 		PROC_ITIMUNLOCK(p);
 	}
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_STRUCT))
 		ktritimerval(oitv);
 #endif
 	return (0);
 }
 
 /*
  * Real interval timer expired:
  * send process whose timer expired an alarm signal.
  * If time is not set up to reload, then just return.
  * Else compute next time timer should go off which is > current time.
  * This is where delay in processing this timeout causes multiple
  * SIGALRM calls to be compressed into one.
  * tvtohz() always adds 1 to allow for the time until the next clock
  * interrupt being strictly less than 1 clock tick, but we don't want
  * that here since we want to appear to be in sync with the clock
  * interrupt even when we're delayed.
  */
 void
 realitexpire(void *arg)
 {
 	struct proc *p;
 	struct timeval ctv;
 	sbintime_t isbt;
 
 	p = (struct proc *)arg;
 	kern_psignal(p, SIGALRM);
 	if (!timevalisset(&p->p_realtimer.it_interval)) {
 		timevalclear(&p->p_realtimer.it_value);
 		if (p->p_flag & P_WEXIT)
 			wakeup(&p->p_itcallout);
 		return;
 	}
 	isbt = tvtosbt(p->p_realtimer.it_interval);
 	if (isbt >= sbt_timethreshold)
 		getmicrouptime(&ctv);
 	else
 		microuptime(&ctv);
 	do {
 		timevaladd(&p->p_realtimer.it_value,
 		    &p->p_realtimer.it_interval);
 	} while (timevalcmp(&p->p_realtimer.it_value, &ctv, <=));
 	callout_reset_sbt(&p->p_itcallout, tvtosbt(p->p_realtimer.it_value),
 	    isbt >> tc_precexp, realitexpire, p, C_ABSOLUTE);
 }
 
 /*
  * Check that a proposed value to load into the .it_value or
  * .it_interval part of an interval timer is acceptable, and
  * fix it to have at least minimal value (i.e. if it is less
  * than the resolution of the clock, round it up.)
  */
 int
 itimerfix(struct timeval *tv)
 {
 
 	if (tv->tv_sec < 0)
 		return (EINVAL);
 	if (tv->tv_usec < 0 || tv->tv_usec >= 1000000)
 		return (EINVAL);
 	/* Only a non-zero, sub-second value can need rounding up. */
 	if (tv->tv_sec != 0 || tv->tv_usec == 0)
 		return (0);
 	if (tv->tv_usec < (u_int)tick / 16)
 		tv->tv_usec = (u_int)tick / 16;
 	return (0);
 }
 
 /*
  * Decrement an interval timer by a specified number
  * of microseconds, which must be less than a second,
  * i.e. < 1000000.  If the timer expires, then reload
  * it.  In this case, carry over (usec - old value) to
  * reduce the value reloaded into the timer so that
  * the timer does not drift.  This routine assumes
  * that it is called in a context where the timers
  * on which it is operating cannot change in value.
  */
 /*
  * Returns 1 while the timer still has time left after the decrement, and
  * 0 when it expired (after reloading it_value from it_interval if an
  * interval is set, or leaving it zero otherwise).
  */
 int
 itimerdecr(struct itimerval *itp, int usec)
 {
 
 	if (itp->it_value.tv_usec < usec) {
 		if (itp->it_value.tv_sec == 0) {
 			/* expired, and already in next interval */
 			usec -= itp->it_value.tv_usec;
 			goto expire;
 		}
 		/* Borrow one second so the subtraction stays non-negative. */
 		itp->it_value.tv_usec += 1000000;
 		itp->it_value.tv_sec--;
 	}
 	itp->it_value.tv_usec -= usec;
 	/* Nothing is carried into the next interval on this path. */
 	usec = 0;
 	if (timevalisset(&itp->it_value))
 		return (1);
 	/* expired, exactly at end of interval */
 expire:
 	if (timevalisset(&itp->it_interval)) {
 		/* Reload, shortened by the overshoot left in usec. */
 		itp->it_value = itp->it_interval;
 		itp->it_value.tv_usec -= usec;
 		if (itp->it_value.tv_usec < 0) {
 			itp->it_value.tv_usec += 1000000;
 			itp->it_value.tv_sec--;
 		}
 	} else
 		itp->it_value.tv_usec = 0;		/* sec is already 0 */
 	return (0);
 }
 
 /*
  * Add and subtract routines for timevals.
  * N.B.: subtract routine doesn't deal with
  * results which are before the beginning,
  * it just gets very confused in this case.
  * Caveat emptor.
  */
 void
 timevaladd(struct timeval *t1, const struct timeval *t2)
 {
 
 	/* Add the fields independently, then let timevalfix() carry. */
 	t1->tv_usec += t2->tv_usec;
 	t1->tv_sec += t2->tv_sec;
 	timevalfix(t1);
 }
 
 void
 timevalsub(struct timeval *t1, const struct timeval *t2)
 {
 
 	/* Subtract the fields independently, then let timevalfix() borrow. */
 	t1->tv_usec -= t2->tv_usec;
 	t1->tv_sec -= t2->tv_sec;
 	timevalfix(t1);
 }
 
 /*
  * Bring tv_usec back into [0, 1000000) after a single add or subtract by
  * moving one second between the two fields.  Only one correction step is
  * applied.
  */
 static void
 timevalfix(struct timeval *t1)
 {
 
 	if (t1->tv_usec < 0) {
 		/* Borrow a second. */
 		t1->tv_usec += 1000000;
 		t1->tv_sec--;
 	} else if (t1->tv_usec >= 1000000) {
 		/* Carry a second. */
 		t1->tv_usec -= 1000000;
 		t1->tv_sec++;
 	}
 }
 
 /*
  * ratecheck(): simple time-based rate-limit checking.
  */
 int
 ratecheck(struct timeval *lasttime, const struct timeval *mininterval)
 {
 	struct timeval tv, delta;
 	int rv = 0;
 
 	getmicrouptime(&tv);		/* NB: 10ms precision */
 	delta = tv;
 	timevalsub(&delta, lasttime);
 
 	/*
 	 * check for 0,0 is so that the message will be seen at least once,
 	 * even if interval is huge.
 	 */
 	if (timevalcmp(&delta, mininterval, >=) ||
 	    (lasttime->tv_sec == 0 && lasttime->tv_usec == 0)) {
 		*lasttime = tv;
 		rv = 1;
 	}
 
 	return (rv);
 }
 
 /*
  * ppsratecheck(): packets (or events) per second limitation.
  *
  * Return 0 if the limit is to be enforced (e.g. the caller
  * should drop a packet because of the rate limitation).
  *
  * maxpps of 0 always causes zero to be returned.  maxpps of -1
  * always causes 1 to be returned; this effectively defeats rate
  * limiting.
  *
  * Note that we maintain the struct timeval for compatibility
  * with other bsd systems.  We reuse the storage and just monitor
  * clock ticks for minimal overhead.  
  */
 int
 ppsratecheck(struct timeval *lasttime, int *curpps, int maxpps)
 {
 	int now = ticks;
 
 	/* Still inside the current one-second window: count and compare. */
 	if (lasttime->tv_sec != 0 && (u_int)(now - lasttime->tv_sec) < hz) {
 		(*curpps)++;		/* NB: ignore potential overflow */
 		return (maxpps < 0 || *curpps <= maxpps);
 	}
 
 	/*
 	 * First call, or at least a second has gone by since lasttime was
 	 * updated: start a new window with this event counted.
 	 */
 	lasttime->tv_sec = now;
 	*curpps = 1;
 	return (maxpps != 0);
 }
 
 static void
 itimer_start(void)
 {
 	struct kclock rt_clock = {
 		.timer_create  = realtimer_create,
 		.timer_delete  = realtimer_delete,
 		.timer_settime = realtimer_settime,
 		.timer_gettime = realtimer_gettime,
 		.event_hook    = NULL
 	};
 
 	itimer_zone = uma_zcreate("itimer", sizeof(struct itimer),
 		NULL, NULL, itimer_init, itimer_fini, UMA_ALIGN_PTR, 0);
 	register_posix_clock(CLOCK_REALTIME,  &rt_clock);
 	register_posix_clock(CLOCK_MONOTONIC, &rt_clock);
 	p31b_setcfg(CTL_P1003_1B_TIMERS, 200112L);
 	p31b_setcfg(CTL_P1003_1B_DELAYTIMER_MAX, INT_MAX);
 	p31b_setcfg(CTL_P1003_1B_TIMER_MAX, TIMER_MAX);
 	EVENTHANDLER_REGISTER(process_exit, itimers_event_hook_exit,
 		(void *)ITIMER_EV_EXIT, EVENTHANDLER_PRI_ANY);
 	EVENTHANDLER_REGISTER(process_exec, itimers_event_hook_exec,
 		(void *)ITIMER_EV_EXEC, EVENTHANDLER_PRI_ANY);
 }
 
 /*
  * Copy *clk into the posix_clocks[] slot for clockid.
  * Returns 1 on success, or 0 (after logging) when clockid is out of range.
  */
 int
 register_posix_clock(int clockid, struct kclock *clk)
 {
 	if ((unsigned)clockid < MAX_CLOCKS) {
 		posix_clocks[clockid] = *clk;
 		return (1);
 	}
 	printf("%s: invalid clockid\n", __func__);
 	return (0);
 }
 
 /* Init callback given to uma_zcreate(): set up the per-timer mutex. */
 static int
 itimer_init(void *mem, int size, int flags)
 {
 	struct itimer *it = mem;
 
 	mtx_init(&it->it_mtx, "itimer lock", NULL, MTX_DEF);
 	return (0);
 }
 
 /* Fini callback given to uma_zcreate(): tear down the per-timer mutex. */
 static void
 itimer_fini(void *mem, int size)
 {
 	struct itimer *it = mem;
 
 	mtx_destroy(&it->it_mtx);
 }
 
 /* Take a use reference on the timer; the timer mutex must be held. */
 static void
 itimer_enter(struct itimer *it)
 {
 
 	mtx_assert(&it->it_mtx, MA_OWNED);
 	it->it_usecount += 1;
 }
 
 /*
  * Drop a use reference on the timer; the timer mutex must be held.  When
  * the last reference goes away and ITF_WANTED is set, wake up sleepers
  * on the timer.
  */
 static void
 itimer_leave(struct itimer *it)
 {
 
 	mtx_assert(&it->it_mtx, MA_OWNED);
 	KASSERT(it->it_usecount > 0, ("invalid it_usecount"));
 
 	it->it_usecount--;
 	if (it->it_usecount != 0)
 		return;
 	if ((it->it_flags & ITF_WANTED) != 0)
 		wakeup(it);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ktimer_create_args {
 	clockid_t clock_id;
 	struct sigevent * evp;
 	int * timerid;
 };
 #endif
 /*
  * ktimer_create entry point: copy in the optional sigevent, create the
  * timer and copy the new id out.  If the id cannot be copied out the
  * freshly created timer is deleted again.
  */
 int
 sys_ktimer_create(struct thread *td, struct ktimer_create_args *uap)
 {
 	struct sigevent ev, *evp = NULL;
 	int id, rv;
 
 	if (uap->evp != NULL) {
 		rv = copyin(uap->evp, &ev, sizeof(ev));
 		if (rv != 0)
 			return (rv);
 		evp = &ev;
 	}
 	rv = kern_ktimer_create(td, uap->clock_id, evp, &id, -1);
 	if (rv != 0)
 		return (rv);
 	rv = copyout(&id, uap->timerid, sizeof(int));
 	if (rv != 0)
 		kern_ktimer_delete(td, id);
 	return (rv);
 }
 
 /*
  * Create a timer on clock_id for td's process and store its id in
  * *timerid.
  *
  * evp, when non-NULL, selects the notification; otherwise a signal chosen
  * from clock_id is used with the timer id as its value.  preset_id of -1
  * picks the first free slot from 3 upward; any other value (asserted to
  * be 0..2) requests that specific slot.
  *
  * Returns EINVAL for an unsupported clock or invalid sigevent, EAGAIN
  * when no slot is free, the clock's timer_create error, or 0.  Note that
  * when a preset slot is already occupied the new timer is discarded and 0
  * is returned without writing *timerid.
  */
 int
 kern_ktimer_create(struct thread *td, clockid_t clock_id, struct sigevent *evp,
     int *timerid, int preset_id)
 {
 	struct proc *p = td->td_proc;
 	struct itimer *it;
 	int id;
 	int error;
 
 	if (clock_id < 0 || clock_id >= MAX_CLOCKS)
 		return (EINVAL);
 
 	/* The clock must have registered timer operations. */
 	if (posix_clocks[clock_id].timer_create == NULL)
 		return (EINVAL);
 
 	if (evp != NULL) {
 		if (evp->sigev_notify != SIGEV_NONE &&
 		    evp->sigev_notify != SIGEV_SIGNAL &&
 		    evp->sigev_notify != SIGEV_THREAD_ID)
 			return (EINVAL);
 		if ((evp->sigev_notify == SIGEV_SIGNAL ||
 		     evp->sigev_notify == SIGEV_THREAD_ID) &&
 			!_SIG_VALID(evp->sigev_signo))
 			return (EINVAL);
 	}
 	
 	/* The per-process timer table is allocated on first use. */
 	if (p->p_itimers == NULL)
 		itimers_alloc(p);
 	
 	it = uma_zalloc(itimer_zone, M_WAITOK);
 	it->it_flags = 0;
 	it->it_usecount = 0;
 	it->it_active = 0;
 	timespecclear(&it->it_time.it_value);
 	timespecclear(&it->it_time.it_interval);
 	it->it_overrun = 0;
 	it->it_overrun_last = 0;
 	it->it_clockid = clock_id;
 	it->it_timerid = -1;
 	it->it_proc = p;
 	ksiginfo_init(&it->it_ksi);
 	it->it_ksi.ksi_flags |= KSI_INS | KSI_EXT;
 	error = CLOCK_CALL(clock_id, timer_create, (it));
 	if (error != 0)
 		goto out;
 
 	PROC_LOCK(p);
 	if (preset_id != -1) {
 		KASSERT(preset_id >= 0 && preset_id < 3, ("invalid preset_id"));
 		id = preset_id;
 		/* Slot already taken: drop the new timer, report success. */
 		if (p->p_itimers->its_timers[id] != NULL) {
 			PROC_UNLOCK(p);
 			error = 0;
 			goto out;
 		}
 	} else {
 		/*
 		 * Find a free timer slot, skipping those reserved
 		 * for setitimer().
 		 */
 		for (id = 3; id < TIMER_MAX; id++)
 			if (p->p_itimers->its_timers[id] == NULL)
 				break;
 		if (id == TIMER_MAX) {
 			PROC_UNLOCK(p);
 			error = EAGAIN;
 			goto out;
 		}
 	}
 	it->it_timerid = id;
 	p->p_itimers->its_timers[id] = it;
 	if (evp != NULL)
 		it->it_sigev = *evp;
 	else {
 		/* Default notification: a signal chosen by clock id. */
 		it->it_sigev.sigev_notify = SIGEV_SIGNAL;
 		switch (clock_id) {
 		default:
 		case CLOCK_REALTIME:
 			it->it_sigev.sigev_signo = SIGALRM;
 			break;
 		case CLOCK_VIRTUAL:
  			it->it_sigev.sigev_signo = SIGVTALRM;
 			break;
 		case CLOCK_PROF:
 			it->it_sigev.sigev_signo = SIGPROF;
 			break;
 		}
 		it->it_sigev.sigev_value.sival_int = id;
 	}
 
 	/* Pre-fill the queued-signal info used when the timer fires. */
 	if (it->it_sigev.sigev_notify == SIGEV_SIGNAL ||
 	    it->it_sigev.sigev_notify == SIGEV_THREAD_ID) {
 		it->it_ksi.ksi_signo = it->it_sigev.sigev_signo;
 		it->it_ksi.ksi_code = SI_TIMER;
 		it->it_ksi.ksi_value = it->it_sigev.sigev_value;
 		it->it_ksi.ksi_timerid = id;
 	}
 	PROC_UNLOCK(p);
 	*timerid = id;
 	return (0);
 
 	/* Shared cleanup: undo timer_create and free the timer. */
 out:
 	ITIMER_LOCK(it);
 	CLOCK_CALL(it->it_clockid, timer_delete, (it));
 	ITIMER_UNLOCK(it);
 	uma_zfree(itimer_zone, it);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ktimer_delete_args {
 	int timerid;
 };
 #endif
 /* ktimer_delete entry point; all work is done by kern_ktimer_delete(). */
 int
 sys_ktimer_delete(struct thread *td, struct ktimer_delete_args *uap)
 {
 	int rv;
 
 	rv = kern_ktimer_delete(td, uap->timerid);
 	return (rv);
 }
 
 /*
  * Look up timer 'timerid' in process p; the process lock must be held.
  *
  * Returns the timer with its own mutex held, or NULL (holding nothing
  * extra) when the process has no timer table, the id is out of range, the
  * slot is empty, or the timer is being deleted.
  */
 static struct itimer *
 itimer_find(struct proc *p, int timerid)
 {
 	struct itimer *it;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	if (p->p_itimers == NULL)
 		return (NULL);
 	if (timerid < 0 || timerid >= TIMER_MAX)
 		return (NULL);
 	it = p->p_itimers->its_timers[timerid];
 	if (it == NULL)
 		return (NULL);
 	ITIMER_LOCK(it);
 	if ((it->it_flags & ITF_DELETING) == 0)
 		return (it);
 	ITIMER_UNLOCK(it);
 	return (NULL);
 }
 
 /*
  * Delete timer 'timerid' of td's process.
  * Returns EINVAL when itimer_find() finds no such live timer, else 0.
  */
 int
 kern_ktimer_delete(struct thread *td, int timerid)
 {
 	struct proc *p = td->td_proc;
 	struct itimer *it;
 
 	PROC_LOCK(p);
 	it = itimer_find(p, timerid);
 	if (it == NULL) {
 		PROC_UNLOCK(p);
 		return (EINVAL);
 	}
 	/* itimer_find() returned with the timer's own mutex held. */
 	PROC_UNLOCK(p);
 
 	/* Mark as dying so later itimer_find() calls ignore this timer. */
 	it->it_flags |= ITF_DELETING;
 	/* Wait for current users; itimer_leave() wakes us via ITF_WANTED. */
 	while (it->it_usecount > 0) {
 		it->it_flags |= ITF_WANTED;
 		msleep(it, &it->it_mtx, PPAUSE, "itimer", 0);
 	}
 	it->it_flags &= ~ITF_WANTED;
 	CLOCK_CALL(it->it_clockid, timer_delete, (it));
 	ITIMER_UNLOCK(it);
 
 	PROC_LOCK(p);
 	/* Remove a still-queued signal before its ksiginfo is freed. */
 	if (KSI_ONQ(&it->it_ksi))
 		sigqueue_take(&it->it_ksi);
 	p->p_itimers->its_timers[timerid] = NULL;
 	PROC_UNLOCK(p);
 	uma_zfree(itimer_zone, it);
 	return (0);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ktimer_settime_args {
 	int timerid;
 	int flags;
 	const struct itimerspec * value;
 	struct itimerspec * ovalue;
 };
 #endif
 int
 sys_ktimer_settime(struct thread *td, struct ktimer_settime_args *uap)
 {
 	struct itimerspec val, oval, *ovalp;
 	int error;
 
 	error = copyin(uap->value, &val, sizeof(val));
 	if (error != 0)
 		return (error);
 	ovalp = uap->ovalue != NULL ? &oval : NULL;
 	error = kern_ktimer_settime(td, uap->timerid, uap->flags, &val, ovalp);
 	if (error == 0 && uap->ovalue != NULL)
 		error = copyout(ovalp, uap->ovalue, sizeof(*ovalp));
 	return (error);
 }
 
 /*
  * Arm or disarm timer timer_id through its clock's timer_settime
  * operation.  Ids below 3 are rejected.  Returns EINVAL when no such live
  * timer exists, otherwise the clock operation's result.
  */
 int
 kern_ktimer_settime(struct thread *td, int timer_id, int flags,
     struct itimerspec *val, struct itimerspec *oval)
 {
 	struct proc *p = td->td_proc;
 	struct itimer *it = NULL;
 	int rv;
 
 	PROC_LOCK(p);
 	if (timer_id >= 3)
 		it = itimer_find(p, timer_id);
 	PROC_UNLOCK(p);
 	if (it == NULL)
 		return (EINVAL);
 
 	/* itimer_find() handed the timer back with its mutex held. */
 	itimer_enter(it);
 	rv = CLOCK_CALL(it->it_clockid, timer_settime, (it, flags, val,
 	    oval));
 	itimer_leave(it);
 	ITIMER_UNLOCK(it);
 	return (rv);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ktimer_gettime_args {
 	int timerid;
 	struct itimerspec * value;
 };
 #endif
 int
 sys_ktimer_gettime(struct thread *td, struct ktimer_gettime_args *uap)
 {
 	struct itimerspec val;
 	int error;
 
 	error = kern_ktimer_gettime(td, uap->timerid, &val);
 	if (error == 0)
 		error = copyout(&val, uap->value, sizeof(val));
 	return (error);
 }
 
 /*
  * Read timer timer_id through its clock's timer_gettime operation.  Ids
  * below 3 are rejected.  Returns EINVAL when no such live timer exists,
  * otherwise the clock operation's result.
  */
 int
 kern_ktimer_gettime(struct thread *td, int timer_id, struct itimerspec *val)
 {
 	struct proc *p = td->td_proc;
 	struct itimer *it = NULL;
 	int rv;
 
 	PROC_LOCK(p);
 	if (timer_id >= 3)
 		it = itimer_find(p, timer_id);
 	PROC_UNLOCK(p);
 	if (it == NULL)
 		return (EINVAL);
 
 	/* itimer_find() handed the timer back with its mutex held. */
 	itimer_enter(it);
 	rv = CLOCK_CALL(it->it_clockid, timer_gettime, (it, val));
 	itimer_leave(it);
 	ITIMER_UNLOCK(it);
 	return (rv);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ktimer_getoverrun_args {
 	int timerid;
 };
 #endif
 /*
  * ktimer_getoverrun entry point; all work is done by
  * kern_ktimer_getoverrun(), which stores the overrun count in
  * td->td_retval[0].
  *
  * The fallback argument structure above carries the same tag as the
  * parameter type below, so the function still sees a complete type when
  * _SYS_SYSPROTO_H_ is not defined.
  */
 int
 sys_ktimer_getoverrun(struct thread *td, struct ktimer_getoverrun_args *uap)
 {
 
 	return (kern_ktimer_getoverrun(td, uap->timerid));
 }
 
 /*
  * Store the last delivered overrun count of timer timer_id in
  * td->td_retval[0].  Ids below 3 are rejected.  Returns EINVAL when no
  * such live timer exists, else 0.
  */
 int
 kern_ktimer_getoverrun(struct thread *td, int timer_id)
 {
 	struct proc *p = td->td_proc;
 	struct itimer *it = NULL;
 
 	PROC_LOCK(p);
 	if (timer_id >= 3)
 		it = itimer_find(p, timer_id);
 	if (it == NULL) {
 		PROC_UNLOCK(p);
 		return (EINVAL);
 	}
 	td->td_retval[0] = it->it_overrun_last;
 	ITIMER_UNLOCK(it);
 	PROC_UNLOCK(p);
 	return (0);
 }
 
 /*
  * timer_create operation for the real-time clocks: tie the timer's
  * callout to the timer's own mutex.  Always returns 0.
  */
 static int
 realtimer_create(struct itimer *it)
 {
 
 	callout_init_mtx(&it->it_callout, &it->it_mtx, 0);
 
 	return (0);
 }
 
 /*
  * timer_delete operation for the real-time clocks.  Called with the timer
  * mutex held and returns with it held, but drops it while the callout is
  * drained.  Always returns 0.
  */
 static int
 realtimer_delete(struct itimer *it)
 {
 	mtx_assert(&it->it_mtx, MA_OWNED);
 	
 	/*
 	 * clear timer's value and interval to tell realtimer_expire
 	 * to not rearm the timer.
 	 */
 	timespecclear(&it->it_time.it_value);
 	timespecclear(&it->it_time.it_interval);
 	/*
 	 * NOTE(review): the mutex is presumably released because
 	 * callout_drain() can wait for a running handler that itself
 	 * needs it_mtx -- confirm against callout(9).
 	 */
 	ITIMER_UNLOCK(it);
 	callout_drain(&it->it_callout);
 	ITIMER_LOCK(it);
 	return (0);
 }
 
 /*
  * timer_gettime operation for the real-time clocks; the timer mutex must
  * be held.  Copies the timer setting into *ovalue with it_value converted
  * from the stored absolute time to time remaining.  An armed timer whose
  * expiry is not in the future is reported as 1 ns remaining rather than
  * zero or negative.  Always returns 0.
  */
 static int
 realtimer_gettime(struct itimer *it, struct itimerspec *ovalue)
 {
 	struct timespec cts;
 
 	mtx_assert(&it->it_mtx, MA_OWNED);
 
 	realtimer_clocktime(it->it_clockid, &cts);
 	*ovalue = it->it_time;
 	/* A zero it_value means disarmed and is returned unchanged. */
 	if (ovalue->it_value.tv_sec != 0 || ovalue->it_value.tv_nsec != 0) {
-		timespecsub(&ovalue->it_value, &cts);
+		timespecsub(&ovalue->it_value, &cts, &ovalue->it_value);
 		if (ovalue->it_value.tv_sec < 0 ||
 		    (ovalue->it_value.tv_sec == 0 &&
 		     ovalue->it_value.tv_nsec == 0)) {
 			ovalue->it_value.tv_sec  = 0;
 			ovalue->it_value.tv_nsec = 1;
 		}
 	}
 	return (0);
 }
 
 /*
  * timer_settime operation for the real-time clocks; the timer mutex must
  * be held.
  *
  * Validates a private copy of *value with itimespecfix() (EINVAL on
  * failure), optionally reports the previous setting through *ovalue, then
  * stores the new setting and starts or stops the callout.  A zero
  * it_value disarms the timer and clears the interval.  Returns 0 on
  * success.
  */
 static int
 realtimer_settime(struct itimer *it, int flags,
 	struct itimerspec *value, struct itimerspec *ovalue)
 {
 	struct timespec cts, ts;
 	struct timeval tv;
 	struct itimerspec val;
 
 	mtx_assert(&it->it_mtx, MA_OWNED);
 
 	val = *value;
 	if (itimespecfix(&val.it_value))
 		return (EINVAL);
 
 	if (timespecisset(&val.it_value)) {
 		if (itimespecfix(&val.it_interval))
 			return (EINVAL);
 	} else {
 		timespecclear(&val.it_interval);
 	}
 	
 	/* Capture the old setting before it is overwritten. */
 	if (ovalue != NULL)
 		realtimer_gettime(it, ovalue);
 
 	it->it_time = val;
 	if (timespecisset(&val.it_value)) {
 		realtimer_clocktime(it->it_clockid, &cts);
 		/* ts ends up holding the relative delay for the callout. */
 		ts = val.it_value;
 		if ((flags & TIMER_ABSTIME) == 0) {
 			/* Convert to absolute time. */
-			timespecadd(&it->it_time.it_value, &cts);
+			timespecadd(&it->it_time.it_value, &cts,
+				&it->it_time.it_value);
 		} else {
-			timespecsub(&ts, &cts);
+			timespecsub(&ts, &cts, &ts);
 			/*
 			 * We don't care if ts is negative, tvtohz will
 			 * fix it.
 			 */
 		}
 		TIMESPEC_TO_TIMEVAL(&tv, &ts);
 		callout_reset(&it->it_callout, tvtohz(&tv),
 			realtimer_expire, it);
 	} else {
 		callout_stop(&it->it_callout);
 	}
 
 	return (0);
 }
 
 /*
  * Read the current time for a real-time timer's clock: getnanotime() for
  * CLOCK_REALTIME, getnanouptime() for anything else (CLOCK_MONOTONIC).
  */
 static void
 realtimer_clocktime(clockid_t id, struct timespec *ts)
 {
 	if (id != CLOCK_REALTIME) {
 		/* CLOCK_MONOTONIC */
 		getnanouptime(ts);
 		return;
 	}
 	getnanotime(ts);
 }
 
 /*
  * Move the accumulated overrun count of timer 'timerid' into *ksi: the
  * count is saved as it_overrun_last and then reset.  The process lock
  * must be held.  Returns EINVAL when no such live timer exists, else 0.
  */
 int
 itimer_accept(struct proc *p, int timerid, ksiginfo_t *ksi)
 {
 	struct itimer *it;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	it = itimer_find(p, timerid);
 	if (it == NULL)
 		return (EINVAL);
 
 	ksi->ksi_overrun = it->it_overrun;
 	it->it_overrun_last = it->it_overrun;
 	it->it_overrun = 0;
 	ITIMER_UNLOCK(it);
 	return (0);
 }
 
 /*
  * Validate a timespec destined for an interval timer.  Returns EINVAL for
  * a negative or out-of-range field.  A non-zero value shorter than one
  * tick (tick * 1000 ns) is raised to one tick.
  */
 int
 itimespecfix(struct timespec *ts)
 {
 
 	if (ts->tv_sec < 0)
 		return (EINVAL);
 	if (ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000)
 		return (EINVAL);
 	if (ts->tv_sec != 0 || ts->tv_nsec == 0)
 		return (0);
 	if (ts->tv_nsec < tick * 1000)
 		ts->tv_nsec = tick * 1000;
 	return (0);
 }
 
 /*
  * Timeout callback for realtime timer.
  *
  * If the expiry time has been reached, a periodic timer is advanced past
  * the current time (counting each skipped period as an overrun) and
  * re-armed, a one-shot timer is cleared, and itimer_fire() is called.  If
  * the callout ran early, it is simply re-armed for the remaining time.
  */
 static void
 realtimer_expire(void *arg)
 {
 	struct timespec cts, ts;
 	struct timeval tv;
 	struct itimer *it;
 
 	it = (struct itimer *)arg;
 
 	realtimer_clocktime(it->it_clockid, &cts);
 	/* Only fire if time is reached. */
 	if (timespeccmp(&cts, &it->it_time.it_value, >=)) {
 		if (timespecisset(&it->it_time.it_interval)) {
 			timespecadd(&it->it_time.it_value,
-				    &it->it_time.it_interval);
+				    &it->it_time.it_interval,
+				    &it->it_time.it_value);
 			/* Skip missed periods; the count saturates. */
 			while (timespeccmp(&cts, &it->it_time.it_value, >=)) {
 				if (it->it_overrun < INT_MAX)
 					it->it_overrun++;
 				else
 					it->it_ksi.ksi_errno = ERANGE;
 				timespecadd(&it->it_time.it_value,
-					    &it->it_time.it_interval);
+					    &it->it_time.it_interval,
+					    &it->it_time.it_value);
 			}
 		} else {
 			/* single shot timer ? */
 			timespecclear(&it->it_time.it_value);
 		}
 		/* Re-arm for the next expiry if the timer is still set. */
 		if (timespecisset(&it->it_time.it_value)) {
-			ts = it->it_time.it_value;
-			timespecsub(&ts, &cts);
+			timespecsub(&it->it_time.it_value, &cts, &ts);
 			TIMESPEC_TO_TIMEVAL(&tv, &ts);
 			callout_reset(&it->it_callout, tvtohz(&tv),
 				 realtimer_expire, it);
 		}
 		/*
 		 * Hold a use reference across itimer_fire(), which is
 		 * called with the timer mutex dropped.
 		 */
 		itimer_enter(it);
 		ITIMER_UNLOCK(it);
 		itimer_fire(it);
 		ITIMER_LOCK(it);
 		itimer_leave(it);
 	} else if (timespecisset(&it->it_time.it_value)) {
 		/* Not due yet: sleep for the time that is left. */
 		ts = it->it_time.it_value;
-		timespecsub(&ts, &cts);
+		timespecsub(&ts, &cts, &ts);
 		TIMESPEC_TO_TIMEVAL(&tv, &ts);
 		callout_reset(&it->it_callout, tvtohz(&tv), realtimer_expire,
  			it);
 	}
 }
 
 /*
  * Deliver the notification for an expired timer.  Nothing is done unless
  * the timer notifies by SIGEV_SIGNAL or SIGEV_THREAD_ID.  If the timer's
  * signal is still queued from an earlier expiry, the overrun count is
  * bumped instead of sending another signal.
  */
 void
 itimer_fire(struct itimer *it)
 {
 	struct proc *p = it->it_proc;
 	struct thread *td;
 
 	if (it->it_sigev.sigev_notify == SIGEV_SIGNAL ||
 	    it->it_sigev.sigev_notify == SIGEV_THREAD_ID) {
 		/* No target thread could be found: disarm the timer. */
 		if (sigev_findtd(p, &it->it_sigev, &td) != 0) {
 			ITIMER_LOCK(it);
 			timespecclear(&it->it_time.it_value);
 			timespecclear(&it->it_time.it_interval);
 			callout_stop(&it->it_callout);
 			ITIMER_UNLOCK(it);
 			return;
 		}
 		if (!KSI_ONQ(&it->it_ksi)) {
 			it->it_ksi.ksi_errno = 0;
 			ksiginfo_set_sigev(&it->it_ksi, &it->it_sigev);
 			tdsendsignal(p, td, it->it_ksi.ksi_signo, &it->it_ksi);
 		} else {
 			if (it->it_overrun < INT_MAX)
 				it->it_overrun++;
 			else
 				it->it_ksi.ksi_errno = ERANGE;
 		}
 		/*
 		 * NOTE(review): there is no PROC_LOCK() in this function, so
 		 * sigev_findtd() presumably returns with the process locked
 		 * on success -- confirm.
 		 */
 		PROC_UNLOCK(p);
 	}
 }
 
 /*
  * Allocate and install the per-process timer table if the process does
  * not have one yet.  The table is built without the process lock and the
  * pointer is re-checked under the lock; if a table has appeared in the
  * meantime, the new one is freed.
  */
 static void
 itimers_alloc(struct proc *p)
 {
 	struct itimers *its;
 	int slot;
 
 	its = malloc(sizeof (struct itimers), M_SUBPROC, M_WAITOK | M_ZERO);
 	LIST_INIT(&its->its_virtual);
 	LIST_INIT(&its->its_prof);
 	TAILQ_INIT(&its->its_worklist);
 	for (slot = 0; slot < TIMER_MAX; slot++)
 		its->its_timers[slot] = NULL;
 
 	PROC_LOCK(p);
 	if (p->p_itimers != NULL) {
 		/* A table is already installed; discard ours. */
 		PROC_UNLOCK(p);
 		free(its, M_SUBPROC);
 		return;
 	}
 	p->p_itimers = its;
 	PROC_UNLOCK(p);
 }
 
 /*
  * process_exec event handler: same work as the exit hook.  'arg' carries
  * the event code the hook was registered with; the image parameters are
  * unused.
  */
 static void
 itimers_event_hook_exec(void *arg, struct proc *p,
     struct image_params *imgp __unused)
 {
 
 	itimers_event_hook_exit(arg, p);
 }
 
 /*
  * Clean up timers when some process events are being triggered.
  *
  * 'arg' is the event code (ITIMER_EV_EXEC or ITIMER_EV_EXIT).  Every
  * registered clock event hook is called first.  On exit all timers are
  * deleted; on exec only those from slot 3 upward.  The timer table itself
  * is freed once slots 0..2 are all empty.
  */
 static void
 itimers_event_hook_exit(void *arg, struct proc *p)
 {
 	struct itimers *its;
 	int event = (int)(intptr_t)arg;
 	int i;
 
 	its = p->p_itimers;
 	if (its == NULL)
 		return;
 
 	for (i = 0; i < MAX_CLOCKS; ++i) {
 		if (posix_clocks[i].event_hook != NULL)
 			CLOCK_CALL(i, event_hook, (p, i, event));
 	}
 
 	/*
 	 * According to susv3, XSI interval timers should be inherited
 	 * by new image.
 	 */
 	if (event == ITIMER_EV_EXEC)
 		i = 3;
 	else if (event == ITIMER_EV_EXIT)
 		i = 0;
 	else
 		panic("unhandled event");
 
 	while (i < TIMER_MAX) {
 		if (its->its_timers[i] != NULL)
 			kern_ktimer_delete(curthread, i);
 		++i;
 	}
 
 	if (its->its_timers[0] == NULL &&
 	    its->its_timers[1] == NULL &&
 	    its->its_timers[2] == NULL) {
 		free(its, M_SUBPROC);
 		p->p_itimers = NULL;
 	}
 }
Index: head/sys/kern/kern_umtx.c
===================================================================
--- head/sys/kern/kern_umtx.c	(revision 336913)
+++ head/sys/kern/kern_umtx.c	(revision 336914)
@@ -1,4567 +1,4565 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2015, 2016 The FreeBSD Foundation
  * Copyright (c) 2004, David Xu <davidxu@freebsd.org>
  * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
  * All rights reserved.
  *
  * Portions of this software were developed by Konstantin Belousov
  * under sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_umtx_profiling.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mman.h>
 #include <sys/mutex.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/resource.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
 #include <sys/sbuf.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/systm.h>
 #include <sys/sysproto.h>
 #include <sys/syscallsubr.h>
 #include <sys/taskqueue.h>
 #include <sys/time.h>
 #include <sys/eventhandler.h>
 #include <sys/umtx.h>
 
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 
 #include <machine/atomic.h>
 #include <machine/cpu.h>
 
 #ifdef COMPAT_FREEBSD32
 #include <compat/freebsd32/freebsd32_proto.h>
 #endif
 
 /*
  * Values for the "mode" argument of do_lock_normal(): _UMUTEX_TRY makes it
  * return EBUSY instead of sleeping, _UMUTEX_WAIT makes it return once the
  * owner word shows an unowned/contested/robust-dead state without
  * acquiring the mutex.
  */
 #define _UMUTEX_TRY		1
 #define _UMUTEX_WAIT		2
 
 #ifdef UMTX_PROFILING
 /*
  * True when the pair (w, f) is strictly greater than (sw, sf), comparing
  * the first members and using the second members only to break ties.
  */
 #define	UPROF_PERC_BIGGER(w, f, sw, sf)					\
 	(((w) > (sw)) || ((w) == (sw) && (f) > (sf)))
 #endif
 
 /* Priority inheritance mutex info. */
 struct umtx_pi {
 	/* Owner thread */
 	struct thread		*pi_owner;
 
 	/* Reference count */
 	int			pi_refcount;
 
 	/*
 	 * List entry linking this umtx_pi into a per-thread list of held
 	 * PI mutexes (presumably umtx_q.uq_pi_contested; confirm).
 	 */
 	TAILQ_ENTRY(umtx_pi)	pi_link;
 
 	/*
 	 * List entry in the hash chain (presumably umtxq_chain.uc_pi_list;
 	 * confirm).
 	 */
 	TAILQ_ENTRY(umtx_pi)	pi_hashlink;
 
 	/* List of waiters (struct umtx_q) */
 	TAILQ_HEAD(,umtx_q)	pi_blocked;
 
 	/* Key identifying the userland lock object */
 	struct umtx_key		pi_key;
 };
 
 /*
  * A user (waiter) of a userland synchronization object.  Functions in this
  * file obtain a thread's instance through td->td_umtxq.
  */
 struct umtx_q {
 	/* Linked list for the hash. */
 	TAILQ_ENTRY(umtx_q)	uq_link;
 
 	/* Umtx key. */
 	struct umtx_key		uq_key;
 
 	/* Umtx flags; UQF_UMTXQ is set while this entry is on a wait queue. */
 	int			uq_flags;
 #define UQF_UMTXQ	0x0001
 
 	/* The waiting thread (presumably the owner of this entry; confirm). */
 	struct thread		*uq_thread;
 
 	/*
 	 * PI mutex this thread is blocked on.  Readers may hold either the
 	 * chain lock or umtx_lock; writers must hold both the chain lock
 	 * and umtx_lock.
 	 */
 	struct umtx_pi		*uq_pi_blocked;
 
 	/*
 	 * Entry on a blocked list (presumably umtx_pi.pi_blocked; confirm).
 	 */
 	TAILQ_ENTRY(umtx_q)	uq_lockq;
 
 	/*
 	 * List of PI mutexes (struct umtx_pi) that other threads contend
 	 * with us for.  NOTE(review): confirm exact semantics.
 	 */
 	TAILQ_HEAD(,umtx_pi)	uq_pi_contested;
 
 	/* Inherited priority from PP mutex */
 	u_char			uq_inherited_pri;
 	
 	/*
 	 * Spare queue ready to be reused; NULL while this entry is queued
 	 * (umtxq_insert_queue() hands it to the chain).
 	 */
 	struct umtxq_queue	*uq_spare_queue;
 
 	/* The queue we are currently on, or NULL when not queued */
 	struct umtxq_queue	*uq_cur_queue;
 };
 
 TAILQ_HEAD(umtxq_head, umtx_q);
 
 /* Per-key wait-queue */
 struct umtxq_queue {
 	/* Waiters (struct umtx_q) queued for this key, in arrival order. */
 	struct umtxq_head	head;
 	/* Key this queue belongs to; matched by umtxq_queue_lookup(). */
 	struct umtx_key		key;
 	/* Linkage on a chain's uc_queue[] list or on its uc_spare_queue. */
 	LIST_ENTRY(umtxq_queue)	link;
 	/* Number of waiters currently on head. */
 	int			length;
 };
 
 LIST_HEAD(umtxq_list, umtxq_queue);
 
 /* Userland lock object's wait-queue chain */
 struct umtxq_chain {
 	/* Lock for this chain. */
 	struct mtx		uc_lock;
 
 	/* List of sleep queues, indexed by UMTX_*_QUEUE. */
 	struct umtxq_list	uc_queue[2];
 #define UMTX_SHARED_QUEUE	0
 #define UMTX_EXCLUSIVE_QUEUE	1
 
 	/* Spare umtxq_queue structures donated by queued waiters. */
 	LIST_HEAD(, umtxq_queue) uc_spare_queue;
 
 	/* Busy flag; set by umtxq_busy(), cleared by umtxq_unbusy(). */
 	char			uc_busy;
 
 	/* Number of threads sleeping in umtxq_busy() on this chain. */
 	int			uc_waiters;
 
 	/* All PI in the list */
 	TAILQ_HEAD(,umtx_pi)	uc_pi_list;
 
 #ifdef UMTX_PROFILING
 	/* Current number of per-key queues on uc_queue[] and its peak. */
 	u_int 			length;
 	u_int			max_length;
 #endif
 };
 
 #define	UMTXQ_LOCKED_ASSERT(uc)		mtx_assert(&(uc)->uc_lock, MA_OWNED)
 
 /*
  * Don't propagate time-sharing priority.  There is a security reason:
  * a user could simply introduce a PI mutex, let thread A lock the mutex,
  * and let another thread B block on it.  Because B is sleeping, its
  * priority will be boosted; this causes A's priority to be boosted via
  * priority propagation too, and it will never be lowered even if A is
  * using 100% CPU, which is unfair to other processes.
  */
 
 /*
  * User priority of td for propagation purposes: any value inside the
  * time-sharing range is replaced by PRI_MAX_TIMESHARE, everything else is
  * used unchanged.
  */
 #define UPRI(td)	(((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\
 			  (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\
 			 PRI_MAX_TIMESHARE : (td)->td_user_pri)
 
 /* Multiplier used by umtxq_hash(). */
 #define	GOLDEN_RATIO_PRIME	2654404609U
 /* Number of chains in each row of umtxq_chains[]; may be overridden. */
 #ifndef	UMTX_CHAINS
 #define	UMTX_CHAINS		512
 #endif
 /* Right shift applied to the multiplied hash value in umtxq_hash(). */
 #define	UMTX_SHIFTS		(__WORD_BIT - 9)
 
 /* Map the USYNC_PROCESS_SHARED flag to PROCESS_SHARE or THREAD_SHARE. */
 #define	GET_SHARE(flags)	\
     (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)
 
 /* Initial spin count used by umtxq_busy() before it goes to sleep. */
 #define BUSY_SPINS		200
 
 /*
  * Timeout state used by umtxq_sleep(): the deadline is kept as an absolute
  * time on clock "clockid" and compared against the last sampled time.
  */
 struct abs_timeout {
 	int clockid;		/* clock passed to kern_clock_gettime() */
 	bool is_abs_real;	/* TIMER_ABSTIME && CLOCK_REALTIME* */
 	struct timespec cur;	/* last value read by abs_timeout_update() */
 	struct timespec end;	/* absolute expiry time */
 };
 
 #ifdef COMPAT_FREEBSD32
 /*
  * Layout of a umutex built from fixed 32-bit members, compiled only under
  * COMPAT_FREEBSD32.  The static assertions below require its size and the
  * offset of m_spare[0] to equal those of the native struct umutex.
  */
 struct umutex32 {
 	volatile __lwpid_t	m_owner;	/* Owner of the mutex */
 	__uint32_t		m_flags;	/* Flags of the mutex */
 	__uint32_t		m_ceilings[2];	/* Priority protect ceiling */
 	__uint32_t		m_rb_lnk;	/* Robust linkage */
 	__uint32_t		m_pad;
 	__uint32_t		m_spare[2];
 };
 
 _Static_assert(sizeof(struct umutex) == sizeof(struct umutex32), "umutex32");
 _Static_assert(__offsetof(struct umutex, m_spare[0]) ==
     __offsetof(struct umutex32, m_spare[0]), "m_spare32");
 #endif
 
 /* Exported as the kern.ipc.umtx_vnode_persistent tunable. */
 int umtx_shm_vnobj_persistent = 0;
 SYSCTL_INT(_kern_ipc, OID_AUTO, umtx_vnode_persistent, CTLFLAG_RWTUN,
     &umtx_shm_vnobj_persistent, 0,
     "False forces destruction of umtx attached to file, on last close");
 /*
  * Exported as kern.ipc.umtx_max_robust.
  * NOTE(review): the sysctl description string is empty.
  */
 static int umtx_max_rb = 1000;
 SYSCTL_INT(_kern_ipc, OID_AUTO, umtx_max_robust, CTLFLAG_RWTUN,
     &umtx_max_rb, 0,
     "");
 
 /* UMA zone for struct umtx_pi, created in umtxq_sysinit(). */
 static uma_zone_t		umtx_pi_zone;
 /*
  * Hash chains.  umtxq_getchain() uses row 1 for keys whose type is
  * <= TYPE_SEM and row 0 for all other key types.
  */
 static struct umtxq_chain	umtxq_chains[2][UMTX_CHAINS];
 static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
 static int			umtx_pi_allocated;
 
 static SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
 SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
     &umtx_pi_allocated, 0, "Allocated umtx_pi");
 /*
  * Exported as debug.umtx.robust_faults_verbose.
  * NOTE(review): the sysctl description string is empty.
  */
 static int umtx_verbose_rb = 1;
 SYSCTL_INT(_debug_umtx, OID_AUTO, robust_faults_verbose, CTLFLAG_RWTUN,
     &umtx_verbose_rb, 0,
     "");
 
 #ifdef UMTX_PROFILING
 static long max_length;
 SYSCTL_LONG(_debug_umtx, OID_AUTO, max_length, CTLFLAG_RD, &max_length, 0, "max_length");
 static SYSCTL_NODE(_debug_umtx, OID_AUTO, chains, CTLFLAG_RD, 0, "umtx chain stats");
 #endif
 
 static void abs_timeout_update(struct abs_timeout *timo);
 
 static void umtx_shm_init(void);
 static void umtxq_sysinit(void *);
 static void umtxq_hash(struct umtx_key *key);
 static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
 static void umtxq_lock(struct umtx_key *key);
 static void umtxq_unlock(struct umtx_key *key);
 static void umtxq_busy(struct umtx_key *key);
 static void umtxq_unbusy(struct umtx_key *key);
 static void umtxq_insert_queue(struct umtx_q *uq, int q);
 static void umtxq_remove_queue(struct umtx_q *uq, int q);
 static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *);
 static int umtxq_count(struct umtx_key *key);
 static struct umtx_pi *umtx_pi_alloc(int);
 static void umtx_pi_free(struct umtx_pi *pi);
 static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags,
     bool rb);
 static void umtx_thread_cleanup(struct thread *td);
 static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
     struct image_params *imgp __unused);
 SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);
 
 /* Convenience forms of the queue operations acting on UMTX_SHARED_QUEUE. */
 #define umtxq_signal(key, nwake)	umtxq_signal_queue((key), (nwake), UMTX_SHARED_QUEUE)
 #define umtxq_insert(uq)	umtxq_insert_queue((uq), UMTX_SHARED_QUEUE)
 #define umtxq_remove(uq)	umtxq_remove_queue((uq), UMTX_SHARED_QUEUE)
 
 /* Global umtx mutex; initialized in umtxq_sysinit(). */
 static struct mtx umtx_lock;
 
 #ifdef UMTX_PROFILING
 /*
  * Create one sysctl node per chain index under debug.umtx.chains, named
  * after the decimal index, and export the max_length counter of both
  * chain rows beneath it as "max_length0" and "max_length1".
  */
 static void
 umtx_init_profiling(void)
 {
 	struct sysctl_oid *node;
 	char label[10];
 	int idx;
 
 	for (idx = 0; idx < UMTX_CHAINS; idx++) {
 		snprintf(label, sizeof(label), "%d", idx);
 		node = SYSCTL_ADD_NODE(NULL,
 		    SYSCTL_STATIC_CHILDREN(_debug_umtx_chains), OID_AUTO,
 		    label, CTLFLAG_RD, NULL, "umtx hash stats");
 		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(node), OID_AUTO,
 		    "max_length0", CTLFLAG_RD,
 		    &umtxq_chains[0][idx].max_length, 0, NULL);
 		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(node), OID_AUTO,
 		    "max_length1", CTLFLAG_RD,
 		    &umtxq_chains[1][idx].max_length, 0, NULL);
 	}
 }
 
 static int
 sysctl_debug_umtx_chains_peaks(SYSCTL_HANDLER_ARGS)
 {
 	char buf[512];
 	struct sbuf sb;
 	struct umtxq_chain *uc;
 	u_int fract, i, j, tot, whole;
 	u_int sf0, sf1, sf2, sf3, sf4;
 	u_int si0, si1, si2, si3, si4;
 	u_int sw0, sw1, sw2, sw3, sw4;
 
 	sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
 	for (i = 0; i < 2; i++) {
 		tot = 0;
 		for (j = 0; j < UMTX_CHAINS; ++j) {
 			uc = &umtxq_chains[i][j];
 			mtx_lock(&uc->uc_lock);
 			tot += uc->max_length;
 			mtx_unlock(&uc->uc_lock);
 		}
 		if (tot == 0)
 			sbuf_printf(&sb, "%u) Empty ", i);
 		else {
 			sf0 = sf1 = sf2 = sf3 = sf4 = 0;
 			si0 = si1 = si2 = si3 = si4 = 0;
 			sw0 = sw1 = sw2 = sw3 = sw4 = 0;
 			for (j = 0; j < UMTX_CHAINS; j++) {
 				uc = &umtxq_chains[i][j];
 				mtx_lock(&uc->uc_lock);
 				whole = uc->max_length * 100;
 				mtx_unlock(&uc->uc_lock);
 				fract = (whole % tot) * 100;
 				if (UPROF_PERC_BIGGER(whole, fract, sw0, sf0)) {
 					sf0 = fract;
 					si0 = j;
 					sw0 = whole;
 				} else if (UPROF_PERC_BIGGER(whole, fract, sw1,
 				    sf1)) {
 					sf1 = fract;
 					si1 = j;
 					sw1 = whole;
 				} else if (UPROF_PERC_BIGGER(whole, fract, sw2,
 				    sf2)) {
 					sf2 = fract;
 					si2 = j;
 					sw2 = whole;
 				} else if (UPROF_PERC_BIGGER(whole, fract, sw3,
 				    sf3)) {
 					sf3 = fract;
 					si3 = j;
 					sw3 = whole;
 				} else if (UPROF_PERC_BIGGER(whole, fract, sw4,
 				    sf4)) {
 					sf4 = fract;
 					si4 = j;
 					sw4 = whole;
 				}
 			}
 			sbuf_printf(&sb, "queue %u:\n", i);
 			sbuf_printf(&sb, "1st: %u.%u%% idx: %u\n", sw0 / tot,
 			    sf0 / tot, si0);
 			sbuf_printf(&sb, "2nd: %u.%u%% idx: %u\n", sw1 / tot,
 			    sf1 / tot, si1);
 			sbuf_printf(&sb, "3rd: %u.%u%% idx: %u\n", sw2 / tot,
 			    sf2 / tot, si2);
 			sbuf_printf(&sb, "4th: %u.%u%% idx: %u\n", sw3 / tot,
 			    sf3 / tot, si3);
 			sbuf_printf(&sb, "5th: %u.%u%% idx: %u\n", sw4 / tot,
 			    sf4 / tot, si4);
 		}
 	}
 	sbuf_trim(&sb);
 	sbuf_finish(&sb);
 	sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
 	sbuf_delete(&sb);
 	return (0);
 }
 
 static int
 sysctl_debug_umtx_chains_clear(SYSCTL_HANDLER_ARGS)
 {
 	struct umtxq_chain *uc;
 	u_int i, j;
 	int clear, error;
 
 	clear = 0;
 	error = sysctl_handle_int(oidp, &clear, 0, req);
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 
 	if (clear != 0) {
 		for (i = 0; i < 2; ++i) {
 			for (j = 0; j < UMTX_CHAINS; ++j) {
 				uc = &umtxq_chains[i][j];
 				mtx_lock(&uc->uc_lock);
 				uc->length = 0;
 				uc->max_length = 0;	
 				mtx_unlock(&uc->uc_lock);
 			}
 		}
 	}
 	return (0);
 }
 
 SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, clear,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0,
     sysctl_debug_umtx_chains_clear, "I", "Clear umtx chains statistics");
 SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, peaks,
     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0,
     sysctl_debug_umtx_chains_peaks, "A", "Highest peaks in chains max length");
 #endif
 
 /*
  * SYSINIT routine: create the umtx_pi zone, initialize every chain in both
  * rows of umtxq_chains[], set up profiling sysctls when configured,
  * initialize umtx_lock, register the process_exec handler and finally
  * initialize the shm part via umtx_shm_init().
  */
 static void
 umtxq_sysinit(void *arg __unused)
 {
 	struct umtxq_chain *uc;
 	int row, col;
 
 	umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	for (row = 0; row < 2; row++) {
 		for (col = 0; col < UMTX_CHAINS; col++) {
 			uc = &umtxq_chains[row][col];
 			mtx_init(&uc->uc_lock, "umtxql", NULL,
 			    MTX_DEF | MTX_DUPOK);
 			LIST_INIT(&uc->uc_queue[0]);
 			LIST_INIT(&uc->uc_queue[1]);
 			LIST_INIT(&uc->uc_spare_queue);
 			TAILQ_INIT(&uc->uc_pi_list);
 			uc->uc_busy = 0;
 			uc->uc_waiters = 0;
 #ifdef UMTX_PROFILING
 			uc->length = 0;
 			uc->max_length = 0;
 #endif
 		}
 	}
 #ifdef UMTX_PROFILING
 	umtx_init_profiling();
 #endif
 	mtx_init(&umtx_lock, "umtx lock", NULL, MTX_DEF);
 	EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
 	    EVENTHANDLER_PRI_ANY);
 	umtx_shm_init();
 }
 
 struct umtx_q *
 umtxq_alloc(void)
 {
 	struct umtx_q *uq;
 
 	uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
 	uq->uq_spare_queue = malloc(sizeof(struct umtxq_queue), M_UMTX,
 	    M_WAITOK | M_ZERO);
 	TAILQ_INIT(&uq->uq_spare_queue->head);
 	TAILQ_INIT(&uq->uq_pi_contested);
 	uq->uq_inherited_pri = PRI_MAX;
 	return (uq);
 }
 
 /*
  * Release a umtx_q obtained from umtxq_alloc() along with its spare queue.
  * The spare queue must be present (asserted), i.e. the entry must not be
  * sitting on a wait queue, since umtxq_insert_queue() clears the pointer.
  */
 void
 umtxq_free(struct umtx_q *uq)
 {
 
 	MPASS(uq->uq_spare_queue != NULL);
 	free(uq->uq_spare_queue, M_UMTX);
 	free(uq, M_UMTX);
 }
 
 /*
  * Compute key->hash, the chain index for a key: the two halves of the key
  * info are summed into an unsigned int, multiplied by GOLDEN_RATIO_PRIME,
  * shifted right by UMTX_SHIFTS and reduced modulo UMTX_CHAINS.
  */
 static inline void
 umtxq_hash(struct umtx_key *key)
 {
 	unsigned sum, mixed;
 
 	sum = (uintptr_t)key->info.both.a + key->info.both.b;
 	mixed = sum * GOLDEN_RATIO_PRIME;
 	key->hash = (mixed >> UMTX_SHIFTS) % UMTX_CHAINS;
 }
 
 /*
  * Return the chain a key hashes to.  Keys whose type is <= TYPE_SEM use
  * row 1 of umtxq_chains[], all other key types use row 0.
  */
 static inline struct umtxq_chain *
 umtxq_getchain(struct umtx_key *key)
 {
 	int row;
 
 	row = (key->type <= TYPE_SEM) ? 1 : 0;
 	return (&umtxq_chains[row][key->hash]);
 }
 
 /*
  * Acquire the mutex of the chain that the key hashes to.
  */
 static inline void
 umtxq_lock(struct umtx_key *key)
 {
 	struct umtxq_chain *chain = umtxq_getchain(key);
 
 	mtx_lock(&chain->uc_lock);
 }
 
 /*
  * Release the mutex of the chain that the key hashes to.
  */
 static inline void
 umtxq_unlock(struct umtx_key *key)
 {
 	struct umtxq_chain *chain = umtxq_getchain(key);
 
 	mtx_unlock(&chain->uc_lock);
 }
 
 /*
  * Set chain to busy state when following operation
  * may be blocked (kernel mutex can not be used).
  *
  * Must be called with the chain lock held; returns with it held and with
  * uc_busy set.  If the chain is already busy, an SMP kernel with more
  * than one CPU first drops the lock and spins up to BUSY_SPINS times
  * waiting for the flag to clear; after that the caller sleeps on the
  * chain until umtxq_unbusy() wakes it.
  */
 static inline void
 umtxq_busy(struct umtx_key *key)
 {
 	struct umtxq_chain *uc;
 
 	uc = umtxq_getchain(key);
 	mtx_assert(&uc->uc_lock, MA_OWNED);
 	if (uc->uc_busy) {
 #ifdef SMP
 		if (smp_cpus > 1) {
 			int count = BUSY_SPINS;
 			if (count > 0) {
 				/* uc_busy is polled without the chain lock. */
 				umtxq_unlock(key);
 				while (uc->uc_busy && --count > 0)
 					cpu_spinwait();
 				umtxq_lock(key);
 			}
 		}
 #endif
 		/* Re-check under the lock; sleep until the flag is clear. */
 		while (uc->uc_busy) {
 			uc->uc_waiters++;
 			msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
 			uc->uc_waiters--;
 		}
 	}
 	uc->uc_busy = 1;
 }
 
 /*
  * Clear the busy flag of the key's chain and, if any thread is sleeping in
  * umtxq_busy() on it, wake one of them.  The chain lock must be held and
  * the chain must currently be busy.
  */
 static inline void
 umtxq_unbusy(struct umtx_key *key)
 {
 	struct umtxq_chain *chain;
 
 	chain = umtxq_getchain(key);
 	mtx_assert(&chain->uc_lock, MA_OWNED);
 	KASSERT(chain->uc_busy != 0, ("not busy"));
 	chain->uc_busy = 0;
 	if (chain->uc_waiters != 0)
 		wakeup_one(chain);
 }
 
 /*
  * Variant of umtxq_unbusy() for callers that do not hold the chain lock:
  * takes the lock, clears the busy state and drops the lock again.
  */
 static inline void
 umtxq_unbusy_unlocked(struct umtx_key *key)
 {
 
 	umtxq_lock(key);
 	umtxq_unbusy(key);
 	umtxq_unlock(key);
 }
 
 static struct umtxq_queue *
 umtxq_queue_lookup(struct umtx_key *key, int q)
 {
 	struct umtxq_queue *uh;
 	struct umtxq_chain *uc;
 
 	uc = umtxq_getchain(key);
 	UMTXQ_LOCKED_ASSERT(uc);
 	LIST_FOREACH(uh, &uc->uc_queue[q], link) {
 		if (umtx_key_match(&uh->key, key))
 			return (uh);
 	}
 
 	return (NULL);
 }
 
 /*
  * Append uq to the wait queue for its key on list q of the key's chain.
  * The chain lock must be held and uq must not already be queued.
  *
  * If no queue exists for the key yet, uq's spare queue becomes that queue;
  * otherwise the spare is parked on the chain's spare list.  Either way uq
  * gives up its spare until umtxq_remove_queue() hands one back.
  */
 static inline void
 umtxq_insert_queue(struct umtx_q *uq, int q)
 {
 	struct umtxq_chain *uc;
 	struct umtxq_queue *target;
 
 	uc = umtxq_getchain(&uq->uq_key);
 	UMTXQ_LOCKED_ASSERT(uc);
 	KASSERT((uq->uq_flags & UQF_UMTXQ) == 0, ("umtx_q is already on queue"));
 
 	target = umtxq_queue_lookup(&uq->uq_key, q);
 	if (target == NULL) {
 		/* First waiter for this key: promote our spare queue. */
 		target = uq->uq_spare_queue;
 		target->key = uq->uq_key;
 		LIST_INSERT_HEAD(&uc->uc_queue[q], target, link);
 #ifdef UMTX_PROFILING
 		uc->length++;
 		if (uc->length > uc->max_length) {
 			uc->max_length = uc->length;
 			if (uc->max_length > max_length)
 				max_length = uc->max_length;
 		}
 #endif
 	} else {
 		LIST_INSERT_HEAD(&uc->uc_spare_queue, uq->uq_spare_queue, link);
 	}
 	uq->uq_spare_queue = NULL;
 
 	TAILQ_INSERT_TAIL(&target->head, uq, uq_link);
 	target->length++;
 	uq->uq_flags |= UQF_UMTXQ;
 	uq->uq_cur_queue = target;
 }
 
 /*
  * Take uq off the wait queue it is on, if any.  The chain lock must be
  * held.  When uq was the last waiter the per-key queue is unlinked from
  * the chain and becomes uq's spare; otherwise uq takes a spare from the
  * chain's spare list.  Does nothing if uq is not queued.
  */
 static inline void
 umtxq_remove_queue(struct umtx_q *uq, int q)
 {
 	struct umtxq_chain *uc;
 	struct umtxq_queue *cur, *reclaimed;
 
 	uc = umtxq_getchain(&uq->uq_key);
 	UMTXQ_LOCKED_ASSERT(uc);
 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
 		return;
 
 	cur = uq->uq_cur_queue;
 	TAILQ_REMOVE(&cur->head, uq, uq_link);
 	cur->length--;
 	uq->uq_flags &= ~UQF_UMTXQ;
 	if (!TAILQ_EMPTY(&cur->head)) {
 		reclaimed = LIST_FIRST(&uc->uc_spare_queue);
 		KASSERT(reclaimed != NULL, ("uc_spare_queue is empty"));
 		LIST_REMOVE(reclaimed, link);
 	} else {
 		KASSERT(cur->length == 0,
 		    ("inconsistent umtxq_queue length"));
 #ifdef UMTX_PROFILING
 		uc->length--;
 #endif
 		LIST_REMOVE(cur, link);
 		reclaimed = cur;
 	}
 	uq->uq_spare_queue = reclaimed;
 	uq->uq_cur_queue = NULL;
 }
 
 /*
  * Return the number of waiters queued for the key on the shared queue,
  * or 0 when the key has no queue.  The chain lock must be held.
  */
 static int
 umtxq_count(struct umtx_key *key)
 {
 	struct umtxq_queue *found;
 
 	UMTXQ_LOCKED_ASSERT(umtxq_getchain(key));
 	found = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
 	return (found != NULL ? found->length : 0);
 }
 
 /*
  * Return the number of waiters queued for the key on the shared queue and
  * store the first of them in *first.  When the key has no queue the count
  * is 0 and *first is NULL.  The chain lock must be held.
  */
 static int
 umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
 {
 	struct umtxq_queue *found;
 
 	*first = NULL;
 	UMTXQ_LOCKED_ASSERT(umtxq_getchain(key));
 	found = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
 	if (found == NULL)
 		return (0);
 	*first = TAILQ_FIRST(&found->head);
 	return (found->length);
 }
 
 static int
 umtxq_check_susp(struct thread *td)
 {
 	struct proc *p;
 	int error;
 
 	/*
 	 * The check for TDF_NEEDSUSPCHK is racy, but it is enough to
 	 * eventually break the lockstep loop.
 	 */
 	if ((td->td_flags & TDF_NEEDSUSPCHK) == 0)
 		return (0);
 	error = 0;
 	p = td->td_proc;
 	PROC_LOCK(p);
 	if (P_SHOULDSTOP(p) ||
 	    ((p->p_flag & P_TRACED) && (td->td_dbgflags & TDB_SUSPEND))) {
 		if (p->p_flag & P_SINGLE_EXIT)
 			error = EINTR;
 		else
 			error = ERESTART;
 	}
 	PROC_UNLOCK(p);
 	return (error);
 }
 
 /*
  * Wake up threads waiting on a userland object.
  *
  * Waiters are removed from the head of the key's queue on list q and
  * woken one at a time until n_wake have been woken or the queue is empty.
  * The count is checked after each wakeup, so one waiter is woken even if
  * n_wake is not positive.  Returns the number of threads woken.  The
  * chain lock must be held.
  */
 
 static int
 umtxq_signal_queue(struct umtx_key *key, int n_wake, int q)
 {
 	struct umtxq_queue *found;
 	struct umtx_q *waiter;
 	int woken;
 
 	woken = 0;
 	UMTXQ_LOCKED_ASSERT(umtxq_getchain(key));
 	found = umtxq_queue_lookup(key, q);
 	if (found == NULL)
 		return (woken);
 	while ((waiter = TAILQ_FIRST(&found->head)) != NULL) {
 		umtxq_remove_queue(waiter, q);
 		wakeup(waiter);
 		woken++;
 		if (woken >= n_wake)
 			break;
 	}
 	return (woken);
 }
 
 
 /*
  * Wake up specified thread.
  *
  * Removes uq from the shared queue (a no-op if it is not queued) and
  * issues a wakeup on it.  The chain lock must be held.
  */
 static inline void
 umtxq_signal_thread(struct umtx_q *uq)
 {
 
 	UMTXQ_LOCKED_ASSERT(umtxq_getchain(&uq->uq_key));
 	umtxq_remove(uq);
 	wakeup(uq);
 }
 
 /*
  * Convert a timespec interval to the value tvtohz() yields for the
  * equivalent timeval.
  */
 static inline int
 tstohz(const struct timespec *tsp)
 {
 	struct timeval tv;
 	int nticks;
 
 	TIMESPEC_TO_TIMEVAL(&tv, tsp);
 	nticks = tvtohz(&tv);
 	return (nticks);
 }
 
 /*
  * Initialize timo for a wait on clock "clockid".
  *
  * For a relative timeout the clock is read now and the expiry becomes the
  * current time plus *timeout.  For an absolute timeout the expiry is
  * *timeout itself, and is_abs_real records whether the clock is one of
  * the CLOCK_REALTIME variants.
  */
 static void
 abs_timeout_init(struct abs_timeout *timo, int clockid, int absolute,
 	const struct timespec *timeout)
 {
 
 	timo->clockid = clockid;
 	if (!absolute) {
 		timo->is_abs_real = false;
 		abs_timeout_update(timo);
-		timo->end = timo->cur;
-		timespecadd(&timo->end, timeout);
+		timespecadd(&timo->cur, timeout, &timo->end);
 	} else {
 		timo->end = *timeout;
 		timo->is_abs_real = clockid == CLOCK_REALTIME ||
 		    clockid == CLOCK_REALTIME_FAST ||
 		    clockid == CLOCK_REALTIME_PRECISE;
 		/*
 		 * If is_abs_real, umtxq_sleep will read the clock
 		 * after setting td_rtcgen; otherwise, read it here.
 		 */
 		if (!timo->is_abs_real) {
 			abs_timeout_update(timo);
 		}
 	}
 }
 
 /*
  * Initialize timo from a struct _umtx_time: the clock id and timeout are
  * taken from it and the timeout is treated as absolute when UMTX_ABSTIME
  * is set in its flags.
  */
 static void
 abs_timeout_init2(struct abs_timeout *timo, const struct _umtx_time *umtxtime)
 {
 
 	abs_timeout_init(timo, umtxtime->_clockid,
 	    (umtxtime->_flags & UMTX_ABSTIME) != 0, &umtxtime->_timeout);
 }
 
 /*
  * Refresh timo->cur with the current time of timo's clock, read on behalf
  * of curthread.
  */
 static inline void
 abs_timeout_update(struct abs_timeout *timo)
 {
 
 	kern_clock_gettime(curthread, timo->clockid, &timo->cur);
 }
 
 /*
  * Return the time remaining until timo->end, measured from the last
  * sampled timo->cur and converted with tstohz(), or -1 when the expiry
  * time is not later than the sampled time.
  */
 static int
 abs_timeout_gethz(struct abs_timeout *timo)
 {
 	struct timespec tts;
 
 	if (timespeccmp(&timo->end, &timo->cur, <=))
 		return (-1); 
-	tts = timo->end;
-	timespecsub(&tts, &timo->cur);
+	timespecsub(&timo->end, &timo->cur, &tts);
 	return (tstohz(&tts));
 }
 
 /*
  * Select the owner-word value to store when a mutex is unlocked:
  * UMUTEX_RB_OWNERDEAD when rb is true, otherwise UMUTEX_RB_NOTRECOV if
  * the flags carry UMUTEX_NONCONSISTENT, otherwise UMUTEX_UNOWNED.
  */
 static uint32_t
 umtx_unlock_val(uint32_t flags, bool rb)
 {
 	uint32_t val;
 
 	val = UMUTEX_UNOWNED;
 	if (rb)
 		val = UMUTEX_RB_OWNERDEAD;
 	else if ((flags & UMUTEX_NONCONSISTENT) != 0)
 		val = UMUTEX_RB_NOTRECOV;
 	return (val);
 }
 
 /*
  * Put thread into sleep state, before sleeping, check if
  * thread was removed from umtx queue.
  *
  * Must be called with the chain lock held and returns with it held.
  * abstime may be NULL for an untimed sleep.
  *
  * Returns 0 once uq is no longer on a queue, ETIMEDOUT when the timeout
  * has expired, or EINTR/ERESTART as reported by msleep().  Any other
  * msleep() result causes the conditions to be re-evaluated.
  */
 static inline int
 umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *abstime)
 {
 	struct umtxq_chain *uc;
 	int error, timo;
 
 	/* Absolute realtime: record rtc_generation before reading the clock. */
 	if (abstime != NULL && abstime->is_abs_real) {
 		curthread->td_rtcgen = atomic_load_acq_int(&rtc_generation);
 		abs_timeout_update(abstime);
 	}
 
 	uc = umtxq_getchain(&uq->uq_key);
 	UMTXQ_LOCKED_ASSERT(uc);
 	for (;;) {
 		if (!(uq->uq_flags & UQF_UMTXQ)) {
 			error = 0;
 			break;
 		}
 		if (abstime != NULL) {
 			timo = abs_timeout_gethz(abstime);
 			if (timo < 0) {
 				error = ETIMEDOUT;
 				break;
 			}
 		} else
 			timo = 0;
 		/* PDROP: the chain lock is not held when msleep() returns. */
 		error = msleep(uq, &uc->uc_lock, PCATCH | PDROP, wmesg, timo);
 		if (error == EINTR || error == ERESTART) {
 			umtxq_lock(&uq->uq_key);
 			break;
 		}
 		/* Resample the clock before re-taking the chain lock. */
 		if (abstime != NULL) {
 			if (abstime->is_abs_real)
 				curthread->td_rtcgen =
 				    atomic_load_acq_int(&rtc_generation);
 			abs_timeout_update(abstime);
 		}
 		umtxq_lock(&uq->uq_key);
 	}
 
 	curthread->td_rtcgen = 0;
 	return (error);
 }
 
 /*
  * Convert userspace address into unique logical address.
  *
  * Fills in *key for the object at addr.  With THREAD_SHARE the key is
  * private: the current process' vmspace plus the address.  With
  * PROCESS_SHARE, or AUTO_SHARE on a map entry inherited as shared, the
  * key is shared: the backing VM object plus the offset within it, and a
  * reference is taken on the object (dropped by umtx_key_release()).
  *
  * Returns 0 on success or EFAULT when the address cannot be looked up
  * for writing in the current process' map.
  */
 int
 umtx_key_get(const void *addr, int type, int share, struct umtx_key *key)
 {
 	struct thread *td = curthread;
 	vm_map_t map;
 	vm_map_entry_t entry;
 	vm_pindex_t pindex;
 	vm_prot_t prot;
 	boolean_t wired;
 
 	key->type = type;
 	if (share == THREAD_SHARE) {
 		key->shared = 0;
 		key->info.private.vs = td->td_proc->p_vmspace;
 		key->info.private.addr = (uintptr_t)addr;
 	} else {
 		MPASS(share == PROCESS_SHARE || share == AUTO_SHARE);
 		map = &td->td_proc->p_vmspace->vm_map;
 		if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
 		    &entry, &key->info.shared.object, &pindex, &prot,
 		    &wired) != KERN_SUCCESS) {
 			return (EFAULT);
 		}
 
 		if ((share == PROCESS_SHARE) ||
 		    (share == AUTO_SHARE &&
 		     VM_INHERIT_SHARE == entry->inheritance)) {
 			key->shared = 1;
 			key->info.shared.offset = (vm_offset_t)addr -
 			    entry->start + entry->offset;
 			vm_object_reference(key->info.shared.object);
 		} else {
 			/* AUTO_SHARE on a non-shared mapping: private key. */
 			key->shared = 0;
 			key->info.private.vs = td->td_proc->p_vmspace;
 			key->info.private.addr = (uintptr_t)addr;
 		}
 		vm_map_lookup_done(map, entry);
 	}
 
 	umtxq_hash(key);
 	return (0);
 }
 
 /*
  * Release key.
  *
  * For a shared key this drops the VM object reference taken by
  * umtx_key_get(); private keys hold no reference and need no work.
  */
 void
 umtx_key_release(struct umtx_key *key)
 {
 	if (key->shared)
 		vm_object_deallocate(key->info.shared.object);
 }
 
 /*
  * Fetch and compare value, sleep on the address if value is not changed.
  *
  * The word at addr is read as a u_long, or as a 32-bit value when
  * compat32 is non-zero, and compared with id.  timeout may be NULL for an
  * untimed wait.  is_private selects a THREAD_SHARE key; otherwise
  * AUTO_SHARE is used.
  *
  * Returns 0 when the value differs from id or the thread was woken,
  * EFAULT when the word cannot be read, the error from umtx_key_get(), or
  * the error from umtxq_sleep() with ERESTART converted to EINTR.
  */
 static int
 do_wait(struct thread *td, void *addr, u_long id,
     struct _umtx_time *timeout, int compat32, int is_private)
 {
 	struct abs_timeout timo;
 	struct umtx_q *uq;
 	u_long tmp;
 	uint32_t tmp32;
 	int error = 0;
 
 	uq = td->td_umtxq;
 	if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT,
 		is_private ? THREAD_SHARE : AUTO_SHARE, &uq->uq_key)) != 0)
 		return (error);
 
 	if (timeout != NULL)
 		abs_timeout_init2(&timo, timeout);
 
 	/*
 	 * Queue ourselves before reading the word (presumably so that a
 	 * wakeup issued between the read and the sleep is not lost).
 	 */
 	umtxq_lock(&uq->uq_key);
 	umtxq_insert(uq);
 	umtxq_unlock(&uq->uq_key);
 	if (compat32 == 0) {
 		error = fueword(addr, &tmp);
 		if (error != 0)
 			error = EFAULT;
 	} else {
 		error = fueword32(addr, &tmp32);
 		if (error == 0)
 			tmp = tmp32;
 		else
 			error = EFAULT;
 	}
 	umtxq_lock(&uq->uq_key);
 	if (error == 0) {
 		if (tmp == id)
 			error = umtxq_sleep(uq, "uwait", timeout == NULL ?
 			    NULL : &timo);
 		/* Already dequeued means we were woken: report success. */
 		if ((uq->uq_flags & UQF_UMTXQ) == 0)
 			error = 0;
 		else
 			umtxq_remove(uq);
 	} else if ((uq->uq_flags & UQF_UMTXQ) != 0) {
 		umtxq_remove(uq);
 	}
 	umtxq_unlock(&uq->uq_key);
 	umtx_key_release(&uq->uq_key);
 	if (error == ERESTART)
 		error = EINTR;
 	return (error);
 }
 
 /*
  * Wake up threads sleeping on the specified address.
  */
 int
 kern_umtx_wake(struct thread *td, void *uaddr, int n_wake, int is_private)
 {
 	struct umtx_key key;
 	int ret;
 	
 	if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT,
 	    is_private ? THREAD_SHARE : AUTO_SHARE, &key)) != 0)
 		return (ret);
 	umtxq_lock(&key);
 	umtxq_signal(&key, n_wake);
 	umtxq_unlock(&key);
 	umtx_key_release(&key);
 	return (0);
 }
 
 /*
  * Lock PTHREAD_PRIO_NONE protocol POSIX mutex.
  *
  * mode selects the behaviour: _UMUTEX_TRY returns EBUSY instead of
  * sleeping, _UMUTEX_WAIT only waits until the owner word shows an
  * unowned, contested or robust-dead state and returns 0 without
  * acquiring, and any other value performs a blocking acquire.  timeout
  * may be NULL for an untimed wait.
  *
  * Returns 0 on success, EFAULT when the mutex memory cannot be accessed,
  * EOWNERDEAD when a mutex left in UMUTEX_RB_OWNERDEAD was acquired,
  * ENOTRECOVERABLE for UMUTEX_RB_NOTRECOV, EBUSY in try mode, or an error
  * from umtx_key_get(), umtxq_sleep() or umtxq_check_susp().
  */
 static int
 do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags,
     struct _umtx_time *timeout, int mode)
 {
 	struct abs_timeout timo;
 	struct umtx_q *uq;
 	uint32_t owner, old, id;
 	int error, rv;
 
 	id = td->td_tid;
 	uq = td->td_umtxq;
 	error = 0;
 	if (timeout != NULL)
 		abs_timeout_init2(&timo, timeout);
 
 	/*
 	 * Care must be exercised when dealing with umtx structure. It
 	 * can fault on any access.
 	 */
 	for (;;) {
 		rv = fueword32(&m->m_owner, &owner);
 		if (rv == -1)
 			return (EFAULT);
 		if (mode == _UMUTEX_WAIT) {
 			if (owner == UMUTEX_UNOWNED ||
 			    owner == UMUTEX_CONTESTED ||
 			    owner == UMUTEX_RB_OWNERDEAD ||
 			    owner == UMUTEX_RB_NOTRECOV)
 				return (0);
 		} else {
 			/*
 			 * Robust mutex terminated.  Kernel duty is to
 			 * return EOWNERDEAD to the userspace.  The
 			 * umutex.m_flags UMUTEX_NONCONSISTENT is set
 			 * by the common userspace code.
 			 */
 			if (owner == UMUTEX_RB_OWNERDEAD) {
 				rv = casueword32(&m->m_owner,
 				    UMUTEX_RB_OWNERDEAD, &owner,
 				    id | UMUTEX_CONTESTED);
 				if (rv == -1)
 					return (EFAULT);
 				if (owner == UMUTEX_RB_OWNERDEAD)
 					return (EOWNERDEAD); /* success */
 				rv = umtxq_check_susp(td);
 				if (rv != 0)
 					return (rv);
 				continue;
 			}
 			if (owner == UMUTEX_RB_NOTRECOV)
 				return (ENOTRECOVERABLE);
 
 
 			/*
 			 * Try the uncontested case.  This should be
 			 * done in userland.
 			 */
 			rv = casueword32(&m->m_owner, UMUTEX_UNOWNED,
 			    &owner, id);
 			/* The address was invalid. */
 			if (rv == -1)
 				return (EFAULT);
 
 			/* The acquire succeeded. */
 			if (owner == UMUTEX_UNOWNED)
 				return (0);
 
 			/*
 			 * If no one owns it but it is contested try
 			 * to acquire it.
 			 */
 			if (owner == UMUTEX_CONTESTED) {
 				rv = casueword32(&m->m_owner,
 				    UMUTEX_CONTESTED, &owner,
 				    id | UMUTEX_CONTESTED);
 				/* The address was invalid. */
 				if (rv == -1)
 					return (EFAULT);
 
 				if (owner == UMUTEX_CONTESTED)
 					return (0);
 
 				rv = umtxq_check_susp(td);
 				if (rv != 0)
 					return (rv);
 
 				/*
 				 * If this failed the lock has
 				 * changed, restart.
 				 */
 				continue;
 			}
 		}
 
 		if (mode == _UMUTEX_TRY)
 			return (EBUSY);
 
 		/*
 		 * If we caught a signal, we have retried and now
 		 * exit immediately.
 		 */
 		if (error != 0)
 			return (error);
 
 		if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
 		    GET_SHARE(flags), &uq->uq_key)) != 0)
 			return (error);
 
 		/* Queue ourselves and keep the chain busy across the CAS. */
 		umtxq_lock(&uq->uq_key);
 		umtxq_busy(&uq->uq_key);
 		umtxq_insert(uq);
 		umtxq_unlock(&uq->uq_key);
 
 		/*
 		 * Set the contested bit so that a release in user space
 		 * knows to use the system call for unlock.  If this fails
 		 * either some one else has acquired the lock or it has been
 		 * released.
 		 */
 		rv = casueword32(&m->m_owner, owner, &old,
 		    owner | UMUTEX_CONTESTED);
 
 		/* The address was invalid. */
 		if (rv == -1) {
 			umtxq_lock(&uq->uq_key);
 			umtxq_remove(uq);
 			umtxq_unbusy(&uq->uq_key);
 			umtxq_unlock(&uq->uq_key);
 			umtx_key_release(&uq->uq_key);
 			return (EFAULT);
 		}
 
 		/*
 		 * We set the contested bit, sleep. Otherwise the lock changed
 		 * and we need to retry or we lost a race to the thread
 		 * unlocking the umtx.
 		 */
 		umtxq_lock(&uq->uq_key);
 		umtxq_unbusy(&uq->uq_key);
 		if (old == owner)
 			error = umtxq_sleep(uq, "umtxn", timeout == NULL ?
 			    NULL : &timo);
 		umtxq_remove(uq);
 		umtxq_unlock(&uq->uq_key);
 		umtx_key_release(&uq->uq_key);
 
 		if (error == 0)
 			error = umtxq_check_susp(td);
 	}
 
 	/* Not reached: the loop above only exits through return. */
 	return (0);
 }
 
 /*
  * Unlock PTHREAD_PRIO_NONE protocol POSIX mutex.
  *
  * The value stored into the owner word comes from umtx_unlock_val(flags,
  * rb); UMUTEX_CONTESTED is kept set when more than one waiter is queued.
  * One waiter is woken.
  *
  * Returns 0 on success, EFAULT when the mutex memory cannot be accessed,
  * EPERM when the calling thread is not the owner, EINVAL when the owner
  * word changed underneath the final update, or the error from
  * umtx_key_get().
  */
 static int
 do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags, bool rb)
 {
 	struct umtx_key key;
 	uint32_t owner, old, id, newlock;
 	int error, count;
 
 	id = td->td_tid;
 	/*
 	 * Make sure we own this mtx.
 	 */
 	error = fueword32(&m->m_owner, &owner);
 	if (error == -1)
 		return (EFAULT);
 
 	if ((owner & ~UMUTEX_CONTESTED) != id)
 		return (EPERM);
 
 	newlock = umtx_unlock_val(flags, rb);
 	/* Uncontested: a single compare-and-set is enough if it succeeds. */
 	if ((owner & UMUTEX_CONTESTED) == 0) {
 		error = casueword32(&m->m_owner, owner, &old, newlock);
 		if (error == -1)
 			return (EFAULT);
 		if (old == owner)
 			return (0);
 		owner = old;
 	}
 
 	/* We should only ever be in here for contested locks */
 	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
 	    &key)) != 0)
 		return (error);
 
 	umtxq_lock(&key);
 	umtxq_busy(&key);
 	count = umtxq_count(&key);
 	umtxq_unlock(&key);
 
 	/*
 	 * When unlocking the umtx, it must be marked as unowned if
 	 * there is zero or one thread only waiting for it.
 	 * Otherwise, it must be marked as contested.
 	 */
 	if (count > 1)
 		newlock |= UMUTEX_CONTESTED;
 	error = casueword32(&m->m_owner, owner, &old, newlock);
 	/* A waiter is signalled regardless of the outcome of the update. */
 	umtxq_lock(&key);
 	umtxq_signal(&key, 1);
 	umtxq_unbusy(&key);
 	umtxq_unlock(&key);
 	umtx_key_release(&key);
 	if (error == -1)
 		return (EFAULT);
 	if (old != owner)
 		return (EINVAL);
 	return (0);
 }
 
 /*
  * Check if the mutex is available and wake up a waiter,
  * only for simple mutex.
  *
  * Nothing is done (return 0) when the owner word names an owning thread.
  * Otherwise, if at most one waiter is queued and the word is not in a
  * robust-dead state, an attempt is made to change UMUTEX_CONTESTED to
  * UMUTEX_UNOWNED, and one waiter is woken if any are queued and the
  * mutex is still ownerless.
  *
  * Returns 0, EFAULT when the mutex memory cannot be accessed, or the
  * error from umtx_key_get().
  */
 static int
 do_wake_umutex(struct thread *td, struct umutex *m)
 {
 	struct umtx_key key;
 	uint32_t owner;
 	uint32_t flags;
 	int error;
 	int count;
 
 	error = fueword32(&m->m_owner, &owner);
 	if (error == -1)
 		return (EFAULT);
 
 	if ((owner & ~UMUTEX_CONTESTED) != 0 && owner != UMUTEX_RB_OWNERDEAD &&
 	    owner != UMUTEX_RB_NOTRECOV)
 		return (0);
 
 	error = fueword32(&m->m_flags, &flags);
 	if (error == -1)
 		return (EFAULT);
 
 	/* We should only ever be in here for contested locks */
 	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
 	    &key)) != 0)
 		return (error);
 
 	umtxq_lock(&key);
 	umtxq_busy(&key);
 	count = umtxq_count(&key);
 	umtxq_unlock(&key);
 
 	/* casueword32() stores the value it found back into owner. */
 	if (count <= 1 && owner != UMUTEX_RB_OWNERDEAD &&
 	    owner != UMUTEX_RB_NOTRECOV) {
 		error = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner,
 		    UMUTEX_UNOWNED);
 		if (error == -1)
 			error = EFAULT;
 	}
 
 	umtxq_lock(&key);
 	if (error == 0 && count != 0 && ((owner & ~UMUTEX_CONTESTED) == 0 ||
 	    owner == UMUTEX_RB_OWNERDEAD || owner == UMUTEX_RB_NOTRECOV))
 		umtxq_signal(&key, 1);
 	umtxq_unbusy(&key);
 	umtxq_unlock(&key);
 	umtx_key_release(&key);
 	return (error);
 }
 
 /*
  * Check if the mutex has waiters and tries to fix contention bit.
  */
 static int
 do_wake2_umutex(struct thread *td, struct umutex *m, uint32_t flags)
 {
 	struct umtx_key key;
 	uint32_t owner, old;
 	int type;
 	int error;
 	int count;
 
 	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT |
 	    UMUTEX_ROBUST)) {
 	case 0:
 	case UMUTEX_ROBUST:
 		type = TYPE_NORMAL_UMUTEX;
 		break;
 	case UMUTEX_PRIO_INHERIT:
 		type = TYPE_PI_UMUTEX;
 		break;
 	case (UMUTEX_PRIO_INHERIT | UMUTEX_ROBUST):
 		type = TYPE_PI_ROBUST_UMUTEX;
 		break;
 	case UMUTEX_PRIO_PROTECT:
 		type = TYPE_PP_UMUTEX;
 		break;
 	case (UMUTEX_PRIO_PROTECT | UMUTEX_ROBUST):
 		type = TYPE_PP_ROBUST_UMUTEX;
 		break;
 	default:
 		return (EINVAL);
 	}
 	if ((error = umtx_key_get(m, type, GET_SHARE(flags), &key)) != 0)
 		return (error);
 
 	owner = 0;
 	umtxq_lock(&key);
 	umtxq_busy(&key);
 	count = umtxq_count(&key);
 	umtxq_unlock(&key);
 	/*
 	 * Only repair contention bit if there is a waiter, this means the mutex
 	 * is still being referenced by userland code, otherwise don't update
 	 * any memory.
 	 */
 	if (count > 1) {
 		error = fueword32(&m->m_owner, &owner);
 		if (error == -1)
 			error = EFAULT;
 		while (error == 0 && (owner & UMUTEX_CONTESTED) == 0) {
 			error = casueword32(&m->m_owner, owner, &old,
 			    owner | UMUTEX_CONTESTED);
 			if (error == -1) {
 				error = EFAULT;
 				break;
 			}
 			if (old == owner)
 				break;
 			owner = old;
 			error = umtxq_check_susp(td);
 			if (error != 0)
 				break;
 		}
 	} else if (count == 1) {
 		error = fueword32(&m->m_owner, &owner);
 		if (error == -1)
 			error = EFAULT;
 		while (error == 0 && (owner & ~UMUTEX_CONTESTED) != 0 &&
 		    (owner & UMUTEX_CONTESTED) == 0) {
 			error = casueword32(&m->m_owner, owner, &old,
 			    owner | UMUTEX_CONTESTED);
 			if (error == -1) {
 				error = EFAULT;
 				break;
 			}
 			if (old == owner)
 				break;
 			owner = old;
 			error = umtxq_check_susp(td);
 			if (error != 0)
 				break;
 		}
 	}
 	umtxq_lock(&key);
 	if (error == EFAULT) {
 		umtxq_signal(&key, INT_MAX);
 	} else if (count != 0 && ((owner & ~UMUTEX_CONTESTED) == 0 ||
 	    owner == UMUTEX_RB_OWNERDEAD || owner == UMUTEX_RB_NOTRECOV))
 		umtxq_signal(&key, 1);
 	umtxq_unbusy(&key);
 	umtxq_unlock(&key);
 	umtx_key_release(&key);
 	return (error);
 }
 
 /*
  * Allocate a zeroed umtx_pi from umtx_pi_zone and initialize its blocked
  * queue.
  *
  * `flags` is OR-ed with M_ZERO and handed to uma_zalloc().  A caller that
  * passes M_NOWAIT must be prepared for a NULL return; in that case nothing
  * is initialized and the umtx_pi_allocated counter is left untouched.
  */
 static inline struct umtx_pi *
 umtx_pi_alloc(int flags)
 {
 	struct umtx_pi *pi;
 
 	pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags);
 	/* Non-sleeping allocations can fail; do not touch a NULL item. */
 	if (pi == NULL)
 		return (NULL);
 	TAILQ_INIT(&pi->pi_blocked);
 	atomic_add_int(&umtx_pi_allocated, 1);
 	return (pi);
 }
 
 /*
  * Return a umtx_pi to umtx_pi_zone and decrement the umtx_pi_allocated
  * counter.
  */
 static inline void
 umtx_pi_free(struct umtx_pi *pi)
 {
 	uma_zfree(umtx_pi_zone, pi);
 	atomic_add_int(&umtx_pi_allocated, -1);
 }
 
 /*
  * Adjust the thread's position on a pi_state after its priority has been
  * changed.
  */
 static int
 umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
 {
 	struct umtx_q *uq, *uq1, *uq2;
 	struct thread *td1;
 
 	mtx_assert(&umtx_lock, MA_OWNED);
 	if (pi == NULL)
 		return (0);
 
 	uq = td->td_umtxq;
 
 	/*
 	 * Check if the thread needs to be moved on the blocked chain.
 	 * It needs to be moved if either its priority is lower than
 	 * the previous thread or higher than the next thread.
 	 */
 	uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
 	uq2 = TAILQ_NEXT(uq, uq_lockq);
 	if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
 	    (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
 		/*
 		 * Remove thread from blocked chain and determine where
 		 * it should be moved to.
 		 */
 		TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
 		TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
 			td1 = uq1->uq_thread;
 			MPASS(td1->td_proc->p_magic == P_MAGIC);
 			if (UPRI(td1) > UPRI(td))
 				break;
 		}
 
 		if (uq1 == NULL)
 			TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
 		else
 			TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
 	}
 	return (1);
 }
 
 static struct umtx_pi *
 umtx_pi_next(struct umtx_pi *pi)
 {
 	struct umtx_q *uq_owner;
 
 	if (pi->pi_owner == NULL)
 		return (NULL);
 	uq_owner = pi->pi_owner->td_umtxq;
 	if (uq_owner == NULL)
 		return (NULL);
 	return (uq_owner->uq_pi_blocked);
 }
 
 /*
  * Detect a cycle in the chain of PI mutexes reachable from `pi` through
  * umtx_pi_next(), using Floyd's cycle-finding algorithm (one iterator
  * advances one link per round, the other two).
  *
  * Must be called with umtx_lock held.  Returns true if a cycle exists.
  */
 static bool
 umtx_pi_check_loop(struct umtx_pi *pi)
 {
 	struct umtx_pi *slow, *fast;
 
 	mtx_assert(&umtx_lock, MA_OWNED);
 	if (pi == NULL)
 		return (false);
 
 	slow = fast = pi;
 	while ((slow = umtx_pi_next(slow)) != NULL) {
 		/* Advance the fast iterator twice; a NULL ends the chain. */
 		fast = umtx_pi_next(fast);
 		if (fast == NULL)
 			break;
 		fast = umtx_pi_next(fast);
 		if (fast == NULL)
 			break;
 		if (slow == fast)
 			return (true);
 	}
 	return (false);
 }
 
 /*
  * Lend the priority of `td`, which is blocked on a POSIX PI mutex, to
  * the owner of that mutex, then keep walking along the chain of mutexes
  * the successive owners are blocked on.
  *
  * Must be called with umtx_lock held.  The walk is not started if the
  * chain contains a cycle, and stops at an unowned mutex, at curthread,
  * or at the first owner whose lent priority value is not larger than
  * the one being propagated.
  */
 static void
 umtx_propagate_priority(struct thread *td)
 {
 	struct thread *owner;
 	struct umtx_pi *pi;
 	int lent, pri;
 
 	mtx_assert(&umtx_lock, MA_OWNED);
 	pri = UPRI(td);
 	pi = td->td_umtxq->uq_pi_blocked;
 	if (pi == NULL)
 		return;
 	if (umtx_pi_check_loop(pi))
 		return;
 
 	for (;;) {
 		owner = pi->pi_owner;
 		if (owner == NULL || owner == curthread)
 			return;
 
 		MPASS(owner->td_proc != NULL);
 		MPASS(owner->td_proc->p_magic == P_MAGIC);
 
 		thread_lock(owner);
 		lent = owner->td_lend_user_pri > pri;
 		if (lent)
 			sched_lend_user_prio(owner, pri);
 		thread_unlock(owner);
 		if (!lent)
 			break;
 
 		/*
 		 * Move on to the mutex the owner is blocked on, if any.
 		 */
 		pi = owner->td_umtxq->uq_pi_blocked;
 		if (pi == NULL)
 			break;
 		/* Resort the owner on that mutex's list if needed. */
 		umtx_pi_adjust_thread(pi, owner);
 	}
 }
 
 /*
  * Unpropagate priority for a PI mutex when a thread blocked on
  * it is interrupted by signal or resumed by others.
  */
 static void
 umtx_repropagate_priority(struct umtx_pi *pi)
 {
 	struct umtx_q *uq, *uq_owner;
 	struct umtx_pi *pi2;
 	int pri;
 
 	mtx_assert(&umtx_lock, MA_OWNED);
 
 	if (umtx_pi_check_loop(pi))
 		return;
 	while (pi != NULL && pi->pi_owner != NULL) {
 		pri = PRI_MAX;
 		uq_owner = pi->pi_owner->td_umtxq;
 
 		TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
 			uq = TAILQ_FIRST(&pi2->pi_blocked);
 			if (uq != NULL) {
 				if (pri > UPRI(uq->uq_thread))
 					pri = UPRI(uq->uq_thread);
 			}
 		}
 
 		if (pri > uq_owner->uq_inherited_pri)
 			pri = uq_owner->uq_inherited_pri;
 		thread_lock(pi->pi_owner);
 		sched_lend_user_prio(pi->pi_owner, pri);
 		thread_unlock(pi->pi_owner);
 		if ((pi = uq_owner->uq_pi_blocked) != NULL)
 			umtx_pi_adjust_thread(pi, uq_owner->uq_thread);
 	}
 }
 
 /*
  * Record `owner` as the owner of `pi` and append `pi` to the owner's
  * uq_pi_contested list.
  *
  * Must be called with umtx_lock held; `pi` must currently be unowned
  * (both are asserted).
  */
 static void
 umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
 {
 	struct umtx_q *uq_owner;
 
 	uq_owner = owner->td_umtxq;
 	mtx_assert(&umtx_lock, MA_OWNED);
 	MPASS(pi->pi_owner == NULL);
 	pi->pi_owner = owner;
 	TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
 }
 
 
 /*
  * Disown a PI mutex: remove it from its owner's uq_pi_contested list and
  * clear pi_owner.
  *
  * Must be called with umtx_lock held.  pi->pi_owner is dereferenced
  * unconditionally, so the caller must ensure it is not NULL.
  */
 static void
 umtx_pi_disown(struct umtx_pi *pi)
 {
 
 	mtx_assert(&umtx_lock, MA_OWNED);
 	TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested, pi, pi_link);
 	pi->pi_owner = NULL;
 }
 
 /*
  * Make `owner` the owner of the PI mutex `pi`.
  *
  * Returns 0 if `owner` already owns it or the claim succeeded, and EPERM
  * if another thread is recorded as the owner.  After a successful claim
  * the new owner is lent the priority of the first blocked waiter when
  * that waiter's UPRI value is smaller than the owner's.
  */
 static int
 umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
 {
 	struct umtx_q *head;
 	int error, pri;
 
 	error = 0;
 	mtx_lock(&umtx_lock);
 	if (pi->pi_owner == owner) {
 		/* Already ours, nothing to do. */
 	} else if (pi->pi_owner != NULL) {
 		/*
 		 * userland may have already messed the mutex, sigh.
 		 */
 		error = EPERM;
 	} else {
 		umtx_pi_setowner(pi, owner);
 		head = TAILQ_FIRST(&pi->pi_blocked);
 		if (head != NULL) {
 			pri = UPRI(head->uq_thread);
 			thread_lock(owner);
 			if (pri < UPRI(owner))
 				sched_lend_user_prio(owner, pri);
 			thread_unlock(owner);
 		}
 	}
 	mtx_unlock(&umtx_lock);
 	return (error);
 }
 
 /*
  * Re-sort `td` on the PI mutex it is blocked on (if any) and recompute
  * the priorities lent along that chain.  `oldpri` is not used.
  */
 void
 umtx_pi_adjust(struct thread *td, u_char oldpri)
 {
 	struct umtx_q *uq;
 	struct umtx_pi *pi;
 
 	uq = td->td_umtxq;
 	mtx_lock(&umtx_lock);
 	/* The lock td is blocked on; NULL if it is not blocked. */
 	pi = uq->uq_pi_blocked;
 	if (pi == NULL) {
 		mtx_unlock(&umtx_lock);
 		return;
 	}
 	umtx_pi_adjust_thread(pi, td);
 	umtx_repropagate_priority(pi);
 	mtx_unlock(&umtx_lock);
 }
 
 /*
  * Sleep on a PI mutex.
  *
  * uq     - the calling thread's umtx_q (uq->uq_thread must be curthread).
  * pi     - the PI mutex state to block on.
  * owner  - thread id read from the lock word, used to look up the owning
  *          thread if `pi` has no owner recorded yet.
  * wmesg  - wait message passed to umtxq_sleep().
  * timo   - timeout passed to umtxq_sleep(), or NULL.
  * shared - when true the owner is searched for in any process, otherwise
  *          only in the caller's process.
  *
  * Called with the umtxq chain locked and busy (both asserted).  The chain
  * is unbusied before sleeping and unlocked before returning.  Returns the
  * result of umtxq_sleep().
  */
 static int
 umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi, uint32_t owner,
     const char *wmesg, struct abs_timeout *timo, bool shared)
 {
 	struct thread *td, *td1;
 	struct umtx_q *uq1;
 	int error, pri;
 #ifdef INVARIANTS
 	struct umtxq_chain *uc;
 
 	uc = umtxq_getchain(&pi->pi_key);
 #endif
 	error = 0;
 	td = uq->uq_thread;
 	KASSERT(td == curthread, ("inconsistent uq_thread"));
 	UMTXQ_LOCKED_ASSERT(umtxq_getchain(&uq->uq_key));
 	KASSERT(uc->uc_busy != 0, ("umtx chain is not busy"));
 	umtxq_insert(uq);
 	mtx_lock(&umtx_lock);
 	if (pi->pi_owner == NULL) {
 		/*
 		 * No owner recorded yet: look the thread up by id with
 		 * umtx_lock dropped, then re-check pi_owner since it may
 		 * have been set in the meantime.
 		 */
 		mtx_unlock(&umtx_lock);
 		td1 = tdfind(owner, shared ? -1 : td->td_proc->p_pid);
 		mtx_lock(&umtx_lock);
 		if (td1 != NULL) {
 			if (pi->pi_owner == NULL)
 				umtx_pi_setowner(pi, td1);
 			/* Release the process lock left held by tdfind(). */
 			PROC_UNLOCK(td1->td_proc);
 		}
 	}
 
 	/*
 	 * Insert ourselves into pi_blocked in front of the first waiter
 	 * with a larger UPRI value, or at the tail.
 	 */
 	TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
 		pri = UPRI(uq1->uq_thread);
 		if (pri > UPRI(td))
 			break;
 	}
 
 	if (uq1 != NULL)
 		TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
 	else
 		TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
 
 	uq->uq_pi_blocked = pi;
 	thread_lock(td);
 	td->td_flags |= TDF_UPIBLOCKED;
 	thread_unlock(td);
 	umtx_propagate_priority(td);
 	mtx_unlock(&umtx_lock);
 	umtxq_unbusy(&uq->uq_key);
 
 	error = umtxq_sleep(uq, wmesg, timo);
 	umtxq_remove(uq);
 
 	/*
 	 * Awake again: undo the blocked state and recompute the priority
 	 * lent to the owner chain now that we no longer wait.
 	 */
 	mtx_lock(&umtx_lock);
 	uq->uq_pi_blocked = NULL;
 	thread_lock(td);
 	td->td_flags &= ~TDF_UPIBLOCKED;
 	thread_unlock(td);
 	TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
 	umtx_repropagate_priority(pi);
 	mtx_unlock(&umtx_lock);
 	umtxq_unlock(&uq->uq_key);
 
 	return (error);
 }
 
 /*
  * Add a reference to a PI mutex.
  *
  * The umtxq chain of pi->pi_key must be locked (asserted); the counter
  * is a plain, non-atomic increment.
  */
 static void
 umtx_pi_ref(struct umtx_pi *pi)
 {
 
 	UMTXQ_LOCKED_ASSERT(umtxq_getchain(&pi->pi_key));
 	pi->pi_refcount++;
 }
 
 /*
  * Drop a reference on a PI mutex.  When the last reference goes away the
  * mutex is disowned if it still has an owner, removed from its chain's
  * uc_pi_list and freed.
  *
  * The umtxq chain of pi->pi_key must be locked (asserted).
  */
 static void
 umtx_pi_unref(struct umtx_pi *pi)
 {
 	struct umtxq_chain *chain;
 
 	chain = umtxq_getchain(&pi->pi_key);
 	UMTXQ_LOCKED_ASSERT(chain);
 	KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
 
 	pi->pi_refcount--;
 	if (pi->pi_refcount != 0)
 		return;
 
 	mtx_lock(&umtx_lock);
 	if (pi->pi_owner != NULL)
 		umtx_pi_disown(pi);
 	KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
 		("blocked queue not empty"));
 	mtx_unlock(&umtx_lock);
 	TAILQ_REMOVE(&chain->uc_pi_list, pi, pi_hashlink);
 	umtx_pi_free(pi);
 }
 
 /*
  * Look up the PI mutex whose key matches `key` on the key's chain.
  *
  * The chain must be locked (asserted).  Returns the matching umtx_pi or
  * NULL when there is none.
  */
 static struct umtx_pi *
 umtx_pi_lookup(struct umtx_key *key)
 {
 	struct umtxq_chain *chain;
 	struct umtx_pi *cand;
 
 	chain = umtxq_getchain(key);
 	UMTXQ_LOCKED_ASSERT(chain);
 
 	/* TAILQ_FOREACH leaves cand NULL when the list is exhausted. */
 	TAILQ_FOREACH(cand, &chain->uc_pi_list, pi_hashlink) {
 		if (umtx_key_match(&cand->pi_key, key))
 			break;
 	}
 	return (cand);
 }
 
 /*
  * Insert a PI mutex into the hash table: append it to the uc_pi_list of
  * the chain selected by pi->pi_key.
  *
  * That chain must be locked (asserted).
  */
 static inline void
 umtx_pi_insert(struct umtx_pi *pi)
 {
 	struct umtxq_chain *uc;
 
 	uc = umtxq_getchain(&pi->pi_key);
 	UMTXQ_LOCKED_ASSERT(uc);
 	TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
 }
 
 /*
  * Lock a PI mutex.
  *
  * td      - the calling thread; its tid is the value stored in m_owner.
  * m       - userland mutex; every access may fault.
  * flags   - mutex flags; UMUTEX_ROBUST selects the robust key type.
  * timeout - optional timeout, or NULL.
  * try     - when non-zero, fail with EBUSY instead of sleeping.
  *
  * Returns 0 on success, EOWNERDEAD when the lock was taken over from a
  * dead owner, or an error: EFAULT, ENOTRECOVERABLE, EDEADLK (caller
  * already owns it), EBUSY, EPERM from umtx_pi_claim(), or whatever
  * umtx_key_get(), umtxq_check_susp() or the sleep reported.
  */
 static int
 do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags,
     struct _umtx_time *timeout, int try)
 {
 	struct abs_timeout timo;
 	struct umtx_q *uq;
 	struct umtx_pi *pi, *new_pi;
 	uint32_t id, old_owner, owner, old;
 	int error, rv;
 
 	id = td->td_tid;
 	uq = td->td_umtxq;
 
 	if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ?
 	    TYPE_PI_ROBUST_UMUTEX : TYPE_PI_UMUTEX, GET_SHARE(flags),
 	    &uq->uq_key)) != 0)
 		return (error);
 
 	if (timeout != NULL)
 		abs_timeout_init2(&timo, timeout);
 
 	/*
 	 * Find the umtx_pi for this key or create one.  A non-sleeping
 	 * allocation is tried first under the chain lock; if it fails the
 	 * lock is dropped for a sleeping allocation and the lookup is
 	 * repeated in case another thread inserted one meanwhile.
 	 */
 	umtxq_lock(&uq->uq_key);
 	pi = umtx_pi_lookup(&uq->uq_key);
 	if (pi == NULL) {
 		new_pi = umtx_pi_alloc(M_NOWAIT);
 		if (new_pi == NULL) {
 			umtxq_unlock(&uq->uq_key);
 			new_pi = umtx_pi_alloc(M_WAITOK);
 			umtxq_lock(&uq->uq_key);
 			pi = umtx_pi_lookup(&uq->uq_key);
 			if (pi != NULL) {
 				umtx_pi_free(new_pi);
 				new_pi = NULL;
 			}
 		}
 		if (new_pi != NULL) {
 			new_pi->pi_key = uq->uq_key;
 			umtx_pi_insert(new_pi);
 			pi = new_pi;
 		}
 	}
 	/* Hold a reference for the duration of this call. */
 	umtx_pi_ref(pi);
 	umtxq_unlock(&uq->uq_key);
 
 	/*
 	 * Care must be exercised when dealing with umtx structure.  It
 	 * can fault on any access.
 	 */
 	for (;;) {
 		/*
 		 * Try the uncontested case.  This should be done in userland.
 		 */
 		rv = casueword32(&m->m_owner, UMUTEX_UNOWNED, &owner, id);
 		/* The address was invalid. */
 		if (rv == -1) {
 			error = EFAULT;
 			break;
 		}
 
 		/* The acquire succeeded. */
 		if (owner == UMUTEX_UNOWNED) {
 			error = 0;
 			break;
 		}
 
 		if (owner == UMUTEX_RB_NOTRECOV) {
 			error = ENOTRECOVERABLE;
 			break;
 		}
 
 		/* If no one owns it but it is contested try to acquire it. */
 		if (owner == UMUTEX_CONTESTED || owner == UMUTEX_RB_OWNERDEAD) {
 			old_owner = owner;
 			rv = casueword32(&m->m_owner, owner, &owner,
 			    id | UMUTEX_CONTESTED);
 			/* The address was invalid. */
 			if (rv == -1) {
 				error = EFAULT;
 				break;
 			}
 
 			if (owner == old_owner) {
 				/*
 				 * The lock word is ours; record ownership
 				 * in the kernel-side umtx_pi as well.
 				 */
 				umtxq_lock(&uq->uq_key);
 				umtxq_busy(&uq->uq_key);
 				error = umtx_pi_claim(pi, td);
 				umtxq_unbusy(&uq->uq_key);
 				umtxq_unlock(&uq->uq_key);
 				if (error != 0) {
 					/*
 					 * Since we're going to return an
 					 * error, restore the m_owner to its
 					 * previous, unowned state to avoid
 					 * compounding the problem.
 					 */
 					(void)casuword32(&m->m_owner,
 					    id | UMUTEX_CONTESTED,
 					    old_owner);
 				}
 				if (error == 0 &&
 				    old_owner == UMUTEX_RB_OWNERDEAD)
 					error = EOWNERDEAD;
 				break;
 			}
 
 			error = umtxq_check_susp(td);
 			if (error != 0)
 				break;
 
 			/* If this failed the lock has changed, restart. */
 			continue;
 		}
 
 		/* The lock word already carries our own tid. */
 		if ((owner & ~UMUTEX_CONTESTED) == id) {
 			error = EDEADLK;
 			break;
 		}
 
 		if (try != 0) {
 			error = EBUSY;
 			break;
 		}
 
 		/*
 		 * If we caught a signal, we have retried and now
 		 * exit immediately.
 		 */
 		if (error != 0)
 			break;
 			
 		/* Mark the chain busy before touching the lock word. */
 		umtxq_lock(&uq->uq_key);
 		umtxq_busy(&uq->uq_key);
 		umtxq_unlock(&uq->uq_key);
 
 		/*
 		 * Set the contested bit so that a release in user space
 		 * knows to use the system call for unlock.  If this fails
 		 * either some one else has acquired the lock or it has been
 		 * released.
 		 */
 		rv = casueword32(&m->m_owner, owner, &old, owner |
 		    UMUTEX_CONTESTED);
 
 		/* The address was invalid. */
 		if (rv == -1) {
 			umtxq_unbusy_unlocked(&uq->uq_key);
 			error = EFAULT;
 			break;
 		}
 
 		umtxq_lock(&uq->uq_key);
 		/*
 		 * We set the contested bit, sleep. Otherwise the lock changed
 		 * and we need to retry or we lost a race to the thread
 		 * unlocking the umtx.  Note that the UMUTEX_RB_OWNERDEAD
 		 * value for owner is impossible there.
 		 */
 		if (old == owner) {
 			/*
 			 * umtxq_sleep_pi() unbusies and unlocks the chain
 			 * before it returns.
 			 */
 			error = umtxq_sleep_pi(uq, pi,
 			    owner & ~UMUTEX_CONTESTED,
 			    "umtxpi", timeout == NULL ? NULL : &timo,
 			    (flags & USYNC_PROCESS_SHARED) != 0);
 			/*
 			 * On a sleep error make one more pass; the
 			 * "caught a signal" check above then exits.
 			 */
 			if (error != 0)
 				continue;
 		} else {
 			umtxq_unbusy(&uq->uq_key);
 			umtxq_unlock(&uq->uq_key);
 		}
 
 		error = umtxq_check_susp(td);
 		if (error != 0)
 			break;
 	}
 
 	/* Drop the reference taken above; this may free the umtx_pi. */
 	umtxq_lock(&uq->uq_key);
 	umtx_pi_unref(pi);
 	umtxq_unlock(&uq->uq_key);
 
 	umtx_key_release(&uq->uq_key);
 	return (error);
 }
 
 /*
  * Unlock a PI mutex.
  *
  * td    - the calling thread, which must be the owner named in m_owner.
  * m     - userland mutex; every access may fault.
  * flags - mutex flags; UMUTEX_ROBUST selects the robust key type.
  * rb    - forwarded to umtx_unlock_val() to choose the value stored in
  *         m_owner; also relaxes the kernel-side owner check below.
  *
  * Returns 0 on success, EFAULT on a bad address, EPERM if the caller is
  * not the owner, EINVAL if the lock word changed underneath us, or the
  * error from umtx_key_get().
  */
 static int
 do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags, bool rb)
 {
 	struct umtx_key key;
 	struct umtx_q *uq_first, *uq_first2, *uq_me;
 	struct umtx_pi *pi, *pi2;
 	uint32_t id, new_owner, old, owner;
 	int count, error, pri;
 
 	id = td->td_tid;
 	/*
 	 * Make sure we own this mtx.
 	 */
 	error = fueword32(&m->m_owner, &owner);
 	if (error == -1)
 		return (EFAULT);
 
 	if ((owner & ~UMUTEX_CONTESTED) != id)
 		return (EPERM);
 
 	new_owner = umtx_unlock_val(flags, rb);
 
 	/* This should be done in userland */
 	if ((owner & UMUTEX_CONTESTED) == 0) {
 		error = casueword32(&m->m_owner, owner, &old, new_owner);
 		if (error == -1)
 			return (EFAULT);
 		if (old == owner)
 			return (0);
 		/* The word changed under us; continue with the fresh value. */
 		owner = old;
 	}
 
 	/* We should only ever be in here for contested locks */
 	if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ?
 	    TYPE_PI_ROBUST_UMUTEX : TYPE_PI_UMUTEX, GET_SHARE(flags),
 	    &key)) != 0)
 		return (error);
 
 	umtxq_lock(&key);
 	umtxq_busy(&key);
 	count = umtxq_count_pi(&key, &uq_first);
 	if (uq_first != NULL) {
 		mtx_lock(&umtx_lock);
 		pi = uq_first->uq_pi_blocked;
 		KASSERT(pi != NULL, ("pi == NULL?"));
 		/*
 		 * The kernel-side owner must be us; an unowned umtx_pi is
 		 * tolerated only when rb is set.
 		 */
 		if (pi->pi_owner != td && !(rb && pi->pi_owner == NULL)) {
 			mtx_unlock(&umtx_lock);
 			umtxq_unbusy(&key);
 			umtxq_unlock(&key);
 			umtx_key_release(&key);
 			/* userland messed the mutex */
 			return (EPERM);
 		}
 		uq_me = td->td_umtxq;
 		if (pi->pi_owner == td)
 			umtx_pi_disown(pi);
 		/* get highest priority thread which is still sleeping. */
 		uq_first = TAILQ_FIRST(&pi->pi_blocked);
 		while (uq_first != NULL && 
 		    (uq_first->uq_flags & UQF_UMTXQ) == 0) {
 			uq_first = TAILQ_NEXT(uq_first, uq_lockq);
 		}
 		/*
 		 * Recompute our lent priority from the first waiter of
 		 * each PI mutex we still own.
 		 */
 		pri = PRI_MAX;
 		TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
 			uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
 			if (uq_first2 != NULL) {
 				if (pri > UPRI(uq_first2->uq_thread))
 					pri = UPRI(uq_first2->uq_thread);
 			}
 		}
 		thread_lock(td);
 		sched_lend_user_prio(td, pri);
 		thread_unlock(td);
 		mtx_unlock(&umtx_lock);
 		if (uq_first)
 			umtxq_signal_thread(uq_first);
 	} else {
 		pi = umtx_pi_lookup(&key);
 		/*
 		 * A umtx_pi can exist if a signal or timeout removed the
 		 * last waiter from the umtxq, but there is still
 		 * a thread in do_lock_pi() holding the umtx_pi.
 		 */
 		if (pi != NULL) {
 			/*
 			 * The umtx_pi can be unowned, such as when a thread
 			 * has just entered do_lock_pi(), allocated the
 			 * umtx_pi, and unlocked the umtxq.
 			 * If the current thread owns it, it must disown it.
 			 */
 			mtx_lock(&umtx_lock);
 			if (pi->pi_owner == td)
 				umtx_pi_disown(pi);
 			mtx_unlock(&umtx_lock);
 		}
 	}
 	/* The chain stays busy across the userland store below. */
 	umtxq_unlock(&key);
 
 	/*
 	 * When unlocking the umtx, it must be marked as unowned if
 	 * there is zero or one thread only waiting for it.
 	 * Otherwise, it must be marked as contested.
 	 */
 
 	if (count > 1)
 		new_owner |= UMUTEX_CONTESTED;
 	error = casueword32(&m->m_owner, owner, &old, new_owner);
 
 	umtxq_unbusy_unlocked(&key);
 	umtx_key_release(&key);
 	if (error == -1)
 		return (EFAULT);
 	if (old != owner)
 		return (EINVAL);
 	return (0);
 }
 
 /*
  * Lock a PP mutex.
  *
  * td      - the calling thread; its tid is the value stored in m_owner.
  * m       - userland mutex; m_ceilings[0] holds the priority ceiling.
  * flags   - mutex flags; UMUTEX_ROBUST selects the robust key type.
  * timeout - optional timeout, or NULL.
  * try     - when non-zero, fail with EBUSY instead of sleeping.
  *
  * Returns 0 on success, EOWNERDEAD when the lock was taken over from a
  * dead owner, or an error: EFAULT, EINVAL (bad ceiling, or the thread's
  * UPRI value is below the ceiling), ENOTRECOVERABLE, EBUSY, or whatever
  * umtx_key_get() or the sleep reported.
  */
 static int
 do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags,
     struct _umtx_time *timeout, int try)
 {
 	struct abs_timeout timo;
 	struct umtx_q *uq, *uq2;
 	struct umtx_pi *pi;
 	uint32_t ceiling;
 	uint32_t owner, id;
 	int error, pri, old_inherited_pri, su, rv;
 
 	id = td->td_tid;
 	uq = td->td_umtxq;
 	if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ?
 	    TYPE_PP_ROBUST_UMUTEX : TYPE_PP_UMUTEX, GET_SHARE(flags),
 	    &uq->uq_key)) != 0)
 		return (error);
 
 	if (timeout != NULL)
 		abs_timeout_init2(&timo, timeout);
 
 	/* Only callers holding PRIV_SCHED_RTPRIO get their priority raised. */
 	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
 	for (;;) {
 		/* Remembered so it can be restored if we fail or must sleep. */
 		old_inherited_pri = uq->uq_inherited_pri;
 		umtxq_lock(&uq->uq_key);
 		umtxq_busy(&uq->uq_key);
 		umtxq_unlock(&uq->uq_key);
 
 		rv = fueword32(&m->m_ceilings[0], &ceiling);
 		if (rv == -1) {
 			error = EFAULT;
 			goto out;
 		}
 		/*
 		 * ceiling is unsigned, so a user value above RTP_PRIO_MAX
 		 * wraps around and is caught by the range check.
 		 */
 		ceiling = RTP_PRIO_MAX - ceiling;
 		if (ceiling > RTP_PRIO_MAX) {
 			error = EINVAL;
 			goto out;
 		}
 
 		mtx_lock(&umtx_lock);
 		if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
 			mtx_unlock(&umtx_lock);
 			error = EINVAL;
 			goto out;
 		}
 		if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
 			uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
 			thread_lock(td);
 			if (uq->uq_inherited_pri < UPRI(td))
 				sched_lend_user_prio(td, uq->uq_inherited_pri);
 			thread_unlock(td);
 		}
 		mtx_unlock(&umtx_lock);
 
 		/*
 		 * The unlocked state compared against here is
 		 * UMUTEX_CONTESTED, not UMUTEX_UNOWNED.
 		 */
 		rv = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner,
 		    id | UMUTEX_CONTESTED);
 		/* The address was invalid. */
 		if (rv == -1) {
 			error = EFAULT;
 			break;
 		}
 
 		if (owner == UMUTEX_CONTESTED) {
 			error = 0;
 			break;
 		} else if (owner == UMUTEX_RB_OWNERDEAD) {
 			rv = casueword32(&m->m_owner, UMUTEX_RB_OWNERDEAD,
 			    &owner, id | UMUTEX_CONTESTED);
 			if (rv == -1) {
 				error = EFAULT;
 				break;
 			}
 			if (owner == UMUTEX_RB_OWNERDEAD) {
 				error = EOWNERDEAD; /* success */
 				break;
 			}
 			error = 0;
 		} else if (owner == UMUTEX_RB_NOTRECOV) {
 			error = ENOTRECOVERABLE;
 			break;
 		}
 
 		if (try != 0) {
 			error = EBUSY;
 			break;
 		}
 
 		/*
 		 * If we caught a signal, we have retried and now
 		 * exit immediately.
 		 */
 		if (error != 0)
 			break;
 
 		umtxq_lock(&uq->uq_key);
 		umtxq_insert(uq);
 		umtxq_unbusy(&uq->uq_key);
 		error = umtxq_sleep(uq, "umtxpp", timeout == NULL ?
 		    NULL : &timo);
 		umtxq_remove(uq);
 		umtxq_unlock(&uq->uq_key);
 
 		/*
 		 * We did not get the lock on this pass: restore the saved
 		 * inherited priority and recompute the lent priority from
 		 * the PI mutexes we own.
 		 */
 		mtx_lock(&umtx_lock);
 		uq->uq_inherited_pri = old_inherited_pri;
 		pri = PRI_MAX;
 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
 			if (uq2 != NULL) {
 				if (pri > UPRI(uq2->uq_thread))
 					pri = UPRI(uq2->uq_thread);
 			}
 		}
 		if (pri > uq->uq_inherited_pri)
 			pri = uq->uq_inherited_pri;
 		thread_lock(td);
 		sched_lend_user_prio(td, pri);
 		thread_unlock(td);
 		mtx_unlock(&umtx_lock);
 	}
 
 	/* Failed without acquiring the lock: undo the priority change. */
 	if (error != 0 && error != EOWNERDEAD) {
 		mtx_lock(&umtx_lock);
 		uq->uq_inherited_pri = old_inherited_pri;
 		pri = PRI_MAX;
 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
 			if (uq2 != NULL) {
 				if (pri > UPRI(uq2->uq_thread))
 					pri = UPRI(uq2->uq_thread);
 			}
 		}
 		if (pri > uq->uq_inherited_pri)
 			pri = uq->uq_inherited_pri;
 		thread_lock(td);
 		sched_lend_user_prio(td, pri);
 		thread_unlock(td);
 		mtx_unlock(&umtx_lock);
 	}
 
 out:
 	umtxq_unbusy_unlocked(&uq->uq_key);
 	umtx_key_release(&uq->uq_key);
 	return (error);
 }
 
 /*
  * Unlock a PP mutex.
  *
  * td    - the calling thread, which must be the owner named in m_owner.
  * m     - userland mutex; m_ceilings[1] supplies the ceiling from which
  *         the new inherited priority is derived (-1 selects PRI_MAX).
  * flags - mutex flags; UMUTEX_ROBUST selects the robust key type.
  * rb    - forwarded to umtx_unlock_val() to choose the value stored in
  *         m_owner.
  *
  * Returns 0 on success, EFAULT on a bad address, EPERM if the caller is
  * not the owner, EINVAL for an out-of-range ceiling, or the error from
  * copyin() or umtx_key_get().
  */
 static int
 do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags, bool rb)
 {
 	struct umtx_key key;
 	struct umtx_q *uq, *uq2;
 	struct umtx_pi *pi;
 	uint32_t id, owner, rceiling;
 	int error, pri, new_inherited_pri, su;
 
 	id = td->td_tid;
 	uq = td->td_umtxq;
 	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
 
 	/*
 	 * Make sure we own this mtx.
 	 */
 	error = fueword32(&m->m_owner, &owner);
 	if (error == -1)
 		return (EFAULT);
 
 	if ((owner & ~UMUTEX_CONTESTED) != id)
 		return (EPERM);
 
 	error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
 	if (error != 0)
 		return (error);
 
 	/* rceiling is unsigned; -1 here matches the all-ones value. */
 	if (rceiling == -1)
 		new_inherited_pri = PRI_MAX;
 	else {
 		rceiling = RTP_PRIO_MAX - rceiling;
 		if (rceiling > RTP_PRIO_MAX)
 			return (EINVAL);
 		new_inherited_pri = PRI_MIN_REALTIME + rceiling;
 	}
 
 	if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ?
 	    TYPE_PP_ROBUST_UMUTEX : TYPE_PP_UMUTEX, GET_SHARE(flags),
 	    &key)) != 0)
 		return (error);
 	umtxq_lock(&key);
 	umtxq_busy(&key);
 	umtxq_unlock(&key);
 	/*
 	 * For priority protected mutex, always set unlocked state
 	 * to UMUTEX_CONTESTED, so that userland always enters kernel
 	 * to lock the mutex, it is necessary because thread priority
 	 * has to be adjusted for such mutex.
 	 */
 	error = suword32(&m->m_owner, umtx_unlock_val(flags, rb) |
 	    UMUTEX_CONTESTED);
 
 	umtxq_lock(&key);
 	if (error == 0)
 		umtxq_signal(&key, 1);
 	umtxq_unbusy(&key);
 	umtxq_unlock(&key);
 
 	if (error == -1)
 		error = EFAULT;
 	else {
 		/*
 		 * Install the new inherited priority (privileged callers
 		 * only) and recompute the lent priority from the PI
 		 * mutexes we still own.
 		 */
 		mtx_lock(&umtx_lock);
 		if (su != 0)
 			uq->uq_inherited_pri = new_inherited_pri;
 		pri = PRI_MAX;
 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
 			if (uq2 != NULL) {
 				if (pri > UPRI(uq2->uq_thread))
 					pri = UPRI(uq2->uq_thread);
 			}
 		}
 		if (pri > uq->uq_inherited_pri)
 			pri = uq->uq_inherited_pri;
 		thread_lock(td);
 		sched_lend_user_prio(td, pri);
 		thread_unlock(td);
 		mtx_unlock(&umtx_lock);
 	}
 	umtx_key_release(&key);
 	return (error);
 }
 
 /*
  * Change the priority ceiling (m_ceilings[0]) of a PP mutex.
  *
  * td          - the calling thread.
  * m           - userland mutex; must have UMUTEX_PRIO_PROTECT set.
  * ceiling     - new ceiling, at most RTP_PRIO_MAX.
  * old_ceiling - if not NULL, userland address that receives the previous
  *               ceiling on success.
  *
  * The ceiling is written either while briefly holding the mutex on the
  * caller's behalf or, if the caller already owns it, directly.  When
  * another thread owns the mutex the caller sleeps and retries.
  *
  * Returns 0 on success, EFAULT, EINVAL, EOWNERDEAD, ENOTRECOVERABLE, or
  * the error from umtx_key_get() or the sleep.
  */
 static int
 do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
     uint32_t *old_ceiling)
 {
 	struct umtx_q *uq;
 	uint32_t flags, id, owner, save_ceiling;
 	int error, rv, rv1;
 
 	error = fueword32(&m->m_flags, &flags);
 	if (error == -1)
 		return (EFAULT);
 	if ((flags & UMUTEX_PRIO_PROTECT) == 0)
 		return (EINVAL);
 	if (ceiling > RTP_PRIO_MAX)
 		return (EINVAL);
 	id = td->td_tid;
 	uq = td->td_umtxq;
 	if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ?
 	    TYPE_PP_ROBUST_UMUTEX : TYPE_PP_UMUTEX, GET_SHARE(flags),
 	    &uq->uq_key)) != 0)
 		return (error);
 	for (;;) {
 		umtxq_lock(&uq->uq_key);
 		umtxq_busy(&uq->uq_key);
 		umtxq_unlock(&uq->uq_key);
 
 		rv = fueword32(&m->m_ceilings[0], &save_ceiling);
 		if (rv == -1) {
 			error = EFAULT;
 			break;
 		}
 
 		/* Try to take the mutex from its unlocked (CONTESTED) state. */
 		rv = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner,
 		    id | UMUTEX_CONTESTED);
 		if (rv == -1) {
 			error = EFAULT;
 			break;
 		}
 
 		if (owner == UMUTEX_CONTESTED) {
 			/* Got it: store the ceiling, then release again. */
 			rv = suword32(&m->m_ceilings[0], ceiling);
 			rv1 = suword32(&m->m_owner, UMUTEX_CONTESTED);
 			error = (rv == 0 && rv1 == 0) ? 0: EFAULT;
 			break;
 		}
 
 		/* We already own the mutex: just store the ceiling. */
 		if ((owner & ~UMUTEX_CONTESTED) == id) {
 			rv = suword32(&m->m_ceilings[0], ceiling);
 			error = rv == 0 ? 0 : EFAULT;
 			break;
 		}
 
 		if (owner == UMUTEX_RB_OWNERDEAD) {
 			error = EOWNERDEAD;
 			break;
 		} else if (owner == UMUTEX_RB_NOTRECOV) {
 			error = ENOTRECOVERABLE;
 			break;
 		}
 
 		/*
 		 * If we caught a signal, we have retried and now
 		 * exit immediately.
 		 */
 		if (error != 0)
 			break;
 
 		/*
 		 * We set the contested bit, sleep. Otherwise the lock changed
 		 * and we need to retry or we lost a race to the thread
 		 * unlocking the umtx.
 		 */
 		umtxq_lock(&uq->uq_key);
 		umtxq_insert(uq);
 		umtxq_unbusy(&uq->uq_key);
 		error = umtxq_sleep(uq, "umtxpp", NULL);
 		umtxq_remove(uq);
 		umtxq_unlock(&uq->uq_key);
 	}
 	/* On success wake every waiter queued on this key. */
 	umtxq_lock(&uq->uq_key);
 	if (error == 0)
 		umtxq_signal(&uq->uq_key, INT_MAX);
 	umtxq_unbusy(&uq->uq_key);
 	umtxq_unlock(&uq->uq_key);
 	umtx_key_release(&uq->uq_key);
 	if (error == 0 && old_ceiling != NULL) {
 		rv = suword32(old_ceiling, save_ceiling);
 		error = rv == 0 ? 0 : EFAULT;
 	}
 	return (error);
 }
 
 /*
  * Lock a userland POSIX mutex.
  */
 static int
 do_lock_umutex(struct thread *td, struct umutex *m,
     struct _umtx_time *timeout, int mode)
 {
 	uint32_t flags;
 	int error;
 
 	error = fueword32(&m->m_flags, &flags);
 	if (error == -1)
 		return (EFAULT);
 
 	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
 	case 0:
 		error = do_lock_normal(td, m, flags, timeout, mode);
 		break;
 	case UMUTEX_PRIO_INHERIT:
 		error = do_lock_pi(td, m, flags, timeout, mode);
 		break;
 	case UMUTEX_PRIO_PROTECT:
 		error = do_lock_pp(td, m, flags, timeout, mode);
 		break;
 	default:
 		return (EINVAL);
 	}
 	if (timeout == NULL) {
 		if (error == EINTR && mode != _UMUTEX_WAIT)
 			error = ERESTART;
 	} else {
 		/* Timed-locking is not restarted. */
 		if (error == ERESTART)
 			error = EINTR;
 	}
 	return (error);
 }
 
 /*
  * Unlock a userland POSIX mutex.
  */
 static int
 do_unlock_umutex(struct thread *td, struct umutex *m, bool rb)
 {
 	uint32_t flags;
 	int error;
 
 	error = fueword32(&m->m_flags, &flags);
 	if (error == -1)
 		return (EFAULT);
 
 	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
 	case 0:
 		return (do_unlock_normal(td, m, flags, rb));
 	case UMUTEX_PRIO_INHERIT:
 		return (do_unlock_pi(td, m, flags, rb));
 	case UMUTEX_PRIO_PROTECT:
 		return (do_unlock_pp(td, m, flags, rb));
 	}
 
 	return (EINVAL);
 }
 
 /*
  * Wait on a userland condition variable.
  *
  * td      - the calling thread.
  * cv      - userland condition variable.
  * m       - userland mutex to release before sleeping.
  * timeout - optional timeout, or NULL.
  * wflags  - CVWAIT_CLOCKID takes the clock from cv->c_clockid instead of
  *           CLOCK_REALTIME; CVWAIT_ABSTIME is passed on to
  *           abs_timeout_init().
  *
  * The thread is queued and c_has_waiters is set before the mutex is
  * unlocked.  Returns 0 if the thread was removed from the queue by
  * someone else by the time it checks, otherwise the error from unlocking
  * or sleeping, with ERESTART mapped to EINTR.  Early failures return
  * EFAULT, EINVAL or the umtx_key_get() error.
  */
 static int
 do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m,
     struct timespec *timeout, u_long wflags)
 {
 	struct abs_timeout timo;
 	struct umtx_q *uq;
 	uint32_t flags, clockid, hasw;
 	int error;
 
 	uq = td->td_umtxq;
 	error = fueword32(&cv->c_flags, &flags);
 	if (error == -1)
 		return (EFAULT);
 	error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key);
 	if (error != 0)
 		return (error);
 
 	if ((wflags & CVWAIT_CLOCKID) != 0) {
 		error = fueword32(&cv->c_clockid, &clockid);
 		if (error == -1) {
 			umtx_key_release(&uq->uq_key);
 			return (EFAULT);
 		}
 		if (clockid < CLOCK_REALTIME ||
 		    clockid >= CLOCK_THREAD_CPUTIME_ID) {
 			/* hmm, only HW clock id will work. */
 			umtx_key_release(&uq->uq_key);
 			return (EINVAL);
 		}
 	} else {
 		clockid = CLOCK_REALTIME;
 	}
 
 	/* Queue ourselves before the mutex is released. */
 	umtxq_lock(&uq->uq_key);
 	umtxq_busy(&uq->uq_key);
 	umtxq_insert(uq);
 	umtxq_unlock(&uq->uq_key);
 
 	/*
 	 * Set c_has_waiters to 1 before releasing user mutex, also
 	 * don't modify cache line when unnecessary.
 	 */
 	error = fueword32(&cv->c_has_waiters, &hasw);
 	if (error == 0 && hasw == 0)
 		suword32(&cv->c_has_waiters, 1);
 
 	umtxq_unbusy_unlocked(&uq->uq_key);
 
 	error = do_unlock_umutex(td, m, false);
 
 	if (timeout != NULL)
 		abs_timeout_init(&timo, clockid, (wflags & CVWAIT_ABSTIME) != 0,
 		    timeout);
 	
 	umtxq_lock(&uq->uq_key);
 	/* Sleep only if the mutex was released successfully. */
 	if (error == 0) {
 		error = umtxq_sleep(uq, "ucond", timeout == NULL ?
 		    NULL : &timo);
 	}
 
 	/* No longer on the queue: someone dequeued us, report success. */
 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
 		error = 0;
 	else {
 		/*
 		 * This must be a timeout, an interruption by a signal or
 		 * a spurious wakeup; clear the c_has_waiters flag when
 		 * necessary.
 		 */
 		umtxq_busy(&uq->uq_key);
 		/* Re-check: umtxq_busy() may have slept. */
 		if ((uq->uq_flags & UQF_UMTXQ) != 0) {
 			int oldlen = uq->uq_cur_queue->length;
 			umtxq_remove(uq);
 			/* We were the last queued waiter. */
 			if (oldlen == 1) {
 				umtxq_unlock(&uq->uq_key);
 				suword32(&cv->c_has_waiters, 0);
 				umtxq_lock(&uq->uq_key);
 			}
 		}
 		umtxq_unbusy(&uq->uq_key);
 		if (error == ERESTART)
 			error = EINTR;
 	}
 
 	umtxq_unlock(&uq->uq_key);
 	umtx_key_release(&uq->uq_key);
 	return (error);
 }
 
 /*
  * Signal a userland condition variable.
  */
 static int
 do_cv_signal(struct thread *td, struct ucond *cv)
 {
 	struct umtx_key key;
 	int error, cnt, nwake;
 	uint32_t flags;
 
 	error = fueword32(&cv->c_flags, &flags);
 	if (error == -1)
 		return (EFAULT);
 	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
 		return (error);	
 	umtxq_lock(&key);
 	umtxq_busy(&key);
 	cnt = umtxq_count(&key);
 	nwake = umtxq_signal(&key, 1);
 	if (cnt <= nwake) {
 		umtxq_unlock(&key);
 		error = suword32(&cv->c_has_waiters, 0);
 		if (error == -1)
 			error = EFAULT;
 		umtxq_lock(&key);
 	}
 	umtxq_unbusy(&key);
 	umtxq_unlock(&key);
 	umtx_key_release(&key);
 	return (error);
 }
 
 static int
 do_cv_broadcast(struct thread *td, struct ucond *cv)
 {
 	struct umtx_key key;
 	int error;
 	uint32_t flags;
 
 	error = fueword32(&cv->c_flags, &flags);
 	if (error == -1)
 		return (EFAULT);
 	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
 		return (error);	
 
 	umtxq_lock(&key);
 	umtxq_busy(&key);
 	umtxq_signal(&key, INT_MAX);
 	umtxq_unlock(&key);
 
 	error = suword32(&cv->c_has_waiters, 0);
 	if (error == -1)
 		error = EFAULT;
 
 	umtxq_unbusy_unlocked(&key);
 
 	umtx_key_release(&key);
 	return (error);
 }
 
 /*
  * Acquire a userland rwlock for reading.
  *
  * The fast path increments the reader count in rw_state with a CAS as
  * long as none of the bits in wrflags are set.  Otherwise the thread sets
  * URWLOCK_READ_WAITERS, bumps rw_blocked_readers and sleeps on the key's
  * queue until the blocking bits clear, then retries from the top.
  *
  * Returns 0 on success, EAGAIN when the reader count is already
  * URWLOCK_MAX_READERS, EFAULT when user memory cannot be accessed, or the
  * error from umtxq_check_susp()/umtxq_sleep() (ERESTART is reported as
  * EINTR).
  */
 static int
 do_rw_rdlock(struct thread *td, struct urwlock *rwlock, long fflag, struct _umtx_time *timeout)
 {
 	struct abs_timeout timo;
 	struct umtx_q *uq;
 	uint32_t flags, wrflags;
 	int32_t state, oldstate;
 	int32_t blocked_readers;
 	int error, error1, rv;
 
 	uq = td->td_umtxq;
 	error = fueword32(&rwlock->rw_flags, &flags);
 	if (error == -1)
 		return (EFAULT);
 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
 	if (error != 0)
 		return (error);
 
 	if (timeout != NULL)
 		abs_timeout_init2(&timo, timeout);
 
 	/*
 	 * A write owner always blocks readers; waiting writers block them
 	 * too unless reader preference was requested by either the caller
 	 * (fflag) or the lock itself (flags).
 	 */
 	wrflags = URWLOCK_WRITE_OWNER;
 	if (!(fflag & URWLOCK_PREFER_READER) && !(flags & URWLOCK_PREFER_READER))
 		wrflags |= URWLOCK_WRITE_WAITERS;
 
 	for (;;) {
 		rv = fueword32(&rwlock->rw_state, &state);
 		if (rv == -1) {
 			umtx_key_release(&uq->uq_key);
 			return (EFAULT);
 		}
 
 		/* try to lock it */
 		while (!(state & wrflags)) {
 			if (__predict_false(URWLOCK_READER_COUNT(state) == URWLOCK_MAX_READERS)) {
 				umtx_key_release(&uq->uq_key);
 				return (EAGAIN);
 			}
 			rv = casueword32(&rwlock->rw_state, state,
 			    &oldstate, state + 1);
 			if (rv == -1) {
 				umtx_key_release(&uq->uq_key);
 				return (EFAULT);
 			}
 			if (oldstate == state) {
 				umtx_key_release(&uq->uq_key);
 				return (0);
 			}
 			error = umtxq_check_susp(td);
 			if (error != 0)
 				break;
 			state = oldstate;
 		}
 
 		if (error)
 			break;
 
 		/* grab monitor lock */
 		umtxq_lock(&uq->uq_key);
 		umtxq_busy(&uq->uq_key);
 		umtxq_unlock(&uq->uq_key);
 
 		/*
 		 * re-read the state, in case it changed between the try-lock above
 		 * and the check below
 		 */
 		rv = fueword32(&rwlock->rw_state, &state);
 		if (rv == -1)
 			error = EFAULT;
 
 		/* set read contention bit */
 		while (error == 0 && (state & wrflags) &&
 		    !(state & URWLOCK_READ_WAITERS)) {
 			rv = casueword32(&rwlock->rw_state, state,
 			    &oldstate, state | URWLOCK_READ_WAITERS);
 			if (rv == -1) {
 				error = EFAULT;
 				break;
 			}
 			if (oldstate == state)
 				goto sleep;
 			state = oldstate;
 			error = umtxq_check_susp(td);
 			if (error != 0)
 				break;
 		}
 		if (error != 0) {
 			umtxq_unbusy_unlocked(&uq->uq_key);
 			break;
 		}
 
 		/* state is changed while setting flags, restart */
 		if (!(state & wrflags)) {
 			umtxq_unbusy_unlocked(&uq->uq_key);
 			error = umtxq_check_susp(td);
 			if (error != 0)
 				break;
 			continue;
 		}
 
 sleep:
 		/* contention bit is set, before sleeping, increase read waiter count */
 		rv = fueword32(&rwlock->rw_blocked_readers,
 		    &blocked_readers);
 		if (rv == -1) {
 			umtxq_unbusy_unlocked(&uq->uq_key);
 			error = EFAULT;
 			break;
 		}
 		/* NOTE(review): the result of this store is not checked. */
 		suword32(&rwlock->rw_blocked_readers, blocked_readers+1);
 
 		/* Sleep until none of the blocking bits remain set in rw_state. */
 		while (state & wrflags) {
 			umtxq_lock(&uq->uq_key);
 			umtxq_insert(uq);
 			umtxq_unbusy(&uq->uq_key);
 
 			error = umtxq_sleep(uq, "urdlck", timeout == NULL ?
 			    NULL : &timo);
 
 			umtxq_busy(&uq->uq_key);
 			umtxq_remove(uq);
 			umtxq_unlock(&uq->uq_key);
 			if (error)
 				break;
 			rv = fueword32(&rwlock->rw_state, &state);
 			if (rv == -1) {
 				error = EFAULT;
 				break;
 			}
 		}
 
 		/* decrease read waiter count, and may clear read contention bit */
 		rv = fueword32(&rwlock->rw_blocked_readers,
 		    &blocked_readers);
 		if (rv == -1) {
 			umtxq_unbusy_unlocked(&uq->uq_key);
 			error = EFAULT;
 			break;
 		}
 		/* NOTE(review): the result of this store is not checked. */
 		suword32(&rwlock->rw_blocked_readers, blocked_readers-1);
 		if (blocked_readers == 1) {
 			/* The count read above was 1: clear URWLOCK_READ_WAITERS. */
 			rv = fueword32(&rwlock->rw_state, &state);
 			if (rv == -1) {
 				umtxq_unbusy_unlocked(&uq->uq_key);
 				error = EFAULT;
 				break;
 			}
 			for (;;) {
 				rv = casueword32(&rwlock->rw_state, state,
 				    &oldstate, state & ~URWLOCK_READ_WAITERS);
 				if (rv == -1) {
 					error = EFAULT;
 					break;
 				}
 				if (oldstate == state)
 					break;
 				state = oldstate;
 				error1 = umtxq_check_susp(td);
 				if (error1 != 0) {
 					/* Keep an earlier error in preference to error1. */
 					if (error == 0)
 						error = error1;
 					break;
 				}
 			}
 		}
 
 		umtxq_unbusy_unlocked(&uq->uq_key);
 		if (error != 0)
 			break;
 	}
 	umtx_key_release(&uq->uq_key);
 	if (error == ERESTART)
 		error = EINTR;
 	return (error);
 }
 
 /*
  * Acquire a userland rwlock for writing.
  *
  * The fast path sets URWLOCK_WRITE_OWNER with a CAS while there is no
  * write owner and the reader count is zero.  Otherwise the thread sets
  * URWLOCK_WRITE_WAITERS, bumps rw_blocked_writers and sleeps on the
  * exclusive queue, then retries from the top.
  *
  * Returns 0 on success, EFAULT when user memory cannot be accessed, or
  * the error from umtxq_check_susp()/umtxq_sleep() (ERESTART is reported
  * as EINTR).
  */
 static int
 do_rw_wrlock(struct thread *td, struct urwlock *rwlock, struct _umtx_time *timeout)
 {
 	struct abs_timeout timo;
 	struct umtx_q *uq;
 	uint32_t flags;
 	int32_t state, oldstate;
 	int32_t blocked_writers;
 	int32_t blocked_readers;
 	int error, error1, rv;
 
 	uq = td->td_umtxq;
 	error = fueword32(&rwlock->rw_flags, &flags);
 	if (error == -1)
 		return (EFAULT);
 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
 	if (error != 0)
 		return (error);
 
 	if (timeout != NULL)
 		abs_timeout_init2(&timo, timeout);
 
 	blocked_readers = 0;
 	for (;;) {
 		rv = fueword32(&rwlock->rw_state, &state);
 		if (rv == -1) {
 			umtx_key_release(&uq->uq_key);
 			return (EFAULT);
 		}
 		/* Try to take ownership while the lock is completely free. */
 		while (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
 			rv = casueword32(&rwlock->rw_state, state,
 			    &oldstate, state | URWLOCK_WRITE_OWNER);
 			if (rv == -1) {
 				umtx_key_release(&uq->uq_key);
 				return (EFAULT);
 			}
 			if (oldstate == state) {
 				umtx_key_release(&uq->uq_key);
 				return (0);
 			}
 			state = oldstate;
 			error = umtxq_check_susp(td);
 			if (error != 0)
 				break;
 		}
 
 		/*
 		 * An error is pending, either from the previous iteration's
 		 * sleep or from umtxq_check_susp() above.  If no writer owns
 		 * or waits for the lock and the blocked_readers value read on
 		 * the previous iteration is non-zero, signal the shared queue
 		 * before giving up.
 		 */
 		if (error) {
 			if (!(state & (URWLOCK_WRITE_OWNER|URWLOCK_WRITE_WAITERS)) &&
 			    blocked_readers != 0) {
 				umtxq_lock(&uq->uq_key);
 				umtxq_busy(&uq->uq_key);
 				umtxq_signal_queue(&uq->uq_key, INT_MAX, UMTX_SHARED_QUEUE);
 				umtxq_unbusy(&uq->uq_key);
 				umtxq_unlock(&uq->uq_key);
 			}
 
 			break;
 		}
 
 		/* grab monitor lock */
 		umtxq_lock(&uq->uq_key);
 		umtxq_busy(&uq->uq_key);
 		umtxq_unlock(&uq->uq_key);
 
 		/*
 		 * re-read the state, in case it changed between the try-lock above
 		 * and the check below
 		 */
 		rv = fueword32(&rwlock->rw_state, &state);
 		if (rv == -1)
 			error = EFAULT;
 
 		/* Set the write contention bit while the lock is still held. */
 		while (error == 0 && ((state & URWLOCK_WRITE_OWNER) ||
 		    URWLOCK_READER_COUNT(state) != 0) &&
 		    (state & URWLOCK_WRITE_WAITERS) == 0) {
 			rv = casueword32(&rwlock->rw_state, state,
 			    &oldstate, state | URWLOCK_WRITE_WAITERS);
 			if (rv == -1) {
 				error = EFAULT;
 				break;
 			}
 			if (oldstate == state)
 				goto sleep;
 			state = oldstate;
 			error = umtxq_check_susp(td);
 			if (error != 0)
 				break;
 		}
 		if (error != 0) {
 			umtxq_unbusy_unlocked(&uq->uq_key);
 			break;
 		}
 
 		/* The lock became free in the meantime: restart the try-lock. */
 		if (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
 			umtxq_unbusy_unlocked(&uq->uq_key);
 			error = umtxq_check_susp(td);
 			if (error != 0)
 				break;
 			continue;
 		}
 sleep:
 		/* Account for this thread in rw_blocked_writers before sleeping. */
 		rv = fueword32(&rwlock->rw_blocked_writers,
 		    &blocked_writers);
 		if (rv == -1) {
 			umtxq_unbusy_unlocked(&uq->uq_key);
 			error = EFAULT;
 			break;
 		}
 		/* NOTE(review): the result of this store is not checked. */
 		suword32(&rwlock->rw_blocked_writers, blocked_writers+1);
 
 		while ((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) {
 			umtxq_lock(&uq->uq_key);
 			umtxq_insert_queue(uq, UMTX_EXCLUSIVE_QUEUE);
 			umtxq_unbusy(&uq->uq_key);
 
 			error = umtxq_sleep(uq, "uwrlck", timeout == NULL ?
 			    NULL : &timo);
 
 			umtxq_busy(&uq->uq_key);
 			umtxq_remove_queue(uq, UMTX_EXCLUSIVE_QUEUE);
 			umtxq_unlock(&uq->uq_key);
 			if (error)
 				break;
 			rv = fueword32(&rwlock->rw_state, &state);
 			if (rv == -1) {
 				error = EFAULT;
 				break;
 			}
 		}
 
 		/* Drop this thread from rw_blocked_writers again. */
 		rv = fueword32(&rwlock->rw_blocked_writers,
 		    &blocked_writers);
 		if (rv == -1) {
 			umtxq_unbusy_unlocked(&uq->uq_key);
 			error = EFAULT;
 			break;
 		}
 		/* NOTE(review): the result of this store is not checked. */
 		suword32(&rwlock->rw_blocked_writers, blocked_writers-1);
 		if (blocked_writers == 1) {
 			/* The count read above was 1: clear URWLOCK_WRITE_WAITERS. */
 			rv = fueword32(&rwlock->rw_state, &state);
 			if (rv == -1) {
 				umtxq_unbusy_unlocked(&uq->uq_key);
 				error = EFAULT;
 				break;
 			}
 			for (;;) {
 				rv = casueword32(&rwlock->rw_state, state,
 				    &oldstate, state & ~URWLOCK_WRITE_WAITERS);
 				if (rv == -1) {
 					error = EFAULT;
 					break;
 				}
 				if (oldstate == state)
 					break;
 				state = oldstate;
 				error1 = umtxq_check_susp(td);
 				/*
 				 * We are leaving the URWLOCK_WRITE_WAITERS
 				 * behind, but this should not harm the
 				 * correctness.
 				 */
 				if (error1 != 0) {
 					if (error == 0)
 						error = error1;
 					break;
 				}
 			}
 			/*
 			 * Remember the blocked reader count; the error path at
 			 * the top of the loop consults it.
 			 */
 			rv = fueword32(&rwlock->rw_blocked_readers,
 			    &blocked_readers);
 			if (rv == -1) {
 				umtxq_unbusy_unlocked(&uq->uq_key);
 				error = EFAULT;
 				break;
 			}
 		} else
 			blocked_readers = 0;
 
 		umtxq_unbusy_unlocked(&uq->uq_key);
 	}
 
 	umtx_key_release(&uq->uq_key);
 	if (error == ERESTART)
 		error = EINTR;
 	return (error);
 }
 
 /*
  * Release a userland rwlock.
  *
  * If URWLOCK_WRITE_OWNER is set it is cleared; otherwise, if the reader
  * count is non-zero, it is decremented.  Afterwards waiters are signalled
  * according to the waiter bits in the last observed state and the lock's
  * URWLOCK_PREFER_READER flag.
  *
  * Returns 0 on success, EPERM when the lock is not held in the expected
  * mode, EFAULT when user memory cannot be accessed, or the error from
  * umtxq_check_susp().
  */
 static int
 do_rw_unlock(struct thread *td, struct urwlock *rwlock)
 {
 	struct umtx_q *uq;
 	uint32_t flags;
 	int32_t state, oldstate;
 	int error, rv, q, count;
 
 	uq = td->td_umtxq;
 	error = fueword32(&rwlock->rw_flags, &flags);
 	if (error == -1)
 		return (EFAULT);
 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
 	if (error != 0)
 		return (error);
 
 	error = fueword32(&rwlock->rw_state, &state);
 	if (error == -1) {
 		error = EFAULT;
 		goto out;
 	}
 	if (state & URWLOCK_WRITE_OWNER) {
 		/* Write-locked: clear the owner bit, retrying on CAS failure. */
 		for (;;) {
 			rv = casueword32(&rwlock->rw_state, state, 
 			    &oldstate, state & ~URWLOCK_WRITE_OWNER);
 			if (rv == -1) {
 				error = EFAULT;
 				goto out;
 			}
 			if (oldstate != state) {
 				state = oldstate;
 				if (!(oldstate & URWLOCK_WRITE_OWNER)) {
 					error = EPERM;
 					goto out;
 				}
 				error = umtxq_check_susp(td);
 				if (error != 0)
 					goto out;
 			} else
 				break;
 		}
 	} else if (URWLOCK_READER_COUNT(state) != 0) {
 		/* Read-locked: drop one reader, retrying on CAS failure. */
 		for (;;) {
 			rv = casueword32(&rwlock->rw_state, state,
 			    &oldstate, state - 1);
 			if (rv == -1) {
 				error = EFAULT;
 				goto out;
 			}
 			if (oldstate != state) {
 				state = oldstate;
 				if (URWLOCK_READER_COUNT(oldstate) == 0) {
 					error = EPERM;
 					goto out;
 				}
 				error = umtxq_check_susp(td);
 				if (error != 0)
 					goto out;
 			} else
 				break;
 		}
 	} else {
 		error = EPERM;
 		goto out;
 	}
 
 	count = 0;
 
 	/*
 	 * Choose whom to signal.  Without reader preference a waiting
 	 * writer (count 1, exclusive queue) is considered before waiting
 	 * readers (count INT_MAX, shared queue); with reader preference the
 	 * order is reversed.  q is only read when count is non-zero.
 	 */
 	if (!(flags & URWLOCK_PREFER_READER)) {
 		if (state & URWLOCK_WRITE_WAITERS) {
 			count = 1;
 			q = UMTX_EXCLUSIVE_QUEUE;
 		} else if (state & URWLOCK_READ_WAITERS) {
 			count = INT_MAX;
 			q = UMTX_SHARED_QUEUE;
 		}
 	} else {
 		if (state & URWLOCK_READ_WAITERS) {
 			count = INT_MAX;
 			q = UMTX_SHARED_QUEUE;
 		} else if (state & URWLOCK_WRITE_WAITERS) {
 			count = 1;
 			q = UMTX_EXCLUSIVE_QUEUE;
 		}
 	}
 
 	if (count) {
 		umtxq_lock(&uq->uq_key);
 		umtxq_busy(&uq->uq_key);
 		umtxq_signal_queue(&uq->uq_key, count, q);
 		umtxq_unbusy(&uq->uq_key);
 		umtxq_unlock(&uq->uq_key);
 	}
 out:
 	umtx_key_release(&uq->uq_key);
 	return (error);
 }
 
 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
 /*
  * Wait on a userland semaphore (struct _usem).
  *
  * The thread is put on the sleep queue before _has_waiters is set and
  * _count is read.  If _count is already non-zero the thread is dequeued
  * and 0 is returned without sleeping.  Returns EFAULT when user memory
  * cannot be accessed, otherwise the result of umtxq_sleep() as adjusted
  * below.
  */
 static int
 do_sem_wait(struct thread *td, struct _usem *sem, struct _umtx_time *timeout)
 {
 	struct abs_timeout timo;
 	struct umtx_q *uq;
 	uint32_t flags, count, count1;
 	int error, rv;
 
 	uq = td->td_umtxq;
 	error = fueword32(&sem->_flags, &flags);
 	if (error == -1)
 		return (EFAULT);
 	error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key);
 	if (error != 0)
 		return (error);
 
 	if (timeout != NULL)
 		abs_timeout_init2(&timo, timeout);
 
 	umtxq_lock(&uq->uq_key);
 	umtxq_busy(&uq->uq_key);
 	umtxq_insert(uq);
 	umtxq_unlock(&uq->uq_key);
 	/* count is only read below when both user accesses returned 0. */
 	rv = casueword32(&sem->_has_waiters, 0, &count1, 1);
 	if (rv == 0)
 		rv = fueword32(&sem->_count, &count);
 	if (rv == -1 || count != 0) {
 		umtxq_lock(&uq->uq_key);
 		umtxq_unbusy(&uq->uq_key);
 		umtxq_remove(uq);
 		umtxq_unlock(&uq->uq_key);
 		umtx_key_release(&uq->uq_key);
 		return (rv == -1 ? EFAULT : 0);
 	}
 	umtxq_lock(&uq->uq_key);
 	umtxq_unbusy(&uq->uq_key);
 
 	error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo);
 
 	/*
 	 * With UQF_UMTXQ clear the sleep result is discarded and success is
 	 * reported (presumably a waker already took us off the queue --
 	 * TODO confirm against umtxq_remove()).
 	 */
 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
 		error = 0;
 	else {
 		umtxq_remove(uq);
 		/* A relative timeout cannot be restarted. */
 		if (error == ERESTART && timeout != NULL &&
 		    (timeout->_flags & UMTX_ABSTIME) == 0)
 			error = EINTR;
 	}
 	umtxq_unlock(&uq->uq_key);
 	umtx_key_release(&uq->uq_key);
 	return (error);
 }
 
 /*
  * Signal a userland semaphore.
  */
 static int
 do_sem_wake(struct thread *td, struct _usem *sem)
 {
 	struct umtx_key key;
 	int error, cnt;
 	uint32_t flags;
 
 	error = fueword32(&sem->_flags, &flags);
 	if (error == -1)
 		return (EFAULT);
 	if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0)
 		return (error);	
 	umtxq_lock(&key);
 	umtxq_busy(&key);
 	cnt = umtxq_count(&key);
 	if (cnt > 0) {
 		/*
 		 * Check if count is greater than 0, this means the memory is
 		 * still being referenced by user code, so we can safely
 		 * update _has_waiters flag.
 		 */
 		if (cnt == 1) {
 			umtxq_unlock(&key);
 			error = suword32(&sem->_has_waiters, 0);
 			umtxq_lock(&key);
 			if (error == -1)
 				error = EFAULT;
 		}
 		umtxq_signal(&key, 1);
 	}
 	umtxq_unbusy(&key);
 	umtxq_unlock(&key);
 	umtx_key_release(&key);
 	return (error);
 }
 #endif
 
 /*
  * Wait on a userland semaphore (struct _usem2), whose _count word holds
  * both the count and the USEM_HAS_WAITERS flag.
  *
  * The thread is queued first.  If USEM_COUNT(_count) is non-zero it is
  * dequeued and 0 is returned; otherwise USEM_HAS_WAITERS is installed
  * with a CAS (unless _count already equals it) and the thread sleeps.
  * When a relative-timeout sleep ends with EINTR, the time remaining
  * until the deadline is stored back into timeout->_timeout.
  */
 static int
 do_sem2_wait(struct thread *td, struct _usem2 *sem, struct _umtx_time *timeout)
 {
 	struct abs_timeout timo;
 	struct umtx_q *uq;
 	uint32_t count, flags;
 	int error, rv;
 
 	uq = td->td_umtxq;
 	/*
 	 * NOTE(review): the sibling routines read the flags with fueword32()
 	 * and return EFAULT on -1; the result of fuword32() is used unchecked
 	 * here.
 	 */
 	flags = fuword32(&sem->_flags);
 	error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key);
 	if (error != 0)
 		return (error);
 
 	if (timeout != NULL)
 		abs_timeout_init2(&timo, timeout);
 
 	umtxq_lock(&uq->uq_key);
 	umtxq_busy(&uq->uq_key);
 	umtxq_insert(uq);
 	umtxq_unlock(&uq->uq_key);
 	rv = fueword32(&sem->_count, &count);
 	if (rv == -1) {
 		umtxq_lock(&uq->uq_key);
 		umtxq_unbusy(&uq->uq_key);
 		umtxq_remove(uq);
 		umtxq_unlock(&uq->uq_key);
 		umtx_key_release(&uq->uq_key);
 		return (EFAULT);
 	}
 	for (;;) {
 		/* A non-zero count: nothing to wait for. */
 		if (USEM_COUNT(count) != 0) {
 			umtxq_lock(&uq->uq_key);
 			umtxq_unbusy(&uq->uq_key);
 			umtxq_remove(uq);
 			umtxq_unlock(&uq->uq_key);
 			umtx_key_release(&uq->uq_key);
 			return (0);
 		}
 		if (count == USEM_HAS_WAITERS)
 			break;
 		/* Install the waiters flag; count receives the value found. */
 		rv = casueword32(&sem->_count, 0, &count, USEM_HAS_WAITERS);
 		if (rv == -1) {
 			umtxq_lock(&uq->uq_key);
 			umtxq_unbusy(&uq->uq_key);
 			umtxq_remove(uq);
 			umtxq_unlock(&uq->uq_key);
 			umtx_key_release(&uq->uq_key);
 			return (EFAULT);
 		}
 		if (count == 0)
 			break;
 	}
 	umtxq_lock(&uq->uq_key);
 	umtxq_unbusy(&uq->uq_key);
 
 	error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo);
 
 	/* With UQF_UMTXQ clear the sleep result is discarded. */
 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
 		error = 0;
 	else {
 		umtxq_remove(uq);
 		if (timeout != NULL && (timeout->_flags & UMTX_ABSTIME) == 0) {
 			/* A relative timeout cannot be restarted. */
 			if (error == ERESTART)
 				error = EINTR;
 			if (error == EINTR) {
 				/* Store end - cur into timeout->_timeout. */
 				abs_timeout_update(&timo);
-				timeout->_timeout = timo.end;
-				timespecsub(&timeout->_timeout, &timo.cur);
+				timespecsub(&timo.end, &timo.cur,
+				    &timeout->_timeout);
 			}
 		}
 	}
 	umtxq_unlock(&uq->uq_key);
 	umtx_key_release(&uq->uq_key);
 	return (error);
 }
 
 /*
  * Signal a userland semaphore.
  */
 static int
 do_sem2_wake(struct thread *td, struct _usem2 *sem)
 {
 	struct umtx_key key;
 	int error, cnt, rv;
 	uint32_t count, flags;
 
 	rv = fueword32(&sem->_flags, &flags);
 	if (rv == -1)
 		return (EFAULT);
 	if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0)
 		return (error);	
 	umtxq_lock(&key);
 	umtxq_busy(&key);
 	cnt = umtxq_count(&key);
 	if (cnt > 0) {
 		/*
 		 * If this was the last sleeping thread, clear the waiters
 		 * flag in _count.
 		 */
 		if (cnt == 1) {
 			umtxq_unlock(&key);
 			rv = fueword32(&sem->_count, &count);
 			while (rv != -1 && count & USEM_HAS_WAITERS)
 				rv = casueword32(&sem->_count, count, &count,
 				    count & ~USEM_HAS_WAITERS);
 			if (rv == -1)
 				error = EFAULT;
 			umtxq_lock(&key);
 		}
 
 		umtxq_signal(&key, 1);
 	}
 	umtxq_unbusy(&key);
 	umtxq_unlock(&key);
 	umtx_key_release(&key);
 	return (error);
 }
 
 /*
  * Copy a struct timespec in from user address addr and validate it.
  * Returns the copyin() error, EINVAL when tv_sec is negative or tv_nsec
  * lies outside [0, 1000000000), and 0 otherwise.
  */
 inline int
 umtx_copyin_timeout(const void *addr, struct timespec *tsp)
 {
 	int error;
 
 	error = copyin(addr, tsp, sizeof(struct timespec));
 	if (error != 0)
 		return (error);
 	if (tsp->tv_sec < 0 || tsp->tv_nsec < 0 ||
 	    tsp->tv_nsec >= 1000000000)
 		return (EINVAL);
 	return (0);
 }
 
 /*
  * Copy a timeout in from user address addr.  When size does not exceed
  * sizeof(struct timespec) only the timespec is read and the clock id and
  * flags default to CLOCK_REALTIME and 0; otherwise a whole
  * struct _umtx_time is read.  Returns the copyin() error, EINVAL for an
  * out-of-range timespec, and 0 otherwise.
  */
 static inline int
 umtx_copyin_umtx_time(const void *addr, size_t size, struct _umtx_time *tp)
 {
 	int error;
 
 	if (size > sizeof(struct timespec)) {
 		error = copyin(addr, tp, sizeof(struct _umtx_time));
 	} else {
 		tp->_clockid = CLOCK_REALTIME;
 		tp->_flags = 0;
 		error = copyin(addr, &tp->_timeout, sizeof(struct timespec));
 	}
 	if (error != 0)
 		return (error);
 
 	if (tp->_timeout.tv_sec < 0 || tp->_timeout.tv_nsec < 0 ||
 	    tp->_timeout.tv_nsec >= 1000000000)
 		return (EINVAL);
 	return (0);
 }
 
 /*
  * Handler for operation slots with no implementation: always fails with
  * EOPNOTSUPP.
  */
 static int
 __umtx_op_unimpl(struct thread *td, struct _umtx_op_args *uap)
 {
 
 	return (EOPNOTSUPP);
 }
 
 static int
 __umtx_op_wait(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct _umtx_time timeout, *tm_p;
 	int error;
 
 	if (uap->uaddr2 == NULL)
 		tm_p = NULL;
 	else {
 		error = umtx_copyin_umtx_time(
 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
 		if (error != 0)
 			return (error);
 		tm_p = &timeout;
 	}
 	return (do_wait(td, uap->obj, uap->val, tm_p, 0, 0));
 }
 
 static int
 __umtx_op_wait_uint(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct _umtx_time timeout, *tm_p;
 	int error;
 
 	if (uap->uaddr2 == NULL)
 		tm_p = NULL;
 	else {
 		error = umtx_copyin_umtx_time(
 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
 		if (error != 0)
 			return (error);
 		tm_p = &timeout;
 	}
 	return (do_wait(td, uap->obj, uap->val, tm_p, 1, 0));
 }
 
 static int
 __umtx_op_wait_uint_private(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct _umtx_time *tm_p, timeout;
 	int error;
 
 	if (uap->uaddr2 == NULL)
 		tm_p = NULL;
 	else {
 		error = umtx_copyin_umtx_time(
 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
 		if (error != 0)
 			return (error);
 		tm_p = &timeout;
 	}
 	return (do_wait(td, uap->obj, uap->val, tm_p, 1, 1));
 }
 
 static int
 __umtx_op_wake(struct thread *td, struct _umtx_op_args *uap)
 {
 
 	return (kern_umtx_wake(td, uap->obj, uap->val, 0));
 }
 
 #define BATCH_SIZE	128
 static int
 __umtx_op_nwake_private(struct thread *td, struct _umtx_op_args *uap)
 {
 	char *uaddrs[BATCH_SIZE], **upp;
 	int count, error, i, pos, tocopy;
 
 	upp = (char **)uap->obj;
 	error = 0;
 	for (count = uap->val, pos = 0; count > 0; count -= tocopy,
 	    pos += tocopy) {
 		tocopy = MIN(count, BATCH_SIZE);
 		error = copyin(upp + pos, uaddrs, tocopy * sizeof(char *));
 		if (error != 0)
 			break;
 		for (i = 0; i < tocopy; ++i)
 			kern_umtx_wake(td, uaddrs[i], INT_MAX, 1);
 		maybe_yield();
 	}
 	return (error);
 }
 
 static int
 __umtx_op_wake_private(struct thread *td, struct _umtx_op_args *uap)
 {
 
 	return (kern_umtx_wake(td, uap->obj, uap->val, 1));
 }
 
 static int
 __umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct _umtx_time *tm_p, timeout;
 	int error;
 
 	/* Allow a null timespec (wait forever). */
 	if (uap->uaddr2 == NULL)
 		tm_p = NULL;
 	else {
 		error = umtx_copyin_umtx_time(
 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
 		if (error != 0)
 			return (error);
 		tm_p = &timeout;
 	}
 	return (do_lock_umutex(td, uap->obj, tm_p, 0));
 }
 
 static int
 __umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap)
 {
 
 	return (do_lock_umutex(td, uap->obj, NULL, _UMUTEX_TRY));
 }
 
 static int
 __umtx_op_wait_umutex(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct _umtx_time *tm_p, timeout;
 	int error;
 
 	/* Allow a null timespec (wait forever). */
 	if (uap->uaddr2 == NULL)
 		tm_p = NULL;
 	else {
 		error = umtx_copyin_umtx_time(
 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
 		if (error != 0)
 			return (error);
 		tm_p = &timeout;
 	}
 	return (do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT));
 }
 
 static int
 __umtx_op_wake_umutex(struct thread *td, struct _umtx_op_args *uap)
 {
 
 	return (do_wake_umutex(td, uap->obj));
 }
 
 static int
 __umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap)
 {
 
 	return (do_unlock_umutex(td, uap->obj, false));
 }
 
 static int
 __umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap)
 {
 
 	return (do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1));
 }
 
 static int
 __umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct timespec *ts, timeout;
 	int error;
 
 	/* Allow a null timespec (wait forever). */
 	if (uap->uaddr2 == NULL)
 		ts = NULL;
 	else {
 		error = umtx_copyin_timeout(uap->uaddr2, &timeout);
 		if (error != 0)
 			return (error);
 		ts = &timeout;
 	}
 	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
 }
 
 static int
 __umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap)
 {
 
 	return (do_cv_signal(td, uap->obj));
 }
 
 static int
 __umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap)
 {
 
 	return (do_cv_broadcast(td, uap->obj));
 }
 
 static int
 __umtx_op_rw_rdlock(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct _umtx_time timeout;
 	int error;
 
 	/* Allow a null timespec (wait forever). */
 	if (uap->uaddr2 == NULL) {
 		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
 	} else {
 		error = umtx_copyin_umtx_time(uap->uaddr2,
 		   (size_t)uap->uaddr1, &timeout);
 		if (error != 0)
 			return (error);
 		error = do_rw_rdlock(td, uap->obj, uap->val, &timeout);
 	}
 	return (error);
 }
 
 static int
 __umtx_op_rw_wrlock(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct _umtx_time timeout;
 	int error;
 
 	/* Allow a null timespec (wait forever). */
 	if (uap->uaddr2 == NULL) {
 		error = do_rw_wrlock(td, uap->obj, 0);
 	} else {
 		error = umtx_copyin_umtx_time(uap->uaddr2, 
 		   (size_t)uap->uaddr1, &timeout);
 		if (error != 0)
 			return (error);
 
 		error = do_rw_wrlock(td, uap->obj, &timeout);
 	}
 	return (error);
 }
 
 static int
 __umtx_op_rw_unlock(struct thread *td, struct _umtx_op_args *uap)
 {
 
 	return (do_rw_unlock(td, uap->obj));
 }
 
 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
 static int
 __umtx_op_sem_wait(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct _umtx_time *tm_p, timeout;
 	int error;
 
 	/* Allow a null timespec (wait forever). */
 	if (uap->uaddr2 == NULL)
 		tm_p = NULL;
 	else {
 		error = umtx_copyin_umtx_time(
 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
 		if (error != 0)
 			return (error);
 		tm_p = &timeout;
 	}
 	return (do_sem_wait(td, uap->obj, tm_p));
 }
 
 static int
 __umtx_op_sem_wake(struct thread *td, struct _umtx_op_args *uap)
 {
 
 	return (do_sem_wake(td, uap->obj));
 }
 #endif
 
 static int
 __umtx_op_wake2_umutex(struct thread *td, struct _umtx_op_args *uap)
 {
 
 	return (do_wake2_umutex(td, uap->obj, uap->val));
 }
 
 static int
 __umtx_op_sem2_wait(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct _umtx_time *tm_p, timeout;
 	size_t uasize;
 	int error;
 
 	/* Allow a null timespec (wait forever). */
 	if (uap->uaddr2 == NULL) {
 		uasize = 0;
 		tm_p = NULL;
 	} else {
 		uasize = (size_t)uap->uaddr1;
 		error = umtx_copyin_umtx_time(uap->uaddr2, uasize, &timeout);
 		if (error != 0)
 			return (error);
 		tm_p = &timeout;
 	}
 	error = do_sem2_wait(td, uap->obj, tm_p);
 	if (error == EINTR && uap->uaddr2 != NULL &&
 	    (timeout._flags & UMTX_ABSTIME) == 0 &&
 	    uasize >= sizeof(struct _umtx_time) + sizeof(struct timespec)) {
 		error = copyout(&timeout._timeout,
 		    (struct _umtx_time *)uap->uaddr2 + 1,
 		    sizeof(struct timespec));
 		if (error == 0) {
 			error = EINTR;
 		}
 	}
 
 	return (error);
 }
 
 static int
 __umtx_op_sem2_wake(struct thread *td, struct _umtx_op_args *uap)
 {
 
 	return (do_sem2_wake(td, uap->obj));
 }
 
 /* View a VM object's umtx_data field as the list head of its registrations. */
 #define	USHM_OBJ_UMTX(o)						\
     ((struct umtx_shm_obj_list *)(&(o)->umtx_data))
 
 /* ushm_flags bits: which of the two lists the registration is linked on. */
 #define	USHMF_REG_LINKED	0x0001
 #define	USHMF_OBJ_LINKED	0x0002
 /*
  * One registration created through UMTX_OP_SHM: ties a shared umtx key to
  * the shmfd handed out for it.
  */
 struct umtx_shm_reg {
 	TAILQ_ENTRY(umtx_shm_reg) ushm_reg_link;	/* umtx_shm_registry[] / delfree list */
 	LIST_ENTRY(umtx_shm_reg) ushm_obj_link;	/* per-VM-object list */
 	struct umtx_key		ushm_key;	/* key the registration is found by */
 	struct ucred		*ushm_cred;	/* credential charged via chgumtxcnt() */
 	struct shmfd		*ushm_obj;	/* backing shared memory object */
 	u_int			ushm_refcnt;	/* reference count */
 	u_int			ushm_flags;	/* USHMF_* */
 };
 
 LIST_HEAD(umtx_shm_obj_list, umtx_shm_reg);
 TAILQ_HEAD(umtx_shm_reg_head, umtx_shm_reg);
 
 static uma_zone_t umtx_shm_reg_zone;	/* allocator for struct umtx_shm_reg */
 /* Registrations, indexed by key hash. */
 static struct umtx_shm_reg_head umtx_shm_registry[UMTX_CHAINS];
 static struct mtx umtx_shm_lock;	/* held around registry and list updates */
 /* Registrations queued for release by umtx_shm_reg_delfree_tq(). */
 static struct umtx_shm_reg_head umtx_shm_reg_delfree =
     TAILQ_HEAD_INITIALIZER(umtx_shm_reg_delfree);
 
 static void umtx_shm_free_reg(struct umtx_shm_reg *reg);
 
 /*
  * Task handler: move everything off umtx_shm_reg_delfree while holding
  * umtx_shm_lock, then free each registration with the lock dropped.
  */
 static void
 umtx_shm_reg_delfree_tq(void *context __unused, int pending __unused)
 {
 	struct umtx_shm_reg_head doomed;
 	struct umtx_shm_reg *reg;
 
 	TAILQ_INIT(&doomed);
 	mtx_lock(&umtx_shm_lock);
 	TAILQ_CONCAT(&doomed, &umtx_shm_reg_delfree, ushm_reg_link);
 	mtx_unlock(&umtx_shm_lock);
 
 	while ((reg = TAILQ_FIRST(&doomed)) != NULL) {
 		TAILQ_REMOVE(&doomed, reg, ushm_reg_link);
 		umtx_shm_free_reg(reg);
 	}
 }
 
 /* Task that runs umtx_shm_reg_delfree_tq(); enqueued by umtx_shm_object_terminated(). */
 static struct task umtx_shm_reg_delfree_task =
     TASK_INITIALIZER(0, umtx_shm_reg_delfree_tq, NULL);
 
 /*
  * Look up the registration whose key has the same shared object and
  * offset as key.  The caller must hold umtx_shm_lock.  On a match the
  * registration's reference count is incremented before it is returned;
  * NULL is returned when nothing matches.
  *
  * NOTE(review): the first KASSERT message names "umtx_p_find_rg", which
  * is not this function's name.
  */
 static struct umtx_shm_reg *
 umtx_shm_find_reg_locked(const struct umtx_key *key)
 {
 	struct umtx_shm_reg_head *bucket;
 	struct umtx_shm_reg *reg;
 
 	KASSERT(key->shared, ("umtx_p_find_rg: private key"));
 	mtx_assert(&umtx_shm_lock, MA_OWNED);
 
 	bucket = &umtx_shm_registry[key->hash];
 	TAILQ_FOREACH(reg, bucket, ushm_reg_link) {
 		KASSERT(reg->ushm_key.shared,
 		    ("non-shared key on reg %p %d", reg, reg->ushm_key.shared));
 		if (reg->ushm_key.info.shared.object !=
 		    key->info.shared.object ||
 		    reg->ushm_key.info.shared.offset !=
 		    key->info.shared.offset)
 			continue;
 
 		KASSERT(reg->ushm_key.type == TYPE_SHM, ("TYPE_USHM"));
 		KASSERT(reg->ushm_refcnt > 0,
 		    ("reg %p refcnt 0 onlist", reg));
 		KASSERT((reg->ushm_flags & USHMF_REG_LINKED) != 0,
 		    ("reg %p not linked", reg));
 		reg->ushm_refcnt++;
 		return (reg);
 	}
 	return (NULL);
 }
 
 /*
  * Locking wrapper around umtx_shm_find_reg_locked(): takes and drops
  * umtx_shm_lock around the lookup.
  */
 static struct umtx_shm_reg *
 umtx_shm_find_reg(const struct umtx_key *key)
 {
 	struct umtx_shm_reg *found;
 
 	mtx_lock(&umtx_shm_lock);
 	found = umtx_shm_find_reg_locked(key);
 	mtx_unlock(&umtx_shm_lock);
 
 	return (found);
 }
 
 /*
  * Release everything a registration holds: the per-uid count taken with
  * chgumtxcnt(), the credential reference, the shmfd reference, and the
  * zone memory itself.
  */
 static void
 umtx_shm_free_reg(struct umtx_shm_reg *reg)
 {
 
 	/* The uidinfo is reached through the credential, so adjust it first. */
 	chgumtxcnt(reg->ushm_cred->cr_ruidinfo, -1, 0);
 	crfree(reg->ushm_cred);
 	shm_drop(reg->ushm_obj);
 	uma_zfree(umtx_shm_reg_zone, reg);
 }
 
 /*
  * Drop one reference on reg; the caller must hold umtx_shm_lock.  When
  * the count reaches zero, or when force is set, the registration is
  * unlinked from whichever lists it is still on.  Returns true only when
  * the count reached zero, in which case the caller frees reg.
  */
 static bool
 umtx_shm_unref_reg_locked(struct umtx_shm_reg *reg, bool force)
 {
 	bool last;
 
 	mtx_assert(&umtx_shm_lock, MA_OWNED);
 	KASSERT(reg->ushm_refcnt > 0, ("ushm_reg %p refcnt 0", reg));
 
 	reg->ushm_refcnt--;
 	last = (reg->ushm_refcnt == 0);
 	if (!last && !force)
 		return (false);
 
 	if ((reg->ushm_flags & USHMF_REG_LINKED) != 0) {
 		TAILQ_REMOVE(&umtx_shm_registry[reg->ushm_key.hash], reg,
 		    ushm_reg_link);
 		reg->ushm_flags &= ~USHMF_REG_LINKED;
 	}
 	if ((reg->ushm_flags & USHMF_OBJ_LINKED) != 0) {
 		LIST_REMOVE(reg, ushm_obj_link);
 		reg->ushm_flags &= ~USHMF_OBJ_LINKED;
 	}
 	return (last);
 }
 
 /*
  * Drop one reference on reg and free it when that was the last one.
  * With force set, the backing VM object is first flagged OBJ_UMTXDEAD
  * and the registration is unlinked even if references remain.
  */
 static void
 umtx_shm_unref_reg(struct umtx_shm_reg *reg, bool force)
 {
 	vm_object_t obj;
 	bool last;
 
 	if (force) {
 		obj = reg->ushm_obj->shm_object;
 		VM_OBJECT_WLOCK(obj);
 		obj->flags |= OBJ_UMTXDEAD;
 		VM_OBJECT_WUNLOCK(obj);
 	}
 
 	mtx_lock(&umtx_shm_lock);
 	last = umtx_shm_unref_reg_locked(reg, force);
 	mtx_unlock(&umtx_shm_lock);
 
 	if (last)
 		umtx_shm_free_reg(reg);
 }
 
 /* Initialise the (empty) list of umtx shm registrations kept in a VM object. */
 void
 umtx_shm_object_init(vm_object_t object)
 {
 
 	LIST_INIT(USHM_OBJ_UMTX(object));
 }
 
 /*
  * Called for a VM object to force-drop one reference on every
  * registration on its list.  Registrations whose count reaches zero are
  * moved to umtx_shm_reg_delfree and the delfree task is enqueued to free
  * them later instead of freeing them here.
  */
 void
 umtx_shm_object_terminated(vm_object_t object)
 {
 	struct umtx_shm_reg *reg, *next;
 	bool queued;
 
 	queued = false;
 	mtx_lock(&umtx_shm_lock);
 	LIST_FOREACH_SAFE(reg, USHM_OBJ_UMTX(object), ushm_obj_link, next) {
 		if (!umtx_shm_unref_reg_locked(reg, true))
 			continue;
 		TAILQ_INSERT_TAIL(&umtx_shm_reg_delfree, reg, ushm_reg_link);
 		queued = true;
 	}
 	mtx_unlock(&umtx_shm_lock);
 
 	if (queued)
 		taskqueue_enqueue(taskqueue_thread, &umtx_shm_reg_delfree_task);
 }
 
 /*
  * Find the registration for key or create one.
  *
  * On success *res holds a referenced registration and 0 is returned.
  * Returns ENOMEM when chgumtxcnt() refuses the RLIMIT_UMTXP-limited
  * increment, or the error from shm_dotruncate().
  */
 static int
 umtx_shm_create_reg(struct thread *td, const struct umtx_key *key,
     struct umtx_shm_reg **res)
 {
 	struct umtx_shm_reg *reg, *reg1;
 	struct ucred *cred;
 	int error;
 
 	/* Fast path: a registration already exists. */
 	reg = umtx_shm_find_reg(key);
 	if (reg != NULL) {
 		*res = reg;
 		return (0);
 	}
 	cred = td->td_ucred;
 	if (!chgumtxcnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_UMTXP)))
 		return (ENOMEM);
 	reg = uma_zalloc(umtx_shm_reg_zone, M_WAITOK | M_ZERO);
 	reg->ushm_refcnt = 1;
 	bcopy(key, &reg->ushm_key, sizeof(*key));
 	reg->ushm_obj = shm_alloc(td->td_ucred, O_RDWR);
 	reg->ushm_cred = crhold(cred);
 	error = shm_dotruncate(reg->ushm_obj, PAGE_SIZE);
 	if (error != 0) {
 		umtx_shm_free_reg(reg);
 		return (error);
 	}
 	mtx_lock(&umtx_shm_lock);
 	/*
 	 * Look again under the lock: if a registration for the same key
 	 * appeared since the first lookup, discard ours and return that one.
 	 */
 	reg1 = umtx_shm_find_reg_locked(key);
 	if (reg1 != NULL) {
 		mtx_unlock(&umtx_shm_lock);
 		umtx_shm_free_reg(reg);
 		*res = reg1;
 		return (0);
 	}
 	/* Second reference: one for the lists, one for the caller. */
 	reg->ushm_refcnt++;
 	TAILQ_INSERT_TAIL(&umtx_shm_registry[key->hash], reg, ushm_reg_link);
 	LIST_INSERT_HEAD(USHM_OBJ_UMTX(key->info.shared.object), reg,
 	    ushm_obj_link);
 	reg->ushm_flags = USHMF_REG_LINKED | USHMF_OBJ_LINKED;
 	mtx_unlock(&umtx_shm_lock);
 	*res = reg;
 	return (0);
 }
 
 /*
  * UMTX_SHM_ALIVE: look up the mapping at addr in the calling process'
  * map.  Returns EFAULT when the lookup fails, EINVAL when the mapping has
  * no VM object, ENOTTY when the object is flagged OBJ_UMTXDEAD, and 0
  * otherwise.
  */
 static int
 umtx_shm_alive(struct thread *td, void *addr)
 {
 	vm_map_t map;
 	vm_map_entry_t entry;
 	vm_object_t object;
 	vm_pindex_t pindex;
 	vm_prot_t prot;
 	boolean_t wired;
 	int ret;
 
 	map = &td->td_proc->p_vmspace->vm_map;
 	if (vm_map_lookup(&map, (uintptr_t)addr, VM_PROT_READ, &entry,
 	    &object, &pindex, &prot, &wired) != KERN_SUCCESS)
 		return (EFAULT);
 
 	if (object == NULL)
 		ret = EINVAL;
 	else if ((object->flags & OBJ_UMTXDEAD) != 0)
 		ret = ENOTTY;
 	else
 		ret = 0;
 	vm_map_lookup_done(map, entry);
 	return (ret);
 }
 
 static void
 umtx_shm_init(void)
 {
 	int i;
 
 	umtx_shm_reg_zone = uma_zcreate("umtx_shm", sizeof(struct umtx_shm_reg),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	mtx_init(&umtx_shm_lock, "umtxshm", NULL, MTX_DEF);
 	for (i = 0; i < nitems(umtx_shm_registry); i++)
 		TAILQ_INIT(&umtx_shm_registry[i]);
 }
 
 /*
  * Implementation of UMTX_OP_SHM.
  *
  * Exactly one of UMTX_SHM_CREAT, UMTX_SHM_LOOKUP, UMTX_SHM_DESTROY and
  * UMTX_SHM_ALIVE must be set in flags, else EINVAL is returned.  ALIVE is
  * answered by umtx_shm_alive().  CREAT finds or creates the registration
  * for addr; the other two look it up and fail with ESRCH when there is
  * none.  DESTROY then force-unlinks the registration; CREAT and LOOKUP
  * allocate a file descriptor for its shmfd and return it in
  * td_retval[0].
  */
 static int
 umtx_shm(struct thread *td, void *addr, u_int flags)
 {
 	struct umtx_key key;
 	struct umtx_shm_reg *reg;
 	struct file *fp;
 	int error, fd;
 
 	if (__bitcount(flags & (UMTX_SHM_CREAT | UMTX_SHM_LOOKUP |
 	    UMTX_SHM_DESTROY| UMTX_SHM_ALIVE)) != 1)
 		return (EINVAL);
 	if ((flags & UMTX_SHM_ALIVE) != 0)
 		return (umtx_shm_alive(td, addr));
 	error = umtx_key_get(addr, TYPE_SHM, PROCESS_SHARE, &key);
 	if (error != 0)
 		return (error);
 	KASSERT(key.shared == 1, ("non-shared key"));
 	if ((flags & UMTX_SHM_CREAT) != 0) {
 		error = umtx_shm_create_reg(td, &key, &reg);
 	} else {
 		reg = umtx_shm_find_reg(&key);
 		if (reg == NULL)
 			error = ESRCH;
 	}
 	umtx_key_release(&key);
 	if (error != 0)
 		return (error);
 	KASSERT(reg != NULL, ("no reg"));
 	if ((flags & UMTX_SHM_DESTROY) != 0) {
 		umtx_shm_unref_reg(reg, true);
 	} else {
 		/* The MAC and access checks below are compiled out. */
 #if 0
 #ifdef MAC
 		error = mac_posixshm_check_open(td->td_ucred,
 		    reg->ushm_obj, FFLAGS(O_RDWR));
 		if (error == 0)
 #endif
 			error = shm_access(reg->ushm_obj, td->td_ucred,
 			    FFLAGS(O_RDWR));
 		if (error == 0)
 #endif
 			error = falloc_caps(td, &fp, &fd, O_CLOEXEC, NULL);
 		if (error == 0) {
 			/* Take the shmfd reference that is handed to finit(). */
 			shm_hold(reg->ushm_obj);
 			finit(fp, FFLAGS(O_RDWR), DTYPE_SHM, reg->ushm_obj,
 			    &shm_ops);
 			td->td_retval[0] = fd;
 			fdrop(fp, td);
 		}
 	}
 	/* Drop the reference obtained by the lookup/create above. */
 	umtx_shm_unref_reg(reg, false);
 	return (error);
 }
 
 static int
 __umtx_op_shm(struct thread *td, struct _umtx_op_args *uap)
 {
 
 	return (umtx_shm(td, uap->uaddr1, uap->val));
 }
 
 /*
  * Record the three robust-list values supplied in rbp in the thread
  * structure.  Always returns 0.
  */
 static int
 umtx_robust_lists(struct thread *td, struct umtx_robust_lists_params *rbp)
 {
 
 	td->td_rb_list = rbp->robust_list_offset;
 	td->td_rbp_list = rbp->robust_priv_list_offset;
 	td->td_rb_inact = rbp->robust_inact_offset;
 	return (0);
 }
 
 static int
 __umtx_op_robust_lists(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct umtx_robust_lists_params rb;
 	int error;
 
 	if (uap->val > sizeof(rb))
 		return (EINVAL);
 	bzero(&rb, sizeof(rb));
 	error = copyin(uap->uaddr1, &rb, uap->val);
 	if (error != 0)
 		return (error);
 	return (umtx_robust_lists(td, &rb));
 }
 
 /* Signature shared by every _umtx_op handler. */
 typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap);
 
 /*
  * Dispatch table for sys__umtx_op(), indexed by the UMTX_OP_* operation
  * number.  Reserved slots, and the old semaphore operations when neither
  * COMPAT_FREEBSD9 nor COMPAT_FREEBSD10 is defined, map to
  * __umtx_op_unimpl.
  */
 static const _umtx_op_func op_table[] = {
 	[UMTX_OP_RESERVED0]	= __umtx_op_unimpl,
 	[UMTX_OP_RESERVED1]	= __umtx_op_unimpl,
 	[UMTX_OP_WAIT]		= __umtx_op_wait,
 	[UMTX_OP_WAKE]		= __umtx_op_wake,
 	[UMTX_OP_MUTEX_TRYLOCK]	= __umtx_op_trylock_umutex,
 	[UMTX_OP_MUTEX_LOCK]	= __umtx_op_lock_umutex,
 	[UMTX_OP_MUTEX_UNLOCK]	= __umtx_op_unlock_umutex,
 	[UMTX_OP_SET_CEILING]	= __umtx_op_set_ceiling,
 	[UMTX_OP_CV_WAIT]	= __umtx_op_cv_wait,
 	[UMTX_OP_CV_SIGNAL]	= __umtx_op_cv_signal,
 	[UMTX_OP_CV_BROADCAST]	= __umtx_op_cv_broadcast,
 	[UMTX_OP_WAIT_UINT]	= __umtx_op_wait_uint,
 	[UMTX_OP_RW_RDLOCK]	= __umtx_op_rw_rdlock,
 	[UMTX_OP_RW_WRLOCK]	= __umtx_op_rw_wrlock,
 	[UMTX_OP_RW_UNLOCK]	= __umtx_op_rw_unlock,
 	[UMTX_OP_WAIT_UINT_PRIVATE] = __umtx_op_wait_uint_private,
 	[UMTX_OP_WAKE_PRIVATE]	= __umtx_op_wake_private,
 	[UMTX_OP_MUTEX_WAIT]	= __umtx_op_wait_umutex,
 	[UMTX_OP_MUTEX_WAKE]	= __umtx_op_wake_umutex,
 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
 	[UMTX_OP_SEM_WAIT]	= __umtx_op_sem_wait,
 	[UMTX_OP_SEM_WAKE]	= __umtx_op_sem_wake,
 #else
 	[UMTX_OP_SEM_WAIT]	= __umtx_op_unimpl,
 	[UMTX_OP_SEM_WAKE]	= __umtx_op_unimpl,
 #endif
 	[UMTX_OP_NWAKE_PRIVATE]	= __umtx_op_nwake_private,
 	[UMTX_OP_MUTEX_WAKE2]	= __umtx_op_wake2_umutex,
 	[UMTX_OP_SEM2_WAIT]	= __umtx_op_sem2_wait,
 	[UMTX_OP_SEM2_WAKE]	= __umtx_op_sem2_wake,
 	[UMTX_OP_SHM]		= __umtx_op_shm,
 	[UMTX_OP_ROBUST_LISTS]	= __umtx_op_robust_lists,
 };
 
 int
 sys__umtx_op(struct thread *td, struct _umtx_op_args *uap)
 {
 
 	if ((unsigned)uap->op < nitems(op_table))
 		return (*op_table[uap->op])(td, uap);
 	return (EINVAL);
 }
 
 #ifdef COMPAT_FREEBSD32
 
 /*
  * Timespec layout with two 32-bit fields, used as the copyin/copyout
  * image by the COMPAT_FREEBSD32 handlers below.
  */
 struct timespec32 {
 	int32_t tv_sec;
 	int32_t tv_nsec;
 };
 
 /*
  * 32-bit image of a umtx timeout request; umtx_copyin_umtx_time32()
  * converts it into a struct _umtx_time (_timeout, _flags, _clockid).
  */
 struct umtx_time32 {
 	struct	timespec32	timeout;
 	uint32_t		flags;
 	uint32_t		clockid;
 };
 
 static inline int
 umtx_copyin_timeout32(void *addr, struct timespec *tsp)
 {
 	struct timespec32 ts32;
 	int error;
 
 	error = copyin(addr, &ts32, sizeof(struct timespec32));
 	if (error == 0) {
 		if (ts32.tv_sec < 0 ||
 		    ts32.tv_nsec >= 1000000000 ||
 		    ts32.tv_nsec < 0)
 			error = EINVAL;
 		else {
 			tsp->tv_sec = ts32.tv_sec;
 			tsp->tv_nsec = ts32.tv_nsec;
 		}
 	}
 	return (error);
 }
 
 /*
  * Copy a 32-bit timeout request in from 'addr' and convert it to *tp.
  * 'size' is the caller-declared size of the user object: when it does not
  * exceed a bare timespec32 only the timespec is read and the flags and
  * clock id keep their defaults (0 and CLOCK_REALTIME); otherwise the whole
  * umtx_time32 is read.  Returns the copyin() error, EINVAL for a negative
  * tv_sec or a tv_nsec outside [0, 1000000000), else 0.
  */
 static inline int
 umtx_copyin_umtx_time32(const void *addr, size_t size, struct _umtx_time *tp)
 {
 	struct umtx_time32 ut32;
 	int rv;
 
 	ut32.flags = 0;
 	ut32.clockid = CLOCK_REALTIME;
 	if (size > sizeof(struct timespec32))
 		rv = copyin(addr, &ut32, sizeof(struct umtx_time32));
 	else
 		rv = copyin(addr, &ut32.timeout, sizeof(struct timespec32));
 	if (rv != 0)
 		return (rv);
 
 	if (ut32.timeout.tv_sec < 0 || ut32.timeout.tv_nsec < 0 ||
 	    ut32.timeout.tv_nsec >= 1000000000)
 		return (EINVAL);
 
 	tp->_timeout.tv_sec = ut32.timeout.tv_sec;
 	tp->_timeout.tv_nsec = ut32.timeout.tv_nsec;
 	tp->_flags = ut32.flags;
 	tp->_clockid = ut32.clockid;
 	return (0);
 }
 
 static int
 __umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct _umtx_time *tm_p, timeout;
 	int error;
 
 	if (uap->uaddr2 == NULL)
 		tm_p = NULL;
 	else {
 		error = umtx_copyin_umtx_time32(uap->uaddr2,
 			(size_t)uap->uaddr1, &timeout);
 		if (error != 0)
 			return (error);
 		tm_p = &timeout;
 	}
 	return (do_wait(td, uap->obj, uap->val, tm_p, 1, 0));
 }
 
 static int
 __umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct _umtx_time *tm_p, timeout;
 	int error;
 
 	/* Allow a null timespec (wait forever). */
 	if (uap->uaddr2 == NULL)
 		tm_p = NULL;
 	else {
 		error = umtx_copyin_umtx_time32(uap->uaddr2,
 			    (size_t)uap->uaddr1, &timeout);
 		if (error != 0)
 			return (error);
 		tm_p = &timeout;
 	}
 	return (do_lock_umutex(td, uap->obj, tm_p, 0));
 }
 
 static int
 __umtx_op_wait_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct _umtx_time *tm_p, timeout;
 	int error;
 
 	/* Allow a null timespec (wait forever). */
 	if (uap->uaddr2 == NULL)
 		tm_p = NULL;
 	else {
 		error = umtx_copyin_umtx_time32(uap->uaddr2, 
 		    (size_t)uap->uaddr1, &timeout);
 		if (error != 0)
 			return (error);
 		tm_p = &timeout;
 	}
 	return (do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT));
 }
 
 static int
 __umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct timespec *ts, timeout;
 	int error;
 
 	/* Allow a null timespec (wait forever). */
 	if (uap->uaddr2 == NULL)
 		ts = NULL;
 	else {
 		error = umtx_copyin_timeout32(uap->uaddr2, &timeout);
 		if (error != 0)
 			return (error);
 		ts = &timeout;
 	}
 	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
 }
 
 static int
 __umtx_op_rw_rdlock_compat32(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct _umtx_time timeout;
 	int error;
 
 	/* Allow a null timespec (wait forever). */
 	if (uap->uaddr2 == NULL) {
 		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
 	} else {
 		error = umtx_copyin_umtx_time32(uap->uaddr2,
 		    (size_t)uap->uaddr1, &timeout);
 		if (error != 0)
 			return (error);
 		error = do_rw_rdlock(td, uap->obj, uap->val, &timeout);
 	}
 	return (error);
 }
 
 static int
 __umtx_op_rw_wrlock_compat32(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct _umtx_time timeout;
 	int error;
 
 	/* Allow a null timespec (wait forever). */
 	if (uap->uaddr2 == NULL) {
 		error = do_rw_wrlock(td, uap->obj, 0);
 	} else {
 		error = umtx_copyin_umtx_time32(uap->uaddr2,
 		    (size_t)uap->uaddr1, &timeout);
 		if (error != 0)
 			return (error);
 		error = do_rw_wrlock(td, uap->obj, &timeout);
 	}
 	return (error);
 }
 
 static int
 __umtx_op_wait_uint_private_compat32(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct _umtx_time *tm_p, timeout;
 	int error;
 
 	if (uap->uaddr2 == NULL)
 		tm_p = NULL;
 	else {
 		error = umtx_copyin_umtx_time32(
 		    uap->uaddr2, (size_t)uap->uaddr1,&timeout);
 		if (error != 0)
 			return (error);
 		tm_p = &timeout;
 	}
 	return (do_wait(td, uap->obj, uap->val, tm_p, 1, 1));
 }
 
 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
 static int
 __umtx_op_sem_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct _umtx_time *tm_p, timeout;
 	int error;
 
 	/* Allow a null timespec (wait forever). */
 	if (uap->uaddr2 == NULL)
 		tm_p = NULL;
 	else {
 		error = umtx_copyin_umtx_time32(uap->uaddr2,
 		    (size_t)uap->uaddr1, &timeout);
 		if (error != 0)
 			return (error);
 		tm_p = &timeout;
 	}
 	return (do_sem_wait(td, uap->obj, tm_p));
 }
 #endif
 
 static int
 __umtx_op_sem2_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct _umtx_time *tm_p, timeout;
 	size_t uasize;
 	int error;
 
 	/* Allow a null timespec (wait forever). */
 	if (uap->uaddr2 == NULL) {
 		uasize = 0;
 		tm_p = NULL;
 	} else {
 		uasize = (size_t)uap->uaddr1;
 		error = umtx_copyin_umtx_time32(uap->uaddr2, uasize, &timeout);
 		if (error != 0)
 			return (error);
 		tm_p = &timeout;
 	}
 	error = do_sem2_wait(td, uap->obj, tm_p);
 	if (error == EINTR && uap->uaddr2 != NULL &&
 	    (timeout._flags & UMTX_ABSTIME) == 0 &&
 	    uasize >= sizeof(struct umtx_time32) + sizeof(struct timespec32)) {
 		struct timespec32 remain32 = {
 			.tv_sec = timeout._timeout.tv_sec,
 			.tv_nsec = timeout._timeout.tv_nsec
 		};
 		error = copyout(&remain32,
 		    (struct umtx_time32 *)uap->uaddr2 + 1,
 		    sizeof(struct timespec32));
 		if (error == 0) {
 			error = EINTR;
 		}
 	}
 
 	return (error);
 }
 
 static int
 __umtx_op_nwake_private32(struct thread *td, struct _umtx_op_args *uap)
 {
 	uint32_t uaddrs[BATCH_SIZE], **upp;
 	int count, error, i, pos, tocopy;
 
 	upp = (uint32_t **)uap->obj;
 	error = 0;
 	for (count = uap->val, pos = 0; count > 0; count -= tocopy,
 	    pos += tocopy) {
 		tocopy = MIN(count, BATCH_SIZE);
 		error = copyin(upp + pos, uaddrs, tocopy * sizeof(uint32_t));
 		if (error != 0)
 			break;
 		for (i = 0; i < tocopy; ++i)
 			kern_umtx_wake(td, (void *)(intptr_t)uaddrs[i],
 			    INT_MAX, 1);
 		maybe_yield();
 	}
 	return (error);
 }
 
 /*
  * 32-bit image of struct umtx_robust_lists_params: the same three fields,
  * each 32 bits wide.  __umtx_op_robust_lists_compat32() copies them
  * field by field into the native structure.
  */
 struct umtx_robust_lists_params_compat32 {
 	uint32_t	robust_list_offset;
 	uint32_t	robust_priv_list_offset;
 	uint32_t	robust_inact_offset;
 };
 
 static int
 __umtx_op_robust_lists_compat32(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct umtx_robust_lists_params rb;
 	struct umtx_robust_lists_params_compat32 rb32;
 	int error;
 
 	if (uap->val > sizeof(rb32))
 		return (EINVAL);
 	bzero(&rb, sizeof(rb));
 	bzero(&rb32, sizeof(rb32));
 	error = copyin(uap->uaddr1, &rb32, uap->val);
 	if (error != 0)
 		return (error);
 	rb.robust_list_offset = rb32.robust_list_offset;
 	rb.robust_priv_list_offset = rb32.robust_priv_list_offset;
 	rb.robust_inact_offset = rb32.robust_inact_offset;
 	return (umtx_robust_lists(td, &rb));
 }
 
 /*
  * Dispatch table used by freebsd32_umtx_op(), indexed by UMTX_OP_* code.
  * Operations whose handlers copy a timeout or an address array in from
  * user memory use the *_compat32 / *32 variants defined above; the
  * remaining slots reuse the native handlers.
  */
 static const _umtx_op_func op_table_compat32[] = {
 	[UMTX_OP_RESERVED0]	= __umtx_op_unimpl,
 	[UMTX_OP_RESERVED1]	= __umtx_op_unimpl,
 	[UMTX_OP_WAIT]		= __umtx_op_wait_compat32,
 	[UMTX_OP_WAKE]		= __umtx_op_wake,
 	[UMTX_OP_MUTEX_TRYLOCK]	= __umtx_op_trylock_umutex,
 	[UMTX_OP_MUTEX_LOCK]	= __umtx_op_lock_umutex_compat32,
 	[UMTX_OP_MUTEX_UNLOCK]	= __umtx_op_unlock_umutex,
 	[UMTX_OP_SET_CEILING]	= __umtx_op_set_ceiling,
 	[UMTX_OP_CV_WAIT]	= __umtx_op_cv_wait_compat32,
 	[UMTX_OP_CV_SIGNAL]	= __umtx_op_cv_signal,
 	[UMTX_OP_CV_BROADCAST]	= __umtx_op_cv_broadcast,
 	[UMTX_OP_WAIT_UINT]	= __umtx_op_wait_compat32,
 	[UMTX_OP_RW_RDLOCK]	= __umtx_op_rw_rdlock_compat32,
 	[UMTX_OP_RW_WRLOCK]	= __umtx_op_rw_wrlock_compat32,
 	[UMTX_OP_RW_UNLOCK]	= __umtx_op_rw_unlock,
 	[UMTX_OP_WAIT_UINT_PRIVATE] = __umtx_op_wait_uint_private_compat32,
 	[UMTX_OP_WAKE_PRIVATE]	= __umtx_op_wake_private,
 	[UMTX_OP_MUTEX_WAIT]	= __umtx_op_wait_umutex_compat32,
 	[UMTX_OP_MUTEX_WAKE]	= __umtx_op_wake_umutex,
 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
 	[UMTX_OP_SEM_WAIT]	= __umtx_op_sem_wait_compat32,
 	[UMTX_OP_SEM_WAKE]	= __umtx_op_sem_wake,
 #else
 	[UMTX_OP_SEM_WAIT]	= __umtx_op_unimpl,
 	[UMTX_OP_SEM_WAKE]	= __umtx_op_unimpl,
 #endif
 	[UMTX_OP_NWAKE_PRIVATE]	= __umtx_op_nwake_private32,
 	[UMTX_OP_MUTEX_WAKE2]	= __umtx_op_wake2_umutex,
 	[UMTX_OP_SEM2_WAIT]	= __umtx_op_sem2_wait_compat32,
 	[UMTX_OP_SEM2_WAKE]	= __umtx_op_sem2_wake,
 	[UMTX_OP_SHM]		= __umtx_op_shm,
 	[UMTX_OP_ROBUST_LISTS]	= __umtx_op_robust_lists_compat32,
 };
 
 int
 freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap)
 {
 
 	if ((unsigned)uap->op < nitems(op_table_compat32)) {
 		return (*op_table_compat32[uap->op])(td,
 		    (struct _umtx_op_args *)uap);
 	}
 	return (EINVAL);
 }
 #endif
 
 void
 umtx_thread_init(struct thread *td)
 {
 
 	td->td_umtxq = umtxq_alloc();
 	td->td_umtxq->uq_thread = td;
 }
 
 /*
  * Release the umtx queue that umtx_thread_init() attached to the thread.
  */
 void
 umtx_thread_fini(struct thread *td)
 {
 	struct umtx_q *q;
 
 	q = td->td_umtxq;
 	umtxq_free(q);
 }
 
 /*
  * Called when a new thread is created, e.g. by fork().  Resets the
  * inherited priority to PRI_MAX and asserts that the thread's umtx queue
  * carries no leftover state.
  */
 void
 umtx_thread_alloc(struct thread *td)
 {
 	struct umtx_q *q = td->td_umtxq;
 
 	q->uq_inherited_pri = PRI_MAX;
 
 	KASSERT(q->uq_flags == 0, ("uq_flags != 0"));
 	KASSERT(q->uq_thread == td, ("uq_thread != td"));
 	KASSERT(q->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
 	KASSERT(TAILQ_EMPTY(&q->uq_pi_contested),
 	    ("uq_pi_contested is not empty"));
 }
 
 /*
  * exec() hook.
  *
  * Clear robust lists for all process' threads, not delaying the
  * cleanup to thread_exit hook, since the relevant address space is
  * destroyed right now.
  *
  * Must run in the context of 'p' itself (asserted), and every thread other
  * than the caller must be suspended at the boundary (asserted).
  */
 static void
 umtx_exec_hook(void *arg __unused, struct proc *p,
     struct image_params *imgp __unused)
 {
 	struct thread *td;
 
 	KASSERT(p == curproc, ("need curproc"));
 	PROC_LOCK(p);
 	KASSERT((p->p_flag & P_HADTHREADS) == 0 ||
 	    (p->p_flag & P_STOPPED_SINGLE) != 0,
 	    ("curproc must be single-threaded"));
 	FOREACH_THREAD_IN_PROC(p, td) {
 		KASSERT(td == curthread ||
 		    ((td->td_flags & TDF_BOUNDARY) != 0 && TD_IS_SUSPENDED(td)),
 		    ("running thread %p %p", p, td));
 		/*
 		 * The process lock is dropped around the cleanup call and
 		 * retaken before the per-thread robust-list fields are
 		 * zeroed.  NOTE(review): presumably because the cleanup
 		 * reads user memory and takes other locks -- confirm.
 		 */
 		PROC_UNLOCK(p);
 		umtx_thread_cleanup(td);
 		PROC_LOCK(p);
 		td->td_rb_list = td->td_rbp_list = td->td_rb_inact = 0;
 	}
 	PROC_UNLOCK(p);
 }
 
 /*
  * thread_exit() hook: runs umtx_thread_cleanup() on the exiting thread.
  */
 void
 umtx_thread_exit(struct thread *td)
 {
 
 	umtx_thread_cleanup(td);
 }
 
 /*
  * Fetch one pointer-sized word from user address 'ptr' into *res.  With
  * COMPAT_FREEBSD32, processes flagged SV_ILP32 have a 32-bit word fetched
  * and zero-extended; all others get a native word.  Returns 0 on success,
  * or EFAULT (leaving *res untouched) when the fetch fails.
  */
 static int
 umtx_read_uptr(struct thread *td, uintptr_t ptr, uintptr_t *res)
 {
 	u_long word;
 #ifdef COMPAT_FREEBSD32
 	uint32_t word32;
 
 	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
 		if (fueword32((void *)ptr, &word32) != 0)
 			return (EFAULT);
 		*res = word32;
 		return (0);
 	}
 #endif
 	if (fueword((void *)ptr, &word) != 0)
 		return (EFAULT);
 	*res = word;
 	return (0);
 }
 
 /*
  * Extract the robust-list link (m_rb_lnk) from a umutex image already
  * copied into the kernel.  With COMPAT_FREEBSD32, for SV_ILP32 processes
  * the image is reinterpreted through struct umutex32 so the link is read
  * from that layout.
  */
 static void
 umtx_read_rb_list(struct thread *td, struct umutex *m, uintptr_t *rb_list)
 {
 #ifdef COMPAT_FREEBSD32
 	struct umutex32 compat;
 
 	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
 		memcpy(&compat, m, sizeof(compat));
 		*rb_list = compat.m_rb_lnk;
 		return;
 	}
 #endif
 	*rb_list = m->m_rb_lnk;
 }
 
 /*
  * Process one robust mutex at user address 'rbp' on behalf of 'td'.
  *
  * The mutex is copied in; if 'rb_list' is non-NULL the mutex's list link
  * is stored there before any validation.  Returns the copyin() error,
  * EINVAL if the mutex lacks UMUTEX_ROBUST or is not owned by td (unless
  * 'inact' is set, in which case a foreign owner yields 0), otherwise the
  * result of do_unlock_umutex() called with 'true' as its last argument.
  */
 static int
 umtx_handle_rb(struct thread *td, uintptr_t rbp, uintptr_t *rb_list, bool inact)
 {
 	struct umutex m;
 	int error;
 
 	KASSERT(td->td_proc == curproc, ("need current vmspace"));
 	error = copyin((void *)rbp, &m, sizeof(m));
 	if (error != 0)
 		return (error);
 	/* Hand the next link to the caller even if this entry is rejected. */
 	if (rb_list != NULL)
 		umtx_read_rb_list(td, &m, rb_list);
 	if ((m.m_flags & UMUTEX_ROBUST) == 0)
 		return (EINVAL);
 	if ((m.m_owner & ~UMUTEX_CONTESTED) != td->td_tid)
 		/* inact is cleared after unlock, allow the inconsistency */
 		return (inact ? 0 : EINVAL);
 	return (do_unlock_umutex(td, (struct umutex *)rbp, true));
 }
 
 /*
  * Walk one robust-mutex list for 'td'.  'rb_list' is the user address of
  * the list head pointer (0 means no list).  Each entry is passed to
  * umtx_handle_rb(), which also yields the next link.  The walk stops at a
  * zero link, on the first error, or after umtx_max_rb entries.
  *
  * If an entry equals *rb_inact it is handled as the "inactive" mutex and
  * *rb_inact is zeroed so the caller does not process it again.  'name' is
  * only used as a prefix in the diagnostic messages, which are printed when
  * umtx_verbose_rb is set.
  */
 static void
 umtx_cleanup_rb_list(struct thread *td, uintptr_t rb_list, uintptr_t *rb_inact,
     const char *name)
 {
 	int error, i;
 	uintptr_t rbp;
 	bool inact;
 
 	if (rb_list == 0)
 		return;
 	error = umtx_read_uptr(td, rb_list, &rbp);
 	/* rbp is only examined when the preceding read/handle succeeded. */
 	for (i = 0; error == 0 && rbp != 0 && i < umtx_max_rb; i++) {
 		if (rbp == *rb_inact) {
 			inact = true;
 			*rb_inact = 0;
 		} else
 			inact = false;
 		/* Handles the mutex at rbp and overwrites rbp with its link. */
 		error = umtx_handle_rb(td, rbp, &rbp, inact);
 	}
 	if (i == umtx_max_rb && umtx_verbose_rb) {
 		uprintf("comm %s pid %d: reached umtx %smax rb %d\n",
 		    td->td_proc->p_comm, td->td_proc->p_pid, name, umtx_max_rb);
 	}
 	if (error != 0 && umtx_verbose_rb) {
 		uprintf("comm %s pid %d: handling %srb error %d\n",
 		    td->td_proc->p_comm, td->td_proc->p_pid, name, error);
 	}
 }
 
 /*
  * Clean up umtx data.
  *
  * First detaches every contested PI mutex from the thread and resets its
  * lent user priority, then walks the shared and private robust lists, and
  * finally handles the "inactive" robust mutex if neither walk consumed it.
  */
 static void
 umtx_thread_cleanup(struct thread *td)
 {
 	struct umtx_q *uq;
 	struct umtx_pi *pi;
 	uintptr_t rb_inact;
 
 	/*
 	 * Disown pi mutexes.
 	 */
 	uq = td->td_umtxq;
 	if (uq != NULL) {
 		mtx_lock(&umtx_lock);
 		uq->uq_inherited_pri = PRI_MAX;
 		while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
 			pi->pi_owner = NULL;
 			TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
 		}
 		mtx_unlock(&umtx_lock);
 		thread_lock(td);
 		sched_lend_user_prio(td, PRI_MAX);
 		thread_unlock(td);
 	}
 
 	/*
 	 * Handle terminated robust mutexes.  Must be done after
 	 * robust pi disown, otherwise unlock could see unowned
 	 * entries.
 	 */
 	rb_inact = td->td_rb_inact;
 	/*
 	 * td_rb_inact is the user address of a slot; replace it with the
 	 * value stored there.  The read's error is deliberately ignored, in
 	 * which case rb_inact keeps the slot address.
 	 */
 	if (rb_inact != 0)
 		(void)umtx_read_uptr(td, rb_inact, &rb_inact);
 	umtx_cleanup_rb_list(td, td->td_rb_list, &rb_inact, "");
 	umtx_cleanup_rb_list(td, td->td_rbp_list, &rb_inact, "priv ");
 	/* Still non-zero: neither list contained the inactive mutex. */
 	if (rb_inact != 0)
 		(void)umtx_handle_rb(td, rb_inact, NULL, true);
 }
Index: head/sys/kern/subr_rtc.c
===================================================================
--- head/sys/kern/subr_rtc.c	(revision 336913)
+++ head/sys/kern/subr_rtc.c	(revision 336914)
@@ -1,424 +1,424 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1988 University of Utah.
  * Copyright (c) 1982, 1990, 1993
  *	The Regents of the University of California.
  * Copyright (c) 2011 The FreeBSD Foundation
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * the Systems Programming Group of the University of Utah Computer
  * Science Department.
  *
  * Portions of this software were developed by Julien Ridoux at the University
  * of Melbourne under sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: Utah $Hdr: clock.c 1.18 91/01/21$
  *	from: @(#)clock.c	8.2 (Berkeley) 1/12/94
  *	from: NetBSD: clock_subr.c,v 1.6 2001/07/07 17:04:02 thorpej Exp
  *	and
  *	from: src/sys/i386/isa/clock.c,v 1.176 2001/09/04
  */
 
 /*
  * Helpers for time-of-day clocks. This is useful for architectures that need
  * to support multiple models of such clocks, and generally serves to make the
  * code more machine-independent.
  * If the clock in question can also be used as a time counter, the driver
  * needs to initiate this.
  * This code is not yet used by all architectures.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ffclock.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/bus.h>
 #include <sys/clock.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/taskqueue.h>
 #ifdef FFCLOCK
 #include <sys/timeffc.h>
 #endif
 #include <sys/timetc.h>
 
 #include "clock_if.h"
 
 static int show_io;
 SYSCTL_INT(_debug, OID_AUTO, clock_show_io, CTLFLAG_RWTUN, &show_io, 0,
     "Enable debug printing of RTC clock I/O; 1=reads, 2=writes, 3=both.");
 
 static int sysctl_clock_do_io(SYSCTL_HANDLER_ARGS);
 SYSCTL_PROC(_debug, OID_AUTO, clock_do_io, CTLTYPE_INT | CTLFLAG_RW,
     0, 0, sysctl_clock_do_io, "I",
     "Trigger one-time IO on RTC clocks; 1=read (and discard), 2=write");
 
 /* XXX: should be kern. now, it's no longer machdep.  */
 static int disable_rtc_set;
 SYSCTL_INT(_machdep, OID_AUTO, disable_rtc_set, CTLFLAG_RW, &disable_rtc_set,
     0, "Disallow adjusting time-of-day clock");
 
 /*
  * An instance of a realtime clock.  A list of these tracks all the registered
  * clocks in the system.
  *
  * The resadj member is used to apply a "resolution adjustment" equal to half
  * the clock's resolution, which is useful mainly on clocks with a whole-second
  * resolution.  Because the clock truncates the fractional part, adding half the
  * resolution performs 4/5 rounding.  The same adjustment is applied to the
  * times returned from clock_gettime(), because the fraction returned will
  * always be zero, but on average the actual fraction at the time of the call
  * should be about .5.
  */
 struct rtc_instance {
 	/* Device whose CLOCK_GETTIME/CLOCK_SETTIME methods are invoked. */
 	device_t	clockdev;
 	/* Resolution in microseconds, as passed to clock_register_flags(). */
 	int		resolution;
 	/* CLOCKF_* flags supplied at registration. */
 	int		flags;
 	/*
 	 * Set by clock_schedule(); when non-zero, resettodr() delays the
 	 * update task until the wall clock's tv_nsec reaches this value.
 	 */
 	u_int		schedns;
 	/* Half of 'resolution', precomputed as a timespec. */
 	struct timespec resadj;
 	/* Timeout task that runs settime_task_func() for this clock. */
 	struct timeout_task
 			stask;
 	/* Linkage on rtc_list. */
 	LIST_ENTRY(rtc_instance)
 			rtc_entries;
 };
 
 /*
  * Clocks are updated using a task running on taskqueue_thread.
  */
 static void settime_task_func(void *arg, int pending);
 
 /*
  * Registered clocks are kept in a list which is sorted by resolution; the more
  * accurate clocks get the first shot at providing the time.
  */
 LIST_HEAD(rtc_listhead, rtc_instance);
 static struct rtc_listhead rtc_list = LIST_HEAD_INITIALIZER(rtc_list);
 static struct sx rtc_list_lock;
 SX_SYSINIT(rtc_list_lock_init, &rtc_list_lock, "rtc list");
 
 /*
  * On the task thread, invoke the clock_settime() method of the clock.  Do so
  * holding no locks, so that clock drivers are free to do whatever kind of
  * locking or sleeping they need to.
  *
  * 'arg' is the struct rtc_instance the task was initialized with; 'pending'
  * is unused.
  */
 static void
 settime_task_func(void *arg, int pending)
 {
 	struct timespec ts;
 	struct rtc_instance *rtc;
 
 	rtc = arg;
 	/*
 	 * Unless the driver asked for no timestamp, pass the current time;
 	 * unless it also asked for no adjustment, subtract utc_offset() and
 	 * add the half-resolution rounding term.
 	 */
 	if (!(rtc->flags & CLOCKF_SETTIME_NO_TS)) {
 		getnanotime(&ts);
 		if (!(rtc->flags & CLOCKF_SETTIME_NO_ADJ)) {
 			ts.tv_sec -= utc_offset();
-			timespecadd(&ts, &rtc->resadj);
+			timespecadd(&ts, &rtc->resadj, &ts);
 		}
 	} else {
 		/* CLOCKF_SETTIME_NO_TS: hand the driver a zeroed timespec. */
 		ts.tv_sec  = 0;
 		ts.tv_nsec = 0;
 	}
 	CLOCK_SETTIME(rtc->clockdev, &ts);
 }
 
 /*
  * Print the common prefix of an RTC debug line: the device name, whether
  * the operation is a read or a write, and the current system time.
  */
 static void
 clock_dbgprint_hdr(device_t dev, int rw)
 {
 	struct timespec ts;
 	const char *dir;
 
 	dir = (rw & CLOCK_DBG_READ) ? "read " : "write";
 	getnanotime(&ts);
 	device_printf(dev, "%s at ", dir);
 	clock_print_ts(&ts, 9);
 	printf(": ");
 }
 
 /*
  * Debug-print a BCD clock time for 'dev', but only when the show_io
  * setting has the bit(s) in 'rw' enabled.
  */
 void
 clock_dbgprint_bcd(device_t dev, int rw, const struct bcd_clocktime *bct)
 {
 
 	if ((show_io & rw) == 0)
 		return;
 	clock_dbgprint_hdr(dev, rw);
 	clock_print_bcd(bct, 9);
 	printf("\n");
 }
 
 /*
  * Debug-print a struct clocktime for 'dev', but only when the show_io
  * setting has the bit(s) in 'rw' enabled.
  */
 void
 clock_dbgprint_ct(device_t dev, int rw, const struct clocktime *ct)
 {
 
 	if ((show_io & rw) == 0)
 		return;
 	clock_dbgprint_hdr(dev, rw);
 	clock_print_ct(ct, 9);
 	printf("\n");
 }
 
 /*
  * Debug-print an RTC I/O error code for 'dev', but only when the show_io
  * setting has the bit(s) in 'rw' enabled.
  */
 void
 clock_dbgprint_err(device_t dev, int rw, int err)
 {
 
 	if ((show_io & rw) == 0)
 		return;
 	clock_dbgprint_hdr(dev, rw);
 	printf("error = %d\n", err);
 }
 
 /*
  * Debug-print a timespec for 'dev', but only when the show_io setting has
  * the bit(s) in 'rw' enabled.
  */
 void
 clock_dbgprint_ts(device_t dev, int rw, const struct timespec *ts)
 {
 
 	if ((show_io & rw) == 0)
 		return;
 	clock_dbgprint_hdr(dev, rw);
 	clock_print_ts(ts, 9);
 	printf("\n");
 }
 
 /*
  * Register 'clockdev' as a time-of-day clock.  'resolution' is in
  * microseconds and 'flags' holds CLOCKF_* bits.  The new instance is
  * inserted into rtc_list so the list stays sorted by ascending resolution
  * value; a clock with an equal value goes after the existing ones.
  */
 void
 clock_register_flags(device_t clockdev, long resolution, int flags)
 {
 	struct rtc_instance *cur, *nrtc;
 	int half;
 
 	nrtc = malloc(sizeof(*nrtc), M_DEVBUF, M_WAITOK);
 	nrtc->clockdev = clockdev;
 	nrtc->resolution = (int)resolution;
 	nrtc->flags = flags;
 	nrtc->schedns = 0;
 	/* Half the resolution, split into seconds and nanoseconds. */
 	half = nrtc->resolution / 2;
 	nrtc->resadj.tv_sec = half / 1000000;
 	nrtc->resadj.tv_nsec = half % 1000000 * 1000;
 	TIMEOUT_TASK_INIT(taskqueue_thread, &nrtc->stask, 0,
 	    settime_task_func, nrtc);
 
 	sx_xlock(&rtc_list_lock);
 	if (LIST_EMPTY(&rtc_list)) {
 		LIST_INSERT_HEAD(&rtc_list, nrtc, rtc_entries);
 	} else {
 		LIST_FOREACH(cur, &rtc_list, rtc_entries) {
 			/* First coarser clock found: go in front of it. */
 			if (cur->resolution > nrtc->resolution) {
 				LIST_INSERT_BEFORE(cur, nrtc, rtc_entries);
 				break;
 			}
 			/* Reached the tail without finding one: append. */
 			if (LIST_NEXT(cur, rtc_entries) == NULL) {
 				LIST_INSERT_AFTER(cur, nrtc, rtc_entries);
 				break;
 			}
 		}
 	}
 	sx_xunlock(&rtc_list_lock);
 
 	device_printf(clockdev,
 	    "registered as a time-of-day clock, resolution %d.%6.6ds\n",
 	    nrtc->resolution / 1000000, nrtc->resolution % 1000000);
 }
 
 /*
  * Register 'dev' as a time-of-day clock with resolution 'res' and a flags
  * value of 0; shorthand for clock_register_flags(dev, res, 0).
  */
 void
 clock_register(device_t dev, long res)
 {
 
 	clock_register_flags(dev, res, 0);
 }
 
 /*
  * Remove the clock registered for 'clockdev', if any.  The instance is
  * unlinked under the list lock; its timeout task is then cancelled and
  * drained with the lock released before the memory is freed.
  */
 void
 clock_unregister(device_t clockdev)
 {
 	struct rtc_instance *cur, *next, *victim;
 
 	victim = NULL;
 	sx_xlock(&rtc_list_lock);
 	LIST_FOREACH_SAFE(cur, &rtc_list, rtc_entries, next) {
 		if (cur->clockdev != clockdev)
 			continue;
 		LIST_REMOVE(cur, rtc_entries);
 		victim = cur;
 		break;
 	}
 	sx_xunlock(&rtc_list_lock);
 
 	if (victim == NULL)
 		return;
 	taskqueue_cancel_timeout(taskqueue_thread, &victim->stask, NULL);
 	taskqueue_drain_timeout(taskqueue_thread, &victim->stask);
 	free(victim, M_DEVBUF);
 }
 
 /*
  * Record 'offsetns' as the schedns value of the clock registered for
  * 'clockdev'.  Does nothing if the device is not registered.
  */
 void
 clock_schedule(device_t clockdev, u_int offsetns)
 {
 	struct rtc_instance *cur;
 
 	sx_xlock(&rtc_list_lock);
 	LIST_FOREACH(cur, &rtc_list, rtc_entries) {
 		if (cur->clockdev != clockdev)
 			continue;
 		cur->schedns = offsetns;
 		break;
 	}
 	sx_xunlock(&rtc_list_lock);
 }
 
 /*
  * Query the registered clocks in list order and store a time in *ts.
  *
  * When 'debug_read' is false the walk stops at the first clock that
  * returns a usable time.  When it is true every clock is read and *ts
  * holds whatever the last successful read produced.  Returns ENXIO if no
  * clock is registered, otherwise the status of the last clock examined
  * (EINVAL if it returned a negative tv_sec or tv_nsec).
  */
 static int
 read_clocks(struct timespec *ts, bool debug_read)
 {
 	struct rtc_instance *rtc;
 	int error;
 
 	error = ENXIO;
 	sx_xlock(&rtc_list_lock);
 	LIST_FOREACH(rtc, &rtc_list, rtc_entries) {
 		if ((error = CLOCK_GETTIME(rtc->clockdev, ts)) != 0)
 			continue;
 		if (ts->tv_sec < 0 || ts->tv_nsec < 0) {
 			error = EINVAL;
 			continue;
 		}
 		/* Apply the rounding term and UTC offset unless opted out. */
 		if (!(rtc->flags & CLOCKF_GETTIME_NO_ADJ)) {
-			timespecadd(ts, &rtc->resadj);
+			timespecadd(ts, &rtc->resadj, ts);
 			ts->tv_sec += utc_offset();
 		}
 		if (!debug_read) {
 			if (bootverbose)
 				device_printf(rtc->clockdev,
 				    "providing initial system time\n");
 			break;
 		}
 	}
 	sx_xunlock(&rtc_list_lock);
 	return (error);
 }
 
 /*
  * Initialize the system time.  Must be called from a context which does not
  * restrict any locking or sleeping that clock drivers may need to do.
  *
  * The registered realtime clocks are tried first, in order of resolution,
  * until one of them provides the time.  If none can, a warning is printed
  * and the caller's 'base' time is used instead, provided it is positive.
  * 'base' may be highly inaccurate (the last known good value of the system
  * clock, or even a filesystem last-updated timestamp); it only serves to
  * keep system time from appearing to move backwards in logs.  With no
  * usable time at all the system clock is left untouched.
  */
 void
 inittodr(time_t base)
 {
 	struct timespec now;
 	int rv;
 
 	rv = read_clocks(&now, false);
 
 	/*
 	 * Individual clock failures are expected and not reported; complain
 	 * only when no clock at all could provide the time.
 	 */
 	if (rv != 0) {
 		if (rv == ENXIO)
 			printf("Warning: no time-of-day clock registered, ");
 		else if (rv == EINVAL)
 			printf("Warning: bad time from time-of-day clock, ");
 		else
 			printf("Error reading time-of-day clock (%d), ", rv);
 		printf("system time will not be set accurately\n");
 		now.tv_sec = (base > 0) ? base : -1;
 		now.tv_nsec = 0;
 	}
 
 	/* A negative tv_sec marks "no time available". */
 	if (now.tv_sec < 0)
 		return;
 	tc_setclock(&now);
 #ifdef FFCLOCK
 	ffclock_reset_clock(&now);
 #endif
 }
 
 /*
  * Write system time back to all registered clocks, unless disabled by admin.
  * This can be called from a context that restricts locking and/or sleeping; the
  * actual updating is done asynchronously on a task thread.
  */
 void
 resettodr(void)
 {
 	struct timespec now;
 	struct rtc_instance *rtc;
 	sbintime_t sbt;
 	long waitns;
 
 	if (disable_rtc_set)
 		return;
 
 	sx_xlock(&rtc_list_lock);
 	LIST_FOREACH(rtc, &rtc_list, rtc_entries) {
 		if (rtc->schedns != 0) {
 			getnanotime(&now);
 			waitns = rtc->schedns - now.tv_nsec;
 			if (waitns < 0)
 				waitns += 1000000000;
 			sbt = nstosbt(waitns);
 		} else
 			sbt = 0;
 		taskqueue_enqueue_timeout_sbt(taskqueue_thread,
 		    &rtc->stask, -sbt, 0, C_PREL(31));
 	}
 	sx_xunlock(&rtc_list_lock);
 }
 
 static int
 sysctl_clock_do_io(SYSCTL_HANDLER_ARGS)
 {
 	struct timespec ts_discard;
 	int error, value;
 
 	value = 0;
 	error = sysctl_handle_int(oidp, &value, 0, req);
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 
 	switch (value) {
 	case CLOCK_DBG_READ:
 		if (read_clocks(&ts_discard, true) == ENXIO)
 			printf("No registered RTC clocks\n");
 		break;
 	case CLOCK_DBG_WRITE:
 		resettodr();
 		break;
 	default:
                 return (EINVAL);
 	}
 
 	return (0);
 }
Index: head/sys/kern/uipc_mqueue.c
===================================================================
--- head/sys/kern/uipc_mqueue.c	(revision 336913)
+++ head/sys/kern/uipc_mqueue.c	(revision 336914)
@@ -1,2933 +1,2931 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2005 David Xu <davidxu@freebsd.org>
  * Copyright (c) 2016-2017 Robert N. M. Watson
  * All rights reserved.
  *
  * Portions of this software were developed by BAE Systems, the University of
  * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL
  * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent
  * Computing (TC) research program.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 
 /*
  * POSIX message queue implementation.
  *
  * 1) A mqueue filesystem can be mounted; each message queue appears
  *    in the mounted directory, where the user can change the queue's
  *    permissions and ownership, or remove the queue. Manually creating a
  *    file in the directory causes a message queue to be created in the
  *    kernel with the default message queue attributes applied and the
  *    same name used; this method is not recommended, since the mq_open
  *    syscall allows the user to specify different attributes. The file
  *    system can also be mounted multiple times at different mount points,
  *    but every mount shows the same contents.
  *
  * 2) Standard POSIX message queue API. The syscalls do not use the vfs
  *    layer but operate directly on the internal data structures; this
  *    allows the user to use the IPC facility without having to mount the
  *    mqueue file system.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_capsicum.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/systm.h>
 #include <sys/limits.h>
 #include <sys/malloc.h>
 #include <sys/buf.h>
 #include <sys/capsicum.h>
 #include <sys/dirent.h>
 #include <sys/event.h>
 #include <sys/eventhandler.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/jail.h>
 #include <sys/lock.h>
 #include <sys/module.h>
 #include <sys/mount.h>
 #include <sys/mqueue.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/posix4.h>
 #include <sys/poll.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/queue.h>
 #include <sys/sysproto.h>
 #include <sys/stat.h>
 #include <sys/syscall.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysent.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/taskqueue.h>
 #include <sys/unistd.h>
 #include <sys/user.h>
 #include <sys/vnode.h>
 #include <machine/atomic.h>
 
 #include <security/audit/audit.h>
 
 FEATURE(p1003_1b_mqueue, "POSIX P1003.1B message queues support");
 
 /*
  * Limits and constants
  */
 #define	MQFS_NAMELEN		NAME_MAX
 #define MQFS_DELEN		(8 + MQFS_NAMELEN)
 
 /* node types */
 typedef enum {
 	mqfstype_none = 0,
 	mqfstype_root,
 	mqfstype_dir,
 	mqfstype_this,
 	mqfstype_parent,
 	mqfstype_file,
 	mqfstype_symlink,
 } mqfs_type_t;
 
 struct mqfs_node;
 
 /*
  * mqfs_info: describes a mqfs instance
  */
 struct mqfs_info {
 	struct sx		mi_lock;	/* held around node/vnode list changes */
 	struct mqfs_node	*mi_root;	/* root directory node */
 	struct unrhdr		*mi_unrhdr;	/* allocator for file numbers */
 };
 
 /*
  * mqfs_vdata: per-vnode private data, stored in v_data; links one vnode
  * to the mqfs node it represents.
  */
 struct mqfs_vdata {
 	LIST_ENTRY(mqfs_vdata)	mv_link;	/* entry on the node's mn_vnodes */
 	struct mqfs_node	*mv_node;	/* node this vnode represents */
 	struct vnode		*mv_vnode;	/* back pointer to the vnode */
 	struct task		mv_task;	/* runs do_recycle() on the vnode */
 };
 
 /*
  * mqfs_node: describes a node (file or directory) within a mqfs
  */
 struct mqfs_node {
 	char			mn_name[MQFS_NAMELEN+1];	/* NUL-terminated name */
 	struct mqfs_info	*mn_info;	/* owning mqfs instance */
 	struct mqfs_node	*mn_parent;	/* containing directory, if any */
 	LIST_HEAD(,mqfs_node)	mn_children;	/* entries of a directory */
 	LIST_ENTRY(mqfs_node)	mn_sibling;	/* entry on parent's mn_children */
 	LIST_HEAD(,mqfs_vdata)	mn_vnodes;	/* vnodes referring to this node */
 	const void		*mn_pr_root;	/* creator's prison root; NULL for "." and ".." */
 	int			mn_refcount;	/* changed with atomic_fetchadd_int() */
 	mqfs_type_t		mn_type;
 	int			mn_deleted;	/* set once the node is unlinked */
 	uint32_t		mn_fileno;	/* 0 until a file number is allocated */
 	void			*mn_data;	/* struct mqueue of a file node */
 	struct timespec		mn_birth;
 	struct timespec		mn_ctime;
 	struct timespec		mn_atime;
 	struct timespec		mn_mtime;
 	uid_t			mn_uid;
 	gid_t			mn_gid;
 	int			mn_mode;
 };
 
 /*
  * Conversions between the objects that refer to each other:
  *   VTON	vnode -> mqfs node (through the vnode's mqfs_vdata)
  *   VTOMQ	vnode -> message queue stored in the node's mn_data
  *   VFSTOMQFS	mount -> mqfs instance stored in mnt_data
  *   FPTOMQ	file  -> message queue (f_data is treated as a mqfs node)
  */
 #define	VTON(vp)	(((struct mqfs_vdata *)((vp)->v_data))->mv_node)
 #define VTOMQ(vp) 	((struct mqueue *)(VTON(vp)->mn_data))
 #define	VFSTOMQFS(m)	((struct mqfs_info *)((m)->mnt_data))
 #define	FPTOMQ(fp)	((struct mqueue *)(((struct mqfs_node *) \
 				(fp)->f_data)->mn_data))
 
 TAILQ_HEAD(msgq, mqueue_msg);
 
 struct mqueue;
 
 /*
  * Notification registration record.
  * NOTE(review): the field roles below are inferred from their names and
  * types; confirm against the notification code.
  */
 struct mqueue_notifier {
 	LIST_ENTRY(mqueue_notifier)	nt_link;	/* list linkage */
 	struct sigevent			nt_sigev;	/* presumably the requested event */
 	ksiginfo_t			nt_ksi;		/* presumably the queued signal info */
 	struct proc			*nt_proc;	/* presumably the registering process */
 };
 
 /*
  * In-kernel message queue.  mqfs_read() reports mq_totalbytes as QSIZE,
  * mq_maxmsg as MAXMSG, mq_curmsgs as CURMSG and mq_msgsize as MSGSIZE.
  * NOTE(review): the remaining field comments are inferred from names;
  * confirm against the send/receive code.
  */
 struct mqueue {
 	struct mtx	mq_mutex;
 	int		mq_flags;	/* presumably MQ_RSEL / MQ_WSEL bits */
 	long		mq_maxmsg;
 	long		mq_msgsize;
 	long		mq_curmsgs;
 	long		mq_totalbytes;
 	struct msgq	mq_msgq;	/* tail queue of struct mqueue_msg */
 	int		mq_receivers;
 	int		mq_senders;
 	struct selinfo	mq_rsel;
 	struct selinfo	mq_wsel;
 	struct mqueue_notifier	*mq_notifier;
 };
 
 #define	MQ_RSEL		0x01
 #define	MQ_WSEL		0x02
 
 /*
  * Header of one queued message; per the trailing comment, the message
  * payload is stored directly after this header.
  */
 struct mqueue_msg {
 	TAILQ_ENTRY(mqueue_msg)	msg_link;	/* entry on a struct msgq */
 	unsigned int	msg_prio;
 	unsigned int	msg_size;
 	/* following real data... */
 };
 
 /* Root of the kern.mqueue sysctl tree. */
 static SYSCTL_NODE(_kern, OID_AUTO, mqueue, CTLFLAG_RW, 0,
 	"POSIX real time message queue");
 
 /*
  * NOTE(review): presumably the attributes used when a queue is created
  * without explicit ones; confirm in mqueue_alloc().
  */
 static int	default_maxmsg  = 10;
 static int	default_msgsize = 1024;
 
 /* Tunables exported read/write under kern.mqueue. */
 static int	maxmsg = 100;
 SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmsg, CTLFLAG_RW,
     &maxmsg, 0, "Default maximum messages in queue");
 static int	maxmsgsize = 16384;
 SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmsgsize, CTLFLAG_RW,
     &maxmsgsize, 0, "Default maximum message size");
 static int	maxmq = 100;
 SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmq, CTLFLAG_RW,
     &maxmq, 0, "maximum message queues");
 static int	curmq = 0;
 SYSCTL_INT(_kern_mqueue, OID_AUTO, curmq, CTLFLAG_RW,
     &curmq, 0, "current message queue number");
 /* mqfs_uninit() refuses with EOPNOTSUPP while this is zero. */
 static int	unloadable = 0;
 static MALLOC_DEFINE(M_MQUEUEDATA, "mqdata", "mqueue data");
 
 /* Handle of the process_exit event handler registered by mqfs_init(). */
 static eventhandler_tag exit_tag;
 
 /* Only one instance per-system */
 static struct mqfs_info		mqfs_data;
 /* UMA zones, created in mqfs_init() and destroyed in mqfs_uninit(). */
 static uma_zone_t		mqnode_zone;	/* struct mqfs_node */
 static uma_zone_t		mqueue_zone;	/* struct mqueue */
 static uma_zone_t		mvdata_zone;	/* struct mqfs_vdata */
 static uma_zone_t		mqnoti_zone;	/* struct mqueue_notifier */
 static struct vop_vector	mqfs_vnodeops;
 static struct fileops		mqueueops;
 /* Jail OSD slot obtained from osd_jail_register() in mqfs_init(). */
 static unsigned			mqfs_osd_jail_slot;
 
 /*
  * Directory structure construction and manipulation
  */
 /* Directory and symlink creation are compiled only when "notyet" is set. */
 #ifdef notyet
 static struct mqfs_node	*mqfs_create_dir(struct mqfs_node *parent,
 	const char *name, int namelen, struct ucred *cred, int mode);
 static struct mqfs_node	*mqfs_create_link(struct mqfs_node *parent,
 	const char *name, int namelen, struct ucred *cred, int mode);
 #endif
 
 static struct mqfs_node	*mqfs_create_file(struct mqfs_node *parent,
 	const char *name, int namelen, struct ucred *cred, int mode);
 static int	mqfs_destroy(struct mqfs_node *mn);
 static void	mqfs_fileno_alloc(struct mqfs_info *mi, struct mqfs_node *mn);
 static void	mqfs_fileno_free(struct mqfs_info *mi, struct mqfs_node *mn);
 static int	mqfs_allocv(struct mount *mp, struct vnode **vpp, struct mqfs_node *pn);
 static int	mqfs_prison_remove(void *obj, void *data);
 
 /*
  * Message queue construction and manipulation
  */
 static struct mqueue	*mqueue_alloc(const struct mq_attr *attr);
 static void	mqueue_free(struct mqueue *mq);
 static int	mqueue_send(struct mqueue *mq, const char *msg_ptr,
 			size_t msg_len, unsigned msg_prio, int waitok,
 			const struct timespec *abs_timeout);
 static int	mqueue_receive(struct mqueue *mq, char *msg_ptr,
 			size_t msg_len, unsigned *msg_prio, int waitok,
 			const struct timespec *abs_timeout);
 static int	_mqueue_send(struct mqueue *mq, struct mqueue_msg *msg,
 			int timo);
 static int	_mqueue_recv(struct mqueue *mq, struct mqueue_msg **msg,
 			int timo);
 static void	mqueue_send_notification(struct mqueue *mq);
 static void	mqueue_fdclose(struct thread *td, int fd, struct file *fp);
 static void	mq_proc_exit(void *arg, struct proc *p);
 
 /*
  * kqueue filters
  */
 static void	filt_mqdetach(struct knote *kn);
 static int	filt_mqread(struct knote *kn, long hint);
 static int	filt_mqwrite(struct knote *kn, long hint);
 
 /* Read-side filter: descriptor based, evaluated by filt_mqread(). */
 struct filterops mq_rfiltops = {
 	.f_isfd = 1,
 	.f_detach = filt_mqdetach,
 	.f_event = filt_mqread,
 };
 /* Write-side filter: descriptor based, evaluated by filt_mqwrite(). */
 struct filterops mq_wfiltops = {
 	.f_isfd = 1,
 	.f_detach = filt_mqdetach,
 	.f_event = filt_mqwrite,
 };
 
 /*
  * Create the unit-number allocator that hands out file numbers for this
  * instance; numbers are drawn from the range 1..INT_MAX.
  */
 static void
 mqfs_fileno_init(struct mqfs_info *mi)
 {
 
 	mi->mi_unrhdr = new_unrhdr(1, INT_MAX, NULL);
 }
 
 /*
  * Destroy the file number allocator of an instance.
  */
 static void
 mqfs_fileno_uninit(struct mqfs_info *mi)
 {
 	struct unrhdr *hdr = mi->mi_unrhdr;
 
 	/* Detach the allocator from the instance before deleting it. */
 	mi->mi_unrhdr = NULL;
 	delete_unrhdr(hdr);
 }
 
 /*
  * Assign a file number to a node.
  *
  * Nodes that own a number (root, directories, files, symlinks) draw one
  * from the instance's allocator.  A "." node reuses its directory's
  * number; a ".." node reuses the number of that directory's parent, or of
  * the directory itself when it is the root.  The parent is given a number
  * first if it does not have one yet.
  */
 static void
 mqfs_fileno_alloc(struct mqfs_info *mi, struct mqfs_node *mn)
 {
 	struct mqfs_node *up;
 
 	up = mn->mn_parent;
 	if (up != NULL && up->mn_fileno == 0)
 		mqfs_fileno_alloc(mi, up);
 
 	switch (mn->mn_type) {
 	case mqfstype_this:
 		KASSERT(up != NULL,
 		    ("mqfstype_this node has no parent"));
 		mn->mn_fileno = up->mn_fileno;
 		break;
 	case mqfstype_parent:
 		KASSERT(up != NULL,
 		    ("mqfstype_parent node has no parent"));
 		if (up != mi->mi_root) {
 			KASSERT(up->mn_parent != NULL,
 			    ("mqfstype_parent node has no grandparent"));
 			mn->mn_fileno = up->mn_parent->mn_fileno;
 		} else
 			mn->mn_fileno = up->mn_fileno;
 		break;
 	case mqfstype_root:
 	case mqfstype_dir:
 	case mqfstype_file:
 	case mqfstype_symlink:
 		mn->mn_fileno = alloc_unr(mi->mi_unrhdr);
 		break;
 	default:
 		KASSERT(0,
 		    ("mqfs_fileno_alloc() called for unknown type node: %d",
 			mn->mn_type));
 		break;
 	}
 }
 
 /*
  * Release a file number
  */
 static void
 mqfs_fileno_free(struct mqfs_info *mi, struct mqfs_node *mn)
 {
 	switch (mn->mn_type) {
 	case mqfstype_root:
 	case mqfstype_dir:
 	case mqfstype_file:
 	case mqfstype_symlink:
 		free_unr(mi->mi_unrhdr, mn->mn_fileno);
 		break;
 	case mqfstype_this:
 	case mqfstype_parent:
 		/* ignore these, as they don't "own" their file number */
 		break;
 	default:
 		KASSERT(0,
 		    ("mqfs_fileno_free() called for unknown type node: %d", 
 			mn->mn_type));
 		break;
 	}
 }
 
 /* Allocate a zero-filled node from the node zone; may sleep (M_WAITOK). */
 static __inline struct mqfs_node *
 mqnode_alloc(void)
 {
 	struct mqfs_node *fresh;
 
 	fresh = uma_zalloc(mqnode_zone, M_WAITOK | M_ZERO);
 	return (fresh);
 }
 
 /* Return a node's memory to the node zone. */
 static __inline void
 mqnode_free(struct mqfs_node *node)
 {
 
 	uma_zfree(mqnode_zone, node);
 }
 
 /* Take one additional reference on a node. */
 static __inline void
 mqnode_addref(struct mqfs_node *node)
 {
 
 	(void)atomic_fetchadd_int(&node->mn_refcount, 1);
 }
 
 /*
  * Drop one reference on a node and destroy the node once only its
  * baseline references remained: 1 for ordinary nodes, 3 for directories
  * (which also account for "." and "..").  The instance lock is taken for
  * the destruction unless the caller already holds it exclusively.
  */
 static __inline void
 mqnode_release(struct mqfs_node *node)
 {
 	struct mqfs_info *mi;
 	int baseline, prev, had_lock;
 
 	/* Fetch the instance before the reference is given up. */
 	mi = node->mn_info;
 	prev = atomic_fetchadd_int(&node->mn_refcount, -1);
 	switch (node->mn_type) {
 	case mqfstype_dir:
 	case mqfstype_root:
 		baseline = 3; /* include . and .. */
 		break;
 	default:
 		baseline = 1;
 		break;
 	}
 	if (prev != baseline)
 		return;
 
 	had_lock = sx_xlocked(&mi->mi_lock);
 	if (!had_lock)
 		sx_xlock(&mi->mi_lock);
 	mqfs_destroy(node);
 	if (!had_lock)
 		sx_xunlock(&mi->mi_lock);
 }
 
 /*
  * Link a node into a directory: the node inherits the directory's
  * instance, is put at the head of the directory's child list, and the
  * directory gains one reference.  Always returns 0.
  */
 static int
 mqfs_add_node(struct mqfs_node *parent, struct mqfs_node *node)
 {
 
 	KASSERT(parent != NULL, ("%s(): parent is NULL", __func__));
 	KASSERT(parent->mn_info != NULL,
 	    ("%s(): parent has no mn_info", __func__));
 	KASSERT(parent->mn_type == mqfstype_dir ||
 	    parent->mn_type == mqfstype_root,
 	    ("%s(): parent is not a directory", __func__));
 
 	/* Start the node off with empty child and vnode lists. */
 	LIST_INIT(&node->mn_children);
 	LIST_INIT(&node->mn_vnodes);
 
 	node->mn_parent = parent;
 	node->mn_info = parent->mn_info;
 	LIST_INSERT_HEAD(&parent->mn_children, node, mn_sibling);
 	mqnode_addref(parent);
 
 	return (0);
 }
 
 static struct mqfs_node *
 mqfs_create_node(const char *name, int namelen, struct ucred *cred, int mode,
 	int nodetype)
 {
 	struct mqfs_node *node;
 
 	node = mqnode_alloc();
 	strncpy(node->mn_name, name, namelen);
 	node->mn_pr_root = cred->cr_prison->pr_root;
 	node->mn_type = nodetype;
 	node->mn_refcount = 1;
 	vfs_timestamp(&node->mn_birth);
 	node->mn_ctime = node->mn_atime = node->mn_mtime
 		= node->mn_birth;
 	node->mn_uid = cred->cr_uid;
 	node->mn_gid = cred->cr_gid;
 	node->mn_mode = mode;
 	return (node);
 }
 
 /*
  * Create a file node and link it into the given directory.  Returns the
  * new node, or NULL if it could not be added to the directory.
  */
 static struct mqfs_node *
 mqfs_create_file(struct mqfs_node *parent, const char *name, int namelen,
 	struct ucred *cred, int mode)
 {
 	struct mqfs_node *fnode;
 
 	fnode = mqfs_create_node(name, namelen, cred, mode, mqfstype_file);
 	if (mqfs_add_node(parent, fnode) == 0)
 		return (fnode);
 
 	mqnode_free(fnode);
 	return (NULL);
 }
 
 /*
  * Give a directory its "." and ".." entries.  Returns 0 on success and
  * -1 if either entry could not be added.
  */
 static int
 mqfs_fixup_dir(struct mqfs_node *parent)
 {
 	struct mqfs_node *dot, *dotdot;
 
 	/* The zeroed allocation already terminates the short names. */
 	dot = mqnode_alloc();
 	dot->mn_type = mqfstype_this;
 	dot->mn_refcount = 1;
 	dot->mn_name[0] = '.';
 	if (mqfs_add_node(parent, dot) != 0) {
 		mqnode_free(dot);
 		return (-1);
 	}
 
 	dotdot = mqnode_alloc();
 	dotdot->mn_type = mqfstype_parent;
 	dotdot->mn_refcount = 1;
 	dotdot->mn_name[0] = '.';
 	dotdot->mn_name[1] = '.';
 	if (mqfs_add_node(parent, dotdot) != 0) {
 		mqnode_free(dotdot);
 		return (-1);
 	}
 
 	return (0);
 }
 
 #ifdef notyet
 
 /*
  * Create a directory node inside the given directory and populate it
  * with "." and "..".  Returns the new node, or NULL on failure.
  */
 static struct mqfs_node *
 mqfs_create_dir(struct mqfs_node *parent, const char *name, int namelen,
 	struct ucred *cred, int mode)
 {
 	struct mqfs_node *dnode;
 
 	dnode = mqfs_create_node(name, namelen, cred, mode, mqfstype_dir);
 	if (mqfs_add_node(parent, dnode) != 0) {
 		/* Never linked: only the allocation has to be undone. */
 		mqnode_free(dnode);
 		return (NULL);
 	}
 	if (mqfs_fixup_dir(dnode) == 0)
 		return (dnode);
 
 	/* Linked but incomplete: tear the whole node down. */
 	mqfs_destroy(dnode);
 	return (NULL);
 }
 
 /*
  * Create a symlink node and link it into the given directory.  Returns
  * the new node, or NULL if it could not be added to the directory.
  */
 static struct mqfs_node *
 mqfs_create_link(struct mqfs_node *parent, const char *name, int namelen,
 	struct ucred *cred, int mode)
 {
 	struct mqfs_node *lnode;
 
 	lnode = mqfs_create_node(name, namelen, cred, mode, mqfstype_symlink);
 	if (mqfs_add_node(parent, lnode) == 0)
 		return (lnode);
 
 	mqnode_free(lnode);
 	return (NULL);
 }
 
 #endif
 
 /*
  * Destroy a node; for a directory, every child is destroyed first.  The
  * node is unlinked from its parent (if it still has one), its file number
  * and attached message queue are released, and its memory is freed.
  * Always returns 0.
  */
 static int
 mqfs_destroy(struct mqfs_node *node)
 {
 	struct mqfs_node *child, *up;
 
 	KASSERT(node != NULL,
 	    ("%s(): node is NULL", __func__));
 	KASSERT(node->mn_info != NULL,
 	    ("%s(): node has no mn_info", __func__));
 
 	/* Each recursive call removes the child from the list head. */
 	if (node->mn_type == mqfstype_dir || node->mn_type == mqfstype_root) {
 		while ((child = LIST_FIRST(&node->mn_children)) != NULL)
 			mqfs_destroy(child);
 	}
 
 	up = node->mn_parent;
 	if (up != NULL) {
 		KASSERT(up->mn_info == node->mn_info,
 		    ("%s(): parent has different mn_info", __func__));
 		LIST_REMOVE(node, mn_sibling);
 	}
 
 	/* A file number of 0 means none was ever allocated. */
 	if (node->mn_fileno != 0)
 		mqfs_fileno_free(node->mn_info, node);
 	if (node->mn_data != NULL)
 		mqueue_free(node->mn_data);
 	mqnode_free(node);
 	return (0);
 }
 
 /*
  * Mount a mqfs instance.  Every mount shares the single system-wide
  * instance; updating an existing mount is rejected with EOPNOTSUPP.
  */
 static int
 mqfs_mount(struct mount *mp)
 {
 	struct statfs *st;
 
 	if ((mp->mnt_flag & MNT_UPDATE) != 0)
 		return (EOPNOTSUPP);
 
 	mp->mnt_data = &mqfs_data;
 	MNT_ILOCK(mp);
 	mp->mnt_flag |= MNT_LOCAL;
 	MNT_IUNLOCK(mp);
 	vfs_getnewfsid(mp);
 
 	st = &mp->mnt_stat;
 	vfs_mountedfrom(mp, "mqueue");
 
 	/* Fixed statistics: one block and one file, nothing free. */
 	st->f_bsize = PAGE_SIZE;
 	st->f_iosize = PAGE_SIZE;
 	st->f_blocks = 1;
 	st->f_files = 1;
 	st->f_bfree = 0;
 	st->f_bavail = 0;
 	st->f_ffree = 0;
 	return (0);
 }
 
 /*
  * Unmount a mqfs instance by flushing its vnodes; MNT_FORCE in mntflags
  * turns into a forced close.
  */
 static int
 mqfs_unmount(struct mount *mp, int mntflags)
 {
 	int flushflags;
 
 	flushflags = 0;
 	if ((mntflags & MNT_FORCE) != 0)
 		flushflags = FORCECLOSE;
 	return (vflush(mp, 0, flushflags, curthread));
 }
 
 /*
  * Return a vnode for the root node of the instance behind this mount.
  */
 static int
 mqfs_root(struct mount *mp, int flags, struct vnode **vpp)
 {
 	struct mqfs_info *mi = VFSTOMQFS(mp);
 
 	return (mqfs_allocv(mp, vpp, mi->mi_root));
 }
 
 /*
  * Return filesystem stats
  *
  * Currently a no-op that reports success: sbp is not modified here.
  */
 static int
 mqfs_statfs(struct mount *mp, struct statfs *sbp)
 {
 	/* XXX update statistics */
 	return (0);
 }
 
 /*
  * Initialize a mqfs instance
  *
  * Creates the UMA zones, the instance lock and the root directory (with
  * its file number and "." / ".." entries), then registers the
  * process-exit handler, the fd-close hook, the POSIX message passing
  * configuration value and the jail OSD slot.  Always returns 0.
  */
 static int
 mqfs_init(struct vfsconf *vfc)
 {
 	struct mqfs_node *root;
 	struct mqfs_info *mi;
 	/* Only the prison "remove" method is provided. */
 	osd_method_t methods[PR_MAXMETHOD] = {
 	    [PR_METHOD_REMOVE] = mqfs_prison_remove,
 	};
 
 	mqnode_zone = uma_zcreate("mqnode", sizeof(struct mqfs_node),
 		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	mqueue_zone = uma_zcreate("mqueue", sizeof(struct mqueue),
 		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	mvdata_zone = uma_zcreate("mvdata",
 		sizeof(struct mqfs_vdata), NULL, NULL, NULL,
 		NULL, UMA_ALIGN_PTR, 0);
 	mqnoti_zone = uma_zcreate("mqnotifier", sizeof(struct mqueue_notifier),
 		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	mi = &mqfs_data;
 	sx_init(&mi->mi_lock, "mqfs lock");
 	/* set up the root directory */
 	root = mqfs_create_node("/", 1, curthread->td_ucred, 01777,
 		mqfstype_root);
 	/* The root has no parent, so it is wired up by hand. */
 	root->mn_info = mi;
 	LIST_INIT(&root->mn_children);
 	LIST_INIT(&root->mn_vnodes);
 	mi->mi_root = root;
 	mqfs_fileno_init(mi);
 	mqfs_fileno_alloc(mi, root);
 	/* NOTE(review): the return value of mqfs_fixup_dir() is ignored. */
 	mqfs_fixup_dir(root);
 	exit_tag = EVENTHANDLER_REGISTER(process_exit, mq_proc_exit, NULL,
 	    EVENTHANDLER_PRI_ANY);
 	mq_fdclose = mqueue_fdclose;
 	p31b_setcfg(CTL_P1003_1B_MESSAGE_PASSING, _POSIX_MESSAGE_PASSING);
 	mqfs_osd_jail_slot = osd_jail_register(NULL, methods);
 	return (0);
 }
 
 /*
  * Destroy a mqfs instance
  *
  * Refused with EOPNOTSUPP unless the "unloadable" flag is set.  Otherwise
  * the jail OSD slot and exit handler are deregistered, the node tree is
  * destroyed from the root down, and the file number allocator, lock and
  * UMA zones are released.
  */
 static int
 mqfs_uninit(struct vfsconf *vfc)
 {
 	struct mqfs_info *mi;
 
 	if (!unloadable)
 		return (EOPNOTSUPP);
 	osd_jail_deregister(mqfs_osd_jail_slot);
 	EVENTHANDLER_DEREGISTER(process_exit, exit_tag);
 	mi = &mqfs_data;
 	/* Destroying the root also destroys every node beneath it. */
 	mqfs_destroy(mi->mi_root);
 	mi->mi_root = NULL;
 	mqfs_fileno_uninit(mi);
 	sx_destroy(&mi->mi_lock);
 	/* All nodes are gone by now, so the zones can be destroyed. */
 	uma_zdestroy(mqnode_zone);
 	uma_zdestroy(mqueue_zone);
 	uma_zdestroy(mvdata_zone);
 	uma_zdestroy(mqnoti_zone);
 	return (0);
 }
 
 /*
  * Task routine: recycle the vnode passed as context under its exclusive
  * lock, then drop the hold that was taken when the task was queued.
  */
 static void
 do_recycle(void *context, int pending __unused)
 {
 	struct vnode *target;
 
 	target = context;
 	vn_lock(target, LK_EXCLUSIVE | LK_RETRY);
 	vrecycle(target);
 	VOP_UNLOCK(target, 0);
 	vdrop(target);
 }
 
 /*
  * Allocate a vnode
  *
  * Return in *vpp an exclusively locked vnode on mount mp for node pn.
  * An existing vnode for this (node, mount) pair is reused; otherwise a
  * new one is created, typed after the node and linked to it.  Returns 0
  * or an error from vget(), getnewvnode() or insmntque(); *vpp is NULL
  * when vnode creation fails.
  */
 static int
 mqfs_allocv(struct mount *mp, struct vnode **vpp, struct mqfs_node *pn)
 {
 	struct mqfs_vdata *vd;
 	struct mqfs_info  *mqfs;
 	struct vnode *newvpp;
 	int error;
 
 	mqfs = pn->mn_info;
 	*vpp = NULL;
 	sx_xlock(&mqfs->mi_lock);
 	/* Look for a vnode that already represents pn on this mount. */
 	LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) {
 		if (vd->mv_vnode->v_mount == mp) {
 			/* Hold it so it stays valid after the lock is dropped. */
 			vhold(vd->mv_vnode);
 			break;
 		}
 	}
 
 	if (vd != NULL) {
 		/*
 		 * Also entered by goto from the re-check below, with vd set,
 		 * the vnode held and mi_lock already released there.
 		 */
 found:
 		*vpp = vd->mv_vnode;
 		sx_xunlock(&mqfs->mi_lock);
 		error = vget(*vpp, LK_RETRY | LK_EXCLUSIVE, curthread);
 		vdrop(*vpp);
 		return (error);
 	}
 	sx_xunlock(&mqfs->mi_lock);
 
 	/* The new vnode is created with mi_lock dropped. */
 	error = getnewvnode("mqueue", mp, &mqfs_vnodeops, &newvpp);
 	if (error)
 		return (error);
 	vn_lock(newvpp, LK_EXCLUSIVE | LK_RETRY);
 	error = insmntque(newvpp, mp);
 	/*
 	 * NOTE(review): newvpp is not released here on failure; presumably
 	 * insmntque() disposes of the vnode itself - confirm.
 	 */
 	if (error != 0)
 		return (error);
 
 	sx_xlock(&mqfs->mi_lock);
 	/*
 	 * Check if it has already been allocated
 	 * while we were blocked.
 	 */
 	LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) {
 		if (vd->mv_vnode->v_mount == mp) {
 			vhold(vd->mv_vnode);
 			sx_xunlock(&mqfs->mi_lock);
 
 			/* Lost the race: discard our vnode, use the winner. */
 			vgone(newvpp);
 			vput(newvpp);
 			goto found;
 		}
 	}
 
 	*vpp = newvpp;
 
 	/* Tie the vnode and the node together; the node gains a reference. */
 	vd = uma_zalloc(mvdata_zone, M_WAITOK);
 	(*vpp)->v_data = vd;
 	vd->mv_vnode = *vpp;
 	vd->mv_node = pn;
 	TASK_INIT(&vd->mv_task, 0, do_recycle, *vpp);
 	LIST_INSERT_HEAD(&pn->mn_vnodes, vd, mv_link);
 	mqnode_addref(pn);
 	switch (pn->mn_type) {
 	case mqfstype_root:
 		(*vpp)->v_vflag = VV_ROOT;
 		/* fall through */
 	case mqfstype_dir:
 	case mqfstype_this:
 	case mqfstype_parent:
 		(*vpp)->v_type = VDIR;
 		break;
 	case mqfstype_file:
 		(*vpp)->v_type = VREG;
 		break;
 	case mqfstype_symlink:
 		(*vpp)->v_type = VLNK;
 		break;
 	case mqfstype_none:
 		KASSERT(0, ("mqfs_allocf called for null node\n"));
 		/* no break: continues into the panic below */
 	default:
 		panic("%s has unexpected type: %d", pn->mn_name, pn->mn_type);
 	}
 	sx_xunlock(&mqfs->mi_lock);
 	return (0);
 }
 
 /*
  * Find the entry of directory pd whose name is exactly the first len
  * bytes of name.  Entries tied to a different prison root than the
  * caller's are skipped.  The instance lock must be held.  Returns the
  * node, or NULL if nothing matches.
  */
 static struct mqfs_node *
 mqfs_search(struct mqfs_node *pd, const char *name, int len, struct ucred *cred)
 {
 	struct mqfs_node *cand;
 	const void *my_root;
 
 	sx_assert(&pd->mn_info->mi_lock, SX_LOCKED);
 	my_root = cred->cr_prison->pr_root;
 	LIST_FOREACH(cand, &pd->mn_children, mn_sibling) {
 		/* Entries with no prison root are visible to everyone. */
 		if (cand->mn_pr_root != NULL && cand->mn_pr_root != my_root)
 			continue;
 		if (strncmp(cand->mn_name, name, len) != 0)
 			continue;
 		/* The stored name must end where the requested one does. */
 		if (cand->mn_name[len] == '\0')
 			return (cand);
 	}
 	return (NULL);
 }
 
 /*
  * Look up a file or directory.
  *
  * Resolves the component in ap->a_cnp inside directory ap->a_dvp and
  * stores the resulting vnode in *ap->a_vpp.  Handles ".", ".." and named
  * entries.  Returns ENOENT when the name is too long or does not exist,
  * or EJUSTRETURN when it does not exist but the caller is about to
  * create or rename into it.
  */
 static int
 mqfs_lookupx(struct vop_cachedlookup_args *ap)
 {
 	struct componentname *cnp;
 	struct vnode *dvp, **vpp;
 	struct mqfs_node *pd;
 	struct mqfs_node *pn;
 	struct mqfs_info *mqfs;
 	int nameiop, flags, error, namelen;
 	char *pname;
 	struct thread *td;
 
 	cnp = ap->a_cnp;
 	vpp = ap->a_vpp;
 	dvp = ap->a_dvp;
 	pname = cnp->cn_nameptr;
 	namelen = cnp->cn_namelen;
 	td = cnp->cn_thread;
 	flags = cnp->cn_flags;
 	nameiop = cnp->cn_nameiop;
 	pd = VTON(dvp);
 	pn = NULL;
 	mqfs = pd->mn_info;
 	*vpp = NULLVP;
 
 	if (dvp->v_type != VDIR)
 		return (ENOTDIR);
 
 	/* Searching a directory requires execute permission on it. */
 	error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, cnp->cn_thread);
 	if (error)
 		return (error);
 
 	/* shortcut: check if the name is too long */
 	if (cnp->cn_namelen >= MQFS_NAMELEN)
 		return (ENOENT);
 
 	/* self */
 	if (namelen == 1 && pname[0] == '.') {
 		/* "." may only be looked up, not created/deleted/renamed. */
 		if ((flags & ISLASTCN) && nameiop != LOOKUP)
 			return (EINVAL);
 		/* NOTE(review): pn is assigned but not used on this path. */
 		pn = pd;
 		*vpp = dvp;
 		VREF(dvp);
 		return (0);
 	}
 
 	/* parent */
 	if (cnp->cn_flags & ISDOTDOT) {
 		if (dvp->v_vflag & VV_ROOT)
 			return (EIO);
 		if ((flags & ISLASTCN) && nameiop != LOOKUP)
 			return (EINVAL);
 		/* dvp is unlocked while the parent's vnode is obtained. */
 		VOP_UNLOCK(dvp, 0);
 		KASSERT(pd->mn_parent, ("non-root directory has no parent"));
 		pn = pd->mn_parent;
 		error = mqfs_allocv(dvp->v_mount, vpp, pn);
 		vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
 		return (error);
 	}
 
 	/* named node */
 	sx_xlock(&mqfs->mi_lock);
 	pn = mqfs_search(pd, pname, namelen, cnp->cn_cred);
 	/* Reference the node so it survives dropping the lock. */
 	if (pn != NULL)
 		mqnode_addref(pn);
 	sx_xunlock(&mqfs->mi_lock);
 	
 	/* found */
 	if (pn != NULL) {
 		/* DELETE */
 		if (nameiop == DELETE && (flags & ISLASTCN)) {
 			/* Removing an entry needs write access to the directory. */
 			error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td);
 			if (error) {
 				mqnode_release(pn);
 				return (error);
 			}
 			/*
 			 * NOTE(review): *vpp was set to NULLVP above and is
 			 * not reassigned before this test, so this branch
 			 * looks unreachable - confirm.
 			 */
 			if (*vpp == dvp) {
 				VREF(dvp);
 				*vpp = dvp;
 				mqnode_release(pn);
 				return (0);
 			}
 		}
 
 		/* allocate vnode */
 		error = mqfs_allocv(dvp->v_mount, vpp, pn);
 		mqnode_release(pn);
 		if (error == 0 && cnp->cn_flags & MAKEENTRY)
 			cache_enter(dvp, *vpp, cnp);
 		return (error);
 	}
 	
 	/* not found */
 
 	/* will create a new entry in the directory ? */
 	if ((nameiop == CREATE || nameiop == RENAME) && (flags & LOCKPARENT)
 	    && (flags & ISLASTCN)) {
 		error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td);
 		if (error)
 			return (error);
 		/* Ask for the name to be kept for the operation that follows. */
 		cnp->cn_flags |= SAVENAME;
 		return (EJUSTRETURN);
 	}
 	return (ENOENT);
 }
 
 #if 0
 struct vop_lookup_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_dvp;
 	struct vnode **a_vpp;
 	struct componentname *a_cnp;
 };
 #endif
 
 /*
  * vnode lookup operation; a thin wrapper that forwards to mqfs_lookupx().
  */
 static int
 mqfs_lookup(struct vop_cachedlookup_args *ap)
 {
 
 	return (mqfs_lookupx(ap));
 }
 
 #if 0
 struct vop_create_args {
 	struct vnode *a_dvp;
 	struct vnode **a_vpp;
 	struct componentname *a_cnp;
 	struct vattr *a_vap;
 };
 #endif
 
 /*
  * vnode creation operation
  *
  * Create a message queue with default attributes, backed by a new file
  * node named after ap->a_cnp in directory ap->a_dvp, and return the
  * node's vnode in *ap->a_vpp.
  *
  * Returns ENOTDIR if a_dvp is not a directory node, EAGAIN if no queue
  * could be allocated, ENOSPC if the node could not be created, or the
  * error from mqfs_allocv().
  */
 static int
 mqfs_create(struct vop_create_args *ap)
 {
 	struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount);
 	struct componentname *cnp = ap->a_cnp;
 	struct mqfs_node *pd;
 	struct mqfs_node *pn;
 	struct mqueue *mq;
 	int error;
 
 	pd = VTON(ap->a_dvp);
 	if (pd->mn_type != mqfstype_root && pd->mn_type != mqfstype_dir)
 		return (ENOTDIR);
 	mq = mqueue_alloc(NULL);
 	if (mq == NULL)
 		return (EAGAIN);
 	sx_xlock(&mqfs->mi_lock);
 	if ((cnp->cn_flags & HASBUF) == 0)
 		panic("%s: no name", __func__);
 	pn = mqfs_create_file(pd, cnp->cn_nameptr, cnp->cn_namelen,
 		cnp->cn_cred, ap->a_vap->va_mode);
 	if (pn == NULL) {
 		sx_xunlock(&mqfs->mi_lock);
 		error = ENOSPC;
 	} else {
 		/* Keep the node alive while the lock is dropped for allocv. */
 		mqnode_addref(pn);
 		sx_xunlock(&mqfs->mi_lock);
 		error = mqfs_allocv(ap->a_dvp->v_mount, ap->a_vpp, pn);
 		mqnode_release(pn);
 		if (error) {
 			/*
 			 * mqfs_destroy() unlinks the node from its parent's
 			 * child list, which is protected by mi_lock.
 			 */
 			sx_xlock(&mqfs->mi_lock);
 			mqfs_destroy(pn);
 			sx_xunlock(&mqfs->mi_lock);
 		} else
 			pn->mn_data = mq;
 	}
 	/* The queue was never attached to a node, so free it here. */
 	if (error)
 		mqueue_free(mq);
 	return (error);
 }
 
 /*
  * Remove an entry.  The instance lock must be held.  Only the node's
  * owner or a holder of PRIV_MQ_ADMIN may unlink it (EACCES otherwise); a
  * node that is already deleted yields ENOENT.  Every vnode of the node is
  * purged from the name cache and queued for recycling.
  */
 static int
 do_unlink(struct mqfs_node *pn, struct ucred *ucred)
 {
 	struct mqfs_node *dir;
 	struct mqfs_vdata *vdata;
 
 	sx_assert(&pn->mn_info->mi_lock, SX_LOCKED);
 
 	if (ucred->cr_uid != pn->mn_uid &&
 	    priv_check_cred(ucred, PRIV_MQ_ADMIN, 0) != 0)
 		return (EACCES);
 	if (pn->mn_deleted)
 		return (ENOENT);
 
 	/* Detach from the directory before the references are dropped. */
 	dir = pn->mn_parent;
 	pn->mn_parent = NULL;
 	pn->mn_deleted = 1;
 	LIST_REMOVE(pn, mn_sibling);
 	LIST_FOREACH(vdata, &pn->mn_vnodes, mv_link) {
 		cache_purge(vdata->mv_vnode);
 		/* The hold is dropped by the recycle task. */
 		vhold(vdata->mv_vnode);
 		taskqueue_enqueue(taskqueue_thread, &vdata->mv_task);
 	}
 	mqnode_release(pn);
 	mqnode_release(dir);
 	return (0);
 }
 
 #if 0
 struct vop_remove_args {
 	struct vnode *a_dvp;
 	struct vnode *a_vp;
 	struct componentname *a_cnp;
 };
 #endif
 
 /*
  * vnode removal operation: unlink the node behind ap->a_vp under the
  * instance lock.  Directories are refused with EPERM.
  */
 static int
 mqfs_remove(struct vop_remove_args *ap)
 {
 	struct mqfs_info *mi;
 	struct mqfs_node *victim;
 	int rv;
 
 	mi = VFSTOMQFS(ap->a_dvp->v_mount);
 	if (ap->a_vp->v_type == VDIR)
 		return (EPERM);
 
 	victim = VTON(ap->a_vp);
 	sx_xlock(&mi->mi_lock);
 	rv = do_unlink(victim, ap->a_cnp->cn_cred);
 	sx_xunlock(&mi->mi_lock);
 	return (rv);
 }
 
 #if 0
 struct vop_inactive_args {
 	struct vnode *a_vp;
 	struct thread *a_td;
 };
 #endif
 
 static int
 mqfs_inactive(struct vop_inactive_args *ap)
 {
 	struct mqfs_node *pn = VTON(ap->a_vp);
 
 	if (pn->mn_deleted)
 		vrecycle(ap->a_vp);
 	return (0);
 }
 
 #if 0
 struct vop_reclaim_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_vp;
 	struct thread *a_td;
 };
 #endif
 
 /*
  * Reclaim operation: detach the vnode from its node, free the per-vnode
  * data and drop the reference the vnode held on the node, all under the
  * instance lock.  Always returns 0.
  */
 static int
 mqfs_reclaim(struct vop_reclaim_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct mqfs_info *mi = VFSTOMQFS(vp->v_mount);
 	struct mqfs_vdata *vdata;
 	struct mqfs_node *owner;
 
 	/* Both pointers are captured before the vnode is detached. */
 	vdata = vp->v_data;
 	owner = vdata->mv_node;
 
 	sx_xlock(&mi->mi_lock);
 	vp->v_data = NULL;
 	LIST_REMOVE(vdata, mv_link);
 	uma_zfree(mvdata_zone, vdata);
 	mqnode_release(owner);
 	sx_xunlock(&mi->mi_lock);
 	return (0);
 }
 
 #if 0
 struct vop_open_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_vp;
 	int a_mode;
 	struct ucred *a_cred;
 	struct thread *a_td;
 	struct file *a_fp;
 };
 #endif
 
 /*
  * Open operation: nothing to do, always succeeds.
  */
 static int
 mqfs_open(struct vop_open_args *ap)
 {
 	return (0);
 }
 
 #if 0
 struct vop_close_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_vp;
 	int a_fflag;
 	struct ucred *a_cred;
 	struct thread *a_td;
 };
 #endif
 
 /*
  * Close operation: nothing to do, always succeeds.
  */
 static int
 mqfs_close(struct vop_close_args *ap)
 {
 	return (0);
 }
 
 #if 0
 struct vop_access_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_vp;
 	accmode_t a_accmode;
 	struct ucred *a_cred;
 	struct thread *a_td;
 };
 #endif
 
 /*
  * Verify permissions
  */
 static int
 mqfs_access(struct vop_access_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vattr vattr;
 	int error;
 
 	error = VOP_GETATTR(vp, &vattr, ap->a_cred);
 	if (error)
 		return (error);
 	error = vaccess(vp->v_type, vattr.va_mode, vattr.va_uid,
 	    vattr.va_gid, ap->a_accmode, ap->a_cred, NULL);
 	return (error);
 }
 
 #if 0
 struct vop_getattr_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_vp;
 	struct vattr *a_vap;
 	struct ucred *a_cred;
 };
 #endif
 
 /*
  * Get file attributes: fill *ap->a_vap from the node behind the vnode.
  * Sizes are always reported as zero.  Always returns 0.
  */
 static int
 mqfs_getattr(struct vop_getattr_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vattr *va = ap->a_vap;
 	struct mqfs_node *node = VTON(vp);
 
 	/* Identity. */
 	va->va_type = vp->v_type;
 	va->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
 	va->va_fileid = node->mn_fileno;
 	va->va_nlink = 1;
 	va->va_rdev = NODEV;
 	va->va_gen = 0;
 	va->va_flags = 0;
 	va->va_filerev = 0;
 
 	/* Ownership and permissions. */
 	va->va_mode = node->mn_mode;
 	va->va_uid = node->mn_uid;
 	va->va_gid = node->mn_gid;
 
 	/* Sizes. */
 	va->va_size = 0;
 	va->va_bytes = 0;
 	va->va_blocksize = PAGE_SIZE;
 
 	/* Timestamps. */
 	va->va_atime = node->mn_atime;
 	va->va_mtime = node->mn_mtime;
 	va->va_ctime = node->mn_ctime;
 	va->va_birthtime = node->mn_birth;
 
 	return (0);
 }
 
 #if 0
 struct vop_setattr_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_vp;
 	struct vattr *a_vap;
 	struct ucred *a_cred;
 };
 #endif
 /*
  * Set attributes
  *
  * Only owner, group, mode, access time and modification time can be
  * changed; a request that sets any other attribute gets EINVAL.  The
  * node's change time is refreshed when something was modified.
  */
 static int
 mqfs_setattr(struct vop_setattr_args *ap)
 {
 	struct mqfs_node *pn;
 	struct vattr *vap;
 	struct vnode *vp;
 	struct thread *td;
 	int c, error;
 	uid_t uid;
 	gid_t gid;
 
 	td = curthread;
 	vap = ap->a_vap;
 	vp = ap->a_vp;
 	/* Reject attempts to set attributes that are not settable here. */
 	if ((vap->va_type != VNON) ||
 	    (vap->va_nlink != VNOVAL) ||
 	    (vap->va_fsid != VNOVAL) ||
 	    (vap->va_fileid != VNOVAL) ||
 	    (vap->va_blocksize != VNOVAL) ||
 	    (vap->va_flags != VNOVAL && vap->va_flags != 0) ||
 	    (vap->va_rdev != VNOVAL) ||
 	    ((int)vap->va_bytes != VNOVAL) ||
 	    (vap->va_gen != VNOVAL)) {
 		return (EINVAL);
 	}
 
 	pn = VTON(vp);
 
 	/* c records whether anything was changed. */
 	error = c = 0;
 	/* VNOVAL means "leave unchanged": fall back to the current ids. */
 	if (vap->va_uid == (uid_t)VNOVAL)
 		uid = pn->mn_uid;
 	else
 		uid = vap->va_uid;
 	if (vap->va_gid == (gid_t)VNOVAL)
 		gid = pn->mn_gid;
 	else
 		gid = vap->va_gid;
 
 	if (uid != pn->mn_uid || gid != pn->mn_gid) {
 		/*
 		 * To modify the ownership of a file, must possess VADMIN
 		 * for that file.
 		 */
 		if ((error = VOP_ACCESS(vp, VADMIN, ap->a_cred, td)))
 			return (error);
 
 		/*
 		 * XXXRW: Why is there a privilege check here: shouldn't the
 		 * check in VOP_ACCESS() be enough?  Also, are the group bits
 		 * below definitely right?
 		 */
 		if (((ap->a_cred->cr_uid != pn->mn_uid) || uid != pn->mn_uid ||
 		    (gid != pn->mn_gid && !groupmember(gid, ap->a_cred))) &&
 		    (error = priv_check(td, PRIV_MQ_ADMIN)) != 0)
 			return (error);
 		pn->mn_uid = uid;
 		pn->mn_gid = gid;
 		c = 1;
 	}
 
 	if (vap->va_mode != (mode_t)VNOVAL) {
 		/* Only the owner or a PRIV_MQ_ADMIN holder may change the mode. */
 		if ((ap->a_cred->cr_uid != pn->mn_uid) &&
 		    (error = priv_check(td, PRIV_MQ_ADMIN)))
 			return (error);
 		pn->mn_mode = vap->va_mode;
 		c = 1;
 	}
 
 	if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) {
 		/* See the comment in ufs_vnops::ufs_setattr(). */
 		/*
 		 * VADMIN is sufficient; failing that, VWRITE is accepted
 		 * only when VA_UTIMES_NULL is set.
 		 */
 		if ((error = VOP_ACCESS(vp, VADMIN, ap->a_cred, td)) &&
 		    ((vap->va_vaflags & VA_UTIMES_NULL) == 0 ||
 		    (error = VOP_ACCESS(vp, VWRITE, ap->a_cred, td))))
 			return (error);
 		if (vap->va_atime.tv_sec != VNOVAL) {
 			pn->mn_atime = vap->va_atime;
 		}
 		if (vap->va_mtime.tv_sec != VNOVAL) {
 			pn->mn_mtime = vap->va_mtime;
 		}
 		c = 1;
 	}
 	if (c) {
 		vfs_timestamp(&pn->mn_ctime);
 	}
 	return (0);
 }
 
 #if 0
 struct vop_read_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_vp;
 	struct uio *a_uio;
 	int a_ioflag;
 	struct ucred *a_cred;
 };
 #endif
 
 /*
  * Read from a file: produces a single text line describing the queue
  * (bytes queued, message limit, current message count, message size
  * limit).  Only regular files can be read; anything else gets EINVAL.
  */
 static int
 mqfs_read(struct vop_read_args *ap)
 {
 	struct vnode *vp;
 	struct mqueue *mq;
 	char line[80];
 	int nbytes;
 
 	vp = ap->a_vp;
 	if (vp->v_type != VREG)
 		return (EINVAL);
 
 	mq = VTOMQ(vp);
 	snprintf(line, sizeof(line),
 	    "QSIZE:%-10ld MAXMSG:%-10ld CURMSG:%-10ld MSGSIZE:%-10ld\n",
 	    mq->mq_totalbytes, mq->mq_maxmsg, mq->mq_curmsgs,
 	    mq->mq_msgsize);
 	line[sizeof(line) - 1] = '\0';
 	nbytes = strlen(line);
 	return (uiomove_frombuf(line, nbytes, ap->a_uio));
 }
 
 #if 0
 struct vop_readdir_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_vp;
 	struct uio *a_uio;
 	struct ucred *a_cred;
 	int *a_eofflag;
 	int *a_ncookies;
 	u_long **a_cookies;
 };
 #endif
 
 /*
  * Return directory entries.
  *
  * Walks the children of the directory node with the mqfs lock held and
  * passes one fixed-size dirent per visible child to vfs_read_dirent().
  * A child whose mn_pr_root is set and differs from the caller's prison
  * root is hidden.  "offset" advances by one record for every visible
  * child, so children located before uio_offset are skipped without being
  * copied; on return uio_offset is set to the offset reached.
  *
  * Returns ENOTDIR for a non-directory vnode, EINVAL for a negative
  * offset, otherwise 0 or the error reported by vfs_read_dirent().
  */
 static int
 mqfs_readdir(struct vop_readdir_args *ap)
 {
 	struct vnode *vp;
 	struct mqfs_info *mi;
 	struct mqfs_node *pd;
 	struct mqfs_node *pn;
 	struct dirent entry;
 	struct uio *uio;
 	const void *pr_root;
 	int *tmp_ncookies = NULL;
 	off_t offset;
 	int error, i;
 
 	vp = ap->a_vp;
 	mi = VFSTOMQFS(vp->v_mount);
 	pd = VTON(vp);
 	uio = ap->a_uio;
 
 	if (vp->v_type != VDIR)
 		return (ENOTDIR);
 
 	if (uio->uio_offset < 0)
 		return (EINVAL);
 
 	/*
 	 * Report zero cookies and clear the cookie pointer in the argument
 	 * block while entries are emitted; the pointer is restored before
 	 * returning.
 	 */
 	if (ap->a_ncookies != NULL) {
 		tmp_ncookies = ap->a_ncookies;
 		*ap->a_ncookies = 0;
 		ap->a_ncookies = NULL;
 	}
 
 	error = 0;
 	offset = 0;
 
 	pr_root = ap->a_cred->cr_prison->pr_root;
 	sx_xlock(&mi->mi_lock);
 
 	LIST_FOREACH(pn, &pd->mn_children, mn_sibling) {
 		/*
 		 * Only show names within the same prison root directory
 		 * (or not associated with a prison, e.g. "." and "..").
 		 */
 		if (pn->mn_pr_root != NULL && pn->mn_pr_root != pr_root)
 			continue;
 
 		/*
 		 * Start every record from all-zero bytes.  The record is
 		 * handed on with d_reclen == sizeof(entry), so any field,
 		 * padding or name tail left unset would otherwise carry
 		 * stale kernel stack contents (or a previous entry's name)
 		 * out to the caller.
 		 */
 		bzero(&entry, sizeof(entry));
 		entry.d_reclen = sizeof(entry);
 
 		/* File numbers are assigned lazily, on first listing. */
 		if (!pn->mn_fileno)
 			mqfs_fileno_alloc(mi, pn);
 		entry.d_fileno = pn->mn_fileno;
 		for (i = 0; i < MQFS_NAMELEN - 1 && pn->mn_name[i] != '\0'; ++i)
 			entry.d_name[i] = pn->mn_name[i];
 		entry.d_name[i] = 0;
 		entry.d_namlen = i;
 		switch (pn->mn_type) {
 		case mqfstype_root:
 		case mqfstype_dir:
 		case mqfstype_this:
 		case mqfstype_parent:
 			entry.d_type = DT_DIR;
 			break;
 		case mqfstype_file:
 			entry.d_type = DT_REG;
 			break;
 		case mqfstype_symlink:
 			entry.d_type = DT_LNK;
 			break;
 		default:
 			panic("%s has unexpected node type: %d", pn->mn_name,
 				pn->mn_type);
 		}
 		/* Stop once the caller's buffer cannot hold another record. */
 		if (entry.d_reclen > uio->uio_resid)
 			break;
 		if (offset >= uio->uio_offset) {
 			error = vfs_read_dirent(ap, &entry, offset);
 			if (error)
 				break;
 		}
 		offset += entry.d_reclen;
 	}
 	sx_xunlock(&mi->mi_lock);
 
 	uio->uio_offset = offset;
 
 	if (tmp_ncookies != NULL)
 		ap->a_ncookies = tmp_ncookies;
 
 	return (error);
 }
 
 #ifdef notyet
 
 #if 0
 struct vop_mkdir_args {
 	struct vnode *a_dvp;
 	struct vnode **a_vpp;
 	struct componentname *a_cnp;
 	struct vattr *a_vap;
 };
 #endif
 
 /*
  * Create a directory.
  *
  * The parent must be the root or a directory node (ENOTDIR otherwise).
  * The new node is created under the mqfs lock and temporarily referenced
  * so it stays valid while a vnode is allocated for it after the lock has
  * been dropped.  Returns ENOSPC when the node cannot be created, else
  * the result of mqfs_allocv().
  */
 static int
 mqfs_mkdir(struct vop_mkdir_args *ap)
 {
 	struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount);
 	struct componentname *cnp = ap->a_cnp;
 	struct mqfs_node *pd = VTON(ap->a_dvp);
 	struct mqfs_node *pn;
 	int error;
 
 	if (pd->mn_type != mqfstype_root && pd->mn_type != mqfstype_dir)
 		return (ENOTDIR);
 	sx_xlock(&mqfs->mi_lock);
 	if ((cnp->cn_flags & HASBUF) == 0)
 		panic("%s: no name", __func__);
 	/*
 	 * The credential belongs to the component name, not to the
 	 * attribute block; the mode is the only thing taken from a_vap.
 	 */
 	pn = mqfs_create_dir(pd, cnp->cn_nameptr, cnp->cn_namelen,
 		cnp->cn_cred, ap->a_vap->va_mode);
 	if (pn != NULL)
 		mqnode_addref(pn);
 	sx_xunlock(&mqfs->mi_lock);
 	if (pn == NULL) {
 		error = ENOSPC;
 	} else {
 		error = mqfs_allocv(ap->a_dvp->v_mount, ap->a_vpp, pn);
 		/* Drop the temporary reference taken under the lock. */
 		mqnode_release(pn);
 	}
 	return (error);
 }
 
 #if 0
 struct vop_rmdir_args {
 	struct vnode *a_dvp;
 	struct vnode *a_vp;
 	struct componentname *a_cnp;
 };
 #endif
 
 /*
  * Remove a directory.
  */
 static int
 mqfs_rmdir(struct vop_rmdir_args *ap)
 {
 	struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount);
 	struct mqfs_node *pn = VTON(ap->a_vp);
 	struct mqfs_node *pt;
 
 	if (pn->mn_type != mqfstype_dir)
 		return (ENOTDIR);
 
 	sx_xlock(&mqfs->mi_lock);
 	if (pn->mn_deleted) {
 		sx_xunlock(&mqfs->mi_lock);
 		return (ENOENT);
 	}
 
 	pt = LIST_FIRST(&pn->mn_children);
 	pt = LIST_NEXT(pt, mn_sibling);
 	pt = LIST_NEXT(pt, mn_sibling);
 	if (pt != NULL) {
 		sx_xunlock(&mqfs->mi_lock);
 		return (ENOTEMPTY);
 	}
 	pt = pn->mn_parent;
 	pn->mn_parent = NULL;
 	pn->mn_deleted = 1;
 	LIST_REMOVE(pn, mn_sibling);
 	mqnode_release(pn);
 	mqnode_release(pt);
 	sx_xunlock(&mqfs->mi_lock);
 	cache_purge(ap->a_vp);
 	return (0);
 }
 
 #endif /* notyet */
 
 /*
  * Prison removal hook: if no other referenced prison shares this
  * prison's root directory, unlink every queue in the mqfs root that is
  * tagged with that root.  Always returns 0.
  */
 static int
 mqfs_prison_remove(void *obj, void *data __unused)
 {
 	const struct prison *pr = obj;
 	const struct prison *other;
 	struct mqfs_node *node, *next;
 	int shared;
 
 	shared = 0;
 	TAILQ_FOREACH(other, &allprison, pr_list) {
 		if (other != pr && other->pr_ref > 0 &&
 		    other->pr_root == pr->pr_root)
 			shared = 1;
 	}
 	if (shared)
 		return (0);
 
 	/*
 	 * No jails are rooted in this directory anymore,
 	 * so no queues should be either.
 	 */
 	sx_xlock(&mqfs_data.mi_lock);
 	LIST_FOREACH_SAFE(node, &mqfs_data.mi_root->mn_children,
 	    mn_sibling, next) {
 		if (node->mn_pr_root == pr->pr_root)
 			(void)do_unlink(node, curthread->td_ucred);
 	}
 	sx_xunlock(&mqfs_data.mi_lock);
 	return (0);
 }
 
 /*
  * Allocate and initialise a message queue.
  *
  * Returns NULL when the global queue count has reached maxmq.  Limits
  * come from "attr" when supplied, otherwise from the default_maxmsg and
  * default_msgsize values.  The global count is bumped on success.
  */
 static struct mqueue *
 mqueue_alloc(const struct mq_attr *attr)
 {
 	struct mqueue *mq;
 
 	if (curmq >= maxmq)
 		return (NULL);
 
 	mq = uma_zalloc(mqueue_zone, M_WAITOK | M_ZERO);
 	TAILQ_INIT(&mq->mq_msgq);
 	mq->mq_maxmsg = (attr != NULL) ? attr->mq_maxmsg : default_maxmsg;
 	mq->mq_msgsize = (attr != NULL) ? attr->mq_msgsize : default_msgsize;
 
 	/* Both knote lists are protected by the queue mutex. */
 	mtx_init(&mq->mq_mutex, "mqueue lock", NULL, MTX_DEF);
 	knlist_init_mtx(&mq->mq_rsel.si_note, &mq->mq_mutex);
 	knlist_init_mtx(&mq->mq_wsel.si_note, &mq->mq_mutex);
 
 	atomic_add_int(&curmq, 1);
 	return (mq);
 }
 
 /*
  * Destroy a message queue: discard any messages still queued, tear down
  * the mutex, select and knote state, return the queue to its zone and
  * decrement the global queue count.
  */
 static void
 mqueue_free(struct mqueue *mq)
 {
 	struct mqueue_msg *pending;
 
 	while (!TAILQ_EMPTY(&mq->mq_msgq)) {
 		pending = TAILQ_FIRST(&mq->mq_msgq);
 		TAILQ_REMOVE(&mq->mq_msgq, pending, msg_link);
 		free(pending, M_MQUEUEDATA);
 	}
 
 	mtx_destroy(&mq->mq_mutex);
 
 	seldrain(&mq->mq_rsel);
 	seldrain(&mq->mq_wsel);
 
 	knlist_destroy(&mq->mq_rsel.si_note);
 	knlist_destroy(&mq->mq_wsel.si_note);
 
 	uma_zfree(mqueue_zone, mq);
 	atomic_add_int(&curmq, -1);
 }
 
 /*
  * Load a message from user space.
  *
  * Allocates a message header with room for msg_size payload bytes right
  * behind it and copies the payload in from msg_ptr.  Returns the new
  * message, or NULL (after freeing the allocation) if the copy fails.
  */
 static struct mqueue_msg *
 mqueue_loadmsg(const char *msg_ptr, size_t msg_size, int msg_prio)
 {
 	struct mqueue_msg *msg;
 	char *payload;
 
 	msg = malloc(sizeof(struct mqueue_msg) + msg_size, M_MQUEUEDATA,
 	    M_WAITOK);
 	payload = ((char *)msg) + sizeof(struct mqueue_msg);
 	if (copyin(msg_ptr, payload, msg_size) != 0) {
 		free(msg, M_MQUEUEDATA);
 		return (NULL);
 	}
 	msg->msg_size = msg_size;
 	msg->msg_prio = msg_prio;
 	return (msg);
 }
 
 /*
  * Save a message to user space
  */
 static int
 mqueue_savemsg(struct mqueue_msg *msg, char *msg_ptr, int *msg_prio)
 {
 	int error;
 
 	error = copyout(((char *)msg) + sizeof(*msg), msg_ptr,
 		msg->msg_size);
 	if (error == 0 && msg_prio != NULL)
 		error = copyout(&msg->msg_prio, msg_prio, sizeof(int));
 	return (error);
 }
 
 /*
  * Free a message's memory.  The header and payload are a single
  * M_MQUEUEDATA allocation, so one free() releases both.
  */
 static __inline void
 mqueue_freemsg(struct mqueue_msg *msg)
 {
 	free(msg, M_MQUEUEDATA);
 }
 
 /*
  * Send a message.
  *
  * The message is copied in from user space first.  If waitok is false the
  * thread never blocks and EAGAIN is returned when the queue is full.  If
  * waitok is true and abs_timeout is NULL the thread waits without a time
  * limit.  Otherwise one non-blocking attempt is made and, only if the
  * queue is full, abs_timeout is validated and the thread waits until that
  * absolute time, returning ETIMEDOUT once it has passed.
  *
  * Returns EINVAL for an out-of-range priority or an invalid tv_nsec,
  * EMSGSIZE if msg_len exceeds the queue's message size, EFAULT if the
  * message cannot be copied in, or the error from _mqueue_send().  On any
  * failure after the copy-in the message is freed here; on success it has
  * been linked into the queue.
  */
 int
 mqueue_send(struct mqueue *mq, const char *msg_ptr,
 	size_t msg_len, unsigned msg_prio, int waitok,
 	const struct timespec *abs_timeout)
 {
 	struct mqueue_msg *msg;
 	struct timespec ts, ts2;
 	struct timeval tv;
 	int error;
 
 	if (msg_prio >= MQ_PRIO_MAX)
 		return (EINVAL);
 	if (msg_len > mq->mq_msgsize)
 		return (EMSGSIZE);
 	msg = mqueue_loadmsg(msg_ptr, msg_len, msg_prio);
 	if (msg == NULL)
 		return (EFAULT);
 
 	/* O_NONBLOCK case */
 	if (!waitok) {
 		error = _mqueue_send(mq, msg, -1);
 		if (error)
 			goto bad;
 		return (0);
 	}
 
 	/* we allow a null timeout (wait forever) */
 	if (abs_timeout == NULL) {
 		error = _mqueue_send(mq, msg, 0);
 		if (error)
 			goto bad;
 		return (0);
 	}
 
 	/* send it before checking time */
 	error = _mqueue_send(mq, msg, -1);
 	if (error == 0)
 		return (0);
 
 	if (error != EAGAIN)
 		goto bad;
 
 	if (abs_timeout->tv_nsec >= 1000000000 || abs_timeout->tv_nsec < 0) {
 		error = EINVAL;
 		goto bad;
 	}
 	/*
 	 * Recompute the remaining time on every pass, so a wait that ends
 	 * with ETIMEDOUT is re-checked against the absolute deadline.
 	 */
 	for (;;) {
-		ts2 = *abs_timeout;
 		getnanotime(&ts);
-		timespecsub(&ts2, &ts);
+		timespecsub(abs_timeout, &ts, &ts2);
 		if (ts2.tv_sec < 0 || (ts2.tv_sec == 0 && ts2.tv_nsec <= 0)) {
 			error = ETIMEDOUT;
 			break;
 		}
 		TIMESPEC_TO_TIMEVAL(&tv, &ts2);
 		error = _mqueue_send(mq, msg, tvtohz(&tv));
 		if (error != ETIMEDOUT)
 			break;
 	}
 	if (error == 0)
 		return (0);
 bad:
 	/* The message was never queued; release it. */
 	mqueue_freemsg(msg);
 	return (error);
 }
 
 /*
  * Common routine to send a message.
  *
  * "timo" selects the blocking behaviour while the queue is full: a
  * negative value returns EAGAIN immediately, any other value is passed
  * to msleep() as its timeout (callers pass 0 for "wait forever").
  *
  * The queue is kept ordered by descending priority, with messages of
  * equal priority in arrival order.  After queueing, one sleeping
  * receiver is woken, or, if none is sleeping and a notifier is
  * registered, the notification is sent; select and kqueue waiters on the
  * read side are notified as well.
  */
 static int
 _mqueue_send(struct mqueue *mq, struct mqueue_msg *msg, int timo)
 {	
 	struct mqueue_msg *msg2;
 	int error = 0;
 
 	mtx_lock(&mq->mq_mutex);
 	while (mq->mq_curmsgs >= mq->mq_maxmsg && error == 0) {
 		if (timo < 0) {
 			mtx_unlock(&mq->mq_mutex);
 			return (EAGAIN);
 		}
 		mq->mq_senders++;
 		error = msleep(&mq->mq_senders, &mq->mq_mutex,
 			    PCATCH, "mqsend", timo);
 		mq->mq_senders--;
 		/* Report msleep()'s EAGAIN result as a timeout. */
 		if (error == EAGAIN)
 			error = ETIMEDOUT;
 	}
 	/* Still full: the sleep failed, hand its error to the caller. */
 	if (mq->mq_curmsgs >= mq->mq_maxmsg) {
 		mtx_unlock(&mq->mq_mutex);
 		return (error);
 	}
 	/* Room is available, so any sleep error no longer matters. */
 	error = 0;
 	if (TAILQ_EMPTY(&mq->mq_msgq)) {
 		TAILQ_INSERT_HEAD(&mq->mq_msgq, msg, msg_link);
 	} else {
 		if (msg->msg_prio <= TAILQ_LAST(&mq->mq_msgq, msgq)->msg_prio) {
 			TAILQ_INSERT_TAIL(&mq->mq_msgq, msg, msg_link);
 		} else {
 			/*
 			 * The tail has a lower priority than the new
 			 * message, so this search always stops on an
 			 * element and msg2 is non-NULL below.
 			 */
 			TAILQ_FOREACH(msg2, &mq->mq_msgq, msg_link) {
 				if (msg2->msg_prio < msg->msg_prio)
 					break;
 			}
 			TAILQ_INSERT_BEFORE(msg2, msg, msg_link);
 		}
 	}
 	mq->mq_curmsgs++;
 	mq->mq_totalbytes += msg->msg_size;
 	if (mq->mq_receivers)
 		wakeup_one(&mq->mq_receivers);
 	else if (mq->mq_notifier != NULL)
 		mqueue_send_notification(mq);
 	if (mq->mq_flags & MQ_RSEL) {
 		mq->mq_flags &= ~MQ_RSEL;
 		selwakeup(&mq->mq_rsel);
 	}
 	KNOTE_LOCKED(&mq->mq_rsel.si_note, 0);
 	mtx_unlock(&mq->mq_mutex);
 	return (0);
 }
 
 /*
  * Deliver the notification registered through mq_notify: send the
  * realtime signal described by the notifier's sigevent, unless the
  * notifier asked for SIGEV_NONE.
  *
  * Must be called with the queue mutex held and mq->mq_notifier non-NULL.
  * The registration is one-shot: mq->mq_notifier is cleared on every
  * path, including when the target thread cannot be found.
  */
 static void
 mqueue_send_notification(struct mqueue *mq)
 {
 	struct mqueue_notifier *nt;
 	struct thread *td;
 	struct proc *p;
 	int error;
 
 	mtx_assert(&mq->mq_mutex, MA_OWNED);
 	nt = mq->mq_notifier;
 	if (nt->nt_sigev.sigev_notify != SIGEV_NONE) {
 		p = nt->nt_proc;
 		/*
 		 * NOTE(review): a successful sigev_findtd() appears to
 		 * return with the process locked, since PROC_UNLOCK(p)
 		 * follows without a matching PROC_LOCK here -- confirm.
 		 */
 		error = sigev_findtd(p, &nt->nt_sigev, &td);
 		if (error) {
 			mq->mq_notifier = NULL;
 			return;
 		}
 		/* Do not queue the signal again if it is already pending. */
 		if (!KSI_ONQ(&nt->nt_ksi)) {
 			ksiginfo_set_sigev(&nt->nt_ksi, &nt->nt_sigev);
 			tdsendsignal(p, td, nt->nt_ksi.ksi_signo, &nt->nt_ksi);
 		}
 		PROC_UNLOCK(p);
 	}
 	mq->mq_notifier = NULL;
 }
 
 /*
  * Receive a message.
  *
  * If waitok is false the thread never blocks and EAGAIN is returned when
  * the queue is empty.  If waitok is true and abs_timeout is NULL the
  * thread waits without a time limit.  Otherwise one non-blocking attempt
  * is made and, only if the queue is empty, abs_timeout is validated and
  * the thread waits until that absolute time, returning ETIMEDOUT once it
  * has passed.
  *
  * Returns EMSGSIZE if msg_len is smaller than the queue's message size
  * and EINVAL for an invalid tv_nsec.  On success the payload (and the
  * priority, when msg_prio is non-NULL) is copied out and the payload
  * length is stored in td_retval[0].  The dequeued message is freed
  * whether or not the copy-out succeeds.
  */
 int
 mqueue_receive(struct mqueue *mq, char *msg_ptr,
 	size_t msg_len, unsigned *msg_prio, int waitok,
 	const struct timespec *abs_timeout)
 {
 	struct mqueue_msg *msg;
 	struct timespec ts, ts2;
 	struct timeval tv;
 	int error;
 
 	if (msg_len < mq->mq_msgsize)
 		return (EMSGSIZE);
 
 	/* O_NONBLOCK case */
 	if (!waitok) {
 		error = _mqueue_recv(mq, &msg, -1);
 		if (error)
 			return (error);
 		goto received;
 	}
 
 	/* we allow a null timeout (wait forever). */
 	if (abs_timeout == NULL) {
 		error = _mqueue_recv(mq, &msg, 0);
 		if (error)
 			return (error);
 		goto received;
 	}
 
 	/* try to get a message before checking time */
 	error = _mqueue_recv(mq, &msg, -1);
 	if (error == 0)
 		goto received;
 
 	if (error != EAGAIN)
 		return (error);
 
 	if (abs_timeout->tv_nsec >= 1000000000 || abs_timeout->tv_nsec < 0) {
 		error = EINVAL;
 		return (error);
 	}
 
 	/*
 	 * Recompute the remaining time on every pass, so a wait that ends
 	 * with ETIMEDOUT is re-checked against the absolute deadline.
 	 */
 	for (;;) {
-		ts2 = *abs_timeout;
 		getnanotime(&ts);
-		timespecsub(&ts2, &ts);
+		timespecsub(abs_timeout, &ts, &ts2);
 		if (ts2.tv_sec < 0 || (ts2.tv_sec == 0 && ts2.tv_nsec <= 0)) {
 			error = ETIMEDOUT;
 			return (error);
 		}
 		TIMESPEC_TO_TIMEVAL(&tv, &ts2);
 		error = _mqueue_recv(mq, &msg, tvtohz(&tv));
 		if (error == 0)
 			break;
 		if (error != ETIMEDOUT)
 			return (error);
 	}
 
 received:
 	error = mqueue_savemsg(msg, msg_ptr, msg_prio);
 	if (error == 0) {
 		curthread->td_retval[0] = msg->msg_size;
 		curthread->td_retval[1] = 0;
 	}
 	mqueue_freemsg(msg);
 	return (error);
 }
 
 /*
  * Common routine to receive a message.
  *
  * "timo" selects the blocking behaviour while the queue is empty: a
  * negative value returns EAGAIN immediately, any other value is passed
  * to msleep() as its timeout (callers pass 0 for "wait forever").
  *
  * On success the head of the queue is unlinked and returned through
  * *msg, one sleeping sender is woken, and select and kqueue waiters on
  * the write side are notified.  On failure *msg is NULL.
  */
 static int
 _mqueue_recv(struct mqueue *mq, struct mqueue_msg **msg, int timo)
 {	
 	int error = 0;
 	
 	mtx_lock(&mq->mq_mutex);
 	while ((*msg = TAILQ_FIRST(&mq->mq_msgq)) == NULL && error == 0) {
 		if (timo < 0) {
 			mtx_unlock(&mq->mq_mutex);
 			return (EAGAIN);
 		}
 		mq->mq_receivers++;
 		error = msleep(&mq->mq_receivers, &mq->mq_mutex,
 			    PCATCH, "mqrecv", timo);
 		mq->mq_receivers--;
 		/* Report msleep()'s EAGAIN result as a timeout. */
 		if (error == EAGAIN)
 			error = ETIMEDOUT;
 	}
 	if (*msg != NULL) {
 		/* A message is available, so any sleep error is dropped. */
 		error = 0;
 		TAILQ_REMOVE(&mq->mq_msgq, *msg, msg_link);
 		mq->mq_curmsgs--;
 		mq->mq_totalbytes -= (*msg)->msg_size;
 		if (mq->mq_senders)
 			wakeup_one(&mq->mq_senders);
 		if (mq->mq_flags & MQ_WSEL) {
 			mq->mq_flags &= ~MQ_WSEL;
 			selwakeup(&mq->mq_wsel);
 		}
 		KNOTE_LOCKED(&mq->mq_wsel.si_note, 0);
 	}
 	/*
 	 * If messages remain and no receiver is sleeping on the queue,
 	 * fire any registered notification now.
 	 */
 	if (mq->mq_notifier != NULL && mq->mq_receivers == 0 &&
 	    !TAILQ_EMPTY(&mq->mq_msgq)) {
 		mqueue_send_notification(mq);
 	}
 	mtx_unlock(&mq->mq_mutex);
 	return (error);
 }
 
 /*
  * Allocate a zero-filled notifier from its zone.  The request is
  * M_WAITOK, so callers must be in a context that may sleep.
  */
 static __inline struct mqueue_notifier *
 notifier_alloc(void)
 {
 	return (uma_zalloc(mqnoti_zone, M_WAITOK | M_ZERO));
 }
 
 /*
  * Return a notifier to its zone.
  */
 static __inline void
 notifier_free(struct mqueue_notifier *p)
 {
 	uma_zfree(mqnoti_zone, p);
 }
 
 /*
  * Look up the notifier a process has registered for descriptor "fd".
  * Returns the notifier, or NULL when the process has none for it.
  */
 static struct mqueue_notifier *
 notifier_search(struct proc *p, int fd)
 {
 	struct mqueue_notifier *cur;
 
 	LIST_FOREACH(cur, &p->p_mqnotifier, nt_link) {
 		if (cur->nt_ksi.ksi_mqd == fd)
 			return (cur);
 	}
 	return (NULL);
 }
 
 /*
  * Link a notifier onto the head of the process's notifier list.
  */
 static __inline void
 notifier_insert(struct proc *p, struct mqueue_notifier *nt)
 {
 	LIST_INSERT_HEAD(&p->p_mqnotifier, nt, nt_link);
 }
 
 /*
  * Unlink a notifier from the list it is on and free it.  "p" is not
  * referenced; the list linkage alone identifies the list.
  */
 static __inline void
 notifier_delete(struct proc *p, struct mqueue_notifier *nt)
 {
 	LIST_REMOVE(nt, nt_link);
 	notifier_free(nt);
 }
 
 /*
  * Drop the notifier that process "p" registered for descriptor "fd", if
  * there is one: detach it from the queue when it is the active notifier,
  * take its signal off the signal queue, and free it.  The queue mutex
  * must be held by the caller; the process lock is taken here.
  */
 static void
 notifier_remove(struct proc *p, struct mqueue *mq, int fd)
 {
 	struct mqueue_notifier *nt;
 
 	mtx_assert(&mq->mq_mutex, MA_OWNED);
 	PROC_LOCK(p);
 	nt = notifier_search(p, fd);
 	if (nt == NULL) {
 		PROC_UNLOCK(p);
 		return;
 	}
 	if (mq->mq_notifier == nt)
 		mq->mq_notifier = NULL;
 	sigqueue_take(&nt->nt_ksi);
 	notifier_delete(p, nt);
 	PROC_UNLOCK(p);
 }
 
 /*
  * Open, and optionally create, the message queue named by "upath".
  *
  * "flags" are kernel file flags (FREAD/FWRITE/O_*), "mode" is the
  * creation mode before the process umask is applied, and "attr", when
  * non-NULL and O_CREAT is set, gives the limits for a new queue.
  *
  * Returns 0 with the new descriptor in td_retval[0], or: EINVAL for
  * out-of-range attributes or a malformed name, the copyinstr()/falloc()
  * error, ENOENT if the queue is missing and O_CREAT is not set, ENFILE
  * if no queue can be allocated, ENOSPC if the node cannot be created,
  * EEXIST for O_CREAT|O_EXCL on an existing queue, or the vaccess()
  * error for an existing queue.
  */
 static int
 kern_kmq_open(struct thread *td, const char *upath, int flags, mode_t mode,
     const struct mq_attr *attr)
 {
 	char path[MQFS_NAMELEN + 1];
 	struct mqfs_node *pn;
 	struct filedesc *fdp;
 	struct file *fp;
 	struct mqueue *mq;
 	int fd, error, len, cmode;
 
 	AUDIT_ARG_FFLAGS(flags);
 	AUDIT_ARG_MODE(mode);
 
 	fdp = td->td_proc->p_fd;
 	/* Apply the umask, keep permission bits only, drop S_ISTXT. */
 	cmode = (((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT);
 	mq = NULL;
 	if ((flags & O_CREAT) != 0 && attr != NULL) {
 		if (attr->mq_maxmsg <= 0 || attr->mq_maxmsg > maxmsg)
 			return (EINVAL);
 		if (attr->mq_msgsize <= 0 || attr->mq_msgsize > maxmsgsize)
 			return (EINVAL);
 	}
 
 	error = copyinstr(upath, path, MQFS_NAMELEN + 1, NULL);
         if (error)
 		return (error);
 
 	/*
 	 * The first character of name must be a slash  (/) character
 	 * and the remaining characters of name cannot include any slash
 	 * characters.
 	 */
 	len = strlen(path);
 	if (len < 2 || path[0] != '/' || strchr(path + 1, '/') != NULL)
 		return (EINVAL);
 	AUDIT_ARG_UPATH1_CANON(path);
 
 	/* The descriptor is always allocated close-on-exec. */
 	error = falloc(td, &fp, &fd, O_CLOEXEC);
 	if (error)
 		return (error);
 
 	sx_xlock(&mqfs_data.mi_lock);
 	/* Queues live directly in the mqfs root; skip the leading '/'. */
 	pn = mqfs_search(mqfs_data.mi_root, path + 1, len - 1, td->td_ucred);
 	if (pn == NULL) {
 		if (!(flags & O_CREAT)) {
 			error = ENOENT;
 		} else {
 			mq = mqueue_alloc(attr);
 			if (mq == NULL) {
 				error = ENFILE;
 			} else {
 				pn = mqfs_create_file(mqfs_data.mi_root,
 				         path + 1, len - 1, td->td_ucred,
 					 cmode);
 				if (pn == NULL) {
 					error = ENOSPC;
 					mqueue_free(mq);
 				}
 			}
 		}
 
 		/* Attach the freshly allocated queue to its new node. */
 		if (error == 0) {
 			pn->mn_data = mq;
 		}
 	} else {
 		if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) {
 			error = EEXIST;
 		} else {
 			accmode_t accmode = 0;
 
 			/* Check the requested access against the node. */
 			if (flags & FREAD)
 				accmode |= VREAD;
 			if (flags & FWRITE)
 				accmode |= VWRITE;
 			error = vaccess(VREG, pn->mn_mode, pn->mn_uid,
 				    pn->mn_gid, accmode, td->td_ucred, NULL);
 		}
 	}
 
 	if (error) {
 		sx_xunlock(&mqfs_data.mi_lock);
 		/* Undo falloc(): release the slot and our reference. */
 		fdclose(td, fp, fd);
 		fdrop(fp, td);
 		return (error);
 	}
 
 	/* This reference is dropped again by mqf_close(). */
 	mqnode_addref(pn);
 	sx_xunlock(&mqfs_data.mi_lock);
 
 	finit(fp, flags & (FREAD | FWRITE | O_NONBLOCK), DTYPE_MQUEUE, pn,
 	    &mqueueops);
 
 	td->td_retval[0] = fd;
 	fdrop(fp, td);
 	return (0);
 }
 
 /*
  * Syscall to open a message queue.
  *
  * Rejects an O_ACCMODE access mode and O_EXEC with EINVAL, converts the
  * open flags to kernel file flags, and copies the attributes in only
  * when a queue may be created.
  */
 int
 sys_kmq_open(struct thread *td, struct kmq_open_args *uap)
 {
 	struct mq_attr attr;
 	const struct mq_attr *attrp;
 	int flags, error;
 
 	if ((uap->flags & O_ACCMODE) == O_ACCMODE ||
 	    (uap->flags & O_EXEC) != 0)
 		return (EINVAL);
 
 	flags = FFLAGS(uap->flags);
 	if (uap->attr != NULL && (flags & O_CREAT) != 0) {
 		error = copyin(uap->attr, &attr, sizeof(attr));
 		if (error != 0)
 			return (error);
 	}
 	attrp = (uap->attr != NULL) ? &attr : NULL;
 	return (kern_kmq_open(td, uap->path, flags, uap->mode, attrp));
 }
 
 /*
  * Syscall to unlink a message queue.
  */
 int
 sys_kmq_unlink(struct thread *td, struct kmq_unlink_args *uap)
 {
 	char path[MQFS_NAMELEN+1];
 	struct mqfs_node *pn;
 	int error, len;
 
 	error = copyinstr(uap->path, path, MQFS_NAMELEN + 1, NULL);
         if (error)
 		return (error);
 
 	len = strlen(path);
 	if (len < 2 || path[0] != '/' || strchr(path + 1, '/') != NULL)
 		return (EINVAL);
 	AUDIT_ARG_UPATH1_CANON(path);
 
 	sx_xlock(&mqfs_data.mi_lock);
 	pn = mqfs_search(mqfs_data.mi_root, path + 1, len - 1, td->td_ucred);
 	if (pn != NULL)
 		error = do_unlink(pn, td->td_ucred);
 	else
 		error = ENOENT;
 	sx_xunlock(&mqfs_data.mi_lock);
 	return (error);
 }
 
 /* Signature shared by the fget()-style lookups handed to _getmq(). */
 typedef int (*_fgetf)(struct thread *, int, cap_rights_t *, struct file **);
 
 /*
  * Get message queue by giving file slot.
  *
  * Resolves "fd" through "func" and checks that the file is a message
  * queue descriptor.  On success the file is returned through fpp and
  * must be released by the caller with fdrop(); the node and the queue
  * are stored through ppn and pmq when those are non-NULL.  Returns the
  * lookup error, or EBADF for a descriptor of another type.
  */
 static int
 _getmq(struct thread *td, int fd, cap_rights_t *rightsp, _fgetf func,
        struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq)
 {
 	struct mqfs_node *pn;
 	struct file *fp;
 	int error;
 
 	error = func(td, fd, rightsp, fpp);
 	if (error != 0)
 		return (error);
 
 	fp = *fpp;
 	if (fp->f_ops != &mqueueops) {
 		fdrop(fp, td);
 		return (EBADF);
 	}
 
 	pn = fp->f_data;
 	if (ppn != NULL)
 		*ppn = pn;
 	if (pmq != NULL)
 		*pmq = pn->mn_data;
 	return (0);
 }
 
 /*
  * Look up a message queue descriptor with fget(), requiring the
  * capability rights in cap_event_rights.  See _getmq() for the outputs.
  */
 static __inline int
 getmq(struct thread *td, int fd, struct file **fpp, struct mqfs_node **ppn,
 	struct mqueue **pmq)
 {
 
 	return _getmq(td, fd, &cap_event_rights, fget,
 	    fpp, ppn, pmq);
 }
 
 /*
  * Look up a message queue descriptor with fget_read(), requiring the
  * capability rights in cap_read_rights.  See _getmq() for the outputs.
  */
 static __inline int
 getmq_read(struct thread *td, int fd, struct file **fpp,
 	 struct mqfs_node **ppn, struct mqueue **pmq)
 {
 
 	return _getmq(td, fd, &cap_read_rights, fget_read,
 	    fpp, ppn, pmq);
 }
 
 /*
  * Look up a message queue descriptor with fget_write(), requiring the
  * capability rights in cap_write_rights.  See _getmq() for the outputs.
  */
 static __inline int
 getmq_write(struct thread *td, int fd, struct file **fpp,
 	struct mqfs_node **ppn, struct mqueue **pmq)
 {
 
 	return _getmq(td, fd, &cap_write_rights, fget_write,
 	    fpp, ppn, pmq);
 }
 
 /*
  * Report a queue's attributes and optionally change its O_NONBLOCK flag.
  *
  * "oattr" must be non-NULL; it receives the queue limits, the current
  * message count and the O_NONBLOCK state as it was before any change.
  * When "attr" is non-NULL only its O_NONBLOCK bit is applied; any other
  * bit set in attr->mq_flags yields EINVAL.
  */
 static int
 kern_kmq_setattr(struct thread *td, int mqd, const struct mq_attr *attr,
     struct mq_attr *oattr)
 {
 	struct mqueue *mq;
 	struct file *fp;
 	u_int oflag, flag;
 	int error;
 
 	AUDIT_ARG_FD(mqd);
 	if (attr != NULL && (attr->mq_flags & ~O_NONBLOCK) != 0)
 		return (EINVAL);
 	error = getmq(td, mqd, &fp, NULL, &mq);
 	if (error)
 		return (error);
 	oattr->mq_maxmsg  = mq->mq_maxmsg;
 	oattr->mq_msgsize = mq->mq_msgsize;
 	oattr->mq_curmsgs = mq->mq_curmsgs;
 	if (attr != NULL) {
 		/*
 		 * Replace the O_NONBLOCK bit with a compare-and-set loop,
 		 * retrying if f_flag changed underneath us; oflag ends up
 		 * holding the flags that were actually replaced.
 		 */
 		do {
 			oflag = flag = fp->f_flag;
 			flag &= ~O_NONBLOCK;
 			flag |= (attr->mq_flags & O_NONBLOCK);
 		} while (atomic_cmpset_int(&fp->f_flag, oflag, flag) == 0);
 	} else
 		oflag = fp->f_flag;
 	oattr->mq_flags = (O_NONBLOCK & oflag);
 	fdrop(fp, td);
 	return (error);
 }
 
 int
 sys_kmq_setattr(struct thread *td, struct kmq_setattr_args *uap)
 {
 	struct mq_attr attr, oattr;
 	int error;
 
 	if (uap->attr != NULL) {
 		error = copyin(uap->attr, &attr, sizeof(attr));
 		if (error != 0)
 			return (error);
 	}
 	error = kern_kmq_setattr(td, uap->mqd, uap->attr != NULL ? &attr : NULL,
 	    &oattr);
 	if (error == 0 && uap->oattr != NULL) {
 		bzero(oattr.__reserved, sizeof(oattr.__reserved));
 		error = copyout(&oattr, uap->oattr, sizeof(oattr));
 	}
 	return (error);
 }
 
 /*
  * Syscall to receive a message, optionally bounded by an absolute
  * timeout.
  *
  * The descriptor is resolved with read rights, the timeout (if any) is
  * copied in, and the receive blocks unless the file has O_NONBLOCK set.
  * The file reference obtained from getmq_read() is released on every
  * path after the lookup, including a failed copy-in of the timeout.
  */
 int
 sys_kmq_timedreceive(struct thread *td, struct kmq_timedreceive_args *uap)
 {
 	struct mqueue *mq;
 	struct file *fp;
 	struct timespec *abs_timeout, ets;
 	int error;
 	int waitok;
 
 	AUDIT_ARG_FD(uap->mqd);
 	error = getmq_read(td, uap->mqd, &fp, NULL, &mq);
 	if (error)
 		return (error);
 	if (uap->abs_timeout != NULL) {
 		error = copyin(uap->abs_timeout, &ets, sizeof(ets));
 		/* fp is held from here on; never return without fdrop(). */
 		if (error != 0)
 			goto out;
 		abs_timeout = &ets;
 	} else
 		abs_timeout = NULL;
 	waitok = !(fp->f_flag & O_NONBLOCK);
 	error = mqueue_receive(mq, uap->msg_ptr, uap->msg_len,
 		uap->msg_prio, waitok, abs_timeout);
 out:
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Syscall to send a message, optionally bounded by an absolute timeout.
  *
  * The descriptor is resolved with write rights, the timeout (if any) is
  * copied in, and the send blocks unless the file has O_NONBLOCK set.
  * The file reference obtained from getmq_write() is released on every
  * path after the lookup, including a failed copy-in of the timeout.
  */
 int
 sys_kmq_timedsend(struct thread *td, struct kmq_timedsend_args *uap)
 {
 	struct mqueue *mq;
 	struct file *fp;
 	struct timespec *abs_timeout, ets;
 	int error, waitok;
 
 	AUDIT_ARG_FD(uap->mqd);
 	error = getmq_write(td, uap->mqd, &fp, NULL, &mq);
 	if (error)
 		return (error);
 	if (uap->abs_timeout != NULL) {
 		error = copyin(uap->abs_timeout, &ets, sizeof(ets));
 		/* fp is held from here on; never return without fdrop(). */
 		if (error != 0)
 			goto out;
 		abs_timeout = &ets;
 	} else
 		abs_timeout = NULL;
 	waitok = !(fp->f_flag & O_NONBLOCK);
 	error = mqueue_send(mq, uap->msg_ptr, uap->msg_len,
 		uap->msg_prio, waitok, abs_timeout);
 out:
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Register or remove the calling process's notification for a queue.
  *
  * A non-NULL "sigev" registers a notifier: only SIGEV_SIGNAL,
  * SIGEV_THREAD_ID and SIGEV_NONE are accepted, and the two signalling
  * kinds need a valid signal number (EINVAL otherwise).  EBUSY is
  * returned if the queue already has a notifier.  A NULL "sigev" removes
  * the notifier this process registered for "mqd".
  *
  * EBADF is returned if "mqd" no longer refers to the same file once the
  * descriptor table is locked.
  */
 static int
 kern_kmq_notify(struct thread *td, int mqd, struct sigevent *sigev)
 {
 	struct filedesc *fdp;
 	struct proc *p;
 	struct mqueue *mq;
 	struct file *fp, *fp2;
 	struct mqueue_notifier *nt, *newnt = NULL;
 	int error;
 
 	AUDIT_ARG_FD(mqd);
 	if (sigev != NULL) {
 		if (sigev->sigev_notify != SIGEV_SIGNAL &&
 		    sigev->sigev_notify != SIGEV_THREAD_ID &&
 		    sigev->sigev_notify != SIGEV_NONE)
 			return (EINVAL);
 		if ((sigev->sigev_notify == SIGEV_SIGNAL ||
 		    sigev->sigev_notify == SIGEV_THREAD_ID) &&
 		    !_SIG_VALID(sigev->sigev_signo))
 			return (EINVAL);
 	}
 	p = td->td_proc;
 	fdp = td->td_proc->p_fd;
 	error = getmq(td, mqd, &fp, NULL, &mq);
 	if (error)
 		return (error);
 again:
 	/*
 	 * Re-validate the descriptor with the table locked: it must still
 	 * name the file looked up above.  The queue mutex is taken before
 	 * the table lock is released.
 	 */
 	FILEDESC_SLOCK(fdp);
 	fp2 = fget_locked(fdp, mqd);
 	if (fp2 == NULL) {
 		FILEDESC_SUNLOCK(fdp);
 		error = EBADF;
 		goto out;
 	}
 #ifdef CAPABILITIES
 	error = cap_check(cap_rights(fdp, mqd), &cap_event_rights);
 	if (error) {
 		FILEDESC_SUNLOCK(fdp);
 		goto out;
 	}
 #endif
 	if (fp2 != fp) {
 		FILEDESC_SUNLOCK(fdp);
 		error = EBADF;
 		goto out;
 	}
 	mtx_lock(&mq->mq_mutex);
 	FILEDESC_SUNLOCK(fdp);
 	if (sigev != NULL) {
 		if (mq->mq_notifier != NULL) {
 			error = EBUSY;
 		} else {
 			PROC_LOCK(p);
 			nt = notifier_search(p, mqd);
 			if (nt == NULL) {
 				/*
 				 * A new notifier is needed.  The allocation
 				 * is M_WAITOK, so drop both locks, allocate,
 				 * and redo all the checks from the top.
 				 */
 				if (newnt == NULL) {
 					PROC_UNLOCK(p);
 					mtx_unlock(&mq->mq_mutex);
 					newnt = notifier_alloc();
 					goto again;
 				}
 			}
 
 			if (nt != NULL) {
 				/*
 				 * Reuse the existing notifier: withdraw any
 				 * signal still queued for it and discard
 				 * the spare allocation, if one was made.
 				 */
 				sigqueue_take(&nt->nt_ksi);
 				if (newnt != NULL) {
 					notifier_free(newnt);
 					newnt = NULL;
 				}
 			} else {
 				nt = newnt;
 				newnt = NULL;
 				ksiginfo_init(&nt->nt_ksi);
 				nt->nt_ksi.ksi_flags |= KSI_INS | KSI_EXT;
 				nt->nt_ksi.ksi_code = SI_MESGQ;
 				nt->nt_proc = p;
 				nt->nt_ksi.ksi_mqd = mqd;
 				notifier_insert(p, nt);
 			}
 			nt->nt_sigev = *sigev;
 			mq->mq_notifier = nt;
 			PROC_UNLOCK(p);
 			/*
 			 * if there is no receivers and message queue
 			 * is not empty, we should send notification
 			 * as soon as possible.
 			 */
 			if (mq->mq_receivers == 0 &&
 			    !TAILQ_EMPTY(&mq->mq_msgq))
 				mqueue_send_notification(mq);
 		}
 	} else {
 		notifier_remove(p, mq, mqd);
 	}
 	mtx_unlock(&mq->mq_mutex);
 
 out:
 	fdrop(fp, td);
 	/* Free a spare notifier that ended up unused. */
 	if (newnt != NULL)
 		notifier_free(newnt);
 	return (error);
 }
 
 /*
  * Syscall to register or remove a queue notification.  A NULL sigevent
  * pointer is passed through as NULL; otherwise the sigevent is copied
  * in first.
  */
 int
 sys_kmq_notify(struct thread *td, struct kmq_notify_args *uap)
 {
 	struct sigevent ev;
 	int error;
 
 	if (uap->sigev == NULL)
 		return (kern_kmq_notify(td, uap->mqd, NULL));
 
 	error = copyin(uap->sigev, &ev, sizeof(ev));
 	if (error != 0)
 		return (error);
 	return (kern_kmq_notify(td, uap->mqd, &ev));
 }
 
 /*
  * Descriptor-close hook.  For a message queue file, drop the notifier
  * the process registered for "fd" and wake any select waiters on either
  * side of the queue.  Files of other types are ignored.
  */
 static void
 mqueue_fdclose(struct thread *td, int fd, struct file *fp)
 {
 	struct mqueue *mq;
 
 #ifdef INVARIANTS
 	FILEDESC_LOCK_ASSERT(td->td_proc->p_fd);
 #endif
 
 	if (fp->f_ops != &mqueueops)
 		return;
 
 	mq = FPTOMQ(fp);
 	mtx_lock(&mq->mq_mutex);
 	notifier_remove(td->td_proc, mq, fd);
 
 	/* have to wakeup thread in same process */
 	if ((mq->mq_flags & MQ_RSEL) != 0) {
 		mq->mq_flags &= ~MQ_RSEL;
 		selwakeup(&mq->mq_rsel);
 	}
 	if ((mq->mq_flags & MQ_WSEL) != 0) {
 		mq->mq_flags &= ~MQ_WSEL;
 		selwakeup(&mq->mq_wsel);
 	}
 	mtx_unlock(&mq->mq_mutex);
 }
 
 /*
  * Process-exit hook: walk the descriptor table and remove the notifier
  * registered for every message queue descriptor, so that none is left
  * on the process's notifier list.
  */
 static void
 mq_proc_exit(void *arg __unused, struct proc *p)
 {
 	struct filedesc *fdp;
 	struct file *fp;
 	struct mqueue *mq;
 	int fd;
 
 	fdp = p->p_fd;
 	FILEDESC_SLOCK(fdp);
 	for (fd = 0; fd < fdp->fd_nfiles; fd++) {
 		fp = fget_locked(fdp, fd);
 		if (fp == NULL || fp->f_ops != &mqueueops)
 			continue;
 		mq = FPTOMQ(fp);
 		mtx_lock(&mq->mq_mutex);
 		notifier_remove(p, mq, fd);
 		mtx_unlock(&mq->mq_mutex);
 	}
 	FILEDESC_SUNLOCK(fdp);
 	KASSERT(LIST_EMPTY(&p->p_mqnotifier), ("mq notifiers left"));
 }
 
 /*
  * poll/select support.  The queue is readable when it holds at least
  * one message and writable when it is below its message limit.  For
  * each requested condition that is not met, the thread is recorded on
  * the matching selinfo and the queue is flagged so that the next
  * send or receive wakes it.
  */
 static int
 mqf_poll(struct file *fp, int events, struct ucred *active_cred,
 	struct thread *td)
 {
 	struct mqueue *mq = FPTOMQ(fp);
 	int ready;
 
 	ready = 0;
 	mtx_lock(&mq->mq_mutex);
 	if ((events & (POLLIN | POLLRDNORM)) != 0) {
 		if (mq->mq_curmsgs == 0) {
 			mq->mq_flags |= MQ_RSEL;
 			selrecord(td, &mq->mq_rsel);
 		} else {
 			ready |= events & (POLLIN | POLLRDNORM);
 		}
 	}
 	if ((events & POLLOUT) != 0) {
 		if (mq->mq_curmsgs >= mq->mq_maxmsg) {
 			mq->mq_flags |= MQ_WSEL;
 			selrecord(td, &mq->mq_wsel);
 		} else {
 			ready |= POLLOUT;
 		}
 	}
 	mtx_unlock(&mq->mq_mutex);
 	return (ready);
 }
 
 /*
  * Close a message queue file: detach the node from the file, switch the
  * file over to badfileops, and release a node reference under the mqfs
  * lock.
  */
 static int
 mqf_close(struct file *fp, struct thread *td)
 {
 	struct mqfs_node *pn;
 
 	/* After this the file no longer compares equal to &mqueueops. */
 	fp->f_ops = &badfileops;
 	pn = fp->f_data;
 	fp->f_data = NULL;
 	sx_xlock(&mqfs_data.mi_lock);
 	mqnode_release(pn);
 	sx_xunlock(&mqfs_data.mi_lock);
 	return (0);
 }
 
 /*
  * fstat() support: report the node's timestamps, owner, group and mode.
  * The file type is reported as a FIFO (S_IFIFO); every field not set
  * here is left zero.
  */
 static int
 mqf_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
 	struct thread *td)
 {
 	struct mqfs_node *pn = fp->f_data;
 
 	bzero(st, sizeof *st);
 	/* Read all node fields under the mqfs lock. */
 	sx_xlock(&mqfs_data.mi_lock);
 	st->st_atim = pn->mn_atime;
 	st->st_mtim = pn->mn_mtime;
 	st->st_ctim = pn->mn_ctime;
 	st->st_birthtim = pn->mn_birth;
 	st->st_uid = pn->mn_uid;
 	st->st_gid = pn->mn_gid;
 	st->st_mode = S_IFIFO | pn->mn_mode;
 	sx_xunlock(&mqfs_data.mi_lock);
 	return (0);
 }
 
 /*
  * fchmod() support: after a VADMIN access check against the node, store
  * the ACCESSPERMS bits of "mode".  Returns the vaccess() error if the
  * check fails.
  */
 static int
 mqf_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
     struct thread *td)
 {
 	struct mqfs_node *pn = fp->f_data;
 	int error;
 
 	sx_xlock(&mqfs_data.mi_lock);
 	error = vaccess(VREG, pn->mn_mode, pn->mn_uid, pn->mn_gid, VADMIN,
 	    active_cred, NULL);
 	if (error == 0)
 		pn->mn_mode = mode & ACCESSPERMS;
 	sx_xunlock(&mqfs_data.mi_lock);
 	return (error);
 }
 
 /*
  * fchown() support.  An id of -1 means "leave unchanged".  Changing the
  * owner to anything other than the caller's own uid, or the group to one
  * the caller does not belong to, requires PRIV_VFS_CHOWN; the error from
  * that privilege check is returned on failure.
  */
 static int
 mqf_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
     struct thread *td)
 {
 	struct mqfs_node *pn = fp->f_data;
 	int error;
 
 	error = 0;
 	sx_xlock(&mqfs_data.mi_lock);
 	if (uid == (uid_t)-1)
 		uid = pn->mn_uid;
 	if (gid == (gid_t)-1)
 		gid = pn->mn_gid;
 	if ((uid != pn->mn_uid && uid != active_cred->cr_uid) ||
 	    (gid != pn->mn_gid && !groupmember(gid, active_cred)))
 		error = priv_check_cred(active_cred, PRIV_VFS_CHOWN, 0);
 	if (error == 0) {
 		pn->mn_uid = uid;
 		pn->mn_gid = gid;
 	}
 	sx_xunlock(&mqfs_data.mi_lock);
 	return (error);
 }
 
 /*
  * kqueue attach: hook the knote onto the read-side or write-side knote
  * list of the queue according to its filter.  Other filters get EINVAL.
  */
 static int
 mqf_kqfilter(struct file *fp, struct knote *kn)
 {
 	struct mqueue *mq = FPTOMQ(fp);
 
 	switch (kn->kn_filter) {
 	case EVFILT_READ:
 		kn->kn_fop = &mq_rfiltops;
 		knlist_add(&mq->mq_rsel.si_note, kn, 0);
 		return (0);
 	case EVFILT_WRITE:
 		kn->kn_fop = &mq_wfiltops;
 		knlist_add(&mq->mq_wsel.si_note, kn, 0);
 		return (0);
 	default:
 		return (EINVAL);
 	}
 }
 
 /*
  * kqueue detach: remove the knote from whichever knote list its filter
  * put it on.  Any other filter is a bug and panics.
  */
 static void
 filt_mqdetach(struct knote *kn)
 {
 	struct mqueue *mq = FPTOMQ(kn->kn_fp);
 
 	switch (kn->kn_filter) {
 	case EVFILT_READ:
 		knlist_remove(&mq->mq_rsel.si_note, kn, 0);
 		break;
 	case EVFILT_WRITE:
 		knlist_remove(&mq->mq_wsel.si_note, kn, 0);
 		break;
 	default:
 		panic("filt_mqdetach");
 	}
 }
 
 /*
  * EVFILT_READ event test: true while the queue holds at least one
  * message.  Called with the queue mutex held.
  */
 static int
 filt_mqread(struct knote *kn, long hint)
 {
 	struct mqueue *mq = FPTOMQ(kn->kn_fp);
 
 	mtx_assert(&mq->mq_mutex, MA_OWNED);
 	return (mq->mq_curmsgs != 0);
 }
 
 /*
  * EVFILT_WRITE event test: true while the queue is below its message
  * limit.  Called with the queue mutex held.
  */
 static int
 filt_mqwrite(struct knote *kn, long hint)
 {
 	struct mqueue *mq = FPTOMQ(kn->kn_fp);
 
 	mtx_assert(&mq->mq_mutex, MA_OWNED);
 	return (mq->mq_curmsgs < mq->mq_maxmsg);
 }
 
 /*
  * Fill in kinfo_file for a message queue descriptor.  Only the type is
  * set (KF_TYPE_MQUEUE); no queue-specific details are exported.
  */
 static int
 mqf_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
 {
 
 	kif->kf_type = KF_TYPE_MQUEUE;
 	return (0);
 }
 
 /*
  * File operations for message queue descriptors.  Read, write,
  * truncate, ioctl and sendfile are routed to the invfo_* handlers;
  * poll, kqueue, stat, close, chmod, chown and kinfo are implemented by
  * the mqf_* functions.  The address of this table also serves as the
  * "is this a message queue?" test on a struct file.
  */
 static struct fileops mqueueops = {
 	.fo_read		= invfo_rdwr,
 	.fo_write		= invfo_rdwr,
 	.fo_truncate		= invfo_truncate,
 	.fo_ioctl		= invfo_ioctl,
 	.fo_poll		= mqf_poll,
 	.fo_kqfilter		= mqf_kqfilter,
 	.fo_stat		= mqf_stat,
 	.fo_close		= mqf_close,
 	.fo_chmod		= mqf_chmod,
 	.fo_chown		= mqf_chown,
 	.fo_sendfile		= invfo_sendfile,
 	.fo_fill_kinfo		= mqf_fill_kinfo,
 };
 
 /*
  * Vnode operations for mqueuefs.  Anything not listed falls back to
  * default_vnodeops.  Write, mkdir and rmdir are explicitly unsupported;
  * the mqfs_mkdir()/mqfs_rmdir() implementations are compiled only under
  * "notyet" and are not wired in here.
  */
 static struct vop_vector mqfs_vnodeops = {
 	.vop_default 		= &default_vnodeops,
 	.vop_access		= mqfs_access,
 	.vop_cachedlookup	= mqfs_lookup,
 	.vop_lookup		= vfs_cache_lookup,
 	.vop_reclaim		= mqfs_reclaim,
 	.vop_create		= mqfs_create,
 	.vop_remove		= mqfs_remove,
 	.vop_inactive		= mqfs_inactive,
 	.vop_open		= mqfs_open,
 	.vop_close		= mqfs_close,
 	.vop_getattr		= mqfs_getattr,
 	.vop_setattr		= mqfs_setattr,
 	.vop_read		= mqfs_read,
 	.vop_write		= VOP_EOPNOTSUPP,
 	.vop_readdir		= mqfs_readdir,
 	.vop_mkdir		= VOP_EOPNOTSUPP,
 	.vop_rmdir		= VOP_EOPNOTSUPP
 };
 
 /*
  * Filesystem-level operations for mqueuefs: module init/uninit, mount,
  * unmount, root vnode lookup and statfs.
  */
 static struct vfsops mqfs_vfsops = {
 	.vfs_init 		= mqfs_init,
 	.vfs_uninit		= mqfs_uninit,
 	.vfs_mount		= mqfs_mount,
 	.vfs_unmount		= mqfs_unmount,
 	.vfs_root		= mqfs_root,
 	.vfs_statfs		= mqfs_statfs,
 };
 
 /*
  * Filesystem type description: "mqueuefs", flagged VFCF_SYNTHETIC, with
  * the type number left at -1.
  */
 static struct vfsconf mqueuefs_vfsconf = {
 	.vfc_version = VFS_VERSION,
 	.vfc_name = "mqueuefs",
 	.vfc_vfsops = &mqfs_vfsops,
 	.vfc_typenum = -1,
 	.vfc_flags = VFCF_SYNTHETIC
 };
 
 /*
  * System calls provided by this module.  kmq_open and kmq_unlink are
  * registered without SYF_CAPENABLED; the descriptor-based calls
  * (setattr, timedsend, timedreceive, notify) are registered with it.
  */
 static struct syscall_helper_data mq_syscalls[] = {
 	SYSCALL_INIT_HELPER(kmq_open),
 	SYSCALL_INIT_HELPER_F(kmq_setattr, SYF_CAPENABLED),
 	SYSCALL_INIT_HELPER_F(kmq_timedsend, SYF_CAPENABLED),
 	SYSCALL_INIT_HELPER_F(kmq_timedreceive, SYF_CAPENABLED),
 	SYSCALL_INIT_HELPER_F(kmq_notify, SYF_CAPENABLED),
 	SYSCALL_INIT_HELPER(kmq_unlink),
 	SYSCALL_INIT_LAST
 };
 
 #ifdef COMPAT_FREEBSD32
 #include <compat/freebsd32/freebsd32.h>
 #include <compat/freebsd32/freebsd32_proto.h>
 #include <compat/freebsd32/freebsd32_signal.h>
 #include <compat/freebsd32/freebsd32_syscall.h>
 #include <compat/freebsd32/freebsd32_util.h>
 
 static void
 mq_attr_from32(const struct mq_attr32 *from, struct mq_attr *to)
 {
 
 	to->mq_flags = from->mq_flags;
 	to->mq_maxmsg = from->mq_maxmsg;
 	to->mq_msgsize = from->mq_msgsize;
 	to->mq_curmsgs = from->mq_curmsgs;
 }
 
 /*
  * Fill a 32-bit compat struct mq_attr32 from a native struct mq_attr by
  * assigning the four attribute fields one at a time.  Reserved members of
  * the destination are not touched here.
  */
 static void
 mq_attr_to32(const struct mq_attr *from, struct mq_attr32 *to)
 {
 
 	to->mq_curmsgs = from->mq_curmsgs;
 	to->mq_msgsize = from->mq_msgsize;
 	to->mq_maxmsg = from->mq_maxmsg;
 	to->mq_flags = from->mq_flags;
 }
 
 /*
  * 32-bit compat entry point for kmq_open.
  *
  * Rejects an access mode of O_ACCMODE and any use of O_EXEC with EINVAL.
  * When the caller supplies an attribute pointer and O_CREAT is set, the
  * 32-bit attributes are copied in and converted; the converted structure
  * (or NULL when no pointer was given) is passed on to kern_kmq_open().
  */
 int
 freebsd32_kmq_open(struct thread *td, struct freebsd32_kmq_open_args *uap)
 {
 	struct mq_attr attr, *attrp;
 	struct mq_attr32 attr32;
 	int error, flags;
 
 	if ((uap->flags & O_ACCMODE) == O_ACCMODE)
 		return (EINVAL);
 	if ((uap->flags & O_EXEC) != 0)
 		return (EINVAL);
 
 	flags = FFLAGS(uap->flags);
 	attrp = NULL;
 	if (uap->attr != NULL) {
 		attrp = &attr;
 		if ((flags & O_CREAT) != 0) {
 			error = copyin(uap->attr, &attr32, sizeof(attr32));
 			if (error != 0)
 				return (error);
 			mq_attr_from32(&attr32, &attr);
 		}
 	}
 	return (kern_kmq_open(td, uap->path, flags, uap->mode, attrp));
 }
 
 int
 freebsd32_kmq_setattr(struct thread *td, struct freebsd32_kmq_setattr_args *uap)
 {
 	struct mq_attr attr, oattr;
 	struct mq_attr32 attr32, oattr32;
 	int error;
 
 	if (uap->attr != NULL) {
 		error = copyin(uap->attr, &attr32, sizeof(attr32));
 		if (error != 0)
 			return (error);
 		mq_attr_from32(&attr32, &attr);
 	}
 	error = kern_kmq_setattr(td, uap->mqd, uap->attr != NULL ? &attr : NULL,
 	    &oattr);
 	if (error == 0 && uap->oattr != NULL) {
 		mq_attr_to32(&oattr, &oattr32);
 		bzero(oattr32.__reserved, sizeof(oattr32.__reserved));
 		error = copyout(&oattr32, uap->oattr, sizeof(oattr32));
 	}
 	return (error);
 }
 
 /*
  * 32-bit compat entry point for kmq_timedsend.
  *
  * Resolves uap->mqd with getmq_write(), which hands back a file reference
  * in fp.  A non-NULL uap->abs_timeout is copied in as a struct timespec32
  * and converted field by field; otherwise no timeout is passed.  The send
  * may block only when O_NONBLOCK is clear in fp->f_flag.
  *
  * Returns 0 or an errno value from getmq_write(), copyin() or
  * mqueue_send().  Once getmq_write() has succeeded, the file reference is
  * dropped on every exit path, including a failed copyin() of the timeout.
  */
 int
 freebsd32_kmq_timedsend(struct thread *td,
     struct freebsd32_kmq_timedsend_args *uap)
 {
 	struct mqueue *mq;
 	struct file *fp;
 	struct timespec32 ets32;
 	struct timespec *abs_timeout, ets;
 	int error;
 	int waitok;
 
 	AUDIT_ARG_FD(uap->mqd);
 	error = getmq_write(td, uap->mqd, &fp, NULL, &mq);
 	if (error)
 		return (error);
 	if (uap->abs_timeout != NULL) {
 		error = copyin(uap->abs_timeout, &ets32, sizeof(ets32));
 		if (error != 0)
 			goto out;	/* must not leak the fp reference */
 		CP(ets32, ets, tv_sec);
 		CP(ets32, ets, tv_nsec);
 		abs_timeout = &ets;
 	} else
 		abs_timeout = NULL;
 	waitok = !(fp->f_flag & O_NONBLOCK);
 	error = mqueue_send(mq, uap->msg_ptr, uap->msg_len,
 		uap->msg_prio, waitok, abs_timeout);
 out:
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * 32-bit compat entry point for kmq_timedreceive.
  *
  * Resolves uap->mqd with getmq_read(), which hands back a file reference
  * in fp.  A non-NULL uap->abs_timeout is copied in as a struct timespec32
  * and converted field by field; otherwise no timeout is passed.  The
  * receive may block only when O_NONBLOCK is clear in fp->f_flag.
  *
  * Returns 0 or an errno value from getmq_read(), copyin() or
  * mqueue_receive().  Once getmq_read() has succeeded, the file reference
  * is dropped on every exit path, including a failed copyin() of the
  * timeout.
  */
 int
 freebsd32_kmq_timedreceive(struct thread *td,
     struct freebsd32_kmq_timedreceive_args *uap)
 {
 	struct mqueue *mq;
 	struct file *fp;
 	struct timespec32 ets32;
 	struct timespec *abs_timeout, ets;
 	int error, waitok;
 
 	AUDIT_ARG_FD(uap->mqd);
 	error = getmq_read(td, uap->mqd, &fp, NULL, &mq);
 	if (error)
 		return (error);
 	if (uap->abs_timeout != NULL) {
 		error = copyin(uap->abs_timeout, &ets32, sizeof(ets32));
 		if (error != 0)
 			goto out;	/* must not leak the fp reference */
 		CP(ets32, ets, tv_sec);
 		CP(ets32, ets, tv_nsec);
 		abs_timeout = &ets;
 	} else
 		abs_timeout = NULL;
 	waitok = !(fp->f_flag & O_NONBLOCK);
 	error = mqueue_receive(mq, uap->msg_ptr, uap->msg_len,
 		uap->msg_prio, waitok, abs_timeout);
 out:
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * 32-bit compat entry point for kmq_notify.
  *
  * A NULL uap->sigev is forwarded as NULL.  Otherwise the 32-bit sigevent
  * is copied in, converted with convert_sigevent32(), and the native
  * structure is passed to kern_kmq_notify().  Copy-in and conversion
  * errors are returned unchanged.
  */
 int
 freebsd32_kmq_notify(struct thread *td, struct freebsd32_kmq_notify_args *uap)
 {
 	struct sigevent ev;
 	struct sigevent32 ev32;
 	int error;
 
 	if (uap->sigev == NULL)
 		return (kern_kmq_notify(td, uap->mqd, NULL));
 
 	error = copyin(uap->sigev, &ev32, sizeof(ev32));
 	if (error != 0)
 		return (error);
 	error = convert_sigevent32(&ev32, &ev);
 	if (error != 0)
 		return (error);
 	return (kern_kmq_notify(td, uap->mqd, &ev));
 }
 
 /*
  * 32-bit compat message-queue system calls registered by mqinit() and
  * removed by mqunload().  kmq_unlink has no freebsd32_ wrapper in this
  * table and is entered through SYSCALL32_INIT_HELPER_COMPAT.
  */
 static struct syscall_helper_data mq32_syscalls[] = {
 	SYSCALL32_INIT_HELPER(freebsd32_kmq_open),
 	SYSCALL32_INIT_HELPER_F(freebsd32_kmq_setattr, SYF_CAPENABLED),
 	SYSCALL32_INIT_HELPER_F(freebsd32_kmq_timedsend, SYF_CAPENABLED),
 	SYSCALL32_INIT_HELPER_F(freebsd32_kmq_timedreceive, SYF_CAPENABLED),
 	SYSCALL32_INIT_HELPER_F(freebsd32_kmq_notify, SYF_CAPENABLED),
 	SYSCALL32_INIT_HELPER_COMPAT(kmq_unlink),
 	SYSCALL_INIT_LAST
 };
 #endif
 
 /*
  * Register the native message-queue system calls and, when
  * COMPAT_FREEBSD32 is configured, the 32-bit ones as well.  Returns the
  * first registration error, or 0 when everything registered.
  */
 static int
 mqinit(void)
 {
 	int error;
 
 	error = syscall_helper_register(mq_syscalls, SY_THR_STATIC_KLD);
 #ifdef COMPAT_FREEBSD32
 	if (error == 0)
 		error = syscall32_helper_register(mq32_syscalls,
 		    SY_THR_STATIC_KLD);
 #endif
 	return (error);
 }
 
 /*
  * Unregister the message-queue system calls: the 32-bit table first (when
  * COMPAT_FREEBSD32 is configured), then the native table.  Always
  * returns 0.
  */
 static int
 mqunload(void)
 {
 
 #ifdef COMPAT_FREEBSD32
 	syscall32_helper_unregister(mq32_syscalls);
 #endif
 	syscall_helper_unregister(mq_syscalls);
 	return (0);
 }
 
 /*
  * Module event handler for mqueuefs.
  *
  * The event is first passed to vfs_modevent(); a failure there is returned
  * immediately.  On MOD_LOAD the system calls are registered through
  * mqinit(), and mqunload() is run if that fails.  On MOD_UNLOAD the result
  * of mqunload() is returned.  Any other command returns 0.
  */
 static int
 mq_modload(struct module *module, int cmd, void *arg)
 {
 	int error;
 
 	error = vfs_modevent(module, cmd, arg);
 	if (error != 0)
 		return (error);
 
 	if (cmd == MOD_LOAD) {
 		error = mqinit();
 		if (error != 0)
 			(void)mqunload();
 		return (error);
 	}
 	if (cmd == MOD_UNLOAD)
 		return (mqunload());
 	return (0);
 }
 
 /*
  * Module descriptor: name, event handler, and the vfsconf record passed as
  * the handler's private argument.
  */
 static moduledata_t mqueuefs_mod = {
 	"mqueuefs",
 	mq_modload,
 	&mqueuefs_vfsconf
 };
 DECLARE_MODULE(mqueuefs, mqueuefs_mod, SI_SUB_VFS, SI_ORDER_MIDDLE);
 MODULE_VERSION(mqueuefs, 1);
Index: head/sys/kern/uipc_sem.c
===================================================================
--- head/sys/kern/uipc_sem.c	(revision 336913)
+++ head/sys/kern/uipc_sem.c	(revision 336914)
@@ -1,1110 +1,1110 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2002 Alfred Perlstein <alfred@FreeBSD.org>
  * Copyright (c) 2003-2005 SPARTA, Inc.
  * Copyright (c) 2005, 2016-2017 Robert N. M. Watson
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project in part by Network
  * Associates Laboratories, the Security Research Division of Network
  * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"),
  * as part of the DARPA CHATS research program.
  *
  * Portions of this software were developed by BAE Systems, the University of
  * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL
  * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent
  * Computing (TC) research program.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_posix.h"
 
 #include <sys/param.h>
 #include <sys/capsicum.h>
 #include <sys/condvar.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/fnv_hash.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/ksem.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/mutex.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/posix4.h>
 #include <sys/_semaphore.h>
 #include <sys/stat.h>
 #include <sys/syscall.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/sysproto.h>
 #include <sys/systm.h>
 #include <sys/sx.h>
 #include <sys/user.h>
 #include <sys/vnode.h>
 
 #include <security/audit/audit.h>
 #include <security/mac/mac_framework.h>
 
 FEATURE(p1003_1b_semaphores, "POSIX P1003.1B semaphores support");
 /*
  * TODO
  *
  * - Resource limits?
  * - Replace global sem_lock with mtx_pool locks?
  * - Add a MAC check_create() hook for creating new named semaphores.
  */
 
 #ifndef SEM_MAX
 #define	SEM_MAX	30
 #endif
 
 #ifdef SEM_DEBUG
 #define	DP(x)	printf x
 #else
 #define	DP(x)
 #endif
 
 /* One dictionary entry binding a semaphore path to its ksem object. */
 struct ksem_mapping {
 	char		*km_path;	/* path string, freed in ksem_remove() */
 	Fnv32_t		km_fnv;		/* hash of km_path, checked before strcmp() */
 	struct ksem	*km_ksem;	/* semaphore, held via ksem_hold() on insert */
 	LIST_ENTRY(ksem_mapping) km_link;	/* hash bucket linkage */
 };
 
 static MALLOC_DEFINE(M_KSEM, "ksem", "semaphore file descriptor");
 /* Hash table of path mappings, created with hashinit() at module init. */
 static LIST_HEAD(, ksem_mapping) *ksem_dictionary;
 /* Held around ksem_lookup()/ksem_insert()/ksem_remove() calls in this file. */
 static struct sx ksem_dict_lock;
 /* Held while nsems and ksem_dead are read or updated. */
 static struct mtx ksem_count_lock;
 /* Single mutex held while per-semaphore fields are read or modified. */
 static struct mtx sem_lock;
 /* Mask filled in by hashinit() and used by KSEM_HASH(). */
 static u_long ksem_hash;
 /* Set on MOD_UNLOAD; ksem_alloc() refuses to allocate once it is non-zero. */
 static int ksem_dead;
 
 /* Select the dictionary bucket for an FNV hash value. */
 #define	KSEM_HASH(fnv)	(&ksem_dictionary[(fnv) & ksem_hash])
 
 static int nsems = 0;
 SYSCTL_DECL(_p1003_1b);
 SYSCTL_INT(_p1003_1b, OID_AUTO, nsems, CTLFLAG_RD, &nsems, 0,
     "Number of active kernel POSIX semaphores");
 
 static int	kern_sem_wait(struct thread *td, semid_t id, int tryflag,
 		    struct timespec *abstime);
 static int	ksem_access(struct ksem *ks, struct ucred *ucred);
 static struct ksem *ksem_alloc(struct ucred *ucred, mode_t mode,
 		    unsigned int value);
 static int	ksem_create(struct thread *td, const char *path,
 		    semid_t *semidp, mode_t mode, unsigned int value,
 		    int flags, int compat32);
 static void	ksem_drop(struct ksem *ks);
 static int	ksem_get(struct thread *td, semid_t id, cap_rights_t *rightsp,
     struct file **fpp);
 static struct ksem *ksem_hold(struct ksem *ks);
 static void	ksem_insert(char *path, Fnv32_t fnv, struct ksem *ks);
 static struct ksem *ksem_lookup(char *path, Fnv32_t fnv);
 static void	ksem_module_destroy(void);
 static int	ksem_module_init(void);
 static int	ksem_remove(char *path, Fnv32_t fnv, struct ucred *ucred);
 static int	sem_modload(struct module *module, int cmd, void *arg);
 
 static fo_stat_t	ksem_stat;
 static fo_close_t	ksem_closef;
 static fo_chmod_t	ksem_chmod;
 static fo_chown_t	ksem_chown;
 static fo_fill_kinfo_t	ksem_fill_kinfo;
 
 /* File descriptor operations. */
 /*
  * File operations installed on semaphore descriptors by ksem_create().
  * Only stat, close, chmod, chown and fill_kinfo have semaphore-specific
  * handlers; the remaining slots point at the invfo_* handlers.
  */
 static struct fileops ksem_ops = {
 	.fo_read = invfo_rdwr,
 	.fo_write = invfo_rdwr,
 	.fo_truncate = invfo_truncate,
 	.fo_ioctl = invfo_ioctl,
 	.fo_poll = invfo_poll,
 	.fo_kqfilter = invfo_kqfilter,
 	.fo_stat = ksem_stat,
 	.fo_close = ksem_closef,
 	.fo_chmod = ksem_chmod,
 	.fo_chown = ksem_chown,
 	.fo_sendfile = invfo_sendfile,
 	.fo_fill_kinfo = ksem_fill_kinfo,
 	.fo_flags = DFLAG_PASSABLE
 };
 
 FEATURE(posix_sem, "POSIX semaphores");
 
 /*
  * fo_stat handler for semaphore descriptors.
  *
  * After the optional MAC check, the stat buffer is zeroed and the
  * semaphore's timestamps, owner, group and mode (reported as a regular
  * file) are copied into it under sem_lock.  Returns 0, or the MAC error.
  */
 static int
 ksem_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
     struct thread *td)
 {
 	struct ksem *ks = fp->f_data;
 #ifdef MAC
 	int error;
 
 	error = mac_posixsem_check_stat(active_cred, fp->f_cred, ks);
 	if (error != 0)
 		return (error);
 #endif
 
 	/* Start from an all-zero buffer so unset fields read as zero. */
 	bzero(sb, sizeof(*sb));
 
 	mtx_lock(&sem_lock);
 	sb->st_mode = S_IFREG | ks->ks_mode;		/* XXX */
 	sb->st_uid = ks->ks_uid;
 	sb->st_gid = ks->ks_gid;
 	sb->st_birthtim = ks->ks_birthtime;
 	sb->st_atim = ks->ks_atime;
 	sb->st_mtim = ks->ks_mtime;
 	sb->st_ctim = ks->ks_ctime;
 	mtx_unlock(&sem_lock);
 
 	return (0);
 }
 
 /*
  * fo_chmod handler for semaphore descriptors.
  *
  * Under sem_lock, runs the optional MAC check and a VADMIN vaccess()
  * check; when both pass, stores the permission bits of the requested mode.
  * Returns 0 or the first failing check's error.
  */
 static int
 ksem_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
     struct thread *td)
 {
 	struct ksem *ks;
 	int error;
 
 	ks = fp->f_data;
 	mtx_lock(&sem_lock);
 #ifdef MAC
 	error = mac_posixsem_check_setmode(active_cred, ks, mode);
 	if (error == 0)
 #endif
 		error = vaccess(VREG, ks->ks_mode, ks->ks_uid, ks->ks_gid,
 		    VADMIN, active_cred, NULL);
 	if (error == 0)
 		ks->ks_mode = mode & ACCESSPERMS;
 	mtx_unlock(&sem_lock);
 	return (error);
 }
 
 /*
  * fo_chown handler for semaphore descriptors.
  *
  * A uid or gid of -1 means "keep the current value".  Changing the owner
  * to someone other than the caller, or the group to one the caller is not
  * a member of, requires PRIV_VFS_CHOWN.  All checks and the update run
  * under sem_lock.  Returns 0 or the failing check's error.
  */
 static int
 ksem_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
     struct thread *td)
 {
 	struct ksem *ks;
 	int error, needpriv;
 
 	error = 0;
 	ks = fp->f_data;
 	mtx_lock(&sem_lock);
 #ifdef MAC
 	error = mac_posixsem_check_setowner(active_cred, ks, uid, gid);
 	if (error != 0) {
 		mtx_unlock(&sem_lock);
 		return (error);
 	}
 #endif
 	if (uid == (uid_t)-1)
 		uid = ks->ks_uid;
 	if (gid == (gid_t)-1)
 		gid = ks->ks_gid;
 
 	needpriv = (uid != ks->ks_uid && uid != active_cred->cr_uid) ||
 	    (gid != ks->ks_gid && !groupmember(gid, active_cred));
 	if (needpriv)
 		error = priv_check_cred(active_cred, PRIV_VFS_CHOWN, 0);
 	if (error == 0) {
 		ks->ks_uid = uid;
 		ks->ks_gid = gid;
 	}
 	mtx_unlock(&sem_lock);
 	return (error);
 }
 
 /*
  * fo_close handler: detach the semaphore from the file and drop the
  * reference the file was holding.  Always returns 0.
  */
 static int
 ksem_closef(struct file *fp, struct thread *td)
 {
 	struct ksem *ks = fp->f_data;
 
 	fp->f_data = NULL;
 	ksem_drop(ks);
 	return (0);
 }
 
 /*
  * fo_fill_kinfo handler: describe a semaphore descriptor in a kinfo_file.
  *
  * The value and mode are sampled under sem_lock.  If the semaphore has a
  * path, it is copied into kif->kf_path under the shared dictionary lock;
  * when the calling thread's prison path is not "/", a matching prison
  * prefix is stripped first.  Always returns 0.
  */
 static int
 ksem_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
 {
 	const char *path, *pr_path;
 	struct ksem *ks;
 	size_t pr_pathlen;
 
 	kif->kf_type = KF_TYPE_SEM;
 	ks = fp->f_data;
 	mtx_lock(&sem_lock);
 	kif->kf_un.kf_sem.kf_sem_value = ks->ks_value;
 	kif->kf_un.kf_sem.kf_sem_mode = S_IFREG | ks->ks_mode;	/* XXX */
 	mtx_unlock(&sem_lock);
 	/* Unlocked peek first; the path is re-checked once the lock is held. */
 	if (ks->ks_path != NULL) {
 		sx_slock(&ksem_dict_lock);
 		if (ks->ks_path != NULL) {
 			path = ks->ks_path;
 			pr_path = curthread->td_ucred->cr_prison->pr_path;
 			if (strcmp(pr_path, "/") != 0) {
 				/* Return the jail-rooted pathname. */
 				pr_pathlen = strlen(pr_path);
 				if (strncmp(path, pr_path, pr_pathlen) == 0 &&
 				    path[pr_pathlen] == '/')
 					path += pr_pathlen;
 			}
 			strlcpy(kif->kf_path, path, sizeof(kif->kf_path));
 		}
 		sx_sunlock(&ksem_dict_lock);
 	}
 	return (0);
 }
 
 /*
  * ksem object management including creation and reference counting
  * routines.
  */
 /*
  * Allocate and initialise a new semaphore owned by ucred.
  *
  * Returns NULL without allocating when the semaphore count has reached the
  * configured CTL_P1003_1B_SEM_NSEMS_MAX value or the module is marked
  * dead.  Otherwise nsems is bumped under ksem_count_lock and a zeroed ksem
  * is returned with one reference, the given mode and value, and all four
  * timestamps set to the creation time.
  */
 static struct ksem *
 ksem_alloc(struct ucred *ucred, mode_t mode, unsigned int value)
 {
 	struct ksem *ks;
 
 	mtx_lock(&ksem_count_lock);
 	if (nsems == p31b_getcfg(CTL_P1003_1B_SEM_NSEMS_MAX) || ksem_dead) {
 		mtx_unlock(&ksem_count_lock);
 		return (NULL);
 	}
 	nsems++;
 	mtx_unlock(&ksem_count_lock);
 	/* The slot is reserved above; the allocation itself runs unlocked. */
 	ks = malloc(sizeof(*ks), M_KSEM, M_WAITOK | M_ZERO);
 	ks->ks_uid = ucred->cr_uid;
 	ks->ks_gid = ucred->cr_gid;
 	ks->ks_mode = mode;
 	ks->ks_value = value;
 	cv_init(&ks->ks_cv, "ksem");
 	vfs_timestamp(&ks->ks_birthtime);
 	ks->ks_atime = ks->ks_mtime = ks->ks_ctime = ks->ks_birthtime;
 	refcount_init(&ks->ks_ref, 1);
 #ifdef MAC
 	mac_posixsem_init(ks);
 	mac_posixsem_create(ucred, ks);
 #endif
 
 	return (ks);
 }
 
 /* Take an additional reference on ks and return it for call chaining. */
 static struct ksem *
 ksem_hold(struct ksem *ks)
 {
 
 	refcount_acquire(&ks->ks_ref);
 	return (ks);
 }
 
 /*
  * Release one reference on ks.  When refcount_release() reports the last
  * reference is gone, tear the semaphore down, free it, and decrement nsems
  * under ksem_count_lock.
  */
 static void
 ksem_drop(struct ksem *ks)
 {
 
 	if (!refcount_release(&ks->ks_ref))
 		return;
 
 #ifdef MAC
 	mac_posixsem_destroy(ks);
 #endif
 	cv_destroy(&ks->ks_cv);
 	free(ks, M_KSEM);
 
 	mtx_lock(&ksem_count_lock);
 	nsems--;
 	mtx_unlock(&ksem_count_lock);
 }
 
 /*
  * Determine if the credentials have sufficient permissions for read
  * and write access.
  */
 /*
  * Check ucred for combined read and write access to ks.  If the
  * mode-based vaccess() check fails, the result of the PRIV_SEM_WRITE
  * privilege check is returned instead.
  */
 static int
 ksem_access(struct ksem *ks, struct ucred *ucred)
 {
 
 	if (vaccess(VREG, ks->ks_mode, ks->ks_uid, ks->ks_gid,
 	    VREAD | VWRITE, ucred, NULL) == 0)
 		return (0);
 	return (priv_check_cred(ucred, PRIV_SEM_WRITE, 0));
 }
 
 /*
  * Dictionary management.  We maintain an in-kernel dictionary to map
  * paths to semaphore objects.  We use the FNV hash on the path to
  * store the mappings in a hash table.
  */
 /*
  * Find the semaphore registered under path.  Only the bucket selected by
  * fnv is walked, and the stored hash is compared before the string.
  * Returns the ksem without taking a new reference, or NULL if no entry
  * matches.
  */
 static struct ksem *
 ksem_lookup(char *path, Fnv32_t fnv)
 {
 	struct ksem_mapping *entry;
 
 	LIST_FOREACH(entry, KSEM_HASH(fnv), km_link) {
 		if (entry->km_fnv == fnv &&
 		    strcmp(entry->km_path, path) == 0)
 			return (entry->km_ksem);
 	}
 	return (NULL);
 }
 
 /*
  * Add a path -> semaphore mapping to the dictionary.  The mapping keeps
  * the path pointer it is given (no copy is made), takes its own reference
  * on ks, and records the path on the semaphore as ks_path.
  */
 static void
 ksem_insert(char *path, Fnv32_t fnv, struct ksem *ks)
 {
 	struct ksem_mapping *entry;
 
 	entry = malloc(sizeof(*entry), M_KSEM, M_WAITOK);
 	entry->km_fnv = fnv;
 	entry->km_path = path;
 	entry->km_ksem = ksem_hold(ks);
 	ks->ks_path = path;
 	LIST_INSERT_HEAD(KSEM_HASH(fnv), entry, km_link);
 }
 
 /*
  * Remove the dictionary entry for path on behalf of ucred.
  *
  * Returns ENOENT if no entry matches.  For a match, the optional MAC
  * unlink check and ksem_access() must both pass; the entry is then
  * unlinked, the semaphore's ks_path is cleared, the mapping's reference is
  * dropped, and the path string and mapping are freed.
  */
 static int
 ksem_remove(char *path, Fnv32_t fnv, struct ucred *ucred)
 {
 	struct ksem_mapping *entry;
 	int error;
 
 	LIST_FOREACH(entry, KSEM_HASH(fnv), km_link) {
 		if (entry->km_fnv != fnv ||
 		    strcmp(entry->km_path, path) != 0)
 			continue;
 #ifdef MAC
 		error = mac_posixsem_check_unlink(ucred, entry->km_ksem);
 		if (error)
 			return (error);
 #endif
 		error = ksem_access(entry->km_ksem, ucred);
 		if (error)
 			return (error);
 		entry->km_ksem->ks_path = NULL;
 		LIST_REMOVE(entry, km_link);
 		ksem_drop(entry->km_ksem);
 		free(entry->km_path, M_KSEM);
 		free(entry, M_KSEM);
 		return (0);
 	}
 
 	return (ENOENT);
 }
 
 /*
  * Copy the new descriptor number out to the user's semid location.
  *
  * With COMPAT_FREEBSD32 configured and compat32 non-zero, the value is
  * written as an int32_t; otherwise it is written as a native semid_t.
  * Returns the copyout() result.
  */
 static int
 ksem_create_copyout_semid(struct thread *td, semid_t *semidp, int fd,
     int compat32)
 {
 	semid_t semid;
 #ifdef COMPAT_FREEBSD32
 	int32_t semid32;
 #endif
 	void *ptr;
 	size_t ptrs;
 
 	/* The braces below only form an if/else when COMPAT_FREEBSD32 is set. */
 #ifdef COMPAT_FREEBSD32
 	if (compat32) {
 		semid32 = fd;
 		ptr = &semid32;
 		ptrs = sizeof(semid32);
 	} else {
 #endif
 		semid = fd;
 		ptr = &semid;
 		ptrs = sizeof(semid);
 		compat32 = 0; /* silence gcc */
 #ifdef COMPAT_FREEBSD32
 	}
 #endif
 
 	return (copyout(ptr, semidp, ptrs));
 }
 
 /* Other helper routines. */
 /*
  * Common implementation behind the ksem_init and ksem_open entry points.
  *
  * name == NULL creates an anonymous semaphore (KS_ANONYMOUS).  Otherwise
  * name is a user-space path that is copied in, prefixed with the caller's
  * prison path when that is not "/", and looked up in the dictionary under
  * the exclusive dictionary lock; O_CREAT and O_EXCL in flags select
  * create/open behaviour.
  *
  * Returns 0 on success with the descriptor number already copied out to
  * semidp.  Errors visible here: EINVAL (value > SEM_VALUE_MAX, or a path
  * not starting with '/'), ENOSPC (anonymous: falloc() or ksem_alloc()
  * failure), ENFILE (named: ksem_alloc() failure), ENOENT, EEXIST, plus
  * whatever falloc(), copyout(), copyinstr(), the MAC hook or
  * ksem_access() return.  On any error after falloc() the descriptor is
  * closed again.
  */
 static int
 ksem_create(struct thread *td, const char *name, semid_t *semidp, mode_t mode,
     unsigned int value, int flags, int compat32)
 {
 	struct filedesc *fdp;
 	struct ksem *ks;
 	struct file *fp;
 	char *path;
 	const char *pr_path;
 	size_t pr_pathlen;
 	Fnv32_t fnv;
 	int error, fd;
 
 	AUDIT_ARG_FFLAGS(flags);
 	AUDIT_ARG_MODE(mode);
 	AUDIT_ARG_VALUE(value);
 
 	if (value > SEM_VALUE_MAX)
 		return (EINVAL);
 
 	fdp = td->td_proc->p_fd;
 	/* Apply the process file-creation mask and keep permission bits only. */
 	mode = (mode & ~fdp->fd_cmask) & ACCESSPERMS;
 	error = falloc(td, &fp, &fd, O_CLOEXEC);
 	if (error) {
 		/* Anonymous creation reports every falloc() failure as ENOSPC. */
 		if (name == NULL)
 			error = ENOSPC;
 		return (error);
 	}
 
 	/*
 	 * Go ahead and copyout the file descriptor now.  This is a bit
 	 * premature, but it is a lot easier to handle errors as opposed
 	 * to later when we've possibly created a new semaphore, etc.
 	 */
 	error = ksem_create_copyout_semid(td, semidp, fd, compat32);
 	if (error) {
 		fdclose(td, fp, fd);
 		fdrop(fp, td);
 		return (error);
 	}
 
 	if (name == NULL) {
 		/* Create an anonymous semaphore. */
 		ks = ksem_alloc(td->td_ucred, mode, value);
 		if (ks == NULL)
 			error = ENOSPC;
 		else
 			ks->ks_flags |= KS_ANONYMOUS;
 	} else {
 		path = malloc(MAXPATHLEN, M_KSEM, M_WAITOK);
 		pr_path = td->td_ucred->cr_prison->pr_path;
 
 		/* Construct a full pathname for jailed callers. */
 		pr_pathlen = strcmp(pr_path, "/") == 0 ? 0
 		    : strlcpy(path, pr_path, MAXPATHLEN);
 		error = copyinstr(name, path + pr_pathlen,
 		    MAXPATHLEN - pr_pathlen, NULL);
 
 		/* Require paths to start with a '/' character. */
 		if (error == 0 && path[pr_pathlen] != '/')
 			error = EINVAL;
 		if (error) {
 			fdclose(td, fp, fd);
 			fdrop(fp, td);
 			free(path, M_KSEM);
 			return (error);
 		}
 
 		AUDIT_ARG_UPATH1_CANON(path);
 		fnv = fnv_32_str(path, FNV1_32_INIT);
 		sx_xlock(&ksem_dict_lock);
 		ks = ksem_lookup(path, fnv);
 		if (ks == NULL) {
 			/* Object does not exist, create it if requested. */
 			if (flags & O_CREAT) {
 				ks = ksem_alloc(td->td_ucred, mode, value);
 				if (ks == NULL)
 					error = ENFILE;
 				else {
 					/* The dictionary now owns the path buffer. */
 					ksem_insert(path, fnv, ks);
 					path = NULL;
 				}
 			} else
 				error = ENOENT;
 		} else {
 			/*
 			 * Object already exists, obtain a new
 			 * reference if requested and permitted.
 			 */
 			if ((flags & (O_CREAT | O_EXCL)) ==
 			    (O_CREAT | O_EXCL))
 				error = EEXIST;
 			else {
 #ifdef MAC
 				error = mac_posixsem_check_open(td->td_ucred,
 				    ks);
 				if (error == 0)
 #endif
 				error = ksem_access(ks, td->td_ucred);
 			}
 			if (error == 0)
 				ksem_hold(ks);
 #ifdef INVARIANTS
 			else
 				ks = NULL;
 #endif
 		}
 		sx_xunlock(&ksem_dict_lock);
 		/* Still non-NULL unless ownership passed to ksem_insert(). */
 		if (path)
 			free(path, M_KSEM);
 	}
 
 	if (error) {
 		KASSERT(ks == NULL, ("ksem_create error with a ksem"));
 		fdclose(td, fp, fd);
 		fdrop(fp, td);
 		return (error);
 	}
 	KASSERT(ks != NULL, ("ksem_create w/o a ksem"));
 
 	finit(fp, FREAD | FWRITE, DTYPE_SEM, ks, &ksem_ops);
 
 	fdrop(fp, td);
 
 	return (0);
 }
 
 /*
  * Translate a semaphore id into a referenced struct file.
  *
  * Every failure is reported as EINVAL: fget() failing, the descriptor not
  * being a DTYPE_SEM file, or the semaphore being flagged KS_DEAD.  On
  * success *fpp receives the file, which the caller must fdrop().
  */
 static int
 ksem_get(struct thread *td, semid_t id, cap_rights_t *rightsp,
     struct file **fpp)
 {
 	struct ksem *ks;
 	struct file *fp;
 
 	if (fget(td, id, rightsp, &fp) != 0)
 		return (EINVAL);
 
 	if (fp->f_type == DTYPE_SEM) {
 		ks = fp->f_data;
 		if ((ks->ks_flags & KS_DEAD) == 0) {
 			*fpp = fp;
 			return (0);
 		}
 	}
 
 	/* Wrong descriptor type or destroyed semaphore. */
 	fdrop(fp, td);
 	return (EINVAL);
 }
 
 /* System calls. */
 #ifndef _SYS_SYSPROTO_H_
 struct ksem_init_args {
 	unsigned int	value;
 	semid_t		*idp;
 };
 #endif
 /*
  * ksem_init system call: create an anonymous semaphore (NULL name) with
  * the requested initial value and a requested mode of S_IRWXU | S_IRWXG,
  * no open flags, and native (non-compat32) semid copyout.
  */
 int
 sys_ksem_init(struct thread *td, struct ksem_init_args *uap)
 {
 
 	return (ksem_create(td, NULL, uap->idp, S_IRWXU | S_IRWXG, uap->value,
 	    0, 0));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ksem_open_args {
 	char		*name;
 	int		oflag;
 	mode_t		mode;
 	unsigned int	value;
 	semid_t		*idp;	
 };
 #endif
 /*
  * ksem_open system call: open or create a named semaphore.  Any oflag bit
  * other than O_CREAT and O_EXCL is rejected with EINVAL; the rest is
  * handled by ksem_create() with native semid copyout.
  */
 int
 sys_ksem_open(struct thread *td, struct ksem_open_args *uap)
 {
 
 	DP((">>> ksem_open start, pid=%d\n", (int)td->td_proc->p_pid));
 
 	if ((uap->oflag & ~(O_CREAT | O_EXCL)) != 0)
 		return (EINVAL);
 	return (ksem_create(td, uap->name, uap->idp, uap->mode, uap->value,
 	    uap->oflag, 0));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ksem_unlink_args {
 	char		*name;
 };
 #endif
 int
 sys_ksem_unlink(struct thread *td, struct ksem_unlink_args *uap)
 {
 	char *path;
 	const char *pr_path;
 	size_t pr_pathlen;
 	Fnv32_t fnv;
 	int error;
 
 	path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
 	pr_path = td->td_ucred->cr_prison->pr_path;
 	pr_pathlen = strcmp(pr_path, "/") == 0 ? 0
 	    : strlcpy(path, pr_path, MAXPATHLEN);
 	error = copyinstr(uap->name, path + pr_pathlen, MAXPATHLEN - pr_pathlen,
 	    NULL);
 	if (error) {
 		free(path, M_TEMP);
 		return (error);
 	}
 
 	AUDIT_ARG_UPATH1_CANON(path);
 	fnv = fnv_32_str(path, FNV1_32_INIT);
 	sx_xlock(&ksem_dict_lock);
 	error = ksem_remove(path, fnv, td->td_ucred);
 	sx_xunlock(&ksem_dict_lock);
 	free(path, M_TEMP);
 
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ksem_close_args {
 	semid_t		id;
 };
 #endif
 int
 sys_ksem_close(struct thread *td, struct ksem_close_args *uap)
 {
 	struct ksem *ks;
 	struct file *fp;
 	int error;
 
 	/* No capability rights required to close a semaphore. */
 	AUDIT_ARG_FD(uap->id);
 	error = ksem_get(td, uap->id, &cap_no_rights, &fp);
 	if (error)
 		return (error);
 	ks = fp->f_data;
 	if (ks->ks_flags & KS_ANONYMOUS) {
 		fdrop(fp, td);
 		return (EINVAL);
 	}
 	error = kern_close(td, uap->id);
 	fdrop(fp, td);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ksem_post_args {
 	semid_t	id;
 };
 #endif
 /*
  * ksem_post system call: increment a semaphore.
  *
  * Requires CAP_SEM_POST on the descriptor.  Under sem_lock, after the
  * optional MAC check, the value is incremented unless it already equals
  * SEM_VALUE_MAX (EOVERFLOW).  One waiter is signalled if any are recorded,
  * and the change time is updated.
  */
 int
 sys_ksem_post(struct thread *td, struct ksem_post_args *uap)
 {
 	cap_rights_t rights;
 	struct file *fp;
 	struct ksem *ks;
 	int error;
 
 	AUDIT_ARG_FD(uap->id);
 	error = ksem_get(td, uap->id,
 	    cap_rights_init(&rights, CAP_SEM_POST), &fp);
 	if (error)
 		return (error);
 	ks = fp->f_data;
 
 	mtx_lock(&sem_lock);
 #ifdef MAC
 	error = mac_posixsem_check_post(td->td_ucred, fp->f_cred, ks);
 	if (error)
 		goto err;
 #endif
 	if (ks->ks_value == SEM_VALUE_MAX) {
 		error = EOVERFLOW;
 		goto err;
 	}
 	++ks->ks_value;
 	if (ks->ks_waiters > 0)
 		cv_signal(&ks->ks_cv);
 	error = 0;
 	vfs_timestamp(&ks->ks_ctime);
 err:
 	/* Shared exit for success and failure: unlock and drop the file. */
 	mtx_unlock(&sem_lock);
 	fdrop(fp, td);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ksem_wait_args {
 	semid_t		id;
 };
 #endif
 /* ksem_wait system call: wait with tryflag 0 and no timeout. */
 int
 sys_ksem_wait(struct thread *td, struct ksem_wait_args *uap)
 {
 
 	return (kern_sem_wait(td, uap->id, 0, NULL));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ksem_timedwait_args {
 	semid_t		id;
 	const struct timespec *abstime;
 };
 #endif
 /*
  * ksem_timedwait system call: wait with an optional absolute timeout.
  *
  * A NULL uap->abstime means wait forever.  Otherwise the timespec is
  * copied in and rejected with EINVAL when tv_nsec is negative or at least
  * 1000000000.
  */
 int
 sys_ksem_timedwait(struct thread *td, struct ksem_timedwait_args *uap)
 {
 	struct timespec abstime;
 	int error;
 
 	/* We allow a null timespec (wait forever). */
 	if (uap->abstime == NULL)
 		return (kern_sem_wait(td, uap->id, 0, NULL));
 
 	error = copyin(uap->abstime, &abstime, sizeof(abstime));
 	if (error != 0)
 		return (error);
 	if (abstime.tv_nsec < 0 || abstime.tv_nsec >= 1000000000)
 		return (EINVAL);
 	return (kern_sem_wait(td, uap->id, 0, &abstime));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ksem_trywait_args {
 	semid_t		id;
 };
 #endif
 /* ksem_trywait system call: wait with tryflag 1 and no timeout. */
 int
 sys_ksem_trywait(struct thread *td, struct ksem_trywait_args *uap)
 {
 
 	return (kern_sem_wait(td, uap->id, 1, NULL));
 }
 
 /*
  * Common wait path for the ksem wait, trywait and timedwait entry points.
  *
  * Requires CAP_SEM_WAIT on the descriptor.  Under sem_lock, while the
  * value is zero:
  *  - tryflag != 0 fails with EAGAIN;
  *  - abstime == NULL sleeps in cv_wait_sig();
  *  - otherwise the remaining time (abstime minus getnanotime()) is
  *    recomputed before each cv_timedwait_sig(), which is retried while it
  *    returns EWOULDBLOCK; a negative remaining time yields ETIMEDOUT.
  * ks_waiters is raised for the duration of each attempt.  On success the
  * value is decremented and 0 is returned.
  */
 static int
 kern_sem_wait(struct thread *td, semid_t id, int tryflag,
     struct timespec *abstime)
 {
 	struct timespec ts1, ts2;
 	struct timeval tv;
 	cap_rights_t rights;
 	struct file *fp;
 	struct ksem *ks;
 	int error;
 
 	DP((">>> kern_sem_wait entered! pid=%d\n", (int)td->td_proc->p_pid));
 	AUDIT_ARG_FD(id);
 	error = ksem_get(td, id, cap_rights_init(&rights, CAP_SEM_WAIT), &fp);
 	if (error)
 		return (error);
 	ks = fp->f_data;
 	mtx_lock(&sem_lock);
 	DP((">>> kern_sem_wait critical section entered! pid=%d\n",
 	    (int)td->td_proc->p_pid));
 #ifdef MAC
 	error = mac_posixsem_check_wait(td->td_ucred, fp->f_cred, ks);
 	if (error) {
 		DP(("kern_sem_wait mac failed\n"));
 		goto err;
 	}
 #endif
 	DP(("kern_sem_wait value = %d, tryflag %d\n", ks->ks_value, tryflag));
 	vfs_timestamp(&ks->ks_atime);
 	while (ks->ks_value == 0) {
 		ks->ks_waiters++;
 		if (tryflag != 0)
 			error = EAGAIN;
 		else if (abstime == NULL)
 			error = cv_wait_sig(&ks->ks_cv, &sem_lock);
 		else {
 			for (;;) {
 				/* Remaining time = absolute deadline - now. */
 				ts1 = *abstime;
 				getnanotime(&ts2);
-				timespecsub(&ts1, &ts2);
+				timespecsub(&ts1, &ts2, &ts1);
 				TIMESPEC_TO_TIMEVAL(&tv, &ts1);
 				if (tv.tv_sec < 0) {
 					error = ETIMEDOUT;
 					break;
 				}
 				error = cv_timedwait_sig(&ks->ks_cv,
 				    &sem_lock, tvtohz(&tv));
 				if (error != EWOULDBLOCK)
 					break;
 			}
 		}
 		ks->ks_waiters--;
 		if (error)
 			goto err;
 	}
 	ks->ks_value--;
 	DP(("kern_sem_wait value post-decrement = %d\n", ks->ks_value));
 	error = 0;
 err:
 	/* Shared exit for success and failure: unlock and drop the file. */
 	mtx_unlock(&sem_lock);
 	fdrop(fp, td);
 	DP(("<<< kern_sem_wait leaving, pid=%d, error = %d\n",
 	    (int)td->td_proc->p_pid, error));
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ksem_getvalue_args {
 	semid_t		id;
 	int		*val;
 };
 #endif
 int
 sys_ksem_getvalue(struct thread *td, struct ksem_getvalue_args *uap)
 {
 	cap_rights_t rights;
 	struct file *fp;
 	struct ksem *ks;
 	int error, val;
 
 	AUDIT_ARG_FD(uap->id);
 	error = ksem_get(td, uap->id,
 	    cap_rights_init(&rights, CAP_SEM_GETVALUE), &fp);
 	if (error)
 		return (error);
 	ks = fp->f_data;
 
 	mtx_lock(&sem_lock);
 #ifdef MAC
 	error = mac_posixsem_check_getvalue(td->td_ucred, fp->f_cred, ks);
 	if (error) {
 		mtx_unlock(&sem_lock);
 		fdrop(fp, td);
 		return (error);
 	}
 #endif
 	val = ks->ks_value;
 	vfs_timestamp(&ks->ks_atime);
 	mtx_unlock(&sem_lock);
 	fdrop(fp, td);
 	error = copyout(&val, uap->val, sizeof(val));
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ksem_destroy_args {
 	semid_t		id;
 };
 #endif
 /*
  * ksem_destroy system call: destroy an anonymous semaphore.
  *
  * Semaphores without KS_ANONYMOUS are rejected with EINVAL.  If any
  * waiters are recorded the call fails with EBUSY.  Otherwise the semaphore
  * is flagged KS_DEAD under sem_lock and the descriptor is closed through
  * kern_close().
  */
 int
 sys_ksem_destroy(struct thread *td, struct ksem_destroy_args *uap)
 {
 	struct file *fp;
 	struct ksem *ks;
 	int busy, error;
 
 	/* No capability rights required to close a semaphore. */
 	AUDIT_ARG_FD(uap->id);
 	error = ksem_get(td, uap->id, &cap_no_rights, &fp);
 	if (error)
 		return (error);
 	ks = fp->f_data;
 	if ((ks->ks_flags & KS_ANONYMOUS) == 0) {
 		fdrop(fp, td);
 		return (EINVAL);
 	}
 
 	mtx_lock(&sem_lock);
 	busy = (ks->ks_waiters != 0);
 	if (!busy)
 		ks->ks_flags |= KS_DEAD;
 	mtx_unlock(&sem_lock);
 
 	if (busy)
 		error = EBUSY;
 	else
 		error = kern_close(td, uap->id);
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Native semaphore system calls registered by ksem_module_init() and
  * removed by ksem_module_destroy().  Terminated by SYSCALL_INIT_LAST.
  */
 static struct syscall_helper_data ksem_syscalls[] = {
 	SYSCALL_INIT_HELPER(ksem_init),
 	SYSCALL_INIT_HELPER(ksem_open),
 	SYSCALL_INIT_HELPER(ksem_unlink),
 	SYSCALL_INIT_HELPER(ksem_close),
 	SYSCALL_INIT_HELPER(ksem_post),
 	SYSCALL_INIT_HELPER(ksem_wait),
 	SYSCALL_INIT_HELPER(ksem_timedwait),
 	SYSCALL_INIT_HELPER(ksem_trywait),
 	SYSCALL_INIT_HELPER(ksem_getvalue),
 	SYSCALL_INIT_HELPER(ksem_destroy),
 	SYSCALL_INIT_LAST
 };
 
 #ifdef COMPAT_FREEBSD32
 #include <compat/freebsd32/freebsd32.h>
 #include <compat/freebsd32/freebsd32_proto.h>
 #include <compat/freebsd32/freebsd32_signal.h>
 #include <compat/freebsd32/freebsd32_syscall.h>
 #include <compat/freebsd32/freebsd32_util.h>
 
 /*
  * 32-bit compat ksem_init: same as the native call, except that
  * ksem_create() is told to copy the semid out in its 32-bit form.
  */
 int
 freebsd32_ksem_init(struct thread *td, struct freebsd32_ksem_init_args *uap)
 {
 
 	return (ksem_create(td, NULL, uap->idp, S_IRWXU | S_IRWXG, uap->value,
 	    0, 1));
 }
 
 /*
  * 32-bit compat ksem_open: rejects oflag bits other than O_CREAT and
  * O_EXCL with EINVAL, then calls ksem_create() with 32-bit semid copyout.
  */
 int
 freebsd32_ksem_open(struct thread *td, struct freebsd32_ksem_open_args *uap)
 {
 
 	if ((uap->oflag & ~(O_CREAT | O_EXCL)) != 0)
 		return (EINVAL);
 	return (ksem_create(td, uap->name, uap->idp, uap->mode, uap->value,
 	    uap->oflag, 1));
 }
 
 /*
  * 32-bit compat ksem_timedwait.
  *
  * A NULL uap->abstime means wait forever.  Otherwise the 32-bit timespec
  * is copied in and converted field by field, and rejected with EINVAL when
  * tv_nsec is negative or at least 1000000000.
  */
 int
 freebsd32_ksem_timedwait(struct thread *td,
     struct freebsd32_ksem_timedwait_args *uap)
 {
 	struct timespec32 abstime32;
 	struct timespec abstime;
 	int error;
 
 	/* We allow a null timespec (wait forever). */
 	if (uap->abstime == NULL)
 		return (kern_sem_wait(td, uap->id, 0, NULL));
 
 	error = copyin(uap->abstime, &abstime32, sizeof(abstime32));
 	if (error != 0)
 		return (error);
 	CP(abstime32, abstime, tv_sec);
 	CP(abstime32, abstime, tv_nsec);
 	if (abstime.tv_nsec < 0 || abstime.tv_nsec >= 1000000000)
 		return (EINVAL);
 	return (kern_sem_wait(td, uap->id, 0, &abstime));
 }
 
 /*
  * 32-bit compat semaphore system calls.  Only init, open and timedwait
  * have freebsd32_ wrappers in this file; the other entries are registered
  * through SYSCALL32_INIT_HELPER_COMPAT.
  */
 static struct syscall_helper_data ksem32_syscalls[] = {
 	SYSCALL32_INIT_HELPER(freebsd32_ksem_init),
 	SYSCALL32_INIT_HELPER(freebsd32_ksem_open),
 	SYSCALL32_INIT_HELPER_COMPAT(ksem_unlink),
 	SYSCALL32_INIT_HELPER_COMPAT(ksem_close),
 	SYSCALL32_INIT_HELPER_COMPAT(ksem_post),
 	SYSCALL32_INIT_HELPER_COMPAT(ksem_wait),
 	SYSCALL32_INIT_HELPER(freebsd32_ksem_timedwait),
 	SYSCALL32_INIT_HELPER_COMPAT(ksem_trywait),
 	SYSCALL32_INIT_HELPER_COMPAT(ksem_getvalue),
 	SYSCALL32_INIT_HELPER_COMPAT(ksem_destroy),
 	SYSCALL_INIT_LAST
 };
 #endif
 
 /*
  * Set up the semaphore module: initialise the locks, create the path
  * dictionary, publish the P1003.1b configuration values, and register the
  * native (and, with COMPAT_FREEBSD32, the 32-bit) system calls.  Returns
  * the first registration error, or 0.
  */
 static int
 ksem_module_init(void)
 {
 	int error;
 
 	mtx_init(&sem_lock, "sem", NULL, MTX_DEF);
 	mtx_init(&ksem_count_lock, "ksem count", NULL, MTX_DEF);
 	sx_init(&ksem_dict_lock, "ksem dictionary");
 	ksem_dictionary = hashinit(1024, M_KSEM, &ksem_hash);
 
 	p31b_setcfg(CTL_P1003_1B_SEMAPHORES, 200112L);
 	p31b_setcfg(CTL_P1003_1B_SEM_NSEMS_MAX, SEM_MAX);
 	p31b_setcfg(CTL_P1003_1B_SEM_VALUE_MAX, SEM_VALUE_MAX);
 
 	error = syscall_helper_register(ksem_syscalls, SY_THR_STATIC_KLD);
 #ifdef COMPAT_FREEBSD32
 	if (error == 0)
 		error = syscall32_helper_register(ksem32_syscalls,
 		    SY_THR_STATIC_KLD);
 #endif
 	return (error);
 }
 
 /*
  * Undo ksem_module_init(): unregister the system calls first, then clear
  * the P1003.1b semaphore setting, free the dictionary, destroy the locks,
  * and unset the value and count limits.
  */
 static void
 ksem_module_destroy(void)
 {
 
 #ifdef COMPAT_FREEBSD32
 	syscall32_helper_unregister(ksem32_syscalls);
 #endif
 	syscall_helper_unregister(ksem_syscalls);
 
 	p31b_setcfg(CTL_P1003_1B_SEMAPHORES, 0);
 	hashdestroy(ksem_dictionary, M_KSEM, ksem_hash);
 	sx_destroy(&ksem_dict_lock);
 	mtx_destroy(&ksem_count_lock);
 	mtx_destroy(&sem_lock);
 	p31b_unsetcfg(CTL_P1003_1B_SEM_VALUE_MAX);
 	p31b_unsetcfg(CTL_P1003_1B_SEM_NSEMS_MAX);
 }
 
 /*
  * Module event handler for the semaphore module.
  *
  * MOD_LOAD runs ksem_module_init() and tears down again if it fails.
  * MOD_UNLOAD refuses with EOPNOTSUPP while any semaphore exists; otherwise
  * it sets ksem_dead under ksem_count_lock and destroys the module state.
  * MOD_SHUTDOWN is accepted as a no-op.  Any other command returns EINVAL.
  */
 static int
 sem_modload(struct module *module, int cmd, void *arg)
 {
 	int error;
 
 	switch (cmd) {
 	case MOD_LOAD:
 		error = ksem_module_init();
 		if (error != 0)
 			ksem_module_destroy();
 		return (error);
 
 	case MOD_UNLOAD:
 		mtx_lock(&ksem_count_lock);
 		if (nsems != 0) {
 			mtx_unlock(&ksem_count_lock);
 			return (EOPNOTSUPP);
 		}
 		ksem_dead = 1;
 		mtx_unlock(&ksem_count_lock);
 		ksem_module_destroy();
 		return (0);
 
 	case MOD_SHUTDOWN:
 		return (0);
 
 	default:
 		return (EINVAL);
 	}
 }
 
 /* Module descriptor: name, event handler, and no private data. */
 static moduledata_t sem_mod = {
         "sem",
         &sem_modload,
         NULL
 };
 
 DECLARE_MODULE(sem, sem_mod, SI_SUB_SYSV_SEM, SI_ORDER_FIRST);
 MODULE_VERSION(sem, 1);
Index: head/sys/mips/ingenic/jz4780_smb.c
===================================================================
--- head/sys/mips/ingenic/jz4780_smb.c	(revision 336913)
+++ head/sys/mips/ingenic/jz4780_smb.c	(revision 336914)
@@ -1,480 +1,480 @@
 /*-
  * Copyright (c) 2016 Jared McNeill <jmcneill@invisible.ca>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 /*
  * Ingenic JZ4780 SMB Controller
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/rman.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/time.h>
 #include <machine/bus.h>
 
 #include <dev/ofw/ofw_bus.h>
 #include <dev/ofw/ofw_bus_subr.h>
 
 #include <dev/iicbus/iiconf.h>
 #include <dev/iicbus/iicbus.h>
 
 #include <dev/extres/clk/clk.h>
 
 #include <mips/ingenic/jz4780_smb.h>
 
 #include "iicbus_if.h"
 
 #define	JZSMB_TIMEOUT			((300UL * hz) / 1000)
 
 #define	JZSMB_SPEED_STANDARD		100000
 #define	JZSMB_SETUP_TIME_STANDARD	300
 #define	JZSMB_HOLD_TIME_STANDARD	400
 #define	JZSMB_PERIOD_MIN_STANDARD	4000
 #define	JZSMB_PERIOD_MAX_STANDARD	4700
 
 #define	JZSMB_SPEED_FAST		400000
 #define	JZSMB_SETUP_TIME_FAST		450
 #define	JZSMB_HOLD_TIME_FAST		450
 #define	JZSMB_PERIOD_MIN_FAST		600
 #define	JZSMB_PERIOD_MAX_FAST		1300
 
 #define	JZSMB_HCNT_BASE			8
 #define	JZSMB_HCNT_MIN			6
 #define	JZSMB_LCNT_BASE			1
 #define	JZSMB_LCNT_MIN			8
 
 /*
  * Convert a timespec to the value tvtohz() returns for the equivalent
  * timeval (sub-microsecond precision is dropped).
  */
 static inline int
 tstohz(const struct timespec *tsp)
 {
 	struct timeval tv;
 
 	/* Open-coded equivalent of TIMESPEC_TO_TIMEVAL(&tv, tsp). */
 	tv.tv_sec = tsp->tv_sec;
 	tv.tv_usec = tsp->tv_nsec / 1000;
 
 	return (tvtohz(&tv));
 }
 
 /*
  * Device tree "compatible" strings matched by jzsmb_probe().  A non-zero
  * data value marks a supported device; the NULL entry ends the table.
  */
 static struct ofw_compat_data compat_data[] = {
 	{ "ingenic,jz4780-i2c",		1 },
 	{ NULL,				0 }
 };
 
 /*
  * Resources allocated in jzsmb_attach(): a single memory resource
  * (rid 0), activated on allocation.  The -1 entry ends the list.
  */
 static struct resource_spec jzsmb_spec[] = {
 	{ SYS_RES_MEMORY,	0,	RF_ACTIVE },
 	{ -1, 0 }
 };
 
 /* Per-device driver state. */
 struct jzsmb_softc {
 	struct resource	*res;		/* register window (see jzsmb_spec) */
 	struct mtx	mtx;		/* protects the softc and registers */
 	clk_t		clk;		/* controller clock from the DT */
 	device_t	iicbus;		/* child iicbus device */
 	int		busy;		/* non-zero while a transfer runs */
 	uint32_t	i2c_freq;	/* requested SCL frequency */
 	uint64_t	bus_freq;	/* frequency reported for clk */
 	uint32_t	status;		/* zeroed at the start of a transfer */
 
 	struct iic_msg	*msg;		/* reset to NULL when a transfer ends */
 };
 
 /* Softc locking helpers and 2-byte register accessors. */
 #define	SMB_LOCK(sc)			mtx_lock(&(sc)->mtx)
 #define	SMB_UNLOCK(sc)			mtx_unlock(&(sc)->mtx)
 #define	SMB_ASSERT_LOCKED(sc)		mtx_assert(&(sc)->mtx, MA_OWNED)
 #define	SMB_READ(sc, reg)		bus_read_2((sc)->res, (reg))
 #define	SMB_WRITE(sc, reg, val)		bus_write_2((sc)->res, (reg), (val))
 
 /*
  * ofw_bus_get_node method: every child is reported with the controller's
  * own device tree node; the `dev' argument is not consulted.
  */
 static phandle_t
 jzsmb_get_node(device_t bus, device_t dev)
 {
 	return (ofw_bus_get_node(bus));
 }
 
 /*
  * Enable (enable != 0) or disable the controller and busy-wait until the
  * enable status register reflects the requested state.  There is no
  * timeout on the wait.  The softc mutex must be held.  Always returns 0.
  */
 static int
 jzsmb_enable(struct jzsmb_softc *sc, int enable)
 {
 	const int want = (enable != 0);
 
 	SMB_ASSERT_LOCKED(sc);
 
 	SMB_WRITE(sc, SMBENB, want ? SMBENB_SMBENB : 0);
 
 	/* Spin until the status bit matches what was just requested. */
 	while (((SMB_READ(sc, SMBENBST) & SMBENBST_SMBEN) != 0) != want)
 		;
 
 	return (0);
 }
 
 /*
  * Reprogram the controller for master-mode operation with target `addr'.
  *
  * The controller is disabled, interrupts are masked, and the speed mode,
  * SCL high/low counts, and SDA setup/hold times are derived from
  * sc->bus_freq and sc->i2c_freq.  The controller is re-enabled only when
  * `addr' is non-zero.  The softc mutex must be held.
  *
  * Returns 0 on success, or EINVAL when sc->i2c_freq is neither
  * JZSMB_SPEED_STANDARD nor JZSMB_SPEED_FAST (the controller is left
  * disabled in that case).
  */
 static int
 jzsmb_reset_locked(device_t dev, u_char addr)
 {
 	struct jzsmb_softc *sc;
 	uint16_t con;
 	uint32_t period;
 	int hcnt, lcnt, setup_time, hold_time;
 
 	sc = device_get_softc(dev);
 
 	SMB_ASSERT_LOCKED(sc);
 
 	/* Setup master mode operation */
 
 	/* Disable SMB */
 	jzsmb_enable(sc, 0);
 
 	/* Disable interrupts */
 	SMB_WRITE(sc, SMBINTM, 0);
 
 	/* Set supported speed mode and expected SCL frequency */
 	period = sc->bus_freq / sc->i2c_freq;
 	con = SMBCON_REST | SMBCON_SLVDIS | SMBCON_MD;
 	switch (sc->i2c_freq) {
 	case JZSMB_SPEED_STANDARD:
 		con |= SMBCON_SPD_STANDARD;
 		setup_time = JZSMB_SETUP_TIME_STANDARD;
 		hold_time = JZSMB_HOLD_TIME_STANDARD;
 		/* Split the period between high and low by the MIN/MAX ratio. */
 		hcnt = (period * JZSMB_PERIOD_MIN_STANDARD) /
 		    (JZSMB_PERIOD_MAX_STANDARD + JZSMB_PERIOD_MIN_STANDARD);
 		lcnt = period - hcnt;
 		hcnt = MAX(hcnt - JZSMB_HCNT_BASE, JZSMB_HCNT_MIN);
 		lcnt = MAX(lcnt - JZSMB_LCNT_BASE, JZSMB_LCNT_MIN);
 		SMB_WRITE(sc, SMBCON, con);
 		SMB_WRITE(sc, SMBSHCNT, hcnt);
 		SMB_WRITE(sc, SMBSLCNT, lcnt);
 		break;
 	case JZSMB_SPEED_FAST:
 		con |= SMBCON_SPD_FAST;
 		setup_time = JZSMB_SETUP_TIME_FAST;
 		hold_time = JZSMB_HOLD_TIME_FAST;
 		hcnt = (period * JZSMB_PERIOD_MIN_FAST) /
 		    (JZSMB_PERIOD_MAX_FAST + JZSMB_PERIOD_MIN_FAST);
 		lcnt = period - hcnt;
 		hcnt = MAX(hcnt - JZSMB_HCNT_BASE, JZSMB_HCNT_MIN);
 		lcnt = MAX(lcnt - JZSMB_LCNT_BASE, JZSMB_LCNT_MIN);
 		SMB_WRITE(sc, SMBCON, con);
 		SMB_WRITE(sc, SMBFHCNT, hcnt);
 		SMB_WRITE(sc, SMBFLCNT, lcnt);
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	/*
 	 * Clamp the setup time to [1, 255].  The previous expression,
 	 * MIN(1, MAX(255, x)), had the bounds swapped and always produced 1.
 	 */
 	setup_time = ((setup_time * sc->bus_freq / 1000) / 1000000) + 1;
 	setup_time = MAX(1, MIN(255, setup_time));
 	SMB_WRITE(sc, SMBSDASU, setup_time);
 
 	/*
 	 * Cap the hold time at 255.  The previous MAX(255, x) forced the
 	 * value to at least 255, which also made the "hold disabled"
 	 * branch below unreachable.
 	 */
 	hold_time = ((hold_time * sc->bus_freq / 1000) / 1000000) - 1;
 	hold_time = MIN(255, hold_time);
 	if (hold_time >= 0)
 		SMB_WRITE(sc, SMBSDAHD, hold_time | SMBSDAHD_HDENB);
 	else
 		SMB_WRITE(sc, SMBSDAHD, 0);
 
 	SMB_WRITE(sc, SMBTAR, addr >> 1);
 
 	if (addr != 0) {
 		/* Enable SMB */
 		jzsmb_enable(sc, 1);
 	}
 
 	return (0);
 }
 
 /*
  * iicbus_reset method: take the softc lock and reprogram the controller
  * for target `addr' via jzsmb_reset_locked().  The `speed' and `oldaddr'
  * arguments are accepted but not used.
  */
 static int
 jzsmb_reset(device_t dev, u_char speed, u_char addr, u_char *oldaddr)
 {
 	struct jzsmb_softc *sc = device_get_softc(dev);
 	int rv;
 
 	SMB_LOCK(sc);
 	rv = jzsmb_reset_locked(dev, addr);
 	SMB_UNLOCK(sc);
 
 	return (rv);
 }
 
 /*
  * Read msg->len bytes into msg->buf by polling the controller.
  *
  * The softc mutex must be held.  Returns 0 on success, or EIO when the
  * time elapsed since the start of the message reaches
  * JZSMB_TIMEOUT * msg->len ticks.  On timeout SMBCON_STPHLD is left set
  * here; jzsmb_transfer() clears it on its way out.
  */
 static int
 jzsmb_transfer_read(device_t dev, struct iic_msg *msg)
 {
 	struct jzsmb_softc *sc;
 	struct timespec start, diff;
 	uint16_t con, resid;
 	int timeo;
 
 	sc = device_get_softc(dev);
 	timeo = JZSMB_TIMEOUT * msg->len;
 
 	SMB_ASSERT_LOCKED(sc);
 
 	/* Set the stop-hold bit for the duration of the message. */
 	con = SMB_READ(sc, SMBCON);
 	con |= SMBCON_STPHLD;
 	SMB_WRITE(sc, SMBCON, con);
 
 	getnanouptime(&start);
 	for (resid = msg->len; resid > 0; resid--) {
 		/*
 		 * Queue read commands.
 		 * NOTE(review): this issues min(resid, 8) commands for every
 		 * byte consumed, not once per batch -- confirm this is what
 		 * the hardware expects.
 		 */
 		for (int i = 0; i < min(resid, 8); i++)
 			SMB_WRITE(sc, SMBDC, SMBDC_CMD);
 		for (;;) {
 			/* diff = time elapsed since the message started. */
 			getnanouptime(&diff);
-			timespecsub(&diff, &start);
+			timespecsub(&diff, &start, &diff);
 			if ((SMB_READ(sc, SMBST) & SMBST_RFNE) != 0) {
 				/* A byte is available: store it and move on. */
 				msg->buf[msg->len - resid] =
 				    SMB_READ(sc, SMBDC) & SMBDC_DAT;
 				break;
 			} else
 				DELAY(1000);
 
 			if (tstohz(&diff) >= timeo) {
 				device_printf(dev,
 				    "read timeout (status=0x%02x)\n",
 				    SMB_READ(sc, SMBST));
 				return (EIO);
 			}
 		}
 	}
 
 	/* Clear the stop-hold bit again. */
 	con = SMB_READ(sc, SMBCON);
 	con &= ~SMBCON_STPHLD;
 	SMB_WRITE(sc, SMBCON, con);
 
 	return (0);
 }
 
 /*
  * Write msg->len bytes from msg->buf by polling the controller.
  *
  * When `stop_hold' is non-zero, SMBCON_STPHLD is left set on success
  * (jzsmb_transfer() passes non-zero for every message but the last).
  * The softc mutex must be held.  Returns 0 on success, or EIO when the
  * time elapsed since the start of the message reaches
  * JZSMB_TIMEOUT * msg->len ticks.
  */
 static int
 jzsmb_transfer_write(device_t dev, struct iic_msg *msg, int stop_hold)
 {
 	struct jzsmb_softc *sc;
 	struct timespec start, diff;
 	uint16_t con, resid;
 	int timeo;
 
 	sc = device_get_softc(dev);
 	timeo = JZSMB_TIMEOUT * msg->len;
 
 	SMB_ASSERT_LOCKED(sc);
 
 	/* Set the stop-hold bit for the duration of the message. */
 	con = SMB_READ(sc, SMBCON);
 	con |= SMBCON_STPHLD;
 	SMB_WRITE(sc, SMBCON, con);
 
 	getnanouptime(&start);
 	for (resid = msg->len; resid > 0; resid--) {
 		for (;;) {
 			/* diff = time elapsed since the message started. */
 			getnanouptime(&diff);
-			timespecsub(&diff, &start);
+			timespecsub(&diff, &start, &diff);
 			if ((SMB_READ(sc, SMBST) & SMBST_TFNF) != 0) {
 				/* The TFNF status bit is set: queue the next byte. */
 				SMB_WRITE(sc, SMBDC,
 				    msg->buf[msg->len - resid]);
 				break;
 			} else
 				DELAY((1000 * hz) / JZSMB_TIMEOUT);
 
 			if (tstohz(&diff) >= timeo) {
 				device_printf(dev,
 				    "write timeout (status=0x%02x)\n",
 				    SMB_READ(sc, SMBST));
 				return (EIO);
 			}
 		}
 	}
 
 	/* Clear stop-hold unless the caller asked to keep it set. */
 	if (!stop_hold) {
 		con = SMB_READ(sc, SMBCON);
 		con &= ~SMBCON_STPHLD;
 		SMB_WRITE(sc, SMBCON, con);
 	}
 
 	return (0);
 }
 
 /*
  * iicbus_transfer method: run `nmsgs' messages from `msgs' in order.
  *
  * Callers are serialized through sc->busy.  The controller is
  * reprogrammed whenever the target address differs from the previous
  * message's, and is disabled again before returning.
  *
  * Returns 0 when every message completed (including the nmsgs == 0
  * case), otherwise the first error reported while resetting the
  * controller or transferring a message.
  */
 static int
 jzsmb_transfer(device_t dev, struct iic_msg *msgs, uint32_t nmsgs)
 {
 	struct jzsmb_softc *sc;
 	uint32_t n;
 	uint16_t con;
 	int error;
 
 	sc = device_get_softc(dev);
 	/* Defined result even when there are no messages to process. */
 	error = 0;
 
 	SMB_LOCK(sc);
 	while (sc->busy)
 		mtx_sleep(sc, &sc->mtx, 0, "i2cbuswait", 0);
 	sc->busy = 1;
 	sc->status = 0;
 
 	for (n = 0; n < nmsgs; n++) {
 		/* Set target address */
 		if (n == 0 || msgs[n].slave != msgs[n - 1].slave) {
 			error = jzsmb_reset_locked(dev, msgs[n].slave);
 			if (error != 0)
 				break;
 		}
 
 		/* Set read or write */
 		if ((msgs[n].flags & IIC_M_RD) != 0)
 			error = jzsmb_transfer_read(dev, &msgs[n]);
 		else
 			error = jzsmb_transfer_write(dev, &msgs[n],
 			    n < nmsgs - 1);
 
 		if (error != 0)
 			break;
 	}
 
 	/* Send stop if necessary */
 	con = SMB_READ(sc, SMBCON);
 	con &= ~SMBCON_STPHLD;
 	SMB_WRITE(sc, SMBCON, con);
 
 	/* Disable SMB */
 	jzsmb_enable(sc, 0);
 
 	/* Release the bus and wake any thread waiting in mtx_sleep() above. */
 	sc->msg = NULL;
 	sc->busy = 0;
 	wakeup(sc);
 	SMB_UNLOCK(sc);
 
 	return (error);
 }
 
 /*
  * Probe method: accept the device only when its device tree status is
  * okay and it matches an entry in compat_data.  Returns
  * BUS_PROBE_DEFAULT on a match and ENXIO otherwise.
  */
 static int
 jzsmb_probe(device_t dev)
 {
 	if (!ofw_bus_status_okay(dev) ||
 	    ofw_bus_search_compatible(dev, compat_data)->ocd_data == 0)
 		return (ENXIO);
 
 	device_set_desc(dev, "Ingenic JZ4780 SMB Controller");
 	return (BUS_PROBE_DEFAULT);
 }
 
 /*
  * Attach method: set up the mutex, clock, and register window, read the
  * requested bus speed from the device tree, and attach the iicbus child.
  *
  * Returns 0 on success.  On failure every resource acquired so far is
  * released and an errno value is returned.
  */
 static int
 jzsmb_attach(device_t dev)
 {
 	struct jzsmb_softc *sc;
 	phandle_t node;
 	int error;
 
 	sc = device_get_softc(dev);
 	node = ofw_bus_get_node(dev);
 	mtx_init(&sc->mtx, device_get_nameunit(dev), "jzsmb", MTX_DEF);
 
 	error = clk_get_by_ofw_index(dev, 0, 0, &sc->clk);
 	if (error != 0) {
 		device_printf(dev, "cannot get clock\n");
 		goto fail;
 	}
 	error = clk_enable(sc->clk);
 	if (error != 0) {
 		device_printf(dev, "cannot enable clock\n");
 		goto fail;
 	}
 	error = clk_get_freq(sc->clk, &sc->bus_freq);
 	if (error != 0 || sc->bus_freq == 0) {
 		device_printf(dev, "cannot get bus frequency\n");
 		/*
 		 * A zero frequency is unusable even when the call itself
 		 * succeeded, so report a real error, and go through the
 		 * common cleanup instead of leaking the mutex and clock.
 		 */
 		if (error == 0)
 			error = ENXIO;
 		goto fail;
 	}
 
 	if (bus_alloc_resources(dev, jzsmb_spec, &sc->res) != 0) {
 		device_printf(dev, "cannot allocate resources for device\n");
 		error = ENXIO;
 		goto fail;
 	}
 
 	/*
 	 * OF_getencprop() returns the property length on success and -1 on
 	 * failure, so only a non-positive result means the property is
 	 * missing.  Comparing against 0 would discard a valid value.
 	 */
 	if (OF_getencprop(node, "clock-frequency", &sc->i2c_freq,
 	    sizeof(sc->i2c_freq)) <= 0 || sc->i2c_freq == 0)
 		sc->i2c_freq = 100000;	/* Default to standard mode */
 
 	sc->iicbus = device_add_child(dev, "iicbus", -1);
 	if (sc->iicbus == NULL) {
 		device_printf(dev, "cannot add iicbus child device\n");
 		error = ENXIO;
 		goto fail;
 	}
 
 	bus_generic_attach(dev);
 
 	return (0);
 
 fail:
 	bus_release_resources(dev, jzsmb_spec, &sc->res);
 	if (sc->clk != NULL)
 		clk_release(sc->clk);
 	mtx_destroy(&sc->mtx);
 	return (error);
 }
 
 /*
  * Method table for the controller: probe/attach, generic bus resource
  * and interrupt pass-through, the OFW node lookup, and the iicbus
  * reset/transfer entry points.
  */
 static device_method_t jzsmb_methods[] = {
 	/* Device interface */
 	DEVMETHOD(device_probe,		jzsmb_probe),
 	DEVMETHOD(device_attach,	jzsmb_attach),
 
 	/* Bus interface */
 	DEVMETHOD(bus_setup_intr,	bus_generic_setup_intr),
 	DEVMETHOD(bus_teardown_intr,	bus_generic_teardown_intr),
 	DEVMETHOD(bus_alloc_resource,	bus_generic_alloc_resource),
 	DEVMETHOD(bus_release_resource,	bus_generic_release_resource),
 	DEVMETHOD(bus_activate_resource, bus_generic_activate_resource),
 	DEVMETHOD(bus_deactivate_resource, bus_generic_deactivate_resource),
 	DEVMETHOD(bus_adjust_resource,	bus_generic_adjust_resource),
 	DEVMETHOD(bus_set_resource,	bus_generic_rl_set_resource),
 	DEVMETHOD(bus_get_resource,	bus_generic_rl_get_resource),
 
 	/* OFW methods */
 	DEVMETHOD(ofw_bus_get_node,	jzsmb_get_node),
 
 	/* iicbus interface */
 	DEVMETHOD(iicbus_callback,	iicbus_null_callback),
 	DEVMETHOD(iicbus_reset,		jzsmb_reset),
 	DEVMETHOD(iicbus_transfer,	jzsmb_transfer),
 
 	DEVMETHOD_END
 };
 
 /* Driver description: registered under the "iichb" name. */
 static driver_t jzsmb_driver = {
 	"iichb",
 	jzsmb_methods,
 	sizeof(struct jzsmb_softc),
 };
 
 static devclass_t jzsmb_devclass;
 
 /*
  * Register iicbus as a child driver of jzsmb, and jzsmb itself on
  * simplebus, both at the BUS_PASS_RESOURCE + BUS_PASS_ORDER_MIDDLE pass.
  */
 EARLY_DRIVER_MODULE(iicbus, jzsmb, iicbus_driver, iicbus_devclass, 0, 0,
     BUS_PASS_RESOURCE + BUS_PASS_ORDER_MIDDLE);
 EARLY_DRIVER_MODULE(jzsmb, simplebus, jzsmb_driver, jzsmb_devclass, 0, 0,
     BUS_PASS_RESOURCE + BUS_PASS_ORDER_MIDDLE);
 MODULE_VERSION(jzsmb, 1);
Index: head/sys/netinet/ip_input.c
===================================================================
--- head/sys/netinet/ip_input.c	(revision 336913)
+++ head/sys/netinet/ip_input.c	(revision 336914)
@@ -1,1427 +1,1427 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1988, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ip_input.c	8.2 (Berkeley) 1/4/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_bootp.h"
 #include "opt_ipstealth.h"
 #include "opt_ipsec.h"
 #include "opt_route.h"
 #include "opt_rss.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/hhook.h>
 #include <sys/mbuf.h>
 #include <sys/malloc.h>
 #include <sys/domain.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/time.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/rmlock.h>
 #include <sys/rwlock.h>
 #include <sys/sdt.h>
 #include <sys/syslog.h>
 #include <sys/sysctl.h>
 
 #include <net/pfil.h>
 #include <net/if.h>
 #include <net/if_types.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
 #include <net/route.h>
 #include <net/netisr.h>
 #include <net/rss_config.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_fw.h>
 #include <netinet/ip_icmp.h>
 #include <netinet/ip_options.h>
 #include <machine/in_cksum.h>
 #include <netinet/ip_carp.h>
 #include <netinet/in_rss.h>
 
 #include <netipsec/ipsec_support.h>
 
 #include <sys/socketvar.h>
 
 #include <security/mac/mac_framework.h>
 
 #ifdef CTASSERT
 CTASSERT(sizeof(struct ip) == 20);
 #endif
 
 /* IP reassembly functions are defined in ip_reass.c. */
 extern void ipreass_init(void);
 extern void ipreass_drain(void);
 extern void ipreass_slowtimo(void);
 #ifdef VIMAGE
 extern void ipreass_destroy(void);
 #endif
 
 struct rmlock in_ifaddr_lock;
 RM_SYSINIT(in_ifaddr_lock, &in_ifaddr_lock, "in_ifaddr_lock");
 
 VNET_DEFINE(int, rsvp_on);
 
 VNET_DEFINE(int, ipforwarding);
 SYSCTL_INT(_net_inet_ip, IPCTL_FORWARDING, forwarding, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(ipforwarding), 0,
     "Enable IP forwarding between interfaces");
 
 VNET_DEFINE_STATIC(int, ipsendredirects) = 1;	/* XXX */
 #define	V_ipsendredirects	VNET(ipsendredirects)
 SYSCTL_INT(_net_inet_ip, IPCTL_SENDREDIRECTS, redirect, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(ipsendredirects), 0,
     "Enable sending IP redirects");
 
 /*
  * XXX - Setting ip_checkinterface mostly implements the receive side of
  * the Strong ES model described in RFC 1122, but since the routing table
  * and transmit implementation do not implement the Strong ES model,
  * setting this to 1 results in an odd hybrid.
  *
  * XXX - ip_checkinterface currently must be disabled if you use ipnat
  * to translate the destination address to another local interface.
  *
  * XXX - ip_checkinterface must be disabled if you add IP aliases
  * to the loopback interface instead of the interface where the
  * packets for those addresses are received.
  */
 VNET_DEFINE_STATIC(int, ip_checkinterface);
 #define	V_ip_checkinterface	VNET(ip_checkinterface)
 SYSCTL_INT(_net_inet_ip, OID_AUTO, check_interface, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(ip_checkinterface), 0,
     "Verify packet arrives on correct interface");
 
 VNET_DEFINE(struct pfil_head, inet_pfil_hook);	/* Packet filter hooks */
 
 /*
  * netisr handler for IPv4 input; packets are handed to ip_input().
  * With RSS the CPU is chosen by rss_soft_m2cpuid_v4() under the CPU
  * policy with hybrid dispatch; without it the flow policy is used.
  */
 static struct netisr_handler ip_nh = {
 	.nh_name = "ip",
 	.nh_handler = ip_input,
 	.nh_proto = NETISR_IP,
 #ifdef	RSS
 	.nh_m2cpuid = rss_soft_m2cpuid_v4,
 	.nh_policy = NETISR_POLICY_CPU,
 	.nh_dispatch = NETISR_DISPATCH_HYBRID,
 #else
 	.nh_policy = NETISR_POLICY_FLOW,
 #endif
 };
 
 #ifdef	RSS
 /*
  * Directly dispatched frames are currently assumed
  * to have a flowid already calculated.
  *
  * There should likely be an assertion that the frame
  * actually has valid flow details.
  */
 /* netisr handler for the direct path; packets go to ip_direct_input(). */
 static struct netisr_handler ip_direct_nh = {
 	.nh_name = "ip_direct",
 	.nh_handler = ip_direct_input,
 	.nh_proto = NETISR_IP_DIRECT,
 	.nh_m2cpuid = rss_soft_m2cpuid_v4,
 	.nh_policy = NETISR_POLICY_CPU,
 	.nh_dispatch = NETISR_DISPATCH_HYBRID,
 };
 #endif
 
 extern	struct domain inetdomain;
 extern	struct protosw inetsw[];
 u_char	ip_protox[IPPROTO_MAX];
 VNET_DEFINE(struct in_ifaddrhead, in_ifaddrhead);  /* first inet address */
 VNET_DEFINE(struct in_ifaddrhashhead *, in_ifaddrhashtbl); /* inet addr hash table  */
 VNET_DEFINE(u_long, in_ifaddrhmask);		/* mask for hash table */
 
 #ifdef IPCTL_DEFMTU
 SYSCTL_INT(_net_inet_ip, IPCTL_DEFMTU, mtu, CTLFLAG_RW,
     &ip_mtu, 0, "Default MTU");
 #endif
 
 #ifdef IPSTEALTH
 VNET_DEFINE(int, ipstealth);
 SYSCTL_INT(_net_inet_ip, OID_AUTO, stealth, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(ipstealth), 0,
     "IP stealth mode, no TTL decrementation on forwarding");
 #endif
 
 /*
  * IP statistics are stored in the "array" of counter(9)s.
  */
 VNET_PCPUSTAT_DEFINE(struct ipstat, ipstat);
 VNET_PCPUSTAT_SYSINIT(ipstat);
 SYSCTL_VNET_PCPUSTAT(_net_inet_ip, IPCTL_STATS, stats, struct ipstat, ipstat,
     "IP statistics (struct ipstat, netinet/ip_var.h)");
 
 #ifdef VIMAGE
 VNET_PCPUSTAT_SYSUNINIT(ipstat);
 #endif /* VIMAGE */
 
 /*
  * Kernel module interface for updating ipstat.  The argument is an index
  * into ipstat treated as an array.
  */
 void
 kmod_ipstat_inc(int statnum)
 {
 
 	/* Add one to the counter at index statnum; the index is not checked. */
 	counter_u64_add(VNET(ipstat)[statnum], 1);
 }
 
 /*
  * Counterpart of kmod_ipstat_inc(): subtract one from the ipstat counter
  * at index statnum.  The index is not range-checked.
  */
 void
 kmod_ipstat_dec(int statnum)
 {
 
 	counter_u64_add(VNET(ipstat)[statnum], -1);
 }
 
 static int
 sysctl_netinet_intr_queue_maxlen(SYSCTL_HANDLER_ARGS)
 {
 	int error, qlimit;
 
 	netisr_getqlimit(&ip_nh, &qlimit);
 	error = sysctl_handle_int(oidp, &qlimit, 0, req);
 	if (error || !req->newptr)
 		return (error);
 	if (qlimit < 1)
 		return (EINVAL);
 	return (netisr_setqlimit(&ip_nh, qlimit));
 }
 SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQMAXLEN, intr_queue_maxlen,
     CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_netinet_intr_queue_maxlen, "I",
     "Maximum size of the IP input queue");
 
 static int
 sysctl_netinet_intr_queue_drops(SYSCTL_HANDLER_ARGS)
 {
 	u_int64_t qdrops_long;
 	int error, qdrops;
 
 	netisr_getqdrops(&ip_nh, &qdrops_long);
 	qdrops = qdrops_long;
 	error = sysctl_handle_int(oidp, &qdrops, 0, req);
 	if (error || !req->newptr)
 		return (error);
 	if (qdrops != 0)
 		return (EINVAL);
 	netisr_clearqdrops(&ip_nh);
 	return (0);
 }
 
 SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQDROPS, intr_queue_drops,
     CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_netinet_intr_queue_drops, "I",
     "Number of packets dropped from the IP input queue");
 
 #ifdef	RSS
 static int
 sysctl_netinet_intr_direct_queue_maxlen(SYSCTL_HANDLER_ARGS)
 {
 	int error, qlimit;
 
 	netisr_getqlimit(&ip_direct_nh, &qlimit);
 	error = sysctl_handle_int(oidp, &qlimit, 0, req);
 	if (error || !req->newptr)
 		return (error);
 	if (qlimit < 1)
 		return (EINVAL);
 	return (netisr_setqlimit(&ip_direct_nh, qlimit));
 }
 SYSCTL_PROC(_net_inet_ip, IPCTL_INTRDQMAXLEN, intr_direct_queue_maxlen,
     CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_netinet_intr_direct_queue_maxlen,
     "I", "Maximum size of the IP direct input queue");
 
 static int
 sysctl_netinet_intr_direct_queue_drops(SYSCTL_HANDLER_ARGS)
 {
 	u_int64_t qdrops_long;
 	int error, qdrops;
 
 	netisr_getqdrops(&ip_direct_nh, &qdrops_long);
 	qdrops = qdrops_long;
 	error = sysctl_handle_int(oidp, &qdrops, 0, req);
 	if (error || !req->newptr)
 		return (error);
 	if (qdrops != 0)
 		return (EINVAL);
 	netisr_clearqdrops(&ip_direct_nh);
 	return (0);
 }
 
 SYSCTL_PROC(_net_inet_ip, IPCTL_INTRDQDROPS, intr_direct_queue_drops,
     CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_netinet_intr_direct_queue_drops, "I",
     "Number of packets dropped from the IP direct input queue");
 #endif	/* RSS */
 
 /*
  * IP initialization: fill in IP protocol switch table.
  * All protocols not implemented in kernel go to raw IP protocol handler.
  */
 void
 ip_init(void)
 {
 	struct protosw *pr;
 	int i;
 
 	/* Per-vnet state: address list and address hash table. */
 	CK_STAILQ_INIT(&V_in_ifaddrhead);
 	V_in_ifaddrhashtbl = hashinit(INADDR_NHASH, M_IFADDR, &V_in_ifaddrhmask);
 
 	/* Initialize IP reassembly queue. */
 	ipreass_init();
 
 	/* Initialize packet filter hooks. */
 	V_inet_pfil_hook.ph_type = PFIL_TYPE_AF;
 	V_inet_pfil_hook.ph_af = AF_INET;
 	/* Registration failures below are only logged, not treated as fatal. */
 	if ((i = pfil_head_register(&V_inet_pfil_hook)) != 0)
 		printf("%s: WARNING: unable to register pfil hook, "
 			"error %d\n", __func__, i);
 
 	if (hhook_head_register(HHOOK_TYPE_IPSEC_IN, AF_INET,
 	    &V_ipsec_hhh_in[HHOOK_IPSEC_INET],
 	    HHOOK_WAITOK | HHOOK_HEADISINVNET) != 0)
 		printf("%s: WARNING: unable to register input helper hook\n",
 		    __func__);
 	if (hhook_head_register(HHOOK_TYPE_IPSEC_OUT, AF_INET,
 	    &V_ipsec_hhh_out[HHOOK_IPSEC_INET],
 	    HHOOK_WAITOK | HHOOK_HEADISINVNET) != 0)
 		printf("%s: WARNING: unable to register output helper hook\n",
 		    __func__);
 
 	/* Skip initialization of globals for non-default instances. */
 #ifdef VIMAGE
 	if (!IS_DEFAULT_VNET(curvnet)) {
 		/* Non-default vnets only register the netisr handlers per vnet. */
 		netisr_register_vnet(&ip_nh);
 #ifdef	RSS
 		netisr_register_vnet(&ip_direct_nh);
 #endif
 		return;
 	}
 #endif
 
 	/* The raw IP entry is the fallback for every protocol number. */
 	pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW);
 	if (pr == NULL)
 		panic("ip_init: PF_INET not found");
 
 	/* Initialize the entire ip_protox[] array to IPPROTO_RAW. */
 	for (i = 0; i < IPPROTO_MAX; i++)
 		ip_protox[i] = pr - inetsw;
 	/*
 	 * Cycle through IP protocols and put them into the appropriate place
 	 * in ip_protox[].
 	 */
 	for (pr = inetdomain.dom_protosw;
 	    pr < inetdomain.dom_protoswNPROTOSW; pr++)
 		if (pr->pr_domain->dom_family == PF_INET &&
 		    pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW) {
 			/* Be careful to only index valid IP protocols. */
 			if (pr->pr_protocol < IPPROTO_MAX)
 				ip_protox[pr->pr_protocol] = pr - inetsw;
 		}
 
 	/* Default vnet: register the global netisr handlers. */
 	netisr_register(&ip_nh);
 #ifdef	RSS
 	netisr_register(&ip_direct_nh);
 #endif
 }
 
 #ifdef VIMAGE
 /*
  * Per-vnet teardown for IPv4: unregister the netisr handlers and the
  * pfil and helper hooks, remove IPv4 addresses and routes from every
  * interface, and free the reassembly queue and address hash table.
  * Unregistration failures are logged and otherwise ignored.
  */
 static void
 ip_destroy(void *unused __unused)
 {
 	struct ifnet *ifp;
 	int error;
 
 	/* Unregister the per-vnet netisr handlers first. */
 #ifdef	RSS
 	netisr_unregister_vnet(&ip_direct_nh);
 #endif
 	netisr_unregister_vnet(&ip_nh);
 
 	if ((error = pfil_head_unregister(&V_inet_pfil_hook)) != 0)
 		printf("%s: WARNING: unable to unregister pfil hook, "
 		    "error %d\n", __func__, error);
 
 	error = hhook_head_deregister(V_ipsec_hhh_in[HHOOK_IPSEC_INET]);
 	if (error != 0) {
 		printf("%s: WARNING: unable to deregister input helper hook "
 		    "type HHOOK_TYPE_IPSEC_IN, id HHOOK_IPSEC_INET: "
 		    "error %d returned\n", __func__, error);
 	}
 	error = hhook_head_deregister(V_ipsec_hhh_out[HHOOK_IPSEC_INET]);
 	if (error != 0) {
 		printf("%s: WARNING: unable to deregister output helper hook "
 		    "type HHOOK_TYPE_IPSEC_OUT, id HHOOK_IPSEC_INET: "
 		    "error %d returned\n", __func__, error);
 	}
 
 	/* Remove the IPv4 addresses from all interfaces. */
 	in_ifscrub_all();
 
 	/* Make sure the IPv4 routes are gone as well. */
 	IFNET_RLOCK();
 	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link)
 		rt_flushifroutes_af(ifp, AF_INET);
 	IFNET_RUNLOCK();
 
 	/* Destroy IP reassembly queue. */
 	ipreass_destroy();
 
 	/* Cleanup in_ifaddr hash table; should be empty. */
 	hashdestroy(V_in_ifaddrhashtbl, M_IFADDR, V_in_ifaddrhmask);
 }
 
 VNET_SYSUNINIT(ip, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, ip_destroy, NULL);
 #endif
 
 #ifdef	RSS
 /*
  * IP direct input routine.
  *
  * This is called when reinjecting completed fragments where
  * all of the previous checking and book-keeping has been done.
  */
 void
 ip_direct_input(struct mbuf *m)
 {
 	struct ip *iph = mtod(m, struct ip *);
 	int hdrlen = iph->ip_hl << 2;	/* header length in bytes */
 
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	/* Stop here when IPsec input processing returns non-zero. */
 	if (IPSEC_ENABLED(ipv4) &&
 	    IPSEC_INPUT(ipv4, m, hdrlen, iph->ip_p) != 0)
 		return;
 #endif /* IPSEC */
 	IPSTAT_INC(ips_delivered);
 	/* Hand the packet to the protocol's input routine. */
 	(*inetsw[ip_protox[iph->ip_p]].pr_input)(&m, &hdrlen, iph->ip_p);
 }
 #endif
 
 /*
  * Ip input routine.  Checksum and byte swap header.  If fragmented
  * try to reassemble.  Process options.  Pass to next level.
  */
 void
 ip_input(struct mbuf *m)
 {
 	struct ip *ip = NULL;
 	struct in_ifaddr *ia = NULL;
 	struct ifaddr *ifa;
 	struct ifnet *ifp;
 	int    checkif, hlen = 0;
 	uint16_t sum, ip_len;
 	int dchg = 0;				/* dest changed after fw */
 	struct in_addr odst;			/* original dst address */
 
 	M_ASSERTPKTHDR(m);
 
 	if (m->m_flags & M_FASTFWD_OURS) {
 		m->m_flags &= ~M_FASTFWD_OURS;
 		/* Set up some basics that will be used later. */
 		ip = mtod(m, struct ip *);
 		hlen = ip->ip_hl << 2;
 		ip_len = ntohs(ip->ip_len);
 		goto ours;
 	}
 
 	IPSTAT_INC(ips_total);
 
 	if (m->m_pkthdr.len < sizeof(struct ip))
 		goto tooshort;
 
 	if (m->m_len < sizeof (struct ip) &&
 	    (m = m_pullup(m, sizeof (struct ip))) == NULL) {
 		IPSTAT_INC(ips_toosmall);
 		return;
 	}
 	ip = mtod(m, struct ip *);
 
 	if (ip->ip_v != IPVERSION) {
 		IPSTAT_INC(ips_badvers);
 		goto bad;
 	}
 
 	hlen = ip->ip_hl << 2;
 	if (hlen < sizeof(struct ip)) {	/* minimum header length */
 		IPSTAT_INC(ips_badhlen);
 		goto bad;
 	}
 	if (hlen > m->m_len) {
 		if ((m = m_pullup(m, hlen)) == NULL) {
 			IPSTAT_INC(ips_badhlen);
 			return;
 		}
 		ip = mtod(m, struct ip *);
 	}
 
 	IP_PROBE(receive, NULL, NULL, ip, m->m_pkthdr.rcvif, ip, NULL);
 
 	/* 127/8 must not appear on wire - RFC1122 */
 	ifp = m->m_pkthdr.rcvif;
 	if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
 	    (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
 		if ((ifp->if_flags & IFF_LOOPBACK) == 0) {
 			IPSTAT_INC(ips_badaddr);
 			goto bad;
 		}
 	}
 
 	if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) {
 		sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID);
 	} else {
 		if (hlen == sizeof(struct ip)) {
 			sum = in_cksum_hdr(ip);
 		} else {
 			sum = in_cksum(m, hlen);
 		}
 	}
 	if (sum) {
 		IPSTAT_INC(ips_badsum);
 		goto bad;
 	}
 
 #ifdef ALTQ
 	if (altq_input != NULL && (*altq_input)(m, AF_INET) == 0)
 		/* packet is dropped by traffic conditioner */
 		return;
 #endif
 
 	ip_len = ntohs(ip->ip_len);
 	if (ip_len < hlen) {
 		IPSTAT_INC(ips_badlen);
 		goto bad;
 	}
 
 	/*
 	 * Check that the amount of data in the buffers
 	 * is as at least much as the IP header would have us expect.
 	 * Trim mbufs if longer than we expect.
 	 * Drop packet if shorter than we expect.
 	 */
 	if (m->m_pkthdr.len < ip_len) {
 tooshort:
 		IPSTAT_INC(ips_tooshort);
 		goto bad;
 	}
 	if (m->m_pkthdr.len > ip_len) {
 		if (m->m_len == m->m_pkthdr.len) {
 			m->m_len = ip_len;
 			m->m_pkthdr.len = ip_len;
 		} else
 			m_adj(m, ip_len - m->m_pkthdr.len);
 	}
 
 	/*
 	 * Try to forward the packet, but if we fail continue.
 	 * ip_tryforward() does inbound and outbound packet firewall
 	 * processing. If firewall has decided that destination becomes
 	 * our local address, it sets M_FASTFWD_OURS flag. In this
 	 * case skip another inbound firewall processing and update
 	 * ip pointer.
 	 */
 	if (V_ipforwarding != 0
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	    && (!IPSEC_ENABLED(ipv4) ||
 	    IPSEC_CAPS(ipv4, m, IPSEC_CAP_OPERABLE) == 0)
 #endif
 	    ) {
 		if ((m = ip_tryforward(m)) == NULL)
 			return;
 		if (m->m_flags & M_FASTFWD_OURS) {
 			m->m_flags &= ~M_FASTFWD_OURS;
 			ip = mtod(m, struct ip *);
 			goto ours;
 		}
 	}
 
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	/*
 	 * Bypass packet filtering for packets previously handled by IPsec.
 	 */
 	if (IPSEC_ENABLED(ipv4) &&
 	    IPSEC_CAPS(ipv4, m, IPSEC_CAP_BYPASS_FILTER) != 0)
 			goto passin;
 #endif
 
 	/*
 	 * Run through list of hooks for input packets.
 	 *
 	 * NB: Beware of the destination address changing (e.g.
 	 *     by NAT rewriting).  When this happens, tell
 	 *     ip_forward to do the right thing.
 	 */
 
 	/* Jump over all PFIL processing if hooks are not active. */
 	if (!PFIL_HOOKED(&V_inet_pfil_hook))
 		goto passin;
 
 	odst = ip->ip_dst;
 	if (pfil_run_hooks(&V_inet_pfil_hook, &m, ifp, PFIL_IN, 0, NULL) != 0)
 		return;
 	if (m == NULL)			/* consumed by filter */
 		return;
 
 	ip = mtod(m, struct ip *);
 	dchg = (odst.s_addr != ip->ip_dst.s_addr);
 	ifp = m->m_pkthdr.rcvif;
 
 	if (m->m_flags & M_FASTFWD_OURS) {
 		m->m_flags &= ~M_FASTFWD_OURS;
 		goto ours;
 	}
 	if (m->m_flags & M_IP_NEXTHOP) {
 		if (m_tag_find(m, PACKET_TAG_IPFORWARD, NULL) != NULL) {
 			/*
 			 * Directly ship the packet on.  This allows
 			 * forwarding packets originally destined to us
 			 * to some other directly connected host.
 			 */
 			ip_forward(m, 1);
 			return;
 		}
 	}
 passin:
 
 	/*
 	 * Process options and, if not destined for us,
 	 * ship it on.  ip_dooptions returns 1 when an
 	 * error was detected (causing an icmp message
 	 * to be sent and the original packet to be freed).
 	 */
 	if (hlen > sizeof (struct ip) && ip_dooptions(m, 0))
 		return;
 
         /* greedy RSVP, snatches any PATH packet of the RSVP protocol and no
          * matter if it is destined to another node, or whether it is 
          * a multicast one, RSVP wants it! and prevents it from being forwarded
          * anywhere else. Also checks if the rsvp daemon is running before
 	 * grabbing the packet.
          */
 	if (V_rsvp_on && ip->ip_p==IPPROTO_RSVP) 
 		goto ours;
 
 	/*
 	 * Check our list of addresses, to see if the packet is for us.
 	 * If we don't have any addresses, assume any unicast packet
 	 * we receive might be for us (and let the upper layers deal
 	 * with it).
 	 */
 	if (CK_STAILQ_EMPTY(&V_in_ifaddrhead) &&
 	    (m->m_flags & (M_MCAST|M_BCAST)) == 0)
 		goto ours;
 
 	/*
 	 * Enable a consistency check between the destination address
 	 * and the arrival interface for a unicast packet (the RFC 1122
 	 * strong ES model) if IP forwarding is disabled and the packet
 	 * is not locally generated and the packet is not subject to
 	 * 'ipfw fwd'.
 	 *
 	 * XXX - Checking also should be disabled if the destination
 	 * address is ipnat'ed to a different interface.
 	 *
 	 * XXX - Checking is incompatible with IP aliases added
 	 * to the loopback interface instead of the interface where
 	 * the packets are received.
 	 *
 	 * XXX - This is the case for carp vhost IPs as well so we
 	 * insert a workaround. If the packet got here, we already
 	 * checked with carp_iamatch() and carp_forus().
 	 */
 	checkif = V_ip_checkinterface && (V_ipforwarding == 0) && 
 	    ifp != NULL && ((ifp->if_flags & IFF_LOOPBACK) == 0) &&
 	    ifp->if_carp == NULL && (dchg == 0);
 
 	/*
 	 * Check for exact addresses in the hash bucket.
 	 */
 	/* IN_IFADDR_RLOCK(); */
 	LIST_FOREACH(ia, INADDR_HASH(ip->ip_dst.s_addr), ia_hash) {
 		/*
 		 * If the address matches, verify that the packet
 		 * arrived via the correct interface if checking is
 		 * enabled.
 		 */
 		if (IA_SIN(ia)->sin_addr.s_addr == ip->ip_dst.s_addr && 
 		    (!checkif || ia->ia_ifp == ifp)) {
 			counter_u64_add(ia->ia_ifa.ifa_ipackets, 1);
 			counter_u64_add(ia->ia_ifa.ifa_ibytes,
 			    m->m_pkthdr.len);
 			/* IN_IFADDR_RUNLOCK(); */
 			goto ours;
 		}
 	}
 	/* IN_IFADDR_RUNLOCK(); */
 
 	/*
 	 * Check for broadcast addresses.
 	 *
 	 * Only accept broadcast packets that arrive via the matching
 	 * interface.  Reception of forwarded directed broadcasts would
 	 * be handled via ip_forward() and ether_output() with the loopback
 	 * into the stack for SIMPLEX interfaces handled by ether_output().
 	 */
 	if (ifp != NULL && ifp->if_flags & IFF_BROADCAST) {
 		IF_ADDR_RLOCK(ifp);
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != AF_INET)
 				continue;
 			ia = ifatoia(ifa);
 			if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr ==
 			    ip->ip_dst.s_addr) {
 				counter_u64_add(ia->ia_ifa.ifa_ipackets, 1);
 				counter_u64_add(ia->ia_ifa.ifa_ibytes,
 				    m->m_pkthdr.len);
 				IF_ADDR_RUNLOCK(ifp);
 				goto ours;
 			}
 #ifdef BOOTP_COMPAT
 			if (IA_SIN(ia)->sin_addr.s_addr == INADDR_ANY) {
 				counter_u64_add(ia->ia_ifa.ifa_ipackets, 1);
 				counter_u64_add(ia->ia_ifa.ifa_ibytes,
 				    m->m_pkthdr.len);
 				IF_ADDR_RUNLOCK(ifp);
 				goto ours;
 			}
 #endif
 		}
 		IF_ADDR_RUNLOCK(ifp);
 		ia = NULL;
 	}
 	/* RFC 3927 2.7: Do not forward datagrams for 169.254.0.0/16. */
 	if (IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr))) {
 		IPSTAT_INC(ips_cantforward);
 		m_freem(m);
 		return;
 	}
 	if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
 		if (V_ip_mrouter) {
 			/*
 			 * If we are acting as a multicast router, all
 			 * incoming multicast packets are passed to the
 			 * kernel-level multicast forwarding function.
 			 * The packet is returned (relatively) intact; if
 			 * ip_mforward() returns a non-zero value, the packet
 			 * must be discarded, else it may be accepted below.
 			 */
 			if (ip_mforward && ip_mforward(ip, ifp, m, 0) != 0) {
 				IPSTAT_INC(ips_cantforward);
 				m_freem(m);
 				return;
 			}
 
 			/*
 			 * The process-level routing daemon needs to receive
 			 * all multicast IGMP packets, whether or not this
 			 * host belongs to their destination groups.
 			 */
 			if (ip->ip_p == IPPROTO_IGMP)
 				goto ours;
 			IPSTAT_INC(ips_forward);
 		}
 		/*
 		 * Assume the packet is for us, to avoid prematurely taking
 		 * a lock on the in_multi hash. Protocols must perform
 		 * their own filtering and update statistics accordingly.
 		 */
 		goto ours;
 	}
 	if (ip->ip_dst.s_addr == (u_long)INADDR_BROADCAST)
 		goto ours;
 	if (ip->ip_dst.s_addr == INADDR_ANY)
 		goto ours;
 
 	/*
 	 * Not for us; forward if possible and desirable.
 	 */
 	if (V_ipforwarding == 0) {
 		IPSTAT_INC(ips_cantforward);
 		m_freem(m);
 	} else {
 		ip_forward(m, dchg);
 	}
 	return;
 
 ours:
 #ifdef IPSTEALTH
 	/*
 	 * IPSTEALTH: Process non-routing options only
 	 * if the packet is destined for us.
 	 */
 	if (V_ipstealth && hlen > sizeof (struct ip) && ip_dooptions(m, 1))
 		return;
 #endif /* IPSTEALTH */
 
 	/*
 	 * Attempt reassembly; if it succeeds, proceed.
 	 * ip_reass() will return a different mbuf.
 	 */
 	if (ip->ip_off & htons(IP_MF | IP_OFFMASK)) {
 		/* XXXGL: shouldn't we save & set m_flags? */
 		m = ip_reass(m);
 		if (m == NULL)
 			return;
 		ip = mtod(m, struct ip *);
 		/* Get the header length of the reassembled packet */
 		hlen = ip->ip_hl << 2;
 	}
 
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	if (IPSEC_ENABLED(ipv4)) {
 		if (IPSEC_INPUT(ipv4, m, hlen, ip->ip_p) != 0)
 			return;
 	}
 #endif /* IPSEC */
 
 	/*
 	 * Switch out to protocol's input routine.
 	 */
 	IPSTAT_INC(ips_delivered);
 
 	(*inetsw[ip_protox[ip->ip_p]].pr_input)(&m, &hlen, ip->ip_p);
 	return;
 bad:
 	m_freem(m);
 }
 
 /*
  * IP timer processing;
  * if a timer expires on a reassembly
  * queue, discard it.
  */
 void
 ip_slowtimo(void)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 
 	VNET_LIST_RLOCK_NOSLEEP();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
 		ipreass_slowtimo();
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK_NOSLEEP();
 }
 
 void
 ip_drain(void)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 
 	VNET_LIST_RLOCK_NOSLEEP();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
 		ipreass_drain();
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK_NOSLEEP();
 }
 
 /*
  * The protocol to be inserted into ip_protox[] must be already registered
  * in inetsw[], either statically or through pf_proto_register().
  */
 int
 ipproto_register(short ipproto)
 {
 	struct protosw *pr;
 
 	/* Sanity checks. */
 	if (ipproto <= 0 || ipproto >= IPPROTO_MAX)
 		return (EPROTONOSUPPORT);
 
 	/*
 	 * The protocol slot must not be occupied by another protocol
 	 * already.  An index pointing to IPPROTO_RAW is unused.
 	 */
 	pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW);
 	if (pr == NULL)
 		return (EPFNOSUPPORT);
 	if (ip_protox[ipproto] != pr - inetsw)	/* IPPROTO_RAW */
 		return (EEXIST);
 
 	/* Find the protocol position in inetsw[] and set the index. */
 	for (pr = inetdomain.dom_protosw;
 	     pr < inetdomain.dom_protoswNPROTOSW; pr++) {
 		if (pr->pr_domain->dom_family == PF_INET &&
 		    pr->pr_protocol && pr->pr_protocol == ipproto) {
 			ip_protox[pr->pr_protocol] = pr - inetsw;
 			return (0);
 		}
 	}
 	return (EPROTONOSUPPORT);
 }
 
 int
 ipproto_unregister(short ipproto)
 {
 	struct protosw *pr;
 
 	/* Sanity checks. */
 	if (ipproto <= 0 || ipproto >= IPPROTO_MAX)
 		return (EPROTONOSUPPORT);
 
 	/* Check if the protocol was indeed registered. */
 	pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW);
 	if (pr == NULL)
 		return (EPFNOSUPPORT);
 	if (ip_protox[ipproto] == pr - inetsw)  /* IPPROTO_RAW */
 		return (ENOENT);
 
 	/* Reset the protocol slot to IPPROTO_RAW. */
 	ip_protox[ipproto] = pr - inetsw;
 	return (0);
 }
 
 u_char inetctlerrmap[PRC_NCMDS] = {
 	0,		0,		0,		0,
 	0,		EMSGSIZE,	EHOSTDOWN,	EHOSTUNREACH,
 	EHOSTUNREACH,	EHOSTUNREACH,	ECONNREFUSED,	ECONNREFUSED,
 	EMSGSIZE,	EHOSTUNREACH,	0,		0,
 	0,		0,		EHOSTUNREACH,	0,
 	ENOPROTOOPT,	ECONNREFUSED
 };
 
 /*
  * Forward a packet.  If some error occurs return the sender
  * an icmp packet.  Note we can't always generate a meaningful
  * icmp message because icmp doesn't have a large enough repertoire
  * of codes and types.
  *
  * If not forwarding, just drop the packet.  This could be confusing
  * if ipforwarding was zero but some routing protocol was advancing
  * us as a gateway to somewhere.  However, we must let the routing
  * protocol deal with that.
  *
  * The srcrt parameter indicates whether the packet is being forwarded
  * via a source route.
  */
 void
 ip_forward(struct mbuf *m, int srcrt)
 {
 	struct ip *ip = mtod(m, struct ip *);
 	struct in_ifaddr *ia;
 	struct mbuf *mcopy;
 	struct sockaddr_in *sin;
 	struct in_addr dest;
 	struct route ro;
 	int error, type = 0, code = 0, mtu = 0;
 
 	if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(ip->ip_dst) == 0) {
 		IPSTAT_INC(ips_cantforward);
 		m_freem(m);
 		return;
 	}
 	if (
 #ifdef IPSTEALTH
 	    V_ipstealth == 0 &&
 #endif
 	    ip->ip_ttl <= IPTTLDEC) {
 		icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, 0, 0);
 		return;
 	}
 
 	bzero(&ro, sizeof(ro));
 	sin = (struct sockaddr_in *)&ro.ro_dst;
 	sin->sin_family = AF_INET;
 	sin->sin_len = sizeof(*sin);
 	sin->sin_addr = ip->ip_dst;
 #ifdef RADIX_MPATH
 	rtalloc_mpath_fib(&ro,
 	    ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr),
 	    M_GETFIB(m));
 #else
 	in_rtalloc_ign(&ro, 0, M_GETFIB(m));
 #endif
 	NET_EPOCH_ENTER();
 	if (ro.ro_rt != NULL) {
 		ia = ifatoia(ro.ro_rt->rt_ifa);
 	} else
 		ia = NULL;
 	/*
 	 * Save the IP header and at most 8 bytes of the payload,
 	 * in case we need to generate an ICMP message to the src.
 	 *
 	 * XXX this can be optimized a lot by saving the data in a local
 	 * buffer on the stack (72 bytes at most), and only allocating the
 	 * mbuf if really necessary. The vast majority of the packets
 	 * are forwarded without having to send an ICMP back (either
 	 * because unnecessary, or because rate limited), so we are
 	 * really we are wasting a lot of work here.
 	 *
 	 * We don't use m_copym() because it might return a reference
 	 * to a shared cluster. Both this function and ip_output()
 	 * assume exclusive access to the IP header in `m', so any
 	 * data in a cluster may change before we reach icmp_error().
 	 */
 	mcopy = m_gethdr(M_NOWAIT, m->m_type);
 	if (mcopy != NULL && !m_dup_pkthdr(mcopy, m, M_NOWAIT)) {
 		/*
 		 * It's probably ok if the pkthdr dup fails (because
 		 * the deep copy of the tag chain failed), but for now
 		 * be conservative and just discard the copy since
 		 * code below may some day want the tags.
 		 */
 		m_free(mcopy);
 		mcopy = NULL;
 	}
 	if (mcopy != NULL) {
 		mcopy->m_len = min(ntohs(ip->ip_len), M_TRAILINGSPACE(mcopy));
 		mcopy->m_pkthdr.len = mcopy->m_len;
 		m_copydata(m, 0, mcopy->m_len, mtod(mcopy, caddr_t));
 	}
 #ifdef IPSTEALTH
 	if (V_ipstealth == 0)
 #endif
 		ip->ip_ttl -= IPTTLDEC;
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	if (IPSEC_ENABLED(ipv4)) {
 		if ((error = IPSEC_FORWARD(ipv4, m)) != 0) {
 			/* mbuf consumed by IPsec */
 			m_freem(mcopy);
 			if (error != EINPROGRESS)
 				IPSTAT_INC(ips_cantforward);
 			goto out;
 		}
 		/* No IPsec processing required */
 	}
 #endif /* IPSEC */
 	/*
 	 * If forwarding packet using same interface that it came in on,
 	 * perhaps should send a redirect to sender to shortcut a hop.
 	 * Only send redirect if source is sending directly to us,
 	 * and if packet was not source routed (or has any options).
 	 * Also, don't send redirect if forwarding using a default route
 	 * or a route modified by a redirect.
 	 */
 	dest.s_addr = 0;
 	if (!srcrt && V_ipsendredirects &&
 	    ia != NULL && ia->ia_ifp == m->m_pkthdr.rcvif) {
 		struct rtentry *rt;
 
 		rt = ro.ro_rt;
 
 		if (rt && (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0 &&
 		    satosin(rt_key(rt))->sin_addr.s_addr != 0) {
 #define	RTA(rt)	((struct in_ifaddr *)(rt->rt_ifa))
 			u_long src = ntohl(ip->ip_src.s_addr);
 
 			if (RTA(rt) &&
 			    (src & RTA(rt)->ia_subnetmask) == RTA(rt)->ia_subnet) {
 				if (rt->rt_flags & RTF_GATEWAY)
 					dest.s_addr = satosin(rt->rt_gateway)->sin_addr.s_addr;
 				else
 					dest.s_addr = ip->ip_dst.s_addr;
 				/* Router requirements says to only send host redirects */
 				type = ICMP_REDIRECT;
 				code = ICMP_REDIRECT_HOST;
 			}
 		}
 	}
 
 	error = ip_output(m, NULL, &ro, IP_FORWARDING, NULL, NULL);
 
 	if (error == EMSGSIZE && ro.ro_rt)
 		mtu = ro.ro_rt->rt_mtu;
 	RO_RTFREE(&ro);
 
 	if (error)
 		IPSTAT_INC(ips_cantforward);
 	else {
 		IPSTAT_INC(ips_forward);
 		if (type)
 			IPSTAT_INC(ips_redirectsent);
 		else {
 			if (mcopy)
 				m_freem(mcopy);
 			goto out;
 		}
 	}
 	if (mcopy == NULL)
 		goto out;
 
 
 	switch (error) {
 
 	case 0:				/* forwarded, but need redirect */
 		/* type, code set above */
 		break;
 
 	case ENETUNREACH:
 	case EHOSTUNREACH:
 	case ENETDOWN:
 	case EHOSTDOWN:
 	default:
 		type = ICMP_UNREACH;
 		code = ICMP_UNREACH_HOST;
 		break;
 
 	case EMSGSIZE:
 		type = ICMP_UNREACH;
 		code = ICMP_UNREACH_NEEDFRAG;
 		/*
 		 * If the MTU was set before make sure we are below the
 		 * interface MTU.
 		 * If the MTU wasn't set before use the interface mtu or
 		 * fall back to the next smaller mtu step compared to the
 		 * current packet size.
 		 */
 		if (mtu != 0) {
 			if (ia != NULL)
 				mtu = min(mtu, ia->ia_ifp->if_mtu);
 		} else {
 			if (ia != NULL)
 				mtu = ia->ia_ifp->if_mtu;
 			else
 				mtu = ip_next_mtu(ntohs(ip->ip_len), 0);
 		}
 		IPSTAT_INC(ips_cantfrag);
 		break;
 
 	case ENOBUFS:
 	case EACCES:			/* ipfw denied packet */
 		m_freem(mcopy);
 		goto out;
 	}
 	icmp_error(mcopy, type, code, dest.s_addr, mtu);
  out:
 	NET_EPOCH_EXIT();
 }
 
 #define	CHECK_SO_CT(sp, ct) \
     (((sp->so_options & SO_TIMESTAMP) && (sp->so_ts_clock == ct)) ? 1 : 0)
 
 void
 ip_savecontrol(struct inpcb *inp, struct mbuf **mp, struct ip *ip,
     struct mbuf *m)
 {
 	bool stamped;
 
 	stamped = false;
 	if ((inp->inp_socket->so_options & SO_BINTIME) ||
 	    CHECK_SO_CT(inp->inp_socket, SO_TS_BINTIME)) {
 		struct bintime boottimebin, bt;
 		struct timespec ts1;
 
 		if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR |
 		    M_TSTMP)) {
 			mbuf_tstmp2timespec(m, &ts1);
 			timespec2bintime(&ts1, &bt);
 			getboottimebin(&boottimebin);
 			bintime_add(&bt, &boottimebin);
 		} else {
 			bintime(&bt);
 		}
 		*mp = sbcreatecontrol((caddr_t)&bt, sizeof(bt),
 		    SCM_BINTIME, SOL_SOCKET);
 		if (*mp != NULL) {
 			mp = &(*mp)->m_next;
 			stamped = true;
 		}
 	}
 	if (CHECK_SO_CT(inp->inp_socket, SO_TS_REALTIME_MICRO)) {
 		struct bintime boottimebin, bt1;
 		struct timespec ts1;;
 		struct timeval tv;
 
 		if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR |
 		    M_TSTMP)) {
 			mbuf_tstmp2timespec(m, &ts1);
 			timespec2bintime(&ts1, &bt1);
 			getboottimebin(&boottimebin);
 			bintime_add(&bt1, &boottimebin);
 			bintime2timeval(&bt1, &tv);
 		} else {
 			microtime(&tv);
 		}
 		*mp = sbcreatecontrol((caddr_t)&tv, sizeof(tv),
 		    SCM_TIMESTAMP, SOL_SOCKET);
 		if (*mp != NULL) {
 			mp = &(*mp)->m_next;
 			stamped = true;
 		}
 	} else if (CHECK_SO_CT(inp->inp_socket, SO_TS_REALTIME)) {
 		struct bintime boottimebin;
 		struct timespec ts, ts1;
 
 		if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR |
 		    M_TSTMP)) {
 			mbuf_tstmp2timespec(m, &ts);
 			getboottimebin(&boottimebin);
 			bintime2timespec(&boottimebin, &ts1);
-			timespecadd(&ts, &ts1);
+			timespecadd(&ts, &ts1, &ts);
 		} else {
 			nanotime(&ts);
 		}
 		*mp = sbcreatecontrol((caddr_t)&ts, sizeof(ts),
 		    SCM_REALTIME, SOL_SOCKET);
 		if (*mp != NULL) {
 			mp = &(*mp)->m_next;
 			stamped = true;
 		}
 	} else if (CHECK_SO_CT(inp->inp_socket, SO_TS_MONOTONIC)) {
 		struct timespec ts;
 
 		if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR |
 		    M_TSTMP))
 			mbuf_tstmp2timespec(m, &ts);
 		else
 			nanouptime(&ts);
 		*mp = sbcreatecontrol((caddr_t)&ts, sizeof(ts),
 		    SCM_MONOTONIC, SOL_SOCKET);
 		if (*mp != NULL) {
 			mp = &(*mp)->m_next;
 			stamped = true;
 		}
 	}
 	if (stamped && (m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR |
 	    M_TSTMP)) {
 		struct sock_timestamp_info sti;
 
 		bzero(&sti, sizeof(sti));
 		sti.st_info_flags = ST_INFO_HW;
 		if ((m->m_flags & M_TSTMP_HPREC) != 0)
 			sti.st_info_flags |= ST_INFO_HW_HPREC;
 		*mp = sbcreatecontrol((caddr_t)&sti, sizeof(sti), SCM_TIME_INFO,
 		    SOL_SOCKET);
 		if (*mp != NULL)
 			mp = &(*mp)->m_next;
 	}
 	if (inp->inp_flags & INP_RECVDSTADDR) {
 		*mp = sbcreatecontrol((caddr_t)&ip->ip_dst,
 		    sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 	if (inp->inp_flags & INP_RECVTTL) {
 		*mp = sbcreatecontrol((caddr_t)&ip->ip_ttl,
 		    sizeof(u_char), IP_RECVTTL, IPPROTO_IP);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 #ifdef notyet
 	/* XXX
 	 * Moving these out of udp_input() made them even more broken
 	 * than they already were.
 	 */
 	/* options were tossed already */
 	if (inp->inp_flags & INP_RECVOPTS) {
 		*mp = sbcreatecontrol((caddr_t)opts_deleted_above,
 		    sizeof(struct in_addr), IP_RECVOPTS, IPPROTO_IP);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 	/* ip_srcroute doesn't do what we want here, need to fix */
 	if (inp->inp_flags & INP_RECVRETOPTS) {
 		*mp = sbcreatecontrol((caddr_t)ip_srcroute(m),
 		    sizeof(struct in_addr), IP_RECVRETOPTS, IPPROTO_IP);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 #endif
 	if (inp->inp_flags & INP_RECVIF) {
 		struct ifnet *ifp;
 		struct sdlbuf {
 			struct sockaddr_dl sdl;
 			u_char	pad[32];
 		} sdlbuf;
 		struct sockaddr_dl *sdp;
 		struct sockaddr_dl *sdl2 = &sdlbuf.sdl;
 
 		if ((ifp = m->m_pkthdr.rcvif) &&
 		    ifp->if_index && ifp->if_index <= V_if_index) {
 			sdp = (struct sockaddr_dl *)ifp->if_addr->ifa_addr;
 			/*
 			 * Change our mind and don't try copy.
 			 */
 			if (sdp->sdl_family != AF_LINK ||
 			    sdp->sdl_len > sizeof(sdlbuf)) {
 				goto makedummy;
 			}
 			bcopy(sdp, sdl2, sdp->sdl_len);
 		} else {
 makedummy:	
 			sdl2->sdl_len =
 			    offsetof(struct sockaddr_dl, sdl_data[0]);
 			sdl2->sdl_family = AF_LINK;
 			sdl2->sdl_index = 0;
 			sdl2->sdl_nlen = sdl2->sdl_alen = sdl2->sdl_slen = 0;
 		}
 		*mp = sbcreatecontrol((caddr_t)sdl2, sdl2->sdl_len,
 		    IP_RECVIF, IPPROTO_IP);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 	if (inp->inp_flags & INP_RECVTOS) {
 		*mp = sbcreatecontrol((caddr_t)&ip->ip_tos,
 		    sizeof(u_char), IP_RECVTOS, IPPROTO_IP);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 
 	if (inp->inp_flags2 & INP_RECVFLOWID) {
 		uint32_t flowid, flow_type;
 
 		flowid = m->m_pkthdr.flowid;
 		flow_type = M_HASHTYPE_GET(m);
 
 		/*
 		 * XXX should handle the failure of one or the
 		 * other - don't populate both?
 		 */
 		*mp = sbcreatecontrol((caddr_t) &flowid,
 		    sizeof(uint32_t), IP_FLOWID, IPPROTO_IP);
 		if (*mp)
 			mp = &(*mp)->m_next;
 		*mp = sbcreatecontrol((caddr_t) &flow_type,
 		    sizeof(uint32_t), IP_FLOWTYPE, IPPROTO_IP);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 
 #ifdef	RSS
 	if (inp->inp_flags2 & INP_RECVRSSBUCKETID) {
 		uint32_t flowid, flow_type;
 		uint32_t rss_bucketid;
 
 		flowid = m->m_pkthdr.flowid;
 		flow_type = M_HASHTYPE_GET(m);
 
 		if (rss_hash2bucket(flowid, flow_type, &rss_bucketid) == 0) {
 			*mp = sbcreatecontrol((caddr_t) &rss_bucketid,
 			   sizeof(uint32_t), IP_RSSBUCKETID, IPPROTO_IP);
 			if (*mp)
 				mp = &(*mp)->m_next;
 		}
 	}
 #endif
 }
 
 /*
  * XXXRW: Multicast routing code in ip_mroute.c is generally MPSAFE, but the
  * ip_rsvp and ip_rsvp_on variables need to be interlocked with rsvp_on
  * locking.  This code remains in ip_input.c as ip_mroute.c is optionally
  * compiled.
  */
 VNET_DEFINE_STATIC(int, ip_rsvp_on);
 VNET_DEFINE(struct socket *, ip_rsvpd);
 
 #define	V_ip_rsvp_on		VNET(ip_rsvp_on)
 
 int
 ip_rsvp_init(struct socket *so)
 {
 
 	if (so->so_type != SOCK_RAW ||
 	    so->so_proto->pr_protocol != IPPROTO_RSVP)
 		return EOPNOTSUPP;
 
 	if (V_ip_rsvpd != NULL)
 		return EADDRINUSE;
 
 	V_ip_rsvpd = so;
 	/*
 	 * This may seem silly, but we need to be sure we don't over-increment
 	 * the RSVP counter, in case something slips up.
 	 */
 	if (!V_ip_rsvp_on) {
 		V_ip_rsvp_on = 1;
 		V_rsvp_on++;
 	}
 
 	return 0;
 }
 
 int
 ip_rsvp_done(void)
 {
 
 	V_ip_rsvpd = NULL;
 	/*
 	 * This may seem silly, but we need to be sure we don't over-decrement
 	 * the RSVP counter, in case something slips up.
 	 */
 	if (V_ip_rsvp_on) {
 		V_ip_rsvp_on = 0;
 		V_rsvp_on--;
 	}
 	return 0;
 }
 
 int
 rsvp_input(struct mbuf **mp, int *offp, int proto)
 {
 	struct mbuf *m;
 
 	m = *mp;
 	*mp = NULL;
 
 	if (rsvp_input_p) { /* call the real one if loaded */
 		*mp = m;
 		rsvp_input_p(mp, offp, proto);
 		return (IPPROTO_DONE);
 	}
 
 	/* Can still get packets with rsvp_on = 0 if there is a local member
 	 * of the group to which the RSVP packet is addressed.  But in this
 	 * case we want to throw the packet away.
 	 */
 	
 	if (!V_rsvp_on) {
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 
 	if (V_ip_rsvpd != NULL) { 
 		*mp = m;
 		rip_input(mp, offp, proto);
 		return (IPPROTO_DONE);
 	}
 	/* Drop the packet */
 	m_freem(m);
 	return (IPPROTO_DONE);
 }
Index: head/sys/netinet6/ip6_input.c
===================================================================
--- head/sys/netinet6/ip6_input.c	(revision 336913)
+++ head/sys/netinet6/ip6_input.c	(revision 336914)
@@ -1,1862 +1,1862 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: ip6_input.c,v 1.259 2002/01/21 04:58:09 jinmei Exp $
  */
 
 /*-
  * Copyright (c) 1982, 1986, 1988, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ip_input.c	8.2 (Berkeley) 1/4/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 #include "opt_route.h"
 #include "opt_rss.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/hhook.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/proc.h>
 #include <sys/domain.h>
 #include <sys/protosw.h>
 #include <sys/sdt.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/errno.h>
 #include <sys/time.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/rmlock.h>
 #include <sys/syslog.h>
 #include <sys/sysctl.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_types.h>
 #include <net/if_dl.h>
 #include <net/route.h>
 #include <net/netisr.h>
 #include <net/rss_config.h>
 #include <net/pfil.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/ip_var.h>
 #include <netinet/in_systm.h>
 #include <net/if_llatbl.h>
 #ifdef INET
 #include <netinet/ip.h>
 #include <netinet/ip_icmp.h>
 #endif /* INET */
 #include <netinet/ip6.h>
 #include <netinet6/in6_var.h>
 #include <netinet6/ip6_var.h>
 #include <netinet/in_pcb.h>
 #include <netinet/icmp6.h>
 #include <netinet6/scope6_var.h>
 #include <netinet6/in6_ifattach.h>
 #include <netinet6/mld6_var.h>
 #include <netinet6/nd6.h>
 #include <netinet6/in6_rss.h>
 
 #include <netipsec/ipsec_support.h>
 
 #include <netinet6/ip6protosw.h>
 
 extern struct domain inet6domain;
 
 u_char ip6_protox[IPPROTO_MAX];
 VNET_DEFINE(struct in6_ifaddrhead, in6_ifaddrhead);
 VNET_DEFINE(struct in6_ifaddrlisthead *, in6_ifaddrhashtbl);
 VNET_DEFINE(u_long, in6_ifaddrhmask);
 
 static struct netisr_handler ip6_nh = {
 	.nh_name = "ip6",
 	.nh_handler = ip6_input,
 	.nh_proto = NETISR_IPV6,
 #ifdef RSS
 	.nh_m2cpuid = rss_soft_m2cpuid_v6,
 	.nh_policy = NETISR_POLICY_CPU,
 	.nh_dispatch = NETISR_DISPATCH_HYBRID,
 #else
 	.nh_policy = NETISR_POLICY_FLOW,
 #endif
 };
 
 static int
 sysctl_netinet6_intr_queue_maxlen(SYSCTL_HANDLER_ARGS)
 {
 	int error, qlimit;
 
 	netisr_getqlimit(&ip6_nh, &qlimit);
 	error = sysctl_handle_int(oidp, &qlimit, 0, req);
 	if (error || !req->newptr)
 		return (error);
 	if (qlimit < 1)
 		return (EINVAL);
 	return (netisr_setqlimit(&ip6_nh, qlimit));
 }
 SYSCTL_DECL(_net_inet6_ip6);
 SYSCTL_PROC(_net_inet6_ip6, IPV6CTL_INTRQMAXLEN, intr_queue_maxlen,
     CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_netinet6_intr_queue_maxlen, "I",
     "Maximum size of the IPv6 input queue");
 
 #ifdef RSS
 static struct netisr_handler ip6_direct_nh = {
 	.nh_name = "ip6_direct",
 	.nh_handler = ip6_direct_input,
 	.nh_proto = NETISR_IPV6_DIRECT,
 	.nh_m2cpuid = rss_soft_m2cpuid_v6,
 	.nh_policy = NETISR_POLICY_CPU,
 	.nh_dispatch = NETISR_DISPATCH_HYBRID,
 };
 
 static int
 sysctl_netinet6_intr_direct_queue_maxlen(SYSCTL_HANDLER_ARGS)
 {
 	int error, qlimit;
 
 	netisr_getqlimit(&ip6_direct_nh, &qlimit);
 	error = sysctl_handle_int(oidp, &qlimit, 0, req);
 	if (error || !req->newptr)
 		return (error);
 	if (qlimit < 1)
 		return (EINVAL);
 	return (netisr_setqlimit(&ip6_direct_nh, qlimit));
 }
 SYSCTL_PROC(_net_inet6_ip6, IPV6CTL_INTRDQMAXLEN, intr_direct_queue_maxlen,
     CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_netinet6_intr_direct_queue_maxlen,
     "I", "Maximum size of the IPv6 direct input queue");
 
 #endif
 
 VNET_DEFINE(struct pfil_head, inet6_pfil_hook);
 
 VNET_PCPUSTAT_DEFINE(struct ip6stat, ip6stat);
 VNET_PCPUSTAT_SYSINIT(ip6stat);
 #ifdef VIMAGE
 VNET_PCPUSTAT_SYSUNINIT(ip6stat);
 #endif /* VIMAGE */
 
 struct rmlock in6_ifaddr_lock;
 RM_SYSINIT(in6_ifaddr_lock, &in6_ifaddr_lock, "in6_ifaddr_lock");
 
 static int ip6_hopopts_input(u_int32_t *, u_int32_t *, struct mbuf **, int *);
 #ifdef PULLDOWN_TEST
 static struct mbuf *ip6_pullexthdr(struct mbuf *, size_t, int);
 #endif
 
 /*
  * IP6 initialization: fill in IP6 protocol switch table.
  * All protocols not implemented in kernel go to raw IP6 protocol handler.
  */
 void
 ip6_init(void)
 {
 	struct protosw *pr;
 	int i;
 
 	TUNABLE_INT_FETCH("net.inet6.ip6.auto_linklocal",
 	    &V_ip6_auto_linklocal);
 	TUNABLE_INT_FETCH("net.inet6.ip6.accept_rtadv", &V_ip6_accept_rtadv);
 	TUNABLE_INT_FETCH("net.inet6.ip6.no_radr", &V_ip6_no_radr);
 
 	CK_STAILQ_INIT(&V_in6_ifaddrhead);
 	V_in6_ifaddrhashtbl = hashinit(IN6ADDR_NHASH, M_IFADDR,
 	    &V_in6_ifaddrhmask);
 
 	/* Initialize packet filter hooks. */
 	V_inet6_pfil_hook.ph_type = PFIL_TYPE_AF;
 	V_inet6_pfil_hook.ph_af = AF_INET6;
 	if ((i = pfil_head_register(&V_inet6_pfil_hook)) != 0)
 		printf("%s: WARNING: unable to register pfil hook, "
 			"error %d\n", __func__, i);
 
 	if (hhook_head_register(HHOOK_TYPE_IPSEC_IN, AF_INET6,
 	    &V_ipsec_hhh_in[HHOOK_IPSEC_INET6],
 	    HHOOK_WAITOK | HHOOK_HEADISINVNET) != 0)
 		printf("%s: WARNING: unable to register input helper hook\n",
 		    __func__);
 	if (hhook_head_register(HHOOK_TYPE_IPSEC_OUT, AF_INET6,
 	    &V_ipsec_hhh_out[HHOOK_IPSEC_INET6],
 	    HHOOK_WAITOK | HHOOK_HEADISINVNET) != 0)
 		printf("%s: WARNING: unable to register output helper hook\n",
 		    __func__);
 
 	scope6_init();
 	addrsel_policy_init();
 	nd6_init();
 	frag6_init();
 
 	V_ip6_desync_factor = arc4random() % MAX_TEMP_DESYNC_FACTOR;
 
 	/* Skip global initialization stuff for non-default instances. */
 #ifdef VIMAGE
 	if (!IS_DEFAULT_VNET(curvnet)) {
 		netisr_register_vnet(&ip6_nh);
 #ifdef RSS
 		netisr_register_vnet(&ip6_direct_nh);
 #endif
 		return;
 	}
 #endif
 
 	pr = pffindproto(PF_INET6, IPPROTO_RAW, SOCK_RAW);
 	if (pr == NULL)
 		panic("ip6_init");
 
 	/* Initialize the entire ip6_protox[] array to IPPROTO_RAW. */
 	for (i = 0; i < IPPROTO_MAX; i++)
 		ip6_protox[i] = pr - inet6sw;
 	/*
 	 * Cycle through IP protocols and put them into the appropriate place
 	 * in ip6_protox[].
 	 */
 	for (pr = inet6domain.dom_protosw;
 	    pr < inet6domain.dom_protoswNPROTOSW; pr++)
 		if (pr->pr_domain->dom_family == PF_INET6 &&
 		    pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW) {
 			/* Be careful to only index valid IP protocols. */
 			if (pr->pr_protocol < IPPROTO_MAX)
 				ip6_protox[pr->pr_protocol] = pr - inet6sw;
 		}
 
 	netisr_register(&ip6_nh);
 #ifdef RSS
 	netisr_register(&ip6_direct_nh);
 #endif
 }
 
 /*
  * The protocol to be inserted into ip6_protox[] must be already registered
  * in inet6sw[], either statically or through pf_proto_register().
  */
 int
 ip6proto_register(short ip6proto)
 {
 	struct protosw *pr;
 
 	/* Sanity checks. */
 	if (ip6proto <= 0 || ip6proto >= IPPROTO_MAX)
 		return (EPROTONOSUPPORT);
 
 	/*
 	 * The protocol slot must not be occupied by another protocol
 	 * already.  An index pointing to IPPROTO_RAW is unused.
 	 */
 	pr = pffindproto(PF_INET6, IPPROTO_RAW, SOCK_RAW);
 	if (pr == NULL)
 		return (EPFNOSUPPORT);
 	if (ip6_protox[ip6proto] != pr - inet6sw)	/* IPPROTO_RAW */
 		return (EEXIST);
 
 	/*
 	 * Find the protocol position in inet6sw[] and set the index.
 	 */
 	for (pr = inet6domain.dom_protosw;
 	    pr < inet6domain.dom_protoswNPROTOSW; pr++) {
 		if (pr->pr_domain->dom_family == PF_INET6 &&
 		    pr->pr_protocol && pr->pr_protocol == ip6proto) {
 			ip6_protox[pr->pr_protocol] = pr - inet6sw;
 			return (0);
 		}
 	}
 	return (EPROTONOSUPPORT);
 }
 
 int
 ip6proto_unregister(short ip6proto)
 {
 	struct protosw *pr;
 
 	/* Sanity checks. */
 	if (ip6proto <= 0 || ip6proto >= IPPROTO_MAX)
 		return (EPROTONOSUPPORT);
 
 	/* Check if the protocol was indeed registered. */
 	pr = pffindproto(PF_INET6, IPPROTO_RAW, SOCK_RAW);
 	if (pr == NULL)
 		return (EPFNOSUPPORT);
 	if (ip6_protox[ip6proto] == pr - inet6sw)	/* IPPROTO_RAW */
 		return (ENOENT);
 
 	/* Reset the protocol slot to IPPROTO_RAW. */
 	ip6_protox[ip6proto] = pr - inet6sw;
 	return (0);
 }
 
 #ifdef VIMAGE
 static void
 ip6_destroy(void *unused __unused)
 {
 	struct ifaddr *ifa, *nifa;
 	struct ifnet *ifp;
 	int error;
 
 #ifdef RSS
 	netisr_unregister_vnet(&ip6_direct_nh);
 #endif
 	netisr_unregister_vnet(&ip6_nh);
 
 	if ((error = pfil_head_unregister(&V_inet6_pfil_hook)) != 0)
 		printf("%s: WARNING: unable to unregister pfil hook, "
 		    "error %d\n", __func__, error);
 	error = hhook_head_deregister(V_ipsec_hhh_in[HHOOK_IPSEC_INET6]);
 	if (error != 0) {
 		printf("%s: WARNING: unable to deregister input helper hook "
 		    "type HHOOK_TYPE_IPSEC_IN, id HHOOK_IPSEC_INET6: "
 		    "error %d returned\n", __func__, error);
 	}
 	error = hhook_head_deregister(V_ipsec_hhh_out[HHOOK_IPSEC_INET6]);
 	if (error != 0) {
 		printf("%s: WARNING: unable to deregister output helper hook "
 		    "type HHOOK_TYPE_IPSEC_OUT, id HHOOK_IPSEC_INET6: "
 		    "error %d returned\n", __func__, error);
 	}
 
 	/* Cleanup addresses. */
 	IFNET_RLOCK();
 	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		/* Cannot lock here - lock recursion. */
 		/* IF_ADDR_LOCK(ifp); */
 		CK_STAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, nifa) {
 
 			if (ifa->ifa_addr->sa_family != AF_INET6)
 				continue;
 			in6_purgeaddr(ifa);
 		}
 		/* IF_ADDR_UNLOCK(ifp); */
 		in6_ifdetach_destroy(ifp);
 		mld_domifdetach(ifp);
 		/* Make sure any routes are gone as well. */
 		rt_flushifroutes_af(ifp, AF_INET6);
 	}
 	IFNET_RUNLOCK();
 
 	nd6_destroy();
 	in6_ifattach_destroy();
 
 	hashdestroy(V_in6_ifaddrhashtbl, M_IFADDR, V_in6_ifaddrhmask);
 }
 
 VNET_SYSUNINIT(inet6, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, ip6_destroy, NULL);
 #endif
 
 static int
 ip6_input_hbh(struct mbuf *m, uint32_t *plen, uint32_t *rtalert, int *off,
     int *nxt, int *ours)
 {
 	struct ip6_hdr *ip6;
 	struct ip6_hbh *hbh;
 
 	if (ip6_hopopts_input(plen, rtalert, &m, off)) {
 #if 0	/*touches NULL pointer*/
 		in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard);
 #endif
 		goto out;	/* m have already been freed */
 	}
 
 	/* adjust pointer */
 	ip6 = mtod(m, struct ip6_hdr *);
 
 	/*
 	 * if the payload length field is 0 and the next header field
 	 * indicates Hop-by-Hop Options header, then a Jumbo Payload
 	 * option MUST be included.
 	 */
 	if (ip6->ip6_plen == 0 && *plen == 0) {
 		/*
 		 * Note that if a valid jumbo payload option is
 		 * contained, ip6_hopopts_input() must set a valid
 		 * (non-zero) payload length to the variable plen.
 		 */
 		IP6STAT_INC(ip6s_badoptions);
 		in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard);
 		in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_hdrerr);
 		icmp6_error(m, ICMP6_PARAM_PROB,
 			    ICMP6_PARAMPROB_HEADER,
 			    (caddr_t)&ip6->ip6_plen - (caddr_t)ip6);
 		goto out;
 	}
 #ifndef PULLDOWN_TEST
 	/* ip6_hopopts_input() ensures that mbuf is contiguous */
 	hbh = (struct ip6_hbh *)(ip6 + 1);
 #else
 	IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m, sizeof(struct ip6_hdr),
 		sizeof(struct ip6_hbh));
 	if (hbh == NULL) {
 		IP6STAT_INC(ip6s_tooshort);
 		goto out;
 	}
 #endif
 	*nxt = hbh->ip6h_nxt;
 
 	/*
 	 * If we are acting as a router and the packet contains a
 	 * router alert option, see if we know the option value.
 	 * Currently, we only support the option value for MLD, in which
 	 * case we should pass the packet to the multicast routing
 	 * daemon.
 	 */
 	if (*rtalert != ~0) {
 		switch (*rtalert) {
 		case IP6OPT_RTALERT_MLD:
 			if (V_ip6_forwarding)
 				*ours = 1;
 			break;
 		default:
 			/*
 			 * RFC2711 requires unrecognized values must be
 			 * silently ignored.
 			 */
 			break;
 		}
 	}
 
 	return (0);
 
 out:
 	return (1);
 }
 
 #ifdef RSS
 /*
  * IPv6 direct input routine.
  *
  * This is called when reinjecting completed fragments where
  * all of the previous checking and book-keeping has been done.
  */
 void
 ip6_direct_input(struct mbuf *m)
 {
 	int off, nxt;
 	int nest;
 	struct m_tag *mtag;
 	struct ip6_direct_ctx *ip6dc;
 
 	mtag = m_tag_locate(m, MTAG_ABI_IPV6, IPV6_TAG_DIRECT, NULL);
 	KASSERT(mtag != NULL, ("Reinjected packet w/o direct ctx tag!"));
 
 	ip6dc = (struct ip6_direct_ctx *)(mtag + 1);
 	nxt = ip6dc->ip6dc_nxt;
 	off = ip6dc->ip6dc_off;
 
 	nest = 0;
 
 	m_tag_delete(m, mtag);
 
 	while (nxt != IPPROTO_DONE) {
 		if (V_ip6_hdrnestlimit && (++nest > V_ip6_hdrnestlimit)) {
 			IP6STAT_INC(ip6s_toomanyhdr);
 			goto bad;
 		}
 
 		/*
 		 * protection against faulty packet - there should be
 		 * more sanity checks in header chain processing.
 		 */
 		if (m->m_pkthdr.len < off) {
 			IP6STAT_INC(ip6s_tooshort);
 			in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_truncated);
 			goto bad;
 		}
 
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 		if (IPSEC_ENABLED(ipv6)) {
 			if (IPSEC_INPUT(ipv6, m, off, nxt) != 0)
 				return;
 		}
 #endif /* IPSEC */
 
 		nxt = (*inet6sw[ip6_protox[nxt]].pr_input)(&m, &off, nxt);
 	}
 	return;
 bad:
 	m_freem(m);
 }
 #endif
 
 void
 ip6_input(struct mbuf *m)
 {
 	struct in6_addr odst;
 	struct ip6_hdr *ip6;
 	struct in6_ifaddr *ia;
 	struct ifnet *rcvif;
 	u_int32_t plen;
 	u_int32_t rtalert = ~0;
 	int off = sizeof(struct ip6_hdr), nest;
 	int nxt, ours = 0;
 	int srcrt = 0;
 
 	/*
 	 * Drop the packet if IPv6 operation is disabled on the interface.
 	 */
 	rcvif = m->m_pkthdr.rcvif;
 	if ((ND_IFINFO(rcvif)->flags & ND6_IFF_IFDISABLED))
 		goto bad;
 
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	/*
 	 * should the inner packet be considered authentic?
 	 * see comment in ah4_input().
 	 * NB: m cannot be NULL when passed to the input routine
 	 */
 
 	m->m_flags &= ~M_AUTHIPHDR;
 	m->m_flags &= ~M_AUTHIPDGM;
 
 #endif /* IPSEC */
 
 	if (m->m_flags & M_FASTFWD_OURS) {
 		/*
 		 * Firewall changed destination to local.
 		 */
 		ip6 = mtod(m, struct ip6_hdr *);
 		goto passin;
 	}
 
 	/*
 	 * mbuf statistics
 	 */
 	if (m->m_flags & M_EXT) {
 		if (m->m_next)
 			IP6STAT_INC(ip6s_mext2m);
 		else
 			IP6STAT_INC(ip6s_mext1);
 	} else {
 		if (m->m_next) {
 			if (m->m_flags & M_LOOP) {
 				IP6STAT_INC(ip6s_m2m[V_loif->if_index]);
 			} else if (rcvif->if_index < IP6S_M2MMAX)
 				IP6STAT_INC(ip6s_m2m[rcvif->if_index]);
 			else
 				IP6STAT_INC(ip6s_m2m[0]);
 		} else
 			IP6STAT_INC(ip6s_m1);
 	}
 
 	in6_ifstat_inc(rcvif, ifs6_in_receive);
 	IP6STAT_INC(ip6s_total);
 
 #ifndef PULLDOWN_TEST
 	/*
 	 * L2 bridge code and some other code can return mbuf chain
 	 * that does not conform to KAME requirement.  too bad.
 	 * XXX: fails to join if interface MTU > MCLBYTES.  jumbogram?
 	 */
 	if (m && m->m_next != NULL && m->m_pkthdr.len < MCLBYTES) {
 		struct mbuf *n;
 
 		if (m->m_pkthdr.len > MHLEN)
 			n = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 		else
 			n = m_gethdr(M_NOWAIT, MT_DATA);
 		if (n == NULL)
 			goto bad;
 
 		m_move_pkthdr(n, m);
 		m_copydata(m, 0, n->m_pkthdr.len, mtod(n, caddr_t));
 		n->m_len = n->m_pkthdr.len;
 		m_freem(m);
 		m = n;
 	}
 	IP6_EXTHDR_CHECK(m, 0, sizeof(struct ip6_hdr), /* nothing */);
 #endif
 
 	if (m->m_len < sizeof(struct ip6_hdr)) {
 		if ((m = m_pullup(m, sizeof(struct ip6_hdr))) == NULL) {
 			IP6STAT_INC(ip6s_toosmall);
 			in6_ifstat_inc(rcvif, ifs6_in_hdrerr);
 			goto bad;
 		}
 	}
 
 	ip6 = mtod(m, struct ip6_hdr *);
 	if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) {
 		IP6STAT_INC(ip6s_badvers);
 		in6_ifstat_inc(rcvif, ifs6_in_hdrerr);
 		goto bad;
 	}
 
 	IP6STAT_INC(ip6s_nxthist[ip6->ip6_nxt]);
 	IP_PROBE(receive, NULL, NULL, ip6, rcvif, NULL, ip6);
 
 	/*
 	 * Check against address spoofing/corruption.
 	 */
 	if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_src) ||
 	    IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_dst)) {
 		/*
 		 * XXX: "badscope" is not very suitable for a multicast source.
 		 */
 		IP6STAT_INC(ip6s_badscope);
 		in6_ifstat_inc(rcvif, ifs6_in_addrerr);
 		goto bad;
 	}
 	if (IN6_IS_ADDR_MC_INTFACELOCAL(&ip6->ip6_dst) &&
 	    !(m->m_flags & M_LOOP)) {
 		/*
 		 * In this case, the packet should come from the loopback
 		 * interface.  However, we cannot just check the if_flags,
 		 * because ip6_mloopback() passes the "actual" interface
 		 * as the outgoing/incoming interface.
 		 */
 		IP6STAT_INC(ip6s_badscope);
 		in6_ifstat_inc(rcvif, ifs6_in_addrerr);
 		goto bad;
 	}
 	if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) &&
 	    IPV6_ADDR_MC_SCOPE(&ip6->ip6_dst) == 0) {
 		/*
 		 * RFC4291 2.7:
 		 * Nodes must not originate a packet to a multicast address
 		 * whose scop field contains the reserved value 0; if such
 		 * a packet is received, it must be silently dropped.
 		 */
 		IP6STAT_INC(ip6s_badscope);
 		in6_ifstat_inc(rcvif, ifs6_in_addrerr);
 		goto bad;
 	}
 #ifdef ALTQ
 	if (altq_input != NULL && (*altq_input)(m, AF_INET6) == 0) {
 		/* packet is dropped by traffic conditioner */
 		return;
 	}
 #endif
 	/*
 	 * The following check is not documented in specs.  A malicious
 	 * party may be able to use IPv4 mapped addr to confuse tcp/udp stack
 	 * and bypass security checks (act as if it was from 127.0.0.1 by using
 	 * IPv6 src ::ffff:127.0.0.1).  Be cautious.
 	 *
 	 * This check chokes if we are in an SIIT cloud.  As none of BSDs
 	 * support IPv4-less kernel compilation, we cannot support SIIT
 	 * environment at all.  So, it makes more sense for us to reject any
 	 * malicious packets for non-SIIT environment, than try to do a
 	 * partial support for SIIT environment.
 	 */
 	if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) ||
 	    IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) {
 		IP6STAT_INC(ip6s_badscope);
 		in6_ifstat_inc(rcvif, ifs6_in_addrerr);
 		goto bad;
 	}
 #if 0
 	/*
 	 * Reject packets with IPv4 compatible addresses (auto tunnel).
 	 *
 	 * The code forbids auto tunnel relay case in RFC1933 (the check is
 	 * stronger than RFC1933).  We may want to re-enable it if mech-xx
 	 * is revised to forbid relaying case.
 	 */
 	if (IN6_IS_ADDR_V4COMPAT(&ip6->ip6_src) ||
 	    IN6_IS_ADDR_V4COMPAT(&ip6->ip6_dst)) {
 		IP6STAT_INC(ip6s_badscope);
 		in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr);
 		goto bad;
 	}
 #endif
 	/*
 	 * Try to forward the packet, but if we fail continue.
 	 * ip6_tryforward() does inbound and outbound packet firewall
 	 * processing. If firewall has decided that destination becomes
 	 * our local address, it sets M_FASTFWD_OURS flag. In this
 	 * case skip another inbound firewall processing and update
 	 * ip6 pointer.
 	 */
 	if (V_ip6_forwarding != 0
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	    && (!IPSEC_ENABLED(ipv6) ||
 	    IPSEC_CAPS(ipv6, m, IPSEC_CAP_OPERABLE) == 0)
 #endif
 	    ) {
 		if ((m = ip6_tryforward(m)) == NULL)
 			return;
 		if (m->m_flags & M_FASTFWD_OURS) {
 			ip6 = mtod(m, struct ip6_hdr *);
 			goto passin;
 		}
 	}
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	/*
 	 * Bypass packet filtering for packets previously handled by IPsec.
 	 */
 	if (IPSEC_ENABLED(ipv6) &&
 	    IPSEC_CAPS(ipv6, m, IPSEC_CAP_BYPASS_FILTER) != 0)
 			goto passin;
 #endif
 	/*
 	 * Run through list of hooks for input packets.
 	 *
 	 * NB: Beware of the destination address changing
 	 *     (e.g. by NAT rewriting).  When this happens,
 	 *     tell ip6_forward to do the right thing.
 	 */
 
 	/* Jump over all PFIL processing if hooks are not active. */
 	if (!PFIL_HOOKED(&V_inet6_pfil_hook))
 		goto passin;
 
 	odst = ip6->ip6_dst;
 	if (pfil_run_hooks(&V_inet6_pfil_hook, &m,
 	    m->m_pkthdr.rcvif, PFIL_IN, 0, NULL))
 		return;
 	if (m == NULL)			/* consumed by filter */
 		return;
 	ip6 = mtod(m, struct ip6_hdr *);
 	srcrt = !IN6_ARE_ADDR_EQUAL(&odst, &ip6->ip6_dst);
 	if ((m->m_flags & (M_IP6_NEXTHOP | M_FASTFWD_OURS)) == M_IP6_NEXTHOP &&
 	    m_tag_find(m, PACKET_TAG_IPFORWARD, NULL) != NULL) {
 		/*
 		 * Directly ship the packet on.  This allows forwarding
 		 * packets originally destined to us to some other directly
 		 * connected host.
 		 */
 		ip6_forward(m, 1);
 		return;
 	}
 
 passin:
 	/*
 	 * Disambiguate address scope zones (if there is ambiguity).
 	 * We first make sure that the original source or destination address
 	 * is not in our internal form for scoped addresses.  Such addresses
 	 * are not necessarily invalid spec-wise, but we cannot accept them due
 	 * to the usage conflict.
 	 * in6_setscope() then also checks and rejects the cases where src or
 	 * dst are the loopback address and the receiving interface
 	 * is not loopback.
 	 */
 	if (in6_clearscope(&ip6->ip6_src) || in6_clearscope(&ip6->ip6_dst)) {
 		IP6STAT_INC(ip6s_badscope); /* XXX */
 		goto bad;
 	}
 	if (in6_setscope(&ip6->ip6_src, rcvif, NULL) ||
 	    in6_setscope(&ip6->ip6_dst, rcvif, NULL)) {
 		IP6STAT_INC(ip6s_badscope);
 		goto bad;
 	}
 	if (m->m_flags & M_FASTFWD_OURS) {
 		m->m_flags &= ~M_FASTFWD_OURS;
 		ours = 1;
 		goto hbhcheck;
 	}
 	/*
 	 * Multicast check. Assume packet is for us to avoid
 	 * prematurely taking locks.
 	 */
 	if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
 		ours = 1;
 		in6_ifstat_inc(rcvif, ifs6_in_mcast);
 		goto hbhcheck;
 	}
 	/*
 	 * Unicast check
 	 * XXX: For now we keep link-local IPv6 addresses with embedded
 	 *      scope zone id, therefore we use zero zoneid here.
 	 */
 	ia = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */);
 	if (ia != NULL) {
 		if (ia->ia6_flags & IN6_IFF_NOTREADY) {
 			char ip6bufs[INET6_ADDRSTRLEN];
 			char ip6bufd[INET6_ADDRSTRLEN];
 			/* address is not ready, so discard the packet. */
 			nd6log((LOG_INFO,
 			    "ip6_input: packet to an unready address %s->%s\n",
 			    ip6_sprintf(ip6bufs, &ip6->ip6_src),
 			    ip6_sprintf(ip6bufd, &ip6->ip6_dst)));
 			ifa_free(&ia->ia_ifa);
 			goto bad;
 		}
 		/* Count the packet in the ip address stats */
 		counter_u64_add(ia->ia_ifa.ifa_ipackets, 1);
 		counter_u64_add(ia->ia_ifa.ifa_ibytes, m->m_pkthdr.len);
 		ifa_free(&ia->ia_ifa);
 		ours = 1;
 		goto hbhcheck;
 	}
 
 	/*
 	 * Now there is no reason to process the packet if it's not our own
 	 * and we're not a router.
 	 */
 	if (!V_ip6_forwarding) {
 		IP6STAT_INC(ip6s_cantforward);
 		goto bad;
 	}
 
   hbhcheck:
 	/*
 	 * Process Hop-by-Hop options header if it's contained.
 	 * m may be modified in ip6_hopopts_input().
 	 * If a JumboPayload option is included, plen will also be modified.
 	 */
 	plen = (u_int32_t)ntohs(ip6->ip6_plen);
 	if (ip6->ip6_nxt == IPPROTO_HOPOPTS) {
 		if (ip6_input_hbh(m, &plen, &rtalert, &off, &nxt, &ours) != 0)
 			return;
 	} else
 		nxt = ip6->ip6_nxt;
 
 	/*
 	 * Use mbuf flags to propagate Router Alert option to
 	 * ICMPv6 layer, as hop-by-hop options have been stripped.
 	 */
 	if (rtalert != ~0)
 		m->m_flags |= M_RTALERT_MLD;
 
 	/*
 	 * Check that the amount of data in the buffers
 	 * is as at least much as the IPv6 header would have us expect.
 	 * Trim mbufs if longer than we expect.
 	 * Drop packet if shorter than we expect.
 	 */
 	if (m->m_pkthdr.len - sizeof(struct ip6_hdr) < plen) {
 		IP6STAT_INC(ip6s_tooshort);
 		in6_ifstat_inc(rcvif, ifs6_in_truncated);
 		goto bad;
 	}
 	if (m->m_pkthdr.len > sizeof(struct ip6_hdr) + plen) {
 		if (m->m_len == m->m_pkthdr.len) {
 			m->m_len = sizeof(struct ip6_hdr) + plen;
 			m->m_pkthdr.len = sizeof(struct ip6_hdr) + plen;
 		} else
 			m_adj(m, sizeof(struct ip6_hdr) + plen - m->m_pkthdr.len);
 	}
 
 	/*
 	 * Forward if desirable.
 	 */
 	if (V_ip6_mrouter &&
 	    IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
 		/*
 		 * If we are acting as a multicast router, all
 		 * incoming multicast packets are passed to the
 		 * kernel-level multicast forwarding function.
 		 * The packet is returned (relatively) intact; if
 		 * ip6_mforward() returns a non-zero value, the packet
 		 * must be discarded, else it may be accepted below.
 		 *
 		 * XXX TODO: Check hlim and multicast scope here to avoid
 		 * unnecessarily calling into ip6_mforward().
 		 */
 		if (ip6_mforward && ip6_mforward(ip6, rcvif, m)) {
 			IP6STAT_INC(ip6s_cantforward);
 			goto bad;
 		}
 	} else if (!ours) {
 		ip6_forward(m, srcrt);
 		return;
 	}
 
 	ip6 = mtod(m, struct ip6_hdr *);
 
 	/*
 	 * Malicious party may be able to use IPv4 mapped addr to confuse
 	 * tcp/udp stack and bypass security checks (act as if it was from
 	 * 127.0.0.1 by using IPv6 src ::ffff:127.0.0.1).  Be cautious.
 	 *
 	 * For SIIT end node behavior, you may want to disable the check.
 	 * However, you will  become vulnerable to attacks using IPv4 mapped
 	 * source.
 	 */
 	if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) ||
 	    IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) {
 		IP6STAT_INC(ip6s_badscope);
 		in6_ifstat_inc(rcvif, ifs6_in_addrerr);
 		goto bad;
 	}
 
 	/*
 	 * Tell launch routine the next header
 	 */
 	IP6STAT_INC(ip6s_delivered);
 	in6_ifstat_inc(rcvif, ifs6_in_deliver);
 	nest = 0;
 
 	while (nxt != IPPROTO_DONE) {
 		if (V_ip6_hdrnestlimit && (++nest > V_ip6_hdrnestlimit)) {
 			IP6STAT_INC(ip6s_toomanyhdr);
 			goto bad;
 		}
 
 		/*
 		 * protection against faulty packet - there should be
 		 * more sanity checks in header chain processing.
 		 */
 		if (m->m_pkthdr.len < off) {
 			IP6STAT_INC(ip6s_tooshort);
 			in6_ifstat_inc(rcvif, ifs6_in_truncated);
 			goto bad;
 		}
 
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 		if (IPSEC_ENABLED(ipv6)) {
 			if (IPSEC_INPUT(ipv6, m, off, nxt) != 0)
 				return;
 		}
 #endif /* IPSEC */
 
 		nxt = (*inet6sw[ip6_protox[nxt]].pr_input)(&m, &off, nxt);
 	}
 	return;
 bad:
 	in6_ifstat_inc(rcvif, ifs6_in_discard);
 	if (m != NULL)
 		m_freem(m);
 }
 
 /*
  * Hop-by-Hop options header processing. If a valid jumbo payload option is
  * included, the real payload length will be stored in plenp.
  *
  * rtalertp - XXX: should be stored more smart way
  */
 static int
 ip6_hopopts_input(u_int32_t *plenp, u_int32_t *rtalertp,
     struct mbuf **mp, int *offp)
 {
 	struct mbuf *m = *mp;
 	int off = *offp, hbhlen;
 	struct ip6_hbh *hbh;
 
 	/* validation of the length of the header */
 #ifndef PULLDOWN_TEST
 	IP6_EXTHDR_CHECK(m, off, sizeof(*hbh), -1);
 	hbh = (struct ip6_hbh *)(mtod(m, caddr_t) + off);
 	hbhlen = (hbh->ip6h_len + 1) << 3;
 
 	IP6_EXTHDR_CHECK(m, off, hbhlen, -1);
 	hbh = (struct ip6_hbh *)(mtod(m, caddr_t) + off);
 #else
 	IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m,
 		sizeof(struct ip6_hdr), sizeof(struct ip6_hbh));
 	if (hbh == NULL) {
 		IP6STAT_INC(ip6s_tooshort);
 		return -1;
 	}
 	hbhlen = (hbh->ip6h_len + 1) << 3;
 	IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m, sizeof(struct ip6_hdr),
 		hbhlen);
 	if (hbh == NULL) {
 		IP6STAT_INC(ip6s_tooshort);
 		return -1;
 	}
 #endif
 	off += hbhlen;
 	hbhlen -= sizeof(struct ip6_hbh);
 	if (ip6_process_hopopts(m, (u_int8_t *)hbh + sizeof(struct ip6_hbh),
 				hbhlen, rtalertp, plenp) < 0)
 		return (-1);
 
 	*offp = off;
 	*mp = m;
 	return (0);
 }
 
 /*
  * Search header for all Hop-by-hop options and process each option.
  * This function is separate from ip6_hopopts_input() in order to
  * handle a case where the sending node itself process its hop-by-hop
  * options header. In such a case, the function is called from ip6_output().
  *
  * The function assumes that hbh header is located right after the IPv6 header
  * (RFC2460 p7), opthead is pointer into data content in m, and opthead to
  * opthead + hbhlen is located in contiguous memory region.
  */
 int
 ip6_process_hopopts(struct mbuf *m, u_int8_t *opthead, int hbhlen,
     u_int32_t *rtalertp, u_int32_t *plenp)
 {
 	struct ip6_hdr *ip6;
 	int optlen = 0;
 	u_int8_t *opt = opthead;
 	u_int16_t rtalert_val;
 	u_int32_t jumboplen;
 	const int erroff = sizeof(struct ip6_hdr) + sizeof(struct ip6_hbh);
 
 	for (; hbhlen > 0; hbhlen -= optlen, opt += optlen) {
 		switch (*opt) {
 		case IP6OPT_PAD1:
 			optlen = 1;
 			break;
 		case IP6OPT_PADN:
 			if (hbhlen < IP6OPT_MINLEN) {
 				IP6STAT_INC(ip6s_toosmall);
 				goto bad;
 			}
 			optlen = *(opt + 1) + 2;
 			break;
 		case IP6OPT_ROUTER_ALERT:
 			/* XXX may need check for alignment */
 			if (hbhlen < IP6OPT_RTALERT_LEN) {
 				IP6STAT_INC(ip6s_toosmall);
 				goto bad;
 			}
 			if (*(opt + 1) != IP6OPT_RTALERT_LEN - 2) {
 				/* XXX stat */
 				icmp6_error(m, ICMP6_PARAM_PROB,
 				    ICMP6_PARAMPROB_HEADER,
 				    erroff + opt + 1 - opthead);
 				return (-1);
 			}
 			optlen = IP6OPT_RTALERT_LEN;
 			bcopy((caddr_t)(opt + 2), (caddr_t)&rtalert_val, 2);
 			*rtalertp = ntohs(rtalert_val);
 			break;
 		case IP6OPT_JUMBO:
 			/* XXX may need check for alignment */
 			if (hbhlen < IP6OPT_JUMBO_LEN) {
 				IP6STAT_INC(ip6s_toosmall);
 				goto bad;
 			}
 			if (*(opt + 1) != IP6OPT_JUMBO_LEN - 2) {
 				/* XXX stat */
 				icmp6_error(m, ICMP6_PARAM_PROB,
 				    ICMP6_PARAMPROB_HEADER,
 				    erroff + opt + 1 - opthead);
 				return (-1);
 			}
 			optlen = IP6OPT_JUMBO_LEN;
 
 			/*
 			 * IPv6 packets that have non 0 payload length
 			 * must not contain a jumbo payload option.
 			 */
 			ip6 = mtod(m, struct ip6_hdr *);
 			if (ip6->ip6_plen) {
 				IP6STAT_INC(ip6s_badoptions);
 				icmp6_error(m, ICMP6_PARAM_PROB,
 				    ICMP6_PARAMPROB_HEADER,
 				    erroff + opt - opthead);
 				return (-1);
 			}
 
 			/*
 			 * We may see jumbolen in unaligned location, so
 			 * we'd need to perform bcopy().
 			 */
 			bcopy(opt + 2, &jumboplen, sizeof(jumboplen));
 			jumboplen = (u_int32_t)htonl(jumboplen);
 
 #if 1
 			/*
 			 * if there are multiple jumbo payload options,
 			 * *plenp will be non-zero and the packet will be
 			 * rejected.
 			 * the behavior may need some debate in ipngwg -
 			 * multiple options does not make sense, however,
 			 * there's no explicit mention in specification.
 			 */
 			if (*plenp != 0) {
 				IP6STAT_INC(ip6s_badoptions);
 				icmp6_error(m, ICMP6_PARAM_PROB,
 				    ICMP6_PARAMPROB_HEADER,
 				    erroff + opt + 2 - opthead);
 				return (-1);
 			}
 #endif
 
 			/*
 			 * jumbo payload length must be larger than 65535.
 			 */
 			if (jumboplen <= IPV6_MAXPACKET) {
 				IP6STAT_INC(ip6s_badoptions);
 				icmp6_error(m, ICMP6_PARAM_PROB,
 				    ICMP6_PARAMPROB_HEADER,
 				    erroff + opt + 2 - opthead);
 				return (-1);
 			}
 			*plenp = jumboplen;
 
 			break;
 		default:		/* unknown option */
 			if (hbhlen < IP6OPT_MINLEN) {
 				IP6STAT_INC(ip6s_toosmall);
 				goto bad;
 			}
 			optlen = ip6_unknown_opt(opt, m,
 			    erroff + opt - opthead);
 			if (optlen == -1)
 				return (-1);
 			optlen += 2;
 			break;
 		}
 	}
 
 	return (0);
 
   bad:
 	m_freem(m);
 	return (-1);
 }
 
 /*
  * Unknown option processing.
  * The third argument `off' is the offset from the IPv6 header to the option,
  * which is necessary if the IPv6 header the and option header and IPv6 header
  * is not contiguous in order to return an ICMPv6 error.
  */
 int
 ip6_unknown_opt(u_int8_t *optp, struct mbuf *m, int off)
 {
 	struct ip6_hdr *ip6;
 
 	switch (IP6OPT_TYPE(*optp)) {
 	case IP6OPT_TYPE_SKIP: /* ignore the option */
 		return ((int)*(optp + 1));
 	case IP6OPT_TYPE_DISCARD:	/* silently discard */
 		m_freem(m);
 		return (-1);
 	case IP6OPT_TYPE_FORCEICMP: /* send ICMP even if multicasted */
 		IP6STAT_INC(ip6s_badoptions);
 		icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_OPTION, off);
 		return (-1);
 	case IP6OPT_TYPE_ICMP: /* send ICMP if not multicasted */
 		IP6STAT_INC(ip6s_badoptions);
 		ip6 = mtod(m, struct ip6_hdr *);
 		if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
 		    (m->m_flags & (M_BCAST|M_MCAST)))
 			m_freem(m);
 		else
 			icmp6_error(m, ICMP6_PARAM_PROB,
 				    ICMP6_PARAMPROB_OPTION, off);
 		return (-1);
 	}
 
 	m_freem(m);		/* XXX: NOTREACHED */
 	return (-1);
 }
 
 /*
  * Create the "control" list for this pcb.
  * These functions will not modify mbuf chain at all.
  *
  * With KAME mbuf chain restriction:
  * The routine will be called from upper layer handlers like tcp6_input().
  * Thus the routine assumes that the caller (tcp6_input) have already
  * called IP6_EXTHDR_CHECK() and all the extension headers are located in the
  * very first mbuf on the mbuf chain.
  *
  * ip6_savecontrol_v4 will handle those options that are possible to be
  * set on a v4-mapped socket.
  * ip6_savecontrol will directly call ip6_savecontrol_v4 to handle those
  * options and handle the v6-only ones itself.
  */
 struct mbuf **
 ip6_savecontrol_v4(struct inpcb *inp, struct mbuf *m, struct mbuf **mp,
     int *v4only)
 {
 	struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
 
 #ifdef SO_TIMESTAMP
 	if ((inp->inp_socket->so_options & SO_TIMESTAMP) != 0) {
 		union {
 			struct timeval tv;
 			struct bintime bt;
 			struct timespec ts;
 		} t;
 		struct bintime boottimebin, bt1;
 		struct timespec ts1;
 		bool stamped;
 
 		stamped = false;
 		switch (inp->inp_socket->so_ts_clock) {
 		case SO_TS_REALTIME_MICRO:
 			if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR |
 			    M_TSTMP)) {
 				mbuf_tstmp2timespec(m, &ts1);
 				timespec2bintime(&ts1, &bt1);
 				getboottimebin(&boottimebin);
 				bintime_add(&bt1, &boottimebin);
 				bintime2timeval(&bt1, &t.tv);
 			} else {
 				microtime(&t.tv);
 			}
 			*mp = sbcreatecontrol((caddr_t) &t.tv, sizeof(t.tv),
 			    SCM_TIMESTAMP, SOL_SOCKET);
 			if (*mp != NULL) {
 				mp = &(*mp)->m_next;
 				stamped = true;
 			}
 			break;
 
 		case SO_TS_BINTIME:
 			if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR |
 			    M_TSTMP)) {
 				mbuf_tstmp2timespec(m, &ts1);
 				timespec2bintime(&ts1, &t.bt);
 				getboottimebin(&boottimebin);
 				bintime_add(&t.bt, &boottimebin);
 			} else {
 				bintime(&t.bt);
 			}
 			*mp = sbcreatecontrol((caddr_t)&t.bt, sizeof(t.bt),
 			    SCM_BINTIME, SOL_SOCKET);
 			if (*mp != NULL) {
 				mp = &(*mp)->m_next;
 				stamped = true;
 			}
 			break;
 
 		case SO_TS_REALTIME:
 			if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR |
 			    M_TSTMP)) {
 				mbuf_tstmp2timespec(m, &t.ts);
 				getboottimebin(&boottimebin);
 				bintime2timespec(&boottimebin, &ts1);
-				timespecadd(&t.ts, &ts1);
+				timespecadd(&t.ts, &ts1, &t.ts);
 			} else {
 				nanotime(&t.ts);
 			}
 			*mp = sbcreatecontrol((caddr_t)&t.ts, sizeof(t.ts),
 			    SCM_REALTIME, SOL_SOCKET);
 			if (*mp != NULL) {
 				mp = &(*mp)->m_next;
 				stamped = true;
 			}
 			break;
 
 		case SO_TS_MONOTONIC:
 			if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR |
 			    M_TSTMP))
 				mbuf_tstmp2timespec(m, &t.ts);
 			else
 				nanouptime(&t.ts);
 			*mp = sbcreatecontrol((caddr_t)&t.ts, sizeof(t.ts),
 			    SCM_MONOTONIC, SOL_SOCKET);
 			if (*mp != NULL) {
 				mp = &(*mp)->m_next;
 				stamped = true;
 			}
 			break;
 
 		default:
 			panic("unknown (corrupted) so_ts_clock");
 		}
 		if (stamped && (m->m_flags & (M_PKTHDR | M_TSTMP)) ==
 		    (M_PKTHDR | M_TSTMP)) {
 			struct sock_timestamp_info sti;
 
 			bzero(&sti, sizeof(sti));
 			sti.st_info_flags = ST_INFO_HW;
 			if ((m->m_flags & M_TSTMP_HPREC) != 0)
 				sti.st_info_flags |= ST_INFO_HW_HPREC;
 			*mp = sbcreatecontrol((caddr_t)&sti, sizeof(sti),
 			    SCM_TIME_INFO, SOL_SOCKET);
 			if (*mp != NULL)
 				mp = &(*mp)->m_next;
 		}
 	}
 #endif
 
 #define IS2292(inp, x, y)	(((inp)->inp_flags & IN6P_RFC2292) ? (x) : (y))
 	/* RFC 2292 sec. 5 */
 	if ((inp->inp_flags & IN6P_PKTINFO) != 0) {
 		struct in6_pktinfo pi6;
 
 		if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) {
 #ifdef INET
 			struct ip *ip;
 
 			ip = mtod(m, struct ip *);
 			pi6.ipi6_addr.s6_addr32[0] = 0;
 			pi6.ipi6_addr.s6_addr32[1] = 0;
 			pi6.ipi6_addr.s6_addr32[2] = IPV6_ADDR_INT32_SMP;
 			pi6.ipi6_addr.s6_addr32[3] = ip->ip_dst.s_addr;
 #else
 			/* We won't hit this code */
 			bzero(&pi6.ipi6_addr, sizeof(struct in6_addr));
 #endif
 		} else {	
 			bcopy(&ip6->ip6_dst, &pi6.ipi6_addr, sizeof(struct in6_addr));
 			in6_clearscope(&pi6.ipi6_addr);	/* XXX */
 		}
 		pi6.ipi6_ifindex =
 		    (m && m->m_pkthdr.rcvif) ? m->m_pkthdr.rcvif->if_index : 0;
 
 		*mp = sbcreatecontrol((caddr_t) &pi6,
 		    sizeof(struct in6_pktinfo),
 		    IS2292(inp, IPV6_2292PKTINFO, IPV6_PKTINFO), IPPROTO_IPV6);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 
 	if ((inp->inp_flags & IN6P_HOPLIMIT) != 0) {
 		int hlim;
 
 		if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) {
 #ifdef INET
 			struct ip *ip;
 
 			ip = mtod(m, struct ip *);
 			hlim = ip->ip_ttl;
 #else
 			/* We won't hit this code */
 			hlim = 0;
 #endif
 		} else {
 			hlim = ip6->ip6_hlim & 0xff;
 		}
 		*mp = sbcreatecontrol((caddr_t) &hlim, sizeof(int),
 		    IS2292(inp, IPV6_2292HOPLIMIT, IPV6_HOPLIMIT),
 		    IPPROTO_IPV6);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 
 	if ((inp->inp_flags & IN6P_TCLASS) != 0) {
 		int tclass;
 
 		if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) {
 #ifdef INET
 			struct ip *ip;
 
 			ip = mtod(m, struct ip *);
 			tclass = ip->ip_tos;
 #else
 			/* We won't hit this code */
 			tclass = 0;
 #endif
 		} else {
 			u_int32_t flowinfo;
 
 			flowinfo = (u_int32_t)ntohl(ip6->ip6_flow & IPV6_FLOWINFO_MASK);
 			flowinfo >>= 20;
 			tclass = flowinfo & 0xff;
 		}
 		*mp = sbcreatecontrol((caddr_t) &tclass, sizeof(int),
 		    IPV6_TCLASS, IPPROTO_IPV6);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 
 	if (v4only != NULL) {
 		if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) {
 			*v4only = 1;
 		} else {
 			*v4only = 0;
 		}
 	}
 
 	return (mp);
 }
 
 void
 ip6_savecontrol(struct inpcb *in6p, struct mbuf *m, struct mbuf **mp)
 {
 	struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
 	int v4only = 0;
 
 	mp = ip6_savecontrol_v4(in6p, m, mp, &v4only);
 	if (v4only)
 		return;
 
 	/*
 	 * IPV6_HOPOPTS socket option.  Recall that we required super-user
 	 * privilege for the option (see ip6_ctloutput), but it might be too
 	 * strict, since there might be some hop-by-hop options which can be
 	 * returned to normal user.
 	 * See also RFC 2292 section 6 (or RFC 3542 section 8).
 	 */
 	if ((in6p->inp_flags & IN6P_HOPOPTS) != 0) {
 		/*
 		 * Check if a hop-by-hop options header is contatined in the
 		 * received packet, and if so, store the options as ancillary
 		 * data. Note that a hop-by-hop options header must be
 		 * just after the IPv6 header, which is assured through the
 		 * IPv6 input processing.
 		 */
 		if (ip6->ip6_nxt == IPPROTO_HOPOPTS) {
 			struct ip6_hbh *hbh;
 			int hbhlen = 0;
 #ifdef PULLDOWN_TEST
 			struct mbuf *ext;
 #endif
 
 #ifndef PULLDOWN_TEST
 			hbh = (struct ip6_hbh *)(ip6 + 1);
 			hbhlen = (hbh->ip6h_len + 1) << 3;
 #else
 			ext = ip6_pullexthdr(m, sizeof(struct ip6_hdr),
 			    ip6->ip6_nxt);
 			if (ext == NULL) {
 				IP6STAT_INC(ip6s_tooshort);
 				return;
 			}
 			hbh = mtod(ext, struct ip6_hbh *);
 			hbhlen = (hbh->ip6h_len + 1) << 3;
 			if (hbhlen != ext->m_len) {
 				m_freem(ext);
 				IP6STAT_INC(ip6s_tooshort);
 				return;
 			}
 #endif
 
 			/*
 			 * XXX: We copy the whole header even if a
 			 * jumbo payload option is included, the option which
 			 * is to be removed before returning according to
 			 * RFC2292.
 			 * Note: this constraint is removed in RFC3542
 			 */
 			*mp = sbcreatecontrol((caddr_t)hbh, hbhlen,
 			    IS2292(in6p, IPV6_2292HOPOPTS, IPV6_HOPOPTS),
 			    IPPROTO_IPV6);
 			if (*mp)
 				mp = &(*mp)->m_next;
 #ifdef PULLDOWN_TEST
 			m_freem(ext);
 #endif
 		}
 	}
 
 	if ((in6p->inp_flags & (IN6P_RTHDR | IN6P_DSTOPTS)) != 0) {
 		int nxt = ip6->ip6_nxt, off = sizeof(struct ip6_hdr);
 
 		/*
 		 * Search for destination options headers or routing
 		 * header(s) through the header chain, and stores each
 		 * header as ancillary data.
 		 * Note that the order of the headers remains in
 		 * the chain of ancillary data.
 		 */
 		while (1) {	/* is explicit loop prevention necessary? */
 			struct ip6_ext *ip6e = NULL;
 			int elen;
 #ifdef PULLDOWN_TEST
 			struct mbuf *ext = NULL;
 #endif
 
 			/*
 			 * if it is not an extension header, don't try to
 			 * pull it from the chain.
 			 */
 			switch (nxt) {
 			case IPPROTO_DSTOPTS:
 			case IPPROTO_ROUTING:
 			case IPPROTO_HOPOPTS:
 			case IPPROTO_AH: /* is it possible? */
 				break;
 			default:
 				goto loopend;
 			}
 
 #ifndef PULLDOWN_TEST
 			if (off + sizeof(*ip6e) > m->m_len)
 				goto loopend;
 			ip6e = (struct ip6_ext *)(mtod(m, caddr_t) + off);
 			if (nxt == IPPROTO_AH)
 				elen = (ip6e->ip6e_len + 2) << 2;
 			else
 				elen = (ip6e->ip6e_len + 1) << 3;
 			if (off + elen > m->m_len)
 				goto loopend;
 #else
 			ext = ip6_pullexthdr(m, off, nxt);
 			if (ext == NULL) {
 				IP6STAT_INC(ip6s_tooshort);
 				return;
 			}
 			ip6e = mtod(ext, struct ip6_ext *);
 			if (nxt == IPPROTO_AH)
 				elen = (ip6e->ip6e_len + 2) << 2;
 			else
 				elen = (ip6e->ip6e_len + 1) << 3;
 			if (elen != ext->m_len) {
 				m_freem(ext);
 				IP6STAT_INC(ip6s_tooshort);
 				return;
 			}
 #endif
 
 			switch (nxt) {
 			case IPPROTO_DSTOPTS:
 				if (!(in6p->inp_flags & IN6P_DSTOPTS))
 					break;
 
 				*mp = sbcreatecontrol((caddr_t)ip6e, elen,
 				    IS2292(in6p,
 					IPV6_2292DSTOPTS, IPV6_DSTOPTS),
 				    IPPROTO_IPV6);
 				if (*mp)
 					mp = &(*mp)->m_next;
 				break;
 			case IPPROTO_ROUTING:
 				if (!(in6p->inp_flags & IN6P_RTHDR))
 					break;
 
 				*mp = sbcreatecontrol((caddr_t)ip6e, elen,
 				    IS2292(in6p, IPV6_2292RTHDR, IPV6_RTHDR),
 				    IPPROTO_IPV6);
 				if (*mp)
 					mp = &(*mp)->m_next;
 				break;
 			case IPPROTO_HOPOPTS:
 			case IPPROTO_AH: /* is it possible? */
 				break;
 
 			default:
 				/*
 				 * other cases have been filtered in the above.
 				 * none will visit this case.  here we supply
 				 * the code just in case (nxt overwritten or
 				 * other cases).
 				 */
 #ifdef PULLDOWN_TEST
 				m_freem(ext);
 #endif
 				goto loopend;
 
 			}
 
 			/* proceed with the next header. */
 			off += elen;
 			nxt = ip6e->ip6e_nxt;
 			ip6e = NULL;
 #ifdef PULLDOWN_TEST
 			m_freem(ext);
 			ext = NULL;
 #endif
 		}
 	  loopend:
 		;
 	}
 
 	if (in6p->inp_flags2 & INP_RECVFLOWID) {
 		uint32_t flowid, flow_type;
 
 		flowid = m->m_pkthdr.flowid;
 		flow_type = M_HASHTYPE_GET(m);
 
 		/*
 		 * XXX should handle the failure of one or the
 		 * other - don't populate both?
 		 */
 		*mp = sbcreatecontrol((caddr_t) &flowid,
 		    sizeof(uint32_t), IPV6_FLOWID, IPPROTO_IPV6);
 		if (*mp)
 			mp = &(*mp)->m_next;
 		*mp = sbcreatecontrol((caddr_t) &flow_type,
 		    sizeof(uint32_t), IPV6_FLOWTYPE, IPPROTO_IPV6);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 
 #ifdef	RSS
 	if (in6p->inp_flags2 & INP_RECVRSSBUCKETID) {
 		uint32_t flowid, flow_type;
 		uint32_t rss_bucketid;
 
 		flowid = m->m_pkthdr.flowid;
 		flow_type = M_HASHTYPE_GET(m);
 
 		if (rss_hash2bucket(flowid, flow_type, &rss_bucketid) == 0) {
 			*mp = sbcreatecontrol((caddr_t) &rss_bucketid,
 			   sizeof(uint32_t), IPV6_RSSBUCKETID, IPPROTO_IPV6);
 			if (*mp)
 				mp = &(*mp)->m_next;
 		}
 	}
 #endif
 
 }
 #undef IS2292
 
 /*
  * Report a path MTU to the socket behind inp as IPV6_PATHMTU ancillary
  * data.
  *
  * Nothing is delivered unless the socket asked for it (IN6P_MTU).
  * Sockets without a foreign address are notified as well; a socket whose
  * foreign address is set and differs from dst is skipped.
  */
 void
 ip6_notify_pmtu(struct inpcb *inp, struct sockaddr_in6 *dst, u_int32_t mtu)
 {
 	struct ip6_mtuinfo info;
 	struct mbuf *ctl;
 	struct socket *sock;
 
 	KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
 
 	/* The application did not request path MTU notifications. */
 	if ((inp->inp_flags & IN6P_MTU) == 0)
 		return;
 
 	/* The socket has a foreign address and it is not dst. */
 	if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) &&
 	    !IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &dst->sin6_addr))
 		return;
 
 	info.ip6m_mtu = mtu;
 	info.ip6m_addr = *dst;
 	if (sa6_recoverscope(&info.ip6m_addr) != 0)
 		return;
 
 	ctl = sbcreatecontrol((caddr_t)&info, sizeof(info),
 	    IPV6_PATHMTU, IPPROTO_IPV6);
 	if (ctl == NULL)
 		return;
 
 	sock = inp->inp_socket;
 	if (sbappendaddr(&sock->so_rcv, (struct sockaddr *)dst, NULL, ctl)
 	    != 0) {
 		sorwakeup(sock);
 		return;
 	}
 
 	/* The control message could not be queued; drop it. */
 	m_freem(ctl);
 	/* XXX: should count statistics */
 }
 
 #ifdef PULLDOWN_TEST
 /*
  * Copy a single extension header out of the mbuf chain m into a freshly
  * allocated mbuf.
  *
  * off is the offset of the header inside the packet and nxt its protocol
  * number; the length is taken from the header's own length field (4-byte
  * units plus 8 for IPPROTO_AH, 8-byte units plus 8 for everything else).
  *
  * Returns the new mbuf, whose m_len is the header length, or NULL when the
  * header does not fit inside the packet or no mbuf could be allocated.
  * The caller owns the returned mbuf.  m must carry a packet header.
  */
 static struct mbuf *
 ip6_pullexthdr(struct mbuf *m, size_t off, int nxt)
 {
 	struct ip6_ext ip6e;
 	size_t elen, pktlen;
 	struct mbuf *n;
 
 #ifdef DIAGNOSTIC
 	switch (nxt) {
 	case IPPROTO_DSTOPTS:
 	case IPPROTO_ROUTING:
 	case IPPROTO_HOPOPTS:
 	case IPPROTO_AH: /* is it possible? */
 		break;
 	default:
 		printf("ip6_pullexthdr: invalid nxt=%d\n", nxt);
 	}
 #endif
 
 	/*
 	 * Never ask m_copydata() for bytes beyond the end of the packet:
 	 * first make sure the fixed part of the header is present.
 	 */
 	pktlen = (size_t)m->m_pkthdr.len;
 	if (off > pktlen || pktlen - off < sizeof(ip6e))
 		return NULL;
 
 	m_copydata(m, off, sizeof(ip6e), (caddr_t)&ip6e);
 	if (nxt == IPPROTO_AH)
 		elen = (ip6e.ip6e_len + 2) << 2;
 	else
 		elen = (ip6e.ip6e_len + 1) << 3;
 
 	/* ... then that the whole header, as announced, is present too. */
 	if (pktlen - off < elen)
 		return NULL;
 
 	if (elen > MLEN)
 		n = m_getcl(M_NOWAIT, MT_DATA, 0);
 	else
 		n = m_get(M_NOWAIT, MT_DATA);
 	if (n == NULL)
 		return NULL;
 
 	m_copydata(m, off, elen, mtod(n, caddr_t));
 	n->m_len = elen;
 	return n;
 }
 #endif
 
 /*
  * Return the offset of the extension header that immediately precedes the
  * header located at offset off in the packet m.
  *
  * When off is exactly sizeof(struct ip6_hdr) there is no preceding
  * extension header and the offset of the ip6_nxt field of the fixed IPv6
  * header is returned instead.  Otherwise the extension headers are walked
  * from the end of the fixed header until off is reached, and the start
  * offset of the last header walked is returned.
  * NOTE(review): callers presumably use the result as the location of the
  * "next header" byte, which relies on ip6e_nxt being the first member of
  * struct ip6_ext -- confirm against the structure definition.
  *
  * off must not be smaller than sizeof(struct ip6_hdr); the function panics
  * otherwise.  No check is made that off lies within the packet.
  */
 int
 ip6_get_prevhdr(const struct mbuf *m, int off)
 {
 	struct ip6_ext ip6e;
 	struct ip6_hdr *ip6;
 	int len, nlen, nxt;
 
 	if (off == sizeof(struct ip6_hdr))
 		return (offsetof(struct ip6_hdr, ip6_nxt));
 	/*
 	 * Compare as signed integers: without the cast off would be
 	 * converted to size_t and a negative value would slip past the
 	 * check instead of being caught here.
 	 */
 	if (off < (int)sizeof(struct ip6_hdr))
 		panic("%s: off < sizeof(struct ip6_hdr)", __func__);
 
 	ip6 = mtod(m, struct ip6_hdr *);
 	nxt = ip6->ip6_nxt;
 	len = sizeof(struct ip6_hdr);
 	nlen = 0;
 	while (len < off) {
 		m_copydata(m, len, sizeof(ip6e), (caddr_t)&ip6e);
 		switch (nxt) {
 		case IPPROTO_FRAGMENT:
 			/* Fragment headers have a fixed size. */
 			nlen = sizeof(struct ip6_frag);
 			break;
 		case IPPROTO_AH:
 			/* Length field counts 4-byte units. */
 			nlen = (ip6e.ip6e_len + 2) << 2;
 			break;
 		default:
 			/* Length field counts 8-byte units. */
 			nlen = (ip6e.ip6e_len + 1) << 3;
 		}
 		len += nlen;
 		nxt = ip6e.ip6e_nxt;
 	}
 	/* Step back over the last header walked to get its start offset. */
 	return (len - nlen);
 }
 
 /*
  * Step over one header of type proto located at offset off and return the
  * offset of whatever follows it.  The packet m is only read.
  *
  * If nxtp is not NULL, the protocol number announced by the skipped header
  * is stored through it.  -1 is returned when m has no packet header, when
  * the fixed part of the header does not fit inside the packet, when the
  * header is a fragment header with a non-zero fragment offset, or when
  * proto is not a header type handled here.
  */
 int
 ip6_nexthdr(const struct mbuf *m, int off, int proto, int *nxtp)
 {
 	struct ip6_hdr ip6;
 	struct ip6_ext ip6e;
 	struct ip6_frag fh;
 	int following;
 
 	/* just in case */
 	if (m == NULL)
 		panic("ip6_nexthdr: m == NULL");
 	if ((m->m_flags & M_PKTHDR) == 0)
 		return (-1);
 	if (m->m_pkthdr.len < off)
 		return (-1);
 
 	switch (proto) {
 	case IPPROTO_IPV6:
 		if (m->m_pkthdr.len < off + sizeof(ip6))
 			return (-1);
 		m_copydata(m, off, sizeof(ip6), (caddr_t)&ip6);
 		following = ip6.ip6_nxt;
 		off += sizeof(ip6);
 		break;
 
 	case IPPROTO_FRAGMENT:
 		if (m->m_pkthdr.len < off + sizeof(fh))
 			return (-1);
 		m_copydata(m, off, sizeof(fh), (caddr_t)&fh);
 		/*
 		 * Only the first fragment is worth parsing through.
 		 * IP6F_OFF_MASK = 0xfff8(BigEndian), 0xf8ff(LittleEndian)
 		 */
 		if ((fh.ip6f_offlg & IP6F_OFF_MASK) != 0)
 			return (-1);
 		following = fh.ip6f_nxt;
 		off += sizeof(struct ip6_frag);
 		break;
 
 	case IPPROTO_AH:
 	case IPPROTO_HOPOPTS:
 	case IPPROTO_ROUTING:
 	case IPPROTO_DSTOPTS:
 		if (m->m_pkthdr.len < off + sizeof(ip6e))
 			return (-1);
 		m_copydata(m, off, sizeof(ip6e), (caddr_t)&ip6e);
 		following = ip6e.ip6e_nxt;
 		/* AH counts 4-byte units, the others 8-byte units. */
 		if (proto == IPPROTO_AH)
 			off += (ip6e.ip6e_len + 2) << 2;
 		else
 			off += (ip6e.ip6e_len + 1) << 3;
 		break;
 
 	default:
 		/* IPPROTO_NONE, IPPROTO_ESP, IPPROTO_IPCOMP, ...: give up */
 		return (-1);
 	}
 
 	if (nxtp != NULL)
 		*nxtp = following;
 	return (off);
 }
 
 /*
  * get offset for the last header in the chain.  m will be kept untainted.
  */
 int
 ip6_lasthdr(const struct mbuf *m, int off, int proto, int *nxtp)
 {
 	int newoff;
 	int nxt;
 
 	if (!nxtp) {
 		nxt = -1;
 		nxtp = &nxt;
 	}
 	while (1) {
 		newoff = ip6_nexthdr(m, off, proto, nxtp);
 		if (newoff < 0)
 			return off;
 		else if (newoff < off)
 			return -1;	/* invalid */
 		else if (newoff == off)
 			return newoff;
 
 		off = newoff;
 		proto = *nxtp;
 	}
 }
 
 /*
  * System control for IP6
  */
 
 /*
  * Table of PRC_NCMDS errno values.  Twenty-two entries are spelled out
  * below; any further entries are zero-initialized by the C rules for
  * partially initialized arrays.
  * NOTE(review): presumably indexed by PRC_* control command code, with 0
  * meaning "no error to report" -- confirm against <sys/protosw.h> and the
  * users of this table.
  */
 u_char	inet6ctlerrmap[PRC_NCMDS] = {
 	0,		0,		0,		0,
 	0,		EMSGSIZE,	EHOSTDOWN,	EHOSTUNREACH,
 	EHOSTUNREACH,	EHOSTUNREACH,	ECONNREFUSED,	ECONNREFUSED,
 	EMSGSIZE,	EHOSTUNREACH,	0,		0,
 	0,		0,		EHOSTUNREACH,	0,
 	ENOPROTOOPT,	ECONNREFUSED
 };
Index: head/sys/netsmb/smb_iod.c
===================================================================
--- head/sys/netsmb/smb_iod.c	(revision 336913)
+++ head/sys/netsmb/smb_iod.c	(revision 336914)
@@ -1,722 +1,722 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2000-2001 Boris Popov
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
  
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/endian.h>
 #include <sys/proc.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/unistd.h>
 
 #include <netsmb/smb.h>
 #include <netsmb/smb_conn.h>
 #include <netsmb/smb_rq.h>
 #include <netsmb/smb_tran.h>
 #include <netsmb/smb_trantcp.h>
 
 
 /* Idle sleep of the iod thread; multiplied by hz in smb_iod_create(). */
 #define SMBIOD_SLEEP_TIMO	2
 #define	SMBIOD_PING_TIMO	60	/* seconds */
 
 /* Accessors for the lock that guards the event list (iod_evlist). */
 #define	SMB_IOD_EVLOCKPTR(iod)	(&((iod)->iod_evlock))
 #define	SMB_IOD_EVLOCK(iod)	smb_sl_lock(&((iod)->iod_evlock))
 #define	SMB_IOD_EVUNLOCK(iod)	smb_sl_unlock(&((iod)->iod_evlock))
 
 /* Accessors for the lock that guards the request list (iod_rqlist). */
 #define	SMB_IOD_RQLOCKPTR(iod)	(&((iod)->iod_rqlock))
 #define	SMB_IOD_RQLOCK(iod)	smb_sl_lock(&((iod)->iod_rqlock))
 #define	SMB_IOD_RQUNLOCK(iod)	smb_sl_unlock(&((iod)->iod_rqlock))
 
 /* Wake anything sleeping on iod_flags, the channel the iod thread idles on. */
 #define	smb_iod_wakeup(iod)	wakeup(&(iod)->iod_flags)
 
 
 static MALLOC_DEFINE(M_SMBIOD, "SMBIOD", "SMB network io daemon");
 
 static int smb_iod_next;
 
 static int  smb_iod_sendall(struct smbiod *iod);
 static int  smb_iod_disconnect(struct smbiod *iod);
 static void smb_iod_thread(void *);
 
 /*
  * Mark a request as processed: record error as its last error, bump the
  * reply generation counter, move it to SMBRQ_NOTIFIED and wake up any
  * thread sleeping on its state field.  All of this happens under the
  * request's own lock.
  */
 static __inline void
 smb_iod_rqprocessed(struct smb_rq *rqp, int error)
 {
 	SMBRQ_SLOCK(rqp);
 	rqp->sr_lerror = error;
 	/* Waiters compare sr_rpgen with sr_rplast to detect a new result. */
 	rqp->sr_rpgen++;
 	rqp->sr_state = SMBRQ_NOTIFIED;
 	wakeup(&rqp->sr_state);
 	SMBRQ_SUNLOCK(rqp);
 }
 
 static void
 smb_iod_invrq(struct smbiod *iod)
 {
 	struct smb_rq *rqp;
 
 	/*
 	 * Invalidate all outstanding requests for this connection
 	 */
 	SMB_IOD_RQLOCK(iod);
 	TAILQ_FOREACH(rqp, &iod->iod_rqlist, sr_link) {
 		rqp->sr_flags |= SMBR_RESTART;
 		smb_iod_rqprocessed(rqp, ENOTCONN);
 	}
 	SMB_IOD_RQUNLOCK(iod);
 }
 
 /*
  * Disconnect and release the transport of the iod's virtual circuit.
  * Does nothing when the circuit has no transport data attached.
  */
 static void
 smb_iod_closetran(struct smbiod *iod)
 {
 	struct smb_vc *vc = iod->iod_vc;
 	struct thread *owner = iod->iod_td;
 
 	if (vc->vc_tdata != NULL) {
 		SMB_TRAN_DISCONNECT(vc, owner);
 		SMB_TRAN_DONE(vc, owner);
 		vc->vc_tdata = NULL;
 	}
 }
 
 /*
  * Declare the connection dead: switch the iod to SMBIOD_ST_DEAD, tear
  * down the transport and complete all outstanding requests with ENOTCONN
  * (see smb_iod_invrq()).
  */
 static void
 smb_iod_dead(struct smbiod *iod)
 {
 	iod->iod_state = SMBIOD_ST_DEAD;
 	smb_iod_closetran(iod);
 	smb_iod_invrq(iod);
 }
 
 static int
 smb_iod_connect(struct smbiod *iod)
 {
 	struct smb_vc *vcp = iod->iod_vc;
 	struct thread *td = iod->iod_td;
 	int error;
 
 	SMBIODEBUG("%d\n", iod->iod_state);
 	switch(iod->iod_state) {
 	    case SMBIOD_ST_VCACTIVE:
 		SMBERROR("called for already opened connection\n");
 		return EISCONN;
 	    case SMBIOD_ST_DEAD:
 		return ENOTCONN;	/* XXX: last error code ? */
 	    default:
 		break;
 	}
 	vcp->vc_genid++;
 	error = 0;
 
 	error = (int)SMB_TRAN_CREATE(vcp, td);
 	if (error)
 		goto fail;
 	SMBIODEBUG("tcreate\n");
 	if (vcp->vc_laddr) {
 		error = (int)SMB_TRAN_BIND(vcp, vcp->vc_laddr, td);
 		if (error)
 			goto fail;
 	}
 	SMBIODEBUG("tbind\n");
 	error = (int)SMB_TRAN_CONNECT(vcp, vcp->vc_paddr, td);
 	if (error)
 		goto fail;
 	SMB_TRAN_SETPARAM(vcp, SMBTP_SELECTID, &iod->iod_flags);
 	iod->iod_state = SMBIOD_ST_TRANACTIVE;
 	SMBIODEBUG("tconnect\n");
 	/* vcp->vc_mid = 0;*/
 	error = (int)smb_smb_negotiate(vcp, &iod->iod_scred);
 	if (error)
 		goto fail;
 	SMBIODEBUG("snegotiate\n");
 	error = (int)smb_smb_ssnsetup(vcp, &iod->iod_scred);
 	if (error)
 		goto fail;
 	iod->iod_state = SMBIOD_ST_VCACTIVE;
 	SMBIODEBUG("completed\n");
 	smb_iod_invrq(iod);
 	return (0);
 
  fail:
 	smb_iod_dead(iod);
 	return (error);
 }
 
 /*
  * Take the virtual circuit down: close the SMB session if one is active,
  * forget the SMB user id, close the transport and leave the iod in
  * SMBIOD_ST_NOTCONN.  Always returns 0.
  */
 static int
 smb_iod_disconnect(struct smbiod *iod)
 {
 	SMBIODEBUG("\n");
 	switch (iod->iod_state) {
 	case SMBIOD_ST_VCACTIVE:
 		smb_smb_ssnclose(iod->iod_vc, &iod->iod_scred);
 		iod->iod_state = SMBIOD_ST_TRANACTIVE;
 		break;
 	default:
 		break;
 	}
 	iod->iod_vc->vc_smbuid = SMB_UID_UNKNOWN;
 	smb_iod_closetran(iod);
 	iod->iod_state = SMBIOD_ST_NOTCONN;
 	return (0);
 }
 
 /*
  * (Re)connect the share ssp.  A dead circuit is reconnected first; any
  * other non-active state yields ENOTCONN.  While the tree connect request
  * is in flight the share carries SMBS_RECONNECTING; afterwards threads
  * sleeping on ss_vcgenid are woken up.  Returns the result of the tree
  * connect, or the error of the circuit reconnect.
  */
 static int
 smb_iod_treeconnect(struct smbiod *iod, struct smb_share *ssp)
 {
 	int error;
 
 	switch (iod->iod_state) {
 	case SMBIOD_ST_VCACTIVE:
 		break;
 	case SMBIOD_ST_DEAD:
 		iod->iod_state = SMBIOD_ST_RECONNECT;
 		error = smb_iod_connect(iod);
 		if (error != 0)
 			return (error);
 		break;
 	default:
 		return (ENOTCONN);
 	}
 
 	SMBIODEBUG("tree reconnect\n");
 
 	SMBS_ST_LOCK(ssp);
 	ssp->ss_flags |= SMBS_RECONNECTING;
 	SMBS_ST_UNLOCK(ssp);
 
 	error = smb_smb_treeconnect(ssp, &iod->iod_scred);
 
 	SMBS_ST_LOCK(ssp);
 	ssp->ss_flags &= ~SMBS_RECONNECTING;
 	SMBS_ST_UNLOCK(ssp);
 
 	wakeup(&ssp->ss_vcgenid);
 	return (error);
 }
 
 /*
  * Try to transmit one request over the transport of the iod's circuit.
  *
  * Returns 0 when the caller may carry on (the request was sent, was
  * completed with an error, or simply has to wait) and ENOTCONN when the
  * request ran out of send attempts or the transport reported a fatal
  * error.
  */
 static int
 smb_iod_sendrq(struct smbiod *iod, struct smb_rq *rqp)
 {
 	struct thread *td = iod->iod_td;
 	struct smb_vc *vcp = iod->iod_vc;
 	struct smb_share *ssp = rqp->sr_share;
 	struct mbuf *m;
 	int error;
 
 	SMBIODEBUG("iod_state = %d\n", iod->iod_state);
 	switch (iod->iod_state) {
 	    case SMBIOD_ST_NOTCONN:
 		/* No connection at all: fail the request right away. */
 		smb_iod_rqprocessed(rqp, ENOTCONN);
 		return 0;
 	    case SMBIOD_ST_DEAD:
 		/* Ask for a reconnect; the request itself is left alone. */
 		iod->iod_state = SMBIOD_ST_RECONNECT;
 		return 0;
 	    case SMBIOD_ST_RECONNECT:
 		/* Reconnect pending: nothing can be sent yet. */
 		return 0;
 	    default:
 		break;
 	}
 	/* First attempt: fill in tree/user ids, fix up and sign the header. */
 	if (rqp->sr_sendcnt == 0) {
 #ifdef movedtoanotherplace
 		if (vcp->vc_maxmux != 0 && iod->iod_muxcnt >= vcp->vc_maxmux)
 			return 0;
 #endif
 		le16enc(rqp->sr_rqtid, ssp ? ssp->ss_tid : SMB_TID_UNKNOWN);
 		le16enc(rqp->sr_rquid, vcp ? vcp->vc_smbuid : 0);
 		mb_fixhdr(&rqp->sr_rq);
 		if (vcp->vc_hflags2 & SMB_FLAGS2_SECURITY_SIGNATURE)
 			smb_rq_sign(rqp);
 	}
 	/* Give up once six attempts have already been made. */
 	if (rqp->sr_sendcnt++ > 5) {
 		rqp->sr_flags |= SMBR_RESTART;
 		smb_iod_rqprocessed(rqp, rqp->sr_lerror);
 		/*
 		 * If all attempts to send a request failed, then
 		 * something is seriously hosed.
 		 */
 		return ENOTCONN;
 	}
 	SMBSDEBUG("M:%04x, P:%04x, U:%04x, T:%04x\n", rqp->sr_mid, 0, 0, 0);
 	m_dumpm(rqp->sr_rq.mb_top);
 	/*
 	 * A copy of the request chain is handed to the transport;
 	 * presumably so that the original stays available for a resend.
 	 */
 	m = m_copym(rqp->sr_rq.mb_top, 0, M_COPYALL, M_WAITOK);
 	error = rqp->sr_lerror = SMB_TRAN_SEND(vcp, m, td);
 	if (error == 0) {
 		/* Remember when it went out; smb_iod_sendall() times it out. */
 		getnanotime(&rqp->sr_timesent);
 		iod->iod_lastrqsent = rqp->sr_timesent;
 		rqp->sr_flags |= SMBR_SENT;
 		rqp->sr_state = SMBRQ_SENT;
 		return 0;
 	}
 	/*
 	 * Check for fatal errors
 	 */
 	if (SMB_TRAN_FATAL(vcp, error)) {
 		/*
 		 * No further attempts should be made
 		 */
 		return ENOTCONN;
 	}
 	/* Non-fatal failure: complete with EINTR if the request was interrupted. */
 	if (smb_rq_intr(rqp))
 		smb_iod_rqprocessed(rqp, EINTR);
 	return 0;
 }
 
 /*
  * Process incoming packets.
  *
  * Receives packets from the transport until it would block or fails.
  * Each packet that starts with the SMB signature is matched by its
  * multiplex id against the request list and attached to the matching
  * request as its reply (or appended, for multi-packet requests); the
  * request is then marked processed.  Packets that match no request, and
  * duplicate replies, are dropped.  A fatal transport error marks the iod
  * dead.
  *
  * Afterwards every queued request whose owning thread was interrupted is
  * completed with EINTR.  Always returns 0.
  */
 static int
 smb_iod_recvall(struct smbiod *iod)
 {
 	struct smb_vc *vcp = iod->iod_vc;
 	struct thread *td = iod->iod_td;
 	struct smb_rq *rqp;
 	struct mbuf *m;
 	u_char *hp;
 	u_short mid;
 	int error;
 
 	switch (iod->iod_state) {
 	    case SMBIOD_ST_NOTCONN:
 	    case SMBIOD_ST_DEAD:
 	    case SMBIOD_ST_RECONNECT:
 		/* Nothing to receive from without a live transport. */
 		return 0;
 	    default:
 		break;
 	}
 	for (;;) {
 		m = NULL;
 		error = SMB_TRAN_RECV(vcp, &m, td);
 		if (error == EWOULDBLOCK)
 			break;
 		if (SMB_TRAN_FATAL(vcp, error)) {
 			smb_iod_dead(iod);
 			break;
 		}
 		if (error)
 			break;
 		if (m == NULL) {
 			SMBERROR("tran return NULL without error\n");
 			error = EPIPE;
 			continue;
 		}
 		m = m_pullup(m, SMB_HDRLEN);
 		if (m == NULL)
 			continue;	/* wait for a good packet */
 		/*
 		 * Now we got an entire and possibly invalid SMB packet.
 		 * Be careful while parsing it.
 		 */
 		m_dumpm(m);
 		hp = mtod(m, u_char*);
 		if (bcmp(hp, SMB_SIGNATURE, SMB_SIGLEN) != 0) {
 			m_freem(m);
 			continue;
 		}
 		mid = SMB_HDRMID(hp);
 		SMBSDEBUG("mid %04x\n", (u_int)mid);
 		SMB_IOD_RQLOCK(iod);
 		TAILQ_FOREACH(rqp, &iod->iod_rqlist, sr_link) {
 			if (rqp->sr_mid != mid)
 				continue;
 			SMBRQ_SLOCK(rqp);
 			if (rqp->sr_rp.md_top == NULL) {
 				/* First reply: the request takes over m. */
 				md_initm(&rqp->sr_rp, m);
 			} else {
 				if (rqp->sr_flags & SMBR_MULTIPACKET) {
 					md_append_record(&rqp->sr_rp, m);
 				} else {
 					SMBRQ_SUNLOCK(rqp);
 					SMBERROR("duplicate response %d (ignored)\n", mid);
 					/*
 					 * Nobody took ownership of the
 					 * packet and, since rqp is not NULL,
 					 * the check after the loop will not
 					 * free it either: release it here.
 					 */
 					m_freem(m);
 					break;
 				}
 			}
 			SMBRQ_SUNLOCK(rqp);
 			smb_iod_rqprocessed(rqp, 0);
 			break;
 		}
 		SMB_IOD_RQUNLOCK(iod);
 		/* rqp is NULL only if the list was walked without a match. */
 		if (rqp == NULL) {
 			SMBERROR("drop resp with mid %d\n", (u_int)mid);
 /*			smb_printrqlist(vcp);*/
 			m_freem(m);
 		}
 	}
 	/*
 	 * check for interrupts
 	 */
 	SMB_IOD_RQLOCK(iod);
 	TAILQ_FOREACH(rqp, &iod->iod_rqlist, sr_link) {
 		if (smb_td_intr(rqp->sr_cred->scr_td)) {
 			smb_iod_rqprocessed(rqp, EINTR);
 		}
 	}
 	SMB_IOD_RQUNLOCK(iod);
 	return 0;
 }
 
 /*
  * Queue an event for the iod thread and wake the thread up.
  *
  * Without SMBIOD_EV_SYNC in event the function returns 0 at once and the
  * event is left to the iod thread.  With SMBIOD_EV_SYNC the caller sleeps
  * until it is woken on the event, then returns the event's ev_error and
  * frees the event itself.
  */
 int
 smb_iod_request(struct smbiod *iod, int event, void *ident)
 {
 	struct smbiod_event *evp;
 	int error;
 
 	SMBIODEBUG("\n");
 	evp = smb_zmalloc(sizeof(*evp), M_SMBIOD, M_WAITOK);
 	evp->ev_type = event;
 	evp->ev_ident = ident;
 
 	SMB_IOD_EVLOCK(iod);
 	STAILQ_INSERT_TAIL(&iod->iod_evlist, evp, ev_link);
 	if ((event & SMBIOD_EV_SYNC) != 0) {
 		smb_iod_wakeup(iod);
 		/* PDROP: the event lock is released and not taken again. */
 		msleep(evp, SMB_IOD_EVLOCKPTR(iod), PWAIT | PDROP, "90evw", 0);
 		error = evp->ev_error;
 		free(evp, M_SMBIOD);
 	} else {
 		SMB_IOD_EVUNLOCK(iod);
 		smb_iod_wakeup(iod);
 		error = 0;
 	}
 	return (error);
 }
 
 /*
  * Place request in the queue.
  * Requests issued by the smbiod process itself have a high priority: they
  * are put at the head of the list and sent from here directly.
  *
  * Returns 0 once the request is queued, the request's last error for
  * internal requests, ENOTCONN when there is no connection, and EXDEV after
  * a dead connection was successfully reconnected (or the reconnect error).
  */
 int
 smb_iod_addrq(struct smb_rq *rqp)
 {
 	struct smb_vc *vcp = rqp->sr_vc;
 	struct smbiod *iod = vcp->vc_iod;
 	int error;
 
 	SMBIODEBUG("\n");
 	/* Internal request: the issuing thread belongs to the iod process. */
 	if (rqp->sr_cred->scr_td != NULL &&
 	    rqp->sr_cred->scr_td->td_proc == iod->iod_p) {
 		rqp->sr_flags |= SMBR_INTERNAL;
 		SMB_IOD_RQLOCK(iod);
 		TAILQ_INSERT_HEAD(&iod->iod_rqlist, rqp, sr_link);
 		SMB_IOD_RQUNLOCK(iod);
 		/* Retry about once a second until the request leaves NOTSENT. */
 		for (;;) {
 			if (smb_iod_sendrq(iod, rqp) != 0) {
 				smb_iod_dead(iod);
 				break;
 			}
 			/*
 			 * we don't need to lock state field here
 			 */
 			if (rqp->sr_state != SMBRQ_NOTSENT)
 				break;
 			tsleep(&iod->iod_flags, PWAIT, "90sndw", hz);
 		}
 		if (rqp->sr_lerror)
 			smb_iod_removerq(rqp);
 		return rqp->sr_lerror;
 	}
 
 	switch (iod->iod_state) {
 	    case SMBIOD_ST_NOTCONN:
 		return ENOTCONN;
 	    case SMBIOD_ST_DEAD:
 		/* Have the iod thread reconnect and wait for the outcome. */
 		error = smb_iod_request(vcp->vc_iod, SMBIOD_EV_CONNECT | SMBIOD_EV_SYNC, NULL);
 		if (error)
 			return error;
 		/* NOTE(review): EXDEV presumably tells the caller to redo the request on the new connection -- confirm with callers. */
 		return EXDEV;
 	    default:
 		break;
 	}
 
 	SMB_IOD_RQLOCK(iod);
 	/* Wait until a multiplex slot is free (iod_muxcnt < vc_maxmux). */
 	for (;;) {
 		if (vcp->vc_maxmux == 0) {
 			SMBERROR("maxmux == 0\n");
 			break;
 		}
 		if (iod->iod_muxcnt < vcp->vc_maxmux)
 			break;
 		iod->iod_muxwant++;
 		msleep(&iod->iod_muxwant, SMB_IOD_RQLOCKPTR(iod),
 		    PWAIT, "90mux", 0);
 	}
 	iod->iod_muxcnt++;
 	TAILQ_INSERT_TAIL(&iod->iod_rqlist, rqp, sr_link);
 	SMB_IOD_RQUNLOCK(iod);
 	/* Let the iod thread know there is something to send. */
 	smb_iod_wakeup(iod);
 	return 0;
 }
 
 /*
  * Take a request off the iod's request list.  Always returns 0.
  *
  * Internal requests (SMBR_INTERNAL) are simply unlinked.  For all others
  * the function first waits until the request is no longer marked
  * SMBR_XLOCK, then unlinks it, gives back its multiplex slot and wakes up
  * a thread waiting for a slot, if there is one.
  */
 int
 smb_iod_removerq(struct smb_rq *rqp)
 {
 	struct smb_vc *vcp = rqp->sr_vc;
 	struct smbiod *iod = vcp->vc_iod;
 	int internal;
 
 	SMBIODEBUG("\n");
 	internal = (rqp->sr_flags & SMBR_INTERNAL) != 0;
 
 	SMB_IOD_RQLOCK(iod);
 	if (!internal) {
 		while ((rqp->sr_flags & SMBR_XLOCK) != 0) {
 			rqp->sr_flags |= SMBR_XLOCKWANT;
 			msleep(rqp, SMB_IOD_RQLOCKPTR(iod), PWAIT, "90xrm", 0);
 		}
 	}
 	TAILQ_REMOVE(&iod->iod_rqlist, rqp, sr_link);
 	if (!internal) {
 		iod->iod_muxcnt--;
 		if (iod->iod_muxwant != 0) {
 			iod->iod_muxwant--;
 			wakeup(&iod->iod_muxwant);
 		}
 	}
 	SMB_IOD_RQUNLOCK(iod);
 	return (0);
 }
 
 /*
  * Wait for the result of a queued request and return its last error.
  *
  * Internal requests drive the send/receive machinery themselves until a
  * new result shows up.  Other requests sleep until woken on sr_state;
  * afterwards a multi-packet request is moved to the tail of the list
  * while any other request is removed from it.
  */
 int
 smb_iod_waitrq(struct smb_rq *rqp)
 {
 	struct smbiod *iod = rqp->sr_vc->vc_iod;
 	int error;
 
 	SMBIODEBUG("\n");
 	if (rqp->sr_flags & SMBR_INTERNAL) {
 		/* Poll: send, receive, and sleep up to a second in between. */
 		for (;;) {
 			smb_iod_sendall(iod);
 			smb_iod_recvall(iod);
 			if (rqp->sr_rpgen != rqp->sr_rplast)
 				break;
 			tsleep(&iod->iod_flags, PWAIT, "90irq", hz);
 		}
 		smb_iod_removerq(rqp);
 		return rqp->sr_lerror;
 
 	}
 	SMBRQ_SLOCK(rqp);
 	/* Equal counters mean no result has been posted since the last wait. */
 	if (rqp->sr_rpgen == rqp->sr_rplast)
 		msleep(&rqp->sr_state, SMBRQ_SLOCKPTR(rqp), PWAIT, "90wrq", 0);
 	rqp->sr_rplast++;
 	SMBRQ_SUNLOCK(rqp);
 	error = rqp->sr_lerror;
 	if (rqp->sr_flags & SMBR_MULTIPACKET) {
 		/*
 		 * If the request should stay in the list, then reinsert it
 		 * at the end of the queue so other waiters get a chance to
 		 * compete.
 		 */
 		SMB_IOD_RQLOCK(iod);
 		TAILQ_REMOVE(&iod->iod_rqlist, rqp, sr_link);
 		TAILQ_INSERT_TAIL(&iod->iod_rqlist, rqp, sr_link);
 		SMB_IOD_RQUNLOCK(iod);
 	} else
 		smb_iod_removerq(rqp);
 	return error;
 }
 
 
 /*
  * Walk the request list once: send every request that has not been sent
  * yet and time out requests that were sent more than twice the transport
  * timeout ago.  The walk stops at the first send that reports an error;
  * if that error is ENOTCONN the iod is marked dead.  Always returns 0.
  */
 static int
 smb_iod_sendall(struct smbiod *iod)
 {
 	struct smb_vc *vcp = iod->iod_vc;
 	struct smb_rq *rqp;
 	struct timespec ts, tstimeout;
 	int herror;
 
 	herror = 0;
 	/*
 	 * Loop through the list of requests and send them if possible
 	 */
 	SMB_IOD_RQLOCK(iod);
 	TAILQ_FOREACH(rqp, &iod->iod_rqlist, sr_link) {
 		switch (rqp->sr_state) {
 		    case SMBRQ_NOTSENT:
 			/*
 			 * The list lock is dropped around the send;
 			 * SMBR_XLOCK makes smb_iod_removerq() wait so the
 			 * request stays on the list meanwhile.
 			 */
 			rqp->sr_flags |= SMBR_XLOCK;
 			SMB_IOD_RQUNLOCK(iod);
 			herror = smb_iod_sendrq(iod, rqp);
 			SMB_IOD_RQLOCK(iod);
 			rqp->sr_flags &= ~SMBR_XLOCK;
 			if (rqp->sr_flags & SMBR_XLOCKWANT) {
 				rqp->sr_flags &= ~SMBR_XLOCKWANT;
 				wakeup(rqp);
 			}
 			break;
 		    case SMBRQ_SENT:
 			/* ts = now - 2 * transport timeout */
 			SMB_TRAN_GETPARAM(vcp, SMBTP_TIMEOUT, &tstimeout);
-			timespecadd(&tstimeout, &tstimeout);
+			timespecadd(&tstimeout, &tstimeout, &tstimeout);
 			getnanotime(&ts);
-			timespecsub(&ts, &tstimeout);
+			timespecsub(&ts, &tstimeout, &ts);
 			if (timespeccmp(&ts, &rqp->sr_timesent, >)) {
 				smb_iod_rqprocessed(rqp, ETIMEDOUT);
 			}
 			break;
 		    default:
 			break;
 		}
 		if (herror)
 			break;
 	}
 	SMB_IOD_RQUNLOCK(iod);
 	if (herror == ENOTCONN)
 		smb_iod_dead(iod);
 	return 0;
 }
 
 /*
  * "main" function for smbiod daemon: one pass of the iod thread's work.
  *
  * Drains the event list, handling each event according to its type, then
  * sends pending requests and processes incoming packets.
  */
 static __inline void
 smb_iod_main(struct smbiod *iod)
 {
 /*	struct smb_vc *vcp = iod->iod_vc;*/
 	struct smbiod_event *evp;
 /*	struct timespec tsnow;*/
 	int error;
 
 	SMBIODEBUG("\n");
 	error = 0;
 
 	/*
 	 * Check all interesting events
 	 */
 	for (;;) {
 		/* Dequeue one event under the event lock, handle it unlocked. */
 		SMB_IOD_EVLOCK(iod);
 		evp = STAILQ_FIRST(&iod->iod_evlist);
 		if (evp == NULL) {
 			SMB_IOD_EVUNLOCK(iod);
 			break;
 		}
 		STAILQ_REMOVE_HEAD(&iod->iod_evlist, ev_link);
 		evp->ev_type |= SMBIOD_EV_PROCESSING;
 		SMB_IOD_EVUNLOCK(iod);
 		switch (evp->ev_type & SMBIOD_EV_MASK) {
 		    case SMBIOD_EV_CONNECT:
 			iod->iod_state = SMBIOD_ST_RECONNECT;
 			evp->ev_error = smb_iod_connect(iod);
 			break;
 		    case SMBIOD_EV_DISCONNECT:
 			evp->ev_error = smb_iod_disconnect(iod);
 			break;
 		    case SMBIOD_EV_TREECONNECT:
 			evp->ev_error = smb_iod_treeconnect(iod, evp->ev_ident);
 			break;
 		    case SMBIOD_EV_SHUTDOWN:
 			iod->iod_flags |= SMBIOD_SHUTDOWN;
 			break;
 		    case SMBIOD_EV_NEWRQ:
 			break;
 		}
 		/*
 		 * A synchronous event belongs to the requester sleeping in
 		 * smb_iod_request(), who reads ev_error and frees it; an
 		 * asynchronous one is freed here.
 		 */
 		if (evp->ev_type & SMBIOD_EV_SYNC) {
 			SMB_IOD_EVLOCK(iod);
 			wakeup(evp);
 			SMB_IOD_EVUNLOCK(iod);
 		} else
 			free(evp, M_SMBIOD);
 	}
 	/* Disabled: echo request when nothing was sent for iod_pingtimo. */
 #if 0
 	if (iod->iod_state == SMBIOD_ST_VCACTIVE) {
 		getnanotime(&tsnow);
-		timespecsub(&tsnow, &iod->iod_pingtimo);
+		timespecsub(&tsnow, &iod->iod_pingtimo, &tsnow);
 		if (timespeccmp(&tsnow, &iod->iod_lastrqsent, >)) {
 			smb_smb_echo(vcp, &iod->iod_scred);
 		}
 	}
 #endif
 	smb_iod_sendall(iod);
 	smb_iod_recvall(iod);
 	return;
 }
 
 /*
  * Body of the smbiod kernel process started by smb_iod_create().
  *
  * Runs smb_iod_main() in a loop, sleeping up to iod_sleeptimo ticks
  * between passes, until SMBIOD_SHUTDOWN is set in iod_flags.  On the way
  * out it destroys the iod's locks, frees the iod and exits the process.
  * Giant is held for the whole lifetime of the loop.
  */
 void
 smb_iod_thread(void *arg)
 {
 	struct smbiod *iod = arg;
 
 	mtx_lock(&Giant);
 
 	/*
 	 * Here we assume that the thread structure will be the same
 	 * for an entire kthread (kproc, to be more precise) life.
 	 */
 	iod->iod_td = curthread;
 	smb_makescred(&iod->iod_scred, iod->iod_td, NULL);
 	while ((iod->iod_flags & SMBIOD_SHUTDOWN) == 0) {
 		smb_iod_main(iod);
 		SMBIODEBUG("going to sleep for %d ticks\n", iod->iod_sleeptimo);
 		/* smb_iod_main() may just have handled a shutdown event. */
 		if (iod->iod_flags & SMBIOD_SHUTDOWN)
 			break;
 		/* smb_iod_wakeup() ends this sleep early. */
 		tsleep(&iod->iod_flags, PWAIT, "90idle", iod->iod_sleeptimo);
 	}
 
 	/* We can now safely destroy the mutexes and free the iod structure. */
 	smb_sl_destroy(&iod->iod_rqlock);
 	smb_sl_destroy(&iod->iod_evlock);
 	free(iod, M_SMBIOD);
 	mtx_unlock(&Giant);
 	kproc_exit(0);
 }
 
 /*
  * Allocate and initialise the iod for the virtual circuit vcp and start
  * its kernel process ("smbiod<id>").
  *
  * Returns 0 on success.  If the process cannot be created, the iod is
  * torn down again, vcp->vc_iod is reset to NULL and the error from
  * kproc_create() is returned.
  */
 int
 smb_iod_create(struct smb_vc *vcp)
 {
 	struct smbiod *iod;
 	int error;
 
 	iod = smb_zmalloc(sizeof(*iod), M_SMBIOD, M_WAITOK);
 	iod->iod_id = smb_iod_next++;
 	iod->iod_state = SMBIOD_ST_NOTCONN;
 	iod->iod_vc = vcp;
 	iod->iod_sleeptimo = hz * SMBIOD_SLEEP_TIMO;
 	iod->iod_pingtimo.tv_sec = SMBIOD_PING_TIMO;
 	getnanotime(&iod->iod_lastrqsent);
 	vcp->vc_iod = iod;
 
 	/* Request list and event list, each with its own lock. */
 	smb_sl_init(&iod->iod_rqlock, "90rql");
 	TAILQ_INIT(&iod->iod_rqlist);
 	smb_sl_init(&iod->iod_evlock, "90evl");
 	STAILQ_INIT(&iod->iod_evlist);
 
 	error = kproc_create(smb_iod_thread, iod, &iod->iod_p,
 	    RFNOWAIT, 0, "smbiod%d", iod->iod_id);
 	if (error == 0)
 		return (0);
 
 	/* No process was started: undo everything done above. */
 	SMBERROR("can't start smbiod: %d", error);
 	vcp->vc_iod = NULL;
 	smb_sl_destroy(&iod->iod_rqlock);
 	smb_sl_destroy(&iod->iod_evlock);
 	free(iod, M_SMBIOD);
 	return (error);
 }
 
 /*
  * Ask the iod thread to shut down and wait until the request has been
  * handled.  The result of the request is ignored; always returns 0.
  */
 int
 smb_iod_destroy(struct smbiod *iod)
 {
 	(void)smb_iod_request(iod, SMBIOD_EV_SHUTDOWN | SMBIOD_EV_SYNC, NULL);
 	return (0);
 }
 
 /*
  * Subsystem initialisation hook; there is nothing to set up.
  * Always returns 0.
  */
 int
 smb_iod_init(void)
 {
 	return (0);
 }
 
 /*
  * Subsystem teardown hook; there is nothing to release.
  * Always returns 0.
  */
 int
 smb_iod_done(void)
 {
 	return (0);
 }
 
Index: head/sys/netsmb/smb_trantcp.c
===================================================================
--- head/sys/netsmb/smb_trantcp.c	(revision 336913)
+++ head/sys/netsmb/smb_trantcp.c	(revision 336914)
@@ -1,696 +1,695 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2000-2001 Boris Popov
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/condvar.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/poll.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/signalvar.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/uio.h>
 
 #include <net/if.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/tcp.h>
 
 #include <sys/mchain.h>
 
 #include <netsmb/netbios.h>
 
 #include <netsmb/smb.h>
 #include <netsmb/smb_conn.h>
 #include <netsmb/smb_tran.h>
 #include <netsmb/smb_trantcp.h>
 #include <netsmb/smb_subr.h>
 
 #define M_NBDATA	M_PCB
 
 static int smb_tcpsndbuf = NB_SNDQ - 1;
 static int smb_tcprcvbuf = NB_RCVQ - 1;
 
 SYSCTL_DECL(_net_smb);
 SYSCTL_INT(_net_smb, OID_AUTO, tcpsndbuf, CTLFLAG_RW, &smb_tcpsndbuf, 0, "");
 SYSCTL_INT(_net_smb, OID_AUTO, tcprcvbuf, CTLFLAG_RW, &smb_tcprcvbuf, 0, "");
 
 #define nb_sosend(so,m,flags,td) sosend(so, NULL, 0, m, 0, flags, td)
 
 static int  nbssn_recv(struct nbpcb *nbp, struct mbuf **mpp, int *lenp,
 	u_int8_t *rpcodep, struct thread *td);
 static int  smb_nbst_disconnect(struct smb_vc *vcp, struct thread *td);
 
 static int
 nb_setsockopt_int(struct socket *so, int level, int name, int val)
 {
 	struct sockopt sopt;
 	int error;
 
 	bzero(&sopt, sizeof(sopt));
 	sopt.sopt_level = level;
 	sopt.sopt_name = name;
 	sopt.sopt_val = &val;
 	sopt.sopt_valsize = sizeof(val);
 	CURVNET_SET(so->so_vnet);
 	error = sosetopt(so, &sopt);
 	CURVNET_RESTORE();
 	return error;
 }
 
 /*
  * Interruption check used while waiting for a connect to complete.
  * Currently a stub: both arguments are ignored and 0 is always returned.
  */
 static int
 nb_intr(struct nbpcb *nbp, struct proc *p)
 {
 
 	return (0);
 }
 
 /*
  * Socket upcall (installed on the receive buffer by nb_connect_in()).
  * If the pcb passed as arg has a select id registered, wake up whoever
  * sleeps on it.  Always returns SU_OK.
  */
 static int
 nb_upcall(struct socket *so, void *arg, int waitflag)
 {
 	struct nbpcb *nbp = arg;
 
 	if (nbp != NULL && nbp->nbp_selectid != NULL)
 		wakeup(nbp->nbp_selectid);
 	return (SU_OK);
 }
 
 /*
  * Store the 4-byte NetBIOS session header at the start of the mbuf's data.
  *
  * The header is one 32-bit word in network byte order: the packet type in
  * the top byte and the low 17 bits of len in the low bits.  Always
  * returns 0.
  */
 static int
 nb_sethdr(struct mbuf *m, u_int8_t type, u_int32_t len)
 {
 	u_int32_t *p = mtod(m, u_int32_t *);
 
 	/*
 	 * Widen type before shifting: as a promoted (signed) int, any type
 	 * value >= 0x80 shifted left by 24 overflows, which is undefined.
 	 */
 	*p = htonl((len & 0x1FFFF) | ((u_int32_t)type << 24));
 	return 0;
 }
 
 /*
  * Append the encoded NetBIOS name held in snb->snb_name to an mbuf chain.
  *
  * The name is a sequence of length-prefixed labels ended by a zero-length
  * label; each label is copied together with its length byte, and the
  * terminating zero byte is copied too.
  *
  * Returns 0 on success, EINVAL if the name is empty or contains a label
  * whose length byte exceeds 63, or the error from mb_put_mem().
  */
 static int
 nb_put_name(struct mbchain *mbp, struct sockaddr_nb *snb)
 {
 	int error;
 	u_char seglen, *cp;
 
 	cp = snb->snb_name;
 	if (*cp == 0)
 		return EINVAL;
 	NBDEBUG("[%s]\n", cp);
 	for (;;) {
 		/*
 		 * A label is at most 63 bytes long.  Rejecting larger length
 		 * bytes also keeps seglen from wrapping to 0 for 0xFF, which
 		 * would otherwise make this loop spin forever.
 		 */
 		if (*cp > 0x3f)
 			return EINVAL;
 		seglen = (*cp) + 1;
 		error = mb_put_mem(mbp, cp, seglen, MB_MSYSTEM);
 		if (error)
 			return error;
 		if (seglen == 1)
 			break;
 		cp += seglen;
 	}
 	return 0;
 }
 
 /*
  * Create a TCP socket for the pcb and connect it to the given address.
  *
  * The new socket is stored in nbp->nbp_tso, gets nb_upcall() as its
  * receive upcall, 5 second send/receive timeouts, the pcb's buffer
  * reservations, SO_KEEPALIVE and TCP_NODELAY.  After soconnect() the
  * function sleeps in 2*hz slices until the socket leaves SS_ISCONNECTING
  * or reports an error.
  *
  * Returns 0 once connected.  If socreate() fails its error is returned
  * directly; on any later failure the socket is torn down through
  * smb_nbst_disconnect() and the error is returned.
  */
 static int
 nb_connect_in(struct nbpcb *nbp, struct sockaddr_in *to, struct thread *td)
 {
 	struct socket *so;
 	int error, s;
 
 	error = socreate(AF_INET, &so, SOCK_STREAM, IPPROTO_TCP,
 	    td->td_ucred, td);
 	if (error)
 		return error;
 	/* From here on the "bad" path can find and close the socket. */
 	nbp->nbp_tso = so;
 	SOCKBUF_LOCK(&so->so_rcv);
 	soupcall_set(so, SO_RCV, nb_upcall, nbp);
 	SOCKBUF_UNLOCK(&so->so_rcv);
 	so->so_rcv.sb_timeo = (5 * SBT_1S);
 	so->so_snd.sb_timeo = (5 * SBT_1S);
 	error = soreserve(so, nbp->nbp_sndbuf, nbp->nbp_rcvbuf);
 	if (error)
 		goto bad;
 	/* Best effort: failures to set these options are ignored. */
 	nb_setsockopt_int(so, SOL_SOCKET, SO_KEEPALIVE, 1);
 	nb_setsockopt_int(so, IPPROTO_TCP, TCP_NODELAY, 1);
 	SOCKBUF_LOCK(&so->so_rcv);
 	so->so_rcv.sb_flags &= ~SB_NOINTR;
 	SOCKBUF_UNLOCK(&so->so_rcv);
 	SOCKBUF_LOCK(&so->so_snd);
 	so->so_snd.sb_flags &= ~SB_NOINTR;
 	SOCKBUF_UNLOCK(&so->so_snd);
 	error = soconnect(so, (struct sockaddr*)to, td);
 	if (error)
 		goto bad;
 	s = splnet();
 	/* Wait for the connect to finish, re-checking every 2*hz ticks. */
 	while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
 		tsleep(&so->so_timeo, PSOCK, "nbcon", 2 * hz);
 		/* nb_intr() currently always returns 0, so this never fires. */
 		if ((so->so_state & SS_ISCONNECTING) && so->so_error == 0 &&
 			(error = nb_intr(nbp, td->td_proc)) != 0) {
 			so->so_state &= ~SS_ISCONNECTING;
 			splx(s);
 			goto bad;
 		}
 	}
 	if (so->so_error) {
 		/* Consume the pending socket error and report it. */
 		error = so->so_error;
 		so->so_error = 0;
 		splx(s);
 		goto bad;
 	}
 	splx(s);
 	return 0;
 bad:
 	smb_nbst_disconnect(nbp->nbp_vc, td);
 	return error;
 }
 
 static int
 nbssn_rq_request(struct nbpcb *nbp, struct thread *td)
 {
 	struct mbchain *mbp;
 	struct mdchain *mdp;
 	struct mbuf *m0;
 	struct timeval tv;
 	struct sockaddr_in sin;
 	u_short port;
 	u_int8_t rpcode;
 	int error, rplen;
 
 	mbp = malloc(sizeof(struct mbchain), M_NBDATA, M_WAITOK);
 	mdp = malloc(sizeof(struct mbchain), M_NBDATA, M_WAITOK);
 	error = mb_init(mbp);
 	if (error) {
 		free(mbp, M_NBDATA);
 		free(mdp, M_NBDATA);
 		return error;
 	}
 	mb_put_uint32le(mbp, 0);
 	nb_put_name(mbp, nbp->nbp_paddr);
 	nb_put_name(mbp, nbp->nbp_laddr);
 	nb_sethdr(mbp->mb_top, NB_SSN_REQUEST, mb_fixhdr(mbp) - 4);
 	error = nb_sosend(nbp->nbp_tso, mbp->mb_top, 0, td);
 	if (!error) {
 		nbp->nbp_state = NBST_RQSENT;
 	}
 	mb_detach(mbp);
 	mb_done(mbp);
 	free(mbp, M_NBDATA);
 	if (error) {
 		free(mdp, M_NBDATA);
 		return error;
 	}
 	TIMESPEC_TO_TIMEVAL(&tv, &nbp->nbp_timo);
 	error = selsocket(nbp->nbp_tso, POLLIN, &tv, td);
 	if (error == EWOULDBLOCK) {	/* Timeout */
 		NBDEBUG("initial request timeout\n");
 		free(mdp, M_NBDATA);
 		return ETIMEDOUT;
 	}
 	if (error) {			/* restart or interrupt */
 		free(mdp, M_NBDATA);
 		return error;
 	}
 	error = nbssn_recv(nbp, &m0, &rplen, &rpcode, td);
 	if (error) {
 		NBDEBUG("recv() error %d\n", error);
 		free(mdp, M_NBDATA);
 		return error;
 	}
 	/*
 	 * Process NETBIOS reply
 	 */
 	if (m0)
 		md_initm(mdp, m0);
 	error = 0;
 	do {
 		if (rpcode == NB_SSN_POSRESP) {
 			nbp->nbp_state = NBST_SESSION;
 			nbp->nbp_flags |= NBF_CONNECTED;
 			break;
 		}
 		if (rpcode != NB_SSN_RTGRESP) {
 			error = ECONNABORTED;
 			break;
 		}
 		if (rplen != 6) {
 			error = ECONNABORTED;
 			break;
 		}
 		md_get_mem(mdp, (caddr_t)&sin.sin_addr, 4, MB_MSYSTEM);
 		md_get_uint16(mdp, &port);
 		sin.sin_port = port;
 		nbp->nbp_state = NBST_RETARGET;
 		smb_nbst_disconnect(nbp->nbp_vc, td);
 		error = nb_connect_in(nbp, &sin, td);
 		if (!error)
 			error = nbssn_rq_request(nbp, td);
 		if (error) {
 			smb_nbst_disconnect(nbp->nbp_vc, td);
 			break;
 		}
 	} while(0);
 	if (m0)
 		md_done(mdp);
 	free(mdp, M_NBDATA);
 	return error;
 }
 
 /*
  * Read one 4-byte NetBIOS session header from the socket.
  *
  * flags is passed to soreceive() (and may be updated by it).  On success
  * *rpcodep receives the packet type from the top byte and *lenp the
  * 17-bit payload length.
  *
  * Returns 0 on success, the soreceive() error, EPIPE if fewer than four
  * bytes were read, or EFBIG if the length exceeds SMB_MAXPKTLEN (in which
  * case *rpcodep has already been written).
  */
 static int
 nbssn_recvhdr(struct nbpcb *nbp, int *lenp,
 	u_int8_t *rpcodep, int flags, struct thread *td)
 {
 	struct socket *so = nbp->nbp_tso;
 	struct iovec hiov;
 	struct uio huio;
 	u_int32_t hdr;
 	int error;
 
 	/* A single kernel-space segment covering the header word. */
 	hiov.iov_base = (caddr_t)&hdr;
 	hiov.iov_len = sizeof(hdr);
 	huio.uio_iov = &hiov;
 	huio.uio_iovcnt = 1;
 	huio.uio_offset = 0;
 	huio.uio_resid = sizeof(hdr);
 	huio.uio_segflg = UIO_SYSSPACE;
 	huio.uio_rw = UIO_READ;
 	huio.uio_td = td;
 
 	CURVNET_SET(so->so_vnet);
 	error = soreceive(so, (struct sockaddr **)NULL, &huio,
 	    (struct mbuf **)NULL, (struct mbuf **)NULL, &flags);
 	CURVNET_RESTORE();
 	if (error)
 		return (error);
 	if (huio.uio_resid > 0) {
 		SMBSDEBUG("short reply\n");
 		return (EPIPE);
 	}
 
 	/* Split the host-order word into type and length. */
 	hdr = ntohl(hdr);
 	*rpcodep = (hdr >> 24) & 0xFF;
 	hdr &= 0x1ffff;
 	if (hdr > SMB_MAXPKTLEN) {
 		SMBERROR("packet too long (%d)\n", hdr);
 		return (EFBIG);
 	}
 	*lenp = hdr;
 	return (0);
 }
 
 /*
  * Receive one NetBIOS session packet.
  *
  * Headers are read without blocking (MSG_DONTWAIT); the payload that
  * follows a header is then read with MSG_WAITALL in chunks of at most
  * NB_SORECEIVE_CHUNK bytes.  Keepalive packets are skipped.  In
  * NBST_SESSION state only NB_SSN_MESSAGE packets are returned and other
  * packets are dropped; in any other state the first non-keepalive packet
  * is returned.
  *
  * On success returns 0 with *lenp and *rpcodep describing the packet and,
  * if mpp is not NULL, the payload chain in *mpp (otherwise the payload is
  * freed).  Returns ENOTCONN if the pcb has no socket, ECONNRESET if the
  * peer closed the session (the pcb state becomes NBST_CLOSED), EPIPE on a
  * short payload, or the error from the header/payload read.
  */
 static int
 nbssn_recv(struct nbpcb *nbp, struct mbuf **mpp, int *lenp,
 	u_int8_t *rpcodep, struct thread *td)
 {
 	struct socket *so = nbp->nbp_tso;
 	struct uio auio;
 	struct mbuf *m, *tm, *im;
 	u_int8_t rpcode;
 	int len, resid;
 	int error, rcvflg;
 
 	if (so == NULL)
 		return ENOTCONN;
 
 	if (mpp)
 		*mpp = NULL;
 	m = NULL;
 	for(;;) {
 		/*
 		 * Poll for a response header.
 		 * If we don't have one waiting, return.
 		 */
 		len = 0;
 		rpcode = 0;
 		error = nbssn_recvhdr(nbp, &len, &rpcode, MSG_DONTWAIT, td);
 		/* A closed connection takes precedence over the read error. */
 		if ((so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) ||
 		    (so->so_rcv.sb_state & SBS_CANTRCVMORE)) {
 			nbp->nbp_state = NBST_CLOSED;
 			NBDEBUG("session closed by peer\n");
 			return ECONNRESET;
 		}
 		if (error)
 			return error;
 		/* Outside a session an empty packet is returned as is. */
 		if (len == 0 && nbp->nbp_state != NBST_SESSION)
 			break;
 		/* no data, try again */
 		if (rpcode == NB_SSN_KEEPALIVE)
 			continue;
 
 		/*
 		 * Loop, blocking, for data following the response header.
 		 *
 		 * Note that we can't simply block here with MSG_WAITALL for the
 		 * entire response size, as it may be larger than the TCP
 		 * slow-start window that the sender employs.  This will result
 		 * in the sender stalling until the delayed ACK is sent, then
 		 * resuming slow-start, resulting in very poor performance.
 		 *
 		 * Instead, we never request more than NB_SORECEIVE_CHUNK
 		 * bytes at a time, resulting in an ack being pushed by
 		 * the TCP code at the completion of each call.
 		 */
 		resid = len;
 		while (resid > 0) {
 			tm = NULL;
 			rcvflg = MSG_WAITALL;
 			bzero(&auio, sizeof(auio));
 			auio.uio_resid = min(resid, NB_SORECEIVE_CHUNK);
 			auio.uio_td = td;
 			resid -= auio.uio_resid;
 			/*
 			 * Spin until we have collected everything in
 			 * this chunk.
 			 */
 			do {
 				rcvflg = MSG_WAITALL;
 				CURVNET_SET(so->so_vnet);
 				error = soreceive(so, (struct sockaddr **)NULL,
 				    &auio, &tm, (struct mbuf **)NULL, &rcvflg);
 				CURVNET_RESTORE();
 			} while (error == EWOULDBLOCK || error == EINTR ||
 				 error == ERESTART);
 			if (error)
 				goto out;
 			/* short return guarantees unhappiness */
 			if (auio.uio_resid > 0) {
 				SMBERROR("packet is shorter than expected\n");
 				error = EPIPE;
 				goto out;
 			}
 			/* append received chunk to previous chunk(s) */
 			if (m == NULL) {
 				m = tm;
 			} else {
 				/*
 				 * Just glue the new chain on the end.
 				 * Consumer will pullup as required.
 				 */
 				for (im = m; im->m_next != NULL; im = im->m_next)
 					;
 				im->m_next = tm;
 			}
 		}
 		/* got a session/message packet? */
 		if (nbp->nbp_state == NBST_SESSION &&
 		    rpcode == NB_SSN_MESSAGE)
 			break;
 		/* drop packet and try for another */
 		NBDEBUG("non-session packet %x\n", rpcode);
 		if (m) {
 			m_freem(m);
 			m = NULL;
 		}
 	}
 
 out:
 	/* Reached with error == 0 via "break", or with an error via goto. */
 	if (error) {
 		if (m)
 			m_freem(m);
 		return error;
 	}
 	if (mpp)
 		*mpp = m;
 	else
 		m_freem(m);
 	*lenp = len;
 	*rpcodep = rpcode;
 	return 0;
 }
 
 /*
  * SMB transport interface
  */
 /*
  * Transport "create" method: allocate a zeroed NetBIOS pcb for the
  * virtual circuit and hang it off vcp->vc_tdata.  The pcb starts in
  * NBST_CLOSED with a 15 second timeout and the buffer sizes taken from
  * the tcpsndbuf/tcprcvbuf sysctl variables.  Always returns 0.
  */
 static int
 smb_nbst_create(struct smb_vc *vcp, struct thread *td)
 {
 	struct nbpcb *pcb;
 
 	pcb = malloc(sizeof(*pcb), M_NBDATA, M_WAITOK);
 	bzero(pcb, sizeof(*pcb));
 
 	pcb->nbp_vc = vcp;
 	pcb->nbp_state = NBST_CLOSED;
 	pcb->nbp_timo.tv_sec = 15;	/* XXX: sysctl ? */
 	pcb->nbp_sndbuf = smb_tcpsndbuf;
 	pcb->nbp_rcvbuf = smb_tcprcvbuf;
 
 	vcp->vc_tdata = pcb;
 	return (0);
 }
 
 /*
  * Transport "done" method: disconnect the circuit and release the pcb
  * together with its local and peer address copies.
  *
  * Returns ENOTCONN if the circuit has no pcb, 0 otherwise.  Note that
  * vcp->vc_tdata is not cleared here.
  */
 static int
 smb_nbst_done(struct smb_vc *vcp, struct thread *td)
 {
 	struct nbpcb *pcb;
 
 	pcb = vcp->vc_tdata;
 	if (pcb == NULL)
 		return (ENOTCONN);
 
 	smb_nbst_disconnect(vcp, td);
 	if (pcb->nbp_laddr != NULL)
 		free(pcb->nbp_laddr, M_SONAME);
 	if (pcb->nbp_paddr != NULL)
 		free(pcb->nbp_paddr, M_SONAME);
 	free(pcb, M_NBDATA);
 	return (0);
 }
 
 /*
  * Transport "bind" method: record a private copy of the local NetBIOS
  * address in the pcb.
  *
  * Returns EINVAL if a local address is already set, if sap is NULL or if
  * it is shorter than NB_MINSALEN; ENOMEM if the copy could not be made;
  * 0 on success.
  */
 static int
 smb_nbst_bind(struct smb_vc *vcp, struct sockaddr *sap, struct thread *td)
 {
 	struct nbpcb *nbp = vcp->vc_tdata;
 	struct sockaddr_nb *copy;
 	int salen;
 
 	NBDEBUG("\n");
 	if (nbp->nbp_flags & NBF_LOCADDR)
 		return (EINVAL);
 	/*
 	 * It is possible to create NETBIOS name in the kernel,
 	 * but nothing prevents us to do it in the user space.
 	 */
 	if (sap == NULL)
 		return (EINVAL);
 	salen = sap->sa_len;
 	if (salen < NB_MINSALEN)
 		return (EINVAL);
 
 	copy = (struct sockaddr_nb*)sodupsockaddr(sap, M_WAITOK);
 	if (copy == NULL)
 		return (ENOMEM);
 	nbp->nbp_laddr = copy;
 	nbp->nbp_flags |= NBF_LOCADDR;
 	return (0);
 }
 
 /*
  * Transport "connect" method: connect to the peer named by sap and open
  * a NetBIOS session with it.
  *
  * A copy of sap becomes the pcb's peer address (replacing any earlier
  * one).  The time the TCP connect took is measured and the pcb timeout is
  * set to four times that value, with the measured value replaced by one
  * second when its seconds field is zero.
  *
  * Returns EISCONN if a socket already exists, EINVAL if no local address
  * was bound or sap is shorter than NB_MINSALEN, ENOMEM if the address
  * cannot be copied, or the error from the connect / session request.
  */
 static int
 smb_nbst_connect(struct smb_vc *vcp, struct sockaddr *sap, struct thread *td)
 {
 	struct nbpcb *nbp = vcp->vc_tdata;
 	struct sockaddr_in sin;
 	struct sockaddr_nb *snb;
 	struct timespec ts1, ts2;
 	int error, slen;
 
 	NBDEBUG("\n");
 	if (nbp->nbp_tso != NULL)
 		return EISCONN;
 	if (nbp->nbp_laddr == NULL)
 		return EINVAL;
 	slen = sap->sa_len;
 	if (slen < NB_MINSALEN)
 		return EINVAL;
 	if (nbp->nbp_paddr) {
 		free(nbp->nbp_paddr, M_SONAME);
 		nbp->nbp_paddr = NULL;
 	}
 	snb = (struct sockaddr_nb*)sodupsockaddr(sap, M_WAITOK);
 	if (snb == NULL)
 		return ENOMEM;
 	nbp->nbp_paddr = snb;
 	/* The IPv4 address to connect to is embedded in the NetBIOS one. */
 	sin = snb->snb_addrin;
 	getnanotime(&ts1);
 	error = nb_connect_in(nbp, &sin, td);
 	if (error)
 		return error;
 	getnanotime(&ts2);
 	/* ts2 becomes the elapsed connect time. */
-	timespecsub(&ts2, &ts1);
+	timespecsub(&ts2, &ts1, &ts2);
 	if (ts2.tv_sec == 0) {
 		ts2.tv_sec = 1;
 		ts2.tv_nsec = 0;
 	}
 	/* nbp_timo = ts2 + ts2, then two more additions of ts2. */
-	nbp->nbp_timo = ts2;
-	timespecadd(&nbp->nbp_timo, &ts2);
-	timespecadd(&nbp->nbp_timo, &ts2);
-	timespecadd(&nbp->nbp_timo, &ts2);	/*  * 4 */
+	timespecadd(&ts2, &ts2, &nbp->nbp_timo);
+	timespecadd(&nbp->nbp_timo, &ts2, &nbp->nbp_timo);
+	timespecadd(&nbp->nbp_timo, &ts2, &nbp->nbp_timo);	/*  * 4 */
 	error = nbssn_rq_request(nbp, td);
 	if (error)
 		smb_nbst_disconnect(vcp, td);
 	return error;
 }
 
 /*
  * Transport "disconnect" method: shut down and close the pcb's socket.
  *
  * The socket pointer is cleared from the pcb before the socket is closed.
  * The pcb state is set to NBST_CLOSED unless it is NBST_RETARGET, which
  * is left untouched.
  *
  * Returns ENOTCONN if there is no pcb or no socket, 0 otherwise.
  */
 static int
 smb_nbst_disconnect(struct smb_vc *vcp, struct thread *td)
 {
 	struct nbpcb *nbp = vcp->vc_tdata;
 	struct socket *so;
 
 	if (nbp == NULL)
 		return (ENOTCONN);
 	so = nbp->nbp_tso;
 	if (so == NULL)
 		return (ENOTCONN);
 
 	nbp->nbp_flags &= ~NBF_CONNECTED;
 	nbp->nbp_tso = (struct socket *)NULL;
 	soshutdown(so, 2);
 	soclose(so);
 
 	if (nbp->nbp_state != NBST_RETARGET)
 		nbp->nbp_state = NBST_CLOSED;
 	return (0);
 }
 
 static int
 smb_nbst_send(struct smb_vc *vcp, struct mbuf *m0, struct thread *td)
 {
 	struct nbpcb *nbp = vcp->vc_tdata;
 	int error;
 
 	if (nbp->nbp_state != NBST_SESSION) {
 		error = ENOTCONN;
 		goto abort;
 	}
 	M_PREPEND(m0, 4, M_WAITOK);
 	nb_sethdr(m0, NB_SSN_MESSAGE, m_fixhdr(m0) - 4);
 	error = nb_sosend(nbp->nbp_tso, m0, 0, td);
 	return error;
 abort:
 	if (m0)
 		m_freem(m0);
 	return error;
 }
 
 
 /*
  * Transport "recv" method: receive one session packet into *mpp.
  * NBF_RECVLOCK is set in the pcb flags for the duration of the call.
  * The packet length and type reported by nbssn_recv() are discarded.
  */
 static int
 smb_nbst_recv(struct smb_vc *vcp, struct mbuf **mpp, struct thread *td)
 {
 	struct nbpcb *nbp = vcp->vc_tdata;
 	u_int8_t type;
 	int len, rc;
 
 	nbp->nbp_flags |= NBF_RECVLOCK;
 	rc = nbssn_recv(nbp, mpp, &len, &type, td);
 	nbp->nbp_flags &= ~NBF_RECVLOCK;
 
 	return (rc);
 }
 
 /* Transport "timo" method: intentionally does nothing. */
 static void
 smb_nbst_timo(struct smb_vc *vcp)
 {
 }
 
 /*
  * Transport "intr" method: issue read and write wakeups on the pcb's
  * socket.  Does nothing when there is no pcb or no socket.
  */
 static void
 smb_nbst_intr(struct smb_vc *vcp)
 {
 	struct nbpcb *nbp = vcp->vc_tdata;
 
 	if (nbp != NULL && nbp->nbp_tso != NULL) {
 		sorwakeup(nbp->nbp_tso);
 		sowwakeup(nbp->nbp_tso);
 	}
 }
 
 /*
  * Transport "getparam" method: copy a pcb parameter into *data.
  *
  *   SMBTP_SNDSZ   -> int, send buffer size
  *   SMBTP_RCVSZ   -> int, receive buffer size
  *   SMBTP_TIMEOUT -> struct timespec, current timeout
  *
  * Returns 0 on success or EINVAL for any other parameter.
  */
 static int
 smb_nbst_getparam(struct smb_vc *vcp, int param, void *data)
 {
 	struct nbpcb *nbp = vcp->vc_tdata;
 
 	if (param == SMBTP_SNDSZ)
 		*(int*)data = nbp->nbp_sndbuf;
 	else if (param == SMBTP_RCVSZ)
 		*(int*)data = nbp->nbp_rcvbuf;
 	else if (param == SMBTP_TIMEOUT)
 		*(struct timespec*)data = nbp->nbp_timo;
 	else
 		return (EINVAL);
 	return (0);
 }
 
 /*
  * Transport "setparam" method.  Only SMBTP_SELECTID is supported: data is
  * stored as the pcb's select id (the channel nb_upcall() wakes up).
  * Returns 0 on success or EINVAL for any other parameter.
  */
 static int
 smb_nbst_setparam(struct smb_vc *vcp, int param, void *data)
 {
 	struct nbpcb *nbp = vcp->vc_tdata;
 
 	if (param != SMBTP_SELECTID)
 		return (EINVAL);
 	nbp->nbp_selectid = data;
 	return (0);
 }
 
 /*
  * Transport "fatal" method: tell whether an errno value is treated as
  * fatal.  Returns 1 for ENOTCONN, ENETRESET and ECONNABORTED, and 0 for
  * everything else.  vcp is not used.
  */
 static int
 smb_nbst_fatal(struct smb_vc *vcp, int error)
 {
 	if (error == ENOTCONN || error == ENETRESET ||
 	    error == ECONNABORTED)
 		return (1);
 	return (0);
 }
 
 
 /*
  * Transport descriptor for NetBIOS over TCP.
  *
  * The initializer is positional, so the entries below must stay in the
  * member order of struct smb_tran_desc (declared outside this file).
  */
 struct smb_tran_desc smb_tran_nbtcp_desc = {
 	SMBT_NBTCP,
 	smb_nbst_create, smb_nbst_done,
 	smb_nbst_bind, smb_nbst_connect, smb_nbst_disconnect,
 	smb_nbst_send, smb_nbst_recv,
 	smb_nbst_timo, smb_nbst_intr,
 	smb_nbst_getparam, smb_nbst_setparam,
 	smb_nbst_fatal
 };
 
Index: head/sys/opencrypto/crypto.c
===================================================================
--- head/sys/opencrypto/crypto.c	(revision 336913)
+++ head/sys/opencrypto/crypto.c	(revision 336914)
@@ -1,1853 +1,1853 @@
 /*-
  * Copyright (c) 2002-2006 Sam Leffler.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * Cryptographic Subsystem.
  *
  * This code is derived from the Openbsd Cryptographic Framework (OCF)
  * that has the copyright shown below.  Very little of the original
  * code remains.
  */
 
 /*-
  * The author of this code is Angelos D. Keromytis (angelos@cis.upenn.edu)
  *
  * This code was written by Angelos D. Keromytis in Athens, Greece, in
  * February 2000. Network Security Technologies Inc. (NSTI) kindly
  * supported the development of this code.
  *
  * Copyright (c) 2000, 2001 Angelos D. Keromytis
  *
  * Permission to use, copy, and modify this software with or without fee
  * is hereby granted, provided that this entire notice is included in
  * all source code copies of any software which is or includes a copy or
  * modification of this software.
  *
  * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR
  * IMPLIED WARRANTY. IN PARTICULAR, NONE OF THE AUTHORS MAKES ANY
  * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE
  * MERCHANTABILITY OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR
  * PURPOSE.
  */
 
 #define	CRYPTO_TIMING				/* enable timing support */
 
 #include "opt_ddb.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/eventhandler.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/linker.h>
 #include <sys/lock.h>
 #include <sys/module.h>
 #include <sys/mutex.h>
 #include <sys/malloc.h>
 #include <sys/proc.h>
 #include <sys/sdt.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 #include <sys/taskqueue.h>
 
 #include <ddb/ddb.h>
 
 #include <vm/uma.h>
 #include <crypto/intake.h>
 #include <opencrypto/cryptodev.h>
 #include <opencrypto/xform.h>			/* XXX for M_XDATA */
 
 #include <sys/kobj.h>
 #include <sys/bus.h>
 #include "cryptodev_if.h"
 
 #if defined(__i386__) || defined(__amd64__) || defined(__aarch64__)
 #include <machine/pcb.h>
 #endif
 
 /*
  * Per-session state kept by the framework.  The hid, capabilities and
  * softc members are handed out by crypto_ses2hid(), crypto_ses2caps()
  * and crypto_get_driver_session() respectively.
  */
 struct crypto_session {
 	device_t parent;		/* presumably the owning driver's device -- TODO confirm */
 	void *softc;			/* driver session data, see crypto_get_driver_session() */
 	uint32_t hid;			/* see crypto_ses2hid() */
 	uint32_t capabilities;		/* see crypto_ses2caps() */
 };
 
 SDT_PROVIDER_DEFINE(opencrypto);
 
 /*
  * Crypto drivers register themselves by allocating a slot in the
  * crypto_drivers table with crypto_get_driverid() and then registering
  * each algorithm they support with crypto_register() and crypto_kregister().
  */
 static	struct mtx crypto_drivers_mtx;		/* lock on driver table */
 #define	CRYPTO_DRIVER_LOCK()	mtx_lock(&crypto_drivers_mtx)
 #define	CRYPTO_DRIVER_UNLOCK()	mtx_unlock(&crypto_drivers_mtx)
 #define	CRYPTO_DRIVER_ASSERT()	mtx_assert(&crypto_drivers_mtx, MA_OWNED)
 
 /*
  * Crypto device/driver capabilities structure.
  *
  * Synchronization:
  * (d) - protected by CRYPTO_DRIVER_LOCK()
  * (q) - protected by CRYPTO_Q_LOCK()
  * Not tagged fields are read-only.
  */
 /* One slot of the crypto_drivers table; cc_dev == NULL marks it unused. */
 struct cryptocap {
 	device_t	cc_dev;			/* (d) device/driver */
 	u_int32_t	cc_sessions;		/* (d) # of sessions */
 	u_int32_t	cc_koperations;		/* (d) # of asym operations */
 	/*
 	 * Largest possible operator length (in bits) for each type of
 	 * encryption algorithm. XXX not used
 	 */
 	u_int16_t	cc_max_op_len[CRYPTO_ALGORITHM_MAX + 1];
 	/* Indexed by algorithm id; non-zero means supported (driver_suitable()). */
 	u_int8_t	cc_alg[CRYPTO_ALGORITHM_MAX + 1];
 	u_int8_t	cc_kalg[CRK_ALGORITHM_MAX + 1];
 
 	int		cc_flags;		/* (d) flags */
 #define CRYPTOCAP_F_CLEANUP	0x80000000	/* needs resource cleanup */
 	int		cc_qblocked;		/* (q) symmetric q blocked */
 	int		cc_kqblocked;		/* (q) asymmetric q blocked */
 	size_t		cc_session_size;
 };
 static	struct cryptocap *crypto_drivers = NULL;
 static	int crypto_drivers_num = 0;
 
 /*
  * There are two queues for crypto requests; one for symmetric (e.g.
  * cipher) operations and one for asymmetric (e.g. MOD) operations.
  * A single mutex is used to lock access to both queues.  We could
  * have one per-queue but having one simplifies handling of block/unblock
  * operations.
  */
 static	int crp_sleep = 0;
 static	TAILQ_HEAD(cryptop_q ,cryptop) crp_q;		/* request queues */
 static	TAILQ_HEAD(,cryptkop) crp_kq;
 static	struct mtx crypto_q_mtx;
 #define	CRYPTO_Q_LOCK()		mtx_lock(&crypto_q_mtx)
 #define	CRYPTO_Q_UNLOCK()	mtx_unlock(&crypto_q_mtx)
 
 /*
  * Taskqueue used to dispatch the crypto requests
  * that have the CRYPTO_F_ASYNC flag
  */
 static struct taskqueue *crypto_tq;
 
 /*
  * Crypto seq numbers are operated on with modular arithmetic
  */
 #define	CRYPTO_SEQ_GT(a,b)	((int)((a)-(b)) > 0)
 
 /*
  * State of one return ("callback") worker.  crypto_init() allocates
  * crypto_workers_num of these and starts one kernel process for each.
  */
 struct crypto_ret_worker {
 	struct mtx crypto_ret_mtx;	/* taken by CRYPTO_RETW_LOCK() */
 
 	TAILQ_HEAD(,cryptop) crp_ordered_ret_q;	/* ordered callback queue for symmetric jobs */
 	TAILQ_HEAD(,cryptop) crp_ret_q;		/* callback queue for symmetric jobs */
 	TAILQ_HEAD(,cryptkop) crp_ret_kq;	/* callback queue for asym jobs */
 
 	u_int32_t reorder_ops;		/* total ordered sym jobs received */
 	u_int32_t reorder_cur_seq;	/* current sym job dispatched */
 
 	struct proc *cryptoretproc;	/* process created for this worker by crypto_init() */
 };
 static struct crypto_ret_worker *crypto_ret_workers = NULL;
 
 #define CRYPTO_RETW(i)		(&crypto_ret_workers[i])
 #define CRYPTO_RETW_ID(w)	((w) - crypto_ret_workers)
 #define FOREACH_CRYPTO_RETW(w) \
 	for (w = crypto_ret_workers; w < crypto_ret_workers + crypto_workers_num; ++w)
 
 #define	CRYPTO_RETW_LOCK(w)	mtx_lock(&w->crypto_ret_mtx)
 #define	CRYPTO_RETW_UNLOCK(w)	mtx_unlock(&w->crypto_ret_mtx)
 #define	CRYPTO_RETW_EMPTY(w) \
 	(TAILQ_EMPTY(&w->crp_ret_q) && TAILQ_EMPTY(&w->crp_ret_kq) && TAILQ_EMPTY(&w->crp_ordered_ret_q))
 
 static int crypto_workers_num = 0;
 SYSCTL_INT(_kern, OID_AUTO, crypto_workers_num, CTLFLAG_RDTUN,
 	   &crypto_workers_num, 0,
 	   "Number of crypto workers used to dispatch crypto jobs");
 
 static	uma_zone_t cryptop_zone;
 static	uma_zone_t cryptodesc_zone;
 static	uma_zone_t cryptoses_zone;
 
 int	crypto_userasymcrypto = 1;	/* userland may do asym crypto reqs */
 SYSCTL_INT(_kern, OID_AUTO, userasymcrypto, CTLFLAG_RW,
 	   &crypto_userasymcrypto, 0,
 	   "Enable/disable user-mode access to asymmetric crypto support");
 int	crypto_devallowsoft = 0;	/* only use hardware crypto */
 SYSCTL_INT(_kern, OID_AUTO, cryptodevallowsoft, CTLFLAG_RW,
 	   &crypto_devallowsoft, 0,
 	   "Enable/disable use of software crypto by /dev/crypto");
 
 MALLOC_DEFINE(M_CRYPTO_DATA, "crypto", "crypto session records");
 
 static	void crypto_proc(void);
 static	struct proc *cryptoproc;
 static	void crypto_ret_proc(struct crypto_ret_worker *ret_worker);
 static	void crypto_destroy(void);
 static	int crypto_invoke(struct cryptocap *cap, struct cryptop *crp, int hint);
 static	int crypto_kinvoke(struct cryptkop *krp, int flags);
 static	void crypto_remove(struct cryptocap *cap);
 static	void crypto_task_invoke(void *ctx, int pending);
 static void crypto_batch_enqueue(struct cryptop *crp);
 
 static	struct cryptostats cryptostats;
 SYSCTL_STRUCT(_kern, OID_AUTO, crypto_stats, CTLFLAG_RW, &cryptostats,
 	    cryptostats, "Crypto system statistics");
 
 #ifdef CRYPTO_TIMING
 static	int crypto_timing = 0;
 SYSCTL_INT(_debug, OID_AUTO, crypto_timing, CTLFLAG_RW,
 	   &crypto_timing, 0, "Enable/disable crypto timing support");
 #endif
 
 /* Try to avoid directly exposing the key buffer as a symbol */
 static struct keybuf *keybuf;
 
 static struct keybuf empty_keybuf = {
         .kb_nents = 0
 };
 
 /*
  * Obtain the key buffer from boot metadata.
  *
  * The kernel's preload entry is looked up under the type "elf kernel",
  * then "elf64 kernel"; its MODINFOMD_KEYBUF metadata, if present, becomes
  * the key buffer.  Otherwise the static empty key buffer is used, so
  * keybuf is never left NULL.
  */
 static void
 keybuf_init(void)
 {
 	struct keybuf *found;
 	caddr_t kmdp;
 
 	kmdp = preload_search_by_type("elf kernel");
 	if (kmdp == NULL)
 		kmdp = preload_search_by_type("elf64 kernel");
 
 	found = (struct keybuf *)preload_search_info(kmdp,
 	    MODINFO_METADATA | MODINFOMD_KEYBUF);
 	keybuf = (found != NULL) ? found : &empty_keybuf;
 }
 
 /*
  * Return the key buffer selected by keybuf_init().
  * It'd be nice if we could store these in some kind of secure memory...
  */
 struct keybuf *
 get_keybuf(void)
 {
 
 	return (keybuf);
 }
 
 /*
  * Bring up the crypto framework: locks, request queues, UMA zones, the
  * driver table, the dispatch taskqueue and its threads, the "crypto"
  * kernel process, one return worker process per configured worker, and
  * finally the key buffer.
  *
  * crypto_workers_num is clamped to [1, mp_ncpus]; out-of-range values are
  * replaced by mp_ncpus.
  *
  * Returns 0 on success.  On any failure crypto_destroy() is called to
  * release what was set up and the error is returned.
  */
 static int
 crypto_init(void)
 {
 	struct crypto_ret_worker *ret_worker;
 	int error;
 
 	mtx_init(&crypto_drivers_mtx, "crypto", "crypto driver table",
 		MTX_DEF|MTX_QUIET);
 
 	TAILQ_INIT(&crp_q);
 	TAILQ_INIT(&crp_kq);
 	mtx_init(&crypto_q_mtx, "crypto", "crypto op queues", MTX_DEF);
 
 	cryptop_zone = uma_zcreate("cryptop", sizeof (struct cryptop),
 				    0, 0, 0, 0,
 				    UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
 	cryptodesc_zone = uma_zcreate("cryptodesc", sizeof (struct cryptodesc),
 				    0, 0, 0, 0,
 				    UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
 	cryptoses_zone = uma_zcreate("crypto_session",
 	    sizeof(struct crypto_session), NULL, NULL, NULL, NULL,
 	    UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
 
 	if (cryptodesc_zone == NULL || cryptop_zone == NULL ||
 	    cryptoses_zone == NULL) {
 		printf("crypto_init: cannot setup crypto zones\n");
 		error = ENOMEM;
 		goto bad;
 	}
 
 	crypto_drivers_num = CRYPTO_DRIVERS_INITIAL;
 	crypto_drivers = malloc(crypto_drivers_num *
 	    sizeof(struct cryptocap), M_CRYPTO_DATA, M_NOWAIT | M_ZERO);
 	if (crypto_drivers == NULL) {
 		printf("crypto_init: cannot setup crypto drivers\n");
 		error = ENOMEM;
 		goto bad;
 	}
 
 	if (crypto_workers_num < 1 || crypto_workers_num > mp_ncpus)
 		crypto_workers_num = mp_ncpus;
 
 	crypto_tq = taskqueue_create("crypto", M_WAITOK|M_ZERO,
 				taskqueue_thread_enqueue, &crypto_tq);
 	if (crypto_tq == NULL) {
 		printf("crypto init: cannot setup crypto taskqueue\n");
 		error = ENOMEM;
 		goto bad;
 	}
 
 	taskqueue_start_threads(&crypto_tq, crypto_workers_num, PRI_MIN_KERN,
 		"crypto");
 
 	error = kproc_create((void (*)(void *)) crypto_proc, NULL,
 		    &cryptoproc, 0, 0, "crypto");
 	if (error) {
 		/* Terminate the line so later console output starts cleanly. */
 		printf("crypto_init: cannot start crypto thread; error %d\n",
 			error);
 		goto bad;
 	}
 
 	crypto_ret_workers = malloc(crypto_workers_num * sizeof(struct crypto_ret_worker),
 			M_CRYPTO_DATA, M_NOWAIT|M_ZERO);
 	if (crypto_ret_workers == NULL) {
 		error = ENOMEM;
 		printf("crypto_init: cannot allocate ret workers\n");
 		goto bad;
 	}
 
 
 	FOREACH_CRYPTO_RETW(ret_worker) {
 		TAILQ_INIT(&ret_worker->crp_ordered_ret_q);
 		TAILQ_INIT(&ret_worker->crp_ret_q);
 		TAILQ_INIT(&ret_worker->crp_ret_kq);
 
 		ret_worker->reorder_ops = 0;
 		ret_worker->reorder_cur_seq = 0;
 
 		mtx_init(&ret_worker->crypto_ret_mtx, "crypto", "crypto return queues", MTX_DEF);
 
 		error = kproc_create((void (*)(void *)) crypto_ret_proc, ret_worker,
 				&ret_worker->cryptoretproc, 0, 0, "crypto returns %td", CRYPTO_RETW_ID(ret_worker));
 		if (error) {
 			printf("crypto_init: cannot start cryptoret thread; error %d\n",
 				error);
 			goto bad;
 		}
 	}
 
 	keybuf_init();
 
 	return 0;
 bad:
 	crypto_destroy();
 	return error;
 }
 
 /*
  * Signal a crypto thread to terminate.  We use the driver
  * table lock to synchronize the sleep/wakeups so that we
  * are sure the threads have terminated before we release
  * the data structures they use.  See crypto_finis below
  * for the other half of this song-and-dance.
  */
 /*
  * Ask one crypto process to exit and wait for it.
  *
  * pp points at the process pointer (it is cleared here); q is the channel
  * that process sleeps on.  Must be called with the driver table lock
  * held; the lock is dropped while waiting and held again on return.
  * Does nothing further if *pp was already NULL.
  */
 static void
 crypto_terminate(struct proc **pp, void *q)
 {
 	struct proc *p;
 
 	mtx_assert(&crypto_drivers_mtx, MA_OWNED);
 	p = *pp;
 	*pp = NULL;
 	if (p) {
 		wakeup_one(q);
 		PROC_LOCK(p);		/* NB: insure we don't miss wakeup */
 		CRYPTO_DRIVER_UNLOCK();	/* let crypto_finis progress */
 		/* Sleep on the process itself, with its lock as interlock. */
 		msleep(p, &p->p_mtx, PWAIT, "crypto_destroy", 0);
 		PROC_UNLOCK(p);
 		CRYPTO_DRIVER_LOCK();
 	}
 }
 
 /*
  * Tear down the crypto framework: drain the taskqueue, stop the crypto
  * process and the return workers, then release the driver table, the UMA
  * zones, the return worker array, the taskqueue and the locks.
  *
  * Also used by crypto_init() to back out of a partial initialisation, so
  * every resource that may not exist yet is checked before use.
  */
 static void
 crypto_destroy(void)
 {
 	struct crypto_ret_worker *ret_worker;
 
 	/*
 	 * Terminate any crypto threads.
 	 */
 	if (crypto_tq != NULL)
 		taskqueue_drain_all(crypto_tq);
 	CRYPTO_DRIVER_LOCK();
 	crypto_terminate(&cryptoproc, &crp_q);
 	/*
 	 * crypto_init() sets crypto_workers_num before it allocates the
 	 * worker array, so after an early failure the count is non-zero
 	 * while the array is still NULL.  Do not walk it in that case.
 	 */
 	if (crypto_ret_workers != NULL) {
 		FOREACH_CRYPTO_RETW(ret_worker)
 			crypto_terminate(&ret_worker->cryptoretproc, &ret_worker->crp_ret_q);
 	}
 	CRYPTO_DRIVER_UNLOCK();
 
 	/* XXX flush queues??? */
 
 	/*
 	 * Reclaim dynamically allocated resources.
 	 */
 	if (crypto_drivers != NULL)
 		free(crypto_drivers, M_CRYPTO_DATA);
 
 	if (cryptoses_zone != NULL)
 		uma_zdestroy(cryptoses_zone);
 	if (cryptodesc_zone != NULL)
 		uma_zdestroy(cryptodesc_zone);
 	if (cryptop_zone != NULL)
 		uma_zdestroy(cryptop_zone);
 	mtx_destroy(&crypto_q_mtx);
 	if (crypto_ret_workers != NULL) {
 		/*
 		 * NOTE(review): if crypto_init() failed part-way through
 		 * starting the workers, the mutexes of the remaining
 		 * entries were never initialised -- confirm mtx_destroy()
 		 * tolerates that.
 		 */
 		FOREACH_CRYPTO_RETW(ret_worker)
 			mtx_destroy(&ret_worker->crypto_ret_mtx);
 		free(crypto_ret_workers, M_CRYPTO_DATA);
 	}
 	if (crypto_tq != NULL)
 		taskqueue_free(crypto_tq);
 	mtx_destroy(&crypto_drivers_mtx);
 }
 
 /* Return the hid value recorded in the session. */
 uint32_t
 crypto_ses2hid(crypto_session_t crypto_session)
 {
 	uint32_t hid;
 
 	hid = crypto_session->hid;
 	return (hid);
 }
 
 /* Return the capabilities word recorded in the session. */
 uint32_t
 crypto_ses2caps(crypto_session_t crypto_session)
 {
 	uint32_t caps;
 
 	caps = crypto_session->capabilities;
 	return (caps);
 }
 
 /* Return the softc pointer stored in the session. */
 void *
 crypto_get_driver_session(crypto_session_t crypto_session)
 {
 	void *sc;
 
 	sc = crypto_session->softc;
 	return (sc);
 }
 
 /*
  * Map a driver id to its slot in the crypto_drivers table.
  * Returns NULL if the table does not exist or hid is out of range.
  */
 static struct cryptocap *
 crypto_checkdriver(u_int32_t hid)
 {
 	if (crypto_drivers == NULL)
 		return (NULL);
 	if (hid >= crypto_drivers_num)
 		return (NULL);
 	return (&crypto_drivers[hid]);
 }
 
 /*
  * Compare a driver's list of supported algorithms against another
  * list; return non-zero if all algorithms are supported.
  */
 static int
 driver_suitable(const struct cryptocap *cap, const struct cryptoini *cri)
 {
 	const struct cryptoini *cr;
 
 	/* See if all the algorithms are supported. */
 	for (cr = cri; cr; cr = cr->cri_next)
 		if (cap->cc_alg[cr->cri_alg] == 0)
 			return 0;
 	return 1;
 }
 
 /*
  * Select a driver for a new session that supports the specified
  * algorithms and, optionally, is constrained according to the flags.
  * The algorithm we use here is pretty stupid; just use the
  * first driver that supports all the algorithms we need. If there
  * are multiple drivers we choose the driver with the fewest active
  * sessions.  We prefer hardware-backed drivers to software ones.
  *
  * XXX We need more smarts here (in real life too, but that's
  * XXX another story altogether).
  */
 static struct cryptocap *
 crypto_select_driver(const struct cryptoini *cri, int flags)
 {
 	struct cryptocap *cand, *best;
 	int match, hid;
 
 	CRYPTO_DRIVER_ASSERT();
 
 	/*
 	 * Hardware is tried first whenever the caller permits it; the
 	 * software pass follows only if hardware found nothing and the
 	 * caller permits software too.
 	 */
 	match = (flags & CRYPTOCAP_F_HARDWARE) != 0 ?
 	    CRYPTOCAP_F_HARDWARE : CRYPTOCAP_F_SOFTWARE;
 	best = NULL;
 	for (;;) {
 		for (hid = 0; hid < crypto_drivers_num; hid++) {
 			cand = &crypto_drivers[hid];
 
 			/* Skip empty slots and drivers that are going away. */
 			if (cand->cc_dev == NULL)
 				continue;
 			if ((cand->cc_flags & CRYPTOCAP_F_CLEANUP) != 0)
 				continue;
 			/* Skip drivers of the wrong kind for this pass. */
 			if ((cand->cc_flags & match) == 0)
 				continue;
 			/* Every requested algorithm must be supported. */
 			if (!driver_suitable(cand, cri))
 				continue;
 
 			/* Prefer the candidate with the fewest sessions. */
 			if (best == NULL ||
 			    cand->cc_sessions < best->cc_sessions)
 				best = cand;
 		}
 		if (best != NULL || match != CRYPTOCAP_F_HARDWARE ||
 		    (flags & CRYPTOCAP_F_SOFTWARE) == 0)
 			break;
 		match = CRYPTOCAP_F_SOFTWARE;
 	}
 	return (best);
 }
 
 /*
  * Create a new session.  The crid argument specifies a crypto
  * driver to use or constraints on a driver to select (hardware
  * only, software only, either).  Whatever driver is selected
  * must be capable of the requested crypto algorithms.
  */
 int
 crypto_newsession(crypto_session_t *cses, struct cryptoini *cri, int crid)
 {
 	crypto_session_t res;
 	void *softc_mem;
 	struct cryptocap *cap;
 	u_int32_t hid;
 	size_t softc_size;
 	int err;
 
 restart:
 	res = NULL;
 	softc_mem = NULL;
 
 	CRYPTO_DRIVER_LOCK();
 	if ((crid & (CRYPTOCAP_F_HARDWARE | CRYPTOCAP_F_SOFTWARE)) == 0) {
 		/*
 		 * Use specified driver; verify it is capable.
 		 */
 		cap = crypto_checkdriver(crid);
 		if (cap != NULL && !driver_suitable(cap, cri))
 			cap = NULL;
 	} else {
 		/*
 		 * No requested driver; select based on crid flags.
 		 */
 		cap = crypto_select_driver(cri, crid);
 		/*
 		 * if NULL then can't do everything in one session.
 		 * XXX Fix this. We need to inject a "virtual" session
 		 * XXX layer right about here.
 		 */
 	}
 	if (cap == NULL) {
 		CRYPTDEB("no driver");
 		err = EOPNOTSUPP;
 		goto out;
 	}
 
 	/*
 	 * Reserve a session on the driver, then remember it by index
 	 * rather than by pointer: the lock is dropped for the sleeping
 	 * allocations below, so the slot is looked up again afterwards.
 	 */
 	cap->cc_sessions++;
 	softc_size = cap->cc_session_size;
 	hid = cap - crypto_drivers;
 	cap = NULL;
 	CRYPTO_DRIVER_UNLOCK();
 
 	softc_mem = malloc(softc_size, M_CRYPTO_DATA, M_WAITOK | M_ZERO);
 	res = uma_zalloc(cryptoses_zone, M_WAITOK | M_ZERO);
 	res->softc = softc_mem;
 
 	CRYPTO_DRIVER_LOCK();
 	cap = crypto_checkdriver(hid);
 	if (cap != NULL && (cap->cc_flags & CRYPTOCAP_F_CLEANUP) != 0) {
 		/* Driver went away meanwhile: give back the reservation. */
 		cap->cc_sessions--;
 		crypto_remove(cap);
 		cap = NULL;
 	}
 	if (cap == NULL) {
 		/* Start over and pick another driver. */
 		free(softc_mem, M_CRYPTO_DATA);
 		uma_zfree(cryptoses_zone, res);
 		CRYPTO_DRIVER_UNLOCK();
 		goto restart;
 	}
 
 	/* Call the driver initialization routine. */
 	err = CRYPTODEV_NEWSESSION(cap->cc_dev, res, cri);
 	if (err != 0) {
 		CRYPTDEB("dev newsession failed: %d", err);
 		/*
 		 * No session was created, so crypto_freesession() will
 		 * never run for it; release the reservation taken above
 		 * here or the driver's session count stays inflated.
 		 */
 		if (cap->cc_sessions)
 			cap->cc_sessions--;
 		goto out;
 	}
 
 	res->capabilities = cap->cc_flags & 0xff000000;
 	res->hid = hid;
 	*cses = res;
 
 out:
 	CRYPTO_DRIVER_UNLOCK();
 	if (err != 0) {
 		/* On the "no driver" path both pointers are still NULL. */
 		free(softc_mem, M_CRYPTO_DATA);
 		if (res != NULL)
 			uma_zfree(cryptoses_zone, res);
 	}
 	return err;
 }
 
 /*
  * Wipe a driver slot once nothing refers to it any more.  Must be
  * called with the driver table mutex held.
  */
 static void
 crypto_remove(struct cryptocap *cap)
 {
 
 	mtx_assert(&crypto_drivers_mtx, MA_OWNED);
 
 	/* Sessions or key operations still outstanding: keep the slot. */
 	if (cap->cc_sessions != 0 || cap->cc_koperations != 0)
 		return;
 	bzero(cap, sizeof(*cap));
 }
 
 /*
  * Delete an existing session (or a reserved session on an unregistered
  * driver).
  */
 void
 crypto_freesession(crypto_session_t cses)
 {
 	struct cryptocap *drv;
 	void *softc;
 	size_t softc_len;
 	u_int32_t hid;
 
 	/* Freeing a NULL session is a no-op. */
 	if (cses == NULL)
 		return;
 
 	CRYPTO_DRIVER_LOCK();
 
 	hid = crypto_ses2hid(cses);
 	KASSERT(hid < crypto_drivers_num,
 	    ("bogus crypto_session %p hid %u", cses, hid));
 	drv = crypto_drivers + hid;
 
 	/* Capture what is needed from the session before it is released. */
 	softc = cses->softc;
 	softc_len = drv->cc_session_size;
 
 	if (drv->cc_sessions != 0)
 		drv->cc_sessions -= 1;
 
 	/* Let the driver tear down its own state first. */
 	CRYPTODEV_FREESESSION(drv->cc_dev, cses);
 
 	/* Scrub the driver's session memory before returning it. */
 	explicit_bzero(softc, softc_len);
 	free(softc, M_CRYPTO_DATA);
 	uma_zfree(cryptoses_zone, cses);
 
 	/* An unregistered driver's slot may now be reclaimable. */
 	if ((drv->cc_flags & CRYPTOCAP_F_CLEANUP) != 0)
 		crypto_remove(drv);
 
 	CRYPTO_DRIVER_UNLOCK();
 }
 
 /*
  * Return an unused driver id.  Used by drivers prior to registering
  * support for the algorithms they handle.
  */
 int32_t
 crypto_get_driverid(device_t dev, size_t sessionsize, int flags)
 {
 	struct cryptocap *grown, *slot;
 	int idx;
 
 	/* A driver must declare itself hardware and/or software. */
 	if ((flags & (CRYPTOCAP_F_HARDWARE | CRYPTOCAP_F_SOFTWARE)) == 0) {
 		printf("%s: no flags specified when registering driver\n",
 		    device_get_nameunit(dev));
 		return (-1);
 	}
 
 	CRYPTO_DRIVER_LOCK();
 
 	/* Look for a slot that is neither in use nor awaiting cleanup. */
 	idx = 0;
 	while (idx < crypto_drivers_num) {
 		slot = &crypto_drivers[idx];
 		if (slot->cc_dev == NULL &&
 		    (slot->cc_flags & CRYPTOCAP_F_CLEANUP) == 0)
 			break;
 		idx++;
 	}
 
 	/* Table is full: double it. */
 	if (idx == crypto_drivers_num) {
 		/* Refuse to grow if doubling would wrap around. */
 		if (2 * crypto_drivers_num <= crypto_drivers_num) {
 			CRYPTO_DRIVER_UNLOCK();
 			printf("crypto: driver count wraparound!\n");
 			return (-1);
 		}
 
 		grown = malloc(2 * crypto_drivers_num *
 		    sizeof(struct cryptocap), M_CRYPTO_DATA, M_NOWAIT|M_ZERO);
 		if (grown == NULL) {
 			CRYPTO_DRIVER_UNLOCK();
 			printf("crypto: no space to expand driver table!\n");
 			return (-1);
 		}
 
 		/* Carry the existing entries over; the new half is zeroed. */
 		bcopy(crypto_drivers, grown,
 		    crypto_drivers_num * sizeof(struct cryptocap));
 		free(crypto_drivers, M_CRYPTO_DATA);
 		crypto_drivers = grown;
 		crypto_drivers_num *= 2;
 	}
 
 	/* NB: state is zero'd on free */
 	slot = &crypto_drivers[idx];
 	slot->cc_sessions = 1;	/* Mark */
 	slot->cc_dev = dev;
 	slot->cc_flags = flags;
 	slot->cc_session_size = sessionsize;
 	if (bootverbose) {
 		printf("crypto: assign %s driver id %u, flags 0x%x\n",
 		    device_get_nameunit(dev), idx, flags);
 	}
 
 	CRYPTO_DRIVER_UNLOCK();
 
 	return (idx);
 }
 
 /*
  * Lookup a driver by name.  We match against the full device
  * name and unit, and against just the name.  The latter gives
  * us a simple wildcarding by device name.  On success return the
  * driver/hardware identifier; otherwise return -1.
  */
 int
 crypto_find_driver(const char *match)
 {
 	device_t dev;
 	int found, i, len;
 
 	len = strlen(match);
 	found = -1;
 
 	CRYPTO_DRIVER_LOCK();
 	for (i = 0; i < crypto_drivers_num; i++) {
 		dev = crypto_drivers[i].cc_dev;
 		/* Skip unused slots and drivers that are being removed. */
 		if (dev == NULL ||
 		    (crypto_drivers[i].cc_flags & CRYPTOCAP_F_CLEANUP))
 			continue;
 		/*
 		 * Only the first len characters are compared, against
 		 * "name+unit" and then against the bare name.
 		 */
 		if (strncmp(match, device_get_nameunit(dev), len) == 0 ||
 		    strncmp(match, device_get_name(dev), len) == 0) {
 			found = i;
 			break;
 		}
 	}
 	CRYPTO_DRIVER_UNLOCK();
 
 	/*
 	 * The outcome is recorded while the lock is held.  Comparing the
 	 * loop index with crypto_drivers_num after unlocking could report
 	 * a bogus id for a failed search if the table grew in between.
 	 */
 	return (found);
 }
 
 /*
  * Return the device_t for the specified driver or NULL
  * if the driver identifier is invalid.
  */
 device_t
 crypto_find_device_byhid(int hid)
 {
 	struct cryptocap *entry;
 
 	entry = crypto_checkdriver(hid);
 	if (entry == NULL)
 		return (NULL);
 	return (entry->cc_dev);
 }
 
 /*
  * Return the device/driver capabilities.
  */
 int
 crypto_getcaps(int hid)
 {
 	struct cryptocap *entry;
 
 	entry = crypto_checkdriver(hid);
 	/* An invalid driver id reports no capabilities at all. */
 	if (entry == NULL)
 		return (0);
 	return (entry->cc_flags);
 }
 
 /*
  * Register support for a key-related algorithm.  This routine
  * is called once for each algorithm supported by a driver.
  */
 int
 crypto_kregister(u_int32_t driverid, int kalg, u_int32_t flags)
 {
 	struct cryptocap *cap;
 	int err;
 
 	CRYPTO_DRIVER_LOCK();
 
 	cap = crypto_checkdriver(driverid);
 	if (cap == NULL ||
 	    kalg < CRK_ALGORITM_MIN || kalg > CRK_ALGORITHM_MAX) {
 		/* Unknown driver or key algorithm out of range. */
 		err = EINVAL;
 	} else {
 		/*
 		 * XXX Do some performance testing to determine placing.
 		 * XXX We probably need an auxiliary data structure that
 		 * XXX describes relative performances.
 		 */
 		cap->cc_kalg[kalg] = flags | CRYPTO_ALG_FLAG_SUPPORTED;
 		if (bootverbose) {
 			printf("crypto: %s registers key alg %u flags %u\n",
 			    device_get_nameunit(cap->cc_dev), kalg, flags);
 		}
 		err = 0;
 	}
 
 	CRYPTO_DRIVER_UNLOCK();
 	return (err);
 }
 
 /*
  * Register support for a non-key-related algorithm.  This routine
  * is called once for each such algorithm supported by a driver.
  */
 int
 crypto_register(u_int32_t driverid, int alg, u_int16_t maxoplen,
     u_int32_t flags)
 {
 	struct cryptocap *cap;
 	int err;
 
 	CRYPTO_DRIVER_LOCK();
 
 	cap = crypto_checkdriver(driverid);
 	/* NB: algorithms are in the range [1..max] */
 	if (cap == NULL ||
 	    alg < CRYPTO_ALGORITHM_MIN || alg > CRYPTO_ALGORITHM_MAX) {
 		err = EINVAL;
 	} else {
 		/*
 		 * XXX Do some performance testing to determine placing.
 		 * XXX We probably need an auxiliary data structure that
 		 * XXX describes relative performances.
 		 */
 		cap->cc_alg[alg] = flags | CRYPTO_ALG_FLAG_SUPPORTED;
 		cap->cc_max_op_len[alg] = maxoplen;
 		if (bootverbose) {
 			printf("crypto: %s registers alg %u flags %u maxoplen %u\n",
 			    device_get_nameunit(cap->cc_dev), alg, flags,
 			    maxoplen);
 		}
 		/* Clear the session-count mark set when the id was handed out. */
 		cap->cc_sessions = 0;		/* Unmark */
 		err = 0;
 	}
 
 	CRYPTO_DRIVER_UNLOCK();
 	return (err);
 }
 
 /*
  * Retire a driver slot.  The slot is always wiped; if sessions or key
  * operations are still outstanding, their counts are put back and the
  * slot is flagged CRYPTOCAP_F_CLEANUP.
  */
 static void
 driver_finis(struct cryptocap *cap)
 {
 	u_int32_t pending_kops, pending_ses;
 
 	CRYPTO_DRIVER_ASSERT();
 
 	/* Save the counters across the wipe. */
 	pending_ses = cap->cc_sessions;
 	pending_kops = cap->cc_koperations;
 	bzero(cap, sizeof(*cap));
 
 	/* Nothing outstanding: the slot is free for reuse. */
 	if (pending_ses == 0 && pending_kops == 0)
 		return;
 
 	/* Pending work: keep the slot around, marked as invalid. */
 	cap->cc_flags |= CRYPTOCAP_F_CLEANUP;
 	cap->cc_sessions = pending_ses;
 	cap->cc_koperations = pending_kops;
 }
 
 /*
  * Unregister a crypto driver. If there are pending sessions using it,
  * leave enough information around so that subsequent calls using those
  * sessions will correctly detect the driver has been unregistered and
  * reroute requests.
  */
 int
 crypto_unregister(u_int32_t driverid, int alg)
 {
 	struct cryptocap *cap;
 	int err, idx, remaining;
 
 	CRYPTO_DRIVER_LOCK();
 	cap = crypto_checkdriver(driverid);
 	if (cap == NULL ||
 	    alg < CRYPTO_ALGORITHM_MIN || alg > CRYPTO_ALGORITHM_MAX ||
 	    cap->cc_alg[alg] == 0) {
 		/* Bad driver id, bad algorithm, or it was never registered. */
 		err = EINVAL;
 	} else {
 		cap->cc_alg[alg] = 0;
 		cap->cc_max_op_len[alg] = 0;
 
 		/* Was this the last algorithm ? */
 		remaining = 0;
 		for (idx = 1; idx <= CRYPTO_ALGORITHM_MAX; idx++) {
 			if (cap->cc_alg[idx] != 0) {
 				remaining = 1;
 				break;
 			}
 		}
 		if (!remaining)
 			driver_finis(cap);
 		err = 0;
 	}
 	CRYPTO_DRIVER_UNLOCK();
 
 	return (err);
 }
 
 /*
  * Unregister all algorithms associated with a crypto driver.
  * If there are pending sessions using it, leave enough information
  * around so that subsequent calls using those sessions will
  * correctly detect the driver has been unregistered and reroute
  * requests.
  */
 int
 crypto_unregister_all(u_int32_t driverid)
 {
 	struct cryptocap *entry;
 	int err;
 
 	err = EINVAL;
 
 	CRYPTO_DRIVER_LOCK();
 	entry = crypto_checkdriver(driverid);
 	if (entry != NULL) {
 		/* Retire the whole slot in one step. */
 		driver_finis(entry);
 		err = 0;
 	}
 	CRYPTO_DRIVER_UNLOCK();
 
 	return (err);
 }
 
 /*
  * Clear blockage on a driver.  The what parameter indicates whether
  * the driver is now ready for cryptop's and/or cryptokop's.
  */
 int
 crypto_unblock(u_int32_t driverid, int what)
 {
 	struct cryptocap *entry;
 	int err;
 
 	CRYPTO_Q_LOCK();
 	entry = crypto_checkdriver(driverid);
 	if (entry == NULL) {
 		err = EINVAL;
 	} else {
 		/* Clear whichever queue blocks the caller named. */
 		if ((what & CRYPTO_SYMQ) != 0)
 			entry->cc_qblocked = 0;
 		if ((what & CRYPTO_ASYMQ) != 0)
 			entry->cc_kqblocked = 0;
 		/* Kick the dispatch thread if it is asleep. */
 		if (crp_sleep)
 			wakeup_one(&crp_q);
 		err = 0;
 	}
 	CRYPTO_Q_UNLOCK();
 
 	return (err);
 }
 
 /*
  * Add a crypto request to a queue, to be processed by the kernel thread.
  */
 int
 crypto_dispatch(struct cryptop *crp)
 {
 	struct cryptocap *cap;
 	u_int32_t hid;
 	int result;
 
 	cryptostats.cs_ops++;
 
 #ifdef CRYPTO_TIMING
 	if (crypto_timing)
 		binuptime(&crp->crp_tstamp);
 #endif
 
 	/*
 	 * Pick the return worker from the session pointer, so every
 	 * request of one session maps to the same worker.
 	 */
 	crp->crp_retw_id = ((uintptr_t)crp->crp_session) % crypto_workers_num;
 
 	if (CRYPTOP_ASYNC(crp)) {
 		if (crp->crp_flags & CRYPTO_F_ASYNC_KEEPORDER) {
 			struct crypto_ret_worker *ret_worker;
 
 			ret_worker = CRYPTO_RETW(crp->crp_retw_id);
 
 			/*
 			 * Stamp the request with the worker's next
 			 * sequence number; completion uses it to restore
 			 * submission order.
 			 */
 			CRYPTO_RETW_LOCK(ret_worker);
 			crp->crp_seq = ret_worker->reorder_ops++;
 			CRYPTO_RETW_UNLOCK(ret_worker);
 		}
 
 		/* Hand the request to the taskqueue and return at once. */
 		TASK_INIT(&crp->crp_task, 0, crypto_task_invoke, crp);
 		taskqueue_enqueue(crypto_tq, &crp->crp_task);
 		return (0);
 	}
 
 	if ((crp->crp_flags & CRYPTO_F_BATCH) == 0) {
 		hid = crypto_ses2hid(crp->crp_session);
 
 		/*
 		 * Caller marked the request to be processed
 		 * immediately; dispatch it directly to the
 		 * driver unless the driver is currently blocked.
 		 */
 		cap = crypto_checkdriver(hid);
 		/* A driver cannot disappear while it has an active session. */
 		KASSERT(cap != NULL, ("%s: Driver disappeared.", __func__));
 		if (!cap->cc_qblocked) {
 			result = crypto_invoke(cap, crp, 0);
 			if (result != ERESTART)
 				return (result);
 			/*
 			 * The driver ran out of resources, put the request on
 			 * the queue.
 			 */
 		}
 	}
 	/* Batched, blocked or restarted: leave it for the crypto thread. */
 	crypto_batch_enqueue(crp);
 	return 0;
 }
 
 /*
  * Append a request to the tail of the symmetric request queue and wake
  * the dispatch thread if it has gone to sleep.
  */
 void
 crypto_batch_enqueue(struct cryptop *crp)
 {
 
 	CRYPTO_Q_LOCK();
 	TAILQ_INSERT_TAIL(&crp_q, crp, crp_next);
 	/* crp_sleep is set by the dispatch thread just before it sleeps. */
 	if (crp_sleep != 0)
 		wakeup_one(&crp_q);
 	CRYPTO_Q_UNLOCK();
 }
 
 /*
  * Add an asymmetric crypto request to a queue,
  * to be processed by the kernel thread.
  */
 int
 crypto_kdispatch(struct cryptkop *krp)
 {
 	int error;
 
 	cryptostats.cs_kops++;
 
 	error = crypto_kinvoke(krp, krp->krp_crid);
 	if (error == ERESTART) {
 		CRYPTO_Q_LOCK();
 		TAILQ_INSERT_TAIL(&crp_kq, krp, krp_next);
 		if (crp_sleep)
 			wakeup_one(&crp_q);
 		CRYPTO_Q_UNLOCK();
 		error = 0;
 	}
 	return error;
 }
 
 /*
  * Verify a driver is suitable for the specified operation.
  */
 static __inline int
 kdriver_suitable(const struct cryptocap *cap, const struct cryptkop *krp)
 {
 	/* Suitable only if the requested key op is flagged as supported. */
 	if ((cap->cc_kalg[krp->krp_op] & CRYPTO_ALG_FLAG_SUPPORTED) == 0)
 		return (0);
 	return (1);
 }
 
 /*
  * Select a driver for an asym operation.  The driver must
  * support the necessary algorithm.  The caller can constrain
  * which device is selected with the flags parameter.  The
  * algorithm we use here is pretty stupid; just use the first
  * driver that supports the algorithms we need. If there are
  * multiple suitable drivers we choose the driver with the
  * fewest active operations.  We prefer hardware-backed
  * drivers to software ones when either may be used.
  */
 static struct cryptocap *
 crypto_select_kdriver(const struct cryptkop *krp, int flags)
 {
 	struct cryptocap *candidate, *chosen;
 	int idx, wanted;
 
 	CRYPTO_DRIVER_ASSERT();
 
 	/* Start with hardware drivers when the caller permits them. */
 	wanted = (flags & CRYPTOCAP_F_HARDWARE) ?
 	    CRYPTOCAP_F_HARDWARE : CRYPTOCAP_F_SOFTWARE;
 	chosen = NULL;
 	for (;;) {
 		for (idx = 0; idx < crypto_drivers_num; idx++) {
 			candidate = &crypto_drivers[idx];
 
 			/* Unused slot. */
 			if (candidate->cc_dev == NULL)
 				continue;
 			/* Driver is on its way out. */
 			if (candidate->cc_flags & CRYPTOCAP_F_CLEANUP)
 				continue;
 			/* Wrong class (hardware vs. software) for this pass. */
 			if ((candidate->cc_flags & wanted) == 0)
 				continue;
 			/* Must support the requested key operation. */
 			if (!kdriver_suitable(candidate, krp))
 				continue;
 
 			/* Keep the driver with the fewest key operations. */
 			if (chosen == NULL ||
 			    candidate->cc_koperations < chosen->cc_koperations)
 				chosen = candidate;
 		}
 
 		/*
 		 * Done unless the hardware pass came up empty and the
 		 * caller also accepts software drivers.
 		 */
 		if (chosen != NULL || wanted != CRYPTOCAP_F_HARDWARE ||
 		    (flags & CRYPTOCAP_F_SOFTWARE) == 0)
 			break;
 		wanted = CRYPTOCAP_F_SOFTWARE;
 	}
 	return (chosen);
 }
 
 /*
  * Dispatch an asymmetric crypto request.
  */
 static int
 crypto_kinvoke(struct cryptkop *krp, int crid)
 {
 	struct cryptocap *cap = NULL;
 	int error;
 
 	KASSERT(krp != NULL, ("%s: krp == NULL", __func__));
 	KASSERT(krp->krp_callback != NULL,
 	    ("%s: krp->crp_callback == NULL", __func__));
 
 	CRYPTO_DRIVER_LOCK();
 	if ((crid & (CRYPTOCAP_F_HARDWARE | CRYPTOCAP_F_SOFTWARE)) == 0) {
 		cap = crypto_checkdriver(crid);
 		if (cap != NULL) {
 			/*
 			 * Driver present, it must support the necessary
 			 * algorithm and, if s/w drivers are excluded,
 			 * it must be registered as hardware-backed.
 			 */
 			if (!kdriver_suitable(cap, krp) ||
 			    (!crypto_devallowsoft &&
 			     (cap->cc_flags & CRYPTOCAP_F_HARDWARE) == 0))
 				cap = NULL;
 		}
 	} else {
 		/*
 		 * No requested driver; select based on crid flags.
 		 */
 		if (!crypto_devallowsoft)	/* NB: disallow s/w drivers */
 			crid &= ~CRYPTOCAP_F_SOFTWARE;
 		cap = crypto_select_kdriver(krp, crid);
 	}
 	if (cap != NULL && !cap->cc_kqblocked) {
 		krp->krp_hid = cap - crypto_drivers;
 		cap->cc_koperations++;
 		CRYPTO_DRIVER_UNLOCK();
 		error = CRYPTODEV_KPROCESS(cap->cc_dev, krp, 0);
 		CRYPTO_DRIVER_LOCK();
 		if (error == ERESTART) {
 			cap->cc_koperations--;
 			CRYPTO_DRIVER_UNLOCK();
 			return (error);
 		}
 	} else {
 		/*
 		 * NB: cap is !NULL if device is blocked; in
 		 *     that case return ERESTART so the operation
 		 *     is resubmitted if possible.
 		 */
 		error = (cap == NULL) ? ENODEV : ERESTART;
 	}
 	CRYPTO_DRIVER_UNLOCK();
 
 	if (error) {
 		krp->krp_status = error;
 		crypto_kdone(krp);
 	}
 	return 0;
 }
 
 #ifdef CRYPTO_TIMING
 /*
  * Fold the time elapsed since *bt into the statistics bucket ts
  * (running total, minimum, maximum and sample count), then reset *bt
  * to the current uptime so the next stage is measured from here.
  */
 static void
 crypto_tstat(struct cryptotstat *ts, struct bintime *bt)
 {
 	struct bintime now, delta;
 	struct timespec t;
 	uint64_t u;
 
 	binuptime(&now);
 	/*
 	 * delta = now - *bt.  If the fraction subtraction wrapped
 	 * (result larger than the minuend), borrow one second.
 	 */
 	u = now.frac;
 	delta.frac = now.frac - bt->frac;
 	delta.sec = now.sec - bt->sec;
 	if (u < delta.frac)
 		delta.sec--;
 	bintime2timespec(&delta, &t);
-	timespecadd(&ts->acc, &t);
+	timespecadd(&ts->acc, &t, &ts->acc);
 	if (timespeccmp(&t, &ts->min, <))
 		ts->min = t;
 	if (timespeccmp(&t, &ts->max, >))
 		ts->max = t;
 	ts->count++;
 
 	/* Restart the clock for the caller's next measurement. */
 	*bt = now;
 }
 #endif
 
 static void
 crypto_task_invoke(void *ctx, int pending)
 {
 	struct cryptocap *cap;
 	struct cryptop *crp;
 	int hid, result;
 
 	crp = (struct cryptop *)ctx;
 
 	hid = crypto_ses2hid(crp->crp_session);
 	cap = crypto_checkdriver(hid);
 
 	result = crypto_invoke(cap, crp, 0);
 	if (result == ERESTART)
 		crypto_batch_enqueue(crp);
 }
 
 /*
  * Dispatch a crypto request to the appropriate crypto devices.
  */
 static int
 crypto_invoke(struct cryptocap *cap, struct cryptop *crp, int hint)
 {
 
 	KASSERT(crp != NULL, ("%s: crp == NULL", __func__));
 	KASSERT(crp->crp_callback != NULL,
 	    ("%s: crp->crp_callback == NULL", __func__));
 	KASSERT(crp->crp_desc != NULL, ("%s: crp->crp_desc == NULL", __func__));
 
 #ifdef CRYPTO_TIMING
 	if (crypto_timing)
 		crypto_tstat(&cryptostats.cs_invoke, &crp->crp_tstamp);
 #endif
 	if (cap->cc_flags & CRYPTOCAP_F_CLEANUP) {
 		struct cryptodesc *crd;
 		crypto_session_t nses;
 
 		/*
 		 * Driver has unregistered; migrate the session and return
 		 * an error to the caller so they'll resubmit the op.
 		 *
 		 * XXX: What if there are more already queued requests for this
 		 *      session?
 		 */
 		crypto_freesession(crp->crp_session);
 
 		/*
 		 * Chain the per-descriptor init records together so they
 		 * can be passed to crypto_newsession() as one list.
 		 */
 		for (crd = crp->crp_desc; crd->crd_next; crd = crd->crd_next)
 			crd->CRD_INI.cri_next = &(crd->crd_next->CRD_INI);
 
 		/* XXX propagate flags from initial session? */
 		/*
 		 * NOTE(review): if crypto_newsession() fails, crp_session
 		 * still refers to the session freed above - confirm that
 		 * the completion path and callers cope with that.
 		 */
 		if (crypto_newsession(&nses, &(crp->crp_desc->CRD_INI),
 		    CRYPTOCAP_F_HARDWARE | CRYPTOCAP_F_SOFTWARE) == 0)
 			crp->crp_session = nses;
 
 		/* Complete the request with EAGAIN either way. */
 		crp->crp_etype = EAGAIN;
 		crypto_done(crp);
 		return 0;
 	} else {
 		/*
 		 * Invoke the driver to process the request.
 		 */
 		return CRYPTODEV_PROCESS(cap->cc_dev, crp, hint);
 	}
 }
 
 /*
  * Release a set of crypto descriptors.
  */
 void
 crypto_freereq(struct cryptop *crp)
 {
 	struct cryptodesc *desc;
 
 	/* Freeing a NULL request is a no-op. */
 	if (crp == NULL)
 		return;
 
 #ifdef DIAGNOSTIC
 	{
 		struct cryptop *queued;
 		struct crypto_ret_worker *ret_worker;
 
 		/* The request must not still sit on the dispatch queue... */
 		CRYPTO_Q_LOCK();
 		TAILQ_FOREACH(queued, &crp_q, crp_next) {
 			KASSERT(queued != crp,
 			    ("Freeing cryptop from the crypto queue (%p).",
 			    crp));
 		}
 		CRYPTO_Q_UNLOCK();
 
 		/* ...nor on any worker's return queue. */
 		FOREACH_CRYPTO_RETW(ret_worker) {
 			CRYPTO_RETW_LOCK(ret_worker);
 			TAILQ_FOREACH(queued, &ret_worker->crp_ret_q, crp_next) {
 				KASSERT(queued != crp,
 				    ("Freeing cryptop from the return queue (%p).",
 				    crp));
 			}
 			CRYPTO_RETW_UNLOCK(ret_worker);
 		}
 	}
 #endif
 
 	/* Unlink and release the descriptors one by one, then the request. */
 	desc = crp->crp_desc;
 	while (desc != NULL) {
 		crp->crp_desc = desc->crd_next;
 		uma_zfree(cryptodesc_zone, desc);
 		desc = crp->crp_desc;
 	}
 	uma_zfree(cryptop_zone, crp);
 }
 
 /*
  * Acquire a set of crypto descriptors.
  */
 struct cryptop *
 crypto_getreq(int num)
 {
 	struct cryptodesc *crd;
 	struct cryptop *crp;
 	int i;
 
 	/*
 	 * A negative count can never be satisfied.  Fail right away
 	 * rather than allocating descriptors until memory runs out,
 	 * which is what counting down from a negative value would do.
 	 */
 	if (num < 0)
 		return (NULL);
 
 	/* Allocations are M_NOWAIT, so either of them may fail. */
 	crp = uma_zalloc(cryptop_zone, M_NOWAIT|M_ZERO);
 	if (crp == NULL)
 		return (NULL);
 
 	for (i = 0; i < num; i++) {
 		crd = uma_zalloc(cryptodesc_zone, M_NOWAIT|M_ZERO);
 		if (crd == NULL) {
 			/* Releases the descriptors linked so far as well. */
 			crypto_freereq(crp);
 			return (NULL);
 		}
 
 		/* Push the new descriptor onto the front of the list. */
 		crd->crd_next = crp->crp_desc;
 		crp->crp_desc = crd;
 	}
 	return (crp);
 }
 
 /*
  * Invoke the callback on behalf of the driver.
  */
 void
 crypto_done(struct cryptop *crp)
 {
 	/* A request may only be completed once. */
 	KASSERT((crp->crp_flags & CRYPTO_F_DONE) == 0,
 		("crypto_done: op already done, flags 0x%x", crp->crp_flags));
 	crp->crp_flags |= CRYPTO_F_DONE;
 	if (crp->crp_etype != 0)
 		cryptostats.cs_errs++;
 #ifdef CRYPTO_TIMING
 	if (crypto_timing)
 		crypto_tstat(&cryptostats.cs_done, &crp->crp_tstamp);
 #endif
 	/*
 	 * CBIMM means unconditionally do the callback immediately;
 	 * CBIFSYNC means do the callback immediately only if the
 	 * operation was done synchronously.  Both are used to avoid
 	 * doing extraneous context switches; the latter is mostly
 	 * used with the software crypto driver.
 	 */
 	if (!CRYPTOP_ASYNC_KEEPORDER(crp) &&
 	    ((crp->crp_flags & CRYPTO_F_CBIMM) ||
 	    ((crp->crp_flags & CRYPTO_F_CBIFSYNC) &&
 	     (crypto_ses2caps(crp->crp_session) & CRYPTOCAP_F_SYNC)))) {
 		/*
 		 * Do the callback directly.  This is ok when the
 		 * callback routine does very little (e.g. the
 		 * /dev/crypto callback method just does a wakeup).
 		 */
 #ifdef CRYPTO_TIMING
 		if (crypto_timing) {
 			/*
 			 * NB: We must copy the timestamp before
 			 * doing the callback as the cryptop is
 			 * likely to be reclaimed.
 			 */
 			struct bintime t = crp->crp_tstamp;
 			crypto_tstat(&cryptostats.cs_cb, &t);
 			crp->crp_callback(crp);
 			crypto_tstat(&cryptostats.cs_finis, &t);
 		} else
 #endif
 			crp->crp_callback(crp);
 	} else {
 		struct crypto_ret_worker *ret_worker;
 		bool wake;
 
 		/* Use the return worker chosen when the op was dispatched. */
 		ret_worker = CRYPTO_RETW(crp->crp_retw_id);
 		wake = false;
 
 		/*
 		 * Normal case; queue the callback for the thread.
 		 */
 		CRYPTO_RETW_LOCK(ret_worker);
 		if (CRYPTOP_ASYNC_KEEPORDER(crp)) {
 			struct cryptop *tmp;
 
 			/*
 			 * Keep the ordered queue sorted by sequence number:
 			 * scan from the tail for the first entry with a
 			 * smaller sequence and insert after it.
 			 */
 			TAILQ_FOREACH_REVERSE(tmp, &ret_worker->crp_ordered_ret_q,
 					cryptop_q, crp_next) {
 				if (CRYPTO_SEQ_GT(crp->crp_seq, tmp->crp_seq)) {
 					TAILQ_INSERT_AFTER(&ret_worker->crp_ordered_ret_q,
 							tmp, crp, crp_next);
 					break;
 				}
 			}
 			/* No smaller entry (or empty queue): goes first. */
 			if (tmp == NULL) {
 				TAILQ_INSERT_HEAD(&ret_worker->crp_ordered_ret_q,
 						crp, crp_next);
 			}
 
 			/* Wake the worker only if this is the op it awaits. */
 			if (crp->crp_seq == ret_worker->reorder_cur_seq)
 				wake = true;
 		}
 		else {
 			/* Decide on the wakeup before the queue gains an entry. */
 			if (CRYPTO_RETW_EMPTY(ret_worker))
 				wake = true;
 
 			TAILQ_INSERT_TAIL(&ret_worker->crp_ret_q, crp, crp_next);
 		}
 
 		if (wake)
 			wakeup_one(&ret_worker->crp_ret_q);	/* shared wait channel */
 		CRYPTO_RETW_UNLOCK(ret_worker);
 	}
 }
 
 /*
  * Invoke the callback on behalf of the driver.
  */
 void
 crypto_kdone(struct cryptkop *krp)
 {
 	struct crypto_ret_worker *ret_worker;
 	struct cryptocap *drv;
 
 	if (krp->krp_status != 0)
 		cryptostats.cs_kerrs++;
 
 	/* Account the finished operation against its driver. */
 	CRYPTO_DRIVER_LOCK();
 	/* XXX: What if driver is loaded in the meantime? */
 	if (krp->krp_hid < crypto_drivers_num) {
 		drv = crypto_drivers + krp->krp_hid;
 		KASSERT(drv->cc_koperations > 0, ("cc_koperations == 0"));
 		drv->cc_koperations -= 1;
 		/* An unregistered driver's slot may now be reclaimable. */
 		if ((drv->cc_flags & CRYPTOCAP_F_CLEANUP) != 0)
 			crypto_remove(drv);
 	}
 	CRYPTO_DRIVER_UNLOCK();
 
 	/* Key-operation callbacks are always run by return worker 0. */
 	ret_worker = CRYPTO_RETW(0);
 
 	CRYPTO_RETW_LOCK(ret_worker);
 	if (CRYPTO_RETW_EMPTY(ret_worker))
 		wakeup_one(&ret_worker->crp_ret_q);		/* shared wait channel */
 	TAILQ_INSERT_TAIL(&ret_worker->crp_ret_kq, krp, krp_next);
 	CRYPTO_RETW_UNLOCK(ret_worker);
 }
 
 /*
  * Report which key (asymmetric) algorithms are available.
  *
  * On return *featp holds a bit mask with bit N set when at least one
  * usable driver has registered key algorithm N.  Software drivers are
  * ignored while crypto_devallowsoft is zero.  Always returns 0.
  */
 int
 crypto_getfeat(int *featp)
 {
 	int hid, kalg, feat = 0;
 
 	CRYPTO_DRIVER_LOCK();
 	for (hid = 0; hid < crypto_drivers_num; hid++) {
 		const struct cryptocap *cap = &crypto_drivers[hid];
 
 		if ((cap->cc_flags & CRYPTOCAP_F_SOFTWARE) &&
 		    !crypto_devallowsoft) {
 			continue;
 		}
 		/*
 		 * CRK_ALGORITHM_MAX is itself a valid algorithm number
 		 * (crypto_kregister() accepts it), so the scan has to be
 		 * inclusive or the highest algorithm is never reported.
 		 */
 		for (kalg = 0; kalg <= CRK_ALGORITHM_MAX; kalg++)
 			if (cap->cc_kalg[kalg] & CRYPTO_ALG_FLAG_SUPPORTED)
 				feat |=  1 << kalg;
 	}
 	CRYPTO_DRIVER_UNLOCK();
 	*featp = feat;
 	return (0);
 }
 
 /*
  * Terminate a thread at module unload.  The process that
  * initiated this is waiting for us to signal that we're gone;
  * wake it up and exit.  We use the driver table lock to ensure
  * we don't do the wakeup before they're waiting.  There is no
  * race here because the waiter sleeps on the proc lock for the
  * thread so it gets notified at the right time because of an
  * extra wakeup that's done in exit1().
  */
 static void
 crypto_finis(void *chan)
 {
 	/* Issue the wakeup on chan while holding the driver table lock. */
 	CRYPTO_DRIVER_LOCK();
 	wakeup_one(chan);
 	CRYPTO_DRIVER_UNLOCK();
 	/* Terminate the calling kernel process with status 0. */
 	kproc_exit(0);
 }
 
 /*
  * Crypto thread, dispatches crypto requests.
  */
 static void
 crypto_proc(void)
 {
 	struct cryptop *crp, *submit;
 	struct cryptkop *krp;
 	struct cryptocap *cap;
 	u_int32_t hid;
 	int result, hint;
 
 #if defined(__i386__) || defined(__amd64__) || defined(__aarch64__)
 	fpu_kern_thread(FPU_KERN_NORMAL);
 #endif
 
 	/* The queue lock is held for the whole loop, except while asleep. */
 	CRYPTO_Q_LOCK();
 	for (;;) {
 		/*
 		 * Find the first element in the queue that can be
 		 * processed and look-ahead to see if multiple ops
 		 * are ready for the same driver.
 		 */
 		submit = NULL;
 		hint = 0;
 		TAILQ_FOREACH(crp, &crp_q, crp_next) {
 			hid = crypto_ses2hid(crp->crp_session);
 			cap = crypto_checkdriver(hid);
 			/*
 			 * A driver cannot disappear while it has an
 			 * active session.
 			 */
 			KASSERT(cap != NULL, ("%s:%u Driver disappeared.",
 			    __func__, __LINE__));
 			if (cap == NULL || cap->cc_dev == NULL) {
 				/* Op needs to be migrated, process it. */
 				if (submit == NULL)
 					submit = crp;
 				break;
 			}
 			if (!cap->cc_qblocked) {
 				if (submit != NULL) {
 					/*
 					 * We stop on finding another op,
 					 * regardless whether its for the same
 					 * driver or not.  We could keep
 					 * searching the queue but it might be
 					 * better to just use a per-driver
 					 * queue instead.
 					 */
 					if (crypto_ses2hid(submit->crp_session) == hid)
 						hint = CRYPTO_HINT_MORE;
 					break;
 				} else {
 					submit = crp;
 					if ((submit->crp_flags & CRYPTO_F_BATCH) == 0)
 						break;
 					/* keep scanning for more are q'd */
 				}
 			}
 		}
 		if (submit != NULL) {
 			/* Take the chosen op off the queue and hand it over. */
 			TAILQ_REMOVE(&crp_q, submit, crp_next);
 			hid = crypto_ses2hid(submit->crp_session);
 			cap = crypto_checkdriver(hid);
 			KASSERT(cap != NULL, ("%s:%u Driver disappeared.",
 			    __func__, __LINE__));
 			result = crypto_invoke(cap, submit, hint);
 			if (result == ERESTART) {
 				/*
 				 * The driver ran out of resources, mark the
 				 * driver ``blocked'' for cryptop's and put
 				 * the request back in the queue.  It would
 				 * best to put the request back where we got
 				 * it but that's hard so for now we put it
 				 * at the front.  This should be ok; putting
 				 * it at the end does not work.
 				 */
 				/* XXX validate sid again? */
 				crypto_drivers[crypto_ses2hid(submit->crp_session)].cc_qblocked = 1;
 				TAILQ_INSERT_HEAD(&crp_q, submit, crp_next);
 				cryptostats.cs_blocks++;
 			}
 		}
 
 		/* As above, but for key ops */
 		TAILQ_FOREACH(krp, &crp_kq, krp_next) {
 			cap = crypto_checkdriver(krp->krp_hid);
 			if (cap == NULL || cap->cc_dev == NULL) {
 				/*
 				 * Operation needs to be migrated, invalidate
 				 * the assigned device so it will reselect a
 				 * new one below.  Propagate the original
 				 * crid selection flags if supplied.
 				 */
 				krp->krp_hid = krp->krp_crid &
 				    (CRYPTOCAP_F_SOFTWARE|CRYPTOCAP_F_HARDWARE);
 				if (krp->krp_hid == 0)
 					krp->krp_hid =
 				    CRYPTOCAP_F_SOFTWARE|CRYPTOCAP_F_HARDWARE;
 				break;
 			}
 			if (!cap->cc_kqblocked)
 				break;
 		}
 		/* krp is NULL here when the scan found nothing runnable. */
 		if (krp != NULL) {
 			TAILQ_REMOVE(&crp_kq, krp, krp_next);
 			result = crypto_kinvoke(krp, krp->krp_hid);
 			if (result == ERESTART) {
 				/*
 				 * The driver ran out of resources, mark the
 				 * driver ``blocked'' for cryptkop's and put
 				 * the request back in the queue.  It would
 				 * best to put the request back where we got
 				 * it but that's hard so for now we put it
 				 * at the front.  This should be ok; putting
 				 * it at the end does not work.
 				 */
 				/* XXX validate sid again? */
 				crypto_drivers[krp->krp_hid].cc_kqblocked = 1;
 				TAILQ_INSERT_HEAD(&crp_kq, krp, krp_next);
 				cryptostats.cs_kblocks++;
 			}
 		}
 
 		if (submit == NULL && krp == NULL) {
 			/*
 			 * Nothing more to be processed.  Sleep until we're
 			 * woken because there are more ops to process.
 			 * This happens either by submission or by a driver
 			 * becoming unblocked and notifying us through
 			 * crypto_unblock.  Note that when we wakeup we
 			 * start processing each queue again from the
 			 * front. It's not clear that it's important to
 			 * preserve this ordering since ops may finish
 			 * out of order if dispatched to different devices
 			 * and some become blocked while others do not.
 			 */
 			crp_sleep = 1;
 			msleep(&crp_q, &crypto_q_mtx, PWAIT, "crypto_wait", 0);
 			crp_sleep = 0;
 			/* A cleared cryptoproc ends the loop after a wakeup. */
 			if (cryptoproc == NULL)
 				break;
 			cryptostats.cs_intrs++;
 		}
 	}
 	CRYPTO_Q_UNLOCK();
 
 	crypto_finis(&crp_q);
 }
 
 /*
  * Crypto returns thread, does callbacks for processed crypto requests.
  * Callbacks are done here, rather than in the crypto drivers, because
  * callbacks typically are expensive and would slow interrupt handling.
  */
 static void
 crypto_ret_proc(struct crypto_ret_worker *ret_worker)
 {
 	struct cryptop *crpt;
 	struct cryptkop *krpt;
 
 	CRYPTO_RETW_LOCK(ret_worker);
 	for (;;) {
 		/* Harvest return q's for completed ops */
 		/*
 		 * The ordered queue goes first, but its head is taken only
 		 * when it carries the next expected sequence number.
 		 */
 		crpt = TAILQ_FIRST(&ret_worker->crp_ordered_ret_q);
 		if (crpt != NULL) {
 			if (crpt->crp_seq == ret_worker->reorder_cur_seq) {
 				TAILQ_REMOVE(&ret_worker->crp_ordered_ret_q, crpt, crp_next);
 				ret_worker->reorder_cur_seq++;
 			} else {
 				crpt = NULL;
 			}
 		}
 
 		/* Otherwise take the head of the unordered return queue. */
 		if (crpt == NULL) {
 			crpt = TAILQ_FIRST(&ret_worker->crp_ret_q);
 			if (crpt != NULL)
 				TAILQ_REMOVE(&ret_worker->crp_ret_q, crpt, crp_next);
 		}
 
 		/* At most one key operation is taken per pass as well. */
 		krpt = TAILQ_FIRST(&ret_worker->crp_ret_kq);
 		if (krpt != NULL)
 			TAILQ_REMOVE(&ret_worker->crp_ret_kq, krpt, krp_next);
 
 		if (crpt != NULL || krpt != NULL) {
 			CRYPTO_RETW_UNLOCK(ret_worker);
 			/*
 			 * Run callbacks unlocked.
 			 */
 			if (crpt != NULL) {
 #ifdef CRYPTO_TIMING
 				if (crypto_timing) {
 					/*
 					 * NB: We must copy the timestamp before
 					 * doing the callback as the cryptop is
 					 * likely to be reclaimed.
 					 */
 					struct bintime t = crpt->crp_tstamp;
 					crypto_tstat(&cryptostats.cs_cb, &t);
 					crpt->crp_callback(crpt);
 					crypto_tstat(&cryptostats.cs_finis, &t);
 				} else
 #endif
 					crpt->crp_callback(crpt);
 			}
 			if (krpt != NULL)
 				krpt->krp_callback(krpt);
 			CRYPTO_RETW_LOCK(ret_worker);
 		} else {
 			/*
 			 * Nothing more to be processed.  Sleep until we're
 			 * woken because there are more returns to process.
 			 */
 			msleep(&ret_worker->crp_ret_q, &ret_worker->crypto_ret_mtx, PWAIT,
 				"crypto_ret_wait", 0);
 			/* A cleared cryptoretproc ends the loop after a wakeup. */
 			if (ret_worker->cryptoretproc == NULL)
 				break;
 			cryptostats.cs_rets++;
 		}
 	}
 	CRYPTO_RETW_UNLOCK(ret_worker);
 
 	crypto_finis(&ret_worker->crp_ret_q);
 }
 
 #ifdef DDB
 /*
  * DDB helper: print one line for every driver slot that has a device
  * attached.
  */
 static void
 db_show_drivers(void)
 {
 	const struct cryptocap *drv;
 	int idx;
 
 	/* Column headers, matching the per-driver rows below. */
 	db_printf("%12s %4s %4s %8s %2s %2s\n",
 	    "Device", "Ses", "Kops", "Flags", "QB", "KB");
 	for (idx = 0; idx < crypto_drivers_num; idx++) {
 		drv = &crypto_drivers[idx];
 		/* Skip slots with no device. */
 		if (drv->cc_dev == NULL)
 			continue;
 		db_printf("%-12s %4u %4u %08x %2u %2u\n",
 		    device_get_nameunit(drv->cc_dev),
 		    drv->cc_sessions,
 		    drv->cc_koperations,
 		    drv->cc_flags,
 		    drv->cc_qblocked,
 		    drv->cc_kqblocked);
 	}
 }
 
 DB_SHOW_COMMAND(crypto, db_show_crypto)
 {
 	struct cryptop *crp;
 	struct crypto_ret_worker *ret_worker;
 
 	db_show_drivers();
 	db_printf("\n");
 
 	db_printf("%4s %8s %4s %4s %4s %4s %8s %8s\n",
 	    "HID", "Caps", "Ilen", "Olen", "Etype", "Flags",
 	    "Desc", "Callback");
 	TAILQ_FOREACH(crp, &crp_q, crp_next) {
 		db_printf("%4u %08x %4u %4u %4u %04x %8p %8p\n"
 		    , (int) crypto_ses2hid(crp->crp_session)
 		    , (int) crypto_ses2caps(crp->crp_session)
 		    , crp->crp_ilen, crp->crp_olen
 		    , crp->crp_etype
 		    , crp->crp_flags
 		    , crp->crp_desc
 		    , crp->crp_callback
 		);
 	}
 	FOREACH_CRYPTO_RETW(ret_worker) {
 		db_printf("\n%8s %4s %4s %4s %8s\n",
 		    "ret_worker", "HID", "Etype", "Flags", "Callback");
 		if (!TAILQ_EMPTY(&ret_worker->crp_ret_q)) {
 			TAILQ_FOREACH(crp, &ret_worker->crp_ret_q, crp_next) {
 				db_printf("%8td %4u %4u %04x %8p\n"
 				    , CRYPTO_RETW_ID(ret_worker)
 				    , (int) crypto_ses2hid(crp->crp_session)
 				    , crp->crp_etype
 				    , crp->crp_flags
 				    , crp->crp_callback
 				);
 			}
 		}
 	}
 }
 
 /*
  * DDB "show kcrypto": list the drivers, then the key (asymmetric)
  * operations on the dispatch queue and on return worker 0's key
  * return queue.
  */
 DB_SHOW_COMMAND(kcrypto, db_show_kcrypto)
 {
 	struct cryptkop *krp;
 	struct crypto_ret_worker *ret_worker;
 
 	db_show_drivers();
 	db_printf("\n");
 
 	/* Key operations still waiting to be dispatched. */
 	db_printf("%4s %5s %4s %4s %8s %4s %8s\n",
 	    "Op", "Status", "#IP", "#OP", "CRID", "HID", "Callback");
 	TAILQ_FOREACH(krp, &crp_kq, krp_next) {
 		db_printf("%4u %5u %4u %4u %08x %4u %8p\n"
 		    , krp->krp_op
 		    , krp->krp_status
 		    , krp->krp_iparams, krp->krp_oparams
 		    , krp->krp_crid, krp->krp_hid
 		    , krp->krp_callback
 		);
 	}
 
 	/*
 	 * Completed key operations are queued on worker 0 (see
 	 * crypto_kdone()).  Test the key return queue, which is the one
 	 * iterated below, rather than the symmetric return queue.
 	 */
 	ret_worker = CRYPTO_RETW(0);
 	if (!TAILQ_EMPTY(&ret_worker->crp_ret_kq)) {
 		db_printf("%4s %5s %8s %4s %8s\n",
 		    "Op", "Status", "CRID", "HID", "Callback");
 		TAILQ_FOREACH(krp, &ret_worker->crp_ret_kq, krp_next) {
 			db_printf("%4u %5u %08x %4u %8p\n"
 			    , krp->krp_op
 			    , krp->krp_status
 			    , krp->krp_crid, krp->krp_hid
 			    , krp->krp_callback
 			);
 		}
 	}
 }
 #endif
 
 int crypto_modevent(module_t mod, int type, void *unused);
 
 /*
  * Initialization code, both for static and dynamic loading.
  * Note this is not invoked with the usual MODULE_DECLARE
  * mechanism but instead is listed as a dependency by the
  * cryptosoft driver.  This guarantees proper ordering of
  * calls on module load/unload.
  */
 int
 crypto_modevent(module_t mod, int type, void *unused)
 {
 	int error;
 
 	if (type == MOD_LOAD) {
 		/* Bring the framework up; announce it on verbose boots. */
 		error = crypto_init();
 		if (error == 0 && bootverbose)
 			printf("crypto: <crypto core>\n");
 		return (error);
 	}
 	if (type == MOD_UNLOAD) {
 		/*XXX disallow if active sessions */
 		crypto_destroy();
 		return (0);
 	}
 	/* Every other module event is rejected. */
 	return (EINVAL);
 }
 MODULE_VERSION(crypto, 1);
 MODULE_DEPEND(crypto, zlib, 1, 1, 1);
Index: head/sys/sys/param.h
===================================================================
--- head/sys/sys/param.h	(revision 336913)
+++ head/sys/sys/param.h	(revision 336914)
@@ -1,365 +1,365 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)param.h	8.3 (Berkeley) 4/4/95
  * $FreeBSD$
  */
 
 #ifndef _SYS_PARAM_H_
 #define _SYS_PARAM_H_
 
 #include <sys/_null.h>
 
 #define	BSD	199506		/* System version (year & month). */
 #define BSD4_3	1
 #define BSD4_4	1
 
 /*
  * __FreeBSD_version numbers are documented in the Porter's Handbook.
  * If you bump the version for any reason, you should update the documentation
  * there.
  * Currently this lives here in the doc/ repository:
  *
  *	head/en_US.ISO8859-1/books/porters-handbook/versions/chapter.xml
  *
  * scheme is:  <major><two digit minor>Rxx
  *		'R' is in the range 0 to 4 if this is a release branch or
  *		X.0-CURRENT before releng/X.0 is created, otherwise 'R' is
  *		in the range 5 to 9.
  */
 #undef __FreeBSD_version
-#define __FreeBSD_version 1200075	/* Master, propagated to newvers */
+#define __FreeBSD_version 1200076	/* Master, propagated to newvers */
 
 /*
  * __FreeBSD_kernel__ indicates that this system uses the kernel of FreeBSD,
  * which by definition is always true on FreeBSD. This macro is also defined
  * on other systems that use the kernel of FreeBSD, such as GNU/kFreeBSD.
  *
  * It is tempting to use this macro in userland code when we want to enable
  * kernel-specific routines, and in fact it's fine to do this in code that
  * is part of FreeBSD itself.  However, be aware that as presence of this
  * macro is still not widespread (e.g. older FreeBSD versions, 3rd party
  * compilers, etc), it is STRONGLY DISCOURAGED to check for this macro in
  * external applications without also checking for __FreeBSD__ as an
  * alternative.
  */
 #undef __FreeBSD_kernel__
 #define __FreeBSD_kernel__
 
 #if defined(_KERNEL) || defined(IN_RTLD)
 #define	P_OSREL_SIGWAIT			700000
 #define	P_OSREL_SIGSEGV			700004
 #define	P_OSREL_MAP_ANON		800104
 #define	P_OSREL_MAP_FSTRICT		1100036
 #define	P_OSREL_SHUTDOWN_ENOTCONN	1100077
 #define	P_OSREL_MAP_GUARD		1200035
 #define	P_OSREL_WRFSBASE		1200041
 #define	P_OSREL_CK_CYLGRP		1200046
 #define	P_OSREL_VMTOTAL64		1200054
 
 #define	P_OSREL_MAJOR(x)		((x) / 100000)
 #endif
 
 #ifndef LOCORE
 #include <sys/types.h>
 #endif
 
 /*
  * Machine-independent constants (some used in following include files).
  * Redefined constants are from POSIX 1003.1 limits file.
  *
  * MAXCOMLEN should be >= sizeof(ac_comm) (see <acct.h>)
  */
 #include <sys/syslimits.h>
 
 #define	MAXCOMLEN	19		/* max command name remembered */
 #define	MAXINTERP	PATH_MAX	/* max interpreter file name length */
 #define	MAXLOGNAME	33		/* max login name length (incl. NUL) */
 #define	MAXUPRC		CHILD_MAX	/* max simultaneous processes */
 #define	NCARGS		ARG_MAX		/* max bytes for an exec function */
 #define	NGROUPS		(NGROUPS_MAX+1)	/* max number groups */
 #define	NOFILE		OPEN_MAX	/* max open files per process */
 #define	NOGROUP		65535		/* marker for empty group set member */
 #define MAXHOSTNAMELEN	256		/* max hostname size */
 #define SPECNAMELEN	63		/* max length of devicename */
 
 /* More types and definitions used throughout the kernel. */
 #ifdef _KERNEL
 #include <sys/cdefs.h>
 #include <sys/errno.h>
 #ifndef LOCORE
 #include <sys/time.h>
 #include <sys/priority.h>
 #endif
 
 #ifndef FALSE
 #define	FALSE	0
 #endif
 #ifndef TRUE
 #define	TRUE	1
 #endif
 #endif
 
 #ifndef _KERNEL
 /* Signals. */
 #include <sys/signal.h>
 #endif
 
 /* Machine type dependent parameters. */
 #include <machine/param.h>
 #ifndef _KERNEL
 #include <sys/limits.h>
 #endif
 
 #ifndef DEV_BSHIFT
 #define	DEV_BSHIFT	9		/* log2(DEV_BSIZE) */
 #endif
 #define	DEV_BSIZE	(1<<DEV_BSHIFT)
 
 #ifndef BLKDEV_IOSIZE
 #define BLKDEV_IOSIZE  PAGE_SIZE	/* default block device I/O size */
 #endif
 #ifndef DFLTPHYS
 #define DFLTPHYS	(64 * 1024)	/* default max raw I/O transfer size */
 #endif
 #ifndef MAXPHYS
 #define MAXPHYS		(128 * 1024)	/* max raw I/O transfer size */
 #endif
 #ifndef MAXDUMPPGS
 #define MAXDUMPPGS	(DFLTPHYS/PAGE_SIZE)
 #endif
 
 /*
  * Constants related to network buffer management.
  * MCLBYTES must be no larger than PAGE_SIZE.
  */
 #ifndef	MSIZE
 #define	MSIZE		256		/* size of an mbuf */
 #endif
 
 #ifndef	MCLSHIFT
 #define MCLSHIFT	11		/* convert bytes to mbuf clusters */
 #endif	/* MCLSHIFT */
 
 #define MCLBYTES	(1 << MCLSHIFT)	/* size of an mbuf cluster */
 
 #if PAGE_SIZE < 2048
 #define	MJUMPAGESIZE	MCLBYTES
 #elif PAGE_SIZE <= 8192
 #define	MJUMPAGESIZE	PAGE_SIZE
 #else
 #define	MJUMPAGESIZE	(8 * 1024)
 #endif
 
 #define	MJUM9BYTES	(9 * 1024)	/* jumbo cluster 9k */
 #define	MJUM16BYTES	(16 * 1024)	/* jumbo cluster 16k */
 
 /*
  * Some macros for units conversion
  */
 
 /* clicks to bytes */
 #ifndef ctob
 #define ctob(x)	((x)<<PAGE_SHIFT)
 #endif
 
 /* bytes to clicks */
 #ifndef btoc
 #define btoc(x)	(((vm_offset_t)(x)+PAGE_MASK)>>PAGE_SHIFT)
 #endif
 
 /*
  * btodb() is messy and perhaps slow because `bytes' may be an off_t.  We
  * want to shift an unsigned type to avoid sign extension and we don't
  * want to widen `bytes' unnecessarily.  Assume that the result fits in
  * a daddr_t.
  */
 #ifndef btodb
 #define btodb(bytes)	 		/* calculates (bytes / DEV_BSIZE) */ \
 	(sizeof (bytes) > sizeof(long) \
 	 ? (daddr_t)((unsigned long long)(bytes) >> DEV_BSHIFT) \
 	 : (daddr_t)((unsigned long)(bytes) >> DEV_BSHIFT))
 #endif
 
 #ifndef dbtob
 #define dbtob(db)			/* calculates (db * DEV_BSIZE) */ \
 	((off_t)(db) << DEV_BSHIFT)
 #endif
 
 #define	PRIMASK	0x0ff
 #define	PCATCH	0x100		/* OR'd with pri for tsleep to check signals */
 #define	PDROP	0x200	/* OR'd with pri to stop re-entry of interlock mutex */
 
 #define	NZERO	0		/* default "nice" */
 
 #define	NBBY	8		/* number of bits in a byte */
 #define	NBPW	sizeof(int)	/* number of bytes per word (integer) */
 
 #define	CMASK	022		/* default file mask: S_IWGRP|S_IWOTH */
 
 #define	NODEV	(dev_t)(-1)	/* non-existent device */
 
 /*
  * File system parameters and macros.
  *
  * MAXBSIZE -	Filesystems are made out of blocks of at most MAXBSIZE bytes
  *		per block.  MAXBSIZE may be made larger without effecting
  *		any existing filesystems as long as it does not exceed MAXPHYS,
  *		and may be made smaller at the risk of not being able to use
  *		filesystems which require a block size exceeding MAXBSIZE.
  *
  * MAXBCACHEBUF - Maximum size of a buffer in the buffer cache.  This must
  *		be >= MAXBSIZE and can be set differently for different
  *		architectures by defining it in <machine/param.h>.
  *		Making this larger allows NFS to do larger reads/writes.
  *
  * BKVASIZE -	Nominal buffer space per buffer, in bytes.  BKVASIZE is the
  *		minimum KVM memory reservation the kernel is willing to make.
  *		Filesystems can of course request smaller chunks.  Actual
  *		backing memory uses a chunk size of a page (PAGE_SIZE).
  *		The default value here can be overridden on a per-architecture
  *		basis by defining it in <machine/param.h>.
  *
  *		If you make BKVASIZE too small you risk seriously fragmenting
  *		the buffer KVM map which may slow things down a bit.  If you
  *		make it too big the kernel will not be able to optimally use
  *		the KVM memory reserved for the buffer cache and will wind
  *		up with too-few buffers.
  *
  *		The default is 16384, roughly 2x the block size used by a
  *		normal UFS filesystem.
  */
 #define MAXBSIZE	65536	/* must be power of 2 */
 #ifndef	MAXBCACHEBUF
 #define	MAXBCACHEBUF	MAXBSIZE /* must be a power of 2 >= MAXBSIZE */
 #endif
 #ifndef	BKVASIZE
 #define BKVASIZE	16384	/* must be power of 2 */
 #endif
 #define BKVAMASK	(BKVASIZE-1)
 
 /*
  * MAXPATHLEN defines the longest permissible path length after expanding
  * symbolic links. It is used to allocate a temporary buffer from the buffer
  * pool in which to do the name expansion, hence should be a power of two,
  * and must be less than or equal to MAXBSIZE.  MAXSYMLINKS defines the
  * maximum number of symbolic links that may be expanded in a path name.
  * It should be set high enough to allow all legitimate uses, but halt
  * infinite loops reasonably quickly.
  */
 #define	MAXPATHLEN	PATH_MAX
 #define MAXSYMLINKS	32
 
 /* Bit map related macros. */
 #define	setbit(a,i)	(((unsigned char *)(a))[(i)/NBBY] |= 1<<((i)%NBBY))
 #define	clrbit(a,i)	(((unsigned char *)(a))[(i)/NBBY] &= ~(1<<((i)%NBBY)))
 #define	isset(a,i)							\
 	(((const unsigned char *)(a))[(i)/NBBY] & (1<<((i)%NBBY)))
 #define	isclr(a,i)							\
 	((((const unsigned char *)(a))[(i)/NBBY] & (1<<((i)%NBBY))) == 0)
 
 /* Macros for counting and rounding. */
 #ifndef howmany
 #define	howmany(x, y)	(((x)+((y)-1))/(y))
 #endif
 #define	nitems(x)	(sizeof((x)) / sizeof((x)[0]))
 #define	rounddown(x, y)	(((x)/(y))*(y))
 #define	rounddown2(x, y) ((x)&(~((y)-1)))          /* if y is power of two */
 #define	roundup(x, y)	((((x)+((y)-1))/(y))*(y))  /* to any y */
 #define	roundup2(x, y)	(((x)+((y)-1))&(~((y)-1))) /* if y is powers of two */
 #define powerof2(x)	((((x)-1)&(x))==0)
 
 /* Macros for min/max. */
 #define	MIN(a,b) (((a)<(b))?(a):(b))
 #define	MAX(a,b) (((a)>(b))?(a):(b))
 
 #ifdef _KERNEL
 /*
  * Basic byte order function prototypes for non-inline functions.
  */
 #ifndef LOCORE
 #ifndef _BYTEORDER_PROTOTYPED
 #define	_BYTEORDER_PROTOTYPED
 __BEGIN_DECLS
 __uint32_t	 htonl(__uint32_t);
 __uint16_t	 htons(__uint16_t);
 __uint32_t	 ntohl(__uint32_t);
 __uint16_t	 ntohs(__uint16_t);
 __END_DECLS
 #endif
 #endif
 
 #ifndef _BYTEORDER_FUNC_DEFINED
 #define	_BYTEORDER_FUNC_DEFINED
 #define	htonl(x)	__htonl(x)
 #define	htons(x)	__htons(x)
 #define	ntohl(x)	__ntohl(x)
 #define	ntohs(x)	__ntohs(x)
 #endif /* !_BYTEORDER_FUNC_DEFINED */
 #endif /* _KERNEL */
 
 /*
  * Scale factor for scaled integers used to count %cpu time and load avgs.
  *
  * The number of CPU `tick's that map to a unique `%age' can be expressed
  * by the formula (1 / (2 ^ (FSHIFT - 11))).  The maximum load average that
  * can be calculated (assuming 32 bits) can be closely approximated using
  * the formula (2 ^ (2 * (16 - FSHIFT))) for (FSHIFT < 15).
  *
  * For the scheduler to maintain a 1:1 mapping of CPU `tick' to `%age',
  * FSHIFT must be at least 11; this gives us a maximum load avg of ~1024.
  */
 #define	FSHIFT	11		/* bits to right of fixed binary point */
 #define FSCALE	(1<<FSHIFT)
 
 #define dbtoc(db)			/* calculates devblks to pages */ \
 	((db + (ctodb(1) - 1)) >> (PAGE_SHIFT - DEV_BSHIFT))
 
 #define ctodb(db)			/* calculates pages to devblks */ \
 	((db) << (PAGE_SHIFT - DEV_BSHIFT))
 
 /*
  * Old spelling of __containerof().
  */
 #define	member2struct(s, m, x)						\
 	((struct s *)(void *)((char *)(x) - offsetof(struct s, m)))
 
 /*
  * Access a variable length array that has been declared as a fixed
  * length array.
  */
 #define __PAST_END(array, offset) (((__typeof__(*(array)) *)(array))[offset])
 
 #endif	/* _SYS_PARAM_H_ */
Index: head/sys/sys/time.h
===================================================================
--- head/sys/sys/time.h	(revision 336913)
+++ head/sys/sys/time.h	(revision 336914)
@@ -1,564 +1,568 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)time.h	8.5 (Berkeley) 5/4/95
  * $FreeBSD$
  */
 
 #ifndef _SYS_TIME_H_
 #define	_SYS_TIME_H_
 
 #include <sys/_timeval.h>
 #include <sys/types.h>
 #include <sys/timespec.h>
 
 struct timezone {
 	int	tz_minuteswest;	/* minutes west of Greenwich */
 	int	tz_dsttime;	/* type of dst correction */
 };
 #define	DST_NONE	0	/* not on dst */
 #define	DST_USA		1	/* USA style dst */
 #define	DST_AUST	2	/* Australian style dst */
 #define	DST_WET		3	/* Western European dst */
 #define	DST_MET		4	/* Middle European dst */
 #define	DST_EET		5	/* Eastern European dst */
 #define	DST_CAN		6	/* Canada */
 
 #if __BSD_VISIBLE
 struct bintime {
 	time_t	sec;
 	uint64_t frac;
 };
 
 static __inline void
 bintime_addx(struct bintime *_bt, uint64_t _x)
 {
 	uint64_t _u;
 
 	_u = _bt->frac;
 	_bt->frac += _x;
 	if (_u > _bt->frac)
 		_bt->sec++;
 }
 
 static __inline void
 bintime_add(struct bintime *_bt, const struct bintime *_bt2)
 {
 	uint64_t _u;
 
 	_u = _bt->frac;
 	_bt->frac += _bt2->frac;
 	if (_u > _bt->frac)
 		_bt->sec++;
 	_bt->sec += _bt2->sec;
 }
 
 static __inline void
 bintime_sub(struct bintime *_bt, const struct bintime *_bt2)
 {
 	uint64_t _u;
 
 	_u = _bt->frac;
 	_bt->frac -= _bt2->frac;
 	if (_u < _bt->frac)
 		_bt->sec--;
 	_bt->sec -= _bt2->sec;
 }
 
 static __inline void
 bintime_mul(struct bintime *_bt, u_int _x)
 {
 	uint64_t _p1, _p2;
 
 	_p1 = (_bt->frac & 0xffffffffull) * _x;
 	_p2 = (_bt->frac >> 32) * _x + (_p1 >> 32);
 	_bt->sec *= _x;
 	_bt->sec += (_p2 >> 32);
 	_bt->frac = (_p2 << 32) | (_p1 & 0xffffffffull);
 }
 
 static __inline void
 bintime_shift(struct bintime *_bt, int _exp)
 {
 
 	if (_exp > 0) {
 		_bt->sec <<= _exp;
 		_bt->sec |= _bt->frac >> (64 - _exp);
 		_bt->frac <<= _exp;
 	} else if (_exp < 0) {
 		_bt->frac >>= -_exp;
 		_bt->frac |= (uint64_t)_bt->sec << (64 + _exp);
 		_bt->sec >>= -_exp;
 	}
 }
 
 #define	bintime_clear(a)	((a)->sec = (a)->frac = 0)
 #define	bintime_isset(a)	((a)->sec || (a)->frac)
 #define	bintime_cmp(a, b, cmp)						\
 	(((a)->sec == (b)->sec) ?					\
 	    ((a)->frac cmp (b)->frac) :					\
 	    ((a)->sec cmp (b)->sec))
 
 #define	SBT_1S	((sbintime_t)1 << 32)
 #define	SBT_1M	(SBT_1S * 60)
 #define	SBT_1MS	(SBT_1S / 1000)
 #define	SBT_1US	(SBT_1S / 1000000)
 #define	SBT_1NS	(SBT_1S / 1000000000) /* beware rounding, see nstosbt() */
 #define	SBT_MAX	0x7fffffffffffffffLL
 
 static __inline int
 sbintime_getsec(sbintime_t _sbt)
 {
 
 	return (_sbt >> 32);
 }
 
 static __inline sbintime_t
 bttosbt(const struct bintime _bt)
 {
 
 	return (((sbintime_t)_bt.sec << 32) + (_bt.frac >> 32));
 }
 
 static __inline struct bintime
 sbttobt(sbintime_t _sbt)
 {
 	struct bintime _bt;
 
 	_bt.sec = _sbt >> 32;
 	_bt.frac = _sbt << 32;
 	return (_bt);
 }
 
 /*
  * Decimal<->sbt conversions.  Multiplying or dividing by SBT_1NS results in
  * large roundoff errors which sbttons() and nstosbt() avoid.  Millisecond and
  * microsecond functions are also provided for completeness.
  */
 static __inline int64_t
 sbttons(sbintime_t _sbt)
 {
 
 	return ((1000000000 * _sbt) >> 32);
 }
 
 static __inline sbintime_t
 nstosbt(int64_t _ns)
 {
 
 	return ((_ns * (((uint64_t)1 << 63) / 500000000)) >> 32);
 }
 
 static __inline int64_t
 sbttous(sbintime_t _sbt)
 {
 
 	return ((1000000 * _sbt) >> 32);
 }
 
 static __inline sbintime_t
 ustosbt(int64_t _us)
 {
 
 	return ((_us * (((uint64_t)1 << 63) / 500000)) >> 32);
 }
 
 static __inline int64_t
 sbttoms(sbintime_t _sbt)
 {
 
 	return ((1000 * _sbt) >> 32);
 }
 
 static __inline sbintime_t
 mstosbt(int64_t _ms)
 {
 
 	return ((_ms * (((uint64_t)1 << 63) / 500)) >> 32);
 }
 
 /*-
  * Background information:
  *
  * When converting between timestamps on parallel timescales of differing
  * resolutions it is historical and scientific practice to round down rather
  * than doing 4/5 rounding.
  *
  *   The date changes at midnight, not at noon.
  *
  *   Even at 15:59:59.999999999 it's not four'o'clock.
  *
  *   time_second ticks after N.999999999 not after N.4999999999
  */
 
 static __inline void
 bintime2timespec(const struct bintime *_bt, struct timespec *_ts)
 {
 
 	_ts->tv_sec = _bt->sec;
 	_ts->tv_nsec = ((uint64_t)1000000000 *
 	    (uint32_t)(_bt->frac >> 32)) >> 32;
 }
 
 static __inline void
 timespec2bintime(const struct timespec *_ts, struct bintime *_bt)
 {
 
 	_bt->sec = _ts->tv_sec;
 	/* 18446744073 = int(2^64 / 1000000000) */
 	_bt->frac = _ts->tv_nsec * (uint64_t)18446744073LL;
 }
 
 static __inline void
 bintime2timeval(const struct bintime *_bt, struct timeval *_tv)
 {
 
 	_tv->tv_sec = _bt->sec;
 	_tv->tv_usec = ((uint64_t)1000000 * (uint32_t)(_bt->frac >> 32)) >> 32;
 }
 
 static __inline void
 timeval2bintime(const struct timeval *_tv, struct bintime *_bt)
 {
 
 	_bt->sec = _tv->tv_sec;
 	/* 18446744073709 = int(2^64 / 1000000) */
 	_bt->frac = _tv->tv_usec * (uint64_t)18446744073709LL;
 }
 
 static __inline struct timespec
 sbttots(sbintime_t _sbt)
 {
 	struct timespec _ts;
 
 	_ts.tv_sec = _sbt >> 32;
 	_ts.tv_nsec = sbttons((uint32_t)_sbt);
 	return (_ts);
 }
 
 static __inline sbintime_t
 tstosbt(struct timespec _ts)
 {
 
 	return (((sbintime_t)_ts.tv_sec << 32) + nstosbt(_ts.tv_nsec));
 }
 
 static __inline struct timeval
 sbttotv(sbintime_t _sbt)
 {
 	struct timeval _tv;
 
 	_tv.tv_sec = _sbt >> 32;
 	_tv.tv_usec = sbttous((uint32_t)_sbt);
 	return (_tv);
 }
 
 static __inline sbintime_t
 tvtosbt(struct timeval _tv)
 {
 
 	return (((sbintime_t)_tv.tv_sec << 32) + ustosbt(_tv.tv_usec));
 }
 #endif /* __BSD_VISIBLE */
 
 #ifdef _KERNEL
 /*
  * Simple macros to convert ticks to milliseconds
  * or microseconds and vice-versa. The answer
  * will always be at least 1. Note the return
  * value is a uint32_t however we step up the
  * operations to 64 bit to avoid any overflow/underflow
  * problems.
  */
 #define TICKS_2_MSEC(t) max(1, (uint32_t)(hz == 1000) ? \
 	  (t) : (((uint64_t)(t) * (uint64_t)1000)/(uint64_t)hz))
 #define TICKS_2_USEC(t) max(1, (uint32_t)(hz == 1000) ? \
 	  ((t) * 1000) : (((uint64_t)(t) * (uint64_t)1000000)/(uint64_t)hz))
 #define MSEC_2_TICKS(m) max(1, (uint32_t)((hz == 1000) ? \
 	  (m) : ((uint64_t)(m) * (uint64_t)hz)/(uint64_t)1000))
 #define USEC_2_TICKS(u) max(1, (uint32_t)((hz == 1000) ? \
 	 ((u) / 1000) : ((uint64_t)(u) * (uint64_t)hz)/(uint64_t)1000000))
 
+#endif /* _KERNEL */
 /* Operations on timespecs */
 #define	timespecclear(tvp)	((tvp)->tv_sec = (tvp)->tv_nsec = 0)
 #define	timespecisset(tvp)	((tvp)->tv_sec || (tvp)->tv_nsec)
 #define	timespeccmp(tvp, uvp, cmp)					\
 	(((tvp)->tv_sec == (uvp)->tv_sec) ?				\
 	    ((tvp)->tv_nsec cmp (uvp)->tv_nsec) :			\
 	    ((tvp)->tv_sec cmp (uvp)->tv_sec))
-#define	timespecadd(vvp, uvp)						\
+
+#define	timespecadd(tsp, usp, vsp)	/* *vsp = *tsp + *usp */	\
 	do {								\
-		(vvp)->tv_sec += (uvp)->tv_sec;				\
-		(vvp)->tv_nsec += (uvp)->tv_nsec;			\
-		if ((vvp)->tv_nsec >= 1000000000) {			\
-			(vvp)->tv_sec++;				\
-			(vvp)->tv_nsec -= 1000000000;			\
+		(vsp)->tv_sec = (tsp)->tv_sec + (usp)->tv_sec;		\
+		(vsp)->tv_nsec = (tsp)->tv_nsec + (usp)->tv_nsec;	\
+		if ((vsp)->tv_nsec >= 1000000000L) {			\
+			(vsp)->tv_sec++;				\
+			(vsp)->tv_nsec -= 1000000000L;			\
 		}							\
 	} while (0)
-#define	timespecsub(vvp, uvp)						\
+#define	timespecsub(tsp, usp, vsp)	/* *vsp = *tsp - *usp */	\
 	do {								\
-		(vvp)->tv_sec -= (uvp)->tv_sec;				\
-		(vvp)->tv_nsec -= (uvp)->tv_nsec;			\
-		if ((vvp)->tv_nsec < 0) {				\
-			(vvp)->tv_sec--;				\
-			(vvp)->tv_nsec += 1000000000;			\
+		(vsp)->tv_sec = (tsp)->tv_sec - (usp)->tv_sec;		\
+		(vsp)->tv_nsec = (tsp)->tv_nsec - (usp)->tv_nsec;	\
+		if ((vsp)->tv_nsec < 0) {				\
+			(vsp)->tv_sec--;				\
+			(vsp)->tv_nsec += 1000000000L;			\
 		}							\
 	} while (0)
+
+#ifdef _KERNEL
 
 /* Operations on timevals. */
 
 #define	timevalclear(tvp)		((tvp)->tv_sec = (tvp)->tv_usec = 0)
 #define	timevalisset(tvp)		((tvp)->tv_sec || (tvp)->tv_usec)
 #define	timevalcmp(tvp, uvp, cmp)					\
 	(((tvp)->tv_sec == (uvp)->tv_sec) ?				\
 	    ((tvp)->tv_usec cmp (uvp)->tv_usec) :			\
 	    ((tvp)->tv_sec cmp (uvp)->tv_sec))
 
 /* timevaladd and timevalsub are not inlined */
 
 #endif /* _KERNEL */
 
 #ifndef _KERNEL			/* NetBSD/OpenBSD compatible interfaces */
 
 #define	timerclear(tvp)		((tvp)->tv_sec = (tvp)->tv_usec = 0)
 #define	timerisset(tvp)		((tvp)->tv_sec || (tvp)->tv_usec)
 #define	timercmp(tvp, uvp, cmp)					\
 	(((tvp)->tv_sec == (uvp)->tv_sec) ?				\
 	    ((tvp)->tv_usec cmp (uvp)->tv_usec) :			\
 	    ((tvp)->tv_sec cmp (uvp)->tv_sec))
 #define	timeradd(tvp, uvp, vvp)						\
 	do {								\
 		(vvp)->tv_sec = (tvp)->tv_sec + (uvp)->tv_sec;		\
 		(vvp)->tv_usec = (tvp)->tv_usec + (uvp)->tv_usec;	\
 		if ((vvp)->tv_usec >= 1000000) {			\
 			(vvp)->tv_sec++;				\
 			(vvp)->tv_usec -= 1000000;			\
 		}							\
 	} while (0)
 #define	timersub(tvp, uvp, vvp)						\
 	do {								\
 		(vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec;		\
 		(vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec;	\
 		if ((vvp)->tv_usec < 0) {				\
 			(vvp)->tv_sec--;				\
 			(vvp)->tv_usec += 1000000;			\
 		}							\
 	} while (0)
 #endif
 
 /*
  * Names of the interval timers, and structure
  * defining a timer setting.
  */
 #define	ITIMER_REAL	0
 #define	ITIMER_VIRTUAL	1
 #define	ITIMER_PROF	2
 
 struct itimerval {
 	struct	timeval it_interval;	/* timer interval */
 	struct	timeval it_value;	/* current value */
 };
 
 /*
  * Getkerninfo clock information structure
  */
 struct clockinfo {
 	int	hz;		/* clock frequency */
 	int	tick;		/* micro-seconds per hz tick */
 	int	spare;
 	int	stathz;		/* statistics clock frequency */
 	int	profhz;		/* profiling clock frequency */
 };
 
 /* These macros are also in time.h. */
 #ifndef CLOCK_REALTIME
 #define	CLOCK_REALTIME	0
 #define	CLOCK_VIRTUAL	1
 #define	CLOCK_PROF	2
 #define	CLOCK_MONOTONIC	4
 #define	CLOCK_UPTIME	5		/* FreeBSD-specific. */
 #define	CLOCK_UPTIME_PRECISE	7	/* FreeBSD-specific. */
 #define	CLOCK_UPTIME_FAST	8	/* FreeBSD-specific. */
 #define	CLOCK_REALTIME_PRECISE	9	/* FreeBSD-specific. */
 #define	CLOCK_REALTIME_FAST	10	/* FreeBSD-specific. */
 #define	CLOCK_MONOTONIC_PRECISE	11	/* FreeBSD-specific. */
 #define	CLOCK_MONOTONIC_FAST	12	/* FreeBSD-specific. */
 #define	CLOCK_SECOND	13		/* FreeBSD-specific. */
 #define	CLOCK_THREAD_CPUTIME_ID	14
 #define	CLOCK_PROCESS_CPUTIME_ID	15
 #endif
 
 #ifndef TIMER_ABSTIME
 #define	TIMER_RELTIME	0x0	/* relative timer */
 #define	TIMER_ABSTIME	0x1	/* absolute timer */
 #endif
 
 #if __BSD_VISIBLE
 #define	CPUCLOCK_WHICH_PID	0
 #define	CPUCLOCK_WHICH_TID	1
 #endif
 
 #ifdef _KERNEL
 
 /*
  * Kernel to clock driver interface.
  */
 void	inittodr(time_t base);
 void	resettodr(void);
 
 extern volatile time_t	time_second;
 extern volatile time_t	time_uptime;
 extern struct bintime tc_tick_bt;
 extern sbintime_t tc_tick_sbt;
 extern struct bintime tick_bt;
 extern sbintime_t tick_sbt;
 extern int tc_precexp;
 extern int tc_timepercentage;
 extern struct bintime bt_timethreshold;
 extern struct bintime bt_tickthreshold;
 extern sbintime_t sbt_timethreshold;
 extern sbintime_t sbt_tickthreshold;
 
 extern volatile int rtc_generation;
 
 /*
  * Functions for looking at our clock: [get]{bin,nano,micro}[up]time()
  *
  * Functions without the "get" prefix returns the best timestamp
  * we can produce in the given format.
  *
  * "bin"   == struct bintime  == seconds + 64 bit fraction of seconds.
  * "nano"  == struct timespec == seconds + nanoseconds.
  * "micro" == struct timeval  == seconds + microseconds.
  *
  * Functions containing "up" returns time relative to boot and
  * should be used for calculating time intervals.
  *
  * Functions without "up" returns UTC time.
  *
  * Functions with the "get" prefix returns a less precise result
  * much faster than the functions without "get" prefix and should
  * be used where a precision of 1/hz seconds is acceptable or where
  * performance is priority. (NB: "precision", _not_ "resolution" !)
  */
 
 void	binuptime(struct bintime *bt);
 void	nanouptime(struct timespec *tsp);
 void	microuptime(struct timeval *tvp);
 
 static __inline sbintime_t
 sbinuptime(void)
 {
 	struct bintime _bt;
 
 	binuptime(&_bt);
 	return (bttosbt(_bt));
 }
 
 void	bintime(struct bintime *bt);
 void	nanotime(struct timespec *tsp);
 void	microtime(struct timeval *tvp);
 
 void	getbinuptime(struct bintime *bt);
 void	getnanouptime(struct timespec *tsp);
 void	getmicrouptime(struct timeval *tvp);
 
 static __inline sbintime_t
 getsbinuptime(void)
 {
 	struct bintime _bt;
 
 	getbinuptime(&_bt);
 	return (bttosbt(_bt));
 }
 
 void	getbintime(struct bintime *bt);
 void	getnanotime(struct timespec *tsp);
 void	getmicrotime(struct timeval *tvp);
 
 void	getboottime(struct timeval *boottime);
 void	getboottimebin(struct bintime *boottimebin);
 
 /* Other functions */
 int	itimerdecr(struct itimerval *itp, int usec);
 int	itimerfix(struct timeval *tv);
 int	ppsratecheck(struct timeval *, int *, int);
 int	ratecheck(struct timeval *, const struct timeval *);
 void	timevaladd(struct timeval *t1, const struct timeval *t2);
 void	timevalsub(struct timeval *t1, const struct timeval *t2);
 int	tvtohz(struct timeval *tv);
 
 #define	TC_DEFAULTPERC		5
 
 #define	BT2FREQ(bt)                                                     \
 	(((uint64_t)0x8000000000000000 + ((bt)->frac >> 2)) /           \
 	    ((bt)->frac >> 1))
 
 #define	SBT2FREQ(sbt)	((SBT_1S + ((sbt) >> 1)) / (sbt))
 
 #define	FREQ2BT(freq, bt)                                               \
 {									\
 	(bt)->sec = 0;                                                  \
 	(bt)->frac = ((uint64_t)0x8000000000000000  / (freq)) << 1;     \
 }
 
 #define	TIMESEL(sbt, sbt2)						\
 	(((sbt2) >= sbt_timethreshold) ?				\
 	    ((*(sbt) = getsbinuptime()), 1) : ((*(sbt) = sbinuptime()), 0))
 
 #else /* !_KERNEL */
 #include <time.h>
 
 #include <sys/cdefs.h>
 #include <sys/select.h>
 
 __BEGIN_DECLS
 int	setitimer(int, const struct itimerval *, struct itimerval *);
 int	utimes(const char *, const struct timeval *);
 
 #if __BSD_VISIBLE
 int	adjtime(const struct timeval *, struct timeval *);
 int	clock_getcpuclockid2(id_t, int, clockid_t *);
 int	futimes(int, const struct timeval *);
 int	futimesat(int, const char *, const struct timeval [2]);
 int	lutimes(const char *, const struct timeval *);
 int	settimeofday(const struct timeval *, const struct timezone *);
 #endif
 
 #if __XSI_VISIBLE
 int	getitimer(int, struct itimerval *);
 int	gettimeofday(struct timeval *, struct timezone *);
 #endif
 
 __END_DECLS
 
 #endif /* !_KERNEL */
 
 #endif /* !_SYS_TIME_H_ */
Index: head/sys/ufs/ffs/ffs_snapshot.c
===================================================================
--- head/sys/ufs/ffs/ffs_snapshot.c	(revision 336913)
+++ head/sys/ufs/ffs/ffs_snapshot.c	(revision 336914)
@@ -1,2706 +1,2706 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved.
  *
  * Further information about snapshots can be obtained from:
  *
  *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
  *	1614 Oxford Street		mckusick@mckusick.com
  *	Berkeley, CA 94709-1608		+1-510-843-9542
  *	USA
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  *
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  * DISCLAIMED.  IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ffs_snapshot.c	8.11 (McKusick) 7/23/00
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_quota.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/systm.h>
 #include <sys/conf.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/fcntl.h>
 #include <sys/proc.h>
 #include <sys/namei.h>
 #include <sys/sched.h>
 #include <sys/stat.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/resource.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
 #include <sys/vnode.h>
 
 #include <geom/geom.h>
 
 #include <ufs/ufs/extattr.h>
 #include <ufs/ufs/quota.h>
 #include <ufs/ufs/ufsmount.h>
 #include <ufs/ufs/inode.h>
 #include <ufs/ufs/ufs_extern.h>
 
 #include <ufs/ffs/fs.h>
 #include <ufs/ffs/ffs_extern.h>
 
 /*
  * Credential of thread0; passed as the credential argument of the
  * UFS_BALLOC() and bread() calls this file issues on behalf of a snapshot.
  */
 #define KERNCRED thread0.td_ucred
 /* Always compile in the #ifdef DEBUG sysctl knobs defined in this file. */
 #define DEBUG 1
 
 #include "opt_ffs.h"
 
 #ifdef NO_FFS_SNAPSHOT
 /*
  * NO_FFS_SNAPSHOT stub: snapshot support is compiled out, so a request to
  * create one is rejected with EINVAL.  Both arguments are ignored.
  */
 int
 ffs_snapshot(mp, snapfile)
 	struct mount *mp;
 	char *snapfile;
 {
 	return (EINVAL);
 }
 
 /*
  * NO_FFS_SNAPSHOT stub: ignores all of its arguments and always returns
  * EINVAL.
  */
 int
 ffs_snapblkfree(fs, devvp, bno, size, inum, vtype, wkhd)
 	struct fs *fs;
 	struct vnode *devvp;
 	ufs2_daddr_t bno;
 	long size;
 	ino_t inum;
 	enum vtype vtype;
 	struct workhead *wkhd;
 {
 	return (EINVAL);
 }
 
 /* NO_FFS_SNAPSHOT stub: does nothing; vp is ignored. */
 void
 ffs_snapremove(vp)
 	struct vnode *vp;
 {
 }
 
 /* NO_FFS_SNAPSHOT stub: does nothing; mp is ignored. */
 void
 ffs_snapshot_mount(mp)
 	struct mount *mp;
 {
 }
 
 /* NO_FFS_SNAPSHOT stub: does nothing; mp is ignored. */
 void
 ffs_snapshot_unmount(mp)
 	struct mount *mp;
 {
 }
 
 /* NO_FFS_SNAPSHOT stub: does nothing; ip is ignored. */
 void
 ffs_snapgone(ip)
 	struct inode *ip;
 {
 }
 
 /*
  * NO_FFS_SNAPSHOT stub: ignores devvp and bp and always returns EINVAL.
  */
 int
 ffs_copyonwrite(devvp, bp)
 	struct vnode *devvp;
 	struct buf *bp;
 {
 	return (EINVAL);
 }
 
 /* NO_FFS_SNAPSHOT stub: does nothing; mp and waitfor are ignored. */
 void
 ffs_sync_snap(mp, waitfor)
 	struct mount *mp;
 	int waitfor;
 {
 }
 
 #else
 /* Register the "ffs_snapshot" kernel feature string. */
 FEATURE(ffs_snapshot, "FFS snapshot support");
 
 /*
  * List head for struct snapdata entries and a default mutex that is
  * initialised at boot through MTX_SYSINIT.  The mutex is described as the
  * "snapdata free list" lock; presumably it protects snapfree (confirm in
  * the snapdata acquire/free helpers).
  */
 LIST_HEAD(, snapdata) snapfree;
 static struct mtx snapfree_lock;
 MTX_SYSINIT(ffs_snapfree, &snapfree_lock, "snapdata free list", MTX_DEF);
 
 /*
  * Forward declarations for the file-local helpers.  The *_ufs1 and *_ufs2
  * families have parallel signatures and differ only in the block pointer
  * type they work on (ufs1_daddr_t versus ufs2_daddr_t).  The function
  * pointer parameter of expunge_*() and indiracct_*() has the signature
  * shared by the fullacct_*(), snapacct_*() and mapacct_*() routines.
  */
 static int cgaccount(int, struct vnode *, struct buf *, int);
 static int expunge_ufs1(struct vnode *, struct inode *, struct fs *,
     int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
     ufs_lbn_t, int), int, int);
 static int indiracct_ufs1(struct vnode *, struct vnode *, int,
     ufs1_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
     int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
     ufs_lbn_t, int), int);
 static int fullacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
     struct fs *, ufs_lbn_t, int);
 static int snapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
     struct fs *, ufs_lbn_t, int);
 static int mapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
     struct fs *, ufs_lbn_t, int);
 static int expunge_ufs2(struct vnode *, struct inode *, struct fs *,
     int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
     ufs_lbn_t, int), int, int);
 static int indiracct_ufs2(struct vnode *, struct vnode *, int,
     ufs2_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
     int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
     ufs_lbn_t, int), int);
 static int fullacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
     struct fs *, ufs_lbn_t, int);
 static int snapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
     struct fs *, ufs_lbn_t, int);
 static int mapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
     struct fs *, ufs_lbn_t, int);
 static int readblock(struct vnode *vp, struct buf *, ufs2_daddr_t);
 static void try_free_snapdata(struct vnode *devvp);
 static struct snapdata *ffs_snapdata_acquire(struct vnode *devvp);
 static int ffs_bp_snapblk(struct vnode *, struct buf *);
 
 /*
  * To ensure the consistency of snapshots across crashes, we must
  * synchronously write out copied blocks before allowing the
  * originals to be modified. Because of the rather severe speed
  * penalty that this imposes, the code normally only ensures
  * persistence for the filesystem metadata contained within a
  * snapshot. Setting the following flag allows this crash
  * persistence to be enabled for file contents.
  */
 int dopersistence = 0;
 
 /*
  * With DEBUG defined, export the tunables as read-write debug.* sysctls:
  *   debug.dopersistence    - the flag defined just above;
  *   debug.snapdebug        - when non-zero ffs_snapshot() prints each busy
  *                            vnode it examines;
  *   debug.collectsnapstats - when non-zero ffs_snapshot() times and prints
  *                            how long the filesystem stayed suspended.
  * All three default to 0 and carry an empty description string.
  */
 #ifdef DEBUG
 #include <sys/sysctl.h>
 SYSCTL_INT(_debug, OID_AUTO, dopersistence, CTLFLAG_RW, &dopersistence, 0, "");
 static int snapdebug = 0;
 SYSCTL_INT(_debug, OID_AUTO, snapdebug, CTLFLAG_RW, &snapdebug, 0, "");
 int collectsnapstats = 0;
 SYSCTL_INT(_debug, OID_AUTO, collectsnapstats, CTLFLAG_RW, &collectsnapstats,
 	0, "");
 #endif /* DEBUG */
 
 /*
  * Create a snapshot file and initialize it for the filesystem.
  *
  * mp is the mounted filesystem to snapshot and snapfile is the kernel-space
  * path name of the snapshot file to create; the file must not exist yet and
  * its directory must be on mp.
  *
  * Returns 0 on success, otherwise an errno value: EOPNOTSUPP when running
  * with journaled soft updates, ENOSPC when all FSMAXSNAP superblock slots
  * are taken, EEXIST or EXDEV for an unsuitable path, ENOENT if the snapshot
  * file is unlinked while it is being built, or the error reported by one of
  * the lookup, allocation or I/O routines called along the way.  On failure
  * after the file was created it is truncated to zero length.
  *
  * The work is done in three phases: everything that can be preallocated and
  * copied while the filesystem is live, a touch-up pass with writes
  * suspended, and the completion of the block list and superblock copy after
  * writes have been resumed.
  */
 int
 ffs_snapshot(mp, snapfile)
 	struct mount *mp;
 	char *snapfile;
 {
 	ufs2_daddr_t numblks, blkno, *blkp, *snapblklist;
 	int error, cg, snaploc;
 	int i, size, len, loc;
 	ufs2_daddr_t blockno;
 	uint64_t flag;
 	struct timespec starttime = {0, 0}, endtime;
 	char saved_nice = 0;
 	long redo = 0, snaplistsize = 0;
 	int32_t *lp;
 	void *space;
 	struct fs *copy_fs = NULL, *fs;
 	struct thread *td = curthread;
 	struct inode *ip, *xp;
 	struct buf *bp, *nbp, *ibp;
 	struct nameidata nd;
 	struct mount *wrtmp;
 	struct vattr vat;
 	struct vnode *vp, *xvp, *mvp, *devvp;
 	struct uio auio;
 	struct iovec aiov;
 	struct snapdata *sn;
 	struct ufsmount *ump;
 
 	ump = VFSTOUFS(mp);
 	fs = ump->um_fs;
 	sn = NULL;
 	/*
 	 * At the moment, journaled soft updates cannot support
 	 * taking snapshots.
 	 */
 	if (MOUNTEDSUJ(mp)) {
 		vfs_mount_error(mp, "%s: Snapshots are not yet supported when "
 		    "running with journaled soft updates", fs->fs_fsmnt);
 		return (EOPNOTSUPP);
 	}
 	/* Remember the mount flags; they are restored at "out" below. */
 	MNT_ILOCK(mp);
 	flag = mp->mnt_flag;
 	MNT_IUNLOCK(mp);
 	/*
 	 * Need to serialize access to snapshot code per filesystem.
 	 */
 	/*
 	 * Assign a snapshot slot in the superblock.
 	 */
 	UFS_LOCK(ump);
 	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
 		if (fs->fs_snapinum[snaploc] == 0)
 			break;
 	UFS_UNLOCK(ump);
 	if (snaploc == FSMAXSNAP)
 		return (ENOSPC);
 	/*
 	 * Create the snapshot file.
 	 */
 restart:
 	NDINIT(&nd, CREATE, LOCKPARENT | LOCKLEAF | NOCACHE, UIO_SYSSPACE,
 	    snapfile, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	if (nd.ni_vp != NULL) {
 		vput(nd.ni_vp);
 		error = EEXIST;
 	}
 	if (nd.ni_dvp->v_mount != mp)
 		error = EXDEV;
 	if (error) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		if (nd.ni_dvp == nd.ni_vp)
 			vrele(nd.ni_dvp);
 		else
 			vput(nd.ni_dvp);
 		return (error);
 	}
 	VATTR_NULL(&vat);
 	vat.va_type = VREG;
 	vat.va_mode = S_IRUSR;
 	vat.va_vaflags |= VA_EXCLUSIVE;
 	if (VOP_GETWRITEMOUNT(nd.ni_dvp, &wrtmp))
 		wrtmp = NULL;
 	if (wrtmp != mp)
 		panic("ffs_snapshot: mount mismatch");
 	vfs_rel(wrtmp);
 	/*
 	 * If write access cannot be had without sleeping, drop the directory,
 	 * wait for it, and redo the lookup from scratch.
 	 */
 	if (vn_start_write(NULL, &wrtmp, V_NOWAIT) != 0) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		vput(nd.ni_dvp);
 		if ((error = vn_start_write(NULL, &wrtmp,
 		    V_XSLEEP | PCATCH)) != 0)
 			return (error);
 		goto restart;
 	}
 	error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vat);
 	VOP_UNLOCK(nd.ni_dvp, 0);
 	if (error) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		vn_finished_write(wrtmp);
 		vrele(nd.ni_dvp);
 		return (error);
 	}
 	vp = nd.ni_vp;
 	vp->v_vflag |= VV_SYSTEM;
 	ip = VTOI(vp);
 	devvp = ITODEVVP(ip);
 	/*
 	 * Allocate and copy the last block contents so as to be able
 	 * to set size to that of the filesystem.
 	 */
 	numblks = howmany(fs->fs_size, fs->fs_frag);
 	error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)),
 	    fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp);
 	if (error)
 		goto out;
 	ip->i_size = lblktosize(fs, (off_t)numblks);
 	DIP_SET(ip, i_size, ip->i_size);
 	ip->i_flag |= IN_CHANGE | IN_UPDATE;
 	error = readblock(vp, bp, numblks - 1);
 	bawrite(bp);
 	if (error != 0)
 		goto out;
 	/*
 	 * Preallocate critical data structures so that we can copy
 	 * them in without further allocation after we suspend all
 	 * operations on the filesystem. We would like to just release
 	 * the allocated buffers without writing them since they will
 	 * be filled in below once we are ready to go, but this upsets
 	 * the soft update code, so we go ahead and write the new buffers.
 	 *
 	 * Allocate all indirect blocks and mark all of them as not
 	 * needing to be copied.
 	 */
 	for (blkno = UFS_NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
 		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
 		    fs->fs_bsize, td->td_ucred, BA_METAONLY, &ibp);
 		if (error)
 			goto out;
 		bawrite(ibp);
 	}
 	/*
 	 * Allocate copies for the superblock and its summary information.
 	 */
 	error = UFS_BALLOC(vp, fs->fs_sblockloc, fs->fs_sbsize, KERNCRED,
 	    0, &nbp);
 	if (error)
 		goto out;
 	bawrite(nbp);
 	blkno = fragstoblks(fs, fs->fs_csaddr);
 	len = howmany(fs->fs_cssize, fs->fs_bsize);
 	for (loc = 0; loc < len; loc++) {
 		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)),
 		    fs->fs_bsize, KERNCRED, 0, &nbp);
 		if (error)
 			goto out;
 		bawrite(nbp);
 	}
 	/*
 	 * Allocate all cylinder group blocks.
 	 */
 	for (cg = 0; cg < fs->fs_ncg; cg++) {
 		error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)),
 		    fs->fs_bsize, KERNCRED, 0, &nbp);
 		if (error)
 			goto out;
 		bawrite(nbp);
 		if (cg % 10 == 0)
 			ffs_syncvnode(vp, MNT_WAIT, 0);
 	}
 	/*
 	 * Copy all the cylinder group maps. Although the
 	 * filesystem is still active, we hope that only a few
 	 * cylinder groups will change between now and when we
 	 * suspend operations. Thus, we will be able to quickly
 	 * touch up the few cylinder groups that changed during
 	 * the suspension period.
 	 */
 	/* One bit per cylinder group; freed again at "out". */
 	len = howmany(fs->fs_ncg, NBBY);
 	space = malloc(len, M_DEVBUF, M_WAITOK|M_ZERO);
 	UFS_LOCK(ump);
 	fs->fs_active = space;
 	UFS_UNLOCK(ump);
 	for (cg = 0; cg < fs->fs_ncg; cg++) {
 		error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)),
 		    fs->fs_bsize, KERNCRED, 0, &nbp);
 		if (error)
 			goto out;
 		error = cgaccount(cg, vp, nbp, 1);
 		bawrite(nbp);
 		if (cg % 10 == 0)
 			ffs_syncvnode(vp, MNT_WAIT, 0);
 		if (error)
 			goto out;
 	}
 	/*
 	 * Change inode to snapshot type file.
 	 */
 	ip->i_flags |= SF_SNAPSHOT;
 	DIP_SET(ip, i_flags, ip->i_flags);
 	ip->i_flag |= IN_CHANGE | IN_UPDATE;
 	/*
 	 * Ensure that the snapshot is completely on disk.
 	 * Since we have marked it as a snapshot it is safe to
 	 * unlock it as no process will be allowed to write to it.
 	 */
 	if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0)
 		goto out;
 	VOP_UNLOCK(vp, 0);
 	/*
 	 * All allocations are done, so we can now snapshot the system.
 	 *
 	 * Rescind nice scheduling while running with the filesystem suspended.
 	 */
 	if (td->td_proc->p_nice > 0) {
 		struct proc *p;
 
 		p = td->td_proc;
 		PROC_LOCK(p);
 		saved_nice = p->p_nice;
 		sched_nice(p, 0);
 		PROC_UNLOCK(p);
 	}
 	/*
 	 * Suspend operation on filesystem.
 	 */
 	for (;;) {
 		vn_finished_write(wrtmp);
 		if ((error = vfs_write_suspend(vp->v_mount, 0)) != 0) {
 			vn_start_write(NULL, &wrtmp, V_WAIT);
 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 			goto out;
 		}
 		if (mp->mnt_kern_flag & MNTK_SUSPENDED)
 			break;
 		vn_start_write(NULL, &wrtmp, V_WAIT);
 	}
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	if (ip->i_effnlink == 0) {
 		error = ENOENT;		/* Snapshot file unlinked */
 		goto out1;
 	}
 	if (collectsnapstats)
 		nanotime(&starttime);
 
 	/* The last block might have changed.  Copy it again to be sure. */
 	error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)),
 	    fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp);
 	if (error != 0)
 		goto out1;
 	error = readblock(vp, bp, numblks - 1);
 	bp->b_flags |= B_VALIDSUSPWRT;
 	bawrite(bp);
 	if (error != 0)
 		goto out1;
 	/*
 	 * First, copy all the cylinder group maps that have changed.
 	 */
 	for (cg = 0; cg < fs->fs_ncg; cg++) {
 		if ((ACTIVECGNUM(fs, cg) & ACTIVECGOFF(cg)) != 0)
 			continue;
 		redo++;
 		error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)),
 		    fs->fs_bsize, KERNCRED, 0, &nbp);
 		if (error)
 			goto out1;
 		error = cgaccount(cg, vp, nbp, 2);
 		bawrite(nbp);
 		if (error)
 			goto out1;
 	}
 	/*
 	 * Grab a copy of the superblock and its summary information.
 	 * We delay writing it until the suspension is released below.
 	 */
 	copy_fs = malloc((u_long)fs->fs_bsize, M_UFSMNT, M_WAITOK);
 	bcopy(fs, copy_fs, fs->fs_sbsize);
 	if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0)
 		copy_fs->fs_clean = 1;
 	size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE;
 	if (fs->fs_sbsize < size)
 		bzero(&((char *)copy_fs)[fs->fs_sbsize],
 		    size - fs->fs_sbsize);
 	size = blkroundup(fs, fs->fs_cssize);
 	if (fs->fs_contigsumsize > 0)
 		size += fs->fs_ncg * sizeof(int32_t);
 	space = malloc((u_long)size, M_UFSMNT, M_WAITOK);
 	copy_fs->fs_csp = space;
 	bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize);
 	space = (char *)space + fs->fs_cssize;
 	/*
 	 * If the summary area does not end on a block boundary, read the
 	 * remaining fragments of its last block from the device so that the
 	 * copy covers whole blocks.
 	 */
 	loc = howmany(fs->fs_cssize, fs->fs_fsize);
 	i = fs->fs_frag - loc % fs->fs_frag;
 	len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize;
 	if (len > 0) {
 		if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc),
 		    len, KERNCRED, &bp)) != 0) {
 			brelse(bp);
 			free(copy_fs->fs_csp, M_UFSMNT);
 			free(copy_fs, M_UFSMNT);
 			copy_fs = NULL;
 			goto out1;
 		}
 		bcopy(bp->b_data, space, (u_int)len);
 		space = (char *)space + len;
 		bp->b_flags |= B_INVAL | B_NOCACHE;
 		brelse(bp);
 	}
 	if (fs->fs_contigsumsize > 0) {
 		copy_fs->fs_maxcluster = lp = space;
 		for (i = 0; i < fs->fs_ncg; i++)
 			*lp++ = fs->fs_contigsumsize;
 	}
 	/*
 	 * We must check for active files that have been unlinked
 	 * (e.g., with a zero link count). We have to expunge all
 	 * trace of these files from the snapshot so that they are
 	 * not reclaimed prematurely by fsck or unnecessarily dumped.
 	 * We turn off the MNTK_SUSPENDED flag to avoid a panic from
 	 * spec_strategy about writing on a suspended filesystem.
 	 * Note that we skip unlinked snapshot files as they will
 	 * be handled separately below.
 	 *
 	 * We also calculate the needed size for the snapshot list.
 	 */
 	snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) +
 	    FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */;
 	MNT_ILOCK(mp);
 	mp->mnt_kern_flag &= ~MNTK_SUSPENDED;
 	MNT_IUNLOCK(mp);
 loop:
 	MNT_VNODE_FOREACH_ALL(xvp, mp, mvp) {
 		if ((xvp->v_usecount == 0 &&
 		     (xvp->v_iflag & (VI_OWEINACT | VI_DOINGINACT)) == 0) ||
 		    xvp->v_type == VNON ||
 		    IS_SNAPSHOT(VTOI(xvp))) {
 			VI_UNLOCK(xvp);
 			continue;
 		}
 		/*
 		 * We can skip parent directory vnode because it must have
 		 * this snapshot file in it.
 		 */
 		if (xvp == nd.ni_dvp) {
 			VI_UNLOCK(xvp);
 			continue;
 		}
 		vholdl(xvp);
 		/* The vnode list may have changed while we slept: rescan. */
 		if (vn_lock(xvp, LK_EXCLUSIVE | LK_INTERLOCK) != 0) {
 			MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
 			vdrop(xvp);
 			goto loop;
 		}
 		/* Recheck the use count now that the vnode lock is held. */
 		VI_LOCK(xvp);
 		if (xvp->v_usecount == 0 &&
 		    (xvp->v_iflag & (VI_OWEINACT | VI_DOINGINACT)) == 0) {
 			VI_UNLOCK(xvp);
 			VOP_UNLOCK(xvp, 0);
 			vdrop(xvp);
 			continue;
 		}
 		VI_UNLOCK(xvp);
 		if (snapdebug)
 			vn_printf(xvp, "ffs_snapshot: busy vnode ");
 		/* Files that still have a link are left in the snapshot. */
 		if (VOP_GETATTR(xvp, &vat, td->td_ucred) == 0 &&
 		    vat.va_nlink > 0) {
 			VOP_UNLOCK(xvp, 0);
 			vdrop(xvp);
 			continue;
 		}
 		xp = VTOI(xvp);
 		if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) {
 			VOP_UNLOCK(xvp, 0);
 			vdrop(xvp);
 			continue;
 		}
 		/*
 		 * If there is a fragment, clear it here.
 		 */
 		blkno = 0;
 		loc = howmany(xp->i_size, fs->fs_bsize) - 1;
 		if (loc < UFS_NDADDR) {
 			len = fragroundup(fs, blkoff(fs, xp->i_size));
 			if (len != 0 && len < fs->fs_bsize) {
 				ffs_blkfree(ump, copy_fs, vp,
 				    DIP(xp, i_db[loc]), len, xp->i_number,
 				    xvp->v_type, NULL);
 				blkno = DIP(xp, i_db[loc]);
 				DIP_SET(xp, i_db[loc], 0);
 			}
 		}
 		snaplistsize += 1;
 		if (I_IS_UFS1(xp))
 			error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1,
 			    BLK_NOCOPY, 1);
 		else
 			error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2,
 			    BLK_NOCOPY, 1);
 		/* Put back the fragment pointer cleared above. */
 		if (blkno)
 			DIP_SET(xp, i_db[loc], blkno);
 		if (!error)
 			error = ffs_freefile(ump, copy_fs, vp, xp->i_number,
 			    xp->i_mode, NULL);
 		VOP_UNLOCK(xvp, 0);
 		vdrop(xvp);
 		if (error) {
 			free(copy_fs->fs_csp, M_UFSMNT);
 			free(copy_fs, M_UFSMNT);
 			copy_fs = NULL;
 			MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
 			goto out1;
 		}
 	}
 	/*
 	 * Erase the journal file from the snapshot.
 	 */
 	if (fs->fs_flags & FS_SUJ) {
 		error = softdep_journal_lookup(mp, &xvp);
 		if (error) {
 			free(copy_fs->fs_csp, M_UFSMNT);
 			free(copy_fs, M_UFSMNT);
 			copy_fs = NULL;
 			goto out1;
 		}
 		xp = VTOI(xvp);
 		if (I_IS_UFS1(xp))
 			error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1,
 			    BLK_NOCOPY, 0);
 		else
 			error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2,
 			    BLK_NOCOPY, 0);
 		vput(xvp);
 	}
 	/*
 	 * Acquire a lock on the snapdata structure, creating it if necessary.
 	 */
 	sn = ffs_snapdata_acquire(devvp);
 	/*
 	 * Change vnode to use shared snapshot lock instead of the original
 	 * private lock.
 	 */
 	vp->v_vnlock = &sn->sn_lock;
 	lockmgr(&vp->v_lock, LK_RELEASE, NULL);
 	xp = TAILQ_FIRST(&sn->sn_head);
 	/*
 	 * If this is the first snapshot on this filesystem, then we need
 	 * to allocate the space for the list of preallocated snapshot blocks.
 	 * This list will be refined below, but this preliminary one will
 	 * keep us out of deadlock until the full one is ready.
 	 */
 	if (xp == NULL) {
 		snapblklist = malloc(snaplistsize * sizeof(daddr_t),
 		    M_UFSMNT, M_WAITOK);
 		/* Slot 0 holds the entry count and is filled in last. */
 		blkp = &snapblklist[1];
 		*blkp++ = lblkno(fs, fs->fs_sblockloc);
 		blkno = fragstoblks(fs, fs->fs_csaddr);
 		for (cg = 0; cg < fs->fs_ncg; cg++) {
 			/*
 			 * Stop at the first cylinder group located past the
 			 * summary area so that the summary blocks are
 			 * inserted at their position in the list.  The
 			 * comparison has to be made on the converted block
 			 * number; comparing inside the fragstoblks()
 			 * argument would shift a 0/1 truth value instead.
 			 */
 			if (fragstoblks(fs, cgtod(fs, cg)) > blkno)
 				break;
 			*blkp++ = fragstoblks(fs, cgtod(fs, cg));
 		}
 		len = howmany(fs->fs_cssize, fs->fs_bsize);
 		for (loc = 0; loc < len; loc++)
 			*blkp++ = blkno + loc;
 		for (; cg < fs->fs_ncg; cg++)
 			*blkp++ = fragstoblks(fs, cgtod(fs, cg));
 		snapblklist[0] = blkp - snapblklist;
 		VI_LOCK(devvp);
 		if (sn->sn_blklist != NULL)
 			panic("ffs_snapshot: non-empty list");
 		sn->sn_blklist = snapblklist;
 		sn->sn_listsize = blkp - snapblklist;
 		VI_UNLOCK(devvp);
 	}
 	/*
 	 * Record snapshot inode. Since this is the newest snapshot,
 	 * it must be placed at the end of the list.
 	 */
 	VI_LOCK(devvp);
 	fs->fs_snapinum[snaploc] = ip->i_number;
 	if (ip->i_nextsnap.tqe_prev != 0)
 		panic("ffs_snapshot: %ju already on list",
 		    (uintmax_t)ip->i_number);
 	TAILQ_INSERT_TAIL(&sn->sn_head, ip, i_nextsnap);
 	devvp->v_vflag |= VV_COPYONWRITE;
 	VI_UNLOCK(devvp);
 	ASSERT_VOP_LOCKED(vp, "ffs_snapshot vp");
 out1:
 	KASSERT((sn != NULL && copy_fs != NULL && error == 0) ||
 		(sn == NULL && copy_fs == NULL && error != 0),
 		("email phk@ and mckusick@"));
 	/*
 	 * Resume operation on filesystem.
 	 */
 	vfs_write_resume(vp->v_mount, VR_START_WRITE | VR_NO_SUSPCLR);
 	if (collectsnapstats && starttime.tv_sec > 0) {
 		nanotime(&endtime);
-		timespecsub(&endtime, &starttime);
+		timespecsub(&endtime, &starttime, &endtime);
 		printf("%s: suspended %ld.%03ld sec, redo %ld of %d\n",
 		    vp->v_mount->mnt_stat.f_mntonname, (long)endtime.tv_sec,
 		    endtime.tv_nsec / 1000000, redo, fs->fs_ncg);
 	}
 	if (copy_fs == NULL)
 		goto out;
 	/*
 	 * Copy allocation information from all the snapshots in
 	 * this snapshot and then expunge them from its view.
 	 */
 	TAILQ_FOREACH(xp, &sn->sn_head, i_nextsnap) {
 		if (xp == ip)
 			break;
 		if (I_IS_UFS1(xp))
 			error = expunge_ufs1(vp, xp, fs, snapacct_ufs1,
 			    BLK_SNAP, 0);
 		else
 			error = expunge_ufs2(vp, xp, fs, snapacct_ufs2,
 			    BLK_SNAP, 0);
 		if (error == 0 && xp->i_effnlink == 0) {
 			error = ffs_freefile(ump,
 					     copy_fs,
 					     vp,
 					     xp->i_number,
 					     xp->i_mode, NULL);
 		}
 		if (error) {
 			fs->fs_snapinum[snaploc] = 0;
 			goto done;
 		}
 	}
 	/*
 	 * Allocate space for the full list of preallocated snapshot blocks.
 	 */
 	snapblklist = malloc(snaplistsize * sizeof(daddr_t),
 	    M_UFSMNT, M_WAITOK);
 	ip->i_snapblklist = &snapblklist[1];
 	/*
 	 * Expunge the blocks used by the snapshots from the set of
 	 * blocks marked as used in the snapshot bitmaps. Also, collect
 	 * the list of allocated blocks in i_snapblklist.
 	 */
 	if (I_IS_UFS1(ip))
 		error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1,
 		    BLK_SNAP, 0);
 	else
 		error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2,
 		    BLK_SNAP, 0);
 	if (error) {
 		fs->fs_snapinum[snaploc] = 0;
 		free(snapblklist, M_UFSMNT);
 		goto done;
 	}
 	if (snaplistsize < ip->i_snapblklist - snapblklist)
 		panic("ffs_snapshot: list too small");
 	snaplistsize = ip->i_snapblklist - snapblklist;
 	snapblklist[0] = snaplistsize;
 	ip->i_snapblklist = 0;
 	/*
 	 * Write out the list of allocated blocks to the end of the snapshot.
 	 */
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	aiov.iov_base = (void *)snapblklist;
 	aiov.iov_len = snaplistsize * sizeof(daddr_t);
 	auio.uio_resid = aiov.iov_len;
 	auio.uio_offset = ip->i_size;
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_rw = UIO_WRITE;
 	auio.uio_td = td;
 	if ((error = VOP_WRITE(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
 		fs->fs_snapinum[snaploc] = 0;
 		free(snapblklist, M_UFSMNT);
 		goto done;
 	}
 	/*
 	 * Write the superblock and its summary information
 	 * to the snapshot.
 	 */
 	blkno = fragstoblks(fs, fs->fs_csaddr);
 	len = howmany(fs->fs_cssize, fs->fs_bsize);
 	space = copy_fs->fs_csp;
 	for (loc = 0; loc < len; loc++) {
 		error = bread(vp, blkno + loc, fs->fs_bsize, KERNCRED, &nbp);
 		if (error) {
 			brelse(nbp);
 			fs->fs_snapinum[snaploc] = 0;
 			free(snapblklist, M_UFSMNT);
 			goto done;
 		}
 		bcopy(space, nbp->b_data, fs->fs_bsize);
 		space = (char *)space + fs->fs_bsize;
 		bawrite(nbp);
 	}
 	error = bread(vp, lblkno(fs, fs->fs_sblockloc), fs->fs_bsize,
 	    KERNCRED, &nbp);
 	if (error) {
 		brelse(nbp);
 	} else {
 		loc = blkoff(fs, fs->fs_sblockloc);
 		bcopy((char *)copy_fs, &nbp->b_data[loc], (u_int)fs->fs_sbsize);
 		bawrite(nbp);
 	}
 	/*
 	 * As this is the newest list, it is the most inclusive, so
 	 * should replace the previous list.
 	 */
 	VI_LOCK(devvp);
 	space = sn->sn_blklist;
 	sn->sn_blklist = snapblklist;
 	sn->sn_listsize = snaplistsize;
 	VI_UNLOCK(devvp);
 	if (space != NULL)
 		free(space, M_UFSMNT);
 	/*
 	 * Preallocate all the direct blocks in the snapshot inode so
 	 * that we never have to write the inode itself to commit an
 	 * update to the contents of the snapshot. Note that once
 	 * created, the size of the snapshot will never change, so
 	 * there will never be a need to write the inode except to
 	 * update the non-integrity-critical time fields and
 	 * allocated-block count.
 	 */
 	for (blockno = 0; blockno < UFS_NDADDR; blockno++) {
 		if (DIP(ip, i_db[blockno]) != 0)
 			continue;
 		error = UFS_BALLOC(vp, lblktosize(fs, blockno),
 		    fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp);
 		if (error)
 			break;
 		error = readblock(vp, bp, blockno);
 		bawrite(bp);
 		if (error != 0)
 			break;
 	}
 done:
 	free(copy_fs->fs_csp, M_UFSMNT);
 	free(copy_fs, M_UFSMNT);
 	copy_fs = NULL;
 out:
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	/* Restore the nice value that was rescinded before suspending. */
 	if (saved_nice > 0) {
 		struct proc *p;
 
 		p = td->td_proc;
 		PROC_LOCK(p);
 		sched_nice(td->td_proc, saved_nice);
 		PROC_UNLOCK(td->td_proc);
 	}
 	UFS_LOCK(ump);
 	if (fs->fs_active != 0) {
 		free(fs->fs_active, M_DEVBUF);
 		fs->fs_active = 0;
 	}
 	UFS_UNLOCK(ump);
 	/*
 	 * Restore the mount flags saved on entry, except for MNT_QUOTA
 	 * whose current setting is kept.
 	 */
 	MNT_ILOCK(mp);
 	mp->mnt_flag = (mp->mnt_flag & MNT_QUOTA) | (flag & ~MNT_QUOTA);
 	MNT_IUNLOCK(mp);
 	if (error)
 		(void) ffs_truncate(vp, (off_t)0, 0, NOCRED);
 	(void) ffs_syncvnode(vp, MNT_WAIT, 0);
 	if (error)
 		vput(vp);
 	else
 		VOP_UNLOCK(vp, 0);
 	vrele(nd.ni_dvp);
 	vn_finished_write(wrtmp);
 	process_deferred_inactive(mp);
 	return (error);
 }
 
 /*
  * Copy a cylinder group map. All the unallocated blocks are marked
  * BLK_NOCOPY so that the snapshot knows that it need not copy them
  * if they are later written. If passno is one, then this is a first
  * pass, so only setting needs to be done. If passno is 2, then this
  * is a revision to a previous pass which must be undone as the
  * replacement pass is done.
  *
  * cg is the cylinder group number, vp the snapshot vnode and nbp the
  * snapshot buffer that receives the copy of the cylinder group block.
  * Returns 0 on success or the error reported by ffs_getcg() or
  * UFS_BALLOC().  The caller keeps ownership of nbp and writes it out.
  */
 static int
 cgaccount(cg, vp, nbp, passno)
 	int cg;
 	struct vnode *vp;
 	struct buf *nbp;
 	int passno;
 {
 	struct buf *bp, *ibp;
 	struct inode *ip;
 	struct cg *cgp;
 	struct fs *fs;
 	ufs2_daddr_t base, numblks;
 	int error, len, loc, indiroff;
 
 	ip = VTOI(vp);
 	fs = ITOFS(ip);
 	if ((error = ffs_getcg(fs, ITODEVVP(ip), cg, &bp, &cgp)) != 0)
 		return (error);
 	UFS_LOCK(ITOUMP(ip));
 	/* Note in fs_active that this group's map has been captured. */
 	ACTIVESET(fs, cg);
 	/*
 	 * Recomputation of summary information might not have been performed
 	 * at mount time.  Sync up summary information for current cylinder
 	 * group while data is in memory to ensure that result of background
 	 * fsck is slightly more consistent.
 	 */
 	fs->fs_cs(fs, cg) = cgp->cg_cs;
 	UFS_UNLOCK(ITOUMP(ip));
 	/* Copy the group into the snapshot buffer, zero-filling the tail. */
 	bcopy(bp->b_data, nbp->b_data, fs->fs_cgsize);
 	if (fs->fs_cgsize < fs->fs_bsize)
 		bzero(&nbp->b_data[fs->fs_cgsize],
 		    fs->fs_bsize - fs->fs_cgsize);
 	/* From here on work from the copy; the original is released. */
 	cgp = (struct cg *)nbp->b_data;
 	bqrelse(bp);
 	if (passno == 2)
 		nbp->b_flags |= B_VALIDSUSPWRT;
 	numblks = howmany(fs->fs_size, fs->fs_frag);
 	len = howmany(fs->fs_fpg, fs->fs_frag);
 	base = cgbase(fs, cg) / fs->fs_frag;
 	/*
 	 * Clamp the count for a group that reaches the end of the
 	 * filesystem; the loops below then stop one block short of numblks.
 	 */
 	if (base + len >= numblks)
 		len = numblks - base - 1;
 	loc = 0;
 	/*
 	 * Blocks mapped by the inode's direct pointers.  Free blocks are
 	 * marked BLK_NOCOPY; on pass 2 a stale BLK_NOCOPY mark on a block
 	 * that is no longer free is cleared.
 	 */
 	if (base < UFS_NDADDR) {
 		for ( ; loc < UFS_NDADDR; loc++) {
 			if (ffs_isblock(fs, cg_blksfree(cgp), loc))
 				DIP_SET(ip, i_db[loc], BLK_NOCOPY);
 			else if (passno == 2 && DIP(ip, i_db[loc])== BLK_NOCOPY)
 				DIP_SET(ip, i_db[loc], 0);
 			else if (passno == 1 && DIP(ip, i_db[loc])== BLK_NOCOPY)
 				panic("ffs_snapshot: lost direct block");
 		}
 	}
 	/* Fetch the indirect block that maps logical block base + loc. */
 	error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)),
 	    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
 	if (error) {
 		goto out;
 	}
 	indiroff = (base + loc - UFS_NDADDR) % NINDIR(fs);
 	for ( ; loc < len; loc++, indiroff++) {
 		/* Ran off the end of this indirect block: move to the next. */
 		if (indiroff >= NINDIR(fs)) {
 			if (passno == 2)
 				ibp->b_flags |= B_VALIDSUSPWRT;
 			bawrite(ibp);
 			error = UFS_BALLOC(vp,
 			    lblktosize(fs, (off_t)(base + loc)),
 			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
 			if (error) {
 				goto out;
 			}
 			indiroff = 0;
 		}
 		/* Same marking rules as above, on UFS1-sized pointers. */
 		if (I_IS_UFS1(ip)) {
 			if (ffs_isblock(fs, cg_blksfree(cgp), loc))
 				((ufs1_daddr_t *)(ibp->b_data))[indiroff] =
 				    BLK_NOCOPY;
 			else if (passno == 2 && ((ufs1_daddr_t *)(ibp->b_data))
 			    [indiroff] == BLK_NOCOPY)
 				((ufs1_daddr_t *)(ibp->b_data))[indiroff] = 0;
 			else if (passno == 1 && ((ufs1_daddr_t *)(ibp->b_data))
 			    [indiroff] == BLK_NOCOPY)
 				panic("ffs_snapshot: lost indirect block");
 			continue;
 		}
 		/* UFS2-sized pointers. */
 		if (ffs_isblock(fs, cg_blksfree(cgp), loc))
 			((ufs2_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY;
 		else if (passno == 2 &&
 		    ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY)
 			((ufs2_daddr_t *)(ibp->b_data))[indiroff] = 0;
 		else if (passno == 1 &&
 		    ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY)
 			panic("ffs_snapshot: lost indirect block");
 	}
 	if (passno == 2)
 		ibp->b_flags |= B_VALIDSUSPWRT;
 	bdwrite(ibp);
 out:
 	/*
 	 * We have to calculate the crc32c here rather than just setting the
 	 * BX_CYLGRP b_xflags because the allocation of the block for the
 	 * cylinder group map will always be a full size block (fs_bsize)
 	 * even though the cylinder group may be smaller (fs_cgsize). The
 	 * crc32c must be computed only over fs_cgsize whereas the BX_CYLGRP
 	 * flag causes it to be computed over the size of the buffer.
 	 */
 	if ((fs->fs_metackhash & CK_CYLGRP) != 0) {
 		/* The hash field itself must be zero while hashing. */
 		((struct cg *)nbp->b_data)->cg_ckhash = 0;
 		((struct cg *)nbp->b_data)->cg_ckhash =
 		    calculate_crc32c(~0L, nbp->b_data, fs->fs_cgsize);
 	}
 	return (error);
 }
 
 /*
  * Before expunging a snapshot inode, note all the
  * blocks that it claims with BLK_SNAP so that fsck will
  * be able to account for those blocks properly and so
  * that this snapshot knows that it need not copy them
  * if the other snapshot holding them is freed. This code
  * is reproduced once each for UFS1 and UFS2.
  *
  * snapvp is the snapshot being updated and cancelip the inode to remove
  * from its view.  fs supplies the filesystem geometry and is handed to
  * acctfunc.  acctfunc is called on ranges of cancelip's block pointers;
  * expungetype (BLK_NOCOPY or BLK_SNAP at the call sites in this file) is
  * passed through to it.  A non-zero clearmode also zeroes the mode of the
  * inode copy held in the snapshot.  Returns 0 or an errno value.
  */
 static int
 expunge_ufs1(snapvp, cancelip, fs, acctfunc, expungetype, clearmode)
 	struct vnode *snapvp;
 	struct inode *cancelip;
 	struct fs *fs;
 	int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
 	    struct fs *, ufs_lbn_t, int);
 	int expungetype;
 	int clearmode;
 {
 	int i, error, indiroff;
 	ufs_lbn_t lbn, rlbn;
 	ufs2_daddr_t len, blkno, numblks, blksperindir;
 	struct ufs1_dinode *dip;
 	struct thread *td = curthread;
 	struct buf *bp;
 
 	/*
 	 * Prepare to expunge the inode. If its inode block has not
 	 * yet been copied, then allocate and fill the copy.
 	 */
 	lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
 	blkno = 0;
 	if (lbn < UFS_NDADDR) {
 		blkno = VTOI(snapvp)->i_din1->di_db[lbn];
 	} else {
 		/* Look the pointer up in the snapshot's indirect block. */
 		if (DOINGSOFTDEP(snapvp))
 			softdep_prealloc(snapvp, MNT_WAIT);
 		td->td_pflags |= TDP_COWINPROGRESS;
 		error = ffs_balloc_ufs1(snapvp, lblktosize(fs, (off_t)lbn),
 		   fs->fs_bsize, KERNCRED, BA_METAONLY, &bp);
 		td->td_pflags &= ~TDP_COWINPROGRESS;
 		if (error)
 			return (error);
 		indiroff = (lbn - UFS_NDADDR) % NINDIR(fs);
 		blkno = ((ufs1_daddr_t *)(bp->b_data))[indiroff];
 		bqrelse(bp);
 	}
 	if (blkno != 0) {
 		/* Already copied: read the snapshot's copy. */
 		if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp)))
 			return (error);
 	} else {
 		/* Not copied yet: allocate the block and fill it. */
 		error = ffs_balloc_ufs1(snapvp, lblktosize(fs, (off_t)lbn),
 		    fs->fs_bsize, KERNCRED, 0, &bp);
 		if (error)
 			return (error);
 		/*
 		 * NOTE(review): bp is not released here when readblock()
 		 * fails; confirm whether that leaves the buffer held.
 		 */
 		if ((error = readblock(snapvp, bp, lbn)) != 0)
 			return (error);
 	}
 	/*
 	 * Set a snapshot inode to be a zero length file, regular files
 	 * or unlinked snapshots to be completely unallocated.
 	 */
 	dip = (struct ufs1_dinode *)bp->b_data +
 	    ino_to_fsbo(fs, cancelip->i_number);
 	if (clearmode || cancelip->i_effnlink == 0)
 		dip->di_mode = 0;
 	dip->di_size = 0;
 	dip->di_blocks = 0;
 	dip->di_flags &= ~SF_SNAPSHOT;
 	bzero(&dip->di_db[0], (UFS_NDADDR + UFS_NIADDR) * sizeof(ufs1_daddr_t));
 	bdwrite(bp);
 	/*
 	 * Now go through and expunge all the blocks in the file
 	 * using the function requested.
 	 */
 	numblks = howmany(cancelip->i_size, fs->fs_bsize);
 	/* Direct pointers, starting at logical block 0. */
 	if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_db[0],
 	    &cancelip->i_din1->di_db[UFS_NDADDR], fs, 0, expungetype)))
 		return (error);
 	/* The indirect pointers themselves; -1 is passed as their lbn. */
 	if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_ib[0],
 	    &cancelip->i_din1->di_ib[UFS_NIADDR], fs, -1, expungetype)))
 		return (error);
 	/*
 	 * Walk each level of indirection.  lbn is the (negative) logical
 	 * block number of the indirect block at this level, rlbn the first
 	 * data block it maps, len the number of data blocks still to be
 	 * covered and blksperindir the number of data blocks mapped by each
 	 * pointer at this level.
 	 */
 	blksperindir = 1;
 	lbn = -UFS_NDADDR;
 	len = numblks - UFS_NDADDR;
 	rlbn = UFS_NDADDR;
 	for (i = 0; len > 0 && i < UFS_NIADDR; i++) {
 		error = indiracct_ufs1(snapvp, ITOV(cancelip), i,
 		    cancelip->i_din1->di_ib[i], lbn, rlbn, len,
 		    blksperindir, fs, acctfunc, expungetype);
 		if (error)
 			return (error);
 		blksperindir *= NINDIR(fs);
 		lbn -= blksperindir + 1;
 		len -= blksperindir;
 		rlbn += blksperindir;
 	}
 	return (0);
 }
 
 /*
  * Descend an indirect block chain for vnode cancelvp accounting for all
  * its indirect blocks in snapvp.
  *
  * level is the depth of the indirect block at disk address blkno, lbn is
  * its (negative) logical block number and rlbn is the first file block it
  * maps. remblks is the number of file blocks still to be accounted for and
  * blksperindir the number of file blocks covered by each pointer held in
  * this indirect block. acctfunc is called on the pointers of this block
  * and, through recursion, on those of every indirect block below it.
  *
  * Returns 0 on success or the first error reported by ufs_getlbns(),
  * readblock() or acctfunc. A zero blkno is tolerated only when
  * expungetype is BLK_NOCOPY; otherwise it is a panic.
  */
 static int
 indiracct_ufs1(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks,
 	    blksperindir, fs, acctfunc, expungetype)
 	struct vnode *snapvp;
 	struct vnode *cancelvp;
 	int level;
 	ufs1_daddr_t blkno;
 	ufs_lbn_t lbn;
 	ufs_lbn_t rlbn;
 	ufs_lbn_t remblks;
 	ufs_lbn_t blksperindir;
 	struct fs *fs;
 	int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
 	    struct fs *, ufs_lbn_t, int);
 	int expungetype;
 {
 	int error, num, i;
 	ufs_lbn_t subblksperindir;
 	struct indir indirs[UFS_NIADDR + 2];
 	ufs1_daddr_t last, *bap;
 	struct buf *bp;
 
 	if (blkno == 0) {
 		if (expungetype == BLK_NOCOPY)
 			return (0);
 		panic("indiracct_ufs1: missing indir");
 	}
 	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
 		return (error);
 	/*
 	 * Validate the path length before using it as an index so that a
 	 * short path cannot make us read outside of indirs[].
 	 */
 	if (num < 2 || level > num - 1 ||
 	    lbn != indirs[num - 1 - level].in_lbn)
 		panic("indiracct_ufs1: botched params");
 	/*
 	 * We have to expand bread here since it will deadlock looking
 	 * up the block number for any blocks that are not in the cache.
 	 */
 	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0);
 	bp->b_blkno = fsbtodb(fs, blkno);
 	/*
 	 * NOTE(review): indiracct_ufs2 tests B_CACHE at this point rather
 	 * than B_DONE | B_DELWRI; confirm which test is intended.
 	 */
 	if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
 	    (error = readblock(cancelvp, bp, fragstoblks(fs, blkno)))) {
 		brelse(bp);
 		return (error);
 	}
 	/*
 	 * Account for the block pointers in this indirect block.
 	 * Only the first "last" pointers can map file data.
 	 */
 	last = howmany(remblks, blksperindir);
 	if (last > NINDIR(fs))
 		last = NINDIR(fs);
 	/*
 	 * Work on a private copy of the pointers so that the buffer
 	 * can be released before acctfunc and the recursion run.
 	 */
 	bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK);
 	bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
 	bqrelse(bp);
 	error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs,
 	    level == 0 ? rlbn : -1, expungetype);
 	if (error || level == 0)
 		goto out;
 	/*
 	 * Account for the block pointers in each of the indirect blocks
 	 * in the levels below us.
 	 */
 	subblksperindir = blksperindir / NINDIR(fs);
 	for (lbn++, level--, i = 0; i < last; i++) {
 		error = indiracct_ufs1(snapvp, cancelvp, level, bap[i], lbn,
 		    rlbn, remblks, subblksperindir, fs, acctfunc, expungetype);
 		if (error)
 			goto out;
 		rlbn += blksperindir;
 		lbn -= blksperindir;
 		remblks -= blksperindir;
 	}
 out:
 	free(bap, M_DEVBUF);
 	return (error);
 }
 
 /*
  * Do both snap accounting and map accounting.
  *
  * snapacct_ufs1() runs first; mapacct_ufs1() is only called when it
  * succeeded. The first non-zero result is returned to the caller.
  */
 static int
 fullacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)
 	struct vnode *vp;
 	ufs1_daddr_t *oldblkp, *lastblkp;
 	struct fs *fs;
 	ufs_lbn_t lblkno;
 	int exptype;	/* BLK_SNAP or BLK_NOCOPY */
 {
 	int rv;
 
 	rv = snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype);
 	if (rv == 0)
 		rv = mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype);
 	return (rv);
 }
 
 /*
  * Identify a set of blocks allocated in a snapshot inode.
  */
 static int
 snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
 	struct vnode *vp;
 	ufs1_daddr_t *oldblkp, *lastblkp;
 	struct fs *fs;
 	ufs_lbn_t lblkno;
 	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
 {
 	struct inode *ip = VTOI(vp);
 	ufs1_daddr_t blkno, *blkp;
 	ufs_lbn_t lbn;
 	struct buf *ibp;
 	int error;
 
 	for ( ; oldblkp < lastblkp; oldblkp++) {
 		blkno = *oldblkp;
 		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
 			continue;
 		lbn = fragstoblks(fs, blkno);
 		if (lbn < UFS_NDADDR) {
 			blkp = &ip->i_din1->di_db[lbn];
 			ip->i_flag |= IN_CHANGE | IN_UPDATE;
 		} else {
 			error = ffs_balloc_ufs1(vp, lblktosize(fs, (off_t)lbn),
 			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
 			if (error)
 				return (error);
 			blkp = &((ufs1_daddr_t *)(ibp->b_data))
 			    [(lbn - UFS_NDADDR) % NINDIR(fs)];
 		}
 		/*
 		 * If we are expunging a snapshot vnode and we
 		 * find a block marked BLK_NOCOPY, then it is
 		 * one that has been allocated to this snapshot after
 		 * we took our current snapshot and can be ignored.
 		 */
 		if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) {
 			if (lbn >= UFS_NDADDR)
 				brelse(ibp);
 		} else {
 			if (*blkp != 0)
 				panic("snapacct_ufs1: bad block");
 			*blkp = expungetype;
 			if (lbn >= UFS_NDADDR)
 				bdwrite(ibp);
 		}
 	}
 	return (0);
 }
 
 /*
  * Account for a set of blocks allocated in a snapshot inode.
  *
  * Each block named in [oldblkp, lastblkp) is handed to ffs_blkfree().
  * An lblkno of -1 on entry disables recording of logical block numbers
  * in the inode's i_snapblklist. Always returns 0.
  */
 static int
 mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
 	struct vnode *vp;
 	ufs1_daddr_t *oldblkp, *lastblkp;
 	struct fs *fs;
 	ufs_lbn_t lblkno;
 	int expungetype;
 {
 	struct inode *snapip;
 	ufs1_daddr_t *cur, pblk;
 	ino_t snapino;
 	int record;
 
 	snapip = VTOI(vp);
 	snapino = snapip->i_number;
 	record = (lblkno != -1) ? 1 : 0;
 	for (cur = oldblkp; cur < lastblkp; cur++, lblkno++) {
 		pblk = *cur;
 		if (pblk == 0 || pblk == BLK_NOCOPY)
 			continue;
 		if (pblk == BLK_SNAP) {
 			/* The block lives at its own logical address. */
 			pblk = blkstofrags(fs, lblkno);
 		} else if (record && expungetype == BLK_SNAP) {
 			*snapip->i_snapblklist++ = lblkno;
 		}
 		ffs_blkfree(ITOUMP(snapip), fs, vp, pblk, fs->fs_bsize,
 		    snapino, vp->v_type, NULL);
 	}
 	return (0);
 }
 
 /*
  * Before expunging a snapshot inode, note all the
  * blocks that it claims with BLK_SNAP so that fsck will
  * be able to account for those blocks properly and so
  * that this snapshot knows that it need not copy them
  * if the other snapshot holding them is freed. This code
  * is reproduced once each for UFS1 and UFS2.
  */
 static int
 expunge_ufs2(snapvp, cancelip, fs, acctfunc, expungetype, clearmode)
 	struct vnode *snapvp;
 	struct inode *cancelip;
 	struct fs *fs;
 	int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
 	    struct fs *, ufs_lbn_t, int);
 	int expungetype;
 	int clearmode;
 {
 	int i, error, indiroff;
 	ufs_lbn_t lbn, rlbn;
 	ufs2_daddr_t len, blkno, numblks, blksperindir;
 	struct ufs2_dinode *dip;
 	struct thread *td = curthread;
 	struct buf *bp;
 
 	/*
 	 * Prepare to expunge the inode. If its inode block has not
 	 * yet been copied, then allocate and fill the copy.
 	 */
 	lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
 	blkno = 0;
 	if (lbn < UFS_NDADDR) {
 		blkno = VTOI(snapvp)->i_din2->di_db[lbn];
 	} else {
 		if (DOINGSOFTDEP(snapvp))
 			softdep_prealloc(snapvp, MNT_WAIT);
 		td->td_pflags |= TDP_COWINPROGRESS;
 		error = ffs_balloc_ufs2(snapvp, lblktosize(fs, (off_t)lbn),
 		   fs->fs_bsize, KERNCRED, BA_METAONLY, &bp);
 		td->td_pflags &= ~TDP_COWINPROGRESS;
 		if (error)
 			return (error);
 		indiroff = (lbn - UFS_NDADDR) % NINDIR(fs);
 		blkno = ((ufs2_daddr_t *)(bp->b_data))[indiroff];
 		bqrelse(bp);
 	}
 	if (blkno != 0) {
 		if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp)))
 			return (error);
 	} else {
 		error = ffs_balloc_ufs2(snapvp, lblktosize(fs, (off_t)lbn),
 		    fs->fs_bsize, KERNCRED, 0, &bp);
 		if (error)
 			return (error);
 		if ((error = readblock(snapvp, bp, lbn)) != 0)
 			return (error);
 	}
 	/*
 	 * Set a snapshot inode to be a zero length file, regular files
 	 * to be completely unallocated.
 	 */
 	dip = (struct ufs2_dinode *)bp->b_data +
 	    ino_to_fsbo(fs, cancelip->i_number);
 	if (clearmode || cancelip->i_effnlink == 0)
 		dip->di_mode = 0;
 	dip->di_size = 0;
 	dip->di_blocks = 0;
 	dip->di_flags &= ~SF_SNAPSHOT;
 	bzero(&dip->di_db[0], (UFS_NDADDR + UFS_NIADDR) * sizeof(ufs2_daddr_t));
 	bdwrite(bp);
 	/*
 	 * Now go through and expunge all the blocks in the file
 	 * using the function requested.
 	 */
 	numblks = howmany(cancelip->i_size, fs->fs_bsize);
 	if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_db[0],
 	    &cancelip->i_din2->di_db[UFS_NDADDR], fs, 0, expungetype)))
 		return (error);
 	if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_ib[0],
 	    &cancelip->i_din2->di_ib[UFS_NIADDR], fs, -1, expungetype)))
 		return (error);
 	blksperindir = 1;
 	lbn = -UFS_NDADDR;
 	len = numblks - UFS_NDADDR;
 	rlbn = UFS_NDADDR;
 	for (i = 0; len > 0 && i < UFS_NIADDR; i++) {
 		error = indiracct_ufs2(snapvp, ITOV(cancelip), i,
 		    cancelip->i_din2->di_ib[i], lbn, rlbn, len,
 		    blksperindir, fs, acctfunc, expungetype);
 		if (error)
 			return (error);
 		blksperindir *= NINDIR(fs);
 		lbn -= blksperindir + 1;
 		len -= blksperindir;
 		rlbn += blksperindir;
 	}
 	return (0);
 }
 
 /*
  * Descend an indirect block chain for vnode cancelvp accounting for all
  * its indirect blocks in snapvp.
  *
  * level is the depth of the indirect block at disk address blkno, lbn is
  * its (negative) logical block number and rlbn is the first file block it
  * maps. remblks is the number of file blocks still to be accounted for and
  * blksperindir the number of file blocks covered by each pointer held in
  * this indirect block. acctfunc is called on the pointers of this block
  * and, through recursion, on those of every indirect block below it.
  *
  * Returns 0 on success or the first error reported by ufs_getlbns(),
  * readblock() or acctfunc. A zero blkno is tolerated only when
  * expungetype is BLK_NOCOPY; otherwise it is a panic.
  */
 static int
 indiracct_ufs2(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks,
 	    blksperindir, fs, acctfunc, expungetype)
 	struct vnode *snapvp;
 	struct vnode *cancelvp;
 	int level;
 	ufs2_daddr_t blkno;
 	ufs_lbn_t lbn;
 	ufs_lbn_t rlbn;
 	ufs_lbn_t remblks;
 	ufs_lbn_t blksperindir;
 	struct fs *fs;
 	int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
 	    struct fs *, ufs_lbn_t, int);
 	int expungetype;
 {
 	int error, num, i;
 	ufs_lbn_t subblksperindir;
 	struct indir indirs[UFS_NIADDR + 2];
 	ufs2_daddr_t last, *bap;
 	struct buf *bp;
 
 	if (blkno == 0) {
 		if (expungetype == BLK_NOCOPY)
 			return (0);
 		panic("indiracct_ufs2: missing indir");
 	}
 	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
 		return (error);
 	/*
 	 * Validate the path length before using it as an index so that a
 	 * short path cannot make us read outside of indirs[].
 	 */
 	if (num < 2 || level > num - 1 ||
 	    lbn != indirs[num - 1 - level].in_lbn)
 		panic("indiracct_ufs2: botched params");
 	/*
 	 * We have to expand bread here since it will deadlock looking
 	 * up the block number for any blocks that are not in the cache.
 	 */
 	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0);
 	bp->b_blkno = fsbtodb(fs, blkno);
 	/*
 	 * NOTE(review): indiracct_ufs1 tests B_DONE | B_DELWRI at this
 	 * point rather than B_CACHE; confirm which test is intended.
 	 */
 	if ((bp->b_flags & B_CACHE) == 0 &&
 	    (error = readblock(cancelvp, bp, fragstoblks(fs, blkno)))) {
 		brelse(bp);
 		return (error);
 	}
 	/*
 	 * Account for the block pointers in this indirect block.
 	 * Only the first "last" pointers can map file data.
 	 */
 	last = howmany(remblks, blksperindir);
 	if (last > NINDIR(fs))
 		last = NINDIR(fs);
 	/*
 	 * Work on a private copy of the pointers so that the buffer
 	 * can be released before acctfunc and the recursion run.
 	 */
 	bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK);
 	bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
 	bqrelse(bp);
 	error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs,
 	    level == 0 ? rlbn : -1, expungetype);
 	if (error || level == 0)
 		goto out;
 	/*
 	 * Account for the block pointers in each of the indirect blocks
 	 * in the levels below us.
 	 */
 	subblksperindir = blksperindir / NINDIR(fs);
 	for (lbn++, level--, i = 0; i < last; i++) {
 		error = indiracct_ufs2(snapvp, cancelvp, level, bap[i], lbn,
 		    rlbn, remblks, subblksperindir, fs, acctfunc, expungetype);
 		if (error)
 			goto out;
 		rlbn += blksperindir;
 		lbn -= blksperindir;
 		remblks -= blksperindir;
 	}
 out:
 	free(bap, M_DEVBUF);
 	return (error);
 }
 
 /*
  * Do both snap accounting and map accounting.
  *
  * snapacct_ufs2() runs first; mapacct_ufs2() is only called when it
  * succeeded. The first non-zero result is returned to the caller.
  */
 static int
 fullacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)
 	struct vnode *vp;
 	ufs2_daddr_t *oldblkp, *lastblkp;
 	struct fs *fs;
 	ufs_lbn_t lblkno;
 	int exptype;	/* BLK_SNAP or BLK_NOCOPY */
 {
 	int rv;
 
 	rv = snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype);
 	if (rv == 0)
 		rv = mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype);
 	return (rv);
 }
 
 /*
  * Identify a set of blocks allocated in a snapshot inode.
  */
 static int
 snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
 	struct vnode *vp;
 	ufs2_daddr_t *oldblkp, *lastblkp;
 	struct fs *fs;
 	ufs_lbn_t lblkno;
 	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
 {
 	struct inode *ip = VTOI(vp);
 	ufs2_daddr_t blkno, *blkp;
 	ufs_lbn_t lbn;
 	struct buf *ibp;
 	int error;
 
 	for ( ; oldblkp < lastblkp; oldblkp++) {
 		blkno = *oldblkp;
 		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
 			continue;
 		lbn = fragstoblks(fs, blkno);
 		if (lbn < UFS_NDADDR) {
 			blkp = &ip->i_din2->di_db[lbn];
 			ip->i_flag |= IN_CHANGE | IN_UPDATE;
 		} else {
 			error = ffs_balloc_ufs2(vp, lblktosize(fs, (off_t)lbn),
 			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
 			if (error)
 				return (error);
 			blkp = &((ufs2_daddr_t *)(ibp->b_data))
 			    [(lbn - UFS_NDADDR) % NINDIR(fs)];
 		}
 		/*
 		 * If we are expunging a snapshot vnode and we
 		 * find a block marked BLK_NOCOPY, then it is
 		 * one that has been allocated to this snapshot after
 		 * we took our current snapshot and can be ignored.
 		 */
 		if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) {
 			if (lbn >= UFS_NDADDR)
 				brelse(ibp);
 		} else {
 			if (*blkp != 0)
 				panic("snapacct_ufs2: bad block");
 			*blkp = expungetype;
 			if (lbn >= UFS_NDADDR)
 				bdwrite(ibp);
 		}
 	}
 	return (0);
 }
 
 /*
  * Account for a set of blocks allocated in a snapshot inode.
  *
  * Each block named in [oldblkp, lastblkp) is handed to ffs_blkfree().
  * An lblkno of -1 on entry disables recording of logical block numbers
  * in the inode's i_snapblklist. Always returns 0.
  */
 static int
 mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
 	struct vnode *vp;
 	ufs2_daddr_t *oldblkp, *lastblkp;
 	struct fs *fs;
 	ufs_lbn_t lblkno;
 	int expungetype;
 {
 	struct inode *snapip;
 	ufs2_daddr_t *cur, pblk;
 	ino_t snapino;
 	int record;
 
 	snapip = VTOI(vp);
 	snapino = snapip->i_number;
 	record = (lblkno != -1) ? 1 : 0;
 	for (cur = oldblkp; cur < lastblkp; cur++, lblkno++) {
 		pblk = *cur;
 		if (pblk == 0 || pblk == BLK_NOCOPY)
 			continue;
 		if (pblk == BLK_SNAP) {
 			/* The block lives at its own logical address. */
 			pblk = blkstofrags(fs, lblkno);
 		} else if (record && expungetype == BLK_SNAP) {
 			*snapip->i_snapblklist++ = lblkno;
 		}
 		ffs_blkfree(ITOUMP(snapip), fs, vp, pblk, fs->fs_bsize,
 		    snapino, vp->v_type, NULL);
 	}
 	return (0);
 }
 
 /*
  * Decrement extra reference on snapshot when last name is removed.
  * It will not be freed until the last open reference goes away.
  */
 void
 ffs_snapgone(ip)
 	struct inode *ip;
 {
 	struct inode *xp;
 	struct fs *fs;
 	int snaploc;
 	struct snapdata *sn;
 	struct ufsmount *ump;
 
 	/*
 	 * Find snapshot in incore list.
 	 */
 	xp = NULL;
 	sn = ITODEVVP(ip)->v_rdev->si_snapdata;
 	if (sn != NULL)
 		TAILQ_FOREACH(xp, &sn->sn_head, i_nextsnap)
 			if (xp == ip)
 				break;
 	if (xp != NULL)
 		vrele(ITOV(ip));
 	else if (snapdebug)
 		printf("ffs_snapgone: lost snapshot vnode %ju\n",
 		    (uintmax_t)ip->i_number);
 	/*
 	 * Delete snapshot inode from superblock. Keep list dense.
 	 */
 	ump = ITOUMP(ip);
 	fs = ump->um_fs;
 	UFS_LOCK(ump);
 	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
 		if (fs->fs_snapinum[snaploc] == ip->i_number)
 			break;
 	if (snaploc < FSMAXSNAP) {
 		for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
 			if (fs->fs_snapinum[snaploc] == 0)
 				break;
 			fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
 		}
 		fs->fs_snapinum[snaploc - 1] = 0;
 	}
 	UFS_UNLOCK(ump);
 }
 
 /*
  * Prepare a snapshot file for being removed.
  *
  * The snapshot is taken off the incore snapshot list (if it is on it),
  * its vnode lock is switched back from the shared snapshot lock to the
  * vnode's private lock, all BLK_NOCOPY and BLK_SNAP markers in its block
  * map are cleared, and blocks it had claimed are offered to the other
  * snapshots through ffs_snapblkfree(). Finally the SF_SNAPSHOT flag is
  * cleared from the inode.
  */
 void
 ffs_snapremove(vp)
 	struct vnode *vp;
 {
 	struct inode *ip;
 	struct vnode *devvp;
 	struct buf *ibp;
 	struct fs *fs;
 	ufs2_daddr_t numblks, blkno, dblk;
 	int error, i, last, loc;
 	struct snapdata *sn;
 
 	ip = VTOI(vp);
 	fs = ITOFS(ip);
 	devvp = ITODEVVP(ip);
 	/*
 	 * If active, delete from incore list (this snapshot may
 	 * already have been in the process of being deleted, so
 	 * would not have been active).
 	 *
 	 * Clear copy-on-write flag if last snapshot.
 	 */
 	VI_LOCK(devvp);
 	if (ip->i_nextsnap.tqe_prev != 0) {
 		sn = devvp->v_rdev->si_snapdata;
 		TAILQ_REMOVE(&sn->sn_head, ip, i_nextsnap);
 		ip->i_nextsnap.tqe_prev = 0;
 		VI_UNLOCK(devvp);
 		/*
 		 * Take the private vnode lock once, plus once more for
 		 * each recursion currently held on the shared snapshot
 		 * lock, before pointing v_vnlock back at it.
 		 */
 		lockmgr(&vp->v_lock, LK_EXCLUSIVE, NULL);
 		for (i = 0; i < sn->sn_lock.lk_recurse; i++)
 			lockmgr(&vp->v_lock, LK_EXCLUSIVE, NULL);
 		KASSERT(vp->v_vnlock == &sn->sn_lock,
 			("ffs_snapremove: lost lock mutation")); 
 		vp->v_vnlock = &vp->v_lock;
 		VI_LOCK(devvp);
 		/* Fully release the shared snapshot lock. */
 		while (sn->sn_lock.lk_recurse > 0)
 			lockmgr(&sn->sn_lock, LK_RELEASE, NULL);
 		lockmgr(&sn->sn_lock, LK_RELEASE, NULL);
 		try_free_snapdata(devvp);
 	} else
 		VI_UNLOCK(devvp);
 	/*
 	 * Clear all BLK_NOCOPY fields. Pass any block claims to other
 	 * snapshots that want them (see ffs_snapblkfree below).
 	 *
 	 * The loop starts at index 1, so i_db[0] is not examined here.
 	 */
 	for (blkno = 1; blkno < UFS_NDADDR; blkno++) {
 		dblk = DIP(ip, i_db[blkno]);
 		if (dblk == 0)
 			continue;
 		if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
 			DIP_SET(ip, i_db[blkno], 0);
 		else if ((dblk == blkstofrags(fs, blkno) &&
 		     ffs_snapblkfree(fs, ITODEVVP(ip), dblk, fs->fs_bsize,
 		     ip->i_number, vp->v_type, NULL))) {
 			/* Another snapshot took over the claimed block. */
 			DIP_SET(ip, i_blocks, DIP(ip, i_blocks) -
 			    btodb(fs->fs_bsize));
 			DIP_SET(ip, i_db[blkno], 0);
 		}
 	}
 	/*
 	 * Repeat the same processing for every indirect block that maps
 	 * file data, one indirect block per iteration.
 	 */
 	numblks = howmany(ip->i_size, fs->fs_bsize);
 	for (blkno = UFS_NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
 		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
 		    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
 		/* An indirect block that cannot be obtained is skipped. */
 		if (error)
 			continue;
 		if (fs->fs_size - blkno > NINDIR(fs))
 			last = NINDIR(fs);
 		else
 			last = fs->fs_size - blkno;
 		/*
 		 * NOTE(review): the claimed-block tests below compare
 		 * against blkstofrags(fs, blkno), where blkno is the first
 		 * logical block mapped by this indirect block, not
 		 * blkno + loc -- confirm that this is intended.
 		 */
 		for (loc = 0; loc < last; loc++) {
 			if (I_IS_UFS1(ip)) {
 				dblk = ((ufs1_daddr_t *)(ibp->b_data))[loc];
 				if (dblk == 0)
 					continue;
 				if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
 					((ufs1_daddr_t *)(ibp->b_data))[loc]= 0;
 				else if ((dblk == blkstofrags(fs, blkno) &&
 				     ffs_snapblkfree(fs, ITODEVVP(ip), dblk,
 				     fs->fs_bsize, ip->i_number, vp->v_type,
 				     NULL))) {
 					ip->i_din1->di_blocks -=
 					    btodb(fs->fs_bsize);
 					((ufs1_daddr_t *)(ibp->b_data))[loc]= 0;
 				}
 				continue;
 			}
 			/* UFS2 variant of the same processing. */
 			dblk = ((ufs2_daddr_t *)(ibp->b_data))[loc];
 			if (dblk == 0)
 				continue;
 			if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
 				((ufs2_daddr_t *)(ibp->b_data))[loc] = 0;
 			else if ((dblk == blkstofrags(fs, blkno) &&
 			     ffs_snapblkfree(fs, ITODEVVP(ip), dblk,
 			     fs->fs_bsize, ip->i_number, vp->v_type, NULL))) {
 				ip->i_din2->di_blocks -= btodb(fs->fs_bsize);
 				((ufs2_daddr_t *)(ibp->b_data))[loc] = 0;
 			}
 		}
 		bawrite(ibp);
 	}
 	/*
 	 * Clear snapshot flag and drop reference.
 	 */
 	ip->i_flags &= ~SF_SNAPSHOT;
 	DIP_SET(ip, i_flags, ip->i_flags);
 	ip->i_flag |= IN_CHANGE | IN_UPDATE;
 	/*
 	 * The dirtied indirects must be written out before
 	 * softdep_setup_freeblocks() is called.  Otherwise indir_trunc()
 	 * may find indirect pointers using the magic BLK_* values.
 	 */
 	if (DOINGSOFTDEP(vp))
 		ffs_syncvnode(vp, MNT_WAIT, 0);
 #ifdef QUOTA
 	/*
 	 * Reenable disk quotas for ex-snapshot file.
 	 */
 	if (!getinoquota(ip))
 		(void) chkdq(ip, DIP(ip, i_blocks), KERNCRED, FORCE);
 #endif
 }
 
 /*
  * Notification that a block is being freed. Return zero if the free
  * should be allowed to proceed. Return non-zero if the snapshot file
  * wants to claim the block. The block will be claimed if it is an
  * uncopied part of one of the snapshots. It will be freed if it is
  * either a BLK_NOCOPY or has already been copied in all of the snapshots.
  * If a fragment is being freed, then all snapshots that care about
  * it must make a copy since a snapshot file can only claim full sized
  * blocks. Note that if more than one snapshot file maps the block,
  * we can pick one at random to claim it. Since none of the snapshots
  * can change, we are assured that they will all see the same unmodified
  * image. When deleting a snapshot file (see ffs_snapremove above), we
  * must push any of these claimed blocks to one of the other snapshots
  * that maps it. These claimed blocks are easily identified as they will
  * have a block number equal to their logical block number within the
  * snapshot. A copied block can never have this property because they
  * must always have been allocated from a BLK_NOCOPY location.
  *
  * Arguments: fs is the filesystem, devvp the device vnode that carries
  * the snapshot list, bno and size describe the block or fragment being
  * freed, inum and vtype identify the file releasing it (they are used
  * for debugging output and to choose whether to sync the snapshot), and
  * wkhd, when not NULL, is a list of work items that is attached to the
  * claiming inode or indirect block, or released on error.
  *
  * Return value: 0 when there are no snapshots or none needs the block,
  * 1 when a snapshot claimed a full sized block, or the error from
  * UFS_BALLOC() or readblock() when a copy could not be made.
  */
 int
 ffs_snapblkfree(fs, devvp, bno, size, inum, vtype, wkhd)
 	struct fs *fs;
 	struct vnode *devvp;
 	ufs2_daddr_t bno;
 	long size;
 	ino_t inum;
 	enum vtype vtype;
 	struct workhead *wkhd;
 {
 	struct buf *ibp, *cbp, *savedcbp = NULL;
 	struct thread *td = curthread;
 	struct inode *ip;
 	struct vnode *vp = NULL;
 	ufs_lbn_t lbn;
 	ufs2_daddr_t blkno;
 	int indiroff = 0, error = 0, claimedblk = 0;
 	struct snapdata *sn;
 
 	lbn = fragstoblks(fs, bno);
 retry:
 	VI_LOCK(devvp);
 	sn = devvp->v_rdev->si_snapdata;
 	if (sn == NULL) {
 		VI_UNLOCK(devvp);
 		return (0);
 	}
 	/*
 	 * Take the shared snapshot lock, handing over the vnode interlock.
 	 * If the attempt fails, start again from the top because the
 	 * snapshot data pointer has to be fetched afresh.
 	 */
 	if (lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
 	    VI_MTX(devvp)) != 0)
 		goto retry;
 	TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) {
 		vp = ITOV(ip);
 		if (DOINGSOFTDEP(vp))
 			softdep_prealloc(vp, MNT_WAIT);
 		/*
 		 * Lookup block being written.
 		 */
 		if (lbn < UFS_NDADDR) {
 			blkno = DIP(ip, i_db[lbn]);
 		} else {
 			td->td_pflags |= TDP_COWINPROGRESS;
 			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
 			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
 			td->td_pflags &= ~TDP_COWINPROGRESS;
 			if (error)
 				break;
 			indiroff = (lbn - UFS_NDADDR) % NINDIR(fs);
 			if (I_IS_UFS1(ip))
 				blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff];
 			else
 				blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff];
 		}
 		/*
 		 * Check to see if block needs to be copied.
 		 */
 		if (blkno == 0) {
 			/*
 			 * A block that we map is being freed. If it has not
 			 * been claimed yet, we will claim or copy it (below).
 			 */
 			claimedblk = 1;
 		} else if (blkno == BLK_SNAP) {
 			/*
 			 * No previous snapshot claimed the block,
 			 * so it will be freed and become a BLK_NOCOPY
 			 * (don't care) for us.
 			 */
 			if (claimedblk)
 				panic("snapblkfree: inconsistent block type");
 			if (lbn < UFS_NDADDR) {
 				DIP_SET(ip, i_db[lbn], BLK_NOCOPY);
 				ip->i_flag |= IN_CHANGE | IN_UPDATE;
 			} else if (I_IS_UFS1(ip)) {
 				((ufs1_daddr_t *)(ibp->b_data))[indiroff] =
 				    BLK_NOCOPY;
 				bdwrite(ibp);
 			} else {
 				((ufs2_daddr_t *)(ibp->b_data))[indiroff] =
 				    BLK_NOCOPY;
 				bdwrite(ibp);
 			}
 			continue;
 		} else /* BLK_NOCOPY or default */ {
 			/*
 			 * If the snapshot has already copied the block
 			 * (default), or does not care about the block,
 			 * it is not needed.
 			 */
 			if (lbn >= UFS_NDADDR)
 				bqrelse(ibp);
 			continue;
 		}
 		/*
 		 * If this is a full size block, we will just grab it
 		 * and assign it to the snapshot inode. Otherwise we
 		 * will proceed to copy it. See explanation for this
 		 * routine as to why only a single snapshot needs to
 		 * claim this block.
 		 */
 		if (size == fs->fs_bsize) {
 #ifdef DEBUG
 			if (snapdebug)
 				printf("%s %ju lbn %jd from inum %ju\n",
 				    "Grabonremove: snapino",
 				    (uintmax_t)ip->i_number,
 				    (intmax_t)lbn, (uintmax_t)inum);
 #endif
 			/*
 			 * If journaling is tracking this write we must add
 			 * the work to the inode or indirect being written.
 			 */
 			if (wkhd != NULL) {
 				if (lbn < UFS_NDADDR)
 					softdep_inode_append(ip,
 					    curthread->td_ucred, wkhd);
 				else
 					softdep_buf_append(ibp, wkhd);
 			}
 			if (lbn < UFS_NDADDR) {
 				DIP_SET(ip, i_db[lbn], bno);
 			} else if (I_IS_UFS1(ip)) {
 				((ufs1_daddr_t *)(ibp->b_data))[indiroff] = bno;
 				bdwrite(ibp);
 			} else {
 				((ufs2_daddr_t *)(ibp->b_data))[indiroff] = bno;
 				bdwrite(ibp);
 			}
 			DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + btodb(size));
 			ip->i_flag |= IN_CHANGE | IN_UPDATE;
 			/*
 			 * The block now belongs to this snapshot; release
 			 * the lock through the vnode and tell the caller
 			 * not to free the block.
 			 */
 			lockmgr(vp->v_vnlock, LK_RELEASE, NULL);
 			return (1);
 		}
 		if (lbn >= UFS_NDADDR)
 			bqrelse(ibp);
 		/*
 		 * Allocate the block into which to do the copy. Note that this
 		 * allocation will never require any additional allocations for
 		 * the snapshot inode.
 		 */
 		td->td_pflags |= TDP_COWINPROGRESS;
 		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
 		    fs->fs_bsize, KERNCRED, 0, &cbp);
 		td->td_pflags &= ~TDP_COWINPROGRESS;
 		if (error)
 			break;
 #ifdef DEBUG
 		if (snapdebug)
 			printf("%s%ju lbn %jd %s %ju size %ld to blkno %jd\n",
 			    "Copyonremove: snapino ", (uintmax_t)ip->i_number,
 			    (intmax_t)lbn, "for inum", (uintmax_t)inum, size,
 			    (intmax_t)cbp->b_blkno);
 #endif
 		/*
 		 * If we have already read the old block contents, then
 		 * simply copy them to the new block. Note that we need
 		 * to synchronously write snapshots that have not been
 		 * unlinked, and hence will be visible after a crash,
 		 * to ensure their integrity. At a minimum we ensure the
 		 * integrity of the filesystem metadata, but use the
 		 * dopersistence sysctl-setable flag to decide on the
 		 * persistence needed for file content data.
 		 */
 		if (savedcbp != NULL) {
 			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
 			bawrite(cbp);
 			if ((vtype == VDIR || dopersistence) &&
 			    ip->i_effnlink > 0)
 				(void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT);
 			continue;
 		}
 		/*
 		 * Otherwise, read the old block contents into the buffer.
 		 * On a read failure the copy is zero filled and written
 		 * anyway, and the scan of the snapshots stops.
 		 */
 		if ((error = readblock(vp, cbp, lbn)) != 0) {
 			bzero(cbp->b_data, fs->fs_bsize);
 			bawrite(cbp);
 			if ((vtype == VDIR || dopersistence) &&
 			    ip->i_effnlink > 0)
 				(void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT);
 			break;
 		}
 		/*
 		 * Keep the first copy around so that later snapshots can
 		 * be filled from it instead of rereading the block.
 		 */
 		savedcbp = cbp;
 	}
 	/*
 	 * Note that we need to synchronously write snapshots that
 	 * have not been unlinked, and hence will be visible after
 	 * a crash, to ensure their integrity. At a minimum we
 	 * ensure the integrity of the filesystem metadata, but
 	 * use the dopersistence sysctl-setable flag to decide on
 	 * the persistence needed for file content data.
 	 */
 	if (savedcbp) {
 		vp = savedcbp->b_vp;
 		bawrite(savedcbp);
 		if ((vtype == VDIR || dopersistence) &&
 		    VTOI(vp)->i_effnlink > 0)
 			(void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT);
 	}
 	/*
 	 * If we have been unable to allocate a block in which to do
 	 * the copy, then return non-zero so that the fragment will
 	 * not be freed. Although space will be lost, the snapshot
 	 * will stay consistent.
 	 */
 	if (error != 0 && wkhd != NULL)
 		softdep_freework(wkhd);
 	lockmgr(&sn->sn_lock, LK_RELEASE, NULL);
 	return (error);
 }
 
 /*
  * Associate snapshot files when mounting.
  *
  * Every inode listed in the superblock's fs_snapinum[] array is fetched,
  * switched over to the shared snapshot lock and linked onto the device's
  * snapshot list. Entries that are not snapshots, or that look like old
  * format snapshots, are dropped from the array. The block hints list is
  * then read from the last snapshot attached and installed on the
  * snapdata structure, and VV_COPYONWRITE is set on the device vnode.
  */
 void
 ffs_snapshot_mount(mp)
 	struct mount *mp;
 {
 	struct ufsmount *ump = VFSTOUFS(mp);
 	struct vnode *devvp = ump->um_devvp;
 	struct fs *fs = ump->um_fs;
 	struct thread *td = curthread;
 	struct snapdata *sn;
 	struct vnode *vp;
 	struct vnode *lastvp;
 	struct inode *ip;
 	struct uio auio;
 	struct iovec aiov;
 	void *snapblklist;
 	char *reason;
 	daddr_t snaplistsize;
 	int error, snaploc, loc;
 
 	/*
 	 * XXX The following needs to be set before ffs_truncate or
 	 * VOP_READ can be called.
 	 */
 	mp->mnt_stat.f_iosize = fs->fs_bsize;
 	/*
 	 * Process each snapshot listed in the superblock.
 	 */
 	vp = NULL;
 	lastvp = NULL;
 	sn = NULL;
 	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
 		if (fs->fs_snapinum[snaploc] == 0)
 			break;
 		if ((error = ffs_vget(mp, fs->fs_snapinum[snaploc],
 		    LK_EXCLUSIVE, &vp)) != 0){
 			printf("ffs_snapshot_mount: vget failed %d\n", error);
 			continue;
 		}
 		ip = VTOI(vp);
 		if (!IS_SNAPSHOT(ip) || ip->i_size ==
 		    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag))) {
 			if (!IS_SNAPSHOT(ip)) {
 				reason = "non-snapshot";
 			} else {
 				reason = "old format snapshot";
 				(void)ffs_truncate(vp, (off_t)0, 0, NOCRED);
 				(void)ffs_syncvnode(vp, MNT_WAIT, 0);
 			}
 			printf("ffs_snapshot_mount: %s inode %d\n",
 			    reason, fs->fs_snapinum[snaploc]);
 			vput(vp);
 			vp = NULL;
 			/*
 			 * Remove the entry from the superblock list,
 			 * shifting the later entries down, then revisit
 			 * the same slot on the next iteration.
 			 */
 			for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
 				if (fs->fs_snapinum[loc] == 0)
 					break;
 				fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
 			}
 			fs->fs_snapinum[loc - 1] = 0;
 			snaploc--;
 			continue;
 		}
 		/*
 		 * Acquire a lock on the snapdata structure, creating it if
 		 * necessary.
 		 */
 		sn = ffs_snapdata_acquire(devvp);
 		/* 
 		 * Change vnode to use shared snapshot lock instead of the
 		 * original private lock.
 		 */
 		vp->v_vnlock = &sn->sn_lock;
 		lockmgr(&vp->v_lock, LK_RELEASE, NULL);
 		/*
 		 * Link it onto the active snapshot list.
 		 */
 		VI_LOCK(devvp);
 		if (ip->i_nextsnap.tqe_prev != 0)
 			panic("ffs_snapshot_mount: %ju already on list",
 			    (uintmax_t)ip->i_number);
 		else
 			TAILQ_INSERT_TAIL(&sn->sn_head, ip, i_nextsnap);
 		vp->v_vflag |= VV_SYSTEM;
 		VI_UNLOCK(devvp);
 		VOP_UNLOCK(vp, 0);
 		lastvp = vp;
 	}
 	vp = lastvp;
 	/*
 	 * No usable snapshots found.
 	 */
 	if (sn == NULL || vp == NULL)
 		return;
 	/*
 	 * Allocate the space for the block hints list. We always want to
 	 * use the list from the newest snapshot.
 	 *
 	 * The first read fetches only the element count, which is stored
 	 * at the offset corresponding to the size of the filesystem.
 	 */
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	aiov.iov_base = (void *)&snaplistsize;
 	aiov.iov_len = sizeof(snaplistsize);
 	auio.uio_resid = aiov.iov_len;
 	auio.uio_offset =
 	    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag));
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_rw = UIO_READ;
 	auio.uio_td = td;
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
 		printf("ffs_snapshot_mount: read_1 failed %d\n", error);
 		VOP_UNLOCK(vp, 0);
 		return;
 	}
 	/*
 	 * NOTE(review): snaplistsize comes straight from the snapshot
 	 * file and is used as an allocation size without being range
 	 * checked -- confirm that it is validated elsewhere.
 	 */
 	snapblklist = malloc(snaplistsize * sizeof(daddr_t),
 	    M_UFSMNT, M_WAITOK);
 	/*
 	 * The second read starts over at the count itself, so the
 	 * count ends up as element zero of the list.
 	 */
 	auio.uio_iovcnt = 1;
 	aiov.iov_base = snapblklist;
 	aiov.iov_len = snaplistsize * sizeof (daddr_t);
 	auio.uio_resid = aiov.iov_len;
 	auio.uio_offset -= sizeof(snaplistsize);
 	if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
 		printf("ffs_snapshot_mount: read_2 failed %d\n", error);
 		VOP_UNLOCK(vp, 0);
 		free(snapblklist, M_UFSMNT);
 		return;
 	}
 	VOP_UNLOCK(vp, 0);
 	/* Publish the list and turn on copy-on-write for the device. */
 	VI_LOCK(devvp);
 	ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_mount");
 	sn->sn_listsize = snaplistsize;
 	sn->sn_blklist = (daddr_t *)snapblklist;
 	devvp->v_vflag |= VV_COPYONWRITE;
 	VI_UNLOCK(devvp);
 }
 
 /*
  * Disassociate snapshot files when unmounting.
  *
  * Each inode is taken off the device's snapshot list in turn, its vnode
  * is switched back from the shared snapshot lock to its private lock,
  * and the reference on the vnode is dropped when the snapshot still has
  * a link count. try_free_snapdata() is called once the list is empty.
  */
 void
 ffs_snapshot_unmount(mp)
 	struct mount *mp;
 {
 	struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
 	struct snapdata *sn;
 	struct inode *xp;
 	struct vnode *vp;
 
 	VI_LOCK(devvp);
 	sn = devvp->v_rdev->si_snapdata;
 	while (sn != NULL && (xp = TAILQ_FIRST(&sn->sn_head)) != NULL) {
 		vp = ITOV(xp);
 		TAILQ_REMOVE(&sn->sn_head, xp, i_nextsnap);
 		xp->i_nextsnap.tqe_prev = 0;
 		/*
 		 * Hold both the shared snapshot lock and the private vnode
 		 * lock while v_vnlock is pointed back at the private lock.
 		 * LK_INTERLOCK hands the vnode interlock to lockmgr().
 		 */
 		lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE,
 		    VI_MTX(devvp));
 		lockmgr(&vp->v_lock, LK_EXCLUSIVE, NULL);
 		KASSERT(vp->v_vnlock == &sn->sn_lock,
 		("ffs_snapshot_unmount: lost lock mutation")); 
 		vp->v_vnlock = &vp->v_lock;
 		lockmgr(&vp->v_lock, LK_RELEASE, NULL);
 		lockmgr(&sn->sn_lock, LK_RELEASE, NULL);
 		if (xp->i_effnlink > 0)
 			vrele(vp);
 		/*
 		 * Retake the interlock and fetch the snapdata pointer
 		 * again before looking at the list once more.
 		 */
 		VI_LOCK(devvp);
 		sn = devvp->v_rdev->si_snapdata;
 	}
 	try_free_snapdata(devvp);
 	ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_unmount");
 }
 
 /*
  * Report whether the buffer bp refers to a block recorded in the
  * snapshot block list of the device devvp, i.e. a device buffer that
  * shall be locked after snaplk. The interlock of devvp must be held on
  * entry and is still held on return.
  *
  * Returns 1 when the block is found in the list and 0 otherwise,
  * including when the device has no active snapshots.
  */
 static int
 ffs_bp_snapblk(devvp, bp)
 	struct vnode *devvp;
 	struct buf *bp;
 {
 	struct snapdata *sn;
 	struct inode *firstip;
 	struct fs *fs;
 	ufs2_daddr_t want, *list;
 	int lo, hi, probe;
 
 	ASSERT_VI_LOCKED(devvp, "ffs_bp_snapblk");
 	KASSERT(devvp->v_type == VCHR, ("Not a device %p", devvp));
 	sn = devvp->v_rdev->si_snapdata;
 	if (sn == NULL)
 		return (0);
 	firstip = TAILQ_FIRST(&sn->sn_head);
 	if (firstip == NULL)
 		return (0);
 	fs = ITOFS(firstip);
 	want = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
 	list = sn->sn_blklist;
 	/*
 	 * Binary search of the list; the search begins at index 1,
 	 * so element zero is never compared.
 	 */
 	for (lo = 1, hi = sn->sn_listsize - 1; lo <= hi; ) {
 		probe = (lo + hi) / 2;
 		if (list[probe] == want)
 			return (1);
 		if (list[probe] < want)
 			lo = probe + 1;
 		else
 			hi = probe - 1;
 	}
 	return (0);
 }
 
 /*
  * Push out some dirty buffers of the buffer object bo once their number
  * exceeds dirtybufthresh; bp is the buffer that triggered the call.
  *
  * Nothing is done while the dirty count is at or below the threshold.
  * When the count exceeds the threshold by more than 10 and bp is not a
  * snapshot block, the whole vnode is synced without waiting. Otherwise
  * a single suitable buffer from the dirty list is written; if bp is a
  * snapshot block, only other snapshot blocks are considered.
  */
 void
 ffs_bdflush(bo, bp)
 	struct bufobj *bo;
 	struct buf *bp;
 {
 	struct thread *td;
 	struct vnode *vp, *devvp;
 	struct buf *nbp;
 	int bp_bdskip;
 
 	if (bo->bo_dirty.bv_cnt <= dirtybufthresh)
 		return;
 
 	td = curthread;
 	vp = bp->b_vp;
 	devvp = bo2vnode(bo);
 	KASSERT(vp == devvp, ("devvp != vp %p %p", bo, bp));
 
 	/* Find out whether bp is one of the recorded snapshot blocks. */
 	VI_LOCK(devvp);
 	bp_bdskip = ffs_bp_snapblk(devvp, bp);
 	if (bp_bdskip)
 		bdwriteskip++;
 	VI_UNLOCK(devvp);
 	if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10 && !bp_bdskip) {
 		(void) VOP_FSYNC(vp, MNT_NOWAIT, td);
 		altbufferflushes++;
 	} else {
 		BO_LOCK(bo);
 		/*
 		 * Try to find a buffer to flush.
 		 */
 		TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) {
 			/*
 			 * Skip buffers with a background write in
 			 * progress or that cannot be locked right away.
 			 */
 			if ((nbp->b_vflags & BV_BKGRDINPROG) ||
 			    BUF_LOCK(nbp,
 				     LK_EXCLUSIVE | LK_NOWAIT, NULL))
 				continue;
 			if (bp == nbp)
 				panic("bdwrite: found ourselves");
 			BO_UNLOCK(bo);
 			/*
 			 * Don't countdeps with the bo lock
 			 * held.
 			 */
 			if (buf_countdeps(nbp, 0)) {
 				BO_LOCK(bo);
 				BUF_UNLOCK(nbp);
 				continue;
 			}
 			/*
 			 * When bp itself is a snapshot block, only
 			 * another snapshot block may be written here.
 			 */
 			if (bp_bdskip) {
 				VI_LOCK(devvp);
 				if (!ffs_bp_snapblk(vp, nbp)) {
 					VI_UNLOCK(devvp);
 					BO_LOCK(bo);
 					BUF_UNLOCK(nbp);
 					continue;
 				}
 				VI_UNLOCK(devvp);
 			}
 			if (nbp->b_flags & B_CLUSTEROK) {
 				vfs_bio_awrite(nbp);
 			} else {
 				bremfree(nbp);
 				bawrite(nbp);
 			}
 			dirtybufferflushes++;
 			break;
 		}
 		/*
 		 * The loop ran off the end of the list with the bo lock
 		 * held; the break path above has already dropped it.
 		 */
 		if (nbp == NULL)
 			BO_UNLOCK(bo);
 	}
 }
 
 /*
  * Check for need to copy block that is about to be written,
  * copying the block if necessary.
  */
 int
 ffs_copyonwrite(devvp, bp)
 	struct vnode *devvp;
 	struct buf *bp;
 {
 	struct snapdata *sn;
 	struct buf *ibp, *cbp, *savedcbp = NULL;
 	struct thread *td = curthread;
 	struct fs *fs;
 	struct inode *ip;
 	struct vnode *vp = NULL;
 	ufs2_daddr_t lbn, blkno, *snapblklist;
 	int lower, upper, mid, indiroff, error = 0;
 	int launched_async_io, prev_norunningbuf;
 	long saved_runningbufspace;
 
 	if (devvp != bp->b_vp && IS_SNAPSHOT(VTOI(bp->b_vp)))
 		return (0);		/* Update on a snapshot file */
 	if (td->td_pflags & TDP_COWINPROGRESS)
 		panic("ffs_copyonwrite: recursive call");
 	/*
 	 * First check to see if it is in the preallocated list.
 	 * By doing this check we avoid several potential deadlocks.
 	 */
 	VI_LOCK(devvp);
 	sn = devvp->v_rdev->si_snapdata;
 	if (sn == NULL ||
 	    TAILQ_EMPTY(&sn->sn_head)) {
 		VI_UNLOCK(devvp);
 		return (0);		/* No snapshot */
 	}
 	ip = TAILQ_FIRST(&sn->sn_head);
 	fs = ITOFS(ip);
 	lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
 	snapblklist = sn->sn_blklist;
 	upper = sn->sn_listsize - 1;
 	lower = 1;
 	while (lower <= upper) {
 		mid = (lower + upper) / 2;
 		if (snapblklist[mid] == lbn)
 			break;
 		if (snapblklist[mid] < lbn)
 			lower = mid + 1;
 		else
 			upper = mid - 1;
 	}
 	if (lower <= upper) {
 		VI_UNLOCK(devvp);
 		return (0);
 	}
 	launched_async_io = 0;
 	prev_norunningbuf = td->td_pflags & TDP_NORUNNINGBUF;
 	/*
 	 * Since I/O on bp isn't yet in progress and it may be blocked
 	 * for a long time waiting on snaplk, back it out of
 	 * runningbufspace, possibly waking other threads waiting for space.
 	 */
 	saved_runningbufspace = bp->b_runningbufspace;
 	if (saved_runningbufspace != 0)
 		runningbufwakeup(bp);
 	/*
 	 * Not in the precomputed list, so check the snapshots.
 	 */
 	while (lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
 	    VI_MTX(devvp)) != 0) {
 		VI_LOCK(devvp);
 		sn = devvp->v_rdev->si_snapdata;
 		if (sn == NULL ||
 		    TAILQ_EMPTY(&sn->sn_head)) {
 			VI_UNLOCK(devvp);
 			if (saved_runningbufspace != 0) {
 				bp->b_runningbufspace = saved_runningbufspace;
 				atomic_add_long(&runningbufspace,
 					       bp->b_runningbufspace);
 			}
 			return (0);		/* Snapshot gone */
 		}
 	}
 	TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) {
 		vp = ITOV(ip);
 		if (DOINGSOFTDEP(vp))
 			softdep_prealloc(vp, MNT_WAIT);
 		/*
 		 * We ensure that everything of our own that needs to be
 		 * copied will be done at the time that ffs_snapshot is
 		 * called. Thus we can skip the check here which can
 		 * deadlock in doing the lookup in UFS_BALLOC.
 		 */
 		if (bp->b_vp == vp)
 			continue;
 		/*
 		 * Check to see if block needs to be copied. We do not have
 		 * to hold the snapshot lock while doing this lookup as it
 		 * will never require any additional allocations for the
 		 * snapshot inode.
 		 */
 		if (lbn < UFS_NDADDR) {
 			blkno = DIP(ip, i_db[lbn]);
 		} else {
 			td->td_pflags |= TDP_COWINPROGRESS | TDP_NORUNNINGBUF;
 			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
 			   fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
 			td->td_pflags &= ~TDP_COWINPROGRESS;
 			if (error)
 				break;
 			indiroff = (lbn - UFS_NDADDR) % NINDIR(fs);
 			if (I_IS_UFS1(ip))
 				blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff];
 			else
 				blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff];
 			bqrelse(ibp);
 		}
 #ifdef INVARIANTS
 		if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
 			panic("ffs_copyonwrite: bad copy block");
 #endif
 		if (blkno != 0)
 			continue;
 		/*
 		 * Allocate the block into which to do the copy. Since
 		 * multiple processes may all try to copy the same block,
 		 * we have to recheck our need to do a copy if we sleep
 		 * waiting for the lock.
 		 *
 		 * Because all snapshots on a filesystem share a single
 		 * lock, we ensure that we will never be in competition
 		 * with another process to allocate a block.
 		 */
 		td->td_pflags |= TDP_COWINPROGRESS | TDP_NORUNNINGBUF;
 		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
 		    fs->fs_bsize, KERNCRED, 0, &cbp);
 		td->td_pflags &= ~TDP_COWINPROGRESS;
 		if (error)
 			break;
 #ifdef DEBUG
 		if (snapdebug) {
 			printf("Copyonwrite: snapino %ju lbn %jd for ",
 			    (uintmax_t)ip->i_number, (intmax_t)lbn);
 			if (bp->b_vp == devvp)
 				printf("fs metadata");
 			else
 				printf("inum %ju",
 				    (uintmax_t)VTOI(bp->b_vp)->i_number);
 			printf(" lblkno %jd to blkno %jd\n",
 			    (intmax_t)bp->b_lblkno, (intmax_t)cbp->b_blkno);
 		}
 #endif
 		/*
 		 * If we have already read the old block contents, then
 		 * simply copy them to the new block. Note that we need
 		 * to synchronously write snapshots that have not been
 		 * unlinked, and hence will be visible after a crash,
 		 * to ensure their integrity. At a minimum we ensure the
 		 * integrity of the filesystem metadata, but use the
 		 * dopersistence sysctl-setable flag to decide on the
 		 * persistence needed for file content data.
 		 */
 		if (savedcbp != NULL) {
 			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
 			bawrite(cbp);
 			if ((devvp == bp->b_vp || bp->b_vp->v_type == VDIR ||
 			    dopersistence) && ip->i_effnlink > 0)
 				(void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT);
 			else
 				launched_async_io = 1;
 			continue;
 		}
 		/*
 		 * Otherwise, read the old block contents into the buffer.
 		 */
 		if ((error = readblock(vp, cbp, lbn)) != 0) {
 			bzero(cbp->b_data, fs->fs_bsize);
 			bawrite(cbp);
 			if ((devvp == bp->b_vp || bp->b_vp->v_type == VDIR ||
 			    dopersistence) && ip->i_effnlink > 0)
 				(void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT);
 			else
 				launched_async_io = 1;
 			break;
 		}
 		savedcbp = cbp;
 	}
 	/*
 	 * Note that we need to synchronously write snapshots that
 	 * have not been unlinked, and hence will be visible after
 	 * a crash, to ensure their integrity. At a minimum we
 	 * ensure the integrity of the filesystem metadata, but
 	 * use the dopersistence sysctl-setable flag to decide on
 	 * the persistence needed for file content data.
 	 */
 	if (savedcbp) {
 		vp = savedcbp->b_vp;
 		bawrite(savedcbp);
 		if ((devvp == bp->b_vp || bp->b_vp->v_type == VDIR ||
 		    dopersistence) && VTOI(vp)->i_effnlink > 0)
 			(void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT);
 		else
 			launched_async_io = 1;
 	}
 	lockmgr(vp->v_vnlock, LK_RELEASE, NULL);
 	td->td_pflags = (td->td_pflags & ~TDP_NORUNNINGBUF) |
 		prev_norunningbuf;
 	if (launched_async_io && (td->td_pflags & TDP_NORUNNINGBUF) == 0)
 		waitrunningbufspace();
 	/*
 	 * I/O on bp will now be started, so count it in runningbufspace.
 	 */
 	if (saved_runningbufspace != 0) {
 		bp->b_runningbufspace = saved_runningbufspace;
 		atomic_add_long(&runningbufspace, bp->b_runningbufspace);
 	}
 	return (error);
 }
 
 /*
  * sync snapshots to force freework records waiting on snapshots to claim
  * blocks to free.
  */
 void
 ffs_sync_snap(mp, waitfor)
 	struct mount *mp;
 	int waitfor;
 {
 	struct snapdata *sn;
 	struct vnode *devvp;
 	struct vnode *vp;
 	struct inode *ip;
 
 	devvp = VFSTOUFS(mp)->um_devvp;
 	if ((devvp->v_vflag & VV_COPYONWRITE) == 0)
 		return;
 	for (;;) {
 		VI_LOCK(devvp);
 		sn = devvp->v_rdev->si_snapdata;
 		if (sn == NULL) {
 			VI_UNLOCK(devvp);
 			return;
 		}
 		if (lockmgr(&sn->sn_lock,
 		    LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
 		    VI_MTX(devvp)) == 0)
 			break;
 	}
 	TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) {
 		vp = ITOV(ip);
 		ffs_syncvnode(vp, waitfor, NO_INO_UPDT);
 	}
 	lockmgr(&sn->sn_lock, LK_RELEASE, NULL);
 }
 
 /*
  * Read the specified block into the given buffer.
  * Much of this boiler-plate comes from bwrite().
  */
 static int
 readblock(vp, bp, lbn)
 	struct vnode *vp;
 	struct buf *bp;
 	ufs2_daddr_t lbn;
 {
 	struct inode *ip;
 	struct bio *bip;
 	struct fs *fs;
 
 	ip = VTOI(vp);
 	fs = ITOFS(ip);
 
 	bip = g_alloc_bio();
 	bip->bio_cmd = BIO_READ;
 	bip->bio_offset = dbtob(fsbtodb(fs, blkstofrags(fs, lbn)));
 	bip->bio_data = bp->b_data;
 	bip->bio_length = bp->b_bcount;
 	bip->bio_done = NULL;
 
 	g_io_request(bip, ITODEVVP(ip)->v_bufobj.bo_private);
 	bp->b_error = biowait(bip, "snaprdb");
 	g_destroy_bio(bip);
 	return (bp->b_error);
 }
 
 #endif
 
 /*
  * Process file deletes that were deferred by ufs_inactive() due to
  * the file system being suspended. Transfer IN_LAZYACCESS into
  * IN_MODIFIED for vnodes that were accessed during suspension.
  */
 void
 process_deferred_inactive(struct mount *mp)
 {
 	struct vnode *vp, *mvp;
 	struct inode *ip;
 	struct thread *td;
 	int error;
 
 	td = curthread;
 	(void) vn_start_secondary_write(NULL, &mp, V_WAIT);
  loop:
 	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
 		/*
 		 * IN_LAZYACCESS is checked here without holding any
 		 * vnode lock, but this flag is set only while holding
 		 * vnode interlock.
 		 */
 		if (vp->v_type == VNON ||
 		    ((VTOI(vp)->i_flag & IN_LAZYACCESS) == 0 &&
 		    ((vp->v_iflag & VI_OWEINACT) == 0 || vp->v_usecount > 0))) {
 			VI_UNLOCK(vp);
 			continue;
 		}
 		vholdl(vp);
 		error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK);
 		if (error != 0) {
 			vdrop(vp);
 			if (error == ENOENT)
 				continue;	/* vnode recycled */
 			MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
 			goto loop;
 		}
 		ip = VTOI(vp);
 		if ((ip->i_flag & IN_LAZYACCESS) != 0) {
 			ip->i_flag &= ~IN_LAZYACCESS;
 			ip->i_flag |= IN_MODIFIED;
 		}
 		VI_LOCK(vp);
 		if ((vp->v_iflag & VI_OWEINACT) == 0 || vp->v_usecount > 0) {
 			VI_UNLOCK(vp);
 			VOP_UNLOCK(vp, 0);
 			vdrop(vp);
 			continue;
 		}
 		vinactive(vp, td);
 		VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp,
 			 ("process_deferred_inactive: got VI_OWEINACT"));
 		VI_UNLOCK(vp);
 		VOP_UNLOCK(vp, 0);
 		vdrop(vp);
 	}
 	vn_finished_secondary_write(mp);
 }
 
 #ifndef NO_FFS_SNAPSHOT
 
 static struct snapdata *
 ffs_snapdata_alloc(void)
 {
 	struct snapdata *sn;
 
 	/*
 	 * Fetch a snapdata from the free list if there is one available.
 	 */
 	mtx_lock(&snapfree_lock);
 	sn = LIST_FIRST(&snapfree);
 	if (sn != NULL)
 		LIST_REMOVE(sn, sn_link);
 	mtx_unlock(&snapfree_lock);
 	if (sn != NULL)
 		return (sn);
 	/*
  	 * If there were no free snapdatas allocate one.
 	 */
 	sn = malloc(sizeof *sn, M_UFSMNT, M_WAITOK | M_ZERO);
 	TAILQ_INIT(&sn->sn_head);
 	lockinit(&sn->sn_lock, PVFS, "snaplk", VLKTIMEOUT,
 	    LK_CANRECURSE | LK_NOSHARE);
 	return (sn);
 }
 
 /*
  * The snapdata is never freed because we can not be certain that
  * there are no threads sleeping on the snap lock.  Persisting
  * them permanently avoids costly synchronization in ffs_lock().
  */
 static void
 ffs_snapdata_free(struct snapdata *sn)
 {
 	mtx_lock(&snapfree_lock);
 	LIST_INSERT_HEAD(&snapfree, sn, sn_link);
 	mtx_unlock(&snapfree_lock);
 }
 
 /* Try to free snapdata associated with devvp */
 static void
 try_free_snapdata(struct vnode *devvp)
 {
 	struct snapdata *sn;
 	ufs2_daddr_t *snapblklist;
 
 	ASSERT_VI_LOCKED(devvp, "try_free_snapdata");
 	sn = devvp->v_rdev->si_snapdata;
 
 	if (sn == NULL || TAILQ_FIRST(&sn->sn_head) != NULL ||
 	    (devvp->v_vflag & VV_COPYONWRITE) == 0) {
 		VI_UNLOCK(devvp);
 		return;
 	}
 
 	devvp->v_rdev->si_snapdata = NULL;
 	devvp->v_vflag &= ~VV_COPYONWRITE;
 	lockmgr(&sn->sn_lock, LK_DRAIN|LK_INTERLOCK, VI_MTX(devvp));
 	snapblklist = sn->sn_blklist;
 	sn->sn_blklist = NULL;
 	sn->sn_listsize = 0;
 	lockmgr(&sn->sn_lock, LK_RELEASE, NULL);
 	if (snapblklist != NULL)
 		free(snapblklist, M_UFSMNT);
 	ffs_snapdata_free(sn);
 }
 
 static struct snapdata *
 ffs_snapdata_acquire(struct vnode *devvp)
 {
 	struct snapdata *nsn, *sn;
 	int error;
 
 	/*
 	 * Allocate a free snapdata.  This is done before acquiring the
 	 * devvp lock to avoid allocation while the devvp interlock is
 	 * held.
 	 */
 	nsn = ffs_snapdata_alloc();
 
 	for (;;) {
 		VI_LOCK(devvp);
 		sn = devvp->v_rdev->si_snapdata;
 		if (sn == NULL) {
 			/*
 			 * This is the first snapshot on this
 			 * filesystem and we use our pre-allocated
 			 * snapdata.  Publish sn with the sn_lock
 			 * owned by us, to avoid the race.
 			 */
 			error = lockmgr(&nsn->sn_lock, LK_EXCLUSIVE |
 			    LK_NOWAIT, NULL);
 			if (error != 0)
 				panic("leaked sn, lockmgr error %d", error);
 			sn = devvp->v_rdev->si_snapdata = nsn;
 			VI_UNLOCK(devvp);
 			nsn = NULL;
 			break;
 		}
 
 		/*
 		 * There is a snapshots which already exists on this
 		 * filesystem, grab a reference to the common lock.
 		 */
 		error = lockmgr(&sn->sn_lock, LK_INTERLOCK |
 		    LK_EXCLUSIVE | LK_SLEEPFAIL, VI_MTX(devvp));
 		if (error == 0)
 			break;
 	}
 
 	/*
 	 * Free any unused snapdata.
 	 */
 	if (nsn != NULL)
 		ffs_snapdata_free(nsn);
 
 	return (sn);
 }
 
 #endif
Index: head/sys/x86/iommu/intel_dmar.h
===================================================================
--- head/sys/x86/iommu/intel_dmar.h	(revision 336913)
+++ head/sys/x86/iommu/intel_dmar.h	(revision 336914)
@@ -1,557 +1,556 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2013-2015 The FreeBSD Foundation
  * All rights reserved.
  *
  * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
  * under sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef __X86_IOMMU_INTEL_DMAR_H
 #define	__X86_IOMMU_INTEL_DMAR_H
 
 /* Host or physical memory address, after translation. */
 typedef uint64_t dmar_haddr_t;
 /* Guest or bus address, before translation. */
 typedef uint64_t dmar_gaddr_t;
 
 struct dmar_qi_genseq {
 	u_int gen;
 	uint32_t seq;
 };
 
 struct dmar_map_entry {
 	dmar_gaddr_t start;
 	dmar_gaddr_t end;
 	dmar_gaddr_t free_after;	/* Free space after the entry */
 	dmar_gaddr_t free_down;		/* Max free space below the
 					   current R/B tree node */
 	u_int flags;
 	TAILQ_ENTRY(dmar_map_entry) dmamap_link; /* Link for dmamap entries */
 	RB_ENTRY(dmar_map_entry) rb_entry;	 /* Links for domain entries */
 	TAILQ_ENTRY(dmar_map_entry) unroll_link; /* Link for unroll after
 						    dmamap_load failure */
 	struct dmar_domain *domain;
 	struct dmar_qi_genseq gseq;
 };
 
 RB_HEAD(dmar_gas_entries_tree, dmar_map_entry);
 RB_PROTOTYPE(dmar_gas_entries_tree, dmar_map_entry, rb_entry,
     dmar_gas_cmp_entries);
 
 #define	DMAR_MAP_ENTRY_PLACE	0x0001	/* Fake entry */
 #define	DMAR_MAP_ENTRY_RMRR	0x0002	/* Permanent, not linked by
 					   dmamap_link */
 #define	DMAR_MAP_ENTRY_MAP	0x0004	/* Busdma created, linked by
 					   dmamap_link */
 #define	DMAR_MAP_ENTRY_UNMAPPED	0x0010	/* No backing pages */
 #define	DMAR_MAP_ENTRY_QI_NF	0x0020	/* qi task, do not free entry */
 #define	DMAR_MAP_ENTRY_READ	0x1000	/* Read permitted */
 #define	DMAR_MAP_ENTRY_WRITE	0x2000	/* Write permitted */
 #define	DMAR_MAP_ENTRY_SNOOP	0x4000	/* Snoop */
 #define	DMAR_MAP_ENTRY_TM	0x8000	/* Transient */
 
 /*
  * Locking annotations:
  * (u) - Protected by dmar unit lock
  * (d) - Protected by domain lock
  * (c) - Immutable after initialization
  */
 
 /*
  * The domain abstraction.  Most non-constant members of the domain
  * are protected by owning dmar unit lock, not by the domain lock.
  * Most important, the dmar lock protects the contexts list.
  *
  * The domain lock protects the address map for the domain, and list
  * of unload entries delayed.
  *
  * Page tables pages and pages content is protected by the vm object
  * lock pgtbl_obj, which contains the page tables pages.
  */
 struct dmar_domain {
 	int domain;			/* (c) DID, written in context entry */
 	int mgaw;			/* (c) Real max address width */
 	int agaw;			/* (c) Adjusted guest address width */
 	int pglvl;			/* (c) The pagelevel */
 	int awlvl;			/* (c) The pagelevel as the bitmask,
 					   to set in context entry */
 	dmar_gaddr_t end;		/* (c) Highest address + 1 in
 					   the guest AS */
 	u_int ctx_cnt;			/* (u) Number of contexts owned */
 	u_int refs;			/* (u) Refs, including ctx */
 	struct dmar_unit *dmar;		/* (c) */
 	struct mtx lock;		/* (c) */
 	LIST_ENTRY(dmar_domain) link;	/* (u) Member in the dmar list */
 	LIST_HEAD(, dmar_ctx) contexts;	/* (u) */
 	vm_object_t pgtbl_obj;		/* (c) Page table pages */
 	u_int flags;			/* (u) */
 	u_int entries_cnt;		/* (d) */
 	struct dmar_gas_entries_tree rb_root; /* (d) */
 	struct dmar_map_entries_tailq unload_entries; /* (d) Entries to
 							 unload */
 	struct dmar_map_entry *first_place, *last_place; /* (d) */
 	struct task unload_task;	/* (c) */
 	u_int batch_no;
 };
 
 struct dmar_ctx {
 	struct bus_dma_tag_dmar ctx_tag; /* (c) Root tag */
 	uint16_t rid;			/* (c) pci RID */
 	uint64_t last_fault_rec[2];	/* Last fault reported */
 	struct dmar_domain *domain;	/* (c) */
 	LIST_ENTRY(dmar_ctx) link;	/* (u) Member in the domain list */
 	u_int refs;			/* (u) References from tags */
 	u_int flags;			/* (u) */
 	u_long loads;			/* atomic updates, for stat only */
 	u_long unloads;			/* same */
 };
 
 #define	DMAR_DOMAIN_GAS_INITED		0x0001
 #define	DMAR_DOMAIN_PGTBL_INITED	0x0002
 #define	DMAR_DOMAIN_IDMAP		0x0010	/* Domain uses identity
 						   page table */
 #define	DMAR_DOMAIN_RMRR		0x0020	/* Domain contains RMRR entry,
 						   cannot be turned off */
 
 /* struct dmar_ctx flags */
 #define	DMAR_CTX_FAULTED	0x0001	/* Fault was reported,
 					   last_fault_rec is valid */
 #define	DMAR_CTX_DISABLED	0x0002	/* Device is disabled, the
 					   ephemeral reference is kept
 					   to prevent context destruction */
 
 #define	DMAR_DOMAIN_PGLOCK(dom)		VM_OBJECT_WLOCK((dom)->pgtbl_obj)
 #define	DMAR_DOMAIN_PGTRYLOCK(dom)	VM_OBJECT_TRYWLOCK((dom)->pgtbl_obj)
 #define	DMAR_DOMAIN_PGUNLOCK(dom)	VM_OBJECT_WUNLOCK((dom)->pgtbl_obj)
 #define	DMAR_DOMAIN_ASSERT_PGLOCKED(dom) \
 	VM_OBJECT_ASSERT_WLOCKED((dom)->pgtbl_obj)
 
 #define	DMAR_DOMAIN_LOCK(dom)	mtx_lock(&(dom)->lock)
 #define	DMAR_DOMAIN_UNLOCK(dom)	mtx_unlock(&(dom)->lock)
 #define	DMAR_DOMAIN_ASSERT_LOCKED(dom) mtx_assert(&(dom)->lock, MA_OWNED)
 
 struct dmar_msi_data {
 	int irq;
 	int irq_rid;
 	struct resource *irq_res;
 	void *intr_handle;
 	int (*handler)(void *);
 	int msi_data_reg;
 	int msi_addr_reg;
 	int msi_uaddr_reg;
 	void (*enable_intr)(struct dmar_unit *);
 	void (*disable_intr)(struct dmar_unit *);
 	const char *name;
 };
 
 #define	DMAR_INTR_FAULT		0
 #define	DMAR_INTR_QI		1
 #define	DMAR_INTR_TOTAL		2
 
 struct dmar_unit {
 	device_t dev;
 	int unit;
 	uint16_t segment;
 	uint64_t base;
 
 	/* Resources */
 	int reg_rid;
 	struct resource *regs;
 
 	struct dmar_msi_data intrs[DMAR_INTR_TOTAL];
 
 	/* Hardware registers cache */
 	uint32_t hw_ver;
 	uint64_t hw_cap;
 	uint64_t hw_ecap;
 	uint32_t hw_gcmd;
 
 	/* Data for being a dmar */
 	struct mtx lock;
 	LIST_HEAD(, dmar_domain) domains;
 	struct unrhdr *domids;
 	vm_object_t ctx_obj;
 	u_int barrier_flags;
 
 	/* Fault handler data */
 	struct mtx fault_lock;
 	uint64_t *fault_log;
 	int fault_log_head;
 	int fault_log_tail;
 	int fault_log_size;
 	struct task fault_task;
 	struct taskqueue *fault_taskqueue;
 
 	/* QI */
 	int qi_enabled;
 	vm_offset_t inv_queue;
 	vm_size_t inv_queue_size;
 	uint32_t inv_queue_avail;
 	uint32_t inv_queue_tail;
 	volatile uint32_t inv_waitd_seq_hw; /* hw writes there on wait
 					       descr completion */
 	uint64_t inv_waitd_seq_hw_phys;
 	uint32_t inv_waitd_seq; /* next sequence number to use for wait descr */
 	u_int inv_waitd_gen;	/* seq number generation AKA seq overflows */
 	u_int inv_seq_waiters;	/* count of waiters for seq */
 	u_int inv_queue_full;	/* informational counter */
 
 	/* IR */
 	int ir_enabled;
 	vm_paddr_t irt_phys;
 	dmar_irte_t *irt;
 	u_int irte_cnt;
 	vmem_t *irtids;
 
 	/* Delayed freeing of map entries queue processing */
 	struct dmar_map_entries_tailq tlb_flush_entries;
 	struct task qi_task;
 	struct taskqueue *qi_taskqueue;
 
 	/* Busdma delayed map load */
 	struct task dmamap_load_task;
 	TAILQ_HEAD(, bus_dmamap_dmar) delayed_maps;
 	struct taskqueue *delayed_taskqueue;
 
 	int dma_enabled;
 };
 
 #define	DMAR_LOCK(dmar)		mtx_lock(&(dmar)->lock)
 #define	DMAR_UNLOCK(dmar)	mtx_unlock(&(dmar)->lock)
 #define	DMAR_ASSERT_LOCKED(dmar) mtx_assert(&(dmar)->lock, MA_OWNED)
 
 #define	DMAR_FAULT_LOCK(dmar)	mtx_lock_spin(&(dmar)->fault_lock)
 #define	DMAR_FAULT_UNLOCK(dmar)	mtx_unlock_spin(&(dmar)->fault_lock)
 #define	DMAR_FAULT_ASSERT_LOCKED(dmar) mtx_assert(&(dmar)->fault_lock, MA_OWNED)
 
 #define	DMAR_IS_COHERENT(dmar)	(((dmar)->hw_ecap & DMAR_ECAP_C) != 0)
 #define	DMAR_HAS_QI(dmar)	(((dmar)->hw_ecap & DMAR_ECAP_QI) != 0)
 #define	DMAR_X2APIC(dmar) \
 	(x2apic_mode && ((dmar)->hw_ecap & DMAR_ECAP_EIM) != 0)
 
 /* Barrier ids */
 #define	DMAR_BARRIER_RMRR	0
 #define	DMAR_BARRIER_USEQ	1
 
 struct dmar_unit *dmar_find(device_t dev);
 struct dmar_unit *dmar_find_hpet(device_t dev, uint16_t *rid);
 struct dmar_unit *dmar_find_ioapic(u_int apic_id, uint16_t *rid);
 
 u_int dmar_nd2mask(u_int nd);
 bool dmar_pglvl_supported(struct dmar_unit *unit, int pglvl);
 int domain_set_agaw(struct dmar_domain *domain, int mgaw);
 int dmar_maxaddr2mgaw(struct dmar_unit *unit, dmar_gaddr_t maxaddr,
     bool allow_less);
 vm_pindex_t pglvl_max_pages(int pglvl);
 int domain_is_sp_lvl(struct dmar_domain *domain, int lvl);
 dmar_gaddr_t pglvl_page_size(int total_pglvl, int lvl);
 dmar_gaddr_t domain_page_size(struct dmar_domain *domain, int lvl);
 int calc_am(struct dmar_unit *unit, dmar_gaddr_t base, dmar_gaddr_t size,
     dmar_gaddr_t *isizep);
 struct vm_page *dmar_pgalloc(vm_object_t obj, vm_pindex_t idx, int flags);
 void dmar_pgfree(vm_object_t obj, vm_pindex_t idx, int flags);
 void *dmar_map_pgtbl(vm_object_t obj, vm_pindex_t idx, int flags,
     struct sf_buf **sf);
 void dmar_unmap_pgtbl(struct sf_buf *sf);
 int dmar_load_root_entry_ptr(struct dmar_unit *unit);
 int dmar_inv_ctx_glob(struct dmar_unit *unit);
 int dmar_inv_iotlb_glob(struct dmar_unit *unit);
 int dmar_flush_write_bufs(struct dmar_unit *unit);
 void dmar_flush_pte_to_ram(struct dmar_unit *unit, dmar_pte_t *dst);
 void dmar_flush_ctx_to_ram(struct dmar_unit *unit, dmar_ctx_entry_t *dst);
 void dmar_flush_root_to_ram(struct dmar_unit *unit, dmar_root_entry_t *dst);
 int dmar_enable_translation(struct dmar_unit *unit);
 int dmar_disable_translation(struct dmar_unit *unit);
 int dmar_load_irt_ptr(struct dmar_unit *unit);
 int dmar_enable_ir(struct dmar_unit *unit);
 int dmar_disable_ir(struct dmar_unit *unit);
 bool dmar_barrier_enter(struct dmar_unit *dmar, u_int barrier_id);
 void dmar_barrier_exit(struct dmar_unit *dmar, u_int barrier_id);
 uint64_t dmar_get_timeout(void);
 void dmar_update_timeout(uint64_t newval);
 
 int dmar_fault_intr(void *arg);
 void dmar_enable_fault_intr(struct dmar_unit *unit);
 void dmar_disable_fault_intr(struct dmar_unit *unit);
 int dmar_init_fault_log(struct dmar_unit *unit);
 void dmar_fini_fault_log(struct dmar_unit *unit);
 
 int dmar_qi_intr(void *arg);
 void dmar_enable_qi_intr(struct dmar_unit *unit);
 void dmar_disable_qi_intr(struct dmar_unit *unit);
 int dmar_init_qi(struct dmar_unit *unit);
 void dmar_fini_qi(struct dmar_unit *unit);
 void dmar_qi_invalidate_locked(struct dmar_domain *domain, dmar_gaddr_t start,
     dmar_gaddr_t size, struct dmar_qi_genseq *psec, bool emit_wait);
 void dmar_qi_invalidate_ctx_glob_locked(struct dmar_unit *unit);
 void dmar_qi_invalidate_iotlb_glob_locked(struct dmar_unit *unit);
 void dmar_qi_invalidate_iec_glob(struct dmar_unit *unit);
 void dmar_qi_invalidate_iec(struct dmar_unit *unit, u_int start, u_int cnt);
 
 vm_object_t domain_get_idmap_pgtbl(struct dmar_domain *domain,
     dmar_gaddr_t maxaddr);
 void put_idmap_pgtbl(vm_object_t obj);
 int domain_map_buf(struct dmar_domain *domain, dmar_gaddr_t base,
     dmar_gaddr_t size, vm_page_t *ma, uint64_t pflags, int flags);
 int domain_unmap_buf(struct dmar_domain *domain, dmar_gaddr_t base,
     dmar_gaddr_t size, int flags);
 void domain_flush_iotlb_sync(struct dmar_domain *domain, dmar_gaddr_t base,
     dmar_gaddr_t size);
 int domain_alloc_pgtbl(struct dmar_domain *domain);
 void domain_free_pgtbl(struct dmar_domain *domain);
 
 struct dmar_ctx *dmar_instantiate_ctx(struct dmar_unit *dmar, device_t dev,
     bool rmrr);
 struct dmar_ctx *dmar_get_ctx_for_dev(struct dmar_unit *dmar, device_t dev,
     uint16_t rid, bool id_mapped, bool rmrr_init);
 int dmar_move_ctx_to_domain(struct dmar_domain *domain, struct dmar_ctx *ctx);
 void dmar_free_ctx_locked(struct dmar_unit *dmar, struct dmar_ctx *ctx);
 void dmar_free_ctx(struct dmar_ctx *ctx);
 struct dmar_ctx *dmar_find_ctx_locked(struct dmar_unit *dmar, uint16_t rid);
 void dmar_domain_unload_entry(struct dmar_map_entry *entry, bool free);
 void dmar_domain_unload(struct dmar_domain *domain,
     struct dmar_map_entries_tailq *entries, bool cansleep);
 void dmar_domain_free_entry(struct dmar_map_entry *entry, bool free);
 
 int dmar_init_busdma(struct dmar_unit *unit);
 void dmar_fini_busdma(struct dmar_unit *unit);
 device_t dmar_get_requester(device_t dev, uint16_t *rid);
 
 void dmar_gas_init_domain(struct dmar_domain *domain);
 void dmar_gas_fini_domain(struct dmar_domain *domain);
 struct dmar_map_entry *dmar_gas_alloc_entry(struct dmar_domain *domain,
     u_int flags);
 void dmar_gas_free_entry(struct dmar_domain *domain,
     struct dmar_map_entry *entry);
 void dmar_gas_free_space(struct dmar_domain *domain,
     struct dmar_map_entry *entry);
 int dmar_gas_map(struct dmar_domain *domain,
     const struct bus_dma_tag_common *common, dmar_gaddr_t size, int offset,
     u_int eflags, u_int flags, vm_page_t *ma, struct dmar_map_entry **res);
 void dmar_gas_free_region(struct dmar_domain *domain,
     struct dmar_map_entry *entry);
 int dmar_gas_map_region(struct dmar_domain *domain,
     struct dmar_map_entry *entry, u_int eflags, u_int flags, vm_page_t *ma);
 int dmar_gas_reserve_region(struct dmar_domain *domain, dmar_gaddr_t start,
     dmar_gaddr_t end);
 
 void dmar_dev_parse_rmrr(struct dmar_domain *domain, device_t dev,
     struct dmar_map_entries_tailq *rmrr_entries);
 int dmar_instantiate_rmrr_ctxs(struct dmar_unit *dmar);
 
 void dmar_quirks_post_ident(struct dmar_unit *dmar);
 void dmar_quirks_pre_use(struct dmar_unit *dmar);
 
 int dmar_init_irt(struct dmar_unit *unit);
 void dmar_fini_irt(struct dmar_unit *unit);
 
 #define	DMAR_GM_CANWAIT	0x0001
 #define	DMAR_GM_CANSPLIT 0x0002
 
 #define	DMAR_PGF_WAITOK	0x0001
 #define	DMAR_PGF_ZERO	0x0002
 #define	DMAR_PGF_ALLOC	0x0004
 #define	DMAR_PGF_NOALLOC 0x0008
 #define	DMAR_PGF_OBJL	0x0010
 
 extern dmar_haddr_t dmar_high;
 extern int haw;
 extern int dmar_tbl_pagecnt;
 extern int dmar_match_verbose;
 extern int dmar_batch_coalesce;
 extern int dmar_check_free;
 
 static inline uint32_t
 dmar_read4(const struct dmar_unit *unit, int reg)
 {
 
 	return (bus_read_4(unit->regs, reg));
 }
 
 static inline uint64_t
 dmar_read8(const struct dmar_unit *unit, int reg)
 {
 #ifdef __i386__
 	uint32_t high, low;
 
 	low = bus_read_4(unit->regs, reg);
 	high = bus_read_4(unit->regs, reg + 4);
 	return (low | ((uint64_t)high << 32));
 #else
 	return (bus_read_8(unit->regs, reg));
 #endif
 }
 
 static inline void
 dmar_write4(const struct dmar_unit *unit, int reg, uint32_t val)
 {
 
 	KASSERT(reg != DMAR_GCMD_REG || (val & DMAR_GCMD_TE) ==
 	    (unit->hw_gcmd & DMAR_GCMD_TE),
 	    ("dmar%d clearing TE 0x%08x 0x%08x", unit->unit,
 	    unit->hw_gcmd, val));
 	bus_write_4(unit->regs, reg, val);
 }
 
 static inline void
 dmar_write8(const struct dmar_unit *unit, int reg, uint64_t val)
 {
 
 	KASSERT(reg != DMAR_GCMD_REG, ("8byte GCMD write"));
 #ifdef __i386__
 	uint32_t high, low;
 
 	low = val;
 	high = val >> 32;
 	bus_write_4(unit->regs, reg, low);
 	bus_write_4(unit->regs, reg + 4, high);
 #else
 	bus_write_8(unit->regs, reg, val);
 #endif
 }
 
 /*
  * dmar_pte_store and dmar_pte_clear ensure that on i386, 32bit writes
  * are issued in the correct order.  For store, the lower word,
  * containing the P or R and W bits, is set only after the high word
  * is written.  For clear, the P bit is cleared first, then the high
  * word is cleared.
  *
  * dmar_pte_update updates the pte.  For amd64, the update is atomic.
  * For i386, it first disables the entry by clearing the word
  * containing the P bit, and then defer to dmar_pte_store.  The locked
  * cmpxchg8b is probably available on any machine having DMAR support,
  * but interrupt translation table may be mapped uncached.
  */
 static inline void
 dmar_pte_store1(volatile uint64_t *dst, uint64_t val)
 {
 #ifdef __i386__
 	volatile uint32_t *p;
 	uint32_t hi, lo;
 
 	hi = val >> 32;
 	lo = val;
 	p = (volatile uint32_t *)dst;
 	*(p + 1) = hi;
 	*p = lo;
 #else
 	*dst = val;
 #endif
 }
 
 static inline void
 dmar_pte_store(volatile uint64_t *dst, uint64_t val)
 {
 
 	KASSERT(*dst == 0, ("used pte %p oldval %jx newval %jx",
 	    dst, (uintmax_t)*dst, (uintmax_t)val));
 	dmar_pte_store1(dst, val);
 }
 
 static inline void
 dmar_pte_update(volatile uint64_t *dst, uint64_t val)
 {
 
 #ifdef __i386__
 	volatile uint32_t *p;
 
 	p = (volatile uint32_t *)dst;
 	*p = 0;
 #endif
 	dmar_pte_store1(dst, val);
 }
 
 static inline void
 dmar_pte_clear(volatile uint64_t *dst)
 {
 #ifdef __i386__
 	volatile uint32_t *p;
 
 	p = (volatile uint32_t *)dst;
 	*p = 0;
 	*(p + 1) = 0;
 #else
 	*dst = 0;
 #endif
 }
 
 static inline bool
 dmar_test_boundary(dmar_gaddr_t start, dmar_gaddr_t size,
     dmar_gaddr_t boundary)
 {
 
 	if (boundary == 0)
 		return (true);
 	return (start + size <= ((start + boundary) & ~(boundary - 1)));
 }
 
 extern struct timespec dmar_hw_timeout;
 
 #define	DMAR_WAIT_UNTIL(cond)					\
 {								\
 	struct timespec last, curr;				\
 	bool forever;						\
 								\
 	if (dmar_hw_timeout.tv_sec == 0 &&			\
 	    dmar_hw_timeout.tv_nsec == 0) {			\
 		forever = true;					\
 	} else {						\
 		forever = false;				\
 		nanouptime(&curr);				\
-		last = curr;					\
-		timespecadd(&last, &dmar_hw_timeout);		\
+		timespecadd(&curr, &dmar_hw_timeout, &last);	\
 	}							\
 	for (;;) {						\
 		if (cond) {					\
 			error = 0;				\
 			break;					\
 		}						\
 		nanouptime(&curr);				\
 		if (!forever && timespeccmp(&last, &curr, <)) {	\
 			error = ETIMEDOUT;			\
 			break;					\
 		}						\
 		cpu_spinwait();					\
 	}							\
 }
 
 #ifdef INVARIANTS
 #define	TD_PREP_PINNED_ASSERT						\
 	int old_td_pinned;						\
 	old_td_pinned = curthread->td_pinned
 #define	TD_PINNED_ASSERT						\
 	KASSERT(curthread->td_pinned == old_td_pinned,			\
 	    ("pin count leak: %d %d %s:%d", curthread->td_pinned,	\
 	    old_td_pinned, __FILE__, __LINE__))
 #else
 #define	TD_PREP_PINNED_ASSERT
 #define	TD_PINNED_ASSERT
 #endif
 
 #endif
Index: head/tools/regression/posixsem/posixsem.c
===================================================================
--- head/tools/regression/posixsem/posixsem.c	(revision 336913)
+++ head/tools/regression/posixsem/posixsem.c	(revision 336914)
@@ -1,1443 +1,1414 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2008 Yahoo!, Inc.
  * All rights reserved.
  * Written by: John Baldwin <jhb@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the author nor the names of any co-contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/queue.h>
 #include <sys/_semaphore.h>
 #include <sys/sysctl.h>
 #include <sys/time.h>
 #include <sys/user.h>
 #include <sys/wait.h>
 
 #include <errno.h>
 #include <fcntl.h>
 #include <kvm.h>
 #include <limits.h>
 #include <semaphore.h>
 #include <signal.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <time.h>
 #include <unistd.h>
 
 #include "test.h"
 
-/* Cut and pasted from kernel header, bah! */
-
-/* Operations on timespecs */
-#define	timespecclear(tvp)	((tvp)->tv_sec = (tvp)->tv_nsec = 0)
-#define	timespecisset(tvp)	((tvp)->tv_sec || (tvp)->tv_nsec)
-#define	timespeccmp(tvp, uvp, cmp)					\
-	(((tvp)->tv_sec == (uvp)->tv_sec) ?				\
-	    ((tvp)->tv_nsec cmp (uvp)->tv_nsec) :			\
-	    ((tvp)->tv_sec cmp (uvp)->tv_sec))
-#define timespecadd(vvp, uvp)						\
-	do {								\
-		(vvp)->tv_sec += (uvp)->tv_sec;				\
-		(vvp)->tv_nsec += (uvp)->tv_nsec;			\
-		if ((vvp)->tv_nsec >= 1000000000) {			\
-			(vvp)->tv_sec++;				\
-			(vvp)->tv_nsec -= 1000000000;			\
-		}							\
-	} while (0)
-#define timespecsub(vvp, uvp)						\
-	do {								\
-		(vvp)->tv_sec -= (uvp)->tv_sec;				\
-		(vvp)->tv_nsec -= (uvp)->tv_nsec;			\
-		if ((vvp)->tv_nsec < 0) {				\
-			(vvp)->tv_sec--;				\
-			(vvp)->tv_nsec += 1000000000;			\
-		}							\
-	} while (0)
-
-
 /* Name of the named semaphore shared by the tests in this file. */
 #define	TEST_PATH	"/tmp/posixsem_regression_test"
 
 /*
  * True when 'elapsed' differs from 'limit' by less than 100; the callers
  * in this file pass both values in milliseconds.
  */
 #define	ELAPSED(elapsed, limit)		(abs((elapsed) - (limit)) < 100)
 
 /*
  * Macros for passing child status to parent over a pipe.  CSTAT() packs
  * a failure class above the low 16 bits and an errno value into the low
  * 16 bits; CSTAT_CLASS() and CSTAT_ERROR() unpack them again.
  */
 #define	CSTAT(class, error)		((class) << 16 | (error))
 #define	CSTAT_CLASS(stat)		((stat) >> 16)
 #define	CSTAT_ERROR(stat)		((stat) & 0xffff)
 
 /*
  * Helper routine for tests that use a child process.  This routine
  * creates a pipe and forks a child process.  The child process runs
  * the 'func' routine which returns a status integer.  The status
  * integer gets written over the pipe to the parent and returned in
  * '*stat'.  If there is an error in pipe(), fork(), or wait(), or if
  * the child exits without delivering a complete status, this returns
  * -1 and fails the test.  Returns 0 on success.
  */
 static int
 child_worker(int (*func)(void *arg), void *arg, int *stat)
 {
 	pid_t pid;
 	ssize_t nread;
 	int pfd[2], cstat;
 
 	if (pipe(pfd) < 0) {
 		fail_errno("pipe");
 		return (-1);
 	}
 
 	pid = fork();
 	switch (pid) {
 	case -1:
 		/* Error. */
 		fail_errno("fork");
 		close(pfd[0]);
 		close(pfd[1]);
 		return (-1);
 	case 0:
 		/* Child. */
 		cstat = func(arg);
 		write(pfd[1], &cstat, sizeof(cstat));
 		exit(0);
 	}
 
 	/*
 	 * Drop the parent's copy of the write end.  Otherwise read()
 	 * can never see EOF and would block forever if the child died
 	 * before writing its status.
 	 */
 	close(pfd[1]);
 
 	nread = read(pfd[0], stat, sizeof(*stat));
 	if (nread != (ssize_t)sizeof(*stat)) {
 		/* Report before close() gets a chance to touch errno. */
 		if (nread < 0) {
 			fail_errno("read(pipe)");
 		} else {
 			fail_err("short read of child status");
 		}
 		close(pfd[0]);
 		/* Do not leave a zombie behind on the error path. */
 		(void)waitpid(pid, NULL, 0);
 		return (-1);
 	}
 	if (waitpid(pid, NULL, 0) < 0) {
 		fail_errno("wait");
 		close(pfd[0]);
 		return (-1);
 	}
 	close(pfd[0]);
 	return (0);
 }
 
 /*
  * Expect ksem_open() with the given arguments to fail with errno equal
  * to 'error'.  An unexpected success is reported and the semaphore is
  * closed again; a wrong errno is reported as well.
  */
 static void
 ksem_open_should_fail(const char *path, int flags, mode_t mode, unsigned int
     value, int error)
 {
 	semid_t sem;
 	int rv;
 
 	rv = ksem_open(&sem, path, flags, mode, value);
 	if (rv >= 0) {
 		fail_err("ksem_open() didn't fail");
 		ksem_close(sem);
 	} else if (errno != error) {
 		fail_errno("ksem_open");
 	} else {
 		pass();
 	}
 }
 
 /*
  * Expect ksem_unlink() of 'path' to fail with errno equal to 'error'.
  */
 static void
 ksem_unlink_should_fail(const char *path, int error)
 {
 	int rv;
 
 	rv = ksem_unlink(path);
 	if (rv >= 0) {
 		fail_err("ksem_unlink() didn't fail");
 	} else if (errno != error) {
 		fail_errno("ksem_unlink");
 	} else {
 		pass();
 	}
 }
 
 /*
  * Expect ksem_close() of 'id' to fail with errno equal to 'error'.
  */
 static void
 ksem_close_should_fail(semid_t id, int error)
 {
 	int rv;
 
 	rv = ksem_close(id);
 	if (rv >= 0) {
 		fail_err("ksem_close() didn't fail");
 	} else if (errno != error) {
 		fail_errno("ksem_close");
 	} else {
 		pass();
 	}
 }
 
 /*
  * Expect ksem_init() with initial value 'value' to fail with errno equal
  * to 'error'.  An unexpected success is reported and the semaphore is
  * destroyed again.
  */
 static void
 ksem_init_should_fail(unsigned int value, int error)
 {
 	semid_t sem;
 	int rv;
 
 	rv = ksem_init(&sem, value);
 	if (rv >= 0) {
 		fail_err("ksem_init() didn't fail");
 		ksem_destroy(sem);
 	} else if (errno != error) {
 		fail_errno("ksem_init");
 	} else {
 		pass();
 	}
 }
 
 /*
  * Expect ksem_destroy() of 'id' to fail with errno equal to 'error'.
  */
 static void
 ksem_destroy_should_fail(semid_t id, int error)
 {
 	int rv;
 
 	rv = ksem_destroy(id);
 	if (rv >= 0) {
 		fail_err("ksem_destroy() didn't fail");
 	} else if (errno != error) {
 		fail_errno("ksem_destroy");
 	} else {
 		pass();
 	}
 }
 
 /*
  * Expect ksem_post() on 'id' to fail with errno equal to 'error'.
  */
 static void
 ksem_post_should_fail(semid_t id, int error)
 {
 	int rv;
 
 	rv = ksem_post(id);
 	if (rv >= 0) {
 		fail_err("ksem_post() didn't fail");
 	} else if (errno != error) {
 		fail_errno("ksem_post");
 	} else {
 		pass();
 	}
 }
 
 /*
  * Create and close a named semaphore, unlink it, and check that a
  * subsequent open without O_CREAT fails with ENOENT.
  */
 static void
 open_after_unlink(void)
 {
 	semid_t sem;
 
 	if (ksem_open(&sem, TEST_PATH, O_CREAT, 0777, 1) < 0) {
 		fail_errno("ksem_open(1)");
 	} else {
 		ksem_close(sem);
 		if (ksem_unlink(TEST_PATH) < 0) {
 			fail_errno("ksem_unlink");
 		} else {
 			ksem_open_should_fail(TEST_PATH, O_RDONLY, 0777, 1,
 			    ENOENT);
 		}
 	}
 }
 TEST(open_after_unlink, "open after unlink");
 
 /* Opening the name "blah" must be rejected with EINVAL. */
 static void
 open_invalid_path(void)
 {
 	ksem_open_should_fail("blah", 0, 0777, 1, EINVAL);
 }
 TEST(open_invalid_path, "open invalid path");
 
 /* Passing O_DIRECT in the open flags must be rejected with EINVAL. */
 static void
 open_extra_flags(void)
 {
 	const int flags = O_RDONLY | O_DIRECT;
 
 	ksem_open_should_fail(TEST_PATH, flags, 0777, 1, EINVAL);
 }
 TEST(open_extra_flags, "open with extra flags");
 
 /*
  * Creating a semaphore with an initial value of UINT_MAX must fail with
  * EINVAL.  Any leftover semaphore of that name is unlinked first; the
  * result of that unlink is deliberately ignored.
  */
 static void
 open_bad_value(void)
 {
 	(void)ksem_unlink(TEST_PATH);
 	ksem_open_should_fail(TEST_PATH, O_CREAT, 0777, UINT_MAX, EINVAL);
 }
 TEST(open_bad_value, "open with invalid initial value");
 
 /* A bogus path pointer (address 1024) must make the open fail with EFAULT. */
 static void
 open_bad_path_pointer(void)
 {
 	ksem_open_should_fail((char *)1024, O_RDONLY, 0777, 1, EFAULT);
 }
 TEST(open_bad_path_pointer, "open bad path pointer");
 
 /*
  * Opening a name made of MAXPATHLEN 'a' characters must fail with
  * ENAMETOOLONG.  The test is failed, rather than crashing in memset(),
  * if the buffer for the name cannot be allocated.
  */
 static void
 open_path_too_long(void)
 {
 	char *page;
 
 	page = malloc(MAXPATHLEN + 1);
 	if (page == NULL) {
 		fail_errno("malloc");
 		return;
 	}
 	memset(page, 'a', MAXPATHLEN);
 	page[MAXPATHLEN] = '\0';
 	ksem_open_should_fail(page, O_RDONLY, 0777, 1, ENAMETOOLONG);
 	free(page);
 }
 TEST(open_path_too_long, "open pathname too long");
 
 /* Opening "/notreallythere" without O_CREAT must fail with ENOENT. */
 static void
 open_nonexisting_semaphore(void)
 {
 	ksem_open_should_fail("/notreallythere", 0, 0777, 1, ENOENT);
 }
 TEST(open_nonexisting_semaphore, "open nonexistent semaphore");
 
 /*
  * Create a named semaphore, then check that creating it again with
  * O_CREAT | O_EXCL fails with EEXIST.  The name is unlinked afterwards.
  */
 static void
 exclusive_create_existing_semaphore(void)
 {
 	semid_t sem;
 	int rv;
 
 	rv = ksem_open(&sem, TEST_PATH, O_CREAT, 0777, 1);
 	if (rv < 0) {
 		fail_errno("ksem_open(O_CREAT)");
 		return;
 	}
 	ksem_close(sem);
 	ksem_open_should_fail(TEST_PATH, O_CREAT | O_EXCL, 0777, 1, EEXIST);
 	ksem_unlink(TEST_PATH);
 }
 TEST(exclusive_create_existing_semaphore, "O_EXCL of existing semaphore");
 
 /* ksem_init() with an initial value of UINT_MAX must fail with EINVAL. */
 static void
 init_bad_value(void)
 {
 	ksem_init_should_fail(UINT_MAX, EINVAL);
 }
 TEST(init_bad_value, "init with invalid initial value");
 
 /* A bogus path pointer (address 1024) must make unlink fail with EFAULT. */
 static void
 unlink_bad_path_pointer(void)
 {
 	ksem_unlink_should_fail((char *)1024, EFAULT);
 }
 TEST(unlink_bad_path_pointer, "unlink bad path pointer");
 
 /*
  * Unlinking a name made of MAXPATHLEN 'a' characters must fail with
  * ENAMETOOLONG.  The test is failed, rather than crashing in memset(),
  * if the buffer for the name cannot be allocated.
  */
 static void
 unlink_path_too_long(void)
 {
 	char *page;
 
 	page = malloc(MAXPATHLEN + 1);
 	if (page == NULL) {
 		fail_errno("malloc");
 		return;
 	}
 	memset(page, 'a', MAXPATHLEN);
 	page[MAXPATHLEN] = '\0';
 	ksem_unlink_should_fail(page, ENAMETOOLONG);
 	free(page);
 }
 TEST(unlink_path_too_long, "unlink pathname too long");
 
 /*
  * ksem_destroy() applied to a named semaphore must fail with EINVAL.
  * The semaphore is closed and unlinked afterwards.
  */
 static void
 destroy_named_semaphore(void)
 {
 	semid_t sem;
 	int rv;
 
 	rv = ksem_open(&sem, TEST_PATH, O_CREAT, 0777, 1);
 	if (rv < 0) {
 		fail_errno("ksem_open(O_CREAT)");
 		return;
 	}
 	ksem_destroy_should_fail(sem, EINVAL);
 	ksem_close(sem);
 	ksem_unlink(TEST_PATH);
 }
 TEST(destroy_named_semaphore, "destroy named semaphore");
 
 /*
  * ksem_close() applied to an unnamed semaphore must fail with EINVAL.
  * The semaphore is destroyed afterwards.
  */
 static void
 close_unnamed_semaphore(void)
 {
 	semid_t sem;
 	int rv;
 
 	rv = ksem_init(&sem, 1);
 	if (rv < 0) {
 		fail_errno("ksem_init");
 		return;
 	}
 	ksem_close_should_fail(sem, EINVAL);
 	ksem_destroy(sem);
 }
 TEST(close_unnamed_semaphore, "close unnamed semaphore");
 
 /* ksem_destroy() on STDERR_FILENO must fail with EINVAL. */
 static void
 destroy_invalid_fd(void)
 {
 	ksem_destroy_should_fail(STDERR_FILENO, EINVAL);
 }
 TEST(destroy_invalid_fd, "destroy non-semaphore file descriptor");
 
 /* ksem_close() on STDERR_FILENO must fail with EINVAL. */
 static void
 close_invalid_fd(void)
 {
 	ksem_close_should_fail(STDERR_FILENO, EINVAL);
 }
 TEST(close_invalid_fd, "close non-semaphore file descriptor");
 
 /* An unnamed semaphore can be created and then destroyed. */
 static void
 create_unnamed_semaphore(void)
 {
 	semid_t sem;
 
 	if (ksem_init(&sem, 1) < 0) {
 		fail_errno("ksem_init");
 	} else if (ksem_destroy(sem) < 0) {
 		fail_errno("ksem_destroy");
 	} else {
 		pass();
 	}
 }
 TEST(create_unnamed_semaphore, "create unnamed semaphore");
 
 /* A named semaphore can be created, closed and unlinked. */
 static void
 open_named_semaphore(void)
 {
 	semid_t sem;
 
 	if (ksem_open(&sem, TEST_PATH, O_CREAT, 0777, 1) < 0) {
 		fail_errno("ksem_open(O_CREAT)");
 	} else if (ksem_close(sem) < 0) {
 		fail_errno("ksem_close");
 	} else if (ksem_unlink(TEST_PATH) < 0) {
 		fail_errno("ksem_unlink");
 	} else {
 		pass();
 	}
 }
 TEST(open_named_semaphore, "create named semaphore");
 
 /* ksem_getvalue() on STDERR_FILENO must fail with EINVAL. */
 static void
 getvalue_invalid_semaphore(void)
 {
 	int value, rv;
 
 	rv = ksem_getvalue(STDERR_FILENO, &value);
 	if (rv >= 0) {
 		fail_err("ksem_getvalue() didn't fail");
 	} else if (errno != EINVAL) {
 		fail_errno("ksem_getvalue");
 	} else {
 		pass();
 	}
 }
 TEST(getvalue_invalid_semaphore, "get value of invalid semaphore");
 
 /* ksem_post() on STDERR_FILENO must fail with EINVAL. */
 static void
 post_invalid_semaphore(void)
 {
 	ksem_post_should_fail(STDERR_FILENO, EINVAL);
 }
 TEST(post_invalid_semaphore, "post of invalid semaphore");
 
 /* ksem_wait() on STDERR_FILENO must fail with EINVAL. */
 static void
 wait_invalid_semaphore(void)
 {
 	int rv;
 
 	rv = ksem_wait(STDERR_FILENO);
 	if (rv >= 0) {
 		fail_err("ksem_wait() didn't fail");
 	} else if (errno != EINVAL) {
 		fail_errno("ksem_wait");
 	} else {
 		pass();
 	}
 }
 TEST(wait_invalid_semaphore, "wait for invalid semaphore");
 
 /* ksem_trywait() on STDERR_FILENO must fail with EINVAL. */
 static void
 trywait_invalid_semaphore(void)
 {
 	int rv;
 
 	rv = ksem_trywait(STDERR_FILENO);
 	if (rv >= 0) {
 		fail_err("ksem_trywait() didn't fail");
 	} else if (errno != EINVAL) {
 		fail_errno("ksem_trywait");
 	} else {
 		pass();
 	}
 }
 TEST(trywait_invalid_semaphore, "try wait for invalid semaphore");
 
 /*
  * ksem_timedwait() on STDERR_FILENO, with a NULL timeout, must fail with
  * EINVAL.
  */
 static void
 timedwait_invalid_semaphore(void)
 {
 	int rv;
 
 	rv = ksem_timedwait(STDERR_FILENO, NULL);
 	if (rv >= 0) {
 		fail_err("ksem_timedwait() didn't fail");
 	} else if (errno != EINVAL) {
 		fail_errno("ksem_timedwait");
 	} else {
 		pass();
 	}
 }
 TEST(timedwait_invalid_semaphore, "timed wait for invalid semaphore");
 
 /*
  * Fetch the current value of semaphore 'id' and compare it with
  * 'expected'.  Returns 0 on a match.  On a ksem_getvalue() error or a
  * mismatch the test is failed and -1 is returned.
  */
 static int
 checkvalue(semid_t id, int expected)
 {
 	int current;
 
 	if (ksem_getvalue(id, &current) < 0) {
 		fail_errno("ksem_getvalue");
 		return (-1);
 	}
 	if (current == expected)
 		return (0);
 	fail_err("sem value should be %d instead of %d", expected, current);
 	return (-1);
 }
 
 /*
  * Posting an unnamed semaphore whose value is 1 must raise the value
  * to 2.
  */
 static void
 post_test(void)
 {
 	semid_t sem;
 
 	if (ksem_init(&sem, 1) < 0) {
 		fail_errno("ksem_init");
 		return;
 	}
 	if (checkvalue(sem, 1) < 0)
 		goto destroy;
 	if (ksem_post(sem) < 0) {
 		fail_errno("ksem_post");
 		goto destroy;
 	}
 	if (checkvalue(sem, 2) < 0)
 		goto destroy;
 	if (ksem_destroy(sem) < 0) {
 		fail_errno("ksem_destroy");
 		return;
 	}
 	pass();
 	return;
 
 destroy:
 	/* A step already failed the test; just release the semaphore. */
 	ksem_destroy(sem);
 }
 TEST(post_test, "simple post");
 
 /*
  * A named semaphore must stay usable through an open descriptor after
  * its name has been unlinked: post and wait still adjust the value.
  */
 static void
 use_after_unlink_test(void)
 {
 	semid_t sem;
 
 	/*
 	 * Create the semaphore with a value of 1, then remove the name
 	 * while holding on to the descriptor from the create.
 	 */
 	if (ksem_open(&sem, TEST_PATH, O_CREAT | O_EXCL, 0777, 1) < 0) {
 		fail_errno("ksem_open(O_CREAT | O_EXCL)");
 		return;
 	}
 	if (ksem_unlink(TEST_PATH) < 0) {
 		fail_errno("ksem_unlink");
 		goto close;
 	}
 	if (checkvalue(sem, 1) < 0)
 		goto close;
 
 	/* A post takes the value to 2. */
 	if (ksem_post(sem) < 0) {
 		fail_errno("ksem_post");
 		goto close;
 	}
 	if (checkvalue(sem, 2) < 0)
 		goto close;
 
 	/* A wait takes the value back down to 1. */
 	if (ksem_wait(sem) < 0) {
 		fail_errno("ksem_wait");
 		goto close;
 	}
 	if (checkvalue(sem, 1) < 0)
 		goto close;
 
 	if (ksem_close(sem) < 0) {
 		fail_errno("ksem_close");
 		return;
 	}
 	pass();
 	return;
 
 close:
 	/* A step already failed the test; just drop the descriptor. */
 	ksem_close(sem);
 }
 TEST(use_after_unlink_test, "use named semaphore after unlink");
 
 /*
  * ksem_trywait() on a semaphore with value 1 must succeed and leave the
  * value at 0.
  */
 static void
 unlocked_trywait(void)
 {
 	semid_t sem;
 
 	if (ksem_init(&sem, 1) < 0) {
 		fail_errno("ksem_init");
 		return;
 	}
 
 	/* Expected to succeed and take the value down to 0. */
 	if (ksem_trywait(sem) < 0) {
 		fail_errno("ksem_trywait()");
 		goto destroy;
 	}
 	if (checkvalue(sem, 0) < 0)
 		goto destroy;
 
 	if (ksem_destroy(sem) < 0) {
 		fail_errno("ksem_destroy");
 		return;
 	}
 	pass();
 	return;
 
 destroy:
 	ksem_destroy(sem);
 }
 TEST(unlocked_trywait, "unlocked trywait");
 
 /*
  * ksem_trywait() on a semaphore with value 0 must fail with EAGAIN and
  * leave the value at 0.
  */
 static void
 locked_trywait(void)
 {
 	semid_t sem;
 
 	if (ksem_init(&sem, 0) < 0) {
 		fail_errno("ksem_init");
 		return;
 	}
 
 	/* Expected to fail with EAGAIN without touching the value. */
 	if (ksem_trywait(sem) >= 0) {
 		fail_err("ksem_trywait() didn't fail");
 		goto destroy;
 	}
 	if (errno != EAGAIN) {
 		fail_errno("wrong error from ksem_trywait()");
 		goto destroy;
 	}
 	if (checkvalue(sem, 0) < 0)
 		goto destroy;
 
 	if (ksem_destroy(sem) < 0) {
 		fail_errno("ksem_destroy");
 		return;
 	}
 	pass();
 	return;
 
 destroy:
 	ksem_destroy(sem);
 }
 TEST(locked_trywait, "locked trywait");
 
 /*
  * Use a timer to post a specific semaphore after a timeout.  A timer
  * is scheduled via schedule_post().  check_alarm() must be called
  * afterwards to clean up and check for errors.
  *
  * alarm_errno is written from the SIGALRM handler and read from normal
  * context, hence volatile sig_atomic_t.
  */
 static semid_t alarm_id = -1;
 static volatile sig_atomic_t alarm_errno;
 static int alarm_handler_installed;
 
 /*
  * SIGALRM handler: post the scheduled semaphore and record the errno of
  * a failed post in alarm_errno.  errno is saved and restored so that the
  * code interrupted by the signal never observes a value set in here.
  */
 static void
 alarm_handler(int signo)
 {
 	int saved_errno;
 
 	(void)signo;
 	saved_errno = errno;
 	if (ksem_post(alarm_id) < 0)
 		alarm_errno = errno;
 	errno = saved_errno;
 }
 
 /*
  * Disarm the timer set up by schedule_post() and release the alarm
  * slot.
  *
  * With 'just_clear' non-zero, any error recorded by the handler is
  * discarded and 0 is returned.  Otherwise a setitimer() failure or an
  * error recorded by the handler fails the test and yields -1.
  *
  * Once the timer is known to be disarmed the slot (alarm_id) is always
  * released, including when the handler reported an error, so that one
  * failing test does not make every later schedule_post() fail with
  * "already scheduled".
  */
 static int
 check_alarm(int just_clear)
 {
 	struct itimerval it;
 
 	bzero(&it, sizeof(it));
 	if (just_clear) {
 		setitimer(ITIMER_REAL, &it, NULL);
 		alarm_errno = 0;
 		alarm_id = -1;
 		return (0);
 	}
 	if (setitimer(ITIMER_REAL, &it, NULL) < 0) {
 		/* The timer may still be pending; keep the slot taken. */
 		fail_errno("setitimer");
 		return (-1);
 	}
 	alarm_id = -1;
 	if (alarm_errno != 0) {
 		errno = alarm_errno;
 		fail_errno("ksem_post() (via timeout)");
 		alarm_errno = 0;
 		return (-1);
 	}
 
 	return (0);
 }
 
 /*
  * Arrange for semaphore 'id' to be posted from the SIGALRM handler
  * 'msec' milliseconds from now.  The handler is installed on first use.
  * Only one post may be outstanding at a time.  Returns 0 on success; on
  * any failure the test is failed and -1 is returned.
  */
 static int
 schedule_post(semid_t id, u_int msec)
 {
 	struct itimerval it;
 
 	if (!alarm_handler_installed) {
 		if (signal(SIGALRM, alarm_handler) == SIG_ERR) {
 			fail_errno("signal(SIGALRM)");
 			return (-1);
 		}
 		alarm_handler_installed = 1;
 	}
 	if (alarm_id != -1) {
 		fail_err("ksem_post() already scheduled");
 		return (-1);
 	}
 	alarm_id = id;
 	bzero(&it, sizeof(it));
 	it.it_value.tv_sec = msec / 1000;
 	it.it_value.tv_usec = (msec % 1000) * 1000;
 	if (setitimer(ITIMER_REAL, &it, NULL) < 0) {
 		fail_errno("setitimer");
 		/* Nothing was armed, so give the slot back. */
 		alarm_id = -1;
 		return (-1);
 	}
 	return (0);
 }
 
 /*
  * Call ksem_timedwait() on 'id' with a deadline of the current
  * CLOCK_REALTIME time plus 'msec' milliseconds, and store the measured
  * duration of the call, in milliseconds, in '*delta'.
  *
  * 'error' is the errno the wait is expected to fail with, or 0 if it is
  * expected to succeed.  Returns 0 when the outcome matched.  Returns -1,
  * after failing the test, when it did not match or clock_gettime()
  * failed; '*delta' is not written in that case.
  */
 static int
 timedwait(semid_t id, u_int msec, u_int *delta, int error)
 {
 	struct timespec start, end;
 
 	if (clock_gettime(CLOCK_REALTIME, &start) < 0) {
 		fail_errno("clock_gettime(CLOCK_REALTIME)");
 		return (-1);
 	}
 	/* Build the relative timeout, then turn it into a deadline. */
 	end.tv_sec = msec / 1000;
 	end.tv_nsec = msec % 1000 * 1000000;
-	timespecadd(&end, &start);
+	timespecadd(&end, &start, &end);
 	if (ksem_timedwait(id, &end) < 0) {
 		if (errno != error) {
 			fail_errno("ksem_timedwait");
 			return (-1);
 		}
 	} else if (error != 0) {
 		fail_err("ksem_timedwait() didn't fail");
 		return (-1);
 	}
 	/* 'end' is reused: it now holds the completion time. */
 	if (clock_gettime(CLOCK_REALTIME, &end) < 0) {
 		fail_errno("clock_gettime(CLOCK_REALTIME)");
 		return (-1);
 	}
-	timespecsub(&end, &start);
+	timespecsub(&end, &start, &end);
 	*delta = end.tv_nsec / 1000000;
 	*delta += end.tv_sec * 1000;
 	return (0);
 }
 
 /*
  * A timed wait on a semaphore with value 1 must return right away and
  * leave the value at 0.
  */
 static void
 unlocked_timedwait(void)
 {
 	semid_t sem;
 	u_int ms;
 
 	if (ksem_init(&sem, 1) < 0) {
 		fail_errno("ksem_init");
 		return;
 	}
 
 	/* Expected to return immediately with the value dropped to 0. */
 	if (timedwait(sem, 5000, &ms, 0) < 0)
 		goto destroy;
 	if (!ELAPSED(ms, 0)) {
 		fail_err("ksem_timedwait() of unlocked sem took %ums", ms);
 		goto destroy;
 	}
 	if (checkvalue(sem, 0) < 0)
 		goto destroy;
 
 	if (ksem_destroy(sem) < 0) {
 		fail_errno("ksem_destroy");
 		return;
 	}
 	pass();
 	return;
 
 destroy:
 	ksem_destroy(sem);
 }
 TEST(unlocked_timedwait, "unlocked timedwait");
 
 /*
  * A 2500 ms timed wait on a semaphore with value 0 must fail with
  * ETIMEDOUT after roughly 2500 ms and leave the value at 0.
  */
 static void
 expired_timedwait(void)
 {
 	semid_t sem;
 	u_int ms;
 
 	if (ksem_init(&sem, 0) < 0) {
 		fail_errno("ksem_init");
 		return;
 	}
 
 	/* Expected to time out without changing the value. */
 	if (timedwait(sem, 2500, &ms, ETIMEDOUT) < 0)
 		goto destroy;
 	if (!ELAPSED(ms, 2500)) {
 		fail_err(
 	    "ksem_timedwait() of locked sem took %ums instead of 2500ms",
 		    ms);
 		goto destroy;
 	}
 	if (checkvalue(sem, 0) < 0)
 		goto destroy;
 
 	if (ksem_destroy(sem) < 0) {
 		fail_errno("ksem_destroy");
 		return;
 	}
 	pass();
 	return;
 
 destroy:
 	ksem_destroy(sem);
 }
 TEST(expired_timedwait, "locked timedwait timeout");
 
 /*
  * A 2000 ms timed wait on a semaphore with value 0 must succeed after
  * roughly 1000 ms when a post is scheduled for that time.
  */
 static void
 locked_timedwait(void)
 {
 	semid_t sem;
 	u_int ms;
 
 	if (ksem_init(&sem, 0) < 0) {
 		fail_errno("ksem_init");
 		return;
 	}
 
 	/*
 	 * Arm a post for 1000 ms from now.  The timed wait below should
 	 * then complete at about that point instead of timing out.
 	 */
 	if (schedule_post(sem, 1000) < 0)
 		goto destroy;
 	if (timedwait(sem, 2000, &ms, 0) < 0)
 		goto disarm;
 	if (!ELAPSED(ms, 1000)) {
 		fail_err(
 	    "ksem_timedwait() with delayed post took %ums instead of 1000ms",
 		    ms);
 		goto disarm;
 	}
 	if (check_alarm(0) < 0)
 		goto destroy;
 
 	if (ksem_destroy(sem) < 0) {
 		fail_errno("ksem_destroy");
 		return;
 	}
 	pass();
 	return;
 
 disarm:
 	check_alarm(1);
 destroy:
 	ksem_destroy(sem);
 }
 TEST(locked_timedwait, "locked timedwait");
 
 /*
  * Call ksem_wait() on 'id' and store the measured CLOCK_REALTIME
  * duration of the call, in milliseconds, in '*delta'.  Returns 0 on
  * success.  If clock_gettime() or ksem_wait() fails the test is failed
  * and -1 is returned without writing '*delta'.
  */
 static int
 testwait(semid_t id, u_int *delta)
 {
 	struct timespec start, end;
 
 	if (clock_gettime(CLOCK_REALTIME, &start) < 0) {
 		fail_errno("clock_gettime(CLOCK_REALTIME)");
 		return (-1);
 	}
 	if (ksem_wait(id) < 0) {
 		fail_errno("ksem_wait");
 		return (-1);
 	}
 	if (clock_gettime(CLOCK_REALTIME, &end) < 0) {
 		fail_errno("clock_gettime(CLOCK_REALTIME)");
 		return (-1);
 	}
-	timespecsub(&end, &start);
+	timespecsub(&end, &start, &end);
 	*delta = end.tv_nsec / 1000000;
 	*delta += end.tv_sec * 1000;
 	return (0);
 }
 
 /*
  * A wait on a semaphore with value 1 must return right away and leave
  * the value at 0.
  */
 static void
 unlocked_wait(void)
 {
 	semid_t sem;
 	u_int ms;
 
 	if (ksem_init(&sem, 1) < 0) {
 		fail_errno("ksem_init");
 		return;
 	}
 
 	/* Expected to return immediately with the value dropped to 0. */
 	if (testwait(sem, &ms) < 0)
 		goto destroy;
 	if (!ELAPSED(ms, 0)) {
 		fail_err("ksem_wait() of unlocked sem took %ums", ms);
 		goto destroy;
 	}
 	if (checkvalue(sem, 0) < 0)
 		goto destroy;
 
 	if (ksem_destroy(sem) < 0) {
 		fail_errno("ksem_destroy");
 		return;
 	}
 	pass();
 	return;
 
 destroy:
 	ksem_destroy(sem);
 }
 TEST(unlocked_wait, "unlocked wait");
 
 /*
  * A wait on a semaphore with value 0 must complete after roughly
  * 1000 ms when a post is scheduled for that time.
  */
 static void
 locked_wait(void)
 {
 	semid_t sem;
 	u_int ms;
 
 	if (ksem_init(&sem, 0) < 0) {
 		fail_errno("ksem_init");
 		return;
 	}
 
 	/*
 	 * Arm a post for 1000 ms from now; the wait below should then
 	 * complete at about that point.
 	 */
 	if (schedule_post(sem, 1000) < 0)
 		goto destroy;
 	if (testwait(sem, &ms) < 0)
 		goto disarm;
 	if (!ELAPSED(ms, 1000)) {
 		fail_err(
 	    "ksem_wait() with delayed post took %ums instead of 1000ms",
 		    ms);
 		goto disarm;
 	}
 	if (check_alarm(0) < 0)
 		goto destroy;
 
 	if (ksem_destroy(sem) < 0) {
 		fail_errno("ksem_destroy");
 		return;
 	}
 	pass();
 	return;
 
 disarm:
 	check_alarm(1);
 destroy:
 	ksem_destroy(sem);
 }
 TEST(locked_wait, "locked wait");
 
 /*
  * Child side of the two-process wait test: open the semaphore by name,
  * block on it until the parent posts it, then close it.  The return
  * value is a CSTAT() status whose class identifies the step that failed
  * (1 open, 2 wait, 3 close) or is 0 on success.  'arg' is unused.
  */
 static int
 wait_twoproc_child(void *arg)
 {
 	semid_t sem;
 	int rv;
 
 	rv = ksem_open(&sem, TEST_PATH, 0, 0, 0);
 	if (rv < 0)
 		return (CSTAT(1, errno));
 	rv = ksem_wait(sem);
 	if (rv < 0)
 		return (CSTAT(2, errno));
 	rv = ksem_close(sem);
 	if (rv < 0)
 		return (CSTAT(3, errno));
 	return (CSTAT(0, 0));
 }
 
 /*
  * Parent side of the two-process wait test: create the named semaphore
  * with value 0, schedule a post for 500 ms from now, run
  * wait_twoproc_child() in a child process and report the status it
  * sends back.
  */
 static void
 wait_twoproc_test(void)
 {
 	semid_t sem;
 	int stat, cls;
 
 	if (ksem_open(&sem, TEST_PATH, O_CREAT, 0777, 0) != 0) {
 		fail_errno("ksem_open");
 		return;
 	}
 
 	if (schedule_post(sem, 500) < 0)
 		goto release;
 
 	if (child_worker(wait_twoproc_child, NULL, &stat) < 0)
 		goto disarm;
 
 	errno = CSTAT_ERROR(stat);
 	cls = CSTAT_CLASS(stat);
 	if (cls == 0) {
 		pass();
 	} else if (cls == 1) {
 		fail_errno("child ksem_open()");
 	} else if (cls == 2) {
 		fail_errno("child ksem_wait()");
 	} else if (cls == 3) {
 		fail_errno("child ksem_close()");
 	} else {
 		fail_err("bad child state %#x", stat);
 	}
 
 disarm:
 	check_alarm(1);
 release:
 	ksem_close(sem);
 	ksem_unlink(TEST_PATH);
 }
 TEST(wait_twoproc_test, "two proc wait");
 
 /*
  * A semaphore initialised to SEM_VALUE_MAX must report exactly that
  * value, and the reported value must not be negative.
  */
 static void
 maxvalue_test(void)
 {
 	semid_t id;
 	int val;
 
 	if (ksem_init(&id, SEM_VALUE_MAX) < 0) {
 		fail_errno("ksem_init");
 		return;
 	}
 	if (ksem_getvalue(id, &val) < 0) {
 		fail_errno("ksem_getvalue");
 		ksem_destroy(id);
 		return;
 	}
 	if (val != SEM_VALUE_MAX) {
 		/* The %d conversion needs its argument. */
 		fail_err("value %d != SEM_VALUE_MAX", val);
 		ksem_destroy(id);
 		return;
 	}
 	if (val < 0) {
 		fail_err("value < 0");
 		ksem_destroy(id);
 		return;
 	}
 	if (ksem_destroy(id) < 0) {
 		fail_errno("ksem_destroy");
 		return;
 	}
 	pass();
 }
 TEST(maxvalue_test, "get value of SEM_VALUE_MAX semaphore");
 
 /*
  * Posting a semaphore that already holds SEM_VALUE_MAX must fail with
  * EOVERFLOW.  The semaphore is destroyed afterwards.
  */
 static void
 maxvalue_post_test(void)
 {
 	semid_t sem;
 	int rv;
 
 	rv = ksem_init(&sem, SEM_VALUE_MAX);
 	if (rv < 0) {
 		fail_errno("ksem_init");
 		return;
 	}
 	ksem_post_should_fail(sem, EOVERFLOW);
 	ksem_destroy(sem);
 }
 TEST(maxvalue_post_test, "post SEM_VALUE_MAX semaphore");
 
 /*
  * Destroying an unnamed semaphore while another process is blocked on
  * it must fail with EBUSY.
  *
  * A child is forked to block in ksem_wait(); the parent polls the
  * child's state through kvm until it is seen sleeping on the semaphore,
  * then attempts the destroy.  On every path taken after the fork the
  * child is released with a post and reaped before the semaphore is
  * finally destroyed.
  */
 static void
 busy_destroy_test(void)
 {
 	char errbuf[_POSIX2_LINE_MAX];
 	struct kinfo_proc *kp;
 	semid_t id;
 	pid_t pid;
 	kvm_t *kd;
 	int count;
 
 	kd = kvm_openfiles(NULL, "/dev/null", NULL, O_RDONLY, errbuf);
 	if (kd == NULL) {
 		fail_err("kvm_openfiles: %s", errbuf);
 		return;
 	}
 
 	if (ksem_init(&id, 0) < 0) {
 		fail_errno("ksem_init");
 		kvm_close(kd);
 		return;
 	}
 
 	pid = fork();
 	switch (pid) {
 	case -1:
 		/* Error. */
 		fail_errno("fork");
 		ksem_destroy(id);
 		kvm_close(kd);
 		return;
 	case 0:
 		/* Child. */
 		ksem_wait(id);
 		exit(0);
 	}
 
 	/*
 	 * Wait for the child process to block on the semaphore.  This
 	 * is a bit gross.
 	 */
 	for (;;) {
 		kp = kvm_getprocs(kd, KERN_PROC_PID, pid, &count);
 		if (kp == NULL) {
 			fail_err("kvm_getprocs: %s", kvm_geterr(kd));
 			kvm_close(kd);
 			/* Still release and reap the child below. */
 			goto cleanup;
 		}
 		if (kp->ki_stat == SSLEEP &&
 		    (strcmp(kp->ki_wmesg, "sem") == 0 ||
 		    strcmp(kp->ki_wmesg, "ksem") == 0))
 			break;
 		usleep(1000);
 	}
 	kvm_close(kd);
 
 	ksem_destroy_should_fail(id, EBUSY);
 
 cleanup:
 	/* Cleanup: wake the child, wait for it to exit, then destroy. */
 	ksem_post(id);
 	waitpid(pid, NULL, 0);
 	ksem_destroy(id);
 }
 TEST(busy_destroy_test, "destroy unnamed semaphore with waiter");
 
 /*
  * Child body: create up to max + 1 unnamed semaphores, where 'max' is
  * passed by value in 'arg'.  Returns CSTAT(0, 0) once ksem_init() fails
  * with ENOSPC, CSTAT(1, errno) if it fails for any other reason, and
  * CSTAT(2, 0) if every creation succeeded.  The semaphores are never
  * destroyed here.
  */
 static int
 exhaust_unnamed_child(void *arg)
 {
 	semid_t sem;
 	int created, max;
 
 	max = (intptr_t)arg;
 	created = 0;
 	while (created < max + 1) {
 		if (ksem_init(&sem, 1) < 0) {
 			if (errno != ENOSPC)
 				return (CSTAT(1, errno));
 			return (CSTAT(0, 0));
 		}
 		created++;
 	}
 	return (CSTAT(2, 0));
 }
 
 /*
  * Read the p1003_1b.sem_nsems_max sysctl and check, in a child process,
  * that no more than that many unnamed semaphores can be created.
  */
 static void
 exhaust_unnamed_sems(void)
 {
 	size_t len;
 	int nsems_max, stat, cls;
 
 	len = sizeof(nsems_max);
 	if (sysctlbyname("p1003_1b.sem_nsems_max", &nsems_max, &len, NULL, 0) <
 	    0) {
 		fail_errno("sysctl(p1003_1b.sem_nsems_max)");
 		return;
 	}
 
 	if (child_worker(exhaust_unnamed_child, (void *)(uintptr_t)nsems_max,
 	    &stat) != 0)
 		return;
 
 	/* Decode the status reported by the child. */
 	errno = CSTAT_ERROR(stat);
 	cls = CSTAT_CLASS(stat);
 	if (cls == 0) {
 		pass();
 	} else if (cls == 1) {
 		fail_errno("ksem_init");
 	} else if (cls == 2) {
 		fail_err("Limit of %d semaphores not enforced", nsems_max);
 	} else {
 		fail_err("bad child state %#x", stat);
 	}
 }
 TEST(exhaust_unnamed_sems, "exhaust unnamed semaphores (1)");
 
 /*
  * Child body: create up to max + 1 named semaphores called
  * TEST_PATH "0", TEST_PATH "1", ..., where 'max' is passed by value in
  * 'arg'.  Returns CSTAT(0, 0) once ksem_open() fails with ENOSPC,
  * EMFILE or ENFILE, CSTAT(1, errno) if it fails for any other reason,
  * and CSTAT(2, 0) if every creation succeeded.  The semaphores are
  * neither closed nor unlinked here.
  */
 static int
 exhaust_named_child(void *arg)
 {
 	char buffer[64];
 	semid_t id;
 	int i, max;
 
 	max = (intptr_t)arg;
 	for (i = 0; i < max + 1; i++) {
 		snprintf(buffer, sizeof(buffer), "%s%d", TEST_PATH, i);
 		if (ksem_open(&id, buffer, O_CREAT, 0777, 1) < 0) {
 			if (errno == ENOSPC || errno == EMFILE ||
 			    errno == ENFILE)
 				return (CSTAT(0, 0));
 			return (CSTAT(1, errno));
 		}
 	}
 	/*
 	 * Nothing failed, so errno holds no meaningful value here; report
 	 * a zero error like exhaust_unnamed_child() does.
 	 */
 	return (CSTAT(2, 0));
 }
 
 /*
  * Read the p1003_1b.sem_nsems_max sysctl and check, in a child process,
  * that no more than that many named semaphores can be created.  The
  * names the child may have created are unlinked afterwards.
  */
 static void
 exhaust_named_sems(void)
 {
 	char name[64];
 	size_t len;
 	int n, nsems_max, stat, cls;
 
 	len = sizeof(nsems_max);
 	if (sysctlbyname("p1003_1b.sem_nsems_max", &nsems_max, &len, NULL, 0) <
 	    0) {
 		fail_errno("sysctl(p1003_1b.sem_nsems_max)");
 		return;
 	}
 
 	if (child_worker(exhaust_named_child, (void *)(uintptr_t)nsems_max,
 	    &stat) < 0)
 		return;
 
 	/* Decode the status reported by the child. */
 	errno = CSTAT_ERROR(stat);
 	cls = CSTAT_CLASS(stat);
 	if (cls == 0) {
 		pass();
 	} else if (cls == 1) {
 		fail_errno("ksem_open");
 	} else if (cls == 2) {
 		fail_err("Limit of %d semaphores not enforced", nsems_max);
 	} else {
 		fail_err("bad child state %#x", stat);
 	}
 
 	/* Remove every name the child could have created. */
 	n = 0;
 	while (n < nsems_max + 1) {
 		snprintf(name, sizeof(name), "%s%d", TEST_PATH, n);
 		ksem_unlink(name);
 		n++;
 	}
 }
 TEST(exhaust_named_sems, "exhaust named semaphores (1)");
 
 /*
  * Lower the soft RLIMIT_NOFILE limit of the calling process to the
  * integer passed by value in 'arg'.  Returns 0 on success,
  * CSTAT(3, errno) if getrlimit() fails and CSTAT(4, errno) if
  * setrlimit() fails.
  */
 static int
 fdlimit_set(void *arg)
 {
 	struct rlimit limits;
 	int wanted;
 
 	wanted = (intptr_t)arg;
 	if (getrlimit(RLIMIT_NOFILE, &limits) < 0)
 		return (CSTAT(3, errno));
 
 	/* Only the soft limit is changed; the hard limit stays as read. */
 	limits.rlim_cur = wanted;
 	if (setrlimit(RLIMIT_NOFILE, &limits) < 0)
 		return (CSTAT(4, errno));
 	return (0);
 }
 
 /*
  * Child body: apply the descriptor limit carried in 'arg' with
  * fdlimit_set() and, if that worked, run exhaust_unnamed_child() with
  * the same argument.  Returns the status of whichever step ran last.
  */
 static int
 fdlimit_unnamed_child(void *arg)
 {
 	int stat;
 
 	stat = fdlimit_set(arg);
 	if (stat != 0)
 		return (stat);
 	return (exhaust_unnamed_child(arg));
 }
 
 /*
  * Check, in a child process whose RLIMIT_NOFILE soft limit has been
  * lowered to 10, that the creation of unnamed semaphores is cut off
  * within 11 attempts.
  *
  * Child status classes: 0 success, 1 ksem_init() failed unexpectedly,
  * 2 limit not enforced, 3 getrlimit() failed, 4 setrlimit() failed.
  */
 static void
 fdlimit_unnamed_sems(void)
 {
 	int nsems_max, stat;
 
 	nsems_max = 10;
 	if (child_worker(fdlimit_unnamed_child, (void *)(uintptr_t)nsems_max,
 	    &stat) < 0)
 		return;
 	errno = CSTAT_ERROR(stat);
 	switch (CSTAT_CLASS(stat)) {
 	case 0:
 		pass();
 		break;
 	case 1:
 		fail_errno("ksem_init");
 		break;
 	case 2:
 		fail_err("Limit of %d semaphores not enforced", nsems_max);
 		break;
 	case 3:
 		fail_errno("getrlimit");
 		break;
 	case 4:
 		/* Class 4 is produced by the setrlimit() call. */
 		fail_errno("setrlimit");
 		break;
 	default:
 		fail_err("bad child state %#x", stat);
 		break;
 	}
 }
 TEST(fdlimit_unnamed_sems, "exhaust unnamed semaphores (2)");
 
 /*
  * Child body: apply the descriptor limit carried in 'arg' with
  * fdlimit_set() and, if that worked, run exhaust_named_child() with the
  * same argument.  Returns the status of whichever step ran last.
  */
 static int
 fdlimit_named_child(void *arg)
 {
 	int stat;
 
 	stat = fdlimit_set(arg);
 	if (stat != 0)
 		return (stat);
 	return (exhaust_named_child(arg));
 }
 
 /*
  * Check, in a child process whose RLIMIT_NOFILE soft limit has been
  * lowered to 10, that the creation of named semaphores is cut off
  * within 11 attempts.  The names the child may have created are
  * unlinked afterwards.
  *
  * Child status classes: 0 success, 1 ksem_open() failed unexpectedly,
  * 2 limit not enforced, 3 getrlimit() failed, 4 setrlimit() failed.
  */
 static void
 fdlimit_named_sems(void)
 {
 	char buffer[64];
 	int i, nsems_max, stat;
 
 	nsems_max = 10;
 	if (child_worker(fdlimit_named_child, (void *)(uintptr_t)nsems_max,
 	    &stat) < 0)
 		return;
 	errno = CSTAT_ERROR(stat);
 	switch (CSTAT_CLASS(stat)) {
 	case 0:
 		pass();
 		break;
 	case 1:
 		fail_errno("ksem_open");
 		break;
 	case 2:
 		fail_err("Limit of %d semaphores not enforced", nsems_max);
 		break;
 	case 3:
 		fail_errno("getrlimit");
 		break;
 	case 4:
 		/* Class 4 is produced by the setrlimit() call. */
 		fail_errno("setrlimit");
 		break;
 	default:
 		fail_err("bad child state %#x", stat);
 		break;
 	}
 
 	/* Cleanup any semaphores created by the child. */
 	for (i = 0; i < nsems_max + 1; i++) {
 		snprintf(buffer, sizeof(buffer), "%s%d", TEST_PATH, i);
 		ksem_unlink(buffer);
 	}
 }
 TEST(fdlimit_named_sems, "exhaust named semaphores (2)");
 
 /*
  * Ignore SIGSYS, run every registered test via run_tests() and exit
  * with status 0.  The command-line arguments are not used.
  */
 int
 main(int argc, char *argv[])
 {
 	signal(SIGSYS, SIG_IGN);
 
 	run_tests();
 
 	return (0);
 }
Index: head/tools/regression/sockets/udp_pingpong/udp_pingpong.c
===================================================================
--- head/tools/regression/sockets/udp_pingpong/udp_pingpong.c	(revision 336913)
+++ head/tools/regression/sockets/udp_pingpong/udp_pingpong.c	(revision 336914)
@@ -1,651 +1,626 @@
 /*-
  * Copyright (c) 2017 Maksym Sobolyev <sobomax@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * The test that setups two processes A and B and make A sending
  * B UDP packet(s) and B send it back. The time of sending is recorded
  * in the payload and time of the arrival is either determined by
  * reading clock after recv() completes or using kernel-supplied
  * via recvmsg(). End-to-end time t(A->B->A) is then calculated
  * and compared against time for both t(A->B) + t(B->A) to make
  * sure it makes sense.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/types.h>
 #include <sys/socket.h>
 #include <sys/wait.h>
 #include <sys/time.h>
 #include <netinet/in.h>
 #include <arpa/inet.h>
 #include <err.h>
 #include <poll.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <strings.h>
 #include <time.h>
 #include <unistd.h>
 
 #define	NPKTS		1000
 #define	PKT_SIZE	128
 /* Timeout to receive pong on the side A, 100ms */
 #define SRECV_TIMEOUT	(1 * 100)
 /*
  * Timeout to receive ping on the side B. 4x as large as on the side A,
  * so that in the case of packet loss the side A will have a chance to
  * realize that and send few more before B bails out.
  */
 #define RRECV_TIMEOUT	(SRECV_TIMEOUT * 4)
 #define MIN_NRECV	((NPKTS * 99) / 100) /* 99% */
 
 //#define	SIMULATE_PLOSS
 
 struct trip_ts {
     struct timespec sent;
     struct timespec recvd;
 };
 
 struct test_pkt {
     int pnum;
     struct trip_ts tss[2];
     int lost;
     unsigned char data[PKT_SIZE];
 };
 
 struct test_ctx {
     const char *name;
     int fds[2];
     struct pollfd pfds[2];
     union {
         struct sockaddr_in v4;
         struct sockaddr_in6 v6;
     } sin[2];
     struct test_pkt test_pkts[NPKTS];
     int nsent;
     int nrecvd;
     clockid_t clock;
     int use_recvmsg;
     int ts_type;
 };
 
 struct rtt {
     struct timespec a2b;
     struct timespec b2a;
     struct timespec e2e;
     struct timespec a2b_b2a;
 };
 
 #define SEC(x)		((x)->tv_sec)
 #define NSEC(x)		((x)->tv_nsec)
 #define NSEC_MAX	1000000000L
 #define NSEC_IN_USEC	1000L
 
-#define timespecsub2(r, v, u)                                      \
-    do {                                                           \
-        SEC(r) = SEC(v) - SEC(u);                                  \
-        NSEC(r) = NSEC(v) - NSEC(u);                               \
-        if (NSEC(r) < 0 && (SEC(r) > 0 || NSEC(r) <= -NSEC_MAX)) { \
-            SEC(r)--;                                              \
-            NSEC(r) += NSEC_MAX;                                   \
-        }                                                          \
-    } while (0);
-
-#define timespecadd2(r, v, u)                                      \
-    do {                                                           \
-        SEC(r) = SEC(v) + SEC(u);                                  \
-        NSEC(r) = NSEC(v) + NSEC(u);                               \
-        if (NSEC(r) >= NSEC_MAX) {                                 \
-            SEC(r)++;                                              \
-            NSEC(r) -= NSEC_MAX;                                   \
-        }                                                          \
-    } while (0);
-
-#define timespeccmp(t, c, u)                                       \
-    ((SEC(t) == SEC(u)) ?                                          \
-      (NSEC(t) c NSEC(u)) :                                        \
-      (SEC(t) c SEC(u)))
-
 #define timeval2timespec(tv, ts)                                   \
     do {                                                           \
         SEC(ts) = (tv)->tv_sec;                                    \
         NSEC(ts) = (tv)->tv_usec * NSEC_IN_USEC;                   \
     } while (0);
 
 static const struct timespec zero_ts;
 /* 0.01s, should be more than enough for the loopback communication  */
 static const struct timespec max_ts = {.tv_nsec = (NSEC_MAX / 100)};
 
 enum ts_types {TT_TIMESTAMP = -2, TT_BINTIME = -1,
   TT_REALTIME_MICRO = SO_TS_REALTIME_MICRO, TT_TS_BINTIME = SO_TS_BINTIME,
   TT_REALTIME = SO_TS_REALTIME, TT_MONOTONIC = SO_TS_MONOTONIC};
 
 static clockid_t
 get_clock_type(struct test_ctx *tcp)
 {
     switch (tcp->ts_type) {
     case TT_TIMESTAMP:
     case TT_BINTIME:
     case TT_REALTIME_MICRO:
     case TT_TS_BINTIME:
     case TT_REALTIME:
         return (CLOCK_REALTIME);
 
     case TT_MONOTONIC:
         return (CLOCK_MONOTONIC);
     }
     abort();
 }
 
 static int
 get_scm_type(struct test_ctx *tcp)
 {
     switch (tcp->ts_type) {
     case TT_TIMESTAMP:
     case TT_REALTIME_MICRO:
         return (SCM_TIMESTAMP);
 
     case TT_BINTIME:
     case TT_TS_BINTIME:
         return (SCM_BINTIME);
 
     case TT_REALTIME:
         return (SCM_REALTIME);
 
     case TT_MONOTONIC:
         return (SCM_MONOTONIC);
     }
     abort();
 }
 
 static size_t
 get_scm_size(struct test_ctx *tcp)
 {
     switch (tcp->ts_type) {
     case TT_TIMESTAMP:
     case TT_REALTIME_MICRO:
         return (sizeof(struct timeval));
 
     case TT_BINTIME:
     case TT_TS_BINTIME:
         return (sizeof(struct bintime));
 
     case TT_REALTIME:
     case TT_MONOTONIC:
         return (sizeof(struct timespec));
     }
     abort();
 }
 
 static void
 setup_ts_sockopt(struct test_ctx *tcp, int fd)
 {
     int rval, oname1, oname2, sval1, sval2;
 
     oname1 = SO_TIMESTAMP;
     oname2 = -1;
     sval2 = -1;
 
     switch (tcp->ts_type) {
     case TT_REALTIME_MICRO:
     case TT_TS_BINTIME:
     case TT_REALTIME:
     case TT_MONOTONIC:
         oname2 = SO_TS_CLOCK;
         sval2 = tcp->ts_type;
         break;
 
     case TT_TIMESTAMP:
         break;
 
     case TT_BINTIME:
         oname1 = SO_BINTIME;
         break;
 
     default:
         abort();
     }
 
     sval1 = 1;
     rval = setsockopt(fd, SOL_SOCKET, oname1, &sval1,
       sizeof(sval1));
     if (rval != 0) {
         err(1, "%s: setup_udp: setsockopt(%d, %d, 1)", tcp->name,
           fd, oname1);
     }
     if (oname2 == -1)
         return;
     rval = setsockopt(fd, SOL_SOCKET, oname2, &sval2,
       sizeof(sval2));
     if (rval != 0) {
         err(1, "%s: setup_udp: setsockopt(%d, %d, %d)",
           tcp->name, fd, oname2, sval2);
     }
 }
 
 
 static void
 setup_udp(struct test_ctx *tcp)
 {
     int i;
     socklen_t sin_len, af_len;
 
     af_len = sizeof(tcp->sin[0].v4);
     for (i = 0; i < 2; i++) {
         tcp->sin[i].v4.sin_len = af_len;
         tcp->sin[i].v4.sin_family = AF_INET;
         tcp->sin[i].v4.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
         tcp->fds[i] = socket(PF_INET, SOCK_DGRAM, 0);
         if (tcp->fds[i] < 0)
             err(1, "%s: setup_udp: socket", tcp->name);
         if (bind(tcp->fds[i], (struct sockaddr *)&tcp->sin[i], af_len) < 0)
             err(1, "%s: setup_udp: bind(%s, %d)", tcp->name,
               inet_ntoa(tcp->sin[i].v4.sin_addr), 0);
         sin_len = af_len;
         if (getsockname(tcp->fds[i], (struct sockaddr *)&tcp->sin[i], &sin_len) < 0)
             err(1, "%s: setup_udp: getsockname(%d)", tcp->name, tcp->fds[i]);
         if (tcp->use_recvmsg != 0) {
             setup_ts_sockopt(tcp, tcp->fds[i]);
         }
 
         tcp->pfds[i].fd = tcp->fds[i];
         tcp->pfds[i].events = POLLIN;
     }
 
     if (connect(tcp->fds[0], (struct sockaddr *)&tcp->sin[1], af_len) < 0)
         err(1, "%s: setup_udp: connect(%s, %d)", tcp->name,
           inet_ntoa(tcp->sin[1].v4.sin_addr), ntohs(tcp->sin[1].v4.sin_port));
     if (connect(tcp->fds[1], (struct sockaddr *)&tcp->sin[0], af_len) < 0)
         err(1, "%s: setup_udp: connect(%s, %d)", tcp->name,
           inet_ntoa(tcp->sin[0].v4.sin_addr), ntohs(tcp->sin[0].v4.sin_port));
 }
 
 static char *
 inet_ntoa6(const void *sin6_addr)
 {
     static char straddr[INET6_ADDRSTRLEN];
 
     inet_ntop(AF_INET6, sin6_addr, straddr, sizeof(straddr));
     return (straddr);
 }
 
 static void
 setup_udp6(struct test_ctx *tcp)
 {
     int i;
     socklen_t sin_len, af_len;
 
     af_len = sizeof(tcp->sin[0].v6);
     for (i = 0; i < 2; i++) {
         tcp->sin[i].v6.sin6_len = af_len;
         tcp->sin[i].v6.sin6_family = AF_INET6;
         tcp->sin[i].v6.sin6_addr = in6addr_loopback;
         tcp->fds[i] = socket(PF_INET6, SOCK_DGRAM, 0);
         if (tcp->fds[i] < 0)
             err(1, "%s: setup_udp: socket", tcp->name);
         if (bind(tcp->fds[i], (struct sockaddr *)&tcp->sin[i], af_len) < 0)
             err(1, "%s: setup_udp: bind(%s, %d)", tcp->name,
               inet_ntoa6(&tcp->sin[i].v6.sin6_addr), 0);
         sin_len = af_len;
         if (getsockname(tcp->fds[i], (struct sockaddr *)&tcp->sin[i], &sin_len) < 0)
             err(1, "%s: setup_udp: getsockname(%d)", tcp->name, tcp->fds[i]);
         if (tcp->use_recvmsg != 0) {
             setup_ts_sockopt(tcp, tcp->fds[i]);
         }
 
         tcp->pfds[i].fd = tcp->fds[i];
         tcp->pfds[i].events = POLLIN;
     }
 
     if (connect(tcp->fds[0], (struct sockaddr *)&tcp->sin[1], af_len) < 0)
         err(1, "%s: setup_udp: connect(%s, %d)", tcp->name,
           inet_ntoa6(&tcp->sin[1].v6.sin6_addr),
           ntohs(tcp->sin[1].v6.sin6_port));
     if (connect(tcp->fds[1], (struct sockaddr *)&tcp->sin[0], af_len) < 0)
         err(1, "%s: setup_udp: connect(%s, %d)", tcp->name,
           inet_ntoa6(&tcp->sin[0].v6.sin6_addr),
           ntohs(tcp->sin[0].v6.sin6_port));
 }
 
 static void
 teardown_udp(struct test_ctx *tcp)
 {
 
     close(tcp->fds[0]);
     close(tcp->fds[1]);
 }
 
 static void
 send_pkt(struct test_ctx *tcp, int pnum, int fdidx, const char *face)
 {
     ssize_t r;
     size_t slen;
 
     slen = sizeof(tcp->test_pkts[pnum]);
     clock_gettime(get_clock_type(tcp), &tcp->test_pkts[pnum].tss[fdidx].sent);
     r = send(tcp->fds[fdidx], &tcp->test_pkts[pnum], slen, 0);
     if (r < 0) {
         err(1, "%s: %s: send(%d)", tcp->name, face, tcp->fds[fdidx]);
     }
     if (r < (ssize_t)slen) {
         errx(1, "%s: %s: send(%d): short send", tcp->name, face,
           tcp->fds[fdidx]);
     }
     tcp->nsent += 1;
 }
 
 #define PDATA(tcp, i) ((tcp)->test_pkts[(i)].data)
 
 static void
 hdr_extract_ts(struct test_ctx *tcp, struct msghdr *mhp, struct timespec *tp)
 {
     int scm_type;
     size_t scm_size;
     union {
         struct timespec ts;
         struct bintime bt;
         struct timeval tv;
     } tdata;
     struct cmsghdr *cmsg;
 
     scm_type = get_scm_type(tcp);
     scm_size = get_scm_size(tcp);
     for (cmsg = CMSG_FIRSTHDR(mhp); cmsg != NULL;
       cmsg = CMSG_NXTHDR(mhp, cmsg)) {
         if ((cmsg->cmsg_level == SOL_SOCKET) &&
           (cmsg->cmsg_type == scm_type)) {
             memcpy(&tdata, CMSG_DATA(cmsg), scm_size);
             break;
         }
     }
     if (cmsg == NULL) {
         abort();
     }
     switch (tcp->ts_type) {
     case TT_REALTIME:
     case TT_MONOTONIC:
         *tp = tdata.ts;
         break;
 
     case TT_TIMESTAMP:
     case TT_REALTIME_MICRO:
         timeval2timespec(&tdata.tv, tp);
         break;
 
     case TT_BINTIME:
     case TT_TS_BINTIME:
         bintime2timespec(&tdata.bt, tp);
         break;
 
     default:
         abort();
     }
 }
 
 static void
 recv_pkt_recvmsg(struct test_ctx *tcp, int fdidx, const char *face, void *buf,
   size_t rlen, struct timespec *tp)
 {
     /* We use a union to make sure hdr is aligned */
     union {
         struct cmsghdr hdr;
         unsigned char buf[CMSG_SPACE(1024)];
     } cmsgbuf;
     struct msghdr msg;
     struct iovec iov;
     ssize_t rval;
 
     memset(&msg, '\0', sizeof(msg));
     iov.iov_base = buf;
     iov.iov_len = rlen;
     msg.msg_iov = &iov;
     msg.msg_iovlen = 1;
     msg.msg_control = cmsgbuf.buf;
     msg.msg_controllen = sizeof(cmsgbuf.buf);
 
     rval = recvmsg(tcp->fds[fdidx], &msg, 0);
     if (rval < 0) {
         err(1, "%s: %s: recvmsg(%d)", tcp->name, face, tcp->fds[fdidx]);
     }
     if (rval < (ssize_t)rlen) {
         errx(1, "%s: %s: recvmsg(%d): short recv", tcp->name, face,
           tcp->fds[fdidx]);
     }
 
     hdr_extract_ts(tcp, &msg, tp);
 }
 
 static void
 recv_pkt_recv(struct test_ctx *tcp, int fdidx, const char *face, void *buf,
   size_t rlen, struct timespec *tp)
 {
     ssize_t rval;
 
     rval = recv(tcp->fds[fdidx], buf, rlen, 0);
     clock_gettime(get_clock_type(tcp), tp);
     if (rval < 0) {
         err(1, "%s: %s: recv(%d)", tcp->name, face, tcp->fds[fdidx]);
     }
     if (rval < (ssize_t)rlen) {
         errx(1, "%s: %s: recv(%d): short recv", tcp->name, face,
             tcp->fds[fdidx]);
     }
 }
 
 static int
 recv_pkt(struct test_ctx *tcp, int fdidx, const char *face, int tout)
 {
     int pr;
     struct test_pkt recv_buf;
     size_t rlen;
 
     pr = poll(&tcp->pfds[fdidx], 1, tout);
     if (pr < 0) {
         err(1, "%s: %s: poll(%d)", tcp->name, face, tcp->fds[fdidx]);
     }
     if (pr == 0) {
         return (-1);
     }
     if(tcp->pfds[fdidx].revents != POLLIN) {
         errx(1, "%s: %s: poll(%d): unexpected result", tcp->name, face,
           tcp->fds[fdidx]);
     }
     rlen = sizeof(recv_buf);
     if (tcp->use_recvmsg == 0) {
         recv_pkt_recv(tcp, fdidx, face, &recv_buf, rlen,
           &recv_buf.tss[fdidx].recvd);
     } else {
         recv_pkt_recvmsg(tcp, fdidx, face, &recv_buf, rlen,
           &recv_buf.tss[fdidx].recvd);
     }
     if (recv_buf.pnum < 0 || recv_buf.pnum >= NPKTS ||
       memcmp(recv_buf.data, PDATA(tcp, recv_buf.pnum), PKT_SIZE) != 0) {
         errx(1, "%s: %s: recv(%d): corrupted data, packet %d", tcp->name,
           face, tcp->fds[fdidx], recv_buf.pnum);
     }
     tcp->nrecvd += 1;
     memcpy(tcp->test_pkts[recv_buf.pnum].tss, recv_buf.tss,
       sizeof(recv_buf.tss));
     tcp->test_pkts[recv_buf.pnum].lost = 0;
     return (recv_buf.pnum);
 }
 
 static void
 test_server(struct test_ctx *tcp)
 {
     int i, j;
 
     for (i = 0; i < NPKTS; i++) {
         send_pkt(tcp, i, 0, __FUNCTION__);
         j = recv_pkt(tcp, 0, __FUNCTION__, SRECV_TIMEOUT);
         if (j < 0) {
             warnx("packet %d is lost", i);
             /* timeout */
             continue;
         }
     }
 }
 
 static void
 test_client(struct test_ctx *tcp)
 {
     int i, j;
 
     for (i = 0; i < NPKTS; i++) {
         j = recv_pkt(tcp, 1, __FUNCTION__, RRECV_TIMEOUT);
         if (j < 0) {
             /* timeout */
             return;
         }
 #if defined(SIMULATE_PLOSS)
         if ((i % 99) == 0) {
             warnx("dropping packet %d", i);
             continue;
         }
 #endif
         send_pkt(tcp, j, 1, __FUNCTION__);
     }
 }
 
 static void
 calc_rtt(struct test_pkt *tpp, struct rtt *rttp)
 {
 
-    timespecsub2(&rttp->a2b, &tpp->tss[1].recvd, &tpp->tss[0].sent);
-    timespecsub2(&rttp->b2a, &tpp->tss[0].recvd, &tpp->tss[1].sent);
-    timespecadd2(&rttp->a2b_b2a, &rttp->a2b, &rttp->b2a);
-    timespecsub2(&rttp->e2e, &tpp->tss[0].recvd, &tpp->tss[0].sent);
+    timespecsub(&tpp->tss[1].recvd, &tpp->tss[0].sent, &rttp->a2b);
+    timespecsub(&tpp->tss[0].recvd, &tpp->tss[1].sent, &rttp->b2a);
+    timespecadd(&rttp->a2b, &rttp->b2a, &rttp->a2b_b2a);
+    timespecsub(&tpp->tss[0].recvd, &tpp->tss[0].sent, &rttp->e2e);
 }
 
 static void
 test_run(int ts_type, int use_ipv6, int use_recvmsg, const char *name)
 {
     struct test_ctx test_ctx;
     pid_t pid, cpid;
     int i, j, status;
 
     printf("Testing %s via %s: ", name, (use_ipv6 == 0) ? "IPv4" : "IPv6");
     fflush(stdout);
     bzero(&test_ctx, sizeof(test_ctx));
     test_ctx.name = name;
     test_ctx.use_recvmsg = use_recvmsg;
     test_ctx.ts_type = ts_type;
     if (use_ipv6 == 0) {
         setup_udp(&test_ctx);
     } else {
         setup_udp6(&test_ctx);
     }
     for (i = 0; i < NPKTS; i++) {
         test_ctx.test_pkts[i].pnum = i;
         test_ctx.test_pkts[i].lost = 1;
         for (j = 0; j < PKT_SIZE; j++) {
             test_ctx.test_pkts[i].data[j] = (unsigned char)random();
         }
     }
     cpid = fork();
     if (cpid < 0) {
         err(1, "%s: fork()", test_ctx.name);
     }
     if (cpid == 0) {
         test_client(&test_ctx);
         exit(0);
     }
     test_server(&test_ctx);
     pid = waitpid(cpid, &status, 0);
     if (pid == (pid_t)-1) {
         err(1, "%s: waitpid(%d)", test_ctx.name, cpid);
     }
 
     if (WIFEXITED(status)) {
         if (WEXITSTATUS(status) != EXIT_SUCCESS) {
             errx(1, "client exit status is %d",
               WEXITSTATUS(status));
         }
     } else {
         if (WIFSIGNALED(status))
             errx(1, "abnormal termination of client, signal %d%s",
               WTERMSIG(status), WCOREDUMP(status) ?
               " (core file generated)" : "");
         else
             errx(1, "termination of client, unknown status");
     }
     if (test_ctx.nrecvd < MIN_NRECV) {
         errx(1, "packet loss is too high %d received out of %d, min %d",
           test_ctx.nrecvd, test_ctx.nsent, MIN_NRECV);
     }
     for (i = 0; i < NPKTS; i++) {
         struct rtt rtt;
         if (test_ctx.test_pkts[i].lost != 0) {
             continue;
         }
         calc_rtt(&test_ctx.test_pkts[i], &rtt);
-        if (!timespeccmp(&rtt.e2e, >, &rtt.a2b_b2a))
+        if (!timespeccmp(&rtt.e2e, &rtt.a2b_b2a, >))
             errx(1, "end-to-end trip time is too small");
-        if (!timespeccmp(&rtt.e2e, <, &max_ts))
+        if (!timespeccmp(&rtt.e2e, &max_ts, <))
             errx(1, "end-to-end trip time is too large");
-        if (!timespeccmp(&rtt.a2b, >, &zero_ts))
+        if (!timespeccmp(&rtt.a2b, &zero_ts, >))
             errx(1, "A2B trip time is not positive");
-        if (!timespeccmp(&rtt.b2a, >, &zero_ts))
+        if (!timespeccmp(&rtt.b2a, &zero_ts, >))
             errx(1, "B2A trip time is not positive");
     }
     teardown_udp(&test_ctx);
 }
 
 int
 main(void)
 {
     int i;
 
     srandomdev();
 
     for (i = 0; i < 2; i++) {
         test_run(0, i, 0, "send()/recv()");
         printf("OK\n");
         test_run(TT_TIMESTAMP, i, 1,
           "send()/recvmsg(), setsockopt(SO_TIMESTAMP, 1)");
         printf("OK\n");
         if (i == 0) {
             test_run(TT_BINTIME, i, 1,
               "send()/recvmsg(), setsockopt(SO_BINTIME, 1)");
             printf("OK\n");
         }
         test_run(TT_REALTIME_MICRO, i, 1,
           "send()/recvmsg(), setsockopt(SO_TS_CLOCK, SO_TS_REALTIME_MICRO)");
         printf("OK\n");
         test_run(TT_TS_BINTIME, i, 1,
           "send()/recvmsg(), setsockopt(SO_TS_CLOCK, SO_TS_BINTIME)");
         printf("OK\n");
         test_run(TT_REALTIME, i, 1,
           "send()/recvmsg(), setsockopt(SO_TS_CLOCK, SO_TS_REALTIME)");
         printf("OK\n");
         test_run(TT_MONOTONIC, i, 1,
           "send()/recvmsg(), setsockopt(SO_TS_CLOCK, SO_TS_MONOTONIC)");
         printf("OK\n");
     }
     exit(0);
 }
Index: head/tools/regression/sockets/unix_cmsg/uc_check_time.c
===================================================================
--- head/tools/regression/sockets/unix_cmsg/uc_check_time.c	(revision 336913)
+++ head/tools/regression/sockets/unix_cmsg/uc_check_time.c	(revision 336914)
@@ -1,101 +1,87 @@
 /*-
  * Copyright (c) 2016 Maksym Sobolyev <sobomax@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/time.h>
 #include <time.h>
 
 #include "uc_check_time.h"
 
 static const struct timeval max_diff_tv = {.tv_sec = 1, .tv_usec = 0};
 static const struct timespec max_diff_ts = {.tv_sec = 1, .tv_nsec = 0};
 
-#define timespeccmp(tvp, uvp, cmp)                                      \
-	(((tvp)->tv_sec == (uvp)->tv_sec) ?                             \
-	    ((tvp)->tv_nsec cmp (uvp)->tv_nsec) :                       \
-	    ((tvp)->tv_sec cmp (uvp)->tv_sec))
-#define timespecsub(vvp, uvp)                                           \
-	do {                                                            \
-		(vvp)->tv_sec -= (uvp)->tv_sec;                         \
-		(vvp)->tv_nsec -= (uvp)->tv_nsec;                       \
-		if ((vvp)->tv_nsec < 0) {                               \
-			(vvp)->tv_sec--;                                \
-			(vvp)->tv_nsec += 1000000000;                   \
-		}                                                       \
-	} while (0)
-
 int
 uc_check_bintime(const struct bintime *mt)
 {
 	struct timespec bt;
 
 	bintime2timespec(mt, &bt);
 	return (uc_check_timespec_real(&bt));
 }
 
 int
 uc_check_timeval(const struct timeval *bt)
 {
 	struct timeval ct, dt;
 
 	if (gettimeofday(&ct, NULL) < 0)
 		return (-1);
 	timersub(&ct, bt, &dt);
 	if (!timercmp(&dt, &max_diff_tv, <))
 		return (-1);
 
 	return (0);
 }
 
 int
 uc_check_timespec_real(const struct timespec *bt)
 {
 	struct timespec ct;
 
 	if (clock_gettime(CLOCK_REALTIME, &ct) < 0)
 		return (-1);
-	timespecsub(&ct, bt);
+	timespecsub(&ct, bt, &ct);
 	if (!timespeccmp(&ct, &max_diff_ts, <))
 		return (-1);
 
 	return (0);
 }
 
 int
 uc_check_timespec_mono(const struct timespec *bt)
 {
 	struct timespec ct;
 
 	if (clock_gettime(CLOCK_MONOTONIC, &ct) < 0)
 		return (-1);
-	timespecsub(&ct, bt);
+	timespecsub(&ct, bt, &ct);
 	if (!timespeccmp(&ct, &max_diff_ts, <))
 		return (-1);
 
 	return (0);
 }
Index: head/tools/tools/netrate/juggle/juggle.c
===================================================================
--- head/tools/tools/netrate/juggle/juggle.c	(revision 336913)
+++ head/tools/tools/netrate/juggle/juggle.c	(revision 336914)
@@ -1,592 +1,579 @@
 /*-
  * Copyright (c) 2005 Robert N. M. Watson
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/types.h>
 #include <sys/socket.h>
 #include <sys/stdint.h>
 #include <sys/time.h>
 #include <sys/utsname.h>
 #include <sys/wait.h>
 
 #include <netinet/in.h>
 
 #include <err.h>
 #include <errno.h>
 #include <pthread.h>
 #include <signal.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
 
 /*
  * juggle is a simple IPC/context switch performance test, which works on
  * pairs of file descriptors of various types.  In various runs, it considers
  * the cost of bouncing a message synchronously across the descriptor pair,
  * either in the same thread, two different threads, or two different
  * processes.  Timing measurements for each series of I/O's are reported, but
  * the first measurement in each series discarded as "warmup" on the IPC
  * primitive.  Variations on the test permit for pipelining, or the insertion
  * of more than one packet into the stream at a time, intended to permit
  * greater parallelism, hopefully allowing performance numbers to reflect
  * use of available parallelism, and/or intelligence in context switching to
  * avoid premature switching when multiple messages are queued.
  */
 
 /*
  * The UDP test uses UDP over the loopback interface.  Two arbitrary but
  * fixed port numbers.
  */
 #define	UDP_PORT1	2020
 #define	UDP_PORT2	2021
 
 /*
  * Size of each message.  Must be smaller than the socket buffer or pipe
  * buffer maximum size, as we want to send it atomically without blocking.
  * If pipelining is in use, must be able to fit PIPELINE_MAX of these
  * messages into the send queue.
  */
 #define	MESSAGELEN	128
 
 /*
  * Number of message cycles -- into fd1, out of fd2, into fd2, and out of
  * fd1.  By counting in cycles, we allow the master thread or process to
  * perform timing without explicitly synchronizing with the secondary thread
  * or process.
  */
 #define	NUMCYCLES	1024
 
 /*
  * Number of times to run each test.
  */
 #define	LOOPS		10
 
 /*
  * Number of in-flight messages per cycle.  I adjusting this value, be
  * careful not to exceed the socket/etc buffer depth, or messages may be lost
  * or result in blocking.
  */
 #define	PIPELINE_MAX	4
 
-/*
- * As in all programs, steal timespecsub() from time.h.
- */
-#define timespecsub(vvp, uvp)                                           \
-        do {                                                            \
-                (vvp)->tv_sec -= (uvp)->tv_sec;                         \
-                (vvp)->tv_nsec -= (uvp)->tv_nsec;                       \
-                if ((vvp)->tv_nsec < 0) {                               \
-                        (vvp)->tv_sec--;                                \
-                        (vvp)->tv_nsec += 1000000000;                   \
-                }                                                       \
-        } while (0)
-
 static int
 udp_create(int *fd1p, int *fd2p)
 {
 	struct sockaddr_in sin1, sin2;
 	int sock1, sock2;
 
 	sock1 = socket(PF_INET, SOCK_DGRAM, 0);
 	if (sock1 == -1)
 		return (-1);
 
 	sock2 = socket(PF_INET, SOCK_DGRAM, 0);
 	if (sock2 == -1) {
 		close(sock1);
 		return (-1);
 	}
 
 	bzero(&sin1, sizeof(sin1));
 	sin1.sin_len = sizeof(sin1);
 	sin1.sin_family = AF_INET;
 	sin1.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
 	sin1.sin_port = htons(UDP_PORT1);
 
 	bzero(&sin2, sizeof(sin2));
 	sin2.sin_len = sizeof(sin2);
 	sin2.sin_family = AF_INET;
 	sin2.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
 	sin2.sin_port = htons(UDP_PORT2);
 
 	if (bind(sock1, (struct sockaddr *) &sin1, sizeof(sin1)) < 0) {
 		close(sock1);
 		close(sock2);
 		return (-1);
 	}
 
 	if (bind(sock2, (struct sockaddr *) &sin2, sizeof(sin2)) < 0) {
 		close(sock1);
 		close(sock2);
 		return (-1);
 	}
 
 	if (connect(sock1, (struct sockaddr *) &sin2, sizeof(sin2)) < 0) {
 		close(sock1);
 		close(sock2);
 		return (-1);
 	}
 
 	if (connect(sock2, (struct sockaddr *) &sin1, sizeof(sin1)) < 0) {
 		close(sock1);
 		close(sock2);
 		return (-1);
 	}
 
 	*fd1p = sock1;
 	*fd2p = sock2;
 
 	return (0);
 }
 
 static int
 pipe_create(int *fd1p, int *fd2p)
 {
 	int fds[2];
 
 	if (pipe(fds) < 0)
 		return (-1);
 
 	*fd1p = fds[0];
 	*fd2p = fds[1];
 
 	return (0);
 }
 
 static int
 socketpairdgram_create(int *fd1p, int *fd2p)
 {
 	int fds[2];
 
 	if (socketpair(PF_LOCAL, SOCK_DGRAM, 0, fds) < 0)
 		return (-1);
 
 	*fd1p = fds[0];
 	*fd2p = fds[1];
 
 	return (0);
 }
 
 static int
 socketpairstream_create(int *fd1p, int *fd2p)
 {
 	int fds[2];
 
 	if (socketpair(PF_LOCAL, SOCK_STREAM, 0, fds) < 0)
 		return (-1);
 
 	*fd1p = fds[0];
 	*fd2p = fds[1];
 
 	return (0);
 }
 
 static int
 message_send(int s)
 {
 	u_char buffer[MESSAGELEN];
 	ssize_t len;
 
 	bzero(buffer, sizeof(buffer));
 
 	len = write(s, buffer, sizeof(buffer));
 	if (len == -1)
 		return (-1);
 	if (len != sizeof(buffer)) {
 		errno = EMSGSIZE;
 		return (-1);
 	}
 	return (0);
 }
 
 static int
 message_recv(int s)
 {
 	u_char buffer[MESSAGELEN];
 	ssize_t len;
 
 	len = read(s, buffer, sizeof(buffer));
 	if (len == -1)
 		return (-1);
 	if (len != sizeof(buffer)) {
 		errno = EMSGSIZE;
 		return (-1);
 	}
 	return (0);
 }
 
 /*
  * Juggle messages between two file descriptors in a single thread/process,
  * so simply a measure of IPC performance.
  */
 static struct timespec
 juggle(int fd1, int fd2, int pipeline)
 {
 	struct timespec tstart, tfinish;
 	int i, j;
 
 	if (clock_gettime(CLOCK_REALTIME, &tstart) < 0)
 		err(-1, "juggle: clock_gettime");
 
 	for (i = 0; i < NUMCYCLES; i++) {
 
 		for (j = 0; j < pipeline; j++) {
 			if (message_send(fd1) < 0)
 				err(-1, "message_send fd1");
 		}
 
 		for (j = 0; j < pipeline; j++) {
 			if (message_recv(fd2) < 0)
 				err(-1, "message_recv fd2");
 
 			if (message_send(fd2) < 0)
 				err(-1, "message_send fd2");
 		}
 
 		for (j = 0; j < pipeline; j++) {
 			if (message_recv(fd1) < 0)
 				err(-1, "message_recv fd1");
 		}
 	}
 
 	if (clock_gettime(CLOCK_REALTIME, &tfinish) < 0)
 		err(-1, "juggle: clock_gettime");
 
-	timespecsub(&tfinish, &tstart);
+	timespecsub(&tfinish, &tstart, &tfinish);
 
 	return (tfinish);
 }
 
 /*
  * Juggle messages between two file descriptors in two threads, so measure
  * the cost of IPC and the cost of a thread context switch.
  *
  * In order to avoid measuring thread creation time, we make use of a
  * condition variable to decide when both threads are ready to begin
  * juggling.
  */
 static int threaded_child_ready;
 static pthread_mutex_t threaded_mtx;
 static pthread_cond_t threaded_cond;
 static int threaded_pipeline;
 
 static void *
 juggling_thread(void *arg)
 {
 	int fd2, i, j;
 
 	fd2 = *(int *)arg;
 
 	if (pthread_mutex_lock(&threaded_mtx) != 0)
 		err(-1, "juggling_thread: pthread_mutex_lock");
 
 	threaded_child_ready = 1;
 
 	if (pthread_cond_signal(&threaded_cond) != 0)
 		err(-1, "juggling_thread: pthread_cond_signal");
 
 	if (pthread_mutex_unlock(&threaded_mtx) != 0)
 		err(-1, "juggling_thread: pthread_mutex_unlock");
 
 	for (i = 0; i < NUMCYCLES; i++) {
 		for (j = 0; j < threaded_pipeline; j++) {
 			if (message_recv(fd2) < 0)
 				err(-1, "message_recv fd2");
 
 			if (message_send(fd2) < 0)
 				err(-1, "message_send fd2");
 		}
 	}
 
 	return (NULL);
 }
 
 static struct timespec
 thread_juggle(int fd1, int fd2, int pipeline)
 {
 	struct timespec tstart, tfinish;
 	pthread_t thread;
 	int i, j;
 
 	threaded_pipeline = pipeline;
 
 	if (pthread_mutex_init(&threaded_mtx, NULL) != 0)
 		err(-1, "thread_juggle: pthread_mutex_init");
 
 	if (pthread_create(&thread, NULL, juggling_thread, &fd2) != 0)
 		err(-1, "thread_juggle: pthread_create");
 
 	if (pthread_mutex_lock(&threaded_mtx) != 0)
 		err(-1, "thread_juggle: pthread_mutex_lock");
 
 	while (!threaded_child_ready) {
 		if (pthread_cond_wait(&threaded_cond, &threaded_mtx) != 0)
 			err(-1, "thread_juggle: pthread_cond_wait");
 	}
 
 	if (pthread_mutex_unlock(&threaded_mtx) != 0)
 		err(-1, "thread_juggle: pthread_mutex_unlock");
 
 	if (clock_gettime(CLOCK_REALTIME, &tstart) < 0)
 		err(-1, "thread_juggle: clock_gettime");
 
 	for (i = 0; i < NUMCYCLES; i++) {
 		for (j = 0; j < pipeline; j++) {
 			if (message_send(fd1) < 0)
 				err(-1, "message_send fd1");
 		}
 
 		for (j = 0; j < pipeline; j++) {
 			if (message_recv(fd1) < 0)
 				err(-1, "message_recv fd1");
 		}
 	}
 
 	if (clock_gettime(CLOCK_REALTIME, &tfinish) < 0)
 		err(-1, "thread_juggle: clock_gettime");
 
 	if (pthread_join(thread, NULL) != 0)
 		err(-1, "thread_juggle: pthread_join");
 
-	timespecsub(&tfinish, &tstart);
+	timespecsub(&tfinish, &tstart, &tfinish);
 
 	return (tfinish);
 }
 
 /*
  * Juggle messages between two file descriptors in two processes, so measure
  * the cost of IPC and the cost of a process context switch.
  *
  * Since we can't use a mutex between the processes, we simply do an extra
  * write on the child to let the parent know that it's ready to start.
  */
 static struct timespec
 process_juggle(int fd1, int fd2, int pipeline)
 {
 	struct timespec tstart, tfinish;
 	pid_t pid, ppid, wpid;
 	int error, i, j;
 
 	ppid = getpid();
 
 	pid = fork();
 	if (pid < 0)
 		err(-1, "process_juggle: fork");
 
 	if (pid == 0) {
 		if (message_send(fd2) < 0) {
 			error = errno;
 			kill(ppid, SIGTERM);
 			errno = error;
 			err(-1, "process_juggle: child: message_send");
 		}
 
 		for (i = 0; i < NUMCYCLES; i++) {
 			for (j = 0; j < pipeline; j++) {
 				if (message_send(fd2) < 0)
 					err(-1, "message_send fd2");
 
 				if (message_recv(fd2) < 0)
 					err(-1, "message_recv fd2");
 			}
 		}
 
 		exit(0);
 	} else {
 		if (message_recv(fd1) < 0) {
 			error = errno;
 			kill(pid, SIGTERM);
 			errno = error;
 			err(-1, "process_juggle: parent: message_recv");
 		}
 
 		if (clock_gettime(CLOCK_REALTIME, &tstart) < 0)
 			err(-1, "process_juggle: clock_gettime");
 
 		for (i = 0; i < NUMCYCLES; i++) {
 			for (j = 0; j < pipeline; j++) {
 				if (message_send(fd1) < 0) {
 					error = errno;
 					kill(pid, SIGTERM);
 					errno = error;
 					err(-1, "message_send fd1");
 				}
 			}
 
 			for (j = 0; j < pipeline; j++) {
 				if (message_recv(fd1) < 0) {
 					error = errno;
 					kill(pid, SIGTERM);
 					errno = error;
 					err(-1, "message_recv fd1");
 				}
 			}
 		}
 
 		if (clock_gettime(CLOCK_REALTIME, &tfinish) < 0)
 			err(-1, "process_juggle: clock_gettime");
 	}
 
 	wpid = waitpid(pid, NULL, 0);
 	if (wpid < 0)
 		err(-1, "process_juggle: waitpid");
 	if (wpid != pid)
 		errx(-1, "process_juggle: waitpid: pid != wpid");
 
-	timespecsub(&tfinish, &tstart);
+	timespecsub(&tfinish, &tstart, &tfinish);
 
 	return (tfinish);
 }
 
 /*
  * When we print out results for larger pipeline sizes, we scale back by the
  * depth of the pipeline.  This generally means dividing by the pipeline
  * depth.  Except when it means dividing by zero.
  */
 static void
 scale_timespec(struct timespec *ts, int p)
 {
 
 	if (p == 0)
 		return;
 
 	ts->tv_sec /= p;
 	ts->tv_nsec /= p;
 }
 
 static const struct ipctype {
 	int		(*it_create)(int *fd1p, int *fd2p);
 	const char	*it_name;
 } ipctypes[] = {
 	{ pipe_create, "pipe" },
 	{ udp_create, "udp" },
 	{ socketpairdgram_create, "socketpairdgram" },
 	{ socketpairstream_create, "socketpairstream" },
 };
 static const int ipctypes_len = (sizeof(ipctypes) / sizeof(struct ipctype));
 
 int
 main(int argc, char *argv[])
 {
 	struct timespec juggle_results[LOOPS], process_results[LOOPS];
 	struct timespec thread_results[LOOPS];
 	int fd1, fd2, i, j, p;
 	struct utsname uts;
 
 	printf("version, juggle.c %s\n", "$FreeBSD$");
 
 	if (uname(&uts) < 0)
 		err(-1, "utsname");
 	printf("sysname, %s\n", uts.sysname);
 	printf("nodename, %s\n", uts.nodename);
 	printf("release, %s\n", uts.release);
 	printf("version, %s\n", uts.version);
 	printf("machine, %s\n", uts.machine);
 	printf("\n");
 
 	printf("MESSAGELEN, %d\n", MESSAGELEN);
 	printf("NUMCYCLES, %d\n", NUMCYCLES);
 	printf("LOOPS, %d\n", LOOPS);
 	printf("PIPELINE_MAX, %d\n", PIPELINE_MAX);
 	printf("\n\n");
 
 	printf("ipctype, test, pipeline_depth");
 	for (j = 0; j < LOOPS; j++)
 		printf(", data%d", j);
 	printf("\n");
 	fflush(stdout);
 	for (p = 0; p < PIPELINE_MAX + 1; p++) {
 		for (i = 0; i < ipctypes_len; i++) {
 			if (ipctypes[i].it_create(&fd1, &fd2) < 0)
 				err(-1, "main: %s", ipctypes[i].it_name);
 
 			/*
 			 * For each test, do one uncounted warmup, then LOOPS
 			 * runs of the actual test.
 			 */
 			juggle(fd1, fd2, p);
 			for (j = 0; j < LOOPS; j++)
 				juggle_results[j] = juggle(fd1, fd2, p);
 			process_juggle(fd1, fd2, p);
 			for (j = 0; j < LOOPS; j++)
 				process_results[j] = process_juggle(fd1, fd2,
 				    p);
 			thread_juggle(fd1, fd2, p);
 			for (j = 0; j < LOOPS; j++)
 				thread_results[j] = thread_juggle(fd1, fd2,
 				    p);
 			for (j = 0; j < LOOPS; j++) {
 				thread_results[j].tv_sec = 0;
 				thread_results[j].tv_nsec = 0;
 			}
 			close(fd1);
 			close(fd2);
 		}
 		/*
 		 * When printing results for the round, normalize the results
 		 * with respect to the pipeline depth.  We're doing p times
 		 * as much work, and are we taking p times as long?
 		 */
 		for (i = 0; i < ipctypes_len; i++) {
 			printf("%s, juggle, %d, ", ipctypes[i].it_name, p);
 			for (j = 0; j < LOOPS; j++) {
 				if (j != 0)
 					printf(", ");
 				scale_timespec(&juggle_results[j], p);
 				printf("%jd.%09lu",
 				    (intmax_t)juggle_results[j].tv_sec,
 				    juggle_results[j].tv_nsec);
 			}
 			printf("\n");
 			printf("%s, process_juggle, %d, ",
 			    ipctypes[i].it_name, p);
 			for (j = 0; j < LOOPS; j++) {
 				if (j != 0)
 					printf(", ");
 				scale_timespec(&process_results[j], p);
 				printf("%jd.%09lu",
                                     (intmax_t)process_results[j].tv_sec,
 				    process_results[j].tv_nsec);
 			}
 			printf("\n");
 			printf("%s, thread_juggle, %d, ",
 			    ipctypes[i].it_name, p);
 			for (j = 0; j < LOOPS; j++) {
 				if (j != 0)
 					printf(", ");
 				scale_timespec(&thread_results[j], p);
 				printf("%jd.%09lu",
 				    (intmax_t)thread_results[j].tv_sec,
 				    thread_results[j].tv_nsec);
 			}
 			printf("\n");
 		}
 		fflush(stdout);
 	}
 	return (0);
 }
Index: head/tools/tools/netrate/tcpp/tcpp_client.c
===================================================================
--- head/tools/tools/netrate/tcpp/tcpp_client.c	(revision 336913)
+++ head/tools/tools/netrate/tcpp/tcpp_client.c	(revision 336914)
@@ -1,377 +1,367 @@
 /*-
  * Copyright (c) 2008-2009 Robert N. M. Watson
  * Copyright (c) 2010 Juniper Networks, Inc.
  * All rights reserved.
  *
  * This software was developed by Robert N. M. Watson under contract
  * to Juniper Networks, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/types.h>
 #include <sys/event.h>
 #include <sys/resource.h>
 #include <sys/sched.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 #include <sys/time.h>
 #include <sys/uio.h>
 #include <sys/wait.h>
 
 #include <netinet/in.h>
 #include <netinet/tcp.h>
 
 #include <err.h>
 #include <errno.h>
 #include <fcntl.h>
 #include <inttypes.h>
 #include <signal.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
 
 #include "tcpp.h"
 
 #define	min(x, y)	(x < y ? x : y)
 
-#define timespecsub(vvp, uvp)						\
-	do {								\
-		(vvp)->tv_sec -= (uvp)->tv_sec;				\
-		(vvp)->tv_nsec -= (uvp)->tv_nsec;			\
-		if ((vvp)->tv_nsec < 0) {				\
-			(vvp)->tv_sec--;				\
-			(vvp)->tv_nsec += 1000000000;			\
-		}							\
-	} while (0)
 
-
 /*
  * Gist of each client worker: build up to mflag connections at a time, and
  * pump data in to them somewhat fairly until tflag connections have been
  * completed.
  */
 #define	CONNECTION_MAGIC	0x87a3f56e
 struct connection {
 	uint32_t	conn_magic;		/* Just magic. */
 	int		conn_fd;
 	struct tcpp_header	conn_header;	/* Header buffer. */
 	u_int		conn_header_sent;	/* Header bytes sent. */
 	u_int64_t	conn_data_sent;		/* Data bytes sent.*/
 };
 
 static u_char			 buffer[256 * 1024];	/* Buffer to send. */
 static pid_t			*pid_list;
 static int			 kq;
 static int			 started;	/* Number started so far. */
 static int			 finished;	/* Number finished so far. */
 static int			 counter;	/* IP number offset. */
 static uint64_t			 payload_len;
 
 static struct connection *
 tcpp_client_newconn(void)
 {
 	struct sockaddr_in sin;
 	struct connection *conn;
 	struct kevent kev;
 	int fd, i;
 
 	/*
 	 * Spread load over available IPs, rotating through them as we go.  No
 	 * attempt to localize IPs to particular workers.
 	 */
 	sin = localipbase;
 	sin.sin_addr.s_addr = htonl(ntohl(localipbase.sin_addr.s_addr) +
 	    (counter++ % Mflag));
 
 	fd = socket(PF_INET, SOCK_STREAM, 0);
 	if (fd < 0)
 		err(-1, "socket");
 
 	if (fcntl(fd, F_SETFL, O_NONBLOCK) < 0)
 		err(-1, "fcntl");
 
 	i = 1;
 	if (setsockopt(fd, SOL_SOCKET, SO_NOSIGPIPE, &i, sizeof(i)) < 0)
 		err(-1, "setsockopt");
 	i = 1;
 	if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &i, sizeof(i)) < 0)
 		err(-1, "setsockopt");
 #if 0
 	i = 1;
 	if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &i, sizeof(i)) < 0)
 		err(-1, "setsockopt");
 #endif
 
 	if (lflag) {
 		if (bind(fd, (struct sockaddr *)&sin, sizeof(sin)) < 0)
 			err(-1, "bind");
 	}
 
 	if (connect(fd, (struct sockaddr *)&remoteip, sizeof(remoteip)) < 0 &&
 	    errno != EINPROGRESS)
 		err(-1, "connect");
 
 	conn = malloc(sizeof(*conn));
 	if (conn == NULL)
 		return (NULL);
 	bzero(conn, sizeof(*conn));
 	conn->conn_magic = CONNECTION_MAGIC;
 	conn->conn_fd = fd;
 	conn->conn_header.th_magic = TCPP_MAGIC;
 	conn->conn_header.th_len = payload_len;
 	tcpp_header_encode(&conn->conn_header);
 
 	EV_SET(&kev, fd, EVFILT_WRITE, EV_ADD, 0, 0, conn);
 	if (kevent(kq, &kev, 1, NULL, 0, NULL) < 0)
 		err(-1, "newconn kevent");
 
 	started++;
 	return (conn);
 }
 
 static void
 tcpp_client_closeconn(struct connection *conn)
 {
 
 	close(conn->conn_fd);
 	bzero(conn, sizeof(*conn));
 	free(conn);
 	finished++;
 }
 
 static void
 tcpp_client_handleconn(struct kevent *kev)
 {
 	struct connection *conn;
 	struct iovec iov[2];
 	ssize_t len, header_left;
 
 	conn = kev->udata;
 	if (conn->conn_magic != CONNECTION_MAGIC)
 		errx(-1, "tcpp_client_handleconn: magic");
 
 	if (conn->conn_header_sent < sizeof(conn->conn_header)) {
 		header_left = sizeof(conn->conn_header) -
 		    conn->conn_header_sent;
 		iov[0].iov_base = ((u_char *)&conn->conn_header) +
 		    conn->conn_header_sent;
 		iov[0].iov_len = header_left;
 		iov[1].iov_base = buffer;
 		iov[1].iov_len = min(sizeof(buffer), payload_len);
 		len = writev(conn->conn_fd, iov, 2);
 		if (len < 0) {
 			tcpp_client_closeconn(conn);
 			err(-1, "tcpp_client_handleconn: header write");
 		}
 		if (len == 0) {
 			tcpp_client_closeconn(conn);
 			errx(-1, "tcpp_client_handleconn: header write "
 			    "premature EOF");
 		}
 		if (len > header_left) {
 			conn->conn_data_sent += (len - header_left);
 			conn->conn_header_sent += header_left;
 		} else
 			conn->conn_header_sent += len;
 	} else {
 		len = write(conn->conn_fd, buffer, min(sizeof(buffer),
 		    payload_len - conn->conn_data_sent));
 		if (len < 0) {
 			tcpp_client_closeconn(conn);
 			err(-1, "tcpp_client_handleconn: data write");
 		}
 		if (len == 0) {
 			tcpp_client_closeconn(conn);
 			errx(-1, "tcpp_client_handleconn: data write: "
 			    "premature EOF");
 		}
 		conn->conn_data_sent += len;
 	}
 	if (conn->conn_data_sent >= payload_len) {
 		/*
 		 * All is well.
 		 */
 		tcpp_client_closeconn(conn);
 	}
 }
 
 static void
 tcpp_client_worker(int workernum)
 {
 	struct kevent *kev_array;
 	int i, numevents, kev_bytes;
 #if defined(CPU_SETSIZE) && 0
 	cpu_set_t mask;
 	int ncpus;
 	size_t len;
 
 	if (Pflag) {
 		len = sizeof(ncpus);
 		if (sysctlbyname(SYSCTLNAME_CPUS, &ncpus, &len, NULL, 0) < 0)
 			err(-1, "sysctlbyname: %s", SYSCTLNAME_CPUS);
 		if (len != sizeof(ncpus))
 			errx(-1, "sysctlbyname: %s: len %jd", SYSCTLNAME_CPUS,
 			    (intmax_t)len);
 
 		CPU_ZERO(&mask);
 		CPU_SET(workernum % ncpus, &mask);
 		if (sched_setaffinity(0, CPU_SETSIZE, &mask) < 0)
 			err(-1, "sched_setaffinity");
 	}
 #endif
 	setproctitle("tcpp_client %d", workernum);
 
 	/*
 	 * Add the worker number to the remote port.
 	 */
 	remoteip.sin_port = htons(rflag + workernum);
 
 	kev_bytes = sizeof(*kev_array) * mflag;
 	kev_array = malloc(kev_bytes);
 	if (kev_array == NULL)
 		err(-1, "malloc");
 	bzero(kev_array, kev_bytes);
 
 	kq = kqueue();
 	if (kq < 0)
 		err(-1, "kqueue");
 
 	while (finished < tflag) {
 		while ((started - finished < mflag) && (started < tflag))
 			(void)tcpp_client_newconn();
 		numevents = kevent(kq, NULL, 0, kev_array, mflag, NULL);
 		if (numevents < 0)
 			err(-1, "kevent");
 		if (numevents > mflag)
 			errx(-1, "kevent: %d", numevents);
 		for (i = 0; i < numevents; i++)
 			tcpp_client_handleconn(&kev_array[i]);
 	}
 	/* printf("Worker %d done - %d finished\n", workernum, finished); */
 }
 
 void
 tcpp_client(void)
 {
 	struct timespec ts_start, ts_finish;
 	long cp_time_start[CPUSTATES], cp_time_finish[CPUSTATES];
 	long ticks;
 	size_t size;
 	pid_t pid;
 	int i, failed, status;
 
 	if (bflag < sizeof(struct tcpp_header))
 		errx(-1, "Can't use -b less than %zu\n",
 		   sizeof(struct tcpp_header));
 	payload_len = bflag - sizeof(struct tcpp_header);
 
 	pid_list = malloc(sizeof(*pid_list) * pflag);
 	if (pid_list == NULL)
 		err(-1, "malloc pid_list");
 	bzero(pid_list, sizeof(*pid_list) * pflag);
 
 	/*
 	 * Start workers.
 	 */
 	size = sizeof(cp_time_start);
 	if (sysctlbyname(SYSCTLNAME_CPTIME, &cp_time_start, &size, NULL, 0)
 	    < 0)
 		err(-1, "sysctlbyname: %s", SYSCTLNAME_CPTIME);
 	if (clock_gettime(CLOCK_REALTIME, &ts_start) < 0)
 		err(-1, "clock_gettime");
 	for (i = 0; i < pflag; i++) {
 		pid = fork();
 		if (pid < 0) {
 			warn("fork");
 			for (i = 0; i < pflag; i++) {
 				if (pid_list[i] != 0)
 					(void)kill(pid_list[i], SIGKILL);
 			}
 			exit(-1);
 		}
 		if (pid == 0) {
 			tcpp_client_worker(i);
 			exit(0);
 		}
 		pid_list[i] = pid;
 	}
 
 	/*
 	 * GC workers.
 	 */
 	failed = 0;
 	for (i = 0; i < pflag; i++) {
 		if (pid_list[i] != 0) {
 			while (waitpid(pid_list[i], &status, 0) != pid_list[i]);
 			if (WEXITSTATUS(status) != 0)
 				failed = 1;
 		}
 	}
 	if (clock_gettime(CLOCK_REALTIME, &ts_finish) < 0)
 		err(-1, "clock_gettime");
 	size = sizeof(cp_time_finish);
 	if (sysctlbyname(SYSCTLNAME_CPTIME, &cp_time_finish, &size, NULL, 0)
 	    < 0)
 		err(-1, "sysctlbyname: %s", SYSCTLNAME_CPTIME);
-	timespecsub(&ts_finish, &ts_start);
+	timespecsub(&ts_finish, &ts_start, &ts_finish);
 
 	if (failed)
 		errx(-1, "Too many errors");
 
 	if (hflag)
 		printf("bytes,seconds,conn/s,Gb/s,user%%,nice%%,sys%%,"
 		    "intr%%,idle%%\n");
 
 	/*
 	 * Configuration parameters.
 	 */
 	printf("%jd,", bflag * tflag * pflag);
 	printf("%jd.%09jd,", (intmax_t)ts_finish.tv_sec,
 	    (intmax_t)(ts_finish.tv_nsec));
 
 	/*
 	 * Effective transmit rates.
 	 */
 	printf("%f,", (double)(pflag * tflag)/
 	    (ts_finish.tv_sec + ts_finish.tv_nsec * 1e-9));
 	printf("%f,", (double)(bflag * tflag * pflag * 8) /
 	    (ts_finish.tv_sec + ts_finish.tv_nsec * 1e-9) * 1e-9);
 
 	/*
 	 * CPU time (est).
 	 */
 	ticks = 0;
 	for (i = 0; i < CPUSTATES; i++) {
 		cp_time_finish[i] -= cp_time_start[i];
 		ticks += cp_time_finish[i];
 	}
 	printf("%0.02f,", (float)(100 * cp_time_finish[CP_USER]) / ticks);
 	printf("%0.02f,", (float)(100 * cp_time_finish[CP_NICE]) / ticks);
 	printf("%0.02f,", (float)(100 * cp_time_finish[CP_SYS]) / ticks);
 	printf("%0.02f,", (float)(100 * cp_time_finish[CP_INTR]) / ticks);
 	printf("%0.02f", (float)(100 * cp_time_finish[CP_IDLE]) / ticks);
 	printf("\n");
 }
Index: head/tools/tools/syscall_timing/syscall_timing.c
===================================================================
--- head/tools/tools/syscall_timing/syscall_timing.c	(revision 336913)
+++ head/tools/tools/syscall_timing/syscall_timing.c	(revision 336914)
@@ -1,1143 +1,1133 @@
 /*-
  * Copyright (c) 2003-2004, 2010 Robert N. M. Watson
  * All rights reserved.
  *
  * Portions of this software were developed at the University of Cambridge
  * Computer Laboratory with support from a grant from Google, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/param.h>
 #include <sys/types.h>
 #include <sys/mman.h>
 #include <sys/procdesc.h>
 #include <sys/resource.h>
 #include <sys/socket.h>
 #include <sys/stat.h>
 #include <sys/time.h>
 #include <sys/wait.h>
 
 #include <assert.h>
 #include <err.h>
 #include <errno.h>
 #include <fcntl.h>
 #include <inttypes.h>
 #include <limits.h>
 #ifdef WITH_PTHREAD
 #include <pthread.h>
 #endif
 #include <semaphore.h>
 #include <signal.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
 
 static struct timespec ts_start, ts_end;
 static int alarm_timeout;
 static volatile int alarm_fired;
 
-#define timespecsub(vvp, uvp)						\
-	do {								\
-		(vvp)->tv_sec -= (uvp)->tv_sec;				\
-		(vvp)->tv_nsec -= (uvp)->tv_nsec;			\
-		if ((vvp)->tv_nsec < 0) {				\
-			(vvp)->tv_sec--;				\
-			(vvp)->tv_nsec += 1000000000;			\
-		}							\
-	} while (0)
-
 #define	BENCHMARK_FOREACH(I, NUM) for (I = 0; I < NUM && alarm_fired == 0; I++)
 
 static void
 alarm_handler(int signum __unused)
 {
 
 	alarm_fired = 1;
 }
 
 static void
 benchmark_start(void)
 {
 	int error;
 
 	alarm_fired = 0;
 	if (alarm_timeout) {
 		signal(SIGALRM, alarm_handler);
 		alarm(alarm_timeout);
 	}
 	error = clock_gettime(CLOCK_REALTIME, &ts_start);
 	assert(error == 0);
 }
 
 static void
 benchmark_stop(void)
 {
 	int error;
 
 	error = clock_gettime(CLOCK_REALTIME, &ts_end);
 	assert(error == 0);
 }
 
 static uintmax_t
 test_access(uintmax_t num, uintmax_t int_arg __unused, const char *path)
 {
 	uintmax_t i;
 	int fd;
 
 	fd = access(path, O_RDONLY);
 	if (fd < 0)
 		err(-1, "test_access: %s", path);
 	close(fd);
 
 	benchmark_start();
 	BENCHMARK_FOREACH(i, num) {
 		access(path, O_RDONLY);
 		close(fd);
 	}
 	benchmark_stop();
 	return (i);
 }
 
 static uintmax_t
 test_bad_open(uintmax_t num, uintmax_t int_arg __unused, const char *path __unused)
 {
 	uintmax_t i;
 
 	benchmark_start();
 	BENCHMARK_FOREACH(i, num) {
 		open("", O_RDONLY);
 	}
 	benchmark_stop();
 	return (i);
 }
 
 static uintmax_t
 test_chroot(uintmax_t num, uintmax_t int_arg __unused, const char *path __unused)
 {
 	uintmax_t i;
 
 	if (chroot("/") < 0)
 		err(-1, "test_chroot: chroot");
 	benchmark_start();
 	BENCHMARK_FOREACH(i, num) {
 		if (chroot("/") < 0)
 			err(-1, "test_chroot: chroot");
 	}
 	benchmark_stop();
 	return (i);
 }
 
 static uintmax_t
 test_clock_gettime(uintmax_t num, uintmax_t int_arg __unused, const char *path __unused)
 {
 	struct timespec ts;
 	uintmax_t i;
 
 	benchmark_start();
 	BENCHMARK_FOREACH(i, num) {
 		(void)clock_gettime(CLOCK_REALTIME, &ts);
 	}
 	benchmark_stop();
 	return (i);
 }
 
 static uintmax_t
 test_create_unlink(uintmax_t num, uintmax_t int_arg __unused, const char *path)
 {
 	uintmax_t i;
 	int fd;
 
 	(void)unlink(path);
 	fd = open(path, O_RDWR | O_CREAT, 0600);
 	if (fd < 0)
 		err(-1, "test_create_unlink: create: %s", path);
 	close(fd);
 	if (unlink(path) < 0)
 		err(-1, "test_create_unlink: unlink: %s", path);
 	benchmark_start();
 	BENCHMARK_FOREACH(i, num) {
 		fd = open(path, O_RDWR | O_CREAT, 0600);
 		if (fd < 0)
 			err(-1, "test_create_unlink: create: %s", path);
 		close(fd);
 		if (unlink(path) < 0)
 			err(-1, "test_create_unlink: unlink: %s", path);
 	}
 	benchmark_stop();
 	return (i);
 }
 
 static uintmax_t
 test_fork(uintmax_t num, uintmax_t int_arg __unused, const char *path __unused)
 {
 	pid_t pid;
 	uintmax_t i;
 
 	pid = fork();
 	if (pid < 0)
 		err(-1, "test_fork: fork");
 	if (pid == 0)
 		_exit(0);
 	if (waitpid(pid, NULL, 0) < 0)
 		err(-1, "test_fork: waitpid");
 	benchmark_start();
 	BENCHMARK_FOREACH(i, num) {
 		pid = fork();
 		if (pid < 0)
 			err(-1, "test_fork: fork");
 		if (pid == 0)
 			_exit(0);
 		if (waitpid(pid, NULL, 0) < 0)
 			err(-1, "test_fork: waitpid");
 	}
 	benchmark_stop();
 	return (i);
 }
 
 #define	USR_BIN_TRUE	"/usr/bin/true"
 static char *execve_args[] = { __DECONST(char *, USR_BIN_TRUE), NULL};
 extern char **environ;
 
 static uintmax_t
 test_fork_exec(uintmax_t num, uintmax_t int_arg __unused, const char *path __unused)
 {
 	pid_t pid;
 	uintmax_t i;
 
 	pid = fork();
 	if (pid < 0)
 		err(-1, "test_fork_exec: fork");
 	if (pid == 0) {
 		(void)execve(USR_BIN_TRUE, execve_args, environ);
 		err(-1, "execve");
 	}
 	if (waitpid(pid, NULL, 0) < 0)
 		err(-1, "test_fork: waitpid");
 	benchmark_start();
 	BENCHMARK_FOREACH(i, num) {
 		pid = fork();
 		if (pid < 0)
 			err(-1, "test_fork_exec: fork");
 		if (pid == 0) {
 			(void)execve(USR_BIN_TRUE, execve_args, environ);
 			err(-1, "test_fork_exec: execve");
 		}
 		if (waitpid(pid, NULL, 0) < 0)
 			err(-1, "test_fork_exec: waitpid");
 	}
 	benchmark_stop();
 	return (i);
 }
 
 static uintmax_t
 test_getppid(uintmax_t num, uintmax_t int_arg __unused, const char *path __unused)
 {
 	uintmax_t i;
 
 	/*
 	 * This is process-local, but can change, so will require a
 	 * lock.
 	 */
 	benchmark_start();
 	BENCHMARK_FOREACH(i, num) {
 		getppid();
 	}
 	benchmark_stop();
 	return (i);
 }
 
 static uintmax_t
 test_getpriority(uintmax_t num, uintmax_t int_arg __unused, const char *path __unused)
 {
 	uintmax_t i;
 
 	benchmark_start();
 	BENCHMARK_FOREACH(i, num) {
 		(void)getpriority(PRIO_PROCESS, 0);
 	}
 	benchmark_stop();
 	return (i);
 }
 
 /*
  * The point of this one is to figure out the cost of a call into libc,
  * through PLT, and back.
  */
 static uintmax_t
 test_getprogname(uintmax_t num, uintmax_t int_arg __unused, const char *path __unused)
 {
 	uintmax_t i;
 
 	benchmark_start();
 	BENCHMARK_FOREACH(i, num) {
 		(void)getprogname();
 	}
 	benchmark_stop();
 	return (i);
 }
 
 static uintmax_t
 test_getresuid(uintmax_t num, uintmax_t int_arg __unused, const char *path __unused)
 {
 	uid_t ruid, euid, suid;
 	uintmax_t i;
 
 	benchmark_start();
 	BENCHMARK_FOREACH(i, num) {
 		(void)getresuid(&ruid, &euid, &suid);
 	}
 	benchmark_stop();
 	return (i);
 }
 
 static uintmax_t
 test_gettimeofday(uintmax_t num, uintmax_t int_arg __unused, const char *path __unused)
 {
 	struct timeval tv;
 	uintmax_t i;
 
 	benchmark_start();
 	BENCHMARK_FOREACH(i, num) {
 		(void)gettimeofday(&tv, NULL);
 	}
 	benchmark_stop();
 	return (i);
 }
 
 static uintmax_t
 test_getuid(uintmax_t num, uintmax_t int_arg __unused, const char *path __unused)
 {
 	uintmax_t i;
 
 	/*
 	 * Thread-local data should require no locking if system
 	 * call is MPSAFE.
 	 */
 	benchmark_start();
 	BENCHMARK_FOREACH(i, num) {
 		getuid();
 	}
 	benchmark_stop();
 	return (i);
 }
 
 static uintmax_t
 test_memcpy(uintmax_t num, uintmax_t int_arg, const char *path __unused)
 {
 	char buf[int_arg], buf2[int_arg];
 	uintmax_t i;
 
 	benchmark_start();
 	BENCHMARK_FOREACH(i, num) {
 		/*
 		 * Copy the memory there and back, to match the total amount
 		 * moved by pipeping/pipepingtd tests.
 		 */
 		memcpy(buf2, buf, int_arg);
 		memcpy(buf, buf2, int_arg);
 	}
 	benchmark_stop();
 
 	return (i);
 }
 
 static uintmax_t
 test_open_close(uintmax_t num, uintmax_t int_arg __unused, const char *path)
 {
 	uintmax_t i;
 	int fd;
 
 	fd = open(path, O_RDONLY);
 	if (fd < 0)
 		err(-1, "test_open_close: %s", path);
 	close(fd);
 
 	benchmark_start();
 	BENCHMARK_FOREACH(i, num) {
 		fd = open(path, O_RDONLY);
 		if (fd < 0)
 			err(-1, "test_open_close: %s", path);
 		close(fd);
 	}
 	benchmark_stop();
 	return (i);
 }
 
 static uintmax_t
 test_open_read_close(uintmax_t num, uintmax_t int_arg, const char *path)
 {
 	char buf[int_arg];
 	uintmax_t i;
 	int fd;
 
 	fd = open(path, O_RDONLY);
 	if (fd < 0)
 		err(-1, "test_open_read_close: %s", path);
 	(void)read(fd, buf, int_arg);
 	close(fd);
 
 	benchmark_start();
 	BENCHMARK_FOREACH(i, num) {
 		fd = open(path, O_RDONLY);
 		if (fd < 0)
 			err(-1, "test_open_read_close: %s", path);
 		(void)read(fd, buf, int_arg);
 		close(fd);
 	}
 	benchmark_stop();
 	return (i);
 }
 
 static uintmax_t
 test_pipe(uintmax_t num, uintmax_t int_arg __unused, const char *path __unused)
 {
 	int fd[2];
 	uintmax_t i;
 
 	/*
 	 * pipe creation is expensive, as it will allocate a new file
 	 * descriptor, allocate a new pipe, hook it all up, and return.
 	 * Destroying is also expensive, as we now have to free up
 	 * the file descriptors and return the pipe.
 	 */
 	if (pipe(fd) < 0)
 		err(-1, "test_pipe: pipe");
 	close(fd[0]);
 	close(fd[1]);
 	benchmark_start();
 	BENCHMARK_FOREACH(i, num) {
 		if (pipe(fd) == -1)
 			err(-1, "test_pipe: pipe");
 		close(fd[0]);
 		close(fd[1]);
 	}
 	benchmark_stop();
 	return (i);
 }
 
 static void
 readx(int fd, char *buf, size_t size)
 {
 	ssize_t ret;
 
 	do {
 		ret = read(fd, buf, size);
 		if (ret == -1)
 			err(1, "read");
 		assert((size_t)ret <= size);
 		size -= ret;
 		buf += ret;
 	} while (size > 0);
 }
 
 static void
 writex(int fd, const char *buf, size_t size)
 {
 	ssize_t ret;
 
 	do {
 		ret = write(fd, buf, size);
 		if (ret == -1)
 			err(1, "write");
 		assert((size_t)ret <= size);
 		size -= ret;
 		buf += ret;
 	} while (size > 0);
 }
 
 static uintmax_t
 test_pipeping(uintmax_t num, uintmax_t int_arg, const char *path __unused)
 {
 	char buf[int_arg];
 	uintmax_t i;
 	pid_t pid;
 	int fd[2], procfd;
 
 	if (pipe(fd) < 0)
 		err(-1, "pipe");
 
 	pid = pdfork(&procfd, 0);
 	if (pid < 0)
 		err(1, "pdfork");
 
 	if (pid == 0) {
 		close(fd[0]);
 
 		for (;;) {
 			readx(fd[1], buf, int_arg);
 			writex(fd[1], buf, int_arg);
 		}
 	}
 
 	close(fd[1]);
 
 	benchmark_start();
 	BENCHMARK_FOREACH(i, num) {
 		writex(fd[0], buf, int_arg);
 		readx(fd[0], buf, int_arg);
 	}
 	benchmark_stop();
 
 	close(procfd);
 	return (i);
 }
 
 #ifdef WITH_PTHREAD
 struct pipepingtd_ctx {
 	int		fd;
 	uintmax_t	int_arg;
 };
 
 static void *
 pipepingtd_proc(void *arg)
 {
 	struct pipepingtd_ctx *ctxp;
 	int fd;
 	void *buf;
 	uintmax_t int_arg;
 
 	ctxp = arg;
 	fd = ctxp->fd;
 	int_arg = ctxp->int_arg;
 
 	buf = malloc(int_arg);
 	if (buf == NULL)
 		err(1, "malloc");
 
 	for (;;) {
 		readx(fd, buf, int_arg);
 		writex(fd, buf, int_arg);
 	}
 }
 
 static uintmax_t
 test_pipepingtd(uintmax_t num, uintmax_t int_arg, const char *path __unused)
 {
 	struct pipepingtd_ctx ctx;
 	char buf[int_arg];
 	pthread_t td;
 	uintmax_t i;
 	int error, fd[2];
 
 	if (pipe(fd) < 0)
 		err(-1, "pipe");
 
 	ctx.fd = fd[1];
 	ctx.int_arg = int_arg;
 
 	error = pthread_create(&td, NULL, pipepingtd_proc, &ctx);
 	if (error != 0)
 		err(1, "pthread_create");
 
 	benchmark_start();
 	BENCHMARK_FOREACH(i, num) {
 		writex(fd[0], buf, int_arg);
 		readx(fd[0], buf, int_arg);
 	}
 	benchmark_stop();
 	pthread_cancel(td);
 
 	return (i);
 }
 #endif /* WITH_PTHREAD */
 
 static uintmax_t
 test_read(uintmax_t num, uintmax_t int_arg, const char *path)
 {
 	char buf[int_arg];
 	uintmax_t i;
 	int fd;
 
 	fd = open(path, O_RDONLY);
 	if (fd < 0)
 		err(-1, "test_open_read: %s", path);
 	(void)pread(fd, buf, int_arg, 0);
 
 	benchmark_start();
 	BENCHMARK_FOREACH(i, num) {
 		(void)pread(fd, buf, int_arg, 0);
 	}
 	benchmark_stop();
 	close(fd);
 	return (i);
 }
 
 static uintmax_t
 test_select(uintmax_t num, uintmax_t int_arg __unused, const char *path __unused)
 {
 	fd_set readfds, writefds, exceptfds;
 	struct timeval tv;
 	uintmax_t i;
 
 	FD_ZERO(&readfds);
 	FD_ZERO(&writefds);
 	FD_ZERO(&exceptfds);
 
 	tv.tv_sec = 0;
 	tv.tv_usec = 0;
 
 	benchmark_start();
 	BENCHMARK_FOREACH(i, num) {
 		(void)select(0, &readfds, &writefds, &exceptfds, &tv);
 	}
 	benchmark_stop();
 	return (i);
 }
 
 /*
  * Benchmark a semaphore ping-pong between this process and a child
  * created with pdfork().  Two semaphores live in an anonymous shared
  * mapping: the parent posts the first and waits on the second, while the
  * child waits on the first and posts the second.  After the timed loop
  * the process descriptor is closed, the semaphores are destroyed and the
  * mapping is released.  Returns the loop counter left by
  * BENCHMARK_FOREACH.
  */
 static uintmax_t
 test_semaping(uintmax_t num, uintmax_t int_arg __unused, const char *path __unused)
 {
 	sem_t *sems;
 	pid_t child;
 	uintmax_t i;
 	int idx, pd;
 
 	sems = mmap(0, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_ANON | MAP_SHARED, -1, 0);
 	if (sems == MAP_FAILED)
 		err(1, "mmap");
 
 	for (idx = 0; idx < 2; idx++)
 		if (sem_init(&sems[idx], 1, 0) != 0)
 			err(1, "sem_init");
 
 	child = pdfork(&pd, 0);
 	if (child < 0)
 		err(1, "pdfork");
 
 	if (child == 0) {
 		/* Child side: answer every ping, never leave the loop. */
 		while (1) {
 			if (sem_wait(&sems[0]) != 0)
 				err(1, "sem_wait");
 			if (sem_post(&sems[1]) != 0)
 				err(1, "sem_post");
 		}
 	}
 
 	benchmark_start();
 	BENCHMARK_FOREACH(i, num) {
 		if (sem_post(&sems[0]) != 0)
 			err(1, "sem_post");
 		if (sem_wait(&sems[1]) != 0)
 			err(1, "sem_wait");
 	}
 	benchmark_stop();
 
 	/* NOTE(review): presumably this is what disposes of the child; see pdfork(2). */
 	close(pd);
 
 	for (idx = 0; idx < 2; idx++)
 		if (sem_destroy(&sems[idx]) != 0)
 			err(1, "sem_destroy");
 
 	if (munmap(sems, PAGE_SIZE) != 0)
 		err(1, "munmap");
 
 	return (i);
 }
 
 /*
  * Benchmark setuid(2) called with the process's own real uid.  One
  * untimed call is made first; any failure terminates the program.
  * Returns the loop counter left by BENCHMARK_FOREACH.
  */
 static uintmax_t
 test_setuid(uintmax_t num, uintmax_t int_arg __unused, const char *path __unused)
 {
 	const uid_t self = getuid();
 	uintmax_t i;
 
 	if (setuid(self) < 0)
 		err(-1, "test_setuid: setuid");
 
 	benchmark_start();
 	BENCHMARK_FOREACH(i, num)
 		if (setuid(self) < 0)
 			err(-1, "test_setuid: setuid");
 	benchmark_stop();
 
 	return (i);
 }
 
 /*
  * Benchmark creating and closing an anonymous shared memory object
  * (shm_open() with SHM_ANON followed by close()).  One untimed pair is
  * issued before the timed loop.  Returns the loop counter left by
  * BENCHMARK_FOREACH.
  */
 static uintmax_t
 test_shmfd(uintmax_t num, uintmax_t int_arg __unused, const char *path __unused)
 {
 	uintmax_t i;
 	int anon_fd;
 
 	if ((anon_fd = shm_open(SHM_ANON, O_CREAT | O_RDWR, 0600)) < 0)
 		err(-1, "test_shmfd: shm_open");
 	close(anon_fd);
 
 	benchmark_start();
 	BENCHMARK_FOREACH(i, num) {
 		if ((anon_fd = shm_open(SHM_ANON, O_CREAT | O_RDWR, 0600)) < 0)
 			err(-1, "test_shmfd: shm_open");
 		close(anon_fd);
 	}
 	benchmark_stop();
 
 	return (i);
 }
 
 /*
  * Benchmark dup(2) followed by close(2) on an anonymous shared memory
  * descriptor.  A failed dup() is skipped silently, both in the untimed
  * first attempt and inside the timed loop.  Returns the loop counter
  * left by BENCHMARK_FOREACH.
  */
 static uintmax_t
 test_shmfd_dup(uintmax_t num, uintmax_t int_arg __unused, const char *path __unused)
 {
 	uintmax_t i;
 	int copy, base;
 
 	base = shm_open(SHM_ANON, O_CREAT | O_RDWR, 0600);
 	if (base < 0)
 		err(-1, "test_shmfd_dup: shm_open");
 
 	if ((copy = dup(base)) >= 0)
 		close(copy);
 
 	benchmark_start();
 	BENCHMARK_FOREACH(i, num) {
 		if ((copy = dup(base)) >= 0)
 			close(copy);
 	}
 	benchmark_stop();
 
 	close(base);
 	return (i);
 }
 
 /*
  * Benchmark fstat(2) on an anonymous shared memory descriptor.  The
  * untimed first fstat() is checked for failure; results inside the timed
  * loop are discarded.  Returns the loop counter left by
  * BENCHMARK_FOREACH.
  */
 static uintmax_t
 test_shmfd_fstat(uintmax_t num, uintmax_t int_arg __unused, const char *path __unused)
 {
 	struct stat st;
 	uintmax_t i;
 	int anon_fd;
 
 	if ((anon_fd = shm_open(SHM_ANON, O_CREAT | O_RDWR, 0600)) < 0)
 		err(-1, "test_shmfd_fstat: shm_open");
 	if (fstat(anon_fd, &st) < 0)
 		err(-1, "test_shmfd_fstat: fstat");
 
 	benchmark_start();
 	BENCHMARK_FOREACH(i, num)
 		(void)fstat(anon_fd, &st);
 	benchmark_stop();
 
 	close(anon_fd);
 	return (i);
 }
 
 /*
  * Benchmark socket(2)/close(2) for a SOCK_STREAM socket.  int_arg is
  * passed to socket() as its first (domain) argument.  One untimed pair
  * is issued first.  Returns the loop counter left by BENCHMARK_FOREACH.
  */
 static uintmax_t
 test_socket_stream(uintmax_t num, uintmax_t int_arg, const char *path __unused)
 {
 	uintmax_t i;
 	int sock;
 
 	if ((sock = socket(int_arg, SOCK_STREAM, 0)) < 0)
 		err(-1, "test_socket_stream: socket");
 	close(sock);
 
 	benchmark_start();
 	BENCHMARK_FOREACH(i, num) {
 		if ((sock = socket(int_arg, SOCK_STREAM, 0)) == -1)
 			err(-1, "test_socket_stream: socket");
 		close(sock);
 	}
 	benchmark_stop();
 
 	return (i);
 }
 
 /*
  * Benchmark socket(2)/close(2) for a SOCK_DGRAM socket.  int_arg is
  * passed to socket() as its first (domain) argument.  One untimed pair
  * is issued first.  Returns the loop counter left by BENCHMARK_FOREACH.
  */
 static uintmax_t
 test_socket_dgram(uintmax_t num, uintmax_t int_arg, const char *path __unused)
 {
 	uintmax_t i;
 	int sock;
 
 	if ((sock = socket(int_arg, SOCK_DGRAM, 0)) < 0)
 		err(-1, "test_socket_dgram: socket");
 	close(sock);
 
 	benchmark_start();
 	BENCHMARK_FOREACH(i, num) {
 		if ((sock = socket(int_arg, SOCK_DGRAM, 0)) == -1)
 			err(-1, "test_socket_dgram: socket");
 		close(sock);
 	}
 	benchmark_stop();
 
 	return (i);
 }
 
 /*
  * Benchmark socketpair(2) for PF_LOCAL/SOCK_STREAM followed by closing
  * both ends.  One untimed round is issued first.  Returns the loop
  * counter left by BENCHMARK_FOREACH.
  */
 static uintmax_t
 test_socketpair_stream(uintmax_t num, uintmax_t int_arg __unused, const char *path __unused)
 {
 	int ends[2];
 	uintmax_t i;
 
 	if (socketpair(PF_LOCAL, SOCK_STREAM, 0, ends) == -1)
 		err(-1, "test_socketpair_stream: socketpair");
 	close(ends[0]);
 	close(ends[1]);
 
 	benchmark_start();
 	BENCHMARK_FOREACH(i, num) {
 		if (socketpair(PF_LOCAL, SOCK_STREAM, 0, ends) == -1)
 			err(-1, "test_socketpair_stream: socketpair");
 		close(ends[0]);
 		close(ends[1]);
 	}
 	benchmark_stop();
 
 	return (i);
 }
 
 /*
  * Benchmark socketpair(2) for PF_LOCAL/SOCK_DGRAM followed by closing
  * both ends.  One untimed round is issued first.  Returns the loop
  * counter left by BENCHMARK_FOREACH.
  */
 static uintmax_t
 test_socketpair_dgram(uintmax_t num, uintmax_t int_arg __unused, const char *path __unused)
 {
 	int ends[2];
 	uintmax_t i;
 
 	if (socketpair(PF_LOCAL, SOCK_DGRAM, 0, ends) == -1)
 		err(-1, "test_socketpair_dgram: socketpair");
 	close(ends[0]);
 	close(ends[1]);
 
 	benchmark_start();
 	BENCHMARK_FOREACH(i, num) {
 		if (socketpair(PF_LOCAL, SOCK_DGRAM, 0, ends) == -1)
 			err(-1, "test_socketpair_dgram: socketpair");
 		close(ends[0]);
 		close(ends[1]);
 	}
 	benchmark_stop();
 
 	return (i);
 }
 
 /*
  * Benchmark vfork(2) where the child exits immediately with _exit(0) and
  * the parent reaps it with waitpid().  One untimed round is issued
  * first.  Returns the loop counter left by BENCHMARK_FOREACH.
  */
 static uintmax_t
 test_vfork(uintmax_t num, uintmax_t int_arg __unused, const char *path __unused)
 {
 	uintmax_t i;
 	pid_t child;
 
 	if ((child = vfork()) < 0)
 		err(-1, "test_vfork: vfork");
 	if (child == 0)
 		_exit(0);
 	if (waitpid(child, NULL, 0) < 0)
 		err(-1, "test_vfork: waitpid");
 
 	benchmark_start();
 	BENCHMARK_FOREACH(i, num) {
 		if ((child = vfork()) < 0)
 			err(-1, "test_vfork: vfork");
 		if (child == 0)
 			_exit(0);
 		if (waitpid(child, NULL, 0) < 0)
 			err(-1, "test_vfork: waitpid");
 	}
 	benchmark_stop();
 
 	return (i);
 }
 
 /*
  * Benchmark vfork(2) followed by execve(2) of USR_BIN_TRUE in the child,
  * with the parent reaping the child through waitpid().  One untimed
  * round is issued before the timed loop.
  *
  * A child whose execve() fails reports the error with warn() and leaves
  * through _exit() rather than err(): err() would run exit() in the
  * vfork child.
  *
  * Returns the loop counter left by BENCHMARK_FOREACH.
  */
 static uintmax_t
 test_vfork_exec(uintmax_t num, uintmax_t int_arg __unused, const char *path __unused)
 {
 	pid_t pid;
 	uintmax_t i;
 
 	pid = vfork();
 	if (pid < 0)
 		err(-1, "test_vfork_exec: vfork");
 	if (pid == 0) {
 		(void)execve(USR_BIN_TRUE, execve_args, environ);
 		warn("test_vfork_exec: execve");
 		_exit(-1);
 	}
 	if (waitpid(pid, NULL, 0) < 0)
 		err(-1, "test_vfork_exec: waitpid");
 	benchmark_start();
 	BENCHMARK_FOREACH(i, num) {
 		pid = vfork();
 		if (pid < 0)
 			err(-1, "test_vfork_exec: vfork");
 		if (pid == 0) {
 			(void)execve(USR_BIN_TRUE, execve_args, environ);
 			warn("test_vfork_exec: execve");
 			_exit(-1);
 		}
 		if (waitpid(pid, NULL, 0) < 0)
 			err(-1, "test_vfork_exec: waitpid");
 	}
 	benchmark_stop();
 	return (i);
 }
 
 /*
  * One entry of the benchmark table: the name matched against the
  * command line, the function that runs the benchmark, flag bits, and an
  * integer that main() passes to the function as its second argument.
  */
 struct test {
 	const char	*t_name;
 	uintmax_t	(*t_func)(uintmax_t, uintmax_t, const char *);
 	int		 t_flags;
 	uintmax_t	 t_int;
 };
 
 /* t_flags bit: main() sets up a temporary file when no -p path was given. */
 #define	FLAG_PATH	0x00000001
 
 /*
  * Table of all benchmarks selectable by name on the command line.
  * Entries that share a function differ in t_int, which is handed to the
  * function as its int_arg parameter (a size for the memcpy/read/pipe
  * families, a protocol family for the socket tests).  FLAG_PATH marks
  * tests that need a path argument.
  */
 static const struct test tests[] = {
 	{ "access", test_access, .t_flags = FLAG_PATH },
 	{ "bad_open", test_bad_open, .t_flags = 0 },
 	{ "chroot", test_chroot, .t_flags = 0 },
 	{ "clock_gettime", test_clock_gettime, .t_flags = 0 },
 	{ "create_unlink", test_create_unlink, .t_flags = FLAG_PATH },
 	{ "fork", test_fork, .t_flags = 0 },
 	{ "fork_exec", test_fork_exec, .t_flags = 0 },
 	{ "getppid", test_getppid, .t_flags = 0 },
 	{ "getpriority", test_getpriority, .t_flags = 0 },
 	{ "getprogname", test_getprogname, .t_flags = 0 },
 	{ "getresuid", test_getresuid, .t_flags = 0 },
 	{ "gettimeofday", test_gettimeofday, .t_flags = 0 },
 	{ "getuid", test_getuid, .t_flags = 0 },
 	{ "memcpy_1", test_memcpy, .t_flags = 0, .t_int = 1 },
 	{ "memcpy_10", test_memcpy, .t_flags = 0, .t_int = 10 },
 	{ "memcpy_100", test_memcpy, .t_flags = 0, .t_int = 100 },
 	{ "memcpy_1000", test_memcpy, .t_flags = 0, .t_int = 1000 },
 	{ "memcpy_10000", test_memcpy, .t_flags = 0, .t_int = 10000 },
 	{ "memcpy_100000", test_memcpy, .t_flags = 0, .t_int = 100000 },
 	{ "memcpy_1000000", test_memcpy, .t_flags = 0, .t_int = 1000000 },
 	{ "open_close", test_open_close, .t_flags = FLAG_PATH },
 	{ "open_read_close_1", test_open_read_close, .t_flags = FLAG_PATH,
 	    .t_int = 1 },
 	{ "open_read_close_10", test_open_read_close, .t_flags = FLAG_PATH,
 	    .t_int = 10 },
 	{ "open_read_close_100", test_open_read_close, .t_flags = FLAG_PATH,
 	    .t_int = 100 },
 	{ "open_read_close_1000", test_open_read_close, .t_flags = FLAG_PATH,
 	    .t_int = 1000 },
 	{ "open_read_close_10000", test_open_read_close,
 	    .t_flags = FLAG_PATH, .t_int = 10000 },
 	{ "open_read_close_100000", test_open_read_close,
 	    .t_flags = FLAG_PATH, .t_int = 100000 },
 	{ "open_read_close_1000000", test_open_read_close,
 	    .t_flags = FLAG_PATH, .t_int = 1000000 },
 	{ "pipe", test_pipe, .t_flags = 0 },
 	{ "pipeping_1", test_pipeping, .t_flags = 0, .t_int = 1 },
 	{ "pipeping_10", test_pipeping, .t_flags = 0, .t_int = 10 },
 	{ "pipeping_100", test_pipeping, .t_flags = 0, .t_int = 100 },
 	{ "pipeping_1000", test_pipeping, .t_flags = 0, .t_int = 1000 },
 	{ "pipeping_10000", test_pipeping, .t_flags = 0, .t_int = 10000 },
 	{ "pipeping_100000", test_pipeping, .t_flags = 0, .t_int = 100000 },
 	{ "pipeping_1000000", test_pipeping, .t_flags = 0, .t_int = 1000000 },
 #ifdef WITH_PTHREAD
 	{ "pipepingtd_1", test_pipepingtd, .t_flags = 0, .t_int = 1 },
 	{ "pipepingtd_10", test_pipepingtd, .t_flags = 0, .t_int = 10 },
 	{ "pipepingtd_100", test_pipepingtd, .t_flags = 0, .t_int = 100 },
 	{ "pipepingtd_1000", test_pipepingtd, .t_flags = 0, .t_int = 1000 },
 	{ "pipepingtd_10000", test_pipepingtd, .t_flags = 0, .t_int = 10000 },
 	{ "pipepingtd_100000", test_pipepingtd, .t_flags = 0, .t_int = 100000 },
 	{ "pipepingtd_1000000", test_pipepingtd, .t_flags = 0, .t_int = 1000000 },
 #endif
 	{ "read_1", test_read, .t_flags = FLAG_PATH, .t_int = 1 },
 	{ "read_10", test_read, .t_flags = FLAG_PATH, .t_int = 10 },
 	{ "read_100", test_read, .t_flags = FLAG_PATH, .t_int = 100 },
 	{ "read_1000", test_read, .t_flags = FLAG_PATH, .t_int = 1000 },
 	{ "read_10000", test_read, .t_flags = FLAG_PATH, .t_int = 10000 },
 	{ "read_100000", test_read, .t_flags = FLAG_PATH, .t_int = 100000 },
 	{ "read_1000000", test_read, .t_flags = FLAG_PATH, .t_int = 1000000 },
 	{ "select", test_select, .t_flags = 0 },
 	{ "semaping", test_semaping, .t_flags = 0 },
 	{ "setuid", test_setuid, .t_flags = 0 },
 	{ "shmfd", test_shmfd, .t_flags = 0 },
 	{ "shmfd_dup", test_shmfd_dup, .t_flags = 0 },
 	{ "shmfd_fstat", test_shmfd_fstat, .t_flags = 0 },
 	{ "socket_local_stream", test_socket_stream, .t_int = PF_LOCAL },
 	{ "socket_local_dgram", test_socket_dgram, .t_int = PF_LOCAL },
 	{ "socketpair_stream", test_socketpair_stream, .t_flags = 0 },
 	{ "socketpair_dgram", test_socketpair_dgram, .t_flags = 0 },
 	{ "socket_tcp", test_socket_stream, .t_int = PF_INET },
 	{ "socket_udp", test_socket_dgram, .t_int = PF_INET },
 	{ "vfork", test_vfork, .t_flags = 0 },
 	{ "vfork_exec", test_vfork_exec, .t_flags = 0 },
 };
 /* Number of entries in tests[]. */
 static const int tests_count = sizeof(tests) / sizeof(tests[0]);
 
 static void
 usage(void)
 {
 	int i;
 
 	fprintf(stderr, "syscall_timing [-i iterations] [-l loops] "
 	    "[-p path] [-s seconds] test\n");
 	for (i = 0; i < tests_count; i++)
 		fprintf(stderr, "  %s\n", tests[i].t_name);
 	exit(-1);
 }
 
 /*
  * Entry point.  Parses -i (iteration bound), -l (number of measured
  * loops), -p (path for FLAG_PATH tests) and -s (value stored in
  * alarm_timeout), validates every test name given on the command line,
  * then for each test runs one unmeasured warm-up call followed by
  * `loops` measured calls, printing one result line per measured call.
  */
 int
 main(int argc, char *argv[])
 {
 	struct timespec ts_res;
 	const struct test *the_test;
 	const char *path;
 	char *tmp_dir, *tmp_path;
 	long long ll;
 	char *endp;
 	int ch, fd, error, i, j, rv;
 	uintmax_t iterations, k, loops;
 
 	alarm_timeout = 1;
 	iterations = 0;
 	loops = 10;
 	path = NULL;
 	tmp_path = NULL;
 	while ((ch = getopt(argc, argv, "i:l:p:s:")) != -1) {
 		switch (ch) {
 		case 'i':
 			ll = strtol(optarg, &endp, 10);
 			if (*endp != 0 || ll < 1)
 				usage();
 			iterations = ll;
 			break;
 
 		case 'l':
 			ll = strtol(optarg, &endp, 10);
 			if (*endp != 0 || ll < 1 || ll > 100000)
 				usage();
 			loops = ll;
 			break;
 
 		case 'p':
 			path = optarg;
 			break;
 
 		case 's':
 			ll = strtol(optarg, &endp, 10);
 			if (*endp != 0 || ll < 1 || ll > 60*60)
 				usage();
 			alarm_timeout = ll;
 			break;
 
 		case '?':
 		default:
 			usage();
 		}
 	}
 	argc -= optind;
 	argv += optind;
 
 	if (iterations < 1 && alarm_timeout < 1)
 		usage();
 	/* No -i given: leave the iteration bound effectively unlimited. */
 	if (iterations < 1)
 		iterations = UINT64_MAX;
 	if (loops < 1)
 		loops = 1;
 
 	if (argc < 1)
 		usage();
 
 	/*
 	 * Validate test list and that, if a path is required, it is
 	 * defined.
 	 */
 	for (j = 0; j < argc; j++) {
 		the_test = NULL;
 		for (i = 0; i < tests_count; i++) {
 			if (strcmp(argv[j], tests[i].t_name) == 0)
 				the_test = &tests[i];
 		}
 		if (the_test == NULL)
 			usage();
 		/*
 		 * NOTE(review): this branch runs once per FLAG_PATH test on
 		 * the command line, so several such tests each create a
 		 * temporary directory while only the last tmp_dir/tmp_path
 		 * pair is cleaned up at the end -- confirm whether a
 		 * "tmp_path == NULL" guard is wanted here.
 		 */
 		if ((the_test->t_flags & FLAG_PATH) && (path == NULL)) {
 			tmp_dir = strdup("/tmp/syscall_timing.XXXXXXXX");
 			if (tmp_dir == NULL)
 				err(1, "strdup");
 			tmp_dir = mkdtemp(tmp_dir);
 			if (tmp_dir == NULL)
 				err(1, "mkdtemp");
 			rv = asprintf(&tmp_path, "%s/testfile", tmp_dir);
 			if (rv <= 0)
 				err(1, "asprintf");
 		}
 	}
 
 	error = clock_getres(CLOCK_REALTIME, &ts_res);
 	assert(error == 0);
 	printf("Clock resolution: %ju.%09ju\n", (uintmax_t)ts_res.tv_sec,
 	    (uintmax_t)ts_res.tv_nsec);
 	printf("test\tloop\ttime\titerations\tperiteration\n");
 
 	for (j = 0; j < argc; j++) {
 		uintmax_t calls, nsecsperit;
 
 		/* Look the test up again; the names were validated above. */
 		the_test = NULL;
 		for (i = 0; i < tests_count; i++) {
 			if (strcmp(argv[j], tests[i].t_name) == 0)
 				the_test = &tests[i];
 		}
 
 		/* (Re)create the temporary file and size it to 1000000 bytes. */
 		if (tmp_path != NULL) {
 			fd = open(tmp_path, O_WRONLY | O_CREAT, 0700);
 			if (fd < 0)
 				err(1, "cannot open %s", tmp_path);
 			error = ftruncate(fd, 1000000);
 			if (error != 0)
 				err(1, "ftruncate");
 			error = close(fd);
 			if (error != 0)
 				err(1, "close");
 			path = tmp_path;
 		}
 
 		/*
 		 * Run one warmup, then do the real thing (loops) times.
 		 */
 		the_test->t_func(iterations, the_test->t_int, path);
 		calls = 0;
 		for (k = 0; k < loops; k++) {
 			calls = the_test->t_func(iterations, the_test->t_int,
 			    path);
-			timespecsub(&ts_end, &ts_start);
+			timespecsub(&ts_end, &ts_start, &ts_end);
 			printf("%s\t%ju\t", the_test->t_name, k);
 			printf("%ju.%09ju\t%ju\t", (uintmax_t)ts_end.tv_sec,
 			    (uintmax_t)ts_end.tv_nsec, calls);
 
 		/*
 		 * Note.  This assumes that each iteration takes less than
 		 * a second, and that our total nanoseconds doesn't exceed
 		 * the room in our arithmetic unit.  Fine for system calls,
 		 * but not for long things.
 		 */
 			nsecsperit = ts_end.tv_sec * 1000000000;
 			nsecsperit += ts_end.tv_nsec;
 			/* NOTE(review): divides by zero if the test returned 0 calls. */
 			nsecsperit /= calls;
 			printf("0.%09ju\n", (uintmax_t)nsecsperit);
 		}
 	}
 
 	/* Remove the temporary file and directory created above, if any. */
 	if (tmp_path != NULL) {
 		error = unlink(tmp_path);
 		if (error != 0 && errno != ENOENT)
 			warn("cannot unlink %s", tmp_path);
 		error = rmdir(tmp_dir);
 		if (error != 0)
 			warn("cannot rmdir %s", tmp_dir);
 	}
 
 	return (0);
 }
Index: head/usr.bin/truss/setup.c
===================================================================
--- head/usr.bin/truss/setup.c	(revision 336913)
+++ head/usr.bin/truss/setup.c	(revision 336914)
@@ -1,748 +1,749 @@
 /*-
  * SPDX-License-Identifier: BSD-4-Clause
  *
  * Copyright 1997 Sean Eric Fagan
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by Sean Eric Fagan
  * 4. Neither the name of the author may be used to endorse or promote
  *    products derived from this software without specific prior written
  *    permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * Various setup functions for truss.  Not the cleanest-written code,
  * I'm afraid.
  */
 
 #include <sys/ptrace.h>
 #include <sys/sysctl.h>
+#include <sys/time.h>
 #include <sys/wait.h>
 
 #include <assert.h>
 #include <err.h>
 #include <errno.h>
 #include <signal.h>
 #include <stdbool.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sysdecode.h>
 #include <time.h>
 #include <unistd.h>
 
 #include "truss.h"
 #include "syscall.h"
 #include "extern.h"
 
 SET_DECLARE(procabi, struct procabi);
 
 static sig_atomic_t detaching;
 
 static void	enter_syscall(struct trussinfo *, struct threadinfo *,
 		    struct ptrace_lwpinfo *);
 static void	new_proc(struct trussinfo *, pid_t, lwpid_t);
 
 /*
  * setup_and_wait() is called to start a process.  All it really does
  * is vfork(), enable tracing in the child, and then exec the given
  * command.  At that point, the child process stops, and the parent
  * can wake up and deal with it.
  *
  * info    - truss state the new process is registered in via new_proc()
  * command - NULL-terminated argument vector; command[0] is passed to
  *           execvp()
  *
  * Terminates truss with err() if vfork() or waitpid() fails.
  */
 void
 setup_and_wait(struct trussinfo *info, char *command[])
 {
 	pid_t pid;
 
 	pid = vfork();
 	if (pid == -1)
 		err(1, "fork failed");
 	if (pid == 0) {	/* Child */
 		ptrace(PT_TRACE_ME, 0, 0, 0);
 		execvp(command[0], command);
 		/*
 		 * Still running in the vfork child: report the failure
 		 * and leave through _exit().  err() would call exit().
 		 */
 		warn("execvp %s", command[0]);
 		_exit(1);
 	}
 
 	/* Only in the parent here */
 	if (waitpid(pid, NULL, 0) < 0)
 		err(1, "unexpect stop in waitpid");
 
 	new_proc(info, pid, 0);
 }
 
 /*
  * start_tracing() attaches to an already-running process.  PT_ATTACH is
  * tried repeatedly (the first attempt plus up to ten retries), sleeping
  * 200 microseconds after every attempt; if the last attempt still fails
  * truss exits.  After a successful attach the stop is collected with
  * waitpid() and the process is registered via new_proc().
  */
 void
 start_tracing(struct trussinfo *info, pid_t pid)
 {
 	int rc, retries_left;
 
 	retries_left = 10;
 	for (;;) {
 		rc = ptrace(PT_ATTACH, pid, NULL, 0);
 		usleep(200);
 		if (rc == 0)
 			break;
 		if (retries_left-- <= 0)
 			break;
 	}
 	if (rc != 0)
 		err(1, "can not attach to target process");
 
 	if (waitpid(pid, NULL, 0) < 0)
 		err(1, "Unexpect stop in waitpid");
 
 	new_proc(info, pid, 0);
 }
 
 /*
  * Signal handler used to restore a process back to its pre-truss state.
  * Installed for SIGINT, SIGTERM and SIGQUIT; only relevant when truss
  * was told to monitor an already-existing process.  The handler merely
  * raises the `detaching' flag, which eventloop() polls.
  */
 void
 restore_proc(int signo __unused)
 {
 
 	/* Request the detach; the event loop acts on the flag. */
 	detaching = 1;
 }
 
 /*
  * Detach truss from one traced process: stop it with SIGSTOP, collect
  * the stop with waitpid(), issue PT_DETACH and finally send SIGCONT.
  * A waitpid() or PT_DETACH failure terminates truss.
  */
 static void
 detach_proc(pid_t pid)
 {
 	int rc;
 
 	/* The target has to be stopped before it can be detached. */
 	kill(pid, SIGSTOP);
 	rc = waitpid(pid, NULL, 0);
 	if (rc < 0)
 		err(1, "Unexpected stop in waitpid");
 
 	rc = ptrace(PT_DETACH, pid, (caddr_t)1, 0);
 	if (rc < 0)
 		err(1, "Can not detach the process");
 
 	kill(pid, SIGCONT);
 }
 
 /*
  * Determine the ABI of a process.  Called after every exec and when a
  * process is first monitored.  The sysvec name is fetched with the
  * kern.proc sysctl (KERN_PROC_SV_NAME) and compared against the `type'
  * of every entry in the procabi linker set.
  *
  * Returns the matching entry, or NULL (after a warning) when the ABI is
  * not supported.  Exits if the sysctl itself fails.
  */
 static struct procabi *
 find_abi(pid_t pid)
 {
 	int mib[4] = { CTL_KERN, KERN_PROC, KERN_PROC_SV_NAME, pid };
 	char svname[32];
 	size_t svlen = sizeof(svname);
 	struct procabi **candidate;
 
 	if (sysctl(mib, 4, svname, &svlen, NULL, 0) != 0)
 		err(2, "can not get sysvec name");
 
 	SET_FOREACH(candidate, procabi) {
 		if (strcmp((*candidate)->type, svname) != 0)
 			continue;
 		return (*candidate);
 	}
 
 	warnx("ABI %s for pid %ld is not supported", svname, (long)pid);
 	return (NULL);
 }
 
 /*
  * Allocate a zeroed threadinfo for LWP `lwpid', link it at the head of
  * p's thread list and return it.  Exits if the LWP is already on the
  * list or if the allocation fails.
  */
 static struct threadinfo *
 new_thread(struct procinfo *p, lwpid_t lwpid)
 {
 	struct threadinfo *td;
 
 	/*
 	 * A duplicate means truss itself is buggy.  Unfortunately exiting
 	 * here kills every process truss is attached to.
 	 */
 	for (td = LIST_FIRST(&p->threadlist); td != NULL;
 	    td = LIST_NEXT(td, entries)) {
 		if (td->tid == lwpid)
 			errx(1, "Duplicate thread for LWP %ld", (long)lwpid);
 	}
 
 	if ((td = calloc(1, sizeof(struct threadinfo))) == NULL)
 		err(1, "calloc() failed");
 	td->tid = lwpid;
 	td->proc = p;
 	LIST_INSERT_HEAD(&p->threadlist, td, entries);
 	return (td);
 }
 
 /*
  * Unlink a thread record from the list it is on and release it.
  */
 static void
 free_thread(struct threadinfo *t)
 {
 
 	/* Unlink first: LIST_REMOVE reads the entry's link fields. */
 	LIST_REMOVE(t, entries);
 	free(t);
 }
 
 /*
  * Create a thread record for every LWP currently in process p.  The LWP
  * ids are fetched with PT_GETNUMLWPS / PT_GETLWPLIST.  An LWP that
  * PT_LWPINFO reports with PL_FLAG_SCE set is made the current thread
  * and has its system call entry processed immediately.
  *
  * Exits on any ptrace or allocation failure.
  */
 static void
 add_threads(struct trussinfo *info, struct procinfo *p)
 {
 	struct ptrace_lwpinfo pl;
 	struct threadinfo *t;
 	lwpid_t *lwps;
 	int i, nlwps;
 
 	nlwps = ptrace(PT_GETNUMLWPS, p->pid, NULL, 0);
 	if (nlwps == -1)
 		err(1, "Unable to fetch number of LWPs");
 	assert(nlwps > 0);
 	lwps = calloc(nlwps, sizeof(*lwps));
 	/* Do not hand a NULL buffer to PT_GETLWPLIST. */
 	if (lwps == NULL)
 		err(1, "calloc() failed");
 	nlwps = ptrace(PT_GETLWPLIST, p->pid, (caddr_t)lwps, nlwps);
 	if (nlwps == -1)
 		err(1, "Unable to fetch LWP list");
 	for (i = 0; i < nlwps; i++) {
 		t = new_thread(p, lwps[i]);
 		if (ptrace(PT_LWPINFO, lwps[i], (caddr_t)&pl, sizeof(pl)) == -1)
 			err(1, "ptrace(PT_LWPINFO)");
 		if (pl.pl_flags & PL_FLAG_SCE) {
 			info->curthread = t;
 			enter_syscall(info, t, &pl);
 		}
 	}
 	free(lwps);
 }
 
 /*
  * Start tracking process `pid': enable fork following (when FOLLOWFORKS
  * is set) and LWP events, allocate its procinfo, look up its ABI and
  * put it at the head of info->proclist.
  *
  * lwpid - when non-zero only that single thread is recorded; when zero
  *         all current threads are enumerated through add_threads().
  *
  * Exits on a duplicate pid, a ptrace failure or an allocation failure.
  */
 static void
 new_proc(struct trussinfo *info, pid_t pid, lwpid_t lwpid)
 {
 	struct procinfo *np;
 
 	/*
 	 * If this happens it means there is a bug in truss.  Unfortunately
 	 * this will kill any processes truss is attached to.
 	 */
 	LIST_FOREACH(np, &info->proclist, entries) {
 		if (np->pid == pid)
 			errx(1, "Duplicate process for pid %ld", (long)pid);
 	}
 
 	if (info->flags & FOLLOWFORKS)
 		if (ptrace(PT_FOLLOW_FORK, pid, NULL, 1) == -1)
 			err(1, "Unable to follow forks for pid %ld", (long)pid);
 	if (ptrace(PT_LWP_EVENTS, pid, NULL, 1) == -1)
 		err(1, "Unable to enable LWP events for pid %ld", (long)pid);
 	np = calloc(1, sizeof(struct procinfo));
 	/* Same failure handling as new_thread(). */
 	if (np == NULL)
 		err(1, "calloc() failed");
 	np->pid = pid;
 	np->abi = find_abi(pid);
 	LIST_INIT(&np->threadlist);
 	LIST_INSERT_HEAD(&info->proclist, np, entries);
 
 	if (lwpid != 0)
 		new_thread(np, lwpid);
 	else
 		add_threads(info, np);
 }
 
 /*
  * Release a process record: free every thread on its thread list,
  * unlink the record from the process list and free it.
  */
 static void
 free_proc(struct procinfo *p)
 {
 	struct threadinfo *cur, *next;
 
 	for (cur = LIST_FIRST(&p->threadlist); cur != NULL; cur = next) {
 		/* Fetch the successor before the node is released. */
 		next = LIST_NEXT(cur, entries);
 		free(cur);
 	}
 	LIST_REMOVE(p, entries);
 	free(p);
 }
 
 static void
 detach_all_procs(struct trussinfo *info)
 {
 	struct procinfo *p, *p2;
 
 	LIST_FOREACH_SAFE(p, &info->proclist, entries, p2) {
 		detach_proc(p->pid);
 		free_proc(p);
 	}
 }
 
 static struct procinfo *
 find_proc(struct trussinfo *info, pid_t pid)
 {
 	struct procinfo *np;
 
 	LIST_FOREACH(np, &info->proclist, entries) {
 		if (np->pid == pid)
 			return (np);
 	}
 
 	return (NULL);
 }
 
 /*
  * Point info->curthread at the thread identified by (pid, lwpid).
  * The process must already be tracked (asserted); an unknown LWP
  * terminates truss.
  */
 static void
 find_thread(struct trussinfo *info, pid_t pid, lwpid_t lwpid)
 {
 	struct procinfo *owner;
 	struct threadinfo *cur;
 
 	owner = find_proc(info, pid);
 	assert(owner != NULL);
 
 	for (cur = LIST_FIRST(&owner->threadlist); cur != NULL;
 	    cur = LIST_NEXT(cur, entries)) {
 		if (cur->tid != lwpid)
 			continue;
 		info->curthread = cur;
 		return;
 	}
 	errx(1, "could not find thread");
 }
 
 /*
  * When a process exits, it should have exactly one thread left; all of
  * the other threads should have reported thread exit events already.
  * Make that sole remaining thread the current thread, asserting that it
  * exists and has no successor on the list.
  */
 static void
 find_exit_thread(struct trussinfo *info, pid_t pid)
 {
 	struct procinfo *owner;
 	struct threadinfo *last;
 
 	owner = find_proc(info, pid);
 	assert(owner != NULL);
 
 	last = LIST_FIRST(&owner->threadlist);
 	info->curthread = last;
 	assert(last != NULL);
 	assert(LIST_NEXT(last, entries) == NULL);
 }
 
 /*
  * Mark thread t as being inside a system call.  The per-thread call
  * state is asserted to be clean beforehand; the raw argument array is
  * zeroed and the syscall number is taken from pl->pl_syscall_code.
  */
 static void
 alloc_syscall(struct threadinfo *t, struct ptrace_lwpinfo *pl)
 {
 	u_int slot;
 
 	assert(t->in_syscall == 0);
 	assert(t->cs.number == 0);
 	assert(t->cs.sc == NULL);
 	assert(t->cs.nargs == 0);
 	slot = 0;
 	while (slot < nitems(t->cs.s_args)) {
 		assert(t->cs.s_args[slot] == NULL);
 		slot++;
 	}
 
 	memset(t->cs.args, 0, sizeof(t->cs.args));
 	t->cs.number = pl->pl_syscall_code;
 	t->in_syscall = 1;
 }
 
 /*
  * Drop the current system call state of thread t: free the formatted
  * argument strings, zero the whole call record and clear in_syscall.
  */
 static void
 free_syscall(struct threadinfo *t)
 {
 	u_int slot = 0;
 
 	while (slot < t->cs.nargs) {
 		free(t->cs.s_args[slot]);
 		slot++;
 	}
 	memset(&t->cs, 0, sizeof(t->cs));
 	t->in_syscall = 0;
 }
 
 /*
  * Handle a system call entry stop for thread t: record the call in
  * t->cs, fetch its raw arguments through the ABI's fetch_args hook,
  * resolve the syscall description and pre-format every argument that is
  * not marked OUT.  Finally stamp t->before with the current time.
  *
  * If fetching the arguments fails the call state is discarded and the
  * function returns without recording anything.
  */
 static void
 enter_syscall(struct trussinfo *info, struct threadinfo *t,
     struct ptrace_lwpinfo *pl)
 {
 	struct syscall *sc;
 	u_int i, narg;
 
 	alloc_syscall(t, pl);
 	/* Never fetch more arguments than t->cs.args has room for. */
 	narg = MIN(pl->pl_syscall_narg, nitems(t->cs.args));
 	if (narg != 0 && t->proc->abi->fetch_args(info, narg) != 0) {
 		free_syscall(t);
 		return;
 	}
 
 	sc = get_syscall(t, t->cs.number, narg);
 	if (sc->unknown)
 		fprintf(info->outfile, "-- UNKNOWN %s SYSCALL %d --\n",
 		    t->proc->abi->type, t->cs.number);
 
 	t->cs.nargs = sc->nargs;
 	assert(sc->nargs <= nitems(t->cs.s_args));
 
 	t->cs.sc = sc;
 
 	/*
 	 * At this point, we set up the system call arguments.
 	 * We ignore any OUT ones, however -- those are arguments that
 	 * are set by the system call, and so are probably meaningless
 	 * now.	This doesn't currently support arguments that are
 	 * passed in *and* out, however.
 	 */
 #if DEBUG
 	fprintf(stderr, "syscall %s(", sc->name);
 #endif
 	for (i = 0; i < t->cs.nargs; i++) {
 #if DEBUG
 		fprintf(stderr, "0x%lx%s", t->cs.args[sc->args[i].offset],
 		    i < (t->cs.nargs - 1) ? "," : "");
 #endif
 		if (!(sc->args[i].type & OUT)) {
 			t->cs.s_args[i] = print_arg(&sc->args[i],
 			    t->cs.args, 0, info);
 		}
 	}
 #if DEBUG
 	fprintf(stderr, ")\n");
 #endif
 
 	/* Entry timestamp; print_line_prefix() subtracts it from t->after. */
 	clock_gettime(CLOCK_REALTIME, &t->before);
 }
 
 /*
  * When a thread exits voluntarily (including when a thread calls
  * exit() to trigger a process exit), its internal state still holds
  * the arguments passed to the exit system call.  Once the exit is
  * reported, log that pending system call without a return value and
  * drop the call state.  Does nothing if the current thread is not
  * inside a system call.
  */
 static void
 thread_exit_syscall(struct trussinfo *info)
 {
 	struct threadinfo *cur = info->curthread;
 
 	if (cur->in_syscall) {
 		clock_gettime(CLOCK_REALTIME, &cur->after);
 
 		print_syscall_ret(info, 0, NULL);
 		free_syscall(cur);
 	}
 }
 
 /*
  * Handle a system call exit stop for the current thread: stamp
  * t->after, fetch the return value through the ABI's fetch_retval hook,
  * format the OUT arguments, print the completed call and drop the call
  * state.
  *
  * If the stop also reports an exec (PL_FLAG_EXEC) the process ABI is
  * looked up again; a process whose new ABI is unsupported is detached
  * and forgotten, and info->curthread is cleared because the thread
  * record it pointed to has been freed.
  *
  * Does nothing if the current thread is not inside a system call.
  */
 static void
 exit_syscall(struct trussinfo *info, struct ptrace_lwpinfo *pl)
 {
 	struct threadinfo *t;
 	struct procinfo *p;
 	struct syscall *sc;
 	long retval[2];
 	u_int i;
 	int errorp;
 
 	t = info->curthread;
 	if (!t->in_syscall)
 		return;
 
 	clock_gettime(CLOCK_REALTIME, &t->after);
 	p = t->proc;
 	if (p->abi->fetch_retval(info, retval, &errorp) < 0) {
 		free_syscall(t);
 		return;
 	}
 
 	sc = t->cs.sc;
 	/*
 	 * Here, we only look for arguments that have OUT masked in --
 	 * otherwise, they were handled in enter_syscall().
 	 */
 	for (i = 0; i < sc->nargs; i++) {
 		char *temp;
 
 		if (sc->args[i].type & OUT) {
 			/*
 			 * If an error occurred, then don't bother
 			 * getting the data; it may not be valid.
 			 */
 			if (errorp) {
 				/*
 				 * Do not store the pointer if asprintf()
 				 * failed; its value is not usable then.
 				 */
 				if (asprintf(&temp, "0x%lx",
 				    t->cs.args[sc->args[i].offset]) == -1)
 					err(1, "asprintf");
 			} else {
 				temp = print_arg(&sc->args[i],
 				    t->cs.args, retval, info);
 			}
 			t->cs.s_args[i] = temp;
 		}
 	}
 
 	print_syscall_ret(info, errorp, retval);
 	free_syscall(t);
 
 	/*
 	 * If the process executed a new image, check the ABI.  If the
 	 * new ABI isn't supported, stop tracing this process.
 	 */
 	if (pl->pl_flags & PL_FLAG_EXEC) {
 		assert(LIST_NEXT(LIST_FIRST(&p->threadlist), entries) == NULL);
 		p->abi = find_abi(p->pid);
 		if (p->abi == NULL) {
 			if (ptrace(PT_DETACH, p->pid, (caddr_t)1, 0) < 0)
 				err(1, "Can not detach the process");
 			free_proc(p);
 			/* t was released by free_proc(); drop the stale pointer. */
 			info->curthread = NULL;
 		}
 	}
 }
 
 /*
  * Write the per-line prefix for the current thread to info->outfile:
  * the pid when FOLLOWFORKS is set, the thread id when DISPLAYTIDS is
  * set, then the time since info->start_time (ABSOLUTETIMESTAMPS) and/or
  * the time between t->before and t->after (RELATIVETIMESTAMPS).
  *
  * Returns the sum of the fprintf() return values.
  */
 int
 print_line_prefix(struct trussinfo *info)
 {
 	struct timespec timediff;
 	struct threadinfo *t;
 	int len;
 
 	len = 0;
 	t = info->curthread;
 	if (info->flags & (FOLLOWFORKS | DISPLAYTIDS)) {
 		if (info->flags & FOLLOWFORKS)
 			len += fprintf(info->outfile, "%5d", t->proc->pid);
 		/* A separating space is only needed when both ids are shown. */
 		if ((info->flags & (FOLLOWFORKS | DISPLAYTIDS)) ==
 		    (FOLLOWFORKS | DISPLAYTIDS))
 			len += fprintf(info->outfile, " ");
 		if (info->flags & DISPLAYTIDS)
 			len += fprintf(info->outfile, "%6d", t->tid);
 		len += fprintf(info->outfile, ": ");
 	}
 	if (info->flags & ABSOLUTETIMESTAMPS) {
-		timespecsubt(&t->after, &info->start_time, &timediff);
+		timespecsub(&t->after, &info->start_time, &timediff);
 		len += fprintf(info->outfile, "%jd.%09ld ",
 		    (intmax_t)timediff.tv_sec, timediff.tv_nsec);
 	}
 	if (info->flags & RELATIVETIMESTAMPS) {
-		timespecsubt(&t->after, &t->before, &timediff);
+		timespecsub(&t->after, &t->before, &timediff);
 		len += fprintf(info->outfile, "%jd.%09ld ",
 		    (intmax_t)timediff.tv_sec, timediff.tv_nsec);
 	}
 	return (len);
 }
 
 /*
  * Log that the current thread has exited.  The thread's `after'
  * timestamp is refreshed first so that the line prefix uses the current
  * time.
  */
 static void
 report_thread_death(struct trussinfo *info)
 {
 	struct threadinfo *cur = info->curthread;
 
 	clock_gettime(CLOCK_REALTIME, &cur->after);
 	print_line_prefix(info);
 	fprintf(info->outfile, "<thread %ld exited>\n", (long)cur->tid);
 }
 
 /*
  * Log that a new thread has appeared.  Both timestamps of the thread
  * are set to the current time before the line prefix is printed.
  */
 static void
 report_thread_birth(struct trussinfo *info)
 {
 	struct threadinfo *cur = info->curthread;
 
 	clock_gettime(CLOCK_REALTIME, &cur->after);
 	cur->before = cur->after;
 	print_line_prefix(info);
 	fprintf(info->outfile, "<new thread %ld>\n", (long)cur->tid);
 }
 
 /*
  * Log the termination of a process as described by `si': either a
  * normal exit with its return value, or death by signal, noting a core
  * dump when si_code is CLD_DUMPED.
  */
 static void
 report_exit(struct trussinfo *info, siginfo_t *si)
 {
 	struct threadinfo *cur = info->curthread;
 	const char *core_note;
 
 	clock_gettime(CLOCK_REALTIME, &cur->after);
 	print_line_prefix(info);
 
 	if (si->si_code == CLD_EXITED) {
 		fprintf(info->outfile, "process exit, rval = %u\n",
 		    si->si_status);
 		return;
 	}
 
 	if (si->si_code == CLD_DUMPED)
 		core_note = " (core dumped)";
 	else
 		core_note = "";
 	fprintf(info->outfile, "process killed, signal = %u%s\n",
 	    si->si_status, core_note);
 }
 
 /*
  * Log that a new process is now being traced.  Both timestamps of the
  * current thread are set to the current time before the line prefix is
  * printed.
  */
 static void
 report_new_child(struct trussinfo *info)
 {
 	struct threadinfo *cur = info->curthread;
 
 	clock_gettime(CLOCK_REALTIME, &cur->after);
 	cur->before = cur->after;
 	print_line_prefix(info);
 	fprintf(info->outfile, "<new process>\n");
 }
 
 /*
  * Print a decoded view of `si' to fp as a run of " key=value" fields.
  * The signal code is shown symbolically when sysdecode_sigcode() knows
  * it and numerically otherwise; the remaining fields depend on the code
  * and, for codes not listed explicitly, on the signal number.
  */
 void
 decode_siginfo(FILE *fp, siginfo_t *si)
 {
 	const char *code_name;
 	int signo;
 
 	fprintf(fp, " code=");
 	code_name = sysdecode_sigcode(si->si_signo, si->si_code);
 	if (code_name != NULL)
 		fprintf(fp, "%s", code_name);
 	else
 		fprintf(fp, "%d", si->si_code);
 
 	switch (si->si_code) {
 	case SI_NOINFO:
 		break;
 	case SI_QUEUE:
 		fprintf(fp, " value=%p", si->si_value.sival_ptr);
 		/* FALLTHROUGH */
 	case SI_USER:
 	case SI_LWP:
 		fprintf(fp, " pid=%jd uid=%jd", (intmax_t)si->si_pid,
 		    (intmax_t)si->si_uid);
 		break;
 	case SI_TIMER:
 		fprintf(fp, " value=%p", si->si_value.sival_ptr);
 		fprintf(fp, " timerid=%d", si->si_timerid);
 		fprintf(fp, " overrun=%d", si->si_overrun);
 		if (si->si_errno != 0)
 			fprintf(fp, " errno=%d", si->si_errno);
 		break;
 	case SI_ASYNCIO:
 		fprintf(fp, " value=%p", si->si_value.sival_ptr);
 		break;
 	case SI_MESGQ:
 		fprintf(fp, " value=%p", si->si_value.sival_ptr);
 		fprintf(fp, " mqd=%d", si->si_mqd);
 		break;
 	default:
 		/* Any other code: the fields depend on the signal. */
 		signo = si->si_signo;
 		if (signo == SIGILL || signo == SIGFPE ||
 		    signo == SIGSEGV || signo == SIGBUS) {
 			fprintf(fp, " trapno=%d", si->si_trapno);
 			fprintf(fp, " addr=%p", si->si_addr);
 		} else if (signo == SIGCHLD) {
 			fprintf(fp, " pid=%jd uid=%jd", (intmax_t)si->si_pid,
 			    (intmax_t)si->si_uid);
 			fprintf(fp, " status=%d", si->si_status);
 		}
 	}
 }
 
 /*
  * Log a signal delivered to the current thread.  The signal number is
  * taken from si->si_status and shown with its name ("?" when unknown).
  * When the stop is a signal event that carries siginfo, the decoded
  * siginfo is appended to the same line.
  */
 static void
 report_signal(struct trussinfo *info, siginfo_t *si, struct ptrace_lwpinfo *pl)
 {
 	struct threadinfo *cur = info->curthread;
 	const char *name;
 
 	clock_gettime(CLOCK_REALTIME, &cur->after);
 	print_line_prefix(info);
 
 	if ((name = sysdecode_signal(si->si_status)) == NULL)
 		name = "?";
 	fprintf(info->outfile, "SIGNAL %u (%s)", si->si_status, name);
 
 	if (pl->pl_event == PL_EVENT_SIGNAL && pl->pl_flags & PL_FLAG_SI)
 		decode_siginfo(info->outfile, &pl->pl_siginfo);
 	fprintf(info->outfile, "\n");
 }
 
 /*
  * Wait for events until all the processes have exited or truss has been
  * asked to stop.
  *
  * Each waitid() result is dispatched on si_code: exits and kills are
  * reported and the process record released; trace stops are classified
  * from the PT_LWPINFO flags (new child, thread birth/exit, syscall
  * entry/exit, or a plain signal) and the process is then resumed with
  * PT_SYSCALL, passing on the signal if one is pending.
  */
 void
 eventloop(struct trussinfo *info)
 {
 	struct ptrace_lwpinfo pl;
 	siginfo_t si;
 	int pending_signal;
 
 	while (!LIST_EMPTY(&info->proclist)) {
 		/* Set by restore_proc(): detach from everything and stop. */
 		if (detaching) {
 			detach_all_procs(info);
 			return;
 		}
 
 		if (waitid(P_ALL, 0, &si, WTRAPPED | WEXITED) == -1) {
 			/* Interrupted: go back and re-check `detaching'. */
 			if (errno == EINTR)
 				continue;
 			err(1, "Unexpected error from waitid");
 		}
 
 		assert(si.si_signo == SIGCHLD);
 
 		switch (si.si_code) {
 		case CLD_EXITED:
 		case CLD_KILLED:
 		case CLD_DUMPED:
 			find_exit_thread(info, si.si_pid);
 			if ((info->flags & COUNTONLY) == 0) {
 				if (si.si_code == CLD_EXITED)
 					thread_exit_syscall(info);
 				report_exit(info, &si);
 			}
 			free_proc(info->curthread->proc);
 			info->curthread = NULL;
 			break;
 		case CLD_TRAPPED:
 			if (ptrace(PT_LWPINFO, si.si_pid, (caddr_t)&pl,
 			    sizeof(pl)) == -1)
 				err(1, "ptrace(PT_LWPINFO)");
 
 			/*
 			 * Register a new process or thread before looking
 			 * the current thread up below.
 			 */
 			if (pl.pl_flags & PL_FLAG_CHILD) {
 				new_proc(info, si.si_pid, pl.pl_lwpid);
 				assert(LIST_FIRST(&info->proclist)->abi !=
 				    NULL);
 			} else if (pl.pl_flags & PL_FLAG_BORN)
 				new_thread(find_proc(info, si.si_pid),
 				    pl.pl_lwpid);
 			find_thread(info, si.si_pid, pl.pl_lwpid);
 
 			if (si.si_status == SIGTRAP &&
 			    (pl.pl_flags & (PL_FLAG_BORN|PL_FLAG_EXITED|
 			    PL_FLAG_SCE|PL_FLAG_SCX)) != 0) {
 				if (pl.pl_flags & PL_FLAG_BORN) {
 					if ((info->flags & COUNTONLY) == 0)
 						report_thread_birth(info);
 				} else if (pl.pl_flags & PL_FLAG_EXITED) {
 					if ((info->flags & COUNTONLY) == 0)
 						report_thread_death(info);
 					free_thread(info->curthread);
 					info->curthread = NULL;
 				} else if (pl.pl_flags & PL_FLAG_SCE)
 					enter_syscall(info, info->curthread, &pl);
 				else if (pl.pl_flags & PL_FLAG_SCX)
 					exit_syscall(info, &pl);
 				pending_signal = 0;
 			} else if (pl.pl_flags & PL_FLAG_CHILD) {
 				if ((info->flags & COUNTONLY) == 0)
 					report_new_child(info);
 				pending_signal = 0;
 			} else {
 				/* A real signal: report it and pass it on. */
 				if ((info->flags & NOSIGS) == 0)
 					report_signal(info, &si, &pl);
 				pending_signal = si.si_status;
 			}
 			/* Resume the process, delivering pending_signal if non-zero. */
 			ptrace(PT_SYSCALL, si.si_pid, (caddr_t)1,
 			    pending_signal);
 			break;
 		case CLD_STOPPED:
 			errx(1, "waitid reported CLD_STOPPED");
 		case CLD_CONTINUED:
 			break;
 		}
 	}
 }
Index: head/usr.bin/truss/syscalls.c
===================================================================
--- head/usr.bin/truss/syscalls.c	(revision 336913)
+++ head/usr.bin/truss/syscalls.c	(revision 336914)
@@ -1,2709 +1,2710 @@
 /*-
  * SPDX-License-Identifier: BSD-4-Clause
  *
  * Copyright 1997 Sean Eric Fagan
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by Sean Eric Fagan
  * 4. Neither the name of the author may be used to endorse or promote
  *    products derived from this software without specific prior written
  *    permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * This file has routines used to print out system calls and their
  * arguments.
  */
 
 #include <sys/capsicum.h>
 #include <sys/types.h>
 #define	_WANT_FREEBSD11_KEVENT
 #include <sys/event.h>
 #include <sys/ioccom.h>
 #include <sys/mount.h>
 #include <sys/ptrace.h>
 #include <sys/resource.h>
 #include <sys/socket.h>
 #define _WANT_FREEBSD11_STAT
 #include <sys/stat.h>
+#include <sys/time.h>
 #include <sys/un.h>
 #include <sys/wait.h>
 #include <netinet/in.h>
 #include <netinet/sctp.h>
 #include <arpa/inet.h>
 
 #include <assert.h>
 #include <ctype.h>
 #include <err.h>
 #include <fcntl.h>
 #include <poll.h>
 #include <sched.h>
 #include <signal.h>
 #include <stdbool.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sysdecode.h>
 #include <unistd.h>
 #include <vis.h>
 
 #include <contrib/cloudabi/cloudabi_types_common.h>
 
 #include "truss.h"
 #include "extern.h"
 #include "syscall.h"
 
 /*
  * This should probably be in its own file, sorted alphabetically.
  */
 static struct syscall decoded_syscalls[] = {
 	/* Native ABI */
 	{ .name = "__acl_aclcheck_fd", .ret_type = 1, .nargs = 3,
 	  .args = { { Int, 0 }, { Acltype, 1 }, { Ptr, 2 } } },
 	{ .name = "__acl_aclcheck_file", .ret_type = 1, .nargs = 3,
 	  .args = { { Name, 0 }, { Acltype, 1 }, { Ptr, 2 } } },
 	{ .name = "__acl_aclcheck_link", .ret_type = 1, .nargs = 3,
 	  .args = { { Name, 0 }, { Acltype, 1 }, { Ptr, 2 } } },
 	{ .name = "__acl_delete_fd", .ret_type = 1, .nargs = 2,
 	  .args = { { Int, 0 }, { Acltype, 1 } } },
 	{ .name = "__acl_delete_file", .ret_type = 1, .nargs = 2,
 	  .args = { { Name, 0 }, { Acltype, 1 } } },
 	{ .name = "__acl_delete_link", .ret_type = 1, .nargs = 2,
 	  .args = { { Name, 0 }, { Acltype, 1 } } },
 	{ .name = "__acl_get_fd", .ret_type = 1, .nargs = 3,
 	  .args = { { Int, 0 }, { Acltype, 1 }, { Ptr, 2 } } },
 	{ .name = "__acl_get_file", .ret_type = 1, .nargs = 3,
 	  .args = { { Name, 0 }, { Acltype, 1 }, { Ptr, 2 } } },
 	{ .name = "__acl_get_link", .ret_type = 1, .nargs = 3,
 	  .args = { { Name, 0 }, { Acltype, 1 }, { Ptr, 2 } } },
 	{ .name = "__acl_set_fd", .ret_type = 1, .nargs = 3,
 	  .args = { { Int, 0 }, { Acltype, 1 }, { Ptr, 2 } } },
 	{ .name = "__acl_set_file", .ret_type = 1, .nargs = 3,
 	  .args = { { Name, 0 }, { Acltype, 1 }, { Ptr, 2 } } },
 	{ .name = "__acl_set_link", .ret_type = 1, .nargs = 3,
 	  .args = { { Name, 0 }, { Acltype, 1 }, { Ptr, 2 } } },
 	{ .name = "__cap_rights_get", .ret_type = 1, .nargs = 3,
 	  .args = { { Int, 0 }, { Int, 1 }, { CapRights | OUT, 2 } } },
 	{ .name = "__getcwd", .ret_type = 1, .nargs = 2,
 	  .args = { { Name | OUT, 0 }, { Int, 1 } } },
 	{ .name = "_umtx_op", .ret_type = 1, .nargs = 5,
 	  .args = { { Ptr, 0 }, { Umtxop, 1 }, { LongHex, 2 }, { Ptr, 3 },
 		    { Ptr, 4 } } },
 	{ .name = "accept", .ret_type = 1, .nargs = 3,
 	  .args = { { Int, 0 }, { Sockaddr | OUT, 1 }, { Ptr | OUT, 2 } } },
 	{ .name = "access", .ret_type = 1, .nargs = 2,
 	  .args = { { Name | IN, 0 }, { Accessmode, 1 } } },
 	{ .name = "bind", .ret_type = 1, .nargs = 3,
 	  .args = { { Int, 0 }, { Sockaddr | IN, 1 }, { Socklent, 2 } } },
 	{ .name = "bindat", .ret_type = 1, .nargs = 4,
 	  .args = { { Atfd, 0 }, { Int, 1 }, { Sockaddr | IN, 2 },
 		    { Int, 3 } } },
 	{ .name = "break", .ret_type = 1, .nargs = 1,
 	  .args = { { Ptr, 0 } } },
 	{ .name = "cap_fcntls_get", .ret_type = 1, .nargs = 2,
 	  .args = { { Int, 0 }, { CapFcntlRights | OUT, 1 } } },
 	{ .name = "cap_fcntls_limit", .ret_type = 1, .nargs = 2,
 	  .args = { { Int, 0 }, { CapFcntlRights, 1 } } },
 	{ .name = "cap_getmode", .ret_type = 1, .nargs = 1,
 	  .args = { { PUInt | OUT, 0 } } },
 	{ .name = "cap_rights_limit", .ret_type = 1, .nargs = 2,
 	  .args = { { Int, 0 }, { CapRights, 1 } } },
 	{ .name = "chdir", .ret_type = 1, .nargs = 1,
 	  .args = { { Name, 0 } } },
 	{ .name = "chflags", .ret_type = 1, .nargs = 2,
 	  .args = { { Name | IN, 0 }, { FileFlags, 1 } } },
 	{ .name = "chflagsat", .ret_type = 1, .nargs = 4,
 	  .args = { { Atfd, 0 }, { Name | IN, 1 }, { FileFlags, 2 },
 		    { Atflags, 3 } } },
 	{ .name = "chmod", .ret_type = 1, .nargs = 2,
 	  .args = { { Name, 0 }, { Octal, 1 } } },
 	{ .name = "chown", .ret_type = 1, .nargs = 3,
 	  .args = { { Name, 0 }, { Int, 1 }, { Int, 2 } } },
 	{ .name = "chroot", .ret_type = 1, .nargs = 1,
 	  .args = { { Name, 0 } } },
 	{ .name = "clock_gettime", .ret_type = 1, .nargs = 2,
 	  .args = { { Int, 0 }, { Timespec | OUT, 1 } } },
 	{ .name = "close", .ret_type = 1, .nargs = 1,
 	  .args = { { Int, 0 } } },
 	{ .name = "compat11.fstat", .ret_type = 1, .nargs = 2,
 	  .args = { { Int, 0 }, { Stat11 | OUT, 1 } } },
 	{ .name = "compat11.fstatat", .ret_type = 1, .nargs = 4,
 	  .args = { { Atfd, 0 }, { Name | IN, 1 }, { Stat11 | OUT, 2 },
 		    { Atflags, 3 } } },
 	{ .name = "compat11.kevent", .ret_type = 1, .nargs = 6,
 	  .args = { { Int, 0 }, { Kevent11, 1 }, { Int, 2 },
 		    { Kevent11 | OUT, 3 }, { Int, 4 }, { Timespec, 5 } } },
 	{ .name = "compat11.lstat", .ret_type = 1, .nargs = 2,
 	  .args = { { Name | IN, 0 }, { Stat11 | OUT, 1 } } },
 	{ .name = "compat11.stat", .ret_type = 1, .nargs = 2,
 	  .args = { { Name | IN, 0 }, { Stat11 | OUT, 1 } } },
 	{ .name = "connect", .ret_type = 1, .nargs = 3,
 	  .args = { { Int, 0 }, { Sockaddr | IN, 1 }, { Socklent, 2 } } },
 	{ .name = "connectat", .ret_type = 1, .nargs = 4,
 	  .args = { { Atfd, 0 }, { Int, 1 }, { Sockaddr | IN, 2 },
 		    { Int, 3 } } },
 	{ .name = "dup", .ret_type = 1, .nargs = 1,
 	  .args = { { Int, 0 } } },
 	{ .name = "dup2", .ret_type = 1, .nargs = 2,
 	  .args = { { Int, 0 }, { Int, 1 } } },
 	{ .name = "eaccess", .ret_type = 1, .nargs = 2,
 	  .args = { { Name | IN, 0 }, { Accessmode, 1 } } },
 	{ .name = "execve", .ret_type = 1, .nargs = 3,
 	  .args = { { Name | IN, 0 }, { ExecArgs | IN, 1 },
 		    { ExecEnv | IN, 2 } } },
 	{ .name = "exit", .ret_type = 0, .nargs = 1,
 	  .args = { { Hex, 0 } } },
 	{ .name = "extattr_delete_fd", .ret_type = 1, .nargs = 3,
 	  .args = { { Int, 0 }, { Extattrnamespace, 1 }, { Name, 2 } } },
 	{ .name = "extattr_delete_file", .ret_type = 1, .nargs = 3,
 	  .args = { { Name, 0 }, { Extattrnamespace, 1 }, { Name, 2 } } },
 	{ .name = "extattr_delete_link", .ret_type = 1, .nargs = 3,
 	  .args = { { Name, 0 }, { Extattrnamespace, 1 }, { Name, 2 } } },
 	{ .name = "extattr_get_fd", .ret_type = 1, .nargs = 5,
 	  .args = { { Int, 0 }, { Extattrnamespace, 1 }, { Name, 2 },
 		    { BinString | OUT, 3 }, { Sizet, 4 } } },
 	{ .name = "extattr_get_file", .ret_type = 1, .nargs = 5,
 	  .args = { { Name, 0 }, { Extattrnamespace, 1 }, { Name, 2 },
 		    { BinString | OUT, 3 }, { Sizet, 4 } } },
 	{ .name = "extattr_get_link", .ret_type = 1, .nargs = 5,
 	  .args = { { Name, 0 }, { Extattrnamespace, 1 }, { Name, 2 },
 		    { BinString | OUT, 3 }, { Sizet, 4 } } },
 	{ .name = "extattr_list_fd", .ret_type = 1, .nargs = 4,
 	  .args = { { Int, 0 }, { Extattrnamespace, 1 }, { BinString | OUT, 2 },
 		    { Sizet, 3 } } },
 	{ .name = "extattr_list_file", .ret_type = 1, .nargs = 4,
 	  .args = { { Name, 0 }, { Extattrnamespace, 1 }, { BinString | OUT, 2 },
 		    { Sizet, 3 } } },
 	{ .name = "extattr_list_link", .ret_type = 1, .nargs = 4,
 	  .args = { { Name, 0 }, { Extattrnamespace, 1 }, { BinString | OUT, 2 },
 		    { Sizet, 3 } } },
 	{ .name = "extattr_set_fd", .ret_type = 1, .nargs = 5,
 	  .args = { { Int, 0 }, { Extattrnamespace, 1 }, { Name, 2 },
 		    { BinString | IN, 3 }, { Sizet, 4 } } },
 	{ .name = "extattr_set_file", .ret_type = 1, .nargs = 5,
 	  .args = { { Name, 0 }, { Extattrnamespace, 1 }, { Name, 2 },
 		    { BinString | IN, 3 }, { Sizet, 4 } } },
 	{ .name = "extattr_set_link", .ret_type = 1, .nargs = 5,
 	  .args = { { Name, 0 }, { Extattrnamespace, 1 }, { Name, 2 },
 		    { BinString | IN, 3 }, { Sizet, 4 } } },
 	{ .name = "extattrctl", .ret_type = 1, .nargs = 5,
 	  .args = { { Name, 0 }, { Hex, 1 }, { Name, 2 },
 		    { Extattrnamespace, 3 }, { Name, 4 } } },
 	{ .name = "faccessat", .ret_type = 1, .nargs = 4,
 	  .args = { { Atfd, 0 }, { Name | IN, 1 }, { Accessmode, 2 },
 		    { Atflags, 3 } } },
 	{ .name = "fchflags", .ret_type = 1, .nargs = 2,
 	  .args = { { Int, 0 }, { FileFlags, 1 } } },
 	{ .name = "fchmod", .ret_type = 1, .nargs = 2,
 	  .args = { { Int, 0 }, { Octal, 1 } } },
 	{ .name = "fchmodat", .ret_type = 1, .nargs = 4,
 	  .args = { { Atfd, 0 }, { Name, 1 }, { Octal, 2 }, { Atflags, 3 } } },
 	{ .name = "fchown", .ret_type = 1, .nargs = 3,
 	  .args = { { Int, 0 }, { Int, 1 }, { Int, 2 } } },
 	{ .name = "fchownat", .ret_type = 1, .nargs = 5,
 	  .args = { { Atfd, 0 }, { Name, 1 }, { Int, 2 }, { Int, 3 },
 		    { Atflags, 4 } } },
 	{ .name = "fcntl", .ret_type = 1, .nargs = 3,
 	  .args = { { Int, 0 }, { Fcntl, 1 }, { Fcntlflag, 2 } } },
 	{ .name = "flock", .ret_type = 1, .nargs = 2,
 	  .args = { { Int, 0 }, { Flockop, 1 } } },
 	{ .name = "fstat", .ret_type = 1, .nargs = 2,
 	  .args = { { Int, 0 }, { Stat | OUT, 1 } } },
 	{ .name = "fstatat", .ret_type = 1, .nargs = 4,
 	  .args = { { Atfd, 0 }, { Name | IN, 1 }, { Stat | OUT, 2 },
 		    { Atflags, 3 } } },
 	{ .name = "fstatfs", .ret_type = 1, .nargs = 2,
 	  .args = { { Int, 0 }, { StatFs | OUT, 1 } } },
 	{ .name = "ftruncate", .ret_type = 1, .nargs = 2,
 	  .args = { { Int | IN, 0 }, { QuadHex | IN, 1 } } },
 	{ .name = "futimens", .ret_type = 1, .nargs = 2,
 	  .args = { { Int, 0 }, { Timespec2 | IN, 1 } } },
 	{ .name = "futimes", .ret_type = 1, .nargs = 2,
 	  .args = { { Int, 0 }, { Timeval2 | IN, 1 } } },
 	{ .name = "futimesat", .ret_type = 1, .nargs = 3,
 	  .args = { { Atfd, 0 }, { Name | IN, 1 }, { Timeval2 | IN, 2 } } },
 	{ .name = "getdirentries", .ret_type = 1, .nargs = 4,
 	  .args = { { Int, 0 }, { BinString | OUT, 1 }, { Int, 2 },
 		    { PQuadHex | OUT, 3 } } },
 	{ .name = "getfsstat", .ret_type = 1, .nargs = 3,
 	  .args = { { Ptr, 0 }, { Long, 1 }, { Getfsstatmode, 2 } } },
 	/* getitimer(which, value): the itimerval result is argument 1. */
 	{ .name = "getitimer", .ret_type = 1, .nargs = 2,
 	  .args = { { Int, 0 }, { Itimerval | OUT, 1 } } },
 	{ .name = "getpeername", .ret_type = 1, .nargs = 3,
 	  .args = { { Int, 0 }, { Sockaddr | OUT, 1 }, { Ptr | OUT, 2 } } },
 	{ .name = "getpgid", .ret_type = 1, .nargs = 1,
 	  .args = { { Int, 0 } } },
 	{ .name = "getpriority", .ret_type = 1, .nargs = 2,
 	  .args = { { Priowhich, 0 }, { Int, 1 } } },
 	{ .name = "getrandom", .ret_type = 1, .nargs = 3,
 	  .args = { { BinString | OUT, 0 }, { Sizet, 1 }, { UInt, 2 } } },
 	{ .name = "getrlimit", .ret_type = 1, .nargs = 2,
 	  .args = { { Resource, 0 }, { Rlimit | OUT, 1 } } },
 	{ .name = "getrusage", .ret_type = 1, .nargs = 2,
 	  .args = { { RusageWho, 0 }, { Rusage | OUT, 1 } } },
 	{ .name = "getsid", .ret_type = 1, .nargs = 1,
 	  .args = { { Int, 0 } } },
 	{ .name = "getsockname", .ret_type = 1, .nargs = 3,
 	  .args = { { Int, 0 }, { Sockaddr | OUT, 1 }, { Ptr | OUT, 2 } } },
 	{ .name = "getsockopt", .ret_type = 1, .nargs = 5,
 	  .args = { { Int, 0 }, { Sockoptlevel, 1 }, { Sockoptname, 2 },
 		    { Ptr | OUT, 3 }, { Ptr | OUT, 4 } } },
 	{ .name = "gettimeofday", .ret_type = 1, .nargs = 2,
 	  .args = { { Timeval | OUT, 0 }, { Ptr, 1 } } },
 	{ .name = "ioctl", .ret_type = 1, .nargs = 3,
 	  .args = { { Int, 0 }, { Ioctl, 1 }, { Ptr, 2 } } },
 	{ .name = "kevent", .ret_type = 1, .nargs = 6,
 	  .args = { { Int, 0 }, { Kevent, 1 }, { Int, 2 }, { Kevent | OUT, 3 },
 		    { Int, 4 }, { Timespec, 5 } } },
 	{ .name = "kill", .ret_type = 1, .nargs = 2,
 	  .args = { { Int | IN, 0 }, { Signal | IN, 1 } } },
 	{ .name = "kldfind", .ret_type = 1, .nargs = 1,
 	  .args = { { Name | IN, 0 } } },
 	{ .name = "kldfirstmod", .ret_type = 1, .nargs = 1,
 	  .args = { { Int, 0 } } },
 	{ .name = "kldload", .ret_type = 1, .nargs = 1,
 	  .args = { { Name | IN, 0 } } },
 	{ .name = "kldnext", .ret_type = 1, .nargs = 1,
 	  .args = { { Int, 0 } } },
 	{ .name = "kldstat", .ret_type = 1, .nargs = 2,
 	  .args = { { Int, 0 }, { Ptr, 1 } } },
 	{ .name = "kldsym", .ret_type = 1, .nargs = 3,
 	  .args = { { Int, 0 }, { Kldsymcmd, 1 }, { Ptr, 2 } } },
 	{ .name = "kldunload", .ret_type = 1, .nargs = 1,
 	  .args = { { Int, 0 } } },
 	{ .name = "kldunloadf", .ret_type = 1, .nargs = 2,
 	  .args = { { Int, 0 }, { Kldunloadflags, 1 } } },
 	{ .name = "kse_release", .ret_type = 0, .nargs = 1,
 	  .args = { { Timespec, 0 } } },
 	{ .name = "lchflags", .ret_type = 1, .nargs = 2,
 	  .args = { { Name | IN, 0 }, { FileFlags, 1 } } },
 	{ .name = "lchmod", .ret_type = 1, .nargs = 2,
 	  .args = { { Name, 0 }, { Octal, 1 } } },
 	{ .name = "lchown", .ret_type = 1, .nargs = 3,
 	  .args = { { Name, 0 }, { Int, 1 }, { Int, 2 } } },
 	{ .name = "link", .ret_type = 1, .nargs = 2,
 	  .args = { { Name, 0 }, { Name, 1 } } },
 	{ .name = "linkat", .ret_type = 1, .nargs = 5,
 	  .args = { { Atfd, 0 }, { Name, 1 }, { Atfd, 2 }, { Name, 3 },
 		    { Atflags, 4 } } },
 	{ .name = "listen", .ret_type = 1, .nargs = 2,
 	  .args = { { Int, 0 }, { Int, 1 } } },
  	{ .name = "lseek", .ret_type = 2, .nargs = 3,
 	  .args = { { Int, 0 }, { QuadHex, 1 }, { Whence, 2 } } },
 	{ .name = "lstat", .ret_type = 1, .nargs = 2,
 	  .args = { { Name | IN, 0 }, { Stat | OUT, 1 } } },
 	{ .name = "lutimes", .ret_type = 1, .nargs = 2,
 	  .args = { { Name | IN, 0 }, { Timeval2 | IN, 1 } } },
 	{ .name = "madvise", .ret_type = 1, .nargs = 3,
 	  .args = { { Ptr, 0 }, { Sizet, 1 }, { Madvice, 2 } } },
 	{ .name = "minherit", .ret_type = 1, .nargs = 3,
 	  .args = { { Ptr, 0 }, { Sizet, 1 }, { Minherit, 2 } } },
 	{ .name = "mkdir", .ret_type = 1, .nargs = 2,
 	  .args = { { Name, 0 }, { Octal, 1 } } },
 	{ .name = "mkdirat", .ret_type = 1, .nargs = 3,
 	  .args = { { Atfd, 0 }, { Name, 1 }, { Octal, 2 } } },
 	{ .name = "mkfifo", .ret_type = 1, .nargs = 2,
 	  .args = { { Name, 0 }, { Octal, 1 } } },
 	{ .name = "mkfifoat", .ret_type = 1, .nargs = 3,
 	  .args = { { Atfd, 0 }, { Name, 1 }, { Octal, 2 } } },
 	{ .name = "mknod", .ret_type = 1, .nargs = 3,
 	  .args = { { Name, 0 }, { Octal, 1 }, { Int, 2 } } },
 	{ .name = "mknodat", .ret_type = 1, .nargs = 4,
 	  .args = { { Atfd, 0 }, { Name, 1 }, { Octal, 2 }, { Int, 3 } } },
 	{ .name = "mlock", .ret_type = 1, .nargs = 2,
 	  .args = { { Ptr, 0 }, { Sizet, 1 } } },
 	{ .name = "mlockall", .ret_type = 1, .nargs = 1,
 	  .args = { { Mlockall, 0 } } },
 	{ .name = "mmap", .ret_type = 1, .nargs = 6,
 	  .args = { { Ptr, 0 }, { Sizet, 1 }, { Mprot, 2 }, { Mmapflags, 3 },
 		    { Int, 4 }, { QuadHex, 5 } } },
 	{ .name = "modfind", .ret_type = 1, .nargs = 1,
 	  .args = { { Name | IN, 0 } } },
 	{ .name = "mount", .ret_type = 1, .nargs = 4,
 	  .args = { { Name, 0 }, { Name, 1 }, { Mountflags, 2 }, { Ptr, 3 } } },
 	{ .name = "mprotect", .ret_type = 1, .nargs = 3,
 	  .args = { { Ptr, 0 }, { Sizet, 1 }, { Mprot, 2 } } },
 	{ .name = "msync", .ret_type = 1, .nargs = 3,
 	  .args = { { Ptr, 0 }, { Sizet, 1 }, { Msync, 2 } } },
 	{ .name = "munlock", .ret_type = 1, .nargs = 2,
 	  .args = { { Ptr, 0 }, { Sizet, 1 } } },
 	{ .name = "munmap", .ret_type = 1, .nargs = 2,
 	  .args = { { Ptr, 0 }, { Sizet, 1 } } },
 	{ .name = "nanosleep", .ret_type = 1, .nargs = 1,
 	  .args = { { Timespec, 0 } } },
 	{ .name = "nmount", .ret_type = 1, .nargs = 3,
 	  .args = { { Ptr, 0 }, { UInt, 1 }, { Mountflags, 2 } } },
 	{ .name = "open", .ret_type = 1, .nargs = 3,
 	  .args = { { Name | IN, 0 }, { Open, 1 }, { Octal, 2 } } },
 	{ .name = "openat", .ret_type = 1, .nargs = 4,
 	  .args = { { Atfd, 0 }, { Name | IN, 1 }, { Open, 2 },
 		    { Octal, 3 } } },
 	{ .name = "pathconf", .ret_type = 1, .nargs = 2,
 	  .args = { { Name | IN, 0 }, { Pathconf, 1 } } },
 	{ .name = "pipe", .ret_type = 1, .nargs = 1,
 	  .args = { { PipeFds | OUT, 0 } } },
 	{ .name = "pipe2", .ret_type = 1, .nargs = 2,
 	  .args = { { Ptr, 0 }, { Pipe2, 1 } } },
 	{ .name = "poll", .ret_type = 1, .nargs = 3,
 	  .args = { { Pollfd, 0 }, { Int, 1 }, { Int, 2 } } },
 	{ .name = "posix_fadvise", .ret_type = 1, .nargs = 4,
 	  .args = { { Int, 0 }, { QuadHex, 1 }, { QuadHex, 2 },
 		    { Fadvice, 3 } } },
 	{ .name = "posix_openpt", .ret_type = 1, .nargs = 1,
 	  .args = { { Open, 0 } } },
 	{ .name = "pread", .ret_type = 1, .nargs = 4,
 	  .args = { { Int, 0 }, { BinString | OUT, 1 }, { Sizet, 2 },
 		    { QuadHex, 3 } } },
 	{ .name = "procctl", .ret_type = 1, .nargs = 4,
 	  .args = { { Idtype, 0 }, { Quad, 1 }, { Procctl, 2 }, { Ptr, 3 } } },
 	{ .name = "ptrace", .ret_type = 1, .nargs = 4,
 	  .args = { { Ptraceop, 0 }, { Int, 1 }, { Ptr, 2 }, { Int, 3 } } },
 	{ .name = "pwrite", .ret_type = 1, .nargs = 4,
 	  .args = { { Int, 0 }, { BinString | IN, 1 }, { Sizet, 2 },
 		    { QuadHex, 3 } } },
 	{ .name = "quotactl", .ret_type = 1, .nargs = 4,
 	  .args = { { Name, 0 }, { Quotactlcmd, 1 }, { Int, 2 }, { Ptr, 3 } } },
 	{ .name = "read", .ret_type = 1, .nargs = 3,
 	  .args = { { Int, 0 }, { BinString | OUT, 1 }, { Sizet, 2 } } },
 	{ .name = "readlink", .ret_type = 1, .nargs = 3,
 	  .args = { { Name, 0 }, { Readlinkres | OUT, 1 }, { Sizet, 2 } } },
 	{ .name = "readlinkat", .ret_type = 1, .nargs = 4,
 	  .args = { { Atfd, 0 }, { Name, 1 }, { Readlinkres | OUT, 2 },
 		    { Sizet, 3 } } },
 	{ .name = "readv", .ret_type = 1, .nargs = 3,
 	  .args = { { Int, 0 }, { Iovec | OUT, 1 }, { Int, 2 } } },
 	{ .name = "reboot", .ret_type = 1, .nargs = 1,
 	  .args = { { Reboothowto, 0 } } },
 	{ .name = "recvfrom", .ret_type = 1, .nargs = 6,
 	  .args = { { Int, 0 }, { BinString | OUT, 1 }, { Sizet, 2 },
 	            { Msgflags, 3 }, { Sockaddr | OUT, 4 },
 	            { Ptr | OUT, 5 } } },
 	{ .name = "recvmsg", .ret_type = 1, .nargs = 3,
 	  .args = { { Int, 0 }, { Msghdr | OUT, 1 }, { Msgflags, 2 } } },
 	{ .name = "rename", .ret_type = 1, .nargs = 2,
 	  .args = { { Name, 0 }, { Name, 1 } } },
 	{ .name = "renameat", .ret_type = 1, .nargs = 4,
 	  .args = { { Atfd, 0 }, { Name, 1 }, { Atfd, 2 }, { Name, 3 } } },
 	{ .name = "rfork", .ret_type = 1, .nargs = 1,
 	  .args = { { Rforkflags, 0 } } },
 	{ .name = "rmdir", .ret_type = 1, .nargs = 1,
 	  .args = { { Name, 0 } } },
 	{ .name = "rtprio", .ret_type = 1, .nargs = 3,
 	  .args = { { Rtpriofunc, 0 }, { Int, 1 }, { Ptr, 2 } } },
 	{ .name = "rtprio_thread", .ret_type = 1, .nargs = 3,
 	  .args = { { Rtpriofunc, 0 }, { Int, 1 }, { Ptr, 2 } } },
 	{ .name = "sched_get_priority_max", .ret_type = 1, .nargs = 1,
 	  .args = { { Schedpolicy, 0 } } },
 	{ .name = "sched_get_priority_min", .ret_type = 1, .nargs = 1,
 	  .args = { { Schedpolicy, 0 } } },
 	{ .name = "sched_getparam", .ret_type = 1, .nargs = 2,
 	  .args = { { Int, 0 }, { Schedparam | OUT, 1 } } },
 	{ .name = "sched_getscheduler", .ret_type = 1, .nargs = 1,
 	  .args = { { Int, 0 } } },
 	{ .name = "sched_rr_get_interval", .ret_type = 1, .nargs = 2,
 	  .args = { { Int, 0 }, { Timespec | OUT, 1 } } },
 	{ .name = "sched_setparam", .ret_type = 1, .nargs = 2,
 	  .args = { { Int, 0 }, { Schedparam, 1 } } },
 	{ .name = "sched_setscheduler", .ret_type = 1, .nargs = 3,
 	  .args = { { Int, 0 }, { Schedpolicy, 1 }, { Schedparam, 2 } } },
 	{ .name = "sctp_generic_recvmsg", .ret_type = 1, .nargs = 7,
 	  .args = { { Int, 0 }, { Iovec | OUT, 1 }, { Int, 2 },
 	            { Sockaddr | OUT, 3 }, { Ptr | OUT, 4 },
 	            { Sctpsndrcvinfo | OUT, 5 }, { Ptr | OUT, 6 } } },
 	{ .name = "sctp_generic_sendmsg", .ret_type = 1, .nargs = 7,
 	  .args = { { Int, 0 }, { BinString | IN, 1 }, { Int, 2 },
 	            { Sockaddr | IN, 3 }, { Socklent, 4 },
 	            { Sctpsndrcvinfo | IN, 5 }, { Msgflags, 6 } } },
 	{ .name = "sctp_generic_sendmsg_iov", .ret_type = 1, .nargs = 7,
 	  .args = { { Int, 0 }, { Iovec | IN, 1 }, { Int, 2 },
 	            { Sockaddr | IN, 3 }, { Socklent, 4 },
 	            { Sctpsndrcvinfo | IN, 5 }, { Msgflags, 6 } } },
 	{ .name = "select", .ret_type = 1, .nargs = 5,
 	  .args = { { Int, 0 }, { Fd_set, 1 }, { Fd_set, 2 }, { Fd_set, 3 },
 		    { Timeval, 4 } } },
 	{ .name = "sendmsg", .ret_type = 1, .nargs = 3,
 	  .args = { { Int, 0 }, { Msghdr | IN, 1 }, { Msgflags, 2 } } },
 	{ .name = "sendto", .ret_type = 1, .nargs = 6,
 	  .args = { { Int, 0 }, { BinString | IN, 1 }, { Sizet, 2 },
 	            { Msgflags, 3 }, { Sockaddr | IN, 4 },
 	            { Socklent | IN, 5 } } },
 	{ .name = "setitimer", .ret_type = 1, .nargs = 3,
 	  .args = { { Int, 0 }, { Itimerval, 1 }, { Itimerval | OUT, 2 } } },
 	{ .name = "setpriority", .ret_type = 1, .nargs = 3,
 	  .args = { { Priowhich, 0 }, { Int, 1 }, { Int, 2 } } },
 	{ .name = "setrlimit", .ret_type = 1, .nargs = 2,
 	  .args = { { Resource, 0 }, { Rlimit | IN, 1 } } },
 	{ .name = "setsockopt", .ret_type = 1, .nargs = 5,
 	  .args = { { Int, 0 }, { Sockoptlevel, 1 }, { Sockoptname, 2 },
 		    { Ptr | IN, 3 }, { Socklent, 4 } } },
 	{ .name = "shutdown", .ret_type = 1, .nargs = 2,
 	  .args = { { Int, 0 }, { Shutdown, 1 } } },
 	{ .name = "sigaction", .ret_type = 1, .nargs = 3,
 	  .args = { { Signal, 0 }, { Sigaction | IN, 1 },
 		    { Sigaction | OUT, 2 } } },
 	{ .name = "sigpending", .ret_type = 1, .nargs = 1,
 	  .args = { { Sigset | OUT, 0 } } },
 	{ .name = "sigprocmask", .ret_type = 1, .nargs = 3,
 	  .args = { { Sigprocmask, 0 }, { Sigset, 1 }, { Sigset | OUT, 2 } } },
 	{ .name = "sigqueue", .ret_type = 1, .nargs = 3,
 	  .args = { { Int, 0 }, { Signal, 1 }, { LongHex, 2 } } },
 	{ .name = "sigreturn", .ret_type = 1, .nargs = 1,
 	  .args = { { Ptr, 0 } } },
 	{ .name = "sigsuspend", .ret_type = 1, .nargs = 1,
 	  .args = { { Sigset | IN, 0 } } },
 	{ .name = "sigtimedwait", .ret_type = 1, .nargs = 3,
 	  .args = { { Sigset | IN, 0 }, { Siginfo | OUT, 1 },
 		    { Timespec | IN, 2 } } },
 	{ .name = "sigwait", .ret_type = 1, .nargs = 2,
 	  .args = { { Sigset | IN, 0 }, { PSig | OUT, 1 } } },
 	{ .name = "sigwaitinfo", .ret_type = 1, .nargs = 2,
 	  .args = { { Sigset | IN, 0 }, { Siginfo | OUT, 1 } } },
 	{ .name = "socket", .ret_type = 1, .nargs = 3,
 	  .args = { { Sockdomain, 0 }, { Socktype, 1 }, { Sockprotocol, 2 } } },
 	{ .name = "stat", .ret_type = 1, .nargs = 2,
 	  .args = { { Name | IN, 0 }, { Stat | OUT, 1 } } },
 	{ .name = "statfs", .ret_type = 1, .nargs = 2,
 	  .args = { { Name | IN, 0 }, { StatFs | OUT, 1 } } },
 	{ .name = "symlink", .ret_type = 1, .nargs = 2,
 	  .args = { { Name, 0 }, { Name, 1 } } },
 	{ .name = "symlinkat", .ret_type = 1, .nargs = 3,
 	  .args = { { Name, 0 }, { Atfd, 1 }, { Name, 2 } } },
 	{ .name = "sysarch", .ret_type = 1, .nargs = 2,
 	  .args = { { Sysarch, 0 }, { Ptr, 1 } } },
 	{ .name = "thr_kill", .ret_type = 1, .nargs = 2,
 	  .args = { { Long, 0 }, { Signal, 1 } } },
 	{ .name = "thr_self", .ret_type = 1, .nargs = 1,
 	  .args = { { Ptr, 0 } } },
 	{ .name = "thr_set_name", .ret_type = 1, .nargs = 2,
 	  .args = { { Long, 0 }, { Name, 1 } } },
 	{ .name = "truncate", .ret_type = 1, .nargs = 2,
 	  .args = { { Name | IN, 0 }, { QuadHex | IN, 1 } } },
 #if 0
 	/* Does not exist */
 	{ .name = "umount", .ret_type = 1, .nargs = 2,
 	  .args = { { Name, 0 }, { Int, 2 } } },
 #endif
 	{ .name = "unlink", .ret_type = 1, .nargs = 1,
 	  .args = { { Name, 0 } } },
 	{ .name = "unlinkat", .ret_type = 1, .nargs = 3,
 	  .args = { { Atfd, 0 }, { Name, 1 }, { Atflags, 2 } } },
 	{ .name = "unmount", .ret_type = 1, .nargs = 2,
 	  .args = { { Name, 0 }, { Mountflags, 1 } } },
 	{ .name = "utimensat", .ret_type = 1, .nargs = 4,
 	  .args = { { Atfd, 0 }, { Name | IN, 1 }, { Timespec2 | IN, 2 },
 		    { Atflags, 3 } } },
 	{ .name = "utimes", .ret_type = 1, .nargs = 2,
 	  .args = { { Name | IN, 0 }, { Timeval2 | IN, 1 } } },
 	{ .name = "utrace", .ret_type = 1, .nargs = 1,
 	  .args = { { Utrace, 0 } } },
 	{ .name = "wait4", .ret_type = 1, .nargs = 4,
 	  .args = { { Int, 0 }, { ExitStatus | OUT, 1 }, { Waitoptions, 2 },
 		    { Rusage | OUT, 3 } } },
 	{ .name = "wait6", .ret_type = 1, .nargs = 6,
 	  .args = { { Idtype, 0 }, { Quad, 1 }, { ExitStatus | OUT, 2 },
 		    { Waitoptions, 3 }, { Rusage | OUT, 4 },
 		    { Siginfo | OUT, 5 } } },
 	{ .name = "write", .ret_type = 1, .nargs = 3,
 	  .args = { { Int, 0 }, { BinString | IN, 1 }, { Sizet, 2 } } },
 	{ .name = "writev", .ret_type = 1, .nargs = 3,
 	  .args = { { Int, 0 }, { Iovec | IN, 1 }, { Int, 2 } } },
 
 	/* Linux ABI */
 	{ .name = "linux_access", .ret_type = 1, .nargs = 2,
 	  .args = { { Name, 0 }, { Accessmode, 1 } } },
 	{ .name = "linux_execve", .ret_type = 1, .nargs = 3,
 	  .args = { { Name | IN, 0 }, { ExecArgs | IN, 1 },
 		    { ExecEnv | IN, 2 } } },
 	{ .name = "linux_lseek", .ret_type = 2, .nargs = 3,
 	  .args = { { Int, 0 }, { Int, 1 }, { Whence, 2 } } },
 	{ .name = "linux_mkdir", .ret_type = 1, .nargs = 2,
 	  .args = { { Name | IN, 0 }, { Int, 1 } } },
 	{ .name = "linux_newfstat", .ret_type = 1, .nargs = 2,
 	  .args = { { Int, 0 }, { Ptr | OUT, 1 } } },
 	{ .name = "linux_newstat", .ret_type = 1, .nargs = 2,
 	  .args = { { Name | IN, 0 }, { Ptr | OUT, 1 } } },
 	{ .name = "linux_open", .ret_type = 1, .nargs = 3,
 	  .args = { { Name, 0 }, { Hex, 1 }, { Octal, 2 } } },
 	{ .name = "linux_readlink", .ret_type = 1, .nargs = 3,
 	  .args = { { Name, 0 }, { Name | OUT, 1 }, { Sizet, 2 } } },
 	{ .name = "linux_socketcall", .ret_type = 1, .nargs = 2,
 	  .args = { { Int, 0 }, { LinuxSockArgs, 1 } } },
 	{ .name = "linux_stat64", .ret_type = 1, .nargs = 2,
 	  .args = { { Name | IN, 0 }, { Ptr | OUT, 1 } } },
 
 	/* CloudABI system calls. */
 	{ .name = "cloudabi_sys_clock_res_get", .ret_type = 1, .nargs = 1,
 	  .args = { { CloudABIClockID, 0 } } },
 	{ .name = "cloudabi_sys_clock_time_get", .ret_type = 1, .nargs = 2,
 	  .args = { { CloudABIClockID, 0 }, { CloudABITimestamp, 1 } } },
 	{ .name = "cloudabi_sys_condvar_signal", .ret_type = 1, .nargs = 3,
 	  .args = { { Ptr, 0 }, { CloudABIMFlags, 1 }, { UInt, 2 } } },
 	{ .name = "cloudabi_sys_fd_close", .ret_type = 1, .nargs = 1,
 	  .args = { { Int, 0 } } },
 	{ .name = "cloudabi_sys_fd_create1", .ret_type = 1, .nargs = 1,
 	  .args = { { CloudABIFileType, 0 } } },
 	{ .name = "cloudabi_sys_fd_create2", .ret_type = 1, .nargs = 2,
 	  .args = { { CloudABIFileType, 0 }, { PipeFds | OUT, 0 } } },
 	{ .name = "cloudabi_sys_fd_datasync", .ret_type = 1, .nargs = 1,
 	  .args = { { Int, 0 } } },
 	{ .name = "cloudabi_sys_fd_dup", .ret_type = 1, .nargs = 1,
 	  .args = { { Int, 0 } } },
 	{ .name = "cloudabi_sys_fd_replace", .ret_type = 1, .nargs = 2,
 	  .args = { { Int, 0 }, { Int, 1 } } },
 	{ .name = "cloudabi_sys_fd_seek", .ret_type = 1, .nargs = 3,
 	  .args = { { Int, 0 }, { Int, 1 }, { CloudABIWhence, 2 } } },
 	{ .name = "cloudabi_sys_fd_stat_get", .ret_type = 1, .nargs = 2,
 	  .args = { { Int, 0 }, { CloudABIFDStat | OUT, 1 } } },
 	{ .name = "cloudabi_sys_fd_stat_put", .ret_type = 1, .nargs = 3,
 	  .args = { { Int, 0 }, { CloudABIFDStat | IN, 1 },
 	            { CloudABIFDSFlags, 2 } } },
 	{ .name = "cloudabi_sys_fd_sync", .ret_type = 1, .nargs = 1,
 	  .args = { { Int, 0 } } },
 	{ .name = "cloudabi_sys_file_advise", .ret_type = 1, .nargs = 4,
 	  .args = { { Int, 0 }, { Int, 1 }, { Int, 2 },
 	            { CloudABIAdvice, 3 } } },
 	{ .name = "cloudabi_sys_file_allocate", .ret_type = 1, .nargs = 3,
 	  .args = { { Int, 0 }, { Int, 1 }, { Int, 2 } } },
 	{ .name = "cloudabi_sys_file_create", .ret_type = 1, .nargs = 3,
 	  .args = { { Int, 0 }, { BinString | IN, 1 },
 	            { CloudABIFileType, 3 } } },
 	{ .name = "cloudabi_sys_file_link", .ret_type = 1, .nargs = 4,
 	  .args = { { CloudABILookup, 0 }, { BinString | IN, 1 },
 	            { Int, 3 }, { BinString | IN, 4 } } },
 	{ .name = "cloudabi_sys_file_open", .ret_type = 1, .nargs = 4,
 	  .args = { { Int, 0 }, { BinString | IN, 1 },
 	            { CloudABIOFlags, 3 }, { CloudABIFDStat | IN, 4 } } },
 	{ .name = "cloudabi_sys_file_readdir", .ret_type = 1, .nargs = 4,
 	  .args = { { Int, 0 }, { BinString | OUT, 1 }, { Int, 2 },
 	            { Int, 3 } } },
 	{ .name = "cloudabi_sys_file_readlink", .ret_type = 1, .nargs = 4,
 	  .args = { { Int, 0 }, { BinString | IN, 1 },
 	            { BinString | OUT, 3 }, { Int, 4 } } },
 	{ .name = "cloudabi_sys_file_rename", .ret_type = 1, .nargs = 4,
 	  .args = { { Int, 0 }, { BinString | IN, 1 },
 	            { Int, 3 }, { BinString | IN, 4 } } },
 	{ .name = "cloudabi_sys_file_stat_fget", .ret_type = 1, .nargs = 2,
 	  .args = { { Int, 0 }, { CloudABIFileStat | OUT, 1 } } },
 	{ .name = "cloudabi_sys_file_stat_fput", .ret_type = 1, .nargs = 3,
 	  .args = { { Int, 0 }, { CloudABIFileStat | IN, 1 },
 	            { CloudABIFSFlags, 2 } } },
 	{ .name = "cloudabi_sys_file_stat_get", .ret_type = 1, .nargs = 3,
 	  .args = { { CloudABILookup, 0 }, { BinString | IN, 1 },
 	            { CloudABIFileStat | OUT, 3 } } },
 	{ .name = "cloudabi_sys_file_stat_put", .ret_type = 1, .nargs = 4,
 	  .args = { { CloudABILookup, 0 }, { BinString | IN, 1 },
 	            { CloudABIFileStat | IN, 3 }, { CloudABIFSFlags, 4 } } },
 	{ .name = "cloudabi_sys_file_symlink", .ret_type = 1, .nargs = 3,
 	  .args = { { BinString | IN, 0 },
 	            { Int, 2 }, { BinString | IN, 3 } } },
 	{ .name = "cloudabi_sys_file_unlink", .ret_type = 1, .nargs = 3,
 	  .args = { { Int, 0 }, { BinString | IN, 1 },
 	            { CloudABIULFlags, 3 } } },
 	{ .name = "cloudabi_sys_lock_unlock", .ret_type = 1, .nargs = 2,
 	  .args = { { Ptr, 0 }, { CloudABIMFlags, 1 } } },
 	{ .name = "cloudabi_sys_mem_advise", .ret_type = 1, .nargs = 3,
 	  .args = { { Ptr, 0 }, { Int, 1 }, { CloudABIAdvice, 2 } } },
 	{ .name = "cloudabi_sys_mem_map", .ret_type = 1, .nargs = 6,
 	  .args = { { Ptr, 0 }, { Int, 1 }, { CloudABIMProt, 2 },
 	            { CloudABIMFlags, 3 }, { Int, 4 }, { Int, 5 } } },
 	{ .name = "cloudabi_sys_mem_protect", .ret_type = 1, .nargs = 3,
 	  .args = { { Ptr, 0 }, { Int, 1 }, { CloudABIMProt, 2 } } },
 	{ .name = "cloudabi_sys_mem_sync", .ret_type = 1, .nargs = 3,
 	  .args = { { Ptr, 0 }, { Int, 1 }, { CloudABIMSFlags, 2 } } },
 	{ .name = "cloudabi_sys_mem_unmap", .ret_type = 1, .nargs = 2,
 	  .args = { { Ptr, 0 }, { Int, 1 } } },
 	{ .name = "cloudabi_sys_proc_exec", .ret_type = 1, .nargs = 5,
 	  .args = { { Int, 0 }, { BinString | IN, 1 }, { Int, 2 },
 	            { IntArray, 3 }, { Int, 4 } } },
 	{ .name = "cloudabi_sys_proc_exit", .ret_type = 1, .nargs = 1,
 	  .args = { { Int, 0 } } },
 	{ .name = "cloudabi_sys_proc_fork", .ret_type = 1, .nargs = 0 },
 	{ .name = "cloudabi_sys_proc_raise", .ret_type = 1, .nargs = 1,
 	  .args = { { CloudABISignal, 0 } } },
 	{ .name = "cloudabi_sys_random_get", .ret_type = 1, .nargs = 2,
 	  .args = { { BinString | OUT, 0 }, { Int, 1 } } },
 	{ .name = "cloudabi_sys_sock_shutdown", .ret_type = 1, .nargs = 2,
 	  .args = { { Int, 0 }, { CloudABISDFlags, 1 } } },
 	{ .name = "cloudabi_sys_thread_exit", .ret_type = 1, .nargs = 2,
 	  .args = { { Ptr, 0 }, { CloudABIMFlags, 1 } } },
 	{ .name = "cloudabi_sys_thread_yield", .ret_type = 1, .nargs = 0 },
 
 	{ .name = 0 },
 };
 /*
  * List of syscall descriptions.  init_syscalls() fills it from the
  * decoded_syscalls table and get_syscall() adds entries for syscalls that
  * have no description yet.
  */
 static STAILQ_HEAD(, syscall) syscalls;
 
 /*
  * Xlat idea taken from strace: one table entry pairs a numeric constant
  * with the text printed for it.
  */
 struct xlat {
 	int val;		/* numeric value of the constant */
 	const char *str;	/* text to print; NULL in the terminating entry */
 };
 
 /* X(a) expands to one table entry whose text is the spelling of a. */
 #define	X(a)	{ a, #a },
 /* XEND is the table terminator; table walks stop at its NULL str. */
 #define	XEND	{ 0, NULL }
 
 /* Names for the bits of a pollfd events mask (decoded with xlookup_bits()). */
 static struct xlat poll_flags[] = {
 	X(POLLSTANDARD) X(POLLIN) X(POLLPRI) X(POLLOUT) X(POLLERR)
 	X(POLLHUP) X(POLLNVAL) X(POLLRDNORM) X(POLLRDBAND)
 	X(POLLWRBAND) X(POLLINIGNEOF) XEND
 };
 
 /* Names for the SA_* flag bits. */
 static struct xlat sigaction_flags[] = {
 	X(SA_ONSTACK) X(SA_RESTART) X(SA_RESETHAND) X(SA_NOCLDSTOP)
 	X(SA_NODEFER) X(SA_NOCLDWAIT) X(SA_SIGINFO) XEND
 };
 
 /* Names for the LINUX_* socketcall operation codes. */
 static struct xlat linux_socketcall_ops[] = {
 	X(LINUX_SOCKET) X(LINUX_BIND) X(LINUX_CONNECT) X(LINUX_LISTEN)
 	X(LINUX_ACCEPT) X(LINUX_GETSOCKNAME) X(LINUX_GETPEERNAME)
 	X(LINUX_SOCKETPAIR) X(LINUX_SEND) X(LINUX_RECV) X(LINUX_SENDTO)
 	X(LINUX_RECVFROM) X(LINUX_SHUTDOWN) X(LINUX_SETSOCKOPT)
 	X(LINUX_GETSOCKOPT) X(LINUX_SENDMSG) X(LINUX_RECVMSG)
 	XEND
 };
 
 /*
  * For the CloudABI tables X(a) is redefined so that the value comes from
  * the CLOUDABI_-prefixed constant while the printed text stays unprefixed.
  */
 #undef X
 #define	X(a)	{ CLOUDABI_##a, #a },
 
 /* CLOUDABI_ADVICE_* values. */
 static struct xlat cloudabi_advice[] = {
 	X(ADVICE_DONTNEED) X(ADVICE_NOREUSE) X(ADVICE_NORMAL)
 	X(ADVICE_RANDOM) X(ADVICE_SEQUENTIAL) X(ADVICE_WILLNEED)
 	XEND
 };
 
 /* CLOUDABI_CLOCK_* identifiers. */
 static struct xlat cloudabi_clockid[] = {
 	X(CLOCK_MONOTONIC) X(CLOCK_PROCESS_CPUTIME_ID)
 	X(CLOCK_REALTIME) X(CLOCK_THREAD_CPUTIME_ID)
 	XEND
 };
 
 /* CLOUDABI_FDFLAG_* bits. */
 static struct xlat cloudabi_fdflags[] = {
 	X(FDFLAG_APPEND) X(FDFLAG_DSYNC) X(FDFLAG_NONBLOCK)
 	X(FDFLAG_RSYNC) X(FDFLAG_SYNC)
 	XEND
 };
 
 /* CLOUDABI_FDSTAT_* bits. */
 static struct xlat cloudabi_fdsflags[] = {
 	X(FDSTAT_FLAGS) X(FDSTAT_RIGHTS)
 	XEND
 };
 
 /* CLOUDABI_FILETYPE_* values. */
 static struct xlat cloudabi_filetype[] = {
 	X(FILETYPE_UNKNOWN) X(FILETYPE_BLOCK_DEVICE)
 	X(FILETYPE_CHARACTER_DEVICE) X(FILETYPE_DIRECTORY)
 	X(FILETYPE_PROCESS) X(FILETYPE_REGULAR_FILE)
 	X(FILETYPE_SHARED_MEMORY) X(FILETYPE_SOCKET_DGRAM)
 	X(FILETYPE_SOCKET_STREAM) X(FILETYPE_SYMBOLIC_LINK)
 	XEND
 };
 
 /* CLOUDABI_FILESTAT_* bits. */
 static struct xlat cloudabi_fsflags[] = {
 	X(FILESTAT_ATIM) X(FILESTAT_ATIM_NOW) X(FILESTAT_MTIM)
 	X(FILESTAT_MTIM_NOW) X(FILESTAT_SIZE)
 	XEND
 };
 
 /* CLOUDABI_MAP_* bits. */
 static struct xlat cloudabi_mflags[] = {
 	X(MAP_ANON) X(MAP_FIXED) X(MAP_PRIVATE) X(MAP_SHARED)
 	XEND
 };
 
 /* CLOUDABI_PROT_* bits. */
 static struct xlat cloudabi_mprot[] = {
 	X(PROT_EXEC) X(PROT_WRITE) X(PROT_READ)
 	XEND
 };
 
 /* CLOUDABI_MS_* bits. */
 static struct xlat cloudabi_msflags[] = {
 	X(MS_ASYNC) X(MS_INVALIDATE) X(MS_SYNC)
 	XEND
 };
 
 /* CLOUDABI_O_* bits. */
 static struct xlat cloudabi_oflags[] = {
 	X(O_CREAT) X(O_DIRECTORY) X(O_EXCL) X(O_TRUNC)
 	XEND
 };
 
 /* CLOUDABI_SHUT_* bits. */
 static struct xlat cloudabi_sdflags[] = {
 	X(SHUT_RD) X(SHUT_WR)
 	XEND
 };
 
 /* CLOUDABI_SIG* signal numbers. */
 static struct xlat cloudabi_signal[] = {
 	X(SIGABRT) X(SIGALRM) X(SIGBUS) X(SIGCHLD) X(SIGCONT) X(SIGFPE)
 	X(SIGHUP) X(SIGILL) X(SIGINT) X(SIGKILL) X(SIGPIPE) X(SIGQUIT)
 	X(SIGSEGV) X(SIGSTOP) X(SIGSYS) X(SIGTERM) X(SIGTRAP) X(SIGTSTP)
 	X(SIGTTIN) X(SIGTTOU) X(SIGURG) X(SIGUSR1) X(SIGUSR2)
 	X(SIGVTALRM) X(SIGXCPU) X(SIGXFSZ)
 	XEND
 };
 
 /* CLOUDABI_UNLINK_* bits. */
 static struct xlat cloudabi_ulflags[] = {
 	X(UNLINK_REMOVEDIR)
 	XEND
 };
 
 /* CLOUDABI_WHENCE_* values. */
 static struct xlat cloudabi_whence[] = {
 	X(WHENCE_CUR) X(WHENCE_END) X(WHENCE_SET)
 	XEND
 };
 
 /* The table-building macros are not needed past this point. */
 #undef X
 #undef XEND
 
 /*
  * Translate val into its symbolic name using the NULL-terminated table
  * xlat.  When no entry matches, val is formatted numerically in the
  * requested base (8, 10 or 16) into a static buffer that the next call
  * overwrites.  Any other base terminates the program through errx().
  */
 static const char *
 lookup(struct xlat *xlat, int val, int base)
 {
 	static char tmp[16];
 	struct xlat *entry;
 
 	for (entry = xlat; entry->str != NULL; entry++) {
 		if (entry->val == val)
 			return (entry->str);
 	}
 
 	/* No symbolic name: fall back to a number in the caller's base. */
 	if (base == 8)
 		snprintf(tmp, sizeof(tmp), "0%o", val);
 	else if (base == 16)
 		snprintf(tmp, sizeof(tmp), "0x%x", val);
 	else if (base == 10)
 		snprintf(tmp, sizeof(tmp), "%u", val);
 	else
 		errx(1,"Unknown lookup base");
 	return (tmp);
 }
 
 /*
  * Convenience wrapper around lookup() that prints unmatched values in
  * hexadecimal.
  */
 static const char *
 xlookup(struct xlat *xlat, int val)
 {
 	const char *name;
 
 	name = lookup(xlat, val, 16);
 	return (name);
 }
 
 /*
  * Searches an xlat array containing bitfield values and builds a
  * '|'-separated list of the names whose bits are all set in val.
  * Remaining bits set after removing the known ones are printed at the
  * end in hexadecimal: IN|0x400.
  *
  * The result lives in a static buffer that is overwritten by the next
  * call.  Every append is bounded by the buffer size; if a name does not
  * fit, decoding stops and the bits not yet named are reported in
  * hexadecimal instead (when there is still room for that).
  */
 static char *
 xlookup_bits(struct xlat *xlat, int val)
 {
 	static char str[512];
 	size_t len, room;
 	int n, rem;
 
 	len = 0;
 	rem = val;
 	str[0] = '\0';
 	for (; xlat->str != NULL; xlat++) {
 		if ((xlat->val & rem) != xlat->val)
 			continue;
 		/*
 		 * Don't print the "all-bits-zero" string unless all
 		 * bits are really zero.
 		 */
 		if (xlat->val == 0 && val != 0)
 			continue;
 		room = sizeof(str) - len;
 		n = snprintf(str + len, room, "%s|", xlat->str);
 		if (n < 0 || (size_t)n >= room) {
 			/* Out of space: discard the partial name and stop. */
 			str[len] = '\0';
 			break;
 		}
 		len += (size_t)n;
 		rem &= ~(xlat->val);
 	}
 
 	/*
 	 * If we have leftover bits or didn't match anything, print
 	 * the remainder.
 	 */
 	if (rem || len == 0) {
 		room = sizeof(str) - len;
 		n = snprintf(str + len, room, "0x%x", rem);
 		if (n > 0 && (size_t)n < room)
 			len += (size_t)n;
 	}
 	/* Drop the separator left behind by the last name. */
 	if (len && str[len - 1] == '|')
 		len--;
 	str[len] = 0;
 	return (str);
 }
 
 static void
 print_integer_arg(const char *(*decoder)(int), FILE *fp, int value)
 {
 	const char *str;
 
 	str = decoder(value);
 	if (str != NULL)
 		fputs(str, fp);
 	else
 		fprintf(fp, "%d", value);
 }
 
 /*
  * Let decoder write the names of the flags in value to fp.  The decoder
  * stores the bits it could not name in its third argument; those are
  * appended in hexadecimal, joined with '|' when the decoder reported
  * success and bare when it did not.
  */
 static void
 print_mask_arg(bool (*decoder)(FILE *, int, int *), FILE *fp, int value)
 {
 	int leftover;
 	bool decoded;
 
 	decoded = decoder(fp, value, &leftover);
 	if (!decoded) {
 		fprintf(fp, "0x%x", leftover);
 		return;
 	}
 	if (leftover != 0)
 		fprintf(fp, "|0x%x", leftover);
 }
 
 /*
  * Same as print_mask_arg(), for decoders that operate on uint32_t masks.
  */
 static void
 print_mask_arg32(bool (*decoder)(FILE *, uint32_t, uint32_t *), FILE *fp,
     uint32_t value)
 {
 	uint32_t leftover;
 	bool decoded;
 
 	decoded = decoder(fp, value, &leftover);
 	if (!decoded) {
 		fprintf(fp, "0x%x", leftover);
 		return;
 	}
 	if (leftover != 0)
 		fprintf(fp, "|0x%x", leftover);
 }
 
 #ifndef __LP64__
 /*
  * Add argument padding to subsequent system call arguments after a Quad
  * argument as needed.  This used to be done by hand in the
  * decoded_syscalls table, which was ugly and error prone.  It is
  * simpler to fix up the offsets once at initialization time than
  * every time arguments are decoded.
  */
 static void
 quad_fixup(struct syscall *sc)
 {
 	int kind, last_offset, shift;
 	u_int idx;
 
 	shift = 0;
 	last_offset = -1;
 	for (idx = 0; idx < sc->nargs; idx++) {
 		kind = sc->args[idx].type & ARG_MASK;
 
 		/* This arg type is a dummy that doesn't use offset. */
 		if (kind == PipeFds)
 			continue;
 
 		/* Table offsets must be strictly increasing. */
 		assert(last_offset < sc->args[idx].offset);
 		last_offset = sc->args[idx].offset;
 		sc->args[idx].offset += shift;
 
 		/* Only 64-bit arguments shift the ones that follow. */
 		if (kind != Quad && kind != QuadHex)
 			continue;
 #ifdef __powerpc__
 		/*
 		 * 64-bit arguments on 32-bit powerpc must be
 		 * 64-bit aligned.  If the current offset is
 		 * not aligned, the calling convention inserts
 		 * a 32-bit pad argument that should be skipped.
 		 */
 		if (sc->args[idx].offset % 2 == 1) {
 			sc->args[idx].offset++;
 			shift++;
 		}
 #endif
 		/* A Quad occupies two slots. */
 		shift++;
 	}
 }
 #endif
 
 /*
  * Initialize the global syscall list and put every entry of the
  * decoded_syscalls table on it.  On non-LP64 builds the argument offsets
  * of each entry are adjusted by quad_fixup() first.
  */
 void
 init_syscalls(void)
 {
 	struct syscall *entry;
 
 	STAILQ_INIT(&syscalls);
 	entry = decoded_syscalls;
 	while (entry->name != NULL) {
 #ifndef __LP64__
 		quad_fixup(entry);
 #endif
 		STAILQ_INSERT_HEAD(&syscalls, entry, entries);
 		entry++;
 	}
 }
 
 /*
  * Return the description cached in abi for syscall number, or NULL if
  * none is cached.  Numbers that fit the abi->syscalls array are looked up
  * by index; larger numbers are searched for on the extra_syscalls list.
  */
 static struct syscall *
 find_syscall(struct procabi *abi, u_int number)
 {
 	struct extra_syscall *extra;
 
 	if (number >= nitems(abi->syscalls)) {
 		STAILQ_FOREACH(extra, &abi->extra_syscalls, entries) {
 			if (extra->number == number)
 				return (extra->sc);
 		}
 		return (NULL);
 	}
 	return (abi->syscalls[number]);
 }
 
 /*
  * Cache sc as the description of syscall number for abi.  Numbers that
  * fit the abi->syscalls array are stored by index (the slot must still be
  * empty); larger numbers get a node on the extra_syscalls list.  Exits
  * through err() if that node cannot be allocated.
  */
 static void
 add_syscall(struct procabi *abi, u_int number, struct syscall *sc)
 {
 	struct extra_syscall *es;
 
 	if (number < nitems(abi->syscalls)) {
 		assert(abi->syscalls[number] == NULL);
 		abi->syscalls[number] = sc;
 		return;
 	}
 
 	es = malloc(sizeof(*es));
 	if (es == NULL)
 		err(1, "Cannot malloc %zu bytes for extra syscall",
 		    sizeof(*es));
 	es->sc = sc;
 	es->number = number;
 	STAILQ_INSERT_TAIL(&abi->extra_syscalls, es, entries);
 }
 
 /*
  * Return the description of syscall number for the ABI of thread t,
  * creating and caching one if necessary.
  *
  * The per-ABI cache is consulted first.  Otherwise the syscall name is
  * looked up on the global list; if it is not there either, a new
  * description with nargs arguments is allocated, put on the global list
  * and cached.  Syscalls without a known name are called "#<number>" and
  * are flagged as unknown.  Allocation failures terminate the program
  * through err().
  *
  * If/when the list gets big, it might be desirable to do it
  * as a hash table or binary search.
  */
 struct syscall *
 get_syscall(struct threadinfo *t, u_int number, u_int nargs)
 {
 	struct syscall *sc;
 	const char *name;
 	char *new_name;
 	u_int i;
 
 	sc = find_syscall(t->proc->abi, number);
 	if (sc != NULL)
 		return (sc);
 
 	new_name = NULL;
 	name = sysdecode_syscallname(t->proc->abi->abi, number);
 	if (name == NULL) {
 		/* asprintf() leaves new_name undefined when it fails. */
 		if (asprintf(&new_name, "#%d", number) == -1)
 			err(1, "Cannot allocate name for syscall %u", number);
 		name = new_name;
 	}
 	STAILQ_FOREACH(sc, &syscalls, entries) {
 		if (strcmp(name, sc->name) == 0) {
 			add_syscall(t->proc->abi, number, sc);
 			free(new_name);
 			return (sc);
 		}
 	}
 
 	/* It is unknown.  Add it into the list. */
 #if DEBUG
 	fprintf(stderr, "unknown syscall %s -- setting args to %d\n", name,
 	    nargs);
 #endif
 
 	sc = calloc(1, sizeof(struct syscall));
 	if (sc == NULL)
 		err(1, "Cannot allocate description for syscall %s", name);
 	/* The description keeps name; a generated name is never freed. */
 	sc->name = name;
 	if (new_name != NULL)
 		sc->unknown = true;
 	sc->ret_type = 1;
 	sc->nargs = nargs;
 	for (i = 0; i < nargs; i++) {
 		sc->args[i].offset = i;
 		/* Treat all unknown arguments as LongHex. */
 		sc->args[i].type = LongHex;
 	}
 	STAILQ_INSERT_HEAD(&syscalls, sc, entries);
 	add_syscall(t->proc->abi, number, sc);
 
 	return (sc);
 }
 
 /*
  * Copy a fixed amount of bytes from the process.
  */
 static int
 get_struct(pid_t pid, void *offset, void *buf, int len)
 {
 	struct ptrace_io_desc iorequest;
 
 	iorequest.piod_op = PIOD_READ_D;
 	iorequest.piod_offs = offset;
 	iorequest.piod_addr = buf;
 	iorequest.piod_len = len;
 	if (ptrace(PT_IO, pid, (caddr_t)&iorequest, 0) < 0)
 		return (-1);
 	return (0);
 }
 
 #define	MAXSIZE		4096
 
 /*
  * Copy a string from the process.  Note that it is
  * expected to be a C string, but if max is set, it will
  * only get that much.
  */
 static char *
 get_string(pid_t pid, void *addr, int max)
 {
 	struct ptrace_io_desc iorequest;
 	char *buf, *nbuf;
 	size_t offset, size, totalsize;
 
 	offset = 0;
 	if (max)
 		size = max + 1;
 	else {
 		/* Read up to the end of the current page. */
 		size = PAGE_SIZE - ((uintptr_t)addr % PAGE_SIZE);
 		if (size > MAXSIZE)
 			size = MAXSIZE;
 	}
 	totalsize = size;
 	buf = malloc(totalsize);
 	if (buf == NULL)
 		return (NULL);
 	for (;;) {
 		iorequest.piod_op = PIOD_READ_D;
 		iorequest.piod_offs = (char *)addr + offset;
 		iorequest.piod_addr = buf + offset;
 		iorequest.piod_len = size;
 		if (ptrace(PT_IO, pid, (caddr_t)&iorequest, 0) < 0) {
 			free(buf);
 			return (NULL);
 		}
 		if (memchr(buf + offset, '\0', size) != NULL)
 			return (buf);
 		offset += size;
 		if (totalsize < MAXSIZE && max == 0) {
 			size = MAXSIZE - totalsize;
 			if (size > PAGE_SIZE)
 				size = PAGE_SIZE;
 			nbuf = realloc(buf, totalsize + size);
 			if (nbuf == NULL) {
 				buf[totalsize - 1] = '\0';
 				return (buf);
 			}
 			buf = nbuf;
 			totalsize += size;
 		} else {
 			buf[totalsize - 1] = '\0';
 			return (buf);
 		}
 	}
 }
 
 /*
  * Return the name sysdecode_signal() gives for sig.  When it has none,
  * return the signal number in decimal, formatted into a static buffer
  * that the next such call overwrites.
  */
 static const char *
 strsig2(int sig)
 {
 	static char tmp[32];
 	const char *name;
 
 	name = sysdecode_signal(sig);
 	if (name != NULL)
 		return (name);
 	snprintf(tmp, sizeof(tmp), "%d", sig);
 	return (tmp);
 }
 
 /*
  * Write one struct kevent to fp as
  * "ident,filter,flags,fflags,data,udata".  The ident is printed as a
  * signal name for EVFILT_SIGNAL, as an unsigned number for the other
  * filters listed below, and as a pointer for any other filter.
  */
 static void
 print_kevent(FILE *fp, struct kevent *ke)
 {
 
 	if (ke->filter == EVFILT_SIGNAL) {
 		fputs(strsig2(ke->ident), fp);
 	} else {
 		switch (ke->filter) {
 		case EVFILT_READ:
 		case EVFILT_WRITE:
 		case EVFILT_VNODE:
 		case EVFILT_PROC:
 		case EVFILT_TIMER:
 		case EVFILT_PROCDESC:
 		case EVFILT_EMPTY:
 			fprintf(fp, "%ju", (uintmax_t)ke->ident);
 			break;
 		default:
 			fprintf(fp, "%p", (void *)ke->ident);
 			break;
 		}
 	}
 	fputc(',', fp);
 	print_integer_arg(sysdecode_kevent_filter, fp, ke->filter);
 	fputc(',', fp);
 	print_mask_arg(sysdecode_kevent_flags, fp, ke->flags);
 	fputc(',', fp);
 	sysdecode_kevent_fflags(fp, ke->filter, ke->fflags, 16);
 	fprintf(fp, ",%#jx,%p", (uintmax_t)ke->data, ke->udata);
 }
 
 /*
  * Write a utrace record of len bytes at utrace_addr to fp inside "{ }".
  * sysdecode_utrace() gets the first chance to print it; when it reports
  * that it did not, the record is dumped as "<len>:" followed by its bytes
  * in hexadecimal.
  */
 static void
 print_utrace(FILE *fp, void *utrace_addr, size_t len)
 {
 	const unsigned char *bytes;
 	size_t i;
 
 	fprintf(fp, "{ ");
 	if (!sysdecode_utrace(fp, utrace_addr, len)) {
 		bytes = utrace_addr;
 		fprintf(fp, "%zu:", len);
 		for (i = 0; i < len; i++)
 			fprintf(fp, " %02x", bytes[i]);
 	}
 	fprintf(fp, " }");
 }
 
 /*
  * Fetch the socket address of len bytes at address arg in the traced
  * process and write a decoded form of it to fp.
  *
  * AF_INET and AF_INET6 addresses are printed as address and port, and
  * AF_UNIX addresses as their path.  Any other family, and an
  * AF_INET/AF_INET6 address that is shorter than its structure, is dumped
  * as sa_len, sa_family and the raw sa_data bytes.  A NULL arg prints
  * "NULL".  If len is smaller than a struct sockaddr, or the address
  * cannot be buffered or read, only the pointer value is printed.
  */
 static void
 print_sockaddr(FILE *fp, struct trussinfo *trussinfo, void *arg, socklen_t len)
 {
 	char addr[64];
 	struct sockaddr_in *lsin;
 	struct sockaddr_in6 *lsin6;
 	struct sockaddr_un *sun;
 	struct sockaddr *sa;
 	u_char *q;
 	pid_t pid = trussinfo->curthread->proc->pid;
 
 	if (arg == NULL) {
 		fputs("NULL", fp);
 		return;
 	}
 	/* If the length is too small, just bail. */
 	if (len < sizeof(*sa)) {
 		fprintf(fp, "%p", arg);
 		return;
 	}
 
 	/*
 	 * len comes from the traced process and may be arbitrarily large,
 	 * so a failed allocation is handled like a failed read instead of
 	 * being dereferenced.
 	 */
 	sa = calloc(1, len);
 	if (sa == NULL || get_struct(pid, arg, sa, len) == -1) {
 		free(sa);
 		fprintf(fp, "%p", arg);
 		return;
 	}
 
 	switch (sa->sa_family) {
 	case AF_INET:
 		if (len < sizeof(*lsin))
 			goto sockaddr_short;
 		lsin = (struct sockaddr_in *)(void *)sa;
 		inet_ntop(AF_INET, &lsin->sin_addr, addr, sizeof(addr));
 		/* The port is stored in network byte order. */
 		fprintf(fp, "{ AF_INET %s:%d }", addr,
 		    ntohs(lsin->sin_port));
 		break;
 	case AF_INET6:
 		if (len < sizeof(*lsin6))
 			goto sockaddr_short;
 		lsin6 = (struct sockaddr_in6 *)(void *)sa;
 		inet_ntop(AF_INET6, &lsin6->sin6_addr, addr,
 		    sizeof(addr));
 		fprintf(fp, "{ AF_INET6 [%s]:%d }", addr,
 		    ntohs(lsin6->sin6_port));
 		break;
 	case AF_UNIX:
 		sun = (struct sockaddr_un *)(void *)sa;
 		/* Bound the path by the bytes actually copied. */
 		fprintf(fp, "{ AF_UNIX \"%.*s\" }",
 		    (int)(len - offsetof(struct sockaddr_un, sun_path)),
 		    sun->sun_path);
 		break;
 	default:
 	sockaddr_short:
 		fprintf(fp,
 		    "{ sa_len = %d, sa_family = %d, sa_data = {",
 		    (int)sa->sa_len, (int)sa->sa_family);
 		for (q = (u_char *)sa->sa_data;
 		     q < (u_char *)sa + len; q++)
 			fprintf(fp, "%s 0x%02x",
 			    q == (u_char *)sa->sa_data ? "" : ",",
 			    *q);
 		fputs(" } }", fp);
 		break;
 	}
 	free(sa);
 }
 
 /* Maximum number of iovec entries print_iovec() decodes. */
 #define IOV_LIMIT 16
 
 /*
  * Fetch the array of iovcnt struct iovec at address arg in the traced
  * process and write it to fp as "[{"data",len},...]".
  *
  * At most IOV_LIMIT entries are decoded; further ones are summarised as
  * ",...".  The data of each entry is fetched and printed vis(3)-encoded,
  * limited to trussinfo->strsize characters and followed by "..." when it
  * was shortened.  An entry whose data is empty, cannot be read or cannot
  * be buffered shows its iov_base pointer instead.  If iovcnt is not
  * positive or the array itself cannot be read, only arg is printed.
  */
 static void
 print_iovec(FILE *fp, struct trussinfo *trussinfo, void *arg, int iovcnt)
 {
 	struct iovec iov[IOV_LIMIT];
 	size_t max_string = trussinfo->strsize;
 	char tmp2[max_string + 1], *tmp3;
 	size_t len;
 	pid_t pid = trussinfo->curthread->proc->pid;
 	int i;
 	bool buf_truncated, iov_truncated;
 
 	if (iovcnt <= 0) {
 		fprintf(fp, "%p", arg);
 		return;
 	}
 	if (iovcnt > IOV_LIMIT) {
 		iovcnt = IOV_LIMIT;
 		iov_truncated = true;
 	} else {
 		iov_truncated = false;
 	}
 	if (get_struct(pid, arg, &iov, iovcnt * sizeof(struct iovec)) == -1) {
 		fprintf(fp, "%p", arg);
 		return;
 	}
 
 	fputs("[", fp);
 	for (i = 0; i < iovcnt; i++) {
 		len = iov[i].iov_len;
 		if (len > max_string) {
 			len = max_string;
 			buf_truncated = true;
 		} else {
 			buf_truncated = false;
 		}
 		fprintf(fp, "%s{", (i > 0) ? "," : "");
 
 		/* Every input byte expands to at most four output bytes. */
 		tmp3 = NULL;
 		if (len && get_struct(pid, iov[i].iov_base, &tmp2, len) != -1)
 			tmp3 = malloc(len * 4 + 1);
 		if (tmp3 != NULL) {
 			/*
 			 * Drop trailing input bytes until the encoded form
 			 * fits into max_string characters.
 			 */
 			while (len) {
 				if (strvisx(tmp3, tmp2, len,
 				    VIS_CSTYLE|VIS_TAB|VIS_NL) <=
 				    (int)max_string)
 					break;
 				len--;
 				buf_truncated = true;
 			}
 			fprintf(fp, "\"%s\"%s", tmp3,
 			    buf_truncated ? "..." : "");
 			free(tmp3);
 		} else {
 			fprintf(fp, "%p", iov[i].iov_base);
 		}
 		fprintf(fp, ",%zu}", iov[i].iov_len);
 	}
 	fprintf(fp, "%s%s", iov_truncated ? ",..." : "", "]");
 }
 
 /*
  * Write the payload of a control message to fp as a brace-enclosed,
  * comma-separated list of hexadecimal bytes.  The payload runs from
  * CMSG_DATA() up to cmsg_len bytes past the start of the header.
  */
 static void
 print_gen_cmsg(FILE *fp, struct cmsghdr *cmsghdr)
 {
 	u_char *first, *limit, *p;
 
 	first = CMSG_DATA(cmsghdr);
 	limit = (u_char *)cmsghdr + cmsghdr->cmsg_len;
 	fputs("{", fp);
 	for (p = first; p < limit; p++)
 		fprintf(fp, "%s0x%02x", p == first ? "" : ",", *p);
 	fputs("}", fp);
 }
 
 /*
  * Write the four fields of a struct sctp_initmsg to fp as
  * "{out=..,in=..,max_rtx=..,max_rto=..}".
  */
 static void
 print_sctp_initmsg(FILE *fp, struct sctp_initmsg *init)
 {
 	unsigned int ostreams, instreams, attempts, timeo;
 
 	ostreams = init->sinit_num_ostreams;
 	instreams = init->sinit_max_instreams;
 	attempts = init->sinit_max_attempts;
 	timeo = init->sinit_max_init_timeo;
 
 	fprintf(fp, "{out=%u,", ostreams);
 	fprintf(fp, "in=%u,", instreams);
 	fprintf(fp, "max_rtx=%u,", attempts);
 	fprintf(fp, "max_rto=%u}", timeo);
 }
 
 /*
  * Write a struct sctp_sndrcvinfo to fp.  Which fields appear depends on
  * the direction: ssn, tsn and cumtsn are printed only when receive is
  * true, ctx and ttl only when it is false.
  */
 static void
 print_sctp_sndrcvinfo(FILE *fp, bool receive, struct sctp_sndrcvinfo *info)
 {
 	fprintf(fp, "{sid=%u,", info->sinfo_stream);
 	if (receive)
 		fprintf(fp, "ssn=%u,", info->sinfo_ssn);
 
 	fputs("flgs=", fp);
 	sysdecode_sctp_sinfo_flags(fp, info->sinfo_flags);
 	fprintf(fp, ",ppid=%u,", ntohl(info->sinfo_ppid));
 
 	if (receive) {
 		fprintf(fp, "tsn=%u,", info->sinfo_tsn);
 		fprintf(fp, "cumtsn=%u,", info->sinfo_cumtsn);
 	} else {
 		fprintf(fp, "ctx=%u,", info->sinfo_context);
 		fprintf(fp, "ttl=%u,", info->sinfo_timetolive);
 	}
 	fprintf(fp, "id=%u}", info->sinfo_assoc_id);
 }
 
 /*
  * Write a struct sctp_sndinfo to fp as
  * "{sid=..,flgs=..,ppid=..,ctx=..,id=..}"; the flags are decoded by
  * name and ppid is converted with ntohl().
  */
 static void
 print_sctp_sndinfo(FILE *fp, struct sctp_sndinfo *info)
 {
 	uint32_t ppid;
 
 	ppid = ntohl(info->snd_ppid);
 
 	fprintf(fp, "{sid=%u,", info->snd_sid);
 	fputs("flgs=", fp);
 	print_mask_arg(sysdecode_sctp_snd_flags, fp, info->snd_flags);
 	fprintf(fp, ",ppid=%u,", ppid);
 	fprintf(fp, "ctx=%u,", info->snd_context);
 	fprintf(fp, "id=%u}", info->snd_assoc_id);
 }
 
 /*
  * Write a struct sctp_rcvinfo to fp as
  * "{sid=..,ssn=..,flgs=..,ppid=..,tsn=..,cumtsn=..,ctx=..,id=..}"; the
  * flags are decoded by name and ppid is converted with ntohl().
  */
 static void
 print_sctp_rcvinfo(FILE *fp, struct sctp_rcvinfo *info)
 {
 	uint32_t ppid;
 
 	ppid = ntohl(info->rcv_ppid);
 
 	fprintf(fp, "{sid=%u,", info->rcv_sid);
 	fprintf(fp, "ssn=%u,", info->rcv_ssn);
 	fputs("flgs=", fp);
 	print_mask_arg(sysdecode_sctp_rcv_flags, fp, info->rcv_flags);
 	fprintf(fp, ",ppid=%u,", ppid);
 	fprintf(fp, "tsn=%u,", info->rcv_tsn);
 	fprintf(fp, "cumtsn=%u,", info->rcv_cumtsn);
 	fprintf(fp, "ctx=%u,", info->rcv_context);
 	fprintf(fp, "id=%u}", info->rcv_assoc_id);
 }
 
 /*
  * Write a struct sctp_nxtinfo to fp as
  * "{sid=..,flgs=..,ppid=..,len=..,id=..}"; the flags are decoded by
  * name and ppid is converted with ntohl().
  */
 static void
 print_sctp_nxtinfo(FILE *fp, struct sctp_nxtinfo *info)
 {
 	uint32_t ppid;
 
 	ppid = ntohl(info->nxt_ppid);
 
 	fprintf(fp, "{sid=%u,", info->nxt_sid);
 	fputs("flgs=", fp);
 	print_mask_arg(sysdecode_sctp_nxt_flags, fp, info->nxt_flags);
 	fprintf(fp, ",ppid=%u,", ppid);
 	fprintf(fp, "len=%u,", info->nxt_length);
 	fprintf(fp, "id=%u}", info->nxt_assoc_id);
 }
 
 /*
  * Write a struct sctp_prinfo to fp as "{pol=..,val=..}", with the policy
  * decoded by name where sysdecode_sctp_pr_policy() knows it.
  */
 static void
 print_sctp_prinfo(FILE *fp, struct sctp_prinfo *info)
 {
 
 	fputs("{pol=", fp);
 	print_integer_arg(sysdecode_sctp_pr_policy, fp, info->pr_policy);
 	fputs(",val=", fp);
 	fprintf(fp, "%u}", info->pr_value);
 }
 
 /*
  * Write the key number of a struct sctp_authinfo to fp as "{num=..}".
  */
 static void
 print_sctp_authinfo(FILE *fp, struct sctp_authinfo *info)
 {
 	unsigned int keynumber;
 
 	keynumber = info->auth_keynumber;
 	fprintf(fp, "{num=%u}", keynumber);
 }
 
 /*
  * Write an IPv4 address to fp as "{addr=a.b.c.d}", or "{addr=???}" when
  * inet_ntop() cannot format it.
  */
 static void
 print_sctp_ipv4_addr(FILE *fp, struct in_addr *addr)
 {
 	char text[INET_ADDRSTRLEN];
 
 	if (inet_ntop(AF_INET, addr, text, sizeof(text)) == NULL) {
 		fputs("{addr=???}", fp);
 		return;
 	}
 	fprintf(fp, "{addr=%s}", text);
 }
 
 /*
  * Write an IPv6 address to fp as "{addr=<text form>}", or "{addr=???}"
  * when inet_ntop() cannot format it.
  */
 static void
 print_sctp_ipv6_addr(FILE *fp, struct in6_addr *addr)
 {
 	char text[INET6_ADDRSTRLEN];
 
 	if (inet_ntop(AF_INET6, addr, text, sizeof(text)) == NULL) {
 		fputs("{addr=???}", fp);
 		return;
 	}
 	fprintf(fp, "{addr=%s}", text);
 }
 
 /*
  * Write the payload of an IPPROTO_SCTP control message to fp.  Each known
  * cmsg_type is decoded by its structure printer, but only when cmsg_len
  * is exactly CMSG_LEN() of that structure.  Everything else (unknown
  * type, unexpected length) is dumped byte by byte via print_gen_cmsg().
  */
 static void
 print_sctp_cmsg(FILE *fp, bool receive, struct cmsghdr *cmsghdr)
 {
 	void *payload;
 	socklen_t total;
 
 	total = cmsghdr->cmsg_len;
 	payload = CMSG_DATA(cmsghdr);
 
 	/* A case that decodes the payload returns; all others fall out. */
 	switch (cmsghdr->cmsg_type) {
 	case SCTP_INIT:
 		if (total != CMSG_LEN(sizeof(struct sctp_initmsg)))
 			break;
 		print_sctp_initmsg(fp, (struct sctp_initmsg *)payload);
 		return;
 	case SCTP_SNDRCV:
 		if (total != CMSG_LEN(sizeof(struct sctp_sndrcvinfo)))
 			break;
 		print_sctp_sndrcvinfo(fp, receive,
 		    (struct sctp_sndrcvinfo *)payload);
 		return;
 #if 0
 	case SCTP_EXTRCV:
 		if (total != CMSG_LEN(sizeof(struct sctp_extrcvinfo)))
 			break;
 		print_sctp_extrcvinfo(fp,
 		    (struct sctp_extrcvinfo *)payload);
 		return;
 #endif
 	case SCTP_SNDINFO:
 		if (total != CMSG_LEN(sizeof(struct sctp_sndinfo)))
 			break;
 		print_sctp_sndinfo(fp, (struct sctp_sndinfo *)payload);
 		return;
 	case SCTP_RCVINFO:
 		if (total != CMSG_LEN(sizeof(struct sctp_rcvinfo)))
 			break;
 		print_sctp_rcvinfo(fp, (struct sctp_rcvinfo *)payload);
 		return;
 	case SCTP_NXTINFO:
 		if (total != CMSG_LEN(sizeof(struct sctp_nxtinfo)))
 			break;
 		print_sctp_nxtinfo(fp, (struct sctp_nxtinfo *)payload);
 		return;
 	case SCTP_PRINFO:
 		if (total != CMSG_LEN(sizeof(struct sctp_prinfo)))
 			break;
 		print_sctp_prinfo(fp, (struct sctp_prinfo *)payload);
 		return;
 	case SCTP_AUTHINFO:
 		if (total != CMSG_LEN(sizeof(struct sctp_authinfo)))
 			break;
 		print_sctp_authinfo(fp, (struct sctp_authinfo *)payload);
 		return;
 	case SCTP_DSTADDRV4:
 		if (total != CMSG_LEN(sizeof(struct in_addr)))
 			break;
 		print_sctp_ipv4_addr(fp, (struct in_addr *)payload);
 		return;
 	case SCTP_DSTADDRV6:
 		if (total != CMSG_LEN(sizeof(struct in6_addr)))
 			break;
 		print_sctp_ipv6_addr(fp, (struct in6_addr *)payload);
 		return;
 	default:
 		break;
 	}
 	print_gen_cmsg(fp, cmsghdr);
 }
 
 /*
  * Fetch the control data described by msghdr from process pid and write
  * every control message in it to fp as "{level=..,type=..,data=..}",
  * all wrapped in one outer pair of braces.  IPPROTO_SCTP payloads are
  * decoded by print_sctp_cmsg(); all others are dumped byte by byte.
  *
  * An empty control buffer prints "{}".  If the buffer cannot be
  * allocated or read, only the msg_control pointer is printed.  A control
  * message whose cmsg_len is smaller than its header or reaches past the
  * copied buffer is reported as invalid and ends the walk, because
  * neither its payload nor the position of its successor can be trusted.
  *
  * NOTE: msghdr->msg_control is overwritten with the address of a local
  * copy that is freed before returning, so the caller must not use that
  * field afterwards.
  */
 static void
 print_cmsgs(FILE *fp, pid_t pid, bool receive, struct msghdr *msghdr)
 {
 	struct cmsghdr *cmsghdr;
 	char *cmsgbuf;
 	const char *temp;
 	socklen_t len;
 	size_t avail;
 	int level, type;
 	bool first;
 
 	len = msghdr->msg_controllen;
 	if (len == 0) {
 		fputs("{}", fp);
 		return;
 	}
 	cmsgbuf = calloc(1, len);
 	if (cmsgbuf == NULL ||
 	    get_struct(pid, msghdr->msg_control, cmsgbuf, len) == -1) {
 		fprintf(fp, "%p", msghdr->msg_control);
 		free(cmsgbuf);
 		return;
 	}
 	/* Point the header at the local copy so the CMSG_* macros work. */
 	msghdr->msg_control = cmsgbuf;
 	first = true;
 	fputs("{", fp);
 	for (cmsghdr = CMSG_FIRSTHDR(msghdr);
 	   cmsghdr != NULL;
 	   cmsghdr = CMSG_NXTHDR(msghdr, cmsghdr)) {
 		/*
 		 * cmsg_len is controlled by the traced process.  A value
 		 * of zero would make CMSG_NXTHDR() return this header
 		 * again forever, and a value beyond the copied data would
 		 * make the printers read past the end of cmsgbuf.
 		 */
 		avail = (size_t)((cmsgbuf + len) - (char *)cmsghdr);
 		if (cmsghdr->cmsg_len < sizeof(*cmsghdr) ||
 		    cmsghdr->cmsg_len > avail) {
 			fprintf(fp, "%s{<invalid cmsg, len=%u>}",
 			    first ? "" : ",",
 			    (unsigned int)cmsghdr->cmsg_len);
 			break;
 		}
 		level = cmsghdr->cmsg_level;
 		type = cmsghdr->cmsg_type;
 		fprintf(fp, "%s{level=", first ? "" : ",");
 		print_integer_arg(sysdecode_sockopt_level, fp, level);
 		fputs(",type=", fp);
 		temp = sysdecode_cmsg_type(level, type);
 		if (temp) {
 			fputs(temp, fp);
 		} else {
 			fprintf(fp, "%d", type);
 		}
 		fputs(",data=", fp);
 		switch (level) {
 		case IPPROTO_SCTP:
 			print_sctp_cmsg(fp, receive, cmsghdr);
 			break;
 		default:
 			print_gen_cmsg(fp, cmsghdr);
 			break;
 		}
 		fputs("}", fp);
 		first = false;
 	}
 	fputs("}", fp);
 	free(cmsgbuf);
 }
 
 /*
  * Converts a syscall argument into a string.  Said string is
  * allocated via malloc(), so needs to be free()'d.  sc is
  * a pointer to the syscall description (see above); args is
  * an array of all of the system call arguments.
  */
 char *
 print_arg(struct syscall_args *sc, unsigned long *args, long *retval,
     struct trussinfo *trussinfo)
 {
 	FILE *fp;
 	char *tmp;
 	size_t tmplen;
 	pid_t pid;
 
 	fp = open_memstream(&tmp, &tmplen);
 	pid = trussinfo->curthread->proc->pid;
 	switch (sc->type & ARG_MASK) {
 	case Hex:
 		fprintf(fp, "0x%x", (int)args[sc->offset]);
 		break;
 	case Octal:
 		fprintf(fp, "0%o", (int)args[sc->offset]);
 		break;
 	case Int:
 		fprintf(fp, "%d", (int)args[sc->offset]);
 		break;
 	case UInt:
 		fprintf(fp, "%u", (unsigned int)args[sc->offset]);
 		break;
 	case PUInt: {
 		unsigned int val;
 
 		if (get_struct(pid, (void *)args[sc->offset], &val,
 		    sizeof(val)) == 0) 
 			fprintf(fp, "{ %u }", val);
 		else
 			fprintf(fp, "0x%lx", args[sc->offset]);
 		break;
 	}
 	case LongHex:
 		fprintf(fp, "0x%lx", args[sc->offset]);
 		break;
 	case Long:
 		fprintf(fp, "%ld", args[sc->offset]);
 		break;
 	case Sizet:
 		fprintf(fp, "%zu", (size_t)args[sc->offset]);
 		break;
 	case Name: {
 		/* NULL-terminated string. */
 		char *tmp2;
 
 		tmp2 = get_string(pid, (void*)args[sc->offset], 0);
 		fprintf(fp, "\"%s\"", tmp2);
 		free(tmp2);
 		break;
 	}
 	case BinString: {
 		/*
 		 * Binary block of data that might have printable characters.
 		 * XXX If type|OUT, assume that the length is the syscall's
 		 * return value.  Otherwise, assume that the length of the block
 		 * is in the next syscall argument.
 		 */
 		int max_string = trussinfo->strsize;
 		char tmp2[max_string + 1], *tmp3;
 		int len;
 		int truncated = 0;
 
 		if (sc->type & OUT)
 			len = retval[0];
 		else
 			len = args[sc->offset + 1];
 
 		/*
 		 * Don't print more than max_string characters, to avoid word
 		 * wrap.  If we have to truncate put some ... after the string.
 		 */
 		if (len > max_string) {
 			len = max_string;
 			truncated = 1;
 		}
 		if (len && get_struct(pid, (void*)args[sc->offset], &tmp2, len)
 		    != -1) {
 			tmp3 = malloc(len * 4 + 1);
 			while (len) {
 				if (strvisx(tmp3, tmp2, len,
 				    VIS_CSTYLE|VIS_TAB|VIS_NL) <= max_string)
 					break;
 				len--;
 				truncated = 1;
 			}
 			fprintf(fp, "\"%s\"%s", tmp3, truncated ?
 			    "..." : "");
 			free(tmp3);
 		} else {
 			fprintf(fp, "0x%lx", args[sc->offset]);
 		}
 		break;
 	}
 	case ExecArgs:
 	case ExecEnv:
 	case StringArray: {
 		uintptr_t addr;
 		union {
 			char *strarray[0];
 			char buf[PAGE_SIZE];
 		} u;
 		char *string;
 		size_t len;
 		u_int first, i;
 
 		/*
 		 * Only parse argv[] and environment arrays from exec calls
 		 * if requested.
 		 */
 		if (((sc->type & ARG_MASK) == ExecArgs &&
 		    (trussinfo->flags & EXECVEARGS) == 0) ||
 		    ((sc->type & ARG_MASK) == ExecEnv &&
 		    (trussinfo->flags & EXECVEENVS) == 0)) {
 			fprintf(fp, "0x%lx", args[sc->offset]);
 			break;
 		}
 
 		/*
 		 * Read a page of pointers at a time.  Punt if the top-level
 		 * pointer is not aligned.  Note that the first read is of
 		 * a partial page.
 		 */
 		addr = args[sc->offset];
 		if (addr % sizeof(char *) != 0) {
 			fprintf(fp, "0x%lx", args[sc->offset]);
 			break;
 		}
 
 		len = PAGE_SIZE - (addr & PAGE_MASK);
 		if (get_struct(pid, (void *)addr, u.buf, len) == -1) {
 			fprintf(fp, "0x%lx", args[sc->offset]);
 			break;
 		}
 
 		fputc('[', fp);
 		first = 1;
 		i = 0;
 		while (u.strarray[i] != NULL) {
 			string = get_string(pid, u.strarray[i], 0);
 			fprintf(fp, "%s \"%s\"", first ? "" : ",", string);
 			free(string);
 			first = 0;
 
 			i++;
 			if (i == len / sizeof(char *)) {
 				addr += len;
 				len = PAGE_SIZE;
 				if (get_struct(pid, (void *)addr, u.buf, len) ==
 				    -1) {
 					fprintf(fp, ", <inval>");
 					break;
 				}
 				i = 0;
 			}
 		}
 		fputs(" ]", fp);
 		break;
 	}
 #ifdef __LP64__
 	case Quad:
 		fprintf(fp, "%ld", args[sc->offset]);
 		break;
 	case QuadHex:
 		fprintf(fp, "0x%lx", args[sc->offset]);
 		break;
 #else
 	case Quad:
 	case QuadHex: {
 		unsigned long long ll;
 
 #if _BYTE_ORDER == _LITTLE_ENDIAN
 		ll = (unsigned long long)args[sc->offset + 1] << 32 |
 		    args[sc->offset];
 #else
 		ll = (unsigned long long)args[sc->offset] << 32 |
 		    args[sc->offset + 1];
 #endif
 		if ((sc->type & ARG_MASK) == Quad)
 			fprintf(fp, "%lld", ll);
 		else
 			fprintf(fp, "0x%llx", ll);
 		break;
 	}
 #endif
 	case PQuadHex: {
 		uint64_t val;
 
 		if (get_struct(pid, (void *)args[sc->offset], &val,
 		    sizeof(val)) == 0) 
 			fprintf(fp, "{ 0x%jx }", (uintmax_t)val);
 		else
 			fprintf(fp, "0x%lx", args[sc->offset]);
 		break;
 	}
 	case Ptr:
 		fprintf(fp, "0x%lx", args[sc->offset]);
 		break;
 	case Readlinkres: {
 		char *tmp2;
 
 		if (retval[0] == -1)
 			break;
 		tmp2 = get_string(pid, (void*)args[sc->offset], retval[0]);
 		fprintf(fp, "\"%s\"", tmp2);
 		free(tmp2);
 		break;
 	}
 	case Ioctl: {
 		const char *temp;
 		unsigned long cmd;
 
 		cmd = args[sc->offset];
 		temp = sysdecode_ioctlname(cmd);
 		if (temp)
 			fputs(temp, fp);
 		else {
 			fprintf(fp, "0x%lx { IO%s%s 0x%lx('%c'), %lu, %lu }",
 			    cmd, cmd & IOC_OUT ? "R" : "",
 			    cmd & IOC_IN ? "W" : "", IOCGROUP(cmd),
 			    isprint(IOCGROUP(cmd)) ? (char)IOCGROUP(cmd) : '?',
 			    cmd & 0xFF, IOCPARM_LEN(cmd));
 		}
 		break;
 	}
 	case Timespec: {
 		struct timespec ts;
 
 		if (get_struct(pid, (void *)args[sc->offset], &ts,
 		    sizeof(ts)) != -1)
 			fprintf(fp, "{ %jd.%09ld }", (intmax_t)ts.tv_sec,
 			    ts.tv_nsec);
 		else
 			fprintf(fp, "0x%lx", args[sc->offset]);
 		break;
 	}
 	case Timespec2: {
 		struct timespec ts[2];
 		const char *sep;
 		unsigned int i;
 
 		if (get_struct(pid, (void *)args[sc->offset], &ts, sizeof(ts))
 		    != -1) {
 			fputs("{ ", fp);
 			sep = "";
 			for (i = 0; i < nitems(ts); i++) {
 				fputs(sep, fp);
 				sep = ", ";
 				switch (ts[i].tv_nsec) {
 				case UTIME_NOW:
 					fprintf(fp, "UTIME_NOW");
 					break;
 				case UTIME_OMIT:
 					fprintf(fp, "UTIME_OMIT");
 					break;
 				default:
 					fprintf(fp, "%jd.%09ld",
 					    (intmax_t)ts[i].tv_sec,
 					    ts[i].tv_nsec);
 					break;
 				}
 			}
 			fputs(" }", fp);
 		} else
 			fprintf(fp, "0x%lx", args[sc->offset]);
 		break;
 	}
 	case Timeval: {
 		struct timeval tv;
 
 		if (get_struct(pid, (void *)args[sc->offset], &tv, sizeof(tv))
 		    != -1)
 			fprintf(fp, "{ %jd.%06ld }", (intmax_t)tv.tv_sec,
 			    tv.tv_usec);
 		else
 			fprintf(fp, "0x%lx", args[sc->offset]);
 		break;
 	}
 	case Timeval2: {
 		struct timeval tv[2];
 
 		if (get_struct(pid, (void *)args[sc->offset], &tv, sizeof(tv))
 		    != -1)
 			fprintf(fp, "{ %jd.%06ld, %jd.%06ld }",
 			    (intmax_t)tv[0].tv_sec, tv[0].tv_usec,
 			    (intmax_t)tv[1].tv_sec, tv[1].tv_usec);
 		else
 			fprintf(fp, "0x%lx", args[sc->offset]);
 		break;
 	}
 	case Itimerval: {
 		struct itimerval itv;
 
 		if (get_struct(pid, (void *)args[sc->offset], &itv,
 		    sizeof(itv)) != -1)
 			fprintf(fp, "{ %jd.%06ld, %jd.%06ld }",
 			    (intmax_t)itv.it_interval.tv_sec,
 			    itv.it_interval.tv_usec,
 			    (intmax_t)itv.it_value.tv_sec,
 			    itv.it_value.tv_usec);
 		else
 			fprintf(fp, "0x%lx", args[sc->offset]);
 		break;
 	}
 	case LinuxSockArgs:
 	{
 		struct linux_socketcall_args largs;
 
 		if (get_struct(pid, (void *)args[sc->offset], (void *)&largs,
 		    sizeof(largs)) != -1)
 			fprintf(fp, "{ %s, 0x%lx }",
 			    lookup(linux_socketcall_ops, largs.what, 10),
 			    (long unsigned int)largs.args);
 		else
 			fprintf(fp, "0x%lx", args[sc->offset]);
 		break;
 	}
 	case Pollfd: {
 		/*
 		 * XXX: A Pollfd argument expects the /next/ syscall argument
 		 * to be the number of fds in the array. This matches the poll
 		 * syscall.
 		 */
 		struct pollfd *pfd;
 		int numfds = args[sc->offset + 1];
 		size_t bytes = sizeof(struct pollfd) * numfds;
 		int i;
 
 		if ((pfd = malloc(bytes)) == NULL)
 			err(1, "Cannot malloc %zu bytes for pollfd array",
 			    bytes);
 		if (get_struct(pid, (void *)args[sc->offset], pfd, bytes)
 		    != -1) {
 			fputs("{", fp);
 			for (i = 0; i < numfds; i++) {
 				fprintf(fp, " %d/%s", pfd[i].fd,
 				    xlookup_bits(poll_flags, pfd[i].events));
 			}
 			fputs(" }", fp);
 		} else {
 			fprintf(fp, "0x%lx", args[sc->offset]);
 		}
 		free(pfd);
 		break;
 	}
 	case Fd_set: {
 		/*
 		 * XXX: A Fd_set argument expects the /first/ syscall argument
 		 * to be the number of fds in the array.  This matches the
 		 * select syscall.
 		 */
 		fd_set *fds;
 		int numfds = args[0];
 		size_t bytes = _howmany(numfds, _NFDBITS) * _NFDBITS;
 		int i;
 
 		if ((fds = malloc(bytes)) == NULL)
 			err(1, "Cannot malloc %zu bytes for fd_set array",
 			    bytes);
 		if (get_struct(pid, (void *)args[sc->offset], fds, bytes)
 		    != -1) {
 			fputs("{", fp);
 			for (i = 0; i < numfds; i++) {
 				if (FD_ISSET(i, fds))
 					fprintf(fp, " %d", i);
 			}
 			fputs(" }", fp);
 		} else
 			fprintf(fp, "0x%lx", args[sc->offset]);
 		free(fds);
 		break;
 	}
 	case Signal:
 		fputs(strsig2(args[sc->offset]), fp);
 		break;
 	case Sigset: {
 		long sig;
 		sigset_t ss;
 		int i, first;
 
 		sig = args[sc->offset];
 		if (get_struct(pid, (void *)args[sc->offset], (void *)&ss,
 		    sizeof(ss)) == -1) {
 			fprintf(fp, "0x%lx", args[sc->offset]);
 			break;
 		}
 		fputs("{ ", fp);
 		first = 1;
 		for (i = 1; i < sys_nsig; i++) {
 			if (sigismember(&ss, i)) {
 				fprintf(fp, "%s%s", !first ? "|" : "",
 				    strsig2(i));
 				first = 0;
 			}
 		}
 		if (!first)
 			fputc(' ', fp);
 		fputc('}', fp);
 		break;
 	}
 	case Sigprocmask:
 		print_integer_arg(sysdecode_sigprocmask_how, fp,
 		    args[sc->offset]);
 		break;
 	case Fcntlflag:
 		/* XXX: Output depends on the value of the previous argument. */
 		if (sysdecode_fcntl_arg_p(args[sc->offset - 1]))
 			sysdecode_fcntl_arg(fp, args[sc->offset - 1],
 			    args[sc->offset], 16);
 		break;
 	case Open:
 		print_mask_arg(sysdecode_open_flags, fp, args[sc->offset]);
 		break;
 	case Fcntl:
 		print_integer_arg(sysdecode_fcntl_cmd, fp, args[sc->offset]);
 		break;
 	case Mprot:
 		print_mask_arg(sysdecode_mmap_prot, fp, args[sc->offset]);
 		break;
 	case Mmapflags:
 		print_mask_arg(sysdecode_mmap_flags, fp, args[sc->offset]);
 		break;
 	case Whence:
 		print_integer_arg(sysdecode_whence, fp, args[sc->offset]);
 		break;
 	case Sockdomain:
 		print_integer_arg(sysdecode_socketdomain, fp, args[sc->offset]);
 		break;
 	case Socktype:
 		print_mask_arg(sysdecode_socket_type, fp, args[sc->offset]);
 		break;
 	case Shutdown:
 		print_integer_arg(sysdecode_shutdown_how, fp, args[sc->offset]);
 		break;
 	case Resource:
 		print_integer_arg(sysdecode_rlimit, fp, args[sc->offset]);
 		break;
 	case RusageWho:
 		print_integer_arg(sysdecode_getrusage_who, fp, args[sc->offset]);
 		break;
 	case Pathconf:
 		print_integer_arg(sysdecode_pathconf_name, fp, args[sc->offset]);
 		break;
 	case Rforkflags:
 		print_mask_arg(sysdecode_rfork_flags, fp, args[sc->offset]);
 		break;
 	case Sockaddr: {
 		socklen_t len;
 
 		if (args[sc->offset] == 0) {
 			fputs("NULL", fp);
 			break;
 		}
 
 		/*
 		 * Extract the address length from the next argument.  If
 		 * this is an output sockaddr (OUT is set), then the
 		 * next argument is a pointer to a socklen_t.  Otherwise
 		 * the next argument contains a socklen_t by value.
 		 */
 		if (sc->type & OUT) {
 			if (get_struct(pid, (void *)args[sc->offset + 1],
 			    &len, sizeof(len)) == -1) {
 				fprintf(fp, "0x%lx", args[sc->offset]);
 				break;
 			}
 		} else
 			len = args[sc->offset + 1];
 
 		print_sockaddr(fp, trussinfo, (void *)args[sc->offset], len);
 		break;
 	}
 	case Sigaction: {
 		struct sigaction sa;
 
 		if (get_struct(pid, (void *)args[sc->offset], &sa, sizeof(sa))
 		    != -1) {
 			fputs("{ ", fp);
 			if (sa.sa_handler == SIG_DFL)
 				fputs("SIG_DFL", fp);
 			else if (sa.sa_handler == SIG_IGN)
 				fputs("SIG_IGN", fp);
 			else
 				fprintf(fp, "%p", sa.sa_handler);
 			fprintf(fp, " %s ss_t }",
 			    xlookup_bits(sigaction_flags, sa.sa_flags));
 		} else
 			fprintf(fp, "0x%lx", args[sc->offset]);
 		break;
 	}
 	case Kevent: {
 		/*
 		 * XXX XXX: The size of the array is determined by either the
 		 * next syscall argument, or by the syscall return value,
 		 * depending on which argument number we are.  This matches the
 		 * kevent syscall, but luckily that's the only syscall that uses
 		 * them.
 		 */
 		struct kevent *ke;
 		int numevents = -1;
 		size_t bytes;
 		int i;
 
 		if (sc->offset == 1)
 			numevents = args[sc->offset+1];
 		else if (sc->offset == 3 && retval[0] != -1)
 			numevents = retval[0];
 
 		if (numevents >= 0) {
 			bytes = sizeof(struct kevent) * numevents;
 			if ((ke = malloc(bytes)) == NULL)
 				err(1,
 				    "Cannot malloc %zu bytes for kevent array",
 				    bytes);
 		} else
 			ke = NULL;
 		if (numevents >= 0 && get_struct(pid, (void *)args[sc->offset],
 		    ke, bytes) != -1) {
 			fputc('{', fp);
 			for (i = 0; i < numevents; i++) {
 				fputc(' ', fp);
 				print_kevent(fp, &ke[i]);
 			}
 			fputs(" }", fp);
 		} else {
 			fprintf(fp, "0x%lx", args[sc->offset]);
 		}
 		free(ke);
 		break;
 	}
 	case Kevent11: {
 		struct kevent_freebsd11 *ke11;
 		struct kevent ke;
 		int numevents = -1;
 		size_t bytes;
 		int i;
 
 		if (sc->offset == 1)
 			numevents = args[sc->offset+1];
 		else if (sc->offset == 3 && retval[0] != -1)
 			numevents = retval[0];
 
 		if (numevents >= 0) {
 			bytes = sizeof(struct kevent_freebsd11) * numevents;
 			if ((ke11 = malloc(bytes)) == NULL)
 				err(1,
 				    "Cannot malloc %zu bytes for kevent array",
 				    bytes);
 		} else
 			ke11 = NULL;
 		memset(&ke, 0, sizeof(ke));
 		if (numevents >= 0 && get_struct(pid, (void *)args[sc->offset],
 		    ke11, bytes) != -1) {
 			fputc('{', fp);
 			for (i = 0; i < numevents; i++) {
 				fputc(' ', fp);
 				ke.ident = ke11[i].ident;
 				ke.filter = ke11[i].filter;
 				ke.flags = ke11[i].flags;
 				ke.fflags = ke11[i].fflags;
 				ke.data = ke11[i].data;
 				ke.udata = ke11[i].udata;
 				print_kevent(fp, &ke);
 			}
 			fputs(" }", fp);
 		} else {
 			fprintf(fp, "0x%lx", args[sc->offset]);
 		}
 		free(ke11);
 		break;
 	}
 	case Stat: {
 		struct stat st;
 
 		if (get_struct(pid, (void *)args[sc->offset], &st, sizeof(st))
 		    != -1) {
 			char mode[12];
 
 			strmode(st.st_mode, mode);
 			fprintf(fp,
 			    "{ mode=%s,inode=%ju,size=%jd,blksize=%ld }", mode,
 			    (uintmax_t)st.st_ino, (intmax_t)st.st_size,
 			    (long)st.st_blksize);
 		} else {
 			fprintf(fp, "0x%lx", args[sc->offset]);
 		}
 		break;
 	}
 	case Stat11: {
 		struct freebsd11_stat st;
 
 		if (get_struct(pid, (void *)args[sc->offset], &st, sizeof(st))
 		    != -1) {
 			char mode[12];
 
 			strmode(st.st_mode, mode);
 			fprintf(fp,
 			    "{ mode=%s,inode=%ju,size=%jd,blksize=%ld }", mode,
 			    (uintmax_t)st.st_ino, (intmax_t)st.st_size,
 			    (long)st.st_blksize);
 		} else {
 			fprintf(fp, "0x%lx", args[sc->offset]);
 		}
 		break;
 	}
 	case StatFs: {
 		unsigned int i;
 		struct statfs buf;
 
 		if (get_struct(pid, (void *)args[sc->offset], &buf,
 		    sizeof(buf)) != -1) {
 			char fsid[17];
 
 			bzero(fsid, sizeof(fsid));
 			if (buf.f_fsid.val[0] != 0 || buf.f_fsid.val[1] != 0) {
 			        for (i = 0; i < sizeof(buf.f_fsid); i++)
 					snprintf(&fsid[i*2],
 					    sizeof(fsid) - (i*2), "%02x",
 					    ((u_char *)&buf.f_fsid)[i]);
 			}
 			fprintf(fp,
 			    "{ fstypename=%s,mntonname=%s,mntfromname=%s,"
 			    "fsid=%s }", buf.f_fstypename, buf.f_mntonname,
 			    buf.f_mntfromname, fsid);
 		} else
 			fprintf(fp, "0x%lx", args[sc->offset]);
 		break;
 	}
 
 	case Rusage: {
 		struct rusage ru;
 
 		if (get_struct(pid, (void *)args[sc->offset], &ru, sizeof(ru))
 		    != -1) {
 			fprintf(fp,
 			    "{ u=%jd.%06ld,s=%jd.%06ld,in=%ld,out=%ld }",
 			    (intmax_t)ru.ru_utime.tv_sec, ru.ru_utime.tv_usec,
 			    (intmax_t)ru.ru_stime.tv_sec, ru.ru_stime.tv_usec,
 			    ru.ru_inblock, ru.ru_oublock);
 		} else
 			fprintf(fp, "0x%lx", args[sc->offset]);
 		break;
 	}
 	case Rlimit: {
 		struct rlimit rl;
 
 		if (get_struct(pid, (void *)args[sc->offset], &rl, sizeof(rl))
 		    != -1) {
 			fprintf(fp, "{ cur=%ju,max=%ju }",
 			    rl.rlim_cur, rl.rlim_max);
 		} else
 			fprintf(fp, "0x%lx", args[sc->offset]);
 		break;
 	}
 	case ExitStatus: {
 		int status;
 
 		if (get_struct(pid, (void *)args[sc->offset], &status,
 		    sizeof(status)) != -1) {
 			fputs("{ ", fp);
 			if (WIFCONTINUED(status))
 				fputs("CONTINUED", fp);
 			else if (WIFEXITED(status))
 				fprintf(fp, "EXITED,val=%d",
 				    WEXITSTATUS(status));
 			else if (WIFSIGNALED(status))
 				fprintf(fp, "SIGNALED,sig=%s%s",
 				    strsig2(WTERMSIG(status)),
 				    WCOREDUMP(status) ? ",cored" : "");
 			else
 				fprintf(fp, "STOPPED,sig=%s",
 				    strsig2(WTERMSIG(status)));
 			fputs(" }", fp);
 		} else
 			fprintf(fp, "0x%lx", args[sc->offset]);
 		break;
 	}
 	case Waitoptions:
 		print_mask_arg(sysdecode_wait6_options, fp, args[sc->offset]);
 		break;
 	case Idtype:
 		print_integer_arg(sysdecode_idtype, fp, args[sc->offset]);
 		break;
 	case Procctl:
 		print_integer_arg(sysdecode_procctl_cmd, fp, args[sc->offset]);
 		break;
 	case Umtxop:
 		print_integer_arg(sysdecode_umtx_op, fp, args[sc->offset]);
 		break;
 	case Atfd:
 		print_integer_arg(sysdecode_atfd, fp, args[sc->offset]);
 		break;
 	case Atflags:
 		print_mask_arg(sysdecode_atflags, fp, args[sc->offset]);
 		break;
 	case Accessmode:
 		print_mask_arg(sysdecode_access_mode, fp, args[sc->offset]);
 		break;
 	case Sysarch:
 		print_integer_arg(sysdecode_sysarch_number, fp,
 		    args[sc->offset]);
 		break;
 	case PipeFds:
 		/*
 		 * The pipe() system call in the kernel returns its
 		 * two file descriptors via return values.  However,
 		 * the interface exposed by libc is that pipe()
 		 * accepts a pointer to an array of descriptors.
 		 * Format the output to match the libc API by printing
 		 * the returned file descriptors as a fake argument.
 		 *
 		 * Overwrite the first retval to signal a successful
 		 * return as well.
 		 */
 		fprintf(fp, "{ %ld, %ld }", retval[0], retval[1]);
 		retval[0] = 0;
 		break;
 	case Utrace: {
 		size_t len;
 		void *utrace_addr;
 
 		len = args[sc->offset + 1];
 		utrace_addr = calloc(1, len);
 		if (get_struct(pid, (void *)args[sc->offset],
 		    (void *)utrace_addr, len) != -1)
 			print_utrace(fp, utrace_addr, len);
 		else
 			fprintf(fp, "0x%lx", args[sc->offset]);
 		free(utrace_addr);
 		break;
 	}
 	case IntArray: {
 		int descriptors[16];
 		unsigned long i, ndescriptors;
 		bool truncated;
 
 		ndescriptors = args[sc->offset + 1];
 		truncated = false;
 		if (ndescriptors > nitems(descriptors)) {
 			ndescriptors = nitems(descriptors);
 			truncated = true;
 		}
 		if (get_struct(pid, (void *)args[sc->offset],
 		    descriptors, ndescriptors * sizeof(descriptors[0])) != -1) {
 			fprintf(fp, "{");
 			for (i = 0; i < ndescriptors; i++)
 				fprintf(fp, i == 0 ? " %d" : ", %d",
 				    descriptors[i]);
 			fprintf(fp, truncated ? ", ... }" : " }");
 		} else
 			fprintf(fp, "0x%lx", args[sc->offset]);
 		break;
 	}
 	case Pipe2:
 		print_mask_arg(sysdecode_pipe2_flags, fp, args[sc->offset]);
 		break;
 	case CapFcntlRights: {
 		uint32_t rights;
 
 		if (sc->type & OUT) {
 			if (get_struct(pid, (void *)args[sc->offset], &rights,
 			    sizeof(rights)) == -1) {
 				fprintf(fp, "0x%lx", args[sc->offset]);
 				break;
 			}
 		} else
 			rights = args[sc->offset];
 		print_mask_arg32(sysdecode_cap_fcntlrights, fp, rights);
 		break;
 	}
 	case Fadvice:
 		print_integer_arg(sysdecode_fadvice, fp, args[sc->offset]);
 		break;
 	case FileFlags: {
 		fflags_t rem;
 
 		if (!sysdecode_fileflags(fp, args[sc->offset], &rem))
 			fprintf(fp, "0x%x", rem);
 		else if (rem != 0)
 			fprintf(fp, "|0x%x", rem);
 		break;
 	}
 	case Flockop:
 		print_mask_arg(sysdecode_flock_operation, fp, args[sc->offset]);
 		break;
 	case Getfsstatmode:
 		print_integer_arg(sysdecode_getfsstat_mode, fp,
 		    args[sc->offset]);
 		break;
 	case Kldsymcmd:
 		print_integer_arg(sysdecode_kldsym_cmd, fp, args[sc->offset]);
 		break;
 	case Kldunloadflags:
 		print_integer_arg(sysdecode_kldunload_flags, fp,
 		    args[sc->offset]);
 		break;
 	case Madvice:
 		print_integer_arg(sysdecode_madvice, fp, args[sc->offset]);
 		break;
 	case Socklent:
 		fprintf(fp, "%u", (socklen_t)args[sc->offset]);
 		break;
 	case Sockprotocol: {
 		const char *temp;
 		int domain, protocol;
 
 		domain = args[sc->offset - 2];
 		protocol = args[sc->offset];
 		if (protocol == 0) {
 			fputs("0", fp);
 		} else {
 			temp = sysdecode_socket_protocol(domain, protocol);
 			if (temp) {
 				fputs(temp, fp);
 			} else {
 				fprintf(fp, "%d", protocol);
 			}
 		}
 		break;
 	}
 	case Sockoptlevel:
 		print_integer_arg(sysdecode_sockopt_level, fp,
 		    args[sc->offset]);
 		break;
 	case Sockoptname: {
 		const char *temp;
 		int level, name;
 
 		level = args[sc->offset - 1];
 		name = args[sc->offset];
 		temp = sysdecode_sockopt_name(level, name);
 		if (temp) {
 			fputs(temp, fp);
 		} else {
 			fprintf(fp, "%d", name);
 		}
 		break;
 	}
 	case Msgflags:
 		print_mask_arg(sysdecode_msg_flags, fp, args[sc->offset]);
 		break;
 	case CapRights: {
 		cap_rights_t rights;
 
 		if (get_struct(pid, (void *)args[sc->offset], &rights,
 		    sizeof(rights)) != -1) {
 			fputs("{ ", fp);
 			sysdecode_cap_rights(fp, &rights);
 			fputs(" }", fp);
 		} else
 			fprintf(fp, "0x%lx", args[sc->offset]);
 		break;
 	}
 	case Acltype:
 		print_integer_arg(sysdecode_acltype, fp, args[sc->offset]);
 		break;
 	case Extattrnamespace:
 		print_integer_arg(sysdecode_extattrnamespace, fp,
 		    args[sc->offset]);
 		break;
 	case Minherit:
 		print_integer_arg(sysdecode_minherit_inherit, fp,
 		    args[sc->offset]);
 		break;
 	case Mlockall:
 		print_mask_arg(sysdecode_mlockall_flags, fp, args[sc->offset]);
 		break;
 	case Mountflags:
 		print_mask_arg(sysdecode_mount_flags, fp, args[sc->offset]);
 		break;
 	case Msync:
 		print_mask_arg(sysdecode_msync_flags, fp, args[sc->offset]);
 		break;
 	case Priowhich:
 		print_integer_arg(sysdecode_prio_which, fp, args[sc->offset]);
 		break;
 	case Ptraceop:
 		print_integer_arg(sysdecode_ptrace_request, fp,
 		    args[sc->offset]);
 		break;
 	case Quotactlcmd:
 		if (!sysdecode_quotactl_cmd(fp, args[sc->offset]))
 			fprintf(fp, "%#x", (int)args[sc->offset]);
 		break;
 	case Reboothowto:
 		print_mask_arg(sysdecode_reboot_howto, fp, args[sc->offset]);
 		break;
 	case Rtpriofunc:
 		print_integer_arg(sysdecode_rtprio_function, fp,
 		    args[sc->offset]);
 		break;
 	case Schedpolicy:
 		print_integer_arg(sysdecode_scheduler_policy, fp,
 		    args[sc->offset]);
 		break;
 	case Schedparam: {
 		struct sched_param sp;
 
 		if (get_struct(pid, (void *)args[sc->offset], &sp,
 		    sizeof(sp)) != -1)
 			fprintf(fp, "{ %d }", sp.sched_priority);
 		else
 			fprintf(fp, "0x%lx", args[sc->offset]);
 		break;
 	}
 	case PSig: {
 		int sig;
 
 		if (get_struct(pid, (void *)args[sc->offset], &sig,
 		    sizeof(sig)) == 0) 
 			fprintf(fp, "{ %s }", strsig2(sig));
 		else
 			fprintf(fp, "0x%lx", args[sc->offset]);
 		break;
 	}
 	case Siginfo: {
 		siginfo_t si;
 
 		if (get_struct(pid, (void *)args[sc->offset], &si,
 		    sizeof(si)) != -1) {
 			fprintf(fp, "{ signo=%s", strsig2(si.si_signo));
 			decode_siginfo(fp, &si);
 			fprintf(fp, " }");
 		} else
 			fprintf(fp, "0x%lx", args[sc->offset]);
 		break;
 	}
 	case Iovec:
 		/*
 		 * Print argument as an array of struct iovec, where the next
 		 * syscall argument is the number of elements of the array.
 		 */
 
 		print_iovec(fp, trussinfo, (void *)args[sc->offset],
 		    (int)args[sc->offset + 1]);
 		break;
 	case Sctpsndrcvinfo: {
 		struct sctp_sndrcvinfo info;
 
 		if (get_struct(pid, (void *)args[sc->offset],
 		    &info, sizeof(struct sctp_sndrcvinfo)) == -1) {
 			fprintf(fp, "0x%lx", args[sc->offset]);
 			break;
 		}
 		print_sctp_sndrcvinfo(fp, sc->type & OUT, &info);
 		break;
 	}
 	case Msghdr: {
 		struct msghdr msghdr;
 
 		if (get_struct(pid, (void *)args[sc->offset],
 		    &msghdr, sizeof(struct msghdr)) == -1) {
 			fprintf(fp, "0x%lx", args[sc->offset]);
 			break;
 		}
 		fputs("{", fp);
 		print_sockaddr(fp, trussinfo, msghdr.msg_name, msghdr.msg_namelen);
 		fprintf(fp, ",%d,", msghdr.msg_namelen);
 		print_iovec(fp, trussinfo, msghdr.msg_iov, msghdr.msg_iovlen);
 		fprintf(fp, ",%d,", msghdr.msg_iovlen);
 		print_cmsgs(fp, pid, sc->type & OUT, &msghdr);
 		fprintf(fp, ",%u,", msghdr.msg_controllen);
 		print_mask_arg(sysdecode_msg_flags, fp, msghdr.msg_flags);
 		fputs("}", fp);
 		break;
 	}
 
 	case CloudABIAdvice:
 		fputs(xlookup(cloudabi_advice, args[sc->offset]), fp);
 		break;
 	case CloudABIClockID:
 		fputs(xlookup(cloudabi_clockid, args[sc->offset]), fp);
 		break;
 	case CloudABIFDSFlags:
 		fputs(xlookup_bits(cloudabi_fdsflags, args[sc->offset]), fp);
 		break;
 	case CloudABIFDStat: {
 		cloudabi_fdstat_t fds;
 		if (get_struct(pid, (void *)args[sc->offset], &fds, sizeof(fds))
 		    != -1) {
 			fprintf(fp, "{ %s, ",
 			    xlookup(cloudabi_filetype, fds.fs_filetype));
 			fprintf(fp, "%s, ... }",
 			    xlookup_bits(cloudabi_fdflags, fds.fs_flags));
 		} else
 			fprintf(fp, "0x%lx", args[sc->offset]);
 		break;
 	}
 	case CloudABIFileStat: {
 		cloudabi_filestat_t fsb;
 		if (get_struct(pid, (void *)args[sc->offset], &fsb, sizeof(fsb))
 		    != -1)
 			fprintf(fp, "{ %s, %ju }",
 			    xlookup(cloudabi_filetype, fsb.st_filetype),
 			    (uintmax_t)fsb.st_size);
 		else
 			fprintf(fp, "0x%lx", args[sc->offset]);
 		break;
 	}
 	case CloudABIFileType:
 		fputs(xlookup(cloudabi_filetype, args[sc->offset]), fp);
 		break;
 	case CloudABIFSFlags:
 		fputs(xlookup_bits(cloudabi_fsflags, args[sc->offset]), fp);
 		break;
 	case CloudABILookup:
 		if ((args[sc->offset] & CLOUDABI_LOOKUP_SYMLINK_FOLLOW) != 0)
 			fprintf(fp, "%d|LOOKUP_SYMLINK_FOLLOW",
 			    (int)args[sc->offset]);
 		else
 			fprintf(fp, "%d", (int)args[sc->offset]);
 		break;
 	case CloudABIMFlags:
 		fputs(xlookup_bits(cloudabi_mflags, args[sc->offset]), fp);
 		break;
 	case CloudABIMProt:
 		fputs(xlookup_bits(cloudabi_mprot, args[sc->offset]), fp);
 		break;
 	case CloudABIMSFlags:
 		fputs(xlookup_bits(cloudabi_msflags, args[sc->offset]), fp);
 		break;
 	case CloudABIOFlags:
 		fputs(xlookup_bits(cloudabi_oflags, args[sc->offset]), fp);
 		break;
 	case CloudABISDFlags:
 		fputs(xlookup_bits(cloudabi_sdflags, args[sc->offset]), fp);
 		break;
 	case CloudABISignal:
 		fputs(xlookup(cloudabi_signal, args[sc->offset]), fp);
 		break;
 	case CloudABITimestamp:
 		fprintf(fp, "%lu.%09lus", args[sc->offset] / 1000000000,
 		    args[sc->offset] % 1000000000);
 		break;
 	case CloudABIULFlags:
 		fputs(xlookup_bits(cloudabi_ulflags, args[sc->offset]), fp);
 		break;
 	case CloudABIWhence:
 		fputs(xlookup(cloudabi_whence, args[sc->offset]), fp);
 		break;
 
 	default:
 		errx(1, "Invalid argument type %d\n", sc->type & ARG_MASK);
 	}
 	fclose(fp);
 	return (tmp);
 }
 
 /*
  * Print (to outfile) the system call and its arguments.
  */
 void
 print_syscall(struct trussinfo *trussinfo)
 {
 	struct threadinfo *t;
 	const char *name;
 	char **s_args;
 	int i, len, nargs;
 
 	t = trussinfo->curthread;
 
 	name = t->cs.sc->name;
 	nargs = t->cs.nargs;
 	s_args = t->cs.s_args;
 
 	len = print_line_prefix(trussinfo);
 	len += fprintf(trussinfo->outfile, "%s(", name);
 
 	for (i = 0; i < nargs; i++) {
 		if (s_args[i] != NULL)
 			len += fprintf(trussinfo->outfile, "%s", s_args[i]);
 		else
 			len += fprintf(trussinfo->outfile,
 			    "<missing argument>");
 		len += fprintf(trussinfo->outfile, "%s", i < (nargs - 1) ?
 		    "," : "");
 	}
 	len += fprintf(trussinfo->outfile, ")");
 	for (i = 0; i < 6 - (len / 8); i++)
 		fprintf(trussinfo->outfile, "\t");
 }
 
 void
 print_syscall_ret(struct trussinfo *trussinfo, int errorp, long *retval)
 {
 	struct timespec timediff;
 	struct threadinfo *t;
 	struct syscall *sc;
 	int error;
 
 	t = trussinfo->curthread;
 	sc = t->cs.sc;
 	if (trussinfo->flags & COUNTONLY) {
-		timespecsubt(&t->after, &t->before, &timediff);
+		timespecsub(&t->after, &t->before, &timediff);
 		timespecadd(&sc->time, &timediff, &sc->time);
 		sc->ncalls++;
 		if (errorp)
 			sc->nerror++;
 		return;
 	}
 
 	print_syscall(trussinfo);
 	fflush(trussinfo->outfile);
 
 	if (retval == NULL) {
 		/*
 		 * This system call resulted in the current thread's exit,
 		 * so there is no return value or error to display.
 		 */
 		fprintf(trussinfo->outfile, "\n");
 		return;
 	}
 
 	if (errorp) {
 		error = sysdecode_abi_to_freebsd_errno(t->proc->abi->abi,
 		    retval[0]);
 		fprintf(trussinfo->outfile, " ERR#%ld '%s'\n", retval[0],
 		    error == INT_MAX ? "Unknown error" : strerror(error));
 	}
 #ifndef __LP64__
 	else if (sc->ret_type == 2) {
 		off_t off;
 
 #if _BYTE_ORDER == _LITTLE_ENDIAN
 		off = (off_t)retval[1] << 32 | retval[0];
 #else
 		off = (off_t)retval[0] << 32 | retval[1];
 #endif
 		fprintf(trussinfo->outfile, " = %jd (0x%jx)\n", (intmax_t)off,
 		    (intmax_t)off);
 	}
 #endif
 	else
 		fprintf(trussinfo->outfile, " = %ld (0x%lx)\n", retval[0],
 		    retval[0]);
 }
 
 void
 print_summary(struct trussinfo *trussinfo)
 {
 	struct timespec total = {0, 0};
 	struct syscall *sc;
 	int ncall, nerror;
 
 	fprintf(trussinfo->outfile, "%-20s%15s%8s%8s\n",
 	    "syscall", "seconds", "calls", "errors");
 	ncall = nerror = 0;
 	STAILQ_FOREACH(sc, &syscalls, entries)
 		if (sc->ncalls) {
 			fprintf(trussinfo->outfile, "%-20s%5jd.%09ld%8d%8d\n",
 			    sc->name, (intmax_t)sc->time.tv_sec,
 			    sc->time.tv_nsec, sc->ncalls, sc->nerror);
 			timespecadd(&total, &sc->time, &total);
 			ncall += sc->ncalls;
 			nerror += sc->nerror;
 		}
 	fprintf(trussinfo->outfile, "%20s%15s%8s%8s\n",
 	    "", "-------------", "-------", "-------");
 	fprintf(trussinfo->outfile, "%-20s%5jd.%09ld%8d%8d\n",
 	    "", (intmax_t)total.tv_sec, total.tv_nsec, ncall, nerror);
 }
Index: head/usr.bin/truss/truss.h
===================================================================
--- head/usr.bin/truss/truss.h	(revision 336913)
+++ head/usr.bin/truss/truss.h	(revision 336914)
@@ -1,141 +1,121 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright 2001 Jamey Wood
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/linker_set.h>
 #include <sys/queue.h>
 
 #define	FOLLOWFORKS		0x00000001
 #define	RELATIVETIMESTAMPS	0x00000002
 #define	ABSOLUTETIMESTAMPS	0x00000004
 #define	NOSIGS			0x00000008
 #define	EXECVEARGS		0x00000010
 #define	EXECVEENVS		0x00000020
 #define	COUNTONLY		0x00000040
 #define	DISPLAYTIDS		0x00000080
 
 struct procinfo;
 struct syscall;
 struct trussinfo;
 
 /*
  * The lookup of normal system calls are optimized by using a fixed
  * array for the first 1024 system calls that can be indexed directly.
  * Unknown system calls with other IDs are stored in a linked list.
  */
 #define	SYSCALL_NORMAL_COUNT	1024
 
 struct extra_syscall {
 	STAILQ_ENTRY(extra_syscall) entries;
 	struct syscall *sc;
 	u_int number;
 };
 
 struct procabi {
 	const char *type;
 	enum sysdecode_abi abi;
 	int (*fetch_args)(struct trussinfo *, u_int);
 	int (*fetch_retval)(struct trussinfo *, long *, int *);
 	STAILQ_HEAD(, extra_syscall) extra_syscalls;
 	struct syscall *syscalls[SYSCALL_NORMAL_COUNT];
 };
 
 #define	PROCABI(abi)	DATA_SET(procabi, abi)
 
 /*
  * This is confusingly named.  It holds per-thread state about the
  * currently executing system call.  syscall.h defines a struct
  * syscall that holds metadata used to format system call arguments.
  *
  * NB: args[] stores the raw argument values (e.g. from registers)
  * passed to the system call.  s_args[] stores a string representation
  * of a system call's arguments.  These do not necessarily map one to
  * one.  A system call description may omit individual arguments
  * (padding) or combine adjacent arguments (e.g. when passing an off_t
  * argument on a 32-bit system).  The nargs member contains the count
  * of valid pointers in s_args[], not args[].
  */
 struct current_syscall {
 	struct syscall *sc;
 	unsigned int number;
 	unsigned int nargs;
 	unsigned long args[10];
 	char *s_args[10];	/* the printable arguments */
 };
 
 struct threadinfo
 {
 	LIST_ENTRY(threadinfo) entries;
 	struct procinfo *proc;
 	lwpid_t tid;
 	int in_syscall;
 	struct current_syscall cs;
 	struct timespec before;
 	struct timespec after;
 };
 
 struct procinfo {
 	LIST_ENTRY(procinfo) entries;
 	pid_t pid;
 	struct procabi *abi;
 
 	LIST_HEAD(, threadinfo) threadlist;
 };
 
 struct trussinfo
 {
 	int flags;
 	int strsize;
 	FILE *outfile;
 
 	struct timespec start_time;
 
 	struct threadinfo *curthread;
 
 	LIST_HEAD(, procinfo) proclist;
 };
-
-#define	timespecsubt(tvp, uvp, vvp)					\
-	do {								\
-		(vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec;		\
-		(vvp)->tv_nsec = (tvp)->tv_nsec - (uvp)->tv_nsec;	\
-		if ((vvp)->tv_nsec < 0) {				\
-			(vvp)->tv_sec--;				\
-			(vvp)->tv_nsec += 1000000000;			\
-		}							\
-	} while (0)
-
-#define	timespecadd(tvp, uvp, vvp)					\
-	do {								\
-		(vvp)->tv_sec = (tvp)->tv_sec + (uvp)->tv_sec;		\
-		(vvp)->tv_nsec = (tvp)->tv_nsec + (uvp)->tv_nsec;	\
-		if ((vvp)->tv_nsec > 1000000000) {				\
-			(vvp)->tv_sec++;				\
-			(vvp)->tv_nsec -= 1000000000;			\
-		}							\
-	} while (0)
Index: head/usr.sbin/camdd/camdd.c
===================================================================
--- head/usr.sbin/camdd/camdd.c	(revision 336913)
+++ head/usr.sbin/camdd/camdd.c	(revision 336914)
@@ -1,3527 +1,3510 @@
 /*-
  * Copyright (c) 1997-2007 Kenneth D. Merry
  * Copyright (c) 2013, 2014, 2015 Spectra Logic Corporation
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions, and the following disclaimer,
  *    without modification.
  * 2. Redistributions in binary form must reproduce at minimum a disclaimer
  *    substantially similar to the "NO WARRANTY" disclaimer below
  *    ("Disclaimer") and any redistribution must be conditioned upon
  *    including a substantially similar Disclaimer requirement for further
  *    binary redistribution.
  *
  * NO WARRANTY
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGES.
  *
  * Authors: Ken Merry           (Spectra Logic Corporation)
  */
 
 /*
  * This is eventually intended to be:
  * - A basic data transfer/copy utility
  * - A simple benchmark utility
  * - An example of how to use the asynchronous pass(4) driver interface.
  */
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/ioctl.h>
 #include <sys/stdint.h>
 #include <sys/types.h>
 #include <sys/endian.h>
 #include <sys/param.h>
 #include <sys/sbuf.h>
 #include <sys/stat.h>
 #include <sys/event.h>
 #include <sys/time.h>
 #include <sys/uio.h>
 #include <vm/vm.h>
 #include <machine/bus.h>
 #include <sys/bus.h>
 #include <sys/bus_dma.h>
 #include <sys/mtio.h>
 #include <sys/conf.h>
 #include <sys/disk.h>
 
 #include <stdio.h>
 #include <stdlib.h>
 #include <semaphore.h>
 #include <string.h>
 #include <unistd.h>
 #include <inttypes.h>
 #include <limits.h>
 #include <fcntl.h>
 #include <ctype.h>
 #include <err.h>
 #include <libutil.h>
 #include <pthread.h>
 #include <assert.h>
 #include <bsdxml.h>
 
 #include <cam/cam.h>
 #include <cam/cam_debug.h>
 #include <cam/cam_ccb.h>
 #include <cam/scsi/scsi_all.h>
 #include <cam/scsi/scsi_da.h>
 #include <cam/scsi/scsi_pass.h>
 #include <cam/scsi/scsi_message.h>
 #include <cam/scsi/smp_all.h>
 #include <camlib.h>
 #include <mtlib.h>
 #include <zlib.h>
 
 typedef enum {
 	CAMDD_CMD_NONE		= 0x00000000,
 	CAMDD_CMD_HELP		= 0x00000001,
 	CAMDD_CMD_WRITE		= 0x00000002,
 	CAMDD_CMD_READ		= 0x00000003
 } camdd_cmdmask;
 
 typedef enum {
 	CAMDD_ARG_NONE		= 0x00000000,
 	CAMDD_ARG_VERBOSE	= 0x00000001,
 	CAMDD_ARG_DEVICE	= 0x00000002,
 	CAMDD_ARG_BUS		= 0x00000004,
 	CAMDD_ARG_TARGET	= 0x00000008,
 	CAMDD_ARG_LUN		= 0x00000010,
 	CAMDD_ARG_UNIT		= 0x00000020,
 	CAMDD_ARG_TIMEOUT	= 0x00000040,
 	CAMDD_ARG_ERR_RECOVER	= 0x00000080,
 	CAMDD_ARG_RETRIES	= 0x00000100
 } camdd_argmask;
 
 typedef enum {
 	CAMDD_DEV_NONE		= 0x00,
 	CAMDD_DEV_PASS		= 0x01,
 	CAMDD_DEV_FILE		= 0x02
 } camdd_dev_type;
 
 struct camdd_io_opts {
 	camdd_dev_type	dev_type;
 	char		*dev_name;
 	uint64_t	blocksize;
 	uint64_t	queue_depth;
 	uint64_t	offset;
 	int		min_cmd_size;
 	int		write_dev;
 	uint64_t	debug;
 };
 
 typedef enum {
 	CAMDD_BUF_NONE,
 	CAMDD_BUF_DATA,
 	CAMDD_BUF_INDIRECT
 } camdd_buf_type;
 
 struct camdd_buf_indirect {
 	/*
 	 * Pointer to the source buffer.
 	 */
 	struct camdd_buf *src_buf;
 
 	/*
 	 * Offset into the source buffer, in bytes.
 	 */
 	uint64_t	  offset;
 	/*
 	 * Pointer to the starting point in the source buffer.
 	 */
 	uint8_t		 *start_ptr;
 
 	/*
 	 * Length of this chunk in bytes.
 	 */
 	size_t		  len;
 };
 
 struct camdd_buf_data {
 	/*
 	 * Buffer allocated when we allocate this camdd_buf.  This should
 	 * be the size of the blocksize for this device.
 	 */
 	uint8_t			*buf;
 
 	/*
 	 * The amount of backing store allocated in buf.  Generally this
 	 * will be the blocksize of the device.
 	 */
 	uint32_t		 alloc_len;
 
 	/*
 	 * The amount of data that was put into the buffer (on reads) or
 	 * the amount of data we have put onto the src_list so far (on
 	 * writes).
 	 */
 	uint32_t		 fill_len;
 
 	/*
 	 * The amount of data that was not transferred.
 	 */
 	uint32_t		 resid;
 
 	/*
 	 * Starting byte offset on the reader.
 	 */
 	uint64_t		 src_start_offset;
 	
 	/*
 	 * CCB used for pass(4) device targets.
 	 */
 	union ccb		 ccb;
 
 	/*
 	 * Number of scatter/gather segments.
 	 */
 	int			 sg_count;
 
 	/*
 	 * Set if we had to tack on an extra buffer to round the transfer
 	 * up to a sector size.
 	 */
 	int			 extra_buf;
 
 	/*
 	 * Scatter/gather list used generally when we're the writer for a
 	 * pass(4) device. 
 	 */
 	bus_dma_segment_t	*segs;
 
 	/*
 	 * Scatter/gather list used generally when we're the writer for a
 	 * file or block device;
 	 */
 	struct iovec		*iovec;
 };
 
 union camdd_buf_types {
 	struct camdd_buf_indirect	indirect;
 	struct camdd_buf_data		data;
 };
 
 typedef enum {
 	CAMDD_STATUS_NONE,
 	CAMDD_STATUS_OK,
 	CAMDD_STATUS_SHORT_IO,
 	CAMDD_STATUS_EOF,
 	CAMDD_STATUS_ERROR
 } camdd_buf_status;
 
 struct camdd_buf {
 	camdd_buf_type		 buf_type;
 	union camdd_buf_types	 buf_type_spec;
 
 	camdd_buf_status	 status;
 
 	uint64_t		 lba;
 	size_t			 len;
 
 	/*
 	 * A reference count of how many indirect buffers point to this
 	 * buffer.
 	 */
 	int			 refcount;
 
 	/*
 	 * A link back to our parent device.
 	 */
 	struct camdd_dev	*dev;
 	STAILQ_ENTRY(camdd_buf)  links;
 	STAILQ_ENTRY(camdd_buf)  work_links;
 
 	/*
 	 * A count of the buffers on the src_list.
 	 */
 	int			 src_count;
 
 	/*
 	 * List of buffers from our partner thread that are the components
 	 * of this buffer for the I/O.  Uses src_links.
 	 */
 	STAILQ_HEAD(,camdd_buf)	 src_list;
 	STAILQ_ENTRY(camdd_buf)  src_links;
 };
 
 #define	NUM_DEV_TYPES	2
 
 struct camdd_dev_pass {
 	int			 scsi_dev_type;
 	int			 protocol;
 	struct cam_device	*dev;
 	uint64_t		 max_sector;
 	uint32_t		 block_len;
 	uint32_t		 cpi_maxio;
 };
 
 typedef enum {
 	CAMDD_FILE_NONE,
 	CAMDD_FILE_REG,
 	CAMDD_FILE_STD,
 	CAMDD_FILE_PIPE,
 	CAMDD_FILE_DISK,
 	CAMDD_FILE_TAPE,
 	CAMDD_FILE_TTY,
 	CAMDD_FILE_MEM
 } camdd_file_type;
 
 typedef enum {
 	CAMDD_FF_NONE 		= 0x00,
 	CAMDD_FF_CAN_SEEK	= 0x01
 } camdd_file_flags;
 
 /*
  * Device-specific state for a CAMDD_DEV_FILE device.
  */
 struct camdd_dev_file {
 	int			 fd;		/* closed by camdd_free_dev() unless -1 */
 	struct stat		 sb;		/* fstat(2) result for fd */
 	char			 filename[MAXPATHLEN + 1];
 	camdd_file_type		 file_type;
 	camdd_file_flags	 file_flags;
 	uint8_t			*tmp_buf;	/* freed by camdd_free_dev() */
 };
 
 /*
  * Device-specific state for a block device.
  * NOTE(review): no user of this member is visible in this part of the
  * file -- confirm whether it is still needed.
  */
 struct camdd_dev_block {
 	int			 fd;
 	uint64_t		 size_bytes;
 	uint32_t		 block_len;
 };
 
 /*
  * Type-specific portion of a struct camdd_dev.  The dev_type field of the
  * enclosing device selects the member in use (e.g. "file" for
  * CAMDD_DEV_FILE, "pass" for CAMDD_DEV_PASS).
  */
 union camdd_dev_spec {
 	struct camdd_dev_pass	pass;
 	struct camdd_dev_file	file;
 	struct camdd_dev_block	block;
 };
 
 /*
  * Bit flags kept in the "flags" field of struct camdd_dev.  Each value
  * is a distinct bit so that flags can be combined.
  */
 typedef enum {
 	CAMDD_DEV_FLAG_NONE		= 0x00,
 	CAMDD_DEV_FLAG_EOF		= 0x01,
 	CAMDD_DEV_FLAG_PEER_EOF		= 0x02,
 	CAMDD_DEV_FLAG_ACTIVE		= 0x04,
 	CAMDD_DEV_FLAG_EOF_SENT		= 0x08,
 	CAMDD_DEV_FLAG_EOF_QUEUED	= 0x10
 } camdd_dev_flags;
 
 /*
  * Per-device state.  One of these is allocated by camdd_alloc_dev() for
  * each side of the transfer; dev_type selects the valid member of
  * dev_spec, and peer_dev points at the device on the other side.
  */
 struct camdd_dev {
 	camdd_dev_type		 dev_type;
 	union camdd_dev_spec	 dev_spec;
 	camdd_dev_flags		 flags;
 	char			 device_name[MAXPATHLEN+1];
 	/* Size of each data buffer allocated for this device. */
 	uint32_t		 blocksize;
 	uint32_t		 sector_size;
 	uint64_t		 max_sector;
 	uint64_t		 sector_io_limit;
 	int			 min_cmd_size;
 	int			 write_dev;
 	int			 retry_count;
 	int			 io_timeout;
 	int			 debug;
 	uint64_t		 start_offset_bytes;
 	uint64_t		 next_io_pos_bytes;
 	uint64_t		 next_peer_pos_bytes;
 	uint64_t		 next_completion_pos_bytes;
 	uint64_t		 peer_bytes_queued;
 	uint64_t		 bytes_transferred;
 	uint32_t		 target_queue_depth;
 	uint32_t		 cur_active_io;
 	uint8_t			*extra_buf;
 	uint32_t		 extra_buf_len;
 	struct camdd_dev	*peer_dev;
 	/* Initialized in camdd_alloc_dev(). */
 	pthread_mutex_t		 mutex;
 	pthread_cond_t		 cond;
 	/* kqueue(2) descriptor created in camdd_alloc_dev(). */
 	int			 kq;
 
 	/*
 	 * Per-device-type method pointers, filled in by the probe routine.
 	 * fetch may be NULL (it is for file devices).
 	 */
 	int			 (*run)(struct camdd_dev *dev);
 	int			 (*fetch)(struct camdd_dev *dev);
 
 	/*
 	 * Buffers that are available for I/O.  Uses links.
 	 */
 	STAILQ_HEAD(,camdd_buf)	 free_queue;
 
 	/*
 	 * Free indirect buffers.  These are used for breaking a large
 	 * buffer into multiple pieces.
 	 */
 	STAILQ_HEAD(,camdd_buf)	 free_indirect_queue;
 
 	/*
 	 * Buffers that have been queued to the kernel.  Uses links.
 	 */
 	STAILQ_HEAD(,camdd_buf)	 active_queue;
 
 	/*
 	 * Will generally contain one of our buffers that is waiting for enough
 	 * I/O from our partner thread to be able to execute.  This will
 	 * generally happen when our per-I/O-size is larger than the
 	 * partner thread's per-I/O-size.  Uses links.
 	 */
 	STAILQ_HEAD(,camdd_buf)	 pending_queue;
 
 	/*
 	 * Number of buffers on the pending queue
 	 */
 	int			 num_pending_queue;
 
 	/*
 	 * Buffers that are filled and ready to execute.  This is used when
 	 * our partner (reader) thread sends us blocks that are larger than
 	 * our blocksize, and so we have to split them into multiple pieces.
 	 */
 	STAILQ_HEAD(,camdd_buf)	 run_queue;
 
 	/*
 	 * Number of buffers on the run queue.
 	 */
 	int			 num_run_queue;
 
 	/*
 	 * NOTE(review): presumably holds buffers that arrived out of order
 	 * until they can be processed in sequence -- confirm against the
 	 * code that uses it.
 	 */
 	STAILQ_HEAD(,camdd_buf)	 reorder_queue;
 
 	/*
 	 * Number of buffers on the reorder queue.
 	 */
 	int			 num_reorder_queue;
 
 	/*
 	 * Buffers that have been queued to us by our partner thread
 	 * (generally the reader thread) to be written out.  Uses
 	 * work_links.
 	 */
 	STAILQ_HEAD(,camdd_buf)	 work_queue;
 
 	/*
 	 * Buffers that have been completed by our partner thread.  Uses
 	 * work_links.
 	 */
 	STAILQ_HEAD(,camdd_buf)	 peer_done_queue;
 
 	/*
 	 * Number of buffers on the peer done queue.
 	 */
 	uint32_t		 num_peer_done_queue;
 
 	/*
 	 * A list of buffers that we have queued to our peer thread.  Uses
 	 * links.
 	 */
 	STAILQ_HEAD(,camdd_buf)	 peer_work_queue;
 
 	/*
 	 * Number of buffers on the peer work queue.
 	 */
 	uint32_t		 num_peer_work_queue;
 };
 
 /*
  * File-scope state.
  * NOTE(review): judging by the names, need_exit / error_exit / need_status
  * are request flags (exit, exit-with-error, print status) and camdd_sem
  * is used for cross-thread signalling; the code that sets and reads them
  * is outside this part of the file -- confirm.
  */
 static sem_t camdd_sem;
 static sig_atomic_t need_exit = 0;
 static sig_atomic_t error_exit = 0;
 static sig_atomic_t need_status = 0;
 
 /*
  * Return the smaller of a and b.
  *
  * Both the arguments and the whole expansion are parenthesized so that
  * the macro can be used inside larger expressions (e.g. "min(a, b) + 1")
  * and with compound arguments.  Each argument may be evaluated more than
  * once, so do not pass expressions with side effects.
  */
 #ifndef min
 #define	min(a, b) (((a) < (b)) ? (a) : (b))
 #endif
 
-/*
- * XXX KDM private copy of timespecsub().  This is normally defined in
- * sys/time.h, but is only enabled in the kernel.  If that definition is
- * enabled in userland, it breaks the build of libnetbsd.
- */
-#ifndef timespecsub
-#define	timespecsub(vvp, uvp)						\
-	do {								\
-		(vvp)->tv_sec -= (uvp)->tv_sec;				\
-		(vvp)->tv_nsec -= (uvp)->tv_nsec;			\
-		if ((vvp)->tv_nsec < 0) {				\
-			(vvp)->tv_sec--;				\
-			(vvp)->tv_nsec += 1000000000;			\
-		}							\
-	} while (0)
-#endif
 
-
 /* Generically useful offsets into the peripheral private area */
 #define ppriv_ptr0 periph_priv.entries[0].ptr
 #define ppriv_ptr1 periph_priv.entries[1].ptr
 #define ppriv_field0 periph_priv.entries[0].field
 #define ppriv_field1 periph_priv.entries[1].field
 
 /*
  * ccb_buf aliases the first private pointer slot.
  * NOTE(review): presumably used to stash the owning camdd_buf in a CCB
  * header -- confirm against the users of this macro.
  */
 #define	ccb_buf	ppriv_ptr0
 
 /* Default I/O size (bytes) and queue depth for file devices. */
 #define	CAMDD_FILE_DEFAULT_BLOCK	524288
 #define	CAMDD_FILE_DEFAULT_DEPTH	1
 /* Upper bound on I/O size (bytes) and default queue depth for pass(4). */
 #define	CAMDD_PASS_MAX_BLOCK		1048576
 #define	CAMDD_PASS_DEFAULT_DEPTH	6
 /*
  * Default read/write timeout (presumably in milliseconds -- confirm).
  * Parenthesized so that the macro expands safely inside larger
  * expressions such as "x / CAMDD_PASS_RW_TIMEOUT".
  */
 #define	CAMDD_PASS_RW_TIMEOUT		(60 * 1000)
 
 static int parse_btl(char *tstr, int *bus, int *target, int *lun,
 		     camdd_argmask *arglst);
 void camdd_free_dev(struct camdd_dev *dev);
 struct camdd_dev *camdd_alloc_dev(camdd_dev_type dev_type,
 				  struct kevent *new_ke, int num_ke,
 				  int retry_count, int timeout);
 static struct camdd_buf *camdd_alloc_buf(struct camdd_dev *dev,
 					 camdd_buf_type buf_type);
 void camdd_release_buf(struct camdd_buf *buf);
 struct camdd_buf *camdd_get_buf(struct camdd_dev *dev, camdd_buf_type buf_type);
 int camdd_buf_sg_create(struct camdd_buf *buf, int iovec,
 			uint32_t sector_size, uint32_t *num_sectors_used,
 			int *double_buf_needed);
 uint32_t camdd_buf_get_len(struct camdd_buf *buf);
 void camdd_buf_add_child(struct camdd_buf *buf, struct camdd_buf *child_buf);
 int camdd_probe_tape(int fd, char *filename, uint64_t *max_iosize,
 		     uint64_t *max_blk, uint64_t *min_blk, uint64_t *blk_gran);
 int camdd_probe_pass_scsi(struct cam_device *cam_dev, union ccb *ccb,
          camdd_argmask arglist, int probe_retry_count,
          int probe_timeout, uint64_t *maxsector, uint32_t *block_len);
 struct camdd_dev *camdd_probe_file(int fd, struct camdd_io_opts *io_opts,
 				   int retry_count, int timeout);
 struct camdd_dev *camdd_probe_pass(struct cam_device *cam_dev,
 				   struct camdd_io_opts *io_opts,
 				   camdd_argmask arglist, int probe_retry_count,
 				   int probe_timeout, int io_retry_count,
 				   int io_timeout);
 void *camdd_file_worker(void *arg);
 camdd_buf_status camdd_ccb_status(union ccb *ccb, int protocol);
 int camdd_get_cgd(struct cam_device *device, struct ccb_getdev *cgd);
 int camdd_queue_peer_buf(struct camdd_dev *dev, struct camdd_buf *buf);
 int camdd_complete_peer_buf(struct camdd_dev *dev, struct camdd_buf *peer_buf);
 void camdd_peer_done(struct camdd_buf *buf);
 void camdd_complete_buf(struct camdd_dev *dev, struct camdd_buf *buf,
 			int *error_count);
 int camdd_pass_fetch(struct camdd_dev *dev);
 int camdd_file_run(struct camdd_dev *dev);
 int camdd_pass_run(struct camdd_dev *dev);
 int camdd_get_next_lba_len(struct camdd_dev *dev, uint64_t *lba, ssize_t *len);
 int camdd_queue(struct camdd_dev *dev, struct camdd_buf *read_buf);
 void camdd_get_depth(struct camdd_dev *dev, uint32_t *our_depth,
 		     uint32_t *peer_depth, uint32_t *our_bytes,
 		     uint32_t *peer_bytes);
 void *camdd_worker(void *arg);
 void camdd_sig_handler(int sig);
 void camdd_print_status(struct camdd_dev *camdd_dev,
 			struct camdd_dev *other_dev,
 			struct timespec *start_time);
 int camdd_rw(struct camdd_io_opts *io_opts, int num_io_opts,
 	     uint64_t max_io, int retry_count, int timeout);
 int camdd_parse_io_opts(char *args, int is_write,
 			struct camdd_io_opts *io_opts);
 void usage(void);
 
 /*
  * Parse out a bus, or a bus, target and lun in the following
  * format:
  * bus
  * bus:target
  * bus:target:lun
  *
  * Leading whitespace in tstr is skipped.  tstr is modified in place, since
  * it is tokenized with strtok(3).  For every component found, the value is
  * stored through the corresponding pointer and the matching CAMDD_ARG_*
  * bit is set in *arglst.  Components are converted with strtol(3) using
  * base 0; no range or syntax validation is performed.
  *
  * Returns the number of parsed components, or 0.
  */
 static int
 parse_btl(char *tstr, int *bus, int *target, int *lun, camdd_argmask *arglst)
 {
 	char *tmpstr;
 	int convs = 0;
 
 	/*
 	 * The <ctype.h> functions are only defined for arguments that are
 	 * representable as unsigned char (or EOF), so cast to avoid
 	 * undefined behavior where plain char is signed.  isspace('\0') is
 	 * false, so the loop stops at the terminator by itself.
 	 */
 	while (isspace((unsigned char)*tstr))
 		tstr++;
 
 	tmpstr = strtok(tstr, ":");
 	if ((tmpstr == NULL) || (*tmpstr == '\0'))
 		return (convs);
 	*bus = strtol(tmpstr, NULL, 0);
 	*arglst |= CAMDD_ARG_BUS;
 	convs++;
 
 	tmpstr = strtok(NULL, ":");
 	if ((tmpstr == NULL) || (*tmpstr == '\0'))
 		return (convs);
 	*target = strtol(tmpstr, NULL, 0);
 	*arglst |= CAMDD_ARG_TARGET;
 	convs++;
 
 	tmpstr = strtok(NULL, ":");
 	if ((tmpstr == NULL) || (*tmpstr == '\0'))
 		return (convs);
 	*lun = strtol(tmpstr, NULL, 0);
 	*arglst |= CAMDD_ARG_LUN;
 	convs++;
 
 	return (convs);
 }
 
 /*
  * Release a device structure along with the type-specific resources it
  * holds: the descriptor and temporary buffer of a file device, or the
  * CAM device handle of a pass(4) device.  A NULL dev is ignored.
  *
  * XXX KDM clean up and free all of the buffers on the queue!
  */
 void
 camdd_free_dev(struct camdd_dev *dev)
 {
 	if (dev == NULL)
 		return;
 
 	if (dev->dev_type == CAMDD_DEV_FILE) {
 		struct camdd_dev_file *fdev = &dev->dev_spec.file;
 
 		if (fdev->fd != -1)
 			close(fdev->fd);
 		free(fdev->tmp_buf);
 	} else if (dev->dev_type == CAMDD_DEV_PASS) {
 		struct cam_device *cam_dev = dev->dev_spec.pass.dev;
 
 		if (cam_dev != NULL)
 			cam_close_device(cam_dev);
 	}
 
 	free(dev);
 }
 
 /*
  * Allocate and initialize a device structure.
  *
  * dev_type	- type of device being created.
  * new_ke	- optional array of caller-supplied kevents to register on
  *		  the device's kqueue; only read when num_ke > 0.
  * num_ke	- number of entries in new_ke.
  * retry_count	- stored as the device's retry count.
  * timeout	- stored as the device's I/O timeout.
  *
  * In addition to the caller's events, user events keyed on the work and
  * peer done queues and signal events for SIGINFO and SIGINT are
  * registered on the new kqueue.
  *
  * Returns the new device, or NULL on failure after printing a warning.
  * On failure everything acquired here (event list, kqueue descriptor,
  * mutex, condition variable, device memory) is released again.
  */
 struct camdd_dev *
 camdd_alloc_dev(camdd_dev_type dev_type, struct kevent *new_ke, int num_ke,
 		int retry_count, int timeout)
 {
 	struct camdd_dev *dev = NULL;
 	struct kevent *ke = NULL;
 	size_t ke_size;
 	int mutex_valid = 0, cond_valid = 0;
 	int retval = 0;
 
 	dev = calloc(1, sizeof(*dev));
 	if (dev == NULL) {
 		warn("%s: unable to malloc %zu bytes", __func__, sizeof(*dev));
 		goto bailout;
 	}
 
 	dev->dev_type = dev_type;
 	dev->io_timeout = timeout;
 	dev->retry_count = retry_count;
 	/* Mark the kqueue as not yet open so that cleanup can tell. */
 	dev->kq = -1;
 	STAILQ_INIT(&dev->free_queue);
 	STAILQ_INIT(&dev->free_indirect_queue);
 	STAILQ_INIT(&dev->active_queue);
 	STAILQ_INIT(&dev->pending_queue);
 	STAILQ_INIT(&dev->run_queue);
 	STAILQ_INIT(&dev->reorder_queue);
 	STAILQ_INIT(&dev->work_queue);
 	STAILQ_INIT(&dev->peer_done_queue);
 	STAILQ_INIT(&dev->peer_work_queue);
 	retval = pthread_mutex_init(&dev->mutex, NULL);
 	if (retval != 0) {
 		warnc(retval, "%s: failed to initialize mutex", __func__);
 		goto bailout;
 	}
 	mutex_valid = 1;
 
 	retval = pthread_cond_init(&dev->cond, NULL);
 	if (retval != 0) {
 		warnc(retval, "%s: failed to initialize condition variable",
 		      __func__);
 		goto bailout;
 	}
 	cond_valid = 1;
 
 	dev->kq = kqueue();
 	if (dev->kq == -1) {
 		warn("%s: Unable to create kqueue", __func__);
 		goto bailout;
 	}
 
 	/* Room for the caller's events plus the four added below. */
 	ke_size = sizeof(struct kevent) * (num_ke + 4);
 	ke = calloc(1, ke_size);
 	if (ke == NULL) {
 		warn("%s: unable to malloc %zu bytes", __func__, ke_size);
 		goto bailout;
 	}
 	if (num_ke > 0)
 		bcopy(new_ke, ke, num_ke * sizeof(struct kevent));
 
 	EV_SET(&ke[num_ke++], (uintptr_t)&dev->work_queue, EVFILT_USER,
 	       EV_ADD|EV_ENABLE|EV_CLEAR, 0,0, 0);
 	EV_SET(&ke[num_ke++], (uintptr_t)&dev->peer_done_queue, EVFILT_USER,
 	       EV_ADD|EV_ENABLE|EV_CLEAR, 0,0, 0);
 	EV_SET(&ke[num_ke++], SIGINFO, EVFILT_SIGNAL, EV_ADD|EV_ENABLE, 0,0,0);
 	EV_SET(&ke[num_ke++], SIGINT, EVFILT_SIGNAL, EV_ADD|EV_ENABLE, 0,0,0);
 
 	retval = kevent(dev->kq, ke, num_ke, NULL, 0, NULL);
 	if (retval == -1) {
 		warn("%s: Unable to register kevents", __func__);
 		goto bailout;
 	}
 
 	/*
 	 * The change list is only needed for the duration of the kevent()
 	 * call, so release it now instead of leaking it.
 	 */
 	free(ke);
 
 	return (dev);
 
 bailout:
 	/* Undo whatever was set up before the failure. */
 	free(ke);
 	if (dev != NULL) {
 		if (dev->kq != -1)
 			close(dev->kq);
 		if (cond_valid != 0)
 			pthread_cond_destroy(&dev->cond);
 		if (mutex_valid != 0)
 			pthread_mutex_destroy(&dev->mutex);
 		free(dev);
 	}
 
 	return (NULL);
 }
 
 /*
  * Allocate a fresh buffer of the given type for dev.  Data buffers also
  * get a payload area of dev->blocksize bytes; other buffer types carry no
  * storage of their own.
  *
  * Returns the new buffer, or NULL (after a warning) if memory could not
  * be allocated.
  */
 static struct camdd_buf *
 camdd_alloc_buf(struct camdd_dev *dev, camdd_buf_type buf_type)
 {
 	struct camdd_buf *new_buf;
 	uint8_t *payload = NULL;
 
 	/* Only data buffers need backing storage. */
 	if (buf_type == CAMDD_BUF_DATA) {
 		payload = malloc(dev->blocksize);
 		if (payload == NULL) {
 			warn("unable to allocate %u bytes", dev->blocksize);
 			return (NULL);
 		}
 	}
 
 	new_buf = calloc(1, sizeof(*new_buf));
 	if (new_buf == NULL) {
 		warn("unable to allocate %zu bytes", sizeof(*new_buf));
 		free(payload);
 		return (NULL);
 	}
 
 	new_buf->buf_type = buf_type;
 	new_buf->dev = dev;
 	if (buf_type == CAMDD_BUF_DATA) {
 		new_buf->buf_type_spec.data.alloc_len = dev->blocksize;
 		new_buf->buf_type_spec.data.buf = payload;
 	}
 	STAILQ_INIT(&new_buf->src_list);
 
 	return (new_buf);
 }
 
 /*
  * Return a buffer to the appropriate free queue of its parent device.
  * For data buffers any scatter/gather list (and the extra padding buffer
  * hanging off its last entry, if one was added) is freed first.  An
  * unknown buffer type is fatal.
  */
 void
 camdd_release_buf(struct camdd_buf *buf)
 {
 	struct camdd_dev *owner = buf->dev;
 
 	if (buf->buf_type == CAMDD_BUF_DATA) {
 		struct camdd_buf_data *d = &buf->buf_type_spec.data;
 
 		if (d->segs != NULL) {
 			if (d->extra_buf != 0) {
 				/* The padding buffer is the last segment. */
 				free((void *)d->segs[d->sg_count - 1].ds_addr);
 				d->extra_buf = 0;
 			}
 			free(d->segs);
 			d->segs = NULL;
 			d->sg_count = 0;
 		} else if (d->iovec != NULL) {
 			if (d->extra_buf != 0) {
 				/* The padding buffer is the last iovec. */
 				free(d->iovec[d->sg_count - 1].iov_base);
 				d->extra_buf = 0;
 			}
 			free(d->iovec);
 			d->iovec = NULL;
 			d->sg_count = 0;
 		}
 		STAILQ_INSERT_TAIL(&owner->free_queue, buf, links);
 	} else if (buf->buf_type == CAMDD_BUF_INDIRECT) {
 		STAILQ_INSERT_TAIL(&owner->free_indirect_queue, buf, links);
 	} else {
 		err(1, "%s: Invalid buffer type %d for released buffer",
 		    __func__, buf->buf_type);
 	}
 }
 
 /*
  * Get a buffer of the requested type for dev.  A buffer from the matching
  * free queue is reused when one is available: it is zeroed (a data buffer
  * keeps its payload pointer and allocation length) and re-initialized.
  * Otherwise a new buffer is allocated with camdd_alloc_buf().
  *
  * Returns the buffer, or NULL if a new one could not be allocated.
  */
 struct camdd_buf *
 camdd_get_buf(struct camdd_dev *dev, camdd_buf_type buf_type)
 {
 	struct camdd_buf *recycled = NULL;
 
 	if (buf_type == CAMDD_BUF_DATA) {
 		recycled = STAILQ_FIRST(&dev->free_queue);
 		if (recycled != NULL) {
 			uint8_t *saved_ptr;
 			uint32_t saved_len;
 
 			STAILQ_REMOVE_HEAD(&dev->free_queue, links);
 			/* Preserve the payload across the wipe. */
 			saved_ptr = recycled->buf_type_spec.data.buf;
 			saved_len = recycled->buf_type_spec.data.alloc_len;
 			bzero(recycled, sizeof(*recycled));
 			recycled->buf_type_spec.data.buf = saved_ptr;
 			recycled->buf_type_spec.data.alloc_len = saved_len;
 		}
 	} else if (buf_type == CAMDD_BUF_INDIRECT) {
 		recycled = STAILQ_FIRST(&dev->free_indirect_queue);
 		if (recycled != NULL) {
 			STAILQ_REMOVE_HEAD(&dev->free_indirect_queue, links);
 			bzero(recycled, sizeof(*recycled));
 		}
 	} else {
 		warnx("Unknown buffer type %d requested", buf_type);
 	}
 
 	/* Nothing to reuse: fall back to a fresh allocation. */
 	if (recycled == NULL)
 		return (camdd_alloc_buf(dev, buf_type));
 
 	STAILQ_INIT(&recycled->src_list);
 	recycled->dev = dev;
 	recycled->buf_type = buf_type;
 
 	return (recycled);
 }
 
 /*
  * Build the scatter/gather list for a data buffer from the child buffers
  * chained on its src_list.
  *
  * buf			- data buffer to build the list for; src_list and
  *			  src_count describe its children.
  * iovec		- if 0, a bus_dma_segment_t list is built in
  *			  data->segs; otherwise a struct iovec list is
  *			  built in data->iovec.
  * sector_size		- sector size of the target; must be non-zero.
  * num_sectors_used	- on success, set to the padded transfer length
  *			  divided by sector_size.  Not written on failure.
  * double_buf_needed	- set to 1 if any child's length is not a multiple
  *			  of sector_size.  Never cleared here.
  *
  * If the fill length is not a multiple of the sector size, a zeroed
  * padding buffer is appended as the final list entry and data->extra_buf
  * is set so that camdd_release_buf() frees it.
  *
  * Returns 0 on success and 1 on failure.
  */
 int
 camdd_buf_sg_create(struct camdd_buf *buf, int iovec, uint32_t sector_size,
 		    uint32_t *num_sectors_used, int *double_buf_needed)
 {
 	struct camdd_buf *tmp_buf;
 	struct camdd_buf_data *data;
 	uint8_t *extra_buf = NULL;
 	size_t extra_buf_len = 0;
 	int extra_buf_attached = 0;
 	int i, retval = 0;
 
 	data = &buf->buf_type_spec.data;
 
 	data->sg_count = buf->src_count;
 	/*
 	 * Compose a scatter/gather list from all of the buffers in the list.
 	 * If the length of the buffer isn't a multiple of the sector size,
 	 * we'll have to add an extra buffer.  This should only happen
 	 * at the end of a transfer.
 	 */
 	if ((data->fill_len % sector_size) != 0) {
 		extra_buf_len = sector_size - (data->fill_len % sector_size);
 		extra_buf = calloc(extra_buf_len, 1);
 		if (extra_buf == NULL) {
 			warn("%s: unable to allocate %zu bytes for extra "
 			    "buffer space", __func__, extra_buf_len);
 			retval = 1;
 			goto bailout;
 		}
 		data->extra_buf = 1;
 		data->sg_count++;
 	}
 	if (iovec == 0) {
 		data->segs = calloc(data->sg_count, sizeof(bus_dma_segment_t));
 		if (data->segs == NULL) {
 			warn("%s: unable to allocate %zu bytes for S/G list",
 			    __func__, sizeof(bus_dma_segment_t) *
 			    data->sg_count);
 			retval = 1;
 			goto bailout;
 		}
 
 	} else {
 		data->iovec = calloc(data->sg_count, sizeof(struct iovec));
 		if (data->iovec == NULL) {
 			warn("%s: unable to allocate %zu bytes for S/G list",
 			    __func__, sizeof(struct iovec) * data->sg_count);
 			retval = 1;
 			goto bailout;
 		}
 	}
 
 	/* One list entry per child buffer, in list order. */
 	for (i = 0, tmp_buf = STAILQ_FIRST(&buf->src_list);
 	     i < buf->src_count && tmp_buf != NULL; i++,
 	     tmp_buf = STAILQ_NEXT(tmp_buf, src_links)) {
 
 		if (tmp_buf->buf_type == CAMDD_BUF_DATA) {
 			struct camdd_buf_data *tmp_data;
 
 			tmp_data = &tmp_buf->buf_type_spec.data;
 			if (iovec == 0) {
 				data->segs[i].ds_addr =
 				    (bus_addr_t) tmp_data->buf;
 				data->segs[i].ds_len = tmp_data->fill_len -
 				    tmp_data->resid;
 			} else {
 				data->iovec[i].iov_base = tmp_data->buf;
 				data->iovec[i].iov_len = tmp_data->fill_len -
 				    tmp_data->resid;
 			}
 			if (((tmp_data->fill_len - tmp_data->resid) %
 			     sector_size) != 0)
 				*double_buf_needed = 1;
 		} else {
 			struct camdd_buf_indirect *tmp_ind;
 
 			tmp_ind = &tmp_buf->buf_type_spec.indirect;
 			if (iovec == 0) {
 				data->segs[i].ds_addr =
 				    (bus_addr_t)tmp_ind->start_ptr;
 				data->segs[i].ds_len = tmp_ind->len;
 			} else {
 				data->iovec[i].iov_base = tmp_ind->start_ptr;
 				data->iovec[i].iov_len = tmp_ind->len;
 			}
 			if ((tmp_ind->len % sector_size) != 0)
 				*double_buf_needed = 1;
 		}
 	}
 
 	/* The padding buffer, if any, goes last. */
 	if (extra_buf != NULL) {
 		if (iovec == 0) {
 			data->segs[i].ds_addr = (bus_addr_t)extra_buf;
 			data->segs[i].ds_len = extra_buf_len;
 		} else {
 			data->iovec[i].iov_base = extra_buf;
 			data->iovec[i].iov_len = extra_buf_len;
 		}
 		extra_buf_attached = 1;
 		i++;
 	}
 	if ((tmp_buf != NULL) || (i != data->sg_count)) {
 		warnx("buffer source count does not match "
 		      "number of buffers in list!");
 		retval = 1;
 		goto bailout;
 	}
 
 bailout:
 	if (retval == 0) {
 		*num_sectors_used = (data->fill_len + extra_buf_len) /
 		    sector_size;
 	} else if ((extra_buf != NULL) && (extra_buf_attached == 0)) {
 		/*
 		 * If extra_buf isn't attached yet, we need to free it
 		 * to avoid leaking.  Only undo the extra_buf / sg_count
 		 * accounting when the padding buffer was actually
 		 * allocated; otherwise sg_count was never incremented and
 		 * must not be decremented.
 		 */
 		free(extra_buf);
 		data->extra_buf = 0;
 		data->sg_count--;
 	}
 	return (retval);
 }
 
 /*
  * Return the length of the data described by buf: the fill length for a
  * data buffer, or the indirect length for any other buffer type.
  */
 uint32_t
 camdd_buf_get_len(struct camdd_buf *buf)
 {
 	uint32_t nbytes;
 
 	if (buf->buf_type == CAMDD_BUF_DATA)
 		nbytes = buf->buf_type_spec.data.fill_len;
 	else
 		nbytes = buf->buf_type_spec.indirect.len;
 
 	return (nbytes);
 }
 
 /*
  * Append child_buf to the source list of the data buffer buf and account
  * for it: the source count is bumped and the child's length is added to
  * buf's fill length.  buf must be a data buffer.
  */
 void
 camdd_buf_add_child(struct camdd_buf *buf, struct camdd_buf *child_buf)
 {
 	assert(buf->buf_type == CAMDD_BUF_DATA);
 
 	STAILQ_INSERT_TAIL(&buf->src_list, child_buf, src_links);
 	buf->src_count += 1;
 
 	buf->buf_type_spec.data.fill_len += camdd_buf_get_len(child_buf);
 }
 
 /*
  * Indices into req_status_items[].  The order here must match the order
  * of the entries in that table.
  */
 typedef enum {
 	CAMDD_TS_MAX_BLK,
 	CAMDD_TS_MIN_BLK,
 	CAMDD_TS_BLK_GRAN,
 	CAMDD_TS_EFF_IOSIZE
 } camdd_status_item_index;
 
 /*
  * Tape status entries that camdd_probe_tape() looks up by name.  The
  * entry pointers are filled in by that function; the table is indexed
  * by camdd_status_item_index.
  */
 static struct camdd_status_items {
 	const char *name;
 	struct mt_status_entry *entry;
 } req_status_items[] = {
 	{ "max_blk", NULL },
 	{ "min_blk", NULL },
 	{ "blk_gran", NULL },
 	{ "max_effective_iosize", NULL }
 };
 
 int
 camdd_probe_tape(int fd, char *filename, uint64_t *max_iosize,
 		 uint64_t *max_blk, uint64_t *min_blk, uint64_t *blk_gran)
 {
 	struct mt_status_data status_data;
 	char *xml_str = NULL;
 	unsigned int i;
 	int retval = 0;
 	
 	retval = mt_get_xml_str(fd, MTIOCEXTGET, &xml_str);
 	if (retval != 0)
 		err(1, "Couldn't get XML string from %s", filename);
 
 	retval = mt_get_status(xml_str, &status_data);
 	if (retval != XML_STATUS_OK) {
 		warn("couldn't get status for %s", filename);
 		retval = 1;
 		goto bailout;
 	} else
 		retval = 0;
 
 	if (status_data.error != 0) {
 		warnx("%s", status_data.error_str);
 		retval = 1;
 		goto bailout;
 	}
 
 	for (i = 0; i < nitems(req_status_items); i++) {
                 char *name;
 
 		name = __DECONST(char *, req_status_items[i].name);
 		req_status_items[i].entry = mt_status_entry_find(&status_data,
 		    name);
 		if (req_status_items[i].entry == NULL) {
 			errx(1, "Cannot find status entry %s",
 			    req_status_items[i].name);
 		}
 	}
 
 	*max_iosize = req_status_items[CAMDD_TS_EFF_IOSIZE].entry->value_unsigned;
 	*max_blk= req_status_items[CAMDD_TS_MAX_BLK].entry->value_unsigned;
 	*min_blk= req_status_items[CAMDD_TS_MIN_BLK].entry->value_unsigned;
 	*blk_gran = req_status_items[CAMDD_TS_BLK_GRAN].entry->value_unsigned;
 bailout:
 
 	free(xml_str);
 	mt_status_free(&status_data);
 
 	return (retval);
 }
 
 /*
  * Create and set up a CAMDD_DEV_FILE device for an already-open file
  * descriptor.
  *
  * fd		- open descriptor to operate on.
  * io_opts	- user options; dev_name, blocksize, queue_depth and offset
  *		  are consulted here.
  * retry_count	- passed through to camdd_alloc_dev().
  * timeout	- passed through to camdd_alloc_dev().
  *
  * The blocksize defaults to CAMDD_FILE_DEFAULT_BLOCK when none was
  * requested, and the queue depth is always CAMDD_FILE_DEFAULT_DEPTH.
  * Unless fd is stdin or stdout, the descriptor is classified with
  * fstat(2) (and FIODTYPE for character devices) and the sector size,
  * maximum sector, blocksize and seek capability are adjusted for
  * regular files, tapes, disks and memory devices.
  *
  * Returns the new device, or NULL on failure.  On failure after the
  * device has been allocated, it is released with camdd_free_dev(), which
  * also closes fd.  A number of probe failures (ioctl errors, directories,
  * unknown file types, invalid disk or tape parameters) terminate the
  * program via err()/errx() instead of returning.
  */
 struct camdd_dev *
 camdd_probe_file(int fd, struct camdd_io_opts *io_opts, int retry_count,
     int timeout)
 {
 	struct camdd_dev *dev = NULL;
 	struct camdd_dev_file *file_dev;
 	uint64_t blocksize = io_opts->blocksize;
 
 	dev = camdd_alloc_dev(CAMDD_DEV_FILE, NULL, 0, retry_count, timeout);
 	if (dev == NULL)
 		goto bailout;
 
 	file_dev = &dev->dev_spec.file;
 	file_dev->fd = fd;
 	strlcpy(file_dev->filename, io_opts->dev_name,
 	    sizeof(file_dev->filename));
 	strlcpy(dev->device_name, io_opts->dev_name, sizeof(dev->device_name));
 	if (blocksize == 0)
 		dev->blocksize = CAMDD_FILE_DEFAULT_BLOCK;
 	else
 		dev->blocksize = blocksize;
 
 	/* Any requested depth other than 0 (unset) or 1 is ignored. */
 	if ((io_opts->queue_depth != 0)
 	 && (io_opts->queue_depth != 1)) {
 		warnx("Queue depth %ju for %s ignored, only 1 outstanding "
 		    "command supported", (uintmax_t)io_opts->queue_depth,
 		    io_opts->dev_name);
 	}
 	dev->target_queue_depth = CAMDD_FILE_DEFAULT_DEPTH;
 	dev->run = camdd_file_run;
 	dev->fetch = NULL;
 
 	/*
 	 * We can effectively access files on byte boundaries.  We'll reset
 	 * this for devices like disks that can be accessed on sector
 	 * boundaries.
 	 */
 	dev->sector_size = 1;
 
 	/*
 	 * stdin and stdout are not probed; their file type stays at the
 	 * zero-initialized value and they are treated as non-seekable.
 	 */
 	if ((fd != STDIN_FILENO)
 	 && (fd != STDOUT_FILENO)) {
 		int retval;
 
 		retval = fstat(fd, &file_dev->sb);
 		if (retval != 0) {
 			warn("Cannot stat %s", dev->device_name);
 			goto bailout_error;
 		}
 		if (S_ISREG(file_dev->sb.st_mode)) {
 			file_dev->file_type = CAMDD_FILE_REG;
 		} else if (S_ISCHR(file_dev->sb.st_mode)) {
 			int type;
 
 			if (ioctl(fd, FIODTYPE, &type) == -1)
 				err(1, "FIODTYPE ioctl failed on %s",
 				    dev->device_name);
 			else {
 				/*
 				 * If none of these bits is set the file
 				 * type is left unchanged.
 				 */
 				if (type & D_TAPE)
 					file_dev->file_type = CAMDD_FILE_TAPE;
 				else if (type & D_DISK)
 					file_dev->file_type = CAMDD_FILE_DISK;
 				else if (type & D_MEM)
 					file_dev->file_type = CAMDD_FILE_MEM;
 				else if (type & D_TTY)
 					file_dev->file_type = CAMDD_FILE_TTY;
 			}
 		} else if (S_ISDIR(file_dev->sb.st_mode)) {
 			errx(1, "cannot operate on directory %s",
 			    dev->device_name);
 		} else if (S_ISFIFO(file_dev->sb.st_mode)) {
 			file_dev->file_type = CAMDD_FILE_PIPE;
 		} else
 			errx(1, "Cannot determine file type for %s",
 			    dev->device_name);
 
 		switch (file_dev->file_type) {
 		case CAMDD_FILE_REG:
 			/*
 			 * The sector size is 1 for regular files, so the
 			 * last "sector" is the last byte of the file.
 			 */
 			if (file_dev->sb.st_size != 0)
 				dev->max_sector = file_dev->sb.st_size - 1;
 			else
 				dev->max_sector = 0;
 			file_dev->file_flags |= CAMDD_FF_CAN_SEEK;
 			break;
 		case CAMDD_FILE_TAPE: {
 			uint64_t max_iosize, max_blk, min_blk, blk_gran;
 			/*
 			 * Check block limits and maximum effective iosize.
 			 * Make sure the blocksize is within the block
 			 * limits (and a multiple of the minimum blocksize)
 			 * and that the blocksize is <= maximum effective
 			 * iosize.
 			 */
 			retval = camdd_probe_tape(fd, dev->device_name,
 			    &max_iosize, &max_blk, &min_blk, &blk_gran);
 			if (retval != 0)
 				errx(1, "Unable to probe tape %s",
 				    dev->device_name);
 
 			/*
 			 * The blocksize needs to be <= the maximum
 			 * effective I/O size of the tape device.  Note
 			 * that this also takes into account the maximum
 			 * blocksize reported by READ BLOCK LIMITS.
 			 */
 			if (dev->blocksize > max_iosize) {
 				warnx("Blocksize %u too big for %s, limiting "
 				    "to %ju", dev->blocksize, dev->device_name,
 				    max_iosize);
 				dev->blocksize = max_iosize;
 			}
 
 			/*
 			 * The blocksize needs to be at least min_blk;
 			 */
 			if (dev->blocksize < min_blk) {
 				warnx("Blocksize %u too small for %s, "
 				    "increasing to %ju", dev->blocksize,
 				    dev->device_name, min_blk);
 				dev->blocksize = min_blk;
 			}
 
 			/*
 			 * And the blocksize needs to be a multiple of
 			 * the block granularity.  blk_gran is used as a
 			 * power-of-two exponent here: the blocksize is
 			 * rounded down to a multiple of (1 << blk_gran).
 			 */
 			if ((blk_gran != 0)
 			 && (dev->blocksize % (1 << blk_gran))) {
 				warnx("Blocksize %u for %s not a multiple of "
 				    "%d, adjusting to %d", dev->blocksize,
 				    dev->device_name, (1 << blk_gran),
 				    dev->blocksize & ~((1 << blk_gran) - 1));
 				dev->blocksize &= ~((1 << blk_gran) - 1);
 			}
 
 			if (dev->blocksize == 0) {
 				errx(1, "Unable to derive valid blocksize for "
 				    "%s", dev->device_name);
 			}
 
 			/*
 			 * For tape drives, set the sector size to the
 			 * blocksize so that we make sure not to write
 			 * less than the blocksize out to the drive.
 			 */
 			dev->sector_size = dev->blocksize;
 			break;
 		}
 		case CAMDD_FILE_DISK: {
 			off_t media_size;
 			unsigned int sector_size;
 
 			file_dev->file_flags |= CAMDD_FF_CAN_SEEK;
 
 			if (ioctl(fd, DIOCGSECTORSIZE, &sector_size) == -1) {
 				err(1, "DIOCGSECTORSIZE ioctl failed on %s",
 				    dev->device_name);
 			}
 
 			/* Guard the modulo and division below. */
 			if (sector_size == 0) {
 				errx(1, "DIOCGSECTORSIZE ioctl returned "
 				    "invalid sector size %u for %s",
 				    sector_size, dev->device_name);
 			}
 
 			if (ioctl(fd, DIOCGMEDIASIZE, &media_size) == -1) {
 				err(1, "DIOCGMEDIASIZE ioctl failed on %s",
 				    dev->device_name);
 			}
 
 			if (media_size == 0) {
 				errx(1, "DIOCGMEDIASIZE ioctl returned "
 				    "invalid media size %ju for %s",
 				    (uintmax_t)media_size, dev->device_name);
 			}
 
 			if (dev->blocksize % sector_size) {
 				errx(1, "%s blocksize %u not a multiple of "
 				    "sector size %u", dev->device_name,
 				    dev->blocksize, sector_size);
 			}
 
 			dev->sector_size = sector_size;
 			dev->max_sector = (media_size / sector_size) - 1;
 			break;
 		}
 		case CAMDD_FILE_MEM:
 			file_dev->file_flags |= CAMDD_FF_CAN_SEEK;
 			break;
 		default:
 			break;
 		}
 	}
 
 	/* A starting offset only makes sense if we are able to seek. */
 	if ((io_opts->offset != 0)
 	 && ((file_dev->file_flags & CAMDD_FF_CAN_SEEK) == 0)) {
 		warnx("Offset %ju specified for %s, but we cannot seek on %s",
 		    io_opts->offset, io_opts->dev_name, io_opts->dev_name);
 		goto bailout_error;
 	}
 	/*
 	 * NOTE(review): the disabled block below is the only place in this
 	 * function that would set dev->start_offset_bytes -- confirm that
 	 * the offset is applied elsewhere.
 	 */
 #if 0
 	else if ((io_opts->offset != 0)
 		&& ((io_opts->offset % dev->sector_size) != 0)) {
 		warnx("Offset %ju for %s is not a multiple of the "
 		      "sector size %u", io_opts->offset, 
 		      io_opts->dev_name, dev->sector_size);
 		goto bailout_error;
 	} else {
 		dev->start_offset_bytes = io_opts->offset;
 	}
 #endif
 
 bailout:
 	return (dev);
 
 bailout_error:
 	camdd_free_dev(dev);
 	return (NULL);
 }
 
 /*
  * Fetch the Get Device Information (XPT_GDEV_TYPE) data for a device.
  *
  * device	- open CAM device to query.
  * cgd		- filled in with the returned ccb_getdev on success.
  *
  * Returns 0 on success and -1 if a CCB could not be allocated, could not
  * be sent, or did not complete successfully.
  */
 int
 camdd_get_cgd(struct cam_device *device, struct ccb_getdev *cgd)
 {
 	union ccb *gdev_ccb;
 	int status = -1;
 
 	gdev_ccb = cam_getccb(device);
 	if (gdev_ccb == NULL) {
 		warnx("%s: couldn't allocate CCB", __func__);
 		return (-1);
 	}
 
 	CCB_CLEAR_ALL_EXCEPT_HDR(&gdev_ccb->cgd);
 	gdev_ccb->ccb_h.func_code = XPT_GDEV_TYPE;
 
 	if (cam_send_ccb(device, gdev_ccb) < 0) {
 		warn("%s: error sending Get Device Information CCB", __func__);
 		cam_error_print(device, gdev_ccb, CAM_ESF_ALL, CAM_EPF_ALL,
 		    stderr);
 	} else if ((gdev_ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
 		cam_error_print(device, gdev_ccb, CAM_ESF_ALL, CAM_EPF_ALL,
 		    stderr);
 	} else {
 		/* Hand a copy back to the caller; the CCB is freed below. */
 		bcopy(&gdev_ccb->cgd, cgd, sizeof(struct ccb_getdev));
 		status = 0;
 	}
 
 	cam_freeccb(gdev_ccb);
 
 	return (status);
 }
 
 /*
  * Determine the capacity and block length of a SCSI device through the
  * pass(4) driver.
  *
  * cam_dev		- open CAM device.
  * ccb			- caller-allocated CCB to use for the commands.
  * arglist		- if CAMDD_ARG_ERR_RECOVER is set, error recovery
  *			  is requested for the commands.
  * probe_retry_count	- retry count for each command.
  * probe_timeout	- command timeout; 0 selects a default of 5000.
  * maxsector		- set to the last LBA reported by the device.
  * block_len		- set to the block length reported by the device.
  *
  * A 10 byte READ CAPACITY is issued first.  If it reports a last block
  * of 0xffffffff, READ CAPACITY (16) is issued to get the real capacity.
  *
  * Returns 0 on success and -1 on failure.
  *
  * NOTE(review): the call in camdd_probe_pass() passes probe_retry_count
  * and arglist in the opposite order from this parameter list -- confirm
  * and fix the caller.
  */
 int
 camdd_probe_pass_scsi(struct cam_device *cam_dev, union ccb *ccb,
 		 camdd_argmask arglist, int probe_retry_count,
 		 int probe_timeout, uint64_t *maxsector, uint32_t *block_len)
 {
 	struct scsi_read_capacity_data rcap10;
 	struct scsi_read_capacity_data_long rcap16;
 	int cmd_timeout;
 
 	if (ccb == NULL) {
 		warnx("%s: error passed ccb is NULL", __func__);
 		return (-1);
 	}
 
 	cmd_timeout = probe_timeout ? probe_timeout : 5000;
 
 	CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->csio);
 
 	scsi_read_capacity(&ccb->csio,
 			   /*retries*/ probe_retry_count,
 			   /*cbfcnp*/ NULL,
 			   /*tag_action*/ MSG_SIMPLE_Q_TAG,
 			   &rcap10,
 			   SSD_FULL_SIZE,
 			   /*timeout*/ cmd_timeout);
 
 	/* Disable freezing the device queue */
 	ccb->ccb_h.flags |= CAM_DEV_QFRZDIS;
 	if (arglist & CAMDD_ARG_ERR_RECOVER)
 		ccb->ccb_h.flags |= CAM_PASS_ERR_RECOVER;
 
 	if (cam_send_ccb(cam_dev, ccb) < 0) {
 		warn("error sending READ CAPACITY command");
 		cam_error_print(cam_dev, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr);
 		return (-1);
 	}
 	if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
 		cam_error_print(cam_dev, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr);
 		return (-1);
 	}
 
 	*maxsector = scsi_4btoul(rcap10.addr);
 	*block_len = scsi_4btoul(rcap10.length);
 
 	/*
 	 * A last block of 2^32-1 means that the true capacity is over 2TB,
 	 * and we need to issue the long READ CAPACITY to get the real
 	 * capacity.  Otherwise, we're all set.
 	 */
 	if (*maxsector != 0xffffffff)
 		return (0);
 
 	scsi_read_capacity_16(&ccb->csio,
 			      /*retries*/ probe_retry_count,
 			      /*cbfcnp*/ NULL,
 			      /*tag_action*/ MSG_SIMPLE_Q_TAG,
 			      /*lba*/ 0,
 			      /*reladdr*/ 0,
 			      /*pmi*/ 0,
 			      (uint8_t *)&rcap16,
 			      sizeof(rcap16),
 			      /*sense_len*/ SSD_FULL_SIZE,
 			      /*timeout*/ cmd_timeout);
 
 	/* Disable freezing the device queue */
 	ccb->ccb_h.flags |= CAM_DEV_QFRZDIS;
 	if (arglist & CAMDD_ARG_ERR_RECOVER)
 		ccb->ccb_h.flags |= CAM_PASS_ERR_RECOVER;
 
 	if (cam_send_ccb(cam_dev, ccb) < 0) {
 		warn("error sending READ CAPACITY (16) command");
 		cam_error_print(cam_dev, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr);
 		return (-1);
 	}
 	if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
 		cam_error_print(cam_dev, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr);
 		return (-1);
 	}
 
 	*maxsector = scsi_8btou64(rcap16.addr);
 	*block_len = scsi_4btoul(rcap16.length);
 
 	return (0);
 }
 
 /*
  * Need to implement this.  Do a basic probe:
  * - Check the inquiry data, make sure we're talking to a device that we
  *   can reasonably expect to talk to -- direct, RBC, CD, WORM.
  * - Send a test unit ready, make sure the device is available.
  * - Get the capacity and block size.
  */
 struct camdd_dev *
 camdd_probe_pass(struct cam_device *cam_dev, struct camdd_io_opts *io_opts,
 		 camdd_argmask arglist, int probe_retry_count,
 		 int probe_timeout, int io_retry_count, int io_timeout)
 {
 	union ccb *ccb;
 	uint64_t maxsector = 0;
 	uint32_t cpi_maxio, max_iosize, pass_numblocks;
 	uint32_t block_len = 0;
 	struct camdd_dev *dev = NULL;
 	struct camdd_dev_pass *pass_dev;
 	struct kevent ke;
 	struct ccb_getdev cgd;
 	int retval;
 	int scsi_dev_type;
 
 	if ((retval = camdd_get_cgd(cam_dev, &cgd)) != 0) {
 		warnx("%s: error retrieving CGD", __func__);
 		return NULL;
 	}
 
 	ccb = cam_getccb(cam_dev);
 
 	if (ccb == NULL) {
 		warnx("%s: error allocating ccb", __func__);
 		goto bailout;
 	}
 
 	switch (cgd.protocol) {
 	case PROTO_SCSI:
 		scsi_dev_type = SID_TYPE(&cam_dev->inq_data);
 
 		/*
 		 * For devices that support READ CAPACITY, we'll attempt to get the
 		 * capacity.  Otherwise, we really don't support tape or other
 		 * devices via SCSI passthrough, so just return an error in that case.
 		 */
 		switch (scsi_dev_type) {
 		case T_DIRECT:
 		case T_WORM:
 		case T_CDROM:
 		case T_OPTICAL:
 		case T_RBC:
 		case T_ZBC_HM:
 			break;
 		default:
 			errx(1, "Unsupported SCSI device type %d", scsi_dev_type);
 			break; /*NOTREACHED*/
 		}
 
 		if ((retval = camdd_probe_pass_scsi(cam_dev, ccb, probe_retry_count,
 						arglist, probe_timeout, &maxsector,
 						&block_len))) {
 			goto bailout;
 		}
 		break;
 	default:
 		errx(1, "Unsupported PROTO type %d", cgd.protocol);
 		break; /*NOTREACHED*/
 	}
 
 	if (block_len == 0) {
 		warnx("Sector size for %s%u is 0, cannot continue",
 		    cam_dev->device_name, cam_dev->dev_unit_num);
 		goto bailout_error;
 	}
 
 	CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->cpi);
 
 	ccb->ccb_h.func_code = XPT_PATH_INQ;
 	ccb->ccb_h.flags = CAM_DIR_NONE;
 	ccb->ccb_h.retry_count = 1;
 	
 	if (cam_send_ccb(cam_dev, ccb) < 0) {
 		warn("error sending XPT_PATH_INQ CCB");
 
 		cam_error_print(cam_dev, ccb, CAM_ESF_ALL,
 				CAM_EPF_ALL, stderr);
 		goto bailout;
 	}
 
 	EV_SET(&ke, cam_dev->fd, EVFILT_READ, EV_ADD|EV_ENABLE, 0, 0, 0);
 
 	dev = camdd_alloc_dev(CAMDD_DEV_PASS, &ke, 1, io_retry_count,
 			      io_timeout);
 	if (dev == NULL)
 		goto bailout;
 
 	pass_dev = &dev->dev_spec.pass;
 	pass_dev->scsi_dev_type = scsi_dev_type;
 	pass_dev->protocol = cgd.protocol;
 	pass_dev->dev = cam_dev;
 	pass_dev->max_sector = maxsector;
 	pass_dev->block_len = block_len;
 	pass_dev->cpi_maxio = ccb->cpi.maxio;
 	snprintf(dev->device_name, sizeof(dev->device_name), "%s%u",
 		 pass_dev->dev->device_name, pass_dev->dev->dev_unit_num);
 	dev->sector_size = block_len;
 	dev->max_sector = maxsector;
 	
 
 	/*
 	 * Determine the optimal blocksize to use for this device.
 	 */
 
 	/*
 	 * If the controller has not specified a maximum I/O size,
 	 * just go with 128K as a somewhat conservative value.
 	 */
 	if (pass_dev->cpi_maxio == 0)
 		cpi_maxio = 131072;
 	else
 		cpi_maxio = pass_dev->cpi_maxio;
 
 	/*
 	 * If the controller has a large maximum I/O size, limit it
 	 * to something smaller so that the kernel doesn't have trouble
 	 * allocating buffers to copy data in and out for us.
 	 * XXX KDM this is until we have unmapped I/O support in the kernel.
 	 */
 	max_iosize = min(cpi_maxio, CAMDD_PASS_MAX_BLOCK);
 
 	/*
 	 * If we weren't able to get a block size for some reason,
 	 * default to 512 bytes.
 	 */
 	block_len = pass_dev->block_len;
 	if (block_len == 0)
 		block_len = 512;
 
 	/*
 	 * Figure out how many blocksize chunks will fit in the
 	 * maximum I/O size.
 	 */
 	pass_numblocks = max_iosize / block_len;
 
 	/*
 	 * And finally, multiply the number of blocks by the LBA
 	 * length to get our maximum block size.
 	 */
 	dev->blocksize = pass_numblocks * block_len;
 
 	if (io_opts->blocksize != 0) {
 		if ((io_opts->blocksize % dev->sector_size) != 0) {
 			warnx("Blocksize %ju for %s is not a multiple of "
 			      "sector size %u", (uintmax_t)io_opts->blocksize, 
 			      dev->device_name, dev->sector_size);
 			goto bailout_error;
 		}
 		dev->blocksize = io_opts->blocksize;
 	}
 	dev->target_queue_depth = CAMDD_PASS_DEFAULT_DEPTH;
 	if (io_opts->queue_depth != 0)
 		dev->target_queue_depth = io_opts->queue_depth;
 
 	if (io_opts->offset != 0) {
 		if (io_opts->offset > (dev->max_sector * dev->sector_size)) {
 			warnx("Offset %ju is past the end of device %s",
 			    io_opts->offset, dev->device_name);
 			goto bailout_error;
 		}
 #if 0
 		else if ((io_opts->offset % dev->sector_size) != 0) {
 			warnx("Offset %ju for %s is not a multiple of the "
 			      "sector size %u", io_opts->offset, 
 			      dev->device_name, dev->sector_size);
 			goto bailout_error;
 		}
 		dev->start_offset_bytes = io_opts->offset;
 #endif
 	}
 
 	dev->min_cmd_size = io_opts->min_cmd_size;
 
 	dev->run = camdd_pass_run;
 	dev->fetch = camdd_pass_fetch;
 
 bailout:
 	cam_freeccb(ccb);
 
 	return (dev);
 
 bailout_error:
 	cam_freeccb(ccb);
 
 	camdd_free_dev(dev);
 
 	return (NULL);
 }
 
 /*
  * Worker thread entry point for one camdd device.
  *
  * arg is the struct camdd_dev serviced by this thread.  dev->mutex is
  * taken on entry; this function itself only releases it around the
  * kevent(2) call, although the routines it calls (dev->run, dev->fetch,
  * camdd_queue) may drop and retake it.
  *
  * Each pass through the main loop:
  *  - if this is the reader (write_dev == 0), queues new I/O until the
  *    combined depth and byte targets are met,
  *  - pushes buffers from the run queue through dev->run,
  *  - checks whether the EOF conditions allow the thread to finish,
  *  - waits on the device kqueue and dispatches the returned event.
  *
  * On exit CAMDD_DEV_FLAG_ACTIVE is cleared, need_exit is set and
  * camdd_sem is posted.  Always returns NULL.
  */
 void *
 camdd_worker(void *arg)
 {
 	struct camdd_dev *dev = arg;
 	struct camdd_buf *buf;
 	struct timespec ts, *kq_ts;
 
 	/* A zero timeout makes kevent(2) poll instead of block. */
 	ts.tv_sec = 0;
 	ts.tv_nsec = 0;
 
 	pthread_mutex_lock(&dev->mutex);
 
 	dev->flags |= CAMDD_DEV_FLAG_ACTIVE;
 
 	for (;;) {
 		struct kevent ke;
 		int retval = 0;
 
 		/*
 		 * XXX KDM check the reorder queue depth?
 		 */
 		if (dev->write_dev == 0) {
 			uint32_t our_depth, peer_depth, peer_bytes, our_bytes;
 			uint32_t target_depth = dev->target_queue_depth;
 			uint32_t peer_target_depth =
 			    dev->peer_dev->target_queue_depth;
 			uint32_t peer_blocksize = dev->peer_dev->blocksize;
 
 			camdd_get_depth(dev, &our_depth, &peer_depth,
 					&our_bytes, &peer_bytes);
 
 #if 0
 			while (((our_depth < target_depth)
 			     && (peer_depth < peer_target_depth))
 			    || ((peer_bytes + our_bytes) <
 				 (peer_blocksize * 2))) {
 #endif
 			/*
 			 * Keep queueing reads while either the combined
 			 * queue depth is below the combined target, or
 			 * fewer than three peer blocks worth of bytes are
 			 * outstanding.
 			 */
 			while (((our_depth + peer_depth) <
 			        (target_depth + peer_target_depth))
 			    || ((peer_bytes + our_bytes) <
 				(peer_blocksize * 3))) {
 
 				retval = camdd_queue(dev, NULL);
 				if (retval == 1)
 					break;
 				else if (retval != 0) {
 					error_exit = 1;
 					goto bailout;
 				}
 
 				camdd_get_depth(dev, &our_depth, &peer_depth,
 						&our_bytes, &peer_bytes);
 			}
 		}
 		/*
 		 * See if we have any I/O that is ready to execute.
 		 */
 		buf = STAILQ_FIRST(&dev->run_queue);
 		if (buf != NULL) {
 			/*
 			 * Issue I/O until the target depth is reached or
 			 * the run routine reports an error (-1) or asks us
 			 * to stop (any other nonzero value).
 			 */
 			while (dev->target_queue_depth > dev->cur_active_io) {
 				retval = dev->run(dev);
 				if (retval == -1) {
 					dev->flags |= CAMDD_DEV_FLAG_EOF;
 					error_exit = 1;
 					break;
 				} else if (retval != 0) {
 					break;
 				}
 			}
 		}
 
 		/*
 		 * We've reached EOF, or our partner has reached EOF.
 		 */
 		if ((dev->flags & CAMDD_DEV_FLAG_EOF)
 		 || (dev->flags & CAMDD_DEV_FLAG_PEER_EOF)) {
 			if (dev->write_dev != 0) {
 				/*
 				 * The writer is finished once nothing is
 				 * left queued, runnable or in flight.
 				 */
 			 	if ((STAILQ_EMPTY(&dev->work_queue))
 				 && (dev->num_run_queue == 0)
 				 && (dev->cur_active_io == 0)) {
 					goto bailout;
 				}
 			} else {
 				/*
 				 * If we're the reader, and the writer
 				 * got EOF, he is already done.  If we got
 				 * the EOF, then we need to wait until
 				 * everything is flushed out for the writer.
 				 */
 				if (dev->flags & CAMDD_DEV_FLAG_PEER_EOF) {
 					goto bailout;
 				} else if ((dev->num_peer_work_queue == 0)
 					&& (dev->num_peer_done_queue == 0)
 					&& (dev->cur_active_io == 0)
 					&& (dev->num_run_queue == 0)) {
 					goto bailout;
 				}
 			}
 			/*
 			 * XXX KDM need to do something about the pending
 			 * queue and cleanup resources.
 			 */
 		} 
 
 		/*
 		 * If we're the reader with nothing in flight and less than
 		 * one peer block queued to the writer, poll the kqueue with
 		 * the zero timeout so we loop around and queue more I/O.
 		 * Otherwise block until an event arrives.
 		 */
 		if ((dev->write_dev == 0)
 		 && (dev->cur_active_io == 0)
 		 && (dev->peer_bytes_queued < dev->peer_dev->blocksize))
 			kq_ts = &ts;
 		else
 			kq_ts = NULL;
 
 		/*
 		 * Run kevent to see if there are events to process.
 		 * The mutex is dropped while we wait so that the peer
 		 * thread can queue buffers to us.
 		 */
 		pthread_mutex_unlock(&dev->mutex);
 		retval = kevent(dev->kq, NULL, 0, &ke, 1, kq_ts);
 		pthread_mutex_lock(&dev->mutex);
 		if (retval == -1) {
 			warn("%s: error returned from kevent",__func__);
 			goto bailout;
 		} else if (retval != 0) {
 			switch (ke.filter) {
 			case EVFILT_READ:
 				/*
 				 * Collect completed I/O, if this device
 				 * type has a fetch routine.
 				 */
 				if (dev->fetch != NULL) {
 					retval = dev->fetch(dev);
 					if (retval == -1) {
 						error_exit = 1;
 						goto bailout;
 					}
 				}
 				break;
 			case EVFILT_SIGNAL:
 				/*
 				 * We register for this so we don't get
 				 * an error as a result of a SIGINFO or a
 				 * SIGINT.  It will actually get handled
 				 * by the signal handler.  If we get a
 				 * SIGINT, bail out without printing an
 				 * error message.  Any other signals
 				 * will result in the error message above.
 				 */
 				if (ke.ident == SIGINT)
 					goto bailout;
 				break;
 			case EVFILT_USER:
 				retval = 0;
 				/*
 				 * Check to see if the other thread has
 				 * queued any I/O for us to do.  (In this
 				 * case we're the writer.)
 				 */
 				for (buf = STAILQ_FIRST(&dev->work_queue);
 				     buf != NULL;
 				     buf = STAILQ_FIRST(&dev->work_queue)) {
 					STAILQ_REMOVE_HEAD(&dev->work_queue,
 							   work_links);
 					retval = camdd_queue(dev, buf);
 					/*
 					 * We keep going unless we get an
 					 * actual error.  If we get EOF, we
 					 * still want to remove the buffers
 					 * from the queue and send them back
 					 * to the reader thread.
 					 */
 					if (retval == -1) {
 						error_exit = 1;
 						goto bailout;
 					} else
 						retval = 0;
 				}
 
 				/*
 				 * Next check to see if the other thread has
 				 * queued any completed buffers back to us.
 				 * (In this case we're the reader.)
 				 */
 				for (buf = STAILQ_FIRST(&dev->peer_done_queue);
 				     buf != NULL;
 				     buf = STAILQ_FIRST(&dev->peer_done_queue)){
 					STAILQ_REMOVE_HEAD(
 					    &dev->peer_done_queue, work_links);
 					dev->num_peer_done_queue--;
 					camdd_peer_done(buf);
 				}
 				break;
 			default:
 				warnx("%s: unknown kevent filter %d",
 				      __func__, ke.filter);
 				break;
 			}
 		}
 	}
 
 bailout:
 
 	dev->flags &= ~CAMDD_DEV_FLAG_ACTIVE;
 
 	/* XXX KDM cleanup resources here? */
 
 	pthread_mutex_unlock(&dev->mutex);
 
 	/* Flag that a worker has finished and post the semaphore. */
 	need_exit = 1;
 	sem_post(&camdd_sem);
 
 	return (NULL);
 }
 
 /*
  * Map the completion status of a CCB onto the local camdd_buf_status
  * codes.
  *
  * Only PROTO_SCSI is understood; any other protocol yields
  * CAMDD_STATUS_ERROR.  For SCSI:
  *  - CAM_REQ_CMP with no residual is CAMDD_STATUS_OK, with a residual
  *    smaller than the transfer length is CAMDD_STATUS_SHORT_IO, and
  *    otherwise is CAMDD_STATUS_EOF.
  *  - CAM_SCSI_STATUS_ERROR is CAMDD_STATUS_OK only for the "good" SCSI
  *    status codes listed below, and CAMDD_STATUS_ERROR for the rest.
  *  - Every other CAM status is CAMDD_STATUS_ERROR.
  */
 camdd_buf_status
 camdd_ccb_status(union ccb *ccb, int protocol)
 {
 	cam_status cam_code = ccb->ccb_h.status & CAM_STATUS_MASK;
 	camdd_buf_status result = CAMDD_STATUS_ERROR;
 
 	if (protocol != PROTO_SCSI)
 		return (result);
 
 	if (cam_code == CAM_REQ_CMP) {
 		/* Classify a completed command by how much data is missing. */
 		if (ccb->csio.resid == 0)
 			return (CAMDD_STATUS_OK);
 		if (ccb->csio.dxfer_len > ccb->csio.resid)
 			return (CAMDD_STATUS_SHORT_IO);
 		return (CAMDD_STATUS_EOF);
 	}
 
 	if (cam_code != CAM_SCSI_STATUS_ERROR)
 		return (result);
 
 	switch (ccb->csio.scsi_status) {
 	case SCSI_STATUS_OK:
 	case SCSI_STATUS_COND_MET:
 	case SCSI_STATUS_INTERMED:
 	case SCSI_STATUS_INTERMED_COND_MET:
 		result = CAMDD_STATUS_OK;
 		break;
 	default:
 		/*
 		 * Includes CMD_TERMINATED, CHECK_COND, QUEUE_FULL, BUSY
 		 * and RESERV_CONFLICT.
 		 */
 		break;
 	}
 
 	return (result);
 }
 
 /*
  * Queue a buffer to our peer's work thread for writing.
  *
  * Called by the reader with dev->mutex held.  The mutex is dropped while
  * the peer's mutex is held and is retaken before returning.
  *
  * Buffers are handed to the writer strictly in ascending byte order.  A
  * buffer that is not the next expected one is parked on the reorder
  * queue (sorted by LBA) and 0 is returned without notifying the peer.
  * When the expected buffer arrives, it and any now-contiguous buffers
  * from the reorder queue are recorded on the peer_work_queue shadow
  * list, moved to the peer's work_queue, and the peer is woken through
  * its kqueue.
  *
  * Returns 0 for success, -1 if the peer's kqueue could not be notified,
  * and 1 if the other thread is no longer active or if only a redundant
  * EOF buffer was presented (it is released rather than sent).
  */
 int
 camdd_queue_peer_buf(struct camdd_dev *dev, struct camdd_buf *buf)
 {
 	struct kevent ke;
 	STAILQ_HEAD(, camdd_buf) local_queue;
 	struct camdd_buf *buf1, *buf2;
 	struct camdd_buf_data *data = NULL;
 	uint64_t peer_bytes_queued = 0;
 	int active = 1;
 	int retval = 0;
 
 	STAILQ_INIT(&local_queue);
 
 	/*
 	 * Since we're the reader, we need to queue our I/O to the writer
 	 * in sequential order in order to make sure it gets written out
 	 * in sequential order.
 	 *
 	 * Check the next expected I/O starting offset.  If this doesn't
 	 * match, put it on the reorder queue.
 	 */
 	if ((buf->lba * dev->sector_size) != dev->next_completion_pos_bytes) {
 
 		/*
 		 * If there is nothing on the queue, there is no sorting
 		 * needed.
 		 */
 		if (STAILQ_EMPTY(&dev->reorder_queue)) {
 			STAILQ_INSERT_TAIL(&dev->reorder_queue, buf, links);
 			dev->num_reorder_queue++;
 			goto bailout;
 		}
 
 		/*
 		 * Sort in ascending order by starting LBA.  There should
 		 * be no identical LBAs.
 		 */
 		for (buf1 = STAILQ_FIRST(&dev->reorder_queue); buf1 != NULL;
 		     buf1 = buf2) {
 			buf2 = STAILQ_NEXT(buf1, links);
 			if (buf->lba < buf1->lba) {
 				/*
 				 * If we're less than the first one, then
 				 * we insert at the head of the list
 				 * because this has to be the first element
 				 * on the list.
 				 */
 				STAILQ_INSERT_HEAD(&dev->reorder_queue,
 						   buf, links);
 				dev->num_reorder_queue++;
 				break;
 			} else if (buf->lba > buf1->lba) {
 				if (buf2 == NULL) {
 					STAILQ_INSERT_TAIL(&dev->reorder_queue,
 					    buf, links);
 					dev->num_reorder_queue++;
 					break;
 				} else if (buf->lba < buf2->lba) {
 					STAILQ_INSERT_AFTER(&dev->reorder_queue,
 					    buf1, buf, links);
 					dev->num_reorder_queue++;
 					break;
 				}
 			} else {
 				/*
 				 * %ju takes a uintmax_t; cast explicitly, as
 				 * the other LBA messages in this file do.
 				 */
 				errx(1, "Found buffers with duplicate LBA %ju!",
 				     (uintmax_t)buf->lba);
 			}
 		}
 		goto bailout;
 	} else {
 
 		/*
 		 * We're the next expected I/O completion, so put ourselves
 		 * on the local queue to be sent to the writer.  We use
 		 * work_links here so that we can queue this to the
 		 * peer_work_queue before taking the buffer off of the
 		 * local_queue.
 		 */
 		dev->next_completion_pos_bytes += buf->len;
 		STAILQ_INSERT_TAIL(&local_queue, buf, work_links);
 
 		/*
 		 * Go through the reorder queue looking for more sequential
 		 * I/O and add it to the local queue.
 		 */
 		for (buf1 = STAILQ_FIRST(&dev->reorder_queue); buf1 != NULL;
 		     buf1 = STAILQ_FIRST(&dev->reorder_queue)) {
 			/*
 			 * As soon as we see an I/O that is out of sequence,
 			 * we're done.
 			 */
 			if ((buf1->lba * dev->sector_size) !=
 			     dev->next_completion_pos_bytes)
 				break;
 
 			STAILQ_REMOVE_HEAD(&dev->reorder_queue, links);
 			dev->num_reorder_queue--;
 			STAILQ_INSERT_TAIL(&local_queue, buf1, work_links);
 			dev->next_completion_pos_bytes += buf1->len;
 		}
 	}
 
 	/*
 	 * Setup the event to let the other thread know that it has work
 	 * pending.
 	 */
 	EV_SET(&ke, (uintptr_t)&dev->peer_dev->work_queue, EVFILT_USER, 0,
 	       NOTE_TRIGGER, 0, NULL);
 
 	/*
 	 * Put this on our shadow queue so that we know what we've queued
 	 * to the other thread.
 	 */
 	STAILQ_FOREACH_SAFE(buf1, &local_queue, work_links, buf2) {
 		if (buf1->buf_type != CAMDD_BUF_DATA) {
 			errx(1, "%s: should have a data buffer, not an "
 			    "indirect buffer", __func__);
 		}
 		data = &buf1->buf_type_spec.data;
 
 		/*
 		 * We only need to send one EOF to the writer, and don't
 		 * need to continue sending EOFs after that.
 		 */
 		if (buf1->status == CAMDD_STATUS_EOF) {
 			if (dev->flags & CAMDD_DEV_FLAG_EOF_SENT) {
 				STAILQ_REMOVE(&local_queue, buf1, camdd_buf,
 				    work_links);
 				camdd_release_buf(buf1);
 				retval = 1;
 				continue;
 			}
 			dev->flags |= CAMDD_DEV_FLAG_EOF_SENT;
 		}
 
 
 		STAILQ_INSERT_TAIL(&dev->peer_work_queue, buf1, links);
 		peer_bytes_queued += (data->fill_len - data->resid);
 		dev->peer_bytes_queued += (data->fill_len - data->resid);
 		dev->num_peer_work_queue++;
 	}
 
 	/*
 	 * Everything may have been dropped as a redundant EOF above, in
 	 * which case there is nothing to hand to the peer.
 	 */
 	if (STAILQ_FIRST(&local_queue) == NULL)
 		goto bailout;
 
 	/*
 	 * Drop our mutex and pick up the other thread's mutex.  We need to
 	 * do this to avoid deadlocks.
 	 */
 	pthread_mutex_unlock(&dev->mutex);
 	pthread_mutex_lock(&dev->peer_dev->mutex);
 
 	if (dev->peer_dev->flags & CAMDD_DEV_FLAG_ACTIVE) {
 		/*
 		 * Put the buffers on the other thread's incoming work queue.
 		 */
 		for (buf1 = STAILQ_FIRST(&local_queue); buf1 != NULL;
 		     buf1 = STAILQ_FIRST(&local_queue)) {
 			STAILQ_REMOVE_HEAD(&local_queue, work_links);
 			STAILQ_INSERT_TAIL(&dev->peer_dev->work_queue, buf1,
 					   work_links);
 		}
 		/*
 		 * Send an event to the other thread's kqueue to let it know
 		 * that there is something on the work queue.
 		 */
 		retval = kevent(dev->peer_dev->kq, &ke, 1, NULL, 0, NULL);
 		if (retval == -1)
 			warn("%s: unable to add peer work_queue kevent",
 			     __func__);
 		else
 			retval = 0;
 	} else
 		active = 0;
 
 	pthread_mutex_unlock(&dev->peer_dev->mutex);
 	pthread_mutex_lock(&dev->mutex);
 
 	/*
 	 * If the other side isn't active, run through the queue and
 	 * release all of the buffers, undoing the shadow queue accounting
 	 * done above.
 	 */
 	if (active == 0) {
 		for (buf1 = STAILQ_FIRST(&local_queue); buf1 != NULL;
 		     buf1 = STAILQ_FIRST(&local_queue)) {
 			STAILQ_REMOVE_HEAD(&local_queue, work_links);
 			STAILQ_REMOVE(&dev->peer_work_queue, buf1, camdd_buf,
 				      links);
 			dev->num_peer_work_queue--;
 			camdd_release_buf(buf1);
 		}
 		dev->peer_bytes_queued -= peer_bytes_queued;
 		retval = 1;
 	}
 
 bailout:
 	return (retval);
 }
 
 /*
  * Return a buffer to the reader thread when we have completed writing it.
  */
 int
 camdd_complete_peer_buf(struct camdd_dev *dev, struct camdd_buf *peer_buf)
 {
 	struct kevent ke;
 	int retval = 0;
 
 	/*
 	 * Setup the event to let the other thread know that we have
 	 * completed a buffer.
 	 */
 	EV_SET(&ke, (uintptr_t)&dev->peer_dev->peer_done_queue, EVFILT_USER, 0,
 	       NOTE_TRIGGER, 0, NULL);
 
 	/*
 	 * Drop our lock and acquire the other thread's lock before
 	 * manipulating 
 	 */
 	pthread_mutex_unlock(&dev->mutex);
 	pthread_mutex_lock(&dev->peer_dev->mutex);
 
 	/*
 	 * Put the buffer on the reader thread's peer done queue now that
 	 * we have completed it.
 	 */
 	STAILQ_INSERT_TAIL(&dev->peer_dev->peer_done_queue, peer_buf,
 			   work_links);
 	dev->peer_dev->num_peer_done_queue++;
 
 	/*
 	 * Send an event to the peer thread to let it know that we've added
 	 * something to its peer done queue.
 	 */
 	retval = kevent(dev->peer_dev->kq, &ke, 1, NULL, 0, NULL);
 	if (retval == -1)
 		warn("%s: unable to add peer_done_queue kevent", __func__);
 	else
 		retval = 0;
 
 	/*
 	 * Drop the other thread's lock and reacquire ours.
 	 */
 	pthread_mutex_unlock(&dev->peer_dev->mutex);
 	pthread_mutex_lock(&dev->mutex);
 
 	return (retval);
 }
 
 /*
  * Reclaim a buffer that the writer thread wrote out and handed back to
  * the reader thread.
  *
  * The buffer must be a data buffer; anything else is a fatal error.  It
  * is removed from the owning device's peer_work_queue, the queued
  * count and byte totals are reduced to match, and the buffer is put back
  * on the free queue.  If the buffer came back with EOF status, the owning
  * device is flagged with CAMDD_DEV_FLAG_PEER_EOF.
  */
 void
 camdd_peer_done(struct camdd_buf *buf)
 {
 	struct camdd_dev *owner = buf->dev;
 	struct camdd_buf_data *payload;
 
 	if (buf->buf_type != CAMDD_BUF_DATA) {
 		errx(1, "%s: should have a data buffer, not an "
 		    "indirect buffer", __func__);
 	}
 	payload = &buf->buf_type_spec.data;
 
 	/* Back out the shadow queue accounting for this buffer. */
 	owner->peer_bytes_queued -= (payload->fill_len - payload->resid);
 	owner->num_peer_work_queue--;
 	STAILQ_REMOVE(&owner->peer_work_queue, buf, camdd_buf, links);
 
 	/* An EOF status on the returned buffer means the peer hit EOF. */
 	if (buf->status == CAMDD_STATUS_EOF)
 		owner->flags |= CAMDD_DEV_FLAG_PEER_EOF;
 
 	STAILQ_INSERT_TAIL(&owner->free_queue, buf, links);
 }
 
 /*
  * Finish handling a buffer whose I/O has completed.
  *
  * Reader (write_dev == 0): the buffer is passed to
  * camdd_queue_peer_buf(); a nonzero return bumps *error_count.
  *
  * Writer: every entry on the buffer's src_list is detached and given
  * the buffer's status.  Data buffers go straight back to the reader.
  * Indirect buffers drop one reference on their source buffer, which is
  * returned to the reader once its reference count reaches zero, and
  * are then put on the free indirect queue.  Finally the buffer itself
  * goes back on the free queue.
  *
  * Assumes caller holds the lock for this device.
  */
 void
 camdd_complete_buf(struct camdd_dev *dev, struct camdd_buf *buf,
 		   int *error_count)
 {
 	struct camdd_buf *piece, *piece_next;
 
 	if (dev->write_dev == 0) {
 		if (camdd_queue_peer_buf(dev, buf) != 0)
 			(*error_count)++;
 		return;
 	}
 
 	STAILQ_FOREACH_SAFE(piece, &buf->src_list, src_links, piece_next) {
 		STAILQ_REMOVE(&buf->src_list, piece, camdd_buf, src_links);
 		piece->status = buf->status;
 
 		if (piece->buf_type != CAMDD_BUF_DATA) {
 			struct camdd_buf *origin;
 
 			origin = piece->buf_type_spec.indirect.src_buf;
 			origin->refcount--;
 			/*
 			 * XXX KDM we probably need to account for
 			 * exactly how many bytes we were able to
 			 * write.  Allocate the residual to the
 			 * first N buffers?  Or just track the
 			 * number of bytes written?  Right now the reader
 			 * doesn't do anything with a residual.
 			 */
 			origin->status = buf->status;
 			if (origin->refcount <= 0)
 				camdd_complete_peer_buf(dev, origin);
 			STAILQ_INSERT_TAIL(&dev->free_indirect_queue,
 					   piece, links);
 		} else {
 			camdd_complete_peer_buf(dev, piece);
 		}
 	}
 
 	STAILQ_INSERT_TAIL(&dev->free_queue, buf, links);
 }
 
 /*
  * Fetch all completed commands from the pass(4) device.
  *
  * Called with dev->mutex held.  The mutex is released around each
  * CAMIOGET ioctl and retaken to process every returned CCB; it is held
  * again when this function returns.
  *
  * Returns the number of commands received, or -1 if any of the commands
  * completed with an error (or the device protocol is not PROTO_SCSI).
  * Returns 0 if no commands are available.
  */
 int
 camdd_pass_fetch(struct camdd_dev *dev)
 {
 	struct camdd_dev_pass *pass_dev = &dev->dev_spec.pass;
 	union ccb ccb;
 	int retval = 0, num_fetched = 0, error_count = 0;
 
 	pthread_mutex_unlock(&dev->mutex);
 	/*
 	 * XXX KDM we don't distinguish between EFAULT and ENOENT.
 	 */
 	while ((retval = ioctl(pass_dev->dev->fd, CAMIOGET, &ccb)) != -1) {
 		struct camdd_buf *buf;
 		struct camdd_buf_data *data;
 		cam_status ccb_status;
 		union ccb *buf_ccb;
 
 		/*
 		 * ccb_buf is the buffer pointer that camdd_pass_run()
 		 * stored in the CCB header when it queued the command.
 		 */
 		buf = ccb.ccb_h.ccb_buf;
 		data = &buf->buf_type_spec.data;
 		buf_ccb = &data->ccb;
 
 		num_fetched++;
 
 		/*
 		 * Copy the CCB back out so we get status, sense data, etc.
 		 */
 		bcopy(&ccb, buf_ccb, sizeof(ccb));
 
 		pthread_mutex_lock(&dev->mutex);
 
 		/*
 		 * We're now done, so take this off the active queue.
 		 */
 		STAILQ_REMOVE(&dev->active_queue, buf, camdd_buf, links);
 		dev->cur_active_io--;
 
 		/* Print details for anything other than a clean completion. */
 		ccb_status = ccb.ccb_h.status & CAM_STATUS_MASK;
 		if (ccb_status != CAM_REQ_CMP) {
 			cam_error_print(pass_dev->dev, &ccb, CAM_ESF_ALL,
 					CAM_EPF_ALL, stderr);
 		}
 
 		switch (pass_dev->protocol) {
 		case PROTO_SCSI:
 			data->resid = ccb.csio.resid;
 			dev->bytes_transferred += (ccb.csio.dxfer_len - ccb.csio.resid);
 			break;
 		default:
 			/* Unsupported protocol; dev->mutex is held here. */
 			return -1;
 			break;
 		}
 
 		/*
 		 * Only derive a status from the CCB if nothing has set one
 		 * on the buffer already.
 		 */
 		if (buf->status == CAMDD_STATUS_NONE)
 			buf->status = camdd_ccb_status(&ccb, pass_dev->protocol);
 		if (buf->status == CAMDD_STATUS_ERROR)
 			error_count++;
 		else if (buf->status == CAMDD_STATUS_EOF) {
 			/*
 			 * Once we queue this buffer to our partner thread,
 			 * he will know that we've hit EOF.
 			 */
 			dev->flags |= CAMDD_DEV_FLAG_EOF;
 		}
 
 		camdd_complete_buf(dev, buf, &error_count);
 
 		/*
 		 * Unlock in preparation for the ioctl call.
 		 */
 		pthread_mutex_unlock(&dev->mutex);
 	}
 
 	/* The ioctl returned -1; retake the lock for our caller. */
 	pthread_mutex_lock(&dev->mutex);
 
 	if (error_count > 0)
 		return (-1);
 	else
 		return (num_fetched);
 }
 
 /*
  * Execute one buffer from the run queue against a file device, using
  * synchronous read/write calls (positioned variants when the file can
  * seek).
  *
  * Called with dev->mutex held.  The mutex is released for the duration
  * of the I/O system call and is held again on return.  The buffer is
  * always passed to camdd_complete_buf() before returning, except when
  * the run queue was empty.
  *
  * Returns -1 for error, 0 for success/continue, and 1 for resource
  * shortage/stop processing.
  */
 int
 camdd_file_run(struct camdd_dev *dev)
 {
 	struct camdd_dev_file *file_dev = &dev->dev_spec.file;
 	struct camdd_buf_data *data;
 	struct camdd_buf *buf;
 	off_t io_offset;
 	int retval = 0, write_dev = dev->write_dev;
 	int error_count = 0, no_resources = 0, double_buf_needed = 0;
 	uint32_t num_sectors = 0, db_len = 0;
 
 	buf = STAILQ_FIRST(&dev->run_queue);
 	if (buf == NULL) {
 		no_resources = 1;
 		goto bailout;
 	} else if ((dev->write_dev == 0)
 		&& (dev->flags & (CAMDD_DEV_FLAG_EOF |
 				  CAMDD_DEV_FLAG_EOF_SENT))) {
 		/*
 		 * The reader has already seen or sent EOF: complete this
 		 * buffer with EOF status without doing any I/O.  Note that
 		 * error_count is bumped, so this path returns -1.
 		 */
 		STAILQ_REMOVE(&dev->run_queue, buf, camdd_buf, links);
 		dev->num_run_queue--;
 		buf->status = CAMDD_STATUS_EOF;
 		error_count++;
 		goto bailout;
 	}
 
 	/*
 	 * If we're writing, we need to go through the source buffer list
 	 * and create an S/G list.
 	 */
 	if (write_dev != 0) {
 		retval = camdd_buf_sg_create(buf, /*iovec*/ 1,
 		    dev->sector_size, &num_sectors, &double_buf_needed);
 		if (retval != 0) {
 			no_resources = 1;
 			goto bailout;
 		}
 	}
 
 	STAILQ_REMOVE(&dev->run_queue, buf, camdd_buf, links);
 	dev->num_run_queue--;
 
 	data = &buf->buf_type_spec.data;
 
 	/*
 	 * pread(2) and pwrite(2) offsets are byte offsets.
 	 */
 	io_offset = buf->lba * dev->sector_size;
 
 	/*
 	 * Unlock the mutex while we read or write.
 	 */
 	pthread_mutex_unlock(&dev->mutex);
 
 	/*
 	 * Note that we don't need to double buffer if we're the reader
 	 * because in that case, we have allocated a single buffer of
 	 * sufficient size to do the read.  This copy is necessary on
 	 * writes because if one of the components of the S/G list is not
 	 * a sector size multiple, the kernel will reject the write.  This
 	 * is unfortunate but not surprising.  So this will make sure that
 	 * we're using a single buffer that is a multiple of the sector size.
 	 *
 	 * NOTE(review): tmp_buf and db_len are only filled in when
 	 * sg_count > 1, but the write paths below select the double
 	 * buffer on double_buf_needed alone -- confirm that
 	 * camdd_buf_sg_create() never requests double buffering with
 	 * sg_count <= 1.
 	 */
 	if ((double_buf_needed != 0)
 	 && (data->sg_count > 1)
 	 && (write_dev != 0)) {
 		uint32_t cur_offset;
 		int i;
 
 		/* The bounce buffer is allocated once and then reused. */
 		if (file_dev->tmp_buf == NULL)
 			file_dev->tmp_buf = calloc(dev->blocksize, 1);
 		if (file_dev->tmp_buf == NULL) {
 			buf->status = CAMDD_STATUS_ERROR;
 			error_count++;
 			pthread_mutex_lock(&dev->mutex);
 			goto bailout;
 		}
 		/* Flatten the S/G segments into the bounce buffer. */
 		for (i = 0, cur_offset = 0; i < data->sg_count; i++) {
 			bcopy(data->iovec[i].iov_base,
 			    &file_dev->tmp_buf[cur_offset],
 			    data->iovec[i].iov_len);
 			cur_offset += data->iovec[i].iov_len;
 		}
 		db_len = cur_offset;
 	}
 
 	if (file_dev->file_flags & CAMDD_FF_CAN_SEEK) {
 		/* Seekable file: use positioned I/O at io_offset. */
 		if (write_dev == 0) {
 			/*
 			 * XXX KDM is there any way we would need a S/G
 			 * list here?
 			 */
 			retval = pread(file_dev->fd, data->buf,
 			    buf->len, io_offset);
 		} else {
 			if (double_buf_needed != 0) {
 				retval = pwrite(file_dev->fd, file_dev->tmp_buf,
 				    db_len, io_offset);
 			} else if (data->sg_count == 0) {
 				retval = pwrite(file_dev->fd, data->buf,
 				    data->fill_len, io_offset);
 			} else {
 				retval = pwritev(file_dev->fd, data->iovec,
 				    data->sg_count, io_offset);
 			}
 		}
 	} else {
 		/* Non-seekable file: I/O happens at the current position. */
 		if (write_dev == 0) {
 			/*
 			 * XXX KDM is there any way we would need a S/G
 			 * list here?
 			 */
 			retval = read(file_dev->fd, data->buf, buf->len);
 		} else {
 			if (double_buf_needed != 0) {
 				retval = write(file_dev->fd, file_dev->tmp_buf,
 				    db_len);
 			} else if (data->sg_count == 0) {
 				retval = write(file_dev->fd, data->buf,
 				    data->fill_len);
 			} else {
 				retval = writev(file_dev->fd, data->iovec,
 				    data->sg_count);
 			}
 		}
 	}
 
 	/* We're done, re-acquire the lock */
 	pthread_mutex_lock(&dev->mutex);
 
 	/*
 	 * Classify the result: full transfer, error (-1), EOF (0), or a
 	 * short transfer.
 	 */
 	if (retval >= (ssize_t)data->fill_len) {
 		/*
 		 * If the bytes transferred is more than the request size,
 		 * that indicates an overrun, which should only happen at
 		 * the end of a transfer if we have to round up to a sector
 		 * boundary.
 		 */
 		if (buf->status == CAMDD_STATUS_NONE)
 			buf->status = CAMDD_STATUS_OK;
 		data->resid = 0;
 		dev->bytes_transferred += retval;
 	} else if (retval == -1) {
 		warn("Error %s %s", (write_dev) ? "writing to" :
 		    "reading from", file_dev->filename);
 
 		buf->status = CAMDD_STATUS_ERROR;
 		data->resid = data->fill_len;
 		error_count++;
 
 		/* The remaining diagnostics are only printed in debug mode. */
 		if (dev->debug == 0)
 			goto bailout;
 
 		if ((double_buf_needed != 0)
 		 && (write_dev != 0)) {
 			fprintf(stderr, "%s: fd %d, DB buf %p, len %u lba %ju "
 			    "offset %ju\n", __func__, file_dev->fd,
 			    file_dev->tmp_buf, db_len, (uintmax_t)buf->lba,
 			    (uintmax_t)io_offset);
 		} else if (data->sg_count == 0) {
 			fprintf(stderr, "%s: fd %d, buf %p, len %u, lba %ju "
 			    "offset %ju\n", __func__, file_dev->fd, data->buf,
 			    data->fill_len, (uintmax_t)buf->lba,
 			    (uintmax_t)io_offset);
 		} else {
 			int i;
 
 			fprintf(stderr, "%s: fd %d, len %u, lba %ju "
 			    "offset %ju\n", __func__, file_dev->fd, 
 			    data->fill_len, (uintmax_t)buf->lba,
 			    (uintmax_t)io_offset);
 
 			for (i = 0; i < data->sg_count; i++) {
 				fprintf(stderr, "index %d ptr %p len %zu\n",
 				    i, data->iovec[i].iov_base,
 				    data->iovec[i].iov_len);
 			}
 		}
 	} else if (retval == 0) {
 		buf->status = CAMDD_STATUS_EOF;
 		if (dev->debug != 0)
 			printf("%s: got EOF from %s!\n", __func__,
 			    file_dev->filename);
 		data->resid = data->fill_len;
 		error_count++;
 	} else if (retval < (ssize_t)data->fill_len) {
 		if (buf->status == CAMDD_STATUS_NONE)
 			buf->status = CAMDD_STATUS_SHORT_IO;
 		data->resid = data->fill_len - retval;
 		dev->bytes_transferred += retval;
 	}
 
 bailout:
 	if (buf != NULL) {
 		if (buf->status == CAMDD_STATUS_EOF) {
 			struct camdd_buf *buf2;
 			/*
 			 * Flag the device and mark everything still on the
 			 * run queue as EOF too.
 			 */
 			dev->flags |= CAMDD_DEV_FLAG_EOF;
 			STAILQ_FOREACH(buf2, &dev->run_queue, links)
 				buf2->status = CAMDD_STATUS_EOF;
 		}
 
 		camdd_complete_buf(dev, buf, &error_count);
 	}
 
 	/* An error takes precedence over a resource shortage. */
 	if (error_count != 0)
 		return (-1);
 	else if (no_resources != 0)
 		return (1);
 	else
 		return (0);
 }
 
 /*
  * Execute one command from the run queue.  Returns 0 for success, 1 for
  * stop processing, and -1 for error.
  *
  * Called with dev->mutex held; the mutex is released around the
  * CAMIOQUEUE ioctl and is held again on return.  On success the buffer
  * is moved to the active queue and cur_active_io is incremented; the
  * completion is collected later by camdd_pass_fetch().
  */
 int
 camdd_pass_run(struct camdd_dev *dev)
 {
 	struct camdd_buf *buf = NULL;
 	struct camdd_dev_pass *pass_dev = &dev->dev_spec.pass;
 	struct camdd_buf_data *data;
 	uint32_t num_blocks, sectors_used = 0;
 	union ccb *ccb;
 	int retval = 0, is_write = dev->write_dev;
 	int double_buf_needed = 0;
 
 	/* Nothing to run: tell the caller to stop. */
 	buf = STAILQ_FIRST(&dev->run_queue);
 	if (buf == NULL) {
 		retval = 1;
 		goto bailout;
 	}
 
 	/*
 	 * If we're writing, we need to go through the source buffer list
 	 * and create an S/G list.  On failure the buffer is left on the
 	 * run queue.
 	 */
 	if (is_write != 0) {
 		retval = camdd_buf_sg_create(buf, /*iovec*/ 0,dev->sector_size,
 		    &sectors_used, &double_buf_needed);
 		if (retval != 0) {
 			retval = -1;
 			goto bailout;
 		}
 	}
 
 	STAILQ_REMOVE(&dev->run_queue, buf, camdd_buf, links);
 	dev->num_run_queue--;
 
 	data = &buf->buf_type_spec.data;
 
 	/*
 	 * In almost every case the number of blocks should be the device
 	 * block size.  The exception may be at the end of an I/O stream
 	 * for a partial block or at the end of a device.
 	 */
 	if (is_write != 0)
 		num_blocks = sectors_used;
 	else
 		num_blocks = data->fill_len / pass_dev->block_len;
 
 	ccb = &data->ccb;
 
 	switch (pass_dev->protocol) {
 	case PROTO_SCSI:
 		CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->csio);
 
 		/*
 		 * When an S/G list was built, data_ptr points at the
 		 * segment array rather than at a flat data buffer.
 		 */
 		scsi_read_write(&ccb->csio,
 				/*retries*/ dev->retry_count,
 				/*cbfcnp*/ NULL,
 				/*tag_action*/ MSG_SIMPLE_Q_TAG,
 				/*readop*/ (dev->write_dev == 0) ? SCSI_RW_READ :
 					   SCSI_RW_WRITE,
 				/*byte2*/ 0,
 				/*minimum_cmd_size*/ dev->min_cmd_size,
 				/*lba*/ buf->lba,
 				/*block_count*/ num_blocks,
 				/*data_ptr*/ (data->sg_count != 0) ?
 					     (uint8_t *)data->segs : data->buf,
 				/*dxfer_len*/ (num_blocks * pass_dev->block_len),
 				/*sense_len*/ SSD_FULL_SIZE,
 				/*timeout*/ dev->io_timeout);
 
 		if (data->sg_count != 0) {
 			ccb->csio.sglist_cnt = data->sg_count;
 		}
 		break;
 	default:
 		/*
 		 * NOTE(review): buf has already been removed from the run
 		 * queue and is not placed on any other queue on this path
 		 * -- confirm whether that is intended.
 		 */
 		retval = -1;
 		goto bailout;
 	}
 
 	/* Disable freezing the device queue */
 	ccb->ccb_h.flags |= CAM_DEV_QFRZDIS;
 
 	/* Request error recovery only when retries were asked for. */
 	if (dev->retry_count != 0)
 		ccb->ccb_h.flags |= CAM_PASS_ERR_RECOVER;
 
 	if (data->sg_count != 0) {
 		ccb->ccb_h.flags |= CAM_DATA_SG;
 	}
 
 	/*
 	 * Store a pointer to the buffer in the CCB.  The kernel will
 	 * restore this when we get it back, and we'll use it to identify
 	 * the buffer this CCB came from.
 	 */
 	ccb->ccb_h.ccb_buf = buf;
 
 	/*
 	 * Unlock our mutex in preparation for issuing the ioctl.
 	 */
 	pthread_mutex_unlock(&dev->mutex);
 	/*
 	 * Queue the CCB to the pass(4) driver.
 	 */
 	if (ioctl(pass_dev->dev->fd, CAMIOQUEUE, ccb) == -1) {
 		pthread_mutex_lock(&dev->mutex);
 
 		warn("%s: error sending CAMIOQUEUE ioctl to %s%u", __func__,
 		     pass_dev->dev->device_name, pass_dev->dev->dev_unit_num);
 		warn("%s: CCB address is %p", __func__, ccb);
 		retval = -1;
 
 		/* The command never started; put the buffer on the free queue. */
 		STAILQ_INSERT_TAIL(&dev->free_queue, buf, links);
 	} else {
 		pthread_mutex_lock(&dev->mutex);
 
 		/* Track the command as in flight until it is fetched. */
 		dev->cur_active_io++;
 		STAILQ_INSERT_TAIL(&dev->active_queue, buf, links);
 	}
 
 bailout:
 	return (retval);
 }
 
 /*
  * Work out the starting LBA and byte length of the next I/O for a device.
  *
  * *lba is set from dev->next_io_pos_bytes and *len starts out as the
  * device blocksize.  If the device has a maximum sector and/or a sector
  * I/O limit, the smaller nonzero one of the two is used as the last
  * permitted sector (inclusive), and the request is clipped against it:
  *  - starting past the limit yields *len = 0,
  *  - running over the limit (or wrapping the LBA) trims *len so that the
  *    request ends at the limit.
  *
  * dev->next_io_pos_bytes is advanced by the final *len.
  *
  * Returns 0 if the full blocksize request fits, and 1 if the request was
  * clipped or emptied because the end of the device/limit was reached.
  */
 int
 camdd_get_next_lba_len(struct camdd_dev *dev, uint64_t *lba, ssize_t *len)
 {
 	uint32_t num_blocks;
 	int retval = 0;
 
 	*lba = dev->next_io_pos_bytes / dev->sector_size;
 	*len = dev->blocksize;
 	num_blocks = *len / dev->sector_size;
 
 	/*
 	 * If max_sector is 0, then we have no set limit.  This can happen
 	 * if we're writing to a file in a filesystem, or reading from
 	 * something like /dev/zero.
 	 */
 	if ((dev->max_sector != 0)
 	 || (dev->sector_io_limit != 0)) {
 		uint64_t max_sector;
 
 		/* Use whichever limits are set; the smaller one if both are. */
 		if ((dev->max_sector != 0)
 		 && (dev->sector_io_limit != 0))
 			max_sector = min(dev->sector_io_limit, dev->max_sector);
 		else if (dev->max_sector != 0)
 			max_sector = dev->max_sector;
 		else
 			max_sector = dev->sector_io_limit;
 
 
 		/*
 		 * Check to see whether we're starting off past the end of
 		 * the device.  If so, we need to just send an EOF
 		 * notification to the writer.
 		 */
 		if (*lba > max_sector) {
 			*len = 0;
 			retval = 1;
 		} else if (((*lba + num_blocks) > max_sector + 1)
 			|| ((*lba + num_blocks) < *lba)) {
 			/*
 			 * If we get here (but pass the first check), we
 			 * can trim the request length down to go to the
 			 * end of the device.  The second test above
 			 * catches wraparound of the 64-bit LBA sum.
 			 */
 			num_blocks = (max_sector + 1) - *lba;
 			*len = num_blocks * dev->sector_size;
 			retval = 1;
 		}
 	}
 
 	dev->next_io_pos_bytes += *len;
 
 	return (retval);
 }
 
 /*
  * Queue the next piece of I/O on this device.
  *
  * Reader (dev->write_dev == 0): read_buf is not used.  A data buffer is
  * obtained, given the next LBA/length from camdd_get_next_lba_len(), and
  * appended to the run queue.
  *
  * Writer (dev->write_dev != 0): read_buf is a buffer handed over by the
  * peer.  Its data is attached to the write buffer at the head of the
  * pending queue (one is created if the queue is empty).  Depending on how
  * the combined length compares with the write buffer's length, the write
  * buffer stays pending, is moved to the run queue, or read_buf is split
  * across several write buffers using indirect buffers.
  *
  * Returns 0 for success, 1 for EOF detected, and -1 for failure.
  * Note that on the write side a failed camdd_get_buf() is reported as 1,
  * not -1.
  */
 int
 camdd_queue(struct camdd_dev *dev, struct camdd_buf *read_buf)
 {
 	struct camdd_buf *buf = NULL;
 	struct camdd_buf_data *data;
 	/* pass_dev is assigned below but not otherwise used in this function. */
 	struct camdd_dev_pass *pass_dev;
 	size_t new_len;
 	struct camdd_buf_data *rb_data;
 	int is_write = dev->write_dev;
 	/* Set when the peer reported EOF/error but data still has to go out. */
 	int eof_flush_needed = 0;
 	int retval = 0;
 	/* error only receives return values; it is never examined here. */
 	int error;
 
 	pass_dev = &dev->dev_spec.pass;
 
 	/*
 	 * If we've gotten EOF or our partner has, we should not continue
 	 * queueing I/O.  If we're a writer, though, we should continue
 	 * to write any buffers that don't have EOF status.
 	 */
 	if ((dev->flags & CAMDD_DEV_FLAG_EOF)
 	 || ((dev->flags & CAMDD_DEV_FLAG_PEER_EOF)
 	  && (is_write == 0))) {
 		/*
 		 * Tell the worker thread that we have seen EOF.
 		 */
 		retval = 1;
 
 		/*
 		 * If we're the writer, send the buffer back with EOF status.
 		 */
 		if (is_write) {
 			read_buf->status = CAMDD_STATUS_EOF;
 			
 			error = camdd_complete_peer_buf(dev, read_buf);
 		}
 		goto bailout;
 	}
 
 	/*
 	 * Reader path: build a new read request and put it on the run queue.
 	 */
 	if (is_write == 0) {
 		buf = camdd_get_buf(dev, CAMDD_BUF_DATA);
 		if (buf == NULL) {
 			retval = -1;
 			goto bailout;
 		}
 		data = &buf->buf_type_spec.data;
 
 		/*
 		 * A non-zero return means the request hit the end of the
 		 * allowed range; buf->len is then either trimmed or zero.
 		 * The value is also what this function returns.
 		 */
 		retval = camdd_get_next_lba_len(dev, &buf->lba, &buf->len);
 		if (retval != 0) {
 			buf->status = CAMDD_STATUS_EOF;
 
 			/*
 			 * A zero-length EOF buffer is only queued once: if
 			 * EOF has already been sent or queued, drop this one.
 			 */
 		 	if ((buf->len == 0)
 			 && ((dev->flags & (CAMDD_DEV_FLAG_EOF_SENT |
 			     CAMDD_DEV_FLAG_EOF_QUEUED)) != 0)) {
 				camdd_release_buf(buf);
 				goto bailout;
 			}
 			dev->flags |= CAMDD_DEV_FLAG_EOF_QUEUED;
 		}
 
 		data->fill_len = buf->len;
 		data->src_start_offset = buf->lba * dev->sector_size;
 
 		/*
 		 * Put this on the run queue.
 		 */
 		STAILQ_INSERT_TAIL(&dev->run_queue, buf, links);
 		dev->num_run_queue++;
 
 		/* We're done. */
 		goto bailout;
 	}
 
 	/*
 	 * Writer path from here on.
 	 */
 
 	/*
 	 * Check for new EOF status from the reader.
 	 */
 	if ((read_buf->status == CAMDD_STATUS_EOF)
 	 || (read_buf->status == CAMDD_STATUS_ERROR)) {
 		dev->flags |= CAMDD_DEV_FLAG_PEER_EOF;
 		/*
 		 * Nothing pending and nothing in this buffer: there is
 		 * nothing to flush, so just hand the buffer back.
 		 */
 		if ((STAILQ_FIRST(&dev->pending_queue) == NULL)
 		 && (read_buf->len == 0)) {
 			camdd_complete_peer_buf(dev, read_buf);
 			retval = 1;
 			goto bailout;
 		} else
 			eof_flush_needed = 1;
 	}
 
 	/*
 	 * See if we have a buffer we're composing with pieces from our
 	 * partner thread.
 	 */
 	buf = STAILQ_FIRST(&dev->pending_queue);
 	if (buf == NULL) {
 		uint64_t lba;
 		ssize_t len;
 
 		retval = camdd_get_next_lba_len(dev, &lba, &len);
 		if (retval != 0) {
 			read_buf->status = CAMDD_STATUS_EOF;
 
 			/*
 			 * No room left on the destination at all: mark
 			 * ourselves EOF and return the peer's buffer.
 			 */
 			if (len == 0) {
 				dev->flags |= CAMDD_DEV_FLAG_EOF;
 				error = camdd_complete_peer_buf(dev, read_buf);
 				goto bailout;
 			}
 		}
 
 		/*
 		 * If we don't have a pending buffer, we need to grab a new
 		 * one from the free list or allocate another one.
 		 */
 		buf = camdd_get_buf(dev, CAMDD_BUF_DATA);
 		if (buf == NULL) {
 			retval = 1;
 			goto bailout;
 		}
 
 		buf->lba = lba;
 		buf->len = len;
 
 		STAILQ_INSERT_TAIL(&dev->pending_queue, buf, links);
 		dev->num_pending_queue++;
 	}
 
 	data = &buf->buf_type_spec.data;
 
 	rb_data = &read_buf->buf_type_spec.data;
 
 	/*
 	 * The mismatch is only reported (and only when debugging is
 	 * enabled); processing continues either way.
 	 */
 	if ((rb_data->src_start_offset != dev->next_peer_pos_bytes)
 	 && (dev->debug != 0)) {
 		printf("%s: WARNING: reader offset %#jx != expected offset "
 		    "%#jx\n", __func__, (uintmax_t)rb_data->src_start_offset,
 		    (uintmax_t)dev->next_peer_pos_bytes);
 	}
 	/* The next peer buffer is expected to start where this one's data ends. */
 	dev->next_peer_pos_bytes = rb_data->src_start_offset +
 	    (rb_data->fill_len - rb_data->resid);
 
 	/*
 	 * new_len is what the write buffer would hold after appending the
 	 * valid bytes (fill_len - resid) of read_buf.
 	 */
 	new_len = (rb_data->fill_len - rb_data->resid) + data->fill_len;
 	if (new_len < buf->len) {
 		/*
 		 * There are three cases here:
 		 * 1. We need more data to fill up a block, so we put 
 		 *    this I/O on the queue and wait for more I/O.
 		 * 2. We have a pending buffer in the queue that is
 		 *    smaller than our blocksize, but we got an EOF.  So we
 		 *    need to go ahead and flush the write out.
 		 * 3. We got an error.
 		 */
 
 		/*
 		 * Increment our fill length.
 		 */
 		data->fill_len += (rb_data->fill_len - rb_data->resid);
 
 		/*
 		 * Add the new read buffer to the list for writing.
 		 */
 		STAILQ_INSERT_TAIL(&buf->src_list, read_buf, src_links);
 
 		/* Increment the count */
 		buf->src_count++;
 
 		if (eof_flush_needed == 0) {
 			/*
 			 * We need to exit, because we don't have enough
 			 * data yet.
 			 */
 			goto bailout;
 		} else {
 			/*
 			 * Take the buffer off of the pending queue.
 			 */
 			STAILQ_REMOVE(&dev->pending_queue, buf, camdd_buf,
 				      links);
 			dev->num_pending_queue--;
 
 			/*
 			 * If we need an EOF flush, but there is no data
 			 * to flush, go ahead and return this buffer.
 			 */
 			if (data->fill_len == 0) {
 				camdd_complete_buf(dev, buf, /*error_count*/0);
 				retval = 1;
 				goto bailout;
 			}
 
 			/*
 			 * Put this on the next queue for execution.
 			 */
 			STAILQ_INSERT_TAIL(&dev->run_queue, buf, links);
 			dev->num_run_queue++;
 		}
 	} else if (new_len == buf->len) {
 		/*
 		 * We have enough data to completely fill one block,
 		 * so we're ready to issue the I/O.
 		 */
 
 		/*
 		 * Take the buffer off of the pending queue.
 		 */
 		STAILQ_REMOVE(&dev->pending_queue, buf, camdd_buf, links);
 		dev->num_pending_queue--;
 
 		/*
 		 * Add the new read buffer to the list for writing.
 		 */
 		STAILQ_INSERT_TAIL(&buf->src_list, read_buf, src_links);
 
 		/* Increment the count */
 		buf->src_count++;
 
 		/*
 		 * Increment our fill length.
 		 */
 		data->fill_len += (rb_data->fill_len - rb_data->resid);
 
 		/*
 		 * Put this on the next queue for execution.
 		 */
 		STAILQ_INSERT_TAIL(&dev->run_queue, buf, links);
 		dev->num_run_queue++;
 	} else {
 		/*
 		 * read_buf holds more than the current write buffer can
 		 * take.  Indirect buffers, each referencing a slice of
 		 * read_buf, are attached to as many write buffers as needed.
 		 */
 		struct camdd_buf *idb;
 		struct camdd_buf_indirect *indirect;
 		uint32_t len_to_go, cur_offset;
 
 		
 		idb = camdd_get_buf(dev, CAMDD_BUF_INDIRECT);
 		if (idb == NULL) {
 			retval = 1;
 			goto bailout;
 		}
 		indirect = &idb->buf_type_spec.indirect;
 		indirect->src_buf = read_buf;
 		/* Each indirect buffer holds one reference on read_buf. */
 		read_buf->refcount++;
 		indirect->offset = 0;
 		indirect->start_ptr = rb_data->buf;
 		/*
 		 * We've already established that there is more
 		 * data in read_buf than we have room for in our
 		 * current write request.  So this particular chunk
 		 * of the request should just be the remainder
 		 * needed to fill up a block.
 		 */
 		indirect->len = buf->len - (data->fill_len - data->resid);
 
 		/*
 		 * NOTE(review): data->fill_len is not updated directly in
 		 * this branch; presumably camdd_buf_add_child() accounts
 		 * for the child's length -- confirm.
 		 */
 		camdd_buf_add_child(buf, idb);
 
 		/*
 		 * This buffer is ready to execute, so we can take
 		 * it off the pending queue and put it on the run
 		 * queue.
 		 */
 		STAILQ_REMOVE(&dev->pending_queue, buf, camdd_buf,
 			      links);
 		dev->num_pending_queue--;
 		STAILQ_INSERT_TAIL(&dev->run_queue, buf, links);
 		dev->num_run_queue++;
 
 		/* Offset into read_buf of the first byte not yet assigned. */
 		cur_offset = indirect->offset + indirect->len;
 
 		/*
 		 * The resulting I/O would be too large to fit in
 		 * one block.  We need to split this I/O into
 		 * multiple pieces.  Allocate as many buffers as needed.
 		 */
 		for (len_to_go = rb_data->fill_len - rb_data->resid -
 		     indirect->len; len_to_go > 0;) {
 			struct camdd_buf *new_buf;
 			struct camdd_buf_data *new_data;
 			uint64_t lba;
 			ssize_t len;
 
 			retval = camdd_get_next_lba_len(dev, &lba, &len);
 			if ((retval != 0)
 			 && (len == 0)) {
 				/*
 				 * The device has already been marked
 				 * as EOF, and there is no space left.
 				 */
 				goto bailout;
 			}
 
 			new_buf = camdd_get_buf(dev, CAMDD_BUF_DATA);
 			if (new_buf == NULL) {
 				retval = 1;
 				goto bailout;
 			}
 
 			new_buf->lba = lba;
 			new_buf->len = len;
 
 			/*
 			 * NOTE(review): if this allocation fails, new_buf
 			 * is on no queue and is not released before the
 			 * function returns -- confirm whether that is a
 			 * leak.
 			 */
 			idb = camdd_get_buf(dev, CAMDD_BUF_INDIRECT);
 			if (idb == NULL) {
 				retval = 1;
 				goto bailout;
 			}
 
 			indirect = &idb->buf_type_spec.indirect;
 
 			indirect->src_buf = read_buf;
 			read_buf->refcount++;
 			indirect->offset = cur_offset;
 			indirect->start_ptr = rb_data->buf + cur_offset;
 			/* Take what is left, capped at this write buffer's size. */
 			indirect->len = min(len_to_go, new_buf->len);
 #if 0
 			if (((indirect->len % dev->sector_size) != 0)
 			 || ((indirect->offset % dev->sector_size) != 0)) {
 				warnx("offset %ju len %ju not aligned with "
 				    "sector size %u", indirect->offset,
 				    (uintmax_t)indirect->len, dev->sector_size);
 			}
 #endif
 			cur_offset += indirect->len;
 			len_to_go -= indirect->len;
 
 			camdd_buf_add_child(new_buf, idb);
 
 			new_data = &new_buf->buf_type_spec.data;
 
 			/*
 			 * A full buffer (or any buffer when flushing for
 			 * EOF) is runnable; a partial one waits on the
 			 * pending queue for more data.
 			 *
 			 * NOTE(review): the partial-buffer test compares
 			 * against buf->len (the first write buffer) rather
 			 * than new_buf->len -- confirm this is intended.
 			 */
 			if ((new_data->fill_len == new_buf->len)
 			 || (eof_flush_needed != 0)) {
 				STAILQ_INSERT_TAIL(&dev->run_queue,
 						   new_buf, links);
 				dev->num_run_queue++;
 			} else if (new_data->fill_len < buf->len) {
 				STAILQ_INSERT_TAIL(&dev->pending_queue,
 					   	new_buf, links);
 				dev->num_pending_queue++;
 			} else {
 				warnx("%s: too much data in new "
 				      "buffer!", __func__);
 				retval = 1;
 				goto bailout;
 			}
 		}
 	}
 
 bailout:
 	return (retval);
 }
 
 /*
  * Report queue depths and byte counts for this device and its peer.
  *
  * our_depth   <- cur_active_io + num_run_queue
  * peer_depth  <- num_peer_work_queue - num_peer_done_queue, or 0 when the
  *                done count is not smaller than the work count
  * our_bytes   <- our_depth * blocksize
  * peer_bytes  <- peer_bytes_queued
  */
 void
 camdd_get_depth(struct camdd_dev *dev, uint32_t *our_depth,
 		uint32_t *peer_depth, uint32_t *our_bytes, uint32_t *peer_bytes)
 {
 	*our_depth = dev->cur_active_io + dev->num_run_queue;
 
 	/* Never report a negative (wrapped) peer depth. */
 	*peer_depth = (dev->num_peer_work_queue > dev->num_peer_done_queue) ?
 	    dev->num_peer_work_queue - dev->num_peer_done_queue : 0;
 
 	*our_bytes = *our_depth * dev->blocksize;
 	*peer_bytes = dev->peer_bytes_queued;
 }
 
 /*
  * Signal handler: record what was requested and wake whoever is waiting
  * on camdd_sem.  SIGINFO asks for a status report; any other signal this
  * handler is installed for requests an exit and flags the run as failed.
  */
 void
 camdd_sig_handler(int sig)
 {
 	switch (sig) {
 	case SIGINFO:
 		need_status = 1;
 		break;
 	default:
 		need_exit = 1;
 		error_exit = 1;
 		break;
 	}
 
 	sem_post(&camdd_sem);
 }
 
 /*
  * Print a transfer summary to stderr: bytes transferred and direction for
  * each of the two devices, the time elapsed since start_time, and the
  * throughput in MB/sec (1 MB = 1024 * 1024 bytes).  The throughput is
  * based on the smaller of the two devices' byte counts.
  *
  * If the current time cannot be read, a warning is issued and nothing is
  * printed.
  */
 void
 camdd_print_status(struct camdd_dev *camdd_dev, struct camdd_dev *other_dev, 
 		   struct timespec *start_time)
 {
 	struct timespec done_time;
 	uint64_t total_ns;
 	long double mb_sec, total_sec;
 	int error = 0;
 
 	error = clock_gettime(CLOCK_MONOTONIC_PRECISE, &done_time);
 	if (error != 0) {
 		warn("Unable to get done time");
 		return;
 	}
 
 	/*
 	 * From here on done_time is used as the elapsed interval
 	 * (done - start), not as an absolute time.
 	 */
-	timespecsub(&done_time, start_time);
+	timespecsub(&done_time, start_time, &done_time);
 	
 	/*
 	 * NOTE(review): the multiplication is done in the type of tv_sec;
 	 * where that type is 32 bits wide this would overflow after a
 	 * couple of seconds -- confirm for the supported platforms.
 	 */
 	total_ns = done_time.tv_nsec + (done_time.tv_sec * 1000000000);
 	total_sec = total_ns;
 	total_sec /= 1000000000;
 
 	fprintf(stderr, "%ju bytes %s %s\n%ju bytes %s %s\n"
 		"%.4Lf seconds elapsed\n",
 		(uintmax_t)camdd_dev->bytes_transferred,
 		(camdd_dev->write_dev == 0) ?  "read from" : "written to",
 		camdd_dev->device_name,
 		(uintmax_t)other_dev->bytes_transferred,
 		(other_dev->write_dev == 0) ? "read from" : "written to",
 		other_dev->device_name, total_sec);
 
 	/* bytes -> MB, then divide by elapsed seconds (total_ns / 1e9). */
 	mb_sec = min(other_dev->bytes_transferred,camdd_dev->bytes_transferred);
 	mb_sec /= 1024 * 1024;
 	mb_sec *= 1000000000;
 	/* NOTE(review): total_ns is not checked for zero before dividing. */
 	mb_sec /= total_ns;
 	fprintf(stderr, "%.2Lf MB/sec\n", mb_sec);
 }
 
 /*
  * Open and probe the input and output paths, start one worker thread per
  * device, wait for completion or a signal, and print a final status report.
  *
  * io_opts      Array of num_io_opts I/O descriptions.
  * num_io_opts  Number of entries in io_opts; must be exactly 2.
  * max_io       Maximum number of bytes to transfer, or 0 for no limit.
  * retry_count  Retry count handed to the device probe routines.
  * timeout      I/O timeout handed to the device probe routines.
  *
  * Returns 0 on success.  A non-zero value is the local error status plus
  * error_exit (set by the signal handler when an exit was requested).
  */
 int
 camdd_rw(struct camdd_io_opts *io_opts, int num_io_opts, uint64_t max_io,
 	 int retry_count, int timeout)
 {
 	struct cam_device *new_cam_dev = NULL;
 	/*
 	 * Zeroed up front: the bailout path frees every slot, including
 	 * when we bail out before any device has been probed.
 	 */
 	struct camdd_dev *devs[2] = { NULL, NULL };
 	struct timespec start_time;
 	pthread_t threads[2];
 	int unit = 0;
 	int error = 0;
 	int i;
 
 	if (num_io_opts != 2) {
 		warnx("Must have one input and one output path");
 		error = 1;
 		goto bailout;
 	}
 
 	for (i = 0; i < num_io_opts; i++) {
 		switch (io_opts[i].dev_type) {
 		case CAMDD_DEV_PASS: {
 			/*
 			 * The <ctype.h> functions require a value
 			 * representable as unsigned char (or EOF).
 			 */
 			if (isdigit((unsigned char)io_opts[i].dev_name[0])) {
 				camdd_argmask new_arglist = CAMDD_ARG_NONE;
 				int bus = 0, target = 0, lun = 0;
 				int rv;
 
 				/* device specified as bus:target[:lun] */
 				rv = parse_btl(io_opts[i].dev_name, &bus,
 				    &target, &lun, &new_arglist);
 				if (rv < 2) {
 					warnx("numeric device specification "
 					     "must be either bus:target, or "
 					     "bus:target:lun");
 					error = 1;
 					goto bailout;
 				}
 				/* default to 0 if lun was not specified */
 				if ((new_arglist & CAMDD_ARG_LUN) == 0) {
 					lun = 0;
 					new_arglist |= CAMDD_ARG_LUN;
 				}
 				new_cam_dev = cam_open_btl(bus, target, lun,
 				    O_RDWR, NULL);
 			} else {
 				char name[30];
 
 				/* device specified by name, e.g. pass0 */
 				if (cam_get_device(io_opts[i].dev_name, name,
 						   sizeof name, &unit) == -1) {
 					warnx("%s", cam_errbuf);
 					error = 1;
 					goto bailout;
 				}
 				new_cam_dev = cam_open_spec_device(name, unit,
 				    O_RDWR, NULL);
 			}
 
 			if (new_cam_dev == NULL) {
 				warnx("%s", cam_errbuf);
 				error = 1;
 				goto bailout;
 			}
 
 			devs[i] = camdd_probe_pass(new_cam_dev,
 			    /*io_opts*/ &io_opts[i],
 			    CAMDD_ARG_ERR_RECOVER, 
 			    /*probe_retry_count*/ 3,
 			    /*probe_timeout*/ 5000,
 			    /*io_retry_count*/ retry_count,
 			    /*io_timeout*/ timeout);
 			if (devs[i] == NULL) {
 				warn("Unable to probe device %s%u",
 				     new_cam_dev->device_name,
 				     new_cam_dev->dev_unit_num);
 				error = 1;
 				goto bailout;
 			}
 			break;
 		}
 		case CAMDD_DEV_FILE: {
 			int fd = -1;
 
 			/* A name starting with '-' selects stdin/stdout. */
 			if (io_opts[i].dev_name[0] == '-') {
 				if (io_opts[i].write_dev != 0)
 					fd = STDOUT_FILENO;
 				else
 					fd = STDIN_FILENO;
 			} else {
 				if (io_opts[i].write_dev != 0) {
 					fd = open(io_opts[i].dev_name,
 					    O_RDWR | O_CREAT, S_IWUSR |S_IRUSR);
 				} else {
 					fd = open(io_opts[i].dev_name,
 					    O_RDONLY);
 				}
 			}
 			if (fd == -1) {
 				warn("error opening file %s",
 				    io_opts[i].dev_name);
 				error = 1;
 				goto bailout;
 			}
 
 			devs[i] = camdd_probe_file(fd, &io_opts[i],
 			    retry_count, timeout);
 			if (devs[i] == NULL) {
 				error = 1;
 				goto bailout;
 			}
 
 			break;
 		}
 		default:
 			warnx("Unknown device type %d (%s)",
 			    io_opts[i].dev_type, io_opts[i].dev_name);
 			error = 1;
 			goto bailout;
 			break; /*NOTREACHED */
 		}
 
 		devs[i]->write_dev = io_opts[i].write_dev;
 
 		devs[i]->start_offset_bytes = io_opts[i].offset;
 
 		/*
 		 * With a transfer limit, the last sector we may touch is
 		 * the starting sector plus the limit in sectors, minus one.
 		 */
 		if (max_io != 0) {
 			devs[i]->sector_io_limit =
 			    (devs[i]->start_offset_bytes /
 			    devs[i]->sector_size) +
 			    (max_io / devs[i]->sector_size) - 1;
 		}
 
 		devs[i]->next_io_pos_bytes = devs[i]->start_offset_bytes;
 		devs[i]->next_completion_pos_bytes =devs[i]->start_offset_bytes;
 	}
 
 	/* Cross-link the two devices so each knows its peer. */
 	devs[0]->peer_dev = devs[1];
 	devs[1]->peer_dev = devs[0];
 	devs[0]->next_peer_pos_bytes = devs[0]->peer_dev->next_io_pos_bytes;
 	devs[1]->next_peer_pos_bytes = devs[1]->peer_dev->next_io_pos_bytes;
 
 	sem_init(&camdd_sem, /*pshared*/ 0, 0);
 
 	signal(SIGINFO, camdd_sig_handler);
 	signal(SIGINT, camdd_sig_handler);
 
 	error = clock_gettime(CLOCK_MONOTONIC_PRECISE, &start_time);
 	if (error != 0) {
 		warn("Unable to get start time");
 		goto bailout;
 	}
 
 	/*
 	 * NOTE(review): if creating the second thread fails, the first
 	 * thread is already running when the devices are freed at
 	 * bailout -- confirm whether that needs handling.
 	 */
 	for (i = 0; i < num_io_opts; i++) {
 		error = pthread_create(&threads[i], NULL, camdd_worker,
 				       (void *)devs[i]);
 		if (error != 0) {
 			warnc(error, "pthread_create() failed");
 			goto bailout;
 		}
 	}
 
 	/*
 	 * Sleep until camdd_sem is posted.  On an exit request (or a
 	 * sem_wait() failure) mark both devices EOF and wake their
 	 * threads via kevent; on a status request print a report and
 	 * keep waiting.
 	 */
 	for (;;) {
 		if ((sem_wait(&camdd_sem) == -1)
 		 || (need_exit != 0)) {
 			struct kevent ke;
 
 			for (i = 0; i < num_io_opts; i++) {
 				EV_SET(&ke, (uintptr_t)&devs[i]->work_queue,
 				    EVFILT_USER, 0, NOTE_TRIGGER, 0, NULL);
 
 				devs[i]->flags |= CAMDD_DEV_FLAG_EOF;
 
 				error = kevent(devs[i]->kq, &ke, 1, NULL, 0,
 						NULL);
 				if (error == -1)
 					warn("%s: unable to wake up thread",
 					    __func__);
 				/* A failed wakeup is reported but not fatal. */
 				error = 0;
 			}
 			break;
 		} else if (need_status != 0) {
 			camdd_print_status(devs[0], devs[1], &start_time);
 			need_status = 0;
 		}
 	} 
 	for (i = 0; i < num_io_opts; i++) {
 		pthread_join(threads[i], NULL);
 	}
 
 	camdd_print_status(devs[0], devs[1], &start_time);
 
 bailout:
 
 	/*
 	 * Walk the array itself rather than trusting num_io_opts, which
 	 * may not match the array size when we bailed out on the
 	 * argument check above.
 	 */
 	for (i = 0; i < (int)(sizeof(devs) / sizeof(devs[0])); i++)
 		camdd_free_dev(devs[i]);
 
 	return (error + error_exit);
 }
 
 /*
  * Write the command-line help text to stderr.
  */
 void
 usage(void)
 {
 	/* The text contains no conversion specifiers; it is emitted verbatim. */
 	static const char help_text[] =
 "usage:  camdd <-i|-o pass=pass0,bs=1M,offset=1M,depth=4>\n"
 "              <-i|-o file=/tmp/file,bs=512K,offset=1M>\n"
 "              <-i|-o file=/dev/da0,bs=512K,offset=1M>\n"
 "              <-i|-o file=/dev/nsa0,bs=512K>\n"
 "              [-C retry_count][-E][-m max_io_amt][-t timeout_secs][-v][-h]\n"
 "Option description\n"
 "-i <arg=val>  Specify input device/file and parameters\n"
 "-o <arg=val>  Specify output device/file and parameters\n"
 "Input and Output parameters\n"
 "pass=name     Specify a pass(4) device like pass0 or /dev/pass0\n"
 "file=name     Specify a file or device, /tmp/foo, /dev/da0, /dev/null\n"
 "              or - for stdin/stdout\n"
 "bs=blocksize  Specify blocksize in bytes, or using K, M, G, etc. suffix\n"
 "offset=len    Specify starting offset in bytes or using K, M, G suffix\n"
 "              NOTE: offset cannot be specified on tapes, pipes, stdin/out\n"
 "depth=N       Specify a numeric queue depth.  This only applies to pass(4)\n"
 "mcs=N         Specify a minimum cmd size for pass(4) read/write commands\n"
 "Optional arguments\n"
 "-C retry_cnt  Specify a retry count for pass(4) devices\n"
 "-E            Enable CAM error recovery for pass(4) devices\n"
 "-m max_io     Specify the maximum amount to be transferred in bytes or\n"
 "              using K, G, M, etc. suffixes\n"
 "-t timeout    Specify the I/O timeout to use with pass(4) devices\n"
 "-v            Enable verbose error recovery\n"
 "-h            Print this message\n";
 
 	fputs(help_text, stderr);
 }
 
 
 /*
  * Parse a comma-separated list of name=value I/O parameters (the argument
  * of -i or -o) into io_opts.
  *
  * args      The parameter string; it is copied, not modified.
  * is_write  Stored in io_opts->write_dev.
  * io_opts   Destination for the parsed settings.
  *
  * Names are matched case-insensitively on their leading characters
  * (file, pass, bs/blocksize, depth, mcs, offset, debug).  Empty fields
  * (two commas in a row) are skipped.  An unrecognized name produces a
  * warning but is not treated as an error.
  *
  * Returns 0 on success and 1 on failure.
  */
 int
 camdd_parse_io_opts(char *args, int is_write, struct camdd_io_opts *io_opts)
 {
 	char *tmpstr, *tmpstr2;
 	char *orig_tmpstr = NULL;
 	int retval = 0;
 
 	io_opts->write_dev = is_write;
 
 	tmpstr = strdup(args);
 	if (tmpstr == NULL) {
 		warn("strdup failed");
 		retval = 1;
 		goto bailout;
 	}
 	/* strsep() advances tmpstr, so keep the original pointer to free. */
 	orig_tmpstr = tmpstr;
 	while ((tmpstr2 = strsep(&tmpstr, ",")) != NULL) {
 		char *name, *value;
 
 		/*
 		 * If the user creates an empty parameter by putting in two
 		 * commas, skip over it and look for the next field.
 		 */
 		if (*tmpstr2 == '\0')
 			continue;
 
 		name = strsep(&tmpstr2, "=");
 		if (*name == '\0') {
 			warnx("Got empty I/O parameter name");
 			retval = 1;
 			goto bailout;
 		}
 		/*
 		 * Everything after the first '=' is the value.  Splitting
 		 * on '=' a second time would silently truncate values that
 		 * themselves contain '=' (a file name, for instance).
 		 * tmpstr2 is NULL here when the field had no '=' at all.
 		 */
 		value = tmpstr2;
 		if ((value == NULL)
 		 || (*value == '\0')) {
 			warnx("Empty I/O parameter value for %s", name);
 			retval = 1;
 			goto bailout;
 		}
 		if (strncasecmp(name, "file", 4) == 0) {
 			io_opts->dev_type = CAMDD_DEV_FILE;
 			io_opts->dev_name = strdup(value);
 			if (io_opts->dev_name == NULL) {
 				warn("Error allocating memory");
 				retval = 1;
 				goto bailout;
 			}
 		} else if (strncasecmp(name, "pass", 4) == 0) {
 			io_opts->dev_type = CAMDD_DEV_PASS;
 			io_opts->dev_name = strdup(value);
 			if (io_opts->dev_name == NULL) {
 				warn("Error allocating memory");
 				retval = 1;
 				goto bailout;
 			}
 		} else if ((strncasecmp(name, "bs", 2) == 0)
 			|| (strncasecmp(name, "blocksize", 9) == 0)) {
 			retval = expand_number(value, &io_opts->blocksize);
 			if (retval == -1) {
 				warn("expand_number(3) failed on %s=%s", name,
 				    value);
 				retval = 1;
 				goto bailout;
 			}
 		} else if (strncasecmp(name, "depth", 5) == 0) {
 			char *endptr;
 
 			io_opts->queue_depth = strtoull(value, &endptr, 0);
 			if (*endptr != '\0') {
 				warnx("invalid queue depth %s", value);
 				retval = 1;
 				goto bailout;
 			}
 		} else if (strncasecmp(name, "mcs", 3) == 0) {
 			char *endptr;
 
 			/* Only values in the range 0..16 are accepted. */
 			io_opts->min_cmd_size = strtol(value, &endptr, 0);
 			if ((*endptr != '\0')
 			 || ((io_opts->min_cmd_size > 16)
 			  || (io_opts->min_cmd_size < 0))) {
 				warnx("invalid minimum cmd size %s", value);
 				retval = 1;
 				goto bailout;
 			}
 		} else if (strncasecmp(name, "offset", 6) == 0) {
 			retval = expand_number(value, &io_opts->offset);
 			if (retval == -1) {
 				warn("expand_number(3) failed on %s=%s", name,
 				    value);
 				retval = 1;
 				goto bailout;
 			}
 		} else if (strncasecmp(name, "debug", 5) == 0) {
 			char *endptr;
 
 			io_opts->debug = strtoull(value, &endptr, 0);
 			if (*endptr != '\0') {
 				warnx("invalid debug level %s", value);
 				retval = 1;
 				goto bailout;
 			}
 		} else {
 			/* Unknown names are reported but do not fail the parse. */
 			warnx("Unrecognized parameter %s=%s", name, value);
 		}
 	}
 bailout:
 	free(orig_tmpstr);
 
 	return (retval);
 }
 
 int
 main(int argc, char **argv)
 {
 	int c;
 	camdd_argmask arglist = CAMDD_ARG_NONE;
 	int timeout = 0, retry_count = 1;
 	int error = 0;
 	uint64_t max_io = 0;
 	struct camdd_io_opts *opt_list = NULL;
 
 	if (argc == 1) {
 		usage();
 		exit(1);
 	}
 
 	opt_list = calloc(2, sizeof(struct camdd_io_opts));
 	if (opt_list == NULL) {
 		warn("Unable to allocate option list");
 		error = 1;
 		goto bailout;
 	}
 
 	while ((c = getopt(argc, argv, "C:Ehi:m:o:t:v")) != -1){
 		switch (c) {
 		case 'C':
 			retry_count = strtol(optarg, NULL, 0);
 			if (retry_count < 0)
 				errx(1, "retry count %d is < 0",
 				     retry_count);
 			arglist |= CAMDD_ARG_RETRIES;
 			break;
 		case 'E':
 			arglist |= CAMDD_ARG_ERR_RECOVER;
 			break;
 		case 'i':
 		case 'o':
 			if (((c == 'i')
 			  && (opt_list[0].dev_type != CAMDD_DEV_NONE))
 			 || ((c == 'o')
 			  && (opt_list[1].dev_type != CAMDD_DEV_NONE))) {
 				errx(1, "Only one input and output path "
 				    "allowed");
 			}
 			error = camdd_parse_io_opts(optarg, (c == 'o') ? 1 : 0,
 			    (c == 'o') ? &opt_list[1] : &opt_list[0]);
 			if (error != 0)
 				goto bailout;
 			break;
 		case 'm':
 			error = expand_number(optarg, &max_io);
 			if (error == -1) {
 				warn("invalid maximum I/O amount %s", optarg);
 				error = 1;
 				goto bailout;
 			}
 			break;
 		case 't':
 			timeout = strtol(optarg, NULL, 0);
 			if (timeout < 0)
 				errx(1, "invalid timeout %d", timeout);
 			/* Convert the timeout from seconds to ms */
 			timeout *= 1000;
 			arglist |= CAMDD_ARG_TIMEOUT;
 			break;
 		case 'v':
 			arglist |= CAMDD_ARG_VERBOSE;
 			break;
 		case 'h':
 		default:
 			usage();
 			exit(1);
 			break; /*NOTREACHED*/
 		}
 	}
 
 	if ((opt_list[0].dev_type == CAMDD_DEV_NONE)
 	 || (opt_list[1].dev_type == CAMDD_DEV_NONE))
 		errx(1, "Must specify both -i and -o");
 
 	/*
 	 * Set the timeout if the user hasn't specified one.
 	 */
 	if (timeout == 0)
 		timeout = CAMDD_PASS_RW_TIMEOUT;
 
 	error = camdd_rw(opt_list, 2, max_io, retry_count, timeout);
 
 bailout:
 	free(opt_list);
 
 	exit(error);
 }