Index: head/sys/cam/scsi/scsi_ch.c
===================================================================
--- head/sys/cam/scsi/scsi_ch.c	(revision 328237)
+++ head/sys/cam/scsi/scsi_ch.c	(revision 328238)
@@ -1,1934 +1,1935 @@
 /*-
  * SPDX-License-Identifier: (BSD-2-Clause-FreeBSD AND BSD-4-Clause)
  *
  * Copyright (c) 1997 Justin T. Gibbs.
  * Copyright (c) 1997, 1998, 1999 Kenneth D. Merry.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions, and the following disclaimer,
  *    without modification, immediately at the beginning of the file.
  * 2. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*-
  * Copyright (c) 1996, 1997 Jason R. Thorpe <thorpej@and.com>
  * All rights reserved.
  *
  * Partially based on an autochanger driver written by Stefan Grefen
  * and on an autochanger driver written by the Systems Programming Group
  * at the University of Utah Computer Science Department.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgements:
  *	This product includes software developed by Jason R. Thorpe
  *	for And Communications, http://www.and.com/
  * 4. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $NetBSD: ch.c,v 1.34 1998/08/31 22:28:06 cgd Exp $
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/queue.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/types.h>
 #include <sys/malloc.h>
 #include <sys/fcntl.h>
 #include <sys/conf.h>
 #include <sys/chio.h>
 #include <sys/errno.h>
 #include <sys/devicestat.h>
 
 #include <cam/cam.h>
 #include <cam/cam_ccb.h>
 #include <cam/cam_periph.h>
 #include <cam/cam_xpt_periph.h>
 #include <cam/cam_debug.h>
 
 #include <cam/scsi/scsi_all.h>
 #include <cam/scsi/scsi_message.h>
 #include <cam/scsi/scsi_ch.h>
 
 /*
  * Timeout definitions for various changer related commands.  They may
  * be too short for some devices (especially the timeout for INITIALIZE
  * ELEMENT STATUS).
  */
 
 /*
  * Per-command timeouts handed to the SCSI command builders below.
  * NOTE(review): the values read as milliseconds (15 * 60 * 1000 looks
  * like fifteen minutes) -- confirm against the CAM timeout units.
  */
 static const u_int32_t	CH_TIMEOUT_MODE_SENSE                = 6000;
 static const u_int32_t	CH_TIMEOUT_MOVE_MEDIUM               = 15 * 60 * 1000;
 static const u_int32_t	CH_TIMEOUT_EXCHANGE_MEDIUM           = 15 * 60 * 1000;
 static const u_int32_t	CH_TIMEOUT_POSITION_TO_ELEMENT       = 15 * 60 * 1000;
 static const u_int32_t	CH_TIMEOUT_READ_ELEMENT_STATUS       = 5 * 60 * 1000;
 static const u_int32_t	CH_TIMEOUT_SEND_VOLTAG		     = 10000;
 static const u_int32_t	CH_TIMEOUT_INITIALIZE_ELEMENT_STATUS = 500000;
 
 /*
  * Softc flag bits.  CH_FLAG_INVALID is set by choninvalidate() and makes
  * chopen() fail with ENXIO.
  */
 typedef enum {
 	CH_FLAG_INVALID		= 0x001
 } ch_flags;
 
 /*
  * Driver state consulted by chstart().  A new instance starts in
  * CH_STATE_PROBE (set in chregister()) and is moved to CH_STATE_NORMAL
  * by chdone() once the probe CCB has completed.
  */
 typedef enum {
 	CH_STATE_PROBE,
 	CH_STATE_NORMAL
 } ch_state;
 
 /*
  * Value stored in a CCB's ccb_state field so that chdone() can tell
  * which kind of request has completed.
  */
 typedef enum {
 	CH_CCB_PROBE
 } ch_ccb_types;
 
 /*
  * Device quirk bits kept in the softc.
  *   CH_Q_NO_DBD   - set by chdone() after a MODE SENSE with block
  *                   descriptors disabled failed and was re-sent with
  *                   them enabled.
  *   CH_Q_NO_DVCID - set by chregister() for devices reporting SCSI-2 or
  *                   an earlier revision.
  */
 typedef enum {
 	CH_Q_NONE	= 0x00,
 	CH_Q_NO_DBD	= 0x01,
 	CH_Q_NO_DVCID	= 0x02
 } ch_quirks;
 
 /*
  * Names for the ch_quirks bits, passed to xpt_announce_quirks() by
  * chdone().  NOTE(review): this looks like a kernel "%b" bit-name string
  * (leading "\020" selecting hexadecimal output, then one
  * "<bit number><name>" entry per quirk) -- confirm against printf(9).
  */
 #define CH_Q_BIT_STRING	\
 	"\020"		\
 	"\001NO_DBD"	\
 	"\002NO_DVCID"
 
 /*
  * Driver-local names for two private fields of the CCB header; they are
  * used as ccb_h.ccb_state and ccb_h.ccb_bp in chstart() and chdone().
  */
 #define ccb_state	ppriv_field0
 #define ccb_bp		ppriv_ptr1
 
 /*
  * Layout of a 6-byte MODE SENSE reply that carries a block descriptor
  * followed by exactly one of the changer mode pages listed in the union.
  */
 struct scsi_mode_sense_data {
 	struct scsi_mode_header_6 header;
 	struct scsi_mode_blk_desc blk_desc;
 	union {
 		struct page_element_address_assignment ea;
 		struct page_transport_geometry_parameters tg;
 		struct page_device_capabilities cap;
 	} pages;
 };
 
 /*
  * Per-instance driver state.  It is allocated in chregister(), reached
  * through periph->softc, and freed in chcleanup().
  */
 struct ch_softc {
 	ch_flags	flags;		/* CH_FLAG_* bits */
 	ch_state	state;		/* probe/normal, see chstart() */
 	ch_quirks	quirks;		/* CH_Q_* bits */
 	union ccb	saved_ccb;
 	struct devstat	*device_stats;	/* from devstat_new_entry() */
 	struct cdev     *dev;		/* devfs node from make_dev_s() */
 	int		open_count;	/* ++ in chopen(), -- in chclose() */
 
 	int		sc_picker;	/* current picker */
 
 	/*
 	 * The following information is obtained from the
 	 * element address assignment page.
 	 */
 	int		sc_firsts[CHET_MAX + 1];	/* firsts */
 	int		sc_counts[CHET_MAX + 1];	/* counts */
 
 	/*
 	 * The following mask defines the legal combinations
 	 * of elements for the MOVE MEDIUM command.
 	 */
 	u_int8_t	sc_movemask[CHET_MAX + 1];
 
 	/*
 	 * As above, but for EXCHANGE MEDIUM.
 	 */
 	u_int8_t	sc_exchangemask[CHET_MAX + 1];
 
 	/*
 	 * Quirks; see below.  XXX KDM not implemented yet
 	 */
 	int		sc_settledelay;	/* delay for settle */
 };
 
 static	d_open_t	chopen;
 static	d_close_t	chclose;
 static	d_ioctl_t	chioctl;
 static	periph_init_t	chinit;
 static  periph_ctor_t	chregister;
 static	periph_oninv_t	choninvalidate;
 static  periph_dtor_t   chcleanup;
 static  periph_start_t  chstart;
 static	void		chasync(void *callback_arg, u_int32_t code,
 				struct cam_path *path, void *arg);
 static	void		chdone(struct cam_periph *periph,
 			       union ccb *done_ccb);
 static	int		cherror(union ccb *ccb, u_int32_t cam_flags,
 				u_int32_t sense_flags);
 static	int		chmove(struct cam_periph *periph,
 			       struct changer_move *cm);
 static	int		chexchange(struct cam_periph *periph,
 				   struct changer_exchange *ce);
 static	int		chposition(struct cam_periph *periph,
 				   struct changer_position *cp);
 static	int		chgetelemstatus(struct cam_periph *periph,
 				int scsi_version, u_long cmd,
 				struct changer_element_status_request *csr);
 static	int		chsetvoltag(struct cam_periph *periph,
 				    struct changer_set_voltag_request *csvr);
 static	int		chielem(struct cam_periph *periph, 
 				unsigned int timeout);
 static	int		chgetparams(struct cam_periph *periph);
 static	int		chscsiversion(struct cam_periph *periph);
 
 /*
  * Peripheral driver descriptor: chinit() is the init hook and "ch" the
  * driver name.
  */
 static struct periph_driver chdriver =
 {
 	chinit, "ch",
 	TAILQ_HEAD_INITIALIZER(chdriver.units), /* generation */ 0
 };
 
 PERIPHDRIVER_DECLARE(ch, chdriver);
 
 /*
  * Character device entry points used for the devfs node that
  * chregister() creates.
  */
 static struct cdevsw ch_cdevsw = {
 	.d_version =	D_VERSION,
 	.d_flags =	D_TRACKCLOSE,
 	.d_open =	chopen,
 	.d_close =	chclose,
 	.d_ioctl =	chioctl,
 	.d_name =	"ch",
 };
 
 /*
  * Malloc type for the mode sense buffer allocated in chstart() and
  * freed in chdone().
  */
 static MALLOC_DEFINE(M_SCSICH, "scsi_ch", "scsi_ch buffers");
 
 /*
  * Driver init hook: register the driver-wide async callback so that
  * chasync() hears about newly found devices.  A registration failure is
  * only reported on the console.
  */
 static void
 chinit(void)
 {
 	cam_status rc;
 
 	rc = xpt_register_async(AC_FOUND_DEVICE, chasync, NULL, NULL);
 	if (rc == CAM_REQ_CMP)
 		return;
 
 	printf("ch: Failed to attach master async callback "
 	       "due to status 0x%x!\n", rc);
 }
 
 /*
  * Callback requested by choninvalidate() through destroy_dev_sched_cb();
  * arg is the periph.  Drops the references still held on behalf of open
  * file contexts and the one held for the device node.
  */
 static void
 chdevgonecb(void *arg)
 {
 	struct cam_periph *periph = (struct cam_periph *)arg;
 	struct mtx *periph_mtx = cam_periph_mtx(periph);
 	struct ch_softc *sc;
 	int dangling;
 
 	mtx_lock(periph_mtx);
 
 	sc = (struct ch_softc *)periph->softc;
 	KASSERT(sc->open_count >= 0, ("Negative open count %d",
 		sc->open_count));
 
 	/*
 	 * No more close calls will arrive from devfs after this callback,
 	 * so every open that is still outstanding has its reference
 	 * released here instead.
 	 */
 	dangling = sc->open_count;
 	while (dangling-- > 0)
 		cam_periph_release_locked(periph);
 	sc->open_count = 0;
 
 	/* The device node is gone; give back the reference held for it. */
 	cam_periph_release_locked(periph);
 
 	/*
 	 * Unlock through the saved mutex pointer rather than
 	 * cam_periph_unlock(): the release just above may have freed the
 	 * periph, and touching it again would cause a page fault.
 	 */
 	mtx_unlock(periph_mtx);
 }
 
 /*
  * Invalidation hook: stop listening for async events, mark the softc
  * invalid so that new opens fail, and schedule destruction of the
  * devfs node.
  */
 static void
 choninvalidate(struct cam_periph *periph)
 {
 	struct ch_softc *sc = (struct ch_softc *)periph->softc;
 
 	/* De-register any async callbacks. */
 	xpt_register_async(0, chasync, periph, periph->path);
 
 	sc->flags |= CH_FLAG_INVALID;
 
 	/*
 	 * Have devfs tear the node down; chdevgonecb() runs once devfs has
 	 * cleaned up its own state.
 	 */
 	destroy_dev_sched_cb(sc->dev, chdevgonecb, periph);
 }
 
 /*
  * Destructor hook: remove the devstat entry and free the softc that
  * chregister() allocated.
  */
 static void
 chcleanup(struct cam_periph *periph)
 {
 	struct ch_softc *sc = (struct ch_softc *)periph->softc;
 
 	devstat_remove_entry(sc->device_stats);
 	free(sc, M_DEVBUF);
 }
 
 /*
  * Async event callback.  For AC_FOUND_DEVICE, a peripheral instance is
  * allocated when the new device is a connected SCSI medium changer;
  * every other event code is forwarded to cam_periph_async().
  */
 static void
 chasync(void *callback_arg, u_int32_t code, struct cam_path *path, void *arg)
 {
 	struct cam_periph *periph = (struct cam_periph *)callback_arg;
 	struct ccb_getdev *cgd;
 	cam_status status;
 
 	if (code != AC_FOUND_DEVICE) {
 		cam_periph_async(periph, code, path, arg);
 		return;
 	}
 
 	/* Only connected SCSI changers are of interest. */
 	cgd = (struct ccb_getdev *)arg;
 	if (cgd == NULL)
 		return;
 	if (cgd->protocol != PROTO_SCSI ||
 	    SID_QUAL(&cgd->inq_data) != SID_QUAL_LU_CONNECTED ||
 	    SID_TYPE(&cgd->inq_data) != T_CHANGER)
 		return;
 
 	/*
 	 * Create the peripheral instance for this device; chregister()
 	 * then kicks off the probe.
 	 */
 	status = cam_periph_alloc(chregister, choninvalidate,
 				  chcleanup, chstart, "ch",
 				  CAM_PERIPH_BIO, path,
 				  chasync, AC_FOUND_DEVICE, cgd);
 
 	if (status == CAM_REQ_CMP || status == CAM_REQ_INPROG)
 		return;
 
 	printf("chasync: Unable to probe new device "
 	       "due to status 0x%x\n", status);
 }
 
 /*
  * Peripheral constructor, passed to cam_periph_alloc() by chasync().
  *
  * arg is the struct ccb_getdev describing the new device.  Allocates and
  * zeroes the softc, creates the devstat entry and the devfs node, asks
  * for AC_LOST_DEVICE notifications and schedules the probe.
  *
  * Returns CAM_REQ_CMP on success and CAM_REQ_CMP_ERR on any failure.
  * The periph lock is dropped around the devstat and devfs calls and is
  * re-taken before every return that follows.
  *
  * NOTE(review): the two CAM_REQ_CMP_ERR returns after the softc has
  * been allocated do not free the softc or the devstat entry here --
  * confirm whether the caller runs chcleanup() on constructor failure.
  */
 static cam_status
 chregister(struct cam_periph *periph, void *arg)
 {
 	struct ch_softc *softc;
 	struct ccb_getdev *cgd;
 	struct ccb_pathinq cpi;
 	struct make_dev_args args;
 	int error;
 
 	cgd = (struct ccb_getdev *)arg;
 	if (cgd == NULL) {
 		printf("chregister: no getdev CCB, can't register device\n");
 		return(CAM_REQ_CMP_ERR);
 	}
 
 	softc = (struct ch_softc *)malloc(sizeof(*softc),M_DEVBUF,M_NOWAIT);
 
 	if (softc == NULL) {
 		printf("chregister: Unable to probe new device. "
 		       "Unable to allocate softc\n");				
 		return(CAM_REQ_CMP_ERR);
 	}
 
 	bzero(softc, sizeof(*softc));
 	softc->state = CH_STATE_PROBE;
 	periph->softc = softc;
 	softc->quirks = CH_Q_NONE;
 
 	/*
 	 * The DVCID and CURDATA bits were not introduced until the SMC
 	 * spec.  If this device claims SCSI-2 or earlier support, then it
 	 * very likely does not support these bits.
 	 */
 	if (cgd->inq_data.version <= SCSI_REV_2)
 		softc->quirks |= CH_Q_NO_DVCID;
 
 	/* cpi.transport is used for the devstat type below. */
 	xpt_path_inq(&cpi, periph->path);
 
 	/*
 	 * Changers don't have a blocksize, and obviously don't support
 	 * tagged queueing.
 	 */
 	cam_periph_unlock(periph);
 	softc->device_stats = devstat_new_entry("ch",
 			  periph->unit_number, 0,
 			  DEVSTAT_NO_BLOCKSIZE | DEVSTAT_NO_ORDERED_TAGS,
 			  SID_TYPE(&cgd->inq_data) |
 			  XPORT_DEVSTAT_TYPE(cpi.transport),
 			  DEVSTAT_PRIORITY_OTHER);
 
 	/*
 	 * Acquire a reference to the periph before we create the devfs
 	 * instance for it.  We'll release this reference once the devfs
 	 * instance has been freed.
 	 */
 	if (cam_periph_acquire(periph) != CAM_REQ_CMP) {
 		xpt_print(periph->path, "%s: lost periph during "
 			  "registration!\n", __func__);
 		cam_periph_lock(periph);
 		return (CAM_REQ_CMP_ERR);
 	}
 
 
 	/* Register the device */
 	make_dev_args_init(&args);
 	args.mda_devsw = &ch_cdevsw;
 	args.mda_unit = periph->unit_number;
 	args.mda_uid = UID_ROOT;
 	args.mda_gid = GID_OPERATOR;
 	args.mda_mode = 0600;
 	args.mda_si_drv1 = periph;
 	error = make_dev_s(&args, &softc->dev, "%s%d", periph->periph_name,
 	    periph->unit_number);
 	cam_periph_lock(periph);
 	if (error != 0) {
 		/* No devfs node, so drop the reference taken for it. */
 		cam_periph_release_locked(periph);
 		return (CAM_REQ_CMP_ERR);
 	}
 
 	/*
 	 * Add an async callback so that we get
 	 * notified if this device goes away.
 	 */
 	xpt_register_async(AC_LOST_DEVICE, chasync, periph, periph->path);
 
 	/*
 	 * Lock this periph until we are setup.
 	 * This first call can't block
 	 */
 	(void)cam_periph_hold(periph, PRIBIO);
 	xpt_schedule(periph, CAM_PRIORITY_DEV);
 
 	return(CAM_REQ_CMP);
 }
 
 /*
  * d_open entry point.
  *
  * Takes a periph reference for the lifetime of the open, refuses the
  * open with ENXIO when the periph cannot be acquired or the softc has
  * been invalidated, and refreshes the changer parameters through
  * chgetparams() while holding the periph.  On success open_count is
  * incremented and 0 is returned; on every failure path the reference
  * taken here is released again before the error is returned.
  */
 static int
 chopen(struct cdev *dev, int flags, int fmt, struct thread *td)
 {
 	struct cam_periph *periph;
 	struct ch_softc *softc;
 	int error;
 
 	periph = (struct cam_periph *)dev->si_drv1;
 	if (cam_periph_acquire(periph) != CAM_REQ_CMP)
 		return (ENXIO);
 
 	softc = (struct ch_softc *)periph->softc;
 
 	cam_periph_lock(periph);
 	
 	if (softc->flags & CH_FLAG_INVALID) {
 		cam_periph_release_locked(periph);
 		cam_periph_unlock(periph);
 		return(ENXIO);
 	}
 
 	/* PCATCH is passed, so this hold can fail with an error. */
 	if ((error = cam_periph_hold(periph, PRIBIO | PCATCH)) != 0) {
 		cam_periph_unlock(periph);
 		cam_periph_release(periph);
 		return (error);
 	}
 
 	/*
 	 * Load information about this changer device into the softc.
 	 */
 	if ((error = chgetparams(periph)) != 0) {
 		cam_periph_unhold(periph);
 		cam_periph_release_locked(periph);
 		cam_periph_unlock(periph);
 		return(error);
 	}
 
 	cam_periph_unhold(periph);
 
 	/* Counted so that chdevgonecb() can release dangling opens. */
 	softc->open_count++;
 
 	cam_periph_unlock(periph);
 
 	/* error is necessarily 0 here. */
 	return(error);
 }
 
 /*
  * d_close entry point: account for the close and give back the periph
  * reference that the matching chopen() took.  Always returns 0.
  */
 static int
 chclose(struct cdev *dev, int flag, int fmt, struct thread *td)
 {
 	struct cam_periph *periph = (struct cam_periph *)dev->si_drv1;
 	struct mtx *periph_mtx = cam_periph_mtx(periph);
 	struct ch_softc *sc;
 
 	mtx_lock(periph_mtx);
 
 	sc = (struct ch_softc *)periph->softc;
 	sc->open_count--;
 
 	cam_periph_release_locked(periph);
 
 	/*
 	 * The mutex is unlocked through the pointer saved above instead of
 	 * cam_periph_unlock(), because the release may have freed the
 	 * periph and dereferencing it again would cause a page fault.
 	 *
 	 * cam_periph_release() sidesteps the problem the same way; the
 	 * lock is handled by hand here so that it also protects the open
 	 * count and no second acquire/release pair is needed.
 	 */
 	mtx_unlock(periph_mtx);
 
 	return (0);
 }
 
 /*
  * Start hook, called with a CCB for this periph.
  *
  * In CH_STATE_NORMAL the CCB is simply released.  In CH_STATE_PROBE a
  * MODE SENSE for the element address assignment page is built and
  * dispatched; chdone() consumes the reply and frees the buffer.
  */
 static void
 chstart(struct cam_periph *periph, union ccb *start_ccb)
 {
 	struct ch_softc *softc;
 
 	softc = (struct ch_softc *)periph->softc;
 
 	switch (softc->state) {
 	case CH_STATE_NORMAL:
 	{
 		xpt_release_ccb(start_ccb);
 		break;
 	}
 	case CH_STATE_PROBE:
 	{
 		int mode_buffer_len;
 		void *mode_buffer;
 
 		/*
 		 * Include the block descriptor when calculating the mode
 		 * buffer length,
 		 */
 		mode_buffer_len = sizeof(struct scsi_mode_header_6) +
 				  sizeof(struct scsi_mode_blk_desc) +
 				 sizeof(struct page_element_address_assignment);
 
 		mode_buffer = malloc(mode_buffer_len, M_SCSICH, M_NOWAIT);
 
 		/*
 		 * NOTE(review): this failure path returns without releasing
 		 * start_ccb or undoing the cam_periph_hold() taken in
 		 * chregister() -- confirm whether that is intended.
 		 */
 		if (mode_buffer == NULL) {
 			printf("chstart: couldn't malloc mode sense data\n");
 			break;
 		}
 		bzero(mode_buffer, mode_buffer_len);
 
 		/*
 		 * Get the element address assignment page.
 		 */
 		scsi_mode_sense(&start_ccb->csio,
 				/* retries */ 1,
 				/* cbfcnp */ chdone,
 				/* tag_action */ MSG_SIMPLE_Q_TAG,
 				/* dbd */ (softc->quirks & CH_Q_NO_DBD) ?
 					FALSE : TRUE,
 				/* pc */ SMS_PAGE_CTRL_CURRENT,
 				/* page */ CH_ELEMENT_ADDR_ASSIGN_PAGE,
 				/* param_buf */ (u_int8_t *)mode_buffer,
 				/* param_len */ mode_buffer_len,
 				/* sense_len */ SSD_FULL_SIZE,
 				/* timeout */ CH_TIMEOUT_MODE_SENSE);
 
 		/* Tag the CCB so chdone() takes its probe branch. */
 		start_ccb->ccb_h.ccb_bp = NULL;
 		start_ccb->ccb_h.ccb_state = CH_CCB_PROBE;
 		xpt_action(start_ccb);
 		break;
 	}
 	}
 }
 
 /*
  * Completion callback for CCBs issued by this driver.
  *
  * For a CH_CCB_PROBE CCB, the element address assignment page is parsed
  * into the softc and the device is announced.  On error the request is
  * either retried by cherror(), re-sent once with block descriptors
  * enabled, or the periph is invalidated.  Unless a retry is pending,
  * the probe ends by switching to CH_STATE_NORMAL, freeing the mode
  * sense buffer, releasing the CCB and dropping the periph hold.
  *
  * Any other CCB state just has its CCB released.
  */
 static void
 chdone(struct cam_periph *periph, union ccb *done_ccb)
 {
 	struct ch_softc *softc;
 	struct ccb_scsiio *csio;
 
 	softc = (struct ch_softc *)periph->softc;
 	csio = &done_ccb->csio;
 
 	switch(done_ccb->ccb_h.ccb_state) {
 	case CH_CCB_PROBE:
 	{
 		struct scsi_mode_header_6 *mode_header;
 		struct page_element_address_assignment *ea;
 		char announce_buf[80];
 
 
 		/* data_ptr is the buffer chstart() allocated. */
 		mode_header = (struct scsi_mode_header_6 *)csio->data_ptr;
 
 		ea = (struct page_element_address_assignment *)
 			find_mode_page_6(mode_header);
 
 		if ((done_ccb->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP){
 			
 			/* First address and count for each element type. */
 			softc->sc_firsts[CHET_MT] = scsi_2btoul(ea->mtea);
 			softc->sc_counts[CHET_MT] = scsi_2btoul(ea->nmte);
 			softc->sc_firsts[CHET_ST] = scsi_2btoul(ea->fsea);
 			softc->sc_counts[CHET_ST] = scsi_2btoul(ea->nse);
 			softc->sc_firsts[CHET_IE] = scsi_2btoul(ea->fieea);
 			softc->sc_counts[CHET_IE] = scsi_2btoul(ea->niee);
 			softc->sc_firsts[CHET_DT] = scsi_2btoul(ea->fdtea);
 			softc->sc_counts[CHET_DT] = scsi_2btoul(ea->ndte);
 			/* Default to the first picker. */
 			softc->sc_picker = softc->sc_firsts[CHET_MT];
 
 #define PLURAL(c)	(c) == 1 ? "" : "s"
 			snprintf(announce_buf, sizeof(announce_buf),
 				"%d slot%s, %d drive%s, "
 				"%d picker%s, %d portal%s",
 		    		softc->sc_counts[CHET_ST],
 				PLURAL(softc->sc_counts[CHET_ST]),
 		    		softc->sc_counts[CHET_DT],
 				PLURAL(softc->sc_counts[CHET_DT]),
 		    		softc->sc_counts[CHET_MT],
 				PLURAL(softc->sc_counts[CHET_MT]),
 		    		softc->sc_counts[CHET_IE],
 				PLURAL(softc->sc_counts[CHET_IE]));
 #undef PLURAL
 			if (announce_buf[0] != '\0') {
 				xpt_announce_periph(periph, announce_buf);
 				xpt_announce_quirks(periph, softc->quirks,
 				    CH_Q_BIT_STRING);
 			}
 		} else {
 			int error;
 
 			error = cherror(done_ccb, CAM_RETRY_SELTO,
 					SF_RETRY_UA | SF_NO_PRINT);
 			/*
 			 * Retry any UNIT ATTENTION type errors.  They
 			 * are expected at boot.
 			 */
 			if (error == ERESTART) {
 				/*
 				 * A retry was scheduled, so
 				 * just return.
 				 */
 				return;
 			} else if (error != 0) {
 				struct scsi_mode_sense_6 *sms;
 				int frozen, retry_scheduled;
 
 				sms = (struct scsi_mode_sense_6 *)
 					done_ccb->csio.cdb_io.cdb_bytes;
 				/*
 				 * Sampled before the CCB is possibly
 				 * re-submitted below.
 				 */
 				frozen = (done_ccb->ccb_h.status &
 				    CAM_DEV_QFRZN) != 0;
 
 				/*
 				 * Check to see if block descriptors were
 				 * disabled.  Some devices don't like that.
 				 * We're taking advantage of the fact that
 				 * the first few bytes of the 6 and 10 byte
 				 * mode sense commands are the same.  If
 				 * block descriptors were disabled, enable
 				 * them and re-send the command.
 				 */
 				if ((sms->byte2 & SMS_DBD) != 0 &&
 				    (periph->flags & CAM_PERIPH_INVALID) == 0) {
 					sms->byte2 &= ~SMS_DBD;
 					xpt_action(done_ccb);
 					softc->quirks |= CH_Q_NO_DBD;
 					retry_scheduled = 1;
 				} else
 					retry_scheduled = 0;
 
 				/* Don't wedge this device's queue */
 				if (frozen)
 					cam_release_devq(done_ccb->ccb_h.path,
 						 /*relsim_flags*/0,
 						 /*reduction*/0,
 						 /*timeout*/0,
 						 /*getcount_only*/0);
 
 				if (retry_scheduled)
 					return;
 
 				if ((done_ccb->ccb_h.status & CAM_STATUS_MASK)
 				    == CAM_SCSI_STATUS_ERROR) 
 					scsi_sense_print(&done_ccb->csio);
 				else {
 					xpt_print(periph->path,
 					    "got CAM status %#x\n",
 					    done_ccb->ccb_h.status);
 				}
 				xpt_print(periph->path, "fatal error, failed "
 				    "to attach to device\n");
 
 				cam_periph_invalidate(periph);
 
 			}
 		}
 		softc->state = CH_STATE_NORMAL;
 		free(mode_header, M_SCSICH);
 		/*
 		 * Since our peripheral may be invalidated by an error
 		 * above or an external event, we must release our CCB
 		 * before releasing the probe lock on the peripheral.
 		 * The peripheral will only go away once the last lock
 		 * is removed, and we need it around for the CCB release
 		 * operation.
 		 */
 		xpt_release_ccb(done_ccb);
 		cam_periph_unhold(periph);
 		return;
 	}
 	default:
 		break;
 	}
 	xpt_release_ccb(done_ccb);
 }
 
 /*
  * Error handler handed to cam_periph_runccb() and cam_periph_ioctl(),
  * and called directly from chdone().  The changer driver has no error
  * processing of its own, so the CCB and both flag words are passed
  * straight to cam_periph_error() and its result is returned.
  */
 static int
 cherror(union ccb *ccb, u_int32_t cam_flags, u_int32_t sense_flags)
 {
 
 	return (cam_periph_error(ccb, cam_flags, sense_flags));
 }
 
 /*
  * d_ioctl entry point.
  *
  * Runs with the periph lock held for the whole call.  Commands other
  * than CHIOGPICKER, CHIOGPARAMS, OCHIOGSTATUS and CHIOGSTATUS require
  * the device to have been opened for writing and fail with EBADF
  * otherwise.  Commands that are not handled here are passed on to
  * cam_periph_ioctl().
  *
  * Returns 0 on success or an errno value.
  */
 static int
 chioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, struct thread *td)
 {
 	struct cam_periph *periph;
 	struct ch_softc *softc;
 	int error;
 
 	periph = (struct cam_periph *)dev->si_drv1;
 	cam_periph_lock(periph);
 	CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("entering chioctl\n"));
 
 	softc = (struct ch_softc *)periph->softc;
 
 	error = 0;
 
 	CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, 
 		  ("trying to do ioctl %#lx\n", cmd));
 
 	/*
 	 * If this command can change the device's state, we must
 	 * have the device open for writing.
 	 */
 	switch (cmd) {
 	case CHIOGPICKER:
 	case CHIOGPARAMS:
 	case OCHIOGSTATUS:
 	case CHIOGSTATUS:
 		break;
 
 	default:
 		if ((flag & FWRITE) == 0) {
 			cam_periph_unlock(periph);
 			return (EBADF);
 		}
 	}
 
 	switch (cmd) {
 	case CHIOMOVE:
 		error = chmove(periph, (struct changer_move *)addr);
 		break;
 
 	case CHIOEXCHANGE:
 		error = chexchange(periph, (struct changer_exchange *)addr);
 		break;
 
 	case CHIOPOSITION:
 		error = chposition(periph, (struct changer_position *)addr);
 		break;
 
 	case CHIOGPICKER:
 		/* Report the picker relative to the first transport element. */
 		*(int *)addr = softc->sc_picker - softc->sc_firsts[CHET_MT];
 		break;
 
 	case CHIOSPICKER:
 	{
 		int new_picker = *(int *)addr;
 
 		/*
 		 * The index is relative to the first transport element.
 		 * Reject negative values as well as values past the last
 		 * picker, so that sc_picker can never end up below the
 		 * first transport element address.
 		 */
 		if (new_picker < 0 ||
 		    new_picker > (softc->sc_counts[CHET_MT] - 1)) {
 			error = EINVAL;
 			break;
 		}
 		softc->sc_picker = softc->sc_firsts[CHET_MT] + new_picker;
 		break;
 	}
 	case CHIOGPARAMS:
 	{
 		struct changer_params *cp = (struct changer_params *)addr;
 
 		cp->cp_npickers = softc->sc_counts[CHET_MT];
 		cp->cp_nslots = softc->sc_counts[CHET_ST];
 		cp->cp_nportals = softc->sc_counts[CHET_IE];
 		cp->cp_ndrives = softc->sc_counts[CHET_DT];
 		break;
 	}
 	case CHIOIELEM:
 		error = chielem(periph, *(unsigned int *)addr);
 		break;
 
 	case OCHIOGSTATUS:
 	{
 		/* The old request is always decoded as SCSI-2. */
 		error = chgetelemstatus(periph, SCSI_REV_2, cmd,
 		    (struct changer_element_status_request *)addr);
 		break;
 	}
 
 	case CHIOGSTATUS:
 	{
 		int scsi_version;
 
 		scsi_version = chscsiversion(periph);
 		if (scsi_version >= SCSI_REV_0) {
 			error = chgetelemstatus(periph, scsi_version, cmd,
 			    (struct changer_element_status_request *)addr);
 	  	}
 		else { /* unable to determine the SCSI version */
 			cam_periph_unlock(periph);
 			return (ENXIO);
 		}
 		break;
 	}
 
 	case CHIOSETVOLTAG:
 	{
 		error = chsetvoltag(periph,
 				    (struct changer_set_voltag_request *) addr);
 		break;
 	}
 
 	/* Implement prevent/allow? */
 
 	default:
 		error = cam_periph_ioctl(periph, cmd, addr, cherror);
 		break;
 	}
 
 	cam_periph_unlock(periph);
 	return (error);
 }
 
 static int
 chmove(struct cam_periph *periph, struct changer_move *cm)
 {
 	struct ch_softc *softc;
 	u_int16_t fromelem, toelem;
 	union ccb *ccb;
 	int error;
 
 	error = 0;
 	softc = (struct ch_softc *)periph->softc;
 
 	/*
 	 * Check arguments.
 	 */
 	if ((cm->cm_fromtype > CHET_DT) || (cm->cm_totype > CHET_DT))
 		return (EINVAL);
 	if ((cm->cm_fromunit > (softc->sc_counts[cm->cm_fromtype] - 1)) ||
 	    (cm->cm_tounit > (softc->sc_counts[cm->cm_totype] - 1)))
 		return (ENODEV);
 
 	/*
 	 * Check the request against the changer's capabilities.
 	 */
 	if ((softc->sc_movemask[cm->cm_fromtype] & (1 << cm->cm_totype)) == 0)
 		return (ENODEV);
 
 	/*
 	 * Calculate the source and destination elements.
 	 */
 	fromelem = softc->sc_firsts[cm->cm_fromtype] + cm->cm_fromunit;
 	toelem = softc->sc_firsts[cm->cm_totype] + cm->cm_tounit;
 
 	ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL);
 
 	scsi_move_medium(&ccb->csio,
 			 /* retries */ 1,
 			 /* cbfcnp */ chdone,
 			 /* tag_action */ MSG_SIMPLE_Q_TAG,
 			 /* tea */ softc->sc_picker,
 			 /* src */ fromelem,
 			 /* dst */ toelem,
 			 /* invert */ (cm->cm_flags & CM_INVERT) ? TRUE : FALSE,
 			 /* sense_len */ SSD_FULL_SIZE,
 			 /* timeout */ CH_TIMEOUT_MOVE_MEDIUM);
 
 	error = cam_periph_runccb(ccb, cherror, /*cam_flags*/CAM_RETRY_SELTO,
 				  /*sense_flags*/ SF_RETRY_UA,
 				  softc->device_stats);
 
 	xpt_release_ccb(ccb);
 
 	return(error);
 }
 
 /*
  * Handle CHIOEXCHANGE: validate the request against the softc and issue
  * an EXCHANGE MEDIUM command using the current picker.
  *
  * Returns EINVAL for an unknown element type, ENODEV for a unit that is
  * out of range or an exchange the changer's exchange mask does not
  * allow, and otherwise the result of cam_periph_runccb().
  */
 static int
 chexchange(struct cam_periph *periph, struct changer_exchange *ce)
 {
 	struct ch_softc *sc = (struct ch_softc *)periph->softc;
 	u_int16_t src_elem, first_dst, second_dst;
 	union ccb *ccb;
 	int rc;
 
 	/* Element types are checked first; they index the tables below. */
 	if (ce->ce_srctype > CHET_DT)
 		return (EINVAL);
 	if (ce->ce_fdsttype > CHET_DT)
 		return (EINVAL);
 	if (ce->ce_sdsttype > CHET_DT)
 		return (EINVAL);
 
 	/* Units must lie within the count for their element type. */
 	if (ce->ce_srcunit > (sc->sc_counts[ce->ce_srctype] - 1))
 		return (ENODEV);
 	if (ce->ce_fdstunit > (sc->sc_counts[ce->ce_fdsttype] - 1))
 		return (ENODEV);
 	if (ce->ce_sdstunit > (sc->sc_counts[ce->ce_sdsttype] - 1))
 		return (ENODEV);
 
 	/* Both legs of the exchange must be allowed by the changer. */
 	if ((sc->sc_exchangemask[ce->ce_srctype] &
 	    (1 << ce->ce_fdsttype)) == 0)
 		return (ENODEV);
 	if ((sc->sc_exchangemask[ce->ce_fdsttype] &
 	    (1 << ce->ce_sdsttype)) == 0)
 		return (ENODEV);
 
 	/* Turn (type, unit) pairs into element addresses. */
 	src_elem = sc->sc_firsts[ce->ce_srctype] + ce->ce_srcunit;
 	first_dst = sc->sc_firsts[ce->ce_fdsttype] + ce->ce_fdstunit;
 	second_dst = sc->sc_firsts[ce->ce_sdsttype] + ce->ce_sdstunit;
 
 	ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL);
 
 	scsi_exchange_medium(&ccb->csio,
 			     /* retries */ 1,
 			     /* cbfcnp */ chdone,
 			     /* tag_action */ MSG_SIMPLE_Q_TAG,
 			     /* tea */ sc->sc_picker,
 			     /* src */ src_elem,
 			     /* dst1 */ first_dst,
 			     /* dst2 */ second_dst,
 			     /* invert1 */ (ce->ce_flags & CE_INVERT1) ?
 			                   TRUE : FALSE,
 			     /* invert2 */ (ce->ce_flags & CE_INVERT2) ?
 			                   TRUE : FALSE,
 			     /* sense_len */ SSD_FULL_SIZE,
 			     /* timeout */ CH_TIMEOUT_EXCHANGE_MEDIUM);
 
 	rc = cam_periph_runccb(ccb, cherror, /*cam_flags*/CAM_RETRY_SELTO,
 			       /*sense_flags*/ SF_RETRY_UA,
 			       sc->device_stats);
 
 	xpt_release_ccb(ccb);
 
 	return (rc);
 }
 
 static int
 chposition(struct cam_periph *periph, struct changer_position *cp)
 {
 	struct ch_softc *softc;
 	u_int16_t dst;
 	union ccb *ccb;
 	int error;
 
 	error = 0;
 	softc = (struct ch_softc *)periph->softc;
 
 	/*
 	 * Check arguments.
 	 */
 	if (cp->cp_type > CHET_DT)
 		return (EINVAL);
 	if (cp->cp_unit > (softc->sc_counts[cp->cp_type] - 1))
 		return (ENODEV);
 
 	/*
 	 * Calculate the destination element.
 	 */
 	dst = softc->sc_firsts[cp->cp_type] + cp->cp_unit;
 
 	ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL);
 
 	scsi_position_to_element(&ccb->csio,
 				 /* retries */ 1,
 				 /* cbfcnp */ chdone,
 				 /* tag_action */ MSG_SIMPLE_Q_TAG,
 				 /* tea */ softc->sc_picker,
 				 /* dst */ dst,
 				 /* invert */ (cp->cp_flags & CP_INVERT) ?
 					      TRUE : FALSE,
 				 /* sense_len */ SSD_FULL_SIZE,
 				 /* timeout */ CH_TIMEOUT_POSITION_TO_ELEMENT);
 
 	error = cam_periph_runccb(ccb, cherror, /*cam_flags*/ CAM_RETRY_SELTO,
 				  /*sense_flags*/ SF_RETRY_UA,
 				  softc->device_stats);
 
 	xpt_release_ccb(ccb);
 
 	return(error);
 }
 
 /*
  * Fill in a user-visible changer_voltag from a device volume_tag.
  *
  * The label is copied one character at a time, up to CH_VOLTAG_MAXLEN
  * characters, and is cut off at the first NUL or space (truncating at
  * the first space is what SCSI-2 suggests).  The volume serial number is
  * converted from SCSI byte order to host byte order.
  *
  * NOTE(review): no terminator is written here, so the label is only
  * NUL-terminated if the caller passes in a zeroed destination --
  * confirm at the call sites.
  */
 static	void
 copy_voltag(struct changer_voltag *uvoltag, struct volume_tag *voltag)
 {
 	int pos = 0;
 
 	while (pos < CH_VOLTAG_MAXLEN) {
 		char ch = voltag->vif[pos];
 
 		if (ch == '\0' || ch == ' ')
 			break;
 		uvoltag->cv_volid[pos] = ch;
 		pos++;
 	}
 	uvoltag->cv_serial = scsi_2btoul(voltag->vsn);
 }
 
 /*
  * Copy an element status descriptor to a user-mode
  * changer_element_status structure.
  *
  *   softc        - supplies the first address and count of each element
  *                  type, used to turn element addresses into
  *                  (type, unit) pairs.
  *   flags        - READ_ELEMENT_STATUS_PVOLTAG / _AVOLTAG bits saying
  *                  which volume tags the descriptor carries.
  *   desc         - descriptor as returned by the device.
  *   ces          - destination; only the fields for which the descriptor
  *                  carries data are written.
  *   scsi_version - selects between the SCSI-2 descriptor tail and the
  *                  later layout that carries a device identifier.
  */
 static void
 copy_element_status(struct ch_softc *softc,
 		    u_int16_t flags,
 		    struct read_element_status_descriptor *desc,
 		    struct changer_element_status *ces,
 		    int scsi_version)
 {
 	u_int16_t eaddr = scsi_2btoul(desc->eaddr);
 	u_int16_t et;
 	struct volume_tag *pvol_tag = NULL, *avol_tag = NULL;
 	struct read_element_status_device_id *devid = NULL;
 	/*
 	 * Normalise the two tag bits to 0/1.  They are different bits, so
 	 * combining the raw masked values cannot tell "exactly one tag"
 	 * apart from "both tags".
 	 */
 	int have_pvoltag = (flags & READ_ELEMENT_STATUS_PVOLTAG) != 0;
 	int have_avoltag = (flags & READ_ELEMENT_STATUS_AVOLTAG) != 0;
 
 	ces->ces_int_addr = eaddr;
 	/* set up logical address in element status */
 	for (et = CHET_MT; et <= CHET_DT; et++) {
 		if ((softc->sc_firsts[et] <= eaddr)
 		    && ((softc->sc_firsts[et] + softc->sc_counts[et])
 			> eaddr)) {
 			ces->ces_addr = eaddr - softc->sc_firsts[et];
 			ces->ces_type = et;
 			break;
 		}
 	}
 
 	ces->ces_flags = desc->flags1;
 
 	ces->ces_sensecode = desc->sense_code;
 	ces->ces_sensequal = desc->sense_qual;
 
 	if (desc->flags2 & READ_ELEMENT_STATUS_INVERT)
 		ces->ces_flags |= CES_INVERT;
 
 	if (desc->flags2 & READ_ELEMENT_STATUS_SVALID) {
 
 		eaddr = scsi_2btoul(desc->ssea);
 
 		/* convert source address to logical format */
 		for (et = CHET_MT; et <= CHET_DT; et++) {
 			if ((softc->sc_firsts[et] <= eaddr)
 			    && ((softc->sc_firsts[et] + softc->sc_counts[et])
 				> eaddr)) {
 				ces->ces_source_addr =
 					eaddr - softc->sc_firsts[et];
 				ces->ces_source_type = et;
 				ces->ces_flags |= CES_SOURCE_VALID;
 				break;
 			}
 		}
 
 		if (!(ces->ces_flags & CES_SOURCE_VALID))
 			printf("ch: warning: could not map element source "
 			       "address %u to a valid element type\n",
 			       eaddr);
 	}
 
 	/*
 	 * pvoltag and avoltag are common between SCSI-2 and later versions.
 	 * The alternate tag follows the primary tag when both are present
 	 * and otherwise sits in the first tag slot.
 	 */
 	if (have_pvoltag)
 		pvol_tag = &desc->voltag_devid.pvoltag;
 	if (have_avoltag)
 		avol_tag = have_pvoltag ?
 		    &desc->voltag_devid.voltag[1] :&desc->voltag_devid.pvoltag;
 	/*
 	 * For SCSI-3 and later, element status can carry designator and
 	 * other information.  Where it lives depends on how many volume
 	 * tags precede it.
 	 */
 	if (scsi_version >= SCSI_REV_SPC) {
 		if (have_pvoltag && have_avoltag)
 			devid = &desc->voltag_devid.vol_tags_and_devid.devid;
 		else if (have_pvoltag || have_avoltag)
 			devid = &desc->voltag_devid.pvol_and_devid.devid;
 		else /* Neither PVOLTAG nor AVOLTAG */
 			devid = &desc->voltag_devid.devid;
 	}
 
 	if (pvol_tag)
 		copy_voltag(&(ces->ces_pvoltag), pvol_tag);
 	/* The alternate tag has its own field; don't clobber the primary. */
 	if (avol_tag)
 		copy_voltag(&(ces->ces_avoltag), avol_tag);
 	if (devid != NULL) {
 		if (devid->designator_length > 0) {
 			bcopy((void *)devid->designator,
 			      (void *)ces->ces_designator,
 			      devid->designator_length);
 			ces->ces_designator_length = devid->designator_length;
 			/*
 			 * Make sure we are always NUL terminated.  This
 			 * won't matter for the binary code set, since the
 			 * user will only pay attention to the length
 			 * field.
 			 */
 			ces->ces_designator[devid->designator_length]= '\0';
 		}
 		if (devid->piv_assoc_designator_type &
 		    READ_ELEMENT_STATUS_PIV_SET) {
 			ces->ces_flags |= CES_PIV;
 			ces->ces_protocol_id =
 			    READ_ELEMENT_STATUS_PROTOCOL_ID(
 			    devid->prot_code_set);
 		}
 		ces->ces_code_set =
 		    READ_ELEMENT_STATUS_CODE_SET(devid->prot_code_set);
 		ces->ces_assoc = READ_ELEMENT_STATUS_ASSOCIATION(
 		    devid->piv_assoc_designator_type);
 		ces->ces_designator_type = READ_ELEMENT_STATUS_DESIGNATOR_TYPE(
 		    devid->piv_assoc_designator_type);
 	} else if (scsi_version > SCSI_REV_2) {
 		/* SCSI-SPC and No devid, no designator */
 		ces->ces_designator_length = 0;
 		ces->ces_designator[0] = '\0';
 		ces->ces_protocol_id = CES_PROTOCOL_ID_FCP_4;
 	}
 
 	if (scsi_version <= SCSI_REV_2) {
 		if (desc->dt_or_obsolete.scsi_2.dt_scsi_flags &
 		    READ_ELEMENT_STATUS_DT_IDVALID) {
 			ces->ces_flags |= CES_SCSIID_VALID;
 			ces->ces_scsi_id =
 			    desc->dt_or_obsolete.scsi_2.dt_scsi_addr;
 		}
 
 		/*
 		 * The LU-valid bit is tested in the flags byte, the same
 		 * byte that holds the ID-valid bit above and the LUN
 		 * extracted below; the address byte holds only the SCSI
 		 * ID.
 		 */
 		if (desc->dt_or_obsolete.scsi_2.dt_scsi_flags &
 		    READ_ELEMENT_STATUS_DT_LUVALID) {
 			ces->ces_flags |= CES_LUN_VALID;
 			ces->ces_scsi_lun =
 			    desc->dt_or_obsolete.scsi_2.dt_scsi_flags &
 			    READ_ELEMENT_STATUS_DT_LUNMASK;
 		}
 	}
 }
 
 /*
  * Read element status from the changer for the element range described
  * by cesr and copy the converted records out to cesr->cesr_element_status.
  *
  * The work is done in two READ ELEMENT STATUS passes: a one-descriptor
  * probe into a 1 KB buffer to learn the descriptor length, then a second
  * command sized for cesr_element_count descriptors.  If the probe fails
  * with EINVAL while dvcid/curdata are set, it is retried with both
  * cleared and, when that retry succeeds, CH_Q_NO_DVCID is recorded in
  * softc->quirks.
  *
  * cmd selects the layout of the records copied out: for OCHIOGSTATUS each
  * record ends just past ces_scsi_lun, otherwise the full
  * struct changer_element_status is used.
  *
  * Returns 0 on success, EINVAL when the requested range fails the check
  * against sc_counts[], or the error from cam_periph_runccb()/copyout().
  *
  * Locking: the periph lock is dropped around the M_WAITOK allocations and
  * around the descriptor conversion/copyout, and is held again on every
  * path that reaches "done".  NOTE(review): this implies the caller enters
  * with the periph lock held -- confirm against the ioctl path.
  */
 static int
 chgetelemstatus(struct cam_periph *periph, int scsi_version, u_long cmd,
 		struct changer_element_status_request *cesr)
 {
 	struct read_element_status_header *st_hdr;
 	struct read_element_status_page_header *pg_hdr;
 	struct read_element_status_descriptor *desc;
 	caddr_t data = NULL;
 	size_t size, desclen;
-	int avail, i, error = 0;
+	u_int avail, i;
 	int curdata, dvcid, sense_flags;
 	int try_no_dvcid = 0;
 	struct changer_element_status *user_data = NULL;
 	struct ch_softc *softc;
 	union ccb *ccb;
 	int chet = cesr->cesr_element_type;
+	int error = 0;
 	int want_voltags = (cesr->cesr_flags & CESR_VOLTAGS) ? 1 : 0;
 
 	softc = (struct ch_softc *)periph->softc;
 
 	/* perform argument checking */
 
 	/*
 	 * Perform a range check on the cesr_element_{base,count}
 	 * request argument fields.
 	 *
 	 * NOTE(review): chet comes straight from the request and indexes
 	 * sc_counts[] / sc_firsts[]; no bound check on it is visible in
 	 * this function -- confirm the caller validates it.
 	 */
 	if ((softc->sc_counts[chet] - cesr->cesr_element_base) <= 0
 	    || (cesr->cesr_element_base + cesr->cesr_element_count)
 	        > softc->sc_counts[chet])
 		return (EINVAL);
 
 	/*
 	 * Request one descriptor for the given element type.  This
 	 * is used to determine the size of the descriptor so that
 	 * we can allocate enough storage for all of them.  We assume
 	 * that the first one can fit into 1k.
 	 */
 	cam_periph_unlock(periph);
 	data = (caddr_t)malloc(1024, M_DEVBUF, M_WAITOK);
 
 	cam_periph_lock(periph);
 	ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL);
 
 	sense_flags = SF_RETRY_UA;
 	if (softc->quirks & CH_Q_NO_DVCID) {
 		dvcid = 0;
 		curdata = 0;
 	} else {
 		dvcid = 1;
 		curdata = 1;
 		/*
 		 * Don't print anything for an Illegal Request, because
 		 * these flags can cause some changers to complain.  We'll
 		 * retry without them if we get an error.
 		 */
 		sense_flags |= SF_QUIET_IR;
 	}
 
 retry_einval:
 
 	/* Probe: a single descriptor starting at the first element of chet. */
 	scsi_read_element_status(&ccb->csio,
 				 /* retries */ 1,
 				 /* cbfcnp */ chdone,
 				 /* tag_action */ MSG_SIMPLE_Q_TAG,
 				 /* voltag */ want_voltags,
 				 /* sea */ softc->sc_firsts[chet],
 				 /* curdata */ curdata,
 				 /* dvcid */ dvcid,
 				 /* count */ 1,
 				 /* data_ptr */ data,
 				 /* dxfer_len */ 1024,
 				 /* sense_len */ SSD_FULL_SIZE,
 				 /* timeout */ CH_TIMEOUT_READ_ELEMENT_STATUS);
 
 	error = cam_periph_runccb(ccb, cherror, /*cam_flags*/ CAM_RETRY_SELTO,
 				  /*sense_flags*/ sense_flags,
 				  softc->device_stats);
 
 	/*
 	 * An Illegal Request sense key (only used if there is no asc/ascq)
 	 * or 0x24,0x00 for an ASC/ASCQ both map to EINVAL.  If dvcid or
 	 * curdata are set (we set both or neither), try turning them off
 	 * and see if the command is successful.
 	 */
 	if ((error == EINVAL)
 	 && (dvcid || curdata))  {
 		dvcid = 0;
 		curdata = 0;
 		error = 0;
 		/* At this point we want to report any Illegal Request */
 		sense_flags &= ~SF_QUIET_IR;
 		try_no_dvcid = 1;
 		goto retry_einval;
 	}
 
 	/*
 	 * In this case, we tried a read element status with dvcid and
 	 * curdata set, and it failed.  We retried without those bits, and
 	 * it succeeded.  Suggest to the user that he set a quirk, so we
 	 * don't go through the retry process the first time in the future.
 	 * This should only happen on changers that claim SCSI-3 or higher,
 	 * but don't support these bits.
 	 */
 	if ((try_no_dvcid != 0)
 	 && (error == 0))
 		softc->quirks |= CH_Q_NO_DVCID;
 
 	/* The periph lock is still held here, as "done" expects. */
 	if (error)
 		goto done;
 	cam_periph_unlock(periph);
 
 	/* The page header follows the status header; edl is the per-descriptor length. */
 	st_hdr = (struct read_element_status_header *)data;
 	pg_hdr = (struct read_element_status_page_header *)((uintptr_t)st_hdr +
 		  sizeof(struct read_element_status_header));
 	desclen = scsi_2btoul(pg_hdr->edl);
 
 	size = sizeof(struct read_element_status_header) +
 	       sizeof(struct read_element_status_page_header) +
 	       (desclen * cesr->cesr_element_count);
 	/*
 	 * Reallocate storage for descriptors and get them from the
 	 * device.
 	 */
 	free(data, M_DEVBUF);
 	data = (caddr_t)malloc(size, M_DEVBUF, M_WAITOK);
 
 	cam_periph_lock(periph);
 	scsi_read_element_status(&ccb->csio,
 				 /* retries */ 1,
 				 /* cbfcnp */ chdone,
 				 /* tag_action */ MSG_SIMPLE_Q_TAG,
 				 /* voltag */ want_voltags,
 				 /* sea */ softc->sc_firsts[chet]
 				 + cesr->cesr_element_base,
 				 /* curdata */ curdata,
 				 /* dvcid */ dvcid,
 				 /* count */ cesr->cesr_element_count,
 				 /* data_ptr */ data,
 				 /* dxfer_len */ size,
 				 /* sense_len */ SSD_FULL_SIZE,
 				 /* timeout */ CH_TIMEOUT_READ_ELEMENT_STATUS);
 
 	error = cam_periph_runccb(ccb, cherror, /*cam_flags*/ CAM_RETRY_SELTO,
 				  /*sense_flags*/ SF_RETRY_UA,
 				  softc->device_stats);
 
 	if (error)
 		goto done;
 	cam_periph_unlock(periph);
 
 	/*
 	 * Fill in the user status array.
 	 */
 	st_hdr = (struct read_element_status_header *)data;
 	pg_hdr = (struct read_element_status_page_header *)((uintptr_t)st_hdr +
 		  sizeof(struct read_element_status_header));
 	/*
 	 * NOTE(review): avail is taken from the returned status header and
 	 * is then used unclamped as the allocation count, the loop bound and
 	 * the copyout size.  If a device reports more than
 	 * cesr_element_count, the loop below would step past the "size"
 	 * bytes in data and copyout would write more records than were
 	 * requested.  Only a warning is printed on mismatch; consider
 	 * clamping avail to cesr_element_count.
 	 */
 	avail = scsi_2btoul(st_hdr->count);
 
 	if (avail != cesr->cesr_element_count) {
 		xpt_print(periph->path,
 		    "warning, READ ELEMENT STATUS avail != count\n");
 	}
 
 	user_data = (struct changer_element_status *)
 		malloc(avail * sizeof(struct changer_element_status),
 		       M_DEVBUF, M_WAITOK | M_ZERO);
 
 	/* First descriptor sits right after the two headers. */
 	desc = (struct read_element_status_descriptor *)((uintptr_t)data +
 		sizeof(struct read_element_status_header) +
 		sizeof(struct read_element_status_page_header));
 	/*
 	 * Set up the individual element status structures
 	 */
 	for (i = 0; i < avail; ++i) {
 		struct changer_element_status *ces;
 
 		/*
 		 * In the changer_element_status structure, fields from
 		 * the beginning to the field of ces_scsi_lun are common
 		 * between SCSI-2 and SCSI-3, while all the rest are new
 		 * from SCSI-3. In order to maintain backward compatibility
 		 * of the chio command, the ces pointer, below, is computed
 		 * such that it lines up with the structure boundary
 		 * corresponding to the SCSI version.
 		 */
 		ces = cmd == OCHIOGSTATUS ?
 		    (struct changer_element_status *)
 		    ((unsigned char *)user_data + i *
 		     (offsetof(struct changer_element_status,ces_scsi_lun)+1)):
 		    &user_data[i];
 
 		copy_element_status(softc, pg_hdr->flags, desc,
 				    ces, scsi_version);
 
 		/* Advance by the device-reported descriptor length. */
 		desc = (struct read_element_status_descriptor *)
 		       ((unsigned char *)desc + desclen);
 	}
 
 	/* Copy element status structures out to userspace. */
 	if (cmd == OCHIOGSTATUS)
 		error = copyout(user_data,
 				cesr->cesr_element_status,
 				avail* (offsetof(struct changer_element_status,
 				ces_scsi_lun) + 1));
 	else
 		error = copyout(user_data,
 				cesr->cesr_element_status,
 				avail * sizeof(struct changer_element_status));
 
 	/* Re-take the lock so every path arrives at "done" with it held. */
 	cam_periph_lock(periph);
 
  done:
 	xpt_release_ccb(ccb);
 
 	if (data != NULL)
 		free(data, M_DEVBUF);
 	if (user_data != NULL)
 		free(user_data, M_DEVBUF);
 
 	return (error);
 }
 
 static int
 chielem(struct cam_periph *periph,
 	unsigned int timeout)
 {
 	union ccb *ccb;
 	struct ch_softc *softc;
 	int error;
 
 	if (!timeout) {
 		timeout = CH_TIMEOUT_INITIALIZE_ELEMENT_STATUS;
 	} else {
 		timeout *= 1000;
 	}
 
 	error = 0;
 	softc = (struct ch_softc *)periph->softc;
 
 	ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL);
 
 	scsi_initialize_element_status(&ccb->csio,
 				      /* retries */ 1,
 				      /* cbfcnp */ chdone,
 				      /* tag_action */ MSG_SIMPLE_Q_TAG,
 				      /* sense_len */ SSD_FULL_SIZE,
 				      /* timeout */ timeout);
 
 	error = cam_periph_runccb(ccb, cherror, /*cam_flags*/ CAM_RETRY_SELTO,
 				  /*sense_flags*/ SF_RETRY_UA,
 				  softc->device_stats);
 
 	xpt_release_ccb(ccb);
 
 	return(error);
 }
 
 static int
 chsetvoltag(struct cam_periph *periph,
 	    struct changer_set_voltag_request *csvr)
 {
 	union ccb *ccb;
 	struct ch_softc *softc;
 	u_int16_t ea;
 	u_int8_t sac;
 	struct scsi_send_volume_tag_parameters ssvtp;
 	int error;
 	int i;
 
 	error = 0;
 	softc = (struct ch_softc *)periph->softc;
 
 	bzero(&ssvtp, sizeof(ssvtp));
 	for (i=0; i<sizeof(ssvtp.vitf); i++) {
 		ssvtp.vitf[i] = ' ';
 	}
 
 	/*
 	 * Check arguments.
 	 */
 	if (csvr->csvr_type > CHET_DT)
 		return EINVAL;
 	if (csvr->csvr_addr > (softc->sc_counts[csvr->csvr_type] - 1))
 		return ENODEV;
 
 	ea = softc->sc_firsts[csvr->csvr_type] + csvr->csvr_addr;
 
 	if (csvr->csvr_flags & CSVR_ALTERNATE) {
 		switch (csvr->csvr_flags & CSVR_MODE_MASK) {
 		case CSVR_MODE_SET:
 			sac = SEND_VOLUME_TAG_ASSERT_ALTERNATE;
 			break;
 		case CSVR_MODE_REPLACE:
 			sac = SEND_VOLUME_TAG_REPLACE_ALTERNATE;
 			break;
 		case CSVR_MODE_CLEAR:
 			sac = SEND_VOLUME_TAG_UNDEFINED_ALTERNATE;
 			break;
 		default:
 			error = EINVAL;
 			goto out;
 		}
 	} else {
 		switch (csvr->csvr_flags & CSVR_MODE_MASK) {
 		case CSVR_MODE_SET:
 			sac = SEND_VOLUME_TAG_ASSERT_PRIMARY;
 			break;
 		case CSVR_MODE_REPLACE:
 			sac = SEND_VOLUME_TAG_REPLACE_PRIMARY;
 			break;
 		case CSVR_MODE_CLEAR:
 			sac = SEND_VOLUME_TAG_UNDEFINED_PRIMARY;
 			break;
 		default:
 			error = EINVAL;
 			goto out;
 		}
 	}
 
 	memcpy(ssvtp.vitf, csvr->csvr_voltag.cv_volid,
 	       min(strlen(csvr->csvr_voltag.cv_volid), sizeof(ssvtp.vitf)));
 	scsi_ulto2b(csvr->csvr_voltag.cv_serial, ssvtp.minvsn);
 
 	ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL);
 
 	scsi_send_volume_tag(&ccb->csio,
 			     /* retries */ 1,
 			     /* cbfcnp */ chdone,
 			     /* tag_action */ MSG_SIMPLE_Q_TAG,
 			     /* element_address */ ea,
 			     /* send_action_code */ sac,
 			     /* parameters */ &ssvtp,
 			     /* sense_len */ SSD_FULL_SIZE,
 			     /* timeout */ CH_TIMEOUT_SEND_VOLTAG);
 	
 	error = cam_periph_runccb(ccb, cherror, /*cam_flags*/ CAM_RETRY_SELTO,
 				  /*sense_flags*/ SF_RETRY_UA,
 				  softc->device_stats);
 
 	xpt_release_ccb(ccb);
 
  out:
 	return error;
 }
 
 static int
 chgetparams(struct cam_periph *periph)
 {
 	union ccb *ccb;
 	struct ch_softc *softc;
 	void *mode_buffer;
 	int mode_buffer_len;
 	struct page_element_address_assignment *ea;
 	struct page_device_capabilities *cap;
 	int error, from, dbd;
 	u_int8_t *moves, *exchanges;
 
 	error = 0;
 
 	softc = (struct ch_softc *)periph->softc;
 
 	ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL);
 
 	/*
 	 * The scsi_mode_sense_data structure is just a convenience
 	 * structure that allows us to easily calculate the worst-case
 	 * storage size of the mode sense buffer.
 	 */
 	mode_buffer_len = sizeof(struct scsi_mode_sense_data);
 
 	mode_buffer = malloc(mode_buffer_len, M_SCSICH, M_NOWAIT);
 
 	if (mode_buffer == NULL) {
 		printf("chgetparams: couldn't malloc mode sense data\n");
 		xpt_release_ccb(ccb);
 		return(ENOSPC);
 	}
 
 	bzero(mode_buffer, mode_buffer_len);
 
 	if (softc->quirks & CH_Q_NO_DBD)
 		dbd = FALSE;
 	else
 		dbd = TRUE;
 
 	/*
 	 * Get the element address assignment page.
 	 */
 	scsi_mode_sense(&ccb->csio,
 			/* retries */ 1,
 			/* cbfcnp */ chdone,
 			/* tag_action */ MSG_SIMPLE_Q_TAG,
 			/* dbd */ dbd,
 			/* pc */ SMS_PAGE_CTRL_CURRENT,
 			/* page */ CH_ELEMENT_ADDR_ASSIGN_PAGE,
 			/* param_buf */ (u_int8_t *)mode_buffer,
 			/* param_len */ mode_buffer_len,
 			/* sense_len */ SSD_FULL_SIZE,
 			/* timeout */ CH_TIMEOUT_MODE_SENSE);
 
 	error = cam_periph_runccb(ccb, cherror, /*cam_flags*/ CAM_RETRY_SELTO,
 				  /* sense_flags */ SF_RETRY_UA|SF_NO_PRINT,
 				  softc->device_stats);
 
 	if (error) {
 		if (dbd) {
 			struct scsi_mode_sense_6 *sms;
 
 			sms = (struct scsi_mode_sense_6 *)
 				ccb->csio.cdb_io.cdb_bytes;
 
 			sms->byte2 &= ~SMS_DBD;
 			error = cam_periph_runccb(ccb, cherror,
 						  /*cam_flags*/ CAM_RETRY_SELTO,
 				  		  /*sense_flags*/ SF_RETRY_UA,
 						  softc->device_stats);
 		} else {
 			/*
 			 * Since we disabled sense printing above, print
 			 * out the sense here since we got an error.
 			 */
 			scsi_sense_print(&ccb->csio);
 		}
 
 		if (error) {
 			xpt_print(periph->path,
 			    "chgetparams: error getting element "
 			    "address page\n");
 			xpt_release_ccb(ccb);
 			free(mode_buffer, M_SCSICH);
 			return(error);
 		}
 	}
 
 	ea = (struct page_element_address_assignment *)
 		find_mode_page_6((struct scsi_mode_header_6 *)mode_buffer);
 
 	softc->sc_firsts[CHET_MT] = scsi_2btoul(ea->mtea);
 	softc->sc_counts[CHET_MT] = scsi_2btoul(ea->nmte);
 	softc->sc_firsts[CHET_ST] = scsi_2btoul(ea->fsea);
 	softc->sc_counts[CHET_ST] = scsi_2btoul(ea->nse);
 	softc->sc_firsts[CHET_IE] = scsi_2btoul(ea->fieea);
 	softc->sc_counts[CHET_IE] = scsi_2btoul(ea->niee);
 	softc->sc_firsts[CHET_DT] = scsi_2btoul(ea->fdtea);
 	softc->sc_counts[CHET_DT] = scsi_2btoul(ea->ndte);
 
 	bzero(mode_buffer, mode_buffer_len);
 
 	/*
 	 * Now get the device capabilities page.
 	 */
 	scsi_mode_sense(&ccb->csio,
 			/* retries */ 1,
 			/* cbfcnp */ chdone,
 			/* tag_action */ MSG_SIMPLE_Q_TAG,
 			/* dbd */ dbd,
 			/* pc */ SMS_PAGE_CTRL_CURRENT,
 			/* page */ CH_DEVICE_CAP_PAGE,
 			/* param_buf */ (u_int8_t *)mode_buffer,
 			/* param_len */ mode_buffer_len,
 			/* sense_len */ SSD_FULL_SIZE,
 			/* timeout */ CH_TIMEOUT_MODE_SENSE);
 	
 	error = cam_periph_runccb(ccb, cherror, /*cam_flags*/ CAM_RETRY_SELTO,
 				  /* sense_flags */ SF_RETRY_UA | SF_NO_PRINT,
 				  softc->device_stats);
 
 	if (error) {
 		if (dbd) {
 			struct scsi_mode_sense_6 *sms;
 
 			sms = (struct scsi_mode_sense_6 *)
 				ccb->csio.cdb_io.cdb_bytes;
 
 			sms->byte2 &= ~SMS_DBD;
 			error = cam_periph_runccb(ccb, cherror,
 						  /*cam_flags*/ CAM_RETRY_SELTO,
 				  		  /*sense_flags*/ SF_RETRY_UA,
 						  softc->device_stats);
 		} else {
 			/*
 			 * Since we disabled sense printing above, print
 			 * out the sense here since we got an error.
 			 */
 			scsi_sense_print(&ccb->csio);
 		}
 
 		if (error) {
 			xpt_print(periph->path,
 			    "chgetparams: error getting device "
 			    "capabilities page\n");
 			xpt_release_ccb(ccb);
 			free(mode_buffer, M_SCSICH);
 			return(error);
 		}
 	}
 
 	xpt_release_ccb(ccb);
 
 	cap = (struct page_device_capabilities *)
 		find_mode_page_6((struct scsi_mode_header_6 *)mode_buffer);
 
 	bzero(softc->sc_movemask, sizeof(softc->sc_movemask));
 	bzero(softc->sc_exchangemask, sizeof(softc->sc_exchangemask));
 	moves = cap->move_from;
 	exchanges = cap->exchange_with;
 	for (from = CHET_MT; from <= CHET_MAX; ++from) {
 		softc->sc_movemask[from] = moves[from];
 		softc->sc_exchangemask[from] = exchanges[from];
 	}
 
 	free(mode_buffer, M_SCSICH);
 
 	return(error);
 }
 
 /*
  * Return the version byte from the device's cached inquiry data, obtained
  * through an XPT_GDEV_TYPE request, or -1 if a CCB cannot be allocated or
  * the request does not complete with CAM_REQ_CMP.
  *
  * Asserts that the periph lock is owned.
  */
 static int
 chscsiversion(struct cam_periph *periph)
 {
 	struct ccb_getdev *gdev;
 	int version;
 
 	cam_periph_assert(periph, MA_OWNED);
 
 	gdev = (struct ccb_getdev *)xpt_alloc_ccb_nowait();
 	if (gdev == NULL)
 		return (-1);
 
 	/* Ask the transport layer for the device information. */
 	xpt_setup_ccb(&gdev->ccb_h,
 		      periph->path,
 		      CAM_PRIORITY_NORMAL);
 	gdev->ccb_h.func_code = XPT_GDEV_TYPE;
 	xpt_action((union ccb *)gdev);
 
 	if (gdev->ccb_h.status == CAM_REQ_CMP)
 		version = gdev->inq_data.version;
 	else
 		version = -1;
 
 	xpt_free_ccb((union ccb *)gdev);
 
 	return (version);
 }
 
 /*
  * Build a MOVE MEDIUM command in csio.
  *
  * tea, src and dst are stored as 2-byte fields in the CDB; a non-zero
  * invert sets MOVE_MEDIUM_INVERT.  The command carries no data
  * (CAM_DIR_NONE).
  */
 void
 scsi_move_medium(struct ccb_scsiio *csio, u_int32_t retries,
 		 void (*cbfcnp)(struct cam_periph *, union ccb *),
 		 u_int8_t tag_action, u_int32_t tea, u_int32_t src,
 		 u_int32_t dst, int invert, u_int8_t sense_len,
 		 u_int32_t timeout)
 {
 	struct scsi_move_medium *cdb =
 	    (struct scsi_move_medium *)&csio->cdb_io.cdb_bytes;
 
 	/* Start from an all-zero CDB, then fill in the used fields. */
 	bzero(cdb, sizeof(*cdb));
 	cdb->opcode = MOVE_MEDIUM;
 	if (invert)
 		cdb->invert |= MOVE_MEDIUM_INVERT;
 	scsi_ulto2b(tea, cdb->tea);
 	scsi_ulto2b(src, cdb->src);
 	scsi_ulto2b(dst, cdb->dst);
 
 	cam_fill_csio(csio, retries, cbfcnp,
 		      /*flags*/ CAM_DIR_NONE,
 		      tag_action,
 		      /*data_ptr*/ NULL,
 		      /*dxfer_len*/ 0,
 		      sense_len, sizeof(*cdb), timeout);
 }
 
 /*
  * Build an EXCHANGE MEDIUM command in csio.
  *
  * tea, src, dst1 and dst2 are stored as 2-byte fields (dst1 in fdst,
  * dst2 in sdst).  Non-zero invert1/invert2 set EXCHANGE_MEDIUM_INV1 and
  * EXCHANGE_MEDIUM_INV2 respectively.  The command carries no data
  * (CAM_DIR_NONE).
  */
 void
 scsi_exchange_medium(struct ccb_scsiio *csio, u_int32_t retries,
 		     void (*cbfcnp)(struct cam_periph *, union ccb *),
 		     u_int8_t tag_action, u_int32_t tea, u_int32_t src,
 		     u_int32_t dst1, u_int32_t dst2, int invert1,
 		     int invert2, u_int8_t sense_len, u_int32_t timeout)
 {
 	struct scsi_exchange_medium *cdb =
 	    (struct scsi_exchange_medium *)&csio->cdb_io.cdb_bytes;
 
 	/* Start from an all-zero CDB, then fill in the used fields. */
 	bzero(cdb, sizeof(*cdb));
 	cdb->opcode = EXCHANGE_MEDIUM;
 
 	if (invert1)
 		cdb->invert |= EXCHANGE_MEDIUM_INV1;
 	if (invert2)
 		cdb->invert |= EXCHANGE_MEDIUM_INV2;
 
 	scsi_ulto2b(tea, cdb->tea);
 	scsi_ulto2b(src, cdb->src);
 	scsi_ulto2b(dst1, cdb->fdst);
 	scsi_ulto2b(dst2, cdb->sdst);
 
 	cam_fill_csio(csio, retries, cbfcnp,
 		      /*flags*/ CAM_DIR_NONE,
 		      tag_action,
 		      /*data_ptr*/ NULL,
 		      /*dxfer_len*/ 0,
 		      sense_len, sizeof(*cdb), timeout);
 }
 
 /*
  * Build a POSITION TO ELEMENT command in csio.
  *
  * tea and dst are stored as 2-byte fields; a non-zero invert sets
  * POSITION_TO_ELEMENT_INVERT.  The command carries no data
  * (CAM_DIR_NONE).
  */
 void
 scsi_position_to_element(struct ccb_scsiio *csio, u_int32_t retries,
 			 void (*cbfcnp)(struct cam_periph *, union ccb *),
 			 u_int8_t tag_action, u_int32_t tea, u_int32_t dst,
 			 int invert, u_int8_t sense_len, u_int32_t timeout)
 {
 	struct scsi_position_to_element *cdb =
 	    (struct scsi_position_to_element *)&csio->cdb_io.cdb_bytes;
 
 	/* Start from an all-zero CDB, then fill in the used fields. */
 	bzero(cdb, sizeof(*cdb));
 	cdb->opcode = POSITION_TO_ELEMENT;
 	if (invert)
 		cdb->invert |= POSITION_TO_ELEMENT_INVERT;
 	scsi_ulto2b(tea, cdb->tea);
 	scsi_ulto2b(dst, cdb->dst);
 
 	cam_fill_csio(csio, retries, cbfcnp,
 		      /*flags*/ CAM_DIR_NONE,
 		      tag_action,
 		      /*data_ptr*/ NULL,
 		      /*dxfer_len*/ 0,
 		      sense_len, sizeof(*cdb), timeout);
 }
 
 /*
  * Build a READ ELEMENT STATUS command in csio.
  *
  * sea (starting element address) and count are stored as 2-byte fields
  * and dxfer_len as the 3-byte length field.  Non-zero voltag, dvcid and
  * curdata set READ_ELEMENT_STATUS_VOLTAG in byte2 and
  * READ_ELEMENT_STATUS_DVCID / READ_ELEMENT_STATUS_CURDATA in flags.
  * Data is transferred from the device into data_ptr (CAM_DIR_IN).
  */
 void
 scsi_read_element_status(struct ccb_scsiio *csio, u_int32_t retries,
 			 void (*cbfcnp)(struct cam_periph *, union ccb *),
 			 u_int8_t tag_action, int voltag, u_int32_t sea,
 			 int curdata, int dvcid,
 			 u_int32_t count, u_int8_t *data_ptr,
 			 u_int32_t dxfer_len, u_int8_t sense_len,
 			 u_int32_t timeout)
 {
 	struct scsi_read_element_status *cdb =
 	    (struct scsi_read_element_status *)&csio->cdb_io.cdb_bytes;
 
 	/* Start from an all-zero CDB, then fill in the used fields. */
 	bzero(cdb, sizeof(*cdb));
 	cdb->opcode = READ_ELEMENT_STATUS;
 
 	/* Option bits. */
 	if (voltag)
 		cdb->byte2 |= READ_ELEMENT_STATUS_VOLTAG;
 	if (curdata)
 		cdb->flags |= READ_ELEMENT_STATUS_CURDATA;
 	if (dvcid)
 		cdb->flags |= READ_ELEMENT_STATUS_DVCID;
 
 	/* Element range and allocation length. */
 	scsi_ulto2b(sea, cdb->sea);
 	scsi_ulto2b(count, cdb->count);
 	scsi_ulto3b(dxfer_len, cdb->len);
 
 	cam_fill_csio(csio, retries, cbfcnp,
 		      /*flags*/ CAM_DIR_IN,
 		      tag_action,
 		      data_ptr,
 		      dxfer_len,
 		      sense_len, sizeof(*cdb), timeout);
 }
 
 /*
  * Build an INITIALIZE ELEMENT STATUS command in csio.  Only the opcode is
  * set in the otherwise zeroed CDB, and the command carries no data
  * (CAM_DIR_NONE).
  */
 void
 scsi_initialize_element_status(struct ccb_scsiio *csio, u_int32_t retries,
 			       void (*cbfcnp)(struct cam_periph *, union ccb *),
 			       u_int8_t tag_action, u_int8_t sense_len,
 			       u_int32_t timeout)
 {
 	struct scsi_initialize_element_status *cdb =
 	    (struct scsi_initialize_element_status *)&csio->cdb_io.cdb_bytes;
 
 	bzero(cdb, sizeof(*cdb));
 	cdb->opcode = INITIALIZE_ELEMENT_STATUS;
 
 	cam_fill_csio(csio, retries, cbfcnp,
 		      /*flags*/ CAM_DIR_NONE,
 		      tag_action,
 		      /* data_ptr */ NULL,
 		      /* dxfer_len */ 0,
 		      sense_len, sizeof(*cdb), timeout);
 }
 
 /*
  * Build a SEND VOLUME TAG command in csio.
  *
  * element_address is stored as a 2-byte field, send_action_code goes in
  * the sac field, and the parameter list length is set to
  * sizeof(*parameters).  The parameters structure is sent to the device
  * (CAM_DIR_OUT).
  */
 void
 scsi_send_volume_tag(struct ccb_scsiio *csio, u_int32_t retries,
 		     void (*cbfcnp)(struct cam_periph *, union ccb *),
 		     u_int8_t tag_action,
 		     u_int16_t element_address,
 		     u_int8_t send_action_code,
 		     struct scsi_send_volume_tag_parameters *parameters,
 		     u_int8_t sense_len, u_int32_t timeout)
 {
 	struct scsi_send_volume_tag *cdb =
 	    (struct scsi_send_volume_tag *)&csio->cdb_io.cdb_bytes;
 
 	/* Start from an all-zero CDB, then fill in the used fields. */
 	bzero(cdb, sizeof(*cdb));
 	cdb->opcode = SEND_VOLUME_TAG;
 	cdb->sac = send_action_code;
 	scsi_ulto2b(element_address, cdb->ea);
 	scsi_ulto2b(sizeof(*parameters), cdb->pll);
 
 	cam_fill_csio(csio, retries, cbfcnp,
 		      /*flags*/ CAM_DIR_OUT,
 		      tag_action,
 		      /* data_ptr */ (u_int8_t *) parameters,
 		      sizeof(*parameters),
 		      sense_len, sizeof(*cdb), timeout);
 }
Index: head/sys/dev/mps/mps.c
===================================================================
--- head/sys/dev/mps/mps.c	(revision 328237)
+++ head/sys/dev/mps/mps.c	(revision 328238)
@@ -1,3055 +1,3055 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2009 Yahoo! Inc.
  * Copyright (c) 2011-2015 LSI Corp.
  * Copyright (c) 2013-2015 Avago Technologies
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * Avago Technologies (LSI) MPT-Fusion Host Adapter FreeBSD
  *
  * $FreeBSD$
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /* Communications core for Avago Technologies (LSI) MPT2 */
 
 /* TODO Move headers to mpsvar */
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/selinfo.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/module.h>
 #include <sys/bus.h>
 #include <sys/conf.h>
 #include <sys/bio.h>
 #include <sys/malloc.h>
 #include <sys/uio.h>
 #include <sys/sysctl.h>
 #include <sys/smp.h>
 #include <sys/queue.h>
 #include <sys/kthread.h>
 #include <sys/taskqueue.h>
 #include <sys/endian.h>
 #include <sys/eventhandler.h>
 #include <sys/sbuf.h>
 
 #include <machine/bus.h>
 #include <machine/resource.h>
 #include <sys/rman.h>
 #include <sys/proc.h>
 
 #include <dev/pci/pcivar.h>
 
 #include <cam/cam.h>
 #include <cam/scsi/scsi_all.h>
 
 #include <dev/mps/mpi/mpi2_type.h>
 #include <dev/mps/mpi/mpi2.h>
 #include <dev/mps/mpi/mpi2_ioc.h>
 #include <dev/mps/mpi/mpi2_sas.h>
 #include <dev/mps/mpi/mpi2_cnfg.h>
 #include <dev/mps/mpi/mpi2_init.h>
 #include <dev/mps/mpi/mpi2_tool.h>
 #include <dev/mps/mps_ioctl.h>
 #include <dev/mps/mpsvar.h>
 #include <dev/mps/mps_table.h>
 
 static int mps_diag_reset(struct mps_softc *sc, int sleep_flag);
 static int mps_init_queues(struct mps_softc *sc);
 static void mps_resize_queues(struct mps_softc *sc);
 static int mps_message_unit_reset(struct mps_softc *sc, int sleep_flag);
 static int mps_transition_operational(struct mps_softc *sc);
 static int mps_iocfacts_allocate(struct mps_softc *sc, uint8_t attaching);
 static void mps_iocfacts_free(struct mps_softc *sc);
 static void mps_startup(void *arg);
 static int mps_send_iocinit(struct mps_softc *sc);
 static int mps_alloc_queues(struct mps_softc *sc);
 static int mps_alloc_hw_queues(struct mps_softc *sc);
 static int mps_alloc_replies(struct mps_softc *sc);
 static int mps_alloc_requests(struct mps_softc *sc);
 static int mps_attach_log(struct mps_softc *sc);
 static __inline void mps_complete_command(struct mps_softc *sc,
     struct mps_command *cm);
 static void mps_dispatch_event(struct mps_softc *sc, uintptr_t data,
     MPI2_EVENT_NOTIFICATION_REPLY *reply);
 static void mps_config_complete(struct mps_softc *sc, struct mps_command *cm);
 static void mps_periodic(void *);
 static int mps_reregister_events(struct mps_softc *sc);
 static void mps_enqueue_request(struct mps_softc *sc, struct mps_command *cm);
 static int mps_get_iocfacts(struct mps_softc *sc, MPI2_IOC_FACTS_REPLY *facts);
 static int mps_wait_db_ack(struct mps_softc *sc, int timeout, int sleep_flag);
 static int mps_debug_sysctl(SYSCTL_HANDLER_ARGS);
 static void mps_parse_debug(struct mps_softc *sc, char *list);
 
 SYSCTL_NODE(_hw, OID_AUTO, mps, CTLFLAG_RD, 0, "MPS Driver Parameters");
 
 MALLOC_DEFINE(M_MPT2, "mps", "mpt2 driver memory");
 
 /*
  * Do a "Diagnostic Reset" aka a hard reset.  This should get the chip out of
  * any state and back to its initialization state machine.
  */
 static char mpt2_reset_magic[] = { 0x00, 0x0f, 0x04, 0x0b, 0x02, 0x07, 0x0d };
 
 /* This union exists so that cm->cm_desc.Words can be passed to le64toh().
  * The compiler only accepts a uint64_t as the argument; passing the
  * aggregate directly fails with the error
  * "aggregate value used where an integer was expected"
  */
 
 typedef union _reply_descriptor {
         /* The whole descriptor as a single 64-bit value. */
         u64 word;
         /* The same storage viewed as two 32-bit fields. */
         struct {
                 u32 low;
                 u32 high;
         } u;
 }reply_descriptor,address_descriptor;
 
 /* Rate limit chain-fail messages to 1 per minute */
 static struct timeval mps_chainfail_interval = { 60, 0 };
 
 /*
  * sleep_flag can be either CAN_SLEEP or NO_SLEEP.
  * If this function is called from process context, it can sleep
  * and there is no harm in sleeping.  If this function is called
  * from an interrupt handler, it cannot sleep and needs NO_SLEEP set.
  * Based on the sleep flag the driver calls msleep, pause or DELAY.
  * msleep and pause are variants of the same wait, but pause is used
  * when mps_mtx is not held by the driver.
  *
  */
 /*
  * Wait for one polling interval of the diag reset sequence.
  *
  * When sleep_flag is CAN_SLEEP the wait is timo ticks, using msleep() if
  * mps_mtx is owned and pause() otherwise.  For any other sleep_flag the
  * wait is a DELAY() of usec microseconds.
  */
 static void
 mps_diag_wait(struct mps_softc *sc, int sleep_flag, int timo, int usec)
 {
 	if (sleep_flag != CAN_SLEEP)
 		DELAY(usec);
 	else if (mtx_owned(&sc->mps_mtx))
 		msleep(&sc->msleep_fake_chan, &sc->mps_mtx, 0,
 		    "mpsdiag", timo);
 	else
 		pause("mpsdiag", timo);
 }
 
 /*
  * Perform a diagnostic (hard) reset of the adapter.
  *
  * Writes the magic unlock sequence until the diagnostic register reports
  * write-enable (up to 20 attempts), sets the reset-adapter bit, then polls
  * until that bit clears and the doorbell leaves the RESET state.
  *
  * Returns 0 on success or ETIMEDOUT if either phase does not complete.
  */
 static int
 mps_diag_reset(struct mps_softc *sc, int sleep_flag)
 {
 	uint32_t reg;
 	int attempt, error, i;
 
 	mps_dprint(sc, MPS_INIT, "%s entered\n", __func__);
 
 	/* Clear any pending interrupts */
 	mps_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0);
 
 	/*
 	 * Threads that are not allowed to sleep (for example one running
 	 * from an interrupt handler) are forced to NO_SLEEP.
 	 */
 	if (curthread->td_no_sleeping != 0)
 		sleep_flag = NO_SLEEP;
 
 	mps_dprint(sc, MPS_INIT, "sequence start, sleep_flag= %d\n", sleep_flag);
 
 	/* Phase 1: push the magic sequence until writes are enabled. */
 	error = ETIMEDOUT;
 	for (attempt = 0; attempt < 20; attempt++) {
 		for (i = 0; i < sizeof(mpt2_reset_magic); i++)
 			mps_regwrite(sc, MPI2_WRITE_SEQUENCE_OFFSET,
 			    mpt2_reset_magic[i]);
 
 		/* wait 100 msec */
 		mps_diag_wait(sc, sleep_flag, hz/10, 100 * 1000);
 
 		reg = mps_regread(sc, MPI2_HOST_DIAGNOSTIC_OFFSET);
 		if (reg & MPI2_DIAG_DIAG_WRITE_ENABLE) {
 			error = 0;
 			break;
 		}
 	}
 	if (error) {
 		mps_dprint(sc, MPS_INIT, "sequence failed, error=%d, exit\n",
 		    error);
 		return (error);
 	}
 
 	/* Send the actual reset.  XXX need to refresh the reg? */
 	reg |= MPI2_DIAG_RESET_ADAPTER;
 	mps_dprint(sc, MPS_INIT, "sequence success, sending reset, reg= 0x%x\n",
 		reg);
 	mps_regwrite(sc, MPI2_HOST_DIAGNOSTIC_OFFSET, reg);
 
 	/* Phase 2: wait up to 300 seconds in 50ms intervals */
 	error = ETIMEDOUT;
 	for (i = 0; i < 6000; i++) {
 		/*
 		 * The very first wait is a fixed 256 msec busy delay to
 		 * satisfy Diag Reset timing requirements; every later wait
 		 * is 50 msec.
 		 */
 		if (i == 0)
 			DELAY(256 * 1000);
 		else
 			mps_diag_wait(sc, sleep_flag, hz/20, 50 * 1000);
 
 		/*
 		 * The RESET_ADAPTER bit has to clear first; the RESET state
 		 * in the doorbell takes a little longer to go away.
 		 */
 		reg = mps_regread(sc, MPI2_HOST_DIAGNOSTIC_OFFSET);
 		if (reg & MPI2_DIAG_RESET_ADAPTER)
 			continue;
 
 		reg = mps_regread(sc, MPI2_DOORBELL_OFFSET);
 		if ((reg & MPI2_IOC_STATE_MASK) != MPI2_IOC_STATE_RESET) {
 			error = 0;
 			break;
 		}
 	}
 	if (error) {
 		mps_dprint(sc, MPS_INIT, "reset failed, error= %d, exit\n",
 		    error);
 		return (error);
 	}
 
 	mps_regwrite(sc, MPI2_WRITE_SEQUENCE_OFFSET, 0x0);
 	mps_dprint(sc, MPS_INIT, "diag reset success, exit\n");
 
 	return (0);
 }
 
 /*
  * Request a Message Unit Reset through the doorbell register and wait for
  * the doorbell acknowledgement.
  *
  * Returns 0 on success or ETIMEDOUT if mps_wait_db_ack() reports failure.
  */
 static int
 mps_message_unit_reset(struct mps_softc *sc, int sleep_flag)
 {
 	int rv = 0;
 
 	MPS_FUNCTRACE(sc);
 
 	mps_dprint(sc, MPS_INIT, "%s entered\n", __func__);
 
 	/* Post the reset function code in the doorbell. */
 	mps_regwrite(sc, MPI2_DOORBELL_OFFSET,
 	    MPI2_FUNCTION_IOC_MESSAGE_UNIT_RESET <<
 	    MPI2_DOORBELL_FUNCTION_SHIFT);
 
 	if (mps_wait_db_ack(sc, 5, sleep_flag) != 0) {
 		mps_dprint(sc, MPS_INIT|MPS_FAULT,
 		    "Doorbell handshake failed\n");
 		rv = ETIMEDOUT;
 	}
 
 	mps_dprint(sc, MPS_INIT, "%s exit\n", __func__);
 	return (rv);
 }
 
 static int
 mps_transition_ready(struct mps_softc *sc)
 {
 	uint32_t reg, state;
 	int error, tries = 0;
 	int sleep_flags;
 
 	MPS_FUNCTRACE(sc);
 	/* If we are in attach call, do not sleep */
 	sleep_flags = (sc->mps_flags & MPS_FLAGS_ATTACH_DONE)
 					? CAN_SLEEP:NO_SLEEP;
 	error = 0;
 
 	mps_dprint(sc, MPS_INIT, "%s entered, sleep_flags= %d\n",
 	   __func__, sleep_flags);
 
 	while (tries++ < 1200) {
 		reg = mps_regread(sc, MPI2_DOORBELL_OFFSET);
 		mps_dprint(sc, MPS_INIT, "  Doorbell= 0x%x\n", reg);
 
 		/*
 		 * Ensure the IOC is ready to talk.  If it's not, try
 		 * resetting it.
 		 */
 		if (reg & MPI2_DOORBELL_USED) {
 			mps_dprint(sc, MPS_INIT, "  Not ready, sending diag "
 			    "reset\n");
 			mps_diag_reset(sc, sleep_flags);
 			DELAY(50000);
 			continue;
 		}
 
 		/* Is the adapter owned by another peer? */
 		if ((reg & MPI2_DOORBELL_WHO_INIT_MASK) ==
 		    (MPI2_WHOINIT_PCI_PEER << MPI2_DOORBELL_WHO_INIT_SHIFT)) {
 			mps_dprint(sc, MPS_INIT|MPS_FAULT, "IOC is under the "
 			    "control of another peer host, aborting "
 			    "initialization.\n");
 			error = ENXIO;
 			break;
 		}
 		
 		state = reg & MPI2_IOC_STATE_MASK;
 		if (state == MPI2_IOC_STATE_READY) {
 			/* Ready to go! */
 			error = 0;
 			break;
 		} else if (state == MPI2_IOC_STATE_FAULT) {
 			mps_dprint(sc, MPS_INIT|MPS_FAULT, "IOC in fault "
 			    "state 0x%x, resetting\n",
 			    state & MPI2_DOORBELL_FAULT_CODE_MASK);
 			mps_diag_reset(sc, sleep_flags);
 		} else if (state == MPI2_IOC_STATE_OPERATIONAL) {
 			/* Need to take ownership */
 			mps_message_unit_reset(sc, sleep_flags);
 		} else if (state == MPI2_IOC_STATE_RESET) {
 			/* Wait a bit, IOC might be in transition */
 			mps_dprint(sc, MPS_INIT|MPS_FAULT,
 			    "IOC in unexpected reset state\n");
 		} else {
 			mps_dprint(sc, MPS_INIT|MPS_FAULT,
 			    "IOC in unknown state 0x%x\n", state);
 			error = EINVAL;
 			break;
 		}
 	
 		/* Wait 50ms for things to settle down. */
 		DELAY(50000);
 	}
 
 	if (error)
 		mps_dprint(sc, MPS_INIT|MPS_FAULT,
 		    "Cannot transition IOC to ready\n");
 	mps_dprint(sc, MPS_INIT, "%s exit\n", __func__);
 
 	return (error);
 }
 
 /*
  * Move the IOC to the operational state by sending it an IOC Init request.
  * If the doorbell does not currently report READY, the IOC is first taken
  * through mps_transition_ready().
  *
  * Returns 0 on success, otherwise the error from mps_transition_ready() or
  * mps_send_iocinit().
  */
 static int
 mps_transition_operational(struct mps_softc *sc)
 {
 	uint32_t doorbell;
 	int rc;
 
 	MPS_FUNCTRACE(sc);
 
 	doorbell = mps_regread(sc, MPI2_DOORBELL_OFFSET);
 	mps_dprint(sc, MPS_INIT, "%s entered, Doorbell= 0x%x\n", __func__,
 	    doorbell);
 
 	if ((doorbell & MPI2_IOC_STATE_MASK) != MPI2_IOC_STATE_READY) {
 		mps_dprint(sc, MPS_INIT, "IOC not ready\n");
 		rc = mps_transition_ready(sc);
 		if (rc != 0) {
 			mps_dprint(sc, MPS_INIT|MPS_FAULT,
 			    "failed to transition ready, exit\n");
 			return (rc);
 		}
 	}
 
 	rc = mps_send_iocinit(sc);
 	mps_dprint(sc, MPS_INIT, "%s exit\n", __func__);
 
 	return (rc);
 }
 
 /*
  * Derive the request, reply and interrupt-queue counts from the configured
  * limits in the softc and the credits advertised in IOC Facts, and store
  * them in sc->num_reqs, sc->num_replies and sc->msi_msgs.
  */
 static void
 mps_resize_queues(struct mps_softc *sc)
 {
 	int io_credit, hipri_credit;
 
 	/*
 	 * Request frames: at least 2 regular and 1 high-priority frame,
 	 * each capped by the matching credit from IOC Facts.
 	 */
 	io_credit = MAX(2, sc->max_reqframes);
 	io_credit = MIN(io_credit, sc->facts->RequestCredit);
 
 	hipri_credit = MAX(1, sc->max_prireqframes);
 	hipri_credit = MIN(hipri_credit, sc->facts->HighPriorityCredit);
 
 	sc->num_reqs = hipri_credit + io_credit;
 
 	/*
 	 * The reply queues always need one free entry, so one reply is
 	 * deducted here.  The LSI documents suggest adding a count to the
 	 * request queue instead, but deducting from the reply queue was
 	 * judged the better choice.
 	 */
 	sc->num_replies = MIN(sc->max_replyframes + sc->max_evtframes,
 	    sc->facts->MaxReplyDescriptorPostQueueDepth) - 1;
 
 	/*
 	 * Work out how many MSI-X based queues to use.  Multi-queue is
 	 * switched off when the firmware offers fewer than two vectors, or
 	 * when the credits leave fewer than two requests per queue.
 	 */
 	if (sc->facts->MaxMSIxVectors < 2) {
 		sc->msi_msgs = 1;
 	}
 
 	if (sc->msi_msgs > 1) {
 		sc->msi_msgs = MIN(sc->msi_msgs, mp_ncpus);
 		sc->msi_msgs = MIN(sc->msi_msgs, sc->facts->MaxMSIxVectors);
 		if (sc->num_reqs / sc->msi_msgs < 2) {
 			sc->msi_msgs = 1;
 		}
 	}
 
 	mps_dprint(sc, MPS_INIT, "Sized queues to q=%d reqs=%d replies=%d\n",
 	    sc->msi_msgs, sc->num_reqs, sc->num_replies);
 }
 
 /*
  * This is called during attach and when re-initializing due to a Diag Reset.
  * IOC Facts is used to allocate many of the structures needed by the driver.
  * If called from attach, de-allocation is not required because the driver has
  * not allocated any structures yet, but if called from a Diag Reset, previously
  * allocated structures based on IOC Facts will need to be freed and re-
  * allocated based on the latest IOC Facts.
  *
  * "attaching" is non-zero when called from attach and zero when called for a
  * re-initialization.
  *
  * Returns 0 on success.  On the failure paths below that call mps_free(), the
  * error is returned after that call; a failure to fetch IOC Facts during a
  * re-initialization panics instead of returning.
  */
 static int
 mps_iocfacts_allocate(struct mps_softc *sc, uint8_t attaching)
 {
 	int error;
 	Mpi2IOCFactsReply_t saved_facts;
 	uint8_t saved_mode, reallocating;
 
 	mps_dprint(sc, MPS_INIT|MPS_TRACE, "%s entered\n", __func__);
 
 	/* Save old IOC Facts and then only reallocate if Facts have changed */
 	if (!attaching) {
 		bcopy(sc->facts, &saved_facts, sizeof(MPI2_IOC_FACTS_REPLY));
 	}
 
 	/*
 	 * Get IOC Facts.  If this fails while attaching, return the error so
 	 * the OS can handle it; if it fails during a re-initialization,
 	 * panic.
 	 */
 	if ((error = mps_get_iocfacts(sc, sc->facts)) != 0) {
 		if (attaching) {
 			mps_dprint(sc, MPS_INIT|MPS_FAULT, "Failed to get "
 			    "IOC Facts with error %d, exit\n", error);
 			return (error);
 		} else {
 			panic("%s failed to get IOC Facts with error %d\n",
 			    __func__, error);
 		}
 	}
 
 	MPS_DPRINT_PAGE(sc, MPS_XINFO, iocfacts, sc->facts);
 
 	/* Cache a printable firmware version string in the softc. */
 	snprintf(sc->fw_version, sizeof(sc->fw_version), 
 	    "%02d.%02d.%02d.%02d", 
 	    sc->facts->FWVersion.Struct.Major,
 	    sc->facts->FWVersion.Struct.Minor,
 	    sc->facts->FWVersion.Struct.Unit,
 	    sc->facts->FWVersion.Struct.Dev);
 
 	mps_dprint(sc, MPS_INFO, "Firmware: %s, Driver: %s\n", sc->fw_version,
 	    MPS_DRIVER_VERSION);
 	mps_dprint(sc, MPS_INFO, "IOCCapabilities: %b\n",
 	     sc->facts->IOCCapabilities,
 	    "\20" "\3ScsiTaskFull" "\4DiagTrace" "\5SnapBuf" "\6ExtBuf"
 	    "\7EEDP" "\10BiDirTarg" "\11Multicast" "\14TransRetry" "\15IR"
 	    "\16EventReplay" "\17RaidAccel" "\20MSIXIndex" "\21HostDisc");
 
 	/*
 	 * If the chip doesn't support event replay then a hard reset will be
 	 * required to trigger a full discovery.  Do the reset here then
 	 * retransition to Ready.  A hard reset might have already been done,
 	 * but it doesn't hurt to do it again.  Only do this if attaching, not
 	 * for a Diag Reset.
 	 */
 	if (attaching && ((sc->facts->IOCCapabilities &
 	    MPI2_IOCFACTS_CAPABILITY_EVENT_REPLAY) == 0)) {
 		mps_dprint(sc, MPS_INIT, "No event replay, reseting\n");
 		mps_diag_reset(sc, NO_SLEEP);
 		if ((error = mps_transition_ready(sc)) != 0) {
 			mps_dprint(sc, MPS_INIT|MPS_FAULT, "Failed to "
 			    "transition to ready with error %d, exit\n",
 			    error);
 			return (error);
 		}
 	}
 
 	/*
 	 * Set flag if IR Firmware is loaded.  If the RAID Capability has
 	 * changed from the previous IOC Facts, log a warning, but only if
 	 * checking this after a Diag Reset and not during attach.
 	 *
 	 * NOTE(review): ir_firmware is only ever set to 1 here, never cleared,
 	 * so a change from IR to non-IR firmware does not look like it would
 	 * be noticed by the comparison below -- confirm whether that is
 	 * intended.
 	 */
 	saved_mode = sc->ir_firmware;
 	if (sc->facts->IOCCapabilities &
 	    MPI2_IOCFACTS_CAPABILITY_INTEGRATED_RAID)
 		sc->ir_firmware = 1;
 	if (!attaching) {
 		if (sc->ir_firmware != saved_mode) {
 			mps_dprint(sc, MPS_INIT|MPS_FAULT, "new IR/IT mode "
 			    "in IOC Facts does not match previous mode\n");
 		}
 	}
 
 	/* Only deallocate and reallocate if relevant IOC Facts have changed */
 	reallocating = FALSE;
 	sc->mps_flags &= ~MPS_FLAGS_REALLOCATED;
 
 	/* saved_facts is only valid (and only read) when not attaching. */
 	if ((!attaching) &&
 	    ((saved_facts.MsgVersion != sc->facts->MsgVersion) ||
 	    (saved_facts.HeaderVersion != sc->facts->HeaderVersion) ||
 	    (saved_facts.MaxChainDepth != sc->facts->MaxChainDepth) ||
 	    (saved_facts.RequestCredit != sc->facts->RequestCredit) ||
 	    (saved_facts.ProductID != sc->facts->ProductID) ||
 	    (saved_facts.IOCCapabilities != sc->facts->IOCCapabilities) ||
 	    (saved_facts.IOCRequestFrameSize !=
 	    sc->facts->IOCRequestFrameSize) ||
 	    (saved_facts.MaxTargets != sc->facts->MaxTargets) ||
 	    (saved_facts.MaxSasExpanders != sc->facts->MaxSasExpanders) ||
 	    (saved_facts.MaxEnclosures != sc->facts->MaxEnclosures) ||
 	    (saved_facts.HighPriorityCredit != sc->facts->HighPriorityCredit) ||
 	    (saved_facts.MaxReplyDescriptorPostQueueDepth !=
 	    sc->facts->MaxReplyDescriptorPostQueueDepth) ||
 	    (saved_facts.ReplyFrameSize != sc->facts->ReplyFrameSize) ||
 	    (saved_facts.MaxVolumes != sc->facts->MaxVolumes) ||
 	    (saved_facts.MaxPersistentEntries !=
 	    sc->facts->MaxPersistentEntries))) {
 		reallocating = TRUE;
 
 		/* Record that we reallocated everything */
 		sc->mps_flags |= MPS_FLAGS_REALLOCATED;
 	}
 
 	/*
 	 * Some things should be done if attaching or re-allocating after a Diag
 	 * Reset, but are not needed after a Diag Reset if the FW has not
 	 * changed.
 	 */
 	if (attaching || reallocating) {
 		/*
 		 * Check if controller supports FW diag buffers and set flag to
 		 * enable each type.
 		 */
 		if (sc->facts->IOCCapabilities &
 		    MPI2_IOCFACTS_CAPABILITY_DIAG_TRACE_BUFFER)
 			sc->fw_diag_buffer_list[MPI2_DIAG_BUF_TYPE_TRACE].
 			    enabled = TRUE;
 		if (sc->facts->IOCCapabilities &
 		    MPI2_IOCFACTS_CAPABILITY_SNAPSHOT_BUFFER)
 			sc->fw_diag_buffer_list[MPI2_DIAG_BUF_TYPE_SNAPSHOT].
 			    enabled = TRUE;
 		if (sc->facts->IOCCapabilities &
 		    MPI2_IOCFACTS_CAPABILITY_EXTENDED_BUFFER)
 			sc->fw_diag_buffer_list[MPI2_DIAG_BUF_TYPE_EXTENDED].
 			    enabled = TRUE;
 
 		/*
 		 * Set flag if EEDP is supported and if TLR is supported.
 		 */
 		if (sc->facts->IOCCapabilities & MPI2_IOCFACTS_CAPABILITY_EEDP)
 			sc->eedp_enabled = TRUE;
 		if (sc->facts->IOCCapabilities & MPI2_IOCFACTS_CAPABILITY_TLR)
 			sc->control_TLR = TRUE;
 
 		mps_resize_queues(sc);
 
 		/*
 		 * Initialize all Tail Queues
 		 */
 		TAILQ_INIT(&sc->req_list);
 		TAILQ_INIT(&sc->high_priority_req_list);
 		TAILQ_INIT(&sc->chain_list);
 		TAILQ_INIT(&sc->tm_list);
 	}
 
 	/*
 	 * If doing a Diag Reset and the FW is significantly different
 	 * (reallocating will be set above in IOC Facts comparison), then all
 	 * buffers based on the IOC Facts will need to be freed before they are
 	 * reallocated.
 	 *
 	 * NOTE(review): mps_resize_queues() above has already rewritten
 	 * sc->num_reqs, and mps_iocfacts_free() uses sc->num_reqs as its loop
 	 * bound when destroying the per-command DMA maps -- confirm that the
 	 * old count is not what is needed there.
 	 */
 	if (reallocating) {
 		mps_iocfacts_free(sc);
 		mpssas_realloc_targets(sc, saved_facts.MaxTargets +
 		    saved_facts.MaxVolumes);
 	}
 
 	/*
 	 * Any deallocation has been completed.  Now start reallocating
 	 * if needed.  Will only need to reallocate if attaching or if the new
 	 * IOC Facts are different from the previous IOC Facts after a Diag
 	 * Reset. Targets have already been allocated above if needed.
 	 *
 	 * The "while" below runs its body at most once; it is used so that
 	 * "break" can bail out at the first failing allocation.
 	 */
 	error = 0;
 	while (attaching || reallocating) {
 		if ((error = mps_alloc_hw_queues(sc)) != 0)
 			break;
 		if ((error = mps_alloc_replies(sc)) != 0)
 			break;
 		if ((error = mps_alloc_requests(sc)) != 0)
 			break;
 		if ((error = mps_alloc_queues(sc)) != 0)
 			break;
 
 		break;
 	}
 	if (error) {
 		mps_dprint(sc, MPS_INIT|MPS_FAULT,
 		    "Failed to alloc queues with error %d\n", error);
 		mps_free(sc);
 		return (error);
 	}
 
 	/* Always initialize the queues */
 	bzero(sc->free_queue, sc->fqdepth * 4);
 	mps_init_queues(sc);
 
 	/*
 	 * Always get the chip out of the reset state.  On failure everything
 	 * is released with mps_free() and the error is returned; this
 	 * function does not panic here, it is up to the caller to decide how
 	 * to react.
 	 */
 	error = mps_transition_operational(sc);
 	if (error != 0) {
 		mps_dprint(sc, MPS_INIT|MPS_FAULT, "Failed to "
 		    "transition to operational with error %d\n", error);
 		mps_free(sc);
 		return (error);
 	}
 
 	/*
 	 * Finish the queue initialization.
 	 * These are set here instead of in mps_init_queues() because the
 	 * IOC resets these values during the state transition in
 	 * mps_transition_operational().  The free index is set to 1
 	 * because the corresponding index in the IOC is set to 0, and the
 	 * IOC treats the queues as full if both are set to the same value.
 	 * Hence the reason that the queue can't hold all of the possible
 	 * replies.
 	 */
 	sc->replypostindex = 0;
 	mps_regwrite(sc, MPI2_REPLY_FREE_HOST_INDEX_OFFSET, sc->replyfreeindex);
 	mps_regwrite(sc, MPI2_REPLY_POST_HOST_INDEX_OFFSET, 0);
 
 	/*
 	 * Attach the subsystems so they can prepare their event masks.
 	 * XXX Should be dynamic so that IM/IR and user modules can attach
 	 *
 	 * As above, the "while" is a run-once block that "break" exits at the
 	 * first failure; it is skipped entirely when not attaching.
 	 */
 	error = 0;
 	while (attaching) {
 		mps_dprint(sc, MPS_INIT, "Attaching subsystems\n");
 		if ((error = mps_attach_log(sc)) != 0)
 			break;
 		if ((error = mps_attach_sas(sc)) != 0)
 			break;
 		if ((error = mps_attach_user(sc)) != 0)
 			break;
 		break;
 	}
 	if (error) {
 		mps_dprint(sc, MPS_INIT|MPS_FAULT, "Failed to attach all "
 		    "subsystems: error %d\n", error);
 		mps_free(sc);
 		return (error);
 	}
 
 	/*
 	 * XXX If the number of MSI-X vectors changes during re-init, this
 	 * won't see it and adjust.
 	 */
 	if (attaching && (error = mps_pci_setup_interrupts(sc)) != 0) {
 		mps_dprint(sc, MPS_INIT|MPS_FAULT, "Failed to setup "
 		    "interrupts\n");
 		mps_free(sc);
 		return (error);
 	}
 
 	/*
 	 * Set flag if this is a WD controller.  This shouldn't ever change, but
 	 * reset it after a Diag Reset, just in case.
 	 */
 	sc->WD_available = FALSE;
 	if (pci_get_device(sc->mps_dev) == MPI2_MFGPAGE_DEVID_SSS6200)
 		sc->WD_available = TRUE;
 
 	/* error is 0 on every path that reaches this point. */
 	return (error);
 }
 
 /*
  * This is called if memory is being freed (during detach for example) and
  * when buffers need to be reallocated due to a Diag Reset.
  *
  * It releases the DMA mappings, DMA memory and DMA tags for the reply
  * free/post queues, chain frames, sense buffers, reply frames and request
  * frames, the chain and command arrays (including each command's DMA map),
  * the buffer DMA tag, the interrupts and the queue array.
  *
  * Every resource is only released if its handle is non-NULL / non-zero, and
  * every handle is reset once released, so calling this function again (for
  * example from a teardown path that runs after a failed reallocation) does
  * not free anything twice.
  */
 static void
 mps_iocfacts_free(struct mps_softc *sc)
 {
 	struct mps_command *cm;
 	int i;
 
 	mps_dprint(sc, MPS_TRACE, "%s\n", __func__);
 
 	/* Reply free queue and reply post queue share one allocation. */
 	if (sc->free_busaddr != 0) {
 		bus_dmamap_unload(sc->queues_dmat, sc->queues_map);
 		sc->free_busaddr = 0;
 		sc->post_busaddr = 0;
 	}
 	if (sc->free_queue != NULL) {
 		bus_dmamem_free(sc->queues_dmat, sc->free_queue,
 		    sc->queues_map);
 		sc->free_queue = NULL;
 		sc->post_queue = NULL;
 	}
 	if (sc->queues_dmat != NULL) {
 		bus_dma_tag_destroy(sc->queues_dmat);
 		sc->queues_dmat = NULL;
 	}
 
 	/* Chain frames. */
 	if (sc->chain_busaddr != 0) {
 		bus_dmamap_unload(sc->chain_dmat, sc->chain_map);
 		sc->chain_busaddr = 0;
 	}
 	if (sc->chain_frames != NULL) {
 		bus_dmamem_free(sc->chain_dmat, sc->chain_frames,
 		    sc->chain_map);
 		sc->chain_frames = NULL;
 	}
 	if (sc->chain_dmat != NULL) {
 		bus_dma_tag_destroy(sc->chain_dmat);
 		sc->chain_dmat = NULL;
 	}
 
 	/* Sense buffers. */
 	if (sc->sense_busaddr != 0) {
 		bus_dmamap_unload(sc->sense_dmat, sc->sense_map);
 		sc->sense_busaddr = 0;
 	}
 	if (sc->sense_frames != NULL) {
 		bus_dmamem_free(sc->sense_dmat, sc->sense_frames,
 		    sc->sense_map);
 		sc->sense_frames = NULL;
 	}
 	if (sc->sense_dmat != NULL) {
 		bus_dma_tag_destroy(sc->sense_dmat);
 		sc->sense_dmat = NULL;
 	}
 
 	/* Reply frames. */
 	if (sc->reply_busaddr != 0) {
 		bus_dmamap_unload(sc->reply_dmat, sc->reply_map);
 		sc->reply_busaddr = 0;
 	}
 	if (sc->reply_frames != NULL) {
 		bus_dmamem_free(sc->reply_dmat, sc->reply_frames,
 		    sc->reply_map);
 		sc->reply_frames = NULL;
 	}
 	if (sc->reply_dmat != NULL) {
 		bus_dma_tag_destroy(sc->reply_dmat);
 		sc->reply_dmat = NULL;
 	}
 
 	/* Request frames. */
 	if (sc->req_busaddr != 0) {
 		bus_dmamap_unload(sc->req_dmat, sc->req_map);
 		sc->req_busaddr = 0;
 	}
 	if (sc->req_frames != NULL) {
 		bus_dmamem_free(sc->req_dmat, sc->req_frames, sc->req_map);
 		sc->req_frames = NULL;
 	}
 	if (sc->req_dmat != NULL) {
 		bus_dma_tag_destroy(sc->req_dmat);
 		sc->req_dmat = NULL;
 	}
 
 	/* Host-side bookkeeping arrays. */
 	if (sc->chains != NULL) {
 		free(sc->chains, M_MPT2);
 		sc->chains = NULL;
 	}
 	if (sc->commands != NULL) {
 		/*
 		 * Index 0 is skipped, as in the original code; presumably
 		 * command 0 never gets a DMA map -- verify against the
 		 * allocation code.
 		 */
 		for (i = 1; i < sc->num_reqs; i++) {
 			cm = &sc->commands[i];
 			bus_dmamap_destroy(sc->buffer_dmat, cm->cm_dmamap);
 		}
 		free(sc->commands, M_MPT2);
 		sc->commands = NULL;
 	}
 	if (sc->buffer_dmat != NULL) {
 		bus_dma_tag_destroy(sc->buffer_dmat);
 		sc->buffer_dmat = NULL;
 	}
 
 	mps_pci_free_interrupts(sc);
 	free(sc->queues, M_MPT2);
 	sc->queues = NULL;
 }
 
 /* 
  * The terms diag reset and hard reset are used interchangeably in the MPI
  * docs to mean resetting the controller chip.  In this code diag reset
  * cleans everything up, and the hard reset function just sends the reset
  * sequence to the chip.  This should probably be refactored so that every
  * subsystem gets a reset notification of some sort, and can clean up
  * appropriately.
  *
  * mps_reinit() resets the controller and rebuilds the driver state that
  * depends on it.  The caller must hold sc->mps_mtx (asserted below).
  *
  * Returns 0, both when the re-initialization completed and when it was
  * skipped because a reset is already in progress.  A failed diag reset or a
  * failed IOC Facts based allocation panics rather than returning an error.
  */
 int
 mps_reinit(struct mps_softc *sc)
 {
 	int error;
 	struct mpssas_softc *sassc;
 
 	sassc = sc->sassc;
 
 	MPS_FUNCTRACE(sc);
 
 	mtx_assert(&sc->mps_mtx, MA_OWNED);
 
 	mps_dprint(sc, MPS_INIT|MPS_INFO, "Reinitializing controller\n");
 	/* MPS_FLAGS_DIAGRESET doubles as the "reset in progress" marker. */
 	if (sc->mps_flags & MPS_FLAGS_DIAGRESET) {
 		mps_dprint(sc, MPS_INIT, "Reset already in progress\n");
 		return 0;
 	}
 
 	/* make sure the completion callbacks can recognize they're getting
 	 * a NULL cm_reply due to a reset.
 	 */
 	sc->mps_flags |= MPS_FLAGS_DIAGRESET;
 
 	/*
 	 * Mask interrupts here.
 	 */
 	mps_dprint(sc, MPS_INIT, "masking interrupts and resetting\n");
 	mps_mask_intr(sc);
 
 	error = mps_diag_reset(sc, CAN_SLEEP);
 	if (error != 0) {
 		/* XXXSL No need to panic here */
 		panic("%s hard reset failed with error %d\n",
 		    __func__, error);
 	}
 
 	/* Restore the PCI state, including the MSI-X registers */
 	mps_pci_restore(sc);
 
 	/* Give the I/O subsystem special priority to get itself prepared */
 	mpssas_handle_reinit(sc);
 
 	/*
 	 * Get IOC Facts and allocate all structures based on this information.
 	 * The attach function will also call mps_iocfacts_allocate at startup.
 	 * If relevant values have changed in IOC Facts, this function will free
 	 * all of the memory based on IOC Facts and reallocate that memory.
 	 */
 	if ((error = mps_iocfacts_allocate(sc, FALSE)) != 0) {
 		panic("%s IOC Facts based allocation failed with error %d\n",
 		    __func__, error);
 	}
 
 	/*
 	 * Mapping structures will be re-allocated after getting IOC Page8, so
 	 * free these structures here.
 	 */
 	mps_mapping_exit(sc);
 
 	/*
 	 * The static page function currently read is IOC Page8.  Others can be
 	 * added in future.  It's possible that the values in IOC Page8 have
 	 * changed after a Diag Reset due to user modification, so always read
 	 * these.  Interrupts are masked, so unmask them before getting config
 	 * pages.  The reset-in-progress flag is cleared at the same point.
 	 */
 	mps_unmask_intr(sc);
 	sc->mps_flags &= ~MPS_FLAGS_DIAGRESET;
 	mps_base_static_config_pages(sc);
 
 	/*
 	 * Some mapping info is based in IOC Page8 data, so re-initialize the
 	 * mapping tables.
 	 */
 	mps_mapping_initialize(sc);
 
 	/*
 	 * Restart will reload the event masks clobbered by the reset, and
 	 * then enable the port.
 	 */
 	mps_reregister_events(sc);
 
 	/* the end of discovery will release the simq, so we're done. */
 	mps_dprint(sc, MPS_INIT|MPS_XINFO, "Finished sc %p post %u free %u\n", 
 	    sc, sc->replypostindex, sc->replyfreeindex);
 
 	mpssas_release_simq_reinit(sassc);
 	mps_dprint(sc, MPS_INIT, "%s exit\n", __func__);
 
 	return 0;
 }
 
 /*
  * Wait for the chip to ACK a word that we've put into its FIFO.
  *
  * The host interrupt status register is polled up to 1000 * <timeout>
  * times when sleeping is allowed (sleeping at least one tick per pass) or
  * 2000 * <timeout> times otherwise (busy-waiting 500 microseconds per
  * pass).  With hz >= 1000 both budgets come to roughly <timeout> seconds.
  *
  * Returns 0 once the SYS2IOC doorbell status bit is clear, EFAULT if the
  * IOC reports the FAULT state while we wait, and ETIMEDOUT if the budget
  * runs out or the status register reads back as all ones.
  */
 static int
 mps_wait_db_ack(struct mps_softc *sc, int timeout, int sleep_flag)
 {
 	u32 cntdn, count;
 	u32 int_status;
 	u32 doorbell;
 	int sleep_ticks;
 
 	count = 0;
 	cntdn = (sleep_flag == CAN_SLEEP) ? 1000*timeout : 2000*timeout;
 
 	/*
 	 * hz/1000 is 0 whenever hz < 1000, and a timeout of 0 makes msleep()
 	 * wait without any timeout at all.  Always sleep for at least one
 	 * tick.
 	 */
 	sleep_ticks = MAX(1, hz / 1000);
 
 	do {
 		int_status = mps_regread(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET);
 
 		/*
 		 * An all-ones value has every status bit set, so this has to
 		 * be tested before the individual bits; otherwise it is
 		 * swallowed by the IOC2SYS test below and never acted on.
 		 * Presumably all ones means the register could not be read
 		 * (device gone), so there is no point in waiting further.
 		 */
 		if (int_status == 0xFFFFFFFF)
 			break;
 
 		if (!(int_status & MPI2_HIS_SYS2IOC_DB_STATUS)) {
 			mps_dprint(sc, MPS_TRACE,
 			    "%s: successful count(%d), timeout(%d)\n",
 			    __func__, count, timeout);
 			return (0);
 		}
 
 		if (int_status & MPI2_HIS_IOC2SYS_DB_STATUS) {
 			doorbell = mps_regread(sc, MPI2_DOORBELL_OFFSET);
 			if ((doorbell & MPI2_IOC_STATE_MASK) ==
 			    MPI2_IOC_STATE_FAULT) {
 				mps_dprint(sc, MPS_FAULT,
 				    "fault_state(0x%04x)!\n", doorbell);
 				return (EFAULT);
 			}
 		}
 
 		/*
 		 * If it can sleep, sleep for one interval (dropping the
 		 * driver mutex via msleep() when this thread owns it), else
 		 * busy loop for 0.5 millisecond.
 		 */
 		if (mtx_owned(&sc->mps_mtx) && sleep_flag == CAN_SLEEP)
 			msleep(&sc->msleep_fake_chan, &sc->mps_mtx, 0,
 			    "mpsdba", sleep_ticks);
 		else if (sleep_flag == CAN_SLEEP)
 			pause("mpsdba", sleep_ticks);
 		else
 			DELAY(500);
 		count++;
 	} while (--cntdn);
 
 	mps_dprint(sc, MPS_FAULT, "%s: failed due to timeout count(%d), "
 		"int_status(%x)!\n", __func__, count, int_status);
 	return (ETIMEDOUT);
 }
 
 /* Wait for the chip to signal that the next word in its FIFO can be fetched */
 static int
 mps_wait_db_int(struct mps_softc *sc)
 {
 	int retry;
 
 	for (retry = 0; retry < MPS_DB_MAX_WAIT; retry++) {
 		if ((mps_regread(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET) &
 		    MPI2_HIS_IOC2SYS_DB_STATUS) != 0)
 			return (0);
 		DELAY(2000);
 	}
 	return (ETIMEDOUT);
 }
 
 /*
  * Step through the synchronous command state machine, i.e. "Doorbell mode".
  *
  * The request in "req" (req_sz bytes) is clocked out to the IOC through the
  * doorbell register one 32-bit word at a time, and the reply is clocked back
  * in 16 bits at a time into "reply", which provides reply_sz bytes.  Reply
  * words that do not fit into "reply" are read and discarded.
  *
  * Returns 0 on success, EBUSY if the doorbell is already in use, and ENXIO
  * on any handshake failure or timeout.
  *
  * NOTE(review): the "timeout" parameter is not referenced anywhere in this
  * function; the doorbell ACK waits use a hard-coded value of 5 -- confirm
  * whether that is intended.
  */
 static int
 mps_request_sync(struct mps_softc *sc, void *req, MPI2_DEFAULT_REPLY *reply,
     int req_sz, int reply_sz, int timeout)
 {
 	uint32_t *data32;
 	uint16_t *data16;
 	int i, count, ioc_sz, residual;
 	int sleep_flags = CAN_SLEEP;
 
 	/* Fall back to busy-waiting if this thread is marked as non-sleeping. */
 	if (curthread->td_no_sleeping != 0)
 		sleep_flags = NO_SLEEP;
 
 	/* Step 1 */
 	mps_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0);
 
 	/* Step 2 */
 	if (mps_regread(sc, MPI2_DOORBELL_OFFSET) & MPI2_DOORBELL_USED)
 		return (EBUSY);
 
 	/* Step 3
 	 * Announce that a message is coming through the doorbell.  Messages
 	 * are pushed at 32bit words, so round up if needed.
 	 */
 	count = (req_sz + 3) / 4;
 	mps_regwrite(sc, MPI2_DOORBELL_OFFSET,
 	    (MPI2_FUNCTION_HANDSHAKE << MPI2_DOORBELL_FUNCTION_SHIFT) |
 	    (count << MPI2_DOORBELL_ADD_DWORDS_SHIFT));
 
 	/* Step 4 */
 	if (mps_wait_db_int(sc) ||
 	    (mps_regread(sc, MPI2_DOORBELL_OFFSET) & MPI2_DOORBELL_USED) == 0) {
 		mps_dprint(sc, MPS_FAULT, "Doorbell failed to activate\n");
 		return (ENXIO);
 	}
 	mps_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0);
 	if (mps_wait_db_ack(sc, 5, sleep_flags) != 0) {
 		mps_dprint(sc, MPS_FAULT, "Doorbell handshake failed\n");
 		return (ENXIO);
 	}
 
 	/* Step 5 */
 	/* Clock out the message data synchronously in 32-bit dwords*/
 	data32 = (uint32_t *)req;
 	for (i = 0; i < count; i++) {
 		mps_regwrite(sc, MPI2_DOORBELL_OFFSET, htole32(data32[i]));
 		if (mps_wait_db_ack(sc, 5, sleep_flags) != 0) {
 			mps_dprint(sc, MPS_FAULT,
 			    "Timeout while writing doorbell\n");
 			return (ENXIO);
 		}
 	}
 
 	/* Step 6 */
 	/* Clock in the reply in 16-bit words.  The total length of the
 	 * message is always in the 4th byte, so clock out the first 2 words
 	 * manually, then loop the rest.
 	 */
 	data16 = (uint16_t *)reply;
 	if (mps_wait_db_int(sc) != 0) {
 		mps_dprint(sc, MPS_FAULT, "Timeout reading doorbell 0\n");
 		return (ENXIO);
 	}
 	data16[0] =
 	    mps_regread(sc, MPI2_DOORBELL_OFFSET) & MPI2_DOORBELL_DATA_MASK;
 	mps_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0);
 	if (mps_wait_db_int(sc) != 0) {
 		mps_dprint(sc, MPS_FAULT, "Timeout reading doorbell 1\n");
 		return (ENXIO);
 	}
 	data16[1] =
 	    mps_regread(sc, MPI2_DOORBELL_OFFSET) & MPI2_DOORBELL_DATA_MASK;
 	mps_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0);
 
 	/* Number of 32bit words in the message */
 	ioc_sz = reply->MsgLength;
 
 	/*
 	 * Figure out how many 16bit words to clock in without overrunning.
 	 * The precision loss with dividing reply_sz can safely be
 	 * ignored because the messages can only be multiples of 32bits.
 	 * Both count and residual are expressed in 16-bit words.
 	 */
 	residual = 0;
 	count = MIN((reply_sz / 4), ioc_sz) * 2;
 	if (count < ioc_sz * 2) {
 		residual = ioc_sz * 2 - count;
 		mps_dprint(sc, MPS_ERROR, "Driver error, throwing away %d "
 		    "residual message words\n", residual);
 	}
 
 	/* Words 0 and 1 were already read above, so continue from word 2. */
 	for (i = 2; i < count; i++) {
 		if (mps_wait_db_int(sc) != 0) {
 			mps_dprint(sc, MPS_FAULT,
 			    "Timeout reading doorbell %d\n", i);
 			return (ENXIO);
 		}
 		data16[i] = mps_regread(sc, MPI2_DOORBELL_OFFSET) &
 		    MPI2_DOORBELL_DATA_MASK;
 		mps_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0);
 	}
 
 	/*
 	 * Pull out residual words that won't fit into the provided buffer.
 	 * This keeps the chip from hanging due to a driver programming
 	 * error.
 	 */
 	while (residual--) {
 		if (mps_wait_db_int(sc) != 0) {
 			mps_dprint(sc, MPS_FAULT,
 			    "Timeout reading doorbell\n");
 			return (ENXIO);
 		}
 		(void)mps_regread(sc, MPI2_DOORBELL_OFFSET);
 		mps_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0);
 	}
 
 	/* Step 7 */
 	if (mps_wait_db_int(sc) != 0) {
 		mps_dprint(sc, MPS_FAULT, "Timeout waiting to exit doorbell\n");
 		return (ENXIO);
 	}
 	/* A doorbell that is still marked used is only logged, not failed. */
 	if (mps_regread(sc, MPI2_DOORBELL_OFFSET) & MPI2_DOORBELL_USED)
 		mps_dprint(sc, MPS_FAULT, "Warning, doorbell still active\n");
 	mps_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0);
 
 	return (0);
 }
 
 /*
  * Post a command's request descriptor to the IOC.
  *
  * Bumps the active-command counter (and its high-water mark when the new
  * count exceeds it), converts the 64-bit descriptor to little-endian and
  * writes its low and high halves to the request descriptor post registers.
  * Once attach has completed, and unless the driver is shutting down, the
  * driver mutex must be held.
  */
 static void
 mps_enqueue_request(struct mps_softc *sc, struct mps_command *cm)
 {
 	reply_descriptor desc;
 
 	MPS_FUNCTRACE(sc);
 	mps_dprint(sc, MPS_TRACE, "SMID %u cm %p ccb %p\n",
 	    cm->cm_desc.Default.SMID, cm, cm->cm_ccb);
 
 	if ((sc->mps_flags & MPS_FLAGS_ATTACH_DONE) &&
 	    !(sc->mps_flags & MPS_FLAGS_SHUTDOWN)) {
 		mtx_assert(&sc->mps_mtx, MA_OWNED);
 	}
 
 	sc->io_cmds_active++;
 	if (sc->io_cmds_active > sc->io_cmds_highwater) {
 		sc->io_cmds_highwater++;
 	}
 
 	/* Assemble the descriptor, then byte-swap it as one 64-bit value. */
 	desc.u.low = cm->cm_desc.Words.Low;
 	desc.u.high = cm->cm_desc.Words.High;
 	desc.word = htole64(desc.word);
 
 	/* TODO-We may need to make below regwrite atomic */
 	mps_regwrite(sc, MPI2_REQUEST_DESCRIPTOR_POST_LOW_OFFSET, desc.u.low);
 	mps_regwrite(sc, MPI2_REQUEST_DESCRIPTOR_POST_HIGH_OFFSET, desc.u.high);
 }
 
 /*
  * Fetch IOC Facts from the controller.
  *
  * Sends a zeroed IOC Facts request through the synchronous doorbell
  * interface and has the reply written directly into "facts".  Returns the
  * result of mps_request_sync().
  */
 static int
 mps_get_iocfacts(struct mps_softc *sc, MPI2_IOC_FACTS_REPLY *facts)
 {
 	MPI2_IOC_FACTS_REQUEST facts_req;
 	int rc;
 
 	MPS_FUNCTRACE(sc);
 	mps_dprint(sc, MPS_INIT, "%s entered\n", __func__);
 
 	bzero(&facts_req, sizeof(MPI2_IOC_FACTS_REQUEST));
 	facts_req.Function = MPI2_FUNCTION_IOC_FACTS;
 
 	/* The caller's facts buffer doubles as the reply buffer. */
 	rc = mps_request_sync(sc, &facts_req, (MPI2_DEFAULT_REPLY *)facts,
 	    sizeof(MPI2_IOC_FACTS_REQUEST), sizeof(MPI2_IOC_FACTS_REPLY), 5);
 
 	mps_dprint(sc, MPS_INIT, "%s exit error= %d\n", __func__, rc);
 
 	return (rc);
 }
 
 /*
  * Build an IOC Init request from the current softc state and send it to the
  * controller through the synchronous doorbell interface.
  *
  * Returns ENXIO if the reply's IOCStatus is anything other than SUCCESS,
  * otherwise the result of mps_request_sync() (0 on success).
  */
 static int
 mps_send_iocinit(struct mps_softc *sc)
 {
 	MPI2_IOC_INIT_REQUEST	init;
 	MPI2_DEFAULT_REPLY	reply;
 	int req_sz, reply_sz, error;
 	struct timeval now;
 	uint64_t time_in_msec;
 
 	MPS_FUNCTRACE(sc);
 	mps_dprint(sc, MPS_INIT, "%s entered\n", __func__);
 
 	req_sz = sizeof(MPI2_IOC_INIT_REQUEST);
 	reply_sz = sizeof(MPI2_IOC_INIT_REPLY);
 	bzero(&init, req_sz);
 	bzero(&reply, reply_sz);
 
 	/*
 	 * Fill in the init block.  Note that most addresses are
 	 * deliberately in the lower 32bits of memory.  This is a micro-
 	 * optimization for PCI/PCIX, though it's not clear if it helps PCIe.
 	 */
 	init.Function = MPI2_FUNCTION_IOC_INIT;
 	init.WhoInit = MPI2_WHOINIT_HOST_DRIVER;
 	init.MsgVersion = htole16(MPI2_VERSION);
 	init.HeaderVersion = htole16(MPI2_HEADER_VERSION);
 	init.SystemRequestFrameSize = htole16(sc->facts->IOCRequestFrameSize);
 	init.ReplyDescriptorPostQueueDepth = htole16(sc->pqdepth);
 	init.ReplyFreeQueueDepth = htole16(sc->fqdepth);
 	init.SenseBufferAddressHigh = 0;
 	init.SystemReplyAddressHigh = 0;
 	init.SystemRequestFrameBaseAddress.High = 0;
 	init.SystemRequestFrameBaseAddress.Low = htole32((uint32_t)sc->req_busaddr);
 	init.ReplyDescriptorPostQueueAddress.High = 0;
 	init.ReplyDescriptorPostQueueAddress.Low = htole32((uint32_t)sc->post_busaddr);
 	init.ReplyFreeQueueAddress.High = 0;
 	init.ReplyFreeQueueAddress.Low = htole32((uint32_t)sc->free_busaddr);
 
 	/*
 	 * Timestamp in milliseconds.  Widen tv_sec before multiplying so the
 	 * product is computed in 64 bits even where time_t is a 32-bit type;
 	 * otherwise the multiplication overflows.
 	 */
 	getmicrotime(&now);
 	time_in_msec = (uint64_t)now.tv_sec * 1000 + now.tv_usec / 1000;
 	init.TimeStamp.High = htole32((time_in_msec >> 32) & 0xFFFFFFFF);
 	init.TimeStamp.Low = htole32(time_in_msec & 0xFFFFFFFF);
 
 	/*
 	 * The reply was zeroed above, so when the exchange itself fails the
 	 * status check below does not trigger and the transport error is
 	 * what gets returned.
 	 */
 	error = mps_request_sync(sc, &init, &reply, req_sz, reply_sz, 5);
 	if ((reply.IOCStatus & MPI2_IOCSTATUS_MASK) != MPI2_IOCSTATUS_SUCCESS)
 		error = ENXIO;
 
 	mps_dprint(sc, MPS_INIT, "IOCInit status= 0x%x\n", reply.IOCStatus);
 	mps_dprint(sc, MPS_INIT, "%s exit\n", __func__);
 	return (error);
 }
 
 /*
  * bus_dmamap_load() callback that hands the bus address of the first DMA
  * segment back to the caller.
  *
  * "arg" must point to a bus_addr_t.  On success it receives
  * segs[0].ds_addr.  If the load failed (error != 0) or produced no
  * segments, the segment array must not be read; 0 is stored instead so the
  * caller never ends up with an indeterminate address.
  */
 void
 mps_memaddr_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
 {
 	bus_addr_t *addr;
 
 	addr = arg;
 	if (error != 0 || nsegs < 1) {
 		*addr = 0;
 		return;
 	}
 	*addr = segs[0].ds_addr;
 }
 
 /*
  * Allocate the array of per-interrupt I/O queue structures, one entry per
  * message in sc->msi_msgs, and initialize each entry with a back pointer to
  * the softc and its own queue number.
  *
  * Returns 0 on success or ENOMEM if the array cannot be allocated.
  */
 static int
 mps_alloc_queues(struct mps_softc *sc)
 {
 	struct mps_queue *q;
-	int nq, i;
+	u_int nq, i;
 
 	nq = sc->msi_msgs;
 	mps_dprint(sc, MPS_INIT|MPS_XINFO, "Allocating %d I/O queues\n", nq);
 
 	/* M_NOWAIT: fail rather than sleep; M_ZERO: entries start zeroed. */
 	sc->queues = malloc(sizeof(struct mps_queue) * nq, M_MPT2,
 	    M_NOWAIT|M_ZERO);
 	if (sc->queues == NULL)
 		return (ENOMEM);
 
 	for (i = 0; i < nq; i++) {
 		q = &sc->queues[i];
 		mps_dprint(sc, MPS_INIT, "Configuring queue %d %p\n", i, q);
 		q->sc = sc;
 		q->qnum = i;
 	}
 
 	return (0);
 }
 
 /*
  * Allocate and DMA-map the reply free queue and the reply descriptor post
  * queue.  Both live in a single allocation, with the post queue placed
  * directly after the free queue.
  *
  * Returns 0 on success or ENOMEM if the DMA tag or the memory cannot be
  * allocated.
  */
 static int
 mps_alloc_hw_queues(struct mps_softc *sc)
 {
 	bus_addr_t region_busaddr;
 	uint8_t *region;
 	int free_bytes, post_bytes, total_bytes;
 
 	/*
 	 * Reply free queue: 4 byte entries, in multiples of 16, aligned on a
 	 * 16 byte boundary, always with one unused entry.  It supplies fresh
 	 * reply frames for the firmware to use.
 	 *
 	 * Reply descriptor post queue: 8 byte entries, in multiples of 16,
 	 * aligned on a 16 byte boundary.  It carries filled-in reply frames
 	 * from the firmware to the host.
 	 *
 	 * The two are allocated together for simplicity.
 	 */
 	sc->fqdepth = roundup2(sc->num_replies + 1, 16);
 	sc->pqdepth = roundup2(sc->num_replies + 1, 16);
 	free_bytes = sc->fqdepth * 4;
 	post_bytes = sc->pqdepth * 8;
 	total_bytes = free_bytes + post_bytes;
 
 	if (bus_dma_tag_create(sc->mps_parent_dmat,	/* parent */
 	    16, 0,				/* algnmnt, boundary */
 	    BUS_SPACE_MAXADDR_32BIT,		/* lowaddr */
 	    BUS_SPACE_MAXADDR,			/* highaddr */
 	    NULL, NULL,				/* filter, filterarg */
 	    total_bytes,			/* maxsize */
 	    1,					/* nsegments */
 	    total_bytes,			/* maxsegsize */
 	    0,					/* flags */
 	    NULL, NULL,				/* lockfunc, lockarg */
 	    &sc->queues_dmat) != 0) {
 		mps_dprint(sc, MPS_ERROR, "Cannot allocate queues DMA tag\n");
 		return (ENOMEM);
 	}
 
 	if (bus_dmamem_alloc(sc->queues_dmat, (void **)&region,
 	    BUS_DMA_NOWAIT, &sc->queues_map) != 0) {
 		mps_dprint(sc, MPS_ERROR, "Cannot allocate queues memory\n");
 		return (ENOMEM);
 	}
 
 	bzero(region, total_bytes);
 	bus_dmamap_load(sc->queues_dmat, sc->queues_map, region, total_bytes,
 	    mps_memaddr_cb, &region_busaddr, 0);
 
 	/* Free queue first, post queue immediately behind it. */
 	sc->free_queue = (uint32_t *)region;
 	sc->free_busaddr = region_busaddr;
 	sc->post_queue = (MPI2_REPLY_DESCRIPTORS_UNION *)(region + free_bytes);
 	sc->post_busaddr = region_busaddr + free_bytes;
 
 	return (0);
 }
 
 /*
  * Allocate and DMA-map the reply frame pool.
  *
  * Returns 0 on success or ENOMEM if the DMA tag or the memory cannot be
  * allocated.
  */
 static int
 mps_alloc_replies(struct mps_softc *sc)
 {
 	int frame_count, pool_bytes;
 
 	/*
 	 * sc->num_replies should be one less than sc->fqdepth.  Space is
 	 * needed for sc->fqdepth replies even though only sc->num_replies
 	 * of them can be in use at once, so size for the larger of the two.
 	 */
 	frame_count = max(sc->fqdepth, sc->num_replies);
 
 	/* ReplyFrameSize is multiplied by 4 to obtain a byte count. */
 	pool_bytes = sc->facts->ReplyFrameSize * frame_count * 4;
 
 	if (bus_dma_tag_create(sc->mps_parent_dmat,	/* parent */
 	    4, 0,				/* algnmnt, boundary */
 	    BUS_SPACE_MAXADDR_32BIT,		/* lowaddr */
 	    BUS_SPACE_MAXADDR,			/* highaddr */
 	    NULL, NULL,				/* filter, filterarg */
 	    pool_bytes,				/* maxsize */
 	    1,					/* nsegments */
 	    pool_bytes,				/* maxsegsize */
 	    0,					/* flags */
 	    NULL, NULL,				/* lockfunc, lockarg */
 	    &sc->reply_dmat) != 0) {
 		mps_dprint(sc, MPS_ERROR, "Cannot allocate replies DMA tag\n");
 		return (ENOMEM);
 	}
 
 	if (bus_dmamem_alloc(sc->reply_dmat, (void **)&sc->reply_frames,
 	    BUS_DMA_NOWAIT, &sc->reply_map) != 0) {
 		mps_dprint(sc, MPS_ERROR, "Cannot allocate replies memory\n");
 		return (ENOMEM);
 	}
 
 	bzero(sc->reply_frames, pool_bytes);
 	bus_dmamap_load(sc->reply_dmat, sc->reply_map, sc->reply_frames,
 	    pool_bytes, mps_memaddr_cb, &sc->reply_busaddr, 0);
 
 	return (0);
 }
 
 static int
 mps_alloc_requests(struct mps_softc *sc)
 {
 	struct mps_command *cm;
 	struct mps_chain *chain;
 	int i, rsize, nsegs;
 
 	rsize = sc->facts->IOCRequestFrameSize * sc->num_reqs * 4;
         if (bus_dma_tag_create( sc->mps_parent_dmat,    /* parent */
 				16, 0,			/* algnmnt, boundary */
 				BUS_SPACE_MAXADDR_32BIT,/* lowaddr */
 				BUS_SPACE_MAXADDR,	/* highaddr */
 				NULL, NULL,		/* filter, filterarg */
                                 rsize,			/* maxsize */
                                 1,			/* nsegments */
                                 rsize,			/* maxsegsize */
                                 0,			/* flags */
                                 NULL, NULL,		/* lockfunc, lockarg */
                                 &sc->req_dmat)) {
 		mps_dprint(sc, MPS_ERROR, "Cannot allocate request DMA tag\n");
 		return (ENOMEM);
         }
         if (bus_dmamem_alloc(sc->req_dmat, (void **)&sc->req_frames,
 	    BUS_DMA_NOWAIT, &sc->req_map)) {
 		mps_dprint(sc, MPS_ERROR, "Cannot allocate request memory\n");
 		return (ENOMEM);
         }
         bzero(sc->req_frames, rsize);
         bus_dmamap_load(sc->req_dmat, sc->req_map, sc->req_frames, rsize,
 	    mps_memaddr_cb, &sc->req_busaddr, 0);
 
 	rsize = sc->facts->IOCRequestFrameSize * sc->max_chains * 4;
         if (bus_dma_tag_create( sc->mps_parent_dmat,    /* parent */
 				16, 0,			/* algnmnt, boundary */
 				BUS_SPACE_MAXADDR_32BIT,/* lowaddr */
 				BUS_SPACE_MAXADDR,	/* highaddr */
 				NULL, NULL,		/* filter, filterarg */
                                 rsize,			/* maxsize */
                                 1,			/* nsegments */
                                 rsize,			/* maxsegsize */
                                 0,			/* flags */
                                 NULL, NULL,		/* lockfunc, lockarg */
                                 &sc->chain_dmat)) {
 		mps_dprint(sc, MPS_ERROR, "Cannot allocate chain DMA tag\n");
 		return (ENOMEM);
         }
         if (bus_dmamem_alloc(sc->chain_dmat, (void **)&sc->chain_frames,
 	    BUS_DMA_NOWAIT, &sc->chain_map)) {
 		mps_dprint(sc, MPS_ERROR, "Cannot allocate chain memory\n");
 		return (ENOMEM);
         }
         bzero(sc->chain_frames, rsize);
         bus_dmamap_load(sc->chain_dmat, sc->chain_map, sc->chain_frames, rsize,
 	    mps_memaddr_cb, &sc->chain_busaddr, 0);
 
 	rsize = MPS_SENSE_LEN * sc->num_reqs;
         if (bus_dma_tag_create( sc->mps_parent_dmat,    /* parent */
 				1, 0,			/* algnmnt, boundary */
 				BUS_SPACE_MAXADDR_32BIT,/* lowaddr */
 				BUS_SPACE_MAXADDR,	/* highaddr */
 				NULL, NULL,		/* filter, filterarg */
                                 rsize,			/* maxsize */
                                 1,			/* nsegments */
                                 rsize,			/* maxsegsize */
                                 0,			/* flags */
                                 NULL, NULL,		/* lockfunc, lockarg */
                                 &sc->sense_dmat)) {
 		mps_dprint(sc, MPS_ERROR, "Cannot allocate sense DMA tag\n");
 		return (ENOMEM);
         }
         if (bus_dmamem_alloc(sc->sense_dmat, (void **)&sc->sense_frames,
 	    BUS_DMA_NOWAIT, &sc->sense_map)) {
 		mps_dprint(sc, MPS_ERROR, "Cannot allocate sense memory\n");
 		return (ENOMEM);
         }
         bzero(sc->sense_frames, rsize);
         bus_dmamap_load(sc->sense_dmat, sc->sense_map, sc->sense_frames, rsize,
 	    mps_memaddr_cb, &sc->sense_busaddr, 0);
 
 	sc->chains = malloc(sizeof(struct mps_chain) * sc->max_chains, M_MPT2,
 	    M_WAITOK | M_ZERO);
 	if(!sc->chains) {
 		mps_dprint(sc, MPS_ERROR, "Cannot allocate chains memory\n");
 		return (ENOMEM);
 	}
 	for (i = 0; i < sc->max_chains; i++) {
 		chain = &sc->chains[i];
 		chain->chain = (MPI2_SGE_IO_UNION *)(sc->chain_frames +
 		    i * sc->facts->IOCRequestFrameSize * 4);
 		chain->chain_busaddr = sc->chain_busaddr +
 		    i * sc->facts->IOCRequestFrameSize * 4;
 		mps_free_chain(sc, chain);
 		sc->chain_free_lowwater++;
 	}
 
 	/* XXX Need to pick a more precise value */
 	nsegs = (MAXPHYS / PAGE_SIZE) + 1;
         if (bus_dma_tag_create( sc->mps_parent_dmat,    /* parent */
 				1, 0,			/* algnmnt, boundary */
 				BUS_SPACE_MAXADDR,	/* lowaddr */
 				BUS_SPACE_MAXADDR,	/* highaddr */
 				NULL, NULL,		/* filter, filterarg */
                                 BUS_SPACE_MAXSIZE_32BIT,/* maxsize */
                                 nsegs,			/* nsegments */
                                 BUS_SPACE_MAXSIZE_24BIT,/* maxsegsize */
                                 BUS_DMA_ALLOCNOW,	/* flags */
                                 busdma_lock_mutex,	/* lockfunc */
 				&sc->mps_mtx,		/* lockarg */
                                 &sc->buffer_dmat)) {
 		mps_dprint(sc, MPS_ERROR, "Cannot allocate buffer DMA tag\n");
 		return (ENOMEM);
         }
 
 	/*
 	 * SMID 0 cannot be used as a free command per the firmware spec.
 	 * Just drop that command instead of risking accounting bugs.
 	 */
 	sc->commands = malloc(sizeof(struct mps_command) * sc->num_reqs,
 	    M_MPT2, M_WAITOK | M_ZERO);
 	if(!sc->commands) {
 		mps_dprint(sc, MPS_ERROR, "Cannot allocate command memory\n");
 		return (ENOMEM);
 	}
 	for (i = 1; i < sc->num_reqs; i++) {
 		cm = &sc->commands[i];
 		cm->cm_req = sc->req_frames +
 		    i * sc->facts->IOCRequestFrameSize * 4;
 		cm->cm_req_busaddr = sc->req_busaddr +
 		    i * sc->facts->IOCRequestFrameSize * 4;
 		cm->cm_sense = &sc->sense_frames[i];
 		cm->cm_sense_busaddr = sc->sense_busaddr + i * MPS_SENSE_LEN;
 		cm->cm_desc.Default.SMID = i;
 		cm->cm_sc = sc;
 		TAILQ_INIT(&cm->cm_chain_list);
 		callout_init_mtx(&cm->cm_callout, &sc->mps_mtx, 0);
 
 		/* XXX Is a failure here a critical problem? */
 		if (bus_dmamap_create(sc->buffer_dmat, 0, &cm->cm_dmamap) == 0)
 			if (i <= sc->facts->HighPriorityCredit)
 				mps_free_high_priority_command(sc, cm);
 			else
 				mps_free_command(sc, cm);
 		else {
 			panic("failed to allocate command %d\n", i);
 			sc->num_reqs = i;
 			break;
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Reset the reply post queue to all ones and fill the reply free queue
  * with the bus address of each reply frame.
  *
  * Returns EINVAL when num_replies is not smaller than fqdepth, 0 otherwise.
  * The post queue is cleared before that check is made.
  */
 static int
 mps_init_queues(struct mps_softc *sc)
 {
 	int slot;
 
 	memset((uint8_t *)sc->post_queue, 0xff, sc->pqdepth * 8);
 
 	/*
 	 * The spec requires that one fewer reply be in use than the queue
 	 * has room for, so the number we use (sc->num_replies) has to stay
 	 * below the allocated depth (sc->fqdepth).
 	 */
 	if (sc->num_replies >= sc->fqdepth) {
 		return (EINVAL);
 	}
 
 	/* Seed every free queue entry with its reply frame address. */
 	slot = 0;
 	while (slot < sc->fqdepth) {
 		sc->free_queue[slot] = sc->reply_busaddr +
 		    (slot * sc->facts->ReplyFrameSize * 4);
 		slot++;
 	}
 	sc->replyfreeindex = sc->num_replies;
 
 	return (0);
 }
 
 /*
  * Load the driver tunables into the softc.  Three layers are applied in
  * increasing priority: the compiled-in defaults, then the global
  * "hw.mps.*" settings if present, then the per-unit "dev.mps.<unit>.*"
  * settings if present.
  */
 void
 mps_get_tunables(struct mps_softc *sc)
 {
 	char tmpstr[80], mps_debug[80];
 	int unit;
 
 	/* Layer 1: driver defaults. */
 	/* XXX default to some debugging for now */
 	sc->mps_debug = MPS_INFO|MPS_FAULT;
 	sc->disable_msix = 0;
 	sc->disable_msi = 0;
 	sc->max_msix = MPS_MSIX_MAX;
 	sc->max_chains = MPS_CHAIN_FRAMES;
 	sc->max_io_pages = MPS_MAXIO_PAGES;
 	sc->enable_ssu = MPS_SSU_ENABLE_SSD_DISABLE_HDD;
 	sc->spinup_wait_time = DEFAULT_SPINUP_WAIT;
 	sc->use_phynum = 1;
 	sc->max_reqframes = MPS_REQ_FRAMES;
 	sc->max_prireqframes = MPS_PRI_REQ_FRAMES;
 	sc->max_replyframes = MPS_REPLY_FRAMES;
 	sc->max_evtframes = MPS_EVT_REPLY_FRAMES;
 
 	/* Layer 2: global settings. */
 	bzero(mps_debug, sizeof(mps_debug));
 	if (TUNABLE_STR_FETCH("hw.mps.debug_level", mps_debug,
 	    sizeof(mps_debug)) != 0)
 		mps_parse_debug(sc, mps_debug);
 	TUNABLE_INT_FETCH("hw.mps.disable_msix", &sc->disable_msix);
 	TUNABLE_INT_FETCH("hw.mps.disable_msi", &sc->disable_msi);
 	TUNABLE_INT_FETCH("hw.mps.max_msix", &sc->max_msix);
 	TUNABLE_INT_FETCH("hw.mps.max_chains", &sc->max_chains);
 	TUNABLE_INT_FETCH("hw.mps.max_io_pages", &sc->max_io_pages);
 	TUNABLE_INT_FETCH("hw.mps.enable_ssu", &sc->enable_ssu);
 	TUNABLE_INT_FETCH("hw.mps.spinup_wait_time", &sc->spinup_wait_time);
 	TUNABLE_INT_FETCH("hw.mps.use_phy_num", &sc->use_phynum);
 	TUNABLE_INT_FETCH("hw.mps.max_reqframes", &sc->max_reqframes);
 	TUNABLE_INT_FETCH("hw.mps.max_prireqframes", &sc->max_prireqframes);
 	TUNABLE_INT_FETCH("hw.mps.max_replyframes", &sc->max_replyframes);
 	TUNABLE_INT_FETCH("hw.mps.max_evtframes", &sc->max_evtframes);
 
 	/* Layer 3: settings for this unit only. */
 	unit = device_get_unit(sc->mps_dev);
 
 	snprintf(tmpstr, sizeof(tmpstr), "dev.mps.%d.debug_level", unit);
 	bzero(mps_debug, sizeof(mps_debug));
 	if (TUNABLE_STR_FETCH(tmpstr, mps_debug, sizeof(mps_debug)) != 0)
 		mps_parse_debug(sc, mps_debug);
 
 	snprintf(tmpstr, sizeof(tmpstr), "dev.mps.%d.disable_msix", unit);
 	TUNABLE_INT_FETCH(tmpstr, &sc->disable_msix);
 
 	snprintf(tmpstr, sizeof(tmpstr), "dev.mps.%d.disable_msi", unit);
 	TUNABLE_INT_FETCH(tmpstr, &sc->disable_msi);
 
 	snprintf(tmpstr, sizeof(tmpstr), "dev.mps.%d.max_msix", unit);
 	TUNABLE_INT_FETCH(tmpstr, &sc->max_msix);
 
 	snprintf(tmpstr, sizeof(tmpstr), "dev.mps.%d.max_chains", unit);
 	TUNABLE_INT_FETCH(tmpstr, &sc->max_chains);
 
 	snprintf(tmpstr, sizeof(tmpstr), "dev.mps.%d.max_io_pages", unit);
 	TUNABLE_INT_FETCH(tmpstr, &sc->max_io_pages);
 
 	/* exclude_ids exists only as a per-unit string. */
 	bzero(sc->exclude_ids, sizeof(sc->exclude_ids));
 	snprintf(tmpstr, sizeof(tmpstr), "dev.mps.%d.exclude_ids", unit);
 	TUNABLE_STR_FETCH(tmpstr, sc->exclude_ids, sizeof(sc->exclude_ids));
 
 	snprintf(tmpstr, sizeof(tmpstr), "dev.mps.%d.enable_ssu", unit);
 	TUNABLE_INT_FETCH(tmpstr, &sc->enable_ssu);
 
 	snprintf(tmpstr, sizeof(tmpstr), "dev.mps.%d.spinup_wait_time", unit);
 	TUNABLE_INT_FETCH(tmpstr, &sc->spinup_wait_time);
 
 	snprintf(tmpstr, sizeof(tmpstr), "dev.mps.%d.use_phy_num", unit);
 	TUNABLE_INT_FETCH(tmpstr, &sc->use_phynum);
 
 	snprintf(tmpstr, sizeof(tmpstr), "dev.mps.%d.max_reqframes", unit);
 	TUNABLE_INT_FETCH(tmpstr, &sc->max_reqframes);
 
 	snprintf(tmpstr, sizeof(tmpstr), "dev.mps.%d.max_prireqframes", unit);
 	TUNABLE_INT_FETCH(tmpstr, &sc->max_prireqframes);
 
 	snprintf(tmpstr, sizeof(tmpstr), "dev.mps.%d.max_replyframes", unit);
 	TUNABLE_INT_FETCH(tmpstr, &sc->max_replyframes);
 
 	snprintf(tmpstr, sizeof(tmpstr), "dev.mps.%d.max_evtframes", unit);
 	TUNABLE_INT_FETCH(tmpstr, &sc->max_evtframes);
 }
 
 /*
  * Register this controller's sysctl nodes.
  *
  * The device's own sysctl context and tree are used when both are
  * available.  Otherwise a private context is initialised in the softc and a
  * node named after the unit number is created under hw.mps; if that node
  * cannot be created the function returns without registering anything.
  */
 static void
 mps_setup_sysctl(struct mps_softc *sc)
 {
 	struct sysctl_ctx_list	*sysctl_ctx = NULL;
 	struct sysctl_oid	*sysctl_tree = NULL;
 	char tmpstr[80], tmpstr2[80];
 
 	/*
 	 * Setup the sysctl variable so the user can change the debug level
 	 * on the fly.
 	 */
 	snprintf(tmpstr, sizeof(tmpstr), "MPS controller %d",
 	    device_get_unit(sc->mps_dev));
 	snprintf(tmpstr2, sizeof(tmpstr2), "%d", device_get_unit(sc->mps_dev));
 
 	sysctl_ctx = device_get_sysctl_ctx(sc->mps_dev);
 	if (sysctl_ctx != NULL)
 		sysctl_tree = device_get_sysctl_tree(sc->mps_dev);
 
 	/* Fall back to a driver-private tree under hw.mps.<unit>. */
 	if (sysctl_tree == NULL) {
 		sysctl_ctx_init(&sc->sysctl_ctx);
 		sc->sysctl_tree = SYSCTL_ADD_NODE(&sc->sysctl_ctx,
 		    SYSCTL_STATIC_CHILDREN(_hw_mps), OID_AUTO, tmpstr2,
 		    CTLFLAG_RD, 0, tmpstr);
 		if (sc->sysctl_tree == NULL)
 			return;
 		sysctl_ctx = &sc->sysctl_ctx;
 		sysctl_tree = sc->sysctl_tree;
 	}
 
 	SYSCTL_ADD_PROC(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
 	    OID_AUTO, "debug_level", CTLTYPE_STRING | CTLFLAG_RW |CTLFLAG_MPSAFE,
 	    sc, 0, mps_debug_sysctl, "A", "mps debug level");
 
 	SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
 	    OID_AUTO, "disable_msix", CTLFLAG_RD, &sc->disable_msix, 0,
 	    "Disable the use of MSI-X interrupts");
 
 	SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
 	    OID_AUTO, "disable_msi", CTLFLAG_RD, &sc->disable_msi, 0,
 	    "Disable the use of MSI interrupts");
 
 	SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
 	    OID_AUTO, "max_msix", CTLFLAG_RD, &sc->max_msix, 0,
 	    "User-defined maximum number of MSIX queues");
 
 	SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
 	    OID_AUTO, "msix_msgs", CTLFLAG_RD, &sc->msi_msgs, 0,
 	    "Negotiated number of MSIX queues");
 
 	SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
 	    OID_AUTO, "max_reqframes", CTLFLAG_RD, &sc->max_reqframes, 0,
 	    "Total number of allocated request frames");
 
 	SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
 	    OID_AUTO, "max_prireqframes", CTLFLAG_RD, &sc->max_prireqframes, 0,
 	    "Total number of allocated high priority request frames");
 
 	SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
 	    OID_AUTO, "max_replyframes", CTLFLAG_RD, &sc->max_replyframes, 0,
 	    "Total number of allocated reply frames");
 
 	SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
 	    OID_AUTO, "max_evtframes", CTLFLAG_RD, &sc->max_evtframes, 0,
 	    "Total number of event frames allocated");
 
 	/*
 	 * The version strings are informational and must be read-only:
 	 * driver_version points at a string literal and the registered
 	 * length leaves no spare room, so a writable node could only
 	 * corrupt them.
 	 */
 	SYSCTL_ADD_STRING(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
 	    OID_AUTO, "firmware_version", CTLFLAG_RD, sc->fw_version,
 	    strlen(sc->fw_version), "firmware version");
 
 	SYSCTL_ADD_STRING(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
 	    OID_AUTO, "driver_version", CTLFLAG_RD, MPS_DRIVER_VERSION,
 	    strlen(MPS_DRIVER_VERSION), "driver version");
 
 	SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
 	    OID_AUTO, "io_cmds_active", CTLFLAG_RD,
 	    &sc->io_cmds_active, 0, "number of currently active commands");
 
 	SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
 	    OID_AUTO, "io_cmds_highwater", CTLFLAG_RD,
 	    &sc->io_cmds_highwater, 0, "maximum active commands seen");
 
 	SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
 	    OID_AUTO, "chain_free", CTLFLAG_RD,
 	    &sc->chain_free, 0, "number of free chain elements");
 
 	SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
 	    OID_AUTO, "chain_free_lowwater", CTLFLAG_RD,
 	    &sc->chain_free_lowwater, 0,"lowest number of free chain elements");
 
 	SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
 	    OID_AUTO, "max_chains", CTLFLAG_RD,
 	    &sc->max_chains, 0,"maximum chain frames that will be allocated");
 
 	SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
 	    OID_AUTO, "max_io_pages", CTLFLAG_RD,
 	    &sc->max_io_pages, 0,"maximum pages to allow per I/O (if <1 use "
 	    "IOCFacts)");
 
 	SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
 	    OID_AUTO, "enable_ssu", CTLFLAG_RW, &sc->enable_ssu, 0,
 	    "enable SSU to SATA SSD/HDD at shutdown");
 
 	SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
 	    OID_AUTO, "chain_alloc_fail", CTLFLAG_RD,
 	    &sc->chain_alloc_fail, "chain allocation failures");
 
 	SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
 	    OID_AUTO, "spinup_wait_time", CTLFLAG_RD,
 	    &sc->spinup_wait_time, DEFAULT_SPINUP_WAIT, "seconds to wait for "
 	    "spinup after SATA ID error");
 
 	SYSCTL_ADD_PROC(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
 	    OID_AUTO, "mapping_table_dump", CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    mps_mapping_dump, "A", "Mapping Table Dump");
 
 	SYSCTL_ADD_PROC(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
 	    OID_AUTO, "encl_table_dump", CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    mps_mapping_encl_dump, "A", "Enclosure Table Dump");
 
 	SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
 	    OID_AUTO, "use_phy_num", CTLFLAG_RD, &sc->use_phynum, 0,
 	    "Use the phy number for enumeration");
 }
 
 /*
  * Table pairing each symbolic debug level name with its flag bit.  It is
  * used both to print the current level and to parse a user-supplied one.
  * The names are string literals, so they are declared const.
  */
 static struct mps_debug_string {
 	const char	*name;	/* keyword accepted in a debug level list */
 	int		flag;	/* MPS_* debug bit the keyword selects */
 } mps_debug_strings[] = {
 	{"info", MPS_INFO},
 	{"fault", MPS_FAULT},
 	{"event", MPS_EVENT},
 	{"log", MPS_LOG},
 	{"recovery", MPS_RECOVERY},
 	{"error", MPS_ERROR},
 	{"init", MPS_INIT},
 	{"xinfo", MPS_XINFO},
 	{"user", MPS_USER},
 	{"mapping", MPS_MAPPING},
 	{"trace", MPS_TRACE}
 };
 
 /*
  * How a parsed debug level list is combined with the current level; chosen
  * by mps_parse_debug() from the first character of the list.
  */
 enum mps_debug_level_combiner {
 	COMB_NONE,	/* no prefix: replace the current level */
 	COMB_ADD,	/* '+' prefix: OR the flags into the current level */
 	COMB_SUB	/* '-' prefix: clear the flags from the current level */
 };
 
 /*
  * Sysctl handler for "debug_level".
  *
  * On read, reports the current level as a hex number followed by the name
  * of every flag that is set.  On write, the new value is passed to
  * mps_parse_debug().  Returns 0 or the error from the sysctl helpers.
  */
 static int
 mps_debug_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	struct mps_softc *sc;
 	struct mps_debug_string *string;
 	struct sbuf *sbuf;
 	char *buffer;
 	size_t i, sz, len;
 	int debug, error;
 
 	sc = (struct mps_softc *)arg1;
 
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0)
 		return (error);
 
 	/* Format the current level: "<hex>[,name]...". */
 	sbuf = sbuf_new_for_sysctl(NULL, NULL, 128, req);
 	debug = sc->mps_debug;
 
 	sbuf_printf(sbuf, "%#x", debug);
 
 	sz = sizeof(mps_debug_strings) / sizeof(mps_debug_strings[0]);
 	for (i = 0; i < sz; i++) {
 		string = &mps_debug_strings[i];
 		if (debug & string->flag)
 			sbuf_printf(sbuf, ",%s", string->name);
 	}
 
 	error = sbuf_finish(sbuf);
 	sbuf_delete(sbuf);
 
 	/* Nothing more to do unless a new value was supplied. */
 	if (error || req->newptr == NULL)
 		return (error);
 
 	len = req->newlen - req->newidx;
 	if (len == 0)
 		return (0);
 
 	/*
 	 * Exactly len bytes are copied in and nothing guarantees they end
 	 * in a NUL, while mps_parse_debug() uses string functions.  The
 	 * extra zeroed byte guarantees termination.
 	 */
 	buffer = malloc(len + 1, M_MPT2, M_ZERO|M_WAITOK);
 	error = SYSCTL_IN(req, buffer, len);
 
 	/* Only act on the input if it was copied in successfully. */
 	if (error == 0)
 		mps_parse_debug(sc, buffer);
 
 	free(buffer, M_MPT2);
 	return (error);
 }
 
 /*
  * Parse a debug level list and apply it to sc->mps_debug.
  *
  * The list is a sequence of tokens separated by ':' or ','.  A token is
  * either a number in any base strtol() accepts or one of the names in
  * mps_debug_strings (matched without regard to case); tokens that are
  * neither are ignored.  A leading '+' ORs the result into the current
  * level, a leading '-' clears it from the current level, and no prefix
  * replaces the level.  The list is modified in place by strsep().
  */
 static void
 mps_parse_debug(struct mps_softc *sc, char *list)
 {
 	struct mps_debug_string *entry;
 	enum mps_debug_level_combiner op;
 	char *token, *endtoken;
 	size_t count;
 	int flags, idx;
 
 	if (list == NULL || *list == '\0')
 		return;
 
 	/* An optional first character selects how the flags are applied. */
 	switch (*list) {
 	case '+':
 		op = COMB_ADD;
 		list++;
 		break;
 	case '-':
 		op = COMB_SUB;
 		list++;
 		break;
 	default:
 		op = COMB_NONE;
 		break;
 	}
 	if (*list == '\0')
 		return;
 
 	count = sizeof(mps_debug_strings) / sizeof(mps_debug_strings[0]);
 	flags = 0;
 	for (token = strsep(&list, ":,"); token != NULL;
 	    token = strsep(&list, ":,")) {
 
 		/* Numeric token: any digits consumed means it was a number. */
 		flags |= strtol(token, &endtoken, 0);
 		if (endtoken != token)
 			continue;
 
 		/* Otherwise look the token up by name. */
 		for (idx = 0; idx < count; idx++) {
 			entry = &mps_debug_strings[idx];
 			if (strcasecmp(token, entry->name) != 0)
 				continue;
 			flags |= entry->flag;
 			break;
 		}
 	}
 
 	if (op == COMB_ADD)
 		sc->mps_debug |= flags;
 	else if (op == COMB_SUB)
 		sc->mps_debug &= (~flags);
 	else if (op == COMB_NONE)
 		sc->mps_debug = flags;
 }
 
 int
 mps_attach(struct mps_softc *sc)
 {
 	int error;
 
 	MPS_FUNCTRACE(sc);
 	mps_dprint(sc, MPS_INIT, "%s entered\n", __func__);
 
 	mtx_init(&sc->mps_mtx, "MPT2SAS lock", NULL, MTX_DEF);
 	callout_init_mtx(&sc->periodic, &sc->mps_mtx, 0);
 	callout_init_mtx(&sc->device_check_callout, &sc->mps_mtx, 0);
 	TAILQ_INIT(&sc->event_list);
 	timevalclear(&sc->lastfail);
 
 	if ((error = mps_transition_ready(sc)) != 0) {
 		mps_dprint(sc, MPS_INIT|MPS_FAULT, "failed to transition "
 		    "ready\n");
 		return (error);
 	}
 
 	sc->facts = malloc(sizeof(MPI2_IOC_FACTS_REPLY), M_MPT2,
 	    M_ZERO|M_NOWAIT);
 	if(!sc->facts) {
 		mps_dprint(sc, MPS_INIT|MPS_FAULT, "Cannot allocate memory, "
 		    "exit\n");
 		return (ENOMEM);
 	}
 
 	/*
 	 * Get IOC Facts and allocate all structures based on this information.
 	 * A Diag Reset will also call mps_iocfacts_allocate and re-read the IOC
 	 * Facts. If relevant values have changed in IOC Facts, this function
 	 * will free all of the memory based on IOC Facts and reallocate that
 	 * memory.  If this fails, any allocated memory should already be freed.
 	 */
 	if ((error = mps_iocfacts_allocate(sc, TRUE)) != 0) {
 		mps_dprint(sc, MPS_INIT|MPS_FAULT, "IOC Facts based allocation "
 		    "failed with error %d, exit\n", error);
 		return (error);
 	}
 
 	/* Start the periodic watchdog check on the IOC Doorbell */
 	mps_periodic(sc);
 
 	/*
 	 * The portenable will kick off discovery events that will drive the
 	 * rest of the initialization process.  The CAM/SAS module will
 	 * hold up the boot sequence until discovery is complete.
 	 */
 	sc->mps_ich.ich_func = mps_startup;
 	sc->mps_ich.ich_arg = sc;
 	if (config_intrhook_establish(&sc->mps_ich) != 0) {
 		mps_dprint(sc, MPS_INIT|MPS_ERROR,
 		    "Cannot establish MPS config hook\n");
 		error = EINVAL;
 	}
 
 	/*
 	 * Allow IR to shutdown gracefully when shutdown occurs.
 	 */
 	sc->shutdown_eh = EVENTHANDLER_REGISTER(shutdown_final,
 	    mpssas_ir_shutdown, sc, SHUTDOWN_PRI_DEFAULT);
 
 	if (sc->shutdown_eh == NULL)
 		mps_dprint(sc, MPS_INIT|MPS_ERROR,
 		    "shutdown event registration failed\n");
 
 	mps_setup_sysctl(sc);
 
 	sc->mps_flags |= MPS_FLAGS_ATTACH_DONE;
 	mps_dprint(sc, MPS_INIT, "%s exit error= %d\n", __func__, error);
 
 	return (error);
 }
 
 /*
  * Config intrhook callback: run the late-start handlers.  With the driver
  * lock held it unmasks interrupts, reads the static config pages, sets up
  * the device mapping tables and starts the SAS layer; it then removes the
  * intrhook.
  */
 static void
 mps_startup(void *arg)
 {
 	struct mps_softc *sc = (struct mps_softc *)arg;
 
 	mps_dprint(sc, MPS_INIT, "%s entered\n", __func__);
 
 	mps_lock(sc);
 	mps_unmask_intr(sc);
 	/* Bring up the device mapping tables before starting SAS. */
 	mps_base_static_config_pages(sc);
 	mps_mapping_initialize(sc);
 	mpssas_startup(sc);
 	mps_unlock(sc);
 
 	mps_dprint(sc, MPS_INIT, "disestablish config intrhook\n");
 	config_intrhook_disestablish(&sc->mps_ich);
 	sc->mps_ich.ich_arg = NULL;
 
 	mps_dprint(sc, MPS_INIT, "%s exit\n", __func__);
 }
 
 /*
  * Periodic watchdog; called with the driver lock already held.  Reads the
  * doorbell register, reinitialises the controller if the IOC reports the
  * FAULT state, and re-arms itself.  Does nothing once the shutdown flag is
  * set.
  */
 static void
 mps_periodic(void *arg)
 {
 	struct mps_softc *sc = (struct mps_softc *)arg;
 	uint32_t doorbell;
 
 	if ((sc->mps_flags & MPS_FLAGS_SHUTDOWN) != 0)
 		return;
 
 	doorbell = mps_regread(sc, MPI2_DOORBELL_OFFSET);
 	if ((doorbell & MPI2_IOC_STATE_MASK) == MPI2_IOC_STATE_FAULT) {
 		mps_dprint(sc, MPS_FAULT, "IOC Fault 0x%08x, Resetting\n",
 		    doorbell);
 		mps_reinit(sc);
 	}
 
 	/* Schedule the next check. */
 	callout_reset(&sc->periodic, MPS_PERIODIC_DELAY * hz, mps_periodic, sc);
 }
 
 /*
  * Event callback for the firmware log events registered by
  * mps_attach_log().  It only reports the events through the debug
  * facility and keeps no state.
  *
  * The event frame comes from the controller in little-endian order, so
  * every multi-byte field is converted before use, as mps_dispatch_event()
  * does for the event code.
  */
 static void
 mps_log_evt_handler(struct mps_softc *sc, uintptr_t data,
     MPI2_EVENT_NOTIFICATION_REPLY *event)
 {
 	MPI2_EVENT_DATA_LOG_ENTRY_ADDED *entry;
 
 	MPS_DPRINT_EVENT(sc, generic, event);
 
 	switch (le16toh(event->Event)) {
 	case MPI2_EVENT_LOG_DATA:
 		mps_dprint(sc, MPS_EVENT, "MPI2_EVENT_LOG_DATA:\n");
 		/*
 		 * EventDataLength counts 32-bit dwords (per the MPI2
 		 * specification), while hexdump() takes a byte count.
 		 */
 		if (sc->mps_debug & MPS_EVENT)
 			hexdump(event->EventData,
 			    le16toh(event->EventDataLength) * 4, NULL, 0);
 		break;
 	case MPI2_EVENT_LOG_ENTRY_ADDED:
 		entry = (MPI2_EVENT_DATA_LOG_ENTRY_ADDED *)event->EventData;
 		mps_dprint(sc, MPS_EVENT, "MPI2_EVENT_LOG_ENTRY_ADDED event "
 		    "0x%x Sequence %d:\n", le16toh(entry->LogEntryQualifier),
 		     le16toh(entry->LogSequence));
 		break;
 	default:
 		break;
 	}
 	return;
 }
 
 /*
  * Register mps_log_evt_handler() for the LOG_DATA and LOG_ENTRY_ADDED
  * events, storing the handle in sc->mps_log_eh.
  *
  * The result of mps_register_events() is not propagated; this function
  * always returns 0.
  */
 static int
 mps_attach_log(struct mps_softc *sc)
 {
 	u32 events[MPI2_EVENT_NOTIFY_EVENTMASK_WORDS];
 
 	/* Clear the whole mask, whatever its size, before setting bits. */
 	bzero(events, sizeof(events));
 	setbit(events, MPI2_EVENT_LOG_DATA);
 	setbit(events, MPI2_EVENT_LOG_ENTRY_ADDED);
 
 	mps_register_events(sc, events, mps_log_evt_handler, NULL,
 	    &sc->mps_log_eh);
 
 	return (0);
 }
 
 /*
  * Deregister the log event handler, if one was registered.  Always
  * returns 0.
  */
 static int
 mps_detach_log(struct mps_softc *sc)
 {
 
 	if (sc->mps_log_eh == NULL)
 		return (0);
 
 	mps_deregister_events(sc, sc->mps_log_eh);
 	return (0);
 }
 
 /*
  * Free all of the driver resources and detach submodules.  Should be called
  * without the lock held.
  */
 int
 mps_free(struct mps_softc *sc)
 {
 	int error;
 
 	mps_dprint(sc, MPS_INIT, "%s entered\n", __func__);
 	/* Turn off the watchdog */
 	mps_lock(sc);
 	sc->mps_flags |= MPS_FLAGS_SHUTDOWN;
 	mps_unlock(sc);
 	/* Lock must not be held for this */
 	callout_drain(&sc->periodic);
 	callout_drain(&sc->device_check_callout);
 
 	if (((error = mps_detach_log(sc)) != 0) ||
 	    ((error = mps_detach_sas(sc)) != 0)) {
 		mps_dprint(sc, MPS_INIT|MPS_FAULT, "failed to detach "
 		    "subsystems, exit\n");
 		return (error);
 	}
 
 	mps_detach_user(sc);
 
 	/* Put the IOC back in the READY state. */
 	mps_lock(sc);
 	if ((error = mps_transition_ready(sc)) != 0) {
 		mps_unlock(sc);
 		return (error);
 	}
 	mps_unlock(sc);
 
 	if (sc->facts != NULL)
 		free(sc->facts, M_MPT2);
 
 	/*
 	 * Free all buffers that are based on IOC Facts.  A Diag Reset may need
 	 * to free these buffers too.
 	 */
 	mps_iocfacts_free(sc);
 
 	if (sc->sysctl_tree != NULL)
 		sysctl_ctx_free(&sc->sysctl_ctx);
 
 	/* Deregister the shutdown function */
 	if (sc->shutdown_eh != NULL)
 		EVENTHANDLER_DEREGISTER(shutdown_final, sc->shutdown_eh);
 
 	mtx_destroy(&sc->mps_mtx);
 	mps_dprint(sc, MPS_INIT, "%s exit\n", __func__);
 
 	return (0);
 }
 
 /*
  * Finish a command that the controller has replied to: mark polled
  * commands complete, run the completion callback if one is set, wake any
  * sleeper that asked for a wakeup, and decrement the active command count.
  * A NULL command is logged and ignored.
  */
 static __inline void
 mps_complete_command(struct mps_softc *sc, struct mps_command *cm)
 {
 	MPS_FUNCTRACE(sc);
 
 	if (cm == NULL) {
 		mps_dprint(sc, MPS_ERROR, "Completing NULL command\n");
 		return;
 	}
 
 	if ((cm->cm_flags & MPS_CM_FLAGS_POLLED) != 0)
 		cm->cm_flags |= MPS_CM_FLAGS_COMPLETE;
 
 	if (cm->cm_complete != NULL) {
 		mps_dprint(sc, MPS_TRACE,
 		    "%s cm %p calling cm_complete %p data %p reply %p\n",
 		    __func__, cm, cm->cm_complete, cm->cm_complete_data,
 		    cm->cm_reply);
 		cm->cm_complete(sc, cm);
 	}
 
 	if ((cm->cm_flags & MPS_CM_FLAGS_WAKEUP) != 0) {
 		mps_dprint(sc, MPS_TRACE, "waking up %p\n", cm);
 		wakeup(cm);
 	}
 
 	/* Never let the active counter drop below zero. */
 	if (cm->cm_sc->io_cmds_active == 0) {
 		mps_dprint(sc, MPS_ERROR, "Warning: io_cmds_active is "
 		    "out of sync - resynching to 0\n");
 		return;
 	}
 	cm->cm_sc->io_cmds_active--;
 }
 
 
 /*
  * Decode an IOCLogInfo word and report it through the MPS_LOG debug
  * level.
  *
  * The word is split into bus type (top 4 bits), originator (4 bits), code
  * (8 bits) and subcode (low 16 bits).  Only SAS entries (bus type 3) are
  * reported, and the nexus loss and task abort loginfos are dropped.
  */
 static void
 mps_sas_log_info(struct mps_softc *sc , u32 log_info)
 {
 	union loginfo_type {
 		u32     loginfo;
 		struct {
 			u32     subcode:16;
 			u32     code:8;
 			u32     originator:4;
 			u32     bus_type:4;
 		} dw;
 	};
 	union loginfo_type sas_loginfo;
 	const char *originator_str;
 
 	sas_loginfo.loginfo = log_info;
 	if (sas_loginfo.dw.bus_type != 3 /*SAS*/)
 		return;
 
 	/* each nexus loss loginfo */
 	if (log_info == 0x31170000)
 		return;
 
 	/*
 	 * Eat the loginfos associated with task aborts.  All three values
 	 * are hexadecimal; a decimal 30050000 has bus type 0 and could
 	 * never reach this point.
 	 */
 	if ((log_info == 0x30050000 || log_info ==
 	    0x31140000 || log_info == 0x31130000))
 		return;
 
 	switch (sas_loginfo.dw.originator) {
 	case 0:
 		originator_str = "IOP";
 		break;
 	case 1:
 		originator_str = "PL";
 		break;
 	case 2:
 		originator_str = "IR";
 		break;
 	default:
 		/* Do not hand a NULL pointer to the %s conversion. */
 		originator_str = "Unknown";
 		break;
 	}
 
 	mps_dprint(sc, MPS_LOG, "log_info(0x%08x): originator(%s), "
 	    "code(0x%02x), sub_code(0x%04x)\n", log_info,
 	    originator_str, sas_loginfo.dw.code,
 	    sas_loginfo.dw.subcode);
 }
 
 /*
  * If the reply frame's IOCStatus says log info is available, decode and
  * report its IOCLogInfo word.
  */
 static void
 mps_display_reply_info(struct mps_softc *sc, uint8_t *reply)
 {
 	MPI2DefaultReply_t *frame = (MPI2DefaultReply_t*)reply;
 	u16 ioc_status = le16toh(frame->IOCStatus);
 
 	if ((ioc_status & MPI2_IOCSTATUS_FLAG_LOG_INFO_AVAILABLE) == 0)
 		return;
 
 	mps_sas_log_info(sc, le32toh(frame->IOCLogInfo));
 }
 /*
  * Interrupt entry point that first checks the interrupt status register.
  * Replies are processed, under the driver lock, only when the register
  * shows a reply descriptor interrupt pending.
  */
 void
 mps_intr(void *data)
 {
 	struct mps_softc *sc = (struct mps_softc *)data;
 	uint32_t his;
 
 	mps_dprint(sc, MPS_TRACE, "%s\n", __func__);
 
 	/*
 	 * Reading the interrupt status register flushes the bus.  This is
 	 * required both for INTx interrupts and for driver-driven polling.
 	 */
 	his = mps_regread(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET);
 	if ((his & MPI2_HIS_REPLY_DESCRIPTOR_INTERRUPT) != 0) {
 		mps_lock(sc);
 		mps_intr_locked(data);
 		mps_unlock(sc);
 	}
 }
 
 /*
  * MSI/MSI-X interrupt entry point.  In theory these interrupts do not
  * require reading any chip register first, so the replies are processed
  * straight away under the driver lock.  Hopefully that theory holds.
  */
 void
 mps_intr_msi(void *data)
 {
 	struct mps_softc *sc = (struct mps_softc *)data;
 
 	mps_dprint(sc, MPS_TRACE, "%s\n", __func__);
 
 	mps_lock(sc);
 	mps_intr_locked(data);
 	mps_unlock(sc);
 }
 
 /*
  * Drain the reply post queue and complete the commands it refers to.
  *
  * Starting at sc->replypostindex, each used descriptor is decoded, the
  * matching command (if any) is completed, and the descriptor is reset to
  * all ones.  The loop ends at the first unused descriptor.  If anything
  * was consumed, the new index is written to the reply post host index
  * register.
  *
  * The locking is overly broad and simplistic, but easy to deal with for now.
  */
 void
 mps_intr_locked(void *data)
 {
 	MPI2_REPLY_DESCRIPTORS_UNION *desc;
 	struct mps_softc *sc;
 	struct mps_command *cm = NULL;
 	uint8_t flags;
 	u_int pq;
 	MPI2_DIAG_RELEASE_REPLY *rel_rep;
 	mps_fw_diagnostic_buffer_t *pBuffer;
 
 	sc = (struct mps_softc *)data;
 
 	/* Remember where we started so we can tell if anything was consumed. */
 	pq = sc->replypostindex;
 	mps_dprint(sc, MPS_TRACE,
 	    "%s sc %p starting with replypostindex %u\n", 
 	    __func__, sc, sc->replypostindex);
 
 	for ( ;; ) {
 		cm = NULL;
 		desc = &sc->post_queue[sc->replypostindex];
 		flags = desc->Default.ReplyFlags &
 		    MPI2_RPY_DESCRIPT_FLAGS_TYPE_MASK;
 		/* Stop at the first descriptor that is unused or still all ones. */
 		if ((flags == MPI2_RPY_DESCRIPT_FLAGS_UNUSED)
 		 || (le32toh(desc->Words.High) == 0xffffffff))
 			break;
 
 		/* increment the replypostindex now, so that event handlers
 		 * and cm completion handlers which decide to do a diag
 		 * reset can zero it without it getting incremented again
 		 * afterwards, and we break out of this loop on the next
 		 * iteration since the reply post queue has been cleared to
 		 * 0xFF and all descriptors look unused (which they are).
 		 */
 		if (++sc->replypostindex >= sc->pqdepth)
 			sc->replypostindex = 0;
 
 		switch (flags) {
 		case MPI2_RPY_DESCRIPT_FLAGS_SCSI_IO_SUCCESS:
 			/*
 			 * Success descriptor: the command is found by SMID
 			 * and has no reply frame.
 			 * NOTE(review): the SMID is used as an index without
 			 * a range check against the commands array.
 			 */
 			cm = &sc->commands[le16toh(desc->SCSIIOSuccess.SMID)];
 			cm->cm_reply = NULL;
 			break;
 		case MPI2_RPY_DESCRIPT_FLAGS_ADDRESS_REPLY:
 		{
 			uint32_t baddr;
 			uint8_t *reply;
 
 			/*
 			 * Re-compose the reply address from the address
 			 * sent back from the chip.  The ReplyFrameAddress
 			 * is the lower 32 bits of the physical address of
 			 * particular reply frame.  Convert that address to
 			 * host format, and then use that to provide the
 			 * offset against the virtual address base
 			 * (sc->reply_frames).
 			 */
 			baddr = le32toh(desc->AddressReply.ReplyFrameAddress);
 			reply = sc->reply_frames +
 				(baddr - ((uint32_t)sc->reply_busaddr));
 			/*
 			 * Make sure the reply we got back is in a valid
 			 * range.  If not, go ahead and panic here, since
 			 * we'll probably panic as soon as we dereference the
 			 * reply pointer anyway.
 			 *
 			 * NOTE(review): the upper bound is tested with '>',
 			 * so a pointer exactly one past the end of the range
 			 * passes this check - confirm whether '>=' was meant.
 			 */
 			if ((reply < sc->reply_frames)
 			 || (reply > (sc->reply_frames +
 			     (sc->fqdepth * sc->facts->ReplyFrameSize * 4)))) {
 				printf("%s: WARNING: reply %p out of range!\n",
 				       __func__, reply);
 				printf("%s: reply_frames %p, fqdepth %d, "
 				       "frame size %d\n", __func__,
 				       sc->reply_frames, sc->fqdepth,
 				       sc->facts->ReplyFrameSize * 4);
 				printf("%s: baddr %#x,\n", __func__, baddr);
 				/* LSI-TODO. See Linux Code. Need Graceful exit*/
 				panic("Reply address out of range");
 			}
 			/*
 			 * SMID 0 means the reply belongs to no command: it is
 			 * either a diag buffer release notice or an event.
 			 */
 			if (le16toh(desc->AddressReply.SMID) == 0) {
 				if (((MPI2_DEFAULT_REPLY *)reply)->Function ==
 				    MPI2_FUNCTION_DIAG_BUFFER_POST) {
 					/*
 					 * If SMID is 0 for Diag Buffer Post,
 					 * this implies that the reply is due to
 					 * a release function with a status that
 					 * the buffer has been released.  Set
 					 * the buffer flags accordingly.
 					 */
 					rel_rep =
 					    (MPI2_DIAG_RELEASE_REPLY *)reply;
 					if ((le16toh(rel_rep->IOCStatus) &
 					    MPI2_IOCSTATUS_MASK) ==
 					    MPI2_IOCSTATUS_DIAGNOSTIC_RELEASED)
 					{
 						pBuffer =
 						    &sc->fw_diag_buffer_list[
 						    rel_rep->BufferType];
 						pBuffer->valid_data = TRUE;
 						pBuffer->owned_by_firmware =
 						    FALSE;
 						pBuffer->immediate = FALSE;
 					}
 				} else
 					mps_dispatch_event(sc, baddr,
 					    (MPI2_EVENT_NOTIFICATION_REPLY *)
 					    reply);
 			} else {
 				/* Attach the reply frame to its command. */
 				cm = &sc->commands[le16toh(desc->AddressReply.SMID)];
 				cm->cm_reply = reply;
 				cm->cm_reply_data =
 				    le32toh(desc->AddressReply.ReplyFrameAddress);
 			}
 			break;
 		}
 		case MPI2_RPY_DESCRIPT_FLAGS_TARGETASSIST_SUCCESS:
 		case MPI2_RPY_DESCRIPT_FLAGS_TARGET_COMMAND_BUFFER:
 		case MPI2_RPY_DESCRIPT_FLAGS_RAID_ACCELERATOR_SUCCESS:
 		default:
 			/* Unhandled */
 			mps_dprint(sc, MPS_ERROR, "Unhandled reply 0x%x\n",
 			    desc->Default.ReplyFlags);
 			cm = NULL;
 			break;
 		}
 		
 
 		if (cm != NULL) {
 			// Print Error reply frame
 			if (cm->cm_reply)
 				mps_display_reply_info(sc,cm->cm_reply);
 			mps_complete_command(sc, cm);
 		}
 
 		/* Mark the descriptor as consumed (all ones). */
 		desc->Words.Low = 0xffffffff;
 		desc->Words.High = 0xffffffff;
 	}
 
 	/* Only touch the host index register if the index moved. */
 	if (pq != sc->replypostindex) {
 		mps_dprint(sc, MPS_TRACE,
 		    "%s sc %p writing postindex %d\n",
 		    __func__, sc, sc->replypostindex);
 		mps_regwrite(sc, MPI2_REPLY_POST_HOST_INDEX_OFFSET, sc->replypostindex);
 	}
 
 	return;
 }
 
 /*
  * Deliver an event notification reply to every registered handler whose
  * mask contains the event, then release the reply frame.
  *
  * data is forwarded to the callbacks and to mps_free_reply(); reply is the
  * event frame itself.
  */
 static void
 mps_dispatch_event(struct mps_softc *sc, uintptr_t data,
     MPI2_EVENT_NOTIFICATION_REPLY *reply)
 {
 	struct mps_event_handle *eh;
 	int event, handled = 0;
 
 	/* Convert the event code to host order once and use it from here on. */
 	event = le16toh(reply->Event);
 	TAILQ_FOREACH(eh, &sc->event_list, eh_list) {
 		if (isset(eh->mask, event)) {
 			eh->callback(sc, data, reply);
 			handled++;
 		}
 	}
 
 	/* event is already in host order; converting again would corrupt it. */
 	if (handled == 0)
 		mps_dprint(sc, MPS_EVENT, "Unhandled event 0x%x\n", event);
 
 	/*
 	 * This is the only place that the event/reply should be freed.
 	 * Anything wanting to hold onto the event data should have
 	 * already copied it into their own storage.
 	 */
 	mps_free_reply(sc, data);
 }
 
 /*
  * Completion handler for an event re-registration command: trace the
  * reply if there is one, release the command, then start the SAS layer,
  * which sends the port enable.
  */
 static void
 mps_reregister_events_complete(struct mps_softc *sc, struct mps_command *cm)
 {
 	mps_dprint(sc, MPS_TRACE, "%s\n", __func__);
 
 	if (cm->cm_reply != NULL) {
 		MPS_DPRINT_EVENT(sc, generic,
 		    (MPI2_EVENT_NOTIFICATION_REPLY *)cm->cm_reply);
 	}
 
 	mps_free_command(sc, cm);
 
 	/* Next step: send a port enable. */
 	mpssas_startup(sc);
 }
 
 /*
  * For both register_events and update_events, the caller supplies a bitmap
  * of events that it _wants_.  These functions then turn that into a bitmask
  * suitable for the controller.
  */
 int
 mps_register_events(struct mps_softc *sc, u32 *mask,
     mps_evt_callback_t *cb, void *data, struct mps_event_handle **handle)
 {
 	struct mps_event_handle *eh;
 	int error;
 
 	/*
 	 * Create a handle for the callback and append it to the softc's
 	 * event list.  The handle is stored through *handle even when
 	 * updating the event mask fails.
 	 */
 	eh = malloc(sizeof(*eh), M_MPT2, M_WAITOK|M_ZERO);
 	if (eh == NULL) {
 		mps_dprint(sc, MPS_ERROR, "Cannot allocate event memory\n");
 		return (ENOMEM);
 	}
 	eh->callback = cb;
 	eh->data = data;
 	TAILQ_INSERT_TAIL(&sc->event_list, eh, eh_list);
 
 	/* With no mask supplied there is nothing to update. */
 	error = (mask != NULL) ? mps_update_events(sc, eh, mask) : 0;
 	*handle = eh;
 
 	return (error);
 }
 
 /*
  * Record the events wanted by 'handle' (when 'mask' is given), derive the
  * controller event mask from the handle's wanted-events bitmap, send it in
  * an event-notification request and wait for that request to finish.
  *
  * sc     - adapter softc; sc->event_mask is rewritten.
  * handle - event handle whose mask is used; must not be NULL.
  * mask   - optional new bitmap of wanted events copied into the handle.
  *
  * Returns 0 on success, EINVAL for a NULL handle, EBUSY when no command
  * can be allocated, ENXIO when no reply was obtained or the reply's
  * IOCStatus is not success; otherwise the value from mps_wait_command().
  */
 int
 mps_update_events(struct mps_softc *sc, struct mps_event_handle *handle,
     u32 *mask)
 {
 	MPI2_EVENT_NOTIFICATION_REQUEST *evtreq;
 	MPI2_EVENT_NOTIFICATION_REPLY *reply = NULL;
 	struct mps_command *cm;
 	int error, i;
 
 	mps_dprint(sc, MPS_TRACE, "%s\n", __func__);
 
 	/*
 	 * handle->mask is read unconditionally below, so reject a NULL
 	 * handle up front instead of dereferencing it.
 	 */
 	if (handle == NULL)
 		return (EINVAL);
 
 	if (mask != NULL)
 		bcopy(mask, &handle->mask[0], sizeof(u32) *
 		    MPI2_EVENT_NOTIFY_EVENTMASK_WORDS);
 
 	/* Start with every bit set, then clear the bits that are wanted. */
 	for (i = 0; i < MPI2_EVENT_NOTIFY_EVENTMASK_WORDS; i++)
 		sc->event_mask[i] = -1;
 
 	for (i = 0; i < MPI2_EVENT_NOTIFY_EVENTMASK_WORDS; i++)
 		sc->event_mask[i] &= ~handle->mask[i];
 
 	if ((cm = mps_alloc_command(sc)) == NULL)
 		return (EBUSY);
 	evtreq = (MPI2_EVENT_NOTIFICATION_REQUEST *)cm->cm_req;
 	evtreq->Function = MPI2_FUNCTION_EVENT_NOTIFICATION;
 	evtreq->MsgFlags = 0;
 	evtreq->SASBroadcastPrimitiveMasks = 0;
 #ifdef MPS_DEBUG_ALL_EVENTS
 	{
 		/* Debug build: send an all-zero mask instead. */
 		u_char fullmask[16];
 		memset(fullmask, 0x00, 16);
 		bcopy(fullmask, &evtreq->EventMasks[0], sizeof(u32) *
 		    MPI2_EVENT_NOTIFY_EVENTMASK_WORDS);
 	}
 #else
 	for (i = 0; i < MPI2_EVENT_NOTIFY_EVENTMASK_WORDS; i++)
 		evtreq->EventMasks[i] = htole32(sc->event_mask[i]);
 #endif
 	cm->cm_desc.Default.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_DEFAULT_TYPE;
 	cm->cm_data = NULL;
 
 	/* mps_wait_command() may set cm to NULL if it freed the command. */
 	error = mps_wait_command(sc, &cm, 60, 0);
 	if (cm != NULL)
 		reply = (MPI2_EVENT_NOTIFICATION_REPLY *)cm->cm_reply;
 	/*
 	 * Reply fields are little-endian, like reply->Event in
 	 * mps_dispatch_event(); convert before masking.
 	 */
 	if ((reply == NULL) ||
 	    (le16toh(reply->IOCStatus) & MPI2_IOCSTATUS_MASK) !=
 	    MPI2_IOCSTATUS_SUCCESS)
 		error = ENXIO;
 
 	if (reply)
 		MPS_DPRINT_EVENT(sc, generic, reply);
 
 	mps_dprint(sc, MPS_TRACE, "%s finished error %d\n", __func__, error);
 
 	if (cm != NULL)
 		mps_free_command(sc, cm);
 	return (error);
 }
 
 static int
 mps_reregister_events(struct mps_softc *sc)
 {
 	MPI2_EVENT_NOTIFICATION_REQUEST *evtreq;
 	struct mps_command *cm;
 	struct mps_event_handle *eh;
 	int error, i;
 
 	mps_dprint(sc, MPS_TRACE, "%s\n", __func__);
 
 	/* first, reregister events */
 
 	for (i = 0; i < MPI2_EVENT_NOTIFY_EVENTMASK_WORDS; i++)
 		sc->event_mask[i] = -1;
 
 	TAILQ_FOREACH(eh, &sc->event_list, eh_list) {
 		for (i = 0; i < MPI2_EVENT_NOTIFY_EVENTMASK_WORDS; i++)
 			sc->event_mask[i] &= ~eh->mask[i];
 	}
 
 	if ((cm = mps_alloc_command(sc)) == NULL)
 		return (EBUSY);
 	evtreq = (MPI2_EVENT_NOTIFICATION_REQUEST *)cm->cm_req;
 	evtreq->Function = MPI2_FUNCTION_EVENT_NOTIFICATION;
 	evtreq->MsgFlags = 0;
 	evtreq->SASBroadcastPrimitiveMasks = 0;
 #ifdef MPS_DEBUG_ALL_EVENTS
 	{
 		u_char fullmask[16];
 		memset(fullmask, 0x00, 16);
 		bcopy(fullmask, &evtreq->EventMasks[0], sizeof(u32) *
 			MPI2_EVENT_NOTIFY_EVENTMASK_WORDS);
 	}
 #else
         for (i = 0; i < MPI2_EVENT_NOTIFY_EVENTMASK_WORDS; i++)
                 evtreq->EventMasks[i] =
                     htole32(sc->event_mask[i]);
 #endif
 	cm->cm_desc.Default.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_DEFAULT_TYPE;
 	cm->cm_data = NULL;
 	cm->cm_complete = mps_reregister_events_complete;
 
 	error = mps_map_command(sc, cm);
 
 	mps_dprint(sc, MPS_TRACE, "%s finished with error %d\n", __func__,
 	    error);
 	return (error);
 }
 
 /*
  * Unlink 'handle' from sc->event_list and free it.  The controller's event
  * mask is not recomputed or resent here.
  */
 void
 mps_deregister_events(struct mps_softc *sc, struct mps_event_handle *handle)
 {
 
 	TAILQ_REMOVE(&sc->event_list, handle, eh_list);
 	free(handle, M_MPT2);
 }
 
 /*
  * Add a chain element as the next SGE for the specified command.
  * Reset cm_sge and cm_sglsize to indicate all the available space.
  *
  * Returns 0 on success or ENOBUFS when mps_alloc_chain() has no chain
  * frame to give out.  Panics when the current SGL space cannot hold even
  * a chain element.
  */
 static int
 mps_add_chain(struct mps_command *cm)
 {
 	MPI2_SGE_CHAIN32 *sgc;
 	struct mps_chain *chain;
 	int space;
 
 	/* The chain element itself is written at cm_sge; it must fit. */
 	if (cm->cm_sglsize < MPS_SGC_SIZE)
 		panic("MPS: Need SGE Error Code\n");
 
 	chain = mps_alloc_chain(cm->cm_sc);
 	if (chain == NULL)
 		return (ENOBUFS);
 
 	/*
 	 * Space available in the new chain frame.
 	 * NOTE(review): the "* 4" presumably converts IOCRequestFrameSize
 	 * from 32-bit words to bytes -- confirm against the IOC facts
 	 * definition.
 	 */
 	space = (int)cm->cm_sc->facts->IOCRequestFrameSize * 4;
 
 	/*
 	 * Note: a double-linked list is used to make it easier to
 	 * walk for debugging.
 	 */
 	TAILQ_INSERT_TAIL(&cm->cm_chain_list, chain, chain_link);
 
 	/* Fill in the chain element at the current SGE position. */
 	sgc = (MPI2_SGE_CHAIN32 *)&cm->cm_sge->MpiChain;
 	sgc->Length = htole16(space);
 	sgc->NextChainOffset = 0;
 	/*
 	 * TODO: this looks like a bug in how sgc->Flags is set; one might
 	 * expect
 	 *	sgc->Flags = (MPI2_SGE_FLAGS_CHAIN_ELEMENT |
 	 *	    MPI2_SGE_FLAGS_64_BIT_ADDRESSING |
 	 *	    MPI2_SGE_FLAGS_SYSTEM_ADDRESS) << MPI2_SGE_FLAGS_SHIFT
 	 * It is fine as written because this is not a simple element: in
 	 * MPI2_SGE_CHAIN32 the Length and Flags fields are separate.
 	 */
 	sgc->Flags = MPI2_SGE_FLAGS_CHAIN_ELEMENT;
 	sgc->Address = htole32(chain->chain_busaddr);
 
 	/* Subsequent SGEs are written into the new chain frame. */
 	cm->cm_sge = (MPI2_SGE_IO_UNION *)&chain->chain->MpiSimple;
 	cm->cm_sglsize = space;
 	return (0);
 }
 
 /*
  * Add one scatter-gather element (chain, simple, transaction context)
  * to the scatter-gather list for a command.  Maintain cm_sglsize and
  * cm_sge as the remaining size and pointer to the next SGE to fill
  * in, respectively.
  *
  * cm       - command whose SGL is being built.
  * sgep     - element to copy into the list; a simple element may have its
  *            FlagsLength (and, for the final element, address) rewritten
  *            in place before it is copied.
  * len      - size of the element in bytes.
  * segsleft - number of elements still to be added, counting this one
  *            (mps_data_cb() passes nsegs - i).
  *
  * Returns 0 on success or the error from mps_add_chain().
  */
 int
 mps_push_sge(struct mps_command *cm, void *sgep, size_t len, int segsleft)
 {
 	MPI2_SGE_TRANSACTION_UNION *tc = sgep;
 	MPI2_SGE_SIMPLE64 *sge = sgep;
 	int error, type;
 	uint32_t saved_buf_len, saved_address_low, saved_address_high;
 
 	/* Both views alias sgep; the element type selects which one applies. */
 	type = (tc->Flags & MPI2_SGE_FLAGS_ELEMENT_MASK);
 
 #ifdef INVARIANTS
 	/* Sanity-check that 'len' matches the element type being pushed. */
 	switch (type) {
 	case MPI2_SGE_FLAGS_TRANSACTION_ELEMENT: {
 		if (len != tc->DetailsLength + 4)
 			panic("TC %p length %u or %zu?", tc,
 			    tc->DetailsLength + 4, len);
 		}
 		break;
 	case MPI2_SGE_FLAGS_CHAIN_ELEMENT:
 		/* Driver only uses 32-bit chain elements */
 		if (len != MPS_SGC_SIZE)
 			panic("CHAIN %p length %u or %zu?", sgep,
 			    MPS_SGC_SIZE, len);
 		break;
 	case MPI2_SGE_FLAGS_SIMPLE_ELEMENT:
 		/* Driver only uses 64-bit SGE simple elements */
 		if (len != MPS_SGE64_SIZE)
 			panic("SGE simple %p length %u or %zu?", sge,
 			    MPS_SGE64_SIZE, len);
 		if (((le32toh(sge->FlagsLength) >> MPI2_SGE_FLAGS_SHIFT) &
 		    MPI2_SGE_FLAGS_ADDRESS_SIZE) == 0)
 			panic("SGE simple %p not marked 64-bit?", sge);
 
 		break;
 	default:
 		panic("Unexpected SGE %p, flags %02x", tc, tc->Flags);
 	}
 #endif
 
 	/*
 	 * case 1: 1 more segment, enough room for it
 	 * case 2: 2 more segments, enough room for both
 	 * case 3: >=2 more segments, only enough room for 1 and a chain
 	 * case 4: >=1 more segment, enough room for only a chain
 	 * case 5: >=1 more segment, no room for anything (error)
 	 */
 
 	/*
 	 * There should be room for at least a chain element, or this
 	 * code is buggy.  Case (5).
 	 */
 	if (cm->cm_sglsize < MPS_SGC_SIZE)
 		panic("MPS: Need SGE Error Code\n");
 
 	if (segsleft >= 2 &&
 	    cm->cm_sglsize < len + MPS_SGC_SIZE + MPS_SGE64_SIZE) {
 		/*
 		 * There are 2 or more segments left to add, and only
 		 * enough room for 1 and a chain.  Case (3).
 		 *
 		 * Mark as last element in this chain if necessary.
 		 */
 		if (type == MPI2_SGE_FLAGS_SIMPLE_ELEMENT) {
 			sge->FlagsLength |= htole32(
 			    MPI2_SGE_FLAGS_LAST_ELEMENT << MPI2_SGE_FLAGS_SHIFT);
 		}
 
 		/*
 		 * Add the item then a chain.  Do the chain now,
 		 * rather than on the next iteration, to simplify
 		 * understanding the code.
 		 */
 		cm->cm_sglsize -= len;
 		bcopy(sgep, cm->cm_sge, len);
 		cm->cm_sge = (MPI2_SGE_IO_UNION *)((uintptr_t)cm->cm_sge + len);
 		return (mps_add_chain(cm));
 	}
 
 	if (segsleft >= 1 && cm->cm_sglsize < len + MPS_SGC_SIZE) {
 		/*
 		 * 1 or more segment, enough room for only a chain.
 		 * Hope the previous element wasn't a Simple entry
 		 * that needed to be marked with
 		 * MPI2_SGE_FLAGS_LAST_ELEMENT.  Case (4).
 		 */
 		if ((error = mps_add_chain(cm)) != 0)
 			return (error);
 	}
 
 #ifdef INVARIANTS
 	/* Case 1: 1 more segment, enough room for it. */
 	if (segsleft == 1 && cm->cm_sglsize < len)
 		panic("1 seg left and no room? %u versus %zu",
 		    cm->cm_sglsize, len);
 
 	/* Case 2: 2 more segments, enough room for both */
 	if (segsleft == 2 && cm->cm_sglsize < len + MPS_SGE64_SIZE)
 		panic("2 segs left and no room? %u versus %zu",
 		    cm->cm_sglsize, len);
 #endif
 
 	if (segsleft == 1 && type == MPI2_SGE_FLAGS_SIMPLE_ELEMENT) {
 		/*
 		 * If this is a bi-directional request, need to account for that
 		 * here.  Save the pre-filled sge values.  These will be used
 		 * either for the 2nd SGL or for a single direction SGL.  If
 		 * cm_out_len is non-zero, this is a bi-directional request, so
 		 * fill in the OUT SGL first, then the IN SGL, otherwise just
 		 * fill in the IN SGL.  Note that at this time, when filling in
 		 * 2 SGL's for a bi-directional request, they both use the same
 		 * DMA buffer (same cm command).
 		 */
 		/* Low 24 bits of FlagsLength are kept as the buffer length. */
 		saved_buf_len = le32toh(sge->FlagsLength) & 0x00FFFFFF;
 		saved_address_low = sge->Address.Low;
 		saved_address_high = sge->Address.High;
 		if (cm->cm_out_len) {
 			/* OUT element: length cm_out_len, host-to-IOC. */
 			sge->FlagsLength = htole32(cm->cm_out_len |
 			    ((uint32_t)(MPI2_SGE_FLAGS_SIMPLE_ELEMENT |
 			    MPI2_SGE_FLAGS_END_OF_BUFFER |
 			    MPI2_SGE_FLAGS_HOST_TO_IOC |
 			    MPI2_SGE_FLAGS_64_BIT_ADDRESSING) <<
 			    MPI2_SGE_FLAGS_SHIFT));
 			cm->cm_sglsize -= len;
 			bcopy(sgep, cm->cm_sge, len);
 			cm->cm_sge = (MPI2_SGE_IO_UNION *)((uintptr_t)cm->cm_sge
 			    + len);
 		}
 		/* Final element: mark end of buffer, chain segment and list. */
 		saved_buf_len |=
 		    ((uint32_t)(MPI2_SGE_FLAGS_SIMPLE_ELEMENT |
 		    MPI2_SGE_FLAGS_END_OF_BUFFER |
 		    MPI2_SGE_FLAGS_LAST_ELEMENT |
 		    MPI2_SGE_FLAGS_END_OF_LIST |
 		    MPI2_SGE_FLAGS_64_BIT_ADDRESSING) <<
 		    MPI2_SGE_FLAGS_SHIFT);
 		if (cm->cm_flags & MPS_CM_FLAGS_DATAIN) {
 			saved_buf_len |=
 			    ((uint32_t)(MPI2_SGE_FLAGS_IOC_TO_HOST) <<
 			    MPI2_SGE_FLAGS_SHIFT);
 		} else {
 			saved_buf_len |=
 			    ((uint32_t)(MPI2_SGE_FLAGS_HOST_TO_IOC) <<
 			    MPI2_SGE_FLAGS_SHIFT);
 		}
 		/* Rewrite the caller's element with the final flags/address. */
 		sge->FlagsLength = htole32(saved_buf_len);
 		sge->Address.Low = saved_address_low;
 		sge->Address.High = saved_address_high;
 	}
 
 	/* Common tail: copy the element in and advance the SGL cursor. */
 	cm->cm_sglsize -= len;
 	bcopy(sgep, cm->cm_sge, len);
 	cm->cm_sge = (MPI2_SGE_IO_UNION *)((uintptr_t)cm->cm_sge + len);
 	return (0);
 }
 
 /*
  * Describe one DMA segment (bus address 'pa', 'len' bytes) as a simple
  * 64-bit SGE carrying 'flags' and append it to the command's
  * scatter-gather list.  Returns whatever mps_push_sge() returns.
  */
 int
 mps_add_dmaseg(struct mps_command *cm, vm_paddr_t pa, size_t len, u_int flags,
     int segsleft)
 {
 	MPI2_SGE_SIMPLE64 sge;
 
 	bzero(&sge, sizeof(sge));
 
 	/* This driver always uses 64-bit address elements for simplicity. */
 	flags |= MPI2_SGE_FLAGS_SIMPLE_ELEMENT;
 	flags |= MPI2_SGE_FLAGS_64_BIT_ADDRESSING;
 
 	sge.FlagsLength = htole32(len | (flags << MPI2_SGE_FLAGS_SHIFT));
 	mps_from_u64(pa, &sge.Address);
 
 	return (mps_push_sge(cm, &sge, sizeof(sge), segsleft));
 }
 
 /*
  * busdma load callback: turn the DMA segment list into SGEs for the
  * command passed in 'arg', sync the map and enqueue the request.  When an
  * SGE cannot be added the command is flagged MPS_CM_FLAGS_CHAIN_FAILED and
  * completed instead of being enqueued.
  *
  * NOTE(review): the incoming 'error' argument is never examined; the
  * variable is only reused for the mps_add_dmaseg() result.  Confirm that
  * a failed load cannot reach this callback with invalid 'segs'.
  */
 static void
 mps_data_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
 {
 	struct mps_softc *sc;
 	struct mps_command *cm;
 	u_int i, dir, sflags;
 
 	cm = (struct mps_command *)arg;
 	sc = cm->cm_sc;
 
 	/*
 	 * In this case, just print out a warning and let the chip tell the
 	 * user they did the wrong thing.
 	 */
 	if ((cm->cm_max_segs != 0) && (nsegs > cm->cm_max_segs)) {
 		mps_dprint(sc, MPS_ERROR,
 			   "%s: warning: busdma returned %d segments, "
 			   "more than the %d allowed\n", __func__, nsegs,
 			   cm->cm_max_segs);
 	}
 
 	/*
 	 * Set up DMA direction flags.  Bi-directional requests are also handled
 	 * here.  In that case, both direction flags will be set.
 	 */
 	sflags = 0;
 	if (cm->cm_flags & MPS_CM_FLAGS_SMP_PASS) {
 		/*
 		 * We have to add a special case for SMP passthrough, there
 		 * is no easy way to generically handle it.  The first
 		 * S/G element is used for the command (therefore the
 		 * direction bit needs to be set).  The second one is used
 		 * for the reply.  We'll leave it to the caller to make
 		 * sure we only have two buffers.
 		 */
 		/*
 		 * Even though the busdma man page says it doesn't make
 		 * sense to have both direction flags, it does in this case.
 		 * We have one s/g element being accessed in each direction.
 		 */
 		dir = BUS_DMASYNC_PREWRITE | BUS_DMASYNC_PREREAD;
 
 		/*
 		 * Set the direction flag on the first buffer in the SMP
 		 * passthrough request.  We'll clear it for the second one.
 		 */
 		sflags |= MPI2_SGE_FLAGS_DIRECTION |
 			  MPI2_SGE_FLAGS_END_OF_BUFFER;
 	} else if (cm->cm_flags & MPS_CM_FLAGS_DATAOUT) {
 		sflags |= MPI2_SGE_FLAGS_HOST_TO_IOC;
 		dir = BUS_DMASYNC_PREWRITE;
 	} else
 		dir = BUS_DMASYNC_PREREAD;
 
 	/* One SGE per segment; the last argument counts segments left. */
 	for (i = 0; i < nsegs; i++) {
 		if ((cm->cm_flags & MPS_CM_FLAGS_SMP_PASS) && (i != 0)) {
 			sflags &= ~MPI2_SGE_FLAGS_DIRECTION;
 		}
 		error = mps_add_dmaseg(cm, segs[i].ds_addr, segs[i].ds_len,
 		    sflags, nsegs - i);
 		if (error != 0) {
 			/* Resource shortage, roll back! */
 			/* The message is rate-limited through ratecheck(). */
 			if (ratecheck(&sc->lastfail, &mps_chainfail_interval))
 				mps_dprint(sc, MPS_INFO, "Out of chain frames, "
 				    "consider increasing hw.mps.max_chains.\n");
 			cm->cm_flags |= MPS_CM_FLAGS_CHAIN_FAILED;
 			mps_complete_command(sc, cm);
 			return;
 		}
 	}
 
 	bus_dmamap_sync(sc->buffer_dmat, cm->cm_dmamap, dir);
 	mps_enqueue_request(sc, cm);
 
 	return;
 }
 
 /*
  * Variant of mps_data_cb() with the extra 'mapsize' argument, used as the
  * callback for bus_dmamap_load_uio().  'mapsize' is ignored and the call
  * is forwarded unchanged.
  */
 static void
 mps_data_cb2(void *arg, bus_dma_segment_t *segs, int nsegs, bus_size_t mapsize,
 	     int error)
 {
 	mps_data_cb(arg, segs, nsegs, error);
 }
 
 /*
  * This is the routine to enqueue commands asynchronously.
  * Note that the only error path here is from the bus_dmamap_load*() calls,
  * which can return EINPROGRESS if they are waiting for resources.  Other
  * than this, it's assumed that if you have a command in-hand, then you
  * have enough credits to use it.
  *
  * Commands that carry data are enqueued from the DMA callback; commands
  * without data are enqueued directly from here.
  */
 int
 mps_map_command(struct mps_softc *sc, struct mps_command *cm)
 {
 
 	/* Data described by a uio. */
 	if ((cm->cm_flags & MPS_CM_FLAGS_USE_UIO) != 0) {
 		return (bus_dmamap_load_uio(sc->buffer_dmat, cm->cm_dmamap,
 		    &cm->cm_uio, mps_data_cb2, cm, 0));
 	}
 
 	/* Data described by a CCB. */
 	if ((cm->cm_flags & MPS_CM_FLAGS_USE_CCB) != 0) {
 		return (bus_dmamap_load_ccb(sc->buffer_dmat, cm->cm_dmamap,
 		    cm->cm_data, mps_data_cb, cm, 0));
 	}
 
 	/* Plain buffer with a non-zero length. */
 	if (cm->cm_data != NULL && cm->cm_length != 0) {
 		return (bus_dmamap_load(sc->buffer_dmat, cm->cm_dmamap,
 		    cm->cm_data, cm->cm_length, mps_data_cb, cm, 0));
 	}
 
 	/* No data: add a zero-length element as needed */
 	if (cm->cm_sge != NULL)
 		mps_add_dmaseg(cm, 0, 0, 0, 1);
 	mps_enqueue_request(sc, cm);
 
 	return (0);
 }
 
 /*
  * This is the routine to enqueue commands synchronously.  An error of
  * EINPROGRESS from mps_map_command() is ignored since the command will
  * be executed and enqueued automatically.  Other errors come from msleep().
  *
  * cmp        - in/out pointer to the command.  It is set to NULL when a
  *              timeout led to a reinit that left MPS_FLAGS_REALLOCATED
  *              set, to tell the caller the command was freed.
  * timeout    - time limit in seconds.
  * sleep_flag - CAN_SLEEP or NO_SLEEP; forced to NO_SLEEP when the current
  *              thread has td_no_sleeping set.
  *
  * Returns EBUSY when MPS_FLAGS_DIAGRESET is set, a mapping error from
  * mps_map_command(), ETIMEDOUT after a timeout, and otherwise the
  * msleep() result or 0 once the polled command has completed.
  */
 int
 mps_wait_command(struct mps_softc *sc, struct mps_command **cmp, int timeout,
     int sleep_flag)
 {
 	int error, rc;
 	struct timeval cur_time, start_time;
 	struct mps_command *cm = *cmp;
 
 	if (sc->mps_flags & MPS_FLAGS_DIAGRESET)
 		return (EBUSY);
 
 	cm->cm_complete = NULL;
 	cm->cm_flags |= MPS_CM_FLAGS_POLLED;
 	error = mps_map_command(sc, cm);
 	if ((error != 0) && (error != EINPROGRESS))
 		return (error);
 
 	/*
 	 * EINPROGRESS is tolerated, so it must not survive as the return
 	 * value: the polling path below only ever assigns EWOULDBLOCK, and
 	 * would otherwise report EINPROGRESS for a command that completed.
 	 */
 	error = 0;
 
 	/*
 	 * Check for context and wait for 50 mSec at a time until time has
 	 * expired or the command has finished.  If msleep can't be used, need
 	 * to poll.
 	 */
 	if (curthread->td_no_sleeping != 0)
 		sleep_flag = NO_SLEEP;
 	getmicrouptime(&start_time);
 	if (mtx_owned(&sc->mps_mtx) && sleep_flag == CAN_SLEEP) {
 		cm->cm_flags |= MPS_CM_FLAGS_WAKEUP;
 		error = msleep(cm, &sc->mps_mtx, 0, "mpswait", timeout*hz);
 		if (error == EWOULDBLOCK) {
 			/*
 			 * Record the actual elapsed time in the case of a
 			 * timeout for the message below.
 			 */
 			getmicrouptime(&cur_time);
 			timevalsub(&cur_time, &start_time);
 		}
 	} else {
 		while ((cm->cm_flags & MPS_CM_FLAGS_COMPLETE) == 0) {
 			mps_intr_locked(sc);
 			if (sleep_flag == CAN_SLEEP)
 				pause("mpswait", hz/20);
 			else
 				DELAY(50000);
 
 			getmicrouptime(&cur_time);
 			timevalsub(&cur_time, &start_time);
 			if (cur_time.tv_sec > timeout) {
 				error = EWOULDBLOCK;
 				break;
 			}
 		}
 	}
 
 	/* A timeout on either path triggers a controller reinit. */
 	if (error == EWOULDBLOCK) {
 		mps_dprint(sc, MPS_FAULT, "Calling Reinit from %s, timeout=%d,"
 		    " elapsed=%jd\n", __func__, timeout,
 		    (intmax_t)cur_time.tv_sec);
 		rc = mps_reinit(sc);
 		mps_dprint(sc, MPS_FAULT, "Reinit %s\n", (rc == 0) ? "success" :
 		    "failed");
 		if (sc->mps_flags & MPS_FLAGS_REALLOCATED) {
 			/*
 			 * Tell the caller that we freed the command in a
 			 * reinit.
 			 */
 			*cmp = NULL;
 		}
 		error = ETIMEDOUT;
 	}
 	return (error);
 }
 
 /*
  * The MPT driver had a verbose interface for config pages.  In this driver,
  * reduce it to much simpler terms, similar to the Linux driver.
  *
  * Build a config request from 'params' and issue it.  With a callback in
  * 'params' the request is only submitted and the result of
  * mps_map_command() is returned; mps_config_complete() runs as the
  * command's completion handler.  Without a callback the request is waited
  * for and mps_config_complete() is called before returning.
  *
  * Returns EBUSY when the adapter is flagged busy or no command is
  * available, the error from mps_wait_command() or mps_map_command(), or 0.
  */
 int
 mps_read_config_page(struct mps_softc *sc, struct mps_config_params *params)
 {
 	MPI2_CONFIG_REQUEST *req;
 	struct mps_command *cm;
 	int rc;
 
 	if ((sc->mps_flags & MPS_FLAGS_BUSY) != 0)
 		return (EBUSY);
 
 	if ((cm = mps_alloc_command(sc)) == NULL)
 		return (EBUSY);
 
 	req = (MPI2_CONFIG_REQUEST *)cm->cm_req;
 	req->Function = MPI2_FUNCTION_CONFIG;
 	req->Action = params->action;
 	req->SGLFlags = 0;
 	req->ChainOffset = 0;
 	req->PageAddress = params->page_address;
 
 	/* Copy the page header, in its standard or extended layout. */
 	if (params->hdr.Struct.PageType != MPI2_CONFIG_PAGETYPE_EXTENDED) {
 		MPI2_CONFIG_PAGE_HEADER *std = &params->hdr.Struct;
 
 		req->Header.PageType = std->PageType;
 		req->Header.PageNumber = std->PageNumber;
 		req->Header.PageLength = std->PageLength;
 		req->Header.PageVersion = std->PageVersion;
 	} else {
 		MPI2_CONFIG_EXTENDED_PAGE_HEADER *ext = &params->hdr.Ext;
 
 		req->ExtPageType = ext->ExtPageType;
 		req->ExtPageLength = ext->ExtPageLength;
 		req->Header.PageType = MPI2_CONFIG_PAGETYPE_EXTENDED;
 		req->Header.PageLength = 0; /* Must be set to zero */
 		req->Header.PageNumber = ext->PageNumber;
 		req->Header.PageVersion = ext->PageVersion;
 	}
 
 	/* Attach the caller's buffer, if any, as the page buffer SGE. */
 	cm->cm_data = params->buffer;
 	cm->cm_length = params->length;
 	if (cm->cm_data == NULL) {
 		cm->cm_sge = NULL;
 	} else {
 		cm->cm_sge = &req->PageBufferSGE;
 		cm->cm_sglsize = sizeof(MPI2_SGE_IO_UNION);
 		cm->cm_flags = MPS_CM_FLAGS_SGE_SIMPLE | MPS_CM_FLAGS_DATAIN;
 	}
 	cm->cm_desc.Default.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_DEFAULT_TYPE;
 	cm->cm_complete_data = params;
 
 	/* Callback supplied: submit and let the completion handler finish. */
 	if (params->callback != NULL) {
 		cm->cm_complete = mps_config_complete;
 		return (mps_map_command(sc, cm));
 	}
 
 	/* No callback: wait here and finish the request ourselves. */
 	rc = mps_wait_command(sc, &cm, 0, CAN_SLEEP);
 	if (rc != 0) {
 		mps_dprint(sc, MPS_FAULT,
 		    "Error %d reading config page\n", rc);
 		if (cm != NULL)
 			mps_free_command(sc, cm);
 		return (rc);
 	}
 	mps_config_complete(sc, cm);
 
 	return (0);
 }
 
 /*
  * Writing config pages is not implemented: both arguments are ignored and
  * EINVAL is always returned.
  */
 int
 mps_write_config_page(struct mps_softc *sc, struct mps_config_params *params)
 {
 	return (EINVAL);
 }
 
 /*
  * Finish a config page request: tear down the DMA mapping, copy the status
  * and page header from the reply into the caller's params, free the
  * command and invoke the caller's callback when one was supplied.
  * params->status is set to MPI2_IOCSTATUS_BUSY when the command carries an
  * error flag or has no reply.
  */
 static void
 mps_config_complete(struct mps_softc *sc, struct mps_command *cm)
 {
 	struct mps_config_params *params;
 	MPI2_CONFIG_REPLY *reply;
 
 	MPS_FUNCTRACE(sc);
 	params = cm->cm_complete_data;
 
 	if (cm->cm_data != NULL) {
 		bus_dmamap_sync(sc->buffer_dmat, cm->cm_dmamap,
 		    BUS_DMASYNC_POSTREAD);
 		bus_dmamap_unload(sc->buffer_dmat, cm->cm_dmamap);
 	}
 
 	reply = (MPI2_CONFIG_REPLY *)cm->cm_reply;
 
 	/*
 	 * XXX KDM need to do more error recovery?  This results in the
 	 * device in question not getting probed.
 	 */
 	if ((cm->cm_flags & MPS_CM_FLAGS_ERROR_MASK) != 0 || reply == NULL) {
 		params->status = MPI2_IOCSTATUS_BUSY;
 	} else {
 		params->status = reply->IOCStatus;
 		if (params->hdr.Struct.PageType ==
 		    MPI2_CONFIG_PAGETYPE_EXTENDED) {
 			params->hdr.Ext.ExtPageType = reply->ExtPageType;
 			params->hdr.Ext.ExtPageLength = reply->ExtPageLength;
 			params->hdr.Ext.PageType = reply->Header.PageType;
 			params->hdr.Ext.PageNumber = reply->Header.PageNumber;
 			params->hdr.Ext.PageVersion =
 			    reply->Header.PageVersion;
 		} else {
 			params->hdr.Struct.PageType = reply->Header.PageType;
 			params->hdr.Struct.PageNumber =
 			    reply->Header.PageNumber;
 			params->hdr.Struct.PageLength =
 			    reply->Header.PageLength;
 			params->hdr.Struct.PageVersion =
 			    reply->Header.PageVersion;
 		}
 	}
 
 	/* The command is released before the callback runs. */
 	mps_free_command(sc, cm);
 	if (params->callback != NULL)
 		params->callback(sc, params);
 }
Index: head/sys/fs/cd9660/cd9660_vnops.c
===================================================================
--- head/sys/fs/cd9660/cd9660_vnops.c	(revision 328237)
+++ head/sys/fs/cd9660/cd9660_vnops.c	(revision 328238)
@@ -1,921 +1,921 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1994
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley
  * by Pace Willisson (pace@blitz.com).  The Rock Ridge Extension
  * Support code is derived from software contributed to Berkeley
  * by Atsushi Murai (amurai@spec.co.jp).
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)cd9660_vnops.c	8.19 (Berkeley) 5/27/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/namei.h>
 #include <sys/kernel.h>
 #include <sys/conf.h>
 #include <sys/stat.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/mount.h>
 #include <sys/vnode.h>
 #include <sys/malloc.h>
 #include <sys/dirent.h>
 #include <sys/unistd.h>
 #include <sys/filio.h>
 #include <sys/sysctl.h>
 
 #include <vm/vm.h>
 #include <vm/vnode_pager.h>
 #include <vm/uma.h>
 
 #include <fs/cd9660/iso.h>
 #include <fs/cd9660/cd9660_node.h>
 #include <fs/cd9660/iso_rrip.h>
 
 /*
  * Forward declarations for this file's vnode operations and for the
  * helpers used by cd9660_readdir().
  */
 static vop_setattr_t	cd9660_setattr;
 static vop_open_t	cd9660_open;
 static vop_access_t	cd9660_access;
 static vop_getattr_t	cd9660_getattr;
 static vop_ioctl_t	cd9660_ioctl;
 static vop_pathconf_t	cd9660_pathconf;
 static vop_read_t	cd9660_read;
 struct isoreaddir;
 static int iso_uiodir(struct isoreaddir *idp, struct dirent *dp, off_t off);
 static int iso_shipdir(struct isoreaddir *idp);
 static vop_readdir_t	cd9660_readdir;
 static vop_readlink_t	cd9660_readlink;
 static vop_strategy_t	cd9660_strategy;
 static vop_vptofh_t	cd9660_vptofh;
 static vop_getpages_t	cd9660_getpages;
 
 /*
  * Setattr call.  Any request to change flags, owner, group, access or
  * modification time, or mode fails with EROFS.  A size change fails with
  * EISDIR on directories and with EROFS on symlinks and regular files; on
  * every other vnode type it is accepted without doing anything.
  */
 static int
 cd9660_setattr(ap)
 	struct vop_setattr_args /* {
 		struct vnodeop_desc *a_desc;
 		struct vnode *a_vp;
 		struct vattr *a_vap;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 	struct vattr *attrs = ap->a_vap;
 
 	if (attrs->va_flags != (u_long)VNOVAL ||
 	    attrs->va_uid != (uid_t)VNOVAL ||
 	    attrs->va_gid != (gid_t)VNOVAL ||
 	    attrs->va_atime.tv_sec != VNOVAL ||
 	    attrs->va_mtime.tv_sec != VNOVAL ||
 	    attrs->va_mode != (mode_t)VNOVAL)
 		return (EROFS);
 
 	/* Nothing else to check unless a new size was requested. */
 	if (attrs->va_size == (u_quad_t)VNOVAL)
 		return (0);
 
 	switch (ap->a_vp->v_type) {
 	case VDIR:
 		return (EISDIR);
 	case VLNK:
 	case VREG:
 		return (EROFS);
 	default:
 		return (0);
 	}
 }
 
 /*
  * Access check for a vnode.  Device nodes are rejected with EOPNOTSUPP and
  * write access to directories, symlinks and regular files with EROFS;
  * everything else is decided by vaccess() using the mode, uid and gid
  * stored in the ISO node.
  */
 /* ARGSUSED */
 static int
 cd9660_access(ap)
 	struct vop_access_args /* {
 		struct vnode *a_vp;
 		accmode_t a_accmode;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct vnode *vp;
 	struct iso_node *node;
 
 	vp = ap->a_vp;
 	node = VTOI(vp);
 
 	if (vp->v_type == VCHR || vp->v_type == VBLK)
 		return (EOPNOTSUPP);
 
 	/*
 	 * Disallow write attempts unless the file is a socket,
 	 * fifo, or a block or character device resident on the
 	 * filesystem.
 	 */
 	if ((ap->a_accmode & VWRITE) != 0 &&
 	    (vp->v_type == VDIR || vp->v_type == VLNK ||
 	    vp->v_type == VREG))
 		return (EROFS);
 
 	return (vaccess(vp->v_type, node->inode.iso_mode,
 	    node->inode.iso_uid, node->inode.iso_gid, ap->a_accmode,
 	    ap->a_cred, NULL));
 }
 
 /*
  * Open a vnode.  Device nodes are refused with EOPNOTSUPP; for anything
  * else vnode_create_vobject() is called with the node's size.
  */
 static int
 cd9660_open(ap)
 	struct vop_open_args /* {
 		struct vnode *a_vp;
 		int a_mode;
 		struct ucred *a_cred;
 		struct thread *a_td;
 		struct file *a_fp;
 	} */ *ap;
 {
 	struct vnode *vp;
 	struct iso_node *node;
 
 	vp = ap->a_vp;
 	node = VTOI(vp);
 
 	switch (vp->v_type) {
 	case VCHR:
 	case VBLK:
 		return (EOPNOTSUPP);
 	default:
 		break;
 	}
 
 	vnode_create_vobject(vp, node->i_size, ap->a_td);
 	return (0);
 }
 
 
 static int
 cd9660_getattr(ap)
 	struct vop_getattr_args /* {
 		struct vnode *a_vp;
 		struct vattr *a_vap;
 		struct ucred *a_cred;
 	} */ *ap;
 
 {
 	struct vnode *vp = ap->a_vp;
 	struct vattr *vap = ap->a_vap;
 	struct iso_node *ip = VTOI(vp);
 
 	vap->va_fsid    = dev2udev(ip->i_mnt->im_dev);
 	vap->va_fileid	= ip->i_number;
 
 	vap->va_mode	= ip->inode.iso_mode;
 	vap->va_nlink	= ip->inode.iso_links;
 	vap->va_uid	= ip->inode.iso_uid;
 	vap->va_gid	= ip->inode.iso_gid;
 	vap->va_atime	= ip->inode.iso_atime;
 	vap->va_mtime	= ip->inode.iso_mtime;
 	vap->va_ctime	= ip->inode.iso_ctime;
 	vap->va_rdev	= ip->inode.iso_rdev;
 
 	vap->va_size	= (u_quad_t) ip->i_size;
 	if (ip->i_size == 0 && (vap->va_mode & S_IFMT) == S_IFLNK) {
 		struct vop_readlink_args rdlnk;
 		struct iovec aiov;
 		struct uio auio;
 		char *cp;
 
 		cp = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
 		aiov.iov_base = cp;
 		aiov.iov_len = MAXPATHLEN;
 		auio.uio_iov = &aiov;
 		auio.uio_iovcnt = 1;
 		auio.uio_offset = 0;
 		auio.uio_rw = UIO_READ;
 		auio.uio_segflg = UIO_SYSSPACE;
 		auio.uio_td = curthread;
 		auio.uio_resid = MAXPATHLEN;
 		rdlnk.a_uio = &auio;
 		rdlnk.a_vp = ap->a_vp;
 		rdlnk.a_cred = ap->a_cred;
 		if (cd9660_readlink(&rdlnk) == 0)
 			vap->va_size = MAXPATHLEN - auio.uio_resid;
 		free(cp, M_TEMP);
 	}
 	vap->va_flags	= 0;
 	vap->va_gen = 1;
 	vap->va_blocksize = ip->i_mnt->logical_block_size;
 	vap->va_bytes	= (u_quad_t) ip->i_size;
 	vap->va_type	= vp->v_type;
 	vap->va_filerev	= 0;
 	return (0);
 }
 
 /*
  * Vnode op for ioctl.
  */
 static int
 cd9660_ioctl(ap)
 	struct vop_ioctl_args /* {
 		struct vnode *a_vp;
 		u_long  a_command;
 		caddr_t  a_data;
 		int  a_fflag;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct vnode *vp;
 	struct iso_node *ip;
 	int error;
 
 	vp = ap->a_vp;
 	vn_lock(vp, LK_SHARED | LK_RETRY);
 	if (vp->v_iflag & VI_DOOMED) {
 		VOP_UNLOCK(vp, 0);
 		return (EBADF);
 	}
 	if (vp->v_type == VCHR || vp->v_type == VBLK) {
 		VOP_UNLOCK(vp, 0);
 		return (EOPNOTSUPP);
 	}
 
 	ip = VTOI(vp);
 	error = 0;
 
 	switch (ap->a_command) {
 	case FIOGETLBA:
 		*(int *)(ap->a_data) = ip->iso_start;
 		break;
 	default:
 		error = ENOTTY;
 		break;
 	}
 
 	VOP_UNLOCK(vp, 0);
 	return (error);
 }
 
 /*
  * Vnode op for reading.
  *
  * Copies file data to the caller's uio one logical block at a time until
  * the request is satisfied, the end of the file is reached, or an error
  * occurs.  Device nodes are refused with EOPNOTSUPP and a negative offset
  * with EINVAL.  Returns 0 or the error from the buffer read or uiomove().
  */
 static int
 cd9660_read(ap)
 	struct vop_read_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		int a_ioflag;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct uio *uio = ap->a_uio;
 	struct iso_node *ip = VTOI(vp);
 	struct iso_mnt *imp;
 	struct buf *bp;
 	daddr_t lbn, rablock;
 	off_t diff;
 	int rasize, error = 0;
 	int seqcount;
 	long size, n, on;
 
 	if (vp->v_type == VCHR || vp->v_type == VBLK)
 		return (EOPNOTSUPP);
 
 	/* Sequential-access hint carried in the upper bits of a_ioflag. */
 	seqcount = ap->a_ioflag >> IO_SEQSHIFT;
 
 	if (uio->uio_resid == 0)
 		return (0);
 	if (uio->uio_offset < 0)
 		return (EINVAL);
 	imp = ip->i_mnt;
 	do {
 		/*
 		 * lbn: logical block holding the current offset;
 		 * on:  offset within that block;
 		 * n:   bytes to copy, limited by the rest of the block, the
 		 *      caller's remaining request and the rest of the file.
 		 */
 		lbn = lblkno(imp, uio->uio_offset);
 		on = blkoff(imp, uio->uio_offset);
 		n = MIN(imp->logical_block_size - on, uio->uio_resid);
 		diff = (off_t)ip->i_size - uio->uio_offset;
 		if (diff <= 0)
 			return (0);
 		if (diff < n)
 			n = diff;
 		size = blksize(imp, ip, lbn);
 		rablock = lbn + 1;
 		if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
 			/*
 			 * Clustered reads allowed: use cluster_read() while
 			 * the next block still lies inside the file.
 			 * NOTE(review): "a_ioflag >> 16" looks like the same
 			 * value as seqcount if IO_SEQSHIFT is 16 -- confirm.
 			 */
 			if (lblktosize(imp, rablock) < ip->i_size)
 				error = cluster_read(vp, (off_t)ip->i_size,
 					 lbn, size, NOCRED, uio->uio_resid,
 					 (ap->a_ioflag >> 16), 0, &bp);
 			else
 				error = bread(vp, lbn, size, NOCRED, &bp);
 		} else {
 			/*
 			 * Clustering disabled on this mount: read ahead one
 			 * block only for sequential access inside the file.
 			 */
 			if (seqcount > 1 &&
 			    lblktosize(imp, rablock) < ip->i_size) {
 				rasize = blksize(imp, ip, rablock);
 				error = breadn(vp, lbn, size, &rablock,
 					       &rasize, 1, NOCRED, &bp);
 			} else
 				error = bread(vp, lbn, size, NOCRED, &bp);
 		}
 		if (error != 0)
 			return (error);
 		/* Do not copy more than the buffer actually holds. */
 		n = MIN(n, size - bp->b_resid);
 
 		error = uiomove(bp->b_data + on, (int)n, uio);
 		brelse(bp);
 	} while (error == 0 && uio->uio_resid > 0 && n != 0);
 	return (error);
 }
 
 /*
  * Structure for reading directories
  *
  * Working state shared by cd9660_readdir(), iso_shipdir() and
  * iso_uiodir().
  */
 struct isoreaddir {
 	struct dirent saveent;	/* pending entry not yet copied out */
 	struct dirent assocent;	/* pending entry whose name began with ASSOCCHAR */
 	struct dirent current;	/* entry currently being processed */
 	off_t saveoff;		/* offset recorded for saveent */
 	off_t assocoff;		/* offset recorded for assocent */
 	off_t curroff;		/* offset of the current entry */
 	struct uio *uio;	/* destination for emitted entries */
 	off_t uio_off;		/* offset passed with the last entry copied out */
 	int eofflag;		/* cleared when output space or cookies run out */
 	u_long *cookies;	/* next cookie slot, or NULL if not wanted */
 	int ncookies;		/* cookie slots still available */
 };
 
 static int
 iso_uiodir(idp,dp,off)
 	struct isoreaddir *idp;
 	struct dirent *dp;
 	off_t off;
 {
 	int error;
 
 	dp->d_name[dp->d_namlen] = 0;
 	dp->d_reclen = GENERIC_DIRSIZ(dp);
 
 	if (idp->uio->uio_resid < dp->d_reclen) {
 		idp->eofflag = 0;
 		return (-1);
 	}
 
 	if (idp->cookies) {
 		if (idp->ncookies <= 0) {
 			idp->eofflag = 0;
 			return (-1);
 		}
 
 		*idp->cookies++ = off;
 		--idp->ncookies;
 	}
 
 	if ((error = uiomove(dp, dp->d_reclen, idp->uio)) != 0)
 		return (error);
 	idp->uio_off = off;
 	return (0);
 }
 
 /*
  * Buffer idp->current and flush previously buffered entries when the name
  * changes.
  *
  * The current name (with a leading ASSOCCHAR stripped when the name is
  * longer than one character) is compared with the pending name taken from
  * saveent, or from assocent (minus its first character) when saveent is
  * empty.  If a pending name exists and differs, the pending assocent and
  * saveent are written out through iso_uiodir() and marked empty.  The
  * current entry is then stored in assocent or saveent.  Calling this with
  * current.d_namlen == 0 therefore flushes whatever is still pending.
  *
  * Returns 0, or the non-zero result of iso_uiodir().
  */
 static int
 iso_shipdir(idp)
 	struct isoreaddir *idp;
 {
 	struct dirent *dp;
 	int cl, sl, assoc;
 	int error;
 	char *cname, *sname;
 
 	/* Current name, without the associated-file marker if present. */
 	cl = idp->current.d_namlen;
 	cname = idp->current.d_name;
 	assoc = (cl > 1) && (*cname == ASSOCCHAR);
 	if (assoc) {
 		cl--;
 		cname++;
 	}
 
 	/* Pending name: prefer saveent, else assocent without its first char. */
 	dp = &idp->saveent;
 	sname = dp->d_name;
 	if (!(sl = dp->d_namlen)) {
 		dp = &idp->assocent;
 		sname = dp->d_name + 1;
 		sl = dp->d_namlen - 1;
 	}
 	if (sl > 0) {
 		/* A different name follows: emit what has been buffered. */
 		if (sl != cl
 		    || bcmp(sname,cname,sl)) {
 			if (idp->assocent.d_namlen) {
 				if ((error = iso_uiodir(idp,&idp->assocent,idp->assocoff)) != 0)
 					return (error);
 				idp->assocent.d_namlen = 0;
 			}
 			if (idp->saveent.d_namlen) {
 				if ((error = iso_uiodir(idp,&idp->saveent,idp->saveoff)) != 0)
 					return (error);
 				idp->saveent.d_namlen = 0;
 			}
 		}
 	}
 	/* Remember the current entry (and its offset) in the matching slot. */
 	idp->current.d_reclen = GENERIC_DIRSIZ(&idp->current);
 	if (assoc) {
 		idp->assocoff = idp->curroff;
 		bcopy(&idp->current,&idp->assocent,idp->current.d_reclen);
 	} else {
 		idp->saveoff = idp->curroff;
 		bcopy(&idp->current,&idp->saveent,idp->current.d_reclen);
 	}
 	return (0);
 }
 
 /*
  * Vnode op for readdir
  *
  * Walks the ISO directory records starting at uio->uio_offset, converts
  * each one into a struct dirent and copies it to the caller's uio through
  * iso_uiodir() (directly, or via the buffering done by iso_shipdir() for
  * ISO_FTYPE_DEFAULT mounts).  When the caller asks for cookies, an array
  * is allocated here and handed back through a_cookies/a_ncookies on
  * success; on every failure path it is freed.
  *
  * Returns 0 or an errno value.  A negative result from the helpers only
  * means "output full" and is turned into success.
  */
 static int
 cd9660_readdir(ap)
 	struct vop_readdir_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		struct ucred *a_cred;
 		int *a_eofflag;
 		int *a_ncookies;
 		u_long **a_cookies;
 	} */ *ap;
 {
 	struct uio *uio = ap->a_uio;
 	struct isoreaddir *idp;
 	struct vnode *vdp = ap->a_vp;
 	struct iso_node *dp;
 	struct iso_mnt *imp;
 	struct buf *bp = NULL;
 	struct iso_directory_record *ep;
 	int entryoffsetinblock;
 	doff_t endsearch;
 	u_long bmask;
 	int error = 0;
 	int reclen;
 	u_short namelen;
-	int ncookies = 0;
+	u_int ncookies = 0;
 	u_long *cookies = NULL;
 	cd_ino_t ino;
 
 	dp = VTOI(vdp);
 	imp = dp->i_mnt;
 	bmask = imp->im_bmask;
 
 	idp = malloc(sizeof(*idp), M_TEMP, M_WAITOK);
 	idp->saveent.d_namlen = idp->assocent.d_namlen = 0;
 	/*
 	 * XXX
 	 * Is it worth trying to figure out the type?
 	 */
 	idp->saveent.d_type = idp->assocent.d_type = idp->current.d_type =
 	    DT_UNKNOWN;
 	idp->uio = uio;
 	if (ap->a_ncookies == NULL) {
 		idp->cookies = NULL;
 	} else {
 		/*
 		 * Guess the number of cookies needed.
 		 */
 		ncookies = uio->uio_resid / 16;
 		cookies = malloc(ncookies * sizeof(u_long),
 		    M_TEMP, M_WAITOK);
 		idp->cookies = cookies;
 		idp->ncookies = ncookies;
 	}
 	idp->eofflag = 1;
 	idp->curroff = uio->uio_offset;
 	idp->uio_off = uio->uio_offset;
 
 	/*
 	 * When starting in the middle of a block, read that block now; the
 	 * loop below only reads on block boundaries.
 	 */
 	if ((entryoffsetinblock = idp->curroff & bmask) &&
 	    (error = cd9660_blkatoff(vdp, (off_t)idp->curroff, NULL, &bp))) {
 		/*
 		 * The cookie array may already have been allocated above;
 		 * release it too (cookies is NULL when none was requested).
 		 */
 		free(cookies, M_TEMP);
 		free(idp, M_TEMP);
 		return (error);
 	}
 	endsearch = dp->i_size;
 
 	while (idp->curroff < endsearch) {
 		/*
 		 * If offset is on a block boundary,
 		 * read the next directory block.
 		 * Release previous if it exists.
 		 */
 		if ((idp->curroff & bmask) == 0) {
 			if (bp != NULL)
 				brelse(bp);
 			if ((error =
 			    cd9660_blkatoff(vdp, (off_t)idp->curroff, NULL, &bp)) != 0)
 				break;
 			entryoffsetinblock = 0;
 		}
 		/*
 		 * Get pointer to next entry.
 		 */
 		ep = (struct iso_directory_record *)
 			((char *)bp->b_data + entryoffsetinblock);
 
 		reclen = isonum_711(ep->length);
 		if (reclen == 0) {
 			/* skip to next block, if any */
 			idp->curroff =
 			    (idp->curroff & ~bmask) + imp->logical_block_size;
 			continue;
 		}
 
 		if (reclen < ISO_DIRECTORY_RECORD_SIZE) {
 			error = EINVAL;
 			/* illegal entry, stop */
 			break;
 		}
 
 		if (entryoffsetinblock + reclen > imp->logical_block_size) {
 			error = EINVAL;
 			/* illegal directory, so stop looking */
 			break;
 		}
 
 		idp->current.d_namlen = isonum_711(ep->name_len);
 
 		if (reclen < ISO_DIRECTORY_RECORD_SIZE + idp->current.d_namlen) {
 			error = EINVAL;
 			/* illegal entry, stop */
 			break;
 		}
 
 		/* Flag bit 2 set: take the file number from isodirino(). */
 		if (isonum_711(ep->flags)&2)
 			idp->current.d_fileno = isodirino(ep, imp);
 		else
 			idp->current.d_fileno = dbtob(bp->b_blkno) +
 				entryoffsetinblock;
 
 		idp->curroff += reclen;
 
 		switch (imp->iso_ftype) {
 		case ISO_FTYPE_RRIP:
 			ino = idp->current.d_fileno;
 			cd9660_rrip_getname(ep, idp->current.d_name, &namelen,
 			    &ino, imp);
 			idp->current.d_fileno = ino;
 			idp->current.d_namlen = (u_char)namelen;
 			if (idp->current.d_namlen)
 				error = iso_uiodir(idp,&idp->current,idp->curroff);
 			break;
 		default: /* ISO_FTYPE_DEFAULT || ISO_FTYPE_9660 || ISO_FTYPE_HIGH_SIERRA*/
 			/*
 			 * One-byte names 0 and 1 are reported as "." and "..":
 			 * the buffer is preloaded with ".." and d_namlen
 			 * selects how much of it is used.
 			 */
 			strcpy(idp->current.d_name,"..");
 			if (idp->current.d_namlen == 1 && ep->name[0] == 0) {
 				idp->current.d_namlen = 1;
 				error = iso_uiodir(idp,&idp->current,idp->curroff);
 			} else if (idp->current.d_namlen == 1 && ep->name[0] == 1) {
 				idp->current.d_namlen = 2;
 				error = iso_uiodir(idp,&idp->current,idp->curroff);
 			} else {
 				isofntrans(ep->name,idp->current.d_namlen,
 					   idp->current.d_name, &namelen,
 					   imp->iso_ftype == ISO_FTYPE_9660,
 					   isonum_711(ep->flags)&4,
 					   imp->joliet_level,
 					   imp->im_flags,
 					   imp->im_d2l);
 				idp->current.d_namlen = (u_char)namelen;
 				if (imp->iso_ftype == ISO_FTYPE_DEFAULT)
 					error = iso_shipdir(idp);
 				else
 					error = iso_uiodir(idp,&idp->current,idp->curroff);
 			}
 		}
 		if (error)
 			break;
 
 		entryoffsetinblock += reclen;
 	}
 
 	/* Flush whatever iso_shipdir() is still holding back. */
 	if (!error && imp->iso_ftype == ISO_FTYPE_DEFAULT) {
 		idp->current.d_namlen = 0;
 		error = iso_shipdir(idp);
 	}
 	/* A negative value only signals "output full", not a failure. */
 	if (error < 0)
 		error = 0;
 
 	if (ap->a_ncookies != NULL) {
 		if (error)
 			free(cookies, M_TEMP);
 		else {
 			/*
 			 * Work out the number of cookies actually used.
 			 */
 			*ap->a_ncookies = ncookies - idp->ncookies;
 			*ap->a_cookies = cookies;
 		}
 	}
 
 	if (bp)
 		brelse (bp);
 
 	uio->uio_offset = idp->uio_off;
 	*ap->a_eofflag = idp->eofflag;
 
 	free(idp, M_TEMP);
 
 	return (error);
 }
 
 /*
  * Return the target name of a symbolic link.
  *
  * Only Rock Ridge (ISO_FTYPE_RRIP) mounts are supported; anything else
  * yields EINVAL.  The directory record of the link is located from the
  * inode number, the name is extracted with cd9660_rrip_getsymname() and
  * then handed to the caller through the uio.
  *
  * Shouldn't we get the parent vnode and read the data from there?
  * This could eventually result in deadlocks in cd9660_lookup.
  * But otherwise the block read here is in the block buffer two times.
  */
 typedef struct iso_directory_record ISODIR;
 typedef struct iso_node		    ISONODE;
 typedef struct iso_mnt		    ISOMNT;
 static int
 cd9660_readlink(ap)
 	struct vop_readlink_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 	ISONODE	*ip;
 	ISODIR	*dirp;
 	ISOMNT	*imp;
 	struct	buf *bp;
 	struct	uio *uio;
 	u_short	symlen;
 	int	error;
 	char	*symname;
 
 	ip  = VTOI(ap->a_vp);
 	imp = ip->i_mnt;
 	uio = ap->a_uio;
 
 	if (imp->iso_ftype != ISO_FTYPE_RRIP)
 		return (EINVAL);
 
 	/*
 	 * Read the block of the parent directory that holds this inode's
 	 * directory record.
 	 */
 	error = bread(imp->im_devvp,
 		      (ip->i_number >> imp->im_bshift) <<
 		      (imp->im_bshift - DEV_BSHIFT),
 		      imp->logical_block_size, NOCRED, &bp);
 	if (error) {
 		brelse(bp);
 		return (EINVAL);
 	}
 
 	/*
 	 * Set up the directory record pointer for this inode.
 	 */
 	dirp = (ISODIR *)(bp->b_data + (ip->i_number & imp->im_bmask));
 
 	/*
 	 * Sanity check: the record must not cross the end of the block.
 	 */
 	if ((ip->i_number & imp->im_bmask) + isonum_711(dirp->length)
 	    > (unsigned)imp->logical_block_size) {
 		brelse(bp);
 		return (EINVAL);
 	}
 
 	/*
 	 * Pick the buffer the name is assembled in.  For a kernel-space
 	 * uio the caller's buffer is used directly; otherwise a namei
 	 * buffer is borrowed.
 	 * NOTE(review): no check of symlen against the caller's iov_len is
 	 * visible here for the UIO_SYSSPACE case -- confirm the helper
 	 * bounds its output.
 	 */
 	if (uio->uio_segflg == UIO_SYSSPACE)
 		symname = uio->uio_iov->iov_base;
 	else
 		symname = uma_zalloc(namei_zone, M_WAITOK);
 
 	/*
 	 * Gather the symbolic name from the SL record.
 	 */
 	if (cd9660_rrip_getsymname(dirp, symname, &symlen, imp) == 0) {
 		if (uio->uio_segflg != UIO_SYSSPACE)
 			uma_zfree(namei_zone, symname);
 		brelse(bp);
 		return (EINVAL);
 	}
 	/*
 	 * The directory block is no longer needed.
 	 */
 	brelse(bp);
 
 	/*
 	 * Return the symbolic name to the caller: copy it out and free the
 	 * borrowed buffer, or, for a kernel-space uio, just advance the uio
 	 * past the bytes already written in place.
 	 */
 	if (uio->uio_segflg != UIO_SYSSPACE) {
 		error = uiomove(symname, symlen, uio);
 		uma_zfree(namei_zone, symname);
 		return (error);
 	}
 	uio->uio_resid -= symlen;
 	uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + symlen;
 	uio->uio_iov->iov_len -= symlen;
 	return (0);
 }
 
 /*
  * Calculate the logical to physical mapping if not done already,
  * then call the device strategy routine.
  *
  * Panics when called on a block or character device vnode.  Always
  * returns 0.
  */
 static int
 cd9660_strategy(ap)
 	struct vop_strategy_args /* {
 		struct vnode *a_vp;
 		struct buf *a_bp;
 	} */ *ap;
 {
 	struct buf *bp = ap->a_bp;
 	struct vnode *vp = ap->a_vp;
 	struct iso_node *ip;
 	struct bufobj *bo;
 
 	ip = VTOI(vp);
 	if (vp->v_type == VBLK || vp->v_type == VCHR)
 		panic("cd9660_strategy: spec");
 	/* b_blkno still equal to b_lblkno means "not translated yet". */
 	if (bp->b_blkno == bp->b_lblkno) {
 		bp->b_blkno = (ip->iso_start + bp->b_lblkno) <<
 		    (ip->i_mnt->im_bshift - DEV_BSHIFT);
 	}
 	bp->b_iooffset = dbtob(bp->b_blkno);
 	/* Hand the request to the mount's buffer object. */
 	bo = ip->i_mnt->im_bo;
 	BO_STRATEGY(bo, bp);
 	return (0);
 }
 
 /*
  * Return POSIX pathconf information applicable to cd9660 filesystems.
  */
 static int
 cd9660_pathconf(ap)
 	struct vop_pathconf_args /* {
 		struct vnode *a_vp;
 		int a_name;
 		register_t *a_retval;
 	} */ *ap;
 {
 
 	switch (ap->a_name) {
 	case _PC_FILESIZEBITS:
 		*ap->a_retval = 32;
 		return (0);
 	case _PC_LINK_MAX:
 		*ap->a_retval = 1;
 		return (0);
 	case _PC_NAME_MAX:
 		if (VTOI(ap->a_vp)->i_mnt->iso_ftype == ISO_FTYPE_RRIP)
 			*ap->a_retval = NAME_MAX;
 		else
 			*ap->a_retval = 37;
 		return (0);
 	case _PC_SYMLINK_MAX:
 		if (VTOI(ap->a_vp)->i_mnt->iso_ftype == ISO_FTYPE_RRIP) {
 			*ap->a_retval = MAXPATHLEN;
 			return (0);
 		}
 		return (EINVAL);
 	case _PC_NO_TRUNC:
 		*ap->a_retval = 1;
 		return (0);
 	default:
 		return (vop_stdpathconf(ap));
 	}
 	/* NOTREACHED */
 }
 
 /*
  * Vnode pointer to File handle
  *
  * Builds a struct ifid from the inode number and iso_start of the vnode's
  * iso_node and copies it into the caller's file handle.  Always returns 0.
  */
 static int
 cd9660_vptofh(ap)
 	struct vop_vptofh_args /* {
 		struct vnode *a_vp;
 		struct fid *a_fhp;
 	} */ *ap;
 {
 	struct ifid ifh;
 	struct iso_node *ip = VTOI(ap->a_vp);
 
 	ifh.ifid_len = sizeof(struct ifid);
 
 	ifh.ifid_ino = ip->i_number;
 	ifh.ifid_start = ip->iso_start;
 	/*
 	 * This intentionally uses sizeof(ifh) in order to not copy stack
 	 * garbage on ILP32.
 	 */
 	memcpy(ap->a_fhp, &ifh, sizeof(ifh));
 
 #ifdef	ISOFS_DBG
 	/* Debug-only trace of the handle that was produced. */
 	printf("vptofh: ino %jd, start %ld\n",
 	    (uintmax_t)ifh.ifid_ino, ifh.ifid_start);
 #endif
 
 	return (0);
 }
 
 /*
  * vfs.cd9660.use_buf_pager: when non-zero (the default), cd9660_getpages()
  * uses vfs_bio_getpages(); when zero it falls back to
  * vnode_pager_generic_getpages().
  */
 SYSCTL_NODE(_vfs, OID_AUTO, cd9660, CTLFLAG_RW, 0, "cd9660 filesystem");
 static int use_buf_pager = 1;
 SYSCTL_INT(_vfs_cd9660, OID_AUTO, use_buf_pager, CTLFLAG_RWTUN,
     &use_buf_pager, 0,
     "Use buffer pager instead of bmap");
 
 /*
  * Buffer-pager callback: map a byte offset in the file to its logical
  * block number on this mount.
  */
 static daddr_t
 cd9660_gbp_getblkno(struct vnode *vp, vm_ooffset_t off)
 {
 	struct iso_node *node;
 
 	node = VTOI(vp);
 	return (lblkno(node->i_mnt, off));
 }
 
 /*
  * Buffer-pager callback: return the size of logical block `lbn' of the
  * file behind `vp'.
  */
 static int
 cd9660_gbp_getblksz(struct vnode *vp, daddr_t lbn)
 {
 	struct iso_node *node = VTOI(vp);
 
 	return (blksize(node->i_mnt, node, lbn));
 }
 
 /*
  * Vnode op for getpages.
  *
  * Device vnodes are rejected with EOPNOTSUPP.  Otherwise the request is
  * served by the generic vnode pager when use_buf_pager is zero, and by
  * the buffer pager (with the two cd9660 callbacks) when it is not.
  */
 static int
 cd9660_getpages(struct vop_getpages_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 
 	if (vp->v_type == VCHR || vp->v_type == VBLK)
 		return (EOPNOTSUPP);
 
 	if (!use_buf_pager) {
 		return (vnode_pager_generic_getpages(vp, ap->a_m, ap->a_count,
 		    ap->a_rbehind, ap->a_rahead, NULL, NULL));
 	}
 
 	return (vfs_bio_getpages(vp, ap->a_m, ap->a_count,
 	    ap->a_rbehind, ap->a_rahead, cd9660_gbp_getblkno,
 	    cd9660_gbp_getblksz));
 }
 
 /*
  * Global vfs data structures for cd9660
  *
  * Vnode operations vector for cd9660 vnodes; operations not listed here
  * fall through to default_vnodeops.  Lookups go through
  * vfs_cache_lookup, with cd9660_lookup as the cached-lookup backend.
  */
 struct vop_vector cd9660_vnodeops = {
 	.vop_default =		&default_vnodeops,
 	.vop_open =		cd9660_open,
 	.vop_access =		cd9660_access,
 	.vop_bmap =		cd9660_bmap,
 	.vop_cachedlookup =	cd9660_lookup,
 	.vop_getattr =		cd9660_getattr,
 	.vop_inactive =		cd9660_inactive,
 	.vop_ioctl =		cd9660_ioctl,
 	.vop_lookup =		vfs_cache_lookup,
 	.vop_pathconf =		cd9660_pathconf,
 	.vop_read =		cd9660_read,
 	.vop_readdir =		cd9660_readdir,
 	.vop_readlink =		cd9660_readlink,
 	.vop_reclaim =		cd9660_reclaim,
 	.vop_setattr =		cd9660_setattr,
 	.vop_strategy =		cd9660_strategy,
 	.vop_vptofh =		cd9660_vptofh,
 	.vop_getpages =		cd9660_getpages,
 };
 
 /*
  * Fifo vnode ops
  *
  * Operations vector for fifos found on a cd9660 filesystem: attribute,
  * access and lifecycle operations come from cd9660, everything else
  * falls through to fifo_specops.
  */
 
 struct vop_vector cd9660_fifoops = {
 	.vop_default =		&fifo_specops,
 	.vop_access =		cd9660_access,
 	.vop_getattr =		cd9660_getattr,
 	.vop_inactive =		cd9660_inactive,
 	.vop_reclaim =		cd9660_reclaim,
 	.vop_setattr =		cd9660_setattr,
 	.vop_vptofh =		cd9660_vptofh,
 };
Index: head/sys/fs/nandfs/nandfs_vfsops.c
===================================================================
--- head/sys/fs/nandfs/nandfs_vfsops.c	(revision 328237)
+++ head/sys/fs/nandfs/nandfs_vfsops.c	(revision 328238)
@@ -1,1600 +1,1600 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2010-2012 Semihalf
  * Copyright (c) 2008, 2009 Reinoud Zandijk
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * From: NetBSD: nilfs_vfsops.c,v 1.1 2009/07/18 16:31:42 reinoud Exp
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/fcntl.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/namei.h>
 #include <sys/proc.h>
 #include <sys/priv.h>
 #include <sys/vnode.h>
 #include <sys/buf.h>
 #include <sys/sysctl.h>
 #include <sys/libkern.h>
 
 #include <geom/geom.h>
 #include <geom/geom_vfs.h>
 
 #include <machine/_inttypes.h>
 
 #include <fs/nandfs/nandfs_mount.h>
 #include <fs/nandfs/nandfs.h>
 #include <fs/nandfs/nandfs_subr.h>
 
 static MALLOC_DEFINE(M_NANDFSMNT, "nandfs_mount", "NANDFS mount structure");
 
 /*
  * Mark a vnode as a system file.  The vref()/vput() pair leaves the
  * reference count unchanged overall.
  * NOTE(review): vput() presumably also drops a lock the caller holds --
  * confirm against the callers.
  *
  * Wrapped in do { } while (0) so the macro behaves as a single statement
  * (safe in unbraced if/else); `vp' is evaluated more than once, so pass
  * an expression without side effects.
  */
 #define	NANDFS_SET_SYSTEMFILE(vp) do {	\
 	(vp)->v_vflag |= VV_SYSTEM;	\
 	vref(vp);			\
 	vput(vp);			\
 } while (0)
 
 /*
  * Undo NANDFS_SET_SYSTEMFILE: lock the vnode exclusively, assert that it
  * has no dirty buffers, clear VV_SYSTEM, then vgone() and vput() it.
  * `vp' is evaluated more than once.
  */
 #define	NANDFS_UNSET_SYSTEMFILE(vp) do {	\
 	VOP_LOCK((vp), LK_EXCLUSIVE);	\
 	MPASS((vp)->v_bufobj.bo_dirty.bv_cnt == 0); \
 	(vp)->v_vflag &= ~VV_SYSTEM;	\
 	vgone(vp);			\
 	vput(vp);			\
 } while (0)
 
 /* Globals */
 struct _nandfs_devices nandfs_devices;
 
 /* Parameters */
 int nandfs_verbose = 0;
 
 /*
  * SYSINIT hook: fetch the vfs.nandfs.verbose tunable into nandfs_verbose.
  * The argument is unused.
  */
 static void
 nandfs_tunable_init(void *arg)
 {
 
 	TUNABLE_INT_FETCH("vfs.nandfs.verbose", &nandfs_verbose);
 }
 SYSINIT(nandfs_tunables, SI_SUB_VFS, SI_ORDER_ANY, nandfs_tunable_init, NULL);
 
 /*
  * vfs.nandfs sysctl tree and its run-time knobs.  Each knob below is a
  * global initialised from the matching NANDFS_* default and exported
  * read-write.
  * NOTE(review): the knobs are declared `int' but exported with
  * SYSCTL_UINT -- confirm this is intentional.
  */
 static SYSCTL_NODE(_vfs, OID_AUTO, nandfs, CTLFLAG_RD, 0, "NAND filesystem");
 static SYSCTL_NODE(_vfs_nandfs, OID_AUTO, mount, CTLFLAG_RD, 0,
     "NANDFS mountpoints");
 SYSCTL_INT(_vfs_nandfs, OID_AUTO, verbose, CTLFLAG_RW, &nandfs_verbose, 0, "");
 
 #define NANDFS_CONSTR_INTERVAL	5
 int nandfs_sync_interval = NANDFS_CONSTR_INTERVAL; /* sync every 5 seconds */
 SYSCTL_UINT(_vfs_nandfs, OID_AUTO, sync_interval, CTLFLAG_RW,
     &nandfs_sync_interval, 0, "");
 
 #define NANDFS_MAX_DIRTY_SEGS	5
 int nandfs_max_dirty_segs = NANDFS_MAX_DIRTY_SEGS; /* sync when 5 dirty seg */
 SYSCTL_UINT(_vfs_nandfs, OID_AUTO, max_dirty_segs, CTLFLAG_RW,
     &nandfs_max_dirty_segs, 0, "");
 
 #define NANDFS_CPS_BETWEEN_SBLOCKS 5
 int nandfs_cps_between_sblocks = NANDFS_CPS_BETWEEN_SBLOCKS; /* write superblock every 5 checkpoints */
 SYSCTL_UINT(_vfs_nandfs, OID_AUTO, cps_between_sblocks, CTLFLAG_RW,
     &nandfs_cps_between_sblocks, 0, "");
 
 #define NANDFS_CLEANER_ENABLE 1
 int nandfs_cleaner_enable = NANDFS_CLEANER_ENABLE;
 SYSCTL_UINT(_vfs_nandfs, OID_AUTO, cleaner_enable, CTLFLAG_RW,
     &nandfs_cleaner_enable, 0, "");
 
 #define NANDFS_CLEANER_INTERVAL 5
 int nandfs_cleaner_interval = NANDFS_CLEANER_INTERVAL;
 SYSCTL_UINT(_vfs_nandfs, OID_AUTO, cleaner_interval, CTLFLAG_RW,
     &nandfs_cleaner_interval, 0, "");
 
 #define NANDFS_CLEANER_SEGMENTS 5
 int nandfs_cleaner_segments = NANDFS_CLEANER_SEGMENTS;
 SYSCTL_UINT(_vfs_nandfs, OID_AUTO, cleaner_segments, CTLFLAG_RW,
     &nandfs_cleaner_segments, 0, "");
 
 static int nandfs_mountfs(struct vnode *devvp, struct mount *mp);
 static vfs_mount_t	nandfs_mount;
 static vfs_root_t	nandfs_root;
 static vfs_statfs_t	nandfs_statfs;
 static vfs_unmount_t	nandfs_unmount;
 static vfs_vget_t	nandfs_vget;
 static vfs_sync_t	nandfs_sync;
 static const char *nandfs_opts[] = {
 	"snap", "from", "noatime", NULL
 };
 
 /*
  * System nodes
  *
  * Obtain the DAT, CPFILE, SUFILE and GC nodes of the device and mark
  * their vnodes as system files.  On any failure all four node pointers
  * are passed to nandfs_dispose_node() and the error is returned.
  * NOTE(review): the error path also disposes nodes that were never
  * obtained -- presumably nandfs_dispose_node() tolerates that; confirm.
  */
 static int
 nandfs_create_system_nodes(struct nandfs_device *nandfsdev)
 {
 	int error;
 
 	error = nandfs_get_node_raw(nandfsdev, NULL, NANDFS_DAT_INO,
 	    &nandfsdev->nd_super_root.sr_dat, &nandfsdev->nd_dat_node);
 	if (error)
 		goto errorout;
 
 	error = nandfs_get_node_raw(nandfsdev, NULL, NANDFS_CPFILE_INO,
 	    &nandfsdev->nd_super_root.sr_cpfile, &nandfsdev->nd_cp_node);
 	if (error)
 		goto errorout;
 
 	error = nandfs_get_node_raw(nandfsdev, NULL, NANDFS_SUFILE_INO,
 	    &nandfsdev->nd_super_root.sr_sufile, &nandfsdev->nd_su_node);
 	if (error)
 		goto errorout;
 
 	/* The GC node is created without an on-disk inode (NULL). */
 	error = nandfs_get_node_raw(nandfsdev, NULL, NANDFS_GC_INO,
 	    NULL, &nandfsdev->nd_gc_node);
 	if (error)
 		goto errorout;
 
 	NANDFS_SET_SYSTEMFILE(NTOV(nandfsdev->nd_dat_node));
 	NANDFS_SET_SYSTEMFILE(NTOV(nandfsdev->nd_cp_node));
 	NANDFS_SET_SYSTEMFILE(NTOV(nandfsdev->nd_su_node));
 	NANDFS_SET_SYSTEMFILE(NTOV(nandfsdev->nd_gc_node));
 
 	DPRINTF(VOLUMES, ("System vnodes: dat: %p cp: %p su: %p\n",
 	    NTOV(nandfsdev->nd_dat_node), NTOV(nandfsdev->nd_cp_node),
 	    NTOV(nandfsdev->nd_su_node)));
 	return (0);
 
 errorout:
 	nandfs_dispose_node(&nandfsdev->nd_gc_node);
 	nandfs_dispose_node(&nandfsdev->nd_dat_node);
 	nandfs_dispose_node(&nandfsdev->nd_cp_node);
 	nandfs_dispose_node(&nandfsdev->nd_su_node);
 
 	return (error);
 }
 
 static void
 nandfs_release_system_nodes(struct nandfs_device *nandfsdev)
 {
 
 	if (!nandfsdev)
 		return;
 	if (nandfsdev->nd_refcnt > 0)
 		return;
 
 	if (nandfsdev->nd_gc_node)
 		NANDFS_UNSET_SYSTEMFILE(NTOV(nandfsdev->nd_gc_node));
 	if (nandfsdev->nd_dat_node)
 		NANDFS_UNSET_SYSTEMFILE(NTOV(nandfsdev->nd_dat_node));
 	if (nandfsdev->nd_cp_node)
 		NANDFS_UNSET_SYSTEMFILE(NTOV(nandfsdev->nd_cp_node));
 	if (nandfsdev->nd_su_node)
 		NANDFS_UNSET_SYSTEMFILE(NTOV(nandfsdev->nd_su_node));
 }
 
 /*
  * Validate an fsdata structure: returns non-zero when the magic matches
  * and the stored checksum equals the CRC32 computed over f_bytes bytes
  * with f_sum temporarily zeroed.  f_sum is restored before returning.
  */
 static int
 nandfs_check_fsdata_crc(struct nandfs_fsdata *fsdata)
 {
 	uint32_t stored, computed;
 
 	if (fsdata->f_magic != NANDFS_FSDATA_MAGIC)
 		return (0);
 
 	/* The checksum is computed with the checksum field itself zeroed. */
 	stored = fsdata->f_sum;
 	fsdata->f_sum = (0);
 	computed = crc32((uint8_t *)fsdata, fsdata->f_bytes);
 	fsdata->f_sum = stored;
 
 	return (stored == computed);
 }
 
 /*
  * Validate a super block: returns non-zero when the magic matches and the
  * stored checksum equals the CRC32 computed over f_sbbytes bytes with
  * s_sum temporarily zeroed.  s_sum is restored before returning.
  */
 static int
 nandfs_check_superblock_crc(struct nandfs_fsdata *fsdata,
     struct nandfs_super_block *super)
 {
 	uint32_t stored, computed;
 
 	if (super->s_magic != NANDFS_SUPER_MAGIC)
 		return (0);
 
 	/* The checksum is computed with the checksum field itself zeroed. */
 	stored = super->s_sum;
 	super->s_sum = (0);
 	computed = crc32((uint8_t *)super, fsdata->f_sbbytes);
 	super->s_sum = stored;
 
 	return (stored == computed);
 }
 
 /*
  * Compute the CRC32 of a super block over f_sbbytes bytes, with s_sum
  * zeroed during the computation, and store the result in s_sum.
  */
 static void
 nandfs_calc_superblock_crc(struct nandfs_fsdata *fsdata,
     struct nandfs_super_block *super)
 {
 	uint32_t sum;
 
 	super->s_sum = 0;
 	sum = crc32((uint8_t *)super, fsdata->f_sbbytes);
 
 	/* Record the freshly computed checksum. */
 	super->s_sum = sum;
 }
 
 /*
  * Return 1 when the first `size' bytes of `area' are all 0xff, 0 as soon
  * as any other byte value is found.  A non-positive size yields 1.
  */
 static int
 nandfs_is_empty(u_char *area, int size)
 {
 	u_char *cur = area;
 	int left = size;
 
 	while (left-- > 0) {
 		if (*cur++ != 0xff)
 			return (0);
 	}
 
 	return (1);
 }
 
 /*
  * Number of super blocks that fit in one erase block after the first
  * NANDFS_SBLOCK_OFFSET_BYTES bytes.
  */
 static __inline int
 nandfs_sblocks_in_esize(struct nandfs_device *fsdev)
 {
 	int count;
 
 	count = (fsdev->nd_erasesize - NANDFS_SBLOCK_OFFSET_BYTES) /
 	    sizeof(struct nandfs_super_block);
 	return (count);
 }
 
 /*
  * Total number of super block slots across all NANDFS_NFSAREAS areas.
  */
 static __inline int
 nandfs_max_sblocks(struct nandfs_device *fsdev)
 {
 	int per_area;
 
 	per_area = nandfs_sblocks_in_esize(fsdev);
 	return (NANDFS_NFSAREAS * per_area);
 }
 
 /*
  * Number of super blocks that fit in one device block.
  */
 static __inline int
 nandfs_sblocks_in_block(struct nandfs_device *fsdev)
 {
 	int count;
 
 	count = fsdev->nd_devblocksize / sizeof(struct nandfs_super_block);
 	return (count);
 }
 
 #if 0
 static __inline int
 nandfs_sblocks_in_first_block(struct nandfs_device *fsdev)
 {
 	int n;
 
 	n = nandfs_sblocks_in_block(fsdev) -
 	    NANDFS_SBLOCK_OFFSET_BYTES / sizeof(struct nandfs_super_block);
 	if (n < 0)
 		n = 0;
 
 	return (n);
 }
 #endif
 
 /*
  * Write the in-core super block (fsdev->nd_super) into the next slot of
  * the given fs area.
  *
  * The slot after fstp->last_used is used, wrapping to slot 0 after the
  * last one.  When wrapping to the first slot the area is erased and the
  * fsdata header is rewritten before the super block is stored.  On
  * success fstp->last_used is updated and 0 is returned; otherwise the
  * error from nandfs_erase(), bread() or bwrite() is returned.
  */
 static int
 nandfs_write_superblock_at(struct nandfs_device *fsdev,
     struct nandfs_fsarea *fstp)
 {
 	struct nandfs_super_block *super, *supert;
 	struct buf *bp;
 	int sb_per_sector, sbs_in_fsd, read_block;
 	int index, pos, error;
 	off_t offset;
 
 	DPRINTF(SYNC, ("%s: last_used %d nandfs_sblocks_in_esize %d\n",
 	    __func__, fstp->last_used, nandfs_sblocks_in_esize(fsdev)));
 	/* Pick the next slot, wrapping around at the end of the area. */
 	if (fstp->last_used == nandfs_sblocks_in_esize(fsdev) - 1)
 		index = 0;
 	else
 		index = fstp->last_used + 1;
 
 	super = &fsdev->nd_super;
 	supert = NULL;
 
 	/*
 	 * From here on `index' counts in super-block-sized units from the
 	 * start of the area, so skip the units occupied by the fsdata.
 	 */
 	sb_per_sector = nandfs_sblocks_in_block(fsdev);
 	sbs_in_fsd = sizeof(struct nandfs_fsdata) /
 	    sizeof(struct nandfs_super_block);
 	index += sbs_in_fsd;
 	offset = fstp->offset;
 
 	DPRINTF(SYNC, ("%s: offset %#jx s_last_pseg %#jx s_last_cno %#jx "
 	    "s_last_seq %#jx wtime %jd index %d\n", __func__, offset,
 	    super->s_last_pseg, super->s_last_cno, super->s_last_seq,
 	    super->s_wtime, index));
 
 	/* Device block that contains the chosen slot. */
 	read_block = btodb(offset + rounddown(index, sb_per_sector) *
 	    sizeof(struct nandfs_super_block));
 
 	DPRINTF(SYNC, ("%s: read_block %#x\n", __func__, read_block));
 
 	/* First slot of the area: erase it and rewrite the fsdata header. */
 	if (index == sbs_in_fsd) {
 		error = nandfs_erase(fsdev, offset, fsdev->nd_erasesize);
 		if (error)
 			return (error);
 
 		error = bread(fsdev->nd_devvp, btodb(offset),
 		    fsdev->nd_devblocksize, NOCRED, &bp);
 		if (error) {
 			printf("NANDFS: couldn't read initial data: %d\n",
 			    error);
 			brelse(bp);
 			return (error);
 		}
 		memcpy(bp->b_data, &fsdev->nd_fsdata, sizeof(fsdev->nd_fsdata));
 		/*
 		 * 0xff-out the rest. This bp could be cached, so potentially
 		 * b_data contains stale super blocks.
 		 *
 		 * We don't mind cached bp since most of the time we just add
 		 * super blocks to already 0xff-out b_data and don't need to
 		 * perform actual read.
 		 */
 		if (fsdev->nd_devblocksize > sizeof(fsdev->nd_fsdata))
 			memset(bp->b_data + sizeof(fsdev->nd_fsdata), 0xff,
 			    fsdev->nd_devblocksize - sizeof(fsdev->nd_fsdata));
 		error = bwrite(bp);
 		if (error) {
 			printf("NANDFS: cannot rewrite initial data at %jx\n",
 			    offset);
 			return (error);
 		}
 	}
 
 	error = bread(fsdev->nd_devvp, read_block, fsdev->nd_devblocksize,
 	    NOCRED, &bp);
 	if (error) {
 		brelse(bp);
 		return (error);
 	}
 
 	/* Position of the slot inside the block that was just read. */
 	supert = (struct nandfs_super_block *)(bp->b_data);
 	pos = index % sb_per_sector;
 
 	DPRINTF(SYNC, ("%s: storing at %d\n", __func__, pos));
 	memcpy(&supert[pos], super, sizeof(struct nandfs_super_block));
 
 	/*
 	 * See comment above in code that performs erase.
 	 */
 	if (pos == 0)
 		memset(&supert[1], 0xff,
 		    (sb_per_sector - 1) * sizeof(struct nandfs_super_block));
 
 	error = bwrite(bp);
 	if (error) {
 		printf("NANDFS: cannot update superblock at %jx\n", offset);
 		return (error);
 	}
 
 	/* Record the slot (relative to the first super block slot). */
 	DPRINTF(SYNC, ("%s: fstp->last_used %d -> %d\n", __func__,
 	    fstp->last_used, index - sbs_in_fsd));
 	fstp->last_used = index - sbs_in_fsd;
 
 	return (0);
 }
 
 /*
  * Refresh the in-core super block from the device state, recompute its
  * checksum and write it to the first fs area that accepts it.
  *
  * Areas are tried round-robin starting at nd_last_fsarea; areas already
  * flagged NANDFS_FSSTOR_FAILED are skipped and an area whose write fails
  * is flagged.  On success nd_last_fsarea is advanced past the area that
  * was written and 0 is returned.  If no area could be written the last
  * write error is returned.
  * NOTE(review): when every area was already flagged as failed, no write
  * is attempted and `error' is still 0, so 0 is returned although nothing
  * was written -- confirm whether callers rely on that.
  */
 int
 nandfs_write_superblock(struct nandfs_device *fsdev)
 {
 	struct nandfs_super_block *super;
 	struct timespec ts;
 	int error;
 	int i, j;
 
 	vfs_timestamp(&ts);
 
 	super = &fsdev->nd_super;
 
 	super->s_last_pseg = fsdev->nd_last_pseg;
 	super->s_last_cno = fsdev->nd_last_cno;
 	super->s_last_seq = fsdev->nd_seg_sequence;
 	super->s_wtime = ts.tv_sec;
 
 	nandfs_calc_superblock_crc(&fsdev->nd_fsdata, super);
 
 	error = 0;
 	/*
 	 * The area index must wrap around: (j + 1) % NANDFS_NFSAREAS.  The
 	 * unparenthesised form "j + 1 % N" never wraps and would index
 	 * nd_fsarea[] out of bounds.
 	 */
 	for (i = 0, j = fsdev->nd_last_fsarea; i < NANDFS_NFSAREAS;
 	    i++, j = (j + 1) % NANDFS_NFSAREAS) {
 		if (fsdev->nd_fsarea[j].flags & NANDFS_FSSTOR_FAILED) {
 			DPRINTF(SYNC, ("%s: skipping %d\n", __func__, j));
 			continue;
 		}
 		error = nandfs_write_superblock_at(fsdev, &fsdev->nd_fsarea[j]);
 		if (error) {
 			printf("NANDFS: writing superblock at offset %d failed:"
 			    "%d\n", j * fsdev->nd_erasesize, error);
 			fsdev->nd_fsarea[j].flags |= NANDFS_FSSTOR_FAILED;
 		} else
 			break;
 	}
 
 	if (i == NANDFS_NFSAREAS) {
 		printf("NANDFS: superblock was not written\n");
 		/*
 		 * TODO: switch to read-only?
 		 */
 		return (error);
 	} else
 		fsdev->nd_last_fsarea = (j + 1) % NANDFS_NFSAREAS;
 
 	return (0);
 }
 
 /*
  * Pick the first fsdata copy (out of nfsds candidates) that passes the
  * magic/CRC check.  *fsdata points at it and 0 is returned; if none is
  * valid *fsdata is NULL and EINVAL is returned.
  */
 static int
 nandfs_select_fsdata(struct nandfs_device *fsdev,
     struct nandfs_fsdata *fsdatat, struct nandfs_fsdata **fsdata, int nfsds)
 {
 	int i;
 
 	*fsdata = NULL;
 	for (i = 0; i < nfsds; i++) {
 		DPRINTF(VOLUMES, ("%s: i %d f_magic %x f_crc %x\n", __func__,
 		    i, fsdatat[i].f_magic, fsdatat[i].f_sum));
 		if (nandfs_check_fsdata_crc(&fsdatat[i])) {
 			*fsdata = &fsdatat[i];
 			return (0);
 		}
 	}
 
 	return (EINVAL);
 }
 
 /*
  * Among nsbs candidate super blocks, pick the valid one (magic and CRC
  * check) with the highest s_last_cno; on a tie the earliest candidate is
  * kept.  *super points at it and 0 is returned; if none is valid *super
  * is NULL and EINVAL is returned.
  */
 static int
 nandfs_select_sb(struct nandfs_device *fsdev,
     struct nandfs_super_block *supert, struct nandfs_super_block **super,
     int nsbs)
 {
 	struct nandfs_super_block *best, *cand;
 	int i;
 
 	best = NULL;
 	for (i = 0; i < nsbs; i++) {
 		cand = &supert[i];
 		if (!nandfs_check_superblock_crc(&fsdev->nd_fsdata, cand))
 			continue;
 		DPRINTF(SYNC, ("%s: i %d s_last_cno %jx s_magic %x "
 		    "s_wtime %jd\n", __func__, i, cand->s_last_cno,
 		    cand->s_magic, cand->s_wtime));
 		if (best == NULL || cand->s_last_cno > best->s_last_cno)
 			best = cand;
 	}
 	*super = best;
 
 	return (best == NULL ? EINVAL : 0);
 }
 
 /*
  * Read one fs area from the device: the fsdata header goes to `fsdata'
  * and the super block slots that follow it go to the array at `super'.
  *
  * The area is read in chunks of at most MAXBSIZE bytes.  Afterwards
  * fstp->last_used is set to the index of the last slot that is not
  * entirely 0xff.  On a read error the area is flagged
  * NANDFS_FSSTOR_FAILED and the error is returned.
  * NOTE(review): the caller must provide room at `super' for a full area
  * worth of slots -- confirm against nandfs_read_structures().
  */
 static int
 nandfs_read_structures_at(struct nandfs_device *fsdev,
     struct nandfs_fsarea *fstp, struct nandfs_fsdata *fsdata,
     struct nandfs_super_block *super)
 {
 	struct nandfs_super_block *tsuper, *tsuperd;
 	struct buf *bp;
 	int error, read_size;
 	int i;
 	int offset;
 
 	offset = fstp->offset;
 
 	if (fsdev->nd_erasesize > MAXBSIZE)
 		read_size = MAXBSIZE;
 	else
 		read_size = fsdev->nd_erasesize;
 
 	error = bread(fsdev->nd_devvp, btodb(offset), read_size, NOCRED, &bp);
 	if (error) {
 		printf("couldn't read: %d\n", error);
 		brelse(bp);
 		fstp->flags |= NANDFS_FSSTOR_FAILED;
 		return (error);
 	}
 
 	tsuper = super;
 
 	/* First chunk: fsdata header followed by super block slots. */
 	memcpy(fsdata, bp->b_data, sizeof(struct nandfs_fsdata));
 	memcpy(tsuper, (bp->b_data + sizeof(struct nandfs_fsdata)),
 	    read_size - sizeof(struct nandfs_fsdata));
 	brelse(bp);
 
 	tsuper += (read_size - sizeof(struct nandfs_fsdata)) /
 	    sizeof(struct nandfs_super_block);
 
 	/* Remaining chunks contain super block slots only. */
 	for (i = 1; i < fsdev->nd_erasesize / read_size; i++) {
 		error = bread(fsdev->nd_devvp, btodb(offset + i * read_size),
 		    read_size, NOCRED, &bp);
 		if (error) {
 			printf("couldn't read: %d\n", error);
 			brelse(bp);
 			fstp->flags |= NANDFS_FSSTOR_FAILED;
 			return (error);
 		}
 		memcpy(tsuper, bp->b_data, read_size);
 		tsuper += read_size / sizeof(struct nandfs_super_block);
 		brelse(bp);
 	}
 
 	/*
 	 * Walk backwards from the last slot, decrementing last_used for
 	 * every trailing slot that is still erased (all 0xff).
 	 */
 	tsuper -= 1;
 	fstp->last_used = nandfs_sblocks_in_esize(fsdev) - 1;
 	for (tsuperd = super - 1; (tsuper != tsuperd); tsuper -= 1) {
 		if (nandfs_is_empty((u_char *)tsuper, sizeof(*tsuper)))
 			fstp->last_used--;
 		else
 			break;
 	}
 
 	DPRINTF(VOLUMES, ("%s: last_used %d\n", __func__, fstp->last_used));
 
 	return (0);
 }
 
 /*
  * Read the fsdata and super blocks from every fs area of the device and
  * install the selected ones in fsdev->nd_fsdata and fsdev->nd_super.
  *
  * Areas that fail to read are skipped.  nd_last_fsarea is moved to an
  * area with fewer used slots.  Returns EINVAL when no super block slot
  * was found in use or no valid fsdata/super block could be selected.
  */
 static int
 nandfs_read_structures(struct nandfs_device *fsdev)
 {
 	struct nandfs_fsdata *fsdata, *fsdatat;
 	struct nandfs_super_block *sblocks, *ssblock;
-	int nsbs, nfsds, i;
+	u_int nsbs, nfsds, i;
 	int error = 0;
 	int nrsbs;
 
 	nfsds = NANDFS_NFSAREAS;
 	nsbs = nandfs_max_sblocks(fsdev);
 
 	/* Temporary arrays: one fsdata per area, all super block slots. */
 	fsdatat = malloc(sizeof(struct nandfs_fsdata) * nfsds, M_NANDFSTEMP,
 	    M_WAITOK | M_ZERO);
 	sblocks = malloc(sizeof(struct nandfs_super_block) * nsbs, M_NANDFSTEMP,
 	    M_WAITOK | M_ZERO);
 
 	/* nrsbs counts the used slots gathered so far across all areas. */
 	nrsbs = 0;
 	for (i = 0; i < NANDFS_NFSAREAS; i++) {
 		fsdev->nd_fsarea[i].offset = i * fsdev->nd_erasesize;
 		error = nandfs_read_structures_at(fsdev, &fsdev->nd_fsarea[i],
 		    &fsdatat[i], sblocks + nrsbs);
 		if (error)
 			continue;
 		nrsbs += (fsdev->nd_fsarea[i].last_used + 1);
 		if (fsdev->nd_fsarea[fsdev->nd_last_fsarea].last_used >
 		    fsdev->nd_fsarea[i].last_used)
 			fsdev->nd_last_fsarea = i;
 	}
 
 	if (nrsbs == 0) {
 		printf("nandfs: no valid superblocks found\n");
 		error = EINVAL;
 		goto out;
 	}
 
 	error = nandfs_select_fsdata(fsdev, fsdatat, &fsdata, nfsds);
 	if (error)
 		goto out;
 	memcpy(&fsdev->nd_fsdata, fsdata, sizeof(struct nandfs_fsdata));
 
 	error = nandfs_select_sb(fsdev, sblocks, &ssblock, nsbs);
 	if (error)
 		goto out;
 
 	memcpy(&fsdev->nd_super, ssblock, sizeof(struct nandfs_super_block));
 out:
 	free(fsdatat, M_NANDFSTEMP);
 	free(sblocks, M_NANDFSTEMP);
 
 	if (error == 0)
 		DPRINTF(VOLUMES, ("%s: selected sb with w_time %jd "
 		    "last_pseg %#jx\n", __func__, fsdev->nd_super.s_wtime,
 		    fsdev->nd_super.s_last_pseg));
 
 	return (error);
 }
 
 static void
 nandfs_unmount_base(struct nandfs_device *nandfsdev)
 {
 	int error;
 
 	if (!nandfsdev)
 		return;
 
 	/* Remove all our information */
 	error = vinvalbuf(nandfsdev->nd_devvp, V_SAVE, 0, 0);
 	if (error) {
 		/*
 		 * Flushing buffers failed when fs was umounting, can't do
 		 * much now, just printf error and continue with umount.
 		 */
 		nandfs_error("%s(): error:%d when umounting FS\n",
 		    __func__, error);
 	}
 
 	/* Release the device's system nodes */
 	nandfs_release_system_nodes(nandfsdev);
 }
 
 /*
  * Cache the number of clean segments, as reported by
  * nandfs_get_seg_stat(), in nandfsdev->nd_clean_segs.
  * NOTE(review): the return value of nandfs_get_seg_stat(), if any, is
  * not checked here.
  */
 static void
 nandfs_get_ncleanseg(struct nandfs_device *nandfsdev)
 {
 	struct nandfs_seg_stat nss;
 
 	nandfs_get_seg_stat(nandfsdev, &nss);
 	nandfsdev->nd_clean_segs = nss.nss_ncleansegs;
 	DPRINTF(VOLUMES, ("nandfs_mount: clean segs: %jx\n",
 	    (uintmax_t)nandfsdev->nd_clean_segs));
 }
 
 
 /*
  * Bring up the base state of a device at mount time.
  *
  * Flushes stale buffers, reads and validates the on-disk structures
  * (revision, erase size), derives the block size and metadata constants,
  * locates the super root, initialises the running segment/checkpoint
  * state from the super block and finally creates the system vnodes.
  *
  * Returns 0 on success or an errno value.  If creating the system nodes
  * fails, the base is torn down again and the error is returned right
  * away, without querying segment statistics afterwards.
  * NOTE(review): `mp' and `args' are not used in this function.
  */
 static int
 nandfs_mount_base(struct nandfs_device *nandfsdev, struct mount *mp,
     struct nandfs_args *args)
 {
 	uint32_t log_blocksize;
 	int error;
 
 	/* Flush out any old buffers remaining from a previous use. */
 	if ((error = vinvalbuf(nandfsdev->nd_devvp, V_SAVE, 0, 0)))
 		return (error);
 
 	error = nandfs_read_structures(nandfsdev);
 	if (error) {
 		printf("nandfs: could not get valid filesystem structures\n");
 		return (error);
 	}
 
 	if (nandfsdev->nd_fsdata.f_rev_level != NANDFS_CURRENT_REV) {
 		printf("nandfs: unsupported file system revision: %d "
 		    "(supported is %d).\n", nandfsdev->nd_fsdata.f_rev_level,
 		    NANDFS_CURRENT_REV);
 		return (EINVAL);
 	}
 
 	if (nandfsdev->nd_fsdata.f_erasesize != nandfsdev->nd_erasesize) {
 		printf("nandfs: erasesize mismatch (device %#x, fs %#x)\n",
 		    nandfsdev->nd_erasesize, nandfsdev->nd_fsdata.f_erasesize);
 		return (EINVAL);
 	}
 
 	/* Get our blocksize */
 	log_blocksize = nandfsdev->nd_fsdata.f_log_block_size;
 	nandfsdev->nd_blocksize = (uint64_t) 1 << (log_blocksize + 10);
 	DPRINTF(VOLUMES, ("%s: blocksize:%x\n", __func__,
 	    nandfsdev->nd_blocksize));
 
 	DPRINTF(VOLUMES, ("%s: accepted super block with cp %#jx\n", __func__,
 	    (uintmax_t)nandfsdev->nd_super.s_last_cno));
 
 	/* Calculate dat structure parameters */
 	nandfs_calc_mdt_consts(nandfsdev, &nandfsdev->nd_dat_mdt,
 	    nandfsdev->nd_fsdata.f_dat_entry_size);
 	nandfs_calc_mdt_consts(nandfsdev, &nandfsdev->nd_ifile_mdt,
 	    nandfsdev->nd_fsdata.f_inode_size);
 
 	/* Search for the super root and roll forward when needed */
 	if (nandfs_search_super_root(nandfsdev)) {
 		printf("Cannot find valid SuperRoot\n");
 		return (EINVAL);
 	}
 
 	nandfsdev->nd_mount_state = nandfsdev->nd_super.s_state;
 	if (nandfsdev->nd_mount_state != NANDFS_VALID_FS) {
 		printf("FS is seriously damaged, needs repairing\n");
 		printf("aborting mount\n");
 		return (EINVAL);
 	}
 
 	/*
 	 * FS should be ok now. The superblock and the last segsum could be
 	 * updated from the repair so extract running values again.
 	 */
 	nandfsdev->nd_last_pseg = nandfsdev->nd_super.s_last_pseg;
 	nandfsdev->nd_seg_sequence = nandfsdev->nd_super.s_last_seq;
 	nandfsdev->nd_seg_num = nandfs_get_segnum_of_block(nandfsdev,
 	    nandfsdev->nd_last_pseg);
 	nandfsdev->nd_next_seg_num = nandfs_get_segnum_of_block(nandfsdev,
 	    nandfsdev->nd_last_segsum.ss_next);
 	nandfsdev->nd_ts.tv_sec = nandfsdev->nd_last_segsum.ss_create;
 	nandfsdev->nd_last_cno = nandfsdev->nd_super.s_last_cno;
 	nandfsdev->nd_fakevblk = 1;
 	/*
 	 * FIXME: bogus calculation. Should use actual number of usable segments
 	 * instead of total amount.
 	 */
 	nandfsdev->nd_segs_reserved =
 	    nandfsdev->nd_fsdata.f_nsegments *
 	    nandfsdev->nd_fsdata.f_r_segments_percentage / 100;
 	nandfsdev->nd_last_ino  = NANDFS_USER_INO;
 	DPRINTF(VOLUMES, ("%s: last_pseg %#jx last_cno %#jx last_seq %#jx\n"
 	    "fsdev: last_seg: seq %#jx num %#jx, next_seg_num %#jx "
 	    "segs_reserved %#jx\n",
 	    __func__, (uintmax_t)nandfsdev->nd_last_pseg,
 	    (uintmax_t)nandfsdev->nd_last_cno,
 	    (uintmax_t)nandfsdev->nd_seg_sequence,
 	    (uintmax_t)nandfsdev->nd_seg_sequence,
 	    (uintmax_t)nandfsdev->nd_seg_num,
 	    (uintmax_t)nandfsdev->nd_next_seg_num,
 	    (uintmax_t)nandfsdev->nd_segs_reserved));
 
 	DPRINTF(VOLUMES, ("nandfs_mount: accepted super root\n"));
 
 	/* Create system vnodes for DAT, CP and SEGSUM */
 	error = nandfs_create_system_nodes(nandfsdev);
 	if (error) {
 		/*
 		 * The system nodes were disposed on failure; do not go on
 		 * to gather segment statistics from them.
 		 */
 		nandfs_unmount_base(nandfsdev);
 		return (error);
 	}
 
 	nandfs_get_ncleanseg(nandfsdev);
 
 	return (0);
 }
 
 /*
  * Drop one reference on a nandfs device.  When the last reference goes
  * away the device is torn down: the base is unmounted, the device is
  * unlinked from nandfs_devices, the GEOM consumer is closed, the device
  * vnode and cdev references are released, the locks are destroyed and
  * the structure is freed.  A NULL nandfsdev is silently ignored.
  */
 static void
 nandfs_unmount_device(struct nandfs_device *nandfsdev)
 {
 	struct cdev *dev;
 
 	/* Is there anything? */
 	if (nandfsdev == NULL)
 		return;
 
 	/* Remove the device only if we're the last reference */
 	nandfsdev->nd_refcnt--;
 	if (nandfsdev->nd_refcnt >= 1)
 		return;
 
 	/* Syncer, cleaner and free-base must already be gone at this point */
 	MPASS(nandfsdev->nd_syncer == NULL);
 	MPASS(nandfsdev->nd_cleaner == NULL);
 	MPASS(nandfsdev->nd_free_base == NULL);
 
 	/* Unmount our base */
 	nandfs_unmount_base(nandfsdev);
 
 	/* Remove from our device list */
 	SLIST_REMOVE(&nandfs_devices, nandfsdev, nandfs_device, nd_next_device);
 
 	DROP_GIANT();
 	g_topology_lock();
 	g_vfs_close(nandfsdev->nd_gconsumer);
 	g_topology_unlock();
 	PICKUP_GIANT();
 
 	DPRINTF(VOLUMES, ("closing device\n"));
 
 	/*
 	 * Fetch the cdev while we still hold our vnode reference: the vnode
 	 * must not be dereferenced once vrele() has dropped that reference.
 	 */
 	dev = nandfsdev->nd_devvp->v_rdev;
 
 	/* Clear our mount reference and release device node */
 	vrele(nandfsdev->nd_devvp);
 
 	dev_rel(dev);
 
 	/* Free our device info */
 	cv_destroy(&nandfsdev->nd_sync_cv);
 	mtx_destroy(&nandfsdev->nd_sync_mtx);
 	cv_destroy(&nandfsdev->nd_clean_cv);
 	mtx_destroy(&nandfsdev->nd_clean_mtx);
 	mtx_destroy(&nandfsdev->nd_mutex);
 	lockdestroy(&nandfsdev->nd_seg_const);
 	free(nandfsdev, M_NANDFSMNT);
 }
 
 static int
 nandfs_check_mounts(struct nandfs_device *nandfsdev, struct mount *mp,
     struct nandfs_args *args)
 {
 	struct nandfsmount *nmp;
 	uint64_t last_cno;
 
 	/* no double-mounting of the same checkpoint */
 	STAILQ_FOREACH(nmp, &nandfsdev->nd_mounts, nm_next_mount) {
 		if (nmp->nm_mount_args.cpno == args->cpno)
 			return (EBUSY);
 	}
 
 	/* Allow readonly mounts without questioning here */
 	if (mp->mnt_flag & MNT_RDONLY)
 		return (0);
 
 	/* Read/write mount */
 	STAILQ_FOREACH(nmp, &nandfsdev->nd_mounts, nm_next_mount) {
 		/* Only one RW mount on this device! */
 		if ((nmp->nm_vfs_mountp->mnt_flag & MNT_RDONLY)==0)
 			return (EROFS);
 		/* RDONLY on last mountpoint is device busy */
 		last_cno = nmp->nm_nandfsdev->nd_super.s_last_cno;
 		if (nmp->nm_mount_args.cpno == last_cno)
 			return (EBUSY);
 	}
 
 	/* OK for now */
 	return (0);
 }
 
 /*
  * Attach mount `mp' to the nandfs device backed by vnode `devvp'.
  *
  * If the device is already on the nandfs_devices list, the new mount is
  * validated with nandfs_check_mounts(), the device reference count is
  * bumped and, for read/write mounts, write access is requested from GEOM.
  * Otherwise the device is opened through GEOM, a nandfs_device is
  * allocated and initialised, the erase size is queried (falling back to
  * NANDFS_DEF_ERASESIZE when the provider does not support the attribute)
  * and the base file system is mounted.
  *
  * On success 0 is returned and *nandfsdev_p points at the device.  On
  * failure an errno value is returned, *nandfsdev_p is NULL and no device
  * reference is held on behalf of the caller.
  */
 static int
 nandfs_mount_device(struct vnode *devvp, struct mount *mp,
     struct nandfs_args *args, struct nandfs_device **nandfsdev_p)
 {
 	struct nandfs_device *nandfsdev;
 	struct g_provider *pp;
 	struct g_consumer *cp;
 	struct cdev *dev;
 	uint32_t erasesize;
 	int error, size;
 	int ronly;
 
 	DPRINTF(VOLUMES, ("Mounting NANDFS device\n"));
 
 	ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
 
 	/* Look up device in our nandfs_mountpoints */
 	*nandfsdev_p = NULL;
 	SLIST_FOREACH(nandfsdev, &nandfs_devices, nd_next_device)
 		if (nandfsdev->nd_devvp == devvp)
 			break;
 
 	if (nandfsdev) {
 		DPRINTF(VOLUMES, ("device already mounted\n"));
 		error = nandfs_check_mounts(nandfsdev, mp, args);
 		if (error)
 			return (error);
 		nandfsdev->nd_refcnt++;
 		*nandfsdev_p = nandfsdev;
 
 		if (!ronly) {
 			DROP_GIANT();
 			g_topology_lock();
 			error = g_access(nandfsdev->nd_gconsumer, 0, 1, 0);
 			g_topology_unlock();
 			PICKUP_GIANT();
 			if (error) {
 				/*
 				 * The caller does not unmount the device when
 				 * we fail, so give back the reference taken
 				 * above instead of leaking it.
 				 */
 				nandfsdev->nd_refcnt--;
 				*nandfsdev_p = NULL;
 			}
 		}
 		return (error);
 	}
 
 	/* First mount of this device: open it through GEOM */
 	vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
 	dev = devvp->v_rdev;
 	dev_ref(dev);
 	DROP_GIANT();
 	g_topology_lock();
 	error = g_vfs_open(devvp, &cp, "nandfs", ronly ? 0 : 1);
 	pp = g_dev_getprovider(dev);
 	g_topology_unlock();
 	PICKUP_GIANT();
 	VOP_UNLOCK(devvp, 0);
 	if (error) {
 		dev_rel(dev);
 		return (error);
 	}
 
 	nandfsdev = malloc(sizeof(struct nandfs_device), M_NANDFSMNT, M_WAITOK | M_ZERO);
 
 	/* Initialise */
 	nandfsdev->nd_refcnt = 1;
 	nandfsdev->nd_devvp = devvp;
 	nandfsdev->nd_syncing = 0;
 	nandfsdev->nd_cleaning = 0;
 	nandfsdev->nd_gconsumer = cp;
 	cv_init(&nandfsdev->nd_sync_cv, "nandfssync");
 	mtx_init(&nandfsdev->nd_sync_mtx, "nffssyncmtx", NULL, MTX_DEF);
 	cv_init(&nandfsdev->nd_clean_cv, "nandfsclean");
 	mtx_init(&nandfsdev->nd_clean_mtx, "nffscleanmtx", NULL, MTX_DEF);
 	mtx_init(&nandfsdev->nd_mutex, "nandfsdev lock", NULL, MTX_DEF);
 	lockinit(&nandfsdev->nd_seg_const, PVFS, "nffssegcon", VLKTIMEOUT,
 	    LK_CANRECURSE);
 	STAILQ_INIT(&nandfsdev->nd_mounts);
 
 	nandfsdev->nd_devsize = pp->mediasize;
 	nandfsdev->nd_devblocksize = pp->sectorsize;
 
 	size = sizeof(erasesize);
 	error = g_io_getattr("NAND::blocksize", nandfsdev->nd_gconsumer, &size,
 	    &erasesize);
 	if (error) {
 		DPRINTF(VOLUMES, ("couldn't get erasesize: %d\n", error));
 
 		if (error == ENOIOCTL || error == EOPNOTSUPP) {
 			/*
 			 * We conclude that this is not NAND storage
 			 */
 			erasesize = NANDFS_DEF_ERASESIZE;
 		} else {
 			/*
 			 * The device is not on nandfs_devices yet, so undo
 			 * the open by hand rather than through
 			 * nandfs_unmount_device().
 			 */
 			DROP_GIANT();
 			g_topology_lock();
 			g_vfs_close(nandfsdev->nd_gconsumer);
 			g_topology_unlock();
 			PICKUP_GIANT();
 			dev_rel(dev);
 			free(nandfsdev, M_NANDFSMNT);
 			return (error);
 		}
 	}
 	nandfsdev->nd_erasesize = erasesize;
 
 	DPRINTF(VOLUMES, ("%s: erasesize %x\n", __func__,
 	    nandfsdev->nd_erasesize));
 
 	/* Register nandfs_device in list */
 	SLIST_INSERT_HEAD(&nandfs_devices, nandfsdev, nd_next_device);
 
 	error = nandfs_mount_base(nandfsdev, mp, args);
 	if (error) {
 		/* Remove all our information */
 		nandfs_unmount_device(nandfsdev);
 		return (EINVAL);
 	}
 
 	nandfsdev->nd_maxfilesize = nandfs_get_maxfilesize(nandfsdev);
 
 	*nandfsdev_p = nandfsdev;
 	DPRINTF(VOLUMES, ("NANDFS device mounted ok\n"));
 
 	return (0);
 }
 
 static int
 nandfs_mount_checkpoint(struct nandfsmount *nmp)
 {
 	struct nandfs_cpfile_header *cphdr;
 	struct nandfs_checkpoint *cp;
 	struct nandfs_inode ifile_inode;
 	struct nandfs_node *cp_node;
 	struct buf *bp;
 	uint64_t ncp, nsn, cpno, fcpno, blocknr, last_cno;
 	uint32_t off, dlen;
 	int cp_per_block, error;
 
 	cpno = nmp->nm_mount_args.cpno;
 	if (cpno == 0)
 		cpno = nmp->nm_nandfsdev->nd_super.s_last_cno;
 
 	DPRINTF(VOLUMES, ("%s: trying to mount checkpoint number %"PRIu64"\n",
 	    __func__, cpno));
 
 	cp_node = nmp->nm_nandfsdev->nd_cp_node;
 
 	VOP_LOCK(NTOV(cp_node), LK_SHARED);
 	/* Get cpfile header from 1st block of cp file */
 	error = nandfs_bread(cp_node, 0, NOCRED, 0, &bp);
 	if (error) {
 		brelse(bp);
 		VOP_UNLOCK(NTOV(cp_node), 0);
 		return (error);
 	}
 
 	cphdr = (struct nandfs_cpfile_header *) bp->b_data;
 	ncp = cphdr->ch_ncheckpoints;
 	nsn = cphdr->ch_nsnapshots;
 
 	brelse(bp);
 
 	DPRINTF(VOLUMES, ("mount_nandfs: checkpoint header read in\n"));
 	DPRINTF(VOLUMES, ("\tNumber of checkpoints %"PRIu64"\n", ncp));
 	DPRINTF(VOLUMES, ("\tNumber of snapshots %"PRIu64"\n", nsn));
 
 	/* Read in our specified checkpoint */
 	dlen = nmp->nm_nandfsdev->nd_fsdata.f_checkpoint_size;
 	cp_per_block = nmp->nm_nandfsdev->nd_blocksize / dlen;
 
 	fcpno = cpno + NANDFS_CPFILE_FIRST_CHECKPOINT_OFFSET - 1;
 	blocknr = fcpno / cp_per_block;
 	off = (fcpno % cp_per_block) * dlen;
 	error = nandfs_bread(cp_node, blocknr, NOCRED, 0, &bp);
 	if (error) {
 		brelse(bp);
 		VOP_UNLOCK(NTOV(cp_node), 0);
 		printf("mount_nandfs: couldn't read cp block %"PRIu64"\n",
 		    fcpno);
 		return (EINVAL);
 	}
 
 	/* Needs to be a valid checkpoint */
 	cp = (struct nandfs_checkpoint *) ((uint8_t *) bp->b_data + off);
 	if (cp->cp_flags & NANDFS_CHECKPOINT_INVALID) {
 		printf("mount_nandfs: checkpoint marked invalid\n");
 		brelse(bp);
 		VOP_UNLOCK(NTOV(cp_node), 0);
 		return (EINVAL);
 	}
 
 	/* Is this really the checkpoint we want? */
 	if (cp->cp_cno != cpno) {
 		printf("mount_nandfs: checkpoint file corrupt? "
 		    "expected cpno %"PRIu64", found cpno %"PRIu64"\n",
 		    cpno, cp->cp_cno);
 		brelse(bp);
 		VOP_UNLOCK(NTOV(cp_node), 0);
 		return (EINVAL);
 	}
 
 	/* Check if it's a snapshot ! */
 	last_cno = nmp->nm_nandfsdev->nd_super.s_last_cno;
 	if (cpno != last_cno) {
 		/* Only allow snapshots if not mounting on the last cp */
 		if ((cp->cp_flags & NANDFS_CHECKPOINT_SNAPSHOT) == 0) {
 			printf( "mount_nandfs: checkpoint %"PRIu64" is not a "
 			    "snapshot\n", cpno);
 			brelse(bp);
 			VOP_UNLOCK(NTOV(cp_node), 0);
 			return (EINVAL);
 		}
 	}
 
 	ifile_inode = cp->cp_ifile_inode;
 	brelse(bp);
 
 	/* Get ifile inode */
 	error = nandfs_get_node_raw(nmp->nm_nandfsdev, NULL, NANDFS_IFILE_INO,
 	    &ifile_inode, &nmp->nm_ifile_node);
 	if (error) {
 		printf("mount_nandfs: can't read ifile node\n");
 		VOP_UNLOCK(NTOV(cp_node), 0);
 		return (EINVAL);
 	}
 
 	NANDFS_SET_SYSTEMFILE(NTOV(nmp->nm_ifile_node));
 	VOP_UNLOCK(NTOV(cp_node), 0);
 	/* Get root node? */
 
 	return (0);
 }
 
 /*
  * Release the nandfsmount structure hanging off `mp', if there is one.
  * The mount's private data pointer itself is left for the caller to clear.
  */
 static void
 free_nandfs_mountinfo(struct mount *mp)
 {
 	struct nandfsmount *info;
 
 	info = VFSTONANDFS(mp);
 	if (info != NULL)
 		free(info, M_NANDFSMNT);
 }
 
 /*
  * Kick the syncer thread of `nffsdev' and wait on nd_sync_cv until it
  * signals completion.
  *
  * `reason' is one of the SYNCER_* codes; it is used as an index into the
  * debug name table below and, when equal to SYNCER_UMOUNT, additionally
  * asks the syncer to exit.
  */
 void
 nandfs_wakeup_wait_sync(struct nandfs_device *nffsdev, int reason)
 {
 	/* Debug names, indexed by `reason'; only referenced from DPRINTF */
 	char *reasons[] = {
 	    "umount",
 	    "vfssync",
 	    "bdflush",
 	    "fforce",
 	    "fsync",
 	    "ro_upd"
 	};
 
 	DPRINTF(SYNC, ("%s: %s\n", __func__, reasons[reason]));
 	mtx_lock(&nffsdev->nd_sync_mtx);
 	/* A sync is already marked in progress: wait (once) on the cv first */
 	if (nffsdev->nd_syncing)
 		cv_wait(&nffsdev->nd_sync_cv, &nffsdev->nd_sync_mtx);
 	if (reason == SYNCER_UMOUNT)
 		nffsdev->nd_syncer_exit = 1;
 	nffsdev->nd_syncing = 1;
 	/* &nd_syncing is the channel the syncer tsleep()s on */
 	wakeup(&nffsdev->nd_syncing);
 	/* Wait for the cv_broadcast() issued by nandfs_gc_finished() */
 	cv_wait(&nffsdev->nd_sync_cv, &nffsdev->nd_sync_mtx);
 
 	mtx_unlock(&nffsdev->nd_sync_mtx);
 }
 
 /*
  * Mark the current sync pass on `nffsdev' as finished and wake everybody
  * waiting on nd_sync_cv.  Unless `exit' is set, the caller is then put to
  * sleep on &nd_syncing until it is woken up or nandfs_sync_interval
  * seconds have passed.
  */
 static void
 nandfs_gc_finished(struct nandfs_device *nffsdev, int exit)
 {
 	int error;
 
 	mtx_lock(&nffsdev->nd_sync_mtx);
 	nffsdev->nd_syncing = 0;
 	DPRINTF(SYNC, ("%s: cleaner finish\n", __func__));
 	cv_broadcast(&nffsdev->nd_sync_cv);
 	mtx_unlock(&nffsdev->nd_sync_mtx);
 
 	/* The thread is about to terminate: do not go back to sleep. */
 	if (exit)
 		return;
 
 	error = tsleep(&nffsdev->nd_syncing, PRIBIO, "-",
 	    hz * nandfs_sync_interval);
 	DPRINTF(SYNC, ("%s: cleaner waked up: %d\n", __func__, error));
 }
 
 /*
  * Body of the "nandfs_syncer" kernel thread started by start_syncer().
  *
  * The thread repeatedly runs the segment constructor for mount `nmp',
  * announcing completion and sleeping between passes through
  * nandfs_gc_finished(), until nd_syncer_exit is set.  It then performs one
  * final forced pass, clears nd_syncer and terminates.
  */
 static void
 nandfs_syncer(struct nandfsmount *nmp)
 {
 	struct nandfs_device *nffsdev;
 	struct mount *mp;
 	int flags, error;
 
 	mp = nmp->nm_vfs_mountp;
 	nffsdev = nmp->nm_nandfsdev;
 	/* Initial sleep: until woken on &nd_syncing or the interval elapses */
 	tsleep(&nffsdev->nd_syncing, PRIBIO, "-", hz * nandfs_sync_interval);
 
 	while (!nffsdev->nd_syncer_exit) {
 		DPRINTF(SYNC, ("%s: syncer run\n", __func__));
 		nffsdev->nd_syncing = 1;
 
 		/* Snapshot the force/umount request bits for this pass */
 		flags = (nmp->nm_flags & (NANDFS_FORCE_SYNCER | NANDFS_UMOUNT));
 
 		error = nandfs_segment_constructor(nmp, flags);
 		if (error)
 			nandfs_error("%s: error:%d when creating segments\n",
 			    __func__, error);
 
 		/* Clear only the bits that were handed to the constructor */
 		nmp->nm_flags &= ~flags;
 
 		/* Wake waiters, then sleep until the next pass */
 		nandfs_gc_finished(nffsdev, 0);
 	}
 
 	/* Exit requested: one last forced pass before terminating */
 	MPASS(nffsdev->nd_cleaner == NULL);
 	error = nandfs_segment_constructor(nmp,
 	    NANDFS_FORCE_SYNCER | NANDFS_UMOUNT);
 	if (error)
 		nandfs_error("%s: error:%d when creating segments\n",
 		    __func__, error);
 	/* Wake waiters without sleeping again */
 	nandfs_gc_finished(nffsdev, 1);
 	nffsdev->nd_syncer = NULL;
 	MPASS(nffsdev->nd_free_base == NULL);
 
 	DPRINTF(SYNC, ("%s: exiting\n", __func__));
 	kthread_exit();
 }
 
 static int
 start_syncer(struct nandfsmount *nmp)
 {
 	int error;
 
 	MPASS(nmp->nm_nandfsdev->nd_syncer == NULL);
 
 	DPRINTF(SYNC, ("%s: start syncer\n", __func__));
 
 	nmp->nm_nandfsdev->nd_syncer_exit = 0;
 
 	error = kthread_add((void(*)(void *))nandfs_syncer, nmp, NULL,
 	    &nmp->nm_nandfsdev->nd_syncer, 0, 0, "nandfs_syncer");
 
 	if (error)
 		printf("nandfs: could not start syncer: %d\n", error);
 
 	return (error);
 }
 
 static int
 stop_syncer(struct nandfsmount *nmp)
 {
 
 	MPASS(nmp->nm_nandfsdev->nd_syncer != NULL);
 
 	nandfs_wakeup_wait_sync(nmp->nm_nandfsdev, SYNCER_UMOUNT);
 
 	DPRINTF(SYNC, ("%s: stop syncer\n", __func__));
 	return (0);
 }
 
 /*
  * VFS mount entry point for nandfs.
  *
  * For MNT_UPDATE requests this handles read/write -> read-only downgrades
  * (sync, flush, stop cleaner and syncer, drop GEOM write access) and
  * read-only -> read/write upgrades (permission check, acquire GEOM write
  * access, start syncer and cleaner).  Otherwise the device named by the
  * "from" option is looked up, verified to be a disk the caller may read,
  * and handed to nandfs_mountfs().
  *
  * Returns 0 on success or an errno value.
  */
 static int
 nandfs_mount(struct mount *mp)
 {
 	struct nandfsmount *nmp;
 	struct vnode *devvp;
 	struct nameidata nd;
 	struct vfsoptlist *opts;
 	struct thread *td;
 	char *from;
 	int error = 0, flags;
 
 	DPRINTF(VOLUMES, ("%s: mp = %p\n", __func__, (void *)mp));
 
 	td = curthread;
 	opts = mp->mnt_optnew;
 
 	if (vfs_filteropt(opts, nandfs_opts))
 		return (EINVAL);
 
 	/*
 	 * Update of an existing mount: only export requests and ro/rw
 	 * transitions are acted upon.
 	 */
 	if (mp->mnt_flag & MNT_UPDATE) {
 		nmp = VFSTONANDFS(mp);
 		if (vfs_flagopt(mp->mnt_optnew, "export", NULL, 0)) {
 			return (error);
 		}
 		if (!(nmp->nm_ronly) && vfs_flagopt(opts, "ro", NULL, 0)) {
 			/*
 			 * Read/write -> read-only.  The write reference taken
 			 * by vn_start_write() must be dropped whether or not
 			 * the sync succeeds.
 			 */
 			vn_start_write(NULL, &mp, V_WAIT);
 			error = VFS_SYNC(mp, MNT_WAIT);
 			vn_finished_write(mp);
 			if (error)
 				return (error);
 
 			flags = WRITECLOSE;
 			if (mp->mnt_flag & MNT_FORCE)
 				flags |= FORCECLOSE;
 
 			nandfs_wakeup_wait_sync(nmp->nm_nandfsdev,
 			    SYNCER_ROUPD);
 			error = vflush(mp, 0, flags, td);
 			if (error)
 				return (error);
 
 			nandfs_stop_cleaner(nmp->nm_nandfsdev);
 			stop_syncer(nmp);
 			/* Give up our GEOM write access */
 			DROP_GIANT();
 			g_topology_lock();
 			g_access(nmp->nm_nandfsdev->nd_gconsumer, 0, -1, 0);
 			g_topology_unlock();
 			PICKUP_GIANT();
 			MNT_ILOCK(mp);
 			mp->mnt_flag |= MNT_RDONLY;
 			MNT_IUNLOCK(mp);
 			nmp->nm_ronly = 1;
 
 		} else if ((nmp->nm_ronly) &&
 		    !vfs_flagopt(opts, "ro", NULL, 0)) {
 			/*
 			 * Don't allow read-write snapshots.
 			 */
 			if (nmp->nm_mount_args.cpno != 0)
 				return (EROFS);
 			/*
 			 * If upgrade to read-write by non-root, then verify
 			 * that user has necessary permissions on the device.
 			 */
 			devvp = nmp->nm_nandfsdev->nd_devvp;
 			vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
 			error = VOP_ACCESS(devvp, VREAD | VWRITE,
 			    td->td_ucred, td);
 			if (error) {
 				error = priv_check(td, PRIV_VFS_MOUNT_PERM);
 				if (error) {
 					VOP_UNLOCK(devvp, 0);
 					return (error);
 				}
 			}
 
 			VOP_UNLOCK(devvp, 0);
 			/* Acquire GEOM write access */
 			DROP_GIANT();
 			g_topology_lock();
 			error = g_access(nmp->nm_nandfsdev->nd_gconsumer, 0, 1,
 			    0);
 			g_topology_unlock();
 			PICKUP_GIANT();
 			if (error)
 				return (error);
 
 			MNT_ILOCK(mp);
 			mp->mnt_flag &= ~MNT_RDONLY;
 			MNT_IUNLOCK(mp);
 			error = start_syncer(nmp);
 			if (error == 0)
 				error = nandfs_start_cleaner(nmp->nm_nandfsdev);
 			if (error) {
 				/* Hand back the write access acquired above */
 				DROP_GIANT();
 				g_topology_lock();
 				g_access(nmp->nm_nandfsdev->nd_gconsumer, 0, -1,
 				    0);
 				g_topology_unlock();
 				PICKUP_GIANT();
 				return (error);
 			}
 
 			nmp->nm_ronly = 0;
 		}
 		return (0);
 	}
 
 	from = vfs_getopts(opts, "from", &error);
 	if (error)
 		return (error);
 
 	/*
 	 * Find device node
 	 */
 	NDINIT(&nd, LOOKUP, FOLLOW|LOCKLEAF, UIO_SYSSPACE, from, curthread);
 	error = namei(&nd);
 	if (error)
 		return (error);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 
 	devvp = nd.ni_vp;
 
 	if (!vn_isdisk(devvp, &error)) {
 		vput(devvp);
 		return (error);
 	}
 
 	/* Check the access rights on the mount device */
 	error = VOP_ACCESS(devvp, VREAD, curthread->td_ucred, curthread);
 	if (error)
 		error = priv_check(curthread, PRIV_VFS_MOUNT_PERM);
 	if (error) {
 		vput(devvp);
 		return (error);
 	}
 
 	vfs_getnewfsid(mp);
 
 	/* nandfs_mountfs() unlocks devvp itself */
 	error = nandfs_mountfs(devvp, mp);
 	if (error)
 		return (error);
 	vfs_mountedfrom(mp, from);
 
 	return (0);
 }
 
 static int
 nandfs_mountfs(struct vnode *devvp, struct mount *mp)
 {
 	struct nandfsmount *nmp = NULL;
 	struct nandfs_args *args = NULL;
 	struct nandfs_device *nandfsdev;
 	char *from;
 	int error, ronly;
 	char *cpno;
 
 	ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
 
 	if (devvp->v_rdev->si_iosize_max != 0)
 		mp->mnt_iosize_max = devvp->v_rdev->si_iosize_max;
 	VOP_UNLOCK(devvp, 0);
 
 	if (mp->mnt_iosize_max > MAXPHYS)
 		mp->mnt_iosize_max = MAXPHYS;
 
 	from = vfs_getopts(mp->mnt_optnew, "from", &error);
 	if (error)
 		goto error;
 
 	error = vfs_getopt(mp->mnt_optnew, "snap", (void **)&cpno, NULL);
 	if (error == ENOENT)
 		cpno = NULL;
 	else if (error)
 		goto error;
 
 	args = (struct nandfs_args *)malloc(sizeof(struct nandfs_args),
 	    M_NANDFSMNT, M_WAITOK | M_ZERO);
 
 	if (cpno != NULL)
 		args->cpno = strtoul(cpno, (char **)NULL, 10);
 	else
 		args->cpno = 0;
 	args->fspec = from;
 
 	if (args->cpno != 0 && !ronly) {
 		error = EROFS;
 		goto error;
 	}
 
 	printf("WARNING: NANDFS is considered to be a highly experimental "
 	    "feature in FreeBSD.\n");
 
 	error = nandfs_mount_device(devvp, mp, args, &nandfsdev);
 	if (error)
 		goto error;
 
 	nmp = (struct nandfsmount *) malloc(sizeof(struct nandfsmount),
 	    M_NANDFSMNT, M_WAITOK | M_ZERO);
 
 	mp->mnt_data = nmp;
 	nmp->nm_vfs_mountp = mp;
 	nmp->nm_ronly = ronly;
 	MNT_ILOCK(mp);
 	mp->mnt_flag |= MNT_LOCAL;
 	mp->mnt_kern_flag |= MNTK_USES_BCACHE;
 	MNT_IUNLOCK(mp);
 	nmp->nm_nandfsdev = nandfsdev;
 	/* Add our mountpoint */
 	STAILQ_INSERT_TAIL(&nandfsdev->nd_mounts, nmp, nm_next_mount);
 
 	if (args->cpno > nandfsdev->nd_last_cno) {
 		printf("WARNING: supplied checkpoint number (%jd) is greater "
 		    "than last known checkpoint on filesystem (%jd). Mounting"
 		    " checkpoint %jd\n", (uintmax_t)args->cpno,
 		    (uintmax_t)nandfsdev->nd_last_cno,
 		    (uintmax_t)nandfsdev->nd_last_cno);
 		args->cpno = nandfsdev->nd_last_cno;
 	}
 
 	/* Setting up other parameters */
 	nmp->nm_mount_args = *args;
 	free(args, M_NANDFSMNT);
 	error = nandfs_mount_checkpoint(nmp);
 	if (error) {
 		nandfs_unmount(mp, MNT_FORCE);
 		goto unmounted;
 	}
 
 	if (!ronly) {
 		error = start_syncer(nmp);
 		if (error == 0)
 			error = nandfs_start_cleaner(nmp->nm_nandfsdev);
 		if (error)
 			nandfs_unmount(mp, MNT_FORCE);
 	}
 
 	return (0);
 
 error:
 	if (args != NULL)
 		free(args, M_NANDFSMNT);
 
 	if (nmp != NULL) {
 		free(nmp, M_NANDFSMNT);
 		mp->mnt_data = NULL;
 	}
 unmounted:
 	return (error);
 }
 
 static int
 nandfs_unmount(struct mount *mp, int mntflags)
 {
 	struct nandfs_device *nandfsdev;
 	struct nandfsmount *nmp;
 	int error;
 	int flags = 0;
 
 	DPRINTF(VOLUMES, ("%s: mp = %p\n", __func__, (void *)mp));
 
 	if (mntflags & MNT_FORCE)
 		flags |= FORCECLOSE;
 
 	nmp = mp->mnt_data;
 	nandfsdev = nmp->nm_nandfsdev;
 
 	error = vflush(mp, 0, flags | SKIPSYSTEM, curthread);
 	if (error)
 		return (error);
 
 	if (!(nmp->nm_ronly)) {
 		nandfs_stop_cleaner(nandfsdev);
 		stop_syncer(nmp);
 	}
 
 	if (nmp->nm_ifile_node)
 		NANDFS_UNSET_SYSTEMFILE(NTOV(nmp->nm_ifile_node));
 
 	/* Remove our mount point */
 	STAILQ_REMOVE(&nandfsdev->nd_mounts, nmp, nandfsmount, nm_next_mount);
 
 	/* Unmount the device itself when we're the last one */
 	nandfs_unmount_device(nandfsdev);
 
 	free_nandfs_mountinfo(mp);
 
 	/*
 	 * Finally, throw away the null_mount structure
 	 */
 	mp->mnt_data = 0;
 	MNT_ILOCK(mp);
 	mp->mnt_flag &= ~MNT_LOCAL;
 	MNT_IUNLOCK(mp);
 
 	return (0);
 }
 
 /*
  * VFS statfs entry point.  Fills in block size, block counts and the
  * number of files in `sbp'.  The file count is derived from the group
  * descriptors in block 0 of the ifile: for each descriptor the number of
  * free entries is subtracted from the entries per group.  f_ffree is
  * always reported as 0.
  *
  * Returns 0 on success or the error from reading the ifile block.
  */
 static int
 nandfs_statfs(struct mount *mp, struct statfs *sbp)
 {
 	struct nandfsmount *nmp = mp->mnt_data;
 	struct nandfs_device *nandfsdev = nmp->nm_nandfsdev;
 	struct nandfs_fsdata *fsdata = &nandfsdev->nd_fsdata;
 	struct nandfs_super_block *sb = &nandfsdev->nd_super;
 	struct nandfs_mdt *mdt = &nandfsdev->nd_ifile_mdt;
 	struct nandfs_node *ifile = nmp->nm_ifile_node;
 	struct nandfs_block_group_desc *desc;
 	struct buf *bp;
 	uint64_t used;
 	uint32_t per_group;
 	int error, idx;
 
 	per_group = mdt->entries_per_group;
 
 	VOP_LOCK(NTOV(ifile), LK_SHARED);
 	error = nandfs_bread(ifile, 0, NOCRED, 0, &bp);
 	if (error != 0) {
 		brelse(bp);
 		VOP_UNLOCK(NTOV(ifile), 0);
 		return (error);
 	}
 
 	/* Sum the in-use entries over all group descriptors of the block. */
 	desc = (struct nandfs_block_group_desc *)bp->b_data;
 	used = 0;
 	for (idx = 0; idx < mdt->groups_per_desc_block; idx++)
 		used += (per_group - desc[idx].bg_nfrees);
 
 	brelse(bp);
 	VOP_UNLOCK(NTOV(ifile), 0);
 
 	sbp->f_bsize = nandfsdev->nd_blocksize;
 	sbp->f_iosize = sbp->f_bsize;
 	sbp->f_blocks = fsdata->f_blocks_per_segment * fsdata->f_nsegments;
 	sbp->f_bfree = sb->s_free_blocks_count;
 	sbp->f_bavail = sbp->f_bfree;
 	sbp->f_files = used;
 	sbp->f_ffree = 0;
 
 	return (0);
 }
 
 static int
 nandfs_root(struct mount *mp, int flags, struct vnode **vpp)
 {
 	struct nandfsmount *nmp = VFSTONANDFS(mp);
 	struct nandfs_node *node;
 	int error;
 
 	error = nandfs_get_node(nmp, NANDFS_ROOT_INO, &node);
 	if (error)
 		return (error);
 
 	KASSERT(NTOV(node)->v_vflag & VV_ROOT,
 	    ("root_vp->v_vflag & VV_ROOT"));
 
 	*vpp = NTOV(node);
 
 	return (error);
 }
 
 /*
  * VFS vget entry point: look up the node for inode number `ino' on `mp'
  * and, when a node is returned, store its vnode in *vpp.  Returns the
  * result of nandfs_get_node().
  */
 static int
 nandfs_vget(struct mount *mp, ino_t ino, int flags, struct vnode **vpp)
 {
 	struct nandfsmount *nmp = VFSTONANDFS(mp);
 	/*
 	 * Start from NULL so the test below never reads an indeterminate
 	 * pointer should nandfs_get_node() fail without storing a node.
 	 */
 	struct nandfs_node *node = NULL;
 	int error;
 
 	error = nandfs_get_node(nmp, ino, &node);
 	if (node != NULL)
 		*vpp = NTOV(node);
 
 	return (error);
 }
 
 /*
  * VFS sync entry point.  MNT_LAZY and MNT_SUSPEND requests are ignored;
  * every other request kicks the syncer with SYNCER_VFS_SYNC and waits for
  * it to finish.  Always returns 0.
  */
 static int
 nandfs_sync(struct mount *mp, int waitfor)
 {
 	struct nandfsmount *nmp = VFSTONANDFS(mp);
 
 	DPRINTF(SYNC, ("%s: mp %p waitfor %d\n", __func__, mp, waitfor));
 
 	/*
 	 * XXX: lazy and suspend syncs are skipped as a temporary hack.
 	 */
 	if (waitfor == MNT_LAZY || waitfor == MNT_SUSPEND)
 		return (0);
 
 	nandfs_wakeup_wait_sync(nmp->nm_nandfsdev, SYNCER_VFS_SYNC);
 	return (0);
 }
 
 /*
  * VFS operations vector for nandfs, grouped by life cycle: module
  * init/uninit, mount/unmount, then the per-mount operations.
  */
 static struct vfsops nandfs_vfsops = {
 	.vfs_init =		nandfs_init,
 	.vfs_uninit =		nandfs_uninit,
 	.vfs_mount =		nandfs_mount,
 	.vfs_unmount =		nandfs_unmount,
 	.vfs_root =		nandfs_root,
 	.vfs_vget =		nandfs_vget,
 	.vfs_statfs =		nandfs_statfs,
 	.vfs_sync =		nandfs_sync,
 };
 
 VFS_SET(nandfs_vfsops, nandfs, VFCF_LOOPBACK);
Index: head/sys/fs/nfsclient/nfs_clvnops.c
===================================================================
--- head/sys/fs/nfsclient/nfs_clvnops.c	(revision 328237)
+++ head/sys/fs/nfsclient/nfs_clvnops.c	(revision 328238)
@@ -1,3541 +1,3541 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Rick Macklem at The University of Guelph.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from nfs_vnops.c	8.16 (Berkeley) 5/27/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * vnode op calls for Sun NFS version 2, 3 and 4
  */
 
 #include "opt_inet.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/systm.h>
 #include <sys/resourcevar.h>
 #include <sys/proc.h>
 #include <sys/mount.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/jail.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/namei.h>
 #include <sys/socket.h>
 #include <sys/vnode.h>
 #include <sys/dirent.h>
 #include <sys/fcntl.h>
 #include <sys/lockf.h>
 #include <sys/stat.h>
 #include <sys/sysctl.h>
 #include <sys/signalvar.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_object.h>
 
 #include <fs/nfs/nfsport.h>
 #include <fs/nfsclient/nfsnode.h>
 #include <fs/nfsclient/nfsmount.h>
 #include <fs/nfsclient/nfs.h>
 #include <fs/nfsclient/nfs_kdtrace.h>
 
 #include <net/if.h>
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 
 #include <nfs/nfs_lock.h>
 
 #ifdef KDTRACE_HOOKS
 #include <sys/dtrace_bsd.h>
 
 dtrace_nfsclient_accesscache_flush_probe_func_t
 		dtrace_nfscl_accesscache_flush_done_probe;
 uint32_t	nfscl_accesscache_flush_done_id;
 
 dtrace_nfsclient_accesscache_get_probe_func_t
 		dtrace_nfscl_accesscache_get_hit_probe,
 		dtrace_nfscl_accesscache_get_miss_probe;
 uint32_t	nfscl_accesscache_get_hit_id;
 uint32_t	nfscl_accesscache_get_miss_id;
 
 dtrace_nfsclient_accesscache_load_probe_func_t
 		dtrace_nfscl_accesscache_load_done_probe;
 uint32_t	nfscl_accesscache_load_done_id;
 #endif /* KDTRACE_HOOKS */
 
 /* Defs */
 #define	TRUE	1
 #define	FALSE	0
 
 extern struct nfsstatsv1 nfsstatsv1;
 extern int nfsrv_useacl;
 extern int nfscl_debuglevel;
 MALLOC_DECLARE(M_NEWNFSREQ);
 
 static vop_read_t	nfsfifo_read;
 static vop_write_t	nfsfifo_write;
 static vop_close_t	nfsfifo_close;
 static int	nfs_setattrrpc(struct vnode *, struct vattr *, struct ucred *,
 		    struct thread *);
 static vop_lookup_t	nfs_lookup;
 static vop_create_t	nfs_create;
 static vop_mknod_t	nfs_mknod;
 static vop_open_t	nfs_open;
 static vop_pathconf_t	nfs_pathconf;
 static vop_close_t	nfs_close;
 static vop_access_t	nfs_access;
 static vop_getattr_t	nfs_getattr;
 static vop_setattr_t	nfs_setattr;
 static vop_read_t	nfs_read;
 static vop_fsync_t	nfs_fsync;
 static vop_remove_t	nfs_remove;
 static vop_link_t	nfs_link;
 static vop_rename_t	nfs_rename;
 static vop_mkdir_t	nfs_mkdir;
 static vop_rmdir_t	nfs_rmdir;
 static vop_symlink_t	nfs_symlink;
 static vop_readdir_t	nfs_readdir;
 static vop_strategy_t	nfs_strategy;
 static	int	nfs_lookitup(struct vnode *, char *, int,
 		    struct ucred *, struct thread *, struct nfsnode **);
 static	int	nfs_sillyrename(struct vnode *, struct vnode *,
 		    struct componentname *);
 static vop_access_t	nfsspec_access;
 static vop_readlink_t	nfs_readlink;
 static vop_print_t	nfs_print;
 static vop_advlock_t	nfs_advlock;
 static vop_advlockasync_t nfs_advlockasync;
 static vop_getacl_t nfs_getacl;
 static vop_setacl_t nfs_setacl;
 static vop_set_text_t nfs_set_text;
 
 /*
  * Global vfs data structures for nfs
  */
 /*
  * Vnode operations vector for NFS vnodes.  Entries prefixed nfs_ are
  * declared static in this file; the ncl_ entries are not declared here.
  * vop_default names default_vnodeops as the fallback vector (presumably
  * consulted for operations not listed below).
  */
 struct vop_vector newnfs_vnodeops = {
 	.vop_default =		&default_vnodeops,
 	.vop_access =		nfs_access,
 	.vop_advlock =		nfs_advlock,
 	.vop_advlockasync =	nfs_advlockasync,
 	.vop_close =		nfs_close,
 	.vop_create =		nfs_create,
 	.vop_fsync =		nfs_fsync,
 	.vop_getattr =		nfs_getattr,
 	.vop_getpages =		ncl_getpages,
 	.vop_putpages =		ncl_putpages,
 	.vop_inactive =		ncl_inactive,
 	.vop_link =		nfs_link,
 	.vop_lookup =		nfs_lookup,
 	.vop_mkdir =		nfs_mkdir,
 	.vop_mknod =		nfs_mknod,
 	.vop_open =		nfs_open,
 	.vop_pathconf =		nfs_pathconf,
 	.vop_print =		nfs_print,
 	.vop_read =		nfs_read,
 	.vop_readdir =		nfs_readdir,
 	.vop_readlink =		nfs_readlink,
 	.vop_reclaim =		ncl_reclaim,
 	.vop_remove =		nfs_remove,
 	.vop_rename =		nfs_rename,
 	.vop_rmdir =		nfs_rmdir,
 	.vop_setattr =		nfs_setattr,
 	.vop_strategy =		nfs_strategy,
 	.vop_symlink =		nfs_symlink,
 	.vop_write =		ncl_write,
 	.vop_getacl =		nfs_getacl,
 	.vop_setacl =		nfs_setacl,
 	.vop_set_text =		nfs_set_text,
 };
 
 /*
  * Vnode operations vector for fifos that live on an NFS mount.  Read,
  * write and close use the nfsfifo_ handlers and access uses
  * nfsspec_access; vop_default names fifo_specops as the fallback vector.
  */
 struct vop_vector newnfs_fifoops = {
 	.vop_default =		&fifo_specops,
 	.vop_access =		nfsspec_access,
 	.vop_close =		nfsfifo_close,
 	.vop_fsync =		nfs_fsync,
 	.vop_getattr =		nfs_getattr,
 	.vop_inactive =		ncl_inactive,
 	.vop_pathconf =		nfs_pathconf,
 	.vop_print =		nfs_print,
 	.vop_read =		nfsfifo_read,
 	.vop_reclaim =		ncl_reclaim,
 	.vop_setattr =		nfs_setattr,
 	.vop_write =		nfsfifo_write,
 };
 
 static int nfs_mknodrpc(struct vnode *dvp, struct vnode **vpp,
     struct componentname *cnp, struct vattr *vap);
 static int nfs_removerpc(struct vnode *dvp, struct vnode *vp, char *name,
     int namelen, struct ucred *cred, struct thread *td);
 static int nfs_renamerpc(struct vnode *fdvp, struct vnode *fvp,
     char *fnameptr, int fnamelen, struct vnode *tdvp, struct vnode *tvp,
     char *tnameptr, int tnamelen, struct ucred *cred, struct thread *td);
 static int nfs_renameit(struct vnode *sdvp, struct vnode *svp,
     struct componentname *scnp, struct sillyrename *sp);
 
 /*
  * Global variables
  */
 SYSCTL_DECL(_vfs_nfs);
 
 /* vfs.nfs.access_cache_timeout; initialised to NFS_MAXATTRTIMO */
 static int	nfsaccess_cache_timeout = NFS_MAXATTRTIMO;
 SYSCTL_INT(_vfs_nfs, OID_AUTO, access_cache_timeout, CTLFLAG_RW,
 	   &nfsaccess_cache_timeout, 0, "NFS ACCESS cache timeout");
 
 /* vfs.nfs.prime_access_cache; off (0) by default */
 static int	nfs_prime_access_cache = 0;
 SYSCTL_INT(_vfs_nfs, OID_AUTO, prime_access_cache, CTLFLAG_RW,
 	   &nfs_prime_access_cache, 0,
 	   "Prime NFS ACCESS cache when fetching attributes");
 
 /* vfs.nfs.commit_on_close; off (0) by default */
 static int	newnfs_commit_on_close = 0;
 SYSCTL_INT(_vfs_nfs, OID_AUTO, commit_on_close, CTLFLAG_RW,
     &newnfs_commit_on_close, 0, "write+commit on close, else only write");
 
 /* vfs.nfs.clean_pages_on_close; on (1) by default */
 static int	nfs_clean_pages_on_close = 1;
 SYSCTL_INT(_vfs_nfs, OID_AUTO, clean_pages_on_close, CTLFLAG_RW,
 	   &nfs_clean_pages_on_close, 0, "NFS clean dirty pages on close");
 
 /* vfs.nfs.nfs_directio_enable; off (0) by default, not static */
 int newnfs_directio_enable = 0;
 SYSCTL_INT(_vfs_nfs, OID_AUTO, nfs_directio_enable, CTLFLAG_RW,
 	   &newnfs_directio_enable, 0, "Enable NFS directio");
 
 /* vfs.nfs.nfs_keep_dirty_on_error; no initialiser, so it starts at 0 */
 int nfs_keep_dirty_on_error;
 SYSCTL_INT(_vfs_nfs, OID_AUTO, nfs_keep_dirty_on_error, CTLFLAG_RW,
     &nfs_keep_dirty_on_error, 0, "Retry pageout if error returned");
 
 /*
  * This sysctl allows other processes to mmap a file that has been opened
  * O_DIRECT by a process.  In general, having processes mmap the file while
  * Direct IO is in progress can lead to Data Inconsistencies.  But, we allow
  * this by default to prevent DoS attacks - to prevent a malicious user from
  * opening up files O_DIRECT preventing other users from mmap'ing these
  * files.  "Protected" environments where stricter consistency guarantees are
  * required can disable this knob.  The process that opened the file O_DIRECT
  * cannot mmap() the file, because mmap'ed IO on an O_DIRECT open() is not
  * meaningful.
  */
 int newnfs_directio_allow_mmap = 1;
 SYSCTL_INT(_vfs_nfs, OID_AUTO, nfs_directio_allow_mmap, CTLFLAG_RW,
 	   &newnfs_directio_allow_mmap, 0, "Enable mmaped IO on file with O_DIRECT opens");
 
 #define	NFSACCESS_ALL (NFSACCESS_READ | NFSACCESS_MODIFY		\
 			 | NFSACCESS_EXTEND | NFSACCESS_EXECUTE	\
 			 | NFSACCESS_DELETE | NFSACCESS_LOOKUP)
 
 /*
  * SMP Locking Note :
  * The list of locks after the description of the lock is the ordering
  * of other locks acquired with the lock held.
  * np->n_mtx : Protects the fields in the nfsnode.
  *	VM Object Lock
  *	VI_MTX (acquired indirectly)
  * nmp->nm_mtx : Protects the fields in the nfsmount.
  *	rep->r_mtx
  * ncl_iod_mutex : Global lock, protects shared nfsiod state.
  * nfs_reqq_mtx : Global lock, protects the nfs_reqq list.
  *	nmp->nm_mtx
  *	rep->r_mtx
  * rep->r_mtx : Protects the fields in an nfsreq.
  */
 
 /*
  * Perform an ACCESS RPC for `vp' with the requested mode bits `wmode' and
  * record the result in the node's access cache.
  *
  * On success the cache slot already holding cred's uid is refreshed; if
  * there is none, the slot with the oldest time stamp is reused.  The mode
  * returned by the RPC is also stored in *retmode when retmode is not
  * NULL.  On failure the error is passed through nfscl_maperr() for NFSv4
  * vnodes.  Returns 0 or the (possibly mapped) error.
  */
 static int
 nfs34_access_otw(struct vnode *vp, int wmode, struct thread *td,
     struct ucred *cred, u_int32_t *retmode)
 {
 	struct nfsnode *np = VTONFS(vp);
 	struct nfsvattr nfsva;
 	u_int32_t rmode;
 	int attrflag, error, slot, victim;
 
 	error = nfsrpc_accessrpc(vp, wmode, cred, td, &nfsva, &attrflag,
 	    &rmode, NULL);
 	if (attrflag)
 		(void) nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0, 1);
 
 	if (error == 0) {
 		victim = 0;
 		mtx_lock(&np->n_mtx);
 		/* Look for this uid, tracking the oldest slot on the way. */
 		for (slot = 0; slot < NFS_ACCESSCACHESIZE; slot++) {
 			if (np->n_accesscache[slot].uid == cred->cr_uid)
 				break;
 			if (slot > 0 && np->n_accesscache[slot].stamp <
 			    np->n_accesscache[victim].stamp)
 				victim = slot;
 		}
 		if (slot == NFS_ACCESSCACHESIZE) {
 			/* No entry for this uid: take over the oldest one. */
 			slot = victim;
 			np->n_accesscache[slot].uid = cred->cr_uid;
 		}
 		np->n_accesscache[slot].mode = rmode;
 		np->n_accesscache[slot].stamp = time_second;
 		mtx_unlock(&np->n_mtx);
 
 		if (retmode != NULL)
 			*retmode = rmode;
 		KDTRACE_NFS_ACCESSCACHE_LOAD_DONE(vp, cred->cr_uid, rmode, 0);
 	} else if (NFS_ISV4(vp)) {
 		error = nfscl_maperr(td, error, (uid_t)0, (gid_t)0);
 	}
 #ifdef KDTRACE_HOOKS
 	if (error != 0)
 		KDTRACE_NFS_ACCESSCACHE_LOAD_DONE(vp, cred->cr_uid, 0,
 		    error);
 #endif
 	return (error);
 }
 
 /*
  * nfs access vnode op.
  *
  * Write-type requests on a read-only mount are refused with EROFS for
  * regular files, directories and symlinks before anything else is tried.
  *
  * For NFSv3 and NFSv4 (NFS_ISV34) the requested accmode bits are translated
  * to NFSACCESS_* bits and checked against the per-uid access cache in the
  * nfsnode; on a miss an ACCESS RPC is done via nfs34_access_otw().  If file
  * modes are changed on the server, accesses might still fail later.
  *
  * Otherwise (NFSv2) the check is handed to nfsspec_access(); in addition,
  * for uid 0 asking for VREAD on a non-empty object, a small read/readdir/
  * readlink RPC is issued to see whether the server really allows it.
  *
  * Returns 0 if access is granted, EROFS, EACCES, or an error from the RPCs.
  */
 static int
 nfs_access(struct vop_access_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	int error = 0, i, gotahit;
 	u_int32_t mode, wmode, rmode;
 	int v34 = NFS_ISV34(vp);
 	struct nfsnode *np = VTONFS(vp);
 
 	/*
 	 * Disallow write attempts on filesystems mounted read-only;
 	 * unless the file is a socket, fifo, or a block or character
 	 * device resident on the filesystem.
 	 */
 	if ((ap->a_accmode & (VWRITE | VAPPEND | VWRITE_NAMED_ATTRS |
 	    VDELETE_CHILD | VWRITE_ATTRIBUTES | VDELETE | VWRITE_ACL |
 	    VWRITE_OWNER)) != 0 && (vp->v_mount->mnt_flag & MNT_RDONLY) != 0) {
 		switch (vp->v_type) {
 		case VREG:
 		case VDIR:
 		case VLNK:
 			return (EROFS);
 		default:
 			break;
 		}
 	}
 	/*
 	 * For nfs v3 or v4, check to see if we have done this recently, and if
 	 * so return our cached result instead of making an ACCESS call.
 	 * If not, do an access rpc, otherwise you are stuck emulating
 	 * ufs_access() locally using the vattr. This may not be correct,
 	 * since the server may apply other access criteria such as
 	 * client uid-->server uid mapping that we do not know about.
 	 */
 	if (v34) {
 		/*
 		 * Translate a_accmode into the NFSACCESS_* bits that must all
 		 * be granted ("mode").  VEXEC means EXECUTE on a non-directory
 		 * and LOOKUP on a directory; VDELETE_CHILD is only considered
 		 * for directories and maps to MODIFY.
 		 */
 		if (ap->a_accmode & VREAD)
 			mode = NFSACCESS_READ;
 		else
 			mode = 0;
 		if (vp->v_type != VDIR) {
 			if (ap->a_accmode & VWRITE)
 				mode |= (NFSACCESS_MODIFY | NFSACCESS_EXTEND);
 			if (ap->a_accmode & VAPPEND)
 				mode |= NFSACCESS_EXTEND;
 			if (ap->a_accmode & VEXEC)
 				mode |= NFSACCESS_EXECUTE;
 			if (ap->a_accmode & VDELETE)
 				mode |= NFSACCESS_DELETE;
 		} else {
 			if (ap->a_accmode & VWRITE)
 				mode |= (NFSACCESS_MODIFY | NFSACCESS_EXTEND);
 			if (ap->a_accmode & VAPPEND)
 				mode |= NFSACCESS_EXTEND;
 			if (ap->a_accmode & VEXEC)
 				mode |= NFSACCESS_LOOKUP;
 			if (ap->a_accmode & VDELETE)
 				mode |= NFSACCESS_DELETE;
 			if (ap->a_accmode & VDELETE_CHILD)
 				mode |= NFSACCESS_MODIFY;
 		}
 		/*
 		 * "wmode" is what is asked of the server on a cache miss.
 		 * With caching enabled every bit is requested so the reply
 		 * can satisfy later, different requests from the cache.
 		 */
 		/* XXX safety belt, only make blanket request if caching */
 		if (nfsaccess_cache_timeout > 0) {
 			wmode = NFSACCESS_READ | NFSACCESS_MODIFY |
 				NFSACCESS_EXTEND | NFSACCESS_EXECUTE |
 				NFSACCESS_DELETE | NFSACCESS_LOOKUP;
 		} else {
 			wmode = mode;
 		}
 
 		/*
 		 * Does our cached result allow us to give a definite yes to
 		 * this request?  Only the first entry matching the uid is
 		 * examined; it counts as a hit when it has not timed out and
 		 * grants every bit in "mode".
 		 */
 		gotahit = 0;
 		mtx_lock(&np->n_mtx);
 		for (i = 0; i < NFS_ACCESSCACHESIZE; i++) {
 			if (ap->a_cred->cr_uid == np->n_accesscache[i].uid) {
 			    if (time_second < (np->n_accesscache[i].stamp
 				+ nfsaccess_cache_timeout) &&
 				(np->n_accesscache[i].mode & mode) == mode) {
 				NFSINCRGLOBAL(nfsstatsv1.accesscache_hits);
 				gotahit = 1;
 			    }
 			    break;
 			}
 		}
 		mtx_unlock(&np->n_mtx);
 #ifdef KDTRACE_HOOKS
 		if (gotahit != 0)
 			KDTRACE_NFS_ACCESSCACHE_GET_HIT(vp,
 			    ap->a_cred->cr_uid, mode);
 		else
 			KDTRACE_NFS_ACCESSCACHE_GET_MISS(vp,
 			    ap->a_cred->cr_uid, mode);
 #endif
 		if (gotahit == 0) {
 			/*
 			 * Either a no, or a don't know.  Go to the wire.
 			 * The RPC succeeding is not enough: every requested
 			 * bit must be present in the server's answer.
 			 */
 			NFSINCRGLOBAL(nfsstatsv1.accesscache_misses);
 		        error = nfs34_access_otw(vp, wmode, ap->a_td,
 			    ap->a_cred, &rmode);
 			if (!error &&
 			    (rmode & mode) != mode)
 				error = EACCES;
 		}
 		return (error);
 	} else {
 		if ((error = nfsspec_access(ap)) != 0) {
 			return (error);
 		}
 		/*
 		 * Attempt to prevent a mapped root from accessing a file
 		 * which it shouldn't.  We try to read a byte from the file
 		 * if the user is root and the file is not zero length.
 		 * After calling nfsspec_access, we should have the correct
 		 * file size cached.
 		 */
 		mtx_lock(&np->n_mtx);
 		if (ap->a_cred->cr_uid == 0 && (ap->a_accmode & VREAD)
 		    && VTONFS(vp)->n_size > 0) {
 			struct iovec aiov;
 			struct uio auio;
 			char buf[1];
 
 			/* n_mtx only covered the n_size test; drop it for the RPC. */
 			mtx_unlock(&np->n_mtx);
 			/* Describe a one byte kernel-space read at offset 0. */
 			aiov.iov_base = buf;
 			aiov.iov_len = 1;
 			auio.uio_iov = &aiov;
 			auio.uio_iovcnt = 1;
 			auio.uio_offset = 0;
 			auio.uio_resid = 1;
 			auio.uio_segflg = UIO_SYSSPACE;
 			auio.uio_rw = UIO_READ;
 			auio.uio_td = ap->a_td;
 
 			if (vp->v_type == VREG)
 				error = ncl_readrpc(vp, &auio, ap->a_cred);
 			else if (vp->v_type == VDIR) {
 				/*
 				 * The directory read uses a temporary
 				 * NFS_DIRBLKSIZ buffer instead of buf[].
 				 */
 				char* bp;
 				bp = malloc(NFS_DIRBLKSIZ, M_TEMP, M_WAITOK);
 				aiov.iov_base = bp;
 				aiov.iov_len = auio.uio_resid = NFS_DIRBLKSIZ;
 				error = ncl_readdirrpc(vp, &auio, ap->a_cred,
 				    ap->a_td);
 				free(bp, M_TEMP);
 			} else if (vp->v_type == VLNK)
 				error = ncl_readlinkrpc(vp, &auio, ap->a_cred);
 			else
 				error = EACCES;
 		} else
 			mtx_unlock(&np->n_mtx);
 		return (error);
 	}
 }
 
 
 /*
  * nfs open vnode op
  * Check to see if the type is ok
  * and that deletion is not in progress.
  * For paged in text files, you will need to flush the page cache
  * if consistency is lost.
  *
  * Only VREG, VDIR and VLNK vnodes are accepted (EOPNOTSUPP otherwise).
  * For NFSv4 an Open is done on the server first; every later failure path
  * undoes it with nfsrpc_close().  The cached data is then revalidated
  * against freshly fetched attributes, O_DIRECT bookkeeping is updated and,
  * for write opens, the caller's credentials are saved in n_writecred.
  * Returns 0 on success or an errno value.
  */
 /* ARGSUSED */
 static int
 nfs_open(struct vop_open_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct nfsnode *np = VTONFS(vp);
 	struct vattr vattr;
 	int error;
 	int fmode = ap->a_mode;
 	struct ucred *cred;
 
 	if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK)
 		return (EOPNOTSUPP);
 
 	/*
 	 * For NFSv4, we need to do the Open Op before cache validation,
 	 * so that we conform to RFC3530 Sec. 9.3.1.
 	 */
 	if (NFS_ISV4(vp)) {
 		error = nfsrpc_open(vp, fmode, ap->a_cred, ap->a_td);
 		if (error) {
 			error = nfscl_maperr(ap->a_td, error, (uid_t)0,
 			    (gid_t)0);
 			return (error);
 		}
 	}
 
 	/*
 	 * Now, if this Open will be doing reading, re-validate/flush the
 	 * cache, so that Close/Open coherency is maintained.
 	 *
 	 * Both arms below leave n_mtx held when they fall through to the
 	 * O_DIRECT handling, and both fill in vattr via VOP_GETATTR().
 	 */
 	mtx_lock(&np->n_mtx);
 	if (np->n_flag & NMODIFIED) {
 		/*
 		 * Locally modified: flush and invalidate the buffers, drop
 		 * the cached attributes, then refetch them and record the
 		 * new mtime (and change attribute for NFSv4).
 		 */
 		mtx_unlock(&np->n_mtx);
 		error = ncl_vinvalbuf(vp, V_SAVE, ap->a_td, 1);
 		/* Only EINTR and EIO abort the open; other errors are ignored. */
 		if (error == EINTR || error == EIO) {
 			if (NFS_ISV4(vp))
 				(void) nfsrpc_close(vp, 0, ap->a_td);
 			return (error);
 		}
 		mtx_lock(&np->n_mtx);
 		np->n_attrstamp = 0;
 		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
 		if (vp->v_type == VDIR)
 			np->n_direofoffset = 0;
 		mtx_unlock(&np->n_mtx);
 		error = VOP_GETATTR(vp, &vattr, ap->a_cred);
 		if (error) {
 			if (NFS_ISV4(vp))
 				(void) nfsrpc_close(vp, 0, ap->a_td);
 			return (error);
 		}
 		mtx_lock(&np->n_mtx);
 		np->n_mtime = vattr.va_mtime;
 		if (NFS_ISV4(vp))
 			np->n_change = vattr.va_filerev;
 	} else {
 		/*
 		 * Not locally modified: fetch attributes and invalidate the
 		 * buffers only if the mtime (or, for NFSv4, the change
 		 * attribute) differs from what was recorded.
 		 */
 		mtx_unlock(&np->n_mtx);
 		error = VOP_GETATTR(vp, &vattr, ap->a_cred);
 		if (error) {
 			if (NFS_ISV4(vp))
 				(void) nfsrpc_close(vp, 0, ap->a_td);
 			return (error);
 		}
 		mtx_lock(&np->n_mtx);
 		if ((NFS_ISV4(vp) && np->n_change != vattr.va_filerev) ||
 		    NFS_TIMESPEC_COMPARE(&np->n_mtime, &vattr.va_mtime)) {
 			if (vp->v_type == VDIR)
 				np->n_direofoffset = 0;
 			mtx_unlock(&np->n_mtx);
 			error = ncl_vinvalbuf(vp, V_SAVE, ap->a_td, 1);
 			if (error == EINTR || error == EIO) {
 				if (NFS_ISV4(vp))
 					(void) nfsrpc_close(vp, 0, ap->a_td);
 				return (error);
 			}
 			mtx_lock(&np->n_mtx);
 			np->n_mtime = vattr.va_mtime;
 			if (NFS_ISV4(vp))
 				np->n_change = vattr.va_filerev;
 		}
 	}
 
 	/*
 	 * If the object has >= 1 O_DIRECT active opens, we disable caching.
 	 * The first such open flushes the buffers and sets NNONCACHE; every
 	 * one bumps n_directio_opens (undone in nfs_close()).
 	 */
 	if (newnfs_directio_enable && (fmode & O_DIRECT) &&
 	    (vp->v_type == VREG)) {
 		if (np->n_directio_opens == 0) {
 			mtx_unlock(&np->n_mtx);
 			error = ncl_vinvalbuf(vp, V_SAVE, ap->a_td, 1);
 			/* Unlike above, any error here fails the open. */
 			if (error) {
 				if (NFS_ISV4(vp))
 					(void) nfsrpc_close(vp, 0, ap->a_td);
 				return (error);
 			}
 			mtx_lock(&np->n_mtx);
 			np->n_flag |= NNONCACHE;
 		}
 		np->n_directio_opens++;
 	}
 
 	/* If opened for writing via NFSv4.1 or later, mark that for pNFS. */
 	if (NFSHASPNFS(VFSTONFS(vp->v_mount)) && (fmode & FWRITE) != 0)
 		np->n_flag |= NWRITEOPENED;
 
 	/*
 	 * If this is an open for writing, capture a reference to the
 	 * credentials, so they can be used by ncl_putpages(). Using
 	 * these write credentials is preferable to the credentials of
 	 * whatever thread happens to be doing the VOP_PUTPAGES() since
 	 * the write RPCs are less likely to fail with EACCES.
 	 *
 	 * The previous n_writecred is remembered in "cred" and released
 	 * only after n_mtx has been dropped.
 	 */
 	if ((fmode & FWRITE) != 0) {
 		cred = np->n_writecred;
 		np->n_writecred = crhold(ap->a_cred);
 	} else
 		cred = NULL;
 	mtx_unlock(&np->n_mtx);
 
 	if (cred != NULL)
 		crfree(cred);
 	/* vattr.va_size comes from the VOP_GETATTR() done in either arm above. */
 	vnode_create_vobject(vp, vattr.va_size, ap->a_td);
 	return (0);
 }
 
 /*
  * nfs close vnode op
  * What an NFS client should do upon close after writing is a debatable issue.
  * Most NFS clients push delayed writes to the server upon close, basically for
  * two reasons:
  * 1 - So that any write errors may be reported back to the client process
  *     doing the close system call. By far the two most likely errors are
  *     NFSERR_NOSPC and NFSERR_DQUOT to indicate space allocation failure.
  * 2 - To put a worst case upper bound on cache inconsistency between
  *     multiple clients for the file.
  * There is also a consistency problem for Version 2 of the protocol w.r.t.
  * not being able to tell if other clients are writing a file concurrently,
  * since there is no way of knowing if the changed modify time in the reply
  * is only due to the write for this client.
  * (NFS Version 3 provides weak cache consistency data in the reply that
  *  should be sufficient to detect and handle this case.)
  *
  * The current code does the following:
  * for NFS Version 2 - play it safe and flush/invalidate all dirty buffers
  * for NFS Version 3 - flush dirty buffers to the server but don't invalidate
  *                     or commit them (this satisfies 1 and 2 except for the
  *                     case where the server crashes after this close but
  *                     before the commit RPC, which is felt to be "good
  *                     enough". Changing the last argument to ncl_flush() to
  *                     a 1 would force a commit operation, if it is felt a
  *                     commit is necessary now.
  * for NFS Version 4 - flush the dirty buffers and commit them, if
  *		       nfscl_mustflush() says this is necessary.
  *                     It is necessary if there is no write delegation held,
  *                     in order to satisfy open/close coherency.
  *                     If the file isn't cached on local stable storage,
  *                     it may be necessary in order to detect "out of space"
  *                     errors from the server, if the write delegation
  *                     issued by the server doesn't allow the file to grow.
  */
 /*
  * Arguments come from *ap: a_vp is the vnode being closed, a_fflag the
  * open flags, a_cred the caller's credentials (NOCRED is replaced by
  * newnfs_getcred()) and a_td the thread.
  * Returns 0, the error from flushing dirty data, a previously recorded
  * write error (NWRITEERR), or the error from the NFSv4 Close.
  */
 /* ARGSUSED */
 static int
 nfs_close(struct vop_close_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct nfsnode *np = VTONFS(vp);
 	struct nfsvattr nfsva;
 	struct ucred *cred;
 	int error = 0, ret, localcred = 0;
 	int fmode = ap->a_fflag;
 
 	/*
 	 * NOTE(review): presumably a forced dismount is in progress when
 	 * NFSCL_FORCEDISM() is true; in that case nothing at all is done.
 	 */
 	if (NFSCL_FORCEDISM(vp->v_mount))
 		return (0);
 	/*
 	 * During shutdown, a_cred isn't valid, so just use root.
 	 * localcred records that the cred must be freed before returning.
 	 */
 	if (ap->a_cred == NOCRED) {
 		cred = newnfs_getcred();
 		localcred = 1;
 	} else {
 		cred = ap->a_cred;
 	}
 	if (vp->v_type == VREG) {
 	    /*
 	     * Examine and clean dirty pages, regardless of NMODIFIED.
 	     * This closes a major hole in close-to-open consistency.
 	     * We want to push out all dirty pages (and buffers) on
 	     * close, regardless of whether they were dirtied by
 	     * mmap'ed writes or via write().
 	     */
 	    if (nfs_clean_pages_on_close && vp->v_object) {
 		VM_OBJECT_WLOCK(vp->v_object);
 		vm_object_page_clean(vp->v_object, 0, 0, 0);
 		VM_OBJECT_WUNLOCK(vp->v_object);
 	    }
 	    mtx_lock(&np->n_mtx);
 	    if (np->n_flag & NMODIFIED) {
 		/* The flush below is done without n_mtx; it is retaken after. */
 		mtx_unlock(&np->n_mtx);
 		if (NFS_ISV3(vp)) {
 		    /*
 		     * Under NFSv3 we have dirty buffers to dispose of.  We
 		     * must flush them to the NFS server.  We have the option
 		     * of waiting all the way through the commit rpc or just
 		     * waiting for the initial write.  The default is to only
 		     * wait through the initial write so the data is in the
 		     * server's cache, which is roughly similar to the state
 		     * a standard disk subsystem leaves the file in on close().
 		     *
 		     * We cannot clear the NMODIFIED bit in np->n_flag due to
 		     * potential races with other processes, and certainly
 		     * cannot clear it if we don't commit.
 		     * These races occur when there is no longer the old
 		     * traditional vnode locking implemented for Vnode Ops.
 		     */
 		    int cm = newnfs_commit_on_close ? 1 : 0;
 		    error = ncl_flush(vp, MNT_WAIT, ap->a_td, cm, 0);
 		    /* np->n_flag &= ~NMODIFIED; */
 		} else if (NFS_ISV4(vp)) { 
 			/* NFSv4 only flushes when nfscl_mustflush() asks for it. */
 			if (nfscl_mustflush(vp) != 0) {
 				int cm = newnfs_commit_on_close ? 1 : 0;
 				error = ncl_flush(vp, MNT_WAIT, ap->a_td,
 				    cm, 0);
 				/*
 				 * as above w.r.t races when clearing
 				 * NMODIFIED.
 				 * np->n_flag &= ~NMODIFIED;
 				 */
 			}
 		} else {
 			/* NFSv2: write out and invalidate all buffers. */
 			error = ncl_vinvalbuf(vp, V_SAVE, ap->a_td, 1);
 		}
 		mtx_lock(&np->n_mtx);
 	    }
 	    /*
 	     * Invalidate the attribute cache in all cases.
 	     * An open is going to fetch fresh attrs any way, other procs
 	     * on this node that have file open will be forced to do an
 	     * otw attr fetch, but this is safe.
 	     * --> A user found that their RPC count dropped by 20% when
 	     *     this was commented out and I can't see any requirement
 	     *     for it, so I've disabled it when negative lookups are
 	     *     enabled. (What does this have to do with negative lookup
 	     *     caching? Well nothing, except it was reported by the
 	     *     same user that needed negative lookup caching and I wanted
 	     *     there to be a way to disable it to see if it
 	     *     is the cause of some caching/coherency issue that might
 	     *     crop up.)
 	     */
 	    if (VFSTONFS(vp->v_mount)->nm_negnametimeo == 0) {
 		    np->n_attrstamp = 0;
 		    KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
 	    }
 	    /*
 	     * A write error recorded earlier on the node is reported by this
 	     * close (replacing any flush error) and the flag is cleared.
 	     */
 	    if (np->n_flag & NWRITEERR) {
 		np->n_flag &= ~NWRITEERR;
 		error = np->n_error;
 	    }
 	    mtx_unlock(&np->n_mtx);
 	}
 
 	if (NFS_ISV4(vp)) {
 		/*
 		 * Get attributes so "change" is up to date.
 		 * Skipped after an error, for non-regular files, when no
 		 * flush was required, or on NFSMNT_NOCTO mounts.  A failed
 		 * Getattr is ignored.
 		 */
 		if (error == 0 && nfscl_mustflush(vp) != 0 &&
 		    vp->v_type == VREG &&
 		    (VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NOCTO) == 0) {
 			ret = nfsrpc_getattr(vp, cred, ap->a_td, &nfsva,
 			    NULL);
 			if (!ret) {
 				np->n_change = nfsva.na_filerev;
 				(void) nfscl_loadattrcache(&vp, &nfsva, NULL,
 				    NULL, 0, 0);
 			}
 		}
 
 		/*
 		 * and do the close.
 		 * The Close is always attempted; its error is reported only
 		 * if nothing failed earlier.
 		 */
 		ret = nfsrpc_close(vp, 0, ap->a_td);
 		if (!error && ret)
 			error = ret;
 		if (error)
 			error = nfscl_maperr(ap->a_td, error, (uid_t)0,
 			    (gid_t)0);
 	}
 	if (newnfs_directio_enable)
 		KASSERT((np->n_directio_asyncwr == 0),
 			("nfs_close: dirty unflushed (%d) directio buffers\n",
 			 np->n_directio_asyncwr));
 	/*
 	 * Undo the O_DIRECT accounting done in nfs_open(): the last
 	 * O_DIRECT close clears NNONCACHE again.
 	 */
 	if (newnfs_directio_enable && (fmode & O_DIRECT) && (vp->v_type == VREG)) {
 		mtx_lock(&np->n_mtx);
 		KASSERT((np->n_directio_opens > 0), 
 			("nfs_close: unexpectedly value (0) of n_directio_opens\n"));
 		np->n_directio_opens--;
 		if (np->n_directio_opens == 0)
 			np->n_flag &= ~NNONCACHE;
 		mtx_unlock(&np->n_mtx);
 	}
 	if (localcred)
 		NFSFREECRED(cred);
 	return (error);
 }
 
 /*
  * nfs getattr call from vfs.
  */
 static int
 nfs_getattr(struct vop_getattr_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct thread *td = curthread;	/* XXX */
 	struct nfsnode *np = VTONFS(vp);
 	int error = 0;
 	struct nfsvattr nfsva;
 	struct vattr *vap = ap->a_vap;
 	struct vattr vattr;
 
 	/*
 	 * Update local times for special files.
 	 */
 	mtx_lock(&np->n_mtx);
 	if (np->n_flag & (NACC | NUPD))
 		np->n_flag |= NCHG;
 	mtx_unlock(&np->n_mtx);
 	/*
 	 * First look in the cache.
 	 */
 	if (ncl_getattrcache(vp, &vattr) == 0) {
 		vap->va_type = vattr.va_type;
 		vap->va_mode = vattr.va_mode;
 		vap->va_nlink = vattr.va_nlink;
 		vap->va_uid = vattr.va_uid;
 		vap->va_gid = vattr.va_gid;
 		vap->va_fsid = vattr.va_fsid;
 		vap->va_fileid = vattr.va_fileid;
 		vap->va_size = vattr.va_size;
 		vap->va_blocksize = vattr.va_blocksize;
 		vap->va_atime = vattr.va_atime;
 		vap->va_mtime = vattr.va_mtime;
 		vap->va_ctime = vattr.va_ctime;
 		vap->va_gen = vattr.va_gen;
 		vap->va_flags = vattr.va_flags;
 		vap->va_rdev = vattr.va_rdev;
 		vap->va_bytes = vattr.va_bytes;
 		vap->va_filerev = vattr.va_filerev;
 		/*
 		 * Get the local modify time for the case of a write
 		 * delegation.
 		 */
 		nfscl_deleggetmodtime(vp, &vap->va_mtime);
 		return (0);
 	}
 
 	if (NFS_ISV34(vp) && nfs_prime_access_cache &&
 	    nfsaccess_cache_timeout > 0) {
 		NFSINCRGLOBAL(nfsstatsv1.accesscache_misses);
 		nfs34_access_otw(vp, NFSACCESS_ALL, td, ap->a_cred, NULL);
 		if (ncl_getattrcache(vp, ap->a_vap) == 0) {
 			nfscl_deleggetmodtime(vp, &ap->a_vap->va_mtime);
 			return (0);
 		}
 	}
 	error = nfsrpc_getattr(vp, ap->a_cred, td, &nfsva, NULL);
 	if (!error)
 		error = nfscl_loadattrcache(&vp, &nfsva, vap, NULL, 0, 0);
 	if (!error) {
 		/*
 		 * Get the local modify time for the case of a write
 		 * delegation.
 		 */
 		nfscl_deleggetmodtime(vp, &vap->va_mtime);
 	} else if (NFS_ISV4(vp)) {
 		error = nfscl_maperr(td, error, (uid_t)0, (gid_t)0);
 	}
 	return (error);
 }
 
 /*
  * nfs setattr call.
  *
  * Validates the request, performs the local work a size change needs
  * (ncl_meta_setsize(), flushing/invalidating buffers), then sends the
  * Setattr via nfs_setattrrpc().  If the RPC fails after a size change the
  * old size (tsize) is restored in the nfsnode and the VM object.
  * Returns 0 or an errno value (EOPNOTSUPP, EROFS, EISDIR, or an error from
  * the buffer flush or the RPC).
  */
 static int
 nfs_setattr(struct vop_setattr_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct nfsnode *np = VTONFS(vp);
 	struct thread *td = curthread;	/* XXX */
 	struct vattr *vap = ap->a_vap;
 	int error = 0;
 	u_quad_t tsize;
 
 	/*
 	 * tsize is only meaningful on the size-change path; the dummy
 	 * store is skipped when "nolint" is defined.
 	 */
 #ifndef nolint
 	tsize = (u_quad_t)0;
 #endif
 
 	/*
 	 * Setting of flags and marking of atimes are not supported.
 	 * NOTE(review): only va_flags is actually rejected here.
 	 */
 	if (vap->va_flags != VNOVAL)
 		return (EOPNOTSUPP);
 
 	/*
 	 * Disallow write attempts if the filesystem is mounted read-only.
 	 * (The va_flags term can no longer be true after the check above.)
 	 */
   	if ((vap->va_flags != VNOVAL || vap->va_uid != (uid_t)VNOVAL ||
 	    vap->va_gid != (gid_t)VNOVAL || vap->va_atime.tv_sec != VNOVAL ||
 	    vap->va_mtime.tv_sec != VNOVAL || vap->va_mode != (mode_t)VNOVAL) &&
 	    (vp->v_mount->mnt_flag & MNT_RDONLY))
 		return (EROFS);
 	if (vap->va_size != VNOVAL) {
  		switch (vp->v_type) {
  		case VDIR:
  			return (EISDIR);
  		case VCHR:
  		case VBLK:
  		case VSOCK:
  		case VFIFO:
 			/*
 			 * A size has no meaning for these types.  If nothing
 			 * else was requested there is nothing to do;
 			 * otherwise drop the size and carry on.
 			 */
 			if (vap->va_mtime.tv_sec == VNOVAL &&
 			    vap->va_atime.tv_sec == VNOVAL &&
 			    vap->va_mode == (mode_t)VNOVAL &&
 			    vap->va_uid == (uid_t)VNOVAL &&
 			    vap->va_gid == (gid_t)VNOVAL)
 				return (0);		
  			vap->va_size = VNOVAL;
  			break;
  		default:
 			/*
 			 * Disallow write attempts if the filesystem is
 			 * mounted read-only.
 			 */
 			if (vp->v_mount->mnt_flag & MNT_RDONLY)
 				return (EROFS);
 			/*
 			 *  We run vnode_pager_setsize() early (why?),
 			 * we must set np->n_size now to avoid vinvalbuf
 			 * V_SAVE races that might setsize a lower
 			 * value.
 			 */
 			/* Remember the current size so it can be restored. */
 			mtx_lock(&np->n_mtx);
 			tsize = np->n_size;
 			mtx_unlock(&np->n_mtx);
 			/*
 			 * NOTE(review): this return value is overwritten
 			 * below without being examined.
 			 */
 			error = ncl_meta_setsize(vp, ap->a_cred, td,
 			    vap->va_size);
 			mtx_lock(&np->n_mtx);
  			if (np->n_flag & NMODIFIED) {
 			    tsize = np->n_size;
 			    mtx_unlock(&np->n_mtx);
 			    /*
 			     * Truncating to zero discards dirty buffers;
 			     * any other size writes them out first.
 			     */
 			    error = ncl_vinvalbuf(vp, vap->va_size == 0 ?
 			        0 : V_SAVE, td, 1);
 			    if (error != 0) {
 				    vnode_pager_setsize(vp, tsize);
 				    return (error);
 			    }
 			    /*
 			     * Call nfscl_delegmodtime() to set the modify time
 			     * locally, as required.
 			     */
 			    nfscl_delegmodtime(vp);
  			} else
 			    mtx_unlock(&np->n_mtx);
 			/*
 			 * np->n_size has already been set to vap->va_size
 			 * in ncl_meta_setsize(). We must set it again since
 			 * nfs_loadattrcache() could be called through
 			 * ncl_meta_setsize() and could modify np->n_size.
 			 */
 			mtx_lock(&np->n_mtx);
  			np->n_vattr.na_size = np->n_size = vap->va_size;
 			mtx_unlock(&np->n_mtx);
   		}
   	} else {
 		/*
 		 * No size change.  When times are being set on a locally
 		 * modified regular file, push the dirty buffers out first;
 		 * only EINTR and EIO from that flush abort the operation.
 		 */
 		mtx_lock(&np->n_mtx);
 		if ((vap->va_mtime.tv_sec != VNOVAL || vap->va_atime.tv_sec != VNOVAL) && 
 		    (np->n_flag & NMODIFIED) && vp->v_type == VREG) {
 			mtx_unlock(&np->n_mtx);
 			error = ncl_vinvalbuf(vp, V_SAVE, td, 1);
 			if (error == EINTR || error == EIO)
 				return (error);
 		} else
 			mtx_unlock(&np->n_mtx);
 	}
 	error = nfs_setattrrpc(vp, vap, ap->a_cred, td);
 	/* The RPC failed after a size change: put the old size back. */
 	if (error && vap->va_size != VNOVAL) {
 		mtx_lock(&np->n_mtx);
 		np->n_size = np->n_vattr.na_size = tsize;
 		vnode_pager_setsize(vp, tsize);
 		mtx_unlock(&np->n_mtx);
 	}
 	return (error);
 }
 
 /*
  * Do an nfs setattr rpc.
  */
 static int
 nfs_setattrrpc(struct vnode *vp, struct vattr *vap, struct ucred *cred,
     struct thread *td)
 {
 	struct nfsnode *np = VTONFS(vp);
 	int error, ret, attrflag, i;
 	struct nfsvattr nfsva;
 
 	if (NFS_ISV34(vp)) {
 		mtx_lock(&np->n_mtx);
 		for (i = 0; i < NFS_ACCESSCACHESIZE; i++)
 			np->n_accesscache[i].stamp = 0;
 		np->n_flag |= NDELEGMOD;
 		mtx_unlock(&np->n_mtx);
 		KDTRACE_NFS_ACCESSCACHE_FLUSH_DONE(vp);
 	}
 	error = nfsrpc_setattr(vp, vap, NULL, cred, td, &nfsva, &attrflag,
 	    NULL);
 	if (attrflag) {
 		ret = nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0, 1);
 		if (ret && !error)
 			error = ret;
 	}
 	if (error && NFS_ISV4(vp))
 		error = nfscl_maperr(td, error, vap->va_uid, vap->va_gid);
 	return (error);
 }
 
 /*
  * nfs lookup call, one step at a time...
  * First look in cache
  * If not found, unlock the directory nfsnode and do the rpc
  *
  * ap->a_dvp is the directory, ap->a_cnp the component to look up and
  * *ap->a_vpp receives the resulting vnode (NULLVP on failure).
  * Returns 0 on success, ENOENT if the name does not exist, EJUSTRETURN
  * when the last component of a CREATE/RENAME is missing, or another errno
  * value (EROFS, ENOTDIR, EISDIR, access or RPC errors).
  */
 static int
 nfs_lookup(struct vop_lookup_args *ap)
 {
 	struct componentname *cnp = ap->a_cnp;
 	struct vnode *dvp = ap->a_dvp;
 	struct vnode **vpp = ap->a_vpp;
 	struct mount *mp = dvp->v_mount;
 	int flags = cnp->cn_flags;
 	struct vnode *newvp;
 	struct nfsmount *nmp;
 	struct nfsnode *np, *newnp;
 	int error = 0, attrflag, dattrflag, ltype, ncticks;
 	struct thread *td = cnp->cn_thread;
 	struct nfsfh *nfhp;
 	struct nfsvattr dnfsva, nfsva;
 	struct vattr vattr;
 	struct timespec nctime;
 	
 	*vpp = NULLVP;
 	/* Removing or renaming the last component needs a writable mount. */
 	if ((flags & ISLASTCN) && (mp->mnt_flag & MNT_RDONLY) &&
 	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
 		return (EROFS);
 	if (dvp->v_type != VDIR)
 		return (ENOTDIR);
 	nmp = VFSTONFS(mp);
 	np = VTONFS(dvp);
 
 	/* For NFSv4, wait until any remove is done. */
 	mtx_lock(&np->n_mtx);
 	while (NFSHASNFSV4(nmp) && (np->n_flag & NREMOVEINPROG)) {
 		np->n_flag |= NREMOVEWANT;
 		(void) msleep((caddr_t)np, &np->n_mtx, PZERO, "nfslkup", 0);
 	}
 	mtx_unlock(&np->n_mtx);
 
 	if ((error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, td)) != 0)
 		return (error);
 	/*
 	 * Consult the name cache.  As handled below: -1 is a positive hit
 	 * (*vpp set), ENOENT a negative hit, any other positive value a hard
 	 * error, and everything else falls through to the LOOKUP RPC.
 	 */
 	error = cache_lookup(dvp, vpp, cnp, &nctime, &ncticks);
 	if (error > 0 && error != ENOENT)
 		return (error);
 	if (error == -1) {
 		/*
 		 * Lookups of "." are special and always return the
 		 * current directory.  cache_lookup() already handles
 		 * associated locking bookkeeping, etc.
 		 */
 		if (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') {
 			/* XXX: Is this really correct? */
 			if (cnp->cn_nameiop != LOOKUP &&
 			    (flags & ISLASTCN))
 				cnp->cn_flags |= SAVENAME;
 			return (0);
 		}
 
 		/*
 		 * We only accept a positive hit in the cache if the
 		 * change time of the file matches our cached copy.
 		 * Otherwise, we discard the cache entry and fallback
 		 * to doing a lookup RPC.  We also only trust cache
 		 * entries for less than nm_nametimeo seconds.
 		 *
 		 * To better handle stale file handles and attributes,
 		 * clear the attribute cache of this node if it is a
 		 * leaf component, part of an open() call, and not
 		 * locally modified before fetching the attributes.
 		 * This should allow stale file handles to be detected
 		 * here where we can fall back to a LOOKUP RPC to
 		 * recover rather than having nfs_open() detect the
 		 * stale file handle and failing open(2) with ESTALE.
 		 */
 		newvp = *vpp;
 		newnp = VTONFS(newvp);
 		if (!(nmp->nm_flag & NFSMNT_NOCTO) &&
 		    (flags & (ISLASTCN | ISOPEN)) == (ISLASTCN | ISOPEN) &&
 		    !(newnp->n_flag & NMODIFIED)) {
 			mtx_lock(&newnp->n_mtx);
 			newnp->n_attrstamp = 0;
 			KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(newvp);
 			mtx_unlock(&newnp->n_mtx);
 		}
 		/*
 		 * The hit is accepted outright when nfscl_nodeleg() returns
 		 * 0; otherwise the entry must be young enough and the
 		 * file's current ctime must equal the one stored with it.
 		 */
 		if (nfscl_nodeleg(newvp, 0) == 0 ||
 		    ((u_int)(ticks - ncticks) < (nmp->nm_nametimeo * hz) &&
 		    VOP_GETATTR(newvp, &vattr, cnp->cn_cred) == 0 &&
 		    timespeccmp(&vattr.va_ctime, &nctime, ==))) {
 			NFSINCRGLOBAL(nfsstatsv1.lookupcache_hits);
 			if (cnp->cn_nameiop != LOOKUP &&
 			    (flags & ISLASTCN))
 				cnp->cn_flags |= SAVENAME;
 			return (0);
 		}
 		/* Stale entry: purge it and release what cache_lookup() gave us. */
 		cache_purge(newvp);
 		if (dvp != newvp)
 			vput(newvp);
 		else 
 			vrele(newvp);
 		*vpp = NULLVP;
 	} else if (error == ENOENT) {
 		if (dvp->v_iflag & VI_DOOMED)
 			return (ENOENT);
 		/*
 		 * We only accept a negative hit in the cache if the
 		 * modification time of the parent directory matches
 		 * the cached copy in the name cache entry.
 		 * Otherwise, we discard all of the negative cache
 		 * entries for this directory.  We also only trust
 		 * negative cache entries for up to nm_negnametimeo
 		 * seconds.
 		 */
 		if ((u_int)(ticks - ncticks) < (nmp->nm_negnametimeo * hz) &&
 		    VOP_GETATTR(dvp, &vattr, cnp->cn_cred) == 0 &&
 		    timespeccmp(&vattr.va_mtime, &nctime, ==)) {
 			NFSINCRGLOBAL(nfsstatsv1.lookupcache_hits);
 			return (ENOENT);
 		}
 		cache_purge_negative(dvp);
 	}
 
 	/* Name cache could not answer: go to the server. */
 	error = 0;
 	newvp = NULLVP;
 	NFSINCRGLOBAL(nfsstatsv1.lookupcache_misses);
 	error = nfsrpc_lookup(dvp, cnp->cn_nameptr, cnp->cn_namelen,
 	    cnp->cn_cred, td, &dnfsva, &nfsva, &nfhp, &attrflag, &dattrflag,
 	    NULL);
 	if (dattrflag)
 		(void) nfscl_loadattrcache(&dvp, &dnfsva, NULL, NULL, 0, 1);
 	if (error) {
 		/*
 		 * NOTE(review): newvp was set to NULLVP just above and is
 		 * not passed to nfsrpc_lookup(), so this branch cannot be
 		 * taken.
 		 */
 		if (newvp != NULLVP) {
 			vput(newvp);
 			*vpp = NULLVP;
 		}
 
 		if (error != ENOENT) {
 			if (NFS_ISV4(dvp))
 				error = nfscl_maperr(td, error, (uid_t)0,
 				    (gid_t)0);
 			return (error);
 		}
 
 		/* The requested file was not found. */
 		if ((cnp->cn_nameiop == CREATE || cnp->cn_nameiop == RENAME) &&
 		    (flags & ISLASTCN)) {
 			/*
 			 * XXX: UFS does a full VOP_ACCESS(dvp,
 			 * VWRITE) here instead of just checking
 			 * MNT_RDONLY.
 			 */
 			if (mp->mnt_flag & MNT_RDONLY)
 				return (EROFS);
 			cnp->cn_flags |= SAVENAME;
 			return (EJUSTRETURN);
 		}
 
 		if ((cnp->cn_flags & MAKEENTRY) != 0 && dattrflag) {
 			/*
 			 * Cache the modification time of the parent
 			 * directory from the post-op attributes in
 			 * the name cache entry.  The negative cache
 			 * entry will be ignored once the directory
 			 * has changed.  Don't bother adding the entry
 			 * if the directory has already changed.
 			 */
 			mtx_lock(&np->n_mtx);
 			if (timespeccmp(&np->n_vattr.na_mtime,
 			    &dnfsva.na_mtime, ==)) {
 				mtx_unlock(&np->n_mtx);
 				cache_enter_time(dvp, NULL, cnp,
 				    &dnfsva.na_mtime, NULL);
 			} else
 				mtx_unlock(&np->n_mtx);
 		}
 		return (ENOENT);
 	}
 
 	/*
 	 * Handle RENAME case...
 	 * The target may not be the directory itself (EISDIR).  The new
 	 * vnode is obtained exclusively locked and is not entered into the
 	 * name cache.  From here on "np" refers to the looked-up node.
 	 */
 	if (cnp->cn_nameiop == RENAME && (flags & ISLASTCN)) {
 		if (NFS_CMPFH(np, nfhp->nfh_fh, nfhp->nfh_len)) {
 			FREE((caddr_t)nfhp, M_NFSFH);
 			return (EISDIR);
 		}
 		error = nfscl_nget(mp, dvp, nfhp, cnp, td, &np, NULL,
 		    LK_EXCLUSIVE);
 		if (error)
 			return (error);
 		newvp = NFSTOV(np);
 		if (attrflag)
 			(void) nfscl_loadattrcache(&newvp, &nfsva, NULL, NULL,
 			    0, 1);
 		*vpp = newvp;
 		cnp->cn_flags |= SAVENAME;
 		return (0);
 	}
 
 	if (flags & ISDOTDOT) {
 		/*
 		 * "..": dvp is unlocked while the parent vnode is obtained.
 		 * The mount is busied first; if that cannot be done without
 		 * waiting, a reference is held on the mount while dvp is
 		 * unlocked for the blocking vfs_busy().
 		 */
 		ltype = NFSVOPISLOCKED(dvp);
 		error = vfs_busy(mp, MBF_NOWAIT);
 		if (error != 0) {
 			vfs_ref(mp);
 			NFSVOPUNLOCK(dvp, 0);
 			error = vfs_busy(mp, 0);
 			NFSVOPLOCK(dvp, ltype | LK_RETRY);
 			vfs_rel(mp);
 			if (error == 0 && (dvp->v_iflag & VI_DOOMED)) {
 				vfs_unbusy(mp);
 				error = ENOENT;
 			}
 			if (error != 0)
 				return (error);
 		}
 		NFSVOPUNLOCK(dvp, 0);
 		error = nfscl_nget(mp, dvp, nfhp, cnp, td, &np, NULL,
 		    cnp->cn_lkflags);
 		if (error == 0)
 			newvp = NFSTOV(np);
 		vfs_unbusy(mp);
 		/* Relock dvp with its original lock type. */
 		if (newvp != dvp)
 			NFSVOPLOCK(dvp, ltype | LK_RETRY);
 		/* dvp may have been reclaimed while it was unlocked. */
 		if (dvp->v_iflag & VI_DOOMED) {
 			if (error == 0) {
 				if (newvp == dvp)
 					vrele(newvp);
 				else
 					vput(newvp);
 			}
 			error = ENOENT;
 		}
 		if (error != 0)
 			return (error);
 		if (attrflag)
 			(void) nfscl_loadattrcache(&newvp, &nfsva, NULL, NULL,
 			    0, 1);
 	} else if (NFS_CMPFH(np, nfhp->nfh_fh, nfhp->nfh_len)) {
 		/*
 		 * The server returned the directory's own file handle:
 		 * return dvp itself with an extra reference.
 		 */
 		FREE((caddr_t)nfhp, M_NFSFH);
 		VREF(dvp);
 		newvp = dvp;
 		if (attrflag)
 			(void) nfscl_loadattrcache(&newvp, &nfsva, NULL, NULL,
 			    0, 1);
 	} else {
 		/* Ordinary case: get the vnode for the returned handle. */
 		error = nfscl_nget(mp, dvp, nfhp, cnp, td, &np, NULL,
 		    cnp->cn_lkflags);
 		if (error)
 			return (error);
 		newvp = NFSTOV(np);
 		if (attrflag)
 			(void) nfscl_loadattrcache(&newvp, &nfsva, NULL, NULL,
 			    0, 1);
 		else if ((flags & (ISLASTCN | ISOPEN)) == (ISLASTCN | ISOPEN) &&
 		    !(np->n_flag & NMODIFIED)) {			
 			/*
 			 * Flush the attribute cache when opening a
 			 * leaf node to ensure that fresh attributes
 			 * are fetched in nfs_open() since we did not
 			 * fetch attributes from the LOOKUP reply.
 			 */
 			mtx_lock(&np->n_mtx);
 			np->n_attrstamp = 0;
 			KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(newvp);
 			mtx_unlock(&np->n_mtx);
 		}
 	}
 	if (cnp->cn_nameiop != LOOKUP && (flags & ISLASTCN))
 		cnp->cn_flags |= SAVENAME;
 	/*
 	 * Enter the result in the name cache, stamped with the file's ctime
 	 * (and the directory's ctime when the result is a directory).  This
 	 * is skipped for the last component of a DELETE and whenever the
 	 * needed attributes were not returned.
 	 */
 	if ((cnp->cn_flags & MAKEENTRY) &&
 	    (cnp->cn_nameiop != DELETE || !(flags & ISLASTCN)) &&
 	    attrflag != 0 && (newvp->v_type != VDIR || dattrflag != 0))
 		cache_enter_time(dvp, newvp, cnp, &nfsva.na_ctime,
 		    newvp->v_type != VDIR ? NULL : &dnfsva.na_ctime);
 	*vpp = newvp;
 	return (0);
 }
 
 /*
  * nfs read vnode op.
  * Regular files are read through ncl_bioread(); a directory yields
  * EISDIR and every other vnode type EOPNOTSUPP.
  */
 static int
 nfs_read(struct vop_read_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 
 	if (vp->v_type == VREG)
 		return (ncl_bioread(vp, ap->a_uio, ap->a_ioflag, ap->a_cred));
 
 	/* Not a regular file: pick the error by vnode type. */
 	return (vp->v_type == VDIR ? EISDIR : EOPNOTSUPP);
 }
 
 /*
  * nfs readlink vnode op.
  * Symbolic links are read through ncl_bioread() with no I/O flags;
  * anything that is not a VLNK gets EINVAL.
  */
 static int
 nfs_readlink(struct vop_readlink_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 
 	if (vp->v_type == VLNK)
 		return (ncl_bioread(vp, ap->a_uio, 0, ap->a_cred));
 	return (EINVAL);
 }
 
 /*
  * Do a readlink rpc.
  * Called by ncl_doio() from below the buffer cache.
  */
 int
 ncl_readlinkrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred)
 {
 	int error, ret, attrflag;
 	struct nfsvattr nfsva;
 
 	error = nfsrpc_readlink(vp, uiop, cred, uiop->uio_td, &nfsva,
 	    &attrflag, NULL);
 	if (attrflag) {
 		ret = nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0, 1);
 		if (ret && !error)
 			error = ret;
 	}
 	if (error && NFS_ISV4(vp))
 		error = nfscl_maperr(uiop->uio_td, error, (uid_t)0, (gid_t)0);
 	return (error);
 }
 
 /*
  * nfs read rpc call
  * Ditto above
  */
 int
 ncl_readrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred)
 {
 	int error, ret, attrflag;
 	struct nfsvattr nfsva;
 	struct nfsmount *nmp;
 
 	nmp = VFSTONFS(vnode_mount(vp));
 	error = EIO;
 	attrflag = 0;
 	if (NFSHASPNFS(nmp))
 		error = nfscl_doiods(vp, uiop, NULL, NULL,
 		    NFSV4OPEN_ACCESSREAD, 0, cred, uiop->uio_td);
 	NFSCL_DEBUG(4, "readrpc: aft doiods=%d\n", error);
 	if (error != 0)
 		error = nfsrpc_read(vp, uiop, cred, uiop->uio_td, &nfsva,
 		    &attrflag, NULL);
 	if (attrflag) {
 		ret = nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0, 1);
 		if (ret && !error)
 			error = ret;
 	}
 	if (error && NFS_ISV4(vp))
 		error = nfscl_maperr(uiop->uio_td, error, (uid_t)0, (gid_t)0);
 	return (error);
 }
 
 /*
  * nfs write call
  */
 int
 ncl_writerpc(struct vnode *vp, struct uio *uiop, struct ucred *cred,
     int *iomode, int *must_commit, int called_from_strategy)
 {
 	struct nfsvattr nfsva;
 	int error, attrflag, ret;
 	struct nfsmount *nmp;
 
 	nmp = VFSTONFS(vnode_mount(vp));
 	error = EIO;
 	attrflag = 0;
 	if (NFSHASPNFS(nmp))
 		error = nfscl_doiods(vp, uiop, iomode, must_commit,
 		    NFSV4OPEN_ACCESSWRITE, 0, cred, uiop->uio_td);
 	NFSCL_DEBUG(4, "writerpc: aft doiods=%d\n", error);
 	if (error != 0)
 		error = nfsrpc_write(vp, uiop, iomode, must_commit, cred,
 		    uiop->uio_td, &nfsva, &attrflag, NULL,
 		    called_from_strategy);
 	if (attrflag) {
 		if (VTONFS(vp)->n_flag & ND_NFSV4)
 			ret = nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 1,
 			    1);
 		else
 			ret = nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0,
 			    1);
 		if (ret && !error)
 			error = ret;
 	}
 	if (DOINGASYNC(vp))
 		*iomode = NFSWRITE_FILESYNC;
 	if (error && NFS_ISV4(vp))
 		error = nfscl_maperr(uiop->uio_td, error, (uid_t)0, (gid_t)0);
 	return (error);
 }
 
 /*
  * nfs mknod rpc
  * For NFS v2 this is a kludge. Use a create rpc but with the IFMT bits of the
  * mode set to specify the file type and the size field for rdev.
  */
 static int
 nfs_mknodrpc(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
     struct vattr *vap)
 {
 	struct nfsvattr nfsva, dnfsva;
 	struct vnode *newvp = NULL;
 	struct nfsnode *np = NULL, *dnp;
 	struct nfsfh *nfhp;
 	struct vattr vattr;
 	int error = 0, attrflag, dattrflag;
 	u_int32_t rdev;
 
 	/*
 	 * Choose the rdev value to send: the caller's va_rdev for character
 	 * and block devices, all ones for fifos and sockets.  Every other
 	 * file type is rejected with EOPNOTSUPP.
 	 */
 	if (vap->va_type == VCHR || vap->va_type == VBLK)
 		rdev = vap->va_rdev;
 	else if (vap->va_type == VFIFO || vap->va_type == VSOCK)
 		rdev = 0xffffffff;
 	else
 		return (EOPNOTSUPP);
 	/* Only the return value is used; vattr itself is not examined. */
 	if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred)))
 		return (error);
 	error = nfsrpc_mknod(dvp, cnp->cn_nameptr, cnp->cn_namelen, vap,
 	    rdev, vap->va_type, cnp->cn_cred, cnp->cn_thread, &dnfsva,
 	    &nfsva, &nfhp, &attrflag, &dattrflag, NULL);
 	if (!error) {
 		/*
 		 * If the reply carried no file handle, try a lookup of the
 		 * new name to get one (the lookup's own error is ignored).
 		 */
 		if (!nfhp)
 			(void) nfsrpc_lookup(dvp, cnp->cn_nameptr,
 			    cnp->cn_namelen, cnp->cn_cred, cnp->cn_thread,
 			    &dnfsva, &nfsva, &nfhp, &attrflag, &dattrflag,
 			    NULL);
 		if (nfhp)
 			error = nfscl_nget(dvp->v_mount, dvp, nfhp, cnp,
 			    cnp->cn_thread, &np, NULL, LK_EXCLUSIVE);
 	}
 	/* Refresh the directory's cached attributes if the reply had them. */
 	if (dattrflag)
 		(void) nfscl_loadattrcache(&dvp, &dnfsva, NULL, NULL, 0, 1);
 	if (!error) {
 		/*
 		 * NOTE(review): np is only set by nfscl_nget() above, so it is
 		 * still NULL here if no file handle was obtained -- confirm
 		 * that nfhp cannot remain NULL on this path.
 		 */
 		newvp = NFSTOV(np);
 		if (attrflag != 0) {
 			error = nfscl_loadattrcache(&newvp, &nfsva, NULL, NULL,
 			    0, 1);
 			/* Release the new vnode if its attributes won't load. */
 			if (error != 0)
 				vput(newvp);
 		}
 	}
 	/* Hand back the new vnode on success; map NFSv4 errors otherwise. */
 	if (!error) {
 		*vpp = newvp;
 	} else if (NFS_ISV4(dvp)) {
 		error = nfscl_maperr(cnp->cn_thread, error, vap->va_uid,
 		    vap->va_gid);
 	}
 	/*
 	 * Mark the directory modified in all cases and, when no directory
 	 * attributes came back, zero n_attrstamp so they are flushed.
 	 */
 	dnp = VTONFS(dvp);
 	mtx_lock(&dnp->n_mtx);
 	dnp->n_flag |= NMODIFIED;
 	if (!dattrflag) {
 		dnp->n_attrstamp = 0;
 		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(dvp);
 	}
 	mtx_unlock(&dnp->n_mtx);
 	return (error);
 }
 
 /*
  * nfs mknod vop
  * just call nfs_mknodrpc() to do the work.
  */
 /* ARGSUSED */
 static int
 nfs_mknod(struct vop_mknod_args *ap)
 {
 	return (nfs_mknodrpc(ap->a_dvp, ap->a_vpp, ap->a_cnp, ap->a_vap));
 }
 
 /*
  * Mutex held by nfs_get_cverf() while it reads and updates its static
  * create verifier state.
  */
 static struct mtx nfs_cverf_mtx;
 MTX_SYSINIT(nfs_cverf_mtx, &nfs_cverf_mtx, "NFS create verifier mutex",
     MTX_DEF);
 
 static nfsquad_t
 nfs_get_cverf(void)
 {
 	static nfsquad_t verf;
 	static int seeded = 0;
 	nfsquad_t snapshot;
 
 	/*
 	 * Return a create verifier: the first call seeds both lval words
 	 * from arc4random(), every later call increments qval.  The static
 	 * state is only touched with nfs_cverf_mtx held.
 	 */
 	mtx_lock(&nfs_cverf_mtx);
 	if (seeded != 0) {
 		verf.qval++;
 	} else {
 		verf.lval[0] = arc4random();
 		verf.lval[1] = arc4random();
 		seeded = 1;
 	}
 	snapshot = verf;
 	mtx_unlock(&nfs_cverf_mtx);
 
 	return (snapshot);
 }
 
 /*
  * nfs file create call
  */
 static int
 nfs_create(struct vop_create_args *ap)
 {
 	struct vnode *dvp = ap->a_dvp;
 	struct vattr *vap = ap->a_vap;
 	struct componentname *cnp = ap->a_cnp;
 	struct nfsnode *np = NULL, *dnp;
 	struct vnode *newvp = NULL;
 	struct nfsmount *nmp;
 	struct nfsvattr dnfsva, nfsva;
 	struct nfsfh *nfhp;
 	nfsquad_t cverf;
 	int error = 0, attrflag, dattrflag, fmode = 0;
 	struct vattr vattr;
 
 	/*
 	 * Sockets are not made with the create RPC; hand them to the
 	 * mknod path instead.
 	 */
 	if (vap->va_type == VSOCK)
 		return (nfs_mknodrpc(dvp, ap->a_vpp, cnp, vap));
 
 	/* Only the return value is used; vattr itself is not examined. */
 	if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred)))
 		return (error);
 	if (vap->va_vaflags & VA_EXCLUSIVE)
 		fmode |= O_EXCL;
 	dnp = VTONFS(dvp);
 	nmp = VFSTONFS(vnode_mount(dvp));
 again:
 	/* For NFSv4, wait until any remove is done. */
 	mtx_lock(&dnp->n_mtx);
 	while (NFSHASNFSV4(nmp) && (dnp->n_flag & NREMOVEINPROG)) {
 		dnp->n_flag |= NREMOVEWANT;
 		(void) msleep((caddr_t)dnp, &dnp->n_mtx, PZERO, "nfscrt", 0);
 	}
 	mtx_unlock(&dnp->n_mtx);
 
 	/* A fresh verifier is fetched for every attempt, retries included. */
 	cverf = nfs_get_cverf();
 	error = nfsrpc_create(dvp, cnp->cn_nameptr, cnp->cn_namelen,
 	    vap, cverf, fmode, cnp->cn_cred, cnp->cn_thread, &dnfsva, &nfsva,
 	    &nfhp, &attrflag, &dattrflag, NULL);
 	if (!error) {
 		/*
 		 * If the reply carried no file handle, try a lookup of the
 		 * new name to get one (the lookup's own error is ignored).
 		 */
 		if (nfhp == NULL)
 			(void) nfsrpc_lookup(dvp, cnp->cn_nameptr,
 			    cnp->cn_namelen, cnp->cn_cred, cnp->cn_thread,
 			    &dnfsva, &nfsva, &nfhp, &attrflag, &dattrflag,
 			    NULL);
 		if (nfhp != NULL)
 			error = nfscl_nget(dvp->v_mount, dvp, nfhp, cnp,
 			    cnp->cn_thread, &np, NULL, LK_EXCLUSIVE);
 	}
 	if (dattrflag)
 		(void) nfscl_loadattrcache(&dvp, &dnfsva, NULL, NULL, 0, 1);
 	if (!error) {
 		/*
 		 * NOTE(review): np is only set by nfscl_nget() above, so it is
 		 * still NULL here if no file handle was obtained -- confirm
 		 * that nfhp cannot remain NULL on this path.
 		 */
 		newvp = NFSTOV(np);
 		/* No attributes in the reply: fetch them explicitly. */
 		if (attrflag == 0)
 			error = nfsrpc_getattr(newvp, cnp->cn_cred,
 			    cnp->cn_thread, &nfsva, NULL);
 		if (error == 0)
 			error = nfscl_loadattrcache(&newvp, &nfsva, NULL, NULL,
 			    0, 1);
 	}
 	if (error) {
 		if (newvp != NULL) {
 			vput(newvp);
 			newvp = NULL;
 		}
 		/*
 		 * An exclusive create that the v3/v4 server reports as not
 		 * supported is retried as a non-exclusive create.
 		 */
 		if (NFS_ISV34(dvp) && (fmode & O_EXCL) &&
 		    error == NFSERR_NOTSUPP) {
 			fmode &= ~O_EXCL;
 			goto again;
 		}
 	} else if (NFS_ISV34(dvp) && (fmode & O_EXCL)) {
 		/*
 		 * After a successful exclusive create, apply the requested
 		 * attributes with a setattr if nfscl_checksattr() says so.
 		 */
 		if (nfscl_checksattr(vap, &nfsva)) {
 			error = nfsrpc_setattr(newvp, vap, NULL, cnp->cn_cred,
 			    cnp->cn_thread, &nfsva, &attrflag, NULL);
 			if (error && (vap->va_uid != (uid_t)VNOVAL ||
 			    vap->va_gid != (gid_t)VNOVAL)) {
 				/* try again without setting uid/gid */
 				vap->va_uid = (uid_t)VNOVAL;
 				vap->va_gid = (gid_t)VNOVAL;
 				error = nfsrpc_setattr(newvp, vap, NULL,
 				    cnp->cn_cred, cnp->cn_thread, &nfsva,
 				    &attrflag, NULL);
 			}
 			if (attrflag)
 				(void) nfscl_loadattrcache(&newvp, &nfsva, NULL,
 				    NULL, 0, 1);
 			if (error != 0)
 				vput(newvp);
 		}
 	}
 	if (!error) {
 		/* Enter the new name in the name cache when asked to. */
 		if ((cnp->cn_flags & MAKEENTRY) && attrflag)
 			cache_enter_time(dvp, newvp, cnp, &nfsva.na_ctime,
 			    NULL);
 		*ap->a_vpp = newvp;
 	} else if (NFS_ISV4(dvp)) {
 		error = nfscl_maperr(cnp->cn_thread, error, vap->va_uid,
 		    vap->va_gid);
 	}
 	/*
 	 * Mark the directory modified in all cases and, when no directory
 	 * attributes came back, zero n_attrstamp so they are flushed.
 	 */
 	mtx_lock(&dnp->n_mtx);
 	dnp->n_flag |= NMODIFIED;
 	if (!dattrflag) {
 		dnp->n_attrstamp = 0;
 		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(dvp);
 	}
 	mtx_unlock(&dnp->n_mtx);
 	return (error);
 }
 
 /*
  * nfs file remove call
  * To try and make nfs semantics closer to ufs semantics, a file that has
  * other processes using the vnode is renamed instead of removed and then
  * removed later on the last close.
  * - If v_usecount > 1
  *	  If a rename is not already in the works
  *	     call nfs_sillyrename() to set it up
  *     else
  *	  do the remove rpc
  */
 static int
 nfs_remove(struct vop_remove_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vnode *dvp = ap->a_dvp;
 	struct componentname *cnp = ap->a_cnp;
 	struct nfsnode *np = VTONFS(vp);
 	int error = 0;
 	struct vattr vattr;
 
 	KASSERT((cnp->cn_flags & HASBUF) != 0, ("nfs_remove: no name"));
 	KASSERT(vrefcnt(vp) > 0, ("nfs_remove: bad v_usecount"));
 	if (vp->v_type == VDIR)
 		error = EPERM;
 	else if (vrefcnt(vp) == 1 || (np->n_sillyrename &&
 	    VOP_GETATTR(vp, &vattr, cnp->cn_cred) == 0 &&
 	    vattr.va_nlink > 1)) {
 		/*
 		 * Purge the name cache so that the chance of a lookup for
 		 * the name succeeding while the remove is in progress is
 		 * minimized. Without node locking it can still happen, such
 		 * that an I/O op returns ESTALE, but since you get this if
 		 * another host removes the file..
 		 */
 		cache_purge(vp);
 		/*
 		 * throw away biocache buffers, mainly to avoid
 		 * unnecessary delayed writes later.
 		 */
 		error = ncl_vinvalbuf(vp, 0, cnp->cn_thread, 1);
 		if (error != EINTR && error != EIO)
 			/* Do the rpc */
 			error = nfs_removerpc(dvp, vp, cnp->cn_nameptr,
 			    cnp->cn_namelen, cnp->cn_cred, cnp->cn_thread);
 		/*
 		 * Kludge City: If the first reply to the remove rpc is lost..
 		 *   the reply to the retransmitted request will be ENOENT
 		 *   since the file was in fact removed
 		 *   Therefore, we cheat and return success.
 		 */
 		if (error == ENOENT)
 			error = 0;
 	} else if (!np->n_sillyrename)
 		error = nfs_sillyrename(dvp, vp, cnp);
 	mtx_lock(&np->n_mtx);
 	np->n_attrstamp = 0;
 	mtx_unlock(&np->n_mtx);
 	KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
 	return (error);
 }
 
 /*
  * nfs file remove rpc called from nfs_inactive
  */
 int
 ncl_removeit(struct sillyrename *sp, struct vnode *vp)
 {
 	/*
 	 * Make sure that the directory vnode is still valid.
 	 * XXX we should lock sp->s_dvp here.
 	 */
 	if (sp->s_dvp->v_type == VBAD)
 		return (0);
 	return (nfs_removerpc(sp->s_dvp, vp, sp->s_name, sp->s_namlen,
 	    sp->s_cred, NULL));
 }
 
 /*
  * Nfs remove rpc, called from nfs_remove() and ncl_removeit().
  */
 static int
 nfs_removerpc(struct vnode *dvp, struct vnode *vp, char *name,
     int namelen, struct ucred *cred, struct thread *td)
 {
 	struct nfsvattr dnfsva;
 	struct nfsnode *dnp = VTONFS(dvp);
 	int error = 0, dattrflag;
 
 	mtx_lock(&dnp->n_mtx);
 	dnp->n_flag |= NREMOVEINPROG;
 	mtx_unlock(&dnp->n_mtx);
 	error = nfsrpc_remove(dvp, name, namelen, vp, cred, td, &dnfsva,
 	    &dattrflag, NULL);
 	mtx_lock(&dnp->n_mtx);
 	if ((dnp->n_flag & NREMOVEWANT)) {
 		dnp->n_flag &= ~(NREMOVEWANT | NREMOVEINPROG);
 		mtx_unlock(&dnp->n_mtx);
 		wakeup((caddr_t)dnp);
 	} else {
 		dnp->n_flag &= ~NREMOVEINPROG;
 		mtx_unlock(&dnp->n_mtx);
 	}
 	if (dattrflag)
 		(void) nfscl_loadattrcache(&dvp, &dnfsva, NULL, NULL, 0, 1);
 	mtx_lock(&dnp->n_mtx);
 	dnp->n_flag |= NMODIFIED;
 	if (!dattrflag) {
 		dnp->n_attrstamp = 0;
 		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(dvp);
 	}
 	mtx_unlock(&dnp->n_mtx);
 	if (error && NFS_ISV4(dvp))
 		error = nfscl_maperr(td, error, (uid_t)0, (gid_t)0);
 	return (error);
 }
 
 /*
  * nfs file rename call
  */
 static int
 nfs_rename(struct vop_rename_args *ap)
 {
 	struct vnode *fvp = ap->a_fvp;
 	struct vnode *tvp = ap->a_tvp;
 	struct vnode *fdvp = ap->a_fdvp;
 	struct vnode *tdvp = ap->a_tdvp;
 	struct componentname *tcnp = ap->a_tcnp;
 	struct componentname *fcnp = ap->a_fcnp;
 	struct nfsnode *fnp = VTONFS(ap->a_fvp);
 	struct nfsnode *tdnp = VTONFS(ap->a_tdvp);
 	struct nfsv4node *newv4 = NULL;
 	int error;
 
 	/*
 	 * Every exit from here on goes through "out:", which drops the
 	 * references on all four vnodes.
 	 */
 	KASSERT((tcnp->cn_flags & HASBUF) != 0 &&
 	    (fcnp->cn_flags & HASBUF) != 0, ("nfs_rename: no name"));
 	/* Check for cross-device rename */
 	if ((fvp->v_mount != tdvp->v_mount) ||
 	    (tvp && (fvp->v_mount != tvp->v_mount))) {
 		error = EXDEV;
 		goto out;
 	}
 
 	if (fvp == tvp) {
 		printf("nfs_rename: fvp == tvp (can't happen)\n");
 		error = 0;
 		goto out;
 	}
 	/* fvp is locked only for the duration of its fsync below. */
 	if ((error = NFSVOPLOCK(fvp, LK_EXCLUSIVE)) != 0)
 		goto out;
 
 	/*
 	 * We have to flush B_DELWRI data prior to renaming
 	 * the file.  If we don't, the delayed-write buffers
 	 * can be flushed out later after the file has gone stale
 	 * under NFSV3.  NFSV2 does not have this problem because
 	 * ( as far as I can tell ) it flushes dirty buffers more
 	 * often.
 	 *
 	 * Skip the rename operation if the fsync fails, this can happen
 	 * due to the server's volume being full, when we pushed out data
 	 * that was written back to our cache earlier. Not checking for
 	 * this condition can result in potential (silent) data loss.
 	 */
 	error = VOP_FSYNC(fvp, MNT_WAIT, fcnp->cn_thread);
 	NFSVOPUNLOCK(fvp, 0);
 	if (!error && tvp)
 		error = VOP_FSYNC(tvp, MNT_WAIT, tcnp->cn_thread);
 	if (error)
 		goto out;
 
 	/*
 	 * If the tvp exists and is in use, sillyrename it before doing the
 	 * rename of the new file over it.
 	 * XXX Can't sillyrename a directory.
 	 *
 	 * When the sillyrename succeeds, tvp is released here and cleared
 	 * so that neither the rename rpc nor "out:" sees it again.
 	 */
 	if (tvp && vrefcnt(tvp) > 1 && !VTONFS(tvp)->n_sillyrename &&
 		tvp->v_type != VDIR && !nfs_sillyrename(tdvp, tvp, tcnp)) {
 		vput(tvp);
 		tvp = NULL;
 	}
 
 	error = nfs_renamerpc(fdvp, fvp, fcnp->cn_nameptr, fcnp->cn_namelen,
 	    tdvp, tvp, tcnp->cn_nameptr, tcnp->cn_namelen, tcnp->cn_cred,
 	    tcnp->cn_thread);
 
 	if (error == 0 && NFS_ISV4(tdvp)) {
 		/*
 		 * For NFSv4, check to see if it is the same name and
 		 * replace the name, if it is different.
 		 *
 		 * The replacement nfsv4node is allocated up front, before
 		 * the node mutexes are taken, and freed again below if it
 		 * turns out not to be needed.
 		 */
 		MALLOC(newv4, struct nfsv4node *,
 		    sizeof (struct nfsv4node) +
 		    tdnp->n_fhp->nfh_len + tcnp->cn_namelen - 1,
 		    M_NFSV4NODE, M_WAITOK);
 		mtx_lock(&tdnp->n_mtx);
 		mtx_lock(&fnp->n_mtx);
 		/*
 		 * Replace only for a regular file that has an n_v4 whose
 		 * recorded name or directory file handle differs from the
 		 * rename target.
 		 */
 		if (fnp->n_v4 != NULL && fvp->v_type == VREG &&
 		    (fnp->n_v4->n4_namelen != tcnp->cn_namelen ||
 		      NFSBCMP(tcnp->cn_nameptr, NFS4NODENAME(fnp->n_v4),
 		      tcnp->cn_namelen) ||
 		      tdnp->n_fhp->nfh_len != fnp->n_v4->n4_fhlen ||
 		      NFSBCMP(tdnp->n_fhp->nfh_fh, fnp->n_v4->n4_data,
 			tdnp->n_fhp->nfh_len))) {
 #ifdef notdef
 { char nnn[100]; int nnnl;
 nnnl = (tcnp->cn_namelen < 100) ? tcnp->cn_namelen : 99;
 bcopy(tcnp->cn_nameptr, nnn, nnnl);
 nnn[nnnl] = '\0';
 printf("ren replace=%s\n",nnn);
 }
 #endif
 			/* Swap in the new node; newv4 is consumed here. */
 			FREE((caddr_t)fnp->n_v4, M_NFSV4NODE);
 			fnp->n_v4 = newv4;
 			newv4 = NULL;
 			fnp->n_v4->n4_fhlen = tdnp->n_fhp->nfh_len;
 			fnp->n_v4->n4_namelen = tcnp->cn_namelen;
 			NFSBCOPY(tdnp->n_fhp->nfh_fh, fnp->n_v4->n4_data,
 			    tdnp->n_fhp->nfh_len);
 			NFSBCOPY(tcnp->cn_nameptr,
 			    NFS4NODENAME(fnp->n_v4), tcnp->cn_namelen);
 		}
 		mtx_unlock(&tdnp->n_mtx);
 		mtx_unlock(&fnp->n_mtx);
 		/* Not consumed above: give the allocation back. */
 		if (newv4 != NULL)
 			FREE((caddr_t)newv4, M_NFSV4NODE);
 	}
 
 	/*
 	 * When a directory was renamed, purge the name cache for the source
 	 * directory, and for the target directory too if the rename replaced
 	 * an existing directory.
 	 */
 	if (fvp->v_type == VDIR) {
 		if (tvp != NULL && tvp->v_type == VDIR)
 			cache_purge(tdvp);
 		cache_purge(fdvp);
 	}
 
 out:
 	/*
 	 * Release the vnodes: tdvp gets vput() unless it is the same vnode
 	 * as tvp (then vrele()), tvp gets vput() if still set, and fdvp and
 	 * fvp get vrele().
 	 */
 	if (tdvp == tvp)
 		vrele(tdvp);
 	else
 		vput(tdvp);
 	if (tvp)
 		vput(tvp);
 	vrele(fdvp);
 	vrele(fvp);
 	/*
 	 * Kludge: Map ENOENT => 0 assuming that it is a reply to a retry.
 	 */
 	if (error == ENOENT)
 		error = 0;
 	return (error);
 }
 
 /*
  * nfs file rename rpc called from nfs_sillyrename() below
  */
 static int
 nfs_renameit(struct vnode *sdvp, struct vnode *svp, struct componentname *scnp,
     struct sillyrename *sp)
 {
 
 	return (nfs_renamerpc(sdvp, svp, scnp->cn_nameptr, scnp->cn_namelen,
 	    sdvp, NULL, sp->s_name, sp->s_namlen, scnp->cn_cred,
 	    scnp->cn_thread));
 }
 
 /*
  * Do an nfs rename rpc. Called from nfs_rename() and nfs_renameit().
  */
 static int
 nfs_renamerpc(struct vnode *fdvp, struct vnode *fvp, char *fnameptr,
     int fnamelen, struct vnode *tdvp, struct vnode *tvp, char *tnameptr,
     int tnamelen, struct ucred *cred, struct thread *td)
 {
 	struct nfsvattr fnfsva, tnfsva;
 	struct nfsnode *fdnp = VTONFS(fdvp);
 	struct nfsnode *tdnp = VTONFS(tdvp);
 	int error = 0, fattrflag, tattrflag;
 
 	error = nfsrpc_rename(fdvp, fvp, fnameptr, fnamelen, tdvp, tvp,
 	    tnameptr, tnamelen, cred, td, &fnfsva, &tnfsva, &fattrflag,
 	    &tattrflag, NULL, NULL);
 	mtx_lock(&fdnp->n_mtx);
 	fdnp->n_flag |= NMODIFIED;
 	if (fattrflag != 0) {
 		mtx_unlock(&fdnp->n_mtx);
 		(void) nfscl_loadattrcache(&fdvp, &fnfsva, NULL, NULL, 0, 1);
 	} else {
 		fdnp->n_attrstamp = 0;
 		mtx_unlock(&fdnp->n_mtx);
 		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(fdvp);
 	}
 	mtx_lock(&tdnp->n_mtx);
 	tdnp->n_flag |= NMODIFIED;
 	if (tattrflag != 0) {
 		mtx_unlock(&tdnp->n_mtx);
 		(void) nfscl_loadattrcache(&tdvp, &tnfsva, NULL, NULL, 0, 1);
 	} else {
 		tdnp->n_attrstamp = 0;
 		mtx_unlock(&tdnp->n_mtx);
 		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(tdvp);
 	}
 	if (error && NFS_ISV4(fdvp))
 		error = nfscl_maperr(td, error, (uid_t)0, (gid_t)0);
 	return (error);
 }
 
 /*
  * nfs hard link create call
  */
 static int
 nfs_link(struct vop_link_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vnode *tdvp = ap->a_tdvp;
 	struct componentname *cnp = ap->a_cnp;
 	struct nfsnode *np, *tdnp;
 	struct nfsvattr nfsva, dnfsva;
 	int error = 0, attrflag, dattrflag;
 
 	/*
 	 * Push all writes to the server, so that the attribute cache
 	 * doesn't get "out of sync" with the server.
 	 * XXX There should be a better way!
 	 */
 	VOP_FSYNC(vp, MNT_WAIT, cnp->cn_thread);
 
 	error = nfsrpc_link(tdvp, vp, cnp->cn_nameptr, cnp->cn_namelen,
 	    cnp->cn_cred, cnp->cn_thread, &dnfsva, &nfsva, &attrflag,
 	    &dattrflag, NULL);
 	tdnp = VTONFS(tdvp);
 	mtx_lock(&tdnp->n_mtx);
 	tdnp->n_flag |= NMODIFIED;
 	if (dattrflag != 0) {
 		mtx_unlock(&tdnp->n_mtx);
 		(void) nfscl_loadattrcache(&tdvp, &dnfsva, NULL, NULL, 0, 1);
 	} else {
 		tdnp->n_attrstamp = 0;
 		mtx_unlock(&tdnp->n_mtx);
 		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(tdvp);
 	}
 	if (attrflag)
 		(void) nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0, 1);
 	else {
 		np = VTONFS(vp);
 		mtx_lock(&np->n_mtx);
 		np->n_attrstamp = 0;
 		mtx_unlock(&np->n_mtx);
 		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
 	}
 	/*
 	 * If negative lookup caching is enabled, I might as well
 	 * add an entry for this node. Not necessary for correctness,
 	 * but if negative caching is enabled, then the system
 	 * must care about lookup caching hit rate, so...
 	 */
 	if (VFSTONFS(vp->v_mount)->nm_negnametimeo != 0 &&
 	    (cnp->cn_flags & MAKEENTRY) && attrflag != 0 && error == 0) {
 		cache_enter_time(tdvp, vp, cnp, &nfsva.na_ctime, NULL);
 	}
 	if (error && NFS_ISV4(vp))
 		error = nfscl_maperr(cnp->cn_thread, error, (uid_t)0,
 		    (gid_t)0);
 	return (error);
 }
 
 /*
  * nfs symbolic link create call
  */
 static int
 nfs_symlink(struct vop_symlink_args *ap)
 {
 	struct vnode *dvp = ap->a_dvp;
 	struct vattr *vap = ap->a_vap;
 	struct componentname *cnp = ap->a_cnp;
 	struct nfsvattr nfsva, dnfsva;
 	struct nfsfh *nfhp;
 	struct nfsnode *np = NULL, *dnp;
 	struct vnode *newvp = NULL;
 	int error = 0, attrflag, dattrflag, ret;
 
 	/* Create the link; ap->a_target is handed to the rpc unchanged. */
 	vap->va_type = VLNK;
 	error = nfsrpc_symlink(dvp, cnp->cn_nameptr, cnp->cn_namelen,
 	    ap->a_target, vap, cnp->cn_cred, cnp->cn_thread, &dnfsva,
 	    &nfsva, &nfhp, &attrflag, &dattrflag, NULL);
 	/*
 	 * A file handle in the reply is turned into a vnode.  A failure
 	 * to do so is only reported when the rpc itself did not fail.
 	 */
 	if (nfhp) {
 		ret = nfscl_nget(dvp->v_mount, dvp, nfhp, cnp, cnp->cn_thread,
 		    &np, NULL, LK_EXCLUSIVE);
 		if (!ret)
 			newvp = NFSTOV(np);
 		else if (!error)
 			error = ret;
 	}
 	if (newvp != NULL) {
 		if (attrflag)
 			(void) nfscl_loadattrcache(&newvp, &nfsva, NULL, NULL,
 			    0, 1);
 	} else if (!error) {
 		/*
 		 * If we do not have an error and we could not extract the
 		 * newvp from the response due to the request being NFSv2, we
 		 * have to do a lookup in order to obtain a newvp to return.
 		 */
 		error = nfs_lookitup(dvp, cnp->cn_nameptr, cnp->cn_namelen,
 		    cnp->cn_cred, cnp->cn_thread, &np);
 		if (!error)
 			newvp = NFSTOV(np);
 	}
 	/* On error drop any vnode obtained above; otherwise return it. */
 	if (error) {
 		if (newvp)
 			vput(newvp);
 		if (NFS_ISV4(dvp))
 			error = nfscl_maperr(cnp->cn_thread, error,
 			    vap->va_uid, vap->va_gid);
 	} else {
 		*ap->a_vpp = newvp;
 	}
 
 	/*
 	 * Mark the directory modified, then either load the directory
 	 * attributes from the reply or invalidate the cached ones.
 	 */
 	dnp = VTONFS(dvp);
 	mtx_lock(&dnp->n_mtx);
 	dnp->n_flag |= NMODIFIED;
 	if (dattrflag != 0) {
 		mtx_unlock(&dnp->n_mtx);
 		(void) nfscl_loadattrcache(&dvp, &dnfsva, NULL, NULL, 0, 1);
 	} else {
 		dnp->n_attrstamp = 0;
 		mtx_unlock(&dnp->n_mtx);
 		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(dvp);
 	}
 	/*
 	 * If negative lookup caching is enabled, I might as well
 	 * add an entry for this node. Not necessary for correctness,
 	 * but if negative caching is enabled, then the system
 	 * must care about lookup caching hit rate, so...
 	 */
 	if (VFSTONFS(dvp->v_mount)->nm_negnametimeo != 0 &&
 	    (cnp->cn_flags & MAKEENTRY) && attrflag != 0 && error == 0) {
 		cache_enter_time(dvp, newvp, cnp, &nfsva.na_ctime, NULL);
 	}
 	return (error);
 }
 
 /*
  * nfs make dir call
  */
 static int
 nfs_mkdir(struct vop_mkdir_args *ap)
 {
 	struct vnode *dvp = ap->a_dvp;
 	struct vattr *vap = ap->a_vap;
 	struct componentname *cnp = ap->a_cnp;
 	struct nfsnode *np = NULL, *dnp;
 	struct vnode *newvp = NULL;
 	struct vattr vattr;
 	struct nfsfh *nfhp;
 	struct nfsvattr nfsva, dnfsva;
 	int error = 0, attrflag, dattrflag, ret;
 
 	if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred)) != 0)
 		return (error);
 	vap->va_type = VDIR;
 	error = nfsrpc_mkdir(dvp, cnp->cn_nameptr, cnp->cn_namelen,
 	    vap, cnp->cn_cred, cnp->cn_thread, &dnfsva, &nfsva, &nfhp,
 	    &attrflag, &dattrflag, NULL);
 	dnp = VTONFS(dvp);
 	mtx_lock(&dnp->n_mtx);
 	dnp->n_flag |= NMODIFIED;
 	if (dattrflag != 0) {
 		mtx_unlock(&dnp->n_mtx);
 		(void) nfscl_loadattrcache(&dvp, &dnfsva, NULL, NULL, 0, 1);
 	} else {
 		dnp->n_attrstamp = 0;
 		mtx_unlock(&dnp->n_mtx);
 		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(dvp);
 	}
 	if (nfhp) {
 		ret = nfscl_nget(dvp->v_mount, dvp, nfhp, cnp, cnp->cn_thread,
 		    &np, NULL, LK_EXCLUSIVE);
 		if (!ret) {
 			newvp = NFSTOV(np);
 			if (attrflag)
 			   (void) nfscl_loadattrcache(&newvp, &nfsva, NULL,
 				NULL, 0, 1);
 		} else if (!error)
 			error = ret;
 	}
 	if (!error && newvp == NULL) {
 		error = nfs_lookitup(dvp, cnp->cn_nameptr, cnp->cn_namelen,
 		    cnp->cn_cred, cnp->cn_thread, &np);
 		if (!error) {
 			newvp = NFSTOV(np);
 			if (newvp->v_type != VDIR)
 				error = EEXIST;
 		}
 	}
 	if (error) {
 		if (newvp)
 			vput(newvp);
 		if (NFS_ISV4(dvp))
 			error = nfscl_maperr(cnp->cn_thread, error,
 			    vap->va_uid, vap->va_gid);
 	} else {
 		/*
 		 * If negative lookup caching is enabled, I might as well
 		 * add an entry for this node. Not necessary for correctness,
 		 * but if negative caching is enabled, then the system
 		 * must care about lookup caching hit rate, so...
 		 */
 		if (VFSTONFS(dvp->v_mount)->nm_negnametimeo != 0 &&
 		    (cnp->cn_flags & MAKEENTRY) &&
 		    attrflag != 0 && dattrflag != 0)
 			cache_enter_time(dvp, newvp, cnp, &nfsva.na_ctime,
 			    &dnfsva.na_ctime);
 		*ap->a_vpp = newvp;
 	}
 	return (error);
 }
 
 /*
  * nfs remove directory call
  */
 static int
 nfs_rmdir(struct vop_rmdir_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vnode *dvp = ap->a_dvp;
 	struct componentname *cnp = ap->a_cnp;
 	struct nfsnode *dnp;
 	struct nfsvattr dnfsva;
 	int error, dattrflag;
 
 	if (dvp == vp)
 		return (EINVAL);
 	error = nfsrpc_rmdir(dvp, cnp->cn_nameptr, cnp->cn_namelen,
 	    cnp->cn_cred, cnp->cn_thread, &dnfsva, &dattrflag, NULL);
 	dnp = VTONFS(dvp);
 	mtx_lock(&dnp->n_mtx);
 	dnp->n_flag |= NMODIFIED;
 	if (dattrflag != 0) {
 		mtx_unlock(&dnp->n_mtx);
 		(void) nfscl_loadattrcache(&dvp, &dnfsva, NULL, NULL, 0, 1);
 	} else {
 		dnp->n_attrstamp = 0;
 		mtx_unlock(&dnp->n_mtx);
 		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(dvp);
 	}
 
 	cache_purge(dvp);
 	cache_purge(vp);
 	if (error && NFS_ISV4(dvp))
 		error = nfscl_maperr(cnp->cn_thread, error, (uid_t)0,
 		    (gid_t)0);
 	/*
 	 * Kludge: Map ENOENT => 0 assuming that you have a reply to a retry.
 	 */
 	if (error == ENOENT)
 		error = 0;
 	return (error);
 }
 
 /*
  * nfs readdir call
  */
 static int
 nfs_readdir(struct vop_readdir_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct nfsnode *np = VTONFS(vp);
 	struct uio *uio = ap->a_uio;
 	ssize_t tresid, left;
 	int error = 0;
 	struct vattr vattr;
 	
 	if (ap->a_eofflag != NULL)
 		*ap->a_eofflag = 0;
 	if (vp->v_type != VDIR) 
 		return(EPERM);
 
 	/*
 	 * First, check for hit on the EOF offset cache
 	 */
 	if (np->n_direofoffset > 0 && uio->uio_offset >= np->n_direofoffset &&
 	    (np->n_flag & NMODIFIED) == 0) {
 		if (VOP_GETATTR(vp, &vattr, ap->a_cred) == 0) {
 			mtx_lock(&np->n_mtx);
 			if ((NFS_ISV4(vp) && np->n_change == vattr.va_filerev) ||
 			    !NFS_TIMESPEC_COMPARE(&np->n_mtime, &vattr.va_mtime)) {
 				mtx_unlock(&np->n_mtx);
 				NFSINCRGLOBAL(nfsstatsv1.direofcache_hits);
 				if (ap->a_eofflag != NULL)
 					*ap->a_eofflag = 1;
 				return (0);
 			} else
 				mtx_unlock(&np->n_mtx);
 		}
 	}
 
 	/*
 	 * NFS always guarantees that directory entries don't straddle
 	 * DIRBLKSIZ boundaries.  As such, we need to limit the size
 	 * to an exact multiple of DIRBLKSIZ, to avoid copying a partial
 	 * directory entry.
 	 */
 	left = uio->uio_resid % DIRBLKSIZ;
 	if (left == uio->uio_resid)
 		return (EINVAL);
 	uio->uio_resid -= left;
 
 	/*
 	 * Call ncl_bioread() to do the real work.
 	 */
 	tresid = uio->uio_resid;
 	error = ncl_bioread(vp, uio, 0, ap->a_cred);
 
 	if (!error && uio->uio_resid == tresid) {
 		NFSINCRGLOBAL(nfsstatsv1.direofcache_misses);
 		if (ap->a_eofflag != NULL)
 			*ap->a_eofflag = 1;
 	}
 	
 	/* Add the partial DIRBLKSIZ (left) back in. */
 	uio->uio_resid += left;
 	return (error);
 }
 
 /*
  * Readdir rpc call.
  * Called from below the buffer cache by ncl_doio().
  */
 int
 ncl_readdirrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred,
     struct thread *td)
 {
 	struct nfsvattr nfsva;
 	nfsuint64 *cookiep, cookie;
 	struct nfsnode *dnp = VTONFS(vp);
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	int error = 0, eof, attrflag;
 
 	KASSERT(uiop->uio_iovcnt == 1 &&
 	    (uiop->uio_offset & (DIRBLKSIZ - 1)) == 0 &&
 	    (uiop->uio_resid & (DIRBLKSIZ - 1)) == 0,
 	    ("nfs readdirrpc bad uio"));
 
 	/*
 	 * If there is no cookie, assume directory was stale.
 	 */
 	ncl_dircookie_lock(dnp);
 	cookiep = ncl_getcookie(dnp, uiop->uio_offset, 0);
 	if (cookiep) {
 		cookie = *cookiep;
 		ncl_dircookie_unlock(dnp);
 	} else {
 		ncl_dircookie_unlock(dnp);		
 		return (NFSERR_BAD_COOKIE);
 	}
 
 	if (NFSHASNFSV3(nmp) && !NFSHASGOTFSINFO(nmp))
 		(void)ncl_fsinfo(nmp, vp, cred, td);
 
 	error = nfsrpc_readdir(vp, uiop, &cookie, cred, td, &nfsva,
 	    &attrflag, &eof, NULL);
 	if (attrflag)
 		(void) nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0, 1);
 
 	if (!error) {
 		/*
 		 * We are now either at the end of the directory or have filled
 		 * the block.
 		 */
 		if (eof)
 			dnp->n_direofoffset = uiop->uio_offset;
 		else {
 			if (uiop->uio_resid > 0)
 				printf("EEK! readdirrpc resid > 0\n");
 			ncl_dircookie_lock(dnp);
 			cookiep = ncl_getcookie(dnp, uiop->uio_offset, 1);
 			*cookiep = cookie;
 			ncl_dircookie_unlock(dnp);
 		}
 	} else if (NFS_ISV4(vp)) {
 		error = nfscl_maperr(td, error, (uid_t)0, (gid_t)0);
 	}
 	return (error);
 }
 
 /*
  * NFS V3 readdir plus RPC. Used in place of ncl_readdirrpc().
  */
 int
 ncl_readdirplusrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred,
     struct thread *td)
 {
 	struct nfsvattr nfsva;
 	nfsuint64 *cookiep, cookie;
 	struct nfsnode *dnp = VTONFS(vp);
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	int error = 0, attrflag, eof;
 
 	KASSERT(uiop->uio_iovcnt == 1 &&
 	    (uiop->uio_offset & (DIRBLKSIZ - 1)) == 0 &&
 	    (uiop->uio_resid & (DIRBLKSIZ - 1)) == 0,
 	    ("nfs readdirplusrpc bad uio"));
 
 	/*
 	 * If there is no cookie, assume directory was stale.
 	 */
 	ncl_dircookie_lock(dnp);
 	cookiep = ncl_getcookie(dnp, uiop->uio_offset, 0);
 	if (cookiep) {
 		cookie = *cookiep;
 		ncl_dircookie_unlock(dnp);
 	} else {
 		ncl_dircookie_unlock(dnp);
 		return (NFSERR_BAD_COOKIE);
 	}
 
 	if (NFSHASNFSV3(nmp) && !NFSHASGOTFSINFO(nmp))
 		(void)ncl_fsinfo(nmp, vp, cred, td);
 	error = nfsrpc_readdirplus(vp, uiop, &cookie, cred, td, &nfsva,
 	    &attrflag, &eof, NULL);
 	if (attrflag)
 		(void) nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0, 1);
 
 	if (!error) {
 		/*
 		 * We are now either at end of the directory or have filled the
 		 * the block.
 		 */
 		if (eof)
 			dnp->n_direofoffset = uiop->uio_offset;
 		else {
 			if (uiop->uio_resid > 0)
 				printf("EEK! readdirplusrpc resid > 0\n");
 			ncl_dircookie_lock(dnp);
 			cookiep = ncl_getcookie(dnp, uiop->uio_offset, 1);
 			*cookiep = cookie;
 			ncl_dircookie_unlock(dnp);
 		}
 	} else if (NFS_ISV4(vp)) {
 		error = nfscl_maperr(td, error, (uid_t)0, (gid_t)0);
 	}
 	return (error);
 }
 
 /*
  * Silly rename. To make the NFS filesystem that is stateless look a little
  * more like the "ufs" a remove of an active vnode is translated to a rename
  * to a funny looking filename that is removed by nfs_inactive on the
  * nfsnode. There is the potential for another process on a different client
  * to create the same funny name between the time nfs_lookitup() fails and
  * the time the rename completes, but...
  */
 static int
 nfs_sillyrename(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
 {
 	struct sillyrename *sp;
 	struct nfsnode *np;
 	unsigned int lticks;
 	short pid;
 	int error;
 
 	cache_purge(dvp);
 	np = VTONFS(vp);
 	KASSERT(vp->v_type != VDIR, ("nfs: sillyrename dir"));
 
 	/* The record keeps its own holds on the credential and directory. */
 	MALLOC(sp, struct sillyrename *, sizeof(*sp), M_NEWNFSREQ, M_WAITOK);
 	sp->s_cred = crhold(cnp->cn_cred);
 	sp->s_dvp = dvp;
 	VREF(dvp);
 
 	/*
 	 * Make up a funny name from the current tick count and the pid of
 	 * the calling process.  Keep bumping the tick value until a lookup
 	 * of the candidate name fails, i.e. the name is not in use.
 	 */
 	pid = cnp->cn_thread->td_proc->p_pid;
 	for (lticks = (unsigned int)ticks; ; lticks++) {
 		sp->s_namlen = sprintf(sp->s_name, ".nfs.%08x.%04x4.4",
 		    lticks, pid);
 		if (nfs_lookitup(dvp, sp->s_name, sp->s_namlen, sp->s_cred,
 		    cnp->cn_thread, NULL) != 0)
 			break;
 	}
 
 	error = nfs_renameit(dvp, vp, cnp, sp);
 	if (error == 0) {
 		/*
 		 * Renamed.  Look the new name up against the existing node;
 		 * the result of this lookup is not checked.  Then attach the
 		 * record to the node.
 		 */
 		(void) nfs_lookitup(dvp, sp->s_name, sp->s_namlen, sp->s_cred,
 		    cnp->cn_thread, &np);
 		np->n_sillyrename = sp;
 		return (0);
 	}
 
 	/* The rename failed: undo the holds and free the record. */
 	vrele(sp->s_dvp);
 	crfree(sp->s_cred);
 	free((caddr_t)sp, M_NEWNFSREQ);
 	return (error);
 }
 
 /*
  * Look up a file name and optionally either update the file handle or
  * allocate an nfsnode, depending on the value of npp.
  * npp == NULL	--> just do the lookup
  * *npp == NULL --> allocate a new nfsnode and make sure attributes are
  *			handled too
  * *npp != NULL --> update the file handle in the vnode
  */
 static int
 nfs_lookitup(struct vnode *dvp, char *name, int len, struct ucred *cred,
     struct thread *td, struct nfsnode **npp)
 {
 	struct vnode *newvp = NULL, *vp;
 	struct nfsnode *np, *dnp = VTONFS(dvp);
 	struct nfsfh *nfhp, *onfhp;
 	struct nfsvattr nfsva, dnfsva;
 	struct componentname cn;
 	int error = 0, attrflag, dattrflag;
 	u_int hash;
 
 	/* Do the lookup rpc and refresh the directory's attributes. */
 	error = nfsrpc_lookup(dvp, name, len, cred, td, &dnfsva, &nfsva,
 	    &nfhp, &attrflag, &dattrflag, NULL);
 	if (dattrflag)
 		(void) nfscl_loadattrcache(&dvp, &dnfsva, NULL, NULL, 0, 1);
 	/* With npp == NULL the caller only wants the lookup result. */
 	if (npp && !error) {
 		if (*npp != NULL) {
 		    /* Caller supplied a node: give it the new file handle. */
 		    np = *npp;
 		    vp = NFSTOV(np);
 		    /*
 		     * For NFSv4, check to see if it is the same name and
 		     * replace the name, if it is different.
 		     */
 		    if (np->n_v4 != NULL && nfsva.na_type == VREG &&
 			(np->n_v4->n4_namelen != len ||
 			 NFSBCMP(name, NFS4NODENAME(np->n_v4), len) ||
 			 dnp->n_fhp->nfh_len != np->n_v4->n4_fhlen ||
 			 NFSBCMP(dnp->n_fhp->nfh_fh, np->n_v4->n4_data,
 			 dnp->n_fhp->nfh_len))) {
 #ifdef notdef
 { char nnn[100]; int nnnl;
 nnnl = (len < 100) ? len : 99;
 bcopy(name, nnn, nnnl);
 nnn[nnnl] = '\0';
 printf("replace=%s\n",nnn);
 }
 #endif
 			    /*
 			     * Reallocate n_v4 sized for the new name and
 			     * record the directory file handle and name.
 			     */
 			    FREE((caddr_t)np->n_v4, M_NFSV4NODE);
 			    MALLOC(np->n_v4, struct nfsv4node *,
 				sizeof (struct nfsv4node) +
 				dnp->n_fhp->nfh_len + len - 1,
 				M_NFSV4NODE, M_WAITOK);
 			    np->n_v4->n4_fhlen = dnp->n_fhp->nfh_len;
 			    np->n_v4->n4_namelen = len;
 			    NFSBCOPY(dnp->n_fhp->nfh_fh, np->n_v4->n4_data,
 				dnp->n_fhp->nfh_len);
 			    NFSBCOPY(name, NFS4NODENAME(np->n_v4), len);
 		    }
 		    hash = fnv_32_buf(nfhp->nfh_fh, nfhp->nfh_len,
 			FNV1_32_INIT);
 		    onfhp = np->n_fhp;
 		    /*
 		     * Rehash node for new file handle.
 		     */
 		    vfs_hash_rehash(vp, hash);
 		    np->n_fhp = nfhp;
 		    /* The node now owns nfhp; release the old handle. */
 		    if (onfhp != NULL)
 			FREE((caddr_t)onfhp, M_NFSFH);
 		    newvp = NFSTOV(np);
 		} else if (NFS_CMPFH(dnp, nfhp->nfh_fh, nfhp->nfh_len)) {
 		    /*
 		     * The handle is the directory's own: return dvp with an
 		     * extra reference and drop the duplicate handle.
 		     */
 		    FREE((caddr_t)nfhp, M_NFSFH);
 		    VREF(dvp);
 		    newvp = dvp;
 		} else {
 		    /* Otherwise get a node for the handle. */
 		    cn.cn_nameptr = name;
 		    cn.cn_namelen = len;
 		    error = nfscl_nget(dvp->v_mount, dvp, nfhp, &cn, td,
 			&np, NULL, LK_EXCLUSIVE);
 		    /* NOTE(review): returned without nfscl_maperr(). */
 		    if (error)
 			return (error);
 		    newvp = NFSTOV(np);
 		}
 		/*
 		 * A newly obtained vnode for which the reply carried no
 		 * attributes is released and reported as ENOENT.
 		 */
 		if (!attrflag && *npp == NULL) {
 			if (newvp == dvp)
 				vrele(newvp);
 			else
 				vput(newvp);
 			return (ENOENT);
 		}
 		if (attrflag)
 			(void) nfscl_loadattrcache(&newvp, &nfsva, NULL, NULL,
 			    0, 1);
 	}
 	/*
 	 * When the caller asked for a new node, hand it back on success.
 	 * On error any vnode obtained above would be released here.
 	 */
 	if (npp && *npp == NULL) {
 		if (error) {
 			if (newvp) {
 				if (newvp == dvp)
 					vrele(newvp);
 				else
 					vput(newvp);
 			}
 		} else
 			*npp = np;
 	}
 	if (error && NFS_ISV4(dvp))
 		error = nfscl_maperr(td, error, (uid_t)0, (gid_t)0);
 	return (error);
 }
 
 /*
  * Commit rpc for NFS Version 3 and 4.
  *
  * When the mount has pNFS enabled and the node is flagged NDSCOMMIT,
  * the commit is first attempted through nfscl_doiods(); a failure there
  * clears NDSCOMMIT.  If that attempt was not made or failed, a regular
  * nfsrpc_commit() is done, unless the mount has no write verifier
  * (NFSSTA_HASWRITEVERF clear), in which case 0 is returned right away.
  * Attributes returned by the commit rpc are loaded into the cache and,
  * for NFSv4, a non-zero error is mapped through nfscl_maperr().
  */
 int
 ncl_commit(struct vnode *vp, u_quad_t offset, int cnt, struct ucred *cred,
    struct thread *td)
 {
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	struct nfsnode *np = VTONFS(vp);
 	struct nfsvattr nfsva;
 	struct uio uio;
 	int attrflag = 0, error = EIO, haveverf;
 
 	/* Try the pNFS path first, when it applies. */
 	if (NFSHASPNFS(nmp) && (np->n_flag & NDSCOMMIT) != 0) {
 		uio.uio_offset = offset;
 		uio.uio_resid = cnt;
 		error = nfscl_doiods(vp, &uio, NULL, NULL,
 		    NFSV4OPEN_ACCESSWRITE, 1, cred, td);
 		if (error != 0) {
 			mtx_lock(&np->n_mtx);
 			np->n_flag &= ~NDSCOMMIT;
 			mtx_unlock(&np->n_mtx);
 		}
 	}
 
 	/* Fall back to (or simply do) the ordinary commit rpc. */
 	if (error != 0) {
 		mtx_lock(&nmp->nm_mtx);
 		haveverf = (nmp->nm_state & NFSSTA_HASWRITEVERF) != 0;
 		mtx_unlock(&nmp->nm_mtx);
 		if (!haveverf)
 			return (0);
 		error = nfsrpc_commit(vp, offset, cnt, cred, td, &nfsva,
 		    &attrflag, NULL);
 	}
 
 	if (attrflag != 0)
 		(void) nfscl_loadattrcache(&vp, &nfsva, NULL, NULL,
 		    0, 1);
 	if (error != 0 && NFS_ISV4(vp))
 		error = nfscl_maperr(td, error, (uid_t)0, (gid_t)0);
 	return (error);
 }
 
 /*
  * Strategy routine.
  * An asynchronous request is first offered to ncl_asyncio(); if the
  * request is synchronous, or ncl_asyncio() returns non-zero, the i/o is
  * done here by calling ncl_doio().  Always returns 0.
  */
 static int
 nfs_strategy(struct vop_strategy_args *ap)
 {
 	struct buf *bp = ap->a_bp;
 	struct vnode *vp = ap->a_vp;
 	struct ucred *cr;
 
 	KASSERT(bp->b_vp == vp, ("missing b_getvp"));
 	KASSERT(!(bp->b_flags & B_DONE),
 	    ("nfs_strategy: buffer %p unexpectedly marked B_DONE", bp));
 	BUF_ASSERT_HELD(bp);
 
 	/* For regular files, convert the logical block to DEV_BSIZE units. */
 	if (vp->v_type == VREG && bp->b_blkno == bp->b_lblkno)
 		bp->b_blkno = bp->b_lblkno * (vp->v_bufobj.bo_bsize /
 		    DEV_BSIZE);
 
 	/* Reads use the buffer's read credential, everything else the write one. */
 	cr = (bp->b_iocmd == BIO_READ) ? bp->b_rcred : bp->b_wcred;
 
 	/* Successfully queued asynchronous request: nothing more to do. */
 	if ((bp->b_flags & B_ASYNC) != 0 &&
 	    ncl_asyncio(VFSTONFS(vp->v_mount), bp, NOCRED, curthread) == 0)
 		return (0);
 
 	(void) ncl_doio(vp, bp, cr, curthread, 1);
 	return (0);
 }
 
 /*
  * fsync vnode op.  Regular files are flushed via ncl_flush() with
  * commit == 1; every other vnode type returns 0 without doing anything.
  */
 /* ARGSUSED */
 static int
 nfs_fsync(struct vop_fsync_args *ap)
 {
 
 	if (ap->a_vp->v_type == VREG)
 		return (ncl_flush(ap->a_vp, ap->a_waitfor, ap->a_td, 1, 0));
 
 	/*
 	 * For NFS, metadata is changed synchronously on the server,
 	 * so there is nothing to flush. Also, ncl_flush() clears
 	 * the NMODIFIED flag and that shouldn't be done here for
 	 * directories.
 	 */
 	return (0);
 }
 
 /*
  * Flush all the blocks associated with a vnode.
  * 	Walk through the buffer pool and push any dirty pages
  *	associated with the vnode.
  * If the called_from_renewthread argument is TRUE, it has been called
  * from the NFSv4 renew thread and, as such, cannot block indefinitely
  * waiting for a buffer write to complete.
  *
  * waitfor == MNT_WAIT makes the routine wait for locked buffers, for
  * outstanding output on the bufobj and for n_directio_asyncwr to drain.
  * A non-zero commit enables the first ("passone") pass, in which
  * B_DELWRI | B_NEEDCOMMIT buffers are committed via ncl_commit().
  *
  * Returns 0, EINTR when newnfs_sigintr() reports an interrupt, EIO when
  * a wait fails while called from the renew thread, the saved n_error if
  * NWRITEERR was set, or EBUSY/EIO when dirty data remains after the
  * retries at the end of the routine.
  */
 int
 ncl_flush(struct vnode *vp, int waitfor, struct thread *td,
     int commit, int called_from_renewthread)
 {
 	struct nfsnode *np = VTONFS(vp);
 	struct buf *bp;
 	int i;
 	struct buf *nbp;
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	int error = 0, slptimeo = 0, slpflag = 0, retv, bvecpos;
 	int passone = 1, trycnt = 0;
 	u_quad_t off, endoff, toff;
 	struct ucred* wcred = NULL;
 	struct buf **bvec = NULL;
 	struct bufobj *bo;
 #ifndef NFS_COMMITBVECSIZ
 #define	NFS_COMMITBVECSIZ	20
 #endif
 	struct buf *bvec_on_stack[NFS_COMMITBVECSIZ];
-	int bvecsize = 0, bveccount;
+	u_int bvecsize = 0, bveccount;
 
 	/* The renew thread gets a sleep timeout of hz ticks. */
 	if (called_from_renewthread != 0)
 		slptimeo = hz;
 	if (nmp->nm_flag & NFSMNT_INT)
 		slpflag = PCATCH;
 	if (!commit)
 		passone = 0;
 	bo = &vp->v_bufobj;
 	/*
 	 * A b_flags == (B_DELWRI | B_NEEDCOMMIT) block has been written to the
 	 * server, but has not been committed to stable storage on the server
 	 * yet. On the first pass, the byte range is worked out and the commit
 	 * rpc is done. On the second pass, ncl_writebp() is called to do the
 	 * job.
 	 */
 again:
 	off = (u_quad_t)-1;
 	endoff = 0;
 	bvecpos = 0;
 	if (NFS_ISV34(vp) && commit) {
 		if (bvec != NULL && bvec != bvec_on_stack)
 			free(bvec, M_TEMP);
 		/*
 		 * Count up how many buffers waiting for a commit.
 		 */
 		bveccount = 0;
 		BO_LOCK(bo);
 		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
 			if (!BUF_ISLOCKED(bp) &&
 			    (bp->b_flags & (B_DELWRI | B_NEEDCOMMIT))
 				== (B_DELWRI | B_NEEDCOMMIT))
 				bveccount++;
 		}
 		/*
 		 * Allocate space to remember the list of bufs to commit.  It is
 		 * important to use M_NOWAIT here to avoid a race with nfs_write.
 		 * If we can't get memory (for whatever reason), we will end up
 		 * committing the buffers one-by-one in the loop below.
 		 */
 		if (bveccount > NFS_COMMITBVECSIZ) {
 			/*
 			 * Release the vnode interlock to avoid a lock
 			 * order reversal.
 			 */
 			BO_UNLOCK(bo);
 			bvec = (struct buf **)
 				malloc(bveccount * sizeof(struct buf *),
 				       M_TEMP, M_NOWAIT);
 			BO_LOCK(bo);
 			if (bvec == NULL) {
 				bvec = bvec_on_stack;
 				bvecsize = NFS_COMMITBVECSIZ;
 			} else
 				bvecsize = bveccount;
 		} else {
 			bvec = bvec_on_stack;
 			bvecsize = NFS_COMMITBVECSIZ;
 		}
 		/*
 		 * Collect up to bvecsize lockable buffers that still need a
 		 * commit, tracking the overall byte range [off, endoff).
 		 */
 		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
 			if (bvecpos >= bvecsize)
 				break;
 			if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) {
 				nbp = TAILQ_NEXT(bp, b_bobufs);
 				continue;
 			}
 			if ((bp->b_flags & (B_DELWRI | B_NEEDCOMMIT)) !=
 			    (B_DELWRI | B_NEEDCOMMIT)) {
 				BUF_UNLOCK(bp);
 				nbp = TAILQ_NEXT(bp, b_bobufs);
 				continue;
 			}
 			BO_UNLOCK(bo);
 			bremfree(bp);
 			/*
 			 * Work out if all buffers are using the same cred
 			 * so we can deal with them all with one commit.
 			 *
 			 * NOTE: we are not clearing B_DONE here, so we have
 			 * to do it later on in this routine if we intend to
 			 * initiate I/O on the bp.
 			 *
 			 * Note: to avoid loopback deadlocks, we do not
 			 * assign b_runningbufspace.
 			 */
 			if (wcred == NULL)
 				wcred = bp->b_wcred;
 			else if (wcred != bp->b_wcred)
 				wcred = NOCRED;
 			vfs_busy_pages(bp, 1);
 
 			BO_LOCK(bo);
 			/*
 			 * bp is protected by being locked, but nbp is not
 			 * and vfs_busy_pages() may sleep.  We have to
 			 * recalculate nbp.
 			 */
 			nbp = TAILQ_NEXT(bp, b_bobufs);
 
 			/*
 			 * A list of these buffers is kept so that the
 			 * second loop knows which buffers have actually
 			 * been committed. This is necessary, since there
 			 * may be a race between the commit rpc and new
 			 * uncommitted writes on the file.
 			 */
 			bvec[bvecpos++] = bp;
 			toff = ((u_quad_t)bp->b_blkno) * DEV_BSIZE +
 				bp->b_dirtyoff;
 			if (toff < off)
 				off = toff;
 			toff += (u_quad_t)(bp->b_dirtyend - bp->b_dirtyoff);
 			if (toff > endoff)
 				endoff = toff;
 		}
 		BO_UNLOCK(bo);
 	}
 	if (bvecpos > 0) {
 		/*
 		 * Commit data on the server, as required.
 		 * If all bufs are using the same wcred, then use that with
 		 * one call for all of them, otherwise commit each one
 		 * separately.
 		 */
 		if (wcred != NOCRED)
 			retv = ncl_commit(vp, off, (int)(endoff - off),
 					  wcred, td);
 		else {
 			retv = 0;
 			for (i = 0; i < bvecpos; i++) {
 				/* NB: these shadow the function-scope "off". */
 				off_t off, size;
 				bp = bvec[i];
 				off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE +
 					bp->b_dirtyoff;
 				size = (u_quad_t)(bp->b_dirtyend
 						  - bp->b_dirtyoff);
 				retv = ncl_commit(vp, off, (int)size,
 						  bp->b_wcred, td);
 				if (retv) break;
 			}
 		}
 
 		if (retv == NFSERR_STALEWRITEVERF)
 			ncl_clearcommit(vp->v_mount);
 
 		/*
 		 * Now, either mark the blocks I/O done or mark the
 		 * blocks dirty, depending on whether the commit
 		 * succeeded.
 		 */
 		for (i = 0; i < bvecpos; i++) {
 			bp = bvec[i];
 			bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
 			if (retv) {
 				/*
 				 * Error, leave B_DELWRI intact
 				 */
 				vfs_unbusy_pages(bp);
 				brelse(bp);
 			} else {
 				/*
 				 * Success, remove B_DELWRI ( bundirty() ).
 				 *
 				 * b_dirtyoff/b_dirtyend seem to be NFS
 				 * specific.  We should probably move that
 				 * into bundirty(). XXX
 				 */
 				bufobj_wref(bo);
 				bp->b_flags |= B_ASYNC;
 				bundirty(bp);
 				bp->b_flags &= ~B_DONE;
 				bp->b_ioflags &= ~BIO_ERROR;
 				bp->b_dirtyoff = bp->b_dirtyend = 0;
 				bufdone(bp);
 			}
 		}
 	}
 
 	/*
 	 * Start/do any write(s) that are required.
 	 */
 loop:
 	BO_LOCK(bo);
 	TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
 		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) {
 			/* Only wait for busy buffers on the final MNT_WAIT pass. */
 			if (waitfor != MNT_WAIT || passone)
 				continue;
 
 			error = BUF_TIMELOCK(bp,
 			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
 			    BO_LOCKPTR(bo), "nfsfsync", slpflag, slptimeo);
 			if (error == 0) {
 				BUF_UNLOCK(bp);
 				goto loop;
 			}
 			if (error == ENOLCK) {
 				error = 0;
 				goto loop;
 			}
 			if (called_from_renewthread != 0) {
 				/*
 				 * Return EIO so the flush will be retried
 				 * later.
 				 */
 				error = EIO;
 				goto done;
 			}
 			if (newnfs_sigintr(nmp, td)) {
 				error = EINTR;
 				goto done;
 			}
 			/*
 			 * Not a real interrupt: stop catching signals and
 			 * use a 2 * hz sleep timeout from now on.
 			 */
 			if (slpflag == PCATCH) {
 				slpflag = 0;
 				slptimeo = 2 * hz;
 			}
 			goto loop;
 		}
 		if ((bp->b_flags & B_DELWRI) == 0)
 			panic("nfs_fsync: not dirty");
 		if ((passone || !commit) && (bp->b_flags & B_NEEDCOMMIT)) {
 			BUF_UNLOCK(bp);
 			continue;
 		}
 		BO_UNLOCK(bo);
 		bremfree(bp);
 		/*
 		 * NOTE(review): both arms below set B_ASYNC, so the test
 		 * currently has no effect.
 		 */
 		if (passone || !commit)
 		    bp->b_flags |= B_ASYNC;
 		else
 		    bp->b_flags |= B_ASYNC;
 		bwrite(bp);
 		if (newnfs_sigintr(nmp, td)) {
 			error = EINTR;
 			goto done;
 		}
 		goto loop;
 	}
 	/* First pass finished; redo the scan as the second pass. */
 	if (passone) {
 		passone = 0;
 		BO_UNLOCK(bo);
 		goto again;
 	}
 	if (waitfor == MNT_WAIT) {
 		/* Wait for writes in progress on this bufobj to finish. */
 		while (bo->bo_numoutput) {
 			error = bufobj_wwait(bo, slpflag, slptimeo);
 			if (error) {
 			    BO_UNLOCK(bo);
 			    if (called_from_renewthread != 0) {
 				/*
 				 * Return EIO so that the flush will be
 				 * retried later.
 				 */
 				error = EIO;
 				goto done;
 			    }
 			    error = newnfs_sigintr(nmp, td);
 			    if (error)
 				goto done;
 			    if (slpflag == PCATCH) {
 				slpflag = 0;
 				slptimeo = 2 * hz;
 			    }
 			    BO_LOCK(bo);
 			}
 		}
 		/* New dirty buffers showed up while waiting; go around again. */
 		if (bo->bo_dirty.bv_cnt != 0 && commit) {
 			BO_UNLOCK(bo);
 			goto loop;
 		}
 		/*
 		 * Wait for all the async IO requests to drain
 		 */
 		BO_UNLOCK(bo);
 		mtx_lock(&np->n_mtx);
 		while (np->n_directio_asyncwr > 0) {
 			np->n_flag |= NFSYNCWAIT;
 			error = newnfs_msleep(td, &np->n_directio_asyncwr,
 			    &np->n_mtx, slpflag | (PRIBIO + 1), 
 			    "nfsfsync", 0);
 			if (error) {
 				if (newnfs_sigintr(nmp, td)) {
 					mtx_unlock(&np->n_mtx);
 					error = EINTR;	
 					goto done;
 				}
 			}
 		}
 		mtx_unlock(&np->n_mtx);
 	} else
 		BO_UNLOCK(bo);
 	if (NFSHASPNFS(nmp)) {
 		nfscl_layoutcommit(vp, td);
 		/*
 		 * Invalidate the attribute cache, since writes to a DS
 		 * won't update the size attribute.
 		 */
 		mtx_lock(&np->n_mtx);
 		np->n_attrstamp = 0;
 	} else
 		mtx_lock(&np->n_mtx);
 	/* Report (and clear) a write error recorded on the node. */
 	if (np->n_flag & NWRITEERR) {
 		error = np->n_error;
 		np->n_flag &= ~NWRITEERR;
 	}
 	/* Everything is clean and committed: the node is no longer modified. */
   	if (commit && bo->bo_dirty.bv_cnt == 0 &&
 	    bo->bo_numoutput == 0 && np->n_directio_asyncwr == 0)
   		np->n_flag &= ~NMODIFIED;
 	mtx_unlock(&np->n_mtx);
 done:
 	if (bvec != NULL && bvec != bvec_on_stack)
 		free(bvec, M_TEMP);
 	/*
 	 * A successful synchronous commit flush that still leaves dirty
 	 * or in-flight data is retried from the top, up to 5 times,
 	 * before giving up with EBUSY (EIO for the renew thread).
 	 */
 	if (error == 0 && commit != 0 && waitfor == MNT_WAIT &&
 	    (bo->bo_dirty.bv_cnt != 0 || bo->bo_numoutput != 0 ||
 	    np->n_directio_asyncwr != 0)) {
 		if (trycnt++ < 5) {
 			/* try, try again... */
 			passone = 1;
 			wcred = NULL;
 			bvec = NULL;
 			bvecsize = 0;
 			goto again;
 		}
 		vn_printf(vp, "ncl_flush failed");
 		error = called_from_renewthread != 0 ? EIO : EBUSY;
 	}
 	return (error);
 }
 
 /*
  * NFS advisory byte-level locks.
  *
  * For NFSv4 with F_POSIX or F_FLOCK set, the lock operation is done with
  * nfsrpc_advlock(), retrying while a blocking F_SETLK is denied.  For
  * NFSv2/3 the request is handled locally via lf_advlock() when the mount
  * has NFSMNT_NOLOCKD set, and otherwise handed to the nfs_advlock_p hook
  * (ENOLCK if no hook is registered).  Any other combination returns
  * EOPNOTSUPP.
  */
 static int
 nfs_advlock(struct vop_advlock_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct ucred *cred;
 	struct nfsnode *np = VTONFS(ap->a_vp);
 	struct proc *p = (struct proc *)ap->a_id;
 	struct thread *td = curthread;	/* XXX */
 	struct vattr va;
 	int ret, error = EOPNOTSUPP;
 	u_quad_t size;
 	
 	if (NFS_ISV4(vp) && (ap->a_flags & (F_POSIX | F_FLOCK)) != 0) {
 		if (vp->v_type != VREG)
 			return (EINVAL);
 		/* POSIX locks use the process credential, flock the thread's. */
 		if ((ap->a_flags & F_POSIX) != 0)
 			cred = p->p_ucred;
 		else
 			cred = td->td_ucred;
 		NFSVOPLOCK(vp, LK_EXCLUSIVE | LK_RETRY);
 		if (vp->v_iflag & VI_DOOMED) {
 			NFSVOPUNLOCK(vp, 0);
 			return (EBADF);
 		}
 
 		/*
 		 * If this is unlocking a write locked region, flush and
 		 * commit them before unlocking. This is required by
 		 * RFC3530 Sec. 9.3.2.
 		 */
 		if (ap->a_op == F_UNLCK &&
 		    nfscl_checkwritelocked(vp, ap->a_fl, cred, td, ap->a_id,
 		    ap->a_flags))
 			(void) ncl_flush(vp, MNT_WAIT, td, 1, 0);
 
 		/*
 		 * Loop around doing the lock op, while a blocking lock
 		 * must wait for the lock op to succeed.
 		 */
 		do {
 			ret = nfsrpc_advlock(vp, np->n_size, ap->a_op,
 			    ap->a_fl, 0, cred, td, ap->a_id, ap->a_flags);
 			if (ret == NFSERR_DENIED && (ap->a_flags & F_WAIT) &&
 			    ap->a_op == F_SETLK) {
 				/*
 				 * Drop the vnode lock while napping, then
 				 * retake it and re-check for a doomed vnode.
 				 */
 				NFSVOPUNLOCK(vp, 0);
 				error = nfs_catnap(PZERO | PCATCH, ret,
 				    "ncladvl");
 				if (error)
 					return (EINTR);
 				NFSVOPLOCK(vp, LK_EXCLUSIVE | LK_RETRY);
 				if (vp->v_iflag & VI_DOOMED) {
 					NFSVOPUNLOCK(vp, 0);
 					return (EBADF);
 				}
 			}
 		} while (ret == NFSERR_DENIED && (ap->a_flags & F_WAIT) &&
 		     ap->a_op == F_SETLK);
 		/*
 		 * Translate the rpc result: denied -> EAGAIN, a few errnos
 		 * are passed through, anything else becomes EACCES.
 		 */
 		if (ret == NFSERR_DENIED) {
 			NFSVOPUNLOCK(vp, 0);
 			return (EAGAIN);
 		} else if (ret == EINVAL || ret == EBADF || ret == EINTR) {
 			NFSVOPUNLOCK(vp, 0);
 			return (ret);
 		} else if (ret != 0) {
 			NFSVOPUNLOCK(vp, 0);
 			return (EACCES);
 		}
 
 		/*
 		 * Now, if we just got a lock, invalidate data in the buffer
 		 * cache, as required, so that the coherency conforms with
 		 * RFC3530 Sec. 9.3.2.
 		 */
 		if (ap->a_op == F_SETLK) {
 			/* ret is 0 here, so it only reflects VOP_GETATTR below. */
 			if ((np->n_flag & NMODIFIED) == 0) {
 				np->n_attrstamp = 0;
 				KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
 				ret = VOP_GETATTR(vp, &va, cred);
 			}
 			if ((np->n_flag & NMODIFIED) || ret ||
 			    np->n_change != va.va_filerev) {
 				(void) ncl_vinvalbuf(vp, V_SAVE, td, 1);
 				np->n_attrstamp = 0;
 				KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
 				ret = VOP_GETATTR(vp, &va, cred);
 				if (!ret) {
 					np->n_mtime = va.va_mtime;
 					np->n_change = va.va_filerev;
 				}
 			}
 			/* Mark that a file lock has been acquired. */
 			mtx_lock(&np->n_mtx);
 			np->n_flag |= NHASBEENLOCKED;
 			mtx_unlock(&np->n_mtx);
 		}
 		NFSVOPUNLOCK(vp, 0);
 		return (0);
 	} else if (!NFS_ISV4(vp)) {
 		error = NFSVOPLOCK(vp, LK_SHARED);
 		if (error)
 			return (error);
 		if ((VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NOLOCKD) != 0) {
 			/* Sample the size under the vnode lock, then lock locally. */
 			size = VTONFS(vp)->n_size;
 			NFSVOPUNLOCK(vp, 0);
 			error = lf_advlock(ap, &(vp->v_lockf), size);
 		} else {
 			/*
 			 * NOTE(review): the vnode is not unlocked here after
 			 * calling nfs_advlock_p(), so the hook presumably
 			 * releases it itself -- confirm against the hook.
 			 */
 			if (nfs_advlock_p != NULL)
 				error = nfs_advlock_p(ap);
 			else {
 				NFSVOPUNLOCK(vp, 0);
 				error = ENOLCK;
 			}
 		}
 		if (error == 0 && ap->a_op == F_SETLK) {
 			error = NFSVOPLOCK(vp, LK_SHARED);
 			if (error == 0) {
 				/* Mark that a file lock has been acquired. */
 				mtx_lock(&np->n_mtx);
 				np->n_flag |= NHASBEENLOCKED;
 				mtx_unlock(&np->n_mtx);
 				NFSVOPUNLOCK(vp, 0);
 			}
 		}
 	}
 	return (error);
 }
 
 /*
  * Asynchronous variant of the NFS advisory byte-level lock op.
  * Only supported for non-NFSv4 mounts that have NFSMNT_NOLOCKD set, in
  * which case the request is passed to lf_advlockasync(); every other
  * case returns EOPNOTSUPP.
  */
 static int
 nfs_advlockasync(struct vop_advlockasync_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	u_quad_t size;
 	int error;
 
 	if (NFS_ISV4(vp))
 		return (EOPNOTSUPP);
 
 	error = NFSVOPLOCK(vp, LK_SHARED);
 	if (error)
 		return (error);
 
 	if ((VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NOLOCKD) == 0) {
 		NFSVOPUNLOCK(vp, 0);
 		return (EOPNOTSUPP);
 	}
 
 	/* Read the file size while the vnode is still locked. */
 	size = VTONFS(vp)->n_size;
 	NFSVOPUNLOCK(vp, 0);
 	error = lf_advlockasync(ap, &(vp->v_lockf), size);
 	return (error);
 }
 
 /*
  * Print out the contents of an nfsnode: its file id (unsigned decimal)
  * and fsid (hex), followed by fifo details when the vnode is a VFIFO.
  * Always returns 0.
  */
 static int
 nfs_print(struct vop_print_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct nfsnode *np = VTONFS(vp);
 
 	/*
 	 * Both values are cast to uintmax_t, so they must be printed with
 	 * unsigned conversions; a signed %jd would show large file ids as
 	 * negative numbers.
 	 */
 	printf("\tfileid %ju fsid 0x%jx", (uintmax_t)np->n_vattr.na_fileid,
 	    (uintmax_t)np->n_vattr.na_fsid);
 	if (vp->v_type == VFIFO)
 		fifo_printinfo(vp);
 	printf("\n");
 	return (0);
 }
 
 /*
  * This is the "real" nfs::bwrite(struct buf*).
  * B_CACHE is set on the buffer before the write is started.  A buffer
  * marked B_INVAL is simply released.  If the buffer was B_ASYNC on
  * entry, 0 is returned as soon as the i/o has been started; otherwise
  * the result of bufwait() is returned after the buffer is released.
  */
 int
 ncl_writebp(struct buf *bp, int force __unused, struct thread *td)
 {
 	int error, saved_flags;
 
 	BUF_ASSERT_HELD(bp);
 
 	if ((bp->b_flags & B_INVAL) != 0) {
 		brelse(bp);
 		return (0);
 	}
 
 	/* Remember the flags as they were on entry. */
 	saved_flags = bp->b_flags;
 	bp->b_flags |= B_CACHE;
 
 	/*
 	 * Mark the buffer clean and set it up as a fresh write; it is
 	 * redirtied later if the I/O fails.
 	 */
 	bundirty(bp);
 	bp->b_flags &= ~B_DONE;
 	bp->b_ioflags &= ~BIO_ERROR;
 	bp->b_iocmd = BIO_WRITE;
 
 	bufobj_wref(bp->b_bufobj);
 	curthread->td_ru.ru_oublock++;
 
 	/*
 	 * b_runningbufspace is deliberately not assigned, to avoid
 	 * loopback deadlocks.
 	 */
 	vfs_busy_pages(bp, 1);
 
 	BUF_KERNPROC(bp);
 	bp->b_iooffset = dbtob(bp->b_blkno);
 	bstrategy(bp);
 
 	/* Asynchronous on entry: do not wait for completion. */
 	if ((saved_flags & B_ASYNC) != 0)
 		return (0);
 
 	error = bufwait(bp);
 	if ((saved_flags & B_DELWRI) != 0)
 		reassignbuf(bp);
 	brelse(bp);
 	return (error);
 }
 
 /*
  * nfs special file access vnode op.
  * Essentially just get vattr and then imitate iaccess() since the device is
  * local to the client.
  */
 static int
 nfsspec_access(struct vop_access_args *ap)
 {
 	struct vattr *vap;
 	struct ucred *cred = ap->a_cred;
 	struct vnode *vp = ap->a_vp;
 	accmode_t accmode = ap->a_accmode;
 	struct vattr vattr;
 	int error;
 
 	/*
 	 * Disallow write attempts on filesystems mounted read-only;
 	 * unless the file is a socket, fifo, or a block or character
 	 * device resident on the filesystem.
 	 */
 	if ((accmode & VWRITE) && (vp->v_mount->mnt_flag & MNT_RDONLY)) {
 		switch (vp->v_type) {
 		case VREG:
 		case VDIR:
 		case VLNK:
 			return (EROFS);
 		default:
 			break;
 		}
 	}
 	vap = &vattr;
 	error = VOP_GETATTR(vp, vap, cred);
 	if (error)
 		goto out;
 	error  = vaccess(vp->v_type, vap->va_mode, vap->va_uid, vap->va_gid,
 	    accmode, cred, NULL);
 out:
 	return error;
 }
 
 /*
  * Read wrapper for fifos: flag the node as accessed (NACC), stamp
  * n_atim, then hand the request to the fifo read op.
  */
 static int
 nfsfifo_read(struct vop_read_args *ap)
 {
 	struct nfsnode *np = VTONFS(ap->a_vp);
 
 	/* Record the access under the node mutex. */
 	mtx_lock(&np->n_mtx);
 	np->n_flag |= NACC;
 	vfs_timestamp(&np->n_atim);
 	mtx_unlock(&np->n_mtx);
 
 	return (fifo_specops.vop_read(ap));
 }
 
 /*
  * Write wrapper for fifos.
  */
 static int
 nfsfifo_write(struct vop_write_args *ap)
 {
 	struct nfsnode *np = VTONFS(ap->a_vp);
 
 	/*
 	 * Set update flag.
 	 */
 	mtx_lock(&np->n_mtx);
 	np->n_flag |= NUPD;
 	vfs_timestamp(&np->n_mtim);
 	mtx_unlock(&np->n_mtx);
 	return(fifo_specops.vop_write(ap));
 }
 
 /*
  * Close wrapper for fifos.
  *
  * If the node was accessed or updated, refresh the matching times on
  * the nfsnode and set NCHG; when this is the only reference and the
  * mount is writable, also push the times with VOP_SETATTR().  The fifo
  * close op is called in every case and its result returned.
  */
 static int
 nfsfifo_close(struct vop_close_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct nfsnode *np = VTONFS(vp);
 	struct vattr vattr;
 	struct timespec ts;
 
 	mtx_lock(&np->n_mtx);
 	if ((np->n_flag & (NACC | NUPD)) == 0) {
 		/* Nothing was touched; no times to update. */
 		mtx_unlock(&np->n_mtx);
 		return (fifo_specops.vop_close(ap));
 	}
 
 	vfs_timestamp(&ts);
 	if (np->n_flag & NACC)
 		np->n_atim = ts;
 	if (np->n_flag & NUPD)
 		np->n_mtim = ts;
 	np->n_flag |= NCHG;
 
 	if (vrefcnt(vp) != 1 ||
 	    (vp->v_mount->mnt_flag & MNT_RDONLY) != 0) {
 		mtx_unlock(&np->n_mtx);
 		return (fifo_specops.vop_close(ap));
 	}
 
 	/* Build the attribute update before dropping the mutex. */
 	VATTR_NULL(&vattr);
 	if (np->n_flag & NACC)
 		vattr.va_atime = np->n_atim;
 	if (np->n_flag & NUPD)
 		vattr.va_mtime = np->n_mtim;
 	mtx_unlock(&np->n_mtx);
 	(void)VOP_SETATTR(vp, &vattr, ap->a_cred);
 	return (fifo_specops.vop_close(ap));
 }
 
 /*
  * Just call ncl_writebp() with the force argument set to 1.
  *
  * NOTE: B_DONE may or may not be set in a_bp on call.
  */
 static int
 nfs_bwrite(struct buf *bp)
 {
 
 	return (ncl_writebp(bp, 1, curthread));
 }
 
 /*
  * Buffer operations for NFS buffers: only the write op is NFS specific
  * (nfs_bwrite); the rest are the generic buf routines.
  */
 struct buf_ops buf_ops_newnfs = {
 	.bop_name = "buf_ops_nfs",
 	.bop_write = nfs_bwrite,
 	.bop_strategy = bufstrategy,
 	.bop_sync = bufsync,
 	.bop_bdflush = bufbdflush,
 };
 
 static int
 nfs_getacl(struct vop_getacl_args *ap)
 {
 	int error;
 
 	if (ap->a_type != ACL_TYPE_NFS4)
 		return (EOPNOTSUPP);
 	error = nfsrpc_getacl(ap->a_vp, ap->a_cred, ap->a_td, ap->a_aclp,
 	    NULL);
 	if (error > NFSERR_STALE) {
 		(void) nfscl_maperr(ap->a_td, error, (uid_t)0, (gid_t)0);
 		error = EPERM;
 	}
 	return (error);
 }
 
 static int
 nfs_setacl(struct vop_setacl_args *ap)
 {
 	int error;
 
 	if (ap->a_type != ACL_TYPE_NFS4)
 		return (EOPNOTSUPP);
 	error = nfsrpc_setacl(ap->a_vp, ap->a_cred, ap->a_td, ap->a_aclp,
 	    NULL);
 	if (error > NFSERR_STALE) {
 		(void) nfscl_maperr(ap->a_td, error, (uid_t)0, (gid_t)0);
 		error = EPERM;
 	}
 	return (error);
 }
 
 /*
  * Mark a vnode as an executing text file (VV_TEXT) after pushing all
  * of its dirty data to the server and syncing n_mtime with the cached
  * attributes.  Always returns 0.
  */
 static int
 nfs_set_text(struct vop_set_text_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct nfsnode *np = VTONFS(vp);
 
 	/*
 	 * The modify time of the file must not change while the text
 	 * file is being executed, or the process executing it will be
 	 * terminated.  So: first clean any pages dirtied through mmap
 	 * into the buffer cache...
 	 */
 	if (vp->v_object != NULL) {
 		VM_OBJECT_WLOCK(vp->v_object);
 		vm_object_page_clean(vp->v_object, 0, 0, OBJPC_SYNC);
 		VM_OBJECT_WUNLOCK(vp->v_object);
 	}
 
 	/* ...then write the buffer cache out to the NFS server... */
 	ncl_flush(vp, MNT_WAIT, curthread, 0, 0);
 
 	/* ...and bring n_mtime up to date with the cached attributes. */
 	mtx_lock(&np->n_mtx);
 	np->n_mtime = np->n_vattr.na_mtime;
 	mtx_unlock(&np->n_mtx);
 
 	vp->v_vflag |= VV_TEXT;
 	return (0);
 }
 
 /*
  * Return POSIX pathconf information applicable to nfs filesystems.
  */
 static int
 nfs_pathconf(struct vop_pathconf_args *ap)
 {
 	struct nfsv3_pathconf pc;
 	struct nfsvattr nfsva;
 	struct vnode *vp = ap->a_vp;
 	struct thread *td = curthread;
 	int attrflag, error;
 
 	if ((NFS_ISV34(vp) && (ap->a_name == _PC_LINK_MAX ||
 	    ap->a_name == _PC_NAME_MAX || ap->a_name == _PC_CHOWN_RESTRICTED ||
 	    ap->a_name == _PC_NO_TRUNC)) ||
 	    (NFS_ISV4(vp) && ap->a_name == _PC_ACL_NFS4)) {
 		/*
 		 * Since only the above 4 a_names are returned by the NFSv3
 		 * Pathconf RPC, there is no point in doing it for others.
 		 * For NFSv4, the Pathconf RPC (actually a Getattr Op.) can
 		 * be used for _PC_NFS4_ACL as well.
 		 */
 		error = nfsrpc_pathconf(vp, &pc, td->td_ucred, td, &nfsva,
 		    &attrflag, NULL);
 		if (attrflag != 0)
 			(void) nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0,
 			    1);
 		if (error != 0)
 			return (error);
 	} else {
 		/*
 		 * For NFSv2 (or NFSv3 when not one of the above 4 a_names),
 		 * just fake them.
 		 */
 		pc.pc_linkmax = NFS_LINK_MAX;
 		pc.pc_namemax = NFS_MAXNAMLEN;
 		pc.pc_notrunc = 1;
 		pc.pc_chownrestricted = 1;
 		pc.pc_caseinsensitive = 0;
 		pc.pc_casepreserving = 1;
 		error = 0;
 	}
 	switch (ap->a_name) {
 	case _PC_LINK_MAX:
 #ifdef _LP64
 		*ap->a_retval = pc.pc_linkmax;
 #else
 		*ap->a_retval = MIN(LONG_MAX, pc.pc_linkmax);
 #endif
 		break;
 	case _PC_NAME_MAX:
 		*ap->a_retval = pc.pc_namemax;
 		break;
 	case _PC_PIPE_BUF:
 		if (ap->a_vp->v_type == VDIR || ap->a_vp->v_type == VFIFO)
 			*ap->a_retval = PIPE_BUF;
 		else
 			error = EINVAL;
 		break;
 	case _PC_CHOWN_RESTRICTED:
 		*ap->a_retval = pc.pc_chownrestricted;
 		break;
 	case _PC_NO_TRUNC:
 		*ap->a_retval = pc.pc_notrunc;
 		break;
 	case _PC_ACL_EXTENDED:
 		*ap->a_retval = 0;
 		break;
 	case _PC_ACL_NFS4:
 		if (NFS_ISV4(vp) && nfsrv_useacl != 0 && attrflag != 0 &&
 		    NFSISSET_ATTRBIT(&nfsva.na_suppattr, NFSATTRBIT_ACL))
 			*ap->a_retval = 1;
 		else
 			*ap->a_retval = 0;
 		break;
 	case _PC_ACL_PATH_MAX:
 		if (NFS_ISV4(vp))
 			*ap->a_retval = ACL_MAX_ENTRIES;
 		else
 			*ap->a_retval = 3;
 		break;
 	case _PC_MAC_PRESENT:
 		*ap->a_retval = 0;
 		break;
 	case _PC_PRIO_IO:
 		*ap->a_retval = 0;
 		break;
 	case _PC_SYNC_IO:
 		*ap->a_retval = 0;
 		break;
 	case _PC_ALLOC_SIZE_MIN:
 		*ap->a_retval = vp->v_mount->mnt_stat.f_bsize;
 		break;
 	case _PC_FILESIZEBITS:
 		if (NFS_ISV34(vp))
 			*ap->a_retval = 64;
 		else
 			*ap->a_retval = 32;
 		break;
 	case _PC_REC_INCR_XFER_SIZE:
 		*ap->a_retval = vp->v_mount->mnt_stat.f_iosize;
 		break;
 	case _PC_REC_MAX_XFER_SIZE:
 		*ap->a_retval = -1; /* means ``unlimited'' */
 		break;
 	case _PC_REC_MIN_XFER_SIZE:
 		*ap->a_retval = vp->v_mount->mnt_stat.f_iosize;
 		break;
 	case _PC_REC_XFER_ALIGN:
 		*ap->a_retval = PAGE_SIZE;
 		break;
 	case _PC_SYMLINK_MAX:
 		*ap->a_retval = NFS_MAXPATHLEN;
 		break;
 
 	default:
 		error = vop_stdpathconf(ap);
 		break;
 	}
 	return (error);
 }
 
Index: head/sys/kern/kern_exec.c
===================================================================
--- head/sys/kern/kern_exec.c	(revision 328237)
+++ head/sys/kern/kern_exec.c	(revision 328238)
@@ -1,1742 +1,1742 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 1993, David Greenman
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_capsicum.h"
 #include "opt_compat.h"
 #include "opt_hwpmc_hooks.h"
 #include "opt_ktrace.h"
 #include "opt_vm.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/acct.h>
 #include <sys/capsicum.h>
 #include <sys/eventhandler.h>
 #include <sys/exec.h>
 #include <sys/fcntl.h>
 #include <sys/filedesc.h>
 #include <sys/imgact.h>
 #include <sys/imgact_elf.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mman.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/pioctl.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/ptrace.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
 #include <sys/sched.h>
 #include <sys/sdt.h>
 #include <sys/sf_buf.h>
 #include <sys/shm.h>
 #include <sys/signalvar.h>
 #include <sys/smp.h>
 #include <sys/stat.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/sysproto.h>
 #include <sys/vnode.h>
 #include <sys/wait.h>
 #ifdef KTRACE
 #include <sys/ktrace.h>
 #endif
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_object.h>
 #include <vm/vm_pager.h>
 
 #ifdef	HWPMC_HOOKS
 #include <sys/pmckern.h>
 #endif
 
 #include <machine/reg.h>
 
 #include <security/audit/audit.h>
 #include <security/mac/mac_framework.h>
 
 #ifdef KDTRACE_HOOKS
 #include <sys/dtrace_bsd.h>
 dtrace_execexit_func_t	dtrace_fasttrap_exec;
 #endif
 
 /*
  * Static tracing probes in the "proc" provider that do_execve() fires:
  * "exec" with the path being executed, "exec__failure" with the error
  * code, and "exec__success" with the path once the exec has gone through.
  */
 SDT_PROVIDER_DECLARE(proc);
 SDT_PROBE_DEFINE1(proc, , , exec, "char *");
 SDT_PROBE_DEFINE1(proc, , , exec__failure, "int");
 SDT_PROBE_DEFINE1(proc, , , exec__success, "char *");
 
 /* Malloc type labelled "proc-args"; presumably backs struct pargs. */
 MALLOC_DEFINE(M_PARGS, "proc-args", "Process arguments");
 
 /* kern.coredump_pack_fileinfo: read/write tunable, enabled by default. */
 int coredump_pack_fileinfo = 1;
 SYSCTL_INT(_kern, OID_AUTO, coredump_pack_fileinfo, CTLFLAG_RWTUN,
     &coredump_pack_fileinfo, 0,
     "Enable file path packing in 'procstat -f' coredump notes");
 
 /* kern.coredump_pack_vmmapinfo: read/write tunable, enabled by default. */
 int coredump_pack_vmmapinfo = 1;
 SYSCTL_INT(_kern, OID_AUTO, coredump_pack_vmmapinfo, CTLFLAG_RWTUN,
     &coredump_pack_vmmapinfo, 0,
     "Enable file path packing in 'procstat -v' coredump notes");
 
 /* Forward declarations for the sysctl handlers and the exec worker. */
 static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS);
 static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS);
 static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS);
 static int do_execve(struct thread *td, struct image_args *args,
     struct mac *mac_p);
 
 /* XXX This should be vm_size_t. */
 SYSCTL_PROC(_kern, KERN_PS_STRINGS, ps_strings, CTLTYPE_ULONG|CTLFLAG_RD|
     CTLFLAG_MPSAFE, NULL, 0, sysctl_kern_ps_strings, "LU", "");
 
 /* XXX This should be vm_size_t. */
 SYSCTL_PROC(_kern, KERN_USRSTACK, usrstack, CTLTYPE_ULONG|CTLFLAG_RD|
     CTLFLAG_CAPRD|CTLFLAG_MPSAFE, NULL, 0, sysctl_kern_usrstack, "LU", "");
 
 /* kern.stackprot: read-only, served by sysctl_kern_stackprot(). */
 SYSCTL_PROC(_kern, OID_AUTO, stackprot, CTLTYPE_INT|CTLFLAG_RD|CTLFLAG_MPSAFE,
     NULL, 0, sysctl_kern_stackprot, "I", "");
 
 /*
  * Upper bound, in bytes, on the argument strings plus the struct pargs
  * header that do_execve() will cache for a process; larger argument
  * vectors are simply not cached.
  */
 u_long ps_arg_cache_limit = PAGE_SIZE / 16;
 SYSCTL_ULONG(_kern, OID_AUTO, ps_arg_cache_limit, CTLFLAG_RW, 
     &ps_arg_cache_limit, 0, "");
 
 /*
  * When non-zero, do_execve() fails with ENOEXEC for images whose osrel
  * major version is higher than that of the running kernel.
  */
 static int disallow_high_osrel;
 SYSCTL_INT(_kern, OID_AUTO, disallow_high_osrel, CTLFLAG_RW,
     &disallow_high_osrel, 0,
     "Disallow execution of binaries built for higher version of the world");
 
 /*
  * When zero, exec_new_vmspace() raises the minimum user address to at
  * least PAGE_SIZE so that nothing can be mapped at address 0.
  */
 static int map_at_zero = 0;
 SYSCTL_INT(_security_bsd, OID_AUTO, map_at_zero, CTLFLAG_RWTUN, &map_at_zero, 0,
     "Permit processes to map an object at virtual address 0.");
 
 /* Handler list invoked directly from exec_new_vmspace(). */
 EVENTHANDLER_LIST_DECLARE(process_exec);
 
 static int
 sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS)
 {
 	struct proc *p;
 	int error;
 
 	p = curproc;
 #ifdef SCTL_MASK32
 	if (req->flags & SCTL_MASK32) {
 		unsigned int val;
 		val = (unsigned int)p->p_sysent->sv_psstrings;
 		error = SYSCTL_OUT(req, &val, sizeof(val));
 	} else
 #endif
 		error = SYSCTL_OUT(req, &p->p_sysent->sv_psstrings,
 		   sizeof(p->p_sysent->sv_psstrings));
 	return error;
 }
 
 static int
 sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS)
 {
 	struct proc *p;
 	int error;
 
 	p = curproc;
 #ifdef SCTL_MASK32
 	if (req->flags & SCTL_MASK32) {
 		unsigned int val;
 		val = (unsigned int)p->p_sysent->sv_usrstack;
 		error = SYSCTL_OUT(req, &val, sizeof(val));
 	} else
 #endif
 		error = SYSCTL_OUT(req, &p->p_sysent->sv_usrstack,
 		    sizeof(p->p_sysent->sv_usrstack));
 	return error;
 }
 
 static int
 sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS)
 {
 	struct proc *p;
 
 	p = curproc;
 	return (SYSCTL_OUT(req, &p->p_sysent->sv_stackprot,
 	    sizeof(p->p_sysent->sv_stackprot)));
 }
 
 /*
  * Table of image activators consulted by do_execve().  Each of the items
  * is a pointer to a `const struct execsw', hence the double pointer here.
  * do_execve() walks the array until it reaches a NULL entry.
  */
 static const struct execsw **execsw;
 
 /*
  * Argument layout for sys_execve().  Only compiled when sysproto.h has not
  * already supplied the definition.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct execve_args {
 	/* path of the image; copied in from user space by sys_execve() */
 	char    *fname; 
 	/* argument vector passed to exec_copyin_args() */
 	char    **argv;
 	/* environment vector passed to exec_copyin_args() */
 	char    **envv; 
 };
 #endif
 
 int
 sys_execve(struct thread *td, struct execve_args *uap)
 {
 	struct image_args args;
 	struct vmspace *oldvmspace;
 	int error;
 
 	error = pre_execve(td, &oldvmspace);
 	if (error != 0)
 		return (error);
 	error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE,
 	    uap->argv, uap->envv);
 	if (error == 0)
 		error = kern_execve(td, &args, NULL);
 	post_execve(td, error, oldvmspace);
 	return (error);
 }
 
 /*
  * Argument layout for sys_fexecve().  Only compiled when sysproto.h has not
  * already supplied the definition.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fexecve_args {
 	int	fd;		/* descriptor referring to the image */
 	char	**argv;		/* argument vector */
 	char	**envv;		/* environment vector */
 };
 #endif
 int
 sys_fexecve(struct thread *td, struct fexecve_args *uap)
 {
 	struct image_args args;
 	struct vmspace *oldvmspace;
 	int error;
 
 	error = pre_execve(td, &oldvmspace);
 	if (error != 0)
 		return (error);
 	error = exec_copyin_args(&args, NULL, UIO_SYSSPACE,
 	    uap->argv, uap->envv);
 	if (error == 0) {
 		args.fd = uap->fd;
 		error = kern_execve(td, &args, NULL);
 	}
 	post_execve(td, error, oldvmspace);
 	return (error);
 }
 
 /*
  * Argument layout for sys___mac_execve().  Only compiled when sysproto.h
  * has not already supplied the definition.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct __mac_execve_args {
 	/* path of the image; copied in from user space */
 	char	*fname;
 	/* argument vector */
 	char	**argv;
 	/* environment vector */
 	char	**envv;
 	/* MAC label forwarded unchanged to kern_execve() */
 	struct mac	*mac_p;
 };
 #endif
 
 /*
  * __mac_execve() system call entry point: execve() with a caller-supplied
  * MAC label.
  *
  * With MAC compiled in this mirrors sys_execve() but forwards uap->mac_p
  * to kern_execve().  Without MAC the call is not implemented and ENOSYS is
  * returned.
  */
 int
 sys___mac_execve(struct thread *td, struct __mac_execve_args *uap)
 {
 #ifdef MAC
 	struct vmspace *saved_vmspace;
 	struct image_args iargs;
 	int rv;
 
 	if ((rv = pre_execve(td, &saved_vmspace)) != 0)
 		return (rv);
 
 	rv = exec_copyin_args(&iargs, uap->fname, UIO_USERSPACE, uap->argv,
 	    uap->envv);
 	if (rv == 0)
 		rv = kern_execve(td, &iargs, uap->mac_p);
 
 	post_execve(td, rv, saved_vmspace);
 	return (rv);
 #else
 	return (ENOSYS);
 #endif
 }
 
 /*
  * Prepare the calling thread's process for an exec.
  *
  * If the process has ever had more than one thread (P_HADTHREADS), ask the
  * other threads to stop via thread_single(SINGLE_BOUNDARY); if that fails
  * the result is ERESTART.  The process's current vmspace pointer is stored
  * through oldvmspace in every case so the caller can pass it on to
  * post_execve().
  *
  * Returns 0 on success or ERESTART.
  */
 int
 pre_execve(struct thread *td, struct vmspace **oldvmspace)
 {
 	struct proc *proc;
 	int rv;
 
 	KASSERT(td == curthread, ("non-current thread %p", td));
 	proc = td->td_proc;
 	rv = 0;
 
 	if ((proc->p_flag & P_HADTHREADS) != 0) {
 		PROC_LOCK(proc);
 		rv = (thread_single(proc, SINGLE_BOUNDARY) != 0) ?
 		    ERESTART : 0;
 		PROC_UNLOCK(proc);
 	}
 
 	/* A pending TDP_EXECVMSPC here would mean an exec inside an exec. */
 	KASSERT(rv != 0 || (td->td_pflags & TDP_EXECVMSPC) == 0,
 	    ("nested execve"));
 
 	*oldvmspace = proc->p_vmspace;
 	return (rv);
 }
 
 void
 post_execve(struct thread *td, int error, struct vmspace *oldvmspace)
 {
 	struct proc *p;
 
 	KASSERT(td == curthread, ("non-current thread %p", td));
 	p = td->td_proc;
 	if ((p->p_flag & P_HADTHREADS) != 0) {
 		PROC_LOCK(p);
 		/*
 		 * If success, we upgrade to SINGLE_EXIT state to
 		 * force other threads to suicide.
 		 */
 		if (error == EJUSTRETURN)
 			thread_single(p, SINGLE_EXIT);
 		else
 			thread_single_end(p, SINGLE_BOUNDARY);
 		PROC_UNLOCK(p);
 	}
 	if ((td->td_pflags & TDP_EXECVMSPC) != 0) {
 		KASSERT(p->p_vmspace != oldvmspace,
 		    ("oldvmspace still used"));
 		vmspace_free(oldvmspace);
 		td->td_pflags &= ~TDP_EXECVMSPC;
 	}
 }
 
 /*
  * Record the argument and environment strings for audit, then run the
  * exec through do_execve() and return its result.
  *
  * The lengths passed to the audit macros are byte counts derived from the
  * packed string buffer: the argument strings span begin_argv..begin_envv
  * and the environment strings span begin_envv..endp.
  *
  * XXX: kern_execve has the astonishing property of not always returning to
  * the caller.  If sufficiently bad things happen during the call to
  * do_execve(), it can end up calling exit1(); as a result, callers must
  * avoid doing anything which they might need to undo (e.g., allocating
  * memory).
  */
 int
 kern_execve(struct thread *td, struct image_args *args, struct mac *mac_p)
 {
 
 	AUDIT_ARG_ARGV(args->begin_argv, args->argc,
 	    args->begin_envv - args->begin_argv);
 	AUDIT_ARG_ENVV(args->begin_envv, args->envc,
 	    args->endp - args->begin_envv);
 	return (do_execve(td, args, mac_p));
 }
 
 /*
  * In-kernel implementation of execve().
  *
  * Resolves the image named by args->fname (or referenced by args->fd when
  * fname is NULL), runs the image activators on it and, on success,
  * installs the new arguments, credentials, text vnode and register state
  * in the calling process.
  *
  * td    - calling thread; its process is the one being replaced.
  * args  - file name/descriptor plus argument and environment strings;
  *         released with exec_free_args() before returning on every path.
  * mac_p - MAC label handed to mac_execve_enter(); only used under MAC.
  *
  * Returns EJUSTRETURN on success and an errno value on failure.  Does not
  * return at all if an error occurs after the old address space has been
  * destroyed (imgp->vmspace_destroyed); exit1() is called instead.
  *
  * NOTE(review): the historical comment here said that all arguments are
  * assumed to be userspace pointers from the passed thread, yet the path
  * lookups below use UIO_SYSSPACE -- confirm and reword.
  */
 static int
 do_execve(struct thread *td, struct image_args *args, struct mac *mac_p)
 {
 	struct proc *p = td->td_proc;
 	struct nameidata nd;
 	struct ucred *oldcred;
 	struct uidinfo *euip = NULL;
 	register_t *stack_base;
 	int error, i;
 	struct image_params image_params, *imgp;
 	struct vattr attr;
 	int (*img_first)(struct image_params *);
 	struct pargs *oldargs = NULL, *newargs = NULL;
 	struct sigacts *oldsigacts = NULL, *newsigacts = NULL;
 #ifdef KTRACE
 	struct vnode *tracevp = NULL;
 	struct ucred *tracecred = NULL;
 #endif
 	struct vnode *oldtextvp = NULL, *newtextvp;
 	cap_rights_t rights;
 	int credential_changing;
 	int textset;
 #ifdef MAC
 	struct label *interpvplabel = NULL;
 	int will_transition;
 #endif
 #ifdef HWPMC_HOOKS
 	struct pmckern_procexec pe;
 #endif
 	static const char fexecv_proc_title[] = "(fexecv)";
 
 	imgp = &image_params;
 
 	/*
 	 * Lock the process and set the P_INEXEC flag to indicate that
 	 * it should be left alone until we're done here.  This is
 	 * necessary to avoid race conditions - e.g. in ptrace() -
 	 * that might allow a local user to illicitly obtain elevated
 	 * privileges.
 	 */
 	PROC_LOCK(p);
 	KASSERT((p->p_flag & P_INEXEC) == 0,
 	    ("%s(): process already has P_INEXEC flag", __func__));
 	p->p_flag |= P_INEXEC;
 	PROC_UNLOCK(p);
 
 	/*
 	 * Initialize part of the common data
 	 */
 	bzero(imgp, sizeof(*imgp));
 	imgp->proc = p;
 	imgp->attr = &attr;
 	imgp->args = args;
 	oldcred = p->p_ucred;
 
 #ifdef MAC
 	error = mac_execve_enter(imgp, mac_p);
 	if (error)
 		goto exec_fail;
 #endif
 
 	/*
 	 * Translate the file name. namei() returns a vnode pointer
 	 *	in ni_vp among other things.
 	 *
 	 * XXXAUDIT: It would be desirable to also audit the name of the
 	 * interpreter if this is an interpreted binary.
 	 */
 	if (args->fname != NULL) {
 		NDINIT(&nd, LOOKUP, ISOPEN | LOCKLEAF | FOLLOW | SAVENAME
 		    | AUDITVNODE1, UIO_SYSSPACE, args->fname, td);
 	}
 
 	SDT_PROBE1(proc, , , exec, args->fname);
 
 	/*
 	 * Re-entered from the bottom of the interpreter handling below with
 	 * args->fname and nd pointing at the interpreter.
 	 */
 interpret:
 	if (args->fname != NULL) {
 #ifdef CAPABILITY_MODE
 		/*
 		 * While capability mode can't reach this point via direct
 		 * path arguments to execve(), we also don't allow
 		 * interpreters to be used in capability mode (for now).
 		 * Catch indirect lookups and return a permissions error.
 		 */
 		if (IN_CAPABILITY_MODE(td)) {
 			error = ECAPMODE;
 			goto exec_fail;
 		}
 #endif
 		error = namei(&nd);
 		if (error)
 			goto exec_fail;
 
 		newtextvp = nd.ni_vp;
 		imgp->vp = newtextvp;
 	} else {
 		AUDIT_ARG_FD(args->fd);
 		/*
 		 * Descriptors opened only with O_EXEC or O_RDONLY are allowed.
 		 */
 		error = fgetvp_exec(td, args->fd,
 		    cap_rights_init(&rights, CAP_FEXECVE), &newtextvp);
 		if (error)
 			goto exec_fail;
 		vn_lock(newtextvp, LK_EXCLUSIVE | LK_RETRY);
 		AUDIT_ARG_VNODE1(newtextvp);
 		imgp->vp = newtextvp;
 	}
 
 	/*
 	 * Check file permissions (also 'opens' file)
 	 */
 	error = exec_check_permissions(imgp);
 	if (error)
 		goto exec_fail_dealloc;
 
 	imgp->object = imgp->vp->v_object;
 	if (imgp->object != NULL)
 		vm_object_reference(imgp->object);
 
 	/*
 	 * Set VV_TEXT now so no one can write to the executable while we're
 	 * activating it.
 	 *
 	 * Remember if this was set before and unset it in case this is not
 	 * actually an executable image.
 	 */
 	textset = VOP_IS_TEXT(imgp->vp);
 	VOP_SET_TEXT(imgp->vp);
 
 	error = exec_map_first_page(imgp);
 	if (error)
 		goto exec_fail_dealloc;
 
 	imgp->proc->p_osrel = 0;
 
 	/*
 	 * Implement image setuid/setgid.
 	 *
 	 * Determine new credentials before attempting image activators
 	 * so that it can be used by process_exec handlers to determine
 	 * credential/setid changes.
 	 *
 	 * Don't honor setuid/setgid if the filesystem prohibits it or if
 	 * the process is being traced.
 	 *
 	 * We disable setuid/setgid/etc in capability mode on the basis
 	 * that most setugid applications are not written with that
 	 * environment in mind, and will therefore almost certainly operate
 	 * incorrectly. In principle there's no reason that setugid
 	 * applications might not be useful in capability mode, so we may want
 	 * to reconsider this conservative design choice in the future.
 	 *
 	 * XXXMAC: For the time being, use NOSUID to also prohibit
 	 * transitions on the file system.
 	 */
 	credential_changing = 0;
 	credential_changing |= (attr.va_mode & S_ISUID) &&
 	    oldcred->cr_uid != attr.va_uid;
 	credential_changing |= (attr.va_mode & S_ISGID) &&
 	    oldcred->cr_gid != attr.va_gid;
 #ifdef MAC
 	will_transition = mac_vnode_execve_will_transition(oldcred, imgp->vp,
 	    interpvplabel, imgp);
 	credential_changing |= will_transition;
 #endif
 
 	if (credential_changing &&
 #ifdef CAPABILITY_MODE
 	    ((oldcred->cr_flags & CRED_FLAG_CAPMODE) == 0) &&
 #endif
 	    (imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 &&
 	    (p->p_flag & P_TRACED) == 0) {
 		imgp->credential_setid = true;
 		/* The vnode lock is dropped around crdup() and uifind(). */
 		VOP_UNLOCK(imgp->vp, 0);
 		imgp->newcred = crdup(oldcred);
 		if (attr.va_mode & S_ISUID) {
 			euip = uifind(attr.va_uid);
 			change_euid(imgp->newcred, euip);
 		}
 		vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
 		if (attr.va_mode & S_ISGID)
 			change_egid(imgp->newcred, attr.va_gid);
 		/*
 		 * Implement correct POSIX saved-id behavior.
 		 *
 		 * XXXMAC: Note that the current logic will save the
 		 * uid and gid if a MAC domain transition occurs, even
 		 * though maybe it shouldn't.
 		 */
 		change_svuid(imgp->newcred, imgp->newcred->cr_uid);
 		change_svgid(imgp->newcred, imgp->newcred->cr_gid);
 	} else {
 		/*
 		 * Implement correct POSIX saved-id behavior.
 		 *
 		 * XXX: It's not clear that the existing behavior is
 		 * POSIX-compliant.  A number of sources indicate that the
 		 * saved uid/gid should only be updated if the new ruid is
 		 * not equal to the old ruid, or the new euid is not equal
 		 * to the old euid and the new euid is not equal to the old
 		 * ruid.  The FreeBSD code always updates the saved uid/gid.
 		 * Also, this code uses the new (replaced) euid and egid as
 		 * the source, which may or may not be the right ones to use.
 		 */
 		if (oldcred->cr_svuid != oldcred->cr_uid ||
 		    oldcred->cr_svgid != oldcred->cr_gid) {
 			VOP_UNLOCK(imgp->vp, 0);
 			imgp->newcred = crdup(oldcred);
 			vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
 			change_svuid(imgp->newcred, imgp->newcred->cr_uid);
 			change_svgid(imgp->newcred, imgp->newcred->cr_gid);
 		}
 	}
 	/* The new credentials are installed into the process later. */
 
 	/*
 	 * Do the best to calculate the full path to the image file.
 	 */
 	if (args->fname != NULL && args->fname[0] == '/')
 		imgp->execpath = args->fname;
 	else {
 		VOP_UNLOCK(imgp->vp, 0);
 		if (vn_fullpath(td, imgp->vp, &imgp->execpath,
 		    &imgp->freepath) != 0)
 			imgp->execpath = args->fname;
 		vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
 	}
 
 	/*
 	 *	If the current process has a special image activator it
 	 *	wants to try first, call it.   For example, emulating shell
 	 *	scripts differently.
 	 */
 	/* -1 means "no activator has claimed the image yet". */
 	error = -1;
 	if ((img_first = imgp->proc->p_sysent->sv_imgact_try) != NULL)
 		error = img_first(imgp);
 
 	/*
 	 *	Loop through the list of image activators, calling each one.
 	 *	An activator returns -1 if there is no match, 0 on success,
 	 *	and an error otherwise.
 	 */
 	for (i = 0; error == -1 && execsw[i]; ++i) {
 		/* Skip empty slots and the activator already tried above. */
 		if (execsw[i]->ex_imgact == NULL ||
 		    execsw[i]->ex_imgact == img_first) {
 			continue;
 		}
 		error = (*execsw[i]->ex_imgact)(imgp);
 	}
 
 	if (error) {
 		if (error == -1) {
 			/* Nobody recognised the image: undo our VV_TEXT. */
 			if (textset == 0)
 				VOP_UNSET_TEXT(imgp->vp);
 			error = ENOEXEC;
 		}
 		goto exec_fail_dealloc;
 	}
 
 	/*
 	 * Special interpreter operation, cleanup and loop up to try to
 	 * activate the interpreter.
 	 */
 	if (imgp->interpreted) {
 		exec_unmap_first_page(imgp);
 		/*
 		 * VV_TEXT needs to be unset for scripts.  There is a short
 		 * period before we determine that something is a script where
 		 * VV_TEXT will be set. The vnode lock is held over this
 		 * entire period so nothing should illegitimately be blocked.
 		 */
 		VOP_UNSET_TEXT(imgp->vp);
 		/* free name buffer and old vnode */
 		if (args->fname != NULL)
 			NDFREE(&nd, NDF_ONLY_PNBUF);
 #ifdef MAC
 		mac_execve_interpreter_enter(newtextvp, &interpvplabel);
 #endif
 		if (imgp->opened) {
 			VOP_CLOSE(newtextvp, FREAD, td->td_ucred, td);
 			imgp->opened = 0;
 		}
 		vput(newtextvp);
 		vm_object_deallocate(imgp->object);
 		imgp->object = NULL;
 		/* Credentials are recomputed for the interpreter's vnode. */
 		imgp->credential_setid = false;
 		if (imgp->newcred != NULL) {
 			crfree(imgp->newcred);
 			imgp->newcred = NULL;
 		}
 		imgp->execpath = NULL;
 		free(imgp->freepath, M_TEMP);
 		imgp->freepath = NULL;
 		/* set new name to that of the interpreter */
 		NDINIT(&nd, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME,
 		    UIO_SYSSPACE, imgp->interpreter_name, td);
 		args->fname = imgp->interpreter_name;
 		goto interpret;
 	}
 
 	/*
 	 * NB: We unlock the vnode here because it is believed that none
 	 * of the sv_copyout_strings/sv_fixup operations require the vnode.
 	 */
 	VOP_UNLOCK(imgp->vp, 0);
 
 	if (disallow_high_osrel &&
 	    P_OSREL_MAJOR(p->p_osrel) > P_OSREL_MAJOR(__FreeBSD_version)) {
 		error = ENOEXEC;
 		uprintf("Osrel %d for image %s too high\n", p->p_osrel,
 		    imgp->execpath != NULL ? imgp->execpath : "<unresolved>");
 		/* exec_fail_dealloc expects the vnode to be locked. */
 		vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
 		goto exec_fail_dealloc;
 	}
 
 	/* ABI enforces the use of Capsicum. Switch into capabilities mode. */
 	if (SV_PROC_FLAG(p, SV_CAPSICUM))
 		sys_cap_enter(td, NULL);
 
 	/*
 	 * Copy out strings (args and env) and initialize stack base
 	 */
 	if (p->p_sysent->sv_copyout_strings)
 		stack_base = (*p->p_sysent->sv_copyout_strings)(imgp);
 	else
 		stack_base = exec_copyout_strings(imgp);
 
 	/*
 	 * If custom stack fixup routine present for this process
 	 * let it do the stack setup.
 	 * Else stuff argument count as first item on stack
 	 */
 	if (p->p_sysent->sv_fixup != NULL)
 		(*p->p_sysent->sv_fixup)(&stack_base, imgp);
 	else
 		suword(--stack_base, imgp->args->argc);
 
 	if (args->fdp != NULL) {
 		/* Install a brand new file descriptor table. */
 		fdinstall_remapped(td, args->fdp);
 		args->fdp = NULL;
 	} else {
 		/*
 		 * Keep on using the existing file descriptor table. For
 		 * security and other reasons, the file descriptor table
 		 * cannot be shared after an exec.
 		 */
 		fdunshare(td);
 		/* close files on exec */
 		fdcloseexec(td);
 	}
 
 	/*
 	 * Malloc things before we need locks.
 	 */
 	/* i is now the byte length of the packed argument strings. */
 	i = imgp->args->begin_envv - imgp->args->begin_argv;
 	/* Cache arguments if they fit inside our allowance */
 	if (ps_arg_cache_limit >= i + sizeof(struct pargs)) {
 		newargs = pargs_alloc(i);
 		bcopy(imgp->args->begin_argv, newargs->ar_args, i);
 	}
 
 	/*
 	 * For security and other reasons, signal handlers cannot
 	 * be shared after an exec. The new process gets a copy of the old
 	 * handlers. In execsigs(), the new process will have its signals
 	 * reset.
 	 */
 	if (sigacts_shared(p->p_sigacts)) {
 		oldsigacts = p->p_sigacts;
 		newsigacts = sigacts_alloc();
 		sigacts_copy(newsigacts, oldsigacts);
 	}
 
 	vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
 
 	PROC_LOCK(p);
 	if (oldsigacts)
 		p->p_sigacts = newsigacts;
 	/* Stop profiling */
 	stopprofclock(p);
 
 	/* reset caught signals */
 	execsigs(p);
 
 	/* name this process - nameiexec(p, ndp) */
 	bzero(p->p_comm, sizeof(p->p_comm));
 	if (args->fname)
 		bcopy(nd.ni_cnd.cn_nameptr, p->p_comm,
 		    min(nd.ni_cnd.cn_namelen, MAXCOMLEN));
 	else if (vn_commname(newtextvp, p->p_comm, sizeof(p->p_comm)) != 0)
 		bcopy(fexecv_proc_title, p->p_comm, sizeof(fexecv_proc_title));
 	bcopy(p->p_comm, td->td_name, sizeof(td->td_name));
 #ifdef KTR
 	sched_clear_tdname(td);
 #endif
 
 	/*
 	 * mark as execed, wakeup the process that vforked (if any) and tell
 	 * it that it now has its own resources back
 	 */
 	p->p_flag |= P_EXEC;
 	if ((p->p_flag2 & P2_NOTRACE_EXEC) == 0)
 		p->p_flag2 &= ~P2_NOTRACE;
 	if (p->p_flag & P_PPWAIT) {
 		p->p_flag &= ~(P_PPWAIT | P_PPTRACE);
 		cv_broadcast(&p->p_pwait);
 		/* STOPs are no longer ignored, arrange for AST */
 		signotify(td);
 	}
 
 	/*
 	 * Implement image setuid/setgid installation.
 	 */
 	if (imgp->credential_setid) {
 		/*
 		 * Turn off syscall tracing for set-id programs, except for
 		 * root.  Record any set-id flags first to make sure that
 		 * we do not regain any tracing during a possible block.
 		 */
 		setsugid(p);
 
 #ifdef KTRACE
 		if (p->p_tracecred != NULL &&
 		    priv_check_cred(p->p_tracecred, PRIV_DEBUG_DIFFCRED, 0))
 			ktrprocexec(p, &tracecred, &tracevp);
 #endif
 		/*
 		 * Close any file descriptors 0..2 that reference procfs,
 		 * then make sure file descriptors 0..2 are in use.
 		 *
 		 * Both fdsetugidsafety() and fdcheckstd() may call functions
 		 * taking sleepable locks, so temporarily drop our locks.
 		 */
 		PROC_UNLOCK(p);
 		VOP_UNLOCK(imgp->vp, 0);
 		fdsetugidsafety(td);
 		error = fdcheckstd(td);
 		vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
 		if (error != 0)
 			goto exec_fail_dealloc;
 		PROC_LOCK(p);
 #ifdef MAC
 		if (will_transition) {
 			mac_vnode_execve_transition(oldcred, imgp->newcred,
 			    imgp->vp, interpvplabel, imgp);
 		}
 #endif
 	} else {
 		if (oldcred->cr_uid == oldcred->cr_ruid &&
 		    oldcred->cr_gid == oldcred->cr_rgid)
 			p->p_flag &= ~P_SUGID;
 	}
 	/*
 	 * Set the new credentials.
 	 */
 	if (imgp->newcred != NULL) {
 		proc_set_cred(p, imgp->newcred);
 		crfree(oldcred);
 		/* NULL oldcred marks newcred as installed for the exit path. */
 		oldcred = NULL;
 	}
 
 	/*
 	 * Store the vp for use in procfs.  This vnode was referenced by namei
 	 * or fgetvp_exec.
 	 */
 	oldtextvp = p->p_textvp;
 	p->p_textvp = newtextvp;
 
 #ifdef KDTRACE_HOOKS
 	/*
 	 * Tell the DTrace fasttrap provider about the exec if it
 	 * has declared an interest.
 	 */
 	if (dtrace_fasttrap_exec)
 		dtrace_fasttrap_exec(p);
 #endif
 
 	/*
 	 * Notify others that we exec'd, and clear the P_INEXEC flag
 	 * as we're now a bona fide freshly-execed process.
 	 */
 	KNOTE_LOCKED(p->p_klist, NOTE_EXEC);
 	p->p_flag &= ~P_INEXEC;
 
 	/* clear "fork but no exec" flag, as we _are_ execing */
 	p->p_acflag &= ~AFORK;
 
 	/*
 	 * Free any previous argument cache and replace it with
 	 * the new argument cache, if any.
 	 */
 	oldargs = p->p_args;
 	p->p_args = newargs;
 	newargs = NULL;
 
 	PROC_UNLOCK(p);
 
 #ifdef	HWPMC_HOOKS
 	/*
 	 * Check if system-wide sampling is in effect or if the
 	 * current process is using PMCs.  If so, do exec() time
 	 * processing.  This processing needs to happen AFTER the
 	 * P_INEXEC flag is cleared.
 	 */
 	if (PMC_SYSTEM_SAMPLING_ACTIVE() || PMC_PROC_IS_USING_PMCS(p)) {
 		VOP_UNLOCK(imgp->vp, 0);
 		pe.pm_credentialschanged = credential_changing;
 		pe.pm_entryaddr = imgp->entry_addr;
 
 		PMC_CALL_HOOK_X(td, PMC_FN_PROCESS_EXEC, (void *) &pe);
 		vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
 	}
 #endif
 
 	/* Set values passed into the program in registers. */
 	if (p->p_sysent->sv_setregs)
 		(*p->p_sysent->sv_setregs)(td, imgp, 
 		    (u_long)(uintptr_t)stack_base);
 	else
 		exec_setregs(td, imgp, (u_long)(uintptr_t)stack_base);
 
 	vfs_mark_atime(imgp->vp, td->td_ucred);
 
 	SDT_PROBE1(proc, , , exec__success, args->fname);
 
 	/*
 	 * Shared exit path: reached by falling through on success
 	 * (error == 0) and by goto on any failure that happens after the
 	 * image vnode has been obtained.
 	 */
 exec_fail_dealloc:
 	if (imgp->firstpage != NULL)
 		exec_unmap_first_page(imgp);
 
 	if (imgp->vp != NULL) {
 		if (args->fname)
 			NDFREE(&nd, NDF_ONLY_PNBUF);
 		if (imgp->opened)
 			VOP_CLOSE(imgp->vp, FREAD, td->td_ucred, td);
 		/*
 		 * On success the vnode is now recorded in p_textvp, so it
 		 * is only unlocked; on failure vput() is used instead
 		 * (presumably unlock plus reference drop -- confirm).
 		 */
 		if (error != 0)
 			vput(imgp->vp);
 		else
 			VOP_UNLOCK(imgp->vp, 0);
 	}
 
 	if (imgp->object != NULL)
 		vm_object_deallocate(imgp->object);
 
 	free(imgp->freepath, M_TEMP);
 
 	if (error == 0) {
 		/* Unlocked peek first; re-checked under the process lock. */
 		if (p->p_ptevents & PTRACE_EXEC) {
 			PROC_LOCK(p);
 			if (p->p_ptevents & PTRACE_EXEC)
 				td->td_dbgflags |= TDB_EXEC;
 			PROC_UNLOCK(p);
 		}
 
 		/*
 		 * Stop the process here if its stop event mask has
 		 * the S_EXEC bit set.
 		 */
 		STOPEVENT(p, S_EXEC, 0);
 	} else {
 		/*
 		 * Entry point for failures that happen before any vnode
 		 * has been obtained; error is non-zero on every jump here.
 		 */
 exec_fail:
 		/* we're done here, clear P_INEXEC */
 		PROC_LOCK(p);
 		p->p_flag &= ~P_INEXEC;
 		PROC_UNLOCK(p);
 
 		SDT_PROBE1(proc, , , exec__failure, error);
 	}
 
 	/*
 	 * oldcred is set to NULL once newcred has been installed, so this
 	 * only frees a newcred that was prepared but never installed.
 	 */
 	if (imgp->newcred != NULL && oldcred != NULL)
 		crfree(imgp->newcred);
 
 #ifdef MAC
 	mac_execve_exit(imgp);
 	mac_execve_interpreter_exit(interpvplabel);
 #endif
 	exec_free_args(args);
 
 	/*
 	 * Handle deferred decrement of ref counts.
 	 */
 	if (oldtextvp != NULL)
 		vrele(oldtextvp);
 #ifdef KTRACE
 	if (tracevp != NULL)
 		vrele(tracevp);
 	if (tracecred != NULL)
 		crfree(tracecred);
 #endif
 	pargs_drop(oldargs);
 	pargs_drop(newargs);
 	if (oldsigacts != NULL)
 		sigacts_free(oldsigacts);
 	if (euip != NULL)
 		uifree(euip);
 
 	if (error && imgp->vmspace_destroyed) {
 		/* sorry, no more process anymore. exit gracefully */
 		exit1(td, 0, SIGABRT);
 		/* NOT REACHED */
 	}
 
 #ifdef KTRACE
 	if (error == 0)
 		ktrprocctor(p);
 #endif
 
 	/*
 	 * We don't want cpu_set_syscall_retval() to overwrite any of
 	 * the register values put in place by exec_setregs().
 	 * Implementations of cpu_set_syscall_retval() will leave
 	 * registers unmodified when returning EJUSTRETURN.
 	 */
 	return (error == 0 ? EJUSTRETURN : error);
 }
 
 /*
  * Map the first page of the image into kernel address space.
  *
  * Any first page that is already mapped is released first.  Page 0 of the
  * vnode's VM object is then grabbed and, if it is not fully valid, read in
  * from the pager together with up to VM_INITIAL_PAGEIN - 1 following pages
  * as read-ahead.  On success the page is held and activated,
  * imgp->firstpage refers to its sf_buf mapping and imgp->image_header
  * points at the mapped data.
  *
  * Returns 0 on success, EACCES if the vnode has no VM object, and EIO if
  * the pager has no page 0 or the page-in fails.
  */
 int
 exec_map_first_page(struct image_params *imgp)
 {
 	int rv, i, after, initial_pagein;
 	vm_page_t ma[VM_INITIAL_PAGEIN];
 	vm_object_t object;
 
 	if (imgp->firstpage != NULL)
 		exec_unmap_first_page(imgp);
 
 	object = imgp->vp->v_object;
 	if (object == NULL)
 		return (EACCES);
 	VM_OBJECT_WLOCK(object);
 #if VM_NRESERVLEVEL > 0
 	vm_object_color(object, 0);
 #endif
 	ma[0] = vm_page_grab(object, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY);
 	if (ma[0]->valid != VM_PAGE_BITS_ALL) {
 		vm_page_xbusy(ma[0]);
 		if (!vm_pager_has_page(object, 0, NULL, &after)) {
 			vm_page_lock(ma[0]);
 			vm_page_free(ma[0]);
 			vm_page_unlock(ma[0]);
 			VM_OBJECT_WUNLOCK(object);
 			return (EIO);
 		}
 		/* Bound the read-ahead by what fits in ma[]. */
 		initial_pagein = min(after, VM_INITIAL_PAGEIN);
 		KASSERT(initial_pagein <= object->size,
 		    ("%s: initial_pagein %d object->size %ju",
 		    __func__, initial_pagein, (uintmax_t )object->size));
 		/*
 		 * Collect the pages that follow page 0, stopping at the
 		 * first one that cannot take part in the page-in.
 		 *
 		 * NOTE(review): the loop stops when vm_page_tryxbusy()
 		 * returns non-zero.  If that call reports success with a
 		 * non-zero value, the test is inverted (a page we just
 		 * busied would be left outside the request) -- confirm
 		 * against the vm_page busy API.
 		 */
 		for (i = 1; i < initial_pagein; i++) {
 			if ((ma[i] = vm_page_next(ma[i - 1])) != NULL) {
 				if (ma[i]->valid)
 					break;
 				if (vm_page_tryxbusy(ma[i]))
 					break;
 			} else {
 				ma[i] = vm_page_alloc(object, i,
 				    VM_ALLOC_NORMAL);
 				if (ma[i] == NULL)
 					break;
 			}
 		}
 		/* Only ma[0..i-1] were collected. */
 		initial_pagein = i;
 		rv = vm_pager_get_pages(object, ma, initial_pagein, NULL, NULL);
 		if (rv != VM_PAGER_OK) {
 			for (i = 0; i < initial_pagein; i++) {
 				vm_page_lock(ma[i]);
 				vm_page_free(ma[i]);
 				vm_page_unlock(ma[i]);
 			}
 			VM_OBJECT_WUNLOCK(object);
 			return (EIO);
 		}
 		vm_page_xunbusy(ma[0]);
 		for (i = 1; i < initial_pagein; i++)
 			vm_page_readahead_finish(ma[i]);
 	}
 	/* Hold the page; exec_unmap_first_page() drops the hold. */
 	vm_page_lock(ma[0]);
 	vm_page_hold(ma[0]);
 	vm_page_activate(ma[0]);
 	vm_page_unlock(ma[0]);
 	VM_OBJECT_WUNLOCK(object);
 
 	imgp->firstpage = sf_buf_alloc(ma[0], 0);
 	imgp->image_header = (char *)sf_buf_kva(imgp->firstpage);
 
 	return (0);
 }
 
 /*
  * Undo exec_map_first_page(): free the sf_buf mapping of the image's first
  * page, clear imgp->firstpage and drop the hold on the page.  Does nothing
  * when no first page is mapped.
  */
 void
 exec_unmap_first_page(struct image_params *imgp)
 {
 	vm_page_t pg;
 
 	if (imgp->firstpage == NULL)
 		return;
 
 	/* Look the page up before the sf_buf that names it is freed. */
 	pg = sf_buf_page(imgp->firstpage);
 	sf_buf_free(imgp->firstpage);
 	imgp->firstpage = NULL;
 
 	vm_page_lock(pg);
 	vm_page_unhold(pg);
 	vm_page_unlock(pg);
 }
 
 /*
  * Destroy old address space, and allocate a new stack.
  *	The new stack is only sgrowsiz large because it is grown
  *	automatically on a page fault.
  *
  * imgp - image being activated; vmspace_destroyed and sysent are set here.
  * sv   - sysentvec of the new image; supplies the address-space bounds,
  *        the optional shared page and the stack placement/protection.
  *
  * Returns 0 on success, or an errno value from vmspace_exec() or from
  * translating a failed vm_map_fixed()/vm_map_stack() result.
  */
 int
 exec_new_vmspace(struct image_params *imgp, struct sysentvec *sv)
 {
 	int error;
 	struct proc *p = imgp->proc;
 	struct vmspace *vmspace = p->p_vmspace;
 	vm_object_t obj;
 	struct rlimit rlim_stack;
 	vm_offset_t sv_minuser, stack_addr;
 	vm_map_t map;
 	u_long ssiz;
 
 	/*
 	 * Set before anything is torn down: do_execve() checks this flag
 	 * and calls exit1() if a later step fails.
 	 */
 	imgp->vmspace_destroyed = 1;
 	imgp->sysent = sv;
 
 	/* May be called with Giant held */
 	EVENTHANDLER_DIRECT_INVOKE(process_exec, p, imgp);
 
 	/*
 	 * Blow away entire process VM, if address space not shared,
 	 * otherwise, create a new VM space so that other threads are
 	 * not disrupted
 	 */
 	map = &vmspace->vm_map;
 	/* Unless map_at_zero is set, keep the first page unmappable. */
 	if (map_at_zero)
 		sv_minuser = sv->sv_minuser;
 	else
 		sv_minuser = MAX(sv->sv_minuser, PAGE_SIZE);
 	/* Reuse in place only if unshared and the bounds already match. */
 	if (vmspace->vm_refcnt == 1 && vm_map_min(map) == sv_minuser &&
 	    vm_map_max(map) == sv->sv_maxuser) {
 		shmexit(vmspace);
 		pmap_remove_pages(vmspace_pmap(vmspace));
 		vm_map_remove(map, vm_map_min(map), vm_map_max(map));
 		/* An exec terminates mlockall(MCL_FUTURE). */
 		vm_map_lock(map);
 		vm_map_modflags(map, 0, MAP_WIREFUTURE);
 		vm_map_unlock(map);
 	} else {
 		error = vmspace_exec(p, sv_minuser, sv->sv_maxuser);
 		if (error)
 			return (error);
 		/* vmspace_exec() replaced p->p_vmspace; refresh the locals. */
 		vmspace = p->p_vmspace;
 		map = &vmspace->vm_map;
 	}
 
 	/* Map a shared page */
 	obj = sv->sv_shared_page_obj;
 	if (obj != NULL) {
 		vm_object_reference(obj);
 		error = vm_map_fixed(map, obj, 0,
 		    sv->sv_shared_page_base, sv->sv_shared_page_len,
 		    VM_PROT_READ | VM_PROT_EXECUTE,
 		    VM_PROT_READ | VM_PROT_EXECUTE,
 		    MAP_INHERIT_SHARE | MAP_ACC_NO_CHARGE);
 		if (error != KERN_SUCCESS) {
 			/* Drop the reference taken just above. */
 			vm_object_deallocate(obj);
 			return (vm_mmap_to_errno(error));
 		}
 	}
 
 	/* Allocate a new stack */
 	/*
 	 * Stack size: the image's own request if it made one (clamped to
 	 * the hard limit, raising the soft limit when needed), else the
 	 * ABI's sv_maxssiz, else the global maxssiz.
 	 */
 	if (imgp->stack_sz != 0) {
 		ssiz = trunc_page(imgp->stack_sz);
 		PROC_LOCK(p);
 		lim_rlimit_proc(p, RLIMIT_STACK, &rlim_stack);
 		PROC_UNLOCK(p);
 		if (ssiz > rlim_stack.rlim_max)
 			ssiz = rlim_stack.rlim_max;
 		if (ssiz > rlim_stack.rlim_cur) {
 			rlim_stack.rlim_cur = ssiz;
 			kern_setrlimit(curthread, RLIMIT_STACK, &rlim_stack);
 		}
 	} else if (sv->sv_maxssiz != NULL) {
 		ssiz = *sv->sv_maxssiz;
 	} else {
 		ssiz = maxssiz;
 	}
 	stack_addr = sv->sv_usrstack - ssiz;
 	/*
 	 * The image's stack_prot is used only when a shared page object
 	 * exists and stack_prot is non-zero; otherwise sv_stackprot.
 	 */
 	error = vm_map_stack(map, stack_addr, (vm_size_t)ssiz,
 	    obj != NULL && imgp->stack_prot != 0 ? imgp->stack_prot :
 	    sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_DOWN);
 	if (error != KERN_SUCCESS)
 		return (vm_mmap_to_errno(error));
 
 	/*
 	 * vm_ssize and vm_maxsaddr are somewhat antiquated concepts, but they
 	 * are still used to enforce the stack rlimit on the process stack.
 	 */
 	vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT;
 	vmspace->vm_maxsaddr = (char *)stack_addr;
 
 	return (0);
 }
 
 /*
  * Copy out argument and environment strings from the old process address
  * space into the temporary string buffer.
  */
 int
 exec_copyin_args(struct image_args *args, char *fname,
     enum uio_seg segflg, char **argv, char **envv)
 {
 	u_long argp, envp;
 	int error;
 	size_t length;
 
 	bzero(args, sizeof(*args));
 	if (argv == NULL)
 		return (EFAULT);
 
 	/*
 	 * Allocate demand-paged memory for the file name, argument, and
 	 * environment strings.
 	 */
 	error = exec_alloc_args(args);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Copy the file name.
 	 */
 	if (fname != NULL) {
 		args->fname = args->buf;
 		error = (segflg == UIO_SYSSPACE) ?
 		    copystr(fname, args->fname, PATH_MAX, &length) :
 		    copyinstr(fname, args->fname, PATH_MAX, &length);
 		if (error != 0)
 			goto err_exit;
 	} else
 		length = 0;
 
 	args->begin_argv = args->buf + length;
 	args->endp = args->begin_argv;
 	args->stringspace = ARG_MAX;
 
 	/*
 	 * extract arguments first
 	 */
 	for (;;) {
 		error = fueword(argv++, &argp);
 		if (error == -1) {
 			error = EFAULT;
 			goto err_exit;
 		}
 		if (argp == 0)
 			break;
 		error = copyinstr((void *)(uintptr_t)argp, args->endp,
 		    args->stringspace, &length);
 		if (error != 0) {
 			if (error == ENAMETOOLONG) 
 				error = E2BIG;
 			goto err_exit;
 		}
 		args->stringspace -= length;
 		args->endp += length;
 		args->argc++;
 	}
 
 	args->begin_envv = args->endp;
 
 	/*
 	 * extract environment strings
 	 */
 	if (envv) {
 		for (;;) {
 			error = fueword(envv++, &envp);
 			if (error == -1) {
 				error = EFAULT;
 				goto err_exit;
 			}
 			if (envp == 0)
 				break;
 			error = copyinstr((void *)(uintptr_t)envp,
 			    args->endp, args->stringspace, &length);
 			if (error != 0) {
 				if (error == ENAMETOOLONG)
 					error = E2BIG;
 				goto err_exit;
 			}
 			args->stringspace -= length;
 			args->endp += length;
 			args->envc++;
 		}
 	}
 
 	return (0);
 
 err_exit:
 	exec_free_args(args);
 	return (error);
 }
 
 /*
  * Build 'args' from one opaque argument buffer ('data', 'datalen' bytes)
  * and a list of 'fdslen' file descriptor numbers ('fds').  The buffer is
  * stored as argument strings (argc is the number of NUL bytes in it,
  * including the terminator appended here); there is no environment.  The
  * descriptor list is handed to fdcopy_remapped() to produce args->fdp.
  * Returns 0 or an errno value; on failure after allocation the buffer is
  * released with exec_free_args().
  */
 int
 exec_copyin_data_fds(struct thread *td, struct image_args *args,
     const void *data, size_t datalen, const int *fds, size_t fdslen)
 {
 	struct filedesc *curfdp;
 	const char *cp;
 	int *fdtab;
 	int error;
 
 	memset(args, '\0', sizeof(*args));
 	curfdp = td->td_proc->p_fd;
 	if (datalen >= ARG_MAX)
 		return (E2BIG);
 	if (fdslen > curfdp->fd_lastfile + 1)
 		return (E2BIG);
 	error = exec_alloc_args(args);
 	if (error != 0)
 		return (error);
 
 	args->begin_argv = args->buf;
 	args->stringspace = ARG_MAX;
 	/* Without an argument buffer the string area stays empty. */
 	args->endp = args->begin_argv;
 
 	if (datalen > 0) {
 		/*
 		 * Bring the caller's buffer in as one string and
 		 * terminate it with a null byte.
 		 */
 		error = copyin(data, args->begin_argv, datalen);
 		if (error != 0)
 			goto fail;
 		args->begin_argv[datalen] = '\0';
 		args->endp = args->begin_argv + datalen + 1;
 		args->stringspace -= datalen + 1;
 
 		/* Every null byte marks the end of one argument. */
 		cp = args->begin_argv;
 		while (cp < args->endp) {
 			if (*cp == '\0')
 				++args->argc;
 			++cp;
 		}
 	}
 	/* No environment: it begins (and ends) where the arguments end. */
 	args->begin_envv = args->endp;
 
 	/* Copy in the descriptor list and build the remapped table. */
 	fdtab = malloc(fdslen * sizeof(int), M_TEMP, M_WAITOK);
 	error = copyin(fds, fdtab, fdslen * sizeof(int));
 	if (error == 0)
 		error = fdcopy_remapped(curfdp, fdtab, fdslen, &args->fdp);
 	free(fdtab, M_TEMP);
 	if (error == 0)
 		return (0);
 
 fail:
 	exec_free_args(args);
 	return (error);
 }
 
 /*
  * Descriptor for one preallocated exec_map KVA range that is handed out as
  * an exec argument buffer.
  */
 struct exec_args_kva {
 	vm_offset_t addr;	/* base address of the range */
 	u_int gen;		/* generation last applied to this range */
 	SLIST_ENTRY(exec_args_kva) next;	/* freelist linkage */
 };
 
 /* Per-CPU slot caching at most one free range; accessed with atomic ops. */
 static DPCPU_DEFINE(struct exec_args_kva *, exec_args_kva);
 
 /* Free ranges not cached per-CPU; callers sleep/wake on this list head. */
 static SLIST_HEAD(, exec_args_kva) exec_args_kva_freelist;
 /* Held around freelist manipulation once initialization is done. */
 static struct mtx exec_args_kva_mtx;
 /* Current generation; incremented by the vm_lowmem handler. */
 static u_int exec_args_gen;
 
 /*
  * Initialization hook (registered through SYSINIT below): set up the
  * freelist and its mutex, then allocate exec_map_entries ranges of size
  * exec_map_entry_size from exec_map and put each one on the freelist.
  */
 static void
 exec_prealloc_args_kva(void *arg __unused)
 {
 	struct exec_args_kva *ent;
 	u_int remaining;
 
 	mtx_init(&exec_args_kva_mtx, "exec args kva", NULL, MTX_DEF);
 	SLIST_INIT(&exec_args_kva_freelist);
 	for (remaining = exec_map_entries; remaining > 0; remaining--) {
 		ent = malloc(sizeof(*ent), M_PARGS, M_WAITOK);
 		ent->gen = exec_args_gen;
 		ent->addr = kmap_alloc_wait(exec_map, exec_map_entry_size);
 		SLIST_INSERT_HEAD(&exec_args_kva_freelist, ent, next);
 	}
 }
 SYSINIT(exec_args_kva, SI_SUB_EXEC, SI_ORDER_ANY, exec_prealloc_args_kva, NULL);
 
 /*
  * Hand out one KVA range.  The current CPU's cached entry is taken if there
  * is one; otherwise the caller sleeps on the freelist until an entry shows
  * up.  The descriptor is stored through 'cookie' (to be passed back to
  * exec_free_args_kva()) and the range's base address is returned.
  */
 static vm_offset_t
 exec_alloc_args_kva(void **cookie)
 {
 	struct exec_args_kva *ent;
 
 	/* Fast path: take whatever this CPU has cached, leaving it empty. */
 	ent = (void *)atomic_readandclear_ptr(
 	    (uintptr_t *)DPCPU_PTR(exec_args_kva));
 	if (ent == NULL) {
 		/* Slow path: wait for the freelist to become non-empty. */
 		mtx_lock(&exec_args_kva_mtx);
 		for (;;) {
 			ent = SLIST_FIRST(&exec_args_kva_freelist);
 			if (ent != NULL)
 				break;
 			(void)mtx_sleep(&exec_args_kva_freelist,
 			    &exec_args_kva_mtx, 0, "execkva", 0);
 		}
 		SLIST_REMOVE_HEAD(&exec_args_kva_freelist, next);
 		mtx_unlock(&exec_args_kva_mtx);
 	}
 	*(struct exec_args_kva **)cookie = ent;
 	return (ent->addr);
 }
 
 /*
  * Give a KVA range back.  If the range's recorded generation differs from
  * 'gen', MADV_FREE is applied to it first and the generation is updated.
  * The entry is then parked in the current CPU's cache slot if that slot is
  * empty; otherwise it goes onto the freelist and one sleeper is woken.
  */
 static void
 exec_release_args_kva(struct exec_args_kva *argkva, u_int gen)
 {
 	vm_offset_t start, end;
 
 	if (argkva->gen != gen) {
 		start = argkva->addr;
 		end = start + exec_map_entry_size;
 		vm_map_madvise(exec_map, start, end, MADV_FREE);
 		argkva->gen = gen;
 	}
 
 	/* Done if the per-CPU slot was empty and now holds the entry. */
 	if (atomic_cmpset_ptr((uintptr_t *)DPCPU_PTR(exec_args_kva),
 	    (uintptr_t)NULL, (uintptr_t)argkva))
 		return;
 
 	mtx_lock(&exec_args_kva_mtx);
 	SLIST_INSERT_HEAD(&exec_args_kva_freelist, argkva, next);
 	wakeup_one(&exec_args_kva_freelist);
 	mtx_unlock(&exec_args_kva_mtx);
 }
 
 static void
 exec_free_args_kva(void *cookie)
 {
 
 	exec_release_args_kva(cookie, exec_args_gen);
 }
 
 /*
  * vm_lowmem event handler.  Advances the generation and pushes every range
  * that is currently free (on the freelist or cached per-CPU) back through
  * exec_release_args_kva() with the new generation, which makes that routine
  * apply MADV_FREE to it.  Ranges that are in use at this point are handled
  * when they are eventually released.
  */
 static void
 exec_args_kva_lowmem(void *arg __unused)
 {
 	SLIST_HEAD(, exec_args_kva) drained;
 	struct exec_args_kva *ent;
 	u_int newgen;
 	int cpu;
 
 	newgen = atomic_fetchadd_int(&exec_args_gen, 1) + 1;
 
 	/* Steal the whole freelist so it can be processed unlocked. */
 	SLIST_INIT(&drained);
 	mtx_lock(&exec_args_kva_mtx);
 	SLIST_SWAP(&drained, &exec_args_kva_freelist, exec_args_kva);
 	mtx_unlock(&exec_args_kva_mtx);
 	for (;;) {
 		ent = SLIST_FIRST(&drained);
 		if (ent == NULL)
 			break;
 		SLIST_REMOVE_HEAD(&drained, next);
 		exec_release_args_kva(ent, newgen);
 	}
 
 	/* Do the same for whatever each CPU has cached. */
 	CPU_FOREACH(cpu) {
 		ent = (void *)atomic_readandclear_ptr(
 		    (uintptr_t *)DPCPU_ID_PTR(cpu, exec_args_kva));
 		if (ent == NULL)
 			continue;
 		exec_release_args_kva(ent, newgen);
 	}
 }
 EVENTHANDLER_DEFINE(vm_lowmem, exec_args_kva_lowmem, NULL,
     EVENTHANDLER_PRI_ANY);
 
 /*
  * Allocate temporary demand-paged, zero-filled memory for the file name,
  * argument, and environment strings.
  */
 int
 exec_alloc_args(struct image_args *args)
 {
 
 	args->buf = (char *)exec_alloc_args_kva(&args->bufkva);
 	return (0);
 }
 
 /*
  * Release everything attached to 'args': the KVA string buffer, the
  * separately allocated file name buffer and the remapped file descriptor
  * table.  Each pointer is cleared once its resource has been released, so
  * calling this again on the same structure is harmless.
  */
 void
 exec_free_args(struct image_args *args)
 {
 
 	if (args->buf != NULL) {
 		exec_free_args_kva(args->bufkva);
 		args->buf = NULL;
 	}
 	if (args->fname_buf != NULL) {
 		free(args->fname_buf, M_TEMP);
 		args->fname_buf = NULL;
 	}
 	if (args->fdp != NULL) {
 		fdescfree_remapped(args->fdp);
 		/* Clear it like the other fields to prevent a double free. */
 		args->fdp = NULL;
 	}
 }
 
 /*
  * Copy strings out to the new process address space, constructing new arg
  * and env vector tables. Return a pointer to the base so that it can be used
  * as the initial stack pointer.
  *
  * Working downward from the ps_strings address (sv_psstrings) this places,
  * in order: the signal trampoline (only when sv_sigcode_base is 0 and a
  * size is provided), the executable path (only when both imgp->execpath and
  * imgp->auxargs are set), the stack protector canary, the pagesizes array,
  * the argument/environment strings, and finally the pointer vector table
  * (with room reserved for the auxargs when present).  The user addresses of
  * the path, canary and pagesizes are recorded in imgp.
  *
  * NOTE(review): the return values of copyout()/suword()/suword32() are not
  * checked anywhere in this function.
  */
 register_t *
 exec_copyout_strings(struct image_params *imgp)
 {
 	int argc, envc;
 	char **vectp;
 	char *stringp;
 	uintptr_t destp;
 	register_t *stack_base;
 	struct ps_strings *arginfo;
 	struct proc *p;
 	size_t execpath_len;
 	int szsigcode, szps;
 	char canary[sizeof(long) * 8];
 
 	szps = sizeof(pagesizes[0]) * MAXPAGESIZES;
 	/*
 	 * Calculate string base and vector table pointers.
 	 * Also deal with signal trampoline code for this exec type.
 	 */
 	if (imgp->execpath != NULL && imgp->auxargs != NULL)
 		execpath_len = strlen(imgp->execpath) + 1;
 	else
 		execpath_len = 0;
 	p = imgp->proc;
 	szsigcode = 0;
 	arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings;
 	/* The trampoline is copied here only if it has no fixed base. */
 	if (p->p_sysent->sv_sigcode_base == 0) {
 		if (p->p_sysent->sv_szsigcode != NULL)
 			szsigcode = *(p->p_sysent->sv_szsigcode);
 	}
 	/* destp is the running user-space cursor; it only moves downward. */
 	destp =	(uintptr_t)arginfo;
 
 	/*
 	 * install sigcode
 	 */
 	if (szsigcode != 0) {
 		destp -= szsigcode;
 		destp = rounddown2(destp, sizeof(void *));
 		copyout(p->p_sysent->sv_sigcode, (void *)destp, szsigcode);
 	}
 
 	/*
 	 * Copy the image path for the rtld.
 	 */
 	if (execpath_len != 0) {
 		destp -= execpath_len;
 		imgp->execpathp = destp;
 		copyout(imgp->execpath, (void *)destp, execpath_len);
 	}
 
 	/*
 	 * Prepare the canary for SSP.
 	 */
 	arc4rand(canary, sizeof(canary), 0);
 	destp -= sizeof(canary);
 	imgp->canary = destp;
 	copyout(canary, (void *)destp, sizeof(canary));
 	imgp->canarylen = sizeof(canary);
 
 	/*
 	 * Prepare the pagesizes array.
 	 */
 	destp -= szps;
 	destp = rounddown2(destp, sizeof(void *));
 	imgp->pagesizes = destp;
 	copyout(pagesizes, (void *)destp, szps);
 	imgp->pagesizeslen = szps;
 
 	/* Reserve room for the strings: ARG_MAX minus the unused space. */
 	destp -= ARG_MAX - imgp->args->stringspace;
 	destp = rounddown2(destp, sizeof(void *));
 
 	/*
 	 * If we have a valid auxargs ptr, prepare some room
 	 * on the stack.
 	 */
 	if (imgp->auxargs) {
 		/*
 		 * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for
 		 * lower compatibility.
 		 */
 		imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size :
 		    (AT_COUNT * 2);
 		/*
 		 * The '+ 2' is for the null pointers at the end of each of
 		 * the arg and env vector sets, and imgp->auxarg_size is room
 		 * for the arguments of the runtime loader.
 		 */
 		vectp = (char **)(destp - (imgp->args->argc +
 		    imgp->args->envc + 2 + imgp->auxarg_size)
 		    * sizeof(char *));
 	} else {
 		/*
 		 * The '+ 2' is for the null pointers at the end of each of
 		 * the arg and env vector sets
 		 */
 		vectp = (char **)(destp - (imgp->args->argc + imgp->args->envc
 		    + 2) * sizeof(char *));
 	}
 
 	/*
 	 * vectp also becomes our initial stack base
 	 */
 	stack_base = (register_t *)vectp;
 
 	stringp = imgp->args->begin_argv;
 	argc = imgp->args->argc;
 	envc = imgp->args->envc;
 
 	/*
 	 * Copy out strings - arguments and environment.
 	 */
 	copyout(stringp, (void *)destp, ARG_MAX - imgp->args->stringspace);
 
 	/*
 	 * Fill in "ps_strings" struct for ps, w, etc.
 	 */
 	suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp);
 	suword32(&arginfo->ps_nargvstr, argc);
 
 	/*
 	 * Fill in argument portion of vector table.
 	 *
 	 * stringp walks the kernel copy of the strings to find each
 	 * terminator while destp advances in step over the user copy, so
 	 * destp is the user address of the next string at each iteration.
 	 */
 	for (; argc > 0; --argc) {
 		suword(vectp++, (long)(intptr_t)destp);
 		while (*stringp++ != 0)
 			destp++;
 		destp++;
 	}
 
 	/* a null vector table pointer separates the argp's from the envp's */
 	suword(vectp++, 0);
 
 	suword(&arginfo->ps_envstr, (long)(intptr_t)vectp);
 	suword32(&arginfo->ps_nenvstr, envc);
 
 	/*
 	 * Fill in environment portion of vector table.
 	 */
 	for (; envc > 0; --envc) {
 		suword(vectp++, (long)(intptr_t)destp);
 		while (*stringp++ != 0)
 			destp++;
 		destp++;
 	}
 
 	/* end of vector table is a null pointer */
 	suword(vectp, 0);
 
 	return (stack_base);
 }
 
 /*
  * Check permissions of file to execute.
  *	Called with imgp->vp locked.
  *	Return 0 for success or error code on failure.
  *
  * The file attributes are fetched into imgp->attr as a side effect.  Errors
  * produced directly here are EACCES (noexec mount, no execute bit, or not a
  * regular file), ENOEXEC (empty file) and ETXTBSY (file open for writing);
  * errors from the VOP calls and the MAC check are passed through unchanged.
  * On success the vnode has been opened with VOP_OPEN() and imgp->opened is
  * set to 1.
  */
 int
 exec_check_permissions(struct image_params *imgp)
 {
 	struct vnode *vp = imgp->vp;
 	struct vattr *attr = imgp->attr;
 	struct thread *td;
 	int error, writecount;
 
 	td = curthread;
 
 	/* Get file attributes */
 	error = VOP_GETATTR(vp, attr, td->td_ucred);
 	if (error)
 		return (error);
 
 #ifdef MAC
 	error = mac_vnode_check_exec(td->td_ucred, imgp->vp, imgp);
 	if (error)
 		return (error);
 #endif
 
 	/*
 	 * 1) Check if file execution is disabled for the filesystem that
 	 *    this file resides on.
 	 * 2) Ensure that at least one execute bit is on. Otherwise, a
 	 *    privileged user will always succeed, and we don't want this
 	 *    to happen unless the file really is executable.
 	 * 3) Ensure that the file is a regular file.
 	 */
 	if ((vp->v_mount->mnt_flag & MNT_NOEXEC) ||
 	    (attr->va_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0 ||
 	    (attr->va_type != VREG))
 		return (EACCES);
 
 	/*
 	 * Zero length files can't be exec'd
 	 */
 	if (attr->va_size == 0)
 		return (ENOEXEC);
 
 	/*
 	 *  Check for execute permission to file based on current credentials.
 	 */
 	error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td);
 	if (error)
 		return (error);
 
 	/*
 	 * Check number of open-for-writes on the file and deny execution
 	 * if there are any.
 	 */
 	error = VOP_GET_WRITECOUNT(vp, &writecount);
 	if (error != 0)
 		return (error);
 	if (writecount != 0)
 		return (ETXTBSY);
 
 	/*
 	 * Call filesystem specific open routine (which does nothing in the
 	 * general case).
 	 */
 	error = VOP_OPEN(vp, FREAD, td->td_ucred, td, NULL);
 	/* Record the open so that it can be undone by the caller later. */
 	if (error == 0)
 		imgp->opened = 1;
 	return (error);
 }
 
 /*
  * Exec handler registration
  *
  * Appends 'execsw_arg' to the NULL-terminated global handler array
  * 'execsw'.  The array is never grown in place: a new array one slot larger
  * is allocated, the existing entries are copied over in order, and the old
  * array is freed.  Always returns 0.
  */
 int
 exec_register(const struct execsw *execsw_arg)
 {
 	const struct execsw **es, **xs, **newexecsw;
-	int count = 2;	/* New slot and trailing NULL */
+	u_int count = 2;	/* New slot and trailing NULL */
 
 	/* Count the handlers that are already registered. */
 	if (execsw)
 		for (es = execsw; *es; es++)
 			count++;
 	newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK);
 	xs = newexecsw;
 	/* Copy the existing handlers, then add the new one and the NULL. */
 	if (execsw)
 		for (es = execsw; *es; es++)
 			*xs++ = *es;
 	*xs++ = execsw_arg;
 	*xs = NULL;
 	if (execsw)
 		free(execsw, M_TEMP);
 	execsw = newexecsw;
 	return (0);
 }
 
 /*
  * Remove 'execsw_arg' from the NULL-terminated global handler array
  * 'execsw'.  Panics if no handler array exists and returns ENOENT if the
  * handler is not registered.  Otherwise a new array holding the remaining
  * handlers replaces the old one and 0 is returned.
  */
 int
 exec_unregister(const struct execsw *execsw_arg)
 {
 	const struct execsw **src, **dst, **newexecsw;
 	int count, found;
 
 	if (execsw == NULL)
 		panic("unregister with no handlers left?\n");
 
 	/* Look for the handler and size the new array (survivors + NULL). */
 	count = 1;
 	found = 0;
 	for (src = execsw; *src != NULL; src++) {
 		if (*src == execsw_arg)
 			found = 1;
 		else
 			count++;
 	}
 	if (!found)
 		return (ENOENT);
 
 	/* Copy everything except the handler being removed. */
 	newexecsw = malloc(count * sizeof(*src), M_TEMP, M_WAITOK);
 	dst = newexecsw;
 	for (src = execsw; *src != NULL; src++) {
 		if (*src == execsw_arg)
 			continue;
 		*dst++ = *src;
 	}
 	*dst = NULL;
 
 	free(execsw, M_TEMP);
 	execsw = newexecsw;
 	return (0);
 }
Index: head/sys/kern/link_elf_obj.c
===================================================================
--- head/sys/kern/link_elf_obj.c	(revision 328237)
+++ head/sys/kern/link_elf_obj.c	(revision 328238)
@@ -1,1531 +1,1531 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 1998-2000 Doug Rabson
  * Copyright (c) 2004 Peter Wemm
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/mount.h>
 #include <sys/proc.h>
 #include <sys/namei.h>
 #include <sys/fcntl.h>
 #include <sys/vnode.h>
 #include <sys/linker.h>
 
 #include <machine/elf.h>
 
 #include <net/vnet.h>
 
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_object.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 
 #include <sys/link_elf.h>
 
 #ifdef DDB_CTF
 #include <sys/zlib.h>
 #endif
 
 #include "linker_if.h"
 
 /*
  * One tracked program section of a module (SHT_PROGBITS, SHT_NOBITS and,
  * on amd64, SHT_X86_64_UNWIND sections are recorded in the progtab).
  */
 typedef struct {
 	void		*addr;	/* Where the section contents live */
 	Elf_Off		size;	/* Section size (from sh_size) */
 	int		flags;	/* NOTE(review): not set in the code seen here */
 	int		sec;	/* Original section */
 	char		*name;	/* Section name or a "<<...>>" placeholder */
 } Elf_progent;
 
 /* One SHT_REL relocation section of a module. */
 typedef struct {
 	Elf_Rel		*rel;	/* Relocation entries */
 	int		nrel;	/* Entry count (sh_size / sizeof(Elf_Rel)) */
 	int		sec;	/* Taken from the section's sh_info */
 } Elf_relent;
 
 /* One SHT_RELA relocation section of a module. */
 typedef struct {
 	Elf_Rela	*rela;	/* Relocation entries */
 	int		nrela;	/* Entry count (sh_size / sizeof(Elf_Rela)) */
 	int		sec;	/* Taken from the section's sh_info */
 } Elf_relaent;
 
 
 /*
  * Per-module state for this linker class.  The embedded linker_file must
  * stay the first member: the code casts between linker_file_t and
  * elf_file_t.
  */
 typedef struct elf_file {
 	struct linker_file lf;		/* Common fields */
 
 	int		preloaded;	/* Set to 1 by the preload path */
 	caddr_t		address;	/* Relocation address */
 	vm_object_t	object;		/* VM object to hold file pages */
 	Elf_Shdr	*e_shdr;	/* Section header table */
 
 	/* Program sections and the number of entries in the table. */
 	Elf_progent	*progtab;
-	int		nprogtab;
+	u_int		nprogtab;
 
 	/* SHT_RELA sections and the number of entries in the table. */
 	Elf_relaent	*relatab;
-	int		nrelatab;
+	u_int		nrelatab;
 
 	/* SHT_REL sections and the number of entries in the table. */
 	Elf_relent	*reltab;
 	int		nreltab;
 
 	Elf_Sym		*ddbsymtab;	/* The symbol table we are using */
 	long		ddbsymcnt;	/* Number of symbols */
 	caddr_t		ddbstrtab;	/* String table */
 	long		ddbstrcnt;	/* number of bytes in string table */
 
 	caddr_t		shstrtab;	/* Section name string table */
 	long		shstrcnt;	/* number of bytes in string table */
 
 	caddr_t		ctftab;		/* CTF table */
 	long		ctfcnt;		/* number of bytes in CTF table */
 	caddr_t		ctfoff;		/* CTF offset table */
 	caddr_t		typoff;		/* Type offset table */
 	long		typlen;		/* Number of type entries. */
 
 } *elf_file_t;
 
 #include <kern/kern_ctf.c>
 
 static int	link_elf_link_preload(linker_class_t cls,
 		    const char *, linker_file_t *);
 static int	link_elf_link_preload_finish(linker_file_t);
 static int	link_elf_load_file(linker_class_t, const char *, linker_file_t *);
 static int	link_elf_lookup_symbol(linker_file_t, const char *,
 		    c_linker_sym_t *);
 static int	link_elf_symbol_values(linker_file_t, c_linker_sym_t,
 		    linker_symval_t *);
 static int	link_elf_search_symbol(linker_file_t, caddr_t value,
 		    c_linker_sym_t *sym, long *diffp);
 
 static void	link_elf_unload_file(linker_file_t);
 static int	link_elf_lookup_set(linker_file_t, const char *,
 		    void ***, void ***, int *);
 static int	link_elf_each_function_name(linker_file_t,
 		    int (*)(const char *, void *), void *);
 static int	link_elf_each_function_nameval(linker_file_t,
 				linker_function_nameval_callback_t,
 				void *);
 static int	link_elf_reloc_local(linker_file_t);
 static long	link_elf_symtab_get(linker_file_t, const Elf_Sym **);
 static long	link_elf_strtab_get(linker_file_t, caddr_t *);
 
 static int	elf_obj_lookup(linker_file_t lf, Elf_Size symidx, int deps,
 		    Elf_Addr *);
 
 /*
  * kobj method table for this linker class: maps each linker interface
  * method to the implementation in this file.  The { 0, 0 } entry ends the
  * table.
  */
 static kobj_method_t link_elf_methods[] = {
 	KOBJMETHOD(linker_lookup_symbol,	link_elf_lookup_symbol),
 	KOBJMETHOD(linker_symbol_values,	link_elf_symbol_values),
 	KOBJMETHOD(linker_search_symbol,	link_elf_search_symbol),
 	KOBJMETHOD(linker_unload,		link_elf_unload_file),
 	KOBJMETHOD(linker_load_file,		link_elf_load_file),
 	KOBJMETHOD(linker_link_preload,		link_elf_link_preload),
 	KOBJMETHOD(linker_link_preload_finish,	link_elf_link_preload_finish),
 	KOBJMETHOD(linker_lookup_set,		link_elf_lookup_set),
 	KOBJMETHOD(linker_each_function_name,	link_elf_each_function_name),
 	KOBJMETHOD(linker_each_function_nameval, link_elf_each_function_nameval),
 	KOBJMETHOD(linker_ctf_get,		link_elf_ctf_get),
 	KOBJMETHOD(linker_symtab_get, 		link_elf_symtab_get),
 	KOBJMETHOD(linker_strtab_get, 		link_elf_strtab_get),
 	{ 0, 0 }
 };
 
 /*
  * The linker class itself.  Its name reflects the target ELF class (32 or
  * 64 bit); instances are sized to hold a struct elf_file.
  */
 static struct linker_class link_elf_class = {
 #if ELF_TARG_CLASS == ELFCLASS32
 	"elf32_obj",
 #else
 	"elf64_obj",
 #endif
 	link_elf_methods, sizeof(struct elf_file)
 };
 
 static int	relocate_file(elf_file_t ef);
 static void	elf_obj_cleanup_globals_cache(elf_file_t);
 
 /*
  * Print a "kldload:" diagnostic.  The message 's' is prefixed with the
  * file name when one is given.
  */
 static void
 link_elf_error(const char *filename, const char *s)
 {
 	if (filename != NULL) {
 		printf("kldload: %s: %s\n", filename, s);
 		return;
 	}
 	printf("kldload: %s\n", s);
 }
 
 /*
  * Register this linker class with the kernel linker.  Invoked through the
  * SYSINIT below; 'arg' is unused.
  */
 static void
 link_elf_init(void *arg)
 {
 
 	linker_add_class(&link_elf_class);
 }
 
 SYSINIT(link_elf_obj, SI_SUB_KLD, SI_ORDER_SECOND, link_elf_init, 0);
 
 /*
  * Create a linker file for a module that is described by preload metadata.
  *
  * The module is looked up by 'filename'; its type string, base address,
  * size, ELF header and section headers are taken from the metadata.  After
  * validating the header, the section headers are scanned twice: once to
  * size the progtab/reltab/relatab arrays and once to fill them in.  In
  * between, the saved sh_addr values are rebased onto the module's base
  * address.  Symbol values are adjusted to the final section addresses and
  * local relocations are applied.
  *
  * Returns 0 and stores the new file in '*result' on success.  ENOENT,
  * EFTYPE (bad type string) and EINVAL are returned before a linker file
  * exists; later failures unload the partially built file before returning
  * the error.
  */
 static int
 link_elf_link_preload(linker_class_t cls, const char *filename,
     linker_file_t *result)
 {
 	Elf_Ehdr *hdr;
 	Elf_Shdr *shdr;
 	Elf_Sym *es;
 	void *modptr, *baseptr, *sizeptr;
 	char *type;
 	elf_file_t ef;
 	linker_file_t lf;
 	Elf_Addr off;
 	int error, i, j, pb, ra, rl, shstrindex, symstrindex, symtabindex;
 
 	/* Look to see if we have the file preloaded */
 	modptr = preload_search_by_name(filename);
 	if (modptr == NULL)
 		return ENOENT;
 
 	type = (char *)preload_search_info(modptr, MODINFO_TYPE);
 	baseptr = preload_search_info(modptr, MODINFO_ADDR);
 	sizeptr = preload_search_info(modptr, MODINFO_SIZE);
 	hdr = (Elf_Ehdr *)preload_search_info(modptr, MODINFO_METADATA |
 	    MODINFOMD_ELFHDR);
 	shdr = (Elf_Shdr *)preload_search_info(modptr, MODINFO_METADATA |
 	    MODINFOMD_SHDR);
 	/* Accept both the word-size-qualified and the plain type string. */
 	if (type == NULL || (strcmp(type, "elf" __XSTRING(__ELF_WORD_SIZE)
 	    " obj module") != 0 &&
 	    strcmp(type, "elf obj module") != 0)) {
 		return (EFTYPE);
 	}
 	if (baseptr == NULL || sizeptr == NULL || hdr == NULL ||
 	    shdr == NULL)
 		return (EINVAL);
 
 	lf = linker_make_file(filename, &link_elf_class);
 	if (lf == NULL)
 		return (ENOMEM);
 
 	ef = (elf_file_t)lf;
 	ef->preloaded = 1;
 	ef->address = *(caddr_t *)baseptr;
 	lf->address = *(caddr_t *)baseptr;
 	lf->size = *(size_t *)sizeptr;
 
 	/* Only relocatable objects for this target are accepted. */
 	if (hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS ||
 	    hdr->e_ident[EI_DATA] != ELF_TARG_DATA ||
 	    hdr->e_ident[EI_VERSION] != EV_CURRENT ||
 	    hdr->e_version != EV_CURRENT ||
 	    hdr->e_type != ET_REL ||
 	    hdr->e_machine != ELF_TARG_MACH) {
 		error = EFTYPE;
 		goto out;
 	}
 	ef->e_shdr = shdr;
 
 	/* Scan the section header for information and table sizing. */
 	symtabindex = -1;
 	symstrindex = -1;
 	for (i = 0; i < hdr->e_shnum; i++) {
 		switch (shdr[i].sh_type) {
 		case SHT_PROGBITS:
 		case SHT_NOBITS:
 #ifdef __amd64__
 		case SHT_X86_64_UNWIND:
 #endif
 			/* Ignore sections not loaded by the loader. */
 			if (shdr[i].sh_addr == 0)
 				break;
 			ef->nprogtab++;
 			break;
 		case SHT_SYMTAB:
 			symtabindex = i;
 			symstrindex = shdr[i].sh_link;
 			break;
 		case SHT_REL:
 			ef->nreltab++;
 			break;
 		case SHT_RELA:
 			ef->nrelatab++;
 			break;
 		}
 	}
 
 	/*
 	 * Require at least one program section, a symbol table whose
 	 * linked string table is valid, and a valid section name string
 	 * table.
 	 */
 	shstrindex = hdr->e_shstrndx;
 	if (ef->nprogtab == 0 || symstrindex < 0 ||
 	    symstrindex >= hdr->e_shnum ||
 	    shdr[symstrindex].sh_type != SHT_STRTAB || shstrindex == 0 ||
 	    shstrindex >= hdr->e_shnum ||
 	    shdr[shstrindex].sh_type != SHT_STRTAB) {
 		printf("%s: bad/missing section headers\n", filename);
 		error = ENOEXEC;
 		goto out;
 	}
 
 	/* Allocate space for tracking the load chunks */
 	if (ef->nprogtab != 0)
 		ef->progtab = malloc(ef->nprogtab * sizeof(*ef->progtab),
 		    M_LINKER, M_WAITOK | M_ZERO);
 	if (ef->nreltab != 0)
 		ef->reltab = malloc(ef->nreltab * sizeof(*ef->reltab),
 		    M_LINKER, M_WAITOK | M_ZERO);
 	if (ef->nrelatab != 0)
 		ef->relatab = malloc(ef->nrelatab * sizeof(*ef->relatab),
 		    M_LINKER, M_WAITOK | M_ZERO);
 	if ((ef->nprogtab != 0 && ef->progtab == NULL) ||
 	    (ef->nreltab != 0 && ef->reltab == NULL) ||
 	    (ef->nrelatab != 0 && ef->relatab == NULL)) {
 		error = ENOMEM;
 		goto out;
 	}
 
 	/* XXX, relocate the sh_addr fields saved by the loader. */
 	/* 'off' becomes the lowest non-zero sh_addr among all sections. */
 	off = 0;
 	for (i = 0; i < hdr->e_shnum; i++) {
 		if (shdr[i].sh_addr != 0 && (off == 0 || shdr[i].sh_addr < off))
 			off = shdr[i].sh_addr;
 	}
 	/* Rebase every loaded section so the lowest one is at ef->address. */
 	for (i = 0; i < hdr->e_shnum; i++) {
 		if (shdr[i].sh_addr != 0)
 			shdr[i].sh_addr = shdr[i].sh_addr - off +
 			    (Elf_Addr)ef->address;
 	}
 
 	ef->ddbsymcnt = shdr[symtabindex].sh_size / sizeof(Elf_Sym);
 	ef->ddbsymtab = (Elf_Sym *)shdr[symtabindex].sh_addr;
 	ef->ddbstrcnt = shdr[symstrindex].sh_size;
 	ef->ddbstrtab = (char *)shdr[symstrindex].sh_addr;
 	ef->shstrcnt = shdr[shstrindex].sh_size;
 	ef->shstrtab = (char *)shdr[shstrindex].sh_addr;
 
 	/* Now fill out progtab and the relocation tables. */
 	pb = 0;
 	rl = 0;
 	ra = 0;
 	for (i = 0; i < hdr->e_shnum; i++) {
 		switch (shdr[i].sh_type) {
 		case SHT_PROGBITS:
 		case SHT_NOBITS:
 #ifdef __amd64__
 		case SHT_X86_64_UNWIND:
 #endif
 			if (shdr[i].sh_addr == 0)
 				break;
 			ef->progtab[pb].addr = (void *)shdr[i].sh_addr;
 			/* Placeholder name, replaced below if one exists. */
 			if (shdr[i].sh_type == SHT_PROGBITS)
 				ef->progtab[pb].name = "<<PROGBITS>>";
 #ifdef __amd64__
 			else if (shdr[i].sh_type == SHT_X86_64_UNWIND)
 				ef->progtab[pb].name = "<<UNWIND>>";
 #endif
 			else
 				ef->progtab[pb].name = "<<NOBITS>>";
 			ef->progtab[pb].size = shdr[i].sh_size;
 			ef->progtab[pb].sec = i;
 			if (ef->shstrtab && shdr[i].sh_name != 0)
 				ef->progtab[pb].name =
 				    ef->shstrtab + shdr[i].sh_name;
 			/*
 			 * The DPCPU and (with VIMAGE) VNET sets are copied
 			 * into separately allocated storage, which then
 			 * becomes the section's address.
 			 */
 			if (ef->progtab[pb].name != NULL && 
 			    !strcmp(ef->progtab[pb].name, DPCPU_SETNAME)) {
 				void *dpcpu;
 
 				dpcpu = dpcpu_alloc(shdr[i].sh_size);
 				if (dpcpu == NULL) {
 					error = ENOSPC;
 					goto out;
 				}
 				memcpy(dpcpu, ef->progtab[pb].addr,
 				    ef->progtab[pb].size);
 				dpcpu_copy(dpcpu, shdr[i].sh_size);
 				ef->progtab[pb].addr = dpcpu;
 #ifdef VIMAGE
 			} else if (ef->progtab[pb].name != NULL &&
 			    !strcmp(ef->progtab[pb].name, VNET_SETNAME)) {
 				void *vnet_data;
 
 				vnet_data = vnet_data_alloc(shdr[i].sh_size);
 				if (vnet_data == NULL) {
 					error = ENOSPC;
 					goto out;
 				}
 				memcpy(vnet_data, ef->progtab[pb].addr,
 				    ef->progtab[pb].size);
 				vnet_data_copy(vnet_data, shdr[i].sh_size);
 				ef->progtab[pb].addr = vnet_data;
 #endif
 			} else if (ef->progtab[pb].name != NULL &&
 			    !strcmp(ef->progtab[pb].name, ".ctors")) {
 				/* Remembered so the ctors can be run later. */
 				lf->ctors_addr = ef->progtab[pb].addr;
 				lf->ctors_size = shdr[i].sh_size;
 			}
 
 			/* Update all symbol values with the offset. */
 			for (j = 0; j < ef->ddbsymcnt; j++) {
 				es = &ef->ddbsymtab[j];
 				if (es->st_shndx != i)
 					continue;
 				es->st_value += (Elf_Addr)ef->progtab[pb].addr;
 			}
 			pb++;
 			break;
 		case SHT_REL:
 			ef->reltab[rl].rel = (Elf_Rel *)shdr[i].sh_addr;
 			ef->reltab[rl].nrel = shdr[i].sh_size / sizeof(Elf_Rel);
 			ef->reltab[rl].sec = shdr[i].sh_info;
 			rl++;
 			break;
 		case SHT_RELA:
 			ef->relatab[ra].rela = (Elf_Rela *)shdr[i].sh_addr;
 			ef->relatab[ra].nrela =
 			    shdr[i].sh_size / sizeof(Elf_Rela);
 			ef->relatab[ra].sec = shdr[i].sh_info;
 			ra++;
 			break;
 		}
 	}
 	/* Both passes must have seen the same number of each section kind. */
 	if (pb != ef->nprogtab) {
 		printf("%s: lost progbits\n", filename);
 		error = ENOEXEC;
 		goto out;
 	}
 	if (rl != ef->nreltab) {
 		printf("%s: lost reltab\n", filename);
 		error = ENOEXEC;
 		goto out;
 	}
 	if (ra != ef->nrelatab) {
 		printf("%s: lost relatab\n", filename);
 		error = ENOEXEC;
 		goto out;
 	}
 
 	/* Local intra-module relocations */
 	error = link_elf_reloc_local(lf);
 	if (error != 0)
 		goto out;
 
 	*result = lf;
 	return (0);
 
 out:
 	/* preload not done this way */
 	linker_file_unload(lf, LINKER_UNLOAD_FORCE);
 	return (error);
 }
 
 /*
  * Treat the 'size' bytes at 'addr' as an array of constructor function
  * pointers and call every non-NULL entry in ascending order.  Does nothing
  * when 'addr' is NULL or 'size' is 0.
  */
 static void
 link_elf_invoke_ctors(caddr_t addr, size_t size)
 {
 	void (**fn)(void);
 	void (**end)(void);
 
 	if (addr == NULL || size == 0)
 		return;
 	fn = (void *)addr;
 	end = fn + size / sizeof(*fn);
 	while (fn < end) {
 		if (*fn != NULL)
 			(**fn)();
 		fn++;
 	}
 }
 
 /*
  * Second stage of preloading: relocate the file, let the machine-dependent
  * code know about the module, then run the constructors recorded in
  * lf->ctors_addr/lf->ctors_size.  The first failing step's error is
  * returned and the remaining steps are skipped; 0 is returned on success.
  */
 static int
 link_elf_link_preload_finish(linker_file_t lf)
 {
 	int error;
 
 	error = relocate_file((elf_file_t)lf);
 
 	/* Notify MD code that a module is being loaded. */
 	if (error == 0)
 		error = elf_cpu_load_file(lf);
 
 	/* Invoke .ctors */
 	if (error == 0)
 		link_elf_invoke_ctors(lf->ctors_addr, lf->ctors_size);
 
 	return (error);
 }
 
 static int
 link_elf_load_file(linker_class_t cls, const char *filename,
     linker_file_t *result)
 {
 	struct nameidata *nd;
 	struct thread *td = curthread;	/* XXX */
 	Elf_Ehdr *hdr;
 	Elf_Shdr *shdr;
 	Elf_Sym *es;
 	int nbytes, i, j;
 	vm_offset_t mapbase;
 	size_t mapsize;
 	int error = 0;
 	ssize_t resid;
 	int flags;
 	elf_file_t ef;
 	linker_file_t lf;
 	int symtabindex;
 	int symstrindex;
 	int shstrindex;
 	int nsym;
 	int pb, rl, ra;
 	int alignmask;
 
 	shdr = NULL;
 	lf = NULL;
 	mapsize = 0;
 	hdr = NULL;
 
 	nd = malloc(sizeof(struct nameidata), M_TEMP, M_WAITOK);
 	NDINIT(nd, LOOKUP, FOLLOW, UIO_SYSSPACE, filename, td);
 	flags = FREAD;
 	error = vn_open(nd, &flags, 0, NULL);
 	if (error) {
 		free(nd, M_TEMP);
 		return error;
 	}
 	NDFREE(nd, NDF_ONLY_PNBUF);
 	if (nd->ni_vp->v_type != VREG) {
 		error = ENOEXEC;
 		goto out;
 	}
 #ifdef MAC
 	error = mac_kld_check_load(td->td_ucred, nd->ni_vp);
 	if (error) {
 		goto out;
 	}
 #endif
 
 	/* Read the elf header from the file. */
 	hdr = malloc(sizeof(*hdr), M_LINKER, M_WAITOK);
 	error = vn_rdwr(UIO_READ, nd->ni_vp, (void *)hdr, sizeof(*hdr), 0,
 	    UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED,
 	    &resid, td);
 	if (error)
 		goto out;
 	if (resid != 0){
 		error = ENOEXEC;
 		goto out;
 	}
 
 	if (!IS_ELF(*hdr)) {
 		error = ENOEXEC;
 		goto out;
 	}
 
 	if (hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS
 	    || hdr->e_ident[EI_DATA] != ELF_TARG_DATA) {
 		link_elf_error(filename, "Unsupported file layout");
 		error = ENOEXEC;
 		goto out;
 	}
 	if (hdr->e_ident[EI_VERSION] != EV_CURRENT
 	    || hdr->e_version != EV_CURRENT) {
 		link_elf_error(filename, "Unsupported file version");
 		error = ENOEXEC;
 		goto out;
 	}
 	if (hdr->e_type != ET_REL) {
 		error = ENOSYS;
 		goto out;
 	}
 	if (hdr->e_machine != ELF_TARG_MACH) {
 		link_elf_error(filename, "Unsupported machine");
 		error = ENOEXEC;
 		goto out;
 	}
 
 	lf = linker_make_file(filename, &link_elf_class);
 	if (!lf) {
 		error = ENOMEM;
 		goto out;
 	}
 	ef = (elf_file_t) lf;
 	ef->nprogtab = 0;
 	ef->e_shdr = 0;
 	ef->nreltab = 0;
 	ef->nrelatab = 0;
 
 	/* Allocate and read in the section header */
 	nbytes = hdr->e_shnum * hdr->e_shentsize;
 	if (nbytes == 0 || hdr->e_shoff == 0 ||
 	    hdr->e_shentsize != sizeof(Elf_Shdr)) {
 		error = ENOEXEC;
 		goto out;
 	}
 	shdr = malloc(nbytes, M_LINKER, M_WAITOK);
 	ef->e_shdr = shdr;
 	error = vn_rdwr(UIO_READ, nd->ni_vp, (caddr_t)shdr, nbytes,
 	    hdr->e_shoff, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred,
 	    NOCRED, &resid, td);
 	if (error)
 		goto out;
 	if (resid) {
 		error = ENOEXEC;
 		goto out;
 	}
 
 	/* Scan the section header for information and table sizing. */
 	nsym = 0;
 	symtabindex = -1;
 	symstrindex = -1;
 	for (i = 0; i < hdr->e_shnum; i++) {
 		if (shdr[i].sh_size == 0)
 			continue;
 		switch (shdr[i].sh_type) {
 		case SHT_PROGBITS:
 		case SHT_NOBITS:
 #ifdef __amd64__
 		case SHT_X86_64_UNWIND:
 #endif
 			if ((shdr[i].sh_flags & SHF_ALLOC) == 0)
 				break;
 			ef->nprogtab++;
 			break;
 		case SHT_SYMTAB:
 			nsym++;
 			symtabindex = i;
 			symstrindex = shdr[i].sh_link;
 			break;
 		case SHT_REL:
 			ef->nreltab++;
 			break;
 		case SHT_RELA:
 			ef->nrelatab++;
 			break;
 		case SHT_STRTAB:
 			break;
 		}
 	}
 	if (ef->nprogtab == 0) {
 		link_elf_error(filename, "file has no contents");
 		error = ENOEXEC;
 		goto out;
 	}
 	if (nsym != 1) {
 		/* Only allow one symbol table for now */
 		link_elf_error(filename, "file has no valid symbol table");
 		error = ENOEXEC;
 		goto out;
 	}
 	if (symstrindex < 0 || symstrindex > hdr->e_shnum ||
 	    shdr[symstrindex].sh_type != SHT_STRTAB) {
 		link_elf_error(filename, "file has invalid symbol strings");
 		error = ENOEXEC;
 		goto out;
 	}
 
 	/* Allocate space for tracking the load chunks */
 	if (ef->nprogtab != 0)
 		ef->progtab = malloc(ef->nprogtab * sizeof(*ef->progtab),
 		    M_LINKER, M_WAITOK | M_ZERO);
 	if (ef->nreltab != 0)
 		ef->reltab = malloc(ef->nreltab * sizeof(*ef->reltab),
 		    M_LINKER, M_WAITOK | M_ZERO);
 	if (ef->nrelatab != 0)
 		ef->relatab = malloc(ef->nrelatab * sizeof(*ef->relatab),
 		    M_LINKER, M_WAITOK | M_ZERO);
 
 	if (symtabindex == -1) {
 		link_elf_error(filename, "lost symbol table index");
 		error = ENOEXEC;
 		goto out;
 	}
 	/* Allocate space for and load the symbol table */
 	ef->ddbsymcnt = shdr[symtabindex].sh_size / sizeof(Elf_Sym);
 	ef->ddbsymtab = malloc(shdr[symtabindex].sh_size, M_LINKER, M_WAITOK);
 	error = vn_rdwr(UIO_READ, nd->ni_vp, (void *)ef->ddbsymtab,
 	    shdr[symtabindex].sh_size, shdr[symtabindex].sh_offset,
 	    UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED,
 	    &resid, td);
 	if (error)
 		goto out;
 	if (resid != 0){
 		error = EINVAL;
 		goto out;
 	}
 
 	if (symstrindex == -1) {
 		link_elf_error(filename, "lost symbol string index");
 		error = ENOEXEC;
 		goto out;
 	}
 	/* Allocate space for and load the symbol strings */
 	ef->ddbstrcnt = shdr[symstrindex].sh_size;
 	ef->ddbstrtab = malloc(shdr[symstrindex].sh_size, M_LINKER, M_WAITOK);
 	error = vn_rdwr(UIO_READ, nd->ni_vp, ef->ddbstrtab,
 	    shdr[symstrindex].sh_size, shdr[symstrindex].sh_offset,
 	    UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED,
 	    &resid, td);
 	if (error)
 		goto out;
 	if (resid != 0){
 		error = EINVAL;
 		goto out;
 	}
 
 	/* Do we have a string table for the section names?  */
 	shstrindex = -1;
 	if (hdr->e_shstrndx != 0 &&
 	    shdr[hdr->e_shstrndx].sh_type == SHT_STRTAB) {
 		shstrindex = hdr->e_shstrndx;
 		ef->shstrcnt = shdr[shstrindex].sh_size;
 		ef->shstrtab = malloc(shdr[shstrindex].sh_size, M_LINKER,
 		    M_WAITOK);
 		error = vn_rdwr(UIO_READ, nd->ni_vp, ef->shstrtab,
 		    shdr[shstrindex].sh_size, shdr[shstrindex].sh_offset,
 		    UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED,
 		    &resid, td);
 		if (error)
 			goto out;
 		if (resid != 0){
 			error = EINVAL;
 			goto out;
 		}
 	}
 
 	/* Size up code/data(progbits) and bss(nobits). */
 	alignmask = 0;
 	for (i = 0; i < hdr->e_shnum; i++) {
 		if (shdr[i].sh_size == 0)
 			continue;
 		switch (shdr[i].sh_type) {
 		case SHT_PROGBITS:
 		case SHT_NOBITS:
 #ifdef __amd64__
 		case SHT_X86_64_UNWIND:
 #endif
 			if ((shdr[i].sh_flags & SHF_ALLOC) == 0)
 				break;
 			alignmask = shdr[i].sh_addralign - 1;
 			mapsize += alignmask;
 			mapsize &= ~alignmask;
 			mapsize += shdr[i].sh_size;
 			break;
 		}
 	}
 
 	/*
 	 * We know how much space we need for the text/data/bss/etc.
 	 * This stuff needs to be in a single chunk so that profiling etc
 	 * can get the bounds and gdb can associate offsets with modules
 	 */
 	ef->object = vm_object_allocate(OBJT_DEFAULT,
 	    round_page(mapsize) >> PAGE_SHIFT);
 	if (ef->object == NULL) {
 		error = ENOMEM;
 		goto out;
 	}
 	ef->address = (caddr_t) vm_map_min(kernel_map);
 
 	/*
 	 * In order to satisfy amd64's architectural requirements on the
 	 * location of code and data in the kernel's address space, request a
 	 * mapping that is above the kernel.  
 	 */
 #ifdef __amd64__
 	mapbase = KERNBASE;
 #else
 	mapbase = VM_MIN_KERNEL_ADDRESS;
 #endif
 	error = vm_map_find(kernel_map, ef->object, 0, &mapbase,
 	    round_page(mapsize), 0, VMFS_OPTIMAL_SPACE, VM_PROT_ALL,
 	    VM_PROT_ALL, 0);
 	if (error) {
 		vm_object_deallocate(ef->object);
 		ef->object = 0;
 		goto out;
 	}
 
 	/* Wire the pages */
 	error = vm_map_wire(kernel_map, mapbase,
 	    mapbase + round_page(mapsize),
 	    VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES);
 	if (error != KERN_SUCCESS) {
 		error = ENOMEM;
 		goto out;
 	}
 
 	/* Inform the kld system about the situation */
 	lf->address = ef->address = (caddr_t)mapbase;
 	lf->size = mapsize;
 
 	/*
 	 * Now load code/data(progbits), zero bss(nobits), allocate space for
 	 * and load relocs
 	 */
 	pb = 0;
 	rl = 0;
 	ra = 0;
 	alignmask = 0;
 	for (i = 0; i < hdr->e_shnum; i++) {
 		if (shdr[i].sh_size == 0)
 			continue;
 		switch (shdr[i].sh_type) {
 		case SHT_PROGBITS:
 		case SHT_NOBITS:
 #ifdef __amd64__
 		case SHT_X86_64_UNWIND:
 #endif
 			if ((shdr[i].sh_flags & SHF_ALLOC) == 0)
 				break;
 			alignmask = shdr[i].sh_addralign - 1;
 			mapbase += alignmask;
 			mapbase &= ~alignmask;
 			if (ef->shstrtab != NULL && shdr[i].sh_name != 0) {
 				ef->progtab[pb].name =
 				    ef->shstrtab + shdr[i].sh_name;
 				if (!strcmp(ef->progtab[pb].name, ".ctors")) {
 					lf->ctors_addr = (caddr_t)mapbase;
 					lf->ctors_size = shdr[i].sh_size;
 				}
 			} else if (shdr[i].sh_type == SHT_PROGBITS)
 				ef->progtab[pb].name = "<<PROGBITS>>";
 #ifdef __amd64__
 			else if (shdr[i].sh_type == SHT_X86_64_UNWIND)
 				ef->progtab[pb].name = "<<UNWIND>>";
 #endif
 			else
 				ef->progtab[pb].name = "<<NOBITS>>";
 			if (ef->progtab[pb].name != NULL && 
 			    !strcmp(ef->progtab[pb].name, DPCPU_SETNAME))
 				ef->progtab[pb].addr =
 				    dpcpu_alloc(shdr[i].sh_size);
 #ifdef VIMAGE
 			else if (ef->progtab[pb].name != NULL &&
 			    !strcmp(ef->progtab[pb].name, VNET_SETNAME))
 				ef->progtab[pb].addr =
 				    vnet_data_alloc(shdr[i].sh_size);
 #endif
 			else
 				ef->progtab[pb].addr =
 				    (void *)(uintptr_t)mapbase;
 			if (ef->progtab[pb].addr == NULL) {
 				error = ENOSPC;
 				goto out;
 			}
 			ef->progtab[pb].size = shdr[i].sh_size;
 			ef->progtab[pb].sec = i;
 			if (shdr[i].sh_type == SHT_PROGBITS
 #ifdef __amd64__
 			    || shdr[i].sh_type == SHT_X86_64_UNWIND
 #endif
 			    ) {
 				error = vn_rdwr(UIO_READ, nd->ni_vp,
 				    ef->progtab[pb].addr,
 				    shdr[i].sh_size, shdr[i].sh_offset,
 				    UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred,
 				    NOCRED, &resid, td);
 				if (error)
 					goto out;
 				if (resid != 0){
 					error = EINVAL;
 					goto out;
 				}
 				/* Initialize the per-cpu or vnet area. */
 				if (ef->progtab[pb].addr != (void *)mapbase &&
 				    !strcmp(ef->progtab[pb].name, DPCPU_SETNAME))
 					dpcpu_copy(ef->progtab[pb].addr,
 					    shdr[i].sh_size);
 #ifdef VIMAGE
 				else if (ef->progtab[pb].addr !=
 				    (void *)mapbase &&
 				    !strcmp(ef->progtab[pb].name, VNET_SETNAME))
 					vnet_data_copy(ef->progtab[pb].addr,
 					    shdr[i].sh_size);
 #endif
 			} else
 				bzero(ef->progtab[pb].addr, shdr[i].sh_size);
 
 			/* Update all symbol values with the offset. */
 			for (j = 0; j < ef->ddbsymcnt; j++) {
 				es = &ef->ddbsymtab[j];
 				if (es->st_shndx != i)
 					continue;
 				es->st_value += (Elf_Addr)ef->progtab[pb].addr;
 			}
 			mapbase += shdr[i].sh_size;
 			pb++;
 			break;
 		case SHT_REL:
 			ef->reltab[rl].rel = malloc(shdr[i].sh_size, M_LINKER,
 			    M_WAITOK);
 			ef->reltab[rl].nrel = shdr[i].sh_size / sizeof(Elf_Rel);
 			ef->reltab[rl].sec = shdr[i].sh_info;
 			error = vn_rdwr(UIO_READ, nd->ni_vp,
 			    (void *)ef->reltab[rl].rel,
 			    shdr[i].sh_size, shdr[i].sh_offset,
 			    UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED,
 			    &resid, td);
 			if (error)
 				goto out;
 			if (resid != 0){
 				error = EINVAL;
 				goto out;
 			}
 			rl++;
 			break;
 		case SHT_RELA:
 			ef->relatab[ra].rela = malloc(shdr[i].sh_size, M_LINKER,
 			    M_WAITOK);
 			ef->relatab[ra].nrela =
 			    shdr[i].sh_size / sizeof(Elf_Rela);
 			ef->relatab[ra].sec = shdr[i].sh_info;
 			error = vn_rdwr(UIO_READ, nd->ni_vp,
 			    (void *)ef->relatab[ra].rela,
 			    shdr[i].sh_size, shdr[i].sh_offset,
 			    UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED,
 			    &resid, td);
 			if (error)
 				goto out;
 			if (resid != 0){
 				error = EINVAL;
 				goto out;
 			}
 			ra++;
 			break;
 		}
 	}
 	if (pb != ef->nprogtab) {
 		link_elf_error(filename, "lost progbits");
 		error = ENOEXEC;
 		goto out;
 	}
 	if (rl != ef->nreltab) {
 		link_elf_error(filename, "lost reltab");
 		error = ENOEXEC;
 		goto out;
 	}
 	if (ra != ef->nrelatab) {
 		link_elf_error(filename, "lost relatab");
 		error = ENOEXEC;
 		goto out;
 	}
 	if (mapbase != (vm_offset_t)ef->address + mapsize) {
 		printf(
 		    "%s: mapbase 0x%lx != address %p + mapsize 0x%lx (0x%lx)\n",
 		    filename != NULL ? filename : "<none>",
 		    (u_long)mapbase, ef->address, (u_long)mapsize,
 		    (u_long)(vm_offset_t)ef->address + mapsize);
 		error = ENOMEM;
 		goto out;
 	}
 
 	/* Local intra-module relocations */
 	error = link_elf_reloc_local(lf);
 	if (error != 0)
 		goto out;
 
 	/* Pull in dependencies */
 	VOP_UNLOCK(nd->ni_vp, 0);
 	error = linker_load_dependencies(lf);
 	vn_lock(nd->ni_vp, LK_EXCLUSIVE | LK_RETRY);
 	if (error)
 		goto out;
 
 	/* External relocations */
 	error = relocate_file(ef);
 	if (error)
 		goto out;
 
 	/* Notify MD code that a module is being loaded. */
 	error = elf_cpu_load_file(lf);
 	if (error)
 		goto out;
 
 	/* Invoke .ctors */
 	link_elf_invoke_ctors(lf->ctors_addr, lf->ctors_size);
 
 	*result = lf;
 
 out:
 	VOP_UNLOCK(nd->ni_vp, 0);
 	vn_close(nd->ni_vp, FREAD, td->td_ucred, td);
 	free(nd, M_TEMP);
 	if (error && lf)
 		linker_file_unload(lf, LINKER_UNLOAD_FORCE);
 	free(hdr, M_LINKER);
 
 	return error;
 }
 
 /*
  * Tear down an object-format linker file: notify the MD layer, release
  * per-CPU and (with VIMAGE) vnet section storage, free the relocation,
  * section and symbol bookkeeping, and remove the image mapping from
  * kernel_map.  Preloaded files take a shorter path that only frees the
  * bookkeeping tables.
  */
 static void
 link_elf_unload_file(linker_file_t file)
 {
 	elf_file_t ef = (elf_file_t) file;
-	int i;
+	u_int i;
 
 	/* Notify MD code that a module is being unloaded. */
 	elf_cpu_unload_file(file);
 
 	if (ef->progtab) {
 		/*
 		 * Only entries named DPCPU_SETNAME or VNET_SETNAME are
 		 * released here (via dpcpu_free()/vnet_data_free());
 		 * empty or unnamed entries are skipped.
 		 */
 		for (i = 0; i < ef->nprogtab; i++) {
 			if (ef->progtab[i].size == 0)
 				continue;
 			if (ef->progtab[i].name == NULL)
 				continue;
 			if (!strcmp(ef->progtab[i].name, DPCPU_SETNAME))
 				dpcpu_free(ef->progtab[i].addr,
 				    ef->progtab[i].size);
 #ifdef VIMAGE
 			else if (!strcmp(ef->progtab[i].name, VNET_SETNAME))
 				vnet_data_free(ef->progtab[i].addr,
 				    ef->progtab[i].size);
 #endif
 		}
 	}
 	if (ef->preloaded) {
 		/*
 		 * Preloaded file: free the table arrays and CTF data, drop
 		 * the preload name and return early.  The per-entry
 		 * relocation buffers, the mapping and the symbol tables
 		 * are not touched on this path.
 		 */
 		free(ef->reltab, M_LINKER);
 		free(ef->relatab, M_LINKER);
 		free(ef->progtab, M_LINKER);
 		free(ef->ctftab, M_LINKER);
 		free(ef->ctfoff, M_LINKER);
 		free(ef->typoff, M_LINKER);
 		if (file->filename != NULL)
 			preload_delete_name(file->filename);
 		/* XXX reclaim module memory? */
 		return;
 	}
 
 	/* Free each table entry's relocation buffer, then the arrays. */
 	for (i = 0; i < ef->nreltab; i++)
 		free(ef->reltab[i].rel, M_LINKER);
 	for (i = 0; i < ef->nrelatab; i++)
 		free(ef->relatab[i].rela, M_LINKER);
 	free(ef->reltab, M_LINKER);
 	free(ef->relatab, M_LINKER);
 	free(ef->progtab, M_LINKER);
 
 	if (ef->object) {
 		/*
 		 * Remove the range starting at ef->address whose length is
 		 * the object's size (in pages) converted to bytes.
 		 */
 		vm_map_remove(kernel_map, (vm_offset_t) ef->address,
 		    (vm_offset_t) ef->address +
 		    (ef->object->size << PAGE_SHIFT));
 	}
 	free(ef->e_shdr, M_LINKER);
 	free(ef->ddbsymtab, M_LINKER);
 	free(ef->ddbstrtab, M_LINKER);
 	free(ef->shstrtab, M_LINKER);
 	free(ef->ctftab, M_LINKER);
 	free(ef->ctfoff, M_LINKER);
 	free(ef->typoff, M_LINKER);
 }
 
 /*
  * Return the name of the symbol referenced by a relocation's r_info
  * word, or NULL when the relocation carries symbol index zero.
  */
 static const char *
 symbol_name(elf_file_t ef, Elf_Size r_info)
 {
 	const Elf_Sym *ref;
 
 	if (ELF_R_SYM(r_info) == 0)
 		return (NULL);
 
 	ref = &ef->ddbsymtab[ELF_R_SYM(r_info)];
 	return (&ef->ddbstrtab[ref->st_name]);
 }
 
 /*
  * Return the load address recorded in the progtab entry whose section
  * number equals 'sec', or 0 when no entry matches.
  */
 static Elf_Addr
 findbase(elf_file_t ef, int sec)
 {
 	int idx;
 
 	for (idx = 0; idx < ef->nprogtab; idx++) {
 		if (ef->progtab[idx].sec == sec)
 			return ((Elf_Addr)ef->progtab[idx].addr);
 	}
 	return (0);
 }
 
 /*
  * Apply every relocation whose symbol does not have local binding.
  *
  * Returns 0 on success, ENOEXEC when a relocation table or the base of
  * its target section is missing, and ENOENT when elf_reloc() reports a
  * failure for a relocation.
  */
 static int
 relocate_file(elf_file_t ef)
 {
 	const Elf_Rel *rel, *rel_end;
 	const Elf_Rela *rela, *rela_end;
 	Elf_Size symidx;
 	Elf_Addr secbase;
 	int tab;
 
 	/* Tables of relocations without addend. */
 	for (tab = 0; tab < ef->nreltab; tab++) {
 		rel = ef->reltab[tab].rel;
 		if (rel == NULL) {
 			link_elf_error(ef->lf.filename, "lost a reltab!");
 			return (ENOEXEC);
 		}
 		rel_end = rel + ef->reltab[tab].nrel;
 		secbase = findbase(ef, ef->reltab[tab].sec);
 		if (secbase == 0) {
 			link_elf_error(ef->lf.filename, "lost base for reltab");
 			return (ENOEXEC);
 		}
 		while (rel < rel_end) {
 			symidx = ELF_R_SYM(rel->r_info);
 			/*
 			 * Out-of-range indices are ignored and local
 			 * symbols were handled earlier; only the remaining
 			 * relocations are passed to elf_reloc().
 			 */
 			if (symidx < ef->ddbsymcnt &&
 			    ELF_ST_BIND(ef->ddbsymtab[symidx].st_info) !=
 			    STB_LOCAL &&
 			    elf_reloc(&ef->lf, secbase, rel, ELF_RELOC_REL,
 			    elf_obj_lookup)) {
 				printf("link_elf_obj: symbol %s undefined\n",
 				    symbol_name(ef, rel->r_info));
 				return (ENOENT);
 			}
 			rel++;
 		}
 	}
 
 	/* Tables of relocations with addend. */
 	for (tab = 0; tab < ef->nrelatab; tab++) {
 		rela = ef->relatab[tab].rela;
 		if (rela == NULL) {
 			link_elf_error(ef->lf.filename, "lost a relatab!");
 			return (ENOEXEC);
 		}
 		rela_end = rela + ef->relatab[tab].nrela;
 		secbase = findbase(ef, ef->relatab[tab].sec);
 		if (secbase == 0) {
 			link_elf_error(ef->lf.filename,
 			    "lost base for relatab");
 			return (ENOEXEC);
 		}
 		while (rela < rela_end) {
 			symidx = ELF_R_SYM(rela->r_info);
 			if (symidx < ef->ddbsymcnt &&
 			    ELF_ST_BIND(ef->ddbsymtab[symidx].st_info) !=
 			    STB_LOCAL &&
 			    elf_reloc(&ef->lf, secbase, rela, ELF_RELOC_RELA,
 			    elf_obj_lookup)) {
 				printf("link_elf_obj: symbol %s undefined\n",
 				    symbol_name(ef, rela->r_info));
 				return (ENOENT);
 			}
 			rela++;
 		}
 	}
 
 	/*
 	 * The SHN_FBSD_CACHED markers are cleared only when every
 	 * relocation succeeded.  On the error returns above the symbol
 	 * table may have been modified, but it is not rolled back.
 	 */
 	elf_obj_cleanup_globals_cache(ef);
 
 	return (0);
 }
 
 /*
  * Linear search of the symbol table for the first defined symbol (section
  * index other than SHN_UNDEF) called 'name'.  Stores it in *sym and
  * returns 0, or returns ENOENT when there is none.
  */
 static int
 link_elf_lookup_symbol(linker_file_t lf, const char *name, c_linker_sym_t *sym)
 {
 	elf_file_t ef = (elf_file_t) lf;
 	const Elf_Sym *cur;
 	int idx;
 
 	for (idx = 0; idx < ef->ddbsymcnt; idx++) {
 		cur = &ef->ddbsymtab[idx];
 		if (cur->st_shndx == SHN_UNDEF)
 			continue;
 		if (strcmp(name, ef->ddbstrtab + cur->st_name) != 0)
 			continue;
 		*sym = (c_linker_sym_t) cur;
 		return (0);
 	}
 	return (ENOENT);
 }
 
 /*
  * Fill in *symval (name, value, size) for a symbol handle.  The handle
  * must point into this file's symbol table; otherwise ENOENT is
  * returned and *symval is left alone.
  */
 static int
 link_elf_symbol_values(linker_file_t lf, c_linker_sym_t sym,
     linker_symval_t *symval)
 {
 	elf_file_t ef = (elf_file_t) lf;
 	const Elf_Sym *es = (const Elf_Sym *) sym;
 
 	/* Reject handles that lie outside [ddbsymtab, ddbsymtab + cnt). */
 	if (es < ef->ddbsymtab || es >= ef->ddbsymtab + ef->ddbsymcnt)
 		return (ENOENT);
 
 	symval->name = ef->ddbstrtab + es->st_name;
 	symval->value = (caddr_t)es->st_value;
 	symval->size = es->st_size;
 	return (0);
 }
 
 /*
  * Find the symbol with a non-zero name offset whose value is nearest to,
  * without exceeding, 'value'.  On equal distance the later table entry
  * wins, except that an improvement to distance zero ends the scan at
  * once.  *sym receives the match (or NULL) and *diffp the distance (or
  * 'value' itself when nothing matched).  Always returns 0.
  */
 static int
 link_elf_search_symbol(linker_file_t lf, caddr_t value,
     c_linker_sym_t *sym, long *diffp)
 {
 	elf_file_t ef = (elf_file_t) lf;
 	u_long off = (uintptr_t) (void *) value;
 	u_long diff = off;
 	u_long delta;
 	const Elf_Sym *es;
 	const Elf_Sym *best = NULL;
 	int i;
 
 	for (i = 0; i < ef->ddbsymcnt; i++) {
 		es = &ef->ddbsymtab[i];
 		if (es->st_name == 0)
 			continue;
 		/* Symbols located above the address are never candidates. */
 		if ((u_long)es->st_value > off)
 			continue;
 		delta = off - (u_long)es->st_value;
 		if (delta > diff)
 			continue;
 		best = es;
 		if (delta < diff) {
 			diff = delta;
 			if (diff == 0)
 				break;
 		}
 	}
 	*diffp = (best == NULL) ? off : diff;
 	*sym = (c_linker_sym_t) best;
 
 	return (0);
 }
 
 /*
  * Locate the linker set 'name', i.e. the progtab entry called
  * "set_<name>".  Each of startp, stopp and countp may be NULL; the ones
  * that are not receive the set's first element, one-past-last element
  * and element count (in pointer-sized units).  Returns 0 when found and
  * ESRCH otherwise.
  */
 static int
 link_elf_lookup_set(linker_file_t lf, const char *name,
     void ***startp, void ***stopp, int *countp)
 {
 	elf_file_t ef = (elf_file_t)lf;
 	void **first, **last;
 	int i;
 
 	for (i = 0; i < ef->nprogtab; i++) {
 		if (strncmp(ef->progtab[i].name, "set_", 4) != 0)
 			continue;
 		if (strcmp(ef->progtab[i].name + 4, name) != 0)
 			continue;
 
 		first = (void **)ef->progtab[i].addr;
 		last = (void **)((char *)ef->progtab[i].addr +
 		    ef->progtab[i].size);
 		if (startp != NULL)
 			*startp = first;
 		if (stopp != NULL)
 			*stopp = last;
 		if (countp != NULL)
 			*countp = last - first;
 		return (0);
 	}
 	return (ESRCH);
 }
 
 /*
  * Call 'callback' with the name of every STT_FUNC symbol that has a
  * non-zero value.  Iteration stops at the first non-zero callback
  * result, which is returned; otherwise 0 is returned.
  */
 static int
 link_elf_each_function_name(linker_file_t file,
     int (*callback)(const char *, void *), void *opaque)
 {
 	elf_file_t ef = (elf_file_t)file;
 	const Elf_Sym *cur;
 	int idx, rv;
 
 	for (idx = 0; idx < ef->ddbsymcnt; idx++) {
 		cur = &ef->ddbsymtab[idx];
 		if (cur->st_value == 0 ||
 		    ELF_ST_TYPE(cur->st_info) != STT_FUNC)
 			continue;
 		rv = callback(ef->ddbstrtab + cur->st_name, opaque);
 		if (rv != 0)
 			return (rv);
 	}
 	return (0);
 }
 
 /*
  * Call 'callback' with the table index and resolved name/value/size of
  * every STT_FUNC symbol that has a non-zero value.  The first non-zero
  * result from either link_elf_symbol_values() or the callback ends the
  * walk and is returned; otherwise 0 is returned.
  */
 static int
 link_elf_each_function_nameval(linker_file_t file,
     linker_function_nameval_callback_t callback, void *opaque)
 {
 	linker_symval_t symval;
 	elf_file_t ef = (elf_file_t)file;
 	const Elf_Sym *cur;
 	int idx, rv;
 
 	for (idx = 0; idx < ef->ddbsymcnt; idx++) {
 		cur = &ef->ddbsymtab[idx];
 		if (cur->st_value == 0 ||
 		    ELF_ST_TYPE(cur->st_info) != STT_FUNC)
 			continue;
 		rv = link_elf_symbol_values(file, (c_linker_sym_t) cur,
 		    &symval);
 		if (rv != 0)
 			return (rv);
 		rv = callback(file, idx, &symval, opaque);
 		if (rv != 0)
 			return (rv);
 	}
 	return (0);
 }
 
 /*
  * Reset every symbol marked SHN_FBSD_CACHED back to an undefined symbol
  * (SHN_UNDEF) with a zero value.
  */
 static void
 elf_obj_cleanup_globals_cache(elf_file_t ef)
 {
 	Elf_Sym *cur;
 	Elf_Size idx;
 
 	for (idx = 0; idx < ef->ddbsymcnt; idx++) {
 		cur = &ef->ddbsymtab[idx];
 		if (cur->st_shndx != SHN_FBSD_CACHED)
 			continue;
 		cur->st_shndx = SHN_UNDEF;
 		cur->st_value = 0;
 	}
 }
 
 /*
  * Resolve a symbol by its index in the symbol table, as needed when
  * processing relocations.  Symbols defined in this file are answered
  * straight from the table; undefined global or weak symbols are looked
  * up by name through linker_file_lookup_symbol().
  *
  * Returns 0 with the address in *res on success and EINVAL on failure.
  * An unresolved weak symbol counts as success with *res set to 0.
  */
 static int
 elf_obj_lookup(linker_file_t lf, Elf_Size symidx, int deps, Elf_Addr *res)
 {
 	elf_file_t ef = (elf_file_t)lf;
 	Elf_Sym *sym;
 	const char *symname;
 	Elf_Addr addr;
 	int bind;
 
 	/* A bogus index is not worth looking up. */
 	if (symidx >= ef->ddbsymcnt) {
 		*res = 0;
 		return (EINVAL);
 	}
 	sym = &ef->ddbsymtab[symidx];
 
 	/* A definition is already present in the table. */
 	if (sym->st_shndx != SHN_UNDEF) {
 		*res = sym->st_value;
 		return (0);
 	}
 
 	/* From here on the symbol is undefined and has to be searched. */
 	bind = ELF_ST_BIND(sym->st_info);
 	if (bind == STB_LOCAL) {
 		/* An undefined local symbol cannot be resolved. */
 		*res = 0;
 		return (EINVAL);
 	}
 	if (bind != STB_GLOBAL && bind != STB_WEAK)
 		return (EINVAL);
 
 	/* An empty name forces a lookup failure. */
 	symname = ef->ddbstrtab + sym->st_name;
 	if (*symname == 0) {
 		*res = 0;
 		return (EINVAL);
 	}
 	addr = (Elf_Addr)linker_file_lookup_symbol(lf, symname, deps);
 
 	/*
 	 * Remember successful global lookups in the symbol table itself so
 	 * that later relocations against the same symbol are answered by
 	 * the quick path above instead of another name lookup.  The
 	 * SHN_FBSD_CACHED marker is undone afterwards by
 	 * elf_obj_cleanup_globals_cache().
 	 */
 	if (addr != 0) {
 		sym->st_shndx = SHN_FBSD_CACHED;
 		sym->st_value = addr;
 		*res = addr;
 		return (0);
 	}
 	if (bind == STB_WEAK) {
 		sym->st_value = 0;
 		*res = 0;
 		return (0);
 	}
 	return (EINVAL);
 }
 
 /*
  * Give values to undefined "__start_<set>" and "__stop_<set>" symbols.
  * For each such symbol whose <set> equals the name of a progtab entry,
  * the symbol value becomes the entry's start address (or its end
  * address for a stop symbol) and st_shndx is set to the progtab index.
  * Symbols with no matching entry are left untouched.
  */
 static void
 link_elf_fix_link_set(elf_file_t ef)
 {
 	static const char startn[] = "__start_";
 	static const char stopn[] = "__stop_";
 	Elf_Sym *sym;
 	const char *sym_name, *linkset_name;
 	Elf_Addr setaddr;
 	Elf_Size symidx;
 	int is_start, pi;
 
 	/* The zero entry of the symbol table is special; start at one. */
 	for (symidx = 1; symidx < ef->ddbsymcnt; symidx++) {
 		sym = &ef->ddbsymtab[symidx];
 		if (sym->st_shndx != SHN_UNDEF)
 			continue;
 
 		sym_name = ef->ddbstrtab + sym->st_name;
 		if (strncmp(sym_name, startn, sizeof(startn) - 1) == 0) {
 			is_start = 1;
 			linkset_name = sym_name + sizeof(startn) - 1;
 		} else if (strncmp(sym_name, stopn, sizeof(stopn) - 1) == 0) {
 			is_start = 0;
 			linkset_name = sym_name + sizeof(stopn) - 1;
 		} else
 			continue;
 
 		/* Look for the progtab entry carrying the set's name. */
 		for (pi = 0; pi < ef->nprogtab; pi++) {
 			if (strcmp(ef->progtab[pi].name, linkset_name) == 0)
 				break;
 		}
 		if (pi == ef->nprogtab)
 			continue;
 
 		setaddr = (Elf_Addr)ef->progtab[pi].addr;
 		if (!is_start)
 			setaddr += ef->progtab[pi].size;
 		sym->st_value = setaddr;
 		sym->st_shndx = pi;
 	}
 }
 
 /*
  * Resolve the link-set start/stop symbols and then apply every
  * relocation whose symbol has local binding.
  *
  * Returns 0 on success, or ENOEXEC when a relocation table is missing
  * or the base address of its target section cannot be found.  The
  * result of elf_reloc_local() itself is not checked.
  */
 static int
 link_elf_reloc_local(linker_file_t lf)
 {
 	elf_file_t ef = (elf_file_t)lf;
 	const Elf_Rel *rellim;
 	const Elf_Rel *rel;
 	const Elf_Rela *relalim;
 	const Elf_Rela *rela;
 	const Elf_Sym *sym;
 	Elf_Addr base;
 	int i;
 	Elf_Size symidx;
 
 	link_elf_fix_link_set(ef);
 
 	/* Perform relocations without addend if there are any: */
 	for (i = 0; i < ef->nreltab; i++) {
 		rel = ef->reltab[i].rel;
 		if (rel == NULL) {
 			link_elf_error(ef->lf.filename, "lost a reltab");
 			return (ENOEXEC);
 		}
 		rellim = rel + ef->reltab[i].nrel;
 		base = findbase(ef, ef->reltab[i].sec);
 		if (base == 0) {
 			link_elf_error(ef->lf.filename, "lost base for reltab");
 			return (ENOEXEC);
 		}
 		for ( ; rel < rellim; rel++) {
 			symidx = ELF_R_SYM(rel->r_info);
 			/* Ignore relocations with an out-of-range symbol. */
 			if (symidx >= ef->ddbsymcnt)
 				continue;
 			sym = ef->ddbsymtab + symidx;
 			/* Only do local relocs */
 			if (ELF_ST_BIND(sym->st_info) != STB_LOCAL)
 				continue;
 			elf_reloc_local(lf, base, rel, ELF_RELOC_REL,
 			    elf_obj_lookup);
 		}
 	}
 
 	/* Perform relocations with addend if there are any: */
 	for (i = 0; i < ef->nrelatab; i++) {
 		rela = ef->relatab[i].rela;
 		if (rela == NULL) {
 			link_elf_error(ef->lf.filename, "lost a relatab!");
 			return (ENOEXEC);
 		}
 		relalim = rela + ef->relatab[i].nrela;
 		base = findbase(ef, ef->relatab[i].sec);
 		if (base == 0) {
 			/* Name the RELA table so the two failures differ. */
 			link_elf_error(ef->lf.filename,
 			    "lost base for relatab");
 			return (ENOEXEC);
 		}
 		for ( ; rela < relalim; rela++) {
 			symidx = ELF_R_SYM(rela->r_info);
 			/* Ignore relocations with an out-of-range symbol. */
 			if (symidx >= ef->ddbsymcnt)
 				continue;
 			sym = ef->ddbsymtab + symidx;
 			/* Only do local relocs */
 			if (ELF_ST_BIND(sym->st_info) != STB_LOCAL)
 				continue;
 			elf_reloc_local(lf, base, rela, ELF_RELOC_RELA,
 			    elf_obj_lookup);
 		}
 	}
 	return (0);
 }
 
 /*
  * Hand out the file's symbol table through *symtab and return the number
  * of symbols in it, or 0 when there is no table.
  */
 static long
 link_elf_symtab_get(linker_file_t lf, const Elf_Sym **symtab)
 {
 	elf_file_t ef = (elf_file_t)lf;
 
 	*symtab = ef->ddbsymtab;
 	return (*symtab == NULL ? 0 : ef->ddbsymcnt);
 }
     
 /*
  * Hand out the file's symbol string table through *strtab and return
  * its recorded size (ddbstrcnt), or 0 when there is no table.
  */
 static long
 link_elf_strtab_get(linker_file_t lf, caddr_t *strtab)
 {
 	elf_file_t ef = (elf_file_t)lf;
 
 	*strtab = ef->ddbstrtab;
 	return (*strtab == NULL ? 0 : ef->ddbstrcnt);
 }
Index: head/sys/kern/subr_hash.c
===================================================================
--- head/sys/kern/subr_hash.c	(revision 328237)
+++ head/sys/kern/subr_hash.c	(revision 328238)
@@ -1,154 +1,152 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kern_subr.c	8.3 (Berkeley) 1/21/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 
 /*
  * Map hash-table allocation flags to malloc flags: M_NOWAIT when
  * HASH_NOWAIT is set, M_WAITOK otherwise.
  */
 static __inline int
 hash_mflags(int flags)
 {
 
 	if ((flags & HASH_NOWAIT) != 0)
 		return (M_NOWAIT);
 	return (M_WAITOK);
 }
 
 /*
  * General routine to allocate a hash table with control of memory flags.
  *
  * The table gets the largest power-of-two number of buckets that does not
  * exceed 'elements' (which must be positive).  On success every bucket is
  * initialized as an empty list, *hashmask is set to the bucket count minus
  * one and the table is returned.  If the allocation yields NULL, NULL is
  * returned and *hashmask is not written.
  */
 void *
 hashinit_flags(int elements, struct malloc_type *type, u_long *hashmask,
     int flags)
 {
-	long hashsize;
+	long hashsize, i;
 	LIST_HEAD(generic, generic) *hashtbl;
-	int i;
 
 	KASSERT(elements > 0, ("%s: bad elements", __func__));
 	/* Exactly one of HASH_WAITOK and HASH_NOWAIT must be set. */
 	KASSERT((flags & HASH_WAITOK) ^ (flags & HASH_NOWAIT),
 	    ("Bad flags (0x%x) passed to hashinit_flags", flags));
 
 	/*
 	 * Double until the size exceeds 'elements', then step back once to
 	 * land on the largest power of two that is <= elements.
 	 */
 	for (hashsize = 1; hashsize <= elements; hashsize <<= 1)
 		continue;
 	hashsize >>= 1;
 
 	hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), type,
 	    hash_mflags(flags));
 	if (hashtbl != NULL) {
 		for (i = 0; i < hashsize; i++)
 			LIST_INIT(&hashtbl[i]);
 		/* Size is a power of two, so size - 1 serves as a mask. */
 		*hashmask = hashsize - 1;
 	}
 	return (hashtbl);
 }
 
 /*
  * Convenience wrapper around hashinit_flags() that always passes
  * HASH_WAITOK, so the allocation may sleep.
  */
 void *
 hashinit(int elements, struct malloc_type *type, u_long *hashmask)
 {
 	void *tbl;
 
 	tbl = hashinit_flags(elements, type, hashmask, HASH_WAITOK);
 	return (tbl);
 }
 
 /*
  * Free a hash table.  Buckets 0 through 'hashmask' inclusive are
  * asserted (KASSERT) to be empty before the memory is released.
  */
 void
 hashdestroy(void *vhashtbl, struct malloc_type *type, u_long hashmask)
 {
 	LIST_HEAD(generic, generic) *hashtbl;
 	u_long bucket;
 
 	hashtbl = vhashtbl;
 	for (bucket = 0; bucket <= hashmask; bucket++)
 		KASSERT(LIST_EMPTY(&hashtbl[bucket]),
 		    ("%s: hashtbl %p not empty "
 		    "(malloc type %s)", __func__, hashtbl,
 		    type->ks_shortdesc));
 	free(hashtbl, type);
 }
 
 /*
  * Candidate table sizes in ascending order, consulted by
  * phashinit_flags() when picking the number of buckets.  NPRIMES is the
  * number of entries in the table.
  */
 static const int primes[] = { 1, 13, 31, 61, 127, 251, 509, 761, 1021, 1531,
 			2039, 2557, 3067, 3583, 4093, 4603, 5119, 5623, 6143,
 			6653, 7159, 7673, 8191, 12281, 16381, 24571, 32749 };
 #define	NPRIMES nitems(primes)
 
 /*
  * General routine to allocate a prime number sized hash table with control of
  * memory flags.
  *
  * The bucket count is the largest entry of primes[] that does not exceed
  * 'elements'; primes[0] is used when 'elements' is smaller than primes[1],
  * and the last entry caps the size.  On success every bucket is
  * initialized as an empty list, *nentries is set to the bucket count and
  * the table is returned.  If the allocation yields NULL, NULL is returned
  * and *nentries is not written.
  */
 void *
 phashinit_flags(int elements, struct malloc_type *type, u_long *nentries, int flags)
 {
-	long hashsize;
+	long hashsize, i;
 	LIST_HEAD(generic, generic) *hashtbl;
-	int i;
 
 	KASSERT(elements > 0, ("%s: bad elements", __func__));
 	/* Exactly one of HASH_WAITOK and HASH_NOWAIT must be set. */
 	KASSERT((flags & HASH_WAITOK) ^ (flags & HASH_NOWAIT),
 	    ("Bad flags (0x%x) passed to phashinit_flags", flags));
 
 	/*
 	 * Advance i until primes[i] exceeds 'elements' or the table is
 	 * exhausted; primes[i - 1] is then the size to use.
 	 */
 	for (i = 1, hashsize = primes[1]; hashsize <= elements;) {
 		i++;
 		if (i == NPRIMES)
 			break;
 		hashsize = primes[i];
 	}
 	hashsize = primes[i - 1];
 
 	hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), type,
 	    hash_mflags(flags));
 	if (hashtbl == NULL)
 		return (NULL);
 
 	for (i = 0; i < hashsize; i++)
 		LIST_INIT(&hashtbl[i]);
 	/* Unlike hashinit_flags(), this reports a count, not a mask. */
 	*nentries = hashsize;
 	return (hashtbl);
 }
 
 /*
  * Convenience wrapper around phashinit_flags() that always passes
  * HASH_WAITOK, so the allocation may sleep.
  */
 void *
 phashinit(int elements, struct malloc_type *type, u_long *nentries)
 {
 	void *tbl;
 
 	tbl = phashinit_flags(elements, type, nentries, HASH_WAITOK);
 	return (tbl);
 }
Index: head/sys/kern/uipc_usrreq.c
===================================================================
--- head/sys/kern/uipc_usrreq.c	(revision 328237)
+++ head/sys/kern/uipc_usrreq.c	(revision 328238)
@@ -1,2599 +1,2599 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1989, 1991, 1993
  *	The Regents of the University of California.
  * Copyright (c) 2004-2009 Robert N. M. Watson
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	From: @(#)uipc_usrreq.c	8.3 (Berkeley) 1/4/94
  */
 
 /*
  * UNIX Domain (Local) Sockets
  *
  * This is an implementation of UNIX (local) domain sockets.  Each socket has
  * an associated struct unpcb (UNIX protocol control block).  Stream sockets
  * may be connected to 0 or 1 other socket.  Datagram sockets may be
  * connected to 0, 1, or many other sockets.  Sockets may be created and
  * connected in pairs (socketpair(2)), or bound/connected to using the file
  * system name space.  For most purposes, only the receive socket buffer is
  * used, as sending on one socket delivers directly to the receive socket
  * buffer of a second socket.
  *
  * The implementation is substantially complicated by the fact that
  * "ancillary data", such as file descriptors or credentials, may be passed
  * across UNIX domain sockets.  The potential for passing UNIX domain sockets
  * over other UNIX domain sockets requires the implementation of a simple
  * garbage collector to find and tear down cycles of disconnected sockets.
  *
  * TODO:
  *	RDM
  *	rethink name space problems
  *	need a proper out-of-band
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 
 #include <sys/param.h>
 #include <sys/capsicum.h>
 #include <sys/domain.h>
 #include <sys/fcntl.h>
 #include <sys/malloc.h>		/* XXX must be before <sys/file.h> */
 #include <sys/eventhandler.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mbuf.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/queue.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/signalvar.h>
 #include <sys/stat.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/taskqueue.h>
 #include <sys/un.h>
 #include <sys/unpcb.h>
 #include <sys/vnode.h>
 
 #include <net/vnet.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 
 #include <security/mac/mac_framework.h>
 
 #include <vm/uma.h>
 
 MALLOC_DECLARE(M_FILECAPS);
 
 /*
  * Locking key:
  * (l)	Locked using list lock
  * (g)	Locked using linkage lock
  */
 
 static uma_zone_t	unp_zone;
 static unp_gen_t	unp_gencnt;	/* (l) */
 static u_int		unp_count;	/* (l) Count of local sockets. */
 static ino_t		unp_ino;	/* Prototype for fake inode numbers. */
 static int		unp_rights;	/* (g) File descriptors in flight. */
 static struct unp_head	unp_shead;	/* (l) List of stream sockets. */
 static struct unp_head	unp_dhead;	/* (l) List of datagram sockets. */
 static struct unp_head	unp_sphead;	/* (l) List of seqpacket sockets. */
 
 struct unp_defer {
 	SLIST_ENTRY(unp_defer) ud_link;
 	struct file *ud_fp;
 };
 static SLIST_HEAD(, unp_defer) unp_defers;
 static int unp_defers_count;
 
 static const struct sockaddr	sun_noname = { sizeof(sun_noname), AF_LOCAL };
 
 /*
  * Garbage collection of cyclic file descriptor/socket references occurs
  * asynchronously in a taskqueue context in order to avoid recursion and
  * reentrance in the UNIX domain socket, file descriptor, and socket layer
  * code.  See unp_gc() for a full description.
  */
 static struct timeout_task unp_gc_task;
 
 /*
  * The close of unix domain sockets attached as SCM_RIGHTS is
  * postponed to the taskqueue, to avoid arbitrary recursion depth.
  * The attached sockets might have other sockets attached.
  */
 static struct task	unp_defer_task;
 
 /*
  * Both send and receive buffers are allocated PIPSIZ bytes of buffering for
  * stream sockets, although the total for sender and receiver is actually
  * only PIPSIZ.
  *
  * Datagram sockets really use the sendspace as the maximum datagram size,
  * and don't really want to reserve the sendspace.  Their recvspace should be
  * large enough for at least one max-size datagram plus address.
  */
 #ifndef PIPSIZ
 #define	PIPSIZ	8192
 #endif
 static u_long	unpst_sendspace = PIPSIZ;
 static u_long	unpst_recvspace = PIPSIZ;
 static u_long	unpdg_sendspace = 2*1024;	/* really max datagram size */
 static u_long	unpdg_recvspace = 4*1024;
 static u_long	unpsp_sendspace = PIPSIZ;	/* really max datagram size */
 static u_long	unpsp_recvspace = PIPSIZ;
 
 static SYSCTL_NODE(_net, PF_LOCAL, local, CTLFLAG_RW, 0, "Local domain");
 static SYSCTL_NODE(_net_local, SOCK_STREAM, stream, CTLFLAG_RW, 0,
     "SOCK_STREAM");
 static SYSCTL_NODE(_net_local, SOCK_DGRAM, dgram, CTLFLAG_RW, 0, "SOCK_DGRAM");
 static SYSCTL_NODE(_net_local, SOCK_SEQPACKET, seqpacket, CTLFLAG_RW, 0,
     "SOCK_SEQPACKET");
 
 SYSCTL_ULONG(_net_local_stream, OID_AUTO, sendspace, CTLFLAG_RW,
 	   &unpst_sendspace, 0, "Default stream send space.");
 SYSCTL_ULONG(_net_local_stream, OID_AUTO, recvspace, CTLFLAG_RW,
 	   &unpst_recvspace, 0, "Default stream receive space.");
 SYSCTL_ULONG(_net_local_dgram, OID_AUTO, maxdgram, CTLFLAG_RW,
 	   &unpdg_sendspace, 0, "Default datagram send space.");
 SYSCTL_ULONG(_net_local_dgram, OID_AUTO, recvspace, CTLFLAG_RW,
 	   &unpdg_recvspace, 0, "Default datagram receive space.");
 SYSCTL_ULONG(_net_local_seqpacket, OID_AUTO, maxseqpacket, CTLFLAG_RW,
 	   &unpsp_sendspace, 0, "Default seqpacket send space.");
 SYSCTL_ULONG(_net_local_seqpacket, OID_AUTO, recvspace, CTLFLAG_RW,
 	   &unpsp_recvspace, 0, "Default seqpacket receive space.");
 SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD, &unp_rights, 0,
     "File descriptors in flight.");
 SYSCTL_INT(_net_local, OID_AUTO, deferred, CTLFLAG_RD,
     &unp_defers_count, 0,
     "File descriptors deferred to taskqueue for close.");
 
 /*
  * Locking and synchronization:
  *
  * Two types of locks exist in the local domain socket implementation: a
  * global linkage rwlock and per-unpcb mutexes.  The linkage lock protects
  * the socket count, global generation number, stream/datagram global lists and
  * interconnection of unpcbs, the v_socket and unp_vnode pointers, and can be
  * held exclusively over the acquisition of multiple unpcb locks to prevent
  * deadlock.
  *
  * UNIX domain sockets each have an unpcb hung off of their so_pcb pointer,
  * allocated in pru_attach() and freed in pru_detach().  The validity of that
  * pointer is an invariant, so no lock is required to dereference the so_pcb
  * pointer if a valid socket reference is held by the caller.  In practice,
  * this is always true during operations performed on a socket.  Each unpcb
  * has a back-pointer to its socket, unp_socket, which will be stable under
  * the same circumstances.
  *
  * This pointer may only be safely dereferenced as long as a valid reference
  * to the unpcb is held.  Typically, this reference will be from the socket,
  * or from another unpcb when the referring unpcb's lock is held (in order
  * that the reference not be invalidated during use).  For example, to follow
  * unp->unp_conn->unp_socket, you need to hold the lock on unp, not unp_conn,
  * as unp_socket remains valid as long as the reference to unp_conn is valid.
  *
  * Fields of unpcbs are locked using a per-unpcb lock, unp_mtx.  Individual
  * atomic reads without the lock may be performed "lockless", but more
  * complex reads and read-modify-writes require the mutex to be held.  No
  * lock order is defined between unpcb locks -- multiple unpcb locks may be
  * acquired at the same time only when holding the linkage rwlock
  * exclusively, which prevents deadlocks.
  *
  * Blocking with UNIX domain sockets is a tricky issue: unlike most network
  * protocols, bind() is a non-atomic operation, and connect() requires
  * potential sleeping in the protocol, due to potentially waiting on local or
  * distributed file systems.  We try to separate "lookup" operations, which
  * may sleep, and the IPC operations themselves, which typically can occur
  * with relative atomicity as locks can be held over the entire operation.
  *
  * Another tricky issue is simultaneous multi-threaded or multi-process
  * access to a single UNIX domain socket.  These are handled by the flags
  * UNP_CONNECTING and UNP_BINDING, which prevent concurrent connecting or
  * binding, both of which involve dropping UNIX domain socket locks in order
  * to perform namei() and other file system operations.
  */
 static struct rwlock	unp_link_rwlock;
 static struct mtx	unp_defers_lock;
 
 #define	UNP_LINK_LOCK_INIT()		rw_init(&unp_link_rwlock,	\
 					    "unp_link_rwlock")
 
 #define	UNP_LINK_LOCK_ASSERT()	rw_assert(&unp_link_rwlock,	\
 					    RA_LOCKED)
 #define	UNP_LINK_UNLOCK_ASSERT()	rw_assert(&unp_link_rwlock,	\
 					    RA_UNLOCKED)
 
 #define	UNP_LINK_RLOCK()		rw_rlock(&unp_link_rwlock)
 #define	UNP_LINK_RUNLOCK()		rw_runlock(&unp_link_rwlock)
 #define	UNP_LINK_WLOCK()		rw_wlock(&unp_link_rwlock)
 #define	UNP_LINK_WUNLOCK()		rw_wunlock(&unp_link_rwlock)
 #define	UNP_LINK_WLOCK_ASSERT()		rw_assert(&unp_link_rwlock,	\
 					    RA_WLOCKED)
 #define	UNP_LINK_WOWNED()		rw_wowned(&unp_link_rwlock)
 
 #define	UNP_DEFERRED_LOCK_INIT()	mtx_init(&unp_defers_lock, \
 					    "unp_defer", NULL, MTX_DEF)
 #define	UNP_DEFERRED_LOCK()		mtx_lock(&unp_defers_lock)
 #define	UNP_DEFERRED_UNLOCK()		mtx_unlock(&unp_defers_lock)
 
 #define UNP_PCB_LOCK_INIT(unp)		mtx_init(&(unp)->unp_mtx,	\
 					    "unp_mtx", "unp_mtx",	\
 					    MTX_DUPOK|MTX_DEF|MTX_RECURSE)
 #define	UNP_PCB_LOCK_DESTROY(unp)	mtx_destroy(&(unp)->unp_mtx)
 #define	UNP_PCB_LOCK(unp)		mtx_lock(&(unp)->unp_mtx)
 #define	UNP_PCB_UNLOCK(unp)		mtx_unlock(&(unp)->unp_mtx)
 #define	UNP_PCB_LOCK_ASSERT(unp)	mtx_assert(&(unp)->unp_mtx, MA_OWNED)
 
 static int	uipc_connect2(struct socket *, struct socket *);
 static int	uipc_ctloutput(struct socket *, struct sockopt *);
 static int	unp_connect(struct socket *, struct sockaddr *,
 		    struct thread *);
 static int	unp_connectat(int, struct socket *, struct sockaddr *,
 		    struct thread *);
 static int	unp_connect2(struct socket *so, struct socket *so2, int);
 static void	unp_disconnect(struct unpcb *unp, struct unpcb *unp2);
 static void	unp_dispose(struct socket *so);
 static void	unp_dispose_mbuf(struct mbuf *);
 static void	unp_shutdown(struct unpcb *);
 static void	unp_drop(struct unpcb *);
 static void	unp_gc(__unused void *, int);
 static void	unp_scan(struct mbuf *, void (*)(struct filedescent **, int));
 static void	unp_discard(struct file *);
 static void	unp_freerights(struct filedescent **, int);
 static void	unp_init(void);
 static int	unp_internalize(struct mbuf **, struct thread *);
 static void	unp_internalize_fp(struct file *);
 static int	unp_externalize(struct mbuf *, struct mbuf **, int);
 static int	unp_externalize_fp(struct file *);
 static struct mbuf	*unp_addsockcred(struct thread *, struct mbuf *);
 static void	unp_process_defers(void * __unused, int);
 
 /*
  * Definitions of protocols supported in the LOCAL domain.
  */
 static struct domain localdomain;
 static struct pr_usrreqs uipc_usrreqs_dgram, uipc_usrreqs_stream;
 static struct pr_usrreqs uipc_usrreqs_seqpacket;
 static struct protosw localsw[] = {
 {
 	.pr_type =		SOCK_STREAM,
 	.pr_domain =		&localdomain,
 	.pr_flags =		PR_CONNREQUIRED|PR_WANTRCVD|PR_RIGHTS,
 	.pr_ctloutput =		&uipc_ctloutput,
 	.pr_usrreqs =		&uipc_usrreqs_stream
 },
 {
 	.pr_type =		SOCK_DGRAM,
 	.pr_domain =		&localdomain,
 	.pr_flags =		PR_ATOMIC|PR_ADDR|PR_RIGHTS,
 	.pr_ctloutput =		&uipc_ctloutput,
 	.pr_usrreqs =		&uipc_usrreqs_dgram
 },
 {
 	.pr_type =		SOCK_SEQPACKET,
 	.pr_domain =		&localdomain,
 
 	/*
 	 * XXXRW: For now, PR_ADDR because soreceive will bump into them
 	 * due to our use of sbappendaddr.  A new sbappend variant is needed
 	 * that supports both atomic record writes and control data.
 	 */
 	.pr_flags =		PR_ADDR|PR_ATOMIC|PR_CONNREQUIRED|PR_WANTRCVD|
 				    PR_RIGHTS,
 	.pr_ctloutput =		&uipc_ctloutput,
 	.pr_usrreqs =		&uipc_usrreqs_seqpacket,
 },
 };
 
 static struct domain localdomain = {
 	.dom_family =		AF_LOCAL,
 	.dom_name =		"local",
 	.dom_init =		unp_init,
 	.dom_externalize =	unp_externalize,
 	.dom_dispose =		unp_dispose,
 	.dom_protosw =		localsw,
 	.dom_protoswNPROTOSW =	&localsw[nitems(localsw)]
 };
 DOMAIN_SET(local);
 
 /*
  * pru_abort handler.  With the linkage lock held exclusively and this
  * socket's unpcb locked, pass the connected peer (if any) to unp_drop().
  */
 static void
 uipc_abort(struct socket *so)
 {
 	struct unpcb *unp, *unp2;
 
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("uipc_abort: unp == NULL"));
 
 	UNP_LINK_WLOCK();
 	UNP_PCB_LOCK(unp);
 	unp2 = unp->unp_conn;
 	if (unp2 != NULL) {
 		/* A second pcb lock is taken while the link lock is write-held. */
 		UNP_PCB_LOCK(unp2);
 		unp_drop(unp2);
 		UNP_PCB_UNLOCK(unp2);
 	}
 	UNP_PCB_UNLOCK(unp);
 	UNP_LINK_WUNLOCK();
 }
 
 /*
  * pru_accept handler.  Allocates a sockaddr_un-sized buffer (M_SONAME,
  * M_WAITOK), stores it in *nam and fills it with the peer's bound address,
  * or with sun_noname when there is no peer or the peer has no address.
  * Always returns 0.
  */
 static int
 uipc_accept(struct socket *so, struct sockaddr **nam)
 {
 	struct unpcb *unp, *unp2;
 	const struct sockaddr *sa;
 
 	/*
 	 * Pass back name of connected socket, if it was bound and we are
 	 * still connected (our peer may have closed already!).
 	 */
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("uipc_accept: unp == NULL"));
 
 	/* Allocate before taking any lock; M_WAITOK is used here. */
 	*nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
 	UNP_LINK_RLOCK();
 	unp2 = unp->unp_conn;
 	if (unp2 != NULL && unp2->unp_addr != NULL) {
 		/* Copy the peer's address while holding the peer's pcb lock. */
 		UNP_PCB_LOCK(unp2);
 		sa = (struct sockaddr *) unp2->unp_addr;
 		bcopy(sa, *nam, sa->sa_len);
 		UNP_PCB_UNLOCK(unp2);
 	} else {
 		/* No peer or unbound peer: hand back the nameless address. */
 		sa = &sun_noname;
 		bcopy(sa, *nam, sa->sa_len);
 	}
 	UNP_LINK_RUNLOCK();
 	return (0);
 }
 
 /*
  * pru_attach handler: allocate a unpcb for 'so' and insert it on the global
  * list that matches the socket type.
  *
  * Returns 0 on success, the soreserve() error if buffer reservation fails,
  * or ENOBUFS if the unpcb cannot be allocated.  'proto' and 'td' are not
  * referenced in this function.
  */
 static int
 uipc_attach(struct socket *so, int proto, struct thread *td)
 {
 	u_long sendspace, recvspace;
 	struct unpcb *unp;
 	int error;
 	bool locked;
 
 	KASSERT(so->so_pcb == NULL, ("uipc_attach: so_pcb != NULL"));
 	/*
 	 * If either socket buffer high-water mark is zero, reserve the
 	 * per-type default send/receive space.
 	 */
 	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
 		switch (so->so_type) {
 		case SOCK_STREAM:
 			sendspace = unpst_sendspace;
 			recvspace = unpst_recvspace;
 			break;
 
 		case SOCK_DGRAM:
 			sendspace = unpdg_sendspace;
 			recvspace = unpdg_recvspace;
 			break;
 
 		case SOCK_SEQPACKET:
 			sendspace = unpsp_sendspace;
 			recvspace = unpsp_recvspace;
 			break;
 
 		default:
 			panic("uipc_attach");
 		}
 		error = soreserve(so, sendspace, recvspace);
 		if (error)
 			return (error);
 	}
 	/* Zeroed, M_NOWAIT allocation; failure is reported as ENOBUFS. */
 	unp = uma_zalloc(unp_zone, M_NOWAIT | M_ZERO);
 	if (unp == NULL)
 		return (ENOBUFS);
 	LIST_INIT(&unp->unp_refs);
 	UNP_PCB_LOCK_INIT(unp);
 	/* Cross-link the socket and its pcb; the socket holds one reference. */
 	unp->unp_socket = so;
 	so->so_pcb = unp;
 	unp->unp_refcount = 1;
 	/*
 	 * A socket with so_listen set gets UNP_NASCENT; uipc_detach() skips
 	 * the vnode/peer teardown for pcbs carrying that flag.
 	 */
 	if (so->so_listen != NULL)
 		unp->unp_flags |= UNP_NASCENT;
 
 	/*
 	 * Take the linkage lock for writing unless UNP_LINK_WOWNED() reports
 	 * that it is already write-owned; remember which so that the unlock
 	 * below is symmetric.
 	 */
 	if ((locked = UNP_LINK_WOWNED()) == false)
 		UNP_LINK_WLOCK();
 
 	unp->unp_gencnt = ++unp_gencnt;
 	unp_count++;
 	switch (so->so_type) {
 	case SOCK_STREAM:
 		LIST_INSERT_HEAD(&unp_shead, unp, unp_link);
 		break;
 
 	case SOCK_DGRAM:
 		LIST_INSERT_HEAD(&unp_dhead, unp, unp_link);
 		break;
 
 	case SOCK_SEQPACKET:
 		LIST_INSERT_HEAD(&unp_sphead, unp, unp_link);
 		break;
 
 	default:
 		panic("uipc_attach");
 	}
 
 	if (locked == false)
 		UNP_LINK_WUNLOCK();
 
 	return (0);
 }
 
 /*
  * pru_bindat handler: bind 'so' to the path in 'nam' by creating a VSOCK
  * vnode.  'fd' is handed to NDINIT_ATRIGHTS() for the lookup.
  *
  * Returns 0 on success.  Errors produced directly here:
  *   EAFNOSUPPORT  nam->sa_family is not AF_UNIX
  *   EINVAL        sun_len is too large or leaves no path bytes, or the
  *                 socket already has a vnode
  *   EALREADY      another bind on this socket is in progress
  *   EADDRINUSE    the lookup found an existing vnode
  * Errors from namei(), vn_start_write(), the MAC check (when MAC is
  * defined) and VOP_CREATE() are passed through.
  */
 static int
 uipc_bindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	struct sockaddr_un *soun = (struct sockaddr_un *)nam;
 	struct vattr vattr;
 	int error, namelen;
 	struct nameidata nd;
 	struct unpcb *unp;
 	struct vnode *vp;
 	struct mount *mp;
 	cap_rights_t rights;
 	char *buf;
 
 	if (nam->sa_family != AF_UNIX)
 		return (EAFNOSUPPORT);
 
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("uipc_bind: unp == NULL"));
 
 	if (soun->sun_len > sizeof(struct sockaddr_un))
 		return (EINVAL);
 	/* Number of path bytes that follow the sockaddr_un header fields. */
 	namelen = soun->sun_len - offsetof(struct sockaddr_un, sun_path);
 	if (namelen <= 0)
 		return (EINVAL);
 
 	/*
 	 * We don't allow simultaneous bind() calls on a single UNIX domain
 	 * socket, so flag in-progress operations, and return an error if an
 	 * operation is already in progress.
 	 *
 	 * Historically, we have not allowed a socket to be rebound, so this
 	 * also returns an error.  Not allowing re-binding simplifies the
 	 * implementation and avoids a great many possible failure modes.
 	 */
 	UNP_PCB_LOCK(unp);
 	if (unp->unp_vnode != NULL) {
 		UNP_PCB_UNLOCK(unp);
 		return (EINVAL);
 	}
 	if (unp->unp_flags & UNP_BINDING) {
 		UNP_PCB_UNLOCK(unp);
 		return (EALREADY);
 	}
 	unp->unp_flags |= UNP_BINDING;
 	UNP_PCB_UNLOCK(unp);
 
 	/* NUL-terminated private copy of the path for the lookup. */
 	buf = malloc(namelen + 1, M_TEMP, M_WAITOK);
 	bcopy(soun->sun_path, buf, namelen);
 	buf[namelen] = 0;
 
 restart:
 	NDINIT_ATRIGHTS(&nd, CREATE, NOFOLLOW | LOCKPARENT | SAVENAME | NOCACHE,
 	    UIO_SYSSPACE, buf, fd, cap_rights_init(&rights, CAP_BINDAT), td);
 /* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
 	error = namei(&nd);
 	if (error)
 		goto error;
 	vp = nd.ni_vp;
 	/*
 	 * Either the name already exists or vn_start_write() with V_NOWAIT
 	 * failed.  Release the lookup state first; then fail with EADDRINUSE
 	 * if a vnode was found, otherwise wait via V_XSLEEP and redo the
 	 * lookup from 'restart'.
 	 */
 	if (vp != NULL || vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		if (nd.ni_dvp == vp)
 			vrele(nd.ni_dvp);
 		else
 			vput(nd.ni_dvp);
 		if (vp != NULL) {
 			vrele(vp);
 			error = EADDRINUSE;
 			goto error;
 		}
 		error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH);
 		if (error)
 			goto error;
 		goto restart;
 	}
 	VATTR_NULL(&vattr);
 	vattr.va_type = VSOCK;
 	/* Mode is ACCESSPERMS with the process's fd_cmask bits cleared. */
 	vattr.va_mode = (ACCESSPERMS & ~td->td_proc->p_fd->fd_cmask);
 #ifdef MAC
 	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
 	    &vattr);
 #endif
 	/* 'error' is 0 here from namei() unless the MAC check replaced it. */
 	if (error == 0)
 		error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vput(nd.ni_dvp);
 	if (error) {
 		vn_finished_write(mp);
 		goto error;
 	}
 	vp = nd.ni_vp;
 	ASSERT_VOP_ELOCKED(vp, "uipc_bind");
 	/* Duplicate the caller's address; the copy is stored in unp_addr. */
 	soun = (struct sockaddr_un *)sodupsockaddr(nam, M_WAITOK);
 
 	/* Publish vnode and address, and clear the in-progress flag. */
 	UNP_LINK_WLOCK();
 	UNP_PCB_LOCK(unp);
 	VOP_UNP_BIND(vp, unp);
 	unp->unp_vnode = vp;
 	unp->unp_addr = soun;
 	unp->unp_flags &= ~UNP_BINDING;
 	UNP_PCB_UNLOCK(unp);
 	UNP_LINK_WUNLOCK();
 	VOP_UNLOCK(vp, 0);
 	vn_finished_write(mp);
 	free(buf, M_TEMP);
 	return (0);
 
 error:
 	/* Common failure exit: clear UNP_BINDING and free the path copy. */
 	UNP_PCB_LOCK(unp);
 	unp->unp_flags &= ~UNP_BINDING;
 	UNP_PCB_UNLOCK(unp);
 	free(buf, M_TEMP);
 	return (error);
 }
 
 /*
  * pru_bind handler: uipc_bindat() with AT_FDCWD as the descriptor argument.
  */
 static int
 uipc_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	int rc;
 
 	rc = uipc_bindat(AT_FDCWD, so, nam, td);
 	return (rc);
 }
 
 /*
  * pru_connect handler: call unp_connect() with the linkage lock held
  * exclusively and return its result.  'td' must be curthread.
  */
 static int
 uipc_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	int error;
 
 	KASSERT(td == curthread, ("uipc_connect: td != curthread"));
 	UNP_LINK_WLOCK();
 	error = unp_connect(so, nam, td);
 	UNP_LINK_WUNLOCK();
 	return (error);
 }
 
 /*
  * pru_connectat handler: call unp_connectat() with the caller-supplied
  * 'fd' while holding the linkage lock exclusively, and return its result.
  * 'td' must be curthread.
  */
 static int
 uipc_connectat(int fd, struct socket *so, struct sockaddr *nam,
     struct thread *td)
 {
 	int error;
 
 	KASSERT(td == curthread, ("uipc_connectat: td != curthread"));
 	UNP_LINK_WLOCK();
 	error = unp_connectat(fd, so, nam, td);
 	UNP_LINK_WUNLOCK();
 	return (error);
 }
 
 /*
  * Close handling for a local socket: disconnect from the peer, if any, and
  * for a listening socket detach the bound vnode.  The vnode reference is
  * released only after all locks have been dropped.
  */
 static void
 uipc_close(struct socket *so)
 {
 	struct unpcb *unp, *unp2;
 	struct vnode *vp = NULL;
 
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("uipc_close: unp == NULL"));
 
 	UNP_LINK_WLOCK();
 	UNP_PCB_LOCK(unp);
 	unp2 = unp->unp_conn;
 	if (unp2 != NULL) {
 		UNP_PCB_LOCK(unp2);
 		unp_disconnect(unp, unp2);
 		UNP_PCB_UNLOCK(unp2);
 	}
 	/* 'vp' is only assigned when the socket is listening. */
 	if (SOLISTENING(so) && ((vp = unp->unp_vnode) != NULL)) {
 		VOP_UNP_DETACH(vp);
 		unp->unp_vnode = NULL;
 	}
 	UNP_PCB_UNLOCK(unp);
 	UNP_LINK_WUNLOCK();
 	/* vrele() is called with neither the link nor the pcb lock held. */
 	if (vp)
 		vrele(vp);
 }
 
 /*
  * pru_connect2 handler: connect so1 and so2 to each other through
  * unp_connect2(..., PRU_CONNECT2).  Both pcbs are locked under the
  * exclusively held linkage lock for the duration of the call.  Returns
  * the unp_connect2() result.
  */
 static int
 uipc_connect2(struct socket *so1, struct socket *so2)
 {
 	struct unpcb *unp, *unp2;
 	int error;
 
 	UNP_LINK_WLOCK();
 	unp = so1->so_pcb;
 	KASSERT(unp != NULL, ("uipc_connect2: unp == NULL"));
 	UNP_PCB_LOCK(unp);
 	unp2 = so2->so_pcb;
 	KASSERT(unp2 != NULL, ("uipc_connect2: unp2 == NULL"));
 	UNP_PCB_LOCK(unp2);
 	error = unp_connect2(so1, so2, PRU_CONNECT2);
 	/* Unlock in the reverse order of acquisition. */
 	UNP_PCB_UNLOCK(unp2);
 	UNP_PCB_UNLOCK(unp);
 	UNP_LINK_WUNLOCK();
 	return (error);
 }
 
 /*
  * pru_detach handler: unlink the pcb from its global list, tear down its
  * vnode, peer connection and referring pcbs, then drop the socket's
  * reference on the pcb and free it when that was the last one.
  *
  * A pcb flagged UNP_NASCENT skips the vnode/peer/refs teardown.
  */
 static void
 uipc_detach(struct socket *so)
 {
 	struct unpcb *unp, *unp2;
 	struct sockaddr_un *saved_unp_addr;
 	struct vnode *vp;
 	int freeunp, local_unp_rights;
 
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("uipc_detach: unp == NULL"));
 
 	vp = NULL;
 	local_unp_rights = 0;
 
 	UNP_LINK_WLOCK();
 	LIST_REMOVE(unp, unp_link);
 	unp->unp_gencnt = ++unp_gencnt;
 	--unp_count;
 	UNP_PCB_LOCK(unp);
 	/* On this path vp stays NULL and local_unp_rights stays 0. */
 	if ((unp->unp_flags & UNP_NASCENT) != 0)
 		goto teardown;
 
 	if ((vp = unp->unp_vnode) != NULL) {
 		VOP_UNP_DETACH(vp);
 		unp->unp_vnode = NULL;
 	}
 	unp2 = unp->unp_conn;
 	if (unp2 != NULL) {
 		UNP_PCB_LOCK(unp2);
 		unp_disconnect(unp, unp2);
 		UNP_PCB_UNLOCK(unp2);
 	}
 
 	/*
 	 * We hold the linkage lock exclusively, so it's OK to acquire
 	 * multiple pcb locks at a time.
 	 */
 	/* NOTE(review): presumably unp_drop() unlinks 'ref' from unp_refs, */
 	/* otherwise this loop would not terminate -- confirm in unp_drop(). */
 	while (!LIST_EMPTY(&unp->unp_refs)) {
 		struct unpcb *ref = LIST_FIRST(&unp->unp_refs);
 
 		UNP_PCB_LOCK(ref);
 		unp_drop(ref);
 		UNP_PCB_UNLOCK(ref);
 	}
 	/* Snapshot taken under the link lock; consulted after unlocking. */
 	local_unp_rights = unp_rights;
 teardown:
 	UNP_LINK_WUNLOCK();
 	/* The pcb lock is still held from here until destroy or unlock. */
 	unp->unp_socket->so_pcb = NULL;
 	saved_unp_addr = unp->unp_addr;
 	unp->unp_addr = NULL;
 	unp->unp_refcount--;
 	freeunp = (unp->unp_refcount == 0);
 	if (saved_unp_addr != NULL)
 		free(saved_unp_addr, M_SONAME);
 	if (freeunp) {
 		UNP_PCB_LOCK_DESTROY(unp);
 		uma_zfree(unp_zone, unp);
 	} else
 		UNP_PCB_UNLOCK(unp);
 	if (vp)
 		vrele(vp);
 	/* If any rights were in flight, schedule the garbage-collection task. */
 	if (local_unp_rights)
 		taskqueue_enqueue_timeout(taskqueue_thread, &unp_gc_task, -1);
 }
 
 /*
  * Disconnect 'so' from its peer, if it has one, via unp_disconnect().
  * Both pcbs are locked under the exclusively held linkage lock.  Always
  * returns 0, whether or not a peer existed.
  */
 static int
 uipc_disconnect(struct socket *so)
 {
 	struct unpcb *unp, *unp2;
 
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("uipc_disconnect: unp == NULL"));
 
 	UNP_LINK_WLOCK();
 	UNP_PCB_LOCK(unp);
 	unp2 = unp->unp_conn;
 	if (unp2 != NULL) {
 		UNP_PCB_LOCK(unp2);
 		unp_disconnect(unp, unp2);
 		UNP_PCB_UNLOCK(unp2);
 	}
 	UNP_PCB_UNLOCK(unp);
 	UNP_LINK_WUNLOCK();
 	return (0);
 }
 
 /*
  * Put a stream or seqpacket socket into the listening state.
  *
  * Returns EOPNOTSUPP for other socket types; EINVAL or EDESTADDRREQ when
  * the socket has no vnode (connected vs. simply unbound); otherwise the
  * result of solisten_proto_check().  'td' supplies the credential that is
  * recorded in unp_peercred on success.
  */
 static int
 uipc_listen(struct socket *so, int backlog, struct thread *td)
 {
 	struct unpcb *unp;
 	int error;
 
 	if (so->so_type != SOCK_STREAM && so->so_type != SOCK_SEQPACKET)
 		return (EOPNOTSUPP);
 
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("uipc_listen: unp == NULL"));
 
 	UNP_PCB_LOCK(unp);
 	if (unp->unp_vnode == NULL) {
 		/* Already connected or not bound to an address. */
 		error = unp->unp_conn != NULL ? EINVAL : EDESTADDRREQ;
 		UNP_PCB_UNLOCK(unp);
 		return (error);
 	}
 
 	/* The socket lock is taken while the pcb lock is held. */
 	SOCK_LOCK(so);
 	error = solisten_proto_check(so);
 	if (error == 0) {
 		/* Record the caller's credential before listening starts. */
 		cru2x(td->td_ucred, &unp->unp_peercred);
 		solisten_proto(so, backlog);
 	}
 	SOCK_UNLOCK(so);
 	UNP_PCB_UNLOCK(unp);
 	return (error);
 }
 
 /*
  * Return the peer's address in a freshly allocated sockaddr_un-sized
  * buffer stored in *nam (M_SONAME, M_WAITOK).  When there is no peer, or
  * the peer has no address, sun_noname is copied instead.  Always returns 0.
  */
 static int
 uipc_peeraddr(struct socket *so, struct sockaddr **nam)
 {
 	struct unpcb *unp, *unp2;
 	const struct sockaddr *sa;
 
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("uipc_peeraddr: unp == NULL"));
 
 	*nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
 	UNP_LINK_RLOCK();
 	/*
 	 * XXX: It seems that this test always fails even when connection is
 	 * established.  So, this else clause is added as workaround to
 	 * return PF_LOCAL sockaddr.
 	 */
 	unp2 = unp->unp_conn;
 	if (unp2 != NULL) {
 		/* The peer's unp_addr is examined under the peer's pcb lock. */
 		UNP_PCB_LOCK(unp2);
 		if (unp2->unp_addr != NULL)
 			sa = (struct sockaddr *) unp2->unp_addr;
 		else
 			sa = &sun_noname;
 		bcopy(sa, *nam, sa->sa_len);
 		UNP_PCB_UNLOCK(unp2);
 	} else {
 		sa = &sun_noname;
 		bcopy(sa, *nam, sa->sa_len);
 	}
 	UNP_LINK_RUNLOCK();
 	return (0);
 }
 
 /*
  * Called for stream and seqpacket sockets (asserted below) to relieve
  * backpressure on the sending peer: if this socket's receive buffer is now
  * below the peer's send-buffer limits, clear SB_STOP on the peer's send
  * buffer, and in any case wake up writers on the peer.  'flags' is not
  * referenced.  Always returns 0.
  */
 static int
 uipc_rcvd(struct socket *so, int flags)
 {
 	struct unpcb *unp, *unp2;
 	struct socket *so2;
 	u_int mbcnt, sbcc;
 
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("%s: unp == NULL", __func__));
 	KASSERT(so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET,
 	    ("%s: socktype %d", __func__, so->so_type));
 
 	/*
 	 * Adjust backpressure on sender and wakeup any waiting to write.
 	 *
 	 * The unp lock is acquired to maintain the validity of the unp_conn
 	 * pointer; no lock on unp2 is required as unp2->unp_socket will be
 	 * static as long as we don't permit unp2 to disconnect from unp,
 	 * which is prevented by the lock on unp.  We cache values from
 	 * so_rcv to avoid holding the so_rcv lock over the entire
 	 * transaction on the remote so_snd.
 	 */
 	SOCKBUF_LOCK(&so->so_rcv);
 	mbcnt = so->so_rcv.sb_mbcnt;
 	sbcc = sbavail(&so->so_rcv);
 	SOCKBUF_UNLOCK(&so->so_rcv);
 	/*
 	 * There is a benign race condition at this point.  If we're planning to
 	 * clear SB_STOP, but uipc_send is called on the connected socket at
 	 * this instant, it might add data to the sockbuf and set SB_STOP.  Then
 	 * we would erroneously clear SB_STOP below, even though the sockbuf is
 	 * full.  The race is benign because the only ill effect is to allow the
 	 * sockbuf to exceed its size limit, and the size limits are not
 	 * strictly guaranteed anyway.
 	 */
 	UNP_PCB_LOCK(unp);
 	unp2 = unp->unp_conn;
 	if (unp2 == NULL) {
 		/* No peer: nothing to adjust. */
 		UNP_PCB_UNLOCK(unp);
 		return (0);
 	}
 	so2 = unp2->unp_socket;
 	SOCKBUF_LOCK(&so2->so_snd);
 	if (sbcc < so2->so_snd.sb_hiwat && mbcnt < so2->so_snd.sb_mbmax)
 		so2->so_snd.sb_flags &= ~SB_STOP;
 	/*
 	 * No explicit SOCKBUF_UNLOCK() follows: presumably
 	 * sowwakeup_locked() releases the so_snd lock -- confirm.
 	 */
 	sowwakeup_locked(so2);
 	UNP_PCB_UNLOCK(unp);
 	return (0);
 }
 
 /*
  * Send data 'm', with optional control data 'control', from 'so' to its
  * peer.  When 'nam' is non-NULL the socket is first connected to that
  * address through unp_connect(); for datagram sockets that connection is
  * undone again after the send.
  *
  * The linkage lock is taken exclusively when 'nam' is non-NULL or PRUS_EOF
  * is set, and shared otherwise.
  *
  * Errors produced directly here:
  *   EOPNOTSUPP  PRUS_OOB was requested
  *   EISCONN     datagram send with 'nam' on an already connected socket
  *   ENOTCONN    no peer
  *   ENOBUFS     datagram could not be appended to the peer's receive buffer
  *   EPIPE       SBS_CANTSENDMORE is set on the send buffer
  * Errors from unp_internalize() and unp_connect() are passed through.
  *
  * On exit, any 'control' still owned here is freed, and 'm' is freed
  * unless it was handed to the peer or PRUS_NOTREADY is set.
  */
 static int
 uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
     struct mbuf *control, struct thread *td)
 {
 	struct unpcb *unp, *unp2;
 	struct socket *so2;
 	u_int mbcnt, sbcc;
 	int error = 0;
 
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("%s: unp == NULL", __func__));
 	KASSERT(so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM ||
 	    so->so_type == SOCK_SEQPACKET,
 	    ("%s: socktype %d", __func__, so->so_type));
 
 	if (flags & PRUS_OOB) {
 		error = EOPNOTSUPP;
 		goto release;
 	}
 	if (control != NULL && (error = unp_internalize(&control, td)))
 		goto release;
 	/* The same condition selects the matching unlock further down. */
 	if ((nam != NULL) || (flags & PRUS_EOF))
 		UNP_LINK_WLOCK();
 	else
 		UNP_LINK_RLOCK();
 	switch (so->so_type) {
 	case SOCK_DGRAM:
 	{
 		const struct sockaddr *from;
 
 		unp2 = unp->unp_conn;
 		if (nam != NULL) {
 			UNP_LINK_WLOCK_ASSERT();
 			if (unp2 != NULL) {
 				error = EISCONN;
 				break;
 			}
 			error = unp_connect(so, nam, td);
 			if (error)
 				break;
 			unp2 = unp->unp_conn;
 		}
 
 		/*
 		 * Because connect() and send() are non-atomic in a sendto()
 		 * with a target address, it's possible that the socket will
 		 * have disconnected before the send() can run.  In that case
 		 * return the slightly counter-intuitive but otherwise
 		 * correct error that the socket is not connected.
 		 */
 		if (unp2 == NULL) {
 			error = ENOTCONN;
 			break;
 		}
 		/* Lockless read. */
 		if (unp2->unp_flags & UNP_WANTCRED)
 			control = unp_addsockcred(td, control);
 		UNP_PCB_LOCK(unp);
 		/* The sender's bound address, if any, is used as 'from'. */
 		if (unp->unp_addr != NULL)
 			from = (struct sockaddr *)unp->unp_addr;
 		else
 			from = &sun_noname;
 		so2 = unp2->unp_socket;
 		SOCKBUF_LOCK(&so2->so_rcv);
 		if (sbappendaddr_locked(&so2->so_rcv, from, m,
 		    control)) {
 			/* Appended: clear m and control so they are not freed below. */
 			sorwakeup_locked(so2);
 			m = NULL;
 			control = NULL;
 		} else {
 			SOCKBUF_UNLOCK(&so2->so_rcv);
 			error = ENOBUFS;
 		}
 		if (nam != NULL) {
 			/* Undo the connection made for this send only. */
 			UNP_LINK_WLOCK_ASSERT();
 			UNP_PCB_LOCK(unp2);
 			unp_disconnect(unp, unp2);
 			UNP_PCB_UNLOCK(unp2);
 		}
 		UNP_PCB_UNLOCK(unp);
 		break;
 	}
 
 	case SOCK_SEQPACKET:
 	case SOCK_STREAM:
 		if ((so->so_state & SS_ISCONNECTED) == 0) {
 			if (nam != NULL) {
 				UNP_LINK_WLOCK_ASSERT();
 				error = unp_connect(so, nam, td);
 				if (error)
 					break;	/* XXX */
 			} else {
 				error = ENOTCONN;
 				break;
 			}
 		}
 
 		/* Lockless read. */
 		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
 			error = EPIPE;
 			break;
 		}
 
 		/*
 		 * Because connect() and send() are non-atomic in a sendto()
 		 * with a target address, it's possible that the socket will
 		 * have disconnected before the send() can run.  In that case
 		 * return the slightly counter-intuitive but otherwise
 		 * correct error that the socket is not connected.
 		 *
 		 * Locking here must be done carefully: the linkage lock
 		 * prevents interconnections between unpcbs from changing, so
 		 * we can traverse from unp to unp2 without acquiring unp's
 		 * lock.  Socket buffer locks follow unpcb locks, so we can
 		 * acquire both remote and local socket buffer locks.
 		 */
 		unp2 = unp->unp_conn;
 		if (unp2 == NULL) {
 			error = ENOTCONN;
 			break;
 		}
 		so2 = unp2->unp_socket;
 		UNP_PCB_LOCK(unp2);
 		SOCKBUF_LOCK(&so2->so_rcv);
 		if (unp2->unp_flags & UNP_WANTCRED) {
 			/*
 			 * Credentials are passed only once on SOCK_STREAM
 			 * and SOCK_SEQPACKET.
 			 */
 			unp2->unp_flags &= ~UNP_WANTCRED;
 			control = unp_addsockcred(td, control);
 		}
 		/*
 		 * Send to paired receive port, and then reduce send buffer
 		 * hiwater marks to maintain backpressure.  Wake up readers.
 		 */
 		switch (so->so_type) {
 		case SOCK_STREAM:
 			if (control != NULL) {
 				if (sbappendcontrol_locked(&so2->so_rcv, m,
 				    control))
 					control = NULL;
 			} else
 				sbappend_locked(&so2->so_rcv, m, flags);
 			break;
 
 		case SOCK_SEQPACKET: {
 			const struct sockaddr *from;
 
 			from = &sun_noname;
 			/*
 			 * Don't check for space available in so2->so_rcv.
 			 * Unix domain sockets only check for space in the
 			 * sending sockbuf, and that check is performed one
 			 * level up the stack.
 			 */
 			if (sbappendaddr_nospacecheck_locked(&so2->so_rcv,
 				from, m, control))
 				control = NULL;
 			break;
 			}
 		}
 
 		/* Sample the peer's receive usage before its lock is released. */
 		mbcnt = so2->so_rcv.sb_mbcnt;
 		sbcc = sbavail(&so2->so_rcv);
 		if (sbcc)
 			sorwakeup_locked(so2);
 		else
 			SOCKBUF_UNLOCK(&so2->so_rcv);
 
 		/*
 		 * The PCB lock on unp2 protects the SB_STOP flag.  Without it,
 		 * it would be possible for uipc_rcvd to be called at this
 		 * point, drain the receiving sockbuf, clear SB_STOP, and then
 		 * we would set SB_STOP below.  That could lead to an empty
 		 * sockbuf having SB_STOP set.
 		 */
 		SOCKBUF_LOCK(&so->so_snd);
 		if (sbcc >= so->so_snd.sb_hiwat || mbcnt >= so->so_snd.sb_mbmax)
 			so->so_snd.sb_flags |= SB_STOP;
 		SOCKBUF_UNLOCK(&so->so_snd);
 		UNP_PCB_UNLOCK(unp2);
 		/* 'm' is cleared unconditionally on this path. */
 		m = NULL;
 		break;
 	}
 
 	/*
 	 * PRUS_EOF is equivalent to pru_send followed by pru_shutdown.
 	 */
 	if (flags & PRUS_EOF) {
 		UNP_PCB_LOCK(unp);
 		socantsendmore(so);
 		unp_shutdown(unp);
 		UNP_PCB_UNLOCK(unp);
 	}
 
 	if ((nam != NULL) || (flags & PRUS_EOF))
 		UNP_LINK_WUNLOCK();
 	else
 		UNP_LINK_RUNLOCK();
 
 	/* Control data left over after a failed send is disposed of first. */
 	if (control != NULL && error != 0)
 		unp_dispose_mbuf(control);
 
 release:
 	if (control != NULL)
 		m_freem(control);
 	/*
 	 * In case of PRUS_NOTREADY, uipc_ready() is responsible
 	 * for freeing memory.
 	 */
 	if (m != NULL && (flags & PRUS_NOTREADY) == 0)
 		m_freem(m);
 	return (error);
 }
 
 /*
  * Pass 'count' mbufs starting at 'm' to sbready() on the peer's receive
  * buffer, and wake up the peer's readers if that succeeds.
  *
  * Returns the sbready() result, or ECONNRESET when the socket has no peer;
  * in the latter case the first 'count' mbufs of the chain are freed here.
  */
 static int
 uipc_ready(struct socket *so, struct mbuf *m, int count)
 {
 	struct unpcb *unp, *unp2;
 	struct socket *so2;
 	int error;
 
 	unp = sotounpcb(so);
 	/* Same sanity check as the other handlers that use sotounpcb(). */
 	KASSERT(unp != NULL, ("%s: unp == NULL", __func__));
 
 	UNP_LINK_RLOCK();
 	if ((unp2 = unp->unp_conn) == NULL) {
 		UNP_LINK_RUNLOCK();
 		/* No peer: free the mbufs after dropping the link lock. */
 		for (int i = 0; i < count; i++)
 			m = m_free(m);
 		return (ECONNRESET);
 	}
 	UNP_PCB_LOCK(unp2);
 	so2 = unp2->unp_socket;
 
 	SOCKBUF_LOCK(&so2->so_rcv);
 	if ((error = sbready(&so2->so_rcv, m, count)) == 0)
 		sorwakeup_locked(so2);
 	else
 		SOCKBUF_UNLOCK(&so2->so_rcv);
 
 	UNP_PCB_UNLOCK(unp2);
 	UNP_LINK_RUNLOCK();
 
 	return (error);
 }
 
 /*
  * pru_sense handler: fill in the stat fields a local socket can report.
  *
  * st_blksize comes from the send buffer high-water mark, st_dev is NODEV,
  * and st_ino is a per-pcb number assigned lazily from the global unp_ino
  * counter (the value 0 is skipped because it means "not assigned yet").
  * Always returns 0.
  */
 static int
 uipc_sense(struct socket *so, struct stat *sb)
 {
 	struct unpcb *pcb = sotounpcb(so);
 
 	KASSERT(pcb != NULL, ("uipc_sense: unp == NULL"));
 
 	sb->st_blksize = so->so_snd.sb_hiwat;
 	UNP_PCB_LOCK(pcb);
 	sb->st_dev = NODEV;
 	if (pcb->unp_ino == 0) {
 		if (++unp_ino == 0)
 			++unp_ino;
 		pcb->unp_ino = unp_ino;
 	}
 	sb->st_ino = pcb->unp_ino;
 	UNP_PCB_UNLOCK(pcb);
 	return (0);
 }
 
 /*
  * pru_shutdown handler: mark the socket as unable to send any more and
  * propagate the shutdown through unp_shutdown(), all under the link write
  * lock and the pcb lock.  Always returns 0.
  */
 static int
 uipc_shutdown(struct socket *so)
 {
 	struct unpcb *pcb = sotounpcb(so);
 
 	KASSERT(pcb != NULL, ("uipc_shutdown: unp == NULL"));
 
 	UNP_LINK_WLOCK();
 	UNP_PCB_LOCK(pcb);
 	socantsendmore(so);
 	unp_shutdown(pcb);
 	UNP_PCB_UNLOCK(pcb);
 	UNP_LINK_WUNLOCK();
 	return (0);
 }
 
 /*
  * pru_sockaddr handler: hand back a freshly allocated (M_SONAME) copy of the
  * address the socket is bound to, or of sun_noname when it is unbound.
  * The allocation is made before the pcb lock is taken.  Always returns 0.
  */
 static int
 uipc_sockaddr(struct socket *so, struct sockaddr **nam)
 {
 	struct unpcb *pcb = sotounpcb(so);
 	const struct sockaddr *src;
 
 	KASSERT(pcb != NULL, ("uipc_sockaddr: unp == NULL"));
 
 	*nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
 	UNP_PCB_LOCK(pcb);
 	src = (pcb->unp_addr == NULL) ? &sun_noname :
 	    (struct sockaddr *)pcb->unp_addr;
 	bcopy(src, *nam, src->sa_len);
 	UNP_PCB_UNLOCK(pcb);
 	return (0);
 }
 
 static struct pr_usrreqs uipc_usrreqs_dgram = {
 	.pru_abort = 		uipc_abort,
 	.pru_accept =		uipc_accept,
 	.pru_attach =		uipc_attach,
 	.pru_bind =		uipc_bind,
 	.pru_bindat =		uipc_bindat,
 	.pru_connect =		uipc_connect,
 	.pru_connectat =	uipc_connectat,
 	.pru_connect2 =		uipc_connect2,
 	.pru_detach =		uipc_detach,
 	.pru_disconnect =	uipc_disconnect,
 	.pru_listen =		uipc_listen,
 	.pru_peeraddr =		uipc_peeraddr,
 	.pru_rcvd =		uipc_rcvd,
 	.pru_send =		uipc_send,
 	.pru_sense =		uipc_sense,
 	.pru_shutdown =		uipc_shutdown,
 	.pru_sockaddr =		uipc_sockaddr,
 	.pru_soreceive =	soreceive_dgram,
 	.pru_close =		uipc_close,
 };
 
 static struct pr_usrreqs uipc_usrreqs_seqpacket = {
 	.pru_abort =		uipc_abort,
 	.pru_accept =		uipc_accept,
 	.pru_attach =		uipc_attach,
 	.pru_bind =		uipc_bind,
 	.pru_bindat =		uipc_bindat,
 	.pru_connect =		uipc_connect,
 	.pru_connectat =	uipc_connectat,
 	.pru_connect2 =		uipc_connect2,
 	.pru_detach =		uipc_detach,
 	.pru_disconnect =	uipc_disconnect,
 	.pru_listen =		uipc_listen,
 	.pru_peeraddr =		uipc_peeraddr,
 	.pru_rcvd =		uipc_rcvd,
 	.pru_send =		uipc_send,
 	.pru_sense =		uipc_sense,
 	.pru_shutdown =		uipc_shutdown,
 	.pru_sockaddr =		uipc_sockaddr,
 	.pru_soreceive =	soreceive_generic,	/* XXX: or...? */
 	.pru_close =		uipc_close,
 };
 
 static struct pr_usrreqs uipc_usrreqs_stream = {
 	.pru_abort = 		uipc_abort,
 	.pru_accept =		uipc_accept,
 	.pru_attach =		uipc_attach,
 	.pru_bind =		uipc_bind,
 	.pru_bindat =		uipc_bindat,
 	.pru_connect =		uipc_connect,
 	.pru_connectat =	uipc_connectat,
 	.pru_connect2 =		uipc_connect2,
 	.pru_detach =		uipc_detach,
 	.pru_disconnect =	uipc_disconnect,
 	.pru_listen =		uipc_listen,
 	.pru_peeraddr =		uipc_peeraddr,
 	.pru_rcvd =		uipc_rcvd,
 	.pru_send =		uipc_send,
 	.pru_ready =		uipc_ready,
 	.pru_sense =		uipc_sense,
 	.pru_shutdown =		uipc_shutdown,
 	.pru_sockaddr =		uipc_sockaddr,
 	.pru_soreceive =	soreceive_generic,
 	.pru_close =		uipc_close,
 };
 
 static int
 uipc_ctloutput(struct socket *so, struct sockopt *sopt)
 {
 	struct unpcb *unp;
 	struct xucred xu;
 	int error, optval;
 
 	if (sopt->sopt_level != 0)
 		return (EINVAL);
 
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("uipc_ctloutput: unp == NULL"));
 	error = 0;
 	switch (sopt->sopt_dir) {
 	case SOPT_GET:
 		switch (sopt->sopt_name) {
 		case LOCAL_PEERCRED:
 			UNP_PCB_LOCK(unp);
 			if (unp->unp_flags & UNP_HAVEPC)
 				xu = unp->unp_peercred;
 			else {
 				if (so->so_type == SOCK_STREAM)
 					error = ENOTCONN;
 				else
 					error = EINVAL;
 			}
 			UNP_PCB_UNLOCK(unp);
 			if (error == 0)
 				error = sooptcopyout(sopt, &xu, sizeof(xu));
 			break;
 
 		case LOCAL_CREDS:
 			/* Unlocked read. */
 			optval = unp->unp_flags & UNP_WANTCRED ? 1 : 0;
 			error = sooptcopyout(sopt, &optval, sizeof(optval));
 			break;
 
 		case LOCAL_CONNWAIT:
 			/* Unlocked read. */
 			optval = unp->unp_flags & UNP_CONNWAIT ? 1 : 0;
 			error = sooptcopyout(sopt, &optval, sizeof(optval));
 			break;
 
 		default:
 			error = EOPNOTSUPP;
 			break;
 		}
 		break;
 
 	case SOPT_SET:
 		switch (sopt->sopt_name) {
 		case LOCAL_CREDS:
 		case LOCAL_CONNWAIT:
 			error = sooptcopyin(sopt, &optval, sizeof(optval),
 					    sizeof(optval));
 			if (error)
 				break;
 
 #define	OPTSET(bit) do {						\
 	UNP_PCB_LOCK(unp);						\
 	if (optval)							\
 		unp->unp_flags |= bit;					\
 	else								\
 		unp->unp_flags &= ~bit;					\
 	UNP_PCB_UNLOCK(unp);						\
 } while (0)
 
 			switch (sopt->sopt_name) {
 			case LOCAL_CREDS:
 				OPTSET(UNP_WANTCRED);
 				break;
 
 			case LOCAL_CONNWAIT:
 				OPTSET(UNP_CONNWAIT);
 				break;
 
 			default:
 				break;
 			}
 			break;
 #undef	OPTSET
 		default:
 			error = ENOPROTOOPT;
 			break;
 		}
 		break;
 
 	default:
 		error = EOPNOTSUPP;
 		break;
 	}
 	return (error);
 }
 
 /*
  * Connect so to the address in nam, resolving the path relative to the
  * current working directory (AT_FDCWD).  Thin wrapper around
  * unp_connectat(); returns whatever it returns.
  */
 static int
 unp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	int rc;
 
 	rc = unp_connectat(AT_FDCWD, so, nam, td);
 	return (rc);
 }
 
 /*
  * Connect so to the local socket bound at the path carried in nam, with the
  * path looked up relative to descriptor fd.
  *
  * Locking: the link write lock is asserted on entry.  The early returns do
  * not touch it; the main path drops it before the pathname lookup and takes
  * it again before returning through the "bad" label.  UNP_CONNECTING is set
  * on the pcb for the duration of the attempt and cleared on the way out; a
  * second attempt while it is set fails with EALREADY.
  *
  * For protocols flagged PR_CONNREQUIRED the target must be accepting
  * connections; a new socket is created from it with sonewconn() and the
  * connection is made to that new socket instead.
  *
  * Returns 0 or an errno: EAFNOSUPPORT, EINVAL, EALREADY, ENOTSOCK,
  * ECONNREFUSED, EPROTOTYPE, or whatever namei(), the MAC and VOP_ACCESS
  * checks, or unp_connect2() report.
  */
 static int
 unp_connectat(int fd, struct socket *so, struct sockaddr *nam,
     struct thread *td)
 {
 	struct sockaddr_un *soun = (struct sockaddr_un *)nam;
 	struct vnode *vp;
 	struct socket *so2;
 	struct unpcb *unp, *unp2, *unp3;
 	struct nameidata nd;
 	char buf[SOCK_MAXADDRLEN];
 	struct sockaddr *sa;
 	cap_rights_t rights;
 	int error, len;
 
 	if (nam->sa_family != AF_UNIX)
 		return (EAFNOSUPPORT);
 
 	UNP_LINK_WLOCK_ASSERT();
 
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("unp_connect: unp == NULL"));
 
 	/* Copy the path out of nam and NUL-terminate it for namei(). */
 	if (nam->sa_len > sizeof(struct sockaddr_un))
 		return (EINVAL);
 	len = nam->sa_len - offsetof(struct sockaddr_un, sun_path);
 	if (len <= 0)
 		return (EINVAL);
 	bcopy(soun->sun_path, buf, len);
 	buf[len] = 0;
 
 	UNP_PCB_LOCK(unp);
 	if (unp->unp_flags & UNP_CONNECTING) {
 		UNP_PCB_UNLOCK(unp);
 		return (EALREADY);
 	}
 	UNP_LINK_WUNLOCK();
 	unp->unp_flags |= UNP_CONNECTING;
 	UNP_PCB_UNLOCK(unp);
 
 	/*
 	 * Allocated up front with M_WAITOK, before any pcb lock is taken
 	 * below; used only if the listening socket has a bound address.
 	 */
 	sa = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
 	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF,
 	    UIO_SYSSPACE, buf, fd, cap_rights_init(&rights, CAP_CONNECTAT), td);
 	error = namei(&nd);
 	if (error)
 		vp = NULL;
 	else
 		vp = nd.ni_vp;
 	ASSERT_VOP_LOCKED(vp, "unp_connect");
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	if (error)
 		goto bad;
 
 	if (vp->v_type != VSOCK) {
 		error = ENOTSOCK;
 		goto bad;
 	}
 #ifdef MAC
 	error = mac_vnode_check_open(td->td_ucred, vp, VWRITE | VREAD);
 	if (error)
 		goto bad;
 #endif
 	error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td);
 	if (error)
 		goto bad;
 
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("unp_connect: unp == NULL"));
 
 	/*
 	 * Lock linkage lock for two reasons: make sure v_socket is stable,
 	 * and to protect simultaneous locking of multiple pcbs.
 	 */
 	UNP_LINK_WLOCK();
 	VOP_UNP_CONNECT(vp, &unp2);
 	if (unp2 == NULL) {
 		error = ECONNREFUSED;
 		goto bad2;
 	}
 	so2 = unp2->unp_socket;
 	if (so->so_type != so2->so_type) {
 		error = EPROTOTYPE;
 		goto bad2;
 	}
 	UNP_PCB_LOCK(unp);
 	UNP_PCB_LOCK(unp2);
 	if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
 		if (so2->so_options & SO_ACCEPTCONN) {
 			CURVNET_SET(so2->so_vnet);
 			so2 = sonewconn(so2, 0);
 			CURVNET_RESTORE();
 		} else
 			so2 = NULL;
 		if (so2 == NULL) {
 			error = ECONNREFUSED;
 			goto bad3;
 		}
 		unp3 = sotounpcb(so2);
 		UNP_PCB_LOCK(unp3);
 		if (unp2->unp_addr != NULL) {
 			/* The new socket takes ownership of sa. */
 			bcopy(unp2->unp_addr, sa, unp2->unp_addr->sun_len);
 			unp3->unp_addr = (struct sockaddr_un *) sa;
 			sa = NULL;
 		}
 
 		/*
 		 * The connector's (client's) credentials are copied from its
 		 * process structure at the time of connect() (which is now).
 		 */
 		cru2x(td->td_ucred, &unp3->unp_peercred);
 		unp3->unp_flags |= UNP_HAVEPC;
 
 		/*
 		 * The receiver's (server's) credentials are copied from the
 		 * unp_peercred member of socket on which the former called
 		 * listen(); uipc_listen() cached that process's credentials
 		 * at that time so we can use them now.
 		 */
 		memcpy(&unp->unp_peercred, &unp2->unp_peercred,
 		    sizeof(unp->unp_peercred));
 		unp->unp_flags |= UNP_HAVEPC;
 		if (unp2->unp_flags & UNP_WANTCRED)
 			unp3->unp_flags |= UNP_WANTCRED;
 		/*
 		 * From here on unp2 names the new socket's pcb, so the
 		 * unlock at bad3 releases unp3 rather than the listener.
 		 */
 		UNP_PCB_UNLOCK(unp2);
 		unp2 = unp3;
 #ifdef MAC
 		mac_socketpeer_set_from_socket(so, so2);
 		mac_socketpeer_set_from_socket(so2, so);
 #endif
 	}
 
 	KASSERT(unp2 != NULL && so2 != NULL && unp2->unp_socket == so2 &&
 	    sotounpcb(so2) == unp2,
 	    ("%s: unp2 %p so2 %p", __func__, unp2, so2));
 	error = unp_connect2(so, so2, PRU_CONNECT);
 bad3:
 	UNP_PCB_UNLOCK(unp2);
 	UNP_PCB_UNLOCK(unp);
 bad2:
 	UNP_LINK_WUNLOCK();
 bad:
 	if (vp != NULL)
 		vput(vp);
 	/* sa is NULL here if ownership moved to the new socket. */
 	free(sa, M_SONAME);
 	/* Return with the link write lock held again, as on entry. */
 	UNP_LINK_WLOCK();
 	UNP_PCB_LOCK(unp);
 	unp->unp_flags &= ~UNP_CONNECTING;
 	UNP_PCB_UNLOCK(unp);
 	return (error);
 }
 
 /*
  * Link the pcbs behind so and so2.  The caller holds the link write lock
  * and both pcb locks (asserted below).
  *
  * Datagram sockets get a one-way link: so points at so2 and is added to
  * so2's list of referencing pcbs.  Stream and seqpacket sockets are linked
  * both ways; so is left in the "connecting" state instead of "connected"
  * only when req is PRU_CONNECT and either side has UNP_CONNWAIT set.
  *
  * Returns EPROTOTYPE if the socket types differ, otherwise 0.  Panics on a
  * socket type it does not know.
  */
 static int
 unp_connect2(struct socket *so, struct socket *so2, int req)
 {
 	struct unpcb *self, *peer;
 
 	self = sotounpcb(so);
 	KASSERT(self != NULL, ("unp_connect2: unp == NULL"));
 	peer = sotounpcb(so2);
 	KASSERT(peer != NULL, ("unp_connect2: unp2 == NULL"));
 
 	UNP_LINK_WLOCK_ASSERT();
 	UNP_PCB_LOCK_ASSERT(self);
 	UNP_PCB_LOCK_ASSERT(peer);
 
 	if (so->so_type != so2->so_type)
 		return (EPROTOTYPE);
 	peer->unp_flags &= ~UNP_NASCENT;
 	self->unp_conn = peer;
 
 	switch (so->so_type) {
 	case SOCK_STREAM:
 	case SOCK_SEQPACKET:
 		peer->unp_conn = self;
 		if (req != PRU_CONNECT ||
 		    ((self->unp_flags | peer->unp_flags) & UNP_CONNWAIT) == 0)
 			soisconnected(so);
 		else
 			soisconnecting(so);
 		soisconnected(so2);
 		break;
 
 	case SOCK_DGRAM:
 		LIST_INSERT_HEAD(&peer->unp_refs, self, unp_reflink);
 		soisconnected(so);
 		break;
 
 	default:
 		panic("unp_connect2");
 	}
 	return (0);
 }
 
 /*
  * Undo the link from unp to unp2.  The caller holds the link write lock and
  * both pcb locks (asserted below).
  *
  * For a datagram socket only unp's side is touched: it leaves unp2's
  * reference list and loses SS_ISCONNECTED.  For stream and seqpacket
  * sockets both ends are unlinked and both sockets are marked disconnected.
  * Other socket types only have unp_conn cleared.
  */
 static void
 unp_disconnect(struct unpcb *unp, struct unpcb *unp2)
 {
 	struct socket *self_so;
 
 	KASSERT(unp2 != NULL, ("unp_disconnect: unp2 == NULL"));
 
 	UNP_LINK_WLOCK_ASSERT();
 	UNP_PCB_LOCK_ASSERT(unp);
 	UNP_PCB_LOCK_ASSERT(unp2);
 
 	unp->unp_conn = NULL;
 	self_so = unp->unp_socket;
 	if (self_so->so_type == SOCK_DGRAM) {
 		LIST_REMOVE(unp, unp_reflink);
 		SOCK_LOCK(self_so);
 		self_so->so_state &= ~SS_ISCONNECTED;
 		SOCK_UNLOCK(self_so);
 	} else if (self_so->so_type == SOCK_STREAM ||
 	    self_so->so_type == SOCK_SEQPACKET) {
 		soisdisconnected(self_so);
 		unp2->unp_conn = NULL;
 		soisdisconnected(unp2->unp_socket);
 	}
 }
 
 /*
  * unp_pcblist() walks the global list of struct unpcb's to generate a
  * pointer list, bumping the refcount on each unpcb.  It then copies them out
  * sequentially, validating the generation number on each to see if it has
  * been detached.  All of this is necessary because copyout() may sleep on
  * disk I/O.
  *
  * arg1 selects the list to export (SOCK_STREAM, SOCK_DGRAM or
  * SOCK_SEQPACKET); any other value panics.  The output is a struct xunpgen
  * header, one struct xunpcb per exported socket, and a trailing struct
  * xunpgen carrying the counters as they stand at the end.  A request
  * without an output buffer only gets a size estimate; a request that
  * supplies new data fails with EPERM.
  */
 static int
 unp_pcblist(SYSCTL_HANDLER_ARGS)
 {
-	int error, i, n;
-	int freeunp;
 	struct unpcb *unp, **unp_list;
 	unp_gen_t gencnt;
 	struct xunpgen *xug;
 	struct unp_head *head;
 	struct xunpcb *xu;
+	u_int i;
+	int freeunp, error, n;
 
 	switch ((intptr_t)arg1) {
 	case SOCK_STREAM:
 		head = &unp_shead;
 		break;
 
 	case SOCK_DGRAM:
 		head = &unp_dhead;
 		break;
 
 	case SOCK_SEQPACKET:
 		head = &unp_sphead;
 		break;
 
 	default:
 		panic("unp_pcblist: arg1 %d", (int)(intptr_t)arg1);
 	}
 
 	/*
 	 * The process of preparing the PCB list is too time-consuming and
 	 * resource-intensive to repeat twice on every request.
 	 */
 	if (req->oldptr == NULL) {
 		/* Size estimate: both headers plus 1/8 headroom on the count. */
 		n = unp_count;
 		req->oldidx = 2 * (sizeof *xug)
 			+ (n + n/8) * sizeof(struct xunpcb);
 		return (0);
 	}
 
 	if (req->newptr != NULL)
 		return (EPERM);
 
 	/*
 	 * OK, now we're committed to doing something.
 	 */
 	xug = malloc(sizeof(*xug), M_TEMP, M_WAITOK);
 	/* Snapshot the generation and count; both are rechecked later. */
 	UNP_LINK_RLOCK();
 	gencnt = unp_gencnt;
 	n = unp_count;
 	UNP_LINK_RUNLOCK();
 
 	xug->xug_len = sizeof *xug;
 	xug->xug_count = n;
 	xug->xug_gen = gencnt;
 	xug->xug_sogen = so_gencnt;
 	error = SYSCTL_OUT(req, xug, sizeof *xug);
 	if (error) {
 		free(xug, M_TEMP);
 		return (error);
 	}
 
 	unp_list = malloc(n * sizeof *unp_list, M_TEMP, M_WAITOK);
 
 	/*
 	 * Collect up to n pcbs no newer than the snapshot, skipping those
 	 * for which cr_cansee() returns non-zero, and take a reference on
 	 * each so it survives until it has been copied out.
 	 */
 	UNP_LINK_RLOCK();
 	for (unp = LIST_FIRST(head), i = 0; unp && i < n;
 	     unp = LIST_NEXT(unp, unp_link)) {
 		UNP_PCB_LOCK(unp);
 		if (unp->unp_gencnt <= gencnt) {
 			if (cr_cansee(req->td->td_ucred,
 			    unp->unp_socket->so_cred)) {
 				UNP_PCB_UNLOCK(unp);
 				continue;
 			}
 			unp_list[i++] = unp;
 			unp->unp_refcount++;
 		}
 		UNP_PCB_UNLOCK(unp);
 	}
 	UNP_LINK_RUNLOCK();
 	n = i;			/* In case we lost some during malloc. */
 
 	error = 0;
 	xu = malloc(sizeof(*xu), M_TEMP, M_WAITOK | M_ZERO);
 	/*
 	 * Drop each reference again.  A pcb that is still referenced by
 	 * someone else is copied out; one whose count reaches zero here is
 	 * destroyed and freed instead.  The loop is not cut short on a
 	 * copy-out error, so every reference taken above is released.
 	 */
 	for (i = 0; i < n; i++) {
 		unp = unp_list[i];
 		UNP_PCB_LOCK(unp);
 		unp->unp_refcount--;
 	        if (unp->unp_refcount != 0 && unp->unp_gencnt <= gencnt) {
 			xu->xu_len = sizeof *xu;
 			xu->xu_unpp = unp;
 			/*
 			 * XXX - need more locking here to protect against
 			 * connect/disconnect races for SMP.
 			 */
 			if (unp->unp_addr != NULL)
 				bcopy(unp->unp_addr, &xu->xu_addr,
 				      unp->unp_addr->sun_len);
 			else
 				bzero(&xu->xu_addr, sizeof(xu->xu_addr));
 			if (unp->unp_conn != NULL &&
 			    unp->unp_conn->unp_addr != NULL)
 				bcopy(unp->unp_conn->unp_addr,
 				      &xu->xu_caddr,
 				      unp->unp_conn->unp_addr->sun_len);
 			else
 				bzero(&xu->xu_caddr, sizeof(xu->xu_caddr));
 			xu->unp_vnode = unp->unp_vnode;
 			xu->unp_conn = unp->unp_conn;
 			xu->xu_firstref = LIST_FIRST(&unp->unp_refs);
 			xu->xu_nextref = LIST_NEXT(unp, unp_reflink);
 			xu->unp_gencnt = unp->unp_gencnt;
 			sotoxsocket(unp->unp_socket, &xu->xu_socket);
 			UNP_PCB_UNLOCK(unp);
 			error = SYSCTL_OUT(req, xu, sizeof *xu);
 		} else {
 			freeunp = (unp->unp_refcount == 0);
 			UNP_PCB_UNLOCK(unp);
 			if (freeunp) {
 				UNP_PCB_LOCK_DESTROY(unp);
 				uma_zfree(unp_zone, unp);
 			}
 		}
 	}
 	free(xu, M_TEMP);
 	if (!error) {
 		/*
 		 * Give the user an updated idea of our state.  If the
 		 * generation differs from what we told her before, she knows
 		 * that something happened while we were processing this
 		 * request, and it might be necessary to retry.
 		 */
 		xug->xug_gen = unp_gencnt;
 		xug->xug_sogen = so_gencnt;
 		xug->xug_count = unp_count;
 		error = SYSCTL_OUT(req, xug, sizeof *xug);
 	}
 	free(unp_list, M_TEMP);
 	free(xug, M_TEMP);
 	return (error);
 }
 
 /*
  * Read-only "pcblist" sysctl nodes, one per socket type.  Each passes its
  * socket type to unp_pcblist() as arg1, which uses it to pick the list to
  * export.
  */
 SYSCTL_PROC(_net_local_dgram, OID_AUTO, pcblist, CTLTYPE_OPAQUE | CTLFLAG_RD,
     (void *)(intptr_t)SOCK_DGRAM, 0, unp_pcblist, "S,xunpcb",
     "List of active local datagram sockets");
 SYSCTL_PROC(_net_local_stream, OID_AUTO, pcblist, CTLTYPE_OPAQUE | CTLFLAG_RD,
     (void *)(intptr_t)SOCK_STREAM, 0, unp_pcblist, "S,xunpcb",
     "List of active local stream sockets");
 SYSCTL_PROC(_net_local_seqpacket, OID_AUTO, pcblist,
     CTLTYPE_OPAQUE | CTLFLAG_RD,
     (void *)(intptr_t)SOCK_SEQPACKET, 0, unp_pcblist, "S,xunpcb",
     "List of active local seqpacket sockets");
 
 /*
  * Tell the peer of a stream or seqpacket socket that no more data will
  * arrive, by marking the peer's socket as unable to receive more.  Does
  * nothing for other socket types or when there is no peer.  The caller
  * holds the link write lock and unp's pcb lock (asserted below).
  */
 static void
 unp_shutdown(struct unpcb *unp)
 {
 	struct unpcb *peer;
 	struct socket *peer_so;
 
 	UNP_LINK_WLOCK_ASSERT();
 	UNP_PCB_LOCK_ASSERT(unp);
 
 	peer = unp->unp_conn;
 	if (unp->unp_socket->so_type != SOCK_STREAM &&
 	    unp->unp_socket->so_type != SOCK_SEQPACKET)
 		return;
 	if (peer == NULL)
 		return;
 
 	peer_so = peer->unp_socket;
 	if (peer_so != NULL)
 		socantrcvmore(peer_so);
 }
 
 /*
  * Record ECONNRESET on unp's socket and, if it is connected, disconnect it
  * from its peer.  The caller holds the link write lock and unp's pcb lock
  * (asserted below); the peer's pcb lock is taken around the disconnect.
  */
 static void
 unp_drop(struct unpcb *unp)
 {
 	struct unpcb *peer;
 
 	UNP_LINK_WLOCK_ASSERT();
 	UNP_PCB_LOCK_ASSERT(unp);
 
 	/*
 	 * POSIX requires ECONNRESET here no matter whether the peer dropped
 	 * the connection by aborting or by disconnecting.
 	 */
 	unp->unp_socket->so_error = ECONNRESET;
 
 	peer = unp->unp_conn;
 	if (peer != NULL) {
 		UNP_PCB_LOCK(peer);
 		unp_disconnect(unp, peer);
 		UNP_PCB_UNLOCK(peer);
 	}
 }
 
 /*
  * Release fdcount in-flight descriptors: free each entry's capability
  * rights and discard its file, then free the entry storage through fdep[0].
  * (unp_internalize() allocates all entries of one message as a single
  * array, which is why one free() covers them.)  fdcount must be positive.
  */
 static void
 unp_freerights(struct filedescent **fdep, int fdcount)
 {
 	struct filedescent *ent;
 	struct file *fp;
 	int idx;
 
 	KASSERT(fdcount > 0, ("%s: fdcount %d", __func__, fdcount));
 
 	for (idx = 0; idx < fdcount; idx++) {
 		ent = fdep[idx];
 		fp = ent->fde_file;
 		filecaps_free(&ent->fde_caps);
 		unp_discard(fp);
 	}
 	free(fdep[0], M_FILECAPS);
 }
 
 /*
  * Convert the in-kernel control messages held in control into the form
  * delivered to the receiving process, chaining the results from *controlp.
  *
  * SCM_RIGHTS payloads (arrays of struct filedescent pointers) become newly
  * allocated descriptor numbers in the current thread's process; every
  * other message is copied across unchanged.  When controlp is NULL, or once
  * an error has been recorded, rights are released with unp_freerights() and
  * other messages are skipped, so the walk still visits every message.
  *
  * control is always freed.  Returns 0 or the recorded errno (EINVAL,
  * E2BIG, EMSGSIZE or ENOBUFS).
  */
 static int
 unp_externalize(struct mbuf *control, struct mbuf **controlp, int flags)
 {
 	struct thread *td = curthread;		/* XXX */
 	struct cmsghdr *cm = mtod(control, struct cmsghdr *);
 	int i;
 	int *fdp;
 	struct filedesc *fdesc = td->td_proc->p_fd;
 	struct filedescent **fdep;
 	void *data;
 	socklen_t clen = control->m_len, datalen;
 	int error, newfds;
 	u_int newlen;
 
 	UNP_LINK_UNLOCK_ASSERT();
 
 	error = 0;
 	if (controlp != NULL) /* controlp == NULL => free control messages */
 		*controlp = NULL;
 	while (cm != NULL) {
 		if (sizeof(*cm) > clen || cm->cmsg_len > clen) {
 			error = EINVAL;
 			break;
 		}
 		data = CMSG_DATA(cm);
 		datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data;
 		if (cm->cmsg_level == SOL_SOCKET
 		    && cm->cmsg_type == SCM_RIGHTS) {
 			newfds = datalen / sizeof(*fdep);
 			if (newfds == 0)
 				goto next;
 			fdep = data;
 
 			/* If we're not outputting the descriptors free them. */
 			if (error || controlp == NULL) {
 				unp_freerights(fdep, newfds);
 				goto next;
 			}
 			FILEDESC_XLOCK(fdesc);
 
 			/*
 			 * Now change each pointer to an fd in the global
 			 * table to an integer that is the index to the local
 			 * fd table entry that we set up to point to the
 			 * global one we are transferring.
 			 */
 			newlen = newfds * sizeof(int);
 			*controlp = sbcreatecontrol(NULL, newlen,
 			    SCM_RIGHTS, SOL_SOCKET);
 			if (*controlp == NULL) {
 				FILEDESC_XUNLOCK(fdesc);
 				error = E2BIG;
 				unp_freerights(fdep, newfds);
 				goto next;
 			}
 
 			fdp = (int *)
 			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
 			if (fdallocn(td, 0, fdp, newfds) != 0) {
 				FILEDESC_XUNLOCK(fdesc);
 				error = EMSGSIZE;
 				unp_freerights(fdep, newfds);
 				m_freem(*controlp);
 				*controlp = NULL;
 				goto next;
 			}
 			/* Install each file and drop its in-flight accounting. */
 			for (i = 0; i < newfds; i++, fdp++) {
 				_finstall(fdesc, fdep[i]->fde_file, *fdp,
 				    (flags & MSG_CMSG_CLOEXEC) != 0 ? UF_EXCLOSE : 0,
 				    &fdep[i]->fde_caps);
 				unp_externalize_fp(fdep[i]->fde_file);
 			}
 			FILEDESC_XUNLOCK(fdesc);
 			free(fdep[0], M_FILECAPS);
 		} else {
 			/* We can just copy anything else across. */
 			if (error || controlp == NULL)
 				goto next;
 			*controlp = sbcreatecontrol(NULL, datalen,
 			    cm->cmsg_type, cm->cmsg_level);
 			if (*controlp == NULL) {
 				error = ENOBUFS;
 				goto next;
 			}
 			bcopy(data,
 			    CMSG_DATA(mtod(*controlp, struct cmsghdr *)),
 			    datalen);
 		}
 		/* Reached only when an output mbuf was stored in *controlp. */
 		controlp = &(*controlp)->m_next;
 
 next:
 		/* Step to the following message, or stop at the last one. */
 		if (CMSG_SPACE(datalen) < clen) {
 			clen -= CMSG_SPACE(datalen);
 			cm = (struct cmsghdr *)
 			    ((caddr_t)cm + CMSG_SPACE(datalen));
 		} else {
 			clen = 0;
 			cm = NULL;
 		}
 	}
 
 	m_freem(control);
 	return (error);
 }
 
 /*
  * Handler for the maxsockets_change event (registered in unp_init()):
  * re-applies the current maxsockets value as the limit of the unpcb zone.
  * The tag argument is not used.
  */
 static void
 unp_zone_change(void *tag)
 {
 
 	uma_zone_set_max(unp_zone, maxsockets);
 }
 
 /*
  * One-time initialization of the local-socket layer: create the unpcb zone
  * (limited to maxsockets, and kept in step with later changes through the
  * maxsockets_change event), initialize the three per-type pcb lists and the
  * deferred-close list, set up the garbage-collector and deferred-close
  * tasks, and initialize the link and deferred locks.
  *
  * With VIMAGE, only the default vnet does any of this.
  */
 static void
 unp_init(void)
 {
 
 #ifdef VIMAGE
 	if (!IS_DEFAULT_VNET(curvnet))
 		return;
 #endif
 	unp_zone = uma_zcreate("unpcb", sizeof(struct unpcb), NULL, NULL,
 	    NULL, NULL, UMA_ALIGN_PTR, 0);
 	if (unp_zone == NULL)
 		panic("unp_init");
 	uma_zone_set_max(unp_zone, maxsockets);
 	uma_zone_set_warning(unp_zone, "kern.ipc.maxsockets limit reached");
 	EVENTHANDLER_REGISTER(maxsockets_change, unp_zone_change,
 	    NULL, EVENTHANDLER_PRI_ANY);
 	LIST_INIT(&unp_dhead);
 	LIST_INIT(&unp_shead);
 	LIST_INIT(&unp_sphead);
 	SLIST_INIT(&unp_defers);
 	TIMEOUT_TASK_INIT(taskqueue_thread, &unp_gc_task, 0, unp_gc, NULL);
 	TASK_INIT(&unp_defer_task, 0, unp_process_defers, NULL);
 	UNP_LINK_LOCK_INIT();
 	UNP_DEFERRED_LOCK_INIT();
 }
 
 /*
  * Convert the control messages a sender attached to a message into their
  * in-kernel form.
  *
  * On entry *controlp is the mbuf holding the sender's cmsghdr sequence; it
  * is always freed before returning.  On return *controlp heads a chain of
  * newly built control mbufs, one for each input message that produced
  * output:
  *
  *   SCM_CREDS      a struct cmsgcred filled in from td's process and
  *                  credentials (the sender's payload is ignored);
  *   SCM_RIGHTS     the descriptor numbers are validated and replaced by
  *                  pointers to struct filedescent copies, with each file
  *                  accounted for through unp_internalize_fp(); a message
  *                  carrying no descriptors produces no output at all;
  *   SCM_TIMESTAMP, SCM_BINTIME, SCM_REALTIME, SCM_MONOTONIC
  *                  the current time from microtime(), bintime(),
  *                  nanotime() and nanouptime() respectively.
  *
  * Returns 0, or EINVAL (malformed header, level other than SOL_SOCKET,
  * unknown type), ENOBUFS or E2BIG (no mbuf for the output), EBADF (bad
  * descriptor), EOPNOTSUPP (descriptor that may not be passed).  On error,
  * mbufs built for earlier messages stay linked from the caller's pointer.
  */
 static int
 unp_internalize(struct mbuf **controlp, struct thread *td)
 {
 	struct mbuf *control = *controlp;
 	struct proc *p = td->td_proc;
 	struct filedesc *fdesc = p->p_fd;
 	struct bintime *bt;
 	struct cmsghdr *cm = mtod(control, struct cmsghdr *);
 	struct cmsgcred *cmcred;
 	struct filedescent *fde, **fdep, *fdev;
 	struct file *fp;
 	struct timeval *tv;
 	struct timespec *ts;
 	int i, *fdp;
 	void *data;
 	socklen_t clen = control->m_len, datalen;
 	int error, oldfds;
 	u_int newlen;
 
 	UNP_LINK_UNLOCK_ASSERT();
 
 	error = 0;
 	*controlp = NULL;
 	while (cm != NULL) {
 		if (sizeof(*cm) > clen || cm->cmsg_level != SOL_SOCKET
 		    || cm->cmsg_len > clen || cm->cmsg_len < sizeof(*cm)) {
 			error = EINVAL;
 			goto out;
 		}
 		data = CMSG_DATA(cm);
 		datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data;
 
 		switch (cm->cmsg_type) {
 		/*
 		 * Fill in credential information.
 		 */
 		case SCM_CREDS:
 			*controlp = sbcreatecontrol(NULL, sizeof(*cmcred),
 			    SCM_CREDS, SOL_SOCKET);
 			if (*controlp == NULL) {
 				error = ENOBUFS;
 				goto out;
 			}
 			cmcred = (struct cmsgcred *)
 			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
 			cmcred->cmcred_pid = p->p_pid;
 			cmcred->cmcred_uid = td->td_ucred->cr_ruid;
 			cmcred->cmcred_gid = td->td_ucred->cr_rgid;
 			cmcred->cmcred_euid = td->td_ucred->cr_uid;
 			cmcred->cmcred_ngroups = MIN(td->td_ucred->cr_ngroups,
 			    CMGROUP_MAX);
 			for (i = 0; i < cmcred->cmcred_ngroups; i++)
 				cmcred->cmcred_groups[i] =
 				    td->td_ucred->cr_groups[i];
 			break;
 
 		case SCM_RIGHTS:
 			oldfds = datalen / sizeof (int);
 			/* Nothing to pass: leave *controlp NULL. */
 			if (oldfds == 0)
 				break;
 			/*
 			 * Check that all the FDs passed in refer to legal
 			 * files.  If not, reject the entire operation.
 			 */
 			fdp = data;
 			FILEDESC_SLOCK(fdesc);
 			for (i = 0; i < oldfds; i++, fdp++) {
 				fp = fget_locked(fdesc, *fdp);
 				if (fp == NULL) {
 					FILEDESC_SUNLOCK(fdesc);
 					error = EBADF;
 					goto out;
 				}
 				if (!(fp->f_ops->fo_flags & DFLAG_PASSABLE)) {
 					FILEDESC_SUNLOCK(fdesc);
 					error = EOPNOTSUPP;
 					goto out;
 				}
 
 			}
 
 			/*
 			 * Now replace the integer FDs with pointers to the
 			 * file structure and capability rights.
 			 */
 			newlen = oldfds * sizeof(fdep[0]);
 			*controlp = sbcreatecontrol(NULL, newlen,
 			    SCM_RIGHTS, SOL_SOCKET);
 			if (*controlp == NULL) {
 				FILEDESC_SUNLOCK(fdesc);
 				error = E2BIG;
 				goto out;
 			}
 			fdp = data;
 			fdep = (struct filedescent **)
 			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
 			/*
 			 * One array holds every entry of this message; the
 			 * consumers free it through the first pointer.
 			 */
 			fdev = malloc(sizeof(*fdev) * oldfds, M_FILECAPS,
 			    M_WAITOK);
 			for (i = 0; i < oldfds; i++, fdev++, fdp++) {
 				fde = &fdesc->fd_ofiles[*fdp];
 				fdep[i] = fdev;
 				fdep[i]->fde_file = fde->fde_file;
 				filecaps_copy(&fde->fde_caps,
 				    &fdep[i]->fde_caps, true);
 				unp_internalize_fp(fdep[i]->fde_file);
 			}
 			FILEDESC_SUNLOCK(fdesc);
 			break;
 
 		case SCM_TIMESTAMP:
 			*controlp = sbcreatecontrol(NULL, sizeof(*tv),
 			    SCM_TIMESTAMP, SOL_SOCKET);
 			if (*controlp == NULL) {
 				error = ENOBUFS;
 				goto out;
 			}
 			tv = (struct timeval *)
 			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
 			microtime(tv);
 			break;
 
 		case SCM_BINTIME:
 			*controlp = sbcreatecontrol(NULL, sizeof(*bt),
 			    SCM_BINTIME, SOL_SOCKET);
 			if (*controlp == NULL) {
 				error = ENOBUFS;
 				goto out;
 			}
 			bt = (struct bintime *)
 			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
 			bintime(bt);
 			break;
 
 		case SCM_REALTIME:
 			*controlp = sbcreatecontrol(NULL, sizeof(*ts),
 			    SCM_REALTIME, SOL_SOCKET);
 			if (*controlp == NULL) {
 				error = ENOBUFS;
 				goto out;
 			}
 			ts = (struct timespec *)
 			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
 			nanotime(ts);
 			break;
 
 		case SCM_MONOTONIC:
 			*controlp = sbcreatecontrol(NULL, sizeof(*ts),
 			    SCM_MONOTONIC, SOL_SOCKET);
 			if (*controlp == NULL) {
 				error = ENOBUFS;
 				goto out;
 			}
 			ts = (struct timespec *)
 			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
 			nanouptime(ts);
 			break;
 
 		default:
 			error = EINVAL;
 			goto out;
 		}
 
 		/*
 		 * Advance the output link only if this message produced an
 		 * mbuf.  An SCM_RIGHTS message without descriptors leaves
 		 * *controlp NULL; stepping through it would make the next
 		 * message's result be stored through a pointer derived from
 		 * NULL.
 		 */
 		if (*controlp != NULL)
 			controlp = &(*controlp)->m_next;
 		if (CMSG_SPACE(datalen) < clen) {
 			clen -= CMSG_SPACE(datalen);
 			cm = (struct cmsghdr *)
 			    ((caddr_t)cm + CMSG_SPACE(datalen));
 		} else {
 			clen = 0;
 			cm = NULL;
 		}
 	}
 
 out:
 	m_freem(control);
 	return (error);
 }
 
 /*
  * Put an SCM_CREDS control message describing td's credentials (as a
  * struct sockcred, with at most CMGROUP_MAX groups) at the front of the
  * control chain, and return the new head.
  *
  * Any SCM_CREDS mbufs already in the chain are removed and freed first,
  * because they carry the other (struct cmsgcred) layout.  If no mbuf can
  * be obtained for the new message, control is returned untouched.
  */
 static struct mbuf *
 unp_addsockcred(struct thread *td, struct mbuf *control)
 {
 	struct mbuf *credm, *cur, **linkp;
 	struct sockcred *cred;
 	const struct cmsghdr *hdr;
 	int idx, ngroups;
 
 	ngroups = MIN(td->td_ucred->cr_ngroups, CMGROUP_MAX);
 	credm = sbcreatecontrol(NULL, SOCKCREDSIZE(ngroups), SCM_CREDS,
 	    SOL_SOCKET);
 	if (credm == NULL)
 		return (control);
 
 	cred = (struct sockcred *)CMSG_DATA(mtod(credm, struct cmsghdr *));
 	cred->sc_uid = td->td_ucred->cr_ruid;
 	cred->sc_euid = td->td_ucred->cr_uid;
 	cred->sc_gid = td->td_ucred->cr_rgid;
 	cred->sc_egid = td->td_ucred->cr_gid;
 	cred->sc_ngroups = ngroups;
 	for (idx = 0; idx < cred->sc_ngroups; idx++)
 		cred->sc_groups[idx] = td->td_ucred->cr_groups[idx];
 
 	/*
 	 * Walk the chain through the link that points at the current mbuf,
 	 * so that unlinking the head and unlinking an interior mbuf are the
 	 * same operation.
 	 */
 	linkp = &control;
 	while ((cur = *linkp) != NULL) {
 		hdr = mtod(cur, struct cmsghdr *);
 		if (hdr->cmsg_level == SOL_SOCKET &&
 		    hdr->cmsg_type == SCM_CREDS) {
 			*linkp = cur->m_next;
 			(void)m_free(cur);
 		} else
 			linkp = &cur->m_next;
 	}
 
 	/* The new message goes first. */
 	credm->m_next = control;
 	return (credm);
 }
 
 /*
  * Map a file to the unpcb of the local-domain socket behind it.  Returns
  * NULL when the file is not a socket, has no socket attached, or the
  * socket belongs to a domain other than localdomain.
  */
 static struct unpcb *
 fptounp(struct file *fp)
 {
 	struct socket *sock;
 
 	if (fp->f_type == DTYPE_SOCKET) {
 		sock = fp->f_data;
 		if (sock != NULL &&
 		    sock->so_proto->pr_domain == &localdomain)
 			return (sotounpcb(sock));
 	}
 	return (NULL);
 }
 
 /*
  * Give up an in-flight reference to fp.  The in-flight accounting is
  * undone first; if that reports the file as a local-domain socket, the
  * close is queued on unp_defers and handed to the deferred-close task
  * rather than being done in this context.  Any other file is closed
  * right away.
  */
 static void
 unp_discard(struct file *fp)
 {
 	struct unp_defer *deferred;
 
 	if (!unp_externalize_fp(fp)) {
 		(void) closef(fp, (struct thread *)NULL);
 		return;
 	}
 
 	deferred = malloc(sizeof(*deferred), M_TEMP, M_WAITOK);
 	deferred->ud_fp = fp;
 	UNP_DEFERRED_LOCK();
 	SLIST_INSERT_HEAD(&unp_defers, deferred, ud_link);
 	UNP_DEFERRED_UNLOCK();
 	atomic_add_int(&unp_defers_count, 1);
 	taskqueue_enqueue(taskqueue_thread, &unp_defer_task);
 }
 
 /*
  * Task handler that closes the files queued by unp_discard().  The shared
  * list is swapped into a private one under the deferred lock and drained
  * without it; this repeats until the shared list is found empty.  The
  * global count of deferred closes is reduced once per drained batch.
  * Neither argument is used.
  */
 static void
 unp_process_defers(void *arg __unused, int pending)
 {
 	SLIST_HEAD(, unp_defer) batch;
 	struct unp_defer *item;
 	int closed;
 
 	SLIST_INIT(&batch);
 	for (;;) {
 		UNP_DEFERRED_LOCK();
 		if (SLIST_FIRST(&unp_defers) == NULL) {
 			UNP_DEFERRED_UNLOCK();
 			return;
 		}
 		SLIST_SWAP(&unp_defers, &batch, unp_defer);
 		UNP_DEFERRED_UNLOCK();
 
 		for (closed = 0; (item = SLIST_FIRST(&batch)) != NULL;
 		    closed++) {
 			SLIST_REMOVE_HEAD(&batch, ud_link);
 			closef(item->ud_fp, NULL);
 			free(item, M_TEMP);
 		}
 		atomic_add_int(&unp_defers_count, -closed);
 	}
 }
 
 /*
  * Account for fp going in flight inside a control message: take a hold on
  * the file and bump the global unp_rights count.  If the file is itself a
  * local-domain socket, also remember the file in its pcb and bump that
  * pcb's message count.  Everything happens under the link write lock.
  */
 static void
 unp_internalize_fp(struct file *fp)
 {
 	struct unpcb *pcb;
 
 	UNP_LINK_WLOCK();
 	pcb = fptounp(fp);
 	if (pcb != NULL) {
 		pcb->unp_file = fp;
 		pcb->unp_msgcount++;
 	}
 	fhold(fp);
 	unp_rights++;
 	UNP_LINK_WUNLOCK();
 }
 
 /*
  * Undo the accounting done by unp_internalize_fp(): drop the global
  * unp_rights count and, if fp is a local-domain socket, its pcb's message
  * count.  The hold on the file itself is not released here.
  *
  * Returns 1 if fp is a local-domain socket, 0 otherwise.
  */
 static int
 unp_externalize_fp(struct file *fp)
 {
 	struct unpcb *pcb;
 	int is_local;
 
 	UNP_LINK_WLOCK();
 	pcb = fptounp(fp);
 	is_local = (pcb != NULL);
 	if (is_local)
 		pcb->unp_msgcount--;
 	unp_rights--;
 	UNP_LINK_WUNLOCK();
 	return (is_local);
 }
 
 /*
  * Scratch counters for the garbage collector.  unp_marked is bumped by
  * unp_accessable() each time a socket is newly flagged UNPGC_REF, and
  * unp_gc() repeats its marking scan for as long as a pass leaves it
  * non-zero.  unp_unreachable is bumped by unp_gc_process() each time a
  * socket is flagged UNPGC_DEAD; unp_gc() uses it to size its list of
  * candidates.
  *
  * NOTE(review): the comment previously found here described a variable
  * named "unp_defer" (deferred work for a future pass through unp_gc()) as
  * thread local and needing no explicit synchronization.  No such variable
  * exists here, and these two are plain file-scope statics -- confirm what
  * actually serializes access to them.
  */
 static int	unp_marked;
 static int	unp_unreachable;
 
 /*
  * unp_scan() callback used while marking: every local-domain socket found
  * among the fdcount in-flight descriptors is reachable, so flag it
  * UNPGC_REF and clear UNPGC_DEAD.  Sockets that already carry UNPGC_REF
  * are left alone; each newly flagged one bumps unp_marked.
  */
 static void
 unp_accessable(struct filedescent **fdep, int fdcount)
 {
 	struct unpcb *pcb;
 	int idx;
 
 	for (idx = 0; idx < fdcount; idx++) {
 		pcb = fptounp(fdep[idx]->fde_file);
 		if (pcb == NULL || (pcb->unp_gcflag & UNPGC_REF) != 0)
 			continue;
 		pcb->unp_gcflag = (pcb->unp_gcflag & ~UNPGC_DEAD) | UNPGC_REF;
 		unp_marked++;
 	}
 }
 
 static void
 unp_gc_process(struct unpcb *unp)
 {
 	struct socket *so, *soa;
 	struct file *fp;
 
 	/* Already processed. */
 	if (unp->unp_gcflag & UNPGC_SCANNED)
 		return;
 	fp = unp->unp_file;
 
 	/*
 	 * Check for a socket potentially in a cycle.  It must be in a
 	 * queue as indicated by msgcount, and this must equal the file
 	 * reference count.  Note that when msgcount is 0 the file is NULL.
 	 */
 	if ((unp->unp_gcflag & UNPGC_REF) == 0 && fp &&
 	    unp->unp_msgcount != 0 && fp->f_count == unp->unp_msgcount) {
 		unp->unp_gcflag |= UNPGC_DEAD;
 		unp_unreachable++;
 		return;
 	}
 
 	so = unp->unp_socket;
 	SOCK_LOCK(so);
 	if (SOLISTENING(so)) {
 		/*
 		 * Mark all sockets in our accept queue.
 		 */
 		TAILQ_FOREACH(soa, &so->sol_comp, so_list) {
 			if (sotounpcb(soa)->unp_gcflag & UNPGC_IGNORE_RIGHTS)
 				continue;
 			SOCKBUF_LOCK(&soa->so_rcv);
 			unp_scan(soa->so_rcv.sb_mb, unp_accessable);
 			SOCKBUF_UNLOCK(&soa->so_rcv);
 		}
 	} else {
 		/*
 		 * Mark all sockets we reference with RIGHTS.
 		 */
 		if ((unp->unp_gcflag & UNPGC_IGNORE_RIGHTS) == 0) {
 			SOCKBUF_LOCK(&so->so_rcv);
 			unp_scan(so->so_rcv.sb_mb, unp_accessable);
 			SOCKBUF_UNLOCK(&so->so_rcv);
 		}
 	}
 	SOCK_UNLOCK(so);
 	unp->unp_gcflag |= UNPGC_SCANNED;
 }
 
 /* Running total of files collected by unp_gc(); exported read-only. */
 static int unp_recycled;
 SYSCTL_INT(_net_local, OID_AUTO, recycled, CTLFLAG_RD, &unp_recycled, 0, 
     "Number of unreachable sockets claimed by the garbage collector.");
 
 /* Incremented at the start of every unp_gc() run; exported read-only. */
 static int unp_taskcount;
 SYSCTL_INT(_net_local, OID_AUTO, taskcount, CTLFLAG_RD, &unp_taskcount, 0, 
     "Number of times the garbage collector has run.");
 
 /*
  * Garbage collector task for local sockets whose only remaining references
  * are descriptors in flight inside other such sockets.
  *
  * Phases: (1) reset the per-pcb gc flags, keeping UNPGC_IGNORE_RIGHTS;
  * (2) repeatedly run unp_gc_process() over all three pcb lists until a
  * full pass marks nothing new; (3) if anything was left flagged dead,
  * collect the files of those pcbs that still look unreferenced, holding
  * each; (4) flush the receive side of each collected socket, which
  * releases the rights queued there; (5) drop the holds taken in (3).
  *
  * Neither argument is used.
  */
 static void
 unp_gc(__unused void *arg, int pending)
 {
 	struct unp_head *heads[] = { &unp_dhead, &unp_shead, &unp_sphead,
 				    NULL };
 	struct unp_head **head;
 	struct file *f, **unref;
 	struct unpcb *unp;
 	int i, total;
 
 	unp_taskcount++;
 	UNP_LINK_RLOCK();
 	/*
 	 * First clear all gc flags from previous runs, apart from
 	 * UNPGC_IGNORE_RIGHTS.
 	 */
 	for (head = heads; *head != NULL; head++)
 		LIST_FOREACH(unp, *head, unp_link)
 			unp->unp_gcflag =
 			    (unp->unp_gcflag & UNPGC_IGNORE_RIGHTS);
 
 	/*
 	 * Scan marking all reachable sockets with UNPGC_REF.  Once a socket
 	 * is reachable all of the sockets it references are reachable.
 	 * Stop the scan once we do a complete loop without discovering
 	 * a new reachable socket.
 	 */
 	do {
 		unp_unreachable = 0;
 		unp_marked = 0;
 		for (head = heads; *head != NULL; head++)
 			LIST_FOREACH(unp, *head, unp_link)
 				unp_gc_process(unp);
 	} while (unp_marked);
 	UNP_LINK_RUNLOCK();
 	if (unp_unreachable == 0)
 		return;
 
 	/*
 	 * Allocate space for a local list of dead unpcbs.
 	 */
 	unref = malloc(unp_unreachable * sizeof(struct file *),
 	    M_TEMP, M_WAITOK);
 
 	/*
 	 * Iterate looking for sockets which have been specifically marked
 	 * as unreachable and store them locally.
 	 */
 	UNP_LINK_RLOCK();
 	for (total = 0, head = heads; *head != NULL; head++)
 		LIST_FOREACH(unp, *head, unp_link)
 			if ((unp->unp_gcflag & UNPGC_DEAD) != 0) {
 				f = unp->unp_file;
 				/*
 				 * The link lock was dropped for the malloc
 				 * above, so recheck the condition that got
 				 * this pcb flagged dead.
 				 */
 				if (unp->unp_msgcount == 0 || f == NULL ||
 				    f->f_count != unp->unp_msgcount)
 					continue;
 				unref[total++] = f;
 				fhold(f);
 				KASSERT(total <= unp_unreachable,
 				    ("unp_gc: incorrect unreachable count."));
 			}
 	UNP_LINK_RUNLOCK();
 
 	/*
 	 * Now flush all sockets, free'ing rights.  This will free the
 	 * struct files associated with these sockets but leave each socket
 	 * with one remaining ref.
 	 */
 	for (i = 0; i < total; i++) {
 		struct socket *so;
 
 		so = unref[i]->f_data;
 		CURVNET_SET(so->so_vnet);
 		sorflush(so);
 		CURVNET_RESTORE();
 	}
 
 	/*
 	 * And finally release the sockets so they can be reclaimed.
 	 */
 	for (i = 0; i < total; i++)
 		fdrop(unref[i], NULL);
 	unp_recycled += total;
 	free(unref, M_TEMP);
 }
 
 static void
 unp_dispose_mbuf(struct mbuf *m)
 {
 
 	if (m)
 		unp_scan(m, unp_freerights);
 }
 
 /*
  * Flag the socket's unpcb with UNPGC_IGNORE_RIGHTS while holding the link
  * write lock; this synchronizes against unp_gc, which can trip over data
  * as we are freeing it.  Afterwards the receive buffer's mbufs are
  * disposed of, unless the socket is a listening one.
  */
 static void
 unp_dispose(struct socket *so)
 {
 	struct unpcb *pcb = sotounpcb(so);
 
 	UNP_LINK_WLOCK();
 	pcb->unp_gcflag |= UNPGC_IGNORE_RIGHTS;
 	UNP_LINK_WUNLOCK();
 
 	if (SOLISTENING(so))
 		return;
 	unp_dispose_mbuf(so->so_rcv.sb_mb);
 }
 
 /*
  * Walk a chain of packets and apply op to the payload of every
  * SOL_SOCKET/SCM_RIGHTS control message found in MT_CONTROL mbufs.
  *
  * m0: first packet; later packets are reached through m_nextpkt, and the
  *     mbufs of one packet through m_next.
  * op: callback receiving a pointer to the message data and the number of
  *     pointer-sized (struct filedescent *) entries it holds.
  */
 static void
 unp_scan(struct mbuf *m0, void (*op)(struct filedescent **, int))
 {
 	struct mbuf *m;
 	struct cmsghdr *cm;
 	void *data;
 	socklen_t clen, datalen;
 
 	while (m0 != NULL) {
 		for (m = m0; m; m = m->m_next) {
 			if (m->m_type != MT_CONTROL)
 				continue;
 
 			cm = mtod(m, struct cmsghdr *);
 			clen = m->m_len;
 
 			while (cm != NULL) {
 				/*
 				 * Stop if the header, or the length it
 				 * claims, does not fit in what is left of
 				 * this mbuf.
 				 */
 				if (sizeof(*cm) > clen || cm->cmsg_len > clen)
 					break;
 
 				data = CMSG_DATA(cm);
 				/* Payload bytes following the header. */
 				datalen = (caddr_t)cm + cm->cmsg_len
 				    - (caddr_t)data;
 
 				if (cm->cmsg_level == SOL_SOCKET &&
 				    cm->cmsg_type == SCM_RIGHTS) {
 					(*op)(data, datalen /
 					    sizeof(struct filedescent *));
 				}
 
 				/*
 				 * Advance to the next message if more than
 				 * one message's worth of space remains;
 				 * otherwise this mbuf is done.
 				 */
 				if (CMSG_SPACE(datalen) < clen) {
 					clen -= CMSG_SPACE(datalen);
 					cm = (struct cmsghdr *)
 					    ((caddr_t)cm + CMSG_SPACE(datalen));
 				} else {
 					clen = 0;
 					cm = NULL;
 				}
 			}
 		}
 		m0 = m0->m_nextpkt;
 	}
 }
 
 /*
  * A helper function called by VFS before socket-type vnode reclamation.
  * For an active vnode it clears unp_vnode pointer and decrements unp_vnode
  * use count.
  *
  * vp must be exclusively locked and of type VSOCK (both are asserted).
  */
 void
 vfs_unp_reclaim(struct vnode *vp)
 {
 	struct unpcb *unp;
 	int active;
 
 	ASSERT_VOP_ELOCKED(vp, "vfs_unp_reclaim");
 	KASSERT(vp->v_type == VSOCK,
 	    ("vfs_unp_reclaim: vp->v_type != VSOCK"));
 
 	active = 0;
 	UNP_LINK_WLOCK();
 	/* Fetch the unpcb associated with the vnode, if any. */
 	VOP_UNP_CONNECT(vp, &unp);
 	if (unp == NULL)
 		goto done;
 	UNP_PCB_LOCK(unp);
 	/* Only detach if the pcb still points back at this vnode. */
 	if (unp->unp_vnode == vp) {
 		VOP_UNP_DETACH(vp);
 		unp->unp_vnode = NULL;
 		active = 1;
 	}
 	UNP_PCB_UNLOCK(unp);
 done:
 	UNP_LINK_WUNLOCK();
 	/* The vnode reference is dropped after the locks are released. */
 	if (active)
 		vunref(vp);
 }
 
 #ifdef DDB
 /*
  * Emit `indent' single spaces on the debugger console; nothing is printed
  * for a zero or negative count.
  */
 static void
 db_print_indent(int indent)
 {
 
 	for (; indent > 0; indent--)
 		db_printf(" ");
 }
 
 /*
  * Print the names of the UNP_* bits set in unp_flags as a comma-separated
  * list, in a fixed order.  Bits other than the five tested here are not
  * reported.
  */
 static void
 db_print_unpflags(int unp_flags)
 {
 	const char *sep;
 
 	/* Empty before the first name, ", " before every later one. */
 	sep = "";
 	if ((unp_flags & UNP_HAVEPC) != 0) {
 		db_printf("%sUNP_HAVEPC", sep);
 		sep = ", ";
 	}
 	if ((unp_flags & UNP_WANTCRED) != 0) {
 		db_printf("%sUNP_WANTCRED", sep);
 		sep = ", ";
 	}
 	if ((unp_flags & UNP_CONNWAIT) != 0) {
 		db_printf("%sUNP_CONNWAIT", sep);
 		sep = ", ";
 	}
 	if ((unp_flags & UNP_CONNECTING) != 0) {
 		db_printf("%sUNP_CONNECTING", sep);
 		sep = ", ";
 	}
 	if ((unp_flags & UNP_BINDING) != 0)
 		db_printf("%sUNP_BINDING", sep);
 }
 
 /*
  * Print the contents of an xucred on two indented lines: the version, uid
  * and group count first, then the comma-separated group list.
  */
 static void
 db_print_xucred(int indent, struct xucred *xu)
 {
 	const char *sep;
 	int idx;
 
 	db_print_indent(indent);
 	db_printf("cr_version: %u   cr_uid: %u   cr_ngroups: %d\n",
 	    xu->cr_version, xu->cr_uid, xu->cr_ngroups);
 
 	db_print_indent(indent);
 	db_printf("cr_groups: ");
 	for (idx = 0, sep = ""; idx < xu->cr_ngroups; idx++, sep = ", ")
 		db_printf("%s%u", sep, xu->cr_groups[idx]);
 	db_printf("\n");
 }
 
 /*
  * Print the addresses of the unpcbs linked on uh (through unp_reflink),
  * four per line, each line preceded by `indent' spaces.
  */
 static void
 db_print_unprefs(int indent, struct unp_head *uh)
 {
 	struct unpcb *ref;
 	int column, printed;
 
 	printed = 0;
 	LIST_FOREACH(ref, uh, unp_reflink) {
 		column = printed % 4;
 		if (column == 0)
 			db_print_indent(indent);
 		db_printf("%p  ", ref);
 		if (column == 3)
 			db_printf("\n");
 		printed++;
 	}
 	/* Terminate a partially filled last line. */
 	if (printed % 4 != 0)
 		db_printf("\n");
 }
 
 /*
  * DDB "show unpcb <addr>" command: interpret addr as a struct unpcb and
  * print its fields.  The address itself is not validated here.
  */
 DB_SHOW_COMMAND(unpcb, db_show_unpcb)
 {
 	struct unpcb *unp;
 
         if (!have_addr) {
                 db_printf("usage: show unpcb <addr>\n");
                 return;
         }
         unp = (struct unpcb *)addr;
 
 	db_printf("unp_socket: %p   unp_vnode: %p\n", unp->unp_socket,
 	    unp->unp_vnode);
 
 	db_printf("unp_ino: %ju   unp_conn: %p\n", (uintmax_t)unp->unp_ino,
 	    unp->unp_conn);
 
 	db_printf("unp_refs:\n");
 	db_print_unprefs(2, &unp->unp_refs);
 
 	/* XXXRW: Would be nice to print the full address, if any. */
 	db_printf("unp_addr: %p\n", unp->unp_addr);
 
 	db_printf("unp_gencnt: %llu\n",
 	    (unsigned long long)unp->unp_gencnt);
 
 	/* Raw value in hex followed by the decoded flag names. */
 	db_printf("unp_flags: %x (", unp->unp_flags);
 	db_print_unpflags(unp->unp_flags);
 	db_printf(")\n");
 
 	db_printf("unp_peercred:\n");
 	db_print_xucred(2, &unp->unp_peercred);
 
 	db_printf("unp_refcount: %u\n", unp->unp_refcount);
 }
 #endif
Index: head/sys/netpfil/ipfw/ip_fw_sockopt.c
===================================================================
--- head/sys/netpfil/ipfw/ip_fw_sockopt.c	(revision 328237)
+++ head/sys/netpfil/ipfw/ip_fw_sockopt.c	(revision 328238)
@@ -1,4653 +1,4653 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa
  * Copyright (c) 2014 Yandex LLC
  * Copyright (c) 2014 Alexander V. Chernikov
  *
  * Supported by: Valeria Paoli
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * Control socket and rule management routines for ipfw.
  * Control is currently implemented via IP_FW3 setsockopt() code.
  */
 
 #include "opt_ipfw.h"
 #include "opt_inet.h"
 #ifndef INET
 #error IPFIREWALL requires INET.
 #endif /* INET */
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>	/* struct m_tag used by nested headers */
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/rwlock.h>
 #include <sys/rmlock.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/fnv_hash.h>
 #include <net/if.h>
 #include <net/pfil.h>
 #include <net/route.h>
 #include <net/vnet.h>
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 
 #include <netinet/in.h>
 #include <netinet/ip_var.h> /* hooks */
 #include <netinet/ip_fw.h>
 
 #include <netpfil/ipfw/ip_fw_private.h>
 #include <netpfil/ipfw/ip_fw_table.h>
 
 #ifdef MAC
 #include <security/mac/mac_framework.h>
 #endif
 
 static int ipfw_ctl(struct sockopt *sopt);
 static int check_ipfw_rule_body(ipfw_insn *cmd, int cmd_len,
     struct rule_check_info *ci);
 static int check_ipfw_rule1(struct ip_fw_rule *rule, int size,
     struct rule_check_info *ci);
 static int check_ipfw_rule0(struct ip_fw_rule0 *rule, int size,
     struct rule_check_info *ci);
 static int rewrite_rule_uidx(struct ip_fw_chain *chain,
     struct rule_check_info *ci);
 
 #define	NAMEDOBJ_HASH_SIZE	32
 
 /*
  * Bookkeeping for one collection of named objects: two arrays of list
  * heads (one keyed by name, one by value) with their sizes, a bitmask of
  * used indexes, an item count, per-set free-offset hints and the hash and
  * compare callbacks used for name lookups.
  */
 struct namedobj_instance {
 	struct namedobjects_head	*names;
 	struct namedobjects_head	*values;
 	uint32_t nn_size;		/* names hash size */
 	uint32_t nv_size;		/* number hash size */
 	u_long *idx_mask;		/* used items bitmask */
 	uint32_t max_blocks;		/* number of "long" blocks in bitmask */
 	uint32_t count;			/* number of items */
 	uint16_t free_off[IPFW_MAX_SETS];	/* first possible free offset */
 	objhash_hash_f	*hash_f;
 	objhash_cmp_f	*cmp_f;
 };
 #define	BLOCK_ITEMS	(8 * sizeof(u_long))	/* Number of items for ffsl() */
 
 static uint32_t objhash_hash_name(struct namedobj_instance *ni,
     const void *key, uint32_t kopt);
 static uint32_t objhash_hash_idx(struct namedobj_instance *ni, uint32_t val);
 static int objhash_cmp_name(struct named_object *no, const void *name,
     uint32_t set);
 
 MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chain's");
 
 static int dump_config(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
     struct sockopt_data *sd);
 static int add_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
     struct sockopt_data *sd);
 static int del_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
     struct sockopt_data *sd);
 static int clear_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
     struct sockopt_data *sd);
 static int move_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
     struct sockopt_data *sd);
 static int manage_sets(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
     struct sockopt_data *sd);
 static int dump_soptcodes(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
     struct sockopt_data *sd);
 static int dump_srvobjects(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
     struct sockopt_data *sd);
 
 /* ctl3 handler data */
 struct mtx ctl3_lock;
 #define	CTL3_LOCK_INIT()	mtx_init(&ctl3_lock, "ctl3_lock", NULL, MTX_DEF)
 #define	CTL3_LOCK_DESTROY()	mtx_destroy(&ctl3_lock)
 #define	CTL3_LOCK()		mtx_lock(&ctl3_lock)
 #define	CTL3_UNLOCK()		mtx_unlock(&ctl3_lock)
 
 static struct ipfw_sopt_handler *ctl3_handlers;
 static size_t ctl3_hsize;
 static uint64_t ctl3_refct, ctl3_gencnt;
 /*
  * Buffer sizes.  The large size is parenthesized so that the macro
  * expands to a single value next to any neighbouring operator.
  */
 #define	CTL3_SMALLBUF	4096			/* small page-size write buffer */
 #define	CTL3_LARGEBUF	(16 * 1024 * 1024)	/* handle large rulesets */
 
 static int ipfw_flush_sopt_data(struct sockopt_data *sd);
 
 static struct ipfw_sopt_handler	scodes[] = {
 	{ IP_FW_XGET,		0,	HDIR_GET,	dump_config },
 	{ IP_FW_XADD,		0,	HDIR_BOTH,	add_rules },
 	{ IP_FW_XDEL,		0,	HDIR_BOTH,	del_rules },
 	{ IP_FW_XZERO,		0,	HDIR_SET,	clear_rules },
 	{ IP_FW_XRESETLOG,	0,	HDIR_SET,	clear_rules },
 	{ IP_FW_XMOVE,		0,	HDIR_SET,	move_rules },
 	{ IP_FW_SET_SWAP,	0,	HDIR_SET,	manage_sets },
 	{ IP_FW_SET_MOVE,	0,	HDIR_SET,	manage_sets },
 	{ IP_FW_SET_ENABLE,	0,	HDIR_SET,	manage_sets },
 	{ IP_FW_DUMP_SOPTCODES,	0,	HDIR_GET,	dump_soptcodes },
 	{ IP_FW_DUMP_SRVOBJECTS,0,	HDIR_GET,	dump_srvobjects },
 };
 
 static int
 set_legacy_obj_kidx(struct ip_fw_chain *ch, struct ip_fw_rule0 *rule);
 static struct opcode_obj_rewrite *find_op_rw(ipfw_insn *cmd,
     uint16_t *puidx, uint8_t *ptype);
 static int mark_object_kidx(struct ip_fw_chain *ch, struct ip_fw *rule,
     uint32_t *bmask);
 static int ref_rule_objects(struct ip_fw_chain *ch, struct ip_fw *rule,
     struct rule_check_info *ci, struct obj_idx *oib, struct tid_info *ti);
 static int ref_opcode_object(struct ip_fw_chain *ch, ipfw_insn *cmd,
     struct tid_info *ti, struct obj_idx *pidx, int *unresolved);
 static void unref_rule_objects(struct ip_fw_chain *chain, struct ip_fw *rule);
 static void unref_oib_objects(struct ip_fw_chain *ch, ipfw_insn *cmd,
     struct obj_idx *oib, struct obj_idx *end);
 static int export_objhash_ntlv(struct namedobj_instance *ni, uint16_t kidx,
     struct sockopt_data *sd);
 
 /*
  * Opcode object rewriter variables
  */
 struct opcode_obj_rewrite *ctl3_rewriters;
 static size_t ctl3_rsize;
 
 /*
  * static variables followed by global ones
  */
 
 static VNET_DEFINE(uma_zone_t, ipfw_cntr_zone);
 #define	V_ipfw_cntr_zone		VNET(ipfw_cntr_zone)
 
 /*
  * Create the per-vnet UMA zone ("IPFW counters") from which rule counters
  * are allocated.  Items are IPFW_RULE_CNTR_SIZE bytes, pointer-aligned,
  * and the zone is created with UMA_ZONE_PCPU.
  */
 void
 ipfw_init_counters(void)
 {
 
 	V_ipfw_cntr_zone = uma_zcreate("IPFW counters",
 	    IPFW_RULE_CNTR_SIZE, NULL, NULL, NULL, NULL,
 	    UMA_ALIGN_PTR, UMA_ZONE_PCPU);
 }
 
 /*
  * Destroy the per-vnet rule counter zone created by ipfw_init_counters().
  */
 void
 ipfw_destroy_counters(void)
 {
 
 	uma_zdestroy(V_ipfw_cntr_zone);
 }
 
 /*
  * Allocate a zeroed kernel rule of rulesize bytes together with its
  * counter storage from the rule counter zone.  Both allocations use
  * M_WAITOK.  The chain argument is not used.
  */
 struct ip_fw *
 ipfw_alloc_rule(struct ip_fw_chain *chain, size_t rulesize)
 {
 	struct ip_fw *krule = malloc(rulesize, M_IPFW, M_WAITOK | M_ZERO);
 
 	krule->cntr = uma_zalloc(V_ipfw_cntr_zone, M_WAITOK | M_ZERO);
 	return (krule);
 }
 
 /*
  * Release a rule: return its counters to the counter zone, then free the
  * rule itself.
  */
 static void
 free_rule(struct ip_fw *rule)
 {
 
 	/* The counters must go first; they are reached through the rule. */
 	uma_zfree(V_ipfw_cntr_zone, rule->cntr);
 	free(rule, M_IPFW);
 }
 
 
 /*
  * Binary search over chain->map for the smallest rule whose
  * (rulenum, id) pair is >= (key, id).
  *
  * Returns the index of that rule.  When every rule compares lower the
  * index of the last rule is returned, and with an empty map the result
  * is -1.
  */
 int
 ipfw_find_rule(struct ip_fw_chain *chain, uint32_t key, uint32_t id)
 {
 	struct ip_fw *probe;
 	int lo, hi, mid;
 
 	lo = 0;
 	hi = chain->n_rules - 1;
 	while (lo < hi) {
 		mid = (lo + hi) / 2;
 		probe = chain->map[mid];
 		if (probe->rulenum < key ||
 		    (probe->rulenum == key && probe->id < id)) {
 			/* probe sorts before the target: look to the right */
 			lo = mid + 1;
 		} else {
 			/* probe is a candidate: keep it in the window */
 			hi = mid;
 		}
 	}
 	return hi;
 }
 
 /*
  * Builds skipto cache on rule set @map.
  *
  * Fills chain->idxmap_back so that slot i holds the index in @map of the
  * first rule whose number is >= i, for every i in [0, 65535].  Does
  * nothing if the backup array has not been allocated.  The caller must
  * hold IPFW_UH_WLOCK.
  */
 static void
 update_skipto_cache(struct ip_fw_chain *chain, struct ip_fw **map)
 {
 	int *smap, rulenum;
 	int i, mi;
 
 	IPFW_UH_WLOCK_ASSERT(chain);
 
 	mi = 0;
 	/* NOTE(review): map[0] is read before the smap NULL check below. */
 	rulenum = map[mi]->rulenum;
 	smap = chain->idxmap_back;
 
 	if (smap == NULL)
 		return;
 
 	for (i = 0; i < 65536; i++) {
 		smap[i] = mi;
 		/* Use the same rule index until i < rulenum */
 		if (i != rulenum || i == 65535)
 			continue;
 		/*
 		 * Find next rule with num > i.
 		 * Presumably relies on @map ending with a rule numbered
 		 * 65535 so that this walk terminates -- TODO confirm.
 		 */
 		rulenum = map[++mi]->rulenum;
 		while (rulenum == i)
 			rulenum = map[++mi]->rulenum;
 	}
 }
 
 /*
  * Exchange the active skipto index (chain->idxmap) with the prepared
  * backup one (chain->idxmap_back).  Both IPFW_UH_WLOCK and IPFW_WLOCK
  * must be held, which is asserted.
  */
 static void
 swap_skipto_cache(struct ip_fw_chain *chain)
 {
 	int *prev_active;
 
 	IPFW_UH_WLOCK_ASSERT(chain);
 	IPFW_WLOCK_ASSERT(chain);
 
 	prev_active = chain->idxmap;
 	chain->idxmap = chain->idxmap_back;
 	chain->idxmap_back = prev_active;
 }
 
 /*
  * Allocate and initialize skipto cache.
  */
 void
 ipfw_init_skipto_cache(struct ip_fw_chain *chain)
 {
 	int *idxmap, *idxmap_back;
 
 	idxmap = malloc(65536 * sizeof(uint32_t *), M_IPFW,
 	    M_WAITOK | M_ZERO);
 	idxmap_back = malloc(65536 * sizeof(uint32_t *), M_IPFW,
 	    M_WAITOK | M_ZERO);
 
 	/*
 	 * Note we may be called at any time after initialization,
 	 * for example, on first skipto rule, so we need to
 	 * provide valid chain->idxmap on return
 	 */
 
 	IPFW_UH_WLOCK(chain);
 	if (chain->idxmap != NULL) {
 		IPFW_UH_WUNLOCK(chain);
 		free(idxmap, M_IPFW);
 		free(idxmap_back, M_IPFW);
 		return;
 	}
 
 	/* Set backup pointer first to permit building cache */
 	chain->idxmap_back = idxmap_back;
 	update_skipto_cache(chain, chain->map);
 	IPFW_WLOCK(chain);
 	/* It is now safe to set chain->idxmap ptr */
 	chain->idxmap = idxmap;
 	swap_skipto_cache(chain);
 	IPFW_WUNLOCK(chain);
 	IPFW_UH_WUNLOCK(chain);
 }
 
 /*
  * Destroys skipto cache.
  *
  * Frees both the active and the backup index arrays.  Each pointer is
  * released on its own: kernel free(9) ignores NULL, so neither release
  * depends on the state of the other pointer.
  */
 void
 ipfw_destroy_skipto_cache(struct ip_fw_chain *chain)
 {
 
 	free(chain->idxmap, M_IPFW);
 	free(chain->idxmap_back, M_IPFW);
 }
 
 
 /*
  * allocate a new map, returns the chain locked. extra is the number
  * of entries to add or delete.
  *
  * The map is zeroed and sized for chain->n_rules + extra pointers.
  * locked: non-zero when the caller already holds IPFW_UH_WLOCK; the
  *         allocation then uses M_NOWAIT.  When zero the allocation uses
  *         M_WAITOK and IPFW_UH_WLOCK is taken here.
  *
  * Returns the map with IPFW_UH_WLOCK held, or NULL if the allocation
  * failed (the lock state is then whatever it was on entry).
  */
 static struct ip_fw **
 get_map(struct ip_fw_chain *chain, int extra, int locked)
 {
 
 	for (;;) {
 		struct ip_fw **map;
-		int i, mflags;
+		u_int i, mflags;
 
 		mflags = M_ZERO | ((locked != 0) ? M_NOWAIT : M_WAITOK);
 
 		i = chain->n_rules + extra;
 		map = malloc(i * sizeof(struct ip_fw *), M_IPFW, mflags);
 		if (map == NULL) {
 			printf("%s: cannot allocate map\n", __FUNCTION__);
 			return NULL;
 		}
 		if (!locked)
 			IPFW_UH_WLOCK(chain);
 		/*
 		 * n_rules is re-read now that the lock is held; the map is
 		 * only usable if it is still large enough.
 		 */
 		if (i >= chain->n_rules + extra) /* good */
 			return map;
 		/* otherwise we lost the race, free and retry */
 		if (!locked)
 			IPFW_UH_WUNLOCK(chain);
 		free(map, M_IPFW);
 	}
 }
 
 /*
  * swap the maps. It is supposed to be called with IPFW_UH_WLOCK
  *
  * Under IPFW_WLOCK: bumps chain->id, installs new_map with new_len rules
  * and swaps the skipto caches.  Returns the previous map; this function
  * does not free it.
  */
 static struct ip_fw **
 swap_map(struct ip_fw_chain *chain, struct ip_fw **new_map, int new_len)
 {
 	struct ip_fw **old_map;
 
 	IPFW_WLOCK(chain);
 	chain->id++;
 	chain->n_rules = new_len;
 	old_map = chain->map;
 	chain->map = new_map;
 	/* The backup skipto index becomes the active one. */
 	swap_skipto_cache(chain);
 	IPFW_WUNLOCK(chain);
 	return old_map;
 }
 
 
 static void
 export_cntr1_base(struct ip_fw *krule, struct ip_fw_bcounter *cntr)
 {
 	struct timeval boottime;
 
 	cntr->size = sizeof(*cntr);
 
 	if (krule->cntr != NULL) {
 		cntr->pcnt = counter_u64_fetch(krule->cntr);
 		cntr->bcnt = counter_u64_fetch(krule->cntr + 1);
 		cntr->timestamp = krule->timestamp;
 	}
 	if (cntr->timestamp > 0) {
 		getboottime(&boottime);
 		cntr->timestamp += boottime.tv_sec;
 	}
 }
 
 static void
 export_cntr0_base(struct ip_fw *krule, struct ip_fw_bcounter0 *cntr)
 {
 	struct timeval boottime;
 
 	if (krule->cntr != NULL) {
 		cntr->pcnt = counter_u64_fetch(krule->cntr);
 		cntr->bcnt = counter_u64_fetch(krule->cntr + 1);
 		cntr->timestamp = krule->timestamp;
 	}
 	if (cntr->timestamp > 0) {
 		getboottime(&boottime);
 		cntr->timestamp += boottime.tv_sec;
 	}
 }
 
 /*
  * Import a rule given in the current (v1) userland layout.
  *
  * Source is ci->urule, destination is ci->krule, which is assumed to be
  * zeroed.  Header fields and the opcode array are copied, and the offset
  * of the rule number within the userland layout is stored in
  * ci->urule_numoff.
  */
 static void
 import_rule1(struct rule_check_info *ci)
 {
 	struct ip_fw_rule *src = (struct ip_fw_rule *)ci->urule;
 	struct ip_fw *dst = (struct ip_fw *)ci->krule;
 
 	/* Header fields */
 	dst->act_ofs = src->act_ofs;
 	dst->cmd_len = src->cmd_len;
 	dst->rulenum = src->rulenum;
 	dst->set = src->set;
 	dst->flags = src->flags;
 
 	/* Where the rule number lives in the userland structure */
 	ci->urule_numoff = offsetof(struct ip_fw_rule, rulenum);
 
 	/* Opcodes: cmd_len counts 32-bit words */
 	memcpy(dst->cmd, src->cmd, dst->cmd_len * sizeof(uint32_t));
 }
 
 /*
  * Export kernel rule krule in the current (v1) format.
  *
  * Output layout at @data (assumed zeroed), @len bytes in total:
  * [ ipfw_obj_tlv(IPFW_TLV_RULE_ENT)
  *     [ ip_fw_rule ] OR
  *     [ ip_fw_bcounter ip_fw_rule ] (when rcntrs is non-zero).
  * ]
  */
 static void
 export_rule1(struct ip_fw *krule, caddr_t data, int len, int rcntrs)
 {
 	ipfw_obj_tlv *hdr = (ipfw_obj_tlv *)data;
 	struct ip_fw_rule *out;
 
 	/* TLV header */
 	hdr->type = IPFW_TLV_RULE_ENT;
 	hdr->length = len;
 
 	if (rcntrs == 0) {
 		/* The rule follows the TLV header directly */
 		out = (struct ip_fw_rule *)(hdr + 1);
 	} else {
 		/* A counter block sits between the header and the rule */
 		struct ip_fw_bcounter *counters;
 
 		counters = (struct ip_fw_bcounter *)(hdr + 1);
 		out = (struct ip_fw_rule *)(counters + 1);
 		export_cntr1_base(krule, counters);
 	}
 
 	/* Rule header */
 	out->act_ofs = krule->act_ofs;
 	out->cmd_len = krule->cmd_len;
 	out->rulenum = krule->rulenum;
 	out->set = krule->set;
 	out->flags = krule->flags;
 	out->id = krule->id;
 
 	/* Opcodes: cmd_len counts 32-bit words */
 	memcpy(out->cmd, krule->cmd, krule->cmd_len * sizeof(uint32_t));
 }
 
 
 /*
  * Copies rule @urule from FreeBSD8 userland format (v0)
  * to kernel @krule.
  * Assume @krule is zeroed.
  *
  * Source and destination are taken from ci->urule and ci->krule.  Bit 0
  * of the v0 _pad field is mapped to IPFW_RULE_NOOPT, and the offset of the
  * rule number in the v0 layout is stored in ci->urule_numoff.  After the
  * copy the opcodes are rewritten in place (see below).
  */
 static void
 import_rule0(struct rule_check_info *ci)
 {
 	struct ip_fw_rule0 *urule;
 	struct ip_fw *krule;
 	int cmdlen, l;
 	ipfw_insn *cmd;
 	ipfw_insn_limit *lcmd;
 	ipfw_insn_if *cmdif;
 
 	urule = (struct ip_fw_rule0 *)ci->urule;
 	krule = (struct ip_fw *)ci->krule;
 
 	/* copy header */
 	krule->act_ofs = urule->act_ofs;
 	krule->cmd_len = urule->cmd_len;
 	krule->rulenum = urule->rulenum;
 	krule->set = urule->set;
 	if ((urule->_pad & 1) != 0)
 		krule->flags |= IPFW_RULE_NOOPT;
 
 	/* Save rulenum offset */
 	ci->urule_numoff = offsetof(struct ip_fw_rule0, rulenum);
 
 	/* Copy opcodes */
 	memcpy(krule->cmd, urule->cmd, krule->cmd_len * sizeof(uint32_t));
 
 	/*
 	 * Alter opcodes:
 	 * 1) convert tablearg value from 65535 to 0
 	 * 2) Add high bit to O_SETFIB/O_SETDSCP values (to make room
 	 *    for targ).
 	 * 3) convert table number in iface opcodes to u16
 	 * 4) convert old `nat global` into new 65535
 	 */
 	l = krule->cmd_len;
 	cmd = krule->cmd;
 	cmdlen = 0;
 
 	/* Walk the instruction stream; F_LEN() gives each opcode's length. */
 	for ( ;	l > 0 ; l -= cmdlen, cmd += cmdlen) {
 		cmdlen = F_LEN(cmd);
 
 		switch (cmd->opcode) {
 		/* Opcodes supporting tablearg */
 		case O_TAG:
 		case O_TAGGED:
 		case O_PIPE:
 		case O_QUEUE:
 		case O_DIVERT:
 		case O_TEE:
 		case O_SKIPTO:
 		case O_CALLRETURN:
 		case O_NETGRAPH:
 		case O_NGTEE:
 		case O_NAT:
 			/*
 			 * NOTE(review): the arg1 == 0 conversion below is
 			 * shared by every opcode in this group, not only
 			 * O_NAT -- confirm that is intended.
 			 */
 			if (cmd->arg1 == IP_FW_TABLEARG)
 				cmd->arg1 = IP_FW_TARG;
 			else if (cmd->arg1 == 0)
 				cmd->arg1 = IP_FW_NAT44_GLOBAL;
 			break;
 		case O_SETFIB:
 		case O_SETDSCP:
 			if (cmd->arg1 == IP_FW_TABLEARG)
 				cmd->arg1 = IP_FW_TARG;
 			else
 				cmd->arg1 |= 0x8000;
 			break;
 		case O_LIMIT:
 			lcmd = (ipfw_insn_limit *)cmd;
 			if (lcmd->conn_limit == IP_FW_TABLEARG)
 				lcmd->conn_limit = IP_FW_TARG;
 			break;
 		/* Interface tables */
 		case O_XMIT:
 		case O_RECV:
 		case O_VIA:
 			/* Interface table, possibly */
 			cmdif = (ipfw_insn_if *)cmd;
 			/* Only names starting with '\1' are converted. */
 			if (cmdif->name[0] != '\1')
 				break;
 
 			cmdif->p.kidx = (uint16_t)cmdif->p.glob;
 			break;
 		}
 	}
 }
 
 /*
  * Copies rule @krule from kernel to FreeBSD8 userland format (v0)
  *
  * @urule is cleared for @len bytes first.  IPFW_RULE_NOOPT is exported as
  * bit 0 of _pad, counters are written starting at urule->pcnt, and the
  * copied opcodes are rewritten in place (see below).
  */
 static void
 export_rule0(struct ip_fw *krule, struct ip_fw_rule0 *urule, int len)
 {
 	int cmdlen, l;
 	ipfw_insn *cmd;
 	ipfw_insn_limit *lcmd;
 	ipfw_insn_if *cmdif;
 
 	/* copy header */
 	memset(urule, 0, len);
 	urule->act_ofs = krule->act_ofs;
 	urule->cmd_len = krule->cmd_len;
 	urule->rulenum = krule->rulenum;
 	urule->set = krule->set;
 	if ((krule->flags & IPFW_RULE_NOOPT) != 0)
 		urule->_pad |= 1;
 
 	/* Copy opcodes */
 	memcpy(urule->cmd, krule->cmd, krule->cmd_len * sizeof(uint32_t));
 
 	/* Export counters */
 	export_cntr0_base(krule, (struct ip_fw_bcounter0 *)&urule->pcnt);
 
 	/*
 	 * Alter opcodes:
 	 * 1) convert tablearg value from 0 to 65535
 	 * 2) Remove highest bit from O_SETFIB/O_SETDSCP values.
 	 * 3) convert table number in iface opcodes to int
 	 */
 	l = urule->cmd_len;
 	cmd = urule->cmd;
 	cmdlen = 0;
 
 	/* Only the userland copy is modified; krule->cmd is left intact. */
 	for ( ;	l > 0 ; l -= cmdlen, cmd += cmdlen) {
 		cmdlen = F_LEN(cmd);
 
 		switch (cmd->opcode) {
 		/* Opcodes supporting tablearg */
 		case O_TAG:
 		case O_TAGGED:
 		case O_PIPE:
 		case O_QUEUE:
 		case O_DIVERT:
 		case O_TEE:
 		case O_SKIPTO:
 		case O_CALLRETURN:
 		case O_NETGRAPH:
 		case O_NGTEE:
 		case O_NAT:
 			if (cmd->arg1 == IP_FW_TARG)
 				cmd->arg1 = IP_FW_TABLEARG;
 			else if (cmd->arg1 == IP_FW_NAT44_GLOBAL)
 				cmd->arg1 = 0;
 			break;
 		case O_SETFIB:
 		case O_SETDSCP:
 			if (cmd->arg1 == IP_FW_TARG)
 				cmd->arg1 = IP_FW_TABLEARG;
 			else
 				cmd->arg1 &= ~0x8000;
 			break;
 		case O_LIMIT:
 			lcmd = (ipfw_insn_limit *)cmd;
 			if (lcmd->conn_limit == IP_FW_TARG)
 				lcmd->conn_limit = IP_FW_TABLEARG;
 			break;
 		/* Interface tables */
 		case O_XMIT:
 		case O_RECV:
 		case O_VIA:
 			/* Interface table, possibly */
 			cmdif = (ipfw_insn_if *)cmd;
 			/* Only names starting with '\1' are converted. */
 			if (cmdif->name[0] != '\1')
 				break;
 
 			cmdif->p.glob = cmdif->p.kidx;
 			break;
 		}
 	}
 }
 
 /*
  * Add new rule(s) to the list possibly creating rule number for each.
  * Update the rule_number in the input struct so the caller knows it as well.
  * Must be called without IPFW_UH held
  *
  * rci/count: array of checked rules.  Object references are resolved for
  * all of them, but only the first rule is inserted into the map (see the
  * FIXME below).
  *
  * Returns 0 on success, the error from rewrite_rule_uidx() if an object
  * rewrite fails (references taken so far are dropped), or ENOSPC if no
  * new map could be obtained.
  */
 static int
 commit_rules(struct ip_fw_chain *chain, struct rule_check_info *rci, int count)
 {
 	int error, i, insert_before, tcount;
 	uint16_t rulenum, *pnum;
 	struct rule_check_info *ci;
 	struct ip_fw *krule;
 	struct ip_fw **map;	/* the new array of pointers */
 
 	/* Check if we need to do table/obj index remap */
 	tcount = 0;
 	for (ci = rci, i = 0; i < count; ci++, i++) {
 		if (ci->object_opcodes == 0)
 			continue;
 
 		/*
 		 * Rule has some object opcodes.
 		 * We need to find (and create non-existing)
 		 * kernel objects, and reference existing ones.
 		 */
 		error = rewrite_rule_uidx(chain, ci);
 		if (error != 0) {
 
 			/*
 			 * rewrite failed, state for current rule
 			 * has been reverted. Check if we need to
 			 * revert more.
 			 */
 			if (tcount > 0) {
 
 				/*
 				 * We have some more table rules
 				 * we need to rollback.
 				 */
 
 				IPFW_UH_WLOCK(chain);
 				/* Walk back over the rules already done. */
 				while (ci != rci) {
 					ci--;
 					if (ci->object_opcodes == 0)
 						continue;
 					unref_rule_objects(chain,ci->krule);
 
 				}
 				IPFW_UH_WUNLOCK(chain);
 
 			}
 
 			return (error);
 		}
 
 		tcount++;
 	}
 
 	/* get_map returns with IPFW_UH_WLOCK if successful */
 	map = get_map(chain, count, 0 /* not locked */);
 	if (map == NULL) {
 		if (tcount > 0) {
 			/* Unbind tables */
 			IPFW_UH_WLOCK(chain);
 			for (ci = rci, i = 0; i < count; ci++, i++) {
 				if (ci->object_opcodes == 0)
 					continue;
 
 				unref_rule_objects(chain, ci->krule);
 			}
 			IPFW_UH_WUNLOCK(chain);
 		}
 
 		return (ENOSPC);
 	}
 
 	/* Clamp the auto-increment step to [1, 1000]. */
 	if (V_autoinc_step < 1)
 		V_autoinc_step = 1;
 	else if (V_autoinc_step > 1000)
 		V_autoinc_step = 1000;
 
 	/* FIXME: Handle count > 1 */
 	ci = rci;
 	krule = ci->krule;
 	rulenum = krule->rulenum;
 
 	/*
 	 * find the insertion point, we will insert before.
 	 * A numbered rule goes after existing rules with the same number;
 	 * an unnumbered one goes right before the default rule.
 	 */
 	insert_before = rulenum ? rulenum + 1 : IPFW_DEFAULT_RULE;
 	i = ipfw_find_rule(chain, insert_before, 0);
 	/* duplicate first part */
 	if (i > 0)
 		bcopy(chain->map, map, i * sizeof(struct ip_fw *));
 	map[i] = krule;
 	/* duplicate remaining part, we always have the default rule */
 	bcopy(chain->map + i, map + i + 1,
 		sizeof(struct ip_fw *) *(chain->n_rules - i));
 	if (rulenum == 0) {
 		/* Compute rule number and write it back */
 		rulenum = i > 0 ? map[i-1]->rulenum : 0;
 		/* Step past the previous rule unless that would reach the default. */
 		if (rulenum < IPFW_DEFAULT_RULE - V_autoinc_step)
 			rulenum += V_autoinc_step;
 		krule->rulenum = rulenum;
 		/* Save number to userland rule */
 		pnum = (uint16_t *)((caddr_t)ci->urule + ci->urule_numoff);
 		*pnum = rulenum;
 	}
 
 	krule->id = chain->id + 1;
 	update_skipto_cache(chain, map);
 	/* swap_map() hands back the old map, which is freed below. */
 	map = swap_map(chain, map, chain->n_rules + 1);
 	chain->static_len += RULEUSIZE0(krule);
 	IPFW_UH_WUNLOCK(chain);
 	if (map)
 		free(map, M_IPFW);
 	return (0);
 }
 
 /*
  * Append @rule at the end of the chain as rule number IPFW_DEFAULT_RULE
  * in set RESVD_SET.
  *
  * locked is passed through to get_map(): non-zero means the caller
  * already holds IPFW_UH_WLOCK.  On success the lock is released here in
  * either case.
  *
  * Returns 0 on success or ENOMEM if the new map cannot be allocated.
  */
 int
 ipfw_add_protected_rule(struct ip_fw_chain *chain, struct ip_fw *rule,
     int locked)
 {
 	struct ip_fw **map;
 
 	map = get_map(chain, 1, locked);
 	if (map == NULL)
 		return (ENOMEM);
 	if (chain->n_rules > 0)
 		bcopy(chain->map, map,
 		    chain->n_rules * sizeof(struct ip_fw *));
 	map[chain->n_rules] = rule;
 	rule->rulenum = IPFW_DEFAULT_RULE;
 	rule->set = RESVD_SET;
 	rule->id = chain->id + 1;
 	/* We add rule in the end of chain, no need to update skipto cache */
 	map = swap_map(chain, map, chain->n_rules + 1);
 	chain->static_len += RULEUSIZE0(rule);
 	IPFW_UH_WUNLOCK(chain);
 	/* map now refers to the previous map returned by swap_map(). */
 	free(map, M_IPFW);
 	return (0);
 }
 
 /*
  * Adds @rule to the list of rules to reap
  *
  * The rule's object references are dropped first.  The reap list is
  * singly linked through the first pointer-sized bytes of each rule, which
  * are overwritten here with the previous list head.  The caller must hold
  * IPFW_UH_WLOCK.
  */
 void
 ipfw_reap_add(struct ip_fw_chain *chain, struct ip_fw **head,
     struct ip_fw *rule)
 {
 
 	IPFW_UH_WLOCK_ASSERT(chain);
 
 	/* Unlink rule from everywhere */
 	unref_rule_objects(chain, rule);
 
 	/* Push the rule on the front of the list. */
 	*((struct ip_fw **)rule) = *head;
 	*head = rule;
 }
 
 /*
  * Free every rule on a reap list.  The list is linked through the first
  * pointer-sized bytes of each rule, so the successor is read before the
  * rule is released.  A NULL list is a no-op.
  */
 void
 ipfw_reap_rules(struct ip_fw *head)
 {
 	struct ip_fw *next;
 
 	for (; head != NULL; head = next) {
 		next = *((struct ip_fw **)head);
 		free_rule(head);
 	}
 }
 
 /*
  * Decide whether @rule is selected by range descriptor @rt.
  *
  * A rule is NOT selected when any of the following holds:
  *  - it is the default rule (IPFW_DEFAULT_RULE) and IPFW_RCFLAG_DEFAULT
  *    is not set in rt->flags;
  *  - IPFW_RCFLAG_ALL is set and the rule lives in RESVD_SET;
  *  - IPFW_RCFLAG_SET is set and the rule's set differs from rt->set;
  *  - IPFW_RCFLAG_RANGE is set and the rule number lies outside
  *    [rt->start_rule, rt->end_rule].
  *
  * Returns 1 if the rule is selected, 0 otherwise.
  */
 int
 ipfw_match_range(struct ip_fw *rule, ipfw_range_tlv *rt)
 {
 
 	/* The default rule needs an explicit opt-in */
 	if ((rt->flags & IPFW_RCFLAG_DEFAULT) == 0 &&
 	    rule->rulenum == IPFW_DEFAULT_RULE)
 		return (0);
 
 	/* Requests flagged ALL leave the reserved set alone */
 	if (rule->set == RESVD_SET && (rt->flags & IPFW_RCFLAG_ALL) != 0)
 		return (0);
 
 	/* Set filter */
 	if ((rt->flags & IPFW_RCFLAG_SET) != 0) {
 		if (rule->set != rt->set)
 			return (0);
 	}
 
 	/* Rule number filter, both bounds inclusive */
 	if ((rt->flags & IPFW_RCFLAG_RANGE) != 0) {
 		if (rule->rulenum < rt->start_rule)
 			return (0);
 		if (rule->rulenum > rt->end_rule)
 			return (0);
 	}
 
 	return (1);
 }
 
 /* Argument bundle handed to the *_sets_cb() callbacks below. */
 struct manage_sets_args {
 	uint16_t	set;		/* compared after a cast to uint8_t */
 	uint8_t		new_set;
 };
 
 static int
 swap_sets_cb(struct namedobj_instance *ni, struct named_object *no,
     void *arg)
 {
 	struct manage_sets_args *args;
 
 	args = (struct manage_sets_args *)arg;
 	if (no->set == (uint8_t)args->set)
 		no->set = args->new_set;
 	else if (no->set == args->new_set)
 		no->set = (uint8_t)args->set;
 	return (0);
 }
 
 static int
 move_sets_cb(struct namedobj_instance *ni, struct named_object *no,
     void *arg)
 {
 	struct manage_sets_args *args;
 
 	args = (struct manage_sets_args *)arg;
 	if (no->set == (uint8_t)args->set)
 		no->set = args->new_set;
 	return (0);
 }
 
 static int
 test_sets_cb(struct namedobj_instance *ni, struct named_object *no,
     void *arg)
 {
 	struct manage_sets_args *args;
 
 	args = (struct manage_sets_args *)arg;
 	if (no->set != (uint8_t)args->set)
 		return (0);
 	if (ipfw_objhash_lookup_name_type(ni, args->new_set,
 	    no->etlv, no->name) != NULL)
 		return (EEXIST);
 	return (0);
 }
 
 /*
  * Generic function to handle moving and swapping sets.
  *
  * For the *_ALL commands every object of @type in @ni is visited with the
  * matching callback, and @set / @new_set are set numbers.  For the *_ONE
  * commands @set carries a kernel index (kidx) identifying one object.
  *
  * Returns 0 on success, EBUSY or EEXIST from the TEST commands, the
  * result of ipfw_objhash_foreach_type() for the *_ALL commands, or EINVAL
  * for an unknown @cmd.
  */
 int
 ipfw_obj_manage_sets(struct namedobj_instance *ni, uint16_t type,
     uint16_t set, uint8_t new_set, enum ipfw_sets_cmd cmd)
 {
 	struct manage_sets_args args;
 	struct named_object *no;
 
 	args.set = set;
 	args.new_set = new_set;
 	switch (cmd) {
 	case SWAP_ALL:
 		return (ipfw_objhash_foreach_type(ni, swap_sets_cb,
 		    &args, type));
 	case TEST_ALL:
 		return (ipfw_objhash_foreach_type(ni, test_sets_cb,
 		    &args, type));
 	case MOVE_ALL:
 		return (ipfw_objhash_foreach_type(ni, move_sets_cb,
 		    &args, type));
 	case COUNT_ONE:
 		/*
 		 * @set used to pass kidx.
 		 * When @new_set is zero - reset object counter,
 		 * otherwise increment it.
 		 */
 		/* assumes the kidx lookup cannot fail -- TODO confirm */
 		no = ipfw_objhash_lookup_kidx(ni, set);
 		if (new_set != 0)
 			no->ocnt++;
 		else
 			no->ocnt = 0;
 		return (0);
 	case TEST_ONE:
 		/* @set used to pass kidx */
 		no = ipfw_objhash_lookup_kidx(ni, set);
 		/*
 		 * First check number of references:
 		 * when it differs, this mean other rules are holding
 		 * reference to given object, so it is not possible to
 		 * change its set. Note that refcnt may account references
 		 * to some going-to-be-added rules. Since we don't know
 		 * their numbers (and even if they will be added) it is
 		 * perfectly OK to return error here.
 		 */
 		if (no->ocnt != no->refcnt)
 			return (EBUSY);
 		if (ipfw_objhash_lookup_name_type(ni, new_set, type,
 		    no->name) != NULL)
 			return (EEXIST);
 		return (0);
 	case MOVE_ONE:
 		/* @set used to pass kidx */
 		no = ipfw_objhash_lookup_kidx(ni, set);
 		no->set = new_set;
 		return (0);
 	}
 	return (EINVAL);
 }
 
 /*
  * Delete rules matching range @rt.
  * Saves number of deleted rules in @ndel.
  *
  * A new map without the matching rules is built and swapped in while
  * IPFW_UH_WLOCK is held; the removed rules are freed after the lock has
  * been released.  rt->end_rule is clamped below IPFW_DEFAULT_RULE when a
  * range is given.
  *
  * Returns 0 on success, ENOMEM if the new map cannot be allocated (in
  * which case @ndel is not written).
  */
 static int
 delete_range(struct ip_fw_chain *chain, ipfw_range_tlv *rt, int *ndel)
 {
 	struct ip_fw *reap, *rule, **map;
 	int end, start;
 	int i, n, ndyn, ofs;
 
 	reap = NULL;
 	IPFW_UH_WLOCK(chain);	/* arbitrate writers */
 
 	/*
 	 * Stage 1: Determine range to inspect.
 	 * Range is half-inclusive, e.g [start, end).
 	 */
 	start = 0;
 	/* By default everything but the last rule is inspected. */
 	end = chain->n_rules - 1;
 
 	if ((rt->flags & IPFW_RCFLAG_RANGE) != 0) {
 		start = ipfw_find_rule(chain, rt->start_rule, 0);
 
 		if (rt->end_rule >= IPFW_DEFAULT_RULE)
 			rt->end_rule = IPFW_DEFAULT_RULE - 1;
 		end = ipfw_find_rule(chain, rt->end_rule, UINT32_MAX);
 	}
 
 	/* Allocate new map of the same size */
 	map = get_map(chain, 0, 1 /* locked */);
 	if (map == NULL) {
 		IPFW_UH_WUNLOCK(chain);
 		return (ENOMEM);
 	}
 
 	n = 0;		/* rules being deleted */
 	ndyn = 0;	/* of those, rules for which ipfw_is_dyn_rule() is true */
 	ofs = start;	/* next free slot in the new map */
 	/* 1. bcopy the initial part of the map */
 	if (start > 0)
 		bcopy(chain->map, map, start * sizeof(struct ip_fw *));
 	/* 2. copy active rules between start and end */
 	for (i = start; i < end; i++) {
 		rule = chain->map[i];
 		if (ipfw_match_range(rule, rt) == 0) {
 			map[ofs++] = rule;
 			continue;
 		}
 
 		n++;
 		if (ipfw_is_dyn_rule(rule) != 0)
 			ndyn++;
 	}
 	/* 3. copy the final part of the map */
 	bcopy(chain->map + end, map + ofs,
 		(chain->n_rules - end) * sizeof(struct ip_fw *));
 	/* 4. recalculate skipto cache */
 	update_skipto_cache(chain, map);
 	/* 5. swap the maps (under UH_WLOCK + WHLOCK) */
 	map = swap_map(chain, map, chain->n_rules - n);
 	/* 6. Remove all dynamic states originated by deleted rules */
 	if (ndyn > 0)
 		ipfw_expire_dyn_rules(chain, rt);
 	/* 7. now remove the rules deleted from the old map */
 	for (i = start; i < end; i++) {
 		/* map is the old map here, so indexes still line up. */
 		rule = map[i];
 		if (ipfw_match_range(rule, rt) == 0)
 			continue;
 		chain->static_len -= RULEUSIZE0(rule);
 		ipfw_reap_add(chain, &reap, rule);
 	}
 	IPFW_UH_WUNLOCK(chain);
 
 	/* Free the removed rules and the old map outside the lock. */
 	ipfw_reap_rules(reap);
 	if (map != NULL)
 		free(map, M_IPFW);
 	*ndel = n;
 	return (0);
 }
 
 /*
  * Moves named objects referenced by the rules matching @rt into set
  * @rt->new_set, after verifying that this is permitted.
  *
  * Must be called with IPFW_UH_WLOCK held (asserted below).
  * All passes walk every map entry except the last one and ignore rules
  * that are already in @rt->new_set.
  *
  * Returns 0 on success or when the matching rules reference no objects
  * that support sets; otherwise returns the first non-zero value reported
  * by a TEST_ONE check, and no object is moved.
  */
 static int
 move_objects(struct ip_fw_chain *ch, ipfw_range_tlv *rt)
 {
 	struct opcode_obj_rewrite *rw;
 	struct ip_fw *rule;
 	ipfw_insn *cmd;
 	int cmdlen, i, l, c;
 	uint16_t kidx;
 
 	IPFW_UH_WLOCK_ASSERT(ch);
 
 	/* Stage 1: count number of references by given rules */
 	for (c = 0, i = 0; i < ch->n_rules - 1; i++) {
 		rule = ch->map[i];
 		if (ipfw_match_range(rule, rt) == 0)
 			continue;
 		if (rule->set == rt->new_set) /* nothing to do */
 			continue;
 		/* Search opcodes with named objects */
 		for (l = rule->cmd_len, cmdlen = 0, cmd = rule->cmd;
 		    l > 0; l -= cmdlen, cmd += cmdlen) {
 			cmdlen = F_LEN(cmd);
 			rw = find_op_rw(cmd, &kidx, NULL);
 			if (rw == NULL || rw->manage_sets == NULL)
 				continue;
 			/*
 			 * When manage_sets() returns non-zero value to
 			 * COUNT_ONE command, consider this as an object
 			 * doesn't support sets (e.g. disabled with sysctl).
 			 * So, skip checks for this object.
 			 */
 			if (rw->manage_sets(ch, kidx, 1, COUNT_ONE) != 0)
 				continue;
 			c++;
 		}
 	}
 	if (c == 0) /* No objects found */
 		return (0);
 	/*
 	 * Stage 2: verify "ownership"
 	 * @c is reused as the error code; both loops stop at the first
 	 * failing TEST_ONE check.
 	 */
 	for (c = 0, i = 0; (i < ch->n_rules - 1) && c == 0; i++) {
 		rule = ch->map[i];
 		if (ipfw_match_range(rule, rt) == 0)
 			continue;
 		if (rule->set == rt->new_set) /* nothing to do */
 			continue;
 		/* Search opcodes with named objects */
 		for (l = rule->cmd_len, cmdlen = 0, cmd = rule->cmd;
 		    l > 0 && c == 0; l -= cmdlen, cmd += cmdlen) {
 			cmdlen = F_LEN(cmd);
 			rw = find_op_rw(cmd, &kidx, NULL);
 			if (rw == NULL || rw->manage_sets == NULL)
 				continue;
 			/* Test for ownership and conflicting names */
 			c = rw->manage_sets(ch, kidx,
 			    (uint8_t)rt->new_set, TEST_ONE);
 		}
 	}
 	/*
 	 * Stage 3: change set and cleanup
 	 * The per-object counters set up in stage 1 are always reset here;
 	 * the objects are moved only if stage 2 succeeded (c == 0).
 	 */
 	for (i = 0; i < ch->n_rules - 1; i++) {
 		rule = ch->map[i];
 		if (ipfw_match_range(rule, rt) == 0)
 			continue;
 		if (rule->set == rt->new_set) /* nothing to do */
 			continue;
 		/* Search opcodes with named objects */
 		for (l = rule->cmd_len, cmdlen = 0, cmd = rule->cmd;
 		    l > 0; l -= cmdlen, cmd += cmdlen) {
 			cmdlen = F_LEN(cmd);
 			rw = find_op_rw(cmd, &kidx, NULL);
 			if (rw == NULL || rw->manage_sets == NULL)
 				continue;
 			/* cleanup object counter */
 			rw->manage_sets(ch, kidx,
 			    0 /* reset counter */, COUNT_ONE);
 			if (c != 0)
 				continue;
 			/* change set */
 			rw->manage_sets(ch, kidx,
 			    (uint8_t)rt->new_set, MOVE_ONE);
 		}
 	}
 	return (c);
 }
 
 /*
  * Changes the set of the rules in the given rule range @rt
  * to @rt->new_set.
  *
  * Returns 0 on success.
  */
 static int
 move_range(struct ip_fw_chain *chain, ipfw_range_tlv *rt)
 {
 	struct ip_fw *r;
 	int error, idx;
 
 	IPFW_UH_WLOCK(chain);
 
 	/*
 	 * Named objects go first: move_objects() refuses the request
 	 * when objects referenced by the selected rules cannot follow
 	 * them into the new set. Only if it succeeds are the rules
 	 * themselves re-assigned.
 	 */
 	error = move_objects(chain, rt);
 	if (error == 0) {
 		/* XXX: We have to do swap holding WLOCK */
 		for (idx = 0; idx < chain->n_rules; idx++) {
 			r = chain->map[idx];
 			if (ipfw_match_range(r, rt) != 0)
 				r->set = rt->new_set;
 		}
 	}
 
 	IPFW_UH_WUNLOCK(chain);
 
 	return (error);
 }
 
 /*
  * Reset the accounting data of a single rule.
  *
  * Unless @log_only is set, the rule counters are zeroed. If the rule's
  * action slot holds an O_LOG instruction, its remaining-log budget is
  * refilled from max_log.
  * Callers only need to guarantee that the rule does not disappear;
  * the operations themselves are idempotent.
  */
 static void
 clear_counters(struct ip_fw *rule, int log_only)
 {
 	ipfw_insn_log *logop;
 
 	logop = (ipfw_insn_log *)ACTION_PTR(rule);
 
 	if (log_only == 0)
 		IPFW_ZERO_RULE_COUNTER(rule);
 
 	if (logop->o.opcode != O_LOG)
 		return;
 	logop->log_left = logop->max_log;
 }
 
 /*
  * Clears rule counters and/or log budgets for every rule matching @rt.
  * IPFW_RCFLAG_DEFAULT is forced on in @rt->flags before matching.
  *
  * Returns the number of rules that were cleared.
  */
 static int
 clear_range(struct ip_fw_chain *chain, ipfw_range_tlv *rt, int log_only)
 {
 	struct ip_fw *r;
 	int cleared, idx;
 
 	rt->flags |= IPFW_RCFLAG_DEFAULT;
 	cleared = 0;
 
 	IPFW_UH_WLOCK(chain);	/* arbitrate writers */
 	for (idx = 0; idx < chain->n_rules; idx++) {
 		r = chain->map[idx];
 		if (ipfw_match_range(r, rt) != 0) {
 			clear_counters(r, log_only);
 			cleared++;
 		}
 	}
 	IPFW_UH_WUNLOCK(chain);
 
 	return (cleared);
 }
 
 /*
  * Sanity-checks a user-supplied range TLV.
  *
  * Returns 0 if the TLV is acceptable and 1 otherwise: wrong length,
  * inverted rule range, set numbers out of bounds, or flag bits
  * outside IPFW_RCFLAG_USER.
  */
 static int
 check_range_tlv(ipfw_range_tlv *rt)
 {
 
 	if (rt->head.length != sizeof(*rt) ||
 	    rt->start_rule > rt->end_rule ||
 	    rt->set >= IPFW_MAX_SETS ||
 	    rt->new_set >= IPFW_MAX_SETS ||
 	    (rt->flags & IPFW_RCFLAG_USER) != rt->flags)
 		return (1);
 
 	return (0);
 }
 
 /*
  * Sockopt handler: delete the rules selected by a range TLV.
  * Data layout (v0)(current):
  * Request: [ ipfw_obj_header ipfw_range_tlv ]
  * Reply: [ ipfw_obj_header ipfw_range_tlv ]
  *
  * On success the number of deleted rules is reported back to the
  * caller in ipfw_range_tlv->new_set.
  *
  * Returns 0 on success, EINVAL for a malformed request, or the error
  * from delete_range().
  */
 static int
 del_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
     struct sockopt_data *sd)
 {
 	ipfw_range_header *hdr;
 	int deleted, rc;
 
 	if (sd->valsize != sizeof(*hdr))
 		return (EINVAL);
 
 	hdr = (ipfw_range_header *)ipfw_get_sopt_space(sd, sd->valsize);
 	if (check_range_tlv(&hdr->range) != 0)
 		return (EINVAL);
 
 	deleted = 0;
 	rc = delete_range(chain, &hdr->range, &deleted);
 	if (rc == 0) {
 		/* Save number of rules deleted */
 		hdr->range.new_set = deleted;
 	}
 	return (rc);
 }
 
 /*
  * Sockopt handler: move the rules selected by a range TLV to a new set.
  * Data layout (v0)(current):
  * Request: [ ipfw_obj_header ipfw_range_tlv ]
  *
  * Returns 0 on success, EINVAL for a malformed request, or the error
  * from move_range().
  */
 static int
 move_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
     struct sockopt_data *sd)
 {
 	ipfw_range_header *hdr;
 	int bad;
 
 	if (sd->valsize != sizeof(*hdr))
 		return (EINVAL);
 
 	hdr = (ipfw_range_header *)ipfw_get_sopt_space(sd, sd->valsize);
 	bad = check_range_tlv(&hdr->range);
 	if (bad != 0)
 		return (EINVAL);
 
 	return (move_range(chain, &hdr->range));
 }
 
 /*
  * Sockopt handler: clear accounting data of the rules selected by a
  * range TLV. With opcode IP_FW_XRESETLOG only the log budgets are reset.
  * Data layout (v0)(current):
  * Request: [ ipfw_obj_header ipfw_range_tlv ]
  * Reply: [ ipfw_obj_header ipfw_range_tlv ]
  *
  * The number of cleared rules is reported back to the caller in
  * ipfw_range_tlv->new_set.
  *
  * Returns 0 on success, EINVAL for a malformed request.
  */
 static int
 clear_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
     struct sockopt_data *sd)
 {
 	ipfw_range_header *hdr;
 	char *text;
 	int cleared, only_log;
 
 	if (sd->valsize != sizeof(*hdr))
 		return (EINVAL);
 
 	hdr = (ipfw_range_header *)ipfw_get_sopt_space(sd, sd->valsize);
 	if (check_range_tlv(&hdr->range) != 0)
 		return (EINVAL);
 
 	only_log = (op3->opcode == IP_FW_XRESETLOG);
 	cleared = clear_range(chain, &hdr->range, only_log);
 
 	/* Pick the message that goes to the system log. */
 	if (hdr->range.flags & IPFW_RCFLAG_ALL) {
 		if (only_log)
 			text = "All logging counts reset";
 		else
 			text = "Accounting cleared";
 	} else {
 		if (only_log)
 			text = "logging count reset";
 		else
 			text = "cleared";
 	}
 
 	if (V_fw_verbose)
 		log(LOG_SECURITY | LOG_NOTICE, "ipfw: %s.\n", text);
 
 	/* Save number of rules cleared */
 	hdr->range.new_set = cleared;
 	return (0);
 }
 
 /*
  * Updates the mask of disabled rule sets.
  *
  * Here @rt->set and @rt->new_set are used as bitmasks, not set numbers:
  * bits in @rt->set are added to the disabled mask, bits in @rt->new_set
  * are removed from it. The bit for RESVD_SET is always cleared, so that
  * set can never be disabled.
  *
  * The caller must hold IPFW_UH_WLOCK (asserted); IPFW_WLOCK is taken
  * around the store to V_set_disable.
  */
 static void
 enable_sets(struct ip_fw_chain *chain, ipfw_range_tlv *rt)
 {
 	uint32_t v_set;
 
 	IPFW_UH_WLOCK_ASSERT(chain);
 
 	/* Change enabled/disabled sets mask */
 	v_set = (V_set_disable | rt->set) & ~rt->new_set;
 	/*
 	 * Set RESVD_SET is always enabled. The bit is built from an
 	 * unsigned constant: if RESVD_SET is the top bit of the mask,
 	 * shifting a signed 1 there would be undefined behaviour.
 	 */
 	v_set &= ~(1U << RESVD_SET);
 	IPFW_WLOCK(chain);
 	V_set_disable = v_set;
 	IPFW_WUNLOCK(chain);
 }
 
 /*
  * Swaps sets @rt->set and @rt->new_set, or, when @mv is non-zero,
  * moves everything from @rt->set into @rt->new_set.
  *
  * The caller must hold IPFW_UH_WLOCK (asserted).
  *
  * Returns 0 on success (including the no-op case of identical sets)
  * and EEXIST when a move is refused by a named-object handler.
  */
 static int
 swap_sets(struct ip_fw_chain *chain, ipfw_range_tlv *rt, int mv)
 {
 	struct opcode_obj_rewrite *handler;
 	struct ip_fw *r;
 	uint8_t from, to;
 	int idx, res;
 
 	IPFW_UH_WLOCK_ASSERT(chain);
 
 	/* Nothing to do when both set numbers are the same. */
 	if (rt->set == rt->new_set)
 		return (0);
 
 	from = (uint8_t)rt->set;
 	to = (uint8_t)rt->new_set;
 
 	/*
 	 * Before moving the rules we need to check that
 	 * there aren't any conflicting named objects.
 	 */
 	if (mv != 0) {
 		for (handler = ctl3_rewriters;
 		    handler < ctl3_rewriters + ctl3_rsize; handler++) {
 			if (handler->manage_sets != NULL) {
 				res = handler->manage_sets(chain, from, to,
 				    TEST_ALL);
 				if (res != 0)
 					return (EEXIST);
 			}
 		}
 	}
 
 	/* Swap or move two sets; the last map entry is left alone. */
 	for (idx = 0; idx < chain->n_rules - 1; idx++) {
 		r = chain->map[idx];
 		if (r->set == from) {
 			r->set = to;
 			continue;
 		}
 		if (mv == 0 && r->set == to)
 			r->set = from;
 	}
 
 	/* Let every handler that tracks sets update its named objects. */
 	for (handler = ctl3_rewriters;
 	    handler < ctl3_rewriters + ctl3_rsize; handler++) {
 		if (handler->manage_sets != NULL)
 			handler->manage_sets(chain, from, to,
 			    mv != 0 ? MOVE_ALL: SWAP_ALL);
 	}
 	return (0);
 }
 
 /*
  * Swaps or moves set, or updates the mask of disabled sets.
  * Data layout (v0)(current):
  * Request: [ ipfw_obj_header ipfw_range_tlv ]
  *
  * For IP_FW_SET_ENABLE the set/new_set fields of the range carry
  * bitmasks, so they are not checked against IPFW_MAX_SETS.
  *
  * Returns 0 on success, EINVAL for a malformed request, or the error
  * reported by swap_sets().
  */
 static int
 manage_sets(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
     struct sockopt_data *sd)
 {
 	ipfw_range_header *rh;
 	int ret;
 
 	if (sd->valsize != sizeof(*rh))
 		return (EINVAL);
 
 	rh = (ipfw_range_header *)ipfw_get_sopt_space(sd, sd->valsize);
 
 	/*
 	 * A wrong TLV length is a malformed request like any other:
 	 * report EINVAL rather than a bare 1, which callers would
 	 * interpret as an unrelated errno value.
 	 */
 	if (rh->range.head.length != sizeof(ipfw_range_tlv))
 		return (EINVAL);
 	/* enable_sets() expects bitmasks. */
 	if (op3->opcode != IP_FW_SET_ENABLE &&
 	    (rh->range.set >= IPFW_MAX_SETS ||
 	    rh->range.new_set >= IPFW_MAX_SETS))
 		return (EINVAL);
 
 	ret = 0;
 	IPFW_UH_WLOCK(chain);
 	switch (op3->opcode) {
 	case IP_FW_SET_SWAP:
 	case IP_FW_SET_MOVE:
 		ret = swap_sets(chain, &rh->range,
 		    op3->opcode == IP_FW_SET_MOVE);
 		break;
 	case IP_FW_SET_ENABLE:
 		enable_sets(chain, &rh->range);
 		break;
 	}
 	IPFW_UH_WUNLOCK(chain);
 
 	return (ret);
 }
 
 /**
  * Remove all rules with given number, or do set manipulation.
  * Assumes chain != NULL && *chain != NULL.
  *
  * The argument is an uint32_t. The low 16 bit are the rule or set number;
  * the next 8 bits are the new set; the top 8 bits indicate the command:
  *
  *	0	delete rules numbered "rulenum"
  *	1	delete rules in set "rulenum"
  *	2	move rules "rulenum" to set "new_set"
  *	3	move rules from set "rulenum" to set "new_set"
  *	4	swap sets "rulenum" and "new_set"
  *	5	delete rules "rulenum" and set "new_set"
  *
  * The request is translated into an ipfw_range_tlv and handed to
  * delete_range(), move_range() or swap_sets().
  *
  * Returns 0 on success, EINVAL for out-of-range arguments or when a
  * delete of a specific rule number removed nothing, or the error from
  * the helper that was called.
  */
 static int
 del_entry(struct ip_fw_chain *chain, uint32_t arg)
 {
 	uint32_t num;	/* rule number or old_set */
 	uint8_t cmd, new_set;
 	int do_del, ndel;
 	int error = 0;
 	ipfw_range_tlv rt;
 
 	/* Unpack the three fields of the legacy argument. */
 	num = arg & 0xffff;
 	cmd = (arg >> 24) & 0xff;
 	new_set = (arg >> 16) & 0xff;
 
 	if (cmd > 5 || new_set > RESVD_SET)
 		return EINVAL;
 	if (cmd == 0 || cmd == 2 || cmd == 5) {
 		/* @num is a rule number for these commands. */
 		if (num >= IPFW_DEFAULT_RULE)
 			return EINVAL;
 	} else {
 		if (num > RESVD_SET)	/* old_set */
 			return EINVAL;
 	}
 
 	/* Convert old requests into new representation */
 	memset(&rt, 0, sizeof(rt));
 	rt.start_rule = num;
 	rt.end_rule = num;
 	rt.set = num;
 	rt.new_set = new_set;
 	do_del = 0;
 
 	switch (cmd) {
 	case 0: /* delete rules numbered "rulenum" */
 		/* Rule number 0 selects every rule. */
 		if (num == 0)
 			rt.flags |= IPFW_RCFLAG_ALL;
 		else
 			rt.flags |= IPFW_RCFLAG_RANGE;
 		do_del = 1;
 		break;
 	case 1: /* delete rules in set "rulenum" */
 		rt.flags |= IPFW_RCFLAG_SET;
 		do_del = 1;
 		break;
 	case 5: /* delete rules "rulenum" and set "new_set" */
 		rt.flags |= IPFW_RCFLAG_RANGE | IPFW_RCFLAG_SET;
 		/* Here "new_set" is the set to match, not a destination. */
 		rt.set = new_set;
 		rt.new_set = 0;
 		do_del = 1;
 		break;
 	case 2: /* move rules "rulenum" to set "new_set" */
 		rt.flags |= IPFW_RCFLAG_RANGE;
 		break;
 	case 3: /* move rules from set "rulenum" to set "new_set" */
 		IPFW_UH_WLOCK(chain);
 		error = swap_sets(chain, &rt, 1);
 		IPFW_UH_WUNLOCK(chain);
 		return (error);
 	case 4: /* swap sets "rulenum" and "new_set" */
 		IPFW_UH_WLOCK(chain);
 		error = swap_sets(chain, &rt, 0);
 		IPFW_UH_WUNLOCK(chain);
 		return (error);
 	default:
 		/* Not reachable: cmd > 5 was rejected above. */
 		return (ENOTSUP);
 	}
 
 	if (do_del != 0) {
 		if ((error = delete_range(chain, &rt, &ndel)) != 0)
 			return (error);
 
 		/*
 		 * Deleting nothing is an error only when a specific rule
 		 * number was requested (not for "all rules" or "whole set").
 		 */
 		if (ndel == 0 && (cmd != 1 && num != 0))
 			return (EINVAL);
 
 		return (0);
 	}
 
 	/* Only command 2 gets here. */
 	return (move_range(chain, &rt));
 }
 
 /**
  * Reset some or all counters on firewall rules.
  * The argument `arg' is an u_int32_t. The low 16 bit are the rule number,
  * the next 8 bits are the set number, the top 8 bits are the command:
  *	0	work with rules from all set's;
  *	1	work with rules only from specified set.
  * Specified rule number is zero if we want to clear all entries.
  * log_only is 1 if we only want to reset logs, zero otherwise.
  *
  * Runs under IPFW_UH_RLOCK. When V_fw_verbose is set, the outcome is
  * written to the system log.
  *
  * Returns 0 on success, EINVAL for a bad command/set or when no rule
  * with the requested number exists.
  */
 static int
 zero_entry(struct ip_fw_chain *chain, u_int32_t arg, int log_only)
 {
 	struct ip_fw *rule;
 	char *msg;
 	int i;
 
 	uint16_t rulenum = arg & 0xffff;
 	uint8_t set = (arg >> 16) & 0xff;
 	uint8_t cmd = (arg >> 24) & 0xff;
 
 	if (cmd > 1)
 		return (EINVAL);
 	if (cmd == 1 && set > RESVD_SET)
 		return (EINVAL);
 
 	IPFW_UH_RLOCK(chain);
 	if (rulenum == 0) {
 		/* Clear everything (optionally restricted to one set). */
 		V_norule_counter = 0;
 		for (i = 0; i < chain->n_rules; i++) {
 			rule = chain->map[i];
 			/* Skip rules not in our set. */
 			if (cmd == 1 && rule->set != set)
 				continue;
 			clear_counters(rule, log_only);
 		}
 		msg = log_only ? "All logging counts reset" :
 		    "Accounting cleared";
 	} else {
 		int cleared = 0;
 		for (i = 0; i < chain->n_rules; i++) {
 			rule = chain->map[i];
 			if (rule->rulenum == rulenum) {
 				if (cmd == 0 || rule->set == set)
 					clear_counters(rule, log_only);
 				/*
 				 * A rule with the right number counts as
 				 * found even if its set did not match.
 				 */
 				cleared = 1;
 			}
 			/* Stop at the first larger rule number. */
 			if (rule->rulenum > rulenum)
 				break;
 		}
 		if (!cleared) {	/* we did not find any matching rules */
 			IPFW_UH_RUNLOCK(chain);
 			return (EINVAL);
 		}
 		msg = log_only ? "logging count reset" : "cleared";
 	}
 	IPFW_UH_RUNLOCK(chain);
 
 	if (V_fw_verbose) {
 		int lev = LOG_SECURITY | LOG_NOTICE;
 
 		if (rulenum)
 			log(lev, "ipfw: Entry %d %s.\n", rulenum, msg);
 		else
 			log(lev, "ipfw: %s.\n", msg);
 	}
 	return (0);
 }
 
 
 /*
  * Validate the header of a rule in FreeBSD11 format, then hand the
  * opcode stream to check_ipfw_rule_body().
  *
  * @size is the number of bytes supplied for the rule; it has to equal
  * RULESIZE(rule) rounded up to a multiple of sizeof(uint64_t).
  *
  * Returns 0 if the rule is acceptable, an errno value otherwise.
  */
 static int
 check_ipfw_rule1(struct ip_fw_rule *rule, int size,
     struct rule_check_info *ci)
 {
 	int expected;
 
 	if (size < sizeof(*rule)) {
 		printf("ipfw: rule too short\n");
 		return (EINVAL);
 	}
 
 	/* The declared cmd_len has to account for the whole buffer. */
 	expected = roundup2(RULESIZE(rule), sizeof(uint64_t));
 	if (size != expected) {
 		printf("ipfw: size mismatch (have %d want %d)\n", size,
 		    expected);
 		return (EINVAL);
 	}
 
 	/* The action has to start inside the opcode stream. */
 	if (rule->cmd_len <= rule->act_ofs) {
 		printf("ipfw: bogus action offset (%u > %u)\n",
 		    rule->act_ofs, rule->cmd_len - 1);
 		return (EINVAL);
 	}
 
 	/* The default rule number is not available to users. */
 	if (rule->rulenum >= IPFW_DEFAULT_RULE)
 		return (EINVAL);
 
 	return (check_ipfw_rule_body(rule->cmd, rule->cmd_len, ci));
 }
 
 /*
  * Validate the header of a rule in FreeBSD8 format, then hand the
  * opcode stream to check_ipfw_rule_body().
  *
  * @size is the number of bytes supplied for the rule; it has to equal
  * the header size plus cmd_len 4-byte words, less the 4 bytes that the
  * header already reserves for the first instruction.
  *
  * Returns 0 if the rule is acceptable, an errno value otherwise.
  */
 static int
 check_ipfw_rule0(struct ip_fw_rule0 *rule, int size,
     struct rule_check_info *ci)
 {
 	int expected;
 
 	if (size < sizeof(*rule)) {
 		printf("ipfw: rule too short\n");
 		return (EINVAL);
 	}
 
 	/* The declared cmd_len has to account for the whole buffer. */
 	expected = sizeof(*rule) + rule->cmd_len * 4 - 4;
 	if (size != expected) {
 		printf("ipfw: size mismatch (have %d want %d)\n", size,
 		    expected);
 		return (EINVAL);
 	}
 
 	/* The action has to start inside the opcode stream. */
 	if (rule->cmd_len <= rule->act_ofs) {
 		printf("ipfw: bogus action offset (%u > %u)\n",
 		    rule->act_ofs, rule->cmd_len - 1);
 		return (EINVAL);
 	}
 
 	/* The default rule number is not available to users. */
 	if (rule->rulenum >= IPFW_DEFAULT_RULE)
 		return (EINVAL);
 
 	return (check_ipfw_rule_body(rule->cmd, rule->cmd_len, ci));
 }
 
 /*
  * Validates the opcode stream of a rule.
  *
  * Walks @cmd_len instruction words starting at @cmd and checks the
  * length (and for some opcodes the arguments) of every instruction.
  * Action opcodes funnel into the check_action label, which enforces
  * that there is a single action and that it ends the rule.
  *
  * Side effects:
  *  - @ci->object_opcodes is incremented for the opcodes that do so
  *    below (state, limit, table lookup, interface, external action);
  *  - O_LOG instructions get log_left re-armed from max_log.
  *
  * Returns 0 if the body is acceptable, EINVAL for a malformed or
  * unsupported instruction, or EPROTONOSUPPORT for IPv6 opcodes on a
  * kernel built without INET6.
  */
 static int
 check_ipfw_rule_body(ipfw_insn *cmd, int cmd_len, struct rule_check_info *ci)
 {
 	int cmdlen, l;
 	int have_action;
 
 	have_action = 0;
 
 	/*
 	 * Now go for the individual checks. Very simple ones, basically only
 	 * instruction sizes.
 	 */
 	for (l = cmd_len; l > 0 ; l -= cmdlen, cmd += cmdlen) {
 		cmdlen = F_LEN(cmd);
 		/* An instruction must not run past the end of the rule. */
 		if (cmdlen > l) {
 			printf("ipfw: opcode %d size truncated\n",
 			    cmd->opcode);
 			return EINVAL;
 		}
 		switch (cmd->opcode) {
 		case O_PROBE_STATE:
 		case O_KEEP_STATE:
 			if (cmdlen != F_INSN_SIZE(ipfw_insn))
 				goto bad_size;
 			ci->object_opcodes++;
 			break;
 		case O_PROTO:
 		case O_IP_SRC_ME:
 		case O_IP_DST_ME:
 		case O_LAYER2:
 		case O_IN:
 		case O_FRAG:
 		case O_DIVERTED:
 		case O_IPOPT:
 		case O_IPTOS:
 		case O_IPPRECEDENCE:
 		case O_IPVER:
 		case O_SOCKARG:
 		case O_TCPFLAGS:
 		case O_TCPOPTS:
 		case O_ESTAB:
 		case O_VERREVPATH:
 		case O_VERSRCREACH:
 		case O_ANTISPOOF:
 		case O_IPSEC:
 #ifdef INET6
 		case O_IP6_SRC_ME:
 		case O_IP6_DST_ME:
 		case O_EXT_HDR:
 		case O_IP6:
 #endif
 		case O_IP4:
 		case O_TAG:
 			if (cmdlen != F_INSN_SIZE(ipfw_insn))
 				goto bad_size;
 			break;
 
 		case O_EXTERNAL_ACTION:
 			if (cmd->arg1 == 0 ||
 			    cmdlen != F_INSN_SIZE(ipfw_insn)) {
 				printf("ipfw: invalid external "
 				    "action opcode\n");
 				return (EINVAL);
 			}
 			ci->object_opcodes++;
 			/*
 			 * Do we have O_EXTERNAL_INSTANCE or O_EXTERNAL_DATA
 			 * opcode?
 			 */
 			if (l != cmdlen) {
 				/* Step to the companion opcode by hand. */
 				l -= cmdlen;
 				cmd += cmdlen;
 				cmdlen = F_LEN(cmd);
 				if (cmd->opcode == O_EXTERNAL_DATA)
 					goto check_action;
 				if (cmd->opcode != O_EXTERNAL_INSTANCE) {
 					printf("ipfw: invalid opcode "
 					    "next to external action %u\n",
 					    cmd->opcode);
 					return (EINVAL);
 				}
 				if (cmd->arg1 == 0 ||
 				    cmdlen != F_INSN_SIZE(ipfw_insn)) {
 					printf("ipfw: invalid external "
 					    "action instance opcode\n");
 					return (EINVAL);
 				}
 				ci->object_opcodes++;
 			}
 			goto check_action;
 
 		case O_FIB:
 			if (cmdlen != F_INSN_SIZE(ipfw_insn))
 				goto bad_size;
 			if (cmd->arg1 >= rt_numfibs) {
 				printf("ipfw: invalid fib number %d\n",
 					cmd->arg1);
 				return EINVAL;
 			}
 			break;
 
 		case O_SETFIB:
 			if (cmdlen != F_INSN_SIZE(ipfw_insn))
 				goto bad_size;
 			/* IP_FW_TARG is exempt from the fib range check. */
 			if ((cmd->arg1 != IP_FW_TARG) &&
 			    ((cmd->arg1 & 0x7FFF) >= rt_numfibs)) {
 				printf("ipfw: invalid fib number %d\n",
 					cmd->arg1 & 0x7FFF);
 				return EINVAL;
 			}
 			goto check_action;
 
 		case O_UID:
 		case O_GID:
 		case O_JAIL:
 		case O_IP_SRC:
 		case O_IP_DST:
 		case O_TCPSEQ:
 		case O_TCPACK:
 		case O_PROB:
 		case O_ICMPTYPE:
 			if (cmdlen != F_INSN_SIZE(ipfw_insn_u32))
 				goto bad_size;
 			break;
 
 		case O_LIMIT:
 			if (cmdlen != F_INSN_SIZE(ipfw_insn_limit))
 				goto bad_size;
 			ci->object_opcodes++;
 			break;
 
 		case O_LOG:
 			if (cmdlen != F_INSN_SIZE(ipfw_insn_log))
 				goto bad_size;
 
 			/* Start the rule with a full log budget. */
 			((ipfw_insn_log *)cmd)->log_left =
 			    ((ipfw_insn_log *)cmd)->max_log;
 
 			break;
 
 		case O_IP_SRC_MASK:
 		case O_IP_DST_MASK:
 			/* only odd command lengths */
 			if ((cmdlen & 1) == 0)
 				goto bad_size;
 			break;
 
 		case O_IP_SRC_SET:
 		case O_IP_DST_SET:
 			if (cmd->arg1 == 0 || cmd->arg1 > 256) {
 				printf("ipfw: invalid set size %d\n",
 					cmd->arg1);
 				return EINVAL;
 			}
 			/* One 32-bit word per 32 entries, rounded up. */
 			if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) +
 			    (cmd->arg1+31)/32 )
 				goto bad_size;
 			break;
 
 		case O_IP_SRC_LOOKUP:
 			if (cmdlen > F_INSN_SIZE(ipfw_insn_u32))
 				goto bad_size;
 			/*
 			 * No break here: execution continues into the
 			 * O_IP_DST_LOOKUP checks below.
 			 */
 		case O_IP_DST_LOOKUP:
 			if (cmd->arg1 >= V_fw_tables_max) {
 				printf("ipfw: invalid table number %d\n",
 				    cmd->arg1);
 				return (EINVAL);
 			}
 			if (cmdlen != F_INSN_SIZE(ipfw_insn) &&
 			    cmdlen != F_INSN_SIZE(ipfw_insn_u32) + 1 &&
 			    cmdlen != F_INSN_SIZE(ipfw_insn_u32))
 				goto bad_size;
 			ci->object_opcodes++;
 			break;
 		case O_IP_FLOW_LOOKUP:
 			if (cmd->arg1 >= V_fw_tables_max) {
 				printf("ipfw: invalid table number %d\n",
 				    cmd->arg1);
 				return (EINVAL);
 			}
 			if (cmdlen != F_INSN_SIZE(ipfw_insn) &&
 			    cmdlen != F_INSN_SIZE(ipfw_insn_u32))
 				goto bad_size;
 			ci->object_opcodes++;
 			break;
 		case O_MACADDR2:
 			if (cmdlen != F_INSN_SIZE(ipfw_insn_mac))
 				goto bad_size;
 			break;
 
 		case O_NOP:
 		case O_IPID:
 		case O_IPTTL:
 		case O_IPLEN:
 		case O_TCPDATALEN:
 		case O_TCPWIN:
 		case O_TAGGED:
 			if (cmdlen < 1 || cmdlen > 31)
 				goto bad_size;
 			break;
 
 		case O_DSCP:
 			if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) + 1)
 				goto bad_size;
 			break;
 
 		case O_MAC_TYPE:
 		case O_IP_SRCPORT:
 		case O_IP_DSTPORT: /* XXX artificial limit, 30 port pairs */
 			if (cmdlen < 2 || cmdlen > 31)
 				goto bad_size;
 			break;
 
 		case O_RECV:
 		case O_XMIT:
 		case O_VIA:
 			if (cmdlen != F_INSN_SIZE(ipfw_insn_if))
 				goto bad_size;
 			ci->object_opcodes++;
 			break;
 
 		case O_ALTQ:
 			if (cmdlen != F_INSN_SIZE(ipfw_insn_altq))
 				goto bad_size;
 			break;
 
 		case O_PIPE:
 		case O_QUEUE:
 			if (cmdlen != F_INSN_SIZE(ipfw_insn))
 				goto bad_size;
 			goto check_action;
 
 		case O_FORWARD_IP:
 			if (cmdlen != F_INSN_SIZE(ipfw_insn_sa))
 				goto bad_size;
 			goto check_action;
 #ifdef INET6
 		case O_FORWARD_IP6:
 			if (cmdlen != F_INSN_SIZE(ipfw_insn_sa6))
 				goto bad_size;
 			goto check_action;
 #endif /* INET6 */
 
 		/*
 		 * The following actions are rejected when the hook or
 		 * facility they depend on is not available.
 		 */
 		case O_DIVERT:
 		case O_TEE:
 			if (ip_divert_ptr == NULL)
 				return EINVAL;
 			else
 				goto check_size;
 		case O_NETGRAPH:
 		case O_NGTEE:
 			if (ng_ipfw_input_p == NULL)
 				return EINVAL;
 			else
 				goto check_size;
 		case O_NAT:
 			if (!IPFW_NAT_LOADED)
 				return EINVAL;
 			if (cmdlen != F_INSN_SIZE(ipfw_insn_nat))
 				goto bad_size;
 			goto check_action;
 		case O_CHECK_STATE:
 			ci->object_opcodes++;
 			/* FALLTHROUGH */
 		case O_FORWARD_MAC: /* XXX not implemented yet */
 		case O_COUNT:
 		case O_ACCEPT:
 		case O_DENY:
 		case O_REJECT:
 		case O_SETDSCP:
 #ifdef INET6
 		case O_UNREACH6:
 #endif
 		case O_SKIPTO:
 		case O_REASS:
 		case O_CALLRETURN:
 check_size:
 			if (cmdlen != F_INSN_SIZE(ipfw_insn))
 				goto bad_size;
 check_action:
 			/* Common tail for every action opcode. */
 			if (have_action) {
 				printf("ipfw: opcode %d, multiple actions"
 					" not allowed\n",
 					cmd->opcode);
 				return (EINVAL);
 			}
 			have_action = 1;
 			/* Nothing may follow the action. */
 			if (l != cmdlen) {
 				printf("ipfw: opcode %d, action must be"
 					" last opcode\n",
 					cmd->opcode);
 				return (EINVAL);
 			}
 			break;
 #ifdef INET6
 		case O_IP6_SRC:
 		case O_IP6_DST:
 			if (cmdlen != F_INSN_SIZE(struct in6_addr) +
 			    F_INSN_SIZE(ipfw_insn))
 				goto bad_size;
 			break;
 
 		case O_FLOW6ID:
 			if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) +
 			    ((ipfw_insn_u32 *)cmd)->o.arg1)
 				goto bad_size;
 			break;
 
 		case O_IP6_SRC_MASK:
 		case O_IP6_DST_MASK:
 			if ( !(cmdlen & 1) || cmdlen > 127)
 				goto bad_size;
 			break;
 		case O_ICMP6TYPE:
 			if( cmdlen != F_INSN_SIZE( ipfw_insn_icmp6 ) )
 				goto bad_size;
 			break;
 #endif
 
 		default:
 			/*
 			 * Distinguish IPv6 opcodes that are merely compiled
 			 * out from opcodes that are not known at all.
 			 */
 			switch (cmd->opcode) {
 #ifndef INET6
 			case O_IP6_SRC_ME:
 			case O_IP6_DST_ME:
 			case O_EXT_HDR:
 			case O_IP6:
 			case O_UNREACH6:
 			case O_IP6_SRC:
 			case O_IP6_DST:
 			case O_FLOW6ID:
 			case O_IP6_SRC_MASK:
 			case O_IP6_DST_MASK:
 			case O_ICMP6TYPE:
 				printf("ipfw: no IPv6 support in kernel\n");
 				return (EPROTONOSUPPORT);
 #endif
 			default:
 				printf("ipfw: opcode %d, unknown opcode\n",
 					cmd->opcode);
 				return (EINVAL);
 			}
 		}
 	}
 	if (have_action == 0) {
 		printf("ipfw: missing action\n");
 		return (EINVAL);
 	}
 	return 0;
 
 bad_size:
 	/* Shared exit for every instruction-length mismatch. */
 	printf("ipfw: opcode %d size %d wrong\n",
 		cmd->opcode, cmdlen);
 	return (EINVAL);
 }
 
 
 /*
  * Translation of requests for compatibility with FreeBSD 7.2/8.
  * A static variable tells us if we have an old client from userland,
  * and if necessary we translate requests and responses between the
  * two formats.
  */
 static int is7 = 0;	/* non-zero: talk to userland in the 7.2 layout */
 
 /* Rule layout used when exporting rules to FreeBSD 7.2 clients. */
 struct ip_fw7 {
 	struct ip_fw7	*next;		/* linked list of rules     */
 	struct ip_fw7	*next_rule;	/* ptr to next [skipto] rule    */
 	/* 'next_rule' is used to pass up 'set_disable' status      */
 
 	uint16_t	act_ofs;	/* offset of action in 32-bit units */
 	uint16_t	cmd_len;	/* # of 32-bit words in cmd */
 	uint16_t	rulenum;	/* rule number          */
 	uint8_t		set;		/* rule set (0..31)     */
 	// #define RESVD_SET   31  /* set for default and persistent rules */
 	uint8_t		_pad;		/* padding          */
 	// uint32_t        id;             /* rule id, only in v.8 */
 	/* These fields are present in all rules.           */
 	uint64_t	pcnt;		/* Packet counter       */
 	uint64_t	bcnt;		/* Byte counter         */
 	uint32_t	timestamp;	/* tv_sec of last match     */
 
 	ipfw_insn	cmd[1];		/* storage for commands     */
 };
 
 /* In-place converters between the 7.2 and 8 rule layouts. */
 static int convert_rule_to_7(struct ip_fw_rule0 *rule);
 static int convert_rule_to_8(struct ip_fw_rule0 *rule);
 
 /*
  * Size in bytes of a rule in ip_fw7 layout: the fixed structure plus
  * cmd_len 4-byte words, less 4 bytes for the cmd[1] slot that the
  * structure already contains (presumably sizeof(ipfw_insn) == 4).
  */
 #ifndef RULESIZE7
 #define RULESIZE7(rule)  (sizeof(struct ip_fw7) + \
 	((struct ip_fw7 *)(rule))->cmd_len * 4 - 4)
 #endif
 
 
 /*
  * Copy the static and dynamic rules to the supplied buffer
  * and return the amount of space actually used.
  * Must be run under IPFW_UH_RLOCK
  *
  * Rules are written in the FreeBSD 7.2 layout when the file-scope
  * flag is7 is set and in the ip_fw_rule0 layout otherwise. Stored
  * non-zero timestamps are offset by the boot time on export.
  *
  * In the 7.2 path a failed conversion makes the function return 0.
  * In the regular path export stops at the first fatal
  * set_legacy_obj_kidx() error, but the rules copied so far (including
  * the failing one) are still reported.
  */
 static size_t
 ipfw_getrules(struct ip_fw_chain *chain, void *buf, size_t space)
 {
 	char *bp = buf;			/* write cursor */
 	char *ep = bp + space;		/* end of the caller's buffer */
 	struct ip_fw *rule;
 	struct ip_fw_rule0 *dst;
 	struct timeval boottime;
 	int error, i, l, warnflag;
 	time_t	boot_seconds;
 
 	warnflag = 0;
 
 	getboottime(&boottime);
         boot_seconds = boottime.tv_sec;
 	for (i = 0; i < chain->n_rules; i++) {
 		rule = chain->map[i];
 
 		if (is7) {
 		    /* Convert rule to FreeBSD 7.2 format */
 		    l = RULESIZE7(rule);
 		    /* A rule that does not fit is silently skipped. */
 		    if (bp + l + sizeof(uint32_t) <= ep) {
 			bcopy(rule, bp, l + sizeof(uint32_t));
 			error = set_legacy_obj_kidx(chain,
 			    (struct ip_fw_rule0 *)bp);
 			if (error != 0)
 				return (0);
 			error = convert_rule_to_7((struct ip_fw_rule0 *) bp);
 			if (error)
 				return 0; /*XXX correct? */
 			/*
 			 * XXX HACK. Store the disable mask in the "next"
 			 * pointer in a wild attempt to keep the ABI the same.
 			 * Why do we do this on EVERY rule?
 			 */
 			bcopy(&V_set_disable,
 				&(((struct ip_fw7 *)bp)->next_rule),
 				sizeof(V_set_disable));
 			if (((struct ip_fw7 *)bp)->timestamp)
 			    ((struct ip_fw7 *)bp)->timestamp += boot_seconds;
 			bp += l;
 		    }
 		    continue; /* go to next rule */
 		}
 
 		l = RULEUSIZE0(rule);
 		if (bp + l > ep) { /* should not happen */
 			printf("overflow dumping static rules\n");
 			break;
 		}
 		dst = (struct ip_fw_rule0 *)bp;
 		export_rule0(rule, dst, l);
 		/* The result is examined only after the mask is stored. */
 		error = set_legacy_obj_kidx(chain, dst);
 
 		/*
 		 * XXX HACK. Store the disable mask in the "next"
 		 * pointer in a wild attempt to keep the ABI the same.
 		 * Why do we do this on EVERY rule?
 		 *
 		 * XXX: "ipfw set show" (ab)uses IP_FW_GET to read disabled mask
 		 * so we need to fail _after_ saving at least one mask.
 		 */
 		bcopy(&V_set_disable, &dst->next_rule, sizeof(V_set_disable));
 		if (dst->timestamp)
 			dst->timestamp += boot_seconds;
 		bp += l;
 
 		if (error != 0) {
 			if (error == 2) {
 				/* Non-fatal table rewrite error. */
 				warnflag = 1;
 				continue;
 			}
 			printf("Stop on rule %d. Fail to convert table\n",
 			    rule->rulenum);
 			break;
 		}
 	}
 	/* Note: the process name argument is an empty string here. */
 	if (warnflag != 0)
 		printf("ipfw: process %s is using legacy interfaces,"
 		    " consider rebuilding\n", "");
 	ipfw_get_dynamic(chain, &bp, ep); /* protected by the dynamic lock */
 	return (bp - (char *)buf);
 }
 
 
 /*
  * Parameters of a configuration dump, consumed by dump_static_rules().
  */
 struct dump_args {
 	uint32_t	b;	/* start rule: first rule map index to dump */
 	uint32_t	e;	/* end rule: map index one past the last */
 	uint32_t	rcount;	/* number of rules */
 	uint32_t	rsize;	/* rules size */
 	uint32_t	tcount;	/* number of tables (name TLVs to export) */
 	int		rcounters;	/* counters: non-zero to export them */
 };
 
 /*
  * Fills @ntlv with the exported description of named object @no:
  * its TLV type, kernel index and name (truncated to fit if necessary).
  */
 void
 ipfw_export_obj_ntlv(struct named_object *no, ipfw_obj_ntlv *ntlv)
 {
 
 	/* TLV header first ... */
 	ntlv->head.length = sizeof(*ntlv);
 	ntlv->head.type = no->etlv;
 
 	/* ... then the object identity. */
 	strlcpy(ntlv->name, no->name, sizeof(ntlv->name));
 	ntlv->idx = no->kidx;
 }
 
 /*
  * Looks up the named object with kernel index @kidx in instance @ni
  * and exports it as an ipfw_obj_ntlv carved out of the @sd buffer.
  *
  * Returns 0 on success, ENOMEM when @sd has no room for the TLV.
  */
 static int
 export_objhash_ntlv(struct namedobj_instance *ni, uint16_t kidx,
     struct sockopt_data *sd)
 {
 	struct named_object *obj;
 	ipfw_obj_ntlv *out;
 
 	obj = ipfw_objhash_lookup_kidx(ni, kidx);
 	KASSERT(obj != NULL, ("invalid object kernel index passed"));
 
 	out = (ipfw_obj_ntlv *)ipfw_get_sopt_space(sd, sizeof(*out));
 	if (out != NULL) {
 		ipfw_export_obj_ntlv(obj, out);
 		return (0);
 	}
 
 	return (ENOMEM);
 }
 
 /*
  * Dumps static rules with table TLVs in buffer @sd.
  *
  * Output order:
  *  1. if @da->tcount is non-zero, an IPFW_TLV_TBLNAME_LIST header
  *     followed by one ipfw_obj_ntlv per bit set in @bmask;
  *  2. an IPFW_TLV_RULE_LIST header followed by the rules with map
  *     indices [@da->b, @da->e), each optionally preceded by counters.
  *
  * @bmask is a double-width bitmask: the first IPFW_TABLES_MAX bits
  * refer to table objects, the following bits to objects of the shared
  * (CHAIN_TO_SRV) named-object instance.
  *
  * Returns 0 on success, ENOMEM when @sd runs out of space, or the
  * error from export_objhash_ntlv().
  */
 static int
 dump_static_rules(struct ip_fw_chain *chain, struct dump_args *da,
     uint32_t *bmask, struct sockopt_data *sd)
 {
 	int error;
 	int i, l;
 	uint32_t tcount;
 	ipfw_obj_ctlv *ctlv;
 	struct ip_fw *krule;
 	struct namedobj_instance *ni;
 	caddr_t dst;
 
 	/* Dump table names first (if any) */
 	if (da->tcount > 0) {
 		/* Header first */
 		ctlv = (ipfw_obj_ctlv *)ipfw_get_sopt_space(sd, sizeof(*ctlv));
 		if (ctlv == NULL)
 			return (ENOMEM);
 		ctlv->head.type = IPFW_TLV_TBLNAME_LIST;
 		ctlv->head.length = da->tcount * sizeof(ipfw_obj_ntlv) +
 		    sizeof(*ctlv);
 		ctlv->count = da->tcount;
 		ctlv->objsize = sizeof(ipfw_obj_ntlv);
 	}
 
 	i = 0;
 	tcount = da->tcount;
 	ni = ipfw_get_table_objhash(chain);
 	while (tcount > 0) {
 		/*
 		 * Test the bit with an unsigned constant: for bit 31 a
 		 * signed "1 << 31" would be undefined behaviour.
 		 */
 		if ((bmask[i / 32] & (1U << (i % 32))) == 0) {
 			i++;
 			continue;
 		}
 
 		/*
 		 * Jump to shared named object bitmask: once the scan
 		 * leaves the table half, rebase both the index and the
 		 * bitmask pointer and switch to the shared instance.
 		 */
 		if (i >= IPFW_TABLES_MAX) {
 			ni = CHAIN_TO_SRV(chain);
 			i -= IPFW_TABLES_MAX;
 			bmask += IPFW_TABLES_MAX / 32;
 		}
 
 		if ((error = export_objhash_ntlv(ni, i, sd)) != 0)
 			return (error);
 
 		i++;
 		tcount--;
 	}
 
 	/* Dump rules */
 	ctlv = (ipfw_obj_ctlv *)ipfw_get_sopt_space(sd, sizeof(*ctlv));
 	if (ctlv == NULL)
 		return (ENOMEM);
 	ctlv->head.type = IPFW_TLV_RULE_LIST;
 	ctlv->head.length = da->rsize + sizeof(*ctlv);
 	ctlv->count = da->rcount;
 
 	for (i = da->b; i < da->e; i++) {
 		krule = chain->map[i];
 
 		/* Each rule is wrapped in a TLV, plus optional counters. */
 		l = RULEUSIZE1(krule) + sizeof(ipfw_obj_tlv);
 		if (da->rcounters != 0)
 			l += sizeof(struct ip_fw_bcounter);
 		dst = (caddr_t)ipfw_get_sopt_space(sd, l);
 		if (dst == NULL)
 			return (ENOMEM);
 
 		export_rule1(krule, dst, l, da->rcounters);
 	}
 
 	return (0);
 }
 
 /*
  * Marks every object index used in @rule with bit in @bmask.
  * Used to generate bitmask of referenced tables/objects for given ruleset
  * or its part.
  *
  * @bmask holds two consecutive bitmaps: table objects (those whose
  * rewriter has etlv == IPFW_TLV_TBL_NAME) use the first
  * IPFW_TABLES_MAX bits, all other objects the bits after them.
  *
  * Returns number of newly-referenced objects.
  */
 static int
 mark_object_kidx(struct ip_fw_chain *ch, struct ip_fw *rule,
     uint32_t *bmask)
 {
 	struct opcode_obj_rewrite *rw;
 	ipfw_insn *cmd;
 	int bidx, cmdlen, l, count;
 	uint32_t bit;
 	uint16_t kidx;
 	uint8_t subtype;
 
 	l = rule->cmd_len;
 	cmd = rule->cmd;
 	cmdlen = 0;
 	count = 0;
 	for ( ;	l > 0 ; l -= cmdlen, cmd += cmdlen) {
 		cmdlen = F_LEN(cmd);
 
 		/* Only opcodes that reference a named object matter. */
 		rw = find_op_rw(cmd, &kidx, &subtype);
 		if (rw == NULL)
 			continue;
 
 		bidx = kidx / 32;
 		/*
 		 * Maintain separate bitmasks for table and
 		 * non-table objects.
 		 */
 		if (rw->etlv != IPFW_TLV_TBL_NAME)
 			bidx += IPFW_TABLES_MAX / 32;
 
 		/*
 		 * Use an unsigned constant for the bit: for kidx % 32 == 31
 		 * a signed "1 << 31" would be undefined behaviour.
 		 */
 		bit = 1U << (kidx % 32);
 		if ((bmask[bidx] & bit) == 0)
 			count++;
 
 		bmask[bidx] |= bit;
 	}
 
 	return (count);
 }
 
 /*
  * Dumps requested objects data
  * Data layout (version 0)(current):
  * Request: [ ipfw_cfg_lheader ] + IPFW_CFG_GET_* flags
  *   size = ipfw_cfg_lheader.size
  * Reply: [ ipfw_cfg_lheader 
  *   [ ipfw_obj_ctlv(IPFW_TLV_TBL_LIST) ipfw_obj_ntlv x N ] (optional)
  *   [ ipfw_obj_ctlv(IPFW_TLV_RULE_LIST)
  *     ipfw_obj_tlv(IPFW_TLV_RULE_ENT) [ ip_fw_bcounter (optional) ip_fw_rule ]
  *   ] (optional)
  *   [ ipfw_obj_ctlv(IPFW_TLV_STATE_LIST) ipfw_obj_dyntlv x N ] (optional)
  * ]
  * * NOTE IPFW_TLV_STATE_LIST has the single valid field: objsize.
  * The rest (size, count) are set to zero and needs to be ignored.
  *
  * Returns 0 on success.
  */
 static int
 dump_config(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
     struct sockopt_data *sd)
 {
 	struct dump_args da;
 	ipfw_cfg_lheader *hdr;
 	struct ip_fw *rule;
 	uint32_t *bmask;
 	size_t sz, rnum;
 	uint32_t hdr_flags;
 	int error, i;
 
 	hdr = (ipfw_cfg_lheader *)ipfw_get_sopt_header(sd, sizeof(*hdr));
 	if (hdr == NULL)
 		return (EINVAL);
 
 	error = 0;
 	/*
 	 * The used-object bitmask is only needed when static rules are
 	 * requested.  It holds two index spaces back to back (tables,
 	 * then service objects), hence the doubled size.
 	 */
 	bmask = NULL;
 	if ((hdr->flags & IPFW_CFG_GET_STATIC) != 0)
 		bmask = malloc(IPFW_TABLES_MAX / 4, M_TEMP, M_WAITOK | M_ZERO);
 
 	IPFW_UH_RLOCK(chain);
 
 	/*
 	 * STAGE 1: work out the rule index range, the reply size and the
 	 * bitmask of objects referenced by the rules in that range.
 	 */
 	memset(&da, 0, sizeof(da));
 	sz = sizeof(ipfw_cfg_lheader);
 
 	if (hdr->end_rule == 0) {
 		/* No explicit range: cover the whole ruleset */
 		da.b = 0;
 		da.e = chain->n_rules;
 	} else {
 		/* Custom range, clamped to the default rule number */
 		rnum = hdr->start_rule;
 		if (rnum > IPFW_DEFAULT_RULE)
 			rnum = IPFW_DEFAULT_RULE;
 		da.b = ipfw_find_rule(chain, rnum, 0);
 
 		if (hdr->end_rule < IPFW_DEFAULT_RULE)
 			rnum = hdr->end_rule + 1;
 		else
 			rnum = IPFW_DEFAULT_RULE;
 		da.e = ipfw_find_rule(chain, rnum, UINT32_MAX) + 1;
 	}
 
 	if ((hdr->flags & IPFW_CFG_GET_STATIC) != 0) {
 		for (i = da.b; i < da.e; i++) {
 			rule = chain->map[i];
 			da.rsize += RULEUSIZE1(rule) + sizeof(ipfw_obj_tlv);
 			da.rcount++;
 			/* Record every object this rule refers to */
 			da.tcount += mark_object_kidx(chain, rule, bmask);
 		}
 
 		/* Per-rule counters are exported only on request */
 		if ((hdr->flags & IPFW_CFG_GET_COUNTERS) != 0) {
 			da.rsize += sizeof(struct ip_fw_bcounter) * da.rcount;
 			da.rcounters = 1;
 		}
 
 		/* The name list is emitted only if something is referenced */
 		if (da.tcount > 0)
 			sz += da.tcount * sizeof(ipfw_obj_ntlv) +
 			    sizeof(ipfw_obj_ctlv);
 		sz += da.rsize + sizeof(ipfw_obj_ctlv);
 	}
 
 	if ((hdr->flags & IPFW_CFG_GET_STATES) != 0)
 		sz += ipfw_dyn_get_count() * sizeof(ipfw_obj_dyntlv) +
 		     sizeof(ipfw_obj_ctlv);
 
 	/*
 	 * The header is always filled in so the caller learns the
 	 * required size.  The flags are copied to a local because the
 	 * buffer behind @hdr may be flushed while rules are dumped;
 	 * @hdr is cleared to prevent any later use.
 	 */
 	hdr->size = sz;
 	hdr->set_mask = ~V_set_disable;
 	hdr_flags = hdr->flags;
 	hdr = NULL;
 
 	if (sd->valsize < sz) {
 		/* Caller's buffer cannot hold the full reply */
 		error = ENOMEM;
 	} else {
 		/* STAGE 2: store actual data */
 		if ((hdr_flags & IPFW_CFG_GET_STATIC) != 0)
 			error = dump_static_rules(chain, &da, bmask, sd);
 
 		if (error == 0 && (hdr_flags & IPFW_CFG_GET_STATES) != 0)
 			error = ipfw_dump_states(chain, sd);
 	}
 
 	IPFW_UH_RUNLOCK(chain);
 
 	if (bmask != NULL)
 		free(bmask, M_TEMP);
 
 	return (error);
 }
 
 /*
  * Generic validity check for an object name: it has to be non-empty
  * and NUL-terminated within the size of ipfw_obj_ntlv.name.
  *
  * Returns 0 if the name is acceptable, EINVAL otherwise.
  */
 int
 ipfw_check_object_name_generic(const char *name)
 {
 	int maxlen;
 
 	maxlen = sizeof(((ipfw_obj_ntlv *)0)->name);
 
 	/* Reject empty names and names missing a terminator */
 	if (name[0] == '\0' || strnlen(name, maxlen) == maxlen)
 		return (EINVAL);
 
 	return (0);
 }
 
 /*
  * Creates non-existent objects referenced by rule.
  *
  * Return 0 on success.
  */
 int
 create_objects_compat(struct ip_fw_chain *ch, ipfw_insn *cmd,
     struct obj_idx *oib, struct obj_idx *pidx, struct tid_info *ti)
 {
 	struct opcode_obj_rewrite *rw;
 	struct obj_idx *cur;
 	uint16_t kidx;
 	int error;
 
 	/*
 	 * Compatibility path: walk the [@oib, @pidx) entries and create
 	 * every object that is referenced but has no kernel index yet.
 	 */
 	for (cur = oib; cur < pidx; cur++) {
 		/* A non-zero kidx means the entry is already resolved */
 		if (cur->kidx != 0)
 			continue;
 
 		ti->uidx = cur->uidx;
 		ti->type = cur->type;
 		ti->atype = 0;
 
 		rw = find_op_rw(cmd + cur->off, NULL, NULL);
 		KASSERT(rw != NULL, ("Unable to find handler for op %d",
 		    (cmd + cur->off)->opcode));
 
 		/* Not every object class supports creation on demand */
 		error = (rw->create_object != NULL) ?
 		    rw->create_object(ch, ti, &kidx) : EOPNOTSUPP;
 
 		if (error != 0) {
 			/*
 			 * Creation failed: roll back by dropping the
 			 * references recorded in the index array.
 			 */
 			IPFW_UH_WLOCK(ch);
 			unref_oib_objects(ch, cmd, oib, pidx);
 			IPFW_UH_WUNLOCK(ch);
 
 			return (error);
 		}
 
 		cur->kidx = kidx;
 	}
 
 	return (0);
 }
 
 /*
  * Compatibility function for old ipfw(8) binaries.
  * Rewrites table/nat kernel indices with userland ones.
  * Convert tables matching '/^\d+$/' to their atoi() value.
  * Use number 65535 for other tables.
  *
  * Returns 0 on success.
  */
 static int
 set_legacy_obj_kidx(struct ip_fw_chain *ch, struct ip_fw_rule0 *rule)
 {
 	struct opcode_obj_rewrite *rw;
 	struct named_object *no;
 	ipfw_insn *cmd;
 	char *end;
 	long val;
 	int cmdlen, error, l;
 	uint16_t kidx, uidx;
 	uint8_t subtype;
 
 	error = 0;
 
 	l = rule->cmd_len;
 	cmd = rule->cmd;
 	cmdlen = 0;
 	for ( ;	l > 0 ; l -= cmdlen, cmd += cmdlen) {
 		cmdlen = F_LEN(cmd);
 
 		/* Check if is index in given opcode */
 		rw = find_op_rw(cmd, &kidx, &subtype);
 		if (rw == NULL)
 			continue;
 
 		/* Try to find referenced kernel object */
 		no = rw->find_bykidx(ch, kidx);
 		if (no == NULL)
 			continue;
 
 		/*
 		 * Only purely numeric names map to a legacy index.
 		 * Insist on a leading digit: strtol() alone also accepts
 		 * a sign or leading blanks, so a name such as "-3" would
 		 * yield a negative value that truncates to a bogus index.
 		 */
 		val = -1;
 		end = no->name;
 		if (no->name[0] >= '0' && no->name[0] <= '9')
 			val = strtol(no->name, &end, 10);
 
 		if (val >= 0 && *end == '\0' && val < 65535) {
 			uidx = val;
 		} else {
 
 			/*
 			 * We are called via legacy opcode.
 			 * Save error and show table as fake number
 			 * not to make ipfw(8) hang.
 			 */
 			uidx = 65535;
 			error = 2;
 		}
 
 		/* Store the userland-visible index back into the opcode */
 		rw->update(cmd, uidx);
 	}
 
 	return (error);
 }
 
 
 /*
  * Unreferences all already-referenced objects in given @cmd rule,
  * using information in @oib.
  *
  * Used to rollback partially converted rule on error.
  */
 static void
 unref_oib_objects(struct ip_fw_chain *ch, ipfw_insn *cmd, struct obj_idx *oib,
     struct obj_idx *end)
 {
 	struct opcode_obj_rewrite *rw;
 	struct named_object *no;
 	struct obj_idx *cur;
 
 	IPFW_UH_WLOCK_ASSERT(ch);
 
 	for (cur = oib; cur < end; cur++) {
 		/* Only entries carrying a kernel index hold a reference */
 		if (cur->kidx != 0) {
 			rw = find_op_rw(cmd + cur->off, NULL, NULL);
 			KASSERT(rw != NULL,
 			    ("Unable to find handler for op %d",
 			    (cmd + cur->off)->opcode));
 
 			/* Look the object up by its index and drop one ref */
 			no = rw->find_bykidx(ch, cur->kidx);
 			KASSERT(no != NULL,
 			    ("Ref'd object %d disappeared", cur->kidx));
 			no->refcnt--;
 		}
 	}
 }
 
 /*
  * Remove references from every object used in @rule.
  * Used at rule removal code.
  */
 static void
 unref_rule_objects(struct ip_fw_chain *ch, struct ip_fw *rule)
 {
 	struct opcode_obj_rewrite *rw;
 	struct named_object *no;
 	ipfw_insn *cmd;
 	int cmdlen, l;
 	uint16_t kidx;
 	uint8_t subtype;
 
 	IPFW_UH_WLOCK_ASSERT(ch);
 
 	cmdlen = 0;
 	cmd = rule->cmd;
 	for (l = rule->cmd_len; l > 0; l -= cmdlen, cmd += cmdlen) {
 		cmdlen = F_LEN(cmd);
 
 		/* Opcodes without a rewriter hold no object reference */
 		rw = find_op_rw(cmd, &kidx, &subtype);
 		if (rw == NULL)
 			continue;
 		no = rw->find_bykidx(ch, kidx);
 
 		KASSERT(no != NULL, ("object id %d not found", kidx));
 		KASSERT(no->subtype == subtype,
 		    ("wrong type %d (%d) for object id %d",
 		    no->subtype, subtype, kidx));
 		KASSERT(no->refcnt > 0, ("refcount for object %d is %d",
 		    kidx, no->refcnt));
 
 		/*
 		 * When this is the last reference and the object class
 		 * has a destructor, hand the object to it; otherwise
 		 * just drop the reference.
 		 */
 		if (no->refcnt != 1 || rw->destroy_object == NULL)
 			no->refcnt--;
 		else
 			rw->destroy_object(ch, no);
 	}
 }
 
 
 /*
  * Find and reference object (if any) stored in instruction @cmd.
  *
  * Saves object info in @pidx, sets
  *  - @unresolved to 1 if object should exists but not found
  *
  * Returns non-zero value in case of error.
  */
 static int
 ref_opcode_object(struct ip_fw_chain *ch, ipfw_insn *cmd, struct tid_info *ti,
     struct obj_idx *pidx, int *unresolved)
 {
 	struct opcode_obj_rewrite *rw;
 	struct named_object *no;
 	int error;
 
 	/* Nothing to do for opcodes that carry no object index */
 	rw = find_op_rw(cmd, &ti->uidx, &ti->type);
 	if (rw == NULL)
 		return (0);
 
 	/* Remember what userland asked for */
 	pidx->uidx = ti->uidx;
 	pidx->type = ti->type;
 
 	/* Look the object up by name */
 	error = rw->find_byname(ch, ti, &no);
 	if (error != 0)
 		return (error);
 
 	if (no != NULL) {
 		/* An existing object must have the expected subtype */
 		if (no->subtype != ti->type)
 			return (EINVAL);
 
 		/* Take a reference and store the kernel index in @cmd */
 		no->refcnt++;
 		rw->update(cmd, no->kidx);
 	} else {
 		/*
 		 * Object does not exist yet: report it so the caller
 		 * can arrange automatic creation.
 		 */
 		*unresolved = 1;
 	}
 
 	return (0);
 }
 
 /*
  * Finds and bumps refcount for objects referenced by given @rule.
  * Auto-creates non-existing tables.
  * Fills in @oib array with userland/kernel indexes.
  *
  * Returns 0 on success.
  */
 static int
 ref_rule_objects(struct ip_fw_chain *ch, struct ip_fw *rule,
     struct rule_check_info *ci, struct obj_idx *oib, struct tid_info *ti)
 {
 	struct obj_idx *pidx;
 	ipfw_insn *cmd;
 	int cmdlen, error, l, unresolved;
 
 	error = 0;
 	cmdlen = 0;
 	pidx = oib;
 	l = rule->cmd_len;
 	cmd = rule->cmd;
 
 	IPFW_UH_WLOCK(ch);
 
 	/* Take a reference on every existing object the rule uses */
 	for ( ; l > 0; l -= cmdlen, cmd += cmdlen) {
 		cmdlen = F_LEN(cmd);
 		unresolved = 0;
 
 		error = ref_opcode_object(ch, cmd, ti, pidx, &unresolved);
 		if (error != 0)
 			break;
 
 		/*
 		 * Compatibility for old clients: keep the slot (with the
 		 * opcode offset) for objects that still have to be
 		 * created automatically.
 		 */
 		if (unresolved != 0) {
 			pidx->off = rule->cmd_len - l;
 			pidx++;
 		}
 	}
 
 	/* On failure undo what has been done so far, still under the lock */
 	if (error != 0)
 		unref_oib_objects(ch, rule->cmd, oib, pidx);
 	IPFW_UH_WUNLOCK(ch);
 
 	if (error != 0)
 		return (error);
 
 	/* Auto-create whatever was left unresolved */
 	if (pidx != oib)
 		error = create_objects_compat(ch, rule->cmd, oib, pidx, ti);
 
 	/* Report how many index slots were actually used */
 	ci->object_opcodes = (uint16_t)(pidx - oib);
 
 	return (error);
 }
 
 /*
  * Checks if opcode is referencing table of appropriate type.
  * Adds reference count for found table if true.
  * Rewrites user-supplied opcode values with kernel ones.
  *
  * Returns 0 on success and appropriate error code otherwise.
  */
 static int
 rewrite_rule_uidx(struct ip_fw_chain *chain, struct rule_check_info *ci)
 {
 	int error;
 	ipfw_insn *cmd;
 	uint8_t type;
 	struct obj_idx *p, *pidx_first, *pidx_last;
 	struct tid_info ti;
 
 	/*
 	 * Prepare an array for storing opcode indices.
 	 * Use stack allocation by default.
 	 */
 	if (ci->object_opcodes <= (sizeof(ci->obuf)/sizeof(ci->obuf[0]))) {
 		/* Stack */
 		pidx_first = ci->obuf;
 	} else
 		pidx_first = malloc(
 		    ci->object_opcodes * sizeof(struct obj_idx),
 		    M_IPFW, M_WAITOK | M_ZERO);
 
 	error = 0;
 	type = 0;
 	memset(&ti, 0, sizeof(ti));
 
 	/* Use set rule is assigned to. */
 	ti.set = ci->krule->set;
 	if (ci->ctlv != NULL) {
 		ti.tlvs = (void *)(ci->ctlv + 1);
 		ti.tlen = ci->ctlv->head.length - sizeof(ipfw_obj_ctlv);
 	}
 
 	/* Reference all used tables and other objects */
 	error = ref_rule_objects(chain, ci->krule, ci, pidx_first, &ti);
 	if (error != 0)
 		goto free;
 	/*
 	 * Note that ref_rule_objects() might have updated ci->object_opcodes
 	 * to reflect actual number of object opcodes.
 	 */
 
 	/* Perform rewrite of remaining opcodes */
 	p = pidx_first;
 	pidx_last = pidx_first + ci->object_opcodes;
 	for (p = pidx_first; p < pidx_last; p++) {
 		cmd = ci->krule->cmd + p->off;
 		update_opcode_kidx(cmd, p->kidx);
 	}
 
 free:
 	if (pidx_first != ci->obuf)
 		free(pidx_first, M_IPFW);
 
 	return (error);
 }
 
 /*
  * Adds one or more rules to ipfw @chain.
  * Data layout (version 0)(current):
  * Request:
  * [
  *   ip_fw3_opheader
  *   [ ipfw_obj_ctlv(IPFW_TLV_TBL_LIST) ipfw_obj_ntlv x N ] (optional *1)
  *   [ ipfw_obj_ctlv(IPFW_TLV_RULE_LIST) ip_fw x N ] (*2) (*3)
  * ]
  * Reply:
  * [
  *   ip_fw3_opheader
  *   [ ipfw_obj_ctlv(IPFW_TLV_TBL_LIST) ipfw_obj_ntlv x N ] (optional)
  *   [ ipfw_obj_ctlv(IPFW_TLV_RULE_LIST) ip_fw x N ]
  * ]
  *
  * Rules in reply are modified to store their actual ruleset number.
  *
  * (*1) TLVs inside IPFW_TLV_TBL_LIST needs to be sorted ascending
  * according to their idx field and there has to be no duplicates.
  * (*2) Numbered rules inside IPFW_TLV_RULE_LIST needs to be sorted ascending.
  * (*3) Each ip_fw structure needs to be aligned to u64 boundary.
  *
  * Returns 0 on success.
  */
 static int
 add_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
     struct sockopt_data *sd)
 {
 	ipfw_obj_ctlv *ctlv, *rtlv, *tstate;
 	ipfw_obj_ntlv *ntlv;
 	int clen, error, idx;
 	uint32_t count, read;
 	struct ip_fw_rule *r;
 	struct rule_check_info rci, *ci, *cbuf;
 	int i, rsize;
 
 	/* Claim the whole request; the first TLV follows the op3 header */
 	op3 = (ip_fw3_opheader *)ipfw_get_sopt_space(sd, sd->valsize);
 	ctlv = (ipfw_obj_ctlv *)(op3 + 1);
 
 	/* @read tracks how many request bytes have been validated */
 	read = sizeof(ip_fw3_opheader);
 	rtlv = NULL;
 	tstate = NULL;
 	cbuf = NULL;
 	memset(&rci, 0, sizeof(struct rule_check_info));
 
 	if (read + sizeof(*ctlv) > sd->valsize)
 		return (EINVAL);
 
 	if (ctlv->head.type == IPFW_TLV_TBLNAME_LIST) {
 		clen = ctlv->head.length;
 		/*
 		 * Check size and alignment.  The TLV starts @read bytes
 		 * into the request, so that offset has to be counted
 		 * too; otherwise the name checks below could walk past
 		 * the end of the buffer.
 		 */
 		if (clen + read > sd->valsize || clen < sizeof(*ctlv))
 			return (EINVAL);
 		if ((clen % sizeof(uint64_t)) != 0)
 			return (EINVAL);
 
 		/*
 		 * Some table names or other named objects.
 		 * Check for validness.
 		 */
 		count = (ctlv->head.length - sizeof(*ctlv)) / sizeof(*ntlv);
 		if (ctlv->count != count || ctlv->objsize != sizeof(*ntlv))
 			return (EINVAL);
 
 		/*
 		 * Check each TLV.
 		 * Ensure TLVs are sorted ascending and
 		 * there are no duplicates.
 		 */
 		idx = -1;
 		ntlv = (ipfw_obj_ntlv *)(ctlv + 1);
 		while (count > 0) {
 			if (ntlv->head.length != sizeof(ipfw_obj_ntlv))
 				return (EINVAL);
 
 			error = ipfw_check_object_name_generic(ntlv->name);
 			if (error != 0)
 				return (error);
 
 			if (ntlv->idx <= idx)
 				return (EINVAL);
 
 			idx = ntlv->idx;
 			count--;
 			ntlv++;
 		}
 
 		/* Remember the name list and step over it */
 		tstate = ctlv;
 		read += ctlv->head.length;
 		ctlv = (ipfw_obj_ctlv *)((caddr_t)ctlv + ctlv->head.length);
 	}
 
 	if (read + sizeof(*ctlv) > sd->valsize)
 		return (EINVAL);
 
 	if (ctlv->head.type == IPFW_TLV_RULE_LIST) {
 		clen = ctlv->head.length;
 		if (clen + read > sd->valsize || clen < sizeof(*ctlv))
 			return (EINVAL);
 		if ((clen % sizeof(uint64_t)) != 0)
 			return (EINVAL);
 
 		/*
 		 * TODO: Permit adding multiple rules at once
 		 */
 		if (ctlv->count != 1)
 			return (ENOTSUP);
 
 		clen -= sizeof(*ctlv);
 
 		if (ctlv->count > clen / sizeof(struct ip_fw_rule))
 			return (EINVAL);
 
 		/* Allocate state for each rule or use stack */
 		if (ctlv->count == 1) {
 			memset(&rci, 0, sizeof(struct rule_check_info));
 			cbuf = &rci;
 		} else
 			cbuf = malloc(ctlv->count * sizeof(*ci), M_TEMP,
 			    M_WAITOK | M_ZERO);
 		ci = cbuf;
 
 		/*
 		 * Check each rule for validness.
 		 * Ensure numbered rules are sorted ascending
 		 * and properly aligned
 		 */
 		idx = 0;
 		r = (struct ip_fw_rule *)(ctlv + 1);
 		count = 0;
 		error = 0;
 		while (clen > 0) {
 			rsize = roundup2(RULESIZE(r), sizeof(uint64_t));
 			if (rsize > clen || ctlv->count <= count) {
 				error = EINVAL;
 				break;
 			}
 
 			ci->ctlv = tstate;
 			error = check_ipfw_rule1(r, rsize, ci);
 			if (error != 0)
 				break;
 
 			/* Check sorting */
 			if (r->rulenum != 0 && r->rulenum < idx) {
 				printf("rulenum %d idx %d\n", r->rulenum, idx);
 				error = EINVAL;
 				break;
 			}
 			idx = r->rulenum;
 
 			ci->urule = (caddr_t)r;
 
 			rsize = roundup2(rsize, sizeof(uint64_t));
 			clen -= rsize;
 			r = (struct ip_fw_rule *)((caddr_t)r + rsize);
 			count++;
 			ci++;
 		}
 
 		if (ctlv->count != count || error != 0) {
 			if (cbuf != &rci)
 				free(cbuf, M_TEMP);
 			return (EINVAL);
 		}
 
 		/* Remember the rule list and step over it */
 		rtlv = ctlv;
 		read += ctlv->head.length;
 		ctlv = (ipfw_obj_ctlv *)((caddr_t)ctlv + ctlv->head.length);
 	}
 
 	/* The request must be fully consumed and contain a rule list */
 	if (read != sd->valsize || rtlv == NULL || rtlv->count == 0) {
 		if (cbuf != NULL && cbuf != &rci)
 			free(cbuf, M_TEMP);
 		return (EINVAL);
 	}
 
 	/*
 	 * Passed rules seems to be valid.
 	 * Allocate storage and try to add them to chain.
 	 */
 	for (i = 0, ci = cbuf; i < rtlv->count; i++, ci++) {
 		clen = RULEKSIZE1((struct ip_fw_rule *)ci->urule);
 		ci->krule = ipfw_alloc_rule(chain, clen);
 		import_rule1(ci);
 	}
 
 	if ((error = commit_rules(chain, cbuf, rtlv->count)) != 0) {
 		/* Free allocate krules */
 		for (i = 0, ci = cbuf; i < rtlv->count; i++, ci++)
 			free_rule(ci->krule);
 	}
 
 	if (cbuf != NULL && cbuf != &rci)
 		free(cbuf, M_TEMP);
 
 	return (error);
 }
 
 /*
  * Lists all sopts currently registered.
  * Data layout (v0)(current):
  * Request: [ ipfw_obj_lheader ], size = ipfw_obj_lheader.size
  * Reply: [ ipfw_obj_lheader ipfw_sopt_info x N ]
  *
  * Returns 0 on success
  */
 static int
 dump_soptcodes(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
     struct sockopt_data *sd)
 {
 	struct _ipfw_obj_lheader *olh;
 	ipfw_sopt_info *i;
 	struct ipfw_sopt_handler *sh;
 	uint32_t count, n, size;
 
 	olh = (struct _ipfw_obj_lheader *)ipfw_get_sopt_header(sd,sizeof(*olh));
 	if (olh == NULL)
 		return (EINVAL);
 	if (sd->valsize < olh->size)
 		return (EINVAL);
 
 	CTL3_LOCK();
 	count = ctl3_hsize;
 	size = count * sizeof(ipfw_sopt_info) + sizeof(ipfw_obj_lheader);
 
 	/* Fill in header regardless of buffer size */
 	olh->count = count;
 	olh->objsize = sizeof(ipfw_sopt_info);
 
 	if (size > olh->size) {
 		/* Tell the caller how much space is required */
 		olh->size = size;
 		CTL3_UNLOCK();
 		return (ENOMEM);
 	}
 	olh->size = size;
 
 	/*
 	 * Export every registered handler.  The array is zero-based and
 	 * holds exactly @count entries, so walk indexes 0 .. count - 1.
 	 */
 	for (n = 0; n < count; n++) {
 		i = (ipfw_sopt_info *)ipfw_get_sopt_space(sd, sizeof(*i));
 		KASSERT(i != NULL, ("previously checked buffer is not enough"));
 		sh = &ctl3_handlers[n];
 		i->opcode = sh->opcode;
 		i->version = sh->version;
 		i->refcnt = sh->refcnt;
 	}
 	CTL3_UNLOCK();
 
 	return (0);
 }
 
 /*
  * Compares two opcodes.
  * Used both in qsort() and bsearch().
  *
  * Returns 0 if match is found.
  */
 static int
 compare_opcodes(const void *_a, const void *_b)
 {
 	const struct opcode_obj_rewrite *lhs, *rhs;
 
 	lhs = (const struct opcode_obj_rewrite *)_a;
 	rhs = (const struct opcode_obj_rewrite *)_b;
 
 	/* Order solely by opcode; equal opcodes compare as a match */
 	if (lhs->opcode != rhs->opcode)
 		return ((lhs->opcode < rhs->opcode) ? -1 : 1);
 
 	return (0);
 }
 
 /*
  * XXX: Rewrite bsearch()
  */
 static int
 find_op_rw_range(uint16_t op, struct opcode_obj_rewrite **plo,
     struct opcode_obj_rewrite **phi)
 {
 	struct opcode_obj_rewrite *ctl3_max, *lo, *hi, h, *rw;
 
 	memset(&h, 0, sizeof(h));
 	h.opcode = op;
 
 	rw = (struct opcode_obj_rewrite *)bsearch(&h, ctl3_rewriters,
 	    ctl3_rsize, sizeof(h), compare_opcodes);
 	if (rw == NULL)
 		return (1);
 
 	/* Find the first element matching the same opcode */
 	lo = rw;
 	for ( ; lo > ctl3_rewriters && (lo - 1)->opcode == op; lo--)
 		;
 
 	/* Find the last element matching the same opcode */
 	hi = rw;
 	ctl3_max = ctl3_rewriters + ctl3_rsize;
 	for ( ; (hi + 1) < ctl3_max && (hi + 1)->opcode == op; hi++)
 		;
 
 	*plo = lo;
 	*phi = hi;
 
 	return (0);
 }
 
 /*
  * Finds opcode object rewriter based on @code.
  *
  * Returns pointer to handler or NULL.
  */
 static struct opcode_obj_rewrite *
 find_op_rw(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype)
 {
 	struct opcode_obj_rewrite *rw, *lo, *hi;
 	uint16_t uidx;
 	uint8_t subtype;
 
 	if (find_op_rw_range(cmd->opcode, &lo, &hi) != 0)
 		return (NULL);
 
 	/*
 	 * Several rewriters may share an opcode; the first one whose
 	 * classifier accepts @cmd wins.
 	 */
 	for (rw = lo; rw <= hi; rw++) {
 		if (rw->classifier(cmd, &uidx, &subtype) != 0)
 			continue;
 
 		/* Both output arguments are optional */
 		if (puidx != NULL)
 			*puidx = uidx;
 		if (ptype != NULL)
 			*ptype = subtype;
 		return (rw);
 	}
 
 	return (NULL);
 }
 /*
  * Checks whether @cmd is handled by some object rewriter and, if so,
  * stores the index extracted by its classifier in @puidx.
  *
  * Returns 0 if a rewriter was found, 1 otherwise.
  */
 int
 classify_opcode_kidx(ipfw_insn *cmd, uint16_t *puidx)
 {
 
 	return ((find_op_rw(cmd, puidx, NULL) == NULL) ? 1 : 0);
 }
 
 /*
  * Stores index @idx into @cmd using the update callback of the
  * rewriter that handles this opcode.  A rewriter has to exist.
  */
 void
 update_opcode_kidx(ipfw_insn *cmd, uint16_t idx)
 {
 	struct opcode_obj_rewrite *handler;
 
 	handler = find_op_rw(cmd, NULL, NULL);
 	KASSERT(handler != NULL,
 	    ("No handler to update opcode %d", cmd->opcode));
 	handler->update(cmd, idx);
 }
 
 /*
  * Resets the global opcode rewriter array to the empty state.
  */
 void
 ipfw_init_obj_rewriter(void)
 {
 
 	ctl3_rewriters = NULL;
 	ctl3_rsize = 0;
 }
 
 /*
  * Frees the global opcode rewriter array, if any, and resets it to
  * the empty state.
  */
 void
 ipfw_destroy_obj_rewriter(void)
 {
 
 	if (ctl3_rewriters != NULL)
 		free(ctl3_rewriters, M_IPFW);
 	ctl3_rewriters = NULL;
 	ctl3_rsize = 0;
 }
 
 /*
  * Adds one or more opcode object rewrite handlers to the global array.
  * Function may sleep.
  */
 void
 ipfw_add_obj_rewriter(struct opcode_obj_rewrite *rw, size_t count)
 {
 	struct opcode_obj_rewrite *merged;
 	size_t total;
 
 	CTL3_LOCK();
 
 	/*
 	 * The allocation may sleep, so it is done with the lock dropped.
 	 * Once the lock is re-taken, make sure the buffer is still big
 	 * enough for the current array; if not, try again.
 	 */
 	for (;;) {
 		total = ctl3_rsize + count;
 		CTL3_UNLOCK();
 		merged = malloc(sizeof(*rw) * total, M_IPFW,
 		    M_WAITOK | M_ZERO);
 		CTL3_LOCK();
 		if (ctl3_rsize + count > total) {
 			/* Array grew in the meantime */
 			free(merged, M_IPFW);
 			continue;
 		}
 		break;
 	}
 
 	/* Existing entries first, new ones after, then restore order */
 	total = ctl3_rsize + count;
 	memcpy(merged, ctl3_rewriters, ctl3_rsize * sizeof(*rw));
 	memcpy(&merged[ctl3_rsize], rw, count * sizeof(*rw));
 	qsort(merged, total, sizeof(*rw), compare_opcodes);
 
 	/* Publish the merged array and release the previous one */
 	if (ctl3_rewriters != NULL)
 		free(ctl3_rewriters, M_IPFW);
 	ctl3_rewriters = merged;
 	ctl3_rsize = total;
 
 	CTL3_UNLOCK();
 }
 
 /*
  * Removes one or more object rewrite handlers from the global array.
  */
 int
 ipfw_del_obj_rewriter(struct opcode_obj_rewrite *rw, size_t count)
 {
 	struct opcode_obj_rewrite *ctl3_max, *ktmp, *lo, *hi;
 	size_t sz;
 	int i;
 
 	CTL3_LOCK();
 
 	for (i = 0; i < count; i++) {
 		if (find_op_rw_range(rw[i].opcode, &lo, &hi) != 0)
 			continue;
 
 		/* Entries sharing an opcode are told apart by classifier */
 		ktmp = lo;
 		while (ktmp <= hi && ktmp->classifier != rw[i].classifier)
 			ktmp++;
 		if (ktmp > hi)
 			continue;
 
 		/* Close the gap by shifting the tail down one slot */
 		ctl3_max = ctl3_rewriters + ctl3_rsize;
 		sz = (ctl3_max - (ktmp + 1)) * sizeof(*ktmp);
 		memmove(ktmp, ktmp + 1, sz);
 		ctl3_rsize--;
 	}
 
 	/* Release the array once the last rewriter is gone */
 	if (ctl3_rsize == 0) {
 		if (ctl3_rewriters != NULL)
 			free(ctl3_rewriters, M_IPFW);
 		ctl3_rewriters = NULL;
 	}
 
 	CTL3_UNLOCK();
 
 	return (0);
 }
 
 /*
  * Object hash iteration callback: exports @no as an ipfw_obj_ntlv
  * into the sockopt buffer passed via @arg.
  *
  * Returns 0 on success or ENOMEM if the buffer has no room left.
  */
 static int
 export_objhash_ntlv_internal(struct namedobj_instance *ni,
     struct named_object *no, void *arg)
 {
 	ipfw_obj_ntlv *ntlv;
 
 	ntlv = (ipfw_obj_ntlv *)ipfw_get_sopt_space(
 	    (struct sockopt_data *)arg, sizeof(*ntlv));
 	if (ntlv == NULL)
 		return (ENOMEM);
 
 	ipfw_export_obj_ntlv(no, ntlv);
 	return (0);
 }
 
 /*
  * Lists all service objects.
  * Data layout (v0)(current):
  * Request: [ ipfw_obj_lheader ] size = ipfw_obj_lheader.size
  * Reply: [ ipfw_obj_lheader [ ipfw_obj_ntlv x N ] (optional) ]
  * Returns 0 on success
  */
 static int
 dump_srvobjects(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
     struct sockopt_data *sd)
 {
 	ipfw_obj_lheader *hdr;
 	int count, error;
 
 	hdr = (ipfw_obj_lheader *)ipfw_get_sopt_header(sd, sizeof(*hdr));
 	if (hdr == NULL)
 		return (EINVAL);
 
 	IPFW_UH_RLOCK(chain);
 
 	/* The required size is reported even if the buffer is too small */
 	count = ipfw_objhash_count(CHAIN_TO_SRV(chain));
 	hdr->size = sizeof(ipfw_obj_lheader) + count * sizeof(ipfw_obj_ntlv);
 
 	if (sd->valsize >= hdr->size) {
 		hdr->count = count;
 		hdr->objsize = sizeof(ipfw_obj_ntlv);
 		/* One ntlv per service object follows the header */
 		if (count > 0)
 			ipfw_objhash_foreach(CHAIN_TO_SRV(chain),
 			    export_objhash_ntlv_internal, sd);
 		error = 0;
 	} else
 		error = ENOMEM;
 
 	IPFW_UH_RUNLOCK(chain);
 	return (error);
 }
 
 /*
  * Compares two sopt handlers (code, version and handler ptr).
  * Used both as qsort() and bsearch().
  * Does not compare handler for latter case.
  *
  * Returns 0 if match is found.
  */
 static int
 compare_sh(const void *_a, const void *_b)
 {
 	const struct ipfw_sopt_handler *lhs, *rhs;
 
 	lhs = (const struct ipfw_sopt_handler *)_a;
 	rhs = (const struct ipfw_sopt_handler *)_b;
 
 	/* Primary key: opcode */
 	if (lhs->opcode != rhs->opcode)
 		return ((lhs->opcode < rhs->opcode) ? -1 : 1);
 
 	/* Secondary key: version */
 	if (lhs->version != rhs->version)
 		return ((lhs->version < rhs->version) ? -1 : 1);
 
 	/* A key without a handler (lookup case) matches any handler */
 	if (lhs->handler == NULL)
 		return (0);
 
 	/* Last key: handler address */
 	if ((uintptr_t)lhs->handler != (uintptr_t)rhs->handler)
 		return (((uintptr_t)lhs->handler <
 		    (uintptr_t)rhs->handler) ? -1 : 1);
 
 	return (0);
 }
 
 /*
  * Finds sopt handler based on @code and @version.
  *
  * Returns pointer to handler or NULL.
  */
 static struct ipfw_sopt_handler *
 find_sh(uint16_t code, uint8_t version, sopt_handler_f *handler)
 {
 	struct ipfw_sopt_handler key;
 
 	/* Build a search key; a NULL @handler matches any handler */
 	memset(&key, 0, sizeof(key));
 	key.opcode = code;
 	key.version = version;
 	key.handler = handler;
 
 	return ((struct ipfw_sopt_handler *)bsearch(&key, ctl3_handlers,
 	    ctl3_hsize, sizeof(key), compare_sh));
 }
 
 /*
  * Looks up the handler for @opcode/@version, bumps its reference
  * count (and the global one) and copies the entry into @psh.
  *
  * Returns 0 on success or EINVAL if no such handler is registered.
  */
 static int
 find_ref_sh(uint16_t opcode, uint8_t version, struct ipfw_sopt_handler *psh)
 {
 	struct ipfw_sopt_handler *found;
 
 	CTL3_LOCK();
 	found = find_sh(opcode, version, NULL);
 	if (found != NULL) {
 		found->refcnt++;
 		ctl3_refct++;
 		/* The caller works on a private copy of the entry */
 		*psh = *found;
 		CTL3_UNLOCK();
 
 		return (0);
 	}
 	CTL3_UNLOCK();
 
 	printf("ipfw: ipfw_ctl3 invalid option %d""v""%d\n",
 	    opcode, version);
 	return (EINVAL);
 }
 
 /*
  * Drops the reference taken by find_ref_sh() on the handler that
  * @psh was copied from.
  */
 static void
 find_unref_sh(struct ipfw_sopt_handler *psh)
 {
 	struct ipfw_sopt_handler *found;
 
 	CTL3_LOCK();
 	/* Look the live entry up again by opcode and version */
 	found = find_sh(psh->opcode, psh->version, NULL);
 	KASSERT(found != NULL, ("ctl3 handler disappeared"));
 	found->refcnt--;
 	ctl3_refct--;
 	CTL3_UNLOCK();
 }
 
 /*
  * Initializes the ctl3 lock and registers the sockopt handlers listed
  * in scodes.
  */
 void
 ipfw_init_sopt_handler(void)
 {
 
 	CTL3_LOCK_INIT();
 	IPFW_ADD_SOPT_HANDLER(1, scodes);
 }
 
 /*
  * Unregisters the sockopt handlers listed in scodes and destroys the
  * ctl3 lock.
  */
 void
 ipfw_destroy_sopt_handler(void)
 {
 
 	IPFW_DEL_SOPT_HANDLER(1, scodes);
 	CTL3_LOCK_DESTROY();
 }
 
 /*
  * Adds one or more sockopt handlers to the global array.
  * Function may sleep.
  */
 void
 ipfw_add_sopt_handler(struct ipfw_sopt_handler *sh, size_t count)
 {
 	struct ipfw_sopt_handler *merged;
 	size_t total;
 
 	CTL3_LOCK();
 
 	/*
 	 * The allocation may sleep, so it is done with the lock dropped.
 	 * Once the lock is re-taken, make sure the buffer is still big
 	 * enough for the current array; if not, try again.
 	 */
 	for (;;) {
 		total = ctl3_hsize + count;
 		CTL3_UNLOCK();
 		merged = malloc(sizeof(*sh) * total, M_IPFW,
 		    M_WAITOK | M_ZERO);
 		CTL3_LOCK();
 		if (ctl3_hsize + count > total) {
 			/* Array grew in the meantime */
 			free(merged, M_IPFW);
 			continue;
 		}
 		break;
 	}
 
 	/* Existing entries first, new ones after, then restore order */
 	total = ctl3_hsize + count;
 	memcpy(merged, ctl3_handlers, ctl3_hsize * sizeof(*sh));
 	memcpy(&merged[ctl3_hsize], sh, count * sizeof(*sh));
 	qsort(merged, total, sizeof(*sh), compare_sh);
 
 	/* Publish the merged array and release the previous one */
 	if (ctl3_handlers != NULL)
 		free(ctl3_handlers, M_IPFW);
 	ctl3_handlers = merged;
 	ctl3_hsize = total;
 	ctl3_gencnt++;
 
 	CTL3_UNLOCK();
 }
 
 /*
  * Removes one or more sockopt handlers from the global array.
  */
 int
 ipfw_del_sopt_handler(struct ipfw_sopt_handler *sh, size_t count)
 {
 	struct ipfw_sopt_handler *h;
 	size_t sz;
 	int i;
 
 	CTL3_LOCK();
 
 	for (i = 0; i < count; i++) {
 		/* Match on opcode, version and handler pointer */
 		h = find_sh(sh[i].opcode, sh[i].version, sh[i].handler);
 		if (h != NULL) {
 			/* Close the gap by shifting the tail down one slot */
 			sz = (ctl3_handlers + ctl3_hsize - (h + 1)) *
 			    sizeof(*h);
 			memmove(h, h + 1, sz);
 			ctl3_hsize--;
 		}
 	}
 
 	/* Release the array once the last handler is gone */
 	if (ctl3_hsize == 0) {
 		if (ctl3_handlers != NULL)
 			free(ctl3_handlers, M_IPFW);
 		ctl3_handlers = NULL;
 	}
 
 	ctl3_gencnt++;
 
 	CTL3_UNLOCK();
 
 	return (0);
 }
 
 /*
  * Writes data accumulated in @sd to sockopt buffer.
  * Zeroes internal @sd buffer.
  */
 static int
 ipfw_flush_sopt_data(struct sockopt_data *sd)
 {
 	struct sockopt *sopt;
 	int error;
 	size_t sz;
 
 	sz = sd->koff;
 	if (sz == 0)
 		return (0);
 
 	sopt = sd->sopt;
 
 	if (sopt->sopt_dir == SOPT_GET) {
 		error = copyout(sd->kbuf, sopt->sopt_val, sz);
 		if (error != 0)
 			return (error);
 	}
 
 	memset(sd->kbuf, 0, sd->ksize);
 	sd->ktotal += sz;
 	sd->koff = 0;
 	if (sd->ktotal + sd->ksize < sd->valsize)
 		sd->kavail = sd->ksize;
 	else
 		sd->kavail = sd->valsize - sd->ktotal;
 
 	/* Update sopt buffer data */
 	sopt->sopt_valsize = sd->ktotal;
 	sopt->sopt_val = sd->sopt_val + sd->ktotal;
 
 	return (0);
 }
 
 /*
  * Ensures that @sd buffer has contiguous @needed number of
  * bytes.
  *
  * Returns pointer to requested space or NULL.
  */
 caddr_t
 ipfw_get_sopt_space(struct sockopt_data *sd, size_t needed)
 {
 	caddr_t addr;
 
 	/*
 	 * Not enough room: flush what has been accumulated and check
 	 * again.  Give up if the flush fails or the room is still
 	 * insufficient.
 	 */
 	if (sd->kavail < needed &&
 	    (ipfw_flush_sopt_data(sd) != 0 || sd->kavail < needed))
 		return (NULL);
 
 	/* Hand out the next @needed bytes of the kernel buffer */
 	addr = sd->kbuf + sd->koff;
 	sd->kavail -= needed;
 	sd->koff += needed;
 
 	return (addr);
 }
 
 /*
  * Requests @needed contiguous bytes from @sd buffer.
  * Function is used to notify subsystem that we are
  * interested in first @needed bytes (request header)
  * and the rest buffer can be safely zeroed.
  *
  * Returns pointer to requested space or NULL.
  */
 caddr_t
 ipfw_get_sopt_header(struct sockopt_data *sd, size_t needed)
 {
 	caddr_t addr;
 
 	addr = ipfw_get_sopt_space(sd, needed);
 
 	/* Zero whatever follows the header in the kernel buffer */
 	if (addr != NULL && sd->kavail > 0)
 		memset(sd->kbuf + sd->koff, 0, sd->kavail);
 
 	return (addr);
 }
 
 /*
  * New sockopt handler.
  */
 int
 ipfw_ctl3(struct sockopt *sopt)
 {
 	int error, locked;
 	size_t size, valsize;
 	struct ip_fw_chain *chain;
 	char xbuf[256];
 	struct sockopt_data sdata;
 	struct ipfw_sopt_handler h;
 	ip_fw3_opheader *op3 = NULL;
 
 	error = priv_check(sopt->sopt_td, PRIV_NETINET_IPFW);
 	if (error != 0)
 		return (error);
 
 	if (sopt->sopt_name != IP_FW3)
 		return (ipfw_ctl(sopt));
 
 	chain = &V_layer3_chain;
 	error = 0;
 
 	/* Save original valsize before it is altered via sooptcopyin() */
 	valsize = sopt->sopt_valsize;
 	memset(&sdata, 0, sizeof(sdata));
 	/* Read op3 header first to determine actual operation */
 	op3 = (ip_fw3_opheader *)xbuf;
 	error = sooptcopyin(sopt, op3, sizeof(*op3), sizeof(*op3));
 	if (error != 0)
 		return (error);
 	sopt->sopt_valsize = valsize;
 
 	/*
 	 * Find and reference command.
 	 */
 	error = find_ref_sh(op3->opcode, op3->version, &h);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Disallow modifications in really-really secure mode, but still allow
 	 * the logging counters to be reset.
 	 */
 	if ((h.dir & HDIR_SET) != 0 && h.opcode != IP_FW_XRESETLOG) {
 		error = securelevel_ge(sopt->sopt_td->td_ucred, 3);
 		if (error != 0) {
 			find_unref_sh(&h);
 			return (error);
 		}
 	}
 
 	/*
 	 * Fill in sockopt_data structure that may be useful for
 	 * IP_FW3 get requests.
 	 */
 	locked = 0;
 	if (valsize <= sizeof(xbuf)) {
 		/* use on-stack buffer */
 		sdata.kbuf = xbuf;
 		sdata.ksize = sizeof(xbuf);
 		sdata.kavail = valsize;
 	} else {
 
 		/*
 		 * Determine opcode type/buffer size:
 		 * allocate sliding-window buf for data export or
 		 * contiguous buffer for special ops.
 		 */
 		if ((h.dir & HDIR_SET) != 0) {
 			/* Set request. Allocate contigous buffer. */
 			if (valsize > CTL3_LARGEBUF) {
 				find_unref_sh(&h);
 				return (EFBIG);
 			}
 
 			size = valsize;
 		} else {
 			/* Get request. Allocate sliding window buffer */
 			size = (valsize<CTL3_SMALLBUF) ? valsize:CTL3_SMALLBUF;
 
 			if (size < valsize) {
 				/* We have to wire user buffer */
 				error = vslock(sopt->sopt_val, valsize);
 				if (error != 0)
 					return (error);
 				locked = 1;
 			}
 		}
 
 		sdata.kbuf = malloc(size, M_TEMP, M_WAITOK | M_ZERO);
 		sdata.ksize = size;
 		sdata.kavail = size;
 	}
 
 	sdata.sopt = sopt;
 	sdata.sopt_val = sopt->sopt_val;
 	sdata.valsize = valsize;
 
 	/*
 	 * Copy either all request (if valsize < bsize_max)
 	 * or first bsize_max bytes to guarantee most consumers
 	 * that all necessary data has been copied).
 	 * Anyway, copy not less than sizeof(ip_fw3_opheader).
 	 */
 	if ((error = sooptcopyin(sopt, sdata.kbuf, sdata.ksize,
 	    sizeof(ip_fw3_opheader))) != 0)
 		return (error);
 	op3 = (ip_fw3_opheader *)sdata.kbuf;
 
 	/* Finally, run handler */
 	error = h.handler(chain, op3, &sdata);
 	find_unref_sh(&h);
 
 	/* Flush state and free buffers */
 	if (error == 0)
 		error = ipfw_flush_sopt_data(&sdata);
 	else
 		ipfw_flush_sopt_data(&sdata);
 
 	if (locked != 0)
 		vsunlock(sdata.sopt_val, valsize);
 
 	/* Restore original pointer and set number of bytes written */
 	sopt->sopt_val = sdata.sopt_val;
 	sopt->sopt_valsize = sdata.ktotal;
 	if (sdata.kbuf != xbuf)
 		free(sdata.kbuf, M_TEMP);
 
 	return (error);
 }
 
 /**
  * {set|get}sockopt parser.
  *
  * Dispatches on sopt->sopt_name and performs the requested operation on
  * V_layer3_chain.  IP_FW_ADD, and every SOPT_SET request other than
  * IP_FW_RESETLOG, is first checked with securelevel_ge(..., 3).
  *
  * Returns 0 on success or an errno value on failure.
  */
 int
 ipfw_ctl(struct sockopt *sopt)
 {
 #define	RULE_MAXSIZE	(512*sizeof(u_int32_t))
 	int error;
 	size_t size, valsize;
 	struct ip_fw *buf;
 	struct ip_fw_rule0 *rule;
 	struct ip_fw_chain *chain;
 	u_int32_t rulenum[2];
 	uint32_t opt;
 	struct rule_check_info ci;
 	IPFW_RLOCK_TRACKER;
 
 	chain = &V_layer3_chain;
 	error = 0;
 
 	/* Save original valsize before it is altered via sooptcopyin() */
 	valsize = sopt->sopt_valsize;
 	opt = sopt->sopt_name;
 
 	/*
 	 * Disallow modifications in really-really secure mode, but still allow
 	 * the logging counters to be reset.
 	 */
 	if (opt == IP_FW_ADD ||
 	    (sopt->sopt_dir == SOPT_SET && opt != IP_FW_RESETLOG)) {
 		error = securelevel_ge(sopt->sopt_td->td_ucred, 3);
 		if (error != 0)
 			return (error);
 	}
 
 	switch (opt) {
 	case IP_FW_GET:
 		/*
 		 * pass up a copy of the current rules. Static rules
 		 * come first (the last of which has number IPFW_DEFAULT_RULE),
 		 * followed by a possibly empty list of dynamic rule.
 		 * The last dynamic rule has NULL in the "next" field.
 		 *
 		 * Note that the calculated size is used to bound the
 		 * amount of data returned to the user.  The rule set may
 		 * change between calculating the size and returning the
 		 * data in which case we'll just return what fits.
 		 */
 		for (;;) {
 			int len = 0, want;
 
 			size = chain->static_len;
 			size += ipfw_dyn_len();
 			if (size >= sopt->sopt_valsize)
 				break;
 			buf = malloc(size, M_TEMP, M_WAITOK | M_ZERO);
 			IPFW_UH_RLOCK(chain);
 			/* check again how much space we need */
 			want = chain->static_len + ipfw_dyn_len();
 			if (size >= want)
 				len = ipfw_getrules(chain, buf, size);
 			IPFW_UH_RUNLOCK(chain);
 			if (size >= want)
 				error = sooptcopyout(sopt, buf, len);
 			free(buf, M_TEMP);
 			/* Retry with a fresh size if the rule set grew meanwhile. */
 			if (size >= want)
 				break;
 		}
 		break;
 
 	case IP_FW_FLUSH:
 		/* locking is done within del_entry() */
 		error = del_entry(chain, 0); /* special case, rule=0, cmd=0 means all */
 		break;
 
 	case IP_FW_ADD:
 		rule = malloc(RULE_MAXSIZE, M_TEMP, M_WAITOK);
 		error = sooptcopyin(sopt, rule, RULE_MAXSIZE,
 			sizeof(struct ip_fw7) );
 		if (error != 0) {
 			/*
 			 * The buffer was allocated without M_ZERO, so after a
 			 * failed copy-in its contents must not be inspected
 			 * to guess the rule format (and the copy-in error
 			 * must not be overwritten by the conversion result).
 			 */
 			free(rule, M_TEMP);
 			break;
 		}
 
 		memset(&ci, 0, sizeof(struct rule_check_info));
 
 		/*
 		 * If the size of commands equals RULESIZE7 then we assume
 		 * a FreeBSD7.2 binary is talking to us (set is7=1).
 		 * is7 is persistent so the next 'ipfw list' command
 		 * will use this format.
 		 * NOTE: If wrong version is guessed (this can happen if
 		 *       the first ipfw command is 'ipfw [pipe] list')
 		 *       the ipfw binary may crash or loop infinitely...
 		 */
 		size = sopt->sopt_valsize;
 		if (size == RULESIZE7(rule)) {
 		    is7 = 1;
 		    error = convert_rule_to_8(rule);
 		    if (error) {
 			free(rule, M_TEMP);
 			return error;
 		    }
 		    size = RULESIZE(rule);
 		} else
 		    is7 = 0;
 		if (error == 0)
 			error = check_ipfw_rule0(rule, size, &ci);
 		if (error == 0) {
 			/* locking is done within add_rule() */
 			struct ip_fw *krule;
 			krule = ipfw_alloc_rule(chain, RULEKSIZE0(rule));
 			ci.urule = (caddr_t)rule;
 			ci.krule = krule;
 			import_rule0(&ci);
 			error = commit_rules(chain, &ci, 1);
 			if (error != 0)
 				free_rule(ci.krule);
 			else if (sopt->sopt_dir == SOPT_GET) {
 				/* Hand the rule back in the caller's format. */
 				if (is7) {
 					error = convert_rule_to_7(rule);
 					size = RULESIZE7(rule);
 					if (error) {
 						free(rule, M_TEMP);
 						return error;
 					}
 				}
 				error = sooptcopyout(sopt, rule, size);
 			}
 		}
 		free(rule, M_TEMP);
 		break;
 
 	case IP_FW_DEL:
 		/*
 		 * IP_FW_DEL is used for deleting single rules or sets,
 		 * and (ab)used to atomically manipulate sets. Argument size
 		 * is used to distinguish between the two:
 		 *    sizeof(u_int32_t)
 		 *	delete single rule or set of rules,
 		 *	or reassign rules (or sets) to a different set.
 		 *    2*sizeof(u_int32_t)
 		 *	atomic disable/enable sets.
 		 *	first u_int32_t contains sets to be disabled,
 		 *	second u_int32_t contains sets to be enabled.
 		 */
 		error = sooptcopyin(sopt, rulenum,
 			2*sizeof(u_int32_t), sizeof(u_int32_t));
 		if (error)
 			break;
 		size = sopt->sopt_valsize;
 		if (size == sizeof(u_int32_t) && rulenum[0] != 0) {
 			/* delete or reassign, locking done in del_entry() */
 			error = del_entry(chain, rulenum[0]);
 		} else if (size == 2*sizeof(u_int32_t)) { /* set enable/disable */
 			IPFW_UH_WLOCK(chain);
 			V_set_disable =
 			    (V_set_disable | rulenum[0]) & ~rulenum[1] &
 			    ~(1<<RESVD_SET); /* set RESVD_SET always enabled */
 			IPFW_UH_WUNLOCK(chain);
 		} else
 			error = EINVAL;
 		break;
 
 	case IP_FW_ZERO:
 	case IP_FW_RESETLOG: /* argument is an u_int_32, the rule number */
 		/* Rule number 0 is passed on when no argument is supplied. */
 		rulenum[0] = 0;
 		if (sopt->sopt_val != 0) {
 		    error = sooptcopyin(sopt, rulenum,
 			    sizeof(u_int32_t), sizeof(u_int32_t));
 		    if (error)
 			break;
 		}
 		error = zero_entry(chain, rulenum[0],
 			sopt->sopt_name == IP_FW_RESETLOG);
 		break;
 
 	/*--- TABLE opcodes ---*/
 	case IP_FW_TABLE_ADD:
 	case IP_FW_TABLE_DEL:
 		{
 			ipfw_table_entry ent;
 			struct tentry_info tei;
 			struct tid_info ti;
 			struct table_value v;
 
 			error = sooptcopyin(sopt, &ent,
 			    sizeof(ent), sizeof(ent));
 			if (error)
 				break;
 
 			/* Translate the legacy entry into tentry/tid form. */
 			memset(&tei, 0, sizeof(tei));
 			tei.paddr = &ent.addr;
 			tei.subtype = AF_INET;
 			tei.masklen = ent.masklen;
 			ipfw_import_table_value_legacy(ent.value, &v);
 			tei.pvalue = &v;
 			memset(&ti, 0, sizeof(ti));
 			ti.uidx = ent.tbl;
 			ti.type = IPFW_TABLE_CIDR;
 
 			error = (opt == IP_FW_TABLE_ADD) ?
 			    add_table_entry(chain, &ti, &tei, 0, 1) :
 			    del_table_entry(chain, &ti, &tei, 0, 1);
 		}
 		break;
 
 
 	case IP_FW_TABLE_FLUSH:
 		{
 			u_int16_t tbl;
 			struct tid_info ti;
 
 			error = sooptcopyin(sopt, &tbl,
 			    sizeof(tbl), sizeof(tbl));
 			if (error)
 				break;
 			memset(&ti, 0, sizeof(ti));
 			ti.uidx = tbl;
 			error = flush_table(chain, &ti);
 		}
 		break;
 
 	case IP_FW_TABLE_GETSIZE:
 		{
 			u_int32_t tbl, cnt;
 			struct tid_info ti;
 
 			if ((error = sooptcopyin(sopt, &tbl, sizeof(tbl),
 			    sizeof(tbl))))
 				break;
 			memset(&ti, 0, sizeof(ti));
 			ti.uidx = tbl;
 			IPFW_RLOCK(chain);
 			error = ipfw_count_table(chain, &ti, &cnt);
 			IPFW_RUNLOCK(chain);
 			if (error)
 				break;
 			error = sooptcopyout(sopt, &cnt, sizeof(cnt));
 		}
 		break;
 
 	case IP_FW_TABLE_LIST:
 		{
 			ipfw_table *tbl;
 			struct tid_info ti;
 
 			if (sopt->sopt_valsize < sizeof(*tbl)) {
 				error = EINVAL;
 				break;
 			}
 			size = sopt->sopt_valsize;
 			tbl = malloc(size, M_TEMP, M_WAITOK);
 			error = sooptcopyin(sopt, tbl, size, sizeof(*tbl));
 			if (error) {
 				free(tbl, M_TEMP);
 				break;
 			}
 			/* Number of entries that fit after the header. */
 			tbl->size = (size - sizeof(*tbl)) /
 			    sizeof(ipfw_table_entry);
 			memset(&ti, 0, sizeof(ti));
 			ti.uidx = tbl->tbl;
 			IPFW_RLOCK(chain);
 			error = ipfw_dump_table_legacy(chain, &ti, tbl);
 			IPFW_RUNLOCK(chain);
 			if (error) {
 				free(tbl, M_TEMP);
 				break;
 			}
 			error = sooptcopyout(sopt, tbl, size);
 			free(tbl, M_TEMP);
 		}
 		break;
 
 	/*--- NAT operations are protected by the IPFW_LOCK ---*/
 	case IP_FW_NAT_CFG:
 		if (IPFW_NAT_LOADED)
 			error = ipfw_nat_cfg_ptr(sopt);
 		else {
 			printf("IP_FW_NAT_CFG: %s\n",
 			    "ipfw_nat not present, please load it");
 			error = EINVAL;
 		}
 		break;
 
 	case IP_FW_NAT_DEL:
 		if (IPFW_NAT_LOADED)
 			error = ipfw_nat_del_ptr(sopt);
 		else {
 			printf("IP_FW_NAT_DEL: %s\n",
 			    "ipfw_nat not present, please load it");
 			error = EINVAL;
 		}
 		break;
 
 	case IP_FW_NAT_GET_CONFIG:
 		if (IPFW_NAT_LOADED)
 			error = ipfw_nat_get_cfg_ptr(sopt);
 		else {
 			printf("IP_FW_NAT_GET_CFG: %s\n",
 			    "ipfw_nat not present, please load it");
 			error = EINVAL;
 		}
 		break;
 
 	case IP_FW_NAT_GET_LOG:
 		if (IPFW_NAT_LOADED)
 			error = ipfw_nat_get_log_ptr(sopt);
 		else {
 			printf("IP_FW_NAT_GET_LOG: %s\n",
 			    "ipfw_nat not present, please load it");
 			error = EINVAL;
 		}
 		break;
 
 	default:
 		printf("ipfw: ipfw_ctl invalid option %d\n", sopt->sopt_name);
 		error = EINVAL;
 	}
 
 	return (error);
 #undef RULE_MAXSIZE
 }
 #define	RULE_MAXSIZE	(256*sizeof(u_int32_t))
 
 /* Functions to convert rules 7.2 <==> 8.0 */
 /*
  * Convert @rule in place from the version 8 layout (struct ip_fw_rule0)
  * to the 7.2 layout (struct ip_fw7).
  *
  * A temporary copy of the first RULE_MAXSIZE bytes is used as the source
  * while the original buffer is rewritten.  Opcodes greater than O_NAT are
  * decremented while copying.
  *
  * Returns 0 on success, ENOMEM if the temporary copy cannot be allocated,
  * or EINVAL if an instruction length is zero or exceeds the remaining
  * command length.
  */
 static int
 convert_rule_to_7(struct ip_fw_rule0 *rule)
 {
 	/* Used to modify original rule */
 	struct ip_fw7 *rule7 = (struct ip_fw7 *)rule;
 	/* copy of original rule, version 8 */
 	struct ip_fw_rule0 *tmp;
 
 	/* Used to copy commands */
 	ipfw_insn *ccmd, *dst;
 	int ll = 0, ccmdlen = 0;
 
 	tmp = malloc(RULE_MAXSIZE, M_TEMP, M_NOWAIT | M_ZERO);
 	if (tmp == NULL)
 		return (ENOMEM);
 	bcopy(rule, tmp, RULE_MAXSIZE);
 
 	/* Copy fields */
 	//rule7->_pad = tmp->_pad;
 	rule7->set = tmp->set;
 	rule7->rulenum = tmp->rulenum;
 	rule7->cmd_len = tmp->cmd_len;
 	rule7->act_ofs = tmp->act_ofs;
 	rule7->next_rule = (struct ip_fw7 *)tmp->next_rule;
 	rule7->pcnt = tmp->pcnt;
 	rule7->bcnt = tmp->bcnt;
 	rule7->timestamp = tmp->timestamp;
 
 	/* Copy commands */
 	for (ll = tmp->cmd_len, ccmd = tmp->cmd, dst = rule7->cmd ;
 			ll > 0 ; ll -= ccmdlen, ccmd += ccmdlen, dst += ccmdlen) {
 		ccmdlen = F_LEN(ccmd);
 
 		/*
 		 * Validate before copying: a zero length would never
 		 * advance the loop, and a length beyond the remaining
 		 * command words would copy past the declared commands.
 		 */
 		if (ccmdlen == 0 || ccmdlen > ll) {
 			printf("ipfw: opcode %d size truncated\n",
 				ccmd->opcode);
 			free(tmp, M_TEMP);
 			return (EINVAL);
 		}
 
 		bcopy(ccmd, dst, ccmdlen * sizeof(uint32_t));
 
 		if (dst->opcode > O_NAT)
 			/* O_REASS doesn't exist in 7.2 version, so
 			 * decrement opcode if it is after O_REASS
 			 */
 			dst->opcode--;
 	}
 	free(tmp, M_TEMP);
 
 	return (0);
 }
 
 static int
 convert_rule_to_8(struct ip_fw_rule0 *rule)
 {
 	/* Used to modify original rule */
 	struct ip_fw7 *rule7 = (struct ip_fw7 *) rule;
 
 	/* Used to copy commands */
 	ipfw_insn *ccmd, *dst;
 	int ll = 0, ccmdlen = 0;
 
 	/* Copy of original rule */
 	struct ip_fw7 *tmp = malloc(RULE_MAXSIZE, M_TEMP, M_NOWAIT | M_ZERO);
 	if (tmp == NULL) {
 		return 1; //XXX error
 	}
 
 	bcopy(rule7, tmp, RULE_MAXSIZE);
 
 	for (ll = tmp->cmd_len, ccmd = tmp->cmd, dst = rule->cmd ;
 			ll > 0 ; ll -= ccmdlen, ccmd += ccmdlen, dst += ccmdlen) {
 		ccmdlen = F_LEN(ccmd);
 		
 		bcopy(ccmd, dst, F_LEN(ccmd)*sizeof(uint32_t));
 
 		if (dst->opcode > O_NAT)
 			/* O_REASS doesn't exists in 7.2 version, so
 			 * increment opcode if it is after O_REASS
 			 */
 			dst->opcode++;
 
 		if (ccmdlen > ll) {
 			printf("ipfw: opcode %d size truncated\n",
 			    ccmd->opcode);
 			return EINVAL;
 		}
 	}
 
 	rule->_pad = tmp->_pad;
 	rule->set = tmp->set;
 	rule->rulenum = tmp->rulenum;
 	rule->cmd_len = tmp->cmd_len;
 	rule->act_ofs = tmp->act_ofs;
 	rule->next_rule = (struct ip_fw *)tmp->next_rule;
 	rule->cmd_len = tmp->cmd_len;
 	rule->id = 0; /* XXX see if is ok = 0 */
 	rule->pcnt = tmp->pcnt;
 	rule->bcnt = tmp->bcnt;
 	rule->timestamp = tmp->timestamp;
 
 	free (tmp, M_TEMP);
 	return 0;
 }
 
 /*
  * Named object api
  *
  */
 
 /*
  * Set up the named-object hash and the zero-filled state array of @ch,
  * both sized by IPFW_OBJECTS_DEFAULT.
  */
 void
 ipfw_init_srv(struct ip_fw_chain *ch)
 {
 	size_t state_bytes;
 
 	/* One pointer-sized slot per default object index. */
 	state_bytes = sizeof(void *) * IPFW_OBJECTS_DEFAULT;
 
 	ch->srvmap = ipfw_objhash_create(IPFW_OBJECTS_DEFAULT);
 	ch->srvstate = malloc(state_bytes, M_IPFW, M_WAITOK | M_ZERO);
 }
 
 /*
  * Release what ipfw_init_srv() set up in @ch: the state array and the
  * named-object hash.
  */
 void
 ipfw_destroy_srv(struct ip_fw_chain *ch)
 {
 
 	/* State array first, then the hash instance. */
 	free(ch->srvstate, M_IPFW);
 	ipfw_objhash_destroy(ch->srvmap);
 }
 
 /*
  * Allocate a new index bitmask that can later replace (enlarge or shrink)
  * the one used by a named-object instance.
  *
  * The bitmask holds @items bits for each of IPFW_MAX_SETS sets.  Every bit
  * starts out set (free) except bit 0 of the first word, so index 0 is
  * never handed out.  The bitmask is returned through @idx and its block
  * count (items / BLOCK_ITEMS) through @pblocks.
  */
 void
 ipfw_objhash_bitmap_alloc(uint32_t items, void **idx, int *pblocks)
 {
 	u_long *bitmap;
 	size_t set_bytes, total_bytes;
 
 	KASSERT((items % BLOCK_ITEMS) == 0,
 	   ("bitmask size needs to power of 2 and greater or equal to %zu",
 	    BLOCK_ITEMS));
 
 	set_bytes = items / 8;
 	total_bytes = set_bytes * IPFW_MAX_SETS;
 
 	bitmap = malloc(total_bytes, M_IPFW, M_WAITOK);
 	/* All ones: every index is free ... */
 	memset(bitmap, 0xFF, total_bytes);
 	/* ... except index 0, which is skipped. */
 	bitmap[0] &= ~(u_long)1;
 
 	*idx = bitmap;
 	*pblocks = items / BLOCK_ITEMS;
 }
 
 /*
  * Copy the current bitmask of @ni into the new bitmask *@idx.
  *
  * For each of the IPFW_MAX_SETS sets, the instance's max_blocks words are
  * copied to the start of that set's region in the new bitmask, whose
  * per-set stride is *@blocks words.
  */
 void
 ipfw_objhash_bitmap_merge(struct namedobj_instance *ni, void **idx, int *blocks)
 {
 	u_long *src, *dst;
 	int src_stride, dst_stride;
 	int set;
 
 	src = ni->idx_mask;
 	src_stride = ni->max_blocks;
 	dst = *idx;
 	dst_stride = *blocks;
 
 	for (set = 0; set < IPFW_MAX_SETS; set++) {
 		memcpy(dst, src, src_stride * sizeof(u_long));
 		src += src_stride;
 		dst += dst_stride;
 	}
 }
 
 /*
  * Exchange the bitmask and block count of @ni with *@idx / *@blocks.
  * On return the caller's variables hold the instance's previous values.
  */
 void
 ipfw_objhash_bitmap_swap(struct namedobj_instance *ni, void **idx, int *blocks)
 {
 	u_long *prev_mask = ni->idx_mask;
 	int prev_blocks = ni->max_blocks;
 
 	/* Install the new bitmask ... */
 	ni->idx_mask = *idx;
 	ni->max_blocks = *blocks;
 
 	/* ... and hand the old one back to the caller. */
 	*idx = prev_mask;
 	*blocks = prev_blocks;
 }
 
 /*
  * Free a bitmask obtained from ipfw_objhash_bitmap_alloc().
  * @blocks is accepted but not used.
  */
 void
 ipfw_objhash_bitmap_free(void *idx, int blocks)
 {
 
 	free(idx, M_IPFW);
 }
 
 /*
  * Creates named hash instance.
  * Must be called without holding any locks.
  *
  * The instance header, the name buckets and the value buckets are carved
  * out of a single zeroed allocation; the index bitmask for @items entries
  * is allocated separately.
  *
  * Return pointer to new instance.
  */
 struct namedobj_instance *
 ipfw_objhash_create(uint32_t items)
 {
 	struct namedobj_instance *ni;
 	size_t bucket_bytes;
 	int slot;
 
 	/* Both bucket arrays have NAMEDOBJ_HASH_SIZE heads. */
 	bucket_bytes = sizeof(struct namedobjects_head) * NAMEDOBJ_HASH_SIZE;
 
 	ni = malloc(sizeof(struct namedobj_instance) + bucket_bytes +
 	    bucket_bytes, M_IPFW, M_WAITOK | M_ZERO);
 	ni->nn_size = NAMEDOBJ_HASH_SIZE;
 	ni->nv_size = NAMEDOBJ_HASH_SIZE;
 
 	/* Name buckets follow the header; value buckets follow those. */
 	ni->names = (struct namedobjects_head *)(ni + 1);
 	ni->values = ni->names + ni->nn_size;
 
 	for (slot = 0; slot < ni->nn_size; slot++)
 		TAILQ_INIT(&ni->names[slot]);
 	for (slot = 0; slot < ni->nv_size; slot++)
 		TAILQ_INIT(&ni->values[slot]);
 
 	/* Set default hashing/comparison functions */
 	ni->hash_f = objhash_hash_name;
 	ni->cmp_f = objhash_cmp_name;
 
 	/* Allocate bitmask separately due to possible resize */
 	ipfw_objhash_bitmap_alloc(items, (void*)&ni->idx_mask, &ni->max_blocks);
 
 	return (ni);
 }
 
 /*
  * Free a named hash instance: first its separately allocated index
  * bitmask, then the instance itself.
  */
 void
 ipfw_objhash_destroy(struct namedobj_instance *ni)
 {
 
 	free(ni->idx_mask, M_IPFW);
 	free(ni, M_IPFW);
 }
 
 /*
  * Replace the hashing and comparison callbacks used by @ni for
  * name-based lookups.
  */
 void
 ipfw_objhash_set_funcs(struct namedobj_instance *ni, objhash_hash_f *hash_f,
     objhash_cmp_f *cmp_f)
 {
 
 	ni->cmp_f = cmp_f;
 	ni->hash_f = hash_f;
 }
 
 /*
  * Default hash callback: FNV-1 32-bit hash of the string @name.
  * @ni and @set do not take part in the hash.
  */
 static uint32_t
 objhash_hash_name(struct namedobj_instance *ni, const void *name, uint32_t set)
 {
 	const char *str = (const char *)name;
 
 	return (fnv_32_str(str, FNV1_32_INIT));
 }
 
 /*
  * Default comparison callback.
  * Returns 0 when @no has the string name @name and belongs to @set,
  * 1 otherwise.
  */
 static int
 objhash_cmp_name(struct named_object *no, const void *name, uint32_t set)
 {
 
 	if (no->set != set)
 		return (1);
 
 	return (strcmp(no->name, (const char *)name) != 0);
 }
 
 /*
  * Map index value @val to a bucket of the value hash of @ni.
  * The modulus is nv_size - 1.
  */
 static uint32_t
 objhash_hash_idx(struct namedobj_instance *ni, uint32_t val)
 {
 
 	return (val % (ni->nv_size - 1));
 }
 
 /*
  * Look up a named object by @name within @set using the instance's
  * hash and comparison callbacks.
  *
  * Returns the first matching object in the bucket, or NULL.
  */
 struct named_object *
 ipfw_objhash_lookup_name(struct namedobj_instance *ni, uint32_t set, char *name)
 {
 	struct named_object *cur;
 	uint32_t bucket;
 
 	bucket = ni->hash_f(ni, name, set) % ni->nn_size;
 
 	TAILQ_FOREACH(cur, &ni->names[bucket], nn_next) {
 		if (ni->cmp_f(cur, name, set) != 0)
 			continue;
 		return (cur);
 	}
 
 	return (NULL);
 }
 
 /*
  * Find the name TLV with index @uidx in the @len bytes at @tlvs.
  * Check @tlvs for valid data inside.
  *
  * The buffer is walked as a sequence of ipfw_obj_ntlv records.  The walk
  * stops with NULL as soon as a record does not fit in the remaining bytes
  * or its length field is not sizeof(ipfw_obj_ntlv).  A record matches
  * when its idx equals @uidx and its type is either 0 or equal to @etlv;
  * the name of a matching record must also pass
  * ipfw_check_object_name_generic().
  *
  * Returns pointer to found TLV or NULL.
  */
 ipfw_obj_ntlv *
 ipfw_find_name_tlv_type(void *tlvs, int len, uint16_t uidx, uint32_t etlv)
 {
 	ipfw_obj_ntlv *ntlv;
 	uintptr_t pa, pe;
 	int l;
 
 	pa = (uintptr_t)tlvs;
 	pe = pa + len;
 	l = 0;
 	for (; pa < pe; pa += l) {
 		/*
 		 * Do not touch a record that is only partially inside the
 		 * buffer: its header and idx would be read out of bounds.
 		 */
 		if (pe - pa < sizeof(*ntlv))
 			return (NULL);
 
 		ntlv = (ipfw_obj_ntlv *)pa;
 		l = ntlv->head.length;
 
 		if (l != sizeof(*ntlv))
 			return (NULL);
 
 		if (ntlv->idx != uidx)
 			continue;
 		/*
 		 * When userland has specified zero TLV type, do
 		 * not compare it with etlv. In some cases userland
 		 * doesn't know what type should it have. Use only
 		 * uidx and name for search named_object.
 		 */
 		if (ntlv->head.type != 0 &&
 		    ntlv->head.type != (uint16_t)etlv)
 			continue;
 
 		if (ipfw_check_object_name_generic(ntlv->name) != 0)
 			return (NULL);
 
 		return (ntlv);
 	}
 
 	return (NULL);
 }
 
 /*
  * Finds object config based on the name TLV referenced by @ti.
  * Note @ti structure contains unchecked data from userland.
  *
  * Returns 0 on success and fills in @pno with the found config.
  * Returns EINVAL when @ti carries no TLVs or no matching name TLV, and
  * ESRCH (with *@pno set to NULL) when no object has that name.
  */
 int
 ipfw_objhash_find_type(struct namedobj_instance *ni, struct tid_info *ti,
     uint32_t etlv, struct named_object **pno)
 {
 	ipfw_obj_ntlv *ntlv;
 	struct named_object *found;
 
 	if (ti->tlvs == NULL)
 		return (EINVAL);
 
 	ntlv = ipfw_find_name_tlv_type(ti->tlvs, ti->tlen, ti->uidx, etlv);
 	if (ntlv == NULL)
 		return (EINVAL);
 
 	/*
 	 * Use set provided by @ti instead of @ntlv one.
 	 * This is needed due to different sets behavior
 	 * controlled by V_fw_tables_sets.
 	 */
 	found = ipfw_objhash_lookup_name(ni, ti->set, ntlv->name);
 	*pno = found;
 
 	return (found == NULL ? ESRCH : 0);
 }
 
 /*
  * Find named object by name, considering also its TLV type:
  * the object must satisfy the instance's comparison callback for
  * (@name, @set) and its etlv must equal the low 16 bits of @type.
  *
  * Returns the object or NULL.
  */
 struct named_object *
 ipfw_objhash_lookup_name_type(struct namedobj_instance *ni, uint32_t set,
     uint32_t type, const char *name)
 {
 	struct named_object *cur;
 	uint32_t bucket;
 
 	bucket = ni->hash_f(ni, name, set) % ni->nn_size;
 
 	TAILQ_FOREACH(cur, &ni->names[bucket], nn_next) {
 		if (ni->cmp_f(cur, name, set) != 0)
 			continue;
 		if (cur->etlv == (uint16_t)type)
 			return (cur);
 	}
 
 	return (NULL);
 }
 
 /*
  * Find named object by its kernel index @kidx using the value hash.
  * Returns the object or NULL.
  */
 struct named_object *
 ipfw_objhash_lookup_kidx(struct namedobj_instance *ni, uint16_t kidx)
 {
 	struct named_object *cur;
 	uint32_t bucket;
 
 	bucket = objhash_hash_idx(ni, kidx);
 
 	TAILQ_FOREACH(cur, &ni->values[bucket], nv_next) {
 		if (cur->kidx != kidx)
 			continue;
 		return (cur);
 	}
 
 	return (NULL);
 }
 
 /*
  * Returns 1 when @a and @b have the same name and the same set,
  * 0 otherwise.  @ni is not consulted.
  */
 int
 ipfw_objhash_same_name(struct namedobj_instance *ni, struct named_object *a,
     struct named_object *b)
 {
 
 	if (a->set != b->set)
 		return (0);
 
 	return (strcmp(a->name, b->name) == 0);
 }
 
 /*
  * Link @no into both hashes of @ni (by name and by kidx) and bump the
  * object count.
  */
 void
 ipfw_objhash_add(struct namedobj_instance *ni, struct named_object *no)
 {
 	uint32_t name_bucket, idx_bucket;
 
 	name_bucket = ni->hash_f(ni, no->name, no->set) % ni->nn_size;
 	TAILQ_INSERT_HEAD(&ni->names[name_bucket], no, nn_next);
 
 	idx_bucket = objhash_hash_idx(ni, no->kidx);
 	TAILQ_INSERT_HEAD(&ni->values[idx_bucket], no, nv_next);
 
 	ni->count++;
 }
 
 /*
  * Unlink @no from both hashes of @ni (by name and by kidx) and drop the
  * object count.
  */
 void
 ipfw_objhash_del(struct namedobj_instance *ni, struct named_object *no)
 {
 	uint32_t name_bucket, idx_bucket;
 
 	name_bucket = ni->hash_f(ni, no->name, no->set) % ni->nn_size;
 	TAILQ_REMOVE(&ni->names[name_bucket], no, nn_next);
 
 	idx_bucket = objhash_hash_idx(ni, no->kidx);
 	TAILQ_REMOVE(&ni->values[idx_bucket], no, nv_next);
 
 	ni->count--;
 }
 
 /*
  * Returns the object counter of @ni, as maintained by
  * ipfw_objhash_add() / ipfw_objhash_del().
  */
 uint32_t
 ipfw_objhash_count(struct namedobj_instance *ni)
 {
 
 	return (ni->count);
 }
 
 /*
  * Counts the objects in @ni whose etlv equals @type by walking every
  * name bucket.
  */
 uint32_t
 ipfw_objhash_count_type(struct namedobj_instance *ni, uint16_t type)
 {
 	struct named_object *cur;
 	uint32_t matches = 0;
 	int bucket;
 
 	for (bucket = 0; bucket < ni->nn_size; bucket++) {
 		TAILQ_FOREACH(cur, &ni->names[bucket], nn_next) {
 			if (cur->etlv != type)
 				continue;
 			matches++;
 		}
 	}
 
 	return (matches);
 }
 
 /*
  * Runs @f for each named object in @ni, passing @arg through.
  * It is safe to delete objects from the callback.
  *
  * Stops at, and returns, the first non-zero callback result;
  * returns 0 when every callback returned 0.
  */
 int
 ipfw_objhash_foreach(struct namedobj_instance *ni, objhash_cb_t *f, void *arg)
 {
 	struct named_object *cur, *next;
 	int bucket, rc;
 
 	for (bucket = 0; bucket < ni->nn_size; bucket++) {
 		TAILQ_FOREACH_SAFE(cur, &ni->names[bucket], nn_next, next) {
 			if ((rc = f(ni, cur, arg)) != 0)
 				return (rc);
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Runs @f for each named object in @ni whose etlv equals @type,
  * passing @arg through.
  * It is safe to delete objects from the callback.
  *
  * Stops at, and returns, the first non-zero callback result;
  * returns 0 when every callback returned 0.
  */
 int
 ipfw_objhash_foreach_type(struct namedobj_instance *ni, objhash_cb_t *f,
     void *arg, uint16_t type)
 {
 	struct named_object *cur, *next;
 	int bucket, rc;
 
 	for (bucket = 0; bucket < ni->nn_size; bucket++) {
 		TAILQ_FOREACH_SAFE(cur, &ni->names[bucket], nn_next, next) {
 			if (cur->etlv == type) {
 				rc = f(ni, cur, arg);
 				if (rc != 0)
 					return (rc);
 			}
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Marks index @idx as free again in the bitmask of @ni.
  *
  * Returns 0 on success.  Returns 1 when @idx lies beyond max_blocks or
  * when its bit is already set (the index is already free).
  */
 int
 ipfw_objhash_free_idx(struct namedobj_instance *ni, uint16_t idx)
 {
 	u_long flag;
 	int blk, bit;
 
 	blk = idx / BLOCK_ITEMS;
 	bit = idx % BLOCK_ITEMS;
 
 	if (blk >= ni->max_blocks)
 		return (1);
 
 	flag = (u_long)1 << bit;
 	if ((ni->idx_mask[blk] & flag) != 0)
 		return (1);
 
 	/* Mark as free */
 	ni->idx_mask[blk] |= flag;
 
 	/* Pull the free-search start back if this block precedes it. */
 	if (ni->free_off[0] > blk)
 		ni->free_off[0] = blk;
 
 	return (0);
 }
 
 /*
  * Allocates a new index in the given instance and stores it in @pidx.
  *
  * The search starts at block free_off[0] and takes the lowest set bit of
  * the first block that has one; free_off[0] is then moved to that block.
  *
  * Returns 0 on success, 1 when no free index was found.
  */
 int
 ipfw_objhash_alloc_idx(void *n, uint16_t *pidx)
 {
 	struct namedobj_instance *ni = (struct namedobj_instance *)n;
 	int blk, bit, kidx;
 
 	for (blk = ni->free_off[0]; blk < ni->max_blocks; blk++) {
 		/* ffsl() is 1-based; 0 means the block has no free bit. */
 		bit = ffsl(ni->idx_mask[blk]);
 		if (bit != 0) {
 			/* Mark as busy */
 			ni->idx_mask[blk] &= ~((u_long)1 << (bit - 1));
 
 			ni->free_off[0] = blk;
 
 			kidx = BLOCK_ITEMS * blk + bit - 1;
 			*pidx = kidx;
 			return (0);
 		}
 	}
 
 	return (1);
 }
 
 /* end of file */
Index: head/sys/netpfil/ipfw/ip_fw_table_algo.c
===================================================================
--- head/sys/netpfil/ipfw/ip_fw_table_algo.c	(revision 328237)
+++ head/sys/netpfil/ipfw/ip_fw_table_algo.c	(revision 328238)
@@ -1,4111 +1,4111 @@
 /*-
  * Copyright (c) 2014 Yandex LLC
  * Copyright (c) 2014 Alexander V. Chernikov
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * Lookup table algorithms.
  *
  */
 
 #include "opt_ipfw.h"
 #include "opt_inet.h"
 #ifndef INET
 #error IPFIREWALL requires INET.
 #endif /* INET */
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/rwlock.h>
 #include <sys/rmlock.h>
 #include <sys/socket.h>
 #include <sys/queue.h>
 #include <net/if.h>	/* ip_fw.h requires IFNAMSIZ */
 #include <net/radix.h>
 #include <net/route.h>
 #include <net/route_var.h>
 
 #include <netinet/in.h>
 #include <netinet/in_fib.h>
 #include <netinet/ip_var.h>	/* struct ipfw_rule_ref */
 #include <netinet/ip_fw.h>
 #include <netinet6/in6_fib.h>
 
 #include <netpfil/ipfw/ip_fw_private.h>
 #include <netpfil/ipfw/ip_fw_table.h>
 
 
 /*
  * IPFW table lookup algorithms.
  *
  * What is needed to add another table algo?
  *
  * Algo init:
  * * struct table_algo has to be filled with:
  *   name: "type:algoname" format, e.g. "addr:radix". Currently
  *     there are the following types: "addr", "iface", "number" and "flow".
  *   type: one of IPFW_TABLE_* types
  *   flags: one or more TA_FLAGS_*
  *   ta_buf_size: size of structure used to store add/del item state.
  *     Needs to be less than TA_BUF_SZ.
  *   callbacks: see below for description.
  * * ipfw_add_table_algo / ipfw_del_table_algo has to be called
  *
  * Callbacks description:
  *
  * -init: request to initialize new table instance.
  * typedef int (ta_init)(struct ip_fw_chain *ch, void **ta_state,
  *     struct table_info *ti, char *data, uint8_t tflags);
  * MANDATORY, unlocked. (M_WAITOK). Returns 0 on success.
  *
  *  Allocate all structures needed for normal operations.
  *  * Caller may want to parse @data for some algo-specific
  *    options provided by userland.
  *  * Caller may want to save configuration state pointer to @ta_state
  *  * Caller needs to save desired runtime structure pointer(s)
  *    inside @ti fields. Note that it is not correct to save
  *    @ti pointer at this moment. Use -change_ti hook for that.
  *  * Caller has to fill in ti->lookup to appropriate function
  *    pointer.
  *
  *
  *
  * -destroy: request to destroy table instance.
  * typedef void (ta_destroy)(void *ta_state, struct table_info *ti);
  * MANDATORY, unlocked. (M_WAITOK).
  *
  * Frees all table entries and all tables structures allocated by -init.
  *
  *
  *
  * -prepare_add: request to allocate state for adding new entry.
  * typedef int (ta_prepare_add)(struct ip_fw_chain *ch, struct tentry_info *tei,
  *     void *ta_buf);
  * MANDATORY, unlocked. (M_WAITOK). Returns 0 on success.
  *
  * Allocates state and fills it in with all necessary data (EXCEPT value)
  * from @tei to minimize operations needed to be done under WLOCK.
  * "value" field has to be copied to new entry in @add callback.
  * Buffer ta_buf of size ta->ta_buf_sz may be used to store
  * allocated state.
  *
  *
  *
  * -prepare_del: request to set state for deleting existing entry.
  * typedef int (ta_prepare_del)(struct ip_fw_chain *ch, struct tentry_info *tei,
  *     void *ta_buf);
  * MANDATORY, locked, UH. (M_NOWAIT). Returns 0 on success.
  *
  * Buffer ta_buf of size ta->ta_buf_sz may be used to store
  * allocated state. Caller should use on-stack ta_buf allocation
  * instead of doing malloc().
  *
  *
  *
  * -add: request to insert new entry into runtime/config structures.
  *  typedef int (ta_add)(void *ta_state, struct table_info *ti,
  *     struct tentry_info *tei, void *ta_buf, uint32_t *pnum);
  * MANDATORY, UH+WLOCK. (M_NOWAIT). Returns 0 on success.
  *
  * Insert new entry using previously-allocated state in @ta_buf.
  * * @tei may have the following flags:
  *   TEI_FLAGS_UPDATE: request to add or update entry.
  *   TEI_FLAGS_DONTADD: request to update (but not add) entry.
  * * Caller is required to do the following:
  *   copy real entry value from @tei
  *   entry added: return 0, set 1 to @pnum
  *   entry updated: return 0, store 0 to @pnum, store old value in @tei,
  *     add TEI_FLAGS_UPDATED flag to @tei.
  *   entry exists: return EEXIST
  *   entry not found: return ENOENT
  *   other error: return non-zero error code.
  *
  *
  *
  * -del: request to delete existing entry from runtime/config structures.
  *  typedef int (ta_del)(void *ta_state, struct table_info *ti,
  *     struct tentry_info *tei, void *ta_buf, uint32_t *pnum);
  *  MANDATORY, UH+WLOCK. (M_NOWAIT). Returns 0 on success.
  *
  *  Delete entry using state previously set up in @ta_buf.
  * * Caller is required to do the following:
  *   entry deleted: return 0, set 1 to @pnum, store old value in @tei.
  *   entry not found: return ENOENT
  *   other error: return non-zero error code.
  *
  *
  *
  * -flush_entry: flush entry state created by -prepare_add / -del / others
  *  typedef void (ta_flush_entry)(struct ip_fw_chain *ch,
  *      struct tentry_info *tei, void *ta_buf);
  *  MANDATORY, may be locked. (M_NOWAIT).
  *
  *  Delete state allocated by:
  *  -prepare_add (-add returned EEXIST|UPDATED)
  *  -prepare_del (if any)
  *  -del
  *  * Caller is required to handle empty @ta_buf correctly.
  *
  *
  * -find_tentry: finds entry specified by key @tei
  *  typedef int ta_find_tentry(void *ta_state, struct table_info *ti,
  *      ipfw_obj_tentry *tent);
  *  OPTIONAL, locked (UH). (M_NOWAIT). Returns 0 on success.
  *
  *  Finds entry specified by given key.
  *  * Caller is required to do the following:
  *    entry found: returns 0, export entry to @tent
  *    entry not found: returns ENOENT
  *
  *
  * -need_modify: checks if @ti has enough space to hold another @count items.
  *  typedef int (ta_need_modify)(void *ta_state, struct table_info *ti,
  *      uint32_t count, uint64_t *pflags);
  *  OPTIONAL, locked (UH). (M_NOWAIT). Returns 0 if has.
  *
  *  Checks if given table has enough space to add @count items without
  *  resize. Caller may use @pflags to store desired modification data.
  *
  *
  *
  * -prepare_mod: allocate structures for table modification.
  *  typedef int (ta_prepare_mod)(void *ta_buf, uint64_t *pflags);
  * OPTIONAL(need_modify), unlocked. (M_WAITOK). Returns 0 on success.
  *
  * Allocate all needed state for table modification. Caller
  * should use `struct mod_item` to store new state in @ta_buf.
  * Up to TA_BUF_SZ (128 bytes) can be stored in @ta_buf.
  * 
  *
  *
  * -fill_mod: copy some data to new state.
  *  typedef int (ta_fill_mod)(void *ta_state, struct table_info *ti,
  *      void *ta_buf, uint64_t *pflags);
  * OPTIONAL(need_modify), locked (UH). (M_NOWAIT). Returns 0 on success.
  *
  * Copy as much data as we can to minimize changes under WLOCK.
  * For example, array can be merged inside this callback.
  *
  *
  *
  * -modify: perform final modification.
  *  typedef void (ta_modify)(void *ta_state, struct table_info *ti,
  *      void *ta_buf, uint64_t pflags);
  * OPTIONAL(need_modify), locked (UH+WLOCK). (M_NOWAIT). 
  *
  * Performs all changes necessary to switch to new structures.
  * * Caller should save old pointers to @ta_buf storage.
  *
  *
  *
  * -flush_mod: flush table modification state.
  *  typedef void (ta_flush_mod)(void *ta_buf);
  * OPTIONAL(need_modify), unlocked. (M_WAITOK).
  *
  * Performs flush for the following:
  *   - prepare_mod (modification was not necessary)
  *   - modify (for the old state)
  *
  *
  *
  * -change_ti: monitor table info pointer changes
  * typedef void (ta_change_ti)(void *ta_state, struct table_info *ti);
  * OPTIONAL, locked (UH). (M_NOWAIT).
  *
  * Called on @ti pointer changed. Called immediately after -init
  * to set initial state.
  *
  *
  *
  * -foreach: calls @f for each table entry
  *  typedef void ta_foreach(void *ta_state, struct table_info *ti,
  *      ta_foreach_f *f, void *arg);
  * MANDATORY, locked(UH). (M_NOWAIT).
  *
  * Runs callback with specified argument for each table entry,
  * Typically used for dumping table entries.
  *
  *
  *
  * -dump_tentry: dump table entry in current @tentry format.
  *  typedef int ta_dump_tentry(void *ta_state, struct table_info *ti, void *e,
  *      ipfw_obj_tentry *tent);
  * MANDATORY, locked(UH). (M_NOWAIT). Returns 0 on success.
  *
  * Dumps entry @e to @tent.
  *
  *
  * -print_config: prints custom algorithm options into buffer.
  *  typedef void (ta_print_config)(void *ta_state, struct table_info *ti,
  *      char *buf, size_t bufsize);
  * OPTIONAL. locked(UH). (M_NOWAIT).
  *
  * Prints custom algorithm options in the format suitable to pass
  * back to -init callback.
  *
  *
  *
  * -dump_tinfo: dumps algo-specific info.
  *  typedef void ta_dump_tinfo(void *ta_state, struct table_info *ti,
  *      ipfw_ta_tinfo *tinfo);
  * OPTIONAL. locked(UH). (M_NOWAIT).
  *
  * Dumps options like items size/hash size, etc.
  */
 
 MALLOC_DEFINE(M_IPFW_TBL, "ipfw_tbl", "IpFw tables");
 
 /*
  * Utility structures/functions common to more than one algo
  */
 
 /*
  * Two (pointer, size) pairs, the second one carrying a "6" suffix.
  * NOTE(review): presumably IPv4 and IPv6 storage handed over while an
  * algorithm resizes its state; no user is visible in this part of the
  * file -- confirm against the algorithms that use it.
  */
 struct mod_item {
 	void	*main_ptr;
 	size_t	size;
 	void	*main_ptr6;
 	size_t	size6;
 };
 
 /*
  * Forward declarations of two array helpers that take a key, a base
  * array, the element count and size, and a comparison callback.
  * NOTE(review): the names suggest insert into / delete from a sorted
  * array; the definitions are outside this part of the file -- confirm.
  */
 static int badd(const void *key, void *item, void *base, size_t nmemb,
     size_t size, int (*compar) (const void *, const void *));
 static int bdel(const void *key, void *base, size_t nmemb, size_t size,
     int (*compar) (const void *, const void *));
 
 
 /*
  * ADDR implementation using radix
  *
  */
 
 /*
  * The radix code expects addr and mask to be array of bytes,
  * with the first byte being the length of the array. rn_inithead
  * is called with the offset in bits of the lookup key within the
  * array. If we use a sockaddr_in as the underlying type,
  * sin_len is conveniently located at offset 0, sin_addr is at
  * offset 4 and normally aligned.
  * But for portability, let's avoid assumption and make the code explicit
  */
 /* First byte of @v, usable as an lvalue: the key-length slot of a radix key. */
 #define KEY_LEN(v)	*((uint8_t *)&(v))
 /*
  * Do not require radix to compare more than actual IPv4/IPv6 address
  */
 /* Key lengths in bytes: everything up to and including the address field. */
 #define KEY_LEN_INET	(offsetof(struct sockaddr_in, sin_addr) + sizeof(in_addr_t))
 #define KEY_LEN_INET6	(offsetof(struct sa_in6, sin6_addr) + sizeof(struct in6_addr))
 
 /* Offsets of the address field in bits, as passed to rn_inithead(). */
 #define OFF_LEN_INET	(8 * offsetof(struct sockaddr_in, sin_addr))
 #define OFF_LEN_INET6	(8 * offsetof(struct sa_in6, sin6_addr))
 
 /*
  * IPv4 entry of an addr:radix table.
  * The radix nodes come first: the code in this file casts the
  * struct radix_node pointers returned by the radix methods directly to
  * this structure.
  */
 struct radix_addr_entry {
 	struct radix_node	rn[2];
 	struct sockaddr_in	addr;		/* masked key (length, family, address) */
 	uint32_t		value;		/* value returned by lookups */
 	uint8_t			masklen;	/* prefix length the entry was added with */
 };
 
 /*
  * IPv6 key layout used with the radix tree: the length byte comes first
  * (it is what KEY_LEN() writes), followed by the family, two bytes of
  * padding and the address itself.
  */
 struct sa_in6 {
 	uint8_t			sin6_len;
 	uint8_t			sin6_family;
 	uint8_t			pad[2];
 	struct in6_addr		sin6_addr;
 };
 
 /*
  * IPv6 entry of an addr:radix table.  Same shape as the IPv4 entry with a
  * struct sa_in6 key; the radix nodes stay first because node pointers are
  * cast to this structure.
  */
 struct radix_addr_xentry {
 	struct radix_node	rn[2];
 	struct sa_in6		addr6;		/* masked key */
 	uint32_t		value;		/* value returned by lookups */
 	uint8_t			masklen;	/* prefix length the entry was added with */
 };
 
 /*
  * Per-table state of addr:radix (the @ta_state of its callbacks).
  * count4/count6 are incremented by ta_add_radix() and decremented by
  * ta_del_radix().  head4/head6 are not referenced by the radix callbacks
  * in this part of the file, which use ti->state / ti->xstate instead.
  */
 struct radix_cfg {
 	struct radix_node_head	*head4;
 	struct radix_node_head	*head6;
 	size_t			count4;
 	size_t			count6;
 };
 
 /*
  * Scratch buffer shared by the prepare/add/del/flush callbacks of
  * addr:radix (its size is exported as .ta_buf_size).
  */
 struct ta_buf_radix
 {
 	/*
 	 * Entry allocated by prepare_add, or entry unlinked by del;
 	 * freed by flush_entry when still non-NULL.
 	 */
 	void *ent_ptr;
 	/* Key handed to the radix methods. */
 	struct sockaddr	*addr_ptr;
 	/* Mask handed to the radix methods; only assigned for non-host prefixes. */
 	struct sockaddr	*mask_ptr;
 	/* Inline storage for key and mask (del) or mask only (add). */
 	union {
 		struct {
 			struct sockaddr_in sa;
 			struct sockaddr_in ma;
 		} a4;
 		struct {
 			struct sa_in6 sa;
 			struct sa_in6 ma;
 		} a6;
 	} addr;
 };
 
 /* Forward declarations of the addr:radix callbacks and their helpers. */
 static int ta_lookup_radix(struct table_info *ti, void *key, uint32_t keylen,
     uint32_t *val);
 static int ta_init_radix(struct ip_fw_chain *ch, void **ta_state,
     struct table_info *ti, char *data, uint8_t tflags);
 static int flush_radix_entry(struct radix_node *rn, void *arg);
 static void ta_destroy_radix(void *ta_state, struct table_info *ti);
 static void ta_dump_radix_tinfo(void *ta_state, struct table_info *ti,
     ipfw_ta_tinfo *tinfo);
 static int ta_dump_radix_tentry(void *ta_state, struct table_info *ti,
     void *e, ipfw_obj_tentry *tent);
 static int ta_find_radix_tentry(void *ta_state, struct table_info *ti,
     ipfw_obj_tentry *tent);
 static void ta_foreach_radix(void *ta_state, struct table_info *ti,
     ta_foreach_f *f, void *arg);
 static void tei_to_sockaddr_ent(struct tentry_info *tei, struct sockaddr *sa,
     struct sockaddr *ma, int *set_mask);
 static int ta_prepare_add_radix(struct ip_fw_chain *ch, struct tentry_info *tei,
     void *ta_buf);
 static int ta_add_radix(void *ta_state, struct table_info *ti,
     struct tentry_info *tei, void *ta_buf, uint32_t *pnum);
 static int ta_prepare_del_radix(struct ip_fw_chain *ch, struct tentry_info *tei,
     void *ta_buf);
 static int ta_del_radix(void *ta_state, struct table_info *ti,
     struct tentry_info *tei, void *ta_buf, uint32_t *pnum);
 static void ta_flush_radix_entry(struct ip_fw_chain *ch, struct tentry_info *tei,
     void *ta_buf);
 static int ta_need_modify_radix(void *ta_state, struct table_info *ti,
     uint32_t count, uint64_t *pflags);
 
 static int
 ta_lookup_radix(struct table_info *ti, void *key, uint32_t keylen,
     uint32_t *val)
 {
 	struct radix_node_head *rnh;
 
 	if (keylen == sizeof(in_addr_t)) {
 		struct radix_addr_entry *ent;
 		struct sockaddr_in sa;
 		KEY_LEN(sa) = KEY_LEN_INET;
 		sa.sin_addr.s_addr = *((in_addr_t *)key);
 		rnh = (struct radix_node_head *)ti->state;
 		ent = (struct radix_addr_entry *)(rnh->rnh_matchaddr(&sa, &rnh->rh));
 		if (ent != NULL) {
 			*val = ent->value;
 			return (1);
 		}
 	} else {
 		struct radix_addr_xentry *xent;
 		struct sa_in6 sa6;
 		KEY_LEN(sa6) = KEY_LEN_INET6;
 		memcpy(&sa6.sin6_addr, key, sizeof(struct in6_addr));
 		rnh = (struct radix_node_head *)ti->xstate;
 		xent = (struct radix_addr_xentry *)(rnh->rnh_matchaddr(&sa6, &rnh->rh));
 		if (xent != NULL) {
 			*val = xent->value;
 			return (1);
 		}
 	}
 
 	return (0);
 }
 
 /*
  * -init callback of addr:radix: set up a new table.
  *
  * Creates the IPv4 head in ti->state and the IPv6 head in ti->xstate; if
  * the second one cannot be created the first is detached again and ENOMEM
  * is returned.  On success a zeroed struct radix_cfg is stored in
  * @ta_state and the lookup function is installed.  Returns 0 on success.
  */
 static int
 ta_init_radix(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti,
     char *data, uint8_t tflags)
 {
 	struct radix_cfg *state;
 
 	if (rn_inithead(&ti->state, OFF_LEN_INET) == 0)
 		return (ENOMEM);
 
 	if (rn_inithead(&ti->xstate, OFF_LEN_INET6) == 0) {
 		/* Undo the IPv4 head so nothing is left behind */
 		rn_detachhead(&ti->state);
 		return (ENOMEM);
 	}
 
 	state = malloc(sizeof(*state), M_IPFW, M_WAITOK | M_ZERO);
 
 	ti->lookup = ta_lookup_radix;
 	*ta_state = state;
 
 	return (0);
 }
 
 /*
  * Tree-walk callback used while destroying a table: unlinks node @rn from
  * the head passed in @arg using the node's own key and mask, and frees the
  * entry the delete method hands back.  Always returns 0.
  */
 static int
 flush_radix_entry(struct radix_node *rn, void *arg)
 {
 	struct radix_node_head * const head = arg;
 	void *removed;
 
 	removed = head->rnh_deladdr(rn->rn_key, rn->rn_mask, &head->rh);
 	if (removed == NULL)
 		return (0);
 
 	free(removed, M_IPFW_TBL);
 	return (0);
 }
 
 /*
  * -destroy callback of addr:radix.
  * For the IPv4 head and then the IPv6 head: walk the tree freeing every
  * entry, then detach the head.  Finally the per-table state is released.
  */
 static void
 ta_destroy_radix(void *ta_state, struct table_info *ti)
 {
 	struct radix_node_head *head;
 
 	/* IPv4 tree */
 	head = (struct radix_node_head *)(ti->state);
 	head->rnh_walktree(&head->rh, flush_radix_entry, head);
 	rn_detachhead(&ti->state);
 
 	/* IPv6 tree */
 	head = (struct radix_node_head *)(ti->xstate);
 	head->rnh_walktree(&head->rh, flush_radix_entry, head);
 	rn_detachhead(&ti->xstate);
 
 	free((struct radix_cfg *)ta_state, M_IPFW);
 }
 
 /*
  * -dump_tinfo callback of addr:radix: report the algorithm class, the
  * per-family entry counts and the per-family entry sizes.
  */
 static void
 ta_dump_radix_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo)
 {
 	const struct radix_cfg *state = ta_state;
 
 	tinfo->flags = IPFW_TATFLAGS_AFDATA | IPFW_TATFLAGS_AFITEM;
 
 	/* IPv4 part */
 	tinfo->taclass4 = IPFW_TACLASS_RADIX;
 	tinfo->itemsize4 = sizeof(struct radix_addr_entry);
 	tinfo->count4 = state->count4;
 
 	/* IPv6 part */
 	tinfo->taclass6 = IPFW_TACLASS_RADIX;
 	tinfo->itemsize6 = sizeof(struct radix_addr_xentry);
 	tinfo->count6 = state->count6;
 }
 
 /*
  * -dump_tentry callback of addr:radix: export entry @e into @tent.
  *
  * The family is taken from the family field of the stored key.  An AF_INET
  * entry is exported as IPv4; anything else is exported as IPv6 when the
  * kernel is built with INET6 and is left untouched otherwise.
  * Always returns 0.
  */
 static int
 ta_dump_radix_tentry(void *ta_state, struct table_info *ti, void *e,
     ipfw_obj_tentry *tent)
 {
 	struct radix_addr_entry *e4 = e;
 #ifdef INET6
 	struct radix_addr_xentry *e6;
 #endif
 
 	if (e4->addr.sin_family == AF_INET) {
 		tent->subtype = AF_INET;
 		tent->masklen = e4->masklen;
 		tent->k.addr.s_addr = e4->addr.sin_addr.s_addr;
 		tent->v.kidx = e4->value;
 		return (0);
 	}
 
 #ifdef INET6
 	e6 = (struct radix_addr_xentry *)e;
 	tent->subtype = AF_INET6;
 	tent->masklen = e6->masklen;
 	memcpy(&tent->k.addr6, &e6->addr6.sin6_addr, sizeof(struct in6_addr));
 	tent->v.kidx = e6->value;
 #endif
 
 	return (0);
 }
 
 /*
  * -find_tentry callback of addr:radix.
  *
  * Builds a key from the address in @tent (IPv4 when the subtype is
  * AF_INET, IPv6 for every other subtype), matches it against the
  * corresponding head and, on a hit, rewrites @tent from the entry found.
  * Returns 0 on a hit and ENOENT otherwise.
  */
 static int
 ta_find_radix_tentry(void *ta_state, struct table_info *ti,
     ipfw_obj_tentry *tent)
 {
 	struct radix_node_head *head;
 	void *hit;
 
 	if (tent->subtype != AF_INET) {
 		struct sa_in6 k6;
 
 		KEY_LEN(k6) = KEY_LEN_INET6;
 		memcpy(&k6.sin6_addr, &tent->k.addr6, sizeof(struct in6_addr));
 		head = (struct radix_node_head *)ti->xstate;
 		hit = head->rnh_matchaddr(&k6, &head->rh);
 	} else {
 		struct sockaddr_in k4;
 
 		KEY_LEN(k4) = KEY_LEN_INET;
 		k4.sin_addr.s_addr = tent->k.addr.s_addr;
 		head = (struct radix_node_head *)ti->state;
 		hit = head->rnh_matchaddr(&k4, &head->rh);
 	}
 
 	if (hit == NULL)
 		return (ENOENT);
 
 	ta_dump_radix_tentry(ta_state, ti, hit, tent);
 	return (0);
 }
 
 /*
  * -foreach callback of addr:radix: walk the IPv4 tree and then the IPv6
  * tree, passing @f (cast to the tree-walk callback type) and @arg to the
  * walker of each head.
  */
 static void
 ta_foreach_radix(void *ta_state, struct table_info *ti, ta_foreach_f *f,
     void *arg)
 {
 	struct radix_node_head *head;
 
 	head = (struct radix_node_head *)(ti->state);
 	head->rnh_walktree(&head->rh, (walktree_f_t *)f, arg);
 
 	head = (struct radix_node_head *)(ti->xstate);
 	head->rnh_walktree(&head->rh, (walktree_f_t *)f, arg);
 }
 
 
 #ifdef INET6
 static inline void ipv6_writemask(struct in6_addr *addr6, uint8_t mask);
 
 /*
  * Write the network-order IPv6 netmask for prefix length @mask to @addr6.
  *
  * All 16 bytes of @addr6 are written: the leading @mask bits are set and
  * every remaining bit is cleared, so the destination does not have to be
  * zeroed by the caller (several callers pass an uninitialized stack
  * variable).  Prefix lengths above 128 are treated as 128 so the buffer
  * is never overrun.
  */
 static inline void
 ipv6_writemask(struct in6_addr *addr6, uint8_t mask)
 {
 	uint8_t *bytes;
 	size_t i;
 
 	if (mask > 128)
 		mask = 128;
 
 	bytes = addr6->s6_addr;
 	for (i = 0; i < sizeof(addr6->s6_addr); i++) {
 		if (mask >= 8) {
 			/* Byte fully covered by the prefix */
 			bytes[i] = 0xFF;
 			mask -= 8;
 		} else if (mask > 0) {
 			/* Last, partially covered byte: keep the top bits */
 			bytes[i] = (uint8_t)(0xFF << (8 - mask));
 			mask = 0;
 		} else {
 			/* Past the prefix: host bits are always cleared */
 			bytes[i] = 0;
 		}
 	}
 }
 #endif
 
 /*
  * Convert the address and prefix length of @tei into the length-prefixed
  * key @sa and mask @ma used by the radix code.
  *
  * The key receives the address with the host bits cleared.  @set_mask is
  * set to 1 when the prefix is shorter than the full address length (32 for
  * AF_INET, 128 for AF_INET6) and to 0 for host entries.  For any other
  * subtype, or when support for the family is compiled out, nothing is
  * written at all, @set_mask included.
  *
  * The prefix length is not validated here; the callers in this file
  * reject out-of-range values before calling.
  */
 static void
 tei_to_sockaddr_ent(struct tentry_info *tei, struct sockaddr *sa,
     struct sockaddr *ma, int *set_mask)
 {
 	int mlen;
 #ifdef INET
 	struct sockaddr_in *addr, *mask;
 	in_addr_t a4;
 #endif
 #ifdef INET6
 	struct sa_in6 *addr6, *mask6;
 #endif
 
 	mlen = tei->masklen;
 
 	if (tei->subtype == AF_INET) {
 #ifdef INET
 		addr = (struct sockaddr_in *)sa;
 		mask = (struct sockaddr_in *)ma;
 		/* Set 'total' structure length */
 		KEY_LEN(*addr) = KEY_LEN_INET;
 		KEY_LEN(*mask) = KEY_LEN_INET;
 		addr->sin_family = AF_INET;
 		/*
 		 * Unsigned arithmetic: a /1 prefix shifts by 31 bits, which
 		 * overflows a signed int.
 		 */
 		mask->sin_addr.s_addr =
 		    htonl(mlen ? ~((1U << (32 - mlen)) - 1) : 0);
 		a4 = *((in_addr_t *)tei->paddr);
 		addr->sin_addr.s_addr = a4 & mask->sin_addr.s_addr;
 		*set_mask = (mlen != 32) ? 1 : 0;
 #endif
 #ifdef INET6
 	} else if (tei->subtype == AF_INET6) {
 		/* IPv6 case */
 		addr6 = (struct sa_in6 *)sa;
 		mask6 = (struct sa_in6 *)ma;
 		/* Set 'total' structure length */
 		KEY_LEN(*addr6) = KEY_LEN_INET6;
 		KEY_LEN(*mask6) = KEY_LEN_INET6;
 		addr6->sin6_family = AF_INET6;
 		ipv6_writemask(&mask6->sin6_addr, mlen);
 		memcpy(&addr6->sin6_addr, tei->paddr, sizeof(struct in6_addr));
 		APPLY_MASK(&addr6->sin6_addr, &mask6->sin6_addr);
 		*set_mask = (mlen != 128) ? 1 : 0;
 #endif
 	}
 }
 
 /*
  * -prepare_add callback of addr:radix.
  *
  * Validates the prefix length, allocates a zeroed entry of the right
  * family and remembers it in the scratch buffer.  The key is built
  * directly inside the new entry while the mask lives in the scratch
  * buffer; the mask pointer is only recorded for non-host prefixes.
  *
  * Returns 0 on success, EINVAL for an over-long prefix or a subtype that
  * is not handled.
  */
 static int
 ta_prepare_add_radix(struct ip_fw_chain *ch, struct tentry_info *tei,
     void *ta_buf)
 {
 	struct ta_buf_radix *buf = ta_buf;
 	struct radix_addr_entry *e4;
 #ifdef INET6
 	struct radix_addr_xentry *e6;
 #endif
 	struct sockaddr *key, *netmask;
 	int plen, need_mask;
 
 	need_mask = 0;
 	plen = tei->masklen;
 
 	if (tei->subtype == AF_INET) {
 #ifdef INET
 		if (plen > 32)
 			return (EINVAL);
 		e4 = malloc(sizeof(struct radix_addr_entry), M_IPFW_TBL,
 		    M_WAITOK | M_ZERO);
 		e4->masklen = plen;
 		buf->ent_ptr = e4;
 
 		key = (struct sockaddr *)&e4->addr;
 		netmask = (struct sockaddr *)&buf->addr.a4.ma;
 #endif
 #ifdef INET6
 	} else if (tei->subtype == AF_INET6) {
 		if (plen > 128)
 			return (EINVAL);
 		e6 = malloc(sizeof(struct radix_addr_xentry), M_IPFW_TBL,
 		    M_WAITOK | M_ZERO);
 		e6->masklen = plen;
 		buf->ent_ptr = e6;
 
 		key = (struct sockaddr *)&e6->addr6;
 		netmask = (struct sockaddr *)&buf->addr.a6.ma;
 #endif
 	} else {
 		/* Neither IPv4 nor IPv6 */
 		return (EINVAL);
 	}
 
 	tei_to_sockaddr_ent(tei, key, netmask, &need_mask);
 
 	/* Publish the pointers for the add step */
 	buf->addr_ptr = key;
 	if (need_mask != 0)
 		buf->mask_ptr = netmask;
 
 	return (0);
 }
 
 /*
  * -add callback of addr:radix.
  *
  * Stores the value from @tei in the prepared entry and looks the key up.
  * If the key is already present the call fails with EEXIST unless
  * TEI_FLAGS_UPDATE is set, in which case the stored value and tei->value
  * are swapped, TEI_FLAGS_UPDATED is raised and @pnum is set to 0.
  * Otherwise the prepared entry is linked into the tree (EFBIG when
  * TEI_FLAGS_DONTADD is set, EINVAL when the radix code refuses it), the
  * per-family counter is bumped, the scratch pointer is cleared so the
  * entry is not freed by the flush step, and @pnum is set to 1.
  */
 static int
 ta_add_radix(void *ta_state, struct table_info *ti, struct tentry_info *tei,
     void *ta_buf, uint32_t *pnum)
 {
 	struct radix_cfg *state = ta_state;
 	struct ta_buf_radix *buf = ta_buf;
 	struct radix_node_head *head;
 	struct radix_node *node;
 	uint32_t *slot, previous;
 
 	/* Pick the tree and copy the new value into the prepared entry */
 	if (tei->subtype != AF_INET) {
 		head = ti->xstate;
 		((struct radix_addr_xentry *)buf->ent_ptr)->value = tei->value;
 	} else {
 		head = ti->state;
 		((struct radix_addr_entry *)buf->ent_ptr)->value = tei->value;
 	}
 
 	node = head->rnh_lookup(buf->addr_ptr, buf->mask_ptr, &head->rh);
 	if (node == NULL) {
 		/* New key */
 		if ((tei->flags & TEI_FLAGS_DONTADD) != 0)
 			return (EFBIG);
 
 		node = head->rnh_addaddr(buf->addr_ptr, buf->mask_ptr,
 		    &head->rh, buf->ent_ptr);
 		if (node == NULL) {
 			/* The radix code gives no reason */
 			return (EINVAL);
 		}
 
 		if (tei->subtype != AF_INET)
 			state->count6++;
 		else
 			state->count4++;
 		buf->ent_ptr = NULL;
 		*pnum = 1;
 
 		return (0);
 	}
 
 	/* Key is already in the tree */
 	if ((tei->flags & TEI_FLAGS_UPDATE) == 0)
 		return (EEXIST);
 
 	if (tei->subtype != AF_INET)
 		slot = &((struct radix_addr_xentry *)node)->value;
 	else
 		slot = &((struct radix_addr_entry *)node)->value;
 
 	/* Swap: the tree gets the new value, @tei gets the old one */
 	previous = *slot;
 	*slot = tei->value;
 	tei->value = previous;
 
 	tei->flags |= TEI_FLAGS_UPDATED;
 	*pnum = 0;
 
 	return (0);
 }
 
 /*
  * -prepare_del callback of addr:radix.
  *
  * Validates the prefix length and builds key and mask inside the scratch
  * buffer; the mask pointer is only recorded for non-host prefixes.
  * Returns 0 on success, EINVAL for an over-long prefix or a subtype that
  * is not handled.
  */
 static int
 ta_prepare_del_radix(struct ip_fw_chain *ch, struct tentry_info *tei,
     void *ta_buf)
 {
 	struct ta_buf_radix *buf = ta_buf;
 	struct sockaddr *key, *netmask;
 	int plen, need_mask;
 
 	need_mask = 0;
 	plen = tei->masklen;
 
 	if (tei->subtype == AF_INET) {
 		if (plen > 32)
 			return (EINVAL);
 
 		netmask = (struct sockaddr *)&buf->addr.a4.ma;
 		key = (struct sockaddr *)&buf->addr.a4.sa;
 #ifdef INET6
 	} else if (tei->subtype == AF_INET6) {
 		if (plen > 128)
 			return (EINVAL);
 
 		netmask = (struct sockaddr *)&buf->addr.a6.ma;
 		key = (struct sockaddr *)&buf->addr.a6.sa;
 #endif
 	} else {
 		return (EINVAL);
 	}
 
 	tei_to_sockaddr_ent(tei, key, netmask, &need_mask);
 
 	buf->addr_ptr = key;
 	if (need_mask != 0)
 		buf->mask_ptr = netmask;
 
 	return (0);
 }
 
 /*
  * -del callback of addr:radix.
  *
  * Unlinks the prepared key from the tree of the matching family.  On
  * success the removed entry's value is reported through tei->value, the
  * entry itself is parked in the scratch buffer (the flush step frees it),
  * the per-family counter is decremented and @pnum is set to 1.
  * Returns ENOENT when the key is not in the tree.
  */
 static int
 ta_del_radix(void *ta_state, struct table_info *ti, struct tentry_info *tei,
     void *ta_buf, uint32_t *pnum)
 {
 	struct radix_cfg *state = ta_state;
 	struct ta_buf_radix *buf = ta_buf;
 	struct radix_node_head *head;
 	struct radix_node *node;
 
 	if (tei->subtype != AF_INET)
 		head = ti->xstate;
 	else
 		head = ti->state;
 
 	node = head->rnh_deladdr(buf->addr_ptr, buf->mask_ptr, &head->rh);
 	if (node == NULL)
 		return (ENOENT);
 
 	if (tei->subtype != AF_INET) {
 		tei->value = ((struct radix_addr_xentry *)node)->value;
 		state->count6--;
 	} else {
 		tei->value = ((struct radix_addr_entry *)node)->value;
 		state->count4--;
 	}
 
 	buf->ent_ptr = node;
 	*pnum = 1;
 
 	return (0);
 }
 
 /*
  * -flush_entry callback of addr:radix: free whatever entry is still
  * parked in the scratch buffer (an unused prepared entry or an entry
  * removed from the tree).
  */
 static void
 ta_flush_radix_entry(struct ip_fw_chain *ch, struct tentry_info *tei,
     void *ta_buf)
 {
 	struct ta_buf_radix *buf = ta_buf;
 
 	if (buf->ent_ptr == NULL)
 		return;
 
 	free(buf->ent_ptr, M_IPFW_TBL);
 }
 
 /*
  * -need_modify callback of addr:radix.
  *
  * Always returns 0: apart from the entries themselves the radix tree
  * needs no extra allocations that could be prepared in advance.  Adding
  * a new mask to the tree does allocate, but there is no API to do that
  * ahead of time and the required sizes are not known here.
  */
 static int
 ta_need_modify_radix(void *ta_state, struct table_info *ti, uint32_t count,
     uint64_t *pflags)
 {
 
 	return (0);
 }
 
 /*
  * Method table of the "addr:radix" algorithm (flagged as a default
  * algorithm for IPFW_TABLE_ADDR tables).
  */
 struct table_algo addr_radix = {
 	/* Identification */
 	.name		= "addr:radix",
 	.type		= IPFW_TABLE_ADDR,
 	.flags		= TA_FLAG_DEFAULT,
 	.ta_buf_size	= sizeof(struct ta_buf_radix),
 
 	/* Table life cycle */
 	.init		= ta_init_radix,
 	.destroy	= ta_destroy_radix,
 	.need_modify	= ta_need_modify_radix,
 
 	/* Entry modification */
 	.prepare_add	= ta_prepare_add_radix,
 	.add		= ta_add_radix,
 	.prepare_del	= ta_prepare_del_radix,
 	.del		= ta_del_radix,
 	.flush_entry	= ta_flush_radix_entry,
 
 	/* Inspection */
 	.foreach	= ta_foreach_radix,
 	.find_tentry	= ta_find_radix_tentry,
 	.dump_tentry	= ta_dump_radix_tentry,
 	.dump_tinfo	= ta_dump_radix_tinfo,
 };
 
 
 /*
  * addr:hash cmds
  *
  *
  * ti->data:
  * [inv.mask4][inv.mask6][log2hsize4][log2hsize6]
  * [        8][        8][         8][         8]
  *
  * inv.mask4: 32 - mask
  * inv.mask6:
  * 1) _slow lookup: mask
  * 2) _aligned: mask / 8 (number of address bytes covered by the mask)
  * 3) _64: 128 - mask (i.e. 64; not read by the /64 lookup)
  *
  *
  * pflags:
  * [v4=1/v6=0][hsize]
  * [       32][   32]
  */
 
 /* Forward declaration so the bucket list head can be declared first. */
 struct chashentry;
 
 /* Hash bucket: head of a singly-linked list of struct chashentry. */
 SLIST_HEAD(chashbhead, chashentry);
 
 /* Per-table state of addr:hash (the @ta_state of its callbacks). */
 struct chash_cfg {
 	struct chashbhead *head4;	/* IPv4 bucket array */
 	struct chashbhead *head6;	/* IPv6 bucket array */
 	size_t	size4;			/* number of IPv4 buckets */
 	size_t	size6;			/* number of IPv6 buckets */
 	size_t	items4;			/* number of IPv4 entries */
 	size_t	items6;			/* number of IPv6 entries */
 	uint8_t	mask4;			/* prefix length shared by all IPv4 entries */
 	uint8_t	mask6;			/* prefix length shared by all IPv6 entries */
 };
 
 /* One entry of an addr:hash table, linked into its bucket list. */
 struct chashentry {
 	SLIST_ENTRY(chashentry)	next;
 	uint32_t	value;		/* value returned by lookups */
 	uint32_t	type;		/* AF_INET or AF_INET6 */
 	union {
 		uint32_t	a4;	/* Host format */
 		struct in6_addr	a6;	/* Network format */
 	} a;
 };
 
 /* Scratch buffer shared by the prepare/add/del callbacks of addr:hash. */
 struct ta_buf_chash
 {
 	/* Entry allocated by prepare_add, or entry unlinked by del. */
 	void *ent_ptr;
 	/* Search key filled in by prepare_del. */
 	struct chashentry ent;
 };
 
 /* Forward declarations of the addr:hash helpers and callbacks. */
 #ifdef INET
 static __inline uint32_t hash_ip(uint32_t addr, int hsize);
 #endif
 #ifdef INET6
 static __inline uint32_t hash_ip6(struct in6_addr *addr6, int hsize);
 static __inline uint16_t hash_ip64(struct in6_addr *addr6, int hsize);
 static __inline uint32_t hash_ip6_slow(struct in6_addr *addr6, void *key,
     int mask, int hsize);
 static __inline uint32_t hash_ip6_al(struct in6_addr *addr6, void *key, int mask,
     int hsize);
 #endif
 static int ta_lookup_chash_slow(struct table_info *ti, void *key, uint32_t keylen,
     uint32_t *val);
 static int ta_lookup_chash_aligned(struct table_info *ti, void *key,
     uint32_t keylen, uint32_t *val);
 static int ta_lookup_chash_64(struct table_info *ti, void *key, uint32_t keylen,
     uint32_t *val);
 static int chash_parse_opts(struct chash_cfg *cfg, char *data);
 static void ta_print_chash_config(void *ta_state, struct table_info *ti,
     char *buf, size_t bufsize);
 static int ta_log2(uint32_t v);
 static int ta_init_chash(struct ip_fw_chain *ch, void **ta_state,
     struct table_info *ti, char *data, uint8_t tflags);
 static void ta_destroy_chash(void *ta_state, struct table_info *ti);
 static void ta_dump_chash_tinfo(void *ta_state, struct table_info *ti,
     ipfw_ta_tinfo *tinfo);
 static int ta_dump_chash_tentry(void *ta_state, struct table_info *ti,
     void *e, ipfw_obj_tentry *tent);
 static uint32_t hash_ent(struct chashentry *ent, int af, int mlen,
     uint32_t size);
 static int tei_to_chash_ent(struct tentry_info *tei, struct chashentry *ent);
 static int ta_find_chash_tentry(void *ta_state, struct table_info *ti,
     ipfw_obj_tentry *tent);
 static void ta_foreach_chash(void *ta_state, struct table_info *ti,
     ta_foreach_f *f, void *arg);
 static int ta_prepare_add_chash(struct ip_fw_chain *ch, struct tentry_info *tei,
     void *ta_buf);
 static int ta_add_chash(void *ta_state, struct table_info *ti,
     struct tentry_info *tei, void *ta_buf, uint32_t *pnum);
 static int ta_prepare_del_chash(struct ip_fw_chain *ch, struct tentry_info *tei,
     void *ta_buf);
 static int ta_del_chash(void *ta_state, struct table_info *ti,
     struct tentry_info *tei, void *ta_buf, uint32_t *pnum);
 static void ta_flush_chash_entry(struct ip_fw_chain *ch, struct tentry_info *tei,
     void *ta_buf);
 static int ta_need_modify_chash(void *ta_state, struct table_info *ti,
     uint32_t count, uint64_t *pflags);
 static int ta_prepare_mod_chash(void *ta_buf, uint64_t *pflags);
 static int ta_fill_mod_chash(void *ta_state, struct table_info *ti, void *ta_buf,
     uint64_t *pflags);
 static void ta_modify_chash(void *ta_state, struct table_info *ti, void *ta_buf,
     uint64_t pflags);
 static void ta_flush_mod_chash(void *ta_buf);
 
 
 #ifdef INET
 /*
  * Bucket index for the (already shifted) IPv4 key @addr: the key modulo
  * (@hsize - 1).  @hsize must therefore be at least 2.
  */
 static __inline uint32_t
 hash_ip(uint32_t addr, int hsize)
 {
 	uint32_t modulus;
 
 	modulus = hsize - 1;
 
 	return (addr % modulus);
 }
 #endif
 
 #ifdef INET6
 /*
  * Bucket index for an IPv6 address: XOR of its four 32-bit words modulo
  * (@hsize - 1).
  */
 static __inline uint32_t
 hash_ip6(struct in6_addr *addr6, int hsize)
 {
 	uint32_t folded;
 
 	folded = addr6->s6_addr32[0];
 	folded ^= addr6->s6_addr32[1];
 	folded ^= addr6->s6_addr32[2];
 	folded ^= addr6->s6_addr32[3];
 
 	return (folded % (hsize - 1));
 }
 
 
 /*
  * Bucket index for the upper 64 bits of an IPv6 address: XOR of the first
  * two 32-bit words modulo (@hsize - 1), returned as a 16-bit value.
  */
 static __inline uint16_t
 hash_ip64(struct in6_addr *addr6, int hsize)
 {
 	uint32_t folded;
 
 	folded = addr6->s6_addr32[0];
 	folded ^= addr6->s6_addr32[1];
 
 	return (folded % (hsize - 1));
 }
 
 
 /*
  * Hash for IPv6 tables with an arbitrary prefix length: copies @key into
  * @addr6, masks it with a /@mask netmask and returns the bucket index of
  * the masked address.  @addr6 is left holding the masked address.
  */
 static __inline uint32_t
 hash_ip6_slow(struct in6_addr *addr6, void *key, int mask, int hsize)
 {
 	struct in6_addr netmask;
 
 	memcpy(addr6, key, sizeof(struct in6_addr));
 	ipv6_writemask(&netmask, mask);
 	APPLY_MASK(addr6, &netmask);
 
 	return (hash_ip6(addr6, hsize));
 }
 
 /*
  * Hash for IPv6 tables whose prefix length is a whole number of bytes:
  * clears @addr6, copies the first @mask bytes of @key into it (@mask is a
  * byte count here) and returns the bucket index of the result.
  * @addr6 is left holding the truncated address.
  */
 static __inline uint32_t
 hash_ip6_al(struct in6_addr *addr6, void *key, int mask, int hsize)
 {
 
 	memset(addr6, 0, sizeof(struct in6_addr));
 	memcpy(addr6, key, mask);
 
 	return (hash_ip6(addr6, hsize));
 }
 #endif
 
 /*
  * Lookup callback of addr:hash for IPv6 prefix lengths that are not a
  * multiple of 8.
  *
  * A @keylen equal to sizeof(in_addr_t) selects the IPv4 buckets in
  * ti->state; any other length is handled as IPv6 and uses ti->xstate.
  * Shift/mask and bucket counts are unpacked from ti->data.
  * Returns 1 and stores the entry value in @val on a match, 0 otherwise.
  */
 static int
 ta_lookup_chash_slow(struct table_info *ti, void *key, uint32_t keylen,
     uint32_t *val)
 {
 	struct chashbhead *buckets;
 	struct chashentry *cur;
 	uint16_t slot, nbuckets;
 	uint8_t bits;
 
 	if (keylen == sizeof(in_addr_t)) {
 #ifdef INET
 		uint32_t host4;
 
 		buckets = (struct chashbhead *)ti->state;
 		bits = ti->data >> 24;
 		nbuckets = 1 << ((ti->data & 0xFFFF) >> 8);
 		/* Host-order address with the host bits shifted out */
 		host4 = ntohl(*((in_addr_t *)key));
 		host4 = host4 >> bits;
 		slot = hash_ip(host4, nbuckets);
 		SLIST_FOREACH(cur, &buckets[slot], next) {
 			if (cur->a.a4 != host4)
 				continue;
 			*val = cur->value;
 			return (1);
 		}
 #endif
 	} else {
 #ifdef INET6
 		/* Arbitrary prefix length: mask the address bit by bit */
 		struct in6_addr masked;
 
 		buckets = (struct chashbhead *)ti->xstate;
 		bits = (ti->data & 0xFF0000) >> 16;
 		nbuckets = 1 << (ti->data & 0xFF);
 		slot = hash_ip6_slow(&masked, key, bits, nbuckets);
 		SLIST_FOREACH(cur, &buckets[slot], next) {
 			if (memcmp(&cur->a.a6, &masked, 16) != 0)
 				continue;
 			*val = cur->value;
 			return (1);
 		}
 #endif
 	}
 
 	return (0);
 }
 
 /*
  * Lookup callback of addr:hash for IPv6 prefix lengths that are a whole
  * number of bytes.
  *
  * A @keylen equal to sizeof(in_addr_t) selects the IPv4 buckets in
  * ti->state; any other length is handled as IPv6 and uses ti->xstate.
  * Returns 1 and stores the entry value in @val on a match, 0 otherwise.
  */
 static int
 ta_lookup_chash_aligned(struct table_info *ti, void *key, uint32_t keylen,
     uint32_t *val)
 {
 	struct chashbhead *buckets;
 	struct chashentry *cur;
 	uint16_t slot, nbuckets;
 	uint8_t bits;
 
 	if (keylen == sizeof(in_addr_t)) {
 #ifdef INET
 		uint32_t host4;
 
 		buckets = (struct chashbhead *)ti->state;
 		bits = ti->data >> 24;
 		nbuckets = 1 << ((ti->data & 0xFFFF) >> 8);
 		/* Host-order address with the host bits shifted out */
 		host4 = ntohl(*((in_addr_t *)key));
 		host4 = host4 >> bits;
 		slot = hash_ip(host4, nbuckets);
 		SLIST_FOREACH(cur, &buckets[slot], next) {
 			if (cur->a.a4 != host4)
 				continue;
 			*val = cur->value;
 			return (1);
 		}
 #endif
 	} else {
 #ifdef INET6
 		/* Byte-aligned prefix: keep the leading bytes only */
 		struct in6_addr trimmed;
 		uint64_t *want, *have;
 
 		buckets = (struct chashbhead *)ti->xstate;
 		bits = (ti->data & 0xFF0000) >> 16;
 		nbuckets = 1 << (ti->data & 0xFF);
 
 		slot = hash_ip6_al(&trimmed, key, bits, nbuckets);
 		want = (uint64_t *)&trimmed;
 		SLIST_FOREACH(cur, &buckets[slot], next) {
 			have = (uint64_t *)&cur->a.a6;
 			if (want[0] != have[0] || want[1] != have[1])
 				continue;
 			*val = cur->value;
 			return (1);
 		}
 #endif
 	}
 
 	return (0);
 }
 
 /*
  * Lookup callback of addr:hash for an IPv6 prefix length of exactly 64:
  * only the first 8 bytes of the key are hashed and compared.
  *
  * A @keylen equal to sizeof(in_addr_t) selects the IPv4 buckets in
  * ti->state; any other length is handled as IPv6 and uses ti->xstate.
  * Returns 1 and stores the entry value in @val on a match, 0 otherwise.
  */
 static int
 ta_lookup_chash_64(struct table_info *ti, void *key, uint32_t keylen,
     uint32_t *val)
 {
 	struct chashbhead *buckets;
 	struct chashentry *cur;
 	uint16_t slot, nbuckets;
 	uint8_t bits;
 
 	if (keylen == sizeof(in_addr_t)) {
 #ifdef INET
 		uint32_t host4;
 
 		buckets = (struct chashbhead *)ti->state;
 		bits = ti->data >> 24;
 		nbuckets = 1 << ((ti->data & 0xFFFF) >> 8);
 		/* Host-order address with the host bits shifted out */
 		host4 = ntohl(*((in_addr_t *)key));
 		host4 = host4 >> bits;
 		slot = hash_ip(host4, nbuckets);
 		SLIST_FOREACH(cur, &buckets[slot], next) {
 			if (cur->a.a4 != host4)
 				continue;
 			*val = cur->value;
 			return (1);
 		}
 #endif
 	} else {
 #ifdef INET6
 		/* /64: the upper half of the address is the whole key */
 		uint64_t upper, *half;
 
 		buckets = (struct chashbhead *)ti->xstate;
 		half = (uint64_t *)key;
 		nbuckets = 1 << (ti->data & 0xFF);
 		upper = *half;
 		slot = hash_ip64((struct in6_addr *)key, nbuckets);
 		SLIST_FOREACH(cur, &buckets[slot], next) {
 			half = (uint64_t *)&cur->a.a6;
 			if (upper != *half)
 				continue;
 			*val = cur->value;
 			return (1);
 		}
 #endif
 	}
 
 	return (0);
 }
 
 /*
  * Parse the optional "masks=/XX[,/YY]" option of addr:hash.
  *
  * @data is the algorithm string ("addr:hash [masks=/XX[,/YY]]").  Nothing
  * is changed and 0 is returned when @data is NULL or contains no space.
  * Otherwise the first word after the algorithm name has to be a "masks="
  * option; XX becomes cfg->mask4 (0..32) and the optional YY becomes
  * cfg->mask6 (0..128).  A mask that is not given keeps the value already
  * stored in @cfg.
  *
  * The numbers are kept as long until they have been range-checked so that
  * huge values cannot wrap into the valid range, and a field without any
  * digits is rejected instead of being read as 0.
  *
  * Note that @data is modified: the space that follows the option, if any,
  * is overwritten with a NUL byte.
  *
  * Returns 0 on success and EINVAL on malformed input.
  */
 static int
 chash_parse_opts(struct chash_cfg *cfg, char *data)
 {
 	char *pdel, *pend, *s;
 	long mask4, mask6;
 
 	mask4 = cfg->mask4;
 	mask6 = cfg->mask6;
 
 	if (data == NULL)
 		return (0);
 	if ((pdel = strchr(data, ' ')) == NULL)
 		return (0);
 	while (*pdel == ' ')
 		pdel++;
 	if (strncmp(pdel, "masks=", 6) != 0)
 		return (EINVAL);
 	/* Cut the option off from whatever follows it */
 	if ((s = strchr(pdel, ' ')) != NULL)
 		*s = '\0';
 
 	pdel += 6;
 	/* Need /XX[,/YY] */
 	if (*pdel++ != '/')
 		return (EINVAL);
 	mask4 = strtol(pdel, &pend, 10);
 	if (pend == pdel)
 		return (EINVAL);	/* no digits after '/' */
 	if (*pend == ',') {
 		/* ,/YY */
 		pdel = pend + 1;
 		if (*pdel++ != '/')
 			return (EINVAL);
 		mask6 = strtol(pdel, &pend, 10);
 		if (pend == pdel || *pend != '\0')
 			return (EINVAL);
 	} else if (*pend != '\0')
 		return (EINVAL);
 
 	if (mask4 < 0 || mask4 > 32 || mask6 < 0 || mask6 > 128)
 		return (EINVAL);
 
 	/* Both values are known to fit now */
 	cfg->mask4 = (uint8_t)mask4;
 	cfg->mask6 = (uint8_t)mask6;
 
 	return (0);
 }
 
 /*
  * -print_config callback of addr:hash: writes the algorithm name to @buf,
  * followed by a "masks=" option unless both masks are at their full
  * lengths (32 and 128).
  */
 static void
 ta_print_chash_config(void *ta_state, struct table_info *ti, char *buf,
     size_t bufsize)
 {
 	struct chash_cfg *state = ta_state;
 
 	if (state->mask4 == 32 && state->mask6 == 128) {
 		snprintf(buf, bufsize, "%s", "addr:hash");
 		return;
 	}
 
 	snprintf(buf, bufsize, "%s masks=/%d,/%d", "addr:hash",
 	    state->mask4, state->mask6);
 }
 
 /*
  * Integer base-2 logarithm of @v rounded down, i.e. the index of the
  * highest set bit.  Returns 0 for both 0 and 1.
  */
 static int
 ta_log2(uint32_t v)
 {
 	uint32_t bits;
 
 	for (bits = 0; (v >>= 1) != 0; bits++)
 		continue;
 
 	return (bits);
 }
 
 /*
  * New table.
  * We assume 'data' to be either NULL or the following format:
  * 'addr:hash [masks=/32[,/128]]'
  */
 static int
 ta_init_chash(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti,
     char *data, uint8_t tflags)
 {
 	int error, i;
 	uint32_t hsize;
 	struct chash_cfg *cfg;
 
 	cfg = malloc(sizeof(struct chash_cfg), M_IPFW, M_WAITOK | M_ZERO);
 
 	cfg->mask4 = 32;
 	cfg->mask6 = 128;
 
 	if ((error = chash_parse_opts(cfg, data)) != 0) {
 		free(cfg, M_IPFW);
 		return (error);
 	}
 
 	cfg->size4 = 128;
 	cfg->size6 = 128;
 
 	cfg->head4 = malloc(sizeof(struct chashbhead) * cfg->size4, M_IPFW,
 	    M_WAITOK | M_ZERO);
 	cfg->head6 = malloc(sizeof(struct chashbhead) * cfg->size6, M_IPFW,
 	    M_WAITOK | M_ZERO);
 	for (i = 0; i < cfg->size4; i++)
 		SLIST_INIT(&cfg->head4[i]);
 	for (i = 0; i < cfg->size6; i++)
 		SLIST_INIT(&cfg->head6[i]);
 
 
 	*ta_state = cfg;
 	ti->state = cfg->head4;
 	ti->xstate = cfg->head6;
 
 	/* Store data depending on v6 mask length */
 	hsize = ta_log2(cfg->size4) << 8 | ta_log2(cfg->size6);
 	if (cfg->mask6 == 64) {
 		ti->data = (32 - cfg->mask4) << 24 | (128 - cfg->mask6) << 16|
 		    hsize;
 		ti->lookup = ta_lookup_chash_64;
 	} else if ((cfg->mask6  % 8) == 0) {
 		ti->data = (32 - cfg->mask4) << 24 |
 		    cfg->mask6 << 13 | hsize;
 		ti->lookup = ta_lookup_chash_aligned;
 	} else {
 		/* don't do that! */
 		ti->data = (32 - cfg->mask4) << 24 |
 		    cfg->mask6 << 16 | hsize;
 		ti->lookup = ta_lookup_chash_slow;
 	}
 
 	return (0);
 }
 
 static void
 ta_destroy_chash(void *ta_state, struct table_info *ti)
 {
 	struct chash_cfg *cfg;
 	struct chashentry *ent, *ent_next;
 	int i;
 
 	cfg = (struct chash_cfg *)ta_state;
 
 	for (i = 0; i < cfg->size4; i++)
 		SLIST_FOREACH_SAFE(ent, &cfg->head4[i], next, ent_next)
 			free(ent, M_IPFW_TBL);
 
 	for (i = 0; i < cfg->size6; i++)
 		SLIST_FOREACH_SAFE(ent, &cfg->head6[i], next, ent_next)
 			free(ent, M_IPFW_TBL);
 
 	free(cfg->head4, M_IPFW);
 	free(cfg->head6, M_IPFW);
 
 	free(cfg, M_IPFW);
 }
 
 /*
  * -dump_tinfo callback of addr:hash: report the algorithm class, bucket
  * counts, entry counts and entry size for each address family.
  */
 static void
 ta_dump_chash_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo)
 {
 	const struct chash_cfg *state = ta_state;
 
 	tinfo->flags = IPFW_TATFLAGS_AFDATA | IPFW_TATFLAGS_AFITEM;
 
 	/* IPv4 part */
 	tinfo->taclass4 = IPFW_TACLASS_HASH;
 	tinfo->itemsize4 = sizeof(struct chashentry);
 	tinfo->size4 = state->size4;
 	tinfo->count4 = state->items4;
 
 	/* IPv6 part */
 	tinfo->taclass6 = IPFW_TACLASS_HASH;
 	tinfo->itemsize6 = sizeof(struct chashentry);
 	tinfo->size6 = state->size6;
 	tinfo->count6 = state->items6;
 }
 
 /*
  * -dump_tentry callback of addr:hash: export entry @e into @tent.
  *
  * IPv4 keys are stored shifted right, so they are shifted back by
  * (32 - mask4) and converted to network order.  Entries of any other type
  * are exported as IPv6 when the kernel is built with INET6 and left
  * untouched otherwise.  The prefix length reported is the table-wide mask.
  * Always returns 0.
  */
 static int
 ta_dump_chash_tentry(void *ta_state, struct table_info *ti, void *e,
     ipfw_obj_tentry *tent)
 {
 	struct chash_cfg *state = ta_state;
 	struct chashentry *item = e;
 
 	if (item->type == AF_INET) {
 		tent->subtype = AF_INET;
 		tent->masklen = state->mask4;
 		tent->k.addr.s_addr = htonl(item->a.a4 << (32 - state->mask4));
 		tent->v.kidx = item->value;
 		return (0);
 	}
 
 #ifdef INET6
 	tent->subtype = AF_INET6;
 	tent->masklen = state->mask6;
 	memcpy(&tent->k.addr6, &item->a.a6, sizeof(struct in6_addr));
 	tent->v.kidx = item->value;
 #endif
 
 	return (0);
 }
 
 /*
  * Bucket index of entry @ent in a table with @size buckets.
  * AF_INET entries use the IPv4 hash; everything else uses the /64 hash
  * when @mlen is 64 and the full IPv6 hash otherwise.  Returns 0 when
  * support for the family is compiled out.
  */
 static uint32_t
 hash_ent(struct chashentry *ent, int af, int mlen, uint32_t size)
 {
 	uint32_t bucket = 0;
 
 	if (af != AF_INET) {
 #ifdef INET6
 		if (mlen != 64)
 			bucket = hash_ip6(&ent->a.a6, size);
 		else
 			bucket = hash_ip64(&ent->a.a6, size);
 #endif
 	} else {
 #ifdef INET
 		bucket = hash_ip(ent->a.a4, size);
 #endif
 	}
 
 	return (bucket);
 }
 
 /*
  * Fill the type and key of hash entry @ent from @tei.
  *
  * IPv4 keys are stored in host order with the host bits shifted out;
  * IPv6 keys are stored in network order with the host bits cleared.
  * Returns 0 on success, EINVAL for an over-long prefix or a subtype that
  * is not handled.
  * NOTE(review): an IPv4 prefix length of 0 shifts a 32-bit value by 32
  * bits here.
  */
 static int
 tei_to_chash_ent(struct tentry_info *tei, struct chashentry *ent)
 {
 	int plen;
 #ifdef INET6
 	struct in6_addr netmask;
 #endif
 
 	plen = tei->masklen;
 
 	if (tei->subtype == AF_INET) {
 #ifdef INET
 		if (plen > 32)
 			return (EINVAL);
 
 		/* Host order, host bits dropped */
 		ent->a.a4 = ntohl(*((in_addr_t *)tei->paddr)) >> (32 - plen);
 		ent->type = AF_INET;
 #endif
 #ifdef INET6
 	} else if (tei->subtype == AF_INET6) {
 		if (plen > 128)
 			return (EINVAL);
 
 		/* Network order, host bits cleared */
 		memcpy(&ent->a.a6, tei->paddr, sizeof(struct in6_addr));
 		ipv6_writemask(&netmask, plen);
 		APPLY_MASK(&ent->a.a6, &netmask);
 		ent->type = AF_INET6;
 #endif
 	} else {
 		/* Neither IPv4 nor IPv6 */
 		return (EINVAL);
 	}
 
 	return (0);
 }
 
 /*
  * -find_tentry callback of addr:hash.
  *
  * Builds a search key from the address in @tent using the table-wide mask
  * of the family (IPv4 when the subtype is AF_INET, IPv6 for every other
  * subtype), scans the matching bucket and, on a hit, rewrites @tent from
  * the entry found.  Returns 0 on a hit, ENOENT when nothing matches, or
  * the error from building the key.
  */
 static int
 ta_find_chash_tentry(void *ta_state, struct table_info *ti,
     ipfw_obj_tentry *tent)
 {
 	struct chash_cfg *state = ta_state;
 	struct chashentry probe, *cur;
 	struct tentry_info tei;
 	uint32_t slot;
 	int rc;
 
 	memset(&probe, 0, sizeof(probe));
 	memset(&tei, 0, sizeof(tei));
 
 	if (tent->subtype != AF_INET) {
 		tei.subtype = AF_INET6;
 		tei.masklen = state->mask6;
 		tei.paddr = &tent->k.addr6;
 
 		rc = tei_to_chash_ent(&tei, &probe);
 		if (rc != 0)
 			return (rc);
 
 		slot = hash_ent(&probe, AF_INET6, state->mask6, state->size6);
 		SLIST_FOREACH(cur, &state->head6[slot], next) {
 			if (memcmp(&cur->a.a6, &probe.a.a6, 16) == 0) {
 				ta_dump_chash_tentry(ta_state, ti, cur, tent);
 				return (0);
 			}
 		}
 
 		return (ENOENT);
 	}
 
 	tei.subtype = AF_INET;
 	tei.masklen = state->mask4;
 	tei.paddr = &tent->k.addr;
 
 	rc = tei_to_chash_ent(&tei, &probe);
 	if (rc != 0)
 		return (rc);
 
 	slot = hash_ent(&probe, AF_INET, state->mask4, state->size4);
 	SLIST_FOREACH(cur, &state->head4[slot], next) {
 		if (cur->a.a4 == probe.a.a4) {
 			ta_dump_chash_tentry(ta_state, ti, cur, tent);
 			return (0);
 		}
 	}
 
 	return (ENOENT);
 }
 
 /*
  * -foreach callback of addr:hash: calls @f with @arg for every entry,
  * all IPv4 buckets first and then all IPv6 buckets.  The next pointer is
  * fetched before each call.
  */
 static void
 ta_foreach_chash(void *ta_state, struct table_info *ti, ta_foreach_f *f,
     void *arg)
 {
 	struct chash_cfg *state = ta_state;
 	struct chashentry *cur, *following;
 	int idx;
 
 	for (idx = 0; idx < state->size4; idx++) {
 		SLIST_FOREACH_SAFE(cur, &state->head4[idx], next, following) {
 			f(cur, arg);
 		}
 	}
 
 	for (idx = 0; idx < state->size6; idx++) {
 		SLIST_FOREACH_SAFE(cur, &state->head6[idx], next, following) {
 			f(cur, arg);
 		}
 	}
 }
 
 /*
  * -prepare_add callback of addr:hash: allocates a zeroed entry, fills its
  * key from @tei and parks it in the scratch buffer.  When the key cannot
  * be built the entry is freed again and the error is returned.
  */
 static int
 ta_prepare_add_chash(struct ip_fw_chain *ch, struct tentry_info *tei,
     void *ta_buf)
 {
 	struct ta_buf_chash *buf = ta_buf;
 	struct chashentry *fresh;
 	int rc;
 
 	fresh = malloc(sizeof(struct chashentry), M_IPFW_TBL,
 	    M_WAITOK | M_ZERO);
 
 	rc = tei_to_chash_ent(tei, fresh);
 	if (rc == 0) {
 		buf->ent_ptr = fresh;
 		return (0);
 	}
 
 	free(fresh, M_IPFW_TBL);
 	return (rc);
 }
 
 /*
  * -add callback of addr:hash.
  *
  * The prepared entry first receives the value from @tei.  The prefix
  * length in @tei has to equal the table-wide mask of the family (EINVAL
  * otherwise).  If the key is already present the call fails with EEXIST
  * unless TEI_FLAGS_UPDATE is set, in which case the stored value and
  * tei->value are swapped, TEI_FLAGS_UPDATED is raised and @pnum is set to
  * 0.  Otherwise the prepared entry is put at the head of its bucket
  * (EFBIG when TEI_FLAGS_DONTADD is set), the scratch pointer is cleared,
  * the per-family counter is bumped and @pnum is set to 1.
  */
 static int
 ta_add_chash(void *ta_state, struct table_info *ti, struct tentry_info *tei,
     void *ta_buf, uint32_t *pnum)
 {
 	struct chash_cfg *state = ta_state;
 	struct ta_buf_chash *buf = ta_buf;
 	struct chashbhead *buckets;
 	struct chashentry *fresh, *cur, *match;
 	uint32_t slot, previous;
 
 	fresh = (struct chashentry *)buf->ent_ptr;
 	match = NULL;
 	slot = 0;
 
 	/* Take the value to store from @tei */
 	fresh->value = tei->value;
 
 	/* Look for an entry with the same key */
 	if (tei->subtype != AF_INET) {
 		if (tei->masklen != state->mask6)
 			return (EINVAL);
 		buckets = state->head6;
 		slot = hash_ent(fresh, AF_INET6, state->mask6, state->size6);
 		SLIST_FOREACH(cur, &buckets[slot], next) {
 			if (memcmp(&cur->a.a6, &fresh->a.a6, 16) == 0) {
 				match = cur;
 				break;
 			}
 		}
 	} else {
 		if (tei->masklen != state->mask4)
 			return (EINVAL);
 		buckets = state->head4;
 		slot = hash_ent(fresh, AF_INET, state->mask4, state->size4);
 		SLIST_FOREACH(cur, &buckets[slot], next) {
 			if (cur->a.a4 == fresh->a.a4) {
 				match = cur;
 				break;
 			}
 		}
 	}
 
 	if (match == NULL) {
 		/* New key */
 		if ((tei->flags & TEI_FLAGS_DONTADD) != 0)
 			return (EFBIG);
 		SLIST_INSERT_HEAD(&buckets[slot], fresh, next);
 		buf->ent_ptr = NULL;
 		*pnum = 1;
 
 		if (tei->subtype != AF_INET)
 			state->items6++;
 		else
 			state->items4++;
 
 		return (0);
 	}
 
 	/* Key is already in the table */
 	if ((tei->flags & TEI_FLAGS_UPDATE) == 0)
 		return (EEXIST);
 
 	/* Swap: the table gets the new value, @tei gets the old one */
 	previous = match->value;
 	match->value = tei->value;
 	tei->value = previous;
 
 	tei->flags |= TEI_FLAGS_UPDATED;
 	*pnum = 0;
 
 	return (0);
 }
 
 /*
  * Converts the key in @tei into the lookup entry embedded in @ta_buf.
  * Returns whatever tei_to_chash_ent() returns.
  */
 static int
 ta_prepare_del_chash(struct ip_fw_chain *ch, struct tentry_info *tei,
     void *ta_buf)
 {
 	struct ta_buf_chash *buf = (struct ta_buf_chash *)ta_buf;
 	int rc;
 
 	rc = tei_to_chash_ent(tei, &buf->ent);
 
 	return (rc);
 }
 
 /*
  * Unlinks the entry matching the key prepared in @ta_buf.  The removed
  * entry is stored in @ta_buf (ent_ptr) and its value is copied to @tei.
  *
  * Returns 0 and sets @pnum to 1 on success, EINVAL on mask length
  * mismatch, ENOENT when no entry matches.
  */
 static int
 ta_del_chash(void *ta_state, struct table_info *ti, struct tentry_info *tei,
     void *ta_buf, uint32_t *pnum)
 {
 	struct chash_cfg *cfg = (struct chash_cfg *)ta_state;
 	struct ta_buf_chash *buf = (struct ta_buf_chash *)ta_buf;
 	struct chashbhead *buckets;
 	struct chashentry *key, *cur, *victim;
 	uint32_t slot;
 	int is_v4;
 
 	key = &buf->ent;
 	victim = NULL;
 	is_v4 = (tei->subtype == AF_INET);
 
 	if (is_v4) {
 		if (tei->masklen != cfg->mask4)
 			return (EINVAL);
 		buckets = cfg->head4;
 		slot = hash_ent(key, AF_INET, cfg->mask4, cfg->size4);
 
 		SLIST_FOREACH(cur, &buckets[slot], next) {
 			if (cur->a.a4 == key->a.a4) {
 				victim = cur;
 				break;
 			}
 		}
 	} else {
 		if (tei->masklen != cfg->mask6)
 			return (EINVAL);
 		buckets = cfg->head6;
 		slot = hash_ent(key, AF_INET6, cfg->mask6, cfg->size6);
 
 		SLIST_FOREACH(cur, &buckets[slot], next) {
 			if (memcmp(&cur->a.a6, &key->a.a6, 16) == 0) {
 				victim = cur;
 				break;
 			}
 		}
 	}
 
 	if (victim == NULL)
 		return (ENOENT);
 
 	SLIST_REMOVE(&buckets[slot], victim, chashentry, next);
 	if (is_v4)
 		cfg->items4--;
 	else
 		cfg->items6--;
 
 	/* Pass the unlinked entry on so the flush step can free it */
 	buf->ent_ptr = victim;
 	tei->value = victim->value;
 	*pnum = 1;
 
 	return (0);
 }
 
 /*
  * Frees the entry still referenced from @ta_buf, if there is one.
  */
 static void
 ta_flush_chash_entry(struct ip_fw_chain *ch, struct tentry_info *tei,
     void *ta_buf)
 {
 	struct ta_buf_chash *buf = (struct ta_buf_chash *)ta_buf;
 
 	if (buf->ent_ptr == NULL)
 		return;
 
 	free(buf->ent_ptr, M_IPFW_TBL);
 }
 
 /*
  * Hash growing callbacks.
  */
 
 /*
  * Decides whether either hash has to grow.
  *
  * A hash is grown (doubled) when it holds more items than buckets.
  * The requested sizes are packed into @pflags: the new IPv4 size in
  * bits 16..31 and the new IPv6 size in bits 0..15, which is the
  * layout ta_prepare_mod_chash() decodes.
  *
  * Returns 1 and fills @pflags when growth is requested, 0 otherwise.
  */
 static int
 ta_need_modify_chash(void *ta_state, struct table_info *ti, uint32_t count,
     uint64_t *pflags)
 {
 	struct chash_cfg *cfg;
 	uint64_t data;
 
 	/*
 	 * Since we don't know exact number of IPv4/IPv6 records in @count,
 	 * ignore non-zero @count value at all. Check current hash sizes
 	 * and return appropriate data.
 	 */
 
 	cfg = (struct chash_cfg *)ta_state;
 
 	/*
 	 * Each requested size travels in a 16-bit field, so only ask for
 	 * growth when the doubled size still fits.  A request for 65536
 	 * would decode as 0 (IPv4) or spill into the IPv4 field (IPv6):
 	 * the hash would never grow, yet growth would be requested again
 	 * on every call.
 	 */
 	data = 0;
 	if (cfg->items4 > cfg->size4 && cfg->size4 <= 0xFFFF / 2)
 		data |= (uint64_t)(cfg->size4 * 2) << 16;
 	if (cfg->items6 > cfg->size6 && cfg->size6 <= 0xFFFF / 2)
 		data |= (uint64_t)(cfg->size6 * 2);
 
 	if (data != 0) {
 		*pflags = data;
 		return (1);
 	}
 
 	return (0);
 }
 
 /*
  * Allocate new, larger chash.
  */
 /*
  * Decodes the requested sizes from @pflags (IPv4 size in bits 16..31,
  * IPv6 size in bits 0..15) and allocates an initialized bucket array
  * for every non-zero size.  The results are stored in the mod_item
  * kept in @ta_buf.  Always returns 0.
  */
 static int
 ta_prepare_mod_chash(void *ta_buf, uint64_t *pflags)
 {
 	struct mod_item *mi = (struct mod_item *)ta_buf;
 	struct chashbhead *buckets;
 	int idx;
 
 	memset(mi, 0, sizeof(struct mod_item));
 	mi->size = (*pflags >> 16) & 0xFFFF;
 	mi->size6 = *pflags & 0xFFFF;
 
 	/* New IPv4 bucket array */
 	if (mi->size > 0) {
 		buckets = malloc(sizeof(struct chashbhead) * mi->size,
 		    M_IPFW, M_WAITOK | M_ZERO);
 		for (idx = 0; idx < mi->size; idx++)
 			SLIST_INIT(&buckets[idx]);
 		mi->main_ptr = buckets;
 	}
 
 	/* New IPv6 bucket array */
 	if (mi->size6 > 0) {
 		buckets = malloc(sizeof(struct chashbhead) * mi->size6,
 		    M_IPFW, M_WAITOK | M_ZERO);
 		for (idx = 0; idx < mi->size6; idx++)
 			SLIST_INIT(&buckets[idx]);
 		mi->main_ptr6 = buckets;
 	}
 
 	return (0);
 }
 
 /*
  * Copy data from old runtime array to new one.
  */
 /*
  * Intentionally does nothing and returns 0: entries are moved to the
  * new bucket arrays later, in ta_modify_chash().
  */
 static int
 ta_fill_mod_chash(void *ta_state, struct table_info *ti, void *ta_buf,
     uint64_t *pflags)
 {
 
 	/* It is not possible to do rehash if we're not holding WLOCK. */
 	return (0);
 }
 
 /*
  * Switch old & new arrays.
  */
 /*
  * Moves every entry from the current bucket arrays into the larger
  * ones prepared in @ta_buf, publishes the new arrays and leaves the
  * old arrays in @ta_buf so that the flush step frees them.  A family
  * whose hash is already at least as large as requested is skipped.
  */
 static void
 ta_modify_chash(void *ta_state, struct table_info *ti, void *ta_buf,
     uint64_t pflags)
 {
 	struct chash_cfg *cfg = (struct chash_cfg *)ta_state;
 	struct mod_item *mi = (struct mod_item *)ta_buf;
 	struct chashbhead *src, *dst;
 	struct chashentry *cur, *nxt;
 	size_t src_cnt, dst_cnt;
 	uint32_t slot;
 	int b;
 
 	/* IPv4 hash: grow only if it is still smaller than requested */
 	if (mi->size > 0 && cfg->size4 < mi->size) {
 		dst = (struct chashbhead *)mi->main_ptr;
 		dst_cnt = mi->size;
 		src = ti->state;
 		src_cnt = cfg->size4;
 
 		for (b = 0; b < src_cnt; b++) {
 			SLIST_FOREACH_SAFE(cur, &src[b], next, nxt) {
 				slot = hash_ent(cur, AF_INET, cfg->mask4,
 				    dst_cnt);
 				SLIST_INSERT_HEAD(&dst[slot], cur, next);
 			}
 		}
 
 		cfg->head4 = dst;
 		cfg->size4 = mi->size;
 		ti->state = dst;
 		/* Old array goes back to the caller for freeing */
 		mi->main_ptr = src;
 	}
 
 	/* IPv6 hash: same procedure */
 	if (mi->size6 > 0 && cfg->size6 < mi->size6) {
 		dst = (struct chashbhead *)mi->main_ptr6;
 		dst_cnt = mi->size6;
 		src = ti->xstate;
 		src_cnt = cfg->size6;
 
 		for (b = 0; b < src_cnt; b++) {
 			SLIST_FOREACH_SAFE(cur, &src[b], next, nxt) {
 				slot = hash_ent(cur, AF_INET6, cfg->mask6,
 				    dst_cnt);
 				SLIST_INSERT_HEAD(&dst[slot], cur, next);
 			}
 		}
 
 		cfg->head6 = dst;
 		cfg->size6 = mi->size6;
 		ti->xstate = dst;
 		mi->main_ptr6 = src;
 	}
 
 	/* Refresh the log2 sizes kept in the lower 32 bits of ti->data */
 	ti->data &= 0xFFFFFFFF00000000;
 	ti->data |= ta_log2(cfg->size4) << 8 | ta_log2(cfg->size6);
 }
 
 /*
  * Free unneeded array.
  */
 static void
 ta_flush_mod_chash(void *ta_buf)
 {
 	struct mod_item *mi = (struct mod_item *)ta_buf;
 
 	/* IPv4 bucket array left in the buffer, if any */
 	if (mi->main_ptr != NULL) {
 		free(mi->main_ptr, M_IPFW);
 	}
 
 	/* IPv6 bucket array left in the buffer, if any */
 	if (mi->main_ptr6 != NULL) {
 		free(mi->main_ptr6, M_IPFW);
 	}
 }
 
 /*
  * Algorithm descriptor for "addr:hash" address tables.  It binds the
  * chash callbacks to the table_algo interface; .ta_buf_size is the
  * size of the per-operation scratch buffer the callbacks receive as
  * @ta_buf.
  */
 struct table_algo addr_hash = {
 	.name		= "addr:hash",
 	.type		= IPFW_TABLE_ADDR,
 	.ta_buf_size	= sizeof(struct ta_buf_chash),
 	.init		= ta_init_chash,
 	.destroy	= ta_destroy_chash,
 	.prepare_add	= ta_prepare_add_chash,
 	.prepare_del	= ta_prepare_del_chash,
 	.add		= ta_add_chash,
 	.del		= ta_del_chash,
 	.flush_entry	= ta_flush_chash_entry,
 	.foreach	= ta_foreach_chash,
 	.dump_tentry	= ta_dump_chash_tentry,
 	.find_tentry	= ta_find_chash_tentry,
 	.print_config	= ta_print_chash_config,
 	.dump_tinfo	= ta_dump_chash_tinfo,
 	.need_modify	= ta_need_modify_chash,
 	.prepare_mod	= ta_prepare_mod_chash,
 	.fill_mod	= ta_fill_mod_chash,
 	.modify		= ta_modify_chash,
 	.flush_mod	= ta_flush_mod_chash,
 };
 
 
 /*
  * Iface table cmds.
  *
  * Implementation:
  *
  * Runtime part:
  * - sorted array of "struct ifidx" pointed by ti->state.
  *   Array is allocated with rounding up to IFIDX_CHUNK. Only existing
  *   interfaces are stored in array, however its allocated size is
  *   sufficient to hold all table records if needed.
  * - current array size is stored in ti->data
  *
  * Table data:
  * - "struct iftable_cfg" is allocated to store table state (ta_state).
  * - All table records are stored inside namedobj instance.
  *
  */
 
 /*
  * One element of the sorted runtime array searched by ifidx_find().
  */
 struct ifidx {
 	/* Sort/search key; if_notifier() stores the interface index here */
 	uint16_t	kidx;
 	/* Set to 0 by if_notifier() */
 	uint16_t	spare;
 	/* Value reported by ta_lookup_ifidx() on a match */
 	uint32_t	value;
 };
 /* Initial array size; also passed to ipfw_objhash_create() */
 #define	DEFAULT_IFIDX_SIZE	64
 
 struct iftable_cfg;
 
 /*
  * Table record for one interface name.  Cast to and from
  * struct named_object, so @no has to stay the first member.
  */
 struct ifentry {
 	/* Links the record into the table's name hash (icfg->ii) */
 	struct named_object	no;
 	/* Interface reference and notifier; the callback is if_notifier() */
 	struct ipfw_ifc		ic;
 	/* Owning table configuration, set in ta_add_ifidx() */
 	struct iftable_cfg	*icfg;
 	/* Table value for this interface */
 	uint32_t		value;
 	/* Non-zero while the record is present in the runtime array */
 	int			linked;
 };
 
 /*
  * Per-table state of the iface:array algorithm (passed as ta_state).
  */
 struct iftable_cfg {
 	/* Name hash holding every struct ifentry of the table */
 	struct namedobj_instance	*ii;
 	/* Chain the table belongs to, saved by ta_init_ifidx() */
 	struct ip_fw_chain	*ch;
 	/* Current table_info pointer, refreshed by ta_change_ti_ifidx() */
 	struct table_info	*ti;
 	/* Runtime array of struct ifidx */
 	void	*main_ptr;
 	size_t	size;	/* Number of items allocated in array */
 	size_t	count;	/* Number of all items */
 	size_t	used;	/* Number of items _active_ now */
 };
 
 /*
  * Per-operation scratch buffer of the iface:array algorithm.
  */
 struct ta_buf_ifidx
 {
 	/* Entry prepared for an add, or entry unlinked by a delete */
 	struct ifentry *ife;
 	/* NOTE(review): not referenced by the callbacks in this section */
 	uint32_t value;
 };
 
 /* Forward declarations of the iface:array helpers and callbacks. */
 int compare_ifidx(const void *k, const void *v);
 static struct ifidx * ifidx_find(struct table_info *ti, void *key);
 static int ta_lookup_ifidx(struct table_info *ti, void *key, uint32_t keylen,
     uint32_t *val);
 static int ta_init_ifidx(struct ip_fw_chain *ch, void **ta_state,
     struct table_info *ti, char *data, uint8_t tflags);
 static void ta_change_ti_ifidx(void *ta_state, struct table_info *ti);
 static int destroy_ifidx_locked(struct namedobj_instance *ii,
     struct named_object *no, void *arg);
 static void ta_destroy_ifidx(void *ta_state, struct table_info *ti);
 static void ta_dump_ifidx_tinfo(void *ta_state, struct table_info *ti,
     ipfw_ta_tinfo *tinfo);
 static int ta_prepare_add_ifidx(struct ip_fw_chain *ch, struct tentry_info *tei,
     void *ta_buf);
 static int ta_add_ifidx(void *ta_state, struct table_info *ti,
     struct tentry_info *tei, void *ta_buf, uint32_t *pnum);
 static int ta_prepare_del_ifidx(struct ip_fw_chain *ch, struct tentry_info *tei,
     void *ta_buf);
 static int ta_del_ifidx(void *ta_state, struct table_info *ti,
     struct tentry_info *tei, void *ta_buf, uint32_t *pnum);
 static void ta_flush_ifidx_entry(struct ip_fw_chain *ch,
     struct tentry_info *tei, void *ta_buf);
 static void if_notifier(struct ip_fw_chain *ch, void *cbdata, uint16_t ifindex);
 static int ta_need_modify_ifidx(void *ta_state, struct table_info *ti,
     uint32_t count, uint64_t *pflags);
 static int ta_prepare_mod_ifidx(void *ta_buf, uint64_t *pflags);
 static int ta_fill_mod_ifidx(void *ta_state, struct table_info *ti,
     void *ta_buf, uint64_t *pflags);
 static void ta_modify_ifidx(void *ta_state, struct table_info *ti, void *ta_buf,
     uint64_t pflags);
 static void ta_flush_mod_ifidx(void *ta_buf);
 static int ta_dump_ifidx_tentry(void *ta_state, struct table_info *ti, void *e,
     ipfw_obj_tentry *tent);
 static int ta_find_ifidx_tentry(void *ta_state, struct table_info *ti,
     ipfw_obj_tentry *tent);
 static int foreach_ifidx(struct namedobj_instance *ii, struct named_object *no,
     void *arg);
 static void ta_foreach_ifidx(void *ta_state, struct table_info *ti,
     ta_foreach_f *f, void *arg);
 
 /*
  * bsearch-style comparator: @k points to a uint16_t key, @v to a
  * struct ifidx.  Returns -1, 0 or 1 as the key is less than, equal to
  * or greater than the element's kidx.
  */
 int
 compare_ifidx(const void *k, const void *v)
 {
 	uint16_t wanted = *((const uint16_t *)k);
 	uint16_t stored = ((const struct ifidx *)v)->kidx;
 
 	if (wanted == stored)
 		return (0);
 
 	return ((wanted < stored) ? -1 : 1);
 }
 
 /*
  * Adds item @item with key @key into ascending-sorted array @base.
  * Assumes @base has enough additional storage.
  *
  * @key:    key handed to @compar as its first argument
  * @item:   element image of @size bytes to store
  * @base:   start of the array; needs room for @nmemb + 1 elements
  * @nmemb:  number of elements currently stored
  * @size:   size of one element in bytes
  * @compar: bsearch-style comparator called as compar(key, element)
  *
  * Returns 1 on success, 0 on duplicate key.
  */
 static int
 badd(const void *key, void *item, void *base, size_t nmemb,
     size_t size, int (*compar) (const void *, const void *))
 {
 	size_t lo, hi, mid;
 	char *slot;
 	int res;
 
 	/*
 	 * Binary search over the half-open range [lo, hi).  The indices
 	 * are size_t like @nmemb, so nothing is narrowed to int and there
 	 * is no "mid - 1" step that could drop below zero.  An empty
 	 * array skips the loop and inserts at index 0.
 	 */
 	lo = 0;
 	hi = nmemb;
 	while (lo < hi) {
 		mid = lo + (hi - lo) / 2;
 		res = compar(key, (const char *)base + mid * size);
 		if (res == 0)
 			return (0);
 
 		if (res > 0)
 			lo = mid + 1;
 		else
 			hi = mid;
 	}
 
 	/* @lo is the insertion point: shift the tail up by one element */
 	slot = (char *)base + lo * size;
 	if (nmemb > lo)
 		memmove(slot + size, slot, (nmemb - lo) * size);
 
 	memcpy(slot, item, size);
 
 	return (1);
 }
 
 /*
  * Deletes item with key @key from ascending-sorted array @base.
  *
  * @key:    key handed to @compar as its first argument
  * @base:   start of the array
  * @nmemb:  number of elements currently stored
  * @size:   size of one element in bytes
  * @compar: bsearch-style comparator called as compar(key, element)
  *
  * The elements following the deleted one are moved down by one slot;
  * the slot at index @nmemb - 1 is left with stale contents.
  *
  * Returns 1 on success, 0 for non-existent key.
  */
 static int
 bdel(const void *key, void *base, size_t nmemb, size_t size,
     int (*compar) (const void *, const void *))
 {
 	char *item, *end;
 	size_t tail;
 
 	item = bsearch(key, base, nmemb, size, compar);
 
 	if (item == NULL)
 		return (0);
 
 	/*
 	 * Move only the elements located after the deleted one.  Counting
 	 * the deleted element itself would make memmove() read one
 	 * element beyond the end of the array.
 	 */
 	end = (char *)base + nmemb * size;
 	tail = (size_t)(end - (item + size));
 
 	if (tail > 0)
 		memmove(item, item + size, tail);
 
 	return (1);
 }
 
 /*
  * Binary-searches the runtime array (ti->state, ti->data elements)
  * for the interface index pointed to by @key.  Returns the matching
  * element or NULL.
  */
 static struct ifidx *
 ifidx_find(struct table_info *ti, void *key)
 {
 
 	return (bsearch(key, ti->state, ti->data, sizeof(struct ifidx),
 	    compare_ifidx));
 }
 
 /*
  * Runtime lookup: returns 1 and stores the entry's value in @val when
  * the interface index at @key is present, 0 otherwise.
  */
 static int
 ta_lookup_ifidx(struct table_info *ti, void *key, uint32_t keylen,
     uint32_t *val)
 {
 	struct ifidx *hit = ifidx_find(ti, key);
 
 	if (hit == NULL)
 		return (0);
 
 	*val = hit->value;
 	return (1);
 }
 
 /*
  * Creates the table state: the configuration structure, the name hash
  * and a zeroed runtime array of DEFAULT_IFIDX_SIZE elements.  Stores
  * the state in @ta_state, points @ti at the array and installs the
  * lookup function.  Always returns 0.
  */
 static int
 ta_init_ifidx(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti,
     char *data, uint8_t tflags)
 {
 	struct iftable_cfg *cfg;
 
 	cfg = malloc(sizeof(struct iftable_cfg), M_IPFW, M_WAITOK | M_ZERO);
 	cfg->ii = ipfw_objhash_create(DEFAULT_IFIDX_SIZE);
 
 	cfg->size = DEFAULT_IFIDX_SIZE;
 	cfg->main_ptr = malloc(sizeof(struct ifidx) * cfg->size, M_IPFW,
 	    M_WAITOK | M_ZERO);
 	cfg->ch = ch;
 
 	/* Publish state and runtime pointers */
 	ti->lookup = ta_lookup_ifidx;
 	ti->state = cfg->main_ptr;
 	*ta_state = cfg;
 
 	return (0);
 }
 
 /*
  * Handle tableinfo @ti pointer change (on table array resize).
  */
 static void
 ta_change_ti_ifidx(void *ta_state, struct table_info *ti)
 {
 	struct iftable_cfg *icfg;
 
 	icfg = (struct iftable_cfg *)ta_state;
 	/* if_notifier() reads this pointer to update ti->data */
 	icfg->ti = ti;
 }
 
 /*
  * Name-hash iteration callback used on table destruction: detaches
  * the notifier of the record, drops its interface reference and
  * frees it.  @arg is the chain.  Always returns 0.
  */
 static int
 destroy_ifidx_locked(struct namedobj_instance *ii, struct named_object *no,
     void *arg)
 {
 	struct ip_fw_chain *chain = (struct ip_fw_chain *)arg;
 	struct ifentry *entry = (struct ifentry *)no;
 
 	ipfw_iface_del_notify(chain, &entry->ic);
 	ipfw_iface_unref(chain, &entry->ic);
 	free(entry, M_IPFW_TBL);
 
 	return (0);
 }
 
 
 /*
  * Destroys table @ti
  */
 static void
 ta_destroy_ifidx(void *ta_state, struct table_info *ti)
 {
 	struct iftable_cfg *cfg = (struct iftable_cfg *)ta_state;
 	struct ip_fw_chain *chain = cfg->ch;
 
 	/* Runtime array */
 	if (cfg->main_ptr != NULL) {
 		free(cfg->main_ptr, M_IPFW);
 	}
 
 	/* Release every record while holding the UH write lock */
 	IPFW_UH_WLOCK(chain);
 	ipfw_objhash_foreach(cfg->ii, destroy_ifidx_locked, chain);
 	IPFW_UH_WUNLOCK(chain);
 
 	/* Name hash and the configuration itself */
 	ipfw_objhash_destroy(cfg->ii);
 	free(cfg, M_IPFW);
 }
 
 /*
  * Provide algo-specific table info
  */
 static void
 ta_dump_ifidx_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo)
 {
 	struct iftable_cfg *icfg = (struct iftable_cfg *)ta_state;
 
 	/* Array class: allocated slots, active slots and element size */
 	tinfo->taclass4 = IPFW_TACLASS_ARRAY;
 	tinfo->itemsize4 = sizeof(struct ifidx);
 	tinfo->size4 = icfg->size;
 	tinfo->count4 = icfg->used;
 }
 
 /*
  * Prepare state to add to the table:
  * allocate ifentry and reference needed interface.
  */
 static int
 ta_prepare_add_ifidx(struct ip_fw_chain *ch, struct tentry_info *tei,
     void *ta_buf)
 {
 	struct ta_buf_ifidx *buf = (struct ta_buf_ifidx *)ta_buf;
 	struct ifentry *entry;
 	char *name = (char *)tei->paddr;
 
 	/* The name has to be terminated within IF_NAMESIZE bytes */
 	if (strnlen(name, IF_NAMESIZE) == IF_NAMESIZE)
 		return (EINVAL);
 
 	entry = malloc(sizeof(struct ifentry), M_IPFW_TBL, M_WAITOK | M_ZERO);
 	entry->ic.cb = if_notifier;
 	entry->ic.cbdata = entry;
 
 	if (ipfw_iface_ref(ch, name, &entry->ic) == 0) {
 		/* The ipfw_iface 'ifname' field serves as stable storage */
 		entry->no.name = entry->ic.iface->ifname;
 		buf->ife = entry;
 		return (0);
 	}
 
 	/* No interface reference was obtained; discard the entry */
 	free(entry, M_IPFW_TBL);
 	return (EINVAL);
 }
 
 /*
  * Adds the record prepared in @ta_buf to the table, or updates the
  * value of an existing record with the same name when
  * TEI_FLAGS_UPDATE is set.
  *
  * Returns 0 on success (@pnum is 1 for an addition, 0 for an update),
  * EEXIST for a duplicate without the update flag, and EFBIG when an
  * addition is needed but forbidden by TEI_FLAGS_DONTADD.
  */
 static int
 ta_add_ifidx(void *ta_state, struct table_info *ti, struct tentry_info *tei,
     void *ta_buf, uint32_t *pnum)
 {
 	struct iftable_cfg *icfg = (struct iftable_cfg *)ta_state;
 	struct ta_buf_ifidx *buf = (struct ta_buf_ifidx *)ta_buf;
 	struct ifentry *fresh, *existing;
 	struct ifidx *slot;
 	char *name;
 	uint32_t prev;
 
 	name = (char *)tei->paddr;
 	fresh = buf->ife;
 
 	fresh->icfg = icfg;
 	fresh->value = tei->value;
 
 	existing = (struct ifentry *)ipfw_objhash_lookup_name(icfg->ii, 0,
 	    name);
 
 	if (existing == NULL) {
 		if ((tei->flags & TEI_FLAGS_DONTADD) != 0)
 			return (EFBIG);
 
 		/* Link to internal list */
 		ipfw_objhash_add(icfg->ii, &fresh->no);
 
 		/* Link notifier (possibly running its callback) */
 		ipfw_iface_add_notify(icfg->ch, &fresh->ic);
 		icfg->count++;
 
 		/* The table owns the record from now on */
 		buf->ife = NULL;
 		*pnum = 1;
 
 		return (0);
 	}
 
 	if ((tei->flags & TEI_FLAGS_UPDATE) == 0)
 		return (EEXIST);
 
 	/* Swap the stored value with the requested one */
 	prev = existing->value;
 	existing->value = tei->value;
 	tei->value = prev;
 
 	if (existing->ic.iface->resolved != 0) {
 		/* The runtime array carries a copy of the value, too */
 		slot = ifidx_find(ti, &existing->ic.iface->ifindex);
 		slot->value = fresh->value;
 	}
 
 	/* Report an update rather than an addition */
 	tei->flags |= TEI_FLAGS_UPDATED;
 	*pnum = 0;
 
 	return (0);
 }
 
 /*
  * Prepare to delete key from table.
  * Do basic interface name checks.
  */
 /*
  * Validates the interface name in @tei before a delete: returns
  * EINVAL when the name is not terminated within IF_NAMESIZE bytes,
  * 0 otherwise.  @ch and @ta_buf are not used.
  */
 static int
 ta_prepare_del_ifidx(struct ip_fw_chain *ch, struct tentry_info *tei,
     void *ta_buf)
 {
 	char *ifname;
 
 	/* Check if string is terminated */
 	ifname = (char *)tei->paddr;
 	if (strnlen(ifname, IF_NAMESIZE) == IF_NAMESIZE)
 		return (EINVAL);
 
 	return (0);
 }
 
 /*
  * Remove key from both the configuration list and the
  * runtime array. Removes the interface notifier as well.
  */
 static int
 ta_del_ifidx(void *ta_state, struct table_info *ti, struct tentry_info *tei,
     void *ta_buf, uint32_t *pnum)
 {
 	struct iftable_cfg *icfg = (struct iftable_cfg *)ta_state;
 	struct ta_buf_ifidx *buf = (struct ta_buf_ifidx *)ta_buf;
 	struct ifentry *victim;
 	uint16_t kidx;
 	int res;
 
 	victim = (struct ifentry *)ipfw_objhash_lookup_name(icfg->ii, 0,
 	    (char *)tei->paddr);
 	if (victim == NULL)
 		return (ENOENT);
 
 	if (victim->linked != 0) {
 		/* The record is present in the runtime array: drop it there */
 		kidx = victim->ic.iface->ifindex;
 
 		res = bdel(&kidx, icfg->main_ptr, icfg->used,
 		    sizeof(struct ifidx), compare_ifidx);
 
 		KASSERT(res == 1, ("index %d does not exist", kidx));
 		icfg->used--;
 		ti->data = icfg->used;
 		victim->linked = 0;
 	}
 
 	/* Unlink from the name hash */
 	ipfw_objhash_del(icfg->ii, &victim->no);
 	/* Detach the notifier and drop the interface reference */
 	ipfw_iface_del_notify(icfg->ch, &victim->ic);
 	ipfw_iface_unref(icfg->ch, &victim->ic);
 
 	icfg->count--;
 	tei->value = victim->value;
 
 	/* Pass the record on so the flush step can free it */
 	buf->ife = victim;
 	*pnum = 1;
 
 	return (0);
 }
 
 /*
  * Flush the entry left in the operation buffer.
  * Frees the entry, if any; no interface reference is dropped here.
  */
 static void
 ta_flush_ifidx_entry(struct ip_fw_chain *ch, struct tentry_info *tei,
     void *ta_buf)
 {
 	struct ta_buf_ifidx *buf = (struct ta_buf_ifidx *)ta_buf;
 
 	/* Nothing was left behind by the add/del step */
 	if (buf->ife == NULL)
 		return;
 
 	free(buf->ife, M_IPFW_TBL);
 }
 
 
 /*
  * Handle interface announce/withdrawal for particular table.
  * Every real runtime array modification happens here.
  */
 /*
  * Interface notification callback.  @cbdata is the struct ifentry.
  * A non-zero @ifindex announces the interface and inserts the record
  * into the runtime array; a zero @ifindex withdraws it and removes
  * the record again.  Calls that do not change the linked state are
  * ignored.
  */
 static void
 if_notifier(struct ip_fw_chain *ch, void *cbdata, uint16_t ifindex)
 {
 	struct ifentry *entry = (struct ifentry *)cbdata;
 	struct iftable_cfg *icfg = entry->icfg;
 	struct table_info *ti = icfg->ti;
 	struct ifidx rec;
 	int res;
 
 	KASSERT(ti != NULL, ("ti=NULL, check change_ti handler"));
 
 	if (ifindex != 0) {
 		/* Interface announce; nothing to do if already linked */
 		if (entry->linked != 0)
 			return;
 
 		rec.kidx = ifindex;
 		rec.spare = 0;
 		rec.value = entry->value;
 		res = badd(&ifindex, &rec, icfg->main_ptr, icfg->used,
 		    sizeof(struct ifidx), compare_ifidx);
 		KASSERT(res == 1, ("index %d already exists", ifindex));
 		icfg->used++;
 		ti->data = icfg->used;
 		entry->linked = 1;
 		return;
 	}
 
 	/* Interface withdrawal; nothing to do if not linked */
 	if (entry->linked == 0)
 		return;
 
 	ifindex = entry->ic.iface->ifindex;
 
 	res = bdel(&ifindex, icfg->main_ptr, icfg->used,
 	    sizeof(struct ifidx), compare_ifidx);
 
 	KASSERT(res == 1, ("index %d does not exist", ifindex));
 	icfg->used--;
 	ti->data = icfg->used;
 	entry->linked = 0;
 }
 
 
 /*
  * Table growing callbacks.
  */
 
 /*
  * Checks whether the array can hold the current records plus @count
  * more.  If not, the allocated size is doubled until it can; the
  * result is stored in @pflags and 1 is returned.  Returns 0 when no
  * growth is needed.
  */
 static int
 ta_need_modify_ifidx(void *ta_state, struct table_info *ti, uint32_t count,
     uint64_t *pflags)
 {
 	struct iftable_cfg *icfg = (struct iftable_cfg *)ta_state;
 	uint32_t wanted;
 
 	for (wanted = icfg->size; wanted < icfg->count + count; )
 		wanted *= 2;
 
 	if (wanted == icfg->size)
 		return (0);
 
 	*pflags = wanted;
 	return (1);
 }
 
 /*
  * Allocate new, larger runtime ifidx array.
  */
 static int
 ta_prepare_mod_ifidx(void *ta_buf, uint64_t *pflags)
 {
 	struct mod_item *item = (struct mod_item *)ta_buf;
 
 	/* Start from a clean buffer; @pflags carries the new item count */
 	memset(item, 0, sizeof(struct mod_item));
 	item->size = *pflags;
 	item->main_ptr = malloc(sizeof(struct ifidx) * item->size, M_IPFW,
 	    M_WAITOK | M_ZERO);
 
 	return (0);
 }
 
 /*
  * Copy data from old runtime array to new one.
  */
 static int
 ta_fill_mod_ifidx(void *ta_state, struct table_info *ti, void *ta_buf,
     uint64_t *pflags)
 {
 	struct iftable_cfg *icfg = (struct iftable_cfg *)ta_state;
 	struct mod_item *item = (struct mod_item *)ta_buf;
 
 	if (icfg->size < item->size) {
 		/* Still too small: copy the active elements over */
 		memcpy(item->main_ptr, icfg->main_ptr,
 		    icfg->used * sizeof(struct ifidx));
 		return (0);
 	}
 
 	/* The array is already large enough; clear the request */
 	*pflags = 0;
 	return (0);
 }
 
 /*
  * Switch old & new arrays.
  */
 static void
 ta_modify_ifidx(void *ta_state, struct table_info *ti, void *ta_buf,
     uint64_t pflags)
 {
 	struct iftable_cfg *icfg = (struct iftable_cfg *)ta_state;
 	struct mod_item *item = (struct mod_item *)ta_buf;
 	void *retired = icfg->main_ptr;
 
 	/* Install the new array */
 	icfg->main_ptr = item->main_ptr;
 	icfg->size = item->size;
 	ti->state = icfg->main_ptr;
 
 	/* Leave the old array in the buffer for the flush step */
 	item->main_ptr = retired;
 }
 
 /*
  * Free unneeded array.
  */
 static void
 ta_flush_mod_ifidx(void *ta_buf)
 {
 	struct mod_item *item = (struct mod_item *)ta_buf;
 
 	/* Nothing allocated or already consumed */
 	if (item->main_ptr == NULL)
 		return;
 
 	free(item->main_ptr, M_IPFW);
 }
 
 /*
  * Exports the record @e (a struct ifentry) into @tent: the key
  * receives IF_NAMESIZE bytes of the interface name, masklen is the
  * key width in bits, and the value goes to v.kidx.  Always returns 0.
  */
 static int
 ta_dump_ifidx_tentry(void *ta_state, struct table_info *ti, void *e,
     ipfw_obj_tentry *tent)
 {
 	struct ifentry *entry = (struct ifentry *)e;
 
 	memcpy(&tent->k, entry->no.name, IF_NAMESIZE);
 	tent->masklen = 8 * IF_NAMESIZE;
 	tent->v.kidx = entry->value;
 
 	return (0);
 }
 
 /*
  * Looks up the interface name given in @tent and, when found, fills
  * @tent from the record.  Returns 0 on success, EINVAL for an
  * unterminated name, ENOENT when there is no such record.
  */
 static int
 ta_find_ifidx_tentry(void *ta_state, struct table_info *ti,
     ipfw_obj_tentry *tent)
 {
 	struct iftable_cfg *icfg = (struct iftable_cfg *)ta_state;
 	struct ifentry *entry;
 	char *name = tent->k.iface;
 
 	if (strnlen(name, IF_NAMESIZE) == IF_NAMESIZE)
 		return (EINVAL);
 
 	entry = (struct ifentry *)ipfw_objhash_lookup_name(icfg->ii, 0, name);
 	if (entry == NULL)
 		return (ENOENT);
 
 	ta_dump_ifidx_tentry(ta_state, ti, entry, tent);
 	return (0);
 }
 
 /*
  * Argument bundle handed through ipfw_objhash_foreach() to
  * foreach_ifidx().
  */
 struct wa_ifidx {
 	ta_foreach_f	*f;	/* User callback */
 	void		*arg;	/* Opaque argument for the user callback */
 };
 
 /*
  * Name-hash iteration callback: forwards the record to the user
  * callback stored in the struct wa_ifidx passed as @arg.
  * Always returns 0.
  */
 static int
 foreach_ifidx(struct namedobj_instance *ii, struct named_object *no,
     void *arg)
 {
 	struct wa_ifidx *walk = (struct wa_ifidx *)arg;
 	struct ifentry *entry = (struct ifentry *)no;
 
 	walk->f(entry, walk->arg);
 
 	return (0);
 }
 
 /*
  * Calls @f with @arg for every record kept in the table's name hash.
  */
 static void
 ta_foreach_ifidx(void *ta_state, struct table_info *ti, ta_foreach_f *f,
     void *arg)
 {
 	struct iftable_cfg *icfg = (struct iftable_cfg *)ta_state;
 	struct wa_ifidx walk;
 
 	/* Bundle callback and argument for the hash iterator */
 	walk.arg = arg;
 	walk.f = f;
 
 	ipfw_objhash_foreach(icfg->ii, foreach_ifidx, &walk);
 }
 
 /*
  * Algorithm descriptor for "iface:array" interface tables, flagged
  * TA_FLAG_DEFAULT.  Besides the usual callbacks it registers
  * .change_ti so that the stored table_info pointer is refreshed.
  */
 struct table_algo iface_idx = {
 	.name		= "iface:array",
 	.type		= IPFW_TABLE_INTERFACE,
 	.flags		= TA_FLAG_DEFAULT,
 	.ta_buf_size	= sizeof(struct ta_buf_ifidx),
 	.init		= ta_init_ifidx,
 	.destroy	= ta_destroy_ifidx,
 	.prepare_add	= ta_prepare_add_ifidx,
 	.prepare_del	= ta_prepare_del_ifidx,
 	.add		= ta_add_ifidx,
 	.del		= ta_del_ifidx,
 	.flush_entry	= ta_flush_ifidx_entry,
 	.foreach	= ta_foreach_ifidx,
 	.dump_tentry	= ta_dump_ifidx_tentry,
 	.find_tentry	= ta_find_ifidx_tentry,
 	.dump_tinfo	= ta_dump_ifidx_tinfo,
 	.need_modify	= ta_need_modify_ifidx,
 	.prepare_mod	= ta_prepare_mod_ifidx,
 	.fill_mod	= ta_fill_mod_ifidx,
 	.modify		= ta_modify_ifidx,
 	.flush_mod	= ta_flush_mod_ifidx,
 	.change_ti	= ta_change_ti_ifidx,
 };
 
 /*
  * Number array cmds.
  *
  * Implementation:
  *
  * Runtime part:
  * - sorted array of "struct numarray" pointed by ti->state.
  *   Array is allocated with rounding up to NUMARRAY_CHUNK.
  * - current array size is stored in ti->data
  *
  */
 
 /*
  * One element of the sorted runtime array of a number:array table.
  */
 struct numarray {
 	uint32_t	number;	/* Sort/search key, see compare_numarray() */
 	uint32_t	value;	/* Value reported by ta_lookup_numarray() */
 };
 
 /*
  * Per-table state of the number:array algorithm (passed as ta_state).
  */
 struct numarray_cfg {
 	void	*main_ptr;	/* Runtime array of struct numarray */
 	size_t	size;	/* Number of items allocated in array */
 	size_t	used;	/* Number of items _active_ now */
 };
 
 /*
  * Per-operation scratch buffer of the number:array algorithm.
  */
 struct ta_buf_numarray
 {
 	/* Key filled in by the prepare step, value by the add step */
 	struct numarray na;
 };
 
 /* Forward declarations of the number:array helpers and callbacks. */
 int compare_numarray(const void *k, const void *v);
 static struct numarray *numarray_find(struct table_info *ti, void *key);
 static int ta_lookup_numarray(struct table_info *ti, void *key,
     uint32_t keylen, uint32_t *val);
 static int ta_init_numarray(struct ip_fw_chain *ch, void **ta_state,
     struct table_info *ti, char *data, uint8_t tflags);
 static void ta_destroy_numarray(void *ta_state, struct table_info *ti);
 static void ta_dump_numarray_tinfo(void *ta_state, struct table_info *ti,
     ipfw_ta_tinfo *tinfo);
 static int ta_prepare_add_numarray(struct ip_fw_chain *ch,
     struct tentry_info *tei, void *ta_buf);
 static int ta_add_numarray(void *ta_state, struct table_info *ti,
     struct tentry_info *tei, void *ta_buf, uint32_t *pnum);
 static int ta_del_numarray(void *ta_state, struct table_info *ti,
     struct tentry_info *tei, void *ta_buf, uint32_t *pnum);
 static void ta_flush_numarray_entry(struct ip_fw_chain *ch,
     struct tentry_info *tei, void *ta_buf);
 static int ta_need_modify_numarray(void *ta_state, struct table_info *ti,
     uint32_t count, uint64_t *pflags);
 static int ta_prepare_mod_numarray(void *ta_buf, uint64_t *pflags);
 static int ta_fill_mod_numarray(void *ta_state, struct table_info *ti,
     void *ta_buf, uint64_t *pflags);
 static void ta_modify_numarray(void *ta_state, struct table_info *ti,
     void *ta_buf, uint64_t pflags);
 static void ta_flush_mod_numarray(void *ta_buf);
 static int ta_dump_numarray_tentry(void *ta_state, struct table_info *ti,
     void *e, ipfw_obj_tentry *tent);
 static int ta_find_numarray_tentry(void *ta_state, struct table_info *ti,
     ipfw_obj_tentry *tent);
 static void ta_foreach_numarray(void *ta_state, struct table_info *ti,
     ta_foreach_f *f, void *arg);
 
 /*
  * bsearch-style comparator: @k points to a uint32_t key, @v to a
  * struct numarray.  Returns -1, 0 or 1 as the key is less than, equal
  * to or greater than the element's number.
  */
 int
 compare_numarray(const void *k, const void *v)
 {
 	uint32_t wanted = *((const uint32_t *)k);
 	uint32_t stored = ((const struct numarray *)v)->number;
 
 	if (wanted == stored)
 		return (0);
 
 	return ((wanted < stored) ? -1 : 1);
 }
 
 static struct numarray *
 numarray_find(struct table_info *ti, void *key)
 {
 	struct numarray *ri;
 
 	ri = bsearch(key, ti->state, ti->data, sizeof(struct numarray),
 	    compare_ifidx);
 
 	return (ri);
 }
 
 /*
  * Runtime lookup: returns 1 and stores the entry's value in @val when
  * the number at @key is present, 0 otherwise.
  */
 static int
 ta_lookup_numarray(struct table_info *ti, void *key, uint32_t keylen,
     uint32_t *val)
 {
 	struct numarray *hit = numarray_find(ti, key);
 
 	if (hit == NULL)
 		return (0);
 
 	*val = hit->value;
 	return (1);
 }
 
 /*
  * Creates the table state with a zeroed runtime array of 16 elements,
  * stores it in @ta_state, points @ti at the array and installs the
  * lookup function.  Always returns 0.
  */
 static int
 ta_init_numarray(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti,
     char *data, uint8_t tflags)
 {
 	struct numarray_cfg *state;
 
 	state = malloc(sizeof(*state), M_IPFW, M_WAITOK | M_ZERO);
 	state->size = 16;
 	state->main_ptr = malloc(sizeof(struct numarray) * state->size, M_IPFW,
 	    M_WAITOK | M_ZERO);
 
 	/* Publish state and runtime pointers */
 	ti->lookup = ta_lookup_numarray;
 	ti->state = state->main_ptr;
 	*ta_state = state;
 
 	return (0);
 }
 
 /*
  * Destroys table @ti
  */
 static void
 ta_destroy_numarray(void *ta_state, struct table_info *ti)
 {
 	struct numarray_cfg *state = (struct numarray_cfg *)ta_state;
 
 	/* Runtime array first, then the state itself */
 	if (state->main_ptr != NULL) {
 		free(state->main_ptr, M_IPFW);
 	}
 
 	free(state, M_IPFW);
 }
 
 /*
  * Provide algo-specific table info
  */
 static void
 ta_dump_numarray_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo)
 {
 	struct numarray_cfg *state = (struct numarray_cfg *)ta_state;
 
 	/* Array class: allocated slots, active slots and element size */
 	tinfo->taclass4 = IPFW_TACLASS_ARRAY;
 	tinfo->itemsize4 = sizeof(struct numarray);
 	tinfo->size4 = state->size;
 	tinfo->count4 = state->used;
 }
 
 /*
  * Prepare for addition/deletion to an array.
  */
 static int
 ta_prepare_add_numarray(struct ip_fw_chain *ch, struct tentry_info *tei,
     void *ta_buf)
 {
 	struct ta_buf_numarray *tb;
 
 	tb = (struct ta_buf_numarray *)ta_buf;
 
 	/* Copy the 32-bit key from @tei; the add and del steps read it */
 	tb->na.number = *((uint32_t *)tei->paddr);
 
 	return (0);
 }
 
 /*
  * Inserts the number prepared in @ta_buf into the sorted array, or
  * swaps the stored value with the one in @tei when the number is
  * already present and TEI_FLAGS_UPDATE is set.
  *
  * Returns 0 on success (@pnum is 1 for an insertion, 0 for an update),
  * EEXIST for a duplicate without the update flag, and EFBIG when an
  * insertion is needed but forbidden by TEI_FLAGS_DONTADD.
  */
 static int
 ta_add_numarray(void *ta_state, struct table_info *ti, struct tentry_info *tei,
     void *ta_buf, uint32_t *pnum)
 {
 	struct numarray_cfg *cfg;
 	struct ta_buf_numarray *tb;
 	struct numarray *ri;
 	int res;
 	uint32_t value;
 
 	tb = (struct ta_buf_numarray *)ta_buf;
 	cfg = (struct numarray_cfg *)ta_state;
 
 	/* Read current value from @tei */
 	tb->na.value = tei->value;
 
 	ri = numarray_find(ti, &tb->na.number);
 
 	if (ri != NULL) {
 		if ((tei->flags & TEI_FLAGS_UPDATE) == 0)
 			return (EEXIST);
 
 		/* Exchange values between ri and @tei */
 		value = ri->value;
 		ri->value = tei->value;
 		tei->value = value;
 		/* Indicate that update has happened instead of addition */
 		tei->flags |= TEI_FLAGS_UPDATED;
 		*pnum = 0;
 		return (0);
 	}
 
 	if ((tei->flags & TEI_FLAGS_DONTADD) != 0)
 		return (EFBIG);
 
 	res = badd(&tb->na.number, &tb->na, cfg->main_ptr, cfg->used,
 	    sizeof(struct numarray), compare_numarray);
 
 	/* The key is a uint32_t, so print it with %u as the del path does */
 	KASSERT(res == 1, ("number %u already exists", tb->na.number));
 	cfg->used++;
 	ti->data = cfg->used;
 	*pnum = 1;
 
 	return (0);
 }
 
 /*
  * Remove the entry for the given number from the
  * sorted runtime array.
  */
 static int
 ta_del_numarray(void *ta_state, struct table_info *ti, struct tentry_info *tei,
     void *ta_buf, uint32_t *pnum)
 {
 	struct numarray_cfg *state = (struct numarray_cfg *)ta_state;
 	struct ta_buf_numarray *buf = (struct ta_buf_numarray *)ta_buf;
 	struct numarray *hit;
 	int res;
 
 	hit = numarray_find(ti, &buf->na.number);
 	if (hit == NULL)
 		return (ENOENT);
 
 	/* Report the value of the removed entry back through @tei */
 	tei->value = hit->value;
 
 	res = bdel(&buf->na.number, state->main_ptr, state->used,
 	    sizeof(struct numarray), compare_numarray);
 
 	KASSERT(res == 1, ("number %u does not exist", buf->na.number));
 	state->used--;
 	ti->data = state->used;
 	*pnum = 1;
 
 	return (0);
 }
 
 /*
  * Flush callback; intentionally empty.
  */
 static void
 ta_flush_numarray_entry(struct ip_fw_chain *ch, struct tentry_info *tei,
     void *ta_buf)
 {
 
 	/* We don't have any state, do nothing */
 }
 
 
 /*
  * Table growing callbacks.
  */
 
 /*
  * Checks whether the array can hold the active items plus @count
  * more.  If not, the allocated size is doubled until it can; the
  * result is stored in @pflags and 1 is returned.  Returns 0 when no
  * growth is needed.
  */
 static int
 ta_need_modify_numarray(void *ta_state, struct table_info *ti, uint32_t count,
     uint64_t *pflags)
 {
 	struct numarray_cfg *state = (struct numarray_cfg *)ta_state;
 	size_t wanted;
 
 	for (wanted = state->size; wanted < state->used + count; )
 		wanted *= 2;
 
 	if (wanted == state->size)
 		return (0);
 
 	*pflags = wanted;
 	return (1);
 }
 
 /*
  * Allocate new, larger runtime array.
  */
 static int
 ta_prepare_mod_numarray(void *ta_buf, uint64_t *pflags)
 {
 	struct mod_item *item = (struct mod_item *)ta_buf;
 
 	/* Start from a clean buffer; @pflags carries the new item count */
 	memset(item, 0, sizeof(struct mod_item));
 	item->size = *pflags;
 	item->main_ptr = malloc(sizeof(struct numarray) * item->size, M_IPFW,
 	    M_WAITOK | M_ZERO);
 
 	return (0);
 }
 
 /*
  * Copy data from old runtime array to new one.
  */
 static int
 ta_fill_mod_numarray(void *ta_state, struct table_info *ti, void *ta_buf,
     uint64_t *pflags)
 {
 	struct numarray_cfg *state = (struct numarray_cfg *)ta_state;
 	struct mod_item *item = (struct mod_item *)ta_buf;
 
 	if (state->size < item->size) {
 		/* Still too small: copy the active elements over */
 		memcpy(item->main_ptr, state->main_ptr,
 		    state->used * sizeof(struct numarray));
 		return (0);
 	}
 
 	/* The array is already large enough; clear the request */
 	*pflags = 0;
 	return (0);
 }
 
 /*
  * Switch old & new arrays.
  */
 static void
 ta_modify_numarray(void *ta_state, struct table_info *ti, void *ta_buf,
     uint64_t pflags)
 {
 	struct numarray_cfg *state = (struct numarray_cfg *)ta_state;
 	struct mod_item *item = (struct mod_item *)ta_buf;
 	void *retired = state->main_ptr;
 
 	/* Install the new array */
 	state->main_ptr = item->main_ptr;
 	state->size = item->size;
 	ti->state = state->main_ptr;
 
 	/* Leave the old array in the buffer for the flush step */
 	item->main_ptr = retired;
 }
 
 /*
  * Free the array left in the mod_item (the unneeded one), if any.
  */
 static void
 ta_flush_mod_numarray(void *ta_buf)
 {
 	struct mod_item *item = ta_buf;
 
 	if (item->main_ptr == NULL)
 		return;
 
 	free(item->main_ptr, M_IPFW);
 }
 
 /*
  * Export the array element @e into @tent: its number becomes the key and
  * its value becomes the value index.  Always returns 0.
  */
 static int
 ta_dump_numarray_tentry(void *ta_state, struct table_info *ti, void *e,
     ipfw_obj_tentry *tent)
 {
 	const struct numarray *item = e;
 
 	tent->v.kidx = item->value;
 	tent->k.key = item->number;
 
 	return (0);
 }
 
 /*
  * Look up the number stored in @tent->k.key and, when found, fill @tent
  * from the matching element.
  *
  * Returns 0 on success or ENOENT when numarray_find() reports no match.
  */
 static int
 ta_find_numarray_tentry(void *ta_state, struct table_info *ti,
     ipfw_obj_tentry *tent)
 {
 	struct numarray *found;
 
 	found = numarray_find(ti, &tent->k.key);
 	if (found == NULL)
 		return (ENOENT);
 
 	ta_dump_numarray_tentry(ta_state, ti, found, tent);
 
 	return (0);
 }
 
 /*
  * Invoke @f with @arg on every used element of the array, in index order.
  */
 static void
 ta_foreach_numarray(void *ta_state, struct table_info *ti, ta_foreach_f *f,
     void *arg)
 {
 	struct numarray_cfg *nc = ta_state;
 	struct numarray *entries = nc->main_ptr;
 	int idx;
 
 	idx = 0;
 	while (idx < nc->used) {
 		f(&entries[idx], arg);
 		idx++;
 	}
 }
 
 /*
  * Callback table for the "number:array" algorithm (IPFW_TABLE_NUMBER).
  */
 struct table_algo number_array = {
 	.name		= "number:array",
 	.type		= IPFW_TABLE_NUMBER,
 	.ta_buf_size	= sizeof(struct ta_buf_numarray),
 	.init		= ta_init_numarray,
 	.destroy	= ta_destroy_numarray,
 	.prepare_add	= ta_prepare_add_numarray,
 	/* NOTE(review): delete reuses the add-side prepare routine; confirm intended. */
 	.prepare_del	= ta_prepare_add_numarray,
 	.add		= ta_add_numarray,
 	.del		= ta_del_numarray,
 	.flush_entry	= ta_flush_numarray_entry,
 	.foreach	= ta_foreach_numarray,
 	.dump_tentry	= ta_dump_numarray_tentry,
 	.find_tentry	= ta_find_numarray_tentry,
 	.dump_tinfo	= ta_dump_numarray_tinfo,
 	.need_modify	= ta_need_modify_numarray,
 	.prepare_mod	= ta_prepare_mod_numarray,
 	.fill_mod	= ta_fill_mod_numarray,
 	.modify		= ta_modify_numarray,
 	.flush_mod	= ta_flush_mod_numarray,
 };
 
 /*
  * flow:hash cmds
  *
  *
  * ti->data:
  * [inv.mask4][inv.mask6][log2hsize4][log2hsize6]
  * [        8][        8][          8][         8]
  *
  * inv.mask4: 32 - mask
  * inv.mask6:
  * 1) _slow lookup: mask
  * 2) _aligned: (128 - mask) / 8
  * 3) _64: 8
  *
  *
  * pflags:
  * [hsize4][hsize6]
  * [    16][    16]
  */
 
 struct fhashentry;
 
 /* Bucket head type: a singly-linked list of fhashentry. */
 SLIST_HEAD(fhashbhead, fhashentry);
 
 /*
  * Address-family independent part of a flow entry.
  * cmp_flow_ent() compares the 8 bytes that follow 'next' as one 64-bit
  * word, so the fields from 'af' through 'sport' must stay together and in
  * this order.
  */
 struct fhashentry {
 	SLIST_ENTRY(fhashentry)	next;	/* bucket linkage */
 	uint8_t		af;		/* AF_INET or AF_INET6 */
 	uint8_t		proto;
 	uint16_t	spare0;
 	uint16_t	dport;		/* stored after ntohs() */
 	uint16_t	sport;		/* stored after ntohs() */
 	uint32_t	value;		/* returned by the lookup routine */
 	uint32_t	spare1;
 };
 
 /* IPv4 flow entry: common part followed by destination/source address. */
 struct fhashentry4 {
 	struct fhashentry	e;
 	struct in_addr		dip;
 	struct in_addr		sip;
 };
 
 /* IPv6 flow entry: common part followed by destination/source address. */
 struct fhashentry6 {
 	struct fhashentry	e;
 	struct in6_addr		dip6;
 	struct in6_addr		sip6;
 };
 
 /*
  * Per-table state of flow:hash.
  * fe4/fe6 hold the field masks built by ta_init_fhash(); ta_lookup_fhash()
  * reaches fe6 as the object right after fe4, so fe6 must directly follow
  * fe4.
  */
 struct fhash_cfg {
 	struct fhashbhead	*head;	/* bucket array */
 	size_t			size;	/* number of buckets */
 	size_t			items;	/* number of stored entries */
 	struct fhashentry4	fe4;
 	struct fhashentry6	fe6;
 };
 
 /*
  * Scratch buffer passed between the prepare/add/del/flush callbacks.
  */
 struct ta_buf_fhash {
 	void	*ent_ptr;		/* entry to free in ta_flush_fhash_entry() */
 	struct fhashentry6 fe6;		/* key built by ta_prepare_del_fhash() */
 };
 
 static __inline int cmp_flow_ent(struct fhashentry *a,
     struct fhashentry *b, size_t sz);
 static __inline uint32_t hash_flow4(struct fhashentry4 *f, int hsize);
 static __inline uint32_t hash_flow6(struct fhashentry6 *f, int hsize);
 static uint32_t hash_flow_ent(struct fhashentry *ent, uint32_t size);
 static int ta_lookup_fhash(struct table_info *ti, void *key, uint32_t keylen,
     uint32_t *val);
 static int ta_init_fhash(struct ip_fw_chain *ch, void **ta_state,
 struct table_info *ti, char *data, uint8_t tflags);
 static void ta_destroy_fhash(void *ta_state, struct table_info *ti);
 static void ta_dump_fhash_tinfo(void *ta_state, struct table_info *ti,
     ipfw_ta_tinfo *tinfo);
 static int ta_dump_fhash_tentry(void *ta_state, struct table_info *ti,
     void *e, ipfw_obj_tentry *tent);
 static int tei_to_fhash_ent(struct tentry_info *tei, struct fhashentry *ent);
 static int ta_find_fhash_tentry(void *ta_state, struct table_info *ti,
     ipfw_obj_tentry *tent);
 static void ta_foreach_fhash(void *ta_state, struct table_info *ti,
     ta_foreach_f *f, void *arg);
 static int ta_prepare_add_fhash(struct ip_fw_chain *ch,
     struct tentry_info *tei, void *ta_buf);
 static int ta_add_fhash(void *ta_state, struct table_info *ti,
     struct tentry_info *tei, void *ta_buf, uint32_t *pnum);
 static int ta_prepare_del_fhash(struct ip_fw_chain *ch, struct tentry_info *tei,
     void *ta_buf);
 static int ta_del_fhash(void *ta_state, struct table_info *ti,
     struct tentry_info *tei, void *ta_buf, uint32_t *pnum);
 static void ta_flush_fhash_entry(struct ip_fw_chain *ch, struct tentry_info *tei,
     void *ta_buf);
 static int ta_need_modify_fhash(void *ta_state, struct table_info *ti,
     uint32_t count, uint64_t *pflags);
 static int ta_prepare_mod_fhash(void *ta_buf, uint64_t *pflags);
 static int ta_fill_mod_fhash(void *ta_state, struct table_info *ti,
     void *ta_buf, uint64_t *pflags);
 static void ta_modify_fhash(void *ta_state, struct table_info *ti, void *ta_buf,
     uint64_t pflags);
 static void ta_flush_mod_fhash(void *ta_buf);
 
 /*
  * Compare two flow entries.  The 64-bit word located right after the list
  * linkage is compared first, then @sz bytes following the common
  * struct fhashentry part.  Returns 1 when both match, 0 otherwise.
  */
 static __inline int
 cmp_flow_ent(struct fhashentry *a, struct fhashentry *b, size_t sz)
 {
 	uint64_t *word_a, *word_b;
 
 	word_a = (uint64_t *)(&a->next + 1);
 	word_b = (uint64_t *)(&b->next + 1);
 
 	if (*word_a != *word_b)
 		return (0);
 
 	return (memcmp(a + 1, b + 1, sz) == 0);
 }
 
 /*
  * Bucket index for an IPv4 flow: XOR of both addresses and both ports,
  * reduced modulo (@hsize - 1).
  */
 static __inline uint32_t
 hash_flow4(struct fhashentry4 *f, int hsize)
 {
 	uint32_t mix;
 
 	mix = f->dip.s_addr;
 	mix ^= f->sip.s_addr;
 	mix ^= f->e.dport;
 	mix ^= f->e.sport;
 
 	return (mix % (hsize - 1));
 }
 
 /*
  * Bucket index for an IPv6 flow: XOR of 32-bit words 2 and 3 of both
  * addresses and both ports, reduced modulo (@hsize - 1).
  */
 static __inline uint32_t
 hash_flow6(struct fhashentry6 *f, int hsize)
 {
 	uint32_t mix;
 
 	mix = f->dip6.__u6_addr.__u6_addr32[2];
 	mix ^= f->dip6.__u6_addr.__u6_addr32[3];
 	mix ^= f->sip6.__u6_addr.__u6_addr32[2];
 	mix ^= f->sip6.__u6_addr.__u6_addr32[3];
 	mix ^= f->e.dport;
 	mix ^= f->e.sport;
 
 	return (mix % (hsize - 1));
 }
 
 /*
  * Hash @ent with the IPv4 routine when its family is AF_INET and with the
  * IPv6 routine otherwise.
  */
 static uint32_t
 hash_flow_ent(struct fhashentry *ent, uint32_t size)
 {
 
 	if (ent->af == AF_INET)
 		return (hash_flow4((struct fhashentry4 *)ent, size));
 
 	return (hash_flow6((struct fhashentry6 *)ent, size));
 }
 
 /*
  * Runtime lookup for flow:hash.
  *
  * @key points to a struct ipfw_flow_id.  The table's field mask (taken
  * from ti->xstate) is ANDed with the packet's addresses, ports and
  * protocol, the result is hashed and the selected bucket is searched.
  *
  * Returns 1 and stores the entry value in @val on a match, 0 otherwise
  * (including address types other than 4 and 6).
  *
  * The bucket count is kept in 32 bits: ta_need_modify_fhash() lets the
  * table grow up to 65536 buckets, a value a 16-bit variable would
  * truncate to 0 and thereby hash differently from the add/delete paths.
  */
 static int
 ta_lookup_fhash(struct table_info *ti, void *key, uint32_t keylen,
     uint32_t *val)
 {
 	struct fhashbhead *head;
 	struct fhashentry *ent;
 	struct fhashentry4 *m4;
 	struct ipfw_flow_id *id;
 	uint32_t hash, hsize;
 
 	id = (struct ipfw_flow_id *)key;
 	head = (struct fhashbhead *)ti->state;
 	hsize = ti->data;
 	m4 = (struct fhashentry4 *)ti->xstate;
 
 	if (id->addr_type == 4) {
 		struct fhashentry4 f;
 
 		/* Copy hash mask */
 		f = *m4;
 
 		f.dip.s_addr &= id->dst_ip;
 		f.sip.s_addr &= id->src_ip;
 		f.e.dport &= id->dst_port;
 		f.e.sport &= id->src_port;
 		f.e.proto &= id->proto;
 		hash = hash_flow4(&f, hsize);
 		SLIST_FOREACH(ent, &head[hash], next) {
 			if (cmp_flow_ent(ent, &f.e, 2 * 4) != 0) {
 				*val = ent->value;
 				return (1);
 			}
 		}
 	} else if (id->addr_type == 6) {
 		struct fhashentry6 f;
 		uint64_t *fp, *idp;
 
 		/* Copy hash mask; the IPv6 mask is stored right after the IPv4 one */
 		f = *((struct fhashentry6 *)(m4 + 1));
 
 		/* Handle lack of __u6_addr.__u6_addr64 */
 		fp = (uint64_t *)&f.dip6;
 		idp = (uint64_t *)&id->dst_ip6;
 		/* src IPv6 is stored after dst IPv6 */
 		*fp++ &= *idp++;
 		*fp++ &= *idp++;
 		*fp++ &= *idp++;
 		*fp &= *idp;
 		f.e.dport &= id->dst_port;
 		f.e.sport &= id->src_port;
 		f.e.proto &= id->proto;
 		hash = hash_flow6(&f, hsize);
 		SLIST_FOREACH(ent, &head[hash], next) {
 			if (cmp_flow_ent(ent, &f.e, 2 * 16) != 0) {
 				*val = ent->value;
 				return (1);
 			}
 		}
 	}
 
 	return (0);
 }
 
 /*
  * New table.
  *
  * Allocates the flow:hash state with 512 empty buckets, builds the IPv4
  * and IPv6 field masks selected by @tflags and publishes the bucket array,
  * the masks, the bucket count and the lookup routine through @ti.
  * Always returns 0.
  */
 static int
 ta_init_fhash(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti,
     char *data, uint8_t tflags)
 {
-	int i;
 	struct fhash_cfg *cfg;
 	struct fhashentry4 *fe4;
 	struct fhashentry6 *fe6;
+	u_int i;
 
 	cfg = malloc(sizeof(struct fhash_cfg), M_IPFW, M_WAITOK | M_ZERO);
 
 	/* Initial number of buckets */
 	cfg->size = 512;
 
 	cfg->head = malloc(sizeof(struct fhashbhead) * cfg->size, M_IPFW,
 	    M_WAITOK | M_ZERO);
 	for (i = 0; i < cfg->size; i++)
 		SLIST_INIT(&cfg->head[i]);
 
 	/* Fill in fe masks based on @tflags */
 	fe4 = &cfg->fe4;
 	fe6 = &cfg->fe6;
 	if (tflags & IPFW_TFFLAG_SRCIP) {
 		memset(&fe4->sip, 0xFF, sizeof(fe4->sip));
 		memset(&fe6->sip6, 0xFF, sizeof(fe6->sip6));
 	}
 	if (tflags & IPFW_TFFLAG_DSTIP) {
 		memset(&fe4->dip, 0xFF, sizeof(fe4->dip));
 		memset(&fe6->dip6, 0xFF, sizeof(fe6->dip6));
 	}
 	if (tflags & IPFW_TFFLAG_SRCPORT) {
 		memset(&fe4->e.sport, 0xFF, sizeof(fe4->e.sport));
 		memset(&fe6->e.sport, 0xFF, sizeof(fe6->e.sport));
 	}
 	if (tflags & IPFW_TFFLAG_DSTPORT) {
 		memset(&fe4->e.dport, 0xFF, sizeof(fe4->e.dport));
 		memset(&fe6->e.dport, 0xFF, sizeof(fe6->e.dport));
 	}
 	if (tflags & IPFW_TFFLAG_PROTO) {
 		memset(&fe4->e.proto, 0xFF, sizeof(fe4->e.proto));
 		memset(&fe6->e.proto, 0xFF, sizeof(fe6->e.proto));
 	}
 
 	/* The address family is part of the compared key, so set it exactly */
 	fe4->e.af = AF_INET;
 	fe6->e.af = AF_INET6;
 
 	*ta_state = cfg;
 	ti->state = cfg->head;
 	/* ta_lookup_fhash() finds the IPv6 mask right behind the IPv4 one */
 	ti->xstate = &cfg->fe4;
 	ti->data = cfg->size;
 	ti->lookup = ta_lookup_fhash;
 
 	return (0);
 }
 
 static void
 ta_destroy_fhash(void *ta_state, struct table_info *ti)
 {
 	struct fhash_cfg *cfg;
 	struct fhashentry *ent, *ent_next;
 	int i;
 
 	cfg = (struct fhash_cfg *)ta_state;
 
 	for (i = 0; i < cfg->size; i++)
 		SLIST_FOREACH_SAFE(ent, &cfg->head[i], next, ent_next)
 			free(ent, M_IPFW_TBL);
 
 	free(cfg->head, M_IPFW);
 	free(cfg, M_IPFW);
 }
 
 /*
  * Report algorithm-specific table information: hash class, bucket count,
  * item count and the per-family item sizes.
  */
 static void
 ta_dump_fhash_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo)
 {
 	struct fhash_cfg *fc = ta_state;
 
 	tinfo->flags = IPFW_TATFLAGS_AFITEM;
 	tinfo->taclass4 = IPFW_TACLASS_HASH;
 
 	tinfo->itemsize4 = sizeof(struct fhashentry4);
 	tinfo->itemsize6 = sizeof(struct fhashentry6);
 
 	tinfo->size4 = fc->size;
 	tinfo->count4 = fc->items;
 }
 
 /*
  * Export the flow entry @e into @tent.  Ports and IPv4 addresses are
  * converted with htons()/htonl(); IPv6 addresses are copied as stored.
  * Always returns 0.
  */
 static int
 ta_dump_fhash_tentry(void *ta_state, struct table_info *ti, void *e,
     ipfw_obj_tentry *tent)
 {
 	struct fhash_cfg *cfg;
 	struct fhashentry *ent;
 	struct fhashentry4 *fe4;
 #ifdef INET6
 	struct fhashentry6 *fe6;
 #endif
 	struct tflow_entry *tfe;
 
 	cfg = (struct fhash_cfg *)ta_state;
 	ent = (struct fhashentry *)e;
 	tfe = &tent->k.flow;
 
 	/* Fields common to both address families */
 	tfe->af = ent->af;
 	tfe->proto = ent->proto;
 	tfe->dport = htons(ent->dport);
 	tfe->sport = htons(ent->sport);
 	tent->v.kidx = ent->value;
 	tent->subtype = ent->af;
 
 	if (ent->af == AF_INET) {
 		fe4 = (struct fhashentry4 *)ent;
 		tfe->a.a4.sip.s_addr = htonl(fe4->sip.s_addr);
 		tfe->a.a4.dip.s_addr = htonl(fe4->dip.s_addr);
 		tent->masklen = 32;
 #ifdef INET6
 	} else {
 		/* Anything that is not AF_INET is exported as IPv6 */
 		fe6 = (struct fhashentry6 *)ent;
 		tfe->a.a6.sip6 = fe6->sip6;
 		tfe->a.a6.dip6 = fe6->dip6;
 		tent->masklen = 128;
 #endif
 	}
 
 	return (0);
 }
 
 /*
  * Convert the flow description referenced by @tei->paddr into the hash
  * entry @ent.  Ports and IPv4 addresses are converted with
  * ntohs()/ntohl(); IPv6 addresses are copied unchanged.  @ent must be
  * large enough for the entry type selected by @tei->subtype.
  *
  * Returns 0 on success, EINVAL for a subtype that is not handled.
  */
 static int
 tei_to_fhash_ent(struct tentry_info *tei, struct fhashentry *ent)
 {
 #ifdef INET
 	struct fhashentry4 *fe4;
 #endif
 #ifdef INET6
 	struct fhashentry6 *fe6;
 #endif
 	struct tflow_entry *tfe;
 
 	tfe = (struct tflow_entry *)tei->paddr;
 
 	/* Fields common to both address families */
 	ent->af = tei->subtype;
 	ent->proto = tfe->proto;
 	ent->dport = ntohs(tfe->dport);
 	ent->sport = ntohs(tfe->sport);
 
 	if (tei->subtype == AF_INET) {
 #ifdef INET
 		fe4 = (struct fhashentry4 *)ent;
 		fe4->sip.s_addr = ntohl(tfe->a.a4.sip.s_addr);
 		fe4->dip.s_addr = ntohl(tfe->a.a4.dip.s_addr);
 #endif
 #ifdef INET6
 	} else if (tei->subtype == AF_INET6) {
 		fe6 = (struct fhashentry6 *)ent;
 		fe6->sip6 = tfe->a.a6.sip6;
 		fe6->dip6 = tfe->a.a6.dip6;
 #endif
 	} else {
 		/* Unknown CIDR type */
 		return (EINVAL);
 	}
 
 	return (0);
 }
 
 
 /*
  * Find the entry matching the flow described in @tent and, when present,
  * overwrite @tent with the stored entry.
  *
  * Returns 0 on success, ENOENT when no entry matches, or the error
  * reported by tei_to_fhash_ent().
  */
 static int
 ta_find_fhash_tentry(void *ta_state, struct table_info *ti,
     ipfw_obj_tentry *tent)
 {
 	struct fhash_cfg *fc = ta_state;
 	struct fhashentry6 probe;
 	struct tentry_info req;
 	struct fhashentry *key, *cur;
 	uint32_t bucket;
 	size_t addrlen;
 	int rc;
 
 	/* Build a zeroed search key large enough for either family. */
 	memset(&probe, 0, sizeof(probe));
 	memset(&req, 0, sizeof(req));
 	key = &probe.e;
 
 	req.paddr = &tent->k.flow;
 	req.subtype = tent->subtype;
 
 	rc = tei_to_fhash_ent(&req, key);
 	if (rc != 0)
 		return (rc);
 
 	bucket = hash_flow_ent(key, fc->size);
 	addrlen = (req.subtype == AF_INET) ?
 	    2 * sizeof(struct in_addr) : 2 * sizeof(struct in6_addr);
 
 	SLIST_FOREACH(cur, &fc->head[bucket], next) {
 		if (cmp_flow_ent(cur, key, addrlen) == 0)
 			continue;
 
 		ta_dump_fhash_tentry(ta_state, ti, cur, tent);
 		return (0);
 	}
 
 	return (ENOENT);
 }
 
 /*
  * Invoke @f with @arg on every entry of the table, bucket by bucket.
  * The safe list iterator is used, so the next pointer is fetched before
  * @f runs on an entry.
  */
 static void
 ta_foreach_fhash(void *ta_state, struct table_info *ti, ta_foreach_f *f,
     void *arg)
 {
 	struct fhash_cfg *cfg;
 	struct fhashentry *ent, *ent_next;
 	u_int i;	/* unsigned: compared against the size_t bucket count */
 
 	cfg = (struct fhash_cfg *)ta_state;
 
 	for (i = 0; i < cfg->size; i++)
 		SLIST_FOREACH_SAFE(ent, &cfg->head[i], next, ent_next)
 			f(ent, arg);
 }
 
 /*
  * Prepare step of an add: allocate a zeroed entry sized for the address
  * family in @tei->subtype, fill it from @tei and park it in @ta_buf.
  *
  * Returns 0 on success, EINVAL for an unsupported subtype, or the error
  * from tei_to_fhash_ent() (the entry is freed in that case).
  */
 static int
 ta_prepare_add_fhash(struct ip_fw_chain *ch, struct tentry_info *tei,
     void *ta_buf)
 {
 	struct ta_buf_fhash *buf = ta_buf;
 	struct fhashentry *fresh;
 	size_t len;
 	int rc;
 
 	switch (tei->subtype) {
 	case AF_INET:
 		len = sizeof(struct fhashentry4);
 		break;
 	case AF_INET6:
 		len = sizeof(struct fhashentry6);
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	fresh = malloc(len, M_IPFW_TBL, M_WAITOK | M_ZERO);
 
 	rc = tei_to_fhash_ent(tei, fresh);
 	if (rc == 0) {
 		buf->ent_ptr = fresh;
 		return (0);
 	}
 
 	free(fresh, M_IPFW_TBL);
 	return (rc);
 }
 
 /*
  * Insert the entry prepared in @ta_buf, or update an existing one.
  *
  * - Entry exists and TEI_FLAGS_UPDATE is set: the stored value and
  *   @tei->value are exchanged, TEI_FLAGS_UPDATED is raised, *@pnum = 0.
  * - Entry exists without TEI_FLAGS_UPDATE: EEXIST.
  * - Entry is new and TEI_FLAGS_DONTADD is set: EFBIG.
  * - Otherwise the entry is linked into its bucket, ownership leaves
  *   @ta_buf and *@pnum = 1.
  */
 static int
 ta_add_fhash(void *ta_state, struct table_info *ti, struct tentry_info *tei,
     void *ta_buf, uint32_t *pnum)
 {
 	struct fhash_cfg *fc = ta_state;
 	struct ta_buf_fhash *buf = ta_buf;
 	struct fhashbhead *bucket;
 	struct fhashentry *fresh, *cur;
 	uint32_t prev;
 	size_t addrlen;
 
 	fresh = (struct fhashentry *)buf->ent_ptr;
 
 	/* Read current value from @tei */
 	fresh->value = tei->value;
 
 	bucket = &fc->head[hash_flow_ent(fresh, fc->size)];
 	addrlen = (tei->subtype == AF_INET) ?
 	    2 * sizeof(struct in_addr) : 2 * sizeof(struct in6_addr);
 
 	/* Stop at the first matching entry; cur is NULL when none matches. */
 	SLIST_FOREACH(cur, bucket, next) {
 		if (cmp_flow_ent(cur, fresh, addrlen) != 0)
 			break;
 	}
 
 	if (cur != NULL) {
 		if ((tei->flags & TEI_FLAGS_UPDATE) == 0)
 			return (EEXIST);
 
 		/* Swap the stored value with the one supplied in @tei. */
 		prev = cur->value;
 		cur->value = tei->value;
 		tei->value = prev;
 
 		/* Tell the caller an update happened instead of an addition. */
 		tei->flags |= TEI_FLAGS_UPDATED;
 		*pnum = 0;
 		return (0);
 	}
 
 	if ((tei->flags & TEI_FLAGS_DONTADD) != 0)
 		return (EFBIG);
 
 	SLIST_INSERT_HEAD(bucket, fresh, next);
 	buf->ent_ptr = NULL;
 	*pnum = 1;
 
 	/* Update the item counter. */
 	fc->items++;
 
 	return (0);
 }
 
 /*
  * Prepare step of a delete: build the search key inside @ta_buf.
  * Returns whatever tei_to_fhash_ent() returns.
  */
 static int
 ta_prepare_del_fhash(struct ip_fw_chain *ch, struct tentry_info *tei,
     void *ta_buf)
 {
 	struct ta_buf_fhash *buf = ta_buf;
 	struct fhashentry *key = &buf->fe6.e;
 
 	return (tei_to_fhash_ent(tei, key));
 }
 
 /*
  * Remove the entry matching the key prepared in @ta_buf.
  *
  * On success the entry is unlinked, its value is copied to @tei->value,
  * *@pnum is set to 1 and the entry is left in @ta_buf->ent_ptr.
  * Returns 0 on success, ENOENT when nothing matches.
  */
 static int
 ta_del_fhash(void *ta_state, struct table_info *ti, struct tentry_info *tei,
     void *ta_buf, uint32_t *pnum)
 {
 	struct fhash_cfg *fc = ta_state;
 	struct ta_buf_fhash *buf = ta_buf;
 	struct fhashbhead *bucket;
 	struct fhashentry *key, *cur;
 	size_t addrlen;
 
 	key = &buf->fe6.e;
 	bucket = &fc->head[hash_flow_ent(key, fc->size)];
 	addrlen = (tei->subtype == AF_INET) ?
 	    2 * sizeof(struct in_addr) : 2 * sizeof(struct in6_addr);
 
 	/* Stop at the first matching entry; cur is NULL when none matches. */
 	SLIST_FOREACH(cur, bucket, next) {
 		if (cmp_flow_ent(cur, key, addrlen) != 0)
 			break;
 	}
 
 	if (cur == NULL)
 		return (ENOENT);
 
 	SLIST_REMOVE(bucket, cur, fhashentry, next);
 	tei->value = cur->value;
 	*pnum = 1;
 	fc->items--;
 	buf->ent_ptr = cur;
 
 	return (0);
 }
 
 /*
  * Release the entry still referenced by @ta_buf, if any.
  */
 static void
 ta_flush_fhash_entry(struct ip_fw_chain *ch, struct tentry_info *tei,
     void *ta_buf)
 {
 	struct ta_buf_fhash *buf = ta_buf;
 
 	if (buf->ent_ptr == NULL)
 		return;
 
 	free(buf->ent_ptr, M_IPFW_TBL);
 }
 
 /*
  * Hash growing callbacks.
  */
 
 /*
  * Request a doubling of the bucket array once there are more items than
  * buckets, as long as the bucket count is still below 65536.
  * Returns 1 with the new bucket count in @pflags, or 0 when no resize is
  * needed.
  */
 static int
 ta_need_modify_fhash(void *ta_state, struct table_info *ti, uint32_t count,
     uint64_t *pflags)
 {
 	struct fhash_cfg *fc = ta_state;
 
 	if (fc->items <= fc->size || fc->size >= 65536)
 		return (0);
 
 	*pflags = fc->size * 2;
 	return (1);
 }
 
 /*
  * Allocate new, larger fhash.
  *
  * Clears the mod_item in @ta_buf, records the bucket count requested
  * through @pflags and allocates that many empty buckets.  Always returns 0.
  */
 static int
 ta_prepare_mod_fhash(void *ta_buf, uint64_t *pflags)
 {
 	struct mod_item *mi;
 	struct fhashbhead *head;
-	int i;
+	u_int i;
 
 	mi = (struct mod_item *)ta_buf;
 
 	memset(mi, 0, sizeof(struct mod_item));
 	mi->size = *pflags;
 	head = malloc(sizeof(struct fhashbhead) * mi->size, M_IPFW,
 	    M_WAITOK | M_ZERO);
 	for (i = 0; i < mi->size; i++)
 		SLIST_INIT(&head[i]);
 
 	/* The new bucket array is handed over via the mod_item */
 	mi->main_ptr = head;
 
 	return (0);
 }
 
 /*
  * Fill step of a resize.  Nothing is copied here: entries are moved to
  * the new bucket array by ta_modify_fhash().  Always returns 0.
  */
 static int
 ta_fill_mod_fhash(void *ta_state, struct table_info *ti, void *ta_buf,
     uint64_t *pflags)
 {
 
 	/* It is not possible to do rehash if we're not holding WLOCK. */
 	return (0);
 }
 
 /*
  * Switch old & new arrays.
  *
  * Every entry of the current bucket array is rehashed into the array
  * prepared in @ta_buf, the new array and size are published through @ti
  * and the configuration, and the old (now empty) array is handed back in
  * @ta_buf.
  */
 static void
 ta_modify_fhash(void *ta_state, struct table_info *ti, void *ta_buf,
     uint64_t pflags)
 {
 	struct mod_item *mi;
 	struct fhash_cfg *cfg;
 	struct fhashbhead *old_head, *new_head;
 	struct fhashentry *ent, *ent_next;
 	u_int i;	/* unsigned: compared against the size_t bucket count */
 	uint32_t nhash;
 	size_t old_size;
 
 	mi = (struct mod_item *)ta_buf;
 	cfg = (struct fhash_cfg *)ta_state;
 
 	old_size = cfg->size;
 	old_head = ti->state;
 
 	new_head = (struct fhashbhead *)mi->main_ptr;
 	for (i = 0; i < old_size; i++) {
 		/* Safe iteration: inserting rewrites the entry's next link */
 		SLIST_FOREACH_SAFE(ent, &old_head[i], next, ent_next) {
 			nhash = hash_flow_ent(ent, mi->size);
 			SLIST_INSERT_HEAD(&new_head[nhash], ent, next);
 		}
 	}
 
 	ti->state = new_head;
 	ti->data = mi->size;
 	cfg->head = new_head;
 	cfg->size = mi->size;
 
 	mi->main_ptr = old_head;
 }
 
 /*
  * Free the bucket array left in the mod_item (the unneeded one), if any.
  */
 static void
 ta_flush_mod_fhash(void *ta_buf)
 {
 	struct mod_item *item = ta_buf;
 
 	if (item->main_ptr == NULL)
 		return;
 
 	free(item->main_ptr, M_IPFW);
 }
 
 /*
  * Callback table for the "flow:hash" algorithm (IPFW_TABLE_FLOW),
  * flagged TA_FLAG_DEFAULT.
  */
 struct table_algo flow_hash = {
 	.name		= "flow:hash",
 	.type		= IPFW_TABLE_FLOW,
 	.flags		= TA_FLAG_DEFAULT,
 	.ta_buf_size	= sizeof(struct ta_buf_fhash),
 	.init		= ta_init_fhash,
 	.destroy	= ta_destroy_fhash,
 	.prepare_add	= ta_prepare_add_fhash,
 	.prepare_del	= ta_prepare_del_fhash,
 	.add		= ta_add_fhash,
 	.del		= ta_del_fhash,
 	.flush_entry	= ta_flush_fhash_entry,
 	.foreach	= ta_foreach_fhash,
 	.dump_tentry	= ta_dump_fhash_tentry,
 	.find_tentry	= ta_find_fhash_tentry,
 	.dump_tinfo	= ta_dump_fhash_tinfo,
 	.need_modify	= ta_need_modify_fhash,
 	.prepare_mod	= ta_prepare_mod_fhash,
 	.fill_mod	= ta_fill_mod_fhash,
 	.modify		= ta_modify_fhash,
 	.flush_mod	= ta_flush_mod_fhash,
 };
 
 /*
  * Kernel fibs bindings.
  *
  * Implementation:
  *
  * Runtime part:
  * - fully relies on route API
  * - fib number is stored in ti->data
  *
  */
 
 static int ta_lookup_kfib(struct table_info *ti, void *key, uint32_t keylen,
     uint32_t *val);
 static int kfib_parse_opts(int *pfib, char *data);
 static void ta_print_kfib_config(void *ta_state, struct table_info *ti,
     char *buf, size_t bufsize);
 static int ta_init_kfib(struct ip_fw_chain *ch, void **ta_state,
     struct table_info *ti, char *data, uint8_t tflags);
 static void ta_destroy_kfib(void *ta_state, struct table_info *ti);
 static void ta_dump_kfib_tinfo(void *ta_state, struct table_info *ti,
     ipfw_ta_tinfo *tinfo);
 static int contigmask(uint8_t *p, int len);
 static int ta_dump_kfib_tentry(void *ta_state, struct table_info *ti, void *e,
     ipfw_obj_tentry *tent);
 static int ta_dump_kfib_tentry_int(struct sockaddr *paddr,
     struct sockaddr *pmask, ipfw_obj_tentry *tent);
 static int ta_find_kfib_tentry(void *ta_state, struct table_info *ti,
     ipfw_obj_tentry *tent);
 static void ta_foreach_kfib(void *ta_state, struct table_info *ti,
     ta_foreach_f *f, void *arg);
 
 
 /*
  * Runtime lookup for addr:kfib: ask the routing code whether the fib whose
  * number is stored in ti->data has a route for @key.
  *
  * Returns 1 and sets *@val to 0 when the route lookup succeeds, 0 when it
  * fails or when @keylen selects no branch.
  */
 static int
 ta_lookup_kfib(struct table_info *ti, void *key, uint32_t keylen,
     uint32_t *val)
 {
 #ifdef INET
 	struct nhop4_basic nh4;
 	struct in_addr in;
 #endif
 #ifdef INET6
 	struct nhop6_basic nh6;
 #endif
 	int error;
 
 	/* Stays ENOENT when no branch below handles @keylen */
 	error = ENOENT;
 #ifdef INET
 	if (keylen == 4) {
 		in.s_addr = *(in_addr_t *)key;
 		error = fib4_lookup_nh_basic(ti->data,
 		    in, 0, 0, &nh4);
 	}
 #endif
 #ifdef INET6
 	/*
 	 * NOTE(review): the key is used as a struct in6_addr (16 bytes) but
 	 * the branch tests for a length of 6 -- confirm which length the
 	 * callers actually pass for IPv6 keys.
 	 */
 	if (keylen == 6)
 		error = fib6_lookup_nh_basic(ti->data,
 		    (struct in6_addr *)key, 0, 0, 0, &nh6);
 #endif
 
 	if (error != 0)
 		return (0);
 
 	/* Only existence is reported; no per-route value is available */
 	*val = 0;
 
 	return (1);
 }
 
 /*
  * Parse 'fib=%d'
  *
  * @data is an algorithm name optionally followed by blanks and a
  * "fib=<number>" option.  The string is modified in place: the first
  * blank after the option is replaced by a terminator.
  *
  * Returns 0 and leaves *@pfib untouched when @data is NULL or contains no
  * blank.  Returns 0 and stores the number in *@pfib when the option is
  * well formed.  Returns EINVAL when the text after the blanks is not
  * "fib=", when no digits follow it, when trailing characters follow the
  * number, or when the number does not fit in an int.
  */
 static int
 kfib_parse_opts(int *pfib, char *data)
 {
 	char *pdel, *pend, *s;
 	long val;
 	int fibnum;
 
 	if (data == NULL)
 		return (0);
 	if ((pdel = strchr(data, ' ')) == NULL)
 		return (0);
 	while (*pdel == ' ')
 		pdel++;
 	if (strncmp(pdel, "fib=", 4) != 0)
 		return (EINVAL);
 	/* Cut off anything that follows the option */
 	if ((s = strchr(pdel, ' ')) != NULL)
 		*s = '\0';
 
 	pdel += 4;
 	/* Need \d+: reject an empty number as well as trailing garbage */
 	val = strtol(pdel, &pend, 10);
 	if (pend == pdel || *pend != '\0')
 		return (EINVAL);
 
 	/* Reject values that would be silently truncated */
 	fibnum = (int)val;
 	if (fibnum != val)
 		return (EINVAL);
 
 	*pfib = fibnum;
 
 	return (0);
 }
 
 /*
  * Write the algorithm name into @buf, followed by " fib=<n>" when the fib
  * number stored in ti->data is non-zero.
  */
 static void
 ta_print_kfib_config(void *ta_state, struct table_info *ti, char *buf,
     size_t bufsize)
 {
 
 	if (ti->data == 0) {
 		snprintf(buf, bufsize, "%s", "addr:kfib");
 		return;
 	}
 
 	snprintf(buf, bufsize, "%s fib=%lu", "addr:kfib", ti->data);
 }
 
 /*
  * Create an addr:kfib table: parse the optional fib number from @data
  * (default 0), store it in ti->data and install the lookup routine.
  *
  * Returns 0 on success, the parser's error, or E2BIG when the fib number
  * is not below rt_numfibs.
  */
 static int
 ta_init_kfib(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti,
     char *data, uint8_t tflags)
 {
 	int fib = 0;
 	int rc;
 
 	rc = kfib_parse_opts(&fib, data);
 	if (rc != 0)
 		return (rc);
 
 	if (fib >= rt_numfibs)
 		return (E2BIG);
 
 	ti->data = fib;
 	ti->lookup = ta_lookup_kfib;
 
 	return (0);
 }
 
 /*
  * Destroys table @ti
  *
  * The body is empty: this function releases nothing.
  */
 static void
 ta_destroy_kfib(void *ta_state, struct table_info *ti)
 {
 
 }
 
 /*
  * Report algorithm-specific table information: both families are
  * reported as radix class with a zero item count and an item size of
  * sizeof(struct rtentry).
  */
 static void
 ta_dump_kfib_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo)
 {
 
 	tinfo->flags = IPFW_TATFLAGS_AFDATA;
 
 	/* IPv4 */
 	tinfo->taclass4 = IPFW_TACLASS_RADIX;
 	tinfo->itemsize4 = sizeof(struct rtentry);
 	tinfo->count4 = 0;
 
 	/* IPv6 */
 	tinfo->taclass6 = IPFW_TACLASS_RADIX;
 	tinfo->itemsize6 = sizeof(struct rtentry);
 	tinfo->count6 = 0;
 }
 
 /*
  * Compute the prefix length of the @len-bit mask at @p, reading each byte
  * most significant bit first.
  *
  * Returns the number of leading set bits, or -1 when a set bit appears
  * after the first clear bit (mask not contiguous).
  */
 static int
 contigmask(uint8_t *p, int len)
 {
 	int ones, pos;
 
 	/* Count the leading run of set bits. */
 	ones = 0;
 	while (ones < len && (p[ones / 8] & (1 << (7 - (ones % 8)))) != 0)
 		ones++;
 
 	/* Every bit after the first clear one has to be clear as well. */
 	for (pos = ones + 1; pos < len; pos++) {
 		if ((p[pos / 8] & (1 << (7 - (pos % 8)))) != 0)
 			return (-1);
 	}
 
 	return (ones);
 }
 
 
 /*
  * Export the route entry @e into @tent using its key and mask.
  */
 static int
 ta_dump_kfib_tentry(void *ta_state, struct table_info *ti, void *e,
     ipfw_obj_tentry *tent)
 {
 	struct rtentry *route = e;
 
 	return (ta_dump_kfib_tentry_int(rt_key(route), rt_mask(route), tent));
 }
 
 /*
  * Fill @tent from a prefix given as address @paddr and optional mask
  * @pmask.  A NULL mask yields the full prefix length (32 or 128); a
  * non-contiguous mask yields a mask length of 0.  Families other than
  * AF_INET/AF_INET6 leave @tent untouched.  Always returns 0.
  */
 static int
 ta_dump_kfib_tentry_int(struct sockaddr *paddr, struct sockaddr *pmask,
     ipfw_obj_tentry *tent)
 {
 #ifdef INET
 	struct sockaddr_in *addr, *mask;
 #endif
 #ifdef INET6
 	struct sockaddr_in6 *addr6, *mask6;
 #endif
 	int len;
 
 	len = 0;
 
 	/* Guess IPv4/IPv6 radix by sockaddr family */
 #ifdef INET
 	if (paddr->sa_family == AF_INET) {
 		addr = (struct sockaddr_in *)paddr;
 		mask = (struct sockaddr_in *)pmask;
 		tent->k.addr.s_addr = addr->sin_addr.s_addr;
 		len = 32;
 		if (mask != NULL)
 			len = contigmask((uint8_t *)&mask->sin_addr, 32);
 		/* contigmask() reports a non-contiguous mask as -1 */
 		if (len == -1)
 			len = 0;
 		tent->masklen = len;
 		tent->subtype = AF_INET;
 		tent->v.kidx = 0; /* Do we need to put GW here? */
 	}
 #endif
 #ifdef INET6
 	if (paddr->sa_family == AF_INET6) {
 		addr6 = (struct sockaddr_in6 *)paddr;
 		mask6 = (struct sockaddr_in6 *)pmask;
 		memcpy(&tent->k.addr6, &addr6->sin6_addr,
 		    sizeof(struct in6_addr));
 		len = 128;
 		if (mask6 != NULL)
 			len = contigmask((uint8_t *)&mask6->sin6_addr, 128);
 		/* contigmask() reports a non-contiguous mask as -1 */
 		if (len == -1)
 			len = 0;
 		tent->masklen = len;
 		tent->subtype = AF_INET6;
 		tent->v.kidx = 0;
 	}
 #endif
 
 	return (0);
 }
 
 /*
  * Look up the address in @tent in the fib stored in ti->data and replace
  * @tent with the prefix (destination and mask) reported by the lookup.
  *
  * Returns 0 on success, ENOENT when rib_lookup_info() fails.
  */
 static int
 ta_find_kfib_tentry(void *ta_state, struct table_info *ti,
     ipfw_obj_tentry *tent)
 {
 	struct sockaddr_in6 lookup_sa, dst_sa, mask_sa;
 	struct rt_addrinfo rtinfo;
 	struct sockaddr *maskp;
 
 	/* Storage the lookup fills with the matched prefix and its mask. */
 	bzero(&dst_sa, sizeof(dst_sa));
 	dst_sa.sin6_len = sizeof(dst_sa);
 	bzero(&mask_sa, sizeof(mask_sa));
 	mask_sa.sin6_len = sizeof(mask_sa);
 
 	bzero(&rtinfo, sizeof(rtinfo));
 	rtinfo.rti_info[RTAX_DST] = (struct sockaddr *)&dst_sa;
 	rtinfo.rti_info[RTAX_NETMASK] = (struct sockaddr *)&mask_sa;
 
 	/* Build the lookup key; IPv4 reuses the same storage. */
 	bzero(&lookup_sa, sizeof(lookup_sa));
 	lookup_sa.sin6_family = tent->subtype;
 	if (tent->subtype == AF_INET) {
 		((struct sockaddr_in *)&lookup_sa)->sin_addr = tent->k.addr;
 		lookup_sa.sin6_len = sizeof(struct sockaddr_in);
 	} else {
 		lookup_sa.sin6_addr = tent->k.addr6;
 		lookup_sa.sin6_len = sizeof(struct sockaddr_in6);
 	}
 
 	if (rib_lookup_info(ti->data, (struct sockaddr *)&lookup_sa, 0, 0,
 	    &rtinfo) != 0)
 		return (ENOENT);
 
 	/* Pass no mask when the lookup did not report one. */
 	maskp = ((rtinfo.rti_addrs & RTA_NETMASK) == 0) ?
 	    NULL : (struct sockaddr *)&mask_sa;
 
 	ta_dump_kfib_tentry_int((struct sockaddr *)&dst_sa, maskp, tent);
 
 	return (0);
 }
 
 /*
  * Walk the IPv4 and then the IPv6 routing tree of the fib stored in
  * ti->data, calling @f with @arg through the tree walker.  Each tree is
  * walked under its read lock; a family without a tree is skipped.
  * The walker's return value is deliberately ignored: this callback has no
  * way to report it.
  */
 static void
 ta_foreach_kfib(void *ta_state, struct table_info *ti, ta_foreach_f *f,
     void *arg)
 {
 	struct rib_head *rh;
 
 	rh = rt_tables_get_rnh(ti->data, AF_INET);
 	if (rh != NULL) {
 		RIB_RLOCK(rh);
 		(void)rh->rnh_walktree(&rh->head, (walktree_f_t *)f, arg);
 		RIB_RUNLOCK(rh);
 	}
 
 	rh = rt_tables_get_rnh(ti->data, AF_INET6);
 	if (rh != NULL) {
 		RIB_RLOCK(rh);
 		(void)rh->rnh_walktree(&rh->head, (walktree_f_t *)f, arg);
 		RIB_RUNLOCK(rh);
 	}
 }
 
 /*
  * Callback table for the "addr:kfib" algorithm (IPFW_TABLE_ADDR).
  * Flagged TA_FLAG_READONLY: no add/del/modify callbacks are provided and
  * no per-request buffer is needed.
  */
 struct table_algo addr_kfib = {
 	.name		= "addr:kfib",
 	.type		= IPFW_TABLE_ADDR,
 	.flags		= TA_FLAG_READONLY,
 	.ta_buf_size	= 0,
 	.init		= ta_init_kfib,
 	.destroy	= ta_destroy_kfib,
 	.foreach	= ta_foreach_kfib,
 	.dump_tentry	= ta_dump_kfib_tentry,
 	.find_tentry	= ta_find_kfib_tentry,
 	.dump_tinfo	= ta_dump_kfib_tinfo,
 	.print_config	= ta_print_kfib_config,
 };
 
 /*
  * Register every table algorithm defined in this file with chain @ch.
  * Each algorithm's assigned index is stored in its own 'idx' member.
  */
 void
 ipfw_table_algo_init(struct ip_fw_chain *ch)
 {
 	const size_t algo_sz = sizeof(struct table_algo);
 
 	ipfw_add_table_algo(ch, &addr_radix, algo_sz, &addr_radix.idx);
 	ipfw_add_table_algo(ch, &addr_hash, algo_sz, &addr_hash.idx);
 	ipfw_add_table_algo(ch, &iface_idx, algo_sz, &iface_idx.idx);
 	ipfw_add_table_algo(ch, &number_array, algo_sz, &number_array.idx);
 	ipfw_add_table_algo(ch, &flow_hash, algo_sz, &flow_hash.idx);
 	ipfw_add_table_algo(ch, &addr_kfib, algo_sz, &addr_kfib.idx);
 }
 
 /*
  * Unregister from chain @ch every algorithm that ipfw_table_algo_init()
  * registers, using the indexes stored in their 'idx' members.
  */
 void
 ipfw_table_algo_destroy(struct ip_fw_chain *ch)
 {
 
 	ipfw_del_table_algo(ch, addr_radix.idx);
 	ipfw_del_table_algo(ch, addr_hash.idx);
 	ipfw_del_table_algo(ch, iface_idx.idx);
 	ipfw_del_table_algo(ch, number_array.idx);
 	ipfw_del_table_algo(ch, flow_hash.idx);
 	ipfw_del_table_algo(ch, addr_kfib.idx);
 }
 
 
Index: head/sys/netsmb/smb_crypt.c
===================================================================
--- head/sys/netsmb/smb_crypt.c	(revision 328237)
+++ head/sys/netsmb/smb_crypt.c	(revision 328238)
@@ -1,311 +1,311 @@
 /*-
  * SPDX-License-Identifier: BSD-4-Clause
  *
  * Copyright (c) 2000-2001, Boris Popov
  * All rights reserved.
  *
  * Copyright (c) 2003, 2004 Tim J. Robbins.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *    This product includes software developed by Boris Popov.
  * 4. Neither the name of the author nor the names of any co-contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/malloc.h>
 #include <sys/kernel.h>
 #include <sys/systm.h>
 #include <sys/conf.h>
 #include <sys/proc.h>
 #include <sys/fcntl.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/endian.h>
 #include <sys/mbuf.h>
 #include <sys/mchain.h>
 #include <sys/md4.h>
 #include <sys/md5.h>
 #include <sys/iconv.h>
 
 #include <netsmb/smb.h>
 #include <netsmb/smb_conn.h>
 #include <netsmb/smb_subr.h>
 #include <netsmb/smb_rq.h>
 #include <netsmb/smb_dev.h>
 
 #include <crypto/des/des.h>
 
 #include "opt_netsmb.h"
 
 /* ASCII "KGS!@#$%": the fixed 8-byte block smb_encrypt() encrypts with the password. */
 static u_char N8[] = {0x4b, 0x47, 0x53, 0x21, 0x40, 0x23, 0x24, 0x25};
 
 
 /*
  * Encrypt the block at @data into @dest with DES in ECB mode, using a key
  * derived from the first 7 bytes of @key.
  *
  * The 7 key bytes (56 bits) are spread over 8 bytes so that each byte
  * carries 7 key bits in its upper bits; the lowest bit of every byte is
  * cleared.  The key schedule is allocated temporarily and freed before
  * returning.
  */
 static void
 smb_E(const u_char *key, u_char *data, u_char *dest)
 {
 	des_key_schedule *ksp;
 	u_char kk[8];
 
 	/* Expand 7 bytes into 8, 7 key bits per output byte */
 	kk[0] = key[0] & 0xfe;
 	kk[1] = key[0] << 7 | (key[1] >> 1 & 0xfe);
 	kk[2] = key[1] << 6 | (key[2] >> 2 & 0xfe);
 	kk[3] = key[2] << 5 | (key[3] >> 3 & 0xfe);
 	kk[4] = key[3] << 4 | (key[4] >> 4 & 0xfe);
 	kk[5] = key[4] << 3 | (key[5] >> 5 & 0xfe);
 	kk[6] = key[5] << 2 | (key[6] >> 6 & 0xfe);
 	kk[7] = key[6] << 1;
 	ksp = malloc(sizeof(des_key_schedule), M_SMBTEMP, M_WAITOK);
 	des_set_key((des_cblock *)kk, *ksp);
 	/* Last argument 1: presumably selects encryption -- confirm against des.h */
 	des_ecb_encrypt((des_cblock *)data, (des_cblock *)dest, *ksp, 1);
 	free(ksp, M_SMBTEMP);
 }
 
 
 /*
  * Compute the challenge response @RN for password @apwd and challenge @C8.
  *
  * At most 14 password bytes are used, zero padded (P14).  The constant N8
  * is encrypted with each 7-byte half of P14 to form the first 16 bytes of
  * the zero-padded 21-byte S21; @C8 is then encrypted with each 7-byte
  * third of S21 into @RN, @RN + 8 and @RN + 16.  Always returns 0.
  */
 int
 smb_encrypt(const u_char *apwd, u_char *C8, u_char *RN)
 {
 	u_char *scratch, *P14, *S21;
 
 	/* One zeroed buffer holds both P14 and S21. */
 	scratch = malloc(14 + 21, M_SMBTEMP, M_WAITOK | M_ZERO);
 	P14 = scratch;
 	S21 = scratch + 14;
 	bcopy(apwd, P14, min(14, strlen(apwd)));
 
 	/* S21 = concat(Ex(P14, N8), zeros(5)); */
 	smb_E(P14, N8, S21);
 	smb_E(P14 + 7, N8, S21 + 8);
 
 	smb_E(S21, C8, RN);
 	smb_E(S21 + 7, C8, RN + 8);
 	smb_E(S21 + 14, C8, RN + 16);
 
 	free(scratch, M_SMBTEMP);
 	return 0;
 }
 
 /*
  * Compute the challenge response @RN for password @apwd and challenge @C8
  * using an MD4 digest of the password.
  *
  * The password is converted with smb_strtouni(), hashed with MD4 into the
  * start of the zeroed 21-byte S21, and @C8 is then encrypted with each
  * 7-byte third of S21 into @RN, @RN + 8 and @RN + 16.  Always returns 0.
  */
 int
 smb_ntencrypt(const u_char *apwd, u_char *C8, u_char *RN)
 {
 	u_char S21[21];
 	u_int16_t *unipwd;
 	MD4_CTX *ctxp;
-	int len;
+	u_int len;
 
 	len = strlen(apwd);
 	/* One extra 16-bit unit beyond the password length */
 	unipwd = malloc((len + 1) * sizeof(u_int16_t), M_SMBTEMP, M_WAITOK);
 	/*
 	 * S21 = concat(MD4(U(apwd)), zeros(5));
 	 */
 	smb_strtouni(unipwd, apwd);
 	ctxp = malloc(sizeof(MD4_CTX), M_SMBTEMP, M_WAITOK);
 	MD4Init(ctxp);
 	MD4Update(ctxp, (u_char*)unipwd, len * sizeof(u_int16_t));
 	free(unipwd, M_SMBTEMP);
 	bzero(S21, 21);
 	MD4Final(S21, ctxp);
 	free(ctxp, M_SMBTEMP);
 
 	smb_E(S21, C8, RN);
 	smb_E(S21 + 7, C8, RN + 8);
 	smb_E(S21 + 14, C8, RN + 16);
 	return 0;
 }
 
 /*
  * Calculate message authentication code (MAC) key for virtual circuit.
  *
  * Any previous key is freed and the sequence number reset.  The new
  * 40-byte key consists of MD4(MD4(U(password))) in bytes 0-15 followed by
  * the 24-byte challenge response computed from vcp->vc_ch.
  * Always returns 0.
  */
 int
 smb_calcmackey(struct smb_vc *vcp)
 {
 	const char *pwd;
 	u_int16_t *unipwd;
-	int len;
+	u_int len;
 	MD4_CTX md4;
 	u_char S16[16], S21[21];
 
 	KASSERT(vcp->vc_hflags2 & SMB_FLAGS2_SECURITY_SIGNATURE,
 	    ("signatures not enabled"));
 
 	/* Drop a previously computed key and restart sequence numbering */
 	if (vcp->vc_mackey != NULL) {
 		free(vcp->vc_mackey, M_SMBTEMP);
 		vcp->vc_mackey = NULL;
 		vcp->vc_mackeylen = 0;
 		vcp->vc_seqno = 0;
 	}
 
 	/*
 	 * The partial MAC key is the concatenation of the 16 byte session
 	 * key and the 24 byte challenge response.
 	 */
 	vcp->vc_mackeylen = 16 + 24;
 	vcp->vc_mackey = malloc(vcp->vc_mackeylen, M_SMBTEMP, M_WAITOK);
 
 	/*
 	 * Calculate session key:
 	 *	MD4(MD4(U(PN)))
 	 */
 	pwd = smb_vc_getpass(vcp);
 	len = strlen(pwd);
 	unipwd = malloc((len + 1) * sizeof(u_int16_t), M_SMBTEMP, M_WAITOK);
 	smb_strtouni(unipwd, pwd);
 	MD4Init(&md4);
 	MD4Update(&md4, (u_char *)unipwd, len * sizeof(u_int16_t));
 	/* S16 = MD4(U(password)); reused below for the challenge response */
 	MD4Final(S16, &md4);
 	MD4Init(&md4);
 	MD4Update(&md4, S16, 16);
 	MD4Final(vcp->vc_mackey, &md4);
 	free(unipwd, M_SMBTEMP);
 
 	/*
 	 * Calculate response to challenge:
 	 *	Ex(concat(MD4(U(pass)), zeros(5)), C8)
 	 */
 	bzero(S21, 21);
 	bcopy(S16, S21, 16);
 	smb_E(S21, vcp->vc_ch, vcp->vc_mackey + 16);
 	smb_E(S21 + 7, vcp->vc_ch, vcp->vc_mackey + 24);
 	smb_E(S21 + 14, vcp->vc_ch, vcp->vc_mackey + 32);
 
 	return (0);
 }
 
 /*
  * Sign request with MAC.
  *
  * Assigns request/reply sequence numbers where needed and stores the
  * first 8 bytes of the digest in rqp->sr_rqsig.  Returns 0 in all
  * cases, including when no MAC key has been computed yet (nothing is
  * signed then).
  */
 int
 smb_rq_sign(struct smb_rq *rqp)
 {
 	struct smb_vc *vcp = rqp->sr_vc;
 	struct mbchain *mbp;
 	struct mbuf *mb;
 	MD5_CTX md5;
 	u_char digest[16];
 
 	KASSERT(vcp->vc_hflags2 & SMB_FLAGS2_SECURITY_SIGNATURE,
 	    ("signatures not enabled"));
 
 	if (vcp->vc_mackey == NULL)
 		/* XXX Should assert that cmd == SMB_COM_NEGOTIATE. */
 		return (0);
 
 	/*
 	 * This is a bit of a kludge. If the request is non-TRANSACTION,
 	 * or it is the first request of a transaction, give it the next
 	 * sequence number, and expect the reply to have the sequence number
 	 * following that one. Otherwise, it is a secondary request in
 	 * a transaction, and it gets the same sequence numbers as the
 	 * primary request.
 	 */
 	if (rqp->sr_t2 == NULL ||
 	    (rqp->sr_t2->t2_flags & SMBT2_SECONDARY) == 0) {
 		rqp->sr_seqno = vcp->vc_seqno++;
 		rqp->sr_rseqno = vcp->vc_seqno++;
 	} else {
 		/*
 		 * Sequence numbers are already in the struct because
 		 * smb_t2_request_int() uses the same one for all the
 		 * requests in the transaction.
 		 * (At least we hope so.)
 		 */
 		/*
 		 * In this branch the first two terms are already false,
 		 * so the assertion reduces to t2_rq == rqp.
 		 */
 		KASSERT(rqp->sr_t2 == NULL ||
 		    (rqp->sr_t2->t2_flags & SMBT2_SECONDARY) == 0 ||
 		    rqp->sr_t2->t2_rq == rqp,
 		    ("sec t2 rq not using same smb_rq"));
 	}
 
 	/* Initialize sec. signature field to sequence number + zeros. */
 	le32enc(rqp->sr_rqsig, rqp->sr_seqno);
 	le32enc(rqp->sr_rqsig + 4, 0);
 
 	/*
 	 * Compute the MD5 digest of the MAC key followed by the packet
 	 * data (a plain keyed hash; no HMAC inner/outer padding is
 	 * applied here).
 	 * Store the first 8 bytes in the sec. signature field.
 	 */
 	smb_rq_getrequest(rqp, &mbp);
 	MD5Init(&md5);
 	MD5Update(&md5, vcp->vc_mackey, vcp->vc_mackeylen);
 	/* Hash every mbuf of the request chain in order. */
 	for (mb = mbp->mb_top; mb != NULL; mb = mb->m_next)
 		MD5Update(&md5, mtod(mb, void *), mb->m_len);
 	MD5Final(digest, &md5);
 	bcopy(digest, rqp->sr_rqsig, 8);
 
 	return (0);
 }
 
 /*
  * Verify reply signature.
  *
  * Recomputes the MD5 digest over the MAC key and the reply, with the
  * 8-byte signature field (packet offsets 14..21) replaced by the
  * expected reply sequence number followed by four zero bytes, and
  * compares the first 8 digest bytes with the signature carried in the
  * packet.
  *
  * Returns 0 when the signature matches or when no MAC key has been
  * computed yet, EAUTH on mismatch.
  */
 int
 smb_rq_verify(struct smb_rq *rqp)
 {
 	struct smb_vc *vcp = rqp->sr_vc;
 	struct mdchain *mdp;
 	u_char sigbuf[8];
 	MD5_CTX md5;
 	u_char digest[16];
 	struct mbuf *mb;
 
 	KASSERT(vcp->vc_hflags2 & SMB_FLAGS2_SECURITY_SIGNATURE,
 	    ("signatures not enabled"));
 
 	if (vcp->vc_mackey == NULL)
 		/* XXX Should check that this is a SMB_COM_NEGOTIATE reply. */
 		return (0);
 
 	/*
 	 * Compute the MD5 digest of the MAC key followed by the packet
 	 * data.  We play games to pretend the security signature field
 	 * contains their sequence number, to avoid modifying
 	 * the packet itself.
 	 */
 	smb_rq_getreply(rqp, &mdp);
 	mb = mdp->md_top;
 	KASSERT(mb->m_len >= SMB_HDRLEN, ("forgot to m_pullup"));
 	MD5Init(&md5);
 	MD5Update(&md5, vcp->vc_mackey, vcp->vc_mackeylen);
 	/* Header bytes that precede the signature field. */
 	MD5Update(&md5, mtod(mb, void *), 14);
 	/*
 	 * Build the substitute signature with the byte-wise encoder, as
 	 * smb_rq_sign() does, instead of storing through a u_int32_t
 	 * pointer into a u_char array.
 	 */
 	le32enc(sigbuf, rqp->sr_rseqno);
 	le32enc(sigbuf + 4, 0);
 	MD5Update(&md5, sigbuf, 8);
 	/* Remainder of the first mbuf after the signature field. */
 	MD5Update(&md5, mtod(mb, u_char *) + 22, mb->m_len - 22);
 	for (mb = mb->m_next; mb != NULL; mb = mb->m_next)
 		MD5Update(&md5, mtod(mb, void *), mb->m_len);
 	MD5Final(digest, &md5);
 
 	/*
 	 * Now verify the signature.
 	 */
 	if (bcmp(mtod(mdp->md_top, u_char *) + 14, digest, 8) != 0)
 		return (EAUTH);
 
 	return (0);
 }
Index: head/sys/x86/x86/mca.c
===================================================================
--- head/sys/x86/x86/mca.c	(revision 328237)
+++ head/sys/x86/x86/mca.c	(revision 328238)
@@ -1,1267 +1,1267 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2009 Hudson River Trading LLC
  * Written by: John H. Baldwin <jhb@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * Support for x86 machine check architecture.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #ifdef __amd64__
 #define	DEV_APIC
 #else
 #include "opt_apic.h"
 #endif
 
 #include <sys/param.h>
 #include <sys/bus.h>
 #include <sys/interrupt.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/taskqueue.h>
 #include <machine/intr_machdep.h>
 #include <x86/apicvar.h>
 #include <machine/cpu.h>
 #include <machine/cputypes.h>
 #include <x86/mca.h>
 #include <machine/md_var.h>
 #include <machine/specialreg.h>
 
 /* Modes for mca_scan(): identifies which caller triggered the scan. */
 enum scan_mode {
 	POLLED,		/* Periodic poller (mca_scan_cpus()). */
 	MCE,		/* Machine check exception handler. */
 	CMCI,		/* Corrected machine check interrupt handler. */
 };
 
 #ifdef DEV_APIC
 /*
  * State maintained for each monitored MCx bank to control the
  * corrected machine check interrupt threshold.
  */
 struct cmc_state {
 	/* Largest threshold MC_CTL2 accepted, probed in cmci_monitor(). */
 	int	max_threshold;
 	/* time_uptime of the last CMCI that found a valid record. */
 	time_t	last_intr;
 };
 
 /* Per-bank state for AMD error thresholding. */
 struct amd_et_state {
 	/* Threshold currently programmed into the MC_MISC counter. */
 	int	cur_threshold;
 	/* time_uptime of the last interrupt that found a valid record. */
 	time_t	last_intr;
 };
 #endif
 
 /* List node wrapping one recorded machine check. */
 struct mca_internal {
 	struct mca_record rec;		/* Captured event data. */
 	int		logged;		/* Non-zero once mca_scan_cpus() printed it. */
 	STAILQ_ENTRY(mca_internal) link; /* Linkage on the free/record lists. */
 };
 
 static MALLOC_DEFINE(M_MCA, "MCA", "Machine Check Architecture");
 
 static volatile int mca_count;	/* Number of records stored. */
 static int mca_banks;		/* Number of per-CPU register banks. */
 
 static SYSCTL_NODE(_hw, OID_AUTO, mca, CTLFLAG_RD, NULL,
     "Machine Check Architecture");
 
 static int mca_enabled = 1;
 SYSCTL_INT(_hw_mca, OID_AUTO, enabled, CTLFLAG_RDTUN, &mca_enabled, 0,
     "Administrative toggle for machine check support");
 
 static int amd10h_L1TP = 1;
 SYSCTL_INT(_hw_mca, OID_AUTO, amd10h_L1TP, CTLFLAG_RDTUN, &amd10h_L1TP, 0,
     "Administrative toggle for logging of level one TLB parity (L1TP) errors");
 
 static int intel6h_HSD131;
 SYSCTL_INT(_hw_mca, OID_AUTO, intel6h_HSD131, CTLFLAG_RDTUN, &intel6h_HSD131, 0,
     "Administrative toggle for logging of spurious corrected errors");
 
 /* Not static: set in mca_setup(); presumably read outside this file. */
 int workaround_erratum383;
 SYSCTL_INT(_hw_mca, OID_AUTO, erratum383, CTLFLAG_RDTUN,
     &workaround_erratum383, 0,
     "Is the workaround for Erratum 383 on AMD Family 10h processors enabled?");
 
 /* Preallocated records handed out to non-POLLED scans. */
 static STAILQ_HEAD(, mca_internal) mca_freelist;
 /* Number of entries currently on mca_freelist. */
 static int mca_freecount;
 /* Recorded events, appended at the tail as they are found. */
 static STAILQ_HEAD(, mca_internal) mca_records;
 static struct callout mca_timer;
 static int mca_ticks = 3600;	/* Check hourly by default. */
 static struct taskqueue *mca_tq;
 static struct task mca_refill_task, mca_scan_task;
 /* Spin mutex held around accesses to the two lists and their counts. */
 static struct mtx mca_lock;
 
 #ifdef DEV_APIC
 /* Allocated by cmci_setup(); NULL otherwise. */
 static struct cmc_state **cmc_state;		/* Indexed by cpuid, bank. */
 /* Allocated by amd_thresholding_setup(). */
 static struct amd_et_state **amd_et_state;	/* Indexed by cpuid, bank. */
 static int cmc_throttle = 60;	/* Time in seconds to throttle CMCI. */
 
 /* Result of lapic_enable_mca_elvt(); negative until that succeeds. */
 static int amd_elvt = -1;
 
 /*
  * Report whether AMD error thresholding may be used on this CPU.
  */
 static inline bool
 amd_thresholding_supported(void)
 {
 	int family;
 
 	family = CPUID_TO_FAMILY(cpu_id);
 	if (cpu_vendor_id != CPU_VENDOR_AMD || family < 0x10)
 		return (false);
 
 	/*
 	 * The RASCap register is wholly reserved in families 0x10-0x15
 	 * (through model 1F).
 	 *
 	 * It begins to be documented in family 0x15 model 30 and family 0x16,
 	 * but neither of these families documents the ScalableMca bit, which
 	 * supposedly defines the presence of this feature on family 0x17.
 	 */
 	if (family <= 0x16)
 		return (true);
 
 	/* Family 0x17 and later: trust the ScalableMca capability bit. */
 	return ((amd_rascap & AMDRAS_SCALABLE_MCA) != 0);
 }
 #endif
 
 /*
  * Report whether corrected machine check interrupts are usable, given
  * the contents of MCG_CAP.
  */
 static inline bool
 cmci_supported(uint64_t mcg_cap)
 {
 	/*
 	 * MCG_CAP_CMCI_P bit is reserved in AMD documentation.  Until
 	 * it is defined, do not use it to check for CMCI support.
 	 */
 	return (cpu_vendor_id == CPU_VENDOR_INTEL &&
 	    (mcg_cap & MCG_CAP_CMCI_P) != 0);
 }
 
 static int
 sysctl_positive_int(SYSCTL_HANDLER_ARGS)
 {
 	int error, value;
 
 	value = *(int *)arg1;
 	error = sysctl_handle_int(oidp, &value, 0, req);
 	if (error || req->newptr == NULL)
 		return (error);
 	if (value <= 0)
 		return (EINVAL);
 	*(int *)arg1 = value;
 	return (0);
 }
 
 static int
 sysctl_mca_records(SYSCTL_HANDLER_ARGS)
 {
 	int *name = (int *)arg1;
 	u_int namelen = arg2;
 	struct mca_record record;
 	struct mca_internal *rec;
 	int i;
 
 	if (namelen != 1)
 		return (EINVAL);
 
 	if (name[0] < 0 || name[0] >= mca_count)
 		return (EINVAL);
 
 	mtx_lock_spin(&mca_lock);
 	if (name[0] >= mca_count) {
 		mtx_unlock_spin(&mca_lock);
 		return (EINVAL);
 	}
 	i = 0;
 	STAILQ_FOREACH(rec, &mca_records, link) {
 		if (i == name[0]) {
 			record = rec->rec;
 			break;
 		}
 		i++;
 	}
 	mtx_unlock_spin(&mca_lock);
 	return (SYSCTL_OUT(req, &record, sizeof(record)));
 }
 
 /*
  * Map bits 3:2 of an MCA error code to the prefix string used in the
  * TLB and cache messages of mca_log(); value 3 yields "?".
  */
 static const char *
 mca_error_ttype(uint16_t mca_error)
 {
 	static const char *const ttype_names[] = { "I", "D", "G", "?" };
 
 	return (ttype_names[(mca_error >> 2) & 0x3]);
 }
 
 /*
  * Map bits 1:0 of an MCA error code to its level string.
  */
 static const char *
 mca_error_level(uint16_t mca_error)
 {
 	static const char *const level_names[] = { "L0", "L1", "L2", "LG" };
 
 	/* A two-bit field always lands inside the four-entry table. */
 	return (level_names[mca_error & 0x0003]);
 }
 
 /*
  * Map bits 7:4 of an MCA error code to a request-type string; values
  * above 8 yield "???".
  */
 static const char *
 mca_error_request(uint16_t mca_error)
 {
 	static const char *const request_names[] = {
 		"ERR", "RD", "WR", "DRD", "DWR", "IRD", "PREFETCH", "EVICT",
 		"SNOOP",
 	};
 	unsigned int code;
 
 	code = (mca_error >> 4) & 0xf;
 	if (code < sizeof(request_names) / sizeof(request_names[0]))
 		return (request_names[code]);
 	return ("???");
 }
 
 /*
  * Map bits 6:4 of an MCA error code to the string printed for memory
  * controller errors; values above 4 yield "???".
  */
 static const char *
 mca_error_mmtype(uint16_t mca_error)
 {
 	static const char *const mmtype_names[] = {
 		"GEN", "RD", "WR", "AC", "MS",
 	};
 	unsigned int code;
 
 	code = (mca_error >> 4) & 0x7;
 	if (code < sizeof(mmtype_names) / sizeof(mmtype_names[0]))
 		return (mmtype_names[code]);
 	return ("???");
 }
 
 /*
  * Return 1 if the record should be suppressed by mca_log(), 0 if it
  * should be printed.
  */
 static int
 mca_mute(const struct mca_record *rec)
 {
 
 	/*
 	 * Skip spurious corrected parity errors generated by Intel Haswell-
 	 * and Broadwell-based CPUs (see HSD131, HSM142, HSW131 and BDM48
 	 * erratum respectively), unless reporting is enabled.
 	 * Note that these errors also have been observed with the D0-stepping
 	 * of Haswell, while at least initially the CPU specification updates
 	 * suggested only the C0-stepping to be affected.  Similarly, Celeron
 	 * 2955U with a CPU ID of 0x45 apparently are also concerned with the
 	 * same problem, with HSM142 only referring to 0x3c and 0x46.
 	 */
 	if (intel6h_HSD131)
 		return (0);
 	if (cpu_vendor_id != CPU_VENDOR_INTEL ||
 	    CPUID_TO_FAMILY(cpu_id) != 0x6)
 		return (0);
 
 	switch (CPUID_TO_MODEL(cpu_id)) {
 	case 0x3c:	/* HSD131, HSM142, HSW131 */
 	case 0x3d:	/* BDM48 */
 	case 0x45:
 	case 0x46:	/* HSM142 */
 		break;
 	default:
 		return (0);
 	}
 
 	/* Only the specific bank 0 status pattern is muted. */
 	if (rec->mr_bank != 0)
 		return (0);
 	if ((rec->mr_status & 0xa0000000ffffffff) != 0x80000000000f0005)
 		return (0);
 
 	return (1);
 }
 
 /*
  * Dump details about a single machine check.
  *
  * Prints the raw bank/global registers, then a decoded one-line summary
  * of the MCA error code, then the address and misc registers when the
  * status marks them valid.  Records selected by mca_mute() print nothing.
  */
 static void
 mca_log(const struct mca_record *rec)
 {
 	uint16_t mca_error;
 
 	if (mca_mute(rec))
 	    	return;
 
 	printf("MCA: Bank %d, Status 0x%016llx\n", rec->mr_bank,
 	    (long long)rec->mr_status);
 	printf("MCA: Global Cap 0x%016llx, Status 0x%016llx\n",
 	    (long long)rec->mr_mcg_cap, (long long)rec->mr_mcg_status);
 	printf("MCA: Vendor \"%s\", ID 0x%x, APIC ID %d\n", cpu_vendor,
 	    rec->mr_cpu_id, rec->mr_apic_id);
 	printf("MCA: CPU %d ", rec->mr_cpu);
 	if (rec->mr_status & MC_STATUS_UC)
 		printf("UNCOR ");
 	else {
 		printf("COR ");
 		/* The corrected-error count is printed only with CMCI support. */
 		if (cmci_supported(rec->mr_mcg_cap))
 			printf("(%lld) ", ((long long)rec->mr_status &
 			    MC_STATUS_COR_COUNT) >> 38);
 	}
 	if (rec->mr_status & MC_STATUS_PCC)
 		printf("PCC ");
 	if (rec->mr_status & MC_STATUS_OVER)
 		printf("OVER ");
 	/* The masked status is narrowed to 16 bits for decoding. */
 	mca_error = rec->mr_status & MC_STATUS_MCA_ERROR;
 	switch (mca_error) {
 		/* Simple error codes. */
 	case 0x0000:
 		printf("no error");
 		break;
 	case 0x0001:
 		printf("unclassified error");
 		break;
 	case 0x0002:
 		printf("ucode ROM parity error");
 		break;
 	case 0x0003:
 		printf("external error");
 		break;
 	case 0x0004:
 		printf("FRC error");
 		break;
 	case 0x0005:
 		printf("internal parity error");
 		break;
 	case 0x0400:
 		printf("internal timer error");
 		break;
 	default:
 		/* Remaining 0x04xx codes; 0x0400 itself was handled above. */
 		if ((mca_error & 0xfc00) == 0x0400) {
 			printf("internal error %x", mca_error & 0x03ff);
 			break;
 		}
 
 		/* Compound error codes. */
 
 		/*
 		 * The patterns below are tested from most to least
 		 * specific, so the order of these checks matters.
 		 */
 
 		/* Memory hierarchy error. */
 		if ((mca_error & 0xeffc) == 0x000c) {
 			printf("%s memory error", mca_error_level(mca_error));
 			break;
 		}
 
 		/* TLB error. */
 		if ((mca_error & 0xeff0) == 0x0010) {
 			printf("%sTLB %s error", mca_error_ttype(mca_error),
 			    mca_error_level(mca_error));
 			break;
 		}
 
 		/* Memory controller error. */
 		if ((mca_error & 0xef80) == 0x0080) {
 			printf("%s channel ", mca_error_mmtype(mca_error));
 			/* A channel field of all ones is printed as unknown. */
 			if ((mca_error & 0x000f) != 0x000f)
 				printf("%d", mca_error & 0x000f);
 			else
 				printf("??");
 			printf(" memory error");
 			break;
 		}
 		
 		/* Cache error. */
 		if ((mca_error & 0xef00) == 0x0100) {
 			printf("%sCACHE %s %s error",
 			    mca_error_ttype(mca_error),
 			    mca_error_level(mca_error),
 			    mca_error_request(mca_error));
 			break;
 		}
 
 		/* Bus and/or Interconnect error. */
 		if ((mca_error & 0xe800) == 0x0800) {			
 			printf("BUS%s ", mca_error_level(mca_error));
 			/* Bits 10:9: participation of this CPU. */
 			switch ((mca_error & 0x0600) >> 9) {
 			case 0:
 				printf("Source");
 				break;
 			case 1:
 				printf("Responder");
 				break;
 			case 2:
 				printf("Observer");
 				break;
 			default:
 				printf("???");
 				break;
 			}
 			printf(" %s ", mca_error_request(mca_error));
 			/* Bits 3:2: target of the transaction. */
 			switch ((mca_error & 0x000c) >> 2) {
 			case 0:
 				printf("Memory");
 				break;
 			case 2:
 				printf("I/O");
 				break;
 			case 3:
 				printf("Other");
 				break;
 			default:
 				printf("???");
 				break;
 			}
 			if (mca_error & 0x0100)
 				printf(" timed out");
 			break;
 		}
 
 		printf("unknown error %x", mca_error);
 		break;
 	}
 	printf("\n");
 	if (rec->mr_status & MC_STATUS_ADDRV)
 		printf("MCA: Address 0x%llx\n", (long long)rec->mr_addr);
 	if (rec->mr_status & MC_STATUS_MISCV)
 		printf("MCA: Misc 0x%llx\n", (long long)rec->mr_misc);
 }
 
 /*
  * Read the status MSR of the given bank on the current CPU.  If it
  * holds a valid event, fill in *rec and return 1; otherwise return 0
  * and leave *rec untouched.
  */
 static int
 mca_check_status(int bank, struct mca_record *rec)
 {
 	uint64_t status;
 	u_int p[4];
 
 	status = rdmsr(MSR_MC_STATUS(bank));
 	if (!(status & MC_STATUS_VAL))
 		return (0);
 
 	/* Save exception information. */
 	rec->mr_status = status;
 	rec->mr_bank = bank;
 	/* The address and misc MSRs are read only when flagged valid. */
 	rec->mr_addr = 0;
 	if (status & MC_STATUS_ADDRV)
 		rec->mr_addr = rdmsr(MSR_MC_ADDR(bank));
 	rec->mr_misc = 0;
 	if (status & MC_STATUS_MISCV)
 		rec->mr_misc = rdmsr(MSR_MC_MISC(bank));
 	rec->mr_tsc = rdtsc();
 	rec->mr_apic_id = PCPU_GET(apic_id);
 	rec->mr_mcg_cap = rdmsr(MSR_MCG_CAP);
 	rec->mr_mcg_status = rdmsr(MSR_MCG_STATUS);
 	rec->mr_cpu_id = cpu_id;
 	rec->mr_cpu_vendor_id = cpu_vendor_id;
 	rec->mr_cpu = PCPU_GET(cpuid);
 
 	/*
 	 * Clear machine check.  Don't do this for uncorrectable
 	 * errors so that the BIOS can see them.
 	 */
 	if (!(rec->mr_status & (MC_STATUS_PCC | MC_STATUS_UC))) {
 		wrmsr(MSR_MC_STATUS(bank), 0);
 		/*
 		 * NOTE(review): the CPUID result in p is unused; this
 		 * presumably serves as a serializing instruction after
 		 * the MSR write -- confirm.
 		 */
 		do_cpuid(0, p);
 	}
 	return (1);
 }
 
 /*
  * Top up mca_freelist until it holds at least imax(mp_ncpus, mca_banks)
  * preallocated records.
  */
 static void
 mca_fill_freelist(void)
 {
 	struct mca_internal *rec;
 	int desired;
 
 	/*
 	 * Ensure we have at least one record for each bank and one
 	 * record per CPU.
 	 */
 	desired = imax(mp_ncpus, mca_banks);
 	mtx_lock_spin(&mca_lock);
 	while (mca_freecount < desired) {
 		/*
 		 * The spin lock is released around the M_WAITOK
 		 * allocation and retaken before the list is touched, so
 		 * mca_freecount is re-checked on every iteration.
 		 */
 		mtx_unlock_spin(&mca_lock);
 		rec = malloc(sizeof(*rec), M_MCA, M_WAITOK);
 		mtx_lock_spin(&mca_lock);
 		STAILQ_INSERT_TAIL(&mca_freelist, rec, link);
 		mca_freecount++;
 	}
 	mtx_unlock_spin(&mca_lock);
 }
 
 /*
  * Task handler for mca_refill_task: replenish the free list.  Both
  * arguments are unused.
  */
 static void
 mca_refill(void *context, int pending)
 {
 
 	mca_fill_freelist();
 }
 
 /*
  * Append a copy of *record to mca_records.
  *
  * In POLLED mode a fresh record is allocated with M_WAITOK.  In the
  * other modes a preallocated record is taken from mca_freelist; if the
  * list is empty the event is printed and then dropped.
  */
 static void
 mca_record_entry(enum scan_mode mode, const struct mca_record *record)
 {
 	struct mca_internal *rec;
 
 	if (mode == POLLED) {
 		/* Allocate before taking the spin lock. */
 		rec = malloc(sizeof(*rec), M_MCA, M_WAITOK);
 		mtx_lock_spin(&mca_lock);
 	} else {
 		mtx_lock_spin(&mca_lock);
 		rec = STAILQ_FIRST(&mca_freelist);
 		if (rec == NULL) {
 			printf("MCA: Unable to allocate space for an event.\n");
 			mca_log(record);
 			mtx_unlock_spin(&mca_lock);
 			return;
 		}
 		STAILQ_REMOVE_HEAD(&mca_freelist, link);
 		mca_freecount--;
 	}
 
 	rec->rec = *record;
 	rec->logged = 0;
 	STAILQ_INSERT_TAIL(&mca_records, rec, link);
 	mca_count++;
 	mtx_unlock_spin(&mca_lock);
 	/* A CMCI consumed a free record; schedule a refill once not cold. */
 	if (mode == CMCI && !cold)
 		taskqueue_enqueue(mca_tq, &mca_refill_task);
 }
 
 #ifdef DEV_APIC
 /*
  * Update the interrupt threshold for a CMCI.  The strategy is to use
  * a low trigger that interrupts as soon as the first event occurs.
  * However, if a steady stream of events arrive, the threshold is
  * increased until the interrupts are throttled to once every
  * cmc_throttle seconds or the periodic scan.  If a periodic scan
  * finds that the threshold is too high, it is lowered.
  *
  * 'valid' says whether the bank held a valid record, 'last_intr' is
  * the time_uptime of the previous interrupt, 'count' is the number of
  * events seen, and 'cur_threshold'/'max_threshold' bound the result.
  * Returns the threshold to program; the caller writes it to hardware.
  */
 static int
 update_threshold(enum scan_mode mode, int valid, int last_intr, int count,
     int cur_threshold, int max_threshold)
 {
 	u_int delta;
 	int limit;
 
 	/* Seconds since the previous interrupt, as an unsigned value. */
 	delta = (u_int)(time_uptime - last_intr);
 	limit = cur_threshold;
 
 	/*
 	 * If an interrupt was received less than cmc_throttle seconds
 	 * since the previous interrupt and the count from the current
 	 * event is greater than or equal to the current threshold,
 	 * double the threshold up to the max.
 	 */
 	if (mode == CMCI && valid) {
 		if (delta < cmc_throttle && count >= limit &&
 		    limit < max_threshold) {
 			limit = min(limit << 1, max_threshold);
 		}
 		return (limit);
 	}
 
 	/*
 	 * When the banks are polled, check to see if the threshold
 	 * should be lowered.
 	 */
 	if (mode != POLLED)
 		return (limit);
 
 	/* If a CMCI occured recently, do nothing for now. */
 	if (delta < cmc_throttle)
 		return (limit);
 
 	/*
 	 * Compute a new limit based on the average rate of events per
 	 * cmc_throttle seconds since the last interrupt.
 	 */
 	if (valid) {
 		/*
 		 * delta is at least cmc_throttle here, so the divisor is
 		 * non-zero as long as cmc_throttle is positive.
 		 * NOTE(review): delta is u_int, so this expression is
 		 * evaluated in unsigned arithmetic -- confirm count can
 		 * never be negative at this point.
 		 */
 		limit = count * cmc_throttle / delta;
 		if (limit <= 0)
 			limit = 1;
 		else if (limit > max_threshold)
 			limit = max_threshold;
 	} else {
 		limit = 1;
 	}
 	return (limit);
 }
 
 /*
  * Recompute the CMCI threshold for 'bank' on the current CPU and write
  * it to MC_CTL2 if it changed.  'valid' says whether *rec holds a valid
  * record from this scan.
  */
 static void
 cmci_update(enum scan_mode mode, int bank, int valid, struct mca_record *rec)
 {
 	struct cmc_state *cc;
 	uint64_t ctl;
 	int cur_threshold, new_threshold;
 	int count;
 
 	/* Fetch the current limit for this bank. */
 	cc = &cmc_state[PCPU_GET(cpuid)][bank];
 	ctl = rdmsr(MSR_MC_CTL2(bank));
 	/*
 	 * NOTE(review): rec->mr_status is read even when valid is 0, in
 	 * which case mca_check_status() did not fill in *rec for this
 	 * bank.  update_threshold() only uses count when valid is set.
 	 */
 	count = (rec->mr_status & MC_STATUS_COR_COUNT) >> 38;
 	cur_threshold = ctl & MC_CTL2_THRESHOLD;
 
 	new_threshold = update_threshold(mode, valid, cc->last_intr, count,
 	    cur_threshold, cc->max_threshold);
 
 	/* Stamp the interrupt time only after the old value was used. */
 	if (mode == CMCI && valid)
 		cc->last_intr = time_uptime;
 	if (new_threshold != cur_threshold) {
 		ctl &= ~MC_CTL2_THRESHOLD;
 		ctl |= new_threshold;
 		wrmsr(MSR_MC_CTL2(bank), ctl);
 	}
 }
 
 /*
  * Recompute the AMD error threshold for 'bank' on the current CPU and
  * reprogram the MC_MISC counter.  Unlike cmci_update(), the MSR is
  * rewritten on every call.
  */
 static void
 amd_thresholding_update(enum scan_mode mode, int bank, int valid)
 {
 	struct amd_et_state *cc;
 	uint64_t misc;
 	int new_threshold;
 	int count;
 
 	cc = &amd_et_state[PCPU_GET(cpuid)][bank];
 	misc = rdmsr(MSR_MC_MISC(bank));
 	count = (misc & MC_MISC_AMD_CNT_MASK) >> MC_MISC_AMD_CNT_SHIFT;
 	/*
 	 * The counter was programmed to start at
 	 * MC_MISC_AMD_CNT_MAX - cur_threshold, so subtracting that
 	 * start value yields the events counted since then.
 	 */
 	count = count - (MC_MISC_AMD_CNT_MAX - cc->cur_threshold);
 
 	new_threshold = update_threshold(mode, valid, cc->last_intr, count,
 	    cc->cur_threshold, MC_MISC_AMD_CNT_MAX);
 
 	cc->cur_threshold = new_threshold;
 	/* Restart the counter at the new start value, overflow cleared. */
 	misc &= ~MC_MISC_AMD_CNT_MASK;
 	misc |= (uint64_t)(MC_MISC_AMD_CNT_MAX - cc->cur_threshold)
 	    << MC_MISC_AMD_CNT_SHIFT;
 	misc &= ~MC_MISC_AMD_OVERFLOW;
 	wrmsr(MSR_MC_MISC(bank), misc);
 	if (mode == CMCI && valid)
 		cc->last_intr = time_uptime;
 }
 #endif
 
 /*
  * This scans all the machine check banks of the current CPU to see if
  * there are any machine checks.  Any non-recoverable errors are
  * reported immediately via mca_log().  The current thread must be
  * pinned when this is called.  The 'mode' parameter indicates if we
  * are being called from the MC exception handler, the CMCI handler,
  * or the periodic poller.
  *
  * The return value is always the number of valid MC records found.
  * If 'recoverablep' is not NULL, it is set to 0 when any record had
  * one of the UC or PCC status bits set (or OVER, in MCE mode) and to
  * 1 otherwise.
  */
 static int
 mca_scan(enum scan_mode mode, int *recoverablep)
 {
 	struct mca_record rec;
 	uint64_t mcg_cap, ucmask;
 	int count, i, recoverable, valid;
 
 	count = 0;
 	recoverable = 1;
 	ucmask = MC_STATUS_UC | MC_STATUS_PCC;
 
 	/* When handling a MCE#, treat the OVER flag as non-restartable. */
 	if (mode == MCE)
 		ucmask |= MC_STATUS_OVER;
 	mcg_cap = rdmsr(MSR_MCG_CAP);
 	for (i = 0; i < (mcg_cap & MCG_CAP_COUNT); i++) {
 #ifdef DEV_APIC
 		/*
 		 * For a CMCI, only check banks this CPU is
 		 * responsible for.
 		 */
 		/*
 		 * NOTE(review): `1 << i` is an int shift; confirm the
 		 * bank count stays below the width of int.
 		 */
 		if (mode == CMCI && !(PCPU_GET(cmci_mask) & 1 << i))
 			continue;
 #endif
 
 		valid = mca_check_status(i, &rec);
 		if (valid) {
 			count++;
 			/* Log immediately, then queue it like any other. */
 			if (rec.mr_status & ucmask) {
 				recoverable = 0;
 				mtx_lock_spin(&mca_lock);
 				mca_log(&rec);
 				mtx_unlock_spin(&mca_lock);
 			}
 			mca_record_entry(mode, &rec);
 		}
 	
 #ifdef DEV_APIC
 		/*
 		 * If this is a bank this CPU monitors via CMCI,
 		 * update the threshold.
 		 */
 		if (PCPU_GET(cmci_mask) & 1 << i) {
 			/* cmc_state is only allocated by cmci_setup(). */
 			if (cmc_state != NULL)
 				cmci_update(mode, i, valid, &rec);
 			else
 				amd_thresholding_update(mode, i, valid);
 		}
 #endif
 	}
 	if (mode == POLLED)
 		mca_fill_freelist();
 	if (recoverablep != NULL)
 		*recoverablep = recoverable;
 	return (count);
 }
 
 /*
  * Scan the machine check banks on all CPUs by binding to each CPU in
  * turn.  If any of the CPUs contained new machine check records, log
  * them to the console.
  *
  * Installed as the handler of mca_scan_task; both arguments are unused.
  */
 static void
 mca_scan_cpus(void *context, int pending)
 {
 	struct mca_internal *mca;
 	struct thread *td;
 	int count, cpu;
 
 	mca_fill_freelist();
 	td = curthread;
 	count = 0;
 	thread_lock(td);
 	CPU_FOREACH(cpu) {
 		/* Bind/unbind with the thread lock held; scan without it. */
 		sched_bind(td, cpu);
 		thread_unlock(td);
 		count += mca_scan(POLLED, NULL);
 		thread_lock(td);
 		sched_unbind(td);
 	}
 	thread_unlock(td);
 	if (count != 0) {
 		mtx_lock_spin(&mca_lock);
 		/* Print every record that has not been printed here yet. */
 		STAILQ_FOREACH(mca, &mca_records, link) {
 			if (!mca->logged) {
 				mca->logged = 1;
 				mca_log(&mca->rec);
 			}
 		}
 		mtx_unlock_spin(&mca_lock);
 	}
 }
 
 /*
  * Callout handler: queue a scan of all CPUs and re-arm the timer to
  * fire again after mca_ticks seconds.  'arg' is unused.
  */
 static void
 mca_periodic_scan(void *arg)
 {
 
 	taskqueue_enqueue(mca_tq, &mca_scan_task);
 	callout_reset(&mca_timer, mca_ticks * hz, mca_periodic_scan, NULL);
 }
 
 static int
 sysctl_mca_scan(SYSCTL_HANDLER_ARGS)
 {
 	int error, i;
 
 	i = 0;
 	error = sysctl_handle_int(oidp, &i, 0, req);
 	if (error)
 		return (error);
 	if (i)
 		taskqueue_enqueue(mca_tq, &mca_scan_task);
 	return (0);
 }
 
 /*
  * SYSINIT hook: create the "mca" taskqueue and its single thread, then
  * top up the free list.  Does nothing when no banks were detected.
  * 'dummy' is unused.
  */
 static void
 mca_createtq(void *dummy)
 {
 	if (mca_banks <= 0)
 		return;
 
 	mca_tq = taskqueue_create_fast("mca", M_WAITOK,
 	    taskqueue_thread_enqueue, &mca_tq);
 	taskqueue_start_threads(&mca_tq, 1, PI_SWI(SWI_TQ), "mca taskq");
 
 	/* CMCIs during boot may have claimed items from the freelist. */
 	mca_fill_freelist();
 }
 SYSINIT(mca_createtq, SI_SUB_CONFIGURE, SI_ORDER_ANY, mca_createtq, NULL);
 
 /*
  * SYSINIT hook: arm the periodic scan timer for the first time.  Does
  * nothing when no banks were detected.  'dummy' is unused.
  */
 static void
 mca_startup(void *dummy)
 {
 
 	if (mca_banks <= 0)
 		return;
 
 	callout_reset(&mca_timer, mca_ticks * hz, mca_periodic_scan, NULL);
 }
 #ifdef EARLY_AP_STARTUP
 SYSINIT(mca_startup, SI_SUB_KICK_SCHEDULER, SI_ORDER_ANY, mca_startup, NULL);
 #else
 SYSINIT(mca_startup, SI_SUB_SMP, SI_ORDER_ANY, mca_startup, NULL);
 #endif
 
 #ifdef DEV_APIC
 /*
  * Allocate the per-CPU, per-bank cmc_state table and register the
  * "cmc_throttle" sysctl.
  */
 static void
 cmci_setup(void)
 {
 	/*
 	 * NOTE(review): amd_thresholding_setup() declares its index as
 	 * u_int for the same loop over mp_maxid; this one is still int.
 	 */
 	int i;
 
 	/* One row pointer per possible CPU id, 0..mp_maxid inclusive. */
 	cmc_state = malloc((mp_maxid + 1) * sizeof(struct cmc_state *), M_MCA,
 	    M_WAITOK);
 	/* Each row holds mca_banks zeroed entries. */
 	for (i = 0; i <= mp_maxid; i++)
 		cmc_state[i] = malloc(sizeof(struct cmc_state) * mca_banks,
 		    M_MCA, M_WAITOK | M_ZERO);
 	SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
 	    "cmc_throttle", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
 	    &cmc_throttle, 0, sysctl_positive_int, "I",
 	    "Interval in seconds to throttle corrected MC interrupts");
 }
 
 /*
  * Allocate the per-CPU, per-bank amd_et_state table and register the
  * "cmc_throttle" sysctl.
  */
 static void
 amd_thresholding_setup(void)
 {
-	int i;
+	u_int i;
 
 	/* One row pointer per possible CPU id, 0..mp_maxid inclusive. */
 	amd_et_state = malloc((mp_maxid + 1) * sizeof(struct amd_et_state *),
 	    M_MCA, M_WAITOK);
 	/* Each row holds mca_banks zeroed entries. */
 	for (i = 0; i <= mp_maxid; i++)
 		amd_et_state[i] = malloc(sizeof(struct amd_et_state) *
 		    mca_banks, M_MCA, M_WAITOK | M_ZERO);
 	SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
 	    "cmc_throttle", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
 	    &cmc_throttle, 0, sysctl_positive_int, "I",
 	    "Interval in seconds to throttle corrected MC interrupts");
 }
 #endif
 
 /*
  * One-time initialisation of the driver state: bank count, lock, lists,
  * tasks, timer, the initial free list and the hw.mca sysctls.  With
  * DEV_APIC it also sets up either CMCI or AMD thresholding state.
  */
 static void
 mca_setup(uint64_t mcg_cap)
 {
 
 	/*
 	 * On AMD Family 10h processors, unless logging of level one TLB
 	 * parity (L1TP) errors is disabled, enable the recommended workaround
 	 * for Erratum 383.
 	 */
 	if (cpu_vendor_id == CPU_VENDOR_AMD &&
 	    CPUID_TO_FAMILY(cpu_id) == 0x10 && amd10h_L1TP)
 		workaround_erratum383 = 1;
 
 	mca_banks = mcg_cap & MCG_CAP_COUNT;
 	mtx_init(&mca_lock, "mca", NULL, MTX_SPIN);
 	STAILQ_INIT(&mca_records);
 	TASK_INIT(&mca_scan_task, 0, mca_scan_cpus, NULL);
 	callout_init(&mca_timer, 1);
 	STAILQ_INIT(&mca_freelist);
 	TASK_INIT(&mca_refill_task, 0, mca_refill, NULL);
 	mca_fill_freelist();
 	/* The cast through uintptr_t drops mca_count's volatile qualifier. */
 	SYSCTL_ADD_INT(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
 	    "count", CTLFLAG_RD, (int *)(uintptr_t)&mca_count, 0,
 	    "Record count");
 	SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
 	    "interval", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &mca_ticks,
 	    0, sysctl_positive_int, "I",
 	    "Periodic interval in seconds to scan for machine checks");
 	SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
 	    "records", CTLFLAG_RD, sysctl_mca_records, "Machine check records");
 	SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
 	    "force_scan", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
 	    sysctl_mca_scan, "I", "Force an immediate scan for machine checks");
 #ifdef DEV_APIC
 	/* CMCI takes precedence; AMD thresholding is the fallback. */
 	if (cmci_supported(mcg_cap))
 		cmci_setup();
 	else if (amd_thresholding_supported())
 		amd_thresholding_setup();
 #endif
 }
 
 #ifdef DEV_APIC
 /*
  * See if we should monitor CMCI for this bank.  If CMCI_EN is already
  * set in MC_CTL2, then another CPU is responsible for this bank, so
  * ignore it.  If CMCI_EN returns zero after being set, then this bank
  * does not support CMCI_EN.  If this CPU sets CMCI_EN, then it should
  * now monitor this bank.
  *
  * On success the bank's bit is set in this CPU's cmci_mask and its
  * maximum threshold is recorded in cmc_state.
  */
 static void
 cmci_monitor(int i)
 {
 	struct cmc_state *cc;
 	uint64_t ctl;
 
 	KASSERT(i < mca_banks, ("CPU %d has more MC banks", PCPU_GET(cpuid)));
 
 	ctl = rdmsr(MSR_MC_CTL2(i));
 	if (ctl & MC_CTL2_CMCI_EN)
 		/* Already monitored by another CPU. */
 		return;
 
 	/* Set the threshold to one event for now. */
 	ctl &= ~MC_CTL2_THRESHOLD;
 	ctl |= MC_CTL2_CMCI_EN | 1;
 	wrmsr(MSR_MC_CTL2(i), ctl);
 	/* Read back to see whether the enable bit was accepted. */
 	ctl = rdmsr(MSR_MC_CTL2(i));
 	if (!(ctl & MC_CTL2_CMCI_EN))
 		/* This bank does not support CMCI. */
 		return;
 
 	cc = &cmc_state[PCPU_GET(cpuid)][i];
 
 	/* Determine maximum threshold. */
 	/* Write 0x7fff and keep whatever value the register reads back. */
 	ctl &= ~MC_CTL2_THRESHOLD;
 	ctl |= 0x7fff;
 	wrmsr(MSR_MC_CTL2(i), ctl);
 	ctl = rdmsr(MSR_MC_CTL2(i));
 	cc->max_threshold = ctl & MC_CTL2_THRESHOLD;
 
 	/* Start off with a threshold of 1. */
 	ctl &= ~MC_CTL2_THRESHOLD;
 	ctl |= 1;
 	wrmsr(MSR_MC_CTL2(i), ctl);
 
 	/* Mark this bank as monitored. */
 	PCPU_SET(cmci_mask, PCPU_GET(cmci_mask) | 1 << i);
 }
 
 /*
  * For resume, reset the threshold for any banks we monitor back to
  * one and throw away the timestamp of the last interrupt.
  *
  * max_threshold in cmc_state is left as it was.
  */
 static void
 cmci_resume(int i)
 {
 	struct cmc_state *cc;
 	uint64_t ctl;
 
 	KASSERT(i < mca_banks, ("CPU %d has more MC banks", PCPU_GET(cpuid)));
 
 	/* Ignore banks not monitored by this CPU. */
 	if (!(PCPU_GET(cmci_mask) & 1 << i))
 		return;
 
 	cc = &cmc_state[PCPU_GET(cpuid)][i];
 	cc->last_intr = 0;
 	/* Set CMCI_EN again together with a threshold of 1. */
 	ctl = rdmsr(MSR_MC_CTL2(i));
 	ctl &= ~MC_CTL2_THRESHOLD;
 	ctl |= MC_CTL2_CMCI_EN | 1;
 	wrmsr(MSR_MC_CTL2(i), ctl);
 }
 
 /*
  * Apply an AMD ET configuration to the corresponding MSR.
  *
  * Programs MC_MISC of 'bank' from cc->cur_threshold and amd_elvt with
  * a single read-modify-write; amd_elvt must already be non-negative.
  */
 static void
 amd_thresholding_start(struct amd_et_state *cc, int bank)
 {
 	uint64_t misc;
 
 	KASSERT(amd_elvt >= 0, ("ELVT offset is not set"));
 
 	misc = rdmsr(MSR_MC_MISC(bank));
 
 	/* Select the LVT interrupt type. */
 	misc &= ~MC_MISC_AMD_INT_MASK;
 	misc |= MC_MISC_AMD_INT_LVT;
 
 	/* Store the extended LVT offset obtained from the local APIC. */
 	misc &= ~MC_MISC_AMD_LVT_MASK;
 	misc |= (uint64_t)amd_elvt << MC_MISC_AMD_LVT_SHIFT;
 
 	/* Start the counter cur_threshold below its maximum value. */
 	misc &= ~MC_MISC_AMD_CNT_MASK;
 	misc |= (uint64_t)(MC_MISC_AMD_CNT_MAX - cc->cur_threshold)
 	    << MC_MISC_AMD_CNT_SHIFT;
 
 	/* Clear any stale overflow and enable counting. */
 	misc &= ~MC_MISC_AMD_OVERFLOW;
 	misc |= MC_MISC_AMD_CNTEN;
 
 	wrmsr(MSR_MC_MISC(bank), misc);
 }
 
 /*
  * Try to claim AMD error thresholding for bank 'i' on the current CPU.
  * On success the counter is started with a threshold of 1 and the
  * bank's bit is set in this CPU's cmci_mask; every failed check
  * returns without changing any state.
  */
 static void
 amd_thresholding_monitor(int i)
 {
 	struct amd_et_state *cc;
 	uint64_t misc;
 
 	/*
 	 * Kludge: On 10h, banks after 4 are not thresholding but also may have
 	 * bogus Valid bits.  Skip them.  This is definitely fixed in 15h, but
 	 * I have not investigated whether it is fixed in earlier models.
 	 */
 	if (CPUID_TO_FAMILY(cpu_id) < 0x15 && i >= 5)
 		return;
 
 	/* The counter must be valid and present. */
 	misc = rdmsr(MSR_MC_MISC(i));
 	if ((misc & (MC_MISC_AMD_VAL | MC_MISC_AMD_CNTP)) !=
 	    (MC_MISC_AMD_VAL | MC_MISC_AMD_CNTP))
 		return;
 
 	/* The register should not be locked. */
 	if ((misc & MC_MISC_AMD_LOCK) != 0) {
 		if (bootverbose)
 			printf("%s: 0x%jx: Bank %d: locked\n", __func__,
 			    (uintmax_t)misc, i);
 		return;
 	}
 
 	/*
 	 * If counter is enabled then either the firmware or another CPU
 	 * has already claimed it.
 	 */
 	if ((misc & MC_MISC_AMD_CNTEN) != 0) {
 		if (bootverbose)
 			printf("%s: 0x%jx: Bank %d: already enabled\n",
 			    __func__, (uintmax_t)misc, i);
 		return;
 	}
 
 	/*
 	 * Configure an Extended Interrupt LVT register for reporting
 	 * counter overflows if that feature is supported and the first
 	 * extended register is available.
 	 */
 	/* The file-scope amd_elvt is overwritten even when this fails. */
 	amd_elvt = lapic_enable_mca_elvt();
 	if (amd_elvt < 0) {
 		printf("%s: Bank %d: lapic enable mca elvt failed: %d\n",
 		    __func__, i, amd_elvt);
 		return;
 	}
 
 	/* Re-use Intel CMC support infrastructure. */
 	if (bootverbose)
 		printf("%s: Starting AMD thresholding on bank %d\n", __func__,
 		    i);
 
 	cc = &amd_et_state[PCPU_GET(cpuid)][i];
 	cc->cur_threshold = 1;
 	amd_thresholding_start(cc, i);
 
 	/* Mark this bank as monitored. */
 	PCPU_SET(cmci_mask, PCPU_GET(cmci_mask) | 1 << i);
 }
 
 /*
  * Resume-path counterpart of amd_thresholding_monitor(): if bank 'i' is
  * marked in this CPU's cmci_mask, reset its per-CPU state and start the
  * thresholding counter again with a threshold of one.
  */
 static void
 amd_thresholding_resume(int i)
 {
 	struct amd_et_state *state;
 
 	KASSERT(i < mca_banks, ("CPU %d has more MC banks", PCPU_GET(cpuid)));
 
 	/* Nothing to do for a bank this CPU never claimed. */
 	if ((PCPU_GET(cmci_mask) & (1 << i)) == 0)
 		return;
 
 	state = &amd_et_state[PCPU_GET(cpuid)][i];
 	state->cur_threshold = 1;
 	state->last_intr = 0;
 	amd_thresholding_start(state, i);
 }
 #endif
 
 /*
  * Initializes per-CPU machine check registers and enables corrected
  * machine check interrupts.
  *
  * 'boot' is non-zero on the boot path and zero on the resume path.  On
  * boot the per-CPU cmci_mask is reset, CPU 0 additionally runs
  * mca_setup(), each bank is offered to cmci_monitor() or
  * amd_thresholding_monitor(), and lapic_enable_cmc() is called when at
  * least one bank ended up monitored and AMD thresholding is not in use.
  * On resume the banks are handed to cmci_resume() or
  * amd_thresholding_resume() instead.
  *
  * Returns without doing anything when MCA is disabled or the CPU lacks
  * MCE.  CR4.MCE is set last, after the banks have been programmed and
  * their status registers cleared.
  */
 static void
 _mca_init(int boot)
 {
 	uint64_t mcg_cap;
 	uint64_t ctl, mask;
 	int i, skip, family;
 
 	family = CPUID_TO_FAMILY(cpu_id);
 
 	/* MCE is required. */
 	if (!mca_enabled || !(cpu_feature & CPUID_MCE))
 		return;
 
 	if (cpu_feature & CPUID_MCA) {
 		if (boot)
 			PCPU_SET(cmci_mask, 0);
 
 		mcg_cap = rdmsr(MSR_MCG_CAP);
 		if (mcg_cap & MCG_CAP_CTL_P)
 			/* Enable MCA features. */
 			wrmsr(MSR_MCG_CTL, MCG_CTL_ENABLE);
 		if (PCPU_GET(cpuid) == 0 && boot)
 			mca_setup(mcg_cap);
 
 		/*
 		 * Disable logging of level one TLB parity (L1TP) errors by
 		 * the data cache as an alternative workaround for AMD Family
 		 * 10h Erratum 383.  Unlike the recommended workaround, there
 		 * is no performance penalty to this workaround.  However,
 		 * L1TP errors will go unreported.
 		 */
 		if (cpu_vendor_id == CPU_VENDOR_AMD && family == 0x10 &&
 		    !amd10h_L1TP) {
 			mask = rdmsr(MSR_MC0_CTL_MASK);
 			if ((mask & ((uint64_t)1 << 5)) == 0)
 				wrmsr(MSR_MC0_CTL_MASK,
 				    mask | ((uint64_t)1 << 5));
 		}
 		for (i = 0; i < (mcg_cap & MCG_CAP_COUNT); i++) {
 			/* By default enable logging of all errors. */
 			ctl = ~(uint64_t)0;
 			skip = 0;
 
 			if (cpu_vendor_id == CPU_VENDOR_INTEL) {
 				/*
 				 * For P6 models before Nehalem MC0_CTL is
 				 * always enabled and reserved.
 				 */
 				if (i == 0 && family == 0x6
 				    && CPUID_TO_MODEL(cpu_id) < 0x1a)
 					skip = 1;
 			} else if (cpu_vendor_id == CPU_VENDOR_AMD) {
 				/*
 				 * BKDG for Family 10h: unset GartTblWkEn.
 				 *
 				 * The mask is built as a 64-bit value.  With
 				 * a 32-bit unsigned long, ~(1UL << 10) would
 				 * be zero-extended before the AND and would
 				 * clear bits 63:32 of ctl as well.
 				 */
 				if (i == MC_AMDNB_BANK && family >= 0xf)
 					ctl &= ~((uint64_t)1 << 10);
 			}
 
 			if (!skip)
 				wrmsr(MSR_MC_CTL(i), ctl);
 
 #ifdef DEV_APIC
 			/*
 			 * Hook the bank up to corrected-error reporting:
 			 * CMCI when cmci_supported() says so, otherwise AMD
 			 * thresholding when that is supported.
 			 */
 			if (cmci_supported(mcg_cap)) {
 				if (boot)
 					cmci_monitor(i);
 				else
 					cmci_resume(i);
 			} else if (amd_thresholding_supported()) {
 				if (boot)
 					amd_thresholding_monitor(i);
 				else
 					amd_thresholding_resume(i);
 			}
 #endif
 
 			/* Clear all errors. */
 			wrmsr(MSR_MC_STATUS(i), 0);
 		}
 
 #ifdef DEV_APIC
 		/*
 		 * Only on boot, only when some bank is monitored, and only
 		 * when AMD thresholding is not the mechanism in use.
 		 */
 		if (!amd_thresholding_supported() &&
 		    PCPU_GET(cmci_mask) != 0 && boot)
 			lapic_enable_cmc();
 #endif
 	}
 
 	load_cr4(rcr4() | CR4_MCE);
 }
 
 /*
  * Boot-time entry point: runs _mca_init() in boot mode.  Must be
  * executed on each CPU during boot.
  */
 void
 mca_init(void)
 {
 	_mca_init(1);
 }
 
 /*
  * Resume-time entry point: runs _mca_init() in resume (non-boot) mode.
  * Must be executed on each CPU during resume.
  */
 void
 mca_resume(void)
 {
 	_mca_init(0);
 }
 
 /*
  * SYSINIT hook that initializes the machine check registers of the BSP.
  * They cannot be set up before the local APIC is, and the local APIC is
  * initialized at SI_SUB_CPU, SI_ORDER_SECOND; this hook is therefore
  * registered in the same subsystem at SI_ORDER_ANY.
  */
 static void
 mca_init_bsp(void *arg __unused)
 {
 	mca_init();
 }
 SYSINIT(mca_init_bsp, SI_SUB_CPU, SI_ORDER_ANY, mca_init_bsp, NULL);
 
 /*
  * Machine check exception handler.
  *
  * Without MCA support only the two old Pentium registers are printed
  * before panicking.  Otherwise the banks are scanned; the event is
  * treated as unrecoverable when the scan says so or when
  * MCG_STATUS.RIPV is clear.  On the recoverable path MCIP is cleared
  * and the handler returns.
  */
 void
 mca_intr(void)
 {
 	uint64_t status;
 	int can_recover, nfound;
 
 	if ((cpu_feature & CPUID_MCA) == 0) {
 		/*
 		 * No MCA banks to scan: dump the old Pentium registers
 		 * and panic.
 		 */
 		printf("MC Type: 0x%jx  Address: 0x%jx\n",
 		    (uintmax_t)rdmsr(MSR_P5_MC_TYPE),
 		    (uintmax_t)rdmsr(MSR_P5_MC_ADDR));
 		panic("Machine check");
 	}
 
 	/* Walk the banks looking for errors that cannot be recovered. */
 	nfound = mca_scan(MCE, &can_recover);
 	status = rdmsr(MSR_MCG_STATUS);
 	if ((status & MCG_STATUS_RIPV) == 0)
 		can_recover = 0;
 
 	if (can_recover == 0) {
 		/*
 		 * Only panic if the error was detected local to this CPU.
 		 * Some errors will assert a machine check on all CPUs, but
 		 * only certain CPUs will find a valid bank to log.  A CPU
 		 * that found nothing spins here and never reaches the
 		 * panic below.
 		 */
 		while (nfound == 0)
 			cpu_spinwait();
 
 		panic("Unrecoverable machine check exception");
 	}
 
 	/* Recoverable: clear MCIP in MCG_STATUS. */
 	wrmsr(MSR_MCG_STATUS, status & ~MCG_STATUS_MCIP);
 }
 
 #ifdef DEV_APIC
 /*
  * Handler for a CMCI (correctable machine check interrupt): scans the
  * banks and, if the scan reports anything, logs every record on
  * mca_records that has not been logged yet.
  */
 void
 cmc_intr(void)
 {
 	struct mca_internal *entry;
 
 	/*
 	 * NOTE(review): the comment previously attached to this call
 	 * spoke of serializing the bank scan against sibling threads.  No
 	 * lock is taken around mca_scan() in this function, so presumably
 	 * any such serialization happens inside it -- confirm.
 	 */
 	if (mca_scan(CMCI, NULL) == 0)
 		return;
 
 	/* Something was found: log the new records to the console. */
 	mtx_lock_spin(&mca_lock);
 	STAILQ_FOREACH(entry, &mca_records, link) {
 		if (entry->logged)
 			continue;
 		entry->logged = 1;
 		mca_log(&entry->rec);
 	}
 	mtx_unlock_spin(&mca_lock);
 }
 #endif