Index: projects/hps_callouts/sys/compat/linuxkpi/common/src/linux_compat.c
===================================================================
--- projects/hps_callouts/sys/compat/linuxkpi/common/src/linux_compat.c	(revision 352141)
+++ projects/hps_callouts/sys/compat/linuxkpi/common/src/linux_compat.c	(revision 352142)
@@ -1,2458 +1,2456 @@
 /*-
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
  * Copyright (c) 2013-2018 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_stack.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 #include <sys/proc.h>
 #include <sys/sglist.h>
 #include <sys/sleepqueue.h>
 #include <sys/refcount.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/bus.h>
 #include <sys/eventhandler.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filio.h>
 #include <sys/rwlock.h>
 #include <sys/mman.h>
 #include <sys/stack.h>
 #include <sys/user.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pager.h>
 
 #include <machine/stdarg.h>
 
 #if defined(__i386__) || defined(__amd64__)
 #include <machine/md_var.h>
 #endif
 
 #include <linux/kobject.h>
 #include <linux/device.h>
 #include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/moduleparam.h>
 #include <linux/cdev.h>
 #include <linux/file.h>
 #include <linux/sysfs.h>
 #include <linux/mm.h>
 #include <linux/io.h>
 #include <linux/vmalloc.h>
 #include <linux/netdevice.h>
 #include <linux/timer.h>
 #include <linux/interrupt.h>
 #include <linux/uaccess.h>
 #include <linux/list.h>
 #include <linux/kthread.h>
 #include <linux/kernel.h>
 #include <linux/compat.h>
 #include <linux/poll.h>
 #include <linux/smp.h>
 
 #if defined(__i386__) || defined(__amd64__)
 #include <asm/smp.h>
 #endif
 
 /* Sysctl node "compat.linuxkpi" under which the LinuxKPI knobs live. */
 SYSCTL_NODE(_compat, OID_AUTO, linuxkpi, CTLFLAG_RW, 0, "LinuxKPI parameters");
 
 /* Switch for pr_debug() output, exported as "compat.linuxkpi.debug". */
 int linuxkpi_debug;
 SYSCTL_INT(_compat_linuxkpi, OID_AUTO, debug, CTLFLAG_RWTUN,
     &linuxkpi_debug, 0, "Set to enable pr_debug() prints. Clear to disable.");
 
 /* Malloc type M_KMALLOC, named "linux", for the Linux kmalloc compat. */
 MALLOC_DEFINE(M_KMALLOC, "linux", "Linux kmalloc compat");
 
 #include <linux/rbtree.h>
 /* Undo Linux compat changes. */
 #undef RB_ROOT
 #undef file
 #undef cdev
 #define	RB_ROOT(head)	(head)->rbh_root
 
 static void linux_cdev_deref(struct linux_cdev *ldev);
 static struct vm_area_struct *linux_cdev_handle_find(void *handle);
 
 /* File-scope LinuxKPI objects with external linkage. */
 struct kobject linux_class_root;
 struct device linux_root_device;
 struct class linux_class_misc;
 struct list_head pci_drivers;
 struct list_head pci_devices;
 spinlock_t pci_lock;
 
 /* NOTE(review): presumably a mask derived from "hz" for timers - confirm. */
 unsigned long linux_timer_hz_mask;
 
 int
 panic_cmp(struct rb_node *one, struct rb_node *two)
 {
 	panic("no cmp");
 }
 
 RB_GENERATE(linux_root, rb_node, __entry, panic_cmp);
 
 /*
  * Format a new name for "kobj" from "fmt" and "args" and install it,
  * releasing the previous name.  Every '/' in the new name is replaced
  * by '!'.
  *
  * Returns zero on success, and also when a name is already set and
  * "fmt" is NULL.  Returns -EINVAL if the length cannot be computed
  * and -ENOMEM if the allocation fails.
  */
 int
 kobject_set_name_vargs(struct kobject *kobj, const char *fmt, va_list args)
 {
 	va_list args_copy;
 	char *prev_name;
 	char *new_name;
 	char *cp;
 	char probe;
 	int size;
 
 	prev_name = kobj->name;
 
 	/* keep the current name when no format string was given */
 	if (prev_name && fmt == NULL)
 		return (0);
 
 	/* measure the formatted string on a copy of the arguments */
 	va_copy(args_copy, args);
 	size = vsnprintf(&probe, 0, fmt, args_copy);
 	va_end(args_copy);
 
 	/* make room for the terminating zero */
 	size += 1;
 
 	/* a negative length ends up below one here */
 	if (size < 1)
 		return (-EINVAL);
 
 	/* build the new string in freshly allocated memory */
 	new_name = kzalloc(size, GFP_KERNEL);
 	if (new_name == NULL)
 		return (-ENOMEM);
 	vsnprintf(new_name, size, fmt, args);
 	kobj->name = new_name;
 
 	/* the old string is no longer referenced */
 	kfree(prev_name);
 
 	/* replace path separators in the new string */
 	cp = new_name;
 	while (*cp != '\0') {
 		if (*cp == '/')
 			*cp = '!';
 		cp++;
 	}
 	return (0);
 }
 
 /*
  * Variadic front-end for kobject_set_name_vargs(); returns its result.
  */
 int
 kobject_set_name(struct kobject *kobj, const char *fmt, ...)
 {
 	va_list ap;
 	int rv;
 
 	va_start(ap, fmt);
 	rv = kobject_set_name_vargs(kobj, fmt, ap);
 	va_end(ap);
 
 	return (rv);
 }
 
 /*
  * Attach "kobj" to "parent", create its sysfs directory and one sysfs
  * file for each default attribute of its type, if any.  The directory
  * is removed again when creating an attribute file fails.  Returns
  * zero on success or the error from the failing sysfs call.
  */
 static int
 kobject_add_complete(struct kobject *kobj, struct kobject *parent)
 {
 	const struct kobj_type *ktype;
 	struct attribute **ap;
 	int rv;
 
 	kobj->parent = parent;
 	rv = sysfs_create_dir(kobj);
 	if (rv != 0)
 		return (rv);
 
 	/* nothing more to do without default attributes */
 	ktype = kobj->ktype;
 	if (ktype == NULL || ktype->default_attrs == NULL)
 		return (rv);
 
 	for (ap = ktype->default_attrs; *ap != NULL; ap++) {
 		rv = sysfs_create_file(kobj, *ap);
 		if (rv != 0) {
 			/* undo the directory creation on failure */
 			sysfs_remove_dir(kobj);
 			break;
 		}
 	}
 	return (rv);
 }
 
 /*
  * Name "kobj" using the given format string and then add it below
  * "parent".  Returns zero on success or the first error encountered.
  */
 int
 kobject_add(struct kobject *kobj, struct kobject *parent, const char *fmt, ...)
 {
 	va_list ap;
 	int rv;
 
 	va_start(ap, fmt);
 	rv = kobject_set_name_vargs(kobj, fmt, ap);
 	va_end(ap);
 
 	if (rv == 0)
 		rv = kobject_add_complete(kobj, parent);
 	return (rv);
 }
 
 /*
  * Kref release callback for kobjects: remove the sysfs directory,
  * run the release method of the object type, if any, and free the
  * name string.
  */
 void
 linux_kobject_release(struct kref *kref)
 {
 	struct kobject *kobj = container_of(kref, struct kobject, kref);
 	char *saved_name;
 
 	sysfs_remove_dir(kobj);
 
 	/*
 	 * Fetch the name pointer before the release method runs; a
 	 * release method such as linux_kobject_kfree() frees "kobj".
 	 */
 	saved_name = kobj->name;
 	if (kobj->ktype != NULL && kobj->ktype->release != NULL)
 		kobj->ktype->release(kobj);
 	kfree(saved_name);
 }
 
 /* Release method of linux_kfree_type: free the kobject itself. */
 static void
 linux_kobject_kfree(struct kobject *kobj)
 {
 	kfree(kobj);
 }
 
 /* Free only the name string of "kobj"; a NULL "kobj" is ignored. */
 static void
 linux_kobject_kfree_name(struct kobject *kobj)
 {
 	if (kobj == NULL)
 		return;
 	kfree(kobj->name);
 }
 
 /* Object type whose release method frees the kobject with kfree(). */
 const struct kobj_type linux_kfree_type = {
 	.release = linux_kobject_kfree
 };
 
 /*
  * Release method installed by device_create(): log the device name
  * through pr_debug() and free the device structure.
  */
 static void
 linux_device_release(struct device *dev)
 {
 	pr_debug("linux_device_release: %s\n", dev_name(dev));
 	kfree(dev);
 }
 
 /*
  * Sysfs "show" entry point for class attributes.  Forwards to the
  * show method of the class attribute embedding "attr" and returns
  * its result, or -EIO when the attribute has no show method.
  */
 static ssize_t
 linux_class_show(struct kobject *kobj, struct attribute *attr, char *buf)
 {
 	struct class_attribute *cattr;
 
 	cattr = container_of(attr, struct class_attribute, attr);
 	if (cattr->show == NULL)
 		return (-EIO);
 	return (cattr->show(container_of(kobj, struct class, kobj),
 	    cattr, buf));
 }
 
 /*
  * Sysfs "store" entry point for class attributes.  Forwards to the
  * store method of the class attribute embedding "attr" and returns
  * its result, or -EIO when the attribute has no store method.
  */
 static ssize_t
 linux_class_store(struct kobject *kobj, struct attribute *attr, const char *buf,
     size_t count)
 {
 	struct class_attribute *cattr;
 
 	cattr = container_of(attr, struct class_attribute, attr);
 	if (cattr->store == NULL)
 		return (-EIO);
 	return (cattr->store(container_of(kobj, struct class, kobj),
 	    cattr, buf, count));
 }
 
 /*
  * Kobject release method for classes: invoke the class_release
  * callback of the class embedding "kobj", if one is set.
  */
 static void
 linux_class_release(struct kobject *kobj)
 {
 	struct class *cls = container_of(kobj, struct class, kobj);
 
 	if (cls->class_release != NULL)
 		cls->class_release(cls);
 }
 
 /* Sysfs show/store dispatchers for class attributes. */
 static const struct sysfs_ops linux_class_sysfs = {
 	.show  = linux_class_show,
 	.store = linux_class_store,
 };
 
 /* Object type for classes: release callback plus the sysfs dispatchers. */
 const struct kobj_type linux_class_ktype = {
 	.release = linux_class_release,
 	.sysfs_ops = &linux_class_sysfs
 };
 
 /*
  * Kobject release method for devices.  The release hook of the
  * device itself takes precedence; the dev_release hook of its class
  * is only used when the device has none.
  */
 static void
 linux_dev_release(struct kobject *kobj)
 {
 	struct device *dev = container_of(kobj, struct device, kobj);
 
 	/* This is the precedence defined by linux. */
 	if (dev->release != NULL) {
 		dev->release(dev);
 		return;
 	}
 	if (dev->class != NULL && dev->class->dev_release != NULL)
 		dev->class->dev_release(dev);
 }
 
 /*
  * Sysfs "show" entry point for device attributes.  Forwards to the
  * show method of the device attribute embedding "attr" and returns
  * its result, or -EIO when the attribute has no show method.
  */
 static ssize_t
 linux_dev_show(struct kobject *kobj, struct attribute *attr, char *buf)
 {
 	struct device_attribute *devattr;
 
 	devattr = container_of(attr, struct device_attribute, attr);
 	if (devattr->show == NULL)
 		return (-EIO);
 	return (devattr->show(container_of(kobj, struct device, kobj),
 	    devattr, buf));
 }
 
 /*
  * Sysfs "store" entry point for device attributes.  Forwards to the
  * store method of the device attribute embedding "attr" and returns
  * its result, or -EIO when the attribute has no store method.
  */
 static ssize_t
 linux_dev_store(struct kobject *kobj, struct attribute *attr, const char *buf,
     size_t count)
 {
 	struct device_attribute *devattr;
 
 	devattr = container_of(attr, struct device_attribute, attr);
 	if (devattr->store == NULL)
 		return (-EIO);
 	return (devattr->store(container_of(kobj, struct device, kobj),
 	    devattr, buf, count));
 }
 
 /* Sysfs show/store dispatchers for device attributes. */
 static const struct sysfs_ops linux_dev_sysfs = {
 	.show  = linux_dev_show,
 	.store = linux_dev_store,
 };
 
 /* Object type for devices: release callback plus the sysfs dispatchers. */
 const struct kobj_type linux_dev_ktype = {
 	.release = linux_dev_release,
 	.sysfs_ops = &linux_dev_sysfs
 };
 
 /*
  * Allocate a device, fill in its parent, class, device number and
  * driver data, name it from the format string and register it.  The
  * device is freed by linux_device_release() when it is released.
  *
  * The results of naming and registering the device are not checked;
  * the new device is returned in any case.
  */
 struct device *
 device_create(struct class *class, struct device *parent, dev_t devt,
     void *drvdata, const char *fmt, ...)
 {
 	struct device *newdev;
 	va_list ap;
 
 	newdev = kzalloc(sizeof(*newdev), M_WAITOK);
 	newdev->release = linux_device_release;
 	newdev->driver_data = drvdata;
 	newdev->devt = devt;
 	newdev->class = class;
 	newdev->parent = parent;
 
 	va_start(ap, fmt);
 	kobject_set_name_vargs(&newdev->kobj, fmt, ap);
 	va_end(ap);
 
 	device_register(newdev);
 
 	return (newdev);
 }
 
 /*
  * Initialise "kobj" with the given type, name it from the format
  * string and add it below "parent".  Returns zero on success or the
  * first error encountered.
  */
 int
 kobject_init_and_add(struct kobject *kobj, const struct kobj_type *ktype,
     struct kobject *parent, const char *fmt, ...)
 {
 	va_list ap;
 	int rv;
 
 	kobject_init(kobj, ktype);
 
 	/* start out unnamed so that no stale name pointer is freed */
 	kobj->name = NULL;
 	kobj->parent = parent;
 	kobj->ktype = ktype;
 
 	va_start(ap, fmt);
 	rv = kobject_set_name_vargs(kobj, fmt, ap);
 	va_end(ap);
 
 	if (rv == 0)
 		rv = kobject_add_complete(kobj, parent);
 	return (rv);
 }
 
 /* Knlist lock callback: "arg" is the spin lock given to knlist_init(). */
 static void
 linux_kq_lock(void *arg)
 {
 	spinlock_t *s = arg;
 
 	spin_lock(s);
 }
 /* Knlist unlock callback: "arg" is the spin lock given to knlist_init(). */
 static void
 linux_kq_unlock(void *arg)
 {
 	spinlock_t *s = arg;
 
 	spin_unlock(s);
 }
 
 /*
  * Knlist callback asserting that the spin lock is held.  Compiles to
  * an empty function unless INVARIANTS is defined.
  */
 static void
 linux_kq_lock_owned(void *arg)
 {
 #ifdef INVARIANTS
 	spinlock_t *s = arg;
 
 	mtx_assert(&s->m, MA_OWNED);
 #endif
 }
 
 /*
  * Knlist callback asserting that the spin lock is not held.  Compiles
  * to an empty function unless INVARIANTS is defined.
  */
 static void
 linux_kq_lock_unowned(void *arg)
 {
 #ifdef INVARIANTS
 	spinlock_t *s = arg;
 
 	mtx_assert(&s->m, MA_NOTOWNED);
 #endif
 }
 
 static void
 linux_file_kqfilter_poll(struct linux_file *, int);
 
 /*
  * Allocate a zeroed linux_file with a reference count of one and
  * with the spin lock and knote list used by the kqueue support
  * initialised.  The result of kzalloc() is used without a NULL check.
  */
 struct linux_file *
 linux_file_alloc(void)
 {
 	struct linux_file *filp;
 
 	filp = kzalloc(sizeof(*filp), GFP_KERNEL);
 
 	/* set initial refcount */
 	filp->f_count = 1;
 
 	/* setup fields needed by kqueue support */
 	spin_lock_init(&filp->f_kqlock);
 	/* the knote list is protected by f_kqlock via the linux_kq_*() hooks */
 	knlist_init(&filp->f_selinfo.si_note, &filp->f_kqlock,
 	    linux_kq_lock, linux_kq_unlock,
 	    linux_kq_lock_owned, linux_kq_lock_unowned);
 
 	return (filp);
 }
 
 /*
  * Dispose of a linux_file.  When it is attached to a FreeBSD file,
  * that file is dropped and its close method is left to free the
  * linux_file.  Otherwise the shared memory object, if any, is
  * released and the structure is freed here.
  */
 void
 linux_file_free(struct linux_file *filp)
 {
 	if (filp->_file != NULL) {
 		/*
 		 * The close method of the character device or file
 		 * will free the linux_file structure:
 		 */
 		_fdrop(filp->_file, curthread);
 		return;
 	}
 
 	if (filp->f_shmem != NULL)
 		vm_object_deallocate(filp->f_shmem);
 	kfree(filp);
 }
 
 /*
  * Fault handler installed as cdev_pg_fault in linux_cdev_pager_ops[1].
  *
  * Finds the VM area registered for vm_obj->handle and, when it has
  * vm_ops set and "offset" lies below vm_len, backs the faulting page
  * with a fictitious page at physical address
  * IDX_TO_OFF(vm_pfn) + offset.  Returns VM_PAGER_OK in that case and
  * VM_PAGER_FAIL otherwise.  The "prot" argument is not used.
  */
 static int
 linux_cdev_pager_fault(vm_object_t vm_obj, vm_ooffset_t offset, int prot,
     vm_page_t *mres)
 {
 	struct vm_area_struct *vmap;
 
 	vmap = linux_cdev_handle_find(vm_obj->handle);
 
 	MPASS(vmap != NULL);
 	MPASS(vmap->vm_private_data == vm_obj->handle);
 
 	if (likely(vmap->vm_ops != NULL && offset < vmap->vm_len)) {
 		vm_paddr_t paddr = IDX_TO_OFF(vmap->vm_pfn) + offset;
 		vm_page_t page;
 
 		if (((*mres)->flags & PG_FICTITIOUS) != 0) {
 			/*
 			 * If the passed in result page is a fake
 			 * page, update it with the new physical
 			 * address.
 			 */
 			page = *mres;
 			vm_page_updatefake(page, paddr, vm_obj->memattr);
 		} else {
 			/*
 			 * Replace the passed in "mres" page with our
 			 * own fake page and free up the all of the
 			 * original pages.
 			 */
 			/* the object lock is dropped while the fake page is obtained */
 			VM_OBJECT_WUNLOCK(vm_obj);
 			page = vm_page_getfake(paddr, vm_obj->memattr);
 			VM_OBJECT_WLOCK(vm_obj);
 
 			vm_page_replace_checked(page, vm_obj,
 			    (*mres)->pindex, *mres);
 
 			vm_page_free(*mres);
 			*mres = page;
 		}
 		/* mark the whole page valid before handing it back */
 		page->valid = VM_PAGE_BITS_ALL;
 		return (VM_PAGER_OK);
 	}
 	return (VM_PAGER_FAIL);
 }
 
 /*
  * Populate handler installed as cdev_pg_populate in
  * linux_cdev_pager_ops[0].
  *
  * Calls the Linux fault handler of the VM area registered for
  * vm_obj->handle with the VM object unlocked and the mmap_sem of the
  * area's mm_struct held for writing, then converts the VM_FAULT_XXX
  * result into a VM_PAGER_XXX code.  On VM_FAULT_NOPAGE the page index
  * range recorded in vm_pfn_first and vm_pfn_count is returned through
  * "first" and "last".  The VM object is write locked again before
  * returning.  The "max_prot" argument is not used.
  */
 static int
 linux_cdev_pager_populate(vm_object_t vm_obj, vm_pindex_t pidx, int fault_type,
     vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last)
 {
 	struct vm_area_struct *vmap;
 	int err;
 
 	linux_set_current(curthread);
 
 	/* get VM area structure */
 	vmap = linux_cdev_handle_find(vm_obj->handle);
 	MPASS(vmap != NULL);
 	MPASS(vmap->vm_private_data == vm_obj->handle);
 
 	VM_OBJECT_WUNLOCK(vm_obj);
 
 	down_write(&vmap->vm_mm->mmap_sem);
 	if (unlikely(vmap->vm_ops == NULL)) {
 		err = VM_FAULT_SIGBUS;
 	} else {
 		struct vm_fault vmf;
 
 		/* fill out VM fault structure */
 		vmf.virtual_address = (void *)(uintptr_t)IDX_TO_OFF(pidx);
 		vmf.flags = (fault_type & VM_PROT_WRITE) ? FAULT_FLAG_WRITE : 0;
 		vmf.pgoff = 0;
 		vmf.page = NULL;
 		vmf.vma = vmap;
 
 		/* reset the page counter the fault handler reports through */
 		vmap->vm_pfn_count = 0;
 		vmap->vm_pfn_pcount = &vmap->vm_pfn_count;
 		vmap->vm_obj = vm_obj;
 
 		err = vmap->vm_ops->fault(vmap, &vmf);
 
 		/*
 		 * Retry for as long as the handler returns
 		 * VM_FAULT_NOPAGE without having counted any page,
 		 * yielding the CPU between attempts.
 		 */
 		while (vmap->vm_pfn_count == 0 && err == VM_FAULT_NOPAGE) {
 			kern_yield(PRI_USER);
 			err = vmap->vm_ops->fault(vmap, &vmf);
 		}
 	}
 
 	/* translate return code */
 	switch (err) {
 	case VM_FAULT_OOM:
 		err = VM_PAGER_AGAIN;
 		break;
 	case VM_FAULT_SIGBUS:
 		err = VM_PAGER_BAD;
 		break;
 	case VM_FAULT_NOPAGE:
 		/*
 		 * By contract the fault handler will return having
 		 * busied all the pages itself. If pidx is already
 		 * found in the object, it will simply xbusy the first
 		 * page and return with vm_pfn_count set to 1.
 		 */
 		*first = vmap->vm_pfn_first;
 		*last = *first + vmap->vm_pfn_count - 1;
 		err = VM_PAGER_OK;
 		break;
 	default:
 		err = VM_PAGER_ERROR;
 		break;
 	}
 	up_write(&vmap->vm_mm->mmap_sem);
 	VM_OBJECT_WLOCK(vm_obj);
 	return (err);
 }
 
 /*
  * List of registered VM areas, looked up by their vm_private_data,
  * and the read/write lock taken around every access to the list.
  */
 static struct rwlock linux_vma_lock;
 static TAILQ_HEAD(, vm_area_struct) linux_vma_head =
     TAILQ_HEAD_INITIALIZER(linux_vma_head);
 
 /*
  * Free a VM area structure together with the references it holds on
  * its file, if any, and on its mm_struct.  The structure is not
  * unlinked from linux_vma_head here.
  */
 static void
 linux_cdev_handle_free(struct vm_area_struct *vmap)
 {
 	/* Drop reference on vm_file */
 	if (vmap->vm_file != NULL)
 		fput(vmap->vm_file);
 
 	/* Drop reference on mm_struct */
 	mmput(vmap->vm_mm);
 
 	kfree(vmap);
 }
 
 /* Unlink "vmap" from linux_vma_head under the write lock. */
 static void
 linux_cdev_handle_remove(struct vm_area_struct *vmap)
 {
 	rw_wlock(&linux_vma_lock);
 	TAILQ_REMOVE(&linux_vma_head, vmap, vm_entry);
 	rw_wunlock(&linux_vma_lock);
 }
 
 static struct vm_area_struct *
 linux_cdev_handle_find(void *handle)
 {
 	struct vm_area_struct *vmap;
 
 	rw_rlock(&linux_vma_lock);
 	TAILQ_FOREACH(vmap, &linux_vma_head, vm_entry) {
 		if (vmap->vm_private_data == handle)
 			break;
 	}
 	rw_runlock(&linux_vma_lock);
 	return (vmap);
 }
 
 /*
  * Pager constructor shared by both entries of linux_cdev_pager_ops.
  * Asserts that a VM area is registered for "handle", sets "*color" to
  * zero and returns zero.  The remaining arguments are not used.
  */
 static int
 linux_cdev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot,
 		      vm_ooffset_t foff, struct ucred *cred, u_short *color)
 {
 
 	MPASS(linux_cdev_handle_find(handle) != NULL);
 	*color = 0;
 	return (0);
 }
 
 /*
  * Pager destructor shared by both entries of linux_cdev_pager_ops.
  * Unregisters the VM area belonging to "handle", calls its Linux
  * close method with the mmap_sem held for writing and then frees the
  * VM area.
  */
 static void
 linux_cdev_pager_dtor(void *handle)
 {
 	struct vm_area_struct *vmap;
 	const struct vm_operations_struct *ops;
 
 	vmap = linux_cdev_handle_find(handle);
 	MPASS(vmap != NULL);
 
 	/*
 	 * Take the area off the list first, so that no other thread
 	 * can pick up the handle pointer while close is running.
 	 */
 	linux_cdev_handle_remove(vmap);
 
 	down_write(&vmap->vm_mm->mmap_sem);
 	ops = vmap->vm_ops;
 	if (likely(ops != NULL))
 		ops->close(vmap);
 	up_write(&vmap->vm_mm->mmap_sem);
 
 	linux_cdev_handle_free(vmap);
 }
 
 /*
  * Device pager methods.  Entry 0 provides a populate method and is
  * used for OBJT_MGTDEVICE objects; entry 1 provides a fault method
  * and is used for OBJT_DEVICE objects.  Both share the constructor
  * and destructor.
  */
 static struct cdev_pager_ops linux_cdev_pager_ops[2] = {
   {
 	/* OBJT_MGTDEVICE */
 	.cdev_pg_populate	= linux_cdev_pager_populate,
 	.cdev_pg_ctor	= linux_cdev_pager_ctor,
 	.cdev_pg_dtor	= linux_cdev_pager_dtor
   },
   {
 	/* OBJT_DEVICE */
 	.cdev_pg_fault	= linux_cdev_pager_fault,
 	.cdev_pg_ctor	= linux_cdev_pager_ctor,
 	.cdev_pg_dtor	= linux_cdev_pager_dtor
   },
 };
 
 int
 zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
     unsigned long size)
 {
 	vm_object_t obj;
 	vm_page_t m;
 
 	obj = vma->vm_obj;
 	if (obj == NULL || (obj->flags & OBJ_UNMANAGED) != 0)
 		return (-ENOTSUP);
 	VM_OBJECT_RLOCK(obj);
 	for (m = vm_page_find_least(obj, OFF_TO_IDX(address));
 	    m != NULL && m->pindex < OFF_TO_IDX(address + size);
 	    m = TAILQ_NEXT(m, listq))
 		pmap_remove_all(m);
 	VM_OBJECT_RUNLOCK(obj);
 	return (0);
 }
 
 /*
  * Empty file operations table: every method pointer is NULL.  It is
  * what linux_get_fop() hands out through dummy_ldev.
  */
 static struct file_operations dummy_ldev_ops = {
 	/* XXXKIB */
 };
 
 /*
  * Stand-in character device that linux_get_fop() substitutes when the
  * real device has LDEV_SI_DTR set in its "siref" field.
  */
 static struct linux_cdev dummy_ldev = {
 	.ops = &dummy_ldev_ops,
 };
 
 /*
  * Layout of the "siref" field of struct linux_cdev:
  * LDEV_SI_DTR is a flag bit; when linux_get_fop() finds it set, it
  * uses dummy_ldev in place of the device.  LDEV_SI_REF is the amount
  * added by linux_get_fop() and subtracted by linux_drop_fop() for
  * each use reference.
  */
 #define	LDEV_SI_DTR	0x0001
 #define	LDEV_SI_REF	0x0002
 
 /*
  * Look up the file operations to use for "filp" and take a use
  * reference on its character device, if it has one.
  *
  * "*fop" starts out as filp->f_op.  When the file has a character
  * device, LDEV_SI_REF is atomically added to the "siref" field of
  * that device, unless LDEV_SI_DTR is found set; in that case the
  * reference is taken on dummy_ldev instead and "*fop" is switched to
  * the operations table of dummy_ldev.  "*dev" receives the device
  * that was referenced, or NULL when the file has no device.  Callers
  * in this file hand "*dev" to linux_drop_fop() when done.
  */
 static void
 linux_get_fop(struct linux_file *filp, const struct file_operations **fop,
     struct linux_cdev **dev)
 {
 	struct linux_cdev *ldev;
 	u_int siref;
 
 	ldev = filp->f_cdev;
 	*fop = filp->f_op;
 	if (ldev != NULL) {
 		/*
 		 * Retry until the compare-and-set succeeds.
 		 * NOTE(review): relies on atomic_fcmpset_int() storing
 		 * the current value into "siref" when it fails - see
 		 * atomic(9).
 		 */
 		for (siref = ldev->siref;;) {
 			if ((siref & LDEV_SI_DTR) != 0) {
 				/* switch over to the stand-in device */
 				ldev = &dummy_ldev;
 				siref = ldev->siref;
 				*fop = ldev->ops;
 				MPASS((ldev->siref & LDEV_SI_DTR) == 0);
 			} else if (atomic_fcmpset_int(&ldev->siref, &siref,
 			    siref + LDEV_SI_REF)) {
 				break;
 			}
 		}
 	}
 	*dev = ldev;
 }
 
 /*
  * Give back a use reference taken by linux_get_fop().  A NULL device
  * is accepted and ignored.
  */
 static void
 linux_drop_fop(struct linux_cdev *ldev)
 {
 
 	if (ldev != NULL) {
 		/* there must be at least one reference left to drop */
 		MPASS((ldev->siref & ~LDEV_SI_DTR) != 0);
 		atomic_subtract_int(&ldev->siref, LDEV_SI_REF);
 	}
 }
 
 /*
  * Evaluate "code" with td->td_fpop temporarily set to "fp" and yield
  * the value of "code".  The previous td_fpop value is restored
  * afterwards.  Uses a statement expression and __typeof.
  */
 #define	OPW(fp,td,code) ({			\
 	struct file *__fpop;			\
 	__typeof(code) __retval;		\
 						\
 	__fpop = (td)->td_fpop;			\
 	(td)->td_fpop = (fp);			\
 	__retval = (code);			\
 	(td)->td_fpop = __fpop;			\
 	__retval;				\
 })
 
 /*
  * d_fdopen method of linuxcdevsw: open the Linux character device
  * stored in dev->si_drv1 on behalf of "file".
  *
  * A linux_file is allocated and filled in from "file", a reference is
  * taken on the linux_cdev and the Linux open method, if any, is
  * called.  If it fails, the references and the linux_file are
  * released and the negated Linux error code is returned.  Otherwise
  * the vnode is held and "file" is re-initialised with finit() to use
  * linuxfileops with the linux_file as its private data.
  *
  * NOTE(review): the success path returns ENXIO on purpose (see the
  * "release the file from devfs" step); presumably devfs treats this
  * value specially once finit() has been called - confirm against the
  * devfs open code.
  */
 static int
 linux_dev_fdopen(struct cdev *dev, int fflags, struct thread *td,
     struct file *file)
 {
 	struct linux_cdev *ldev;
 	struct linux_file *filp;
 	const struct file_operations *fop;
 	int error;
 
 	ldev = dev->si_drv1;
 
 	filp = linux_file_alloc();
 	filp->f_dentry = &filp->f_dentry_store;
 	filp->f_op = ldev->ops;
 	filp->f_mode = file->f_flag;
 	filp->f_flags = file->f_flag;
 	filp->f_vnode = file->f_vnode;
 	filp->_file = file;
 	/* long-lived reference, kept in filp->f_cdev */
 	refcount_acquire(&ldev->refs);
 	filp->f_cdev = ldev;
 
 	linux_set_current(td);
 	/* short-lived use reference; "ldev" may now point at dummy_ldev */
 	linux_get_fop(filp, &fop, &ldev);
 
 	if (fop->open != NULL) {
 		error = -fop->open(file->f_vnode, filp);
 		if (error != 0) {
 			linux_drop_fop(ldev);
 			linux_cdev_deref(filp->f_cdev);
 			kfree(filp);
 			return (error);
 		}
 	}
 
 	/* hold on to the vnode - used for fstat() */
 	vhold(filp->f_vnode);
 
 	/* release the file from devfs */
 	finit(file, filp->f_mode, DTYPE_DEV, filp, &linuxfileops);
 	linux_drop_fop(ldev);
 	return (ENXIO);
 }
 
 /*
  * Range of fake user-space addresses [MIN_PTR, MAX_PTR) that
  * linux_remap_address() redirects to the kernel copy of the ioctl
  * argument.  The range is IOCPARM_MAX bytes long.
  */
 #define	LINUX_IOCTL_MIN_PTR 0x10000UL
 #define	LINUX_IOCTL_MAX_PTR (LINUX_IOCTL_MIN_PTR + IOCPARM_MAX)
 
 /*
  * Redirect an address inside the fake ioctl window to the kernel
  * buffer recorded in the current task.
  *
  * Returns zero and leaves "*uaddr" alone when the address lies
  * outside [LINUX_IOCTL_MIN_PTR, LINUX_IOCTL_MAX_PTR).  Otherwise
  * returns one, with "*uaddr" replaced by the matching kernel address,
  * or by NULL when there is no current task or when "len" bytes do not
  * fit into the recorded buffer.
  */
 static inline int
 linux_remap_address(void **uaddr, size_t len)
 {
 	uintptr_t addr = (uintptr_t)(*uaddr);
 	struct task_struct *ts;
 	uintptr_t off;
 
 	/* ordinary addresses are not touched */
 	if (likely(addr < LINUX_IOCTL_MIN_PTR || addr >= LINUX_IOCTL_MAX_PTR))
 		return (0);
 
 	ts = current;
 	if (ts == NULL) {
 		*uaddr = NULL;
 		return (1);
 	}
 
 	/* offset of the access inside the ioctl buffer */
 	off = addr - LINUX_IOCTL_MIN_PTR;
 
 	/* refuse accesses reaching beyond the recorded buffer */
 	if (len > IOCPARM_MAX || (off + len) > ts->bsd_ioctl_len) {
 		*uaddr = NULL;
 		return (1);
 	}
 
 	/* rebase the offset onto the kernel buffer */
 	*uaddr = (void *)(off + (uintptr_t)ts->bsd_ioctl_data);
 	return (1);
 }
 
 /*
  * Copy "len" bytes from "uaddr" to the kernel buffer "kaddr".
  * Addresses inside the fake ioctl window are served with memcpy()
  * from the remapped kernel buffer; all others go through copyin().
  * Returns zero on success, -EFAULT when the remapping was refused, or
  * the negated copyin() result.
  */
 int
 linux_copyin(const void *uaddr, void *kaddr, size_t len)
 {
 	if (linux_remap_address(__DECONST(void **, &uaddr), len) == 0)
 		return (-copyin(uaddr, kaddr, len));
 
 	if (uaddr == NULL)
 		return (-EFAULT);
 	memcpy(kaddr, uaddr, len);
 	return (0);
 }
 
 /*
  * Copy "len" bytes from the kernel buffer "kaddr" to "uaddr".
  * Addresses inside the fake ioctl window are served with memcpy()
  * into the remapped kernel buffer; all others go through copyout().
  * Returns zero on success, -EFAULT when the remapping was refused, or
  * the negated copyout() result.
  */
 int
 linux_copyout(const void *kaddr, void *uaddr, size_t len)
 {
 	if (linux_remap_address(&uaddr, len) == 0)
 		return (-copyout(kaddr, uaddr, len));
 
 	if (uaddr == NULL)
 		return (-EFAULT);
 	memcpy(uaddr, kaddr, len);
 	return (0);
 }
 
 /*
  * Zero "_len" bytes of user memory starting at "_uaddr".  Returns
  * zero on success.  If any store fails, the full "_len" is returned,
  * no matter how many bytes had been cleared before.
  */
 size_t
 linux_clear_user(void *_uaddr, size_t _len)
 {
 	uint8_t *dst = _uaddr;
 	size_t left = _len;
 
 	/* byte stores until "dst" is 8-byte aligned */
 	for (; ((uintptr_t)dst & 7) != 0 && left > 7; dst++, left--) {
 		if (subyte(dst, 0))
 			return (_len);
 	}
 
 	/* bulk part, 8 bytes per iteration */
 	for (; left > 7; dst += 8, left -= 8) {
 #ifdef __LP64__
 		if (suword64(dst, 0))
 			return (_len);
 #else
 		if (suword32(dst, 0) || suword32(dst + 4, 0))
 			return (_len);
 #endif
 	}
 
 	/* remaining tail, one byte at a time */
 	for (; left > 0; dst++, left--) {
 		if (subyte(dst, 0))
 			return (_len);
 	}
 	return (0);
 }
 
 /*
  * Check that the range of "len" bytes at "uaddr" is plausible for
  * user space.  Returns one for an empty range and for a range that
  * does not wrap around and ends at or below VM_MAXUSER_ADDRESS;
  * returns zero otherwise.
  */
 int
 linux_access_ok(const void *uaddr, size_t len)
 {
 	const uintptr_t start = (uintptr_t)uaddr;
 	const uintptr_t end = (uintptr_t)uaddr + len;
 
 	/* empty ranges are always accepted */
 	if (start == end)
 		return (1);
 
 	/* the end address wrapped around */
 	if (end < start)
 		return (0);
 
 	return (end <= VM_MAXUSER_ADDRESS);
 }
 
 /*
  * This function should return either EINTR or ERESTART depending on
  * the signal type sent to this thread:
  */
 static int
 linux_get_error(struct task_struct *task, int error)
 {
 	/* check for signal type interrupt code */
 	if (error == EINTR || error == ERESTARTSYS || error == ERESTART) {
 		error = -linux_schedule_get_interrupt_value(task);
 		if (error == 0)
 			error = EINTR;
 	}
 	return (error);
 }
 
 /*
  * Dispatch an ioctl request to the Linux file operations in "fop".
  *
  * For commands carrying a parameter (IOCPARM_LEN(cmd) > 0) the kernel
  * buffer "data" is recorded in the current task and the handler is
  * given the fake user-space address LINUX_IOCTL_MIN_PTR instead, which
  * linux_copyin() and linux_copyout() map back to that buffer.  For
  * other commands the pointer stored in "data" is passed through.
  *
  * On amd64, processes whose ELF machine type is EM_386 try
  * compat_ioctl first and fall back to unlocked_ioctl when that yields
  * ENOTTY; all others use unlocked_ioctl directly.  ENOTTY is the
  * result when no suitable handler exists.
  *
  * Handler results are negated before being returned.  EWOULDBLOCK
  * additionally refreshes the kqueue state of the file; every other
  * value is passed through linux_get_error().
  */
 static int
 linux_file_ioctl_sub(struct file *fp, struct linux_file *filp,
     const struct file_operations *fop, u_long cmd, caddr_t data,
     struct thread *td)
 {
 	struct task_struct *task = current;
 	unsigned size;
 	int error;
 
 	size = IOCPARM_LEN(cmd);
 	/* refer to logic in sys_ioctl() */
 	if (size > 0) {
 		/*
 		 * Setup hint for linux_copyin() and linux_copyout().
 		 *
 		 * Background: Linux code expects a user-space address
 		 * while FreeBSD supplies a kernel-space address.
 		 */
 		task->bsd_ioctl_data = data;
 		task->bsd_ioctl_len = size;
 		data = (void *)LINUX_IOCTL_MIN_PTR;
 	} else {
 		/* fetch user-space pointer */
 		data = *(void **)data;
 	}
 #if defined(__amd64__)
 	if (td->td_proc->p_elf_machine == EM_386) {
 		/* try the compat IOCTL handler first */
 		if (fop->compat_ioctl != NULL) {
 			error = -OPW(fp, td, fop->compat_ioctl(filp,
 			    cmd, (u_long)data));
 		} else {
 			error = ENOTTY;
 		}
 
 		/* fallback to the regular IOCTL handler, if any */
 		if (error == ENOTTY && fop->unlocked_ioctl != NULL) {
 			error = -OPW(fp, td, fop->unlocked_ioctl(filp,
 			    cmd, (u_long)data));
 		}
 	} else
 #endif
 	{
 		if (fop->unlocked_ioctl != NULL) {
 			error = -OPW(fp, td, fop->unlocked_ioctl(filp,
 			    cmd, (u_long)data));
 		} else {
 			error = ENOTTY;
 		}
 	}
 	/* withdraw the hint again once the handler has returned */
 	if (size > 0) {
 		task->bsd_ioctl_data = NULL;
 		task->bsd_ioctl_len = 0;
 	}
 
 	if (error == EWOULDBLOCK) {
 		/* update kqfilter status, if any */
 		linux_file_kqfilter_poll(filp,
 		    LINUX_KQ_FLAG_HAS_READ | LINUX_KQ_FLAG_HAS_WRITE);
 	} else {
 		error = linux_get_error(task, error);
 	}
 	return (error);
 }
 
 /*
  * Sentinel poll table pointer.  linux_poll_wait() records the calling
  * thread with selrecord() only when it is handed this value.
  */
 #define	LINUX_POLL_TABLE_NORMAL ((poll_table *)1)
 
 /*
  * Atomically advance the poll wakeup state machine: the state stored
  * in "v" is replaced by the entry of "pstate" indexed by the current
  * state.  Returns the state that was in effect right before the
  * update took place.
  */
 static uint8_t
 linux_poll_wakeup_state(atomic_t *v, const uint8_t *pstate)
 {
 	int cur, seen;
 
 	cur = v->counter;
 
 	/* retry with the observed value until the exchange goes through */
 	for (;;) {
 		seen = atomic_cmpxchg(v, cur, pstate[cur]);
 		if (seen == cur)
 			break;
 		cur = seen;
 	}
 
 	return (cur);
 }
 
 
 /*
  * Wait queue callback installed by linux_poll_wait().  Moves a file
  * in the QUEUED state to READY and wakes up its pollers; all other
  * states are left as they are.  Returns one when a wakeup was issued
  * and zero otherwise.  The "wq_state", "flags" and "key" arguments
  * are not used.
  */
 static int
 linux_poll_wakeup_callback(wait_queue_t *wq, unsigned int wq_state, int flags, void *key)
 {
 	/* next state, indexed by current state */
 	static const uint8_t next_state[LINUX_FWQ_STATE_MAX] = {
 		[LINUX_FWQ_STATE_INIT] = LINUX_FWQ_STATE_INIT, /* NOP */
 		[LINUX_FWQ_STATE_NOT_READY] = LINUX_FWQ_STATE_NOT_READY, /* NOP */
 		[LINUX_FWQ_STATE_QUEUED] = LINUX_FWQ_STATE_READY,
 		[LINUX_FWQ_STATE_READY] = LINUX_FWQ_STATE_READY, /* NOP */
 	};
 	struct linux_file *filp = container_of(wq, struct linux_file, f_wait_queue.wq);
 
 	if (linux_poll_wakeup_state(&filp->f_wait_queue.state, next_state) !=
 	    LINUX_FWQ_STATE_QUEUED)
 		return (0);
 
 	linux_poll_wakeup(filp);
 	return (1);
 }
 
 /*
  * Register interest in wakeups from the wait queue "wqh" on behalf of
  * "filp".  When called with the LINUX_POLL_TABLE_NORMAL table, the
  * current thread is also recorded for select/poll.  The file is put
  * on the wait queue only when it was in the INIT state; READY is
  * reset to QUEUED and the other states are kept.
  */
 void
 linux_poll_wait(struct linux_file *filp, wait_queue_head_t *wqh, poll_table *p)
 {
 	/* next state, indexed by current state */
 	static const uint8_t next_state[LINUX_FWQ_STATE_MAX] = {
 		[LINUX_FWQ_STATE_INIT] = LINUX_FWQ_STATE_NOT_READY,
 		[LINUX_FWQ_STATE_NOT_READY] = LINUX_FWQ_STATE_NOT_READY, /* NOP */
 		[LINUX_FWQ_STATE_QUEUED] = LINUX_FWQ_STATE_QUEUED, /* NOP */
 		[LINUX_FWQ_STATE_READY] = LINUX_FWQ_STATE_QUEUED,
 	};
 	uint8_t prev;
 
 	/* check if we are called inside the select system call */
 	if (p == LINUX_POLL_TABLE_NORMAL)
 		selrecord(curthread, &filp->f_selinfo);
 
 	prev = linux_poll_wakeup_state(&filp->f_wait_queue.state, next_state);
 	if (prev != LINUX_FWQ_STATE_INIT)
 		return;
 
 	/* NOTE: file handles can only belong to one wait-queue */
 	filp->f_wait_queue.wqh = wqh;
 	filp->f_wait_queue.wq.func = &linux_poll_wakeup_callback;
 	add_wait_queue(wqh, &filp->f_wait_queue.wq);
 	atomic_set(&filp->f_wait_queue.state, LINUX_FWQ_STATE_QUEUED);
 }
 
 /*
  * Undo linux_poll_wait(): drain the select information of the file,
  * reset its wakeup state to INIT and take it off its wait queue if
  * the previous state was NOT_READY, QUEUED or READY.
  */
 static void
 linux_poll_wait_dequeue(struct linux_file *filp)
 {
 	/* every state leads back to INIT */
 	static const uint8_t next_state[LINUX_FWQ_STATE_MAX] = {
 		[LINUX_FWQ_STATE_INIT] = LINUX_FWQ_STATE_INIT,	/* NOP */
 		[LINUX_FWQ_STATE_NOT_READY] = LINUX_FWQ_STATE_INIT,
 		[LINUX_FWQ_STATE_QUEUED] = LINUX_FWQ_STATE_INIT,
 		[LINUX_FWQ_STATE_READY] = LINUX_FWQ_STATE_INIT,
 	};
 	uint8_t prev;
 
 	seldrain(&filp->f_selinfo);
 
 	prev = linux_poll_wakeup_state(&filp->f_wait_queue.state, next_state);
 	if (prev == LINUX_FWQ_STATE_NOT_READY ||
 	    prev == LINUX_FWQ_STATE_QUEUED ||
 	    prev == LINUX_FWQ_STATE_READY)
 		remove_wait_queue(filp->f_wait_queue.wqh, &filp->f_wait_queue.wq);
 }
 
 /*
  * Wake up everything waiting on "filp": select/poll sleepers through
  * selwakeup() and kqueue listeners by flagging both directions as
  * needing a re-check and posting to the knote list.  A NULL "filp" is
  * accepted and ignored.
  */
 void
 linux_poll_wakeup(struct linux_file *filp)
 {
 	/* this function should be NULL-safe */
 	if (filp != NULL) {
 		selwakeup(&filp->f_selinfo);
 
 		spin_lock(&filp->f_kqlock);
 		filp->f_kqflags |= LINUX_KQ_FLAG_NEED_READ |
 		    LINUX_KQ_FLAG_NEED_WRITE;
 
 		/* post to the knote list while holding its lock */
 		KNOTE_LOCKED(&filp->f_selinfo.si_note, 1);
 		spin_unlock(&filp->f_kqlock);
 	}
 }
 
 /*
  * f_detach method of both filter tables: remove the knote from the
  * knote list of the file stored in kn_hook, with f_kqlock held.
  */
 static void
 linux_file_kqfilter_detach(struct knote *kn)
 {
 	struct linux_file *filp = kn->kn_hook;
 
 	spin_lock(&filp->f_kqlock);
 	/* last argument 1: presumably "list lock already held" - confirm */
 	knlist_remove(&filp->f_selinfo.si_note, kn, 1);
 	spin_unlock(&filp->f_kqlock);
 }
 
 /*
  * f_event method of the read filter.  Must be called with f_kqlock
  * held.  Returns one when LINUX_KQ_FLAG_NEED_READ is set on the file
  * and zero otherwise; "hint" is not used.
  */
 static int
 linux_file_kqfilter_read_event(struct knote *kn, long hint)
 {
 	struct linux_file *filp = kn->kn_hook;
 
 	mtx_assert(&filp->f_kqlock.m, MA_OWNED);
 
 	return ((filp->f_kqflags & LINUX_KQ_FLAG_NEED_READ) != 0);
 }
 
 /*
  * f_event method of the write filter.  Must be called with f_kqlock
  * held.  Returns one when LINUX_KQ_FLAG_NEED_WRITE is set on the file
  * and zero otherwise; "hint" is not used.
  */
 static int
 linux_file_kqfilter_write_event(struct knote *kn, long hint)
 {
 	struct linux_file *filp = kn->kn_hook;
 
 	mtx_assert(&filp->f_kqlock.m, MA_OWNED);
 
 	return ((filp->f_kqflags & LINUX_KQ_FLAG_NEED_WRITE) != 0);
 }
 
 /* Filter methods attached to EVFILT_READ knotes by linux_file_kqfilter(). */
 static struct filterops linux_dev_kqfiltops_read = {
 	.f_isfd = 1,
 	.f_detach = linux_file_kqfilter_detach,
 	.f_event = linux_file_kqfilter_read_event,
 };
 
 /* Filter methods attached to EVFILT_WRITE knotes by linux_file_kqfilter(). */
 static struct filterops linux_dev_kqfiltops_write = {
 	.f_isfd = 1,
 	.f_detach = linux_file_kqfilter_detach,
 	.f_event = linux_file_kqfilter_write_event,
 };
 
 /*
  * Refresh the kqueue state of "filp" from its Linux poll method.
  *
  * Nothing is done unless at least one of the bits in "kqflags" is set
  * in filp->f_kqflags.  Otherwise the poll method is queried, the
  * NEED_READ and NEED_WRITE flags are recomputed from POLLIN and
  * POLLOUT, and the knote list is posted to when either is set.
  *
  * The operations table returned by linux_get_fop() can be the empty
  * table of dummy_ldev, in which case there is no poll method.  This
  * is treated as "no events pending" instead of calling through a
  * NULL function pointer.
  */
 static void
 linux_file_kqfilter_poll(struct linux_file *filp, int kqflags)
 {
 	struct thread *td;
 	const struct file_operations *fop;
 	struct linux_cdev *ldev;
 	int temp;
 
 	if ((filp->f_kqflags & kqflags) == 0)
 		return;
 
 	td = curthread;
 
 	linux_get_fop(filp, &fop, &ldev);
 	/* get the latest polling state, if there is a poll method */
 	if (fop->poll != NULL)
 		temp = OPW(filp->_file, td, fop->poll(filp, NULL));
 	else
 		temp = 0;
 	linux_drop_fop(ldev);
 
 	spin_lock(&filp->f_kqlock);
 	/* clear kqflags */
 	filp->f_kqflags &= ~(LINUX_KQ_FLAG_NEED_READ |
 	    LINUX_KQ_FLAG_NEED_WRITE);
 	/* update kqflags */
 	if ((temp & (POLLIN | POLLOUT)) != 0) {
 		if ((temp & POLLIN) != 0)
 			filp->f_kqflags |= LINUX_KQ_FLAG_NEED_READ;
 		if ((temp & POLLOUT) != 0)
 			filp->f_kqflags |= LINUX_KQ_FLAG_NEED_WRITE;
 
 		/* make sure the "knote" gets woken up */
 		KNOTE_LOCKED(&filp->f_selinfo.si_note, 0);
 	}
 	spin_unlock(&filp->f_kqlock);
 }
 
 /*
  * Attach a knote to a LinuxKPI file.  Only EVFILT_READ and
  * EVFILT_WRITE are supported, and only for files whose operations
  * provide a poll method; EINVAL is returned otherwise.  After a
  * successful attach the kqueue state of the file is refreshed.
  */
 static int
 linux_file_kqfilter(struct file *file, struct knote *kn)
 {
 	struct linux_file *filp;
 	struct thread *td;
 	int error;
 
 	td = curthread;
 	filp = (struct linux_file *)file->f_data;
 	filp->f_flags = file->f_flag;
 	if (filp->f_op->poll == NULL)
 		return (EINVAL);
 
 	spin_lock(&filp->f_kqlock);
 	if (kn->kn_filter == EVFILT_READ) {
 		filp->f_kqflags |= LINUX_KQ_FLAG_HAS_READ;
 		kn->kn_fop = &linux_dev_kqfiltops_read;
 		kn->kn_hook = filp;
 		knlist_add(&filp->f_selinfo.si_note, kn, 1);
 		error = 0;
 	} else if (kn->kn_filter == EVFILT_WRITE) {
 		filp->f_kqflags |= LINUX_KQ_FLAG_HAS_WRITE;
 		kn->kn_fop = &linux_dev_kqfiltops_write;
 		kn->kn_hook = filp;
 		knlist_add(&filp->f_selinfo.si_note, kn, 1);
 		error = 0;
 	} else {
 		/* unsupported filter type */
 		error = EINVAL;
 	}
 	spin_unlock(&filp->f_kqlock);
 
 	if (error != 0)
 		return (error);
 
 	linux_set_current(td);
 
 	/* bring the kqfilter status up to date */
 	linux_file_kqfilter_poll(filp,
 	    LINUX_KQ_FLAG_HAS_READ | LINUX_KQ_FLAG_HAS_WRITE);
 	return (0);
 }
 
 /*
  * Set up a memory mapping of a LinuxKPI file through its Linux mmap
  * method and return the backing VM object in "*object".
  *
  * A freshly allocated vm_area_struct describing the request is handed
  * to fop->mmap().  If the method installs vm_ops, the area is
  * registered on linux_vma_head, keyed by its vm_private_data, and a
  * device pager object is allocated: OBJT_DEVICE when there is no
  * fault handler, OBJT_MGTDEVICE otherwise.  If an area with the same
  * vm_private_data is already registered, the new area is freed and
  * the pager object is allocated for the existing key.  Without
  * vm_ops, an OBJT_SG object covering vm_len bytes starting at page
  * frame vm_pfn is allocated and the area is freed right away.
  *
  * Returns zero on success, with "*offset" reset to zero.  Otherwise
  * returns EOPNOTSUPP when there is no mmap method, EINVAL, ESTALE or
  * the error produced by the mmap method.
  */
 static int
 linux_file_mmap_single(struct file *fp, const struct file_operations *fop,
     vm_ooffset_t *offset, vm_size_t size, struct vm_object **object,
     int nprot, struct thread *td)
 {
 	struct task_struct *task;
 	struct vm_area_struct *vmap;
 	struct mm_struct *mm;
 	struct linux_file *filp;
 	vm_memattr_t attr;
 	int error;
 
 	filp = (struct linux_file *)fp->f_data;
 	filp->f_flags = fp->f_flag;
 
 	if (fop->mmap == NULL)
 		return (EOPNOTSUPP);
 
 	linux_set_current(td);
 
 	/*
 	 * The same VM object might be shared by multiple processes
 	 * and the mm_struct is usually freed when a process exits.
 	 *
 	 * The atomic reference below makes sure the mm_struct is
 	 * available as long as the vmap is in the linux_vma_head.
 	 */
 	task = current;
 	mm = task->mm;
 	if (atomic_inc_not_zero(&mm->mm_users) == 0)
 		return (EINVAL);
 
 	/* describe the requested mapping for the Linux mmap method */
 	vmap = kzalloc(sizeof(*vmap), GFP_KERNEL);
 	vmap->vm_start = 0;
 	vmap->vm_end = size;
 	vmap->vm_pgoff = *offset / PAGE_SIZE;
 	vmap->vm_pfn = 0;
 	vmap->vm_flags = vmap->vm_page_prot = (nprot & VM_PROT_ALL);
 	vmap->vm_ops = NULL;
 	vmap->vm_file = get_file(filp);
 	vmap->vm_mm = mm;
 
 	/* the mmap method runs with the mmap_sem held for writing */
 	if (unlikely(down_write_killable(&vmap->vm_mm->mmap_sem))) {
 		error = linux_get_error(task, EINTR);
 	} else {
 		error = -OPW(fp, td, fop->mmap(filp, vmap));
 		error = linux_get_error(task, error);
 		up_write(&vmap->vm_mm->mmap_sem);
 	}
 
 	if (error != 0) {
 		linux_cdev_handle_free(vmap);
 		return (error);
 	}
 
 	/* sample the cache mode now; "vmap" may be freed further down */
 	attr = pgprot2cachemode(vmap->vm_page_prot);
 
 	if (vmap->vm_ops != NULL) {
 		struct vm_area_struct *ptr;
 		void *vm_private_data;
 		bool vm_no_fault;
 
 		/* open, close and a private data key are all mandatory */
 		if (vmap->vm_ops->open == NULL ||
 		    vmap->vm_ops->close == NULL ||
 		    vmap->vm_private_data == NULL) {
 			/* free allocated VM area struct */
 			linux_cdev_handle_free(vmap);
 			return (EINVAL);
 		}
 
 		vm_private_data = vmap->vm_private_data;
 
 		/* search and insert under one write lock */
 		rw_wlock(&linux_vma_lock);
 		TAILQ_FOREACH(ptr, &linux_vma_head, vm_entry) {
 			if (ptr->vm_private_data == vm_private_data)
 				break;
 		}
 		/* check if there is an existing VM area struct */
 		if (ptr != NULL) {
 			/* check if the VM area structure is invalid */
 			if (ptr->vm_ops == NULL ||
 			    ptr->vm_ops->open == NULL ||
 			    ptr->vm_ops->close == NULL) {
 				error = ESTALE;
 				vm_no_fault = 1;
 			} else {
 				error = EEXIST;
 				vm_no_fault = (ptr->vm_ops->fault == NULL);
 			}
 		} else {
 			/* insert VM area structure into list */
 			TAILQ_INSERT_TAIL(&linux_vma_head, vmap, vm_entry);
 			error = 0;
 			vm_no_fault = (vmap->vm_ops->fault == NULL);
 		}
 		rw_wunlock(&linux_vma_lock);
 
 		if (error != 0) {
 			/* free allocated VM area struct */
 			linux_cdev_handle_free(vmap);
 			/* check for stale VM area struct */
 			if (error != EEXIST)
 				return (error);
 			/* EEXIST: carry on using the registered area */
 		}
 
 		/* check if there is no fault handler */
 		if (vm_no_fault) {
 			*object = cdev_pager_allocate(vm_private_data, OBJT_DEVICE,
 			    &linux_cdev_pager_ops[1], size, nprot, *offset,
 			    td->td_ucred);
 		} else {
 			*object = cdev_pager_allocate(vm_private_data, OBJT_MGTDEVICE,
 			    &linux_cdev_pager_ops[0], size, nprot, *offset,
 			    td->td_ucred);
 		}
 
 		/* check if allocating the VM object failed */
 		if (*object == NULL) {
 			/* only an area inserted above is still owned here */
 			if (error == 0) {
 				/* remove VM area struct from list */
 				linux_cdev_handle_remove(vmap);
 				/* free allocated VM area struct */
 				linux_cdev_handle_free(vmap);
 			}
 			return (EINVAL);
 		}
 	} else {
 		struct sglist *sg;
 
 		/* no vm_ops: map the physical range given by vm_pfn and vm_len */
 		sg = sglist_alloc(1, M_WAITOK);
 		sglist_append_phys(sg,
 		    (vm_paddr_t)vmap->vm_pfn << PAGE_SHIFT, vmap->vm_len);
 
 		*object = vm_pager_allocate(OBJT_SG, sg, vmap->vm_len,
 		    nprot, 0, td->td_ucred);
 
 		linux_cdev_handle_free(vmap);
 
 		if (*object == NULL) {
 			sglist_free(sg);
 			return (EINVAL);
 		}
 	}
 
 	/* apply a non-default cache mode to the new object */
 	if (attr != VM_MEMATTR_DEFAULT) {
 		VM_OBJECT_WLOCK(*object);
 		vm_object_set_memattr(*object, attr);
 		VM_OBJECT_WUNLOCK(*object);
 	}
 	*offset = 0;
 	return (0);
 }
 
 /*
  * Character device switch for LinuxKPI character devices.  Only the
  * fdopen entry point is populated here.
  */
 struct cdevsw linuxcdevsw = {
 	.d_name = "lkpidev",
 	.d_version = D_VERSION,
 	.d_fdopen = linux_dev_fdopen,
 };
 
 /*
  * fo_read handler: hands a single-iovec read request to the "read"
  * method of the Linux file operations attached to the file.
  *
  * Returns EOPNOTSUPP unless the uio carries exactly one I/O vector,
  * EINVAL when the residual exceeds DEVFS_IOSIZE_MAX, ENXIO when there
  * is no "read" method, the value produced by linux_get_error() when
  * the method returns a negative count, and zero otherwise.
  */
 static int
 linux_file_read(struct file *file, struct uio *uio, struct ucred *active_cred,
     int flags, struct thread *td)
 {
 	const struct file_operations *fop;
 	struct linux_cdev *ldev;
 	struct linux_file *filp;
 	ssize_t bytes;
 	int error = 0;
 
 	filp = (struct linux_file *)file->f_data;
 	filp->f_flags = file->f_flag;
 
 	/* XXX no support for I/O vectors currently */
 	if (uio->uio_iovcnt != 1)
 		return (EOPNOTSUPP);
 	if (uio->uio_resid > DEVFS_IOSIZE_MAX)
 		return (EINVAL);
 
 	linux_set_current(td);
 	linux_get_fop(filp, &fop, &ldev);
 	if (fop->read == NULL) {
 		error = ENXIO;
 	} else {
 		bytes = OPW(file, td, fop->read(filp,
 		    uio->uio_iov->iov_base,
 		    uio->uio_iov->iov_len, &uio->uio_offset));
 		if (bytes < 0) {
 			error = linux_get_error(current, -bytes);
 		} else {
 			/* advance the uio past the bytes that were read */
 			uio->uio_iov->iov_base =
 			    ((uint8_t *)uio->uio_iov->iov_base) + bytes;
 			uio->uio_iov->iov_len -= bytes;
 			uio->uio_resid -= bytes;
 		}
 	}
 
 	/* update kqfilter status, if any */
 	linux_file_kqfilter_poll(filp, LINUX_KQ_FLAG_HAS_READ);
 	linux_drop_fop(ldev);
 
 	return (error);
 }
 
 /*
  * fo_write handler: hands a single-iovec write request to the "write"
  * method of the Linux file operations attached to the file.
  *
  * Returns EOPNOTSUPP unless the uio carries exactly one I/O vector,
  * EINVAL when the residual exceeds DEVFS_IOSIZE_MAX, ENXIO when there
  * is no "write" method, the value produced by linux_get_error() when
  * the method returns a negative count, and zero otherwise.
  */
 static int
 linux_file_write(struct file *file, struct uio *uio, struct ucred *active_cred,
     int flags, struct thread *td)
 {
 	const struct file_operations *fop;
 	struct linux_cdev *ldev;
 	struct linux_file *filp;
 	ssize_t bytes;
 	int error = 0;
 
 	filp = (struct linux_file *)file->f_data;
 	filp->f_flags = file->f_flag;
 
 	/* XXX no support for I/O vectors currently */
 	if (uio->uio_iovcnt != 1)
 		return (EOPNOTSUPP);
 	if (uio->uio_resid > DEVFS_IOSIZE_MAX)
 		return (EINVAL);
 
 	linux_set_current(td);
 	linux_get_fop(filp, &fop, &ldev);
 	if (fop->write == NULL) {
 		error = ENXIO;
 	} else {
 		bytes = OPW(file, td, fop->write(filp,
 		    uio->uio_iov->iov_base,
 		    uio->uio_iov->iov_len, &uio->uio_offset));
 		if (bytes < 0) {
 			error = linux_get_error(current, -bytes);
 		} else {
 			/* advance the uio past the bytes that were written */
 			uio->uio_iov->iov_base =
 			    ((uint8_t *)uio->uio_iov->iov_base) + bytes;
 			uio->uio_iov->iov_len -= bytes;
 			uio->uio_resid -= bytes;
 		}
 	}
 
 	/* update kqfilter status, if any */
 	linux_file_kqfilter_poll(filp, LINUX_KQ_FLAG_HAS_WRITE);
 	linux_drop_fop(ldev);
 
 	return (error);
 }
 
 /*
  * fo_poll handler: asks the Linux "poll" method for the ready events
  * and returns the subset that the caller requested in "events".
  * Returns zero when the file operations have no "poll" method.
  */
 static int
 linux_file_poll(struct file *file, int events, struct ucred *active_cred,
     struct thread *td)
 {
 	const struct file_operations *fop;
 	struct linux_cdev *ldev;
 	struct linux_file *filp;
 	int revents = 0;
 
 	filp = (struct linux_file *)file->f_data;
 	filp->f_flags = file->f_flag;
 
 	linux_set_current(td);
 	linux_get_fop(filp, &fop, &ldev);
 	if (fop->poll != NULL) {
 		revents = OPW(file, td, fop->poll(filp,
 		    LINUX_POLL_TABLE_NORMAL)) & events;
 	}
 	linux_drop_fop(ldev);
 
 	return (revents);
 }
 
 /*
  * fo_close handler: tears down the LinuxKPI file attached to "file".
  *
  * Invokes the Linux "release" method when one exists, then drops the
  * resources referenced by the linux_file and frees it.  Returns the
  * negated result of "release", or zero when there is no such method.
  */
 static int
 linux_file_close(struct file *file, struct thread *td)
 {
 	struct linux_file *filp;
 	const struct file_operations *fop;
 	struct linux_cdev *ldev;
 	int error;
 
 	filp = (struct linux_file *)file->f_data;
 
 	/* the Linux-side file count must already have reached zero */
 	KASSERT(file_count(filp) == 0,
 	    ("File refcount(%d) is not zero", file_count(filp)));
 
 	error = 0;
 	filp->f_flags = file->f_flag;
 	linux_set_current(td);
 	linux_poll_wait_dequeue(filp);
 	linux_get_fop(filp, &fop, &ldev);
 	if (fop->release != NULL)
 		error = -OPW(file, td, fop->release(filp->f_vnode, filp));
 	/* clear the SIGIO ownership recorded for this file */
 	funsetown(&filp->f_sigio);
 	if (filp->f_vnode != NULL)
 		vdrop(filp->f_vnode);
 	linux_drop_fop(ldev);
 	/* files without a backing linux_cdev have f_cdev == NULL */
 	if (filp->f_cdev != NULL)
 		linux_cdev_deref(filp->f_cdev);
 	kfree(filp);
 
 	return (error);
 }
 
 /*
  * fo_ioctl handler.  The generic file ioctls are handled here:
  * FIONBIO is accepted without further action, FIOASYNC and FIOSETOWN
  * call the Linux "fasync" method when one exists, and FIOGETOWN
  * reports the current SIGIO owner.  Every other command is passed to
  * linux_file_ioctl_sub().
  */
 static int
 linux_file_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *cred,
     struct thread *td)
 {
 	const struct file_operations *fop;
 	struct linux_cdev *ldev;
 	struct linux_file *filp;
 	int error = 0;
 
 	filp = (struct linux_file *)fp->f_data;
 	filp->f_flags = fp->f_flag;
 	linux_get_fop(filp, &fop, &ldev);
 
 	linux_set_current(td);
 	switch (cmd) {
 	case FIONBIO:
 		break;
 	case FIOGETOWN:
 		*(int *)data = fgetown(&filp->f_sigio);
 		break;
 	case FIOSETOWN:
 		error = fsetown(*(int *)data, &filp->f_sigio);
 		if (error != 0)
 			break;
 		/* FALLTHROUGH */
 	case FIOASYNC:
 		if (fop->fasync != NULL) {
 			error = -OPW(fp, td, fop->fasync(0, filp,
 			    fp->f_flag & FASYNC));
 		}
 		break;
 	default:
 		error = linux_file_ioctl_sub(fp, filp, fop, cmd, data, td);
 		break;
 	}
 	linux_drop_fop(ldev);
 
 	return (error);
 }
 
 /*
  * Validates the requested protection and mapping flags and then asks
  * linux_file_mmap_single() for the VM object backing the mapping.
  *
  * Returns EACCES when write access is requested but not permitted by
  * *maxprotp, and EINVAL for MAP_PRIVATE or MAP_COPY requests.
  */
 static int
 linux_file_mmap_sub(struct thread *td, vm_size_t objsize, vm_prot_t prot,
     vm_prot_t *maxprotp, int *flagsp, struct file *fp,
     vm_ooffset_t *foff, const struct file_operations *fop, vm_object_t *objp)
 {
 	int wants_write, may_write;
 
 	wants_write = ((prot & VM_PROT_WRITE) != 0);
 	may_write = ((*maxprotp & VM_PROT_WRITE) != 0);
 	if (wants_write && !may_write)
 		return (EACCES);
 
 	/*
 	 * Character devices do not provide private mappings
 	 * of any kind:
 	 */
 	if ((*flagsp & (MAP_PRIVATE | MAP_COPY)) != 0)
 		return (EINVAL);
 
 	return (linux_file_mmap_single(fp, fop, foff, objsize, objp,
 	    (int)prot, td));
 }
 
 /*
  * fo_mmap handler: computes the maximum protection permitted by the
  * mount flags, the open mode and "cap_maxprot", obtains the backing
  * VM object through linux_file_mmap_sub() and maps it with
  * vm_mmap_object().
  *
  * Returns EOPNOTSUPP when the file has no vnode and EACCES when the
  * requested protection is incompatible with the mount or open mode;
  * otherwise the result of the two calls above.
  */
 static int
 linux_file_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size,
     vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff,
     struct thread *td)
 {
 	const struct file_operations *fop;
 	struct linux_cdev *ldev;
 	struct linux_file *filp;
 	struct mount *mp;
 	struct vnode *vp;
 	vm_object_t object;
 	vm_prot_t maxprot;
 	int error;
 
 	filp = (struct linux_file *)fp->f_data;
 
 	vp = filp->f_vnode;
 	if (vp == NULL)
 		return (EOPNOTSUPP);
 
 	/*
 	 * Ensure that file and memory protections are
 	 * compatible.
 	 */
 	maxprot = VM_PROT_EXECUTE;
 	mp = vp->v_mount;
 	if (mp != NULL && (mp->mnt_flag & MNT_NOEXEC) != 0) {
 		if ((prot & VM_PROT_EXECUTE) != 0)
 			return (EACCES);
 		maxprot = VM_PROT_NONE;
 	}
 	if ((fp->f_flag & FREAD) == 0) {
 		if ((prot & VM_PROT_READ) != 0)
 			return (EACCES);
 	} else {
 		maxprot |= VM_PROT_READ;
 	}
 
 	/*
 	 * If we are sharing potential changes via MAP_SHARED and we
 	 * are trying to get write permission although we opened it
 	 * without asking for it, bail out.
 	 *
 	 * Note that most character devices always share mappings.
 	 *
 	 * Rely on linux_file_mmap_sub() to fail invalid MAP_PRIVATE
 	 * requests rather than doing it here.
 	 */
 	if ((flags & MAP_SHARED) != 0) {
 		if ((fp->f_flag & FWRITE) == 0) {
 			if ((prot & VM_PROT_WRITE) != 0)
 				return (EACCES);
 		} else {
 			maxprot |= VM_PROT_WRITE;
 		}
 	}
 	maxprot &= cap_maxprot;
 
 	linux_get_fop(filp, &fop, &ldev);
 	error = linux_file_mmap_sub(td, size, prot, &maxprot, &flags, fp,
 	    &foff, fop, &object);
 	if (error == 0) {
 		error = vm_mmap_object(map, addr, size, prot, maxprot, flags,
 		    object, foff, FALSE, td);
 		/* the object is released again when it could not be mapped */
 		if (error != 0)
 			vm_object_deallocate(object);
 	}
 	linux_drop_fop(ldev);
 
 	return (error);
 }
 
 /*
  * fo_stat handler: reports the attributes of the vnode behind the
  * file via vn_stat().  Returns EOPNOTSUPP when the file has no vnode.
  */
 static int
 linux_file_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
     struct thread *td)
 {
 	struct linux_file *filp;
 	struct vnode *vp;
 	int error;
 
 	filp = (struct linux_file *)fp->f_data;
 	vp = filp->f_vnode;
 	if (vp == NULL)
 		return (EOPNOTSUPP);
 
 	/* vn_stat() is called with the vnode locked shared */
 	vn_lock(vp, LK_SHARED | LK_RETRY);
 	error = vn_stat(vp, sb, td->td_ucred, NOCRED, td);
 	VOP_UNLOCK(vp, 0);
 
 	return (error);
 }
 
 /*
  * fo_fill_kinfo handler: fills "kif" for a LinuxKPI file.  Files
  * without a vnode are reported as KF_TYPE_DEV; otherwise the vnode
  * information is collected with the file descriptor table lock
  * temporarily dropped, and the type is set to KF_TYPE_VNODE.
  */
 static int
 linux_file_fill_kinfo(struct file *fp, struct kinfo_file *kif,
     struct filedesc *fdp)
 {
 	struct linux_file *filp = fp->f_data;
 	struct vnode *vp = filp->f_vnode;
 	int error;
 
 	if (vp == NULL) {
 		kif->kf_type = KF_TYPE_DEV;
 		return (0);
 	}
 
 	/* hold the vnode across the unlocked section */
 	vref(vp);
 	FILEDESC_SUNLOCK(fdp);
 	error = vn_fill_kinfo_vnode(vp, kif);
 	vrele(vp);
 	kif->kf_type = KF_TYPE_VNODE;
 	FILEDESC_SLOCK(fdp);
 
 	return (error);
 }
 
 /*
  * Returns the minor number of the LinuxKPI character device behind
  * "inode", or -1U when the inode is NULL, has no device, belongs to
  * a different cdevsw, or has no linux_cdev attached.
  */
 unsigned int
 linux_iminor(struct inode *inode)
 {
 	struct linux_cdev *ldev;
 
 	if (inode == NULL)
 		return (-1U);
 	if (inode->v_rdev == NULL)
 		return (-1U);
 	if (inode->v_rdev->si_devsw != &linuxcdevsw)
 		return (-1U);
 
 	ldev = inode->v_rdev->si_drv1;
 	if (ldev != NULL)
 		return (minor(ldev->dev));
 
 	return (-1U);
 }
 
 /*
  * File operations vector for LinuxKPI files.
  */
 struct fileops linuxfileops = {
 	/* data transfer and mapping */
 	.fo_read = linux_file_read,
 	.fo_write = linux_file_write,
 	.fo_ioctl = linux_file_ioctl,
 	.fo_mmap = linux_file_mmap,
 	/* event notification */
 	.fo_poll = linux_file_poll,
 	.fo_kqfilter = linux_file_kqfilter,
 	/* metadata and lifetime */
 	.fo_stat = linux_file_stat,
 	.fo_fill_kinfo = linux_file_fill_kinfo,
 	.fo_close = linux_file_close,
 	/* operations routed to the invfo_* handlers */
 	.fo_truncate = invfo_truncate,
 	.fo_chmod = invfo_chmod,
 	.fo_chown = invfo_chown,
 	.fo_sendfile = invfo_sendfile,
 	.fo_flags = DFLAG_PASSABLE,
 };
 
 /*
  * Hash of vmmap addresses.  This is infrequently accessed and does not
  * need to be particularly large.  This is done because we must store the
  * caller's idea of the map size to properly unmap.
  */
 struct vmmap {
 	LIST_ENTRY(vmmap)	vm_next;	/* hash bucket linkage */
 	void 			*vm_addr;	/* mapped address (lookup key) */
 	unsigned long		vm_size;	/* size given when mapping */
 };
 
 /* Bucket head; used with the LIST_*() macros on "struct vmmap". */
 struct vmmaphd {
 	struct vmmap *lh_first;
 };
 #define	VMMAP_HASH_SIZE	64
 #define	VMMAP_HASH_MASK	(VMMAP_HASH_SIZE - 1)
 /*
  * Bucket index for an address: its page number masked to the table
  * size.  The expansion is fully parenthesized so that the macro can
  * be used safely inside a larger expression.
  */
 #define	VM_HASH(addr)	\
 	(((uintptr_t)(addr) >> PAGE_SHIFT) & VMMAP_HASH_MASK)
 static struct vmmaphd vmmaphead[VMMAP_HASH_SIZE];
 /* Protects the vmmaphead[] buckets. */
 static struct mtx vmmaplock;
 
 /*
  * Records the mapping at "addr" together with its "size" in the
  * vmmap hash so that the size can be looked up again when unmapping.
  */
 static void
 vmmap_add(void *addr, unsigned long size)
 {
 	struct vmmap *entry;
 
 	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
 	entry->vm_addr = addr;
 	entry->vm_size = size;
 
 	/* only the list insertion needs the lock */
 	mtx_lock(&vmmaplock);
 	LIST_INSERT_HEAD(&vmmaphead[VM_HASH(addr)], entry, vm_next);
 	mtx_unlock(&vmmaplock);
 }
 
 static struct vmmap *
 vmmap_remove(void *addr)
 {
 	struct vmmap *vmmap;
 
 	mtx_lock(&vmmaplock);
 	LIST_FOREACH(vmmap, &vmmaphead[VM_HASH(addr)], vm_next)
 		if (vmmap->vm_addr == addr)
 			break;
 	if (vmmap)
 		LIST_REMOVE(vmmap, vm_next);
 	mtx_unlock(&vmmaplock);
 
 	return (vmmap);
 }
 
 #if defined(__i386__) || defined(__amd64__) || defined(__powerpc__) || defined(__aarch64__)
 /*
  * Maps "size" bytes of device memory at "phys_addr" with memory
  * attribute "attr" and records the mapping for iounmap().  Returns
  * the mapped address, or NULL when pmap_mapdev_attr() fails.
  */
 void *
 _ioremap_attr(vm_paddr_t phys_addr, unsigned long size, int attr)
 {
 	void *va;
 
 	va = pmap_mapdev_attr(phys_addr, size, attr);
 	if (va != NULL)
 		vmmap_add(va, size);
 
 	return (va);
 }
 #endif
 
 /*
  * Undoes a mapping recorded in the vmmap hash for "addr".  Addresses
  * that were never recorded are silently ignored.
  */
 void
 iounmap(void *addr)
 {
 	struct vmmap *entry;
 
 	entry = vmmap_remove(addr);
 	if (entry != NULL) {
 #if defined(__i386__) || defined(__amd64__) || defined(__powerpc__) || defined(__aarch64__)
 		pmap_unmapdev((vm_offset_t)addr, entry->vm_size);
 #endif
 		kfree(entry);
 	}
 }
 
 
 void *
 vmap(struct page **pages, unsigned int count, unsigned long flags, int prot)
 {
 	vm_offset_t off;
 	size_t size;
 
 	size = count * PAGE_SIZE;
 	off = kva_alloc(size);
 	if (off == 0)
 		return (NULL);
 	vmmap_add((void *)off, size);
 	pmap_qenter(off, pages, count);
 
 	return ((void *)off);
 }
 
 /*
  * Removes a mapping created by vmap(): unmaps the pages and releases
  * the kernel virtual address range, using the size recorded in the
  * vmmap hash.  Addresses that were never recorded are ignored.
  */
 void
 vunmap(void *addr)
 {
 	struct vmmap *entry;
 
 	entry = vmmap_remove(addr);
 	if (entry != NULL) {
 		pmap_qremove((vm_offset_t)addr, entry->vm_size / PAGE_SIZE);
 		kva_free((vm_offset_t)addr, entry->vm_size);
 		kfree(entry);
 	}
 }
 
 /*
  * Formats "fmt" with the arguments in "ap" into a buffer obtained
  * from kmalloc() using "gfp".  Returns the buffer, or NULL when the
  * allocation fails.
  */
 char *
 kvasprintf(gfp_t gfp, const char *fmt, va_list ap)
 {
 	va_list aq;
 	unsigned int len;
 	char *buf;
 
 	/* measure the output using a copy of the argument list */
 	va_copy(aq, ap);
 	len = vsnprintf(NULL, 0, fmt, aq);
 	va_end(aq);
 
 	buf = kmalloc(len + 1, gfp);
 	if (buf == NULL)
 		return (NULL);
 	vsnprintf(buf, len + 1, fmt, ap);
 
 	return (buf);
 }
 
 /*
  * Variadic front end to kvasprintf(): returns the formatted string
  * in an allocated buffer, or NULL when kvasprintf() returns NULL.
  */
 char *
 kasprintf(gfp_t gfp, const char *fmt, ...)
 {
 	char *str;
 	va_list args;
 
 	va_start(args, fmt);
 	str = kvasprintf(gfp, fmt, args);
 	va_end(args);
 
 	return (str);
 }
 
 /*
  * Callout handler used for all Linux timers: sets up the Linux
  * "current" task for the running thread and then invokes the timer's
  * function with the timer's data argument.
  */
 static void
 linux_timer_callback_wrapper(void *context)
 {
 	struct timer_list *timer = context;
 
 	linux_set_current(curthread);
 	timer->function(timer->data);
 }
 
 /*
  * Records "expires" in the timer and (re)arms its callout to run
  * linux_timer_callback_wrapper() after the number of ticks computed
  * by linux_timer_jiffies_until().
  */
 void
 mod_timer(struct timer_list *timer, int expires)
 {
 
 	timer->expires = expires;
 	callout_reset(&timer->callout, linux_timer_jiffies_until(expires),
 	    &linux_timer_callback_wrapper, timer);
 }
 
 /*
  * Arms the timer's callout using the expiry time already stored in
  * timer->expires.
  */
 void
 add_timer(struct timer_list *timer)
 {
 
 	callout_reset(&timer->callout,
 	    linux_timer_jiffies_until(timer->expires),
 	    &linux_timer_callback_wrapper,
 	    timer);
 }
 
 /*
  * Same as add_timer(), except that "cpu" is passed on to
  * callout_reset_on().
  */
 void
 add_timer_on(struct timer_list *timer, int cpu)
 {
 
 	callout_reset_on(&timer->callout,
 	    linux_timer_jiffies_until(timer->expires),
 	    &linux_timer_callback_wrapper,
 	    timer, cpu);
 }
 
 /*
  * Stops the callout belonging to the timer.  The removed lines
  * returned 0 when callout_stop() returned -1 and 1 otherwise; the
  * added line returns the "was_cancelled" member of the value
  * returned by callout_stop().
  */
 int
 del_timer(struct timer_list *timer)
 {
 
-	if (callout_stop(&(timer)->callout) == -1)
-		return (0);
-	return (1);
+	return (callout_stop(&(timer)->callout).was_cancelled);
 }
 
 /*
  * SYSINIT hook which sets linux_timer_hz_mask to one less than the
  * smallest power of two that is not below "hz".
  */
 static void
 linux_timer_init(void *arg)
 {
 
 	/*
 	 * Compute an internal HZ value which can divide 2**32 to
 	 * avoid timer rounding problems when the tick value wraps
 	 * around 2**32:
 	 */
 	for (linux_timer_hz_mask = 1;
 	    linux_timer_hz_mask < (unsigned long)hz;
 	    linux_timer_hz_mask *= 2)
 		continue;
 	linux_timer_hz_mask--;
 }
 SYSINIT(linux_timer, SI_SUB_DRIVERS, SI_ORDER_FIRST, linux_timer_init, NULL);
 
 /*
  * Signals a completion.  With "all" set, the done counter is pinned
  * at UINT_MAX and every sleeper is woken; otherwise the counter is
  * incremented, unless it is already UINT_MAX, and a single sleeper is
  * signalled.
  */
 void
 linux_complete_common(struct completion *c, int all)
 {
 	int wakeup_swapper;
 
 	sleepq_lock(c);
 	if (!all) {
 		if (c->done != UINT_MAX)
 			c->done++;
 		wakeup_swapper = sleepq_signal(c, SLEEPQ_SLEEP, 0, 0);
 	} else {
 		c->done = UINT_MAX;
 		wakeup_swapper = sleepq_broadcast(c, SLEEPQ_SLEEP, 0, 0);
 	}
 	sleepq_release(c);
 
 	if (wakeup_swapper)
 		kick_proc0();
 }
 
 /*
  * Indefinite wait for done != 0 with or without signals.
  *
  * A non-zero "flags" argument selects an interruptible sleep.
  * Returns zero once the completion has been consumed (or immediately
  * when the scheduler is stopped), or -ERESTARTSYS when the
  * interruptible sleep returned an error; in that case the error is
  * saved for the task via linux_schedule_save_interrupt_value().
  */
 int
 linux_wait_for_common(struct completion *c, int flags)
 {
 	struct task_struct *task;
 	int error;
 
 	if (SCHEDULER_STOPPED())
 		return (0);
 
 	task = current;
 
 	/* translate the boolean "flags" into sleepqueue flags */
 	if (flags != 0)
 		flags = SLEEPQ_INTERRUPTIBLE | SLEEPQ_SLEEP;
 	else
 		flags = SLEEPQ_SLEEP;
 	error = 0;
 	for (;;) {
 		sleepq_lock(c);
 		/* leave the loop with the sleepqueue lock still held */
 		if (c->done)
 			break;
 		sleepq_add(c, NULL, "completion", flags, 0);
 		if (flags & SLEEPQ_INTERRUPTIBLE) {
 			DROP_GIANT();
 			error = -sleepq_wait_sig(c, 0);
 			PICKUP_GIANT();
 			if (error != 0) {
 				/*
 				 * NOTE(review): this path returns without
 				 * sleepq_release(); presumably the wait
 				 * call drops the sleepqueue lock - confirm.
 				 */
 				linux_schedule_save_interrupt_value(task, error);
 				error = -ERESTARTSYS;
 				goto intr;
 			}
 		} else {
 			DROP_GIANT();
 			sleepq_wait(c, 0);
 			PICKUP_GIANT();
 		}
 	}
 	/* consume one completion unless the counter is pinned at UINT_MAX */
 	if (c->done != UINT_MAX)
 		c->done--;
 	sleepq_release(c);
 
 intr:
 	return (error);
 }
 
 /*
  * Time limited wait for done != 0 with or without signals.
  *
  * "timeout" is added to the current jiffies value to form the
  * deadline; a non-zero "flags" argument selects an interruptible
  * sleep.  Returns zero on timeout (or immediately when the scheduler
  * is stopped), -ERESTARTSYS when the sleep failed for another reason,
  * and otherwise the value of linux_timer_jiffies_until() for the
  * deadline after the completion has been consumed.
  */
 int
 linux_wait_for_timeout_common(struct completion *c, int timeout, int flags)
 {
 	struct task_struct *task;
 	int end = jiffies + timeout;
 	int error;
 
 	if (SCHEDULER_STOPPED())
 		return (0);
 
 	task = current;
 
 	/* translate the boolean "flags" into sleepqueue flags */
 	if (flags != 0)
 		flags = SLEEPQ_INTERRUPTIBLE | SLEEPQ_SLEEP;
 	else
 		flags = SLEEPQ_SLEEP;
 
 	for (;;) {
 		sleepq_lock(c);
 		/* leave the loop with the sleepqueue lock still held */
 		if (c->done)
 			break;
 		sleepq_add(c, NULL, "completion", flags, 0);
 		/* each pass sleeps only for the time left until "end" */
 		sleepq_set_timeout(c, linux_timer_jiffies_until(end));
 
 		DROP_GIANT();
 		if (flags & SLEEPQ_INTERRUPTIBLE)
 			error = -sleepq_timedwait_sig(c, 0);
 		else
 			error = -sleepq_timedwait(c, 0);
 		PICKUP_GIANT();
 
 		if (error != 0) {
 			/* check for timeout */
 			if (error == -EWOULDBLOCK) {
 				error = 0;	/* timeout */
 			} else {
 				/* signal happened */
 				linux_schedule_save_interrupt_value(task, error);
 				error = -ERESTARTSYS;
 			}
 			/*
 			 * NOTE(review): this path returns without
 			 * sleepq_release(); presumably the timed wait
 			 * drops the sleepqueue lock - confirm.
 			 */
 			goto done;
 		}
 	}
 	/* consume one completion unless the counter is pinned at UINT_MAX */
 	if (c->done != UINT_MAX)
 		c->done--;
 	sleepq_release(c);
 
 	/* return how many jiffies are left */
 	error = linux_timer_jiffies_until(end);
 done:
 	return (error);
 }
 
 /*
  * Non-blocking attempt to consume a completion.  Returns non-zero
  * when the done counter was non-zero; the counter is decremented in
  * that case unless it is pinned at UINT_MAX.
  */
 int
 linux_try_wait_for_completion(struct completion *c)
 {
 	int isdone;
 
 	sleepq_lock(c);
 	isdone = (c->done != 0);
 	if (isdone && c->done != UINT_MAX)
 		c->done--;
 	sleepq_release(c);
 
 	return (isdone);
 }
 
 /*
  * Reports whether the done counter of the completion is non-zero,
  * without modifying it.
  */
 int
 linux_completion_done(struct completion *c)
 {
 	int pending;
 
 	sleepq_lock(c);
 	pending = (c->done != 0);
 	sleepq_release(c);
 
 	return (pending);
 }
 
 /*
  * Drops one reference on "ldev" and frees it when refcount_release()
  * reports that this was the last one.
  */
 static void
 linux_cdev_deref(struct linux_cdev *ldev)
 {
 
 	if (!refcount_release(&ldev->refs))
 		return;
 	kfree(ldev);
 }
 
 /*
  * kobject release method for linux_cdev_ktype: destroys the device
  * node, drops a reference on the linux_cdev and releases the parent
  * kobject.
  */
 static void
 linux_cdev_release(struct kobject *kobj)
 {
 	struct linux_cdev *cdev = container_of(kobj, struct linux_cdev, kobj);
 	/*
 	 * Fetch the parent first: "kobj" is embedded in the linux_cdev,
 	 * which linux_cdev_deref() may free.
 	 */
 	struct kobject *parent = kobj->parent;
 
 	linux_destroy_dev(cdev);
 	linux_cdev_deref(cdev);
 	kobject_put(parent);
 }
 
 /*
  * kobject release method for linux_cdev_static_ktype: destroys the
  * device node and releases the parent kobject.  Unlike
  * linux_cdev_release(), no reference on the linux_cdev is dropped.
  */
 static void
 linux_cdev_static_release(struct kobject *kobj)
 {
 	struct linux_cdev *cdev = container_of(kobj, struct linux_cdev, kobj);
 	struct kobject *parent = kobj->parent;
 
 	linux_destroy_dev(cdev);
 	kobject_put(parent);
 }
 
 /*
  * Destroys the device node of "ldev", if it has one.  The LDEV_SI_DTR
  * flag is set in ldev->siref first, and the function then polls until
  * every other bit of siref has been cleared before calling
  * destroy_dev().
  */
 void
 linux_destroy_dev(struct linux_cdev *ldev)
 {
 
 	if (ldev->cdev == NULL)
 		return;
 
 	MPASS((ldev->siref & LDEV_SI_DTR) == 0);
 	atomic_set_int(&ldev->siref, LDEV_SI_DTR);
 	for (;;) {
 		if ((atomic_load_int(&ldev->siref) & ~LDEV_SI_DTR) == 0)
 			break;
 		pause("ldevdtr", hz / 4);
 	}
 
 	destroy_dev(ldev->cdev);
 	ldev->cdev = NULL;
 }
 
 /* kobject type whose release method is linux_cdev_release(). */
 const struct kobj_type linux_cdev_ktype = {
 	.release = linux_cdev_release,
 };
 
 /* kobject type whose release method is linux_cdev_static_release(). */
 const struct kobj_type linux_cdev_static_ktype = {
 	.release = linux_cdev_static_release,
 };
 
 /*
  * ifnet_link_event handler: calls the notifier block passed as "arg"
  * with NETDEV_UP when the link state is LINK_STATE_UP and with
  * NETDEV_DOWN for any other state.
  */
 static void
 linux_handle_ifnet_link_event(void *arg, struct ifnet *ifp, int linkstate)
 {
 	struct notifier_block *nb = arg;
 
 	if (linkstate != LINK_STATE_UP) {
 		nb->notifier_call(nb, NETDEV_DOWN, ifp);
 		return;
 	}
 	nb->notifier_call(nb, NETDEV_UP, ifp);
 }
 
 /*
  * ifnet_arrival_event handler: calls the notifier block passed as
  * "arg" with NETDEV_REGISTER.
  */
 static void
 linux_handle_ifnet_arrival_event(void *arg, struct ifnet *ifp)
 {
 	struct notifier_block *nb = arg;
 
 	nb->notifier_call(nb, NETDEV_REGISTER, ifp);
 }
 
 /*
  * ifnet_departure_event handler: calls the notifier block passed as
  * "arg" with NETDEV_UNREGISTER.
  */
 static void
 linux_handle_ifnet_departure_event(void *arg, struct ifnet *ifp)
 {
 	struct notifier_block *nb = arg;
 
 	nb->notifier_call(nb, NETDEV_UNREGISTER, ifp);
 }
 
 /*
  * iflladdr_event handler: calls the notifier block passed as "arg"
  * with NETDEV_CHANGEADDR.
  */
 static void
 linux_handle_iflladdr_event(void *arg, struct ifnet *ifp)
 {
 	struct notifier_block *nb = arg;
 
 	nb->notifier_call(nb, NETDEV_CHANGEADDR, ifp);
 }
 
 /*
  * ifaddr_event handler: calls the notifier block passed as "arg"
  * with NETDEV_CHANGEIFADDR.
  */
 static void
 linux_handle_ifaddr_event(void *arg, struct ifnet *ifp)
 {
 	struct notifier_block *nb = arg;
 
 	nb->notifier_call(nb, NETDEV_CHANGEIFADDR, ifp);
 }
 
 /*
  * Hooks "nb" up to the ifnet link, arrival, departure and link-level
  * address event handlers.  Each registration tag is stored in
  * nb->tags[] under the matching NETDEV_* index so that it can be
  * deregistered later.  Always returns zero.
  */
 int
 register_netdevice_notifier(struct notifier_block *nb)
 {
 
 	nb->tags[NETDEV_UP] =
 	    EVENTHANDLER_REGISTER(ifnet_link_event,
 	    linux_handle_ifnet_link_event, nb, 0);
 	nb->tags[NETDEV_REGISTER] =
 	    EVENTHANDLER_REGISTER(ifnet_arrival_event,
 	    linux_handle_ifnet_arrival_event, nb, 0);
 	nb->tags[NETDEV_UNREGISTER] =
 	    EVENTHANDLER_REGISTER(ifnet_departure_event,
 	    linux_handle_ifnet_departure_event, nb, 0);
 	nb->tags[NETDEV_CHANGEADDR] =
 	    EVENTHANDLER_REGISTER(iflladdr_event,
 	    linux_handle_iflladdr_event, nb, 0);
 
 	return (0);
 }
 
 /*
  * Hooks "nb" up to the ifaddr event handler and stores the
  * registration tag under NETDEV_CHANGEIFADDR.  Always returns zero.
  */
 int
 register_inetaddr_notifier(struct notifier_block *nb)
 {
 
 	nb->tags[NETDEV_CHANGEIFADDR] =
 	    EVENTHANDLER_REGISTER(ifaddr_event,
 	    linux_handle_ifaddr_event, nb, 0);
 
 	return (0);
 }
 
 /*
  * Deregisters the four event handlers whose tags are stored in
  * nb->tags[] under NETDEV_UP, NETDEV_REGISTER, NETDEV_UNREGISTER and
  * NETDEV_CHANGEADDR.  Always returns zero.
  */
 int
 unregister_netdevice_notifier(struct notifier_block *nb)
 {
 
 	EVENTHANDLER_DEREGISTER(ifnet_link_event, nb->tags[NETDEV_UP]);
 	EVENTHANDLER_DEREGISTER(ifnet_arrival_event, nb->tags[NETDEV_REGISTER]);
 	EVENTHANDLER_DEREGISTER(ifnet_departure_event,
 	    nb->tags[NETDEV_UNREGISTER]);
 	EVENTHANDLER_DEREGISTER(iflladdr_event, nb->tags[NETDEV_CHANGEADDR]);
 
 	return (0);
 }
 
 /*
  * Deregisters the ifaddr event handler whose tag is stored in
  * nb->tags[NETDEV_CHANGEIFADDR].  Always returns zero.
  */
 int
 unregister_inetaddr_notifier(struct notifier_block *nb)
 {
 
 	EVENTHANDLER_DEREGISTER(ifaddr_event, nb->tags[NETDEV_CHANGEIFADDR]);
 
 	return (0);
 }
 
 /*
  * Context handed to qsort_r() by list_sort(): the caller's comparison
  * function together with its private argument.
  */
 struct list_sort_thunk {
 	int (*cmp)(void *, struct list_head *, struct list_head *);
 	void *priv;
 };
 
 /*
  * qsort_r() comparison callback for list_sort().  "d1" and "d2" point
  * at array slots holding list_head pointers; these are passed to the
  * comparison function stored in the thunk given as "priv".
  */
 static inline int
 linux_le_cmp(void *priv, const void *d1, const void *d2)
 {
 	struct list_sort_thunk *thunk = priv;
 	struct list_head *lhs = *(__DECONST(struct list_head **, d1));
 	struct list_head *rhs = *(__DECONST(struct list_head **, d2));
 
 	return ((thunk->cmp)(thunk->priv, lhs, rhs));
 }
 
 void
 list_sort(void *priv, struct list_head *head, int (*cmp)(void *priv,
     struct list_head *a, struct list_head *b))
 {
 	struct list_sort_thunk thunk;
 	struct list_head **ar, *le;
 	size_t count, i;
 
 	count = 0;
 	list_for_each(le, head)
 		count++;
 	ar = malloc(sizeof(struct list_head *) * count, M_KMALLOC, M_WAITOK);
 	i = 0;
 	list_for_each(le, head)
 		ar[i++] = le;
 	thunk.cmp = cmp;
 	thunk.priv = priv;
 	qsort_r(ar, count, sizeof(struct list_head *), &thunk, linux_le_cmp);
 	INIT_LIST_HEAD(head);
 	for (i = 0; i < count; i++)
 		list_add_tail(ar[i], head);
 	free(ar, M_KMALLOC);
 }
 
 /*
  * Interrupt trampoline: sets up the Linux "current" task for the
  * running thread and calls the handler stored in the irq_ent passed
  * as "ent" with its irq number and argument.
  */
 void
 linux_irq_handler(void *ent)
 {
 	struct irq_ent *irqe = ent;
 
 	linux_set_current(curthread);
 	irqe->handler(irqe->irq, irqe->arg);
 }
 
 #if defined(__i386__) || defined(__amd64__)
 /*
  * Implemented by calling pmap_invalidate_cache().  Always returns
  * zero.
  */
 int
 linux_wbinvd_on_all_cpus(void)
 {
 
 	pmap_invalidate_cache();
 
 	return (0);
 }
 #endif
 
 /*
  * Runs "callback" with argument "data" through smp_rendezvous(),
  * using smp_no_rendezvous_barrier for both the setup and the teardown
  * step.  Always returns zero.
  */
 int
 linux_on_each_cpu(void callback(void *), void *data)
 {
 
 	smp_rendezvous(smp_no_rendezvous_barrier,
 	    callback,
 	    smp_no_rendezvous_barrier,
 	    data);
 
 	return (0);
 }
 
 /*
  * Returns 1 when the current thread has TDP_NOFAULTING set in its
  * private flags and 0 otherwise.
  */
 int
 linux_in_atomic(void)
 {
 
 	if ((curthread->td_pflags & TDP_NOFAULTING) != 0)
 		return (1);
 	return (0);
 }
 
 /*
  * Searches the devices attached to linuxcdevsw for the linux_cdev
  * whose device number equals MKDEV(major, minor) and whose kobject
  * name equals "name".  Returns the match, or NULL when there is none.
  */
 struct linux_cdev *
 linux_find_cdev(const char *name, unsigned major, unsigned minor)
 {
 	dev_t dev = MKDEV(major, minor);
 	struct linux_cdev *found = NULL;
 	struct linux_cdev *ldev;
 	struct cdev *cdev;
 
 	dev_lock();
 	LIST_FOREACH(cdev, &linuxcdevsw.d_devs, si_list) {
 		ldev = cdev->si_drv1;
 		/* skip device nodes without a linux_cdev attached */
 		if (ldev == NULL)
 			continue;
 		if (ldev->dev == dev &&
 		    strcmp(kobject_name(&ldev->kobj), name) == 0) {
 			/*
 			 * Remember the result while the device lock is
 			 * held instead of reading si_drv1 again after
 			 * the lock has been dropped.
 			 */
 			found = ldev;
 			break;
 		}
 	}
 	dev_unlock();
 
 	return (found);
 }
 
 /*
  * Creates "count" character devices named "name" with major number
  * "major" and consecutive minor numbers starting at "baseminor", all
  * using "fops".  Stops at the first cdev_add() failure and returns
  * its result; returns zero when every device was added.
  *
  * NOTE(review): nothing here releases the cdev of a failing
  * iteration or the devices added before it - confirm who owns that
  * cleanup.
  */
 int
 __register_chrdev(unsigned int major, unsigned int baseminor,
     unsigned int count, const char *name,
     const struct file_operations *fops)
 {
 	struct linux_cdev *cdev;
 	int ret;
 	int i;
 
 	for (i = baseminor; i < baseminor + count; i++) {
 		cdev = cdev_alloc();
 		cdev->ops = fops;
 		kobject_set_name(&cdev->kobj, name);
 
 		ret = cdev_add(cdev, makedev(major, i), 1);
 		if (ret != 0)
 			return (ret);
 	}
 
 	return (0);
 }
 
 /*
  * Variant of __register_chrdev() which creates the devices through
  * cdev_add_ext(), passing on the owner "uid", group "gid" and
  * permission "mode".  Stops at the first failure and returns its
  * result; returns zero when every device was added.
  *
  * NOTE(review): nothing here releases the cdev of a failing
  * iteration or the devices added before it - confirm who owns that
  * cleanup.
  */
 int
 __register_chrdev_p(unsigned int major, unsigned int baseminor,
     unsigned int count, const char *name,
     const struct file_operations *fops, uid_t uid,
     gid_t gid, int mode)
 {
 	struct linux_cdev *cdev;
 	int ret;
 	int i;
 
 	for (i = baseminor; i < baseminor + count; i++) {
 		cdev = cdev_alloc();
 		cdev->ops = fops;
 		kobject_set_name(&cdev->kobj, name);
 
 		ret = cdev_add_ext(cdev, makedev(major, i), uid, gid, mode);
 		if (ret != 0)
 			return (ret);
 	}
 
 	return (0);
 }
 
 /*
  * Deletes every character device named "name" with major number
  * "major" and a minor number in the range starting at "baseminor"
  * and spanning "count" entries.  Minor numbers for which no device
  * is found are skipped.
  */
 void
 __unregister_chrdev(unsigned int major, unsigned int baseminor,
     unsigned int count, const char *name)
 {
 	struct linux_cdev *found;
 	int i;
 
 	for (i = baseminor; i < baseminor + count; i++) {
 		found = linux_find_cdev(name, major, i);
 		if (found == NULL)
 			continue;
 		cdev_del(found);
 	}
 }
 
 /*
  * Captures and prints the current stack trace.  The function body is
  * empty unless the kernel is built with the STACK option.
  */
 void
 linux_dump_stack(void)
 {
 #ifdef STACK
 	struct stack trace;
 
 	stack_zero(&trace);
 	stack_save(&trace);
 	stack_print(&trace);
 #endif
 }
 
 #if defined(__i386__) || defined(__amd64__)
 /* Set by linux_compat_init() from the CPUID_CLFSH bit of cpu_feature. */
 bool linux_cpu_has_clflush;
 #endif
 
 /*
  * SYSINIT hook which sets up the global LinuxKPI state: the VMA lock,
  * the "sys" sysctl tree with its "class" and "device" nodes, the root
  * class and root device kobjects, the "misc" class, the PCI driver
  * and device lists with their lock, and the vmmap hash with its
  * mutex.
  */
 static void
 linux_compat_init(void *arg)
 {
 	struct sysctl_oid *rootoid;
 	int i;
 
 #if defined(__i386__) || defined(__amd64__)
 	linux_cpu_has_clflush = (cpu_feature & CPUID_CLFSH);
 #endif
 	rw_init(&linux_vma_lock, "lkpi-vma-lock");
 
 	/* root of the "sys" sysctl tree; the nodes below hang off it */
 	rootoid = SYSCTL_ADD_ROOT_NODE(NULL,
 	    OID_AUTO, "sys", CTLFLAG_RD|CTLFLAG_MPSAFE, NULL, "sys");
 	kobject_init(&linux_class_root, &linux_class_ktype);
 	kobject_set_name(&linux_class_root, "class");
 	linux_class_root.oidp = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(rootoid),
 	    OID_AUTO, "class", CTLFLAG_RD|CTLFLAG_MPSAFE, NULL, "class");
 	kobject_init(&linux_root_device.kobj, &linux_dev_ktype);
 	kobject_set_name(&linux_root_device.kobj, "device");
 	linux_root_device.kobj.oidp = SYSCTL_ADD_NODE(NULL,
 	    SYSCTL_CHILDREN(rootoid), OID_AUTO, "device", CTLFLAG_RD, NULL,
 	    "device");
 	/* the Linux root device is backed by the root bus */
 	linux_root_device.bsddev = root_bus;
 	linux_class_misc.name = "misc";
 	class_register(&linux_class_misc);
 	INIT_LIST_HEAD(&pci_drivers);
 	INIT_LIST_HEAD(&pci_devices);
 	spin_lock_init(&pci_lock);
 	mtx_init(&vmmaplock, "IO Map lock", NULL, MTX_DEF);
 	for (i = 0; i < VMMAP_HASH_SIZE; i++)
 		LIST_INIT(&vmmaphead[i]);
 }
 SYSINIT(linux_compat, SI_SUB_DRIVERS, SI_ORDER_SECOND, linux_compat_init, NULL);
 
 /*
  * SYSUNINIT hook which undoes part of linux_compat_init(): frees the
  * names of the root class, root device and "misc" class kobjects and
  * destroys the vmmap mutex, the PCI lock and the VMA lock.
  */
 static void
 linux_compat_uninit(void *arg)
 {
 	linux_kobject_kfree_name(&linux_class_root);
 	linux_kobject_kfree_name(&linux_root_device.kobj);
 	linux_kobject_kfree_name(&linux_class_misc.kobj);
 
 	mtx_destroy(&vmmaplock);
 	spin_lock_destroy(&pci_lock);
 	rw_destroy(&linux_vma_lock);
 }
 SYSUNINIT(linux_compat, SI_SUB_DRIVERS, SI_ORDER_SECOND, linux_compat_uninit, NULL);
 
 /*
  * NOTE: Linux frequently uses "unsigned long" for pointer-to-integer
  * conversion and vice versa, where FreeBSD would use "uintptr_t".
  * Assert that these types have the same size, otherwise some parts of
  * the LinuxKPI may not work as expected:
  */
 CTASSERT(sizeof(unsigned long) == sizeof(uintptr_t));
Index: projects/hps_callouts/sys/kern/kern_timeout.c
===================================================================
--- projects/hps_callouts/sys/kern/kern_timeout.c	(revision 352141)
+++ projects/hps_callouts/sys/kern/kern_timeout.c	(revision 352142)
@@ -1,1720 +1,1720 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	From: @(#)kern_clock.c	8.5 (Berkeley) 1/21/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_callout_profiling.h"
 #include "opt_ddb.h"
 #if defined(__arm__)
 #include "opt_timer.h"
 #endif
 #include "opt_rss.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/callout.h>
 #include <sys/file.h>
 #include <sys/interrupt.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/sdt.h>
 #include <sys/sleepqueue.h>
 #include <sys/sysctl.h>
 #include <sys/smp.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #include <ddb/db_sym.h>
 #include <machine/_inttypes.h>
 #endif
 
 #ifdef SMP
 #include <machine/cpu.h>
 #endif
 
 /*
  * Per-CPU time of the last hardclock() call; read by callout_when()
  * when event timers are in use.
  */
 #ifndef NO_EVENTTIMERS
 DPCPU_DECLARE(sbintime_t, hardclocktime);
 #endif
 
 /* SDT probes fired immediately before and after each callout handler. */
 SDT_PROVIDER_DEFINE(callout_execute);
 SDT_PROBE_DEFINE1(callout_execute, , , callout__start, "struct callout *");
 SDT_PROBE_DEFINE1(callout_execute, , , callout__end, "struct callout *");
 
 /*
  * Optional profiling counters.  Each is a running average (scaled by
  * 1000) updated at the end of softclock() or callout_process() and
  * exported read-only under the debug sysctl tree.
  */
 #ifdef CALLOUT_PROFILING
 static int avg_depth;
 SYSCTL_INT(_debug, OID_AUTO, to_avg_depth, CTLFLAG_RD, &avg_depth, 0,
     "Average number of items examined per softclock call. Units = 1/1000");
 static int avg_gcalls;
 SYSCTL_INT(_debug, OID_AUTO, to_avg_gcalls, CTLFLAG_RD, &avg_gcalls, 0,
     "Average number of Giant callouts made per softclock call. Units = 1/1000");
 static int avg_lockcalls;
 SYSCTL_INT(_debug, OID_AUTO, to_avg_lockcalls, CTLFLAG_RD, &avg_lockcalls, 0,
     "Average number of lock callouts made per softclock call. Units = 1/1000");
 static int avg_mpcalls;
 SYSCTL_INT(_debug, OID_AUTO, to_avg_mpcalls, CTLFLAG_RD, &avg_mpcalls, 0,
     "Average number of MP callouts made per softclock call. Units = 1/1000");
 static int avg_depth_dir;
 SYSCTL_INT(_debug, OID_AUTO, to_avg_depth_dir, CTLFLAG_RD, &avg_depth_dir, 0,
     "Average number of direct callouts examined per callout_process call. "
     "Units = 1/1000");
 static int avg_lockcalls_dir;
 SYSCTL_INT(_debug, OID_AUTO, to_avg_lockcalls_dir, CTLFLAG_RD,
     &avg_lockcalls_dir, 0, "Average number of lock direct callouts made per "
     "callout_process call. Units = 1/1000");
 static int avg_mpcalls_dir;
 SYSCTL_INT(_debug, OID_AUTO, to_avg_mpcalls_dir, CTLFLAG_RD, &avg_mpcalls_dir,
     0, "Average number of MP direct callouts made per callout_process call. "
     "Units = 1/1000");
 #endif
 
 /*
  * Number of preallocated timeout(9) callouts; also determines the size
  * of the callwheel.  Computed (and optionally overridden by the
  * kern.ncallout tunable) in callout_callwheel_init().
  */
 static int ncallout;
 SYSCTL_INT(_kern, OID_AUTO, ncallout, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &ncallout, 0,
     "Number of entries in callwheel and size of timeout() preallocation");
 
 /*
  * Whether the softclock swi threads are bound to their CPUs.  Pinning
  * defaults to on when the kernel is built with RSS and off otherwise;
  * both values can be overridden by the matching tunables, which are
  * fetched in callout_callwheel_init().
  */
 #ifdef	RSS
 static int pin_default_swi = 1;
 static int pin_pcpu_swi = 1;
 #else
 static int pin_default_swi = 0;
 static int pin_pcpu_swi = 0;
 #endif
 
 SYSCTL_INT(_kern, OID_AUTO, pin_default_swi, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pin_default_swi,
     0, "Pin the default (non-per-cpu) swi (shared with PCPU 0 swi)");
 SYSCTL_INT(_kern, OID_AUTO, pin_pcpu_swi, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pin_pcpu_swi,
     0, "Pin the per-CPU swis (except PCPU 0, which is also default)");
 
 /*
  * Number of buckets in each per-CPU callwheel and the matching index
  * mask.  callout_callwheel_init() sets the size to a power of two and
  * the mask to size - 1.
  *
  * TODO:
  *	allocate more timeout table slots when table overflows.
  */
 u_int callwheelsize, callwheelmask;
 
 /*
  * The callout cpu exec entities hold the information needed to
  * describe the callouts currently running on the CPU and the
  * information needed to migrate a callout to a new callout cpu.
  * In particular, the first entry of the array cc_exec_entity describes
  * the callout running in SWI thread context, while the second one
  * describes the callout running directly from hardware interrupt
  * context.
  * The cached information is what makes it possible to defer a migration
  * when the migrating callout is already running.
  */
 struct cc_exec {
 	/* Callout currently being executed, or NULL. */
 	struct callout		*cc_curr;
 	/* If set, called with the callout argument once the handler is done. */
 	void			(*cc_drain)(void *);
 	/* Function and argument of the most recently started callout. */
 	void			*cc_last_func;
 	void			*cc_last_arg;
 #ifdef SMP
 	/*
 	 * Parameters of a deferred migration, applied after the running
 	 * callout returns.  ce_migration_cpu == CPUBLOCK means "none".
 	 */
 	void			(*ce_migration_func)(void *);
 	void			*ce_migration_arg;
 	sbintime_t		ce_migration_time;
 	sbintime_t		ce_migration_prec;
 	int			ce_migration_cpu;
 #endif
 	/* True once the running callout was cancelled or can no longer be. */
 	bool			cc_cancel;
 	/* True while a thread waits for the running callout to complete. */
 	bool			cc_waiting;
 };
 
 /*
  * There is one struct callout_cpu per cpu, holding all relevant
  * state for the callout processing thread on the individual CPU.
  */
 struct callout_cpu {
 	/* Spin mutex protecting this structure (see CC_LOCK()). */
 	struct mtx_padalign	cc_lock;
 	/* Execution state: [0] SWI thread context, [1] direct context. */
 	struct cc_exec 		cc_exec_entity[2];
 	/* Next callout callout_process() will examine in the current bucket. */
 	struct callout		*cc_next;
 	/* Preallocated timeout(9) callouts; NULL on all but the timeout CPU. */
 	struct callout		*cc_callout;
 	/* The callwheel: an array of callwheelsize bucket lists. */
 	struct callout_list	*cc_callwheel;
 	/* Expired callouts queued for softclock(). */
 	struct callout_tailq	cc_expireq;
 	/* Free list of preallocated timeout(9) callouts. */
 	struct callout_slist	cc_callfree;
 	/* Time of the earliest event already announced via cpu_new_callout(). */
 	sbintime_t		cc_firstevent;
 	/* Time passed to the most recent callout_process() call. */
 	sbintime_t		cc_lastscan;
 	/* Cookie returned by swi_add(); passed to swi_sched(). */
 	void			*cc_cookie;
 	/* Bucket in which callout_process() last ran a direct callout. */
 	u_int			cc_bucket;
 	/* Set to 1 by callout_cpu_init(). */
 	u_int			cc_inited;
 	/* "callwheel cpu N" label used for KTR scheduler tracing. */
 	char			cc_ktr_event_name[20];
 };
 
 /* Non-zero when callout 'c' has a deferred migration pending. */
 #define	callout_migrating(c)	((c)->c_iflags & CALLOUT_DFRMIGRATION)
 
 /*
  * Accessors for the per-context execution state of a callout cpu.
  * 'dir' selects the entity: 0 for SWI thread context, 1 for direct
  * (hardware interrupt) context.  Every expansion is fully parenthesized
  * so that it is safe with arbitrary argument expressions and when
  * combined with other operators; all of them remain lvalues.
  */
 #define	cc_exec_curr(cc, dir)		((cc)->cc_exec_entity[(dir)].cc_curr)
 #define	cc_exec_last_func(cc, dir)	((cc)->cc_exec_entity[(dir)].cc_last_func)
 #define	cc_exec_last_arg(cc, dir)	((cc)->cc_exec_entity[(dir)].cc_last_arg)
 #define	cc_exec_drain(cc, dir)		((cc)->cc_exec_entity[(dir)].cc_drain)
 #define	cc_exec_next(cc)		((cc)->cc_next)
 #define	cc_exec_cancel(cc, dir)		((cc)->cc_exec_entity[(dir)].cc_cancel)
 #define	cc_exec_waiting(cc, dir)	((cc)->cc_exec_entity[(dir)].cc_waiting)
 #ifdef SMP
 #define	cc_migration_func(cc, dir)	((cc)->cc_exec_entity[(dir)].ce_migration_func)
 #define	cc_migration_arg(cc, dir)	((cc)->cc_exec_entity[(dir)].ce_migration_arg)
 #define	cc_migration_cpu(cc, dir)	((cc)->cc_exec_entity[(dir)].ce_migration_cpu)
 #define	cc_migration_time(cc, dir)	((cc)->cc_exec_entity[(dir)].ce_migration_time)
 #define	cc_migration_prec(cc, dir)	((cc)->cc_exec_entity[(dir)].ce_migration_prec)
 
 /* One callout cpu per possible CPU; CPUBLOCK is the "no CPU" marker. */
 struct callout_cpu cc_cpu[MAXCPU];
 #define	CPUBLOCK	MAXCPU
 #define	CC_CPU(cpu)	(&cc_cpu[(cpu)])
 #define	CC_SELF()	CC_CPU(PCPU_GET(cpuid))
 #else
 /* Uniprocessor: a single callout cpu; the 'cpu' argument is ignored. */
 struct callout_cpu cc_cpu;
 #define	CC_CPU(cpu)	(&cc_cpu)
 #define	CC_SELF()	(&cc_cpu)
 #endif
 /* Spin-lock helpers for a callout cpu. */
 #define	CC_LOCK(cc)	mtx_lock_spin(&(cc)->cc_lock)
 #define	CC_UNLOCK(cc)	mtx_unlock_spin(&(cc)->cc_lock)
 #define	CC_LOCK_ASSERT(cc)	mtx_assert(&(cc)->cc_lock, MA_OWNED)
 
 /*
  * CPU that owns the preallocated timeout(9) callouts; set in
  * callout_callwheel_init().
  */
 static int timeout_cpu;
 
 /* Forward declarations for helpers defined further down in this file. */
 static void	callout_cpu_init(struct callout_cpu *cc, int cpu);
 static void	softclock_call_cc(struct callout *c, struct callout_cpu *cc,
 #ifdef CALLOUT_PROFILING
 		    int *mpcalls, int *lockcalls, int *gcalls,
 #endif
 		    int direct);
 
 /* malloc(9) type used for the callwheels and preallocated callouts. */
 static MALLOC_DEFINE(M_CALLOUT, "callout", "Callout datastructures");
 
 /**
  * Locked by cc_lock:
  *   cc_curr         - If a callout is in progress, it is cc_curr.
  *                     If cc_curr is non-NULL, threads waiting in
  *                     callout_drain() will be woken up as soon as the
  *                     relevant callout completes.
  *   cc_cancel       - Changing to 1 with both c_lock and cc_lock held
  *                     guarantees that the current callout will not run.
  *                     softclock_call_cc() sets this to 0 before it
  *                     drops cc_lock to acquire c_lock, and it calls
  *                     the handler only if cc_cancel is still 0 after
  *                     c_lock is successfully acquired.
  *   cc_waiting      - If a thread is waiting in callout_drain(), then
  *                     cc_waiting is nonzero.  Set only when
  *                     cc_curr is non-NULL.
  */
 
 /*
  * Resets the execution entity tied to a specific callout cpu.
  */
 static void
 cc_cce_cleanup(struct callout_cpu *cc, int direct)
 {
 
 	cc_exec_curr(cc, direct) = NULL;
 	cc_exec_cancel(cc, direct) = false;
 	cc_exec_waiting(cc, direct) = false;
 #ifdef SMP
 	cc_migration_cpu(cc, direct) = CPUBLOCK;
 	cc_migration_time(cc, direct) = 0;
 	cc_migration_prec(cc, direct) = 0;
 	cc_migration_func(cc, direct) = NULL;
 	cc_migration_arg(cc, direct) = NULL;
 #endif
 }
 
 /*
  * Tell whether a deferred migration is recorded for the given
  * execution entity.  Always 0 on non-SMP kernels.
  */
 static int
 cc_cce_migrating(struct callout_cpu *cc, int direct)
 {
 #ifdef SMP
 	int target;
 
 	target = cc_migration_cpu(cc, direct);
 	return (target != CPUBLOCK);
 #else
 	return (0);
 #endif
 }
 
 /*
  * Kernel low level callwheel initialization
  * called on the BSP during kernel startup.
  *
  * Sizes the callwheel, fetches the related tunables and sets up the
  * callout cpu of the boot processor, including the preallocated
  * timeout(9) callouts.  The remaining CPUs are initialized later by
  * start_softclock().
  */
 static void
 callout_callwheel_init(void *dummy)
 {
 	struct callout_cpu *cc;
 
 	/*
 	 * Clear the whole callout cpu storage.  The clear must start at
 	 * the base of cc_cpu rather than at the current CPU's slot:
 	 * sizeof(cc_cpu) covers the entire array on SMP kernels, so
 	 * starting anywhere else would write past its end whenever the
 	 * boot CPU is not CPU 0.
 	 */
 	memset(&cc_cpu, 0, sizeof(cc_cpu));
 
 	/*
 	 * Calculate the number of preallocated timeout() structures.
 	 * XXX: The value is clipped to 18508, which per the historical
 	 * note is the result of the previous maxusers-based formula at
 	 * its maximum of 384.  This is still huge, but acceptable.
 	 */
 	ncallout = imin(16 + maxproc + maxfiles, 18508);
 	TUNABLE_INT_FETCH("kern.ncallout", &ncallout);
 
 	/*
 	 * Calculate callout wheel size, should be next power of two higher
 	 * than 'ncallout'.
 	 */
 	callwheelsize = 1 << fls(ncallout);
 	callwheelmask = callwheelsize - 1;
 
 	/*
 	 * Fetch whether we're pinning the swi's or not.
 	 */
 	TUNABLE_INT_FETCH("kern.pin_default_swi", &pin_default_swi);
 	TUNABLE_INT_FETCH("kern.pin_pcpu_swi", &pin_pcpu_swi);
 
 	/*
 	 * Only BSP handles timeout(9) and receives a preallocation.
 	 *
 	 * XXX: Once all timeout(9) consumers are converted this can
 	 * be removed.
 	 */
 	timeout_cpu = PCPU_GET(cpuid);
 	cc = CC_CPU(timeout_cpu);
 	cc->cc_callout = malloc(ncallout * sizeof(struct callout),
 	    M_CALLOUT, M_WAITOK);
 	callout_cpu_init(cc, timeout_cpu);
 }
 SYSINIT(callwheel_init, SI_SUB_CPU, SI_ORDER_ANY, callout_callwheel_init, NULL);
 
 /*
  * Bring one per-cpu callout structure into its initial state: lock,
  * empty callwheel, empty expire queue and idle execution entities.
  * When the caller supplied a preallocated array in cc_callout (only
  * the timeout(9) CPU does), every element is initialized and placed
  * on the free list.
  */
 static void
 callout_cpu_init(struct callout_cpu *cc, int cpu)
 {
 	struct callout *co;
 	int n;
 
 	mtx_init(&cc->cc_lock, "callout", NULL, MTX_SPIN | MTX_RECURSE);
 	SLIST_INIT(&cc->cc_callfree);
 	cc->cc_inited = 1;
 
 	/* Allocate the wheel and start with every bucket empty. */
 	cc->cc_callwheel = malloc(sizeof(struct callout_list) * callwheelsize,
 	    M_CALLOUT, M_WAITOK);
 	for (n = 0; n < callwheelsize; n++)
 		LIST_INIT(&cc->cc_callwheel[n]);
 	TAILQ_INIT(&cc->cc_expireq);
 	cc->cc_firstevent = SBT_MAX;
 
 	/* Reset both execution entities: SWI (0) and direct (1). */
 	cc_cce_cleanup(cc, 0);
 	cc_cce_cleanup(cc, 1);
 
 	snprintf(cc->cc_ktr_event_name, sizeof(cc->cc_ktr_event_name),
 	    "callwheel cpu %d", cpu);
 
 	/* Only BSP handles timeout(9) */
 	if (cc->cc_callout != NULL) {
 		n = 0;
 		while (n < ncallout) {
 			co = &cc->cc_callout[n];
 			callout_init(co, 0);
 			co->c_iflags = CALLOUT_LOCAL_ALLOC;
 			SLIST_INSERT_HEAD(&cc->cc_callfree, co, c_links.sle);
 			n++;
 		}
 	}
 }
 
 #ifdef SMP
 /*
  * Switches the cpu tied to a specific callout.
  * The function expects a locked incoming callout cpu and returns with
  * locked outcoming callout cpu.
  *
  * While the handoff is in progress c->c_cpu holds CPUBLOCK, which makes
  * callout_lock() spin until the new cpu number has been stored.
  */
 static struct callout_cpu *
 callout_cpu_switch(struct callout *c, struct callout_cpu *cc, int new_cpu)
 {
 	struct callout_cpu *new_cc;
 
 	MPASS(c != NULL && cc != NULL);
 	CC_LOCK_ASSERT(cc);
 
 	/*
 	 * Avoid interrupts and preemption firing after the callout cpu
 	 * is blocked in order to avoid deadlocks as the new thread
 	 * may be willing to acquire the callout cpu lock.
 	 */
 	c->c_cpu = CPUBLOCK;
 	/* Stay in a spinlock section across the unlock/lock pair. */
 	spinlock_enter();
 	CC_UNLOCK(cc);
 	new_cc = CC_CPU(new_cpu);
 	CC_LOCK(new_cc);
 	spinlock_exit();
 	/* Publish the new owner only once its lock is held. */
 	c->c_cpu = new_cpu;
 	return (new_cc);
 }
 #endif
 
 /*
  * Start standard softclock thread.
  *
  * Registers softclock() via swi_add() for the timeout CPU and, on SMP
  * kernels, initializes the callout cpu of every other CPU and registers
  * a handler for each of them as well.  Handlers are bound to their CPU
  * when the corresponding pin_* setting is non-zero; a failed bind is
  * only reported, a failed swi_add() panics.
  */
 static void
 start_softclock(void *dummy)
 {
 	struct callout_cpu *cc;
 	char name[MAXCOMLEN];
 #ifdef SMP
 	int cpu;
 	struct intr_event *ie;
 #endif
 
 	/* The timeout CPU was already initialized by callout_callwheel_init(). */
 	cc = CC_CPU(timeout_cpu);
 	snprintf(name, sizeof(name), "clock (%d)", timeout_cpu);
 	if (swi_add(&clk_intr_event, name, softclock, cc, SWI_CLOCK,
 	    INTR_MPSAFE, &cc->cc_cookie))
 		panic("died while creating standard software ithreads");
 	if (pin_default_swi &&
 	    (intr_event_bind(clk_intr_event, timeout_cpu) != 0)) {
 		printf("%s: timeout clock couldn't be pinned to cpu %d\n",
 		    __func__,
 		    timeout_cpu);
 	}
 
 #ifdef SMP
 	CPU_FOREACH(cpu) {
 		if (cpu == timeout_cpu)
 			continue;
 		cc = CC_CPU(cpu);
 		cc->cc_callout = NULL;	/* Only BSP handles timeout(9). */
 		callout_cpu_init(cc, cpu);
 		snprintf(name, sizeof(name), "clock (%d)", cpu);
 		/* Start from a NULL event for each of the other CPUs. */
 		ie = NULL;
 		if (swi_add(&ie, name, softclock, cc, SWI_CLOCK,
 		    INTR_MPSAFE, &cc->cc_cookie))
 			panic("died while creating standard software ithreads");
 		if (pin_pcpu_swi && (intr_event_bind(ie, cpu) != 0)) {
 			printf("%s: per-cpu clock couldn't be pinned to "
 			    "cpu %d\n",
 			    __func__,
 			    cpu);
 		}
 	}
 #endif
 }
 SYSINIT(start_softclock, SI_SUB_SOFTINTR, SI_ORDER_FIRST, start_softclock, NULL);
 
 /* Number of low hash bits taken from the fractional part of the time. */
 #define	CC_HASH_SHIFT	8
 
 /*
  * Map a time value to an (unmasked) callwheel hash by discarding its
  * low 32 - CC_HASH_SHIFT bits.
  */
 static inline u_int
 callout_hash(sbintime_t sbt)
 {
 	const int dropped_bits = 32 - CC_HASH_SHIFT;
 
 	return ((u_int)(sbt >> dropped_bits));
 }
 
 /*
  * Return the callwheel bucket index for a time value: its hash reduced
  * to the wheel size via callwheelmask.
  */
 static inline u_int
 callout_get_bucket(sbintime_t sbt)
 {
 	u_int hash;
 
 	hash = callout_hash(sbt);
 	return (hash & callwheelmask);
 }
 
 /*
  * Scan the current CPU's callwheel for callouts that are due at 'now'.
  *
  * Due callouts flagged CALLOUT_DIRECT are executed right here through
  * softclock_call_cc(); all other due callouts are moved to cc_expireq
  * and the softclock swi is scheduled to run them.  While walking the
  * buckets the function also works out the time window [first, last]
  * of the next pending event, records it in cc_firstevent and, when
  * event timers are in use, reports it through cpu_new_callout().
  */
 void
 callout_process(sbintime_t now)
 {
 	struct callout *tmp, *tmpn;
 	struct callout_cpu *cc;
 	struct callout_list *sc;
 	sbintime_t first, last, max, tmp_max;
 	uint32_t lookahead;
 	u_int firstb, lastb, nowb;
 #ifdef CALLOUT_PROFILING
 	int depth_dir = 0, mpcalls_dir = 0, lockcalls_dir = 0;
 #endif
 
 	cc = CC_SELF();
 	mtx_lock_spin_flags(&cc->cc_lock, MTX_QUIET);
 
 	/* Compute the buckets of the last scan and present times. */
 	firstb = callout_hash(cc->cc_lastscan);
 	cc->cc_lastscan = now;
 	nowb = callout_hash(now);
 
 	/* Compute the last bucket and minimum time of the bucket after it. */
 	/* The further the scan has fallen behind, the larger the lookahead. */
 	if (nowb == firstb)
 		lookahead = (SBT_1S / 16);
 	else if (nowb - firstb == 1)
 		lookahead = (SBT_1S / 8);
 	else
 		lookahead = (SBT_1S / 2);
 	first = last = now;
 	first += (lookahead / 2);
 	last += lookahead;
 	/* Round 'last' down to the start of its bucket. */
 	last &= (0xffffffffffffffffLLU << (32 - CC_HASH_SHIFT));
 	lastb = callout_hash(last) - 1;
 	max = last;
 
 	/*
 	 * Check if we wrapped around the entire wheel from the last scan.
 	 * In case, we need to scan entirely the wheel for pending callouts.
 	 */
 	if (lastb - firstb >= callwheelsize) {
 		lastb = firstb + callwheelsize - 1;
 		if (nowb - firstb >= callwheelsize)
 			nowb = lastb;
 	}
 
 	/* Iterate callwheel from firstb to nowb and then up to lastb. */
 	do {
 		sc = &cc->cc_callwheel[firstb & callwheelmask];
 		tmp = LIST_FIRST(sc);
 		while (tmp != NULL) {
 			/* Run the callout if present time within allowed. */
 			if (tmp->c_time <= now) {
 				/*
 				 * Consumer told us the callout may be run
 				 * directly from hardware interrupt context.
 				 */
 				if (tmp->c_iflags & CALLOUT_DIRECT) {
 #ifdef CALLOUT_PROFILING
 					++depth_dir;
 #endif
 					/*
 					 * Remember the successor in cc_next
 					 * and the bucket in cc_bucket before
 					 * the call; the successor is re-read
 					 * from cc_next afterwards.
 					 */
 					cc_exec_next(cc) =
 					    LIST_NEXT(tmp, c_links.le);
 					cc->cc_bucket = firstb & callwheelmask;
 					LIST_REMOVE(tmp, c_links.le);
 					softclock_call_cc(tmp, cc,
 #ifdef CALLOUT_PROFILING
 					    &mpcalls_dir, &lockcalls_dir, NULL,
 #endif
 					    1);
 					tmp = cc_exec_next(cc);
 					cc_exec_next(cc) = NULL;
 				} else {
 					/* Defer to softclock() via the expire queue. */
 					tmpn = LIST_NEXT(tmp, c_links.le);
 					LIST_REMOVE(tmp, c_links.le);
 					TAILQ_INSERT_TAIL(&cc->cc_expireq,
 					    tmp, c_links.tqe);
 					tmp->c_iflags |= CALLOUT_PROCESSED;
 					tmp = tmpn;
 				}
 				continue;
 			}
 			/* Skip events from distant future. */
 			if (tmp->c_time >= max)
 				goto next;
 			/*
 			 * Event minimal time is bigger than present maximal
 			 * time, so it cannot be aggregated.
 			 */
 			if (tmp->c_time > last) {
 				lastb = nowb;
 				goto next;
 			}
 			/* Update first and last time, respecting this event. */
 			if (tmp->c_time < first)
 				first = tmp->c_time;
 			tmp_max = tmp->c_time + tmp->c_precision;
 			if (tmp_max < last)
 				last = tmp_max;
 next:
 			tmp = LIST_NEXT(tmp, c_links.le);
 		}
 		/* Proceed with the next bucket. */
 		firstb++;
 		/*
 		 * Stop if we looked after present time and found
 		 * some event we can't execute at now.
 		 * Stop if we looked far enough into the future.
 		 */
 	} while (((int)(firstb - lastb)) <= 0);
 	cc->cc_firstevent = last;
 #ifndef NO_EVENTTIMERS
 	cpu_new_callout(curcpu, last, first);
 #endif
 #ifdef CALLOUT_PROFILING
 	avg_depth_dir += (depth_dir * 1000 - avg_depth_dir) >> 8;
 	avg_mpcalls_dir += (mpcalls_dir * 1000 - avg_mpcalls_dir) >> 8;
 	avg_lockcalls_dir += (lockcalls_dir * 1000 - avg_lockcalls_dir) >> 8;
 #endif
 	mtx_unlock_spin_flags(&cc->cc_lock, MTX_QUIET);
 	/*
 	 * swi_sched acquires the thread lock, so we don't want to call it
 	 * with cc_lock held; incorrect locking order.
 	 */
 	if (!TAILQ_EMPTY(&cc->cc_expireq))
 		swi_sched(cc->cc_cookie, 0);
 }
 
 /*
  * Lock and return the callout cpu that currently owns callout 'c'.
  *
  * c->c_cpu is read without a lock, so it is checked again once the
  * lock is held; if the callout moved in the meantime the lock is
  * dropped and the lookup retried.  On SMP the function spins while
  * c->c_cpu is CPUBLOCK, the value callout_cpu_switch() stores during a
  * migration.
  */
 static struct callout_cpu *
 callout_lock(struct callout *c)
 {
 	struct callout_cpu *cc;
 	int cpu;
 
 	for (;;) {
 		cpu = c->c_cpu;
 #ifdef SMP
 		if (cpu == CPUBLOCK) {
 			while (c->c_cpu == CPUBLOCK)
 				cpu_spinwait();
 			continue;
 		}
 #endif
 		cc = CC_CPU(cpu);
 		CC_LOCK(cc);
 		/* Still owned by the cpu we locked?  Then we are done. */
 		if (cpu == c->c_cpu)
 			break;
 		CC_UNLOCK(cc);
 	}
 	return (cc);
 }
 
 /*
  * Arm callout 'c' on callout cpu 'cc', which must be locked.
  *
  * Stores the handler, argument, expiry time 'sbt' and 'precision' in
  * the callout, marks it pending and active, and inserts it at the head
  * of the matching callwheel bucket.  Expiry times earlier than the
  * last scan are raised to cc_lastscan.  C_DIRECT_EXEC in 'flags' marks
  * the callout for direct execution.  'cpu' is only passed on to
  * cpu_new_callout() when the new event becomes the earliest one known.
  */
 static void
 callout_cc_add(struct callout *c, struct callout_cpu *cc,
     sbintime_t sbt, sbintime_t precision, void (*func)(void *),
     void *arg, int cpu, int flags)
 {
 	int bucket;
 
 	CC_LOCK_ASSERT(cc);
 	if (sbt < cc->cc_lastscan)
 		sbt = cc->cc_lastscan;
 	c->c_arg = arg;
 	c->c_iflags |= CALLOUT_PENDING;
 	c->c_iflags &= ~CALLOUT_PROCESSED;
 	c->c_flags |= CALLOUT_ACTIVE;
 	if (flags & C_DIRECT_EXEC)
 		c->c_iflags |= CALLOUT_DIRECT;
 	c->c_func = func;
 	c->c_time = sbt;
 	c->c_precision = precision;
 	bucket = callout_get_bucket(c->c_time);
 	CTR3(KTR_CALLOUT, "precision set for %p: %d.%08x",
 	    c, (int)(c->c_precision >> 32),
 	    (u_int)(c->c_precision & 0xffffffff));
 	LIST_INSERT_HEAD(&cc->cc_callwheel[bucket], c, c_links.le);
 	/*
 	 * NOTE(review): cc_bucket is the bucket callout_process() recorded
 	 * when it last ran a direct callout; pointing cc_next at the new
 	 * head presumably keeps an insertion made from within that callout
 	 * from being skipped by the ongoing walk - confirm.
 	 */
 	if (cc->cc_bucket == bucket)
 		cc_exec_next(cc) = c;
 #ifndef NO_EVENTTIMERS
 	/*
 	 * Inform the eventtimers(4) subsystem there's a new callout
 	 * that has been inserted, but only if really required.
 	 */
 	/* Clamp the precision so that time + precision cannot overflow. */
 	if (SBT_MAX - c->c_time < c->c_precision)
 		c->c_precision = SBT_MAX - c->c_time;
 	sbt = c->c_time + c->c_precision;
 	if (sbt < cc->cc_firstevent) {
 		cc->cc_firstevent = sbt;
 		cpu_new_callout(cpu, sbt, c->c_time);
 	}
 #endif
 }
 
 /*
  * Give a preallocated timeout(9) callout back to the free list of
  * 'cc' and clear its handler.  Callouts that are not flagged
  * CALLOUT_LOCAL_ALLOC are left untouched.
  */
 static void
 callout_cc_del(struct callout *c, struct callout_cpu *cc)
 {
 
 	if ((c->c_iflags & CALLOUT_LOCAL_ALLOC) != 0) {
 		c->c_func = NULL;
 		SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle);
 	}
 }
 
 /*
  * Run a single expired callout.
  *
  * Called with 'cc' locked and 'c' already removed from the callwheel
  * or the expire queue; returns with 'cc' locked again.  'direct'
  * selects the execution entity: 1 when called from callout_process(),
  * 0 when called from softclock().  With CALLOUT_PROFILING the three
  * counters are incremented according to the kind of callout that ran.
  *
  * The function drops cc_lock, acquires the callout's own lock (if
  * any), runs the handler unless it was cancelled in between, and
  * afterwards handles a registered drain function, threads waiting for
  * completion, a deferred migration, and the return of preallocated
  * timeout(9) callouts to the free list.
  */
 static void
 softclock_call_cc(struct callout *c, struct callout_cpu *cc,
 #ifdef CALLOUT_PROFILING
     int *mpcalls, int *lockcalls, int *gcalls,
 #endif
     int direct)
 {
 	struct rm_priotracker tracker;
 	void (*c_func)(void *);
 	void *c_arg;
 	struct lock_class *class;
 	struct lock_object *c_lock;
 	uintptr_t lock_status;
 	int c_iflags;
 #ifdef SMP
 	struct callout_cpu *new_cc;
 	void (*new_func)(void *);
 	void *new_arg;
 	int flags, new_cpu;
 	sbintime_t new_prec, new_time;
 #endif
 #if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING) 
 	sbintime_t sbt1, sbt2;
 	struct timespec ts2;
 	static sbintime_t maxdt = 2 * SBT_1MS;	/* 2 msec */
 	static timeout_t *lastfunc;
 #endif
 
 	KASSERT((c->c_iflags & CALLOUT_PENDING) == CALLOUT_PENDING,
 	    ("softclock_call_cc: pend %p %x", c, c->c_iflags));
 	KASSERT((c->c_flags & CALLOUT_ACTIVE) == CALLOUT_ACTIVE,
 	    ("softclock_call_cc: act %p %x", c, c->c_flags));
 	class = (c->c_lock != NULL) ? LOCK_CLASS(c->c_lock) : NULL;
 	/*
 	 * lock_status is the second argument to lc_lock(): 0 by default,
 	 * 1 for a shared-lock callout, or a tracker pointer for rm locks.
 	 */
 	lock_status = 0;
 	if (c->c_flags & CALLOUT_SHAREDLOCK) {
 		if (class == &lock_class_rm)
 			lock_status = (uintptr_t)&tracker;
 		else
 			lock_status = 1;
 	}
 	/* Cache everything needed later; 'c' may be rearmed by the handler. */
 	c_lock = c->c_lock;
 	c_func = c->c_func;
 	c_arg = c->c_arg;
 	c_iflags = c->c_iflags;
 	if (c->c_iflags & CALLOUT_LOCAL_ALLOC)
 		c->c_iflags = CALLOUT_LOCAL_ALLOC;
 	else
 		c->c_iflags &= ~CALLOUT_PENDING;
 	
 	/* Publish the running callout before dropping cc_lock. */
 	cc_exec_curr(cc, direct) = c;
 	cc_exec_last_func(cc, direct) = c_func;
 	cc_exec_last_arg(cc, direct) = c_arg;
 	cc_exec_cancel(cc, direct) = false;
 	cc_exec_drain(cc, direct) = NULL;
 	CC_UNLOCK(cc);
 	if (c_lock != NULL) {
 		class->lc_lock(c_lock, lock_status);
 		/*
 		 * The callout may have been cancelled
 		 * while we switched locks.
 		 */
 		if (cc_exec_cancel(cc, direct)) {
 			class->lc_unlock(c_lock);
 			goto skip;
 		}
 		/* The callout cannot be stopped now. */
 		cc_exec_cancel(cc, direct) = true;
 		if (c_lock == &Giant.lock_object) {
 #ifdef CALLOUT_PROFILING
 			(*gcalls)++;
 #endif
 			CTR3(KTR_CALLOUT, "callout giant %p func %p arg %p",
 			    c, c_func, c_arg);
 		} else {
 #ifdef CALLOUT_PROFILING
 			(*lockcalls)++;
 #endif
 			CTR3(KTR_CALLOUT, "callout lock %p func %p arg %p",
 			    c, c_func, c_arg);
 		}
 	} else {
 #ifdef CALLOUT_PROFILING
 		(*mpcalls)++;
 #endif
 		CTR3(KTR_CALLOUT, "callout %p func %p arg %p",
 		    c, c_func, c_arg);
 	}
 	KTR_STATE3(KTR_SCHED, "callout", cc->cc_ktr_event_name, "running",
 	    "func:%p", c_func, "arg:%p", c_arg, "direct:%d", direct);
 #if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING)
 	sbt1 = sbinuptime();
 #endif
 	/* The handler itself runs with sleeping disallowed. */
 	THREAD_NO_SLEEPING();
 	SDT_PROBE1(callout_execute, , , callout__start, c);
 	c_func(c_arg);
 	SDT_PROBE1(callout_execute, , , callout__end, c);
 	THREAD_SLEEPING_OK();
 #if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING)
 	/*
 	 * Report handlers that ran longer than the longest one seen so
 	 * far (initially 2 msec), then raise that threshold.
 	 */
 	sbt2 = sbinuptime();
 	sbt2 -= sbt1;
 	if (sbt2 > maxdt) {
 		if (lastfunc != c_func || sbt2 > maxdt * 2) {
 			ts2 = sbttots(sbt2);
 			printf(
 		"Expensive timeout(9) function: %p(%p) %jd.%09ld s\n",
 			    c_func, c_arg, (intmax_t)ts2.tv_sec, ts2.tv_nsec);
 		}
 		maxdt = sbt2;
 		lastfunc = c_func;
 	}
 #endif
 	KTR_STATE0(KTR_SCHED, "callout", cc->cc_ktr_event_name, "idle");
 	CTR1(KTR_CALLOUT, "callout %p finished", c);
 	/*
 	 * NOTE(review): 'class' is NULL when the callout has no lock, so
 	 * this relies on CALLOUT_RETURNUNLOCKED being set for every
 	 * lock-less callout - confirm against the callout init routines.
 	 */
 	if ((c_iflags & CALLOUT_RETURNUNLOCKED) == 0)
 		class->lc_unlock(c_lock);
 skip:
 	CC_LOCK(cc);
 	KASSERT(cc_exec_curr(cc, direct) == c, ("mishandled cc_curr"));
 	cc_exec_curr(cc, direct) = NULL;
 	/* Run a drain function registered while the callout was running. */
 	if (cc_exec_drain(cc, direct)) {
 		void (*drain)(void *);
 		
 		drain = cc_exec_drain(cc, direct);
 		cc_exec_drain(cc, direct) = NULL;
 		CC_UNLOCK(cc);
 		drain(c_arg);
 		CC_LOCK(cc);
 	}
 	if (cc_exec_waiting(cc, direct)) {
 		/*
 		 * There is someone waiting for the
 		 * callout to complete.
 		 * If the callout was scheduled for
 		 * migration just cancel it.
 		 */
 		if (cc_cce_migrating(cc, direct)) {
 			cc_cce_cleanup(cc, direct);
 
 			/*
 			 * It should be assert here that the callout is not
 			 * destroyed but that is not easy.
 			 */
 			c->c_iflags &= ~CALLOUT_DFRMIGRATION;
 		}
 		cc_exec_waiting(cc, direct) = false;
 		CC_UNLOCK(cc);
 		wakeup(&cc_exec_waiting(cc, direct));
 		CC_LOCK(cc);
 	} else if (cc_cce_migrating(cc, direct)) {
 		KASSERT((c_iflags & CALLOUT_LOCAL_ALLOC) == 0,
 		    ("Migrating legacy callout %p", c));
 #ifdef SMP
 		/*
 		 * If the callout was scheduled for
 		 * migration just perform it now.
 		 */
 		new_cpu = cc_migration_cpu(cc, direct);
 		new_time = cc_migration_time(cc, direct);
 		new_prec = cc_migration_prec(cc, direct);
 		new_func = cc_migration_func(cc, direct);
 		new_arg = cc_migration_arg(cc, direct);
 		cc_cce_cleanup(cc, direct);
 
 		/*
 		 * It should be assert here that the callout is not destroyed
 		 * but that is not easy.
 		 *
 		 * As first thing, handle deferred callout stops.
 		 */
 		if (!callout_migrating(c)) {
 			CTR3(KTR_CALLOUT,
 			     "deferred cancelled %p func %p arg %p",
 			     c, new_func, new_arg);
 			callout_cc_del(c, cc);
 			return;
 		}
 		c->c_iflags &= ~CALLOUT_DFRMIGRATION;
 
 		/* Re-arm on the target cpu, then return to the original lock. */
 		new_cc = callout_cpu_switch(c, cc, new_cpu);
 		flags = (direct) ? C_DIRECT_EXEC : 0;
 		callout_cc_add(c, new_cc, new_time, new_prec, new_func,
 		    new_arg, new_cpu, flags);
 		CC_UNLOCK(new_cc);
 		CC_LOCK(cc);
 #else
 		panic("migration should not happen");
 #endif
 	}
 	/*
 	 * If the current callout is locally allocated (from
 	 * timeout(9)) then put it on the freelist.
 	 *
 	 * Note: we need to check the cached copy of c_iflags because
 	 * if it was not local, then it's not safe to deref the
 	 * callout pointer.
 	 */
 	KASSERT((c_iflags & CALLOUT_LOCAL_ALLOC) == 0 ||
 	    c->c_iflags == CALLOUT_LOCAL_ALLOC,
 	    ("corrupted callout"));
 	if (c_iflags & CALLOUT_LOCAL_ALLOC)
 		callout_cc_del(c, cc);
 }
 
 /*
  * The callout mechanism is based on the work of Adam M. Costello and
  * George Varghese, published in a technical report entitled "Redesigning
  * the BSD Callout and Timer Facilities" and modified slightly for inclusion
  * in FreeBSD by Justin T. Gibbs.  The original work on the data structures
  * used in this implementation was published by G. Varghese and T. Lauck in
  * the paper "Hashed and Hierarchical Timing Wheels: Data Structures for
  * the Efficient Implementation of a Timer Facility" in the Proceedings of
  * the 11th ACM Annual Symposium on Operating Systems Principles,
  * Austin, Texas Nov 1987.
  */
 
 /*
  * Software (low priority) clock interrupt handler.
  * Drains the expire queue of the callout cpu passed in 'arg', handing
  * each entry to softclock_call_cc() in SWI (non-direct) mode.
  */
 void
 softclock(void *arg)
 {
 	struct callout_cpu *cc = (struct callout_cpu *)arg;
 	struct callout *expired;
 #ifdef CALLOUT_PROFILING
 	int depth = 0, gcalls = 0, lockcalls = 0, mpcalls = 0;
 #endif
 
 	CC_LOCK(cc);
 	for (;;) {
 		expired = TAILQ_FIRST(&cc->cc_expireq);
 		if (expired == NULL)
 			break;
 		TAILQ_REMOVE(&cc->cc_expireq, expired, c_links.tqe);
 		softclock_call_cc(expired, cc,
 #ifdef CALLOUT_PROFILING
 		    &mpcalls, &lockcalls, &gcalls,
 #endif
 		    0);
 #ifdef CALLOUT_PROFILING
 		depth++;
 #endif
 	}
 #ifdef CALLOUT_PROFILING
 	/* Fold this run into the exported running averages. */
 	avg_depth += (depth * 1000 - avg_depth) >> 8;
 	avg_mpcalls += (mpcalls * 1000 - avg_mpcalls) >> 8;
 	avg_lockcalls += (lockcalls * 1000 - avg_lockcalls) >> 8;
 	avg_gcalls += (gcalls * 1000 - avg_gcalls) >> 8;
 #endif
 	CC_UNLOCK(cc);
 }
 
 /*
  * timeout --
  *	Execute a function after a specified length of time.
  *
  * untimeout --
  *	Cancel previous timeout function call.
  *
  * callout_handle_init --
  *	Initialize a handle so that using it with untimeout is benign.
  *
  *	See AT&T BCI Driver Reference Manual for specification.  This
  *	implementation differs from that one in that although an
  *	identification value is returned from timeout, the original
  *	arguments to timeout as well as the identifier are used to
  *	identify entries for untimeout.
  */
 /*
  * Arm one of the preallocated callouts of the timeout CPU to call
  * ftn(arg) after to_ticks ticks and return a handle referring to it.
  * Panics when no preallocated callout is left.
  */
 struct callout_handle
 timeout(timeout_t *ftn, void *arg, int to_ticks)
 {
 	struct callout_handle handle;
 	struct callout_cpu *cc = CC_CPU(timeout_cpu);
 	struct callout *co;
 
 	CC_LOCK(cc);
 	/* Take the head of the free list of preallocated callouts. */
 	co = SLIST_FIRST(&cc->cc_callfree);
 	if (co == NULL) {
 		/* XXX Attempt to malloc first */
 		panic("timeout table full");
 	}
 	SLIST_REMOVE_HEAD(&cc->cc_callfree, c_links.sle);
 	callout_reset(co, to_ticks, ftn, arg);
 	CC_UNLOCK(cc);
 
 	handle.callout = co;
 	return (handle);
 }
 
 /*
  * Stop the callout behind 'handle', but only if it is still armed with
  * the same function and argument that are passed in.  A handle that
  * was set up by callout_handle_init() and never used is ignored.
  */
 void
 untimeout(timeout_t *ftn, void *arg, struct callout_handle handle)
 {
 	struct callout_cpu *cc;
 	struct callout *co = handle.callout;
 
 	/* Nothing to do for a handle that never carried a real timeout. */
 	if (co == NULL)
 		return;
 
 	cc = callout_lock(co);
 	if (co->c_func == ftn && co->c_arg == arg)
 		callout_stop(co);
 	CC_UNLOCK(cc);
 }
 
 /*
  * Mark 'handle' as unused by clearing its callout pointer; untimeout()
  * returns immediately for such a handle.
  */
 void
 callout_handle_init(struct callout_handle *handle)
 {
 	handle->callout = NULL;
 }
 
 /*
  * Convert a requested timeout into an absolute expiry time and a
  * precision.
  *
  *   sbt, precision - requested time and precision
  *   flags          - C_* flags controlling the conversion
  *   res            - receives the absolute expiry time
  *   prec_res       - receives the precision to use
  *
  * With C_ABSOLUTE or C_PRECALC both inputs are returned unchanged.
  * Otherwise 'sbt' is added to a base time: the time of the last
  * hardclock() when C_HARDCLOCK is set or 'sbt' is at or above the
  * configured threshold, and sbinuptime() in all other cases.  The
  * result saturates at SBT_MAX.  The returned precision is the larger
  * of 'precision' and 'sbt' shifted right by C_PRELGET(flags), or by
  * tc_precexp when that value is negative.
  */
 void
 callout_when(sbintime_t sbt, sbintime_t precision, int flags,
     sbintime_t *res, sbintime_t *prec_res)
 {
 	sbintime_t to_sbt, to_pr;
 
 	if ((flags & (C_ABSOLUTE | C_PRECALC)) != 0) {
 		*res = sbt;
 		*prec_res = precision;
 		return;
 	}
 	/* Hardclock-aligned timeouts are at least one tick long. */
 	if ((flags & C_HARDCLOCK) != 0 && sbt < tick_sbt)
 		sbt = tick_sbt;
 	/* The second half of this condition depends on NO_EVENTTIMERS. */
 	if ((flags & C_HARDCLOCK) != 0 ||
 #ifdef NO_EVENTTIMERS
 	    sbt >= sbt_timethreshold) {
 		to_sbt = getsbinuptime();
 
 		/* Add safety belt for the case of hz > 1000. */
 		to_sbt += tc_tick_sbt - tick_sbt;
 #else
 	    sbt >= sbt_tickthreshold) {
 		/*
 		 * Obtain the time of the last hardclock() call on
 		 * this CPU directly from the kern_clocksource.c.
 		 * This value is per-CPU, but it is equal for all
 		 * active ones.
 		 */
 #ifdef __LP64__
 		to_sbt = DPCPU_GET(hardclocktime);
 #else
 		/* Without __LP64__ the read is done in a spinlock section. */
 		spinlock_enter();
 		to_sbt = DPCPU_GET(hardclocktime);
 		spinlock_exit();
 #endif
 #endif
 		if (cold && to_sbt == 0)
 			to_sbt = sbinuptime();
 		if ((flags & C_HARDCLOCK) == 0)
 			to_sbt += tick_sbt;
 	} else
 		to_sbt = sbinuptime();
 	/* Saturate instead of overflowing. */
 	if (SBT_MAX - to_sbt < sbt)
 		to_sbt = SBT_MAX;
 	else
 		to_sbt += sbt;
 	*res = to_sbt;
 	to_pr = ((C_PRELGET(flags) < 0) ? sbt >> tc_precexp :
 	    sbt >> C_PRELGET(flags));
 	*prec_res = to_pr > precision ? to_pr : precision;
 }
 
 /*
  * New interface; clients allocate their own callout structures.
  *
  * callout_reset() - establish or change a timeout
  * callout_stop() - disestablish a timeout
  * callout_init() - initialize a callout structure so that it can
  *	safely be passed to callout_reset() and callout_stop()
  *
  * <sys/callout.h> defines three convenience macros:
  *
  * callout_active() - returns truth if callout has not been stopped,
  *	drained, or deactivated since the last time the callout was
  *	reset.
  * callout_pending() - returns truth if callout is still waiting for timeout
  * callout_deactivate() - marks the callout as having been serviced
  *
  * callout_reset_sbt_on() (re)schedules callout `c' with function `ftn' and
  * argument `arg', using the expiry and precision that callout_when()
  * derives from `sbt', `prec' and `flags'.  Passing cpu == -1 keeps the
  * callout on its current c_cpu; any other value must be below MAXCPU and
  * name an initialized callout CPU, otherwise the function panics.
  *
  * The returned callout_ret_t has is_executing set when `c' is the callout
  * currently executing on its CPU, and was_cancelled set when a pending
  * instance was unlinked, when the executing instance was newly marked
  * cancelled, or when an already deferred migration was overwritten.
  */
 callout_ret_t
 callout_reset_sbt_on(struct callout *c, sbintime_t sbt, sbintime_t prec,
     void (*ftn)(void *), void *arg, int cpu, int flags)
 {
 	sbintime_t to_sbt, precision;
 	struct callout_cpu *cc;
 	callout_ret_t retval = {};
 	int direct;
 	int ignore_cpu=0;
 
 	/*
 	 * cpu == -1 means "no preference"; the actual CPU is taken from
 	 * c->c_cpu once the callout lock is held below.
 	 */
 	if (cpu == -1) {
 		ignore_cpu = 1;
 	} else if ((cpu >= MAXCPU) ||
 		   ((CC_CPU(cpu))->cc_inited == 0)) {
 		/* Invalid CPU spec */
 		panic("Invalid CPU in callout %d", cpu);
 	}
 	/* Compute the expiry time and precision for this request. */
 	callout_when(sbt, prec, flags, &to_sbt, &precision);
 
 	/* 
 	 * This flag used to be added by callout_cc_add, but the
 	 * first time you call this we could end up with the
 	 * wrong direct flag if we don't do it before we add.
 	 */
 	if (flags & C_DIRECT_EXEC) {
 		direct = 1;
 	} else {
 		direct = 0;
 	}
 	KASSERT(!direct || c->c_lock == NULL,
 	    ("%s: direct callout %p has lock", __func__, c));
 	cc = callout_lock(c);
 	/*
 	 * Don't allow migration of pre-allocated callouts lest they
 	 * become unbalanced or handle the case where the user does
 	 * not care. 
 	 */
 	if ((c->c_iflags & CALLOUT_LOCAL_ALLOC) ||
 	    ignore_cpu) {
 		cpu = c->c_cpu;
 	}
 
 	if (cc_exec_curr(cc, direct) == c) {
 		/*
 		 * We're being asked to reschedule a callout which is
 		 * currently in progress.  If there is a lock then we
 		 * can cancel the callout if it has not really started.
 		 */
 		retval.is_executing = 1;
 
 		if (c->c_lock != NULL && !cc_exec_cancel(cc, direct)) {
 			cc_exec_cancel(cc, direct) = true;
 			retval.was_cancelled = 1;
 		}
 
 		if (cc_exec_waiting(cc, direct) || cc_exec_drain(cc, direct)) {
 			/*
 			 * Someone has called callout_drain to kill this
 			 * callout.  Don't reschedule.
 			 */
 			CTR4(KTR_CALLOUT, "%s %p func %p arg %p",
-			    cancelled ? "cancelled" : "failed to cancel",
+			    retval.was_cancelled ? "cancelled" : "failed to cancel",
 			    c, c->c_func, c->c_arg);
 			goto done;
 		}
 #ifdef SMP
 		if (callout_migrating(c)) {
 			/* 
 			 * This only occurs when a second callout_reset_sbt_on
 			 * is made after a previous one moved it into
 			 * deferred migration (below). Note we do *not* change
 			 * the prev_cpu even though the previous target may
 			 * be different.
 			 */
 			cc_migration_cpu(cc, direct) = cpu;
 			cc_migration_time(cc, direct) = to_sbt;
 			cc_migration_prec(cc, direct) = precision;
 			cc_migration_func(cc, direct) = ftn;
 			cc_migration_arg(cc, direct) = arg;
 			retval.was_cancelled = 1;
 			goto done;
 		}
 #endif
 	}
 	/*
 	 * An instance is already queued: unlink it from its callwheel
 	 * list (fixing up cc_exec_next if it pointed at us) or, when it
 	 * has been marked CALLOUT_PROCESSED, from the expire queue.
 	 */
 	if (c->c_iflags & CALLOUT_PENDING) {
 		if ((c->c_iflags & CALLOUT_PROCESSED) == 0) {
 			if (cc_exec_next(cc) == c)
 				cc_exec_next(cc) = LIST_NEXT(c, c_links.le);
 			LIST_REMOVE(c, c_links.le);
 		} else {
 			TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe);
 		}
 		retval.was_cancelled = 1;
 		c->c_iflags &= ~ CALLOUT_PENDING;
 		c->c_flags &= ~ CALLOUT_ACTIVE;
 	}
 
 #ifdef SMP
 	/*
 	 * If the callout must migrate try to perform it immediately.
 	 * If the callout is currently running, just defer the migration
 	 * to a more appropriate moment.
 	 */
 	if (c->c_cpu != cpu) {
 		if (cc_exec_curr(cc, direct) == c) {
 			/* 
 			 * Pending will have been removed since we are
 			 * actually executing the callout on another
 			 * CPU. That callout should be waiting on the
 			 * lock the caller holds. If we set both
 			 * active/and/pending after we return and the
 			 * lock on the executing callout proceeds, it
 			 * will then see pending is true and return.
 			 * At the return from the actual callout execution
 			 * the migration will occur in softclock_call_cc
 			 * and this new callout will be placed on the 
 			 * new CPU via a call to callout_cpu_switch() which
 			 * will get the lock on the right CPU followed
 			 * by a call callout_cc_add() which will add it there.
 			 * (see above in softclock_call_cc()).
 			 */
 			cc_migration_cpu(cc, direct) = cpu;
 			cc_migration_time(cc, direct) = to_sbt;
 			cc_migration_prec(cc, direct) = precision;
 			cc_migration_func(cc, direct) = ftn;
 			cc_migration_arg(cc, direct) = arg;
 			c->c_iflags |= (CALLOUT_DFRMIGRATION | CALLOUT_PENDING);
 			c->c_flags |= CALLOUT_ACTIVE;
 			CTR6(KTR_CALLOUT,
 		    "migration of %p func %p arg %p in %d.%08x to %u deferred",
 			    c, c->c_func, c->c_arg, (int)(to_sbt >> 32),
 			    (u_int)(to_sbt & 0xffffffff), cpu);
 			goto done;
 		}
 		/* Not running: move to the target CPU's callout_cpu now. */
 		cc = callout_cpu_switch(c, cc, cpu);
 	}
 #endif
 
 	callout_cc_add(c, cc, to_sbt, precision, ftn, arg, cpu, flags);
 	CTR6(KTR_CALLOUT, "%sscheduled %p func %p arg %p in %d.%08x",
-	    cancelled ? "re" : "", c, c->c_func, c->c_arg, (int)(to_sbt >> 32),
+	    retval.was_cancelled ? "re" : "", c, c->c_func, c->c_arg, (int)(to_sbt >> 32),
 	    (u_int)(to_sbt & 0xffffffff));
 done:
 	/* Every exit path drops the callout CPU lock taken above. */
 	CC_UNLOCK(cc);
 	return (retval);
 }
 
 /*
  * Re-arm a callout on the given CPU, reusing the function and argument
  * already stored in the callout structure.
  */
 callout_ret_t
 callout_schedule_on(struct callout *c, int to_ticks, int cpu)
 {
 	void (*func)(void *) = c->c_func;
 	void *arg = c->c_arg;
 
 	return (callout_reset_on(c, to_ticks, func, arg, cpu));
 }
 
 /*
  * Re-arm a callout on the CPU it is currently assigned to, reusing the
  * function and argument already stored in the callout structure.
  */
 callout_ret_t
 callout_schedule(struct callout *c, int to_ticks)
 {
 	void (*func)(void *) = c->c_func;
 	void *arg = c->c_arg;
 	int cpu = c->c_cpu;
 
 	return (callout_reset_on(c, to_ticks, func, arg, cpu));
 }
 
 /*
  * Stop callout `c', optionally waiting for a running instance to finish.
  *
  * flags: when CS_DRAIN is set the caller sleeps until `c' is no longer the
  *	callout executing on its CPU.  CS_DRAIN may not be combined with a
  *	drain callback (asserted below).
  * drain: when non-NULL and the callout is executing but cannot be
  *	cancelled, the callback is stored with cc_exec_drain(); presumably it
  *	is invoked when the execution completes -- confirm in the softclock
  *	code, which is outside this function.
  *
  * The returned callout_ret_t has is_executing set when `c' was found
  * executing and the call did not drain it, and was_cancelled set when a
  * pending instance was removed or the executing instance was marked
  * cancelled before it really started.
  */
 callout_ret_t
 _callout_stop_safe(struct callout *c, int flags, void (*drain)(void *))
 {
 	struct callout_cpu *cc, *old_cc;
 	struct lock_class *class;
 	callout_ret_t retval = {};
 	int direct, sq_locked, use_lock;
 	int not_on_a_list;
 
 	if ((flags & CS_DRAIN) != 0)
 		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, c->c_lock,
 		    "calling %s", __func__);
 
 	KASSERT((flags & CS_DRAIN) == 0 || drain == NULL,
 	    ("Cannot set drain callback when CS_DRAIN flag is set"));
 
 	/*
 	 * Some old subsystems don't hold Giant while running a callout_stop(),
 	 * so just discard this check for the moment.
 	 */
 	if ((flags & CS_DRAIN) == 0 && c->c_lock != NULL) {
 		if (c->c_lock == &Giant.lock_object)
 			use_lock = mtx_owned(&Giant);
 		else {
 			use_lock = 1;
 			class = LOCK_CLASS(c->c_lock);
 			class->lc_assert(c->c_lock, LA_XLOCKED);
 		}
 	} else
 		use_lock = 0;
 	if (c->c_iflags & CALLOUT_DIRECT) {
 		direct = 1;
 	} else {
 		direct = 0;
 	}
 	sq_locked = 0;
 	old_cc = NULL;
 again:
 	/* Restart point: taken whenever cc_lock had to be dropped. */
 	cc = callout_lock(c);
 
 	if ((c->c_iflags & (CALLOUT_DFRMIGRATION | CALLOUT_PENDING)) ==
 	    (CALLOUT_DFRMIGRATION | CALLOUT_PENDING) &&
 	    ((c->c_flags & CALLOUT_ACTIVE) == CALLOUT_ACTIVE)) {
 		/*
 		 * Special case where this slipped in while we
 		 * were migrating *as* the callout is about to
 		 * execute. The caller probably holds the lock
 		 * the callout wants.
 		 *
 		 * Get rid of the migration first. Then set
 		 * the flag that tells this code *not* to
 		 * try to remove it from any lists (its not
 		 * on one yet). When the callout wheel runs,
 		 * it will ignore this callout.
 		 */
 		c->c_iflags &= ~CALLOUT_PENDING;
 		c->c_flags &= ~CALLOUT_ACTIVE;
 		not_on_a_list = 1;
 	} else {
 		not_on_a_list = 0;
 	}
 
 	/*
 	 * If the callout was migrating while the callout cpu lock was
 	 * dropped,  just drop the sleepqueue lock and check the states
 	 * again.
 	 */
 	if (sq_locked != 0 && cc != old_cc) {
 #ifdef SMP
 		CC_UNLOCK(cc);
 		sleepq_release(&cc_exec_waiting(old_cc, direct));
 		sq_locked = 0;
 		old_cc = NULL;
 		goto again;
 #else
 		panic("migration should not happen");
 #endif
 	}
 
 	/*
 	 * If the callout is running, try to stop it or drain it.
 	 */
 	if (cc_exec_curr(cc, direct) == c) {
 		/*
 		 * Whether or not we succeed in stopping it, we must clear
 		 * the active flag - this is what API users expect.  If we're
 		 * draining and the callout is currently executing, first wait
 		 * until it finishes.
 		 */
 		if ((flags & CS_DRAIN) == 0)
 			c->c_flags &= ~CALLOUT_ACTIVE;
 
 		if ((flags & CS_DRAIN) != 0) {
 			/*
 			 * The current callout is running (or just
 			 * about to run) and blocking is allowed, so
 			 * just wait for the current invocation to
 			 * finish.
 			 */
 			while (cc_exec_curr(cc, direct) == c) {
 				/*
 				 * Use direct calls to sleepqueue interface
 				 * instead of cv/msleep in order to avoid
 				 * a LOR between cc_lock and sleepqueue
 				 * chain spinlocks.  This piece of code
 				 * emulates a msleep_spin() call actually.
 				 *
 				 * If we already have the sleepqueue chain
 				 * locked, then we can safely block.  If we
 				 * don't already have it locked, however,
 				 * we have to drop the cc_lock to lock
 				 * it.  This opens several races, so we
 				 * restart at the beginning once we have
 				 * both locks.  If nothing has changed, then
 				 * we will end up back here with sq_locked
 				 * set.
 				 */
 				if (!sq_locked) {
 					CC_UNLOCK(cc);
 					sleepq_lock(
 					    &cc_exec_waiting(cc, direct));
 					sq_locked = 1;
 					old_cc = cc;
 					goto again;
 				}
 
 				/*
 				 * Migration could be cancelled here, but
 				 * as long as it is still not sure when it
 				 * will be packed up, just let softclock()
 				 * take care of it.
 				 */
 				cc_exec_waiting(cc, direct) = true;
 				DROP_GIANT();
 				CC_UNLOCK(cc);
 				sleepq_add(
 				    &cc_exec_waiting(cc, direct),
 				    &cc->cc_lock.lock_object, "codrain",
 				    SLEEPQ_SLEEP, 0);
 				sleepq_wait(
 				    &cc_exec_waiting(cc, direct),
 					     0);
 				sq_locked = 0;
 				old_cc = NULL;
 
 				/* Reacquire locks previously released. */
 				PICKUP_GIANT();
 				CC_LOCK(cc);
 			}
 			c->c_flags &= ~CALLOUT_ACTIVE;
 		} else if (use_lock &&
 			   !cc_exec_cancel(cc, direct) && (drain == NULL)) {
 			
 			/*
 			 * The current callout is waiting for its
 			 * lock which we hold.  Cancel the callout
 			 * and return.  After our caller drops the
 			 * lock, the callout will be skipped in
 			 * softclock(). This *only* works with a
 			 * callout_stop() *not* callout_drain() or
 			 * callout_async_drain().
 			 */
 			cc_exec_cancel(cc, direct) = true;
 			CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p",
 			    c, c->c_func, c->c_arg);
 			KASSERT(!cc_cce_migrating(cc, direct),
 			    ("callout wrongly scheduled for migration"));
 			if (callout_migrating(c)) {
 				c->c_iflags &= ~CALLOUT_DFRMIGRATION;
 #ifdef SMP
 				cc_migration_cpu(cc, direct) = CPUBLOCK;
 				cc_migration_time(cc, direct) = 0;
 				cc_migration_prec(cc, direct) = 0;
 				cc_migration_func(cc, direct) = NULL;
 				cc_migration_arg(cc, direct) = NULL;
 #endif
 			}
 			KASSERT(!sq_locked, ("sleepqueue chain locked"));
 			retval.was_cancelled = 1;
 			retval.is_executing = 1;
 			goto done;
 		} else if (callout_migrating(c)) {
 			/*
 			 * The callout is currently being serviced
 			 * and the "next" callout is scheduled at
 			 * its completion with a migration. We remove
 			 * the migration flag so it *won't* get rescheduled,
 			 * but we can't stop the one that's running.
 			 */
 			c->c_iflags &= ~CALLOUT_DFRMIGRATION;
 #ifdef SMP
 			/* 
 			 * We can't call cc_cce_cleanup here since
 			 * if we do it will remove .ce_curr and
 			 * its still running. This will prevent a
 			 * reschedule of the callout when the 
 			 * execution completes.
 			 */
 			cc_migration_cpu(cc, direct) = CPUBLOCK;
 			cc_migration_time(cc, direct) = 0;
 			cc_migration_prec(cc, direct) = 0;
 			cc_migration_func(cc, direct) = NULL;
 			cc_migration_arg(cc, direct) = NULL;
 #endif
 			CTR3(KTR_CALLOUT, "postponing stop %p func %p arg %p",
 			    c, c->c_func, c->c_arg);
  			if (drain) {
 				cc_exec_drain(cc, direct) = drain;
 			}
 			retval.is_executing = 1;
 			goto done;
 		} else {
 			/*
 			 * Running and not cancellable: record the drain
 			 * callback, if any, and fall through to the
 			 * pending check below.
 			 */
 			CTR3(KTR_CALLOUT, "postponing stop %p func %p arg %p",
 			    c, c->c_func, c->c_arg);
 			if (drain) {
 				cc_exec_drain(cc, direct) = drain;
 			}
 			retval.is_executing = 1;
 		}
 		KASSERT(!sq_locked, ("sleepqueue chain still locked"));
 	}
 
 	if (sq_locked)
 		sleepq_release(&cc_exec_waiting(cc, direct));
 
 	/* Nothing queued: there is no pending instance to cancel. */
 	if ((c->c_iflags & CALLOUT_PENDING) == 0) {
 		CTR3(KTR_CALLOUT, "failed to stop %p func %p arg %p",
 		    c, c->c_func, c->c_arg);
 		goto done;
 	}
 
 	retval.was_cancelled = 1;
 
 	c->c_iflags &= ~CALLOUT_PENDING;
 	c->c_flags &= ~CALLOUT_ACTIVE;
 
 	CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p",
 	    c, c->c_func, c->c_arg);
 	/*
 	 * NOTE(review): the only path that sets not_on_a_list also clears
 	 * CALLOUT_PENDING, so that path leaves via the "failed to stop"
 	 * branch above and this test looks always true -- confirm.
 	 */
 	if (not_on_a_list == 0) {
 		if ((c->c_iflags & CALLOUT_PROCESSED) == 0) {
 			if (cc_exec_next(cc) == c)
 				cc_exec_next(cc) = LIST_NEXT(c, c_links.le);
 			LIST_REMOVE(c, c_links.le);
 		} else {
 			TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe);
 		}
 	}
 	callout_cc_del(c, cc);
 done:
 	CC_UNLOCK(cc);
 	return (retval);
 }
 
 /*
  * Zero a callout structure and bind it to the default timeout CPU.
  * A non-MP-safe callout is protected by Giant; an MP-safe one has no lock
  * and carries CALLOUT_RETURNUNLOCKED.
  */
 void
 callout_init(struct callout *c, int mpsafe)
 {
 	bzero(c, sizeof(*c));
 	c->c_cpu = timeout_cpu;
 	if (!mpsafe) {
 		c->c_lock = &Giant.lock_object;
 		c->c_iflags = 0;
 		return;
 	}
 	c->c_lock = NULL;
 	c->c_iflags = CALLOUT_RETURNUNLOCKED;
 }
 
 /*
  * Zero a callout structure and associate it with `lock'.  Only the
  * CALLOUT_RETURNUNLOCKED and CALLOUT_SHAREDLOCK flags are accepted, and
  * the lock, when given, must be neither a spin lock nor sleepable.
  */
 void
 _callout_init_lock(struct callout *c, struct lock_object *lock, int flags)
 {
 	const int allowed = CALLOUT_RETURNUNLOCKED | CALLOUT_SHAREDLOCK;
 
 	bzero(c, sizeof(*c));
 	c->c_lock = lock;
 	KASSERT((flags & ~allowed) == 0,
 	    ("callout_init_lock: bad flags %d", flags));
 	KASSERT(!(lock == NULL && (flags & CALLOUT_RETURNUNLOCKED) != 0),
 	    ("callout_init_lock: CALLOUT_RETURNUNLOCKED with no lock"));
 	KASSERT(lock == NULL ||
 	    (LOCK_CLASS(lock)->lc_flags & (LC_SPINLOCK | LC_SLEEPABLE)) == 0,
 	    ("%s: invalid lock class", __func__));
 	c->c_iflags = flags & allowed;
 	c->c_cpu = timeout_cpu;
 }
 
 #ifdef APM_FIXUP_CALLTODO
 /*
  * Adjust the kernel calltodo timeout list.  This routine is used after
  * an APM resume to recalculate the calltodo timer list values with the
  * number of hz's we have been sleeping.  The next hardclock() will detect
  * that there are fired timers and run softclock() to execute them.
  *
  * Please note, I have not done an exhaustive analysis of what code this
  * might break.  I am motivated to have my select()'s and alarm()'s that
  * have expired during suspend firing upon resume so that the applications
  * which set the timer can do the maintenance the timer was for as close
  * as possible to the originally intended time.  Testing this code for a
  * week showed that resuming from a suspend resulted in 22 to 25 timers
  * firing, which seemed independent of whether the suspend was 2 hours or
  * 2 days.  Your mileage may vary.   - Ken Key <key@cs.utk.edu>
  *
  * NOTE(review): the body references `cc', `calltodo' and a `c_next'
  * member, none of which are declared in this function; confirm this
  * block still builds when APM_FIXUP_CALLTODO is defined.
  */
 void
 adjust_timeout_calltodo(struct timeval *time_change)
 {
 	struct callout *p;
 	unsigned long delta_ticks;
 
 	/*
 	 * How many ticks were we asleep?
 	 * (stolen from tvtohz()).
 	 */
 
 	/* Don't do anything */
 	if (time_change->tv_sec < 0)
 		return;
 	else if (time_change->tv_sec <= LONG_MAX / 1000000)
 		delta_ticks = howmany(time_change->tv_sec * 1000000 +
 		    time_change->tv_usec, tick) + 1;
 	else if (time_change->tv_sec <= LONG_MAX / hz)
 		delta_ticks = time_change->tv_sec * hz +
 		    howmany(time_change->tv_usec, tick) + 1;
 	else
 		delta_ticks = LONG_MAX;
 
 	/* Clamp so the value fits the int-sized tick arithmetic below. */
 	if (delta_ticks > INT_MAX)
 		delta_ticks = INT_MAX;
 
 	/*
 	 * Now rip through the timer calltodo list looking for timers
 	 * to expire.
 	 */
 
 	/* don't collide with softclock() */
 	CC_LOCK(cc);
 	for (p = calltodo.c_next; p != NULL; p = p->c_next) {
 		p->c_time -= delta_ticks;
 
 		/* Break if the timer had more time on it than delta_ticks */
 		if (p->c_time > 0)
 			break;
 
 		/* take back the ticks the timer didn't use (p->c_time <= 0) */
 		delta_ticks = -p->c_time;
 	}
 	CC_UNLOCK(cc);
 
 	return;
 }
 #endif /* APM_FIXUP_CALLTODO */
 
 /*
  * Return the index of the highest set bit of sbt scaled by 1.5, as
  * reported by flsl().  When long is narrower than sbintime_t, values of
  * at least SBT_1S are examined through their upper 32 bits.
  */
 static int
 flssbt(sbintime_t sbt)
 {
 
 	sbt += (uint64_t)sbt >> 1;
 	if (sizeof(long) < sizeof(sbintime_t) && sbt >= SBT_1S)
 		return (32 + flsl(((uint64_t)sbt) >> 32));
 	return (flsl(sbt));
 }
 
 /*
  * Dump immediate statistic snapshot of the scheduled callouts.
  *
  * The handler only acts on a write (req->newptr != NULL); the written
  * value itself is ignored.  It walks every callwheel bucket of every
  * callout CPU under that CPU's lock, builds power-of-two histograms of
  * time-to-expiry, precision and bucket occupancy, and prints the summary
  * with printf().
  *
  * Returns the error from sysctl_handle_int(), or 0 after printing.
  */
 static int
 sysctl_kern_callout_stat(SYSCTL_HANDLER_ARGS)
 {
 	struct callout *tmp;
 	struct callout_cpu *cc;
 	struct callout_list *sc;
 	sbintime_t avgpr, avgt, maxpr, maxt, medpr, medt, now, spr, st, t;
 	int ct[64], cpr[64], ccpbk[32];
 	int error, val, i, count, tcum, pcum, maxc, c, medc, idx;
 #ifdef SMP
 	int cpu;
 #endif
 
 	val = 0;
 	error = sysctl_handle_int(oidp, &val, 0, req);
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 	count = maxc = 0;
 	st = spr = maxt = maxpr = 0;
 	bzero(ccpbk, sizeof(ccpbk));
 	bzero(ct, sizeof(ct));
 	bzero(cpr, sizeof(cpr));
 	now = sbinuptime();
 #ifdef SMP
 	CPU_FOREACH(cpu) {
 		cc = CC_CPU(cpu);
 #else
 		cc = CC_CPU(timeout_cpu);
 #endif
 		CC_LOCK(cc);
 		for (i = 0; i < callwheelsize; i++) {
 			sc = &cc->cc_callwheel[i];
 			c = 0;
 			LIST_FOREACH(tmp, sc, c_links.le) {
 				c++;
 				t = tmp->c_time - now;
 				if (t < 0)
 					t = 0;
 				st += t / SBT_1US;
 				spr += tmp->c_precision / SBT_1US;
 				if (t > maxt)
 					maxt = t;
 				if (tmp->c_precision > maxpr)
 					maxpr = tmp->c_precision;
 				/*
 				 * flssbt() can return 64 for very large
 				 * values, which would index one past the
 				 * end of the 64-entry histograms; fold
 				 * such values into the last slot.
 				 */
 				idx = flssbt(t);
 				if (idx >= (int)nitems(ct))
 					idx = nitems(ct) - 1;
 				ct[idx]++;
 				idx = flssbt(tmp->c_precision);
 				if (idx >= (int)nitems(cpr))
 					idx = nitems(cpr) - 1;
 				cpr[idx]++;
 			}
 			if (c > maxc)
 				maxc = c;
 			ccpbk[fls(c + c / 2)]++;
 			count += c;
 		}
 		CC_UNLOCK(cc);
 #ifdef SMP
 	}
 #endif
 
 	/* Medians: first histogram slot at which half the total is reached. */
 	for (i = 0, tcum = 0; i < 64 && tcum < count / 2; i++)
 		tcum += ct[i];
 	medt = (i >= 2) ? (((sbintime_t)1) << (i - 2)) : 0;
 	for (i = 0, pcum = 0; i < 64 && pcum < count / 2; i++)
 		pcum += cpr[i];
 	medpr = (i >= 2) ? (((sbintime_t)1) << (i - 2)) : 0;
 	for (i = 0, c = 0; i < 32 && c < count / 2; i++)
 		c += ccpbk[i];
 	medc = (i >= 2) ? (1 << (i - 2)) : 0;
 
 	/* Avoid dividing by zero when no callouts are scheduled. */
 	avgt = (count > 0) ? st / count : 0;
 	avgpr = (count > 0) ? spr / count : 0;
 
 	printf("Scheduled callouts statistic snapshot:\n");
 	printf("  Callouts: %6d  Buckets: %6d*%-3d  Bucket size: 0.%06ds\n",
 	    count, callwheelsize, mp_ncpus, 1000000 >> CC_HASH_SHIFT);
 	printf("  C/Bk: med %5d         avg %6d.%06jd  max %6d\n",
 	    medc,
 	    count / callwheelsize / mp_ncpus,
 	    (uint64_t)count * 1000000 / callwheelsize / mp_ncpus % 1000000,
 	    maxc);
 	printf("  Time: med %5jd.%06jds avg %6jd.%06jds max %6jd.%06jds\n",
 	    medt / SBT_1S, (medt & 0xffffffff) * 1000000 >> 32,
 	    avgt / 1000000, avgt % 1000000,
 	    maxt / SBT_1S, (maxt & 0xffffffff) * 1000000 >> 32);
 	printf("  Prec: med %5jd.%06jds avg %6jd.%06jds max %6jd.%06jds\n",
 	    medpr / SBT_1S, (medpr & 0xffffffff) * 1000000 >> 32,
 	    avgpr / 1000000, avgpr % 1000000,
 	    maxpr / SBT_1S, (maxpr & 0xffffffff) * 1000000 >> 32);
 	printf("  Distribution:       \tbuckets\t   time\t   tcum\t"
 	    "   prec\t   pcum\n");
 	for (i = 0, tcum = pcum = 0; i < 64; i++) {
 		if (ct[i] == 0 && cpr[i] == 0)
 			continue;
 		t = (i != 0) ? (((sbintime_t)1) << (i - 1)) : 0;
 		tcum += ct[i];
 		pcum += cpr[i];
 		printf("  %10jd.%06jds\t 2**%d\t%7d\t%7d\t%7d\t%7d\n",
 		    t / SBT_1S, (t & 0xffffffff) * 1000000 >> 32,
 		    i - 1 - (32 - CC_HASH_SHIFT),
 		    ct[i], tcum, cpr[i], pcum);
 	}
 	return (error);
 }
 SYSCTL_PROC(_kern, OID_AUTO, callout_stat,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     0, 0, sysctl_kern_callout_stat, "I",
     "Dump immediate statistic snapshot of the scheduled callouts");
 
 #ifdef DDB
 /*
  * DDB helper: print the address of callout `c' followed by each of its
  * fields, one per line.
  */
 static void
 _show_callout(struct callout *c)
 {
 
 	db_printf("callout %p\n", c);
 	/* Prints "   <field> = <value>" using the member name as the label. */
 #define	C_DB_PRINTF(f, e)	db_printf("   %s = " f "\n", #e, c->e);
 	/* c_links is shown by address rather than by value. */
 	db_printf("   &c_links = %p\n", &(c->c_links));
 	C_DB_PRINTF("%" PRId64,	c_time);
 	C_DB_PRINTF("%" PRId64,	c_precision);
 	C_DB_PRINTF("%p",	c_arg);
 	C_DB_PRINTF("%p",	c_func);
 	C_DB_PRINTF("%p",	c_lock);
 	C_DB_PRINTF("%#x",	c_flags);
 	C_DB_PRINTF("%#x",	c_iflags);
 	C_DB_PRINTF("%d",	c_cpu);
 #undef	C_DB_PRINTF
 }
 
 /* DDB "show callout <addr>": dump the callout structure at the address. */
 DB_SHOW_COMMAND(callout, db_show_callout)
 {
 
 	if (have_addr) {
 		_show_callout((struct callout *)addr);
 		return;
 	}
 
 	db_printf("usage: show callout <struct callout *>\n");
 }
 
 static void
 _show_last_callout(int cpu, int direct, const char *dirstr)
 {
 	struct callout_cpu *cc;
 	void *func, *arg;
 
 	cc = CC_CPU(cpu);
 	func = cc_exec_last_func(cc, direct);
 	arg = cc_exec_last_arg(cc, direct);
 	db_printf("cpu %d last%s callout function: %p ", cpu, dirstr, func);
 	db_printsym((db_expr_t)func, DB_STGY_ANY);
 	db_printf("\ncpu %d last%s callout argument: %p\n", cpu, dirstr, arg);
 }
 
 /*
  * DDB "show callout_last [cpu]": print the last callout of one CPU when an
  * address is given, otherwise of every present CPU from 0 to mp_maxid.
  */
 DB_SHOW_COMMAND(callout_last, db_show_callout_last)
 {
 	int cpu, first, last;
 
 	if (!have_addr) {
 		first = 0;
 		last = mp_maxid;
 	} else if (addr < 0 || addr > mp_maxid || CPU_ABSENT(addr)) {
 		db_printf("no such cpu: %d\n", (int)addr);
 		return;
 	} else {
 		first = last = addr;
 	}
 
 	for (cpu = first; cpu <= last; cpu++) {
 		if (CPU_ABSENT(cpu))
 			continue;
 		_show_last_callout(cpu, 0, "");
 		_show_last_callout(cpu, 1, " direct");
 	}
 }
 #endif /* DDB */
Index: projects/hps_callouts/sys/netinet/tcp_hpts.c
===================================================================
--- projects/hps_callouts/sys/netinet/tcp_hpts.c	(revision 352141)
+++ projects/hps_callouts/sys/netinet/tcp_hpts.c	(revision 352142)
@@ -1,2025 +1,2025 @@
 /*-
  * Copyright (c) 2016-2018 Netflix, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_tcpdebug.h"
 /**
  * Some notes about usage.
  *
  * The tcp_hpts system is designed to provide a high precision timer
  * system for tcp. Its main purpose is to provide a mechanism for 
  * pacing packets out onto the wire. It can be used in two ways
  * by a given TCP stack (and those two methods can be used simultaneously).
  *
  * First, and probably the main thing it is used for by Rack and BBR, it can
  * be used to call tcp_output() of a transport stack at some time in the future.
  * The normal way this is done is that tcp_output() of the stack schedules
  * itself to be called again by calling tcp_hpts_insert(tcpcb, slot). The
  * slot is the time from now that the stack wants to be called but it
  * must be converted to tcp_hpts's notion of slot. This is done with
  * one of the macros HPTS_MS_TO_SLOTS or HPTS_USEC_TO_SLOTS. So a typical
  * call from the tcp_output() routine might look like:
  *
  * tcp_hpts_insert(tp, HPTS_USEC_TO_SLOTS(550));
  *
  * The above would schedule tcp_output() to be called in 550 useconds.
  * Note that if using this mechanism the stack will want to add near
  * its top a check to prevent unwanted calls (from user land or the
  * arrival of incoming ack's). So it would add something like:
  *
  * if (inp->inp_in_hpts)
  *    return;
  *
  * to prevent output processing until the time allotted has gone by.
  * Of course this is a bare bones example and the stack will probably
  * have more consideration than just the above.
  * 
  * Now the second function (actually two functions I guess :D)
  * the tcp_hpts system provides is the  ability to either abort 
  * a connection (later) or process input on a connection. 
  * Why would you want to do this? To keep processor locality
  * and or not have to worry about untangling any recursive
  * locks. The input function now is hooked to the new LRO
  * system as well. 
  *
  * In order to use the input redirection function the
  * tcp stack must define an input function for 
  * tfb_do_queued_segments(). This function understands
  * how to dequeue a array of packets that were input and
  * knows how to call the correct processing routine. 
  *
  * Locking in this is important as well so most likely the 
  * stack will need to define the tfb_do_segment_nounlock()
  * splitting tfb_do_segment() into two parts. The main processing
  * part that does not unlock the INP and returns a value of 1 or 0.
  * It returns 0 if all is well and the lock was not released. It
  * returns 1 if we had to destroy the TCB (a reset received etc).
  * The remains of tfb_do_segment() then become just a simple call
  * to the tfb_do_segment_nounlock() function and check the return
  * code and possibly unlock.
  * 
  * The stack must also set the flag on the INP that it supports this
  * feature i.e. INP_SUPPORTS_MBUFQ. The LRO code recognizes
  * this flag as well and will queue packets when it is set.
  * There are other flags as well INP_MBUF_QUEUE_READY and
  * INP_DONT_SACK_QUEUE. The first flag tells the LRO code
  * that we are in the pacer for output so there is no
  * need to wake up the hpts system to get immediate
  * input. The second tells the LRO code that its okay
  * if a SACK arrives you can still defer input and let
  * the current hpts timer run (this is usually set when
  * a rack timer is up so we know SACK's are happening
  * on the connection already and don't want to wakeup yet).
  *
  * There is a common function within the rack_bbr_common code
  * version i.e. ctf_do_queued_segments(). This function
  * knows how to take the input queue of packets from 
  * tp->t_in_pkts and process them digging out 
  * all the arguments, calling any bpf tap and 
  * calling into tfb_do_segment_nounlock(). The common
  * function (ctf_do_queued_segments())  requires that 
  * you have defined the tfb_do_segment_nounlock() as
  * described above.
  *
  * The second feature of the input side of hpts is the
  * dropping of a connection. This is due to the way that
  * locking may have occurred on the INP_WLOCK. So if
  * a stack wants to drop a connection it calls:
  *
  *     tcp_set_inp_to_drop(tp, ETIMEDOUT)
  * 
  * To schedule the tcp_hpts system to call 
  * 
  *    tcp_drop(tp, drop_reason)
  *
  * at a future point. This is quite handy to prevent locking
  * issues when dropping connections.
  *
  */
 
 #include <sys/param.h>
 #include <sys/bus.h>
 #include <sys/interrupt.h>
 #include <sys/module.h>
 #include <sys/kernel.h>
 #include <sys/hhook.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/proc.h>		/* for proc0 declaration */
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/refcount.h>
 #include <sys/sched.h>
 #include <sys/queue.h>
 #include <sys/smp.h>
 #include <sys/counter.h>
 #include <sys/time.h>
 #include <sys/kthread.h>
 #include <sys/kern_prefetch.h>
 
 #include <vm/uma.h>
 #include <vm/vm.h>
 
 #include <net/route.h>
 #include <net/vnet.h>
 
 #define TCPSTATES		/* for logging */
 
 #include <netinet/in.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip.h>
 #include <netinet/ip_icmp.h>	/* required for icmp_var.h */
 #include <netinet/icmp_var.h>	/* for ICMP_BANDLIM */
 #include <netinet/ip_var.h>
 #include <netinet/ip6.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet6/ip6_var.h>
 #include <netinet/tcp.h>
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcpip.h>
 #include <netinet/cc/cc.h>
 #include <netinet/tcp_hpts.h>
 #include <netinet/tcp_log_buf.h>
 
 #ifdef tcpdebug
 #include <netinet/tcp_debug.h>
 #endif				/* tcpdebug */
 #ifdef tcp_offload
 #include <netinet/tcp_offload.h>
 #endif
 
 #include "opt_rss.h"
 
 MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts", "TCP hpts");
 /*
  * Thread binding mode; defaults to 1 with RSS and 2 without, and can be
  * overridden with the net.inet.tcp.bind_hptss tunable.
  * NOTE(review): the meaning of each value is not visible here -- confirm
  * against the init code.
  */
 #ifdef RSS
 static int tcp_bind_threads = 1;
 #else
 static int tcp_bind_threads = 2;
 #endif
 TUNABLE_INT("net.inet.tcp.bind_hptss", &tcp_bind_threads);
 
 static struct tcp_hptsi tcp_pace;
 static int hpts_does_tp_logging = 0;
 
 static void tcp_wakehpts(struct tcp_hpts_entry *p);
 static void tcp_wakeinput(struct tcp_hpts_entry *p);
 static void tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv);
 static void tcp_hptsi(struct tcp_hpts_entry *hpts);
 static void tcp_hpts_thread(void *ctx);
 static void tcp_init_hptsi(void *st);
 
 int32_t tcp_min_hptsi_time = DEFAULT_MIN_SLEEP;
 static int32_t tcp_hpts_callout_skip_swi = 0;
 
 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hpts, CTLFLAG_RW, 0, "TCP Hpts controls");
 
 /*
  * vvp = tvp - uvp for struct timeval pointers; a negative tv_usec result
  * is normalized by borrowing one second.
  */
 #define	timersub(tvp, uvp, vvp)						\
 	do {								\
 		(vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec;		\
 		(vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec;	\
 		if ((vvp)->tv_usec < 0) {				\
 			(vvp)->tv_sec--;				\
 			(vvp)->tv_usec += 1000000;			\
 		}							\
 	} while (0)
 
 static int32_t tcp_hpts_precision = 120;
 
 /* Per memory domain: number of CPUs recorded and their ids. */
 struct hpts_domain_info {
 	int count;
 	int cpu[MAXCPU];
 };
 
 struct hpts_domain_info hpts_domains[MAXMEMDOM];
 
 SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, precision, CTLFLAG_RW,
     &tcp_hpts_precision, 120,
     "Value for PRE() precision of callout");
 
 /* Read-only statistics counters exported under net.inet.tcp.hpts. */
 counter_u64_t hpts_hopelessly_behind;
 
 SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, hopeless, CTLFLAG_RD,
     &hpts_hopelessly_behind,
     "Number of times hpts could not catch up and was behind hopelessly");
 
 counter_u64_t hpts_loops;
 
 SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, loops, CTLFLAG_RD,
     &hpts_loops, "Number of times hpts had to loop to catch up");
 
 
 counter_u64_t back_tosleep;
 
 SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, no_tcbsfound, CTLFLAG_RD,
     &back_tosleep, "Number of times hpts found no tcbs");
 
 /*
  * NOTE(review): comb_wheel_wrap and wheel_wrap below carry identical
  * description strings; confirm whether one of them should differ.
  */
 counter_u64_t combined_wheel_wrap;
 
 SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, comb_wheel_wrap, CTLFLAG_RD,
     &combined_wheel_wrap, "Number of times the wheel lagged enough to have an insert see wrap");
 
 counter_u64_t wheel_wrap;
 
 SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, wheel_wrap, CTLFLAG_RD,
     &wheel_wrap, "Number of times the wheel lagged enough to have an insert see wrap");
 
 static int32_t out_ts_percision = 0;
 
 SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, out_tspercision, CTLFLAG_RW,
     &out_ts_percision, 0,
     "Do we use a percise timestamp for every output cts");
 SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, logging, CTLFLAG_RW,
     &hpts_does_tp_logging, 0,
     "Do we add to any tp that has logging on pacer logs");
 
 static int32_t max_pacer_loops = 10;
 SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, loopmax, CTLFLAG_RW,
     &max_pacer_loops, 10,
     "What is the maximum number of times the pacer will loop trying to catch up");
 
 /* Upper bound for hpts_sleep_max: half of the wheel's slots. */
 #define HPTS_MAX_SLEEP_ALLOWED (NUM_OF_HPTSI_SLOTS/2)
 
 static uint32_t hpts_sleep_max = HPTS_MAX_SLEEP_ALLOWED;
 
 
 static int
 sysctl_net_inet_tcp_hpts_max_sleep(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	uint32_t new;
 
 	new = hpts_sleep_max;
 	error = sysctl_handle_int(oidp, &new, 0, req);
 	if (error == 0 && req->newptr) {
 		if ((new < (NUM_OF_HPTSI_SLOTS / 4)) ||
 		    (new > HPTS_MAX_SLEEP_ALLOWED)) 
 			error = EINVAL;
 		else
 			hpts_sleep_max = new;
 	}
 	return (error);
 }
 
 /* maxsleep is range-checked by sysctl_net_inet_tcp_hpts_max_sleep(). */
 SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, maxsleep,
     CTLTYPE_UINT | CTLFLAG_RW,
     &hpts_sleep_max, 0,
     &sysctl_net_inet_tcp_hpts_max_sleep, "IU",
     "Maximum time hpts will sleep");
 
 /* minsleep and skip_swi are plain read/write integers with no handler. */
 SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, minsleep, CTLFLAG_RW,
     &tcp_min_hptsi_time, 0,
     "The minimum time the hpts must sleep before processing more slots");
 
 SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, skip_swi, CTLFLAG_RW,
     &tcp_hpts_callout_skip_swi, 0,
     "Do we have the callout call directly to the hpts?");
 
 /*
  * Record a BBR_LOG_HPTSDIAG event for `tp' carrying a snapshot of the
  * pacer state in `hpts'.
  *
  * hpts:         pacer entry whose slot/tick counters are copied into the log
  * tp:           connection the event is logged against
  * tv:           time of the event; also converted to a usec tick timestamp
  * ticks_to_run: stored in the record's `inflight' field
  * idx:          stored in the record's `flex4' field
  */
 static void
 tcp_hpts_log(struct tcp_hpts_entry *hpts, struct tcpcb *tp, struct timeval *tv,
 	     int ticks_to_run, int idx)
 {
 	union tcp_log_stackspecific log;
 	
 	/* The u_bbr fields are reused here to carry hpts diagnostics. */
 	memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 	log.u_bbr.flex1 = hpts->p_nxt_slot;
 	log.u_bbr.flex2 = hpts->p_cur_slot;
 	log.u_bbr.flex3 = hpts->p_prev_slot;
 	log.u_bbr.flex4 = idx;
 	log.u_bbr.flex5 = hpts->p_curtick;
 	log.u_bbr.flex6 = hpts->p_on_queue_cnt;
 	log.u_bbr.use_lt_bw = 1;
 	log.u_bbr.inflight = ticks_to_run;
 	log.u_bbr.applimited = hpts->overidden_sleep;
 	log.u_bbr.delivered = hpts->saved_curtick;
 	log.u_bbr.timeStamp = tcp_tv_to_usectick(tv);
 	log.u_bbr.epoch = hpts->saved_curslot;
 	log.u_bbr.lt_epoch = hpts->saved_prev_slot;
 	log.u_bbr.pkts_out = hpts->p_delayed_by;
 	log.u_bbr.lost = hpts->p_hpts_sleep_time;
 	log.u_bbr.cur_del_rate = hpts->p_runningtick;
 	/*
 	 * NOTE(review): tp->t_inpcb->inp_socket is dereferenced in the
 	 * macro arguments; confirm callers guarantee both are non-NULL.
 	 */
 	TCP_LOG_EVENTP(tp, NULL,
 		       &tp->t_inpcb->inp_socket->so_rcv,
 		       &tp->t_inpcb->inp_socket->so_snd,
 		       BBR_LOG_HPTSDIAG, 0,
 		       0, &log, false, tv);
 }
 
 /*
  * Timeout handler that schedules the software interrupt registered for
  * the hpts entry passed in `arg'.
  */
 static void
 hpts_timeout_swi(void *arg)
 {
 	struct tcp_hpts_entry *entry = arg;
 
 	swi_sched(entry->ie_cookie, 0);
 }
 
 /*
  * Callout handler that passes 'arg' straight to tcp_hpts_thread() instead
  * of scheduling the software interrupt (compare hpts_timeout_swi()).
  */
 static void
 hpts_timeout_dir(void *arg)
 {
 	tcp_hpts_thread(arg);
 }
 
 /*
  * Unlink 'inp' from the wheel slot list 'head' and account for it in
  * p_on_queue_cnt.  With INVARIANTS the hpts mutex ownership, the CPU
  * assignment and the inp_in_hpts flag are verified first.  When 'clear'
  * is non-zero the inp's pending request and on-hpts flag are reset too.
  * No inpcb reference is released here.
  */
 static inline void
 hpts_sane_pace_remove(struct tcp_hpts_entry *hpts, struct inpcb *inp, struct hptsh *head, int clear)
 {
 #ifdef INVARIANTS
 	/* We don't own the mutex? */
 	if (!mtx_owned(&hpts->p_mtx))
 		panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp);
 	/* It is not the right cpu/mutex? */
 	if (inp->inp_hpts_cpu != hpts->p_cpu)
 		panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp);
 	/* We are not on the hpts? */
 	if (!inp->inp_in_hpts)
 		panic("%s: hpts:%p inp:%p not on the hpts?", __FUNCTION__, hpts, inp);
 #endif
 	TAILQ_REMOVE(head, inp, inp_hpts);
 	hpts->p_on_queue_cnt--;
 	if (hpts->p_on_queue_cnt < 0) {
 		/* The count must never drop below zero; repair it. */
 #ifdef INVARIANTS
 		panic("Hpts goes negative inp:%p hpts:%p",
 		    inp, hpts);
 #endif
 		hpts->p_on_queue_cnt = 0;
 	}
 	if (!clear)
 		return;
 	inp->inp_hpts_request = 0;
 	inp->inp_in_hpts = 0;
 }
 
 /*
  * Append 'inp' to the wheel slot list 'head', mark it as being on the
  * hpts and bump p_on_queue_cnt.  Unless 'noref' is set an inpcb
  * reference is taken.  With INVARIANTS the mutex ownership and CPU
  * assignment are checked, and (for noref == 0) that the inp is not
  * already on the hpts.  'line' is accepted but not used here.
  */
 static inline void
 hpts_sane_pace_insert(struct tcp_hpts_entry *hpts, struct inpcb *inp, struct hptsh *head, int line, int noref)
 {
 #ifdef INVARIANTS
 	/* We don't own the mutex? */
 	if (!mtx_owned(&hpts->p_mtx))
 		panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp);
 	/* It is not the right cpu/mutex? */
 	if (inp->inp_hpts_cpu != hpts->p_cpu)
 		panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp);
 	/* We are already on the hpts? */
 	if (!noref && (inp->inp_in_hpts == 1))
 		panic("%s: hpts:%p inp:%p already on the hpts?", __FUNCTION__, hpts, inp);
 #endif
 	TAILQ_INSERT_TAIL(head, inp, inp_hpts);
 	hpts->p_on_queue_cnt++;
 	inp->inp_in_hpts = 1;
 	if (!noref)
 		in_pcbref(inp);
 }
 
 /*
  * Unlink 'inp' from the input queue of 'hpts' and account for it in
  * p_on_inqueue_cnt.  With INVARIANTS the mutex ownership, the input CPU
  * assignment, the inp_in_input flag and the queue/count agreement are
  * verified.  When 'clear' is non-zero inp_in_input is reset.
  */
 static inline void
 hpts_sane_input_remove(struct tcp_hpts_entry *hpts, struct inpcb *inp, int clear)
 {
 #ifdef INVARIANTS
 	/* We don't own the mutex? */
 	if (!mtx_owned(&hpts->p_mtx))
 		panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp);
 	/* It is not the right cpu/mutex? */
 	if (inp->inp_input_cpu != hpts->p_cpu)
 		panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp);
 	/* We are not on the input hpts? */
 	if (!inp->inp_in_input)
 		panic("%s: hpts:%p inp:%p not on the input hpts?", __FUNCTION__, hpts, inp);
 #endif
 	TAILQ_REMOVE(&hpts->p_input, inp, inp_input);
 	hpts->p_on_inqueue_cnt--;
 	if (hpts->p_on_inqueue_cnt < 0) {
 		/* The count must never drop below zero; repair it. */
 #ifdef INVARIANTS
 		panic("Hpts in goes negative inp:%p hpts:%p",
 		    inp, hpts);
 #endif
 		hpts->p_on_inqueue_cnt = 0;
 	}
 #ifdef INVARIANTS
 	/* We should not be empty with a queue count */
 	if ((hpts->p_on_inqueue_cnt != 0) &&
 	    TAILQ_EMPTY(&hpts->p_input)) {
 		panic("%s hpts:%p in_hpts input empty but cnt:%d",
 		    __FUNCTION__, hpts, hpts->p_on_inqueue_cnt);
 	}
 #endif
 	if (clear) {
 		inp->inp_in_input = 0;
 	}
 }
 
 /*
  * Append 'inp' to the input queue of 'hpts', mark it as queued, bump
  * p_on_inqueue_cnt and take an inpcb reference.  With INVARIANTS the
  * mutex ownership and input CPU assignment are checked, and that the
  * inp is not already queued.  'line' is accepted but not used here.
  */
 static inline void
 hpts_sane_input_insert(struct tcp_hpts_entry *hpts, struct inpcb *inp, int line)
 {
 #ifdef INVARIANTS
 	/* We don't own the mutex? */
 	if (!mtx_owned(&hpts->p_mtx))
 		panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp);
 	/* It is not the right cpu/mutex? */
 	if (inp->inp_input_cpu != hpts->p_cpu)
 		panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp);
 	/* We are already on the input hpts? */
 	if (inp->inp_in_input == 1)
 		panic("%s: hpts:%p inp:%p already on the input hpts?", __FUNCTION__, hpts, inp);
 #endif
 	TAILQ_INSERT_TAIL(&hpts->p_input, inp, inp_input);
 	hpts->p_on_inqueue_cnt++;
 	inp->inp_in_input = 1;
 	in_pcbref(inp);
 }
 
 /*
  * Schedule the software interrupt of 'hpts' unless a wakeup is already
  * pending (p_hpts_wake_scheduled).  The hpts mutex is asserted.
  */
 static void
 tcp_wakehpts(struct tcp_hpts_entry *hpts)
 {
 	HPTS_MTX_ASSERT(hpts);
 	if (hpts->p_hpts_wake_scheduled != 0)
 		return;
 	hpts->p_hpts_wake_scheduled = 1;
 	swi_sched(hpts->ie_cookie, 0);
 }
 
 /*
  * Input-side counterpart of tcp_wakehpts(): schedule the software
  * interrupt of 'hpts' unless a wakeup is already pending.  The hpts
  * mutex is asserted.
  */
 static void
 tcp_wakeinput(struct tcp_hpts_entry *hpts)
 {
 	HPTS_MTX_ASSERT(hpts);
 	if (hpts->p_hpts_wake_scheduled != 0)
 		return;
 	hpts->p_hpts_wake_scheduled = 1;
 	swi_sched(hpts->ie_cookie, 0);
 }
 
 /*
  * Return (without locking it) the hpts entry selected by the
  * inp_hpts_cpu field of 'inp'.
  */
 struct tcp_hpts_entry *
 tcp_cur_hpts(struct inpcb *inp)
 {
 	int32_t cpu_idx = inp->inp_hpts_cpu;
 
 	return (tcp_pace.rp_ent[cpu_idx]);
 }
 
 /*
  * Lock and return the hpts entry selected by inp_hpts_cpu.
  *
  * The CPU assignment is sampled before the mutex is taken and compared
  * again afterwards; if it changed in between, the mutex is dropped and
  * the lookup is repeated.
  */
 struct tcp_hpts_entry *
 tcp_hpts_lock(struct inpcb *inp)
 {
 	struct tcp_hpts_entry *entry;
 	int32_t cpu_idx;
 
 	for (;;) {
 		cpu_idx = inp->inp_hpts_cpu;
 		entry = tcp_pace.rp_ent[cpu_idx];
 #ifdef INVARIANTS
 		if (mtx_owned(&entry->p_mtx)) {
 			panic("Hpts:%p owns mtx prior-to lock line:%d",
 			    entry, __LINE__);
 		}
 #endif
 		mtx_lock(&entry->p_mtx);
 		if (cpu_idx == inp->inp_hpts_cpu)
 			return (entry);
 		/* The assignment moved while we waited; try again. */
 		mtx_unlock(&entry->p_mtx);
 	}
 }
 
 /*
  * Lock and return the hpts entry selected by inp_input_cpu.
  *
  * The CPU assignment is sampled before the mutex is taken and compared
  * again afterwards; if it changed in between, the mutex is dropped and
  * the lookup is repeated.
  */
 struct tcp_hpts_entry *
 tcp_input_lock(struct inpcb *inp)
 {
 	struct tcp_hpts_entry *entry;
 	int32_t cpu_idx;
 
 	for (;;) {
 		cpu_idx = inp->inp_input_cpu;
 		entry = tcp_pace.rp_ent[cpu_idx];
 #ifdef INVARIANTS
 		if (mtx_owned(&entry->p_mtx)) {
 			panic("Hpts:%p owns mtx prior-to lock line:%d",
 			    entry, __LINE__);
 		}
 #endif
 		mtx_lock(&entry->p_mtx);
 		if (cpu_idx == inp->inp_input_cpu)
 			return (entry);
 		/* The assignment moved while we waited; try again. */
 		mtx_unlock(&entry->p_mtx);
 	}
 }
 
 /*
  * Release the inpcb reference held on behalf of the hpts.
  *
  * INP_FREED is masked for the duration of the release and put back
  * afterwards, so that in_pcbrele_wlocked() does not return 1 when it
  * really should have returned 0.  A non-zero return is treated as fatal:
  * the inpcb is referred to by the main socket (why we are called) and by
  * the hpts, so the release should always return 0.
  */
 static void
 tcp_remove_hpts_ref(struct inpcb *inp, struct tcp_hpts_entry *hpts, int line)
 {
 	int32_t was_freed;
 
 	was_freed = (inp->inp_flags2 & INP_FREED) ? 1 : 0;
 	if (was_freed)
 		inp->inp_flags2 &= ~INP_FREED;
 #ifndef INP_REF_DEBUG
 	if (in_pcbrele_wlocked(inp))
 #else
 	if (__in_pcbrele_wlocked(inp, line))
 #endif
 	{
 		panic("inpcb:%p release ret 1",
 		    inp);
 	}
 	if (was_freed)
 		inp->inp_flags2 |= INP_FREED;
 }
 
 /*
  * If 'inp' sits on the output wheel, pull it out of its slot (clearing
  * its hpts state) and drop the reference the hpts held.  'flags' is not
  * examined here.
  */
 static void
 tcp_hpts_remove_locked_output(struct tcp_hpts_entry *hpts, struct inpcb *inp, int32_t flags, int32_t line)
 {
 	if (!inp->inp_in_hpts)
 		return;
 	hpts_sane_pace_remove(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], 1);
 	tcp_remove_hpts_ref(inp, hpts, line);
 }
 
 /*
  * If 'inp' sits on the input queue, pull it off (clearing inp_in_input)
  * and drop the reference the hpts held.  The hpts mutex is asserted;
  * 'flags' is not examined here.
  */
 static void
 tcp_hpts_remove_locked_input(struct tcp_hpts_entry *hpts, struct inpcb *inp, int32_t flags, int32_t line)
 {
 	HPTS_MTX_ASSERT(hpts);
 	if (!inp->inp_in_input)
 		return;
 	hpts_sane_input_remove(hpts, inp, 1);
 	tcp_remove_hpts_ref(inp, hpts, line);
 }
 
 /*
  * Remove an inp from the hpts output wheel and/or input queue.
  *
  * The inpcb write lock is asserted on entry; the lock order allows
  * holding the INP lock and then taking the hpts lock, which is what
  * protects the queues themselves.
  *
  * Valid values in the flags are
  * HPTS_REMOVE_OUTPUT - remove from the output of the hpts.
  * HPTS_REMOVE_INPUT - remove from the input of the hpts.
  * Both may be set together, in which case both actions are performed
  * (output first, then input), each under the mutex of the hpts that the
  * respective side is assigned to.
  */
 void
 __tcp_hpts_remove(struct inpcb *inp, int32_t flags, int32_t line)
 {
 	struct tcp_hpts_entry *entry;
 
 	INP_WLOCK_ASSERT(inp);
 	if ((flags & HPTS_REMOVE_OUTPUT) != 0) {
 		entry = tcp_hpts_lock(inp);
 		tcp_hpts_remove_locked_output(entry, inp, flags, line);
 		mtx_unlock(&entry->p_mtx);
 	}
 	if ((flags & HPTS_REMOVE_INPUT) != 0) {
 		entry = tcp_input_lock(inp);
 		tcp_hpts_remove_locked_input(entry, inp, flags, line);
 		mtx_unlock(&entry->p_mtx);
 	}
 }
 
 /*
  * Advance the wheel slot 'wheel_tick' by 'plus' ticks, wrapping around at
  * NUM_OF_HPTSI_SLOTS, and return the resulting slot.
  */
 static inline int
 hpts_tick(uint32_t wheel_tick, uint32_t plus)
 {
 	uint32_t advanced;
 
 	KASSERT(wheel_tick < NUM_OF_HPTSI_SLOTS, ("Invalid tick %u not on wheel", wheel_tick));
 	advanced = wheel_tick + plus;
 	return (advanced % NUM_OF_HPTSI_SLOTS);
 }
 
 /*
  * Fold a timestamp expressed in wheel ticks (10usec inc's) onto the
  * limited space of the wheel and return the slot it lands in.
  */
 static inline int
 tick_to_wheel(uint32_t cts_in_wticks)
 {
 	uint32_t slot = cts_in_wticks % NUM_OF_HPTSI_SLOTS;
 
 	return (slot);
 }
 
 /*
  * Distance, walking forward around the wheel, from 'prev_tick' to
  * 'tick_now'.  Two identical slots are treated as a full trip around the
  * wheel less one slot rather than as zero.
  */
 static inline int
 hpts_ticks_diff(int prev_tick, int tick_now)
 {
 	/* Special case: same slot means all of the wheel less one. */
 	if (tick_now == prev_tick)
 		return (NUM_OF_HPTSI_SLOTS - 1);
 	/* No wrap between the two slots. */
 	if (tick_now > prev_tick)
 		return (tick_now - prev_tick);
 	/* tick_now is behind prev_tick: go through the end of the wheel. */
 	return ((NUM_OF_HPTSI_SLOTS - prev_tick) + tick_now);
 }
 
 /*
  * Given a tick on the wheel that is the current time
  * mapped to the wheel (wheel_tick), what is the maximum
  * distance forward that can be obtained without
  * wrapping past either prev_tick or running_tick
  * depending on the hpts state?
  *
  * target_tick is optional.  When it is not NULL it is filled with the
  * tick location: the slot just behind p_runningtick (pacer actively
  * walking the wheel) or just behind p_prev_slot (pacer idle or wheel
  * pass complete), or p_nxt_slot when the wheel has wrapped.
  *
  * Returns the number of slots that may be used, or the special value 0
  * when the pacer is so far behind that nothing fits on the wheel.
  *
  * Note if you do not give this function the current
  * time (that you think it is) mapped to the wheel
  * then the results will not be what you expect and
  * could lead to invalid inserts.
  */
 static inline int32_t
 max_ticks_available(struct tcp_hpts_entry *hpts, uint32_t wheel_tick, uint32_t *target_tick)
 {
 	uint32_t dis_to_travel, end_tick, pacer_to_now, avail_on_wheel;
 
 	if ((hpts->p_hpts_active == 1) &&
 	    (hpts->p_wheel_complete == 0)) {
 		end_tick = hpts->p_runningtick;
 		/* Back up one tick */
 		if (end_tick == 0)
 			end_tick = NUM_OF_HPTSI_SLOTS - 1;
 		else
 			end_tick--;
 		if (target_tick)
 			*target_tick = end_tick;
 	} else {
 		/*
 		 * For the case where we are
 		 * not active, or we have
 		 * completed the pass over
 		 * the wheel, we can use the
 		 * prev tick and subtract one from it. This puts us
 		 * as far out as possible on the wheel.
 		 */
 		end_tick = hpts->p_prev_slot;
 		if (end_tick == 0)
 			end_tick = NUM_OF_HPTSI_SLOTS - 1;
 		else
 			end_tick--;
 		if (target_tick)
 			*target_tick = end_tick;
 		/*
 		 * Now we have close to the full wheel left minus the
 		 * time it has been since the pacer went to sleep. Note
 		 * that wheel_tick, passed in, should be the current time
 		 * from the perspective of the caller, mapped to the wheel.
 		 */
 		if (hpts->p_prev_slot != wheel_tick)
 			dis_to_travel = hpts_ticks_diff(hpts->p_prev_slot, wheel_tick);
 		else
 			dis_to_travel = 1;
 		/*
 		 * dis_to_travel in this case is the space from when the
 		 * pacer stopped (p_prev_slot) and where our wheel_tick
 		 * is now. To know how many slots we can put it in we
 		 * subtract from the wheel size. We would not want
 		 * to place something after p_prev_slot or it will
 		 * get ran too soon.
 		 */
 		return (NUM_OF_HPTSI_SLOTS - dis_to_travel);
 	}
 	/*
 	 * So how many slots are open between p_runningtick -> p_cur_slot
 	 * that is what is currently un-available for insertion. Special
 	 * case when we are at the last slot, this gets 1, so that
 	 * the answer to how many slots are available is all but 1.
 	 */
 	if (hpts->p_runningtick == hpts->p_cur_slot)
 		dis_to_travel = 1;
 	else
 		dis_to_travel = hpts_ticks_diff(hpts->p_runningtick, hpts->p_cur_slot);
 	/*
 	 * How long has the pacer been running?
 	 */
 	if (hpts->p_cur_slot != wheel_tick) {
 		/* The pacer is a bit late */
 		pacer_to_now = hpts_ticks_diff(hpts->p_cur_slot, wheel_tick);
 	} else {
 		/* The pacer is right on time, now == pacers start time */
 		pacer_to_now = 0;
 	}
 	/*
 	 * To get the number left we can insert into we simply
 	 * subtract the distance the pacer has to run from how
 	 * many slots there are.
 	 */
 	avail_on_wheel = NUM_OF_HPTSI_SLOTS - dis_to_travel;
 	/*
 	 * Now how many of those we will eat due to the pacer's
 	 * time (p_cur_slot) of start being behind the
 	 * real time (wheel_tick)?
 	 */
 	if (avail_on_wheel <= pacer_to_now) {
 		/*
 		 * Wheel wrap, we can't fit on the wheel, that
 		 * is unusual the system must be way overloaded!
 		 * Insert into the assured tick, and return special
 		 * "0".
 		 */
 		counter_u64_add(combined_wheel_wrap, 1);
 		/*
 		 * target_tick is optional everywhere else in this
 		 * function; honor that here too instead of writing
 		 * through a NULL pointer.
 		 */
 		if (target_tick)
 			*target_tick = hpts->p_nxt_slot;
 		return (0);
 	} else {
 		/*
 		 * We know how many slots are open
 		 * on the wheel (the reverse of what
 		 * is left to run. Take away the time
 		 * the pacer started to now (wheel_tick)
 		 * and that tells you how many slots are
 		 * open that can be inserted into that won't
 		 * be touched by the pacer until later.
 		 */
 		return (avail_on_wheel - pacer_to_now);
 	}
 }
 
 /*
  * Queue 'inp' on the output wheel so that it runs as soon as possible.
  * The hpts mutex is asserted.  Nothing is done if the inp is already on
  * the hpts.
  *
  * Returns 1 if the sleeping hpts was woken up for this insert, 0
  * otherwise.
  */
 static int
 tcp_queue_to_hpts_immediate_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line, int32_t noref)
 {
 	uint32_t woke = 0;
 
 	HPTS_MTX_ASSERT(hpts);
 	if (inp->inp_in_hpts != 0)
 		return (woke);
 
 	/* Ok we need to set it on the hpts in the current slot */
 	inp->inp_hpts_request = 0;
 	if ((hpts->p_hpts_active != 0) && !(hpts->p_wheel_complete)) {
 		if ((void *)inp == hpts->p_inp) {
 			/*
 			 * The hpts system is running and the caller
 			 * was awoken by the hpts system.
 			 * We can't allow you to go into the same slot we
 			 * are in (we don't want a loop :-D).
 			 */
 			inp->inp_hptsslot = hpts->p_nxt_slot;
 		} else {
 			inp->inp_hptsslot = hpts->p_runningtick;
 		}
 	} else {
 		/*
 		 * A sleeping hpts we want in next slot to run
 		 * note that in this state p_prev_slot == p_cur_slot
 		 */
 		inp->inp_hptsslot = hpts_tick(hpts->p_prev_slot, 1);
 		if ((hpts->p_hpts_active == 0) && (hpts->p_on_min_sleep == 0))
 			woke = 1;
 	}
 	hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, noref);
 	if (woke) {
 		/*
 		 * Activate the hpts if it is sleeping and its
 		 * timeout is not 1.
 		 */
 		hpts->p_direct_wake = 1;
 		tcp_wakehpts(hpts);
 	}
 	return (woke);
 }
 
 /*
  * Locking wrapper around tcp_queue_to_hpts_immediate_locked(): asserts
  * the inpcb write lock, takes the mutex of the inp's hpts, queues the
  * inp (taking a reference, noref == 0) and passes the helper's result
  * back to the caller.
  */
 int
 __tcp_queue_to_hpts_immediate(struct inpcb *inp, int32_t line)
 {
 	struct tcp_hpts_entry *entry;
 	int32_t woke;
 
 	INP_WLOCK_ASSERT(inp);
 	entry = tcp_hpts_lock(inp);
 	woke = tcp_queue_to_hpts_immediate_locked(inp, entry, line, 0);
 	mtx_unlock(&entry->p_mtx);
 	return (woke);
 }
 
 #ifdef INVARIANTS
 /*
  * INVARIANTS-only sanity check performed before an insert: panic if
  * 'inp_hptsslot' is off the wheel, or if the pacer is in the middle of a
  * pass and the slot lies inside the arc it still has to process.
  * 'line' is accepted but not used here.
  */
 static void
 check_if_slot_would_be_wrong(struct tcp_hpts_entry *hpts, struct inpcb *inp, uint32_t inp_hptsslot, int line)
 {
 	int distance, yet_to_run;
 
 	if (inp_hptsslot >= NUM_OF_HPTSI_SLOTS)
 		panic("hpts:%p inp:%p slot:%d > max",
 		      hpts, inp, inp_hptsslot);
 	/* Only a pacer that is mid-pass restricts where we may insert. */
 	if (!(hpts->p_hpts_active) || (hpts->p_wheel_complete != 0))
 		return;
 	/*
 	 * The pacer is processing an arc of the wheel, make sure we
 	 * are not inserting within that arc.
 	 */
 	distance = hpts_ticks_diff(hpts->p_runningtick, inp_hptsslot);
 	if (hpts->p_runningtick == hpts->p_cur_slot)
 		yet_to_run = 0;	/* processing last slot */
 	else
 		yet_to_run = hpts_ticks_diff(hpts->p_runningtick, hpts->p_cur_slot);
 	if (yet_to_run > distance) {
 		panic("hpts:%p inp:%p slot:%d distance:%d yet_to_run:%d rs:%d cs:%d",
 		      hpts, inp, inp_hptsslot,
 		      distance, yet_to_run,
 		      hpts->p_runningtick, hpts->p_cur_slot);
 	}
 }
 #endif
 
 /*
  * Place 'inp' on the output wheel of 'hpts' 'slot' ticks after the time
  * given in 'tv'.  The hpts mutex is asserted.
  *
  * - slot == 0 is handed to tcp_queue_to_hpts_immediate_locked().
  * - If the request fits in what max_ticks_available() reports, the inp
  *   goes straight into its slot; otherwise it is parked in the furthest
  *   slot reported and the remainder is kept in inp_hpts_request.
  * - If the hpts is sleeping (and not on a minimum sleep) it is either
  *   woken directly when overdue, or its callout is re-armed when it
  *   would otherwise sleep longer than 'slot'.
  *
  * When 'diag' is not NULL it is zeroed and filled with a snapshot of the
  * pacer state and of the decisions taken here.
  *
  * If the inp is already on the hpts this panics under INVARIANTS and is
  * a no-op otherwise.
  */
 static void
 tcp_hpts_insert_locked(struct tcp_hpts_entry *hpts, struct inpcb *inp, uint32_t slot, int32_t line,
 		       struct hpts_diag *diag, struct timeval *tv)
 {
 	uint32_t need_new_to = 0;
 	uint32_t wheel_cts, last_tick;
 	int32_t wheel_tick, maxticks;
 	int8_t need_wakeup = 0;
 
 	HPTS_MTX_ASSERT(hpts);
 	if (diag) {
 		/* Snapshot of the pacer state as seen on entry. */
 		memset(diag, 0, sizeof(struct hpts_diag));
 		diag->p_hpts_active = hpts->p_hpts_active;
 		diag->p_prev_slot = hpts->p_prev_slot;
 		diag->p_runningtick = hpts->p_runningtick;
 		diag->p_nxt_slot = hpts->p_nxt_slot;
 		diag->p_cur_slot = hpts->p_cur_slot;
 		diag->p_curtick = hpts->p_curtick;
 		diag->p_lasttick = hpts->p_lasttick;
 		diag->slot_req = slot;
 		diag->p_on_min_sleep = hpts->p_on_min_sleep;
 		diag->hpts_sleep_time = hpts->p_hpts_sleep_time;
 	}
 	if (inp->inp_in_hpts == 0) {
 		if (slot == 0) {
 			/* Immediate */
 			tcp_queue_to_hpts_immediate_locked(inp, hpts, line, 0);
 			return;
 		}
 		/* Get the current time relative to the wheel */
 		wheel_cts = tcp_tv_to_hptstick(tv);
 		/* Map it onto the wheel */
 		wheel_tick = tick_to_wheel(wheel_cts);
 		/* Now what's the max we can place it at? */
 		maxticks = max_ticks_available(hpts, wheel_tick, &last_tick);
 		if (diag) {
 			diag->wheel_tick = wheel_tick;
 			diag->maxticks = maxticks;
 			diag->wheel_cts = wheel_cts;
 		}
 		if (maxticks == 0) {
 			/* The pacer is in a wheel wrap behind, yikes! */
 			if (slot > 1) {
 				/* 
 				 * Reduce by 1 to prevent a forever loop in
 				 * case something else is wrong. Note this
 				 * probably does not hurt because the pacer
 				 * if its true is so far behind we will be
 				 * > 1second late calling anyway.
 				 */
 				slot--;
 			}
 			inp->inp_hptsslot = last_tick;
 			inp->inp_hpts_request = slot;
 		} else 	if (maxticks >= slot) {
 			/* It all fits on the wheel */
 			inp->inp_hpts_request = 0;
 			inp->inp_hptsslot = hpts_tick(wheel_tick, slot);
 		} else {
 			/* It does not fit */
 			inp->inp_hpts_request = slot - maxticks;
 			inp->inp_hptsslot = last_tick;
 		}
 		if (diag) {
 			diag->slot_remaining = inp->inp_hpts_request;
 			diag->inp_hptsslot = inp->inp_hptsslot;
 		}
 #ifdef INVARIANTS
 		check_if_slot_would_be_wrong(hpts, inp, inp->inp_hptsslot, line);
 #endif
 		hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, 0);
 		if ((hpts->p_hpts_active == 0) &&
 		    (inp->inp_hpts_request == 0) &&
 		    (hpts->p_on_min_sleep == 0)) {
 			/*
 			 * The hpts is sleeping and not on a minimum
 			 * sleep time, we need to figure out where
 			 * it will wake up at and if we need to reschedule
 			 * its time-out.
 			 */
 			uint32_t have_slept, yet_to_sleep;
 
 			/* Now do we need to restart the hpts's timer? */
 			have_slept = hpts_ticks_diff(hpts->p_prev_slot, wheel_tick);
 			if (have_slept < hpts->p_hpts_sleep_time)
 				yet_to_sleep = hpts->p_hpts_sleep_time - have_slept;
 			else {
 				/* We are over-due */
 				yet_to_sleep = 0;
 				need_wakeup = 1;
 			}
 			if (diag) {
 				diag->have_slept = have_slept;
 				diag->yet_to_sleep = yet_to_sleep;
 			}
 			if (yet_to_sleep &&
 			    (yet_to_sleep > slot)) {
 				/*
 				 * We need to reschedule the hpts's time-out.
 				 */
 				hpts->p_hpts_sleep_time = slot;
 				need_new_to = slot * HPTS_TICKS_PER_USEC;
 			}
 		}
 		/*
 		 * Now how far is the hpts sleeping to? if active is 1, its
 		 * up and ticking we do nothing, otherwise we may need to
 		 * reschedule its callout if need_new_to is set from above.
 		 */
 		if (need_wakeup) {
 			hpts->p_direct_wake = 1;
 			tcp_wakehpts(hpts);
 			if (diag) {
 				diag->need_new_to = 0;
-				diag->co_ret = 0xffff0000;
+				diag->co_ret = (callout_ret_t){};
 			}
 		} else if (need_new_to) {
-			int32_t co_ret;
+			callout_ret_t co_ret;
 			/* NOTE(review): this local shadows the 'tv' parameter. */
 			struct timeval tv;
 			sbintime_t sb;
 
 			/*
 			 * Split need_new_to into whole seconds and a
 			 * remainder for the callout interval.
 			 */
 			tv.tv_sec = 0;
 			tv.tv_usec = 0;
 			while (need_new_to > HPTS_USEC_IN_SEC) {
 				tv.tv_sec++;
 				need_new_to -= HPTS_USEC_IN_SEC;
 			}
 			tv.tv_usec = need_new_to;
 			sb = tvtosbt(tv);
 			/*
 			 * tcp_hpts_callout_skip_swi selects which handler
 			 * the callout is armed with; the swi variant also
 			 * adds C_DIRECT_EXEC.
 			 */
 			if (tcp_hpts_callout_skip_swi == 0) {
 				co_ret = callout_reset_sbt_on(&hpts->co, sb, 0,
 				    hpts_timeout_swi, hpts, hpts->p_cpu,
 				    (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
 			} else {
 				co_ret = callout_reset_sbt_on(&hpts->co, sb, 0,
 				    hpts_timeout_dir, hpts,
 				    hpts->p_cpu,
 				    C_PREL(tcp_hpts_precision));
 			}
 			if (diag) {
 				/*
 				 * Note: need_new_to has had the whole
 				 * seconds subtracted by the loop above.
 				 */
 				diag->need_new_to = need_new_to;
 				diag->co_ret = co_ret;
 			}
 		}
 	} else {
 #ifdef INVARIANTS
 		panic("Hpts:%p tp:%p already on hpts and add?", hpts, inp);
 #endif
 	}
 }
 
 /*
  * Insert 'inp' on its hpts 'slot' ticks from now, optionally recording
  * diagnostics in 'diag' (may be NULL).  The inpcb write lock is asserted
  * and the hpts mutex is held around the insert.
  *
  * Returns the next-slot the hpts will be on, beyond its current run (if
  * up) or where it was when it stopped if it is sleeping.
  */
 uint32_t
 tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts_diag *diag)
 {
 	struct timeval now;
 	struct tcp_hpts_entry *entry;
 	uint32_t next_slot;
 
 	INP_WLOCK_ASSERT(inp);
 	entry = tcp_hpts_lock(inp);
 	microuptime(&now);
 	tcp_hpts_insert_locked(entry, inp, slot, line, diag, &now);
 	next_slot = entry->p_nxt_slot;
 	mtx_unlock(&entry->p_mtx);
 	return (next_slot);
 }
 
 /*
  * Convenience form of tcp_hpts_insert_diag() that requests no
  * diagnostics (diag == NULL) and returns that function's result.
  */
 uint32_t
 __tcp_hpts_insert(struct inpcb *inp, uint32_t slot, int32_t line){
 	return (tcp_hpts_insert_diag(inp, slot, line, NULL));
 }
 /*
  * Put 'inp' on the input queue of 'hpts' (if it is not there yet) and
  * wake the hpts when it is not active.  The hpts mutex is asserted.
  *
  * Return value:
  *   0 - already queued, hpts active, nothing done
  *   1 - newly queued, hpts already active
  *   2 - newly queued and the hpts was woken
  *   4 - already queued, but the hpts was woken
  */
 int
 __tcp_queue_to_input_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line)
 {
 	int32_t result;
 
 	HPTS_MTX_ASSERT(hpts);
 	if (inp->inp_in_input == 0) {
 		/* Ok we need to set it on the hpts in the current slot */
 		hpts_sane_input_insert(hpts, inp, line);
 		if (hpts->p_hpts_active != 0)
 			return (1);
 		result = 2;
 	} else {
 		if (hpts->p_hpts_active != 0)
 			return (0);
 		result = 4;
 	}
 	/* Activate the hpts, it is sleeping. */
 	hpts->p_direct_wake = 1;
 	tcp_wakeinput(hpts);
 	return (result);
 }
 
 /*
  * Locking wrapper around __tcp_queue_to_input_locked(): takes the mutex
  * of the hpts that handles input for 'inp', queues the inp and passes
  * the helper's result back to the caller.
  */
 int32_t
 __tcp_queue_to_input(struct inpcb *inp, int line)
 {
 	struct tcp_hpts_entry *entry;
 	int32_t result;
 
 	entry = tcp_input_lock(inp);
 	result = __tcp_queue_to_input_locked(inp, entry, line);
 	mtx_unlock(&entry->p_mtx);
 	return (result);
 }
 
 /*
  * Record 'reason' in inp_hpts_drop_reas and make sure the inp is on the
  * input queue of its hpts, waking the hpts when it is not active.  The
  * input side (tcp_input_data()) reads the recorded reason and drops the
  * connection with it.
  */
 void
 __tcp_set_inp_to_drop(struct inpcb *inp, uint16_t reason, int32_t line)
 {
 	struct tcp_hpts_entry *entry;
 	struct tcpcb *tp;
 
 	tp = intotcpcb(inp);
 	entry = tcp_input_lock(tp->t_inpcb);
 	/* Ok we need to set it on the hpts in the current slot */
 	if (inp->inp_in_input == 0)
 		hpts_sane_input_insert(entry, inp, line);
 	/* Queued just now or earlier: activate the hpts if it is sleeping. */
 	if (entry->p_hpts_active == 0) {
 		entry->p_direct_wake = 1;
 		tcp_wakeinput(entry);
 	}
 	inp->inp_hpts_drop_reas = reason;
 	mtx_unlock(&entry->p_mtx);
 }
 
 /*
  * Pick a CPU for an inp that has no flow type set, distributing the
  * load randomly.  If a CPU has already been set for either direction
  * that one is returned, so that both in and out end up on the same hpts.
  */
 static uint16_t
 hpts_random_cpu(struct inpcb *inp){
 	uint32_t rnd;
 
 	if (inp->inp_input_cpu_set)
 		return (inp->inp_input_cpu);
 	if (inp->inp_hpts_cpu_set)
 		return (inp->inp_hpts_cpu);
 
 	/* Nothing set use a random number */
 	rnd = arc4random();
 	return ((rnd & 0xffff) % mp_ncpus);
 }
 
 /*
  * Choose the CPU (and thereby the hpts) that 'inp' should be handled on.
  *
  * A CPU that was already set for either direction wins.  Otherwise, with
  * RSS the flowid/flowtype is mapped through rss_hash2cpuid(); without
  * RSS the flowid is hashed over the CPUs (optionally restricted to the
  * inp's NUMA domain).  Whenever no usable mapping exists the choice is
  * delegated to hpts_random_cpu().
  */
 static uint16_t
 hpts_cpuid(struct inpcb *inp){
 	u_int cpuid;
 #ifdef NUMA
 	struct hpts_domain_info *di;
 #endif
 
 	/*
 	 * If one has been set use it i.e. we want both in and out on the
 	 * same hpts.
 	 */
 	if (inp->inp_input_cpu_set) {
 		return (inp->inp_input_cpu);
 	} else if (inp->inp_hpts_cpu_set) {
 		return (inp->inp_hpts_cpu);
 	}
 	/* If one is set the other must be the same */
 #ifdef	RSS
 	cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
 	if (cpuid == NETISR_CPUID_NONE)
 		return (hpts_random_cpu(inp));
 	else
 		return (cpuid);
 #else
 	/*
 	 * We don't have a flowid -> cpuid mapping.  An inp without a flow
 	 * hash type gets whatever hpts_random_cpu() picks, rather than
 	 * defaulting to swi 0.
 	 */
 	
 	if (inp->inp_flowtype == M_HASHTYPE_NONE)
 		return (hpts_random_cpu(inp));
 	/*
 	 * Hash to a thread based on the flowid.  If we are using numa,
 	 * then restrict the hash to the numa domain where the inp lives.
 	 */
 #ifdef NUMA
 	if (tcp_bind_threads == 2 && inp->inp_numa_domain != M_NODOM) {
 		di = &hpts_domains[inp->inp_numa_domain];
 		cpuid = di->cpu[inp->inp_flowid % di->count];
 	} else
 #endif
 		cpuid = inp->inp_flowid % mp_ncpus;
 
 	return (cpuid);
 #endif
 }
 
 /*
  * Throw away every packet queued on tp->t_in_pkt.  The list head is
  * detached first, then the m_nextpkt chain is walked and each packet is
  * released with m_freem().
  */
 static void
 tcp_drop_in_pkts(struct tcpcb *tp)
 {
 	struct mbuf *pkt, *next_pkt;
 
 	pkt = tp->t_in_pkt;
 	tp->t_in_pkt = NULL;
 	while (pkt != NULL) {
 		/* Remember the successor before this packet goes away. */
 		next_pkt = pkt->m_nextpkt;
 		m_freem(pkt);
 		pkt = next_pkt;
 	}
 }
 
 /*
  * Drain the input queue of 'hpts'.  Called with the hpts mutex held (it
  * is asserted); the mutex is dropped while each inp is processed and
  * re-taken before the next one is looked at.  For every inp the queued
  * work is one of: drop the connection (inp_hpts_drop_reas set), hand the
  * queued packets to the stack's tfb_do_queued_segments(), or discard
  * packets the current stack cannot process.  'tv' is not used in this
  * function.
  *
  * Do NOT try to optimize the processing of inp's
  * by first pulling off all the inp's into a temporary
  * list (e.g. TAILQ_CONCAT). If you do that the subtle
  * interactions of switching CPU's will kill because of
  * problems in the linked list manipulation. Basically
  * you would switch cpu's with the hpts mutex locked
  * but then while you were processing one of the inp's
  * some other one that you switch will get a new
  * packet on the different CPU. It will insert it
  * on the new hpts's input list. Creating a temporary
  * link in the inp will not fix it either, since
  * the other hpts will be doing the same thing and
  * you will both end up using the temporary link.
  *
  * You will die in an ASSERT for tailq corruption if you
  * run INVARIANTS or you will die horribly without
  * INVARIANTS in some unknown way with a corrupt linked
  * list.
  */
 static void
 tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv)
 {
 	struct tcpcb *tp;
 	struct inpcb *inp;
 	uint16_t drop_reason;
 	int16_t set_cpu;
 	uint32_t did_prefetch = 0;
 	int dropped;
 	struct epoch_tracker et;
 
 	HPTS_MTX_ASSERT(hpts);
 #ifndef VIMAGE
 	/* Without VIMAGE the tcbinfo read section spans the whole drain. */
 	INP_INFO_RLOCK_ET(&V_tcbinfo, et);
 #endif
 	while ((inp = TAILQ_FIRST(&hpts->p_input)) != NULL) {
 		HPTS_MTX_ASSERT(hpts);
 		/* Unlink only; inp_in_input is cleared explicitly below. */
 		hpts_sane_input_remove(hpts, inp, 0);
 		if (inp->inp_input_cpu_set == 0) {
 			set_cpu = 1;
 		} else {
 			set_cpu = 0;
 		}
 		hpts->p_inp = inp;
 		drop_reason = inp->inp_hpts_drop_reas;
 		inp->inp_in_input = 0;
 		/* Drop the hpts mutex before taking the inp write lock. */
 		mtx_unlock(&hpts->p_mtx);
 		INP_WLOCK(inp);
 #ifdef VIMAGE
 		/* With VIMAGE the vnet and read section are set up per inp. */
 		CURVNET_SET(inp->inp_vnet);
 		INP_INFO_RLOCK_ET(&V_tcbinfo, et);
 #endif
 		if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
 		    (inp->inp_flags2 & INP_FREED)) {
 out:
 			/*
 			 * Nothing to do for this inp: give back the queue's
 			 * reference.  The inp is only unlocked here when the
 			 * release returns 0.
 			 */
 			hpts->p_inp = NULL;
 			if (in_pcbrele_wlocked(inp) == 0) {
 				INP_WUNLOCK(inp);
 			}
 #ifdef VIMAGE
 			INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
 			CURVNET_RESTORE();
 #endif
 			mtx_lock(&hpts->p_mtx);
 			continue;
 		}
 		tp = intotcpcb(inp);
 		if ((tp == NULL) || (tp->t_inpcb == NULL)) {
 			goto out;
 		}
 		if (drop_reason) {
 			/* This tcb is being destroyed for drop_reason */
 			tcp_drop_in_pkts(tp);
 			tp = tcp_drop(tp, drop_reason);
 			if (tp == NULL) {
 				/*
 				 * presumably tcp_drop() released the inp
 				 * lock when it returns NULL - confirm; it
 				 * is re-taken so the reference can be
 				 * released below.
 				 */
 				INP_WLOCK(inp);
 			}
 			if (in_pcbrele_wlocked(inp) == 0)
 				INP_WUNLOCK(inp);
 #ifdef VIMAGE
 			INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
 			CURVNET_RESTORE();
 #endif
 			mtx_lock(&hpts->p_mtx);
 			/*
 			 * NOTE(review): unlike the other exits of this
 			 * iteration, hpts->p_inp is not reset to NULL on
 			 * this path.
 			 */
 			continue;
 		}
 		if (set_cpu) {
 			/*
 			 * Setup so the next time we will move to the right
 			 * CPU. This should be a rare event. It will
 			 * sometimes happens when we are the client side
 			 * (usually not the server). Somehow tcp_output()
 			 * gets called before the tcp_do_segment() sets the
 			 * intial state. This means the r_cpu and r_hpts_cpu
 			 * is 0. We get on the hpts, and then tcp_input()
 			 * gets called setting up the r_cpu to the correct
 			 * value. The hpts goes off and sees the mis-match.
 			 * We simply correct it here and the CPU will switch
 			 * to the new hpts nextime the tcb gets added to the
 			 * the hpts (not this time) :-)
 			 */
 			tcp_set_hpts(inp);
 		}
 		if (tp->t_fb_ptr != NULL) {
 			kern_prefetch(tp->t_fb_ptr, &did_prefetch);
 			did_prefetch = 1;
 		}
 		if ((inp->inp_flags2 & INP_SUPPORTS_MBUFQ) && tp->t_in_pkt) {
 			if (inp->inp_in_input)
 				tcp_hpts_remove(inp, HPTS_REMOVE_INPUT);
 			dropped = (*tp->t_fb->tfb_do_queued_segments)(inp->inp_socket, tp, 0);
 			if (dropped) {
 				/* Re-acquire the wlock so we can release the reference */
 				INP_WLOCK(inp);
 			}
 		} else if (tp->t_in_pkt) {
 			/* 
 			 * We reach here only if we had a 
 			 * stack that supported INP_SUPPORTS_MBUFQ
 			 * and then somehow switched to a stack that
 			 * does not. The packets are basically stranded
 			 * and would hang with the connection until
 			 * cleanup without this code. Its not the
 			 * best way but I know of no other way to
 			 * handle it since the stack needs functions
 			 * it does not have to handle queued packets.
 			 */
 			tcp_drop_in_pkts(tp);
 		}
 		/* Give back the input queue's reference on the inp. */
 		if (in_pcbrele_wlocked(inp) == 0)
 			INP_WUNLOCK(inp);
 		INP_UNLOCK_ASSERT(inp);
 #ifdef VIMAGE
 		INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
 		CURVNET_RESTORE();
 #endif
 		/* Re-take the hpts mutex before looking at the queue again. */
 		mtx_lock(&hpts->p_mtx);
 		hpts->p_inp = NULL;
 	}
 #ifndef VIMAGE
 	INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
 	INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
 #endif
 }
 
 static void
 tcp_hptsi(struct tcp_hpts_entry *hpts)
 {
 	struct epoch_tracker et;
 	struct tcpcb *tp;
 	struct inpcb *inp = NULL, *ninp;
 	struct timeval tv;
 	int32_t ticks_to_run, i, error;
 	int32_t paced_cnt = 0;
 	int32_t loop_cnt = 0;
 	int32_t did_prefetch = 0;
 	int32_t prefetch_ninp = 0;
 	int32_t prefetch_tp = 0;
 	int32_t wrap_loop_cnt = 0;
 	int16_t set_cpu;
 
 	HPTS_MTX_ASSERT(hpts);
 	/* record previous info for any logging */
 	hpts->saved_lasttick = hpts->p_lasttick;
 	hpts->saved_curtick = hpts->p_curtick;
 	hpts->saved_curslot = hpts->p_cur_slot;
 	hpts->saved_prev_slot = hpts->p_prev_slot;
 
 	hpts->p_lasttick = hpts->p_curtick;
 	hpts->p_curtick = tcp_gethptstick(&tv);
 	hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
 	if ((hpts->p_on_queue_cnt == 0) ||
 	    (hpts->p_lasttick == hpts->p_curtick)) {
 		/* 
 		 * No time has yet passed, 
 		 * or nothing to do.
 		 */
 		hpts->p_prev_slot = hpts->p_cur_slot;
 		hpts->p_lasttick = hpts->p_curtick;
 		goto no_run;
 	}
 again:
 	hpts->p_wheel_complete = 0;
 	HPTS_MTX_ASSERT(hpts);
 	ticks_to_run = hpts_ticks_diff(hpts->p_prev_slot, hpts->p_cur_slot);
 	if (((hpts->p_curtick - hpts->p_lasttick) > ticks_to_run) &&
 	    (hpts->p_on_queue_cnt != 0)) {
 		/* 
 		 * Wheel wrap is occuring, basically we
 		 * are behind and the distance between
 		 * run's has spread so much it has exceeded
 		 * the time on the wheel (1.024 seconds). This
 		 * is ugly and should NOT be happening. We
 		 * need to run the entire wheel. We last processed
 		 * p_prev_slot, so that needs to be the last slot
 		 * we run. The next slot after that should be our
 		 * reserved first slot for new, and then starts
 		 * the running postion. Now the problem is the
 		 * reserved "not to yet" place does not exist
 		 * and there may be inp's in there that need
 		 * running. We can merge those into the
 		 * first slot at the head.
 		 */
 		wrap_loop_cnt++;
 		hpts->p_nxt_slot = hpts_tick(hpts->p_prev_slot, 1);
 		hpts->p_runningtick = hpts_tick(hpts->p_prev_slot, 2);
 		/* 
 		 * Adjust p_cur_slot to be where we are starting from
 		 * hopefully we will catch up (fat chance if something
 		 * is broken this bad :( )
 		 */
 		hpts->p_cur_slot = hpts->p_prev_slot;
 		/*
 		 * The next slot has guys to run too, and that would
 		 * be where we would normally start, lets move them into
 		 * the next slot (p_prev_slot + 2) so that we will
 		 * run them, the extra 10usecs of late (by being
 		 * put behind) does not really matter in this situation.
 		 */
 #ifdef INVARIANTS
 		/* 
 		 * To prevent a panic we need to update the inpslot to the
 		 * new location. This is safe since it takes both the
 		 * INP lock and the pacer mutex to change the inp_hptsslot.
 		 */
 		TAILQ_FOREACH(inp, &hpts->p_hptss[hpts->p_nxt_slot], inp_hpts) {
 			inp->inp_hptsslot = hpts->p_runningtick;
 		}
 #endif
 		TAILQ_CONCAT(&hpts->p_hptss[hpts->p_runningtick],
 			     &hpts->p_hptss[hpts->p_nxt_slot], inp_hpts);
 		ticks_to_run = NUM_OF_HPTSI_SLOTS - 1;
 		counter_u64_add(wheel_wrap, 1);
 	} else {
 		/* 
 		 * Nxt slot is always one after p_runningtick though
 		 * its not used usually unless we are doing wheel wrap.
 		 */
 		hpts->p_nxt_slot = hpts->p_prev_slot;
 		hpts->p_runningtick = hpts_tick(hpts->p_prev_slot, 1);
 	}
 #ifdef INVARIANTS
 	if (TAILQ_EMPTY(&hpts->p_input) &&
 	    (hpts->p_on_inqueue_cnt != 0)) {
 		panic("tp:%p in_hpts input empty but cnt:%d",
 		      hpts, hpts->p_on_inqueue_cnt);
 	}
 #endif
 	HPTS_MTX_ASSERT(hpts);
 	if (hpts->p_on_queue_cnt == 0) {
 		goto no_one;
 	}
 	HPTS_MTX_ASSERT(hpts);
 #ifndef VIMAGE
 	INP_INFO_RLOCK_ET(&V_tcbinfo, et);
 #endif
 	for (i = 0; i < ticks_to_run; i++) {
 		/*
 		 * Calculate our delay, if there are no extra ticks there
 		 * was not any (i.e. if ticks_to_run == 1, no delay).
 		 */
 		hpts->p_delayed_by = (ticks_to_run - (i + 1)) * HPTS_TICKS_PER_USEC;
 		HPTS_MTX_ASSERT(hpts);
 		while ((inp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_runningtick])) != NULL) {
 			/* For debugging */
 			hpts->p_inp = inp;
 			paced_cnt++;
 #ifdef INVARIANTS
 			if (hpts->p_runningtick != inp->inp_hptsslot) {
 				panic("Hpts:%p inp:%p slot mis-aligned %u vs %u",
 				      hpts, inp, hpts->p_runningtick, inp->inp_hptsslot);
 			}
 #endif
 			/* Now pull it */
 			if (inp->inp_hpts_cpu_set == 0) {
 				set_cpu = 1;
 			} else {
 				set_cpu = 0;
 			}
 			hpts_sane_pace_remove(hpts, inp, &hpts->p_hptss[hpts->p_runningtick], 0);
 			if ((ninp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_runningtick])) != NULL) {
 				/* We prefetch the next inp if possible */
 				kern_prefetch(ninp, &prefetch_ninp);
 				prefetch_ninp = 1;
 			}
 			if (inp->inp_hpts_request) {
 				/*
 				 * This guy is deferred out further in time
 				 * then our wheel had available on it. 
 				 * Push him back on the wheel or run it
 				 * depending.
 				 */
 				uint32_t maxticks, last_tick, remaining_slots;
 				
 				remaining_slots = ticks_to_run - (i + 1);
 				if (inp->inp_hpts_request > remaining_slots) {
 					/*
 					 * How far out can we go?
 					 */
 					maxticks = max_ticks_available(hpts, hpts->p_cur_slot, &last_tick);
 					if (maxticks >= inp->inp_hpts_request) {
 						/* we can place it finally to be processed  */
 						inp->inp_hptsslot = hpts_tick(hpts->p_runningtick, inp->inp_hpts_request);
 						inp->inp_hpts_request = 0;
 					} else {
 						/* Work off some more time */
 						inp->inp_hptsslot = last_tick;
 						inp->inp_hpts_request-= maxticks;
 					}
 					hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], __LINE__, 1);
 					hpts->p_inp = NULL;
 					continue;
 				}
 				inp->inp_hpts_request = 0;
 				/* Fall through we will so do it now */
 			}
 			/*
 			 * We clear the hpts flag here after dealing with	
 			 * remaining slots. This way anyone looking with the
 			 * TCB lock will see its on the hpts until just
 			 * before we unlock.
 			 */
 			inp->inp_in_hpts = 0;
 			mtx_unlock(&hpts->p_mtx);
 			INP_WLOCK(inp);
 			if (in_pcbrele_wlocked(inp)) {
 				mtx_lock(&hpts->p_mtx);
 				hpts->p_inp = NULL;
 				continue;
 			}
 			if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
 			    (inp->inp_flags2 & INP_FREED)) {
 			out_now:
 #ifdef INVARIANTS
 				if (mtx_owned(&hpts->p_mtx)) {
 					panic("Hpts:%p owns mtx prior-to lock line:%d",
 					      hpts, __LINE__);
 				}
 #endif
 				INP_WUNLOCK(inp);
 				mtx_lock(&hpts->p_mtx);
 				hpts->p_inp = NULL;
 				continue;
 			}
 			tp = intotcpcb(inp);
 			if ((tp == NULL) || (tp->t_inpcb == NULL)) {
 				goto out_now;
 			}
 			if (set_cpu) {
 				/*
 				 * Setup so the next time we will move to
 				 * the right CPU. This should be a rare
 				 * event. It will sometimes happens when we
 				 * are the client side (usually not the
 				 * server). Somehow tcp_output() gets called
 				 * before the tcp_do_segment() sets the
 				 * intial state. This means the r_cpu and
 				 * r_hpts_cpu is 0. We get on the hpts, and
 				 * then tcp_input() gets called setting up
 				 * the r_cpu to the correct value. The hpts
 				 * goes off and sees the mis-match. We
 				 * simply correct it here and the CPU will
 				 * switch to the new hpts nextime the tcb
 				 * gets added to the the hpts (not this one)
 				 * :-)
 				 */
 				tcp_set_hpts(inp);
 			}
 #ifdef VIMAGE
 			CURVNET_SET(inp->inp_vnet);
 			INP_INFO_RLOCK_ET(&V_tcbinfo, et);
 #endif
 			/* Lets do any logging that we might want to */
 			if (hpts_does_tp_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) {
 				tcp_hpts_log(hpts, tp, &tv, ticks_to_run, i);
 			}
 			/*
 			 * There is a hole here, we get the refcnt on the
 			 * inp so it will still be preserved but to make
 			 * sure we can get the INP we need to hold the p_mtx
 			 * above while we pull out the tp/inp,  as long as
 			 * fini gets the lock first we are assured of having
 			 * a sane INP we can lock and test.
 			 */
 #ifdef INVARIANTS
 			if (mtx_owned(&hpts->p_mtx)) {
 				panic("Hpts:%p owns mtx before tcp-output:%d",
 				      hpts, __LINE__);
 			}
 #endif
 			if (tp->t_fb_ptr != NULL) {
 				kern_prefetch(tp->t_fb_ptr, &did_prefetch);
 				did_prefetch = 1;
 			}
 			if ((inp->inp_flags2 & INP_SUPPORTS_MBUFQ) && tp->t_in_pkt) {
 				error = (*tp->t_fb->tfb_do_queued_segments)(inp->inp_socket, tp, 0);
 				if (error) {
 					/* The input killed the connection */
 					goto skip_pacing;
 				}
 			}
 			inp->inp_hpts_calls = 1;
 			error = tp->t_fb->tfb_tcp_output(tp);
 			inp->inp_hpts_calls = 0;
 			if (ninp && ninp->inp_ppcb) {
 				/*
 				 * If we have a nxt inp, see if we can
 				 * prefetch its ppcb. Note this may seem
 				 * "risky" since we have no locks (other
 				 * than the previous inp) and there no
 				 * assurance that ninp was not pulled while
 				 * we were processing inp and freed. If this
 				 * occured it could mean that either:
 				 *
 				 * a) Its NULL (which is fine we won't go
 				 * here) <or> b) Its valid (which is cool we
 				 * will prefetch it) <or> c) The inp got
 				 * freed back to the slab which was
 				 * reallocated. Then the piece of memory was
 				 * re-used and something else (not an
 				 * address) is in inp_ppcb. If that occurs
 				 * we don't crash, but take a TLB shootdown
 				 * performance hit (same as if it was NULL
 				 * and we tried to pre-fetch it).
 				 *
 				 * Considering that the likelyhood of <c> is
 				 * quite rare we will take a risk on doing
 				 * this. If performance drops after testing
 				 * we can always take this out. NB: the
 				 * kern_prefetch on amd64 actually has
 				 * protection against a bad address now via
 				 * the DMAP_() tests. This will prevent the
 				 * TLB hit, and instead if <c> occurs just
 				 * cause us to load cache with a useless
 				 * address (to us).
 				 */
 				kern_prefetch(ninp->inp_ppcb, &prefetch_tp);
 				prefetch_tp = 1;
 			}
 			INP_WUNLOCK(inp);
 		skip_pacing:
 #ifdef VIMAGE
 			INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
 			CURVNET_RESTORE();
 #endif
 			INP_UNLOCK_ASSERT(inp);
 #ifdef INVARIANTS
 			if (mtx_owned(&hpts->p_mtx)) {
 				panic("Hpts:%p owns mtx prior-to lock line:%d",
 				      hpts, __LINE__);
 			}
 #endif
 			mtx_lock(&hpts->p_mtx);
 			hpts->p_inp = NULL;
 		}
 		HPTS_MTX_ASSERT(hpts);
 		hpts->p_inp = NULL;
 		hpts->p_runningtick++;
 		if (hpts->p_runningtick >= NUM_OF_HPTSI_SLOTS) {
 			hpts->p_runningtick = 0;
 		}
 	}
 #ifndef VIMAGE
 	INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
 #endif
 no_one:
 	HPTS_MTX_ASSERT(hpts);
 	hpts->p_delayed_by = 0;
 	/*
 	 * Check to see if we took an excess amount of time and need to run
 	 * more ticks (if we did not hit eno-bufs).
 	 */
 #ifdef INVARIANTS
 	if (TAILQ_EMPTY(&hpts->p_input) &&
 	    (hpts->p_on_inqueue_cnt != 0)) {
 		panic("tp:%p in_hpts input empty but cnt:%d",
 		      hpts, hpts->p_on_inqueue_cnt);
 	}
 #endif
 	hpts->p_prev_slot = hpts->p_cur_slot;
 	hpts->p_lasttick = hpts->p_curtick;
 	if (loop_cnt > max_pacer_loops) {	    
 		/*
 		 * Something is serious slow we have
 		 * looped through processing the wheel
 		 * and by the time we cleared the
 		 * needs to run max_pacer_loops time
 		 * we still needed to run. That means
 		 * the system is hopelessly behind and
 		 * can never catch up :(
 		 *
 		 * We will just lie to this thread
 		 * and let it thing p_curtick is 
 		 * correct. When it next awakens
 		 * it will find itself further behind.
 		 */
 		counter_u64_add(hpts_hopelessly_behind, 1);
 		goto no_run;
 	}
 	hpts->p_curtick = tcp_gethptstick(&tv);
 	hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
 	if ((wrap_loop_cnt < 2) &&
 	    (hpts->p_lasttick != hpts->p_curtick)) {
 		counter_u64_add(hpts_loops, 1);
 		loop_cnt++;
 		goto again;
 	}
 no_run:
 	/*
 	 * Set flag to tell that we are done for
 	 * any slot input that happens during
 	 * input.
 	 */
 	hpts->p_wheel_complete = 1;
 	/* 
 	 * Run any input that may be there not covered
 	 * in running data.
 	 */
 	if (!TAILQ_EMPTY(&hpts->p_input)) {
 		tcp_input_data(hpts, &tv);
 		/*
 		 * Now did we spend too long running
 		 * input and need to run more ticks?
 		 */
 		KASSERT(hpts->p_prev_slot == hpts->p_cur_slot,
 			("H:%p p_prev_slot:%u not equal to p_cur_slot:%u", hpts,
 			 hpts->p_prev_slot, hpts->p_cur_slot));
 		KASSERT(hpts->p_lasttick == hpts->p_curtick,
 			("H:%p p_lasttick:%u not equal to p_curtick:%u", hpts,
 			 hpts->p_lasttick, hpts->p_curtick));
 		hpts->p_curtick = tcp_gethptstick(&tv);
 		if (hpts->p_lasttick != hpts->p_curtick) {
 			counter_u64_add(hpts_loops, 1);
 			hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
 			goto again;
 		}
 	}
 	{
 		uint32_t t = 0, i, fnd = 0;
 
 		if ((hpts->p_on_queue_cnt) && (wrap_loop_cnt < 2)) {
 			/*
 			 * Find next slot that is occupied and use that to
 			 * be the sleep time.
 			 */
 			for (i = 0, t = hpts_tick(hpts->p_cur_slot, 1); i < NUM_OF_HPTSI_SLOTS; i++) {
 				if (TAILQ_EMPTY(&hpts->p_hptss[t]) == 0) {
 					fnd = 1;
 					break;
 				}
 				t = (t + 1) % NUM_OF_HPTSI_SLOTS;
 			}
 			if (fnd) {
 				hpts->p_hpts_sleep_time = min((i + 1), hpts_sleep_max);
 			} else {
 #ifdef INVARIANTS
 				panic("Hpts:%p cnt:%d but none found", hpts, hpts->p_on_queue_cnt);
 #endif
 				counter_u64_add(back_tosleep, 1);
 				hpts->p_on_queue_cnt = 0;
 				goto non_found;
 			}
 		} else if (wrap_loop_cnt >= 2) {
 			/* Special case handling */
 			hpts->p_hpts_sleep_time = tcp_min_hptsi_time;
 		} else {
 			/* No one on the wheel sleep for all but 400 slots or sleep max  */
 		non_found:
 			hpts->p_hpts_sleep_time = hpts_sleep_max;
 		}
 	}
 }
 
 /*
  * Choose the CPUs that will service this inp, once for the output (pacer)
  * side and once for the input side.  Each choice is made under the mutex
  * returned by the matching lock helper and is only made when the inp is
  * not currently queued on that side and no CPU has been recorded yet.
  *
  * The caller must hold the INP write lock.  'line' is accepted for the
  * tcp_set_hpts() wrapper macro and is not used here.
  */
 void
 __tcp_set_hpts(struct inpcb *inp, int32_t line)
 {
 	struct tcp_hpts_entry *out_hpts;
 	struct tcp_hpts_entry *in_hpts;

 	INP_WLOCK_ASSERT(inp);

 	/* Output side. */
 	out_hpts = tcp_hpts_lock(inp);
 	if (!inp->inp_in_hpts && !inp->inp_hpts_cpu_set) {
 		inp->inp_hpts_cpu = hpts_cpuid(inp);
 		inp->inp_hpts_cpu_set = 1;
 	}
 	mtx_unlock(&out_hpts->p_mtx);

 	/* Input side. */
 	in_hpts = tcp_input_lock(inp);
 	if (!inp->inp_input_cpu_set && !inp->inp_in_input) {
 		inp->inp_input_cpu = hpts_cpuid(inp);
 		inp->inp_input_cpu_set = 1;
 	}
 	mtx_unlock(&in_hpts->p_mtx);
 }
 
 /*
  * Report how far behind the hpts serving this inp was when it last ran a
  * slot, as recorded in that hpts's p_delayed_by.
  *
  * p_delayed_by is 32 bits wide while the return type is only 16 bits.
  * Values that do not fit are reported as UINT16_MAX rather than being
  * truncated, so a very large delay can never be mistaken for a small one.
  *
  * The value is read without taking the hpts mutex.
  */
 uint16_t
 tcp_hpts_delayedby(struct inpcb *inp)
 {
 	uint32_t delayed_by;

 	delayed_by = tcp_pace.rp_ent[inp->inp_hpts_cpu]->p_delayed_by;
 	if (delayed_by > UINT16_MAX)
 		return (UINT16_MAX);
 	return ((uint16_t)delayed_by);
 }
 
 /*
  * Handler registered with swi_add() for one hpts; 'ctx' is the
  * struct tcp_hpts_entry to service.
  *
  * Under the hpts mutex it runs tcp_hptsi(), converts the resulting
  * p_hpts_sleep_time into a timeval and re-arms the hpts callout for the
  * next run.
  */
 static void
 tcp_hpts_thread(void *ctx)
 {
 	struct tcp_hpts_entry *hpts;
 	struct timeval tv;
 	sbintime_t sb;
 
 	hpts = (struct tcp_hpts_entry *)ctx;
 	mtx_lock(&hpts->p_mtx);
 	if (hpts->p_direct_wake) {
 		/* Signaled by input: cancel the timer, we are running now */
 		callout_stop(&hpts->co);
 	} else {
 		/*
 		 * Timed out.  Do nothing if the callout has been re-armed
 		 * (pending) or is no longer marked active.
 		 */
 		if (callout_pending(&hpts->co) ||
 		    !callout_active(&hpts->co)) {
 			mtx_unlock(&hpts->p_mtx);
 			return;
 		}
 		callout_deactivate(&hpts->co);
 	}
 	hpts->p_hpts_wake_scheduled = 0;
 	hpts->p_hpts_active = 1;
 	tcp_hptsi(hpts);
 	HPTS_MTX_ASSERT(hpts);
 	/* Turn the sleep time computed by tcp_hptsi() into microseconds */
 	tv.tv_sec = 0;
 	tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC;
 	if (tcp_min_hptsi_time && (tv.tv_usec < tcp_min_hptsi_time)) {
 		/*
 		 * A minimum sleep is configured and we wanted less: keep
 		 * the value we wanted in overidden_sleep and sleep for the
 		 * minimum instead.
 		 */
 		hpts->overidden_sleep = tv.tv_usec;
 		tv.tv_usec = tcp_min_hptsi_time;
 		hpts->p_on_min_sleep = 1;
 	} else {
 		/* Clear the min sleep flag */
 		hpts->overidden_sleep = 0;
 		hpts->p_on_min_sleep = 0;
 	}
 	hpts->p_hpts_active = 0;
 	sb = tvtosbt(tv);
 	/*
 	 * Re-arm the callout on this hpts's CPU; tcp_hpts_callout_skip_swi
 	 * selects the callout function and whether C_DIRECT_EXEC is used.
 	 */
 	if (tcp_hpts_callout_skip_swi == 0) {
 		callout_reset_sbt_on(&hpts->co, sb, 0,
 		    hpts_timeout_swi, hpts, hpts->p_cpu,
 		    (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
 	} else {
 		callout_reset_sbt_on(&hpts->co, sb, 0,
 		    hpts_timeout_dir, hpts,
 		    hpts->p_cpu,
 		    C_PREL(tcp_hpts_precision));
 	}
 	hpts->p_direct_wake = 0;
 	mtx_unlock(&hpts->p_mtx);
 }
 
 #undef	timersub
 
 /*
  * One-time initialization of the hpts subsystem, run through SYSINIT.
  *
  * Allocates the statistics counters and one tcp_hpts_entry (with its
  * wheel of NUM_OF_HPTSI_SLOTS queues) per CPU, attaches a sysctl node to
  * each entry, then for every CPU adds a software interrupt handler,
  * optionally binds it (tcp_bind_threads: 1 = to the CPU, 2 = to the CPU's
  * NUMA domain) and arms the initial callout.
  *
  * 'st' is the SYSINIT argument and is not used.
  */
 static void
 tcp_init_hptsi(void *st)
 {
 	int32_t i, j, error, bound = 0, created = 0;
 	size_t sz, asz;
 	struct timeval tv;
 	sbintime_t sb;
 	struct tcp_hpts_entry *hpts;
 	struct pcpu *pc;
 	cpuset_t cs;
 	char unit[16];
 	uint32_t ncpus = mp_ncpus ? mp_ncpus : MAXCPU;
 	int count, domain;
 
 	tcp_pace.rp_proc = NULL;
 	tcp_pace.rp_num_hptss = ncpus;
 	hpts_hopelessly_behind = counter_u64_alloc(M_WAITOK);
 	hpts_loops = counter_u64_alloc(M_WAITOK);
 	back_tosleep = counter_u64_alloc(M_WAITOK);
 	combined_wheel_wrap = counter_u64_alloc(M_WAITOK);
 	wheel_wrap = counter_u64_alloc(M_WAITOK);
 	sz = (tcp_pace.rp_num_hptss * sizeof(struct tcp_hpts_entry *));
 	tcp_pace.rp_ent = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO);
 	asz = sizeof(struct hptsh) * NUM_OF_HPTSI_SLOTS;
 	for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
 		tcp_pace.rp_ent[i] = malloc(sizeof(struct tcp_hpts_entry),
 		    M_TCPHPTS, M_WAITOK | M_ZERO);
 		tcp_pace.rp_ent[i]->p_hptss = malloc(asz,
 		    M_TCPHPTS, M_WAITOK);
 		hpts = tcp_pace.rp_ent[i];
 		/*
 		 * Init all the hpts structures that are not specifically
 		 * zero'd by the allocations. Also lets attach them to the
 		 * appropriate sysctl block as well.
 		 */
 		mtx_init(&hpts->p_mtx, "tcp_hpts_lck",
 		    "hpts", MTX_DEF | MTX_DUPOK);
 		TAILQ_INIT(&hpts->p_input);
 		/* The wheel was allocated without M_ZERO, init every slot */
 		for (j = 0; j < NUM_OF_HPTSI_SLOTS; j++) {
 			TAILQ_INIT(&hpts->p_hptss[j]);
 		}
 		sysctl_ctx_init(&hpts->hpts_ctx);
 		/* The sysctl node is named after the hpts index */
 		snprintf(unit, sizeof(unit), "%d", i);
 		hpts->hpts_root = SYSCTL_ADD_NODE(&hpts->hpts_ctx,
 		    SYSCTL_STATIC_CHILDREN(_net_inet_tcp_hpts),
 		    OID_AUTO,
 		    unit,
 		    CTLFLAG_RW, 0,
 		    "");
 		SYSCTL_ADD_INT(&hpts->hpts_ctx,
 		    SYSCTL_CHILDREN(hpts->hpts_root),
 		    OID_AUTO, "in_qcnt", CTLFLAG_RD,
 		    &hpts->p_on_inqueue_cnt, 0,
 		    "Count TCB's awaiting input processing");
 		SYSCTL_ADD_INT(&hpts->hpts_ctx,
 		    SYSCTL_CHILDREN(hpts->hpts_root),
 		    OID_AUTO, "out_qcnt", CTLFLAG_RD,
 		    &hpts->p_on_queue_cnt, 0,
 		    "Count TCB's awaiting output processing");
 		SYSCTL_ADD_U16(&hpts->hpts_ctx,
 		    SYSCTL_CHILDREN(hpts->hpts_root),
 		    OID_AUTO, "active", CTLFLAG_RD,
 		    &hpts->p_hpts_active, 0,
 		    "Is the hpts active");
 		SYSCTL_ADD_UINT(&hpts->hpts_ctx,
 		    SYSCTL_CHILDREN(hpts->hpts_root),
 		    OID_AUTO, "curslot", CTLFLAG_RD,
 		    &hpts->p_cur_slot, 0,
 		    "What the current running pacers goal");
 		SYSCTL_ADD_UINT(&hpts->hpts_ctx,
 		    SYSCTL_CHILDREN(hpts->hpts_root),
 		    OID_AUTO, "runtick", CTLFLAG_RD,
 		    &hpts->p_runningtick, 0,
 		    "What the running pacers current slot is");
 		SYSCTL_ADD_UINT(&hpts->hpts_ctx,
 		    SYSCTL_CHILDREN(hpts->hpts_root),
 		    OID_AUTO, "curtick", CTLFLAG_RD,
 		    &hpts->p_curtick, 0,
 		    "What the running pacers last tick mapped to the wheel was");
 		hpts->p_hpts_sleep_time = hpts_sleep_max;
 		hpts->p_num = i;
 		hpts->p_curtick = tcp_gethptstick(&tv);
 		hpts->p_prev_slot = hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
 		/* Placeholder until the CPU_FOREACH loop below assigns a CPU */
 		hpts->p_cpu = 0xffff;
 		hpts->p_nxt_slot = hpts_tick(hpts->p_cur_slot, 1);
 		callout_init(&hpts->co, 1);
 	}
 
 	/* Don't try to bind to NUMA domains if we don't have any */
 	if (vm_ndomains == 1 && tcp_bind_threads == 2)
 		tcp_bind_threads = 0;
 
 	/*
 	 * Now lets start ithreads to handle the hptss.
 	 */
 	CPU_FOREACH(i) {
 		hpts = tcp_pace.rp_ent[i];
 		hpts->p_cpu = i;
 		error = swi_add(&hpts->ie, "hpts",
 		    tcp_hpts_thread, (void *)hpts,
 		    SWI_NET, INTR_MPSAFE, &hpts->ie_cookie);
 		if (error) {
 			panic("Can't add hpts:%p i:%d err:%d",
 			    hpts, i, error);
 		}
 		created++;
 		if (tcp_bind_threads == 1) {
 			if (intr_event_bind(hpts->ie, i) == 0)
 				bound++;
 		} else if (tcp_bind_threads == 2) {
 			pc = pcpu_find(i);
 			domain = pc->pc_domain;
 			CPU_COPY(&cpuset_domain[domain], &cs);
 			if (intr_event_bind_ithread_cpuset(hpts->ie, &cs)
 			    == 0) {
 				bound++;
 				/* Record this CPU as a member of its domain */
 				count = hpts_domains[domain].count;
 				hpts_domains[domain].cpu[count] = i;
 				hpts_domains[domain].count++;
 			}
 		}
 		tv.tv_sec = 0;
 		tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC;
 		sb = tvtosbt(tv);
 		if (tcp_hpts_callout_skip_swi == 0) {
 			callout_reset_sbt_on(&hpts->co, sb, 0,
 			    hpts_timeout_swi, hpts, hpts->p_cpu,
 			    (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
 		} else {
 			callout_reset_sbt_on(&hpts->co, sb, 0,
 			    hpts_timeout_dir, hpts,
 			    hpts->p_cpu,
 			    C_PREL(tcp_hpts_precision));
 		}
 	}
 	/*
 	 * If we somehow have an empty domain, fall back to choosing
 	 * among all htps threads.
 	 *
 	 * The per-domain counts are only filled in when binding by NUMA
 	 * domain, so only look at them in that mode; otherwise every
 	 * domain looks empty and per-CPU binding (1) would be reset to 0.
 	 */
 	if (tcp_bind_threads == 2) {
 		for (i = 0; i < vm_ndomains; i++) {
 			if (hpts_domains[i].count == 0) {
 				tcp_bind_threads = 0;
 				break;
 			}
 		}
 	}
 
 	printf("TCP Hpts created %d swi interrupt threads and bound %d to %s\n",
 	    created, bound,
 	    tcp_bind_threads == 2 ? "NUMA domains" : "cpus");
 }
 
 /*
  * Hook tcp_init_hptsi() into boot via SYSINIT (subsystem
  * SI_SUB_KTHREAD_IDLE, order SI_ORDER_ANY, NULL argument) and declare
  * version 1 of the tcphpts module.
  */
 SYSINIT(tcphptsi, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, tcp_init_hptsi, NULL);
 MODULE_VERSION(tcphpts, 1);
Index: projects/hps_callouts/sys/netinet/tcp_hpts.h
===================================================================
--- projects/hps_callouts/sys/netinet/tcp_hpts.h	(revision 352141)
+++ projects/hps_callouts/sys/netinet/tcp_hpts.h	(revision 352142)
@@ -1,268 +1,268 @@
 /*-
  * Copyright (c) 2016-2018 Netflix, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef __tcp_hpts_h__
 #define __tcp_hpts_h__
 
 /*
  * The hpts uses a 102400 wheel. The wheel
  * defines the time in 10 usec increments (102400 x 10).
  * This gives a range of 10usec - 1024ms to place
  * an entry within. If the user requests more than
  * 1.024 seconds, a remainder is attached and the hpts
  * when seeing the remainder will re-insert the
  * inpcb forward in time from where it is until
  * the remainder is zero.
  */
 
 #define NUM_OF_HPTSI_SLOTS 102400
 
 TAILQ_HEAD(hptsh, inpcb);
 
 /* Number of useconds in a hpts tick */
 #define HPTS_TICKS_PER_USEC 10
 /*
  * Unit conversions to wheel slots (one slot is 10 usec).  The argument is
  * parenthesized so that expressions such as (a + b) convert correctly.
  * HPTS_MS_TO_SLOTS adds one extra slot; HPTS_USEC_TO_SLOTS rounds up.
  */
 #define HPTS_MS_TO_SLOTS(x) (((x) * 100) + 1)
 #define HPTS_USEC_TO_SLOTS(x) (((x) + 9) / 10)
 #define HPTS_USEC_IN_SEC 1000000
 #define HPTS_MSEC_IN_SEC 1000
 #define HPTS_USEC_IN_MSEC 1000
 
 
 /*
  * Diagnostic record of pacer state; tcp_hpts_insert_diag() takes a
  * pointer to one of these.
  * NOTE(review): the per-field comments appear to name the bbr log field
  * each value is reported in -- confirm against the logging code.
  */
 struct hpts_diag {
 	uint32_t p_hpts_active; 	/* bbr->flex7 x */
 	uint32_t p_nxt_slot;		/* bbr->flex1 x */
 	uint32_t p_cur_slot;		/* bbr->flex2 x */
 	uint32_t p_prev_slot;		/* bbr->delivered */
 	uint32_t p_runningtick;		/* bbr->inflight */
 	uint32_t slot_req;		/* bbr->flex3 x */
 	uint32_t inp_hptsslot;		/* bbr->flex4 x */
 	uint32_t slot_remaining;	/* bbr->flex5 x */
 	uint32_t have_slept;		/* bbr->epoch x */
 	uint32_t hpts_sleep_time;	/* bbr->applimited x */
 	uint32_t yet_to_sleep;		/* bbr->lt_epoch x */
 	uint32_t need_new_to;		/* bbr->flex6 x  */
 	uint32_t wheel_tick;		/* bbr->bw_inuse x */
 	uint32_t maxticks;		/* bbr->delRate x */
 	uint32_t wheel_cts;		/* bbr->rttProp x */
-	int32_t co_ret; 		/* bbr->pkts_out x */
+	callout_ret_t co_ret; 		/* bbr->pkts_out x */
 	uint32_t p_curtick;		/* upper bbr->cur_del_rate */
 	uint32_t p_lasttick;		/* lower bbr->cur_del_rate */
 	uint8_t p_on_min_sleep; 	/* bbr->flex8 x */
 };
 
 /* Magic flags to tell whats cooking on the pacing wheel */
 #define PACE_TMR_DELACK 0x01	/* Delayed ack timer running */
 #define PACE_TMR_RACK   0x02	/* RACK timer running */
 #define PACE_TMR_TLP    0x04	/* TLP timer running */
 #define PACE_TMR_RXT    0x08	/* Retransmit timer running */
 #define PACE_TMR_PERSIT 0x10	/* Persists timer running */
 #define PACE_TMR_KEEP   0x20	/* Keep alive timer running */
 #define PACE_PKT_OUTPUT 0x40	/* Output Packets being paced */
 #define PACE_TMR_MASK   (PACE_TMR_KEEP|PACE_TMR_PERSIT|PACE_TMR_RXT|PACE_TMR_TLP|PACE_TMR_RACK|PACE_TMR_DELACK)
 
 #ifdef _KERNEL
 /*
  * State for one hpts (one is allocated per CPU).  Each hpts has its own
  * p_mtx which is used for locking.
  */
 struct tcp_hpts_entry {
 	/* Cache line 0x00 */
 	struct mtx p_mtx;	/* Mutex for hpts */
 	uint16_t p_hpts_active; /* Flag that says hpts is awake  */
 	uint8_t p_hpts_wake_scheduled;	/* Have we scheduled a wakeup? */
 	uint8_t p_wheel_complete; /* have we completed the wheel arc walk? */
 	uint32_t p_curtick;	/* Tick in 10 us the hpts is going to */
 	uint32_t p_runningtick; /* Current tick we are at if we are running */
 	uint32_t p_prev_slot;	/* Previous slot we were on */
 	uint32_t p_cur_slot;	/* Current slot in wheel hpts is draining */
 	uint32_t p_nxt_slot;	/* The next slot outside the current range of
 				 * slots that the hpts is running on. */
 	int32_t p_on_queue_cnt;	/* Count on queue in this hpts */
 	uint32_t p_lasttick;	/* Last tick before the current one */
 	uint8_t p_direct_wake :1, /* boolean */
 		p_on_min_sleep:1, /* boolean */
 		p_avail:6; 
 	uint8_t p_fill[3];	  /* Fill to 32 bits */
 	/* Cache line 0x40 */
 	void *p_inp;		/* inp being processed right now (for debugging) */
 	struct hptsh p_input;	/* For the tcp-input runner */
 	/* Hptsi wheel: array of NUM_OF_HPTSI_SLOTS queues */
 	struct hptsh *p_hptss;
 	int32_t p_on_inqueue_cnt; /* Count on input queue in this hpts */
 	uint32_t hit_no_enobuf;
 	uint32_t p_dyn_adjust;
 	uint32_t p_hpts_sleep_time;	/* Current sleep interval having a max
 					 * of 255ms */
 	uint32_t overidden_sleep;	/* what was overridden by min-sleep, for logging */
 	uint32_t saved_lasttick;	/* for logging */
 	uint32_t saved_curtick;		/* for logging */
 	uint32_t saved_curslot;		/* for logging */
 	uint32_t saved_prev_slot;       /* for logging */
 	uint32_t p_delayed_by;	/* How much were we delayed by */
 	/* Cache line 0x80 */
 	struct sysctl_ctx_list hpts_ctx;	/* sysctl context for this hpts */
 	struct sysctl_oid *hpts_root;		/* this hpts's sysctl node */
 	struct intr_event *ie;	/* event filled in by swi_add() */
 	void *ie_cookie;	/* cookie filled in by swi_add() */
 	uint16_t p_num;		/* The hpts number one per cpu */
 	uint16_t p_cpu;		/* The hpts CPU */
 	/* There is extra space in here */
 	/* Cache line 0x100 */
 	struct callout co __aligned(CACHE_LINE_SIZE);	/* sleep timer */
 }               __aligned(CACHE_LINE_SIZE);
 
 /* Top-level pacer state: the array of hpts entries and how many there are. */
 struct tcp_hptsi {
 	struct proc *rp_proc;	/* Process structure for hpts */
 	struct tcp_hpts_entry **rp_ent;	/* Array of hptss */
 	uint32_t rp_num_hptss;	/* Number of hpts threads */
 };
 
 #endif
 
 #define HPTS_REMOVE_INPUT  0x01
 #define HPTS_REMOVE_OUTPUT 0x02
 #define HPTS_REMOVE_ALL    (HPTS_REMOVE_INPUT | HPTS_REMOVE_OUTPUT)
 
 /*
  * When using the hpts, a TCP stack must make sure
  * that once a INP_DROPPED flag is applied to a INP
  * that it does not expect tcp_output() to ever be
  * called by the hpts. The hpts will *not* call
  * any output (or input) functions on a TCB that
  * is in the DROPPED state.
  *
  * This implies final ACK's and RST's that might
  * be sent when a TCB is still around must be
  * sent from a routine like tcp_respond().
  */
 #define DEFAULT_MIN_SLEEP 250	/* How many usec's is default for hpts sleep
 				 * this determines min granularity of the
 				 * hpts. If 0, granularity is 10useconds at
 				 * the cost of more CPU (context switching). */
 #ifdef _KERNEL
 #define HPTS_MTX_ASSERT(hpts) mtx_assert(&(hpts)->p_mtx, MA_OWNED)
 struct tcp_hpts_entry *tcp_hpts_lock(struct inpcb *inp);
 struct tcp_hpts_entry *tcp_input_lock(struct inpcb *inp);
 int __tcp_queue_to_hpts_immediate(struct inpcb *inp, int32_t line);
 #define tcp_queue_to_hpts_immediate(a)__tcp_queue_to_hpts_immediate(a, __LINE__)
 
 struct tcp_hpts_entry *tcp_cur_hpts(struct inpcb *inp);
 #define tcp_hpts_remove(a, b) __tcp_hpts_remove(a, b, __LINE__)
 void __tcp_hpts_remove(struct inpcb *inp, int32_t flags, int32_t line);
 
 /*
  * To insert a TCB on the hpts you *must* be holding the
  * INP_WLOCK(). The hpts insert code will then acquire
  * the hpts's lock and insert the TCB on the requested
  * slot possibly waking up the hpts if you are requesting
  * a time earlier than what the hpts is sleeping to (if
  * the hpts is sleeping). You may check the inp->inp_in_hpts
  * flag without the hpts lock. The hpts is the only one
  * that will clear this flag holding only the hpts lock. This
  * means that in your tcp_output() routine when you test for
  * it to be 1 (so you wont call output) it may be transitioning
  * to 0 (by the hpts). That will be fine since that will just
  * mean an extra call to tcp_output that most likely will find
  * the call you executed (when the mis-match occurred) will have
  * put the TCB back on the hpts and it will return. If your
  * call did not add it back to the hpts then you will either
  * over-send or the cwnd will block you from sending more.
  *
  * Note you should also be holding the INP_WLOCK() when you
  * call the remove from the hpts as well. Though usually
  * you are either doing this from a timer, where you need
  * that INP_WLOCK() or from destroying your TCB where again
  * you should already have the INP_WLOCK().
  */
 uint32_t __tcp_hpts_insert(struct inpcb *inp, uint32_t slot, int32_t line);
 #define tcp_hpts_insert(a, b) __tcp_hpts_insert(a, b, __LINE__)
 
 uint32_t
 tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts_diag *diag);
 
 /*
  * Input-queue entry points.  Callers normally use the macro wrappers,
  * which supply the caller's __LINE__ as the trailing 'line' argument.
  * The wrappers deliberately carry no trailing semicolon so that each one
  * expands to a single expression and can be used in an unbraced if/else
  * or on the right-hand side of an assignment.
  */
 int
     __tcp_queue_to_input_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line);
 #define tcp_queue_to_input_locked(a, b) __tcp_queue_to_input_locked(a, b, __LINE__)
 int
 __tcp_queue_to_input(struct inpcb *inp, int32_t line);
 #define tcp_queue_to_input(a) __tcp_queue_to_input(a, __LINE__)
 
 uint16_t tcp_hpts_delayedby(struct inpcb *inp);
 
 void __tcp_set_hpts(struct inpcb *inp, int32_t line);
 #define tcp_set_hpts(a) __tcp_set_hpts(a, __LINE__)
 
 void __tcp_set_inp_to_drop(struct inpcb *inp, uint16_t reason, int32_t line);
 #define tcp_set_inp_to_drop(a, b) __tcp_set_inp_to_drop(a, b, __LINE__)
 
 extern int32_t tcp_min_hptsi_time;
 
 /*
  * Convert a timeval into hpts ticks of 10 usec each.  The result is
  * narrowed to 32 bits on return.
  */
 static __inline uint32_t
 tcp_tv_to_hptstick(struct timeval *sv)
 {
 	time_t from_secs;
 	suseconds_t from_usecs;

 	from_secs = sv->tv_sec * 100000;	/* 100000 ticks per second */
 	from_usecs = sv->tv_usec / 10;		/* 10 usec per tick */
 	return (from_secs + from_usecs);
 }
 
 /*
  * Sample microuptime() and return it in hpts ticks.  When 'sv' is
  * non-NULL the sampled timeval is also stored there; otherwise a local
  * scratch timeval is used.
  */
 static __inline uint32_t
 tcp_gethptstick(struct timeval *sv)
 {
 	struct timeval scratch;
 	struct timeval *now;

 	now = (sv != NULL) ? sv : &scratch;
 	microuptime(now);
 	return (tcp_tv_to_hptstick(now));
 }
 
 /* Convert a timeval into microseconds, narrowed to 32 bits. */
 static __inline uint32_t
 tcp_tv_to_usectick(struct timeval *sv)
 {
 	uint32_t usecs;

 	usecs = (uint32_t)(sv->tv_usec + (sv->tv_sec * HPTS_USEC_IN_SEC));
 	return (usecs);
 }
 
 /* Convert a timeval into milliseconds, narrowed to 32 bits. */
 static __inline uint32_t
 tcp_tv_to_mssectick(struct timeval *sv)
 {
 	uint32_t msecs;

 	msecs = (uint32_t)((sv->tv_usec / HPTS_USEC_IN_MSEC) +
 	    (sv->tv_sec * HPTS_MSEC_IN_SEC));
 	return (msecs);
 }
 
 /* Release the given hpts's mutex (p_mtx). */
 static __inline void
 tcp_hpts_unlock(struct tcp_hpts_entry *hpts)
 {
 	mtx_unlock(&hpts->p_mtx);
 }
 
 /*
  * Sample microuptime() and return it in microseconds (32 bits).  When
  * 'tv' is non-NULL the sampled timeval is also stored there; otherwise a
  * local scratch timeval is used.
  */
 static __inline uint32_t
 tcp_get_usecs(struct timeval *tv)
 {
 	struct timeval scratch;
 	struct timeval *now;

 	now = (tv != NULL) ? tv : &scratch;
 	microuptime(now);
 	return (tcp_tv_to_usectick(now));
 }
 
 #endif /* _KERNEL */
 #endif /* __tcp_hpts_h__ */