Index: stable/11/sys/amd64/amd64/efirt.c
===================================================================
--- stable/11/sys/amd64/amd64/efirt.c	(revision 331016)
+++ stable/11/sys/amd64/amd64/efirt.c	(revision 331017)
@@ -1,610 +1,611 @@
/*-
 * Copyright (c) 2004 Marcel Moolenaar
 * Copyright (c) 2001 Doug Rabson
 * Copyright (c) 2016 The FreeBSD Foundation
 * All rights reserved.
 *
 * Portions of this software were developed by Konstantin Belousov
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include
__FBSDID("$FreeBSD$");

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
+#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

static struct efi_systbl *efi_systbl;
static struct efi_cfgtbl *efi_cfgtbl;
static struct efi_rt *efi_runtime;

static int efi_status2err[25] = {
	0,		/* EFI_SUCCESS */
	ENOEXEC,	/* EFI_LOAD_ERROR */
	EINVAL,		/* EFI_INVALID_PARAMETER */
	ENOSYS,		/* EFI_UNSUPPORTED */
	EMSGSIZE,	/* EFI_BAD_BUFFER_SIZE */
	EOVERFLOW,	/* EFI_BUFFER_TOO_SMALL */
	EBUSY,		/* EFI_NOT_READY */
	EIO,		/* EFI_DEVICE_ERROR */
	EROFS,		/* EFI_WRITE_PROTECTED */
	EAGAIN,		/* EFI_OUT_OF_RESOURCES */
	EIO,		/* EFI_VOLUME_CORRUPTED */
	ENOSPC,		/* EFI_VOLUME_FULL */
	ENXIO,		/* EFI_NO_MEDIA */
	ESTALE,		/* EFI_MEDIA_CHANGED */
	ENOENT,		/* EFI_NOT_FOUND */
	EACCES,		/* EFI_ACCESS_DENIED */
	ETIMEDOUT,	/* EFI_NO_RESPONSE */
	EADDRNOTAVAIL,	/* EFI_NO_MAPPING */
	ETIMEDOUT,	/* EFI_TIMEOUT */
	EDOOFUS,	/* EFI_NOT_STARTED */
	EALREADY,	/* EFI_ALREADY_STARTED */
	ECANCELED,	/* EFI_ABORTED */
	EPROTO,		/* EFI_ICMP_ERROR */
	EPROTO,		/* EFI_TFTP_ERROR */
	EPROTO		/* EFI_PROTOCOL_ERROR */
};

static int
efi_status_to_errno(efi_status status)
{
	u_long code;

	code = status & 0x3ffffffffffffffful;
	return (code < nitems(efi_status2err) ? efi_status2err[code] :
	    EDOOFUS);
}
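/*
 * Worked example (illustration only, not part of this change): EFI error
 * statuses set the high bit of the 64-bit status word, so
 * efi_status_to_errno() first masks the top bits off.  E.g.
 * EFI_INVALID_PARAMETER is 0x8000000000000002; masking it with
 * 0x3fffffffffffffff leaves code 2, which indexes EINVAL in the table
 * above.  Any code past the end of the table falls back to EDOOFUS.
 */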
static struct mtx efi_lock;
static pml4_entry_t *efi_pml4;
static vm_object_t obj_1t1_pt;
static vm_page_t efi_pml4_page;
static vm_pindex_t efi_1t1_idx;

static void
efi_destroy_1t1_map(void)
{
	vm_page_t m;

	if (obj_1t1_pt != NULL) {
		VM_OBJECT_RLOCK(obj_1t1_pt);
		TAILQ_FOREACH(m, &obj_1t1_pt->memq, listq)
			m->wire_count = 0;
		atomic_subtract_int(&vm_cnt.v_wire_count,
		    obj_1t1_pt->resident_page_count);
		VM_OBJECT_RUNLOCK(obj_1t1_pt);
		vm_object_deallocate(obj_1t1_pt);
	}

	obj_1t1_pt = NULL;
	efi_pml4 = NULL;
	efi_pml4_page = NULL;
}

static vm_page_t
efi_1t1_page(void)
{

	return (vm_page_grab(obj_1t1_pt, efi_1t1_idx++, VM_ALLOC_NOBUSY |
	    VM_ALLOC_WIRED | VM_ALLOC_ZERO));
}

static pt_entry_t *
efi_1t1_pte(vm_offset_t va)
{
	pml4_entry_t *pml4e;
	pdp_entry_t *pdpe;
	pd_entry_t *pde;
	pt_entry_t *pte;
	vm_page_t m;
	vm_pindex_t pml4_idx, pdp_idx, pd_idx;
	vm_paddr_t mphys;

	pml4_idx = pmap_pml4e_index(va);
	pml4e = &efi_pml4[pml4_idx];
	if (*pml4e == 0) {
		m = efi_1t1_page();
		mphys = VM_PAGE_TO_PHYS(m);
		*pml4e = mphys | X86_PG_RW | X86_PG_V;
	} else {
		mphys = *pml4e & ~PAGE_MASK;
	}

	pdpe = (pdp_entry_t *)PHYS_TO_DMAP(mphys);
	pdp_idx = pmap_pdpe_index(va);
	pdpe += pdp_idx;
	if (*pdpe == 0) {
		m = efi_1t1_page();
		mphys = VM_PAGE_TO_PHYS(m);
		*pdpe = mphys | X86_PG_RW | X86_PG_V;
	} else {
		mphys = *pdpe & ~PAGE_MASK;
	}

	pde = (pd_entry_t *)PHYS_TO_DMAP(mphys);
	pd_idx = pmap_pde_index(va);
	pde += pd_idx;
	if (*pde == 0) {
		m = efi_1t1_page();
		mphys = VM_PAGE_TO_PHYS(m);
		*pde = mphys | X86_PG_RW | X86_PG_V;
	} else {
		mphys = *pde & ~PAGE_MASK;
	}

	pte = (pt_entry_t *)PHYS_TO_DMAP(mphys);
	pte += pmap_pte_index(va);
	KASSERT(*pte == 0, ("va %#jx *pt %#jx", va, *pte));

	return (pte);
}
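/*
 * Illustrative arithmetic (not part of this change): the VM object
 * allocated in efi_create_1t1_map() below is sized for the worst-case
 * number of page-table pages a full 4-level tree could ever need, and
 * efi_1t1_page() hands them out lazily one pindex at a time:
 *
 *	1				PML4 page
 *	+ NPML4EPG			(512) PDP pages
 *	+ NPML4EPG * NPDPEPG		(512 * 512) PD pages
 *	+ NPML4EPG * NPDPEPG * NPDEPG	(512^3) PT pages
 *
 * so each level of the walk in efi_1t1_pte() can simply grab a fresh
 * zeroed, wired page whenever it finds an empty entry.
 */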
static bool
efi_create_1t1_map(struct efi_md *map, int ndesc, int descsz)
{
	struct efi_md *p;
	pt_entry_t *pte;
	vm_offset_t va;
	uint64_t idx;
	int bits, i, mode;

	obj_1t1_pt = vm_pager_allocate(OBJT_PHYS, NULL, ptoa(1 +
	    NPML4EPG + NPML4EPG * NPDPEPG + NPML4EPG * NPDPEPG * NPDEPG),
	    VM_PROT_ALL, 0, NULL);
	efi_1t1_idx = 0;
	VM_OBJECT_WLOCK(obj_1t1_pt);
	efi_pml4_page = efi_1t1_page();
	VM_OBJECT_WUNLOCK(obj_1t1_pt);
	efi_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(efi_pml4_page));
	pmap_pinit_pml4(efi_pml4_page);

	for (i = 0, p = map; i < ndesc; i++, p = efi_next_descriptor(p,
	    descsz)) {
		if ((p->md_attr & EFI_MD_ATTR_RT) == 0)
			continue;
		if (p->md_virt != NULL) {
			if (bootverbose)
				printf("EFI Runtime entry %d is mapped\n", i);
			goto fail;
		}
		if ((p->md_phys & EFI_PAGE_MASK) != 0) {
			if (bootverbose)
				printf("EFI Runtime entry %d is not aligned\n",
				    i);
			goto fail;
		}
		if (p->md_phys + p->md_pages * EFI_PAGE_SIZE < p->md_phys ||
		    p->md_phys + p->md_pages * EFI_PAGE_SIZE >=
		    VM_MAXUSER_ADDRESS) {
			printf("EFI Runtime entry %d is not mappable for RT: "
			    "base %#016jx %#jx pages\n",
			    i, (uintmax_t)p->md_phys,
			    (uintmax_t)p->md_pages);
			goto fail;
		}
		if ((p->md_attr & EFI_MD_ATTR_WB) != 0)
			mode = VM_MEMATTR_WRITE_BACK;
		else if ((p->md_attr & EFI_MD_ATTR_WT) != 0)
			mode = VM_MEMATTR_WRITE_THROUGH;
		else if ((p->md_attr & EFI_MD_ATTR_WC) != 0)
			mode = VM_MEMATTR_WRITE_COMBINING;
		else if ((p->md_attr & EFI_MD_ATTR_WP) != 0)
			mode = VM_MEMATTR_WRITE_PROTECTED;
		else if ((p->md_attr & EFI_MD_ATTR_UC) != 0)
			mode = VM_MEMATTR_UNCACHEABLE;
		else {
			if (bootverbose)
				printf("EFI Runtime entry %d mapping "
				    "attributes unsupported\n", i);
			mode = VM_MEMATTR_UNCACHEABLE;
		}
		bits = pmap_cache_bits(kernel_pmap, mode, FALSE) | X86_PG_RW |
		    X86_PG_V;
		VM_OBJECT_WLOCK(obj_1t1_pt);
		for (va = p->md_phys, idx = 0; idx < p->md_pages; idx++,
		    va += PAGE_SIZE) {
			pte = efi_1t1_pte(va);
			pte_store(pte, va | bits);
		}
		VM_OBJECT_WUNLOCK(obj_1t1_pt);
	}

	return (true);

fail:
	efi_destroy_1t1_map();
	return (false);
}

/*
 * Create an environment for the EFI runtime code call.  The most
 * important part is creating the required 1:1 physical->virtual
 * mappings for the runtime segments.  To do that, we manually create a
 * page table which unmaps userspace but gives the correct kernel
 * mapping.  The 1:1 mappings for runtime segments usually occupy the
 * low 4G of the physical address map.
 *
 * The 1:1 mappings were chosen over the SetVirtualAddressMap() EFI RT
 * service, because there are some BIOSes which fail to correctly
 * relocate themselves on the call, requiring both the 1:1 and virtual
 * mappings.  As a result, we must provide the 1:1 mapping anyway, so
 * there is no reason to bother with the virtual map, and no need to
 * add complexity to the loader.
 *
 * The fpu_kern_enter() call allows the firmware to use the FPU, as
 * mandated by the specification.  In particular, the CR0.TS bit is
 * cleared.  It also enters a critical section, giving us the necessary
 * protection against context switches.
 *
 * There is no need to disable interrupts around the change of %cr3:
 * the kernel mappings are correct, while we only grabbed the userspace
 * portion of the VA.  Interrupt handlers must not access userspace.
 * Having interrupts enabled fixes the issue with long-running
 * firmware/SMM operations, which would otherwise negatively affect
 * IPIs, esp. TLB shootdown requests.
 */
static int
efi_enter(void)
{
	pmap_t curpmap;
	int error;

	if (efi_runtime == NULL)
		return (ENXIO);
	curpmap = PCPU_GET(curpmap);
	PMAP_LOCK(curpmap);
	mtx_lock(&efi_lock);
	error = fpu_kern_enter(curthread, NULL, FPU_KERN_NOCTX);
	if (error != 0) {
		PMAP_UNLOCK(curpmap);
		return (error);
	}

	/*
	 * IPI TLB shootdown handler invltlb_pcid_handler() reloads
	 * %cr3 from the curpmap->pm_cr3, which would disable runtime
	 * segments mappings.  Block the handler's action by setting
	 * curpmap to an impossible value.  See also the comment in
	 * pmap.c:pmap_activate_sw().
	 */
	if (pmap_pcid_enabled && !invpcid_works)
		PCPU_SET(curpmap, NULL);

	load_cr3(VM_PAGE_TO_PHYS(efi_pml4_page) | (pmap_pcid_enabled ?
	    curpmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid : 0));
	/*
	 * If PCID is enabled, the cleared CR3_PCID_SAVE bit in the
	 * loaded %cr3 causes TLB invalidation.
	 */
	if (!pmap_pcid_enabled)
		invltlb();
	return (0);
}

static void
efi_leave(void)
{
	pmap_t curpmap;

	curpmap = &curproc->p_vmspace->vm_pmap;
	if (pmap_pcid_enabled && !invpcid_works)
		PCPU_SET(curpmap, curpmap);
	load_cr3(curpmap->pm_cr3 | (pmap_pcid_enabled ?
	    curpmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid : 0));
	if (!pmap_pcid_enabled)
		invltlb();

	fpu_kern_leave(curthread, NULL);
	mtx_unlock(&efi_lock);
	PMAP_UNLOCK(curpmap);
}
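/*
 * Illustrative sketch (not part of this change): every runtime-service
 * wrapper below follows the same enter/call/leave pattern around the
 * page-table switch, so a hypothetical new wrapper would look like this.
 * "rt_foo" is a made-up member used only for illustration.
 */
#if 0
static int
efi_foo(void)
{
	efi_status status;
	int error;

	error = efi_enter();		/* switch to the 1:1 page table */
	if (error != 0)
		return (error);
	status = efi_runtime->rt_foo();	/* hypothetical RT service */
	efi_leave();			/* restore the previous %cr3 */
	return (efi_status_to_errno(status));
}
#endif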
static int
efi_init(void)
{
	struct efi_map_header *efihdr;
	struct efi_md *map;
	caddr_t kmdp;
	size_t efisz;

	mtx_init(&efi_lock, "efi", NULL, MTX_DEF);

	if (efi_systbl_phys == 0) {
		if (bootverbose)
			printf("EFI systbl not available\n");
		return (0);
	}
	efi_systbl = (struct efi_systbl *)PHYS_TO_DMAP(efi_systbl_phys);
	if (efi_systbl->st_hdr.th_sig != EFI_SYSTBL_SIG) {
		efi_systbl = NULL;
		if (bootverbose)
			printf("EFI systbl signature invalid\n");
		return (0);
	}
	efi_cfgtbl = (efi_systbl->st_cfgtbl == 0) ? NULL :
	    (struct efi_cfgtbl *)efi_systbl->st_cfgtbl;
	if (efi_cfgtbl == NULL) {
		if (bootverbose)
			printf("EFI config table is not present\n");
	}

	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	if (efihdr == NULL) {
		if (bootverbose)
			printf("EFI map is not present\n");
		return (0);
	}
	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
	map = (struct efi_md *)((uint8_t *)efihdr + efisz);
	if (efihdr->descriptor_size == 0)
		return (ENOMEM);

	if (!efi_create_1t1_map(map, efihdr->memory_size /
	    efihdr->descriptor_size, efihdr->descriptor_size)) {
		if (bootverbose)
			printf("EFI cannot create runtime map\n");
		return (ENOMEM);
	}

	efi_runtime = (efi_systbl->st_rt == 0) ? NULL :
	    (struct efi_rt *)efi_systbl->st_rt;
	if (efi_runtime == NULL) {
		if (bootverbose)
			printf("EFI runtime services table is not present\n");
		efi_destroy_1t1_map();
		return (ENXIO);
	}

	return (0);
}

static void
efi_uninit(void)
{

	efi_destroy_1t1_map();

	efi_systbl = NULL;
	efi_cfgtbl = NULL;
	efi_runtime = NULL;

	mtx_destroy(&efi_lock);
}

int
efi_get_table(struct uuid *uuid, void **ptr)
{
	struct efi_cfgtbl *ct;
	u_long count;

	if (efi_cfgtbl == NULL)
		return (ENXIO);
	count = efi_systbl->st_entries;
	ct = efi_cfgtbl;
	while (count--) {
		if (!bcmp(&ct->ct_uuid, uuid, sizeof(*uuid))) {
			*ptr = (void *)PHYS_TO_DMAP(ct->ct_data);
			return (0);
		}
		ct++;
	}
	return (ENOENT);
}

int
efi_get_time_locked(struct efi_tm *tm)
{
	efi_status status;
	int error;

	mtx_assert(&atrtc_time_lock, MA_OWNED);
	error = efi_enter();
	if (error != 0)
		return (error);
	status = efi_runtime->rt_gettime(tm, NULL);
	efi_leave();
	error = efi_status_to_errno(status);
	return (error);
}

int
efi_get_time(struct efi_tm *tm)
{
	int error;

	if (efi_runtime == NULL)
		return (ENXIO);
	mtx_lock(&atrtc_time_lock);
	error = efi_get_time_locked(tm);
	mtx_unlock(&atrtc_time_lock);
	return (error);
}

int
efi_reset_system(void)
{
	int error;

	error = efi_enter();
	if (error != 0)
		return (error);
	efi_runtime->rt_reset(EFI_RESET_WARM, 0, 0, NULL);
	efi_leave();
	return (EIO);
}

int
efi_set_time_locked(struct efi_tm *tm)
{
	efi_status status;
	int error;

	mtx_assert(&atrtc_time_lock, MA_OWNED);
	error = efi_enter();
	if (error != 0)
		return (error);
	status = efi_runtime->rt_settime(tm);
	efi_leave();
	error = efi_status_to_errno(status);
	return (error);
}

int
efi_set_time(struct efi_tm *tm)
{
	int error;

	if (efi_runtime == NULL)
		return (ENXIO);
	mtx_lock(&atrtc_time_lock);
	error = efi_set_time_locked(tm);
	mtx_unlock(&atrtc_time_lock);
	return (error);
}

int
efi_var_get(efi_char *name, struct uuid *vendor, uint32_t *attrib,
    size_t *datasize, void *data)
{
	efi_status status;
	int error;

	error = efi_enter();
	if (error != 0)
		return (error);
	status = efi_runtime->rt_getvar(name, vendor, attrib, datasize, data);
	efi_leave();
	error = efi_status_to_errno(status);
	return (error);
}

int
efi_var_nextname(size_t *namesize, efi_char *name, struct uuid *vendor)
{
	efi_status status;
	int error;

	error = efi_enter();
	if (error != 0)
		return (error);
	status = efi_runtime->rt_scanvar(namesize, name, vendor);
	efi_leave();
	error = efi_status_to_errno(status);
	return (error);
}

int
efi_var_set(efi_char *name, struct uuid *vendor, uint32_t attrib,
    size_t datasize, void *data)
{
	efi_status status;
	int error;

	error = efi_enter();
	if (error != 0)
		return (error);
	status = efi_runtime->rt_setvar(name, vendor, attrib, datasize, data);
	efi_leave();
	error = efi_status_to_errno(status);
	return (error);
}
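/*
 * Illustrative sketch (not part of this change): a consumer locates a
 * vendor configuration table by UUID via efi_get_table() above and gets
 * back a DMAP pointer.  "table_uuid" stands in for a real table UUID
 * (e.g. the SMBIOS anchor) and is shown only to demonstrate the calling
 * convention.
 */
#if 0
static void *
efi_lookup_example(struct uuid *table_uuid)
{
	void *tbl;

	if (efi_get_table(table_uuid, &tbl) != 0)
		return (NULL);	/* ENXIO without a config table, ENOENT if absent */
	return (tbl);		/* already a kernel-accessible DMAP address */
}
#endif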
static int
efirt_modevents(module_t m, int event, void *arg __unused)
{

	switch (event) {
	case MOD_LOAD:
		return (efi_init());

	case MOD_UNLOAD:
		efi_uninit();
		return (0);

	case MOD_SHUTDOWN:
		return (0);

	default:
		return (EOPNOTSUPP);
	}
}

static moduledata_t efirt_moddata = {
	.name = "efirt",
	.evhand = efirt_modevents,
	.priv = NULL,
};
DECLARE_MODULE(efirt, efirt_moddata, SI_SUB_VM_CONF, SI_ORDER_ANY);
MODULE_VERSION(efirt, 1);

/* XXX debug stuff */
static int
efi_time_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct efi_tm tm;
	int error, val;

	val = 0;
	error = sysctl_handle_int(oidp, &val, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	error = efi_get_time(&tm);
	if (error == 0) {
		uprintf("EFI reports: Year %d Month %d Day %d Hour %d Min %d "
		    "Sec %d\n", tm.tm_year, tm.tm_mon, tm.tm_mday, tm.tm_hour,
		    tm.tm_min, tm.tm_sec);
	}
	return (error);
}

SYSCTL_PROC(_debug, OID_AUTO, efi_time, CTLTYPE_INT | CTLFLAG_RW, NULL, 0,
    efi_time_sysctl_handler, "I", "");

Index: stable/11/sys/amd64/amd64/minidump_machdep.c
===================================================================
--- stable/11/sys/amd64/amd64/minidump_machdep.c	(revision 331016)
+++ stable/11/sys/amd64/amd64/minidump_machdep.c	(revision 331017)
@@ -1,487 +1,488 @@
/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2006 Peter Wemm
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include
__FBSDID("$FreeBSD$");

#include "opt_pmap.h"
#include "opt_watchdog.h"

#include
#include
#include
#include
#include
#include
#include
#include
+#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

CTASSERT(sizeof(struct kerneldumpheader) == 512);

/*
 * Don't touch the first SIZEOF_METADATA bytes on the dump device.  This
 * is to protect us from metadata and to protect metadata from us.
 */
#define	SIZEOF_METADATA		(64*1024)

uint64_t *vm_page_dump;
int vm_page_dump_size;

static struct kerneldumpheader kdh;
static off_t dumplo;

/* Handle chunked writes. */
static size_t fragsz;
static void *dump_va;
static size_t counter, progress, dumpsize;

CTASSERT(sizeof(*vm_page_dump) == 8);

static int
is_dumpable(vm_paddr_t pa)
{
	vm_page_t m;
	int i;

	if ((m = vm_phys_paddr_to_vm_page(pa)) != NULL)
		return ((m->flags & PG_NODUMP) == 0);
	for (i = 0; dump_avail[i] != 0 || dump_avail[i + 1] != 0; i += 2) {
		if (pa >= dump_avail[i] && pa < dump_avail[i + 1])
			return (1);
	}
	return (0);
}

#define PG2MB(pgs) (((pgs) + (1 << 8) - 1) >> 8)

static int
blk_flush(struct dumperinfo *di)
{
	int error;

	if (fragsz == 0)
		return (0);

	error = dump_write(di, dump_va, 0, dumplo, fragsz);
	dumplo += fragsz;
	fragsz = 0;
	return (error);
}

static struct {
	int min_per;
	int max_per;
	int visited;
} progress_track[10] = {
	{  0,  10, 0},
	{ 10,  20, 0},
	{ 20,  30, 0},
	{ 30,  40, 0},
	{ 40,  50, 0},
	{ 50,  60, 0},
	{ 60,  70, 0},
	{ 70,  80, 0},
	{ 80,  90, 0},
	{ 90, 100, 0}
};

static void
report_progress(size_t progress, size_t dumpsize)
{
	int sofar, i;

	sofar = 100 - ((progress * 100) / dumpsize);
	for (i = 0; i < nitems(progress_track); i++) {
		if (sofar < progress_track[i].min_per ||
		    sofar > progress_track[i].max_per)
			continue;
		if (progress_track[i].visited)
			return;
		progress_track[i].visited = 1;
		printf("..%d%%", sofar);
		return;
	}
}

static int
blk_write(struct dumperinfo *di, char *ptr, vm_paddr_t pa, size_t sz)
{
	size_t len;
	int error, i, c;
	u_int maxdumpsz;

	maxdumpsz = min(di->maxiosize, MAXDUMPPGS * PAGE_SIZE);
	if (maxdumpsz == 0)	/* seatbelt */
		maxdumpsz = PAGE_SIZE;
	error = 0;
	if ((sz % PAGE_SIZE) != 0) {
		printf("size not page aligned\n");
		return (EINVAL);
	}
	if (ptr != NULL && pa != 0) {
		printf("can't have both va and pa!\n");
		return (EINVAL);
	}
	if ((((uintptr_t)pa) % PAGE_SIZE) != 0) {
		printf("address not page aligned %p\n", ptr);
		return (EINVAL);
	}
	if (ptr != NULL) {
		/*
		 * If we're doing a virtual dump, flush any
		 * pre-existing pa pages.
		 */
		error = blk_flush(di);
		if (error)
			return (error);
	}
	while (sz) {
		len = maxdumpsz - fragsz;
		if (len > sz)
			len = sz;
		counter += len;
		progress -= len;
		if (counter >> 24) {
			report_progress(progress, dumpsize);
			counter &= (1<<24) - 1;
		}
		wdog_kern_pat(WD_LASTVAL);
		if (ptr) {
			error = dump_write(di, ptr, 0, dumplo, len);
			if (error)
				return (error);
			dumplo += len;
			ptr += len;
			sz -= len;
		} else {
			for (i = 0; i < len; i += PAGE_SIZE)
				dump_va = pmap_kenter_temporary(pa + i,
				    (i + fragsz) >> PAGE_SHIFT);
			fragsz += len;
			pa += len;
			sz -= len;
			if (fragsz == maxdumpsz) {
				error = blk_flush(di);
				if (error)
					return (error);
			}
		}

		/* Check for user abort. */
		c = cncheckc();
		if (c == 0x03)
			return (ECANCELED);
		if (c != -1)
			printf(" (CTRL-C to abort) ");
	}

	return (0);
}

/* A fake page table page, to avoid having to handle both 4K and 2M pages */
static pd_entry_t fakepd[NPDEPG];
int
minidumpsys(struct dumperinfo *di)
{
	uint32_t pmapsize;
	vm_offset_t va;
	int error;
	uint64_t bits;
	uint64_t *pml4, *pdp, *pd, *pt, pa;
	size_t size;
	int i, ii, j, k, n, bit;
	int retry_count;
	struct minidumphdr mdhdr;

	retry_count = 0;
 retry:
	retry_count++;
	counter = 0;
	for (i = 0; i < nitems(progress_track); i++)
		progress_track[i].visited = 0;
	/* Walk page table pages, set bits in vm_page_dump */
	pmapsize = 0;
	for (va = VM_MIN_KERNEL_ADDRESS; va < MAX(KERNBASE + nkpt * NBPDR,
	    kernel_vm_end); ) {
		/*
		 * We always write a page, even if it is zero.  Each
		 * page written corresponds to 1GB of space
		 */
		pmapsize += PAGE_SIZE;
		ii = pmap_pml4e_index(va);
		pml4 = (uint64_t *)PHYS_TO_DMAP(KPML4phys) + ii;
		pdp = (uint64_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
		i = pmap_pdpe_index(va);
		if ((pdp[i] & PG_V) == 0) {
			va += NBPDP;
			continue;
		}

		/*
		 * 1GB page is represented as 512 2MB pages in a dump.
		 */
		if ((pdp[i] & PG_PS) != 0) {
			va += NBPDP;
			pa = pdp[i] & PG_PS_FRAME;
			for (n = 0; n < NPDEPG * NPTEPG; n++) {
				if (is_dumpable(pa))
					dump_add_page(pa);
				pa += PAGE_SIZE;
			}
			continue;
		}

		pd = (uint64_t *)PHYS_TO_DMAP(pdp[i] & PG_FRAME);
		for (n = 0; n < NPDEPG; n++, va += NBPDR) {
			j = pmap_pde_index(va);
			if ((pd[j] & PG_V) == 0)
				continue;
			if ((pd[j] & PG_PS) != 0) {
				/* This is an entire 2M page. */
				pa = pd[j] & PG_PS_FRAME;
				for (k = 0; k < NPTEPG; k++) {
					if (is_dumpable(pa))
						dump_add_page(pa);
					pa += PAGE_SIZE;
				}
				continue;
			}
			pa = pd[j] & PG_FRAME;
			/* set bit for this PTE page */
			if (is_dumpable(pa))
				dump_add_page(pa);
			/* and for each valid page in this 2MB block */
			pt = (uint64_t *)PHYS_TO_DMAP(pd[j] & PG_FRAME);
			for (k = 0; k < NPTEPG; k++) {
				if ((pt[k] & PG_V) == 0)
					continue;
				pa = pt[k] & PG_FRAME;
				if (is_dumpable(pa))
					dump_add_page(pa);
			}
		}
	}
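	/*
	 * Worked arithmetic (illustration only, not part of this change):
	 * with 4KB base pages, NPDEPG == NPTEPG == 512, so one 1GB PDP
	 * superpage covers NPDEPG * NPTEPG == 262144 page frames.  That is
	 * why the walk above marks 512 * 512 frames for a single 1GB
	 * mapping, and why the dump loop below synthesizes 512 2MB PDEs
	 * (stepping by NBPDR) in fakepd from one PDP entry.
	 */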
	/* Calculate dump size. */
	dumpsize = pmapsize;
	dumpsize += round_page(msgbufp->msg_size);
	dumpsize += round_page(vm_page_dump_size);
	for (i = 0; i < vm_page_dump_size / sizeof(*vm_page_dump); i++) {
		bits = vm_page_dump[i];
		while (bits) {
			bit = bsfq(bits);
			pa = (((uint64_t)i * sizeof(*vm_page_dump) * NBBY) +
			    bit) * PAGE_SIZE;
			/* Clear out undumpable pages now if needed */
			if (is_dumpable(pa)) {
				dumpsize += PAGE_SIZE;
			} else {
				dump_drop_page(pa);
			}
			bits &= ~(1ul << bit);
		}
	}
	dumpsize += PAGE_SIZE;

	/* Determine dump offset on device. */
	if (di->mediasize < SIZEOF_METADATA + dumpsize + di->blocksize * 2) {
		error = E2BIG;
		goto fail;
	}
	dumplo = di->mediaoffset + di->mediasize - dumpsize;
	dumplo -= di->blocksize * 2;
	progress = dumpsize;

	/* Initialize mdhdr */
	bzero(&mdhdr, sizeof(mdhdr));
	strcpy(mdhdr.magic, MINIDUMP_MAGIC);
	mdhdr.version = MINIDUMP_VERSION;
	mdhdr.msgbufsize = msgbufp->msg_size;
	mdhdr.bitmapsize = vm_page_dump_size;
	mdhdr.pmapsize = pmapsize;
	mdhdr.kernbase = VM_MIN_KERNEL_ADDRESS;
	mdhdr.dmapbase = DMAP_MIN_ADDRESS;
	mdhdr.dmapend = DMAP_MAX_ADDRESS;

	mkdumpheader(&kdh, KERNELDUMPMAGIC, KERNELDUMP_AMD64_VERSION, dumpsize,
	    di->blocksize);

	printf("Dumping %llu out of %ju MB:", (long long)dumpsize >> 20,
	    ptoa((uintmax_t)physmem) / 1048576);

	/* Dump leader */
	error = dump_write_pad(di, &kdh, 0, dumplo, sizeof(kdh), &size);
	if (error)
		goto fail;
	dumplo += size;

	/* Dump my header */
	bzero(&fakepd, sizeof(fakepd));
	bcopy(&mdhdr, &fakepd, sizeof(mdhdr));
	error = blk_write(di, (char *)&fakepd, 0, PAGE_SIZE);
	if (error)
		goto fail;

	/* Dump msgbuf up front */
	error = blk_write(di, (char *)msgbufp->msg_ptr, 0,
	    round_page(msgbufp->msg_size));
	if (error)
		goto fail;

	/* Dump bitmap */
	error = blk_write(di, (char *)vm_page_dump, 0,
	    round_page(vm_page_dump_size));
	if (error)
		goto fail;

	/* Dump kernel page directory pages */
	bzero(fakepd, sizeof(fakepd));
	for (va = VM_MIN_KERNEL_ADDRESS; va < MAX(KERNBASE + nkpt * NBPDR,
	    kernel_vm_end); va += NBPDP) {
		ii = pmap_pml4e_index(va);
		pml4 = (uint64_t *)PHYS_TO_DMAP(KPML4phys) + ii;
		pdp = (uint64_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
		i = pmap_pdpe_index(va);

		/* We always write a page, even if it is zero */
		if ((pdp[i] & PG_V) == 0) {
			error = blk_write(di, (char *)&fakepd, 0, PAGE_SIZE);
			if (error)
				goto fail;
			/* flush, in case we reuse fakepd in the same block */
			error = blk_flush(di);
			if (error)
				goto fail;
			continue;
		}

		/* 1GB page is represented as 512 2MB pages in a dump */
		if ((pdp[i] & PG_PS) != 0) {
			/* PDPE and PDP have identical layout in this case */
			fakepd[0] = pdp[i];
			for (j = 1; j < NPDEPG; j++)
				fakepd[j] = fakepd[j - 1] + NBPDR;
			error = blk_write(di, (char *)&fakepd, 0, PAGE_SIZE);
			if (error)
				goto fail;
			/* flush, in case we reuse fakepd in the same block */
			error = blk_flush(di);
			if (error)
				goto fail;
			bzero(fakepd, sizeof(fakepd));
			continue;
		}

		pd = (uint64_t *)PHYS_TO_DMAP(pdp[i] & PG_FRAME);
		error = blk_write(di, (char *)pd, 0, PAGE_SIZE);
		if (error)
			goto fail;
		error = blk_flush(di);
		if (error)
			goto fail;
	}

	/* Dump memory chunks */
	/* XXX cluster it up and use blk_dump() */
	for (i = 0; i < vm_page_dump_size / sizeof(*vm_page_dump); i++) {
		bits = vm_page_dump[i];
		while (bits) {
			bit = bsfq(bits);
			pa = (((uint64_t)i * sizeof(*vm_page_dump) * NBBY) +
			    bit) * PAGE_SIZE;
			error = blk_write(di, 0, pa, PAGE_SIZE);
			if (error)
				goto fail;
			bits &= ~(1ul << bit);
		}
	}

	error = blk_flush(di);
	if (error)
		goto fail;

	/* Dump trailer */
	error = dump_write_pad(di, &kdh, 0, dumplo, sizeof(kdh), &size);
	if (error)
		goto fail;
	dumplo += size;

	/* Signal completion, signoff and exit stage left. */
	dump_write(di, NULL, 0, 0, 0);
	printf("\nDump complete\n");
	return (0);

 fail:
	if (error < 0)
		error = -error;

	printf("\n");
	if (error == ENOSPC) {
		printf("Dump map grown while dumping. ");
		if (retry_count < 5) {
			printf("Retrying...\n");
			goto retry;
		}
		printf("Dump failed.\n");
	} else if (error == ECANCELED)
		printf("Dump aborted\n");
	else if (error == E2BIG)
		printf("Dump failed. Partition too small.\n");
	else
		printf("** DUMP FAILED (ERROR %d) **\n", error);
	return (error);
}

void
dump_add_page(vm_paddr_t pa)
{
	int idx, bit;

	pa >>= PAGE_SHIFT;
	idx = pa >> 6;		/* 2^6 = 64 */
	bit = pa & 63;
	atomic_set_long(&vm_page_dump[idx], 1ul << bit);
}

void
dump_drop_page(vm_paddr_t pa)
{
	int idx, bit;

	pa >>= PAGE_SHIFT;
	idx = pa >> 6;		/* 2^6 = 64 */
	bit = pa & 63;
	atomic_clear_long(&vm_page_dump[idx], 1ul << bit);
}
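/*
 * Illustrative sketch (hypothetical helper, not part of this change):
 * dump_add_page()/dump_drop_page() above treat vm_page_dump as a flat
 * bitmap with one bit per 4KB page frame, 64 frames per 64-bit word.  A
 * predicate using the same indexing would be:
 */
#if 0
static __inline int
dump_page_is_marked(vm_paddr_t pa)
{

	pa >>= PAGE_SHIFT;			/* page frame number */
	return ((vm_page_dump[pa >> 6] &	/* word index: 2^6 = 64 bits */
	    (1ul << (pa & 63))) != 0);		/* bit within the word */
}
#endif
/*
 * E.g. pa = 0x12345000 gives frame 0x12345, word 0x48d, bit 5.
 */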
Index: stable/11/sys/amd64/amd64/uma_machdep.c
===================================================================
--- stable/11/sys/amd64/amd64/uma_machdep.c	(revision 331016)
+++ stable/11/sys/amd64/amd64/uma_machdep.c	(revision 331017)
@@ -1,78 +1,79 @@
/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2003 Alan L. Cox
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include
__FBSDID("$FreeBSD$");

#include
#include
#include
#include
#include
+#include
#include
#include
#include
#include
#include
#include
#include

void *
uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait)
{
	vm_page_t m;
	vm_paddr_t pa;
	void *va;

	*flags = UMA_SLAB_PRIV;
	m = vm_page_alloc(NULL, 0,
	    malloc2vm_flags(wait) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED);
	if (m == NULL)
		return (NULL);
	pa = m->phys_addr;
	if ((wait & M_NODUMP) == 0)
		dump_add_page(pa);
	va = (void *)PHYS_TO_DMAP(pa);
	if ((wait & M_ZERO) && (m->flags & PG_ZERO) == 0)
		pagezero(va);
	return (va);
}

void
uma_small_free(void *mem, vm_size_t size, u_int8_t flags)
{
	vm_page_t m;
	vm_paddr_t pa;

	pa = DMAP_TO_PHYS((vm_offset_t)mem);
	dump_drop_page(pa);
	m = PHYS_TO_VM_PAGE(pa);
	m->wire_count--;
	vm_page_free(m);
	atomic_subtract_int(&vm_cnt.v_wire_count, 1);
}
*/ #include "opt_platform.h" #include "opt_hwpmc_hooks.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #ifdef FDT #include #include #endif #define INTRNAME_LEN (MAXCOMLEN + 1) typedef void (*mask_fn)(void *); static struct intr_event *intr_events[NIRQ]; void intr_irq_handler(struct trapframe *); void (*arm_post_filter)(void *) = NULL; int (*arm_config_irq)(int irq, enum intr_trigger trig, enum intr_polarity pol) = NULL; /* Data for statistics reporting. */ u_long intrcnt[NIRQ]; char intrnames[(NIRQ * INTRNAME_LEN) + 1]; size_t sintrcnt = sizeof(intrcnt); size_t sintrnames = sizeof(intrnames); /* * Pre-format intrnames into an array of fixed-size strings containing spaces. * This allows us to avoid the need for an intermediate table of indices into * the names and counts arrays, while still meeting the requirements and * assumptions of vmstat(8) and the kdb "show intrcnt" command, the two * consumers of this data. */ static void intr_init(void *unused) { int i; for (i = 0; i < NIRQ; ++i) { snprintf(&intrnames[i * INTRNAME_LEN], INTRNAME_LEN, "%-*s", INTRNAME_LEN - 1, ""); } } SYSINIT(intr_init, SI_SUB_INTR, SI_ORDER_FIRST, intr_init, NULL); #ifdef FDT int intr_fdt_map_irq(phandle_t iparent, pcell_t *intr, int icells) { fdt_pic_decode_t intr_decode; phandle_t intr_parent; int i, rv, interrupt, trig, pol; intr_parent = OF_node_from_xref(iparent); for (i = 0; i < icells; i++) intr[i] = cpu_to_fdt32(intr[i]); for (i = 0; fdt_pic_table[i] != NULL; i++) { intr_decode = fdt_pic_table[i]; rv = intr_decode(intr_parent, intr, &interrupt, &trig, &pol); if (rv == 0) { /* This was recognized as our PIC and decoded. */ interrupt = FDT_MAP_IRQ(intr_parent, interrupt); return (interrupt); } } /* Not in table, so guess */ interrupt = FDT_MAP_IRQ(intr_parent, fdt32_to_cpu(intr[0])); return (interrupt); } #endif void arm_setup_irqhandler(const char *name, driver_filter_t *filt, void (*hand)(void*), void *arg, int irq, int flags, void **cookiep) { struct intr_event *event; int error; if (irq < 0 || irq >= NIRQ) return; event = intr_events[irq]; if (event == NULL) { error = intr_event_create(&event, (void *)irq, 0, irq, (mask_fn)arm_mask_irq, (mask_fn)arm_unmask_irq, arm_post_filter, NULL, "intr%d:", irq); if (error) return; intr_events[irq] = event; snprintf(&intrnames[irq * INTRNAME_LEN], INTRNAME_LEN, "irq%d: %-*s", irq, INTRNAME_LEN - 1, name); } intr_event_add_handler(event, name, filt, hand, arg, intr_priority(flags), flags, cookiep); } int arm_remove_irqhandler(int irq, void *cookie) { struct intr_event *event; int error; event = intr_events[irq]; arm_mask_irq(irq); error = intr_event_remove_handler(cookie); if (!TAILQ_EMPTY(&event->ie_handlers)) arm_unmask_irq(irq); return (error); } void dosoftints(void); void dosoftints(void) { } void intr_irq_handler(struct trapframe *frame) { struct intr_event *event; int i; PCPU_INC(cnt.v_intr); i = -1; while ((i = arm_get_next_irq(i)) != -1) { intrcnt[i]++; event = intr_events[i]; if (intr_event_handle(event, frame) != 0) { /* XXX: Log stray IRQs */ arm_mask_irq(i); } } #ifdef HWPMC_HOOKS if (pmc_hook && (PCPU_GET(curthread)->td_pflags & TDP_CALLCHAIN)) pmc_hook(PCPU_GET(curthread), PMC_FN_USER_CALLCHAIN, frame); #endif } Index: stable/11/sys/arm/arm/machdep.c =================================================================== --- stable/11/sys/arm/arm/machdep.c (revision 331016) +++ stable/11/sys/arm/arm/machdep.c (revision 331017) 
Index: stable/11/sys/arm/arm/machdep.c
===================================================================
--- stable/11/sys/arm/arm/machdep.c	(revision 331016)
+++ stable/11/sys/arm/arm/machdep.c	(revision 331017)
@@ -1,1258 +1,1259 @@
/*	$NetBSD: arm32_machdep.c,v 1.44 2004/03/24 15:34:47 atatat Exp $	*/

/*-
 * SPDX-License-Identifier: BSD-4-Clause
 *
 * Copyright (c) 2004 Olivier Houchard
 * Copyright (c) 1994-1998 Mark Brinicombe.
 * Copyright (c) 1994 Brini.
 * All rights reserved.
 *
 * This code is derived from software written for Brini by Mark Brinicombe
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by Mark Brinicombe
 *	for the NetBSD Project.
 * 4. The name of the company nor the name of the author may be used to
 *    endorse or promote products derived from this software without specific
 *    prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Machine dependent functions for kernel setup
 *
 * Created : 17/09/94
 * Updated : 18/04/01 updated for new wscons
 */

#include "opt_compat.h"
#include "opt_ddb.h"
#include "opt_kstack_pages.h"
#include "opt_platform.h"
#include "opt_sched.h"
#include "opt_timer.h"

#include
__FBSDID("$FreeBSD$");

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
+#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

#ifdef FDT
#include
#include
#endif

#ifdef DEBUG
#define debugf(fmt, args...) printf(fmt, ##args)
#else
#define debugf(fmt, args...)
#endif

#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
    defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) || \
    defined(COMPAT_FREEBSD9)
#error FreeBSD/arm doesn't provide compatibility with releases prior to 10
#endif

struct pcpu __pcpu[MAXCPU];
struct pcpu *pcpup = &__pcpu[0];

static struct trapframe proc0_tf;
uint32_t cpu_reset_address = 0;
int cold = 1;
vm_offset_t vector_page;

int (*_arm_memcpy)(void *, void *, int, int) = NULL;
int (*_arm_bzero)(void *, int, int) = NULL;
int _min_memcpy_size = 0;
int _min_bzero_size = 0;

extern int *end;

#ifdef FDT
vm_paddr_t pmap_pa;
#if __ARM_ARCH >= 6
vm_offset_t systempage;
vm_offset_t irqstack;
vm_offset_t undstack;
vm_offset_t abtstack;
#else
/*
 * This is the number of L2 page tables required for covering max
 * (hypothetical) memsize of 4GB and all kernel mappings (vectors, msgbuf,
 * stacks etc.), uprounded to be divisible by 4.
 */
#define KERNEL_PT_MAX	78
static struct pv_addr kernel_pt_table[KERNEL_PT_MAX];
struct pv_addr systempage;
static struct pv_addr msgbufpv;
struct pv_addr irqstack;
struct pv_addr undstack;
struct pv_addr abtstack;
static struct pv_addr kernelstack;
#endif /* __ARM_ARCH >= 6 */
#endif /* FDT */

#ifdef MULTIDELAY
static delay_func *delay_impl;
static void *delay_arg;
#endif

struct kva_md_info kmi;

/*
 * arm32_vector_init:
 *
 *	Initialize the vector page, and select whether or not to
 *	relocate the vectors.
 *
 *	NOTE: We expect the vector page to be mapped at its expected
 *	destination.
 */
extern unsigned int page0[], page0_data[];

void
arm_vector_init(vm_offset_t va, int which)
{
	unsigned int *vectors = (int *) va;
	unsigned int *vectors_data = vectors + (page0_data - page0);
	int vec;

	/*
	 * Loop through the vectors we're taking over, and copy the
	 * vector's insn and data word.
	 */
	for (vec = 0; vec < ARM_NVEC; vec++) {
		if ((which & (1 << vec)) == 0) {
			/* Don't want to take over this vector. */
			continue;
		}
		vectors[vec] = page0[vec];
		vectors_data[vec] = page0_data[vec];
	}

	/* Now sync the vectors. */
	icache_sync(va, (ARM_NVEC * 2) * sizeof(u_int));

	vector_page = va;
#if __ARM_ARCH < 6
	if (va == ARM_VECTORS_HIGH) {
		/*
		 * Enable high vectors in the system control reg (SCTLR).
		 *
		 * Assume the MD caller knows what it's doing here, and really
		 * does want the vector page relocated.
		 *
		 * Note: This has to be done here (and not just in
		 * cpu_setup()) because the vector page needs to be
		 * accessible *before* cpu_startup() is called.
		 * Think ddb(9) ...
		 */
		cpu_control(CPU_CONTROL_VECRELOC, CPU_CONTROL_VECRELOC);
	}
#endif
}

static void
cpu_startup(void *dummy)
{
	struct pcb *pcb = thread0.td_pcb;
	const unsigned int mbyte = 1024 * 1024;
#if __ARM_ARCH < 6 && !defined(ARM_CACHE_LOCK_ENABLE)
	vm_page_t m;
#endif

	identify_arm_cpu();

	vm_ksubmap_init(&kmi);

	/*
	 * Display the RAM layout.
	 */
*/ printf("real memory = %ju (%ju MB)\n", (uintmax_t)arm32_ptob(realmem), (uintmax_t)arm32_ptob(realmem) / mbyte); printf("avail memory = %ju (%ju MB)\n", (uintmax_t)arm32_ptob(vm_cnt.v_free_count), (uintmax_t)arm32_ptob(vm_cnt.v_free_count) / mbyte); if (bootverbose) { arm_physmem_print_tables(); devmap_print_table(); } bufinit(); vm_pager_bufferinit(); pcb->pcb_regs.sf_sp = (u_int)thread0.td_kstack + USPACE_SVC_STACK_TOP; pmap_set_pcb_pagedir(kernel_pmap, pcb); #if __ARM_ARCH < 6 vector_page_setprot(VM_PROT_READ); pmap_postinit(); #ifdef ARM_CACHE_LOCK_ENABLE pmap_kenter_user(ARM_TP_ADDRESS, ARM_TP_ADDRESS); arm_lock_cache_line(ARM_TP_ADDRESS); #else m = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ | VM_ALLOC_ZERO); pmap_kenter_user(ARM_TP_ADDRESS, VM_PAGE_TO_PHYS(m)); #endif *(uint32_t *)ARM_RAS_START = 0; *(uint32_t *)ARM_RAS_END = 0xffffffff; #endif } SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL); /* * Flush the D-cache for non-DMA I/O so that the I-cache can * be made coherent later. */ void cpu_flush_dcache(void *ptr, size_t len) { dcache_wb_poc((vm_offset_t)ptr, (vm_paddr_t)vtophys(ptr), len); } /* Get current clock frequency for the given cpu id. */ int cpu_est_clockrate(int cpu_id, uint64_t *rate) { return (ENXIO); } void cpu_idle(int busy) { CTR2(KTR_SPARE2, "cpu_idle(%d) at %d", busy, curcpu); spinlock_enter(); #ifndef NO_EVENTTIMERS if (!busy) cpu_idleclock(); #endif if (!sched_runnable()) cpu_sleep(0); #ifndef NO_EVENTTIMERS if (!busy) cpu_activeclock(); #endif spinlock_exit(); CTR2(KTR_SPARE2, "cpu_idle(%d) at %d done", busy, curcpu); } int cpu_idle_wakeup(int cpu) { return (0); } /* * Most ARM platforms don't need to do anything special to init their clocks * (they get intialized during normal device attachment), and by not defining a * cpu_initclocks() function they get this generic one. Any platform that needs * to do something special can just provide their own implementation, which will * override this one due to the weak linkage. */ void arm_generic_initclocks(void) { #ifndef NO_EVENTTIMERS #ifdef SMP if (PCPU_GET(cpuid) == 0) cpu_initclocks_bsp(); else cpu_initclocks_ap(); #else cpu_initclocks_bsp(); #endif #endif } __weak_reference(arm_generic_initclocks, cpu_initclocks); #ifdef MULTIDELAY void arm_set_delay(delay_func *impl, void *arg) { KASSERT(impl != NULL, ("No DELAY implementation")); delay_impl = impl; delay_arg = arg; } void DELAY(int usec) { delay_impl(usec, delay_arg); } #endif void cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size) { } void spinlock_enter(void) { struct thread *td; register_t cspr; td = curthread; if (td->td_md.md_spinlock_count == 0) { cspr = disable_interrupts(PSR_I | PSR_F); td->td_md.md_spinlock_count = 1; td->td_md.md_saved_cspr = cspr; } else td->td_md.md_spinlock_count++; critical_enter(); } void spinlock_exit(void) { struct thread *td; register_t cspr; td = curthread; critical_exit(); cspr = td->td_md.md_saved_cspr; td->td_md.md_spinlock_count--; if (td->td_md.md_spinlock_count == 0) restore_interrupts(cspr); } /* * Clear registers on exec */ void exec_setregs(struct thread *td, struct image_params *imgp, u_long stack) { struct trapframe *tf = td->td_frame; memset(tf, 0, sizeof(*tf)); tf->tf_usr_sp = stack; tf->tf_usr_lr = imgp->entry_addr; tf->tf_svc_lr = 0x77777777; tf->tf_pc = imgp->entry_addr; tf->tf_spsr = PSR_USR32_MODE; } #ifdef VFP /* * Get machine VFP context. 
void
get_vfpcontext(struct thread *td, mcontext_vfp_t *vfp)
{
	struct pcb *pcb;

	pcb = td->td_pcb;
	if (td == curthread) {
		critical_enter();
		vfp_store(&pcb->pcb_vfpstate, false);
		critical_exit();
	} else
		MPASS(TD_IS_SUSPENDED(td));
	memcpy(vfp->mcv_reg, pcb->pcb_vfpstate.reg,
	    sizeof(vfp->mcv_reg));
	vfp->mcv_fpscr = pcb->pcb_vfpstate.fpscr;
}

/*
 * Set machine VFP context.
 */
void
set_vfpcontext(struct thread *td, mcontext_vfp_t *vfp)
{
	struct pcb *pcb;

	pcb = td->td_pcb;
	if (td == curthread) {
		critical_enter();
		vfp_discard(td);
		critical_exit();
	} else
		MPASS(TD_IS_SUSPENDED(td));
	memcpy(pcb->pcb_vfpstate.reg, vfp->mcv_reg,
	    sizeof(pcb->pcb_vfpstate.reg));
	pcb->pcb_vfpstate.fpscr = vfp->mcv_fpscr;
}
#endif

int
arm_get_vfpstate(struct thread *td, void *args)
{
	int rv;
	struct arm_get_vfpstate_args ua;
	mcontext_vfp_t mcontext_vfp;

	rv = copyin(args, &ua, sizeof(ua));
	if (rv != 0)
		return (rv);
	if (ua.mc_vfp_size != sizeof(mcontext_vfp_t))
		return (EINVAL);
#ifdef VFP
	get_vfpcontext(td, &mcontext_vfp);
#else
	bzero(&mcontext_vfp, sizeof(mcontext_vfp));
#endif

	rv = copyout(&mcontext_vfp, ua.mc_vfp, sizeof(mcontext_vfp));
	if (rv != 0)
		return (rv);
	return (0);
}

/*
 * Get machine context.
 */
int
get_mcontext(struct thread *td, mcontext_t *mcp, int clear_ret)
{
	struct trapframe *tf = td->td_frame;
	__greg_t *gr = mcp->__gregs;

	if (clear_ret & GET_MC_CLEAR_RET) {
		gr[_REG_R0] = 0;
		gr[_REG_CPSR] = tf->tf_spsr & ~PSR_C;
	} else {
		gr[_REG_R0] = tf->tf_r0;
		gr[_REG_CPSR] = tf->tf_spsr;
	}
	gr[_REG_R1] = tf->tf_r1;
	gr[_REG_R2] = tf->tf_r2;
	gr[_REG_R3] = tf->tf_r3;
	gr[_REG_R4] = tf->tf_r4;
	gr[_REG_R5] = tf->tf_r5;
	gr[_REG_R6] = tf->tf_r6;
	gr[_REG_R7] = tf->tf_r7;
	gr[_REG_R8] = tf->tf_r8;
	gr[_REG_R9] = tf->tf_r9;
	gr[_REG_R10] = tf->tf_r10;
	gr[_REG_R11] = tf->tf_r11;
	gr[_REG_R12] = tf->tf_r12;
	gr[_REG_SP] = tf->tf_usr_sp;
	gr[_REG_LR] = tf->tf_usr_lr;
	gr[_REG_PC] = tf->tf_pc;

	mcp->mc_vfp_size = 0;
	mcp->mc_vfp_ptr = NULL;
	memset(&mcp->mc_spare, 0, sizeof(mcp->mc_spare));

	return (0);
}

/*
 * Set machine context.
 *
 * However, we don't set any but the user modifiable flags, and we won't
 * touch the cs selector.
 */
int
set_mcontext(struct thread *td, mcontext_t *mcp)
{
	mcontext_vfp_t mc_vfp, *vfp;
	struct trapframe *tf = td->td_frame;
	const __greg_t *gr = mcp->__gregs;
	int spsr;

	/*
	 * Make sure the processor mode has not been tampered with and
	 * interrupts have not been disabled.
	 */
	spsr = gr[_REG_CPSR];
	if ((spsr & PSR_MODE) != PSR_USR32_MODE ||
	    (spsr & (PSR_I | PSR_F)) != 0)
		return (EINVAL);

#ifdef WITNESS
	if (mcp->mc_vfp_size != 0 && mcp->mc_vfp_size != sizeof(mc_vfp)) {
		printf("%s: %s: Malformed mc_vfp_size: %d (0x%08X)\n",
		    td->td_proc->p_comm, __func__,
		    mcp->mc_vfp_size, mcp->mc_vfp_size);
	} else if (mcp->mc_vfp_size != 0 && mcp->mc_vfp_ptr == NULL) {
		printf("%s: %s: c_vfp_size != 0 but mc_vfp_ptr == NULL\n",
		    td->td_proc->p_comm, __func__);
	}
#endif

	if (mcp->mc_vfp_size == sizeof(mc_vfp) && mcp->mc_vfp_ptr != NULL) {
		if (copyin(mcp->mc_vfp_ptr, &mc_vfp, sizeof(mc_vfp)) != 0)
			return (EFAULT);
		vfp = &mc_vfp;
	} else {
		vfp = NULL;
	}

	tf->tf_r0 = gr[_REG_R0];
	tf->tf_r1 = gr[_REG_R1];
	tf->tf_r2 = gr[_REG_R2];
	tf->tf_r3 = gr[_REG_R3];
	tf->tf_r4 = gr[_REG_R4];
	tf->tf_r5 = gr[_REG_R5];
	tf->tf_r6 = gr[_REG_R6];
	tf->tf_r7 = gr[_REG_R7];
	tf->tf_r8 = gr[_REG_R8];
	tf->tf_r9 = gr[_REG_R9];
	tf->tf_r10 = gr[_REG_R10];
	tf->tf_r11 = gr[_REG_R11];
	tf->tf_r12 = gr[_REG_R12];
	tf->tf_usr_sp = gr[_REG_SP];
	tf->tf_usr_lr = gr[_REG_LR];
	tf->tf_pc = gr[_REG_PC];
	tf->tf_spsr = gr[_REG_CPSR];
#ifdef VFP
	if (vfp != NULL)
		set_vfpcontext(td, vfp);
#endif
	return (0);
}

void
sendsig(catcher, ksi, mask)
	sig_t catcher;
	ksiginfo_t *ksi;
	sigset_t *mask;
{
	struct thread *td;
	struct proc *p;
	struct trapframe *tf;
	struct sigframe *fp, frame;
	struct sigacts *psp;
	struct sysentvec *sysent;
	int onstack;
	int sig;
	int code;

	td = curthread;
	p = td->td_proc;
	PROC_LOCK_ASSERT(p, MA_OWNED);
	sig = ksi->ksi_signo;
	code = ksi->ksi_code;
	psp = p->p_sigacts;
	mtx_assert(&psp->ps_mtx, MA_OWNED);
	tf = td->td_frame;
	onstack = sigonstack(tf->tf_usr_sp);

	CTR4(KTR_SIG, "sendsig: td=%p (%s) catcher=%p sig=%d", td, p->p_comm,
	    catcher, sig);

	/* Allocate and validate space for the signal handler context. */
	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !(onstack) &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		fp = (struct sigframe *)((uintptr_t)td->td_sigstk.ss_sp +
		    td->td_sigstk.ss_size);
#if defined(COMPAT_43)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
#endif
	} else
		fp = (struct sigframe *)td->td_frame->tf_usr_sp;

	/* make room on the stack */
	fp--;

	/* make the stack aligned */
	fp = (struct sigframe *)STACKALIGN(fp);
	/* Populate the siginfo frame. */
	get_mcontext(td, &frame.sf_uc.uc_mcontext, 0);
#ifdef VFP
	get_vfpcontext(td, &frame.sf_vfp);
	frame.sf_uc.uc_mcontext.mc_vfp_size = sizeof(fp->sf_vfp);
	frame.sf_uc.uc_mcontext.mc_vfp_ptr = &fp->sf_vfp;
#else
	frame.sf_uc.uc_mcontext.mc_vfp_size = 0;
	frame.sf_uc.uc_mcontext.mc_vfp_ptr = NULL;
#endif
	frame.sf_si = ksi->ksi_info;
	frame.sf_uc.uc_sigmask = *mask;
	frame.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
	    ? ((onstack) ? SS_ONSTACK : 0) : SS_DISABLE;
	frame.sf_uc.uc_stack = td->td_sigstk;
	mtx_unlock(&psp->ps_mtx);
	PROC_UNLOCK(td->td_proc);

	/* Copy the sigframe out to the user's stack. */
	if (copyout(&frame, fp, sizeof(*fp)) != 0) {
		/* Process has trashed its stack. Kill it. */
		CTR2(KTR_SIG, "sendsig: sigexit td=%p fp=%p", td, fp);
		PROC_LOCK(p);
		sigexit(td, SIGILL);
	}

	/*
	 * Build context to run handler in.  We invoke the handler
	 * directly, only returning via the trampoline.  Note the
	 * trampoline version numbers are coordinated with machine-
	 * dependent code in libc.
	 */
	tf->tf_r0 = sig;
	tf->tf_r1 = (register_t)&fp->sf_si;
	tf->tf_r2 = (register_t)&fp->sf_uc;

	/* the trampoline uses r5 as the uc address */
	tf->tf_r5 = (register_t)&fp->sf_uc;
	tf->tf_pc = (register_t)catcher;
	tf->tf_usr_sp = (register_t)fp;
	sysent = p->p_sysent;
	if (sysent->sv_sigcode_base != 0)
		tf->tf_usr_lr = (register_t)sysent->sv_sigcode_base;
	else
		tf->tf_usr_lr = (register_t)(sysent->sv_psstrings -
		    *(sysent->sv_szsigcode));
	/* Set the mode to enter in the signal handler */
#if __ARM_ARCH >= 7
	if ((register_t)catcher & 1)
		tf->tf_spsr |= PSR_T;
	else
		tf->tf_spsr &= ~PSR_T;
#endif

	CTR3(KTR_SIG, "sendsig: return td=%p pc=%#x sp=%#x", td,
	    tf->tf_usr_lr, tf->tf_usr_sp);

	PROC_LOCK(p);
	mtx_lock(&psp->ps_mtx);
}

int
sys_sigreturn(td, uap)
	struct thread *td;
	struct sigreturn_args /* {
		const struct __ucontext *sigcntxp;
	} */ *uap;
{
	ucontext_t uc;
	int error;

	if (uap == NULL)
		return (EFAULT);
	if (copyin(uap->sigcntxp, &uc, sizeof(uc)))
		return (EFAULT);
	/* Restore register context. */
	error = set_mcontext(td, &uc.uc_mcontext);
	if (error != 0)
		return (error);

	/* Restore signal mask. */
	kern_sigprocmask(td, SIG_SETMASK, &uc.uc_sigmask, NULL, 0);

	return (EJUSTRETURN);
}

/*
 * Construct a PCB from a trapframe. This is called from kdb_trap() where
 * we want to start a backtrace from the function that caused us to enter
 * the debugger. We have the context in the trapframe, but base the trace
 * on the PCB. The PCB doesn't have to be perfect, as long as it contains
 * enough for a backtrace.
 */
void
makectx(struct trapframe *tf, struct pcb *pcb)
{

	pcb->pcb_regs.sf_r4 = tf->tf_r4;
	pcb->pcb_regs.sf_r5 = tf->tf_r5;
	pcb->pcb_regs.sf_r6 = tf->tf_r6;
	pcb->pcb_regs.sf_r7 = tf->tf_r7;
	pcb->pcb_regs.sf_r8 = tf->tf_r8;
	pcb->pcb_regs.sf_r9 = tf->tf_r9;
	pcb->pcb_regs.sf_r10 = tf->tf_r10;
	pcb->pcb_regs.sf_r11 = tf->tf_r11;
	pcb->pcb_regs.sf_r12 = tf->tf_r12;
	pcb->pcb_regs.sf_pc = tf->tf_pc;
	pcb->pcb_regs.sf_lr = tf->tf_usr_lr;
	pcb->pcb_regs.sf_sp = tf->tf_usr_sp;
}

void
pcpu0_init(void)
{
#if __ARM_ARCH >= 6
	set_curthread(&thread0);
#endif
	pcpu_init(pcpup, 0, sizeof(struct pcpu));
	PCPU_SET(curthread, &thread0);
}

/*
 * Initialize proc0
 */
void
init_proc0(vm_offset_t kstack)
{

	proc_linkup0(&proc0, &thread0);
	thread0.td_kstack = kstack;
	thread0.td_pcb = (struct pcb *)(thread0.td_kstack +
	    kstack_pages * PAGE_SIZE) - 1;
	thread0.td_pcb->pcb_flags = 0;
	thread0.td_pcb->pcb_vfpcpu = -1;
	thread0.td_pcb->pcb_vfpstate.fpscr = VFPSCR_DN;
	thread0.td_frame = &proc0_tf;
	pcpup->pc_curpcb = thread0.td_pcb;
}

#if __ARM_ARCH >= 6
void
set_stackptrs(int cpu)
{

	set_stackptr(PSR_IRQ32_MODE,
	    irqstack + ((IRQ_STACK_SIZE * PAGE_SIZE) * (cpu + 1)));
	set_stackptr(PSR_ABT32_MODE,
	    abtstack + ((ABT_STACK_SIZE * PAGE_SIZE) * (cpu + 1)));
	set_stackptr(PSR_UND32_MODE,
	    undstack + ((UND_STACK_SIZE * PAGE_SIZE) * (cpu + 1)));
}
#else
void
set_stackptrs(int cpu)
{

	set_stackptr(PSR_IRQ32_MODE,
	    irqstack.pv_va + ((IRQ_STACK_SIZE * PAGE_SIZE) * (cpu + 1)));
	set_stackptr(PSR_ABT32_MODE,
	    abtstack.pv_va + ((ABT_STACK_SIZE * PAGE_SIZE) * (cpu + 1)));
	set_stackptr(PSR_UND32_MODE,
	    undstack.pv_va + ((UND_STACK_SIZE * PAGE_SIZE) * (cpu + 1)));
}
#endif

#ifdef FDT
#if __ARM_ARCH < 6
void *
initarm(struct arm_boot_params *abp)
{
	struct mem_region mem_regions[FDT_MEM_REGIONS];
	struct pv_addr kernel_l1pt;
	struct pv_addr dpcpu;
	vm_offset_t dtbp, freemempos, l2_start, lastaddr;
	uint64_t memsize;
	uint32_t l2size;
	char *env;
	void *kmdp;
	u_int l1pagetable;
	int i, j, err_devmap, mem_regions_sz;

	lastaddr = parse_boot_param(abp);
	arm_physmem_kernaddr = abp->abp_physaddr;
	memsize = 0;
	cpuinfo_init();
	set_cpufuncs();

	/*
	 * Find the dtb passed in by the boot loader.
	 */
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp != NULL)
		dtbp = MD_FETCH(kmdp, MODINFOMD_DTBP, vm_offset_t);
	else
		dtbp = (vm_offset_t)NULL;
#if defined(FDT_DTB_STATIC)
	/*
	 * In case the device tree blob was not retrieved (from metadata) try
	 * to use the statically embedded one.
	 */
	if (dtbp == (vm_offset_t)NULL)
		dtbp = (vm_offset_t)&fdt_static_dtb;
#endif

	if (OF_install(OFW_FDT, 0) == FALSE)
		panic("Cannot install FDT");

	if (OF_init((void *)dtbp) != 0)
		panic("OF_init failed with the found device tree");

	/* Grab physical memory regions information from device tree. */
	if (fdt_get_mem_regions(mem_regions, &mem_regions_sz, &memsize) != 0)
		panic("Cannot get physical memory regions");
	arm_physmem_hardware_regions(mem_regions, mem_regions_sz);

	/* Grab reserved memory regions information from device tree. */
	if (fdt_get_reserved_regions(mem_regions, &mem_regions_sz) == 0)
		arm_physmem_exclude_regions(mem_regions, mem_regions_sz,
		    EXFLAG_NODUMP | EXFLAG_NOALLOC);

	/* Platform-specific initialisation */
	platform_probe_and_attach();

	pcpu0_init();

	/* Do basic tuning, hz etc */
	init_param1();

	/* Calculate number of L2 tables needed for mapping vm_page_array */
	l2size = (memsize / PAGE_SIZE) * sizeof(struct vm_page);
	l2size = (l2size >> L1_S_SHIFT) + 1;

	/*
	 * Add one table for end of kernel map, one for stacks, msgbuf and
	 * L1 and L2 tables map and one for vectors map.
	 */
	l2size += 3;

	/* Make it divisible by 4 */
	l2size = (l2size + 3) & ~3;

	freemempos = (lastaddr + PAGE_MASK) & ~PAGE_MASK;

	/* Define a macro to simplify memory allocation */
#define valloc_pages(var, np)						\
	alloc_pages((var).pv_va, (np));					\
	(var).pv_pa = (var).pv_va + (abp->abp_physaddr - KERNVIRTADDR);

#define alloc_pages(var, np)						\
	(var) = freemempos;						\
	freemempos += (np * PAGE_SIZE);					\
	memset((char *)(var), 0, ((np) * PAGE_SIZE));

	while (((freemempos - L1_TABLE_SIZE) & (L1_TABLE_SIZE - 1)) != 0)
		freemempos += PAGE_SIZE;
	valloc_pages(kernel_l1pt, L1_TABLE_SIZE / PAGE_SIZE);
	for (i = 0, j = 0; i < l2size; ++i) {
		if (!(i % (PAGE_SIZE / L2_TABLE_SIZE_REAL))) {
			valloc_pages(kernel_pt_table[i],
			    L2_TABLE_SIZE / PAGE_SIZE);
			j = i;
		} else {
			kernel_pt_table[i].pv_va = kernel_pt_table[j].pv_va +
			    L2_TABLE_SIZE_REAL * (i - j);
			kernel_pt_table[i].pv_pa =
			    kernel_pt_table[i].pv_va - KERNVIRTADDR +
			    abp->abp_physaddr;
		}
	}
	/*
	 * Allocate a page for the system page mapped to 0x00000000
	 * or 0xffff0000. This page will just contain the system vectors
	 * and can be shared by all processes.
	 */
	valloc_pages(systempage, 1);

	/* Allocate dynamic per-cpu area. */
	valloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE);
	dpcpu_init((void *)dpcpu.pv_va, 0);

	/* Allocate stacks for all modes */
	valloc_pages(irqstack, IRQ_STACK_SIZE * MAXCPU);
	valloc_pages(abtstack, ABT_STACK_SIZE * MAXCPU);
	valloc_pages(undstack, UND_STACK_SIZE * MAXCPU);
	valloc_pages(kernelstack, kstack_pages * MAXCPU);
	valloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE);

	/*
	 * Now we start construction of the L1 page table
	 * We start by mapping the L2 page tables into the L1.
	 * This means that we can replace L1 mappings later on if necessary
	 */
	l1pagetable = kernel_l1pt.pv_va;

	/*
	 * Try to map as much as possible of kernel text and data using
	 * 1MB section mapping and for the rest of initial kernel address
	 * space use L2 coarse tables.
	 *
	 * Link L2 tables for mapping remainder of kernel (modulo 1MB)
	 * and kernel structures
	 */
	l2_start = lastaddr & ~(L1_S_OFFSET);
	for (i = 0 ; i < l2size - 1; i++)
		pmap_link_l2pt(l1pagetable, l2_start + i * L1_S_SIZE,
		    &kernel_pt_table[i]);

	pmap_curmaxkvaddr = l2_start + (l2size - 1) * L1_S_SIZE;

	/* Map kernel code and data */
	pmap_map_chunk(l1pagetable, KERNVIRTADDR, abp->abp_physaddr,
	    (((uint32_t)(lastaddr) - KERNVIRTADDR) + PAGE_MASK) & ~PAGE_MASK,
	    VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE);

	/* Map L1 directory and allocated L2 page tables */
	pmap_map_chunk(l1pagetable, kernel_l1pt.pv_va, kernel_l1pt.pv_pa,
	    L1_TABLE_SIZE, VM_PROT_READ|VM_PROT_WRITE, PTE_PAGETABLE);

	pmap_map_chunk(l1pagetable, kernel_pt_table[0].pv_va,
	    kernel_pt_table[0].pv_pa,
	    L2_TABLE_SIZE_REAL * l2size,
	    VM_PROT_READ|VM_PROT_WRITE, PTE_PAGETABLE);

	/* Map allocated DPCPU, stacks and msgbuf */
	pmap_map_chunk(l1pagetable, dpcpu.pv_va, dpcpu.pv_pa,
	    freemempos - dpcpu.pv_va,
	    VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE);

	/* Link and map the vector page */
	pmap_link_l2pt(l1pagetable, ARM_VECTORS_HIGH,
	    &kernel_pt_table[l2size - 1]);
	pmap_map_entry(l1pagetable, ARM_VECTORS_HIGH, systempage.pv_pa,
	    VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE, PTE_CACHE);

	/* Establish static device mappings. */
	err_devmap = platform_devmap_init();
	devmap_bootstrap(l1pagetable, NULL);
	vm_max_kernel_address = platform_lastaddr();

	cpu_domains((DOMAIN_CLIENT << (PMAP_DOMAIN_KERNEL * 2)) |
	    DOMAIN_CLIENT);
	pmap_pa = kernel_l1pt.pv_pa;
	cpu_setttb(kernel_l1pt.pv_pa);
	cpu_tlb_flushID();
	cpu_domains(DOMAIN_CLIENT << (PMAP_DOMAIN_KERNEL * 2));

	/*
	 * Now that proper page tables are installed, call cpu_setup() to
	 * enable instruction and data caches and other chip-specific
	 * features.
	 */
	cpu_setup();

	/*
	 * Only after the SOC registers block is mapped we can perform device
	 * tree fixups, as they may attempt to read parameters from hardware.
	 */
	OF_interpret("perform-fixup", 0);

	platform_gpio_init();

	cninit();

	debugf("initarm: console initialized\n");
	debugf(" arg1 kmdp = 0x%08x\n", (uint32_t)kmdp);
	debugf(" boothowto = 0x%08x\n", boothowto);
	debugf(" dtbp = 0x%08x\n", (uint32_t)dtbp);

	arm_print_kenv();

	env = kern_getenv("kernelname");
	if (env != NULL) {
		strlcpy(kernelname, env, sizeof(kernelname));
		freeenv(env);
	}

	if (err_devmap != 0)
		printf("WARNING: could not fully configure devmap, error=%d\n",
		    err_devmap);

	platform_late_init();

	/*
	 * Pages were allocated during the secondary bootstrap for the
	 * stacks for different CPU modes.
	 * We must now set the r13 registers in the different CPU modes to
	 * point to these stacks.
	 * Since the ARM stacks use STMFD etc. we must set r13 to the top end
	 * of the stack memory.
	 */
	cpu_control(CPU_CONTROL_MMU_ENABLE, CPU_CONTROL_MMU_ENABLE);

	set_stackptrs(0);

	/*
	 * We must now clean the cache again....
	 * Cleaning may be done by reading new data to displace any
	 * dirty data in the cache. This will have happened in cpu_setttb()
	 * but since we are boot strapping the addresses used for the read
	 * may have just been remapped and thus the cache could be out
	 * of sync. A re-clean after the switch will cure this.
	 * After booting there are no gross relocations of the kernel thus
	 * this problem will not occur after initarm().
	 */
*/ cpu_idcache_wbinv_all(); undefined_init(); init_proc0(kernelstack.pv_va); arm_vector_init(ARM_VECTORS_HIGH, ARM_VEC_ALL); pmap_bootstrap(freemempos, &kernel_l1pt); msgbufp = (void *)msgbufpv.pv_va; msgbufinit(msgbufp, msgbufsize); mutex_init(); /* * Exclude the kernel (and all the things we allocated which immediately * follow the kernel) from the VM allocation pool but not from crash * dumps. virtual_avail is a global variable which tracks the kva we've * "allocated" while setting up pmaps. * * Prepare the list of physical memory available to the vm subsystem. */ arm_physmem_exclude_region(abp->abp_physaddr, (virtual_avail - KERNVIRTADDR), EXFLAG_NOALLOC); arm_physmem_init_kernel_globals(); init_param2(physmem); dbg_monitor_init(); kdb_init(); return ((void *)(kernelstack.pv_va + USPACE_SVC_STACK_TOP - sizeof(struct pcb))); } #else /* __ARM_ARCH < 6 */ void * initarm(struct arm_boot_params *abp) { struct mem_region mem_regions[FDT_MEM_REGIONS]; vm_paddr_t lastaddr; vm_offset_t dtbp, kernelstack, dpcpu; char *env; void *kmdp; int err_devmap, mem_regions_sz; #ifdef EFI struct efi_map_header *efihdr; #endif /* get last allocated physical address */ arm_physmem_kernaddr = abp->abp_physaddr; lastaddr = parse_boot_param(abp) - KERNVIRTADDR + arm_physmem_kernaddr; set_cpufuncs(); cpuinfo_init(); /* * Find the dtb passed in by the boot loader. */ kmdp = preload_search_by_type("elf kernel"); dtbp = MD_FETCH(kmdp, MODINFOMD_DTBP, vm_offset_t); #if defined(FDT_DTB_STATIC) /* * In case the device tree blob was not retrieved (from metadata) try * to use the statically embedded one. */ if (dtbp == (vm_offset_t)NULL) dtbp = (vm_offset_t)&fdt_static_dtb; #endif if (OF_install(OFW_FDT, 0) == FALSE) panic("Cannot install FDT"); if (OF_init((void *)dtbp) != 0) panic("OF_init failed with the found device tree"); #if defined(LINUX_BOOT_ABI) arm_parse_fdt_bootargs(); #endif #ifdef EFI efihdr = (struct efi_map_header *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP); if (efihdr != NULL) { arm_add_efi_map_entries(efihdr, mem_regions, &mem_regions_sz); } else #endif { /* Grab physical memory regions information from device tree. */ if (fdt_get_mem_regions(mem_regions, &mem_regions_sz,NULL) != 0) panic("Cannot get physical memory regions"); } arm_physmem_hardware_regions(mem_regions, mem_regions_sz); /* Grab reserved memory regions information from device tree. */ if (fdt_get_reserved_regions(mem_regions, &mem_regions_sz) == 0) arm_physmem_exclude_regions(mem_regions, mem_regions_sz, EXFLAG_NODUMP | EXFLAG_NOALLOC); /* * Set TEX remapping registers. * Setup kernel page tables and switch to kernel L1 page table. */ pmap_set_tex(); pmap_bootstrap_prepare(lastaddr); /* * If EARLY_PRINTF support is enabled, we need to re-establish the * mapping after pmap_bootstrap_prepare() switches to new page tables. * Note that we can only do the remapping if the VA is outside the * kernel, now that we have real virtual (not VA=PA) mappings in effect. * Early printf does not work between the time pmap_set_tex() does * cp15_prrr_set() and this code remaps the VA. */ #if defined(EARLY_PRINTF) && defined(SOCDEV_PA) && defined(SOCDEV_VA) && SOCDEV_VA < KERNBASE pmap_preboot_map_attr(SOCDEV_PA, SOCDEV_VA, 1024 * 1024, VM_PROT_READ | VM_PROT_WRITE, VM_MEMATTR_DEVICE); #endif /* * Now that proper page tables are installed, call cpu_setup() to enable * instruction and data caches and other chip-specific features. 
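arm_add_efi_map_entries() (defined elsewhere in machdep.c) consumes the UEFI memory map found in the preloaded metadata. The map is an array of descriptors whose stride is chosen by the firmware, so it must be stepped by descriptor_size rather than by sizeof. A sketch of that walk; the struct layouts below are illustrative stand-ins, not the sys/efi.h definitions:

#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-ins for the real EFI structures. */
struct efi_md {
	uint32_t	md_type;
	uint32_t	__pad;
	uint64_t	md_phys;
	uint64_t	md_virt;
	uint64_t	md_pages;
	uint64_t	md_attr;
};

struct efi_map_header {
	uint64_t	memory_size;		/* bytes of descriptors */
	uint64_t	descriptor_size;	/* firmware-chosen stride */
	uint32_t	descriptor_version;
};

static void
walk_efi_map(struct efi_map_header *efihdr)
{
	struct efi_md *p;
	int i, ndesc;

	/* Descriptors follow the header; step by the firmware's stride. */
	p = (struct efi_md *)((uint8_t *)efihdr + sizeof(*efihdr));
	ndesc = (int)(efihdr->memory_size / efihdr->descriptor_size);
	for (i = 0; i < ndesc; i++,
	    p = (struct efi_md *)((uint8_t *)p + efihdr->descriptor_size))
		printf("type %u base %#jx pages %ju\n", p->md_type,
		    (uintmax_t)p->md_phys, (uintmax_t)p->md_pages);
}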
*/ cpu_setup(); /* Platform-specific initialisation */ platform_probe_and_attach(); pcpu0_init(); /* Do basic tuning, hz etc */ init_param1(); /* * Allocate a page for the system page mapped to 0xffff0000 * This page will just contain the system vectors and can be * shared by all processes. */ systempage = pmap_preboot_get_pages(1); /* Map the vector page. */ pmap_preboot_map_pages(systempage, ARM_VECTORS_HIGH, 1); if (virtual_end >= ARM_VECTORS_HIGH) virtual_end = ARM_VECTORS_HIGH - 1; /* Allocate dynamic per-cpu area. */ dpcpu = pmap_preboot_get_vpages(DPCPU_SIZE / PAGE_SIZE); dpcpu_init((void *)dpcpu, 0); /* Allocate stacks for all modes */ irqstack = pmap_preboot_get_vpages(IRQ_STACK_SIZE * MAXCPU); abtstack = pmap_preboot_get_vpages(ABT_STACK_SIZE * MAXCPU); undstack = pmap_preboot_get_vpages(UND_STACK_SIZE * MAXCPU ); kernelstack = pmap_preboot_get_vpages(kstack_pages * MAXCPU); /* Allocate message buffer. */ msgbufp = (void *)pmap_preboot_get_vpages( round_page(msgbufsize) / PAGE_SIZE); /* * Pages were allocated during the secondary bootstrap for the * stacks for different CPU modes. * We must now set the r13 registers in the different CPU modes to * point to these stacks. * Since the ARM stacks use STMFD etc. we must set r13 to the top end * of the stack memory. */ set_stackptrs(0); mutex_init(); /* Establish static device mappings. */ err_devmap = platform_devmap_init(); devmap_bootstrap(0, NULL); vm_max_kernel_address = platform_lastaddr(); /* * Only after the SOC registers block is mapped we can perform device * tree fixups, as they may attempt to read parameters from hardware. */ OF_interpret("perform-fixup", 0); platform_gpio_init(); cninit(); /* * If we made a mapping for EARLY_PRINTF after pmap_bootstrap_prepare(), * undo it now that the normal console printf works. */ #if defined(EARLY_PRINTF) && defined(SOCDEV_PA) && defined(SOCDEV_VA) && SOCDEV_VA < KERNBASE pmap_kremove(SOCDEV_VA); #endif debugf("initarm: console initialized\n"); debugf(" arg1 kmdp = 0x%08x\n", (uint32_t)kmdp); debugf(" boothowto = 0x%08x\n", boothowto); debugf(" dtbp = 0x%08x\n", (uint32_t)dtbp); debugf(" lastaddr1: 0x%08x\n", lastaddr); arm_print_kenv(); env = kern_getenv("kernelname"); if (env != NULL) strlcpy(kernelname, env, sizeof(kernelname)); if (err_devmap != 0) printf("WARNING: could not fully configure devmap, error=%d\n", err_devmap); platform_late_init(); /* * We must now clean the cache again.... * Cleaning may be done by reading new data to displace any * dirty data in the cache. This will have happened in cpu_setttb() * but since we are boot strapping the addresses used for the read * may have just been remapped and thus the cache could be out * of sync. A re-clean after the switch will cure this. * After booting there are no gross relocations of the kernel thus * this problem will not occur after initarm(). */ /* Set stack for exception handlers */ undefined_init(); init_proc0(kernelstack); arm_vector_init(ARM_VECTORS_HIGH, ARM_VEC_ALL); enable_interrupts(PSR_A); pmap_bootstrap(0); /* Exclude the kernel (and all the things we allocated which immediately * follow the kernel) from the VM allocation pool but not from crash * dumps. virtual_avail is a global variable which tracks the kva we've * "allocated" while setting up pmaps. * * Prepare the list of physical memory available to the vm subsystem. */ arm_physmem_exclude_region(abp->abp_physaddr, pmap_preboot_get_pages(0) - abp->abp_physaddr, EXFLAG_NOALLOC); arm_physmem_init_kernel_globals(); init_param2(physmem); /* Init message buffer. 
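set_stackptrs() (defined elsewhere) is what loads the banked r13 of each exception mode from these allocations; because ARM stacks are full-descending (STMFD), each mode's SP is set to the top of that CPU's slice. A hedged sketch of the usual computation; set_stackptr() and the variable names are assumed from context, and sizes are in pages as in the allocations above:

/*
 * Sketch only: set_stackptr() programs the banked SP of one CPU mode;
 * irqstack/abtstack/undstack are the per-mode allocations made above.
 */
static void
set_stackptrs_sketch(int cpu)
{
	/* cpu + 1: the stack grows down from the END of this CPU's slice. */
	set_stackptr(PSR_IRQ32_MODE,
	    irqstack + ((IRQ_STACK_SIZE * PAGE_SIZE) * (cpu + 1)));
	set_stackptr(PSR_ABT32_MODE,
	    abtstack + ((ABT_STACK_SIZE * PAGE_SIZE) * (cpu + 1)));
	set_stackptr(PSR_UND32_MODE,
	    undstack + ((UND_STACK_SIZE * PAGE_SIZE) * (cpu + 1)));
}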
*/ msgbufinit(msgbufp, msgbufsize); dbg_monitor_init(); kdb_init(); return ((void *)STACKALIGN(thread0.td_pcb)); } #endif /* __ARM_ARCH < 6 */ #endif /* FDT */ Index: stable/11/sys/arm/arm/trap-v4.c =================================================================== --- stable/11/sys/arm/arm/trap-v4.c (revision 331016) +++ stable/11/sys/arm/arm/trap-v4.c (revision 331017) @@ -1,728 +1,729 @@ /* $NetBSD: fault.c,v 1.45 2003/11/20 14:44:36 scw Exp $ */ /*- * Copyright 2004 Olivier Houchard * Copyright 2003 Wasabi Systems, Inc. * All rights reserved. * * Written by Steve C. Woodford for Wasabi Systems, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed for the NetBSD Project by * Wasabi Systems, Inc. * 4. The name of Wasabi Systems, Inc. may not be used to endorse * or promote products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (c) 1994-1997 Mark Brinicombe. * Copyright (c) 1994 Brini. * All rights reserved. * * This code is derived from software written for Brini by Mark Brinicombe * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Brini. * 4. The name of the company nor the name of the author may be used to * endorse or promote products derived from this software without specific * prior written permission. * * THIS SOFTWARE IS PROVIDED BY BRINI ``AS IS'' AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL BRINI OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * RiscBSD kernel project * * fault.c * * Fault handlers * * Created : 28/11/94 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #ifdef KDB #include #endif #ifdef KDTRACE_HOOKS #include #endif #define ReadWord(a) (*((volatile unsigned int *)(a))) #ifdef DEBUG int last_fault_code; /* For the benefit of pmap_fault_fixup() */ #endif struct ksig { int signb; u_long code; }; struct data_abort { int (*func)(struct trapframe *, u_int, u_int, struct thread *, struct ksig *); const char *desc; }; static int dab_fatal(struct trapframe *, u_int, u_int, struct thread *, struct ksig *); static int dab_align(struct trapframe *, u_int, u_int, struct thread *, struct ksig *); static int dab_buserr(struct trapframe *, u_int, u_int, struct thread *, struct ksig *); static void prefetch_abort_handler(struct trapframe *); static const struct data_abort data_aborts[] = { {dab_fatal, "Vector Exception"}, {dab_align, "Alignment Fault 1"}, {dab_fatal, "Terminal Exception"}, {dab_align, "Alignment Fault 3"}, {dab_buserr, "External Linefetch Abort (S)"}, {NULL, "Translation Fault (S)"}, {dab_buserr, "External Linefetch Abort (P)"}, {NULL, "Translation Fault (P)"}, {dab_buserr, "External Non-Linefetch Abort (S)"}, {NULL, "Domain Fault (S)"}, {dab_buserr, "External Non-Linefetch Abort (P)"}, {NULL, "Domain Fault (P)"}, {dab_buserr, "External Translation Abort (L1)"}, {NULL, "Permission Fault (S)"}, {dab_buserr, "External Translation Abort (L2)"}, {NULL, "Permission Fault (P)"} }; /* Determine if a fault came from user mode */ #define TRAP_USERMODE(tf) ((tf->tf_spsr & PSR_MODE) == PSR_USR32_MODE) /* Determine if 'x' is a permission fault */ #define IS_PERMISSION_FAULT(x) \ (((1 << ((x) & FAULT_TYPE_MASK)) & \ ((1 << FAULT_PERM_P) | (1 << FAULT_PERM_S))) != 0) static __inline void call_trapsignal(struct thread *td, int sig, u_long code) { ksiginfo_t ksi; ksiginfo_init_trap(&ksi); ksi.ksi_signo = sig; ksi.ksi_code = (int)code; trapsignal(td, &ksi); } void abort_handler(struct trapframe *tf, int type) { struct vm_map *map; struct pcb *pcb; struct thread *td; u_int user, far, fsr; vm_prot_t ftype; void *onfault; vm_offset_t va; int error = 0; struct ksig ksig; struct proc *p; if (type == 1) return (prefetch_abort_handler(tf)); /* Grab FAR/FSR before enabling interrupts */ far = cpu_faultaddress(); fsr = cpu_faultstatus(); #if 0 printf("data abort: fault address=%p (from pc=%p lr=%p)\n", (void*)far, (void*)tf->tf_pc, (void*)tf->tf_svc_lr); #endif /* Update vmmeter statistics */ #if 0 vmexp.traps++; #endif td = curthread; p = td->td_proc; PCPU_INC(cnt.v_trap); /* Data abort came from user mode? 
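IS_PERMISSION_FAULT() above turns the 4-bit fault type into a one-hot value and masks it against the two permission-fault codes. Expanded with the architectural short-format FSR status values (0xD for a section permission fault, 0xF for a page permission fault; treat the constants as illustrative), as a standalone check:

#include <assert.h>

#define FAULT_TYPE_MASK	0x0f
#define FAULT_PERM_S	0x0d	/* permission fault, section */
#define FAULT_PERM_P	0x0f	/* permission fault, page */

#define IS_PERMISSION_FAULT(x) \
	(((1 << ((x) & FAULT_TYPE_MASK)) & \
	((1 << FAULT_PERM_P) | (1 << FAULT_PERM_S))) != 0)

int
main(void)
{
	assert(IS_PERMISSION_FAULT(0x0d));	/* section perm fault */
	assert(IS_PERMISSION_FAULT(0x0f));	/* page perm fault */
	assert(!IS_PERMISSION_FAULT(0x05));	/* translation fault */
	return (0);
}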
*/ user = TRAP_USERMODE(tf); if (user) { td->td_pticks = 0; td->td_frame = tf; if (td->td_cowgen != td->td_proc->p_cowgen) thread_cow_update(td); } /* Grab the current pcb */ pcb = td->td_pcb; /* Re-enable interrupts if they were enabled previously */ if (td->td_md.md_spinlock_count == 0) { if (__predict_true(tf->tf_spsr & PSR_I) == 0) enable_interrupts(PSR_I); if (__predict_true(tf->tf_spsr & PSR_F) == 0) enable_interrupts(PSR_F); } /* Invoke the appropriate handler, if necessary */ if (__predict_false(data_aborts[fsr & FAULT_TYPE_MASK].func != NULL)) { if ((data_aborts[fsr & FAULT_TYPE_MASK].func)(tf, fsr, far, td, &ksig)) { goto do_trapsignal; } goto out; } /* * At this point, we're dealing with one of the following data aborts: * * FAULT_TRANS_S - Translation -- Section * FAULT_TRANS_P - Translation -- Page * FAULT_DOMAIN_S - Domain -- Section * FAULT_DOMAIN_P - Domain -- Page * FAULT_PERM_S - Permission -- Section * FAULT_PERM_P - Permission -- Page * * These are the main virtual memory-related faults signalled by * the MMU. */ /* * Make sure the Program Counter is sane. We could fall foul of * someone executing Thumb code, in which case the PC might not * be word-aligned. This would cause a kernel alignment fault * further down if we have to decode the current instruction. * XXX: It would be nice to be able to support Thumb at some point. */ if (__predict_false((tf->tf_pc & 3) != 0)) { if (user) { /* * Give the user an illegal instruction signal. */ /* Deliver a SIGILL to the process */ ksig.signb = SIGILL; ksig.code = 0; goto do_trapsignal; } /* * The kernel never executes Thumb code. */ printf("\ndata_abort_fault: Misaligned Kernel-mode " "Program Counter\n"); dab_fatal(tf, fsr, far, td, &ksig); } va = trunc_page((vm_offset_t)far); /* * It is only a kernel address space fault iff: * 1. user == 0 and * 2. pcb_onfault not set or * 3. pcb_onfault set and not LDRT/LDRBT/STRT/STRBT instruction. */ if (user == 0 && (va >= VM_MIN_KERNEL_ADDRESS || (va < VM_MIN_ADDRESS && vector_page == ARM_VECTORS_LOW)) && __predict_true((pcb->pcb_onfault == NULL || (ReadWord(tf->tf_pc) & 0x05200000) != 0x04200000))) { map = kernel_map; /* Was the fault due to the FPE/IPKDB? */ if (__predict_false((tf->tf_spsr & PSR_MODE) == PSR_UND32_MODE)) { /* * Force exit via userret() * This is necessary as the FPE is an extension to * userland that actually runs in a privileged mode * but uses USR mode permissions for its accesses. */ user = 1; ksig.signb = SIGSEGV; ksig.code = 0; goto do_trapsignal; } } else { map = &td->td_proc->p_vmspace->vm_map; } /* * We need to know whether the page should be mapped as R or R/W. * On armv4, the fault status register does not indicate whether * the access was a read or write. We know that a permission fault * can only be the result of a write to a read-only location, so we * can deal with those quickly. Otherwise we need to disassemble * the faulting instruction to determine if it was a write. */ if (IS_PERMISSION_FAULT(fsr)) ftype = VM_PROT_WRITE; else { u_int insn = ReadWord(tf->tf_pc); if (((insn & 0x0c100000) == 0x04000000) || /* STR/STRB */ ((insn & 0x0e1000b0) == 0x000000b0) || /* STRH/STRD */ ((insn & 0x0a100000) == 0x08000000)) { /* STM/CDT */ ftype = VM_PROT_WRITE; } else { if ((insn & 0x0fb00ff0) == 0x01000090) /* SWP */ ftype = VM_PROT_READ | VM_PROT_WRITE; else ftype = VM_PROT_READ; } } /* * See if the fault is as a result of ref/mod emulation, * or domain mismatch.
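The decode masks above can be restated as a predicate and exercised with known encodings; a small self-contained version (the sample opcodes are ordinary ARM store/load encodings, chosen for illustration):

#include <assert.h>
#include <stdint.h>

/* Does this ARM instruction write memory? Mirrors the handler's decode. */
static int
insn_is_write(uint32_t insn)
{
	if (((insn & 0x0c100000) == 0x04000000) ||	/* STR/STRB */
	    ((insn & 0x0e1000b0) == 0x000000b0) ||	/* STRH/STRD */
	    ((insn & 0x0a100000) == 0x08000000))	/* STM/CDT */
		return (1);
	return (0);
}

int
main(void)
{
	assert(insn_is_write(0xe5821000));	/* str r1, [r2] */
	assert(!insn_is_write(0xe5921000));	/* ldr r1, [r2] */
	return (0);
}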
*/ #ifdef DEBUG last_fault_code = fsr; #endif if (td->td_critnest != 0 || WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL, "Kernel page fault") != 0) goto fatal_pagefault; if (pmap_fault_fixup(vmspace_pmap(td->td_proc->p_vmspace), va, ftype, user)) { goto out; } onfault = pcb->pcb_onfault; pcb->pcb_onfault = NULL; error = vm_fault(map, va, ftype, VM_FAULT_NORMAL); pcb->pcb_onfault = onfault; if (__predict_true(error == 0)) goto out; fatal_pagefault: if (user == 0) { if (pcb->pcb_onfault) { tf->tf_r0 = error; tf->tf_pc = (register_t)(intptr_t) pcb->pcb_onfault; return; } printf("\nvm_fault(%p, %x, %x, 0) -> %x\n", map, va, ftype, error); dab_fatal(tf, fsr, far, td, &ksig); } if (error == ENOMEM) { printf("VM: pid %d (%s), uid %d killed: " "out of swap\n", td->td_proc->p_pid, td->td_name, (td->td_proc->p_ucred) ? td->td_proc->p_ucred->cr_uid : -1); ksig.signb = SIGKILL; } else { ksig.signb = SIGSEGV; } ksig.code = 0; do_trapsignal: call_trapsignal(td, ksig.signb, ksig.code); out: /* If returning to user mode, make sure to invoke userret() */ if (user) userret(td, tf); } /* * dab_fatal() handles the following data aborts: * * FAULT_WRTBUF_0 - Vector Exception * FAULT_WRTBUF_1 - Terminal Exception * * We should never see these on a properly functioning system. * * This function is also called by the other handlers if they * detect a fatal problem. * * Note: If 'l' is NULL, we assume we're dealing with a prefetch abort. */ static int dab_fatal(struct trapframe *tf, u_int fsr, u_int far, struct thread *td, struct ksig *ksig) { const char *mode; #ifdef KDTRACE_HOOKS if (!TRAP_USERMODE(tf)) { if (dtrace_trap_func != NULL && (*dtrace_trap_func)(tf, far & FAULT_TYPE_MASK)) return (0); } #endif mode = TRAP_USERMODE(tf) ? "user" : "kernel"; disable_interrupts(PSR_I|PSR_F); if (td != NULL) { printf("Fatal %s mode data abort: '%s'\n", mode, data_aborts[fsr & FAULT_TYPE_MASK].desc); printf("trapframe: %p\nFSR=%08x, FAR=", tf, fsr); if ((fsr & FAULT_IMPRECISE) == 0) printf("%08x, ", far); else printf("Invalid, "); printf("spsr=%08x\n", tf->tf_spsr); } else { printf("Fatal %s mode prefetch abort at 0x%08x\n", mode, tf->tf_pc); printf("trapframe: %p, spsr=%08x\n", tf, tf->tf_spsr); } printf("r0 =%08x, r1 =%08x, r2 =%08x, r3 =%08x\n", tf->tf_r0, tf->tf_r1, tf->tf_r2, tf->tf_r3); printf("r4 =%08x, r5 =%08x, r6 =%08x, r7 =%08x\n", tf->tf_r4, tf->tf_r5, tf->tf_r6, tf->tf_r7); printf("r8 =%08x, r9 =%08x, r10=%08x, r11=%08x\n", tf->tf_r8, tf->tf_r9, tf->tf_r10, tf->tf_r11); printf("r12=%08x, ", tf->tf_r12); if (TRAP_USERMODE(tf)) printf("usp=%08x, ulr=%08x", tf->tf_usr_sp, tf->tf_usr_lr); else printf("ssp=%08x, slr=%08x", tf->tf_svc_sp, tf->tf_svc_lr); printf(", pc =%08x\n\n", tf->tf_pc); #ifdef KDB if (debugger_on_panic || kdb_active) if (kdb_trap(fsr, 0, tf)) return (0); #endif panic("Fatal abort"); /*NOTREACHED*/ } /* * dab_align() handles the following data aborts: * * FAULT_ALIGN_0 - Alignment fault * FAULT_ALIGN_1 - Alignment fault * * These faults are fatal if they happen in kernel mode. Otherwise, we * deliver a bus error to the process. 
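The pcb_onfault handling seen here is the kernel's lightweight recovery mechanism: copyin()/copyout()-style primitives (implemented in assembly elsewhere) arm a recovery address before touching user memory, and the abort handler warps the trapframe to it with EFAULT in r0 rather than panicking. A C-level sketch of the protocol only; the real primitives do not look like this, and the names are invented for illustration:

/*
 * Sketch only: the computed-label trick (a GCC extension) stands in
 * for the assembly recovery stub.
 */
int
copyin_sketch(const void *uaddr, void *kaddr, size_t len)
{
	struct pcb *pcb = curthread->td_pcb;

	/* Arm the recovery address before touching user memory. */
	pcb->pcb_onfault = &&copyfault;
	memcpy(kaddr, uaddr, len);	/* may abort on a bad user address */
	pcb->pcb_onfault = NULL;
	return (0);
copyfault:
	/* The abort handler set tf_r0 = EFAULT and tf_pc = pcb_onfault. */
	pcb->pcb_onfault = NULL;
	return (EFAULT);
}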
*/ static int dab_align(struct trapframe *tf, u_int fsr, u_int far, struct thread *td, struct ksig *ksig) { /* Alignment faults are always fatal if they occur in kernel mode */ if (!TRAP_USERMODE(tf)) { if (!td || !td->td_pcb->pcb_onfault) dab_fatal(tf, fsr, far, td, ksig); tf->tf_r0 = EFAULT; tf->tf_pc = (int)td->td_pcb->pcb_onfault; return (0); } /* pcb_onfault *must* be NULL at this point */ /* Deliver a bus error signal to the process */ ksig->code = 0; ksig->signb = SIGBUS; td->td_frame = tf; return (1); } /* * dab_buserr() handles the following data aborts: * * FAULT_BUSERR_0 - External Abort on Linefetch -- Section * FAULT_BUSERR_1 - External Abort on Linefetch -- Page * FAULT_BUSERR_2 - External Abort on Non-linefetch -- Section * FAULT_BUSERR_3 - External Abort on Non-linefetch -- Page * FAULT_BUSTRNL1 - External abort on Translation -- Level 1 * FAULT_BUSTRNL2 - External abort on Translation -- Level 2 * * If pcb_onfault is set, flag the fault and return to the handler. * If the fault occurred in user mode, give the process a SIGBUS. * * Note: On XScale, FAULT_BUSERR_0, FAULT_BUSERR_1, and FAULT_BUSERR_2 * can be flagged as imprecise in the FSR. This causes a real headache * since some of the machine state is lost. In this case, tf->tf_pc * may not actually point to the offending instruction. In fact, if * we've taken a double abort fault, it generally points somewhere near * the top of "data_abort_entry" in exception.S. * * In all other cases, these data aborts are considered fatal. */ static int dab_buserr(struct trapframe *tf, u_int fsr, u_int far, struct thread *td, struct ksig *ksig) { struct pcb *pcb = td->td_pcb; #ifdef __XSCALE__ if ((fsr & FAULT_IMPRECISE) != 0 && (tf->tf_spsr & PSR_MODE) == PSR_ABT32_MODE) { /* * Oops, an imprecise, double abort fault. We've lost the * r14_abt/spsr_abt values corresponding to the original * abort, and the spsr saved in the trapframe indicates * ABT mode. */ tf->tf_spsr &= ~PSR_MODE; /* * We use a simple heuristic to determine if the double abort * happened as a result of a kernel or user mode access. * If the current trapframe is at the top of the kernel stack, * the fault _must_ have come from user mode. */ if (tf != ((struct trapframe *)pcb->pcb_regs.sf_sp) - 1) { /* * Kernel mode. We're either about to die a * spectacular death, or pcb_onfault will come * to our rescue. Either way, the current value * of tf->tf_pc is irrelevant. */ tf->tf_spsr |= PSR_SVC32_MODE; if (pcb->pcb_onfault == NULL) printf("\nKernel mode double abort!\n"); } else { /* * User mode. We've lost the program counter at the * time of the fault (not that it was accurate anyway; * it's not called an imprecise fault for nothing). * About all we can do is copy r14_usr to tf_pc and * hope for the best. The process is about to get a * SIGBUS, so it's probably history anyway. 
*/ tf->tf_spsr |= PSR_USR32_MODE; tf->tf_pc = tf->tf_usr_lr; } } /* FAR is invalid for imprecise exceptions */ if ((fsr & FAULT_IMPRECISE) != 0) far = 0; #endif /* __XSCALE__ */ if (pcb->pcb_onfault) { tf->tf_r0 = EFAULT; tf->tf_pc = (register_t)(intptr_t) pcb->pcb_onfault; return (0); } /* * At this point, if the fault happened in kernel mode, we're toast */ if (!TRAP_USERMODE(tf)) dab_fatal(tf, fsr, far, td, ksig); /* Deliver a bus error signal to the process */ ksig->signb = SIGBUS; ksig->code = 0; td->td_frame = tf; return (1); } /* * void prefetch_abort_handler(struct trapframe *tf) * * Abort handler called when instruction execution occurs at * a nonexistent or restricted (access permissions) memory page. * If the address is invalid and we were in SVC mode then panic as * the kernel should never prefetch abort. * If the address is invalid and the page is mapped then the user process * does not have read permission, so send it a signal. * Otherwise fault the page in and try again. */ static void prefetch_abort_handler(struct trapframe *tf) { struct thread *td; struct proc * p; struct vm_map *map; vm_offset_t fault_pc, va; int error = 0; struct ksig ksig; #if 0 /* Update vmmeter statistics */ uvmexp.traps++; #endif #if 0 printf("prefetch abort handler: %p %p\n", (void*)tf->tf_pc, (void*)tf->tf_usr_lr); #endif td = curthread; p = td->td_proc; PCPU_INC(cnt.v_trap); if (TRAP_USERMODE(tf)) { td->td_frame = tf; if (td->td_cowgen != td->td_proc->p_cowgen) thread_cow_update(td); } fault_pc = tf->tf_pc; if (td->td_md.md_spinlock_count == 0) { if (__predict_true(tf->tf_spsr & PSR_I) == 0) enable_interrupts(PSR_I); if (__predict_true(tf->tf_spsr & PSR_F) == 0) enable_interrupts(PSR_F); } /* Prefetch aborts cannot happen in kernel mode */ if (__predict_false(!TRAP_USERMODE(tf))) dab_fatal(tf, 0, tf->tf_pc, NULL, &ksig); td->td_pticks = 0; /* OK, validate the address; we can only execute in USER space */ if (__predict_false(fault_pc >= VM_MAXUSER_ADDRESS || (fault_pc < VM_MIN_ADDRESS && vector_page == ARM_VECTORS_LOW))) { ksig.signb = SIGSEGV; ksig.code = 0; goto do_trapsignal; } map = &td->td_proc->p_vmspace->vm_map; va = trunc_page(fault_pc); /* * See if the pmap can handle this fault on its own... */ #ifdef DEBUG last_fault_code = -1; #endif if (pmap_fault_fixup(map->pmap, va, VM_PROT_READ, 1)) goto out; error = vm_fault(map, va, VM_PROT_READ | VM_PROT_EXECUTE, VM_FAULT_NORMAL); if (__predict_true(error == 0)) goto out; if (error == ENOMEM) { printf("VM: pid %d (%s), uid %d killed: " "out of swap\n", td->td_proc->p_pid, td->td_name, (td->td_proc->p_ucred) ? td->td_proc->p_ucred->cr_uid : -1); ksig.signb = SIGKILL; } else { ksig.signb = SIGSEGV; } ksig.code = 0; do_trapsignal: call_trapsignal(td, ksig.signb, ksig.code); out: userret(td, tf); } extern int badaddr_read_1(const uint8_t *, uint8_t *); extern int badaddr_read_2(const uint16_t *, uint16_t *); extern int badaddr_read_4(const uint32_t *, uint32_t *); /* * Tentatively read an 8, 16, or 32-bit value from 'addr'. * If the read succeeds, the value is written to 'rptr' and zero is returned. * Else, return EFAULT. */ int badaddr_read(void *addr, size_t size, void *rptr) { union { uint8_t v1; uint16_t v2; uint32_t v4; } u; int rv; cpu_drain_writebuf(); /* Read from the test address.
*/ switch (size) { case sizeof(uint8_t): rv = badaddr_read_1(addr, &u.v1); if (rv == 0 && rptr) *(uint8_t *) rptr = u.v1; break; case sizeof(uint16_t): rv = badaddr_read_2(addr, &u.v2); if (rv == 0 && rptr) *(uint16_t *) rptr = u.v2; break; case sizeof(uint32_t): rv = badaddr_read_4(addr, &u.v4); if (rv == 0 && rptr) *(uint32_t *) rptr = u.v4; break; default: panic("badaddr: invalid size (%lu)", (u_long) size); } /* Return EFAULT if the address was invalid, else zero */ return (rv); } Index: stable/11/sys/arm/arm/trap-v6.c =================================================================== --- stable/11/sys/arm/arm/trap-v6.c (revision 331016) +++ stable/11/sys/arm/arm/trap-v6.c (revision 331017) @@ -1,646 +1,647 @@ /*- * Copyright 2014 Olivier Houchard * Copyright 2014 Svatopluk Kraus * Copyright 2014 Michal Meloun * Copyright 2014 Andrew Turner * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "opt_ktrace.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include +#include #ifdef KTRACE #include #include #endif #include #include #include #include #include #include #include #include #include #include #ifdef KDB #include #include #endif #ifdef KDTRACE_HOOKS #include #endif extern char cachebailout[]; #ifdef DEBUG int last_fault_code; /* For the benefit of pmap_fault_fixup() */ #endif struct ksig { int sig; u_long code; vm_offset_t addr; }; typedef int abort_func_t(struct trapframe *, u_int, u_int, u_int, u_int, struct thread *, struct ksig *); static abort_func_t abort_fatal; static abort_func_t abort_align; static abort_func_t abort_icache; struct abort { abort_func_t *func; const char *desc; }; /* * How are the aborts handled? * * Undefined Code: * - Always fatal as we do not know what does it mean. * Imprecise External Abort: * - Always fatal, but can be handled somehow in the future. * Now, due to PCIe buggy hardware, ignored. * Precise External Abort: * - Always fatal, but who knows in the future??? * Debug Event: * - Special handling. * External Translation Abort (L1 & L2) * - Always fatal as something is screwed up in page tables or hardware. * Domain Fault (L1 & L2): * - Always fatal as we do not play game with domains. 
* Alignment Fault: * - Everything should be aligned in kernel with exception of user to kernel * and vice versa data copying, so if pcb_onfault is not set, it's fatal. * We generate signal in case of abort from user mode. * Instruction cache maintenance: * - According to the manual, this is a translation fault during a cache * maintenance operation. So, it could be really complex in the SMP case * and fuzzy too for cache operations working on virtual addresses. For now, * we will consider this abort as fatal. In fact, no cache maintenance on * unmapped virtual addresses should be called. As cache maintenance * operations (except DMB, DSB, and Flush Prefetch Buffer) are privileged, * the abort is fatal for user mode as well for now. (This is a good place to * note that cache maintenance on a virtual address fills the TLB.) * Access Bit (L1 & L2): * - Fast hardware emulation for kernel and user mode. * Translation Fault (L1 & L2): * - Standard fault mechanism is held including vm_fault(). * Permission Fault (L1 & L2): * - Fast hardware emulation of modify bits and in other cases, standard * fault mechanism is held including vm_fault(). */ static const struct abort aborts[] = { {abort_fatal, "Undefined Code (0x000)"}, {abort_align, "Alignment Fault"}, {abort_fatal, "Debug Event"}, {NULL, "Access Bit (L1)"}, {NULL, "Instruction cache maintenance"}, {NULL, "Translation Fault (L1)"}, {NULL, "Access Bit (L2)"}, {NULL, "Translation Fault (L2)"}, {abort_fatal, "External Abort"}, {abort_fatal, "Domain Fault (L1)"}, {abort_fatal, "Undefined Code (0x00A)"}, {abort_fatal, "Domain Fault (L2)"}, {abort_fatal, "External Translation Abort (L1)"}, {NULL, "Permission Fault (L1)"}, {abort_fatal, "External Translation Abort (L2)"}, {NULL, "Permission Fault (L2)"}, {abort_fatal, "TLB Conflict Abort"}, {abort_fatal, "Undefined Code (0x401)"}, {abort_fatal, "Undefined Code (0x402)"}, {abort_fatal, "Undefined Code (0x403)"}, {abort_fatal, "Undefined Code (0x404)"}, {abort_fatal, "Undefined Code (0x405)"}, {abort_fatal, "Asynchronous External Abort"}, {abort_fatal, "Undefined Code (0x407)"}, {abort_fatal, "Asynchronous Parity Error on Memory Access"}, {abort_fatal, "Parity Error on Memory Access"}, {abort_fatal, "Undefined Code (0x40A)"}, {abort_fatal, "Undefined Code (0x40B)"}, {abort_fatal, "Parity Error on Translation (L1)"}, {abort_fatal, "Undefined Code (0x40D)"}, {abort_fatal, "Parity Error on Translation (L2)"}, {abort_fatal, "Undefined Code (0x40F)"} }; static __inline void call_trapsignal(struct thread *td, int sig, int code, vm_offset_t addr) { ksiginfo_t ksi; CTR4(KTR_TRAP, "%s: addr: %#x, sig: %d, code: %d", __func__, addr, sig, code); /* * TODO: some info would be nice to know * if we are serving a data or prefetch abort. */ ksiginfo_init_trap(&ksi); ksi.ksi_signo = sig; ksi.ksi_code = code; ksi.ksi_addr = (void *)addr; trapsignal(td, &ksi); } /* * abort_imprecise() handles the following abort: * * FAULT_EA_IMPREC - Imprecise External Abort * * Imprecise means that we don't know where the abort happened, * thus FAR is undefined. The abort should never fire, but hot * plugging or accidental hardware failure can be the cause of it. * If the abort happens, it can even arrive on a different (thread) context. * Without any additional support, the abort is fatal, as we do not * know what really happened. * * QQQ: Some additional functionality, like pcb_onfault but global, * can be implemented. Imprecise handlers could be registered * which tell us if the abort is caused by something they know * about.
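The aborts[] table has 32 slots because the short-descriptor FSR spreads its status field across bits [3:0] and bit 10; FSR_TO_FAULT() (defined in the headers) folds bit 10 in as bit 4 of the index, which is why the second half of the table carries the 0x4xx names. A restatement of the usual fold, shown here for illustration:

#include <assert.h>
#include <stdint.h>

/* Fold DFSR/IFSR status bits [3:0] and [10] into a 5-bit table index. */
#define FSR_TO_FAULT(fsr)	(((fsr) & 0xf) | (((fsr) >> 6) & 0x10))

int
main(void)
{
	assert(FSR_TO_FAULT(0x005) == 0x05);	/* Translation Fault (L1) */
	assert(FSR_TO_FAULT(0x40d) == 0x1d);	/* index bit 4 from FSR[10] */
	return (0);
}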
They should return one of three codes like: * FAULT_IS_MINE, * FAULT_CAN_BE_MINE, * FAULT_IS_NOT_MINE. * The handlers should be called until some of them returns * FAULT_IS_MINE value or all was called. If all handlers return * FAULT_IS_NOT_MINE value, then the abort is fatal. */ static __inline void abort_imprecise(struct trapframe *tf, u_int fsr, u_int prefetch, bool usermode) { /* * XXX - We can got imprecise abort as result of access * to not-present PCI/PCIe configuration space. */ #if 0 goto out; #endif abort_fatal(tf, FAULT_EA_IMPREC, fsr, 0, prefetch, curthread, NULL); /* * Returning from this function means that we ignore * the abort for good reason. Note that imprecise abort * could fire any time even in user mode. */ #if 0 out: if (usermode) userret(curthread, tf); #endif } /* * abort_debug() handles the following abort: * * FAULT_DEBUG - Debug Event * */ static __inline void abort_debug(struct trapframe *tf, u_int fsr, u_int prefetch, bool usermode, u_int far) { if (usermode) { struct thread *td; td = curthread; call_trapsignal(td, SIGTRAP, TRAP_BRKPT, far); userret(td, tf); } else { #ifdef KDB kdb_trap((prefetch) ? T_BREAKPOINT : T_WATCHPOINT, 0, tf); #else printf("No debugger in kernel.\n"); #endif } } /* * Abort handler. * * FAR, FSR, and everything what can be lost after enabling * interrupts must be grabbed before the interrupts will be * enabled. Note that when interrupts will be enabled, we * could even migrate to another CPU ... * * TODO: move quick cases to ASM */ void abort_handler(struct trapframe *tf, int prefetch) { struct thread *td; vm_offset_t far, va; int idx, rv; uint32_t fsr; struct ksig ksig; struct proc *p; struct pcb *pcb; struct vm_map *map; struct vmspace *vm; vm_prot_t ftype; bool usermode; #ifdef INVARIANTS void *onfault; #endif PCPU_INC(cnt.v_trap); td = curthread; fsr = (prefetch) ? cp15_ifsr_get(): cp15_dfsr_get(); #if __ARM_ARCH >= 7 far = (prefetch) ? cp15_ifar_get() : cp15_dfar_get(); #else far = (prefetch) ? TRAPF_PC(tf) : cp15_dfar_get(); #endif idx = FSR_TO_FAULT(fsr); usermode = TRAPF_USERMODE(tf); /* Abort came from user mode? */ if (usermode) td->td_frame = tf; CTR6(KTR_TRAP, "%s: fsr %#x (idx %u) far %#x prefetch %u usermode %d", __func__, fsr, idx, far, prefetch, usermode); /* * Firstly, handle aborts that are not directly related to mapping. */ if (__predict_false(idx == FAULT_EA_IMPREC)) { abort_imprecise(tf, fsr, prefetch, usermode); return; } if (__predict_false(idx == FAULT_DEBUG)) { abort_debug(tf, fsr, prefetch, usermode, far); return; } /* * ARM has a set of unprivileged load and store instructions * (LDRT/LDRBT/STRT/STRBT ...) which are supposed to be used in other * than user mode and OS should recognize their aborts and behave * appropriately. However, there is no way how to do that reasonably * in general unless we restrict the handling somehow. * * For now, these instructions are used only in copyin()/copyout() * like functions where usermode buffers are checked in advance that * they are not from KVA space. Thus, no action is needed here. */ /* * (1) Handle access and R/W hardware emulation aborts. * (2) Check that abort is not on pmap essential address ranges. * There is no way how to fix it, so we don't even try. 
*/ rv = pmap_fault(PCPU_GET(curpmap), far, fsr, idx, usermode); if (rv == KERN_SUCCESS) return; #ifdef KDB if (kdb_active) { kdb_reenter(); goto out; } #endif if (rv == KERN_INVALID_ADDRESS) goto nogo; if (__predict_false((td->td_pflags & TDP_NOFAULTING) != 0)) { /* * Due to both processor errata and lazy TLB invalidation when * access restrictions are removed from virtual pages, memory * accesses that are allowed by the physical mapping layer may * nonetheless cause one spurious page fault per virtual page. * When the thread is executing a "no faulting" section that * is bracketed by vm_fault_{disable,enable}_pagefaults(), * every page fault is treated as a spurious page fault, * unless it accesses the same virtual address as the most * recent page fault within the same "no faulting" section. */ if (td->td_md.md_spurflt_addr != far || (td->td_pflags & TDP_RESETSPUR) != 0) { td->td_md.md_spurflt_addr = far; td->td_pflags &= ~TDP_RESETSPUR; tlb_flush_local(far & ~PAGE_MASK); return; } } else { /* * If we get a page fault while in a critical section, then * it is most likely a fatal kernel page fault. The kernel * is already going to panic trying to get a sleep lock to * do the VM lookup, so just consider it a fatal trap so the * kernel can print out a useful trap message and even get * to the debugger. * * If we get a page fault while holding a non-sleepable * lock, then it is most likely a fatal kernel page fault. * If WITNESS is enabled, then it's going to whine about * bogus LORs with various VM locks, so just skip to the * fatal trap handling directly. */ if (td->td_critnest != 0 || WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL, "Kernel page fault") != 0) { abort_fatal(tf, idx, fsr, far, prefetch, td, &ksig); return; } } /* Re-enable interrupts if they were enabled previously. */ if (td->td_md.md_spinlock_count == 0) { if (__predict_true(tf->tf_spsr & PSR_I) == 0) enable_interrupts(PSR_I); if (__predict_true(tf->tf_spsr & PSR_F) == 0) enable_interrupts(PSR_F); } p = td->td_proc; if (usermode) { td->td_pticks = 0; if (td->td_cowgen != p->p_cowgen) thread_cow_update(td); } /* Invoke the appropriate handler, if necessary. */ if (__predict_false(aborts[idx].func != NULL)) { if ((aborts[idx].func)(tf, idx, fsr, far, prefetch, td, &ksig)) goto do_trapsignal; goto out; } /* * At this point, we're dealing with one of the following aborts: * * FAULT_ICACHE - I-cache maintenance * FAULT_TRAN_xx - Translation * FAULT_PERM_xx - Permission */ /* * Don't pass faulting cache operation to vm_fault(). We don't want * to handle all vm stuff at this moment. */ pcb = td->td_pcb; if (__predict_false(pcb->pcb_onfault == cachebailout)) { tf->tf_r0 = far; /* return failing address */ tf->tf_pc = (register_t)pcb->pcb_onfault; return; } /* Handle remaining I-cache aborts. */ if (idx == FAULT_ICACHE) { if (abort_icache(tf, idx, fsr, far, prefetch, td, &ksig)) goto do_trapsignal; goto out; } va = trunc_page(far); if (va >= KERNBASE) { /* * Don't allow user-mode faults in kernel address space. */ if (usermode) goto nogo; map = kernel_map; } else { /* * This is a fault on non-kernel virtual memory. If curproc * is NULL or curproc->p_vmspace is NULL the fault is fatal. */ vm = (p != NULL) ? p->p_vmspace : NULL; if (vm == NULL) goto nogo; map = &vm->vm_map; if (!usermode && (td->td_intr_nesting_level != 0 || pcb->pcb_onfault == NULL)) { abort_fatal(tf, idx, fsr, far, prefetch, td, &ksig); return; } } ftype = (fsr & FSR_WNR) ? 
VM_PROT_WRITE : VM_PROT_READ; if (prefetch) ftype |= VM_PROT_EXECUTE; #ifdef DEBUG last_fault_code = fsr; #endif #ifdef INVARIANTS onfault = pcb->pcb_onfault; pcb->pcb_onfault = NULL; #endif /* Fault in the page. */ rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL); #ifdef INVARIANTS pcb->pcb_onfault = onfault; #endif if (__predict_true(rv == KERN_SUCCESS)) goto out; nogo: if (!usermode) { if (td->td_intr_nesting_level == 0 && pcb->pcb_onfault != NULL) { tf->tf_r0 = rv; tf->tf_pc = (int)pcb->pcb_onfault; return; } CTR2(KTR_TRAP, "%s: vm_fault() failed with %d", __func__, rv); abort_fatal(tf, idx, fsr, far, prefetch, td, &ksig); return; } ksig.sig = SIGSEGV; ksig.code = (rv == KERN_PROTECTION_FAILURE) ? SEGV_ACCERR : SEGV_MAPERR; ksig.addr = far; do_trapsignal: call_trapsignal(td, ksig.sig, ksig.code, ksig.addr); out: if (usermode) userret(td, tf); } /* * abort_fatal() handles the following data aborts: * * FAULT_DEBUG - Debug Event * FAULT_ACCESS_xx - Acces Bit * FAULT_EA_PREC - Precise External Abort * FAULT_DOMAIN_xx - Domain Fault * FAULT_EA_TRAN_xx - External Translation Abort * FAULT_EA_IMPREC - Imprecise External Abort * + all undefined codes for ABORT * * We should never see these on a properly functioning system. * * This function is also called by the other handlers if they * detect a fatal problem. * * Note: If 'l' is NULL, we assume we're dealing with a prefetch abort. */ static int abort_fatal(struct trapframe *tf, u_int idx, u_int fsr, u_int far, u_int prefetch, struct thread *td, struct ksig *ksig) { bool usermode; const char *mode; const char *rw_mode; usermode = TRAPF_USERMODE(tf); #ifdef KDTRACE_HOOKS if (!usermode) { if (dtrace_trap_func != NULL && (*dtrace_trap_func)(tf, far)) return (0); } #endif mode = usermode ? "user" : "kernel"; rw_mode = fsr & FSR_WNR ? "write" : "read"; disable_interrupts(PSR_I|PSR_F); if (td != NULL) { printf("Fatal %s mode data abort: '%s' on %s\n", mode, aborts[idx].desc, rw_mode); printf("trapframe: %p\nFSR=%08x, FAR=", tf, fsr); if (idx != FAULT_EA_IMPREC) printf("%08x, ", far); else printf("Invalid, "); printf("spsr=%08x\n", tf->tf_spsr); } else { printf("Fatal %s mode prefetch abort at 0x%08x\n", mode, tf->tf_pc); printf("trapframe: %p, spsr=%08x\n", tf, tf->tf_spsr); } printf("r0 =%08x, r1 =%08x, r2 =%08x, r3 =%08x\n", tf->tf_r0, tf->tf_r1, tf->tf_r2, tf->tf_r3); printf("r4 =%08x, r5 =%08x, r6 =%08x, r7 =%08x\n", tf->tf_r4, tf->tf_r5, tf->tf_r6, tf->tf_r7); printf("r8 =%08x, r9 =%08x, r10=%08x, r11=%08x\n", tf->tf_r8, tf->tf_r9, tf->tf_r10, tf->tf_r11); printf("r12=%08x, ", tf->tf_r12); if (usermode) printf("usp=%08x, ulr=%08x", tf->tf_usr_sp, tf->tf_usr_lr); else printf("ssp=%08x, slr=%08x", tf->tf_svc_sp, tf->tf_svc_lr); printf(", pc =%08x\n\n", tf->tf_pc); #ifdef KDB if (debugger_on_panic || kdb_active) kdb_trap(fsr, 0, tf); #endif panic("Fatal abort"); /*NOTREACHED*/ } /* * abort_align() handles the following data abort: * * FAULT_ALIGN - Alignment fault * * Everything should be aligned in kernel with exception of user to kernel * and vice versa data copying, so if pcb_onfault is not set, it's fatal. * We generate signal in case of abort from user mode. 
*/ static int abort_align(struct trapframe *tf, u_int idx, u_int fsr, u_int far, u_int prefetch, struct thread *td, struct ksig *ksig) { bool usermode; usermode = TRAPF_USERMODE(tf); if (!usermode) { if (td != NULL && td->td_intr_nesting_level == 0 && td->td_pcb->pcb_onfault != NULL) { tf->tf_r0 = EFAULT; tf->tf_pc = (int)td->td_pcb->pcb_onfault; return (0); } abort_fatal(tf, idx, fsr, far, prefetch, td, ksig); } /* Deliver a bus error signal to the process */ ksig->code = BUS_ADRALN; ksig->sig = SIGBUS; ksig->addr = far; return (1); } /* * abort_icache() handles the following data abort: * * FAULT_ICACHE - Instruction cache maintenance * * According to the manual, FAULT_ICACHE is a translation fault during a * cache maintenance operation. In fact, no cache maintenance operation * should be called on unmapped virtual addresses. As cache maintenance * operations (except DMB, DSB, and Flush Prefetch Buffer) are privileged, * the abort is considered fatal for now. However, all the business of * cache maintenance on virtual addresses could be really complex and fuzzy * in the SMP case, so maybe in the future the standard fault mechanism, * including the vm_fault() call, should be used here. */ static int abort_icache(struct trapframe *tf, u_int idx, u_int fsr, u_int far, u_int prefetch, struct thread *td, struct ksig *ksig) { abort_fatal(tf, idx, fsr, far, prefetch, td, ksig); return (0); } Index: stable/11/sys/arm/arm/undefined.c =================================================================== --- stable/11/sys/arm/arm/undefined.c (revision 331016) +++ stable/11/sys/arm/arm/undefined.c (revision 331017) @@ -1,350 +1,351 @@ /* $NetBSD: undefined.c,v 1.22 2003/11/29 22:21:29 bjh21 Exp $ */ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 2001 Ben Harris. * Copyright (c) 1995 Mark Brinicombe. * Copyright (c) 1995 Brini. * All rights reserved. * * This code is derived from software written for Brini by Mark Brinicombe * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Brini. * 4. The name of the company nor the name of the author may be used to * endorse or promote products derived from this software without specific * prior written permission. * * THIS SOFTWARE IS PROVIDED BY BRINI ``AS IS'' AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL BRINI OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE.
* * RiscBSD kernel project * * undefined.c * * Fault handler * * Created : 06/01/95 */ #include "opt_ddb.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include +#include #ifdef KDB #include #endif #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #ifdef KDB #include #endif #define ARM_COPROC_INSN(insn) (((insn) & (1 << 27)) != 0) #define ARM_VFP_INSN(insn) ((((insn) & 0xfe000000) == 0xf2000000) || \ (((insn) & 0xff100000) == 0xf4000000)) #define ARM_COPROC(insn) (((insn) >> 8) & 0xf) #define THUMB_32BIT_INSN(insn) ((insn) >= 0xe800) #define THUMB_COPROC_INSN(insn) (((insn) & (3 << 26)) == (3 << 26)) #define THUMB_COPROC_UNDEFINED(insn) (((insn) & 0x3e << 20) == 0) #define THUMB_VFP_INSN(insn) (((insn) & (3 << 24)) == (3 << 24)) #define THUMB_COPROC(insn) (((insn) >> 8) & 0xf) #define COPROC_VFP 10 static int gdb_trapper(u_int, u_int, struct trapframe *, int); LIST_HEAD(, undefined_handler) undefined_handlers[MAX_COPROCS]; void * install_coproc_handler(int coproc, undef_handler_t handler) { struct undefined_handler *uh; KASSERT(coproc >= 0 && coproc < MAX_COPROCS, ("bad coproc")); KASSERT(handler != NULL, ("handler is NULL")); /* Used to be legal. */ /* XXX: M_TEMP??? */ uh = malloc(sizeof(*uh), M_TEMP, M_WAITOK); uh->uh_handler = handler; install_coproc_handler_static(coproc, uh); return uh; } void install_coproc_handler_static(int coproc, struct undefined_handler *uh) { LIST_INSERT_HEAD(&undefined_handlers[coproc], uh, uh_link); } void remove_coproc_handler(void *cookie) { struct undefined_handler *uh = cookie; LIST_REMOVE(uh, uh_link); free(uh, M_TEMP); } static int gdb_trapper(u_int addr, u_int insn, struct trapframe *frame, int code) { struct thread *td; ksiginfo_t ksi; td = (curthread == NULL) ? &thread0 : curthread; if (insn == GDB_BREAKPOINT || insn == GDB5_BREAKPOINT) { if (code == FAULT_USER) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGTRAP; ksi.ksi_code = TRAP_BRKPT; ksi.ksi_addr = (u_int32_t *)addr; trapsignal(td, &ksi); return 0; } #if 0 #ifdef KGDB return !kgdb_trap(T_BREAKPOINT, frame); #endif #endif } return 1; } static struct undefined_handler gdb_uh; void undefined_init(void) { int loop; /* Not actually necessary -- the initialiser is just NULL */ for (loop = 0; loop < MAX_COPROCS; ++loop) LIST_INIT(&undefined_handlers[loop]); /* Install handler for GDB breakpoints */ gdb_uh.uh_handler = gdb_trapper; install_coproc_handler_static(0, &gdb_uh); } void undefinedinstruction(struct trapframe *frame) { struct thread *td; u_int fault_pc; int fault_instruction; int fault_code; int coprocessor; struct undefined_handler *uh; int error; #ifdef VERBOSE_ARM32 int s; #endif ksiginfo_t ksi; /* Enable interrupts if they were enabled before the exception. */ if (__predict_true(frame->tf_spsr & PSR_I) == 0) enable_interrupts(PSR_I); if (__predict_true(frame->tf_spsr & PSR_F) == 0) enable_interrupts(PSR_F); PCPU_INC(cnt.v_trap); fault_pc = frame->tf_pc; /* * Get the current thread/proc structure or thread0/proc0 if there is * none. */ td = curthread == NULL ? &thread0 : curthread; coprocessor = 0; if ((frame->tf_spsr & PSR_T) == 0) { /* * Make sure the program counter is correctly aligned so we * don't take an alignment fault trying to read the opcode. 
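install_coproc_handler() above returns an opaque cookie and chains the handler onto undefined_handlers[coproc]; a handler returns 0 to claim the instruction and nonzero to decline, as the LIST_FOREACH dispatch below shows. A hedged usage sketch for a hypothetical VFP emulator (the my_vfp_* names are invented for illustration):

/*
 * Hypothetical handler: the signature matches gdb_trapper() above.
 * Returning 0 claims the undefined instruction; returning 1 lets the
 * trap fall through to the SIGILL path.
 */
static int
my_vfp_trapper(u_int addr, u_int insn, struct trapframe *frame, int code)
{
	if (code != FAULT_USER)
		return (1);	/* only emulate user-mode instructions */
	/* ... decode 'insn', update 'frame', skip the instruction ... */
	return (0);
}

static void *my_vfp_cookie;

static void
my_vfp_attach(void)
{
	my_vfp_cookie = install_coproc_handler(COPROC_VFP, my_vfp_trapper);
}

static void
my_vfp_detach(void)
{
	remove_coproc_handler(my_vfp_cookie);
}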
*/ if (__predict_false((fault_pc & 3) != 0)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGILL; ksi.ksi_code = ILL_ILLADR; ksi.ksi_addr = (u_int32_t *)(intptr_t) fault_pc; trapsignal(td, &ksi); userret(td, frame); return; } /* * Should use fuword() here ... but in the interests of * squeezing every bit of speed we will just use ReadWord(). * We know the instruction can be read as it was just executed * so this will never fail unless the kernel is screwed up * in which case it does not really matter, does it? */ fault_instruction = *(u_int32_t *)fault_pc; /* Check for coprocessor instruction */ /* * According to the datasheets you only need to look at bit * 27 of the instruction to tell the difference between an * undefined instruction and a coprocessor instruction * following an undefined instruction trap. */ if (ARM_COPROC_INSN(fault_instruction)) coprocessor = ARM_COPROC(fault_instruction); else { /* check for special instructions */ if (ARM_VFP_INSN(fault_instruction)) coprocessor = COPROC_VFP; /* vfp / simd */ } } else { #if __ARM_ARCH >= 7 fault_instruction = *(uint16_t *)fault_pc; if (THUMB_32BIT_INSN(fault_instruction)) { fault_instruction <<= 16; fault_instruction |= *(uint16_t *)(fault_pc + 2); /* * Is it a Coprocessor, Advanced SIMD, or * Floating-point instruction? */ if (THUMB_COPROC_INSN(fault_instruction)) { if (THUMB_COPROC_UNDEFINED(fault_instruction)) { /* undefined insn */ } else if (THUMB_VFP_INSN(fault_instruction)) coprocessor = COPROC_VFP; else coprocessor = THUMB_COPROC(fault_instruction); } } #else /* * No support for Thumb-2 on this CPU. */ ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGILL; ksi.ksi_code = ILL_ILLADR; ksi.ksi_addr = (u_int32_t *)(intptr_t) fault_pc; trapsignal(td, &ksi); userret(td, frame); return; #endif } if ((frame->tf_spsr & PSR_MODE) == PSR_USR32_MODE) { /* * Modify the fault_code to reflect the USR/SVC state at * time of fault. */ fault_code = FAULT_USER; td->td_frame = frame; } else fault_code = 0; /* OK, this is where we do something about the instruction. */ LIST_FOREACH(uh, &undefined_handlers[coprocessor], uh_link) if (uh->uh_handler(fault_pc, fault_instruction, frame, fault_code) == 0) break; if (fault_code & FAULT_USER) { /* TODO: No support for ptrace from Thumb-2 */ if ((frame->tf_spsr & PSR_T) == 0 && fault_instruction == PTRACE_BREAKPOINT) { PROC_LOCK(td->td_proc); _PHOLD(td->td_proc); error = ptrace_clear_single_step(td); _PRELE(td->td_proc); PROC_UNLOCK(td->td_proc); if (error != 0) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGILL; ksi.ksi_code = ILL_ILLOPC; ksi.ksi_addr = (u_int32_t *)(intptr_t) fault_pc; trapsignal(td, &ksi); } return; } } if (uh == NULL && (fault_code & FAULT_USER)) { /* Fault has not been handled */ ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGILL; ksi.ksi_code = ILL_ILLOPC; ksi.ksi_addr = (u_int32_t *)(intptr_t) fault_pc; trapsignal(td, &ksi); } if ((fault_code & FAULT_USER) == 0) { if (fault_instruction == KERNEL_BREAKPOINT) { #ifdef KDB kdb_trap(T_BREAKPOINT, 0, frame); #else printf("No debugger in kernel.\n"); #endif return; } else panic("Undefined instruction in kernel.\n"); } userret(td, frame); } Index: stable/11/sys/arm64/arm64/minidump_machdep.c =================================================================== --- stable/11/sys/arm64/arm64/minidump_machdep.c (revision 331016) +++ stable/11/sys/arm64/arm64/minidump_machdep.c (revision 331017) @@ -1,465 +1,466 @@ /*- * Copyright (c) 2006 Peter Wemm * Copyright (c) 2015 The FreeBSD Foundation * All rights reserved.
* * This software was developed by Andrew Turner under * sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_watchdog.h" #include "opt_watchdog.h" #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include CTASSERT(sizeof(struct kerneldumpheader) == 512); /* * Don't touch the first SIZEOF_METADATA bytes on the dump device. This * is to protect us from metadata and to protect metadata from us. */ #define SIZEOF_METADATA (64*1024) uint64_t *vm_page_dump; int vm_page_dump_size; static struct kerneldumpheader kdh; static off_t dumplo; /* Handle chunked writes. 
*/ static size_t fragsz; static void *dump_va; static size_t counter, progress, dumpsize; static uint64_t tmpbuffer[PAGE_SIZE / sizeof(uint64_t)]; CTASSERT(sizeof(*vm_page_dump) == 8); static int is_dumpable(vm_paddr_t pa) { vm_page_t m; int i; if ((m = vm_phys_paddr_to_vm_page(pa)) != NULL) return ((m->flags & PG_NODUMP) == 0); for (i = 0; dump_avail[i] != 0 || dump_avail[i + 1] != 0; i += 2) { if (pa >= dump_avail[i] && pa < dump_avail[i + 1]) return (1); } return (0); } static int blk_flush(struct dumperinfo *di) { int error; if (fragsz == 0) return (0); error = dump_write(di, dump_va, 0, dumplo, fragsz); dumplo += fragsz; fragsz = 0; return (error); } static struct { int min_per; int max_per; int visited; } progress_track[10] = { { 0, 10, 0}, { 10, 20, 0}, { 20, 30, 0}, { 30, 40, 0}, { 40, 50, 0}, { 50, 60, 0}, { 60, 70, 0}, { 70, 80, 0}, { 80, 90, 0}, { 90, 100, 0} }; static void report_progress(size_t progress, size_t dumpsize) { int sofar, i; sofar = 100 - ((progress * 100) / dumpsize); for (i = 0; i < nitems(progress_track); i++) { if (sofar < progress_track[i].min_per || sofar > progress_track[i].max_per) continue; if (progress_track[i].visited) return; progress_track[i].visited = 1; printf("..%d%%", sofar); return; } } static int blk_write(struct dumperinfo *di, char *ptr, vm_paddr_t pa, size_t sz) { size_t len; int error, c; u_int maxdumpsz; maxdumpsz = min(di->maxiosize, MAXDUMPPGS * PAGE_SIZE); if (maxdumpsz == 0) /* seatbelt */ maxdumpsz = PAGE_SIZE; error = 0; if ((sz % PAGE_SIZE) != 0) { printf("size not page aligned\n"); return (EINVAL); } if (ptr != NULL && pa != 0) { printf("cant have both va and pa!\n"); return (EINVAL); } if ((((uintptr_t)pa) % PAGE_SIZE) != 0) { printf("address not page aligned %p\n", ptr); return (EINVAL); } if (ptr != NULL) { /* * If we're doing a virtual dump, flush any * pre-existing pa pages. */ error = blk_flush(di); if (error) return (error); } while (sz) { len = maxdumpsz - fragsz; if (len > sz) len = sz; counter += len; progress -= len; if (counter >> 22) { report_progress(progress, dumpsize); counter &= (1 << 22) - 1; } wdog_kern_pat(WD_LASTVAL); if (ptr) { error = dump_write(di, ptr, 0, dumplo, len); if (error) return (error); dumplo += len; ptr += len; sz -= len; } else { dump_va = (void *)PHYS_TO_DMAP(pa); fragsz += len; pa += len; sz -= len; error = blk_flush(di); if (error) return (error); } /* Check for user abort. */ c = cncheckc(); if (c == 0x03) return (ECANCELED); if (c != -1) printf(" (CTRL-C to abort) "); } return (0); } int minidumpsys(struct dumperinfo *di) { pd_entry_t *l0, *l1, *l2; pt_entry_t *l3; uint32_t pmapsize; vm_offset_t va; vm_paddr_t pa; int error; uint64_t bits; int i, bit; int retry_count; struct minidumphdr mdhdr; retry_count = 0; retry: retry_count++; error = 0; pmapsize = 0; for (va = VM_MIN_KERNEL_ADDRESS; va < kernel_vm_end; va += L2_SIZE) { pmapsize += PAGE_SIZE; if (!pmap_get_tables(pmap_kernel(), va, &l0, &l1, &l2, &l3)) continue; /* We should always be using the l2 table for kvm */ if (l2 == NULL) continue; if ((*l2 & ATTR_DESCR_MASK) == L2_BLOCK) { pa = *l2 & ~ATTR_MASK; for (i = 0; i < Ln_ENTRIES; i++, pa += PAGE_SIZE) { if (is_dumpable(pa)) dump_add_page(pa); } } else if ((*l2 & ATTR_DESCR_MASK) == L2_TABLE) { for (i = 0; i < Ln_ENTRIES; i++) { if ((l3[i] & ATTR_DESCR_MASK) != L3_PAGE) continue; pa = l3[i] & ~ATTR_MASK; if (is_dumpable(pa)) dump_add_page(pa); } } } /* Calculate dump size. 
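Two details of the progress accounting above are easy to misread: 'progress' counts down from dumpsize, so the printed percentage is 100 minus the remaining fraction, and 'counter >> 22' throttles reports to one per 4 MiB written. A standalone restatement:

#include <stddef.h>
#include <stdio.h>

static void
show_progress(size_t progress, size_t dumpsize)
{
	/* 'progress' counts DOWN from dumpsize, hence the "100 -". */
	int sofar = 100 - (int)((progress * 100) / dumpsize);

	printf("..%d%%", sofar);
}

int
main(void)
{
	size_t dumpsize = (size_t)1024 << 20;	/* a 1 GiB dump */

	show_progress(dumpsize, dumpsize);	/* ..0%  (nothing written) */
	show_progress(dumpsize / 4, dumpsize);	/* ..75% (3/4 written) */
	/* counter >> 22 becomes nonzero once 4 MiB has accumulated. */
	return (0);
}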
*/ dumpsize = pmapsize; dumpsize += round_page(msgbufp->msg_size); dumpsize += round_page(vm_page_dump_size); for (i = 0; i < vm_page_dump_size / sizeof(*vm_page_dump); i++) { bits = vm_page_dump[i]; while (bits) { bit = ffsl(bits) - 1; pa = (((uint64_t)i * sizeof(*vm_page_dump) * NBBY) + bit) * PAGE_SIZE; /* Clear out undumpable pages now if needed */ if (is_dumpable(pa)) dumpsize += PAGE_SIZE; else dump_drop_page(pa); bits &= ~(1ul << bit); } } dumpsize += PAGE_SIZE; /* Determine dump offset on device. */ if (di->mediasize < SIZEOF_METADATA + dumpsize + sizeof(kdh) * 2) { error = E2BIG; goto fail; } dumplo = di->mediaoffset + di->mediasize - dumpsize; dumplo -= sizeof(kdh) * 2; progress = dumpsize; /* Initialize mdhdr */ bzero(&mdhdr, sizeof(mdhdr)); strcpy(mdhdr.magic, MINIDUMP_MAGIC); mdhdr.version = MINIDUMP_VERSION; mdhdr.msgbufsize = msgbufp->msg_size; mdhdr.bitmapsize = vm_page_dump_size; mdhdr.pmapsize = pmapsize; mdhdr.kernbase = VM_MIN_KERNEL_ADDRESS; mdhdr.dmapphys = DMAP_MIN_PHYSADDR; mdhdr.dmapbase = DMAP_MIN_ADDRESS; mdhdr.dmapend = DMAP_MAX_ADDRESS; mkdumpheader(&kdh, KERNELDUMPMAGIC, KERNELDUMP_AARCH64_VERSION, dumpsize, di->blocksize); printf("Dumping %llu out of %ju MB:", (long long)dumpsize >> 20, ptoa((uintmax_t)physmem) / 1048576); /* Dump leader */ error = dump_write(di, &kdh, 0, dumplo, sizeof(kdh)); if (error) goto fail; dumplo += sizeof(kdh); /* Dump my header */ bzero(&tmpbuffer, sizeof(tmpbuffer)); bcopy(&mdhdr, &tmpbuffer, sizeof(mdhdr)); error = blk_write(di, (char *)&tmpbuffer, 0, PAGE_SIZE); if (error) goto fail; /* Dump msgbuf up front */ error = blk_write(di, (char *)msgbufp->msg_ptr, 0, round_page(msgbufp->msg_size)); if (error) goto fail; /* Dump bitmap */ error = blk_write(di, (char *)vm_page_dump, 0, round_page(vm_page_dump_size)); if (error) goto fail; /* Dump kernel page directory pages */ bzero(&tmpbuffer, sizeof(tmpbuffer)); for (va = VM_MIN_KERNEL_ADDRESS; va < kernel_vm_end; va += L2_SIZE) { if (!pmap_get_tables(pmap_kernel(), va, &l0, &l1, &l2, &l3)) { /* We always write a page, even if it is zero */ error = blk_write(di, (char *)&tmpbuffer, 0, PAGE_SIZE); if (error) goto fail; /* flush, in case we reuse tmpbuffer in the same block */ error = blk_flush(di); if (error) goto fail; } else if (l2 == NULL) { pa = (*l1 & ~ATTR_MASK) | (va & L1_OFFSET); /* Generate fake l3 entries based upon the l1 entry */ for (i = 0; i < Ln_ENTRIES; i++) { tmpbuffer[i] = pa + (i * PAGE_SIZE) | ATTR_DEFAULT | L3_PAGE; } /* We always write a page, even if it is zero */ error = blk_write(di, (char *)&tmpbuffer, 0, PAGE_SIZE); if (error) goto fail; /* flush, in case we reuse tmpbuffer in the same block */ error = blk_flush(di); if (error) goto fail; bzero(&tmpbuffer, sizeof(tmpbuffer)); } else if ((*l2 & ATTR_DESCR_MASK) == L2_BLOCK) { /* TODO: Handle an invalid L2 entry */ pa = (*l2 & ~ATTR_MASK) | (va & L2_OFFSET); /* Generate fake l3 entries based upon the l2 entry */ for (i = 0; i < Ln_ENTRIES; i++) { tmpbuffer[i] = pa + (i * PAGE_SIZE) | ATTR_DEFAULT | L3_PAGE; } /* We always write a page, even if it is zero */ error = blk_write(di, (char *)&tmpbuffer, 0, PAGE_SIZE); if (error) goto fail; /* flush, in case we reuse tmpbuffer in the same block */ error = blk_flush(di); if (error) goto fail; bzero(&tmpbuffer, sizeof(tmpbuffer)); continue; } else { pa = *l2 & ~ATTR_MASK; /* We always write a page, even if it is zero */ error = blk_write(di, NULL, pa, PAGE_SIZE); if (error) goto fail; } } /* Dump memory chunks */ /* XXX cluster it up and use blk_dump() */ for (i = 0; i <
vm_page_dump_size / sizeof(*vm_page_dump); i++) { bits = vm_page_dump[i]; while (bits) { bit = ffsl(bits) - 1; pa = (((uint64_t)i * sizeof(*vm_page_dump) * NBBY) + bit) * PAGE_SIZE; error = blk_write(di, 0, pa, PAGE_SIZE); if (error) goto fail; bits &= ~(1ul << bit); } } error = blk_flush(di); if (error) goto fail; /* Dump trailer */ error = dump_write(di, &kdh, 0, dumplo, sizeof(kdh)); if (error) goto fail; dumplo += sizeof(kdh); /* Signal completion, signoff and exit stage left. */ dump_write(di, NULL, 0, 0, 0); printf("\nDump complete\n"); return (0); fail: if (error < 0) error = -error; printf("\n"); if (error == ENOSPC) { printf("Dump map grown while dumping. "); if (retry_count < 5) { printf("Retrying...\n"); goto retry; } printf("Dump failed.\n"); } else if (error == ECANCELED) printf("Dump aborted\n"); else if (error == E2BIG) printf("Dump failed. Partition too small.\n"); else printf("** DUMP FAILED (ERROR %d) **\n", error); return (error); } void dump_add_page(vm_paddr_t pa) { int idx, bit; pa >>= PAGE_SHIFT; idx = pa >> 6; /* 2^6 = 64 */ bit = pa & 63; atomic_set_long(&vm_page_dump[idx], 1ul << bit); } void dump_drop_page(vm_paddr_t pa) { int idx, bit; pa >>= PAGE_SHIFT; idx = pa >> 6; /* 2^6 = 64 */ bit = pa & 63; atomic_clear_long(&vm_page_dump[idx], 1ul << bit); } Index: stable/11/sys/arm64/arm64/uma_machdep.c =================================================================== --- stable/11/sys/arm64/arm64/uma_machdep.c (revision 331016) +++ stable/11/sys/arm64/arm64/uma_machdep.c (revision 331017) @@ -1,76 +1,77 @@ /*- * Copyright (c) 2003 Alan L. Cox * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include +#include #include #include #include #include #include #include #include void * uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait) { vm_page_t m; vm_paddr_t pa; void *va; *flags = UMA_SLAB_PRIV; m = vm_page_alloc(NULL, 0, malloc2vm_flags(wait) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (m == NULL) return (NULL); pa = m->phys_addr; if ((wait & M_NODUMP) == 0) dump_add_page(pa); va = (void *)PHYS_TO_DMAP(pa); if ((wait & M_ZERO) && (m->flags & PG_ZERO) == 0) bzero(va, PAGE_SIZE); return (va); } void uma_small_free(void *mem, vm_size_t size, u_int8_t flags) { vm_page_t m; vm_paddr_t pa; pa = DMAP_TO_PHYS((vm_offset_t)mem); dump_drop_page(pa); m = PHYS_TO_VM_PAGE(pa); m->wire_count--; vm_page_free(m); atomic_subtract_int(&vm_cnt.v_wire_count, 1); } Index: stable/11/sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c =================================================================== --- stable/11/sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c (revision 331016) +++ stable/11/sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c (revision 331017) @@ -1,274 +1,275 @@ /*- * Copyright (c) 2006-2007 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include +#include #include #include #include #include #ifdef KMEM_DEBUG #include #include #endif #ifdef _KERNEL MALLOC_DEFINE(M_SOLARIS, "solaris", "Solaris"); #else #define malloc(size, type, flags) malloc(size) #define free(addr, type) free(addr) #endif #ifdef KMEM_DEBUG struct kmem_item { struct stack stack; LIST_ENTRY(kmem_item) next; }; static LIST_HEAD(, kmem_item) kmem_items; static struct mtx kmem_items_mtx; MTX_SYSINIT(kmem_items_mtx, &kmem_items_mtx, "kmem_items", MTX_DEF); #endif /* KMEM_DEBUG */ #include void * zfs_kmem_alloc(size_t size, int kmflags) { void *p; #ifdef KMEM_DEBUG struct kmem_item *i; size += sizeof(struct kmem_item); #endif p = malloc(size, M_SOLARIS, kmflags); #ifndef _KERNEL if (kmflags & KM_SLEEP) assert(p != NULL); #endif #ifdef KMEM_DEBUG if (p != NULL) { i = p; p = (u_char *)p + sizeof(struct kmem_item); stack_save(&i->stack); mtx_lock(&kmem_items_mtx); LIST_INSERT_HEAD(&kmem_items, i, next); mtx_unlock(&kmem_items_mtx); } #endif return (p); } void zfs_kmem_free(void *buf, size_t size __unused) { #ifdef KMEM_DEBUG if (buf == NULL) { printf("%s: attempt to free NULL\n", __func__); return; } struct kmem_item *i; buf = (u_char *)buf - sizeof(struct kmem_item); mtx_lock(&kmem_items_mtx); LIST_FOREACH(i, &kmem_items, next) { if (i == buf) break; } ASSERT(i != NULL); LIST_REMOVE(i, next); mtx_unlock(&kmem_items_mtx); #endif free(buf, M_SOLARIS); } static uint64_t kmem_size_val; static void kmem_size_init(void *unused __unused) { kmem_size_val = (uint64_t)vm_cnt.v_page_count * PAGE_SIZE; if (kmem_size_val > vm_kmem_size) kmem_size_val = vm_kmem_size; } SYSINIT(kmem_size_init, SI_SUB_KMEM, SI_ORDER_ANY, kmem_size_init, NULL); uint64_t kmem_size(void) { return (kmem_size_val); } static int kmem_std_constructor(void *mem, int size __unused, void *private, int flags) { struct kmem_cache *cache = private; return (cache->kc_constructor(mem, cache->kc_private, flags)); } static void kmem_std_destructor(void *mem, int size __unused, void *private) { struct kmem_cache *cache = private; cache->kc_destructor(mem, cache->kc_private); } kmem_cache_t * kmem_cache_create(char *name, size_t bufsize, size_t align, int (*constructor)(void *, void *, int), void (*destructor)(void *, void *), void (*reclaim)(void *) __unused, void *private, vmem_t *vmp, int cflags) { kmem_cache_t *cache; ASSERT(vmp == NULL); cache = kmem_alloc(sizeof(*cache), KM_SLEEP); strlcpy(cache->kc_name, name, sizeof(cache->kc_name)); cache->kc_constructor = constructor; cache->kc_destructor = destructor; cache->kc_private = private; #if defined(_KERNEL) && !defined(KMEM_DEBUG) cache->kc_zone = uma_zcreate(cache->kc_name, bufsize, constructor != NULL ? kmem_std_constructor : NULL, destructor != NULL ? kmem_std_destructor : NULL, NULL, NULL, align > 0 ? 
align - 1 : 0, cflags); #else cache->kc_size = bufsize; #endif return (cache); } void kmem_cache_destroy(kmem_cache_t *cache) { #if defined(_KERNEL) && !defined(KMEM_DEBUG) uma_zdestroy(cache->kc_zone); #endif kmem_free(cache, sizeof(*cache)); } void * kmem_cache_alloc(kmem_cache_t *cache, int flags) { #if defined(_KERNEL) && !defined(KMEM_DEBUG) return (uma_zalloc_arg(cache->kc_zone, cache, flags)); #else void *p; p = kmem_alloc(cache->kc_size, flags); if (p != NULL && cache->kc_constructor != NULL) kmem_std_constructor(p, cache->kc_size, cache, flags); return (p); #endif } void kmem_cache_free(kmem_cache_t *cache, void *buf) { #if defined(_KERNEL) && !defined(KMEM_DEBUG) uma_zfree_arg(cache->kc_zone, buf, cache); #else if (cache->kc_destructor != NULL) kmem_std_destructor(buf, cache->kc_size, cache); kmem_free(buf, cache->kc_size); #endif } #ifdef _KERNEL void kmem_cache_reap_now(kmem_cache_t *cache) { #ifndef KMEM_DEBUG zone_drain(cache->kc_zone); #endif } void kmem_reap(void) { uma_reclaim(); } #else void kmem_cache_reap_now(kmem_cache_t *cache __unused) { } void kmem_reap(void) { } #endif int kmem_debugging(void) { return (0); } void * calloc(size_t n, size_t s) { return (kmem_zalloc(n * s, KM_NOSLEEP)); } #ifdef KMEM_DEBUG void kmem_show(void *); void kmem_show(void *dummy __unused) { struct kmem_item *i; mtx_lock(&kmem_items_mtx); if (LIST_EMPTY(&kmem_items)) printf("KMEM_DEBUG: No leaked elements.\n"); else { printf("KMEM_DEBUG: Leaked elements:\n\n"); LIST_FOREACH(i, &kmem_items, next) { printf("address=%p\n", i); stack_print_ddb(&i->stack); printf("\n"); } } mtx_unlock(&kmem_items_mtx); } SYSUNINIT(sol_kmem, SI_SUB_CPU, SI_ORDER_FIRST, kmem_show, NULL); #endif /* KMEM_DEBUG */ Index: stable/11/sys/cddl/compat/opensolaris/sys/kmem.h =================================================================== --- stable/11/sys/cddl/compat/opensolaris/sys/kmem.h (revision 331016) +++ stable/11/sys/cddl/compat/opensolaris/sys/kmem.h (revision 331017) @@ -1,89 +1,90 @@ /*- * Copyright (c) 2007 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #ifndef _OPENSOLARIS_SYS_KMEM_H_ #define _OPENSOLARIS_SYS_KMEM_H_ #include #include #include #include +#include #include #include #include MALLOC_DECLARE(M_SOLARIS); #define POINTER_IS_VALID(p) (!((uintptr_t)(p) & 0x3)) #define POINTER_INVALIDATE(pp) (*(pp) = (void *)((uintptr_t)(*(pp)) | 0x1)) #define KM_SLEEP M_WAITOK #define KM_PUSHPAGE M_WAITOK #define KM_NOSLEEP M_NOWAIT #define KM_NODEBUG M_NODUMP #define KM_NORMALPRI 0 #define KMC_NODEBUG UMA_ZONE_NODUMP #define KMC_NOTOUCH 0 typedef struct kmem_cache { char kc_name[32]; #if defined(_KERNEL) && !defined(KMEM_DEBUG) uma_zone_t kc_zone; #else size_t kc_size; #endif int (*kc_constructor)(void *, void *, int); void (*kc_destructor)(void *, void *); void *kc_private; } kmem_cache_t; void *zfs_kmem_alloc(size_t size, int kmflags); void zfs_kmem_free(void *buf, size_t size); uint64_t kmem_size(void); kmem_cache_t *kmem_cache_create(char *name, size_t bufsize, size_t align, int (*constructor)(void *, void *, int), void (*destructor)(void *, void *), void (*reclaim)(void *) __unused, void *private, vmem_t *vmp, int cflags); void kmem_cache_destroy(kmem_cache_t *cache); void *kmem_cache_alloc(kmem_cache_t *cache, int flags); void kmem_cache_free(kmem_cache_t *cache, void *buf); void kmem_cache_reap_now(kmem_cache_t *cache); void kmem_reap(void); int kmem_debugging(void); void *calloc(size_t n, size_t s); #define freemem vm_cnt.v_free_count #define minfree vm_cnt.v_free_min #define heap_arena kmem_arena #define kmem_alloc(size, kmflags) zfs_kmem_alloc((size), (kmflags)) #define kmem_zalloc(size, kmflags) zfs_kmem_alloc((size), (kmflags) | M_ZERO) #define kmem_free(buf, size) zfs_kmem_free((buf), (size)) #define kmem_cache_set_move(cache, movefunc) do { } while (0) #endif /* _OPENSOLARIS_SYS_KMEM_H_ */ Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c =================================================================== --- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c (revision 331016) +++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c (revision 331017) @@ -1,6059 +1,6060 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2017 Nexenta Systems, Inc. 
*/ /* Portions Copyright 2007 Jeremy Teo */ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include /* * Programming rules. * * Each vnode op performs some logical unit of work. To do this, the ZPL must * properly lock its in-core state, create a DMU transaction, do the work, * record this work in the intent log (ZIL), commit the DMU transaction, * and wait for the intent log to commit if it is a synchronous operation. * Moreover, the vnode ops must work in both normal and log replay context. * The ordering of events is important to avoid deadlocks and references * to freed memory. The example below illustrates the following Big Rules: * * (1) A check must be made in each zfs thread for a mounted file system. * This is done avoiding races using ZFS_ENTER(zfsvfs). * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros * can return EIO from the calling function. * * (2) VN_RELE() should always be the last thing except for zil_commit() * (if necessary) and ZFS_EXIT(). This is for 3 reasons: * First, if it's the last reference, the vnode/znode * can be freed, so the zp may point to freed memory. Second, the last * reference will call zfs_zinactive(), which may induce a lot of work -- * pushing cached pages (which acquires range locks) and syncing out * cached atime changes. Third, zfs_zinactive() may require a new tx, * which could deadlock the system if you were already holding one. * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC(). * * (3) All range locks must be grabbed before calling dmu_tx_assign(), * as they can span dmu_tx_assign() calls. * * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to * dmu_tx_assign(). This is critical because we don't want to block * while holding locks. * * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This * reduces lock contention and CPU usage when we must wait (note that if * throughput is constrained by the storage, nearly every transaction * must wait). * * Note, in particular, that if a lock is sometimes acquired before * the tx assigns, and sometimes after (e.g. z_lock), then failing * to use a non-blocking assign can deadlock the system. The scenario: * * Thread A has grabbed a lock before calling dmu_tx_assign(). * Thread B is in an already-assigned tx, and blocks for this lock. * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() * forever, because the previous txg can't quiesce until B's tx commits. * * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, * then drop all locks, call dmu_tx_wait(), and try again. On subsequent * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT, * to indicate that this operation has already called dmu_tx_wait(). * This will ensure that we don't retry forever, waiting a short bit * each time. * * (5) If the operation succeeded, generate the intent log entry for it * before dropping locks. This ensures that the ordering of events * in the intent log matches the order in which they actually occurred. 
* During ZIL replay the zfs_log_* functions will update the sequence * number to indicate the zil transaction has replayed. * * (6) At the end of each vnode op, the DMU tx must always commit, * regardless of whether there were any errors. * * (7) After dropping all locks, invoke zil_commit(zilog, foid) * to ensure that synchronous semantics are provided when necessary. * * In general, this is how things should be ordered in each vnode op: * * ZFS_ENTER(zfsvfs); // exit if unmounted * top: * zfs_dirent_lookup(&dl, ...) // lock directory entry (may VN_HOLD()) * rw_enter(...); // grab any other locks you need * tx = dmu_tx_create(...); // get DMU tx * dmu_tx_hold_*(); // hold each object you might modify * error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); * if (error) { * rw_exit(...); // drop locks * zfs_dirent_unlock(dl); // unlock directory entry * VN_RELE(...); // release held vnodes * if (error == ERESTART) { * waited = B_TRUE; * dmu_tx_wait(tx); * dmu_tx_abort(tx); * goto top; * } * dmu_tx_abort(tx); // abort DMU tx * ZFS_EXIT(zfsvfs); // finished in zfs * return (error); // really out of space * } * error = do_real_work(); // do whatever this VOP does * if (error == 0) * zfs_log_*(...); // on success, make ZIL entry * dmu_tx_commit(tx); // commit DMU tx -- error or not * rw_exit(...); // drop locks * zfs_dirent_unlock(dl); // unlock directory entry * VN_RELE(...); // release held vnodes * zil_commit(zilog, foid); // synchronous when necessary * ZFS_EXIT(zfsvfs); // finished in zfs * return (error); // done, report error */ /* ARGSUSED */ static int zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(*vpp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) && ((flag & FAPPEND) == 0)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && ZTOV(zp)->v_type == VREG && !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) { if (fs_vscan(*vpp, cr, 0) != 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EACCES)); } } /* Keep a count of the synchronous opens in the znode */ if (flag & (FSYNC | FDSYNC)) atomic_inc_32(&zp->z_sync_cnt); ZFS_EXIT(zfsvfs); return (0); } /* ARGSUSED */ static int zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; /* * Clean up any locks held by this process on the vp. */ cleanlocks(vp, ddi_get_pid(), 0); cleanshares(vp, ddi_get_pid()); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); /* Decrement the synchronous opens in the znode */ if ((flag & (FSYNC | FDSYNC)) && (count == 1)) atomic_dec_32(&zp->z_sync_cnt); if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && ZTOV(zp)->v_type == VREG && !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) VERIFY(fs_vscan(vp, cr, 1) == 0); ZFS_EXIT(zfsvfs); return (0); } /* * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter. 
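Rule (4) above is the piece most vnode ops copy verbatim. Condensed to only the assign-and-retry logic, the documented skeleton looks like this; it is a sketch of the pattern in the comment above, not a complete vnode op, and the ZPL lock juggling is elided.

	boolean_t waited = B_FALSE;
	dmu_tx_t *tx;
	int error;
top:
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
	if (error != 0) {
		/* Drop ZPL locks here (elided), then wait and retry. */
		if (error == ERESTART) {
			waited = B_TRUE;
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);	/* really out of space */
		return (error);
	}
	/* ... do the work, zfs_log_*() on success ... */
	dmu_tx_commit(tx);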
*/ static int zfs_holey(vnode_t *vp, u_long cmd, offset_t *off) { znode_t *zp = VTOZ(vp); uint64_t noff = (uint64_t)*off; /* new offset */ uint64_t file_sz; int error; boolean_t hole; file_sz = zp->z_size; if (noff >= file_sz) { return (SET_ERROR(ENXIO)); } if (cmd == _FIO_SEEK_HOLE) hole = B_TRUE; else hole = B_FALSE; error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff); if (error == ESRCH) return (SET_ERROR(ENXIO)); /* * We could find a hole that begins after the logical end-of-file, * because dmu_offset_next() only works on whole blocks. If the * EOF falls mid-block, then indicate that the "virtual hole" * at the end of the file begins at the logical EOF, rather than * at the end of the last block. */ if (noff > file_sz) { ASSERT(hole); noff = file_sz; } if (noff < *off) return (error); *off = noff; return (error); } /* ARGSUSED */ static int zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred, int *rvalp, caller_context_t *ct) { offset_t off; offset_t ndata; dmu_object_info_t doi; int error; zfsvfs_t *zfsvfs; znode_t *zp; switch (com) { case _FIOFFS: { return (0); /* * The following two ioctls are used by bfu. Faking out, * necessary to avoid bfu errors. */ } case _FIOGDIO: case _FIOSDIO: { return (0); } case _FIO_SEEK_DATA: case _FIO_SEEK_HOLE: { #ifdef illumos if (ddi_copyin((void *)data, &off, sizeof (off), flag)) return (SET_ERROR(EFAULT)); #else off = *(offset_t *)data; #endif zp = VTOZ(vp); zfsvfs = zp->z_zfsvfs; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); /* offset parameter is in/out */ error = zfs_holey(vp, com, &off); ZFS_EXIT(zfsvfs); if (error) return (error); #ifdef illumos if (ddi_copyout(&off, (void *)data, sizeof (off), flag)) return (SET_ERROR(EFAULT)); #else *(offset_t *)data = off; #endif return (0); } #ifdef illumos case _FIO_COUNT_FILLED: { /* * _FIO_COUNT_FILLED adds a new ioctl command which * exposes the number of filled blocks in a * ZFS object. */ zp = VTOZ(vp); zfsvfs = zp->z_zfsvfs; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); /* * Wait for all dirty blocks for this object * to get synced out to disk, and the DMU info * updated. */ error = dmu_object_wait_synced(zfsvfs->z_os, zp->z_id); if (error) { ZFS_EXIT(zfsvfs); return (error); } /* * Retrieve fill count from DMU object. */ error = dmu_object_info(zfsvfs->z_os, zp->z_id, &doi); if (error) { ZFS_EXIT(zfsvfs); return (error); } ndata = doi.doi_fill_count; ZFS_EXIT(zfsvfs); if (ddi_copyout(&ndata, (void *)data, sizeof (ndata), flag)) return (SET_ERROR(EFAULT)); return (0); } #endif } return (SET_ERROR(ENOTTY)); } static vm_page_t page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes) { vm_object_t obj; vm_page_t pp; int64_t end; /* * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE * aligned boundaries, if the range is not aligned. As a result a * DEV_BSIZE subrange with partially dirty data may get marked as clean. * It may happen that all DEV_BSIZE subranges are marked clean and thus * the whole page would be considered clean despite having some dirty data. * For this reason we should shrink the range to DEV_BSIZE aligned * boundaries before calling vm_page_clear_dirty. */ end = rounddown2(off + nbytes, DEV_BSIZE); off = roundup2(off, DEV_BSIZE); nbytes = end - off; obj = vp->v_object; zfs_vmobject_assert_wlocked(obj); for (;;) { if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL && pp->valid) { if (vm_page_xbusied(pp)) { /* * Reference the page before unlocking and * sleeping so that the page daemon is less * likely to reclaim it.
*/ vm_page_reference(pp); vm_page_lock(pp); zfs_vmobject_wunlock(obj); vm_page_busy_sleep(pp, "zfsmwb", true); zfs_vmobject_wlock(obj); continue; } vm_page_sbusy(pp); } else if (pp != NULL) { ASSERT(!pp->valid); pp = NULL; } if (pp != NULL) { ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); vm_object_pip_add(obj, 1); pmap_remove_write(pp); if (nbytes != 0) vm_page_clear_dirty(pp, off, nbytes); } break; } return (pp); } static void page_unbusy(vm_page_t pp) { vm_page_sunbusy(pp); vm_object_pip_subtract(pp->object, 1); } static vm_page_t page_hold(vnode_t *vp, int64_t start) { vm_object_t obj; vm_page_t pp; obj = vp->v_object; zfs_vmobject_assert_wlocked(obj); for (;;) { if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL && pp->valid) { if (vm_page_xbusied(pp)) { /* * Reference the page before unlocking and * sleeping so that the page daemon is less * likely to reclaim it. */ vm_page_reference(pp); vm_page_lock(pp); zfs_vmobject_wunlock(obj); vm_page_busy_sleep(pp, "zfsmwb", true); zfs_vmobject_wlock(obj); continue; } ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); vm_page_lock(pp); vm_page_hold(pp); vm_page_unlock(pp); } else pp = NULL; break; } return (pp); } static void page_unhold(vm_page_t pp) { vm_page_lock(pp); vm_page_unhold(pp); vm_page_unlock(pp); } /* * When a file is memory mapped, we must keep the IO data synchronized * between the DMU cache and the memory mapped pages. What this means: * * On Write: If we find a memory mapped page, we write to *both* * the page and the dmu buffer. */ static void update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid, int segflg, dmu_tx_t *tx) { vm_object_t obj; struct sf_buf *sf; caddr_t va; int off; ASSERT(segflg != UIO_NOCOPY); ASSERT(vp->v_mount != NULL); obj = vp->v_object; ASSERT(obj != NULL); off = start & PAGEOFFSET; zfs_vmobject_wlock(obj); for (start &= PAGEMASK; len > 0; start += PAGESIZE) { vm_page_t pp; int nbytes = imin(PAGESIZE - off, len); if ((pp = page_busy(vp, start, off, nbytes)) != NULL) { zfs_vmobject_wunlock(obj); va = zfs_map_page(pp, &sf); (void) dmu_read(os, oid, start+off, nbytes, va+off, DMU_READ_PREFETCH); zfs_unmap_page(sf); zfs_vmobject_wlock(obj); page_unbusy(pp); } len -= nbytes; off = 0; } vm_object_pip_wakeupn(obj, 0); zfs_vmobject_wunlock(obj); } /* * Read with UIO_NOCOPY flag means that sendfile(2) requests * ZFS to populate a range of page cache pages with data. * * NOTE: this function could be optimized to pre-allocate * all pages in advance, drain exclusive busy on all of them, * map them into contiguous KVA region and populate them * in one single dmu_read() call.
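update_pages() above and mappedread_sf() below share one mechanism: temporarily map the vm_page through an sf_buf, move the bytes, then unmap. Stripped of locking and error handling, the core sequence is the following sketch, assuming pp is a busied page belonging to this vnode's object.

	struct sf_buf *sf;
	caddr_t va;
	int error;

	/* Map the page into kernel VA for the duration of the copy. */
	va = zfs_map_page(pp, &sf);
	error = dmu_read(os, zp->z_id, start, bytes, va, DMU_READ_PREFETCH);
	zfs_unmap_page(sf);	/* drop the temporary mapping */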
*/ static int mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio) { znode_t *zp = VTOZ(vp); objset_t *os = zp->z_zfsvfs->z_os; struct sf_buf *sf; vm_object_t obj; vm_page_t pp; int64_t start; caddr_t va; int len = nbytes; int off; int error = 0; ASSERT(uio->uio_segflg == UIO_NOCOPY); ASSERT(vp->v_mount != NULL); obj = vp->v_object; ASSERT(obj != NULL); ASSERT((uio->uio_loffset & PAGEOFFSET) == 0); zfs_vmobject_wlock(obj); for (start = uio->uio_loffset; len > 0; start += PAGESIZE) { int bytes = MIN(PAGESIZE, len); pp = vm_page_grab(obj, OFF_TO_IDX(start), VM_ALLOC_SBUSY | VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY); if (pp->valid == 0) { zfs_vmobject_wunlock(obj); va = zfs_map_page(pp, &sf); error = dmu_read(os, zp->z_id, start, bytes, va, DMU_READ_PREFETCH); if (bytes != PAGESIZE && error == 0) bzero(va + bytes, PAGESIZE - bytes); zfs_unmap_page(sf); zfs_vmobject_wlock(obj); vm_page_sunbusy(pp); vm_page_lock(pp); if (error) { if (pp->wire_count == 0 && pp->valid == 0 && !vm_page_busied(pp)) vm_page_free(pp); } else { pp->valid = VM_PAGE_BITS_ALL; vm_page_activate(pp); } vm_page_unlock(pp); } else { ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); vm_page_sunbusy(pp); } if (error) break; uio->uio_resid -= bytes; uio->uio_offset += bytes; len -= bytes; } zfs_vmobject_wunlock(obj); return (error); } /* * When a file is memory mapped, we must keep the IO data synchronized * between the DMU cache and the memory mapped pages. What this means: * * On Read: We "read" preferentially from memory mapped pages, * else we default from the dmu buffer. * * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when * the file is memory mapped. */ static int mappedread(vnode_t *vp, int nbytes, uio_t *uio) { znode_t *zp = VTOZ(vp); vm_object_t obj; int64_t start; caddr_t va; int len = nbytes; int off; int error = 0; ASSERT(vp->v_mount != NULL); obj = vp->v_object; ASSERT(obj != NULL); start = uio->uio_loffset; off = start & PAGEOFFSET; zfs_vmobject_wlock(obj); for (start &= PAGEMASK; len > 0; start += PAGESIZE) { vm_page_t pp; uint64_t bytes = MIN(PAGESIZE - off, len); if (pp = page_hold(vp, start)) { struct sf_buf *sf; caddr_t va; zfs_vmobject_wunlock(obj); va = zfs_map_page(pp, &sf); #ifdef illumos error = uiomove(va + off, bytes, UIO_READ, uio); #else error = vn_io_fault_uiomove(va + off, bytes, uio); #endif zfs_unmap_page(sf); zfs_vmobject_wlock(obj); page_unhold(pp); } else { zfs_vmobject_wunlock(obj); error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, bytes); zfs_vmobject_wlock(obj); } len -= bytes; off = 0; if (error) break; } zfs_vmobject_wunlock(obj); return (error); } offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */ /* * Read bytes from specified file into supplied buffer. * * IN: vp - vnode of file to be read from. * uio - structure supplying read location, range info, * and return buffer. * ioflag - SYNC flags; used to provide FRSYNC semantics. * cr - credentials of caller. * ct - caller context * * OUT: uio - updated offset and range, buffer filled. * * RETURN: 0 on success, error code on failure. 
* * Side Effects: * vp - atime updated if byte count > 0 */ /* ARGSUSED */ static int zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; ssize_t n, nbytes; int error = 0; rl_t *rl; xuio_t *xuio = NULL; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if (zp->z_pflags & ZFS_AV_QUARANTINED) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EACCES)); } /* * Validate file offset */ if (uio->uio_loffset < (offset_t)0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * Fasttrack empty reads */ if (uio->uio_resid == 0) { ZFS_EXIT(zfsvfs); return (0); } /* * Check for mandatory locks */ if (MANDMODE(zp->z_mode)) { if (error = chklock(vp, FREAD, uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) { ZFS_EXIT(zfsvfs); return (error); } } /* * If we're in FRSYNC mode, sync out this znode before reading it. */ if (zfsvfs->z_log && (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)) zil_commit(zfsvfs->z_log, zp->z_id); /* * Lock the range against changes. */ rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER); /* * If we are reading past end-of-file we can skip * to the end; but we might still need to set atime. */ if (uio->uio_loffset >= zp->z_size) { error = 0; goto out; } ASSERT(uio->uio_loffset < zp->z_size); n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset); #ifdef illumos if ((uio->uio_extflg == UIO_XUIO) && (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) { int nblk; int blksz = zp->z_blksz; uint64_t offset = uio->uio_loffset; xuio = (xuio_t *)uio; if ((ISP2(blksz))) { nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset, blksz)) / blksz; } else { ASSERT(offset + n <= blksz); nblk = 1; } (void) dmu_xuio_init(xuio, nblk); if (vn_has_cached_data(vp)) { /* * For simplicity, we always allocate a full buffer * even if we only expect to read a portion of a block. */ while (--nblk >= 0) { (void) dmu_xuio_add(xuio, dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), blksz), 0, blksz); } } } #endif /* illumos */ while (n > 0) { nbytes = MIN(n, zfs_read_chunk_size - P2PHASE(uio->uio_loffset, zfs_read_chunk_size)); #ifdef __FreeBSD__ if (uio->uio_segflg == UIO_NOCOPY) error = mappedread_sf(vp, nbytes, uio); else #endif /* __FreeBSD__ */ if (vn_has_cached_data(vp)) { error = mappedread(vp, nbytes, uio); } else { error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, nbytes); } if (error) { /* convert checksum errors into IO errors */ if (error == ECKSUM) error = SET_ERROR(EIO); break; } n -= nbytes; } out: zfs_range_unlock(rl); ZFS_ACCESSTIME_STAMP(zfsvfs, zp); ZFS_EXIT(zfsvfs); return (error); } /* * Write the bytes to a file. * * IN: vp - vnode of file to be written to. * uio - structure supplying write location, range info, * and data buffer. * ioflag - FAPPEND, FSYNC, and/or FDSYNC. FAPPEND is * set if in append mode. * cr - credentials of caller. * ct - caller context (NFS/CIFS fem monitor only) * * OUT: uio - updated offset and range. * * RETURN: 0 on success, error code on failure. 
* * Timestamps: * vp - ctime|mtime updated if byte count > 0 */ /* ARGSUSED */ static int zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); rlim64_t limit = MAXOFFSET_T; ssize_t start_resid = uio->uio_resid; ssize_t tx_bytes; uint64_t end_size; dmu_tx_t *tx; zfsvfs_t *zfsvfs = zp->z_zfsvfs; zilog_t *zilog; offset_t woff; ssize_t n, nbytes; rl_t *rl; int max_blksz = zfsvfs->z_max_blksz; int error = 0; arc_buf_t *abuf; iovec_t *aiov = NULL; xuio_t *xuio = NULL; int i_iov = 0; int iovcnt = uio->uio_iovcnt; iovec_t *iovp = uio->uio_iov; int write_eof; int count = 0; sa_bulk_attr_t bulk[4]; uint64_t mtime[2], ctime[2]; /* * Fasttrack empty write */ n = start_resid; if (n == 0) return (0); if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) limit = MAXOFFSET_T; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &zp->z_size, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); /* * In a case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots) our * callers might not be able to detect properly that we are read-only, * so check it explicitly here. */ if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EROFS)); } /* * If immutable or not appending then return EPERM. * Intentionally allow ZFS_READONLY through here. * See zfs_zaccess_common() */ if ((zp->z_pflags & ZFS_IMMUTABLE) || ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) && (uio->uio_loffset < zp->z_size))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } zilog = zfsvfs->z_log; /* * Validate file offset */ woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset; if (woff < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * Check for mandatory locks before calling zfs_range_lock() * in order to prevent a deadlock with locks set via fcntl(). */ if (MANDMODE((mode_t)zp->z_mode) && (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) { ZFS_EXIT(zfsvfs); return (error); } #ifdef illumos /* * Pre-fault the pages to ensure slow (eg NFS) pages * don't hold up txg. * Skip this if uio contains loaned arc_buf. */ if ((uio->uio_extflg == UIO_XUIO) && (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) xuio = (xuio_t *)uio; else uio_prefaultpages(MIN(n, max_blksz), uio); #endif /* * If in append mode, set the io offset pointer to eof. */ if (ioflag & FAPPEND) { /* * Obtain an appending range lock to guarantee file append * semantics. We reset the write offset once we have the lock. */ rl = zfs_range_lock(zp, 0, n, RL_APPEND); woff = rl->r_off; if (rl->r_len == UINT64_MAX) { /* * We overlocked the file because this write will cause * the file block size to increase. * Note that zp_size cannot change with this lock held. */ woff = zp->z_size; } uio->uio_loffset = woff; } else { /* * Note that if the file block size will change as a result of * this write, then this range lock will lock the entire file * so that we can re-write the block safely. */ rl = zfs_range_lock(zp, woff, n, RL_WRITER); } if (vn_rlimit_fsize(vp, uio, uio->uio_td)) { zfs_range_unlock(rl); ZFS_EXIT(zfsvfs); return (EFBIG); } if (woff >= limit) { zfs_range_unlock(rl); ZFS_EXIT(zfsvfs); return (SET_ERROR(EFBIG)); } if ((woff + n) > limit || woff > (limit - n)) n = limit - woff; /* Will this write extend the file length? 
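The append path in the function below works only because the range lock is taken before the final write offset is chosen. Reduced to just that ordering, the logic amounts to the following sketch of the code in zfs_write().

	if (ioflag & FAPPEND) {
		/* Appending lock; the real offset is known only now. */
		rl = zfs_range_lock(zp, 0, n, RL_APPEND);
		woff = rl->r_off;
		if (rl->r_len == UINT64_MAX)
			woff = zp->z_size;	/* whole file was locked */
		uio->uio_loffset = woff;
	} else {
		/* Plain write: lock exactly the range being overwritten. */
		rl = zfs_range_lock(zp, woff, n, RL_WRITER);
	}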
*/ write_eof = (woff + n > zp->z_size); end_size = MAX(zp->z_size, woff + n); /* * Write the file in reasonable size chunks. Each chunk is written * in a separate transaction; this keeps the intent log records small * and allows us to do more fine-grained space accounting. */ while (n > 0) { abuf = NULL; woff = uio->uio_loffset; if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) || zfs_owner_overquota(zfsvfs, zp, B_TRUE)) { if (abuf != NULL) dmu_return_arcbuf(abuf); error = SET_ERROR(EDQUOT); break; } if (xuio && abuf == NULL) { ASSERT(i_iov < iovcnt); aiov = &iovp[i_iov]; abuf = dmu_xuio_arcbuf(xuio, i_iov); dmu_xuio_clear(xuio, i_iov); DTRACE_PROBE3(zfs_cp_write, int, i_iov, iovec_t *, aiov, arc_buf_t *, abuf); ASSERT((aiov->iov_base == abuf->b_data) || ((char *)aiov->iov_base - (char *)abuf->b_data + aiov->iov_len == arc_buf_size(abuf))); i_iov++; } else if (abuf == NULL && n >= max_blksz && woff >= zp->z_size && P2PHASE(woff, max_blksz) == 0 && zp->z_blksz == max_blksz) { /* * This write covers a full block. "Borrow" a buffer * from the dmu so that we can fill it before we enter * a transaction. This avoids the possibility of * holding up the transaction if the data copy hangs * up on a pagefault (e.g., from an NFS server mapping). */ size_t cbytes; abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), max_blksz); ASSERT(abuf != NULL); ASSERT(arc_buf_size(abuf) == max_blksz); if (error = uiocopy(abuf->b_data, max_blksz, UIO_WRITE, uio, &cbytes)) { dmu_return_arcbuf(abuf); break; } ASSERT(cbytes == max_blksz); } /* * Start a transaction. */ tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz)); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); if (abuf != NULL) dmu_return_arcbuf(abuf); break; } /* * If zfs_range_lock() over-locked we grow the blocksize * and then reduce the lock range. This will only happen * on the first iteration since zfs_range_reduce() will * shrink down r_len to the appropriate size. */ if (rl->r_len == UINT64_MAX) { uint64_t new_blksz; if (zp->z_blksz > max_blksz) { /* * File's blocksize is already larger than the * "recordsize" property. Only let it grow to * the next power of 2. */ ASSERT(!ISP2(zp->z_blksz)); new_blksz = MIN(end_size, 1 << highbit64(zp->z_blksz)); } else { new_blksz = MIN(end_size, max_blksz); } zfs_grow_blocksize(zp, new_blksz, tx); zfs_range_reduce(rl, woff, n); } /* * XXX - should we really limit each write to z_max_blksz? * Perhaps we should use SPA_MAXBLOCKSIZE chunks? */ nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); if (woff + nbytes > zp->z_size) vnode_pager_setsize(vp, woff + nbytes); if (abuf == NULL) { tx_bytes = uio->uio_resid; error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, nbytes, tx); tx_bytes -= uio->uio_resid; } else { tx_bytes = nbytes; ASSERT(xuio == NULL || tx_bytes == aiov->iov_len); /* * If this is not a full block write, but we are * extending the file past EOF and this data starts * block-aligned, use assign_arcbuf(). Otherwise, * write via dmu_write(). 
*/ if (tx_bytes < max_blksz && (!write_eof || aiov->iov_base != abuf->b_data)) { ASSERT(xuio); dmu_write(zfsvfs->z_os, zp->z_id, woff, aiov->iov_len, aiov->iov_base, tx); dmu_return_arcbuf(abuf); xuio_stat_wbuf_copied(); } else { ASSERT(xuio || tx_bytes == max_blksz); dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl), woff, abuf, tx); } ASSERT(tx_bytes <= uio->uio_resid); uioskip(uio, tx_bytes); } if (tx_bytes && vn_has_cached_data(vp)) { update_pages(vp, woff, tx_bytes, zfsvfs->z_os, zp->z_id, uio->uio_segflg, tx); } /* * If we made no progress, we're done. If we made even * partial progress, update the znode and ZIL accordingly. */ if (tx_bytes == 0) { (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), (void *)&zp->z_size, sizeof (uint64_t), tx); dmu_tx_commit(tx); ASSERT(error != 0); break; } /* * Clear Set-UID/Set-GID bits on successful write if not * privileged and at least one of the execute bits is set. * * It would be nice to do this after all writes have * been done, but that would still expose the ISUID/ISGID * to another app after the partial write is committed. * * Note: we don't call zfs_fuid_map_id() here because * user 0 is not an ephemeral uid. */ mutex_enter(&zp->z_acl_lock); if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | (S_IXUSR >> 6))) != 0 && (zp->z_mode & (S_ISUID | S_ISGID)) != 0 && secpolicy_vnode_setid_retain(vp, cr, (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) { uint64_t newmode; zp->z_mode &= ~(S_ISUID | S_ISGID); newmode = zp->z_mode; (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), (void *)&newmode, sizeof (uint64_t), tx); } mutex_exit(&zp->z_acl_lock); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); /* * Update the file size (zp_size) if it has changed; * account for possible concurrent updates. */ while ((end_size = zp->z_size) < uio->uio_loffset) { (void) atomic_cas_64(&zp->z_size, end_size, uio->uio_loffset); #ifdef illumos ASSERT(error == 0); #else ASSERT(error == 0 || error == EFAULT); #endif } /* * If we are replaying and eof is non zero then force * the file size to the specified eof. Note, there's no * concurrency during replay. */ if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0) zp->z_size = zfsvfs->z_replay_eof; if (error == 0) error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); else (void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag); dmu_tx_commit(tx); if (error != 0) break; ASSERT(tx_bytes == nbytes); n -= nbytes; #ifdef illumos if (!xuio && n > 0) uio_prefaultpages(MIN(n, max_blksz), uio); #endif } zfs_range_unlock(rl); /* * If we're in replay mode, or we made no progress, return error. * Otherwise, it's at least a partial write, so it's successful. */ if (zfsvfs->z_replay || uio->uio_resid == start_resid) { ZFS_EXIT(zfsvfs); return (error); } #ifdef __FreeBSD__ /* * EFAULT means that at least one page of the source buffer was not * available. VFS will re-try remaining I/O upon this error. */ if (error == EFAULT) { ZFS_EXIT(zfsvfs); return (error); } #endif if (ioflag & (FSYNC | FDSYNC) || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, zp->z_id); ZFS_EXIT(zfsvfs); return (0); } void zfs_get_done(zgd_t *zgd, int error) { znode_t *zp = zgd->zgd_private; objset_t *os = zp->z_zfsvfs->z_os; if (zgd->zgd_db) dmu_buf_rele(zgd->zgd_db, zgd); zfs_range_unlock(zgd->zgd_rl); /* * Release the vnode asynchronously as we currently have the * txg stopped from syncing.
*/ VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os))); if (error == 0 && zgd->zgd_bp) zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp); kmem_free(zgd, sizeof (zgd_t)); } #ifdef DEBUG static int zil_fault_io = 0; #endif /* * Get data to generate a TX_WRITE intent log record. */ int zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) { zfsvfs_t *zfsvfs = arg; objset_t *os = zfsvfs->z_os; znode_t *zp; uint64_t object = lr->lr_foid; uint64_t offset = lr->lr_offset; uint64_t size = lr->lr_length; dmu_buf_t *db; zgd_t *zgd; int error = 0; ASSERT3P(lwb, !=, NULL); ASSERT3P(zio, !=, NULL); ASSERT3U(size, !=, 0); /* * Nothing to do if the file has been removed */ if (zfs_zget(zfsvfs, object, &zp) != 0) return (SET_ERROR(ENOENT)); if (zp->z_unlinked) { /* * Release the vnode asynchronously as we currently have the * txg stopped from syncing. */ VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os))); return (SET_ERROR(ENOENT)); } zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP); zgd->zgd_lwb = lwb; zgd->zgd_private = zp; /* * Write records come in two flavors: immediate and indirect. * For small writes it's cheaper to store the data with the * log record (immediate); for large writes it's cheaper to * sync the data and get a pointer to it (indirect) so that * we don't have to write the data twice. */ if (buf != NULL) { /* immediate write */ zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER); /* test for truncation needs to be done while range locked */ if (offset >= zp->z_size) { error = SET_ERROR(ENOENT); } else { error = dmu_read(os, object, offset, size, buf, DMU_READ_NO_PREFETCH); } ASSERT(error == 0 || error == ENOENT); } else { /* indirect write */ /* * Have to lock the whole block to ensure when it's * written out and its checksum is being calculated * that no one can change the data. We need to re-check * blocksize after we get the lock in case it's changed! */ for (;;) { uint64_t blkoff; size = zp->z_blksz; blkoff = ISP2(size) ? P2PHASE(offset, size) : offset; offset -= blkoff; zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER); if (zp->z_blksz == size) break; offset += blkoff; zfs_range_unlock(zgd->zgd_rl); } /* test for truncation needs to be done while range locked */ if (lr->lr_offset >= zp->z_size) error = SET_ERROR(ENOENT); #ifdef DEBUG if (zil_fault_io) { error = SET_ERROR(EIO); zil_fault_io = 0; } #endif if (error == 0) error = dmu_buf_hold(os, object, offset, zgd, &db, DMU_READ_NO_PREFETCH); if (error == 0) { blkptr_t *bp = &lr->lr_blkptr; zgd->zgd_db = db; zgd->zgd_bp = bp; ASSERT(db->db_offset == offset); ASSERT(db->db_size == size); error = dmu_sync(zio, lr->lr_common.lrc_txg, zfs_get_done, zgd); ASSERT(error || lr->lr_length <= size); /* * On success, we need to wait for the write I/O * initiated by dmu_sync() to complete before we can * release this dbuf. We will finish everything up * in the zfs_get_done() callback. 
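zfs_get_data() above separates the two flavors purely by whether a copy buffer was supplied by the ZIL. Reduced to its two branches, with the block-size range-lock loop elided, the split is the following sketch of the code above.

	if (buf != NULL) {
		/* Immediate: copy the file data into the log record. */
		zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
		error = dmu_read(os, object, offset, size, buf,
		    DMU_READ_NO_PREFETCH);
	} else {
		/* Indirect: sync the block out and log a pointer to it. */
		error = dmu_buf_hold(os, object, offset, zgd, &db,
		    DMU_READ_NO_PREFETCH);
		if (error == 0)
			error = dmu_sync(zio, lr->lr_common.lrc_txg,
			    zfs_get_done, zgd);
	}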
*/ if (error == 0) return (0); if (error == EALREADY) { lr->lr_common.lrc_txtype = TX_WRITE2; error = 0; } } } zfs_get_done(zgd, error); return (error); } /*ARGSUSED*/ static int zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if (flag & V_ACE_MASK) error = zfs_zaccess(zp, mode, flag, B_FALSE, cr); else error = zfs_zaccess_rwx(zp, mode, flag, cr); ZFS_EXIT(zfsvfs); return (error); } static int zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp) { int error; *vpp = arg; error = vn_lock(*vpp, lkflags); if (error != 0) vrele(*vpp); return (error); } static int zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags) { znode_t *zdp = VTOZ(dvp); zfsvfs_t *zfsvfs = zdp->z_zfsvfs; int error; int ltype; ASSERT_VOP_LOCKED(dvp, __func__); #ifdef DIAGNOSTIC if ((zdp->z_pflags & ZFS_XATTR) == 0) VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock)); #endif if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) { ASSERT3P(dvp, ==, vp); vref(dvp); ltype = lkflags & LK_TYPE_MASK; if (ltype != VOP_ISLOCKED(dvp)) { if (ltype == LK_EXCLUSIVE) vn_lock(dvp, LK_UPGRADE | LK_RETRY); else /* if (ltype == LK_SHARED) */ vn_lock(dvp, LK_DOWNGRADE | LK_RETRY); /* * Relock for the "." case could leave us with * reclaimed vnode. */ if (dvp->v_iflag & VI_DOOMED) { vrele(dvp); return (SET_ERROR(ENOENT)); } } return (0); } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) { /* * Note that in this case, dvp is the child vnode, and we * are looking up the parent vnode - exactly reverse from * normal operation. Unlocking dvp requires some rather * tricky unlock/relock dance to prevent mp from being freed; * use vn_vget_ino_gen() which takes care of all that. * * XXX Note that there is a time window when both vnodes are * unlocked. It is possible, although highly unlikely, that * during that window the parent-child relationship between * the vnodes may change, for example, get reversed. * In that case we would have a wrong lock order for the vnodes. * All other filesystems seem to ignore this problem, so we * do the same here. * A potential solution could be implemented as follows: * - using LK_NOWAIT when locking the second vnode and retrying * if necessary * - checking that the parent-child relationship still holds * after locking both vnodes and retrying if it doesn't */ error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp); return (error); } else { error = vn_lock(vp, lkflags); if (error != 0) vrele(vp); return (error); } } /* * Lookup an entry in a directory, or an extended attribute directory. * If it exists, return a held vnode reference for it. * * IN: dvp - vnode of directory to search. * nm - name of entry to lookup. * pnp - full pathname to lookup [UNUSED]. * flags - LOOKUP_XATTR set if looking for an attribute. * rdir - root directory vnode [UNUSED]. * cr - credentials of caller. * ct - caller context * * OUT: vpp - vnode of located entry, NULL if not found. * * RETURN: 0 on success, error code on failure. * * Timestamps: * NA */ /* ARGSUSED */ static int zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp, int nameiop, cred_t *cr, kthread_t *td, int flags) { znode_t *zdp = VTOZ(dvp); znode_t *zp; zfsvfs_t *zfsvfs = zdp->z_zfsvfs; int error = 0; /* * Fast path lookup, however we must skip DNLC lookup * for case folding or normalizing lookups because the * DNLC code only stores the passed in name. 
This means * creating 'a' and removing 'A' on a case insensitive * file system would work, but DNLC still thinks 'a' * exists and won't let you create it again on the next * pass through fast path. */ if (!(flags & LOOKUP_XATTR)) { if (dvp->v_type != VDIR) { return (SET_ERROR(ENOTDIR)); } else if (zdp->z_sa_hdl == NULL) { return (SET_ERROR(EIO)); } } DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zdp); *vpp = NULL; if (flags & LOOKUP_XATTR) { #ifdef TODO /* * If the xattr property is off, refuse the lookup request. */ if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } #endif /* * We don't allow recursive attributes. * Maybe someday we will. */ if (zdp->z_pflags & ZFS_XATTR) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) { ZFS_EXIT(zfsvfs); return (error); } /* * Do we have permission to get into attribute directory? */ if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0, B_FALSE, cr)) { vrele(*vpp); *vpp = NULL; } ZFS_EXIT(zfsvfs); return (error); } /* * Check accessibility of directory. */ if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) { ZFS_EXIT(zfsvfs); return (error); } if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } /* * First handle the special cases. */ if ((cnp->cn_flags & ISDOTDOT) != 0) { /* * If we are a snapshot mounted under .zfs, return * the vp for the snapshot directory. */ if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) { struct componentname cn; vnode_t *zfsctl_vp; int ltype; ZFS_EXIT(zfsvfs); ltype = VOP_ISLOCKED(dvp); VOP_UNLOCK(dvp, 0); error = zfsctl_root(zfsvfs->z_parent, LK_SHARED, &zfsctl_vp); if (error == 0) { cn.cn_nameptr = "snapshot"; cn.cn_namelen = strlen(cn.cn_nameptr); cn.cn_nameiop = cnp->cn_nameiop; cn.cn_flags = cnp->cn_flags & ~ISDOTDOT; cn.cn_lkflags = cnp->cn_lkflags; error = VOP_LOOKUP(zfsctl_vp, vpp, &cn); vput(zfsctl_vp); } vn_lock(dvp, ltype | LK_RETRY); return (error); } } if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) { ZFS_EXIT(zfsvfs); if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP) return (SET_ERROR(ENOTSUP)); error = zfsctl_root(zfsvfs, cnp->cn_lkflags, vpp); return (error); } /* * The loop retries the lookup if the parent-child relationship * changes during the dot-dot locking complexities. */ for (;;) { uint64_t parent; error = zfs_dirlook(zdp, nm, &zp); if (error == 0) *vpp = ZTOV(zp); ZFS_EXIT(zfsvfs); if (error != 0) break; error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags); if (error != 0) { /* * If we've got a locking error, then the vnode * got reclaimed because of a force unmount. * We never enter doomed vnodes into the name cache. */ *vpp = NULL; return (error); } if ((cnp->cn_flags & ISDOTDOT) == 0) break; ZFS_ENTER(zfsvfs); if (zdp->z_sa_hdl == NULL) { error = SET_ERROR(EIO); } else { error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent)); } if (error != 0) { ZFS_EXIT(zfsvfs); vput(ZTOV(zp)); break; } if (zp->z_id == parent) { ZFS_EXIT(zfsvfs); break; } vput(ZTOV(zp)); } out: if (error != 0) *vpp = NULL; /* Translate errors and add SAVENAME when needed.
*/ if (cnp->cn_flags & ISLASTCN) { switch (nameiop) { case CREATE: case RENAME: if (error == ENOENT) { error = EJUSTRETURN; cnp->cn_flags |= SAVENAME; break; } /* FALLTHROUGH */ case DELETE: if (error == 0) cnp->cn_flags |= SAVENAME; break; } } /* Insert name into cache (as non-existent) if appropriate. */ if (zfsvfs->z_use_namecache && error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0) cache_enter(dvp, NULL, cnp); /* Insert name into cache if appropriate. */ if (zfsvfs->z_use_namecache && error == 0 && (cnp->cn_flags & MAKEENTRY)) { if (!(cnp->cn_flags & ISLASTCN) || (nameiop != DELETE && nameiop != RENAME)) { cache_enter(dvp, *vpp, cnp); } } return (error); } /* * Attempt to create a new entry in a directory. If the entry * already exists, truncate the file if permissible, else return * an error. Return the vp of the created or trunc'd file. * * IN: dvp - vnode of directory to put new file entry in. * name - name of new file entry. * vap - attributes of new file. * excl - flag indicating exclusive or non-exclusive mode. * mode - mode to open file with. * cr - credentials of caller. * flag - large file flag [UNUSED]. * ct - caller context * vsecp - ACL to be set * * OUT: vpp - vnode of created or trunc'd entry. * * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime updated if new entry created * vp - ctime|mtime always, atime if new */ /* ARGSUSED */ static int zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode, vnode_t **vpp, cred_t *cr, kthread_t *td) { znode_t *zp, *dzp = VTOZ(dvp); zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; objset_t *os; dmu_tx_t *tx; int error; ksid_t *ksid; uid_t uid; gid_t gid = crgetgid(cr); zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; void *vsecp = NULL; int flag = 0; uint64_t txtype; /* * If we have an ephemeral id, ACL, or XVATTR then * make sure file system is at proper version */ ksid = crgetsid(cr, KSID_OWNER); if (ksid) uid = ksid_getid(ksid); else uid = crgetuid(cr); if (zfsvfs->z_use_fuids == B_FALSE && (vsecp || (vap->va_mask & AT_XVATTR) || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); os = zfsvfs->z_os; zilog = zfsvfs->z_log; if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } if (vap->va_mask & AT_XVATTR) { if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap, crgetuid(cr), cr, vap->va_type)) != 0) { ZFS_EXIT(zfsvfs); return (error); } } *vpp = NULL; if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr)) vap->va_mode &= ~S_ISVTX; error = zfs_dirent_lookup(dzp, name, &zp, ZNEW); if (error) { ZFS_EXIT(zfsvfs); return (error); } ASSERT3P(zp, ==, NULL); /* * Create a new file object and update the directory * to reference it. */ if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { goto out; } /* * We only support the creation of regular files in * extended attribute directories. 
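 * For example, an attempt to create a device node or a subdirectory
 * inside a file's extended attribute directory fails with EINVAL.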
*/ if ((dzp->z_pflags & ZFS_XATTR) && (vap->va_type != VREG)) { error = SET_ERROR(EINVAL); goto out; } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, &acl_ids)) != 0) goto out; if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { zfs_acl_ids_free(&acl_ids); error = SET_ERROR(EDQUOT); goto out; } getnewvnode_reserve(1); tx = dmu_tx_create(os); dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } error = dmu_tx_assign(tx, TXG_WAIT); if (error) { zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); getnewvnode_drop_reserve(); ZFS_EXIT(zfsvfs); return (error); } zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); (void) zfs_link_create(dzp, name, zp, tx, ZNEW); txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); zfs_log_create(zilog, tx, txtype, dzp, zp, name, vsecp, acl_ids.z_fuidp, vap); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); getnewvnode_drop_reserve(); out: if (error == 0) { *vpp = ZTOV(zp); } if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* * Remove an entry from a directory. * * IN: dvp - vnode of directory to remove entry from. * name - name of entry to remove. * cr - credentials of caller. * ct - caller context * flags - case flags * * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime * vp - ctime (if nlink > 0) */ /*ARGSUSED*/ static int zfs_remove(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr) { znode_t *dzp = VTOZ(dvp); znode_t *zp = VTOZ(vp); znode_t *xzp; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; uint64_t acl_obj, xattr_obj; uint64_t obj = 0; dmu_tx_t *tx; boolean_t unlinked, toobig = FALSE; uint64_t txtype; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); ZFS_VERIFY_ZP(zp); zilog = zfsvfs->z_log; zp = VTOZ(vp); xattr_obj = 0; xzp = NULL; if (error = zfs_zaccess_delete(dzp, zp, cr)) { goto out; } /* * Need to use rmdir for removing directories. */ if (vp->v_type == VDIR) { error = SET_ERROR(EPERM); goto out; } vnevent_remove(vp, dvp, name, ct); obj = zp->z_id; /* are there any extended attributes? */ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj, sizeof (xattr_obj)); if (error == 0 && xattr_obj) { error = zfs_zget(zfsvfs, xattr_obj, &xzp); ASSERT0(error); } /* * We may delete the znode now, or we may put it in the unlinked set; * it depends on whether we're the last link, and on whether there are * other holds on the vnode. So we dmu_tx_hold() the right things to * allow for either case. */ tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); zfs_sa_upgrade_txholds(tx, dzp); if (xzp) { dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); } /* charge as an update -- would be nice not to charge at all */ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); /* * Mark this transaction as typically resulting in a net free of space */ dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } /* * Remove the directory entry. 
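 * zfs_link_destroy() reports through 'unlinked' whether this was the
 * last link; if so, the znode is parked on the unlinked set and its
 * storage is reclaimed only after the last hold on the vnode is gone.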
*/ error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked); if (error) { dmu_tx_commit(tx); goto out; } if (unlinked) { zfs_unlinked_add(zp, tx); vp->v_vflag |= VV_NOSYNC; } txtype = TX_REMOVE; zfs_log_remove(zilog, tx, txtype, dzp, name, obj); dmu_tx_commit(tx); out: if (xzp) vrele(ZTOV(xzp)); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* * Create a new directory and insert it into dvp using the name * provided. Return a pointer to the inserted directory. * * IN: dvp - vnode of directory to add subdir to. * dirname - name of new directory. * vap - attributes of new directory. * cr - credentials of caller. * ct - caller context * flags - case flags * vsecp - ACL to be set * * OUT: vpp - vnode of created directory. * * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime updated * vp - ctime|mtime|atime updated */ /*ARGSUSED*/ static int zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr) { znode_t *zp, *dzp = VTOZ(dvp); zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; uint64_t txtype; dmu_tx_t *tx; int error; ksid_t *ksid; uid_t uid; gid_t gid = crgetgid(cr); zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; ASSERT(vap->va_type == VDIR); /* * If we have an ephemeral id, ACL, or XVATTR then * make sure file system is at proper version */ ksid = crgetsid(cr, KSID_OWNER); if (ksid) uid = ksid_getid(ksid); else uid = crgetuid(cr); if (zfsvfs->z_use_fuids == B_FALSE && ((vap->va_mask & AT_XVATTR) || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); zilog = zfsvfs->z_log; if (dzp->z_pflags & ZFS_XATTR) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } if (zfsvfs->z_utf8 && u8_validate(dirname, strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } if (vap->va_mask & AT_XVATTR) { if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap, crgetuid(cr), cr, vap->va_type)) != 0) { ZFS_EXIT(zfsvfs); return (error); } } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, NULL, &acl_ids)) != 0) { ZFS_EXIT(zfsvfs); return (error); } /* * First make sure the new directory doesn't exist. * * Existence is checked first to make sure we don't return * EACCES instead of EEXIST which can cause some applications * to fail. */ *vpp = NULL; if (error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW)) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (error); } ASSERT3P(zp, ==, NULL); if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (error); } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (SET_ERROR(EDQUOT)); } /* * Add a new entry to the directory. */ getnewvnode_reserve(1); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); getnewvnode_drop_reserve(); ZFS_EXIT(zfsvfs); return (error); } /* * Create new node. 
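 * zfs_mknode() allocates the object and initializes the znode; the new
 * name is linked into the parent directory as a separate step below.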
*/ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); /* * Now put new name in parent dir. */ (void) zfs_link_create(dzp, dirname, zp, tx, ZNEW); *vpp = ZTOV(zp); txtype = zfs_log_create_txtype(Z_DIR, NULL, vap); zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL, acl_ids.z_fuidp, vap); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); getnewvnode_drop_reserve(); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (0); } /* * Remove a directory subdir entry. If the current working * directory is the same as the subdir to be removed, the * remove will fail. * * IN: dvp - vnode of directory to remove from. * name - name of directory to be removed. * cwd - vnode of current working directory. * cr - credentials of caller. * ct - caller context * flags - case flags * * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime updated */ /*ARGSUSED*/ static int zfs_rmdir(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr) { znode_t *dzp = VTOZ(dvp); znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; dmu_tx_t *tx; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); ZFS_VERIFY_ZP(zp); zilog = zfsvfs->z_log; if (error = zfs_zaccess_delete(dzp, zp, cr)) { goto out; } if (vp->v_type != VDIR) { error = SET_ERROR(ENOTDIR); goto out; } vnevent_rmdir(vp, dvp, name, ct); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); zfs_sa_upgrade_txholds(tx, zp); zfs_sa_upgrade_txholds(tx, dzp); dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } cache_purge(dvp); error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL); if (error == 0) { uint64_t txtype = TX_RMDIR; zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT); } dmu_tx_commit(tx); cache_purge(vp); out: if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* * Read as many directory entries as will fit into the provided * buffer from the given directory cursor position (specified in * the uio structure). * * IN: vp - vnode of directory to read. * uio - structure supplying read location, range info, * and return buffer. * cr - credentials of caller. * ct - caller context * flags - case flags * * OUT: uio - updated offset and range, buffer filled. * eofp - set to true if end-of-file detected. * * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - atime updated * * Note that the low 4 bits of the cookie returned by zap is always zero. * This allows us to use the low range for "special" directory entries: * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem, * we use the offset 2 for the '.zfs' directory. 
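 *
 * For illustration: offsets 0, 1 and (on the root) 2 are synthesized
 * by this function, while real entries come from zap_cursor_serialize()
 * cookies, whose low 4 bits are clear, so the special offsets can never
 * collide with a real entry.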
*/ /* ARGSUSED */ static int zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies) { znode_t *zp = VTOZ(vp); iovec_t *iovp; edirent_t *eodp; dirent64_t *odp; zfsvfs_t *zfsvfs = zp->z_zfsvfs; objset_t *os; caddr_t outbuf; size_t bufsize; zap_cursor_t zc; zap_attribute_t zap; uint_t bytes_wanted; uint64_t offset; /* must be unsigned; checks for < 1 */ uint64_t parent; int local_eof; int outcount; int error; uint8_t prefetch; boolean_t check_sysattrs; uint8_t type; int ncooks; u_long *cooks = NULL; int flags = 0; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) { ZFS_EXIT(zfsvfs); return (error); } /* * If we are not given an eof variable, * use a local one. */ if (eofp == NULL) eofp = &local_eof; /* * Check for valid iov_len. */ if (uio->uio_iov->iov_len <= 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * Quit if directory has been removed (posix) */ if ((*eofp = zp->z_unlinked) != 0) { ZFS_EXIT(zfsvfs); return (0); } error = 0; os = zfsvfs->z_os; offset = uio->uio_loffset; prefetch = zp->z_zn_prefetch; /* * Initialize the iterator cursor. */ if (offset <= 3) { /* * Start iteration from the beginning of the directory. */ zap_cursor_init(&zc, os, zp->z_id); } else { /* * The offset is a serialized cursor. */ zap_cursor_init_serialized(&zc, os, zp->z_id, offset); } /* * Get space to change directory entries into fs independent format. */ iovp = uio->uio_iov; bytes_wanted = iovp->iov_len; if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) { bufsize = bytes_wanted; outbuf = kmem_alloc(bufsize, KM_SLEEP); odp = (struct dirent64 *)outbuf; } else { bufsize = bytes_wanted; outbuf = NULL; odp = (struct dirent64 *)iovp->iov_base; } eodp = (struct edirent *)odp; if (ncookies != NULL) { /* * Minimum entry size is dirent size and 1 byte for a file name. */ ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1); cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK); *cookies = cooks; *ncookies = ncooks; } /* * If this VFS supports the system attribute view interface; and * we're looking at an extended attribute directory; and we care * about normalization conflicts on this vfs; then we must check * for normalization conflicts with the sysattr name space. */ #ifdef TODO check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) && (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm && (flags & V_RDDIR_ENTFLAGS); #else check_sysattrs = 0; #endif /* * Transform to file-system independent format */ outcount = 0; while (outcount < bytes_wanted) { ino64_t objnum; ushort_t reclen; off64_t *next = NULL; /* * Special case `.', `..', and `.zfs'. */ if (offset == 0) { (void) strcpy(zap.za_name, "."); zap.za_normalization_conflict = 0; objnum = zp->z_id; type = DT_DIR; } else if (offset == 1) { (void) strcpy(zap.za_name, ".."); zap.za_normalization_conflict = 0; objnum = parent; type = DT_DIR; } else if (offset == 2 && zfs_show_ctldir(zp)) { (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); zap.za_normalization_conflict = 0; objnum = ZFSCTL_INO_ROOT; type = DT_DIR; } else { /* * Grab next entry. 
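 * Here ENOENT is the normal end-of-directory indication rather than an
 * error; it is converted into *eofp and then cleared below.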
*/ if (error = zap_cursor_retrieve(&zc, &zap)) { if ((*eofp = (error == ENOENT)) != 0) break; else goto update; } if (zap.za_integer_length != 8 || zap.za_num_integers != 1) { cmn_err(CE_WARN, "zap_readdir: bad directory " "entry, obj = %lld, offset = %lld\n", (u_longlong_t)zp->z_id, (u_longlong_t)offset); error = SET_ERROR(ENXIO); goto update; } objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); /* * MacOS X can extract the object type here such as: * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer); */ type = ZFS_DIRENT_TYPE(zap.za_first_integer); if (check_sysattrs && !zap.za_normalization_conflict) { #ifdef TODO zap.za_normalization_conflict = xattr_sysattr_casechk(zap.za_name); #else panic("%s:%u: TODO", __func__, __LINE__); #endif } } if (flags & V_RDDIR_ACCFILTER) { /* * If we have no access at all, don't include * this entry in the returned information */ znode_t *ezp; if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0) goto skip_entry; if (!zfs_has_access(ezp, cr)) { vrele(ZTOV(ezp)); goto skip_entry; } vrele(ZTOV(ezp)); } if (flags & V_RDDIR_ENTFLAGS) reclen = EDIRENT_RECLEN(strlen(zap.za_name)); else reclen = DIRENT64_RECLEN(strlen(zap.za_name)); /* * Will this entry fit in the buffer? */ if (outcount + reclen > bufsize) { /* * Did we manage to fit anything in the buffer? */ if (!outcount) { error = SET_ERROR(EINVAL); goto update; } break; } if (flags & V_RDDIR_ENTFLAGS) { /* * Add extended flag entry: */ eodp->ed_ino = objnum; eodp->ed_reclen = reclen; /* NOTE: ed_off is the offset for the *next* entry */ next = &(eodp->ed_off); eodp->ed_eflags = zap.za_normalization_conflict ? ED_CASE_CONFLICT : 0; (void) strncpy(eodp->ed_name, zap.za_name, EDIRENT_NAMELEN(reclen)); eodp = (edirent_t *)((intptr_t)eodp + reclen); } else { /* * Add normal entry: */ odp->d_ino = objnum; odp->d_reclen = reclen; odp->d_namlen = strlen(zap.za_name); (void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1); odp->d_type = type; odp = (dirent64_t *)((intptr_t)odp + reclen); } outcount += reclen; ASSERT(outcount <= bufsize); /* Prefetch znode */ if (prefetch) dmu_prefetch(os, objnum, 0, 0, 0, ZIO_PRIORITY_SYNC_READ); skip_entry: /* * Move to the next entry, fill in the previous offset. */ if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { zap_cursor_advance(&zc); offset = zap_cursor_serialize(&zc); } else { offset += 1; } if (cooks != NULL) { *cooks++ = offset; ncooks--; KASSERT(ncooks >= 0, ("ncookies=%d", ncooks)); } } zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ /* Subtract unused cookies */ if (ncookies != NULL) *ncookies -= ncooks; if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) { iovp->iov_base += outcount; iovp->iov_len -= outcount; uio->uio_resid -= outcount; } else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) { /* * Reset the pointer. 
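 * The uiomove() failed, so fall back to the offset the caller passed
 * in rather than advancing past entries that were never copied out.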
*/ offset = uio->uio_loffset; } update: zap_cursor_fini(&zc); if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) kmem_free(outbuf, bufsize); if (error == ENOENT) error = 0; ZFS_ACCESSTIME_STAMP(zfsvfs, zp); uio->uio_loffset = offset; ZFS_EXIT(zfsvfs); if (error != 0 && cookies != NULL) { free(*cookies, M_TEMP); *cookies = NULL; *ncookies = 0; } return (error); } ulong_t zfs_fsync_sync_cnt = 4; static int zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt); if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); zil_commit(zfsvfs->z_log, zp->z_id); ZFS_EXIT(zfsvfs); } return (0); } /* * Get the requested file attributes and place them in the provided * vattr structure. * * IN: vp - vnode of file. * vap - va_mask identifies requested attributes. * If AT_XVATTR set, then optional attrs are requested * flags - ATTR_NOACLCHECK (CIFS server context) * cr - credentials of caller. * ct - caller context * * OUT: vap - attribute values. * * RETURN: 0 (always succeeds). */ /* ARGSUSED */ static int zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error = 0; uint32_t blksize; u_longlong_t nblocks; uint64_t links; uint64_t mtime[2], ctime[2], crtime[2], rdev; xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ xoptattr_t *xoap = NULL; boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; sa_bulk_attr_t bulk[4]; int count = 0; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); if (vp->v_type == VBLK || vp->v_type == VCHR) SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL, &rdev, 8); if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) { ZFS_EXIT(zfsvfs); return (error); } /* * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES. * Also, if we are the owner don't bother, since owner should * always be allowed to read basic attributes of file. */ if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) && (vap->va_uid != crgetuid(cr))) { if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0, skipaclchk, cr)) { ZFS_EXIT(zfsvfs); return (error); } } /* * Return all attributes. It's cheaper to provide the answer * than to determine whether we were asked the question. */ vap->va_type = IFTOVT(zp->z_mode); vap->va_mode = zp->z_mode & ~S_IFMT; #ifdef illumos vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev; #else vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; #endif vap->va_nodeid = zp->z_id; if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp)) links = zp->z_links + 1; else links = zp->z_links; vap->va_nlink = MIN(links, LINK_MAX); /* nlink_t limit! */ vap->va_size = zp->z_size; #ifdef illumos vap->va_rdev = vp->v_rdev; #else if (vp->v_type == VBLK || vp->v_type == VCHR) vap->va_rdev = zfs_cmpldev(rdev); #endif vap->va_seq = zp->z_seq; vap->va_flags = 0; /* FreeBSD: Reset chflags(2) flags. */ vap->va_filerev = zp->z_seq; /* * Add in any requested optional attributes and the create time. * Also set the corresponding bits in the returned attribute bitmap. 
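 * XVA_SET_RTN() marks each attribute that was actually returned, so
 * callers can consult the return mask instead of assuming that every
 * requested attribute was filled in.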
*/ if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) { if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { xoap->xoa_archive = ((zp->z_pflags & ZFS_ARCHIVE) != 0); XVA_SET_RTN(xvap, XAT_ARCHIVE); } if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { xoap->xoa_readonly = ((zp->z_pflags & ZFS_READONLY) != 0); XVA_SET_RTN(xvap, XAT_READONLY); } if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { xoap->xoa_system = ((zp->z_pflags & ZFS_SYSTEM) != 0); XVA_SET_RTN(xvap, XAT_SYSTEM); } if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { xoap->xoa_hidden = ((zp->z_pflags & ZFS_HIDDEN) != 0); XVA_SET_RTN(xvap, XAT_HIDDEN); } if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { xoap->xoa_nounlink = ((zp->z_pflags & ZFS_NOUNLINK) != 0); XVA_SET_RTN(xvap, XAT_NOUNLINK); } if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { xoap->xoa_immutable = ((zp->z_pflags & ZFS_IMMUTABLE) != 0); XVA_SET_RTN(xvap, XAT_IMMUTABLE); } if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { xoap->xoa_appendonly = ((zp->z_pflags & ZFS_APPENDONLY) != 0); XVA_SET_RTN(xvap, XAT_APPENDONLY); } if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { xoap->xoa_nodump = ((zp->z_pflags & ZFS_NODUMP) != 0); XVA_SET_RTN(xvap, XAT_NODUMP); } if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { xoap->xoa_opaque = ((zp->z_pflags & ZFS_OPAQUE) != 0); XVA_SET_RTN(xvap, XAT_OPAQUE); } if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { xoap->xoa_av_quarantined = ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0); XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); } if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { xoap->xoa_av_modified = ((zp->z_pflags & ZFS_AV_MODIFIED) != 0); XVA_SET_RTN(xvap, XAT_AV_MODIFIED); } if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) && vp->v_type == VREG) { zfs_sa_get_scanstamp(zp, xvap); } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0); XVA_SET_RTN(xvap, XAT_REPARSE); } if (XVA_ISSET_REQ(xvap, XAT_GEN)) { xoap->xoa_generation = zp->z_gen; XVA_SET_RTN(xvap, XAT_GEN); } if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { xoap->xoa_offline = ((zp->z_pflags & ZFS_OFFLINE) != 0); XVA_SET_RTN(xvap, XAT_OFFLINE); } if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { xoap->xoa_sparse = ((zp->z_pflags & ZFS_SPARSE) != 0); XVA_SET_RTN(xvap, XAT_SPARSE); } } ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime); ZFS_TIME_DECODE(&vap->va_mtime, mtime); ZFS_TIME_DECODE(&vap->va_ctime, ctime); ZFS_TIME_DECODE(&vap->va_birthtime, crtime); sa_object_size(zp->z_sa_hdl, &blksize, &nblocks); vap->va_blksize = blksize; vap->va_bytes = nblocks << 9; /* nblocks * 512 */ if (zp->z_blksz == 0) { /* * Block size hasn't been set; suggest maximal I/O transfers. */ vap->va_blksize = zfsvfs->z_max_blksz; } ZFS_EXIT(zfsvfs); return (0); } /* * Set the file attributes to the values contained in the * vattr structure. * * IN: vp - vnode of file to be modified. * vap - new attribute values. * If AT_XVATTR set, then optional attrs are being set * flags - ATTR_UTIME set if non-default time values provided. * - ATTR_NOACLCHECK (CIFS context only). * cr - credentials of caller. * ct - caller context * * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - ctime updated, mtime updated if size changed. 
*/ /* ARGSUSED */ static int zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; zilog_t *zilog; dmu_tx_t *tx; vattr_t oldva; xvattr_t tmpxvattr; uint_t mask = vap->va_mask; uint_t saved_mask = 0; uint64_t saved_mode; int trim_mask = 0; uint64_t new_mode; uint64_t new_uid, new_gid; uint64_t xattr_obj; uint64_t mtime[2], ctime[2]; znode_t *attrzp; int need_policy = FALSE; int err, err2; zfs_fuid_info_t *fuidp = NULL; xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ xoptattr_t *xoap; zfs_acl_t *aclp; boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; boolean_t fuid_dirtied = B_FALSE; sa_bulk_attr_t bulk[7], xattr_bulk[7]; int count = 0, xattr_count = 0; if (mask == 0) return (0); if (mask & AT_NOSET) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); zilog = zfsvfs->z_log; /* * Make sure that if we have ephemeral uid/gid or xvattr specified * that the file system is at the proper version level */ if (zfsvfs->z_use_fuids == B_FALSE && (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) || ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) || (mask & AT_XVATTR))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } if (mask & AT_SIZE && vp->v_type == VDIR) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EISDIR)); } if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * If this is an xvattr_t, then get a pointer to the structure of * optional attributes. If this is NULL, then we have a vattr_t. */ xoap = xva_getxoptattr(xvap); xva_init(&tmpxvattr); /* * Only the immutable bit and atime of immutable files can be altered */ if ((zp->z_pflags & ZFS_IMMUTABLE) && ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) || ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } /* * Note: ZFS_READONLY is handled in zfs_zaccess_common. */ /* * Verify that the timestamps don't overflow 32 bits. * ZFS can handle large timestamps, but 32bit syscalls can't * handle times greater than 2039. This check should be removed * once large timestamps are fully supported. */ if (mask & (AT_ATIME | AT_MTIME)) { if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) || ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EOVERFLOW)); } } if (xoap && (mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME) && TIMESPEC_OVERFLOW(&vap->va_birthtime)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EOVERFLOW)); } attrzp = NULL; aclp = NULL; /* Can this be moved to before the top label? */ if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EROFS)); } /* * First validate permissions */ if (mask & AT_SIZE) { /* * XXX - Note, we are not providing any open * mode flags here (like FNDELAY), so we may * block if there are locks present... this * should be addressed in openat(). */ /* XXX - would it be OK to generate a log record here?
*/ err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); if (err) { ZFS_EXIT(zfsvfs); return (err); } } if (mask & (AT_ATIME|AT_MTIME) || ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || XVA_ISSET_REQ(xvap, XAT_READONLY) || XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || XVA_ISSET_REQ(xvap, XAT_OFFLINE) || XVA_ISSET_REQ(xvap, XAT_SPARSE) || XVA_ISSET_REQ(xvap, XAT_CREATETIME) || XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, skipaclchk, cr); } if (mask & (AT_UID|AT_GID)) { int idmask = (mask & (AT_UID|AT_GID)); int take_owner; int take_group; /* * NOTE: even if a new mode is being set, * we may clear S_ISUID/S_ISGID bits. */ if (!(mask & AT_MODE)) vap->va_mode = zp->z_mode; /* * Take ownership or chgrp to group we are a member of */ take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr)); take_group = (mask & AT_GID) && zfs_groupmember(zfsvfs, vap->va_gid, cr); /* * If both AT_UID and AT_GID are set then take_owner and * take_group must both be set in order to allow taking * ownership. * * Otherwise, send the check through secpolicy_vnode_setattr() * */ if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) || ((idmask == AT_UID) && take_owner) || ((idmask == AT_GID) && take_group)) { if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, skipaclchk, cr) == 0) { /* * Remove setuid/setgid for non-privileged users */ secpolicy_setid_clear(vap, vp, cr); trim_mask = (mask & (AT_UID|AT_GID)); } else { need_policy = TRUE; } } else { need_policy = TRUE; } } oldva.va_mode = zp->z_mode; zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); if (mask & AT_XVATTR) { /* * Update xvattr mask to include only those attributes * that are actually changing. * * the bits will be restored prior to actually setting * the attributes so the caller thinks they were set. 
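 * tmpxvattr remembers exactly which requests are cleared here so that
 * the matching XVA_SET_REQ() calls near the end of this function can
 * re-assert them once the attributes have been written.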
*/ if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { if (xoap->xoa_appendonly != ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_APPENDONLY); XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY); } } if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { if (xoap->xoa_nounlink != ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_NOUNLINK); XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK); } } if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { if (xoap->xoa_immutable != ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_IMMUTABLE); XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE); } } if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { if (xoap->xoa_nodump != ((zp->z_pflags & ZFS_NODUMP) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_NODUMP); XVA_SET_REQ(&tmpxvattr, XAT_NODUMP); } } if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { if (xoap->xoa_av_modified != ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED); } } if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { if ((vp->v_type != VREG && xoap->xoa_av_quarantined) || xoap->xoa_av_quarantined != ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED); } } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if (need_policy == FALSE && (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { need_policy = TRUE; } } if (mask & AT_MODE) { if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { err = secpolicy_setid_setsticky_clear(vp, vap, &oldva, cr); if (err) { ZFS_EXIT(zfsvfs); return (err); } trim_mask |= AT_MODE; } else { need_policy = TRUE; } } if (need_policy) { /* * If trim_mask is set then take ownership * has been granted or write_acl is present and user * has the ability to modify mode. In that case remove * UID|GID and or MODE from mask so that * secpolicy_vnode_setattr() doesn't revoke it. */ if (trim_mask) { saved_mask = vap->va_mask; vap->va_mask &= ~trim_mask; if (trim_mask & AT_MODE) { /* * Save the mode, as secpolicy_vnode_setattr() * will overwrite it with ova.va_mode. */ saved_mode = vap->va_mode; } } err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags, (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp); if (err) { ZFS_EXIT(zfsvfs); return (err); } if (trim_mask) { vap->va_mask |= saved_mask; if (trim_mask & AT_MODE) { /* * Recover the mode after * secpolicy_vnode_setattr(). 
*/ vap->va_mode = saved_mode; } } } /* * secpolicy_vnode_setattr, or take ownership may have * changed va_mask */ mask = vap->va_mask; if ((mask & (AT_UID | AT_GID))) { err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj, sizeof (xattr_obj)); if (err == 0 && xattr_obj) { err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp); if (err == 0) { err = vn_lock(ZTOV(attrzp), LK_EXCLUSIVE); if (err != 0) vrele(ZTOV(attrzp)); } if (err) goto out2; } if (mask & AT_UID) { new_uid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); if (new_uid != zp->z_uid && zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) { if (attrzp) vput(ZTOV(attrzp)); err = SET_ERROR(EDQUOT); goto out2; } } if (mask & AT_GID) { new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp); if (new_gid != zp->z_gid && zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) { if (attrzp) vput(ZTOV(attrzp)); err = SET_ERROR(EDQUOT); goto out2; } } } tx = dmu_tx_create(zfsvfs->z_os); if (mask & AT_MODE) { uint64_t pmode = zp->z_mode; uint64_t acl_obj; new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED && !(zp->z_pflags & ZFS_ACL_TRIVIAL)) { err = SET_ERROR(EPERM); goto out; } if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)) goto out; if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { /* * Are we upgrading ACL from old V0 format * to V1 format? */ if (zfsvfs->z_version >= ZPL_VERSION_FUID && zfs_znode_acl_version(zp) == ZFS_ACL_VERSION_INITIAL) { dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } else { dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes); } } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); } else { if ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); else dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); } if (attrzp) { dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); } fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); zfs_sa_upgrade_txholds(tx, zp); err = dmu_tx_assign(tx, TXG_WAIT); if (err) goto out; count = 0; /* * Set each attribute requested. * We group settings according to the locks they need to acquire. * * Note: you cannot set ctime directly, although it will be * updated as a side-effect of calling this function. 
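 * (The ctime updates are applied via zfs_tstamp_update_setup() below.)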
*/ if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_enter(&zp->z_acl_lock); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); if (attrzp) { if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_enter(&attrzp->z_acl_lock); SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, sizeof (attrzp->z_pflags)); } if (mask & (AT_UID|AT_GID)) { if (mask & AT_UID) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &new_uid, sizeof (new_uid)); zp->z_uid = new_uid; if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_UID(zfsvfs), NULL, &new_uid, sizeof (new_uid)); attrzp->z_uid = new_uid; } } if (mask & AT_GID) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &new_gid, sizeof (new_gid)); zp->z_gid = new_gid; if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_GID(zfsvfs), NULL, &new_gid, sizeof (new_gid)); attrzp->z_gid = new_gid; } } if (!(mask & AT_MODE)) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &new_mode, sizeof (new_mode)); new_mode = zp->z_mode; } err = zfs_acl_chown_setattr(zp); ASSERT(err == 0); if (attrzp) { err = zfs_acl_chown_setattr(attrzp); ASSERT(err == 0); } } if (mask & AT_MODE) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &new_mode, sizeof (new_mode)); zp->z_mode = new_mode; ASSERT3U((uintptr_t)aclp, !=, 0); err = zfs_aclset_common(zp, aclp, cr, tx); ASSERT0(err); if (zp->z_acl_cached) zfs_acl_free(zp->z_acl_cached); zp->z_acl_cached = aclp; aclp = NULL; } if (mask & AT_ATIME) { ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &zp->z_atime, sizeof (zp->z_atime)); } if (mask & AT_MTIME) { ZFS_TIME_ENCODE(&vap->va_mtime, mtime); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, sizeof (mtime)); } /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */ if (mask & AT_SIZE && !(mask & AT_MTIME)) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, sizeof (mtime)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); } else if (mask != 0) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime, B_TRUE); if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); zfs_tstamp_update_setup(attrzp, STATE_CHANGED, mtime, ctime, B_TRUE); } } /* * Do this after setting timestamps to prevent timestamp * update from toggling bit */ if (xoap && (mask & AT_XVATTR)) { if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) xoap->xoa_createtime = vap->va_birthtime; /* * restore trimmed off masks * so that return masks can be set for caller. 
*/ if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) { XVA_SET_REQ(xvap, XAT_APPENDONLY); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) { XVA_SET_REQ(xvap, XAT_NOUNLINK); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) { XVA_SET_REQ(xvap, XAT_IMMUTABLE); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) { XVA_SET_REQ(xvap, XAT_NODUMP); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) { XVA_SET_REQ(xvap, XAT_AV_MODIFIED); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) { XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); } if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ASSERT(vp->v_type == VREG); zfs_xvattr_set(zp, xvap, tx); } if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); if (mask != 0) zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_exit(&zp->z_acl_lock); if (attrzp) { if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_exit(&attrzp->z_acl_lock); } out: if (err == 0 && attrzp) { err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, xattr_count, tx); ASSERT(err2 == 0); } if (attrzp) vput(ZTOV(attrzp)); if (aclp) zfs_acl_free(aclp); if (fuidp) { zfs_fuid_info_free(fuidp); fuidp = NULL; } if (err) { dmu_tx_abort(tx); } else { err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); dmu_tx_commit(tx); } out2: if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (err); } /* * We acquire all but fdvp locks using non-blocking acquisitions. If we * fail to acquire any lock in the path we will drop all held locks, * acquire the new lock in a blocking fashion, and then release it and * restart the rename. This acquire/release step ensures that we do not * spin on a lock waiting for release. On error release all vnode locks * and decrement references the way tmpfs_rename() would do. */ static int zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp, struct vnode *tdvp, struct vnode **tvpp, const struct componentname *scnp, const struct componentname *tcnp) { zfsvfs_t *zfsvfs; struct vnode *nvp, *svp, *tvp; znode_t *sdzp, *tdzp, *szp, *tzp; const char *snm = scnp->cn_nameptr; const char *tnm = tcnp->cn_nameptr; int error; VOP_UNLOCK(tdvp, 0); if (*tvpp != NULL && *tvpp != tdvp) VOP_UNLOCK(*tvpp, 0); relock: error = vn_lock(sdvp, LK_EXCLUSIVE); if (error) goto out; sdzp = VTOZ(sdvp); error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT); if (error != 0) { VOP_UNLOCK(sdvp, 0); if (error != EBUSY) goto out; error = vn_lock(tdvp, LK_EXCLUSIVE); if (error) goto out; VOP_UNLOCK(tdvp, 0); goto relock; } tdzp = VTOZ(tdvp); /* * Before using sdzp and tdzp we must ensure that they are live. * As a porting legacy from illumos we have two things to worry * about. One is typical for FreeBSD and it is that the vnode is * not reclaimed (doomed). The other is that the znode is live. * The current code can invalidate the znode without acquiring the * corresponding vnode lock if the object represented by the znode * and vnode is no longer valid after a rollback or receive operation. * z_teardown_lock hidden behind ZFS_ENTER and ZFS_EXIT is the lock * that protects the znodes from the invalidation. */ zfsvfs = sdzp->z_zfsvfs; ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs); ZFS_ENTER(zfsvfs); /* * We can not use ZFS_VERIFY_ZP() here because it could directly return * bypassing the cleanup code in the case of an error. */ if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) { ZFS_EXIT(zfsvfs); VOP_UNLOCK(sdvp, 0); VOP_UNLOCK(tdvp, 0); error = SET_ERROR(EIO); goto out; } /* * Re-resolve svp to be certain it still exists and fetch the * correct vnode. 
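 * The directories were unlocked while the blocking lock acquisitions
 * above slept, so the source name may have been removed, or may now
 * refer to a different object.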
*/ error = zfs_dirent_lookup(sdzp, snm, &szp, ZEXISTS); if (error != 0) { /* Source entry invalid or not there. */ ZFS_EXIT(zfsvfs); VOP_UNLOCK(sdvp, 0); VOP_UNLOCK(tdvp, 0); if ((scnp->cn_flags & ISDOTDOT) != 0 || (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.')) error = SET_ERROR(EINVAL); goto out; } svp = ZTOV(szp); /* * Re-resolve tvp, if it disappeared we just carry on. */ error = zfs_dirent_lookup(tdzp, tnm, &tzp, 0); if (error != 0) { ZFS_EXIT(zfsvfs); VOP_UNLOCK(sdvp, 0); VOP_UNLOCK(tdvp, 0); vrele(svp); if ((tcnp->cn_flags & ISDOTDOT) != 0) error = SET_ERROR(EINVAL); goto out; } if (tzp != NULL) tvp = ZTOV(tzp); else tvp = NULL; /* * At present the vnode locks must be acquired before z_teardown_lock, * although it would be more logical to use the opposite order. */ ZFS_EXIT(zfsvfs); /* * Now try acquire locks on svp and tvp. */ nvp = svp; error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT); if (error != 0) { VOP_UNLOCK(sdvp, 0); VOP_UNLOCK(tdvp, 0); if (tvp != NULL) vrele(tvp); if (error != EBUSY) { vrele(nvp); goto out; } error = vn_lock(nvp, LK_EXCLUSIVE); if (error != 0) { vrele(nvp); goto out; } VOP_UNLOCK(nvp, 0); /* * Concurrent rename race. * XXX ? */ if (nvp == tdvp) { vrele(nvp); error = SET_ERROR(EINVAL); goto out; } vrele(*svpp); *svpp = nvp; goto relock; } vrele(*svpp); *svpp = nvp; if (*tvpp != NULL) vrele(*tvpp); *tvpp = NULL; if (tvp != NULL) { nvp = tvp; error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT); if (error != 0) { VOP_UNLOCK(sdvp, 0); VOP_UNLOCK(tdvp, 0); VOP_UNLOCK(*svpp, 0); if (error != EBUSY) { vrele(nvp); goto out; } error = vn_lock(nvp, LK_EXCLUSIVE); if (error != 0) { vrele(nvp); goto out; } vput(nvp); goto relock; } *tvpp = nvp; } return (0); out: return (error); } /* * Note that we must use VRELE_ASYNC in this function as it walks * up the directory tree and vrele may need to acquire an exclusive * lock if a last reference to a vnode is dropped. */ static int zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp) { zfsvfs_t *zfsvfs; znode_t *zp, *zp1; uint64_t parent; int error; zfsvfs = tdzp->z_zfsvfs; if (tdzp == szp) return (SET_ERROR(EINVAL)); if (tdzp == sdzp) return (0); if (tdzp->z_id == zfsvfs->z_root) return (0); zp = tdzp; for (;;) { ASSERT(!zp->z_unlinked); if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) break; if (parent == szp->z_id) { error = SET_ERROR(EINVAL); break; } if (parent == zfsvfs->z_root) break; if (parent == sdzp->z_id) break; error = zfs_zget(zfsvfs, parent, &zp1); if (error != 0) break; if (zp != tdzp) VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os))); zp = zp1; } if (error == ENOTDIR) panic("checkpath: .. not a directory\n"); if (zp != tdzp) VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os))); return (error); } /* * Move an entry from the provided source directory to the target * directory. Change the entry name as indicated. * * IN: sdvp - Source directory containing the "old entry". * snm - Old entry name. * tdvp - Target directory to contain the "new entry". * tnm - New entry name. * cr - credentials of caller. * ct - caller context * flags - case flags * * RETURN: 0 on success, error code on failure. 
* * Timestamps: * sdvp,tdvp - ctime|mtime updated */ /*ARGSUSED*/ static int zfs_rename(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp, vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp, cred_t *cr) { zfsvfs_t *zfsvfs; znode_t *sdzp, *tdzp, *szp, *tzp; zilog_t *zilog = NULL; dmu_tx_t *tx; char *snm = scnp->cn_nameptr; char *tnm = tcnp->cn_nameptr; int error = 0; /* Reject renames across filesystems. */ if ((*svpp)->v_mount != tdvp->v_mount || ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) { error = SET_ERROR(EXDEV); goto out; } if (zfsctl_is_node(tdvp)) { error = SET_ERROR(EXDEV); goto out; } /* * Lock all four vnodes to ensure safety and semantics of renaming. */ error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp); if (error != 0) { /* no vnodes are locked in the case of error here */ return (error); } tdzp = VTOZ(tdvp); sdzp = VTOZ(sdvp); zfsvfs = tdzp->z_zfsvfs; zilog = zfsvfs->z_log; /* * After we re-enter ZFS_ENTER() we will have to revalidate all * znodes involved. */ ZFS_ENTER(zfsvfs); if (zfsvfs->z_utf8 && u8_validate(tnm, strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { error = SET_ERROR(EILSEQ); goto unlockout; } /* If source and target are the same file, there is nothing to do. */ if ((*svpp) == (*tvpp)) { error = 0; goto unlockout; } if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) || ((*tvpp) != NULL && (*tvpp)->v_type == VDIR && (*tvpp)->v_mountedhere != NULL)) { error = SET_ERROR(EXDEV); goto unlockout; } /* * We cannot use ZFS_VERIFY_ZP() here because it could directly return * bypassing the cleanup code in the case of an error. */ if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) { error = SET_ERROR(EIO); goto unlockout; } szp = VTOZ(*svpp); tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp); if (szp->z_sa_hdl == NULL || (tzp != NULL && tzp->z_sa_hdl == NULL)) { error = SET_ERROR(EIO); goto unlockout; } /* * This is to prevent the creation of links into attribute space * by renaming a linked file into/out of an attribute directory. * See the comment in zfs_link() for why this is considered bad. */ if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { error = SET_ERROR(EINVAL); goto unlockout; } /* * Must have write access at the source to remove the old entry * and write access at the target to create the new entry. * Note that if target and source are the same, this can be * done in a single check. */ if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)) goto unlockout; if ((*svpp)->v_type == VDIR) { /* * Avoid ".", "..", and aliases of "." for obvious reasons. */ if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') || sdzp == szp || (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) { error = SET_ERROR(EINVAL); goto unlockout; } /* * Check to make sure rename is valid. * Can't do a move like this: /usr/a/b to /usr/a/b/c/d */ if (error = zfs_rename_check(szp, sdzp, tdzp)) goto unlockout; } /* * Does target exist? */ if (tzp) { /* * Source and target must be the same type. */ if ((*svpp)->v_type == VDIR) { if ((*tvpp)->v_type != VDIR) { error = SET_ERROR(ENOTDIR); goto unlockout; } else { cache_purge(tdvp); if (sdvp != tdvp) cache_purge(sdvp); } } else { if ((*tvpp)->v_type == VDIR) { error = SET_ERROR(EISDIR); goto unlockout; } } } vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct); if (tzp) vnevent_rename_dest(*tvpp, tdvp, tnm, ct); /* * Notify the target directory if it is not the same * as source directory.
*/ if (tdvp != sdvp) { vnevent_rename_dest_dir(tdvp, ct); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm); dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); if (sdzp != tdzp) { dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, tdzp); } if (tzp) { dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, tzp); } zfs_sa_upgrade_txholds(tx, szp); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); goto unlockout; } if (tzp) /* Attempt to remove the existing target */ error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL); if (error == 0) { error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING); if (error == 0) { szp->z_pflags |= ZFS_AV_MODIFIED; error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), (void *)&szp->z_pflags, sizeof (uint64_t), tx); ASSERT0(error); error = zfs_link_destroy(sdzp, snm, szp, tx, ZRENAMING, NULL); if (error == 0) { zfs_log_rename(zilog, tx, TX_RENAME, sdzp, snm, tdzp, tnm, szp); /* * Update path information for the target vnode */ vn_renamepath(tdvp, *svpp, tnm, strlen(tnm)); } else { /* * At this point, we have successfully created * the target name, but have failed to remove * the source name. Since the create was done * with the ZRENAMING flag, there are * complications; for one, the link count is * wrong. The easiest way to deal with this * is to remove the newly created target, and * return the original error. This must * succeed; fortunately, it is very unlikely to * fail, since we just created it. */ VERIFY3U(zfs_link_destroy(tdzp, tnm, szp, tx, ZRENAMING, NULL), ==, 0); } } if (error == 0) { cache_purge(*svpp); if (*tvpp != NULL) cache_purge(*tvpp); cache_purge_negative(tdvp); } } dmu_tx_commit(tx); unlockout: /* all 4 vnodes are locked, ZFS_ENTER called */ ZFS_EXIT(zfsvfs); VOP_UNLOCK(*svpp, 0); VOP_UNLOCK(sdvp, 0); out: /* original two vnodes are locked */ if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); if (*tvpp != NULL) VOP_UNLOCK(*tvpp, 0); if (tdvp != *tvpp) VOP_UNLOCK(tdvp, 0); return (error); } /* * Insert the indicated symbolic reference entry into the directory. * * IN: dvp - Directory to contain new symbolic link. * link - Name for new symlink entry. * vap - Attributes of new entry. * cr - credentials of caller. * ct - caller context * flags - case flags * * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime updated */ /*ARGSUSED*/ static int zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link, cred_t *cr, kthread_t *td) { znode_t *zp, *dzp = VTOZ(dvp); dmu_tx_t *tx; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; uint64_t len = strlen(link); int error; zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; uint64_t txtype = TX_SYMLINK; int flags = 0; ASSERT(vap->va_type == VLNK); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); zilog = zfsvfs->z_log; if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } if (len > MAXPATHLEN) { ZFS_EXIT(zfsvfs); return (SET_ERROR(ENAMETOOLONG)); } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, NULL, &acl_ids)) != 0) { ZFS_EXIT(zfsvfs); return (error); } /* * Attempt to lock directory; fail if entry already exists. 
*/ error = zfs_dirent_lookup(dzp, name, &zp, ZNEW); if (error) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (error); } if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (error); } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (SET_ERROR(EDQUOT)); } getnewvnode_reserve(1); tx = dmu_tx_create(zfsvfs->z_os); fuid_dirtied = zfsvfs->z_fuid_dirty; dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE + len); dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); getnewvnode_drop_reserve(); ZFS_EXIT(zfsvfs); return (error); } /* * Create a new object for the symlink. * For version 4 ZPL datasets the symlink will be an SA attribute. */ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); if (zp->z_is_sa) error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), link, len, tx); else zfs_sa_symlink(zp, link, len, tx); zp->z_size = len; (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), &zp->z_size, sizeof (zp->z_size), tx); /* * Insert the new object into the directory. */ (void) zfs_link_create(dzp, name, zp, tx, ZNEW); zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); *vpp = ZTOV(zp); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); getnewvnode_drop_reserve(); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* * Return, in the buffer contained in the provided uio structure, * the symbolic path referred to by vp. * * IN: vp - vnode of symbolic link. * uio - structure to contain the link path. * cr - credentials of caller. * ct - caller context * * OUT: uio - structure containing the link path. * * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - atime updated */ /* ARGSUSED */ static int zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if (zp->z_is_sa) error = sa_lookup_uio(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), uio); else error = zfs_sa_readlink(zp, uio); ZFS_ACCESSTIME_STAMP(zfsvfs, zp); ZFS_EXIT(zfsvfs); return (error); } /* * Insert a new entry into directory tdvp referencing svp. * * IN: tdvp - Directory to contain new entry. * svp - vnode of new entry. * name - name of new entry. * cr - credentials of caller. * ct - caller context * * RETURN: 0 on success, error code on failure. * * Timestamps: * tdvp - ctime|mtime updated * svp - ctime updated */ /* ARGSUSED */ static int zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr, caller_context_t *ct, int flags) { znode_t *dzp = VTOZ(tdvp); znode_t *tzp, *szp; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; dmu_tx_t *tx; int error; uint64_t parent; uid_t owner; ASSERT(tdvp->v_type == VDIR); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); zilog = zfsvfs->z_log; /* * POSIX dictates that we return EPERM here. * Better choices include ENOTSUP or EISDIR.
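 * (link(2) on a directory is specified to fail with EPERM when the
 * implementation does not support directory hard links.)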
*/ if (svp->v_type == VDIR) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } szp = VTOZ(svp); ZFS_VERIFY_ZP(szp); if (szp->z_pflags & (ZFS_APPENDONLY | ZFS_IMMUTABLE | ZFS_READONLY)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } /* Prevent links to .zfs/shares files */ if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (uint64_t))) != 0) { ZFS_EXIT(zfsvfs); return (error); } if (parent == zfsvfs->z_shares_dir) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } /* * We do not support links between attributes and non-attributes * because of the potential security risk of creating links * into "normal" file space in order to circumvent restrictions * imposed in attribute space. */ if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER); if (owner != crgetuid(cr) && secpolicy_basic_link(svp, cr) != 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { ZFS_EXIT(zfsvfs); return (error); } /* * Attempt to lock directory; fail if entry already exists. */ error = zfs_dirent_lookup(dzp, name, &tzp, ZNEW); if (error) { ZFS_EXIT(zfsvfs); return (error); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); zfs_sa_upgrade_txholds(tx, szp); zfs_sa_upgrade_txholds(tx, dzp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } error = zfs_link_create(dzp, name, szp, tx, 0); if (error == 0) { uint64_t txtype = TX_LINK; zfs_log_link(zilog, tx, txtype, dzp, szp, name); } dmu_tx_commit(tx); if (error == 0) { vnevent_link(svp, ct); } if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /*ARGSUSED*/ void zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); if (zp->z_sa_hdl == NULL) { /* * The fs has been unmounted, or we did a * suspend/resume and this file no longer exists. */ rw_exit(&zfsvfs->z_teardown_inactive_lock); vrecycle(vp); return; } if (zp->z_unlinked) { /* * Fast path to recycle a vnode of a removed file. 
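 * The file has already been unlinked, so there is no attribute state
 * worth writing back; recycling the vnode immediately lets the reclaim
 * path free the znode.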
*/ rw_exit(&zfsvfs->z_teardown_inactive_lock); vrecycle(vp); return; } if (zp->z_atime_dirty && zp->z_unlinked == 0) { dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); } else { (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), (void *)&zp->z_atime, sizeof (zp->z_atime), tx); zp->z_atime_dirty = 0; dmu_tx_commit(tx); } } rw_exit(&zfsvfs->z_teardown_inactive_lock); } CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid)); CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid)); /*ARGSUSED*/ static int zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; uint32_t gen; uint64_t gen64; uint64_t object = zp->z_id; zfid_short_t *zfid; int size, i, error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &gen64, sizeof (uint64_t))) != 0) { ZFS_EXIT(zfsvfs); return (error); } gen = (uint32_t)gen64; size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN; #ifdef illumos if (fidp->fid_len < size) { fidp->fid_len = size; ZFS_EXIT(zfsvfs); return (SET_ERROR(ENOSPC)); } #else fidp->fid_len = size; #endif zfid = (zfid_short_t *)fidp; zfid->zf_len = size; for (i = 0; i < sizeof (zfid->zf_object); i++) zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); /* Must have a non-zero generation number to distinguish from .zfs */ if (gen == 0) gen = 1; for (i = 0; i < sizeof (zfid->zf_gen); i++) zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); if (size == LONG_FID_LEN) { uint64_t objsetid = dmu_objset_id(zfsvfs->z_os); zfid_long_t *zlfid; zlfid = (zfid_long_t *)fidp; for (i = 0; i < sizeof (zlfid->zf_setid); i++) zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i)); /* XXX - this should be the generation number for the objset */ for (i = 0; i < sizeof (zlfid->zf_setgen); i++) zlfid->zf_setgen[i] = 0; } ZFS_EXIT(zfsvfs); return (0); } static int zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, caller_context_t *ct) { znode_t *zp, *xzp; zfsvfs_t *zfsvfs; int error; switch (cmd) { case _PC_LINK_MAX: *valp = INT_MAX; return (0); case _PC_FILESIZEBITS: *valp = 64; return (0); #ifdef illumos case _PC_XATTR_EXISTS: zp = VTOZ(vp); zfsvfs = zp->z_zfsvfs; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); *valp = 0; error = zfs_dirent_lookup(zp, "", &xzp, ZXATTR | ZEXISTS | ZSHARED); if (error == 0) { if (!zfs_dirempty(xzp)) *valp = 1; vrele(ZTOV(xzp)); } else if (error == ENOENT) { /* * If there aren't extended attributes, it's the * same as having zero of them. 
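* Illustrative userland probe under illumos (hypothetical path): long r = pathconf("/tank/file", _PC_XATTR_EXISTS); r is 1 only when the hidden xattr directory exists and is non-empty, and 0 otherwise, including the ENOENT case handled below.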
*/ error = 0; } ZFS_EXIT(zfsvfs); return (error); case _PC_SATTR_ENABLED: case _PC_SATTR_EXISTS: *valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) && (vp->v_type == VREG || vp->v_type == VDIR); return (0); case _PC_ACCESS_FILTERING: *valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) && vp->v_type == VDIR; return (0); case _PC_ACL_ENABLED: *valp = _ACL_ACE_ENABLED; return (0); #endif /* illumos */ case _PC_MIN_HOLE_SIZE: *valp = (int)SPA_MINBLOCKSIZE; return (0); #ifdef illumos case _PC_TIMESTAMP_RESOLUTION: /* nanosecond timestamp resolution */ *valp = 1L; return (0); #endif case _PC_ACL_EXTENDED: *valp = 0; return (0); case _PC_ACL_NFS4: *valp = 1; return (0); case _PC_ACL_PATH_MAX: *valp = ACL_MAX_ENTRIES; return (0); default: return (EOPNOTSUPP); } } /*ARGSUSED*/ static int zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); error = zfs_getacl(zp, vsecp, skipaclchk, cr); ZFS_EXIT(zfsvfs); return (error); } /*ARGSUSED*/ int zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; zilog_t *zilog = zfsvfs->z_log; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); error = zfs_setacl(zp, vsecp, skipaclchk, cr); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } static int zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind, int *rahead) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; objset_t *os = zp->z_zfsvfs->z_os; rl_t *rl; vm_object_t object; off_t start, end, obj_size; uint_t blksz; int pgsin_b, pgsin_a; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); start = IDX_TO_OFF(ma[0]->pindex); end = IDX_TO_OFF(ma[count - 1]->pindex + 1); /* * Lock a range covering all required and optional pages. * Note that we need to handle the case of the block size growing. */ for (;;) { blksz = zp->z_blksz; rl = zfs_range_lock(zp, rounddown(start, blksz), roundup(end, blksz) - rounddown(start, blksz), RL_READER); if (blksz == zp->z_blksz) break; zfs_range_unlock(rl); } object = ma[0]->object; zfs_vmobject_wlock(object); obj_size = object->un_pager.vnp.vnp_size; zfs_vmobject_wunlock(object); if (IDX_TO_OFF(ma[count - 1]->pindex) >= obj_size) { zfs_range_unlock(rl); ZFS_EXIT(zfsvfs); return (zfs_vm_pagerret_bad); } pgsin_b = 0; if (rbehind != NULL) { pgsin_b = OFF_TO_IDX(start - rounddown(start, blksz)); pgsin_b = MIN(*rbehind, pgsin_b); } pgsin_a = 0; if (rahead != NULL) { pgsin_a = OFF_TO_IDX(roundup(end, blksz) - end); if (end + IDX_TO_OFF(pgsin_a) >= obj_size) pgsin_a = OFF_TO_IDX(round_page(obj_size) - end); pgsin_a = MIN(*rahead, pgsin_a); } /* * NB: we need to pass the exact byte size of the data that we expect * to read after accounting for the file size. This is required because * ZFS will panic if we request DMU to read beyond the end of the last * allocated block. 
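* A worked example with assumed numbers: with PAGE_SIZE 4096, obj_size 10000 and a run whose last requested page starts at byte 8192 (so end = 12288), the call below asks the DMU for MIN(12288, 10000) - (12288 - 4096) = 1808 bytes, i.e. just the valid tail of the final page rather than a full page that would reach past the last allocated block.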
*/ error = dmu_read_pages(os, zp->z_id, ma, count, &pgsin_b, &pgsin_a, MIN(end, obj_size) - (end - PAGE_SIZE)); zfs_range_unlock(rl); ZFS_ACCESSTIME_STAMP(zfsvfs, zp); ZFS_EXIT(zfsvfs); if (error != 0) return (zfs_vm_pagerret_error); PCPU_INC(cnt.v_vnodein); PCPU_ADD(cnt.v_vnodepgsin, count + pgsin_b + pgsin_a); if (rbehind != NULL) *rbehind = pgsin_b; if (rahead != NULL) *rahead = pgsin_a; return (zfs_vm_pagerret_ok); } static int zfs_freebsd_getpages(ap) struct vop_getpages_args /* { struct vnode *a_vp; vm_page_t *a_m; int a_count; int *a_rbehind; int *a_rahead; } */ *ap; { return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind, ap->a_rahead)); } static int zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags, int *rtvals) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; rl_t *rl; dmu_tx_t *tx; struct sf_buf *sf; vm_object_t object; vm_page_t m; caddr_t va; size_t tocopy; size_t lo_len; vm_ooffset_t lo_off; vm_ooffset_t off; uint_t blksz; int ncount; int pcount; int err; int i; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); object = vp->v_object; pcount = btoc(len); ncount = pcount; KASSERT(ma[0]->object == object, ("mismatching object")); KASSERT(len > 0 && (len & PAGE_MASK) == 0, ("unexpected length")); for (i = 0; i < pcount; i++) rtvals[i] = zfs_vm_pagerret_error; off = IDX_TO_OFF(ma[0]->pindex); blksz = zp->z_blksz; lo_off = rounddown(off, blksz); lo_len = roundup(len + (off - lo_off), blksz); rl = zfs_range_lock(zp, lo_off, lo_len, RL_WRITER); zfs_vmobject_wlock(object); if (len + off > object->un_pager.vnp.vnp_size) { if (object->un_pager.vnp.vnp_size > off) { int pgoff; len = object->un_pager.vnp.vnp_size - off; ncount = btoc(len); if ((pgoff = (int)len & PAGE_MASK) != 0) { /* * If the object is locked and the following * conditions hold, then the page's dirty * field cannot be concurrently changed by a * pmap operation. */ m = ma[ncount - 1]; vm_page_assert_sbusied(m); KASSERT(!pmap_page_is_write_mapped(m), ("zfs_putpages: page %p is not read-only", m)); vm_page_clear_dirty(m, pgoff, PAGE_SIZE - pgoff); } } else { len = 0; ncount = 0; } if (ncount < pcount) { for (i = ncount; i < pcount; i++) { rtvals[i] = zfs_vm_pagerret_bad; } } } zfs_vmobject_wunlock(object); if (ncount == 0) goto out; if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) || zfs_owner_overquota(zfsvfs, zp, B_TRUE)) { goto out; } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_write(tx, zp->z_id, off, len); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { dmu_tx_abort(tx); goto out; } if (zp->z_blksz < PAGE_SIZE) { for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) { tocopy = len > PAGE_SIZE ? 
PAGE_SIZE : len; va = zfs_map_page(ma[i], &sf); dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx); zfs_unmap_page(sf); } } else { err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, ma, tx); } if (err == 0) { uint64_t mtime[2], ctime[2]; sa_bulk_attr_t bulk[3]; int count = 0; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); ASSERT0(err); zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0); zfs_vmobject_wlock(object); for (i = 0; i < ncount; i++) { rtvals[i] = zfs_vm_pagerret_ok; vm_page_undirty(ma[i]); } zfs_vmobject_wunlock(object); PCPU_INC(cnt.v_vnodeout); PCPU_ADD(cnt.v_vnodepgsout, ncount); } dmu_tx_commit(tx); out: zfs_range_unlock(rl); if ((flags & (zfs_vm_pagerput_sync | zfs_vm_pagerput_inval)) != 0 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zfsvfs->z_log, zp->z_id); ZFS_EXIT(zfsvfs); return (rtvals[0]); } int zfs_freebsd_putpages(ap) struct vop_putpages_args /* { struct vnode *a_vp; vm_page_t *a_m; int a_count; int a_sync; int *a_rtvals; } */ *ap; { return (zfs_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync, ap->a_rtvals)); } static int zfs_freebsd_bmap(ap) struct vop_bmap_args /* { struct vnode *a_vp; daddr_t a_bn; struct bufobj **a_bop; daddr_t *a_bnp; int *a_runp; int *a_runb; } */ *ap; { if (ap->a_bop != NULL) *ap->a_bop = &ap->a_vp->v_bufobj; if (ap->a_bnp != NULL) *ap->a_bnp = ap->a_bn; if (ap->a_runp != NULL) *ap->a_runp = 0; if (ap->a_runb != NULL) *ap->a_runb = 0; return (0); } static int zfs_freebsd_open(ap) struct vop_open_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; } */ *ap; { vnode_t *vp = ap->a_vp; znode_t *zp = VTOZ(vp); int error; error = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL); if (error == 0) vnode_create_vobject(vp, zp->z_size, ap->a_td); return (error); } static int zfs_freebsd_close(ap) struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; } */ *ap; { return (zfs_close(ap->a_vp, ap->a_fflag, 1, 0, ap->a_cred, NULL)); } static int zfs_freebsd_ioctl(ap) struct vop_ioctl_args /* { struct vnode *a_vp; u_long a_command; caddr_t a_data; int a_fflag; struct ucred *cred; struct thread *td; } */ *ap; { return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data, ap->a_fflag, ap->a_cred, NULL, NULL)); } static int ioflags(int ioflags) { int flags = 0; if (ioflags & IO_APPEND) flags |= FAPPEND; if (ioflags & IO_NDELAY) flags |= FNONBLOCK; if (ioflags & IO_SYNC) flags |= (FSYNC | FDSYNC | FRSYNC); return (flags); } static int zfs_freebsd_read(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { return (zfs_read(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag), ap->a_cred, NULL)); } static int zfs_freebsd_write(ap) struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { return (zfs_write(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag), ap->a_cred, NULL)); } static int zfs_freebsd_access(ap) struct vop_access_args /* { struct vnode *a_vp; accmode_t a_accmode; struct ucred *a_cred; struct thread *a_td; } */ *ap; { vnode_t *vp = ap->a_vp; znode_t *zp = VTOZ(vp); accmode_t accmode; int error = 0; /* * ZFS itself only knowns about VREAD, VWRITE, VEXEC 
and VAPPEND, */ accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND); if (accmode != 0) error = zfs_access(ap->a_vp, accmode, 0, ap->a_cred, NULL); /* * VADMIN has to be handled by vaccess(). */ if (error == 0) { accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND); if (accmode != 0) { error = vaccess(vp->v_type, zp->z_mode, zp->z_uid, zp->z_gid, accmode, ap->a_cred, NULL); } } /* * For VEXEC, ensure that at least one execute bit is set for * non-directories. */ if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR && (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) { error = EACCES; } return (error); } static int zfs_freebsd_lookup(ap) struct vop_lookup_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ *ap; { struct componentname *cnp = ap->a_cnp; char nm[NAME_MAX + 1]; ASSERT(cnp->cn_namelen < sizeof(nm)); strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm))); return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop, cnp->cn_cred, cnp->cn_thread, 0)); } static int zfs_cache_lookup(ap) struct vop_lookup_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ *ap; { zfsvfs_t *zfsvfs; zfsvfs = ap->a_dvp->v_mount->mnt_data; if (zfsvfs->z_use_namecache) return (vfs_cache_lookup(ap)); else return (zfs_freebsd_lookup(ap)); } static int zfs_freebsd_create(ap) struct vop_create_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { zfsvfs_t *zfsvfs; struct componentname *cnp = ap->a_cnp; vattr_t *vap = ap->a_vap; int error, mode; ASSERT(cnp->cn_flags & SAVENAME); vattr_init_mask(vap); mode = vap->va_mode & ALLPERMS; zfsvfs = ap->a_dvp->v_mount->mnt_data; error = zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode, ap->a_vpp, cnp->cn_cred, cnp->cn_thread); if (zfsvfs->z_use_namecache && error == 0 && (cnp->cn_flags & MAKEENTRY) != 0) cache_enter(ap->a_dvp, *ap->a_vpp, cnp); return (error); } static int zfs_freebsd_remove(ap) struct vop_remove_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { ASSERT(ap->a_cnp->cn_flags & SAVENAME); return (zfs_remove(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_cred)); } static int zfs_freebsd_mkdir(ap) struct vop_mkdir_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { vattr_t *vap = ap->a_vap; ASSERT(ap->a_cnp->cn_flags & SAVENAME); vattr_init_mask(vap); return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp, ap->a_cnp->cn_cred)); } static int zfs_freebsd_rmdir(ap) struct vop_rmdir_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { struct componentname *cnp = ap->a_cnp; ASSERT(cnp->cn_flags & SAVENAME); return (zfs_rmdir(ap->a_dvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred)); } static int zfs_freebsd_readdir(ap) struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; int *a_ncookies; u_long **a_cookies; } */ *ap; { return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag, ap->a_ncookies, ap->a_cookies)); } static int zfs_freebsd_fsync(ap) struct vop_fsync_args /* { struct vnode *a_vp; int a_waitfor; struct thread *a_td; } */ *ap; { vop_stdfsync(ap); return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL)); } static int zfs_freebsd_getattr(ap) struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; } */ *ap; { vattr_t *vap = 
ap->a_vap; xvattr_t xvap; u_long fflags = 0; int error; xva_init(&xvap); xvap.xva_vattr = *vap; xvap.xva_vattr.va_mask |= AT_XVATTR; /* Convert chflags into ZFS-type flags. */ /* XXX: what about SF_SETTABLE? */ XVA_SET_REQ(&xvap, XAT_IMMUTABLE); XVA_SET_REQ(&xvap, XAT_APPENDONLY); XVA_SET_REQ(&xvap, XAT_NOUNLINK); XVA_SET_REQ(&xvap, XAT_NODUMP); XVA_SET_REQ(&xvap, XAT_READONLY); XVA_SET_REQ(&xvap, XAT_ARCHIVE); XVA_SET_REQ(&xvap, XAT_SYSTEM); XVA_SET_REQ(&xvap, XAT_HIDDEN); XVA_SET_REQ(&xvap, XAT_REPARSE); XVA_SET_REQ(&xvap, XAT_OFFLINE); XVA_SET_REQ(&xvap, XAT_SPARSE); error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL); if (error != 0) return (error); /* Convert ZFS xattr into chflags. */ #define FLAG_CHECK(fflag, xflag, xfield) do { \ if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0) \ fflags |= (fflag); \ } while (0) FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE, xvap.xva_xoptattrs.xoa_immutable); FLAG_CHECK(SF_APPEND, XAT_APPENDONLY, xvap.xva_xoptattrs.xoa_appendonly); FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK, xvap.xva_xoptattrs.xoa_nounlink); FLAG_CHECK(UF_ARCHIVE, XAT_ARCHIVE, xvap.xva_xoptattrs.xoa_archive); FLAG_CHECK(UF_NODUMP, XAT_NODUMP, xvap.xva_xoptattrs.xoa_nodump); FLAG_CHECK(UF_READONLY, XAT_READONLY, xvap.xva_xoptattrs.xoa_readonly); FLAG_CHECK(UF_SYSTEM, XAT_SYSTEM, xvap.xva_xoptattrs.xoa_system); FLAG_CHECK(UF_HIDDEN, XAT_HIDDEN, xvap.xva_xoptattrs.xoa_hidden); FLAG_CHECK(UF_REPARSE, XAT_REPARSE, xvap.xva_xoptattrs.xoa_reparse); FLAG_CHECK(UF_OFFLINE, XAT_OFFLINE, xvap.xva_xoptattrs.xoa_offline); FLAG_CHECK(UF_SPARSE, XAT_SPARSE, xvap.xva_xoptattrs.xoa_sparse); #undef FLAG_CHECK *vap = xvap.xva_vattr; vap->va_flags = fflags; return (0); } static int zfs_freebsd_setattr(ap) struct vop_setattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; } */ *ap; { vnode_t *vp = ap->a_vp; vattr_t *vap = ap->a_vap; cred_t *cred = ap->a_cred; xvattr_t xvap; u_long fflags; uint64_t zflags; vattr_init_mask(vap); vap->va_mask &= ~AT_NOSET; xva_init(&xvap); xvap.xva_vattr = *vap; zflags = VTOZ(vp)->z_pflags; if (vap->va_flags != VNOVAL) { zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs; int error; if (zfsvfs->z_use_fuids == B_FALSE) return (EOPNOTSUPP); fflags = vap->va_flags; /* * XXX KDM * We need to figure out whether it makes sense to allow * UF_REPARSE through, since we don't really have other * facilities to handle reparse points and zfs_setattr() * doesn't currently allow setting that attribute anyway. */ if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_ARCHIVE| UF_NODUMP|UF_SYSTEM|UF_HIDDEN|UF_READONLY|UF_REPARSE| UF_OFFLINE|UF_SPARSE)) != 0) return (EOPNOTSUPP); /* * Unprivileged processes are not permitted to unset system * flags, or modify flags if any system flags are set. * Privileged non-jail processes may not modify system flags * if securelevel > 0 and any existing system flags are set. * Privileged jail processes behave like privileged non-jail * processes if the security.jail.chflags_allowed sysctl is * non-zero; otherwise, they behave like unprivileged * processes. */ if (secpolicy_fs_owner(vp->v_mount, cred) == 0 || priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0) == 0) { if (zflags & (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) { error = securelevel_gt(cred, 0); if (error != 0) return (error); } } else { /* * Callers may only modify the file flags on objects they * have VADMIN rights for.
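* Concretely (hypothetical file): an unprivileged owner may toggle user flags such as UF_NODUMP through chflags(2), but clearing an existing SF_IMMUTABLE, or setting any SF_* flag, falls through to the EPERM returns below.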
*/ if ((error = VOP_ACCESS(vp, VADMIN, cred, curthread)) != 0) return (error); if (zflags & (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) { return (EPERM); } if (fflags & (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) { return (EPERM); } } #define FLAG_CHANGE(fflag, zflag, xflag, xfield) do { \ if (((fflags & (fflag)) && !(zflags & (zflag))) || \ ((zflags & (zflag)) && !(fflags & (fflag)))) { \ XVA_SET_REQ(&xvap, (xflag)); \ (xfield) = ((fflags & (fflag)) != 0); \ } \ } while (0) /* Convert chflags into ZFS-type flags. */ /* XXX: what about SF_SETTABLE? */ FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE, xvap.xva_xoptattrs.xoa_immutable); FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY, xvap.xva_xoptattrs.xoa_appendonly); FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK, xvap.xva_xoptattrs.xoa_nounlink); FLAG_CHANGE(UF_ARCHIVE, ZFS_ARCHIVE, XAT_ARCHIVE, xvap.xva_xoptattrs.xoa_archive); FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP, xvap.xva_xoptattrs.xoa_nodump); FLAG_CHANGE(UF_READONLY, ZFS_READONLY, XAT_READONLY, xvap.xva_xoptattrs.xoa_readonly); FLAG_CHANGE(UF_SYSTEM, ZFS_SYSTEM, XAT_SYSTEM, xvap.xva_xoptattrs.xoa_system); FLAG_CHANGE(UF_HIDDEN, ZFS_HIDDEN, XAT_HIDDEN, xvap.xva_xoptattrs.xoa_hidden); FLAG_CHANGE(UF_REPARSE, ZFS_REPARSE, XAT_REPARSE, xvap.xva_xoptattrs.xoa_reparse); FLAG_CHANGE(UF_OFFLINE, ZFS_OFFLINE, XAT_OFFLINE, xvap.xva_xoptattrs.xoa_offline); FLAG_CHANGE(UF_SPARSE, ZFS_SPARSE, XAT_SPARSE, xvap.xva_xoptattrs.xoa_sparse); #undef FLAG_CHANGE } if (vap->va_birthtime.tv_sec != VNOVAL) { xvap.xva_vattr.va_mask |= AT_XVATTR; XVA_SET_REQ(&xvap, XAT_CREATETIME); } return (zfs_setattr(vp, (vattr_t *)&xvap, 0, cred, NULL)); } static int zfs_freebsd_rename(ap) struct vop_rename_args /* { struct vnode *a_fdvp; struct vnode *a_fvp; struct componentname *a_fcnp; struct vnode *a_tdvp; struct vnode *a_tvp; struct componentname *a_tcnp; } */ *ap; { vnode_t *fdvp = ap->a_fdvp; vnode_t *fvp = ap->a_fvp; vnode_t *tdvp = ap->a_tdvp; vnode_t *tvp = ap->a_tvp; int error; ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART)); ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART)); error = zfs_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp, ap->a_tcnp, ap->a_fcnp->cn_cred); vrele(fdvp); vrele(fvp); vrele(tdvp); if (tvp != NULL) vrele(tvp); return (error); } static int zfs_freebsd_symlink(ap) struct vop_symlink_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; char *a_target; } */ *ap; { struct componentname *cnp = ap->a_cnp; vattr_t *vap = ap->a_vap; ASSERT(cnp->cn_flags & SAVENAME); vap->va_type = VLNK; /* FreeBSD: Syscall only sets va_mode.
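* (Sketch, assuming the usual vattr_init_mask() behaviour of deriving va_mask from every field that is not VNOVAL: a caller that filled in only va_mode arrives at zfs_symlink() with AT_TYPE|AT_MODE set once va_type has been forced to VLNK above.)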
*/ vattr_init_mask(vap); return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap, ap->a_target, cnp->cn_cred, cnp->cn_thread)); } static int zfs_freebsd_readlink(ap) struct vop_readlink_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; } */ *ap; { return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL)); } static int zfs_freebsd_link(ap) struct vop_link_args /* { struct vnode *a_tdvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { struct componentname *cnp = ap->a_cnp; vnode_t *vp = ap->a_vp; vnode_t *tdvp = ap->a_tdvp; if (tdvp->v_mount != vp->v_mount) return (EXDEV); ASSERT(cnp->cn_flags & SAVENAME); return (zfs_link(tdvp, vp, cnp->cn_nameptr, cnp->cn_cred, NULL, 0)); } static int zfs_freebsd_inactive(ap) struct vop_inactive_args /* { struct vnode *a_vp; struct thread *a_td; } */ *ap; { vnode_t *vp = ap->a_vp; zfs_inactive(vp, ap->a_td->td_ucred, NULL); return (0); } static int zfs_freebsd_reclaim(ap) struct vop_reclaim_args /* { struct vnode *a_vp; struct thread *a_td; } */ *ap; { vnode_t *vp = ap->a_vp; znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; ASSERT(zp != NULL); /* Destroy the vm object and flush associated pages. */ vnode_destroy_vobject(vp); /* * z_teardown_inactive_lock protects from a race with * zfs_znode_dmu_fini in zfsvfs_teardown during * force unmount. */ rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); if (zp->z_sa_hdl == NULL) zfs_znode_free(zp); else zfs_zinactive(zp); rw_exit(&zfsvfs->z_teardown_inactive_lock); vp->v_data = NULL; return (0); } static int zfs_freebsd_fid(ap) struct vop_fid_args /* { struct vnode *a_vp; struct fid *a_fid; } */ *ap; { return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL)); } static int zfs_freebsd_pathconf(ap) struct vop_pathconf_args /* { struct vnode *a_vp; int a_name; register_t *a_retval; } */ *ap; { ulong_t val; int error; error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL); if (error == 0) { *ap->a_retval = val; return (error); } if (error != EOPNOTSUPP) return (error); switch (ap->a_name) { case _PC_NAME_MAX: *ap->a_retval = NAME_MAX; return (0); case _PC_PIPE_BUF: if (ap->a_vp->v_type == VDIR || ap->a_vp->v_type == VFIFO) { *ap->a_retval = PIPE_BUF; return (0); } return (EINVAL); default: return (vop_stdpathconf(ap)); } } /* * FreeBSD's extended attributes namespace defines file name prefix for ZFS' * extended attribute name: * * NAMESPACE PREFIX * system freebsd:system: * user (none, can be used to access ZFS fsattr(5) attributes * created on Solaris) */ static int zfs_create_attrname(int attrnamespace, const char *name, char *attrname, size_t size) { const char *namespace, *prefix, *suffix; /* We don't allow '/' character in attribute name. */ if (strchr(name, '/') != NULL) return (EINVAL); /* We don't allow attribute names that start with "freebsd:" string. */ if (strncmp(name, "freebsd:", 8) == 0) return (EINVAL); bzero(attrname, size); switch (attrnamespace) { case EXTATTR_NAMESPACE_USER: #if 0 prefix = "freebsd:"; namespace = EXTATTR_NAMESPACE_USER_STRING; suffix = ":"; #else /* * This is the default namespace by which we can access all * attributes created on Solaris. 
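* For example, a hypothetical attribute named "md5": EXTATTR_NAMESPACE_USER maps to the bare on-disk name "md5", while EXTATTR_NAMESPACE_SYSTEM below maps to "freebsd:system:md5", so attributes created on Solaris stay reachable through the user namespace.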
*/ prefix = namespace = suffix = ""; #endif break; case EXTATTR_NAMESPACE_SYSTEM: prefix = "freebsd:"; namespace = EXTATTR_NAMESPACE_SYSTEM_STRING; suffix = ":"; break; case EXTATTR_NAMESPACE_EMPTY: default: return (EINVAL); } if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix, name) >= size) { return (ENAMETOOLONG); } return (0); } /* * Vnode operating to retrieve a named extended attribute. */ static int zfs_getextattr(struct vop_getextattr_args *ap) /* vop_getextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; INOUT struct uio *a_uio; OUT size_t *a_size; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; struct thread *td = ap->a_td; struct nameidata nd; char attrname[255]; struct vattr va; vnode_t *xvp = NULL, *vp; int error, flags; error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VREAD); if (error != 0) return (error); error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, sizeof(attrname)); if (error != 0) return (error); ZFS_ENTER(zfsvfs); error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, LOOKUP_XATTR); if (error != 0) { ZFS_EXIT(zfsvfs); return (error); } flags = FREAD; NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, xvp, td); error = vn_open_cred(&nd, &flags, 0, 0, ap->a_cred, NULL); vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); if (error != 0) { ZFS_EXIT(zfsvfs); if (error == ENOENT) error = ENOATTR; return (error); } if (ap->a_size != NULL) { error = VOP_GETATTR(vp, &va, ap->a_cred); if (error == 0) *ap->a_size = (size_t)va.va_size; } else if (ap->a_uio != NULL) error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred); VOP_UNLOCK(vp, 0); vn_close(vp, flags, ap->a_cred, td); ZFS_EXIT(zfsvfs); return (error); } /* * Vnode operation to remove a named attribute. */ int zfs_deleteextattr(struct vop_deleteextattr_args *ap) /* vop_deleteextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; struct thread *td = ap->a_td; struct nameidata nd; char attrname[255]; struct vattr va; vnode_t *xvp = NULL, *vp; int error, flags; error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VWRITE); if (error != 0) return (error); error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, sizeof(attrname)); if (error != 0) return (error); ZFS_ENTER(zfsvfs); error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, LOOKUP_XATTR); if (error != 0) { ZFS_EXIT(zfsvfs); return (error); } NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF, UIO_SYSSPACE, attrname, xvp, td); error = namei(&nd); vp = nd.ni_vp; if (error != 0) { ZFS_EXIT(zfsvfs); NDFREE(&nd, NDF_ONLY_PNBUF); if (error == ENOENT) error = ENOATTR; return (error); } error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if (vp == nd.ni_dvp) vrele(vp); else vput(vp); ZFS_EXIT(zfsvfs); return (error); } /* * Vnode operation to set a named attribute. 
*/ static int zfs_setextattr(struct vop_setextattr_args *ap) /* vop_setextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; INOUT struct uio *a_uio; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; struct thread *td = ap->a_td; struct nameidata nd; char attrname[255]; struct vattr va; vnode_t *xvp = NULL, *vp; int error, flags; error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VWRITE); if (error != 0) return (error); error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, sizeof(attrname)); if (error != 0) return (error); ZFS_ENTER(zfsvfs); error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, LOOKUP_XATTR | CREATE_XATTR_DIR); if (error != 0) { ZFS_EXIT(zfsvfs); return (error); } flags = FFLAGS(O_WRONLY | O_CREAT); NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, xvp, td); error = vn_open_cred(&nd, &flags, 0600, 0, ap->a_cred, NULL); vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); if (error != 0) { ZFS_EXIT(zfsvfs); return (error); } VATTR_NULL(&va); va.va_size = 0; error = VOP_SETATTR(vp, &va, ap->a_cred); if (error == 0) VOP_WRITE(vp, ap->a_uio, IO_UNIT, ap->a_cred); VOP_UNLOCK(vp, 0); vn_close(vp, flags, ap->a_cred, td); ZFS_EXIT(zfsvfs); return (error); } /* * Vnode operation to retrieve extended attributes on a vnode. */ static int zfs_listextattr(struct vop_listextattr_args *ap) /* vop_listextattr { IN struct vnode *a_vp; IN int a_attrnamespace; INOUT struct uio *a_uio; OUT size_t *a_size; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; struct thread *td = ap->a_td; struct nameidata nd; char attrprefix[16]; u_char dirbuf[sizeof(struct dirent)]; struct dirent *dp; struct iovec aiov; struct uio auio, *uio = ap->a_uio; size_t *sizep = ap->a_size; size_t plen; vnode_t *xvp = NULL, *vp; int done, error, eof, pos; error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VREAD); if (error != 0) return (error); error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix, sizeof(attrprefix)); if (error != 0) return (error); plen = strlen(attrprefix); ZFS_ENTER(zfsvfs); if (sizep != NULL) *sizep = 0; error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, LOOKUP_XATTR); if (error != 0) { ZFS_EXIT(zfsvfs); /* * ENOATTR means that the EA directory does not yet exist, * i.e. there are no extended attributes there. */ if (error == ENOATTR) error = 0; return (error); } NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED, UIO_SYSSPACE, ".", xvp, td); error = namei(&nd); vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); if (error != 0) { ZFS_EXIT(zfsvfs); return (error); } auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_rw = UIO_READ; auio.uio_offset = 0; do { u_char nlen; aiov.iov_base = (void *)dirbuf; aiov.iov_len = sizeof(dirbuf); auio.uio_resid = sizeof(dirbuf); error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL); done = sizeof(dirbuf) - auio.uio_resid; if (error != 0) break; for (pos = 0; pos < done;) { dp = (struct dirent *)(dirbuf + pos); pos += dp->d_reclen; /* * XXX: Temporarily we also accept DT_UNKNOWN, as this * is what we get when attribute was created on Solaris. 
*/ if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN) continue; if (plen == 0 && strncmp(dp->d_name, "freebsd:", 8) == 0) continue; else if (strncmp(dp->d_name, attrprefix, plen) != 0) continue; nlen = dp->d_namlen - plen; if (sizep != NULL) *sizep += 1 + nlen; else if (uio != NULL) { /* * Format of extattr name entry is one byte for * length and the rest for name. */ error = uiomove(&nlen, 1, uio->uio_rw, uio); if (error == 0) { error = uiomove(dp->d_name + plen, nlen, uio->uio_rw, uio); } if (error != 0) break; } } } while (!eof && error == 0); vput(vp); ZFS_EXIT(zfsvfs); return (error); } int zfs_freebsd_getacl(ap) struct vop_getacl_args /* { struct vnode *vp; acl_type_t type; struct acl *aclp; struct ucred *cred; struct thread *td; } */ *ap; { int error; vsecattr_t vsecattr; if (ap->a_type != ACL_TYPE_NFS4) return (EINVAL); vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT; if ((error = zfs_getsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL)) != 0) return (error); error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, vsecattr.vsa_aclcnt); if (vsecattr.vsa_aclentp != NULL) kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz); return (error); } int zfs_freebsd_setacl(ap) struct vop_setacl_args /* { struct vnode *vp; acl_type_t type; struct acl *aclp; struct ucred *cred; struct thread *td; } */ *ap; { int error; vsecattr_t vsecattr; int aclbsize; /* size of acl list in bytes */ aclent_t *aaclp; if (ap->a_type != ACL_TYPE_NFS4) return (EINVAL); if (ap->a_aclp == NULL) return (EINVAL); if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES) return (EINVAL); /* * With NFSv4 ACLs, chmod(2) may need to add additional entries, * splitting every entry into two and appending "canonical six" * entries at the end. Don't allow for setting an ACL that would * cause chmod(2) to run out of ACL entries. */ if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES) return (ENOSPC); error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR); if (error != 0) return (error); vsecattr.vsa_mask = VSA_ACE; aclbsize = ap->a_aclp->acl_cnt * sizeof(ace_t); vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP); aaclp = vsecattr.vsa_aclentp; vsecattr.vsa_aclentsz = aclbsize; aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp); error = zfs_setsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL); kmem_free(aaclp, aclbsize); return (error); } int zfs_freebsd_aclcheck(ap) struct vop_aclcheck_args /* { struct vnode *vp; acl_type_t type; struct acl *aclp; struct ucred *cred; struct thread *td; } */ *ap; { return (EOPNOTSUPP); } static int zfs_vptocnp(struct vop_vptocnp_args *ap) { vnode_t *covered_vp; vnode_t *vp = ap->a_vp; zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; znode_t *zp = VTOZ(vp); int ltype; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); /* * If we are a snapshot mounted under .zfs, run the operation * on the covered vnode.
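* (For an ordinary znode the branch below rebuilds the last path component via zfs_znode_parent_and_name(); a snapshot root, e.g. one mounted at the hypothetical /tank/.zfs/snapshot/s1, instead falls through to VOP_VPTOCNP on the covered "s1" directory vnode.)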
*/ if (zp->z_id != zfsvfs->z_root || zfsvfs->z_parent == zfsvfs) { char name[MAXNAMLEN + 1]; znode_t *dzp; size_t len; error = zfs_znode_parent_and_name(zp, &dzp, name); if (error == 0) { len = strlen(name); if (*ap->a_buflen < len) error = SET_ERROR(ENOMEM); } if (error == 0) { *ap->a_buflen -= len; bcopy(name, ap->a_buf + *ap->a_buflen, len); *ap->a_vpp = ZTOV(dzp); } ZFS_EXIT(zfsvfs); return (error); } ZFS_EXIT(zfsvfs); covered_vp = vp->v_mount->mnt_vnodecovered; vhold(covered_vp); ltype = VOP_ISLOCKED(vp); VOP_UNLOCK(vp, 0); error = vget(covered_vp, LK_SHARED | LK_VNHELD, curthread); if (error == 0) { error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_cred, ap->a_buf, ap->a_buflen); vput(covered_vp); } vn_lock(vp, ltype | LK_RETRY); if ((vp->v_iflag & VI_DOOMED) != 0) error = SET_ERROR(ENOENT); return (error); } #ifdef DIAGNOSTIC static int zfs_lock(ap) struct vop_lock1_args /* { struct vnode *a_vp; int a_flags; char *file; int line; } */ *ap; { vnode_t *vp; znode_t *zp; int err; err = vop_stdlock(ap); if (err == 0 && (ap->a_flags & LK_NOWAIT) == 0) { vp = ap->a_vp; zp = vp->v_data; if (vp->v_mount != NULL && (vp->v_iflag & VI_DOOMED) == 0 && zp != NULL && (zp->z_pflags & ZFS_XATTR) == 0) VERIFY(!RRM_LOCK_HELD(&zp->z_zfsvfs->z_teardown_lock)); } return (err); } #endif struct vop_vector zfs_vnodeops; struct vop_vector zfs_fifoops; struct vop_vector zfs_shareops; struct vop_vector zfs_vnodeops = { .vop_default = &default_vnodeops, .vop_inactive = zfs_freebsd_inactive, .vop_reclaim = zfs_freebsd_reclaim, .vop_access = zfs_freebsd_access, .vop_lookup = zfs_cache_lookup, .vop_cachedlookup = zfs_freebsd_lookup, .vop_getattr = zfs_freebsd_getattr, .vop_setattr = zfs_freebsd_setattr, .vop_create = zfs_freebsd_create, .vop_mknod = zfs_freebsd_create, .vop_mkdir = zfs_freebsd_mkdir, .vop_readdir = zfs_freebsd_readdir, .vop_fsync = zfs_freebsd_fsync, .vop_open = zfs_freebsd_open, .vop_close = zfs_freebsd_close, .vop_rmdir = zfs_freebsd_rmdir, .vop_ioctl = zfs_freebsd_ioctl, .vop_link = zfs_freebsd_link, .vop_symlink = zfs_freebsd_symlink, .vop_readlink = zfs_freebsd_readlink, .vop_read = zfs_freebsd_read, .vop_write = zfs_freebsd_write, .vop_remove = zfs_freebsd_remove, .vop_rename = zfs_freebsd_rename, .vop_pathconf = zfs_freebsd_pathconf, .vop_bmap = zfs_freebsd_bmap, .vop_fid = zfs_freebsd_fid, .vop_getextattr = zfs_getextattr, .vop_deleteextattr = zfs_deleteextattr, .vop_setextattr = zfs_setextattr, .vop_listextattr = zfs_listextattr, .vop_getacl = zfs_freebsd_getacl, .vop_setacl = zfs_freebsd_setacl, .vop_aclcheck = zfs_freebsd_aclcheck, .vop_getpages = zfs_freebsd_getpages, .vop_putpages = zfs_freebsd_putpages, .vop_vptocnp = zfs_vptocnp, #ifdef DIAGNOSTIC .vop_lock1 = zfs_lock, #endif }; struct vop_vector zfs_fifoops = { .vop_default = &fifo_specops, .vop_fsync = zfs_freebsd_fsync, .vop_access = zfs_freebsd_access, .vop_getattr = zfs_freebsd_getattr, .vop_inactive = zfs_freebsd_inactive, .vop_read = VOP_PANIC, .vop_reclaim = zfs_freebsd_reclaim, .vop_setattr = zfs_freebsd_setattr, .vop_write = VOP_PANIC, .vop_pathconf = zfs_freebsd_pathconf, .vop_fid = zfs_freebsd_fid, .vop_getacl = zfs_freebsd_getacl, .vop_setacl = zfs_freebsd_setacl, .vop_aclcheck = zfs_freebsd_aclcheck, }; /* * special share hidden files vnode operations template */ struct vop_vector zfs_shareops = { .vop_default = &default_vnodeops, .vop_access = zfs_freebsd_access, .vop_inactive = zfs_freebsd_inactive, .vop_reclaim = zfs_freebsd_reclaim, .vop_fid = zfs_freebsd_fid, .vop_pathconf = zfs_freebsd_pathconf, }; Index: 
stable/11/sys/dev/drm/drmP.h =================================================================== --- stable/11/sys/dev/drm/drmP.h (revision 331016) +++ stable/11/sys/dev/drm/drmP.h (revision 331017) @@ -1,1010 +1,1011 @@ /* drmP.h -- Private header for Direct Rendering Manager -*- linux-c -*- * Created: Mon Jan 4 10:05:05 1999 by faith@precisioninsight.com */ /*- * Copyright 1999 Precision Insight, Inc., Cedar Park, Texas. * Copyright 2000 VA Linux Systems, Inc., Sunnyvale, California. * All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the next * paragraph) shall be included in all copies or substantial portions of the * Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * VA LINUX SYSTEMS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. * * Authors: * Rickard E. (Rik) Faith * Gareth Hughes * */ #include __FBSDID("$FreeBSD$"); #ifndef _DRM_P_H_ #define _DRM_P_H_ #if defined(_KERNEL) || defined(__KERNEL__) struct drm_device; struct drm_file; #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(__i386__) || defined(__amd64__) #include #endif #include #include #if _BYTE_ORDER == _BIG_ENDIAN #define __BIG_ENDIAN 1 #else #define __LITTLE_ENDIAN 1 #endif #include #include #include #include #include #include #include #include #include #include #include "dev/drm/drm.h" #include "dev/drm/drm_atomic.h" #include "dev/drm/drm_internal.h" #include "dev/drm/drm_linux_list.h" #include #ifdef DRM_DEBUG #undef DRM_DEBUG #define DRM_DEBUG_DEFAULT_ON 1 #endif /* DRM_DEBUG */ #if defined(DRM_LINUX) && DRM_LINUX && !defined(__amd64__) #include #include #include #include #else /* Either it was defined when it shouldn't be (FreeBSD amd64) or it isn't * supported on this OS yet. 
*/ #undef DRM_LINUX #define DRM_LINUX 0 #endif /* driver capabilities and requirements mask */ #define DRIVER_USE_AGP 0x1 #define DRIVER_REQUIRE_AGP 0x2 #define DRIVER_USE_MTRR 0x4 #define DRIVER_PCI_DMA 0x8 #define DRIVER_SG 0x10 #define DRIVER_HAVE_DMA 0x20 #define DRIVER_HAVE_IRQ 0x40 #define DRIVER_DMA_QUEUE 0x100 #define DRM_HASH_SIZE 16 /* Size of key hash table */ #define DRM_KERNEL_CONTEXT 0 /* Change drm_resctx if changed */ #define DRM_RESERVED_CONTEXTS 1 /* Change drm_resctx if changed */ MALLOC_DECLARE(DRM_MEM_DMA); MALLOC_DECLARE(DRM_MEM_SAREA); MALLOC_DECLARE(DRM_MEM_DRIVER); MALLOC_DECLARE(DRM_MEM_MAGIC); MALLOC_DECLARE(DRM_MEM_IOCTLS); MALLOC_DECLARE(DRM_MEM_MAPS); MALLOC_DECLARE(DRM_MEM_BUFS); MALLOC_DECLARE(DRM_MEM_SEGS); MALLOC_DECLARE(DRM_MEM_PAGES); MALLOC_DECLARE(DRM_MEM_FILES); MALLOC_DECLARE(DRM_MEM_QUEUES); MALLOC_DECLARE(DRM_MEM_CMDS); MALLOC_DECLARE(DRM_MEM_MAPPINGS); MALLOC_DECLARE(DRM_MEM_BUFLISTS); MALLOC_DECLARE(DRM_MEM_AGPLISTS); MALLOC_DECLARE(DRM_MEM_CTXBITMAP); MALLOC_DECLARE(DRM_MEM_SGLISTS); MALLOC_DECLARE(DRM_MEM_DRAWABLE); MALLOC_DECLARE(DRM_MEM_MM); MALLOC_DECLARE(DRM_MEM_HASHTAB); SYSCTL_DECL(_hw_drm); #define DRM_MAX_CTXBITMAP (PAGE_SIZE * 8) /* Internal types and structures */ #define DRM_ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0])) #define DRM_MIN(a,b) ((a)<(b)?(a):(b)) #define DRM_MAX(a,b) ((a)>(b)?(a):(b)) #define DRM_IF_VERSION(maj, min) (maj << 16 | min) #define __OS_HAS_AGP 1 #define DRM_DEV_MODE (S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP) #define DRM_DEV_UID UID_ROOT #define DRM_DEV_GID GID_VIDEO #define wait_queue_head_t atomic_t #define DRM_WAKEUP(w) wakeup((void *)w) #define DRM_WAKEUP_INT(w) wakeup(w) #define DRM_INIT_WAITQUEUE(queue) do {(void)(queue);} while (0) #define DRM_CURPROC curthread #define DRM_STRUCTPROC struct thread #define DRM_SPINTYPE struct mtx #define DRM_SPININIT(l,name) mtx_init(l, name, NULL, MTX_DEF) #define DRM_SPINUNINIT(l) mtx_destroy(l) #define DRM_SPINLOCK(l) mtx_lock(l) #define DRM_SPINUNLOCK(u) mtx_unlock(u) #define DRM_SPINLOCK_IRQSAVE(l, irqflags) do { \ mtx_lock(l); \ (void)irqflags; \ } while (0) #define DRM_SPINUNLOCK_IRQRESTORE(u, irqflags) mtx_unlock(u) #define DRM_SPINLOCK_ASSERT(l) mtx_assert(l, MA_OWNED) #define DRM_CURRENTPID curthread->td_proc->p_pid #define DRM_LOCK() mtx_lock(&dev->dev_lock) #define DRM_UNLOCK() mtx_unlock(&dev->dev_lock) #define DRM_SYSCTL_HANDLER_ARGS (SYSCTL_HANDLER_ARGS) #define DRM_IRQ_ARGS void *arg typedef void irqreturn_t; #define IRQ_HANDLED /* nothing */ #define IRQ_NONE /* nothing */ #define unlikely(x) __builtin_expect(!!(x), 0) #define container_of(ptr, type, member) ({ \ __typeof( ((type *)0)->member ) *__mptr = (ptr); \ (type *)( (char *)__mptr - offsetof(type,member) );}) enum { DRM_IS_NOT_AGP, DRM_IS_AGP, DRM_MIGHT_BE_AGP }; #define DRM_AGP_MEM struct agp_memory_info #define drm_get_device_from_kdev(_kdev) (_kdev->si_drv1) #define PAGE_ALIGN(addr) round_page(addr) /* DRM_SUSER returns true if the user is superuser */ #define DRM_SUSER(p) (priv_check(p, PRIV_DRIVER) == 0) #define DRM_AGP_FIND_DEVICE() agp_find_device() #define DRM_MTRR_WC MDF_WRITECOMBINE #define jiffies ticks typedef vm_paddr_t dma_addr_t; typedef u_int64_t u64; typedef u_int32_t u32; typedef u_int16_t u16; typedef u_int8_t u8; /* DRM_READMEMORYBARRIER() prevents reordering of reads. * DRM_WRITEMEMORYBARRIER() prevents reordering of writes. * DRM_MEMORYBARRIER() prevents reordering of reads and writes. 
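* A hedged usage sketch (descriptor and register names assumed, not from this header): a driver publishing a DMA descriptor before ringing a doorbell would write desc->busaddr = addr; DRM_WRITEMEMORYBARRIER(); DRM_WRITE32(map, FOO_DOORBELL_REG, seq); so the descriptor store is ordered before the register write.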
*/ #define DRM_READMEMORYBARRIER() rmb() #define DRM_WRITEMEMORYBARRIER() wmb() #define DRM_MEMORYBARRIER() mb() #define DRM_READ8(map, offset) \ *(volatile u_int8_t *)(((vm_offset_t)(map)->virtual) + \ (vm_offset_t)(offset)) #define DRM_READ16(map, offset) \ le16toh(*(volatile u_int16_t *)(((vm_offset_t)(map)->virtual) + \ (vm_offset_t)(offset))) #define DRM_READ32(map, offset) \ le32toh(*(volatile u_int32_t *)(((vm_offset_t)(map)->virtual) + \ (vm_offset_t)(offset))) #define DRM_WRITE8(map, offset, val) \ *(volatile u_int8_t *)(((vm_offset_t)(map)->virtual) + \ (vm_offset_t)(offset)) = val #define DRM_WRITE16(map, offset, val) \ *(volatile u_int16_t *)(((vm_offset_t)(map)->virtual) + \ (vm_offset_t)(offset)) = htole16(val) #define DRM_WRITE32(map, offset, val) \ *(volatile u_int32_t *)(((vm_offset_t)(map)->virtual) + \ (vm_offset_t)(offset)) = htole32(val) #define DRM_VERIFYAREA_READ( uaddr, size ) \ (!useracc(__DECONST(caddr_t, uaddr), size, VM_PROT_READ)) #define DRM_COPY_TO_USER(user, kern, size) \ copyout(kern, user, size) #define DRM_COPY_FROM_USER(kern, user, size) \ copyin(user, kern, size) #define DRM_COPY_FROM_USER_UNCHECKED(arg1, arg2, arg3) \ copyin(arg2, arg1, arg3) #define DRM_COPY_TO_USER_UNCHECKED(arg1, arg2, arg3) \ copyout(arg2, arg1, arg3) #define DRM_GET_USER_UNCHECKED(val, uaddr) \ ((val) = fuword32(uaddr), 0) #define cpu_to_le32(x) htole32(x) #define le32_to_cpu(x) le32toh(x) #define DRM_HZ hz #define DRM_UDELAY(udelay) DELAY(udelay) #define DRM_TIME_SLICE (hz/20) /* Time slice for GLXContexts */ #define DRM_GET_PRIV_SAREA(_dev, _ctx, _map) do { \ (_map) = (_dev)->context_sareas[_ctx]; \ } while(0) #define LOCK_TEST_WITH_RETURN(dev, file_priv) \ do { \ if (!_DRM_LOCK_IS_HELD(dev->lock.hw_lock->lock) || \ dev->lock.file_priv != file_priv) { \ DRM_ERROR("%s called without lock held\n", \ __FUNCTION__); \ return EINVAL; \ } \ } while (0) /* Returns -errno to shared code */ #define DRM_WAIT_ON( ret, queue, timeout, condition ) \ for ( ret = 0 ; !ret && !(condition) ; ) { \ DRM_UNLOCK(); \ mtx_lock(&dev->irq_lock); \ if (!(condition)) \ ret = -mtx_sleep(&(queue), &dev->irq_lock, \ PCATCH, "drmwtq", (timeout)); \ mtx_unlock(&dev->irq_lock); \ DRM_LOCK(); \ } #define DRM_ERROR(fmt, ...) \ printf("error: [" DRM_NAME ":pid%d:%s] *ERROR* " fmt, \ DRM_CURRENTPID, __func__ , ##__VA_ARGS__) #define DRM_INFO(fmt, ...) printf("info: [" DRM_NAME "] " fmt , ##__VA_ARGS__) #define DRM_DEBUG(fmt, ...) do { \ if (drm_debug_flag) \ printf("[" DRM_NAME ":pid%d:%s] " fmt, DRM_CURRENTPID, \ __func__ , ##__VA_ARGS__); \ } while (0) typedef struct drm_pci_id_list { int vendor; int device; long driver_private; char *name; } drm_pci_id_list_t; struct drm_msi_blacklist_entry { int vendor; int device; }; #define DRM_AUTH 0x1 #define DRM_MASTER 0x2 #define DRM_ROOT_ONLY 0x4 typedef struct drm_ioctl_desc { unsigned long cmd; int (*func)(struct drm_device *dev, void *data, struct drm_file *file_priv); int flags; } drm_ioctl_desc_t; /** * Creates a driver or general drm_ioctl_desc array entry for the given * ioctl, for use by drm_ioctl(). 
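* Illustrative entry in a hypothetical driver table: static drm_ioctl_desc_t foo_ioctls[] = { DRM_IOCTL_DEF(DRM_IOCTL_FOO_INIT, foo_init_ioctl, DRM_AUTH|DRM_MASTER) }; drm_ioctl() selects the entry via DRM_IOCTL_NR(cmd) and enforces the flags before dispatching to foo_init_ioctl().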
*/ #define DRM_IOCTL_DEF(ioctl, func, flags) \ [DRM_IOCTL_NR(ioctl)] = {ioctl, func, flags} typedef struct drm_magic_entry { drm_magic_t magic; struct drm_file *priv; struct drm_magic_entry *next; } drm_magic_entry_t; typedef struct drm_magic_head { struct drm_magic_entry *head; struct drm_magic_entry *tail; } drm_magic_head_t; typedef struct drm_buf { int idx; /* Index into master buflist */ int total; /* Buffer size */ int order; /* log-base-2(total) */ int used; /* Amount of buffer in use (for DMA) */ unsigned long offset; /* Byte offset (used internally) */ void *address; /* Address of buffer */ unsigned long bus_address; /* Bus address of buffer */ struct drm_buf *next; /* Kernel-only: used for free list */ __volatile__ int pending; /* On hardware DMA queue */ struct drm_file *file_priv; /* Unique identifier of holding process */ int context; /* Kernel queue for this buffer */ enum { DRM_LIST_NONE = 0, DRM_LIST_FREE = 1, DRM_LIST_WAIT = 2, DRM_LIST_PEND = 3, DRM_LIST_PRIO = 4, DRM_LIST_RECLAIM = 5 } list; /* Which list we're on */ int dev_priv_size; /* Size of buffer private stoarge */ void *dev_private; /* Per-buffer private storage */ } drm_buf_t; typedef struct drm_freelist { int initialized; /* Freelist in use */ atomic_t count; /* Number of free buffers */ drm_buf_t *next; /* End pointer */ int low_mark; /* Low water mark */ int high_mark; /* High water mark */ } drm_freelist_t; typedef struct drm_dma_handle { void *vaddr; bus_addr_t busaddr; bus_dma_tag_t tag; bus_dmamap_t map; } drm_dma_handle_t; typedef struct drm_buf_entry { int buf_size; int buf_count; drm_buf_t *buflist; int seg_count; drm_dma_handle_t **seglist; int page_order; drm_freelist_t freelist; } drm_buf_entry_t; typedef TAILQ_HEAD(drm_file_list, drm_file) drm_file_list_t; struct drm_file { TAILQ_ENTRY(drm_file) link; struct drm_device *dev; int authenticated; int master; pid_t pid; uid_t uid; drm_magic_t magic; unsigned long ioctl_count; void *driver_priv; }; typedef struct drm_lock_data { struct drm_hw_lock *hw_lock; /* Hardware lock */ struct drm_file *file_priv; /* Unique identifier of holding process (NULL is kernel)*/ int lock_queue; /* Queue of blocked processes */ unsigned long lock_time; /* Time of last lock in jiffies */ } drm_lock_data_t; /* This structure, in the struct drm_device, is always initialized while the * device * is open. dev->dma_lock protects the incrementing of dev->buf_use, which * when set marks that no further bufs may be allocated until device teardown * occurs (when the last open of the device has closed). The high/low * watermarks of bufs are only touched by the X Server, and thus not * concurrently accessed, so no locking is needed. 
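* (Sketch of the implied protocol, assuming the drm_bufs.c helpers: a buffer-mapping path does mtx_lock(&dev->dma_lock); dev->buf_use++; mtx_unlock(&dev->dma_lock); and allocation paths that then observe buf_use != 0 refuse to create new buffers until teardown.)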
*/ typedef struct drm_device_dma { drm_buf_entry_t bufs[DRM_MAX_ORDER+1]; int buf_count; drm_buf_t **buflist; /* Vector of pointers info bufs */ int seg_count; int page_count; unsigned long *pagelist; unsigned long byte_count; enum { _DRM_DMA_USE_AGP = 0x01, _DRM_DMA_USE_SG = 0x02 } flags; } drm_device_dma_t; typedef struct drm_agp_mem { void *handle; unsigned long bound; /* address */ int pages; struct drm_agp_mem *prev; struct drm_agp_mem *next; } drm_agp_mem_t; typedef struct drm_agp_head { device_t agpdev; struct agp_info info; const char *chipset; drm_agp_mem_t *memory; unsigned long mode; int enabled; int acquired; unsigned long base; int mtrr; int cant_use_aperture; unsigned long page_mask; } drm_agp_head_t; typedef struct drm_sg_mem { vm_offset_t vaddr; vm_paddr_t *busaddr; vm_pindex_t pages; } drm_sg_mem_t; #define DRM_MAP_HANDLE_BITS (sizeof(void *) == 4 ? 4 : 24) #define DRM_MAP_HANDLE_SHIFT (sizeof(void *) * 8 - DRM_MAP_HANDLE_BITS) typedef TAILQ_HEAD(drm_map_list, drm_local_map) drm_map_list_t; typedef struct drm_local_map { unsigned long offset; /* Physical address (0 for SAREA) */ unsigned long size; /* Physical size (bytes) */ enum drm_map_type type; /* Type of memory mapped */ enum drm_map_flags flags; /* Flags */ void *handle; /* User-space: "Handle" to pass to mmap */ /* Kernel-space: kernel-virtual address */ int mtrr; /* Boolean: MTRR used */ /* Private data */ int rid; /* PCI resource ID for bus_space */ void *virtual; /* Kernel-space: kernel-virtual address */ struct resource *bsr; bus_space_tag_t bst; bus_space_handle_t bsh; drm_dma_handle_t *dmah; TAILQ_ENTRY(drm_local_map) link; } drm_local_map_t; struct drm_vblank_info { wait_queue_head_t queue; /* vblank wait queue */ atomic_t count; /* number of VBLANK interrupts */ /* (driver must alloc the right number of counters) */ atomic_t refcount; /* number of users of vblank interrupts */ u32 last; /* protected by dev->vbl_lock, used */ /* for wraparound handling */ int enabled; /* so we don't call enable more than */ /* once per disable */ int inmodeset; /* Display driver is setting mode */ }; /* location of GART table */ #define DRM_ATI_GART_MAIN 1 #define DRM_ATI_GART_FB 2 #define DRM_ATI_GART_PCI 1 #define DRM_ATI_GART_PCIE 2 #define DRM_ATI_GART_IGP 3 struct drm_ati_pcigart_info { int gart_table_location; int gart_reg_if; void *addr; dma_addr_t bus_addr; dma_addr_t table_mask; dma_addr_t member_mask; struct drm_dma_handle *table_handle; drm_local_map_t mapping; int table_size; struct drm_dma_handle *dmah; /* handle for ATI PCIGART table */ }; #ifndef DMA_BIT_MASK #define DMA_BIT_MASK(n) (((n) == 64) ? 
~0ULL : (1ULL<<(n)) - 1) #endif #define upper_32_bits(n) ((u32)(((n) >> 16) >> 16)) struct drm_driver_info { int (*load)(struct drm_device *, unsigned long flags); int (*firstopen)(struct drm_device *); int (*open)(struct drm_device *, struct drm_file *); void (*preclose)(struct drm_device *, struct drm_file *file_priv); void (*postclose)(struct drm_device *, struct drm_file *); void (*lastclose)(struct drm_device *); int (*unload)(struct drm_device *); void (*reclaim_buffers_locked)(struct drm_device *, struct drm_file *file_priv); int (*dma_ioctl)(struct drm_device *dev, void *data, struct drm_file *file_priv); void (*dma_ready)(struct drm_device *); int (*dma_quiescent)(struct drm_device *); int (*dma_flush_block_and_flush)(struct drm_device *, int context, enum drm_lock_flags flags); int (*dma_flush_unblock)(struct drm_device *, int context, enum drm_lock_flags flags); int (*context_ctor)(struct drm_device *dev, int context); int (*context_dtor)(struct drm_device *dev, int context); int (*kernel_context_switch)(struct drm_device *dev, int old, int new); int (*kernel_context_switch_unlock)(struct drm_device *dev); void (*irq_preinstall)(struct drm_device *dev); int (*irq_postinstall)(struct drm_device *dev); void (*irq_uninstall)(struct drm_device *dev); void (*irq_handler)(DRM_IRQ_ARGS); u32 (*get_vblank_counter)(struct drm_device *dev, int crtc); int (*enable_vblank)(struct drm_device *dev, int crtc); void (*disable_vblank)(struct drm_device *dev, int crtc); drm_pci_id_list_t *id_entry; /* PCI ID, name, and chipset private */ /** * Called by \c drm_device_is_agp. Typically used to determine if a * card is really attached to AGP or not. * * \param dev DRM device handle * * \returns * One of three values is returned depending on whether or not the * card is absolutely \b not AGP (return of 0), absolutely \b is AGP * (return of 1), or may or may not be AGP (return of 2). */ int (*device_is_agp) (struct drm_device * dev); drm_ioctl_desc_t *ioctls; int max_ioctl; int buf_priv_size; int major; int minor; int patchlevel; const char *name; /* Simple driver name */ const char *desc; /* Longer driver name */ const char *date; /* Date of last major changes. */ u32 driver_features; }; /* Length for the array of resource pointers for drm_get_resource_*. */ #define DRM_MAX_PCI_RESOURCE 6 /** * DRM device functions structure */ struct drm_device { struct drm_driver_info *driver; drm_pci_id_list_t *id_entry; /* PCI ID, name, and chipset private */ u_int16_t pci_device; /* PCI device id */ u_int16_t pci_vendor; /* PCI vendor id */ char *unique; /* Unique identifier: e.g., busid */ int unique_len; /* Length of unique field */ device_t device; /* Device instance from newbus */ struct cdev *devnode; /* Device number for mknod */ int if_version; /* Highest interface version set */ int flags; /* Flags to open(2) */ /* Locks */ struct mtx vbl_lock; /* protects vblank operations */ struct mtx dma_lock; /* protects dev->dma */ struct mtx irq_lock; /* protects irq condition checks */ struct mtx dev_lock; /* protects everything else */ DRM_SPINTYPE drw_lock; /* Usage Counters */ int open_count; /* Outstanding files open */ int buf_use; /* Buffers in use -- cannot alloc */ /* Performance counters */ unsigned long counters; enum drm_stat_type types[15]; atomic_t counts[15]; /* Authentication */ drm_file_list_t files; drm_magic_head_t magiclist[DRM_HASH_SIZE]; /* Linked list of mappable regions. 
Protected by dev_lock */ drm_map_list_t maplist; struct unrhdr *map_unrhdr; drm_local_map_t **context_sareas; int max_context; drm_lock_data_t lock; /* Information on hardware lock */ /* DMA queues (contexts) */ drm_device_dma_t *dma; /* Optional pointer for DMA support */ /* Context support */ int irq; /* Interrupt used by board */ int irq_enabled; /* True if the irq handler is enabled */ int msi_enabled; /* MSI enabled */ int irqrid; /* Interrupt used by board */ struct resource *irqr; /* Resource for interrupt used by board */ void *irqh; /* Handle from bus_setup_intr */ /* Storage of resource pointers for drm_get_resource_* */ struct resource *pcir[DRM_MAX_PCI_RESOURCE]; int pcirid[DRM_MAX_PCI_RESOURCE]; int pci_domain; int pci_bus; int pci_slot; int pci_func; atomic_t context_flag; /* Context swapping flag */ int last_context; /* Last current context */ int vblank_disable_allowed; struct callout vblank_disable_timer; u32 max_vblank_count; /* size of vblank counter register */ struct drm_vblank_info *vblank; /* per crtc vblank info */ int num_crtcs; struct sigio *buf_sigio; /* Processes waiting for SIGIO */ /* Sysctl support */ struct drm_sysctl_info *sysctl; drm_agp_head_t *agp; drm_sg_mem_t *sg; /* Scatter gather memory */ atomic_t *ctx_bitmap; void *dev_private; unsigned int agp_buffer_token; drm_local_map_t *agp_buffer_map; struct unrhdr *drw_unrhdr; /* RB tree of drawable infos */ RB_HEAD(drawable_tree, bsd_drm_drawable_info) drw_head; }; static __inline__ int drm_core_check_feature(struct drm_device *dev, int feature) { return ((dev->driver->driver_features & feature) ? 1 : 0); } #if __OS_HAS_AGP static inline int drm_core_has_AGP(struct drm_device *dev) { return drm_core_check_feature(dev, DRIVER_USE_AGP); } #else #define drm_core_has_AGP(dev) (0) #endif extern int drm_debug_flag; /* Device setup support (drm_drv.c) */ int drm_probe(device_t kdev, drm_pci_id_list_t *idlist); int drm_attach(device_t kdev, drm_pci_id_list_t *idlist); void drm_close(void *data); int drm_detach(device_t kdev); d_ioctl_t drm_ioctl; d_open_t drm_open; d_read_t drm_read; d_poll_t drm_poll; d_mmap_t drm_mmap; extern drm_local_map_t *drm_getsarea(struct drm_device *dev); /* File operations helpers (drm_fops.c) */ extern int drm_open_helper(struct cdev *kdev, int flags, int fmt, DRM_STRUCTPROC *p, struct drm_device *dev); /* Memory management support (drm_memory.c) */ void drm_mem_init(void); void drm_mem_uninit(void); void *drm_ioremap_wc(struct drm_device *dev, drm_local_map_t *map); void *drm_ioremap(struct drm_device *dev, drm_local_map_t *map); void drm_ioremapfree(drm_local_map_t *map); int drm_mtrr_add(unsigned long offset, size_t size, int flags); int drm_mtrr_del(int handle, unsigned long offset, size_t size, int flags); int drm_context_switch(struct drm_device *dev, int old, int new); int drm_context_switch_complete(struct drm_device *dev, int new); int drm_ctxbitmap_init(struct drm_device *dev); void drm_ctxbitmap_cleanup(struct drm_device *dev); void drm_ctxbitmap_free(struct drm_device *dev, int ctx_handle); int drm_ctxbitmap_next(struct drm_device *dev); /* Locking IOCTL support (drm_lock.c) */ int drm_lock_take(struct drm_lock_data *lock_data, unsigned int context); int drm_lock_transfer(struct drm_lock_data *lock_data, unsigned int context); int drm_lock_free(struct drm_lock_data *lock_data, unsigned int context); /* Buffer management support (drm_bufs.c) */ unsigned long drm_get_resource_start(struct drm_device *dev, unsigned int resource); unsigned long drm_get_resource_len(struct 
drm_device *dev, unsigned int resource); void drm_rmmap(struct drm_device *dev, drm_local_map_t *map); int drm_order(unsigned long size); int drm_addmap(struct drm_device *dev, unsigned long offset, unsigned long size, enum drm_map_type type, enum drm_map_flags flags, drm_local_map_t **map_ptr); int drm_addbufs_pci(struct drm_device *dev, struct drm_buf_desc *request); int drm_addbufs_sg(struct drm_device *dev, struct drm_buf_desc *request); int drm_addbufs_agp(struct drm_device *dev, struct drm_buf_desc *request); /* DMA support (drm_dma.c) */ int drm_dma_setup(struct drm_device *dev); void drm_dma_takedown(struct drm_device *dev); void drm_free_buffer(struct drm_device *dev, drm_buf_t *buf); void drm_reclaim_buffers(struct drm_device *dev, struct drm_file *file_priv); #define drm_core_reclaim_buffers drm_reclaim_buffers /* IRQ support (drm_irq.c) */ int drm_irq_install(struct drm_device *dev); int drm_irq_uninstall(struct drm_device *dev); irqreturn_t drm_irq_handler(DRM_IRQ_ARGS); void drm_driver_irq_preinstall(struct drm_device *dev); void drm_driver_irq_postinstall(struct drm_device *dev); void drm_driver_irq_uninstall(struct drm_device *dev); void drm_handle_vblank(struct drm_device *dev, int crtc); u32 drm_vblank_count(struct drm_device *dev, int crtc); int drm_vblank_get(struct drm_device *dev, int crtc); void drm_vblank_put(struct drm_device *dev, int crtc); void drm_vblank_cleanup(struct drm_device *dev); int drm_vblank_wait(struct drm_device *dev, unsigned int *vbl_seq); int drm_vblank_init(struct drm_device *dev, int num_crtcs); int drm_modeset_ctl(struct drm_device *dev, void *data, struct drm_file *file_priv); /* AGP/PCI Express/GART support (drm_agpsupport.c) */ int drm_device_is_agp(struct drm_device *dev); int drm_device_is_pcie(struct drm_device *dev); drm_agp_head_t *drm_agp_init(void); int drm_agp_acquire(struct drm_device *dev); int drm_agp_release(struct drm_device *dev); int drm_agp_info(struct drm_device * dev, struct drm_agp_info *info); int drm_agp_enable(struct drm_device *dev, struct drm_agp_mode mode); void *drm_agp_allocate_memory(size_t pages, u32 type); int drm_agp_free_memory(void *handle); int drm_agp_bind_memory(void *handle, off_t start); int drm_agp_unbind_memory(void *handle); int drm_agp_alloc(struct drm_device *dev, struct drm_agp_buffer *request); int drm_agp_free(struct drm_device *dev, struct drm_agp_buffer *request); int drm_agp_bind(struct drm_device *dev, struct drm_agp_binding *request); int drm_agp_unbind(struct drm_device *dev, struct drm_agp_binding *request); /* Scatter Gather Support (drm_scatter.c) */ void drm_sg_cleanup(drm_sg_mem_t *entry); int drm_sg_alloc(struct drm_device *dev, struct drm_scatter_gather * request); /* sysctl support (drm_sysctl.h) */ extern int drm_sysctl_init(struct drm_device *dev); extern int drm_sysctl_cleanup(struct drm_device *dev); /* ATI PCIGART support (ati_pcigart.c) */ int drm_ati_pcigart_init(struct drm_device *dev, struct drm_ati_pcigart_info *gart_info); int drm_ati_pcigart_cleanup(struct drm_device *dev, struct drm_ati_pcigart_info *gart_info); /* Locking IOCTL support (drm_drv.c) */ int drm_lock(struct drm_device *dev, void *data, struct drm_file *file_priv); int drm_unlock(struct drm_device *dev, void *data, struct drm_file *file_priv); int drm_version(struct drm_device *dev, void *data, struct drm_file *file_priv); int drm_setversion(struct drm_device *dev, void *data, struct drm_file *file_priv); /* Misc. 
IOCTL support (drm_ioctl.c) */ int drm_irq_by_busid(struct drm_device *dev, void *data, struct drm_file *file_priv); int drm_getunique(struct drm_device *dev, void *data, struct drm_file *file_priv); int drm_setunique(struct drm_device *dev, void *data, struct drm_file *file_priv); int drm_getmap(struct drm_device *dev, void *data, struct drm_file *file_priv); int drm_getclient(struct drm_device *dev, void *data, struct drm_file *file_priv); int drm_getstats(struct drm_device *dev, void *data, struct drm_file *file_priv); int drm_noop(struct drm_device *dev, void *data, struct drm_file *file_priv); /* Context IOCTL support (drm_context.c) */ int drm_resctx(struct drm_device *dev, void *data, struct drm_file *file_priv); int drm_addctx(struct drm_device *dev, void *data, struct drm_file *file_priv); int drm_modctx(struct drm_device *dev, void *data, struct drm_file *file_priv); int drm_getctx(struct drm_device *dev, void *data, struct drm_file *file_priv); int drm_switchctx(struct drm_device *dev, void *data, struct drm_file *file_priv); int drm_newctx(struct drm_device *dev, void *data, struct drm_file *file_priv); int drm_rmctx(struct drm_device *dev, void *data, struct drm_file *file_priv); int drm_setsareactx(struct drm_device *dev, void *data, struct drm_file *file_priv); int drm_getsareactx(struct drm_device *dev, void *data, struct drm_file *file_priv); /* Drawable IOCTL support (drm_drawable.c) */ int drm_adddraw(struct drm_device *dev, void *data, struct drm_file *file_priv); int drm_rmdraw(struct drm_device *dev, void *data, struct drm_file *file_priv); int drm_update_draw(struct drm_device *dev, void *data, struct drm_file *file_priv); struct drm_drawable_info *drm_get_drawable_info(struct drm_device *dev, int handle); /* Drawable support (drm_drawable.c) */ void drm_drawable_free_all(struct drm_device *dev); /* Authentication IOCTL support (drm_auth.c) */ int drm_getmagic(struct drm_device *dev, void *data, struct drm_file *file_priv); int drm_authmagic(struct drm_device *dev, void *data, struct drm_file *file_priv); /* Buffer management support (drm_bufs.c) */ int drm_addmap_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); int drm_rmmap_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); int drm_addbufs(struct drm_device *dev, void *data, struct drm_file *file_priv); int drm_infobufs(struct drm_device *dev, void *data, struct drm_file *file_priv); int drm_markbufs(struct drm_device *dev, void *data, struct drm_file *file_priv); int drm_freebufs(struct drm_device *dev, void *data, struct drm_file *file_priv); int drm_mapbufs(struct drm_device *dev, void *data, struct drm_file *file_priv); /* DMA support (drm_dma.c) */ int drm_dma(struct drm_device *dev, void *data, struct drm_file *file_priv); /* IRQ support (drm_irq.c) */ int drm_control(struct drm_device *dev, void *data, struct drm_file *file_priv); int drm_wait_vblank(struct drm_device *dev, void *data, struct drm_file *file_priv); /* AGP/GART support (drm_agpsupport.c) */ int drm_agp_acquire_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); int drm_agp_release_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); int drm_agp_enable_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); int drm_agp_info_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); int drm_agp_alloc_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); int drm_agp_free_ioctl(struct drm_device *dev, void *data, struct 
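/*
 * A short worked example for drm_order() declared above: it computes
 * ceil(log2(size)), the smallest order with (1 << order) >= size, so
 * drm_order(4096) is 12 and drm_order(4097) is 13.  Buffer-allocation
 * code typically rounds a requested size up to a power of two with:
 *
 *	order = drm_order(request->size);
 *	total = 1 << order;
 *
 * which is also how struct drm_buf's order and total fields relate.
 */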
drm_file *file_priv); int drm_agp_unbind_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); int drm_agp_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); /* Scatter Gather Support (drm_scatter.c) */ int drm_sg_alloc_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); int drm_sg_free(struct drm_device *dev, void *data, struct drm_file *file_priv); /* consistent PCI memory functions (drm_pci.c) */ drm_dma_handle_t *drm_pci_alloc(struct drm_device *dev, size_t size, size_t align, dma_addr_t maxaddr); void drm_pci_free(struct drm_device *dev, drm_dma_handle_t *dmah); /* Inline replacements for drm_alloc and friends */ static __inline__ void * drm_alloc(size_t size, struct malloc_type *area) { return malloc(size, area, M_NOWAIT); } static __inline__ void * drm_calloc(size_t nmemb, size_t size, struct malloc_type *area) { return malloc(size * nmemb, area, M_NOWAIT | M_ZERO); } static __inline__ void * drm_realloc(void *oldpt, size_t oldsize, size_t size, struct malloc_type *area) { return reallocf(oldpt, size, area, M_NOWAIT); } static __inline__ void drm_free(void *pt, size_t size, struct malloc_type *area) { free(pt, area); } /* Inline replacements for DRM_IOREMAP macros */ static __inline__ void drm_core_ioremap_wc(struct drm_local_map *map, struct drm_device *dev) { map->virtual = drm_ioremap_wc(dev, map); } static __inline__ void drm_core_ioremap(struct drm_local_map *map, struct drm_device *dev) { map->virtual = drm_ioremap(dev, map); } static __inline__ void drm_core_ioremapfree(struct drm_local_map *map, struct drm_device *dev) { if ( map->virtual && map->size ) drm_ioremapfree(map); } static __inline__ struct drm_local_map * drm_core_findmap(struct drm_device *dev, unsigned long offset) { drm_local_map_t *map; DRM_SPINLOCK_ASSERT(&dev->dev_lock); TAILQ_FOREACH(map, &dev->maplist, link) { if (offset == (unsigned long)map->handle) return map; } return NULL; } static __inline__ void drm_core_dropmap(struct drm_map *map) { } #endif /* __KERNEL__ */ #endif /* _DRM_P_H_ */ Index: stable/11/sys/dev/drm2/drmP.h =================================================================== --- stable/11/sys/dev/drm2/drmP.h (revision 331016) +++ stable/11/sys/dev/drm2/drmP.h (revision 331017) @@ -1,1955 +1,1956 @@ /** * \file drmP.h * Private header for Direct Rendering Manager * * \author Rickard E. (Rik) Faith * \author Gareth Hughes */ /* * Copyright 1999 Precision Insight, Inc., Cedar Park, Texas. * Copyright 2000 VA Linux Systems, Inc., Sunnyvale, California. * Copyright (c) 2009-2010, Code Aurora Forum. * All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the next * paragraph) shall be included in all copies or substantial portions of the * Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL * VA LINUX SYSTEMS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. */ #include __FBSDID("$FreeBSD$"); #ifndef _DRM_P_H_ #define _DRM_P_H_ #if defined(_KERNEL) || defined(__KERNEL__) #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(__i386__) || defined(__amd64__) #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(CONFIG_AGP) || (defined(CONFIG_AGP_MODULE) && defined(MODULE)) #define __OS_HAS_AGP 1 #else #define __OS_HAS_AGP 0 #endif #if defined(CONFIG_MTRR) #define __OS_HAS_MTRR 1 #else #define __OS_HAS_MTRR 0 #endif struct drm_file; struct drm_device; #include #include #include "opt_compat.h" #include "opt_drm.h" #include "opt_syscons.h" #ifdef DRM_DEBUG #undef DRM_DEBUG #define DRM_DEBUG_DEFAULT_ON 1 #endif /* DRM_DEBUG */ #define DRM_DEBUGBITS_DEBUG 0x1 #define DRM_DEBUGBITS_KMS 0x2 #define DRM_DEBUGBITS_FAILED_IOCTL 0x4 #undef DRM_LINUX #define DRM_LINUX 0 /***********************************************************************/ /** \name DRM template customization defaults */ /*@{*/ /* driver capabilities and requirements mask */ #define DRIVER_USE_AGP 0x1 #define DRIVER_REQUIRE_AGP 0x2 #define DRIVER_USE_MTRR 0x4 #define DRIVER_PCI_DMA 0x8 #define DRIVER_SG 0x10 #define DRIVER_HAVE_DMA 0x20 #define DRIVER_HAVE_IRQ 0x40 #define DRIVER_IRQ_SHARED 0x80 #define DRIVER_IRQ_VBL 0x100 #define DRIVER_DMA_QUEUE 0x200 #define DRIVER_FB_DMA 0x400 #define DRIVER_IRQ_VBL2 0x800 #define DRIVER_GEM 0x1000 #define DRIVER_MODESET 0x2000 #define DRIVER_PRIME 0x4000 #define DRIVER_BUS_PCI 0x1 #define DRIVER_BUS_PLATFORM 0x2 #define DRIVER_BUS_USB 0x3 /***********************************************************************/ /** \name Begin the DRM... */ /*@{*/ #define DRM_DEBUG_CODE 2 /**< Include debugging code if > 1, then also include looping detection. */ #define DRM_MAGIC_HASH_ORDER 4 /**< Size of key hash table. Must be power of 2. */ #define DRM_KERNEL_CONTEXT 0 /**< Change drm_resctx if changed */ #define DRM_RESERVED_CONTEXTS 1 /**< Change drm_resctx if changed */ #define DRM_LOOPING_LIMIT 5000000 #define DRM_TIME_SLICE (HZ/20) /**< Time slice for GLXContexts */ #define DRM_LOCK_SLICE 1 /**< Time slice for lock, in jiffies */ #define DRM_FLAG_DEBUG 0x01 #define DRM_MAX_CTXBITMAP (PAGE_SIZE * 8) #define DRM_MAP_HASH_OFFSET 0x10000000 /*@}*/ /***********************************************************************/ /** \name Macros to make printk easier */ /*@{*/ /** * Error output. * * \param fmt printf() like format string. * \param arg arguments */ #define DRM_ERROR(fmt, ...) \ printf("error: [" DRM_NAME ":pid%d:%s] *ERROR* " fmt, \ DRM_CURRENTPID, __func__ , ##__VA_ARGS__) #define DRM_WARNING(fmt, ...) printf("warning: [" DRM_NAME "] " fmt , ##__VA_ARGS__) #define DRM_INFO(fmt, ...) printf("info: [" DRM_NAME "] " fmt , ##__VA_ARGS__) /** * Debug output. * * \param fmt printf() like format string. 
* \param arg arguments */ #define DRM_DEBUG(fmt, ...) do { \ if ((drm_debug & DRM_DEBUGBITS_DEBUG) != 0) \ printf("[" DRM_NAME ":pid%d:%s] " fmt, DRM_CURRENTPID, \ __func__ , ##__VA_ARGS__); \ } while (0) #define DRM_DEBUG_DRIVER(fmt, ...) do { \ if ((drm_debug & DRM_DEBUGBITS_KMS) != 0) \ printf("[" DRM_NAME ":KMS:pid%d:%s] " fmt, DRM_CURRENTPID,\ __func__ , ##__VA_ARGS__); \ } while (0) #define DRM_DEBUG_KMS(fmt, ...) do { \ if ((drm_debug & DRM_DEBUGBITS_KMS) != 0) \ printf("[" DRM_NAME ":KMS:pid%d:%s] " fmt, DRM_CURRENTPID,\ __func__ , ##__VA_ARGS__); \ } while (0) #define DRM_LOG(fmt, ...) do { \ if ((drm_debug & DRM_DEBUGBITS_KMS) != 0) \ printf("[" DRM_NAME "]:pid%d:%s]" fmt, DRM_CURRENTPID, \ __func__ , ##__VA_ARGS__); \ } while (0) #define DRM_LOG_KMS(fmt, ...) do { \ if ((drm_debug & DRM_DEBUGBITS_KMS) != 0) \ printf("[" DRM_NAME "]:KMS:pid%d:%s]" fmt, DRM_CURRENTPID,\ __func__ , ##__VA_ARGS__); \ } while (0) #define DRM_LOG_MODE(fmt, ...) do { \ if ((drm_debug & DRM_DEBUGBITS_KMS) != 0) \ printf("[" DRM_NAME "]:pid%d:%s]" fmt, DRM_CURRENTPID, \ __func__ , ##__VA_ARGS__); \ } while (0) #define DRM_LOG_DRIVER(fmt, ...) do { \ if ((drm_debug & DRM_DEBUGBITS_KMS) != 0) \ printf("[" DRM_NAME "]:KMS:pid%d:%s]" fmt, DRM_CURRENTPID,\ __func__ , ##__VA_ARGS__); \ } while (0) /*@}*/ /***********************************************************************/ /** \name Internal types and structures */ /*@{*/ #define DRM_ARRAY_SIZE(x) ARRAY_SIZE(x) #define DRM_LEFTCOUNT(x) (((x)->rp + (x)->count - (x)->wp) % ((x)->count + 1)) #define DRM_BUFCOUNT(x) ((x)->count - DRM_LEFTCOUNT(x)) #define DRM_IF_VERSION(maj, min) (maj << 16 | min) /** * Test that the hardware lock is held by the caller, returning otherwise. * * \param dev DRM device. * \param filp file pointer of the caller. */ #define LOCK_TEST_WITH_RETURN( dev, _file_priv ) \ do { \ if (!_DRM_LOCK_IS_HELD(_file_priv->master->lock.hw_lock->lock) || \ _file_priv->master->lock.file_priv != _file_priv) { \ DRM_ERROR( "%s called without lock held, held %d owner %p %p\n",\ __func__, _DRM_LOCK_IS_HELD(_file_priv->master->lock.hw_lock->lock),\ _file_priv->master->lock.file_priv, _file_priv); \ return -EINVAL; \ } \ } while (0) /** * Ioctl function type. * * \param inode device inode. * \param file_priv DRM file private pointer. * \param cmd command. * \param arg argument. */ typedef int drm_ioctl_t(struct drm_device *dev, void *data, struct drm_file *file_priv); #define DRM_IOCTL_NR(n) ((n) & 0xff) #define DRM_MAJOR 226 #define DRM_AUTH 0x1 #define DRM_MASTER 0x2 #define DRM_ROOT_ONLY 0x4 #define DRM_CONTROL_ALLOW 0x8 #define DRM_UNLOCKED 0x10 struct drm_ioctl_desc { unsigned long cmd; int flags; drm_ioctl_t *func; unsigned int cmd_drv; }; /** * Creates a driver or general drm_ioctl_desc array entry for the given * ioctl, for use by drm_ioctl(). */ #define DRM_IOCTL_DEF(ioctl, _func, _flags) \ [DRM_IOCTL_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, .cmd_drv = 0} #define DRM_IOCTL_DEF_DRV(ioctl, _func, _flags) \ [DRM_IOCTL_NR(DRM_##ioctl)] = {.cmd = DRM_##ioctl, .func = _func, .flags = _flags, .cmd_drv = DRM_IOCTL_##ioctl} struct drm_magic_entry { struct list_head head; struct drm_hash_item hash_item; struct drm_file *priv; }; /** * DMA buffer. 
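 *
 * (An aside on the ioctl plumbing above: a driver-private table built
 * with DRM_IOCTL_DEF_DRV() might look like the sketch below, where the
 * FOO_INIT ioctl and foo_init() handler are hypothetical and assume
 * matching DRM_FOO_INIT/DRM_IOCTL_FOO_INIT definitions:
 *
 *	static struct drm_ioctl_desc foo_ioctls[] = {
 *		DRM_IOCTL_DEF_DRV(FOO_INIT, foo_init,
 *		    DRM_AUTH | DRM_UNLOCKED),
 *	};
 *
 * drm_ioctl() dispatches into such a table by DRM_IOCTL_NR(cmd).)
 *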
*/ struct drm_buf { int idx; /**< Index into master buflist */ int total; /**< Buffer size */ int order; /**< log-base-2(total) */ int used; /**< Amount of buffer in use (for DMA) */ unsigned long offset; /**< Byte offset (used internally) */ void *address; /**< Address of buffer */ unsigned long bus_address; /**< Bus address of buffer */ struct drm_buf *next; /**< Kernel-only: used for free list */ __volatile__ int waiting; /**< On kernel DMA queue */ __volatile__ int pending; /**< On hardware DMA queue */ struct drm_file *file_priv; /**< Private of holding file descr */ int context; /**< Kernel queue for this buffer */ int while_locked; /**< Dispatch this buffer while locked */ enum { DRM_LIST_NONE = 0, DRM_LIST_FREE = 1, DRM_LIST_WAIT = 2, DRM_LIST_PEND = 3, DRM_LIST_PRIO = 4, DRM_LIST_RECLAIM = 5 } list; /**< Which list we're on */ int dev_priv_size; /**< Size of buffer private storage */ void *dev_private; /**< Per-buffer private storage */ }; struct drm_freelist { int initialized; /**< Freelist in use */ atomic_t count; /**< Number of free buffers */ struct drm_buf *next; /**< End pointer */ #ifdef FREEBSD_NOTYET wait_queue_head_t waiting; /**< Processes waiting on free bufs */ #endif /* defined(FREEBSD_NOTYET) */ int low_mark; /**< Low water mark */ int high_mark; /**< High water mark */ #ifdef FREEBSD_NOTYET atomic_t wfh; /**< If waiting for high mark */ spinlock_t lock; #endif /* defined(FREEBSD_NOTYET) */ }; typedef struct drm_dma_handle { void *vaddr; bus_addr_t busaddr; bus_dma_tag_t tag; bus_dmamap_t map; } drm_dma_handle_t; /** * Buffer entry. There is one of this for each buffer size order. */ struct drm_buf_entry { int buf_size; /**< size */ int buf_count; /**< number of buffers */ struct drm_buf *buflist; /**< buffer list */ int seg_count; int page_order; struct drm_dma_handle **seglist; struct drm_freelist freelist; }; /* Event queued up for userspace to read */ struct drm_pending_event { struct drm_event *event; struct list_head link; struct drm_file *file_priv; pid_t pid; /* pid of requester, no guarantee it's valid by the time we deliver the event, for tracing only */ void (*destroy)(struct drm_pending_event *event); }; /* initial implementaton using a linked list - todo hashtab */ struct drm_prime_file_private { struct list_head head; struct mtx lock; }; struct drm_file { int authenticated; pid_t pid; uid_t uid; drm_magic_t magic; unsigned long ioctl_count; struct list_head lhead; struct drm_minor *minor; unsigned long lock_count; void *driver_priv; struct drm_gem_names object_names; int is_master; /* this file private is a master for a minor */ struct drm_master *master; /* master this node is currently associated with N.B. not always minor->master */ struct list_head fbs; struct selinfo event_poll; struct list_head event_list; int event_space; struct drm_prime_file_private prime; }; /** * Lock data. */ struct drm_lock_data { struct drm_hw_lock *hw_lock; /**< Hardware lock */ /** Private of lock holder's file (NULL=kernel) */ struct drm_file *file_priv; wait_queue_head_t lock_queue; /**< Queue of blocked processes */ unsigned long lock_time; /**< Time of last lock in jiffies */ struct mtx spinlock; uint32_t kernel_waiters; uint32_t user_waiters; int idle_has_lock; }; /** * DMA data. 
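 *
 * (The drm_dma_handle_t above is what the consistent-memory helpers in
 * drm_pci.c hand back.  A minimal usage sketch -- the size, alignment
 * and 4 GB address limit below are purely illustrative:
 *
 *	drm_dma_handle_t *dmah;
 *
 *	dmah = drm_pci_alloc(dev, PAGE_SIZE, PAGE_SIZE, 0xfffffffful);
 *	if (dmah == NULL)
 *		return (-ENOMEM);
 *	... use dmah->vaddr (KVA) and dmah->busaddr (bus address) ...
 *	drm_pci_free(dev, dmah);
 *
 * where drm_pci_free() releases the backing memory again.)
 *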
 */
struct drm_device_dma {
	struct drm_buf_entry bufs[DRM_MAX_ORDER + 1];	/**< buffers, grouped by their size order */
	int buf_count;			/**< total number of buffers */
	struct drm_buf **buflist;	/**< Vector of pointers into drm_device_dma::bufs */
	int seg_count;
	int page_count;			/**< number of pages */
	unsigned long *pagelist;	/**< page list */
	unsigned long byte_count;
	enum {
		_DRM_DMA_USE_AGP = 0x01,
		_DRM_DMA_USE_SG = 0x02,
		_DRM_DMA_USE_FB = 0x04,
		_DRM_DMA_USE_PCI_RO = 0x08
	} flags;
};

/**
 * AGP memory entry.  Stored as a doubly linked list.
 */
struct drm_agp_mem {
	unsigned long handle;	/**< handle */
	DRM_AGP_MEM *memory;
	unsigned long bound;	/**< address */
	int pages;
	struct list_head head;
};

/**
 * AGP data.
 *
 * \sa drm_agp_init() and drm_device::agp.
 */
struct drm_agp_head {
	DRM_AGP_KERN agp_info;	/**< AGP device information */
	struct list_head memory;
	unsigned long mode;	/**< AGP mode */
	device_t bridge;
	int enabled;		/**< whether the AGP bus has been enabled */
	int acquired;		/**< whether the AGP device has been acquired */
	unsigned long base;
	int agp_mtrr;
	int cant_use_aperture;
};

/**
 * Scatter-gather memory.
 */
struct drm_sg_mem {
	vm_offset_t vaddr;
	vm_paddr_t *busaddr;
	vm_pindex_t pages;
};

struct drm_sigdata {
	int context;
	struct drm_hw_lock *lock;
};

/**
 * Kernel side of a mapping
 */
#define DRM_MAP_HANDLE_BITS	(sizeof(void *) == 4 ? 4 : 24)
#define DRM_MAP_HANDLE_SHIFT	(sizeof(void *) * 8 - DRM_MAP_HANDLE_BITS)
struct drm_local_map {
	resource_size_t offset;	 /**< Requested physical address (0 for SAREA)*/
	unsigned long size;	 /**< Requested physical size (bytes) */
	enum drm_map_type type;	 /**< Type of memory to map */
	enum drm_map_flags flags;	 /**< Flags */
	void *handle;		 /**< User-space: "Handle" to pass to mmap() */
				 /**< Kernel-space: kernel-virtual address */
	int mtrr;		 /**< MTRR slot used */

	/* Private data */
	drm_dma_handle_t *dmah;
};

typedef struct drm_local_map drm_local_map_t;

/**
 * Mappings list
 */
struct drm_map_list {
	struct list_head head;		/**< list head */
	struct drm_hash_item hash;
	struct drm_local_map *map;	/**< mapping */
	uint64_t user_token;
	struct drm_master *master;
	struct drm_mm_node *file_offset_node;	/**< fake offset */
};

/**
 * Context handle list
 */
struct drm_ctx_list {
	struct list_head head;	/**< list head */
	drm_context_t handle;	/**< context handle */
	struct drm_file *tag;	/**< associated fd private data */
};

/* location of GART table */
#define DRM_ATI_GART_MAIN 1
#define DRM_ATI_GART_FB   2

#define DRM_ATI_GART_PCI  1
#define DRM_ATI_GART_PCIE 2
#define DRM_ATI_GART_IGP  3

struct drm_ati_pcigart_info {
	int gart_table_location;
	int gart_reg_if;
	void *addr;
	dma_addr_t bus_addr;
	dma_addr_t table_mask;
	struct drm_dma_handle *table_handle;
	struct drm_local_map mapping;
	int table_size;
	struct drm_dma_handle *dmah; /* handle for ATI PCIGART table FIXME */
};

/**
 * GEM specific mm private for tracking GEM objects
 */
struct drm_gem_mm {
	struct unrhdr *idxunr;
	struct drm_open_hash offset_hash; /**< User token hash table for maps */
};

/**
 * This structure defines the drm_mm memory object, which will be used by the
 * DRM for its buffer objects.
 */
struct drm_gem_object {
	/** Reference count of this object */
	u_int refcount;

	/** Handle count of this object.
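	 *
	 * (Lifetime sketch for these two counters: code that looks an
	 * object up by handle gains a reference and must drop it when
	 * done -- the helpers named here are the GEM functions declared
	 * later in this header:
	 *
	 *	obj = drm_gem_object_lookup(dev, file_priv, handle);
	 *	if (obj == NULL)
	 *		return (-ENOENT);
	 *	... operate on obj ...
	 *	drm_gem_object_unreference_unlocked(obj);
	 *
	 * Dropping the last reference frees the object through
	 * drm_gem_object_free().)
	 *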
	 * Each handle also holds a reference */
	atomic_t handle_count; /* number of handles on this object */

	/** Related drm device */
	struct drm_device *dev;

	/** File representing the shmem storage: filp in Linux parlance */
	vm_object_t vm_obj;

	/* Mapping info for this object */
	bool on_map;
	struct drm_hash_item map_list;

	/**
	 * Size of the object, in bytes.  Immutable over the object's
	 * lifetime.
	 */
	size_t size;

	/**
	 * Global name for this object, starts at 1. 0 means unnamed.
	 * Access is covered by the object_name_lock in the related drm_device
	 */
	int name;

	/**
	 * Memory domains. These monitor which caches contain read/write data
	 * related to the object. When transitioning from one set of domains
	 * to another, the driver is called to ensure that caches are suitably
	 * flushed and invalidated
	 */
	uint32_t read_domains;
	uint32_t write_domain;

	/**
	 * While validating an exec operation, the
	 * new read/write domain values are computed here.
	 * They will be transferred to the above values
	 * at the point that any cache flushing occurs
	 */
	uint32_t pending_read_domains;
	uint32_t pending_write_domain;

	void *driver_private;

#ifdef FREEBSD_NOTYET
	/* dma buf exported from this GEM object */
	struct dma_buf *export_dma_buf;

	/* dma buf attachment backing this object */
	struct dma_buf_attachment *import_attach;
#endif /* FREEBSD_NOTYET */
};

#include

/* per-master structure */
struct drm_master {
	u_int refcount;		/* refcount for this master */
	struct list_head head;	/**< each minor contains a list of masters */
	struct drm_minor *minor; /**< link back to minor we are a master for */
	char *unique;		/**< Unique identifier: e.g., busid */
	int unique_len;		/**< Length of unique field */
	int unique_size;	/**< amount allocated */
	int blocked;		/**< Blocked due to VC switch? */
	/** \name Authentication */
	/*@{ */
	struct drm_open_hash magiclist;
	struct list_head magicfree;
	/*@} */
	struct drm_lock_data lock;	/**< Information on hardware lock */
	void *driver_priv;	/**< Private structure for driver to use */
};

/* Size of ringbuffer for vblank timestamps. Just double-buffer
 * in initial implementation.
 */
#define DRM_VBLANKTIME_RBSIZE 2

/* Flags and return codes for get_vblank_timestamp() driver function. */
#define DRM_CALLED_FROM_VBLIRQ 1
#define DRM_VBLANKTIME_SCANOUTPOS_METHOD (1 << 0)
#define DRM_VBLANKTIME_INVBL             (1 << 1)

/* get_scanout_position() return flags */
#define DRM_SCANOUTPOS_VALID        (1 << 0)
#define DRM_SCANOUTPOS_INVBL        (1 << 1)
#define DRM_SCANOUTPOS_ACCURATE     (1 << 2)

struct drm_bus {
	int bus_type;
	int (*get_irq)(struct drm_device *dev);
	void (*free_irq)(struct drm_device *dev);
	const char *(*get_name)(struct drm_device *dev);
	int (*set_busid)(struct drm_device *dev, struct drm_master *master);
	int (*set_unique)(struct drm_device *dev, struct drm_master *master,
	    struct drm_unique *unique);
	int (*irq_by_busid)(struct drm_device *dev, struct drm_irq_busid *p);
	/* hooks that are for PCI */
	int (*agp_init)(struct drm_device *dev);
};

/**
 * DRM driver structure. This structure represents the common code for
 * a family of cards.
 * There will be one drm_device for each card present
 * in this family
 */
struct drm_driver {
	int (*load) (struct drm_device *, unsigned long flags);
	int (*firstopen) (struct drm_device *);
	int (*open) (struct drm_device *, struct drm_file *);
	void (*preclose) (struct drm_device *, struct drm_file *file_priv);
	void (*postclose) (struct drm_device *, struct drm_file *);
	void (*lastclose) (struct drm_device *);
	int (*unload) (struct drm_device *);
	int (*suspend) (struct drm_device *, pm_message_t state);
	int (*resume) (struct drm_device *);
	int (*dma_ioctl) (struct drm_device *dev, void *data, struct drm_file *file_priv);
	int (*dma_quiescent) (struct drm_device *);
	int (*context_dtor) (struct drm_device *dev, int context);

	/**
	 * get_vblank_counter - get raw hardware vblank counter
	 * @dev: DRM device
	 * @crtc: counter to fetch
	 *
	 * Driver callback for fetching a raw hardware vblank counter for @crtc.
	 * If a device doesn't have a hardware counter, the driver can simply
	 * return the value of drm_vblank_count. The DRM core will account for
	 * missed vblank events while interrupts were disabled based on system
	 * timestamps.
	 *
	 * Wraparound handling and loss of events due to modesetting are dealt
	 * with in the DRM core code.
	 *
	 * RETURNS
	 * Raw vblank counter value.
	 */
	u32 (*get_vblank_counter) (struct drm_device *dev, int crtc);

	/**
	 * enable_vblank - enable vblank interrupt events
	 * @dev: DRM device
	 * @crtc: which irq to enable
	 *
	 * Enable vblank interrupts for @crtc.  If the device doesn't have
	 * a hardware vblank counter, this routine should be a no-op, since
	 * interrupts will have to stay on to keep the count accurate.
	 *
	 * RETURNS
	 * Zero on success, appropriate errno if the given @crtc's vblank
	 * interrupt cannot be enabled.
	 */
	int (*enable_vblank) (struct drm_device *dev, int crtc);

	/**
	 * disable_vblank - disable vblank interrupt events
	 * @dev: DRM device
	 * @crtc: which irq to disable
	 *
	 * Disable vblank interrupts for @crtc.  If the device doesn't have
	 * a hardware vblank counter, this routine should be a no-op, since
	 * interrupts will have to stay on to keep the count accurate.
	 */
	void (*disable_vblank) (struct drm_device *dev, int crtc);

	/**
	 * Called by \c drm_device_is_agp.  Typically used to determine if a
	 * card is really attached to AGP or not.
	 *
	 * \param dev  DRM device handle
	 *
	 * \returns
	 * One of three values is returned depending on whether or not the
	 * card is absolutely \b not AGP (return of 0), absolutely \b is AGP
	 * (return of 1), or may or may not be AGP (return of 2).
	 */
	int (*device_is_agp) (struct drm_device *dev);

	/**
	 * Called by vblank timestamping code.
	 *
	 * Return the current display scanout position from a crtc.
	 *
	 * \param dev  DRM device.
	 * \param crtc Id of the crtc to query.
	 * \param *vpos Target location for current vertical scanout position.
	 * \param *hpos Target location for current horizontal scanout position.
	 *
	 * Returns vpos as a positive number while in active scanout area.
	 * Returns vpos as a negative number inside vblank, counting the number
	 * of scanlines to go until end of vblank, e.g., -1 means "one scanline
	 * until start of active scanout / end of vblank."
	 *
	 * \return Flags, or'ed together as follows:
	 *
	 * DRM_SCANOUTPOS_VALID = Query successful.
	 * DRM_SCANOUTPOS_INVBL = Inside vblank.
	 * DRM_SCANOUTPOS_ACCURATE = Returned position is accurate. A lack of
	 * this flag means that returned position may be offset by a constant
	 * but unknown small number of scanlines wrt. real scanout position.
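	 *
	 * (Taken together with get_vblank_counter/enable_vblank/
	 * disable_vblank above: per their comments, a device without a
	 * hardware counter can simply hand back the core's count --
	 * foo_get_vblank_counter() is a hypothetical driver hook:
	 *
	 *	static u32
	 *	foo_get_vblank_counter(struct drm_device *dev, int crtc)
	 *	{
	 *		return (drm_vblank_count(dev, crtc));
	 *	}
	 *
	 * with enable_vblank/disable_vblank left as no-ops so the
	 * interrupt stays on and the count stays accurate.)
	 *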
* */ int (*get_scanout_position) (struct drm_device *dev, int crtc, int *vpos, int *hpos); /** * Called by \c drm_get_last_vbltimestamp. Should return a precise * timestamp when the most recent VBLANK interval ended or will end. * * Specifically, the timestamp in @vblank_time should correspond as * closely as possible to the time when the first video scanline of * the video frame after the end of VBLANK will start scanning out, * the time immediately after end of the VBLANK interval. If the * @crtc is currently inside VBLANK, this will be a time in the future. * If the @crtc is currently scanning out a frame, this will be the * past start time of the current scanout. This is meant to adhere * to the OpenML OML_sync_control extension specification. * * \param dev dev DRM device handle. * \param crtc crtc for which timestamp should be returned. * \param *max_error Maximum allowable timestamp error in nanoseconds. * Implementation should strive to provide timestamp * with an error of at most *max_error nanoseconds. * Returns true upper bound on error for timestamp. * \param *vblank_time Target location for returned vblank timestamp. * \param flags 0 = Defaults, no special treatment needed. * \param DRM_CALLED_FROM_VBLIRQ = Function is called from vblank * irq handler. Some drivers need to apply some workarounds * for gpu-specific vblank irq quirks if flag is set. * * \returns * Zero if timestamping isn't supported in current display mode or a * negative number on failure. A positive status code on success, * which describes how the vblank_time timestamp was computed. */ int (*get_vblank_timestamp) (struct drm_device *dev, int crtc, int *max_error, struct timeval *vblank_time, unsigned flags); /* these have to be filled in */ irqreturn_t(*irq_handler) (DRM_IRQ_ARGS); void (*irq_preinstall) (struct drm_device *dev); int (*irq_postinstall) (struct drm_device *dev); void (*irq_uninstall) (struct drm_device *dev); void (*set_version) (struct drm_device *dev, struct drm_set_version *sv); /* Master routines */ int (*master_create)(struct drm_device *dev, struct drm_master *master); void (*master_destroy)(struct drm_device *dev, struct drm_master *master); /** * master_set is called whenever the minor master is set. * master_drop is called whenever the minor master is dropped. */ int (*master_set)(struct drm_device *dev, struct drm_file *file_priv, bool from_open); void (*master_drop)(struct drm_device *dev, struct drm_file *file_priv, bool from_release); /** * Driver-specific constructor for drm_gem_objects, to set up * obj->driver_private. * * Returns 0 on success. 
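	 *
	 * (For example, a driver keeping per-object state might implement
	 * the hook as below -- struct foo_obj is hypothetical, and
	 * DRM_MEM_DRIVER is assumed to be one of the drm2 malloc types:
	 *
	 *	static int
	 *	foo_gem_init_object(struct drm_gem_object *obj)
	 *	{
	 *		obj->driver_private = malloc(sizeof(struct foo_obj),
	 *		    DRM_MEM_DRIVER, M_NOWAIT | M_ZERO);
	 *		return (obj->driver_private != NULL ? 0 : -ENOMEM);
	 *	}
	 *
	 * with the matching gem_free_object hook freeing the allocation.)
	 *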
	 */
	int (*gem_init_object) (struct drm_gem_object *obj);
	void (*gem_free_object) (struct drm_gem_object *obj);
	int (*gem_open_object) (struct drm_gem_object *, struct drm_file *);
	void (*gem_close_object) (struct drm_gem_object *, struct drm_file *);

#ifdef FREEBSD_NOTYET
	/* prime: */
	/* export handle -> fd (see drm_gem_prime_handle_to_fd() helper) */
	int (*prime_handle_to_fd)(struct drm_device *dev,
	    struct drm_file *file_priv, uint32_t handle, uint32_t flags,
	    int *prime_fd);
	/* import fd -> handle (see drm_gem_prime_fd_to_handle() helper) */
	int (*prime_fd_to_handle)(struct drm_device *dev,
	    struct drm_file *file_priv, int prime_fd, uint32_t *handle);
	/* export GEM -> dmabuf */
	struct dma_buf * (*gem_prime_export)(struct drm_device *dev,
	    struct drm_gem_object *obj, int flags);
	/* import dmabuf -> GEM */
	struct drm_gem_object * (*gem_prime_import)(struct drm_device *dev,
	    struct dma_buf *dma_buf);
#endif /* defined(FREEBSD_NOTYET) */

	/* dumb alloc support */
	int (*dumb_create)(struct drm_file *file_priv,
	    struct drm_device *dev,
	    struct drm_mode_create_dumb *args);
	int (*dumb_map_offset)(struct drm_file *file_priv,
	    struct drm_device *dev, uint32_t handle,
	    uint64_t *offset);
	int (*dumb_destroy)(struct drm_file *file_priv,
	    struct drm_device *dev,
	    uint32_t handle);

	/* Driver private ops for this object */
	struct cdev_pager_ops *gem_pager_ops;

	int (*sysctl_init)(struct drm_device *dev,
	    struct sysctl_ctx_list *ctx, struct sysctl_oid *top);
	void (*sysctl_cleanup)(struct drm_device *dev);

	int major;
	int minor;
	int patchlevel;
	char *name;
	char *desc;
	char *date;
	u32 driver_features;
	int dev_priv_size;
	struct drm_ioctl_desc *ioctls;
	int num_ioctls;
	struct drm_bus *bus;
#ifdef COMPAT_FREEBSD32
	struct drm_ioctl_desc *compat_ioctls;
	int *num_compat_ioctls;
#endif

	int buf_priv_size;
};

#define DRM_MINOR_UNASSIGNED 0
#define DRM_MINOR_LEGACY 1
#define DRM_MINOR_CONTROL 2
#define DRM_MINOR_RENDER 3

/**
 * DRM minor structure. This structure represents a drm minor number.
 */
struct drm_minor {
	int index;		/**< Minor device number */
	int type;		/**< Control or render */
	struct cdev *device;	/**< Device number for mknod */
	device_t kdev;		/**< OS device */
	struct drm_device *dev;

	struct drm_master *master; /* currently active master for this node */
	struct list_head master_list;
	struct drm_mode_group mode_group;

	struct sigio *buf_sigio;	/* Processes waiting for SIGIO */
};

/* mode specified on the command line */
struct drm_cmdline_mode {
	bool specified;
	bool refresh_specified;
	bool bpp_specified;
	int xres, yres;
	int bpp;
	int refresh;
	bool rb;
	bool interlace;
	bool cvt;
	bool margins;
	enum drm_connector_force force;
};

struct drm_pending_vblank_event {
	struct drm_pending_event base;
	int pipe;
	struct drm_event_vblank event;
};

/**
 * DRM device structure. This structure represents a complete card that
 * may contain multiple heads.
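 *
 * (On drm_cmdline_mode above: a connector mode string such as
 * "1280x1024@60" -- parsed by drm_mode_parse_command_line_for_connector()
 * declared later in this header -- ends up as xres = 1280, yres = 1024
 * and refresh = 60, with specified and refresh_specified set.)
 *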
*/ struct drm_device { int if_version; /**< Highest interface version set */ /** \name Locks */ /*@{ */ struct mtx count_lock; /**< For inuse, drm_device::open_count, drm_device::buf_use */ struct sx dev_struct_lock; /**< For others */ /*@} */ /** \name Usage Counters */ /*@{ */ int open_count; /**< Outstanding files open */ atomic_t ioctl_count; /**< Outstanding IOCTLs pending */ atomic_t vma_count; /**< Outstanding vma areas open */ int buf_use; /**< Buffers in use -- cannot alloc */ atomic_t buf_alloc; /**< Buffer allocation in progress */ /*@} */ /** \name Performance counters */ /*@{ */ unsigned long counters; enum drm_stat_type types[15]; atomic_t counts[15]; /*@} */ struct list_head filelist; /** \name Memory management */ /*@{ */ struct list_head maplist; /**< Linked list of regions */ int map_count; /**< Number of mappable regions */ struct drm_open_hash map_hash; /**< User token hash table for maps */ /** \name Context handle management */ /*@{ */ struct list_head ctxlist; /**< Linked list of context handles */ int ctx_count; /**< Number of context handles */ struct mtx ctxlist_mutex; /**< For ctxlist */ drm_local_map_t **context_sareas; int max_context; unsigned long *ctx_bitmap; /*@} */ /** \name DMA support */ /*@{ */ struct drm_device_dma *dma; /**< Optional pointer for DMA support */ /*@} */ /** \name Context support */ /*@{ */ int irq_enabled; /**< True if irq handler is enabled */ atomic_t context_flag; /**< Context swapping flag */ atomic_t interrupt_flag; /**< Interruption handler flag */ atomic_t dma_flag; /**< DMA dispatch flag */ wait_queue_head_t context_wait; /**< Processes waiting on ctx switch */ int last_checked; /**< Last context checked for DMA */ int last_context; /**< Last current context */ unsigned long last_switch; /**< jiffies at last context switch */ /*@} */ /** \name VBLANK IRQ support */ /*@{ */ /* * At load time, disabling the vblank interrupt won't be allowed since * old clients may not call the modeset ioctl and therefore misbehave. * Once the modeset ioctl *has* been called though, we can safely * disable them when unused. 
	 */
	int vblank_disable_allowed;

	atomic_t *_vblank_count;	/**< number of VBLANK interrupts (driver must alloc the right number of counters) */
	struct timeval *_vblank_time;	/**< timestamp of current vblank_count (drivers must alloc right number of fields) */
	struct mtx vblank_time_lock;	/**< Protects vblank count and time updates during vblank enable/disable */
	struct mtx vbl_lock;
	atomic_t *vblank_refcount;	/* number of users of vblank interrupts per crtc */
	u32 *last_vblank;		/* protected by dev->vbl_lock, used */
					/* for wraparound handling */
	int *vblank_enabled;		/* so we don't call enable more than once per disable */
	int *vblank_inmodeset;		/* Display driver is setting mode */
	u32 *last_vblank_wait;		/* Last vblank seqno waited per CRTC */
	struct callout vblank_disable_callout;

	u32 max_vblank_count;		/**< size of vblank counter register */

	/**
	 * List of events
	 */
	struct list_head vblank_event_list;
	struct mtx event_lock;

	/*@} */

	struct drm_agp_head *agp;	/**< AGP data */

	device_t dev;			/* Device instance from newbus */
	uint16_t pci_device;		/* PCI device id */
	uint16_t pci_vendor;		/* PCI vendor id */
	uint16_t pci_subdevice;		/* PCI subsystem device id */
	uint16_t pci_subvendor;		/* PCI subsystem vendor id */

	struct drm_sg_mem *sg;		/**< Scatter gather memory */
	unsigned int num_crtcs;		/**< Number of CRTCs on this device */
	void *dev_private;		/**< device private data */
	void *mm_private;
	struct drm_sigdata sigdata;	/**< For block_all_signals */
	sigset_t sigmask;

	struct drm_driver *driver;
	struct drm_local_map *agp_buffer_map;
	unsigned int agp_buffer_token;
	struct drm_minor *control;	/**< Control node for card */
	struct drm_minor *primary;	/**< render type primary screen head */

	struct drm_mode_config mode_config;	/**< Current mode config */

	/** \name GEM information */
	/*@{ */
	struct sx object_name_lock;
	struct drm_gem_names object_names;
	/*@} */
	int switch_power_state;

	atomic_t unplugged;	/* device has been unplugged or gone away */

	/* Locks */
	struct mtx dma_lock;		/* protects dev->dma */
	struct mtx irq_lock;		/* protects irq condition checks */

	/* Context support */
	int irq;			/* Interrupt used by board */
	int msi_enabled;		/* MSI enabled */
	int irqrid;			/* Interrupt used by board */
	struct resource *irqr;		/* Resource for interrupt used by board */
	void *irqh;			/* Handle from bus_setup_intr */

	/* Storage of resource pointers for drm_get_resource_* */
#define	DRM_MAX_PCI_RESOURCE	6
	struct resource *pcir[DRM_MAX_PCI_RESOURCE];
	int pcirid[DRM_MAX_PCI_RESOURCE];
	struct mtx pcir_lock;

	int pci_domain;
	int pci_bus;
	int pci_slot;
	int pci_func;

	/* Sysctl support */
	struct drm_sysctl_info *sysctl;
	int sysctl_node_idx;

	void *drm_ttm_bdev;

	void *sysctl_private;
	char busid_str[128];
	int modesetting;

	const drm_pci_id_list_t *id_entry; /* PCI ID, name, and chipset private */
};

#define DRM_SWITCH_POWER_ON 0
#define DRM_SWITCH_POWER_OFF 1
#define DRM_SWITCH_POWER_CHANGING 2

static __inline__ int
drm_core_check_feature(struct drm_device *dev, int feature)
{
	return ((dev->driver->driver_features & feature) ?
1 : 0); } static inline int drm_dev_to_irq(struct drm_device *dev) { return dev->driver->bus->get_irq(dev); } #if __OS_HAS_AGP static inline int drm_core_has_AGP(struct drm_device *dev) { return drm_core_check_feature(dev, DRIVER_USE_AGP); } #else #define drm_core_has_AGP(dev) (0) #endif #if __OS_HAS_MTRR static inline int drm_core_has_MTRR(struct drm_device *dev) { return drm_core_check_feature(dev, DRIVER_USE_MTRR); } #define DRM_MTRR_WC MDF_WRITECOMBINE int drm_mtrr_add(unsigned long offset, unsigned long size, unsigned int flags); int drm_mtrr_del(int handle, unsigned long offset, unsigned long size, unsigned int flags); #else #define drm_core_has_MTRR(dev) (0) #define DRM_MTRR_WC 0 static inline int drm_mtrr_add(unsigned long offset, unsigned long size, unsigned int flags) { return 0; } static inline int drm_mtrr_del(int handle, unsigned long offset, unsigned long size, unsigned int flags) { return 0; } #endif /******************************************************************/ /** \name Internal function definitions */ /*@{*/ /* Driver support (drm_drv.h) */ d_ioctl_t drm_ioctl; extern int drm_lastclose(struct drm_device *dev); /* Device support (drm_fops.h) */ extern struct sx drm_global_mutex; d_open_t drm_open; d_read_t drm_read; extern void drm_release(void *data); /* Mapping support (drm_vm.h) */ d_mmap_t drm_mmap; int drm_mmap_single(struct cdev *kdev, vm_ooffset_t *offset, vm_size_t size, struct vm_object **obj_res, int nprot); d_poll_t drm_poll; /* Misc. IOCTL support (drm_ioctl.h) */ extern int drm_irq_by_busid(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_getunique(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_setunique(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_getmap(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_getclient(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_getstats(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_getcap(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_setversion(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_noop(struct drm_device *dev, void *data, struct drm_file *file_priv); /* Context IOCTL support (drm_context.h) */ extern int drm_resctx(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_addctx(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_modctx(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_getctx(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_switchctx(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_newctx(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_rmctx(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_ctxbitmap_init(struct drm_device *dev); extern void drm_ctxbitmap_cleanup(struct drm_device *dev); extern void drm_ctxbitmap_free(struct drm_device *dev, int ctx_handle); extern int drm_setsareactx(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_getsareactx(struct drm_device *dev, void *data, struct drm_file *file_priv); /* Authentication IOCTL support (drm_auth.h) */ extern int drm_getmagic(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_authmagic(struct drm_device *dev, void *data, 
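/*
 * (The usual authentication handshake built on the two handlers above:
 * an unauthenticated client fetches a magic with drm_getmagic(), passes
 * it out of band to the DRI master, and the master blesses it via
 * drm_authmagic(); from then on that client's file_priv->authenticated
 * is set and DRM_AUTH-flagged ioctls are allowed for it.)
 */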
struct drm_file *file_priv); extern int drm_remove_magic(struct drm_master *master, drm_magic_t magic); /* Cache management (drm_cache.c) */ void drm_clflush_pages(vm_page_t *pages, unsigned long num_pages); void drm_clflush_virt_range(char *addr, unsigned long length); /* Locking IOCTL support (drm_lock.h) */ extern int drm_lock(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_unlock(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_lock_free(struct drm_lock_data *lock_data, unsigned int context); extern void drm_idlelock_take(struct drm_lock_data *lock_data); extern void drm_idlelock_release(struct drm_lock_data *lock_data); /* * These are exported to drivers so that they can implement fencing using * DMA quiscent + idle. DMA quiescent usually requires the hardware lock. */ extern int drm_i_have_hw_lock(struct drm_device *dev, struct drm_file *file_priv); /* Buffer management support (drm_bufs.h) */ extern int drm_addbufs_agp(struct drm_device *dev, struct drm_buf_desc * request); extern int drm_addbufs_pci(struct drm_device *dev, struct drm_buf_desc * request); extern int drm_addmap(struct drm_device *dev, resource_size_t offset, unsigned int size, enum drm_map_type type, enum drm_map_flags flags, struct drm_local_map **map_ptr); extern int drm_addmap_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_rmmap(struct drm_device *dev, struct drm_local_map *map); extern int drm_rmmap_locked(struct drm_device *dev, struct drm_local_map *map); extern int drm_rmmap_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_addbufs(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_infobufs(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_markbufs(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_freebufs(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_mapbufs(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_order(unsigned long size); /* DMA support (drm_dma.h) */ extern int drm_dma_setup(struct drm_device *dev); extern void drm_dma_takedown(struct drm_device *dev); extern void drm_free_buffer(struct drm_device *dev, struct drm_buf * buf); extern void drm_core_reclaim_buffers(struct drm_device *dev, struct drm_file *filp); /* IRQ support (drm_irq.h) */ extern int drm_control(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_irq_install(struct drm_device *dev); extern int drm_irq_uninstall(struct drm_device *dev); extern int drm_vblank_init(struct drm_device *dev, int num_crtcs); extern int drm_wait_vblank(struct drm_device *dev, void *data, struct drm_file *filp); extern int drm_vblank_wait(struct drm_device *dev, unsigned int *vbl_seq); extern u32 drm_vblank_count(struct drm_device *dev, int crtc); extern u32 drm_vblank_count_and_time(struct drm_device *dev, int crtc, struct timeval *vblanktime); extern void drm_send_vblank_event(struct drm_device *dev, int crtc, struct drm_pending_vblank_event *e); extern bool drm_handle_vblank(struct drm_device *dev, int crtc); extern int drm_vblank_get(struct drm_device *dev, int crtc); extern void drm_vblank_put(struct drm_device *dev, int crtc); extern void drm_vblank_off(struct drm_device *dev, int crtc); extern void drm_vblank_cleanup(struct drm_device *dev); extern u32 drm_get_last_vbltimestamp(struct drm_device *dev, int crtc, struct timeval *tvblank, 
unsigned flags); extern int drm_calc_vbltimestamp_from_scanoutpos(struct drm_device *dev, int crtc, int *max_error, struct timeval *vblank_time, unsigned flags, struct drm_crtc *refcrtc); extern void drm_calc_timestamping_constants(struct drm_crtc *crtc); extern bool drm_mode_parse_command_line_for_connector(const char *mode_option, struct drm_connector *connector, struct drm_cmdline_mode *mode); extern struct drm_display_mode * drm_mode_create_from_cmdline_mode(struct drm_device *dev, struct drm_cmdline_mode *cmd); /* Modesetting support */ extern void drm_vblank_pre_modeset(struct drm_device *dev, int crtc); extern void drm_vblank_post_modeset(struct drm_device *dev, int crtc); extern int drm_modeset_ctl(struct drm_device *dev, void *data, struct drm_file *file_priv); /* Stub support (drm_stub.h) */ extern int drm_setmaster_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_dropmaster_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); struct drm_master *drm_master_create(struct drm_minor *minor); extern struct drm_master *drm_master_get(struct drm_master *master); extern void drm_master_put(struct drm_master **master); extern void drm_put_dev(struct drm_device *dev); extern int drm_put_minor(struct drm_minor **minor); extern void drm_unplug_dev(struct drm_device *dev); extern unsigned int drm_debug; extern unsigned int drm_notyet; extern unsigned int drm_vblank_offdelay; extern unsigned int drm_timestamp_precision; extern unsigned int drm_timestamp_monotonic; extern struct drm_local_map *drm_getsarea(struct drm_device *dev); #ifdef FREEBSD_NOTYET extern int drm_gem_prime_handle_to_fd(struct drm_device *dev, struct drm_file *file_priv, uint32_t handle, uint32_t flags, int *prime_fd); extern int drm_gem_prime_fd_to_handle(struct drm_device *dev, struct drm_file *file_priv, int prime_fd, uint32_t *handle); extern int drm_prime_handle_to_fd_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_prime_fd_to_handle_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_prime_sg_to_page_addr_arrays(struct sg_table *sgt, vm_page_t *pages, dma_addr_t *addrs, int max_pages); extern struct sg_table *drm_prime_pages_to_sg(vm_page_t *pages, int nr_pages); extern void drm_prime_gem_destroy(struct drm_gem_object *obj, struct sg_table *sg); void drm_prime_init_file_private(struct drm_prime_file_private *prime_fpriv); void drm_prime_destroy_file_private(struct drm_prime_file_private *prime_fpriv); int drm_prime_add_imported_buf_handle(struct drm_prime_file_private *prime_fpriv, struct dma_buf *dma_buf, uint32_t handle); int drm_prime_lookup_imported_buf_handle(struct drm_prime_file_private *prime_fpriv, struct dma_buf *dma_buf, uint32_t *handle); void drm_prime_remove_imported_buf_handle(struct drm_prime_file_private *prime_fpriv, struct dma_buf *dma_buf); int drm_prime_add_dma_buf(struct drm_device *dev, struct drm_gem_object *obj); int drm_prime_lookup_obj(struct drm_device *dev, struct dma_buf *buf, struct drm_gem_object **obj); #endif /* FREEBSD_NOTYET */ /* Scatter Gather Support (drm_scatter.h) */ extern void drm_sg_cleanup(struct drm_sg_mem * entry); extern int drm_sg_alloc_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_sg_alloc(struct drm_device *dev, struct drm_scatter_gather * request); extern int drm_sg_free(struct drm_device *dev, void *data, struct drm_file *file_priv); /* ATI PCIGART support (ati_pcigart.h) */ extern int 
drm_ati_pcigart_init(struct drm_device *dev, struct drm_ati_pcigart_info * gart_info); extern int drm_ati_pcigart_cleanup(struct drm_device *dev, struct drm_ati_pcigart_info * gart_info); extern drm_dma_handle_t *drm_pci_alloc(struct drm_device *dev, size_t size, size_t align, dma_addr_t maxaddr); extern void __drm_pci_free(struct drm_device *dev, drm_dma_handle_t * dmah); extern void drm_pci_free(struct drm_device *dev, drm_dma_handle_t * dmah); /* Graphics Execution Manager library functions (drm_gem.c) */ int drm_gem_init(struct drm_device *dev); void drm_gem_destroy(struct drm_device *dev); void drm_gem_object_release(struct drm_gem_object *obj); void drm_gem_object_free(struct drm_gem_object *obj); struct drm_gem_object *drm_gem_object_alloc(struct drm_device *dev, size_t size); int drm_gem_object_init(struct drm_device *dev, struct drm_gem_object *obj, size_t size); int drm_gem_private_object_init(struct drm_device *dev, struct drm_gem_object *obj, size_t size); void drm_gem_object_handle_free(struct drm_gem_object *obj); int drm_gem_mmap_single(struct drm_device *dev, vm_ooffset_t *offset, vm_size_t size, struct vm_object **obj_res, int nprot); void drm_gem_pager_dtr(void *obj); #include static inline void drm_gem_object_reference(struct drm_gem_object *obj) { KASSERT(obj->refcount > 0, ("Dangling obj %p", obj)); refcount_acquire(&obj->refcount); } static inline void drm_gem_object_unreference(struct drm_gem_object *obj) { if (obj == NULL) return; if (refcount_release(&obj->refcount)) drm_gem_object_free(obj); } static inline void drm_gem_object_unreference_unlocked(struct drm_gem_object *obj) { if (obj != NULL) { struct drm_device *dev = obj->dev; DRM_LOCK(dev); drm_gem_object_unreference(obj); DRM_UNLOCK(dev); } } int drm_gem_handle_create(struct drm_file *file_priv, struct drm_gem_object *obj, u32 *handlep); int drm_gem_handle_delete(struct drm_file *filp, u32 handle); static inline void drm_gem_object_handle_reference(struct drm_gem_object *obj) { drm_gem_object_reference(obj); atomic_inc(&obj->handle_count); } static inline void drm_gem_object_handle_unreference(struct drm_gem_object *obj) { if (obj == NULL) return; if (atomic_read(&obj->handle_count) == 0) return; /* * Must bump handle count first as this may be the last * ref, in which case the object would disappear before we * checked for a name */ if (atomic_dec_and_test(&obj->handle_count)) drm_gem_object_handle_free(obj); drm_gem_object_unreference(obj); } static inline void drm_gem_object_handle_unreference_unlocked(struct drm_gem_object *obj) { if (obj == NULL) return; if (atomic_read(&obj->handle_count) == 0) return; /* * Must bump handle count first as this may be the last * ref, in which case the object would disappear before we * checked for a name */ if (atomic_dec_and_test(&obj->handle_count)) drm_gem_object_handle_free(obj); drm_gem_object_unreference_unlocked(obj); } void drm_gem_free_mmap_offset(struct drm_gem_object *obj); int drm_gem_create_mmap_offset(struct drm_gem_object *obj); struct drm_gem_object *drm_gem_object_lookup(struct drm_device *dev, struct drm_file *filp, u32 handle); int drm_gem_close_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); int drm_gem_flink_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); int drm_gem_open_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); void drm_gem_open(struct drm_device *dev, struct drm_file *file_private); void drm_gem_release(struct drm_device *dev, struct drm_file *file_private); extern void 
drm_core_ioremap(struct drm_local_map *map, struct drm_device *dev); extern void drm_core_ioremap_wc(struct drm_local_map *map, struct drm_device *dev); extern void drm_core_ioremapfree(struct drm_local_map *map, struct drm_device *dev); static __inline__ struct drm_local_map *drm_core_findmap(struct drm_device *dev, unsigned int token) { struct drm_map_list *_entry; list_for_each_entry(_entry, &dev->maplist, head) if (_entry->user_token == token) return _entry->map; return NULL; } static __inline__ void drm_core_dropmap(struct drm_local_map *map) { } #include extern int drm_fill_in_dev(struct drm_device *dev, struct drm_driver *driver); extern void drm_cancel_fill_in_dev(struct drm_device *dev); int drm_get_minor(struct drm_device *dev, struct drm_minor **minor, int type); /*@}*/ /* PCI section */ int drm_pci_device_is_agp(struct drm_device *dev); int drm_pci_device_is_pcie(struct drm_device *dev); extern int drm_get_pci_dev(device_t kdev, struct drm_device *dev, struct drm_driver *driver); #define DRM_PCIE_SPEED_25 1 #define DRM_PCIE_SPEED_50 2 #define DRM_PCIE_SPEED_80 4 extern int drm_pcie_get_speed_cap_mask(struct drm_device *dev, u32 *speed_mask); #define drm_can_sleep() (DRM_HZ & 1) /* Platform section */ int drm_get_platform_dev(device_t kdev, struct drm_device *dev, struct drm_driver *driver); /* FreeBSD specific -- should be moved to drm_os_freebsd.h */ #define DRM_GEM_MAPPING_MASK (3ULL << 62) #define DRM_GEM_MAPPING_KEY (2ULL << 62) /* Non-canonical address form */ #define DRM_GEM_MAX_IDX 0x3fffff #define DRM_GEM_MAPPING_IDX(o) (((o) >> 40) & DRM_GEM_MAX_IDX) #define DRM_GEM_MAPPING_OFF(i) (((uint64_t)(i)) << 40) #define DRM_GEM_MAPPING_MAPOFF(o) \ ((o) & ~(DRM_GEM_MAPPING_OFF(DRM_GEM_MAX_IDX) | DRM_GEM_MAPPING_KEY)) SYSCTL_DECL(_hw_drm); #define DRM_DEV_MODE (S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP) #define DRM_DEV_UID UID_ROOT #define DRM_DEV_GID GID_VIDEO #define DRM_WAKEUP(w) wakeup((void *)w) #define DRM_WAKEUP_INT(w) wakeup(w) #define DRM_INIT_WAITQUEUE(queue) do {(void)(queue);} while (0) #define DRM_CURPROC curthread #define DRM_STRUCTPROC struct thread #define DRM_SPINTYPE struct mtx #define DRM_SPININIT(l,name) mtx_init(l, name, NULL, MTX_DEF) #define DRM_SPINUNINIT(l) mtx_destroy(l) #define DRM_SPINLOCK(l) mtx_lock(l) #define DRM_SPINUNLOCK(u) mtx_unlock(u) #define DRM_SPINLOCK_IRQSAVE(l, irqflags) do { \ mtx_lock(l); \ (void)irqflags; \ } while (0) #define DRM_SPINUNLOCK_IRQRESTORE(u, irqflags) mtx_unlock(u) #define DRM_SPINLOCK_ASSERT(l) mtx_assert(l, MA_OWNED) #define DRM_LOCK_SLEEP(dev, chan, flags, msg, timeout) \ (sx_sleep((chan), &(dev)->dev_struct_lock, (flags), (msg), (timeout))) #if defined(INVARIANTS) #define DRM_LOCK_ASSERT(dev) sx_assert(&(dev)->dev_struct_lock, SA_XLOCKED) #define DRM_UNLOCK_ASSERT(dev) sx_assert(&(dev)->dev_struct_lock, SA_UNLOCKED) #else #define DRM_LOCK_ASSERT(d) #define DRM_UNLOCK_ASSERT(d) #endif #define DRM_SYSCTL_HANDLER_ARGS (SYSCTL_HANDLER_ARGS) enum { DRM_IS_NOT_AGP, DRM_IS_AGP, DRM_MIGHT_BE_AGP }; #define DRM_VERIFYAREA_READ( uaddr, size ) \ (!useracc(__DECONST(caddr_t, uaddr), size, VM_PROT_READ)) #define DRM_COPY_TO_USER(user, kern, size) \ copyout(kern, user, size) #define DRM_COPY_FROM_USER(kern, user, size) \ copyin(user, kern, size) #define DRM_COPY_FROM_USER_UNCHECKED(arg1, arg2, arg3) \ copyin(arg2, arg1, arg3) #define DRM_COPY_TO_USER_UNCHECKED(arg1, arg2, arg3) \ copyout(arg2, arg1, arg3) #define DRM_GET_USER_UNCHECKED(val, uaddr) \ ((val) = fuword32(uaddr), 0) #define DRM_GET_PRIV_SAREA(_dev, _ctx, _map) do { \ (_map) = 
(_dev)->context_sareas[_ctx]; \ } while(0) /* Returns -errno to shared code */ #define DRM_WAIT_ON( ret, queue, timeout, condition ) \ for ( ret = 0 ; !ret && !(condition) ; ) { \ DRM_UNLOCK(dev); \ mtx_lock(&dev->irq_lock); \ if (!(condition)) \ ret = -mtx_sleep(&(queue), &dev->irq_lock, \ PCATCH, "drmwtq", (timeout)); \ if (ret == -ERESTART) \ ret = -ERESTARTSYS; \ mtx_unlock(&dev->irq_lock); \ DRM_LOCK(dev); \ } #define dev_err(dev, fmt, ...) \ device_printf((dev), "error: " fmt, ## __VA_ARGS__) #define dev_warn(dev, fmt, ...) \ device_printf((dev), "warning: " fmt, ## __VA_ARGS__) #define dev_info(dev, fmt, ...) \ device_printf((dev), "info: " fmt, ## __VA_ARGS__) #define dev_dbg(dev, fmt, ...) do { \ if ((drm_debug& DRM_DEBUGBITS_KMS) != 0) { \ device_printf((dev), "debug: " fmt, ## __VA_ARGS__); \ } \ } while (0) struct drm_msi_blacklist_entry { int vendor; int device; }; struct drm_vblank_info { wait_queue_head_t queue; /* vblank wait queue */ atomic_t count; /* number of VBLANK interrupts */ /* (driver must alloc the right number of counters) */ atomic_t refcount; /* number of users of vblank interrupts */ u32 last; /* protected by dev->vbl_lock, used */ /* for wraparound handling */ int enabled; /* so we don't call enable more than */ /* once per disable */ int inmodeset; /* Display driver is setting mode */ }; #ifndef DMA_BIT_MASK #define DMA_BIT_MASK(n) (((n) == 64) ? ~0ULL : (1ULL<<(n)) - 1) #endif #define upper_32_bits(n) ((u32)(((n) >> 16) >> 16)) enum dmi_field { DMI_NONE, DMI_BIOS_VENDOR, DMI_BIOS_VERSION, DMI_BIOS_DATE, DMI_SYS_VENDOR, DMI_PRODUCT_NAME, DMI_PRODUCT_VERSION, DMI_PRODUCT_SERIAL, DMI_PRODUCT_UUID, DMI_BOARD_VENDOR, DMI_BOARD_NAME, DMI_BOARD_VERSION, DMI_BOARD_SERIAL, DMI_BOARD_ASSET_TAG, DMI_CHASSIS_VENDOR, DMI_CHASSIS_TYPE, DMI_CHASSIS_VERSION, DMI_CHASSIS_SERIAL, DMI_CHASSIS_ASSET_TAG, DMI_STRING_MAX, }; struct dmi_strmatch { unsigned char slot; char substr[79]; }; struct dmi_system_id { int (*callback)(const struct dmi_system_id *); const char *ident; struct dmi_strmatch matches[4]; }; #define DMI_MATCH(a, b) {(a), (b)} bool dmi_check_system(const struct dmi_system_id *); /* Device setup support (drm_drv.c) */ int drm_probe_helper(device_t kdev, const drm_pci_id_list_t *idlist); int drm_attach_helper(device_t kdev, const drm_pci_id_list_t *idlist, struct drm_driver *driver); int drm_generic_suspend(device_t kdev); int drm_generic_resume(device_t kdev); int drm_generic_detach(device_t kdev); void drm_event_wakeup(struct drm_pending_event *e); int drm_add_busid_modesetting(struct drm_device *dev, struct sysctl_ctx_list *ctx, struct sysctl_oid *top); /* Buffer management support (drm_bufs.c) */ unsigned long drm_get_resource_start(struct drm_device *dev, unsigned int resource); unsigned long drm_get_resource_len(struct drm_device *dev, unsigned int resource); /* IRQ support (drm_irq.c) */ irqreturn_t drm_irq_handler(DRM_IRQ_ARGS); void drm_driver_irq_preinstall(struct drm_device *dev); void drm_driver_irq_postinstall(struct drm_device *dev); void drm_driver_irq_uninstall(struct drm_device *dev); /* sysctl support (drm_sysctl.h) */ extern int drm_sysctl_init(struct drm_device *dev); extern int drm_sysctl_cleanup(struct drm_device *dev); int drm_version(struct drm_device *dev, void *data, struct drm_file *file_priv); /* consistent PCI memory functions (drm_pci.c) */ int drm_pci_set_busid(struct drm_device *dev, struct drm_master *master); int drm_pci_set_unique(struct drm_device *dev, struct drm_master *master, struct drm_unique *u); int drm_pci_agp_init(struct 
drm_device *dev); int drm_pci_enable_msi(struct drm_device *dev); void drm_pci_disable_msi(struct drm_device *dev); struct ttm_bo_device; int ttm_bo_mmap_single(struct ttm_bo_device *bdev, vm_ooffset_t *offset, vm_size_t size, struct vm_object **obj_res, int nprot); struct ttm_buffer_object; void ttm_bo_release_mmap(struct ttm_buffer_object *bo); #if __OS_HAS_AGP /* Memory management support (drm_memory.h) */ extern void drm_free_agp(DRM_AGP_MEM * handle, int pages); extern int drm_bind_agp(DRM_AGP_MEM * handle, unsigned int start); #ifdef FREEBSD_NOTYET extern DRM_AGP_MEM *drm_agp_bind_pages(struct drm_device *dev, struct page **pages, unsigned long num_pages, uint32_t gtt_offset, uint32_t type); #endif /* FREEBSD_NOTYET */ extern int drm_unbind_agp(DRM_AGP_MEM * handle); /* AGP/GART support (drm_agpsupport.h) */ extern struct drm_agp_head *drm_agp_init(struct drm_device *dev); extern int drm_agp_acquire(struct drm_device *dev); extern int drm_agp_acquire_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_agp_release(struct drm_device *dev); extern int drm_agp_release_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_agp_enable(struct drm_device *dev, struct drm_agp_mode mode); extern int drm_agp_enable_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_agp_info(struct drm_device *dev, struct drm_agp_info *info); extern int drm_agp_info_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_agp_alloc(struct drm_device *dev, struct drm_agp_buffer *request); extern int drm_agp_alloc_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_agp_free(struct drm_device *dev, struct drm_agp_buffer *request); extern int drm_agp_free_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_agp_unbind(struct drm_device *dev, struct drm_agp_binding *request); extern int drm_agp_unbind_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_agp_bind(struct drm_device *dev, struct drm_agp_binding *request); extern int drm_agp_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); #else static inline void drm_free_agp(DRM_AGP_MEM * handle, int pages) { } static inline int drm_bind_agp(DRM_AGP_MEM * handle, unsigned int start) { return -ENODEV; } static inline int drm_unbind_agp(DRM_AGP_MEM * handle) { return -ENODEV; } #ifdef FREEBSD_NOTYET static inline struct agp_memory *drm_agp_bind_pages(struct drm_device *dev, struct page **pages, unsigned long num_pages, uint32_t gtt_offset, uint32_t type) { return NULL; } #endif static inline struct drm_agp_head *drm_agp_init(struct drm_device *dev) { return NULL; } static inline void drm_agp_clear(struct drm_device *dev) { } static inline int drm_agp_acquire(struct drm_device *dev) { return -ENODEV; } static inline int drm_agp_acquire_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { return -ENODEV; } static inline int drm_agp_release(struct drm_device *dev) { return -ENODEV; } static inline int drm_agp_release_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { return -ENODEV; } static inline int drm_agp_enable(struct drm_device *dev, struct drm_agp_mode mode) { return -ENODEV; } static inline int drm_agp_enable_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { return -ENODEV; } static inline int drm_agp_info(struct drm_device *dev, struct drm_agp_info 
*info) { return -ENODEV; } static inline int drm_agp_info_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { return -ENODEV; } static inline int drm_agp_alloc(struct drm_device *dev, struct drm_agp_buffer *request) { return -ENODEV; } static inline int drm_agp_alloc_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { return -ENODEV; } static inline int drm_agp_free(struct drm_device *dev, struct drm_agp_buffer *request) { return -ENODEV; } static inline int drm_agp_free_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { return -ENODEV; } static inline int drm_agp_unbind(struct drm_device *dev, struct drm_agp_binding *request) { return -ENODEV; } static inline int drm_agp_unbind_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { return -ENODEV; } static inline int drm_agp_bind(struct drm_device *dev, struct drm_agp_binding *request) { return -ENODEV; } static inline int drm_agp_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { return -ENODEV; } #endif /* __OS_HAS_AGP */ #endif /* __KERNEL__ */ #endif Index: stable/11/sys/fs/msdosfs/msdosfs_denode.c =================================================================== --- stable/11/sys/fs/msdosfs/msdosfs_denode.c (revision 331016) +++ stable/11/sys/fs/msdosfs/msdosfs_denode.c (revision 331017) @@ -1,614 +1,615 @@ /* $FreeBSD$ */ /* $NetBSD: msdosfs_denode.c,v 1.28 1998/02/10 14:10:00 mrg Exp $ */ /*- * Copyright (C) 1994, 1995, 1997 Wolfgang Solfrank. * Copyright (C) 1994, 1995, 1997 TooLs GmbH. * All rights reserved. * Original code by Paul Popelka (paulp@uts.amdahl.com) (see below). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by TooLs GmbH. * 4. The name of TooLs GmbH may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /*- * Written by Paul Popelka (paulp@uts.amdahl.com) * * You can do anything you want with this software, just don't say you wrote * it, and don't remove this notice. * * This software is provided "as is". 
* * The author supplies this software to be publicly redistributed on the * understanding that the author is not responsible for the correct * functioning of this software in any circumstances and is not liable for * any damages caused by this software. * * October 1992 */ #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_MSDOSFSNODE, "msdosfs_node", "MSDOSFS vnode private part"); static int de_vncmpf(struct vnode *vp, void *arg) { struct denode *de; uint64_t *a; a = arg; de = VTODE(vp); return (de->de_inode != *a); } /* * If deget() succeeds it returns with the gotten denode locked(). * * pmp - address of msdosfsmount structure of the filesystem containing * the denode of interest. The address of * the msdosfsmount structure are used. * dirclust - which cluster bp contains, if dirclust is 0 (root directory) * diroffset is relative to the beginning of the root directory, * otherwise it is cluster relative. * diroffset - offset past begin of cluster of denode we want * depp - returns the address of the gotten denode. */ int deget(struct msdosfsmount *pmp, u_long dirclust, u_long diroffset, struct denode **depp) { int error; uint64_t inode; struct mount *mntp = pmp->pm_mountp; struct direntry *direntptr; struct denode *ldep; struct vnode *nvp, *xvp; struct buf *bp; #ifdef MSDOSFS_DEBUG printf("deget(pmp %p, dirclust %lu, diroffset %lx, depp %p)\n", pmp, dirclust, diroffset, depp); #endif /* * On FAT32 filesystems, root is a (more or less) normal * directory */ if (FAT32(pmp) && dirclust == MSDOSFSROOT) dirclust = pmp->pm_rootdirblk; /* * See if the denode is in the denode cache. Use the location of * the directory entry to compute the hash value. For subdir use * address of "." entry. For root dir (if not FAT32) use cluster * MSDOSFSROOT, offset MSDOSFSROOT_OFS * * NOTE: The check for de_refcnt > 0 below insures the denode being * examined does not represent an unlinked but still open file. * These files are not to be accessible even when the directory * entry that represented the file happens to be reused while the * deleted file is still open. */ inode = (uint64_t)pmp->pm_bpcluster * dirclust + diroffset; error = vfs_hash_get(mntp, inode, LK_EXCLUSIVE, curthread, &nvp, de_vncmpf, &inode); if (error) return (error); if (nvp != NULL) { *depp = VTODE(nvp); KASSERT((*depp)->de_dirclust == dirclust, ("wrong dirclust")); KASSERT((*depp)->de_diroffset == diroffset, ("wrong diroffset")); return (0); } ldep = malloc(sizeof(struct denode), M_MSDOSFSNODE, M_WAITOK | M_ZERO); /* * Directory entry was not in cache, have to create a vnode and * copy it from the passed disk buffer. 
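 *
 * As an aside (illustrative numbers, not from this change): the
 * vfs_hash key computed above is just the byte address of the
 * directory entry, e.g. with pm_bpcluster = 4096, dirclust = 10 and
 * diroffset = 0x60,
 *
 *	inode = (uint64_t)4096 * 10 + 0x60 = 41056
 *
 * which is unique per entry, so vfs_hash_get()/vfs_hash_insert()
 * can use it to find or register the denode.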
*/ /* getnewvnode() does a VREF() on the vnode */ error = getnewvnode("msdosfs", mntp, &msdosfs_vnodeops, &nvp); if (error) { *depp = NULL; free(ldep, M_MSDOSFSNODE); return error; } nvp->v_data = ldep; ldep->de_vnode = nvp; ldep->de_flag = 0; ldep->de_dirclust = dirclust; ldep->de_diroffset = diroffset; ldep->de_inode = inode; lockmgr(nvp->v_vnlock, LK_EXCLUSIVE, NULL); fc_purge(ldep, 0); /* init the fat cache for this denode */ error = insmntque(nvp, mntp); if (error != 0) { free(ldep, M_MSDOSFSNODE); *depp = NULL; return (error); } error = vfs_hash_insert(nvp, inode, LK_EXCLUSIVE, curthread, &xvp, de_vncmpf, &inode); if (error) { *depp = NULL; return (error); } if (xvp != NULL) { *depp = xvp->v_data; return (0); } ldep->de_pmp = pmp; ldep->de_refcnt = 1; /* * Copy the directory entry into the denode area of the vnode. */ if ((dirclust == MSDOSFSROOT || (FAT32(pmp) && dirclust == pmp->pm_rootdirblk)) && diroffset == MSDOSFSROOT_OFS) { /* * Directory entry for the root directory. There isn't one, * so we manufacture one. We should probably rummage * through the root directory and find a label entry (if it * exists), and then use the time and date from that entry * as the time and date for the root denode. */ nvp->v_vflag |= VV_ROOT; /* should be further down XXX */ ldep->de_Attributes = ATTR_DIRECTORY; ldep->de_LowerCase = 0; if (FAT32(pmp)) ldep->de_StartCluster = pmp->pm_rootdirblk; /* de_FileSize will be filled in further down */ else { ldep->de_StartCluster = MSDOSFSROOT; ldep->de_FileSize = pmp->pm_rootdirsize * DEV_BSIZE; } /* * fill in time and date so that fattime2timespec() doesn't * spit up when called from msdosfs_getattr() with root * denode */ ldep->de_CHun = 0; ldep->de_CTime = 0x0000; /* 00:00:00 */ ldep->de_CDate = (0 << DD_YEAR_SHIFT) | (1 << DD_MONTH_SHIFT) | (1 << DD_DAY_SHIFT); /* Jan 1, 1980 */ ldep->de_ADate = ldep->de_CDate; ldep->de_MTime = ldep->de_CTime; ldep->de_MDate = ldep->de_CDate; /* leave the other fields as garbage */ } else { error = readep(pmp, dirclust, diroffset, &bp, &direntptr); if (error) { /* * The denode does not contain anything useful, so * it would be wrong to leave it on its hash chain. * Arrange for vput() to just forget about it. */ ldep->de_Name[0] = SLOT_DELETED; vput(nvp); *depp = NULL; return (error); } (void)DE_INTERNALIZE(ldep, direntptr); brelse(bp); } /* * Fill in a few fields of the vnode and finish filling in the * denode. Then return the address of the found denode. */ if (ldep->de_Attributes & ATTR_DIRECTORY) { /* * Since DOS directory entries that describe directories * have 0 in the filesize field, we take this opportunity * to find out the length of the directory and plug it into * the denode structure. */ u_long size; /* * XXX it sometimes happens that the "." entry has cluster * number 0 when it shouldn't. Use the actual cluster number * instead of what is written in directory entry. 
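 *
 * A sketch of the directory-sizing trick used just below: pcbmap()
 * is asked for an impossibly large cluster index (0xffff), runs off
 * the end of the cluster chain, and returns E2BIG with the chain
 * length in "size"; de_cn2off() then converts that cluster count
 * into the directory's byte size.
 */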
*/ if (diroffset == 0 && ldep->de_StartCluster != dirclust) { #ifdef MSDOSFS_DEBUG printf("deget(): \".\" entry at clust %lu != %lu\n", dirclust, ldep->de_StartCluster); #endif ldep->de_StartCluster = dirclust; } nvp->v_type = VDIR; if (ldep->de_StartCluster != MSDOSFSROOT) { error = pcbmap(ldep, 0xffff, 0, &size, 0); if (error == E2BIG) { ldep->de_FileSize = de_cn2off(pmp, size); error = 0; } else { #ifdef MSDOSFS_DEBUG printf("deget(): pcbmap returned %d\n", error); #endif } } } else nvp->v_type = VREG; ldep->de_modrev = init_va_filerev(); *depp = ldep; return (0); } int deupdat(struct denode *dep, int waitfor) { struct direntry dir; struct timespec ts; struct buf *bp; struct direntry *dirp; int error; if (DETOV(dep)->v_mount->mnt_flag & MNT_RDONLY) { dep->de_flag &= ~(DE_UPDATE | DE_CREATE | DE_ACCESS | DE_MODIFIED); return (0); } getnanotime(&ts); DETIMES(dep, &ts, &ts, &ts); if ((dep->de_flag & DE_MODIFIED) == 0 && waitfor == 0) return (0); dep->de_flag &= ~DE_MODIFIED; if (DETOV(dep)->v_vflag & VV_ROOT) return (EINVAL); if (dep->de_refcnt <= 0) return (0); error = readde(dep, &bp, &dirp); if (error) return (error); DE_EXTERNALIZE(&dir, dep); if (bcmp(dirp, &dir, sizeof(dir)) == 0) { if (waitfor == 0 || (bp->b_flags & B_DELWRI) == 0) { brelse(bp); return (0); } } else *dirp = dir; if ((DETOV(dep)->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) bp->b_flags |= B_CLUSTEROK; if (waitfor) error = bwrite(bp); else if (vm_page_count_severe() || buf_dirty_count_severe()) bawrite(bp); else bdwrite(bp); return (error); } /* * Truncate the file described by dep to the length specified by length. */ int detrunc(struct denode *dep, u_long length, int flags, struct ucred *cred) { int error; int allerror; u_long eofentry; u_long chaintofree; daddr_t bn; int boff; int isadir = dep->de_Attributes & ATTR_DIRECTORY; struct buf *bp; struct msdosfsmount *pmp = dep->de_pmp; #ifdef MSDOSFS_DEBUG printf("detrunc(): file %s, length %lu, flags %x\n", dep->de_Name, length, flags); #endif /* * Disallow attempts to truncate the root directory since it is of * fixed size. That's just the way dos filesystems are. We use * the VROOT bit in the vnode because checking for the directory * bit and a startcluster of 0 in the denode is not adequate to * recognize the root directory at this point in a file or * directory's life. */ if ((DETOV(dep)->v_vflag & VV_ROOT) && !FAT32(pmp)) { #ifdef MSDOSFS_DEBUG printf("detrunc(): can't truncate root directory, clust %ld, offset %ld\n", dep->de_dirclust, dep->de_diroffset); #endif return (EINVAL); } if (dep->de_FileSize < length) { vnode_pager_setsize(DETOV(dep), length); return deextend(dep, length, cred); } /* * If the desired length is 0 then remember the starting cluster of * the file and set the StartCluster field in the directory entry * to 0. If the desired length is not zero, then get the number of * the last cluster in the shortened file. Then get the number of * the first cluster in the part of the file that is to be freed. * Then set the next cluster pointer in the last cluster of the * file to CLUST_EOFE. 
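 *
 * Worked example (made-up numbers): shortening a 5-cluster file so
 * that de_clcount(pmp, length) == 2 keeps the first two clusters of
 * the chain; pcbmap() below resolves chain index
 * de_clcount(pmp, length) - 1 (here 1) to its cluster number in
 * eofentry, fatentry(FAT_GET_AND_SET, ...) later stamps that FAT
 * slot CLUST_EOFE and hands back its old successor in chaintofree,
 * whose chain is then released with freeclusterchain().
 */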
*/ if (length == 0) { chaintofree = dep->de_StartCluster; dep->de_StartCluster = 0; eofentry = ~0; } else { error = pcbmap(dep, de_clcount(pmp, length) - 1, 0, &eofentry, 0); if (error) { #ifdef MSDOSFS_DEBUG printf("detrunc(): pcbmap fails %d\n", error); #endif return (error); } } fc_purge(dep, de_clcount(pmp, length)); /* * If the new length is not a multiple of the cluster size then we * must zero the tail end of the new last cluster in case it * becomes part of the file again because of a seek. */ if ((boff = length & pmp->pm_crbomask) != 0) { if (isadir) { bn = cntobn(pmp, eofentry); error = bread(pmp->pm_devvp, bn, pmp->pm_bpcluster, NOCRED, &bp); if (error) { brelse(bp); #ifdef MSDOSFS_DEBUG printf("detrunc(): bread fails %d\n", error); #endif return (error); } bzero(bp->b_data + boff, pmp->pm_bpcluster - boff); if (flags & IO_SYNC) bwrite(bp); else bdwrite(bp); } } /* * Write out the updated directory entry. Even if the update fails * we free the trailing clusters. */ dep->de_FileSize = length; if (!isadir) dep->de_flag |= DE_UPDATE | DE_MODIFIED; allerror = vtruncbuf(DETOV(dep), cred, length, pmp->pm_bpcluster); #ifdef MSDOSFS_DEBUG if (allerror) printf("detrunc(): vtruncbuf error %d\n", allerror); #endif error = deupdat(dep, !DOINGASYNC((DETOV(dep)))); if (error != 0 && allerror == 0) allerror = error; #ifdef MSDOSFS_DEBUG printf("detrunc(): allerror %d, eofentry %lu\n", allerror, eofentry); #endif /* * If we need to break the cluster chain for the file then do it * now. */ if (eofentry != ~0) { error = fatentry(FAT_GET_AND_SET, pmp, eofentry, &chaintofree, CLUST_EOFE); if (error) { #ifdef MSDOSFS_DEBUG printf("detrunc(): fatentry errors %d\n", error); #endif return (error); } fc_setcache(dep, FC_LASTFC, de_cluster(pmp, length - 1), eofentry); } /* * Now free the clusters removed from the file because of the * truncation. */ if (chaintofree != 0 && !MSDOSFSEOF(pmp, chaintofree)) freeclusterchain(pmp, chaintofree); return (allerror); } /* * Extend the file described by dep to length specified by length. */ int deextend(struct denode *dep, u_long length, struct ucred *cred) { struct msdosfsmount *pmp = dep->de_pmp; u_long count; int error; /* * The root of a DOS filesystem cannot be extended. */ if ((DETOV(dep)->v_vflag & VV_ROOT) && !FAT32(pmp)) return (EINVAL); /* * Directories cannot be extended. */ if (dep->de_Attributes & ATTR_DIRECTORY) return (EISDIR); if (length <= dep->de_FileSize) panic("deextend: file too large"); /* * Compute the number of clusters to allocate. */ count = de_clcount(pmp, length) - de_clcount(pmp, dep->de_FileSize); if (count > 0) { if (count > pmp->pm_freeclustercount) return (ENOSPC); error = extendfile(dep, count, NULL, NULL, DE_CLEAR); if (error) { /* truncate the added clusters away again */ (void) detrunc(dep, dep->de_FileSize, 0, cred); return (error); } } dep->de_FileSize = length; dep->de_flag |= DE_UPDATE | DE_MODIFIED; return (deupdat(dep, !DOINGASYNC(DETOV(dep)))); } /* * Move a denode to its correct hash queue after the file it represents has * been moved to a new directory. */ void reinsert(struct denode *dep) { struct vnode *vp; /* * Fix up the denode cache. If the denode is for a directory, * there is nothing to do since the hash is based on the starting * cluster of the directory file and that hasn't changed. If for a * file the hash is based on the location of the directory entry, * so we must remove it from the cache and re-enter it with the * hash based on the new location of the directory entry. 
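 *
 * The rehash below just recomputes the deget() key from the new
 * location; e.g. (hypothetical values) an entry moved to dirclust 7,
 * diroffset 0x20 on a 2048 bytes/cluster volume gets
 * de_inode = 2048 * 7 + 0x20 = 14368.
 */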
*/ #if 0 if (dep->de_Attributes & ATTR_DIRECTORY) return; #endif vp = DETOV(dep); dep->de_inode = (uint64_t)dep->de_pmp->pm_bpcluster * dep->de_dirclust + dep->de_diroffset; vfs_hash_rehash(vp, dep->de_inode); } int msdosfs_reclaim(struct vop_reclaim_args *ap) { struct vnode *vp = ap->a_vp; struct denode *dep = VTODE(vp); #ifdef MSDOSFS_DEBUG printf("msdosfs_reclaim(): dep %p, file %s, refcnt %ld\n", dep, dep->de_Name, dep->de_refcnt); #endif /* * Destroy the vm object and flush associated pages. */ vnode_destroy_vobject(vp); /* * Remove the denode from its hash chain. */ vfs_hash_remove(vp); /* * Purge old data structures associated with the denode. */ #if 0 /* XXX */ dep->de_flag = 0; #endif free(dep, M_MSDOSFSNODE); vp->v_data = NULL; return (0); } int msdosfs_inactive(struct vop_inactive_args *ap) { struct vnode *vp = ap->a_vp; struct denode *dep = VTODE(vp); int error = 0; #ifdef MSDOSFS_DEBUG printf("msdosfs_inactive(): dep %p, de_Name[0] %x\n", dep, dep->de_Name[0]); #endif /* * Ignore denodes related to stale file handles. */ if (dep->de_Name[0] == SLOT_DELETED || dep->de_Name[0] == SLOT_EMPTY) goto out; /* * If the file has been deleted and it is on a read/write * filesystem, then truncate the file, and mark the directory slot * as empty. (This may not be necessary for the dos filesystem.) */ #ifdef MSDOSFS_DEBUG printf("msdosfs_inactive(): dep %p, refcnt %ld, mntflag %x, MNT_RDONLY %x\n", dep, dep->de_refcnt, vp->v_mount->mnt_flag, MNT_RDONLY); #endif if (dep->de_refcnt <= 0 && (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { error = detrunc(dep, (u_long) 0, 0, NOCRED); dep->de_flag |= DE_UPDATE; dep->de_Name[0] = SLOT_DELETED; } deupdat(dep, 0); out: /* * If we are done with the denode, reclaim it * so that it can be reused immediately. */ #ifdef MSDOSFS_DEBUG printf("msdosfs_inactive(): v_usecount %d, de_Name[0] %x\n", vrefcnt(vp), dep->de_Name[0]); #endif if (dep->de_Name[0] == SLOT_DELETED || dep->de_Name[0] == SLOT_EMPTY) vrecycle(vp); return (error); } Index: stable/11/sys/fs/msdosfs/msdosfs_vnops.c =================================================================== --- stable/11/sys/fs/msdosfs/msdosfs_vnops.c (revision 331016) +++ stable/11/sys/fs/msdosfs/msdosfs_vnops.c (revision 331017) @@ -1,1962 +1,1963 @@ /* $FreeBSD$ */ /* $NetBSD: msdosfs_vnops.c,v 1.68 1998/02/10 14:10:04 mrg Exp $ */ /*- * Copyright (C) 1994, 1995, 1997 Wolfgang Solfrank. * Copyright (C) 1994, 1995, 1997 TooLs GmbH. * All rights reserved. * Original code by Paul Popelka (paulp@uts.amdahl.com) (see below). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by TooLs GmbH. * 4. The name of TooLs GmbH may not be used to endorse or promote products * derived from this software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /*- * Written by Paul Popelka (paulp@uts.amdahl.com) * * You can do anything you want with this software, just don't say you wrote * it, and don't remove this notice. * * This software is provided "as is". * * The author supplies this software to be publicly redistributed on the * understanding that the author is not responsible for the correct * functioning of this software in any circumstances and is not liable for * any damages caused by this software. * * October 1992 */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #define DOS_FILESIZE_MAX 0xffffffff /* * Prototypes for MSDOSFS vnode operations */ static vop_create_t msdosfs_create; static vop_mknod_t msdosfs_mknod; static vop_open_t msdosfs_open; static vop_close_t msdosfs_close; static vop_access_t msdosfs_access; static vop_getattr_t msdosfs_getattr; static vop_setattr_t msdosfs_setattr; static vop_read_t msdosfs_read; static vop_write_t msdosfs_write; static vop_fsync_t msdosfs_fsync; static vop_remove_t msdosfs_remove; static vop_link_t msdosfs_link; static vop_rename_t msdosfs_rename; static vop_mkdir_t msdosfs_mkdir; static vop_rmdir_t msdosfs_rmdir; static vop_symlink_t msdosfs_symlink; static vop_readdir_t msdosfs_readdir; static vop_bmap_t msdosfs_bmap; static vop_getpages_t msdosfs_getpages; static vop_strategy_t msdosfs_strategy; static vop_print_t msdosfs_print; static vop_pathconf_t msdosfs_pathconf; static vop_vptofh_t msdosfs_vptofh; /* * Some general notes: * * In the ufs filesystem the inodes, superblocks, and indirect blocks are * read/written using the vnode for the filesystem. Blocks that represent * the contents of a file are read/written using the vnode for the file * (including directories when they are read/written as files). This * presents problems for the dos filesystem because data that should be in * an inode (if dos had them) resides in the directory itself. Since we * must update directory entries without the benefit of having the vnode * for the directory we must use the vnode for the filesystem. This means * that when a directory is actually read/written (via read, write, or * readdir, or seek) we must use the vnode for the filesystem instead of * the vnode for the directory as would happen in ufs. This is to insure we * retrieve the correct block from the buffer cache since the hash value is * based upon the vnode address and the desired block number. */ /* * Create a regular file. On entry the directory to contain the file being * created is locked. We must release before we return. 
We must also free * the pathname buffer pointed at by cnp->cn_pnbuf, always on error, or * only if the SAVESTART bit in cn_flags is clear on success. */ static int msdosfs_create(struct vop_create_args *ap) { struct componentname *cnp = ap->a_cnp; struct denode ndirent; struct denode *dep; struct denode *pdep = VTODE(ap->a_dvp); struct timespec ts; int error; #ifdef MSDOSFS_DEBUG printf("msdosfs_create(cnp %p, vap %p\n", cnp, ap->a_vap); #endif /* * If this is the root directory and there is no space left we * can't do anything. This is because the root directory can not * change size. */ if (pdep->de_StartCluster == MSDOSFSROOT && pdep->de_fndoffset >= pdep->de_FileSize) { error = ENOSPC; goto bad; } /* * Create a directory entry for the file, then call createde() to * have it installed. NOTE: DOS files are always executable. We * use the absence of the owner write bit to make the file * readonly. */ #ifdef DIAGNOSTIC if ((cnp->cn_flags & HASBUF) == 0) panic("msdosfs_create: no name"); #endif bzero(&ndirent, sizeof(ndirent)); error = uniqdosname(pdep, cnp, ndirent.de_Name); if (error) goto bad; ndirent.de_Attributes = ATTR_ARCHIVE; ndirent.de_LowerCase = 0; ndirent.de_StartCluster = 0; ndirent.de_FileSize = 0; ndirent.de_pmp = pdep->de_pmp; ndirent.de_flag = DE_ACCESS | DE_CREATE | DE_UPDATE; getnanotime(&ts); DETIMES(&ndirent, &ts, &ts, &ts); error = createde(&ndirent, pdep, &dep, cnp); if (error) goto bad; *ap->a_vpp = DETOV(dep); if ((cnp->cn_flags & MAKEENTRY) != 0) cache_enter(ap->a_dvp, *ap->a_vpp, cnp); return (0); bad: return (error); } static int msdosfs_mknod(struct vop_mknod_args *ap) { return (EINVAL); } static int msdosfs_open(struct vop_open_args *ap) { struct denode *dep = VTODE(ap->a_vp); vnode_create_vobject(ap->a_vp, dep->de_FileSize, ap->a_td); return 0; } static int msdosfs_close(struct vop_close_args *ap) { struct vnode *vp = ap->a_vp; struct denode *dep = VTODE(vp); struct timespec ts; VI_LOCK(vp); if (vp->v_usecount > 1) { getnanotime(&ts); DETIMES(dep, &ts, &ts, &ts); } VI_UNLOCK(vp); return 0; } static int msdosfs_access(struct vop_access_args *ap) { struct vnode *vp = ap->a_vp; struct denode *dep = VTODE(ap->a_vp); struct msdosfsmount *pmp = dep->de_pmp; mode_t file_mode; accmode_t accmode = ap->a_accmode; file_mode = S_IRWXU|S_IRWXG|S_IRWXO; file_mode &= (vp->v_type == VDIR ? pmp->pm_dirmask : pmp->pm_mask); /* * Disallow writing to directories and regular files if the * filesystem is read-only. */ if (accmode & VWRITE) { switch (vp->v_type) { case VREG: case VDIR: if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); break; default: break; } } return (vaccess(vp->v_type, file_mode, pmp->pm_uid, pmp->pm_gid, ap->a_accmode, ap->a_cred, NULL)); } static int msdosfs_getattr(struct vop_getattr_args *ap) { struct denode *dep = VTODE(ap->a_vp); struct msdosfsmount *pmp = dep->de_pmp; struct vattr *vap = ap->a_vap; mode_t mode; struct timespec ts; u_long dirsperblk = pmp->pm_BytesPerSec / sizeof(struct direntry); uint64_t fileid; getnanotime(&ts); DETIMES(dep, &ts, &ts, &ts); vap->va_fsid = dev2udev(pmp->pm_dev); /* * The following computation of the fileid must be the same as that * used in msdosfs_readdir() to compute d_fileno. If not, pwd * doesn't work. 
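 *
 * In outline (mirroring the code below): a directory is identified
 * by the first device block of its own cluster chain, while a
 * regular file is identified by the block of the directory cluster
 * holding its entry plus the entry's index within it, roughly
 *
 *	fileid = cntobn(pmp, cluster) * dirsperblk
 *	    + de_diroffset / sizeof(struct direntry);
 *
 * with the pre-FAT32 root directory special-cased to fileid 1.
 */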
*/ if (dep->de_Attributes & ATTR_DIRECTORY) { fileid = (uint64_t)cntobn(pmp, dep->de_StartCluster) * dirsperblk; if (dep->de_StartCluster == MSDOSFSROOT) fileid = 1; } else { fileid = (uint64_t)cntobn(pmp, dep->de_dirclust) * dirsperblk; if (dep->de_dirclust == MSDOSFSROOT) fileid = (uint64_t)roottobn(pmp, 0) * dirsperblk; fileid += (uoff_t)dep->de_diroffset / sizeof(struct direntry); } if (pmp->pm_flags & MSDOSFS_LARGEFS) vap->va_fileid = msdosfs_fileno_map(pmp->pm_mountp, fileid); else vap->va_fileid = (long)fileid; mode = S_IRWXU|S_IRWXG|S_IRWXO; vap->va_mode = mode & (ap->a_vp->v_type == VDIR ? pmp->pm_dirmask : pmp->pm_mask); vap->va_uid = pmp->pm_uid; vap->va_gid = pmp->pm_gid; vap->va_nlink = 1; vap->va_rdev = NODEV; vap->va_size = dep->de_FileSize; fattime2timespec(dep->de_MDate, dep->de_MTime, 0, 0, &vap->va_mtime); vap->va_ctime = vap->va_mtime; if (pmp->pm_flags & MSDOSFSMNT_LONGNAME) { fattime2timespec(dep->de_ADate, 0, 0, 0, &vap->va_atime); fattime2timespec(dep->de_CDate, dep->de_CTime, dep->de_CHun, 0, &vap->va_birthtime); } else { vap->va_atime = vap->va_mtime; vap->va_birthtime.tv_sec = -1; vap->va_birthtime.tv_nsec = 0; } vap->va_flags = 0; if (dep->de_Attributes & ATTR_ARCHIVE) vap->va_flags |= UF_ARCHIVE; if (dep->de_Attributes & ATTR_HIDDEN) vap->va_flags |= UF_HIDDEN; if (dep->de_Attributes & ATTR_READONLY) vap->va_flags |= UF_READONLY; if (dep->de_Attributes & ATTR_SYSTEM) vap->va_flags |= UF_SYSTEM; vap->va_gen = 0; vap->va_blocksize = pmp->pm_bpcluster; vap->va_bytes = (dep->de_FileSize + pmp->pm_crbomask) & ~pmp->pm_crbomask; vap->va_type = ap->a_vp->v_type; vap->va_filerev = dep->de_modrev; return (0); } static int msdosfs_setattr(struct vop_setattr_args *ap) { struct vnode *vp = ap->a_vp; struct denode *dep = VTODE(ap->a_vp); struct msdosfsmount *pmp = dep->de_pmp; struct vattr *vap = ap->a_vap; struct ucred *cred = ap->a_cred; struct thread *td = curthread; int error = 0; #ifdef MSDOSFS_DEBUG printf("msdosfs_setattr(): vp %p, vap %p, cred %p\n", ap->a_vp, vap, cred); #endif /* * Check for unsettable attributes. */ if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) || (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) || (vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) { #ifdef MSDOSFS_DEBUG printf("msdosfs_setattr(): returning EINVAL\n"); printf(" va_type %d, va_nlink %x, va_fsid %lx, va_fileid %lx\n", vap->va_type, vap->va_nlink, vap->va_fsid, vap->va_fileid); printf(" va_blocksize %lx, va_rdev %x, va_bytes %qx, va_gen %lx\n", vap->va_blocksize, vap->va_rdev, vap->va_bytes, vap->va_gen); printf(" va_uid %x, va_gid %x\n", vap->va_uid, vap->va_gid); #endif return (EINVAL); } /* * We don't allow setting attributes on the root directory. * The special case for the root directory is because before * FAT32, the root directory didn't have an entry for itself * (and was otherwise special). With FAT32, the root * directory is not so special, but still doesn't have an * entry for itself. */ if (vp->v_vflag & VV_ROOT) return (EINVAL); if (vap->va_flags != VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if (cred->cr_uid != pmp->pm_uid) { error = priv_check_cred(cred, PRIV_VFS_ADMIN, 0); if (error) return (error); } /* * We are very inconsistent about handling unsupported * attributes. We ignored the access time and the * read and execute bits. We were strict for the other * attributes. 
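 *
 * The supported mapping below is one-to-one: e.g. chflags(2) with
 * UF_HIDDEN sets ATTR_HIDDEN in the directory entry, and any flag
 * outside UF_ARCHIVE | UF_HIDDEN | UF_READONLY | UF_SYSTEM is
 * rejected with EOPNOTSUPP.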
*/ if (vap->va_flags & ~(UF_ARCHIVE | UF_HIDDEN | UF_READONLY | UF_SYSTEM)) return EOPNOTSUPP; if (vap->va_flags & UF_ARCHIVE) dep->de_Attributes |= ATTR_ARCHIVE; else dep->de_Attributes &= ~ATTR_ARCHIVE; if (vap->va_flags & UF_HIDDEN) dep->de_Attributes |= ATTR_HIDDEN; else dep->de_Attributes &= ~ATTR_HIDDEN; /* We don't allow changing the readonly bit on directories. */ if (vp->v_type != VDIR) { if (vap->va_flags & UF_READONLY) dep->de_Attributes |= ATTR_READONLY; else dep->de_Attributes &= ~ATTR_READONLY; } if (vap->va_flags & UF_SYSTEM) dep->de_Attributes |= ATTR_SYSTEM; else dep->de_Attributes &= ~ATTR_SYSTEM; dep->de_flag |= DE_MODIFIED; } if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) { uid_t uid; gid_t gid; if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); uid = vap->va_uid; if (uid == (uid_t)VNOVAL) uid = pmp->pm_uid; gid = vap->va_gid; if (gid == (gid_t)VNOVAL) gid = pmp->pm_gid; if (cred->cr_uid != pmp->pm_uid || uid != pmp->pm_uid || (gid != pmp->pm_gid && !groupmember(gid, cred))) { error = priv_check_cred(cred, PRIV_VFS_CHOWN, 0); if (error) return (error); } if (uid != pmp->pm_uid || gid != pmp->pm_gid) return EINVAL; } if (vap->va_size != VNOVAL) { switch (vp->v_type) { case VDIR: return (EISDIR); case VREG: /* * Truncation is only supported for regular files, * Disallow it if the filesystem is read-only. */ if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); break; default: /* * According to POSIX, the result is unspecified * for file types other than regular files, * directories and shared memory objects. We * don't support any file types except regular * files and directories in this file system, so * this (default) case is unreachable and can do * anything. Keep falling through to detrunc() * for now. */ break; } error = detrunc(dep, vap->va_size, 0, cred); if (error) return error; } if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); error = vn_utimes_perm(vp, vap, cred, td); if (error != 0) return (error); if ((pmp->pm_flags & MSDOSFSMNT_NOWIN95) == 0 && vap->va_atime.tv_sec != VNOVAL) { dep->de_flag &= ~DE_ACCESS; timespec2fattime(&vap->va_atime, 0, &dep->de_ADate, NULL, NULL); } if (vap->va_mtime.tv_sec != VNOVAL) { dep->de_flag &= ~DE_UPDATE; timespec2fattime(&vap->va_mtime, 0, &dep->de_MDate, &dep->de_MTime, NULL); } /* * We don't set the archive bit when modifying the time of * a directory to emulate the Windows/DOS behavior. */ if (vp->v_type != VDIR) dep->de_Attributes |= ATTR_ARCHIVE; dep->de_flag |= DE_MODIFIED; } /* * DOS files only have the ability to have their writability * attribute set, so we use the owner write bit to set the readonly * attribute. */ if (vap->va_mode != (mode_t)VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if (cred->cr_uid != pmp->pm_uid) { error = priv_check_cred(cred, PRIV_VFS_ADMIN, 0); if (error) return (error); } if (vp->v_type != VDIR) { /* We ignore the read and execute bits. 
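 * Illustratively, chmod(2) to 0444 sets ATTR_READONLY and chmod to
 * 0644 clears it; the group/other and read/execute bits have no
 * on-disk representation and are synthesized from the mount's
 * pm_mask/pm_dirmask options when attributes are read back.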
*/ if (vap->va_mode & VWRITE) dep->de_Attributes &= ~ATTR_READONLY; else dep->de_Attributes |= ATTR_READONLY; dep->de_Attributes |= ATTR_ARCHIVE; dep->de_flag |= DE_MODIFIED; } } return (deupdat(dep, 0)); } static int msdosfs_read(struct vop_read_args *ap) { int error = 0; int blsize; int isadir; ssize_t orig_resid; u_int n; u_long diff; u_long on; daddr_t lbn; daddr_t rablock; int rasize; int seqcount; struct buf *bp; struct vnode *vp = ap->a_vp; struct denode *dep = VTODE(vp); struct msdosfsmount *pmp = dep->de_pmp; struct uio *uio = ap->a_uio; /* * If they didn't ask for any data, then we are done. */ orig_resid = uio->uio_resid; if (orig_resid == 0) return (0); /* * The caller is supposed to ensure that * uio->uio_offset >= 0 and uio->uio_resid >= 0. * We don't need to check for large offsets as in ffs because * dep->de_FileSize <= DOS_FILESIZE_MAX < OFF_MAX, so large * offsets cannot cause overflow even in theory. */ seqcount = ap->a_ioflag >> IO_SEQSHIFT; isadir = dep->de_Attributes & ATTR_DIRECTORY; do { if (uio->uio_offset >= dep->de_FileSize) break; lbn = de_cluster(pmp, uio->uio_offset); rablock = lbn + 1; blsize = pmp->pm_bpcluster; on = uio->uio_offset & pmp->pm_crbomask; /* * If we are operating on a directory file then be sure to * do i/o with the vnode for the filesystem instead of the * vnode for the directory. */ if (isadir) { /* convert cluster # to block # */ error = pcbmap(dep, lbn, &lbn, 0, &blsize); if (error == E2BIG) { error = EINVAL; break; } else if (error) break; error = bread(pmp->pm_devvp, lbn, blsize, NOCRED, &bp); } else if (de_cn2off(pmp, rablock) >= dep->de_FileSize) { error = bread(vp, lbn, blsize, NOCRED, &bp); } else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) { error = cluster_read(vp, dep->de_FileSize, lbn, blsize, NOCRED, on + uio->uio_resid, seqcount, 0, &bp); } else if (seqcount > 1) { rasize = blsize; error = breadn(vp, lbn, blsize, &rablock, &rasize, 1, NOCRED, &bp); } else { error = bread(vp, lbn, blsize, NOCRED, &bp); } if (error) { brelse(bp); break; } diff = pmp->pm_bpcluster - on; n = diff > uio->uio_resid ? uio->uio_resid : diff; diff = dep->de_FileSize - uio->uio_offset; if (diff < n) n = diff; diff = blsize - bp->b_resid; if (diff < n) n = diff; error = vn_io_fault_uiomove(bp->b_data + on, (int) n, uio); brelse(bp); } while (error == 0 && uio->uio_resid > 0 && n != 0); if (!isadir && (error == 0 || uio->uio_resid != orig_resid) && (vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0) dep->de_flag |= DE_ACCESS; return (error); } /* * Write data to a file or directory. 
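 * Directory writes are in fact rejected with EISDIR below. For a
 * regular file the path is, in outline:
 *
 *	deextend()   - zero-fill any hole created by a seek past EOF
 *	extendfile() - pre-allocate clusters out to the final size
 *	per-cluster loop - getblk() for whole-cluster writes or
 *	    bread() for partial ones, vn_io_fault_uiomove(), then
 *	    bwrite()/bawrite()/bdwrite()/cluster_write() as appropriate
 *	deupdat()    - for IO_SYNC, push the directory entry update
 */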
*/ static int msdosfs_write(struct vop_write_args *ap) { int n; int croffset; ssize_t resid; u_long osize; int error = 0; u_long count; int seqcount; daddr_t bn, lastcn; struct buf *bp; int ioflag = ap->a_ioflag; struct uio *uio = ap->a_uio; struct vnode *vp = ap->a_vp; struct vnode *thisvp; struct denode *dep = VTODE(vp); struct msdosfsmount *pmp = dep->de_pmp; struct ucred *cred = ap->a_cred; #ifdef MSDOSFS_DEBUG printf("msdosfs_write(vp %p, uio %p, ioflag %x, cred %p\n", vp, uio, ioflag, cred); printf("msdosfs_write(): diroff %lu, dirclust %lu, startcluster %lu\n", dep->de_diroffset, dep->de_dirclust, dep->de_StartCluster); #endif switch (vp->v_type) { case VREG: if (ioflag & IO_APPEND) uio->uio_offset = dep->de_FileSize; thisvp = vp; break; case VDIR: return EISDIR; default: panic("msdosfs_write(): bad file type"); } /* * This is needed (unlike in ffs_write()) because we extend the * file outside of the loop but we don't want to extend the file * for writes of 0 bytes. */ if (uio->uio_resid == 0) return (0); /* * The caller is supposed to ensure that * uio->uio_offset >= 0 and uio->uio_resid >= 0. */ if ((uoff_t)uio->uio_offset + uio->uio_resid > DOS_FILESIZE_MAX) return (EFBIG); /* * If they've exceeded their filesize limit, tell them about it. */ if (vn_rlimit_fsize(vp, uio, uio->uio_td)) return (EFBIG); /* * If the offset we are starting the write at is beyond the end of * the file, then they've done a seek. Unix filesystems allow * files with holes in them, DOS doesn't so we must fill the hole * with zeroed blocks. */ if (uio->uio_offset > dep->de_FileSize) { error = deextend(dep, uio->uio_offset, cred); if (error) return (error); } /* * Remember some values in case the write fails. */ resid = uio->uio_resid; osize = dep->de_FileSize; /* * If we write beyond the end of the file, extend it to its ultimate * size ahead of the time to hopefully get a contiguous area. */ if (uio->uio_offset + resid > osize) { count = de_clcount(pmp, uio->uio_offset + resid) - de_clcount(pmp, osize); error = extendfile(dep, count, NULL, NULL, 0); if (error && (error != ENOSPC || (ioflag & IO_UNIT))) goto errexit; lastcn = dep->de_fc[FC_LASTFC].fc_frcn; } else lastcn = de_clcount(pmp, osize) - 1; seqcount = ioflag >> IO_SEQSHIFT; do { if (de_cluster(pmp, uio->uio_offset) > lastcn) { error = ENOSPC; break; } croffset = uio->uio_offset & pmp->pm_crbomask; n = min(uio->uio_resid, pmp->pm_bpcluster - croffset); if (uio->uio_offset + n > dep->de_FileSize) { dep->de_FileSize = uio->uio_offset + n; /* The object size needs to be set before buffer is allocated */ vnode_pager_setsize(vp, dep->de_FileSize); } bn = de_cluster(pmp, uio->uio_offset); if ((uio->uio_offset & pmp->pm_crbomask) == 0 && (de_cluster(pmp, uio->uio_offset + uio->uio_resid) > de_cluster(pmp, uio->uio_offset) || uio->uio_offset + uio->uio_resid >= dep->de_FileSize)) { /* * If either the whole cluster gets written, * or we write the cluster from its start beyond EOF, * then no need to read data from disk. */ bp = getblk(thisvp, bn, pmp->pm_bpcluster, 0, 0, 0); /* * This call to vfs_bio_clrbuf() ensures that * even if vn_io_fault_uiomove() below faults, * garbage from the newly instantiated buffer * is not exposed to the userspace via mmap(). */ vfs_bio_clrbuf(bp); /* * Do the bmap now, since pcbmap needs buffers * for the fat table. 
(see msdosfs_strategy) */ if (bp->b_blkno == bp->b_lblkno) { error = pcbmap(dep, bp->b_lblkno, &bn, 0, 0); if (error) bp->b_blkno = -1; else bp->b_blkno = bn; } if (bp->b_blkno == -1) { brelse(bp); if (!error) error = EIO; /* XXX */ break; } } else { /* * The block we need to write into exists, so read it in. */ error = bread(thisvp, bn, pmp->pm_bpcluster, cred, &bp); if (error) { brelse(bp); break; } } /* * Should these vnode_pager_* functions be done on dir * files? */ /* * Copy the data from user space into the buf header. */ error = vn_io_fault_uiomove(bp->b_data + croffset, n, uio); if (error) { brelse(bp); break; } /* Prepare for clustered writes in some else clauses. */ if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) bp->b_flags |= B_CLUSTEROK; /* * If IO_SYNC, then each buffer is written synchronously. * Otherwise, if we have a severe page deficiency then * write the buffer asynchronously. Otherwise, if on a * cluster boundary then write the buffer asynchronously, * combining it with contiguous clusters if permitted and * possible, since we don't expect more writes into this * buffer soon. Otherwise, do a delayed write because we * expect more writes into this buffer soon. */ if (ioflag & IO_SYNC) (void)bwrite(bp); else if (vm_page_count_severe() || buf_dirty_count_severe()) bawrite(bp); else if (n + croffset == pmp->pm_bpcluster) { if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) cluster_write(vp, bp, dep->de_FileSize, seqcount, 0); else bawrite(bp); } else bdwrite(bp); dep->de_flag |= DE_UPDATE; } while (error == 0 && uio->uio_resid > 0); /* * If the write failed and they want us to, truncate the file back * to the size it was before the write was attempted. */ errexit: if (error) { if (ioflag & IO_UNIT) { detrunc(dep, osize, ioflag & IO_SYNC, NOCRED); uio->uio_offset -= resid - uio->uio_resid; uio->uio_resid = resid; } else { detrunc(dep, dep->de_FileSize, ioflag & IO_SYNC, NOCRED); if (uio->uio_resid != resid) error = 0; } } else if (ioflag & IO_SYNC) error = deupdat(dep, 1); return (error); } /* * Flush the blocks of a file to disk. */ static int msdosfs_fsync(struct vop_fsync_args *ap) { struct vnode *devvp; int allerror, error; vop_stdfsync(ap); /* * If the syncing request comes from fsync(2), sync the entire * FAT and any other metadata that happens to be on devvp. We * need this mainly for the FAT. We write the FAT sloppily, and * syncing it all now is the best we can easily do to get all * directory entries associated with the file (not just the file) * fully synced. The other metadata includes critical metadata * for all directory entries, but only in the MNT_ASYNC case. We * will soon sync all metadata in the file's directory entry. * Non-critical metadata for associated directory entries only * gets synced accidentally, as in most file systems. */ if (ap->a_waitfor == MNT_WAIT) { devvp = VTODE(ap->a_vp)->de_pmp->pm_devvp; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); allerror = VOP_FSYNC(devvp, MNT_WAIT, ap->a_td); VOP_UNLOCK(devvp, 0); } else allerror = 0; error = deupdat(VTODE(ap->a_vp), ap->a_waitfor == MNT_WAIT); if (allerror == 0) allerror = error; return (allerror); } static int msdosfs_remove(struct vop_remove_args *ap) { struct denode *dep = VTODE(ap->a_vp); struct denode *ddep = VTODE(ap->a_dvp); int error; if (ap->a_vp->v_type == VDIR) error = EPERM; else error = removede(ddep, dep); #ifdef MSDOSFS_DEBUG printf("msdosfs_remove(), dep %p, v_usecount %d\n", dep, ap->a_vp->v_usecount); #endif return (error); } /* * DOS filesystems don't know what links are. 
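 * Hard links have no representation in a FAT directory, so
 * msdosfs_link() below simply returns EOPNOTSUPP and link(2) or
 * ln(1) on an msdosfs volume fails with "Operation not supported".
 */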
*/ static int msdosfs_link(struct vop_link_args *ap) { return (EOPNOTSUPP); } /* * Renames on files require moving the denode to a new hash queue since the * denode's location is used to compute which hash queue to put the file * in. Unless it is a rename in place. For example "mv a b". * * What follows is the basic algorithm: * * if (file move) { * if (dest file exists) { * remove dest file * } * if (dest and src in same directory) { * rewrite name in existing directory slot * } else { * write new entry in dest directory * update offset and dirclust in denode * move denode to new hash chain * clear old directory entry * } * } else { * directory move * if (dest directory exists) { * if (dest is not empty) { * return ENOTEMPTY * } * remove dest directory * } * if (dest and src in same directory) { * rewrite name in existing entry * } else { * be sure dest is not a child of src directory * write entry in dest directory * update "." and ".." in moved directory * clear old directory entry for moved directory * } * } * * On entry: * source's parent directory is unlocked * source file or directory is unlocked * destination's parent directory is locked * destination file or directory is locked if it exists * * On exit: * all denodes should be released */ static int msdosfs_rename(struct vop_rename_args *ap) { struct vnode *tdvp = ap->a_tdvp; struct vnode *fvp = ap->a_fvp; struct vnode *fdvp = ap->a_fdvp; struct vnode *tvp = ap->a_tvp; struct componentname *tcnp = ap->a_tcnp; struct componentname *fcnp = ap->a_fcnp; struct denode *ip, *xp, *dp, *zp; u_char toname[12], oldname[11]; u_long from_diroffset, to_diroffset; u_char to_count; int doingdirectory = 0, newparent = 0; int error; u_long cn, pcl; daddr_t bn; struct msdosfsmount *pmp; struct direntry *dotdotp; struct buf *bp; pmp = VFSTOMSDOSFS(fdvp->v_mount); #ifdef DIAGNOSTIC if ((tcnp->cn_flags & HASBUF) == 0 || (fcnp->cn_flags & HASBUF) == 0) panic("msdosfs_rename: no name"); #endif /* * Check for cross-device rename. */ if (fvp->v_mount != tdvp->v_mount || (tvp && fvp->v_mount != tvp->v_mount)) { error = EXDEV; abortit: if (tdvp == tvp) vrele(tdvp); else vput(tdvp); if (tvp) vput(tvp); vrele(fdvp); vrele(fvp); return (error); } /* * If source and dest are the same, do nothing. */ if (tvp == fvp) { error = 0; goto abortit; } error = vn_lock(fvp, LK_EXCLUSIVE); if (error) goto abortit; dp = VTODE(fdvp); ip = VTODE(fvp); /* * Be sure we are not renaming ".", "..", or an alias of ".". This * leads to a crippled directory tree. It's pretty tough to do a * "ls" or "pwd" with the "." directory entry missing, and "cd .." * doesn't work if the ".." entry is missing. */ if (ip->de_Attributes & ATTR_DIRECTORY) { /* * Avoid ".", "..", and aliases of "." for obvious reasons. */ if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') || dp == ip || (fcnp->cn_flags & ISDOTDOT) || (tcnp->cn_flags & ISDOTDOT) || (ip->de_flag & DE_RENAME)) { VOP_UNLOCK(fvp, 0); error = EINVAL; goto abortit; } ip->de_flag |= DE_RENAME; doingdirectory++; } /* * When the target exists, both the directory * and target vnodes are returned locked. */ dp = VTODE(tdvp); xp = tvp ? VTODE(tvp) : NULL; /* * Remember direntry place to use for destination */ to_diroffset = dp->de_fndoffset; to_count = dp->de_fndcnt; /* * If ".." must be changed (ie the directory gets a new * parent) then the source directory must not be in the * directory hierarchy above the target, as this would * orphan everything below the source directory. 
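 * (doscheckpath() enforces that by walking ".." entries upward
 * from the target directory toward the root and failing if it
 * encounters the source directory on the way.)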
Also * the user must have write permission in the source so * as to be able to change "..". We must repeat the call * to namei, as the parent directory is unlocked by the * call to doscheckpath(). */ error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_thread); VOP_UNLOCK(fvp, 0); if (VTODE(fdvp)->de_StartCluster != VTODE(tdvp)->de_StartCluster) newparent = 1; if (doingdirectory && newparent) { if (error) /* write access check above */ goto bad; if (xp != NULL) vput(tvp); /* * doscheckpath() vput()'s dp, * so we have to do a relookup afterwards */ error = doscheckpath(ip, dp); if (error) goto out; if ((tcnp->cn_flags & SAVESTART) == 0) panic("msdosfs_rename: lost to startdir"); error = relookup(tdvp, &tvp, tcnp); if (error) goto out; dp = VTODE(tdvp); xp = tvp ? VTODE(tvp) : NULL; } if (xp != NULL) { /* * Target must be empty if a directory and have no links * to it. Also, ensure source and target are compatible * (both directories, or both not directories). */ if (xp->de_Attributes & ATTR_DIRECTORY) { if (!dosdirempty(xp)) { error = ENOTEMPTY; goto bad; } if (!doingdirectory) { error = ENOTDIR; goto bad; } cache_purge(tdvp); } else if (doingdirectory) { error = EISDIR; goto bad; } error = removede(dp, xp); if (error) goto bad; vput(tvp); xp = NULL; } /* * Convert the filename in tcnp into a dos filename. We copy this * into the denode and directory entry for the destination * file/directory. */ error = uniqdosname(VTODE(tdvp), tcnp, toname); if (error) goto abortit; /* * Since from wasn't locked at various places above, * have to do a relookup here. */ fcnp->cn_flags &= ~MODMASK; fcnp->cn_flags |= LOCKPARENT | LOCKLEAF; if ((fcnp->cn_flags & SAVESTART) == 0) panic("msdosfs_rename: lost from startdir"); if (!newparent) VOP_UNLOCK(tdvp, 0); if (relookup(fdvp, &fvp, fcnp) == 0) vrele(fdvp); if (fvp == NULL) { /* * From name has disappeared. */ if (doingdirectory) panic("rename: lost dir entry"); if (newparent) VOP_UNLOCK(tdvp, 0); vrele(tdvp); vrele(ap->a_fvp); return 0; } xp = VTODE(fvp); zp = VTODE(fdvp); from_diroffset = zp->de_fndoffset; /* * Ensure that the directory entry still exists and has not * changed till now. If the source is a file the entry may * have been unlinked or renamed. In either case there is * no further work to be done. If the source is a directory * then it cannot have been rmdir'ed or renamed; this is * prohibited by the DE_RENAME flag. */ if (xp != ip) { if (doingdirectory) panic("rename: lost dir entry"); VOP_UNLOCK(fvp, 0); if (newparent) VOP_UNLOCK(fdvp, 0); vrele(ap->a_fvp); xp = NULL; } else { vrele(fvp); xp = NULL; /* * First write a new entry in the destination * directory and mark the entry in the source directory * as deleted. Then move the denode to the correct hash * chain for its new location in the filesystem. And, if * we moved a directory, then update its .. entry to point * to the new parent directory. */ bcopy(ip->de_Name, oldname, 11); bcopy(toname, ip->de_Name, 11); /* update denode */ dp->de_fndoffset = to_diroffset; dp->de_fndcnt = to_count; error = createde(ip, dp, (struct denode **)0, tcnp); if (error) { bcopy(oldname, ip->de_Name, 11); if (newparent) VOP_UNLOCK(fdvp, 0); VOP_UNLOCK(fvp, 0); goto bad; } /* * If ip is for a directory, then its name should always * be "." since it is for the directory entry in the * directory itself (msdosfs_lookup() always translates * to the "." entry so as to get a unique denode, except * for the root directory there are different * complications). 
However, we just corrupted its name * to pass the correct name to createde(). Undo this. */ if ((ip->de_Attributes & ATTR_DIRECTORY) != 0) bcopy(oldname, ip->de_Name, 11); ip->de_refcnt++; zp->de_fndoffset = from_diroffset; error = removede(zp, ip); if (error) { /* XXX should downgrade to ro here, fs is corrupt */ if (newparent) VOP_UNLOCK(fdvp, 0); VOP_UNLOCK(fvp, 0); goto bad; } if (!doingdirectory) { error = pcbmap(dp, de_cluster(pmp, to_diroffset), 0, &ip->de_dirclust, 0); if (error) { /* XXX should downgrade to ro here, fs is corrupt */ if (newparent) VOP_UNLOCK(fdvp, 0); VOP_UNLOCK(fvp, 0); goto bad; } if (ip->de_dirclust == MSDOSFSROOT) ip->de_diroffset = to_diroffset; else ip->de_diroffset = to_diroffset & pmp->pm_crbomask; } reinsert(ip); if (newparent) VOP_UNLOCK(fdvp, 0); } /* * If we moved a directory to a new parent directory, then we must * fixup the ".." entry in the moved directory. */ if (doingdirectory && newparent) { cn = ip->de_StartCluster; if (cn == MSDOSFSROOT) { /* this should never happen */ panic("msdosfs_rename(): updating .. in root directory?"); } else bn = cntobn(pmp, cn); error = bread(pmp->pm_devvp, bn, pmp->pm_bpcluster, NOCRED, &bp); if (error) { /* XXX should downgrade to ro here, fs is corrupt */ brelse(bp); VOP_UNLOCK(fvp, 0); goto bad; } dotdotp = (struct direntry *)bp->b_data + 1; pcl = dp->de_StartCluster; if (FAT32(pmp) && pcl == pmp->pm_rootdirblk) pcl = MSDOSFSROOT; putushort(dotdotp->deStartCluster, pcl); if (FAT32(pmp)) putushort(dotdotp->deHighClust, pcl >> 16); if (DOINGASYNC(fvp)) bdwrite(bp); else if ((error = bwrite(bp)) != 0) { /* XXX should downgrade to ro here, fs is corrupt */ VOP_UNLOCK(fvp, 0); goto bad; } } /* * The msdosfs lookup is case insensitive. Several aliases may * be inserted for a single directory entry. As a consequence, * the name cache purge done by lookup for fvp when the DELETE op * for namei is specified might not be enough to expunge all * namecache entries that were installed for this direntry. */ cache_purge(fvp); VOP_UNLOCK(fvp, 0); bad: if (xp) vput(tvp); vput(tdvp); out: ip->de_flag &= ~DE_RENAME; vrele(fdvp); vrele(fvp); return (error); } static struct { struct direntry dot; struct direntry dotdot; } dosdirtemplate = { { ". ", /* the . entry */ ATTR_DIRECTORY, /* file attribute */ 0, /* reserved */ 0, { 0, 0 }, { 0, 0 }, /* create time & date */ { 0, 0 }, /* access date */ { 0, 0 }, /* high bits of start cluster */ { 210, 4 }, { 210, 4 }, /* modify time & date */ { 0, 0 }, /* startcluster */ { 0, 0, 0, 0 } /* filesize */ }, { ".. ", /* the .. entry */ ATTR_DIRECTORY, /* file attribute */ 0, /* reserved */ 0, { 0, 0 }, { 0, 0 }, /* create time & date */ { 0, 0 }, /* access date */ { 0, 0 }, /* high bits of start cluster */ { 210, 4 }, { 210, 4 }, /* modify time & date */ { 0, 0 }, /* startcluster */ { 0, 0, 0, 0 } /* filesize */ } }; static int msdosfs_mkdir(struct vop_mkdir_args *ap) { struct componentname *cnp = ap->a_cnp; struct denode *dep; struct denode *pdep = VTODE(ap->a_dvp); struct direntry *denp; struct msdosfsmount *pmp = pdep->de_pmp; struct buf *bp; u_long newcluster, pcl; int bn; int error; struct denode ndirent; struct timespec ts; /* * If this is the root directory and there is no space left we * can't do anything. This is because the root directory can not * change size. */ if (pdep->de_StartCluster == MSDOSFSROOT && pdep->de_fndoffset >= pdep->de_FileSize) { error = ENOSPC; goto bad2; } /* * Allocate a cluster to hold the about to be created directory. 
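 *
 * The dosdirtemplate above provides the fixed "." and ".." slots;
 * the code below stamps them with the real cluster numbers and
 * timestamps before the cluster is written out, e.g.:
 *
 *	putushort(denp[0].deStartCluster, newcluster);	("." entry)
 *	putushort(denp[1].deStartCluster, pcl);		(".." entry)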
*/ error = clusteralloc(pmp, 0, 1, CLUST_EOFE, &newcluster, NULL); if (error) goto bad2; bzero(&ndirent, sizeof(ndirent)); ndirent.de_pmp = pmp; ndirent.de_flag = DE_ACCESS | DE_CREATE | DE_UPDATE; getnanotime(&ts); DETIMES(&ndirent, &ts, &ts, &ts); /* * Now fill the cluster with the "." and ".." entries. And write * the cluster to disk. This way it is there for the parent * directory to be pointing at if there were a crash. */ bn = cntobn(pmp, newcluster); /* always succeeds */ bp = getblk(pmp->pm_devvp, bn, pmp->pm_bpcluster, 0, 0, 0); bzero(bp->b_data, pmp->pm_bpcluster); bcopy(&dosdirtemplate, bp->b_data, sizeof dosdirtemplate); denp = (struct direntry *)bp->b_data; putushort(denp[0].deStartCluster, newcluster); putushort(denp[0].deCDate, ndirent.de_CDate); putushort(denp[0].deCTime, ndirent.de_CTime); denp[0].deCHundredth = ndirent.de_CHun; putushort(denp[0].deADate, ndirent.de_ADate); putushort(denp[0].deMDate, ndirent.de_MDate); putushort(denp[0].deMTime, ndirent.de_MTime); pcl = pdep->de_StartCluster; /* * Although the root directory has a non-magic starting cluster * number for FAT32, chkdsk and fsck_msdosfs still require * references to it in dotdot entries to be magic. */ if (FAT32(pmp) && pcl == pmp->pm_rootdirblk) pcl = MSDOSFSROOT; putushort(denp[1].deStartCluster, pcl); putushort(denp[1].deCDate, ndirent.de_CDate); putushort(denp[1].deCTime, ndirent.de_CTime); denp[1].deCHundredth = ndirent.de_CHun; putushort(denp[1].deADate, ndirent.de_ADate); putushort(denp[1].deMDate, ndirent.de_MDate); putushort(denp[1].deMTime, ndirent.de_MTime); if (FAT32(pmp)) { putushort(denp[0].deHighClust, newcluster >> 16); putushort(denp[1].deHighClust, pcl >> 16); } if (DOINGASYNC(ap->a_dvp)) bdwrite(bp); else if ((error = bwrite(bp)) != 0) goto bad; /* * Now build up a directory entry pointing to the newly allocated * cluster. This will be written to an empty slot in the parent * directory. */ #ifdef DIAGNOSTIC if ((cnp->cn_flags & HASBUF) == 0) panic("msdosfs_mkdir: no name"); #endif error = uniqdosname(pdep, cnp, ndirent.de_Name); if (error) goto bad; ndirent.de_Attributes = ATTR_DIRECTORY; ndirent.de_LowerCase = 0; ndirent.de_StartCluster = newcluster; ndirent.de_FileSize = 0; error = createde(&ndirent, pdep, &dep, cnp); if (error) goto bad; *ap->a_vpp = DETOV(dep); return (0); bad: clusterfree(pmp, newcluster, NULL); bad2: return (error); } static int msdosfs_rmdir(struct vop_rmdir_args *ap) { struct vnode *vp = ap->a_vp; struct vnode *dvp = ap->a_dvp; struct componentname *cnp = ap->a_cnp; struct denode *ip, *dp; int error; ip = VTODE(vp); dp = VTODE(dvp); /* * Verify the directory is empty (and valid). * (Rmdir ".." won't be valid since * ".." will contain a reference to * the current directory and thus be * non-empty.) */ error = 0; if (!dosdirempty(ip) || ip->de_flag & DE_RENAME) { error = ENOTEMPTY; goto out; } /* * Delete the entry from the directory. For dos filesystems this * gets rid of the directory entry on disk, the in memory copy * still exists but the de_refcnt is <= 0. This prevents it from * being found by deget(). When the vput() on dep is done we give * up access and eventually msdosfs_reclaim() will be called which * will remove it from the denode cache. */ error = removede(dp, ip); if (error) goto out; /* * This is where we decrement the link count in the parent * directory. Since dos filesystems don't do this we just purge * the name cache. */ cache_purge(dvp); /* * Truncate the directory that is being deleted. 
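*/

/*
 * Editor's note: illustrative only. The deCDate/deCTime words written
 * by msdosfs_mkdir() above use the standard FAT packed layout (date:
 * day in bits 0-4, month in 5-8, year-1980 in 9-15; time: seconds/2
 * in bits 0-4, minutes in 5-10, hours in 11-15). A minimal encoder
 * under that assumption:
 */
#if 0
#include <stdint.h>

static uint16_t
fat_pack_date(int year, int month, int day)	/* year >= 1980 */
{
	return (((year - 1980) << 9) | (month << 5) | day);
}

static uint16_t
fat_pack_time(int hour, int min, int sec)
{
	return ((hour << 11) | (min << 5) | (sec >> 1));
}
#endif

/*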
*/ error = detrunc(ip, (u_long)0, IO_SYNC, cnp->cn_cred); cache_purge(vp); out: return (error); } /* * DOS filesystems don't know what symlinks are. */ static int msdosfs_symlink(struct vop_symlink_args *ap) { return (EOPNOTSUPP); } static int msdosfs_readdir(struct vop_readdir_args *ap) { struct mbnambuf nb; int error = 0; int diff; long n; int blsize; long on; u_long cn; uint64_t fileno; u_long dirsperblk; long bias = 0; daddr_t bn, lbn; struct buf *bp; struct denode *dep = VTODE(ap->a_vp); struct msdosfsmount *pmp = dep->de_pmp; struct direntry *dentp; struct dirent dirbuf; struct uio *uio = ap->a_uio; u_long *cookies = NULL; int ncookies = 0; off_t offset, off; int chksum = -1; #ifdef MSDOSFS_DEBUG printf("msdosfs_readdir(): vp %p, uio %p, cred %p, eofflagp %p\n", ap->a_vp, uio, ap->a_cred, ap->a_eofflag); #endif /* * msdosfs_readdir() won't operate properly on regular files since * it does i/o only with the filesystem vnode, and hence can * retrieve the wrong block from the buffer cache for a plain file. * So, fail attempts to readdir() on a plain file. */ if ((dep->de_Attributes & ATTR_DIRECTORY) == 0) return (ENOTDIR); /* * To be safe, initialize dirbuf */ bzero(dirbuf.d_name, sizeof(dirbuf.d_name)); /* * If the user buffer is smaller than the size of one dos directory * entry or the file offset is not a multiple of the size of a * directory entry, then we fail the read. */ off = offset = uio->uio_offset; if (uio->uio_resid < sizeof(struct direntry) || (offset & (sizeof(struct direntry) - 1))) return (EINVAL); if (ap->a_ncookies) { ncookies = uio->uio_resid / 16; cookies = malloc(ncookies * sizeof(u_long), M_TEMP, M_WAITOK); *ap->a_cookies = cookies; *ap->a_ncookies = ncookies; } dirsperblk = pmp->pm_BytesPerSec / sizeof(struct direntry); /* * If they are reading from the root directory then, we simulate * the . and .. entries since these don't exist in the root * directory. We also set the offset bias to make up for having to * simulate these entries. By this I mean that at file offset 64 we * read the first entry in the root directory that lives on disk. */ if (dep->de_StartCluster == MSDOSFSROOT || (FAT32(pmp) && dep->de_StartCluster == pmp->pm_rootdirblk)) { #if 0 printf("msdosfs_readdir(): going after . or .. 
in root dir, offset %d\n", offset); #endif bias = 2 * sizeof(struct direntry); if (offset < bias) { for (n = (int)offset / sizeof(struct direntry); n < 2; n++) { if (FAT32(pmp)) fileno = (uint64_t)cntobn(pmp, pmp->pm_rootdirblk) * dirsperblk; else fileno = 1; if (pmp->pm_flags & MSDOSFS_LARGEFS) { dirbuf.d_fileno = msdosfs_fileno_map(pmp->pm_mountp, fileno); } else { dirbuf.d_fileno = (uint32_t)fileno; } dirbuf.d_type = DT_DIR; switch (n) { case 0: dirbuf.d_namlen = 1; strcpy(dirbuf.d_name, "."); break; case 1: dirbuf.d_namlen = 2; strcpy(dirbuf.d_name, ".."); break; } dirbuf.d_reclen = GENERIC_DIRSIZ(&dirbuf); if (uio->uio_resid < dirbuf.d_reclen) goto out; error = uiomove(&dirbuf, dirbuf.d_reclen, uio); if (error) goto out; offset += sizeof(struct direntry); off = offset; if (cookies) { *cookies++ = offset; if (--ncookies <= 0) goto out; } } } } mbnambuf_init(&nb); off = offset; while (uio->uio_resid > 0) { lbn = de_cluster(pmp, offset - bias); on = (offset - bias) & pmp->pm_crbomask; n = min(pmp->pm_bpcluster - on, uio->uio_resid); diff = dep->de_FileSize - (offset - bias); if (diff <= 0) break; n = min(n, diff); error = pcbmap(dep, lbn, &bn, &cn, &blsize); if (error) break; error = bread(pmp->pm_devvp, bn, blsize, NOCRED, &bp); if (error) { brelse(bp); return (error); } n = min(n, blsize - bp->b_resid); if (n == 0) { brelse(bp); return (EIO); } /* * Convert from dos directory entries to fs-independent * directory entries. */ for (dentp = (struct direntry *)(bp->b_data + on); (char *)dentp < bp->b_data + on + n; dentp++, offset += sizeof(struct direntry)) { #if 0 printf("rd: dentp %08x prev %08x crnt %08x deName %02x attr %02x\n", dentp, prev, crnt, dentp->deName[0], dentp->deAttributes); #endif /* * If this is an unused entry, we can stop. */ if (dentp->deName[0] == SLOT_EMPTY) { brelse(bp); goto out; } /* * Skip deleted entries. */ if (dentp->deName[0] == SLOT_DELETED) { chksum = -1; mbnambuf_init(&nb); continue; } /* * Handle Win95 long directory entries */ if (dentp->deAttributes == ATTR_WIN95) { if (pmp->pm_flags & MSDOSFSMNT_SHORTNAME) continue; chksum = win2unixfn(&nb, (struct winentry *)dentp, chksum, pmp); continue; } /* * Skip volume labels */ if (dentp->deAttributes & ATTR_VOLUME) { chksum = -1; mbnambuf_init(&nb); continue; } /* * This computation of d_fileno must match * the computation of va_fileid in * msdosfs_getattr. */ if (dentp->deAttributes & ATTR_DIRECTORY) { fileno = getushort(dentp->deStartCluster); if (FAT32(pmp)) fileno |= getushort(dentp->deHighClust) << 16; /* if this is the root directory */ if (fileno == MSDOSFSROOT) if (FAT32(pmp)) fileno = (uint64_t)cntobn(pmp, pmp->pm_rootdirblk) * dirsperblk; else fileno = 1; else fileno = (uint64_t)cntobn(pmp, fileno) * dirsperblk; dirbuf.d_type = DT_DIR; } else { fileno = (uoff_t)offset / sizeof(struct direntry); dirbuf.d_type = DT_REG; } if (pmp->pm_flags & MSDOSFS_LARGEFS) { dirbuf.d_fileno = msdosfs_fileno_map(pmp->pm_mountp, fileno); } else dirbuf.d_fileno = (uint32_t)fileno; if (chksum != winChksum(dentp->deName)) { dirbuf.d_namlen = dos2unixfn(dentp->deName, (u_char *)dirbuf.d_name, dentp->deLowerCase | ((pmp->pm_flags & MSDOSFSMNT_SHORTNAME) ? 
(LCASE_BASE | LCASE_EXT) : 0), pmp); mbnambuf_init(&nb); } else mbnambuf_flush(&nb, &dirbuf); chksum = -1; dirbuf.d_reclen = GENERIC_DIRSIZ(&dirbuf); if (uio->uio_resid < dirbuf.d_reclen) { brelse(bp); goto out; } error = uiomove(&dirbuf, dirbuf.d_reclen, uio); if (error) { brelse(bp); goto out; } if (cookies) { *cookies++ = offset + sizeof(struct direntry); if (--ncookies <= 0) { brelse(bp); goto out; } } off = offset + sizeof(struct direntry); } brelse(bp); } out: /* Subtract unused cookies */ if (ap->a_ncookies) *ap->a_ncookies -= ncookies; uio->uio_offset = off; /* * Set the eofflag (NFS uses it) */ if (ap->a_eofflag) { if (dep->de_FileSize - (offset - bias) <= 0) *ap->a_eofflag = 1; else *ap->a_eofflag = 0; } return (error); } /*- * a_vp - pointer to the file's vnode * a_bn - logical block number within the file (cluster number for us) * a_bop - where to return the bufobj of the special file containing the fs * a_bnp - where to return the "physical" block number corresponding to a_bn * (relative to the special file; units are blocks of size DEV_BSIZE) * a_runp - where to return the "run past" a_bn. This is the count of logical * blocks whose physical blocks (together with a_bn's physical block) * are contiguous. * a_runb - where to return the "run before" a_bn. */ static int msdosfs_bmap(struct vop_bmap_args *ap) { struct denode *dep; struct mount *mp; struct msdosfsmount *pmp; struct vnode *vp; daddr_t runbn; u_long cn; int bnpercn, error, maxio, maxrun, run; vp = ap->a_vp; dep = VTODE(vp); pmp = dep->de_pmp; if (ap->a_bop != NULL) *ap->a_bop = &pmp->pm_devvp->v_bufobj; if (ap->a_bnp == NULL) return (0); if (ap->a_runp != NULL) *ap->a_runp = 0; if (ap->a_runb != NULL) *ap->a_runb = 0; cn = ap->a_bn; if (cn != ap->a_bn) return (EFBIG); error = pcbmap(dep, cn, ap->a_bnp, NULL, NULL); if (error != 0 || (ap->a_runp == NULL && ap->a_runb == NULL)) return (error); mp = vp->v_mount; maxio = mp->mnt_iosize_max / mp->mnt_stat.f_iosize; bnpercn = de_cn2bn(pmp, 1); if (ap->a_runp != NULL) { maxrun = ulmin(maxio - 1, pmp->pm_maxcluster - cn); for (run = 1; run <= maxrun; run++) { if (pcbmap(dep, cn + run, &runbn, NULL, NULL) != 0 || runbn != *ap->a_bnp + run * bnpercn) break; } *ap->a_runp = run - 1; } if (ap->a_runb != NULL) { maxrun = ulmin(maxio - 1, cn); for (run = 1; run < maxrun; run++) { if (pcbmap(dep, cn - run, &runbn, NULL, NULL) != 0 || runbn != *ap->a_bnp - run * bnpercn) break; } *ap->a_runb = run - 1; } return (0); } SYSCTL_NODE(_vfs, OID_AUTO, msdosfs, CTLFLAG_RW, 0, "msdos filesystem"); static int use_buf_pager = 0; SYSCTL_INT(_vfs_msdosfs, OID_AUTO, use_buf_pager, CTLFLAG_RWTUN, &use_buf_pager, 0, "Use buffer pager instead of bmap"); static daddr_t msdosfs_gbp_getblkno(struct vnode *vp, vm_ooffset_t off) { return (de_cluster(VTODE(vp)->de_pmp, off)); } static int msdosfs_gbp_getblksz(struct vnode *vp, daddr_t lbn) { return (VTODE(vp)->de_pmp->pm_bpcluster); } static int msdosfs_getpages(struct vop_getpages_args *ap) { if (use_buf_pager) return (vfs_bio_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind, ap->a_rahead, msdosfs_gbp_getblkno, msdosfs_gbp_getblksz)); return (vnode_pager_generic_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind, ap->a_rahead, NULL, NULL)); } static int msdosfs_strategy(struct vop_strategy_args *ap) { struct buf *bp = ap->a_bp; struct denode *dep = VTODE(ap->a_vp); struct bufobj *bo; int error = 0; daddr_t blkno; /* * If we don't already know the filesystem relative block number * then get it using pcbmap(). 
If pcbmap() returns the block * number as -1 then we've got a hole in the file. DOS filesystems * don't allow files with holes, so we shouldn't ever see this. */ if (bp->b_blkno == bp->b_lblkno) { error = pcbmap(dep, bp->b_lblkno, &blkno, 0, 0); bp->b_blkno = blkno; if (error) { bp->b_error = error; bp->b_ioflags |= BIO_ERROR; bufdone(bp); return (0); } if ((long)bp->b_blkno == -1) vfs_bio_clrbuf(bp); } if (bp->b_blkno == -1) { bufdone(bp); return (0); } /* * Read/write the block from/to the disk that contains the desired * file block. */ bp->b_iooffset = dbtob(bp->b_blkno); bo = dep->de_pmp->pm_bo; BO_STRATEGY(bo, bp); return (0); } static int msdosfs_print(struct vop_print_args *ap) { struct denode *dep = VTODE(ap->a_vp); printf("\tstartcluster %lu, dircluster %lu, diroffset %lu, ", dep->de_StartCluster, dep->de_dirclust, dep->de_diroffset); printf("on dev %s\n", devtoname(dep->de_pmp->pm_dev)); return (0); } static int msdosfs_pathconf(struct vop_pathconf_args *ap) { struct msdosfsmount *pmp = VTODE(ap->a_vp)->de_pmp; switch (ap->a_name) { case _PC_FILESIZEBITS: *ap->a_retval = 32; return (0); case _PC_LINK_MAX: *ap->a_retval = 1; return (0); case _PC_NAME_MAX: *ap->a_retval = pmp->pm_flags & MSDOSFSMNT_LONGNAME ? WIN_MAXLEN : 12; return (0); case _PC_CHOWN_RESTRICTED: *ap->a_retval = 1; return (0); case _PC_NO_TRUNC: *ap->a_retval = 0; return (0); default: return (vop_stdpathconf(ap)); } /* NOTREACHED */ } static int msdosfs_vptofh(struct vop_vptofh_args *ap) { struct denode *dep; struct defid *defhp; dep = VTODE(ap->a_vp); defhp = (struct defid *)ap->a_fhp; defhp->defid_len = sizeof(struct defid); defhp->defid_dirclust = dep->de_dirclust; defhp->defid_dirofs = dep->de_diroffset; /* defhp->defid_gen = dep->de_gen; */ return (0); } /* Global vfs data structures for msdosfs */ struct vop_vector msdosfs_vnodeops = { .vop_default = &default_vnodeops, .vop_access = msdosfs_access, .vop_bmap = msdosfs_bmap, .vop_getpages = msdosfs_getpages, .vop_cachedlookup = msdosfs_lookup, .vop_open = msdosfs_open, .vop_close = msdosfs_close, .vop_create = msdosfs_create, .vop_fsync = msdosfs_fsync, .vop_fdatasync = vop_stdfdatasync_buf, .vop_getattr = msdosfs_getattr, .vop_inactive = msdosfs_inactive, .vop_link = msdosfs_link, .vop_lookup = vfs_cache_lookup, .vop_mkdir = msdosfs_mkdir, .vop_mknod = msdosfs_mknod, .vop_pathconf = msdosfs_pathconf, .vop_print = msdosfs_print, .vop_read = msdosfs_read, .vop_readdir = msdosfs_readdir, .vop_reclaim = msdosfs_reclaim, .vop_remove = msdosfs_remove, .vop_rename = msdosfs_rename, .vop_rmdir = msdosfs_rmdir, .vop_setattr = msdosfs_setattr, .vop_strategy = msdosfs_strategy, .vop_symlink = msdosfs_symlink, .vop_write = msdosfs_write, .vop_vptofh = msdosfs_vptofh, }; Index: stable/11/sys/kern/kern_mib.c =================================================================== --- stable/11/sys/kern/kern_mib.c (revision 331016) +++ stable/11/sys/kern/kern_mib.c (revision 331017) @@ -1,596 +1,597 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Mike Karels at Berkeley Software Design, Inc. * * Quite extensively rewritten by Poul-Henning Kamp of the FreeBSD * project, to make these variables more userfriendly. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_sysctl.c 8.4 (Berkeley) 4/14/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_posix.h" #include "opt_config.h" #include #include #include #include #include #include #include #include #include #include +#include #include #include #include SYSCTL_ROOT_NODE(0, sysctl, CTLFLAG_RW, 0, "Sysctl internal magic"); SYSCTL_ROOT_NODE(CTL_KERN, kern, CTLFLAG_RW|CTLFLAG_CAPRD, 0, "High kernel, proc, limits &c"); SYSCTL_ROOT_NODE(CTL_VM, vm, CTLFLAG_RW, 0, "Virtual memory"); SYSCTL_ROOT_NODE(CTL_VFS, vfs, CTLFLAG_RW, 0, "File system"); SYSCTL_ROOT_NODE(CTL_NET, net, CTLFLAG_RW, 0, "Network, (see socket.h)"); SYSCTL_ROOT_NODE(CTL_DEBUG, debug, CTLFLAG_RW, 0, "Debugging"); SYSCTL_NODE(_debug, OID_AUTO, sizeof, CTLFLAG_RW, 0, "Sizeof various things"); SYSCTL_ROOT_NODE(CTL_HW, hw, CTLFLAG_RW, 0, "hardware"); SYSCTL_ROOT_NODE(CTL_MACHDEP, machdep, CTLFLAG_RW, 0, "machine dependent"); SYSCTL_ROOT_NODE(CTL_USER, user, CTLFLAG_RW, 0, "user-level"); SYSCTL_ROOT_NODE(CTL_P1003_1B, p1003_1b, CTLFLAG_RW, 0, "p1003_1b, (see p1003_1b.h)"); SYSCTL_ROOT_NODE(OID_AUTO, compat, CTLFLAG_RW, 0, "Compatibility code"); SYSCTL_ROOT_NODE(OID_AUTO, security, CTLFLAG_RW, 0, "Security"); #ifdef REGRESSION SYSCTL_ROOT_NODE(OID_AUTO, regression, CTLFLAG_RW, 0, "Regression test MIB"); #endif SYSCTL_STRING(_kern, OID_AUTO, ident, CTLFLAG_RD|CTLFLAG_MPSAFE, kern_ident, 0, "Kernel identifier"); SYSCTL_INT(_kern, KERN_OSREV, osrevision, CTLFLAG_RD|CTLFLAG_CAPRD, SYSCTL_NULL_INT_PTR, BSD, "Operating system revision"); SYSCTL_STRING(_kern, KERN_VERSION, version, CTLFLAG_RD|CTLFLAG_MPSAFE, version, 0, "Kernel version"); SYSCTL_STRING(_kern, OID_AUTO, compiler_version, CTLFLAG_RD|CTLFLAG_MPSAFE, compiler_version, 0, "Version of compiler used to compile kernel"); SYSCTL_STRING(_kern, KERN_OSTYPE, ostype, CTLFLAG_RD|CTLFLAG_MPSAFE| CTLFLAG_CAPRD, ostype, 0, "Operating system type"); SYSCTL_INT(_kern, KERN_MAXPROC, maxproc, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &maxproc, 0, "Maximum number of processes"); SYSCTL_INT(_kern, KERN_MAXPROCPERUID, maxprocperuid, CTLFLAG_RW, &maxprocperuid, 0, "Maximum processes allowed per userid"); SYSCTL_INT(_kern, OID_AUTO, maxusers, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &maxusers, 0, "Hint for kernel tuning"); 
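/*
 * Editor's note: a userland sketch, not part of this change, showing
 * how read-only MIBs like those declared above are consumed through
 * sysctlbyname(3); "kern.maxproc" is just one example name.
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int maxproc;
	size_t len = sizeof(maxproc);

	if (sysctlbyname("kern.maxproc", &maxproc, &len, NULL, 0) == -1)
		return (1);
	printf("kern.maxproc: %d\n", maxproc);
	return (0);
}
#endif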
SYSCTL_INT(_kern, KERN_ARGMAX, argmax, CTLFLAG_RD|CTLFLAG_CAPRD, SYSCTL_NULL_INT_PTR, ARG_MAX, "Maximum bytes of argument to execve(2)"); SYSCTL_INT(_kern, KERN_POSIX1, posix1version, CTLFLAG_RD|CTLFLAG_CAPRD, SYSCTL_NULL_INT_PTR, _POSIX_VERSION, "Version of POSIX attempting to comply to"); SYSCTL_INT(_kern, KERN_NGROUPS, ngroups, CTLFLAG_RDTUN | CTLFLAG_NOFETCH | CTLFLAG_CAPRD, &ngroups_max, 0, "Maximum number of supplemental groups a user can belong to"); SYSCTL_INT(_kern, KERN_JOB_CONTROL, job_control, CTLFLAG_RD|CTLFLAG_CAPRD, SYSCTL_NULL_INT_PTR, 1, "Whether job control is available"); #ifdef _POSIX_SAVED_IDS SYSCTL_INT(_kern, KERN_SAVED_IDS, saved_ids, CTLFLAG_RD|CTLFLAG_CAPRD, SYSCTL_NULL_INT_PTR, 1, "Whether saved set-group/user ID is available"); #else SYSCTL_INT(_kern, KERN_SAVED_IDS, saved_ids, CTLFLAG_RD|CTLFLAG_CAPRD, SYSCTL_NULL_INT_PTR, 0, "Whether saved set-group/user ID is available"); #endif char kernelname[MAXPATHLEN] = "/kernel"; /* XXX bloat */ SYSCTL_STRING(_kern, KERN_BOOTFILE, bootfile, CTLFLAG_RW | CTLFLAG_MPSAFE, kernelname, sizeof kernelname, "Name of kernel file booted"); SYSCTL_INT(_hw, HW_NCPU, ncpu, CTLFLAG_RD|CTLFLAG_CAPRD, &mp_ncpus, 0, "Number of active CPUs"); SYSCTL_INT(_hw, HW_BYTEORDER, byteorder, CTLFLAG_RD|CTLFLAG_CAPRD, SYSCTL_NULL_INT_PTR, BYTE_ORDER, "System byte order"); SYSCTL_INT(_hw, HW_PAGESIZE, pagesize, CTLFLAG_RD|CTLFLAG_CAPRD, SYSCTL_NULL_INT_PTR, PAGE_SIZE, "System memory page size"); static int sysctl_kern_arnd(SYSCTL_HANDLER_ARGS) { char buf[256]; size_t len; /*- * This is one of the very few legitimate uses of read_random(9). * Use of arc4random(9) is not recommended as that will ignore * an unsafe (i.e. unseeded) random(4). * * If random(4) is not seeded, then this returns 0, so the * sysctl will return a zero-length buffer. */ len = read_random(buf, MIN(req->oldlen, sizeof(buf))); return (SYSCTL_OUT(req, buf, len)); } SYSCTL_PROC(_kern, KERN_ARND, arandom, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_CAPRD, NULL, 0, sysctl_kern_arnd, "", "arc4rand"); static int sysctl_hw_physmem(SYSCTL_HANDLER_ARGS) { u_long val; val = ctob(physmem); return (sysctl_handle_long(oidp, &val, 0, req)); } SYSCTL_PROC(_hw, HW_PHYSMEM, physmem, CTLTYPE_ULONG | CTLFLAG_RD, 0, 0, sysctl_hw_physmem, "LU", ""); static int sysctl_hw_realmem(SYSCTL_HANDLER_ARGS) { u_long val; val = ctob(realmem); return (sysctl_handle_long(oidp, &val, 0, req)); } SYSCTL_PROC(_hw, HW_REALMEM, realmem, CTLTYPE_ULONG | CTLFLAG_RD, 0, 0, sysctl_hw_realmem, "LU", ""); static int sysctl_hw_usermem(SYSCTL_HANDLER_ARGS) { u_long val; val = ctob(physmem - vm_cnt.v_wire_count); return (sysctl_handle_long(oidp, &val, 0, req)); } SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_ULONG | CTLFLAG_RD, 0, 0, sysctl_hw_usermem, "LU", ""); SYSCTL_LONG(_hw, OID_AUTO, availpages, CTLFLAG_RD, &physmem, 0, ""); u_long pagesizes[MAXPAGESIZES] = { PAGE_SIZE }; static int sysctl_hw_pagesizes(SYSCTL_HANDLER_ARGS) { int error; #ifdef SCTL_MASK32 int i; uint32_t pagesizes32[MAXPAGESIZES]; if (req->flags & SCTL_MASK32) { /* * Recreate the "pagesizes" array with 32-bit elements. Truncate * any page size greater than UINT32_MAX to zero. 
*/ for (i = 0; i < MAXPAGESIZES; i++) pagesizes32[i] = (uint32_t)pagesizes[i]; error = SYSCTL_OUT(req, pagesizes32, sizeof(pagesizes32)); } else #endif error = SYSCTL_OUT(req, pagesizes, sizeof(pagesizes)); return (error); } SYSCTL_PROC(_hw, OID_AUTO, pagesizes, CTLTYPE_ULONG | CTLFLAG_RD, NULL, 0, sysctl_hw_pagesizes, "LU", "Supported page sizes"); #ifdef SCTL_MASK32 int adaptive_machine_arch = 1; SYSCTL_INT(_debug, OID_AUTO, adaptive_machine_arch, CTLFLAG_RW, &adaptive_machine_arch, 1, "Adapt reported machine architecture to the ABI of the binary"); #endif static int sysctl_hw_machine_arch(SYSCTL_HANDLER_ARGS) { int error; static const char machine_arch[] = MACHINE_ARCH; #ifdef SCTL_MASK32 static const char machine_arch32[] = MACHINE_ARCH32; if ((req->flags & SCTL_MASK32) != 0 && adaptive_machine_arch) error = SYSCTL_OUT(req, machine_arch32, sizeof(machine_arch32)); else #endif error = SYSCTL_OUT(req, machine_arch, sizeof(machine_arch)); return (error); } SYSCTL_PROC(_hw, HW_MACHINE_ARCH, machine_arch, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_hw_machine_arch, "A", "System architecture"); SYSCTL_STRING(_kern, OID_AUTO, supported_archs, CTLFLAG_RD | CTLFLAG_MPSAFE, #ifdef COMPAT_FREEBSD32 MACHINE_ARCH " " MACHINE_ARCH32, 0, "Supported architectures for binaries"); #else MACHINE_ARCH, 0, "Supported architectures for binaries"); #endif static int sysctl_hostname(SYSCTL_HANDLER_ARGS) { struct prison *pr, *cpr; size_t pr_offset; char tmpname[MAXHOSTNAMELEN]; int descend, error, len; /* * This function can set: hostname domainname hostuuid. * Keep that in mind when comments say "hostname". */ pr_offset = (size_t)arg1; len = arg2; KASSERT(len <= sizeof(tmpname), ("length %d too long for %s", len, __func__)); pr = req->td->td_ucred->cr_prison; if (!(pr->pr_allow & PR_ALLOW_SET_HOSTNAME) && req->newptr) return (EPERM); /* * Make a local copy of hostname to get/set so we don't have to hold * the jail mutex during the sysctl copyin/copyout activities. */ mtx_lock(&pr->pr_mtx); bcopy((char *)pr + pr_offset, tmpname, len); mtx_unlock(&pr->pr_mtx); error = sysctl_handle_string(oidp, tmpname, len, req); if (req->newptr != NULL && error == 0) { /* * Copy the locally set hostname to all jails that share * this host info. 
*/ sx_slock(&allprison_lock); while (!(pr->pr_flags & PR_HOST)) pr = pr->pr_parent; mtx_lock(&pr->pr_mtx); bcopy(tmpname, (char *)pr + pr_offset, len); FOREACH_PRISON_DESCENDANT_LOCKED(pr, cpr, descend) if (cpr->pr_flags & PR_HOST) descend = 0; else bcopy(tmpname, (char *)cpr + pr_offset, len); mtx_unlock(&pr->pr_mtx); sx_sunlock(&allprison_lock); } return (error); } SYSCTL_PROC(_kern, KERN_HOSTNAME, hostname, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE, (void *)(offsetof(struct prison, pr_hostname)), MAXHOSTNAMELEN, sysctl_hostname, "A", "Hostname"); SYSCTL_PROC(_kern, KERN_NISDOMAINNAME, domainname, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE, (void *)(offsetof(struct prison, pr_domainname)), MAXHOSTNAMELEN, sysctl_hostname, "A", "Name of the current YP/NIS domain"); SYSCTL_PROC(_kern, KERN_HOSTUUID, hostuuid, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE, (void *)(offsetof(struct prison, pr_hostuuid)), HOSTUUIDLEN, sysctl_hostname, "A", "Host UUID"); static int regression_securelevel_nonmonotonic = 0; #ifdef REGRESSION SYSCTL_INT(_regression, OID_AUTO, securelevel_nonmonotonic, CTLFLAG_RW, ®ression_securelevel_nonmonotonic, 0, "securelevel may be lowered"); #endif static int sysctl_kern_securelvl(SYSCTL_HANDLER_ARGS) { struct prison *pr, *cpr; int descend, error, level; pr = req->td->td_ucred->cr_prison; /* * Reading the securelevel is easy, since the current jail's level * is known to be at least as secure as any higher levels. Perform * a lockless read since the securelevel is an integer. */ level = pr->pr_securelevel; error = sysctl_handle_int(oidp, &level, 0, req); if (error || !req->newptr) return (error); /* Permit update only if the new securelevel exceeds the old. */ sx_slock(&allprison_lock); mtx_lock(&pr->pr_mtx); if (!regression_securelevel_nonmonotonic && level < pr->pr_securelevel) { mtx_unlock(&pr->pr_mtx); sx_sunlock(&allprison_lock); return (EPERM); } pr->pr_securelevel = level; /* * Set all child jails to be at least this level, but do not lower * them (even if regression_securelevel_nonmonotonic). */ FOREACH_PRISON_DESCENDANT_LOCKED(pr, cpr, descend) { if (cpr->pr_securelevel < level) cpr->pr_securelevel = level; } mtx_unlock(&pr->pr_mtx); sx_sunlock(&allprison_lock); return (error); } SYSCTL_PROC(_kern, KERN_SECURELVL, securelevel, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, sysctl_kern_securelvl, "I", "Current secure level"); #ifdef INCLUDE_CONFIG_FILE /* Actual kernel configuration options. */ extern char kernconfstring[]; SYSCTL_STRING(_kern, OID_AUTO, conftxt, CTLFLAG_RD | CTLFLAG_MPSAFE, kernconfstring, 0, "Kernel configuration file"); #endif static int sysctl_hostid(SYSCTL_HANDLER_ARGS) { struct prison *pr, *cpr; u_long tmpid; int descend, error; /* * Like sysctl_hostname, except it operates on a u_long * instead of a string, and is used only for hostid. 
*/ pr = req->td->td_ucred->cr_prison; if (!(pr->pr_allow & PR_ALLOW_SET_HOSTNAME) && req->newptr) return (EPERM); tmpid = pr->pr_hostid; error = sysctl_handle_long(oidp, &tmpid, 0, req); if (req->newptr != NULL && error == 0) { sx_slock(&allprison_lock); while (!(pr->pr_flags & PR_HOST)) pr = pr->pr_parent; mtx_lock(&pr->pr_mtx); pr->pr_hostid = tmpid; FOREACH_PRISON_DESCENDANT_LOCKED(pr, cpr, descend) if (cpr->pr_flags & PR_HOST) descend = 0; else cpr->pr_hostid = tmpid; mtx_unlock(&pr->pr_mtx); sx_sunlock(&allprison_lock); } return (error); } SYSCTL_PROC(_kern, KERN_HOSTID, hostid, CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE | CTLFLAG_CAPRD, NULL, 0, sysctl_hostid, "LU", "Host ID"); /* * The osrelease string is copied from the global (osrelease in vers.c) into * prison0 by a sysinit and is inherited by child jails if not changed at jail * creation, so we always return the copy from the current prison data. */ static int sysctl_osrelease(SYSCTL_HANDLER_ARGS) { struct prison *pr; pr = req->td->td_ucred->cr_prison; return (SYSCTL_OUT(req, pr->pr_osrelease, strlen(pr->pr_osrelease) + 1)); } SYSCTL_PROC(_kern, KERN_OSRELEASE, osrelease, CTLTYPE_STRING | CTLFLAG_CAPRD | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_osrelease, "A", "Operating system release"); /* * The osreldate number is copied from the global (osreldate in vers.c) into * prison0 by a sysinit and is inherited by child jails if not changed at jail * creation, so we always return the value from the current prison data. */ static int sysctl_osreldate(SYSCTL_HANDLER_ARGS) { struct prison *pr; pr = req->td->td_ucred->cr_prison; return (SYSCTL_OUT(req, &pr->pr_osreldate, sizeof(pr->pr_osreldate))); } /* * NOTICE: The *userland* release date is available in * /usr/include/osreldate.h */ SYSCTL_PROC(_kern, KERN_OSRELDATE, osreldate, CTLTYPE_INT | CTLFLAG_CAPRD | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_osreldate, "I", "Kernel release date"); SYSCTL_NODE(_kern, OID_AUTO, features, CTLFLAG_RD, 0, "Kernel Features"); #ifdef COMPAT_FREEBSD4 FEATURE(compat_freebsd4, "Compatible with FreeBSD 4"); #endif #ifdef COMPAT_FREEBSD5 FEATURE(compat_freebsd5, "Compatible with FreeBSD 5"); #endif #ifdef COMPAT_FREEBSD6 FEATURE(compat_freebsd6, "Compatible with FreeBSD 6"); #endif #ifdef COMPAT_FREEBSD7 FEATURE(compat_freebsd7, "Compatible with FreeBSD 7"); #endif /* * This is really cheating. These actually live in the libc, something * which I'm not quite sure is a good idea anyway, but in order for * getnext and friends to actually work, we define dummies here. * * XXXRW: These probably should be CTLFLAG_CAPRD. 
SYSCTL_STRING(_user, USER_CS_PATH, cs_path, CTLFLAG_RD, "", 0, "PATH that finds all the standard utilities"); SYSCTL_INT(_user, USER_BC_BASE_MAX, bc_base_max, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 0, "Max ibase/obase values in bc(1)"); SYSCTL_INT(_user, USER_BC_DIM_MAX, bc_dim_max, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 0, "Max array size in bc(1)"); SYSCTL_INT(_user, USER_BC_SCALE_MAX, bc_scale_max, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 0, "Max scale value in bc(1)"); SYSCTL_INT(_user, USER_BC_STRING_MAX, bc_string_max, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 0, "Max string length in bc(1)"); SYSCTL_INT(_user, USER_COLL_WEIGHTS_MAX, coll_weights_max, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 0, "Maximum number of weights assigned to an LC_COLLATE locale entry"); SYSCTL_INT(_user, USER_EXPR_NEST_MAX, expr_nest_max, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 0, ""); SYSCTL_INT(_user, USER_LINE_MAX, line_max, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 0, "Max length (bytes) of a text-processing utility's input line"); SYSCTL_INT(_user, USER_RE_DUP_MAX, re_dup_max, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 0, "Maximum number of repeats of a regexp permitted"); SYSCTL_INT(_user, USER_POSIX2_VERSION, posix2_version, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 0, "The version of POSIX 1003.2 with which the system attempts to comply"); SYSCTL_INT(_user, USER_POSIX2_C_BIND, posix2_c_bind, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 0, "Whether C development supports the C bindings option"); SYSCTL_INT(_user, USER_POSIX2_C_DEV, posix2_c_dev, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 0, "Whether system supports the C development utilities option"); SYSCTL_INT(_user, USER_POSIX2_CHAR_TERM, posix2_char_term, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 0, ""); SYSCTL_INT(_user, USER_POSIX2_FORT_DEV, posix2_fort_dev, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 0, "Whether system supports FORTRAN development utilities"); SYSCTL_INT(_user, USER_POSIX2_FORT_RUN, posix2_fort_run, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 0, "Whether system supports FORTRAN runtime utilities"); SYSCTL_INT(_user, USER_POSIX2_LOCALEDEF, posix2_localedef, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 0, "Whether system supports creation of locales"); SYSCTL_INT(_user, USER_POSIX2_SW_DEV, posix2_sw_dev, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 0, "Whether system supports software development utilities"); SYSCTL_INT(_user, USER_POSIX2_UPE, posix2_upe, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 0, "Whether system supports the user portability utilities"); SYSCTL_INT(_user, USER_STREAM_MAX, stream_max, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 0, "Min Maximum number of streams a process may have open at one time"); SYSCTL_INT(_user, USER_TZNAME_MAX, tzname_max, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 0, "Min Maximum number of types supported for timezone names"); #include SYSCTL_INT(_debug_sizeof, OID_AUTO, vnode, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, sizeof(struct vnode), "sizeof(struct vnode)"); SYSCTL_INT(_debug_sizeof, OID_AUTO, proc, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, sizeof(struct proc), "sizeof(struct proc)"); static int sysctl_kern_pid_max(SYSCTL_HANDLER_ARGS) { int error, pm; pm = pid_max; error = sysctl_handle_int(oidp, &pm, 0, req); if (error || !req->newptr) return (error); sx_xlock(&proctree_lock); sx_xlock(&allproc_lock); /* * Only permit values less than PID_MAX. * As a safety measure, do not allow pid_max to be limited too much.
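*/

/*
 * Editor's note: an illustrative userland counterpart, not part of
 * this change: writing the tunable through sysctlbyname(3). The new
 * value is validated by the range check in sysctl_kern_pid_max()
 * before pid_max is updated.
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>

static int
set_pid_max(int pm)
{
	/* The handler rejects values below 300 or above PID_MAX. */
	return (sysctlbyname("kern.pid_max", NULL, NULL, &pm, sizeof(pm)));
}
#endif

/*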
*/ if (pm < 300 || pm > PID_MAX) error = EINVAL; else pid_max = pm; sx_xunlock(&allproc_lock); sx_xunlock(&proctree_lock); return (error); } SYSCTL_PROC(_kern, OID_AUTO, pid_max, CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, 0, 0, sysctl_kern_pid_max, "I", "Maximum allowed pid"); #include #include SYSCTL_INT(_debug_sizeof, OID_AUTO, bio, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, sizeof(struct bio), "sizeof(struct bio)"); SYSCTL_INT(_debug_sizeof, OID_AUTO, buf, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, sizeof(struct buf), "sizeof(struct buf)"); #include SYSCTL_INT(_debug_sizeof, OID_AUTO, kinfo_proc, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, sizeof(struct kinfo_proc), "sizeof(struct kinfo_proc)"); /* Used by kernel debuggers. */ const int pcb_size = sizeof(struct pcb); SYSCTL_INT(_debug_sizeof, OID_AUTO, pcb, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, sizeof(struct pcb), "sizeof(struct pcb)"); /* XXX compatibility, remove for 6.0 */ #include #include SYSCTL_INT(_kern, OID_AUTO, fallback_elf_brand, CTLFLAG_RW, &__elfN(fallback_brand), sizeof(__elfN(fallback_brand)), "compatibility for kern.fallback_elf_brand"); Index: stable/11/sys/kern/kern_thread.c =================================================================== --- stable/11/sys/kern/kern_thread.c (revision 331016) +++ stable/11/sys/kern/kern_thread.c (revision 331017) @@ -1,1259 +1,1260 @@ /*- * Copyright (C) 2001 Julian Elischer . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice(s), this list of conditions and the following disclaimer as * the first lines of this file unmodified other than the possible * addition of one or more copyright notices. * 2. Redistributions in binary form must reproduce the above copyright * notice(s), this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. */ #include "opt_witness.h" #include "opt_hwpmc_hooks.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #ifdef HWPMC_HOOKS #include #endif #include #include #include #include #include #include /* * Asserts below verify the stability of struct thread and struct proc * layout, as exposed by KBI to modules. On head, the KBI is allowed * to drift, change to the structures must be accompanied by the * assert update. * * On the stable branches after KBI freeze, conditions must not be * violated. Typically new fields are moved to the end of the * structures. 
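*/

/*
 * Editor's note: a self-contained illustration, with a hypothetical
 * struct, of the technique the asserts below apply to struct thread
 * and struct proc: pinning a field offset so an accidental
 * KBI-breaking insertion fails the build. The expected offset of 8
 * assumes LP64 alignment rules.
 */
#if 0
#include <stddef.h>

struct kbi_demo {
	int	a;	/* inserting a new field here moves 'b'... */
	long	b;
};
_Static_assert(offsetof(struct kbi_demo, b) == 8,
    "struct kbi_demo KBI b");	/* ...and trips this assert */
#endif

/*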
*/ #ifdef __amd64__ _Static_assert(offsetof(struct thread, td_flags) == 0xe4, "struct thread KBI td_flags"); _Static_assert(offsetof(struct thread, td_pflags) == 0xec, "struct thread KBI td_pflags"); _Static_assert(offsetof(struct thread, td_frame) == 0x418, "struct thread KBI td_frame"); _Static_assert(offsetof(struct thread, td_emuldata) == 0x4c0, "struct thread KBI td_emuldata"); _Static_assert(offsetof(struct proc, p_flag) == 0xb0, "struct proc KBI p_flag"); _Static_assert(offsetof(struct proc, p_pid) == 0xbc, "struct proc KBI p_pid"); _Static_assert(offsetof(struct proc, p_filemon) == 0x3c0, "struct proc KBI p_filemon"); _Static_assert(offsetof(struct proc, p_comm) == 0x3d0, "struct proc KBI p_comm"); _Static_assert(offsetof(struct proc, p_emuldata) == 0x4a0, "struct proc KBI p_emuldata"); #endif #ifdef __i386__ _Static_assert(offsetof(struct thread, td_flags) == 0x8c, "struct thread KBI td_flags"); _Static_assert(offsetof(struct thread, td_pflags) == 0x94, "struct thread KBI td_pflags"); _Static_assert(offsetof(struct thread, td_frame) == 0x2c0, "struct thread KBI td_frame"); _Static_assert(offsetof(struct thread, td_emuldata) == 0x30c, "struct thread KBI td_emuldata"); _Static_assert(offsetof(struct proc, p_flag) == 0x68, "struct proc KBI p_flag"); _Static_assert(offsetof(struct proc, p_pid) == 0x74, "struct proc KBI p_pid"); _Static_assert(offsetof(struct proc, p_filemon) == 0x268, "struct proc KBI p_filemon"); _Static_assert(offsetof(struct proc, p_comm) == 0x274, "struct proc KBI p_comm"); _Static_assert(offsetof(struct proc, p_emuldata) == 0x2f4, "struct proc KBI p_emuldata"); #endif SDT_PROVIDER_DECLARE(proc); SDT_PROBE_DEFINE(proc, , , lwp__exit); /* * thread related storage. */ static uma_zone_t thread_zone; TAILQ_HEAD(, thread) zombie_threads = TAILQ_HEAD_INITIALIZER(zombie_threads); static struct mtx zombie_lock; MTX_SYSINIT(zombie_lock, &zombie_lock, "zombie lock", MTX_SPIN); static void thread_zombie(struct thread *); static int thread_unsuspend_one(struct thread *td, struct proc *p, bool boundary); #define TID_BUFFER_SIZE 1024 struct mtx tid_lock; static struct unrhdr *tid_unrhdr; static lwpid_t tid_buffer[TID_BUFFER_SIZE]; static int tid_head, tid_tail; static MALLOC_DEFINE(M_TIDHASH, "tidhash", "thread hash"); struct tidhashhead *tidhashtbl; u_long tidhash; struct rwlock tidhash_lock; static lwpid_t tid_alloc(void) { lwpid_t tid; tid = alloc_unr(tid_unrhdr); if (tid != -1) return (tid); mtx_lock(&tid_lock); if (tid_head == tid_tail) { mtx_unlock(&tid_lock); return (-1); } tid = tid_buffer[tid_head]; tid_head = (tid_head + 1) % TID_BUFFER_SIZE; mtx_unlock(&tid_lock); return (tid); } static void tid_free(lwpid_t tid) { lwpid_t tmp_tid = -1; mtx_lock(&tid_lock); if ((tid_tail + 1) % TID_BUFFER_SIZE == tid_head) { tmp_tid = tid_buffer[tid_head]; tid_head = (tid_head + 1) % TID_BUFFER_SIZE; } tid_buffer[tid_tail] = tid; tid_tail = (tid_tail + 1) % TID_BUFFER_SIZE; mtx_unlock(&tid_lock); if (tmp_tid != -1) free_unr(tid_unrhdr, tmp_tid); } /* * Prepare a thread for use. */ static int thread_ctor(void *mem, int size, void *arg, int flags) { struct thread *td; td = (struct thread *)mem; td->td_state = TDS_INACTIVE; td->td_oncpu = NOCPU; td->td_tid = tid_alloc(); /* * Note that td_critnest begins life as 1 because the thread is not * running and is thereby implicitly waiting to be on the receiving * end of a context switch. 
*/ td->td_critnest = 1; td->td_lend_user_pri = PRI_MAX; EVENTHANDLER_INVOKE(thread_ctor, td); #ifdef AUDIT audit_thread_alloc(td); #endif umtx_thread_alloc(td); return (0); } /* * Reclaim a thread after use. */ static void thread_dtor(void *mem, int size, void *arg) { struct thread *td; td = (struct thread *)mem; #ifdef INVARIANTS /* Verify that this thread is in a safe state to free. */ switch (td->td_state) { case TDS_INHIBITED: case TDS_RUNNING: case TDS_CAN_RUN: case TDS_RUNQ: /* * We must never unlink a thread that is in one of * these states, because it is currently active. */ panic("bad state for thread unlinking"); /* NOTREACHED */ case TDS_INACTIVE: break; default: panic("bad thread state"); /* NOTREACHED */ } #endif #ifdef AUDIT audit_thread_free(td); #endif /* Free all OSD associated to this thread. */ osd_thread_exit(td); td_softdep_cleanup(td); MPASS(td->td_su == NULL); EVENTHANDLER_INVOKE(thread_dtor, td); tid_free(td->td_tid); } /* * Initialize type-stable parts of a thread (when newly created). */ static int thread_init(void *mem, int size, int flags) { struct thread *td; td = (struct thread *)mem; td->td_sleepqueue = sleepq_alloc(); td->td_turnstile = turnstile_alloc(); td->td_rlqe = NULL; EVENTHANDLER_INVOKE(thread_init, td); umtx_thread_init(td); td->td_kstack = 0; td->td_sel = NULL; return (0); } /* * Tear down type-stable parts of a thread (just before being discarded). */ static void thread_fini(void *mem, int size) { struct thread *td; td = (struct thread *)mem; EVENTHANDLER_INVOKE(thread_fini, td); rlqentry_free(td->td_rlqe); turnstile_free(td->td_turnstile); sleepq_free(td->td_sleepqueue); umtx_thread_fini(td); seltdfini(td); } /* * For a newly created process, * link up all the structures and its initial threads etc. * called from: * {arch}/{arch}/machdep.c {arch}_init(), init386() etc. * proc_dtor() (should go away) * proc_init() */ void proc_linkup0(struct proc *p, struct thread *td) { TAILQ_INIT(&p->p_threads); /* all threads in proc */ proc_linkup(p, td); } void proc_linkup(struct proc *p, struct thread *td) { sigqueue_init(&p->p_sigqueue, p); p->p_ksi = ksiginfo_alloc(1); if (p->p_ksi != NULL) { /* XXX p_ksi may be null if ksiginfo zone is not ready */ p->p_ksi->ksi_flags = KSI_EXT | KSI_INS; } LIST_INIT(&p->p_mqnotifier); p->p_numthreads = 0; thread_link(td, p); } /* * Initialize global thread allocation resources. */ void threadinit(void) { mtx_init(&tid_lock, "TID lock", NULL, MTX_DEF); /* * pid_max cannot be greater than PID_MAX. * leave one number for thread0. */ tid_unrhdr = new_unrhdr(PID_MAX + 2, INT_MAX, &tid_lock); thread_zone = uma_zcreate("THREAD", sched_sizeof_thread(), thread_ctor, thread_dtor, thread_init, thread_fini, 32 - 1, UMA_ZONE_NOFREE); tidhashtbl = hashinit(maxproc / 2, M_TIDHASH, &tidhash); rw_init(&tidhash_lock, "tidhash"); } /* * Place an unused thread on the zombie list. * Use the slpq as that must be unused by now. */ void thread_zombie(struct thread *td) { mtx_lock_spin(&zombie_lock); TAILQ_INSERT_HEAD(&zombie_threads, td, td_slpq); mtx_unlock_spin(&zombie_lock); } /* * Release a thread that has exited after cpu_throw(). */ void thread_stash(struct thread *td) { atomic_subtract_rel_int(&td->td_proc->p_exitthreads, 1); thread_zombie(td); } /* * Reap zombie resources. */ void thread_reap(void) { struct thread *td_first, *td_next; /* * Don't even bother to lock if none at this instant, * we really don't care about the next instant. 
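*/

/*
 * Editor's note: a generic sketch (plain pthreads, not a kernel API)
 * of the pattern thread_reap() uses below: an intentionally racy
 * unlocked emptiness check, then detaching the whole list under the
 * lock so the elements can be disposed of without holding it.
 */
#if 0
#include <pthread.h>
#include <stddef.h>

struct node { struct node *next; };
static struct node *reap_head;
static pthread_mutex_t reap_lock = PTHREAD_MUTEX_INITIALIZER;

static void
reap(void (*dispose)(struct node *))
{
	struct node *n, *next;

	if (reap_head == NULL)	/* racy peek; a missed update waits */
		return;
	pthread_mutex_lock(&reap_lock);
	n = reap_head;
	reap_head = NULL;	/* detach the entire list */
	pthread_mutex_unlock(&reap_lock);
	for (; n != NULL; n = next) {
		next = n->next;
		dispose(n);	/* runs outside the lock */
	}
}
#endif

/*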
*/ if (!TAILQ_EMPTY(&zombie_threads)) { mtx_lock_spin(&zombie_lock); td_first = TAILQ_FIRST(&zombie_threads); if (td_first) TAILQ_INIT(&zombie_threads); mtx_unlock_spin(&zombie_lock); while (td_first) { td_next = TAILQ_NEXT(td_first, td_slpq); thread_cow_free(td_first); thread_free(td_first); td_first = td_next; } } } /* * Allocate a thread. */ struct thread * thread_alloc(int pages) { struct thread *td; thread_reap(); /* check if any zombies to get */ td = (struct thread *)uma_zalloc(thread_zone, M_WAITOK); KASSERT(td->td_kstack == 0, ("thread_alloc got thread with kstack")); if (!vm_thread_new(td, pages)) { uma_zfree(thread_zone, td); return (NULL); } cpu_thread_alloc(td); vm_domain_policy_init(&td->td_vm_dom_policy); return (td); } int thread_alloc_stack(struct thread *td, int pages) { KASSERT(td->td_kstack == 0, ("thread_alloc_stack called on a thread with kstack")); if (!vm_thread_new(td, pages)) return (0); cpu_thread_alloc(td); return (1); } /* * Deallocate a thread. */ void thread_free(struct thread *td) { lock_profile_thread_exit(td); if (td->td_cpuset) cpuset_rel(td->td_cpuset); td->td_cpuset = NULL; cpu_thread_free(td); if (td->td_kstack != 0) vm_thread_dispose(td); vm_domain_policy_cleanup(&td->td_vm_dom_policy); callout_drain(&td->td_slpcallout); uma_zfree(thread_zone, td); } void thread_cow_get_proc(struct thread *newtd, struct proc *p) { PROC_LOCK_ASSERT(p, MA_OWNED); newtd->td_ucred = crhold(p->p_ucred); newtd->td_limit = lim_hold(p->p_limit); newtd->td_cowgen = p->p_cowgen; } void thread_cow_get(struct thread *newtd, struct thread *td) { newtd->td_ucred = crhold(td->td_ucred); newtd->td_limit = lim_hold(td->td_limit); newtd->td_cowgen = td->td_cowgen; } void thread_cow_free(struct thread *td) { if (td->td_ucred != NULL) crfree(td->td_ucred); if (td->td_limit != NULL) lim_free(td->td_limit); } void thread_cow_update(struct thread *td) { struct proc *p; struct ucred *oldcred; struct plimit *oldlimit; p = td->td_proc; oldcred = NULL; oldlimit = NULL; PROC_LOCK(p); if (td->td_ucred != p->p_ucred) { oldcred = td->td_ucred; td->td_ucred = crhold(p->p_ucred); } if (td->td_limit != p->p_limit) { oldlimit = td->td_limit; td->td_limit = lim_hold(p->p_limit); } td->td_cowgen = p->p_cowgen; PROC_UNLOCK(p); if (oldcred != NULL) crfree(oldcred); if (oldlimit != NULL) lim_free(oldlimit); } /* * Discard the current thread and exit from its context. * Always called with scheduler locked. * * Because we can't free a thread while we're operating under its context, * push the current thread into our CPU's deadthread holder. This means * we needn't worry about someone else grabbing our context before we * do a cpu_throw(). */ void thread_exit(void) { uint64_t runtime, new_switchtime; struct thread *td; struct thread *td2; struct proc *p; int wakeup_swapper; td = curthread; p = td->td_proc; PROC_SLOCK_ASSERT(p, MA_OWNED); mtx_assert(&Giant, MA_NOTOWNED); PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(p != NULL, ("thread exiting without a process")); CTR3(KTR_PROC, "thread_exit: thread %p (pid %ld, %s)", td, (long)p->p_pid, td->td_name); SDT_PROBE0(proc, , , lwp__exit); KASSERT(TAILQ_EMPTY(&td->td_sigqueue.sq_list), ("signal pending")); #ifdef AUDIT AUDIT_SYSCALL_EXIT(0, td); #endif /* * drop FPU & debug register state storage, or any other * architecture specific resources that * would not be on a new untouched process. */ cpu_thread_exit(td); /* * The last thread is left attached to the process * So that the whole bundle gets recycled. Skip * all this stuff if we never had threads. 
* EXIT clears all sign of other threads when * it goes to single threading, so the last thread always * takes the short path. */ if (p->p_flag & P_HADTHREADS) { if (p->p_numthreads > 1) { atomic_add_int(&td->td_proc->p_exitthreads, 1); thread_unlink(td); td2 = FIRST_THREAD_IN_PROC(p); sched_exit_thread(td2, td); /* * The test below is NOT true if we are the * sole exiting thread. P_STOPPED_SINGLE is unset * in exit1() after it is the only survivor. */ if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { if (p->p_numthreads == p->p_suspcount) { thread_lock(p->p_singlethread); wakeup_swapper = thread_unsuspend_one( p->p_singlethread, p, false); thread_unlock(p->p_singlethread); if (wakeup_swapper) kick_proc0(); } } PCPU_SET(deadthread, td); } else { /* * The last thread is exiting.. but not through exit() */ panic ("thread_exit: Last thread exiting on its own"); } } #ifdef HWPMC_HOOKS /* * If this thread is part of a process that is being tracked by hwpmc(4), * inform the module of the thread's impending exit. */ if (PMC_PROC_IS_USING_PMCS(td->td_proc)) PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT); #endif PROC_UNLOCK(p); PROC_STATLOCK(p); thread_lock(td); PROC_SUNLOCK(p); /* Do the same timestamp bookkeeping that mi_switch() would do. */ new_switchtime = cpu_ticks(); runtime = new_switchtime - PCPU_GET(switchtime); td->td_runtime += runtime; td->td_incruntime += runtime; PCPU_SET(switchtime, new_switchtime); PCPU_SET(switchticks, ticks); PCPU_INC(cnt.v_swtch); /* Save our resource usage in our process. */ td->td_ru.ru_nvcsw++; ruxagg(p, td); rucollect(&p->p_ru, &td->td_ru); PROC_STATUNLOCK(p); td->td_state = TDS_INACTIVE; #ifdef WITNESS witness_thread_exit(td); #endif CTR1(KTR_PROC, "thread_exit: cpu_throw() thread %p", td); sched_throw(td); panic("I'm a teapot!"); /* NOTREACHED */ } /* * Do any thread specific cleanups that may be needed in wait() * called with Giant, proc and schedlock not held. */ void thread_wait(struct proc *p) { struct thread *td; mtx_assert(&Giant, MA_NOTOWNED); KASSERT(p->p_numthreads == 1, ("multiple threads in thread_wait()")); KASSERT(p->p_exitthreads == 0, ("p_exitthreads leaking")); td = FIRST_THREAD_IN_PROC(p); /* Lock the last thread so we spin until it exits cpu_throw(). */ thread_lock(td); thread_unlock(td); lock_profile_thread_exit(td); cpuset_rel(td->td_cpuset); td->td_cpuset = NULL; cpu_thread_clean(td); thread_cow_free(td); callout_drain(&td->td_slpcallout); thread_reap(); /* check for zombie threads etc. */ } /* * Link a thread to a process. * set up anything that needs to be initialized for it to * be used by the process. */ void thread_link(struct thread *td, struct proc *p) { /* * XXX This can't be enabled because it's called for proc0 before * its lock has been created. * PROC_LOCK_ASSERT(p, MA_OWNED); */ td->td_state = TDS_INACTIVE; td->td_proc = p; td->td_flags = TDF_INMEM; LIST_INIT(&td->td_contested); LIST_INIT(&td->td_lprof[0]); LIST_INIT(&td->td_lprof[1]); sigqueue_init(&td->td_sigqueue, p); callout_init(&td->td_slpcallout, 1); TAILQ_INSERT_TAIL(&p->p_threads, td, td_plist); p->p_numthreads++; } /* * Called from: * thread_exit() */ void thread_unlink(struct thread *td) { struct proc *p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); TAILQ_REMOVE(&p->p_threads, td, td_plist); p->p_numthreads--; /* could clear a few other things here */ /* Must NOT clear links to proc! 
*/ } static int calc_remaining(struct proc *p, int mode) { int remaining; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); if (mode == SINGLE_EXIT) remaining = p->p_numthreads; else if (mode == SINGLE_BOUNDARY) remaining = p->p_numthreads - p->p_boundary_count; else if (mode == SINGLE_NO_EXIT || mode == SINGLE_ALLPROC) remaining = p->p_numthreads - p->p_suspcount; else panic("calc_remaining: wrong mode %d", mode); return (remaining); } static int remain_for_mode(int mode) { return (mode == SINGLE_ALLPROC ? 0 : 1); } static int weed_inhib(int mode, struct thread *td2, struct proc *p) { int wakeup_swapper; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); THREAD_LOCK_ASSERT(td2, MA_OWNED); wakeup_swapper = 0; switch (mode) { case SINGLE_EXIT: if (TD_IS_SUSPENDED(td2)) wakeup_swapper |= thread_unsuspend_one(td2, p, true); if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR) != 0) wakeup_swapper |= sleepq_abort(td2, EINTR); break; case SINGLE_BOUNDARY: case SINGLE_NO_EXIT: if (TD_IS_SUSPENDED(td2) && (td2->td_flags & TDF_BOUNDARY) == 0) wakeup_swapper |= thread_unsuspend_one(td2, p, false); if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR) != 0) wakeup_swapper |= sleepq_abort(td2, ERESTART); break; case SINGLE_ALLPROC: /* * ALLPROC suspend tries to avoid spurious EINTR for * threads sleeping interruptibly, by suspending the * thread directly, similarly to sig_suspend_threads(). * Since such sleep is not performed at the user * boundary, TDF_BOUNDARY flag is not set, and TDF_ALLPROCSUSP * is used to avoid immediate un-suspend. */ if (TD_IS_SUSPENDED(td2) && (td2->td_flags & (TDF_BOUNDARY | TDF_ALLPROCSUSP)) == 0) wakeup_swapper |= thread_unsuspend_one(td2, p, false); if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR) != 0) { if ((td2->td_flags & TDF_SBDRY) == 0) { thread_suspend_one(td2); td2->td_flags |= TDF_ALLPROCSUSP; } else { wakeup_swapper |= sleepq_abort(td2, ERESTART); } } break; } return (wakeup_swapper); } /* * Enforce single-threading. * * Returns 1 if the caller must abort (another thread is waiting to * exit the process or similar). Process is locked! * Returns 0 when you are successfully the only thread running. * A process has successfully single threaded in the suspend mode when * there are no threads in user mode. Threads in the kernel must be * allowed to continue until they get to the user boundary. They may even * copy out their return values and data before suspending. They may however be * accelerated in reaching the user boundary as we will wake up * any sleeping threads that are interruptible (PCATCH). */ int thread_single(struct proc *p, int mode) { struct thread *td; struct thread *td2; int remaining, wakeup_swapper; td = curthread; KASSERT(mode == SINGLE_EXIT || mode == SINGLE_BOUNDARY || mode == SINGLE_ALLPROC || mode == SINGLE_NO_EXIT, ("invalid mode %d", mode)); /* * If allowing non-ALLPROC singlethreading for non-curproc * callers, calc_remaining() and remain_for_mode() should be * adjusted to also account for td->td_proc != p. For now * this is not implemented because it is not used. */ KASSERT((mode == SINGLE_ALLPROC && td->td_proc != p) || (mode != SINGLE_ALLPROC && td->td_proc == p), ("mode %d proc %p curproc %p", mode, p, td->td_proc)); mtx_assert(&Giant, MA_NOTOWNED); PROC_LOCK_ASSERT(p, MA_OWNED); if ((p->p_flag & P_HADTHREADS) == 0 && mode != SINGLE_ALLPROC) return (0); /* Is someone already single threading?
*/ if (p->p_singlethread != NULL && p->p_singlethread != td) return (1); if (mode == SINGLE_EXIT) { p->p_flag |= P_SINGLE_EXIT; p->p_flag &= ~P_SINGLE_BOUNDARY; } else { p->p_flag &= ~P_SINGLE_EXIT; if (mode == SINGLE_BOUNDARY) p->p_flag |= P_SINGLE_BOUNDARY; else p->p_flag &= ~P_SINGLE_BOUNDARY; } if (mode == SINGLE_ALLPROC) p->p_flag |= P_TOTAL_STOP; p->p_flag |= P_STOPPED_SINGLE; PROC_SLOCK(p); p->p_singlethread = td; remaining = calc_remaining(p, mode); while (remaining != remain_for_mode(mode)) { if (P_SHOULDSTOP(p) != P_STOPPED_SINGLE) goto stopme; wakeup_swapper = 0; FOREACH_THREAD_IN_PROC(p, td2) { if (td2 == td) continue; thread_lock(td2); td2->td_flags |= TDF_ASTPENDING | TDF_NEEDSUSPCHK; if (TD_IS_INHIBITED(td2)) { wakeup_swapper |= weed_inhib(mode, td2, p); #ifdef SMP } else if (TD_IS_RUNNING(td2) && td != td2) { forward_signal(td2); #endif } thread_unlock(td2); } if (wakeup_swapper) kick_proc0(); remaining = calc_remaining(p, mode); /* * Maybe we suspended some threads.. was it enough? */ if (remaining == remain_for_mode(mode)) break; stopme: /* * Wake us up when everyone else has suspended. * In the mean time we suspend as well. */ thread_suspend_switch(td, p); remaining = calc_remaining(p, mode); } if (mode == SINGLE_EXIT) { /* * Convert the process to an unthreaded process. The * SINGLE_EXIT is called by exit1() or execve(), in * both cases other threads must be retired. */ KASSERT(p->p_numthreads == 1, ("Unthreading with >1 threads")); p->p_singlethread = NULL; p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT | P_HADTHREADS); /* * Wait for any remaining threads to exit cpu_throw(). */ while (p->p_exitthreads != 0) { PROC_SUNLOCK(p); PROC_UNLOCK(p); sched_relinquish(td); PROC_LOCK(p); PROC_SLOCK(p); } } else if (mode == SINGLE_BOUNDARY) { /* * Wait until all suspended threads are removed from * the processors. The thread_suspend_check() * increments p_boundary_count while it is still * running, which makes it possible for the execve() * to destroy vmspace while our other threads are * still using the address space. * * We lock the thread, which is only allowed to * succeed after context switch code finished using * the address space. */ FOREACH_THREAD_IN_PROC(p, td2) { if (td2 == td) continue; thread_lock(td2); KASSERT((td2->td_flags & TDF_BOUNDARY) != 0, ("td %p not on boundary", td2)); KASSERT(TD_IS_SUSPENDED(td2), ("td %p is not suspended", td2)); thread_unlock(td2); } } PROC_SUNLOCK(p); return (0); } bool thread_suspend_check_needed(void) { struct proc *p; struct thread *td; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); return (P_SHOULDSTOP(p) || ((p->p_flag & P_TRACED) != 0 && (td->td_dbgflags & TDB_SUSPEND) != 0)); } /* * Called in from locations that can safely check to see * whether we have to suspend or at least throttle for a * single-thread event (e.g. fork). * * Such locations include userret(). * If the "return_instead" argument is non zero, the thread must be able to * accept 0 (caller may continue), or 1 (caller must abort) as a result. * * The 'return_instead' argument tells the function if it may do a * thread_exit() or suspend, or whether the caller must abort and back * out instead. * * If the thread that set the single_threading request has set the * P_SINGLE_EXIT bit in the process flags then this call will never return * if 'return_instead' is false, but will exit. 
* * P_SINGLE_EXIT | return_instead == 0| return_instead != 0 *---------------+--------------------+--------------------- * 0 | returns 0 | returns 0 or 1 * | when ST ends | immediately *---------------+--------------------+--------------------- * 1 | thread exits | returns 1 * | | immediately * 0 = thread_exit() or suspension ok, * other = return error instead of stopping the thread. * * While a full suspension is under effect, even a single threading * thread would be suspended if it made this call (but it shouldn't). * This call should only be made from places where * thread_exit() would be safe as that may be the outcome unless * return_instead is set. */ int thread_suspend_check(int return_instead) { struct thread *td; struct proc *p; int wakeup_swapper; td = curthread; p = td->td_proc; mtx_assert(&Giant, MA_NOTOWNED); PROC_LOCK_ASSERT(p, MA_OWNED); while (thread_suspend_check_needed()) { if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { KASSERT(p->p_singlethread != NULL, ("singlethread not set")); /* * The only suspension in action is a * single-threading. Single threader need not stop. * It is safe to access p->p_singlethread unlocked * because it can only be set to our address by us. */ if (p->p_singlethread == td) return (0); /* Exempt from stopping. */ } if ((p->p_flag & P_SINGLE_EXIT) && return_instead) return (EINTR); /* Should we goto user boundary if we didn't come from there? */ if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE && (p->p_flag & P_SINGLE_BOUNDARY) && return_instead) return (ERESTART); /* * Ignore suspend requests if they are deferred. */ if ((td->td_flags & TDF_SBDRY) != 0) { KASSERT(return_instead, ("TDF_SBDRY set for unsafe thread_suspend_check")); KASSERT((td->td_flags & (TDF_SEINTR | TDF_SERESTART)) != (TDF_SEINTR | TDF_SERESTART), ("both TDF_SEINTR and TDF_SERESTART")); return (TD_SBDRY_INTR(td) ? TD_SBDRY_ERRNO(td) : 0); } /* * If the process is waiting for us to exit, * this thread should just suicide. * Assumes that P_SINGLE_EXIT implies P_STOPPED_SINGLE. */ if ((p->p_flag & P_SINGLE_EXIT) && (p->p_singlethread != td)) { PROC_UNLOCK(p); /* * Allow Linux emulation layer to do some work * before thread suicide. */ if (__predict_false(p->p_sysent->sv_thread_detach != NULL)) (p->p_sysent->sv_thread_detach)(td); umtx_thread_exit(td); kern_thr_exit(td); panic("stopped thread did not exit"); } PROC_SLOCK(p); thread_stopped(p); if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { if (p->p_numthreads == p->p_suspcount + 1) { thread_lock(p->p_singlethread); wakeup_swapper = thread_unsuspend_one( p->p_singlethread, p, false); thread_unlock(p->p_singlethread); if (wakeup_swapper) kick_proc0(); } } PROC_UNLOCK(p); thread_lock(td); /* * When a thread suspends, it just * gets taken off all queues. */ thread_suspend_one(td); if (return_instead == 0) { p->p_boundary_count++; td->td_flags |= TDF_BOUNDARY; } PROC_SUNLOCK(p); mi_switch(SW_INVOL | SWT_SUSPEND, NULL); thread_unlock(td); PROC_LOCK(p); } return (0); } void thread_suspend_switch(struct thread *td, struct proc *p) { KASSERT(!TD_IS_SUSPENDED(td), ("already suspended")); PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); /* * We implement thread_suspend_one in stages here to avoid * dropping the proc lock while the thread lock is owned. 
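 *
 * In sketch form, the sequence implemented just below is:
 *
 *	thread_stopped(p); p->p_suspcount++;	(proc lock + slock held)
 *	PROC_UNLOCK(p);
 *	thread_lock(td);
 *	TD_SET_SUSPENDED(td); sched_sleep(td, 0);
 *	PROC_SUNLOCK(p);
 *	mi_switch(SW_VOL | SWT_SUSPEND, NULL);	(returns once resumed)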
*/ if (p == td->td_proc) { thread_stopped(p); p->p_suspcount++; } PROC_UNLOCK(p); thread_lock(td); td->td_flags &= ~TDF_NEEDSUSPCHK; TD_SET_SUSPENDED(td); sched_sleep(td, 0); PROC_SUNLOCK(p); DROP_GIANT(); mi_switch(SW_VOL | SWT_SUSPEND, NULL); thread_unlock(td); PICKUP_GIANT(); PROC_LOCK(p); PROC_SLOCK(p); } void thread_suspend_one(struct thread *td) { struct proc *p; p = td->td_proc; PROC_SLOCK_ASSERT(p, MA_OWNED); THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(!TD_IS_SUSPENDED(td), ("already suspended")); p->p_suspcount++; td->td_flags &= ~TDF_NEEDSUSPCHK; TD_SET_SUSPENDED(td); sched_sleep(td, 0); } static int thread_unsuspend_one(struct thread *td, struct proc *p, bool boundary) { THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(TD_IS_SUSPENDED(td), ("Thread not suspended")); TD_CLR_SUSPENDED(td); td->td_flags &= ~TDF_ALLPROCSUSP; if (td->td_proc == p) { PROC_SLOCK_ASSERT(p, MA_OWNED); p->p_suspcount--; if (boundary && (td->td_flags & TDF_BOUNDARY) != 0) { td->td_flags &= ~TDF_BOUNDARY; p->p_boundary_count--; } } return (setrunnable(td)); } /* * Allow all threads blocked by single threading to continue running. */ void thread_unsuspend(struct proc *p) { struct thread *td; int wakeup_swapper; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); wakeup_swapper = 0; if (!P_SHOULDSTOP(p)) { FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); if (TD_IS_SUSPENDED(td)) { wakeup_swapper |= thread_unsuspend_one(td, p, true); } thread_unlock(td); } } else if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE && p->p_numthreads == p->p_suspcount) { /* * Stopping everything also did the job for the single * threading request. Now we've downgraded to single-threaded, * let it continue. */ if (p->p_singlethread->td_proc == p) { thread_lock(p->p_singlethread); wakeup_swapper = thread_unsuspend_one( p->p_singlethread, p, false); thread_unlock(p->p_singlethread); } } if (wakeup_swapper) kick_proc0(); } /* * End the single threading mode.. */ void thread_single_end(struct proc *p, int mode) { struct thread *td; int wakeup_swapper; KASSERT(mode == SINGLE_EXIT || mode == SINGLE_BOUNDARY || mode == SINGLE_ALLPROC || mode == SINGLE_NO_EXIT, ("invalid mode %d", mode)); PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT((mode == SINGLE_ALLPROC && (p->p_flag & P_TOTAL_STOP) != 0) || (mode != SINGLE_ALLPROC && (p->p_flag & P_TOTAL_STOP) == 0), ("mode %d does not match P_TOTAL_STOP", mode)); KASSERT(mode == SINGLE_ALLPROC || p->p_singlethread == curthread, ("thread_single_end from other thread %p %p", curthread, p->p_singlethread)); KASSERT(mode != SINGLE_BOUNDARY || (p->p_flag & P_SINGLE_BOUNDARY) != 0, ("mis-matched SINGLE_BOUNDARY flags %x", p->p_flag)); p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT | P_SINGLE_BOUNDARY | P_TOTAL_STOP); PROC_SLOCK(p); p->p_singlethread = NULL; wakeup_swapper = 0; /* * If there are other threads they may now run, * unless of course there is a blanket 'stop order' * on the process. The single threader must be allowed * to continue however as this is a bad place to stop. 
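 * (Note: thread_unsuspend_one() is called with boundary == true only
 * for SINGLE_BOUNDARY below, so TDF_BOUNDARY is cleared and
 * p_boundary_count is drained back to zero exactly when a boundary
 * request ends, as the KASSERT after the loop verifies.)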
*/ if (p->p_numthreads != remain_for_mode(mode) && !P_SHOULDSTOP(p)) { FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); if (TD_IS_SUSPENDED(td)) { wakeup_swapper |= thread_unsuspend_one(td, p, mode == SINGLE_BOUNDARY); } thread_unlock(td); } } KASSERT(mode != SINGLE_BOUNDARY || p->p_boundary_count == 0, ("inconsistent boundary count %d", p->p_boundary_count)); PROC_SUNLOCK(p); if (wakeup_swapper) kick_proc0(); } struct thread * thread_find(struct proc *p, lwpid_t tid) { struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); FOREACH_THREAD_IN_PROC(p, td) { if (td->td_tid == tid) break; } return (td); } /* Locate a thread by number; return with proc lock held. */ struct thread * tdfind(lwpid_t tid, pid_t pid) { #define RUN_THRESH 16 struct thread *td; int run = 0; rw_rlock(&tidhash_lock); LIST_FOREACH(td, TIDHASH(tid), td_hash) { if (td->td_tid == tid) { if (pid != -1 && td->td_proc->p_pid != pid) { td = NULL; break; } PROC_LOCK(td->td_proc); if (td->td_proc->p_state == PRS_NEW) { PROC_UNLOCK(td->td_proc); td = NULL; break; } if (run > RUN_THRESH) { if (rw_try_upgrade(&tidhash_lock)) { LIST_REMOVE(td, td_hash); LIST_INSERT_HEAD(TIDHASH(td->td_tid), td, td_hash); rw_wunlock(&tidhash_lock); return (td); } } break; } run++; } rw_runlock(&tidhash_lock); return (td); } void tidhash_add(struct thread *td) { rw_wlock(&tidhash_lock); LIST_INSERT_HEAD(TIDHASH(td->td_tid), td, td_hash); rw_wunlock(&tidhash_lock); } void tidhash_remove(struct thread *td) { rw_wlock(&tidhash_lock); LIST_REMOVE(td, td_hash); rw_wunlock(&tidhash_lock); } Index: stable/11/sys/kern/subr_intr.c =================================================================== --- stable/11/sys/kern/subr_intr.c (revision 331016) +++ stable/11/sys/kern/subr_intr.c (revision 331017) @@ -1,1654 +1,1655 @@ /*- * Copyright (c) 2015-2016 Svatopluk Kraus * Copyright (c) 2015-2016 Michal Meloun * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * New-style Interrupt Framework * * TODO: - add support for disconnected PICs. * - to support IPI (PPI) enabling on other CPUs if already started. * - to complete things for removable PICs. 
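 *
 * A rough life cycle of a device interrupt in this framework (all
 * names refer to functions defined in this file; the bus glue that
 * calls them is not shown):
 *
 *	intr_pic_register() / intr_pic_claim_root()	controller attach
 *	intr_map_irq()					map data to a resource id
 *	intr_activate_irq(), intr_setup_irq()		bus_setup_intr() path
 *	intr_irq_handler() -> intr_isrc_dispatch()	runtime dispatch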
*/ #include "opt_ddb.h" #include "opt_hwpmc_hooks.h" #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/syslog.h> #include <sys/malloc.h> #include <sys/proc.h> #include <sys/queue.h> #include <sys/bus.h> #include <sys/interrupt.h> #include <sys/conf.h> #include <sys/cpuset.h> #include <sys/rman.h> #include <sys/sched.h> #include <sys/smp.h> +#include <sys/vmmeter.h> #ifdef HWPMC_HOOKS #include <sys/pmckern.h> #endif #include <machine/atomic.h> #include <machine/intr.h> #include <machine/cpu.h> #include <machine/smp.h> #include <machine/stdarg.h> #ifdef DDB #include <ddb/ddb.h> #endif #include "pic_if.h" #include "msi_if.h" #define INTRNAME_LEN (2*MAXCOMLEN + 1) #ifdef DEBUG #define debugf(fmt, args...) do { printf("%s(): ", __func__); \ printf(fmt,##args); } while (0) #else #define debugf(fmt, args...) #endif MALLOC_DECLARE(M_INTRNG); MALLOC_DEFINE(M_INTRNG, "intr", "intr interrupt handling"); /* Main interrupt handler called from assembler -> 'hidden' for C code. */ void intr_irq_handler(struct trapframe *tf); /* Root interrupt controller stuff. */ device_t intr_irq_root_dev; static intr_irq_filter_t *irq_root_filter; static void *irq_root_arg; static u_int irq_root_ipicount; struct intr_pic_child { SLIST_ENTRY(intr_pic_child) pc_next; struct intr_pic *pc_pic; intr_child_irq_filter_t *pc_filter; void *pc_filter_arg; uintptr_t pc_start; uintptr_t pc_length; }; /* Interrupt controller definition. */ struct intr_pic { SLIST_ENTRY(intr_pic) pic_next; intptr_t pic_xref; /* hardware identification */ device_t pic_dev; #define FLAG_PIC (1 << 0) #define FLAG_MSI (1 << 1) u_int pic_flags; struct mtx pic_child_lock; SLIST_HEAD(, intr_pic_child) pic_children; }; static struct mtx pic_list_lock; static SLIST_HEAD(, intr_pic) pic_list; static struct intr_pic *pic_lookup(device_t dev, intptr_t xref); /* Interrupt source definition. */ static struct mtx isrc_table_lock; static struct intr_irqsrc *irq_sources[NIRQ]; u_int irq_next_free; #ifdef SMP static boolean_t irq_assign_cpu = FALSE; #endif /* * - 2 counters for each I/O interrupt. * - MAXCPU counters for each IPI counter for SMP. */ #ifdef SMP #define INTRCNT_COUNT (NIRQ * 2 + INTR_IPI_COUNT * MAXCPU) #else #define INTRCNT_COUNT (NIRQ * 2) #endif /* Data for MI statistics reporting. */ u_long intrcnt[INTRCNT_COUNT]; char intrnames[INTRCNT_COUNT * INTRNAME_LEN]; size_t sintrcnt = sizeof(intrcnt); size_t sintrnames = sizeof(intrnames); static u_int intrcnt_index; static struct intr_irqsrc *intr_map_get_isrc(u_int res_id); static void intr_map_set_isrc(u_int res_id, struct intr_irqsrc *isrc); static struct intr_map_data * intr_map_get_map_data(u_int res_id); static void intr_map_copy_map_data(u_int res_id, device_t *dev, intptr_t *xref, struct intr_map_data **data); /* * Interrupt framework initialization routine. */ static void intr_irq_init(void *dummy __unused) { SLIST_INIT(&pic_list); mtx_init(&pic_list_lock, "intr pic list", NULL, MTX_DEF); mtx_init(&isrc_table_lock, "intr isrc table", NULL, MTX_DEF); } SYSINIT(intr_irq_init, SI_SUB_INTR, SI_ORDER_FIRST, intr_irq_init, NULL); static void intrcnt_setname(const char *name, int index) { snprintf(intrnames + INTRNAME_LEN * index, INTRNAME_LEN, "%-*s", INTRNAME_LEN - 1, name); } /* * Update name for interrupt source with interrupt event. */ static void intrcnt_updatename(struct intr_irqsrc *isrc) { /* QQQ: What about stray counter name? */ mtx_assert(&isrc_table_lock, MA_OWNED); intrcnt_setname(isrc->isrc_event->ie_fullname, isrc->isrc_index); } /* * Virtualization for interrupt source interrupt counter increment. */ static inline void isrc_increment_count(struct intr_irqsrc *isrc) { if (isrc->isrc_flags & INTR_ISRCF_PPI) atomic_add_long(&isrc->isrc_count[0], 1); else isrc->isrc_count[0]++; } /* * Virtualization for interrupt source interrupt stray counter increment.
*/ static inline void isrc_increment_straycount(struct intr_irqsrc *isrc) { isrc->isrc_count[1]++; } /* * Virtualization for interrupt source interrupt name update. */ static void isrc_update_name(struct intr_irqsrc *isrc, const char *name) { char str[INTRNAME_LEN]; mtx_assert(&isrc_table_lock, MA_OWNED); if (name != NULL) { snprintf(str, INTRNAME_LEN, "%s: %s", isrc->isrc_name, name); intrcnt_setname(str, isrc->isrc_index); snprintf(str, INTRNAME_LEN, "stray %s: %s", isrc->isrc_name, name); intrcnt_setname(str, isrc->isrc_index + 1); } else { snprintf(str, INTRNAME_LEN, "%s:", isrc->isrc_name); intrcnt_setname(str, isrc->isrc_index); snprintf(str, INTRNAME_LEN, "stray %s:", isrc->isrc_name); intrcnt_setname(str, isrc->isrc_index + 1); } } /* * Virtualization for interrupt source interrupt counters setup. */ static void isrc_setup_counters(struct intr_irqsrc *isrc) { u_int index; /* * XXX - it does not work well with removable controllers and * interrupt sources !!! */ index = atomic_fetchadd_int(&intrcnt_index, 2); isrc->isrc_index = index; isrc->isrc_count = &intrcnt[index]; isrc_update_name(isrc, NULL); } /* * Virtualization for interrupt source interrupt counters release. */ static void isrc_release_counters(struct intr_irqsrc *isrc) { panic("%s: not implemented", __func__); } #ifdef SMP /* * Virtualization for interrupt source IPI counters setup. */ u_long * intr_ipi_setup_counters(const char *name) { u_int index, i; char str[INTRNAME_LEN]; index = atomic_fetchadd_int(&intrcnt_index, MAXCPU); for (i = 0; i < MAXCPU; i++) { snprintf(str, INTRNAME_LEN, "cpu%d:%s", i, name); intrcnt_setname(str, index + i); } return (&intrcnt[index]); } #endif /* * Main interrupt dispatch handler. It's called straight * from the assembler, where CPU interrupt is served. */ void intr_irq_handler(struct trapframe *tf) { struct trapframe * oldframe; struct thread * td; KASSERT(irq_root_filter != NULL, ("%s: no filter", __func__)); PCPU_INC(cnt.v_intr); critical_enter(); td = curthread; oldframe = td->td_intr_frame; td->td_intr_frame = tf; irq_root_filter(irq_root_arg); td->td_intr_frame = oldframe; critical_exit(); #ifdef HWPMC_HOOKS if (pmc_hook && TRAPF_USERMODE(tf) && (PCPU_GET(curthread)->td_pflags & TDP_CALLCHAIN)) pmc_hook(PCPU_GET(curthread), PMC_FN_USER_CALLCHAIN, tf); #endif } int intr_child_irq_handler(struct intr_pic *parent, uintptr_t irq) { struct intr_pic_child *child; bool found; found = false; mtx_lock_spin(&parent->pic_child_lock); SLIST_FOREACH(child, &parent->pic_children, pc_next) { if (child->pc_start <= irq && irq < (child->pc_start + child->pc_length)) { found = true; break; } } mtx_unlock_spin(&parent->pic_child_lock); if (found) return (child->pc_filter(child->pc_filter_arg, irq)); return (FILTER_STRAY); } /* * interrupt controller dispatch function for interrupts. It should * be called straight from the interrupt controller, when associated interrupt * source is learned. */ int intr_isrc_dispatch(struct intr_irqsrc *isrc, struct trapframe *tf) { KASSERT(isrc != NULL, ("%s: no source", __func__)); isrc_increment_count(isrc); #ifdef INTR_SOLO if (isrc->isrc_filter != NULL) { int error; error = isrc->isrc_filter(isrc->isrc_arg, tf); PIC_POST_FILTER(isrc->isrc_dev, isrc); if (error == FILTER_HANDLED) return (0); } else #endif if (isrc->isrc_event != NULL) { if (intr_event_handle(isrc->isrc_event, tf) == 0) return (0); } isrc_increment_straycount(isrc); return (EINVAL); } /* * Alloc unique interrupt number (resource handle) for interrupt source. 
* * There could be various strategies how to allocate free interrupt number * (resource handle) for new interrupt source. * * 1. Handles are always allocated forward, so handles are not recycled * immediately. However, if only one free handle left which is reused * constantly... */ static inline int isrc_alloc_irq(struct intr_irqsrc *isrc) { u_int maxirqs, irq; mtx_assert(&isrc_table_lock, MA_OWNED); maxirqs = nitems(irq_sources); if (irq_next_free >= maxirqs) return (ENOSPC); for (irq = irq_next_free; irq < maxirqs; irq++) { if (irq_sources[irq] == NULL) goto found; } for (irq = 0; irq < irq_next_free; irq++) { if (irq_sources[irq] == NULL) goto found; } irq_next_free = maxirqs; return (ENOSPC); found: isrc->isrc_irq = irq; irq_sources[irq] = isrc; irq_next_free = irq + 1; if (irq_next_free >= maxirqs) irq_next_free = 0; return (0); } /* * Free unique interrupt number (resource handle) from interrupt source. */ static inline int isrc_free_irq(struct intr_irqsrc *isrc) { mtx_assert(&isrc_table_lock, MA_OWNED); if (isrc->isrc_irq >= nitems(irq_sources)) return (EINVAL); if (irq_sources[isrc->isrc_irq] != isrc) return (EINVAL); irq_sources[isrc->isrc_irq] = NULL; isrc->isrc_irq = INTR_IRQ_INVALID; /* just to be safe */ return (0); } /* * Initialize interrupt source and register it into global interrupt table. */ int intr_isrc_register(struct intr_irqsrc *isrc, device_t dev, u_int flags, const char *fmt, ...) { int error; va_list ap; bzero(isrc, sizeof(struct intr_irqsrc)); isrc->isrc_dev = dev; isrc->isrc_irq = INTR_IRQ_INVALID; /* just to be safe */ isrc->isrc_flags = flags; va_start(ap, fmt); vsnprintf(isrc->isrc_name, INTR_ISRC_NAMELEN, fmt, ap); va_end(ap); mtx_lock(&isrc_table_lock); error = isrc_alloc_irq(isrc); if (error != 0) { mtx_unlock(&isrc_table_lock); return (error); } /* * Setup interrupt counters, but not for IPI sources. Those are setup * later and only for used ones (up to INTR_IPI_COUNT) to not exhaust * our counter pool. */ if ((isrc->isrc_flags & INTR_ISRCF_IPI) == 0) isrc_setup_counters(isrc); mtx_unlock(&isrc_table_lock); return (0); } /* * Deregister interrupt source from global interrupt table. */ int intr_isrc_deregister(struct intr_irqsrc *isrc) { int error; mtx_lock(&isrc_table_lock); if ((isrc->isrc_flags & INTR_ISRCF_IPI) == 0) isrc_release_counters(isrc); error = isrc_free_irq(isrc); mtx_unlock(&isrc_table_lock); return (error); } #ifdef SMP /* * A support function for a PIC to decide if provided ISRC should be inited * on given cpu. The logic of INTR_ISRCF_BOUND flag and isrc_cpu member of * struct intr_irqsrc is the following: * * If INTR_ISRCF_BOUND is set, the ISRC should be inited only on cpus * set in isrc_cpu. If not, the ISRC should be inited on every cpu and * isrc_cpu is kept consistent with it. Thus isrc_cpu is always correct. */ bool intr_isrc_init_on_cpu(struct intr_irqsrc *isrc, u_int cpu) { if (isrc->isrc_handlers == 0) return (false); if ((isrc->isrc_flags & (INTR_ISRCF_PPI | INTR_ISRCF_IPI)) == 0) return (false); if (isrc->isrc_flags & INTR_ISRCF_BOUND) return (CPU_ISSET(cpu, &isrc->isrc_cpu)); CPU_SET(cpu, &isrc->isrc_cpu); return (true); } #endif #ifdef INTR_SOLO /* * Setup filter into interrupt source. */ static int iscr_setup_filter(struct intr_irqsrc *isrc, const char *name, intr_irq_filter_t *filter, void *arg, void **cookiep) { if (filter == NULL) return (EINVAL); mtx_lock(&isrc_table_lock); /* * Make sure that we do not mix the two ways * how we handle interrupt sources. 
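 * (i.e. a solo filter and an intr_event cannot coexist on one source)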
*/ if (isrc->isrc_filter != NULL || isrc->isrc_event != NULL) { mtx_unlock(&isrc_table_lock); return (EBUSY); } isrc->isrc_filter = filter; isrc->isrc_arg = arg; isrc_update_name(isrc, name); mtx_unlock(&isrc_table_lock); *cookiep = isrc; return (0); } #endif /* * Interrupt source pre_ithread method for MI interrupt framework. */ static void intr_isrc_pre_ithread(void *arg) { struct intr_irqsrc *isrc = arg; PIC_PRE_ITHREAD(isrc->isrc_dev, isrc); } /* * Interrupt source post_ithread method for MI interrupt framework. */ static void intr_isrc_post_ithread(void *arg) { struct intr_irqsrc *isrc = arg; PIC_POST_ITHREAD(isrc->isrc_dev, isrc); } /* * Interrupt source post_filter method for MI interrupt framework. */ static void intr_isrc_post_filter(void *arg) { struct intr_irqsrc *isrc = arg; PIC_POST_FILTER(isrc->isrc_dev, isrc); } /* * Interrupt source assign_cpu method for MI interrupt framework. */ static int intr_isrc_assign_cpu(void *arg, int cpu) { #ifdef SMP struct intr_irqsrc *isrc = arg; int error; if (isrc->isrc_dev != intr_irq_root_dev) return (EINVAL); mtx_lock(&isrc_table_lock); if (cpu == NOCPU) { CPU_ZERO(&isrc->isrc_cpu); isrc->isrc_flags &= ~INTR_ISRCF_BOUND; } else { CPU_SETOF(cpu, &isrc->isrc_cpu); isrc->isrc_flags |= INTR_ISRCF_BOUND; } /* * In NOCPU case, it's up to PIC to either leave ISRC on same CPU or * re-balance it to another CPU or enable it on more CPUs. However, * PIC is expected to change isrc_cpu appropriately to keep us well * informed if the call is successful. */ if (irq_assign_cpu) { error = PIC_BIND_INTR(isrc->isrc_dev, isrc); if (error) { CPU_ZERO(&isrc->isrc_cpu); mtx_unlock(&isrc_table_lock); return (error); } } mtx_unlock(&isrc_table_lock); return (0); #else return (EOPNOTSUPP); #endif } /* * Create interrupt event for interrupt source. */ static int isrc_event_create(struct intr_irqsrc *isrc) { struct intr_event *ie; int error; error = intr_event_create(&ie, isrc, 0, isrc->isrc_irq, intr_isrc_pre_ithread, intr_isrc_post_ithread, intr_isrc_post_filter, intr_isrc_assign_cpu, "%s:", isrc->isrc_name); if (error) return (error); mtx_lock(&isrc_table_lock); /* * Make sure that we do not mix the two ways * how we handle interrupt sources. Let contested event wins. */ #ifdef INTR_SOLO if (isrc->isrc_filter != NULL || isrc->isrc_event != NULL) { #else if (isrc->isrc_event != NULL) { #endif mtx_unlock(&isrc_table_lock); intr_event_destroy(ie); return (isrc->isrc_event != NULL ? EBUSY : 0); } isrc->isrc_event = ie; mtx_unlock(&isrc_table_lock); return (0); } #ifdef notyet /* * Destroy interrupt event for interrupt source. */ static void isrc_event_destroy(struct intr_irqsrc *isrc) { struct intr_event *ie; mtx_lock(&isrc_table_lock); ie = isrc->isrc_event; isrc->isrc_event = NULL; mtx_unlock(&isrc_table_lock); if (ie != NULL) intr_event_destroy(ie); } #endif /* * Add handler to interrupt source. */ static int isrc_add_handler(struct intr_irqsrc *isrc, const char *name, driver_filter_t filter, driver_intr_t handler, void *arg, enum intr_type flags, void **cookiep) { int error; if (isrc->isrc_event == NULL) { error = isrc_event_create(isrc); if (error) return (error); } error = intr_event_add_handler(isrc->isrc_event, name, filter, handler, arg, intr_priority(flags), flags, cookiep); if (error == 0) { mtx_lock(&isrc_table_lock); intrcnt_updatename(isrc); mtx_unlock(&isrc_table_lock); } return (error); } /* * Lookup interrupt controller locked. 
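 * The matching rules implemented below: with dev == NULL, the xref
 * must match; if either xref is 0, matching the dev is enough;
 * otherwise both dev and xref must match.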
*/ static inline struct intr_pic * pic_lookup_locked(device_t dev, intptr_t xref) { struct intr_pic *pic; mtx_assert(&pic_list_lock, MA_OWNED); if (dev == NULL && xref == 0) return (NULL); /* Note that pic->pic_dev is never NULL on registered PIC. */ SLIST_FOREACH(pic, &pic_list, pic_next) { if (dev == NULL) { if (xref == pic->pic_xref) return (pic); } else if (xref == 0 || pic->pic_xref == 0) { if (dev == pic->pic_dev) return (pic); } else if (xref == pic->pic_xref && dev == pic->pic_dev) return (pic); } return (NULL); } /* * Lookup interrupt controller. */ static struct intr_pic * pic_lookup(device_t dev, intptr_t xref) { struct intr_pic *pic; mtx_lock(&pic_list_lock); pic = pic_lookup_locked(dev, xref); mtx_unlock(&pic_list_lock); return (pic); } /* * Create interrupt controller. */ static struct intr_pic * pic_create(device_t dev, intptr_t xref) { struct intr_pic *pic; mtx_lock(&pic_list_lock); pic = pic_lookup_locked(dev, xref); if (pic != NULL) { mtx_unlock(&pic_list_lock); return (pic); } pic = malloc(sizeof(*pic), M_INTRNG, M_NOWAIT | M_ZERO); if (pic == NULL) { mtx_unlock(&pic_list_lock); return (NULL); } pic->pic_xref = xref; pic->pic_dev = dev; mtx_init(&pic->pic_child_lock, "pic child lock", NULL, MTX_SPIN); SLIST_INSERT_HEAD(&pic_list, pic, pic_next); mtx_unlock(&pic_list_lock); return (pic); } #ifdef notyet /* * Destroy interrupt controller. */ static void pic_destroy(device_t dev, intptr_t xref) { struct intr_pic *pic; mtx_lock(&pic_list_lock); pic = pic_lookup_locked(dev, xref); if (pic == NULL) { mtx_unlock(&pic_list_lock); return; } SLIST_REMOVE(&pic_list, pic, intr_pic, pic_next); mtx_unlock(&pic_list_lock); free(pic, M_INTRNG); } #endif /* * Register interrupt controller. */ struct intr_pic * intr_pic_register(device_t dev, intptr_t xref) { struct intr_pic *pic; if (dev == NULL) return (NULL); pic = pic_create(dev, xref); if (pic == NULL) return (NULL); pic->pic_flags |= FLAG_PIC; debugf("PIC %p registered for %s <dev %p, xref %jx>\n", pic, device_get_nameunit(dev), dev, (uintmax_t)xref); return (pic); } /* * Unregister interrupt controller. */ int intr_pic_deregister(device_t dev, intptr_t xref) { panic("%s: not implemented", __func__); } /* * Mark interrupt controller (itself) as a root one. * * Note that only an interrupt controller can really know its position * in interrupt controller's tree. So root PIC must claim itself as a root. * * In FDT case, according to ePAPR approved version 1.1 from 08 April 2011, * page 30: * "The root of the interrupt tree is determined when traversal * of the interrupt tree reaches an interrupt controller node without * an interrupts property and thus no explicit interrupt parent." */ int intr_pic_claim_root(device_t dev, intptr_t xref, intr_irq_filter_t *filter, void *arg, u_int ipicount) { struct intr_pic *pic; pic = pic_lookup(dev, xref); if (pic == NULL) { device_printf(dev, "not registered\n"); return (EINVAL); } KASSERT((pic->pic_flags & FLAG_PIC) != 0, ("%s: Found a non-PIC controller: %s", __func__, device_get_name(pic->pic_dev))); if (filter == NULL) { device_printf(dev, "filter missing\n"); return (EINVAL); } /* * Only one interrupt controller can be the root for now. * Note that we further suppose that there is no threaded interrupt * routine (handler) on the root. See intr_irq_handler().
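 *
 * A root PIC driver is thus expected to do, from its attach method
 * (sketch; "my_filter" and "sc" are hypothetical driver names):
 *
 *	intr_pic_register(dev, xref);
 *	error = intr_pic_claim_root(dev, xref, my_filter, sc, 0);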
*/ if (intr_irq_root_dev != NULL) { device_printf(dev, "another root already set\n"); return (EBUSY); } intr_irq_root_dev = dev; irq_root_filter = filter; irq_root_arg = arg; irq_root_ipicount = ipicount; debugf("irq root set to %s\n", device_get_nameunit(dev)); return (0); } /* * Add a handler to manage a sub-range of a parent's interrupts. */ struct intr_pic * intr_pic_add_handler(device_t parent, struct intr_pic *pic, intr_child_irq_filter_t *filter, void *arg, uintptr_t start, uintptr_t length) { struct intr_pic *parent_pic; struct intr_pic_child *newchild; #ifdef INVARIANTS struct intr_pic_child *child; #endif parent_pic = pic_lookup(parent, 0); if (parent_pic == NULL) return (NULL); newchild = malloc(sizeof(*newchild), M_INTRNG, M_WAITOK | M_ZERO); newchild->pc_pic = pic; newchild->pc_filter = filter; newchild->pc_filter_arg = arg; newchild->pc_start = start; newchild->pc_length = length; mtx_lock_spin(&parent_pic->pic_child_lock); #ifdef INVARIANTS SLIST_FOREACH(child, &parent_pic->pic_children, pc_next) { KASSERT(child->pc_pic != pic, ("%s: Adding a child PIC twice", __func__)); } #endif SLIST_INSERT_HEAD(&parent_pic->pic_children, newchild, pc_next); mtx_unlock_spin(&parent_pic->pic_child_lock); return (pic); } static int intr_resolve_irq(device_t dev, intptr_t xref, struct intr_map_data *data, struct intr_irqsrc **isrc) { struct intr_pic *pic; struct intr_map_data_msi *msi; if (data == NULL) return (EINVAL); pic = pic_lookup(dev, xref); if (pic == NULL) return (ESRCH); switch (data->type) { case INTR_MAP_DATA_MSI: KASSERT((pic->pic_flags & FLAG_MSI) != 0, ("%s: Found a non-MSI controller: %s", __func__, device_get_name(pic->pic_dev))); msi = (struct intr_map_data_msi *)data; *isrc = msi->isrc; return (0); default: KASSERT((pic->pic_flags & FLAG_PIC) != 0, ("%s: Found a non-PIC controller: %s", __func__, device_get_name(pic->pic_dev))); return (PIC_MAP_INTR(pic->pic_dev, data, isrc)); } } int intr_activate_irq(device_t dev, struct resource *res) { device_t map_dev; intptr_t map_xref; struct intr_map_data *data; struct intr_irqsrc *isrc; u_int res_id; int error; KASSERT(rman_get_start(res) == rman_get_end(res), ("%s: more interrupts in resource", __func__)); res_id = (u_int)rman_get_start(res); if (intr_map_get_isrc(res_id) != NULL) panic("Attempt to double activation of resource id: %u\n", res_id); intr_map_copy_map_data(res_id, &map_dev, &map_xref, &data); error = intr_resolve_irq(map_dev, map_xref, data, &isrc); if (error != 0) { free(data, M_INTRNG); /* XXX TODO DISCONNECTED PICs */ /* if (error == EINVAL) return(0); */ return (error); } intr_map_set_isrc(res_id, isrc); rman_set_virtual(res, data); return (PIC_ACTIVATE_INTR(isrc->isrc_dev, isrc, res, data)); } int intr_deactivate_irq(device_t dev, struct resource *res) { struct intr_map_data *data; struct intr_irqsrc *isrc; u_int res_id; int error; KASSERT(rman_get_start(res) == rman_get_end(res), ("%s: more interrupts in resource", __func__)); res_id = (u_int)rman_get_start(res); isrc = intr_map_get_isrc(res_id); if (isrc == NULL) panic("Attempt to deactivate non-active resource id: %u\n", res_id); data = rman_get_virtual(res); error = PIC_DEACTIVATE_INTR(isrc->isrc_dev, isrc, res, data); intr_map_set_isrc(res_id, NULL); rman_set_virtual(res, NULL); free(data, M_INTRNG); return (error); } int intr_setup_irq(device_t dev, struct resource *res, driver_filter_t filt, driver_intr_t hand, void *arg, int flags, void **cookiep) { int error; struct intr_map_data *data; struct intr_irqsrc *isrc; const char *name; u_int res_id;
KASSERT(rman_get_start(res) == rman_get_end(res), ("%s: more interrupts in resource", __func__)); res_id = (u_int)rman_get_start(res); isrc = intr_map_get_isrc(res_id); if (isrc == NULL) { /* XXX TODO DISCONNECTED PICs */ return (EINVAL); } data = rman_get_virtual(res); name = device_get_nameunit(dev); #ifdef INTR_SOLO /* * Standard handling is done through MI interrupt framework. However, * some interrupts may request solely their own special handling. This * non-standard handling can be used for interrupt controllers without * handler (filter only), so when interrupt controllers are * chained, the MI interrupt framework is called only in the leaf controller. * * Note that the root interrupt controller routine is served as well, * though in intr_irq_handler(), i.e. the main system dispatch routine. */ if (flags & INTR_SOLO && hand != NULL) { debugf("irq %u cannot solo on %s\n", isrc->isrc_irq, name); return (EINVAL); } if (flags & INTR_SOLO) { error = iscr_setup_filter(isrc, name, (intr_irq_filter_t *)filt, arg, cookiep); debugf("irq %u setup filter error %d on %s\n", isrc->isrc_irq, error, name); } else #endif { error = isrc_add_handler(isrc, name, filt, hand, arg, flags, cookiep); debugf("irq %u add handler error %d on %s\n", isrc->isrc_irq, error, name); } if (error != 0) return (error); mtx_lock(&isrc_table_lock); error = PIC_SETUP_INTR(isrc->isrc_dev, isrc, res, data); if (error == 0) { isrc->isrc_handlers++; if (isrc->isrc_handlers == 1) PIC_ENABLE_INTR(isrc->isrc_dev, isrc); } mtx_unlock(&isrc_table_lock); if (error != 0) intr_event_remove_handler(*cookiep); return (error); } int intr_teardown_irq(device_t dev, struct resource *res, void *cookie) { int error; struct intr_map_data *data; struct intr_irqsrc *isrc; u_int res_id; KASSERT(rman_get_start(res) == rman_get_end(res), ("%s: more interrupts in resource", __func__)); res_id = (u_int)rman_get_start(res); isrc = intr_map_get_isrc(res_id); if (isrc == NULL || isrc->isrc_handlers == 0) return (EINVAL); data = rman_get_virtual(res); #ifdef INTR_SOLO if (isrc->isrc_filter != NULL) { if (isrc != cookie) return (EINVAL); mtx_lock(&isrc_table_lock); isrc->isrc_filter = NULL; isrc->isrc_arg = NULL; isrc->isrc_handlers = 0; PIC_DISABLE_INTR(isrc->isrc_dev, isrc); PIC_TEARDOWN_INTR(isrc->isrc_dev, isrc, res, data); isrc_update_name(isrc, NULL); mtx_unlock(&isrc_table_lock); return (0); } #endif if (isrc != intr_handler_source(cookie)) return (EINVAL); error = intr_event_remove_handler(cookie); if (error == 0) { mtx_lock(&isrc_table_lock); isrc->isrc_handlers--; if (isrc->isrc_handlers == 0) PIC_DISABLE_INTR(isrc->isrc_dev, isrc); PIC_TEARDOWN_INTR(isrc->isrc_dev, isrc, res, data); intrcnt_updatename(isrc); mtx_unlock(&isrc_table_lock); } return (error); } int intr_describe_irq(device_t dev, struct resource *res, void *cookie, const char *descr) { int error; struct intr_irqsrc *isrc; u_int res_id; KASSERT(rman_get_start(res) == rman_get_end(res), ("%s: more interrupts in resource", __func__)); res_id = (u_int)rman_get_start(res); isrc = intr_map_get_isrc(res_id); if (isrc == NULL || isrc->isrc_handlers == 0) return (EINVAL); #ifdef INTR_SOLO if (isrc->isrc_filter != NULL) { if (isrc != cookie) return (EINVAL); mtx_lock(&isrc_table_lock); isrc_update_name(isrc, descr); mtx_unlock(&isrc_table_lock); return (0); } #endif error = intr_event_describe_handler(isrc->isrc_event, cookie, descr); if (error == 0) { mtx_lock(&isrc_table_lock); intrcnt_updatename(isrc); mtx_unlock(&isrc_table_lock); } return (error); } #ifdef SMP int intr_bind_irq(device_t dev, struct 
resource *res, int cpu) { struct intr_irqsrc *isrc; u_int res_id; KASSERT(rman_get_start(res) == rman_get_end(res), ("%s: more interrupts in resource", __func__)); res_id = (u_int)rman_get_start(res); isrc = intr_map_get_isrc(res_id); if (isrc == NULL || isrc->isrc_handlers == 0) return (EINVAL); #ifdef INTR_SOLO if (isrc->isrc_filter != NULL) return (intr_isrc_assign_cpu(isrc, cpu)); #endif return (intr_event_bind(isrc->isrc_event, cpu)); } /* * Return the CPU that the next interrupt source should use. * For now just returns the next CPU according to round-robin. */ u_int intr_irq_next_cpu(u_int last_cpu, cpuset_t *cpumask) { if (!irq_assign_cpu || mp_ncpus == 1) return (PCPU_GET(cpuid)); do { last_cpu++; if (last_cpu > mp_maxid) last_cpu = 0; } while (!CPU_ISSET(last_cpu, cpumask)); return (last_cpu); } /* * Distribute all the interrupt sources among the available * CPUs once the AP's have been launched. */ static void intr_irq_shuffle(void *arg __unused) { struct intr_irqsrc *isrc; u_int i; if (mp_ncpus == 1) return; mtx_lock(&isrc_table_lock); irq_assign_cpu = TRUE; for (i = 0; i < NIRQ; i++) { isrc = irq_sources[i]; if (isrc == NULL || isrc->isrc_handlers == 0 || isrc->isrc_flags & (INTR_ISRCF_PPI | INTR_ISRCF_IPI)) continue; if (isrc->isrc_event != NULL && isrc->isrc_flags & INTR_ISRCF_BOUND && isrc->isrc_event->ie_cpu != CPU_FFS(&isrc->isrc_cpu) - 1) panic("%s: CPU inconsistency", __func__); if ((isrc->isrc_flags & INTR_ISRCF_BOUND) == 0) CPU_ZERO(&isrc->isrc_cpu); /* start again */ /* * We are in wicked position here if the following call fails * for bound ISRC. The best thing we can do is to clear * isrc_cpu so inconsistency with ie_cpu will be detectable. */ if (PIC_BIND_INTR(isrc->isrc_dev, isrc) != 0) CPU_ZERO(&isrc->isrc_cpu); } mtx_unlock(&isrc_table_lock); } SYSINIT(intr_irq_shuffle, SI_SUB_SMP, SI_ORDER_SECOND, intr_irq_shuffle, NULL); #else u_int intr_irq_next_cpu(u_int current_cpu, cpuset_t *cpumask) { return (PCPU_GET(cpuid)); } #endif /* * Allocate memory for new intr_map_data structure. * Initialize common fields. 
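 * For example, the MSI code below allocates its per-vector data as:
 *
 *	msi = (struct intr_map_data_msi *)intr_alloc_map_data(
 *	    INTR_MAP_DATA_MSI, sizeof(*msi), M_WAITOK | M_ZERO);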
*/ struct intr_map_data * intr_alloc_map_data(enum intr_map_data_type type, size_t len, int flags) { struct intr_map_data *data; data = malloc(len, M_INTRNG, flags); data->type = type; data->len = len; return (data); } void intr_free_intr_map_data(struct intr_map_data *data) { free(data, M_INTRNG); } /* * Register an MSI/MSI-X interrupt controller */ int intr_msi_register(device_t dev, intptr_t xref) { struct intr_pic *pic; if (dev == NULL) return (EINVAL); pic = pic_create(dev, xref); if (pic == NULL) return (ENOMEM); pic->pic_flags |= FLAG_MSI; debugf("PIC %p registered for %s <dev %p, xref %jx>\n", pic, device_get_nameunit(dev), dev, (uintmax_t)xref); return (0); } int intr_alloc_msi(device_t pci, device_t child, intptr_t xref, int count, int maxcount, int *irqs) { struct intr_irqsrc **isrc; struct intr_pic *pic; device_t pdev; struct intr_map_data_msi *msi; int err, i; pic = pic_lookup(NULL, xref); if (pic == NULL) return (ESRCH); KASSERT((pic->pic_flags & FLAG_MSI) != 0, ("%s: Found a non-MSI controller: %s", __func__, device_get_name(pic->pic_dev))); isrc = malloc(sizeof(*isrc) * count, M_INTRNG, M_WAITOK); err = MSI_ALLOC_MSI(pic->pic_dev, child, count, maxcount, &pdev, isrc); if (err != 0) { free(isrc, M_INTRNG); return (err); } for (i = 0; i < count; i++) { msi = (struct intr_map_data_msi *)intr_alloc_map_data( INTR_MAP_DATA_MSI, sizeof(*msi), M_WAITOK | M_ZERO); msi->isrc = isrc[i]; irqs[i] = intr_map_irq(pic->pic_dev, xref, (struct intr_map_data *)msi); } free(isrc, M_INTRNG); return (err); } int intr_release_msi(device_t pci, device_t child, intptr_t xref, int count, int *irqs) { struct intr_irqsrc **isrc; struct intr_pic *pic; struct intr_map_data_msi *msi; int i, err; pic = pic_lookup(NULL, xref); if (pic == NULL) return (ESRCH); KASSERT((pic->pic_flags & FLAG_MSI) != 0, ("%s: Found a non-MSI controller: %s", __func__, device_get_name(pic->pic_dev))); isrc = malloc(sizeof(*isrc) * count, M_INTRNG, M_WAITOK); for (i = 0; i < count; i++) { msi = (struct intr_map_data_msi *) intr_map_get_map_data(irqs[i]); KASSERT(msi->hdr.type == INTR_MAP_DATA_MSI, ("%s: irq %d map data is not MSI", __func__, irqs[i])); isrc[i] = msi->isrc; } err = MSI_RELEASE_MSI(pic->pic_dev, child, count, isrc); for (i = 0; i < count; i++) { if (isrc[i] != NULL) intr_unmap_irq(irqs[i]); } free(isrc, M_INTRNG); return (err); } int intr_alloc_msix(device_t pci, device_t child, intptr_t xref, int *irq) { struct intr_irqsrc *isrc; struct intr_pic *pic; device_t pdev; struct intr_map_data_msi *msi; int err; pic = pic_lookup(NULL, xref); if (pic == NULL) return (ESRCH); KASSERT((pic->pic_flags & FLAG_MSI) != 0, ("%s: Found a non-MSI controller: %s", __func__, device_get_name(pic->pic_dev))); err = MSI_ALLOC_MSIX(pic->pic_dev, child, &pdev, &isrc); if (err != 0) return (err); msi = (struct intr_map_data_msi *)intr_alloc_map_data( INTR_MAP_DATA_MSI, sizeof(*msi), M_WAITOK | M_ZERO); msi->isrc = isrc; *irq = intr_map_irq(pic->pic_dev, xref, (struct intr_map_data *)msi); return (0); } int intr_release_msix(device_t pci, device_t child, intptr_t xref, int irq) { struct intr_irqsrc *isrc; struct intr_pic *pic; struct intr_map_data_msi *msi; int err; pic = pic_lookup(NULL, xref); if (pic == NULL) return (ESRCH); KASSERT((pic->pic_flags & FLAG_MSI) != 0, ("%s: Found a non-MSI controller: %s", __func__, device_get_name(pic->pic_dev))); msi = (struct intr_map_data_msi *) intr_map_get_map_data(irq); KASSERT(msi->hdr.type == INTR_MAP_DATA_MSI, ("%s: irq %d map data is not MSI", __func__, irq)); isrc = msi->isrc; if (isrc == NULL) { 
intr_unmap_irq(irq); return (EINVAL); } err = MSI_RELEASE_MSIX(pic->pic_dev, child, isrc); intr_unmap_irq(irq); return (err); } int intr_map_msi(device_t pci, device_t child, intptr_t xref, int irq, uint64_t *addr, uint32_t *data) { struct intr_irqsrc *isrc; struct intr_pic *pic; int err; pic = pic_lookup(NULL, xref); if (pic == NULL) return (ESRCH); KASSERT((pic->pic_flags & FLAG_MSI) != 0, ("%s: Found a non-MSI controller: %s", __func__, device_get_name(pic->pic_dev))); isrc = intr_map_get_isrc(irq); if (isrc == NULL) return (EINVAL); err = MSI_MAP_MSI(pic->pic_dev, child, isrc, addr, data); return (err); } void dosoftints(void); void dosoftints(void) { } #ifdef SMP /* * Init interrupt controller on another CPU. */ void intr_pic_init_secondary(void) { /* * QQQ: Only root PIC is aware of other CPUs ??? */ KASSERT(intr_irq_root_dev != NULL, ("%s: no root attached", __func__)); //mtx_lock(&isrc_table_lock); PIC_INIT_SECONDARY(intr_irq_root_dev); //mtx_unlock(&isrc_table_lock); } #endif #ifdef DDB DB_SHOW_COMMAND(irqs, db_show_irqs) { u_int i, irqsum; u_long num; struct intr_irqsrc *isrc; for (irqsum = 0, i = 0; i < NIRQ; i++) { isrc = irq_sources[i]; if (isrc == NULL) continue; num = isrc->isrc_count != NULL ? isrc->isrc_count[0] : 0; db_printf("irq%-3u <%s>: cpu %02lx%s cnt %lu\n", i, isrc->isrc_name, isrc->isrc_cpu.__bits[0], isrc->isrc_flags & INTR_ISRCF_BOUND ? " (bound)" : "", num); irqsum += num; } db_printf("irq total %u\n", irqsum); } #endif /* * Interrupt mapping table functions. * * Please keep this part separate; it can be transformed into an * extension of standard resources. */ struct intr_map_entry { device_t dev; intptr_t xref; struct intr_map_data *map_data; struct intr_irqsrc *isrc; /* XXX TODO DISCONNECTED PICs */ /*int flags */ }; /* XXX Convert irq_map[] to dynamically expandable one. 
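 * (e.g. grown under irq_map_lock when intr_map_irq() finds no free
 * slot; for now the table is static and running out of entries
 * panics, see intr_map_irq() below)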
*/ static struct intr_map_entry *irq_map[2 * NIRQ]; static int irq_map_count = nitems(irq_map); static int irq_map_first_free_idx; static struct mtx irq_map_lock; static struct intr_irqsrc * intr_map_get_isrc(u_int res_id) { struct intr_irqsrc *isrc; mtx_lock(&irq_map_lock); if ((res_id >= irq_map_count) || (irq_map[res_id] == NULL)) { mtx_unlock(&irq_map_lock); return (NULL); } isrc = irq_map[res_id]->isrc; mtx_unlock(&irq_map_lock); return (isrc); } static void intr_map_set_isrc(u_int res_id, struct intr_irqsrc *isrc) { mtx_lock(&irq_map_lock); if ((res_id >= irq_map_count) || (irq_map[res_id] == NULL)) { mtx_unlock(&irq_map_lock); return; } irq_map[res_id]->isrc = isrc; mtx_unlock(&irq_map_lock); } /* * Get a copy of intr_map_entry data */ static struct intr_map_data * intr_map_get_map_data(u_int res_id) { struct intr_map_data *data; data = NULL; mtx_lock(&irq_map_lock); if (res_id >= irq_map_count || irq_map[res_id] == NULL) panic("Attempt to copy invalid resource id: %u\n", res_id); data = irq_map[res_id]->map_data; mtx_unlock(&irq_map_lock); return (data); } /* * Get a copy of intr_map_entry data */ static void intr_map_copy_map_data(u_int res_id, device_t *map_dev, intptr_t *map_xref, struct intr_map_data **data) { size_t len; len = 0; mtx_lock(&irq_map_lock); if (res_id >= irq_map_count || irq_map[res_id] == NULL) panic("Attempt to copy invalid resource id: %u\n", res_id); if (irq_map[res_id]->map_data != NULL) len = irq_map[res_id]->map_data->len; mtx_unlock(&irq_map_lock); if (len == 0) *data = NULL; else *data = malloc(len, M_INTRNG, M_WAITOK | M_ZERO); mtx_lock(&irq_map_lock); if (irq_map[res_id] == NULL) panic("Attempt to copy invalid resource id: %u\n", res_id); if (len != 0) { if (len != irq_map[res_id]->map_data->len) panic("Resource id: %u has changed.\n", res_id); memcpy(*data, irq_map[res_id]->map_data, len); } *map_dev = irq_map[res_id]->dev; *map_xref = irq_map[res_id]->xref; mtx_unlock(&irq_map_lock); } /* * Allocate and fill new entry in irq_map table. */ u_int intr_map_irq(device_t dev, intptr_t xref, struct intr_map_data *data) { u_int i; struct intr_map_entry *entry; /* Prepare new entry first. */ entry = malloc(sizeof(*entry), M_INTRNG, M_WAITOK | M_ZERO); entry->dev = dev; entry->xref = xref; entry->map_data = data; entry->isrc = NULL; mtx_lock(&irq_map_lock); for (i = irq_map_first_free_idx; i < irq_map_count; i++) { if (irq_map[i] == NULL) { irq_map[i] = entry; irq_map_first_free_idx = i + 1; mtx_unlock(&irq_map_lock); return (i); } } mtx_unlock(&irq_map_lock); /* XXX Expand irq_map table */ panic("IRQ mapping table is full."); } /* * Remove and free mapping entry. */ void intr_unmap_irq(u_int res_id) { struct intr_map_entry *entry; mtx_lock(&irq_map_lock); if ((res_id >= irq_map_count) || (irq_map[res_id] == NULL)) panic("Attempt to unmap invalid resource id: %u\n", res_id); entry = irq_map[res_id]; irq_map[res_id] = NULL; irq_map_first_free_idx = res_id; mtx_unlock(&irq_map_lock); intr_free_intr_map_data(entry->map_data); free(entry, M_INTRNG); } /* * Clone mapping entry. 
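 * In effect (sketch):
 *
 *	new_id = intr_map_clone_irq(old_id);
 *
 * allocates a fresh resource id carrying a copy of old_id's dev, xref
 * and map data; the isrc is resolved later, at activation time.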
*/ u_int intr_map_clone_irq(u_int old_res_id) { device_t map_dev; intptr_t map_xref; struct intr_map_data *data; intr_map_copy_map_data(old_res_id, &map_dev, &map_xref, &data); return (intr_map_irq(map_dev, map_xref, data)); } static void intr_map_init(void *dummy __unused) { mtx_init(&irq_map_lock, "intr map table", NULL, MTX_DEF); } SYSINIT(intr_map_init, SI_SUB_INTR, SI_ORDER_FIRST, intr_map_init, NULL); Index: stable/11/sys/kern/subr_syscall.c =================================================================== --- stable/11/sys/kern/subr_syscall.c (revision 331016) +++ stable/11/sys/kern/subr_syscall.c (revision 331017) @@ -1,267 +1,268 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (C) 1994, David Greenman * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * Copyright (C) 2010 Konstantin Belousov * * This code is derived from software contributed to Berkeley by * the University of Utah, and William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
 * * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 */ #include "opt_capsicum.h" #include "opt_ktrace.h" __FBSDID("$FreeBSD$"); #include <sys/capsicum.h> #include <sys/ktr.h> +#include <sys/vmmeter.h> #ifdef KTRACE #include <sys/uio.h> #include <sys/ktrace.h> #endif #include <security/audit/audit.h> static inline int syscallenter(struct thread *td) { struct proc *p; struct syscall_args *sa; int error, traced; PCPU_INC(cnt.v_syscall); p = td->td_proc; sa = &td->td_sa; td->td_pticks = 0; if (td->td_cowgen != p->p_cowgen) thread_cow_update(td); traced = (p->p_flag & P_TRACED) != 0; if (traced || td->td_dbgflags & TDB_USERWR) { PROC_LOCK(p); td->td_dbgflags &= ~TDB_USERWR; if (traced) td->td_dbgflags |= TDB_SCE; PROC_UNLOCK(p); } error = (p->p_sysent->sv_fetch_syscall_args)(td); #ifdef KTRACE if (KTRPOINT(td, KTR_SYSCALL)) ktrsyscall(sa->code, sa->narg, sa->args); #endif KTR_START4(KTR_SYSC, "syscall", syscallname(p, sa->code), (uintptr_t)td, "pid:%d", td->td_proc->p_pid, "arg0:%p", sa->args[0], "arg1:%p", sa->args[1], "arg2:%p", sa->args[2]); if (error == 0) { STOPEVENT(p, S_SCE, sa->narg); if (p->p_flag & P_TRACED) { PROC_LOCK(p); if (p->p_ptevents & PTRACE_SCE) ptracestop((td), SIGTRAP, NULL); PROC_UNLOCK(p); } if (td->td_dbgflags & TDB_USERWR) { /* * Reread syscall number and arguments if * debugger modified registers or memory. */ error = (p->p_sysent->sv_fetch_syscall_args)(td); #ifdef KTRACE if (KTRPOINT(td, KTR_SYSCALL)) ktrsyscall(sa->code, sa->narg, sa->args); #endif if (error != 0) goto retval; } #ifdef CAPABILITY_MODE /* * In capability mode, we only allow access to system calls * flagged with SYF_CAPENABLED. */ if (IN_CAPABILITY_MODE(td) && !(sa->callp->sy_flags & SYF_CAPENABLED)) { error = ECAPMODE; goto retval; } #endif error = syscall_thread_enter(td, sa->callp); if (error != 0) goto retval; #ifdef KDTRACE_HOOKS /* Give the syscall:::entry DTrace probe a chance to fire. */ if (systrace_probe_func != NULL && sa->callp->sy_entry != 0) (*systrace_probe_func)(sa, SYSTRACE_ENTRY, 0); #endif AUDIT_SYSCALL_ENTER(sa->code, td); error = (sa->callp->sy_call)(td, sa->args); AUDIT_SYSCALL_EXIT(error, td); /* Save the latest error return value. */ if ((td->td_pflags & TDP_NERRNO) == 0) td->td_errno = error; #ifdef KDTRACE_HOOKS /* Give the syscall:::return DTrace probe a chance to fire. */ if (systrace_probe_func != NULL && sa->callp->sy_return != 0) (*systrace_probe_func)(sa, SYSTRACE_RETURN, error ? -1 : td->td_retval[0]); #endif syscall_thread_exit(td, sa->callp); } retval: KTR_STOP4(KTR_SYSC, "syscall", syscallname(p, sa->code), (uintptr_t)td, "pid:%d", td->td_proc->p_pid, "error:%d", error, "retval0:%#lx", td->td_retval[0], "retval1:%#lx", td->td_retval[1]); if (traced) { PROC_LOCK(p); td->td_dbgflags &= ~TDB_SCE; PROC_UNLOCK(p); } (p->p_sysent->sv_set_syscall_retval)(td, error); return (error); } static inline void syscallret(struct thread *td, int error) { struct proc *p, *p2; struct syscall_args *sa; ksiginfo_t ksi; int traced, error1; KASSERT((td->td_pflags & TDP_FORKING) == 0, ("fork() did not clear TDP_FORKING upon completion")); p = td->td_proc; sa = &td->td_sa; if ((trap_enotcap || (p->p_flag2 & P2_TRAPCAP) != 0) && IN_CAPABILITY_MODE(td)) { error1 = (td->td_pflags & TDP_NERRNO) == 0 ? error : td->td_errno; if (error1 == ENOTCAPABLE || error1 == ECAPMODE) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGTRAP; ksi.ksi_errno = error1; ksi.ksi_code = TRAP_CAP; trapsignal(td, &ksi); } } /* * Handle reschedule and other end-of-syscall issues */ userret(td, td->td_frame); #ifdef KTRACE if (KTRPOINT(td, KTR_SYSRET)) { ktrsysret(sa->code, (td->td_pflags & TDP_NERRNO) == 0 ? 
error : td->td_errno, td->td_retval[0]); } #endif td->td_pflags &= ~TDP_NERRNO; if (p->p_flag & P_TRACED) { traced = 1; PROC_LOCK(p); td->td_dbgflags |= TDB_SCX; PROC_UNLOCK(p); } else traced = 0; /* * This works because errno is findable through the * register set. If we ever support an emulation where this * is not the case, this code will need to be revisited. */ STOPEVENT(p, S_SCX, sa->code); if (traced || (td->td_dbgflags & (TDB_EXEC | TDB_FORK)) != 0) { PROC_LOCK(p); /* * If tracing the execed process, trap to the debugger * so that breakpoints can be set before the program * executes. If debugger requested tracing of syscall * returns, do it now too. */ if (traced && ((td->td_dbgflags & (TDB_FORK | TDB_EXEC)) != 0 || (p->p_ptevents & PTRACE_SCX) != 0)) ptracestop(td, SIGTRAP, NULL); td->td_dbgflags &= ~(TDB_SCX | TDB_EXEC | TDB_FORK); PROC_UNLOCK(p); } if (td->td_pflags & TDP_RFPPWAIT) { /* * Preserve synchronization semantics of vfork. If * waiting for child to exec or exit, fork set * P_PPWAIT on child, and there we sleep on our proc * (in case of exit). * * Do it after the ptracestop() above is finished, to * not block our debugger until child execs or exits * to finish vfork wait. */ td->td_pflags &= ~TDP_RFPPWAIT; p2 = td->td_rfppwait_p; again: PROC_LOCK(p2); while (p2->p_flag & P_PPWAIT) { PROC_LOCK(p); if (thread_suspend_check_needed()) { PROC_UNLOCK(p2); thread_suspend_check(0); PROC_UNLOCK(p); goto again; } else { PROC_UNLOCK(p); } cv_timedwait(&p2->p_pwait, &p2->p_mtx, hz); } PROC_UNLOCK(p2); if (td->td_dbgflags & TDB_VFORK) { PROC_LOCK(p); if (p->p_ptevents & PTRACE_VFORK) ptracestop(td, SIGTRAP, NULL); td->td_dbgflags &= ~TDB_VFORK; PROC_UNLOCK(p); } } } Index: stable/11/sys/mips/include/intr_machdep.h =================================================================== --- stable/11/sys/mips/include/intr_machdep.h (revision 331016) +++ stable/11/sys/mips/include/intr_machdep.h (revision 331017) @@ -1,77 +1,78 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004 Juli Mallett * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #ifndef _MACHINE_INTR_MACHDEP_H_ #define _MACHINE_INTR_MACHDEP_H_ +#include #include #if defined(CPU_RMI) || defined(CPU_NLM) #define XLR_MAX_INTR 64 #else #define NHARD_IRQS 6 #define NSOFT_IRQS 2 #endif struct trapframe; void cpu_init_interrupts(void); void cpu_establish_hardintr(const char *, driver_filter_t *, driver_intr_t *, void *, int, int, void **); void cpu_establish_softintr(const char *, driver_filter_t *, void (*)(void*), void *, int, int, void **); void cpu_intr(struct trapframe *); /* * Allow a platform to override the default hard interrupt mask and unmask * functions. The 'arg' can be cast safely to an 'int' and holds the mips * hard interrupt number to mask or unmask. */ typedef void (*cpu_intr_mask_t)(void *arg); typedef void (*cpu_intr_unmask_t)(void *arg); void cpu_set_hardintr_mask_func(cpu_intr_mask_t func); void cpu_set_hardintr_unmask_func(cpu_intr_unmask_t func); /* * Opaque datatype that represents intr counter */ typedef unsigned long* mips_intrcnt_t; mips_intrcnt_t mips_intrcnt_create(const char *); void mips_intrcnt_setname(mips_intrcnt_t, const char *); static __inline void mips_intrcnt_inc(mips_intrcnt_t counter) { if (counter) atomic_add_long(counter, 1); PCPU_INC(cnt.v_intr); } #endif /* !_MACHINE_INTR_MACHDEP_H_ */ Index: stable/11/sys/mips/mips/minidump_machdep.c =================================================================== --- stable/11/sys/mips/mips/minidump_machdep.c (revision 331016) +++ stable/11/sys/mips/mips/minidump_machdep.c (revision 331017) @@ -1,342 +1,343 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2010 Oleksandr Tymoshenko * Copyright (c) 2008 Semihalf, Grzegorz Bernacki * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * from: FreeBSD: src/sys/arm/arm/minidump_machdep.c v214223 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include CTASSERT(sizeof(struct kerneldumpheader) == 512); /* * Don't touch the first SIZEOF_METADATA bytes on the dump device. This * is to protect us from metadata and to protect metadata from us. 
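 * (minidumpsys() therefore places the dump at the very end of the
 * device: dumplo = mediaoffset + mediasize - dumpsize - 2 * sizeof(kdh),
 * and bails out with ENOSPC if that would reach back into this area)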
*/ #define SIZEOF_METADATA (64*1024) uint32_t *vm_page_dump; int vm_page_dump_size; static struct kerneldumpheader kdh; static off_t dumplo; static off_t origdumplo; /* Handle chunked writes. */ static uint64_t counter, progress; /* Just auxiliary bufffer */ static char tmpbuffer[PAGE_SIZE]; extern pd_entry_t *kernel_segmap; CTASSERT(sizeof(*vm_page_dump) == 4); static int is_dumpable(vm_paddr_t pa) { int i; for (i = 0; dump_avail[i] != 0 || dump_avail[i + 1] != 0; i += 2) { if (pa >= dump_avail[i] && pa < dump_avail[i + 1]) return (1); } return (0); } void dump_add_page(vm_paddr_t pa) { int idx, bit; pa >>= PAGE_SHIFT; idx = pa >> 5; /* 2^5 = 32 */ bit = pa & 31; atomic_set_int(&vm_page_dump[idx], 1ul << bit); } void dump_drop_page(vm_paddr_t pa) { int idx, bit; pa >>= PAGE_SHIFT; idx = pa >> 5; /* 2^5 = 32 */ bit = pa & 31; atomic_clear_int(&vm_page_dump[idx], 1ul << bit); } #define PG2MB(pgs) (((pgs) + (1 << 8) - 1) >> 8) static int write_buffer(struct dumperinfo *di, char *ptr, size_t sz) { size_t len; int error, c; u_int maxdumpsz; maxdumpsz = di->maxiosize; if (maxdumpsz == 0) /* seatbelt */ maxdumpsz = PAGE_SIZE; error = 0; while (sz) { len = min(maxdumpsz, sz); counter += len; progress -= len; if (counter >> 22) { printf(" %jd", PG2MB(progress >> PAGE_SHIFT)); counter &= (1<<22) - 1; } if (ptr) { error = dump_write(di, ptr, 0, dumplo, len); if (error) return (error); dumplo += len; ptr += len; sz -= len; } else { panic("pa is not supported"); } /* Check for user abort. */ c = cncheckc(); if (c == 0x03) return (ECANCELED); if (c != -1) printf(" (CTRL-C to abort) "); } return (0); } int minidumpsys(struct dumperinfo *di) { struct minidumphdr mdhdr; uint64_t dumpsize; uint32_t ptesize; uint32_t bits; vm_paddr_t pa; vm_offset_t prev_pte = 0; uint32_t count = 0; vm_offset_t va; pt_entry_t *pte; int i, bit, error; void *dump_va; /* Flush cache */ mips_dcache_wbinv_all(); counter = 0; /* Walk page table pages, set bits in vm_page_dump */ ptesize = 0; for (va = VM_MIN_KERNEL_ADDRESS; va < kernel_vm_end; va += NBPDR) { ptesize += PAGE_SIZE; pte = pmap_pte(kernel_pmap, va); KASSERT(pte != NULL, ("pte for %jx is NULL", (uintmax_t)va)); for (i = 0; i < NPTEPG; i++) { if (pte_test(&pte[i], PTE_V)) { pa = TLBLO_PTE_TO_PA(pte[i]); if (is_dumpable(pa)) dump_add_page(pa); } } } /* * Now mark pages from 0 to phys_avail[0], that's where kernel * and pages allocated by pmap_steal reside */ for (pa = 0; pa < phys_avail[0]; pa += PAGE_SIZE) { if (is_dumpable(pa)) dump_add_page(pa); } /* Calculate dump size. */ dumpsize = ptesize; dumpsize += round_page(msgbufp->msg_size); dumpsize += round_page(vm_page_dump_size); for (i = 0; i < vm_page_dump_size / sizeof(*vm_page_dump); i++) { bits = vm_page_dump[i]; while (bits) { bit = ffs(bits) - 1; pa = (((uint64_t)i * sizeof(*vm_page_dump) * NBBY) + bit) * PAGE_SIZE; /* Clear out undumpable pages now if needed */ if (is_dumpable(pa)) dumpsize += PAGE_SIZE; else dump_drop_page(pa); bits &= ~(1ul << bit); } } dumpsize += PAGE_SIZE; /* Determine dump offset on device. 
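The vm_page_dump bitmap above packs one bit per physical page into 32-bit words: the page frame number's low five bits select the bit, the remaining bits select the word. A standalone, user-space restatement of the same arithmetic (assuming 4 KB pages, as on MIPS):

/* Standalone illustration of the dump_add_page()/dump_drop_page()
 * bitmap arithmetic above: one bit per physical page, packed into
 * 32-bit words. */
#include <stdint.h>
#include <stdio.h>

#define DEMO_PAGE_SHIFT 12              /* assumption: 4 KB pages */

static uint32_t bitmap[1024];           /* 1024 * 32 pages = 128 MB */

static void
page_mark(uint64_t pa, int set)
{
    uint64_t pfn = pa >> DEMO_PAGE_SHIFT;  /* page frame number */
    int idx = pfn >> 5;                    /* word: pfn / 32 */
    int bit = pfn & 31;                    /* bit within the word */

    if (set)
        bitmap[idx] |= 1u << bit;
    else
        bitmap[idx] &= ~(1u << bit);
}

int
main(void)
{
    page_mark(0x00345000, 1);
    printf("word %d bit %d\n", 0x345 >> 5, 0x345 & 31);
    return (0);
}

The kernel version performs the updates with atomic_set_int()/atomic_clear_int(), since pages may be added or dropped while other CPUs are still running.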
*/ if (di->mediasize < SIZEOF_METADATA + dumpsize + sizeof(kdh) * 2) { error = ENOSPC; goto fail; } origdumplo = dumplo = di->mediaoffset + di->mediasize - dumpsize; dumplo -= sizeof(kdh) * 2; progress = dumpsize; /* Initialize mdhdr */ bzero(&mdhdr, sizeof(mdhdr)); strcpy(mdhdr.magic, MINIDUMP_MAGIC); mdhdr.version = MINIDUMP_VERSION; mdhdr.msgbufsize = msgbufp->msg_size; mdhdr.bitmapsize = vm_page_dump_size; mdhdr.ptesize = ptesize; mdhdr.kernbase = VM_MIN_KERNEL_ADDRESS; mkdumpheader(&kdh, KERNELDUMPMAGIC, KERNELDUMP_MIPS_VERSION, dumpsize, di->blocksize); printf("Physical memory: %ju MB\n", (uintmax_t)ptoa((uintmax_t)physmem) / 1048576); printf("Dumping %llu MB:", (long long)dumpsize >> 20); /* Dump leader */ error = dump_write(di, &kdh, 0, dumplo, sizeof(kdh)); if (error) goto fail; dumplo += sizeof(kdh); /* Dump my header */ bzero(tmpbuffer, sizeof(tmpbuffer)); bcopy(&mdhdr, tmpbuffer, sizeof(mdhdr)); error = write_buffer(di, tmpbuffer, PAGE_SIZE); if (error) goto fail; /* Dump msgbuf up front */ error = write_buffer(di, (char *)msgbufp->msg_ptr, round_page(msgbufp->msg_size)); if (error) goto fail; /* Dump bitmap */ error = write_buffer(di, (char *)vm_page_dump, round_page(vm_page_dump_size)); if (error) goto fail; /* Dump kernel page table pages */ for (va = VM_MIN_KERNEL_ADDRESS; va < kernel_vm_end; va += NBPDR) { pte = pmap_pte(kernel_pmap, va); KASSERT(pte != NULL, ("pte for %jx is NULL", (uintmax_t)va)); if (!count) { prev_pte = (vm_offset_t)pte; count++; } else { if ((vm_offset_t)pte == (prev_pte + count * PAGE_SIZE)) count++; else { error = write_buffer(di, (char*)prev_pte, count * PAGE_SIZE); if (error) goto fail; count = 1; prev_pte = (vm_offset_t)pte; } } } if (count) { error = write_buffer(di, (char*)prev_pte, count * PAGE_SIZE); if (error) goto fail; count = 0; prev_pte = 0; } /* Dump memory chunks page by page*/ for (i = 0; i < vm_page_dump_size / sizeof(*vm_page_dump); i++) { bits = vm_page_dump[i]; while (bits) { bit = ffs(bits) - 1; pa = (((uint64_t)i * sizeof(*vm_page_dump) * NBBY) + bit) * PAGE_SIZE; dump_va = pmap_kenter_temporary(pa, 0); error = write_buffer(di, dump_va, PAGE_SIZE); if (error) goto fail; pmap_kenter_temporary_free(pa); bits &= ~(1ul << bit); } } /* Dump trailer */ error = dump_write(di, &kdh, 0, dumplo, sizeof(kdh)); if (error) goto fail; dumplo += sizeof(kdh); /* Signal completion, signoff and exit stage left. */ dump_write(di, NULL, 0, 0, 0); printf("\nDump complete\n"); return (0); fail: if (error < 0) error = -error; if (error == ECANCELED) printf("\nDump aborted\n"); else if (error == ENOSPC) printf("\nDump failed. Partition too small.\n"); else printf("\n** DUMP FAILED (ERROR %d) **\n", error); return (error); } Index: stable/11/sys/mips/mips/uma_machdep.c =================================================================== --- stable/11/sys/mips/mips/uma_machdep.c (revision 331016) +++ stable/11/sys/mips/mips/uma_machdep.c (revision 331017) @@ -1,94 +1,95 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2003 Alan L. Cox * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
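The page-table dump loop above coalesces virtually contiguous PTE pages (tracked in prev_pte/count) into a single write_buffer() call, flushing a run only when contiguity breaks. A distilled form of that run-length batching pattern, with flush() standing in for write_buffer():

/* Distilled form of the contiguous-run batching used when dumping
 * kernel page-table pages above: accumulate adjacent buffers and
 * flush a run only when contiguity breaks. */
#include <stdint.h>
#include <stddef.h>

#define DEMO_PAGE_SIZE 4096

static int
dump_runs(uintptr_t *bufs, int n, int (*flush)(uintptr_t, size_t))
{
    uintptr_t run_start = 0;
    int count = 0, error, i;

    for (i = 0; i < n; i++) {
        if (count == 0) {
            run_start = bufs[i];
            count = 1;
        } else if (bufs[i] ==
            run_start + (uintptr_t)count * DEMO_PAGE_SIZE) {
            count++;                /* extends the current run */
        } else {
            error = flush(run_start, (size_t)count * DEMO_PAGE_SIZE);
            if (error != 0)
                return (error);
            run_start = bufs[i];
            count = 1;
        }
    }
    if (count != 0)
        return (flush(run_start, (size_t)count * DEMO_PAGE_SIZE));
    return (0);
}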
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include +#include #include #include #include #include #include #include #include void * uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait) { vm_paddr_t pa; vm_page_t m; int pflags; void *va; *flags = UMA_SLAB_PRIV; pflags = malloc2vm_flags(wait) | VM_ALLOC_WIRED; #ifndef __mips_n64 pflags &= ~(VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL); pflags |= VM_ALLOC_NOWAIT; #endif for (;;) { m = vm_page_alloc_freelist(VM_FREELIST_DIRECT, pflags); #ifndef __mips_n64 if (m == NULL && vm_page_reclaim_contig(pflags, 1, 0, MIPS_KSEG0_LARGEST_PHYS, PAGE_SIZE, 0)) continue; #endif if (m == NULL) { if (wait & M_NOWAIT) return (NULL); else VM_WAIT; } else break; } pa = VM_PAGE_TO_PHYS(m); va = (void *)MIPS_PHYS_TO_DIRECT(pa); if ((wait & M_ZERO) && (m->flags & PG_ZERO) == 0) bzero(va, PAGE_SIZE); return (va); } void uma_small_free(void *mem, vm_size_t size, u_int8_t flags) { vm_page_t m; vm_paddr_t pa; pa = MIPS_DIRECT_TO_PHYS((vm_offset_t)mem); m = PHYS_TO_VM_PAGE(pa); m->wire_count--; vm_page_free(m); atomic_subtract_int(&vm_cnt.v_wire_count, 1); } Index: stable/11/sys/ofed/drivers/infiniband/core/umem.c =================================================================== --- stable/11/sys/ofed/drivers/infiniband/core/umem.c (revision 331016) +++ stable/11/sys/ofed/drivers/infiniband/core/umem.c (revision 331017) @@ -1,445 +1,446 @@ /* * Copyright (c) 2005 Topspin Communications. All rights reserved. * Copyright (c) 2005 Cisco Systems. All rights reserved. * Copyright (c) 2005 Mellanox Technologies. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #define LINUXKPI_PARAM_PREFIX ibcore_ #include #include #include #include #include #include #include #include #include +#include #include #include #include "uverbs.h" #define IB_UMEM_MAX_PAGE_CHUNK (PAGE_SIZE / sizeof (struct page *)) static int allow_weak_ordering; module_param_named(weak_ordering, allow_weak_ordering, int, 0444); MODULE_PARM_DESC(weak_ordering, "Allow weak ordering for data registered memory"); static struct ib_umem *peer_umem_get(struct ib_peer_memory_client *ib_peer_mem, struct ib_umem *umem, unsigned long addr, int dmasync, int invalidation_supported) { int ret; const struct peer_memory_client *peer_mem = ib_peer_mem->peer_mem; struct invalidation_ctx *invalidation_ctx = NULL; umem->ib_peer_mem = ib_peer_mem; if (invalidation_supported) { invalidation_ctx = kzalloc(sizeof(*invalidation_ctx), GFP_KERNEL); if (!invalidation_ctx) { ret = -ENOMEM; goto out; } umem->invalidation_ctx = invalidation_ctx; invalidation_ctx->umem = umem; mutex_lock(&ib_peer_mem->lock); invalidation_ctx->context_ticket = ib_peer_insert_context(ib_peer_mem, invalidation_ctx); /* unlock before calling get pages to prevent a dead-lock from the callback */ mutex_unlock(&ib_peer_mem->lock); } ret = peer_mem->get_pages(addr, umem->length, umem->writable, 1, &umem->sg_head, umem->peer_mem_client_context, invalidation_ctx ? (void *)invalidation_ctx->context_ticket : NULL); if (invalidation_ctx) { /* taking the lock back, checking that wasn't invalidated at that time */ mutex_lock(&ib_peer_mem->lock); if (invalidation_ctx->peer_invalidated) { printk(KERN_ERR "peer_umem_get: pages were invalidated by peer\n"); ret = -EINVAL; } } if (ret) goto out; umem->page_size = peer_mem->get_page_size (umem->peer_mem_client_context); if (umem->page_size <= 0) goto put_pages; umem->offset = addr & ((unsigned long)umem->page_size - 1); ret = peer_mem->dma_map(&umem->sg_head, umem->peer_mem_client_context, umem->context->device->dma_device, dmasync, &umem->nmap); if (ret) goto put_pages; ib_peer_mem->stats.num_reg_pages += umem->nmap * (umem->page_size >> PAGE_SHIFT); ib_peer_mem->stats.num_alloc_mrs += 1; return umem; put_pages: peer_mem->put_pages(umem->peer_mem_client_context, &umem->sg_head); out: if (invalidation_ctx) { ib_peer_remove_context(ib_peer_mem, invalidation_ctx->context_ticket); mutex_unlock(&umem->ib_peer_mem->lock); kfree(invalidation_ctx); } ib_put_peer_client(ib_peer_mem, umem->peer_mem_client_context, umem->peer_mem_srcu_key); kfree(umem); return ERR_PTR(ret); } static void peer_umem_release(struct ib_umem *umem) { struct ib_peer_memory_client *ib_peer_mem = umem->ib_peer_mem; const struct peer_memory_client *peer_mem = ib_peer_mem->peer_mem; struct invalidation_ctx *invalidation_ctx = umem->invalidation_ctx; if (invalidation_ctx) { int peer_callback; int inflight_invalidation; /* If we are not under peer callback we must take the lock before removing * core ticket from the tree and releasing its umem. * It will let any inflight callbacks to be ended safely. 
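As the comments in peer_umem_get() note, the lock is dropped around the get_pages() upcall and re-taken afterwards to re-check peer_invalidated: holding it across a callback that can re-enter through the invalidation path would deadlock. A generic, pthread-based sketch of that unlock-around-upcall pattern (all names invented for illustration):

/* Generic sketch of the pattern used by peer_umem_get() above:
 * drop the lock before calling out, then re-take it and re-check
 * state the callee may have changed behind our back. */
#include <pthread.h>
#include <errno.h>
#include <stdbool.h>

struct registry {
    pthread_mutex_t lock;
    bool invalidated;           /* may be set by the upcall path */
};

static int
register_and_fetch(struct registry *r, int (*fetch)(void))
{
    int error;

    pthread_mutex_lock(&r->lock);
    /* ... insert a ticket the callback can look up ... */
    pthread_mutex_unlock(&r->lock); /* must not hold across fetch() */

    error = fetch();                /* may re-enter and invalidate */

    pthread_mutex_lock(&r->lock);
    if (r->invalidated)             /* re-check after reacquiring */
        error = error != 0 ? error : EINVAL;
    pthread_mutex_unlock(&r->lock);
    return (error);
}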
* If we are under peer callback or under error flow of reg_mr so that context * wasn't activated yet lock was already taken. */ if (invalidation_ctx->func && !invalidation_ctx->peer_callback) mutex_lock(&ib_peer_mem->lock); ib_peer_remove_context(ib_peer_mem, invalidation_ctx->context_ticket); /* make sure to check inflight flag after took the lock and remove from tree. * in addition, from that point using local variables for peer_callback and * inflight_invalidation as after the complete invalidation_ctx can't be accessed * any more as it may be freed by the callback. */ peer_callback = invalidation_ctx->peer_callback; inflight_invalidation = invalidation_ctx->inflight_invalidation; if (inflight_invalidation) complete(&invalidation_ctx->comp); /* On peer callback lock is handled externally */ if (!peer_callback) /* unlocking before put_pages */ mutex_unlock(&ib_peer_mem->lock); /* in case under callback context or callback is pending let it free the invalidation context */ if (!peer_callback && !inflight_invalidation) kfree(invalidation_ctx); } peer_mem->dma_unmap(&umem->sg_head, umem->peer_mem_client_context, umem->context->device->dma_device); peer_mem->put_pages(&umem->sg_head, umem->peer_mem_client_context); ib_peer_mem->stats.num_dereg_pages += umem->nmap * (umem->page_size >> PAGE_SHIFT); ib_peer_mem->stats.num_dealloc_mrs += 1; ib_put_peer_client(ib_peer_mem, umem->peer_mem_client_context, umem->peer_mem_srcu_key); kfree(umem); return; } static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty) { vm_object_t object; struct scatterlist *sg; struct page *page; int i; object = NULL; if (umem->nmap > 0) ib_dma_unmap_sg(dev, umem->sg_head.sgl, umem->nmap, DMA_BIDIRECTIONAL); for_each_sg(umem->sg_head.sgl, sg, umem->npages, i) { page = sg_page(sg); if (umem->writable && dirty) { if (object && object != page->object) VM_OBJECT_WUNLOCK(object); if (object != page->object) { object = page->object; VM_OBJECT_WLOCK(object); } vm_page_dirty(page); } } sg_free_table(&umem->sg_head); if (object) VM_OBJECT_WUNLOCK(object); } void ib_umem_activate_invalidation_notifier(struct ib_umem *umem, umem_invalidate_func_t func, void *cookie) { struct invalidation_ctx *invalidation_ctx = umem->invalidation_ctx; invalidation_ctx->func = func; invalidation_ctx->cookie = cookie; /* from that point any pending invalidations can be called */ mutex_unlock(&umem->ib_peer_mem->lock); return; } EXPORT_SYMBOL(ib_umem_activate_invalidation_notifier); /** * ib_umem_get - Pin and DMA map userspace memory. * @context: userspace context to pin memory for * @addr: userspace virtual address to start at * @size: length of region to pin * @access: IB_ACCESS_xxx flags for memory being pinned * @dmasync: flush in-flight DMA when the memory region is written */ struct ib_umem *ib_umem_get_ex(struct ib_ucontext *context, unsigned long addr, size_t size, int access, int dmasync, int invalidation_supported) { struct ib_umem *umem; struct proc *proc; pmap_t pmap; vm_offset_t end, last, start; vm_size_t npages; int error; int ret; int ents; int i; DEFINE_DMA_ATTRS(attrs); struct scatterlist *sg, *sg_list_start; int need_release = 0; error = priv_check(curthread, PRIV_VM_MLOCK); if (error) return ERR_PTR(-error); last = addr + size; start = addr & PAGE_MASK; /* Use the linux PAGE_MASK definition. */ end = roundup2(last, PAGE_SIZE); /* Use PAGE_MASK safe operation. 
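ib_umem_get_ex() above converts the user range [addr, addr + size) into page-aligned [start, end) and rejects arithmetic wraparound before computing npages. A distilled version of that setup, assuming 4 KB pages; the kernel uses roundup2(), which is equivalent for a power-of-two page size:

/* Distilled page-range setup from the top of ib_umem_get_ex():
 * align the range outward to page boundaries and reject wraparound
 * before counting pages.  Assumes 4 KB pages. */
#include <stdint.h>

#define DEMO_PAGE_SIZE  4096ul
#define DEMO_PAGE_MASK  (~(DEMO_PAGE_SIZE - 1))  /* Linux-style mask */

static int
range_to_pages(unsigned long addr, unsigned long size,
    unsigned long *npages)
{
    unsigned long last = addr + size;
    unsigned long start = addr & DEMO_PAGE_MASK;          /* round down */
    unsigned long end = (last + DEMO_PAGE_SIZE - 1) &
        DEMO_PAGE_MASK;                                   /* round up */

    if (last < addr || end < addr)  /* wrapped past top of memory */
        return (-1);
    *npages = (end - start) / DEMO_PAGE_SIZE;
    return (0);
}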
*/ if (last < addr || end < addr) return ERR_PTR(-EINVAL); npages = atop(end - start); if (npages > vm_page_max_wired) return ERR_PTR(-ENOMEM); umem = kzalloc(sizeof *umem, GFP_KERNEL); if (!umem) return ERR_PTR(-ENOMEM); proc = curthread->td_proc; PROC_LOCK(proc); if (ptoa(npages + pmap_wired_count(vm_map_pmap(&proc->p_vmspace->vm_map))) > lim_cur_proc(proc, RLIMIT_MEMLOCK)) { PROC_UNLOCK(proc); kfree(umem); return ERR_PTR(-ENOMEM); } PROC_UNLOCK(proc); if (npages + vm_cnt.v_wire_count > vm_page_max_wired) { kfree(umem); return ERR_PTR(-EAGAIN); } error = vm_map_wire(&proc->p_vmspace->vm_map, start, end, VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES | (umem->writable ? VM_MAP_WIRE_WRITE : 0)); if (error != KERN_SUCCESS) { kfree(umem); return ERR_PTR(-ENOMEM); } umem->context = context; umem->length = size; umem->offset = addr & ~PAGE_MASK; umem->page_size = PAGE_SIZE; umem->start = addr; /* * We ask for writable memory if any access flags other than * "remote read" are set. "Local write" and "remote write" * obviously require write access. "Remote atomic" can do * things like fetch and add, which will modify memory, and * "MW bind" can change permissions by binding a window. */ umem->writable = !!(access & ~IB_ACCESS_REMOTE_READ); if (invalidation_supported || context->peer_mem_private_data) { struct ib_peer_memory_client *peer_mem_client; peer_mem_client = ib_get_peer_client(context, addr, size, &umem->peer_mem_client_context, &umem->peer_mem_srcu_key); if (peer_mem_client) return peer_umem_get(peer_mem_client, umem, addr, dmasync, invalidation_supported); } umem->hugetlb = 0; pmap = vm_map_pmap(&proc->p_vmspace->vm_map); if (npages == 0) { ret = -EINVAL; goto out; } ret = sg_alloc_table(&umem->sg_head, npages, GFP_KERNEL); if (ret) goto out; need_release = 1; sg_list_start = umem->sg_head.sgl; while (npages) { ents = min_t(int, npages, IB_UMEM_MAX_PAGE_CHUNK); umem->npages += ents; for_each_sg(sg_list_start, sg, ents, i) { vm_paddr_t pa; pa = pmap_extract(pmap, start); if (pa == 0) { ret = -ENOMEM; goto out; } sg_set_page(sg, PHYS_TO_VM_PAGE(pa), PAGE_SIZE, 0); npages--; start += PAGE_SIZE; } /* preparing for next loop */ sg_list_start = sg; } umem->nmap = ib_dma_map_sg_attrs(context->device, umem->sg_head.sgl, umem->npages, DMA_BIDIRECTIONAL, &attrs); if (umem->nmap != umem->npages) { ret = -ENOMEM; goto out; } out: if (ret < 0) { if (need_release) __ib_umem_release(context->device, umem, 0); kfree(umem); } return ret < 0 ? ERR_PTR(ret) : umem; } EXPORT_SYMBOL(ib_umem_get_ex); struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, size_t size, int access, int dmasync) { return ib_umem_get_ex(context, addr, size, access, dmasync, 0); } EXPORT_SYMBOL(ib_umem_get); /** * ib_umem_release - release memory pinned with ib_umem_get * @umem: umem struct to release */ void ib_umem_release(struct ib_umem *umem) { vm_offset_t addr, end, last, start; vm_size_t size; int error; if (umem->ib_peer_mem) { peer_umem_release(umem); return; } __ib_umem_release(umem->context->device, umem, 1); if (umem->context->closing) { kfree(umem); return; } error = priv_check(curthread, PRIV_VM_MUNLOCK); if (error) return; addr = umem->start; size = umem->length; last = addr + size; start = addr & PAGE_MASK; /* Use the linux PAGE_MASK definition. */ end = roundup2(last, PAGE_SIZE); /* Use PAGE_MASK safe operation. 
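Before wiring, the function charges npages against both the per-process RLIMIT_MEMLOCK (via pmap_wired_count() under PROC_LOCK) and the global vm_page_max_wired cap, failing with different errnos for each. A sketch of that two-level admission check, with plain counters standing in for the kernel's accounting:

/* Sketch of the two-level wiring admission check performed above:
 * charge the request against a per-process limit, then a global
 * cap.  Plain integers stand in for the kernel's counters. */
#include <stdbool.h>
#include <stddef.h>

struct wire_limits {
    size_t proc_wired;      /* pages already wired by this process */
    size_t proc_max;        /* RLIMIT_MEMLOCK, in pages */
    size_t global_wired;    /* vm_cnt.v_wire_count analogue */
    size_t global_max;      /* vm_page_max_wired analogue */
};

static bool
may_wire(const struct wire_limits *l, size_t npages)
{
    if (l->proc_wired + npages > l->proc_max)
        return (false);     /* per-process limit: ENOMEM above */
    if (l->global_wired + npages > l->global_max)
        return (false);     /* global limit: EAGAIN above */
    return (true);
}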
*/ vm_map_unwire(&curthread->td_proc->p_vmspace->vm_map, start, end, VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); kfree(umem); } EXPORT_SYMBOL(ib_umem_release); int ib_umem_page_count(struct ib_umem *umem) { int shift; int i; int n; struct scatterlist *sg; shift = ilog2(umem->page_size); n = 0; for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) n += sg_dma_len(sg) >> shift; return n; } EXPORT_SYMBOL(ib_umem_page_count); Index: stable/11/sys/powerpc/powerpc/uma_machdep.c =================================================================== --- stable/11/sys/powerpc/powerpc/uma_machdep.c (revision 331016) +++ stable/11/sys/powerpc/powerpc/uma_machdep.c (revision 331017) @@ -1,98 +1,99 @@ /*- * Copyright (c) 2003 The FreeBSD Project * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include static int hw_uma_mdpages; SYSCTL_INT(_hw, OID_AUTO, uma_mdpages, CTLFLAG_RD, &hw_uma_mdpages, 0, "UMA MD pages in use"); void * uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait) { void *va; vm_paddr_t pa; vm_page_t m; *flags = UMA_SLAB_PRIV; m = vm_page_alloc(NULL, 0, malloc2vm_flags(wait) | VM_ALLOC_WIRED | VM_ALLOC_NOOBJ); if (m == NULL) return (NULL); pa = VM_PAGE_TO_PHYS(m); /* On book-e sizeof(void *) < sizeof(vm_paddr_t) */ if ((vm_offset_t)pa != pa) return (NULL); va = (void *)(vm_offset_t)pa; if (!hw_direct_map) pmap_kenter((vm_offset_t)va, VM_PAGE_TO_PHYS(m)); if ((wait & M_ZERO) && (m->flags & PG_ZERO) == 0) bzero(va, PAGE_SIZE); atomic_add_int(&hw_uma_mdpages, 1); return (va); } void uma_small_free(void *mem, vm_size_t size, u_int8_t flags) { vm_page_t m; if (!hw_direct_map) pmap_remove(kernel_pmap,(vm_offset_t)mem, (vm_offset_t)mem + PAGE_SIZE); m = PHYS_TO_VM_PAGE((vm_offset_t)mem); m->wire_count--; vm_page_free(m); atomic_subtract_int(&vm_cnt.v_wire_count, 1); atomic_subtract_int(&hw_uma_mdpages, 1); } Index: stable/11/sys/sparc64/sparc64/intr_machdep.c =================================================================== --- stable/11/sys/sparc64/sparc64/intr_machdep.c (revision 331016) +++ stable/11/sys/sparc64/sparc64/intr_machdep.c (revision 331017) @@ -1,559 +1,560 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1991 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /*- * Copyright (c) 2001 Jake Burkholder. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. 
* 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)isa.c 7.2 (Berkeley) 5/13/91 * form: src/sys/i386/isa/intr_machdep.c,v 1.57 2001/07/20 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #define MAX_STRAY_LOG 5 CTASSERT((1 << IV_SHIFT) == sizeof(struct intr_vector)); ih_func_t *intr_handlers[PIL_MAX]; uint16_t pil_countp[PIL_MAX]; static uint16_t pil_stray_count[PIL_MAX]; struct intr_vector intr_vectors[IV_MAX]; uint16_t intr_countp[IV_MAX]; static uint16_t intr_stray_count[IV_MAX]; static const char *const pil_names[] = { "stray", "low", /* PIL_LOW */ "preempt", /* PIL_PREEMPT */ "ithrd", /* PIL_ITHREAD */ "rndzvs", /* PIL_RENDEZVOUS */ "ast", /* PIL_AST */ "hardclock", /* PIL_HARDCLOCK */ "stray", "stray", "stray", "stray", "filter", /* PIL_FILTER */ "bridge", /* PIL_BRIDGE */ "stop", /* PIL_STOP */ "tick", /* PIL_TICK */ }; /* protect the intr_vectors table */ static struct sx intr_table_lock; /* protect intrcnt_index */ static struct mtx intrcnt_lock; #ifdef SMP static int assign_cpu; static void intr_assign_next_cpu(struct intr_vector *iv); static void intr_shuffle_irqs(void *arg __unused); #endif static int intr_assign_cpu(void *arg, int cpu); static void intr_execute_handlers(void *); static void intr_stray_level(struct trapframe *); static void intr_stray_vector(void *); static int intrcnt_setname(const char *, int); static void intrcnt_updatename(int, const char *, int); static void intrcnt_updatename(int vec, const char *name, int ispil) { static int intrcnt_index, stray_pil_index, stray_vec_index; int name_index; mtx_lock_spin(&intrcnt_lock); if (intrnames[0] == '\0') { /* for bitbucket */ if (bootverbose) printf("initalizing intr_countp\n"); intrcnt_setname("???", intrcnt_index++); stray_vec_index = intrcnt_index++; intrcnt_setname("stray", stray_vec_index); for (name_index = 0; name_index < IV_MAX; name_index++) intr_countp[name_index] = stray_vec_index; stray_pil_index = intrcnt_index++; intrcnt_setname("pil", stray_pil_index); for (name_index = 0; name_index < PIL_MAX; name_index++) pil_countp[name_index] = stray_pil_index; } if (name == NULL) name = "???"; if (!ispil && intr_countp[vec] != stray_vec_index) name_index = intr_countp[vec]; else if (ispil && pil_countp[vec] != stray_pil_index) name_index = pil_countp[vec]; else name_index = intrcnt_index++; if (intrcnt_setname(name, name_index)) name_index = 0; if (!ispil) intr_countp[vec] = name_index; else pil_countp[vec] = name_index; mtx_unlock_spin(&intrcnt_lock); } static int 
intrcnt_setname(const char *name, int index) { if ((MAXCOMLEN + 1) * index >= sintrnames) return (E2BIG); snprintf(intrnames + (MAXCOMLEN + 1) * index, MAXCOMLEN + 1, "%-*s", MAXCOMLEN, name); return (0); } void intr_setup(int pri, ih_func_t *ihf, int vec, iv_func_t *ivf, void *iva) { char pilname[MAXCOMLEN + 1]; register_t s; s = intr_disable(); if (vec != -1) { intr_vectors[vec].iv_func = ivf; intr_vectors[vec].iv_arg = iva; intr_vectors[vec].iv_pri = pri; intr_vectors[vec].iv_vec = vec; } intr_handlers[pri] = ihf; intr_restore(s); snprintf(pilname, MAXCOMLEN + 1, "pil%d: %s", pri, pil_names[pri]); intrcnt_updatename(pri, pilname, 1); } static void intr_stray_level(struct trapframe *tf) { uint64_t level; level = tf->tf_level; if (pil_stray_count[level] < MAX_STRAY_LOG) { printf("stray level interrupt %ld\n", level); pil_stray_count[level]++; if (pil_stray_count[level] >= MAX_STRAY_LOG) printf("got %d stray level interrupt %ld's: not " "logging anymore\n", MAX_STRAY_LOG, level); } } static void intr_stray_vector(void *cookie) { struct intr_vector *iv; u_int vec; iv = cookie; vec = iv->iv_vec; if (intr_stray_count[vec] < MAX_STRAY_LOG) { printf("stray vector interrupt %d\n", vec); intr_stray_count[vec]++; if (intr_stray_count[vec] >= MAX_STRAY_LOG) printf("got %d stray vector interrupt %d's: not " "logging anymore\n", MAX_STRAY_LOG, vec); } } void intr_init1() { int i; /* Mark all interrupts as being stray. */ for (i = 0; i < PIL_MAX; i++) intr_handlers[i] = intr_stray_level; for (i = 0; i < IV_MAX; i++) { intr_vectors[i].iv_func = intr_stray_vector; intr_vectors[i].iv_arg = &intr_vectors[i]; intr_vectors[i].iv_pri = PIL_LOW; intr_vectors[i].iv_vec = i; intr_vectors[i].iv_refcnt = 0; } intr_handlers[PIL_LOW] = intr_fast; } void intr_init2() { sx_init(&intr_table_lock, "intr sources"); mtx_init(&intrcnt_lock, "intrcnt", NULL, MTX_SPIN); } static int intr_assign_cpu(void *arg, int cpu) { #ifdef SMP struct pcpu *pc; struct intr_vector *iv; /* * Don't do anything during early boot. We will pick up the * assignment once the APs are started. 
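intrcnt_setname() above fills fixed-width, (MAXCOMLEN + 1)-byte slots in the flat intrnames array so consumers such as vmstat(8) can index names by slot number. A self-contained sketch of that layout (table size shrunk for illustration):

/* Standalone sketch of the fixed-width name table filled by
 * intrcnt_setname() above: each slot is (MAXCOMLEN + 1) bytes,
 * space-padded, so names can be indexed by slot number. */
#include <stdio.h>

#define DEMO_MAXCOMLEN  19
#define DEMO_NSLOTS     8

static char names[DEMO_NSLOTS * (DEMO_MAXCOMLEN + 1)];

static int
setname(const char *name, int index)
{
    if ((DEMO_MAXCOMLEN + 1) * (index + 1) > (int)sizeof(names))
        return (-1);        /* table full: E2BIG in the kernel */
    snprintf(names + (DEMO_MAXCOMLEN + 1) * index, DEMO_MAXCOMLEN + 1,
        "%-*s", DEMO_MAXCOMLEN, name);
    return (0);
}

int
main(void)
{
    setname("vec2009: em0", 0);
    printf("[%s]\n", names);    /* padded to 19 columns */
    return (0);
}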
*/ if (assign_cpu && cpu != NOCPU) { pc = pcpu_find(cpu); if (pc == NULL) return (EINVAL); iv = arg; sx_xlock(&intr_table_lock); iv->iv_mid = pc->pc_mid; iv->iv_ic->ic_assign(iv); sx_xunlock(&intr_table_lock); } return (0); #else return (EOPNOTSUPP); #endif } static void intr_execute_handlers(void *cookie) { struct intr_vector *iv; iv = cookie; if (__predict_false(intr_event_handle(iv->iv_event, NULL) != 0)) intr_stray_vector(iv); } int intr_controller_register(int vec, const struct intr_controller *ic, void *icarg) { struct intr_event *ie; struct intr_vector *iv; int error; if (vec < 0 || vec >= IV_MAX) return (EINVAL); sx_xlock(&intr_table_lock); iv = &intr_vectors[vec]; ie = iv->iv_event; sx_xunlock(&intr_table_lock); if (ie != NULL) return (EEXIST); error = intr_event_create(&ie, iv, 0, vec, NULL, ic->ic_clear, ic->ic_clear, intr_assign_cpu, "vec%d:", vec); if (error != 0) return (error); sx_xlock(&intr_table_lock); if (iv->iv_event != NULL) { sx_xunlock(&intr_table_lock); intr_event_destroy(ie); return (EEXIST); } iv->iv_ic = ic; iv->iv_icarg = icarg; iv->iv_event = ie; iv->iv_mid = PCPU_GET(mid); sx_xunlock(&intr_table_lock); return (0); } int inthand_add(const char *name, int vec, driver_filter_t *filt, driver_intr_t *handler, void *arg, int flags, void **cookiep) { const struct intr_controller *ic; struct intr_event *ie; struct intr_handler *ih; struct intr_vector *iv; int error, filter; if (vec < 0 || vec >= IV_MAX) return (EINVAL); /* * INTR_BRIDGE filters/handlers are special purpose only, allowing * them to be shared just would complicate things unnecessarily. */ if ((flags & INTR_BRIDGE) != 0 && (flags & INTR_EXCL) == 0) return (EINVAL); sx_xlock(&intr_table_lock); iv = &intr_vectors[vec]; ic = iv->iv_ic; ie = iv->iv_event; sx_xunlock(&intr_table_lock); if (ic == NULL || ie == NULL) return (EINVAL); error = intr_event_add_handler(ie, name, filt, handler, arg, intr_priority(flags), flags, cookiep); if (error != 0) return (error); sx_xlock(&intr_table_lock); /* Disable the interrupt while we fiddle with it. */ ic->ic_disable(iv); iv->iv_refcnt++; if (iv->iv_refcnt == 1) intr_setup((flags & INTR_BRIDGE) != 0 ? PIL_BRIDGE : filt != NULL ? PIL_FILTER : PIL_ITHREAD, intr_fast, vec, intr_execute_handlers, iv); else if (filt != NULL) { /* * Check if we need to upgrade from PIL_ITHREAD to PIL_FILTER. * Given that apart from the on-board SCCs and UARTs shared * interrupts are rather uncommon on sparc64 this should be * pretty rare in practice. */ filter = 0; TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) { if (ih->ih_filter != NULL && ih->ih_filter != filt) { filter = 1; break; } } if (filter == 0) intr_setup(PIL_FILTER, intr_fast, vec, intr_execute_handlers, iv); } intr_stray_count[vec] = 0; intrcnt_updatename(vec, ie->ie_fullname, 0); #ifdef SMP if (assign_cpu) intr_assign_next_cpu(iv); #endif ic->ic_enable(iv); /* Ensure the interrupt is cleared, it might have triggered before. */ if (ic->ic_clear != NULL) ic->ic_clear(iv); sx_xunlock(&intr_table_lock); return (0); } int inthand_remove(int vec, void *cookie) { struct intr_vector *iv; int error; if (vec < 0 || vec >= IV_MAX) return (EINVAL); error = intr_event_remove_handler(cookie); if (error == 0) { /* * XXX: maybe this should be done regardless of whether * intr_event_remove_handler() succeeded? */ sx_xlock(&intr_table_lock); iv = &intr_vectors[vec]; iv->iv_refcnt--; if (iv->iv_refcnt == 0) { /* * Don't disable the interrupt for now, so that * stray interrupts get detected... 
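intr_controller_register() above drops the sx lock to run the slow intr_event_create() step, then re-checks iv_event under the lock and destroys its own event if it lost the race. A distilled sketch of that check/create/re-check sequence, with pthreads standing in for the kernel sx lock:

/* Distilled form of the registration sequence in
 * intr_controller_register() above: the slow creation step runs
 * unlocked, so the slot must be re-validated before publishing. */
#include <pthread.h>
#include <stdlib.h>
#include <errno.h>

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static void *slot;                  /* iv->iv_event analogue */

static int
register_slot(void)
{
    void *ev;

    pthread_mutex_lock(&table_lock);
    if (slot != NULL) {             /* fast-path check */
        pthread_mutex_unlock(&table_lock);
        return (EEXIST);
    }
    pthread_mutex_unlock(&table_lock);

    ev = malloc(64);                /* "slow" creation, unlocked */
    if (ev == NULL)
        return (ENOMEM);

    pthread_mutex_lock(&table_lock);
    if (slot != NULL) {             /* lost the race: back out */
        pthread_mutex_unlock(&table_lock);
        free(ev);
        return (EEXIST);
    }
    slot = ev;                      /* publish */
    pthread_mutex_unlock(&table_lock);
    return (0);
}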
*/ intr_setup(PIL_LOW, intr_fast, vec, intr_stray_vector, iv); } sx_xunlock(&intr_table_lock); } return (error); } /* Add a description to an active interrupt handler. */ int intr_describe(int vec, void *ih, const char *descr) { struct intr_vector *iv; int error; if (vec < 0 || vec >= IV_MAX) return (EINVAL); sx_xlock(&intr_table_lock); iv = &intr_vectors[vec]; if (iv == NULL) { sx_xunlock(&intr_table_lock); return (EINVAL); } error = intr_event_describe_handler(iv->iv_event, ih, descr); if (error) { sx_xunlock(&intr_table_lock); return (error); } intrcnt_updatename(vec, iv->iv_event->ie_fullname, 0); sx_xunlock(&intr_table_lock); return (error); } #ifdef SMP /* * Support for balancing interrupt sources across CPUs. For now we just * allocate CPUs round-robin. */ static cpuset_t intr_cpus = CPUSET_T_INITIALIZER(0x1); static int current_cpu; static void intr_assign_next_cpu(struct intr_vector *iv) { struct pcpu *pc; sx_assert(&intr_table_lock, SA_XLOCKED); /* * Assign this source to a CPU in a round-robin fashion. */ pc = pcpu_find(current_cpu); if (pc == NULL) return; iv->iv_mid = pc->pc_mid; iv->iv_ic->ic_assign(iv); do { current_cpu++; if (current_cpu > mp_maxid) current_cpu = 0; } while (!CPU_ISSET(current_cpu, &intr_cpus)); } /* Attempt to bind the specified IRQ to the specified CPU. */ int intr_bind(int vec, u_char cpu) { struct intr_vector *iv; int error; if (vec < 0 || vec >= IV_MAX) return (EINVAL); sx_xlock(&intr_table_lock); iv = &intr_vectors[vec]; if (iv == NULL) { sx_xunlock(&intr_table_lock); return (EINVAL); } error = intr_event_bind(iv->iv_event, cpu); sx_xunlock(&intr_table_lock); return (error); } /* * Add a CPU to our mask of valid CPUs that can be destinations of * interrupts. */ void intr_add_cpu(u_int cpu) { if (cpu >= MAXCPU) panic("%s: Invalid CPU ID", __func__); if (bootverbose) printf("INTR: Adding CPU %d as a target\n", cpu); CPU_SET(cpu, &intr_cpus); } /* * Distribute all the interrupt sources among the available CPUs once the * APs have been launched. */ static void intr_shuffle_irqs(void *arg __unused) { struct pcpu *pc; struct intr_vector *iv; int i; /* Don't bother on UP. */ if (mp_ncpus == 1) return; sx_xlock(&intr_table_lock); assign_cpu = 1; for (i = 0; i < IV_MAX; i++) { iv = &intr_vectors[i]; if (iv != NULL && iv->iv_refcnt > 0) { /* * If this event is already bound to a CPU, * then assign the source to that CPU instead * of picking one via round-robin. */ if (iv->iv_event->ie_cpu != NOCPU && (pc = pcpu_find(iv->iv_event->ie_cpu)) != NULL) { iv->iv_mid = pc->pc_mid; iv->iv_ic->ic_assign(iv); } else intr_assign_next_cpu(iv); } } sx_xunlock(&intr_table_lock); } SYSINIT(intr_shuffle_irqs, SI_SUB_SMP, SI_ORDER_SECOND, intr_shuffle_irqs, NULL); #endif Index: stable/11/sys/sparc64/sparc64/machdep.c =================================================================== --- stable/11/sys/sparc64/sparc64/machdep.c (revision 331016) +++ stable/11/sys/sparc64/sparc64/machdep.c (revision 331017) @@ -1,1114 +1,1115 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2001 Jake Burkholder. * Copyright (c) 1992 Terrence R. Lambert. * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 * from: FreeBSD: src/sys/i386/i386/machdep.c,v 1.477 2001/08/27 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_ddb.h" #include "opt_kstack_pages.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include typedef int ofw_vec_t(void *); int dtlb_slots; int itlb_slots; struct tlb_entry *kernel_tlbs; int kernel_tlb_slots; int cold = 1; long Maxmem; long realmem; void *dpcpu0; char pcpu0[PCPU_PAGES * PAGE_SIZE]; struct trapframe frame0; vm_offset_t kstack0; vm_paddr_t kstack0_phys; struct kva_md_info kmi; u_long ofw_vec; u_long ofw_tba; u_int tba_taken_over; char sparc64_model[32]; static int cpu_use_vis = 1; cpu_block_copy_t *cpu_block_copy; cpu_block_zero_t *cpu_block_zero; static phandle_t find_bsp(phandle_t node, uint32_t bspid, u_int cpu_impl); void sparc64_init(caddr_t mdp, u_long o1, u_long o2, u_long o3, ofw_vec_t *vec); static void sparc64_shutdown_final(void *dummy, int howto); static void cpu_startup(void *arg); SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL); CTASSERT((1 << INT_SHIFT) == sizeof(int)); CTASSERT((1 << PTR_SHIFT) == sizeof(char *)); CTASSERT(sizeof(struct reg) == 256); CTASSERT(sizeof(struct fpreg) == 272); CTASSERT(sizeof(struct __mcontext) == 512); CTASSERT((sizeof(struct pcb) & (64 - 1)) == 0); CTASSERT((offsetof(struct pcb, pcb_kfp) & (64 - 1)) == 0); CTASSERT((offsetof(struct pcb, pcb_ufp) & (64 - 1)) == 0); CTASSERT(sizeof(struct pcb) <= ((KSTACK_PAGES * PAGE_SIZE) / 8)); CTASSERT(sizeof(struct pcpu) <= ((PCPU_PAGES * PAGE_SIZE) / 2)); static void cpu_startup(void *arg) { vm_paddr_t physsz; int i; physsz = 0; for (i = 0; i < sparc64_nmemreg; i++) physsz += sparc64_memreg[i].mr_size; 
printf("real memory = %lu (%lu MB)\n", physsz, physsz / (1024 * 1024)); realmem = (long)physsz / PAGE_SIZE; vm_ksubmap_init(&kmi); bufinit(); vm_pager_bufferinit(); EVENTHANDLER_REGISTER(shutdown_final, sparc64_shutdown_final, NULL, SHUTDOWN_PRI_LAST); printf("avail memory = %lu (%lu MB)\n", vm_cnt.v_free_count * PAGE_SIZE, vm_cnt.v_free_count / ((1024 * 1024) / PAGE_SIZE)); if (bootverbose) printf("machine: %s\n", sparc64_model); cpu_identify(rdpr(ver), PCPU_GET(clock), curcpu); } void cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size) { struct intr_request *ir; int i; pcpu->pc_irtail = &pcpu->pc_irhead; for (i = 0; i < IR_FREE; i++) { ir = &pcpu->pc_irpool[i]; ir->ir_next = pcpu->pc_irfree; pcpu->pc_irfree = ir; } } void spinlock_enter(void) { struct thread *td; register_t pil; td = curthread; if (td->td_md.md_spinlock_count == 0) { pil = rdpr(pil); wrpr(pil, 0, PIL_TICK); td->td_md.md_spinlock_count = 1; td->td_md.md_saved_pil = pil; } else td->td_md.md_spinlock_count++; critical_enter(); } void spinlock_exit(void) { struct thread *td; register_t pil; td = curthread; critical_exit(); pil = td->td_md.md_saved_pil; td->td_md.md_spinlock_count--; if (td->td_md.md_spinlock_count == 0) wrpr(pil, pil, 0); } static phandle_t find_bsp(phandle_t node, uint32_t bspid, u_int cpu_impl) { char type[sizeof("cpu")]; phandle_t child; uint32_t portid; for (; node != 0; node = OF_peer(node)) { child = OF_child(node); if (child > 0) { child = find_bsp(child, bspid, cpu_impl); if (child > 0) return (child); } else { if (OF_getprop(node, "device_type", type, sizeof(type)) <= 0) continue; if (strcmp(type, "cpu") != 0) continue; if (OF_getprop(node, cpu_portid_prop(cpu_impl), &portid, sizeof(portid)) <= 0) continue; if (portid == bspid) return (node); } } return (0); } const char * cpu_portid_prop(u_int cpu_impl) { switch (cpu_impl) { case CPU_IMPL_SPARC64: case CPU_IMPL_SPARC64V: case CPU_IMPL_ULTRASPARCI: case CPU_IMPL_ULTRASPARCII: case CPU_IMPL_ULTRASPARCIIi: case CPU_IMPL_ULTRASPARCIIe: return ("upa-portid"); case CPU_IMPL_ULTRASPARCIII: case CPU_IMPL_ULTRASPARCIIIp: case CPU_IMPL_ULTRASPARCIIIi: case CPU_IMPL_ULTRASPARCIIIip: return ("portid"); case CPU_IMPL_ULTRASPARCIV: case CPU_IMPL_ULTRASPARCIVp: return ("cpuid"); default: return (""); } } uint32_t cpu_get_mid(u_int cpu_impl) { switch (cpu_impl) { case CPU_IMPL_SPARC64: case CPU_IMPL_SPARC64V: case CPU_IMPL_ULTRASPARCI: case CPU_IMPL_ULTRASPARCII: case CPU_IMPL_ULTRASPARCIIi: case CPU_IMPL_ULTRASPARCIIe: return (UPA_CR_GET_MID(ldxa(0, ASI_UPA_CONFIG_REG))); case CPU_IMPL_ULTRASPARCIII: case CPU_IMPL_ULTRASPARCIIIp: return (FIREPLANE_CR_GET_AID(ldxa(AA_FIREPLANE_CONFIG, ASI_FIREPLANE_CONFIG_REG))); case CPU_IMPL_ULTRASPARCIIIi: case CPU_IMPL_ULTRASPARCIIIip: return (JBUS_CR_GET_JID(ldxa(0, ASI_JBUS_CONFIG_REG))); case CPU_IMPL_ULTRASPARCIV: case CPU_IMPL_ULTRASPARCIVp: return (INTR_ID_GET_ID(ldxa(AA_INTR_ID, ASI_INTR_ID))); default: return (0); } } void sparc64_init(caddr_t mdp, u_long o1, u_long o2, u_long o3, ofw_vec_t *vec) { char *env; struct pcpu *pc; vm_offset_t end; vm_offset_t va; caddr_t kmdp; phandle_t root; u_int cpu_impl; end = 0; kmdp = NULL; /* * Find out what kind of CPU we have first, for anything that changes * behaviour. */ cpu_impl = VER_IMPL(rdpr(ver)); /* * Do CPU-specific initialization. */ if (cpu_impl >= CPU_IMPL_ULTRASPARCIII) cheetah_init(cpu_impl); else if (cpu_impl == CPU_IMPL_SPARC64V) zeus_init(cpu_impl); /* * Clear (S)TICK timer (including NPT). 
*/ tick_clear(cpu_impl); /* * UltraSparc II[e,i] based systems come up with the tick interrupt * enabled and a handler that resets the tick counter, causing DELAY() * to not work properly when used early in boot. * UltraSPARC III based systems come up with the system tick interrupt * enabled, causing an interrupt storm on startup since they are not * handled. */ tick_stop(cpu_impl); /* * Set up Open Firmware entry points. */ ofw_tba = rdpr(tba); ofw_vec = (u_long)vec; /* * Parse metadata if present and fetch parameters. Must be before the * console is inited so cninit() gets the right value of boothowto. */ if (mdp != NULL) { preload_metadata = mdp; kmdp = preload_search_by_type("elf kernel"); if (kmdp != NULL) { boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int); init_static_kenv(MD_FETCH(kmdp, MODINFOMD_ENVP, char *), 0); end = MD_FETCH(kmdp, MODINFOMD_KERNEND, vm_offset_t); kernel_tlb_slots = MD_FETCH(kmdp, MODINFOMD_DTLB_SLOTS, int); kernel_tlbs = (void *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_DTLB); } } init_param1(); /* * Initialize Open Firmware (needed for console). */ OF_install(OFW_STD_DIRECT, 0); OF_init(ofw_entry); /* * Prime our per-CPU data page for use. Note, we are using it for * our stack, so don't pass the real size (PAGE_SIZE) to pcpu_init * or it'll zero it out from under us. */ pc = (struct pcpu *)(pcpu0 + (PCPU_PAGES * PAGE_SIZE)) - 1; pcpu_init(pc, 0, sizeof(struct pcpu)); pc->pc_addr = (vm_offset_t)pcpu0; pc->pc_impl = cpu_impl; pc->pc_mid = cpu_get_mid(cpu_impl); pc->pc_tlb_ctx = TLB_CTX_USER_MIN; pc->pc_tlb_ctx_min = TLB_CTX_USER_MIN; pc->pc_tlb_ctx_max = TLB_CTX_USER_MAX; /* * Determine the OFW node and frequency of the BSP (and ensure the * BSP is in the device tree in the first place). */ root = OF_peer(0); pc->pc_node = find_bsp(root, pc->pc_mid, cpu_impl); if (pc->pc_node == 0) OF_panic("%s: cannot find boot CPU node", __func__); if (OF_getprop(pc->pc_node, "clock-frequency", &pc->pc_clock, sizeof(pc->pc_clock)) <= 0) OF_panic("%s: cannot determine boot CPU clock", __func__); /* * Panic if there is no metadata. Most likely the kernel was booted * directly, instead of through loader(8). */ if (mdp == NULL || kmdp == NULL || end == 0 || kernel_tlb_slots == 0 || kernel_tlbs == NULL) OF_panic("%s: missing loader metadata.\nThis probably means " "you are not using loader(8).", __func__); /* * Work around the broken loader behavior of not demapping no * longer used kernel TLB slots when unloading the kernel or * modules. */ for (va = KERNBASE + (kernel_tlb_slots - 1) * PAGE_SIZE_4M; va >= roundup2(end, PAGE_SIZE_4M); va -= PAGE_SIZE_4M) { if (bootverbose) OF_printf("demapping unused kernel TLB slot " "(va %#lx - %#lx)\n", va, va + PAGE_SIZE_4M - 1); stxa(TLB_DEMAP_VA(va) | TLB_DEMAP_PRIMARY | TLB_DEMAP_PAGE, ASI_DMMU_DEMAP, 0); stxa(TLB_DEMAP_VA(va) | TLB_DEMAP_PRIMARY | TLB_DEMAP_PAGE, ASI_IMMU_DEMAP, 0); flush(KERNBASE); kernel_tlb_slots--; } /* * Determine the TLB slot maxima, which are expected to be * equal across all CPUs. * NB: for cheetah-class CPUs, these properties only refer * to the t16s. */ if (OF_getprop(pc->pc_node, "#dtlb-entries", &dtlb_slots, sizeof(dtlb_slots)) == -1) OF_panic("%s: cannot determine number of dTLB slots", __func__); if (OF_getprop(pc->pc_node, "#itlb-entries", &itlb_slots, sizeof(itlb_slots)) == -1) OF_panic("%s: cannot determine number of iTLB slots", __func__); /* * Initialize and enable the caches. Note that this may include * applying workarounds. 
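The TLB demap loop below this point walks 4 MB slots downward to roundup2(end, PAGE_SIZE_4M). roundup2() rounds up to a power-of-two multiple using only masking; a self-checking illustration of the arithmetic:

/* roundup2() as defined in sys/param.h, rounding up to a
 * power-of-two multiple with pure bit arithmetic. */
#include <assert.h>

#define demo_roundup2(x, y) (((x) + ((y) - 1)) & ~((y) - 1))
#define DEMO_PAGE_SIZE_4M   (4ul << 20)

int
main(void)
{
    assert(demo_roundup2(0x400001ul, DEMO_PAGE_SIZE_4M) == 0x800000ul);
    assert(demo_roundup2(0x800000ul, DEMO_PAGE_SIZE_4M) == 0x800000ul);
    return (0);
}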
*/ cache_init(pc); cache_enable(cpu_impl); uma_set_align(pc->pc_cache.dc_linesize - 1); cpu_block_copy = bcopy; cpu_block_zero = bzero; getenv_int("machdep.use_vis", &cpu_use_vis); if (cpu_use_vis) { switch (cpu_impl) { case CPU_IMPL_SPARC64: case CPU_IMPL_ULTRASPARCI: case CPU_IMPL_ULTRASPARCII: case CPU_IMPL_ULTRASPARCIIi: case CPU_IMPL_ULTRASPARCIIe: case CPU_IMPL_ULTRASPARCIII: /* NB: we've disabled P$. */ case CPU_IMPL_ULTRASPARCIIIp: case CPU_IMPL_ULTRASPARCIIIi: case CPU_IMPL_ULTRASPARCIV: case CPU_IMPL_ULTRASPARCIVp: case CPU_IMPL_ULTRASPARCIIIip: cpu_block_copy = spitfire_block_copy; cpu_block_zero = spitfire_block_zero; break; case CPU_IMPL_SPARC64V: cpu_block_copy = zeus_block_copy; cpu_block_zero = zeus_block_zero; break; } } #ifdef SMP mp_init(); #endif /* * Initialize virtual memory and calculate physmem. */ pmap_bootstrap(cpu_impl); /* * Initialize tunables. */ init_param2(physmem); env = kern_getenv("kernelname"); if (env != NULL) { strlcpy(kernelname, env, sizeof(kernelname)); freeenv(env); } /* * Initialize the interrupt tables. */ intr_init1(); /* * Initialize proc0, set kstack0, frame0, curthread and curpcb. */ proc_linkup0(&proc0, &thread0); proc0.p_md.md_sigtramp = NULL; proc0.p_md.md_utrap = NULL; thread0.td_kstack = kstack0; thread0.td_kstack_pages = KSTACK_PAGES; thread0.td_pcb = (struct pcb *) (thread0.td_kstack + KSTACK_PAGES * PAGE_SIZE) - 1; frame0.tf_tstate = TSTATE_IE | TSTATE_PEF | TSTATE_PRIV; thread0.td_frame = &frame0; pc->pc_curthread = &thread0; pc->pc_curpcb = thread0.td_pcb; /* * Initialize global registers. */ cpu_setregs(pc); /* * Take over the trap table via the PROM. Using the PROM for this * is necessary in order to set obp-control-relinquished to true * within the PROM so obtaining /virtual-memory/translations doesn't * trigger a fatal reset error or worse things further down the road. * XXX it should be possible to use this solely instead of writing * %tba in cpu_setregs(). Doing so causes a hang however. * * NB: the low-level console drivers require a working DELAY() and * some compiler optimizations may cause the curthread accesses of * mutex(9) to be factored out even if the latter aren't actually * called. Both of these require PCPU_REG to be set. However, we * can't set PCPU_REG without also taking over the trap table or the * firmware will overwrite it. */ sun4u_set_traptable(tl0_base); /* * Initialize the dynamic per-CPU area for the BSP and the message * buffer (after setting the trap table). */ dpcpu_init(dpcpu0, 0); msgbufinit(msgbufp, msgbufsize); /* * Initialize mutexes. */ mutex_init(); /* * Initialize console now that we have a reasonable set of system * services. */ cninit(); /* * Finish the interrupt initialization now that mutexes work and * enable them. 
*/ intr_init2(); wrpr(pil, 0, 0); wrpr(pstate, 0, PSTATE_KERNEL); OF_getprop(root, "name", sparc64_model, sizeof(sparc64_model) - 1); kdb_init(); #ifdef KDB if (boothowto & RB_KDB) kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger"); #endif } void sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct trapframe *tf; struct sigframe *sfp; struct sigacts *psp; struct sigframe sf; struct thread *td; struct frame *fp; struct proc *p; u_long sp; int oonstack; int sig; oonstack = 0; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); tf = td->td_frame; sp = tf->tf_sp + SPOFF; oonstack = sigonstack(sp); CTR4(KTR_SIG, "sendsig: td=%p (%s) catcher=%p sig=%d", td, p->p_comm, catcher, sig); /* Make sure we have a signal trampoline to return to. */ if (p->p_md.md_sigtramp == NULL) { /* * No signal trampoline... kill the process. */ CTR0(KTR_SIG, "sendsig: no sigtramp"); printf("sendsig: %s is too old, rebuild it\n", p->p_comm); sigexit(td, sig); /* NOTREACHED */ } /* Save user context. */ bzero(&sf, sizeof(sf)); get_mcontext(td, &sf.sf_uc.uc_mcontext, 0); sf.sf_uc.uc_sigmask = *mask; sf.sf_uc.uc_stack = td->td_sigstk; sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; /* Allocate and validate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { sfp = (struct sigframe *)((uintptr_t)td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct sigframe)); } else sfp = (struct sigframe *)sp - 1; mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); fp = (struct frame *)sfp - 1; /* Build the argument list for the signal handler. */ tf->tf_out[0] = sig; tf->tf_out[2] = (register_t)&sfp->sf_uc; tf->tf_out[4] = (register_t)catcher; if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ tf->tf_out[1] = (register_t)&sfp->sf_si; /* Fill in POSIX parts. */ sf.sf_si = ksi->ksi_info; sf.sf_si.si_signo = sig; /* maybe a translated signal */ } else { /* Old FreeBSD-style arguments. */ tf->tf_out[1] = ksi->ksi_code; tf->tf_out[3] = (register_t)ksi->ksi_addr; } /* Copy the sigframe out to the user's stack. */ if (rwindow_save(td) != 0 || copyout(&sf, sfp, sizeof(*sfp)) != 0 || suword(&fp->fr_in[6], tf->tf_out[6]) != 0) { /* * Something is wrong with the stack pointer. * ...Kill the process. 
*/ CTR2(KTR_SIG, "sendsig: sigexit td=%p sfp=%p", td, sfp); PROC_LOCK(p); sigexit(td, SIGILL); /* NOTREACHED */ } tf->tf_tpc = (u_long)p->p_md.md_sigtramp; tf->tf_tnpc = tf->tf_tpc + 4; tf->tf_sp = (u_long)fp - SPOFF; CTR3(KTR_SIG, "sendsig: return td=%p pc=%#lx sp=%#lx", td, tf->tf_tpc, tf->tf_sp); PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } #ifndef _SYS_SYSPROTO_H_ struct sigreturn_args { ucontext_t *ucp; }; #endif /* * MPSAFE */ int sys_sigreturn(struct thread *td, struct sigreturn_args *uap) { struct proc *p; mcontext_t *mc; ucontext_t uc; int error; p = td->td_proc; if (rwindow_save(td)) { PROC_LOCK(p); sigexit(td, SIGILL); } CTR2(KTR_SIG, "sigreturn: td=%p ucp=%p", td, uap->sigcntxp); if (copyin(uap->sigcntxp, &uc, sizeof(uc)) != 0) { CTR1(KTR_SIG, "sigreturn: efault td=%p", td); return (EFAULT); } mc = &uc.uc_mcontext; error = set_mcontext(td, mc); if (error != 0) return (error); kern_sigprocmask(td, SIG_SETMASK, &uc.uc_sigmask, NULL, 0); CTR4(KTR_SIG, "sigreturn: return td=%p pc=%#lx sp=%#lx tstate=%#lx", td, mc->_mc_tpc, mc->_mc_sp, mc->_mc_tstate); return (EJUSTRETURN); } /* * Construct a PCB from a trapframe. This is called from kdb_trap() where * we want to start a backtrace from the function that caused us to enter * the debugger. We have the context in the trapframe, but base the trace * on the PCB. The PCB doesn't have to be perfect, as long as it contains * enough for a backtrace. */ void makectx(struct trapframe *tf, struct pcb *pcb) { pcb->pcb_pc = tf->tf_tpc; pcb->pcb_sp = tf->tf_sp; } int get_mcontext(struct thread *td, mcontext_t *mc, int flags) { struct trapframe *tf; struct pcb *pcb; tf = td->td_frame; pcb = td->td_pcb; /* * Copy the registers which will be restored by tl0_ret() from the * trapframe. * Note that we skip %g7 which is used as the userland TLS register * and %wstate. */ mc->_mc_flags = _MC_VERSION; mc->mc_global[1] = tf->tf_global[1]; mc->mc_global[2] = tf->tf_global[2]; mc->mc_global[3] = tf->tf_global[3]; mc->mc_global[4] = tf->tf_global[4]; mc->mc_global[5] = tf->tf_global[5]; mc->mc_global[6] = tf->tf_global[6]; if (flags & GET_MC_CLEAR_RET) { mc->mc_out[0] = 0; mc->mc_out[1] = 0; } else { mc->mc_out[0] = tf->tf_out[0]; mc->mc_out[1] = tf->tf_out[1]; } mc->mc_out[2] = tf->tf_out[2]; mc->mc_out[3] = tf->tf_out[3]; mc->mc_out[4] = tf->tf_out[4]; mc->mc_out[5] = tf->tf_out[5]; mc->mc_out[6] = tf->tf_out[6]; mc->mc_out[7] = tf->tf_out[7]; mc->_mc_fprs = tf->tf_fprs; mc->_mc_fsr = tf->tf_fsr; mc->_mc_gsr = tf->tf_gsr; mc->_mc_tnpc = tf->tf_tnpc; mc->_mc_tpc = tf->tf_tpc; mc->_mc_tstate = tf->tf_tstate; mc->_mc_y = tf->tf_y; critical_enter(); if ((tf->tf_fprs & FPRS_FEF) != 0) { savefpctx(pcb->pcb_ufp); tf->tf_fprs &= ~FPRS_FEF; pcb->pcb_flags |= PCB_FEF; } if ((pcb->pcb_flags & PCB_FEF) != 0) { bcopy(pcb->pcb_ufp, mc->mc_fp, sizeof(mc->mc_fp)); mc->_mc_fprs |= FPRS_FEF; } critical_exit(); return (0); } int set_mcontext(struct thread *td, mcontext_t *mc) { struct trapframe *tf; struct pcb *pcb; if (!TSTATE_SECURE(mc->_mc_tstate) || (mc->_mc_flags & ((1L << _MC_VERSION_BITS) - 1)) != _MC_VERSION) return (EINVAL); tf = td->td_frame; pcb = td->td_pcb; /* Make sure the windows are spilled first. */ flushw(); /* * Copy the registers which will be restored by tl0_ret() to the * trapframe. * Note that we skip %g7 which is used as the userland TLS register * and %wstate. 
*/ tf->tf_global[1] = mc->mc_global[1]; tf->tf_global[2] = mc->mc_global[2]; tf->tf_global[3] = mc->mc_global[3]; tf->tf_global[4] = mc->mc_global[4]; tf->tf_global[5] = mc->mc_global[5]; tf->tf_global[6] = mc->mc_global[6]; tf->tf_out[0] = mc->mc_out[0]; tf->tf_out[1] = mc->mc_out[1]; tf->tf_out[2] = mc->mc_out[2]; tf->tf_out[3] = mc->mc_out[3]; tf->tf_out[4] = mc->mc_out[4]; tf->tf_out[5] = mc->mc_out[5]; tf->tf_out[6] = mc->mc_out[6]; tf->tf_out[7] = mc->mc_out[7]; tf->tf_fprs = mc->_mc_fprs; tf->tf_fsr = mc->_mc_fsr; tf->tf_gsr = mc->_mc_gsr; tf->tf_tnpc = mc->_mc_tnpc; tf->tf_tpc = mc->_mc_tpc; tf->tf_tstate = mc->_mc_tstate; tf->tf_y = mc->_mc_y; if ((mc->_mc_fprs & FPRS_FEF) != 0) { tf->tf_fprs = 0; bcopy(mc->mc_fp, pcb->pcb_ufp, sizeof(pcb->pcb_ufp)); pcb->pcb_flags |= PCB_FEF; } return (0); } /* * Exit the kernel and execute a firmware call that will not return, as * specified by the arguments. */ void cpu_shutdown(void *args) { #ifdef SMP cpu_mp_shutdown(); #endif ofw_exit(args); } /* * Flush the D-cache for non-DMA I/O so that the I-cache can * be made coherent later. */ void cpu_flush_dcache(void *ptr, size_t len) { /* TBD */ } /* Get current clock frequency for the given CPU ID. */ int cpu_est_clockrate(int cpu_id, uint64_t *rate) { struct pcpu *pc; pc = pcpu_find(cpu_id); if (pc == NULL || rate == NULL) return (EINVAL); *rate = pc->pc_clock; return (0); } /* * Duplicate OF_exit() with a different firmware call function that restores * the trap table, otherwise a RED state exception is triggered in at least * some firmware versions. */ void cpu_halt(void) { static struct { cell_t name; cell_t nargs; cell_t nreturns; } args = { (cell_t)"exit", 0, 0 }; cpu_shutdown(&args); } static void sparc64_shutdown_final(void *dummy, int howto) { static struct { cell_t name; cell_t nargs; cell_t nreturns; } args = { (cell_t)"SUNW,power-off", 0, 0 }; /* Turn the power off? */ if ((howto & RB_POWEROFF) != 0) cpu_shutdown(&args); /* In case of halt, return to the firmware. */ if ((howto & RB_HALT) != 0) cpu_halt(); } void cpu_idle(int busy) { /* Insert code to halt (until next interrupt) for the idle loop. */ } int cpu_idle_wakeup(int cpu) { return (1); } int ptrace_set_pc(struct thread *td, u_long addr) { td->td_frame->tf_tpc = addr; td->td_frame->tf_tnpc = addr + 4; return (0); } int ptrace_single_step(struct thread *td) { /* TODO; */ return (0); } int ptrace_clear_single_step(struct thread *td) { /* TODO; */ return (0); } void exec_setregs(struct thread *td, struct image_params *imgp, u_long stack) { struct trapframe *tf; struct pcb *pcb; struct proc *p; u_long sp; /* XXX no cpu_exec */ p = td->td_proc; p->p_md.md_sigtramp = NULL; if (p->p_md.md_utrap != NULL) { utrap_free(p->p_md.md_utrap); p->p_md.md_utrap = NULL; } pcb = td->td_pcb; tf = td->td_frame; sp = rounddown(stack, 16); bzero(pcb, sizeof(*pcb)); bzero(tf, sizeof(*tf)); tf->tf_out[0] = stack; tf->tf_out[3] = p->p_sysent->sv_psstrings; tf->tf_out[6] = sp - SPOFF - sizeof(struct frame); tf->tf_tnpc = imgp->entry_addr + 4; tf->tf_tpc = imgp->entry_addr; /* * While we could adhere to the memory model indicated in the ELF * header, it turns out that just always using TSO performs best. 
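*
* For reference, SPARC V9 defines three memory models: TSO (total store
* order), PSO (partial store order) and RMO (relaxed memory order), and
* an ELF object can nominally request one via the memory-model bits of
* e_flags (EF_SPARCV9_MM in the V9 ABI).  TSTATE_MM_TSO below simply
* ignores that hint.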
*/ tf->tf_tstate = TSTATE_IE | TSTATE_PEF | TSTATE_MM_TSO; td->td_retval[0] = tf->tf_out[0]; td->td_retval[1] = tf->tf_out[1]; } int fill_regs(struct thread *td, struct reg *regs) { bcopy(td->td_frame, regs, sizeof(*regs)); return (0); } int set_regs(struct thread *td, struct reg *regs) { struct trapframe *tf; if (!TSTATE_SECURE(regs->r_tstate)) return (EINVAL); tf = td->td_frame; regs->r_wstate = tf->tf_wstate; bcopy(regs, tf, sizeof(*regs)); return (0); } int fill_dbregs(struct thread *td, struct dbreg *dbregs) { return (ENOSYS); } int set_dbregs(struct thread *td, struct dbreg *dbregs) { return (ENOSYS); } int fill_fpregs(struct thread *td, struct fpreg *fpregs) { struct trapframe *tf; struct pcb *pcb; pcb = td->td_pcb; tf = td->td_frame; bcopy(pcb->pcb_ufp, fpregs->fr_regs, sizeof(fpregs->fr_regs)); fpregs->fr_fsr = tf->tf_fsr; fpregs->fr_gsr = tf->tf_gsr; return (0); } int set_fpregs(struct thread *td, struct fpreg *fpregs) { struct trapframe *tf; struct pcb *pcb; pcb = td->td_pcb; tf = td->td_frame; tf->tf_fprs &= ~FPRS_FEF; bcopy(fpregs->fr_regs, pcb->pcb_ufp, sizeof(pcb->pcb_ufp)); tf->tf_fsr = fpregs->fr_fsr; tf->tf_gsr = fpregs->fr_gsr; return (0); } struct md_utrap * utrap_alloc(void) { struct md_utrap *ut; ut = malloc(sizeof(struct md_utrap), M_SUBPROC, M_WAITOK | M_ZERO); ut->ut_refcnt = 1; return (ut); } void utrap_free(struct md_utrap *ut) { int refcnt; if (ut == NULL) return; mtx_pool_lock(mtxpool_sleep, ut); ut->ut_refcnt--; refcnt = ut->ut_refcnt; mtx_pool_unlock(mtxpool_sleep, ut); if (refcnt == 0) free(ut, M_SUBPROC); } struct md_utrap * utrap_hold(struct md_utrap *ut) { if (ut == NULL) return (NULL); mtx_pool_lock(mtxpool_sleep, ut); ut->ut_refcnt++; mtx_pool_unlock(mtxpool_sleep, ut); return (ut); } Index: stable/11/sys/sparc64/sparc64/mem.c =================================================================== --- stable/11/sys/sparc64/sparc64/mem.c (revision 331016) +++ stable/11/sys/sparc64/sparc64/mem.c (revision 331017) @@ -1,178 +1,179 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1988 University of Utah. * Copyright (c) 1982, 1986, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department, and code derived from software contributed to * Berkeley by William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: Utah $Hdr: mem.c 1.13 89/10/08$ * from: @(#)mem.c 7.2 (Berkeley) 5/9/91 * from: FreeBSD: src/sys/i386/i386/mem.c,v 1.94 2001/09/26 */ #include __FBSDID("$FreeBSD$"); /* * Memory special file * * NOTE: other architectures support mmap()'ing the mem device; this * might cause illegal aliases to be created for the locked kernel page(s), so * it is not implemented. */ #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include struct mem_range_softc mem_range_softc; /* ARGSUSED */ int memrw(struct cdev *dev, struct uio *uio, int flags) { struct iovec *iov; vm_offset_t eva; vm_offset_t off; vm_offset_t ova; vm_offset_t va; vm_prot_t prot; vm_paddr_t pa; vm_size_t cnt; vm_page_t m; int error; uint32_t colors; cnt = 0; colors = 1; error = 0; ova = 0; while (uio->uio_resid > 0 && error == 0) { iov = uio->uio_iov; if (iov->iov_len == 0) { uio->uio_iov++; uio->uio_iovcnt--; if (uio->uio_iovcnt < 0) panic("memrw"); continue; } if (dev2unit(dev) == CDEV_MINOR_MEM) { pa = uio->uio_offset & ~PAGE_MASK; if (!is_physical_memory(pa)) { error = EFAULT; break; } off = uio->uio_offset & PAGE_MASK; cnt = PAGE_SIZE - ((vm_offset_t)iov->iov_base & PAGE_MASK); cnt = ulmin(cnt, PAGE_SIZE - off); cnt = ulmin(cnt, iov->iov_len); m = vm_phys_paddr_to_vm_page(pa); if (m != NULL) { if (ova == 0) { if (dcache_color_ignore == 0) colors = DCACHE_COLORS; ova = kva_alloc(PAGE_SIZE * colors); if (ova == 0) { error = ENOMEM; break; } } if (colors != 1 && m->md.color != -1) va = ova + m->md.color * PAGE_SIZE; else va = ova; pmap_qenter(va, &m, 1); error = uiomove((void *)(va + off), cnt, uio); pmap_qremove(va, 1); } else { va = TLB_PHYS_TO_DIRECT(pa); error = uiomove((void *)(va + off), cnt, uio); } break; } else if (dev2unit(dev) == CDEV_MINOR_KMEM) { va = trunc_page(uio->uio_offset); eva = round_page(uio->uio_offset + iov->iov_len); /* * Make sure that all of the pages are currently * resident so we don't create any zero fill pages. */ for (; va < eva; va += PAGE_SIZE) if (pmap_kextract(va) == 0) return (EFAULT); prot = (uio->uio_rw == UIO_READ) ? VM_PROT_READ : VM_PROT_WRITE; va = uio->uio_offset; if (va < VM_MIN_DIRECT_ADDRESS && kernacc((void *)va, iov->iov_len, prot) == FALSE) return (EFAULT); error = uiomove((void *)va, iov->iov_len, uio); break; } /* else panic! */ } if (ova != 0) kva_free(ova, PAGE_SIZE * colors); return (error); } Index: stable/11/sys/ufs/ffs/ffs_balloc.c =================================================================== --- stable/11/sys/ufs/ffs/ffs_balloc.c (revision 331016) +++ stable/11/sys/ufs/ffs/ffs_balloc.c (revision 331017) @@ -1,1152 +1,1153 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. 
* * This software was developed for the FreeBSD Project by Marshall * Kirk McKusick and Network Associates Laboratories, the Security * Research Division of Network Associates, Inc. under DARPA/SPAWAR * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS * research program * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_balloc.c 8.8 (Berkeley) 6/16/95 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include /* * Balloc defines the structure of filesystem storage * by allocating the physical blocks on a device given * the inode and the logical block number in a file. * This is the allocation strategy for UFS1. 
Below is * the allocation strategy for UFS2. */ int ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size, struct ucred *cred, int flags, struct buf **bpp) { struct inode *ip; struct ufs1_dinode *dp; ufs_lbn_t lbn, lastlbn; struct fs *fs; ufs1_daddr_t nb; struct buf *bp, *nbp; struct ufsmount *ump; struct indir indirs[NIADDR + 2]; int deallocated, osize, nsize, num, i, error; ufs2_daddr_t newb; ufs1_daddr_t *bap, pref; ufs1_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1]; ufs2_daddr_t *lbns_remfree, lbns[NIADDR + 1]; int unwindidx = -1; int saved_inbdflush; static struct timeval lastfail; static int curfail; int gbflags, reclaimed; ip = VTOI(vp); dp = ip->i_din1; fs = ITOFS(ip); ump = ITOUMP(ip); lbn = lblkno(fs, startoffset); size = blkoff(fs, startoffset) + size; reclaimed = 0; if (size > fs->fs_bsize) panic("ffs_balloc_ufs1: blk too big"); *bpp = NULL; if (flags & IO_EXT) return (EOPNOTSUPP); if (lbn < 0) return (EFBIG); gbflags = (flags & BA_UNMAPPED) != 0 ? GB_UNMAPPED : 0; if (DOINGSOFTDEP(vp)) softdep_prealloc(vp, MNT_WAIT); /* * If the next write will extend the file into a new block, * and the file is currently composed of a fragment * this fragment has to be extended to be a full block. */ lastlbn = lblkno(fs, ip->i_size); if (lastlbn < NDADDR && lastlbn < lbn) { nb = lastlbn; osize = blksize(fs, ip, nb); if (osize < fs->fs_bsize && osize > 0) { UFS_LOCK(ump); error = ffs_realloccg(ip, nb, dp->di_db[nb], ffs_blkpref_ufs1(ip, lastlbn, (int)nb, &dp->di_db[0]), osize, (int)fs->fs_bsize, flags, cred, &bp); if (error) return (error); if (DOINGSOFTDEP(vp)) softdep_setup_allocdirect(ip, nb, dbtofsb(fs, bp->b_blkno), dp->di_db[nb], fs->fs_bsize, osize, bp); ip->i_size = smalllblktosize(fs, nb + 1); dp->di_size = ip->i_size; dp->di_db[nb] = dbtofsb(fs, bp->b_blkno); ip->i_flag |= IN_CHANGE | IN_UPDATE; if (flags & IO_SYNC) bwrite(bp); else bawrite(bp); } } /* * The first NDADDR blocks are direct blocks */ if (lbn < NDADDR) { if (flags & BA_METAONLY) panic("ffs_balloc_ufs1: BA_METAONLY for direct block"); nb = dp->di_db[lbn]; if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) { error = bread(vp, lbn, fs->fs_bsize, NOCRED, &bp); if (error) { brelse(bp); return (error); } bp->b_blkno = fsbtodb(fs, nb); *bpp = bp; return (0); } if (nb != 0) { /* * Consider need to reallocate a fragment. */ osize = fragroundup(fs, blkoff(fs, ip->i_size)); nsize = fragroundup(fs, size); if (nsize <= osize) { error = bread(vp, lbn, osize, NOCRED, &bp); if (error) { brelse(bp); return (error); } bp->b_blkno = fsbtodb(fs, nb); } else { UFS_LOCK(ump); error = ffs_realloccg(ip, lbn, dp->di_db[lbn], ffs_blkpref_ufs1(ip, lbn, (int)lbn, &dp->di_db[0]), osize, nsize, flags, cred, &bp); if (error) return (error); if (DOINGSOFTDEP(vp)) softdep_setup_allocdirect(ip, lbn, dbtofsb(fs, bp->b_blkno), nb, nsize, osize, bp); } } else { if (ip->i_size < smalllblktosize(fs, lbn + 1)) nsize = fragroundup(fs, size); else nsize = fs->fs_bsize; UFS_LOCK(ump); error = ffs_alloc(ip, lbn, ffs_blkpref_ufs1(ip, lbn, (int)lbn, &dp->di_db[0]), nsize, flags, cred, &newb); if (error) return (error); bp = getblk(vp, lbn, nsize, 0, 0, gbflags); bp->b_blkno = fsbtodb(fs, newb); if (flags & BA_CLRBUF) vfs_bio_clrbuf(bp); if (DOINGSOFTDEP(vp)) softdep_setup_allocdirect(ip, lbn, newb, 0, nsize, 0, bp); } dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno); ip->i_flag |= IN_CHANGE | IN_UPDATE; *bpp = bp; return (0); } /* * Determine the number of levels of indirection. 
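*
* As a worked example, assuming a UFS1 filesystem with 8K blocks (so
* NINDIR(fs) = 8192 / sizeof(ufs1_daddr_t) = 2048): lbns 0..11 are
* direct, the next 2048 need one level of indirection, the next
* 2048 * 2048 need two, and so on.  ufs_getlbns() returns that chain of
* indirect lbns in indirs[] and its depth in num, which drives the
* allocation loop below.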
*/ pref = 0; if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0) return(error); #ifdef INVARIANTS if (num < 1) panic ("ffs_balloc_ufs1: ufs_getlbns returned indirect block"); #endif saved_inbdflush = curthread_pflags_set(TDP_INBDFLUSH); /* * Fetch the first indirect block allocating if necessary. */ --num; nb = dp->di_ib[indirs[0].in_off]; allocib = NULL; allocblk = allociblk; lbns_remfree = lbns; if (nb == 0) { UFS_LOCK(ump); pref = ffs_blkpref_ufs1(ip, lbn, -indirs[0].in_off - 1, (ufs1_daddr_t *)0); if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, flags, cred, &newb)) != 0) { curthread_pflags_restore(saved_inbdflush); return (error); } pref = newb + fs->fs_frag; nb = newb; MPASS(allocblk < allociblk + nitems(allociblk)); MPASS(lbns_remfree < lbns + nitems(lbns)); *allocblk++ = nb; *lbns_remfree++ = indirs[1].in_lbn; bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, gbflags); bp->b_blkno = fsbtodb(fs, nb); vfs_bio_clrbuf(bp); if (DOINGSOFTDEP(vp)) { softdep_setup_allocdirect(ip, NDADDR + indirs[0].in_off, newb, 0, fs->fs_bsize, 0, bp); bdwrite(bp); } else { /* * Write synchronously so that indirect blocks * never point at garbage. */ if (DOINGASYNC(vp)) bdwrite(bp); else if ((error = bwrite(bp)) != 0) goto fail; } allocib = &dp->di_ib[indirs[0].in_off]; *allocib = nb; ip->i_flag |= IN_CHANGE | IN_UPDATE; } /* * Fetch through the indirect blocks, allocating as necessary. */ retry: for (i = 1;;) { error = bread(vp, indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp); if (error) { brelse(bp); goto fail; } bap = (ufs1_daddr_t *)bp->b_data; nb = bap[indirs[i].in_off]; if (i == num) break; i += 1; if (nb != 0) { bqrelse(bp); continue; } UFS_LOCK(ump); /* * If parent indirect has just been allocated, try to cluster * immediately following it. */ if (pref == 0) pref = ffs_blkpref_ufs1(ip, lbn, i - num - 1, (ufs1_daddr_t *)0); if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, flags | IO_BUFLOCKED, cred, &newb)) != 0) { brelse(bp); if (DOINGSOFTDEP(vp) && ++reclaimed == 1) { UFS_LOCK(ump); softdep_request_cleanup(fs, vp, cred, FLUSH_BLOCKS_WAIT); UFS_UNLOCK(ump); goto retry; } if (ppsratecheck(&lastfail, &curfail, 1)) { ffs_fserr(fs, ip->i_number, "filesystem full"); uprintf("\n%s: write failed, filesystem " "is full\n", fs->fs_fsmnt); } goto fail; } pref = newb + fs->fs_frag; nb = newb; MPASS(allocblk < allociblk + nitems(allociblk)); MPASS(lbns_remfree < lbns + nitems(lbns)); *allocblk++ = nb; *lbns_remfree++ = indirs[i].in_lbn; nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0, 0); nbp->b_blkno = fsbtodb(fs, nb); vfs_bio_clrbuf(nbp); if (DOINGSOFTDEP(vp)) { softdep_setup_allocindir_meta(nbp, ip, bp, indirs[i - 1].in_off, nb); bdwrite(nbp); } else { /* * Write synchronously so that indirect blocks * never point at garbage. */ if ((error = bwrite(nbp)) != 0) { brelse(bp); goto fail; } } bap[indirs[i - 1].in_off] = nb; if (allocib == NULL && unwindidx < 0) unwindidx = i - 1; /* * If required, write synchronously, otherwise use * delayed write. */ if (flags & IO_SYNC) { bwrite(bp); } else { if (bp->b_bufsize == fs->fs_bsize) bp->b_flags |= B_CLUSTEROK; bdwrite(bp); } } /* * If asked only for the indirect block, then return it. */ if (flags & BA_METAONLY) { curthread_pflags_restore(saved_inbdflush); *bpp = bp; return (0); } /* * Get the data block, allocating if necessary. 
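*
* At this point bp holds the terminal indirect block and nb is the
* pointer read from it at indirs[i].in_off; nb == 0 means the file has a
* hole here and a fresh data block must be allocated and hooked into
* bap[] below.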
*/ if (nb == 0) { UFS_LOCK(ump); /* * If allocating metadata at the front of the cylinder * group and parent indirect block has just been allocated, * then cluster next to it if it is the first indirect in * the file. Otherwise it has been allocated in the metadata * area, so we want to find our own place out in the data area. */ if (pref == 0 || (lbn > NDADDR && fs->fs_metaspace != 0)) pref = ffs_blkpref_ufs1(ip, lbn, indirs[i].in_off, &bap[0]); error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, flags | IO_BUFLOCKED, cred, &newb); if (error) { brelse(bp); if (DOINGSOFTDEP(vp) && ++reclaimed == 1) { UFS_LOCK(ump); softdep_request_cleanup(fs, vp, cred, FLUSH_BLOCKS_WAIT); UFS_UNLOCK(ump); goto retry; } if (ppsratecheck(&lastfail, &curfail, 1)) { ffs_fserr(fs, ip->i_number, "filesystem full"); uprintf("\n%s: write failed, filesystem " "is full\n", fs->fs_fsmnt); } goto fail; } nb = newb; MPASS(allocblk < allociblk + nitems(allociblk)); MPASS(lbns_remfree < lbns + nitems(lbns)); *allocblk++ = nb; *lbns_remfree++ = lbn; nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags); nbp->b_blkno = fsbtodb(fs, nb); if (flags & BA_CLRBUF) vfs_bio_clrbuf(nbp); if (DOINGSOFTDEP(vp)) softdep_setup_allocindir_page(ip, lbn, bp, indirs[i].in_off, nb, 0, nbp); bap[indirs[i].in_off] = nb; /* * If required, write synchronously, otherwise use * delayed write. */ if (flags & IO_SYNC) { bwrite(bp); } else { if (bp->b_bufsize == fs->fs_bsize) bp->b_flags |= B_CLUSTEROK; bdwrite(bp); } curthread_pflags_restore(saved_inbdflush); *bpp = nbp; return (0); } brelse(bp); if (flags & BA_CLRBUF) { int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT; if (seqcount != 0 && (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0 && !(vm_page_count_severe() || buf_dirty_count_severe())) { error = cluster_read(vp, ip->i_size, lbn, (int)fs->fs_bsize, NOCRED, MAXBSIZE, seqcount, gbflags, &nbp); } else { error = bread_gb(vp, lbn, (int)fs->fs_bsize, NOCRED, gbflags, &nbp); } if (error) { brelse(nbp); goto fail; } } else { nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags); nbp->b_blkno = fsbtodb(fs, nb); } curthread_pflags_restore(saved_inbdflush); *bpp = nbp; return (0); fail: curthread_pflags_restore(saved_inbdflush); /* * If we have failed to allocate any blocks, simply return the error. * This is the usual case and avoids the need to fsync the file. */ if (allocblk == allociblk && allocib == NULL && unwindidx == -1) return (error); /* * If we have failed part way through block allocation, we * have to deallocate any indirect blocks that we have allocated. * We have to fsync the file before we start to get rid of all * of its dependencies so that we do not leave them dangling. * We have to sync it at the end so that the soft updates code * does not find any untracked changes. Although this is really * slow, running out of disk space is not expected to be a common * occurrence. The error return from fsync is ignored as we already * have an error to return to the user. * * XXX Still have to journal the free below */ (void) ffs_syncvnode(vp, MNT_WAIT, 0); for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns; blkp < allocblk; blkp++, lbns_remfree++) { /* * We shall not leave the freed blocks on the vnode * buffer object lists. 
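*
* getblk() with GB_NOCREAT only looks up an existing buffer and returns
* NULL when there is none; when one is found, B_INVAL | B_RELBUF |
* B_NOCACHE make the brelse() below discard it instead of queueing a
* write to a block that is about to be freed.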
*/ bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0, GB_NOCREAT | GB_UNMAPPED); if (bp != NULL) { KASSERT(bp->b_blkno == fsbtodb(fs, *blkp), ("mismatch1 l %jd %jd b %ju %ju", (intmax_t)bp->b_lblkno, (uintmax_t)*lbns_remfree, (uintmax_t)bp->b_blkno, (uintmax_t)fsbtodb(fs, *blkp))); bp->b_flags |= B_INVAL | B_RELBUF | B_NOCACHE; bp->b_flags &= ~(B_ASYNC | B_CACHE); brelse(bp); } deallocated += fs->fs_bsize; } if (allocib != NULL) { *allocib = 0; } else if (unwindidx >= 0) { int r; r = bread(vp, indirs[unwindidx].in_lbn, (int)fs->fs_bsize, NOCRED, &bp); if (r) { panic("Could not unwind indirect block, error %d", r); brelse(bp); } else { bap = (ufs1_daddr_t *)bp->b_data; bap[indirs[unwindidx].in_off] = 0; if (flags & IO_SYNC) { bwrite(bp); } else { if (bp->b_bufsize == fs->fs_bsize) bp->b_flags |= B_CLUSTEROK; bdwrite(bp); } } } if (deallocated) { #ifdef QUOTA /* * Restore user's disk quota because allocation failed. */ (void) chkdq(ip, -btodb(deallocated), cred, FORCE); #endif dp->di_blocks -= btodb(deallocated); ip->i_flag |= IN_CHANGE | IN_UPDATE; } (void) ffs_syncvnode(vp, MNT_WAIT, 0); /* * After the buffers are invalidated and on-disk pointers are * cleared, free the blocks. */ for (blkp = allociblk; blkp < allocblk; blkp++) { #ifdef INVARIANTS if (blkp == allociblk) lbns_remfree = lbns; bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0, GB_NOCREAT | GB_UNMAPPED); if (bp != NULL) { panic("zombie1 %jd %ju %ju", (intmax_t)bp->b_lblkno, (uintmax_t)bp->b_blkno, (uintmax_t)fsbtodb(fs, *blkp)); } lbns_remfree++; #endif ffs_blkfree(ump, fs, ump->um_devvp, *blkp, fs->fs_bsize, ip->i_number, vp->v_type, NULL); } return (error); } /* * Balloc defines the structure of file system storage * by allocating the physical blocks on a device given * the inode and the logical block number in a file. * This is the allocation strategy for UFS2. Above is * the allocation strategy for UFS1. */ int ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size, struct ucred *cred, int flags, struct buf **bpp) { struct inode *ip; struct ufs2_dinode *dp; ufs_lbn_t lbn, lastlbn; struct fs *fs; struct buf *bp, *nbp; struct ufsmount *ump; struct indir indirs[NIADDR + 2]; ufs2_daddr_t nb, newb, *bap, pref; ufs2_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1]; ufs2_daddr_t *lbns_remfree, lbns[NIADDR + 1]; int deallocated, osize, nsize, num, i, error; int unwindidx = -1; int saved_inbdflush; static struct timeval lastfail; static int curfail; int gbflags, reclaimed; ip = VTOI(vp); dp = ip->i_din2; fs = ITOFS(ip); ump = ITOUMP(ip); lbn = lblkno(fs, startoffset); size = blkoff(fs, startoffset) + size; reclaimed = 0; if (size > fs->fs_bsize) panic("ffs_balloc_ufs2: blk too big"); *bpp = NULL; if (lbn < 0) return (EFBIG); gbflags = (flags & BA_UNMAPPED) != 0 ? GB_UNMAPPED : 0; if (DOINGSOFTDEP(vp)) softdep_prealloc(vp, MNT_WAIT); /* * Check for allocating external data. */ if (flags & IO_EXT) { if (lbn >= NXADDR) return (EFBIG); /* * If the next write will extend the data into a new block, * and the data is currently composed of a fragment * this fragment has to be extended to be a full block. 
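*
* For instance (illustrative numbers only, assuming 16K blocks and 2K
* fragments): a 6000-byte external attribute area sits in a 3-fragment,
* 6144-byte chunk; before a write that crosses into the next block,
* ffs_realloccg() below first grows that chunk to a full 16384-byte
* block, preserving the invariant that only the last block may be a
* fragment.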
*/ lastlbn = lblkno(fs, dp->di_extsize); if (lastlbn < lbn) { nb = lastlbn; osize = sblksize(fs, dp->di_extsize, nb); if (osize < fs->fs_bsize && osize > 0) { UFS_LOCK(ump); error = ffs_realloccg(ip, -1 - nb, dp->di_extb[nb], ffs_blkpref_ufs2(ip, lastlbn, (int)nb, &dp->di_extb[0]), osize, (int)fs->fs_bsize, flags, cred, &bp); if (error) return (error); if (DOINGSOFTDEP(vp)) softdep_setup_allocext(ip, nb, dbtofsb(fs, bp->b_blkno), dp->di_extb[nb], fs->fs_bsize, osize, bp); dp->di_extsize = smalllblktosize(fs, nb + 1); dp->di_extb[nb] = dbtofsb(fs, bp->b_blkno); bp->b_xflags |= BX_ALTDATA; ip->i_flag |= IN_CHANGE; if (flags & IO_SYNC) bwrite(bp); else bawrite(bp); } } /* * All blocks are direct blocks */ if (flags & BA_METAONLY) panic("ffs_balloc_ufs2: BA_METAONLY for ext block"); nb = dp->di_extb[lbn]; if (nb != 0 && dp->di_extsize >= smalllblktosize(fs, lbn + 1)) { error = bread_gb(vp, -1 - lbn, fs->fs_bsize, NOCRED, gbflags, &bp); if (error) { brelse(bp); return (error); } bp->b_blkno = fsbtodb(fs, nb); bp->b_xflags |= BX_ALTDATA; *bpp = bp; return (0); } if (nb != 0) { /* * Consider need to reallocate a fragment. */ osize = fragroundup(fs, blkoff(fs, dp->di_extsize)); nsize = fragroundup(fs, size); if (nsize <= osize) { error = bread_gb(vp, -1 - lbn, osize, NOCRED, gbflags, &bp); if (error) { brelse(bp); return (error); } bp->b_blkno = fsbtodb(fs, nb); bp->b_xflags |= BX_ALTDATA; } else { UFS_LOCK(ump); error = ffs_realloccg(ip, -1 - lbn, dp->di_extb[lbn], ffs_blkpref_ufs2(ip, lbn, (int)lbn, &dp->di_extb[0]), osize, nsize, flags, cred, &bp); if (error) return (error); bp->b_xflags |= BX_ALTDATA; if (DOINGSOFTDEP(vp)) softdep_setup_allocext(ip, lbn, dbtofsb(fs, bp->b_blkno), nb, nsize, osize, bp); } } else { if (dp->di_extsize < smalllblktosize(fs, lbn + 1)) nsize = fragroundup(fs, size); else nsize = fs->fs_bsize; UFS_LOCK(ump); error = ffs_alloc(ip, lbn, ffs_blkpref_ufs2(ip, lbn, (int)lbn, &dp->di_extb[0]), nsize, flags, cred, &newb); if (error) return (error); bp = getblk(vp, -1 - lbn, nsize, 0, 0, gbflags); bp->b_blkno = fsbtodb(fs, newb); bp->b_xflags |= BX_ALTDATA; if (flags & BA_CLRBUF) vfs_bio_clrbuf(bp); if (DOINGSOFTDEP(vp)) softdep_setup_allocext(ip, lbn, newb, 0, nsize, 0, bp); } dp->di_extb[lbn] = dbtofsb(fs, bp->b_blkno); ip->i_flag |= IN_CHANGE; *bpp = bp; return (0); } /* * If the next write will extend the file into a new block, * and the file is currently composed of a fragment * this fragment has to be extended to be a full block. 
*/ lastlbn = lblkno(fs, ip->i_size); if (lastlbn < NDADDR && lastlbn < lbn) { nb = lastlbn; osize = blksize(fs, ip, nb); if (osize < fs->fs_bsize && osize > 0) { UFS_LOCK(ump); error = ffs_realloccg(ip, nb, dp->di_db[nb], ffs_blkpref_ufs2(ip, lastlbn, (int)nb, &dp->di_db[0]), osize, (int)fs->fs_bsize, flags, cred, &bp); if (error) return (error); if (DOINGSOFTDEP(vp)) softdep_setup_allocdirect(ip, nb, dbtofsb(fs, bp->b_blkno), dp->di_db[nb], fs->fs_bsize, osize, bp); ip->i_size = smalllblktosize(fs, nb + 1); dp->di_size = ip->i_size; dp->di_db[nb] = dbtofsb(fs, bp->b_blkno); ip->i_flag |= IN_CHANGE | IN_UPDATE; if (flags & IO_SYNC) bwrite(bp); else bawrite(bp); } } /* * The first NDADDR blocks are direct blocks */ if (lbn < NDADDR) { if (flags & BA_METAONLY) panic("ffs_balloc_ufs2: BA_METAONLY for direct block"); nb = dp->di_db[lbn]; if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) { error = bread_gb(vp, lbn, fs->fs_bsize, NOCRED, gbflags, &bp); if (error) { brelse(bp); return (error); } bp->b_blkno = fsbtodb(fs, nb); *bpp = bp; return (0); } if (nb != 0) { /* * Consider need to reallocate a fragment. */ osize = fragroundup(fs, blkoff(fs, ip->i_size)); nsize = fragroundup(fs, size); if (nsize <= osize) { error = bread_gb(vp, lbn, osize, NOCRED, gbflags, &bp); if (error) { brelse(bp); return (error); } bp->b_blkno = fsbtodb(fs, nb); } else { UFS_LOCK(ump); error = ffs_realloccg(ip, lbn, dp->di_db[lbn], ffs_blkpref_ufs2(ip, lbn, (int)lbn, &dp->di_db[0]), osize, nsize, flags, cred, &bp); if (error) return (error); if (DOINGSOFTDEP(vp)) softdep_setup_allocdirect(ip, lbn, dbtofsb(fs, bp->b_blkno), nb, nsize, osize, bp); } } else { if (ip->i_size < smalllblktosize(fs, lbn + 1)) nsize = fragroundup(fs, size); else nsize = fs->fs_bsize; UFS_LOCK(ump); error = ffs_alloc(ip, lbn, ffs_blkpref_ufs2(ip, lbn, (int)lbn, &dp->di_db[0]), nsize, flags, cred, &newb); if (error) return (error); bp = getblk(vp, lbn, nsize, 0, 0, gbflags); bp->b_blkno = fsbtodb(fs, newb); if (flags & BA_CLRBUF) vfs_bio_clrbuf(bp); if (DOINGSOFTDEP(vp)) softdep_setup_allocdirect(ip, lbn, newb, 0, nsize, 0, bp); } dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno); ip->i_flag |= IN_CHANGE | IN_UPDATE; *bpp = bp; return (0); } /* * Determine the number of levels of indirection. */ pref = 0; if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0) return(error); #ifdef INVARIANTS if (num < 1) panic ("ffs_balloc_ufs2: ufs_getlbns returned indirect block"); #endif saved_inbdflush = curthread_pflags_set(TDP_INBDFLUSH); /* * Fetch the first indirect block allocating if necessary. */ --num; nb = dp->di_ib[indirs[0].in_off]; allocib = NULL; allocblk = allociblk; lbns_remfree = lbns; if (nb == 0) { UFS_LOCK(ump); pref = ffs_blkpref_ufs2(ip, lbn, -indirs[0].in_off - 1, (ufs2_daddr_t *)0); if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, flags, cred, &newb)) != 0) { curthread_pflags_restore(saved_inbdflush); return (error); } pref = newb + fs->fs_frag; nb = newb; MPASS(allocblk < allociblk + nitems(allociblk)); MPASS(lbns_remfree < lbns + nitems(lbns)); *allocblk++ = nb; *lbns_remfree++ = indirs[1].in_lbn; bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, GB_UNMAPPED); bp->b_blkno = fsbtodb(fs, nb); vfs_bio_clrbuf(bp); if (DOINGSOFTDEP(vp)) { softdep_setup_allocdirect(ip, NDADDR + indirs[0].in_off, newb, 0, fs->fs_bsize, 0, bp); bdwrite(bp); } else { /* * Write synchronously so that indirect blocks * never point at garbage. 
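*
* The ordering matters: the freshly allocated indirect block is zeroed by
* vfs_bio_clrbuf() and pushed to disk before its address nb is stored in
* the parent pointer (di_ib[] here), so a crash in between leaves at
* worst an unreferenced block, never a pointer to stale contents.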
*/ if (DOINGASYNC(vp)) bdwrite(bp); else if ((error = bwrite(bp)) != 0) goto fail; } allocib = &dp->di_ib[indirs[0].in_off]; *allocib = nb; ip->i_flag |= IN_CHANGE | IN_UPDATE; } /* * Fetch through the indirect blocks, allocating as necessary. */ retry: for (i = 1;;) { error = bread(vp, indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp); if (error) { brelse(bp); goto fail; } bap = (ufs2_daddr_t *)bp->b_data; nb = bap[indirs[i].in_off]; if (i == num) break; i += 1; if (nb != 0) { bqrelse(bp); continue; } UFS_LOCK(ump); /* * If parent indirect has just been allocated, try to cluster * immediately following it. */ if (pref == 0) pref = ffs_blkpref_ufs2(ip, lbn, i - num - 1, (ufs2_daddr_t *)0); if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, flags | IO_BUFLOCKED, cred, &newb)) != 0) { brelse(bp); if (DOINGSOFTDEP(vp) && ++reclaimed == 1) { UFS_LOCK(ump); softdep_request_cleanup(fs, vp, cred, FLUSH_BLOCKS_WAIT); UFS_UNLOCK(ump); goto retry; } if (ppsratecheck(&lastfail, &curfail, 1)) { ffs_fserr(fs, ip->i_number, "filesystem full"); uprintf("\n%s: write failed, filesystem " "is full\n", fs->fs_fsmnt); } goto fail; } pref = newb + fs->fs_frag; nb = newb; MPASS(allocblk < allociblk + nitems(allociblk)); MPASS(lbns_remfree < lbns + nitems(lbns)); *allocblk++ = nb; *lbns_remfree++ = indirs[i].in_lbn; nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0, GB_UNMAPPED); nbp->b_blkno = fsbtodb(fs, nb); vfs_bio_clrbuf(nbp); if (DOINGSOFTDEP(vp)) { softdep_setup_allocindir_meta(nbp, ip, bp, indirs[i - 1].in_off, nb); bdwrite(nbp); } else { /* * Write synchronously so that indirect blocks * never point at garbage. */ if ((error = bwrite(nbp)) != 0) { brelse(bp); goto fail; } } bap[indirs[i - 1].in_off] = nb; if (allocib == NULL && unwindidx < 0) unwindidx = i - 1; /* * If required, write synchronously, otherwise use * delayed write. */ if (flags & IO_SYNC) { bwrite(bp); } else { if (bp->b_bufsize == fs->fs_bsize) bp->b_flags |= B_CLUSTEROK; bdwrite(bp); } } /* * If asked only for the indirect block, then return it. */ if (flags & BA_METAONLY) { curthread_pflags_restore(saved_inbdflush); *bpp = bp; return (0); } /* * Get the data block, allocating if necessary. */ if (nb == 0) { UFS_LOCK(ump); /* * If allocating metadata at the front of the cylinder * group and parent indirect block has just been allocated, * then cluster next to it if it is the first indirect in * the file. Otherwise it has been allocated in the metadata * area, so we want to find our own place out in the data area. */ if (pref == 0 || (lbn > NDADDR && fs->fs_metaspace != 0)) pref = ffs_blkpref_ufs2(ip, lbn, indirs[i].in_off, &bap[0]); error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, flags | IO_BUFLOCKED, cred, &newb); if (error) { brelse(bp); if (DOINGSOFTDEP(vp) && ++reclaimed == 1) { UFS_LOCK(ump); softdep_request_cleanup(fs, vp, cred, FLUSH_BLOCKS_WAIT); UFS_UNLOCK(ump); goto retry; } if (ppsratecheck(&lastfail, &curfail, 1)) { ffs_fserr(fs, ip->i_number, "filesystem full"); uprintf("\n%s: write failed, filesystem " "is full\n", fs->fs_fsmnt); } goto fail; } nb = newb; MPASS(allocblk < allociblk + nitems(allociblk)); MPASS(lbns_remfree < lbns + nitems(lbns)); *allocblk++ = nb; *lbns_remfree++ = lbn; nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags); nbp->b_blkno = fsbtodb(fs, nb); if (flags & BA_CLRBUF) vfs_bio_clrbuf(nbp); if (DOINGSOFTDEP(vp)) softdep_setup_allocindir_page(ip, lbn, bp, indirs[i].in_off, nb, 0, nbp); bap[indirs[i].in_off] = nb; /* * If required, write synchronously, otherwise use * delayed write. 
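*
* IO_SYNC (set for O_SYNC-style writes) forces the parent indirect block
* out immediately with bwrite(); otherwise bdwrite() leaves it dirty and
* delayed, and B_CLUSTEROK on full-size buffers lets later writes be
* clustered with it.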
*/ if (flags & IO_SYNC) { bwrite(bp); } else { if (bp->b_bufsize == fs->fs_bsize) bp->b_flags |= B_CLUSTEROK; bdwrite(bp); } curthread_pflags_restore(saved_inbdflush); *bpp = nbp; return (0); } brelse(bp); /* * If requested clear invalid portions of the buffer. If we * have to do a read-before-write (typical if BA_CLRBUF is set), * try to do some read-ahead in the sequential case to reduce * the number of I/O transactions. */ if (flags & BA_CLRBUF) { int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT; if (seqcount != 0 && (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0 && !(vm_page_count_severe() || buf_dirty_count_severe())) { error = cluster_read(vp, ip->i_size, lbn, (int)fs->fs_bsize, NOCRED, MAXBSIZE, seqcount, gbflags, &nbp); } else { error = bread_gb(vp, lbn, (int)fs->fs_bsize, NOCRED, gbflags, &nbp); } if (error) { brelse(nbp); goto fail; } } else { nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags); nbp->b_blkno = fsbtodb(fs, nb); } curthread_pflags_restore(saved_inbdflush); *bpp = nbp; return (0); fail: curthread_pflags_restore(saved_inbdflush); /* * If we have failed to allocate any blocks, simply return the error. * This is the usual case and avoids the need to fsync the file. */ if (allocblk == allociblk && allocib == NULL && unwindidx == -1) return (error); /* * If we have failed part way through block allocation, we * have to deallocate any indirect blocks that we have allocated. * We have to fsync the file before we start to get rid of all * of its dependencies so that we do not leave them dangling. * We have to sync it at the end so that the soft updates code * does not find any untracked changes. Although this is really * slow, running out of disk space is not expected to be a common * occurrence. The error return from fsync is ignored as we already * have an error to return to the user. * * XXX Still have to journal the free below */ (void) ffs_syncvnode(vp, MNT_WAIT, 0); for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns; blkp < allocblk; blkp++, lbns_remfree++) { /* * We shall not leave the freed blocks on the vnode * buffer object lists. */ bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0, GB_NOCREAT | GB_UNMAPPED); if (bp != NULL) { KASSERT(bp->b_blkno == fsbtodb(fs, *blkp), ("mismatch2 l %jd %jd b %ju %ju", (intmax_t)bp->b_lblkno, (uintmax_t)*lbns_remfree, (uintmax_t)bp->b_blkno, (uintmax_t)fsbtodb(fs, *blkp))); bp->b_flags |= B_INVAL | B_RELBUF | B_NOCACHE; bp->b_flags &= ~(B_ASYNC | B_CACHE); brelse(bp); } deallocated += fs->fs_bsize; } if (allocib != NULL) { *allocib = 0; } else if (unwindidx >= 0) { int r; r = bread(vp, indirs[unwindidx].in_lbn, (int)fs->fs_bsize, NOCRED, &bp); if (r) { panic("Could not unwind indirect block, error %d", r); brelse(bp); } else { bap = (ufs2_daddr_t *)bp->b_data; bap[indirs[unwindidx].in_off] = 0; if (flags & IO_SYNC) { bwrite(bp); } else { if (bp->b_bufsize == fs->fs_bsize) bp->b_flags |= B_CLUSTEROK; bdwrite(bp); } } } if (deallocated) { #ifdef QUOTA /* * Restore user's disk quota because allocation failed. */ (void) chkdq(ip, -btodb(deallocated), cred, FORCE); #endif dp->di_blocks -= btodb(deallocated); ip->i_flag |= IN_CHANGE | IN_UPDATE; } (void) ffs_syncvnode(vp, MNT_WAIT, 0); /* * After the buffers are invalidated and on-disk pointers are * cleared, free the blocks. 
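*
* Summarizing the unwind order on failure: sync the vnode, strip any
* buffers naming the just-allocated blocks, zero the on-disk pointer that
* referenced them (*allocib or the unwindidx entry), give the quota back
* via chkdq(), sync again, and only then ffs_blkfree() every block
* recorded in allociblk[].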
*/ for (blkp = allociblk; blkp < allocblk; blkp++) { #ifdef INVARIANTS if (blkp == allociblk) lbns_remfree = lbns; bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0, GB_NOCREAT | GB_UNMAPPED); if (bp != NULL) { panic("zombie2 %jd %ju %ju", (intmax_t)bp->b_lblkno, (uintmax_t)bp->b_blkno, (uintmax_t)fsbtodb(fs, *blkp)); } lbns_remfree++; #endif ffs_blkfree(ump, fs, ump->um_devvp, *blkp, fs->fs_bsize, ip->i_number, vp->v_type, NULL); } return (error); } Index: stable/11/sys/ufs/ffs/ffs_vfsops.c =================================================================== --- stable/11/sys/ufs/ffs/ffs_vfsops.c (revision 331016) +++ stable/11/sys/ufs/ffs/ffs_vfsops.c (revision 331017) @@ -1,2290 +1,2291 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1991, 1993, 1994 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)ffs_vfsops.c 8.31 (Berkeley) 5/20/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_quota.h" #include "opt_ufs.h" #include "opt_ffs.h" #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static uma_zone_t uma_inode, uma_ufs1, uma_ufs2; static int ffs_mountfs(struct vnode *, struct mount *, struct thread *); static void ffs_oldfscompat_read(struct fs *, struct ufsmount *, ufs2_daddr_t); static void ffs_ifree(struct ufsmount *ump, struct inode *ip); static int ffs_sync_lazy(struct mount *mp); static vfs_init_t ffs_init; static vfs_uninit_t ffs_uninit; static vfs_extattrctl_t ffs_extattrctl; static vfs_cmount_t ffs_cmount; static vfs_unmount_t ffs_unmount; static vfs_mount_t ffs_mount; static vfs_statfs_t ffs_statfs; static vfs_fhtovp_t ffs_fhtovp; static vfs_sync_t ffs_sync; static struct vfsops ufs_vfsops = { .vfs_extattrctl = ffs_extattrctl, .vfs_fhtovp = ffs_fhtovp, .vfs_init = ffs_init, .vfs_mount = ffs_mount, .vfs_cmount = ffs_cmount, .vfs_quotactl = ufs_quotactl, .vfs_root = ufs_root, .vfs_statfs = ffs_statfs, .vfs_sync = ffs_sync, .vfs_uninit = ffs_uninit, .vfs_unmount = ffs_unmount, .vfs_vget = ffs_vget, .vfs_susp_clean = process_deferred_inactive, }; VFS_SET(ufs_vfsops, ufs, 0); MODULE_VERSION(ufs, 1); static b_strategy_t ffs_geom_strategy; static b_write_t ffs_bufwrite; static struct buf_ops ffs_ops = { .bop_name = "FFS", .bop_write = ffs_bufwrite, .bop_strategy = ffs_geom_strategy, .bop_sync = bufsync, #ifdef NO_FFS_SNAPSHOT .bop_bdflush = bufbdflush, #else .bop_bdflush = ffs_bdflush, #endif }; /* * Note that userquota and groupquota options are not currently used * by UFS/FFS code and generally mount(8) does not pass those options * from userland, but they can be passed by loader(8) via * vfs.root.mountfrom.options. */ static const char *ffs_opts[] = { "acls", "async", "noatime", "noclusterr", "noclusterw", "noexec", "export", "force", "from", "groupquota", "multilabel", "nfsv4acls", "fsckpid", "snapshot", "nosuid", "suiddir", "nosymfollow", "sync", "union", "userquota", NULL }; static int ffs_mount(struct mount *mp) { struct vnode *devvp; struct thread *td; struct ufsmount *ump = NULL; struct fs *fs; pid_t fsckpid = 0; int error, error1, flags; uint64_t mntorflags; accmode_t accmode; struct nameidata ndp; char *fspec; td = curthread; if (vfs_filteropt(mp->mnt_optnew, ffs_opts)) return (EINVAL); if (uma_inode == NULL) { uma_inode = uma_zcreate("FFS inode", sizeof(struct inode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_ufs1 = uma_zcreate("FFS1 dinode", sizeof(struct ufs1_dinode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_ufs2 = uma_zcreate("FFS2 dinode", sizeof(struct ufs2_dinode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); } vfs_deleteopt(mp->mnt_optnew, "groupquota"); vfs_deleteopt(mp->mnt_optnew, "userquota"); fspec = vfs_getopts(mp->mnt_optnew, "from", &error); if (error) return (error); mntorflags = 0; if (vfs_getopt(mp->mnt_optnew, "acls", NULL, NULL) == 0) mntorflags |= MNT_ACLS; if (vfs_getopt(mp->mnt_optnew, "snapshot", NULL, NULL) == 0) { mntorflags |= MNT_SNAPSHOT; /* * Once we have set the MNT_SNAPSHOT flag, do not * persist "snapshot" in the options list. 
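*
* (Snapshots are requested as a mount update; a typical invocation,
* assuming /var is mounted, is "mount -u -o snapshot /var/snap/snap1
* /var".  Persisting the option would make every later update attempt
* another snapshot.)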
*/ vfs_deleteopt(mp->mnt_optnew, "snapshot"); vfs_deleteopt(mp->mnt_opt, "snapshot"); } if (vfs_getopt(mp->mnt_optnew, "fsckpid", NULL, NULL) == 0 && vfs_scanopt(mp->mnt_optnew, "fsckpid", "%d", &fsckpid) == 1) { /* * Once we have set the restricted PID, do not * persist "fsckpid" in the options list. */ vfs_deleteopt(mp->mnt_optnew, "fsckpid"); vfs_deleteopt(mp->mnt_opt, "fsckpid"); if (mp->mnt_flag & MNT_UPDATE) { if (VFSTOUFS(mp)->um_fs->fs_ronly == 0 && vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0) == 0) { vfs_mount_error(mp, "Checker enable: Must be read-only"); return (EINVAL); } } else if (vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0) == 0) { vfs_mount_error(mp, "Checker enable: Must be read-only"); return (EINVAL); } /* Set to -1 if we are done */ if (fsckpid == 0) fsckpid = -1; } if (vfs_getopt(mp->mnt_optnew, "nfsv4acls", NULL, NULL) == 0) { if (mntorflags & MNT_ACLS) { vfs_mount_error(mp, "\"acls\" and \"nfsv4acls\" options " "are mutually exclusive"); return (EINVAL); } mntorflags |= MNT_NFS4ACLS; } MNT_ILOCK(mp); mp->mnt_flag |= mntorflags; MNT_IUNLOCK(mp); /* * If updating, check whether changing from read-only to * read/write; if there is no device name, that's all we do. */ if (mp->mnt_flag & MNT_UPDATE) { ump = VFSTOUFS(mp); fs = ump->um_fs; devvp = ump->um_devvp; if (fsckpid == -1 && ump->um_fsckpid > 0) { if ((error = ffs_flushfiles(mp, WRITECLOSE, td)) != 0 || (error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) return (error); g_topology_lock(); /* * Return to normal read-only mode. */ error = g_access(ump->um_cp, 0, -1, 0); g_topology_unlock(); ump->um_fsckpid = 0; } if (fs->fs_ronly == 0 && vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) { /* * Flush any dirty data and suspend filesystem. */ if ((error = vn_start_write(NULL, &mp, V_WAIT)) != 0) return (error); error = vfs_write_suspend_umnt(mp); if (error != 0) return (error); /* * Check for and optionally get rid of files open * for writing. */ flags = WRITECLOSE; if (mp->mnt_flag & MNT_FORCE) flags |= FORCECLOSE; if (MOUNTEDSOFTDEP(mp)) { error = softdep_flushfiles(mp, flags, td); } else { error = ffs_flushfiles(mp, flags, td); } if (error) { vfs_write_resume(mp, 0); return (error); } if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { printf("WARNING: %s Update error: blocks %jd " "files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0) fs->fs_clean = 1; if ((error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) { fs->fs_ronly = 0; fs->fs_clean = 0; vfs_write_resume(mp, 0); return (error); } if (MOUNTEDSOFTDEP(mp)) softdep_unmount(mp); g_topology_lock(); /* * Drop our write and exclusive access. */ g_access(ump->um_cp, 0, -1, -1); g_topology_unlock(); fs->fs_ronly = 1; MNT_ILOCK(mp); mp->mnt_flag |= MNT_RDONLY; MNT_IUNLOCK(mp); /* * Allow the writers to note that filesystem * is ro now. */ vfs_write_resume(mp, 0); } if ((mp->mnt_flag & MNT_RELOAD) && (error = ffs_reload(mp, td, 0)) != 0) return (error); if (fs->fs_ronly && !vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) { /* * If we are running a checker, do not allow upgrade. */ if (ump->um_fsckpid > 0) { vfs_mount_error(mp, "Active checker, cannot upgrade to write"); return (EINVAL); } /* * If upgrade to read-write by non-root, then verify * that user has necessary permissions on the device. 
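*
* This mirrors the check used for a fresh mount below: VOP_ACCESS() asks
* for VREAD | VWRITE on the device vnode, and a failure may still be
* overridden by priv_check(td, PRIV_VFS_MOUNT_PERM) for privileged
* callers.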
*/ vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_ACCESS(devvp, VREAD | VWRITE, td->td_ucred, td); if (error) error = priv_check(td, PRIV_VFS_MOUNT_PERM); if (error) { VOP_UNLOCK(devvp, 0); return (error); } VOP_UNLOCK(devvp, 0); fs->fs_flags &= ~FS_UNCLEAN; if (fs->fs_clean == 0) { fs->fs_flags |= FS_UNCLEAN; if ((mp->mnt_flag & MNT_FORCE) || ((fs->fs_flags & (FS_SUJ | FS_NEEDSFSCK)) == 0 && (fs->fs_flags & FS_DOSOFTDEP))) { printf("WARNING: %s was not properly " "dismounted\n", fs->fs_fsmnt); } else { vfs_mount_error(mp, "R/W mount of %s denied. %s.%s", fs->fs_fsmnt, "Filesystem is not clean - run fsck", (fs->fs_flags & FS_SUJ) == 0 ? "" : " Forced mount will invalidate" " journal contents"); return (EPERM); } } g_topology_lock(); /* * Request exclusive write access. */ error = g_access(ump->um_cp, 0, 1, 1); g_topology_unlock(); if (error) return (error); if ((error = vn_start_write(NULL, &mp, V_WAIT)) != 0) return (error); fs->fs_ronly = 0; MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_RDONLY; MNT_IUNLOCK(mp); fs->fs_mtime = time_second; /* check to see if we need to start softdep */ if ((fs->fs_flags & FS_DOSOFTDEP) && (error = softdep_mount(devvp, mp, fs, td->td_ucred))){ vn_finished_write(mp); return (error); } fs->fs_clean = 0; if ((error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) { vn_finished_write(mp); return (error); } if (fs->fs_snapinum[0] != 0) ffs_snapshot_mount(mp); vn_finished_write(mp); } /* * Soft updates is incompatible with "async", * so if we are doing softupdates stop the user * from setting the async flag in an update. * Softdep_mount() clears it in an initial mount * or ro->rw remount. */ if (MOUNTEDSOFTDEP(mp)) { /* XXX: Reset too late ? */ MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_ASYNC; MNT_IUNLOCK(mp); } /* * Keep MNT_ACLS flag if it is stored in superblock. */ if ((fs->fs_flags & FS_ACLS) != 0) { /* XXX: Set too late ? */ MNT_ILOCK(mp); mp->mnt_flag |= MNT_ACLS; MNT_IUNLOCK(mp); } if ((fs->fs_flags & FS_NFS4ACLS) != 0) { /* XXX: Set too late ? */ MNT_ILOCK(mp); mp->mnt_flag |= MNT_NFS4ACLS; MNT_IUNLOCK(mp); } /* * If this is a request from fsck to clean up the filesystem, * then allow the specified pid to proceed. */ if (fsckpid > 0) { if (ump->um_fsckpid != 0) { vfs_mount_error(mp, "Active checker already running on %s", fs->fs_fsmnt); return (EINVAL); } KASSERT(MOUNTEDSOFTDEP(mp) == 0, ("soft updates enabled on read-only file system")); g_topology_lock(); /* * Request write access. */ error = g_access(ump->um_cp, 0, 1, 0); g_topology_unlock(); if (error) { vfs_mount_error(mp, "Checker activation failed on %s", fs->fs_fsmnt); return (error); } ump->um_fsckpid = fsckpid; if (fs->fs_snapinum[0] != 0) ffs_snapshot_mount(mp); fs->fs_mtime = time_second; fs->fs_fmod = 1; fs->fs_clean = 0; (void) ffs_sbupdate(ump, MNT_WAIT, 0); } /* * If this is a snapshot request, take the snapshot. */ if (mp->mnt_flag & MNT_SNAPSHOT) return (ffs_snapshot(mp, fspec)); /* * Must not call namei() while owning busy ref. */ vfs_unbusy(mp); } /* * Not an update, or updating the name: look up the name * and verify that it refers to a sensible disk device. */ NDINIT(&ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspec, td); error = namei(&ndp); if ((mp->mnt_flag & MNT_UPDATE) != 0) { /* * Unmount does not start if MNT_UPDATE is set. Mount * update busies mp before setting MNT_UPDATE. We * must be able to retain our busy ref successfully, * without sleep.
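*
* In other words, MBF_NOWAIT cannot legitimately fail here: our busy
* reference was taken before MNT_UPDATE was set and MNT_UPDATE keeps any
* unmount from draining the busy count, hence the MPASS() below instead
* of an error path.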
*/ error1 = vfs_busy(mp, MBF_NOWAIT); MPASS(error1 == 0); } if (error != 0) return (error); NDFREE(&ndp, NDF_ONLY_PNBUF); devvp = ndp.ni_vp; if (!vn_isdisk(devvp, &error)) { vput(devvp); return (error); } /* * If mount by non-root, then verify that user has necessary * permissions on the device. */ accmode = VREAD; if ((mp->mnt_flag & MNT_RDONLY) == 0) accmode |= VWRITE; error = VOP_ACCESS(devvp, accmode, td->td_ucred, td); if (error) error = priv_check(td, PRIV_VFS_MOUNT_PERM); if (error) { vput(devvp); return (error); } if (mp->mnt_flag & MNT_UPDATE) { /* * Update only * * If it's not the same vnode, or at least the same device * then it's not correct. */ if (devvp->v_rdev != ump->um_devvp->v_rdev) error = EINVAL; /* needs translation */ vput(devvp); if (error) return (error); } else { /* * New mount * * We need the name for the mount point (also used for * "last mounted on") copied in. If an error occurs, * the mount point is discarded by the upper level code. * Note that vfs_mount_alloc() populates f_mntonname for us. */ if ((error = ffs_mountfs(devvp, mp, td)) != 0) { vrele(devvp); return (error); } if (fsckpid > 0) { KASSERT(MOUNTEDSOFTDEP(mp) == 0, ("soft updates enabled on read-only file system")); ump = VFSTOUFS(mp); fs = ump->um_fs; g_topology_lock(); /* * Request write access. */ error = g_access(ump->um_cp, 0, 1, 0); g_topology_unlock(); if (error) { printf("WARNING: %s: Checker activation " "failed\n", fs->fs_fsmnt); } else { ump->um_fsckpid = fsckpid; if (fs->fs_snapinum[0] != 0) ffs_snapshot_mount(mp); fs->fs_mtime = time_second; fs->fs_clean = 0; (void) ffs_sbupdate(ump, MNT_WAIT, 0); } } } vfs_mountedfrom(mp, fspec); return (0); } /* * Compatibility with old mount system call. */ static int ffs_cmount(struct mntarg *ma, void *data, uint64_t flags) { struct ufs_args args; struct export_args exp; int error; if (data == NULL) return (EINVAL); error = copyin(data, &args, sizeof args); if (error) return (error); vfs_oexport_conv(&args.export, &exp); ma = mount_argsu(ma, "from", args.fspec, MAXPATHLEN); ma = mount_arg(ma, "export", &exp, sizeof(exp)); error = kernel_mount(ma, flags); return (error); } /* * Reload all incore data for a filesystem (used after running fsck on * the root filesystem and finding things to fix). If the 'force' flag * is 0, the filesystem must be mounted read-only. * * Things to do to update the mount: * 1) invalidate all cached meta-data. * 2) re-read superblock from disk. * 3) re-read summary information from disk. * 4) invalidate all inactive vnodes. * 5) clear MNTK_SUSPEND2 and MNTK_SUSPENDED flags, allowing secondary * writers, if requested. * 6) invalidate all cached file data. * 7) re-read inode data for all active vnodes. */ int ffs_reload(struct mount *mp, struct thread *td, int flags) { struct vnode *vp, *mvp, *devvp; struct inode *ip; void *space; struct buf *bp; struct fs *fs, *newfs; struct ufsmount *ump; ufs2_daddr_t sblockloc; int i, blks, error; u_long size; int32_t *lp; ump = VFSTOUFS(mp); MNT_ILOCK(mp); if ((mp->mnt_flag & MNT_RDONLY) == 0 && (flags & FFSR_FORCE) == 0) { MNT_IUNLOCK(mp); return (EINVAL); } MNT_IUNLOCK(mp); /* * Step 1: invalidate all cached meta-data. */ devvp = VFSTOUFS(mp)->um_devvp; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); if (vinvalbuf(devvp, 0, 0, 0) != 0) panic("ffs_reload: dirty1"); VOP_UNLOCK(devvp, 0); /* * Step 2: re-read superblock from disk. 
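*
* The reread superblock is only trusted after the sanity checks below:
* the magic must be FS_UFS1_MAGIC or FS_UFS2_MAGIC and fs_bsize must lie
* between sizeof(struct fs) and MAXBSIZE; the in-core pointer fields
* (fs_csp, fs_contigdirs, ...) are carried over before the bcopy().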
*/ fs = VFSTOUFS(mp)->um_fs; if ((error = bread(devvp, btodb(fs->fs_sblockloc), fs->fs_sbsize, NOCRED, &bp)) != 0) return (error); newfs = (struct fs *)bp->b_data; if ((newfs->fs_magic != FS_UFS1_MAGIC && newfs->fs_magic != FS_UFS2_MAGIC) || newfs->fs_bsize > MAXBSIZE || newfs->fs_bsize < sizeof(struct fs)) { brelse(bp); return (EIO); /* XXX needs translation */ } /* * Copy pointer fields back into superblock before copying in XXX * new superblock. These should really be in the ufsmount. XXX * Note that important parameters (eg fs_ncg) are unchanged. */ newfs->fs_csp = fs->fs_csp; newfs->fs_maxcluster = fs->fs_maxcluster; newfs->fs_contigdirs = fs->fs_contigdirs; newfs->fs_active = fs->fs_active; newfs->fs_ronly = fs->fs_ronly; sblockloc = fs->fs_sblockloc; bcopy(newfs, fs, (u_int)fs->fs_sbsize); brelse(bp); mp->mnt_maxsymlinklen = fs->fs_maxsymlinklen; ffs_oldfscompat_read(fs, VFSTOUFS(mp), sblockloc); UFS_LOCK(ump); if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { printf("WARNING: %s: reload pending error: blocks %jd " "files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } UFS_UNLOCK(ump); /* * Step 3: re-read summary information from disk. */ size = fs->fs_cssize; blks = howmany(size, fs->fs_fsize); if (fs->fs_contigsumsize > 0) size += fs->fs_ncg * sizeof(int32_t); size += fs->fs_ncg * sizeof(u_int8_t); free(fs->fs_csp, M_UFSMNT); space = malloc(size, M_UFSMNT, M_WAITOK); fs->fs_csp = space; for (i = 0; i < blks; i += fs->fs_frag) { size = fs->fs_bsize; if (i + fs->fs_frag > blks) size = (blks - i) * fs->fs_fsize; error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), size, NOCRED, &bp); if (error) return (error); bcopy(bp->b_data, space, (u_int)size); space = (char *)space + size; brelse(bp); } /* * We no longer know anything about clusters per cylinder group. */ if (fs->fs_contigsumsize > 0) { fs->fs_maxcluster = lp = space; for (i = 0; i < fs->fs_ncg; i++) *lp++ = fs->fs_contigsumsize; space = lp; } size = fs->fs_ncg * sizeof(u_int8_t); fs->fs_contigdirs = (u_int8_t *)space; bzero(fs->fs_contigdirs, size); if ((flags & FFSR_UNSUSPEND) != 0) { MNT_ILOCK(mp); mp->mnt_kern_flag &= ~(MNTK_SUSPENDED | MNTK_SUSPEND2); wakeup(&mp->mnt_flag); MNT_IUNLOCK(mp); } loop: MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { /* * Skip syncer vnode. */ if (vp->v_type == VNON) { VI_UNLOCK(vp); continue; } /* * Step 4: invalidate all cached file data. */ if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) { MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); goto loop; } if (vinvalbuf(vp, 0, 0, 0)) panic("ffs_reload: dirty2"); /* * Step 5: re-read inode data for all active vnodes. */ ip = VTOI(vp); error = bread(devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), (int)fs->fs_bsize, NOCRED, &bp); if (error) { VOP_UNLOCK(vp, 0); vrele(vp); MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); return (error); } ffs_load_inode(bp, ip, fs, ip->i_number); ip->i_effnlink = ip->i_nlink; brelse(bp); VOP_UNLOCK(vp, 0); vrele(vp); } return (0); } /* * Possible superblock locations ordered from most to least likely. */ static int sblock_try[] = SBLOCKSEARCH; /* * Common code for mount and mountroot */ static int ffs_mountfs(devvp, mp, td) struct vnode *devvp; struct mount *mp; struct thread *td; { struct ufsmount *ump; struct buf *bp; struct fs *fs; struct cdev *dev; void *space; ufs2_daddr_t sblockloc; int error, i, blks, len, ronly; u_long size; int32_t *lp; struct ucred *cred; struct g_consumer *cp; struct mount *nmp; bp = NULL; ump = NULL; cred = td ? 
td->td_ucred : NOCRED; ronly = (mp->mnt_flag & MNT_RDONLY) != 0; KASSERT(devvp->v_type == VCHR, ("reclaimed devvp")); dev = devvp->v_rdev; if (atomic_cmpset_acq_ptr((uintptr_t *)&dev->si_mountpt, 0, (uintptr_t)mp) == 0) { VOP_UNLOCK(devvp, 0); return (EBUSY); } g_topology_lock(); error = g_vfs_open(devvp, &cp, "ffs", ronly ? 0 : 1); g_topology_unlock(); if (error != 0) { atomic_store_rel_ptr((uintptr_t *)&dev->si_mountpt, 0); VOP_UNLOCK(devvp, 0); return (error); } dev_ref(dev); devvp->v_bufobj.bo_ops = &ffs_ops; VOP_UNLOCK(devvp, 0); if (dev->si_iosize_max != 0) mp->mnt_iosize_max = dev->si_iosize_max; if (mp->mnt_iosize_max > MAXPHYS) mp->mnt_iosize_max = MAXPHYS; fs = NULL; sblockloc = 0; /* * Try reading the superblock in each of its possible locations. */ for (i = 0; sblock_try[i] != -1; i++) { if ((SBLOCKSIZE % cp->provider->sectorsize) != 0) { error = EINVAL; vfs_mount_error(mp, "Invalid sectorsize %d for superblock size %d", cp->provider->sectorsize, SBLOCKSIZE); goto out; } if ((error = bread(devvp, btodb(sblock_try[i]), SBLOCKSIZE, cred, &bp)) != 0) goto out; fs = (struct fs *)bp->b_data; sblockloc = sblock_try[i]; if ((fs->fs_magic == FS_UFS1_MAGIC || (fs->fs_magic == FS_UFS2_MAGIC && (fs->fs_sblockloc == sblockloc || (fs->fs_old_flags & FS_FLAGS_UPDATED) == 0))) && fs->fs_bsize <= MAXBSIZE && fs->fs_bsize >= sizeof(struct fs)) break; brelse(bp); bp = NULL; } if (sblock_try[i] == -1) { error = EINVAL; /* XXX needs translation */ goto out; } fs->fs_fmod = 0; fs->fs_flags &= ~FS_INDEXDIRS; /* no support for directory indices */ fs->fs_flags &= ~FS_UNCLEAN; if (fs->fs_clean == 0) { fs->fs_flags |= FS_UNCLEAN; if (ronly || (mp->mnt_flag & MNT_FORCE) || ((fs->fs_flags & (FS_SUJ | FS_NEEDSFSCK)) == 0 && (fs->fs_flags & FS_DOSOFTDEP))) { printf("WARNING: %s was not properly dismounted\n", fs->fs_fsmnt); } else { vfs_mount_error(mp, "R/W mount of %s denied. %s%s", fs->fs_fsmnt, "Filesystem is not clean - run fsck.", (fs->fs_flags & FS_SUJ) == 0 ? "" : " Forced mount will invalidate journal contents"); error = EPERM; goto out; } if ((fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) && (mp->mnt_flag & MNT_FORCE)) { printf("WARNING: %s: lost blocks %jd files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } } if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { printf("WARNING: %s: mount pending error: blocks %jd " "files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } if ((fs->fs_flags & FS_GJOURNAL) != 0) { #ifdef UFS_GJOURNAL /* * Get journal provider name. 
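* The length argument to g_io_getattr() is in/out: the caller passes its buffer size and, on success, it is updated to the actual attribute length (hence the realloc to "len" below). A minimal sketch of the same pattern, with the buffer name illustrative: * char buf[1024]; * int len = sizeof(buf); * if (g_io_getattr("GJOURNAL::provider", cp, &len, buf) == 0) * ... buf holds the NUL-terminated provider name ...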
*/ len = 1024; mp->mnt_gjprovider = malloc((u_long)len, M_UFSMNT, M_WAITOK); if (g_io_getattr("GJOURNAL::provider", cp, &len, mp->mnt_gjprovider) == 0) { mp->mnt_gjprovider = realloc(mp->mnt_gjprovider, len, M_UFSMNT, M_WAITOK); MNT_ILOCK(mp); mp->mnt_flag |= MNT_GJOURNAL; MNT_IUNLOCK(mp); } else { printf("WARNING: %s: GJOURNAL flag on fs " "but no gjournal provider below\n", mp->mnt_stat.f_mntonname); free(mp->mnt_gjprovider, M_UFSMNT); mp->mnt_gjprovider = NULL; } #else printf("WARNING: %s: GJOURNAL flag on fs but no " "UFS_GJOURNAL support\n", mp->mnt_stat.f_mntonname); #endif } else { mp->mnt_gjprovider = NULL; } ump = malloc(sizeof *ump, M_UFSMNT, M_WAITOK | M_ZERO); ump->um_cp = cp; ump->um_bo = &devvp->v_bufobj; ump->um_fs = malloc((u_long)fs->fs_sbsize, M_UFSMNT, M_WAITOK); if (fs->fs_magic == FS_UFS1_MAGIC) { ump->um_fstype = UFS1; ump->um_balloc = ffs_balloc_ufs1; } else { ump->um_fstype = UFS2; ump->um_balloc = ffs_balloc_ufs2; } ump->um_blkatoff = ffs_blkatoff; ump->um_truncate = ffs_truncate; ump->um_update = ffs_update; ump->um_valloc = ffs_valloc; ump->um_vfree = ffs_vfree; ump->um_ifree = ffs_ifree; ump->um_rdonly = ffs_rdonly; ump->um_snapgone = ffs_snapgone; mtx_init(UFS_MTX(ump), "FFS", "FFS Lock", MTX_DEF); bcopy(bp->b_data, ump->um_fs, (u_int)fs->fs_sbsize); if (fs->fs_sbsize < SBLOCKSIZE) bp->b_flags |= B_INVAL | B_NOCACHE; brelse(bp); bp = NULL; fs = ump->um_fs; ffs_oldfscompat_read(fs, ump, sblockloc); fs->fs_ronly = ronly; size = fs->fs_cssize; blks = howmany(size, fs->fs_fsize); if (fs->fs_contigsumsize > 0) size += fs->fs_ncg * sizeof(int32_t); size += fs->fs_ncg * sizeof(u_int8_t); space = malloc(size, M_UFSMNT, M_WAITOK); fs->fs_csp = space; for (i = 0; i < blks; i += fs->fs_frag) { size = fs->fs_bsize; if (i + fs->fs_frag > blks) size = (blks - i) * fs->fs_fsize; if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), size, cred, &bp)) != 0) { free(fs->fs_csp, M_UFSMNT); goto out; } bcopy(bp->b_data, space, (u_int)size); space = (char *)space + size; brelse(bp); bp = NULL; } if (fs->fs_contigsumsize > 0) { fs->fs_maxcluster = lp = space; for (i = 0; i < fs->fs_ncg; i++) *lp++ = fs->fs_contigsumsize; space = lp; } size = fs->fs_ncg * sizeof(u_int8_t); fs->fs_contigdirs = (u_int8_t *)space; bzero(fs->fs_contigdirs, size); fs->fs_active = NULL; mp->mnt_data = ump; mp->mnt_stat.f_fsid.val[0] = fs->fs_id[0]; mp->mnt_stat.f_fsid.val[1] = fs->fs_id[1]; nmp = NULL; if (fs->fs_id[0] == 0 || fs->fs_id[1] == 0 || (nmp = vfs_getvfs(&mp->mnt_stat.f_fsid))) { if (nmp) vfs_rel(nmp); vfs_getnewfsid(mp); } mp->mnt_maxsymlinklen = fs->fs_maxsymlinklen; MNT_ILOCK(mp); mp->mnt_flag |= MNT_LOCAL; MNT_IUNLOCK(mp); if ((fs->fs_flags & FS_MULTILABEL) != 0) { #ifdef MAC MNT_ILOCK(mp); mp->mnt_flag |= MNT_MULTILABEL; MNT_IUNLOCK(mp); #else printf("WARNING: %s: multilabel flag on fs but " "no MAC support\n", mp->mnt_stat.f_mntonname); #endif } if ((fs->fs_flags & FS_ACLS) != 0) { #ifdef UFS_ACL MNT_ILOCK(mp); if (mp->mnt_flag & MNT_NFS4ACLS) printf("WARNING: %s: ACLs flag on fs conflicts with " "\"nfsv4acls\" mount option; option ignored\n", mp->mnt_stat.f_mntonname); mp->mnt_flag &= ~MNT_NFS4ACLS; mp->mnt_flag |= MNT_ACLS; MNT_IUNLOCK(mp); #else printf("WARNING: %s: ACLs flag on fs but no ACLs support\n", mp->mnt_stat.f_mntonname); #endif } if ((fs->fs_flags & FS_NFS4ACLS) != 0) { #ifdef UFS_ACL MNT_ILOCK(mp); if (mp->mnt_flag & MNT_ACLS) printf("WARNING: %s: NFSv4 ACLs flag on fs conflicts " "with \"acls\" mount option; option ignored\n", mp->mnt_stat.f_mntonname); mp->mnt_flag &= 
~MNT_ACLS; mp->mnt_flag |= MNT_NFS4ACLS; MNT_IUNLOCK(mp); #else printf("WARNING: %s: NFSv4 ACLs flag on fs but no " "ACLs support\n", mp->mnt_stat.f_mntonname); #endif } if ((fs->fs_flags & FS_TRIM) != 0) { len = sizeof(int); if (g_io_getattr("GEOM::candelete", cp, &len, &ump->um_candelete) == 0) { if (!ump->um_candelete) printf("WARNING: %s: TRIM flag on fs but disk " "does not support TRIM\n", mp->mnt_stat.f_mntonname); } else { printf("WARNING: %s: TRIM flag on fs but disk does " "not confirm that it supports TRIM\n", mp->mnt_stat.f_mntonname); ump->um_candelete = 0; } if (ump->um_candelete) { ump->um_trim_tq = taskqueue_create("trim", M_WAITOK, taskqueue_thread_enqueue, &ump->um_trim_tq); taskqueue_start_threads(&ump->um_trim_tq, 1, PVFS, "%s trim", mp->mnt_stat.f_mntonname); } } ump->um_mountp = mp; ump->um_dev = dev; ump->um_devvp = devvp; ump->um_nindir = fs->fs_nindir; ump->um_bptrtodb = fs->fs_fsbtodb; ump->um_seqinc = fs->fs_frag; for (i = 0; i < MAXQUOTAS; i++) ump->um_quotas[i] = NULLVP; #ifdef UFS_EXTATTR ufs_extattr_uepm_init(&ump->um_extattr); #endif /* * Set FS local "last mounted on" information (NULL pad) */ bzero(fs->fs_fsmnt, MAXMNTLEN); strlcpy(fs->fs_fsmnt, mp->mnt_stat.f_mntonname, MAXMNTLEN); mp->mnt_stat.f_iosize = fs->fs_bsize; if (mp->mnt_flag & MNT_ROOTFS) { /* * Root mount; update timestamp in mount structure. * this will be used by the common root mount code * to update the system clock. */ mp->mnt_time = fs->fs_time; } if (ronly == 0) { fs->fs_mtime = time_second; if ((fs->fs_flags & FS_DOSOFTDEP) && (error = softdep_mount(devvp, mp, fs, cred)) != 0) { free(fs->fs_csp, M_UFSMNT); ffs_flushfiles(mp, FORCECLOSE, td); goto out; } if (fs->fs_snapinum[0] != 0) ffs_snapshot_mount(mp); fs->fs_fmod = 1; fs->fs_clean = 0; (void) ffs_sbupdate(ump, MNT_WAIT, 0); } /* * Initialize filesystem state information in mount struct. */ MNT_ILOCK(mp); mp->mnt_kern_flag |= MNTK_LOOKUP_SHARED | MNTK_EXTENDED_SHARED | MNTK_NO_IOPF | MNTK_UNMAPPED_BUFS | MNTK_USES_BCACHE; MNT_IUNLOCK(mp); #ifdef UFS_EXTATTR #ifdef UFS_EXTATTR_AUTOSTART /* * * Auto-starting does the following: * - check for /.attribute in the fs, and extattr_start if so * - for each file in .attribute, enable that file with * an attribute of the same name. * Not clear how to report errors -- probably eat them. * This would all happen while the filesystem was busy/not * available, so would effectively be "atomic". */ (void) ufs_extattr_autostart(mp, td); #endif /* !UFS_EXTATTR_AUTOSTART */ #endif /* !UFS_EXTATTR */ return (0); out: if (bp) brelse(bp); if (cp != NULL) { g_topology_lock(); g_vfs_close(cp); g_topology_unlock(); } if (ump) { mtx_destroy(UFS_MTX(ump)); if (mp->mnt_gjprovider != NULL) { free(mp->mnt_gjprovider, M_UFSMNT); mp->mnt_gjprovider = NULL; } free(ump->um_fs, M_UFSMNT); free(ump, M_UFSMNT); mp->mnt_data = NULL; } atomic_store_rel_ptr((uintptr_t *)&dev->si_mountpt, 0); dev_rel(dev); return (error); } #include static int bigcgs = 0; SYSCTL_INT(_debug, OID_AUTO, bigcgs, CTLFLAG_RW, &bigcgs, 0, ""); /* * Sanity checks for loading old filesystem superblocks. * See ffs_oldfscompat_write below for unwound actions. * * XXX - Parts get retired eventually. * Unfortunately new bits get added. */ static void ffs_oldfscompat_read(fs, ump, sblockloc) struct fs *fs; struct ufsmount *ump; ufs2_daddr_t sblockloc; { off_t maxfilesize; /* * If not yet done, update fs_flags location and value of fs_sblockloc. 
*/ if ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0) { fs->fs_flags = fs->fs_old_flags; fs->fs_old_flags |= FS_FLAGS_UPDATED; fs->fs_sblockloc = sblockloc; } /* * If not yet done, update UFS1 superblock with new wider fields. */ if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_maxbsize != fs->fs_bsize) { fs->fs_maxbsize = fs->fs_bsize; fs->fs_time = fs->fs_old_time; fs->fs_size = fs->fs_old_size; fs->fs_dsize = fs->fs_old_dsize; fs->fs_csaddr = fs->fs_old_csaddr; fs->fs_cstotal.cs_ndir = fs->fs_old_cstotal.cs_ndir; fs->fs_cstotal.cs_nbfree = fs->fs_old_cstotal.cs_nbfree; fs->fs_cstotal.cs_nifree = fs->fs_old_cstotal.cs_nifree; fs->fs_cstotal.cs_nffree = fs->fs_old_cstotal.cs_nffree; } if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_old_inodefmt < FS_44INODEFMT) { fs->fs_maxfilesize = ((uint64_t)1 << 31) - 1; fs->fs_qbmask = ~fs->fs_bmask; fs->fs_qfmask = ~fs->fs_fmask; } if (fs->fs_magic == FS_UFS1_MAGIC) { ump->um_savedmaxfilesize = fs->fs_maxfilesize; maxfilesize = (uint64_t)0x80000000 * fs->fs_bsize - 1; if (fs->fs_maxfilesize > maxfilesize) fs->fs_maxfilesize = maxfilesize; } /* Compatibility for old filesystems */ if (fs->fs_avgfilesize <= 0) fs->fs_avgfilesize = AVFILESIZ; if (fs->fs_avgfpdir <= 0) fs->fs_avgfpdir = AFPDIR; if (bigcgs) { fs->fs_save_cgsize = fs->fs_cgsize; fs->fs_cgsize = fs->fs_bsize; } } /* * Unwinding superblock updates for old filesystems. * See ffs_oldfscompat_read above for details. * * XXX - Parts get retired eventually. * Unfortunately new bits get added. */ void ffs_oldfscompat_write(fs, ump) struct fs *fs; struct ufsmount *ump; { /* * Copy back UFS2 updated fields that UFS1 inspects. */ if (fs->fs_magic == FS_UFS1_MAGIC) { fs->fs_old_time = fs->fs_time; fs->fs_old_cstotal.cs_ndir = fs->fs_cstotal.cs_ndir; fs->fs_old_cstotal.cs_nbfree = fs->fs_cstotal.cs_nbfree; fs->fs_old_cstotal.cs_nifree = fs->fs_cstotal.cs_nifree; fs->fs_old_cstotal.cs_nffree = fs->fs_cstotal.cs_nffree; fs->fs_maxfilesize = ump->um_savedmaxfilesize; } if (bigcgs) { fs->fs_cgsize = fs->fs_save_cgsize; fs->fs_save_cgsize = 0; } } /* * unmount system call */ static int ffs_unmount(mp, mntflags) struct mount *mp; int mntflags; { struct thread *td; struct ufsmount *ump = VFSTOUFS(mp); struct fs *fs; int error, flags, susp; #ifdef UFS_EXTATTR int e_restart; #endif flags = 0; td = curthread; fs = ump->um_fs; susp = 0; if (mntflags & MNT_FORCE) { flags |= FORCECLOSE; susp = fs->fs_ronly == 0; } #ifdef UFS_EXTATTR if ((error = ufs_extattr_stop(mp, td))) { if (error != EOPNOTSUPP) printf("WARNING: unmount %s: ufs_extattr_stop " "returned errno %d\n", mp->mnt_stat.f_mntonname, error); e_restart = 0; } else { ufs_extattr_uepm_destroy(&ump->um_extattr); e_restart = 1; } #endif if (susp) { error = vfs_write_suspend_umnt(mp); if (error != 0) goto fail1; } if (MOUNTEDSOFTDEP(mp)) error = softdep_flushfiles(mp, flags, td); else error = ffs_flushfiles(mp, flags, td); if (error != 0 && error != ENXIO) goto fail; UFS_LOCK(ump); if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { printf("WARNING: unmount %s: pending error: blocks %jd " "files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } UFS_UNLOCK(ump); if (MOUNTEDSOFTDEP(mp)) softdep_unmount(mp); if (fs->fs_ronly == 0 || ump->um_fsckpid > 0) { fs->fs_clean = fs->fs_flags & (FS_UNCLEAN|FS_NEEDSFSCK) ? 
0 : 1; error = ffs_sbupdate(ump, MNT_WAIT, 0); if (error && error != ENXIO) { fs->fs_clean = 0; goto fail; } } if (susp) vfs_write_resume(mp, VR_START_WRITE); if (ump->um_trim_tq != NULL) { while (ump->um_trim_inflight != 0) pause("ufsutr", hz); taskqueue_drain_all(ump->um_trim_tq); taskqueue_free(ump->um_trim_tq); } g_topology_lock(); if (ump->um_fsckpid > 0) { /* * Return to normal read-only mode. */ error = g_access(ump->um_cp, 0, -1, 0); ump->um_fsckpid = 0; } g_vfs_close(ump->um_cp); g_topology_unlock(); atomic_store_rel_ptr((uintptr_t *)&ump->um_dev->si_mountpt, 0); vrele(ump->um_devvp); dev_rel(ump->um_dev); mtx_destroy(UFS_MTX(ump)); if (mp->mnt_gjprovider != NULL) { free(mp->mnt_gjprovider, M_UFSMNT); mp->mnt_gjprovider = NULL; } free(fs->fs_csp, M_UFSMNT); free(fs, M_UFSMNT); free(ump, M_UFSMNT); mp->mnt_data = NULL; MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_LOCAL; MNT_IUNLOCK(mp); if (td->td_su == mp) { td->td_su = NULL; vfs_rel(mp); } return (error); fail: if (susp) vfs_write_resume(mp, VR_START_WRITE); fail1: #ifdef UFS_EXTATTR if (e_restart) { ufs_extattr_uepm_init(&ump->um_extattr); #ifdef UFS_EXTATTR_AUTOSTART (void) ufs_extattr_autostart(mp, td); #endif } #endif return (error); } /* * Flush out all the files in a filesystem. */ int ffs_flushfiles(mp, flags, td) struct mount *mp; int flags; struct thread *td; { struct ufsmount *ump; int qerror, error; ump = VFSTOUFS(mp); qerror = 0; #ifdef QUOTA if (mp->mnt_flag & MNT_QUOTA) { int i; error = vflush(mp, 0, SKIPSYSTEM|flags, td); if (error) return (error); for (i = 0; i < MAXQUOTAS; i++) { error = quotaoff(td, mp, i); if (error != 0) { if ((flags & EARLYFLUSH) == 0) return (error); else qerror = error; } } /* * Here we fall through to vflush again to ensure that * we have gotten rid of all the system vnodes, unless * quotas must not be closed. */ } #endif ASSERT_VOP_LOCKED(ump->um_devvp, "ffs_flushfiles"); if (ump->um_devvp->v_vflag & VV_COPYONWRITE) { if ((error = vflush(mp, 0, SKIPSYSTEM | flags, td)) != 0) return (error); ffs_snapshot_unmount(mp); flags |= FORCECLOSE; /* * Here we fall through to vflush again to ensure * that we have gotten rid of all the system vnodes. */ } /* * Do not close system files if quotas were not closed, to be * able to sync the remaining dquots. The freeblks softupdate * workitems might hold a reference on a dquot, preventing * quotaoff() from completing. Next round of * softdep_flushworklist() iteration should process the * blockers, allowing the next run of quotaoff() to finally * flush held dquots. * * Otherwise, flush all the files. */ if (qerror == 0 && (error = vflush(mp, 0, flags, td)) != 0) return (error); /* * Flush filesystem metadata. */ vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_FSYNC(ump->um_devvp, MNT_WAIT, td); VOP_UNLOCK(ump->um_devvp, 0); return (error); } /* * Get filesystem statistics. 
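* Note that the "available" count reported below may legitimately go negative: f_bavail is freespace(fs, fs->fs_minfree), i.e. free frags less the minfree reserve. As a worked example with illustrative numbers, if the reserve works out to 80000 frags and only 70000 frags are free, f_bavail is -10000, which df(1) renders as over 100% capacity.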
*/ static int ffs_statfs(mp, sbp) struct mount *mp; struct statfs *sbp; { struct ufsmount *ump; struct fs *fs; ump = VFSTOUFS(mp); fs = ump->um_fs; if (fs->fs_magic != FS_UFS1_MAGIC && fs->fs_magic != FS_UFS2_MAGIC) panic("ffs_statfs"); sbp->f_version = STATFS_VERSION; sbp->f_bsize = fs->fs_fsize; sbp->f_iosize = fs->fs_bsize; sbp->f_blocks = fs->fs_dsize; UFS_LOCK(ump); sbp->f_bfree = fs->fs_cstotal.cs_nbfree * fs->fs_frag + fs->fs_cstotal.cs_nffree + dbtofsb(fs, fs->fs_pendingblocks); sbp->f_bavail = freespace(fs, fs->fs_minfree) + dbtofsb(fs, fs->fs_pendingblocks); sbp->f_files = fs->fs_ncg * fs->fs_ipg - ROOTINO; sbp->f_ffree = fs->fs_cstotal.cs_nifree + fs->fs_pendinginodes; UFS_UNLOCK(ump); sbp->f_namemax = NAME_MAX; return (0); } static bool sync_doupdate(struct inode *ip) { return ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) != 0); } /* * For a lazy sync, we only care about access times, quotas and the * superblock. Other filesystem changes are already converted to * cylinder group blocks or inode blocks updates and are written to * disk by syncer. */ static int ffs_sync_lazy(mp) struct mount *mp; { struct vnode *mvp, *vp; struct inode *ip; struct thread *td; int allerror, error; allerror = 0; td = curthread; if ((mp->mnt_flag & MNT_NOATIME) != 0) goto qupdate; MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) { if (vp->v_type == VNON) { VI_UNLOCK(vp); continue; } ip = VTOI(vp); /* * The IN_ACCESS flag is converted to IN_MODIFIED by * ufs_close() and ufs_getattr() by the calls to * ufs_itimes_locked(), without subsequent UFS_UPDATE(). * Test also all the other timestamp flags too, to pick up * any other cases that could be missed. */ if (!sync_doupdate(ip) && (vp->v_iflag & VI_OWEINACT) == 0) { VI_UNLOCK(vp); continue; } if ((error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, td)) != 0) continue; if (sync_doupdate(ip)) error = ffs_update(vp, 0); if (error != 0) allerror = error; vput(vp); } qupdate: #ifdef QUOTA qsync(mp); #endif if (VFSTOUFS(mp)->um_fs->fs_fmod != 0 && (error = ffs_sbupdate(VFSTOUFS(mp), MNT_LAZY, 0)) != 0) allerror = error; return (allerror); } /* * Go through the disk queues to initiate sandbagged IO; * go through the inodes to write those that have been modified; * initiate the writing of the super block if it has been modified. * * Note: we are always called with the filesystem marked busy using * vfs_busy(). */ static int ffs_sync(mp, waitfor) struct mount *mp; int waitfor; { struct vnode *mvp, *vp, *devvp; struct thread *td; struct inode *ip; struct ufsmount *ump = VFSTOUFS(mp); struct fs *fs; int error, count, lockreq, allerror = 0; int suspend; int suspended; int secondary_writes; int secondary_accwrites; int softdep_deps; int softdep_accdeps; struct bufobj *bo; suspend = 0; suspended = 0; td = curthread; fs = ump->um_fs; if (fs->fs_fmod != 0 && fs->fs_ronly != 0 && ump->um_fsckpid == 0) panic("%s: ffs_sync: modification on read-only filesystem", fs->fs_fsmnt); if (waitfor == MNT_LAZY) { if (!rebooting) return (ffs_sync_lazy(mp)); waitfor = MNT_NOWAIT; } /* * Write back each (modified) inode. 
*/ lockreq = LK_EXCLUSIVE | LK_NOWAIT; if (waitfor == MNT_SUSPEND) { suspend = 1; waitfor = MNT_WAIT; } if (waitfor == MNT_WAIT) lockreq = LK_EXCLUSIVE; lockreq |= LK_INTERLOCK | LK_SLEEPFAIL; loop: /* Grab snapshot of secondary write counts */ MNT_ILOCK(mp); secondary_writes = mp->mnt_secondary_writes; secondary_accwrites = mp->mnt_secondary_accwrites; MNT_IUNLOCK(mp); /* Grab snapshot of softdep dependency counts */ softdep_get_depcounts(mp, &softdep_deps, &softdep_accdeps); MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { /* * Depend on the vnode interlock to keep things stable enough * for a quick test. Since there might be hundreds of * thousands of vnodes, we cannot afford even a subroutine * call unless there's a good chance that we have work to do. */ if (vp->v_type == VNON) { VI_UNLOCK(vp); continue; } ip = VTOI(vp); if ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0 && vp->v_bufobj.bo_dirty.bv_cnt == 0) { VI_UNLOCK(vp); continue; } if ((error = vget(vp, lockreq, td)) != 0) { if (error == ENOENT || error == ENOLCK) { MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); goto loop; } continue; } if ((error = ffs_syncvnode(vp, waitfor, 0)) != 0) allerror = error; vput(vp); } /* * Force stale filesystem control information to be flushed. */ if (waitfor == MNT_WAIT || rebooting) { if ((error = softdep_flushworklist(ump->um_mountp, &count, td))) allerror = error; /* Flushed work items may create new vnodes to clean */ if (allerror == 0 && count) goto loop; } #ifdef QUOTA qsync(mp); #endif devvp = ump->um_devvp; bo = &devvp->v_bufobj; BO_LOCK(bo); if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) { BO_UNLOCK(bo); vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_FSYNC(devvp, waitfor, td); VOP_UNLOCK(devvp, 0); if (MOUNTEDSOFTDEP(mp) && (error == 0 || error == EAGAIN)) error = ffs_sbupdate(ump, waitfor, 0); if (error != 0) allerror = error; if (allerror == 0 && waitfor == MNT_WAIT) goto loop; } else if (suspend != 0) { if (softdep_check_suspend(mp, devvp, softdep_deps, softdep_accdeps, secondary_writes, secondary_accwrites) != 0) { MNT_IUNLOCK(mp); goto loop; /* More work needed */ } mtx_assert(MNT_MTX(mp), MA_OWNED); mp->mnt_kern_flag |= MNTK_SUSPEND2 | MNTK_SUSPENDED; MNT_IUNLOCK(mp); suspended = 1; } else BO_UNLOCK(bo); /* * Write back modified superblock. */ if (fs->fs_fmod != 0 && (error = ffs_sbupdate(ump, waitfor, suspended)) != 0) allerror = error; return (allerror); } int ffs_vget(mp, ino, flags, vpp) struct mount *mp; ino_t ino; int flags; struct vnode **vpp; { return (ffs_vgetf(mp, ino, flags, vpp, 0)); } int ffs_vgetf(mp, ino, flags, vpp, ffs_flags) struct mount *mp; ino_t ino; int flags; struct vnode **vpp; int ffs_flags; { struct fs *fs; struct inode *ip; struct ufsmount *ump; struct buf *bp; struct vnode *vp; int error; error = vfs_hash_get(mp, ino, flags, curthread, vpp, NULL, NULL); if (error || *vpp != NULL) return (error); /* * We must promote to an exclusive lock for vnode creation. This * can happen if lookup is passed LOCKSHARED. */ if ((flags & LK_TYPE_MASK) == LK_SHARED) { flags &= ~LK_TYPE_MASK; flags |= LK_EXCLUSIVE; } /* * We do not lock vnode creation as it is believed to be too * expensive for such rare case as simultaneous creation of vnode * for same ino by different processes. We just allow them to race * and check later to decide who wins. Let the race begin! */ ump = VFSTOUFS(mp); fs = ump->um_fs; ip = uma_zalloc(uma_inode, M_WAITOK | M_ZERO); /* Allocate a new vnode/inode. */ error = getnewvnode("ufs", mp, fs->fs_magic == FS_UFS1_MAGIC ? 
&ffs_vnodeops1 : &ffs_vnodeops2, &vp); if (error) { *vpp = NULL; uma_zfree(uma_inode, ip); return (error); } /* * FFS supports recursive locking. */ lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL); VN_LOCK_AREC(vp); vp->v_data = ip; vp->v_bufobj.bo_bsize = fs->fs_bsize; ip->i_vnode = vp; ip->i_ump = ump; ip->i_number = ino; ip->i_ea_refs = 0; ip->i_nextclustercg = -1; ip->i_flag = fs->fs_magic == FS_UFS1_MAGIC ? 0 : IN_UFS2; #ifdef QUOTA { int i; for (i = 0; i < MAXQUOTAS; i++) ip->i_dquot[i] = NODQUOT; } #endif if (ffs_flags & FFSV_FORCEINSMQ) vp->v_vflag |= VV_FORCEINSMQ; error = insmntque(vp, mp); if (error != 0) { uma_zfree(uma_inode, ip); *vpp = NULL; return (error); } vp->v_vflag &= ~VV_FORCEINSMQ; error = vfs_hash_insert(vp, ino, flags, curthread, vpp, NULL, NULL); if (error || *vpp != NULL) return (error); /* Read in the disk contents for the inode, copy into the inode. */ error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ino)), (int)fs->fs_bsize, NOCRED, &bp); if (error) { /* * The inode does not contain anything useful, so it would * be misleading to leave it on its hash chain. With mode * still zero, it will be unlinked and returned to the free * list by vput(). */ brelse(bp); vput(vp); *vpp = NULL; return (error); } if (I_IS_UFS1(ip)) ip->i_din1 = uma_zalloc(uma_ufs1, M_WAITOK); else ip->i_din2 = uma_zalloc(uma_ufs2, M_WAITOK); ffs_load_inode(bp, ip, fs, ino); if (DOINGSOFTDEP(vp)) softdep_load_inodeblock(ip); else ip->i_effnlink = ip->i_nlink; bqrelse(bp); /* * Initialize the vnode from the inode, check for aliases. * Note that the underlying vnode may have changed. */ error = ufs_vinit(mp, I_IS_UFS1(ip) ? &ffs_fifoops1 : &ffs_fifoops2, &vp); if (error) { vput(vp); *vpp = NULL; return (error); } /* * Finish inode initialization. */ if (vp->v_type != VFIFO) { /* FFS supports shared locking for all files except fifos. */ VN_LOCK_ASHARE(vp); } /* * Set up a generation number for this inode if it does not * already have one. This should only happen on old filesystems. */ if (ip->i_gen == 0) { while (ip->i_gen == 0) ip->i_gen = arc4random(); if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { ip->i_flag |= IN_MODIFIED; DIP_SET(ip, i_gen, ip->i_gen); } } #ifdef MAC if ((mp->mnt_flag & MNT_MULTILABEL) && ip->i_mode) { /* * If this vnode is already allocated, and we're running * multi-label, attempt to perform a label association * from the extended attributes on the inode. */ error = mac_vnode_associate_extattr(mp, vp); if (error) { /* ufs_inactive will release ip->i_devvp ref. */ vput(vp); *vpp = NULL; return (error); } } #endif *vpp = vp; return (0); } /* * File handle to vnode * * Have to be really careful about stale file handles: * - check that the inode number is valid * - for UFS2 check that the inode number is initialized * - call ffs_vget() to get the locked inode * - check for an unallocated inode (i_mode == 0) * - check that the given client host has export rights and return * those rights via exflagsp and credanonp */ static int ffs_fhtovp(mp, fhp, flags, vpp) struct mount *mp; struct fid *fhp; int flags; struct vnode **vpp; { struct ufid *ufhp; struct ufsmount *ump; struct fs *fs; struct cg *cgp; struct buf *bp; ino_t ino; u_int cg; int error; ufhp = (struct ufid *)fhp; ino = ufhp->ufid_ino; ump = VFSTOUFS(mp); fs = ump->um_fs; if (ino < ROOTINO || ino >= fs->fs_ncg * fs->fs_ipg) return (ESTALE); /* * Need to check if inode is initialized because UFS2 does lazy * initialization and nfs_fhtovp can offer arbitrary inode numbers.
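* As a worked example with illustrative numbers: with fs_ipg = 16384 and cg_initediblk = 4096 in cylinder group 0, a file handle naming inode 5000 maps to cg 0 but fails the "ino < cg * fs_ipg + cg_initediblk" bound enforced below, so it is rejected with ESTALE instead of being trusted as a real inode.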
*/ if (fs->fs_magic != FS_UFS2_MAGIC) return (ufs_fhtovp(mp, ufhp, flags, vpp)); cg = ino_to_cg(fs, ino); error = bread(ump->um_devvp, fsbtodb(fs, cgtod(fs, cg)), (int)fs->fs_cgsize, NOCRED, &bp); if (error) return (error); cgp = (struct cg *)bp->b_data; if (!cg_chkmagic(cgp) || ino >= cg * fs->fs_ipg + cgp->cg_initediblk) { brelse(bp); return (ESTALE); } brelse(bp); return (ufs_fhtovp(mp, ufhp, flags, vpp)); } /* * Initialize the filesystem. */ static int ffs_init(vfsp) struct vfsconf *vfsp; { ffs_susp_initialize(); softdep_initialize(); return (ufs_init(vfsp)); } /* * Undo the work of ffs_init(). */ static int ffs_uninit(vfsp) struct vfsconf *vfsp; { int ret; ret = ufs_uninit(vfsp); softdep_uninitialize(); ffs_susp_uninitialize(); return (ret); } /* * Write a superblock and associated information back to disk. */ int ffs_sbupdate(ump, waitfor, suspended) struct ufsmount *ump; int waitfor; int suspended; { struct fs *fs = ump->um_fs; struct buf *sbbp; struct buf *bp; int blks; void *space; int i, size, error, allerror = 0; if (fs->fs_ronly == 1 && (ump->um_mountp->mnt_flag & (MNT_RDONLY | MNT_UPDATE)) != (MNT_RDONLY | MNT_UPDATE) && ump->um_fsckpid == 0) panic("ffs_sbupdate: write read-only filesystem"); /* * We use the superblock's buf to serialize calls to ffs_sbupdate(). */ sbbp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc), (int)fs->fs_sbsize, 0, 0, 0); /* * First write back the summary information. */ blks = howmany(fs->fs_cssize, fs->fs_fsize); space = fs->fs_csp; for (i = 0; i < blks; i += fs->fs_frag) { size = fs->fs_bsize; if (i + fs->fs_frag > blks) size = (blks - i) * fs->fs_fsize; bp = getblk(ump->um_devvp, fsbtodb(fs, fs->fs_csaddr + i), size, 0, 0, 0); bcopy(space, bp->b_data, (u_int)size); space = (char *)space + size; if (suspended) bp->b_flags |= B_VALIDSUSPWRT; if (waitfor != MNT_WAIT) bawrite(bp); else if ((error = bwrite(bp)) != 0) allerror = error; } /* * Now write back the superblock itself. If any errors occurred * up to this point, then fail so that the superblock avoids * being written out as clean. 
*/ if (allerror) { brelse(sbbp); return (allerror); } bp = sbbp; if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_sblockloc != SBLOCK_UFS1 && (fs->fs_old_flags & FS_FLAGS_UPDATED) == 0) { printf("WARNING: %s: correcting fs_sblockloc from %jd to %d\n", fs->fs_fsmnt, fs->fs_sblockloc, SBLOCK_UFS1); fs->fs_sblockloc = SBLOCK_UFS1; } if (fs->fs_magic == FS_UFS2_MAGIC && fs->fs_sblockloc != SBLOCK_UFS2 && (fs->fs_old_flags & FS_FLAGS_UPDATED) == 0) { printf("WARNING: %s: correcting fs_sblockloc from %jd to %d\n", fs->fs_fsmnt, fs->fs_sblockloc, SBLOCK_UFS2); fs->fs_sblockloc = SBLOCK_UFS2; } fs->fs_fmod = 0; fs->fs_time = time_second; if (MOUNTEDSOFTDEP(ump->um_mountp)) softdep_setup_sbupdate(ump, (struct fs *)bp->b_data, bp); bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize); ffs_oldfscompat_write((struct fs *)bp->b_data, ump); if (suspended) bp->b_flags |= B_VALIDSUSPWRT; if (waitfor != MNT_WAIT) bawrite(bp); else if ((error = bwrite(bp)) != 0) allerror = error; return (allerror); } static int ffs_extattrctl(struct mount *mp, int cmd, struct vnode *filename_vp, int attrnamespace, const char *attrname) { #ifdef UFS_EXTATTR return (ufs_extattrctl(mp, cmd, filename_vp, attrnamespace, attrname)); #else return (vfs_stdextattrctl(mp, cmd, filename_vp, attrnamespace, attrname)); #endif } static void ffs_ifree(struct ufsmount *ump, struct inode *ip) { if (ump->um_fstype == UFS1 && ip->i_din1 != NULL) uma_zfree(uma_ufs1, ip->i_din1); else if (ip->i_din2 != NULL) uma_zfree(uma_ufs2, ip->i_din2); uma_zfree(uma_inode, ip); } static int dobkgrdwrite = 1; SYSCTL_INT(_debug, OID_AUTO, dobkgrdwrite, CTLFLAG_RW, &dobkgrdwrite, 0, "Do background writes (honoring the BV_BKGRDWRITE flag)?"); /* * Complete a background write started from bwrite. */ static void ffs_backgroundwritedone(struct buf *bp) { struct bufobj *bufobj; struct buf *origbp; /* * Find the original buffer that we are writing. */ bufobj = bp->b_bufobj; BO_LOCK(bufobj); if ((origbp = gbincore(bp->b_bufobj, bp->b_lblkno)) == NULL) panic("backgroundwritedone: lost buffer"); /* * We should mark the cylinder group buffer origbp as * dirty, so as not to lose the failed write. */ if ((bp->b_ioflags & BIO_ERROR) != 0) origbp->b_vflags |= BV_BKGRDERR; BO_UNLOCK(bufobj); /* * Process dependencies then return any unfinished ones. */ if (!LIST_EMPTY(&bp->b_dep) && (bp->b_ioflags & BIO_ERROR) == 0) buf_complete(bp); #ifdef SOFTUPDATES if (!LIST_EMPTY(&bp->b_dep)) softdep_move_dependencies(bp, origbp); #endif /* * This buffer is marked B_NOCACHE so when it is released * by biodone it will be tossed. */ bp->b_flags |= B_NOCACHE; bp->b_flags &= ~B_CACHE; pbrelvp(bp); /* * Prevent brelse() from trying to keep and re-dirtying bp on * errors. It causes b_bufobj dereference in * bdirty()/reassignbuf(), and b_bufobj was cleared in * pbrelvp() above. */ if ((bp->b_ioflags & BIO_ERROR) != 0) bp->b_flags |= B_INVAL; bufdone(bp); BO_LOCK(bufobj); /* * Clear the BV_BKGRDINPROG flag in the original buffer * and awaken it if it is waiting for the write to complete. * If BV_BKGRDINPROG is not set in the original buffer it must * have been released and re-instantiated - which is not legal. */ KASSERT((origbp->b_vflags & BV_BKGRDINPROG), ("backgroundwritedone: lost buffer2")); origbp->b_vflags &= ~BV_BKGRDINPROG; if (origbp->b_vflags & BV_BKGRDWAIT) { origbp->b_vflags &= ~BV_BKGRDWAIT; wakeup(&origbp->b_xflags); } BO_UNLOCK(bufobj); } /* * Write, release buffer on completion. (Done by iodone * if async). Do not bother writing anything if the buffer * is invalid.
* * Note that we set B_CACHE here, indicating that buffer is * fully valid and thus cacheable. This is true even of NFS * now so we set it generally. This could be set either here * or in biodone() since the I/O is synchronous. We put it * here. */ static int ffs_bufwrite(struct buf *bp) { struct buf *newbp; CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); if (bp->b_flags & B_INVAL) { brelse(bp); return (0); } if (!BUF_ISLOCKED(bp)) panic("bufwrite: buffer is not busy???"); /* * If a background write is already in progress, delay * writing this block if it is asynchronous. Otherwise * wait for the background write to complete. */ BO_LOCK(bp->b_bufobj); if (bp->b_vflags & BV_BKGRDINPROG) { if (bp->b_flags & B_ASYNC) { BO_UNLOCK(bp->b_bufobj); bdwrite(bp); return (0); } bp->b_vflags |= BV_BKGRDWAIT; msleep(&bp->b_xflags, BO_LOCKPTR(bp->b_bufobj), PRIBIO, "bwrbg", 0); if (bp->b_vflags & BV_BKGRDINPROG) panic("bufwrite: still writing"); } bp->b_vflags &= ~BV_BKGRDERR; BO_UNLOCK(bp->b_bufobj); /* * If this buffer is marked for background writing and we * do not have to wait for it, make a copy and write the * copy so as to leave this buffer ready for further use. * * This optimization eats a lot of memory. If we have a page * or buffer shortfall we can't do it. */ if (dobkgrdwrite && (bp->b_xflags & BX_BKGRDWRITE) && (bp->b_flags & B_ASYNC) && !vm_page_count_severe() && !buf_dirty_count_severe()) { KASSERT(bp->b_iodone == NULL, ("bufwrite: needs chained iodone (%p)", bp->b_iodone)); /* get a new block */ newbp = geteblk(bp->b_bufsize, GB_NOWAIT_BD); if (newbp == NULL) goto normal_write; KASSERT(buf_mapped(bp), ("Unmapped cg")); memcpy(newbp->b_data, bp->b_data, bp->b_bufsize); BO_LOCK(bp->b_bufobj); bp->b_vflags |= BV_BKGRDINPROG; BO_UNLOCK(bp->b_bufobj); newbp->b_xflags |= BX_BKGRDMARKER; newbp->b_lblkno = bp->b_lblkno; newbp->b_blkno = bp->b_blkno; newbp->b_offset = bp->b_offset; newbp->b_iodone = ffs_backgroundwritedone; newbp->b_flags |= B_ASYNC; newbp->b_flags &= ~B_INVAL; pbgetvp(bp->b_vp, newbp); #ifdef SOFTUPDATES /* * Move over the dependencies. If there are rollbacks, * leave the parent buffer dirtied as it will need to * be written again. */ if (LIST_EMPTY(&bp->b_dep) || softdep_move_dependencies(bp, newbp) == 0) bundirty(bp); #else bundirty(bp); #endif /* * Initiate write on the copy, release the original. The * BKGRDINPROG flag prevents it from going away until * the background write completes. 
*/ bqrelse(bp); bp = newbp; } else /* Mark the buffer clean */ bundirty(bp); /* Let the normal bufwrite do the rest for us */ normal_write: return (bufwrite(bp)); } static void ffs_geom_strategy(struct bufobj *bo, struct buf *bp) { struct vnode *vp; int error; struct buf *tbp; int nocopy; vp = bo->__bo_vnode; if (bp->b_iocmd == BIO_WRITE) { if ((bp->b_flags & B_VALIDSUSPWRT) == 0 && bp->b_vp != NULL && bp->b_vp->v_mount != NULL && (bp->b_vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED) != 0) panic("ffs_geom_strategy: bad I/O"); nocopy = bp->b_flags & B_NOCOPY; bp->b_flags &= ~(B_VALIDSUSPWRT | B_NOCOPY); if ((vp->v_vflag & VV_COPYONWRITE) && nocopy == 0 && vp->v_rdev->si_snapdata != NULL) { if ((bp->b_flags & B_CLUSTER) != 0) { runningbufwakeup(bp); TAILQ_FOREACH(tbp, &bp->b_cluster.cluster_head, b_cluster.cluster_entry) { error = ffs_copyonwrite(vp, tbp); if (error != 0 && error != EOPNOTSUPP) { bp->b_error = error; bp->b_ioflags |= BIO_ERROR; bufdone(bp); return; } } bp->b_runningbufspace = bp->b_bufsize; atomic_add_long(&runningbufspace, bp->b_runningbufspace); } else { error = ffs_copyonwrite(vp, bp); if (error != 0 && error != EOPNOTSUPP) { bp->b_error = error; bp->b_ioflags |= BIO_ERROR; bufdone(bp); return; } } } #ifdef SOFTUPDATES if ((bp->b_flags & B_CLUSTER) != 0) { TAILQ_FOREACH(tbp, &bp->b_cluster.cluster_head, b_cluster.cluster_entry) { if (!LIST_EMPTY(&tbp->b_dep)) buf_start(tbp); } } else { if (!LIST_EMPTY(&bp->b_dep)) buf_start(bp); } #endif } g_vfs_strategy(bo, bp); } int ffs_own_mount(const struct mount *mp) { if (mp->mnt_op == &ufs_vfsops) return (1); return (0); } #ifdef DDB #ifdef SOFTUPDATES /* defined in ffs_softdep.c */ extern void db_print_ffs(struct ufsmount *ump); DB_SHOW_COMMAND(ffs, db_show_ffs) { struct mount *mp; struct ufsmount *ump; if (have_addr) { ump = VFSTOUFS((struct mount *)addr); db_print_ffs(ump); return; } TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (!strcmp(mp->mnt_stat.f_fstypename, ufs_vfsconf.vfc_name)) db_print_ffs(VFSTOUFS(mp)); } } #endif /* SOFTUPDATES */ #endif /* DDB */ Index: stable/11/sys/vm/device_pager.c =================================================================== --- stable/11/sys/vm/device_pager.c (revision 331016) +++ stable/11/sys/vm/device_pager.c (revision 331017) @@ -1,470 +1,471 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1990 University of Utah. * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)device_pager.c 8.1 (Berkeley) 6/11/93 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include static void dev_pager_init(void); static vm_object_t dev_pager_alloc(void *, vm_ooffset_t, vm_prot_t, vm_ooffset_t, struct ucred *); static void dev_pager_dealloc(vm_object_t); static int dev_pager_getpages(vm_object_t, vm_page_t *, int, int *, int *); static void dev_pager_putpages(vm_object_t, vm_page_t *, int, int, int *); static boolean_t dev_pager_haspage(vm_object_t, vm_pindex_t, int *, int *); static void dev_pager_free_page(vm_object_t object, vm_page_t m); static int dev_pager_populate(vm_object_t object, vm_pindex_t pidx, int fault_type, vm_prot_t, vm_pindex_t *first, vm_pindex_t *last); /* list of device pager objects */ static struct pagerlst dev_pager_object_list; /* protect list manipulation */ static struct mtx dev_pager_mtx; struct pagerops devicepagerops = { .pgo_init = dev_pager_init, .pgo_alloc = dev_pager_alloc, .pgo_dealloc = dev_pager_dealloc, .pgo_getpages = dev_pager_getpages, .pgo_putpages = dev_pager_putpages, .pgo_haspage = dev_pager_haspage, }; struct pagerops mgtdevicepagerops = { .pgo_alloc = dev_pager_alloc, .pgo_dealloc = dev_pager_dealloc, .pgo_getpages = dev_pager_getpages, .pgo_putpages = dev_pager_putpages, .pgo_haspage = dev_pager_haspage, .pgo_populate = dev_pager_populate, }; static int old_dev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t foff, struct ucred *cred, u_short *color); static void old_dev_pager_dtor(void *handle); static int old_dev_pager_fault(vm_object_t object, vm_ooffset_t offset, int prot, vm_page_t *mres); static struct cdev_pager_ops old_dev_pager_ops = { .cdev_pg_ctor = old_dev_pager_ctor, .cdev_pg_dtor = old_dev_pager_dtor, .cdev_pg_fault = old_dev_pager_fault }; static void dev_pager_init(void) { TAILQ_INIT(&dev_pager_object_list); mtx_init(&dev_pager_mtx, "dev_pager list", NULL, MTX_DEF); } vm_object_t cdev_pager_lookup(void *handle) { vm_object_t object; mtx_lock(&dev_pager_mtx); object = vm_pager_object_lookup(&dev_pager_object_list, handle); mtx_unlock(&dev_pager_mtx); return (object); } vm_object_t cdev_pager_allocate(void *handle, enum obj_type tp, struct cdev_pager_ops *ops, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t foff, struct ucred *cred) { vm_object_t object, object1; vm_pindex_t pindex; u_short color; if (tp != OBJT_DEVICE && tp != OBJT_MGTDEVICE) return (NULL); KASSERT(tp == OBJT_MGTDEVICE || ops->cdev_pg_populate == NULL, ("populate on unmanaged device pager")); /* * Offset should be page aligned. */ if (foff & PAGE_MASK) return (NULL); /* * Treat the mmap(2) file offset as an unsigned value for a * device mapping. This, in effect, allows a user to pass all * possible off_t values as the mapping cookie to the driver. At * this point, we know that both foff and size are a multiple * of the page size. Do a check to avoid wrap. 
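* The wrap test below is the standard unsigned-overflow idiom: for unsigned a and b, the sum a + b wrapped if and only if it compares less than either operand. A generic sketch (not project code): * uint64_t sum = a + b; * if (sum < a || sum < b) * ... the addition wrapped ...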
*/ size = round_page(size); pindex = UOFF_TO_IDX(foff) + UOFF_TO_IDX(size); if (pindex > OBJ_MAX_SIZE || pindex < UOFF_TO_IDX(foff) || pindex < UOFF_TO_IDX(size)) return (NULL); if (ops->cdev_pg_ctor(handle, size, prot, foff, cred, &color) != 0) return (NULL); mtx_lock(&dev_pager_mtx); /* * Look up pager, creating as necessary. */ object1 = NULL; object = vm_pager_object_lookup(&dev_pager_object_list, handle); if (object == NULL) { /* * Allocate object and associate it with the pager. Initialize * the object's pg_color based upon the physical address of the * device's memory. */ mtx_unlock(&dev_pager_mtx); object1 = vm_object_allocate(tp, pindex); object1->flags |= OBJ_COLORED; object1->pg_color = color; object1->handle = handle; object1->un_pager.devp.ops = ops; object1->un_pager.devp.dev = handle; TAILQ_INIT(&object1->un_pager.devp.devp_pglist); mtx_lock(&dev_pager_mtx); object = vm_pager_object_lookup(&dev_pager_object_list, handle); if (object != NULL) { /* * We raced with another thread while allocating the object. */ if (pindex > object->size) object->size = pindex; KASSERT(object->type == tp, ("Inconsistent device pager type %p %d", object, tp)); KASSERT(object->un_pager.devp.ops == ops, ("Inconsistent devops %p %p", object, ops)); } else { object = object1; object1 = NULL; object->handle = handle; TAILQ_INSERT_TAIL(&dev_pager_object_list, object, pager_object_list); if (ops->cdev_pg_populate != NULL) vm_object_set_flag(object, OBJ_POPULATE); } } else { if (pindex > object->size) object->size = pindex; KASSERT(object->type == tp, ("Inconsistent device pager type %p %d", object, tp)); } mtx_unlock(&dev_pager_mtx); if (object1 != NULL) { object1->handle = object1; mtx_lock(&dev_pager_mtx); TAILQ_INSERT_TAIL(&dev_pager_object_list, object1, pager_object_list); mtx_unlock(&dev_pager_mtx); vm_object_deallocate(object1); } return (object); } static vm_object_t dev_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t foff, struct ucred *cred) { return (cdev_pager_allocate(handle, OBJT_DEVICE, &old_dev_pager_ops, size, prot, foff, cred)); } void cdev_pager_free_page(vm_object_t object, vm_page_t m) { VM_OBJECT_ASSERT_WLOCKED(object); if (object->type == OBJT_MGTDEVICE) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("unmanaged %p", m)); pmap_remove_all(m); vm_page_lock(m); vm_page_remove(m); vm_page_unlock(m); } else if (object->type == OBJT_DEVICE) dev_pager_free_page(object, m); } static void dev_pager_free_page(vm_object_t object, vm_page_t m) { VM_OBJECT_ASSERT_WLOCKED(object); KASSERT((object->type == OBJT_DEVICE && (m->oflags & VPO_UNMANAGED) != 0), ("Managed device or page obj %p m %p", object, m)); TAILQ_REMOVE(&object->un_pager.devp.devp_pglist, m, plinks.q); vm_page_putfake(m); } static void dev_pager_dealloc(vm_object_t object) { vm_page_t m; VM_OBJECT_WUNLOCK(object); object->un_pager.devp.ops->cdev_pg_dtor(object->un_pager.devp.dev); mtx_lock(&dev_pager_mtx); TAILQ_REMOVE(&dev_pager_object_list, object, pager_object_list); mtx_unlock(&dev_pager_mtx); VM_OBJECT_WLOCK(object); if (object->type == OBJT_DEVICE) { /* * Free up our fake pages. */ while ((m = TAILQ_FIRST(&object->un_pager.devp.devp_pglist)) != NULL) dev_pager_free_page(object, m); } object->handle = NULL; object->type = OBJT_DEAD; } static int dev_pager_getpages(vm_object_t object, vm_page_t *ma, int count, int *rbehind, int *rahead) { int error; /* Since our haspage reports zero after/before, the count is 1.
*/ KASSERT(count == 1, ("%s: count %d", __func__, count)); VM_OBJECT_ASSERT_WLOCKED(object); if (object->un_pager.devp.ops->cdev_pg_fault == NULL) return (VM_PAGER_FAIL); error = object->un_pager.devp.ops->cdev_pg_fault(object, IDX_TO_OFF(ma[0]->pindex), PROT_READ, &ma[0]); VM_OBJECT_ASSERT_WLOCKED(object); if (error == VM_PAGER_OK) { KASSERT((object->type == OBJT_DEVICE && (ma[0]->oflags & VPO_UNMANAGED) != 0) || (object->type == OBJT_MGTDEVICE && (ma[0]->oflags & VPO_UNMANAGED) == 0), ("Wrong page type %p %p", ma[0], object)); if (object->type == OBJT_DEVICE) { TAILQ_INSERT_TAIL(&object->un_pager.devp.devp_pglist, ma[0], plinks.q); } if (rbehind) *rbehind = 0; if (rahead) *rahead = 0; } return (error); } static int dev_pager_populate(vm_object_t object, vm_pindex_t pidx, int fault_type, vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last) { VM_OBJECT_ASSERT_WLOCKED(object); if (object->un_pager.devp.ops->cdev_pg_populate == NULL) return (VM_PAGER_FAIL); return (object->un_pager.devp.ops->cdev_pg_populate(object, pidx, fault_type, max_prot, first, last)); } static int old_dev_pager_fault(vm_object_t object, vm_ooffset_t offset, int prot, vm_page_t *mres) { vm_paddr_t paddr; vm_page_t m_paddr, page; struct cdev *dev; struct cdevsw *csw; struct file *fpop; struct thread *td; vm_memattr_t memattr, memattr1; int ref, ret; memattr = object->memattr; VM_OBJECT_WUNLOCK(object); dev = object->handle; csw = dev_refthread(dev, &ref); if (csw == NULL) { VM_OBJECT_WLOCK(object); return (VM_PAGER_FAIL); } td = curthread; fpop = td->td_fpop; td->td_fpop = NULL; ret = csw->d_mmap(dev, offset, &paddr, prot, &memattr); td->td_fpop = fpop; dev_relthread(dev, ref); if (ret != 0) { printf( "WARNING: dev_pager_getpage: map function returns error %d", ret); VM_OBJECT_WLOCK(object); return (VM_PAGER_FAIL); } /* If "paddr" is a real page, perform a sanity check on "memattr". */ if ((m_paddr = vm_phys_paddr_to_vm_page(paddr)) != NULL && (memattr1 = pmap_page_get_memattr(m_paddr)) != memattr) { /* * For the /dev/mem d_mmap routine to return the * correct memattr, pmap_page_get_memattr() needs to * be called, which we do there. */ if ((csw->d_flags & D_MEM) == 0) { printf("WARNING: Device driver %s has set " "\"memattr\" inconsistently (drv %u pmap %u).\n", csw->d_name, memattr, memattr1); } memattr = memattr1; } if (((*mres)->flags & PG_FICTITIOUS) != 0) { /* * If the passed in result page is a fake page, update it with * the new physical address. */ page = *mres; VM_OBJECT_WLOCK(object); vm_page_updatefake(page, paddr, memattr); } else { /* * Replace the passed in reqpage page with our own fake page and * free up all of the original pages.
*/ page = vm_page_getfake(paddr, memattr); VM_OBJECT_WLOCK(object); vm_page_replace_checked(page, object, (*mres)->pindex, *mres); vm_page_lock(*mres); vm_page_free(*mres); vm_page_unlock(*mres); *mres = page; } page->valid = VM_PAGE_BITS_ALL; return (VM_PAGER_OK); } static void dev_pager_putpages(vm_object_t object, vm_page_t *m, int count, int flags, int *rtvals) { panic("dev_pager_putpage called"); } static boolean_t dev_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after) { if (before != NULL) *before = 0; if (after != NULL) *after = 0; return (TRUE); } static int old_dev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t foff, struct ucred *cred, u_short *color) { struct cdev *dev; struct cdevsw *csw; vm_memattr_t dummy; vm_ooffset_t off; vm_paddr_t paddr; unsigned int npages; int ref; /* * Make sure this device can be mapped. */ dev = handle; csw = dev_refthread(dev, &ref); if (csw == NULL) return (ENXIO); /* * Check that the specified range of the device allows the desired * protection. * * XXX assumes VM_PROT_* == PROT_* */ npages = OFF_TO_IDX(size); paddr = 0; /* Make paddr initialized for the case of size == 0. */ for (off = foff; npages--; off += PAGE_SIZE) { if (csw->d_mmap(dev, off, &paddr, (int)prot, &dummy) != 0) { dev_relthread(dev, ref); return (EINVAL); } } dev_ref(dev); dev_relthread(dev, ref); *color = atop(paddr) - OFF_TO_IDX(off - PAGE_SIZE); return (0); } static void old_dev_pager_dtor(void *handle) { dev_rel(handle); } Index: stable/11/sys/vm/memguard.c =================================================================== --- stable/11/sys/vm/memguard.c (revision 331016) +++ stable/11/sys/vm/memguard.c (revision 331017) @@ -1,517 +1,518 @@ /*- * Copyright (c) 2005, Bosko Milekic . * Copyright (c) 2010 Isilon Systems, Inc. (http://www.isilon.com/) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * MemGuard is a simple replacement allocator for debugging only * which provides ElectricFence-style memory barrier protection on * objects being allocated, and is used to detect tampering-after-free * scenarios. * * See the memguard(9) man page for more information on using MemGuard. 
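* As a usage sketch (names illustrative; see memguard(9) for the authoritative knobs): size the reserved submap at boot with the vm.memguard.divisor loader tunable, then select a target at runtime by writing a malloc(9) type's short description or a uma(9) zone name to the vm.memguard.desc sysctl, e.g. "sysctl vm.memguard.desc=<ks_shortdesc or zone name>".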
*/ #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include static SYSCTL_NODE(_vm, OID_AUTO, memguard, CTLFLAG_RW, NULL, "MemGuard data"); /* * The vm_memguard_divisor variable controls how much of kmem_map should be * reserved for MemGuard. */ static u_int vm_memguard_divisor; SYSCTL_UINT(_vm_memguard, OID_AUTO, divisor, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &vm_memguard_divisor, 0, "(kmem_size/memguard_divisor) == memguard submap size"); /* * Short description (ks_shortdesc) of memory type to monitor. */ static char vm_memguard_desc[128] = ""; static struct malloc_type *vm_memguard_mtype = NULL; TUNABLE_STR("vm.memguard.desc", vm_memguard_desc, sizeof(vm_memguard_desc)); static int memguard_sysctl_desc(SYSCTL_HANDLER_ARGS) { char desc[sizeof(vm_memguard_desc)]; int error; strlcpy(desc, vm_memguard_desc, sizeof(desc)); error = sysctl_handle_string(oidp, desc, sizeof(desc), req); if (error != 0 || req->newptr == NULL) return (error); mtx_lock(&malloc_mtx); /* If mtp is NULL, it will be initialized in memguard_cmp() */ vm_memguard_mtype = malloc_desc2type(desc); strlcpy(vm_memguard_desc, desc, sizeof(vm_memguard_desc)); mtx_unlock(&malloc_mtx); return (error); } SYSCTL_PROC(_vm_memguard, OID_AUTO, desc, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, memguard_sysctl_desc, "A", "Short description of memory type to monitor"); static vm_offset_t memguard_cursor; static vm_offset_t memguard_base; static vm_size_t memguard_mapsize; static vm_size_t memguard_physlimit; static u_long memguard_wasted; static u_long memguard_wrap; static u_long memguard_succ; static u_long memguard_fail_kva; static u_long memguard_fail_pgs; SYSCTL_ULONG(_vm_memguard, OID_AUTO, cursor, CTLFLAG_RD, &memguard_cursor, 0, "MemGuard cursor"); SYSCTL_ULONG(_vm_memguard, OID_AUTO, mapsize, CTLFLAG_RD, &memguard_mapsize, 0, "MemGuard private arena size"); SYSCTL_ULONG(_vm_memguard, OID_AUTO, phys_limit, CTLFLAG_RD, &memguard_physlimit, 0, "Limit on MemGuard memory consumption"); SYSCTL_ULONG(_vm_memguard, OID_AUTO, wasted, CTLFLAG_RD, &memguard_wasted, 0, "Excess memory used through page promotion"); SYSCTL_ULONG(_vm_memguard, OID_AUTO, wrapcnt, CTLFLAG_RD, &memguard_wrap, 0, "MemGuard cursor wrap count"); SYSCTL_ULONG(_vm_memguard, OID_AUTO, numalloc, CTLFLAG_RD, &memguard_succ, 0, "Count of successful MemGuard allocations"); SYSCTL_ULONG(_vm_memguard, OID_AUTO, fail_kva, CTLFLAG_RD, &memguard_fail_kva, 0, "MemGuard failures due to lack of KVA"); SYSCTL_ULONG(_vm_memguard, OID_AUTO, fail_pgs, CTLFLAG_RD, &memguard_fail_pgs, 0, "MemGuard failures due to lack of pages"); #define MG_GUARD_AROUND 0x001 #define MG_GUARD_ALLLARGE 0x002 #define MG_GUARD_NOFREE 0x004 static int memguard_options = MG_GUARD_AROUND; SYSCTL_INT(_vm_memguard, OID_AUTO, options, CTLFLAG_RWTUN, &memguard_options, 0, "MemGuard options:\n" "\t0x001 - add guard pages around each allocation\n" "\t0x002 - always use MemGuard for allocations over a page\n" "\t0x004 - guard uma(9) zones with UMA_ZONE_NOFREE flag"); static u_int memguard_minsize; static u_long memguard_minsize_reject; SYSCTL_UINT(_vm_memguard, OID_AUTO, minsize, CTLFLAG_RW, &memguard_minsize, 0, "Minimum size for page promotion"); SYSCTL_ULONG(_vm_memguard, OID_AUTO, minsize_reject, CTLFLAG_RD, &memguard_minsize_reject, 0, "# times rejected for size"); static u_int memguard_frequency; static u_long memguard_frequency_hits; 
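/* * memguard_frequency is a sampling rate out of 100000: memguard_cmp() below guards a candidate allocation when (random() % 100000) falls under it, so e.g. vm.memguard.frequency=1000 guards roughly 1% of otherwise-eligible allocations. */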
SYSCTL_UINT(_vm_memguard, OID_AUTO, frequency, CTLFLAG_RWTUN, &memguard_frequency, 0, "Times in 100000 that MemGuard will randomly run"); SYSCTL_ULONG(_vm_memguard, OID_AUTO, frequency_hits, CTLFLAG_RD, &memguard_frequency_hits, 0, "# times MemGuard randomly chose"); /* * Return a fudged value to be used for vm_kmem_size for allocating * the kmem_map. The memguard memory will be a submap. */ unsigned long memguard_fudge(unsigned long km_size, const struct vm_map *parent_map) { u_long mem_pgs, parent_size; vm_memguard_divisor = 10; /* CTLFLAG_RDTUN doesn't work during the early boot process. */ TUNABLE_INT_FETCH("vm.memguard.divisor", &vm_memguard_divisor); parent_size = vm_map_max(parent_map) - vm_map_min(parent_map) + PAGE_SIZE; /* Pick a conservative value if the provided value sucks. */ if ((vm_memguard_divisor <= 0) || ((parent_size / vm_memguard_divisor) == 0)) vm_memguard_divisor = 10; /* * Limit consumption of physical pages to * 1/vm_memguard_divisor of system memory. If the KVA is * smaller than this then the KVA limit comes into play first. * This prevents memguard's page promotions from completely * using up memory, since most malloc(9) calls are sub-page. */ mem_pgs = vm_cnt.v_page_count; memguard_physlimit = (mem_pgs / vm_memguard_divisor) * PAGE_SIZE; /* * We want as much KVA as we can take safely. Use at most our * allotted fraction of the parent map's size. Limit this to * twice the physical memory to avoid using too much memory as * pagetable pages (size must be a multiple of PAGE_SIZE). */ memguard_mapsize = round_page(parent_size / vm_memguard_divisor); if (memguard_mapsize / (2 * PAGE_SIZE) > mem_pgs) memguard_mapsize = mem_pgs * 2 * PAGE_SIZE; if (km_size + memguard_mapsize > parent_size) memguard_mapsize = 0; return (km_size + memguard_mapsize); } /* * Initialize the MemGuard mock allocator. All objects from MemGuard come * out of a single VM map (contiguous chunk of address space). */ void memguard_init(vmem_t *parent) { vm_offset_t base; vmem_alloc(parent, memguard_mapsize, M_BESTFIT | M_WAITOK, &base); vmem_init(memguard_arena, "memguard arena", base, memguard_mapsize, PAGE_SIZE, 0, M_WAITOK); memguard_cursor = base; memguard_base = base; printf("MEMGUARD DEBUGGING ALLOCATOR INITIALIZED:\n"); printf("\tMEMGUARD map base: 0x%lx\n", (u_long)base); printf("\tMEMGUARD map size: %jd KBytes\n", (uintmax_t)memguard_mapsize >> 10); } /* * Run things that can't be done as early as memguard_init(). */ static void memguard_sysinit(void) { struct sysctl_oid_list *parent; parent = SYSCTL_STATIC_CHILDREN(_vm_memguard); SYSCTL_ADD_UAUTO(NULL, parent, OID_AUTO, "mapstart", CTLFLAG_RD, &memguard_base, "MemGuard KVA base"); SYSCTL_ADD_UAUTO(NULL, parent, OID_AUTO, "maplimit", CTLFLAG_RD, &memguard_mapsize, "MemGuard KVA size"); #if 0 SYSCTL_ADD_ULONG(NULL, parent, OID_AUTO, "mapused", CTLFLAG_RD, &memguard_map->size, "MemGuard KVA used"); #endif } SYSINIT(memguard, SI_SUB_KLD, SI_ORDER_ANY, memguard_sysinit, NULL); /* * v2sizep() converts a virtual address of the first page allocated for * an item to a pointer to u_long recording the size of the original * allocation request. * * This routine is very similar to those defined by UMA in uma_int.h. * The difference is that this routine stores the originally allocated * size in one of the page's fields that is unused when the page is * wired rather than the object field, which is used.
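* Both sizes ride in the page's plinks union, which is otherwise unused while the page is wired; memguard_free() below reads them back as in this sketch: * u_long req = *v2sizep(trunc_page(addr)); ... bytes originally requested ... * u_long kva = *v2sizev(trunc_page(addr)); ... KVA span, including guard pages ...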
/*
 * v2sizep() converts a virtual address of the first page allocated for
 * an item to a pointer to u_long recording the size of the original
 * allocation request.
 *
 * This routine is very similar to those defined by UMA in uma_int.h.
 * The difference is that this routine stores the originally allocated
 * size in one of the page's fields that is unused when the page is
 * wired rather than the object field, which is used.
 */
static u_long *
v2sizep(vm_offset_t va)
{
	vm_paddr_t pa;
	struct vm_page *p;

	pa = pmap_kextract(va);
	if (pa == 0)
		panic("MemGuard detected double-free of %p", (void *)va);
	p = PHYS_TO_VM_PAGE(pa);
	KASSERT(p->wire_count != 0 && p->queue == PQ_NONE,
	    ("MEMGUARD: Expected wired page %p in vtomgfifo!", p));
	return (&p->plinks.memguard.p);
}

static u_long *
v2sizev(vm_offset_t va)
{
	vm_paddr_t pa;
	struct vm_page *p;

	pa = pmap_kextract(va);
	if (pa == 0)
		panic("MemGuard detected double-free of %p", (void *)va);
	p = PHYS_TO_VM_PAGE(pa);
	KASSERT(p->wire_count != 0 && p->queue == PQ_NONE,
	    ("MEMGUARD: Expected wired page %p in vtomgfifo!", p));
	return (&p->plinks.memguard.v);
}

/*
 * Allocate a single object of specified size with specified flags
 * (either M_WAITOK or M_NOWAIT).
 */
void *
memguard_alloc(unsigned long req_size, int flags)
{
	vm_offset_t addr, origaddr;
	u_long size_p, size_v;
	int do_guard, rv;

	size_p = round_page(req_size);
	if (size_p == 0)
		return (NULL);
	/*
	 * To ensure there are holes on both sides of the allocation,
	 * request 2 extra pages of KVA.  We will only actually add a
	 * vm_map_entry and get pages for the original request.  Save
	 * the value of memguard_options so we have a consistent
	 * value.
	 */
	size_v = size_p;
	do_guard = (memguard_options & MG_GUARD_AROUND) != 0;
	if (do_guard)
		size_v += 2 * PAGE_SIZE;

	/*
	 * When we pass our memory limit, reject sub-page allocations.
	 * Page-size and larger allocations will use the same amount
	 * of physical memory whether we allocate or hand off to
	 * uma_large_alloc(), so keep those.
	 */
	if (vmem_size(memguard_arena, VMEM_ALLOC) >= memguard_physlimit &&
	    req_size < PAGE_SIZE) {
		addr = (vm_offset_t)NULL;
		memguard_fail_pgs++;
		goto out;
	}
	/*
	 * Keep a moving cursor so we don't recycle KVA as long as
	 * possible.  It's not perfect, since we don't know in what
	 * order previous allocations will be free'd, but it's simple
	 * and fast, and requires O(1) additional storage if guard
	 * pages are not used.
	 *
	 * XXX This scheme will lead to greater fragmentation of the
	 * map, unless vm_map_findspace() is tweaked.
	 */
	for (;;) {
		if (vmem_xalloc(memguard_arena, size_v, 0, 0, 0,
		    memguard_cursor, VMEM_ADDR_MAX,
		    M_BESTFIT | M_NOWAIT, &origaddr) == 0)
			break;
		/*
		 * The map has no space.  This may be due to
		 * fragmentation, or because the cursor is near the
		 * end of the map.
		 */
		if (memguard_cursor == memguard_base) {
			memguard_fail_kva++;
			addr = (vm_offset_t)NULL;
			goto out;
		}
		memguard_wrap++;
		memguard_cursor = memguard_base;
	}
	addr = origaddr;
	if (do_guard)
		addr += PAGE_SIZE;
	rv = kmem_back(kmem_object, addr, size_p, flags);
	if (rv != KERN_SUCCESS) {
		vmem_xfree(memguard_arena, origaddr, size_v);
		memguard_fail_pgs++;
		addr = (vm_offset_t)NULL;
		goto out;
	}
	memguard_cursor = addr + size_v;
	*v2sizep(trunc_page(addr)) = req_size;
	*v2sizev(trunc_page(addr)) = size_v;
	memguard_succ++;
	if (req_size < PAGE_SIZE) {
		memguard_wasted += (PAGE_SIZE - req_size);
		if (do_guard) {
			/*
			 * Align the request to 16 bytes, and return
			 * an address near the end of the page, to
			 * better detect array overrun.
			 */
			req_size = roundup2(req_size, 16);
			addr += (PAGE_SIZE - req_size);
		}
	}
out:
	return ((void *)addr);
}

int
is_memguard_addr(void *addr)
{
	vm_offset_t a = (vm_offset_t)(uintptr_t)addr;

	return (a >= memguard_base && a < memguard_base + memguard_mapsize);
}
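[Editor's note: a sketch of the KVA layout memguard_alloc() above produces when MG_GUARD_AROUND is set. Only the middle of the reservation is ever backed, so both neighbors stay unmapped guard pages, and the cursor advances past the whole reservation. The addresses and sizes are made-up example values.]

#include <stdio.h>

#define PAGE_SIZE 4096UL

/*
 * For a page-rounded request of size_p bytes, size_v = size_p plus two
 * pages of KVA is reserved, but only [origaddr + PAGE_SIZE,
 * origaddr + PAGE_SIZE + size_p) is backed by physical pages.
 * Touching either guard page faults immediately.
 */
int
main(void)
{
	unsigned long origaddr = 0x10000000UL;	/* assumed arena address */
	unsigned long size_p = 2 * PAGE_SIZE;	/* page-rounded request */
	unsigned long size_v = size_p + 2 * PAGE_SIZE;
	unsigned long addr = origaddr + PAGE_SIZE;

	printf("guard:  %#lx-%#lx\n", origaddr, origaddr + PAGE_SIZE);
	printf("backed: %#lx-%#lx\n", addr, addr + size_p);
	printf("guard:  %#lx-%#lx\n", addr + size_p, origaddr + size_v);
	printf("cursor advances to %#lx\n", addr + size_v);
	return (0);
}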
/*
 * Free specified single object.
 */
void
memguard_free(void *ptr)
{
	vm_offset_t addr;
	u_long req_size, size, sizev;
	char *temp;
	int i;

	addr = trunc_page((uintptr_t)ptr);
	req_size = *v2sizep(addr);
	sizev = *v2sizev(addr);
	size = round_page(req_size);

	/*
	 * Page should not be guarded right now, so force a write.
	 * The purpose of this is to increase the likelihood of
	 * catching a double-free, but not necessarily a
	 * tamper-after-free (the second thread freeing might not
	 * write before freeing, so this forces it to and,
	 * subsequently, trigger a fault).
	 */
	temp = ptr;
	for (i = 0; i < size; i += PAGE_SIZE)
		temp[i] = 'M';

	/*
	 * This requires carnal knowledge of the implementation of
	 * kmem_free(), but since we've already replaced kmem_malloc()
	 * above, it's not really any worse.  We want to use the
	 * vm_map lock to serialize updates to memguard_wasted, since
	 * we had the lock at increment.
	 */
	kmem_unback(kmem_object, addr, size);
	if (sizev > size)
		addr -= PAGE_SIZE;
	vmem_xfree(memguard_arena, addr, sizev);
	if (req_size < PAGE_SIZE)
		memguard_wasted -= (PAGE_SIZE - req_size);
}

/*
 * Re-allocate an allocation that was originally guarded.
 */
void *
memguard_realloc(void *addr, unsigned long size, struct malloc_type *mtp,
    int flags)
{
	void *newaddr;
	u_long old_size;

	/*
	 * Allocate the new block.  Force the allocation to be guarded
	 * as the original may have been guarded through random
	 * chance, and that should be preserved.
	 */
	if ((newaddr = memguard_alloc(size, flags)) == NULL)
		return (NULL);

	/* Copy over original contents. */
	old_size = *v2sizep(trunc_page((uintptr_t)addr));
	bcopy(addr, newaddr, min(size, old_size));
	memguard_free(addr);
	return (newaddr);
}

static int
memguard_cmp(unsigned long size)
{

	if (size < memguard_minsize) {
		memguard_minsize_reject++;
		return (0);
	}
	if ((memguard_options & MG_GUARD_ALLLARGE) != 0 && size >= PAGE_SIZE)
		return (1);
	if (memguard_frequency > 0 &&
	    (random() % 100000) < memguard_frequency) {
		memguard_frequency_hits++;
		return (1);
	}

	return (0);
}

int
memguard_cmp_mtp(struct malloc_type *mtp, unsigned long size)
{

	if (memguard_cmp(size))
		return(1);

#if 1
	/*
	 * The safest way of comparison is to always compare short description
	 * string of memory type, but it is also the slowest way.
	 */
	return (strcmp(mtp->ks_shortdesc, vm_memguard_desc) == 0);
#else
	/*
	 * If we compare pointers, there are two possible problems:
	 * 1. Memory type was unloaded and new memory type was allocated at the
	 *    same address.
	 * 2. Memory type was unloaded and loaded again, but allocated at a
	 *    different address.
	 */
	if (vm_memguard_mtype != NULL)
		return (mtp == vm_memguard_mtype);
	if (strcmp(mtp->ks_shortdesc, vm_memguard_desc) == 0) {
		vm_memguard_mtype = mtp;
		return (1);
	}
	return (0);
#endif
}

int
memguard_cmp_zone(uma_zone_t zone)
{

	if ((memguard_options & MG_GUARD_NOFREE) == 0 &&
	    zone->uz_flags & UMA_ZONE_NOFREE)
		return (0);

	if (memguard_cmp(zone->uz_size))
		return (1);

	/*
	 * The safest way of comparison is to always compare zone name,
	 * but it is also the slowest way.
	 */
	return (strcmp(zone->uz_name, vm_memguard_desc) == 0);
}

Index: stable/11/sys/vm/sg_pager.c
===================================================================
--- stable/11/sys/vm/sg_pager.c	(revision 331016)
+++ stable/11/sys/vm/sg_pager.c	(revision 331017)
@@ -1,225 +1,227 @@
/*-
 * Copyright (c) 2009 Hudson River Trading LLC
 * Written by: John H. Baldwin
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1.
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * This pager manages OBJT_SG objects. These objects are backed by * a scatter/gather list of physical address ranges. */ #include #include #include #include #include +#include + #include #include #include #include #include #include #include static vm_object_t sg_pager_alloc(void *, vm_ooffset_t, vm_prot_t, vm_ooffset_t, struct ucred *); static void sg_pager_dealloc(vm_object_t); static int sg_pager_getpages(vm_object_t, vm_page_t *, int, int *, int *); static void sg_pager_putpages(vm_object_t, vm_page_t *, int, boolean_t, int *); static boolean_t sg_pager_haspage(vm_object_t, vm_pindex_t, int *, int *); struct pagerops sgpagerops = { .pgo_alloc = sg_pager_alloc, .pgo_dealloc = sg_pager_dealloc, .pgo_getpages = sg_pager_getpages, .pgo_putpages = sg_pager_putpages, .pgo_haspage = sg_pager_haspage, }; static vm_object_t sg_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t foff, struct ucred *cred) { struct sglist *sg; vm_object_t object; vm_pindex_t npages, pindex; int i; /* * Offset should be page aligned. */ if (foff & PAGE_MASK) return (NULL); /* * The scatter/gather list must only include page-aligned * ranges. */ npages = 0; sg = handle; for (i = 0; i < sg->sg_nseg; i++) { if ((sg->sg_segs[i].ss_paddr % PAGE_SIZE) != 0 || (sg->sg_segs[i].ss_len % PAGE_SIZE) != 0) return (NULL); npages += sg->sg_segs[i].ss_len / PAGE_SIZE; } /* * The scatter/gather list has a fixed size. Refuse requests * to map beyond that. */ size = round_page(size); pindex = UOFF_TO_IDX(foff) + UOFF_TO_IDX(size); if (pindex > npages || pindex < UOFF_TO_IDX(foff) || pindex < UOFF_TO_IDX(size)) return (NULL); /* * Allocate a new object and associate it with the * scatter/gather list. It is ok for our purposes to have * multiple VM objects associated with the same scatter/gather * list because scatter/gather lists are static. This is also * simpler than ensuring a unique object per scatter/gather * list. */ object = vm_object_allocate(OBJT_SG, npages); object->handle = sglist_hold(sg); TAILQ_INIT(&object->un_pager.sgp.sgp_pglist); return (object); } static void sg_pager_dealloc(vm_object_t object) { struct sglist *sg; vm_page_t m; /* * Free up our fake pages. 
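[Editor's note: the page-index-to-physical-address walk used by sg_pager_getpages() further below can be modeled in isolation. This is a hypothetical stand-alone sketch; struct seg and the sample ranges are stand-ins for the kernel's sglist segments.]

#include <stdio.h>

#define PAGE_SIZE 4096UL

struct seg { unsigned long paddr, len; };	/* stand-in for sglist segs */

/*
 * Walk the segments, skipping whole segments until the byte offset of
 * the requested page index falls inside one, then add the remainder.
 * Returns 1 (never a valid page-aligned address) when not found, the
 * same sentinel trick used in sg_pager_getpages().
 */
static unsigned long
lookup(struct seg *sg, int nseg, unsigned long pindex)
{
	unsigned long space = 0, paddr = 1;
	int i;

	for (i = 0; i < nseg; i++) {
		if (space + sg[i].len <= pindex * PAGE_SIZE) {
			space += sg[i].len;
			continue;
		}
		paddr = sg[i].paddr + pindex * PAGE_SIZE - space;
		break;
	}
	return (paddr);
}

int
main(void)
{
	struct seg sg[] = {
		{ 0x100000, 2 * PAGE_SIZE },	/* assumed example ranges */
		{ 0x400000, 4 * PAGE_SIZE },
	};

	printf("%#lx\n", lookup(sg, 2, 3));	/* 2nd page of 2nd segment */
	return (0);
}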
*/ while ((m = TAILQ_FIRST(&object->un_pager.sgp.sgp_pglist)) != 0) { TAILQ_REMOVE(&object->un_pager.sgp.sgp_pglist, m, plinks.q); vm_page_putfake(m); } sg = object->handle; sglist_free(sg); object->handle = NULL; object->type = OBJT_DEAD; } static int sg_pager_getpages(vm_object_t object, vm_page_t *m, int count, int *rbehind, int *rahead) { struct sglist *sg; vm_page_t m_paddr, page; vm_pindex_t offset; vm_paddr_t paddr; vm_memattr_t memattr; size_t space; int i; /* Since our haspage reports zero after/before, the count is 1. */ KASSERT(count == 1, ("%s: count %d", __func__, count)); VM_OBJECT_ASSERT_WLOCKED(object); sg = object->handle; memattr = object->memattr; VM_OBJECT_WUNLOCK(object); offset = m[0]->pindex; /* * Lookup the physical address of the requested page. An initial * value of '1' instead of '0' is used so we can assert that the * page is found since '0' can be a valid page-aligned physical * address. */ space = 0; paddr = 1; for (i = 0; i < sg->sg_nseg; i++) { if (space + sg->sg_segs[i].ss_len <= (offset * PAGE_SIZE)) { space += sg->sg_segs[i].ss_len; continue; } paddr = sg->sg_segs[i].ss_paddr + offset * PAGE_SIZE - space; break; } KASSERT(paddr != 1, ("invalid SG page index")); /* If "paddr" is a real page, perform a sanity check on "memattr". */ if ((m_paddr = vm_phys_paddr_to_vm_page(paddr)) != NULL && pmap_page_get_memattr(m_paddr) != memattr) { memattr = pmap_page_get_memattr(m_paddr); printf( "WARNING: A device driver has set \"memattr\" inconsistently.\n"); } /* Return a fake page for the requested page. */ KASSERT(!(m[0]->flags & PG_FICTITIOUS), ("backing page for SG is fake")); /* Construct a new fake page. */ page = vm_page_getfake(paddr, memattr); VM_OBJECT_WLOCK(object); TAILQ_INSERT_TAIL(&object->un_pager.sgp.sgp_pglist, page, plinks.q); vm_page_replace_checked(page, object, offset, m[0]); vm_page_lock(m[0]); vm_page_free(m[0]); vm_page_unlock(m[0]); m[0] = page; page->valid = VM_PAGE_BITS_ALL; if (rbehind) *rbehind = 0; if (rahead) *rahead = 0; return (VM_PAGER_OK); } static void sg_pager_putpages(vm_object_t object, vm_page_t *m, int count, boolean_t sync, int *rtvals) { panic("sg_pager_putpage called"); } static boolean_t sg_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after) { if (before != NULL) *before = 0; if (after != NULL) *after = 0; return (TRUE); } Index: stable/11/sys/vm/vm_reserv.c =================================================================== --- stable/11/sys/vm/vm_reserv.c (revision 331016) +++ stable/11/sys/vm/vm_reserv.c (revision 331017) @@ -1,1125 +1,1126 @@ /*- * Copyright (c) 2002-2006 Rice University * Copyright (c) 2007-2011 Alan L. Cox * All rights reserved. * * This software was developed for the FreeBSD Project by Alan L. Cox, * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Superpage reservation management module * * Any external functions defined by this module are only to be used by the * virtual memory system. */ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include /* * The reservation system supports the speculative allocation of large physical * pages ("superpages"). Speculative allocation enables the fully automatic * utilization of superpages by the virtual memory system. In other words, no * programmatic directives are required to use superpages. */ #if VM_NRESERVLEVEL > 0 /* * The number of small pages that are contained in a level 0 reservation */ #define VM_LEVEL_0_NPAGES (1 << VM_LEVEL_0_ORDER) /* * The number of bits by which a physical address is shifted to obtain the * reservation number */ #define VM_LEVEL_0_SHIFT (VM_LEVEL_0_ORDER + PAGE_SHIFT) /* * The size of a level 0 reservation in bytes */ #define VM_LEVEL_0_SIZE (1 << VM_LEVEL_0_SHIFT) /* * Computes the index of the small page underlying the given (object, pindex) * within the reservation's array of small pages. */ #define VM_RESERV_INDEX(object, pindex) \ (((object)->pg_color + (pindex)) & (VM_LEVEL_0_NPAGES - 1)) /* * The size of a population map entry */ typedef u_long popmap_t; /* * The number of bits in a population map entry */ #define NBPOPMAP (NBBY * sizeof(popmap_t)) /* * The number of population map entries in a reservation */ #define NPOPMAP howmany(VM_LEVEL_0_NPAGES, NBPOPMAP) /* * Clear a bit in the population map. */ static __inline void popmap_clear(popmap_t popmap[], int i) { popmap[i / NBPOPMAP] &= ~(1UL << (i % NBPOPMAP)); } /* * Set a bit in the population map. */ static __inline void popmap_set(popmap_t popmap[], int i) { popmap[i / NBPOPMAP] |= 1UL << (i % NBPOPMAP); } /* * Is a bit in the population map clear? */ static __inline boolean_t popmap_is_clear(popmap_t popmap[], int i) { return ((popmap[i / NBPOPMAP] & (1UL << (i % NBPOPMAP))) == 0); } /* * Is a bit in the population map set? */ static __inline boolean_t popmap_is_set(popmap_t popmap[], int i) { return ((popmap[i / NBPOPMAP] & (1UL << (i % NBPOPMAP))) != 0); } /* * The reservation structure * * A reservation structure is constructed whenever a large physical page is * speculatively allocated to an object. The reservation provides the small * physical pages for the range [pindex, pindex + VM_LEVEL_0_NPAGES) of offsets * within that object. The reservation's "popcnt" tracks the number of these * small physical pages that are in use at any given time. When and if the * reservation is not fully utilized, it appears in the queue of partially * populated reservations. 
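[Editor's note: the population-map helpers above are plain bit arithmetic. This small stand-alone sketch, with an assumed 512-page reservation, shows the word/bit split they rely on; it mirrors popmap_set()/popmap_is_set() exactly.]

#include <stdio.h>

typedef unsigned long popmap_t;
#define NBBY		8
#define NBPOPMAP	(NBBY * sizeof(popmap_t))

/* Same bit arithmetic as popmap_set()/popmap_is_set() above. */
static void
popmap_set(popmap_t popmap[], int i)
{
	popmap[i / NBPOPMAP] |= 1UL << (i % NBPOPMAP);
}

static int
popmap_is_set(popmap_t popmap[], int i)
{
	return ((popmap[i / NBPOPMAP] & (1UL << (i % NBPOPMAP))) != 0);
}

int
main(void)
{
	popmap_t map[8] = { 0 };	/* enough for 512 small pages */

	popmap_set(map, 70);		/* lands in word 1, bit 6 */
	printf("set: %d, word1: %#lx\n", popmap_is_set(map, 70), map[1]);
	return (0);
}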
 * The reservation always appears on the containing
 * object's list of reservations.
 *
 * A partially populated reservation can be broken and reclaimed at any time.
 */
struct vm_reserv {
	TAILQ_ENTRY(vm_reserv) partpopq;
	LIST_ENTRY(vm_reserv) objq;
	vm_object_t	object;			/* containing object */
	vm_pindex_t	pindex;			/* offset within object */
	vm_page_t	pages;			/* first page of a superpage */
	int		popcnt;			/* # of pages in use */
	char		inpartpopq;
	popmap_t	popmap[NPOPMAP];	/* bit vector of used pages */
};

/*
 * The reservation array
 *
 * This array is analogous in function to vm_page_array.  It differs in the
 * respect that it may contain a greater number of useful reservation
 * structures than there are (physical) superpages.  These "invalid"
 * reservation structures exist to trade-off space for time in the
 * implementation of vm_reserv_from_page().  Invalid reservation structures are
 * distinguishable from "valid" reservation structures by inspecting the
 * reservation's "pages" field.  Invalid reservation structures have a NULL
 * "pages" field.
 *
 * vm_reserv_from_page() maps a small (physical) page to an element of this
 * array by computing a physical reservation number from the page's physical
 * address.  The physical reservation number is used as the array index.
 *
 * An "active" reservation is a valid reservation structure that has a non-NULL
 * "object" field and a non-zero "popcnt" field.  In other words, every active
 * reservation belongs to a particular object.  Moreover, every active
 * reservation has an entry in the containing object's list of reservations.
 */
static vm_reserv_t vm_reserv_array;

/*
 * The partially populated reservation queue
 *
 * This queue enables the fast recovery of an unused free small page from a
 * partially populated reservation.  The reservation at the head of this queue
 * is the least recently changed, partially populated reservation.
 *
 * Access to this queue is synchronized by the free page queue lock.
 */
static TAILQ_HEAD(, vm_reserv) vm_rvq_partpop =
    TAILQ_HEAD_INITIALIZER(vm_rvq_partpop);

static SYSCTL_NODE(_vm, OID_AUTO, reserv, CTLFLAG_RD, 0, "Reservation Info");

static long vm_reserv_broken;
SYSCTL_LONG(_vm_reserv, OID_AUTO, broken, CTLFLAG_RD,
    &vm_reserv_broken, 0, "Cumulative number of broken reservations");

static long vm_reserv_freed;
SYSCTL_LONG(_vm_reserv, OID_AUTO, freed, CTLFLAG_RD,
    &vm_reserv_freed, 0, "Cumulative number of freed reservations");

static int sysctl_vm_reserv_fullpop(SYSCTL_HANDLER_ARGS);

SYSCTL_PROC(_vm_reserv, OID_AUTO, fullpop, CTLTYPE_INT | CTLFLAG_RD, NULL, 0,
    sysctl_vm_reserv_fullpop, "I", "Current number of full reservations");

static int sysctl_vm_reserv_partpopq(SYSCTL_HANDLER_ARGS);

SYSCTL_OID(_vm_reserv, OID_AUTO, partpopq, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0,
    sysctl_vm_reserv_partpopq, "A", "Partially populated reservation queues");

static long vm_reserv_reclaimed;
SYSCTL_LONG(_vm_reserv, OID_AUTO, reclaimed, CTLFLAG_RD,
    &vm_reserv_reclaimed, 0, "Cumulative number of reclaimed reservations");

static void		vm_reserv_break(vm_reserv_t rv);
static void		vm_reserv_depopulate(vm_reserv_t rv, int index);
static vm_reserv_t	vm_reserv_from_page(vm_page_t m);
static boolean_t	vm_reserv_has_pindex(vm_reserv_t rv,
			    vm_pindex_t pindex);
static void		vm_reserv_populate(vm_reserv_t rv, int index);
static void		vm_reserv_reclaim(vm_reserv_t rv);

/*
 * Returns the current number of full reservations.
 *
 * Since the number of full reservations is computed without acquiring the
 * free page queue lock, the returned value may be inexact.
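[Editor's note: the scan in sysctl_vm_reserv_fullpop() below steps one superpage at a time through each physical segment. A hypothetical userland model follows, assuming 2 MB level-0 reservations as on amd64; the segment bounds are invented examples.]

#include <stdio.h>

#define VM_LEVEL_0_SHIFT	21	/* assumed: 2 MB superpages */
#define VM_LEVEL_0_SIZE		(1UL << VM_LEVEL_0_SHIFT)
#define roundup2(x, y)		(((x) + ((y) - 1)) & ~((y) - 1))

/*
 * Start at the first superpage-aligned address inside a physical
 * segment and advance one reservation at a time while a whole
 * reservation still fits; paddr >> VM_LEVEL_0_SHIFT is the index
 * into the reservation array, as in vm_reserv_from_page().
 */
int
main(void)
{
	unsigned long start = 0x00150000, end = 0x00a00000, paddr;

	for (paddr = roundup2(start, VM_LEVEL_0_SIZE);
	    paddr + VM_LEVEL_0_SIZE <= end; paddr += VM_LEVEL_0_SIZE)
		printf("reservation index %lu at %#lx\n",
		    paddr >> VM_LEVEL_0_SHIFT, paddr);
	return (0);
}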
*/ static int sysctl_vm_reserv_fullpop(SYSCTL_HANDLER_ARGS) { vm_paddr_t paddr; struct vm_phys_seg *seg; vm_reserv_t rv; int fullpop, segind; fullpop = 0; for (segind = 0; segind < vm_phys_nsegs; segind++) { seg = &vm_phys_segs[segind]; paddr = roundup2(seg->start, VM_LEVEL_0_SIZE); while (paddr + VM_LEVEL_0_SIZE <= seg->end) { rv = &vm_reserv_array[paddr >> VM_LEVEL_0_SHIFT]; fullpop += rv->popcnt == VM_LEVEL_0_NPAGES; paddr += VM_LEVEL_0_SIZE; } } return (sysctl_handle_int(oidp, &fullpop, 0, req)); } /* * Describes the current state of the partially populated reservation queue. */ static int sysctl_vm_reserv_partpopq(SYSCTL_HANDLER_ARGS) { struct sbuf sbuf; vm_reserv_t rv; int counter, error, level, unused_pages; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sbuf_new_for_sysctl(&sbuf, NULL, 128, req); sbuf_printf(&sbuf, "\nLEVEL SIZE NUMBER\n\n"); for (level = -1; level <= VM_NRESERVLEVEL - 2; level++) { counter = 0; unused_pages = 0; mtx_lock(&vm_page_queue_free_mtx); TAILQ_FOREACH(rv, &vm_rvq_partpop/*[level]*/, partpopq) { counter++; unused_pages += VM_LEVEL_0_NPAGES - rv->popcnt; } mtx_unlock(&vm_page_queue_free_mtx); sbuf_printf(&sbuf, "%5d: %6dK, %6d\n", level, unused_pages * ((int)PAGE_SIZE / 1024), counter); } error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); return (error); } /* * Reduces the given reservation's population count. If the population count * becomes zero, the reservation is destroyed. Additionally, moves the * reservation to the tail of the partially populated reservation queue if the * population count is non-zero. * * The free page queue lock must be held. */ static void vm_reserv_depopulate(vm_reserv_t rv, int index) { mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); KASSERT(rv->object != NULL, ("vm_reserv_depopulate: reserv %p is free", rv)); KASSERT(popmap_is_set(rv->popmap, index), ("vm_reserv_depopulate: reserv %p's popmap[%d] is clear", rv, index)); KASSERT(rv->popcnt > 0, ("vm_reserv_depopulate: reserv %p's popcnt is corrupted", rv)); if (rv->inpartpopq) { TAILQ_REMOVE(&vm_rvq_partpop, rv, partpopq); rv->inpartpopq = FALSE; } else { KASSERT(rv->pages->psind == 1, ("vm_reserv_depopulate: reserv %p is already demoted", rv)); rv->pages->psind = 0; } popmap_clear(rv->popmap, index); rv->popcnt--; if (rv->popcnt == 0) { LIST_REMOVE(rv, objq); rv->object = NULL; vm_phys_free_pages(rv->pages, VM_LEVEL_0_ORDER); vm_reserv_freed++; } else { rv->inpartpopq = TRUE; TAILQ_INSERT_TAIL(&vm_rvq_partpop, rv, partpopq); } } /* * Returns the reservation to which the given page might belong. */ static __inline vm_reserv_t vm_reserv_from_page(vm_page_t m) { return (&vm_reserv_array[VM_PAGE_TO_PHYS(m) >> VM_LEVEL_0_SHIFT]); } /* * Returns TRUE if the given reservation contains the given page index and * FALSE otherwise. */ static __inline boolean_t vm_reserv_has_pindex(vm_reserv_t rv, vm_pindex_t pindex) { return (((pindex - rv->pindex) & ~(VM_LEVEL_0_NPAGES - 1)) == 0); } /* * Increases the given reservation's population count. Moves the reservation * to the tail of the partially populated reservation queue. * * The free page queue must be locked. 
*/ static void vm_reserv_populate(vm_reserv_t rv, int index) { mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); KASSERT(rv->object != NULL, ("vm_reserv_populate: reserv %p is free", rv)); KASSERT(popmap_is_clear(rv->popmap, index), ("vm_reserv_populate: reserv %p's popmap[%d] is set", rv, index)); KASSERT(rv->popcnt < VM_LEVEL_0_NPAGES, ("vm_reserv_populate: reserv %p is already full", rv)); KASSERT(rv->pages->psind == 0, ("vm_reserv_populate: reserv %p is already promoted", rv)); if (rv->inpartpopq) { TAILQ_REMOVE(&vm_rvq_partpop, rv, partpopq); rv->inpartpopq = FALSE; } popmap_set(rv->popmap, index); rv->popcnt++; if (rv->popcnt < VM_LEVEL_0_NPAGES) { rv->inpartpopq = TRUE; TAILQ_INSERT_TAIL(&vm_rvq_partpop, rv, partpopq); } else rv->pages->psind = 1; } /* * Allocates a contiguous set of physical pages of the given size "npages" * from existing or newly created reservations. All of the physical pages * must be at or above the given physical address "low" and below the given * physical address "high". The given value "alignment" determines the * alignment of the first physical page in the set. If the given value * "boundary" is non-zero, then the set of physical pages cannot cross any * physical address boundary that is a multiple of that value. Both * "alignment" and "boundary" must be a power of two. * * The page "mpred" must immediately precede the offset "pindex" within the * specified object. * * The object and free page queue must be locked. */ vm_page_t vm_reserv_alloc_contig(vm_object_t object, vm_pindex_t pindex, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, vm_page_t mpred) { vm_paddr_t pa, size; vm_page_t m, m_ret, msucc; vm_pindex_t first, leftcap, rightcap; vm_reserv_t rv; u_long allocpages, maxpages, minpages; int i, index, n; mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(npages != 0, ("vm_reserv_alloc_contig: npages is 0")); /* * Is a reservation fundamentally impossible? */ if (pindex < VM_RESERV_INDEX(object, pindex) || pindex + npages > object->size) return (NULL); /* * All reservations of a particular size have the same alignment. * Assuming that the first page is allocated from a reservation, the * least significant bits of its physical address can be determined * from its offset from the beginning of the reservation and the size * of the reservation. * * Could the specified index within a reservation of the smallest * possible size satisfy the alignment and boundary requirements? */ pa = VM_RESERV_INDEX(object, pindex) << PAGE_SHIFT; if ((pa & (alignment - 1)) != 0) return (NULL); size = npages << PAGE_SHIFT; if (((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0) return (NULL); /* * Look for an existing reservation. */ if (mpred != NULL) { KASSERT(mpred->object == object, ("vm_reserv_alloc_contig: object doesn't contain mpred")); KASSERT(mpred->pindex < pindex, ("vm_reserv_alloc_contig: mpred doesn't precede pindex")); rv = vm_reserv_from_page(mpred); if (rv->object == object && vm_reserv_has_pindex(rv, pindex)) goto found; msucc = TAILQ_NEXT(mpred, listq); } else msucc = TAILQ_FIRST(&object->memq); if (msucc != NULL) { KASSERT(msucc->pindex > pindex, ("vm_reserv_alloc_contig: msucc doesn't succeed pindex")); rv = vm_reserv_from_page(msucc); if (rv->object == object && vm_reserv_has_pindex(rv, pindex)) goto found; } /* * Could at least one reservation fit between the first index to the * left that can be used ("leftcap") and the first index to the right * that cannot be used ("rightcap")? 
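[Editor's note: the two feasibility tests in vm_reserv_alloc_contig() above can be checked in isolation; misalignment and boundary crossing are each a single bit-mask expression. A stand-alone sketch with invented example values. Note the kernel treats boundary == 0 as "no boundary", which the same expression handles because ~(0UL - 1) == 0.]

#include <stdio.h>

/*
 * "pa & (alignment - 1)" is nonzero when pa is misaligned, and
 * "(pa ^ (pa + size - 1)) & ~(boundary - 1)" is nonzero when the
 * first and last byte of the run differ in some bit at or above the
 * boundary, i.e. the run crosses a multiple of the boundary.  Both
 * alignment and boundary must be powers of two.
 */
static int
feasible(unsigned long pa, unsigned long size, unsigned long alignment,
    unsigned long boundary)
{
	if ((pa & (alignment - 1)) != 0)
		return (0);
	if (((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0)
		return (0);
	return (1);
}

int
main(void)
{
	/* An 8 KB run at 60 KB crosses a 64 KB boundary; at 48 KB it fits. */
	printf("%d\n", feasible(60 * 1024, 8 * 1024, 4096, 64 * 1024));
	printf("%d\n", feasible(48 * 1024, 8 * 1024, 4096, 64 * 1024));
	return (0);
}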
*/ first = pindex - VM_RESERV_INDEX(object, pindex); if (mpred != NULL) { if ((rv = vm_reserv_from_page(mpred))->object != object) leftcap = mpred->pindex + 1; else leftcap = rv->pindex + VM_LEVEL_0_NPAGES; if (leftcap > first) return (NULL); } minpages = VM_RESERV_INDEX(object, pindex) + npages; maxpages = roundup2(minpages, VM_LEVEL_0_NPAGES); allocpages = maxpages; if (msucc != NULL) { if ((rv = vm_reserv_from_page(msucc))->object != object) rightcap = msucc->pindex; else rightcap = rv->pindex; if (first + maxpages > rightcap) { if (maxpages == VM_LEVEL_0_NPAGES) return (NULL); /* * At least one reservation will fit between "leftcap" * and "rightcap". However, a reservation for the * last of the requested pages will not fit. Reduce * the size of the upcoming allocation accordingly. */ allocpages = minpages; } } /* * Would the last new reservation extend past the end of the object? */ if (first + maxpages > object->size) { /* * Don't allocate the last new reservation if the object is a * vnode or backed by another object that is a vnode. */ if (object->type == OBJT_VNODE || (object->backing_object != NULL && object->backing_object->type == OBJT_VNODE)) { if (maxpages == VM_LEVEL_0_NPAGES) return (NULL); allocpages = minpages; } /* Speculate that the object may grow. */ } /* * Allocate the physical pages. The alignment and boundary specified * for this allocation may be different from the alignment and * boundary specified for the requested pages. For instance, the * specified index may not be the first page within the first new * reservation. */ m = vm_phys_alloc_contig(allocpages, low, high, ulmax(alignment, VM_LEVEL_0_SIZE), boundary > VM_LEVEL_0_SIZE ? boundary : 0); if (m == NULL) return (NULL); /* * The allocated physical pages always begin at a reservation * boundary, but they do not always end at a reservation boundary. * Initialize every reservation that is completely covered by the * allocated physical pages. */ m_ret = NULL; index = VM_RESERV_INDEX(object, pindex); do { rv = vm_reserv_from_page(m); KASSERT(rv->pages == m, ("vm_reserv_alloc_contig: reserv %p's pages is corrupted", rv)); KASSERT(rv->object == NULL, ("vm_reserv_alloc_contig: reserv %p isn't free", rv)); LIST_INSERT_HEAD(&object->rvq, rv, objq); rv->object = object; rv->pindex = first; KASSERT(rv->popcnt == 0, ("vm_reserv_alloc_contig: reserv %p's popcnt is corrupted", rv)); KASSERT(!rv->inpartpopq, ("vm_reserv_alloc_contig: reserv %p's inpartpopq is TRUE", rv)); for (i = 0; i < NPOPMAP; i++) KASSERT(rv->popmap[i] == 0, ("vm_reserv_alloc_contig: reserv %p's popmap is corrupted", rv)); n = ulmin(VM_LEVEL_0_NPAGES - index, npages); for (i = 0; i < n; i++) vm_reserv_populate(rv, index + i); npages -= n; if (m_ret == NULL) { m_ret = &rv->pages[index]; index = 0; } m += VM_LEVEL_0_NPAGES; first += VM_LEVEL_0_NPAGES; allocpages -= VM_LEVEL_0_NPAGES; } while (allocpages >= VM_LEVEL_0_NPAGES); return (m_ret); /* * Found a matching reservation. */ found: index = VM_RESERV_INDEX(object, pindex); /* Does the allocation fit within the reservation? */ if (index + npages > VM_LEVEL_0_NPAGES) return (NULL); m = &rv->pages[index]; pa = VM_PAGE_TO_PHYS(m); if (pa < low || pa + size > high || (pa & (alignment - 1)) != 0 || ((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0) return (NULL); /* Handle vm_page_rename(m, new_object, ...). 
*/ for (i = 0; i < npages; i++) if (popmap_is_set(rv->popmap, index + i)) return (NULL); for (i = 0; i < npages; i++) vm_reserv_populate(rv, index + i); return (m); } /* * Allocates a page from an existing or newly created reservation. * * The page "mpred" must immediately precede the offset "pindex" within the * specified object. * * The object and free page queue must be locked. */ vm_page_t vm_reserv_alloc_page(vm_object_t object, vm_pindex_t pindex, vm_page_t mpred) { vm_page_t m, msucc; vm_pindex_t first, leftcap, rightcap; vm_reserv_t rv; int i, index; mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); VM_OBJECT_ASSERT_WLOCKED(object); /* * Is a reservation fundamentally impossible? */ if (pindex < VM_RESERV_INDEX(object, pindex) || pindex >= object->size) return (NULL); /* * Look for an existing reservation. */ if (mpred != NULL) { KASSERT(mpred->object == object, ("vm_reserv_alloc_page: object doesn't contain mpred")); KASSERT(mpred->pindex < pindex, ("vm_reserv_alloc_page: mpred doesn't precede pindex")); rv = vm_reserv_from_page(mpred); if (rv->object == object && vm_reserv_has_pindex(rv, pindex)) goto found; msucc = TAILQ_NEXT(mpred, listq); } else msucc = TAILQ_FIRST(&object->memq); if (msucc != NULL) { KASSERT(msucc->pindex > pindex, ("vm_reserv_alloc_page: msucc doesn't succeed pindex")); rv = vm_reserv_from_page(msucc); if (rv->object == object && vm_reserv_has_pindex(rv, pindex)) goto found; } /* * Could a reservation fit between the first index to the left that * can be used and the first index to the right that cannot be used? */ first = pindex - VM_RESERV_INDEX(object, pindex); if (mpred != NULL) { if ((rv = vm_reserv_from_page(mpred))->object != object) leftcap = mpred->pindex + 1; else leftcap = rv->pindex + VM_LEVEL_0_NPAGES; if (leftcap > first) return (NULL); } if (msucc != NULL) { if ((rv = vm_reserv_from_page(msucc))->object != object) rightcap = msucc->pindex; else rightcap = rv->pindex; if (first + VM_LEVEL_0_NPAGES > rightcap) return (NULL); } /* * Would a new reservation extend past the end of the object? */ if (first + VM_LEVEL_0_NPAGES > object->size) { /* * Don't allocate a new reservation if the object is a vnode or * backed by another object that is a vnode. */ if (object->type == OBJT_VNODE || (object->backing_object != NULL && object->backing_object->type == OBJT_VNODE)) return (NULL); /* Speculate that the object may grow. */ } /* * Allocate and populate the new reservation. */ m = vm_phys_alloc_pages(VM_FREEPOOL_DEFAULT, VM_LEVEL_0_ORDER); if (m == NULL) return (NULL); rv = vm_reserv_from_page(m); KASSERT(rv->pages == m, ("vm_reserv_alloc_page: reserv %p's pages is corrupted", rv)); KASSERT(rv->object == NULL, ("vm_reserv_alloc_page: reserv %p isn't free", rv)); LIST_INSERT_HEAD(&object->rvq, rv, objq); rv->object = object; rv->pindex = first; KASSERT(rv->popcnt == 0, ("vm_reserv_alloc_page: reserv %p's popcnt is corrupted", rv)); KASSERT(!rv->inpartpopq, ("vm_reserv_alloc_page: reserv %p's inpartpopq is TRUE", rv)); for (i = 0; i < NPOPMAP; i++) KASSERT(rv->popmap[i] == 0, ("vm_reserv_alloc_page: reserv %p's popmap is corrupted", rv)); index = VM_RESERV_INDEX(object, pindex); vm_reserv_populate(rv, index); return (&rv->pages[index]); /* * Found a matching reservation. */ found: index = VM_RESERV_INDEX(object, pindex); m = &rv->pages[index]; /* Handle vm_page_rename(m, new_object, ...). */ if (popmap_is_set(rv->popmap, index)) return (NULL); vm_reserv_populate(rv, index); return (m); } /* * Breaks the given reservation. 
All free pages in the reservation * are returned to the physical memory allocator. The reservation's * population count and map are reset to their initial state. * * The given reservation must not be in the partially populated reservation * queue. The free page queue lock must be held. */ static void vm_reserv_break(vm_reserv_t rv) { int begin_zeroes, hi, i, lo; mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); KASSERT(rv->object != NULL, ("vm_reserv_break: reserv %p is free", rv)); KASSERT(!rv->inpartpopq, ("vm_reserv_break: reserv %p's inpartpopq is TRUE", rv)); LIST_REMOVE(rv, objq); rv->object = NULL; rv->pages->psind = 0; i = hi = 0; do { /* Find the next 0 bit. Any previous 0 bits are < "hi". */ lo = ffsl(~(((1UL << hi) - 1) | rv->popmap[i])); if (lo == 0) { /* Redundantly clears bits < "hi". */ rv->popmap[i] = 0; rv->popcnt -= NBPOPMAP - hi; while (++i < NPOPMAP) { lo = ffsl(~rv->popmap[i]); if (lo == 0) { rv->popmap[i] = 0; rv->popcnt -= NBPOPMAP; } else break; } if (i == NPOPMAP) break; hi = 0; } KASSERT(lo > 0, ("vm_reserv_break: lo is %d", lo)); /* Convert from ffsl() to ordinary bit numbering. */ lo--; if (lo > 0) { /* Redundantly clears bits < "hi". */ rv->popmap[i] &= ~((1UL << lo) - 1); rv->popcnt -= lo - hi; } begin_zeroes = NBPOPMAP * i + lo; /* Find the next 1 bit. */ do hi = ffsl(rv->popmap[i]); while (hi == 0 && ++i < NPOPMAP); if (i != NPOPMAP) /* Convert from ffsl() to ordinary bit numbering. */ hi--; vm_phys_free_contig(&rv->pages[begin_zeroes], NBPOPMAP * i + hi - begin_zeroes); } while (i < NPOPMAP); KASSERT(rv->popcnt == 0, ("vm_reserv_break: reserv %p's popcnt is corrupted", rv)); vm_reserv_broken++; } /* * Breaks all reservations belonging to the given object. */ void vm_reserv_break_all(vm_object_t object) { vm_reserv_t rv; mtx_lock(&vm_page_queue_free_mtx); while ((rv = LIST_FIRST(&object->rvq)) != NULL) { KASSERT(rv->object == object, ("vm_reserv_break_all: reserv %p is corrupted", rv)); if (rv->inpartpopq) { TAILQ_REMOVE(&vm_rvq_partpop, rv, partpopq); rv->inpartpopq = FALSE; } vm_reserv_break(rv); } mtx_unlock(&vm_page_queue_free_mtx); } /* * Frees the given page if it belongs to a reservation. Returns TRUE if the * page is freed and FALSE otherwise. * * The free page queue lock must be held. */ boolean_t vm_reserv_free_page(vm_page_t m) { vm_reserv_t rv; mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); rv = vm_reserv_from_page(m); if (rv->object == NULL) return (FALSE); vm_reserv_depopulate(rv, m - rv->pages); return (TRUE); } /* * Initializes the reservation management system. Specifically, initializes * the reservation array. * * Requires that vm_page_array and first_page are initialized! */ void vm_reserv_init(void) { vm_paddr_t paddr; struct vm_phys_seg *seg; int segind; /* * Initialize the reservation array. Specifically, initialize the * "pages" field for every element that has an underlying superpage. */ for (segind = 0; segind < vm_phys_nsegs; segind++) { seg = &vm_phys_segs[segind]; paddr = roundup2(seg->start, VM_LEVEL_0_SIZE); while (paddr + VM_LEVEL_0_SIZE <= seg->end) { vm_reserv_array[paddr >> VM_LEVEL_0_SHIFT].pages = PHYS_TO_VM_PAGE(paddr); paddr += VM_LEVEL_0_SIZE; } } } /* * Returns true if the given page belongs to a reservation and that page is * free. Otherwise, returns false. 
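[Editor's note: vm_reserv_break() above finds maximal runs of clear popmap bits with ffsl(). Below is a single-word userland model of that scan (the kernel iterates over NPOPMAP words); the sample bitmap is an invented example, and ffsl() is taken from <strings.h> as on FreeBSD.]

#include <stdio.h>
#include <strings.h>	/* ffsl() */

int
main(void)
{
	unsigned long popmap = 0x00000f0fUL;	/* assumed population map */
	int hi = 0, lo;

	for (;;) {
		/* Mask bits below "hi", then find the next clear bit. */
		lo = ffsl(~(((1UL << hi) - 1) | popmap));
		if (lo == 0)
			break;			/* no more free runs */
		lo--;				/* ffsl() is 1-based */
		/* Find the next set bit at or above "lo". */
		hi = ffsl(popmap & ~((1UL << lo) - 1));
		if (hi == 0)
			hi = 64;		/* run extends to the end */
		else
			hi--;
		printf("free run: bits %d-%d\n", lo, hi - 1);
		if (hi == 64)
			break;
	}
	return (0);
}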
*/ bool vm_reserv_is_page_free(vm_page_t m) { vm_reserv_t rv; mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); rv = vm_reserv_from_page(m); if (rv->object == NULL) return (false); return (popmap_is_clear(rv->popmap, m - rv->pages)); } /* * If the given page belongs to a reservation, returns the level of that * reservation. Otherwise, returns -1. */ int vm_reserv_level(vm_page_t m) { vm_reserv_t rv; rv = vm_reserv_from_page(m); return (rv->object != NULL ? 0 : -1); } /* * Returns a reservation level if the given page belongs to a fully populated * reservation and -1 otherwise. */ int vm_reserv_level_iffullpop(vm_page_t m) { vm_reserv_t rv; rv = vm_reserv_from_page(m); return (rv->popcnt == VM_LEVEL_0_NPAGES ? 0 : -1); } /* * Breaks the given partially populated reservation, releasing its free pages * to the physical memory allocator. * * The free page queue lock must be held. */ static void vm_reserv_reclaim(vm_reserv_t rv) { mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); KASSERT(rv->inpartpopq, ("vm_reserv_reclaim: reserv %p's inpartpopq is FALSE", rv)); TAILQ_REMOVE(&vm_rvq_partpop, rv, partpopq); rv->inpartpopq = FALSE; vm_reserv_break(rv); vm_reserv_reclaimed++; } /* * Breaks the reservation at the head of the partially populated reservation * queue, releasing its free pages to the physical memory allocator. Returns * TRUE if a reservation is broken and FALSE otherwise. * * The free page queue lock must be held. */ boolean_t vm_reserv_reclaim_inactive(void) { vm_reserv_t rv; mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); if ((rv = TAILQ_FIRST(&vm_rvq_partpop)) != NULL) { vm_reserv_reclaim(rv); return (TRUE); } return (FALSE); } /* * Searches the partially populated reservation queue for the least recently * changed reservation with free pages that satisfy the given request for * contiguous physical memory. If a satisfactory reservation is found, it is * broken. Returns TRUE if a reservation is broken and FALSE otherwise. * * The free page queue lock must be held. */ boolean_t vm_reserv_reclaim_contig(u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary) { vm_paddr_t pa, size; vm_reserv_t rv; int hi, i, lo, low_index, next_free; mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); if (npages > VM_LEVEL_0_NPAGES - 1) return (FALSE); size = npages << PAGE_SHIFT; TAILQ_FOREACH(rv, &vm_rvq_partpop, partpopq) { pa = VM_PAGE_TO_PHYS(&rv->pages[VM_LEVEL_0_NPAGES - 1]); if (pa + PAGE_SIZE - size < low) { /* This entire reservation is too low; go to next. */ continue; } pa = VM_PAGE_TO_PHYS(&rv->pages[0]); if (pa + size > high) { /* This entire reservation is too high; go to next. */ continue; } if (pa < low) { /* Start the search for free pages at "low". */ low_index = (low + PAGE_MASK - pa) >> PAGE_SHIFT; i = low_index / NBPOPMAP; hi = low_index % NBPOPMAP; } else i = hi = 0; do { /* Find the next free page. */ lo = ffsl(~(((1UL << hi) - 1) | rv->popmap[i])); while (lo == 0 && ++i < NPOPMAP) lo = ffsl(~rv->popmap[i]); if (i == NPOPMAP) break; /* Convert from ffsl() to ordinary bit numbering. */ lo--; next_free = NBPOPMAP * i + lo; pa = VM_PAGE_TO_PHYS(&rv->pages[next_free]); KASSERT(pa >= low, ("vm_reserv_reclaim_contig: pa is too low")); if (pa + size > high) { /* The rest of this reservation is too high. */ break; } else if ((pa & (alignment - 1)) != 0 || ((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0) { /* * The current page doesn't meet the alignment * and/or boundary requirements. 
Continue * searching this reservation until the rest * of its free pages are either excluded or * exhausted. */ hi = lo + 1; if (hi >= NBPOPMAP) { hi = 0; i++; } continue; } /* Find the next used page. */ hi = ffsl(rv->popmap[i] & ~((1UL << lo) - 1)); while (hi == 0 && ++i < NPOPMAP) { if ((NBPOPMAP * i - next_free) * PAGE_SIZE >= size) { vm_reserv_reclaim(rv); return (TRUE); } hi = ffsl(rv->popmap[i]); } /* Convert from ffsl() to ordinary bit numbering. */ if (i != NPOPMAP) hi--; if ((NBPOPMAP * i + hi - next_free) * PAGE_SIZE >= size) { vm_reserv_reclaim(rv); return (TRUE); } } while (i < NPOPMAP); } return (FALSE); } /* * Transfers the reservation underlying the given page to a new object. * * The object must be locked. */ void vm_reserv_rename(vm_page_t m, vm_object_t new_object, vm_object_t old_object, vm_pindex_t old_object_offset) { vm_reserv_t rv; VM_OBJECT_ASSERT_WLOCKED(new_object); rv = vm_reserv_from_page(m); if (rv->object == old_object) { mtx_lock(&vm_page_queue_free_mtx); if (rv->object == old_object) { LIST_REMOVE(rv, objq); LIST_INSERT_HEAD(&new_object->rvq, rv, objq); rv->object = new_object; rv->pindex -= old_object_offset; } mtx_unlock(&vm_page_queue_free_mtx); } } /* * Returns the size (in bytes) of a reservation of the specified level. */ int vm_reserv_size(int level) { switch (level) { case 0: return (VM_LEVEL_0_SIZE); case -1: return (PAGE_SIZE); default: return (0); } } /* * Allocates the virtual and physical memory required by the reservation * management system's data structures, in particular, the reservation array. */ vm_paddr_t vm_reserv_startup(vm_offset_t *vaddr, vm_paddr_t end, vm_paddr_t high_water) { vm_paddr_t new_end; size_t size; /* * Calculate the size (in bytes) of the reservation array. Round up * from "high_water" because every small page is mapped to an element * in the reservation array based on its physical address. Thus, the * number of elements in the reservation array can be greater than the * number of superpages. */ size = howmany(high_water, VM_LEVEL_0_SIZE) * sizeof(struct vm_reserv); /* * Allocate and map the physical memory for the reservation array. The * next available virtual address is returned by reference. */ new_end = end - round_page(size); vm_reserv_array = (void *)(uintptr_t)pmap_map(vaddr, new_end, end, VM_PROT_READ | VM_PROT_WRITE); bzero(vm_reserv_array, size); /* * Return the next available physical address. */ return (new_end); } /* * Returns the superpage containing the given page. */ vm_page_t vm_reserv_to_superpage(vm_page_t m) { vm_reserv_t rv; VM_OBJECT_ASSERT_LOCKED(m->object); rv = vm_reserv_from_page(m); return (rv->object == m->object && rv->popcnt == VM_LEVEL_0_NPAGES ? rv->pages : NULL); } #endif /* VM_NRESERVLEVEL > 0 */ Index: stable/11/sys/x86/x86/intr_machdep.c =================================================================== --- stable/11/sys/x86/x86/intr_machdep.c (revision 331016) +++ stable/11/sys/x86/x86/intr_machdep.c (revision 331017) @@ -1,610 +1,611 @@ /*- * Copyright (c) 2003 John Baldwin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * Machine dependent interrupt code for x86. For x86, we have to * deal with different PICs. Thus, we use the passed in vector to lookup * an interrupt source associated with that vector. The interrupt source * describes which PIC the source belongs to and includes methods to handle * that source. */ #include "opt_atpic.h" #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #ifdef DDB #include #endif #ifndef DEV_ATPIC #include #include #include #include #ifdef PC98 #include #else #include #endif #endif #define MAX_STRAY_LOG 5 typedef void (*mask_fn)(void *); static int intrcnt_index; static struct intsrc *interrupt_sources[NUM_IO_INTS]; static struct sx intrsrc_lock; static struct mtx intrpic_lock; static struct mtx intrcnt_lock; static TAILQ_HEAD(pics_head, pic) pics; #if defined(SMP) && !defined(EARLY_AP_STARTUP) static int assign_cpu; #endif u_long intrcnt[INTRCNT_COUNT]; char intrnames[INTRCNT_COUNT * (MAXCOMLEN + 1)]; size_t sintrcnt = sizeof(intrcnt); size_t sintrnames = sizeof(intrnames); static int intr_assign_cpu(void *arg, int cpu); static void intr_disable_src(void *arg); static void intr_init(void *__dummy); static int intr_pic_registered(struct pic *pic); static void intrcnt_setname(const char *name, int index); static void intrcnt_updatename(struct intsrc *is); static void intrcnt_register(struct intsrc *is); static int intr_pic_registered(struct pic *pic) { struct pic *p; TAILQ_FOREACH(p, &pics, pics) { if (p == pic) return (1); } return (0); } /* * Register a new interrupt controller (PIC). This is to support suspend * and resume where we suspend/resume controllers rather than individual * sources. This also allows controllers with no active sources (such as * 8259As in a system using the APICs) to participate in suspend and resume. */ int intr_register_pic(struct pic *pic) { int error; mtx_lock(&intrpic_lock); if (intr_pic_registered(pic)) error = EBUSY; else { TAILQ_INSERT_TAIL(&pics, pic, pics); error = 0; } mtx_unlock(&intrpic_lock); return (error); } /* * Register a new interrupt source with the global interrupt system. * The global interrupts need to be disabled when this function is * called. 
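[Editor's note: intr_register_source() below uses a cheap unlocked check plus a re-check under the lock before claiming a vector slot, so the expensive event allocation is skipped in the common duplicate case and a lost race is cleaned up. A hypothetical userland model of that pattern using a pthread mutex; the names and sizes are illustrative only.]

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

#define NVEC 4
static void *sources[NVEC];
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static int
register_source(int vec, void *isrc)
{
	if (sources[vec] != NULL)
		return (EEXIST);	/* cheap early rejection */
	/* ... the expensive event allocation would go here ... */
	pthread_mutex_lock(&lock);
	if (sources[vec] != NULL) {
		pthread_mutex_unlock(&lock);
		/* ... destroy the event: we lost the race ... */
		return (EEXIST);
	}
	sources[vec] = isrc;
	pthread_mutex_unlock(&lock);
	return (0);
}

int
main(void)
{
	int a, b, dummy;

	a = register_source(1, &dummy);
	b = register_source(1, &dummy);
	printf("%d %d\n", a, b);	/* 0, then EEXIST */
	return (0);
}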
*/ int intr_register_source(struct intsrc *isrc) { int error, vector; KASSERT(intr_pic_registered(isrc->is_pic), ("unregistered PIC")); vector = isrc->is_pic->pic_vector(isrc); if (interrupt_sources[vector] != NULL) return (EEXIST); error = intr_event_create(&isrc->is_event, isrc, 0, vector, intr_disable_src, (mask_fn)isrc->is_pic->pic_enable_source, (mask_fn)isrc->is_pic->pic_eoi_source, intr_assign_cpu, "irq%d:", vector); if (error) return (error); sx_xlock(&intrsrc_lock); if (interrupt_sources[vector] != NULL) { sx_xunlock(&intrsrc_lock); intr_event_destroy(isrc->is_event); return (EEXIST); } intrcnt_register(isrc); interrupt_sources[vector] = isrc; isrc->is_handlers = 0; sx_xunlock(&intrsrc_lock); return (0); } struct intsrc * intr_lookup_source(int vector) { if (vector < 0 || vector >= nitems(interrupt_sources)) return (NULL); return (interrupt_sources[vector]); } int intr_add_handler(const char *name, int vector, driver_filter_t filter, driver_intr_t handler, void *arg, enum intr_type flags, void **cookiep) { struct intsrc *isrc; int error; isrc = intr_lookup_source(vector); if (isrc == NULL) return (EINVAL); error = intr_event_add_handler(isrc->is_event, name, filter, handler, arg, intr_priority(flags), flags, cookiep); if (error == 0) { sx_xlock(&intrsrc_lock); intrcnt_updatename(isrc); isrc->is_handlers++; if (isrc->is_handlers == 1) { isrc->is_pic->pic_enable_intr(isrc); isrc->is_pic->pic_enable_source(isrc); } sx_xunlock(&intrsrc_lock); } return (error); } int intr_remove_handler(void *cookie) { struct intsrc *isrc; int error; isrc = intr_handler_source(cookie); error = intr_event_remove_handler(cookie); if (error == 0) { sx_xlock(&intrsrc_lock); isrc->is_handlers--; if (isrc->is_handlers == 0) { isrc->is_pic->pic_disable_source(isrc, PIC_NO_EOI); isrc->is_pic->pic_disable_intr(isrc); } intrcnt_updatename(isrc); sx_xunlock(&intrsrc_lock); } return (error); } int intr_config_intr(int vector, enum intr_trigger trig, enum intr_polarity pol) { struct intsrc *isrc; isrc = intr_lookup_source(vector); if (isrc == NULL) return (EINVAL); return (isrc->is_pic->pic_config_intr(isrc, trig, pol)); } static void intr_disable_src(void *arg) { struct intsrc *isrc; isrc = arg; isrc->is_pic->pic_disable_source(isrc, PIC_EOI); } void intr_execute_handlers(struct intsrc *isrc, struct trapframe *frame) { struct intr_event *ie; int vector; /* * We count software interrupts when we process them. The * code here follows previous practice, but there's an * argument for counting hardware interrupts when they're * processed too. */ (*isrc->is_count)++; PCPU_INC(cnt.v_intr); ie = isrc->is_event; /* * XXX: We assume that IRQ 0 is only used for the ISA timer * device (clk). */ vector = isrc->is_pic->pic_vector(isrc); if (vector == 0) clkintr_pending = 1; /* * For stray interrupts, mask and EOI the source, bump the * stray count, and log the condition. 
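[Editor's note: the stray-interrupt handling described above throttles its own logging so a stuck source cannot flood the console. A stand-alone model of that rate-limit shape follows; MAX_STRAY_LOG's value is taken from this file, the rest is invented for illustration.]

#include <stdio.h>

#define MAX_STRAY_LOG 5

/*
 * Log each stray up to the limit, announce once that logging stops,
 * then stay silent, matching intr_execute_handlers() below.
 */
static void
stray(int vector, unsigned long *count)
{
	(*count)++;
	if (*count < MAX_STRAY_LOG)
		printf("stray irq%d\n", vector);
	else if (*count == MAX_STRAY_LOG)
		printf("too many stray irq %d's: not logging anymore\n",
		    vector);
}

int
main(void)
{
	unsigned long count = 0;
	int i;

	for (i = 0; i < 8; i++)		/* only 5 lines are printed */
		stray(7, &count);
	return (0);
}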
*/ if (intr_event_handle(ie, frame) != 0) { isrc->is_pic->pic_disable_source(isrc, PIC_EOI); (*isrc->is_straycount)++; if (*isrc->is_straycount < MAX_STRAY_LOG) log(LOG_ERR, "stray irq%d\n", vector); else if (*isrc->is_straycount == MAX_STRAY_LOG) log(LOG_CRIT, "too many stray irq %d's: not logging anymore\n", vector); } } void intr_resume(bool suspend_cancelled) { struct pic *pic; #ifndef DEV_ATPIC atpic_reset(); #endif mtx_lock(&intrpic_lock); TAILQ_FOREACH(pic, &pics, pics) { if (pic->pic_resume != NULL) pic->pic_resume(pic, suspend_cancelled); } mtx_unlock(&intrpic_lock); } void intr_suspend(void) { struct pic *pic; mtx_lock(&intrpic_lock); TAILQ_FOREACH_REVERSE(pic, &pics, pics_head, pics) { if (pic->pic_suspend != NULL) pic->pic_suspend(pic); } mtx_unlock(&intrpic_lock); } static int intr_assign_cpu(void *arg, int cpu) { #ifdef SMP struct intsrc *isrc; int error; #ifdef EARLY_AP_STARTUP MPASS(mp_ncpus == 1 || smp_started); /* Nothing to do if there is only a single CPU. */ if (mp_ncpus > 1 && cpu != NOCPU) { #else /* * Don't do anything during early boot. We will pick up the * assignment once the APs are started. */ if (assign_cpu && cpu != NOCPU) { #endif isrc = arg; sx_xlock(&intrsrc_lock); error = isrc->is_pic->pic_assign_cpu(isrc, cpu_apic_ids[cpu]); sx_xunlock(&intrsrc_lock); } else error = 0; return (error); #else return (EOPNOTSUPP); #endif } static void intrcnt_setname(const char *name, int index) { snprintf(intrnames + (MAXCOMLEN + 1) * index, MAXCOMLEN + 1, "%-*s", MAXCOMLEN, name); } static void intrcnt_updatename(struct intsrc *is) { intrcnt_setname(is->is_event->ie_fullname, is->is_index); } static void intrcnt_register(struct intsrc *is) { char straystr[MAXCOMLEN + 1]; KASSERT(is->is_event != NULL, ("%s: isrc with no event", __func__)); mtx_lock_spin(&intrcnt_lock); is->is_index = intrcnt_index; intrcnt_index += 2; snprintf(straystr, MAXCOMLEN + 1, "stray irq%d", is->is_pic->pic_vector(is)); intrcnt_updatename(is); is->is_count = &intrcnt[is->is_index]; intrcnt_setname(straystr, is->is_index + 1); is->is_straycount = &intrcnt[is->is_index + 1]; mtx_unlock_spin(&intrcnt_lock); } void intrcnt_add(const char *name, u_long **countp) { mtx_lock_spin(&intrcnt_lock); *countp = &intrcnt[intrcnt_index]; intrcnt_setname(name, intrcnt_index); intrcnt_index++; mtx_unlock_spin(&intrcnt_lock); } static void intr_init(void *dummy __unused) { intrcnt_setname("???", 0); intrcnt_index = 1; TAILQ_INIT(&pics); mtx_init(&intrpic_lock, "intrpic", NULL, MTX_DEF); sx_init(&intrsrc_lock, "intrsrc"); mtx_init(&intrcnt_lock, "intrcnt", NULL, MTX_SPIN); } SYSINIT(intr_init, SI_SUB_INTR, SI_ORDER_FIRST, intr_init, NULL); static void intr_init_final(void *dummy __unused) { /* * Enable interrupts on the BSP after all of the interrupt * controllers are initialized. Device interrupts are still * disabled in the interrupt controllers until interrupt * handlers are registered. Interrupts are enabled on each AP * after their first context switch. */ enable_intr(); } SYSINIT(intr_init_final, SI_SUB_INTR, SI_ORDER_ANY, intr_init_final, NULL); #ifndef DEV_ATPIC /* Initialize the two 8259A's to a known-good shutdown state. 
*/ void atpic_reset(void) { outb(IO_ICU1, ICW1_RESET | ICW1_IC4); outb(IO_ICU1 + ICU_IMR_OFFSET, IDT_IO_INTS); outb(IO_ICU1 + ICU_IMR_OFFSET, IRQ_MASK(ICU_SLAVEID)); outb(IO_ICU1 + ICU_IMR_OFFSET, MASTER_MODE); outb(IO_ICU1 + ICU_IMR_OFFSET, 0xff); outb(IO_ICU1, OCW3_SEL | OCW3_RR); outb(IO_ICU2, ICW1_RESET | ICW1_IC4); outb(IO_ICU2 + ICU_IMR_OFFSET, IDT_IO_INTS + 8); outb(IO_ICU2 + ICU_IMR_OFFSET, ICU_SLAVEID); outb(IO_ICU2 + ICU_IMR_OFFSET, SLAVE_MODE); outb(IO_ICU2 + ICU_IMR_OFFSET, 0xff); outb(IO_ICU2, OCW3_SEL | OCW3_RR); } #endif /* Add a description to an active interrupt handler. */ int intr_describe(u_int vector, void *ih, const char *descr) { struct intsrc *isrc; int error; isrc = intr_lookup_source(vector); if (isrc == NULL) return (EINVAL); error = intr_event_describe_handler(isrc->is_event, ih, descr); if (error) return (error); intrcnt_updatename(isrc); return (0); } void intr_reprogram(void) { struct intsrc *is; int v; sx_xlock(&intrsrc_lock); for (v = 0; v < NUM_IO_INTS; v++) { is = interrupt_sources[v]; if (is == NULL) continue; if (is->is_pic->pic_reprogram_pin != NULL) is->is_pic->pic_reprogram_pin(is); } sx_xunlock(&intrsrc_lock); } #ifdef DDB /* * Dump data about interrupt handlers */ DB_SHOW_COMMAND(irqs, db_show_irqs) { struct intsrc **isrc; int i, verbose; if (strcmp(modif, "v") == 0) verbose = 1; else verbose = 0; isrc = interrupt_sources; for (i = 0; i < NUM_IO_INTS && !db_pager_quit; i++, isrc++) if (*isrc != NULL) db_dump_intr_event((*isrc)->is_event, verbose); } #endif #ifdef SMP /* * Support for balancing interrupt sources across CPUs. For now we just * allocate CPUs round-robin. */ cpuset_t intr_cpus = CPUSET_T_INITIALIZER(0x1); static int current_cpu; /* * Return the CPU that the next interrupt source should use. For now * this just returns the next local APIC according to round-robin. */ u_int intr_next_cpu(void) { u_int apic_id; #ifdef EARLY_AP_STARTUP MPASS(mp_ncpus == 1 || smp_started); if (mp_ncpus == 1) return (PCPU_GET(apic_id)); #else /* Leave all interrupts on the BSP during boot. */ if (!assign_cpu) return (PCPU_GET(apic_id)); #endif mtx_lock_spin(&icu_lock); apic_id = cpu_apic_ids[current_cpu]; do { current_cpu++; if (current_cpu > mp_maxid) current_cpu = 0; } while (!CPU_ISSET(current_cpu, &intr_cpus)); mtx_unlock_spin(&icu_lock); return (apic_id); } /* Attempt to bind the specified IRQ to the specified CPU. */ int intr_bind(u_int vector, u_char cpu) { struct intsrc *isrc; isrc = intr_lookup_source(vector); if (isrc == NULL) return (EINVAL); return (intr_event_bind(isrc->is_event, cpu)); } /* * Add a CPU to our mask of valid CPUs that can be destinations of * interrupts. */ void intr_add_cpu(u_int cpu) { if (cpu >= MAXCPU) panic("%s: Invalid CPU ID", __func__); if (bootverbose) printf("INTR: Adding local APIC %d as a target\n", cpu_apic_ids[cpu]); CPU_SET(cpu, &intr_cpus); } #ifndef EARLY_AP_STARTUP /* * Distribute all the interrupt sources among the available CPUs once the * AP's have been launched. */ static void intr_shuffle_irqs(void *arg __unused) { struct intsrc *isrc; int i; /* Don't bother on UP. */ if (mp_ncpus == 1) return; /* Round-robin assign a CPU to each enabled source. */ sx_xlock(&intrsrc_lock); assign_cpu = 1; for (i = 0; i < NUM_IO_INTS; i++) { isrc = interrupt_sources[i]; if (isrc != NULL && isrc->is_handlers > 0) { /* * If this event is already bound to a CPU, * then assign the source to that CPU instead * of picking one via round-robin. 
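[Editor's note: intr_next_cpu() above returns the current round-robin target and then advances, skipping CPUs not present in intr_cpus. A hypothetical userland model with a plain bitmask standing in for cpuset_t and APIC ids; the mask contents and mp_maxid are invented example values.]

#include <stdio.h>

static unsigned int intr_cpus = 0x1;	/* BSP only, until APs are added */
static int current_cpu;

/* Return the current target, then advance with wraparound, skipping
 * CPUs that are not members of the interrupt-target mask. */
static int
next_cpu(int mp_maxid)
{
	int picked = current_cpu;

	do {
		current_cpu++;
		if (current_cpu > mp_maxid)
			current_cpu = 0;
	} while (!(intr_cpus & (1u << current_cpu)));
	return (picked);
}

int
main(void)
{
	int i;

	intr_cpus |= 0x4 | 0x8;		/* CPUs 2 and 3 become targets */
	for (i = 0; i < 5; i++)		/* prints 0, 2, 3, 0, 2 */
		printf("cpu %d\n", next_cpu(3));
	return (0);
}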
/* Attempt to bind the specified IRQ to the specified CPU. */
int
intr_bind(u_int vector, u_char cpu)
{
	struct intsrc *isrc;

	isrc = intr_lookup_source(vector);
	if (isrc == NULL)
		return (EINVAL);
	return (intr_event_bind(isrc->is_event, cpu));
}

/*
 * Add a CPU to our mask of valid CPUs that can be destinations of
 * interrupts.
 */
void
intr_add_cpu(u_int cpu)
{

	if (cpu >= MAXCPU)
		panic("%s: Invalid CPU ID", __func__);
	if (bootverbose)
		printf("INTR: Adding local APIC %d as a target\n",
		    cpu_apic_ids[cpu]);

	CPU_SET(cpu, &intr_cpus);
}

#ifndef EARLY_AP_STARTUP
/*
 * Distribute all the interrupt sources among the available CPUs once the
 * APs have been launched.
 */
static void
intr_shuffle_irqs(void *arg __unused)
{
	struct intsrc *isrc;
	int i;

	/* Don't bother on UP. */
	if (mp_ncpus == 1)
		return;

	/* Round-robin assign a CPU to each enabled source. */
	sx_xlock(&intrsrc_lock);
	assign_cpu = 1;
	for (i = 0; i < NUM_IO_INTS; i++) {
		isrc = interrupt_sources[i];
		if (isrc != NULL && isrc->is_handlers > 0) {
			/*
			 * If this event is already bound to a CPU,
			 * then assign the source to that CPU instead
			 * of picking one via round-robin.  Note that
			 * this is careful to only advance the
			 * round-robin if the CPU assignment succeeds.
			 */
			if (isrc->is_event->ie_cpu != NOCPU)
				(void)isrc->is_pic->pic_assign_cpu(isrc,
				    cpu_apic_ids[isrc->is_event->ie_cpu]);
			else if (isrc->is_pic->pic_assign_cpu(isrc,
			    cpu_apic_ids[current_cpu]) == 0)
				(void)intr_next_cpu();
		}
	}
	sx_xunlock(&intrsrc_lock);
}
SYSINIT(intr_shuffle_irqs, SI_SUB_SMP, SI_ORDER_SECOND, intr_shuffle_irqs,
    NULL);
#endif
#else
/*
 * Always route interrupts to the current processor in the UP case.
 */
u_int
intr_next_cpu(void)
{

	return (PCPU_GET(apic_id));
}
#endif

Index: stable/11/sys/x86/xen/xenpv.c
===================================================================
--- stable/11/sys/x86/xen/xenpv.c	(revision 331016)
+++ stable/11/sys/x86/xen/xenpv.c	(revision 331017)
@@ -1,201 +1,202 @@
/*
 * Copyright (c) 2014 Roger Pau Monné
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include
__FBSDID("$FreeBSD$");

#include
#include
#include
#include
#include
#include
#include
#include
#include
+#include
#include
#include
#include
#include
#include
#include
#include

#include "xenmem_if.h"

/*
 * Allocate unused physical memory above 4GB in order to map memory
 * from foreign domains.  We use memory starting at 4GB in order to
 * prevent clashes with MMIO/ACPI regions.
 *
 * Since this is not possible on i386 just use any available memory
 * chunk and hope we don't clash with anything else.
 */
#ifdef __amd64__
#define	LOW_MEM_LIMIT	0x100000000ul
#else
#define	LOW_MEM_LIMIT	0
#endif

static devclass_t xenpv_devclass;

static void
xenpv_identify(driver_t *driver, device_t parent)
{

	if (!xen_domain())
		return;

	/* Make sure there's only one xenpv device. */
	if (devclass_get_device(xenpv_devclass, 0))
		return;

	/*
	 * The xenpv bus should be the last to attach in order
	 * to properly detect if an ISA bus has already been added.
	 */
	if (BUS_ADD_CHILD(parent, UINT_MAX, "xenpv", 0) == NULL)
		panic("Unable to attach xenpv bus.");
}

static int
xenpv_probe(device_t dev)
{

	device_set_desc(dev, "Xen PV bus");
	return (BUS_PROBE_NOWILDCARD);
}
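/*
 * Returning BUS_PROBE_NOWILDCARD above means the driver only matches
 * devices that were explicitly added under its own name, i.e. the
 * single child created by xenpv_identify(), never wildcard children
 * of the parent bus.
 */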
static int
xenpv_attach(device_t dev)
{
	device_t child;

	/*
	 * Let our child drivers identify any child devices that they
	 * can find.  Once that is done attach any devices that we
	 * found.
	 */
	bus_generic_probe(dev);
	bus_generic_attach(dev);

	if (!devclass_get_device(devclass_find("isa"), 0)) {
		child = BUS_ADD_CHILD(dev, 0, "isa", 0);
		if (child == NULL)
			panic("Failed to attach ISA bus.");
		device_probe_and_attach(child);
	}

	return (0);
}

static struct resource *
xenpv_alloc_physmem(device_t dev, device_t child, int *res_id, size_t size)
{
	struct resource *res;
	vm_paddr_t phys_addr;
	int error;

	res = bus_alloc_resource(child, SYS_RES_MEMORY, res_id, LOW_MEM_LIMIT,
	    ~0, size, RF_ACTIVE);
	if (res == NULL)
		return (NULL);

	phys_addr = rman_get_start(res);
	/*
	 * Register the range as fictitious physical memory so the VM
	 * can create page mappings for it; it is not managed by the
	 * regular page allocator.
	 */
	error = vm_phys_fictitious_reg_range(phys_addr, phys_addr + size,
	    VM_MEMATTR_DEFAULT);
	if (error) {
		bus_release_resource(child, SYS_RES_MEMORY, *res_id, res);
		return (NULL);
	}

	return (res);
}

static int
xenpv_free_physmem(device_t dev, device_t child, int res_id, struct resource *res)
{
	vm_paddr_t phys_addr;
	size_t size;

	phys_addr = rman_get_start(res);
	size = rman_get_size(res);

	vm_phys_fictitious_unreg_range(phys_addr, phys_addr + size);
	return (bus_release_resource(child, SYS_RES_MEMORY, res_id, res));
}

static device_method_t xenpv_methods[] = {
	/* Device interface */
	DEVMETHOD(device_identify,		xenpv_identify),
	DEVMETHOD(device_probe,			xenpv_probe),
	DEVMETHOD(device_attach,		xenpv_attach),
	DEVMETHOD(device_suspend,		bus_generic_suspend),
	DEVMETHOD(device_resume,		bus_generic_resume),

	/* Bus interface */
	DEVMETHOD(bus_add_child,		bus_generic_add_child),
	DEVMETHOD(bus_alloc_resource,		bus_generic_alloc_resource),
	DEVMETHOD(bus_release_resource,		bus_generic_release_resource),
	DEVMETHOD(bus_activate_resource,	bus_generic_activate_resource),
	DEVMETHOD(bus_deactivate_resource,	bus_generic_deactivate_resource),

	/* Interface to allocate memory for foreign mappings */
	DEVMETHOD(xenmem_alloc,			xenpv_alloc_physmem),
	DEVMETHOD(xenmem_free,			xenpv_free_physmem),

	DEVMETHOD_END
};

static driver_t xenpv_driver = {
	"xenpv",
	xenpv_methods,
	0,
};

DRIVER_MODULE(xenpv, nexus, xenpv_driver, xenpv_devclass, 0, 0);

struct resource *
xenmem_alloc(device_t dev, int *res_id, size_t size)
{
	device_t parent;

	parent = device_get_parent(dev);
	if (parent == NULL)
		return (NULL);
	return (XENMEM_ALLOC(parent, dev, res_id, size));
}

int
xenmem_free(device_t dev, int res_id, struct resource *res)
{
	device_t parent;

	parent = device_get_parent(dev);
	if (parent == NULL)
		return (ENXIO);
	return (XENMEM_FREE(parent, dev, res_id, res));
}

Index: stable/11
===================================================================
--- stable/11	(revision 331016)
+++ stable/11	(revision 331017)

Property changes on: stable/11
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head:r317055-317056