Index: head/sys/dev/drm/drm_bufs.c
===================================================================
--- head/sys/dev/drm/drm_bufs.c	(revision 283997)
+++ head/sys/dev/drm/drm_bufs.c	(revision 283998)
@@ -1,1130 +1,1130 @@
 /*-
  * Copyright 1999, 2000 Precision Insight, Inc., Cedar Park, Texas.
  * Copyright 2000 VA Linux Systems, Inc., Sunnyvale, California.
  * All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
  * to deal in the Software without restriction, including without limitation
  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  * and/or sell copies of the Software, and to permit persons to whom the
  * Software is furnished to do so, subject to the following conditions:
  *
  * The above copyright notice and this permission notice (including the next
  * paragraph) shall be included in all copies or substantial portions of the
  * Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  * VA LINUX SYSTEMS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  * OTHER DEALINGS IN THE SOFTWARE.
  *
  * Authors:
  *    Rickard E. (Rik) Faith <faith@valinux.com>
  *    Gareth Hughes <gareth@valinux.com>
  *
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /** @file drm_bufs.c
  * Implementation of the ioctls for setup of DRM mappings and DMA buffers.
  */
 
 #include "dev/pci/pcireg.h"
 
 #include "dev/drm/drmP.h"
 
 /* Allocation of PCI memory resources (framebuffer, registers, etc.) for
  * drm_get_resource_*.  Note that they are not RF_ACTIVE, so there's no virtual
  * address for accessing them.  Cleaned up at unload.
  *
  * Asserts that dev->dev_lock is held on entry.  DRM_UNLOCK()/DRM_LOCK()
  * bracket the call to bus_alloc_resource_any(), so the slot has to be
  * re-checked once the lock is held again.
  *
  * Returns 0 when dev->pcir[resource] is populated, 1 when the index is out
  * of range or the bus allocation failed.
  */
 static int drm_alloc_resource(struct drm_device *dev, int resource)
 {
 	struct resource *res;
 	int rid;
 
 	DRM_SPINLOCK_ASSERT(&dev->dev_lock);
 
 	/* The public wrappers take an unsigned index that is narrowed to int
 	 * here, so a very large value arrives negative and must be rejected
 	 * before it is used to index dev->pcir[].
 	 */
 	if (resource < 0 || resource >= DRM_MAX_PCI_RESOURCE) {
 		DRM_ERROR("Resource %d too large\n", resource);
 		return 1;
 	}
 
 	if (dev->pcir[resource] != NULL) {
 		return 0;
 	}
 
 	DRM_UNLOCK();
 	rid = PCIR_BAR(resource);
 	res = bus_alloc_resource_any(dev->device, SYS_RES_MEMORY, &rid,
 	    RF_SHAREABLE);
 	DRM_LOCK();
 	if (res == NULL) {
 		DRM_ERROR("Couldn't find resource 0x%x\n", resource);
 		return 1;
 	}
 
 	if (dev->pcir[resource] != NULL) {
 		/* Somebody else filled the slot while the lock was dropped.
 		 * Keep theirs and hand our duplicate back instead of leaking
 		 * it.
 		 */
 		bus_release_resource(dev->device, SYS_RES_MEMORY, rid, res);
 		return 0;
 	}
 
 	dev->pcirid[resource] = rid;
 	dev->pcir[resource] = res;
 
 	return 0;
 }
 
 /* Return the start address of PCI memory resource `resource`, allocating the
  * resource on first use.  Returns 0 if drm_alloc_resource() reports failure.
  */
 unsigned long drm_get_resource_start(struct drm_device *dev,
 				     unsigned int resource)
 {
 	int failed = drm_alloc_resource(dev, resource);
 
 	if (failed)
 		return 0;
 
 	return rman_get_start(dev->pcir[resource]);
 }
 
 /* Return the size of PCI memory resource `resource`, allocating the resource
  * on first use.  Returns 0 if drm_alloc_resource() reports failure.
  */
 unsigned long drm_get_resource_len(struct drm_device *dev,
 				   unsigned int resource)
 {
 	int failed = drm_alloc_resource(dev, resource);
 
 	if (failed)
 		return 0;
 
 	return rman_get_size(dev->pcir[resource]);
 }
 
 /**
  * Create a map of the requested type and link it onto dev->maplist, or hand
  * back an already-registered kernel map that matches.
  *
  * Locking: the function drops the DRM lock (DRM_UNLOCK) while it allocates
  * and initializes a new map and takes it again (DRM_LOCK) before inserting
  * the map or returning an error, so it is entered and left with the lock
  * held on every path.
  *
  * \param dev      DRM device the map belongs to.
  * \param offset   Page-aligned start.  Replaced in the stored map for SHM,
  *                 AGP, scatter/gather and consistent maps (see the switch).
  * \param size     Page-aligned length in bytes.
  * \param type     Kind of mapping to create.
  * \param flags    Map flags; _DRM_REMOVABLE is only accepted for _DRM_SHM.
  * \param map_ptr  Receives the new or matching map on success.
  * \return 0 on success, otherwise EINVAL, ENOMEM or EBUSY.
  */
 int drm_addmap(struct drm_device * dev, unsigned long offset,
 	       unsigned long size,
     enum drm_map_type type, enum drm_map_flags flags, drm_local_map_t **map_ptr)
 {
 	drm_local_map_t *map;
 	int align;
 	int error;
 	int unit;
 
 	/* Only allow shared memory to be removable since we only keep enough
 	 * book keeping information about shared memory to allow for removal
 	 * when processes fork.
 	 */
 	if ((flags & _DRM_REMOVABLE) && type != _DRM_SHM) {
 		DRM_ERROR("Requested removable map for non-DRM_SHM\n");
 		return EINVAL;
 	}
 	if ((offset & PAGE_MASK) || (size & PAGE_MASK)) {
 		DRM_ERROR("offset/size not page aligned: 0x%lx/0x%lx\n",
 		    offset, size);
 		return EINVAL;
 	}
 	if (offset + size < offset) {
 		DRM_ERROR("offset and size wrap around: 0x%lx/0x%lx\n",
 		    offset, size);
 		return EINVAL;
 	}
 
 	DRM_DEBUG("offset = 0x%08lx, size = 0x%08lx, type = %d\n", offset,
 	    size, type);
 
 	/* Check if this is just another version of a kernel-allocated map, and
 	 * just hand that back if so.
 	 */
 	if (type == _DRM_REGISTERS || type == _DRM_FRAME_BUFFER ||
 	    type == _DRM_SHM) {
 		TAILQ_FOREACH(map, &dev->maplist, link) {
 			if (map->type == type && (map->offset == offset ||
 			    (map->type == _DRM_SHM &&
 			    map->flags == _DRM_CONTAINS_LOCK))) {
 				map->size = size;
 				DRM_DEBUG("Found kernel map %d\n", type);
 				goto done;
 			}
 		}
 	}
 	DRM_UNLOCK();
 
 	/* Allocate a new map structure, fill it in, and do any type-specific
 	 * initialization necessary.
 	 */
 	map = malloc(sizeof(*map), DRM_MEM_MAPS, M_ZERO | M_NOWAIT);
 	if (!map) {
 		DRM_LOCK();
 		return ENOMEM;
 	}
 
 	/* alloc_unr() reports exhaustion with -1; do not turn that into a
 	 * handle.
 	 */
 	unit = alloc_unr(dev->map_unrhdr);
 	if (unit == -1) {
 		free(map, DRM_MEM_MAPS);
 		DRM_LOCK();
 		return ENOMEM;
 	}
 
 	map->offset = offset;
 	map->size = size;
 	map->type = type;
 	map->flags = flags;
 	map->handle = (void *)((unsigned long)unit << DRM_MAP_HANDLE_SHIFT);
 
 	switch (map->type) {
 	case _DRM_REGISTERS:
 		map->virtual = drm_ioremap(dev, map);
 		if (!(map->flags & _DRM_WRITE_COMBINING))
 			break;
 		/* FALLTHROUGH */
 	case _DRM_FRAME_BUFFER:
 		if (drm_mtrr_add(map->offset, map->size, DRM_MTRR_WC) == 0)
 			map->mtrr = 1;
 		break;
 	case _DRM_SHM:
 		map->virtual = malloc(map->size, DRM_MEM_MAPS, M_NOWAIT);
 		DRM_DEBUG("%lu %d %p\n",
 		    map->size, drm_order(map->size), map->virtual);
 		if (!map->virtual) {
 			error = ENOMEM;
 			goto fail;
 		}
 		map->offset = (unsigned long)map->virtual;
 		if (map->flags & _DRM_CONTAINS_LOCK) {
 			/* Prevent a 2nd X Server from creating a 2nd lock */
 			DRM_LOCK();
 			if (dev->lock.hw_lock != NULL) {
 				DRM_UNLOCK();
 				free(map->virtual, DRM_MEM_MAPS);
 				/* The common exit re-takes the lock, so this
 				 * path no longer returns with it dropped.
 				 */
 				error = EBUSY;
 				goto fail;
 			}
 			dev->lock.hw_lock = map->virtual; /* Pointer to lock */
 			DRM_UNLOCK();
 		}
 		break;
 	case _DRM_AGP:
 		/* Without AGP state there is no aperture to map into. */
 		if (dev->agp == NULL) {
 			error = EINVAL;
 			goto fail;
 		}
 		/* In some cases (i810 driver), user space may have already
 		 * added the AGP base itself, because dev->agp->base previously
 		 * only got set during AGP enable.  So, only add the base
 		 * address if the map's offset isn't already within the
 		 * aperture.
 		 */
 		if (map->offset < dev->agp->base ||
 		    map->offset > dev->agp->base +
 		    dev->agp->info.ai_aperture_size - 1) {
 			map->offset += dev->agp->base;
 		}
 		map->mtrr   = dev->agp->mtrr; /* for getmap */
 		/* NOTE: the range is not validated against the entries in
 		 * dev->agp->memory; that check is disabled in this file.
 		 */
 		break;
 	case _DRM_SCATTER_GATHER:
 		if (!dev->sg) {
 			error = EINVAL;
 			goto fail;
 		}
 		map->virtual = (void *)(dev->sg->vaddr + offset);
 		map->offset = dev->sg->vaddr + offset;
 		break;
 	case _DRM_CONSISTENT:
 		/* Unfortunately, we don't get any alignment specification from
 		 * the caller, so we have to guess.  drm_pci_alloc requires
 		 * a power-of-two alignment, so try to align the bus address of
 		 * the map to its size if possible, otherwise just assume
 		 * PAGE_SIZE alignment.
 		 */
 		align = map->size;
 		if ((align & (align - 1)) != 0)
 			align = PAGE_SIZE;
 		map->dmah = drm_pci_alloc(dev, map->size, align, 0xfffffffful);
 		if (map->dmah == NULL) {
 			error = ENOMEM;
 			goto fail;
 		}
 		map->virtual = map->dmah->vaddr;
 		map->offset = map->dmah->busaddr;
 		break;
 	default:
 		DRM_ERROR("Bad map type %d\n", map->type);
 		error = EINVAL;
 		goto fail;
 	}
 
 	DRM_LOCK();
 	TAILQ_INSERT_TAIL(&dev->maplist, map, link);
 
 done:
 	/* Jumped to, with lock held, when a kernel map is found. */
 
 	DRM_DEBUG("Added map %d 0x%lx/0x%lx\n", map->type, map->offset,
 	    map->size);
 
 	*map_ptr = map;
 
 	return 0;
 
 fail:
 	/* Reached with the lock dropped.  Give back the handle unit and the
 	 * map itself, then restore the locking state the caller expects.
 	 */
 	free_unr(dev->map_unrhdr, unit);
 	free(map, DRM_MEM_MAPS);
 	DRM_LOCK();
 
 	return error;
 }
 
 /**
  * Ioctl wrapper around drm_addmap().
  *
  * Rejects the request with EACCES unless dev->flags has FREAD or FWRITE set,
  * and unless the caller passes DRM_SUSER() or asks for an _DRM_AGP map.  On
  * success the fields of the resulting map are written back into the request.
  *
  * \param dev        DRM device.
  * \param data       Request, interpreted as a struct drm_map.
  * \param file_priv  Unused here.
  * \return 0 on success, EACCES, or the error from drm_addmap().
  */
 int drm_addmap_ioctl(struct drm_device *dev, void *data,
 		     struct drm_file *file_priv)
 {
 	struct drm_map *request = data;
 	drm_local_map_t *map;
 	int err;
 
 	if (!(dev->flags & (FREAD|FWRITE)))
 		return EACCES; /* Require read/write */
 
 	if (!DRM_SUSER(DRM_CURPROC) && request->type != _DRM_AGP)
 		return EACCES;
 
 	DRM_LOCK();
 	err = drm_addmap(dev, request->offset, request->size, request->type,
 	    request->flags, &map);
 	if (err == 0) {
 		/* Read the map while the lock taken above is still held: once
 		 * it is dropped, drm_rmmap() may unlink and free the map.
 		 */
 		request->offset = map->offset;
 		request->size = map->size;
 		request->type = map->type;
 		request->flags = map->flags;
 		request->mtrr   = map->mtrr;
 		request->handle = (void *)map->handle;
 	}
 	DRM_UNLOCK();
 
 	return err;
 }
 
 /**
  * Unlink a map from dev->maplist and release what it owns.
  *
  * Asserts that dev->dev_lock is held.  The lock is dropped around free_unr()
  * and taken again before the map structure itself is freed.  A NULL map is
  * ignored.
  */
 void drm_rmmap(struct drm_device *dev, drm_local_map_t *map)
 {
 	DRM_SPINLOCK_ASSERT(&dev->dev_lock);
 
 	if (map == NULL)
 		return;
 
 	TAILQ_REMOVE(&dev->maplist, map, link);
 
 	/* Type-specific teardown. */
 	switch (map->type) {
 	case _DRM_AGP:
 	case _DRM_SCATTER_GATHER:
 		/* Nothing owned by the map itself. */
 		break;
 	case _DRM_SHM:
 		free(map->virtual, DRM_MEM_MAPS);
 		break;
 	case _DRM_CONSISTENT:
 		drm_pci_free(dev, map->dmah);
 		break;
 	case _DRM_REGISTERS:
 		if (map->bsr == NULL)
 			drm_ioremapfree(map);
 		/* FALLTHROUGH */
 	case _DRM_FRAME_BUFFER:
 		if (map->mtrr) {
 			int __unused mtrr_status;
 
 			mtrr_status = drm_mtrr_del(0, map->offset, map->size,
 			    DRM_MTRR_WC);
 			DRM_DEBUG("mtrr_del = %d\n", mtrr_status);
 		}
 		break;
 	default:
 		DRM_ERROR("Bad map type %d\n", map->type);
 		break;
 	}
 
 	if (map->bsr != NULL)
 		bus_release_resource(dev->device, SYS_RES_MEMORY, map->rid,
 		    map->bsr);
 
 	/* The handle's unit number is returned with the lock dropped. */
 	DRM_UNLOCK();
 	if (map->handle) {
 		free_unr(dev->map_unrhdr, (unsigned long)map->handle >>
 		    DRM_MAP_HANDLE_SHIFT);
 	}
 	DRM_LOCK();
 
 	free(map, DRM_MEM_MAPS);
 }
 
 /* Ioctl entry point for removing a map.  Looks up the map whose handle
  * matches request->handle and which has _DRM_REMOVABLE set, and passes it to
  * drm_rmmap().  Returns EINVAL when no such map is on dev->maplist.
  */
 
 int drm_rmmap_ioctl(struct drm_device *dev, void *data,
 		    struct drm_file *file_priv)
 {
 	struct drm_map *request = data;
 	drm_local_map_t *map;
 	int error;
 
 	DRM_LOCK();
 
 	/* TAILQ_FOREACH leaves map NULL when the list is exhausted. */
 	TAILQ_FOREACH(map, &dev->maplist, link) {
 		if (map->handle != request->handle)
 			continue;
 		if (map->flags & _DRM_REMOVABLE)
 			break;
 	}
 
 	if (map != NULL) {
 		drm_rmmap(dev, map);
 		error = 0;
 	} else {
 		/* No match found. */
 		error = EINVAL;
 	}
 
 	DRM_UNLOCK();
 
 	return error;
 }
 
 
 /* Release everything recorded in a buffer entry after a failed addbufs.
  *
  * Frees seg_count DMA segments plus the segment list, and buf_count
  * per-buffer private areas plus the buffer list, then resets both counts to
  * zero.  Callers in this file set the counts to the full requested count
  * before calling, so slots that were never filled are passed down as the
  * zeroed values left by M_ZERO.
  */
 static void drm_cleanup_buf_error(struct drm_device *dev,
 				  drm_buf_entry_t *entry)
 {
 	int i;
 
 	if (entry->seg_count) {
 		for (i = 0; i < entry->seg_count; i++) {
 			drm_pci_free(dev, entry->seglist[i]);
 		}
 		free(entry->seglist, DRM_MEM_SEGS);
 		/* The entry outlives this call; do not leave it pointing at
 		 * freed memory.
 		 */
 		entry->seglist = NULL;
 
 		entry->seg_count = 0;
 	}
 
 	if (entry->buf_count) {
 		for (i = 0; i < entry->buf_count; i++) {
 			free(entry->buflist[i].dev_private, DRM_MEM_BUFS);
 		}
 		free(entry->buflist, DRM_MEM_BUFS);
 		entry->buflist = NULL;
 
 		entry->buf_count = 0;
 	}
 }
 
 static int drm_do_addbufs_agp(struct drm_device *dev, struct drm_buf_desc *request)
 {
 	drm_device_dma_t *dma = dev->dma;
 	drm_buf_entry_t *entry;
 	/*drm_agp_mem_t *agp_entry;
 	int valid*/
 	drm_buf_t *buf;
 	unsigned long offset;
 	unsigned long agp_offset;
 	int count;
 	int order;
 	int size;
 	int alignment;
 	int page_order;
 	int total;
 	int byte_count;
 	int i;
 	drm_buf_t **temp_buflist;
 
 	count = request->count;
 	order = drm_order(request->size);
 	size = 1 << order;
 
 	alignment  = (request->flags & _DRM_PAGE_ALIGN)
 	    ? round_page(size) : size;
 	page_order = order - PAGE_SHIFT > 0 ? order - PAGE_SHIFT : 0;
 	total = PAGE_SIZE << page_order;
 
 	byte_count = 0;
 	agp_offset = dev->agp->base + request->agp_start;
 
 	DRM_DEBUG("count:      %d\n",  count);
 	DRM_DEBUG("order:      %d\n",  order);
 	DRM_DEBUG("size:       %d\n",  size);
 	DRM_DEBUG("agp_offset: 0x%lx\n", agp_offset);
 	DRM_DEBUG("alignment:  %d\n",  alignment);
 	DRM_DEBUG("page_order: %d\n",  page_order);
 	DRM_DEBUG("total:      %d\n",  total);
 
 	/* Make sure buffers are located in AGP memory that we own */
 	/* Breaks MGA due to drm_alloc_agp not setting up entries for the
 	 * memory.  Safe to ignore for now because these ioctls are still
 	 * root-only.
 	 */
 	/*valid = 0;
 	for (agp_entry = dev->agp->memory; agp_entry;
 	    agp_entry = agp_entry->next) {
 		if ((agp_offset >= agp_entry->bound) &&
 		    (agp_offset + total * count <=
 		    agp_entry->bound + agp_entry->pages * PAGE_SIZE)) {
 			valid = 1;
 			break;
 		}
 	}
 	if (!valid) {
 		DRM_DEBUG("zone invalid\n");
 		return EINVAL;
 	}*/
 
 	entry = &dma->bufs[order];
 
 	entry->buflist = malloc(count * sizeof(*entry->buflist), DRM_MEM_BUFS,
 	    M_NOWAIT | M_ZERO);
 	if (!entry->buflist) {
 		return ENOMEM;
 	}
 
 	entry->buf_size = size;
 	entry->page_order = page_order;
 
 	offset = 0;
 
 	while (entry->buf_count < count) {
 		buf          = &entry->buflist[entry->buf_count];
 		buf->idx     = dma->buf_count + entry->buf_count;
 		buf->total   = alignment;
 		buf->order   = order;
 		buf->used    = 0;
 
 		buf->offset  = (dma->byte_count + offset);
 		buf->bus_address = agp_offset + offset;
 		buf->address = (void *)(agp_offset + offset);
 		buf->next    = NULL;
 		buf->pending = 0;
 		buf->file_priv = NULL;
 
 		buf->dev_priv_size = dev->driver->buf_priv_size;
 		buf->dev_private = malloc(buf->dev_priv_size, DRM_MEM_BUFS,
 		    M_NOWAIT | M_ZERO);
 		if (buf->dev_private == NULL) {
 			/* Set count correctly so we free the proper amount. */
 			entry->buf_count = count;
 			drm_cleanup_buf_error(dev, entry);
 			return ENOMEM;
 		}
 
 		offset += alignment;
 		entry->buf_count++;
 		byte_count += PAGE_SIZE << page_order;
 	}
 
 	DRM_DEBUG("byte_count: %d\n", byte_count);
 
 	temp_buflist = realloc(dma->buflist,
 	    (dma->buf_count + entry->buf_count) * sizeof(*dma->buflist),
 	    DRM_MEM_BUFS, M_NOWAIT);
 	if (temp_buflist == NULL) {
 		/* Free the entry because it isn't valid */
 		drm_cleanup_buf_error(dev, entry);
 		return ENOMEM;
 	}
 	dma->buflist = temp_buflist;
 
 	for (i = 0; i < entry->buf_count; i++) {
 		dma->buflist[i + dma->buf_count] = &entry->buflist[i];
 	}
 
 	dma->buf_count += entry->buf_count;
 	dma->byte_count += byte_count;
 
 	DRM_DEBUG("dma->buf_count : %d\n", dma->buf_count);
 	DRM_DEBUG("entry->buf_count : %d\n", entry->buf_count);
 
 	request->count = entry->buf_count;
 	request->size = size;
 
 	dma->flags = _DRM_DMA_USE_AGP;
 
 	return 0;
 }
 
 /* Allocate request->count DMA buffers of 2^order bytes with drm_pci_alloc()
  * and record them in dma->bufs[order].
  *
  * Each drm_pci_alloc() call yields one segment, which is carved into as many
  * buffers as fit in `total` bytes.  The segment's pages are appended to a
  * private copy of the page list that only replaces dma->pagelist once every
  * allocation has succeeded.
  *
  * The only caller in this file, drm_addbufs_pci(), holds dev->dma_lock; the
  * lock is released around each drm_pci_alloc() call and re-acquired
  * afterwards.
  *
  * On success request->count and request->size are updated with the values
  * actually used.  Returns 0 on success, ENOMEM if any allocation fails.
  */
 static int drm_do_addbufs_pci(struct drm_device *dev, struct drm_buf_desc *request)
 {
 	drm_device_dma_t *dma = dev->dma;
 	int count;
 	int order;
 	int size;
 	int total;
 	int page_order;
 	drm_buf_entry_t *entry;
 	drm_buf_t *buf;
 	int alignment;
 	unsigned long offset;
 	int i;
 	int byte_count;
 	int page_count;
 	unsigned long *temp_pagelist;
 	drm_buf_t **temp_buflist;
 
 	count = request->count;
 	order = drm_order(request->size);
 	size = 1 << order;
 
 	DRM_DEBUG("count=%d, size=%d (%d), order=%d\n",
 	    request->count, request->size, size, order);
 
 	alignment = (request->flags & _DRM_PAGE_ALIGN)
 	    ? round_page(size) : size;
 	/* Number of pages per segment, as a power of two (0 for sub-page
 	 * buffers), and the resulting segment size in bytes.
 	 */
 	page_order = order - PAGE_SHIFT > 0 ? order - PAGE_SHIFT : 0;
 	total = PAGE_SIZE << page_order;
 
 	entry = &dma->bufs[order];
 
 	entry->buflist = malloc(count * sizeof(*entry->buflist), DRM_MEM_BUFS,
 	    M_NOWAIT | M_ZERO);
 	entry->seglist = malloc(count * sizeof(*entry->seglist), DRM_MEM_SEGS,
 	    M_NOWAIT | M_ZERO);
 
 	/* Keep the original pagelist until we know all the allocations
 	 * have succeeded
 	 */
 	temp_pagelist = malloc((dma->page_count + (count << page_order)) *
 	    sizeof(*dma->pagelist), DRM_MEM_PAGES, M_NOWAIT);
 
 	/* All three allocations are checked together; whichever succeeded is
 	 * freed again before bailing out.
 	 */
 	if (entry->buflist == NULL || entry->seglist == NULL || 
 	    temp_pagelist == NULL) {
 		free(temp_pagelist, DRM_MEM_PAGES);
 		free(entry->seglist, DRM_MEM_SEGS);
 		free(entry->buflist, DRM_MEM_BUFS);
 		return ENOMEM;
 	}
 
 	/* Seed the new page list with the pages already known. */
 	memcpy(temp_pagelist, dma->pagelist, dma->page_count * 
 	    sizeof(*dma->pagelist));
 
 	DRM_DEBUG("pagelist: %d entries\n",
 	    dma->page_count + (count << page_order));
 
 	entry->buf_size	= size;
 	entry->page_order = page_order;
 	byte_count = 0;
 	page_count = 0;
 
 	while (entry->buf_count < count) {
 		/* The allocation is made with dev->dma_lock released. */
 		DRM_SPINUNLOCK(&dev->dma_lock);
 		drm_dma_handle_t *dmah = drm_pci_alloc(dev, size, alignment,
 		    0xfffffffful);
 		DRM_SPINLOCK(&dev->dma_lock);
 		if (dmah == NULL) {
 			/* Set count correctly so we free the proper amount. */
 			entry->buf_count = count;
 			entry->seg_count = count;
 			drm_cleanup_buf_error(dev, entry);
 			free(temp_pagelist, DRM_MEM_PAGES);
 			return ENOMEM;
 		}
 
 		entry->seglist[entry->seg_count++] = dmah;
 		/* Record the kernel virtual address of every page in the
 		 * new segment.
 		 */
 		for (i = 0; i < (1 << page_order); i++) {
 			DRM_DEBUG("page %d @ %p\n",
 			    dma->page_count + page_count,
 			    (char *)dmah->vaddr + PAGE_SIZE * i);
 			temp_pagelist[dma->page_count + page_count++] = 
 			    (long)dmah->vaddr + PAGE_SIZE * i;
 		}
 		/* Carve buffers out of the segment until it is used up or
 		 * the requested count is reached.
 		 */
 		for (offset = 0;
 		    offset + size <= total && entry->buf_count < count;
 		    offset += alignment, ++entry->buf_count) {
 			buf	     = &entry->buflist[entry->buf_count];
 			buf->idx     = dma->buf_count + entry->buf_count;
 			buf->total   = alignment;
 			buf->order   = order;
 			buf->used    = 0;
 			buf->offset  = (dma->byte_count + byte_count + offset);
 			buf->address = ((char *)dmah->vaddr + offset);
 			buf->bus_address = dmah->busaddr + offset;
 			buf->next    = NULL;
 			buf->pending = 0;
 			buf->file_priv = NULL;
 
 			buf->dev_priv_size = dev->driver->buf_priv_size;
 			buf->dev_private = malloc(buf->dev_priv_size,
 			    DRM_MEM_BUFS, M_NOWAIT | M_ZERO);
 			if (buf->dev_private == NULL) {
 				/* Set count correctly so we free the proper amount. */
 				entry->buf_count = count;
 				entry->seg_count = count;
 				drm_cleanup_buf_error(dev, entry);
 				free(temp_pagelist, DRM_MEM_PAGES);
 				return ENOMEM;
 			}
 
 			DRM_DEBUG("buffer %d @ %p\n",
 			    entry->buf_count, buf->address);
 		}
 		byte_count += PAGE_SIZE << page_order;
 	}
 
 	/* Grow the device-wide buffer list through a temporary so the old
 	 * list survives a failed realloc.
 	 */
 	temp_buflist = realloc(dma->buflist,
 	    (dma->buf_count + entry->buf_count) * sizeof(*dma->buflist),
 	    DRM_MEM_BUFS, M_NOWAIT);
 	if (temp_buflist == NULL) {
 		/* Free the entry because it isn't valid */
 		drm_cleanup_buf_error(dev, entry);
 		free(temp_pagelist, DRM_MEM_PAGES);
 		return ENOMEM;
 	}
 	dma->buflist = temp_buflist;
 
 	for (i = 0; i < entry->buf_count; i++) {
 		dma->buflist[i + dma->buf_count] = &entry->buflist[i];
 	}
 
 	/* No allocations failed, so now we can replace the original pagelist
 	 * with the new one.
 	 */
 	free(dma->pagelist, DRM_MEM_PAGES);
 	dma->pagelist = temp_pagelist;
 
 	dma->buf_count += entry->buf_count;
 	dma->seg_count += entry->seg_count;
 	dma->page_count += entry->seg_count << page_order;
 	dma->byte_count += PAGE_SIZE * (entry->seg_count << page_order);
 
 	/* Report back what was actually set up. */
 	request->count = entry->buf_count;
 	request->size = size;
 
 	return 0;
 
 }
 
 static int drm_do_addbufs_sg(struct drm_device *dev, struct drm_buf_desc *request)
 {
 	drm_device_dma_t *dma = dev->dma;
 	drm_buf_entry_t *entry;
 	drm_buf_t *buf;
 	unsigned long offset;
 	unsigned long agp_offset;
 	int count;
 	int order;
 	int size;
 	int alignment;
 	int page_order;
 	int total;
 	int byte_count;
 	int i;
 	drm_buf_t **temp_buflist;
 
 	count = request->count;
 	order = drm_order(request->size);
 	size = 1 << order;
 
 	alignment  = (request->flags & _DRM_PAGE_ALIGN)
 	    ? round_page(size) : size;
 	page_order = order - PAGE_SHIFT > 0 ? order - PAGE_SHIFT : 0;
 	total = PAGE_SIZE << page_order;
 
 	byte_count = 0;
 	agp_offset = request->agp_start;
 
 	DRM_DEBUG("count:      %d\n",  count);
 	DRM_DEBUG("order:      %d\n",  order);
 	DRM_DEBUG("size:       %d\n",  size);
 	DRM_DEBUG("agp_offset: %ld\n", agp_offset);
 	DRM_DEBUG("alignment:  %d\n",  alignment);
 	DRM_DEBUG("page_order: %d\n",  page_order);
 	DRM_DEBUG("total:      %d\n",  total);
 
 	entry = &dma->bufs[order];
 
 	entry->buflist = malloc(count * sizeof(*entry->buflist), DRM_MEM_BUFS,
 	    M_NOWAIT | M_ZERO);
 	if (entry->buflist == NULL)
 		return ENOMEM;
 
 	entry->buf_size = size;
 	entry->page_order = page_order;
 
 	offset = 0;
 
 	while (entry->buf_count < count) {
 		buf          = &entry->buflist[entry->buf_count];
 		buf->idx     = dma->buf_count + entry->buf_count;
 		buf->total   = alignment;
 		buf->order   = order;
 		buf->used    = 0;
 
 		buf->offset  = (dma->byte_count + offset);
 		buf->bus_address = agp_offset + offset;
 		buf->address = (void *)(agp_offset + offset + dev->sg->vaddr);
 		buf->next    = NULL;
 		buf->pending = 0;
 		buf->file_priv = NULL;
 
 		buf->dev_priv_size = dev->driver->buf_priv_size;
 		buf->dev_private = malloc(buf->dev_priv_size, DRM_MEM_BUFS,
 		    M_NOWAIT | M_ZERO);
 		if (buf->dev_private == NULL) {
 			/* Set count correctly so we free the proper amount. */
 			entry->buf_count = count;
 			drm_cleanup_buf_error(dev, entry);
 			return ENOMEM;
 		}
 
 		DRM_DEBUG("buffer %d @ %p\n",
 		    entry->buf_count, buf->address);
 
 		offset += alignment;
 		entry->buf_count++;
 		byte_count += PAGE_SIZE << page_order;
 	}
 
 	DRM_DEBUG("byte_count: %d\n", byte_count);
 
 	temp_buflist = realloc(dma->buflist,
 	    (dma->buf_count + entry->buf_count) * sizeof(*dma->buflist),
 	    DRM_MEM_BUFS, M_NOWAIT);
 	if (temp_buflist == NULL) {
 		/* Free the entry because it isn't valid */
 		drm_cleanup_buf_error(dev, entry);
 		return ENOMEM;
 	}
 	dma->buflist = temp_buflist;
 
 	for (i = 0; i < entry->buf_count; i++) {
 		dma->buflist[i + dma->buf_count] = &entry->buflist[i];
 	}
 
 	dma->buf_count += entry->buf_count;
 	dma->byte_count += byte_count;
 
 	DRM_DEBUG("dma->buf_count : %d\n", dma->buf_count);
 	DRM_DEBUG("entry->buf_count : %d\n", entry->buf_count);
 
 	request->count = entry->buf_count;
 	request->size = size;
 
 	dma->flags = _DRM_DMA_USE_SG;
 
 	return 0;
 }
 
 /* Validate an AGP buffer request and, under dev->dma_lock, hand it to
  * drm_do_addbufs_agp().
  *
  * Returns EINVAL for a count outside 0..4096 or an order outside
  * DRM_MIN_ORDER..DRM_MAX_ORDER, EBUSY once dev->buf_use is non-zero, ENOMEM
  * if buffers of this order already exist, otherwise the worker's result.
  */
 int drm_addbufs_agp(struct drm_device *dev, struct drm_buf_desc *request)
 {
 	int order, ret;
 
 	if (request->count < 0 || request->count > 4096)
 		return EINVAL;
 
 	order = drm_order(request->size);
 	if (order < DRM_MIN_ORDER || order > DRM_MAX_ORDER)
 		return EINVAL;
 
 	DRM_SPINLOCK(&dev->dma_lock);
 
 	if (dev->buf_use != 0) {
 		/* No more allocations after first buffer-using ioctl. */
 		ret = EBUSY;
 	} else if (dev->dma->bufs[order].buf_count != 0) {
 		/* No more than one allocation per order */
 		ret = ENOMEM;
 	} else {
 		ret = drm_do_addbufs_agp(dev, request);
 	}
 
 	DRM_SPINUNLOCK(&dev->dma_lock);
 
 	return ret;
 }
 
 /* Validate a scatter/gather buffer request and, under dev->dma_lock, hand it
  * to drm_do_addbufs_sg().
  *
  * Returns EACCES unless DRM_SUSER() passes, EINVAL for a count outside
  * 0..4096 or an order outside DRM_MIN_ORDER..DRM_MAX_ORDER, EBUSY once
  * dev->buf_use is non-zero, ENOMEM if buffers of this order already exist,
  * otherwise the worker's result.
  */
 int drm_addbufs_sg(struct drm_device *dev, struct drm_buf_desc *request)
 {
 	int order, ret;
 
 	if (!DRM_SUSER(DRM_CURPROC))
 		return EACCES;
 
 	if (request->count < 0 || request->count > 4096)
 		return EINVAL;
 
 	order = drm_order(request->size);
 	if (order < DRM_MIN_ORDER || order > DRM_MAX_ORDER)
 		return EINVAL;
 
 	DRM_SPINLOCK(&dev->dma_lock);
 
 	if (dev->buf_use != 0) {
 		/* No more allocations after first buffer-using ioctl. */
 		ret = EBUSY;
 	} else if (dev->dma->bufs[order].buf_count != 0) {
 		/* No more than one allocation per order */
 		ret = ENOMEM;
 	} else {
 		ret = drm_do_addbufs_sg(dev, request);
 	}
 
 	DRM_SPINUNLOCK(&dev->dma_lock);
 
 	return ret;
 }
 
 /* Validate a PCI buffer request and, under dev->dma_lock, hand it to
  * drm_do_addbufs_pci().
  *
  * Returns EACCES unless DRM_SUSER() passes, EINVAL for a count outside
  * 0..4096 or an order outside DRM_MIN_ORDER..DRM_MAX_ORDER, EBUSY once
  * dev->buf_use is non-zero, ENOMEM if buffers of this order already exist,
  * otherwise the worker's result.
  */
 int drm_addbufs_pci(struct drm_device *dev, struct drm_buf_desc *request)
 {
 	int order, ret;
 
 	if (!DRM_SUSER(DRM_CURPROC))
 		return EACCES;
 
 	if (request->count < 0 || request->count > 4096)
 		return EINVAL;
 
 	order = drm_order(request->size);
 	if (order < DRM_MIN_ORDER || order > DRM_MAX_ORDER)
 		return EINVAL;
 
 	DRM_SPINLOCK(&dev->dma_lock);
 
 	if (dev->buf_use != 0) {
 		/* No more allocations after first buffer-using ioctl. */
 		ret = EBUSY;
 	} else if (dev->dma->bufs[order].buf_count != 0) {
 		/* No more than one allocation per order */
 		ret = ENOMEM;
 	} else {
 		ret = drm_do_addbufs_pci(dev, request);
 	}
 
 	DRM_SPINUNLOCK(&dev->dma_lock);
 
 	return ret;
 }
 
 /* Ioctl entry point: route the request to the AGP, scatter/gather or PCI
  * variant according to request->flags.  _DRM_AGP_BUFFER is tested first,
  * then _DRM_SG_BUFFER; PCI is the fallback.
  */
 int drm_addbufs(struct drm_device *dev, void *data, struct drm_file *file_priv)
 {
 	struct drm_buf_desc *request = data;
 
 	if (request->flags & _DRM_AGP_BUFFER)
 		return drm_addbufs_agp(dev, request);
 
 	if (request->flags & _DRM_SG_BUFFER)
 		return drm_addbufs_sg(dev, request);
 
 	return drm_addbufs_pci(dev, request);
 }
 
 /* Ioctl entry point: report one struct drm_buf_desc per buffer order that
  * has buffers.
  *
  * Increments dev->buf_use under dev->dma_lock, which stops further addbufs
  * calls.  If request->count is at least the number of populated orders, a
  * descriptor for each is copied to request->list with DRM_COPY_TO_USER.
  * request->count is always overwritten: with the number of populated orders
  * when nothing was copied, otherwise with the number of descriptors copied.
  *
  * Returns 0, or EFAULT if a copy-out fails.
  */
 int drm_infobufs(struct drm_device *dev, void *data, struct drm_file *file_priv)
 {
 	drm_device_dma_t *dma = dev->dma;
 	struct drm_buf_info *request = data;
 	int i;
 	int count;
 	int retcode = 0;
 
 	DRM_SPINLOCK(&dev->dma_lock);
 	++dev->buf_use;		/* Can't allocate more after this call */
 	DRM_SPINUNLOCK(&dev->dma_lock);
 
 	for (i = 0, count = 0; i < DRM_MAX_ORDER + 1; i++) {
 		if (dma->bufs[i].buf_count)
 			++count;
 	}
 
 	DRM_DEBUG("count = %d\n", count);
 
 	if (request->count >= count) {
 		for (i = 0, count = 0; i < DRM_MAX_ORDER + 1; i++) {
 			if (dma->bufs[i].buf_count) {
 				struct drm_buf_desc from;
 
 				/* Only four members are filled in below, but
 				 * the whole structure is copied out.  Clear
 				 * it first so the remaining members and any
 				 * padding do not carry stale kernel stack
 				 * contents to the caller.
 				 */
 				memset(&from, 0, sizeof(from));
 				from.count = dma->bufs[i].buf_count;
 				from.size = dma->bufs[i].buf_size;
 				from.low_mark = dma->bufs[i].freelist.low_mark;
 				from.high_mark = dma->bufs[i].freelist.high_mark;
 
 				if (DRM_COPY_TO_USER(&request->list[count], &from,
 				    sizeof(struct drm_buf_desc)) != 0) {
 					retcode = EFAULT;
 					break;
 				}
 
 				DRM_DEBUG("%d %d %d %d %d\n",
 				    i, dma->bufs[i].buf_count,
 				    dma->bufs[i].buf_size,
 				    dma->bufs[i].freelist.low_mark,
 				    dma->bufs[i].freelist.high_mark);
 				++count;
 			}
 		}
 	}
 	request->count = count;
 
 	return retcode;
 }
 
 int drm_markbufs(struct drm_device *dev, void *data, struct drm_file *file_priv)
 {
 	drm_device_dma_t *dma = dev->dma;
 	struct drm_buf_desc *request = data;
 	int order;
 
 	DRM_DEBUG("%d, %d, %d\n",
 		  request->size, request->low_mark, request->high_mark);
 	
 
 	order = drm_order(request->size);	
 	if (order < DRM_MIN_ORDER || order > DRM_MAX_ORDER ||
 	    request->low_mark < 0 || request->high_mark < 0) {
 		return EINVAL;
 	}
 
 	DRM_SPINLOCK(&dev->dma_lock);
 	if (request->low_mark > dma->bufs[order].buf_count ||
 	    request->high_mark > dma->bufs[order].buf_count) {
 		DRM_SPINUNLOCK(&dev->dma_lock);
 		return EINVAL;
 	}
 
 	dma->bufs[order].freelist.low_mark  = request->low_mark;
 	dma->bufs[order].freelist.high_mark = request->high_mark;
 	DRM_SPINUNLOCK(&dev->dma_lock);
 
 	return 0;
 }
 
 /* Ioctl entry point: free the buffers whose indices are listed in
  * request->list.
  *
  * Each index is fetched with DRM_COPY_FROM_USER, range-checked against
  * dma->buf_count, and the buffer is released with drm_free_buffer() only if
  * its file_priv matches the caller's.  Processing stops at the first
  * failure; buffers freed before that point stay freed.
  *
  * Returns 0, EFAULT if an index cannot be copied in, or EINVAL for a bad
  * index or a buffer owned by someone else.
  *
  * NOTE(review): the copy-in runs while dev->dma_lock is held -- confirm that
  * is acceptable for this lock type.
  */
 int drm_freebufs(struct drm_device *dev, void *data, struct drm_file *file_priv)
 {
 	drm_device_dma_t *dma = dev->dma;
 	struct drm_buf_free *request = data;
 	drm_buf_t *buf;
 	int retcode = 0;
 	int idx;
 	int i;
 
 	DRM_DEBUG("%d\n", request->count);
 
 	DRM_SPINLOCK(&dev->dma_lock);
 	/* The loop ends as soon as an error code has been recorded. */
 	for (i = 0; retcode == 0 && i < request->count; i++) {
 		if (DRM_COPY_FROM_USER(&idx, &request->list[i], sizeof(idx))) {
 			retcode = EFAULT;
 			continue;
 		}
 
 		if (idx < 0 || idx >= dma->buf_count) {
 			DRM_ERROR("Index %d (of %d max)\n",
 			    idx, dma->buf_count - 1);
 			retcode = EINVAL;
 			continue;
 		}
 
 		buf = dma->buflist[idx];
 		if (buf->file_priv != file_priv) {
 			DRM_ERROR("Process %d freeing buffer not owned\n",
 			    DRM_CURRENTPID);
 			retcode = EINVAL;
 			continue;
 		}
 
 		drm_free_buffer(dev, buf);
 	}
 	DRM_SPINUNLOCK(&dev->dma_lock);
 
 	return retcode;
 }
 
 /* Ioctl entry point: map the DMA buffers into the calling process and
  * describe each buffer to the caller.
  *
  * Increments dev->buf_use under dev->dma_lock, which stops further addbufs
  * calls.  If request->count is smaller than dma->buf_count nothing is
  * mapped.  Otherwise the device is mapped with vm_mmap(), the resulting
  * address is stored in request->virtual, and idx, total, used (always 0)
  * and address are copied out to request->list[] for every buffer.
  *
  * request->count is set to dma->buf_count on every path, including errors.
  *
  * Returns 0, EINVAL when the AGP/SG path has no dev->agp_buffer_map, the
  * vm_mmap() error, or EFAULT if a copy-out fails.
  */
 int drm_mapbufs(struct drm_device *dev, void *data, struct drm_file *file_priv)
 {
 	drm_device_dma_t *dma = dev->dma;
 	int retcode = 0;
 	const int zero = 0;
 	vm_offset_t address;
 	struct vmspace *vms;
 	vm_ooffset_t foff;
 	vm_size_t size;
 	vm_offset_t vaddr;
 	struct drm_buf_map *request = data;
 	int i;
 
 	vms = DRM_CURPROC->td_proc->p_vmspace;
 
 	DRM_SPINLOCK(&dev->dma_lock);
 	dev->buf_use++;		/* Can't allocate more after this call */
 	DRM_SPINUNLOCK(&dev->dma_lock);
 
 	/* The caller's list is too small: skip the mapping and just report
 	 * the required count at `done`.
 	 */
 	if (request->count < dma->buf_count)
 		goto done;
 
 	/* AGP and scatter/gather buffers are mapped through
 	 * dev->agp_buffer_map, using its handle as the mapping offset; any
 	 * other buffers are mapped from offset 0 for dma->byte_count bytes.
 	 */
 	if ((drm_core_has_AGP(dev) && (dma->flags & _DRM_DMA_USE_AGP)) ||
 	    (drm_core_check_feature(dev, DRIVER_SG) &&
 	    (dma->flags & _DRM_DMA_USE_SG))) {
 		drm_local_map_t *map = dev->agp_buffer_map;
 
 		if (map == NULL) {
 			retcode = EINVAL;
 			goto done;
 		}
 		size = round_page(map->size);
 		foff = (unsigned long)map->handle;
 	} else {
 		/* NOTE(review): the next statement ends in a comma rather
 		 * than a semicolon, so it forms a single comma expression
 		 * with the foff assignment; both assignments still happen.
 		 */
 		size = round_page(dma->byte_count),
 		foff = 0;
 	}
 
 	/* Starting address passed to vm_mmap(): the first page boundary at
 	 * or above vm_daddr + MAXDSIZ.
 	 */
 	vaddr = round_page((vm_offset_t)vms->vm_daddr + MAXDSIZ);
 #if __FreeBSD_version >= 600023
-	retcode = vm_mmap(&vms->vm_map, &vaddr, size, PROT_READ | PROT_WRITE,
-	    VM_PROT_ALL, MAP_SHARED | MAP_NOSYNC, OBJT_DEVICE,
+	retcode = vm_mmap(&vms->vm_map, &vaddr, size, VM_PROT_READ |
+	    VM_PROT_WRITE, VM_PROT_ALL, MAP_SHARED | MAP_NOSYNC, OBJT_DEVICE,
 	    dev->devnode, foff);
 #else
-	retcode = vm_mmap(&vms->vm_map, &vaddr, size, PROT_READ | PROT_WRITE,
-	    VM_PROT_ALL, MAP_SHARED | MAP_NOSYNC,
+	retcode = vm_mmap(&vms->vm_map, &vaddr, size, VM_PROT_READ |
+	    VM_PROT_WRITE, VM_PROT_ALL, MAP_SHARED | MAP_NOSYNC,
 	    SLIST_FIRST(&dev->devnode->si_hlist), foff);
 #endif
 	if (retcode)
 		goto done;
 
 	request->virtual = (void *)vaddr;
 
 	/* Copy out one field at a time; a failure part-way leaves the
 	 * earlier entries already written.
 	 */
 	for (i = 0; i < dma->buf_count; i++) {
 		if (DRM_COPY_TO_USER(&request->list[i].idx,
 		    &dma->buflist[i]->idx, sizeof(request->list[0].idx))) {
 			retcode = EFAULT;
 			goto done;
 		}
 		if (DRM_COPY_TO_USER(&request->list[i].total,
 		    &dma->buflist[i]->total, sizeof(request->list[0].total))) {
 			retcode = EFAULT;
 			goto done;
 		}
 		if (DRM_COPY_TO_USER(&request->list[i].used, &zero,
 		    sizeof(zero))) {
 			retcode = EFAULT;
 			goto done;
 		}
 		/* Address of the buffer inside the new mapping. */
 		address = vaddr + dma->buflist[i]->offset; /* *** */
 		if (DRM_COPY_TO_USER(&request->list[i].address, &address,
 		    sizeof(address))) {
 			retcode = EFAULT;
 			goto done;
 		}
 	}
 
  done:
 	request->count = dma->buf_count;
 
 	DRM_DEBUG("%d buffers, retcode = %d\n", request->count, retcode);
 
 	return retcode;
 }
 
 /*
  * Compute order: the smallest n such that (1ul << n) >= size, with a size of
  * 0 mapped to 0.  Can be made faster.
  */
 int drm_order(unsigned long size)
 {
 	int top_bit;
 
 	if (size == 0)
 		return 0;
 
 	/* flsl() numbers bits from 1, so this is the index of the highest
 	 * set bit.
 	 */
 	top_bit = flsl(size) - 1;
 
 	/* An exact power of two has only that bit set; anything else needs
 	 * the next order up.
 	 */
 	return (size == (1ul << top_bit)) ? top_bit : top_bit + 1;
 }
Index: head/sys/dev/drm2/drm_bufs.c
===================================================================
--- head/sys/dev/drm2/drm_bufs.c	(revision 283997)
+++ head/sys/dev/drm2/drm_bufs.c	(revision 283998)
@@ -1,1708 +1,1708 @@
 /**
  * \file drm_bufs.c
  * Generic buffer template
  *
  * \author Rickard E. (Rik) Faith <faith@valinux.com>
  * \author Gareth Hughes <gareth@valinux.com>
  */
 
 /*
  * Created: Thu Nov 23 03:10:50 2000 by gareth@valinux.com
  *
  * Copyright 1999, 2000 Precision Insight, Inc., Cedar Park, Texas.
  * Copyright 2000 VA Linux Systems, Inc., Sunnyvale, California.
  * All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
  * to deal in the Software without restriction, including without limitation
  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  * and/or sell copies of the Software, and to permit persons to whom the
  * Software is furnished to do so, subject to the following conditions:
  *
  * The above copyright notice and this permission notice (including the next
  * paragraph) shall be included in all copies or substantial portions of the
  * Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  * VA LINUX SYSTEMS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  * OTHER DEALINGS IN THE SOFTWARE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/shm.h>
 
 #include <dev/pci/pcireg.h>
 
 #include <dev/drm2/drmP.h>
 
 /*
  * Lazily allocate the PCI memory BAR numbered `resource` and cache it in
  * dev->pcir[] / dev->pcirid[] for the drm_get_resource_*() accessors.
  * The resource is requested RF_SHAREABLE but not RF_ACTIVE, so there's no
  * virtual address for accessing it.  Cleaned up at unload.
  *
  * Returns 0 if the resource is (or already was) allocated, 1 on failure.
  */
 static int drm_alloc_resource(struct drm_device *dev, int resource)
 {
 	struct resource *bar;
 	int bar_rid;
 
 	if (resource >= DRM_MAX_PCI_RESOURCE) {
 		DRM_ERROR("Resource %d too large\n", resource);
 		return 1;
 	}
 
 	/* An earlier call already set this slot up. */
 	if (dev->pcir[resource] != NULL)
 		return 0;
 
 	bar_rid = PCIR_BAR(resource);
 	bar = bus_alloc_resource_any(dev->dev, SYS_RES_MEMORY, &bar_rid,
 	    RF_SHAREABLE);
 	if (bar == NULL) {
 		DRM_ERROR("Couldn't find resource 0x%x\n", resource);
 		return 1;
 	}
 
 	/* Publish the resource only if the slot is still empty. */
 	if (dev->pcir[resource] == NULL) {
 		dev->pcirid[resource] = bar_rid;
 		dev->pcir[resource] = bar;
 	}
 
 	return 0;
 }
 
 /*
  * Return the start address of PCI memory resource `resource`, allocating
  * the resource on first use.  Returns 0 if the resource cannot be
  * allocated.
  *
  * dev->pcir_lock is taken for the duration of the lookup and is always
  * released before returning, including on the failure path.
  */
 unsigned long drm_get_resource_start(struct drm_device *dev,
 				     unsigned int resource)
 {
 	unsigned long start = 0;
 
 	mtx_lock(&dev->pcir_lock);
 
 	if (drm_alloc_resource(dev, resource) == 0)
 		start = rman_get_start(dev->pcir[resource]);
 
 	mtx_unlock(&dev->pcir_lock);
 
 	return (start);
 }
 
 /*
  * Return the size of PCI memory resource `resource`, allocating the
  * resource on first use.  Returns 0 if the resource cannot be allocated.
  *
  * dev->pcir_lock is taken for the duration of the lookup and is always
  * released before returning, including on the failure path.
  */
 unsigned long drm_get_resource_len(struct drm_device *dev,
 				   unsigned int resource)
 {
 	unsigned long len = 0;
 
 	mtx_lock(&dev->pcir_lock);
 
 	if (drm_alloc_resource(dev, resource) == 0)
 		len = rman_get_size(dev->pcir[resource]);
 
 	mtx_unlock(&dev->pcir_lock);
 
 	return (len);
 }
 
 /*
  * Search dev->maplist for an entry of the same type as `map` that belongs
  * to the current primary master and refers to the same mapping.
  *
  * Returns the matching list entry, or NULL if there is none.
  */
 static struct drm_map_list *drm_find_matching_map(struct drm_device *dev,
 						  struct drm_local_map *map)
 {
 	struct drm_map_list *entry;
 
 	list_for_each_entry(entry, &dev->maplist, head) {
 		if (entry->map == NULL)
 			continue;
 		if (entry->map->type != map->type)
 			continue;
 		if (entry->master != dev->primary->master)
 			continue;
 
 		if (map->type == _DRM_SHM) {
 			/* A lock-only SHM request matches any SHM entry. */
 			if (map->flags == _DRM_CONTAINS_LOCK)
 				return entry;
 		} else if (map->type == _DRM_REGISTERS ||
 		    map->type == _DRM_FRAME_BUFFER) {
 			/*
 			 * Because the kernel-userspace ABI is fixed at a
 			 * 32-bit offset while PCI resources may live above
 			 * that, only the lower 32 bits of the offset are
 			 * compared for these two map types.  It is assumed
 			 * that if a driver has more than one resource of
 			 * each type, the lower 32 bits are different.
 			 */
 			if ((entry->map->offset & 0xffffffff) ==
 			    (map->offset & 0xffffffff))
 				return entry;
 		}
 
 		/* Every type also matches on an exact offset. */
 		if (entry->map->offset == map->offset)
 			return entry;
 	}
 
 	return NULL;
 }
 
 /*
  * Insert `hash` into dev->map_hash so that its key can serve as the
  * handle for a map.
  *
  * Unless a hashed handle is required (requested via hashed_handle, or, on
  * 64-bit, because user_token has bits set above the low 32), the page
  * number of user_token is tried as the key first; any result other than
  * -EINVAL from that attempt is returned directly.  Otherwise the key is
  * chosen by drm_ht_just_insert_please().
  *
  * Returns the result of the hash-table insertion that was performed.
  */
 static int drm_map_handle(struct drm_device *dev, struct drm_hash_item *hash,
 			  unsigned long user_token, int hashed_handle, int shm)
 {
 	int need_hashed;
 	int shift = 0;
 	unsigned long add = DRM_MAP_HASH_OFFSET >> PAGE_SHIFT;
 
 #if (BITS_PER_LONG == 64)
 	need_hashed = ((user_token & 0xFFFFFFFF00000000UL) || hashed_handle);
 #elif (BITS_PER_LONG == 32)
 	need_hashed = hashed_handle;
 #else
 #error Unsupported long size. Neither 64 nor 32 bits.
 #endif
 
 	if (!need_hashed) {
 		int rc;
 
 		hash->key = user_token >> PAGE_SHIFT;
 		rc = drm_ht_insert_item(&dev->map_hash, hash);
 		if (rc != -EINVAL)
 			return rc;
 	}
 
 	if (shm && (SHMLBA > PAGE_SIZE)) {
 		/* For shared memory, we have to preserve the SHMLBA
 		 * bits of the eventual vma->vm_pgoff value during
 		 * mmap().  Otherwise we run into cache aliasing problems
 		 * on some platforms.  On these platforms, the pgoff of
 		 * a mmap() request is used to pick a suitable virtual
 		 * address for the mmap() region such that it will not
 		 * cause cache aliasing problems.
 		 *
 		 * Therefore, make sure the SHMLBA relevant bits of the
 		 * hash value we use are equal to those in the original
 		 * kernel virtual address.
 		 */
 		int bits = ilog2(SHMLBA >> PAGE_SHIFT) + 1;
 		unsigned long low_mask = (1UL << bits) - 1UL;
 
 		shift = bits;
 		add |= ((user_token >> PAGE_SHIFT) & low_mask);
 	}
 
 	return drm_ht_just_insert_please(&dev->map_hash, hash,
 					 user_token, 32 - PAGE_SHIFT - 3,
 					 shift, add);
 }
 
 /**
  * Core function to create a range of memory available for mapping by a
  * non-root process.
  *
  * Adjusts the memory offset to its absolute value according to the mapping
  * type.  Adds the map to the map list drm_device::maplist. Adds MTRR's where
  * applicable and if supported by the kernel.
  *
  * \param dev DRM device.
  * \param offset requested offset of the map; must be page aligned.
  * \param size requested size; must be page aligned (rounded up for _DRM_SHM).
  * \param type map type.
  * \param flags map flags; _DRM_REMOVABLE is only accepted for _DRM_SHM.
  * \param maplist on success, receives the list entry that holds the map
  * (either a newly created one or an existing matching entry).
  * \return zero on success or a negative errno value on failure.
  */
 static int drm_addmap_core(struct drm_device * dev, resource_size_t offset,
 			   unsigned int size, enum drm_map_type type,
 			   enum drm_map_flags flags,
 			   struct drm_map_list ** maplist)
 {
 	struct drm_local_map *map;
 	struct drm_map_list *list;
 	drm_dma_handle_t *dmah;
 	unsigned long user_token;
 	int ret;
 	int align;
 
 	map = malloc(sizeof(*map), DRM_MEM_MAPS, M_NOWAIT);
 	if (!map)
 		return -ENOMEM;
 
 	map->offset = offset;
 	map->size = size;
 	map->flags = flags;
 	map->type = type;
 
 	/* Only allow shared memory to be removable since we only keep enough
 	 * book keeping information about shared memory to allow for removal
 	 * when processes fork.
 	 */
 	if ((map->flags & _DRM_REMOVABLE) && map->type != _DRM_SHM) {
 		free(map, DRM_MEM_MAPS);
 		return -EINVAL;
 	}
 	DRM_DEBUG("offset = 0x%08llx, size = 0x%08lx, type = %d\n",
 		  (unsigned long long)map->offset, map->size, map->type);
 
 	/* page-align _DRM_SHM maps. They are allocated here so there is no security
 	 * hole created by that and it works around various broken drivers that use
 	 * a non-aligned quantity to map the SAREA. --BenH
 	 */
 	if (map->type == _DRM_SHM)
 		map->size = PAGE_ALIGN(map->size);
 
 	/*
 	 * FreeBSD port note: FreeBSD's PAGE_MASK is the inverse of
 	 * Linux's one. That's why the test below doesn't inverse the
 	 * constant.
 	 */
 	if ((map->offset & ((resource_size_t)PAGE_MASK)) || (map->size & (PAGE_MASK))) {
 		free(map, DRM_MEM_MAPS);
 		return -EINVAL;
 	}
 	map->mtrr = -1;
 	map->handle = NULL;
 
 	switch (map->type) {
 	case _DRM_REGISTERS:
 	case _DRM_FRAME_BUFFER:
 #ifdef __linux__
 #if !defined(__sparc__) && !defined(__alpha__) && !defined(__ia64__) && !defined(__powerpc64__) && !defined(__x86_64__) && !defined(__arm__)
 		if (map->offset + (map->size-1) < map->offset ||
 		    map->offset < virt_to_phys(high_memory)) {
 			kfree(map);
 			return -EINVAL;
 		}
 #endif
 #endif
 		/* Some drivers preinitialize some maps, without the X Server
 		 * needing to be aware of it.  Therefore, we just return success
 		 * when the server tries to create a duplicate map.
 		 */
 		list = drm_find_matching_map(dev, map);
 		if (list != NULL) {
 			if (list->map->size != map->size) {
 				DRM_DEBUG("Matching maps of type %d with "
 					  "mismatched sizes, (%ld vs %ld)\n",
 					  map->type, map->size,
 					  list->map->size);
 				list->map->size = map->size;
 			}
 
 			free(map, DRM_MEM_MAPS);
 			*maplist = list;
 			return 0;
 		}
 
 		if (drm_core_has_MTRR(dev)) {
 			if (map->type == _DRM_FRAME_BUFFER ||
 			    (map->flags & _DRM_WRITE_COMBINING)) {
 				if (drm_mtrr_add(
 				    map->offset, map->size,
 				    DRM_MTRR_WC) == 0)
 					map->mtrr = 1;
 			}
 		}
 		if (map->type == _DRM_REGISTERS) {
 			drm_core_ioremap(map, dev);
 			if (!map->handle) {
 				free(map, DRM_MEM_MAPS);
 				return -ENOMEM;
 			}
 		}
 
 		break;
 	case _DRM_SHM:
 		/* As above, a duplicate request reuses the existing map. */
 		list = drm_find_matching_map(dev, map);
 		if (list != NULL) {
 			if (list->map->size != map->size) {
 				DRM_DEBUG("Matching maps of type %d with "
 					  "mismatched sizes, (%ld vs %ld)\n",
 					  map->type, map->size, list->map->size);
 				list->map->size = map->size;
 			}
 
 			free(map, DRM_MEM_MAPS);
 			*maplist = list;
 			return 0;
 		}
 		map->handle = malloc(map->size, DRM_MEM_MAPS, M_NOWAIT);
 		DRM_DEBUG("%lu %d %p\n",
 			  map->size, drm_order(map->size), map->handle);
 		if (!map->handle) {
 			free(map, DRM_MEM_MAPS);
 			return -ENOMEM;
 		}
 		map->offset = (unsigned long)map->handle;
 		if (map->flags & _DRM_CONTAINS_LOCK) {
 			/* Prevent a 2nd X Server from creating a 2nd lock */
 			if (dev->primary->master->lock.hw_lock != NULL) {
 				free(map->handle, DRM_MEM_MAPS);
 				free(map, DRM_MEM_MAPS);
 				return -EBUSY;
 			}
 			dev->sigdata.lock = dev->primary->master->lock.hw_lock = map->handle;	/* Pointer to lock */
 		}
 		break;
 	case _DRM_AGP: {
 		struct drm_agp_mem *entry;
 		int valid = 0;
 
 		if (!drm_core_has_AGP(dev)) {
 			free(map, DRM_MEM_MAPS);
 			return -EINVAL;
 		}
 #ifdef __linux__
 #ifdef __alpha__
 		map->offset += dev->hose->mem_space->start;
 #endif
 #endif
 		/* In some cases (i810 driver), user space may have already
 		 * added the AGP base itself, because dev->agp->base previously
 		 * only got set during AGP enable.  So, only add the base
 		 * address if the map's offset isn't already within the
 		 * aperture.
 		 */
 		if (map->offset < dev->agp->base ||
 		    map->offset > dev->agp->base +
 		    dev->agp->agp_info.ai_aperture_size * 1024 * 1024 - 1) {
 			map->offset += dev->agp->base;
 		}
 		map->mtrr = dev->agp->agp_mtrr;	/* for getmap */
 
 		/* This assumes the DRM is in total control of AGP space.
 		 * It's not always the case as AGP can be in the control
 		 * of user space (i.e. i810 driver). So this loop will get
 		 * skipped and we double check that dev->agp->memory is
 		 * actually set as well as being invalid before EPERM'ing
 		 */
 		list_for_each_entry(entry, &dev->agp->memory, head) {
 			if ((map->offset >= entry->bound) &&
 			    (map->offset + map->size <= entry->bound + entry->pages * PAGE_SIZE)) {
 				valid = 1;
 				break;
 			}
 		}
 		if (!list_empty(&dev->agp->memory) && !valid) {
 			free(map, DRM_MEM_MAPS);
 			return -EPERM;
 		}
 		DRM_DEBUG("AGP offset = 0x%08llx, size = 0x%08lx\n",
 			  (unsigned long long)map->offset, map->size);
 
 		break;
 	}
 	case _DRM_GEM:
 		DRM_ERROR("tried to addmap GEM object\n");
 		break;
 	case _DRM_SCATTER_GATHER:
 		if (!dev->sg) {
 			free(map, DRM_MEM_MAPS);
 			return -EINVAL;
 		}
 		map->handle = (void *)(dev->sg->vaddr + offset);
 		map->offset += dev->sg->vaddr;
 		break;
 	case _DRM_CONSISTENT:
 		/* dma_addr_t is 64bit on i386 with CONFIG_HIGHMEM64G,
 		 * As we're limiting the address to 2^32-1 (or less),
 		 * casting it down to 32 bits is no problem, but we
 		 * need to point to a 64bit variable first. */
 		/* Align to the map size when it is a power of two, else to a page. */
 		align = map->size;
 		if ((align & (align - 1)) != 0)
 			align = PAGE_SIZE;
 		dmah = drm_pci_alloc(dev, map->size, align, BUS_SPACE_MAXADDR);
 		if (!dmah) {
 			free(map, DRM_MEM_MAPS);
 			return -ENOMEM;
 		}
 		map->handle = dmah->vaddr;
 		map->offset = dmah->busaddr;
 		map->dmah = dmah;
 		break;
 	default:
 		free(map, DRM_MEM_MAPS);
 		return -EINVAL;
 	}
 
 	/*
 	 * NOTE(review): the two failure paths below only undo the register
 	 * ioremap; backing storage set up above for _DRM_SHM (map->handle,
 	 * and the hw_lock pointers that may reference it) and for
 	 * _DRM_CONSISTENT (map->dmah) is not released there.
 	 */
 	list = malloc(sizeof(*list), DRM_MEM_MAPS, M_ZERO | M_NOWAIT);
 	if (!list) {
 		if (map->type == _DRM_REGISTERS)
 			drm_core_ioremapfree(map, dev);
 		free(map, DRM_MEM_MAPS);
 		return -EINVAL;
 	}
 	list->map = map;
 
 	DRM_LOCK(dev);
 	list_add(&list->head, &dev->maplist);
 
 	/* Assign a 32-bit handle */
 	/* We do it here so that dev->struct_mutex protects the increment */
 	user_token = (map->type == _DRM_SHM) ? (unsigned long)map->handle :
 		map->offset;
 	ret = drm_map_handle(dev, &list->hash, user_token, 0,
 			     (map->type == _DRM_SHM));
 	if (ret) {
 		/*
 		 * The entry was already linked into dev->maplist above;
 		 * unlink it before freeing so the list is not left with a
 		 * dangling node.
 		 */
 		list_del(&list->head);
 		if (map->type == _DRM_REGISTERS)
 			drm_core_ioremapfree(map, dev);
 		free(map, DRM_MEM_MAPS);
 		free(list, DRM_MEM_MAPS);
 		DRM_UNLOCK(dev);
 		return ret;
 	}
 
 	list->user_token = list->hash.key << PAGE_SHIFT;
 	DRM_UNLOCK(dev);
 
 	/* Maps not flagged _DRM_DRIVER are tied to the current master. */
 	if (!(map->flags & _DRM_DRIVER))
 		list->master = dev->primary->master;
 	*maplist = list;
 	return 0;
 }
 
 /*
  * Create a map through drm_addmap_core() and hand the resulting
  * struct drm_local_map back through map_ptr.  *map_ptr is only written on
  * success; the return value is that of drm_addmap_core().
  */
 int drm_addmap(struct drm_device * dev, resource_size_t offset,
 	       unsigned int size, enum drm_map_type type,
 	       enum drm_map_flags flags, struct drm_local_map ** map_ptr)
 {
 	struct drm_map_list *entry;
 	int status;
 
 	status = drm_addmap_core(dev, offset, size, type, flags, &entry);
 	if (status == 0)
 		*map_ptr = entry->map;
 
 	return status;
 }
 
 EXPORT_SYMBOL(drm_addmap);
 
 /**
  * Ioctl to specify a range of memory that is available for mapping by a
  * non-root process.
  *
  * \param dev DRM device.
  * \param data pointer to a struct drm_map describing the request; its
  * handle field is filled in with the map's user token on success.
  * \param file_priv DRM file private.
  * \return zero on success or a negative value on error.
  *
  * Callers that fail the DRM_SUSER check may only add _DRM_AGP and _DRM_SHM
  * maps; any other type yields -EPERM.
  */
 int drm_addmap_ioctl(struct drm_device *dev, void *data,
 		     struct drm_file *file_priv)
 {
 	struct drm_map *request = data;
 	struct drm_map_list *entry;
 	int rc;
 
 	if (!DRM_SUSER(DRM_CURPROC) && request->type != _DRM_AGP &&
 	    request->type != _DRM_SHM)
 		return -EPERM;
 
 	rc = drm_addmap_core(dev, request->offset, request->size,
 			     request->type, request->flags, &entry);
 	if (rc != 0)
 		return rc;
 
 	/* avoid a warning on 64-bit, this casting isn't very nice, but the API is set so too late */
 	request->handle = (void *)(unsigned long)entry->user_token;
 
 	return 0;
 }
 
 /**
  * Remove a map private from list and deallocate resources if the mapping
  * isn't in use.
  *
  * Searches the map on drm_device::maplist, removes it from the list, see if
  * its being used, and free any associate resource (such as MTRR's) if it's not
  * being on use.
  *
  * \sa drm_addmap
  */
 int drm_rmmap_locked(struct drm_device *dev, struct drm_local_map *map)
 {
 	struct drm_map_list *r_list = NULL, *list_t;
 	int found = 0;
 	struct drm_master *master;
 
 	/* Find the list entry for the map and remove it */
 	list_for_each_entry_safe(r_list, list_t, &dev->maplist, head) {
 		if (r_list->map == map) {
 			master = r_list->master;
 			list_del(&r_list->head);
 			drm_ht_remove_key(&dev->map_hash,
 					  r_list->user_token >> PAGE_SHIFT);
 			free(r_list, DRM_MEM_MAPS);
 			found = 1;
 			break;
 		}
 	}
 
 	if (!found)
 		return -EINVAL;
 
 	switch (map->type) {
 	case _DRM_REGISTERS:
 		drm_core_ioremapfree(map, dev);
 		/* FALLTHROUGH */
 	case _DRM_FRAME_BUFFER:
 		if (drm_core_has_MTRR(dev) && map->mtrr >= 0) {
 			int retcode;
 			retcode = drm_mtrr_del(map->mtrr, map->offset,
 			    map->size, DRM_MTRR_WC);
 			DRM_DEBUG("mtrr_del=%d\n", retcode);
 		}
 		break;
 	case _DRM_SHM:
 		free(map->handle, DRM_MEM_MAPS);
 		if (master) {
 			if (dev->sigdata.lock == master->lock.hw_lock)
 				dev->sigdata.lock = NULL;
 			master->lock.hw_lock = NULL;   /* SHM removed */
 			master->lock.file_priv = NULL;
 			DRM_WAKEUP_INT((void *)&master->lock.lock_queue);
 		}
 		break;
 	case _DRM_AGP:
 	case _DRM_SCATTER_GATHER:
 		break;
 	case _DRM_CONSISTENT:
 		drm_pci_free(dev, map->dmah);
 		break;
 	case _DRM_GEM:
 		DRM_ERROR("tried to rmmap GEM object\n");
 		break;
 	}
 	free(map, DRM_MEM_MAPS);
 
 	return 0;
 }
 EXPORT_SYMBOL(drm_rmmap_locked);
 
 /*
  * Locking wrapper around drm_rmmap_locked(): takes DRM_LOCK for the
  * duration of the removal and returns drm_rmmap_locked()'s result.
  */
 int drm_rmmap(struct drm_device *dev, struct drm_local_map *map)
 {
 	int status;
 
 	DRM_LOCK(dev);
 	status = drm_rmmap_locked(dev, map);
 	DRM_UNLOCK(dev);
 
 	return status;
 }
 EXPORT_SYMBOL(drm_rmmap);
 
 /* The rmmap ioctl appears to be unnecessary.  All mappings are torn down on
  * the last close of the device, and this is necessary for cleanup when things
  * exit uncleanly.  Therefore, having userland manually remove mappings seems
  * like a pointless exercise since they're going away anyway.
  *
  * One use case might be after addmap is allowed for normal users for SHM and
  * gets used by drivers that the server doesn't need to care about.  This seems
  * unlikely.
  *
  * Only maps flagged _DRM_REMOVABLE whose user token equals the requested
  * handle are considered.
  *
  * \param dev DRM device.
  * \param data pointer to a struct drm_map structure naming the handle.
  * \param file_priv DRM file private.
  * \return zero on success or a negative value on error.
  */
 int drm_rmmap_ioctl(struct drm_device *dev, void *data,
 		    struct drm_file *file_priv)
 {
 	struct drm_map *request = data;
 	struct drm_local_map *target = NULL;
 	struct drm_map_list *it;
 	int rc;
 
 	DRM_LOCK(dev);
 
 	list_for_each_entry(it, &dev->maplist, head) {
 		if (it->map == NULL)
 			continue;
 		if (it->user_token != (unsigned long)request->handle)
 			continue;
 		if (!(it->map->flags & _DRM_REMOVABLE))
 			continue;
 
 		target = it->map;
 		break;
 	}
 
 	if (list_empty(&dev->maplist) || target == NULL) {
 		/* The list is empty or nothing on it matched. */
 		rc = -EINVAL;
 	} else if ((target->type == _DRM_REGISTERS) ||
 	    (target->type == _DRM_FRAME_BUFFER)) {
 		/* Register and framebuffer maps are permanent */
 		rc = 0;
 	} else {
 		rc = drm_rmmap_locked(dev, target);
 	}
 
 	DRM_UNLOCK(dev);
 
 	return rc;
 }
 
 /**
  * Cleanup after an error on one of the addbufs() functions.
  *
  * \param dev DRM device.
  * \param entry buffer entry where the error occurred.
  *
  * Frees any pages and buffers associated with the given entry.
  */
 static void drm_cleanup_buf_error(struct drm_device * dev,
 				  struct drm_buf_entry * entry)
 {
 	int i;
 
 	if (entry->seg_count) {
 		for (i = 0; i < entry->seg_count; i++) {
 			if (entry->seglist[i]) {
 				drm_pci_free(dev, entry->seglist[i]);
 			}
 		}
 		free(entry->seglist, DRM_MEM_SEGS);
 
 		entry->seg_count = 0;
 	}
 
 	if (entry->buf_count) {
 		for (i = 0; i < entry->buf_count; i++) {
 			free(entry->buflist[i].dev_private, DRM_MEM_BUFS);
 		}
 		free(entry->buflist, DRM_MEM_BUFS);
 
 		entry->buf_count = 0;
 	}
 }
 
 #if __OS_HAS_AGP
 /**
  * Add AGP buffers for DMA transfers.
  *
  * \param dev struct drm_device to which the buffers are to be added.
  * \param request pointer to a struct drm_buf_desc describing the request.
  * On success its count and size fields are updated with the number of
  * buffers created and the (power-of-two) buffer size used.
  * \return zero on success or a negative number on failure.
  *
  * After some sanity checks creates a drm_buf structure for each buffer and
  * reallocates the buffer list of the same size order to accommodate the new
  * buffers.
  *
  * Fails with -EINVAL when there is no DMA state, the order is out of
  * range, the AGP region is not covered by known AGP memory, or the count
  * is outside 0..4096; with -EBUSY when dev->buf_use is set; and with
  * -ENOMEM when buffers of this order already exist or an allocation fails.
  */
 int drm_addbufs_agp(struct drm_device * dev, struct drm_buf_desc * request)
 {
 	struct drm_device_dma *dma = dev->dma;
 	struct drm_buf_entry *entry;
 	struct drm_agp_mem *agp_entry;
 	struct drm_buf *buf;
 	unsigned long offset;
 	unsigned long agp_offset;
 	int count;
 	int order;
 	int size;
 	int alignment;
 	int page_order;
 	int total;
 	int byte_count;
 	int i, valid;
 	struct drm_buf **temp_buflist;
 
 	if (!dma)
 		return -EINVAL;
 
 	/* Round the requested size up to a power of two. */
 	count = request->count;
 	order = drm_order(request->size);
 	size = 1 << order;
 
 	/* Per-buffer stride, and the page-granular footprint of one buffer. */
 	alignment = (request->flags & _DRM_PAGE_ALIGN)
 	    ? PAGE_ALIGN(size) : size;
 	page_order = order - PAGE_SHIFT > 0 ? order - PAGE_SHIFT : 0;
 	total = PAGE_SIZE << page_order;
 
 	byte_count = 0;
 	agp_offset = dev->agp->base + request->agp_start;
 
 	DRM_DEBUG("count:      %d\n", count);
 	DRM_DEBUG("order:      %d\n", order);
 	DRM_DEBUG("size:       %d\n", size);
 	DRM_DEBUG("agp_offset: %lx\n", agp_offset);
 	DRM_DEBUG("alignment:  %d\n", alignment);
 	DRM_DEBUG("page_order: %d\n", page_order);
 	DRM_DEBUG("total:      %d\n", total);
 
 	if (order < DRM_MIN_ORDER || order > DRM_MAX_ORDER)
 		return -EINVAL;
 
 	/* Make sure buffers are located in AGP memory that we own */
 	valid = 0;
 	list_for_each_entry(agp_entry, &dev->agp->memory, head) {
 		if ((agp_offset >= agp_entry->bound) &&
 		    (agp_offset + total * count <= agp_entry->bound + agp_entry->pages * PAGE_SIZE)) {
 			valid = 1;
 			break;
 		}
 	}
 	/* An empty AGP memory list skips the ownership check entirely. */
 	if (!list_empty(&dev->agp->memory) && !valid) {
 		DRM_DEBUG("zone invalid\n");
 		return -EINVAL;
 	}
 	/*
 	 * Refuse to add buffers once dev->buf_use is set; otherwise mark an
 	 * allocation as in progress.  Every exit below drops buf_alloc again.
 	 */
 	mtx_lock(&dev->count_lock);
 	if (dev->buf_use) {
 		mtx_unlock(&dev->count_lock);
 		return -EBUSY;
 	}
 	atomic_inc(&dev->buf_alloc);
 	mtx_unlock(&dev->count_lock);
 
 	DRM_LOCK(dev);
 	entry = &dma->bufs[order];
 	if (entry->buf_count) {
 		DRM_UNLOCK(dev);
 		atomic_dec(&dev->buf_alloc);
 		return -ENOMEM;	/* May only call once for each order */
 	}
 
 	if (count < 0 || count > 4096) {
 		DRM_UNLOCK(dev);
 		atomic_dec(&dev->buf_alloc);
 		return -EINVAL;
 	}
 
 	entry->buflist = malloc(count * sizeof(*entry->buflist), DRM_MEM_BUFS,
 	    M_NOWAIT | M_ZERO);
 	if (!entry->buflist) {
 		DRM_UNLOCK(dev);
 		atomic_dec(&dev->buf_alloc);
 		return -ENOMEM;
 	}
 
 	entry->buf_size = size;
 	entry->page_order = page_order;
 
 	offset = 0;
 
 	/* Describe each buffer; buffers are laid out `alignment` bytes apart. */
 	while (entry->buf_count < count) {
 		buf = &entry->buflist[entry->buf_count];
 		buf->idx = dma->buf_count + entry->buf_count;
 		buf->total = alignment;
 		buf->order = order;
 		buf->used = 0;
 
 		buf->offset = (dma->byte_count + offset);
 		buf->bus_address = agp_offset + offset;
 		buf->address = (void *)(agp_offset + offset);
 		buf->next = NULL;
 		buf->waiting = 0;
 		buf->pending = 0;
 		buf->file_priv = NULL;
 
 		buf->dev_priv_size = dev->driver->dev_priv_size;
 		buf->dev_private = malloc(buf->dev_priv_size, DRM_MEM_BUFS,
 		    M_NOWAIT | M_ZERO);
 		if (!buf->dev_private) {
 			/* Set count correctly so we free the proper amount. */
 			entry->buf_count = count;
 			drm_cleanup_buf_error(dev, entry);
 			DRM_UNLOCK(dev);
 			atomic_dec(&dev->buf_alloc);
 			return -ENOMEM;
 		}
 
 		DRM_DEBUG("buffer %d @ %p\n", entry->buf_count, buf->address);
 
 		offset += alignment;
 		entry->buf_count++;
 		byte_count += PAGE_SIZE << page_order;
 	}
 
 	DRM_DEBUG("byte_count: %d\n", byte_count);
 
 	/* Grow the device-wide buffer index to make room for the new entries. */
 	temp_buflist = realloc(dma->buflist,
 	    (dma->buf_count + entry->buf_count) * sizeof(*dma->buflist),
 	    DRM_MEM_BUFS, M_NOWAIT);
 	if (!temp_buflist) {
 		/* Free the entry because it isn't valid */
 		drm_cleanup_buf_error(dev, entry);
 		DRM_UNLOCK(dev);
 		atomic_dec(&dev->buf_alloc);
 		return -ENOMEM;
 	}
 	dma->buflist = temp_buflist;
 
 	for (i = 0; i < entry->buf_count; i++) {
 		dma->buflist[i + dma->buf_count] = &entry->buflist[i];
 	}
 
 	/* Fold the new buffers into the device-wide totals. */
 	dma->buf_count += entry->buf_count;
 	dma->seg_count += entry->seg_count;
 	dma->page_count += byte_count >> PAGE_SHIFT;
 	dma->byte_count += byte_count;
 
 	DRM_DEBUG("dma->buf_count : %d\n", dma->buf_count);
 	DRM_DEBUG("entry->buf_count : %d\n", entry->buf_count);
 
 	DRM_UNLOCK(dev);
 
 	/* Report back what was actually created. */
 	request->count = entry->buf_count;
 	request->size = size;
 
 	dma->flags = _DRM_DMA_USE_AGP;
 
 	atomic_dec(&dev->buf_alloc);
 	return 0;
 }
 EXPORT_SYMBOL(drm_addbufs_agp);
 #endif				/* __OS_HAS_AGP */
 
 /**
  * Add PCI DMA buffers.
  *
  * \param dev struct drm_device to which the buffers are to be added.
  * \param request pointer to a struct drm_buf_desc describing the request.
  * On success its count and size fields are updated with the number of
  * buffers created and the (power-of-two) buffer size used.
  * \return zero on success or a negative number on failure.
  *
  * Requires the DRIVER_PCI_DMA feature and a caller that passes DRM_SUSER.
  * DMA segments of PAGE_SIZE << page_order bytes are obtained one at a time
  * from drm_pci_alloc(); each segment's pages are appended to a new copy of
  * the device page list and buffers are carved out of the segment.  The new
  * page list only replaces dma->pagelist after every allocation succeeded.
  */
 int drm_addbufs_pci(struct drm_device * dev, struct drm_buf_desc * request)
 {
 	struct drm_device_dma *dma = dev->dma;
 	int count;
 	int order;
 	int size;
 	int total;
 	int page_order;
 	struct drm_buf_entry *entry;
 	drm_dma_handle_t *dmah;
 	struct drm_buf *buf;
 	int alignment;
 	unsigned long offset;
 	int i;
 	int byte_count;
 	int page_count;
 	unsigned long *temp_pagelist;
 	struct drm_buf **temp_buflist;
 
 	if (!drm_core_check_feature(dev, DRIVER_PCI_DMA))
 		return -EINVAL;
 
 	if (!dma)
 		return -EINVAL;
 
 	if (!DRM_SUSER(DRM_CURPROC))
 		return -EPERM;
 
 	/* Round the requested size up to a power of two. */
 	count = request->count;
 	order = drm_order(request->size);
 	size = 1 << order;
 
 	DRM_DEBUG("count=%d, size=%d (%d), order=%d\n",
 		  request->count, request->size, size, order);
 
 	if (order < DRM_MIN_ORDER || order > DRM_MAX_ORDER)
 		return -EINVAL;
 
 	/* Per-buffer stride, and the size in bytes of one DMA segment. */
 	alignment = (request->flags & _DRM_PAGE_ALIGN)
 	    ? PAGE_ALIGN(size) : size;
 	page_order = order - PAGE_SHIFT > 0 ? order - PAGE_SHIFT : 0;
 	total = PAGE_SIZE << page_order;
 
 	/*
 	 * Refuse to add buffers once dev->buf_use is set; otherwise mark an
 	 * allocation as in progress.  Every exit below drops buf_alloc again.
 	 */
 	mtx_lock(&dev->count_lock);
 	if (dev->buf_use) {
 		mtx_unlock(&dev->count_lock);
 		return -EBUSY;
 	}
 	atomic_inc(&dev->buf_alloc);
 	mtx_unlock(&dev->count_lock);
 
 	DRM_LOCK(dev);
 	entry = &dma->bufs[order];
 	if (entry->buf_count) {
 		DRM_UNLOCK(dev);
 		atomic_dec(&dev->buf_alloc);
 		return -ENOMEM;	/* May only call once for each order */
 	}
 
 	if (count < 0 || count > 4096) {
 		DRM_UNLOCK(dev);
 		atomic_dec(&dev->buf_alloc);
 		return -EINVAL;
 	}
 
 	entry->buflist = malloc(count * sizeof(*entry->buflist), DRM_MEM_BUFS,
 	    M_NOWAIT | M_ZERO);
 	if (!entry->buflist) {
 		DRM_UNLOCK(dev);
 		atomic_dec(&dev->buf_alloc);
 		return -ENOMEM;
 	}
 
 	entry->seglist = malloc(count * sizeof(*entry->seglist), DRM_MEM_SEGS,
 	    M_NOWAIT | M_ZERO);
 	if (!entry->seglist) {
 		free(entry->buflist, DRM_MEM_BUFS);
 		DRM_UNLOCK(dev);
 		atomic_dec(&dev->buf_alloc);
 		return -ENOMEM;
 	}
 
 	/* Keep the original pagelist until we know all the allocations
 	 * have succeeded
 	 */
 	temp_pagelist = malloc((dma->page_count + (count << page_order)) *
 	    sizeof(*dma->pagelist), DRM_MEM_PAGES, M_NOWAIT);
 	if (!temp_pagelist) {
 		free(entry->buflist, DRM_MEM_BUFS);
 		free(entry->seglist, DRM_MEM_SEGS);
 		DRM_UNLOCK(dev);
 		atomic_dec(&dev->buf_alloc);
 		return -ENOMEM;
 	}
 	memcpy(temp_pagelist,
 	       dma->pagelist, dma->page_count * sizeof(*dma->pagelist));
 	DRM_DEBUG("pagelist: %d entries\n",
 		  dma->page_count + (count << page_order));
 
 	entry->buf_size = size;
 	entry->page_order = page_order;
 	byte_count = 0;
 	page_count = 0;
 
 	/* One iteration per DMA segment, until `count` buffers exist. */
 	while (entry->buf_count < count) {
 
 		dmah = drm_pci_alloc(dev, PAGE_SIZE << page_order, 0x1000, BUS_SPACE_MAXADDR);
 
 		if (!dmah) {
 			/* Set count correctly so we free the proper amount. */
 			entry->buf_count = count;
 			entry->seg_count = count;
 			drm_cleanup_buf_error(dev, entry);
 			free(temp_pagelist, DRM_MEM_PAGES);
 			DRM_UNLOCK(dev);
 			atomic_dec(&dev->buf_alloc);
 			return -ENOMEM;
 		}
 		entry->seglist[entry->seg_count++] = dmah;
 		/* Record the kernel address of every page in this segment. */
 		for (i = 0; i < (1 << page_order); i++) {
 			DRM_DEBUG("page %d @ 0x%08lx\n",
 				  dma->page_count + page_count,
 				  (unsigned long)dmah->vaddr + PAGE_SIZE * i);
 			temp_pagelist[dma->page_count + page_count++]
 				= (unsigned long)dmah->vaddr + PAGE_SIZE * i;
 		}
 		/* Carve as many buffers as fit out of this segment. */
 		for (offset = 0;
 		     offset + size <= total && entry->buf_count < count;
 		     offset += alignment, ++entry->buf_count) {
 			buf = &entry->buflist[entry->buf_count];
 			buf->idx = dma->buf_count + entry->buf_count;
 			buf->total = alignment;
 			buf->order = order;
 			buf->used = 0;
 			buf->offset = (dma->byte_count + byte_count + offset);
 			buf->address = (void *)((char *)dmah->vaddr + offset);
 			buf->bus_address = dmah->busaddr + offset;
 			buf->next = NULL;
 			buf->waiting = 0;
 			buf->pending = 0;
 			buf->file_priv = NULL;
 
 			buf->dev_priv_size = dev->driver->dev_priv_size;
 			buf->dev_private = malloc(buf->dev_priv_size,
 			    DRM_MEM_BUFS, M_NOWAIT | M_ZERO);
 			if (!buf->dev_private) {
 				/* Set count correctly so we free the proper amount. */
 				entry->buf_count = count;
 				entry->seg_count = count;
 				drm_cleanup_buf_error(dev, entry);
 				free(temp_pagelist, DRM_MEM_PAGES);
 				DRM_UNLOCK(dev);
 				atomic_dec(&dev->buf_alloc);
 				return -ENOMEM;
 			}
 
 			DRM_DEBUG("buffer %d @ %p\n",
 				  entry->buf_count, buf->address);
 		}
 		byte_count += PAGE_SIZE << page_order;
 	}
 
 	/* Grow the device-wide buffer index to make room for the new entries. */
 	temp_buflist = realloc(dma->buflist,
 	    (dma->buf_count + entry->buf_count) * sizeof(*dma->buflist),
 	    DRM_MEM_BUFS, M_NOWAIT);
 	if (!temp_buflist) {
 		/* Free the entry because it isn't valid */
 		drm_cleanup_buf_error(dev, entry);
 		free(temp_pagelist, DRM_MEM_PAGES);
 		DRM_UNLOCK(dev);
 		atomic_dec(&dev->buf_alloc);
 		return -ENOMEM;
 	}
 	dma->buflist = temp_buflist;
 
 	for (i = 0; i < entry->buf_count; i++) {
 		dma->buflist[i + dma->buf_count] = &entry->buflist[i];
 	}
 
 	/* No allocations failed, so now we can replace the original pagelist
 	 * with the new one.
 	 */
 	if (dma->page_count) {
 		free(dma->pagelist, DRM_MEM_PAGES);
 	}
 	dma->pagelist = temp_pagelist;
 
 	/* Fold the new buffers and segments into the device-wide totals. */
 	dma->buf_count += entry->buf_count;
 	dma->seg_count += entry->seg_count;
 	dma->page_count += entry->seg_count << page_order;
 	dma->byte_count += PAGE_SIZE * (entry->seg_count << page_order);
 
 	DRM_UNLOCK(dev);
 
 	/* Report back what was actually created. */
 	request->count = entry->buf_count;
 	request->size = size;
 
 	if (request->flags & _DRM_PCI_BUFFER_RO)
 		dma->flags = _DRM_DMA_USE_PCI_RO;
 
 	atomic_dec(&dev->buf_alloc);
 	return 0;
 
 }
 EXPORT_SYMBOL(drm_addbufs_pci);
 
 /**
  * Add scatter/gather buffers for DMA transfers.
  *
  * \param dev struct drm_device to which the buffers are to be added.
  * \param request pointer to a struct drm_buf_desc describing the request.
  * On success its count and size fields are updated with the number of
  * buffers created and the (power-of-two) buffer size used.
  * \return zero on success or a negative number on failure.
  *
  * Requires the DRIVER_SG feature and a caller that passes DRM_SUSER.
  * Creates a drm_buf structure for each buffer and appends the new buffers
  * to the device-wide buffer list.  Buffer kernel addresses are computed
  * relative to dev->sg->vaddr.
  */
 static int drm_addbufs_sg(struct drm_device * dev, struct drm_buf_desc * request)
 {
 	struct drm_device_dma *dma = dev->dma;
 	struct drm_buf_entry *entry;
 	struct drm_buf *buf;
 	unsigned long offset;
 	unsigned long agp_offset;
 	int count;
 	int order;
 	int size;
 	int alignment;
 	int page_order;
 	int total;
 	int byte_count;
 	int i;
 	struct drm_buf **temp_buflist;
 
 	if (!drm_core_check_feature(dev, DRIVER_SG))
 		return -EINVAL;
 
 	if (!dma)
 		return -EINVAL;
 
 	if (!DRM_SUSER(DRM_CURPROC))
 		return -EPERM;
 
 	/* Round the requested size up to a power of two. */
 	count = request->count;
 	order = drm_order(request->size);
 	size = 1 << order;
 
 	/* Per-buffer stride, and the page-granular footprint of one buffer. */
 	alignment = (request->flags & _DRM_PAGE_ALIGN)
 	    ? PAGE_ALIGN(size) : size;
 	page_order = order - PAGE_SHIFT > 0 ? order - PAGE_SHIFT : 0;
 	total = PAGE_SIZE << page_order;
 
 	byte_count = 0;
 	/* Despite the name, this is the request's start offset as given. */
 	agp_offset = request->agp_start;
 
 	DRM_DEBUG("count:      %d\n", count);
 	DRM_DEBUG("order:      %d\n", order);
 	DRM_DEBUG("size:       %d\n", size);
 	DRM_DEBUG("agp_offset: %lu\n", agp_offset);
 	DRM_DEBUG("alignment:  %d\n", alignment);
 	DRM_DEBUG("page_order: %d\n", page_order);
 	DRM_DEBUG("total:      %d\n", total);
 
 	if (order < DRM_MIN_ORDER || order > DRM_MAX_ORDER)
 		return -EINVAL;
 
 	/*
 	 * Refuse to add buffers once dev->buf_use is set; otherwise mark an
 	 * allocation as in progress.  Every exit below drops buf_alloc again.
 	 */
 	mtx_lock(&dev->count_lock);
 	if (dev->buf_use) {
 		mtx_unlock(&dev->count_lock);
 		return -EBUSY;
 	}
 	atomic_inc(&dev->buf_alloc);
 	mtx_unlock(&dev->count_lock);
 
 	DRM_LOCK(dev);
 	entry = &dma->bufs[order];
 	if (entry->buf_count) {
 		DRM_UNLOCK(dev);
 		atomic_dec(&dev->buf_alloc);
 		return -ENOMEM;	/* May only call once for each order */
 	}
 
 	if (count < 0 || count > 4096) {
 		DRM_UNLOCK(dev);
 		atomic_dec(&dev->buf_alloc);
 		return -EINVAL;
 	}
 
 	entry->buflist = malloc(count * sizeof(*entry->buflist), DRM_MEM_BUFS,
 	    M_NOWAIT | M_ZERO);
 	if (!entry->buflist) {
 		DRM_UNLOCK(dev);
 		atomic_dec(&dev->buf_alloc);
 		return -ENOMEM;
 	}
 
 	entry->buf_size = size;
 	entry->page_order = page_order;
 
 	offset = 0;
 
 	/* Describe each buffer; buffers are laid out `alignment` bytes apart. */
 	while (entry->buf_count < count) {
 		buf = &entry->buflist[entry->buf_count];
 		buf->idx = dma->buf_count + entry->buf_count;
 		buf->total = alignment;
 		buf->order = order;
 		buf->used = 0;
 
 		buf->offset = (dma->byte_count + offset);
 		buf->bus_address = agp_offset + offset;
 		buf->address = (void *)(agp_offset + offset
 					+ (unsigned long)dev->sg->vaddr);
 		buf->next = NULL;
 		buf->waiting = 0;
 		buf->pending = 0;
 		buf->file_priv = NULL;
 
 		buf->dev_priv_size = dev->driver->dev_priv_size;
 		buf->dev_private = malloc(buf->dev_priv_size, DRM_MEM_BUFS,
 		    M_NOWAIT | M_ZERO);
 		if (!buf->dev_private) {
 			/* Set count correctly so we free the proper amount. */
 			entry->buf_count = count;
 			drm_cleanup_buf_error(dev, entry);
 			DRM_UNLOCK(dev);
 			atomic_dec(&dev->buf_alloc);
 			return -ENOMEM;
 		}
 
 		DRM_DEBUG("buffer %d @ %p\n", entry->buf_count, buf->address);
 
 		offset += alignment;
 		entry->buf_count++;
 		byte_count += PAGE_SIZE << page_order;
 	}
 
 	DRM_DEBUG("byte_count: %d\n", byte_count);
 
 	/* Grow the device-wide buffer index to make room for the new entries. */
 	temp_buflist = realloc(dma->buflist,
 	    (dma->buf_count + entry->buf_count) * sizeof(*dma->buflist),
 	    DRM_MEM_BUFS, M_NOWAIT);
 	if (!temp_buflist) {
 		/* Free the entry because it isn't valid */
 		drm_cleanup_buf_error(dev, entry);
 		DRM_UNLOCK(dev);
 		atomic_dec(&dev->buf_alloc);
 		return -ENOMEM;
 	}
 	dma->buflist = temp_buflist;
 
 	for (i = 0; i < entry->buf_count; i++) {
 		dma->buflist[i + dma->buf_count] = &entry->buflist[i];
 	}
 
 	/* Fold the new buffers into the device-wide totals. */
 	dma->buf_count += entry->buf_count;
 	dma->seg_count += entry->seg_count;
 	dma->page_count += byte_count >> PAGE_SHIFT;
 	dma->byte_count += byte_count;
 
 	DRM_DEBUG("dma->buf_count : %d\n", dma->buf_count);
 	DRM_DEBUG("entry->buf_count : %d\n", entry->buf_count);
 
 	DRM_UNLOCK(dev);
 
 	/* Report back what was actually created. */
 	request->count = entry->buf_count;
 	request->size = size;
 
 	dma->flags = _DRM_DMA_USE_SG;
 
 	atomic_dec(&dev->buf_alloc);
 	return 0;
 }
 
 /**
  * Add DMA buffers that live in framebuffer memory.
  *
  * \param dev DRM device.
  * \param request buffer description.  count, size, flags and agp_start are
  * read; count and size are overwritten with the values actually used.
  * \return zero on success or a negative errno value on failure:
  * -EINVAL (driver lacks DRIVER_FB_DMA, no DMA state, order or count out of
  * range), -EPERM (caller is not the superuser), -EBUSY (buffers are already
  * in use) or -ENOMEM (allocation failure, or this order was already set up).
  *
  * The requested size is rounded up to a power of two.  Buffers are placed
  * starting at request->agp_start and spaced by that size, page aligned when
  * _DRM_PAGE_ALIGN is set.  Buffers for a given order may only be added once.
  * On success dma->flags is set to _DRM_DMA_USE_FB.
  */
 static int drm_addbufs_fb(struct drm_device * dev, struct drm_buf_desc * request)
 {
 	struct drm_device_dma *dma = dev->dma;
 	struct drm_buf_entry *entry;
 	struct drm_buf *buf;
 	unsigned long offset;
 	unsigned long agp_offset;
 	int count;
 	int order;
 	int size;
 	int alignment;
 	int page_order;
 	int total;
 	int byte_count;
 	int i;
 	struct drm_buf **temp_buflist;
 
 	if (!drm_core_check_feature(dev, DRIVER_FB_DMA))
 		return -EINVAL;
 
 	if (!dma)
 		return -EINVAL;
 
 	if (!DRM_SUSER(DRM_CURPROC))
 		return -EPERM;
 
 	count = request->count;
 	order = drm_order(request->size);
 
 	/*
 	 * Validate the order before it is used as a shift count below:
 	 * shifting by a negative or too-large amount is undefined behaviour.
 	 */
 	if (order < DRM_MIN_ORDER || order > DRM_MAX_ORDER)
 		return -EINVAL;
 
 	size = 1 << order;
 
 	alignment = (request->flags & _DRM_PAGE_ALIGN)
 	    ? PAGE_ALIGN(size) : size;
 	page_order = order - PAGE_SHIFT > 0 ? order - PAGE_SHIFT : 0;
 	total = PAGE_SIZE << page_order;
 
 	byte_count = 0;
 	agp_offset = request->agp_start;
 
 	DRM_DEBUG("count:      %d\n", count);
 	DRM_DEBUG("order:      %d\n", order);
 	DRM_DEBUG("size:       %d\n", size);
 	DRM_DEBUG("agp_offset: %lu\n", agp_offset);
 	DRM_DEBUG("alignment:  %d\n", alignment);
 	DRM_DEBUG("page_order: %d\n", page_order);
 	DRM_DEBUG("total:      %d\n", total);
 
 	/* Refuse to add buffers once they are in use; otherwise flag that an
 	 * allocation is in progress. */
 	mtx_lock(&dev->count_lock);
 	if (dev->buf_use) {
 		mtx_unlock(&dev->count_lock);
 		return -EBUSY;
 	}
 	atomic_inc(&dev->buf_alloc);
 	mtx_unlock(&dev->count_lock);
 
 	DRM_LOCK(dev);
 	entry = &dma->bufs[order];
 	if (entry->buf_count) {
 		DRM_UNLOCK(dev);
 		atomic_dec(&dev->buf_alloc);
 		return -ENOMEM;	/* May only call once for each order */
 	}
 
 	if (count < 0 || count > 4096) {
 		DRM_UNLOCK(dev);
 		atomic_dec(&dev->buf_alloc);
 		return -EINVAL;
 	}
 
 	entry->buflist = malloc(count * sizeof(*entry->buflist), DRM_MEM_BUFS,
 	    M_NOWAIT | M_ZERO);
 	if (!entry->buflist) {
 		DRM_UNLOCK(dev);
 		atomic_dec(&dev->buf_alloc);
 		return -ENOMEM;
 	}
 
 	entry->buf_size = size;
 	entry->page_order = page_order;
 
 	offset = 0;
 
 	/* Initialise one descriptor per buffer; entry->buf_count doubles as
 	 * the index of the buffer being set up. */
 	while (entry->buf_count < count) {
 		buf = &entry->buflist[entry->buf_count];
 		buf->idx = dma->buf_count + entry->buf_count;
 		buf->total = alignment;
 		buf->order = order;
 		buf->used = 0;
 
 		buf->offset = (dma->byte_count + offset);
 		buf->bus_address = agp_offset + offset;
 		buf->address = (void *)(agp_offset + offset);
 		buf->next = NULL;
 		buf->waiting = 0;
 		buf->pending = 0;
 		buf->file_priv = NULL;
 
 		buf->dev_priv_size = dev->driver->dev_priv_size;
 		buf->dev_private = malloc(buf->dev_priv_size, DRM_MEM_BUFS,
 		    M_NOWAIT | M_ZERO);
 		if (!buf->dev_private) {
 			/* Set count correctly so we free the proper amount. */
 			entry->buf_count = count;
 			drm_cleanup_buf_error(dev, entry);
 			DRM_UNLOCK(dev);
 			atomic_dec(&dev->buf_alloc);
 			return -ENOMEM;
 		}
 
 		DRM_DEBUG("buffer %d @ %p\n", entry->buf_count, buf->address);
 
 		offset += alignment;
 		entry->buf_count++;
 		byte_count += PAGE_SIZE << page_order;
 	}
 
 	DRM_DEBUG("byte_count: %d\n", byte_count);
 
 	/* Grow the device-wide list through a temporary so the old list is
 	 * still valid if realloc() fails. */
 	temp_buflist = realloc(dma->buflist,
 	    (dma->buf_count + entry->buf_count) * sizeof(*dma->buflist),
 	    DRM_MEM_BUFS, M_NOWAIT);
 	if (!temp_buflist) {
 		/* Free the entry because it isn't valid */
 		drm_cleanup_buf_error(dev, entry);
 		DRM_UNLOCK(dev);
 		atomic_dec(&dev->buf_alloc);
 		return -ENOMEM;
 	}
 	dma->buflist = temp_buflist;
 
 	for (i = 0; i < entry->buf_count; i++) {
 		dma->buflist[i + dma->buf_count] = &entry->buflist[i];
 	}
 
 	dma->buf_count += entry->buf_count;
 	dma->seg_count += entry->seg_count;
 	dma->page_count += byte_count >> PAGE_SHIFT;
 	dma->byte_count += byte_count;
 
 	DRM_DEBUG("dma->buf_count : %d\n", dma->buf_count);
 	DRM_DEBUG("entry->buf_count : %d\n", entry->buf_count);
 
 	DRM_UNLOCK(dev);
 
 	/* Report back what was actually allocated. */
 	request->count = entry->buf_count;
 	request->size = size;
 
 	dma->flags = _DRM_DMA_USE_FB;
 
 	atomic_dec(&dev->buf_alloc);
 	return 0;
 }
 
 
 /**
  * Add buffers for DMA transfers (ioctl).
  *
  * \param dev DRM device.
  * \param data pointer to a struct drm_buf_desc request.
  * \param file_priv DRM file private (not used here).
  * \return zero on success or a negative number on failure.
  *
  * Picks the allocator from drm_buf_desc::flags: drm_addbufs_agp() for
  * _DRM_AGP_BUFFER (only when built with AGP support), drm_addbufs_sg() for
  * _DRM_SG_BUFFER, drm_addbufs_fb() for _DRM_FB_BUFFER, and
  * drm_addbufs_pci() when none of those flags is set.
  */
 int drm_addbufs(struct drm_device *dev, void *data,
 		struct drm_file *file_priv)
 {
 	struct drm_buf_desc *request = data;
 
 	if (!drm_core_check_feature(dev, DRIVER_HAVE_DMA))
 		return -EINVAL;
 
 #if __OS_HAS_AGP
 	if (request->flags & _DRM_AGP_BUFFER)
 		return drm_addbufs_agp(dev, request);
 #endif
 	if (request->flags & _DRM_SG_BUFFER)
 		return drm_addbufs_sg(dev, request);
 	if (request->flags & _DRM_FB_BUFFER)
 		return drm_addbufs_fb(dev, request);
 
 	/* No memory-type flag set: fall back to PCI buffers. */
 	return drm_addbufs_pci(dev, request);
 }
 
 /**
  * Get information about the buffer mappings.
  *
  * This was originally meant for debugging purposes, or for a sophisticated
  * client library to determine how best to use the available buffers (e.g.,
  * large buffers can be used for image transfer).
  *
  * \param dev DRM device.
  * \param data pointer to a drm_buf_info structure.
  * \param file_priv DRM file private (not used here).
  * \return zero on success or a negative number on failure.
  *
  * Increments drm_device::buf_use while holding the drm_device::count_lock
  * lock, which prevents any further buffer allocation after this call.  If
  * the caller's list has room for every populated order, one descriptor per
  * populated order is copied out to user space.  In all successful cases
  * drm_buf_info::count is set to the number of populated orders.
  */
 int drm_infobufs(struct drm_device *dev, void *data,
 		 struct drm_file *file_priv)
 {
 	struct drm_device_dma *dma = dev->dma;
 	struct drm_buf_info *request = data;
 	int order;
 	int nused;
 
 	if (!drm_core_check_feature(dev, DRIVER_HAVE_DMA))
 		return -EINVAL;
 
 	if (!dma)
 		return -EINVAL;
 
 	mtx_lock(&dev->count_lock);
 	if (atomic_read(&dev->buf_alloc)) {
 		mtx_unlock(&dev->count_lock);
 		return -EBUSY;
 	}
 	++dev->buf_use;		/* Can't allocate more after this call */
 	mtx_unlock(&dev->count_lock);
 
 	/* First pass: how many orders actually have buffers? */
 	nused = 0;
 	for (order = 0; order <= DRM_MAX_ORDER; order++) {
 		if (dma->bufs[order].buf_count)
 			++nused;
 	}
 
 	DRM_DEBUG("count = %d\n", nused);
 
 	/* Second pass: copy the details out, but only if they all fit. */
 	if (request->count >= nused) {
 		nused = 0;
 		for (order = 0; order <= DRM_MAX_ORDER; order++) {
 			struct drm_buf_entry *from = &dma->bufs[order];
 			struct drm_freelist *list = &from->freelist;
 			struct drm_buf_desc __user *to;
 
 			if (!from->buf_count)
 				continue;
 
 			to = &request->list[nused];
 			if (copy_to_user(&to->count, &from->buf_count,
 					 sizeof(from->buf_count)))
 				return -EFAULT;
 			if (copy_to_user(&to->size, &from->buf_size,
 					 sizeof(from->buf_size)))
 				return -EFAULT;
 			if (copy_to_user(&to->low_mark, &list->low_mark,
 					 sizeof(list->low_mark)))
 				return -EFAULT;
 			if (copy_to_user(&to->high_mark, &list->high_mark,
 					 sizeof(list->high_mark)))
 				return -EFAULT;
 
 			DRM_DEBUG("%d %d %d %d %d\n",
 				  order,
 				  from->buf_count,
 				  from->buf_size,
 				  list->low_mark,
 				  list->high_mark);
 			++nused;
 		}
 	}
 	request->count = nused;
 
 	return 0;
 }
 
 /**
  * Specifies a low and high water mark for buffer allocation
  *
  * \param dev DRM device.
  * \param data pointer to a drm_buf_desc structure.
  * \param file_priv DRM file private (not used here).
  * \return zero on success or a negative number on failure.
  *
  * Checks that the size order lies within the admissible orders and that
  * both marks lie between zero and the entry's buffer count, then stores the
  * marks in the free list of the matching drm_device_dma::bufs entry.
  *
  * \note This ioctl is deprecated and mostly never used.
  */
 int drm_markbufs(struct drm_device *dev, void *data,
 		 struct drm_file *file_priv)
 {
 	struct drm_buf_desc *request = data;
 	struct drm_device_dma *dma = dev->dma;
 	struct drm_buf_entry *entry;
 	int order;
 
 	if (!drm_core_check_feature(dev, DRIVER_HAVE_DMA) || !dma)
 		return -EINVAL;
 
 	DRM_DEBUG("%d, %d, %d\n",
 		  request->size, request->low_mark, request->high_mark);
 
 	order = drm_order(request->size);
 	if (order < DRM_MIN_ORDER || order > DRM_MAX_ORDER)
 		return -EINVAL;
 
 	/* Both marks must fall inside [0, buf_count] for this order. */
 	entry = &dma->bufs[order];
 	if (request->low_mark < 0 || request->low_mark > entry->buf_count ||
 	    request->high_mark < 0 || request->high_mark > entry->buf_count)
 		return -EINVAL;
 
 	entry->freelist.low_mark = request->low_mark;
 	entry->freelist.high_mark = request->high_mark;
 
 	return 0;
 }
 
 /**
  * Unreserve the buffers in list, previously reserved using drmDMA.
  *
  * \param dev DRM device.
  * \param data pointer to a drm_buf_free structure.
  * \param file_priv DRM file private; must own every buffer being freed.
  * \return zero on success or a negative number on failure.
  *
  * Reads each buffer index from user space and calls drm_free_buffer() on
  * it.  Processing stops at the first bad index or foreign buffer; buffers
  * freed before that point stay freed.
  * This function is primarily used for debugging.
  */
 int drm_freebufs(struct drm_device *dev, void *data,
 		 struct drm_file *file_priv)
 {
 	struct drm_buf_free *request = data;
 	struct drm_device_dma *dma = dev->dma;
 	struct drm_buf *victim;
 	int slot;
 	int n;
 
 	if (!drm_core_check_feature(dev, DRIVER_HAVE_DMA) || !dma)
 		return -EINVAL;
 
 	DRM_DEBUG("%d\n", request->count);
 
 	n = 0;
 	while (n < request->count) {
 		if (copy_from_user(&slot, &request->list[n], sizeof(slot)))
 			return -EFAULT;
 
 		if (slot < 0 || slot >= dma->buf_count) {
 			DRM_ERROR("Index %d (of %d max)\n",
 				  slot, dma->buf_count - 1);
 			return -EINVAL;
 		}
 
 		victim = dma->buflist[slot];
 		if (victim->file_priv != file_priv) {
 			DRM_ERROR("Process %d freeing buffer not owned\n",
 				  DRM_CURRENTPID);
 			return -EINVAL;
 		}
 
 		drm_free_buffer(dev, victim);
 		n++;
 	}
 
 	return 0;
 }
 
 /**
  * Maps all of the DMA buffers into client-virtual space (ioctl).
  *
  * \param dev DRM device.
  * \param data pointer to a drm_buf_map structure.
  * \param file_priv DRM file private; its minor's device backs the mapping.
  * \return zero on success or a negative number on failure.
  *
  * Maps the AGP, SG, FB or PCI buffer region with vm_mmap(), and copies
  * information about each buffer into user space. For PCI buffers, it calls
  * vm_mmap() with offset equal to 0, which drm_mmap() interprets as PCI
  * buffers and calls drm_mmap_dma().
  *
  * The mapping is only attempted when the caller's list can hold every
  * buffer (request->count >= dma->buf_count).  Whatever the outcome past the
  * initial checks, request->count is set to dma->buf_count on return.
  */
 int drm_mapbufs(struct drm_device *dev, void *data,
 	        struct drm_file *file_priv)
 {
 	struct drm_device_dma *dma = dev->dma;
 	int retcode = 0;
 	const int zero = 0;
 	vm_offset_t virtual;
 	vm_offset_t address;
 	struct vmspace *vms;
 	struct drm_buf_map *request = data;
 	int i;
 
 	if (!drm_core_check_feature(dev, DRIVER_HAVE_DMA))
 		return -EINVAL;
 
 	if (!dma)
 		return -EINVAL;
 
 	/* Fail while an allocation is in flight; otherwise mark the buffers
 	 * as in use. */
 	mtx_lock(&dev->count_lock);
 	if (atomic_read(&dev->buf_alloc)) {
 		mtx_unlock(&dev->count_lock);
 		return -EBUSY;
 	}
 	dev->buf_use++;		/* Can't allocate more after this call */
 	mtx_unlock(&dev->count_lock);
 
 	/* The mapping goes into the calling process' address space. */
 	vms = DRM_CURPROC->td_proc->p_vmspace;
 
 	if (request->count >= dma->buf_count) {
 		if ((drm_core_has_AGP(dev) && (dma->flags & _DRM_DMA_USE_AGP))
 		    || (drm_core_check_feature(dev, DRIVER_SG)
 			&& (dma->flags & _DRM_DMA_USE_SG))
 		    || (drm_core_check_feature(dev, DRIVER_FB_DMA)
 			&& (dma->flags & _DRM_DMA_USE_FB))) {
 			/* AGP, SG and FB buffers are all mapped through the
 			 * device's agp_buffer_map and its token. */
 			struct drm_local_map *map = dev->agp_buffer_map;
 			vm_ooffset_t token = dev->agp_buffer_token;
 
 			if (!map) {
 				retcode = -EINVAL;
 				goto done;
 			}
 			retcode = vm_mmap(&vms->vm_map, &virtual, map->size,
-			    PROT_READ | PROT_WRITE, VM_PROT_ALL,
+			    VM_PROT_READ | VM_PROT_WRITE, VM_PROT_ALL,
 			    MAP_SHARED | MAP_NOSYNC, OBJT_DEVICE,
 			    file_priv->minor->device, token);
 		} else {
 			/* PCI buffers: map dma->byte_count bytes at offset 0. */
 			retcode = vm_mmap(&vms->vm_map, &virtual, dma->byte_count,
-			    PROT_READ | PROT_WRITE, VM_PROT_ALL,
+			    VM_PROT_READ | VM_PROT_WRITE, VM_PROT_ALL,
 			    MAP_SHARED | MAP_NOSYNC, OBJT_DEVICE,
 			    file_priv->minor->device, 0);
 		}
 		if (retcode) {
 			/* Real error */
 			/* Flip the sign so the value follows this function's
 			 * negative-on-failure convention. */
 			retcode = -retcode;
 			goto done;
 		}
 		request->virtual = (void __user *)virtual;
 
 		/* Describe every buffer to the caller: index, total size, a
 		 * zero "used" count and its address inside the new mapping. */
 		for (i = 0; i < dma->buf_count; i++) {
 			if (copy_to_user(&request->list[i].idx,
 					 &dma->buflist[i]->idx,
 					 sizeof(request->list[0].idx))) {
 				retcode = -EFAULT;
 				goto done;
 			}
 			if (copy_to_user(&request->list[i].total,
 					 &dma->buflist[i]->total,
 					 sizeof(request->list[0].total))) {
 				retcode = -EFAULT;
 				goto done;
 			}
 			if (copy_to_user(&request->list[i].used,
 					 &zero, sizeof(zero))) {
 				retcode = -EFAULT;
 				goto done;
 			}
 			address = virtual + dma->buflist[i]->offset;	/* *** */
 			if (copy_to_user(&request->list[i].address,
 					 &address, sizeof(address))) {
 				retcode = -EFAULT;
 				goto done;
 			}
 		}
 	}
       done:
 	/* Always report the real buffer count, even on failure or when the
 	 * caller's list was too small. */
 	request->count = dma->buf_count;
 	DRM_DEBUG("%d buffers, retcode = %d\n", request->count, retcode);
 
 	return retcode;
 }
 
 /**
  * Compute size order.  Returns the exponent of the smallest power of two
  * which is greater than or equal to the given number (zero for a size of
  * zero or one).
  *
  * \param size size.
  * \return order.
  *
  * \todo Can be made faster.
  */
 int drm_order(unsigned long size)
 {
 	unsigned long rest = size >> 1;
 	int order = 0;
 
 	/* Count the halvings: this yields floor(log2(size)). */
 	while (rest != 0) {
 		rest >>= 1;
 		order++;
 	}
 
 	/* A bit set below the top one means size is not a power of two,
 	 * so round the exponent up. */
 	if ((size & (size - 1)) != 0)
 		order++;
 
 	return order;
 }
 EXPORT_SYMBOL(drm_order);
Index: head/sys/fs/devfs/devfs_vnops.c
===================================================================
--- head/sys/fs/devfs/devfs_vnops.c	(revision 283997)
+++ head/sys/fs/devfs/devfs_vnops.c	(revision 283998)
@@ -1,1835 +1,1900 @@
 /*-
  * Copyright (c) 2000-2004
  *	Poul-Henning Kamp.  All rights reserved.
  * Copyright (c) 1989, 1992-1993, 1995
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software donated to Berkeley by
  * Jan-Simon Pendry.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kernfs_vnops.c	8.15 (Berkeley) 5/21/95
  * From: FreeBSD: src/sys/miscfs/kernfs/kernfs_vnops.c 1.43
  *
  * $FreeBSD$
  */
 
 /*
  * TODO:
  *	mkdir: want it ?
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/conf.h>
 #include <sys/dirent.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/filio.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
+#include <sys/mman.h>
 #include <sys/mount.h>
 #include <sys/namei.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/stat.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/time.h>
 #include <sys/ttycom.h>
 #include <sys/unistd.h>
 #include <sys/vnode.h>
 
 static struct vop_vector devfs_vnodeops;
 static struct fileops devfs_ops_f;
 
 #include <fs/devfs/devfs.h>
 #include <fs/devfs/devfs_int.h>
 
 #include <security/mac/mac_framework.h>
 
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_object.h>
+
 /* Malloc type for the per-file cdev private data records. */
 static MALLOC_DEFINE(M_CDEVPDATA, "DEVFSP", "Metainfo for cdev-fp data");
 
 /* Taken around updates of the de_vnode <-> v_data linkage in this file. */
 struct mtx	devfs_de_interlock;
 MTX_SYSINIT(devfs_de_interlock, &devfs_de_interlock, "devfs interlock", MTX_DEF);
 /* NOTE(review): no user of this lock appears nearby; per its description
  * it relates to draining clone events -- confirm against the callers. */
 struct sx	clone_drain_lock;
 SX_SYSINIT(clone_drain_lock, &clone_drain_lock, "clone events drain lock");
 /* Protects file->f_cdevpriv and the per-cdev list of private data. */
 struct mtx	cdevpriv_mtx;
 MTX_SYSINIT(cdevpriv_mtx, &cdevpriv_mtx, "cdevpriv lock", MTX_DEF);
 
 SYSCTL_DECL(_vfs_devfs);
 
 /* vfs.devfs.dotimes: when non-zero, devfs_timestamp() uses
  * vfs_timestamp() instead of whole-second time stamps. */
 static int devfs_dotimes;
 SYSCTL_INT(_vfs_devfs, OID_AUTO, dotimes, CTLFLAG_RW,
     &devfs_dotimes, 0, "Update timestamps on DEVFS with default precision");
 
 /*
  * Refresh a devfs node timestamp.  With the dotimes sysctl set the time
  * comes from vfs_timestamp(); otherwise it is set to the current second,
  * and only written when that second has changed.  Updates are unlocked, so
  * stat(2) could see partially updated times.
  */
 static void
 devfs_timestamp(struct timespec *tsp)
 {
 	time_t now;
 
 	if (devfs_dotimes) {
 		vfs_timestamp(tsp);
 		return;
 	}
 
 	now = time_second;
 	if (tsp->tv_sec == now)
 		return;
 	tsp->tv_sec = now;
 	tsp->tv_nsec = 0;
 }
 
 /*
  * Resolve the cdev and cdevsw behind an open devfs file and take a thread
  * reference on the device.  Returns ENXIO if the vnode's device is no
  * longer the one the file was opened on, or if no cdevsw is available.  On
  * success the file is recorded in curthread->td_fpop and 0 is returned.
  */
 static int
 devfs_fp_check(struct file *fp, struct cdev **devp, struct cdevsw **dswp,
     int *ref)
 {
 	struct cdevsw *dsw;
 
 	dsw = devvn_refthread(fp->f_vnode, devp, ref);
 	*dswp = dsw;
 
 	if (*devp != fp->f_data) {
 		/* Wrong device: give back the reference if one was taken. */
 		if (dsw != NULL)
 			dev_relthread(*devp, *ref);
 		return (ENXIO);
 	}
 
 	KASSERT((*devp)->si_refcount > 0,
 	    ("devfs: un-referenced struct cdev *(%s)", devtoname(*devp)));
 
 	if (dsw == NULL)
 		return (ENXIO);
 
 	curthread->td_fpop = fp;
 	return (0);
 }
 
 /*
  * Fetch the private data attached to the file currently being operated on
  * (curthread->td_fpop).  Returns EBADF when there is no such file, ENOENT
  * when the file has no private data, and 0 with *datap filled in otherwise.
  */
 int
 devfs_get_cdevpriv(void **datap)
 {
 	struct cdev_privdata *priv;
 	struct file *fp;
 
 	fp = curthread->td_fpop;
 	if (fp == NULL)
 		return (EBADF);
 
 	priv = fp->f_cdevpriv;
 	if (priv == NULL)
 		return (ENOENT);
 
 	*datap = priv->cdpd_data;
 	return (0);
 }
 
 /*
  * Attach private data and its destructor to the file currently being
  * operated on (curthread->td_fpop).  Returns ENOENT when there is no such
  * file, EBUSY when the file already has private data, and 0 on success.
  */
 int
 devfs_set_cdevpriv(void *priv, cdevpriv_dtr_t priv_dtr)
 {
 	struct cdev_privdata *rec;
 	struct cdev_priv *cdp;
 	struct file *fp;
 
 	fp = curthread->td_fpop;
 	if (fp == NULL)
 		return (ENOENT);
 	cdp = cdev2priv((struct cdev *)fp->f_data);
 
 	/* Build the record before taking the mutex. */
 	rec = malloc(sizeof(struct cdev_privdata), M_CDEVPDATA, M_WAITOK);
 	rec->cdpd_data = priv;
 	rec->cdpd_dtr = priv_dtr;
 	rec->cdpd_fp = fp;
 
 	mtx_lock(&cdevpriv_mtx);
 	if (fp->f_cdevpriv != NULL) {
 		/* Somebody already attached data to this file. */
 		mtx_unlock(&cdevpriv_mtx);
 		free(rec, M_CDEVPDATA);
 		return (EBUSY);
 	}
 	LIST_INSERT_HEAD(&cdp->cdp_fdpriv, rec, cdpd_list);
 	fp->f_cdevpriv = rec;
 	mtx_unlock(&cdevpriv_mtx);
 	return (0);
 }
 
 /*
  * Detach and destroy one cdev private data record.
  *
  * Must be entered with cdevpriv_mtx held; the mutex is released here.
  * The record is unlinked from its file and from the per-cdev list while
  * the mutex is still held, and the destructor runs only after the mutex
  * has been dropped.
  */
 void
 devfs_destroy_cdevpriv(struct cdev_privdata *p)
 {
 
 	mtx_assert(&cdevpriv_mtx, MA_OWNED);
 	p->cdpd_fp->f_cdevpriv = NULL;
 	LIST_REMOVE(p, cdpd_list);
 	mtx_unlock(&cdevpriv_mtx);
 	/* Run the owner's destructor without holding cdevpriv_mtx. */
 	(p->cdpd_dtr)(p->cdpd_data);
 	free(p, M_CDEVPDATA);
 }
 
 /*
  * Destroy the private data attached to fp, if there is any.
  */
 void
 devfs_fpdrop(struct file *fp)
 {
 	struct cdev_privdata *priv;
 
 	mtx_lock(&cdevpriv_mtx);
 	priv = fp->f_cdevpriv;
 	if (priv != NULL) {
 		/* devfs_destroy_cdevpriv() releases cdevpriv_mtx. */
 		devfs_destroy_cdevpriv(priv);
 		return;
 	}
 	mtx_unlock(&cdevpriv_mtx);
 }
 
 /*
  * Drop the private data of the file currently being operated on
  * (curthread->td_fpop); does nothing when there is no such file.
  */
 void
 devfs_clear_cdevpriv(void)
 {
 	struct file *fp = curthread->td_fpop;
 
 	if (fp != NULL)
 		devfs_fpdrop(fp);
 }
 
 /*
  * Run devfs_populate() for the mount that vp belongs to.
  *
  * The vnode must be locked on entry and is locked again, in the same mode,
  * on return; it is unlocked while devfs_populate() runs.
  *
  * On success devfs_populate_vp() returns with dmp->dm_lock held.
  * On failure it returns EBADF with dmp->dm_lock released; this happens
  * when the mount went away, or the vnode or its dirent became doomed.
  */
 static int
 devfs_populate_vp(struct vnode *vp)
 {
 	struct devfs_dirent *de;
 	struct devfs_mount *dmp;
 	int locked;
 
 	ASSERT_VOP_LOCKED(vp, "devfs_populate_vp");
 
 	dmp = VFSTODEVFS(vp->v_mount);
 	/* Remember the current lock mode so it can be restored below. */
 	locked = VOP_ISLOCKED(vp);
 
 	sx_xlock(&dmp->dm_lock);
 	/* The hold taken here is released by DEVFS_DMP_DROP() below. */
 	DEVFS_DMP_HOLD(dmp);
 
 	/* Can't call devfs_populate() with the vnode lock held. */
 	VOP_UNLOCK(vp, 0);
 	devfs_populate(dmp);
 
 	/*
 	 * Release dm_lock before re-locking the vnode, then take it again.
 	 * NOTE(review): presumably this keeps the vnode lock ordered before
 	 * dm_lock -- confirm.
 	 */
 	sx_xunlock(&dmp->dm_lock);
 	vn_lock(vp, locked | LK_RETRY);
 	sx_xlock(&dmp->dm_lock);
 	if (DEVFS_DMP_DROP(dmp)) {
 		/* A true result means the mount needs its final teardown. */
 		sx_xunlock(&dmp->dm_lock);
 		devfs_unmount_final(dmp);
 		return (EBADF);
 	}
 	/* Re-check the vnode: it was unlocked for a while above. */
 	if ((vp->v_iflag & VI_DOOMED) != 0) {
 		sx_xunlock(&dmp->dm_lock);
 		return (EBADF);
 	}
 	de = vp->v_data;
 	KASSERT(de != NULL,
 	    ("devfs_populate_vp: vp->v_data == NULL but vnode not doomed"));
 	if ((de->de_flags & DE_DOOMED) != 0) {
 		sx_xunlock(&dmp->dm_lock);
 		return (EBADF);
 	}
 
 	return (0);
 }
 
 /*
  * Translate a vnode into its last path component and its parent vnode.
  *
  * The component name is copied so that it ends at offset *a_buflen of
  * a_buf, and *a_buflen is lowered to the start of the name.  The parent
  * vnode is returned, referenced, in *a_vpp.  For the mount's root directory
  * the vnode itself is returned and the buffer is left untouched.
  *
  * Returns 0 on success, ENOMEM when the name does not fit in the buffer,
  * ENOENT for unsupported vnode types or when no parent (or no parent
  * vnode) exists, or the error from devfs_populate_vp().
  */
 static int
 devfs_vptocnp(struct vop_vptocnp_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vnode **dvp = ap->a_vpp;
 	struct devfs_mount *dmp;
 	char *buf = ap->a_buf;
 	int *buflen = ap->a_buflen;
 	struct devfs_dirent *dd, *de;
 	int i, error;
 
 	dmp = VFSTODEVFS(vp->v_mount);
 
 	/* On success this returns with dmp->dm_lock held; every exit path
 	 * below releases it at "finished". */
 	error = devfs_populate_vp(vp);
 	if (error != 0)
 		return (error);
 
 	i = *buflen;
 	dd = vp->v_data;
 
 	if (vp->v_type == VCHR) {
 		/* Device node: the component is the cdev's name. */
 		i -= strlen(dd->de_cdp->cdp_c.si_name);
 		if (i < 0) {
 			error = ENOMEM;
 			goto finished;
 		}
 		bcopy(dd->de_cdp->cdp_c.si_name, buf + i,
 		    strlen(dd->de_cdp->cdp_c.si_name));
 		de = dd->de_dir;
 	} else if (vp->v_type == VDIR) {
 		if (dd == dmp->dm_rootdir) {
 			/* The root directory is returned as its own parent. */
 			*dvp = vp;
 			vref(*dvp);
 			goto finished;
 		}
 		i -= dd->de_dirent->d_namlen;
 		if (i < 0) {
 			error = ENOMEM;
 			goto finished;
 		}
 		bcopy(dd->de_dirent->d_name, buf + i,
 		    dd->de_dirent->d_namlen);
 		de = dd;
 	} else {
 		error = ENOENT;
 		goto finished;
 	}
 	*buflen = i;
 	de = devfs_parent_dirent(de);
 	if (de == NULL) {
 		error = ENOENT;
 		goto finished;
 	}
 	mtx_lock(&devfs_de_interlock);
 	*dvp = de->de_vnode;
 	if (*dvp != NULL) {
 		/*
 		 * Take a hold under the vnode interlock, then convert it
 		 * into a regular reference once the locks are dropped.
 		 */
 		VI_LOCK(*dvp);
 		mtx_unlock(&devfs_de_interlock);
 		vholdl(*dvp);
 		VI_UNLOCK(*dvp);
 		vref(*dvp);
 		vdrop(*dvp);
 	} else {
 		mtx_unlock(&devfs_de_interlock);
 		error = ENOENT;
 	}
 finished:
 	sx_xunlock(&dmp->dm_lock);
 	return (error);
 }
 
 /*
  * Build the fully qualified path name, relative to the mountpoint, of
  * directory entry dd, followed by the component in cnp when one is given.
  * The path is assembled backwards from the end of buf, which must hold at
  * least SPECNAMELEN + 1 bytes; a pointer to its first character is
  * returned, or NULL if the path does not fit or a parent entry is missing.
  * If a NULL cnp is provided, no '/' is appended to the resulting path.
  */
 char *
 devfs_fqpn(char *buf, struct devfs_mount *dmp, struct devfs_dirent *dd,
     struct componentname *cnp)
 {
 	struct devfs_dirent *cur;
 	int pos;
 
 	sx_assert(&dmp->dm_lock, SA_LOCKED);
 
 	pos = SPECNAMELEN;
 	buf[pos] = '\0';
 
 	/* Place the trailing component, if any, right before the NUL. */
 	if (cnp != NULL) {
 		pos -= cnp->cn_namelen;
 		if (pos < 0)
 			return (NULL);
 		bcopy(cnp->cn_nameptr, buf + pos, cnp->cn_namelen);
 	}
 
 	/* Walk up towards the root, prepending one directory name a time. */
 	for (cur = dd; cur != dmp->dm_rootdir; ) {
 		/* A separator is needed once anything follows this name. */
 		if (cnp != NULL || pos < SPECNAMELEN) {
 			if (--pos < 0)
 				return (NULL);
 			buf[pos] = '/';
 		}
 
 		pos -= cur->de_dirent->d_namlen;
 		if (pos < 0)
 			return (NULL);
 		bcopy(cur->de_dirent->d_name, buf + pos,
 		    cur->de_dirent->d_namlen);
 
 		cur = devfs_parent_dirent(cur);
 		if (cur == NULL)
 			return (NULL);
 	}
 
 	return (buf + pos);
 }
 
 /*
  * Release the holds on de and dmp taken by devfs_allocv().
  *
  * Returns 0 when the dirent is still usable, 1 when it was doomed, and 2
  * when the mount itself went away.  dmp->dm_lock is released when the
  * result is 1, or when drop_dm_lock is set and the result is not 2; in the
  * result-2 case the lock was already released before
  * devfs_unmount_final() was called.
  */
 static int
 devfs_allocv_drop_refs(int drop_dm_lock, struct devfs_mount *dmp,
 	struct devfs_dirent *de)
 {
 	int not_found;
 
 	not_found = 0;
 	/* Read the flag before the drop below, which may free de. */
 	if (de->de_flags & DE_DOOMED)
 		not_found = 1;
 	if (DEVFS_DE_DROP(de)) {
 		KASSERT(not_found == 1, ("DEVFS de dropped but not doomed"));
 		devfs_dirent_free(de);
 	}
 	if (DEVFS_DMP_DROP(dmp)) {
 		KASSERT(not_found == 1,
 			("DEVFS mount struct freed before dirent"));
 		not_found = 2;
 		sx_xunlock(&dmp->dm_lock);
 		devfs_unmount_final(dmp);
 	}
 	if (not_found == 1 || (drop_dm_lock && not_found != 2))
 		sx_unlock(&dmp->dm_lock);
 	return (not_found);
 }
 
 /*
  * Destructor handed to insmntque1() by devfs_allocv(): break the link
  * between the vnode and its dirent (passed as arg), then dispose of the
  * vnode.
  */
 static void
 devfs_insmntque_dtr(struct vnode *vp, void *arg)
 {
 	struct devfs_dirent *dirent = arg;
 
 	mtx_lock(&devfs_de_interlock);
 	vp->v_data = NULL;
 	dirent->de_vnode = NULL;
 	mtx_unlock(&devfs_de_interlock);
 
 	vgone(vp);
 	vput(vp);
 }
 
 /*
  * Return the vnode for directory entry de, creating it if needed.
  *
  * devfs_allocv shall be entered with dmp->dm_lock held, and it drops
  * it on return.
  *
  * On success 0 is returned and *vpp holds the vnode, locked as requested
  * by lockmode when it already existed, or exclusively when it was newly
  * created.  ENOENT is returned when the dirent is doomed, its device is
  * not active, or the mount went away; errors from getnewvnode() and
  * insmntque1() are passed through.
  */
 int
 devfs_allocv(struct devfs_dirent *de, struct mount *mp, int lockmode,
     struct vnode **vpp)
 {
 	int error;
 	struct vnode *vp;
 	struct cdev *dev;
 	struct devfs_mount *dmp;
 	struct cdevsw *dsw;
 
 	dmp = VFSTODEVFS(mp);
 	if (de->de_flags & DE_DOOMED) {
 		sx_xunlock(&dmp->dm_lock);
 		return (ENOENT);
 	}
 loop:
 	/* Hold both objects; the holds are released again through
 	 * devfs_allocv_drop_refs(). */
 	DEVFS_DE_HOLD(de);
 	DEVFS_DMP_HOLD(dmp);
 	mtx_lock(&devfs_de_interlock);
 	vp = de->de_vnode;
 	if (vp != NULL) {
 		/* A vnode already exists: lock it with dm_lock dropped. */
 		VI_LOCK(vp);
 		mtx_unlock(&devfs_de_interlock);
 		sx_xunlock(&dmp->dm_lock);
 		vget(vp, lockmode | LK_INTERLOCK | LK_RETRY, curthread);
 		sx_xlock(&dmp->dm_lock);
 		if (devfs_allocv_drop_refs(0, dmp, de)) {
 			vput(vp);
 			return (ENOENT);
 		}
 		else if ((vp->v_iflag & VI_DOOMED) != 0) {
 			/* The vnode died meanwhile: unlink it from the
 			 * dirent if still linked, and start over. */
 			mtx_lock(&devfs_de_interlock);
 			if (de->de_vnode == vp) {
 				de->de_vnode = NULL;
 				vp->v_data = NULL;
 			}
 			mtx_unlock(&devfs_de_interlock);
 			vput(vp);
 			goto loop;
 		}
 		sx_xunlock(&dmp->dm_lock);
 		*vpp = vp;
 		return (0);
 	}
 	mtx_unlock(&devfs_de_interlock);
 	/* No vnode yet.  Character devices must still be active. */
 	if (de->de_dirent->d_type == DT_CHR) {
 		if (!(de->de_cdp->cdp_flags & CDP_ACTIVE)) {
 			devfs_allocv_drop_refs(1, dmp, de);
 			return (ENOENT);
 		}
 		dev = &de->de_cdp->cdp_c;
 	} else {
 		dev = NULL;
 	}
 	error = getnewvnode("devfs", mp, &devfs_vnodeops, &vp);
 	if (error != 0) {
 		devfs_allocv_drop_refs(1, dmp, de);
 		printf("devfs_allocv: failed to allocate new vnode\n");
 		return (error);
 	}
 
 	/* Set the vnode type, and the device linkage for VCHR, from the
 	 * dirent type. */
 	if (de->de_dirent->d_type == DT_CHR) {
 		vp->v_type = VCHR;
 		VI_LOCK(vp);
 		dev_lock();
 		dev_refl(dev);
 		/* XXX: v_rdev should be protect by vnode lock */
 		vp->v_rdev = dev;
 		KASSERT(vp->v_usecount == 1,
 		    ("%s %d (%d)\n", __func__, __LINE__, vp->v_usecount));
 		dev->si_usecount += vp->v_usecount;
 		/* Special casing of ttys for deadfs.  Probably redundant. */
 		dsw = dev->si_devsw;
 		if (dsw != NULL && (dsw->d_flags & D_TTY) != 0)
 			vp->v_vflag |= VV_ISTTY;
 		dev_unlock();
 		VI_UNLOCK(vp);
 		if ((dev->si_flags & SI_ETERNAL) != 0)
 			vp->v_vflag |= VV_ETERNALDEV;
 		vp->v_op = &devfs_specops;
 	} else if (de->de_dirent->d_type == DT_DIR) {
 		vp->v_type = VDIR;
 	} else if (de->de_dirent->d_type == DT_LNK) {
 		vp->v_type = VLNK;
 	} else {
 		vp->v_type = VBAD;
 	}
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_NOWITNESS);
 	VN_LOCK_ASHARE(vp);
 	/* Link dirent and vnode to each other under the interlock. */
 	mtx_lock(&devfs_de_interlock);
 	vp->v_data = de;
 	de->de_vnode = vp;
 	mtx_unlock(&devfs_de_interlock);
 	/* devfs_insmntque_dtr is the destructor handed to insmntque1(). */
 	error = insmntque1(vp, mp, devfs_insmntque_dtr, de);
 	if (error != 0) {
 		(void) devfs_allocv_drop_refs(1, dmp, de);
 		return (error);
 	}
 	if (devfs_allocv_drop_refs(0, dmp, de)) {
 		vput(vp);
 		return (ENOENT);
 	}
 #ifdef MAC
 	mac_devfs_vnode_associate(mp, de, vp);
 #endif
 	sx_xunlock(&dmp->dm_lock);
 	*vpp = vp;
 	return (0);
 }
 
 /*
  * Access check for devfs vnodes.  The usual vaccess() check is applied to
  * the dirent's mode and ownership (for directories, those of de_dir).  An
  * EACCES result is overridden when the target is the controlling terminal
  * of the calling process.
  */
 static int
 devfs_access(struct vop_access_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct devfs_dirent *de;
 	struct proc *p;
 	int error;
 
 	de = vp->v_data;
 	if (vp->v_type == VDIR)
 		de = de->de_dir;
 
 	error = vaccess(vp->v_type, de->de_mode, de->de_uid, de->de_gid,
 	    ap->a_accmode, ap->a_cred, NULL);
 	/* Success and every failure other than EACCES are final. */
 	if (error != EACCES)
 		return (error);
 
 	/* We do, however, allow access to the controlling terminal */
 	p = ap->a_td->td_proc;
 	PROC_LOCK(p);
 	if ((p->p_flag & P_CONTROLT) != 0 &&
 	    p->p_session->s_ttydp == de->de_cdp)
 		error = 0;
 	PROC_UNLOCK(p);
 
 	return (error);
 }
 
 /*
  * Close operation for devfs device vnodes.
  *
  * Returns 0 without calling the driver when the vnode has no dirent
  * attached or when this is not the last close of a device that does not
  * track closes.  Returns ENXIO when no thread reference on the device can
  * be obtained.  Otherwise the result of the driver's d_close() is
  * returned.
  */
 /* ARGSUSED */
 static int
 devfs_close(struct vop_close_args *ap)
 {
 	struct vnode *vp = ap->a_vp, *oldvp;
 	struct thread *td = ap->a_td;
 	struct proc *p;
 	struct cdev *dev = vp->v_rdev;
 	struct cdevsw *dsw;
 	int vp_locked, error, ref;
 
 	/*
 	 * XXX: Don't call d_close() if we were called because of
 	 * XXX: insmntque1() failure.
 	 */
 	if (vp->v_data == NULL)
 		return (0);
 
 	/*
 	 * Hack: a tty device that is a controlling terminal
 	 * has a reference from the session structure.
 	 * We cannot easily tell that a character device is
 	 * a controlling terminal, unless it is the closing
 	 * process' controlling terminal.  In that case,
 	 * if the reference count is 2 (this last descriptor
 	 * plus the session), release the reference from the session.
 	 */
 	if (td != NULL) {
 		p = td->td_proc;
 		PROC_LOCK(p);
 		if (vp == p->p_session->s_ttyvp) {
 			PROC_UNLOCK(p);
 			oldvp = NULL;
 			sx_xlock(&proctree_lock);
 			/* Check again under proctree_lock: the process
 			 * lock was dropped in between. */
 			if (vp == p->p_session->s_ttyvp) {
 				SESS_LOCK(p->p_session);
 				VI_LOCK(vp);
 				if (count_dev(dev) == 2 &&
 				    (vp->v_iflag & VI_DOOMED) == 0) {
 					p->p_session->s_ttyvp = NULL;
 					p->p_session->s_ttydp = NULL;
 					oldvp = vp;
 				}
 				VI_UNLOCK(vp);
 				SESS_UNLOCK(p->p_session);
 			}
 			sx_xunlock(&proctree_lock);
 			/* Release the session's reference only after all
 			 * the locks above have been dropped. */
 			if (oldvp != NULL)
 				vrele(oldvp);
 		} else
 			PROC_UNLOCK(p);
 	}
 	/*
 	 * We do not want to really close the device if it
 	 * is still in use unless we are trying to close it
 	 * forcibly. Since every use (buffer, vnode, swap, cmap)
 	 * holds a reference to the vnode, and because we mark
 	 * any other vnodes that alias this device, when the
 	 * sum of the reference counts on all the aliased
 	 * vnodes descends to one, we are on last close.
 	 */
 	dsw = dev_refthread(dev, &ref);
 	if (dsw == NULL)
 		return (ENXIO);
 	VI_LOCK(vp);
 	if (vp->v_iflag & VI_DOOMED) {
 		/* Forced close. */
 	} else if (dsw->d_flags & D_TRACKCLOSE) {
 		/* Keep device updated on status. */
 	} else if (count_dev(dev) > 1) {
 		VI_UNLOCK(vp);
 		dev_relthread(dev, ref);
 		return (0);
 	}
 	/* Hold the vnode across the unlocked d_close() call below. */
 	vholdl(vp);
 	VI_UNLOCK(vp);
 	/* Remember the lock mode; the vnode is unlocked while the driver
 	 * runs and re-locked in the same mode afterwards. */
 	vp_locked = VOP_ISLOCKED(vp);
 	VOP_UNLOCK(vp, 0);
 	KASSERT(dev->si_refcount > 0,
 	    ("devfs_close() on un-referenced struct cdev *(%s)", devtoname(dev)));
 	error = dsw->d_close(dev, ap->a_fflag, S_IFCHR, td);
 	dev_relthread(dev, ref);
 	vn_lock(vp, vp_locked | LK_RETRY);
 	vdrop(vp);
 	return (error);
 }
 
 /*
  * File close operation for devfs files: run the generic vnode close with
  * this file recorded as the one being operated on, then destroy any
  * private data still attached to the file.
  */
 static int
 devfs_close_f(struct file *fp, struct thread *td)
 {
 	struct file *saved_fpop;
 	int error;
 
 	/*
 	 * NB: td may be NULL if this descriptor is closed due to
 	 * garbage collection from a closed UNIX domain socket.
 	 * That is why curthread is used below instead of td.
 	 */
 	saved_fpop = curthread->td_fpop;
 	curthread->td_fpop = fp;
 	error = vnops.fo_close(fp, td);
 	curthread->td_fpop = saved_fpop;
 
 	/*
 	 * The f_cdevpriv cannot be assigned non-NULL value while we
 	 * are destroying the file.
 	 */
 	if (fp->f_cdevpriv == NULL)
 		return (error);
 
 	devfs_fpdrop(fp);
 	return (error);
 }
 
 static int
 devfs_fsync(struct vop_fsync_args *ap)
 {
 	int error;
 	struct bufobj *bo;
 	struct devfs_dirent *de;
 
 	if (!vn_isdisk(ap->a_vp, &error)) {
 		bo = &ap->a_vp->v_bufobj;
 		de = ap->a_vp->v_data;
 		if (error == ENXIO && bo->bo_dirty.bv_cnt > 0) {
 			printf("Device %s went missing before all of the data "
 			    "could be written to it; expect data loss.\n",
 			    de->de_dirent->d_name);
 
 			error = vop_stdfsync(ap);
 			if (bo->bo_dirty.bv_cnt != 0 || error != 0)
 				panic("devfs_fsync: vop_stdfsync failed.");
 		}
 
 		return (0);
 	}
 
 	return (vop_stdfsync(ap));
 }
 
 /*
  * VOP_GETATTR for devfs vnodes: fill *ap->a_vap from the devfs dirent
  * attached to the vnode and, for character devices, from the timestamps
  * and inode number kept with the cdev.
  *
  * Returns the error from devfs_populate_vp() if it fails, otherwise 0.
  */
 static int
 devfs_getattr(struct vop_getattr_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vattr *vap = ap->a_vap;
 	int error;
 	struct devfs_dirent *de;
 	struct devfs_mount *dmp;
 	struct cdev *dev;
 
 	error = devfs_populate_vp(vp);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * NOTE(review): devfs_populate_vp() appears to return with dm_lock
 	 * exclusively held on success; it is released right away here.
 	 */
 	dmp = VFSTODEVFS(vp->v_mount);
 	sx_xunlock(&dmp->dm_lock);
 
 	de = vp->v_data;
 	KASSERT(de != NULL, ("Null dirent in devfs_getattr vp=%p", vp));
 	if (vp->v_type == VDIR) {
 		/* For directories the attributes are taken from de_dir. */
 		de = de->de_dir;
 		KASSERT(de != NULL,
 		    ("Null dir dirent in devfs_getattr vp=%p", vp));
 	}
 	vap->va_uid = de->de_uid;
 	vap->va_gid = de->de_gid;
 	vap->va_mode = de->de_mode;
 	/* Size: link target length, one DEV_BSIZE block for dirs, else 0. */
 	if (vp->v_type == VLNK)
 		vap->va_size = strlen(de->de_symlink);
 	else if (vp->v_type == VDIR)
 		vap->va_size = vap->va_bytes = DEV_BSIZE;
 	else
 		vap->va_size = 0;
 	if (vp->v_type != VDIR)
 		vap->va_bytes = 0;
 	vap->va_blocksize = DEV_BSIZE;
 	vap->va_type = vp->v_type;
 
 /*
  * Replace, in place, a timestamp whose seconds field is at most 3600 with
  * the system boot time.
  */
 #define fix(aa)							\
 	do {							\
 		if ((aa).tv_sec <= 3600) {			\
 			(aa).tv_sec = boottime.tv_sec;		\
 			(aa).tv_nsec = boottime.tv_usec * 1000; \
 		}						\
 	} while (0)
 
 	/* Character devices keep their timestamps in the cdev itself. */
 	if (vp->v_type != VCHR)  {
 		fix(de->de_atime);
 		vap->va_atime = de->de_atime;
 		fix(de->de_mtime);
 		vap->va_mtime = de->de_mtime;
 		fix(de->de_ctime);
 		vap->va_ctime = de->de_ctime;
 	} else {
 		dev = vp->v_rdev;
 		fix(dev->si_atime);
 		vap->va_atime = dev->si_atime;
 		fix(dev->si_mtime);
 		vap->va_mtime = dev->si_mtime;
 		fix(dev->si_ctime);
 		vap->va_ctime = dev->si_ctime;
 
 		vap->va_rdev = cdev2priv(dev)->cdp_inode;
 	}
 	vap->va_gen = 0;
 	vap->va_flags = 0;
 	vap->va_filerev = 0;
 	vap->va_nlink = de->de_links;
 	vap->va_fileid = de->de_inode;
 
 	/* error is still 0 here: the only failure path returned above. */
 	return (error);
 }
 
 /*
  * File-level ioctl for devfs files.
  *
  * If devfs_fp_check() fails, the request is passed on to the generic vnode
  * fo_ioctl.  FIODTYPE and FIODGNAME are answered here from the cdevsw flags
  * and the device name; everything else goes to the driver's d_ioctl, with
  * ENOIOCTL translated to ENOTTY.  After a successful TIOCSCTTY the vnode
  * and cdev are recorded as the session's controlling terminal and the
  * reference on the previous terminal vnode is released.
  *
  * The td_fpop value saved on entry is restored on every path that got past
  * devfs_fp_check(), as the other devfs file operations do.
  */
 /* ARGSUSED */
 static int
 devfs_ioctl_f(struct file *fp, u_long com, void *data, struct ucred *cred, struct thread *td)
 {
 	struct cdev *dev;
 	struct cdevsw *dsw;
 	struct vnode *vp;
 	struct vnode *vpold;
 	int error, i, ref;
 	const char *p;
 	struct fiodgname_arg *fgn;
 	struct file *fpop;
 
 	fpop = td->td_fpop;
 	error = devfs_fp_check(fp, &dev, &dsw, &ref);
 	if (error != 0) {
 		error = vnops.fo_ioctl(fp, com, data, cred, td);
 		return (error);
 	}
 
 	if (com == FIODTYPE) {
 		*(int *)data = dsw->d_flags & D_TYPEMASK;
 		td->td_fpop = fpop;
 		dev_relthread(dev, ref);
 		return (0);
 	} else if (com == FIODGNAME) {
 		fgn = data;
 		p = devtoname(dev);
 		/* The name is copied out including its terminating NUL. */
 		i = strlen(p) + 1;
 		if (i > fgn->len)
 			error = EINVAL;
 		else
 			error = copyout(p, fgn->buf, i);
 		td->td_fpop = fpop;
 		dev_relthread(dev, ref);
 		return (error);
 	}
 	error = dsw->d_ioctl(dev, com, data, fp->f_flag, td);
 	/*
 	 * Put back the caller's td_fpop rather than clearing it, so that an
 	 * enclosing file operation on this thread keeps its own file.
 	 */
 	td->td_fpop = fpop;
 	dev_relthread(dev, ref);
 	if (error == ENOIOCTL)
 		error = ENOTTY;
 	if (error == 0 && com == TIOCSCTTY) {
 		vp = fp->f_vnode;
 
 		/* Do nothing if reassigning same control tty */
 		sx_slock(&proctree_lock);
 		if (td->td_proc->p_session->s_ttyvp == vp) {
 			sx_sunlock(&proctree_lock);
 			return (0);
 		}
 
 		vpold = td->td_proc->p_session->s_ttyvp;
 		VREF(vp);
 		SESS_LOCK(td->td_proc->p_session);
 		td->td_proc->p_session->s_ttyvp = vp;
 		td->td_proc->p_session->s_ttydp = cdev2priv(dev);
 		SESS_UNLOCK(td->td_proc->p_session);
 
 		sx_sunlock(&proctree_lock);
 
 		/* Get rid of reference to old control tty */
 		if (vpold)
 			vrele(vpold);
 	}
 	return (error);
 }
 
 /*
  * File-level kqfilter: validate the file with devfs_fp_check() and hand the
  * knote to the driver's d_kqfilter.  The td_fpop value seen on entry is put
  * back before the device thread reference is released.
  */
 /* ARGSUSED */
 static int
 devfs_kqfilter_f(struct file *fp, struct knote *kn)
 {
 	struct thread *self = curthread;
 	struct file *saved_fpop = self->td_fpop;
 	struct cdevsw *csw;
 	struct cdev *cdev;
 	int rv, tref;
 
 	rv = devfs_fp_check(fp, &cdev, &csw, &tref);
 	if (rv != 0)
 		return (rv);
 
 	rv = csw->d_kqfilter(cdev, kn);
 	self->td_fpop = saved_fpop;
 	dev_relthread(cdev, tref);
 	return (rv);
 }
 
 /*
  * Decide whether the thread may see the device behind a dirent.
  *
  * Returns 0 when the dirent has no cdev, the cdev has no credential, or
  * prison_check() accepts the thread's credential against the device's.
  * Otherwise the prison_check() error is returned, unless the device is the
  * controlling terminal of the thread's session, which is always allowed.
  */
 static inline int
 devfs_prison_check(struct devfs_dirent *de, struct thread *td)
 {
 	struct cdev_priv *priv;
 	struct ucred *devcred;
 	struct proc *proc;
 	int rv;
 
 	priv = de->de_cdp;
 	if (priv == NULL)
 		return (0);
 	devcred = priv->cdp_c.si_cred;
 	if (devcred == NULL)
 		return (0);
 
 	rv = prison_check(td->td_ucred, devcred);
 	if (rv == 0)
 		return (0);
 
 	/* We do, however, allow access to the controlling terminal */
 	proc = td->td_proc;
 	PROC_LOCK(proc);
 	if ((proc->p_flag & P_CONTROLT) != 0 &&
 	    proc->p_session->s_ttydp == priv)
 		rv = 0;
 	PROC_UNLOCK(proc);
 	return (rv);
 }
 
 /*
  * Worker for devfs_lookup(): resolve one path component in directory
  * ap->a_dvp and, on success, return its vnode in *ap->a_vpp.
  *
  * Handles ".", "..", names already present in the directory, and names
  * that a dev_clone event handler creates on demand.
  *
  * *dm_unlock is set to 0 on every path where the caller must not unlock
  * dm_lock afterwards; it is left untouched otherwise (devfs_lookup() sets
  * it to 1 beforehand and unlocks when it is still 1).
  */
 static int
 devfs_lookupx(struct vop_lookup_args *ap, int *dm_unlock)
 {
 	struct componentname *cnp;
 	struct vnode *dvp, **vpp;
 	struct thread *td;
 	struct devfs_dirent *de, *dd;
 	struct devfs_dirent **dde;
 	struct devfs_mount *dmp;
 	struct cdev *cdev;
 	int error, flags, nameiop, dvplocked;
 	char specname[SPECNAMELEN + 1], *pname;
 
 	cnp = ap->a_cnp;
 	vpp = ap->a_vpp;
 	dvp = ap->a_dvp;
 	pname = cnp->cn_nameptr;
 	td = cnp->cn_thread;
 	flags = cnp->cn_flags;
 	nameiop = cnp->cn_nameiop;
 	dmp = VFSTODEVFS(dvp->v_mount);
 	dd = dvp->v_data;
 	*vpp = NULLVP;
 
 	/* Rename is not supported as the final operation. */
 	if ((flags & ISLASTCN) && nameiop == RENAME)
 		return (EOPNOTSUPP);
 
 	if (dvp->v_type != VDIR)
 		return (ENOTDIR);
 
 	/* ".." from the root vnode of the mount is refused here. */
 	if ((flags & ISDOTDOT) && (dvp->v_vflag & VV_ROOT))
 		return (EIO);
 
 	error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, td);
 	if (error)
 		return (error);
 
 	/* ".": return the directory itself with an extra reference. */
 	if (cnp->cn_namelen == 1 && *pname == '.') {
 		if ((flags & ISLASTCN) && nameiop != LOOKUP)
 			return (EINVAL);
 		*vpp = dvp;
 		VREF(dvp);
 		return (0);
 	}
 
 	/*
 	 * "..": dvp is unlocked while the parent's vnode is obtained and
 	 * relocked in its previous mode afterwards.
 	 */
 	if (flags & ISDOTDOT) {
 		if ((flags & ISLASTCN) && nameiop != LOOKUP)
 			return (EINVAL);
 		de = devfs_parent_dirent(dd);
 		if (de == NULL)
 			return (ENOENT);
 		dvplocked = VOP_ISLOCKED(dvp);
 		VOP_UNLOCK(dvp, 0);
 		error = devfs_allocv(de, dvp->v_mount,
 		    cnp->cn_lkflags & LK_TYPE_MASK, vpp);
 		/* NOTE(review): devfs_allocv() presumably drops dm_lock. */
 		*dm_unlock = 0;
 		vn_lock(dvp, dvplocked | LK_RETRY);
 		return (error);
 	}
 
 	dd = dvp->v_data;
 	de = devfs_find(dd, cnp->cn_nameptr, cnp->cn_namelen, 0);
 	while (de == NULL) {	/* While(...) so we can use break */
 
 		if (nameiop == DELETE)
 			return (ENOENT);
 
 		/*
 		 * OK, we didn't have an entry for the name we were asked for
 		 * so we try to see if anybody can create it on demand.
 		 */
 		pname = devfs_fqpn(specname, dmp, dd, cnp);
 		if (pname == NULL)
 			break;
 
 		/*
 		 * The mount is held across the window in which dm_lock is
 		 * released to run the dev_clone handlers.
 		 */
 		cdev = NULL;
 		DEVFS_DMP_HOLD(dmp);
 		sx_xunlock(&dmp->dm_lock);
 		sx_slock(&clone_drain_lock);
 		EVENTHANDLER_INVOKE(dev_clone,
 		    td->td_ucred, pname, strlen(pname), &cdev);
 		sx_sunlock(&clone_drain_lock);
 
 		/*
 		 * No device was created: just retake dm_lock.  A device was
 		 * created: repopulate the directory; if that fails, drop the
 		 * hold (finishing the unmount when it was the last one) and
 		 * release the new device.
 		 */
 		if (cdev == NULL)
 			sx_xlock(&dmp->dm_lock);
 		else if (devfs_populate_vp(dvp) != 0) {
 			*dm_unlock = 0;
 			sx_xlock(&dmp->dm_lock);
 			if (DEVFS_DMP_DROP(dmp)) {
 				sx_xunlock(&dmp->dm_lock);
 				devfs_unmount_final(dmp);
 			} else
 				sx_xunlock(&dmp->dm_lock);
 			dev_rel(cdev);
 			return (ENOENT);
 		}
 		/* Drop our hold; if it was the last one, finish the unmount. */
 		if (DEVFS_DMP_DROP(dmp)) {
 			*dm_unlock = 0;
 			sx_xunlock(&dmp->dm_lock);
 			devfs_unmount_final(dmp);
 			if (cdev != NULL)
 				dev_rel(cdev);
 			return (ENOENT);
 		}
 
 		if (cdev == NULL)
 			break;
 
 		/*
 		 * Pick up this mount's dirent for the new device, if any.
 		 * NOTE(review): dde is the address of an array slot, so the
 		 * dde != NULL test looks redundant.
 		 */
 		dev_lock();
 		dde = &cdev2priv(cdev)->cdp_dirents[dmp->dm_idx];
 		if (dde != NULL && *dde != NULL)
 			de = *dde;
 		dev_unlock();
 		dev_rel(cdev);
 		break;
 	}
 
 	/* Missing or whited-out entry: allow creation to proceed, else fail. */
 	if (de == NULL || de->de_flags & DE_WHITEOUT) {
 		if ((nameiop == CREATE || nameiop == RENAME) &&
 		    (flags & (LOCKPARENT | WANTPARENT)) && (flags & ISLASTCN)) {
 			cnp->cn_flags |= SAVENAME;
 			return (EJUSTRETURN);
 		}
 		return (ENOENT);
 	}
 
 	/* Entries refused by the prison check are reported as absent. */
 	if (devfs_prison_check(de, td))
 		return (ENOENT);
 
 	if ((cnp->cn_nameiop == DELETE) && (flags & ISLASTCN)) {
 		error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td);
 		if (error)
 			return (error);
 		/*
 		 * NOTE(review): *vpp was set to NULLVP on entry and is not
 		 * assigned on any path reaching this point, so this branch
 		 * looks unreachable.
 		 */
 		if (*vpp == dvp) {
 			VREF(dvp);
 			*vpp = dvp;
 			return (0);
 		}
 	}
 	error = devfs_allocv(de, dvp->v_mount, cnp->cn_lkflags & LK_TYPE_MASK,
 	    vpp);
 	/* NOTE(review): devfs_allocv() presumably drops dm_lock. */
 	*dm_unlock = 0;
 	return (error);
 }
 
 /*
  * VOP_LOOKUP entry point: populate the directory, run devfs_lookupx() and
  * release dm_lock afterwards unless devfs_lookupx() reported, by clearing
  * the flag, that no unlock is needed.  A failed populate yields ENOTDIR.
  */
 static int
 devfs_lookup(struct vop_lookup_args *ap)
 {
 	struct devfs_mount *dmp;
 	int need_unlock, rv;
 
 	if (devfs_populate_vp(ap->a_dvp) != 0)
 		return (ENOTDIR);
 
 	dmp = VFSTODEVFS(ap->a_dvp->v_mount);
 	need_unlock = 1;
 	rv = devfs_lookupx(ap, &need_unlock);
 	if (need_unlock == 1)
 		sx_xunlock(&dmp->dm_lock);
 	return (rv);
 }
 
 /*
  * VOP_MKNOD for devfs: the only thing that can be "created" is a character
  * device entry that already exists in the directory but is whited out.
  * Such an entry has its whiteout flag cleared and a vnode allocated for it.
  *
  * Returns EOPNOTSUPP for anything but VCHR, ENOENT when the name does not
  * match a whited-out entry, otherwise the result of devfs_allocv().
  */
 static int
 devfs_mknod(struct vop_mknod_args *ap)
 {
 	struct componentname *cnp;
 	struct vnode *dvp, **vpp;
 	struct devfs_dirent *dd, *de;
 	struct devfs_mount *dmp;
 	int error;
 
 	/*
 	 * The only type of node we should be creating here is a
 	 * character device, for anything else return EOPNOTSUPP.
 	 */
 	if (ap->a_vap->va_type != VCHR)
 		return (EOPNOTSUPP);
 	dvp = ap->a_dvp;
 	dmp = VFSTODEVFS(dvp->v_mount);
 
 	cnp = ap->a_cnp;
 	vpp = ap->a_vpp;
 	dd = dvp->v_data;
 
 	/* ENOENT is what every "notfound" exit below returns. */
 	error = ENOENT;
 	sx_xlock(&dmp->dm_lock);
 	TAILQ_FOREACH(de, &dd->de_dlist, de_list) {
 		if (cnp->cn_namelen != de->de_dirent->d_namlen)
 			continue;
 		/* Character devices that are no longer active are skipped. */
 		if (de->de_dirent->d_type == DT_CHR &&
 		    (de->de_cdp->cdp_flags & CDP_ACTIVE) == 0)
 			continue;
 		if (bcmp(cnp->cn_nameptr, de->de_dirent->d_name,
 		    de->de_dirent->d_namlen) != 0)
 			continue;
 		/* Name matches: usable only if the entry is whited out. */
 		if (de->de_flags & DE_WHITEOUT)
 			break;
 		goto notfound;
 	}
 	/* The loop ran to completion without a match. */
 	if (de == NULL)
 		goto notfound;
 	de->de_flags &= ~DE_WHITEOUT;
 	/*
 	 * NOTE(review): dm_lock is not released on this path; presumably
 	 * devfs_allocv() drops it -- confirm.
 	 */
 	error = devfs_allocv(de, dvp->v_mount, LK_EXCLUSIVE, vpp);
 	return (error);
 notfound:
 	sx_xunlock(&dmp->dm_lock);
 	return (error);
 }
 
 /*
  * VOP_OPEN for device vnodes.
  *
  * Calls the driver's d_fdopen (when provided) or d_open with the vnode
  * unlocked and ap->a_fp installed as the thread's td_fpop.  On success, and
  * when a file was supplied, the file is switched to devfs_ops_f if it still
  * has badfileops, and is flagged FDEVFS_VNODE.
  *
  * Returns ENXIO for block vnodes, a missing or unreferenceable device, or a
  * d_fdopen driver opened without a file; otherwise the driver's error, with
  * ERESTART mapped to EINTR.
  */
 /* ARGSUSED */
 static int
 devfs_open(struct vop_open_args *ap)
 {
 	struct thread *td = ap->a_td;
 	struct vnode *vp = ap->a_vp;
 	struct cdev *dev = vp->v_rdev;
 	struct file *fp = ap->a_fp;
 	int error, ref, vlocked;
 	struct cdevsw *dsw;
 	struct file *fpop;
 	struct mtx *mtxp;
 
 	if (vp->v_type == VBLK)
 		return (ENXIO);
 
 	if (dev == NULL)
 		return (ENXIO);
 
 	/* Make this field valid before any I/O in d_open. */
 	if (dev->si_iosize_max == 0)
 		dev->si_iosize_max = DFLTPHYS;
 
 	dsw = dev_refthread(dev, &ref);
 	if (dsw == NULL)
 		return (ENXIO);
 	/* d_fdopen is handed the file, so it cannot be used without one. */
 	if (fp == NULL && dsw->d_fdopen != NULL) {
 		dev_relthread(dev, ref);
 		return (ENXIO);
 	}
 
 	/* Drop the vnode lock around the driver call; remember its mode. */
 	vlocked = VOP_ISLOCKED(vp);
 	VOP_UNLOCK(vp, 0);
 
 	fpop = td->td_fpop;
 	td->td_fpop = fp;
 	if (fp != NULL) {
 		fp->f_data = dev;
 		fp->f_vnode = vp;
 	}
 	if (dsw->d_fdopen != NULL)
 		error = dsw->d_fdopen(dev, ap->a_mode, td, fp);
 	else
 		error = dsw->d_open(dev, ap->a_mode, S_IFCHR, td);
 	/* cleanup any cdevpriv upon error */
 	if (error != 0)
 		devfs_clear_cdevpriv();
 	td->td_fpop = fpop;
 
 	/* Retake the vnode lock in the mode it was held on entry. */
 	vn_lock(vp, vlocked | LK_RETRY);
 	dev_relthread(dev, ref);
 	if (error != 0) {
 		if (error == ERESTART)
 			error = EINTR;
 		return (error);
 	}
 
 #if 0	/* /dev/console */
 	KASSERT(fp != NULL, ("Could not vnode bypass device on NULL fp"));
 #else
 	/* Opens without a file have nothing further to set up. */
 	if (fp == NULL)
 		return (error);
 #endif
 	if (fp->f_ops == &badfileops)
 		finit(fp, fp->f_flag, DTYPE_VNODE, dev, &devfs_ops_f);
 	mtxp = mtx_pool_find(mtxpool_sleep, fp);
 
 	/*
 	 * Hint to the dofilewrite() to not force the buffer draining
 	 * on the writer to the file.  Most likely, the write would
 	 * not need normal buffers.
 	 */
 	mtx_lock(mtxp);
 	fp->f_vnread_flags |= FDEVFS_VNODE;
 	mtx_unlock(mtxp);
 	return (error);
 }
 
 /*
  * VOP_PATHCONF for devfs.  Only _PC_MAC_PRESENT is answered here; every
  * other name is passed to vop_stdpathconf().
  */
 static int
 devfs_pathconf(struct vop_pathconf_args *ap)
 {
 
 	if (ap->a_name != _PC_MAC_PRESENT)
 		return (vop_stdpathconf(ap));
 
 #ifdef MAC
 	/*
 	 * With MAC compiled in, devfs automatically supports trivial
 	 * non-persistent label storage.
 	 */
 	*ap->a_retval = 1;
 #else
 	*ap->a_retval = 0;
 #endif
 	return (0);
 }
 
 /*
  * File-level poll: ask the driver's d_poll, or fall back to the generic
  * vnode fo_poll when devfs_fp_check() rejects the file.  The td_fpop value
  * seen on entry is put back after the driver call.
  */
 /* ARGSUSED */
 static int
 devfs_poll_f(struct file *fp, int events, struct ucred *cred, struct thread *td)
 {
 	struct file *saved_fpop = td->td_fpop;
 	struct cdevsw *csw;
 	struct cdev *cdev;
 	int revents, tref;
 
 	if (devfs_fp_check(fp, &cdev, &csw, &tref) != 0)
 		return (vnops.fo_poll(fp, events, cred, td));
 
 	revents = csw->d_poll(cdev, events, td);
 	td->td_fpop = saved_fpop;
 	dev_relthread(cdev, tref);
 	return (revents);
 }
 
 /*
  * VOP_PRINT for a special device vnode: print the name of the device it
  * refers to.  Always returns 0.
  */
 static int
 devfs_print(struct vop_print_args *ap)
 {
 	struct cdev *cdev = ap->a_vp->v_rdev;
 
 	printf("\tdev %s\n", devtoname(cdev));
 	return (0);
 }
 
 /*
  * File-level read for devfs files.
  *
  * Requests larger than DEVFS_IOSIZE_MAX fail with EINVAL.  If
  * devfs_fp_check() rejects the file the generic vnode fo_read is used.
  * Otherwise the driver's d_read is called and the device access time is
  * updated when data moved or a non-empty read succeeded.
  */
 static int
 devfs_read_f(struct file *fp, struct uio *uio, struct ucred *cred,
     int flags, struct thread *td)
 {
 	struct file *saved_fpop;
 	struct cdevsw *csw;
 	struct cdev *cdev;
 	ssize_t resid_before;
 	int drv_flags, rv, tref;
 
 	if (uio->uio_resid > DEVFS_IOSIZE_MAX)
 		return (EINVAL);
 
 	saved_fpop = td->td_fpop;
 	if (devfs_fp_check(fp, &cdev, &csw, &tref) != 0)
 		return (vnops.fo_read(fp, uio, cred, flags, td));
 
 	resid_before = uio->uio_resid;
 
 	/* Pass the O_* bits down and add the matching IO_DIRECT flag. */
 	drv_flags = fp->f_flag & (O_NONBLOCK | O_DIRECT);
 	if ((drv_flags & O_DIRECT) != 0)
 		drv_flags |= IO_DIRECT;
 
 	foffset_lock_uio(fp, uio, flags | FOF_NOLOCK);
 	rv = csw->d_read(cdev, uio, drv_flags);
 	if (uio->uio_resid != resid_before ||
 	    (rv == 0 && resid_before != 0))
 		devfs_timestamp(&cdev->si_atime);
 	td->td_fpop = saved_fpop;
 	dev_relthread(cdev, tref);
 
 	foffset_unlock_uio(fp, uio, flags | FOF_NOLOCK | FOF_NEXTOFF);
 	return (rv);
 }
 
 /*
  * VOP_READDIR for devfs directories.
  *
  * Walks the directory's dirent list and hands each visible entry at or past
  * uio_offset to vfs_read_dirent().  Covered and whited-out entries, and
  * entries refused by devfs_prison_check(), are skipped.  Cookies are not
  * supported: *a_ncookies is set to 0 when the caller asked for them.
  *
  * Returns ENOTDIR for non-directories, EINVAL for a negative offset, EIO if
  * the directory cannot be populated, else the vfs_read_dirent() status.
  */
 static int
 devfs_readdir(struct vop_readdir_args *ap)
 {
 	int error;
 	struct uio *uio;
 	struct dirent *dp;
 	struct devfs_dirent *dd;
 	struct devfs_dirent *de;
 	struct devfs_mount *dmp;
 	off_t off;
 	int *tmp_ncookies = NULL;
 
 	if (ap->a_vp->v_type != VDIR)
 		return (ENOTDIR);
 
 	uio = ap->a_uio;
 	if (uio->uio_offset < 0)
 		return (EINVAL);
 
 	/*
 	 * XXX: This is a temporary hack to get around this filesystem not
 	 * supporting cookies. We store the location of the ncookies pointer
 	 * in a temporary variable before calling vfs_subr.c:vfs_read_dirent()
 	 * and set the number of cookies to 0. We then set the pointer to
 	 * NULL so that vfs_read_dirent doesn't try to call realloc() on
 	 * ap->a_cookies. Later in this function, we restore the ap->a_ncookies
 	 * pointer to its original location before returning to the caller.
 	 */
 	if (ap->a_ncookies != NULL) {
 		tmp_ncookies = ap->a_ncookies;
 		*ap->a_ncookies = 0;
 		ap->a_ncookies = NULL;
 	}
 
 	dmp = VFSTODEVFS(ap->a_vp->v_mount);
 	if (devfs_populate_vp(ap->a_vp) != 0) {
 		if (tmp_ncookies != NULL)
 			ap->a_ncookies = tmp_ncookies;
 		return (EIO);
 	}
 	error = 0;
 	de = ap->a_vp->v_data;
 	off = 0;
 	TAILQ_FOREACH(dd, &de->de_dlist, de_list) {
 		KASSERT(dd->de_cdp != (void *)0xdeadc0de, ("%s %d\n", __func__, __LINE__));
 		if (dd->de_flags & (DE_COVERED | DE_WHITEOUT))
 			continue;
 		if (devfs_prison_check(dd, uio->uio_td))
 			continue;
 		/*
 		 * From here on de names the dirent supplying the inode
 		 * number: de_dir for subdirectories, the entry itself
 		 * otherwise.
 		 */
 		if (dd->de_dirent->d_type == DT_DIR)
 			de = dd->de_dir;
 		else
 			de = dd;
 		dp = dd->de_dirent;
 		/* Stop once the next record no longer fits in the buffer. */
 		if (dp->d_reclen > uio->uio_resid)
 			break;
 		dp->d_fileno = de->de_inode;
 		/* Records before the requested offset are only counted. */
 		if (off >= uio->uio_offset) {
 			error = vfs_read_dirent(ap, dp, off);
 			if (error)
 				break;
 		}
 		off += dp->d_reclen;
 	}
 	/*
 	 * NOTE(review): dm_lock appears to be held on return from a
 	 * successful devfs_populate_vp(); it is released here.
 	 */
 	sx_xunlock(&dmp->dm_lock);
 	uio->uio_offset = off;
 
 	/*
 	 * Restore ap->a_ncookies if it wasn't originally NULL in the first
 	 * place.
 	 */
 	if (tmp_ncookies != NULL)
 		ap->a_ncookies = tmp_ncookies;
 
 	return (error);
 }
 
 /*
  * VOP_READLINK: copy the symlink target stored in the vnode's dirent to the
  * caller's uio, without a terminating NUL.
  */
 static int
 devfs_readlink(struct vop_readlink_args *ap)
 {
 	struct devfs_dirent *linkent = ap->a_vp->v_data;
 	struct uio *uio = ap->a_uio;
 
 	return (uiomove(linkent->de_symlink, strlen(linkent->de_symlink),
 	    uio));
 }
 
 /*
  * VOP_RECLAIM for devfs vnodes: break the link between the vnode and its
  * dirent, destroy the VM object, and detach the vnode from its cdev (if
  * any), subtracting the vnode's use count from the device's and releasing
  * the vnode's reference on the device.  Always returns 0.
  */
 static int
 devfs_reclaim(struct vop_reclaim_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct devfs_dirent *dirent;
 	struct cdev *cdev;
 
 	/* Unhook dirent and vnode from each other under the interlock. */
 	mtx_lock(&devfs_de_interlock);
 	dirent = vp->v_data;
 	if (dirent != NULL) {
 		dirent->de_vnode = NULL;
 		vp->v_data = NULL;
 	}
 	mtx_unlock(&devfs_de_interlock);
 
 	vnode_destroy_vobject(vp);
 
 	/* Detach the device while holding both vnode and device locks. */
 	VI_LOCK(vp);
 	dev_lock();
 	cdev = vp->v_rdev;
 	vp->v_rdev = NULL;
 	if (cdev != NULL)
 		cdev->si_usecount -= vp->v_usecount;
 	dev_unlock();
 	VI_UNLOCK(vp);
 
 	if (cdev != NULL)
 		dev_rel(cdev);
 	return (0);
 }
 
 /*
  * VOP_REMOVE for devfs.
  *
  * An entry with no cdev behind it (de_cdp == NULL) is unlinked from its
  * directory and deleted; if it was a symlink, an entry of the same name
  * that it covered becomes visible again.  An entry backed by a cdev is not
  * deleted but only marked as a whiteout.  Always returns 0.
  */
 static int
 devfs_remove(struct vop_remove_args *ap)
 {
 	struct vnode *dvp = ap->a_dvp;
 	struct vnode *vp = ap->a_vp;
 	struct devfs_dirent *dd;
 	struct devfs_dirent *de, *de_covered;
 	struct devfs_mount *dmp = VFSTODEVFS(vp->v_mount);
 
 	ASSERT_VOP_ELOCKED(dvp, "devfs_remove");
 	ASSERT_VOP_ELOCKED(vp, "devfs_remove");
 
 	sx_xlock(&dmp->dm_lock);
 	dd = ap->a_dvp->v_data;
 	de = vp->v_data;
 	if (de->de_cdp == NULL) {
 		TAILQ_REMOVE(&dd->de_dlist, de, de_list);
 		if (de->de_dirent->d_type == DT_LNK) {
 			/* Uncover a same-named entry hidden by the link. */
 			de_covered = devfs_find(dd, de->de_dirent->d_name,
 			    de->de_dirent->d_namlen, 0);
 			if (de_covered != NULL)
 				de_covered->de_flags &= ~DE_COVERED;
 		}
 		/* We need to unlock dvp because devfs_delete() may lock it. */
 		VOP_UNLOCK(vp, 0);
 		if (dvp != vp)
 			VOP_UNLOCK(dvp, 0);
 		devfs_delete(dmp, de, 0);
 		sx_xunlock(&dmp->dm_lock);
 		/* Retake the vnode locks, parent first, before returning. */
 		if (dvp != vp)
 			vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	} else {
 		de->de_flags |= DE_WHITEOUT;
 		sx_xunlock(&dmp->dm_lock);
 	}
 	return (0);
 }
 
 /*
  * Revoke is called on a tty when a terminal session ends.  The vnode
  * is orphaned by setting v_op to deadfs so we need to let go of it
  * as well so that we create a new one next time around.
  *
  * Besides the vnode passed in, every vnode attached to one of the device's
  * dirents (one slot per cdp_dirents index) is also vgone()d.  The device's
  * cdp_inuse count is raised for the duration; if the device is no longer
  * active and the count drops back to zero, the device is removed from
  * cdevp_list and released.  Always returns 0 with vp locked exclusively.
  */
 static int
 devfs_revoke(struct vop_revoke_args *ap)
 {
 	struct vnode *vp = ap->a_vp, *vp2;
 	struct cdev *dev;
 	struct cdev_priv *cdp;
 	struct devfs_dirent *de;
 	int i;
 
 	KASSERT((ap->a_flags & REVOKEALL) != 0, ("devfs_revoke !REVOKEALL"));
 
 	dev = vp->v_rdev;
 	cdp = cdev2priv(dev);
  
 	dev_lock();
 	cdp->cdp_inuse++;
 	dev_unlock();
 
 	/* Hold vp across vgone() so the vnode itself stays around. */
 	vhold(vp);
 	vgone(vp);
 	vdrop(vp);
 
 	VOP_UNLOCK(vp,0);
  loop:
 	/*
 	 * Each pass finds the first dirent that still has a vnode, drops
 	 * the locks, revokes that vnode and rescans from the start.  The
 	 * scan ends when a full pass finds no vnode.
 	 */
 	for (;;) {
 		mtx_lock(&devfs_de_interlock);
 		dev_lock();
 		vp2 = NULL;
 		for (i = 0; i <= cdp->cdp_maxdirent; i++) {
 			de = cdp->cdp_dirents[i];
 			if (de == NULL)
 				continue;
 
 			vp2 = de->de_vnode;
 			if (vp2 != NULL) {
 				/*
 				 * Take the vnode interlock before letting
 				 * go of devfs_de_interlock; vget() consumes
 				 * the interlock (LK_INTERLOCK).
 				 */
 				dev_unlock();
 				VI_LOCK(vp2);
 				mtx_unlock(&devfs_de_interlock);
 				if (vget(vp2, LK_EXCLUSIVE | LK_INTERLOCK,
 				    curthread))
 					goto loop;
 				vhold(vp2);
 				vgone(vp2);
 				vdrop(vp2);
 				vput(vp2);
 				break;
 			} 
 		}
 		/* A vnode was handled and both locks are dropped: rescan. */
 		if (vp2 != NULL) {
 			continue;
 		}
 		dev_unlock();
 		mtx_unlock(&devfs_de_interlock);
 		break;
 	}
 	dev_lock();
 	cdp->cdp_inuse--;
 	if (!(cdp->cdp_flags & CDP_ACTIVE) && cdp->cdp_inuse == 0) {
 		TAILQ_REMOVE(&cdevp_list, cdp, cdp_list);
 		dev_unlock();
 		dev_rel(&cdp->cdp_c);
 	} else
 		dev_unlock();
 
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	return (0);
 }
 
 /*
  * VOP_IOCTL for non-device devfs vnodes: forwards the request to
  * devfs_rules_ioctl() with dm_lock held exclusively.
  *
  * Returns EBADF if the vnode is doomed, ENOENT if the mount went away
  * while it was being populated, else the devfs_rules_ioctl() result.
  */
 static int
 devfs_rioctl(struct vop_ioctl_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct devfs_mount *dmp;
 	int rv;
 
 	/* The vnode lock is only needed to reach the mount safely. */
 	vn_lock(vp, LK_SHARED | LK_RETRY);
 	if ((vp->v_iflag & VI_DOOMED) != 0) {
 		VOP_UNLOCK(vp, 0);
 		return (EBADF);
 	}
 	dmp = VFSTODEVFS(vp->v_mount);
 	sx_xlock(&dmp->dm_lock);
 	VOP_UNLOCK(vp, 0);
 
 	/* Keep the mount held while populating it. */
 	DEVFS_DMP_HOLD(dmp);
 	devfs_populate(dmp);
 	if (DEVFS_DMP_DROP(dmp)) {
 		/* Ours was the last hold: finish the unmount. */
 		sx_xunlock(&dmp->dm_lock);
 		devfs_unmount_final(dmp);
 		return (ENOENT);
 	}
 
 	rv = devfs_rules_ioctl(dmp, ap->a_command, ap->a_data, ap->a_td);
 	sx_xunlock(&dmp->dm_lock);
 	return (rv);
 }
 
 static int
 devfs_rread(struct vop_read_args *ap)
 {
 
 	if (ap->a_vp->v_type != VDIR)
 		return (EINVAL);
 	return (VOP_READDIR(ap->a_vp, ap->a_uio, ap->a_cred, NULL, NULL, NULL));
 }
 
 /*
  * VOP_SETATTR for devfs vnodes: change owner, group, mode and access or
  * modification times.
  *
  * Returns EINVAL if the request tries to set any other attribute (flags
  * may be given only as 0), the error from priv_check() or vn_utimes_perm()
  * when the caller lacks permission, otherwise 0.
  */
 static int
 devfs_setattr(struct vop_setattr_args *ap)
 {
 	struct devfs_dirent *de;
 	struct vattr *vap;
 	struct vnode *vp;
 	struct thread *td;
 	int c, error;
 	uid_t uid;
 	gid_t gid;
 
 	vap = ap->a_vap;
 	vp = ap->a_vp;
 	td = curthread;
 	/* Reject requests that touch attributes devfs cannot change. */
 	if ((vap->va_type != VNON) ||
 	    (vap->va_nlink != VNOVAL) ||
 	    (vap->va_fsid != VNOVAL) ||
 	    (vap->va_fileid != VNOVAL) ||
 	    (vap->va_blocksize != VNOVAL) ||
 	    (vap->va_flags != VNOVAL && vap->va_flags != 0) ||
 	    (vap->va_rdev != VNOVAL) ||
 	    ((int)vap->va_bytes != VNOVAL) ||
 	    (vap->va_gen != VNOVAL)) {
 		return (EINVAL);
 	}
 
 	de = vp->v_data;
 	if (vp->v_type == VDIR)
 		de = de->de_dir;
 
 	/* c records whether anything was changed. */
 	error = c = 0;
 	if (vap->va_uid == (uid_t)VNOVAL)
 		uid = de->de_uid;
 	else
 		uid = vap->va_uid;
 	if (vap->va_gid == (gid_t)VNOVAL)
 		gid = de->de_gid;
 	else
 		gid = vap->va_gid;
 	if (uid != de->de_uid || gid != de->de_gid) {
 		/*
 		 * Without PRIV_VFS_CHOWN only the current owner may change
 		 * ownership, and then only the group, to one the caller is
 		 * a member of.
 		 */
 		if ((ap->a_cred->cr_uid != de->de_uid) || uid != de->de_uid ||
 		    (gid != de->de_gid && !groupmember(gid, ap->a_cred))) {
 			error = priv_check(td, PRIV_VFS_CHOWN);
 			if (error)
 				return (error);
 		}
 		de->de_uid = uid;
 		de->de_gid = gid;
 		c = 1;
 	}
 
 	if (vap->va_mode != (mode_t)VNOVAL) {
 		/* Only the owner may change the mode without privilege. */
 		if (ap->a_cred->cr_uid != de->de_uid) {
 			error = priv_check(td, PRIV_VFS_ADMIN);
 			if (error)
 				return (error);
 		}
 		de->de_mode = vap->va_mode;
 		c = 1;
 	}
 
 	if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) {
 		error = vn_utimes_perm(vp, vap, ap->a_cred, td);
 		if (error != 0)
 			return (error);
 		/* Character devices keep their times in the cdev. */
 		if (vap->va_atime.tv_sec != VNOVAL) {
 			if (vp->v_type == VCHR)
 				vp->v_rdev->si_atime = vap->va_atime;
 			else
 				de->de_atime = vap->va_atime;
 		}
 		if (vap->va_mtime.tv_sec != VNOVAL) {
 			if (vp->v_type == VCHR)
 				vp->v_rdev->si_mtime = vap->va_mtime;
 			else
 				de->de_mtime = vap->va_mtime;
 		}
 		c = 1;
 	}
 
 	/*
 	 * Something changed: stamp the entry.
 	 * NOTE(review): for non-VCHR vnodes this stamps de_mtime rather
 	 * than de_ctime, overwriting an mtime set just above -- confirm
 	 * this is intended.
 	 */
 	if (c) {
 		if (vp->v_type == VCHR)
 			vfs_timestamp(&vp->v_rdev->si_ctime);
 		else
 			vfs_timestamp(&de->de_mtime);
 	}
 	return (0);
 }
 
 #ifdef MAC
 /*
  * VOP_SETLABEL: relabel the vnode and then let mac_devfs_update() see the
  * vnode together with its dirent.  Always returns 0.
  */
 static int
 devfs_setlabel(struct vop_setlabel_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct devfs_dirent *dirent = vp->v_data;
 
 	mac_vnode_relabel(ap->a_cred, vp, ap->a_label);
 	mac_devfs_update(vp->v_mount, dirent, vp);
 	return (0);
 }
 #endif
 
 /*
  * File-level stat: devfs adds nothing of its own and defers to the generic
  * vnode fo_stat.
  */
 static int
 devfs_stat_f(struct file *fp, struct stat *sb, struct ucred *cred, struct thread *td)
 {
 	int rv;
 
 	rv = vnops.fo_stat(fp, sb, cred, td);
 	return (rv);
 }
 
 /*
  * VOP_SYMLINK for devfs: create a user symlink in directory ap->a_dvp.
  *
  * Requires PRIV_DEVFS_SYMLINK.  A new dirent owned by root with mode 0755
  * is created and linked in right after "." and "..".  If a non-user entry
  * of the same name already exists it is marked as covered; if a user entry
  * of that name exists the new dirent is deleted and EEXIST is returned.
  *
  * Returns the priv_check() error, ENOENT if the directory cannot be
  * populated, EEXIST as described, else the result of devfs_allocv().
  */
 static int
 devfs_symlink(struct vop_symlink_args *ap)
 {
 	int i, error;
 	struct devfs_dirent *dd;
 	struct devfs_dirent *de, *de_covered, *de_dotdot;
 	struct devfs_mount *dmp;
 
 	error = priv_check(curthread, PRIV_DEVFS_SYMLINK);
 	if (error)
 		return(error);
 	dmp = VFSTODEVFS(ap->a_dvp->v_mount);
 	if (devfs_populate_vp(ap->a_dvp) != 0)
 		return (ENOENT);
 
 	dd = ap->a_dvp->v_data;
 	de = devfs_newdirent(ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen);
 	de->de_flags = DE_USER;
 	de->de_uid = 0;
 	de->de_gid = 0;
 	de->de_mode = 0755;
 	de->de_inode = alloc_unr(devfs_inos);
 	de->de_dir = dd;
 	de->de_dirent->d_type = DT_LNK;
 	/* Store a private copy of the target, including its NUL. */
 	i = strlen(ap->a_target) + 1;
 	de->de_symlink = malloc(i, M_DEVFS, M_WAITOK);
 	bcopy(ap->a_target, de->de_symlink, i);
 #ifdef MAC
 	mac_devfs_create_symlink(ap->a_cnp->cn_cred, dmp->dm_mount, dd, de);
 #endif
 	/* The new dirent is not in the list yet, so this finds others only. */
 	de_covered = devfs_find(dd, de->de_dirent->d_name,
 	    de->de_dirent->d_namlen, 0);
 	if (de_covered != NULL) {
 		if ((de_covered->de_flags & DE_USER) != 0) {
 			devfs_delete(dmp, de, DEVFS_DEL_NORECURSE);
 			sx_xunlock(&dmp->dm_lock);
 			return (EEXIST);
 		}
 		KASSERT((de_covered->de_flags & DE_COVERED) == 0,
 		    ("devfs_symlink: entry %p already covered", de_covered));
 		de_covered->de_flags |= DE_COVERED;
 	}
 
 	de_dotdot = TAILQ_FIRST(&dd->de_dlist);		/* "." */
 	de_dotdot = TAILQ_NEXT(de_dotdot, de_list);	/* ".." */
 	TAILQ_INSERT_AFTER(&dd->de_dlist, de_dotdot, de, de_list);
 	devfs_dir_ref_de(dmp, dd);
 	devfs_rules_apply(dmp, de);
 
 	/*
 	 * NOTE(review): dm_lock is not released on this path; presumably
 	 * devfs_allocv() drops it -- confirm.
 	 */
 	return (devfs_allocv(de, ap->a_dvp->v_mount, LK_EXCLUSIVE, ap->a_vpp));
 }
 
 /*
  * File-level truncate: devfs adds nothing of its own and defers to the
  * generic vnode fo_truncate.
  */
 static int
 devfs_truncate_f(struct file *fp, off_t length, struct ucred *cred, struct thread *td)
 {
 	int rv;
 
 	rv = vnops.fo_truncate(fp, length, cred, td);
 	return (rv);
 }
 
 /*
  * File-level write for devfs files.
  *
  * Requests larger than DEVFS_IOSIZE_MAX fail with EINVAL.  If
  * devfs_fp_check() rejects the file the generic vnode fo_write is used.
  * Otherwise the driver's d_write is called and the device change and
  * modification times are updated when data moved or a non-empty write
  * succeeded.
  */
 static int
 devfs_write_f(struct file *fp, struct uio *uio, struct ucred *cred,
     int flags, struct thread *td)
 {
 	struct file *saved_fpop;
 	struct cdevsw *csw;
 	struct cdev *cdev;
 	ssize_t resid_before;
 	int drv_flags, rv, tref;
 
 	if (uio->uio_resid > DEVFS_IOSIZE_MAX)
 		return (EINVAL);
 
 	saved_fpop = td->td_fpop;
 	if (devfs_fp_check(fp, &cdev, &csw, &tref) != 0)
 		return (vnops.fo_write(fp, uio, cred, flags, td));
 
 	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td));
 
 	/* Pass the O_* bits down and add the matching IO_DIRECT flag. */
 	drv_flags = fp->f_flag & (O_NONBLOCK | O_DIRECT | O_FSYNC);
 	if ((drv_flags & O_DIRECT) != 0)
 		drv_flags |= IO_DIRECT;
 	foffset_lock_uio(fp, uio, flags | FOF_NOLOCK);
 
 	resid_before = uio->uio_resid;
 
 	rv = csw->d_write(cdev, uio, drv_flags);
 	if (uio->uio_resid != resid_before ||
 	    (rv == 0 && resid_before != 0)) {
 		/* mtime is set to the freshly taken ctime. */
 		devfs_timestamp(&cdev->si_ctime);
 		cdev->si_mtime = cdev->si_ctime;
 	}
 	td->td_fpop = saved_fpop;
 	dev_relthread(cdev, tref);
 
 	foffset_unlock_uio(fp, uio, flags | FOF_NOLOCK | FOF_NEXTOFF);
 	return (rv);
 }
 
+/*
+ * File-level mmap for devfs files.
+ *
+ * The maximum protection is derived from the mount's MNT_NOEXEC flag, the
+ * file's FREAD/FWRITE flags and cap_maxprot; asking for read or write
+ * access the descriptor was not opened for fails with EACCES.  The VM
+ * object is obtained with vm_mmap_cdev() and mapped with vm_mmap_object();
+ * if the mapping fails the object is deallocated again.
+ *
+ * Returns EACCES as described, the devfs_fp_check() error, or the error
+ * from vm_mmap_cdev() / vm_mmap_object().
+ */
+static int
+devfs_mmap_f(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size,
+    vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff,
+    struct thread *td)
+{
+	struct cdev *dev;
+	struct cdevsw *dsw;
+	struct mount *mp;
+	struct vnode *vp;
+	struct file *fpop;
+	vm_object_t object;
+	vm_prot_t maxprot;
+	int error, ref;
+
+	vp = fp->f_vnode;
+
+	/*
+	 * Ensure that file and memory protections are
+	 * compatible.
+	 */
+	mp = vp->v_mount;
+	if (mp != NULL && (mp->mnt_flag & MNT_NOEXEC) != 0)
+		maxprot = VM_PROT_NONE;
+	else
+		maxprot = VM_PROT_EXECUTE;
+	if ((fp->f_flag & FREAD) != 0)
+		maxprot |= VM_PROT_READ;
+	else if ((prot & VM_PROT_READ) != 0)
+		return (EACCES);
+
+	/*
+	 * Character devices always share mappings, so
+	 * require a writable fd for writable mappings.
+	 */
+	if ((fp->f_flag & FWRITE) != 0)
+		maxprot |= VM_PROT_WRITE;
+	else if ((prot & VM_PROT_WRITE) != 0)
+		return (EACCES);
+	/* Never exceed what the caller-supplied cap_maxprot allows. */
+	maxprot &= cap_maxprot;
+
+	fpop = td->td_fpop;
+	error = devfs_fp_check(fp, &dev, &dsw, &ref);
+	if (error != 0)
+		return (error);
+
+	/* maxprot, flags and foff are passed by pointer to vm_mmap_cdev(). */
+	error = vm_mmap_cdev(td, size, prot, &maxprot, &flags, dev, dsw, &foff,
+	    &object);
+	/* The device thread reference is released before mapping the object. */
+	td->td_fpop = fpop;
+	dev_relthread(dev, ref);
+	if (error != 0)
+		return (error);
+
+	error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object,
+	    foff, FALSE, td);
+	if (error != 0)
+		vm_object_deallocate(object);
+	return (error);
+}
+
+
 /*
  * Return the inode number recorded for a cdev as its dev_t, or NODEV when
  * no cdev is given.
  */
 dev_t
 dev2udev(struct cdev *x)
 {
 	if (x != NULL)
 		return (cdev2priv(x)->cdp_inode);
 	return (NODEV);
 }
 
 /*
  * File operations installed on files opened through devfs_open().  The
  * devfs_*_f entries are the device-aware handlers defined in this file; the
  * vn_* entries are the generic vnode implementations.
  */
 static struct fileops devfs_ops_f = {
 	.fo_read =	devfs_read_f,
 	.fo_write =	devfs_write_f,
 	.fo_truncate =	devfs_truncate_f,
 	.fo_ioctl =	devfs_ioctl_f,
 	.fo_poll =	devfs_poll_f,
 	.fo_kqfilter =	devfs_kqfilter_f,
 	.fo_stat =	devfs_stat_f,
 	.fo_close =	devfs_close_f,
 	.fo_chmod =	vn_chmod,
 	.fo_chown =	vn_chown,
 	.fo_sendfile =	vn_sendfile,
 	.fo_seek =	vn_seek,
 	.fo_fill_kinfo = vn_fill_kinfo,
+	.fo_mmap =	devfs_mmap_f,
 	.fo_flags =	DFLAG_PASSABLE | DFLAG_SEEKABLE
 };
 
 /*
  * Vnode operations vector with the directory and symlink handlers
  * (lookup, readdir, readlink, symlink, mknod) alongside the shared ones.
  * Operations not listed fall through to default_vnodeops.
  */
 static struct vop_vector devfs_vnodeops = {
 	.vop_default =		&default_vnodeops,
 
 	.vop_access =		devfs_access,
 	.vop_getattr =		devfs_getattr,
 	.vop_ioctl =		devfs_rioctl,
 	.vop_lookup =		devfs_lookup,
 	.vop_mknod =		devfs_mknod,
 	.vop_pathconf =		devfs_pathconf,
 	.vop_read =		devfs_rread,
 	.vop_readdir =		devfs_readdir,
 	.vop_readlink =		devfs_readlink,
 	.vop_reclaim =		devfs_reclaim,
 	.vop_remove =		devfs_remove,
 	.vop_revoke =		devfs_revoke,
 	.vop_setattr =		devfs_setattr,
 #ifdef MAC
 	.vop_setlabel =		devfs_setlabel,
 #endif
 	.vop_symlink =		devfs_symlink,
 	.vop_vptocnp =		devfs_vptocnp,
 };
 
 /*
  * Vnode operations vector with the device handlers (open, close, fsync).
  * Namespace and block-mapping operations are wired to VOP_PANIC, and
  * vnode-level read, write and poll go to the dead_* handlers.  Operations
  * not listed fall through to default_vnodeops.
  */
 struct vop_vector devfs_specops = {
 	.vop_default =		&default_vnodeops,
 
 	.vop_access =		devfs_access,
 	.vop_bmap =		VOP_PANIC,
 	.vop_close =		devfs_close,
 	.vop_create =		VOP_PANIC,
 	.vop_fsync =		devfs_fsync,
 	.vop_getattr =		devfs_getattr,
 	.vop_link =		VOP_PANIC,
 	.vop_mkdir =		VOP_PANIC,
 	.vop_mknod =		VOP_PANIC,
 	.vop_open =		devfs_open,
 	.vop_pathconf =		devfs_pathconf,
 	.vop_poll =		dead_poll,
 	.vop_print =		devfs_print,
 	.vop_read =		dead_read,
 	.vop_readdir =		VOP_PANIC,
 	.vop_readlink =		VOP_PANIC,
 	.vop_reallocblks =	VOP_PANIC,
 	.vop_reclaim =		devfs_reclaim,
 	.vop_remove =		devfs_remove,
 	.vop_rename =		VOP_PANIC,
 	.vop_revoke =		devfs_revoke,
 	.vop_rmdir =		VOP_PANIC,
 	.vop_setattr =		devfs_setattr,
 #ifdef MAC
 	.vop_setlabel =		devfs_setlabel,
 #endif
 	.vop_strategy =		VOP_PANIC,
 	.vop_symlink =		VOP_PANIC,
 	.vop_vptocnp =		devfs_vptocnp,
 	.vop_write =		dead_write,
 };
 
 /*
  * Our calling convention to the device drivers used to be that we passed
  * vnode.h IO_* flags to read()/write(), but we're moving to fcntl.h O_*
  * flags instead, since that's what open(), close() and ioctl() take and
  * we don't really want vnode.h in device drivers.
  * We solved the source compatibility by redefining some vnode flags to
  * be the same as the fcntl ones and by sending down the bitwise OR of
  * the respective fcntl/vnode flags.  These CTASSERTs make sure nobody
  * pulls the rug out from under this.
  */
 CTASSERT(O_NONBLOCK == IO_NDELAY);
 CTASSERT(O_FSYNC == IO_SYNC);
Index: head/sys/kern/subr_uio.c
===================================================================
--- head/sys/kern/subr_uio.c	(revision 283997)
+++ head/sys/kern/subr_uio.c	(revision 283998)
@@ -1,570 +1,570 @@
 /*-
  * Copyright (c) 1982, 1986, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Copyright (c) 2014 The FreeBSD Foundation
  *
  * Portions of this software were developed by Konstantin Belousov
  * under sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kern_subr.c	8.3 (Berkeley) 1/21/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/mman.h>
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
 #include <sys/sched.h>
 #include <sys/sysctl.h>
 #include <sys/vnode.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_map.h>
 
 SYSCTL_INT(_kern, KERN_IOV_MAX, iov_max, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, UIO_MAXIOV,
 	"Maximum number of elements in an I/O vector; sysconf(_SC_IOV_MAX)");
 
 static int uiomove_faultflag(void *cp, int n, struct uio *uio, int nofault);
 
 /*
  * copyin() variant that runs with page faults disabled: the previous
  * fault-handling state is saved, copyin() is attempted, and the saved
  * state is put back before the copyin() result is returned.
  */
 int
 copyin_nofault(const void *udaddr, void *kaddr, size_t len)
 {
 	int prev_state, rv;
 
 	prev_state = vm_fault_disable_pagefaults();
 	rv = copyin(udaddr, kaddr, len);
 	vm_fault_enable_pagefaults(prev_state);
 
 	return (rv);
 }
 
 /*
  * copyout() variant that runs with page faults disabled: the previous
  * fault-handling state is saved, copyout() is attempted, and the saved
  * state is put back before the copyout() result is returned.
  */
 int
 copyout_nofault(const void *kaddr, void *udaddr, size_t len)
 {
 	int prev_state, rv;
 
 	prev_state = vm_fault_disable_pagefaults();
 	rv = copyout(kaddr, udaddr, len);
 	vm_fault_enable_pagefaults(prev_state);
 
 	return (rv);
 }
 
 #define	PHYS_PAGE_COUNT(len)	(howmany(len, PAGE_SIZE) + 1)
 
 /*
  * Copy len bytes from the kernel buffer src to the physical address dst.
  *
  * A single-segment kernel-space uio describing src is built, the vm_page
  * of every physical page in the PHYS_PAGE_COUNT(len) window starting at
  * dst is looked up, and the transfer itself is done by uiomove_fromphys()
  * starting at the in-page offset of dst.  Returns its result.
  */
 int
 physcopyin(void *src, vm_paddr_t dst, size_t len)
 {
 	vm_page_t pages[PHYS_PAGE_COUNT(len)];
 	struct iovec vec;
 	struct uio kuio;
 	vm_paddr_t pa;
 	int pg;
 
 	vec.iov_base = src;
 	vec.iov_len = len;
 
 	kuio.uio_iov = &vec;
 	kuio.uio_iovcnt = 1;
 	kuio.uio_offset = 0;
 	kuio.uio_resid = len;
 	kuio.uio_segflg = UIO_SYSSPACE;
 	kuio.uio_rw = UIO_WRITE;
 
 	/* Walk the physical range one page at a time. */
 	pa = dst;
 	for (pg = 0; pg < PHYS_PAGE_COUNT(len); pg++) {
 		pages[pg] = PHYS_TO_VM_PAGE(pa);
 		pa += PAGE_SIZE;
 	}
 
 	return (uiomove_fromphys(pages, dst & PAGE_MASK, len, &kuio));
 }
 
 /*
  * Copy len bytes from the physical address src to the kernel buffer dst.
  *
  * A single-segment kernel-space uio describing dst is built, the vm_page
  * of every physical page in the PHYS_PAGE_COUNT(len) window starting at
  * src is looked up, and the transfer itself is done by uiomove_fromphys()
  * starting at the in-page offset of src.  Returns its result.
  */
 int
 physcopyout(vm_paddr_t src, void *dst, size_t len)
 {
 	vm_page_t pages[PHYS_PAGE_COUNT(len)];
 	struct iovec vec;
 	struct uio kuio;
 	vm_paddr_t pa;
 	int pg;
 
 	vec.iov_base = dst;
 	vec.iov_len = len;
 
 	kuio.uio_iov = &vec;
 	kuio.uio_iovcnt = 1;
 	kuio.uio_offset = 0;
 	kuio.uio_resid = len;
 	kuio.uio_segflg = UIO_SYSSPACE;
 	kuio.uio_rw = UIO_READ;
 
 	/* Walk the physical range one page at a time. */
 	pa = src;
 	for (pg = 0; pg < PHYS_PAGE_COUNT(len); pg++) {
 		pages[pg] = PHYS_TO_VM_PAGE(pa);
 		pa += PAGE_SIZE;
 	}
 
 	return (uiomove_fromphys(pages, src & PAGE_MASK, len, &kuio));
 }
 
 #undef PHYS_PAGE_COUNT
 
 /*
  * Move up to n bytes between the kernel buffer cp and uio.  This is
  * uiomove_faultflag() with the nofault argument set to 0.
  */
 int
 uiomove(void *cp, int n, struct uio *uio)
 {
 
 	return (uiomove_faultflag(cp, n, uio, 0));
 }
 
 /*
  * Move up to n bytes between the kernel buffer cp and uio.  This is
  * uiomove_faultflag() with the nofault argument set to 1.
  */
 int
 uiomove_nofault(void *cp, int n, struct uio *uio)
 {
 
 	return (uiomove_faultflag(cp, n, uio, 1));
 }
 
 /*
  * Common implementation behind uiomove() and uiomove_nofault().
  *
  * Moves up to n bytes between the buffer at cp and the iovecs described
  * by uio, in the direction given by uio->uio_rw: UIO_READ copies from cp
  * into the iovecs, UIO_WRITE copies from the iovecs into cp.  The current
  * iovec's base and length, uio_resid and uio_offset are advanced by the
  * amount moved.  When nofault is non-zero and the uio targets user space,
  * the thread runs with TDP_NOFAULTING | TDP_RESETSPUR set for the
  * duration of the copy.  Returns 0, or the error from copyin()/copyout().
  */
 static int
 uiomove_faultflag(void *cp, int n, struct uio *uio, int nofault)
 {
 	struct thread *td;
 	struct iovec *iov;
 	size_t cnt;
 	int error, newflags, save;
 
 	td = curthread;
 	error = 0;
 
 	KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE,
 	    ("uiomove: mode"));
 	KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == td,
 	    ("uiomove proc"));
 	if (!nofault)
 		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
 		    "Calling uiomove()");
 
 	/* XXX does it make a sense to set TDP_DEADLKTREAT for UIO_SYSSPACE ? */
 	newflags = TDP_DEADLKTREAT;
 	if (uio->uio_segflg == UIO_USERSPACE && nofault) {
 		/*
 		 * Fail if a non-spurious page fault occurs.
 		 */
 		newflags |= TDP_NOFAULTING | TDP_RESETSPUR;
 	}
 	/* The previous flag state is restored at "out" on every exit path. */
 	save = curthread_pflags_set(newflags);
 
 	while (n > 0 && uio->uio_resid) {
 		iov = uio->uio_iov;
 		cnt = iov->iov_len;
 		if (cnt == 0) {
 			/* This iovec is used up; step to the next one. */
 			uio->uio_iov++;
 			uio->uio_iovcnt--;
 			continue;
 		}
 		/* Never move more than the caller asked for. */
 		if (cnt > n)
 			cnt = n;
 
 		switch (uio->uio_segflg) {
 
 		case UIO_USERSPACE:
 			maybe_yield();
 			if (uio->uio_rw == UIO_READ)
 				error = copyout(cp, iov->iov_base, cnt);
 			else
 				error = copyin(iov->iov_base, cp, cnt);
 			if (error)
 				goto out;
 			break;
 
 		case UIO_SYSSPACE:
 			if (uio->uio_rw == UIO_READ)
 				bcopy(cp, iov->iov_base, cnt);
 			else
 				bcopy(iov->iov_base, cp, cnt);
 			break;
 		case UIO_NOCOPY:
 			/* No data is moved; only the bookkeeping below runs. */
 			break;
 		}
 		/* Account for the cnt bytes just handled. */
 		iov->iov_base = (char *)iov->iov_base + cnt;
 		iov->iov_len -= cnt;
 		uio->uio_resid -= cnt;
 		uio->uio_offset += cnt;
 		cp = (char *)cp + cnt;
 		n -= cnt;
 	}
 out:
 	curthread_pflags_restore(save);
 	return (error);
 }
 
 /*
  * Bounds-checked front end to uiomove() for transfers out of a kernel
  * buffer of known size.
  *
  * uio->uio_offset selects the starting position inside buf.  A negative
  * offset or residual, an offset that does not fit in a size_t, or a
  * remaining length above IOSIZE_MAX yields EINVAL.  A non-positive buflen
  * or an offset at or past the end of the buffer transfers nothing and
  * returns 0.  Otherwise the bytes from the offset to the end of the buffer
  * are handed to uiomove().  uiomove() itself takes a signed length, which
  * is why the sanity checks are done here first.
  */
 int
 uiomove_frombuf(void *buf, int buflen, struct uio *uio)
 {
 	size_t offset, n;
 
 	if (uio->uio_offset < 0 || uio->uio_resid < 0)
 		return (EINVAL);
 
 	/* Reject offsets that are truncated when stored in a size_t. */
 	offset = uio->uio_offset;
 	if (offset != uio->uio_offset)
 		return (EINVAL);
 
 	if (buflen <= 0 || offset >= buflen)
 		return (0);
 
 	n = buflen - offset;
 	if (n > IOSIZE_MAX)
 		return (EINVAL);
 
 	return (uiomove((char *)buf + offset, n, uio));
 }
 
 /*
  * Hand the single character c to the consumer of uio as the result of a
  * read and advance the uio by one byte.
  *
  * Empty iovec entries are skipped first.  Panics if the uio has no iovecs
  * or no residual space left.  Returns EFAULT if the store to user space
  * fails, otherwise 0.
  */
 int
 ureadc(int c, struct uio *uio)
 {
 	struct iovec *vec;
 
 	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
 	    "Calling ureadc()");
 
 	/* Find the first iovec that still has room. */
 	for (;;) {
 		if (uio->uio_iovcnt == 0 || uio->uio_resid == 0)
 			panic("ureadc");
 		vec = uio->uio_iov;
 		if (vec->iov_len != 0)
 			break;
 		uio->uio_iovcnt--;
 		uio->uio_iov++;
 	}
 
 	switch (uio->uio_segflg) {
 	case UIO_USERSPACE:
 		if (subyte(vec->iov_base, c) < 0)
 			return (EFAULT);
 		break;
 	case UIO_SYSSPACE:
 		*(char *)vec->iov_base = c;
 		break;
 	case UIO_NOCOPY:
 		break;
 	}
 
 	/* One byte consumed. */
 	vec->iov_base = (char *)vec->iov_base + 1;
 	vec->iov_len--;
 	uio->uio_resid--;
 	uio->uio_offset++;
 	return (0);
 }
 
 /*
  * Copy len bytes from src to dst, where seg says which address space src
  * lives in: UIO_USERSPACE uses copyin() and returns its result,
  * UIO_SYSSPACE uses bcopy() and returns 0.  Any other seg value panics.
  */
 int
 copyinfrom(const void * __restrict src, void * __restrict dst, size_t len,
     int seg)
 {
 	int rv = 0;
 
 	if (seg == UIO_USERSPACE)
 		rv = copyin(src, dst, len);
 	else if (seg == UIO_SYSSPACE)
 		bcopy(src, dst, len);
 	else
 		panic("copyinfrom: bad seg %d\n", seg);
 
 	return (rv);
 }
 
 /*
  * Copy a string of at most len bytes from src to dst, where seg says which
  * address space src lives in: UIO_USERSPACE uses copyinstr() and
  * UIO_SYSSPACE uses copystr().  The helper's result is returned and copied
  * is passed through to it unchanged.  Any other seg value panics.
  */
 int
 copyinstrfrom(const void * __restrict src, void * __restrict dst, size_t len,
     size_t * __restrict copied, int seg)
 {
 	int rv = 0;
 
 	if (seg == UIO_USERSPACE)
 		rv = copyinstr(src, dst, len, copied);
 	else if (seg == UIO_SYSSPACE)
 		rv = copystr(src, dst, len, copied);
 	else
 		panic("copyinstrfrom: bad seg %d\n", seg);
 
 	return (rv);
 }
 
 /*
  * Copy an array of iovcnt iovecs from the user address iovp into a newly
  * allocated M_IOV buffer and return it through *iov.
  *
  * If iovcnt exceeds UIO_MAXIOV, nothing is allocated and the caller's
  * error argument is returned as is.  If copyin() fails, the buffer is
  * freed and the copyin() error is returned.  *iov is NULL on every
  * failure path; on success 0 is returned and the caller owns *iov.
  */
 int
 copyiniov(const struct iovec *iovp, u_int iovcnt, struct iovec **iov, int error)
 {
 	struct iovec *kiov;
 	u_int nbytes;
 
 	*iov = NULL;
 	if (iovcnt > UIO_MAXIOV)
 		return (error);
 
 	nbytes = iovcnt * sizeof (struct iovec);
 	kiov = malloc(nbytes, M_IOV, M_WAITOK);
 	error = copyin(iovp, kiov, nbytes);
 	if (error) {
 		free(kiov, M_IOV);
 		return (error);
 	}
 
 	*iov = kiov;
 	return (error);
 }
 
 /*
  * Build a user-space uio from the iovcnt iovecs at the user address iovp.
  *
  * The uio header and its iovec array are allocated as one M_IOV chunk,
  * with the array placed right behind the header.  uio_offset is set to -1
  * and uio_resid to the sum of all iovec lengths.  Returns EINVAL when
  * iovcnt exceeds UIO_MAXIOV or when the summed lengths would exceed
  * IOSIZE_MAX, the copyin() error when the array cannot be read, and 0 with
  * the new uio stored in *uiop on success.  *uiop is NULL on failure.
  */
 int
 copyinuio(const struct iovec *iovp, u_int iovcnt, struct uio **uiop)
 {
 	struct iovec *vecs;
 	struct uio *nuio;
 	u_int nbytes;
 	int idx, rv;
 
 	*uiop = NULL;
 	if (iovcnt > UIO_MAXIOV)
 		return (EINVAL);
 
 	nbytes = iovcnt * sizeof (struct iovec);
 	nuio = malloc(nbytes + sizeof *nuio, M_IOV, M_WAITOK);
 	vecs = (struct iovec *)(nuio + 1);
 	rv = copyin(iovp, vecs, nbytes);
 	if (rv)
 		goto fail;
 
 	nuio->uio_iov = vecs;
 	nuio->uio_iovcnt = iovcnt;
 	nuio->uio_segflg = UIO_USERSPACE;
 	nuio->uio_offset = -1;
 	nuio->uio_resid = 0;
 	for (idx = 0; idx < iovcnt; idx++) {
 		/* Refuse totals that would go past IOSIZE_MAX. */
 		if (vecs[idx].iov_len > IOSIZE_MAX - nuio->uio_resid) {
 			rv = EINVAL;
 			goto fail;
 		}
 		nuio->uio_resid += vecs[idx].iov_len;
 	}
 	*uiop = nuio;
 	return (0);
 
 fail:
 	free(nuio, M_IOV);
 	return (rv);
 }
 
 /*
  * Return a newly allocated (M_IOV) copy of uiop.  The header and a private
  * copy of the iovec array share one allocation, and the copy's uio_iov
  * points at that private array.
  */
 struct uio *
 cloneuio(struct uio *uiop)
 {
 	struct iovec *vecs;
 	struct uio *copy;
 	int nbytes;
 
 	nbytes = uiop->uio_iovcnt * sizeof (struct iovec);
 	copy = malloc(nbytes + sizeof *copy, M_IOV, M_WAITOK);
 
 	/* The iovec array lives directly behind the header. */
 	vecs = (struct iovec *)(copy + 1);
 	bcopy(uiop->uio_iov, vecs, nbytes);
 
 	*copy = *uiop;
 	copy->uio_iov = vecs;
 	return (copy);
 }
 
 /*
  * Map some anonymous memory in user space of size sz, rounded up to the page
  * boundary.
  *
  * The mapping is created in the vmspace of td's process.  *addr is first
  * set to the page-rounded sum of vm_daddr and the RLIMIT_DATA maximum and
  * then passed to vm_mmap() by reference.  Returns the vm_mmap() result.
  */
 int
 copyout_map(struct thread *td, vm_offset_t *addr, size_t sz)
 {
 	struct vmspace *vms;
 	int error;
 	vm_size_t size;
 
 	vms = td->td_proc->p_vmspace;
 
 	/*
 	 * Map somewhere after heap in process memory.
 	 */
 	PROC_LOCK(td->td_proc);
 	*addr = round_page((vm_offset_t)vms->vm_daddr +
 	    lim_max(td->td_proc, RLIMIT_DATA));
 	PROC_UNLOCK(td->td_proc);
 
 	/* round size up to page boundary */
 	size = (vm_size_t)round_page(sz);
 
-	error = vm_mmap(&vms->vm_map, addr, size, PROT_READ | PROT_WRITE,
+	error = vm_mmap(&vms->vm_map, addr, size, VM_PROT_READ | VM_PROT_WRITE,
 	    VM_PROT_ALL, MAP_PRIVATE | MAP_ANON, OBJT_DEFAULT, NULL, 0);
 
 	return (error);
 }
 
 /*
  * Remove a user-space mapping of sz bytes (rounded up to a whole number
  * of pages) starting at addr from the address space of td's process.
  * A zero sz is a no-op that returns 0.  Returns EINVAL if vm_map_remove()
  * does not report KERN_SUCCESS, otherwise 0.
  */
 int
 copyout_unmap(struct thread *td, vm_offset_t addr, size_t sz)
 {
 	vm_size_t len;
 	int rv;
 
 	if (sz == 0)
 		return (0);
 
 	len = (vm_size_t)round_page(sz);
 	rv = vm_map_remove(&td->td_proc->p_vmspace->vm_map, addr, addr + len);
 
 	return (rv != KERN_SUCCESS ? EINVAL : 0);
 }
 
 #ifdef NO_FUEWORD
 /*
  * XXXKIB Temporary implementation of the fue*() functions, which does not
  * handle a usermode value of -1 properly: it cannot be told apart from
  * the fault code.  Keep this until the MD code is written.  Currently
  * sparc64, mips and arm do not have a proper implementation.
  */
 
 /*
  * Fetch a long from base via fuword() and store it in *val.  Returns 0 on
  * success and -1 when fuword() returns -1, in which case *val is left
  * untouched.  A stored value of -1 is therefore reported as a failure.
  */
 int
 fueword(volatile const void *base, long *val)
 {
 	long fetched;
 
 	fetched = fuword(base);
 	if (fetched != -1) {
 		*val = fetched;
 		return (0);
 	}
 	return (-1);
 }
 
 /*
  * Fetch a 32-bit value from base via fuword32() and store it in *val.
  * Returns 0 on success and -1 when fuword32() returns -1, in which case
  * *val is left untouched.  A stored value of -1 is therefore reported as
  * a failure.
  */
 int
 fueword32(volatile const void *base, int32_t *val)
 {
 	int32_t fetched;
 
 	fetched = fuword32(base);
 	if (fetched != -1) {
 		*val = fetched;
 		return (0);
 	}
 	return (-1);
 }
 
 #ifdef _LP64
 /*
  * Fetch a 64-bit value from base via fuword64() and store it in *val.
  * Returns 0 on success and -1 when fuword64() returns -1, in which case
  * *val is left untouched.  A stored value of -1 is therefore reported as
  * a failure.
  *
  * The intermediate must be a full int64_t: a narrower temporary would
  * truncate the fetched value and would also misreport any value whose low
  * 32 bits are all ones as a fault.
  */
 int
 fueword64(volatile const void *base, int64_t *val)
 {
 	int64_t res;
 
 	res = fuword64(base);
 	if (res == -1)
 		return (-1);
 	*val = res;
 	return (0);
 }
 #endif
 
 /*
  * Compare-and-set on the 32-bit user word at base, implemented on top of
  * casuword32().  The value returned by casuword32() is stored in *oldvalp
  * and 0 is returned, unless that value is -1, in which case -1 is returned
  * and *oldvalp is left untouched.
  */
 int
 casueword32(volatile uint32_t *base, uint32_t oldval, uint32_t *oldvalp,
     uint32_t newval)
 {
 	int32_t prev;
 
 	prev = casuword32(base, oldval, newval);
 	if (prev != -1) {
 		*oldvalp = prev;
 		return (0);
 	}
 	return (-1);
 }
 
 /*
  * Compare-and-set on the user word at p, implemented on top of
  * casuword().  The value returned by casuword() is stored in *oldvalp and
  * 0 is returned, unless that value is -1, in which case -1 is returned and
  * *oldvalp is left untouched.
  */
 int
 casueword(volatile u_long *p, u_long oldval, u_long *oldvalp, u_long newval)
 {
 	u_long prev;
 
 	prev = casuword(p, oldval, newval);
 	if (prev != -1) {
 		*oldvalp = prev;
 		return (0);
 	}
 	return (-1);
 }
 #else /* NO_FUEWORD */
 /*
  * Fetch a 32-bit value from the user address addr using fueword32().
  * Returns the fetched value, or -1 when fueword32() reports failure.
  */
 int32_t
 fuword32(volatile const void *addr)
 {
 	int32_t fetched;
 
 	if (fueword32(addr, &fetched) == -1)
 		return (-1);
 	return (fetched);
 }
 
 #ifdef _LP64
 /*
  * Fetch a 64-bit value from the user address addr using fueword64().
  * Returns the fetched value, or -1 when fueword64() reports failure.
  */
 int64_t
 fuword64(volatile const void *addr)
 {
 	int64_t fetched;
 
 	if (fueword64(addr, &fetched) == -1)
 		return (-1);
 	return (fetched);
 }
 #endif /* _LP64 */
 
 /*
  * Fetch a long from the user address addr using fueword().
  * Returns the fetched value, or -1 when fueword() reports failure.
  */
 long
 fuword(volatile const void *addr)
 {
 	long fetched;
 
 	if (fueword(addr, &fetched) == -1)
 		return (-1);
 	return (fetched);
 }
 
 /*
  * Compare-and-set on the 32-bit user word at addr using casueword32().
  * Returns the old value reported by casueword32(), or -1 (converted to
  * uint32_t) when casueword32() reports failure.
  */
 uint32_t
 casuword32(volatile uint32_t *addr, uint32_t old, uint32_t new)
 {
 	uint32_t prev;
 
 	if (casueword32(addr, old, &prev, new) == -1)
 		return (-1);
 	return (prev);
 }
 
 /*
  * Compare-and-set on the user word at addr using casueword().
  * Returns the old value reported by casueword(), or -1 (converted to
  * u_long) when casueword() reports failure.
  */
 u_long
 casuword(volatile u_long *addr, u_long old, u_long new)
 {
 	u_long prev;
 
 	if (casueword(addr, old, &prev, new) == -1)
 		return (-1);
 	return (prev);
 }
 
 #endif /* NO_FUEWORD */
Index: head/sys/kern/uipc_shm.c
===================================================================
--- head/sys/kern/uipc_shm.c	(revision 283997)
+++ head/sys/kern/uipc_shm.c	(revision 283998)
@@ -1,1060 +1,1088 @@
 /*-
  * Copyright (c) 2006, 2011 Robert N. M. Watson
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * Support for shared swap-backed anonymous memory objects via
  * shm_open(2) and shm_unlink(2).  While most of the implementation is
  * here, vm_mmap.c contains mapping logic changes.
  *
  * TODO:
  *
  * (1) Need to export data to a userland tool via a sysctl.  Should ipcs(1)
  *     and ipcrm(1) be expanded or should new tools to manage both POSIX
  *     kernel semaphores and POSIX shared memory be written?
  *
  * (2) Add support for this file type to fstat(1).
  *
  * (3) Resource limits?  Does this need its own resource limits or are the
  *     existing limits in mmap(2) sufficient?
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_capsicum.h"
 #include "opt_ktrace.h"
 
 #include <sys/param.h>
 #include <sys/capsicum.h>
 #include <sys/conf.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/fnv_hash.h>
 #include <sys/kernel.h>
 #include <sys/uio.h>
 #include <sys/signal.h>
 #include <sys/ktrace.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mman.h>
 #include <sys/mutex.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/refcount.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
 #include <sys/stat.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/systm.h>
 #include <sys/sx.h>
 #include <sys/time.h>
 #include <sys/vnode.h>
 #include <sys/unistd.h>
 #include <sys/user.h>
 
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_map.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
 #include <vm/swap_pager.h>
 
 /*
  * One entry of the path dictionary: binds a path name to the shmfd
  * registered under it.
  */
 struct shm_mapping {
 	char		*sm_path;	/* path string; freed with M_SHMFD on removal */
 	Fnv32_t		sm_fnv;		/* hash of sm_path, compared before strcmp() */
 	struct shmfd	*sm_shmfd;	/* object; reference taken via shm_hold() */
 	LIST_ENTRY(shm_mapping) sm_link;	/* linkage within the hash bucket */
 };
 
 static MALLOC_DEFINE(M_SHMFD, "shmfd", "shared memory file descriptor");
 /* Hash table of shm_mapping entries, created by hashinit() in shm_init(). */
 static LIST_HEAD(, shm_mapping) *shm_dictionary;
 /* sx lock initialized as "shm dictionary". */
 static struct sx shm_dict_lock;
 /* Held around reads and updates of shmfd times, mode, uid and gid. */
 static struct mtx shm_timestamp_lock;
 /* Mask filled in by hashinit(); used by SHM_HASH() to pick a bucket. */
 static u_long shm_hash;
 /* Unit number allocator that hands out the fake inode numbers. */
 static struct unrhdr *shm_ino_unr;
 /* Value reported as st_dev by shm_stat(). */
 static dev_t shm_dev_ino;
 
 /* Bucket of shm_dictionary that the hash value fnv falls into. */
 #define	SHM_HASH(fnv)	(&shm_dictionary[(fnv) & shm_hash])
 
 static int	shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags);
 static struct shmfd *shm_alloc(struct ucred *ucred, mode_t mode);
 static void	shm_init(void *arg);
 static void	shm_drop(struct shmfd *shmfd);
 static struct shmfd *shm_hold(struct shmfd *shmfd);
 static void	shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd);
 static struct shmfd *shm_lookup(char *path, Fnv32_t fnv);
 static int	shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred);
 static int	shm_dotruncate(struct shmfd *shmfd, off_t length);
 
 static fo_rdwr_t	shm_read;
 static fo_rdwr_t	shm_write;
 static fo_truncate_t	shm_truncate;
 static fo_stat_t	shm_stat;
 static fo_close_t	shm_close;
 static fo_chmod_t	shm_chmod;
 static fo_chown_t	shm_chown;
 static fo_seek_t	shm_seek;
 static fo_fill_kinfo_t	shm_fill_kinfo;
+static fo_mmap_t	shm_mmap;
 
 /*
  * File descriptor operations.  ioctl, poll and kqfilter are routed to the
  * invfo_* handlers, and sendfile goes through vn_sendfile.
  */
 static struct fileops shm_ops = {
 	.fo_read = shm_read,
 	.fo_write = shm_write,
 	.fo_truncate = shm_truncate,
 	.fo_ioctl = invfo_ioctl,
 	.fo_poll = invfo_poll,
 	.fo_kqfilter = invfo_kqfilter,
 	.fo_stat = shm_stat,
 	.fo_close = shm_close,
 	.fo_chmod = shm_chmod,
 	.fo_chown = shm_chown,
 	.fo_sendfile = vn_sendfile,
 	.fo_seek = shm_seek,
 	.fo_fill_kinfo = shm_fill_kinfo,
+	.fo_mmap = shm_mmap,
 	.fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
 };
 
 FEATURE(posix_shm, "POSIX shared memory");
 
 /*
  * Move at most len bytes between uio and the single page of obj that
  * contains uio->uio_offset; the transfer never crosses the end of that
  * page.  Returns the result of uiomove()/uiomove_fromphys(), or EIO when
  * the page cannot be obtained from the pager.
  */
 static int
 uiomove_object_page(vm_object_t obj, size_t len, struct uio *uio)
 {
 	vm_page_t m;
 	vm_pindex_t idx;
 	size_t tlen;
 	int error, offset, rv;
 
 	idx = OFF_TO_IDX(uio->uio_offset);
 	offset = uio->uio_offset & PAGE_MASK;
 	/* Clamp the transfer to what is left of this page. */
 	tlen = MIN(PAGE_SIZE - offset, len);
 
 	VM_OBJECT_WLOCK(obj);
 
 	/*
 	 * Read I/O without either a corresponding resident page or swap
 	 * page: use zero_region.  This is intended to avoid instantiating
 	 * pages on read from a sparse region.
 	 */
 	if (uio->uio_rw == UIO_READ && vm_page_lookup(obj, idx) == NULL &&
 	    !vm_pager_has_page(obj, idx, NULL, NULL)) {
 		VM_OBJECT_WUNLOCK(obj);
 		return (uiomove(__DECONST(void *, zero_region), tlen, uio));
 	}
 
 	/*
 	 * Parallel reads of the page content from disk are prevented
 	 * by exclusive busy.
 	 *
 	 * Although the tmpfs vnode lock is held here, it is
 	 * nonetheless safe to sleep waiting for a free page.  The
 	 * pageout daemon does not need to acquire the tmpfs vnode
 	 * lock to page out tobj's pages because tobj is a OBJT_SWAP
 	 * type object.
 	 *
 	 * NOTE(review): the tmpfs and "tobj" wording does not match any
 	 * name in this function; presumably it was carried over from
 	 * where this code originated -- confirm it still applies.
 	 */
 	m = vm_page_grab(obj, idx, VM_ALLOC_NORMAL);
 	if (m->valid != VM_PAGE_BITS_ALL) {
 		if (vm_pager_has_page(obj, idx, NULL, NULL)) {
 			rv = vm_pager_get_pages(obj, &m, 1, 0);
 			/* Look the page up again after the pager call. */
 			m = vm_page_lookup(obj, idx);
 			if (m == NULL) {
 				printf(
 		    "uiomove_object: vm_obj %p idx %jd null lookup rv %d\n",
 				    obj, idx, rv);
 				VM_OBJECT_WUNLOCK(obj);
 				return (EIO);
 			}
 			if (rv != VM_PAGER_OK) {
 				printf(
 	    "uiomove_object: vm_obj %p idx %jd valid %x pager error %d\n",
 				    obj, idx, m->valid, rv);
 				vm_page_lock(m);
 				vm_page_free(m);
 				vm_page_unlock(m);
 				VM_OBJECT_WUNLOCK(obj);
 				return (EIO);
 			}
 		} else
 			vm_page_zero_invalid(m, TRUE);
 	}
 	vm_page_xunbusy(m);
 	vm_page_lock(m);
 	/* The hold is dropped again once the copy below has finished. */
 	vm_page_hold(m);
 	if (m->queue == PQ_NONE) {
 		vm_page_deactivate(m);
 	} else {
 		/* Requeue to maintain LRU ordering. */
 		vm_page_requeue(m);
 	}
 	vm_page_unlock(m);
 	VM_OBJECT_WUNLOCK(obj);
 	error = uiomove_fromphys(&m, offset, tlen, uio);
 	/* A successful write marks the page dirty and unswapped. */
 	if (uio->uio_rw == UIO_WRITE && error == 0) {
 		VM_OBJECT_WLOCK(obj);
 		vm_page_dirty(m);
 		vm_pager_page_unswapped(m);
 		VM_OBJECT_WUNLOCK(obj);
 	}
 	vm_page_lock(m);
 	vm_page_unhold(m);
 	vm_page_unlock(m);
 
 	return (error);
 }
 
 /*
  * Move data between uio and the VM object obj, whose logical size is
  * obj_size bytes, starting at uio->uio_offset.
  *
  * The transfer is carried out by uiomove_object_page() in pieces and ends
  * when the uio is exhausted, the offset reaches obj_size, an error occurs,
  * or a pass makes no progress.  Returns 0 or the first error.
  */
 int
 uiomove_object(vm_object_t obj, off_t obj_size, struct uio *uio)
 {
 	ssize_t before;
 	size_t chunk;
 	int rv;
 
 	rv = 0;
 	for (;;) {
 		before = uio->uio_resid;
 		if (before <= 0 || obj_size <= uio->uio_offset)
 			break;
 		/* Never run past the end of the object. */
 		chunk = MIN(obj_size - uio->uio_offset, before);
 		if (chunk == 0)
 			break;
 		rv = uiomove_object_page(obj, chunk, uio);
 		if (rv != 0 || before == uio->uio_resid)
 			break;
 	}
 	return (rv);
 }
 
 /*
  * Seek handler for shared memory descriptors.
  *
  * L_INCR is relative to the current file offset, L_XTND to the object
  * size and L_SET is absolute; any other whence is EINVAL.  Arithmetic
  * that would overflow off_t yields EOVERFLOW.  The resulting position
  * must lie within [0, shm_size], otherwise EINVAL is returned.  On
  * success the new offset is stored in td->td_uretoff.tdu_off and in the
  * file; on failure the file offset is left unchanged.
  */
 static int
 shm_seek(struct file *fp, off_t offset, int whence, struct thread *td)
 {
 	struct shmfd *shm;
 	off_t cur;
 	int rv;
 
 	shm = fp->f_data;
 	cur = foffset_lock(fp, 0);
 	rv = 0;
 
 	if (whence == L_INCR) {
 		if (cur < 0 || (offset > 0 && cur > OFF_MAX - offset))
 			rv = EOVERFLOW;
 		else
 			offset += cur;
 	} else if (whence == L_XTND) {
 		if (offset > 0 && shm->shm_size > OFF_MAX - offset)
 			rv = EOVERFLOW;
 		else
 			offset += shm->shm_size;
 	} else if (whence != L_SET)
 		rv = EINVAL;
 
 	/* Positions outside the object are not allowed. */
 	if (rv == 0 && (offset < 0 || offset > shm->shm_size))
 		rv = EINVAL;
 	if (rv == 0)
 		td->td_uretoff.tdu_off = offset;
 
 	foffset_unlock(fp, offset, rv == 0 ? 0 : FOF_NOUPDATE);
 	return (rv);
 }
 
 /*
  * Read handler for shared memory descriptors.
  *
  * With MAC compiled in, the read is first checked against the MAC policy
  * and refused with the policy's error.  The check is done before any lock
  * is taken so that a denial cannot return with the file offset lock or
  * the range lock still held; this matches the ordering in shm_write().
  * The read itself runs under the file offset lock and a read range lock
  * covering [uio_offset, uio_offset + uio_resid), and is carried out by
  * uiomove_object() bounded by the object's current size.
  *
  * Returns 0 or an errno value.
  */
 static int
 shm_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
     int flags, struct thread *td)
 {
 	struct shmfd *shmfd;
 	void *rl_cookie;
 	int error;
 
 	shmfd = fp->f_data;
 #ifdef MAC
 	error = mac_posixshm_check_read(active_cred, fp->f_cred, shmfd);
 	if (error)
 		return (error);
 #endif
 	foffset_lock_uio(fp, uio, flags);
 	rl_cookie = rangelock_rlock(&shmfd->shm_rl, uio->uio_offset,
 	    uio->uio_offset + uio->uio_resid, &shmfd->shm_mtx);
 	error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio);
 	rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
 	foffset_unlock_uio(fp, uio, flags);
 	return (error);
 }
 
 static int
 shm_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
     int flags, struct thread *td)
 {
 	struct shmfd *shmfd;
 	void *rl_cookie;
 	int error;
 
 	shmfd = fp->f_data;
 #ifdef MAC
 	error = mac_posixshm_check_write(active_cred, fp->f_cred, shmfd);
 	if (error)
 		return (error);
 #endif
 	foffset_lock_uio(fp, uio, flags);
 	if ((flags & FOF_OFFSET) == 0) {
 		rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX,
 		    &shmfd->shm_mtx);
 	} else {
 		rl_cookie = rangelock_wlock(&shmfd->shm_rl, uio->uio_offset,
 		    uio->uio_offset + uio->uio_resid, &shmfd->shm_mtx);
 	}
 
 	error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio);
 	rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
 	foffset_unlock_uio(fp, uio, flags);
 	return (error);
 }
 
 /*
  * Truncate handler for shared memory descriptors.  With MAC compiled in,
  * the request is first checked against the MAC policy; the resize itself
  * is done by shm_dotruncate(), whose result is returned.
  */
 static int
 shm_truncate(struct file *fp, off_t length, struct ucred *active_cred,
     struct thread *td)
 {
 	struct shmfd *shmfd;
 #ifdef MAC
 	int error;
 #endif
 
 	shmfd = fp->f_data;
 #ifdef MAC
 	error = mac_posixshm_check_truncate(active_cred, fp->f_cred, shmfd);
 	if (error)
 		return (error);
 #endif
 	return (shm_dotruncate(shmfd, length));
 }
 
 /*
  * Stat handler for shared memory descriptors.
  *
  * With MAC compiled in, the request is first checked against the MAC
  * policy.  *sb is zeroed and then filled with a page-sized block size, the
  * object size and block count, the fake device and inode numbers, and,
  * under shm_timestamp_lock, the times, mode (as a regular file) and
  * ownership.  Returns 0, or the MAC error.
  */
 static int
 shm_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
     struct thread *td)
 {
 	struct shmfd *shm;
 #ifdef MAC
 	int rv;
 #endif
 
 	shm = fp->f_data;
 
 #ifdef MAC
 	rv = mac_posixshm_check_stat(active_cred, fp->f_cred, shm);
 	if (rv)
 		return (rv);
 #endif
 
 	/*
 	 * Try to report reasonable values for fstat() on a memory file
 	 * descriptor.
 	 */
 	bzero(sb, sizeof(*sb));
 	sb->st_dev = shm_dev_ino;
 	sb->st_ino = shm->shm_ino;
 	sb->st_size = shm->shm_size;
 	sb->st_blksize = PAGE_SIZE;
 	sb->st_blocks = (sb->st_size + sb->st_blksize - 1) / sb->st_blksize;
 
 	mtx_lock(&shm_timestamp_lock);
 	sb->st_mode = S_IFREG | shm->shm_mode;		/* XXX */
 	sb->st_uid = shm->shm_uid;
 	sb->st_gid = shm->shm_gid;
 	sb->st_birthtim = shm->shm_birthtime;
 	sb->st_atim = shm->shm_atime;
 	sb->st_mtim = shm->shm_mtime;
 	sb->st_ctim = shm->shm_ctime;
 	mtx_unlock(&shm_timestamp_lock);
 
 	return (0);
 }
 
 /*
  * Close handler for shared memory descriptors: detach the shmfd from the
  * file and drop the reference the file held on it.  Always returns 0.
  */
 static int
 shm_close(struct file *fp, struct thread *td)
 {
 	struct shmfd *detached;
 
 	detached = fp->f_data;
 	fp->f_data = NULL;
 
 	shm_drop(detached);
 	return (0);
 }
 
 /*
  * Set the size of the shared memory object to length bytes.
  *
  * Shrinking is refused with EBUSY while shm_kmappings is non-zero;
  * otherwise the tail of the new last page is zeroed, pages and swap
  * beyond the new size are released and the swap charge is reduced.
  * Growing reserves the additional swap and fails with ENOMEM if the
  * reservation is denied.  On success shm_size and the object size are
  * updated and ctime/mtime are refreshed.
  *
  * Returns 0, EBUSY, EIO (pager failure on the last page) or ENOMEM.
  */
 static int
 shm_dotruncate(struct shmfd *shmfd, off_t length)
 {
 	vm_object_t object;
 	vm_page_t m, ma[1];
 	vm_pindex_t idx, nobjsize;
 	vm_ooffset_t delta;
 	int base, rv;
 
 	object = shmfd->shm_object;
 	VM_OBJECT_WLOCK(object);
 	/* Nothing to do when the size does not change. */
 	if (length == shmfd->shm_size) {
 		VM_OBJECT_WUNLOCK(object);
 		return (0);
 	}
 	/* New object size in pages, rounding length up. */
 	nobjsize = OFF_TO_IDX(length + PAGE_MASK);
 
 	/* Are we shrinking?  If so, trim the end. */
 	if (length < shmfd->shm_size) {
 		/*
 		 * Disallow any requests to shrink the size if this
 		 * object is mapped into the kernel.
 		 */
 		if (shmfd->shm_kmappings > 0) {
 			VM_OBJECT_WUNLOCK(object);
 			return (EBUSY);
 		}
 
 		/*
 		 * Zero the truncated part of the last page.
 		 */
 		base = length & PAGE_MASK;
 		if (base != 0) {
 			idx = OFF_TO_IDX(length);
 retry:
 			m = vm_page_lookup(object, idx);
 			if (m != NULL) {
 				if (vm_page_sleep_if_busy(m, "shmtrc"))
 					goto retry;
 			} else if (vm_pager_has_page(object, idx, NULL, NULL)) {
 				/* Not resident but the pager has it: bring it in. */
 				m = vm_page_alloc(object, idx, VM_ALLOC_NORMAL);
 				if (m == NULL) {
 					VM_OBJECT_WUNLOCK(object);
 					VM_WAIT;
 					VM_OBJECT_WLOCK(object);
 					goto retry;
 				} else if (m->valid != VM_PAGE_BITS_ALL) {
 					ma[0] = m;
 					rv = vm_pager_get_pages(object, ma, 1,
 					    0);
 					m = vm_page_lookup(object, idx);
 				} else
 					/* A cached page was reactivated. */
 					rv = VM_PAGER_OK;
 				vm_page_lock(m);
 				if (rv == VM_PAGER_OK) {
 					vm_page_deactivate(m);
 					vm_page_unlock(m);
 					vm_page_xunbusy(m);
 				} else {
 					vm_page_free(m);
 					vm_page_unlock(m);
 					VM_OBJECT_WUNLOCK(object);
 					return (EIO);
 				}
 			}
 			if (m != NULL) {
 				pmap_zero_page_area(m, base, PAGE_SIZE - base);
 				KASSERT(m->valid == VM_PAGE_BITS_ALL,
 				    ("shm_dotruncate: page %p is invalid", m));
 				vm_page_dirty(m);
 				vm_pager_page_unswapped(m);
 			}
 		}
 		/* Number of bytes given back by the shrink. */
 		delta = ptoa(object->size - nobjsize);
 
 		/* Toss in memory pages. */
 		if (nobjsize < object->size)
 			vm_object_page_remove(object, nobjsize, object->size,
 			    0);
 
 		/* Toss pages from swap. */
 		if (object->type == OBJT_SWAP)
 			swap_pager_freespace(object, nobjsize, delta);
 
 		/* Free the swap accounted for shm */
 		swap_release_by_cred(delta, object->cred);
 		object->charge -= delta;
 	} else {
 		/* Attempt to reserve the swap */
 		delta = ptoa(nobjsize - object->size);
 		if (!swap_reserve_by_cred(delta, object->cred)) {
 			VM_OBJECT_WUNLOCK(object);
 			return (ENOMEM);
 		}
 		object->charge += delta;
 	}
 	shmfd->shm_size = length;
 	/* A size change counts as both a status change and a modification. */
 	mtx_lock(&shm_timestamp_lock);
 	vfs_timestamp(&shmfd->shm_ctime);
 	shmfd->shm_mtime = shmfd->shm_ctime;
 	mtx_unlock(&shm_timestamp_lock);
 	object->size = nobjsize;
 	VM_OBJECT_WUNLOCK(object);
 	return (0);
 }
 
 /*
  * shmfd object management including creation and reference counting
  * routines.
  */
 /*
  * Allocate and initialize a new, empty shmfd.
  *
  * Ownership comes from ucred and the permission bits from mode.  A
  * zero-sized OBJT_DEFAULT VM object backs the shmfd; OBJ_ONEMAPPING is
  * cleared on it and OBJ_COLORED | OBJ_NOSPLIT are set.  All four
  * timestamps start at the creation time.  A fake inode number is taken
  * from shm_ino_unr, with 0 standing in when none is available.  The
  * result carries one reference, which belongs to the caller.
  */
 static struct shmfd *
 shm_alloc(struct ucred *ucred, mode_t mode)
 {
 	struct shmfd *shm;
 	int unit;
 
 	shm = malloc(sizeof(*shm), M_SHMFD, M_WAITOK | M_ZERO);
 	shm->shm_size = 0;
 	shm->shm_uid = ucred->cr_uid;
 	shm->shm_gid = ucred->cr_gid;
 	shm->shm_mode = mode;
 
 	/* Backing VM object. */
 	shm->shm_object = vm_pager_allocate(OBJT_DEFAULT, NULL,
 	    shm->shm_size, VM_PROT_DEFAULT, 0, ucred);
 	KASSERT(shm->shm_object != NULL, ("shm_create: vm_pager_allocate"));
 	shm->shm_object->pg_color = 0;
 	VM_OBJECT_WLOCK(shm->shm_object);
 	vm_object_clear_flag(shm->shm_object, OBJ_ONEMAPPING);
 	vm_object_set_flag(shm->shm_object, OBJ_COLORED | OBJ_NOSPLIT);
 	VM_OBJECT_WUNLOCK(shm->shm_object);
 
 	/* Every timestamp starts out equal to the birth time. */
 	vfs_timestamp(&shm->shm_birthtime);
 	shm->shm_atime = shm->shm_mtime = shm->shm_ctime =
 	    shm->shm_birthtime;
 
 	unit = alloc_unr(shm_ino_unr);
 	shm->shm_ino = (unit == -1) ? 0 : unit;
 
 	refcount_init(&shm->shm_refs, 1);
 	mtx_init(&shm->shm_mtx, "shmrl", NULL, MTX_DEF);
 	rangelock_init(&shm->shm_rl);
 #ifdef MAC
 	mac_posixshm_init(shm);
 	mac_posixshm_create(ucred, shm);
 #endif
 
 	return (shm);
 }
 
 /*
  * Take an additional reference on shmfd and return it, so that the call
  * can be used directly in an assignment.
  */
 static struct shmfd *
 shm_hold(struct shmfd *shmfd)
 {
 
 	refcount_acquire(&shmfd->shm_refs);
 	return (shmfd);
 }
 
 /*
  * Drop one reference on shmfd.  When the last reference goes away, the
  * MAC label (if MAC is compiled in), the range lock, the mutex, the
  * backing VM object and the fake inode number are released and the shmfd
  * itself is freed.
  */
 static void
 shm_drop(struct shmfd *shmfd)
 {
 
 	/* Other references remain: nothing more to do. */
 	if (!refcount_release(&shmfd->shm_refs))
 		return;
 
 #ifdef MAC
 	mac_posixshm_destroy(shmfd);
 #endif
 	rangelock_destroy(&shmfd->shm_rl);
 	mtx_destroy(&shmfd->shm_mtx);
 	vm_object_deallocate(shmfd->shm_object);
 	/* Inode number 0 means none was allocated. */
 	if (shmfd->shm_ino != 0)
 		free_unr(shm_ino_unr, shmfd->shm_ino);
 	free(shmfd, M_SHMFD);
 }
 
 /*
  * Check whether ucred may access shmfd in the way described by flags, a
  * combination of FREAD and FWRITE.  The flags are translated to VREAD and
  * VWRITE and checked with vaccess() against the object's mode, uid and
  * gid while shm_timestamp_lock is held.  Returns the vaccess() result.
  */
 static int
 shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags)
 {
 	accmode_t want;
 	int rv;
 
 	want = 0;
 	want |= (flags & FREAD) ? VREAD : 0;
 	want |= (flags & FWRITE) ? VWRITE : 0;
 
 	mtx_lock(&shm_timestamp_lock);
 	rv = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid, shmfd->shm_gid,
 	    want, ucred, NULL);
 	mtx_unlock(&shm_timestamp_lock);
 
 	return (rv);
 }
 
 /*
  * Dictionary management.  We maintain an in-kernel dictionary to map
  * paths to shmfd objects.  We use the FNV hash on the path to store
  * the mappings in a hash table.
  */
 /*
  * One-time setup run through SYSINIT: initializes the timestamp mutex and
  * the dictionary sx lock, creates the path hash table (hashinit() also
  * fills in the shm_hash mask), creates the fake inode number allocator
  * and obtains the device number reported by shm_stat().
  */
 static void
 shm_init(void *arg)
 {
 
 	mtx_init(&shm_timestamp_lock, "shm timestamps", NULL, MTX_DEF);
 	sx_init(&shm_dict_lock, "shm dictionary");
 	shm_dictionary = hashinit(1024, M_SHMFD, &shm_hash);
 	/* Numbering starts at 1; shm_alloc() and shm_drop() treat 0 as "none". */
 	shm_ino_unr = new_unrhdr(1, INT32_MAX, NULL);
 	KASSERT(shm_ino_unr != NULL, ("shm fake inodes not initialized"));
 	shm_dev_ino = devfs_alloc_cdp_inode();
 	KASSERT(shm_dev_ino > 0, ("shm dev inode not initialized"));
 }
 SYSINIT(shm_init, SI_SUB_SYSV_SHM, SI_ORDER_ANY, shm_init, NULL);
 
 /*
  * Look up path, whose hash is fnv, in the dictionary.  Returns the shmfd
  * registered under that path, or NULL if there is none.  No reference is
  * taken on the returned object.
  */
 static struct shmfd *
 shm_lookup(char *path, Fnv32_t fnv)
 {
 	struct shm_mapping *ent;
 
 	LIST_FOREACH(ent, SHM_HASH(fnv), sm_link) {
 		/* Compare the cheap hash first, then the full path. */
 		if (ent->sm_fnv == fnv && strcmp(ent->sm_path, path) == 0)
 			return (ent->sm_shmfd);
 	}
 
 	return (NULL);
 }
 
 /*
  * Register shmfd in the dictionary under path, whose hash is fnv.  The
  * new entry stores the path pointer itself (no copy is made) and holds
  * its own reference on shmfd; shmfd->shm_path is pointed at the same
  * string.
  */
 static void
 shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd)
 {
 	struct shm_mapping *ent;
 
 	ent = malloc(sizeof(struct shm_mapping), M_SHMFD, M_WAITOK);
 	ent->sm_fnv = fnv;
 	ent->sm_path = path;
 	shmfd->shm_path = path;
 	ent->sm_shmfd = shm_hold(shmfd);
 
 	LIST_INSERT_HEAD(SHM_HASH(fnv), ent, sm_link);
 }
 
 /*
  * Remove the dictionary entry for path, whose hash is fnv, on behalf of
  * ucred.
  *
  * Returns ENOENT if no such entry exists.  Otherwise the caller must pass
  * the MAC unlink check (when MAC is compiled in) and have both read and
  * write access to the object; the first failing check's error is returned
  * and the entry is left in place.  On success the entry is unlinked, its
  * reference on the shmfd is dropped, the path string and the entry are
  * freed, and 0 is returned.
  */
 static int
 shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred)
 {
 	struct shm_mapping *ent;
 	int rv;
 
 	/* Find the first entry registered under this path. */
 	LIST_FOREACH(ent, SHM_HASH(fnv), sm_link) {
 		if (ent->sm_fnv == fnv && strcmp(ent->sm_path, path) == 0)
 			break;
 	}
 	if (ent == NULL)
 		return (ENOENT);
 
 #ifdef MAC
 	rv = mac_posixshm_check_unlink(ucred, ent->sm_shmfd);
 	if (rv)
 		return (rv);
 #endif
 	rv = shm_access(ent->sm_shmfd, ucred, FREAD | FWRITE);
 	if (rv)
 		return (rv);
 
 	ent->sm_shmfd->shm_path = NULL;
 	LIST_REMOVE(ent, sm_link);
 	shm_drop(ent->sm_shmfd);
 	free(ent->sm_path, M_SHMFD);
 	free(ent, M_SHMFD);
 	return (0);
 }
 
 /* System calls. */
 int
 sys_shm_open(struct thread *td, struct shm_open_args *uap)
 {
 	struct filedesc *fdp;
 	struct shmfd *shmfd;
 	struct file *fp;
 	char *path;
 	Fnv32_t fnv;
 	mode_t cmode;
 	int fd, error;
 
 #ifdef CAPABILITY_MODE
 	/*
 	 * shm_open(2) is only allowed for anonymous objects.
 	 */
 	if (IN_CAPABILITY_MODE(td) && (uap->path != SHM_ANON))
 		return (ECAPMODE);
 #endif
 
 	if ((uap->flags & O_ACCMODE) != O_RDONLY &&
 	    (uap->flags & O_ACCMODE) != O_RDWR)
 		return (EINVAL);
 
 	if ((uap->flags & ~(O_ACCMODE | O_CREAT | O_EXCL | O_TRUNC | O_CLOEXEC)) != 0)
 		return (EINVAL);
 
 	fdp = td->td_proc->p_fd;
 	cmode = (uap->mode & ~fdp->fd_cmask) & ACCESSPERMS;
 
 	error = falloc(td, &fp, &fd, O_CLOEXEC);
 	if (error)
 		return (error);
 
 	/* A SHM_ANON path pointer creates an anonymous object. */
 	if (uap->path == SHM_ANON) {
 		/* A read-only anonymous object is pointless. */
 		if ((uap->flags & O_ACCMODE) == O_RDONLY) {
 			fdclose(td, fp, fd);
 			fdrop(fp, td);
 			return (EINVAL);
 		}
 		shmfd = shm_alloc(td->td_ucred, cmode);
 	} else {
 		path = malloc(MAXPATHLEN, M_SHMFD, M_WAITOK);
 		error = copyinstr(uap->path, path, MAXPATHLEN, NULL);
 #ifdef KTRACE
 		if (error == 0 && KTRPOINT(curthread, KTR_NAMEI))
 			ktrnamei(path);
 #endif
 		/* Require paths to start with a '/' character. */
 		if (error == 0 && path[0] != '/')
 			error = EINVAL;
 		if (error) {
 			fdclose(td, fp, fd);
 			fdrop(fp, td);
 			free(path, M_SHMFD);
 			return (error);
 		}
 
 		fnv = fnv_32_str(path, FNV1_32_INIT);
 		sx_xlock(&shm_dict_lock);
 		shmfd = shm_lookup(path, fnv);
 		if (shmfd == NULL) {
 			/* Object does not yet exist, create it if requested. */
 			if (uap->flags & O_CREAT) {
 #ifdef MAC
 				error = mac_posixshm_check_create(td->td_ucred,
 				    path);
 				if (error == 0) {
 #endif
 					shmfd = shm_alloc(td->td_ucred, cmode);
 					shm_insert(path, fnv, shmfd);
 #ifdef MAC
 				}
 #endif
 			} else {
 				free(path, M_SHMFD);
 				error = ENOENT;
 			}
 		} else {
 			/*
 			 * Object already exists, obtain a new
 			 * reference if requested and permitted.
 			 */
 			free(path, M_SHMFD);
 			if ((uap->flags & (O_CREAT | O_EXCL)) ==
 			    (O_CREAT | O_EXCL))
 				error = EEXIST;
 			else {
 #ifdef MAC
 				error = mac_posixshm_check_open(td->td_ucred,
 				    shmfd, FFLAGS(uap->flags & O_ACCMODE));
 				if (error == 0)
 #endif
 				error = shm_access(shmfd, td->td_ucred,
 				    FFLAGS(uap->flags & O_ACCMODE));
 			}
 
 			/*
 			 * Truncate the file back to zero length if
 			 * O_TRUNC was specified and the object was
 			 * opened with read/write.
 			 */
 			if (error == 0 &&
 			    (uap->flags & (O_ACCMODE | O_TRUNC)) ==
 			    (O_RDWR | O_TRUNC)) {
 #ifdef MAC
 				error = mac_posixshm_check_truncate(
 					td->td_ucred, fp->f_cred, shmfd);
 				if (error == 0)
 #endif
 					shm_dotruncate(shmfd, 0);
 			}
 			if (error == 0)
 				shm_hold(shmfd);
 		}
 		sx_xunlock(&shm_dict_lock);
 
 		if (error) {
 			fdclose(td, fp, fd);
 			fdrop(fp, td);
 			return (error);
 		}
 	}
 
 	finit(fp, FFLAGS(uap->flags & O_ACCMODE), DTYPE_SHM, shmfd, &shm_ops);
 
 	td->td_retval[0] = fd;
 	fdrop(fp, td);
 
 	return (0);
 }
 
 int
 sys_shm_unlink(struct thread *td, struct shm_unlink_args *uap)
 {
 	char *path;
 	Fnv32_t fnv;
 	int error;
 
 	path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
 	error = copyinstr(uap->path, path, MAXPATHLEN, NULL);
 	if (error) {
 		free(path, M_TEMP);
 		return (error);
 	}
 #ifdef KTRACE
 	if (KTRPOINT(curthread, KTR_NAMEI))
 		ktrnamei(path);
 #endif
 	fnv = fnv_32_str(path, FNV1_32_INIT);
 	sx_xlock(&shm_dict_lock);
 	error = shm_remove(path, fnv, td->td_ucred);
 	sx_xunlock(&shm_dict_lock);
 	free(path, M_TEMP);
 
 	return (error);
 }
 
-/*
- * mmap() helper to validate mmap() requests against shm object state
- * and give mmap() the vm_object to use for the mapping.
- */
 int
-shm_mmap(struct shmfd *shmfd, vm_size_t objsize, vm_ooffset_t foff,
-    vm_object_t *obj)
+shm_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t objsize,
+    vm_prot_t prot, vm_prot_t cap_maxprot, int flags,
+    vm_ooffset_t foff, struct thread *td)
 {
+	struct shmfd *shmfd;
+	vm_prot_t maxprot;
+	int error;
 
+	shmfd = fp->f_data;
+	maxprot = VM_PROT_NONE;
+
+	/* FREAD should always be set. */
+	if ((fp->f_flag & FREAD) != 0)
+		maxprot |= VM_PROT_EXECUTE | VM_PROT_READ;
+	if ((fp->f_flag & FWRITE) != 0)
+		maxprot |= VM_PROT_WRITE;
+
+	/* Don't permit shared writable mappings on read-only descriptors. */
+	if ((flags & MAP_SHARED) != 0 &&
+	    (maxprot & VM_PROT_WRITE) == 0 &&
+	    (prot & VM_PROT_WRITE) != 0)
+		return (EACCES);
+	maxprot &= cap_maxprot;
+
+#ifdef MAC
+	error = mac_posixshm_check_mmap(td->td_ucred, shmfd, prot, flags);
+	if (error != 0)
+		return (error);
+#endif
+
 	/*
 	 * XXXRW: This validation is probably insufficient, and subject to
 	 * sign errors.  It should be fixed.
 	 */
 	if (foff >= shmfd->shm_size ||
 	    foff + objsize > round_page(shmfd->shm_size))
 		return (EINVAL);
 
 	mtx_lock(&shm_timestamp_lock);
 	vfs_timestamp(&shmfd->shm_atime);
 	mtx_unlock(&shm_timestamp_lock);
 	vm_object_reference(shmfd->shm_object);
-	*obj = shmfd->shm_object;
-	return (0);
+	/* Map the object; on failure drop the reference taken above. */
+	error = vm_mmap_object(map, addr, objsize, prot, maxprot, flags,
+	    shmfd->shm_object, foff, FALSE, td);
+	if (error != 0)
+		vm_object_deallocate(shmfd->shm_object);
+	return (error);
 }
 
 static int
 shm_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
     struct thread *td)
 {
 	struct shmfd *shmfd;
 	int error;
 
 	error = 0;
 	shmfd = fp->f_data;
 	mtx_lock(&shm_timestamp_lock);
 	/*
 	 * SUSv4 says that x bits of permission need not be affected.
 	 * Be consistent with our shm_open there.
 	 */
 #ifdef MAC
 	error = mac_posixshm_check_setmode(active_cred, shmfd, mode);
 	if (error != 0)
 		goto out;
 #endif
 	error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid,
 	    shmfd->shm_gid, VADMIN, active_cred, NULL);
 	if (error != 0)
 		goto out;
 	shmfd->shm_mode = mode & ACCESSPERMS;
 out:
 	mtx_unlock(&shm_timestamp_lock);
 	return (error);
 }
 
 static int
 shm_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
     struct thread *td)
 {
 	struct shmfd *shmfd;
 	int error;
 
 	error = 0;
 	shmfd = fp->f_data;
 	mtx_lock(&shm_timestamp_lock);
 #ifdef MAC
 	error = mac_posixshm_check_setowner(active_cred, shmfd, uid, gid);
 	if (error != 0)
 		goto out;
 #endif
 	if (uid == (uid_t)-1)
 		uid = shmfd->shm_uid;
 	if (gid == (gid_t)-1)
                  gid = shmfd->shm_gid;
 	if (((uid != shmfd->shm_uid && uid != active_cred->cr_uid) ||
 	    (gid != shmfd->shm_gid && !groupmember(gid, active_cred))) &&
 	    (error = priv_check_cred(active_cred, PRIV_VFS_CHOWN, 0)))
 		goto out;
 	shmfd->shm_uid = uid;
 	shmfd->shm_gid = gid;
 out:
 	mtx_unlock(&shm_timestamp_lock);
 	return (error);
 }
 
 /*
  * Helper routines to allow the backing object of a shared memory file
  * descriptor to be mapped in the kernel.
  */
 int
 shm_map(struct file *fp, size_t size, off_t offset, void **memp)
 {
 	struct shmfd *shmfd;
 	vm_offset_t kva, ofs;
 	vm_object_t obj;
 	int rv;
 
 	if (fp->f_type != DTYPE_SHM)
 		return (EINVAL);
 	shmfd = fp->f_data;
 	obj = shmfd->shm_object;
 	VM_OBJECT_WLOCK(obj);
 	/*
 	 * XXXRW: This validation is probably insufficient, and subject to
 	 * sign errors.  It should be fixed.
 	 */
 	if (offset >= shmfd->shm_size ||
 	    offset + size > round_page(shmfd->shm_size)) {
 		VM_OBJECT_WUNLOCK(obj);
 		return (EINVAL);
 	}
 
 	shmfd->shm_kmappings++;
 	vm_object_reference_locked(obj);
 	VM_OBJECT_WUNLOCK(obj);
 
 	/* Map the object into the kernel_map and wire it. */
 	kva = vm_map_min(kernel_map);
 	ofs = offset & PAGE_MASK;
 	offset = trunc_page(offset);
 	size = round_page(size + ofs);
 	rv = vm_map_find(kernel_map, obj, offset, &kva, size, 0,
 	    VMFS_OPTIMAL_SPACE, VM_PROT_READ | VM_PROT_WRITE,
 	    VM_PROT_READ | VM_PROT_WRITE, 0);
 	if (rv == KERN_SUCCESS) {
 		rv = vm_map_wire(kernel_map, kva, kva + size,
 		    VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES);
 		if (rv == KERN_SUCCESS) {
 			*memp = (void *)(kva + ofs);
 			return (0);
 		}
 		vm_map_remove(kernel_map, kva, kva + size);
 	} else
 		vm_object_deallocate(obj);
 
 	/* On failure, drop our mapping reference. */
 	VM_OBJECT_WLOCK(obj);
 	shmfd->shm_kmappings--;
 	VM_OBJECT_WUNLOCK(obj);
 
 	return (vm_mmap_to_errno(rv));
 }
 
 /*
  * We require the caller to unmap the entire entry.  This allows us to
  * safely decrement shm_kmappings when a mapping is removed.
  */
 int
 shm_unmap(struct file *fp, void *mem, size_t size)
 {
 	struct shmfd *shmfd;
 	vm_map_entry_t entry;
 	vm_offset_t kva, ofs;
 	vm_object_t obj;
 	vm_pindex_t pindex;
 	vm_prot_t prot;
 	boolean_t wired;
 	vm_map_t map;
 	int rv;
 
 	if (fp->f_type != DTYPE_SHM)
 		return (EINVAL);
 	shmfd = fp->f_data;
 	kva = (vm_offset_t)mem;
 	ofs = kva & PAGE_MASK;
 	kva = trunc_page(kva);
 	size = round_page(size + ofs);
 	map = kernel_map;
 	rv = vm_map_lookup(&map, kva, VM_PROT_READ | VM_PROT_WRITE, &entry,
 	    &obj, &pindex, &prot, &wired);
 	if (rv != KERN_SUCCESS)
 		return (EINVAL);
 	if (entry->start != kva || entry->end != kva + size) {
 		vm_map_lookup_done(map, entry);
 		return (EINVAL);
 	}
 	vm_map_lookup_done(map, entry);
 	if (obj != shmfd->shm_object)
 		return (EINVAL);
 	vm_map_remove(map, kva, kva + size);
 	VM_OBJECT_WLOCK(obj);
 	KASSERT(shmfd->shm_kmappings > 0, ("shm_unmap: object not mapped"));
 	shmfd->shm_kmappings--;
 	VM_OBJECT_WUNLOCK(obj);
 	return (0);
 }
 
 static int
 shm_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
 {
 	struct shmfd *shmfd;
 
 	kif->kf_type = KF_TYPE_SHM;
 	shmfd = fp->f_data;
 
 	mtx_lock(&shm_timestamp_lock);
 	kif->kf_un.kf_file.kf_file_mode = S_IFREG | shmfd->shm_mode;	/* XXX */
 	mtx_unlock(&shm_timestamp_lock);
 	kif->kf_un.kf_file.kf_file_size = shmfd->shm_size;
 	if (shmfd->shm_path != NULL) {
 		sx_slock(&shm_dict_lock);
 		if (shmfd->shm_path != NULL)
 			strlcpy(kif->kf_path, shmfd->shm_path,
 			    sizeof(kif->kf_path));
 		sx_sunlock(&shm_dict_lock);
 	}
 	return (0);
 }
Index: head/sys/kern/vfs_vnops.c
===================================================================
--- head/sys/kern/vfs_vnops.c	(revision 283997)
+++ head/sys/kern/vfs_vnops.c	(revision 283998)
@@ -1,2364 +1,2460 @@
 /*-
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Copyright (c) 2012 Konstantin Belousov <kib@FreeBSD.org>
  * Copyright (c) 2013, 2014 The FreeBSD Foundation
  *
  * Portions of this software were developed by Konstantin Belousov
  * under sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)vfs_vnops.c	8.2 (Berkeley) 1/21/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/disk.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/kdb.h>
 #include <sys/stat.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
+#include <sys/mman.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/vnode.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/filio.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/ttycom.h>
 #include <sys/conf.h>
 #include <sys/syslog.h>
 #include <sys/unistd.h>
 #include <sys/user.h>
 
 #include <security/audit/audit.h>
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
+#include <vm/vnode_pager.h>
 
 static fo_rdwr_t	vn_read;
 static fo_rdwr_t	vn_write;
 static fo_rdwr_t	vn_io_fault;
 static fo_truncate_t	vn_truncate;
 static fo_ioctl_t	vn_ioctl;
 static fo_poll_t	vn_poll;
 static fo_kqfilter_t	vn_kqfilter;
 static fo_stat_t	vn_statfile;
 static fo_close_t	vn_closefile;
+static fo_mmap_t	vn_mmap;
 
 struct 	fileops vnops = {
 	.fo_read = vn_io_fault,
 	.fo_write = vn_io_fault,
 	.fo_truncate = vn_truncate,
 	.fo_ioctl = vn_ioctl,
 	.fo_poll = vn_poll,
 	.fo_kqfilter = vn_kqfilter,
 	.fo_stat = vn_statfile,
 	.fo_close = vn_closefile,
 	.fo_chmod = vn_chmod,
 	.fo_chown = vn_chown,
 	.fo_sendfile = vn_sendfile,
 	.fo_seek = vn_seek,
 	.fo_fill_kinfo = vn_fill_kinfo,
+	.fo_mmap = vn_mmap,
 	.fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
 };
 
 static const int io_hold_cnt = 16;
 static int vn_io_fault_enable = 1;
 SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_enable, CTLFLAG_RW,
     &vn_io_fault_enable, 0, "Enable vn_io_fault lock avoidance");
 static u_long vn_io_faults_cnt;
 SYSCTL_ULONG(_debug, OID_AUTO, vn_io_faults, CTLFLAG_RD,
     &vn_io_faults_cnt, 0, "Count of vn_io_fault lock avoidance triggers");
 
 /*
  * Returns true if vn_io_fault mode of handling the i/o request should
  * be used.
  */
 static bool
 do_vn_io_fault(struct vnode *vp, struct uio *uio)
 {
 	struct mount *mp;
 
 	return (uio->uio_segflg == UIO_USERSPACE && vp->v_type == VREG &&
 	    (mp = vp->v_mount) != NULL &&
 	    (mp->mnt_kern_flag & MNTK_NO_IOPF) != 0 && vn_io_fault_enable);
 }
 
 /*
  * Structure used to pass arguments to vn_io_fault1(), to do either
  * file- or vnode-based I/O calls.
  */
 struct vn_io_fault_args {
 	enum {
 		VN_IO_FAULT_FOP,
 		VN_IO_FAULT_VOP
 	} kind;
 	struct ucred *cred;
 	int flags;
 	union {
 		struct fop_args_tag {
 			struct file *fp;
 			fo_rdwr_t *doio;
 		} fop_args;
 		struct vop_args_tag {
 			struct vnode *vp;
 		} vop_args;
 	} args;
 };
 
 static int vn_io_fault1(struct vnode *vp, struct uio *uio,
     struct vn_io_fault_args *args, struct thread *td);
 
 int
 vn_open(ndp, flagp, cmode, fp)
 	struct nameidata *ndp;
 	int *flagp, cmode;
 	struct file *fp;
 {
 	struct thread *td = ndp->ni_cnd.cn_thread;
 
 	return (vn_open_cred(ndp, flagp, cmode, 0, td->td_ucred, fp));
 }
 
 /*
  * Common code for vnode open operations via a name lookup.
  * Lookup the vnode and invoke VOP_CREATE if needed.
  * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
  * 
  * Note that this does NOT free nameidata for the successful case,
  * due to the NDINIT being done elsewhere.
  */
 int
 vn_open_cred(struct nameidata *ndp, int *flagp, int cmode, u_int vn_open_flags,
     struct ucred *cred, struct file *fp)
 {
 	struct vnode *vp;
 	struct mount *mp;
 	struct thread *td = ndp->ni_cnd.cn_thread;
 	struct vattr vat;
 	struct vattr *vap = &vat;
 	int fmode, error;
 
 restart:
 	fmode = *flagp;
 	if (fmode & O_CREAT) {
 		ndp->ni_cnd.cn_nameiop = CREATE;
 		/*
 		 * Set NOCACHE to avoid flushing the cache when
 		 * rolling in many files at once.
 		*/
 		ndp->ni_cnd.cn_flags = ISOPEN | LOCKPARENT | LOCKLEAF | NOCACHE;
 		if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
 			ndp->ni_cnd.cn_flags |= FOLLOW;
 		if (!(vn_open_flags & VN_OPEN_NOAUDIT))
 			ndp->ni_cnd.cn_flags |= AUDITVNODE1;
 		if (vn_open_flags & VN_OPEN_NOCAPCHECK)
 			ndp->ni_cnd.cn_flags |= NOCAPCHECK;
 		bwillwrite();
 		if ((error = namei(ndp)) != 0)
 			return (error);
 		if (ndp->ni_vp == NULL) {
 			VATTR_NULL(vap);
 			vap->va_type = VREG;
 			vap->va_mode = cmode;
 			if (fmode & O_EXCL)
 				vap->va_vaflags |= VA_EXCLUSIVE;
 			if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) {
 				NDFREE(ndp, NDF_ONLY_PNBUF);
 				vput(ndp->ni_dvp);
 				if ((error = vn_start_write(NULL, &mp,
 				    V_XSLEEP | PCATCH)) != 0)
 					return (error);
 				goto restart;
 			}
 			if ((vn_open_flags & VN_OPEN_NAMECACHE) != 0)
 				ndp->ni_cnd.cn_flags |= MAKEENTRY;
 #ifdef MAC
 			error = mac_vnode_check_create(cred, ndp->ni_dvp,
 			    &ndp->ni_cnd, vap);
 			if (error == 0)
 #endif
 				error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
 						   &ndp->ni_cnd, vap);
 			vput(ndp->ni_dvp);
 			vn_finished_write(mp);
 			if (error) {
 				NDFREE(ndp, NDF_ONLY_PNBUF);
 				return (error);
 			}
 			fmode &= ~O_TRUNC;
 			vp = ndp->ni_vp;
 		} else {
 			if (ndp->ni_dvp == ndp->ni_vp)
 				vrele(ndp->ni_dvp);
 			else
 				vput(ndp->ni_dvp);
 			ndp->ni_dvp = NULL;
 			vp = ndp->ni_vp;
 			if (fmode & O_EXCL) {
 				error = EEXIST;
 				goto bad;
 			}
 			fmode &= ~O_CREAT;
 		}
 	} else {
 		ndp->ni_cnd.cn_nameiop = LOOKUP;
 		ndp->ni_cnd.cn_flags = ISOPEN |
 		    ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) | LOCKLEAF;
 		if (!(fmode & FWRITE))
 			ndp->ni_cnd.cn_flags |= LOCKSHARED;
 		if (!(vn_open_flags & VN_OPEN_NOAUDIT))
 			ndp->ni_cnd.cn_flags |= AUDITVNODE1;
 		if (vn_open_flags & VN_OPEN_NOCAPCHECK)
 			ndp->ni_cnd.cn_flags |= NOCAPCHECK;
 		if ((error = namei(ndp)) != 0)
 			return (error);
 		vp = ndp->ni_vp;
 	}
 	error = vn_open_vnode(vp, fmode, cred, td, fp);
 	if (error)
 		goto bad;
 	*flagp = fmode;
 	return (0);
 bad:
 	NDFREE(ndp, NDF_ONLY_PNBUF);
 	vput(vp);
 	*flagp = fmode;
 	ndp->ni_vp = NULL;
 	return (error);
 }
 
 /*
  * Common code for vnode open operations once a vnode is located.
  * Check permissions, and call the VOP_OPEN routine.
  */
 int
 vn_open_vnode(struct vnode *vp, int fmode, struct ucred *cred,
     struct thread *td, struct file *fp)
 {
 	struct mount *mp;
 	accmode_t accmode;
 	struct flock lf;
 	int error, have_flock, lock_flags, type;
 
 	if (vp->v_type == VLNK)
 		return (EMLINK);
 	if (vp->v_type == VSOCK)
 		return (EOPNOTSUPP);
 	if (vp->v_type != VDIR && fmode & O_DIRECTORY)
 		return (ENOTDIR);
 	accmode = 0;
 	if (fmode & (FWRITE | O_TRUNC)) {
 		if (vp->v_type == VDIR)
 			return (EISDIR);
 		accmode |= VWRITE;
 	}
 	if (fmode & FREAD)
 		accmode |= VREAD;
 	if (fmode & FEXEC)
 		accmode |= VEXEC;
 	if ((fmode & O_APPEND) && (fmode & FWRITE))
 		accmode |= VAPPEND;
 #ifdef MAC
 	if (fmode & O_CREAT)
 		accmode |= VCREAT;
 	if (fmode & O_VERIFY)
 		accmode |= VVERIFY;
 	error = mac_vnode_check_open(cred, vp, accmode);
 	if (error)
 		return (error);
 
 	accmode &= ~(VCREAT | VVERIFY);
 #endif
 	if ((fmode & O_CREAT) == 0) {
 		if (accmode & VWRITE) {
 			error = vn_writechk(vp);
 			if (error)
 				return (error);
 		}
 		if (accmode) {
 		        error = VOP_ACCESS(vp, accmode, cred, td);
 			if (error)
 				return (error);
 		}
 	}
 	if (vp->v_type == VFIFO && VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
 		vn_lock(vp, LK_UPGRADE | LK_RETRY);
 	if ((error = VOP_OPEN(vp, fmode, cred, td, fp)) != 0)
 		return (error);
 
 	if (fmode & (O_EXLOCK | O_SHLOCK)) {
 		KASSERT(fp != NULL, ("open with flock requires fp"));
 		lock_flags = VOP_ISLOCKED(vp);
 		VOP_UNLOCK(vp, 0);
 		lf.l_whence = SEEK_SET;
 		lf.l_start = 0;
 		lf.l_len = 0;
 		if (fmode & O_EXLOCK)
 			lf.l_type = F_WRLCK;
 		else
 			lf.l_type = F_RDLCK;
 		type = F_FLOCK;
 		if ((fmode & FNONBLOCK) == 0)
 			type |= F_WAIT;
 		error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type);
 		have_flock = (error == 0);
 		vn_lock(vp, lock_flags | LK_RETRY);
 		if (error == 0 && vp->v_iflag & VI_DOOMED)
 			error = ENOENT;
 		/*
 		 * Another thread might have used this vnode as an
 		 * executable while the vnode lock was dropped.
 		 * Ensure the vnode is still able to be opened for
 		 * writing after the lock has been obtained.
 		 */
 		if (error == 0 && accmode & VWRITE)
 			error = vn_writechk(vp);
 		if (error) {
 			VOP_UNLOCK(vp, 0);
 			if (have_flock) {
 				lf.l_whence = SEEK_SET;
 				lf.l_start = 0;
 				lf.l_len = 0;
 				lf.l_type = F_UNLCK;
 				(void) VOP_ADVLOCK(vp, fp, F_UNLCK, &lf,
 				    F_FLOCK);
 			}
 			vn_start_write(vp, &mp, V_WAIT);
 			vn_lock(vp, lock_flags | LK_RETRY);
 			(void)VOP_CLOSE(vp, fmode, cred, td);
 			vn_finished_write(mp);
 			/* Prevent second close from fdrop()->vn_close(). */
 			if (fp != NULL)
 				fp->f_ops= &badfileops;
 			return (error);
 		}
 		fp->f_flag |= FHASLOCK;
 	}
 	if (fmode & FWRITE) {
 		VOP_ADD_WRITECOUNT(vp, 1);
 		CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d",
 		    __func__, vp, vp->v_writecount);
 	}
 	ASSERT_VOP_LOCKED(vp, "vn_open_vnode");
 	return (0);
 }
 
 /*
  * Check for write permissions on the specified vnode.
  * Prototype text segments cannot be written.
  */
 int
 vn_writechk(vp)
 	register struct vnode *vp;
 {
 
 	ASSERT_VOP_LOCKED(vp, "vn_writechk");
 	/*
 	 * If there's shared text associated with
 	 * the vnode, try to free it up once.  If
 	 * we fail, we can't allow writing.
 	 */
 	if (VOP_IS_TEXT(vp))
 		return (ETXTBSY);
 
 	return (0);
 }
 
 /*
  * Vnode close call
  */
 int
 vn_close(vp, flags, file_cred, td)
 	register struct vnode *vp;
 	int flags;
 	struct ucred *file_cred;
 	struct thread *td;
 {
 	struct mount *mp;
 	int error, lock_flags;
 
 	if (vp->v_type != VFIFO && (flags & FWRITE) == 0 &&
 	    MNT_EXTENDED_SHARED(vp->v_mount))
 		lock_flags = LK_SHARED;
 	else
 		lock_flags = LK_EXCLUSIVE;
 
 	vn_start_write(vp, &mp, V_WAIT);
 	vn_lock(vp, lock_flags | LK_RETRY);
 	if (flags & FWRITE) {
 		VNASSERT(vp->v_writecount > 0, vp, 
 		    ("vn_close: negative writecount"));
 		VOP_ADD_WRITECOUNT(vp, -1);
 		CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d",
 		    __func__, vp, vp->v_writecount);
 	}
 	error = VOP_CLOSE(vp, flags, file_cred, td);
 	vput(vp);
 	vn_finished_write(mp);
 	return (error);
 }
 
 /*
  * Heuristic to detect sequential operation.
  */
 static int
 sequential_heuristic(struct uio *uio, struct file *fp)
 {
 
 	ASSERT_VOP_LOCKED(fp->f_vnode, __func__);
 	if (fp->f_flag & FRDAHEAD)
 		return (fp->f_seqcount << IO_SEQSHIFT);
 
 	/*
 	 * Offset 0 is handled specially.  open() sets f_seqcount to 1 so
 	 * that the first I/O is normally considered to be slightly
 	 * sequential.  Seeking to offset 0 doesn't change sequentiality
 	 * unless previous seeks have reduced f_seqcount to 0, in which
 	 * case offset 0 is not special.
 	 */
 	if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
 	    uio->uio_offset == fp->f_nextoff) {
 		/*
 		 * f_seqcount is in units of fixed-size blocks so that it
 		 * depends mainly on the amount of sequential I/O and not
 		 * much on the number of sequential I/O's.  The fixed size
 		 * of 16384 is hard-coded here since it is (not quite) just
 		 * a magic size that works well here.  This size is more
 		 * closely related to the best I/O size for real disks than
 		 * to any block size used by software.
 		 */
 		fp->f_seqcount += howmany(uio->uio_resid, 16384);
 		if (fp->f_seqcount > IO_SEQMAX)
 			fp->f_seqcount = IO_SEQMAX;
 		return (fp->f_seqcount << IO_SEQSHIFT);
 	}
 
 	/* Not sequential.  Quickly draw-down sequentiality. */
 	if (fp->f_seqcount > 1)
 		fp->f_seqcount = 1;
 	else
 		fp->f_seqcount = 0;
 	return (0);
 }
 
 /*
  * Package up an I/O request on a vnode into a uio and do it.
  */
 int
 vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset,
     enum uio_seg segflg, int ioflg, struct ucred *active_cred,
     struct ucred *file_cred, ssize_t *aresid, struct thread *td)
 {
 	struct uio auio;
 	struct iovec aiov;
 	struct mount *mp;
 	struct ucred *cred;
 	void *rl_cookie;
 	struct vn_io_fault_args args;
 	int error, lock_flags;
 
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	aiov.iov_base = base;
 	aiov.iov_len = len;
 	auio.uio_resid = len;
 	auio.uio_offset = offset;
 	auio.uio_segflg = segflg;
 	auio.uio_rw = rw;
 	auio.uio_td = td;
 	error = 0;
 
 	if ((ioflg & IO_NODELOCKED) == 0) {
 		if ((ioflg & IO_RANGELOCKED) == 0) {
 			if (rw == UIO_READ) {
 				rl_cookie = vn_rangelock_rlock(vp, offset,
 				    offset + len);
 			} else {
 				rl_cookie = vn_rangelock_wlock(vp, offset,
 				    offset + len);
 			}
 		} else
 			rl_cookie = NULL;
 		mp = NULL;
 		if (rw == UIO_WRITE) { 
 			if (vp->v_type != VCHR &&
 			    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH))
 			    != 0)
 				goto out;
 			if (MNT_SHARED_WRITES(mp) ||
 			    ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount)))
 				lock_flags = LK_SHARED;
 			else
 				lock_flags = LK_EXCLUSIVE;
 		} else
 			lock_flags = LK_SHARED;
 		vn_lock(vp, lock_flags | LK_RETRY);
 	} else
 		rl_cookie = NULL;
 
 	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
 #ifdef MAC
 	if ((ioflg & IO_NOMACCHECK) == 0) {
 		if (rw == UIO_READ)
 			error = mac_vnode_check_read(active_cred, file_cred,
 			    vp);
 		else
 			error = mac_vnode_check_write(active_cred, file_cred,
 			    vp);
 	}
 #endif
 	if (error == 0) {
 		if (file_cred != NULL)
 			cred = file_cred;
 		else
 			cred = active_cred;
 		if (do_vn_io_fault(vp, &auio)) {
 			args.kind = VN_IO_FAULT_VOP;
 			args.cred = cred;
 			args.flags = ioflg;
 			args.args.vop_args.vp = vp;
 			error = vn_io_fault1(vp, &auio, &args, td);
 		} else if (rw == UIO_READ) {
 			error = VOP_READ(vp, &auio, ioflg, cred);
 		} else /* if (rw == UIO_WRITE) */ {
 			error = VOP_WRITE(vp, &auio, ioflg, cred);
 		}
 	}
 	if (aresid)
 		*aresid = auio.uio_resid;
 	else
 		if (auio.uio_resid && error == 0)
 			error = EIO;
 	if ((ioflg & IO_NODELOCKED) == 0) {
 		VOP_UNLOCK(vp, 0);
 		if (mp != NULL)
 			vn_finished_write(mp);
 	}
  out:
 	if (rl_cookie != NULL)
 		vn_rangelock_unlock(vp, rl_cookie);
 	return (error);
 }
 
 /*
  * Package up an I/O request on a vnode into a uio and do it.  The I/O
  * request is split up into smaller chunks and we try to avoid saturating
  * the buffer cache while potentially holding a vnode locked, so we 
  * check bwillwrite() before calling vn_rdwr().  We also call kern_yield()
  * to give other processes a chance to lock the vnode (either other processes
  * core'ing the same binary, or unrelated processes scanning the directory).
  */
 int
 vn_rdwr_inchunks(rw, vp, base, len, offset, segflg, ioflg, active_cred,
     file_cred, aresid, td)
 	enum uio_rw rw;
 	struct vnode *vp;
 	void *base;
 	size_t len;
 	off_t offset;
 	enum uio_seg segflg;
 	int ioflg;
 	struct ucred *active_cred;
 	struct ucred *file_cred;
 	size_t *aresid;
 	struct thread *td;
 {
 	int error = 0;
 	ssize_t iaresid;
 
 	do {
 		int chunk;
 
 		/*
 		 * Force `offset' to a multiple of MAXBSIZE except possibly
 		 * for the first chunk, so that filesystems only need to
 		 * write full blocks except possibly for the first and last
 		 * chunks.
 		 */
 		chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE;
 
 		if (chunk > len)
 			chunk = len;
 		if (rw != UIO_READ && vp->v_type == VREG)
 			bwillwrite();
 		iaresid = 0;
 		error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
 		    ioflg, active_cred, file_cred, &iaresid, td);
 		len -= chunk;	/* aresid calc already includes length */
 		if (error)
 			break;
 		offset += chunk;
 		base = (char *)base + chunk;
 		kern_yield(PRI_USER);
 	} while (len);
 	if (aresid)
 		*aresid = len + iaresid;
 	return (error);
 }
 
 off_t
 foffset_lock(struct file *fp, int flags)
 {
 	struct mtx *mtxp;
 	off_t res;
 
 	KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
 
 #if OFF_MAX <= LONG_MAX
 	/*
 	 * Caller only wants the current f_offset value.  Assume that
 	 * the long and shorter integer types reads are atomic.
 	 */
 	if ((flags & FOF_NOLOCK) != 0)
 		return (fp->f_offset);
 #endif
 
 	/*
 	 * According to McKusick the vn lock was protecting f_offset here.
 	 * It is now protected by the FOFFSET_LOCKED flag.
 	 */
 	mtxp = mtx_pool_find(mtxpool_sleep, fp);
 	mtx_lock(mtxp);
 	if ((flags & FOF_NOLOCK) == 0) {
 		while (fp->f_vnread_flags & FOFFSET_LOCKED) {
 			fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
 			msleep(&fp->f_vnread_flags, mtxp, PUSER -1,
 			    "vofflock", 0);
 		}
 		fp->f_vnread_flags |= FOFFSET_LOCKED;
 	}
 	res = fp->f_offset;
 	mtx_unlock(mtxp);
 	return (res);
 }
 
 void
 foffset_unlock(struct file *fp, off_t val, int flags)
 {
 	struct mtx *mtxp;
 
 	KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
 
 #if OFF_MAX <= LONG_MAX
 	if ((flags & FOF_NOLOCK) != 0) {
 		if ((flags & FOF_NOUPDATE) == 0)
 			fp->f_offset = val;
 		if ((flags & FOF_NEXTOFF) != 0)
 			fp->f_nextoff = val;
 		return;
 	}
 #endif
 
 	mtxp = mtx_pool_find(mtxpool_sleep, fp);
 	mtx_lock(mtxp);
 	if ((flags & FOF_NOUPDATE) == 0)
 		fp->f_offset = val;
 	if ((flags & FOF_NEXTOFF) != 0)
 		fp->f_nextoff = val;
 	if ((flags & FOF_NOLOCK) == 0) {
 		KASSERT((fp->f_vnread_flags & FOFFSET_LOCKED) != 0,
 		    ("Lost FOFFSET_LOCKED"));
 		if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING)
 			wakeup(&fp->f_vnread_flags);
 		fp->f_vnread_flags = 0;
 	}
 	mtx_unlock(mtxp);
 }
 
 void
 foffset_lock_uio(struct file *fp, struct uio *uio, int flags)
 {
 
 	if ((flags & FOF_OFFSET) == 0)
 		uio->uio_offset = foffset_lock(fp, flags);
 }
 
 void
 foffset_unlock_uio(struct file *fp, struct uio *uio, int flags)
 {
 
 	if ((flags & FOF_OFFSET) == 0)
 		foffset_unlock(fp, uio->uio_offset, flags);
 }
 
 static int
 get_advice(struct file *fp, struct uio *uio)
 {
 	struct mtx *mtxp;
 	int ret;
 
 	ret = POSIX_FADV_NORMAL;
 	if (fp->f_advice == NULL)
 		return (ret);
 
 	mtxp = mtx_pool_find(mtxpool_sleep, fp);
 	mtx_lock(mtxp);
 	if (uio->uio_offset >= fp->f_advice->fa_start &&
 	    uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end)
 		ret = fp->f_advice->fa_advice;
 	mtx_unlock(mtxp);
 	return (ret);
 }
 
 /*
  * File table vnode read routine.
  *
  * Reads from the vnode behind fp into uio.  The caller must pass
  * FOF_OFFSET in flags (asserted below).  The vnode is locked shared
  * around the optional MAC check and VOP_READ().  After a successful
  * read that advanced the offset while POSIX_FADV_NOREUSE advice was in
  * effect, VOP_ADVISE(POSIX_FADV_DONTNEED) is issued and its result
  * becomes the return value; otherwise the MAC/VOP_READ() result is
  * returned.
  */
 static int
 vn_read(fp, uio, active_cred, flags, td)
 	struct file *fp;
 	struct uio *uio;
 	struct ucred *active_cred;
 	int flags;
 	struct thread *td;
 {
 	struct vnode *vp;
 	struct mtx *mtxp;
 	int error, ioflag;
 	int advice;
 	off_t offset, start, end;
 
 	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
 	    uio->uio_td, td));
 	KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET"));
 	vp = fp->f_vnode;
 	/* Translate the open-file flags into IO_* flags for VOP_READ(). */
 	ioflag = 0;
 	if (fp->f_flag & FNONBLOCK)
 		ioflag |= IO_NDELAY;
 	if (fp->f_flag & O_DIRECT)
 		ioflag |= IO_DIRECT;
 	/* The advice is sampled before the vnode lock is taken. */
 	advice = get_advice(fp, uio);
 	vn_lock(vp, LK_SHARED | LK_RETRY);
 
 	switch (advice) {
 	case POSIX_FADV_NORMAL:
 	case POSIX_FADV_SEQUENTIAL:
 	case POSIX_FADV_NOREUSE:
 		ioflag |= sequential_heuristic(uio, fp);
 		break;
 	case POSIX_FADV_RANDOM:
 		/* Disable read-ahead for random I/O. */
 		break;
 	}
 	/* Remember where the read started; compared against below. */
 	offset = uio->uio_offset;
 
 #ifdef MAC
 	/* VOP_READ() is skipped when the MAC check fails. */
 	error = mac_vnode_check_read(active_cred, fp->f_cred, vp);
 	if (error == 0)
 #endif
 		error = VOP_READ(vp, uio, ioflag, fp->f_cred);
 	/* Record the offset at which this read finished. */
 	fp->f_nextoff = uio->uio_offset;
 	VOP_UNLOCK(vp, 0);
 	if (error == 0 && advice == POSIX_FADV_NOREUSE &&
 	    offset != uio->uio_offset) {
 		/*
 		 * Use POSIX_FADV_DONTNEED to flush clean pages and
 		 * buffers for the backing file after a
 		 * POSIX_FADV_NOREUSE read(2).  To optimize the common
 		 * case of using POSIX_FADV_NOREUSE with sequential
 		 * access, track the previous implicit DONTNEED
 		 * request and grow this request to include the
 		 * current read(2) in addition to the previous
 		 * DONTNEED.  With purely sequential access this will
 		 * cause the DONTNEED requests to continuously grow to
 		 * cover all of the previously read regions of the
 		 * file.  This allows filesystem blocks that are
 		 * accessed by multiple calls to read(2) to be flushed
 		 * once the last read(2) finishes.
 		 */
 		start = offset;
 		end = uio->uio_offset - 1;
 		mtxp = mtx_pool_find(mtxpool_sleep, fp);
 		mtx_lock(mtxp);
 		/*
 		 * f_advice is re-tested under the pool mutex.  If the
 		 * previous range ends right before this one, extend the
 		 * start backwards; otherwise, if the previous range
 		 * begins right after this one, extend the end forwards.
 		 */
 		if (fp->f_advice != NULL &&
 		    fp->f_advice->fa_advice == POSIX_FADV_NOREUSE) {
 			if (start != 0 && fp->f_advice->fa_prevend + 1 == start)
 				start = fp->f_advice->fa_prevstart;
 			else if (fp->f_advice->fa_prevstart != 0 &&
 			    fp->f_advice->fa_prevstart == end + 1)
 				end = fp->f_advice->fa_prevend;
 			fp->f_advice->fa_prevstart = start;
 			fp->f_advice->fa_prevend = end;
 		}
 		mtx_unlock(mtxp);
 		error = VOP_ADVISE(vp, start, end, POSIX_FADV_DONTNEED);
 	}
 	return (error);
 }
 
 /*
  * File table vnode write routine.
  *
  * Writes uio to the vnode behind fp.  The caller must pass FOF_OFFSET
  * in flags (asserted below).  For anything but a VCHR vnode the write
  * is bracketed by vn_start_write()/vn_finished_write().  The vnode is
  * locked shared when the mount allows shared writes and exclusive
  * otherwise.  After a successful write that advanced the offset while
  * POSIX_FADV_NOREUSE advice was in effect,
  * VOP_ADVISE(POSIX_FADV_DONTNEED) is issued and its result becomes the
  * return value.
  */
 static int
 vn_write(fp, uio, active_cred, flags, td)
 	struct file *fp;
 	struct uio *uio;
 	struct ucred *active_cred;
 	int flags;
 	struct thread *td;
 {
 	struct vnode *vp;
 	struct mount *mp;
 	struct mtx *mtxp;
 	int error, ioflag, lock_flags;
 	int advice;
 	off_t offset, start, end;
 
 	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
 	    uio->uio_td, td));
 	KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET"));
 	vp = fp->f_vnode;
 	if (vp->v_type == VREG)
 		bwillwrite();
 	/* Translate open-file and mount flags into IO_* flags. */
 	ioflag = IO_UNIT;
 	if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
 		ioflag |= IO_APPEND;
 	if (fp->f_flag & FNONBLOCK)
 		ioflag |= IO_NDELAY;
 	if (fp->f_flag & O_DIRECT)
 		ioflag |= IO_DIRECT;
 	if ((fp->f_flag & O_FSYNC) ||
 	    (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
 		ioflag |= IO_SYNC;
 	/* mp stays NULL for VCHR vnodes, which skip vn_start_write(). */
 	mp = NULL;
 	if (vp->v_type != VCHR &&
 	    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 		goto unlock;
 
 	advice = get_advice(fp, uio);
 
 	/*
 	 * Consult the mount returned by vn_start_write() when there is
 	 * one, and vp->v_mount otherwise, to decide whether a shared
 	 * vnode lock is sufficient for the write.
 	 */
 	if (MNT_SHARED_WRITES(mp) ||
 	    (mp == NULL && MNT_SHARED_WRITES(vp->v_mount))) {
 		lock_flags = LK_SHARED;
 	} else {
 		lock_flags = LK_EXCLUSIVE;
 	}
 
 	vn_lock(vp, lock_flags | LK_RETRY);
 	switch (advice) {
 	case POSIX_FADV_NORMAL:
 	case POSIX_FADV_SEQUENTIAL:
 	case POSIX_FADV_NOREUSE:
 		ioflag |= sequential_heuristic(uio, fp);
 		break;
 	case POSIX_FADV_RANDOM:
 		/* XXX: Is this correct? */
 		break;
 	}
 	/* Remember where the write started; compared against below. */
 	offset = uio->uio_offset;
 
 #ifdef MAC
 	/* VOP_WRITE() is skipped when the MAC check fails. */
 	error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
 	if (error == 0)
 #endif
 		error = VOP_WRITE(vp, uio, ioflag, fp->f_cred);
 	/* Record the offset at which this write finished. */
 	fp->f_nextoff = uio->uio_offset;
 	VOP_UNLOCK(vp, 0);
 	if (vp->v_type != VCHR)
 		vn_finished_write(mp);
 	if (error == 0 && advice == POSIX_FADV_NOREUSE &&
 	    offset != uio->uio_offset) {
 		/*
 		 * Use POSIX_FADV_DONTNEED to flush clean pages and
 		 * buffers for the backing file after a
 		 * POSIX_FADV_NOREUSE write(2).  To optimize the
 		 * common case of using POSIX_FADV_NOREUSE with
 		 * sequential access, track the previous implicit
 		 * DONTNEED request and grow this request to include
 		 * the current write(2) in addition to the previous
 		 * DONTNEED.  With purely sequential access this will
 		 * cause the DONTNEED requests to continuously grow to
 		 * cover all of the previously written regions of the
 		 * file.
 		 *
 		 * Note that the blocks just written are almost
 		 * certainly still dirty, so this only works when
 		 * VOP_ADVISE() calls from subsequent writes push out
 		 * the data written by this write(2) once the backing
 		 * buffers are clean.  However, as compared to forcing
 		 * IO_DIRECT, this gives much saner behavior.  Write
 		 * clustering is still allowed, and clean pages are
 		 * merely moved to the cache page queue rather than
 		 * outright thrown away.  This means a subsequent
 		 * read(2) can still avoid hitting the disk if the
 		 * pages have not been reclaimed.
 		 *
 		 * This does make POSIX_FADV_NOREUSE largely useless
 		 * with non-sequential access.  However, sequential
 		 * access is the more common use case and the flag is
 		 * merely advisory.
 		 */
 		start = offset;
 		end = uio->uio_offset - 1;
 		mtxp = mtx_pool_find(mtxpool_sleep, fp);
 		mtx_lock(mtxp);
 		/*
 		 * f_advice is re-tested under the pool mutex.  If the
 		 * previous range ends right before this one, extend the
 		 * start backwards; otherwise, if the previous range
 		 * begins right after this one, extend the end forwards.
 		 */
 		if (fp->f_advice != NULL &&
 		    fp->f_advice->fa_advice == POSIX_FADV_NOREUSE) {
 			if (start != 0 && fp->f_advice->fa_prevend + 1 == start)
 				start = fp->f_advice->fa_prevstart;
 			else if (fp->f_advice->fa_prevstart != 0 &&
 			    fp->f_advice->fa_prevstart == end + 1)
 				end = fp->f_advice->fa_prevend;
 			fp->f_advice->fa_prevstart = start;
 			fp->f_advice->fa_prevend = end;
 		}
 		mtx_unlock(mtxp);
 		error = VOP_ADVISE(vp, start, end, POSIX_FADV_DONTNEED);
 	}
 	
 	/*
 	 * Despite its name, nothing is unlocked at this label; it is
 	 * only the common return point.
 	 */
 unlock:
 	return (error);
 }
 
 /*
  * The vn_io_fault() is a wrapper around vn_read() and vn_write() to
  * prevent the following deadlock:
  *
  * Assume that the thread A reads from the vnode vp1 into userspace
  * buffer buf1 backed by the pages of vnode vp2.  If a page in buf1 is
  * currently not resident, then the system ends up with the call chain
  *   vn_read() -> VOP_READ(vp1) -> uiomove() -> [Page Fault] ->
  *     vm_fault(buf1) -> vnode_pager_getpages(vp2) -> VOP_GETPAGES(vp2)
  * which establishes lock order vp1->vn_lock, then vp2->vn_lock.
  * If, at the same time, thread B reads from vnode vp2 into buffer buf2
  * backed by the pages of vnode vp1, and some page in buf2 is not
  * resident, we get a reversed order vp2->vn_lock, then vp1->vn_lock.
  *
  * To prevent the lock order reversal and deadlock, vn_io_fault() does
  * not allow page faults to happen during VOP_READ() or VOP_WRITE().
  * Instead, it first tries to do the whole range i/o with pagefaults
  * disabled. If all pages in the i/o buffer are resident and mapped,
  * VOP will succeed (ignoring the genuine filesystem errors).
  * Otherwise, we get back EFAULT, and vn_io_fault() falls back to do
  * i/o in chunks, with all pages in the chunk prefaulted and held
  * using vm_fault_quick_hold_pages().
  *
  * Filesystems using this deadlock avoidance scheme should use the
  * array of the held pages from uio, saved in the curthread->td_ma,
  * instead of doing uiomove().  A helper function
  * vn_io_fault_uiomove() converts uiomove request into
  * uiomove_fromphys() over td_ma array.
  *
  * Since vnode locks do not cover the whole i/o anymore, rangelocks
  * make the current i/o request atomic with respect to other i/os and
  * truncations.
  */
 
 /*
  * Decode vn_io_fault_args and perform the corresponding i/o.
  *
  * A VN_IO_FAULT_FOP request is forwarded to the stored file-level
  * doio routine; a VN_IO_FAULT_VOP request is forwarded to VOP_READ()
  * or VOP_WRITE() according to uio->uio_rw.  Any other combination of
  * kind and direction panics.
  */
 static int
 vn_io_fault_doio(struct vn_io_fault_args *args, struct uio *uio,
     struct thread *td)
 {
 
 	if (args->kind == VN_IO_FAULT_FOP) {
 		return ((args->args.fop_args.doio)(args->args.fop_args.fp,
 		    uio, args->cred, args->flags, td));
 	}
 	if (args->kind == VN_IO_FAULT_VOP) {
 		if (uio->uio_rw == UIO_READ) {
 			return (VOP_READ(args->args.vop_args.vp, uio,
 			    args->flags, args->cred));
 		}
 		if (uio->uio_rw == UIO_WRITE) {
 			return (VOP_WRITE(args->args.vop_args.vp, uio,
 			    args->flags, args->cred));
 		}
 	}
 	panic("vn_io_fault_doio: unknown kind of io %d %d", args->kind,
 	    uio->uio_rw);
 }
 
 /*
  * Common code for vn_io_fault(), agnostic to the kind of i/o request.
  * Uses vn_io_fault_doio() to make the call to an actual i/o function.
  * Used from vn_rdwr() and vn_io_fault(), which encode the i/o request
  * into args and call vn_io_fault1() to handle faults during the user
  * mode buffer accesses.
  *
  * The whole request is first attempted with page faults disabled.
  * Only when that attempt returns EFAULT is the remainder redone in
  * chunks of at most io_hold_cnt pages, each chunk's user pages being
  * held with vm_fault_quick_hold_pages() and published through
  * td->td_ma / td->td_ma_cnt for the duration of the chunk.  Returns
  * the error of the last i/o attempt, or EFAULT when the user pages
  * cannot be held.
  */
 static int
 vn_io_fault1(struct vnode *vp, struct uio *uio, struct vn_io_fault_args *args,
     struct thread *td)
 {
 	vm_page_t ma[io_hold_cnt + 2];
 	struct uio *uio_clone, short_uio;
 	struct iovec short_iovec[1];
 	vm_page_t *prev_td_ma;
 	vm_prot_t prot;
 	vm_offset_t addr, end;
 	size_t len, resid;
 	ssize_t adv;
 	int error, cnt, save, saveheld, prev_td_ma_cnt;
 
 	/*
 	 * A read stores into the user buffer and a write loads from it,
 	 * hence the protection requested for the held pages is the
 	 * opposite of the i/o direction.
 	 */
 	prot = uio->uio_rw == UIO_READ ? VM_PROT_WRITE : VM_PROT_READ;
 
 	/*
 	 * The UFS follows IO_UNIT directive and replays back both
 	 * uio_offset and uio_resid if an error is encountered during the
 	 * operation.  But, since the iovec may be already advanced,
 	 * uio is still in an inconsistent state.
 	 *
 	 * Cache a copy of the original uio, which is advanced to the redo
 	 * point using UIO_NOCOPY below.
 	 */
 	uio_clone = cloneuio(uio);
 	resid = uio->uio_resid;
 
 	short_uio.uio_segflg = UIO_USERSPACE;
 	short_uio.uio_rw = uio->uio_rw;
 	short_uio.uio_td = uio->uio_td;
 
 	/* First attempt: whole request, page faults disabled. */
 	save = vm_fault_disable_pagefaults();
 	error = vn_io_fault_doio(args, uio, td);
 	if (error != EFAULT)
 		goto out;
 
 	/*
 	 * Advance the clone, without copying any data, by the amount the
 	 * first attempt already transferred.
 	 */
 	atomic_add_long(&vn_io_faults_cnt, 1);
 	uio_clone->uio_segflg = UIO_NOCOPY;
 	uiomove(NULL, resid - uio->uio_resid, uio_clone);
 	uio_clone->uio_segflg = uio->uio_segflg;
 
 	/* Save the thread's held-page state; it is restored after the loop. */
 	saveheld = curthread_pflags_set(TDP_UIOHELD);
 	prev_td_ma = td->td_ma;
 	prev_td_ma_cnt = td->td_ma_cnt;
 
 	while (uio_clone->uio_resid != 0) {
 		len = uio_clone->uio_iov->iov_len;
 		/* Step over an exhausted iovec. */
 		if (len == 0) {
 			KASSERT(uio_clone->uio_iovcnt >= 1,
 			    ("iovcnt underflow"));
 			uio_clone->uio_iov++;
 			uio_clone->uio_iovcnt--;
 			continue;
 		}
 		/* Clamp the chunk to io_hold_cnt pages worth of bytes. */
 		if (len > io_hold_cnt * PAGE_SIZE)
 			len = io_hold_cnt * PAGE_SIZE;
 		addr = (uintptr_t)uio_clone->uio_iov->iov_base;
 		end = round_page(addr + len);
 		/* The end address wrapped around. */
 		if (end < addr) {
 			error = EFAULT;
 			break;
 		}
 		/*
 		 * NOTE(review): the value computed here is overwritten by
 		 * the vm_fault_quick_hold_pages() result just below before
 		 * it is ever read.
 		 */
 		cnt = atop(end - trunc_page(addr));
 		/*
 		 * A perfectly misaligned address and length could cause
 		 * both the start and the end of the chunk to use partial
 		 * page.  +2 accounts for such a situation.
 		 */
 		cnt = vm_fault_quick_hold_pages(&td->td_proc->p_vmspace->vm_map,
 		    addr, len, prot, ma, io_hold_cnt + 2);
 		if (cnt == -1) {
 			error = EFAULT;
 			break;
 		}
 		/* Describe just this chunk with a single-iovec uio. */
 		short_uio.uio_iov = &short_iovec[0];
 		short_iovec[0].iov_base = (void *)addr;
 		short_uio.uio_iovcnt = 1;
 		short_uio.uio_resid = short_iovec[0].iov_len = len;
 		short_uio.uio_offset = uio_clone->uio_offset;
 		td->td_ma = ma;
 		td->td_ma_cnt = cnt;
 
 		error = vn_io_fault_doio(args, &short_uio, td);
 		vm_page_unhold_pages(ma, cnt);
 		/* Number of bytes this chunk actually transferred. */
 		adv = len - short_uio.uio_resid;
 
 		/* Advance the clone's current iovec and totals by adv. */
 		uio_clone->uio_iov->iov_base =
 		    (char *)uio_clone->uio_iov->iov_base + adv;
 		uio_clone->uio_iov->iov_len -= adv;
 		uio_clone->uio_resid -= adv;
 		uio_clone->uio_offset += adv;
 
 		/* Only resid and offset are updated in the caller's uio. */
 		uio->uio_resid -= adv;
 		uio->uio_offset += adv;
 
 		/* Stop on error or when no progress was made. */
 		if (error != 0 || adv == 0)
 			break;
 	}
 	td->td_ma = prev_td_ma;
 	td->td_ma_cnt = prev_td_ma_cnt;
 	curthread_pflags_restore(saveheld);
 out:
 	vm_fault_enable_pagefaults(save);
 	free(uio_clone, M_IOV);
 	return (error);
 }
 
 /*
  * Read/write entry point that wraps vn_read()/vn_write().
  *
  * The file offset is obtained through foffset_lock_uio() and released
  * through foffset_unlock_uio().  When do_vn_io_fault() says so, the
  * i/o is performed through vn_io_fault1() under a rangelock; otherwise
  * vn_read()/vn_write() is called directly.  In both cases the inner
  * routine is called with FOF_OFFSET added to flags.
  */
 static int
 vn_io_fault(struct file *fp, struct uio *uio, struct ucred *active_cred,
     int flags, struct thread *td)
 {
 	fo_rdwr_t *doio;
 	struct vnode *vp;
 	void *rl_cookie;
 	struct vn_io_fault_args args;
 	int error;
 
 	doio = uio->uio_rw == UIO_READ ? vn_read : vn_write;
 	vp = fp->f_vnode;
 	foffset_lock_uio(fp, uio, flags);
 	if (do_vn_io_fault(vp, uio)) {
 		args.kind = VN_IO_FAULT_FOP;
 		args.args.fop_args.fp = fp;
 		args.args.fop_args.doio = doio;
 		args.cred = active_cred;
 		args.flags = flags | FOF_OFFSET;
 		/*
 		 * Reads take a read rangelock over the requested span.
 		 * Writes take a write rangelock over the requested span,
 		 * except for O_APPEND files and callers that did not pass
 		 * FOF_OFFSET, which lock the whole file.
 		 */
 		if (uio->uio_rw == UIO_READ) {
 			rl_cookie = vn_rangelock_rlock(vp, uio->uio_offset,
 			    uio->uio_offset + uio->uio_resid);
 		} else if ((fp->f_flag & O_APPEND) != 0 ||
 		    (flags & FOF_OFFSET) == 0) {
 			/* For appenders, punt and lock the whole range. */
 			rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
 		} else {
 			rl_cookie = vn_rangelock_wlock(vp, uio->uio_offset,
 			    uio->uio_offset + uio->uio_resid);
 		}
 		error = vn_io_fault1(vp, uio, &args, td);
 		vn_rangelock_unlock(vp, rl_cookie);
 	} else {
 		error = doio(fp, uio, active_cred, flags | FOF_OFFSET, td);
 	}
 	foffset_unlock_uio(fp, uio, flags);
 	return (error);
 }
 
 /*
  * Helper function to perform the requested uiomove operation using
  * the held pages for uio->uio_iov[0].iov_base buffer instead of
  * copyin/copyout.  Access to the pages with uiomove_fromphys()
  * instead of iov_base prevents page faults that could occur due to
  * pmap_collect() invalidating the mapping created by
  * vm_fault_quick_hold_pages(), or pageout daemon, page laundry or
  * object cleanup revoking the write access from page mappings.
  *
  * Filesystems that specify MNTK_NO_IOPF shall use vn_io_fault_uiomove()
  * instead of plain uiomove().
  *
  * Falls back to plain uiomove() when the current thread does not have
  * TDP_UIOHELD set or when uio is not a UIO_USERSPACE request.  Returns
  * the uiomove()/uiomove_fromphys() error.
  */
 int
 vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio)
 {
 	struct uio transp_uio;
 	struct iovec transp_iov[1];
 	struct thread *td;
 	size_t adv;
 	int error, pgadv;
 
 	td = curthread;
 	if ((td->td_pflags & TDP_UIOHELD) == 0 ||
 	    uio->uio_segflg != UIO_USERSPACE)
 		return (uiomove(data, xfersize, uio));
 
 	KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt));
 	/* Build a kernel-space uio that describes the data buffer. */
 	transp_iov[0].iov_base = data;
 	transp_uio.uio_iov = &transp_iov[0];
 	transp_uio.uio_iovcnt = 1;
 	/* Never move more than what is left in the caller's uio. */
 	if (xfersize > uio->uio_resid)
 		xfersize = uio->uio_resid;
 	transp_uio.uio_resid = transp_iov[0].iov_len = xfersize;
 	transp_uio.uio_offset = 0;
 	transp_uio.uio_segflg = UIO_SYSSPACE;
 	/*
 	 * Since transp_iov points to data, and td_ma page array
 	 * corresponds to original uio->uio_iov, we need to invert the
 	 * direction of the i/o operation as passed to
 	 * uiomove_fromphys().
 	 */
 	switch (uio->uio_rw) {
 	case UIO_WRITE:
 		transp_uio.uio_rw = UIO_READ;
 		break;
 	case UIO_READ:
 		transp_uio.uio_rw = UIO_WRITE;
 		break;
 	}
 	transp_uio.uio_td = uio->uio_td;
 	/* The page offset of iov_base locates the data in td_ma[0]. */
 	error = uiomove_fromphys(td->td_ma,
 	    ((vm_offset_t)uio->uio_iov->iov_base) & PAGE_MASK,
 	    xfersize, &transp_uio);
 	/* Bytes actually moved, and page boundaries crossed doing so. */
 	adv = xfersize - transp_uio.uio_resid;
 	pgadv =
 	    (((vm_offset_t)uio->uio_iov->iov_base + adv) >> PAGE_SHIFT) -
 	    (((vm_offset_t)uio->uio_iov->iov_base) >> PAGE_SHIFT);
 	/* Drop the fully consumed pages from the thread's page array. */
 	td->td_ma += pgadv;
 	KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt,
 	    pgadv));
 	td->td_ma_cnt -= pgadv;
 	/* Advance the caller's uio by the amount moved. */
 	uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + adv;
 	uio->uio_iov->iov_len -= adv;
 	uio->uio_resid -= adv;
 	uio->uio_offset += adv;
 	return (error);
 }
 
 /*
  * Page-array counterpart of vn_io_fault_uiomove(): move up to xfersize
  * bytes between the pages in ma (starting at byte offset) and the
  * buffer described by uio.
  *
  * Falls back to uiomove_fromphys() when the current thread does not
  * have TDP_UIOHELD set or when uio is not a UIO_USERSPACE request.
  * Otherwise the copy is done page-to-page with pmap_copy_pages()
  * against the held pages in td->td_ma, and 0 is always returned.
  */
 int
 vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize,
     struct uio *uio)
 {
 	struct thread *td;
 	vm_offset_t iov_base;
 	int cnt, pgadv;
 
 	td = curthread;
 	if ((td->td_pflags & TDP_UIOHELD) == 0 ||
 	    uio->uio_segflg != UIO_USERSPACE)
 		return (uiomove_fromphys(ma, offset, xfersize, uio));
 
 	KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt));
 	/* Never move more than what is left in the caller's uio. */
 	cnt = xfersize > uio->uio_resid ? uio->uio_resid : xfersize;
 	iov_base = (vm_offset_t)uio->uio_iov->iov_base;
 	/*
 	 * The held user pages (td_ma) are the first page array passed
 	 * for a write and the second one for a read.
 	 */
 	switch (uio->uio_rw) {
 	case UIO_WRITE:
 		pmap_copy_pages(td->td_ma, iov_base & PAGE_MASK, ma,
 		    offset, cnt);
 		break;
 	case UIO_READ:
 		pmap_copy_pages(ma, offset, td->td_ma, iov_base & PAGE_MASK,
 		    cnt);
 		break;
 	}
 	/* Drop the fully consumed pages from the thread's page array. */
 	pgadv = ((iov_base + cnt) >> PAGE_SHIFT) - (iov_base >> PAGE_SHIFT);
 	td->td_ma += pgadv;
 	KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt,
 	    pgadv));
 	td->td_ma_cnt -= pgadv;
 	/* Advance the caller's uio by the amount moved. */
 	uio->uio_iov->iov_base = (char *)(iov_base + cnt);
 	uio->uio_iov->iov_len -= cnt;
 	uio->uio_resid -= cnt;
 	uio->uio_offset += cnt;
 	return (0);
 }
 
 
 /*
  * File table truncate routine.
  *
  * Sets the size of the vnode behind fp to length through VOP_SETATTR().
  * Returns EISDIR for a directory, otherwise the first error reported
  * by vn_start_write(), the MAC check (when compiled in), vn_writechk()
  * or VOP_SETATTR(), and 0 on success.
  */
 static int
 vn_truncate(struct file *fp, off_t length, struct ucred *active_cred,
     struct thread *td)
 {
 	struct vattr vattr;
 	struct mount *mp;
 	struct vnode *vp;
 	void *rl_cookie;
 	int error;
 
 	vp = fp->f_vnode;
 
 	/*
 	 * Lock the whole range for truncation.  Otherwise split i/o
 	 * might happen partly before and partly after the truncation.
 	 */
 	rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
 	error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
 	if (error)
 		goto out1;
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	if (vp->v_type == VDIR) {
 		error = EISDIR;
 		goto out;
 	}
 #ifdef MAC
 	error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
 	if (error)
 		goto out;
 #endif
 	error = vn_writechk(vp);
 	if (error == 0) {
 		/* Only va_size is set; VATTR_NULL() resets the rest. */
 		VATTR_NULL(&vattr);
 		vattr.va_size = length;
 		error = VOP_SETATTR(vp, &vattr, fp->f_cred);
 	}
 	/* Unwind in the reverse order of acquisition. */
 out:
 	VOP_UNLOCK(vp, 0);
 	vn_finished_write(mp);
 out1:
 	vn_rangelock_unlock(vp, rl_cookie);
 	return (error);
 }
 
 /*
  * File table vnode stat routine.
  *
  * Takes a shared lock on the vnode behind fp, lets vn_stat() fill in
  * *sb, drops the lock and returns vn_stat()'s result.
  */
 static int
 vn_statfile(fp, sb, active_cred, td)
 	struct file *fp;
 	struct stat *sb;
 	struct ucred *active_cred;
 	struct thread *td;
 {
 	struct vnode *vp;
 	int rv;
 
 	vp = fp->f_vnode;
 	vn_lock(vp, LK_SHARED | LK_RETRY);
 	rv = vn_stat(vp, sb, active_cred, fp->f_cred, td);
 	VOP_UNLOCK(vp, 0);
 	return (rv);
 }
 
 /*
  * Stat a vnode; implementation for the stat syscall
  */
 int
 vn_stat(vp, sb, active_cred, file_cred, td)
 	struct vnode *vp;
 	register struct stat *sb;
 	struct ucred *active_cred;
 	struct ucred *file_cred;
 	struct thread *td;
 {
 	struct vattr vattr;
 	register struct vattr *vap;
 	int error;
 	u_short mode;
 
 #ifdef MAC
 	error = mac_vnode_check_stat(active_cred, file_cred, vp);
 	if (error)
 		return (error);
 #endif
 
 	vap = &vattr;
 
 	/*
 	 * Initialize defaults for new and unusual fields, so that file
 	 * systems which don't support these fields don't need to know
 	 * about them.
 	 */
 	vap->va_birthtime.tv_sec = -1;
 	vap->va_birthtime.tv_nsec = 0;
 	vap->va_fsid = VNOVAL;
 	vap->va_rdev = NODEV;
 
 	error = VOP_GETATTR(vp, vap, active_cred);
 	if (error)
 		return (error);
 
 	/*
 	 * Zero the spare stat fields
 	 */
 	bzero(sb, sizeof *sb);
 
 	/*
 	 * Copy from vattr table
 	 */
 	if (vap->va_fsid != VNOVAL)
 		sb->st_dev = vap->va_fsid;
 	else
 		sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];
 	sb->st_ino = vap->va_fileid;
 	mode = vap->va_mode;
 	switch (vap->va_type) {
 	case VREG:
 		mode |= S_IFREG;
 		break;
 	case VDIR:
 		mode |= S_IFDIR;
 		break;
 	case VBLK:
 		mode |= S_IFBLK;
 		break;
 	case VCHR:
 		mode |= S_IFCHR;
 		break;
 	case VLNK:
 		mode |= S_IFLNK;
 		break;
 	case VSOCK:
 		mode |= S_IFSOCK;
 		break;
 	case VFIFO:
 		mode |= S_IFIFO;
 		break;
 	default:
 		return (EBADF);
 	};
 	sb->st_mode = mode;
 	sb->st_nlink = vap->va_nlink;
 	sb->st_uid = vap->va_uid;
 	sb->st_gid = vap->va_gid;
 	sb->st_rdev = vap->va_rdev;
 	if (vap->va_size > OFF_MAX)
 		return (EOVERFLOW);
 	sb->st_size = vap->va_size;
 	sb->st_atim = vap->va_atime;
 	sb->st_mtim = vap->va_mtime;
 	sb->st_ctim = vap->va_ctime;
 	sb->st_birthtim = vap->va_birthtime;
 
         /*
 	 * According to www.opengroup.org, the meaning of st_blksize is 
 	 *   "a filesystem-specific preferred I/O block size for this 
 	 *    object.  In some filesystem types, this may vary from file
 	 *    to file"
 	 * Use miminum/default of PAGE_SIZE (e.g. for VCHR).
 	 */
 
 	sb->st_blksize = max(PAGE_SIZE, vap->va_blocksize);
 	
 	sb->st_flags = vap->va_flags;
 	if (priv_check(td, PRIV_VFS_GENERATION))
 		sb->st_gen = 0;
 	else
 		sb->st_gen = vap->va_gen;
 
 	sb->st_blocks = vap->va_bytes / S_BLKSIZE;
 	return (0);
 }
 
 /*
  * File table vnode ioctl routine.
  */
 static int
 vn_ioctl(fp, com, data, active_cred, td)
 	struct file *fp;
 	u_long com;
 	void *data;
 	struct ucred *active_cred;
 	struct thread *td;
 {
 	struct vattr vattr;
 	struct vnode *vp;
 	int error;
 
 	vp = fp->f_vnode;
 	switch (vp->v_type) {
 	case VDIR:
 	case VREG:
 		switch (com) {
 		case FIONREAD:
 			vn_lock(vp, LK_SHARED | LK_RETRY);
 			error = VOP_GETATTR(vp, &vattr, active_cred);
 			VOP_UNLOCK(vp, 0);
 			if (error == 0)
 				*(int *)data = vattr.va_size - fp->f_offset;
 			return (error);
 		case FIONBIO:
 		case FIOASYNC:
 			return (0);
 		default:
 			return (VOP_IOCTL(vp, com, data, fp->f_flag,
 			    active_cred, td));
 		}
 	default:
 		return (ENOTTY);
 	}
 }
 
 /*
  * File table vnode poll routine.
  *
  * Returns the result of VOP_POLL() on the vnode behind fp.  When MAC
  * is compiled in, mac_vnode_check_poll() is consulted first under an
  * exclusive vnode lock, and a non-zero result from it is returned
  * without calling VOP_POLL().
  */
 static int
 vn_poll(fp, events, active_cred, td)
 	struct file *fp;
 	int events;
 	struct ucred *active_cred;
 	struct thread *td;
 {
 	struct vnode *vp;
 	int error;
 
 	vp = fp->f_vnode;
 #ifdef MAC
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	error = mac_vnode_check_poll(active_cred, fp->f_cred, vp);
 	VOP_UNLOCK(vp, 0);
 	/* This unbraced if guards the VOP_POLL() statement after #endif. */
 	if (!error)
 #endif
 
 	error = VOP_POLL(vp, events, fp->f_cred, td);
 	return (error);
 }
 
 /*
  * Acquire the requested lock and then check for validity.  LK_RETRY
  * permits vn_lock to return doomed vnodes.
  *
  * flags must contain a lock type (asserted).  file and line are
  * forwarded to VOP_LOCK1().  Returns 0 with the vnode locked, ENOENT
  * when the lock was obtained without LK_RETRY on a VI_DOOMED vnode (the
  * lock is dropped again in that case), or the VOP_LOCK1() error.
  */
 int
 _vn_lock(struct vnode *vp, int flags, char *file, int line)
 {
 	int error;
 
 	VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
 	    ("vn_lock called with no locktype."));
 	do {
 #ifdef DEBUG_VFS_LOCKS
 		KASSERT(vp->v_holdcnt != 0,
 		    ("vn_lock %p: zero hold count", vp));
 #endif
 		error = VOP_LOCK1(vp, flags, file, line);
 		flags &= ~LK_INTERLOCK;	/* Interlock is always dropped. */
 		KASSERT((flags & LK_RETRY) == 0 || error == 0,
 		    ("LK_RETRY set with incompatible flags (0x%x) or an error occured (%d)",
 		    flags, error));
 		/*
 		 * Callers specify LK_RETRY if they wish to get dead vnodes.
 		 * If RETRY is not set, we return ENOENT instead.
 		 */
 		if (error == 0 && vp->v_iflag & VI_DOOMED &&
 		    (flags & LK_RETRY) == 0) {
 			VOP_UNLOCK(vp, 0);
 			error = ENOENT;
 			break;
 		}
 		/* Try again only when LK_RETRY is set and locking failed. */
 	} while (flags & LK_RETRY && error != 0);
 	return (error);
 }
 
 /*
  * File table vnode close routine.
  *
  * Switches fp over to badfileops, closes the vnode with vn_close() and
  * returns its result.  For a DTYPE_VNODE file with FHASLOCK set, the
  * whole-file F_FLOCK lock is released with VOP_ADVLOCK(F_UNLCK) after
  * the close; the result of that unlock is ignored.
  */
 static int
 vn_closefile(fp, td)
 	struct file *fp;
 	struct thread *td;
 {
 	struct vnode *vp;
 	struct flock lf;
 	int error;
 
 	vp = fp->f_vnode;
 	fp->f_ops = &badfileops;
 
 	/*
 	 * Take an extra reference across vn_close(), presumably so that
 	 * vp is still valid for the VOP_ADVLOCK() call below; it is
 	 * dropped with vrele() afterwards.
 	 */
 	if (fp->f_type == DTYPE_VNODE && fp->f_flag & FHASLOCK)
 		vref(vp);
 
 	error = vn_close(vp, fp->f_flag, fp->f_cred, td);
 
 	if (fp->f_type == DTYPE_VNODE && fp->f_flag & FHASLOCK) {
 		/* l_start 0 with l_len 0 from SEEK_SET: the whole file. */
 		lf.l_whence = SEEK_SET;
 		lf.l_start = 0;
 		lf.l_len = 0;
 		lf.l_type = F_UNLCK;
 		(void) VOP_ADVLOCK(vp, fp, F_UNLCK, &lf, F_FLOCK);
 		vrele(vp);
 	}
 	return (error);
 }
 
 static bool
 vn_suspendable_mp(struct mount *mp)
 {
 
 	return ((mp->mnt_kern_flag & MNTK_SUSPENDABLE) != 0);
 }
 
 /*
  * Determine whether writes for vp, or for *mpp when vp is NULL, are
  * subject to suspension.  When vp is given, *mpp is overwritten with
  * vp->v_mount.  A NULL mount is reported as not suspendable.
  */
 static bool
 vn_suspendable(struct vnode *vp, struct mount **mpp)
 {
 	struct mount *mp;
 
 	if (vp != NULL)
 		*mpp = vp->v_mount;
 	mp = *mpp;
 	return (mp != NULL && vn_suspendable_mp(mp));
 }
 
 /*
  * Preparing to start a filesystem write operation. If the operation is
  * permitted, then we bump the count of operations in progress and
  * proceed. If a suspend request is in progress, we wait until the
  * suspension is over, and then proceed.
  *
  * Must be called with the mount interlock held (asserted); the
  * interlock is released before returning.  On error, and whenever
  * V_XSLEEP is set, the mount reference is dropped with MNT_REL() and
  * mnt_writeopcount is left untouched.  Returns 0, EWOULDBLOCK when
  * V_NOWAIT is set and a suspension is in progress, or the msleep()
  * error.
  */
 static int
 vn_start_write_locked(struct mount *mp, int flags)
 {
 	int error, mflags;
 
 	mtx_assert(MNT_MTX(mp), MA_OWNED);
 	error = 0;
 
 	/*
 	 * Check on status of suspension.
 	 */
 	/*
 	 * The wait is skipped only for the thread that both has
 	 * TDP_IGNSUSP set and owns the suspension.
 	 */
 	if ((curthread->td_pflags & TDP_IGNSUSP) == 0 ||
 	    mp->mnt_susp_owner != curthread) {
 		/* PCATCH is honoured only when the fs type has VFCF_SBDRY. */
 		mflags = ((mp->mnt_vfc->vfc_flags & VFCF_SBDRY) != 0 ?
 		    (flags & PCATCH) : 0) | (PUSER - 1);
 		while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
 			if (flags & V_NOWAIT) {
 				error = EWOULDBLOCK;
 				goto unlock;
 			}
 			error = msleep(&mp->mnt_flag, MNT_MTX(mp), mflags,
 			    "suspfs", 0);
 			if (error)
 				goto unlock;
 		}
 	}
 	/* V_XSLEEP callers only wanted to wait; no write is started. */
 	if (flags & V_XSLEEP)
 		goto unlock;
 	mp->mnt_writeopcount++;
 unlock:
 	if (error != 0 || (flags & V_XSLEEP) != 0)
 		MNT_REL(mp);
 	MNT_IUNLOCK(mp);
 	return (error);
 }
 
 /*
  * Begin a write operation on the mount that vp writes to, or on *mpp
  * when vp is NULL.  V_MNTREF may only be passed with a NULL vp and a
  * non-NULL *mpp (asserted).
  *
  * Returns 0 without doing anything further when the mount is not
  * suspendable, when VOP_GETWRITEMOUNT() returns EOPNOTSUPP, or when no
  * mount is found; *mpp is set to NULL when VOP_GETWRITEMOUNT() fails.
  * Otherwise the result of vn_start_write_locked() is returned.
  */
 int
 vn_start_write(struct vnode *vp, struct mount **mpp, int flags)
 {
 	struct mount *mp;
 	int error;
 
 	KASSERT((flags & V_MNTREF) == 0 || (*mpp != NULL && vp == NULL),
 	    ("V_MNTREF requires mp"));
 	if (!vn_suspendable(vp, mpp)) {
 		/* Release the reference the V_MNTREF caller handed over. */
 		if ((flags & V_MNTREF) != 0)
 			vfs_rel(*mpp);
 		return (0);
 	}
 
 	error = 0;
 	/*
 	 * If a vnode is provided, get and return the mount point that
 	 * to which it will write.
 	 */
 	if (vp != NULL) {
 		if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
 			*mpp = NULL;
 			/* EOPNOTSUPP is not treated as a failure. */
 			if (error != EOPNOTSUPP)
 				return (error);
 			return (0);
 		}
 	}
 	if ((mp = *mpp) == NULL)
 		return (0);
 
 	/*
 	 * VOP_GETWRITEMOUNT() returns with the mp refcount held through
 	 * a vfs_ref().
 	 * As long as a vnode is not provided we need to acquire a
 	 * refcount for the provided mountpoint too, in order to
 	 * emulate a vfs_ref().
 	 */
 	MNT_ILOCK(mp);
 	if (vp == NULL && (flags & V_MNTREF) == 0)
 		MNT_REF(mp);
 
 	/* vn_start_write_locked() releases the interlock taken above. */
 	return (vn_start_write_locked(mp, flags));
 }
 
 /*
  * Secondary suspension. Used by operations such as vop_inactive
  * routines that are needed by the higher level functions. These
  * are allowed to proceed until all the higher level functions have
  * completed (indicated by mnt_writeopcount dropping to zero). At that
  * time, these operations are halted until the suspension is over.
  *
  * Returns 0 once the secondary write has been accounted for (or when
  * there is nothing to account against), EWOULDBLOCK when V_NOWAIT is
  * set and the mount is in MNTK_SUSPENDED or MNTK_SUSPEND2 state, or
  * the VOP_GETWRITEMOUNT()/msleep() error.
  */
 int
 vn_start_secondary_write(struct vnode *vp, struct mount **mpp, int flags)
 {
 	struct mount *mp;
 	int error;
 
 	KASSERT((flags & V_MNTREF) == 0 || (*mpp != NULL && vp == NULL),
 	    ("V_MNTREF requires mp"));
 	if (!vn_suspendable(vp, mpp)) {
 		/* Release the reference the V_MNTREF caller handed over. */
 		if ((flags & V_MNTREF) != 0)
 			vfs_rel(*mpp);
 		return (0);
 	}
 
  retry:
 	if (vp != NULL) {
 		if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
 			*mpp = NULL;
 			/* EOPNOTSUPP is not treated as a failure. */
 			if (error != EOPNOTSUPP)
 				return (error);
 			return (0);
 		}
 	}
 	/*
 	 * If we are not suspended or have not yet reached suspended
 	 * mode, then let the operation proceed.
 	 */
 	if ((mp = *mpp) == NULL)
 		return (0);
 
 	/*
 	 * VOP_GETWRITEMOUNT() returns with the mp refcount held through
 	 * a vfs_ref().
 	 * As long as a vnode is not provided we need to acquire a
 	 * refcount for the provided mountpoint too, in order to
 	 * emulate a vfs_ref().
 	 */
 	MNT_ILOCK(mp);
 	if (vp == NULL && (flags & V_MNTREF) == 0)
 		MNT_REF(mp);
 	if ((mp->mnt_kern_flag & (MNTK_SUSPENDED | MNTK_SUSPEND2)) == 0) {
 		mp->mnt_secondary_writes++;
 		mp->mnt_secondary_accwrites++;
 		MNT_IUNLOCK(mp);
 		return (0);
 	}
 	if (flags & V_NOWAIT) {
 		MNT_REL(mp);
 		MNT_IUNLOCK(mp);
 		return (EWOULDBLOCK);
 	}
 	/*
 	 * Wait for the suspension to finish.
 	 */
 	/* PDROP: msleep() returns with the mount interlock released. */
 	error = msleep(&mp->mnt_flag, MNT_MTX(mp), (PUSER - 1) | PDROP |
 	    ((mp->mnt_vfc->vfc_flags & VFCF_SBDRY) != 0 ? (flags & PCATCH) : 0),
 	    "suspfs", 0);
 	/* Drop the reference before starting over or failing. */
 	vfs_rel(mp);
 	if (error == 0)
 		goto retry;
 	return (error);
 }
 
 /*
  * Filesystem write operation has completed. If we are suspending and this
  * operation is the last one, notify the suspender that the suspension is
  * now in effect.
  *
  * A NULL or non-suspendable mp is ignored.  Otherwise the mount
  * reference is released and mnt_writeopcount is decremented under the
  * mount interlock; a negative count panics.
  */
 void
 vn_finished_write(mp)
 	struct mount *mp;
 {
 	if (mp == NULL || !vn_suspendable_mp(mp))
 		return;
 	MNT_ILOCK(mp);
 	MNT_REL(mp);
 	mp->mnt_writeopcount--;
 	if (mp->mnt_writeopcount < 0)
 		panic("vn_finished_write: neg cnt");
 	/* Wake the thread sleeping on mnt_writeopcount, if suspending. */
 	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
 	    mp->mnt_writeopcount <= 0)
 		wakeup(&mp->mnt_writeopcount);
 	MNT_IUNLOCK(mp);
 }
 
 
 /*
  * Filesystem secondary write operation has completed. If we are
  * suspending and this operation is the last one, notify the suspender
  * that the suspension is now in effect.
  *
  * A NULL or non-suspendable mp is ignored.  Otherwise the mount
  * reference is released and mnt_secondary_writes is decremented under
  * the mount interlock; a negative count panics.
  */
 void
 vn_finished_secondary_write(mp)
 	struct mount *mp;
 {
 	if (mp == NULL || !vn_suspendable_mp(mp))
 		return;
 	MNT_ILOCK(mp);
 	MNT_REL(mp);
 	mp->mnt_secondary_writes--;
 	if (mp->mnt_secondary_writes < 0)
 		panic("vn_finished_secondary_write: neg cnt");
 	/* Wake any sleeper on mnt_secondary_writes, if suspending. */
 	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
 	    mp->mnt_secondary_writes <= 0)
 		wakeup(&mp->mnt_secondary_writes);
 	MNT_IUNLOCK(mp);
 }
 
 
 
 /*
  * Request a filesystem to suspend write operations.
  *
  * mp must be suspendable (asserted with MPASS).  Returns EALREADY when
  * the calling thread already owns the suspension, EBUSY when
  * VS_SKIP_UNMOUNT is set and the mount is being unmounted, and
  * otherwise the result of VFS_SYNC(mp, MNT_SUSPEND).  When that sync
  * fails the suspension is undone with vfs_write_resume() before
  * returning.
  */
 int
 vfs_write_suspend(struct mount *mp, int flags)
 {
 	int error;
 
 	MPASS(vn_suspendable_mp(mp));
 
 	MNT_ILOCK(mp);
 	if (mp->mnt_susp_owner == curthread) {
 		MNT_IUNLOCK(mp);
 		return (EALREADY);
 	}
 	/* Wait for a suspension started by another thread to end. */
 	while (mp->mnt_kern_flag & MNTK_SUSPEND)
 		msleep(&mp->mnt_flag, MNT_MTX(mp), PUSER - 1, "wsuspfs", 0);
 
 	/*
 	 * Unmount holds a write reference on the mount point.  If we
 	 * own busy reference and drain for writers, we deadlock with
 	 * the reference draining in the unmount path.  Callers of
 	 * vfs_write_suspend() must specify VS_SKIP_UNMOUNT if
 	 * vfs_busy() reference is owned and caller is not in the
 	 * unmount context.
 	 */
 	if ((flags & VS_SKIP_UNMOUNT) != 0 &&
 	    (mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) {
 		MNT_IUNLOCK(mp);
 		return (EBUSY);
 	}
 
 	mp->mnt_kern_flag |= MNTK_SUSPEND;
 	mp->mnt_susp_owner = curthread;
 	/*
 	 * Sleep once on mnt_writeopcount if writes are in progress;
 	 * PDROP makes msleep() return with the interlock released, so
 	 * both branches leave it unlocked.
 	 */
 	if (mp->mnt_writeopcount > 0)
 		(void) msleep(&mp->mnt_writeopcount, 
 		    MNT_MTX(mp), (PUSER - 1)|PDROP, "suspwt", 0);
 	else
 		MNT_IUNLOCK(mp);
 	if ((error = VFS_SYNC(mp, MNT_SUSPEND)) != 0)
 		vfs_write_resume(mp, 0);
 	return (error);
 }
 
 /*
  * Request a filesystem to resume write operations.
  *
  * mp must be suspendable (asserted with MPASS).  When a suspension is
  * in effect it must be owned by the calling thread (asserted); the
  * suspension flags and owner are cleared, sleepers are woken up and
  * TDP_IGNSUSP is cleared on the current thread.  With VR_START_WRITE
  * the caller additionally ends up holding a started write on mp.
  * VFS_SUSP_CLEAN() is called unless VR_NO_SUSPCLR is set.
  */
 void
 vfs_write_resume(struct mount *mp, int flags)
 {
 
 	MPASS(vn_suspendable_mp(mp));
 
 	MNT_ILOCK(mp);
 	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
 		KASSERT(mp->mnt_susp_owner == curthread, ("mnt_susp_owner"));
 		mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPEND2 |
 				       MNTK_SUSPENDED);
 		mp->mnt_susp_owner = NULL;
 		wakeup(&mp->mnt_writeopcount);
 		wakeup(&mp->mnt_flag);
 		curthread->td_pflags &= ~TDP_IGNSUSP;
 		/* Start the write by hand while the interlock is held. */
 		if ((flags & VR_START_WRITE) != 0) {
 			MNT_REF(mp);
 			mp->mnt_writeopcount++;
 		}
 		MNT_IUNLOCK(mp);
 		if ((flags & VR_NO_SUSPCLR) == 0)
 			VFS_SUSP_CLEAN(mp);
 	} else if ((flags & VR_START_WRITE) != 0) {
 		/*
 		 * Not suspended: vn_start_write_locked() does the
 		 * accounting and releases the interlock.  Its return
 		 * value is not checked here.
 		 */
 		MNT_REF(mp);
 		vn_start_write_locked(mp, 0);
 	} else {
 		MNT_IUNLOCK(mp);
 	}
 }
 
 /*
  * Helper loop around vfs_write_suspend() for filesystem unmount VFS
  * methods.
  *
  * mp must be suspendable and the calling thread must not already have
  * TDP_IGNSUSP set (both asserted).  The caller's write reference is
  * dropped and vfs_write_suspend() is attempted repeatedly until the
  * mount is observed with MNTK_SUSPENDED set.  On success the
  * MNTK_SUSPENDED and MNTK_SUSPEND2 flags are cleared, TDP_IGNSUSP is
  * set on the current thread and 0 is returned.  On failure the write
  * reference is re-taken and the vfs_write_suspend() error is returned.
  */
 int
 vfs_write_suspend_umnt(struct mount *mp)
 {
 	int error;
 
 	MPASS(vn_suspendable_mp(mp));
 	KASSERT((curthread->td_pflags & TDP_IGNSUSP) == 0,
 	    ("vfs_write_suspend_umnt: recursed"));
 
 	/* dounmount() already called vn_start_write(). */
 	for (;;) {
 		vn_finished_write(mp);
 		error = vfs_write_suspend(mp, 0);
 		if (error != 0) {
 			/* Re-take the write reference dropped above. */
 			vn_start_write(NULL, &mp, V_WAIT);
 			return (error);
 		}
 		MNT_ILOCK(mp);
 		/* Leave the loop with the mount interlock still held. */
 		if ((mp->mnt_kern_flag & MNTK_SUSPENDED) != 0)
 			break;
 		MNT_IUNLOCK(mp);
 		vn_start_write(NULL, &mp, V_WAIT);
 	}
 	mp->mnt_kern_flag &= ~(MNTK_SUSPENDED | MNTK_SUSPEND2);
 	wakeup(&mp->mnt_flag);
 	MNT_IUNLOCK(mp);
 	curthread->td_pflags |= TDP_IGNSUSP;
 	return (0);
 }
 
 /*
  * Implement kqueues for files by translating it to vnode operation:
  * the knote is handed to VOP_KQFILTER() on the vnode behind fp and its
  * result is returned.
  */
 static int
 vn_kqfilter(struct file *fp, struct knote *kn)
 {
 	struct vnode *vp;
 
 	vp = fp->f_vnode;
 	return (VOP_KQFILTER(vp, kn));
 }
 
 /*
  * Simplified in-kernel wrapper calls for extended attribute access.
  * Both calls pass in a NULL credential, authorizing as "kernel" access.
  * Set IO_NODELOCKED in ioflg if the vnode is already locked.
  */
 int
 vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
     const char *attrname, int *buflen, char *buf, struct thread *td)
 {
 	struct uio	auio;
 	struct iovec	iov;
 	int	error;
 
 	iov.iov_len = *buflen;
 	iov.iov_base = buf;
 
 	auio.uio_iov = &iov;
 	auio.uio_iovcnt = 1;
 	auio.uio_rw = UIO_READ;
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_td = td;
 	auio.uio_offset = 0;
 	auio.uio_resid = *buflen;
 
 	if ((ioflg & IO_NODELOCKED) == 0)
 		vn_lock(vp, LK_SHARED | LK_RETRY);
 
 	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
 
 	/* authorize attribute retrieval as kernel */
 	error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL,
 	    td);
 
 	if ((ioflg & IO_NODELOCKED) == 0)
 		VOP_UNLOCK(vp, 0);
 
 	if (error == 0) {
 		*buflen = *buflen - auio.uio_resid;
 	}
 
 	return (error);
 }
 
 /*
  * XXX failure mode if partially written?
  *
  * In-kernel wrapper that writes the buflen bytes at buf as the value of
  * the named extended attribute, passing a NULL credential to
  * VOP_SETEXTATTR().  Without IO_NODELOCKED in ioflg the call is
  * bracketed by vn_start_write()/vn_finished_write() and the vnode is
  * locked exclusively here; with it, the vnode must already be locked.
  * Returns the vn_start_write() or VOP_SETEXTATTR() error.
  */
 int
 vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
     const char *attrname, int buflen, char *buf, struct thread *td)
 {
 	struct uio	auio;
 	struct iovec	iov;
 	struct mount	*mp;
 	int	error;
 
 	/* Describe buf as a single kernel-space segment to write from. */
 	iov.iov_len = buflen;
 	iov.iov_base = buf;
 
 	auio.uio_iov = &iov;
 	auio.uio_iovcnt = 1;
 	auio.uio_rw = UIO_WRITE;
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_td = td;
 	auio.uio_offset = 0;
 	auio.uio_resid = buflen;
 
 	/* mp is assigned, and later used, only on this unlocked path. */
 	if ((ioflg & IO_NODELOCKED) == 0) {
 		if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
 			return (error);
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	}
 
 	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
 
 	/* authorize attribute setting as kernel */
 	error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td);
 
 	if ((ioflg & IO_NODELOCKED) == 0) {
 		vn_finished_write(mp);
 		VOP_UNLOCK(vp, 0);
 	}
 
 	return (error);
 }
 
 int
 vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
     const char *attrname, struct thread *td)
 {
 	struct mount	*mp;
 	int	error;
 
 	if ((ioflg & IO_NODELOCKED) == 0) {
 		if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
 			return (error);
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	}
 
 	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
 
 	/* authorize attribute removal as kernel */
 	error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NULL, td);
 	if (error == EOPNOTSUPP)
 		error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
 		    NULL, td);
 
 	if ((ioflg & IO_NODELOCKED) == 0) {
 		vn_finished_write(mp);
 		VOP_UNLOCK(vp, 0);
 	}
 
 	return (error);
 }
 
 static int
 vn_get_ino_alloc_vget(struct mount *mp, void *arg, int lkflags,
     struct vnode **rvp)
 {
 
 	return (VFS_VGET(mp, *(ino_t *)arg, lkflags, rvp));
 }
 
 int
 vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags, struct vnode **rvp)
 {
 
 	return (vn_vget_ino_gen(vp, vn_get_ino_alloc_vget, &ino,
 	    lkflags, rvp));
 }
 
 int
 vn_vget_ino_gen(struct vnode *vp, vn_get_ino_t alloc, void *alloc_arg,
     int lkflags, struct vnode **rvp)
 {
 	struct mount *mp;
 	int ltype, error;
 
 	ASSERT_VOP_LOCKED(vp, "vn_vget_ino_get");
 	mp = vp->v_mount;
 	ltype = VOP_ISLOCKED(vp);
 	KASSERT(ltype == LK_EXCLUSIVE || ltype == LK_SHARED,
 	    ("vn_vget_ino: vp not locked"));
 	error = vfs_busy(mp, MBF_NOWAIT);
 	if (error != 0) {
 		vfs_ref(mp);
 		VOP_UNLOCK(vp, 0);
 		error = vfs_busy(mp, 0);
 		vn_lock(vp, ltype | LK_RETRY);
 		vfs_rel(mp);
 		if (error != 0)
 			return (ENOENT);
 		if (vp->v_iflag & VI_DOOMED) {
 			vfs_unbusy(mp);
 			return (ENOENT);
 		}
 	}
 	VOP_UNLOCK(vp, 0);
 	error = alloc(mp, alloc_arg, lkflags, rvp);
 	vfs_unbusy(mp);
 	if (*rvp != vp)
 		vn_lock(vp, ltype | LK_RETRY);
 	if (vp->v_iflag & VI_DOOMED) {
 		if (error == 0) {
 			if (*rvp == vp)
 				vunref(vp);
 			else
 				vput(*rvp);
 		}
 		error = ENOENT;
 	}
 	return (error);
 }
 
 int
 vn_rlimit_fsize(const struct vnode *vp, const struct uio *uio,
     const struct thread *td)
 {
 
 	if (vp->v_type != VREG || td == NULL)
 		return (0);
 	PROC_LOCK(td->td_proc);
 	if ((uoff_t)uio->uio_offset + uio->uio_resid >
 	    lim_cur(td->td_proc, RLIMIT_FSIZE)) {
 		kern_psignal(td->td_proc, SIGXFSZ);
 		PROC_UNLOCK(td->td_proc);
 		return (EFBIG);
 	}
 	PROC_UNLOCK(td->td_proc);
 	return (0);
 }
 
 int
 vn_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
     struct thread *td)
 {
 	struct vnode *vp;
 
 	vp = fp->f_vnode;
 #ifdef AUDIT
 	vn_lock(vp, LK_SHARED | LK_RETRY);
 	AUDIT_ARG_VNODE1(vp);
 	VOP_UNLOCK(vp, 0);
 #endif
 	return (setfmode(td, active_cred, vp, mode));
 }
 
 int
 vn_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
     struct thread *td)
 {
 	struct vnode *vp;
 
 	vp = fp->f_vnode;
 #ifdef AUDIT
 	vn_lock(vp, LK_SHARED | LK_RETRY);
 	AUDIT_ARG_VNODE1(vp);
 	VOP_UNLOCK(vp, 0);
 #endif
 	return (setfown(td, active_cred, vp, uid, gid));
 }
 
 void
 vn_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end)
 {
 	vm_object_t object;
 
 	if ((object = vp->v_object) == NULL)
 		return;
 	VM_OBJECT_WLOCK(object);
 	vm_object_page_remove(object, start, end, 0);
 	VM_OBJECT_WUNLOCK(object);
 }
 
 int
 vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off, struct ucred *cred)
 {
 	struct vattr va;
 	daddr_t bn, bnp;
 	uint64_t bsize;
 	off_t noff;
 	int error;
 
 	KASSERT(cmd == FIOSEEKHOLE || cmd == FIOSEEKDATA,
 	    ("Wrong command %lu", cmd));
 
 	if (vn_lock(vp, LK_SHARED) != 0)
 		return (EBADF);
 	if (vp->v_type != VREG) {
 		error = ENOTTY;
 		goto unlock;
 	}
 	error = VOP_GETATTR(vp, &va, cred);
 	if (error != 0)
 		goto unlock;
 	noff = *off;
 	if (noff >= va.va_size) {
 		error = ENXIO;
 		goto unlock;
 	}
 	bsize = vp->v_mount->mnt_stat.f_iosize;
 	for (bn = noff / bsize; noff < va.va_size; bn++, noff += bsize) {
 		error = VOP_BMAP(vp, bn, NULL, &bnp, NULL, NULL);
 		if (error == EOPNOTSUPP) {
 			error = ENOTTY;
 			goto unlock;
 		}
 		if ((bnp == -1 && cmd == FIOSEEKHOLE) ||
 		    (bnp != -1 && cmd == FIOSEEKDATA)) {
 			noff = bn * bsize;
 			if (noff < *off)
 				noff = *off;
 			goto unlock;
 		}
 	}
 	if (noff > va.va_size)
 		noff = va.va_size;
 	/* noff == va.va_size. There is an implicit hole at the end of file. */
 	if (cmd == FIOSEEKDATA)
 		error = ENXIO;
 unlock:
 	VOP_UNLOCK(vp, 0);
 	if (error == 0)
 		*off = noff;
 	return (error);
 }
 
 int
 vn_seek(struct file *fp, off_t offset, int whence, struct thread *td)
 {
 	struct ucred *cred;
 	struct vnode *vp;
 	struct vattr vattr;
 	off_t foffset, size;
 	int error, noneg;
 
 	cred = td->td_ucred;
 	vp = fp->f_vnode;
 	foffset = foffset_lock(fp, 0);
 	noneg = (vp->v_type != VCHR);
 	error = 0;
 	switch (whence) {
 	case L_INCR:
 		if (noneg &&
 		    (foffset < 0 ||
 		    (offset > 0 && foffset > OFF_MAX - offset))) {
 			error = EOVERFLOW;
 			break;
 		}
 		offset += foffset;
 		break;
 	case L_XTND:
 		vn_lock(vp, LK_SHARED | LK_RETRY);
 		error = VOP_GETATTR(vp, &vattr, cred);
 		VOP_UNLOCK(vp, 0);
 		if (error)
 			break;
 
 		/*
 		 * If the file references a disk device, then fetch
 		 * the media size and use that to determine the ending
 		 * offset.
 		 */
 		if (vattr.va_size == 0 && vp->v_type == VCHR &&
 		    fo_ioctl(fp, DIOCGMEDIASIZE, &size, cred, td) == 0)
 			vattr.va_size = size;
 		if (noneg &&
 		    (vattr.va_size > OFF_MAX ||
 		    (offset > 0 && vattr.va_size > OFF_MAX - offset))) {
 			error = EOVERFLOW;
 			break;
 		}
 		offset += vattr.va_size;
 		break;
 	case L_SET:
 		break;
 	case SEEK_DATA:
 		error = fo_ioctl(fp, FIOSEEKDATA, &offset, cred, td);
 		break;
 	case SEEK_HOLE:
 		error = fo_ioctl(fp, FIOSEEKHOLE, &offset, cred, td);
 		break;
 	default:
 		error = EINVAL;
 	}
 	if (error == 0 && noneg && offset < 0)
 		error = EINVAL;
 	if (error != 0)
 		goto drop;
 	VFS_KNOTE_UNLOCKED(vp, 0);
 	td->td_uretoff.tdu_off = offset;
 drop:
 	foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0);
 	return (error);
 }
 
 int
 vn_utimes_perm(struct vnode *vp, struct vattr *vap, struct ucred *cred,
     struct thread *td)
 {
 	int error;
 
 	/*
 	 * Grant permission if the caller is the owner of the file, or
 	 * the super-user, or has ACL_WRITE_ATTRIBUTES permission on
 	 * on the file.  If the time pointer is null, then write
 	 * permission on the file is also sufficient.
 	 *
 	 * From NFSv4.1, draft 21, 6.2.1.3.1, Discussion of Mask Attributes:
 	 * A user having ACL_WRITE_DATA or ACL_WRITE_ATTRIBUTES
 	 * will be allowed to set the times [..] to the current
 	 * server time.
 	 */
 	error = VOP_ACCESSX(vp, VWRITE_ATTRIBUTES, cred, td);
 	if (error != 0 && (vap->va_vaflags & VA_UTIMES_NULL) != 0)
 		error = VOP_ACCESS(vp, VWRITE, cred, td);
 	return (error);
 }
 
 int
 vn_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
 {
 	struct vnode *vp;
 	int error;
 
 	if (fp->f_type == DTYPE_FIFO)
 		kif->kf_type = KF_TYPE_FIFO;
 	else
 		kif->kf_type = KF_TYPE_VNODE;
 	vp = fp->f_vnode;
 	vref(vp);
 	FILEDESC_SUNLOCK(fdp);
 	error = vn_fill_kinfo_vnode(vp, kif);
 	vrele(vp);
 	FILEDESC_SLOCK(fdp);
 	return (error);
 }
 
 int
 vn_fill_kinfo_vnode(struct vnode *vp, struct kinfo_file *kif)
 {
 	struct vattr va;
 	char *fullpath, *freepath;
 	int error;
 
 	kif->kf_vnode_type = vntype_to_kinfo(vp->v_type);
 	freepath = NULL;
 	fullpath = "-";
 	error = vn_fullpath(curthread, vp, &fullpath, &freepath);
 	if (error == 0) {
 		strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path));
 	}
 	if (freepath != NULL)
 		free(freepath, M_TEMP);
 
 	/*
 	 * Retrieve vnode attributes.
 	 */
 	va.va_fsid = VNOVAL;
 	va.va_rdev = NODEV;
 	vn_lock(vp, LK_SHARED | LK_RETRY);
 	error = VOP_GETATTR(vp, &va, curthread->td_ucred);
 	VOP_UNLOCK(vp, 0);
 	if (error != 0)
 		return (error);
 	if (va.va_fsid != VNOVAL)
 		kif->kf_un.kf_file.kf_file_fsid = va.va_fsid;
 	else
 		kif->kf_un.kf_file.kf_file_fsid =
 		    vp->v_mount->mnt_stat.f_fsid.val[0];
 	kif->kf_un.kf_file.kf_file_fileid = va.va_fileid;
 	kif->kf_un.kf_file.kf_file_mode = MAKEIMODE(va.va_type, va.va_mode);
 	kif->kf_un.kf_file.kf_file_size = va.va_size;
 	kif->kf_un.kf_file.kf_file_rdev = va.va_rdev;
 	return (0);
+}
+
+int
+vn_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size,
+    vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff,
+    struct thread *td)
+{
+#ifdef HWPMC_HOOKS
+	struct pmckern_map_in pkm;
+#endif
+	struct mount *mp;
+	struct vnode *vp;
+	vm_object_t object;
+	vm_prot_t maxprot;
+	boolean_t writecounted;
+	int error;
+
+#if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \
+    defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4)
+	/*
+	 * POSIX shared-memory objects are defined to have
+	 * kernel persistence, and are not defined to support
+	 * read(2)/write(2) -- or even open(2).  Thus, we can
+	 * use MAP_NOSYNC to trade on-disk coherence for speed.
+	 * The shm_open(3) library routine turns on the FPOSIXSHM
+	 * flag to request this behavior.
+	 */
+	if ((fp->f_flag & FPOSIXSHM) != 0)
+		flags |= MAP_NOSYNC;
+#endif
+	vp = fp->f_vnode;
+
+	/*
+	 * Ensure that file and memory protections are
+	 * compatible.  Note that we only worry about
+	 * writability if mapping is shared; in this case,
+	 * current and max prot are dictated by the open file.
+	 * XXX use the vnode instead?  Problem is: what
+	 * credentials do we use for determination? What if
+	 * proc does a setuid?
+	 */
+	mp = vp->v_mount;
+	if (mp != NULL && (mp->mnt_flag & MNT_NOEXEC) != 0)
+		maxprot = VM_PROT_NONE;
+	else
+		maxprot = VM_PROT_EXECUTE;
+	if ((fp->f_flag & FREAD) != 0)
+		maxprot |= VM_PROT_READ;
+	else if ((prot & VM_PROT_READ) != 0)
+		return (EACCES);
+
+	/*
+	 * If we are sharing potential changes via MAP_SHARED and we
+	 * are trying to get write permission although we opened it
+	 * without asking for it, bail out.
+	 */
+	if ((flags & MAP_SHARED) != 0) {
+		if ((fp->f_flag & FWRITE) != 0)
+			maxprot |= VM_PROT_WRITE;
+		else if ((prot & VM_PROT_WRITE) != 0)
+			return (EACCES);
+	} else {
+		maxprot |= VM_PROT_WRITE;
+		cap_maxprot |= VM_PROT_WRITE;
+	}
+	maxprot &= cap_maxprot;
+
+	writecounted = FALSE;
+	error = vm_mmap_vnode(td, size, prot, &maxprot, &flags, vp,
+	    &foff, &object, &writecounted);
+	if (error != 0)
+		return (error);
+	error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object,
+	    foff, writecounted, td);
+	if (error != 0) {
+		/*
+		 * If this mapping was accounted for in the vnode's
+		 * writecount, then undo that now.
+		 */
+		if (writecounted)
+			vnode_pager_release_writecount(object, 0, size);
+		vm_object_deallocate(object);
+	}
+#ifdef HWPMC_HOOKS
+	/* Tell hwpmc(4) where an executable mapping was placed (*addr). */
+	if (error == 0 && (prot & VM_PROT_EXECUTE) != 0) {
+		pkm.pm_file = vp;
+		pkm.pm_address = (uintptr_t) *addr;
+		PMC_CALL_HOOK(td, PMC_FN_MMAP, (void *) &pkm);
+	}
+#endif
+	return (error);
 }
Index: head/sys/sys/file.h
===================================================================
--- head/sys/sys/file.h	(revision 283997)
+++ head/sys/sys/file.h	(revision 283998)
@@ -1,396 +1,413 @@
 /*-
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)file.h	8.3 (Berkeley) 1/9/95
  * $FreeBSD$
  */
 
 #ifndef _SYS_FILE_H_
 #define	_SYS_FILE_H_
 
 #ifndef _KERNEL
 #include <sys/types.h> /* XXX */
 #include <sys/fcntl.h>
 #include <sys/unistd.h>
 #else
 #include <sys/queue.h>
 #include <sys/refcount.h>
 #include <sys/_lock.h>
 #include <sys/_mutex.h>
+#include <vm/vm.h>
 
 struct filedesc;
 struct stat;
 struct thread;
 struct uio;
 struct knote;
 struct vnode;
 struct socket;
 
 
 #endif /* _KERNEL */
 
 #define	DTYPE_VNODE	1	/* file */
 #define	DTYPE_SOCKET	2	/* communications endpoint */
 #define	DTYPE_PIPE	3	/* pipe */
 #define	DTYPE_FIFO	4	/* fifo (named pipe) */
 #define	DTYPE_KQUEUE	5	/* event queue */
 #define	DTYPE_CRYPTO	6	/* crypto */
 #define	DTYPE_MQUEUE	7	/* posix message queue */
 #define	DTYPE_SHM	8	/* swap-backed shared memory */
 #define	DTYPE_SEM	9	/* posix semaphore */
 #define	DTYPE_PTS	10	/* pseudo teletype master device */
 #define	DTYPE_DEV	11	/* Device specific fd type */
 #define	DTYPE_PROCDESC	12	/* process descriptor */
 #define	DTYPE_LINUXEFD	13	/* emulation eventfd type */
 
 #ifdef _KERNEL
 
 struct file;
 struct filecaps;
 struct kinfo_file;
 struct ucred;
 
 #define	FOF_OFFSET	0x01	/* Use the offset in uio argument */
 #define	FOF_NOLOCK	0x02	/* Do not take FOFFSET_LOCK */
 #define	FOF_NEXTOFF	0x04	/* Also update f_nextoff */
 #define	FOF_NOUPDATE	0x10	/* Do not update f_offset */
 off_t foffset_lock(struct file *fp, int flags);
 void foffset_lock_uio(struct file *fp, struct uio *uio, int flags);
 void foffset_unlock(struct file *fp, off_t val, int flags);
 void foffset_unlock_uio(struct file *fp, struct uio *uio, int flags);
 
 static inline off_t
 foffset_get(struct file *fp)
 {
 
 	return (foffset_lock(fp, FOF_NOLOCK));
 }
 
 typedef int fo_rdwr_t(struct file *fp, struct uio *uio,
 		    struct ucred *active_cred, int flags,
 		    struct thread *td);
 typedef	int fo_truncate_t(struct file *fp, off_t length,
 		    struct ucred *active_cred, struct thread *td);
 typedef	int fo_ioctl_t(struct file *fp, u_long com, void *data,
 		    struct ucred *active_cred, struct thread *td);
 typedef	int fo_poll_t(struct file *fp, int events,
 		    struct ucred *active_cred, struct thread *td);
 typedef	int fo_kqfilter_t(struct file *fp, struct knote *kn);
 typedef	int fo_stat_t(struct file *fp, struct stat *sb,
 		    struct ucred *active_cred, struct thread *td);
 typedef	int fo_close_t(struct file *fp, struct thread *td);
 typedef	int fo_chmod_t(struct file *fp, mode_t mode,
 		    struct ucred *active_cred, struct thread *td);
 typedef	int fo_chown_t(struct file *fp, uid_t uid, gid_t gid,
 		    struct ucred *active_cred, struct thread *td);
 typedef int fo_sendfile_t(struct file *fp, int sockfd, struct uio *hdr_uio,
 		    struct uio *trl_uio, off_t offset, size_t nbytes,
 		    off_t *sent, int flags, int kflags, struct thread *td);
 typedef int fo_seek_t(struct file *fp, off_t offset, int whence,
 		    struct thread *td);
 typedef int fo_fill_kinfo_t(struct file *fp, struct kinfo_file *kif,
 		    struct filedesc *fdp);
+typedef int fo_mmap_t(struct file *fp, vm_map_t map, vm_offset_t *addr,
+		    vm_size_t size, vm_prot_t prot, vm_prot_t cap_maxprot,
+		    int flags, vm_ooffset_t foff, struct thread *td);
 typedef	int fo_flags_t;
 
 struct fileops {
 	fo_rdwr_t	*fo_read;
 	fo_rdwr_t	*fo_write;
 	fo_truncate_t	*fo_truncate;
 	fo_ioctl_t	*fo_ioctl;
 	fo_poll_t	*fo_poll;
 	fo_kqfilter_t	*fo_kqfilter;
 	fo_stat_t	*fo_stat;
 	fo_close_t	*fo_close;
 	fo_chmod_t	*fo_chmod;
 	fo_chown_t	*fo_chown;
 	fo_sendfile_t	*fo_sendfile;
 	fo_seek_t	*fo_seek;
 	fo_fill_kinfo_t	*fo_fill_kinfo;
+	fo_mmap_t	*fo_mmap;
 	fo_flags_t	fo_flags;	/* DFLAG_* below */
 };
 
 #define DFLAG_PASSABLE	0x01	/* may be passed via unix sockets. */
 #define DFLAG_SEEKABLE	0x02	/* seekable / nonsequential */
 #endif /* _KERNEL */
 
 #if defined(_KERNEL) || defined(_WANT_FILE)
 /*
  * Kernel descriptor table.
  * One entry for each open kernel vnode and socket.
  *
  * Below is the list of locks that protects members in struct file.
  *
  * (a) f_vnode lock required (shared allows both reads and writes)
  * (f) protected with mtx_lock(mtx_pool_find(fp))
  * (d) cdevpriv_mtx
  * none	not locked
  */
 
 struct fadvise_info {
 	int		fa_advice;	/* (f) FADV_* type. */
 	off_t		fa_start;	/* (f) Region start. */
 	off_t		fa_end;		/* (f) Region end. */
 	off_t		fa_prevstart;	/* (f) Previous NOREUSE start. */
 	off_t		fa_prevend;	/* (f) Previous NOREUSE end. */
 };
 
 struct file {
 	void		*f_data;	/* file descriptor specific data */
 	struct fileops	*f_ops;		/* File operations */
 	struct ucred	*f_cred;	/* associated credentials. */
 	struct vnode 	*f_vnode;	/* NULL or applicable vnode */
 	short		f_type;		/* descriptor type */
 	short		f_vnread_flags; /* (f) Sleep lock for f_offset */
 	volatile u_int	f_flag;		/* see fcntl.h */
 	volatile u_int 	f_count;	/* reference count */
 	/*
 	 *  DTYPE_VNODE specific fields.
 	 */
 	int		f_seqcount;	/* (a) Count of sequential accesses. */
 	off_t		f_nextoff;	/* next expected read/write offset. */
 	union {
 		struct cdev_privdata *fvn_cdevpriv;
 					/* (d) Private data for the cdev. */
 		struct fadvise_info *fvn_advice;
 	} f_vnun;
 	/*
 	 *  DFLAG_SEEKABLE specific fields
 	 */
 	off_t		f_offset;
 	/*
 	 * Mandatory Access control information.
 	 */
 	void		*f_label;	/* Place-holder for MAC label. */
 };
 
 #define	f_cdevpriv	f_vnun.fvn_cdevpriv
 #define	f_advice	f_vnun.fvn_advice
 
 #define	FOFFSET_LOCKED       0x1
 #define	FOFFSET_LOCK_WAITING 0x2
 #define	FDEVFS_VNODE	     0x4
 
 #endif /* _KERNEL || _WANT_FILE */
 
 /*
  * Userland version of struct file, for sysctl
  */
 struct xfile {
 	size_t	xf_size;	/* size of struct xfile */
 	pid_t	xf_pid;		/* owning process */
 	uid_t	xf_uid;		/* effective uid of owning process */
 	int	xf_fd;		/* descriptor number */
 	void	*xf_file;	/* address of struct file */
 	short	xf_type;	/* descriptor type */
 	int	xf_count;	/* reference count */
 	int	xf_msgcount;	/* references from message queue */
 	off_t	xf_offset;	/* file offset */
 	void	*xf_data;	/* file descriptor specific data */
 	void	*xf_vnode;	/* vnode pointer */
 	u_int	xf_flag;	/* flags (see fcntl.h) */
 };
 
 #ifdef _KERNEL
 
 extern struct fileops vnops;
 extern struct fileops badfileops;
 extern struct fileops socketops;
 extern int maxfiles;		/* kernel limit on number of open files */
 extern int maxfilesperproc;	/* per process limit on number of open files */
 extern volatile int openfiles;	/* actual number of open files */
 
 int fget(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp);
 int fget_mmap(struct thread *td, int fd, cap_rights_t *rightsp,
     u_char *maxprotp, struct file **fpp);
 int fget_read(struct thread *td, int fd, cap_rights_t *rightsp,
     struct file **fpp);
 int fget_write(struct thread *td, int fd, cap_rights_t *rightsp,
     struct file **fpp);
 int fget_fcntl(struct thread *td, int fd, cap_rights_t *rightsp,
     int needfcntl, struct file **fpp);
 int _fdrop(struct file *fp, struct thread *td);
 
 fo_rdwr_t	invfo_rdwr;
 fo_truncate_t	invfo_truncate;
 fo_ioctl_t	invfo_ioctl;
 fo_poll_t	invfo_poll;
 fo_kqfilter_t	invfo_kqfilter;
 fo_chmod_t	invfo_chmod;
 fo_chown_t	invfo_chown;
 fo_sendfile_t	invfo_sendfile;
 
 fo_sendfile_t	vn_sendfile;
 fo_seek_t	vn_seek;
 fo_fill_kinfo_t	vn_fill_kinfo;
 int vn_fill_kinfo_vnode(struct vnode *vp, struct kinfo_file *kif);
 
 void finit(struct file *, u_int, short, void *, struct fileops *);
 int fgetvp(struct thread *td, int fd, cap_rights_t *rightsp,
     struct vnode **vpp);
 int fgetvp_exec(struct thread *td, int fd, cap_rights_t *rightsp,
     struct vnode **vpp);
 int fgetvp_rights(struct thread *td, int fd, cap_rights_t *needrightsp,
     struct filecaps *havecaps, struct vnode **vpp);
 int fgetvp_read(struct thread *td, int fd, cap_rights_t *rightsp,
     struct vnode **vpp);
 int fgetvp_write(struct thread *td, int fd, cap_rights_t *rightsp,
     struct vnode **vpp);
 
 int fgetsock(struct thread *td, int fd, cap_rights_t *rightsp,
     struct socket **spp, u_int *fflagp);
 void fputsock(struct socket *sp);
 
 static __inline int
 _fnoop(void)
 {
 
 	return (0);
 }
 
 #define	fhold(fp)							\
 	(refcount_acquire(&(fp)->f_count))
 #define	fdrop(fp, td)							\
 	(refcount_release(&(fp)->f_count) ? _fdrop((fp), (td)) : _fnoop())
 
 static __inline fo_rdwr_t	fo_read;
 static __inline fo_rdwr_t	fo_write;
 static __inline fo_truncate_t	fo_truncate;
 static __inline fo_ioctl_t	fo_ioctl;
 static __inline fo_poll_t	fo_poll;
 static __inline fo_kqfilter_t	fo_kqfilter;
 static __inline fo_stat_t	fo_stat;
 static __inline fo_close_t	fo_close;
 static __inline fo_chmod_t	fo_chmod;
 static __inline fo_chown_t	fo_chown;
 static __inline fo_sendfile_t	fo_sendfile;
 
 static __inline int
 fo_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
     int flags, struct thread *td)
 {
 
 	return ((*fp->f_ops->fo_read)(fp, uio, active_cred, flags, td));
 }
 
 static __inline int
 fo_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
     int flags, struct thread *td)
 {
 
 	return ((*fp->f_ops->fo_write)(fp, uio, active_cred, flags, td));
 }
 
 static __inline int
 fo_truncate(struct file *fp, off_t length, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return ((*fp->f_ops->fo_truncate)(fp, length, active_cred, td));
 }
 
 static __inline int
 fo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return ((*fp->f_ops->fo_ioctl)(fp, com, data, active_cred, td));
 }
 
 static __inline int
 fo_poll(struct file *fp, int events, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return ((*fp->f_ops->fo_poll)(fp, events, active_cred, td));
 }
 
 static __inline int
 fo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return ((*fp->f_ops->fo_stat)(fp, sb, active_cred, td));
 }
 
 static __inline int
 fo_close(struct file *fp, struct thread *td)
 {
 
 	return ((*fp->f_ops->fo_close)(fp, td));
 }
 
 static __inline int
 fo_kqfilter(struct file *fp, struct knote *kn)
 {
 
 	return ((*fp->f_ops->fo_kqfilter)(fp, kn));
 }
 
 static __inline int
 fo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return ((*fp->f_ops->fo_chmod)(fp, mode, active_cred, td));
 }
 
 static __inline int
 fo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return ((*fp->f_ops->fo_chown)(fp, uid, gid, active_cred, td));
 }
 
 static __inline int
 fo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
     struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
     int kflags, struct thread *td)
 {
 
 	return ((*fp->f_ops->fo_sendfile)(fp, sockfd, hdr_uio, trl_uio, offset,
 	    nbytes, sent, flags, kflags, td));
 }
 
 static __inline int
 fo_seek(struct file *fp, off_t offset, int whence, struct thread *td)
 {
 
 	return ((*fp->f_ops->fo_seek)(fp, offset, whence, td));
 }
 
 static __inline int
 fo_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
 {
 
 	return ((*fp->f_ops->fo_fill_kinfo)(fp, kif, fdp));
+}
+
+static __inline int
+fo_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size,
+    vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff,
+    struct thread *td)
+{
+
+	if (fp->f_ops->fo_mmap == NULL)
+		return (ENODEV);
+	return ((*fp->f_ops->fo_mmap)(fp, map, addr, size, prot, cap_maxprot,
+	    flags, foff, td));
 }
 
 #endif /* _KERNEL */
 
 #endif /* !SYS_FILE_H */
Index: head/sys/sys/mman.h
===================================================================
--- head/sys/sys/mman.h	(revision 283997)
+++ head/sys/sys/mman.h	(revision 283998)
@@ -1,273 +1,271 @@
 /*-
  * Copyright (c) 1982, 1986, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)mman.h	8.2 (Berkeley) 1/9/95
  * $FreeBSD$
  */
 
 #ifndef _SYS_MMAN_H_
 #define _SYS_MMAN_H_
 
 #include <sys/cdefs.h>
 #include <sys/_types.h>
 
 #if __BSD_VISIBLE
 /*
  * Inheritance for minherit()
  */
 #define INHERIT_SHARE	0
 #define INHERIT_COPY	1
 #define INHERIT_NONE	2
 #endif
 
 /*
  * Protections are chosen from these bits, or-ed together
  */
 #define	PROT_NONE	0x00	/* no permissions */
 #define	PROT_READ	0x01	/* pages can be read */
 #define	PROT_WRITE	0x02	/* pages can be written */
 #define	PROT_EXEC	0x04	/* pages can be executed */
 
 /*
  * Flags contain sharing type and options.
  * Sharing types; choose one.
  */
 #define	MAP_SHARED	0x0001		/* share changes */
 #define	MAP_PRIVATE	0x0002		/* changes are private */
 #if __BSD_VISIBLE
 #define	MAP_COPY	MAP_PRIVATE	/* Obsolete */
 #endif
 
 /*
  * Other flags
  */
 #define	MAP_FIXED	 0x0010	/* map addr must be exactly as requested */
 
 #if __BSD_VISIBLE
 #define	MAP_RESERVED0020 0x0020	/* previously unimplemented MAP_RENAME */
 #define	MAP_RESERVED0040 0x0040	/* previously unimplemented MAP_NORESERVE */
 #define	MAP_RESERVED0080 0x0080	/* previously misimplemented MAP_INHERIT */
 #define	MAP_RESERVED0100 0x0100	/* previously unimplemented MAP_NOEXTEND */
 #define	MAP_HASSEMAPHORE 0x0200	/* region may contain semaphores */
 #define	MAP_STACK	 0x0400	/* region grows down, like a stack */
 #define	MAP_NOSYNC	 0x0800 /* page to but do not sync underlying file */
 
 /*
  * Mapping type
  */
 #define	MAP_FILE	 0x0000	/* map from file (default) */
 #define	MAP_ANON	 0x1000	/* allocated from memory, swap space */
 #ifndef _KERNEL
 #define	MAP_ANONYMOUS	 MAP_ANON /* For compatibility. */
 #endif /* !_KERNEL */
 
 /*
  * Extended flags
  */
 #define	MAP_EXCL	 0x00004000 /* for MAP_FIXED, fail if address is used */
 #define	MAP_NOCORE	 0x00020000 /* dont include these pages in a coredump */
 #define	MAP_PREFAULT_READ 0x00040000 /* prefault mapping for reading */
 #ifdef __LP64__
 #define	MAP_32BIT	 0x00080000 /* map in the low 2GB of address space */
 #endif
 
 /*
  * Request specific alignment (n == log2 of the desired alignment).
  *
  * MAP_ALIGNED_SUPER requests optimal superpage alignment, but does
  * not enforce a specific alignment.
  */
 #define	MAP_ALIGNED(n)	 ((n) << MAP_ALIGNMENT_SHIFT)
 #define	MAP_ALIGNMENT_SHIFT	24
 #define	MAP_ALIGNMENT_MASK	MAP_ALIGNED(0xff)
 #define	MAP_ALIGNED_SUPER	MAP_ALIGNED(1) /* align on a superpage */
 #endif /* __BSD_VISIBLE */
 
 #if __POSIX_VISIBLE >= 199309
 /*
  * Process memory locking
  */
 #define MCL_CURRENT	0x0001	/* Lock only current memory */
 #define MCL_FUTURE	0x0002	/* Lock all future memory as well */
 #endif
 
 /*
  * Error return from mmap()
  */
 #define MAP_FAILED	((void *)-1)
 
 /*
  * msync() flags
  */
 #define	MS_SYNC		0x0000	/* msync synchronously */
 #define MS_ASYNC	0x0001	/* return immediately */
 #define MS_INVALIDATE	0x0002	/* invalidate all cached data */
 
 /*
  * Advice to madvise
  */
 #define	_MADV_NORMAL	0	/* no further special treatment */
 #define	_MADV_RANDOM	1	/* expect random page references */
 #define	_MADV_SEQUENTIAL 2	/* expect sequential page references */
 #define	_MADV_WILLNEED	3	/* will need these pages */
 #define	_MADV_DONTNEED	4	/* dont need these pages */
 
 #if __BSD_VISIBLE
 #define	MADV_NORMAL	_MADV_NORMAL
 #define	MADV_RANDOM	_MADV_RANDOM
 #define	MADV_SEQUENTIAL _MADV_SEQUENTIAL
 #define	MADV_WILLNEED	_MADV_WILLNEED
 #define	MADV_DONTNEED	_MADV_DONTNEED
 #define	MADV_FREE	5	/* dont need these pages, and junk contents */
 #define	MADV_NOSYNC	6	/* try to avoid flushes to physical media */
 #define	MADV_AUTOSYNC	7	/* revert to default flushing strategy */
 #define	MADV_NOCORE	8	/* do not include these pages in a core file */
 #define	MADV_CORE	9	/* revert to including pages in a core file */
 #define	MADV_PROTECT	10	/* protect process from pageout kill */
 
 /*
  * Return bits from mincore
  */
 #define	MINCORE_INCORE	 	 0x1 /* Page is incore */
 #define	MINCORE_REFERENCED	 0x2 /* Page has been referenced by us */
 #define	MINCORE_MODIFIED	 0x4 /* Page has been modified by us */
 #define	MINCORE_REFERENCED_OTHER 0x8 /* Page has been referenced */
 #define	MINCORE_MODIFIED_OTHER	0x10 /* Page has been modified */
 #define	MINCORE_SUPER		0x20 /* Page is a "super" page */
 
 /*
  * Anonymous object constant for shm_open().
  */
 #define	SHM_ANON		((char *)1)
 #endif /* __BSD_VISIBLE */
 
 /*
  * XXX missing POSIX_TYPED_MEM_* macros and
  * posix_typed_mem_info structure.
  */
 #if __POSIX_VISIBLE >= 200112
 #define	POSIX_MADV_NORMAL	_MADV_NORMAL
 #define	POSIX_MADV_RANDOM	_MADV_RANDOM
 #define	POSIX_MADV_SEQUENTIAL	_MADV_SEQUENTIAL
 #define	POSIX_MADV_WILLNEED	_MADV_WILLNEED
 #define	POSIX_MADV_DONTNEED	_MADV_DONTNEED
 #endif
 
 #ifndef _MODE_T_DECLARED
 typedef	__mode_t	mode_t;
 #define	_MODE_T_DECLARED
 #endif
 
 #ifndef _OFF_T_DECLARED
 typedef	__off_t		off_t;
 #define	_OFF_T_DECLARED
 #endif
 
 #ifndef _SIZE_T_DECLARED
 typedef	__size_t	size_t;
 #define	_SIZE_T_DECLARED
 #endif
 
 #if defined(_KERNEL) || defined(_WANT_FILE)
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/queue.h>
 #include <sys/rangelock.h>
 #include <vm/vm.h>
 
 struct file;
 
 struct shmfd {
 	size_t		shm_size;
 	vm_object_t	shm_object;
 	int		shm_refs;
 	uid_t		shm_uid;
 	gid_t		shm_gid;
 	mode_t		shm_mode;
 	int		shm_kmappings;
 
 	/*
 	 * Values maintained solely to make this a better-behaved file
 	 * descriptor for fstat() to run on.
 	 */
 	struct timespec	shm_atime;
 	struct timespec	shm_mtime;
 	struct timespec	shm_ctime;
 	struct timespec	shm_birthtime;
 	ino_t		shm_ino;
 
 	struct label	*shm_label;		/* MAC label */
 	const char	*shm_path;
 
 	struct rangelock shm_rl;
 	struct mtx	shm_mtx;
 };
 #endif
 
 #ifdef _KERNEL
-int	shm_mmap(struct shmfd *shmfd, vm_size_t objsize, vm_ooffset_t foff,
-	    vm_object_t *obj);
 int	shm_map(struct file *fp, size_t size, off_t offset, void **memp);
 int	shm_unmap(struct file *fp, void *mem, size_t size);
 
 #else /* !_KERNEL */
 
 __BEGIN_DECLS
 /*
  * XXX not yet implemented: posix_mem_offset(), posix_typed_mem_get_info(),
  * posix_typed_mem_open().
  */
 #if __BSD_VISIBLE
 int	getpagesizes(size_t *, int);
 int	madvise(void *, size_t, int);
 int	mincore(const void *, size_t, char *);
 int	minherit(void *, size_t, int);
 #endif
 int	mlock(const void *, size_t);
 #ifndef _MMAP_DECLARED
 #define	_MMAP_DECLARED
 void *	mmap(void *, size_t, int, int, int, off_t);
 #endif
 int	mprotect(const void *, size_t, int);
 int	msync(void *, size_t, int);
 int	munlock(const void *, size_t);
 int	munmap(void *, size_t);
 #if __POSIX_VISIBLE >= 200112
 int	posix_madvise(void *, size_t, int);
 #endif
 #if __POSIX_VISIBLE >= 199309
 int	mlockall(int);
 int	munlockall(void);
 int	shm_open(const char *, int, mode_t);
 int	shm_unlink(const char *);
 #endif
 __END_DECLS
 
 #endif /* !_KERNEL */
 
 #endif /* !_SYS_MMAN_H_ */
Index: head/sys/vm/vm_extern.h
===================================================================
--- head/sys/vm/vm_extern.h	(revision 283997)
+++ head/sys/vm/vm_extern.h	(revision 283998)
@@ -1,108 +1,118 @@
 /*-
  * Copyright (c) 1992, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)vm_extern.h	8.2 (Berkeley) 1/12/94
  * $FreeBSD$
  */
 
 #ifndef _VM_EXTERN_H_
 #define	_VM_EXTERN_H_
 
 struct pmap;
 struct proc;
 struct vmspace;
 struct vnode;
 struct vmem;
 
 #ifdef _KERNEL
+struct cdev;
+struct cdevsw;
 
 /* These operate on kernel virtual addresses only. */
 vm_offset_t kva_alloc(vm_size_t);
 void kva_free(vm_offset_t, vm_size_t);
 
 /* These operate on pageable virtual addresses. */
 vm_offset_t kmap_alloc_wait(vm_map_t, vm_size_t);
 void kmap_free_wakeup(vm_map_t, vm_offset_t, vm_size_t);
 
 /* These operate on virtual addresses backed by memory. */
 vm_offset_t kmem_alloc_attr(struct vmem *, vm_size_t size, int flags,
     vm_paddr_t low, vm_paddr_t high, vm_memattr_t memattr);
 vm_offset_t kmem_alloc_contig(struct vmem *, vm_size_t size, int flags,
     vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary,
     vm_memattr_t memattr);
 vm_offset_t kmem_malloc(struct vmem *, vm_size_t size, int flags);
 void kmem_free(struct vmem *, vm_offset_t, vm_size_t);
 
 /* This provides memory for previously allocated address space. */
 int kmem_back(vm_object_t, vm_offset_t, vm_size_t, int);
 void kmem_unback(vm_object_t, vm_offset_t, vm_size_t);
 
 /* Bootstrapping. */
 vm_map_t kmem_suballoc(vm_map_t, vm_offset_t *, vm_offset_t *, vm_size_t,
     boolean_t);
 void kmem_init(vm_offset_t, vm_offset_t);
 void kmem_init_zero_region(void);
 void kmeminit(void);
 
 void swapout_procs(int);
 int kernacc(void *, int, int);
 int useracc(void *, int, int);
 int vm_fault(vm_map_t, vm_offset_t, vm_prot_t, int);
 void vm_fault_copy_entry(vm_map_t, vm_map_t, vm_map_entry_t, vm_map_entry_t,
     vm_ooffset_t *);
 int vm_fault_disable_pagefaults(void);
 void vm_fault_enable_pagefaults(int save);
 int vm_fault_hold(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
     int fault_flags, vm_page_t *m_hold);
 int vm_fault_quick_hold_pages(vm_map_t map, vm_offset_t addr, vm_size_t len,
     vm_prot_t prot, vm_page_t *ma, int max_count);
-int vm_forkproc(struct thread *, struct proc *, struct thread *, struct vmspace *, int);
+int vm_forkproc(struct thread *, struct proc *, struct thread *,
+    struct vmspace *, int);
 void vm_waitproc(struct proc *);
-int vm_mmap(vm_map_t, vm_offset_t *, vm_size_t, vm_prot_t, vm_prot_t, int, objtype_t, void *, vm_ooffset_t);
+int vm_mmap(vm_map_t, vm_offset_t *, vm_size_t, vm_prot_t, vm_prot_t, int,
+    objtype_t, void *, vm_ooffset_t);
+int vm_mmap_object(vm_map_t, vm_offset_t *, vm_size_t, vm_prot_t,
+    vm_prot_t, int, vm_object_t, vm_ooffset_t, boolean_t, struct thread *);
 int vm_mmap_to_errno(int rv);
+int vm_mmap_cdev(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
+    int *, struct cdev *, struct cdevsw *, vm_ooffset_t *, vm_object_t *);
+int vm_mmap_vnode(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *, int *,
+    struct vnode *, vm_ooffset_t *, vm_object_t *, boolean_t *);
 void vm_set_page_size(void);
 void vm_sync_icache(vm_map_t, vm_offset_t, vm_size_t);
 typedef int (*pmap_pinit_t)(struct pmap *pmap);
 struct vmspace *vmspace_alloc(vm_offset_t, vm_offset_t, pmap_pinit_t);
 struct vmspace *vmspace_fork(struct vmspace *, vm_ooffset_t *);
 int vmspace_exec(struct proc *, vm_offset_t, vm_offset_t);
 int vmspace_unshare(struct proc *);
 void vmspace_exit(struct thread *);
 struct vmspace *vmspace_acquire_ref(struct proc *);
 void vmspace_free(struct vmspace *);
 void vmspace_exitfree(struct proc *);
 void vnode_pager_setsize(struct vnode *, vm_ooffset_t);
 int vslock(void *, size_t);
 void vsunlock(void *, size_t);
 struct sf_buf *vm_imgact_map_page(vm_object_t object, vm_ooffset_t offset);
 void vm_imgact_unmap_page(struct sf_buf *sf);
 void vm_thread_dispose(struct thread *td);
 int vm_thread_new(struct thread *td, int pages);
 int vm_mlock(struct proc *, struct ucred *, const void *, size_t);
 #endif				/* _KERNEL */
 #endif				/* !_VM_EXTERN_H_ */
Index: head/sys/vm/vm_mmap.c
===================================================================
--- head/sys/vm/vm_mmap.c	(revision 283997)
+++ head/sys/vm/vm_mmap.c	(revision 283998)
@@ -1,1732 +1,1611 @@
 /*-
  * Copyright (c) 1988 University of Utah.
  * Copyright (c) 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * the Systems Programming Group of the University of Utah Computer
  * Science Department.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
  *
  *	@(#)vm_mmap.c	8.4 (Berkeley) 1/12/94
  */
 
 /*
  * Mapped file (mmap) interface to VM
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 #include "opt_hwpmc_hooks.h"
 #include "opt_vm.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/capsicum.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/sysproto.h>
 #include <sys/filedesc.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/procctl.h>
 #include <sys/racct.h>
 #include <sys/resource.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
 #include <sys/sysctl.h>
 #include <sys/vnode.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/mman.h>
 #include <sys/mount.h>
 #include <sys/conf.h>
 #include <sys/stat.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysent.h>
 #include <sys/vmmeter.h>
 
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_page.h>
 #include <vm/vnode_pager.h>
 
 #ifdef HWPMC_HOOKS
 #include <sys/pmckern.h>
 #endif
 
 int old_mlock = 0;
 SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RWTUN, &old_mlock, 0,
     "Do not apply RLIMIT_MEMLOCK on mlockall");
 
 #ifdef MAP_32BIT
 #define	MAP_32BIT_MAX_ADDR	((vm_offset_t)1 << 31)
 #endif
 
-static int vm_mmap_vnode(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
-    int *, struct vnode *, vm_ooffset_t *, vm_object_t *, boolean_t *);
-static int vm_mmap_cdev(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
-    int *, struct cdev *, vm_ooffset_t *, vm_object_t *);
-static int vm_mmap_shm(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
-    int *, struct shmfd *, vm_ooffset_t, vm_object_t *);
-
 #ifndef _SYS_SYSPROTO_H_
 struct sbrk_args {
 	int incr;
 };
 #endif
 
 /*
  * MPSAFE
  */
 /* ARGSUSED */
 int
 sys_sbrk(td, uap)
 	struct thread *td;
 	struct sbrk_args *uap;
 {
 	/* Not yet implemented */
 	return (EOPNOTSUPP);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct sstk_args {
 	int incr;
 };
 #endif
 
 /*
  * MPSAFE
  */
 /* ARGSUSED */
 int
 sys_sstk(td, uap)
 	struct thread *td;
 	struct sstk_args *uap;
 {
 	/* Not yet implemented */
 	return (EOPNOTSUPP);
 }
 
 #if defined(COMPAT_43)
 #ifndef _SYS_SYSPROTO_H_
 struct getpagesize_args {
 	int dummy;
 };
 #endif
 
 int
 ogetpagesize(td, uap)
 	struct thread *td;
 	struct getpagesize_args *uap;
 {
 	/* MP SAFE */
 	td->td_retval[0] = PAGE_SIZE;
 	return (0);
 }
 #endif				/* COMPAT_43 */
 
 
 /*
  * Memory Map (mmap) system call.  Note that the file offset
  * and address are allowed to be NOT page aligned, though if
- * the MAP_FIXED flag it set, both must have the same remainder
+ * the MAP_FIXED flag is set, both must have the same remainder
  * modulo the PAGE_SIZE (POSIX 1003.1b).  If the address is not
  * page-aligned, the actual mapping starts at trunc_page(addr)
  * and the return value is adjusted up by the page offset.
  *
  * Generally speaking, only character devices which are themselves
  * memory-based, such as a video framebuffer, can be mmap'd.  Otherwise
  * there would be no cache coherency between a descriptor and a VM mapping
  * both to the same character device.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct mmap_args {
 	void *addr;
 	size_t len;
 	int prot;
 	int flags;
 	int fd;
 	long pad;
 	off_t pos;
 };
 #endif
 
 /*
  * MPSAFE
  */
 int
 sys_mmap(td, uap)
 	struct thread *td;
 	struct mmap_args *uap;
 {
-#ifdef HWPMC_HOOKS
-	struct pmckern_map_in pkm;
-#endif
 	struct file *fp;
-	struct vnode *vp;
 	vm_offset_t addr;
 	vm_size_t size, pageoff;
-	vm_prot_t cap_maxprot, maxprot;
-	void *handle;
-	objtype_t handle_type;
+	vm_prot_t cap_maxprot;
 	int align, error, flags, prot;
 	off_t pos;
 	struct vmspace *vms = td->td_proc->p_vmspace;
 	cap_rights_t rights;
 
 	addr = (vm_offset_t) uap->addr;
 	size = uap->len;
 	prot = uap->prot;
 	flags = uap->flags;
 	pos = uap->pos;
 
 	fp = NULL;
 
 	/*
 	 * Ignore old flags that used to be defined but did not do anything.
 	 */
 	flags &= ~(MAP_RESERVED0020 | MAP_RESERVED0040);
 	
 	/*
 	 * Enforce the constraints.
 	 * Mapping of length 0 is only allowed for old binaries.
 	 * Anonymous mapping shall specify -1 as filedescriptor and
 	 * zero position for new code. Be nice to ancient a.out
 	 * binaries and correct pos for anonymous mapping, since old
 	 * ld.so sometimes issues anonymous map requests with non-zero
 	 * pos.
 	 */
 	if (!SV_CURPROC_FLAG(SV_AOUT)) {
 		if ((uap->len == 0 && curproc->p_osrel >= P_OSREL_MAP_ANON) ||
 		    ((flags & MAP_ANON) != 0 && (uap->fd != -1 || pos != 0)))
 			return (EINVAL);
 	} else {
 		if ((flags & MAP_ANON) != 0)
 			pos = 0;
 	}
 
 	if (flags & MAP_STACK) {
 		if ((uap->fd != -1) ||
 		    ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
 			return (EINVAL);
 		flags |= MAP_ANON;
 		pos = 0;
 	}
 	if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | MAP_HASSEMAPHORE |
 	    MAP_STACK | MAP_NOSYNC | MAP_ANON | MAP_EXCL | MAP_NOCORE |
 	    MAP_PREFAULT_READ |
 #ifdef MAP_32BIT
 	    MAP_32BIT |
 #endif
 	    MAP_ALIGNMENT_MASK)) != 0)
 		return (EINVAL);
 	if ((flags & (MAP_EXCL | MAP_FIXED)) == MAP_EXCL)
 		return (EINVAL);
 	if ((flags & (MAP_SHARED | MAP_PRIVATE)) == (MAP_SHARED | MAP_PRIVATE))
 		return (EINVAL);
 	if (prot != PROT_NONE &&
 	    (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)) != 0)
 		return (EINVAL);
 
 	/*
 	 * Align the file position to a page boundary,
 	 * and save its page offset component.
 	 */
 	pageoff = (pos & PAGE_MASK);
 	pos -= pageoff;
 
 	/* Adjust size for rounding (on both ends). */
 	size += pageoff;			/* low end... */
 	size = (vm_size_t) round_page(size);	/* hi end */
 
 	/* Ensure alignment is at least a page and fits in a pointer. */
 	align = flags & MAP_ALIGNMENT_MASK;
 	if (align != 0 && align != MAP_ALIGNED_SUPER &&
 	    (align >> MAP_ALIGNMENT_SHIFT >= sizeof(void *) * NBBY ||
 	    align >> MAP_ALIGNMENT_SHIFT < PAGE_SHIFT))
 		return (EINVAL);
 
 	/*
 	 * Check for illegal addresses.  Watch out for address wrap... Note
 	 * that VM_*_ADDRESS are not constants due to casts (argh).
 	 */
 	if (flags & MAP_FIXED) {
 		/*
 		 * The specified address must have the same remainder
 		 * as the file offset taken modulo PAGE_SIZE, so it
 		 * should be aligned after adjustment by pageoff.
 		 */
 		addr -= pageoff;
 		if (addr & PAGE_MASK)
 			return (EINVAL);
 
 		/* Address range must be all in user VM space. */
 		if (addr < vm_map_min(&vms->vm_map) ||
 		    addr + size > vm_map_max(&vms->vm_map))
 			return (EINVAL);
 		if (addr + size < addr)
 			return (EINVAL);
 #ifdef MAP_32BIT
 		if (flags & MAP_32BIT && addr + size > MAP_32BIT_MAX_ADDR)
 			return (EINVAL);
 	} else if (flags & MAP_32BIT) {
 		/*
 		 * For MAP_32BIT, override the hint if it is too high and
 		 * do not bother moving the mapping past the heap (since
 		 * the heap is usually above 2GB).
 		 */
 		if (addr + size > MAP_32BIT_MAX_ADDR)
 			addr = 0;
 #endif
 	} else {
 		/*
 		 * XXX for non-fixed mappings where no hint is provided or
 		 * the hint would fall in the potential heap space,
 		 * place it after the end of the largest possible heap.
 		 *
 		 * There should really be a pmap call to determine a reasonable
 		 * location.
 		 */
 		PROC_LOCK(td->td_proc);
 		if (addr == 0 ||
 		    (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
 		    addr < round_page((vm_offset_t)vms->vm_daddr +
 		    lim_max(td->td_proc, RLIMIT_DATA))))
 			addr = round_page((vm_offset_t)vms->vm_daddr +
 			    lim_max(td->td_proc, RLIMIT_DATA));
 		PROC_UNLOCK(td->td_proc);
 	}
-	if (flags & MAP_ANON) {
+	if (size == 0) {
 		/*
+		 * Return success without mapping anything for old
+		 * binaries that request a page-aligned mapping of
+		 * length 0.  For modern binaries, this function
+		 * returns an error earlier.
+		 */
+		error = 0;
+	} else if (flags & MAP_ANON) {
+		/*
 		 * Mapping blank space is trivial.
+		 *
+		 * This relies on VM_PROT_* matching PROT_*.
 		 */
-		handle = NULL;
-		handle_type = OBJT_DEFAULT;
-		maxprot = VM_PROT_ALL;
-		cap_maxprot = VM_PROT_ALL;
+		error = vm_mmap_object(&vms->vm_map, &addr, size, prot,
+		    VM_PROT_ALL, flags, NULL, pos, FALSE, td);
 	} else {
 		/*
 		 * Mapping file, get fp for validation and don't let the
 		 * descriptor disappear on us if we block. Check capability
 		 * rights, but also return the maximum rights to be combined
 		 * with maxprot later.
 		 */
 		cap_rights_init(&rights, CAP_MMAP);
 		if (prot & PROT_READ)
 			cap_rights_set(&rights, CAP_MMAP_R);
 		if ((flags & MAP_SHARED) != 0) {
 			if (prot & PROT_WRITE)
 				cap_rights_set(&rights, CAP_MMAP_W);
 		}
 		if (prot & PROT_EXEC)
 			cap_rights_set(&rights, CAP_MMAP_X);
 		error = fget_mmap(td, uap->fd, &rights, &cap_maxprot, &fp);
 		if (error != 0)
 			goto done;
 		if ((flags & (MAP_SHARED | MAP_PRIVATE)) == 0 &&
 		    td->td_proc->p_osrel >= P_OSREL_MAP_FSTRICT) {
 			error = EINVAL;
 			goto done;
 		}
-		if (fp->f_type == DTYPE_SHM) {
-			handle = fp->f_data;
-			handle_type = OBJT_SWAP;
-			maxprot = VM_PROT_NONE;
 
-			/* FREAD should always be set. */
-			if (fp->f_flag & FREAD)
-				maxprot |= VM_PROT_EXECUTE | VM_PROT_READ;
-			if (fp->f_flag & FWRITE)
-				maxprot |= VM_PROT_WRITE;
-			goto map;
-		}
-		if (fp->f_type != DTYPE_VNODE) {
-			error = ENODEV;
-			goto done;
-		}
-#if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \
-    defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4)
-		/*
-		 * POSIX shared-memory objects are defined to have
-		 * kernel persistence, and are not defined to support
-		 * read(2)/write(2) -- or even open(2).  Thus, we can
-		 * use MAP_ASYNC to trade on-disk coherence for speed.
-		 * The shm_open(3) library routine turns on the FPOSIXSHM
-		 * flag to request this behavior.
-		 */
-		if (fp->f_flag & FPOSIXSHM)
-			flags |= MAP_NOSYNC;
-#endif
-		vp = fp->f_vnode;
-		/*
-		 * Ensure that file and memory protections are
-		 * compatible.  Note that we only worry about
-		 * writability if mapping is shared; in this case,
-		 * current and max prot are dictated by the open file.
-		 * XXX use the vnode instead?  Problem is: what
-		 * credentials do we use for determination? What if
-		 * proc does a setuid?
-		 */
-		if (vp->v_mount != NULL && vp->v_mount->mnt_flag & MNT_NOEXEC)
-			maxprot = VM_PROT_NONE;
-		else
-			maxprot = VM_PROT_EXECUTE;
-		if (fp->f_flag & FREAD) {
-			maxprot |= VM_PROT_READ;
-		} else if (prot & PROT_READ) {
-			error = EACCES;
-			goto done;
-		}
-		/*
-		 * If we are sharing potential changes (either via
-		 * MAP_SHARED or via the implicit sharing of character
-		 * device mappings), and we are trying to get write
-		 * permission although we opened it without asking
-		 * for it, bail out.
-		 */
-		if ((flags & MAP_SHARED) != 0) {
-			if ((fp->f_flag & FWRITE) != 0) {
-				maxprot |= VM_PROT_WRITE;
-			} else if ((prot & PROT_WRITE) != 0) {
-				error = EACCES;
-				goto done;
-			}
-		} else if (vp->v_type != VCHR || (fp->f_flag & FWRITE) != 0) {
-			maxprot |= VM_PROT_WRITE;
-			cap_maxprot |= VM_PROT_WRITE;
-		}
-		handle = (void *)vp;
-		handle_type = OBJT_VNODE;
+		/* This relies on VM_PROT_* matching PROT_*. */
+		error = fo_mmap(fp, &vms->vm_map, &addr, size, prot,
+		    cap_maxprot, flags, pos, td);
 	}
-map:
-	td->td_fpop = fp;
-	maxprot &= cap_maxprot;
 
-	/* This relies on VM_PROT_* matching PROT_*. */
-	error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot,
-	    flags, handle_type, handle, pos);
-	td->td_fpop = NULL;
-#ifdef HWPMC_HOOKS
-	/* inform hwpmc(4) if an executable is being mapped */
-	if (error == 0 && handle_type == OBJT_VNODE &&
-	    (prot & PROT_EXEC)) {
-		pkm.pm_file = handle;
-		pkm.pm_address = (uintptr_t) addr;
-		PMC_CALL_HOOK(td, PMC_FN_MMAP, (void *) &pkm);
-	}
-#endif
 	if (error == 0)
 		td->td_retval[0] = (register_t) (addr + pageoff);
 done:
 	if (fp)
 		fdrop(fp, td);
 
 	return (error);
 }
 
 #if defined(COMPAT_FREEBSD6)
 int
 freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap)
 {
 	struct mmap_args oargs;
 
 	oargs.addr = uap->addr;
 	oargs.len = uap->len;
 	oargs.prot = uap->prot;
 	oargs.flags = uap->flags;
 	oargs.fd = uap->fd;
 	oargs.pos = uap->pos;
 	return (sys_mmap(td, &oargs));
 }
 #endif
 
 #ifdef COMPAT_43
 #ifndef _SYS_SYSPROTO_H_
 struct ommap_args {
 	caddr_t addr;
 	int len;
 	int prot;
 	int flags;
 	int fd;
 	long pos;
 };
 #endif
 int
 ommap(td, uap)
 	struct thread *td;
 	struct ommap_args *uap;
 {
 	struct mmap_args nargs;
 	static const char cvtbsdprot[8] = {
 		0,
 		PROT_EXEC,
 		PROT_WRITE,
 		PROT_EXEC | PROT_WRITE,
 		PROT_READ,
 		PROT_EXEC | PROT_READ,
 		PROT_WRITE | PROT_READ,
 		PROT_EXEC | PROT_WRITE | PROT_READ,
 	};
 
 #define	OMAP_ANON	0x0002
 #define	OMAP_COPY	0x0020
 #define	OMAP_SHARED	0x0010
 #define	OMAP_FIXED	0x0100
 
 	nargs.addr = uap->addr;
 	nargs.len = uap->len;
 	nargs.prot = cvtbsdprot[uap->prot & 0x7];
 #ifdef COMPAT_FREEBSD32
 #if defined(__amd64__)
 	if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32) &&
 	    nargs.prot != 0)
 		nargs.prot |= PROT_EXEC;
 #endif
 #endif
 	nargs.flags = 0;
 	if (uap->flags & OMAP_ANON)
 		nargs.flags |= MAP_ANON;
 	if (uap->flags & OMAP_COPY)
 		nargs.flags |= MAP_COPY;
 	if (uap->flags & OMAP_SHARED)
 		nargs.flags |= MAP_SHARED;
 	else
 		nargs.flags |= MAP_PRIVATE;
 	if (uap->flags & OMAP_FIXED)
 		nargs.flags |= MAP_FIXED;
 	nargs.fd = uap->fd;
 	nargs.pos = uap->pos;
 	return (sys_mmap(td, &nargs));
 }
 #endif				/* COMPAT_43 */
 
 
 #ifndef _SYS_SYSPROTO_H_
 struct msync_args {
 	void *addr;
 	size_t len;
 	int flags;
 };
 #endif
 /*
  * MPSAFE
  */
 int
 sys_msync(td, uap)
 	struct thread *td;
 	struct msync_args *uap;
 {
 	vm_offset_t addr;
 	vm_size_t size, pageoff;
 	int flags;
 	vm_map_t map;
 	int rv;
 
 	addr = (vm_offset_t) uap->addr;
 	size = uap->len;
 	flags = uap->flags;
 
 	pageoff = (addr & PAGE_MASK);
 	addr -= pageoff;
 	size += pageoff;
 	size = (vm_size_t) round_page(size);
 	if (addr + size < addr)
 		return (EINVAL);
 
 	if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
 		return (EINVAL);
 
 	map = &td->td_proc->p_vmspace->vm_map;
 
 	/*
 	 * Clean the pages and interpret the return value.
 	 */
 	rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0,
 	    (flags & MS_INVALIDATE) != 0);
 	switch (rv) {
 	case KERN_SUCCESS:
 		return (0);
 	case KERN_INVALID_ADDRESS:
 		return (ENOMEM);
 	case KERN_INVALID_ARGUMENT:
 		return (EBUSY);
 	case KERN_FAILURE:
 		return (EIO);
 	default:
 		return (EINVAL);
 	}
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct munmap_args {
 	void *addr;
 	size_t len;
 };
 #endif
 /*
  * MPSAFE
  */
 int
 sys_munmap(td, uap)
 	struct thread *td;
 	struct munmap_args *uap;
 {
 #ifdef HWPMC_HOOKS
 	struct pmckern_map_out pkm;
 	vm_map_entry_t entry;
 #endif
 	vm_offset_t addr;
 	vm_size_t size, pageoff;
 	vm_map_t map;
 
 	addr = (vm_offset_t) uap->addr;
 	size = uap->len;
 	if (size == 0)
 		return (EINVAL);
 
 	pageoff = (addr & PAGE_MASK);
 	addr -= pageoff;
 	size += pageoff;
 	size = (vm_size_t) round_page(size);
 	if (addr + size < addr)
 		return (EINVAL);
 
 	/*
 	 * Check for illegal addresses.  Watch out for address wrap...
 	 */
 	map = &td->td_proc->p_vmspace->vm_map;
 	if (addr < vm_map_min(map) || addr + size > vm_map_max(map))
 		return (EINVAL);
 	vm_map_lock(map);
 #ifdef HWPMC_HOOKS
 	/*
 	 * Inform hwpmc if the address range being unmapped contains
 	 * an executable region.
 	 */
 	pkm.pm_address = (uintptr_t) NULL;
 	if (vm_map_lookup_entry(map, addr, &entry)) {
 		for (;
 		     entry != &map->header && entry->start < addr + size;
 		     entry = entry->next) {
 			if (vm_map_check_protection(map, entry->start,
 				entry->end, VM_PROT_EXECUTE) == TRUE) {
 				pkm.pm_address = (uintptr_t) addr;
 				pkm.pm_size = (size_t) size;
 				break;
 			}
 		}
 	}
 #endif
 	vm_map_delete(map, addr, addr + size);
 
 #ifdef HWPMC_HOOKS
 	/* downgrade the lock to prevent a LOR with the pmc-sx lock */
 	vm_map_lock_downgrade(map);
 	if (pkm.pm_address != (uintptr_t) NULL)
 		PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm);
 	vm_map_unlock_read(map);
 #else
 	vm_map_unlock(map);
 #endif
 	/* vm_map_delete returns nothing but KERN_SUCCESS anyway */
 	return (0);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct mprotect_args {
 	const void *addr;
 	size_t len;
 	int prot;
 };
 #endif
 /*
  * MPSAFE
  */
 int
 sys_mprotect(td, uap)
 	struct thread *td;
 	struct mprotect_args *uap;
 {
 	vm_offset_t addr;
 	vm_size_t size, pageoff;
 	vm_prot_t prot;
 
 	addr = (vm_offset_t) uap->addr;
 	size = uap->len;
 	prot = uap->prot & VM_PROT_ALL;
 
 	pageoff = (addr & PAGE_MASK);
 	addr -= pageoff;
 	size += pageoff;
 	size = (vm_size_t) round_page(size);
 	if (addr + size < addr)
 		return (EINVAL);
 
 	switch (vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr,
 	    addr + size, prot, FALSE)) {
 	case KERN_SUCCESS:
 		return (0);
 	case KERN_PROTECTION_FAILURE:
 		return (EACCES);
 	case KERN_RESOURCE_SHORTAGE:
 		return (ENOMEM);
 	}
 	return (EINVAL);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct minherit_args {
 	void *addr;
 	size_t len;
 	int inherit;
 };
 #endif
 /*
  * MPSAFE
  */
 int
 sys_minherit(td, uap)
 	struct thread *td;
 	struct minherit_args *uap;
 {
 	vm_offset_t addr;
 	vm_size_t size, pageoff;
 	vm_inherit_t inherit;
 
 	addr = (vm_offset_t)uap->addr;
 	size = uap->len;
 	inherit = uap->inherit;
 
 	pageoff = (addr & PAGE_MASK);
 	addr -= pageoff;
 	size += pageoff;
 	size = (vm_size_t) round_page(size);
 	if (addr + size < addr)
 		return (EINVAL);
 
 	switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr,
 	    addr + size, inherit)) {
 	case KERN_SUCCESS:
 		return (0);
 	case KERN_PROTECTION_FAILURE:
 		return (EACCES);
 	}
 	return (EINVAL);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct madvise_args {
 	void *addr;
 	size_t len;
 	int behav;
 };
 #endif
 
 /*
  * MPSAFE
  */
 int
 sys_madvise(td, uap)
 	struct thread *td;
 	struct madvise_args *uap;
 {
 	vm_offset_t start, end;
 	vm_map_t map;
 	int flags;
 
 	/*
 	 * Check for our special case, advising the swap pager we are
 	 * "immortal."
 	 */
 	if (uap->behav == MADV_PROTECT) {
 		flags = PPROT_SET;
 		return (kern_procctl(td, P_PID, td->td_proc->p_pid,
 		    PROC_SPROTECT, &flags));
 	}
 
 	/*
 	 * Check for illegal behavior
 	 */
 	if (uap->behav < 0 || uap->behav > MADV_CORE)
 		return (EINVAL);
 	/*
 	 * Check for illegal addresses.  Watch out for address wrap... Note
 	 * that VM_*_ADDRESS are not constants due to casts (argh).
 	 */
 	map = &td->td_proc->p_vmspace->vm_map;
 	if ((vm_offset_t)uap->addr < vm_map_min(map) ||
 	    (vm_offset_t)uap->addr + uap->len > vm_map_max(map))
 		return (EINVAL);
 	if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr)
 		return (EINVAL);
 
 	/*
 	 * Since this routine is only advisory, we default to conservative
 	 * behavior.
 	 */
 	start = trunc_page((vm_offset_t) uap->addr);
 	end = round_page((vm_offset_t) uap->addr + uap->len);
 
 	if (vm_map_madvise(map, start, end, uap->behav))
 		return (EINVAL);
 	return (0);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct mincore_args {
 	const void *addr;
 	size_t len;
 	char *vec;
 };
 #endif
 
 /*
- * MPSAFE
+ * MPSAFE.  Store one status byte per page of [addr, addr + len) in vec.
  */
 int
 sys_mincore(td, uap)
 	struct thread *td;
 	struct mincore_args *uap;
 {
 	vm_offset_t addr, first_addr;
 	vm_offset_t end, cend;
 	pmap_t pmap;
 	vm_map_t map;
 	char *vec;
 	int error = 0;
 	int vecindex, lastvecindex;
 	vm_map_entry_t current;
 	vm_map_entry_t entry;
 	vm_object_t object;
 	vm_paddr_t locked_pa;
 	vm_page_t m;
 	vm_pindex_t pindex;
 	int mincoreinfo;
 	unsigned int timestamp;
 	boolean_t locked;
 
 	/*
 	 * Make sure that the addresses presented are valid for user
 	 * mode.
 	 */
 	first_addr = addr = trunc_page((vm_offset_t) uap->addr);
 	end = addr + (vm_size_t)round_page(uap->len);
 	map = &td->td_proc->p_vmspace->vm_map;
 	if (end > vm_map_max(map) || end < addr)
 		return (ENOMEM);
 
 	/*
 	 * Address of byte vector
 	 */
 	vec = uap->vec;
 
 	pmap = vmspace_pmap(td->td_proc->p_vmspace);
 
 	vm_map_lock_read(map);
 RestartScan:
 	timestamp = map->timestamp;
 
 	if (!vm_map_lookup_entry(map, addr, &entry)) {
 		vm_map_unlock_read(map);
 		return (ENOMEM);
 	}
 
 	/*
 	 * Do this on a map entry basis so that if the pages are not
-	 * in the current processes address space, we can easily look
+	 * in the current process's address space, we can easily look
 	 * up the pages elsewhere.
 	 */
 	lastvecindex = -1;
 	for (current = entry;
 	    (current != &map->header) && (current->start < end);
 	    current = current->next) {
 
 		/*
-		 * check for contiguity
+		 * check for contiguity: no hole or map end before "end"
 		 */
 		if (current->end < end &&
-		    (entry->next == &map->header ||
+		    (current->next == &map->header ||
 		     current->next->start > current->end)) {
 			vm_map_unlock_read(map);
 			return (ENOMEM);
 		}
 
 		/*
 		 * ignore submaps (for now) or null objects
 		 */
 		if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
 			current->object.vm_object == NULL)
 			continue;
 
 		/*
 		 * limit this scan to the current map entry and the
 		 * limits for the mincore call
 		 */
 		if (addr < current->start)
 			addr = current->start;
 		cend = current->end;
 		if (cend > end)
 			cend = end;
 
 		/*
 		 * scan this entry one page at a time
 		 */
 		while (addr < cend) {
 			/*
 			 * Check pmap first, it is likely faster, also
 			 * it can provide info as to whether we are the
 			 * one referencing or modifying the page.
 			 */
 			object = NULL;
 			locked_pa = 0;
 		retry:
 			m = NULL;
 			mincoreinfo = pmap_mincore(pmap, addr, &locked_pa);
 			if (locked_pa != 0) {
 				/*
 				 * The page is mapped by this process but not
 				 * both accessed and modified.  It is also
 				 * managed.  Acquire the object lock so that
 				 * other mappings might be examined.
 				 */
 				m = PHYS_TO_VM_PAGE(locked_pa);
 				if (m->object != object) {
 					if (object != NULL)
 						VM_OBJECT_WUNLOCK(object);
 					object = m->object;
 					locked = VM_OBJECT_TRYWLOCK(object);
 					vm_page_unlock(m);
 					if (!locked) {
 						VM_OBJECT_WLOCK(object);
 						vm_page_lock(m);
 						goto retry;
 					}
 				} else
 					vm_page_unlock(m);
 				KASSERT(m->valid == VM_PAGE_BITS_ALL,
 				    ("mincore: page %p is mapped but invalid",
 				    m));
 			} else if (mincoreinfo == 0) {
 				/*
 				 * The page is not mapped by this process.  If
 				 * the object implements managed pages, then
 				 * determine if the page is resident so that
 				 * the mappings might be examined.
 				 */
 				if (current->object.vm_object != object) {
 					if (object != NULL)
 						VM_OBJECT_WUNLOCK(object);
 					object = current->object.vm_object;
 					VM_OBJECT_WLOCK(object);
 				}
 				if (object->type == OBJT_DEFAULT ||
 				    object->type == OBJT_SWAP ||
 				    object->type == OBJT_VNODE) {
 					pindex = OFF_TO_IDX(current->offset +
 					    (addr - current->start));
 					m = vm_page_lookup(object, pindex);
 					if (m == NULL &&
 					    vm_page_is_cached(object, pindex))
 						mincoreinfo = MINCORE_INCORE;
 					if (m != NULL && m->valid == 0)
 						m = NULL;
 					if (m != NULL)
 						mincoreinfo = MINCORE_INCORE;
 				}
 			}
 			if (m != NULL) {
 				/* Examine other mappings to the page. */
 				if (m->dirty == 0 && pmap_is_modified(m))
 					vm_page_dirty(m);
 				if (m->dirty != 0)
 					mincoreinfo |= MINCORE_MODIFIED_OTHER;
 				/*
 				 * The first test for PGA_REFERENCED is an
 				 * optimization.  The second test is
 				 * required because a concurrent pmap
 				 * operation could clear the last reference
 				 * and set PGA_REFERENCED before the call to
 				 * pmap_is_referenced(). 
 				 */
 				if ((m->aflags & PGA_REFERENCED) != 0 ||
 				    pmap_is_referenced(m) ||
 				    (m->aflags & PGA_REFERENCED) != 0)
 					mincoreinfo |= MINCORE_REFERENCED_OTHER;
 			}
 			if (object != NULL)
 				VM_OBJECT_WUNLOCK(object);
 
 			/*
 			 * subyte may page fault.  In case it needs to modify
 			 * the map, we release the lock.
 			 */
 			vm_map_unlock_read(map);
 
 			/*
 			 * calculate index into user supplied byte vector
 			 */
 			vecindex = OFF_TO_IDX(addr - first_addr);
 
 			/*
 			 * If we have skipped map entries, we need to make sure that
 			 * the byte vector is zeroed for those skipped entries.
 			 */
 			while ((lastvecindex + 1) < vecindex) {
 				++lastvecindex;
 				error = subyte(vec + lastvecindex, 0);
 				if (error) {
 					error = EFAULT;
 					goto done2;
 				}
 			}
 
 			/*
 			 * Pass the page information to the user
 			 */
 			error = subyte(vec + vecindex, mincoreinfo);
 			if (error) {
 				error = EFAULT;
 				goto done2;
 			}
 
 			/*
 			 * If the map has changed, due to the subyte, the previous
 			 * output may be invalid.
 			 */
 			vm_map_lock_read(map);
 			if (timestamp != map->timestamp)
 				goto RestartScan;
 
 			lastvecindex = vecindex;
 			addr += PAGE_SIZE;
 		}
 	}
 
 	/*
 	 * subyte may page fault.  In case it needs to modify
 	 * the map, we release the lock.
 	 */
 	vm_map_unlock_read(map);
 
 	/*
 	 * Zero the last entries in the byte vector.
 	 */
 	vecindex = OFF_TO_IDX(end - first_addr);
 	while ((lastvecindex + 1) < vecindex) {
 		++lastvecindex;
 		error = subyte(vec + lastvecindex, 0);
 		if (error) {
 			error = EFAULT;
 			goto done2;
 		}
 	}
 
 	/*
 	 * If the map has changed, due to the subyte, the previous
 	 * output may be invalid.
 	 */
 	vm_map_lock_read(map);
 	if (timestamp != map->timestamp)
 		goto RestartScan;
 	vm_map_unlock_read(map);
 done2:
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct mlock_args {
 	const void *addr;
 	size_t len;
 };
 #endif
 /*
  * mlock(2) system call entry point.  MPSAFE.
  *
  * Hands the calling thread's process and credentials, together with the
  * address and length taken from the syscall arguments, to vm_mlock() and
  * returns whatever vm_mlock() returns.
  */
 int
 sys_mlock(struct thread *td, struct mlock_args *uap)
 {
 	struct proc *p;
 
 	p = td->td_proc;
 	return (vm_mlock(p, td->td_ucred, uap->addr, uap->len));
 }
 
 /*
  * Wire the pages covering [addr0, addr0 + len) in proc's address space.
  *
  * The range is widened to page boundaries before it is passed to
  * vm_map_wire().  Returns:
  *   - the priv_check_cred() error if PRIV_VM_MLOCK is not granted;
  *   - EINVAL if the range wraps around the address space;
  *   - ENOMEM if the page count exceeds vm_page_max_wired, if the new
  *     wired size would exceed RLIMIT_MEMLOCK, if racct_set() refuses the
  *     new RACCT_MEMLOCK value, or if vm_map_wire() fails;
  *   - EAGAIN if the page count plus vm_cnt.v_wire_count exceeds
  *     vm_page_max_wired;
  *   - 0 on success.
  */
 int
 vm_mlock(struct proc *proc, struct ucred *cred, const void *addr0, size_t len)
 {
 	vm_offset_t addr, end, last, start;
 	vm_size_t npages, size;
 	vm_map_t map;
 	unsigned long nsize;
 	int error, over_limit, rv;
 
 	error = priv_check_cred(cred, PRIV_VM_MLOCK, 0);
 	if (error != 0)
 		return (error);
 
 	/* Widen the request to page boundaries; refuse ranges that wrap. */
 	addr = (vm_offset_t)addr0;
 	size = len;
 	last = addr + size;
 	start = trunc_page(addr);
 	end = round_page(last);
 	if (end < addr || last < addr)
 		return (EINVAL);
 
 	npages = atop(end - start);
 	if (npages > vm_page_max_wired)
 		return (ENOMEM);
 
 	/* The resource limit is read with the process lock held. */
 	map = &proc->p_vmspace->vm_map;
 	PROC_LOCK(proc);
 	nsize = ptoa(npages + pmap_wired_count(map->pmap));
 	over_limit = (nsize > lim_cur(proc, RLIMIT_MEMLOCK));
 	PROC_UNLOCK(proc);
 	if (over_limit)
 		return (ENOMEM);
 
 	if (npages + vm_cnt.v_wire_count > vm_page_max_wired)
 		return (EAGAIN);
 #ifdef RACCT
 	if (racct_enable) {
 		PROC_LOCK(proc);
 		error = racct_set(proc, RACCT_MEMLOCK, nsize);
 		PROC_UNLOCK(proc);
 		if (error != 0)
 			return (ENOMEM);
 	}
 #endif
 	rv = vm_map_wire(map, start, end,
 	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
 #ifdef RACCT
 	if (racct_enable && rv != KERN_SUCCESS) {
 		/*
 		 * Wiring failed: reset the RACCT_MEMLOCK figure to what the
 		 * pmap currently reports as wired.
 		 */
 		PROC_LOCK(proc);
 		racct_set(proc, RACCT_MEMLOCK,
 		    ptoa(pmap_wired_count(map->pmap)));
 		PROC_UNLOCK(proc);
 	}
 #endif
 	if (rv != KERN_SUCCESS)
 		return (ENOMEM);
 	return (0);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct mlockall_args {
 	int	how;
 };
 #endif
 
 /*
  * MPSAFE
  */
 int
 sys_mlockall(td, uap)
 	struct thread *td;
 	struct mlockall_args *uap;
 {
 	vm_map_t map;
 	int error;
 
 	map = &td->td_proc->p_vmspace->vm_map;
 	error = priv_check(td, PRIV_VM_MLOCK);
 	if (error)
 		return (error);
 
 	if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0))
 		return (EINVAL);
 
 	/*
 	 * If wiring all pages in the process would cause it to exceed
 	 * a hard resource limit, return ENOMEM.
 	 */
 	if (!old_mlock && uap->how & MCL_CURRENT) {
 		PROC_LOCK(td->td_proc);
 		if (map->size > lim_cur(td->td_proc, RLIMIT_MEMLOCK)) {
 			PROC_UNLOCK(td->td_proc);
 			return (ENOMEM);
 		}
 		PROC_UNLOCK(td->td_proc);
 	}
 #ifdef RACCT
 	if (racct_enable) {
 		PROC_LOCK(td->td_proc);
 		error = racct_set(td->td_proc, RACCT_MEMLOCK, map->size);
 		PROC_UNLOCK(td->td_proc);
 		if (error != 0)
 			return (ENOMEM);
 	}
 #endif
 
 	if (uap->how & MCL_FUTURE) {
 		vm_map_lock(map);
 		vm_map_modflags(map, MAP_WIREFUTURE, 0);
 		vm_map_unlock(map);
 		error = 0;
 	}
 
 	if (uap->how & MCL_CURRENT) {
 		/*
 		 * P1003.1-2001 mandates that all currently mapped pages
 		 * will be memory resident and locked (wired) upon return
 		 * from mlockall(). vm_map_wire() will wire pages, by
 		 * calling vm_fault_wire() for each page in the region.
 		 */
 		error = vm_map_wire(map, vm_map_min(map), vm_map_max(map),
 		    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
 		error = (error == KERN_SUCCESS ? 0 : EAGAIN);
 	}
 #ifdef RACCT
 	if (racct_enable && error != KERN_SUCCESS) {
 		PROC_LOCK(td->td_proc);
 		racct_set(td->td_proc, RACCT_MEMLOCK,
 		    ptoa(pmap_wired_count(map->pmap)));
 		PROC_UNLOCK(td->td_proc);
 	}
 #endif
 
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct munlockall_args {
 	register_t dummy;
 };
 #endif
 
 /*
  * munlockall(2) system call entry point.  MPSAFE.
  *
  * Clears MAP_WIREFUTURE on the calling process's map and then unwires
  * the whole map range with vm_map_unwire().  When RACCT is enabled and
  * the unwire succeeds, RACCT_MEMLOCK is reset to 0.
  *
  * Returns an errno value: the priv_check() error if PRIV_VM_MUNLOCK is
  * not granted, otherwise the vm_mmap_to_errno() translation of the
  * vm_map_unwire() status (0 on success).
  */
 int
 sys_munlockall(struct thread *td, struct munlockall_args *uap)
 {
 	vm_map_t map;
 	int error, rv;
 
 	map = &td->td_proc->p_vmspace->vm_map;
 	error = priv_check(td, PRIV_VM_MUNLOCK);
 	if (error != 0)
 		return (error);
 
 	/* Clear the MAP_WIREFUTURE flag from this vm_map. */
 	vm_map_lock(map);
 	vm_map_modflags(map, 0, MAP_WIREFUTURE);
 	vm_map_unlock(map);
 
 	/* Forcibly unwire all pages. */
 	rv = vm_map_unwire(map, vm_map_min(map), vm_map_max(map),
 	    VM_MAP_WIRE_USER | VM_MAP_WIRE_HOLESOK);
 #ifdef RACCT
 	if (racct_enable && rv == KERN_SUCCESS) {
 		PROC_LOCK(td->td_proc);
 		racct_set(td->td_proc, RACCT_MEMLOCK, 0);
 		PROC_UNLOCK(td->td_proc);
 	}
 #endif
 
 	/*
 	 * vm_map_unwire() reports a KERN_* status.  Translate it rather than
 	 * handing it back as if it were an errno value, as the other wiring
 	 * system calls in this file do.
 	 */
 	return (vm_mmap_to_errno(rv));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct munlock_args {
 	const void *addr;
 	size_t len;
 };
 #endif
 /*
  * MPSAFE
  */
 int
 sys_munlock(td, uap)
 	struct thread *td;
 	struct munlock_args *uap;
 {
 	vm_offset_t addr, end, last, start;
 	vm_size_t size;
 #ifdef RACCT
 	vm_map_t map;
 #endif
 	int error;
 
 	error = priv_check(td, PRIV_VM_MUNLOCK);
 	if (error)
 		return (error);
 	addr = (vm_offset_t)uap->addr;
 	size = uap->len;
 	last = addr + size;
 	start = trunc_page(addr);
 	end = round_page(last);
 	if (last < addr || end < addr)
 		return (EINVAL);
 	error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end,
 	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
 #ifdef RACCT
 	if (racct_enable && error == KERN_SUCCESS) {
 		PROC_LOCK(td->td_proc);
 		map = &td->td_proc->p_vmspace->vm_map;
 		racct_set(td->td_proc, RACCT_MEMLOCK,
 		    ptoa(pmap_wired_count(map->pmap)));
 		PROC_UNLOCK(td->td_proc);
 	}
 #endif
 	return (error == KERN_SUCCESS ? 0 : ENOMEM);
 }
 
 /*
  * vm_mmap_vnode()
  *
  * Helper function for vm_mmap.  Perform sanity check specific for mmap
  * operations on vnodes.
- *
- * For VCHR vnodes, the vnode lock is held over the call to
- * vm_mmap_cdev() to keep vp->v_rdev valid.
  */
 int
 vm_mmap_vnode(struct thread *td, vm_size_t objsize,
     vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
     struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp,
     boolean_t *writecounted)
 {
 	struct vattr va;
 	vm_object_t obj;
 	vm_offset_t foff;
 	struct ucred *cred;
 	int error, flags, locktype;
 
 	cred = td->td_ucred;
 	if ((*maxprotp & VM_PROT_WRITE) && (*flagsp & MAP_SHARED))
 		locktype = LK_EXCLUSIVE;
 	else
 		locktype = LK_SHARED;
 	if ((error = vget(vp, locktype, td)) != 0)
 		return (error);
 	foff = *foffp;
 	flags = *flagsp;
 	obj = vp->v_object;
 	if (vp->v_type == VREG) {
 		/*
 		 * Get the proper underlying object
 		 */
 		if (obj == NULL) {
 			error = EINVAL;
 			goto done;
 		}
 		if (obj->type == OBJT_VNODE && obj->handle != vp) {
 			vput(vp);
 			vp = (struct vnode *)obj->handle;
 			/*
 			 * Bypass filesystems obey the mpsafety of the
 			 * underlying fs.  Tmpfs never bypasses.
 			 */
 			error = vget(vp, locktype, td);
 			if (error != 0)
 				return (error);
 		}
 		if (locktype == LK_EXCLUSIVE) {
 			*writecounted = TRUE;
 			vnode_pager_update_writecount(obj, 0, objsize);
 		}
-	} else if (vp->v_type == VCHR) {
-		error = vm_mmap_cdev(td, objsize, prot, maxprotp, flagsp,
-		    vp->v_rdev, foffp, objp);
-		if (error == 0)
-			goto mark_atime;
-		goto done;
 	} else {
 		error = EINVAL;
 		goto done;
 	}
 	if ((error = VOP_GETATTR(vp, &va, cred)))
 		goto done;
 #ifdef MAC
-	error = mac_vnode_check_mmap(cred, vp, prot, flags);
+	/* This relies on VM_PROT_* matching PROT_*. */
+	error = mac_vnode_check_mmap(cred, vp, (int)prot, flags);
 	if (error != 0)
 		goto done;
 #endif
 	if ((flags & MAP_SHARED) != 0) {
 		if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) {
-			if (prot & PROT_WRITE) {
+			if (prot & VM_PROT_WRITE) {
 				error = EPERM;
 				goto done;
 			}
 			*maxprotp &= ~VM_PROT_WRITE;
 		}
 	}
 	/*
 	 * If it is a regular file without any references
 	 * we do not need to sync it.
 	 * Adjust object size to be the size of actual file.
 	 */
 	objsize = round_page(va.va_size);
 	if (va.va_nlink == 0)
 		flags |= MAP_NOSYNC;
 	if (obj->type == OBJT_VNODE) {
 		obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff,
 		    cred);
 		if (obj == NULL) {
 			error = ENOMEM;
 			goto done;
 		}
 	} else {
 		KASSERT(obj->type == OBJT_DEFAULT || obj->type == OBJT_SWAP,
 		    ("wrong object type"));
 		VM_OBJECT_WLOCK(obj);
 		vm_object_reference_locked(obj);
 #if VM_NRESERVLEVEL > 0
 		vm_object_color(obj, 0);
 #endif
 		VM_OBJECT_WUNLOCK(obj);
 	}
 	*objp = obj;
 	*flagsp = flags;
 
-mark_atime:
 	vfs_mark_atime(vp, cred);
 
 done:
 	if (error != 0 && *writecounted) {
 		*writecounted = FALSE;
 		vnode_pager_update_writecount(obj, objsize, 0);
 	}
 	vput(vp);
 	return (error);
 }
 
 /*
  * vm_mmap_cdev()
  *
  * MPSAFE
  *
  * Helper function for vm_mmap.  Perform sanity check specific for mmap
  * operations on cdevs.
  */
 int
-vm_mmap_cdev(struct thread *td, vm_size_t objsize,
-    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
-    struct cdev *cdev, vm_ooffset_t *foff, vm_object_t *objp)
+vm_mmap_cdev(struct thread *td, vm_size_t objsize, vm_prot_t prot,
+    vm_prot_t *maxprotp, int *flagsp, struct cdev *cdev, struct cdevsw *dsw,
+    vm_ooffset_t *foff, vm_object_t *objp)
 {
 	vm_object_t obj;
-	struct cdevsw *dsw;
-	int error, flags, ref;
+	int error, flags;
 
 	flags = *flagsp;
 
-	dsw = dev_refthread(cdev, &ref);
-	if (dsw == NULL)
-		return (ENXIO);
 	if (dsw->d_flags & D_MMAP_ANON) {
-		dev_relthread(cdev, ref);
+		*objp = NULL;
+		*foff = 0;
 		*maxprotp = VM_PROT_ALL;
 		*flagsp |= MAP_ANON;
 		return (0);
 	}
 	/*
 	 * cdevs do not provide private mappings of any kind.
 	 */
 	if ((*maxprotp & VM_PROT_WRITE) == 0 &&
-	    (prot & PROT_WRITE) != 0) {
-		dev_relthread(cdev, ref);
+	    (prot & VM_PROT_WRITE) != 0)
 		return (EACCES);
-	}
-	if (flags & (MAP_PRIVATE|MAP_COPY)) {
-		dev_relthread(cdev, ref);
+	if (flags & (MAP_PRIVATE|MAP_COPY))
 		return (EINVAL);
-	}
 	/*
 	 * Force device mappings to be shared.
 	 */
 	flags |= MAP_SHARED;
 #ifdef MAC_XXX
-	error = mac_cdev_check_mmap(td->td_ucred, cdev, prot);
-	if (error != 0) {
-		dev_relthread(cdev, ref);
+	error = mac_cdev_check_mmap(td->td_ucred, cdev, (int)prot);
+	if (error != 0)
 		return (error);
-	}
 #endif
 	/*
 	 * First, try d_mmap_single().  If that is not implemented
 	 * (returns ENODEV), fall back to using the device pager.
 	 * Note that d_mmap_single() must return a reference to the
 	 * object (it needs to bump the reference count of the object
 	 * it returns somehow).
 	 *
 	 * XXX assumes VM_PROT_* == PROT_*
 	 */
 	error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot);
-	dev_relthread(cdev, ref);
 	if (error != ENODEV)
 		return (error);
 	obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff,
 	    td->td_ucred);
 	if (obj == NULL)
 		return (EINVAL);
 	*objp = obj;
 	*flagsp = flags;
 	return (0);
 }
 
 /*
- * vm_mmap_shm()
+ * vm_mmap()
  *
- * MPSAFE
- *
- * Helper function for vm_mmap.  Perform sanity check specific for mmap
- * operations on shm file descriptors.
+ * Internal version of mmap used by exec, sys5 shared memory, and
+ * various device drivers.  Handle is either a vnode pointer, a
+ * character device, or NULL for MAP_ANON.
  */
 int
-vm_mmap_shm(struct thread *td, vm_size_t objsize,
-    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
-    struct shmfd *shmfd, vm_ooffset_t foff, vm_object_t *objp)
+vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
+	vm_prot_t maxprot, int flags,
+	objtype_t handle_type, void *handle,
+	vm_ooffset_t foff)
 {
+	vm_object_t object;
+	struct thread *td = curthread;
 	int error;
+	boolean_t writecounted;
 
-	if ((*flagsp & MAP_SHARED) != 0 &&
-	    (*maxprotp & VM_PROT_WRITE) == 0 &&
-	    (prot & PROT_WRITE) != 0)
-		return (EACCES);
-#ifdef MAC
-	error = mac_posixshm_check_mmap(td->td_ucred, shmfd, prot, *flagsp);
-	if (error != 0)
-		return (error);
-#endif
-	error = shm_mmap(shmfd, objsize, foff, objp);
+	if (size == 0)
+		return (EINVAL);
+
+	size = round_page(size);
+	writecounted = FALSE;
+
+	/*
+	 * Lookup/allocate object.
+	 */
+	switch (handle_type) {
+	case OBJT_DEVICE: {
+		struct cdevsw *dsw;
+		struct cdev *cdev;
+		int ref;
+
+		cdev = handle;
+		dsw = dev_refthread(cdev, &ref);
+		if (dsw == NULL)
+			return (ENXIO);
+		error = vm_mmap_cdev(td, size, prot, &maxprot, &flags, cdev,
+		    dsw, &foff, &object);
+		dev_relthread(cdev, ref);
+		break;
+	}
+	case OBJT_VNODE:
+		error = vm_mmap_vnode(td, size, prot, &maxprot, &flags,
+		    handle, &foff, &object, &writecounted);
+		break;
+	case OBJT_DEFAULT:
+		if (handle == NULL) {
+			error = 0;
+			break;
+		}
+		/* FALLTHROUGH */
+	default:
+		error = EINVAL;
+		break;
+	}
 	if (error)
 		return (error);
-	return (0);
+
+	error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object,
+	    foff, writecounted, td);
+	if (error != 0 && object != NULL) {
+		/*
+		 * If this mapping was accounted for in the vnode's
+		 * writecount, then undo that now.
+		 */
+		if (writecounted)
+			vnode_pager_release_writecount(object, 0, size);
+		vm_object_deallocate(object);
+	}
+	return (error);
 }
 
 /*
- * vm_mmap()
- *
- * MPSAFE
- *
- * Internal version of mmap.  Currently used by mmap, exec, and sys5
- * shared memory.  Handle is either a vnode pointer or NULL for MAP_ANON.
+ * Internal version of mmap that maps a specific VM object into a
+ * map.  Called by mmap for MAP_ANON, vm_mmap, shm_mmap, and vn_mmap.
  */
 int
-vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
-	vm_prot_t maxprot, int flags,
-	objtype_t handle_type, void *handle,
-	vm_ooffset_t foff)
+vm_mmap_object(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
+    vm_prot_t maxprot, int flags, vm_object_t object, vm_ooffset_t foff,
+    boolean_t writecounted, struct thread *td)
 {
 	boolean_t fitit;
-	vm_object_t object = NULL;
-	struct thread *td = curthread;
 	int docow, error, findspace, rv;
-	boolean_t writecounted;
 
-	if (size == 0)
-		return (0);
-
-	size = round_page(size);
-
 	if (map == &td->td_proc->p_vmspace->vm_map) {
 		PROC_LOCK(td->td_proc);
 		if (map->size + size > lim_cur(td->td_proc, RLIMIT_VMEM)) {
 			PROC_UNLOCK(td->td_proc);
 			return (ENOMEM);
 		}
 		if (racct_set(td->td_proc, RACCT_VMEM, map->size + size)) {
 			PROC_UNLOCK(td->td_proc);
 			return (ENOMEM);
 		}
 		if (!old_mlock && map->flags & MAP_WIREFUTURE) {
 			if (ptoa(pmap_wired_count(map->pmap)) + size >
 			    lim_cur(td->td_proc, RLIMIT_MEMLOCK)) {
 				racct_set_force(td->td_proc, RACCT_VMEM,
 				    map->size);
 				PROC_UNLOCK(td->td_proc);
 				return (ENOMEM);
 			}
 			error = racct_set(td->td_proc, RACCT_MEMLOCK,
 			    ptoa(pmap_wired_count(map->pmap)) + size);
 			if (error != 0) {
 				racct_set_force(td->td_proc, RACCT_VMEM,
 				    map->size);
 				PROC_UNLOCK(td->td_proc);
 				return (error);
 			}
 		}
 		PROC_UNLOCK(td->td_proc);
 	}
 
 	/*
 	 * We currently can only deal with page aligned file offsets.
-	 * The check is here rather than in the syscall because the
-	 * kernel calls this function internally for other mmaping
-	 * operations (such as in exec) and non-aligned offsets will
-	 * cause pmap inconsistencies...so we want to be sure to
-	 * disallow this in all cases.
+	 * The mmap() system call already enforces this by subtracting
+	 * the page offset from the file offset, but checking here
+	 * catches errors in device drivers (e.g. d_mmap_single()
+	 * callbacks) and other internal mapping requests (such as in
+	 * exec).
 	 */
 	if (foff & PAGE_MASK)
 		return (EINVAL);
 
 	if ((flags & MAP_FIXED) == 0) {
 		fitit = TRUE;
 		*addr = round_page(*addr);
 	} else {
 		if (*addr != trunc_page(*addr))
 			return (EINVAL);
 		fitit = FALSE;
 	}
-	writecounted = FALSE;
 
-	/*
-	 * Lookup/allocate object.
-	 */
-	switch (handle_type) {
-	case OBJT_DEVICE:
-		error = vm_mmap_cdev(td, size, prot, &maxprot, &flags,
-		    handle, &foff, &object);
-		break;
-	case OBJT_VNODE:
-		error = vm_mmap_vnode(td, size, prot, &maxprot, &flags,
-		    handle, &foff, &object, &writecounted);
-		break;
-	case OBJT_SWAP:
-		error = vm_mmap_shm(td, size, prot, &maxprot, &flags,
-		    handle, foff, &object);
-		break;
-	case OBJT_DEFAULT:
-		if (handle == NULL) {
-			error = 0;
-			break;
-		}
-		/* FALLTHROUGH */
-	default:
-		error = EINVAL;
-		break;
-	}
-	if (error)
-		return (error);
 	if (flags & MAP_ANON) {
-		object = NULL;
+		if (object != NULL || foff != 0)
+			return (EINVAL);
 		docow = 0;
-		/*
-		 * Unnamed anonymous regions always start at 0.
-		 */
-		if (handle == 0)
-			foff = 0;
 	} else if (flags & MAP_PREFAULT_READ)
 		docow = MAP_PREFAULT;
 	else
 		docow = MAP_PREFAULT_PARTIAL;
 
 	if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
 		docow |= MAP_COPY_ON_WRITE;
 	if (flags & MAP_NOSYNC)
 		docow |= MAP_DISABLE_SYNCER;
 	if (flags & MAP_NOCORE)
 		docow |= MAP_DISABLE_COREDUMP;
 	/* Shared memory is also shared with children. */
 	if (flags & MAP_SHARED)
 		docow |= MAP_INHERIT_SHARE;
 	if (writecounted)
 		docow |= MAP_VN_WRITECOUNT;
 	if (flags & MAP_STACK) {
 		if (object != NULL)
 			return (EINVAL);
 		docow |= MAP_STACK_GROWS_DOWN;
 	}
 	if ((flags & MAP_EXCL) != 0)
 		docow |= MAP_CHECK_EXCL;
 
 	if (fitit) {
 		if ((flags & MAP_ALIGNMENT_MASK) == MAP_ALIGNED_SUPER)
 			findspace = VMFS_SUPER_SPACE;
 		else if ((flags & MAP_ALIGNMENT_MASK) != 0)
 			findspace = VMFS_ALIGNED_SPACE(flags >>
 			    MAP_ALIGNMENT_SHIFT);
 		else
 			findspace = VMFS_OPTIMAL_SPACE;
 		rv = vm_map_find(map, object, foff, addr, size,
 #ifdef MAP_32BIT
 		    flags & MAP_32BIT ? MAP_32BIT_MAX_ADDR :
 #endif
 		    0, findspace, prot, maxprot, docow);
 	} else {
 		rv = vm_map_fixed(map, object, foff, *addr, size,
 		    prot, maxprot, docow);
 	}
 
 	if (rv == KERN_SUCCESS) {
 		/*
 		 * If the process has requested that all future mappings
 		 * be wired, then heed this.
 		 */
 		if (map->flags & MAP_WIREFUTURE) {
 			vm_map_wire(map, *addr, *addr + size,
 			    VM_MAP_WIRE_USER | ((flags & MAP_STACK) ?
 			    VM_MAP_WIRE_HOLESOK : VM_MAP_WIRE_NOHOLES));
 		}
-	} else {
-		/*
-		 * If this mapping was accounted for in the vnode's
-		 * writecount, then undo that now.
-		 */
-		if (writecounted)
-			vnode_pager_release_writecount(object, 0, size);
-		/*
-		 * Lose the object reference.  Will destroy the
-		 * object if it's an unnamed anonymous mapping
-		 * or named anonymous without other references.
-		 */
-		vm_object_deallocate(object);
 	}
 	return (vm_mmap_to_errno(rv));
 }
 
 /*
  * Convert a Mach VM status code into an errno value.
  *
  * KERN_SUCCESS becomes 0, KERN_INVALID_ADDRESS and KERN_NO_SPACE become
  * ENOMEM, KERN_PROTECTION_FAILURE becomes EACCES, and every other code
  * becomes EINVAL.
  */
 int
 vm_mmap_to_errno(int rv)
 {
 
 	if (rv == KERN_SUCCESS)
 		return (0);
 	if (rv == KERN_INVALID_ADDRESS || rv == KERN_NO_SPACE)
 		return (ENOMEM);
 	if (rv == KERN_PROTECTION_FAILURE)
 		return (EACCES);
 	return (EINVAL);
 }