Index: head/sys/dev/drm/drm_bufs.c =================================================================== --- head/sys/dev/drm/drm_bufs.c (revision 283997) +++ head/sys/dev/drm/drm_bufs.c (revision 283998) @@ -1,1130 +1,1130 @@ /*- * Copyright 1999, 2000 Precision Insight, Inc., Cedar Park, Texas. * Copyright 2000 VA Linux Systems, Inc., Sunnyvale, California. * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the next * paragraph) shall be included in all copies or substantial portions of the * Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * VA LINUX SYSTEMS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. * * Authors: * Rickard E. (Rik) Faith * Gareth Hughes * */ #include __FBSDID("$FreeBSD$"); /** @file drm_bufs.c * Implementation of the ioctls for setup of DRM mappings and DMA buffers. */ #include "dev/pci/pcireg.h" #include "dev/drm/drmP.h" /* Allocation of PCI memory resources (framebuffer, registers, etc.) for * drm_get_resource_*. Note that they are not RF_ACTIVE, so there's no virtual * address for accessing them. Cleaned up at unload. */ static int drm_alloc_resource(struct drm_device *dev, int resource) { struct resource *res; int rid; DRM_SPINLOCK_ASSERT(&dev->dev_lock); if (resource >= DRM_MAX_PCI_RESOURCE) { DRM_ERROR("Resource %d too large\n", resource); return 1; } if (dev->pcir[resource] != NULL) { return 0; } DRM_UNLOCK(); rid = PCIR_BAR(resource); res = bus_alloc_resource_any(dev->device, SYS_RES_MEMORY, &rid, RF_SHAREABLE); DRM_LOCK(); if (res == NULL) { DRM_ERROR("Couldn't find resource 0x%x\n", resource); return 1; } if (dev->pcir[resource] == NULL) { dev->pcirid[resource] = rid; dev->pcir[resource] = res; } return 0; } unsigned long drm_get_resource_start(struct drm_device *dev, unsigned int resource) { if (drm_alloc_resource(dev, resource) != 0) return 0; return rman_get_start(dev->pcir[resource]); } unsigned long drm_get_resource_len(struct drm_device *dev, unsigned int resource) { if (drm_alloc_resource(dev, resource) != 0) return 0; return rman_get_size(dev->pcir[resource]); } int drm_addmap(struct drm_device * dev, unsigned long offset, unsigned long size, enum drm_map_type type, enum drm_map_flags flags, drm_local_map_t **map_ptr) { drm_local_map_t *map; int align; /*drm_agp_mem_t *entry; int valid;*/ /* Only allow shared memory to be removable since we only keep enough * book keeping information about shared memory to allow for removal * when processes fork. */ if ((flags & _DRM_REMOVABLE) && type != _DRM_SHM) { DRM_ERROR("Requested removable map for non-DRM_SHM\n"); return EINVAL; } if ((offset & PAGE_MASK) || (size & PAGE_MASK)) { DRM_ERROR("offset/size not page aligned: 0x%lx/0x%lx\n", offset, size); return EINVAL; } if (offset + size < offset) { DRM_ERROR("offset and size wrap around: 0x%lx/0x%lx\n", offset, size); return EINVAL; } DRM_DEBUG("offset = 0x%08lx, size = 0x%08lx, type = %d\n", offset, size, type); /* Check if this is just another version of a kernel-allocated map, and * just hand that back if so. */ if (type == _DRM_REGISTERS || type == _DRM_FRAME_BUFFER || type == _DRM_SHM) { TAILQ_FOREACH(map, &dev->maplist, link) { if (map->type == type && (map->offset == offset || (map->type == _DRM_SHM && map->flags == _DRM_CONTAINS_LOCK))) { map->size = size; DRM_DEBUG("Found kernel map %d\n", type); goto done; } } } DRM_UNLOCK(); /* Allocate a new map structure, fill it in, and do any type-specific * initialization necessary. */ map = malloc(sizeof(*map), DRM_MEM_MAPS, M_ZERO | M_NOWAIT); if (!map) { DRM_LOCK(); return ENOMEM; } map->offset = offset; map->size = size; map->type = type; map->flags = flags; map->handle = (void *)((unsigned long)alloc_unr(dev->map_unrhdr) << DRM_MAP_HANDLE_SHIFT); switch (map->type) { case _DRM_REGISTERS: map->virtual = drm_ioremap(dev, map); if (!(map->flags & _DRM_WRITE_COMBINING)) break; /* FALLTHROUGH */ case _DRM_FRAME_BUFFER: if (drm_mtrr_add(map->offset, map->size, DRM_MTRR_WC) == 0) map->mtrr = 1; break; case _DRM_SHM: map->virtual = malloc(map->size, DRM_MEM_MAPS, M_NOWAIT); DRM_DEBUG("%lu %d %p\n", map->size, drm_order(map->size), map->virtual); if (!map->virtual) { free(map, DRM_MEM_MAPS); DRM_LOCK(); return ENOMEM; } map->offset = (unsigned long)map->virtual; if (map->flags & _DRM_CONTAINS_LOCK) { /* Prevent a 2nd X Server from creating a 2nd lock */ DRM_LOCK(); if (dev->lock.hw_lock != NULL) { DRM_UNLOCK(); free(map->virtual, DRM_MEM_MAPS); free(map, DRM_MEM_MAPS); return EBUSY; } dev->lock.hw_lock = map->virtual; /* Pointer to lock */ DRM_UNLOCK(); } break; case _DRM_AGP: /*valid = 0;*/ /* In some cases (i810 driver), user space may have already * added the AGP base itself, because dev->agp->base previously * only got set during AGP enable. So, only add the base * address if the map's offset isn't already within the * aperture. */ if (map->offset < dev->agp->base || map->offset > dev->agp->base + dev->agp->info.ai_aperture_size - 1) { map->offset += dev->agp->base; } map->mtrr = dev->agp->mtrr; /* for getmap */ /*for (entry = dev->agp->memory; entry; entry = entry->next) { if ((map->offset >= entry->bound) && (map->offset + map->size <= entry->bound + entry->pages * PAGE_SIZE)) { valid = 1; break; } } if (!valid) { free(map, DRM_MEM_MAPS); DRM_LOCK(); return EACCES; }*/ break; case _DRM_SCATTER_GATHER: if (!dev->sg) { free(map, DRM_MEM_MAPS); DRM_LOCK(); return EINVAL; } map->virtual = (void *)(dev->sg->vaddr + offset); map->offset = dev->sg->vaddr + offset; break; case _DRM_CONSISTENT: /* Unfortunately, we don't get any alignment specification from * the caller, so we have to guess. drm_pci_alloc requires * a power-of-two alignment, so try to align the bus address of * the map to it size if possible, otherwise just assume * PAGE_SIZE alignment. */ align = map->size; if ((align & (align - 1)) != 0) align = PAGE_SIZE; map->dmah = drm_pci_alloc(dev, map->size, align, 0xfffffffful); if (map->dmah == NULL) { free(map, DRM_MEM_MAPS); DRM_LOCK(); return ENOMEM; } map->virtual = map->dmah->vaddr; map->offset = map->dmah->busaddr; break; default: DRM_ERROR("Bad map type %d\n", map->type); free(map, DRM_MEM_MAPS); DRM_LOCK(); return EINVAL; } DRM_LOCK(); TAILQ_INSERT_TAIL(&dev->maplist, map, link); done: /* Jumped to, with lock held, when a kernel map is found. */ DRM_DEBUG("Added map %d 0x%lx/0x%lx\n", map->type, map->offset, map->size); *map_ptr = map; return 0; } int drm_addmap_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_map *request = data; drm_local_map_t *map; int err; if (!(dev->flags & (FREAD|FWRITE))) return EACCES; /* Require read/write */ if (!DRM_SUSER(DRM_CURPROC) && request->type != _DRM_AGP) return EACCES; DRM_LOCK(); err = drm_addmap(dev, request->offset, request->size, request->type, request->flags, &map); DRM_UNLOCK(); if (err != 0) return err; request->offset = map->offset; request->size = map->size; request->type = map->type; request->flags = map->flags; request->mtrr = map->mtrr; request->handle = (void *)map->handle; return 0; } void drm_rmmap(struct drm_device *dev, drm_local_map_t *map) { DRM_SPINLOCK_ASSERT(&dev->dev_lock); if (map == NULL) return; TAILQ_REMOVE(&dev->maplist, map, link); switch (map->type) { case _DRM_REGISTERS: if (map->bsr == NULL) drm_ioremapfree(map); /* FALLTHROUGH */ case _DRM_FRAME_BUFFER: if (map->mtrr) { int __unused retcode; retcode = drm_mtrr_del(0, map->offset, map->size, DRM_MTRR_WC); DRM_DEBUG("mtrr_del = %d\n", retcode); } break; case _DRM_SHM: free(map->virtual, DRM_MEM_MAPS); break; case _DRM_AGP: case _DRM_SCATTER_GATHER: break; case _DRM_CONSISTENT: drm_pci_free(dev, map->dmah); break; default: DRM_ERROR("Bad map type %d\n", map->type); break; } if (map->bsr != NULL) { bus_release_resource(dev->device, SYS_RES_MEMORY, map->rid, map->bsr); } DRM_UNLOCK(); if (map->handle) free_unr(dev->map_unrhdr, (unsigned long)map->handle >> DRM_MAP_HANDLE_SHIFT); DRM_LOCK(); free(map, DRM_MEM_MAPS); } /* Remove a map private from list and deallocate resources if the mapping * isn't in use. */ int drm_rmmap_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { drm_local_map_t *map; struct drm_map *request = data; DRM_LOCK(); TAILQ_FOREACH(map, &dev->maplist, link) { if (map->handle == request->handle && map->flags & _DRM_REMOVABLE) break; } /* No match found. */ if (map == NULL) { DRM_UNLOCK(); return EINVAL; } drm_rmmap(dev, map); DRM_UNLOCK(); return 0; } static void drm_cleanup_buf_error(struct drm_device *dev, drm_buf_entry_t *entry) { int i; if (entry->seg_count) { for (i = 0; i < entry->seg_count; i++) { drm_pci_free(dev, entry->seglist[i]); } free(entry->seglist, DRM_MEM_SEGS); entry->seg_count = 0; } if (entry->buf_count) { for (i = 0; i < entry->buf_count; i++) { free(entry->buflist[i].dev_private, DRM_MEM_BUFS); } free(entry->buflist, DRM_MEM_BUFS); entry->buf_count = 0; } } static int drm_do_addbufs_agp(struct drm_device *dev, struct drm_buf_desc *request) { drm_device_dma_t *dma = dev->dma; drm_buf_entry_t *entry; /*drm_agp_mem_t *agp_entry; int valid*/ drm_buf_t *buf; unsigned long offset; unsigned long agp_offset; int count; int order; int size; int alignment; int page_order; int total; int byte_count; int i; drm_buf_t **temp_buflist; count = request->count; order = drm_order(request->size); size = 1 << order; alignment = (request->flags & _DRM_PAGE_ALIGN) ? round_page(size) : size; page_order = order - PAGE_SHIFT > 0 ? order - PAGE_SHIFT : 0; total = PAGE_SIZE << page_order; byte_count = 0; agp_offset = dev->agp->base + request->agp_start; DRM_DEBUG("count: %d\n", count); DRM_DEBUG("order: %d\n", order); DRM_DEBUG("size: %d\n", size); DRM_DEBUG("agp_offset: 0x%lx\n", agp_offset); DRM_DEBUG("alignment: %d\n", alignment); DRM_DEBUG("page_order: %d\n", page_order); DRM_DEBUG("total: %d\n", total); /* Make sure buffers are located in AGP memory that we own */ /* Breaks MGA due to drm_alloc_agp not setting up entries for the * memory. Safe to ignore for now because these ioctls are still * root-only. */ /*valid = 0; for (agp_entry = dev->agp->memory; agp_entry; agp_entry = agp_entry->next) { if ((agp_offset >= agp_entry->bound) && (agp_offset + total * count <= agp_entry->bound + agp_entry->pages * PAGE_SIZE)) { valid = 1; break; } } if (!valid) { DRM_DEBUG("zone invalid\n"); return EINVAL; }*/ entry = &dma->bufs[order]; entry->buflist = malloc(count * sizeof(*entry->buflist), DRM_MEM_BUFS, M_NOWAIT | M_ZERO); if (!entry->buflist) { return ENOMEM; } entry->buf_size = size; entry->page_order = page_order; offset = 0; while (entry->buf_count < count) { buf = &entry->buflist[entry->buf_count]; buf->idx = dma->buf_count + entry->buf_count; buf->total = alignment; buf->order = order; buf->used = 0; buf->offset = (dma->byte_count + offset); buf->bus_address = agp_offset + offset; buf->address = (void *)(agp_offset + offset); buf->next = NULL; buf->pending = 0; buf->file_priv = NULL; buf->dev_priv_size = dev->driver->buf_priv_size; buf->dev_private = malloc(buf->dev_priv_size, DRM_MEM_BUFS, M_NOWAIT | M_ZERO); if (buf->dev_private == NULL) { /* Set count correctly so we free the proper amount. */ entry->buf_count = count; drm_cleanup_buf_error(dev, entry); return ENOMEM; } offset += alignment; entry->buf_count++; byte_count += PAGE_SIZE << page_order; } DRM_DEBUG("byte_count: %d\n", byte_count); temp_buflist = realloc(dma->buflist, (dma->buf_count + entry->buf_count) * sizeof(*dma->buflist), DRM_MEM_BUFS, M_NOWAIT); if (temp_buflist == NULL) { /* Free the entry because it isn't valid */ drm_cleanup_buf_error(dev, entry); return ENOMEM; } dma->buflist = temp_buflist; for (i = 0; i < entry->buf_count; i++) { dma->buflist[i + dma->buf_count] = &entry->buflist[i]; } dma->buf_count += entry->buf_count; dma->byte_count += byte_count; DRM_DEBUG("dma->buf_count : %d\n", dma->buf_count); DRM_DEBUG("entry->buf_count : %d\n", entry->buf_count); request->count = entry->buf_count; request->size = size; dma->flags = _DRM_DMA_USE_AGP; return 0; } static int drm_do_addbufs_pci(struct drm_device *dev, struct drm_buf_desc *request) { drm_device_dma_t *dma = dev->dma; int count; int order; int size; int total; int page_order; drm_buf_entry_t *entry; drm_buf_t *buf; int alignment; unsigned long offset; int i; int byte_count; int page_count; unsigned long *temp_pagelist; drm_buf_t **temp_buflist; count = request->count; order = drm_order(request->size); size = 1 << order; DRM_DEBUG("count=%d, size=%d (%d), order=%d\n", request->count, request->size, size, order); alignment = (request->flags & _DRM_PAGE_ALIGN) ? round_page(size) : size; page_order = order - PAGE_SHIFT > 0 ? order - PAGE_SHIFT : 0; total = PAGE_SIZE << page_order; entry = &dma->bufs[order]; entry->buflist = malloc(count * sizeof(*entry->buflist), DRM_MEM_BUFS, M_NOWAIT | M_ZERO); entry->seglist = malloc(count * sizeof(*entry->seglist), DRM_MEM_SEGS, M_NOWAIT | M_ZERO); /* Keep the original pagelist until we know all the allocations * have succeeded */ temp_pagelist = malloc((dma->page_count + (count << page_order)) * sizeof(*dma->pagelist), DRM_MEM_PAGES, M_NOWAIT); if (entry->buflist == NULL || entry->seglist == NULL || temp_pagelist == NULL) { free(temp_pagelist, DRM_MEM_PAGES); free(entry->seglist, DRM_MEM_SEGS); free(entry->buflist, DRM_MEM_BUFS); return ENOMEM; } memcpy(temp_pagelist, dma->pagelist, dma->page_count * sizeof(*dma->pagelist)); DRM_DEBUG("pagelist: %d entries\n", dma->page_count + (count << page_order)); entry->buf_size = size; entry->page_order = page_order; byte_count = 0; page_count = 0; while (entry->buf_count < count) { DRM_SPINUNLOCK(&dev->dma_lock); drm_dma_handle_t *dmah = drm_pci_alloc(dev, size, alignment, 0xfffffffful); DRM_SPINLOCK(&dev->dma_lock); if (dmah == NULL) { /* Set count correctly so we free the proper amount. */ entry->buf_count = count; entry->seg_count = count; drm_cleanup_buf_error(dev, entry); free(temp_pagelist, DRM_MEM_PAGES); return ENOMEM; } entry->seglist[entry->seg_count++] = dmah; for (i = 0; i < (1 << page_order); i++) { DRM_DEBUG("page %d @ %p\n", dma->page_count + page_count, (char *)dmah->vaddr + PAGE_SIZE * i); temp_pagelist[dma->page_count + page_count++] = (long)dmah->vaddr + PAGE_SIZE * i; } for (offset = 0; offset + size <= total && entry->buf_count < count; offset += alignment, ++entry->buf_count) { buf = &entry->buflist[entry->buf_count]; buf->idx = dma->buf_count + entry->buf_count; buf->total = alignment; buf->order = order; buf->used = 0; buf->offset = (dma->byte_count + byte_count + offset); buf->address = ((char *)dmah->vaddr + offset); buf->bus_address = dmah->busaddr + offset; buf->next = NULL; buf->pending = 0; buf->file_priv = NULL; buf->dev_priv_size = dev->driver->buf_priv_size; buf->dev_private = malloc(buf->dev_priv_size, DRM_MEM_BUFS, M_NOWAIT | M_ZERO); if (buf->dev_private == NULL) { /* Set count correctly so we free the proper amount. */ entry->buf_count = count; entry->seg_count = count; drm_cleanup_buf_error(dev, entry); free(temp_pagelist, DRM_MEM_PAGES); return ENOMEM; } DRM_DEBUG("buffer %d @ %p\n", entry->buf_count, buf->address); } byte_count += PAGE_SIZE << page_order; } temp_buflist = realloc(dma->buflist, (dma->buf_count + entry->buf_count) * sizeof(*dma->buflist), DRM_MEM_BUFS, M_NOWAIT); if (temp_buflist == NULL) { /* Free the entry because it isn't valid */ drm_cleanup_buf_error(dev, entry); free(temp_pagelist, DRM_MEM_PAGES); return ENOMEM; } dma->buflist = temp_buflist; for (i = 0; i < entry->buf_count; i++) { dma->buflist[i + dma->buf_count] = &entry->buflist[i]; } /* No allocations failed, so now we can replace the orginal pagelist * with the new one. */ free(dma->pagelist, DRM_MEM_PAGES); dma->pagelist = temp_pagelist; dma->buf_count += entry->buf_count; dma->seg_count += entry->seg_count; dma->page_count += entry->seg_count << page_order; dma->byte_count += PAGE_SIZE * (entry->seg_count << page_order); request->count = entry->buf_count; request->size = size; return 0; } static int drm_do_addbufs_sg(struct drm_device *dev, struct drm_buf_desc *request) { drm_device_dma_t *dma = dev->dma; drm_buf_entry_t *entry; drm_buf_t *buf; unsigned long offset; unsigned long agp_offset; int count; int order; int size; int alignment; int page_order; int total; int byte_count; int i; drm_buf_t **temp_buflist; count = request->count; order = drm_order(request->size); size = 1 << order; alignment = (request->flags & _DRM_PAGE_ALIGN) ? round_page(size) : size; page_order = order - PAGE_SHIFT > 0 ? order - PAGE_SHIFT : 0; total = PAGE_SIZE << page_order; byte_count = 0; agp_offset = request->agp_start; DRM_DEBUG("count: %d\n", count); DRM_DEBUG("order: %d\n", order); DRM_DEBUG("size: %d\n", size); DRM_DEBUG("agp_offset: %ld\n", agp_offset); DRM_DEBUG("alignment: %d\n", alignment); DRM_DEBUG("page_order: %d\n", page_order); DRM_DEBUG("total: %d\n", total); entry = &dma->bufs[order]; entry->buflist = malloc(count * sizeof(*entry->buflist), DRM_MEM_BUFS, M_NOWAIT | M_ZERO); if (entry->buflist == NULL) return ENOMEM; entry->buf_size = size; entry->page_order = page_order; offset = 0; while (entry->buf_count < count) { buf = &entry->buflist[entry->buf_count]; buf->idx = dma->buf_count + entry->buf_count; buf->total = alignment; buf->order = order; buf->used = 0; buf->offset = (dma->byte_count + offset); buf->bus_address = agp_offset + offset; buf->address = (void *)(agp_offset + offset + dev->sg->vaddr); buf->next = NULL; buf->pending = 0; buf->file_priv = NULL; buf->dev_priv_size = dev->driver->buf_priv_size; buf->dev_private = malloc(buf->dev_priv_size, DRM_MEM_BUFS, M_NOWAIT | M_ZERO); if (buf->dev_private == NULL) { /* Set count correctly so we free the proper amount. */ entry->buf_count = count; drm_cleanup_buf_error(dev, entry); return ENOMEM; } DRM_DEBUG("buffer %d @ %p\n", entry->buf_count, buf->address); offset += alignment; entry->buf_count++; byte_count += PAGE_SIZE << page_order; } DRM_DEBUG("byte_count: %d\n", byte_count); temp_buflist = realloc(dma->buflist, (dma->buf_count + entry->buf_count) * sizeof(*dma->buflist), DRM_MEM_BUFS, M_NOWAIT); if (temp_buflist == NULL) { /* Free the entry because it isn't valid */ drm_cleanup_buf_error(dev, entry); return ENOMEM; } dma->buflist = temp_buflist; for (i = 0; i < entry->buf_count; i++) { dma->buflist[i + dma->buf_count] = &entry->buflist[i]; } dma->buf_count += entry->buf_count; dma->byte_count += byte_count; DRM_DEBUG("dma->buf_count : %d\n", dma->buf_count); DRM_DEBUG("entry->buf_count : %d\n", entry->buf_count); request->count = entry->buf_count; request->size = size; dma->flags = _DRM_DMA_USE_SG; return 0; } int drm_addbufs_agp(struct drm_device *dev, struct drm_buf_desc *request) { int order, ret; if (request->count < 0 || request->count > 4096) return EINVAL; order = drm_order(request->size); if (order < DRM_MIN_ORDER || order > DRM_MAX_ORDER) return EINVAL; DRM_SPINLOCK(&dev->dma_lock); /* No more allocations after first buffer-using ioctl. */ if (dev->buf_use != 0) { DRM_SPINUNLOCK(&dev->dma_lock); return EBUSY; } /* No more than one allocation per order */ if (dev->dma->bufs[order].buf_count != 0) { DRM_SPINUNLOCK(&dev->dma_lock); return ENOMEM; } ret = drm_do_addbufs_agp(dev, request); DRM_SPINUNLOCK(&dev->dma_lock); return ret; } int drm_addbufs_sg(struct drm_device *dev, struct drm_buf_desc *request) { int order, ret; if (!DRM_SUSER(DRM_CURPROC)) return EACCES; if (request->count < 0 || request->count > 4096) return EINVAL; order = drm_order(request->size); if (order < DRM_MIN_ORDER || order > DRM_MAX_ORDER) return EINVAL; DRM_SPINLOCK(&dev->dma_lock); /* No more allocations after first buffer-using ioctl. */ if (dev->buf_use != 0) { DRM_SPINUNLOCK(&dev->dma_lock); return EBUSY; } /* No more than one allocation per order */ if (dev->dma->bufs[order].buf_count != 0) { DRM_SPINUNLOCK(&dev->dma_lock); return ENOMEM; } ret = drm_do_addbufs_sg(dev, request); DRM_SPINUNLOCK(&dev->dma_lock); return ret; } int drm_addbufs_pci(struct drm_device *dev, struct drm_buf_desc *request) { int order, ret; if (!DRM_SUSER(DRM_CURPROC)) return EACCES; if (request->count < 0 || request->count > 4096) return EINVAL; order = drm_order(request->size); if (order < DRM_MIN_ORDER || order > DRM_MAX_ORDER) return EINVAL; DRM_SPINLOCK(&dev->dma_lock); /* No more allocations after first buffer-using ioctl. */ if (dev->buf_use != 0) { DRM_SPINUNLOCK(&dev->dma_lock); return EBUSY; } /* No more than one allocation per order */ if (dev->dma->bufs[order].buf_count != 0) { DRM_SPINUNLOCK(&dev->dma_lock); return ENOMEM; } ret = drm_do_addbufs_pci(dev, request); DRM_SPINUNLOCK(&dev->dma_lock); return ret; } int drm_addbufs(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_buf_desc *request = data; int err; if (request->flags & _DRM_AGP_BUFFER) err = drm_addbufs_agp(dev, request); else if (request->flags & _DRM_SG_BUFFER) err = drm_addbufs_sg(dev, request); else err = drm_addbufs_pci(dev, request); return err; } int drm_infobufs(struct drm_device *dev, void *data, struct drm_file *file_priv) { drm_device_dma_t *dma = dev->dma; struct drm_buf_info *request = data; int i; int count; int retcode = 0; DRM_SPINLOCK(&dev->dma_lock); ++dev->buf_use; /* Can't allocate more after this call */ DRM_SPINUNLOCK(&dev->dma_lock); for (i = 0, count = 0; i < DRM_MAX_ORDER + 1; i++) { if (dma->bufs[i].buf_count) ++count; } DRM_DEBUG("count = %d\n", count); if (request->count >= count) { for (i = 0, count = 0; i < DRM_MAX_ORDER + 1; i++) { if (dma->bufs[i].buf_count) { struct drm_buf_desc from; from.count = dma->bufs[i].buf_count; from.size = dma->bufs[i].buf_size; from.low_mark = dma->bufs[i].freelist.low_mark; from.high_mark = dma->bufs[i].freelist.high_mark; if (DRM_COPY_TO_USER(&request->list[count], &from, sizeof(struct drm_buf_desc)) != 0) { retcode = EFAULT; break; } DRM_DEBUG("%d %d %d %d %d\n", i, dma->bufs[i].buf_count, dma->bufs[i].buf_size, dma->bufs[i].freelist.low_mark, dma->bufs[i].freelist.high_mark); ++count; } } } request->count = count; return retcode; } int drm_markbufs(struct drm_device *dev, void *data, struct drm_file *file_priv) { drm_device_dma_t *dma = dev->dma; struct drm_buf_desc *request = data; int order; DRM_DEBUG("%d, %d, %d\n", request->size, request->low_mark, request->high_mark); order = drm_order(request->size); if (order < DRM_MIN_ORDER || order > DRM_MAX_ORDER || request->low_mark < 0 || request->high_mark < 0) { return EINVAL; } DRM_SPINLOCK(&dev->dma_lock); if (request->low_mark > dma->bufs[order].buf_count || request->high_mark > dma->bufs[order].buf_count) { DRM_SPINUNLOCK(&dev->dma_lock); return EINVAL; } dma->bufs[order].freelist.low_mark = request->low_mark; dma->bufs[order].freelist.high_mark = request->high_mark; DRM_SPINUNLOCK(&dev->dma_lock); return 0; } int drm_freebufs(struct drm_device *dev, void *data, struct drm_file *file_priv) { drm_device_dma_t *dma = dev->dma; struct drm_buf_free *request = data; int i; int idx; drm_buf_t *buf; int retcode = 0; DRM_DEBUG("%d\n", request->count); DRM_SPINLOCK(&dev->dma_lock); for (i = 0; i < request->count; i++) { if (DRM_COPY_FROM_USER(&idx, &request->list[i], sizeof(idx))) { retcode = EFAULT; break; } if (idx < 0 || idx >= dma->buf_count) { DRM_ERROR("Index %d (of %d max)\n", idx, dma->buf_count - 1); retcode = EINVAL; break; } buf = dma->buflist[idx]; if (buf->file_priv != file_priv) { DRM_ERROR("Process %d freeing buffer not owned\n", DRM_CURRENTPID); retcode = EINVAL; break; } drm_free_buffer(dev, buf); } DRM_SPINUNLOCK(&dev->dma_lock); return retcode; } int drm_mapbufs(struct drm_device *dev, void *data, struct drm_file *file_priv) { drm_device_dma_t *dma = dev->dma; int retcode = 0; const int zero = 0; vm_offset_t address; struct vmspace *vms; vm_ooffset_t foff; vm_size_t size; vm_offset_t vaddr; struct drm_buf_map *request = data; int i; vms = DRM_CURPROC->td_proc->p_vmspace; DRM_SPINLOCK(&dev->dma_lock); dev->buf_use++; /* Can't allocate more after this call */ DRM_SPINUNLOCK(&dev->dma_lock); if (request->count < dma->buf_count) goto done; if ((drm_core_has_AGP(dev) && (dma->flags & _DRM_DMA_USE_AGP)) || (drm_core_check_feature(dev, DRIVER_SG) && (dma->flags & _DRM_DMA_USE_SG))) { drm_local_map_t *map = dev->agp_buffer_map; if (map == NULL) { retcode = EINVAL; goto done; } size = round_page(map->size); foff = (unsigned long)map->handle; } else { size = round_page(dma->byte_count), foff = 0; } vaddr = round_page((vm_offset_t)vms->vm_daddr + MAXDSIZ); #if __FreeBSD_version >= 600023 - retcode = vm_mmap(&vms->vm_map, &vaddr, size, PROT_READ | PROT_WRITE, - VM_PROT_ALL, MAP_SHARED | MAP_NOSYNC, OBJT_DEVICE, + retcode = vm_mmap(&vms->vm_map, &vaddr, size, VM_PROT_READ | + VM_PROT_WRITE, VM_PROT_ALL, MAP_SHARED | MAP_NOSYNC, OBJT_DEVICE, dev->devnode, foff); #else - retcode = vm_mmap(&vms->vm_map, &vaddr, size, PROT_READ | PROT_WRITE, - VM_PROT_ALL, MAP_SHARED | MAP_NOSYNC, + retcode = vm_mmap(&vms->vm_map, &vaddr, size, VM_PROT_READ | + VM_PROT_WRITE, VM_PROT_ALL, MAP_SHARED | MAP_NOSYNC, SLIST_FIRST(&dev->devnode->si_hlist), foff); #endif if (retcode) goto done; request->virtual = (void *)vaddr; for (i = 0; i < dma->buf_count; i++) { if (DRM_COPY_TO_USER(&request->list[i].idx, &dma->buflist[i]->idx, sizeof(request->list[0].idx))) { retcode = EFAULT; goto done; } if (DRM_COPY_TO_USER(&request->list[i].total, &dma->buflist[i]->total, sizeof(request->list[0].total))) { retcode = EFAULT; goto done; } if (DRM_COPY_TO_USER(&request->list[i].used, &zero, sizeof(zero))) { retcode = EFAULT; goto done; } address = vaddr + dma->buflist[i]->offset; /* *** */ if (DRM_COPY_TO_USER(&request->list[i].address, &address, sizeof(address))) { retcode = EFAULT; goto done; } } done: request->count = dma->buf_count; DRM_DEBUG("%d buffers, retcode = %d\n", request->count, retcode); return retcode; } /* * Compute order. Can be made faster. */ int drm_order(unsigned long size) { int order; if (size == 0) return 0; order = flsl(size) - 1; if (size & ~(1ul << order)) ++order; return order; } Index: head/sys/dev/drm2/drm_bufs.c =================================================================== --- head/sys/dev/drm2/drm_bufs.c (revision 283997) +++ head/sys/dev/drm2/drm_bufs.c (revision 283998) @@ -1,1708 +1,1708 @@ /** * \file drm_bufs.c * Generic buffer template * * \author Rickard E. (Rik) Faith * \author Gareth Hughes */ /* * Created: Thu Nov 23 03:10:50 2000 by gareth@valinux.com * * Copyright 1999, 2000 Precision Insight, Inc., Cedar Park, Texas. * Copyright 2000 VA Linux Systems, Inc., Sunnyvale, California. * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the next * paragraph) shall be included in all copies or substantial portions of the * Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * VA LINUX SYSTEMS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include /* Allocation of PCI memory resources (framebuffer, registers, etc.) for * drm_get_resource_*. Note that they are not RF_ACTIVE, so there's no virtual * address for accessing them. Cleaned up at unload. */ static int drm_alloc_resource(struct drm_device *dev, int resource) { struct resource *res; int rid; if (resource >= DRM_MAX_PCI_RESOURCE) { DRM_ERROR("Resource %d too large\n", resource); return 1; } if (dev->pcir[resource] != NULL) { return 0; } rid = PCIR_BAR(resource); res = bus_alloc_resource_any(dev->dev, SYS_RES_MEMORY, &rid, RF_SHAREABLE); if (res == NULL) { DRM_ERROR("Couldn't find resource 0x%x\n", resource); return 1; } if (dev->pcir[resource] == NULL) { dev->pcirid[resource] = rid; dev->pcir[resource] = res; } return 0; } unsigned long drm_get_resource_start(struct drm_device *dev, unsigned int resource) { unsigned long start; mtx_lock(&dev->pcir_lock); if (drm_alloc_resource(dev, resource) != 0) return 0; start = rman_get_start(dev->pcir[resource]); mtx_unlock(&dev->pcir_lock); return (start); } unsigned long drm_get_resource_len(struct drm_device *dev, unsigned int resource) { unsigned long len; mtx_lock(&dev->pcir_lock); if (drm_alloc_resource(dev, resource) != 0) return 0; len = rman_get_size(dev->pcir[resource]); mtx_unlock(&dev->pcir_lock); return (len); } static struct drm_map_list *drm_find_matching_map(struct drm_device *dev, struct drm_local_map *map) { struct drm_map_list *entry; list_for_each_entry(entry, &dev->maplist, head) { /* * Because the kernel-userspace ABI is fixed at a 32-bit offset * while PCI resources may live above that, we only compare the * lower 32 bits of the map offset for maps of type * _DRM_FRAMEBUFFER or _DRM_REGISTERS. * It is assumed that if a driver have more than one resource * of each type, the lower 32 bits are different. */ if (!entry->map || map->type != entry->map->type || entry->master != dev->primary->master) continue; switch (map->type) { case _DRM_SHM: if (map->flags != _DRM_CONTAINS_LOCK) break; return entry; case _DRM_REGISTERS: case _DRM_FRAME_BUFFER: if ((entry->map->offset & 0xffffffff) == (map->offset & 0xffffffff)) return entry; default: /* Make gcc happy */ ; } if (entry->map->offset == map->offset) return entry; } return NULL; } static int drm_map_handle(struct drm_device *dev, struct drm_hash_item *hash, unsigned long user_token, int hashed_handle, int shm) { int use_hashed_handle, shift; unsigned long add; #if (BITS_PER_LONG == 64) use_hashed_handle = ((user_token & 0xFFFFFFFF00000000UL) || hashed_handle); #elif (BITS_PER_LONG == 32) use_hashed_handle = hashed_handle; #else #error Unsupported long size. Neither 64 nor 32 bits. #endif if (!use_hashed_handle) { int ret; hash->key = user_token >> PAGE_SHIFT; ret = drm_ht_insert_item(&dev->map_hash, hash); if (ret != -EINVAL) return ret; } shift = 0; add = DRM_MAP_HASH_OFFSET >> PAGE_SHIFT; if (shm && (SHMLBA > PAGE_SIZE)) { int bits = ilog2(SHMLBA >> PAGE_SHIFT) + 1; /* For shared memory, we have to preserve the SHMLBA * bits of the eventual vma->vm_pgoff value during * mmap(). Otherwise we run into cache aliasing problems * on some platforms. On these platforms, the pgoff of * a mmap() request is used to pick a suitable virtual * address for the mmap() region such that it will not * cause cache aliasing problems. * * Therefore, make sure the SHMLBA relevant bits of the * hash value we use are equal to those in the original * kernel virtual address. */ shift = bits; add |= ((user_token >> PAGE_SHIFT) & ((1UL << bits) - 1UL)); } return drm_ht_just_insert_please(&dev->map_hash, hash, user_token, 32 - PAGE_SHIFT - 3, shift, add); } /** * Core function to create a range of memory available for mapping by a * non-root process. * * Adjusts the memory offset to its absolute value according to the mapping * type. Adds the map to the map list drm_device::maplist. Adds MTRR's where * applicable and if supported by the kernel. */ static int drm_addmap_core(struct drm_device * dev, resource_size_t offset, unsigned int size, enum drm_map_type type, enum drm_map_flags flags, struct drm_map_list ** maplist) { struct drm_local_map *map; struct drm_map_list *list; drm_dma_handle_t *dmah; unsigned long user_token; int ret; int align; map = malloc(sizeof(*map), DRM_MEM_MAPS, M_NOWAIT); if (!map) return -ENOMEM; map->offset = offset; map->size = size; map->flags = flags; map->type = type; /* Only allow shared memory to be removable since we only keep enough * book keeping information about shared memory to allow for removal * when processes fork. */ if ((map->flags & _DRM_REMOVABLE) && map->type != _DRM_SHM) { free(map, DRM_MEM_MAPS); return -EINVAL; } DRM_DEBUG("offset = 0x%08llx, size = 0x%08lx, type = %d\n", (unsigned long long)map->offset, map->size, map->type); /* page-align _DRM_SHM maps. They are allocated here so there is no security * hole created by that and it works around various broken drivers that use * a non-aligned quantity to map the SAREA. --BenH */ if (map->type == _DRM_SHM) map->size = PAGE_ALIGN(map->size); /* * FreeBSD port note: FreeBSD's PAGE_MASK is the inverse of * Linux's one. That's why the test below doesn't inverse the * constant. */ if ((map->offset & ((resource_size_t)PAGE_MASK)) || (map->size & (PAGE_MASK))) { free(map, DRM_MEM_MAPS); return -EINVAL; } map->mtrr = -1; map->handle = NULL; switch (map->type) { case _DRM_REGISTERS: case _DRM_FRAME_BUFFER: #ifdef __linux__ #if !defined(__sparc__) && !defined(__alpha__) && !defined(__ia64__) && !defined(__powerpc64__) && !defined(__x86_64__) && !defined(__arm__) if (map->offset + (map->size-1) < map->offset || map->offset < virt_to_phys(high_memory)) { kfree(map); return -EINVAL; } #endif #endif /* Some drivers preinitialize some maps, without the X Server * needing to be aware of it. Therefore, we just return success * when the server tries to create a duplicate map. */ list = drm_find_matching_map(dev, map); if (list != NULL) { if (list->map->size != map->size) { DRM_DEBUG("Matching maps of type %d with " "mismatched sizes, (%ld vs %ld)\n", map->type, map->size, list->map->size); list->map->size = map->size; } free(map, DRM_MEM_MAPS); *maplist = list; return 0; } if (drm_core_has_MTRR(dev)) { if (map->type == _DRM_FRAME_BUFFER || (map->flags & _DRM_WRITE_COMBINING)) { if (drm_mtrr_add( map->offset, map->size, DRM_MTRR_WC) == 0) map->mtrr = 1; } } if (map->type == _DRM_REGISTERS) { drm_core_ioremap(map, dev); if (!map->handle) { free(map, DRM_MEM_MAPS); return -ENOMEM; } } break; case _DRM_SHM: list = drm_find_matching_map(dev, map); if (list != NULL) { if(list->map->size != map->size) { DRM_DEBUG("Matching maps of type %d with " "mismatched sizes, (%ld vs %ld)\n", map->type, map->size, list->map->size); list->map->size = map->size; } free(map, DRM_MEM_MAPS); *maplist = list; return 0; } map->handle = malloc(map->size, DRM_MEM_MAPS, M_NOWAIT); DRM_DEBUG("%lu %d %p\n", map->size, drm_order(map->size), map->handle); if (!map->handle) { free(map, DRM_MEM_MAPS); return -ENOMEM; } map->offset = (unsigned long)map->handle; if (map->flags & _DRM_CONTAINS_LOCK) { /* Prevent a 2nd X Server from creating a 2nd lock */ if (dev->primary->master->lock.hw_lock != NULL) { free(map->handle, DRM_MEM_MAPS); free(map, DRM_MEM_MAPS); return -EBUSY; } dev->sigdata.lock = dev->primary->master->lock.hw_lock = map->handle; /* Pointer to lock */ } break; case _DRM_AGP: { struct drm_agp_mem *entry; int valid = 0; if (!drm_core_has_AGP(dev)) { free(map, DRM_MEM_MAPS); return -EINVAL; } #ifdef __linux__ #ifdef __alpha__ map->offset += dev->hose->mem_space->start; #endif #endif /* In some cases (i810 driver), user space may have already * added the AGP base itself, because dev->agp->base previously * only got set during AGP enable. So, only add the base * address if the map's offset isn't already within the * aperture. */ if (map->offset < dev->agp->base || map->offset > dev->agp->base + dev->agp->agp_info.ai_aperture_size * 1024 * 1024 - 1) { map->offset += dev->agp->base; } map->mtrr = dev->agp->agp_mtrr; /* for getmap */ /* This assumes the DRM is in total control of AGP space. * It's not always the case as AGP can be in the control * of user space (i.e. i810 driver). So this loop will get * skipped and we double check that dev->agp->memory is * actually set as well as being invalid before EPERM'ing */ list_for_each_entry(entry, &dev->agp->memory, head) { if ((map->offset >= entry->bound) && (map->offset + map->size <= entry->bound + entry->pages * PAGE_SIZE)) { valid = 1; break; } } if (!list_empty(&dev->agp->memory) && !valid) { free(map, DRM_MEM_MAPS); return -EPERM; } DRM_DEBUG("AGP offset = 0x%08llx, size = 0x%08lx\n", (unsigned long long)map->offset, map->size); break; } case _DRM_GEM: DRM_ERROR("tried to addmap GEM object\n"); break; case _DRM_SCATTER_GATHER: if (!dev->sg) { free(map, DRM_MEM_MAPS); return -EINVAL; } map->handle = (void *)(dev->sg->vaddr + offset); map->offset += dev->sg->vaddr; break; case _DRM_CONSISTENT: /* dma_addr_t is 64bit on i386 with CONFIG_HIGHMEM64G, * As we're limiting the address to 2^32-1 (or less), * casting it down to 32 bits is no problem, but we * need to point to a 64bit variable first. */ align = map->size; if ((align & (align - 1)) != 0) align = PAGE_SIZE; dmah = drm_pci_alloc(dev, map->size, align, BUS_SPACE_MAXADDR); if (!dmah) { free(map, DRM_MEM_MAPS); return -ENOMEM; } map->handle = dmah->vaddr; map->offset = dmah->busaddr; map->dmah = dmah; break; default: free(map, DRM_MEM_MAPS); return -EINVAL; } list = malloc(sizeof(*list), DRM_MEM_MAPS, M_ZERO | M_NOWAIT); if (!list) { if (map->type == _DRM_REGISTERS) drm_core_ioremapfree(map, dev); free(map, DRM_MEM_MAPS); return -EINVAL; } list->map = map; DRM_LOCK(dev); list_add(&list->head, &dev->maplist); /* Assign a 32-bit handle */ /* We do it here so that dev->struct_mutex protects the increment */ user_token = (map->type == _DRM_SHM) ? (unsigned long)map->handle : map->offset; ret = drm_map_handle(dev, &list->hash, user_token, 0, (map->type == _DRM_SHM)); if (ret) { if (map->type == _DRM_REGISTERS) drm_core_ioremapfree(map, dev); free(map, DRM_MEM_MAPS); free(list, DRM_MEM_MAPS); DRM_UNLOCK(dev); return ret; } list->user_token = list->hash.key << PAGE_SHIFT; DRM_UNLOCK(dev); if (!(map->flags & _DRM_DRIVER)) list->master = dev->primary->master; *maplist = list; return 0; } int drm_addmap(struct drm_device * dev, resource_size_t offset, unsigned int size, enum drm_map_type type, enum drm_map_flags flags, struct drm_local_map ** map_ptr) { struct drm_map_list *list; int rc; rc = drm_addmap_core(dev, offset, size, type, flags, &list); if (!rc) *map_ptr = list->map; return rc; } EXPORT_SYMBOL(drm_addmap); /** * Ioctl to specify a range of memory that is available for mapping by a * non-root process. * * \param inode device inode. * \param file_priv DRM file private. * \param cmd command. * \param arg pointer to a drm_map structure. * \return zero on success or a negative value on error. * */ int drm_addmap_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_map *map = data; struct drm_map_list *maplist; int err; if (!(DRM_SUSER(DRM_CURPROC) || map->type == _DRM_AGP || map->type == _DRM_SHM)) return -EPERM; err = drm_addmap_core(dev, map->offset, map->size, map->type, map->flags, &maplist); if (err) return err; /* avoid a warning on 64-bit, this casting isn't very nice, but the API is set so too late */ map->handle = (void *)(unsigned long)maplist->user_token; return 0; } /** * Remove a map private from list and deallocate resources if the mapping * isn't in use. * * Searches the map on drm_device::maplist, removes it from the list, see if * its being used, and free any associate resource (such as MTRR's) if it's not * being on use. * * \sa drm_addmap */ int drm_rmmap_locked(struct drm_device *dev, struct drm_local_map *map) { struct drm_map_list *r_list = NULL, *list_t; int found = 0; struct drm_master *master; /* Find the list entry for the map and remove it */ list_for_each_entry_safe(r_list, list_t, &dev->maplist, head) { if (r_list->map == map) { master = r_list->master; list_del(&r_list->head); drm_ht_remove_key(&dev->map_hash, r_list->user_token >> PAGE_SHIFT); free(r_list, DRM_MEM_MAPS); found = 1; break; } } if (!found) return -EINVAL; switch (map->type) { case _DRM_REGISTERS: drm_core_ioremapfree(map, dev); /* FALLTHROUGH */ case _DRM_FRAME_BUFFER: if (drm_core_has_MTRR(dev) && map->mtrr >= 0) { int retcode; retcode = drm_mtrr_del(map->mtrr, map->offset, map->size, DRM_MTRR_WC); DRM_DEBUG("mtrr_del=%d\n", retcode); } break; case _DRM_SHM: free(map->handle, DRM_MEM_MAPS); if (master) { if (dev->sigdata.lock == master->lock.hw_lock) dev->sigdata.lock = NULL; master->lock.hw_lock = NULL; /* SHM removed */ master->lock.file_priv = NULL; DRM_WAKEUP_INT((void *)&master->lock.lock_queue); } break; case _DRM_AGP: case _DRM_SCATTER_GATHER: break; case _DRM_CONSISTENT: drm_pci_free(dev, map->dmah); break; case _DRM_GEM: DRM_ERROR("tried to rmmap GEM object\n"); break; } free(map, DRM_MEM_MAPS); return 0; } EXPORT_SYMBOL(drm_rmmap_locked); int drm_rmmap(struct drm_device *dev, struct drm_local_map *map) { int ret; DRM_LOCK(dev); ret = drm_rmmap_locked(dev, map); DRM_UNLOCK(dev); return ret; } EXPORT_SYMBOL(drm_rmmap); /* The rmmap ioctl appears to be unnecessary. All mappings are torn down on * the last close of the device, and this is necessary for cleanup when things * exit uncleanly. Therefore, having userland manually remove mappings seems * like a pointless exercise since they're going away anyway. * * One use case might be after addmap is allowed for normal users for SHM and * gets used by drivers that the server doesn't need to care about. This seems * unlikely. * * \param inode device inode. * \param file_priv DRM file private. * \param cmd command. * \param arg pointer to a struct drm_map structure. * \return zero on success or a negative value on error. */ int drm_rmmap_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_map *request = data; struct drm_local_map *map = NULL; struct drm_map_list *r_list; int ret; DRM_LOCK(dev); list_for_each_entry(r_list, &dev->maplist, head) { if (r_list->map && r_list->user_token == (unsigned long)request->handle && r_list->map->flags & _DRM_REMOVABLE) { map = r_list->map; break; } } /* List has wrapped around to the head pointer, or its empty we didn't * find anything. */ if (list_empty(&dev->maplist) || !map) { DRM_UNLOCK(dev); return -EINVAL; } /* Register and framebuffer maps are permanent */ if ((map->type == _DRM_REGISTERS) || (map->type == _DRM_FRAME_BUFFER)) { DRM_UNLOCK(dev); return 0; } ret = drm_rmmap_locked(dev, map); DRM_UNLOCK(dev); return ret; } /** * Cleanup after an error on one of the addbufs() functions. * * \param dev DRM device. * \param entry buffer entry where the error occurred. * * Frees any pages and buffers associated with the given entry. */ static void drm_cleanup_buf_error(struct drm_device * dev, struct drm_buf_entry * entry) { int i; if (entry->seg_count) { for (i = 0; i < entry->seg_count; i++) { if (entry->seglist[i]) { drm_pci_free(dev, entry->seglist[i]); } } free(entry->seglist, DRM_MEM_SEGS); entry->seg_count = 0; } if (entry->buf_count) { for (i = 0; i < entry->buf_count; i++) { free(entry->buflist[i].dev_private, DRM_MEM_BUFS); } free(entry->buflist, DRM_MEM_BUFS); entry->buf_count = 0; } } #if __OS_HAS_AGP /** * Add AGP buffers for DMA transfers. * * \param dev struct drm_device to which the buffers are to be added. * \param request pointer to a struct drm_buf_desc describing the request. * \return zero on success or a negative number on failure. * * After some sanity checks creates a drm_buf structure for each buffer and * reallocates the buffer list of the same size order to accommodate the new * buffers. */ int drm_addbufs_agp(struct drm_device * dev, struct drm_buf_desc * request) { struct drm_device_dma *dma = dev->dma; struct drm_buf_entry *entry; struct drm_agp_mem *agp_entry; struct drm_buf *buf; unsigned long offset; unsigned long agp_offset; int count; int order; int size; int alignment; int page_order; int total; int byte_count; int i, valid; struct drm_buf **temp_buflist; if (!dma) return -EINVAL; count = request->count; order = drm_order(request->size); size = 1 << order; alignment = (request->flags & _DRM_PAGE_ALIGN) ? PAGE_ALIGN(size) : size; page_order = order - PAGE_SHIFT > 0 ? order - PAGE_SHIFT : 0; total = PAGE_SIZE << page_order; byte_count = 0; agp_offset = dev->agp->base + request->agp_start; DRM_DEBUG("count: %d\n", count); DRM_DEBUG("order: %d\n", order); DRM_DEBUG("size: %d\n", size); DRM_DEBUG("agp_offset: %lx\n", agp_offset); DRM_DEBUG("alignment: %d\n", alignment); DRM_DEBUG("page_order: %d\n", page_order); DRM_DEBUG("total: %d\n", total); if (order < DRM_MIN_ORDER || order > DRM_MAX_ORDER) return -EINVAL; /* Make sure buffers are located in AGP memory that we own */ valid = 0; list_for_each_entry(agp_entry, &dev->agp->memory, head) { if ((agp_offset >= agp_entry->bound) && (agp_offset + total * count <= agp_entry->bound + agp_entry->pages * PAGE_SIZE)) { valid = 1; break; } } if (!list_empty(&dev->agp->memory) && !valid) { DRM_DEBUG("zone invalid\n"); return -EINVAL; } mtx_lock(&dev->count_lock); if (dev->buf_use) { mtx_unlock(&dev->count_lock); return -EBUSY; } atomic_inc(&dev->buf_alloc); mtx_unlock(&dev->count_lock); DRM_LOCK(dev); entry = &dma->bufs[order]; if (entry->buf_count) { DRM_UNLOCK(dev); atomic_dec(&dev->buf_alloc); return -ENOMEM; /* May only call once for each order */ } if (count < 0 || count > 4096) { DRM_UNLOCK(dev); atomic_dec(&dev->buf_alloc); return -EINVAL; } entry->buflist = malloc(count * sizeof(*entry->buflist), DRM_MEM_BUFS, M_NOWAIT | M_ZERO); if (!entry->buflist) { DRM_UNLOCK(dev); atomic_dec(&dev->buf_alloc); return -ENOMEM; } entry->buf_size = size; entry->page_order = page_order; offset = 0; while (entry->buf_count < count) { buf = &entry->buflist[entry->buf_count]; buf->idx = dma->buf_count + entry->buf_count; buf->total = alignment; buf->order = order; buf->used = 0; buf->offset = (dma->byte_count + offset); buf->bus_address = agp_offset + offset; buf->address = (void *)(agp_offset + offset); buf->next = NULL; buf->waiting = 0; buf->pending = 0; buf->file_priv = NULL; buf->dev_priv_size = dev->driver->dev_priv_size; buf->dev_private = malloc(buf->dev_priv_size, DRM_MEM_BUFS, M_NOWAIT | M_ZERO); if (!buf->dev_private) { /* Set count correctly so we free the proper amount. */ entry->buf_count = count; drm_cleanup_buf_error(dev, entry); DRM_UNLOCK(dev); atomic_dec(&dev->buf_alloc); return -ENOMEM; } DRM_DEBUG("buffer %d @ %p\n", entry->buf_count, buf->address); offset += alignment; entry->buf_count++; byte_count += PAGE_SIZE << page_order; } DRM_DEBUG("byte_count: %d\n", byte_count); temp_buflist = realloc(dma->buflist, (dma->buf_count + entry->buf_count) * sizeof(*dma->buflist), DRM_MEM_BUFS, M_NOWAIT); if (!temp_buflist) { /* Free the entry because it isn't valid */ drm_cleanup_buf_error(dev, entry); DRM_UNLOCK(dev); atomic_dec(&dev->buf_alloc); return -ENOMEM; } dma->buflist = temp_buflist; for (i = 0; i < entry->buf_count; i++) { dma->buflist[i + dma->buf_count] = &entry->buflist[i]; } dma->buf_count += entry->buf_count; dma->seg_count += entry->seg_count; dma->page_count += byte_count >> PAGE_SHIFT; dma->byte_count += byte_count; DRM_DEBUG("dma->buf_count : %d\n", dma->buf_count); DRM_DEBUG("entry->buf_count : %d\n", entry->buf_count); DRM_UNLOCK(dev); request->count = entry->buf_count; request->size = size; dma->flags = _DRM_DMA_USE_AGP; atomic_dec(&dev->buf_alloc); return 0; } EXPORT_SYMBOL(drm_addbufs_agp); #endif /* __OS_HAS_AGP */ int drm_addbufs_pci(struct drm_device * dev, struct drm_buf_desc * request) { struct drm_device_dma *dma = dev->dma; int count; int order; int size; int total; int page_order; struct drm_buf_entry *entry; drm_dma_handle_t *dmah; struct drm_buf *buf; int alignment; unsigned long offset; int i; int byte_count; int page_count; unsigned long *temp_pagelist; struct drm_buf **temp_buflist; if (!drm_core_check_feature(dev, DRIVER_PCI_DMA)) return -EINVAL; if (!dma) return -EINVAL; if (!DRM_SUSER(DRM_CURPROC)) return -EPERM; count = request->count; order = drm_order(request->size); size = 1 << order; DRM_DEBUG("count=%d, size=%d (%d), order=%d\n", request->count, request->size, size, order); if (order < DRM_MIN_ORDER || order > DRM_MAX_ORDER) return -EINVAL; alignment = (request->flags & _DRM_PAGE_ALIGN) ? PAGE_ALIGN(size) : size; page_order = order - PAGE_SHIFT > 0 ? order - PAGE_SHIFT : 0; total = PAGE_SIZE << page_order; mtx_lock(&dev->count_lock); if (dev->buf_use) { mtx_unlock(&dev->count_lock); return -EBUSY; } atomic_inc(&dev->buf_alloc); mtx_unlock(&dev->count_lock); DRM_LOCK(dev); entry = &dma->bufs[order]; if (entry->buf_count) { DRM_UNLOCK(dev); atomic_dec(&dev->buf_alloc); return -ENOMEM; /* May only call once for each order */ } if (count < 0 || count > 4096) { DRM_UNLOCK(dev); atomic_dec(&dev->buf_alloc); return -EINVAL; } entry->buflist = malloc(count * sizeof(*entry->buflist), DRM_MEM_BUFS, M_NOWAIT | M_ZERO); if (!entry->buflist) { DRM_UNLOCK(dev); atomic_dec(&dev->buf_alloc); return -ENOMEM; } entry->seglist = malloc(count * sizeof(*entry->seglist), DRM_MEM_SEGS, M_NOWAIT | M_ZERO); if (!entry->seglist) { free(entry->buflist, DRM_MEM_BUFS); DRM_UNLOCK(dev); atomic_dec(&dev->buf_alloc); return -ENOMEM; } /* Keep the original pagelist until we know all the allocations * have succeeded */ temp_pagelist = malloc((dma->page_count + (count << page_order)) * sizeof(*dma->pagelist), DRM_MEM_PAGES, M_NOWAIT); if (!temp_pagelist) { free(entry->buflist, DRM_MEM_BUFS); free(entry->seglist, DRM_MEM_SEGS); DRM_UNLOCK(dev); atomic_dec(&dev->buf_alloc); return -ENOMEM; } memcpy(temp_pagelist, dma->pagelist, dma->page_count * sizeof(*dma->pagelist)); DRM_DEBUG("pagelist: %d entries\n", dma->page_count + (count << page_order)); entry->buf_size = size; entry->page_order = page_order; byte_count = 0; page_count = 0; while (entry->buf_count < count) { dmah = drm_pci_alloc(dev, PAGE_SIZE << page_order, 0x1000, BUS_SPACE_MAXADDR); if (!dmah) { /* Set count correctly so we free the proper amount. */ entry->buf_count = count; entry->seg_count = count; drm_cleanup_buf_error(dev, entry); free(temp_pagelist, DRM_MEM_PAGES); DRM_UNLOCK(dev); atomic_dec(&dev->buf_alloc); return -ENOMEM; } entry->seglist[entry->seg_count++] = dmah; for (i = 0; i < (1 << page_order); i++) { DRM_DEBUG("page %d @ 0x%08lx\n", dma->page_count + page_count, (unsigned long)dmah->vaddr + PAGE_SIZE * i); temp_pagelist[dma->page_count + page_count++] = (unsigned long)dmah->vaddr + PAGE_SIZE * i; } for (offset = 0; offset + size <= total && entry->buf_count < count; offset += alignment, ++entry->buf_count) { buf = &entry->buflist[entry->buf_count]; buf->idx = dma->buf_count + entry->buf_count; buf->total = alignment; buf->order = order; buf->used = 0; buf->offset = (dma->byte_count + byte_count + offset); buf->address = (void *)((char *)dmah->vaddr + offset); buf->bus_address = dmah->busaddr + offset; buf->next = NULL; buf->waiting = 0; buf->pending = 0; buf->file_priv = NULL; buf->dev_priv_size = dev->driver->dev_priv_size; buf->dev_private = malloc(buf->dev_priv_size, DRM_MEM_BUFS, M_NOWAIT | M_ZERO); if (!buf->dev_private) { /* Set count correctly so we free the proper amount. */ entry->buf_count = count; entry->seg_count = count; drm_cleanup_buf_error(dev, entry); free(temp_pagelist, DRM_MEM_PAGES); DRM_UNLOCK(dev); atomic_dec(&dev->buf_alloc); return -ENOMEM; } DRM_DEBUG("buffer %d @ %p\n", entry->buf_count, buf->address); } byte_count += PAGE_SIZE << page_order; } temp_buflist = realloc(dma->buflist, (dma->buf_count + entry->buf_count) * sizeof(*dma->buflist), DRM_MEM_BUFS, M_NOWAIT); if (!temp_buflist) { /* Free the entry because it isn't valid */ drm_cleanup_buf_error(dev, entry); free(temp_pagelist, DRM_MEM_PAGES); DRM_UNLOCK(dev); atomic_dec(&dev->buf_alloc); return -ENOMEM; } dma->buflist = temp_buflist; for (i = 0; i < entry->buf_count; i++) { dma->buflist[i + dma->buf_count] = &entry->buflist[i]; } /* No allocations failed, so now we can replace the original pagelist * with the new one. */ if (dma->page_count) { free(dma->pagelist, DRM_MEM_PAGES); } dma->pagelist = temp_pagelist; dma->buf_count += entry->buf_count; dma->seg_count += entry->seg_count; dma->page_count += entry->seg_count << page_order; dma->byte_count += PAGE_SIZE * (entry->seg_count << page_order); DRM_UNLOCK(dev); request->count = entry->buf_count; request->size = size; if (request->flags & _DRM_PCI_BUFFER_RO) dma->flags = _DRM_DMA_USE_PCI_RO; atomic_dec(&dev->buf_alloc); return 0; } EXPORT_SYMBOL(drm_addbufs_pci); static int drm_addbufs_sg(struct drm_device * dev, struct drm_buf_desc * request) { struct drm_device_dma *dma = dev->dma; struct drm_buf_entry *entry; struct drm_buf *buf; unsigned long offset; unsigned long agp_offset; int count; int order; int size; int alignment; int page_order; int total; int byte_count; int i; struct drm_buf **temp_buflist; if (!drm_core_check_feature(dev, DRIVER_SG)) return -EINVAL; if (!dma) return -EINVAL; if (!DRM_SUSER(DRM_CURPROC)) return -EPERM; count = request->count; order = drm_order(request->size); size = 1 << order; alignment = (request->flags & _DRM_PAGE_ALIGN) ? PAGE_ALIGN(size) : size; page_order = order - PAGE_SHIFT > 0 ? order - PAGE_SHIFT : 0; total = PAGE_SIZE << page_order; byte_count = 0; agp_offset = request->agp_start; DRM_DEBUG("count: %d\n", count); DRM_DEBUG("order: %d\n", order); DRM_DEBUG("size: %d\n", size); DRM_DEBUG("agp_offset: %lu\n", agp_offset); DRM_DEBUG("alignment: %d\n", alignment); DRM_DEBUG("page_order: %d\n", page_order); DRM_DEBUG("total: %d\n", total); if (order < DRM_MIN_ORDER || order > DRM_MAX_ORDER) return -EINVAL; mtx_lock(&dev->count_lock); if (dev->buf_use) { mtx_unlock(&dev->count_lock); return -EBUSY; } atomic_inc(&dev->buf_alloc); mtx_unlock(&dev->count_lock); DRM_LOCK(dev); entry = &dma->bufs[order]; if (entry->buf_count) { DRM_UNLOCK(dev); atomic_dec(&dev->buf_alloc); return -ENOMEM; /* May only call once for each order */ } if (count < 0 || count > 4096) { DRM_UNLOCK(dev); atomic_dec(&dev->buf_alloc); return -EINVAL; } entry->buflist = malloc(count * sizeof(*entry->buflist), DRM_MEM_BUFS, M_NOWAIT | M_ZERO); if (!entry->buflist) { DRM_UNLOCK(dev); atomic_dec(&dev->buf_alloc); return -ENOMEM; } entry->buf_size = size; entry->page_order = page_order; offset = 0; while (entry->buf_count < count) { buf = &entry->buflist[entry->buf_count]; buf->idx = dma->buf_count + entry->buf_count; buf->total = alignment; buf->order = order; buf->used = 0; buf->offset = (dma->byte_count + offset); buf->bus_address = agp_offset + offset; buf->address = (void *)(agp_offset + offset + (unsigned long)dev->sg->vaddr); buf->next = NULL; buf->waiting = 0; buf->pending = 0; buf->file_priv = NULL; buf->dev_priv_size = dev->driver->dev_priv_size; buf->dev_private = malloc(buf->dev_priv_size, DRM_MEM_BUFS, M_NOWAIT | M_ZERO); if (!buf->dev_private) { /* Set count correctly so we free the proper amount. */ entry->buf_count = count; drm_cleanup_buf_error(dev, entry); DRM_UNLOCK(dev); atomic_dec(&dev->buf_alloc); return -ENOMEM; } DRM_DEBUG("buffer %d @ %p\n", entry->buf_count, buf->address); offset += alignment; entry->buf_count++; byte_count += PAGE_SIZE << page_order; } DRM_DEBUG("byte_count: %d\n", byte_count); temp_buflist = realloc(dma->buflist, (dma->buf_count + entry->buf_count) * sizeof(*dma->buflist), DRM_MEM_BUFS, M_NOWAIT); if (!temp_buflist) { /* Free the entry because it isn't valid */ drm_cleanup_buf_error(dev, entry); DRM_UNLOCK(dev); atomic_dec(&dev->buf_alloc); return -ENOMEM; } dma->buflist = temp_buflist; for (i = 0; i < entry->buf_count; i++) { dma->buflist[i + dma->buf_count] = &entry->buflist[i]; } dma->buf_count += entry->buf_count; dma->seg_count += entry->seg_count; dma->page_count += byte_count >> PAGE_SHIFT; dma->byte_count += byte_count; DRM_DEBUG("dma->buf_count : %d\n", dma->buf_count); DRM_DEBUG("entry->buf_count : %d\n", entry->buf_count); DRM_UNLOCK(dev); request->count = entry->buf_count; request->size = size; dma->flags = _DRM_DMA_USE_SG; atomic_dec(&dev->buf_alloc); return 0; } static int drm_addbufs_fb(struct drm_device * dev, struct drm_buf_desc * request) { struct drm_device_dma *dma = dev->dma; struct drm_buf_entry *entry; struct drm_buf *buf; unsigned long offset; unsigned long agp_offset; int count; int order; int size; int alignment; int page_order; int total; int byte_count; int i; struct drm_buf **temp_buflist; if (!drm_core_check_feature(dev, DRIVER_FB_DMA)) return -EINVAL; if (!dma) return -EINVAL; if (!DRM_SUSER(DRM_CURPROC)) return -EPERM; count = request->count; order = drm_order(request->size); size = 1 << order; alignment = (request->flags & _DRM_PAGE_ALIGN) ? PAGE_ALIGN(size) : size; page_order = order - PAGE_SHIFT > 0 ? order - PAGE_SHIFT : 0; total = PAGE_SIZE << page_order; byte_count = 0; agp_offset = request->agp_start; DRM_DEBUG("count: %d\n", count); DRM_DEBUG("order: %d\n", order); DRM_DEBUG("size: %d\n", size); DRM_DEBUG("agp_offset: %lu\n", agp_offset); DRM_DEBUG("alignment: %d\n", alignment); DRM_DEBUG("page_order: %d\n", page_order); DRM_DEBUG("total: %d\n", total); if (order < DRM_MIN_ORDER || order > DRM_MAX_ORDER) return -EINVAL; mtx_lock(&dev->count_lock); if (dev->buf_use) { mtx_unlock(&dev->count_lock); return -EBUSY; } atomic_inc(&dev->buf_alloc); mtx_unlock(&dev->count_lock); DRM_LOCK(dev); entry = &dma->bufs[order]; if (entry->buf_count) { DRM_UNLOCK(dev); atomic_dec(&dev->buf_alloc); return -ENOMEM; /* May only call once for each order */ } if (count < 0 || count > 4096) { DRM_UNLOCK(dev); atomic_dec(&dev->buf_alloc); return -EINVAL; } entry->buflist = malloc(count * sizeof(*entry->buflist), DRM_MEM_BUFS, M_NOWAIT | M_ZERO); if (!entry->buflist) { DRM_UNLOCK(dev); atomic_dec(&dev->buf_alloc); return -ENOMEM; } entry->buf_size = size; entry->page_order = page_order; offset = 0; while (entry->buf_count < count) { buf = &entry->buflist[entry->buf_count]; buf->idx = dma->buf_count + entry->buf_count; buf->total = alignment; buf->order = order; buf->used = 0; buf->offset = (dma->byte_count + offset); buf->bus_address = agp_offset + offset; buf->address = (void *)(agp_offset + offset); buf->next = NULL; buf->waiting = 0; buf->pending = 0; buf->file_priv = NULL; buf->dev_priv_size = dev->driver->dev_priv_size; buf->dev_private = malloc(buf->dev_priv_size, DRM_MEM_BUFS, M_NOWAIT | M_ZERO); if (!buf->dev_private) { /* Set count correctly so we free the proper amount. */ entry->buf_count = count; drm_cleanup_buf_error(dev, entry); DRM_UNLOCK(dev); atomic_dec(&dev->buf_alloc); return -ENOMEM; } DRM_DEBUG("buffer %d @ %p\n", entry->buf_count, buf->address); offset += alignment; entry->buf_count++; byte_count += PAGE_SIZE << page_order; } DRM_DEBUG("byte_count: %d\n", byte_count); temp_buflist = realloc(dma->buflist, (dma->buf_count + entry->buf_count) * sizeof(*dma->buflist), DRM_MEM_BUFS, M_NOWAIT); if (!temp_buflist) { /* Free the entry because it isn't valid */ drm_cleanup_buf_error(dev, entry); DRM_UNLOCK(dev); atomic_dec(&dev->buf_alloc); return -ENOMEM; } dma->buflist = temp_buflist; for (i = 0; i < entry->buf_count; i++) { dma->buflist[i + dma->buf_count] = &entry->buflist[i]; } dma->buf_count += entry->buf_count; dma->seg_count += entry->seg_count; dma->page_count += byte_count >> PAGE_SHIFT; dma->byte_count += byte_count; DRM_DEBUG("dma->buf_count : %d\n", dma->buf_count); DRM_DEBUG("entry->buf_count : %d\n", entry->buf_count); DRM_UNLOCK(dev); request->count = entry->buf_count; request->size = size; dma->flags = _DRM_DMA_USE_FB; atomic_dec(&dev->buf_alloc); return 0; } /** * Add buffers for DMA transfers (ioctl). * * \param inode device inode. * \param file_priv DRM file private. * \param cmd command. * \param arg pointer to a struct drm_buf_desc request. * \return zero on success or a negative number on failure. * * According with the memory type specified in drm_buf_desc::flags and the * build options, it dispatches the call either to addbufs_agp(), * addbufs_sg() or addbufs_pci() for AGP, scatter-gather or consistent * PCI memory respectively. */ int drm_addbufs(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_buf_desc *request = data; int ret; if (!drm_core_check_feature(dev, DRIVER_HAVE_DMA)) return -EINVAL; #if __OS_HAS_AGP if (request->flags & _DRM_AGP_BUFFER) ret = drm_addbufs_agp(dev, request); else #endif if (request->flags & _DRM_SG_BUFFER) ret = drm_addbufs_sg(dev, request); else if (request->flags & _DRM_FB_BUFFER) ret = drm_addbufs_fb(dev, request); else ret = drm_addbufs_pci(dev, request); return ret; } /** * Get information about the buffer mappings. * * This was originally mean for debugging purposes, or by a sophisticated * client library to determine how best to use the available buffers (e.g., * large buffers can be used for image transfer). * * \param inode device inode. * \param file_priv DRM file private. * \param cmd command. * \param arg pointer to a drm_buf_info structure. * \return zero on success or a negative number on failure. * * Increments drm_device::buf_use while holding the drm_device::count_lock * lock, preventing of allocating more buffers after this call. Information * about each requested buffer is then copied into user space. */ int drm_infobufs(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_device_dma *dma = dev->dma; struct drm_buf_info *request = data; int i; int count; if (!drm_core_check_feature(dev, DRIVER_HAVE_DMA)) return -EINVAL; if (!dma) return -EINVAL; mtx_lock(&dev->count_lock); if (atomic_read(&dev->buf_alloc)) { mtx_unlock(&dev->count_lock); return -EBUSY; } ++dev->buf_use; /* Can't allocate more after this call */ mtx_unlock(&dev->count_lock); for (i = 0, count = 0; i < DRM_MAX_ORDER + 1; i++) { if (dma->bufs[i].buf_count) ++count; } DRM_DEBUG("count = %d\n", count); if (request->count >= count) { for (i = 0, count = 0; i < DRM_MAX_ORDER + 1; i++) { if (dma->bufs[i].buf_count) { struct drm_buf_desc __user *to = &request->list[count]; struct drm_buf_entry *from = &dma->bufs[i]; struct drm_freelist *list = &dma->bufs[i].freelist; if (copy_to_user(&to->count, &from->buf_count, sizeof(from->buf_count)) || copy_to_user(&to->size, &from->buf_size, sizeof(from->buf_size)) || copy_to_user(&to->low_mark, &list->low_mark, sizeof(list->low_mark)) || copy_to_user(&to->high_mark, &list->high_mark, sizeof(list->high_mark))) return -EFAULT; DRM_DEBUG("%d %d %d %d %d\n", i, dma->bufs[i].buf_count, dma->bufs[i].buf_size, dma->bufs[i].freelist.low_mark, dma->bufs[i].freelist.high_mark); ++count; } } } request->count = count; return 0; } /** * Specifies a low and high water mark for buffer allocation * * \param inode device inode. * \param file_priv DRM file private. * \param cmd command. * \param arg a pointer to a drm_buf_desc structure. * \return zero on success or a negative number on failure. * * Verifies that the size order is bounded between the admissible orders and * updates the respective drm_device_dma::bufs entry low and high water mark. * * \note This ioctl is deprecated and mostly never used. */ int drm_markbufs(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_device_dma *dma = dev->dma; struct drm_buf_desc *request = data; int order; struct drm_buf_entry *entry; if (!drm_core_check_feature(dev, DRIVER_HAVE_DMA)) return -EINVAL; if (!dma) return -EINVAL; DRM_DEBUG("%d, %d, %d\n", request->size, request->low_mark, request->high_mark); order = drm_order(request->size); if (order < DRM_MIN_ORDER || order > DRM_MAX_ORDER) return -EINVAL; entry = &dma->bufs[order]; if (request->low_mark < 0 || request->low_mark > entry->buf_count) return -EINVAL; if (request->high_mark < 0 || request->high_mark > entry->buf_count) return -EINVAL; entry->freelist.low_mark = request->low_mark; entry->freelist.high_mark = request->high_mark; return 0; } /** * Unreserve the buffers in list, previously reserved using drmDMA. * * \param inode device inode. * \param file_priv DRM file private. * \param cmd command. * \param arg pointer to a drm_buf_free structure. * \return zero on success or a negative number on failure. * * Calls free_buffer() for each used buffer. * This function is primarily used for debugging. */ int drm_freebufs(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_device_dma *dma = dev->dma; struct drm_buf_free *request = data; int i; int idx; struct drm_buf *buf; if (!drm_core_check_feature(dev, DRIVER_HAVE_DMA)) return -EINVAL; if (!dma) return -EINVAL; DRM_DEBUG("%d\n", request->count); for (i = 0; i < request->count; i++) { if (copy_from_user(&idx, &request->list[i], sizeof(idx))) return -EFAULT; if (idx < 0 || idx >= dma->buf_count) { DRM_ERROR("Index %d (of %d max)\n", idx, dma->buf_count - 1); return -EINVAL; } buf = dma->buflist[idx]; if (buf->file_priv != file_priv) { DRM_ERROR("Process %d freeing buffer not owned\n", DRM_CURRENTPID); return -EINVAL; } drm_free_buffer(dev, buf); } return 0; } /** * Maps all of the DMA buffers into client-virtual space (ioctl). * * \param inode device inode. * \param file_priv DRM file private. * \param cmd command. * \param arg pointer to a drm_buf_map structure. * \return zero on success or a negative number on failure. * * Maps the AGP, SG or PCI buffer region with vm_mmap(), and copies information * about each buffer into user space. For PCI buffers, it calls vm_mmap() with * offset equal to 0, which drm_mmap() interpretes as PCI buffers and calls * drm_mmap_dma(). */ int drm_mapbufs(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_device_dma *dma = dev->dma; int retcode = 0; const int zero = 0; vm_offset_t virtual; vm_offset_t address; struct vmspace *vms; struct drm_buf_map *request = data; int i; if (!drm_core_check_feature(dev, DRIVER_HAVE_DMA)) return -EINVAL; if (!dma) return -EINVAL; mtx_lock(&dev->count_lock); if (atomic_read(&dev->buf_alloc)) { mtx_unlock(&dev->count_lock); return -EBUSY; } dev->buf_use++; /* Can't allocate more after this call */ mtx_unlock(&dev->count_lock); vms = DRM_CURPROC->td_proc->p_vmspace; if (request->count >= dma->buf_count) { if ((drm_core_has_AGP(dev) && (dma->flags & _DRM_DMA_USE_AGP)) || (drm_core_check_feature(dev, DRIVER_SG) && (dma->flags & _DRM_DMA_USE_SG)) || (drm_core_check_feature(dev, DRIVER_FB_DMA) && (dma->flags & _DRM_DMA_USE_FB))) { struct drm_local_map *map = dev->agp_buffer_map; vm_ooffset_t token = dev->agp_buffer_token; if (!map) { retcode = -EINVAL; goto done; } retcode = vm_mmap(&vms->vm_map, &virtual, map->size, - PROT_READ | PROT_WRITE, VM_PROT_ALL, + VM_PROT_READ | VM_PROT_WRITE, VM_PROT_ALL, MAP_SHARED | MAP_NOSYNC, OBJT_DEVICE, file_priv->minor->device, token); } else { retcode = vm_mmap(&vms->vm_map, &virtual, dma->byte_count, - PROT_READ | PROT_WRITE, VM_PROT_ALL, + VM_PROT_READ | VM_PROT_WRITE, VM_PROT_ALL, MAP_SHARED | MAP_NOSYNC, OBJT_DEVICE, file_priv->minor->device, 0); } if (retcode) { /* Real error */ retcode = -retcode; goto done; } request->virtual = (void __user *)virtual; for (i = 0; i < dma->buf_count; i++) { if (copy_to_user(&request->list[i].idx, &dma->buflist[i]->idx, sizeof(request->list[0].idx))) { retcode = -EFAULT; goto done; } if (copy_to_user(&request->list[i].total, &dma->buflist[i]->total, sizeof(request->list[0].total))) { retcode = -EFAULT; goto done; } if (copy_to_user(&request->list[i].used, &zero, sizeof(zero))) { retcode = -EFAULT; goto done; } address = virtual + dma->buflist[i]->offset; /* *** */ if (copy_to_user(&request->list[i].address, &address, sizeof(address))) { retcode = -EFAULT; goto done; } } } done: request->count = dma->buf_count; DRM_DEBUG("%d buffers, retcode = %d\n", request->count, retcode); return retcode; } /** * Compute size order. Returns the exponent of the smaller power of two which * is greater or equal to given number. * * \param size size. * \return order. * * \todo Can be made faster. */ int drm_order(unsigned long size) { int order; unsigned long tmp; for (order = 0, tmp = size >> 1; tmp; tmp >>= 1, order++) ; if (size & (size - 1)) ++order; return order; } EXPORT_SYMBOL(drm_order); Index: head/sys/fs/devfs/devfs_vnops.c =================================================================== --- head/sys/fs/devfs/devfs_vnops.c (revision 283997) +++ head/sys/fs/devfs/devfs_vnops.c (revision 283998) @@ -1,1835 +1,1900 @@ /*- * Copyright (c) 2000-2004 * Poul-Henning Kamp. All rights reserved. * Copyright (c) 1989, 1992-1993, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software donated to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kernfs_vnops.c 8.15 (Berkeley) 5/21/95 * From: FreeBSD: src/sys/miscfs/kernfs/kernfs_vnops.c 1.43 * * $FreeBSD$ */ /* * TODO: * mkdir: want it ? */ #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include static struct vop_vector devfs_vnodeops; static struct fileops devfs_ops_f; #include #include #include +#include +#include +#include + static MALLOC_DEFINE(M_CDEVPDATA, "DEVFSP", "Metainfo for cdev-fp data"); struct mtx devfs_de_interlock; MTX_SYSINIT(devfs_de_interlock, &devfs_de_interlock, "devfs interlock", MTX_DEF); struct sx clone_drain_lock; SX_SYSINIT(clone_drain_lock, &clone_drain_lock, "clone events drain lock"); struct mtx cdevpriv_mtx; MTX_SYSINIT(cdevpriv_mtx, &cdevpriv_mtx, "cdevpriv lock", MTX_DEF); SYSCTL_DECL(_vfs_devfs); static int devfs_dotimes; SYSCTL_INT(_vfs_devfs, OID_AUTO, dotimes, CTLFLAG_RW, &devfs_dotimes, 0, "Update timestamps on DEVFS with default precision"); /* * Update devfs node timestamp. Note that updates are unlocked and * stat(2) could see partially updated times. */ static void devfs_timestamp(struct timespec *tsp) { time_t ts; if (devfs_dotimes) { vfs_timestamp(tsp); } else { ts = time_second; if (tsp->tv_sec != ts) { tsp->tv_sec = ts; tsp->tv_nsec = 0; } } } static int devfs_fp_check(struct file *fp, struct cdev **devp, struct cdevsw **dswp, int *ref) { *dswp = devvn_refthread(fp->f_vnode, devp, ref); if (*devp != fp->f_data) { if (*dswp != NULL) dev_relthread(*devp, *ref); return (ENXIO); } KASSERT((*devp)->si_refcount > 0, ("devfs: un-referenced struct cdev *(%s)", devtoname(*devp))); if (*dswp == NULL) return (ENXIO); curthread->td_fpop = fp; return (0); } int devfs_get_cdevpriv(void **datap) { struct file *fp; struct cdev_privdata *p; int error; fp = curthread->td_fpop; if (fp == NULL) return (EBADF); p = fp->f_cdevpriv; if (p != NULL) { error = 0; *datap = p->cdpd_data; } else error = ENOENT; return (error); } int devfs_set_cdevpriv(void *priv, cdevpriv_dtr_t priv_dtr) { struct file *fp; struct cdev_priv *cdp; struct cdev_privdata *p; int error; fp = curthread->td_fpop; if (fp == NULL) return (ENOENT); cdp = cdev2priv((struct cdev *)fp->f_data); p = malloc(sizeof(struct cdev_privdata), M_CDEVPDATA, M_WAITOK); p->cdpd_data = priv; p->cdpd_dtr = priv_dtr; p->cdpd_fp = fp; mtx_lock(&cdevpriv_mtx); if (fp->f_cdevpriv == NULL) { LIST_INSERT_HEAD(&cdp->cdp_fdpriv, p, cdpd_list); fp->f_cdevpriv = p; mtx_unlock(&cdevpriv_mtx); error = 0; } else { mtx_unlock(&cdevpriv_mtx); free(p, M_CDEVPDATA); error = EBUSY; } return (error); } void devfs_destroy_cdevpriv(struct cdev_privdata *p) { mtx_assert(&cdevpriv_mtx, MA_OWNED); p->cdpd_fp->f_cdevpriv = NULL; LIST_REMOVE(p, cdpd_list); mtx_unlock(&cdevpriv_mtx); (p->cdpd_dtr)(p->cdpd_data); free(p, M_CDEVPDATA); } void devfs_fpdrop(struct file *fp) { struct cdev_privdata *p; mtx_lock(&cdevpriv_mtx); if ((p = fp->f_cdevpriv) == NULL) { mtx_unlock(&cdevpriv_mtx); return; } devfs_destroy_cdevpriv(p); } void devfs_clear_cdevpriv(void) { struct file *fp; fp = curthread->td_fpop; if (fp == NULL) return; devfs_fpdrop(fp); } /* * On success devfs_populate_vp() returns with dmp->dm_lock held. */ static int devfs_populate_vp(struct vnode *vp) { struct devfs_dirent *de; struct devfs_mount *dmp; int locked; ASSERT_VOP_LOCKED(vp, "devfs_populate_vp"); dmp = VFSTODEVFS(vp->v_mount); locked = VOP_ISLOCKED(vp); sx_xlock(&dmp->dm_lock); DEVFS_DMP_HOLD(dmp); /* Can't call devfs_populate() with the vnode lock held. */ VOP_UNLOCK(vp, 0); devfs_populate(dmp); sx_xunlock(&dmp->dm_lock); vn_lock(vp, locked | LK_RETRY); sx_xlock(&dmp->dm_lock); if (DEVFS_DMP_DROP(dmp)) { sx_xunlock(&dmp->dm_lock); devfs_unmount_final(dmp); return (EBADF); } if ((vp->v_iflag & VI_DOOMED) != 0) { sx_xunlock(&dmp->dm_lock); return (EBADF); } de = vp->v_data; KASSERT(de != NULL, ("devfs_populate_vp: vp->v_data == NULL but vnode not doomed")); if ((de->de_flags & DE_DOOMED) != 0) { sx_xunlock(&dmp->dm_lock); return (EBADF); } return (0); } static int devfs_vptocnp(struct vop_vptocnp_args *ap) { struct vnode *vp = ap->a_vp; struct vnode **dvp = ap->a_vpp; struct devfs_mount *dmp; char *buf = ap->a_buf; int *buflen = ap->a_buflen; struct devfs_dirent *dd, *de; int i, error; dmp = VFSTODEVFS(vp->v_mount); error = devfs_populate_vp(vp); if (error != 0) return (error); i = *buflen; dd = vp->v_data; if (vp->v_type == VCHR) { i -= strlen(dd->de_cdp->cdp_c.si_name); if (i < 0) { error = ENOMEM; goto finished; } bcopy(dd->de_cdp->cdp_c.si_name, buf + i, strlen(dd->de_cdp->cdp_c.si_name)); de = dd->de_dir; } else if (vp->v_type == VDIR) { if (dd == dmp->dm_rootdir) { *dvp = vp; vref(*dvp); goto finished; } i -= dd->de_dirent->d_namlen; if (i < 0) { error = ENOMEM; goto finished; } bcopy(dd->de_dirent->d_name, buf + i, dd->de_dirent->d_namlen); de = dd; } else { error = ENOENT; goto finished; } *buflen = i; de = devfs_parent_dirent(de); if (de == NULL) { error = ENOENT; goto finished; } mtx_lock(&devfs_de_interlock); *dvp = de->de_vnode; if (*dvp != NULL) { VI_LOCK(*dvp); mtx_unlock(&devfs_de_interlock); vholdl(*dvp); VI_UNLOCK(*dvp); vref(*dvp); vdrop(*dvp); } else { mtx_unlock(&devfs_de_interlock); error = ENOENT; } finished: sx_xunlock(&dmp->dm_lock); return (error); } /* * Construct the fully qualified path name relative to the mountpoint. * If a NULL cnp is provided, no '/' is appended to the resulting path. */ char * devfs_fqpn(char *buf, struct devfs_mount *dmp, struct devfs_dirent *dd, struct componentname *cnp) { int i; struct devfs_dirent *de; sx_assert(&dmp->dm_lock, SA_LOCKED); i = SPECNAMELEN; buf[i] = '\0'; if (cnp != NULL) i -= cnp->cn_namelen; if (i < 0) return (NULL); if (cnp != NULL) bcopy(cnp->cn_nameptr, buf + i, cnp->cn_namelen); de = dd; while (de != dmp->dm_rootdir) { if (cnp != NULL || i < SPECNAMELEN) { i--; if (i < 0) return (NULL); buf[i] = '/'; } i -= de->de_dirent->d_namlen; if (i < 0) return (NULL); bcopy(de->de_dirent->d_name, buf + i, de->de_dirent->d_namlen); de = devfs_parent_dirent(de); if (de == NULL) return (NULL); } return (buf + i); } static int devfs_allocv_drop_refs(int drop_dm_lock, struct devfs_mount *dmp, struct devfs_dirent *de) { int not_found; not_found = 0; if (de->de_flags & DE_DOOMED) not_found = 1; if (DEVFS_DE_DROP(de)) { KASSERT(not_found == 1, ("DEVFS de dropped but not doomed")); devfs_dirent_free(de); } if (DEVFS_DMP_DROP(dmp)) { KASSERT(not_found == 1, ("DEVFS mount struct freed before dirent")); not_found = 2; sx_xunlock(&dmp->dm_lock); devfs_unmount_final(dmp); } if (not_found == 1 || (drop_dm_lock && not_found != 2)) sx_unlock(&dmp->dm_lock); return (not_found); } static void devfs_insmntque_dtr(struct vnode *vp, void *arg) { struct devfs_dirent *de; de = (struct devfs_dirent *)arg; mtx_lock(&devfs_de_interlock); vp->v_data = NULL; de->de_vnode = NULL; mtx_unlock(&devfs_de_interlock); vgone(vp); vput(vp); } /* * devfs_allocv shall be entered with dmp->dm_lock held, and it drops * it on return. */ int devfs_allocv(struct devfs_dirent *de, struct mount *mp, int lockmode, struct vnode **vpp) { int error; struct vnode *vp; struct cdev *dev; struct devfs_mount *dmp; struct cdevsw *dsw; dmp = VFSTODEVFS(mp); if (de->de_flags & DE_DOOMED) { sx_xunlock(&dmp->dm_lock); return (ENOENT); } loop: DEVFS_DE_HOLD(de); DEVFS_DMP_HOLD(dmp); mtx_lock(&devfs_de_interlock); vp = de->de_vnode; if (vp != NULL) { VI_LOCK(vp); mtx_unlock(&devfs_de_interlock); sx_xunlock(&dmp->dm_lock); vget(vp, lockmode | LK_INTERLOCK | LK_RETRY, curthread); sx_xlock(&dmp->dm_lock); if (devfs_allocv_drop_refs(0, dmp, de)) { vput(vp); return (ENOENT); } else if ((vp->v_iflag & VI_DOOMED) != 0) { mtx_lock(&devfs_de_interlock); if (de->de_vnode == vp) { de->de_vnode = NULL; vp->v_data = NULL; } mtx_unlock(&devfs_de_interlock); vput(vp); goto loop; } sx_xunlock(&dmp->dm_lock); *vpp = vp; return (0); } mtx_unlock(&devfs_de_interlock); if (de->de_dirent->d_type == DT_CHR) { if (!(de->de_cdp->cdp_flags & CDP_ACTIVE)) { devfs_allocv_drop_refs(1, dmp, de); return (ENOENT); } dev = &de->de_cdp->cdp_c; } else { dev = NULL; } error = getnewvnode("devfs", mp, &devfs_vnodeops, &vp); if (error != 0) { devfs_allocv_drop_refs(1, dmp, de); printf("devfs_allocv: failed to allocate new vnode\n"); return (error); } if (de->de_dirent->d_type == DT_CHR) { vp->v_type = VCHR; VI_LOCK(vp); dev_lock(); dev_refl(dev); /* XXX: v_rdev should be protect by vnode lock */ vp->v_rdev = dev; KASSERT(vp->v_usecount == 1, ("%s %d (%d)\n", __func__, __LINE__, vp->v_usecount)); dev->si_usecount += vp->v_usecount; /* Special casing of ttys for deadfs. Probably redundant. */ dsw = dev->si_devsw; if (dsw != NULL && (dsw->d_flags & D_TTY) != 0) vp->v_vflag |= VV_ISTTY; dev_unlock(); VI_UNLOCK(vp); if ((dev->si_flags & SI_ETERNAL) != 0) vp->v_vflag |= VV_ETERNALDEV; vp->v_op = &devfs_specops; } else if (de->de_dirent->d_type == DT_DIR) { vp->v_type = VDIR; } else if (de->de_dirent->d_type == DT_LNK) { vp->v_type = VLNK; } else { vp->v_type = VBAD; } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_NOWITNESS); VN_LOCK_ASHARE(vp); mtx_lock(&devfs_de_interlock); vp->v_data = de; de->de_vnode = vp; mtx_unlock(&devfs_de_interlock); error = insmntque1(vp, mp, devfs_insmntque_dtr, de); if (error != 0) { (void) devfs_allocv_drop_refs(1, dmp, de); return (error); } if (devfs_allocv_drop_refs(0, dmp, de)) { vput(vp); return (ENOENT); } #ifdef MAC mac_devfs_vnode_associate(mp, de, vp); #endif sx_xunlock(&dmp->dm_lock); *vpp = vp; return (0); } static int devfs_access(struct vop_access_args *ap) { struct vnode *vp = ap->a_vp; struct devfs_dirent *de; struct proc *p; int error; de = vp->v_data; if (vp->v_type == VDIR) de = de->de_dir; error = vaccess(vp->v_type, de->de_mode, de->de_uid, de->de_gid, ap->a_accmode, ap->a_cred, NULL); if (error == 0) return (0); if (error != EACCES) return (error); p = ap->a_td->td_proc; /* We do, however, allow access to the controlling terminal */ PROC_LOCK(p); if (!(p->p_flag & P_CONTROLT)) { PROC_UNLOCK(p); return (error); } if (p->p_session->s_ttydp == de->de_cdp) error = 0; PROC_UNLOCK(p); return (error); } /* ARGSUSED */ static int devfs_close(struct vop_close_args *ap) { struct vnode *vp = ap->a_vp, *oldvp; struct thread *td = ap->a_td; struct proc *p; struct cdev *dev = vp->v_rdev; struct cdevsw *dsw; int vp_locked, error, ref; /* * XXX: Don't call d_close() if we were called because of * XXX: insmntque1() failure. */ if (vp->v_data == NULL) return (0); /* * Hack: a tty device that is a controlling terminal * has a reference from the session structure. * We cannot easily tell that a character device is * a controlling terminal, unless it is the closing * process' controlling terminal. In that case, * if the reference count is 2 (this last descriptor * plus the session), release the reference from the session. */ if (td != NULL) { p = td->td_proc; PROC_LOCK(p); if (vp == p->p_session->s_ttyvp) { PROC_UNLOCK(p); oldvp = NULL; sx_xlock(&proctree_lock); if (vp == p->p_session->s_ttyvp) { SESS_LOCK(p->p_session); VI_LOCK(vp); if (count_dev(dev) == 2 && (vp->v_iflag & VI_DOOMED) == 0) { p->p_session->s_ttyvp = NULL; p->p_session->s_ttydp = NULL; oldvp = vp; } VI_UNLOCK(vp); SESS_UNLOCK(p->p_session); } sx_xunlock(&proctree_lock); if (oldvp != NULL) vrele(oldvp); } else PROC_UNLOCK(p); } /* * We do not want to really close the device if it * is still in use unless we are trying to close it * forcibly. Since every use (buffer, vnode, swap, cmap) * holds a reference to the vnode, and because we mark * any other vnodes that alias this device, when the * sum of the reference counts on all the aliased * vnodes descends to one, we are on last close. */ dsw = dev_refthread(dev, &ref); if (dsw == NULL) return (ENXIO); VI_LOCK(vp); if (vp->v_iflag & VI_DOOMED) { /* Forced close. */ } else if (dsw->d_flags & D_TRACKCLOSE) { /* Keep device updated on status. */ } else if (count_dev(dev) > 1) { VI_UNLOCK(vp); dev_relthread(dev, ref); return (0); } vholdl(vp); VI_UNLOCK(vp); vp_locked = VOP_ISLOCKED(vp); VOP_UNLOCK(vp, 0); KASSERT(dev->si_refcount > 0, ("devfs_close() on un-referenced struct cdev *(%s)", devtoname(dev))); error = dsw->d_close(dev, ap->a_fflag, S_IFCHR, td); dev_relthread(dev, ref); vn_lock(vp, vp_locked | LK_RETRY); vdrop(vp); return (error); } static int devfs_close_f(struct file *fp, struct thread *td) { int error; struct file *fpop; /* * NB: td may be NULL if this descriptor is closed due to * garbage collection from a closed UNIX domain socket. */ fpop = curthread->td_fpop; curthread->td_fpop = fp; error = vnops.fo_close(fp, td); curthread->td_fpop = fpop; /* * The f_cdevpriv cannot be assigned non-NULL value while we * are destroying the file. */ if (fp->f_cdevpriv != NULL) devfs_fpdrop(fp); return (error); } static int devfs_fsync(struct vop_fsync_args *ap) { int error; struct bufobj *bo; struct devfs_dirent *de; if (!vn_isdisk(ap->a_vp, &error)) { bo = &ap->a_vp->v_bufobj; de = ap->a_vp->v_data; if (error == ENXIO && bo->bo_dirty.bv_cnt > 0) { printf("Device %s went missing before all of the data " "could be written to it; expect data loss.\n", de->de_dirent->d_name); error = vop_stdfsync(ap); if (bo->bo_dirty.bv_cnt != 0 || error != 0) panic("devfs_fsync: vop_stdfsync failed."); } return (0); } return (vop_stdfsync(ap)); } static int devfs_getattr(struct vop_getattr_args *ap) { struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; int error; struct devfs_dirent *de; struct devfs_mount *dmp; struct cdev *dev; error = devfs_populate_vp(vp); if (error != 0) return (error); dmp = VFSTODEVFS(vp->v_mount); sx_xunlock(&dmp->dm_lock); de = vp->v_data; KASSERT(de != NULL, ("Null dirent in devfs_getattr vp=%p", vp)); if (vp->v_type == VDIR) { de = de->de_dir; KASSERT(de != NULL, ("Null dir dirent in devfs_getattr vp=%p", vp)); } vap->va_uid = de->de_uid; vap->va_gid = de->de_gid; vap->va_mode = de->de_mode; if (vp->v_type == VLNK) vap->va_size = strlen(de->de_symlink); else if (vp->v_type == VDIR) vap->va_size = vap->va_bytes = DEV_BSIZE; else vap->va_size = 0; if (vp->v_type != VDIR) vap->va_bytes = 0; vap->va_blocksize = DEV_BSIZE; vap->va_type = vp->v_type; #define fix(aa) \ do { \ if ((aa).tv_sec <= 3600) { \ (aa).tv_sec = boottime.tv_sec; \ (aa).tv_nsec = boottime.tv_usec * 1000; \ } \ } while (0) if (vp->v_type != VCHR) { fix(de->de_atime); vap->va_atime = de->de_atime; fix(de->de_mtime); vap->va_mtime = de->de_mtime; fix(de->de_ctime); vap->va_ctime = de->de_ctime; } else { dev = vp->v_rdev; fix(dev->si_atime); vap->va_atime = dev->si_atime; fix(dev->si_mtime); vap->va_mtime = dev->si_mtime; fix(dev->si_ctime); vap->va_ctime = dev->si_ctime; vap->va_rdev = cdev2priv(dev)->cdp_inode; } vap->va_gen = 0; vap->va_flags = 0; vap->va_filerev = 0; vap->va_nlink = de->de_links; vap->va_fileid = de->de_inode; return (error); } /* ARGSUSED */ static int devfs_ioctl_f(struct file *fp, u_long com, void *data, struct ucred *cred, struct thread *td) { struct cdev *dev; struct cdevsw *dsw; struct vnode *vp; struct vnode *vpold; int error, i, ref; const char *p; struct fiodgname_arg *fgn; struct file *fpop; fpop = td->td_fpop; error = devfs_fp_check(fp, &dev, &dsw, &ref); if (error != 0) { error = vnops.fo_ioctl(fp, com, data, cred, td); return (error); } if (com == FIODTYPE) { *(int *)data = dsw->d_flags & D_TYPEMASK; td->td_fpop = fpop; dev_relthread(dev, ref); return (0); } else if (com == FIODGNAME) { fgn = data; p = devtoname(dev); i = strlen(p) + 1; if (i > fgn->len) error = EINVAL; else error = copyout(p, fgn->buf, i); td->td_fpop = fpop; dev_relthread(dev, ref); return (error); } error = dsw->d_ioctl(dev, com, data, fp->f_flag, td); td->td_fpop = NULL; dev_relthread(dev, ref); if (error == ENOIOCTL) error = ENOTTY; if (error == 0 && com == TIOCSCTTY) { vp = fp->f_vnode; /* Do nothing if reassigning same control tty */ sx_slock(&proctree_lock); if (td->td_proc->p_session->s_ttyvp == vp) { sx_sunlock(&proctree_lock); return (0); } vpold = td->td_proc->p_session->s_ttyvp; VREF(vp); SESS_LOCK(td->td_proc->p_session); td->td_proc->p_session->s_ttyvp = vp; td->td_proc->p_session->s_ttydp = cdev2priv(dev); SESS_UNLOCK(td->td_proc->p_session); sx_sunlock(&proctree_lock); /* Get rid of reference to old control tty */ if (vpold) vrele(vpold); } return (error); } /* ARGSUSED */ static int devfs_kqfilter_f(struct file *fp, struct knote *kn) { struct cdev *dev; struct cdevsw *dsw; int error, ref; struct file *fpop; struct thread *td; td = curthread; fpop = td->td_fpop; error = devfs_fp_check(fp, &dev, &dsw, &ref); if (error) return (error); error = dsw->d_kqfilter(dev, kn); td->td_fpop = fpop; dev_relthread(dev, ref); return (error); } static inline int devfs_prison_check(struct devfs_dirent *de, struct thread *td) { struct cdev_priv *cdp; struct ucred *dcr; struct proc *p; int error; cdp = de->de_cdp; if (cdp == NULL) return (0); dcr = cdp->cdp_c.si_cred; if (dcr == NULL) return (0); error = prison_check(td->td_ucred, dcr); if (error == 0) return (0); /* We do, however, allow access to the controlling terminal */ p = td->td_proc; PROC_LOCK(p); if (!(p->p_flag & P_CONTROLT)) { PROC_UNLOCK(p); return (error); } if (p->p_session->s_ttydp == cdp) error = 0; PROC_UNLOCK(p); return (error); } static int devfs_lookupx(struct vop_lookup_args *ap, int *dm_unlock) { struct componentname *cnp; struct vnode *dvp, **vpp; struct thread *td; struct devfs_dirent *de, *dd; struct devfs_dirent **dde; struct devfs_mount *dmp; struct cdev *cdev; int error, flags, nameiop, dvplocked; char specname[SPECNAMELEN + 1], *pname; cnp = ap->a_cnp; vpp = ap->a_vpp; dvp = ap->a_dvp; pname = cnp->cn_nameptr; td = cnp->cn_thread; flags = cnp->cn_flags; nameiop = cnp->cn_nameiop; dmp = VFSTODEVFS(dvp->v_mount); dd = dvp->v_data; *vpp = NULLVP; if ((flags & ISLASTCN) && nameiop == RENAME) return (EOPNOTSUPP); if (dvp->v_type != VDIR) return (ENOTDIR); if ((flags & ISDOTDOT) && (dvp->v_vflag & VV_ROOT)) return (EIO); error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, td); if (error) return (error); if (cnp->cn_namelen == 1 && *pname == '.') { if ((flags & ISLASTCN) && nameiop != LOOKUP) return (EINVAL); *vpp = dvp; VREF(dvp); return (0); } if (flags & ISDOTDOT) { if ((flags & ISLASTCN) && nameiop != LOOKUP) return (EINVAL); de = devfs_parent_dirent(dd); if (de == NULL) return (ENOENT); dvplocked = VOP_ISLOCKED(dvp); VOP_UNLOCK(dvp, 0); error = devfs_allocv(de, dvp->v_mount, cnp->cn_lkflags & LK_TYPE_MASK, vpp); *dm_unlock = 0; vn_lock(dvp, dvplocked | LK_RETRY); return (error); } dd = dvp->v_data; de = devfs_find(dd, cnp->cn_nameptr, cnp->cn_namelen, 0); while (de == NULL) { /* While(...) so we can use break */ if (nameiop == DELETE) return (ENOENT); /* * OK, we didn't have an entry for the name we were asked for * so we try to see if anybody can create it on demand. */ pname = devfs_fqpn(specname, dmp, dd, cnp); if (pname == NULL) break; cdev = NULL; DEVFS_DMP_HOLD(dmp); sx_xunlock(&dmp->dm_lock); sx_slock(&clone_drain_lock); EVENTHANDLER_INVOKE(dev_clone, td->td_ucred, pname, strlen(pname), &cdev); sx_sunlock(&clone_drain_lock); if (cdev == NULL) sx_xlock(&dmp->dm_lock); else if (devfs_populate_vp(dvp) != 0) { *dm_unlock = 0; sx_xlock(&dmp->dm_lock); if (DEVFS_DMP_DROP(dmp)) { sx_xunlock(&dmp->dm_lock); devfs_unmount_final(dmp); } else sx_xunlock(&dmp->dm_lock); dev_rel(cdev); return (ENOENT); } if (DEVFS_DMP_DROP(dmp)) { *dm_unlock = 0; sx_xunlock(&dmp->dm_lock); devfs_unmount_final(dmp); if (cdev != NULL) dev_rel(cdev); return (ENOENT); } if (cdev == NULL) break; dev_lock(); dde = &cdev2priv(cdev)->cdp_dirents[dmp->dm_idx]; if (dde != NULL && *dde != NULL) de = *dde; dev_unlock(); dev_rel(cdev); break; } if (de == NULL || de->de_flags & DE_WHITEOUT) { if ((nameiop == CREATE || nameiop == RENAME) && (flags & (LOCKPARENT | WANTPARENT)) && (flags & ISLASTCN)) { cnp->cn_flags |= SAVENAME; return (EJUSTRETURN); } return (ENOENT); } if (devfs_prison_check(de, td)) return (ENOENT); if ((cnp->cn_nameiop == DELETE) && (flags & ISLASTCN)) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td); if (error) return (error); if (*vpp == dvp) { VREF(dvp); *vpp = dvp; return (0); } } error = devfs_allocv(de, dvp->v_mount, cnp->cn_lkflags & LK_TYPE_MASK, vpp); *dm_unlock = 0; return (error); } static int devfs_lookup(struct vop_lookup_args *ap) { int j; struct devfs_mount *dmp; int dm_unlock; if (devfs_populate_vp(ap->a_dvp) != 0) return (ENOTDIR); dmp = VFSTODEVFS(ap->a_dvp->v_mount); dm_unlock = 1; j = devfs_lookupx(ap, &dm_unlock); if (dm_unlock == 1) sx_xunlock(&dmp->dm_lock); return (j); } static int devfs_mknod(struct vop_mknod_args *ap) { struct componentname *cnp; struct vnode *dvp, **vpp; struct devfs_dirent *dd, *de; struct devfs_mount *dmp; int error; /* * The only type of node we should be creating here is a * character device, for anything else return EOPNOTSUPP. */ if (ap->a_vap->va_type != VCHR) return (EOPNOTSUPP); dvp = ap->a_dvp; dmp = VFSTODEVFS(dvp->v_mount); cnp = ap->a_cnp; vpp = ap->a_vpp; dd = dvp->v_data; error = ENOENT; sx_xlock(&dmp->dm_lock); TAILQ_FOREACH(de, &dd->de_dlist, de_list) { if (cnp->cn_namelen != de->de_dirent->d_namlen) continue; if (de->de_dirent->d_type == DT_CHR && (de->de_cdp->cdp_flags & CDP_ACTIVE) == 0) continue; if (bcmp(cnp->cn_nameptr, de->de_dirent->d_name, de->de_dirent->d_namlen) != 0) continue; if (de->de_flags & DE_WHITEOUT) break; goto notfound; } if (de == NULL) goto notfound; de->de_flags &= ~DE_WHITEOUT; error = devfs_allocv(de, dvp->v_mount, LK_EXCLUSIVE, vpp); return (error); notfound: sx_xunlock(&dmp->dm_lock); return (error); } /* ARGSUSED */ static int devfs_open(struct vop_open_args *ap) { struct thread *td = ap->a_td; struct vnode *vp = ap->a_vp; struct cdev *dev = vp->v_rdev; struct file *fp = ap->a_fp; int error, ref, vlocked; struct cdevsw *dsw; struct file *fpop; struct mtx *mtxp; if (vp->v_type == VBLK) return (ENXIO); if (dev == NULL) return (ENXIO); /* Make this field valid before any I/O in d_open. */ if (dev->si_iosize_max == 0) dev->si_iosize_max = DFLTPHYS; dsw = dev_refthread(dev, &ref); if (dsw == NULL) return (ENXIO); if (fp == NULL && dsw->d_fdopen != NULL) { dev_relthread(dev, ref); return (ENXIO); } vlocked = VOP_ISLOCKED(vp); VOP_UNLOCK(vp, 0); fpop = td->td_fpop; td->td_fpop = fp; if (fp != NULL) { fp->f_data = dev; fp->f_vnode = vp; } if (dsw->d_fdopen != NULL) error = dsw->d_fdopen(dev, ap->a_mode, td, fp); else error = dsw->d_open(dev, ap->a_mode, S_IFCHR, td); /* cleanup any cdevpriv upon error */ if (error != 0) devfs_clear_cdevpriv(); td->td_fpop = fpop; vn_lock(vp, vlocked | LK_RETRY); dev_relthread(dev, ref); if (error != 0) { if (error == ERESTART) error = EINTR; return (error); } #if 0 /* /dev/console */ KASSERT(fp != NULL, ("Could not vnode bypass device on NULL fp")); #else if (fp == NULL) return (error); #endif if (fp->f_ops == &badfileops) finit(fp, fp->f_flag, DTYPE_VNODE, dev, &devfs_ops_f); mtxp = mtx_pool_find(mtxpool_sleep, fp); /* * Hint to the dofilewrite() to not force the buffer draining * on the writer to the file. Most likely, the write would * not need normal buffers. */ mtx_lock(mtxp); fp->f_vnread_flags |= FDEVFS_VNODE; mtx_unlock(mtxp); return (error); } static int devfs_pathconf(struct vop_pathconf_args *ap) { switch (ap->a_name) { case _PC_MAC_PRESENT: #ifdef MAC /* * If MAC is enabled, devfs automatically supports * trivial non-persistant label storage. */ *ap->a_retval = 1; #else *ap->a_retval = 0; #endif return (0); default: return (vop_stdpathconf(ap)); } /* NOTREACHED */ } /* ARGSUSED */ static int devfs_poll_f(struct file *fp, int events, struct ucred *cred, struct thread *td) { struct cdev *dev; struct cdevsw *dsw; int error, ref; struct file *fpop; fpop = td->td_fpop; error = devfs_fp_check(fp, &dev, &dsw, &ref); if (error != 0) { error = vnops.fo_poll(fp, events, cred, td); return (error); } error = dsw->d_poll(dev, events, td); td->td_fpop = fpop; dev_relthread(dev, ref); return(error); } /* * Print out the contents of a special device vnode. */ static int devfs_print(struct vop_print_args *ap) { printf("\tdev %s\n", devtoname(ap->a_vp->v_rdev)); return (0); } static int devfs_read_f(struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td) { struct cdev *dev; int ioflag, error, ref; ssize_t resid; struct cdevsw *dsw; struct file *fpop; if (uio->uio_resid > DEVFS_IOSIZE_MAX) return (EINVAL); fpop = td->td_fpop; error = devfs_fp_check(fp, &dev, &dsw, &ref); if (error != 0) { error = vnops.fo_read(fp, uio, cred, flags, td); return (error); } resid = uio->uio_resid; ioflag = fp->f_flag & (O_NONBLOCK | O_DIRECT); if (ioflag & O_DIRECT) ioflag |= IO_DIRECT; foffset_lock_uio(fp, uio, flags | FOF_NOLOCK); error = dsw->d_read(dev, uio, ioflag); if (uio->uio_resid != resid || (error == 0 && resid != 0)) devfs_timestamp(&dev->si_atime); td->td_fpop = fpop; dev_relthread(dev, ref); foffset_unlock_uio(fp, uio, flags | FOF_NOLOCK | FOF_NEXTOFF); return (error); } static int devfs_readdir(struct vop_readdir_args *ap) { int error; struct uio *uio; struct dirent *dp; struct devfs_dirent *dd; struct devfs_dirent *de; struct devfs_mount *dmp; off_t off; int *tmp_ncookies = NULL; if (ap->a_vp->v_type != VDIR) return (ENOTDIR); uio = ap->a_uio; if (uio->uio_offset < 0) return (EINVAL); /* * XXX: This is a temporary hack to get around this filesystem not * supporting cookies. We store the location of the ncookies pointer * in a temporary variable before calling vfs_subr.c:vfs_read_dirent() * and set the number of cookies to 0. We then set the pointer to * NULL so that vfs_read_dirent doesn't try to call realloc() on * ap->a_cookies. Later in this function, we restore the ap->a_ncookies * pointer to its original location before returning to the caller. */ if (ap->a_ncookies != NULL) { tmp_ncookies = ap->a_ncookies; *ap->a_ncookies = 0; ap->a_ncookies = NULL; } dmp = VFSTODEVFS(ap->a_vp->v_mount); if (devfs_populate_vp(ap->a_vp) != 0) { if (tmp_ncookies != NULL) ap->a_ncookies = tmp_ncookies; return (EIO); } error = 0; de = ap->a_vp->v_data; off = 0; TAILQ_FOREACH(dd, &de->de_dlist, de_list) { KASSERT(dd->de_cdp != (void *)0xdeadc0de, ("%s %d\n", __func__, __LINE__)); if (dd->de_flags & (DE_COVERED | DE_WHITEOUT)) continue; if (devfs_prison_check(dd, uio->uio_td)) continue; if (dd->de_dirent->d_type == DT_DIR) de = dd->de_dir; else de = dd; dp = dd->de_dirent; if (dp->d_reclen > uio->uio_resid) break; dp->d_fileno = de->de_inode; if (off >= uio->uio_offset) { error = vfs_read_dirent(ap, dp, off); if (error) break; } off += dp->d_reclen; } sx_xunlock(&dmp->dm_lock); uio->uio_offset = off; /* * Restore ap->a_ncookies if it wasn't originally NULL in the first * place. */ if (tmp_ncookies != NULL) ap->a_ncookies = tmp_ncookies; return (error); } static int devfs_readlink(struct vop_readlink_args *ap) { struct devfs_dirent *de; de = ap->a_vp->v_data; return (uiomove(de->de_symlink, strlen(de->de_symlink), ap->a_uio)); } static int devfs_reclaim(struct vop_reclaim_args *ap) { struct vnode *vp = ap->a_vp; struct devfs_dirent *de; struct cdev *dev; mtx_lock(&devfs_de_interlock); de = vp->v_data; if (de != NULL) { de->de_vnode = NULL; vp->v_data = NULL; } mtx_unlock(&devfs_de_interlock); vnode_destroy_vobject(vp); VI_LOCK(vp); dev_lock(); dev = vp->v_rdev; vp->v_rdev = NULL; if (dev == NULL) { dev_unlock(); VI_UNLOCK(vp); return (0); } dev->si_usecount -= vp->v_usecount; dev_unlock(); VI_UNLOCK(vp); dev_rel(dev); return (0); } static int devfs_remove(struct vop_remove_args *ap) { struct vnode *dvp = ap->a_dvp; struct vnode *vp = ap->a_vp; struct devfs_dirent *dd; struct devfs_dirent *de, *de_covered; struct devfs_mount *dmp = VFSTODEVFS(vp->v_mount); ASSERT_VOP_ELOCKED(dvp, "devfs_remove"); ASSERT_VOP_ELOCKED(vp, "devfs_remove"); sx_xlock(&dmp->dm_lock); dd = ap->a_dvp->v_data; de = vp->v_data; if (de->de_cdp == NULL) { TAILQ_REMOVE(&dd->de_dlist, de, de_list); if (de->de_dirent->d_type == DT_LNK) { de_covered = devfs_find(dd, de->de_dirent->d_name, de->de_dirent->d_namlen, 0); if (de_covered != NULL) de_covered->de_flags &= ~DE_COVERED; } /* We need to unlock dvp because devfs_delete() may lock it. */ VOP_UNLOCK(vp, 0); if (dvp != vp) VOP_UNLOCK(dvp, 0); devfs_delete(dmp, de, 0); sx_xunlock(&dmp->dm_lock); if (dvp != vp) vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } else { de->de_flags |= DE_WHITEOUT; sx_xunlock(&dmp->dm_lock); } return (0); } /* * Revoke is called on a tty when a terminal session ends. The vnode * is orphaned by setting v_op to deadfs so we need to let go of it * as well so that we create a new one next time around. * */ static int devfs_revoke(struct vop_revoke_args *ap) { struct vnode *vp = ap->a_vp, *vp2; struct cdev *dev; struct cdev_priv *cdp; struct devfs_dirent *de; int i; KASSERT((ap->a_flags & REVOKEALL) != 0, ("devfs_revoke !REVOKEALL")); dev = vp->v_rdev; cdp = cdev2priv(dev); dev_lock(); cdp->cdp_inuse++; dev_unlock(); vhold(vp); vgone(vp); vdrop(vp); VOP_UNLOCK(vp,0); loop: for (;;) { mtx_lock(&devfs_de_interlock); dev_lock(); vp2 = NULL; for (i = 0; i <= cdp->cdp_maxdirent; i++) { de = cdp->cdp_dirents[i]; if (de == NULL) continue; vp2 = de->de_vnode; if (vp2 != NULL) { dev_unlock(); VI_LOCK(vp2); mtx_unlock(&devfs_de_interlock); if (vget(vp2, LK_EXCLUSIVE | LK_INTERLOCK, curthread)) goto loop; vhold(vp2); vgone(vp2); vdrop(vp2); vput(vp2); break; } } if (vp2 != NULL) { continue; } dev_unlock(); mtx_unlock(&devfs_de_interlock); break; } dev_lock(); cdp->cdp_inuse--; if (!(cdp->cdp_flags & CDP_ACTIVE) && cdp->cdp_inuse == 0) { TAILQ_REMOVE(&cdevp_list, cdp, cdp_list); dev_unlock(); dev_rel(&cdp->cdp_c); } else dev_unlock(); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); return (0); } static int devfs_rioctl(struct vop_ioctl_args *ap) { struct vnode *vp; struct devfs_mount *dmp; int error; vp = ap->a_vp; vn_lock(vp, LK_SHARED | LK_RETRY); if (vp->v_iflag & VI_DOOMED) { VOP_UNLOCK(vp, 0); return (EBADF); } dmp = VFSTODEVFS(vp->v_mount); sx_xlock(&dmp->dm_lock); VOP_UNLOCK(vp, 0); DEVFS_DMP_HOLD(dmp); devfs_populate(dmp); if (DEVFS_DMP_DROP(dmp)) { sx_xunlock(&dmp->dm_lock); devfs_unmount_final(dmp); return (ENOENT); } error = devfs_rules_ioctl(dmp, ap->a_command, ap->a_data, ap->a_td); sx_xunlock(&dmp->dm_lock); return (error); } static int devfs_rread(struct vop_read_args *ap) { if (ap->a_vp->v_type != VDIR) return (EINVAL); return (VOP_READDIR(ap->a_vp, ap->a_uio, ap->a_cred, NULL, NULL, NULL)); } static int devfs_setattr(struct vop_setattr_args *ap) { struct devfs_dirent *de; struct vattr *vap; struct vnode *vp; struct thread *td; int c, error; uid_t uid; gid_t gid; vap = ap->a_vap; vp = ap->a_vp; td = curthread; if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) || (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || (vap->va_blocksize != VNOVAL) || (vap->va_flags != VNOVAL && vap->va_flags != 0) || (vap->va_rdev != VNOVAL) || ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) { return (EINVAL); } de = vp->v_data; if (vp->v_type == VDIR) de = de->de_dir; error = c = 0; if (vap->va_uid == (uid_t)VNOVAL) uid = de->de_uid; else uid = vap->va_uid; if (vap->va_gid == (gid_t)VNOVAL) gid = de->de_gid; else gid = vap->va_gid; if (uid != de->de_uid || gid != de->de_gid) { if ((ap->a_cred->cr_uid != de->de_uid) || uid != de->de_uid || (gid != de->de_gid && !groupmember(gid, ap->a_cred))) { error = priv_check(td, PRIV_VFS_CHOWN); if (error) return (error); } de->de_uid = uid; de->de_gid = gid; c = 1; } if (vap->va_mode != (mode_t)VNOVAL) { if (ap->a_cred->cr_uid != de->de_uid) { error = priv_check(td, PRIV_VFS_ADMIN); if (error) return (error); } de->de_mode = vap->va_mode; c = 1; } if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) { error = vn_utimes_perm(vp, vap, ap->a_cred, td); if (error != 0) return (error); if (vap->va_atime.tv_sec != VNOVAL) { if (vp->v_type == VCHR) vp->v_rdev->si_atime = vap->va_atime; else de->de_atime = vap->va_atime; } if (vap->va_mtime.tv_sec != VNOVAL) { if (vp->v_type == VCHR) vp->v_rdev->si_mtime = vap->va_mtime; else de->de_mtime = vap->va_mtime; } c = 1; } if (c) { if (vp->v_type == VCHR) vfs_timestamp(&vp->v_rdev->si_ctime); else vfs_timestamp(&de->de_mtime); } return (0); } #ifdef MAC static int devfs_setlabel(struct vop_setlabel_args *ap) { struct vnode *vp; struct devfs_dirent *de; vp = ap->a_vp; de = vp->v_data; mac_vnode_relabel(ap->a_cred, vp, ap->a_label); mac_devfs_update(vp->v_mount, de, vp); return (0); } #endif static int devfs_stat_f(struct file *fp, struct stat *sb, struct ucred *cred, struct thread *td) { return (vnops.fo_stat(fp, sb, cred, td)); } static int devfs_symlink(struct vop_symlink_args *ap) { int i, error; struct devfs_dirent *dd; struct devfs_dirent *de, *de_covered, *de_dotdot; struct devfs_mount *dmp; error = priv_check(curthread, PRIV_DEVFS_SYMLINK); if (error) return(error); dmp = VFSTODEVFS(ap->a_dvp->v_mount); if (devfs_populate_vp(ap->a_dvp) != 0) return (ENOENT); dd = ap->a_dvp->v_data; de = devfs_newdirent(ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen); de->de_flags = DE_USER; de->de_uid = 0; de->de_gid = 0; de->de_mode = 0755; de->de_inode = alloc_unr(devfs_inos); de->de_dir = dd; de->de_dirent->d_type = DT_LNK; i = strlen(ap->a_target) + 1; de->de_symlink = malloc(i, M_DEVFS, M_WAITOK); bcopy(ap->a_target, de->de_symlink, i); #ifdef MAC mac_devfs_create_symlink(ap->a_cnp->cn_cred, dmp->dm_mount, dd, de); #endif de_covered = devfs_find(dd, de->de_dirent->d_name, de->de_dirent->d_namlen, 0); if (de_covered != NULL) { if ((de_covered->de_flags & DE_USER) != 0) { devfs_delete(dmp, de, DEVFS_DEL_NORECURSE); sx_xunlock(&dmp->dm_lock); return (EEXIST); } KASSERT((de_covered->de_flags & DE_COVERED) == 0, ("devfs_symlink: entry %p already covered", de_covered)); de_covered->de_flags |= DE_COVERED; } de_dotdot = TAILQ_FIRST(&dd->de_dlist); /* "." */ de_dotdot = TAILQ_NEXT(de_dotdot, de_list); /* ".." */ TAILQ_INSERT_AFTER(&dd->de_dlist, de_dotdot, de, de_list); devfs_dir_ref_de(dmp, dd); devfs_rules_apply(dmp, de); return (devfs_allocv(de, ap->a_dvp->v_mount, LK_EXCLUSIVE, ap->a_vpp)); } static int devfs_truncate_f(struct file *fp, off_t length, struct ucred *cred, struct thread *td) { return (vnops.fo_truncate(fp, length, cred, td)); } static int devfs_write_f(struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td) { struct cdev *dev; int error, ioflag, ref; ssize_t resid; struct cdevsw *dsw; struct file *fpop; if (uio->uio_resid > DEVFS_IOSIZE_MAX) return (EINVAL); fpop = td->td_fpop; error = devfs_fp_check(fp, &dev, &dsw, &ref); if (error != 0) { error = vnops.fo_write(fp, uio, cred, flags, td); return (error); } KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td)); ioflag = fp->f_flag & (O_NONBLOCK | O_DIRECT | O_FSYNC); if (ioflag & O_DIRECT) ioflag |= IO_DIRECT; foffset_lock_uio(fp, uio, flags | FOF_NOLOCK); resid = uio->uio_resid; error = dsw->d_write(dev, uio, ioflag); if (uio->uio_resid != resid || (error == 0 && resid != 0)) { devfs_timestamp(&dev->si_ctime); dev->si_mtime = dev->si_ctime; } td->td_fpop = fpop; dev_relthread(dev, ref); foffset_unlock_uio(fp, uio, flags | FOF_NOLOCK | FOF_NEXTOFF); return (error); } +static int +devfs_mmap_f(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size, + vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff, + struct thread *td) +{ + struct cdev *dev; + struct cdevsw *dsw; + struct mount *mp; + struct vnode *vp; + struct file *fpop; + vm_object_t object; + vm_prot_t maxprot; + int error, ref; + + vp = fp->f_vnode; + + /* + * Ensure that file and memory protections are + * compatible. + */ + mp = vp->v_mount; + if (mp != NULL && (mp->mnt_flag & MNT_NOEXEC) != 0) + maxprot = VM_PROT_NONE; + else + maxprot = VM_PROT_EXECUTE; + if ((fp->f_flag & FREAD) != 0) + maxprot |= VM_PROT_READ; + else if ((prot & VM_PROT_READ) != 0) + return (EACCES); + + /* + * Character devices always share mappings, so + * require a writable fd for writable mappings. + */ + if ((fp->f_flag & FWRITE) != 0) + maxprot |= VM_PROT_WRITE; + else if ((prot & VM_PROT_WRITE) != 0) + return (EACCES); + maxprot &= cap_maxprot; + + fpop = td->td_fpop; + error = devfs_fp_check(fp, &dev, &dsw, &ref); + if (error != 0) + return (error); + + error = vm_mmap_cdev(td, size, prot, &maxprot, &flags, dev, dsw, &foff, + &object); + td->td_fpop = fpop; + dev_relthread(dev, ref); + if (error != 0) + return (error); + + error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object, + foff, FALSE, td); + if (error != 0) + vm_object_deallocate(object); + return (error); +} + dev_t dev2udev(struct cdev *x) { if (x == NULL) return (NODEV); return (cdev2priv(x)->cdp_inode); } static struct fileops devfs_ops_f = { .fo_read = devfs_read_f, .fo_write = devfs_write_f, .fo_truncate = devfs_truncate_f, .fo_ioctl = devfs_ioctl_f, .fo_poll = devfs_poll_f, .fo_kqfilter = devfs_kqfilter_f, .fo_stat = devfs_stat_f, .fo_close = devfs_close_f, .fo_chmod = vn_chmod, .fo_chown = vn_chown, .fo_sendfile = vn_sendfile, .fo_seek = vn_seek, .fo_fill_kinfo = vn_fill_kinfo, + .fo_mmap = devfs_mmap_f, .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE }; static struct vop_vector devfs_vnodeops = { .vop_default = &default_vnodeops, .vop_access = devfs_access, .vop_getattr = devfs_getattr, .vop_ioctl = devfs_rioctl, .vop_lookup = devfs_lookup, .vop_mknod = devfs_mknod, .vop_pathconf = devfs_pathconf, .vop_read = devfs_rread, .vop_readdir = devfs_readdir, .vop_readlink = devfs_readlink, .vop_reclaim = devfs_reclaim, .vop_remove = devfs_remove, .vop_revoke = devfs_revoke, .vop_setattr = devfs_setattr, #ifdef MAC .vop_setlabel = devfs_setlabel, #endif .vop_symlink = devfs_symlink, .vop_vptocnp = devfs_vptocnp, }; struct vop_vector devfs_specops = { .vop_default = &default_vnodeops, .vop_access = devfs_access, .vop_bmap = VOP_PANIC, .vop_close = devfs_close, .vop_create = VOP_PANIC, .vop_fsync = devfs_fsync, .vop_getattr = devfs_getattr, .vop_link = VOP_PANIC, .vop_mkdir = VOP_PANIC, .vop_mknod = VOP_PANIC, .vop_open = devfs_open, .vop_pathconf = devfs_pathconf, .vop_poll = dead_poll, .vop_print = devfs_print, .vop_read = dead_read, .vop_readdir = VOP_PANIC, .vop_readlink = VOP_PANIC, .vop_reallocblks = VOP_PANIC, .vop_reclaim = devfs_reclaim, .vop_remove = devfs_remove, .vop_rename = VOP_PANIC, .vop_revoke = devfs_revoke, .vop_rmdir = VOP_PANIC, .vop_setattr = devfs_setattr, #ifdef MAC .vop_setlabel = devfs_setlabel, #endif .vop_strategy = VOP_PANIC, .vop_symlink = VOP_PANIC, .vop_vptocnp = devfs_vptocnp, .vop_write = dead_write, }; /* * Our calling convention to the device drivers used to be that we passed * vnode.h IO_* flags to read()/write(), but we're moving to fcntl.h O_ * flags instead since that's what open(), close() and ioctl() takes and * we don't really want vnode.h in device drivers. * We solved the source compatibility by redefining some vnode flags to * be the same as the fcntl ones and by sending down the bitwise OR of * the respective fcntl/vnode flags. These CTASSERTS make sure nobody * pulls the rug out under this. */ CTASSERT(O_NONBLOCK == IO_NDELAY); CTASSERT(O_FSYNC == IO_SYNC); Index: head/sys/kern/subr_uio.c =================================================================== --- head/sys/kern/subr_uio.c (revision 283997) +++ head/sys/kern/subr_uio.c (revision 283998) @@ -1,570 +1,570 @@ /*- * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Copyright (c) 2014 The FreeBSD Foundation * * Portions of this software were developed by Konstantin Belousov * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_subr.c 8.3 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include SYSCTL_INT(_kern, KERN_IOV_MAX, iov_max, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, UIO_MAXIOV, "Maximum number of elements in an I/O vector; sysconf(_SC_IOV_MAX)"); static int uiomove_faultflag(void *cp, int n, struct uio *uio, int nofault); int copyin_nofault(const void *udaddr, void *kaddr, size_t len) { int error, save; save = vm_fault_disable_pagefaults(); error = copyin(udaddr, kaddr, len); vm_fault_enable_pagefaults(save); return (error); } int copyout_nofault(const void *kaddr, void *udaddr, size_t len) { int error, save; save = vm_fault_disable_pagefaults(); error = copyout(kaddr, udaddr, len); vm_fault_enable_pagefaults(save); return (error); } #define PHYS_PAGE_COUNT(len) (howmany(len, PAGE_SIZE) + 1) int physcopyin(void *src, vm_paddr_t dst, size_t len) { vm_page_t m[PHYS_PAGE_COUNT(len)]; struct iovec iov[1]; struct uio uio; int i; iov[0].iov_base = src; iov[0].iov_len = len; uio.uio_iov = iov; uio.uio_iovcnt = 1; uio.uio_offset = 0; uio.uio_resid = len; uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_WRITE; for (i = 0; i < PHYS_PAGE_COUNT(len); i++, dst += PAGE_SIZE) m[i] = PHYS_TO_VM_PAGE(dst); return (uiomove_fromphys(m, dst & PAGE_MASK, len, &uio)); } int physcopyout(vm_paddr_t src, void *dst, size_t len) { vm_page_t m[PHYS_PAGE_COUNT(len)]; struct iovec iov[1]; struct uio uio; int i; iov[0].iov_base = dst; iov[0].iov_len = len; uio.uio_iov = iov; uio.uio_iovcnt = 1; uio.uio_offset = 0; uio.uio_resid = len; uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_READ; for (i = 0; i < PHYS_PAGE_COUNT(len); i++, src += PAGE_SIZE) m[i] = PHYS_TO_VM_PAGE(src); return (uiomove_fromphys(m, src & PAGE_MASK, len, &uio)); } #undef PHYS_PAGE_COUNT int uiomove(void *cp, int n, struct uio *uio) { return (uiomove_faultflag(cp, n, uio, 0)); } int uiomove_nofault(void *cp, int n, struct uio *uio) { return (uiomove_faultflag(cp, n, uio, 1)); } static int uiomove_faultflag(void *cp, int n, struct uio *uio, int nofault) { struct thread *td; struct iovec *iov; size_t cnt; int error, newflags, save; td = curthread; error = 0; KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE, ("uiomove: mode")); KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == td, ("uiomove proc")); if (!nofault) WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "Calling uiomove()"); /* XXX does it make a sense to set TDP_DEADLKTREAT for UIO_SYSSPACE ? */ newflags = TDP_DEADLKTREAT; if (uio->uio_segflg == UIO_USERSPACE && nofault) { /* * Fail if a non-spurious page fault occurs. */ newflags |= TDP_NOFAULTING | TDP_RESETSPUR; } save = curthread_pflags_set(newflags); while (n > 0 && uio->uio_resid) { iov = uio->uio_iov; cnt = iov->iov_len; if (cnt == 0) { uio->uio_iov++; uio->uio_iovcnt--; continue; } if (cnt > n) cnt = n; switch (uio->uio_segflg) { case UIO_USERSPACE: maybe_yield(); if (uio->uio_rw == UIO_READ) error = copyout(cp, iov->iov_base, cnt); else error = copyin(iov->iov_base, cp, cnt); if (error) goto out; break; case UIO_SYSSPACE: if (uio->uio_rw == UIO_READ) bcopy(cp, iov->iov_base, cnt); else bcopy(iov->iov_base, cp, cnt); break; case UIO_NOCOPY: break; } iov->iov_base = (char *)iov->iov_base + cnt; iov->iov_len -= cnt; uio->uio_resid -= cnt; uio->uio_offset += cnt; cp = (char *)cp + cnt; n -= cnt; } out: curthread_pflags_restore(save); return (error); } /* * Wrapper for uiomove() that validates the arguments against a known-good * kernel buffer. Currently, uiomove accepts a signed (n) argument, which * is almost definitely a bad thing, so we catch that here as well. We * return a runtime failure, but it might be desirable to generate a runtime * assertion failure instead. */ int uiomove_frombuf(void *buf, int buflen, struct uio *uio) { size_t offset, n; if (uio->uio_offset < 0 || uio->uio_resid < 0 || (offset = uio->uio_offset) != uio->uio_offset) return (EINVAL); if (buflen <= 0 || offset >= buflen) return (0); if ((n = buflen - offset) > IOSIZE_MAX) return (EINVAL); return (uiomove((char *)buf + offset, n, uio)); } /* * Give next character to user as result of read. */ int ureadc(int c, struct uio *uio) { struct iovec *iov; char *iov_base; WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "Calling ureadc()"); again: if (uio->uio_iovcnt == 0 || uio->uio_resid == 0) panic("ureadc"); iov = uio->uio_iov; if (iov->iov_len == 0) { uio->uio_iovcnt--; uio->uio_iov++; goto again; } switch (uio->uio_segflg) { case UIO_USERSPACE: if (subyte(iov->iov_base, c) < 0) return (EFAULT); break; case UIO_SYSSPACE: iov_base = iov->iov_base; *iov_base = c; break; case UIO_NOCOPY: break; } iov->iov_base = (char *)iov->iov_base + 1; iov->iov_len--; uio->uio_resid--; uio->uio_offset++; return (0); } int copyinfrom(const void * __restrict src, void * __restrict dst, size_t len, int seg) { int error = 0; switch (seg) { case UIO_USERSPACE: error = copyin(src, dst, len); break; case UIO_SYSSPACE: bcopy(src, dst, len); break; default: panic("copyinfrom: bad seg %d\n", seg); } return (error); } int copyinstrfrom(const void * __restrict src, void * __restrict dst, size_t len, size_t * __restrict copied, int seg) { int error = 0; switch (seg) { case UIO_USERSPACE: error = copyinstr(src, dst, len, copied); break; case UIO_SYSSPACE: error = copystr(src, dst, len, copied); break; default: panic("copyinstrfrom: bad seg %d\n", seg); } return (error); } int copyiniov(const struct iovec *iovp, u_int iovcnt, struct iovec **iov, int error) { u_int iovlen; *iov = NULL; if (iovcnt > UIO_MAXIOV) return (error); iovlen = iovcnt * sizeof (struct iovec); *iov = malloc(iovlen, M_IOV, M_WAITOK); error = copyin(iovp, *iov, iovlen); if (error) { free(*iov, M_IOV); *iov = NULL; } return (error); } int copyinuio(const struct iovec *iovp, u_int iovcnt, struct uio **uiop) { struct iovec *iov; struct uio *uio; u_int iovlen; int error, i; *uiop = NULL; if (iovcnt > UIO_MAXIOV) return (EINVAL); iovlen = iovcnt * sizeof (struct iovec); uio = malloc(iovlen + sizeof *uio, M_IOV, M_WAITOK); iov = (struct iovec *)(uio + 1); error = copyin(iovp, iov, iovlen); if (error) { free(uio, M_IOV); return (error); } uio->uio_iov = iov; uio->uio_iovcnt = iovcnt; uio->uio_segflg = UIO_USERSPACE; uio->uio_offset = -1; uio->uio_resid = 0; for (i = 0; i < iovcnt; i++) { if (iov->iov_len > IOSIZE_MAX - uio->uio_resid) { free(uio, M_IOV); return (EINVAL); } uio->uio_resid += iov->iov_len; iov++; } *uiop = uio; return (0); } struct uio * cloneuio(struct uio *uiop) { struct uio *uio; int iovlen; iovlen = uiop->uio_iovcnt * sizeof (struct iovec); uio = malloc(iovlen + sizeof *uio, M_IOV, M_WAITOK); *uio = *uiop; uio->uio_iov = (struct iovec *)(uio + 1); bcopy(uiop->uio_iov, uio->uio_iov, iovlen); return (uio); } /* * Map some anonymous memory in user space of size sz, rounded up to the page * boundary. */ int copyout_map(struct thread *td, vm_offset_t *addr, size_t sz) { struct vmspace *vms; int error; vm_size_t size; vms = td->td_proc->p_vmspace; /* * Map somewhere after heap in process memory. */ PROC_LOCK(td->td_proc); *addr = round_page((vm_offset_t)vms->vm_daddr + lim_max(td->td_proc, RLIMIT_DATA)); PROC_UNLOCK(td->td_proc); /* round size up to page boundry */ size = (vm_size_t)round_page(sz); - error = vm_mmap(&vms->vm_map, addr, size, PROT_READ | PROT_WRITE, + error = vm_mmap(&vms->vm_map, addr, size, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_ALL, MAP_PRIVATE | MAP_ANON, OBJT_DEFAULT, NULL, 0); return (error); } /* * Unmap memory in user space. */ int copyout_unmap(struct thread *td, vm_offset_t addr, size_t sz) { vm_map_t map; vm_size_t size; if (sz == 0) return (0); map = &td->td_proc->p_vmspace->vm_map; size = (vm_size_t)round_page(sz); if (vm_map_remove(map, addr, addr + size) != KERN_SUCCESS) return (EINVAL); return (0); } #ifdef NO_FUEWORD /* * XXXKIB The temporal implementation of fue*() functions which do not * handle usermode -1 properly, mixing it with the fault code. Keep * this until MD code is written. Currently sparc64, mips and arm do * not have proper implementation. */ int fueword(volatile const void *base, long *val) { long res; res = fuword(base); if (res == -1) return (-1); *val = res; return (0); } int fueword32(volatile const void *base, int32_t *val) { int32_t res; res = fuword32(base); if (res == -1) return (-1); *val = res; return (0); } #ifdef _LP64 int fueword64(volatile const void *base, int64_t *val) { int32_t res; res = fuword64(base); if (res == -1) return (-1); *val = res; return (0); } #endif int casueword32(volatile uint32_t *base, uint32_t oldval, uint32_t *oldvalp, uint32_t newval) { int32_t ov; ov = casuword32(base, oldval, newval); if (ov == -1) return (-1); *oldvalp = ov; return (0); } int casueword(volatile u_long *p, u_long oldval, u_long *oldvalp, u_long newval) { u_long ov; ov = casuword(p, oldval, newval); if (ov == -1) return (-1); *oldvalp = ov; return (0); } #else /* NO_FUEWORD */ int32_t fuword32(volatile const void *addr) { int rv; int32_t val; rv = fueword32(addr, &val); return (rv == -1 ? -1 : val); } #ifdef _LP64 int64_t fuword64(volatile const void *addr) { int rv; int64_t val; rv = fueword64(addr, &val); return (rv == -1 ? -1 : val); } #endif /* _LP64 */ long fuword(volatile const void *addr) { long val; int rv; rv = fueword(addr, &val); return (rv == -1 ? -1 : val); } uint32_t casuword32(volatile uint32_t *addr, uint32_t old, uint32_t new) { int rv; uint32_t val; rv = casueword32(addr, old, &val, new); return (rv == -1 ? -1 : val); } u_long casuword(volatile u_long *addr, u_long old, u_long new) { int rv; u_long val; rv = casueword(addr, old, &val, new); return (rv == -1 ? -1 : val); } #endif /* NO_FUEWORD */ Index: head/sys/kern/uipc_shm.c =================================================================== --- head/sys/kern/uipc_shm.c (revision 283997) +++ head/sys/kern/uipc_shm.c (revision 283998) @@ -1,1060 +1,1088 @@ /*- * Copyright (c) 2006, 2011 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Support for shared swap-backed anonymous memory objects via * shm_open(2) and shm_unlink(2). While most of the implementation is * here, vm_mmap.c contains mapping logic changes. * * TODO: * * (1) Need to export data to a userland tool via a sysctl. Should ipcs(1) * and ipcrm(1) be expanded or should new tools to manage both POSIX * kernel semaphores and POSIX shared memory be written? * * (2) Add support for this file type to fstat(1). * * (3) Resource limits? Does this need its own resource limits or are the * existing limits in mmap(2) sufficient? */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct shm_mapping { char *sm_path; Fnv32_t sm_fnv; struct shmfd *sm_shmfd; LIST_ENTRY(shm_mapping) sm_link; }; static MALLOC_DEFINE(M_SHMFD, "shmfd", "shared memory file descriptor"); static LIST_HEAD(, shm_mapping) *shm_dictionary; static struct sx shm_dict_lock; static struct mtx shm_timestamp_lock; static u_long shm_hash; static struct unrhdr *shm_ino_unr; static dev_t shm_dev_ino; #define SHM_HASH(fnv) (&shm_dictionary[(fnv) & shm_hash]) static int shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags); static struct shmfd *shm_alloc(struct ucred *ucred, mode_t mode); static void shm_init(void *arg); static void shm_drop(struct shmfd *shmfd); static struct shmfd *shm_hold(struct shmfd *shmfd); static void shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd); static struct shmfd *shm_lookup(char *path, Fnv32_t fnv); static int shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred); static int shm_dotruncate(struct shmfd *shmfd, off_t length); static fo_rdwr_t shm_read; static fo_rdwr_t shm_write; static fo_truncate_t shm_truncate; static fo_stat_t shm_stat; static fo_close_t shm_close; static fo_chmod_t shm_chmod; static fo_chown_t shm_chown; static fo_seek_t shm_seek; static fo_fill_kinfo_t shm_fill_kinfo; +static fo_mmap_t shm_mmap; /* File descriptor operations. */ static struct fileops shm_ops = { .fo_read = shm_read, .fo_write = shm_write, .fo_truncate = shm_truncate, .fo_ioctl = invfo_ioctl, .fo_poll = invfo_poll, .fo_kqfilter = invfo_kqfilter, .fo_stat = shm_stat, .fo_close = shm_close, .fo_chmod = shm_chmod, .fo_chown = shm_chown, .fo_sendfile = vn_sendfile, .fo_seek = shm_seek, .fo_fill_kinfo = shm_fill_kinfo, + .fo_mmap = shm_mmap, .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE }; FEATURE(posix_shm, "POSIX shared memory"); static int uiomove_object_page(vm_object_t obj, size_t len, struct uio *uio) { vm_page_t m; vm_pindex_t idx; size_t tlen; int error, offset, rv; idx = OFF_TO_IDX(uio->uio_offset); offset = uio->uio_offset & PAGE_MASK; tlen = MIN(PAGE_SIZE - offset, len); VM_OBJECT_WLOCK(obj); /* * Read I/O without either a corresponding resident page or swap * page: use zero_region. This is intended to avoid instantiating * pages on read from a sparse region. */ if (uio->uio_rw == UIO_READ && vm_page_lookup(obj, idx) == NULL && !vm_pager_has_page(obj, idx, NULL, NULL)) { VM_OBJECT_WUNLOCK(obj); return (uiomove(__DECONST(void *, zero_region), tlen, uio)); } /* * Parallel reads of the page content from disk are prevented * by exclusive busy. * * Although the tmpfs vnode lock is held here, it is * nonetheless safe to sleep waiting for a free page. The * pageout daemon does not need to acquire the tmpfs vnode * lock to page out tobj's pages because tobj is a OBJT_SWAP * type object. */ m = vm_page_grab(obj, idx, VM_ALLOC_NORMAL); if (m->valid != VM_PAGE_BITS_ALL) { if (vm_pager_has_page(obj, idx, NULL, NULL)) { rv = vm_pager_get_pages(obj, &m, 1, 0); m = vm_page_lookup(obj, idx); if (m == NULL) { printf( "uiomove_object: vm_obj %p idx %jd null lookup rv %d\n", obj, idx, rv); VM_OBJECT_WUNLOCK(obj); return (EIO); } if (rv != VM_PAGER_OK) { printf( "uiomove_object: vm_obj %p idx %jd valid %x pager error %d\n", obj, idx, m->valid, rv); vm_page_lock(m); vm_page_free(m); vm_page_unlock(m); VM_OBJECT_WUNLOCK(obj); return (EIO); } } else vm_page_zero_invalid(m, TRUE); } vm_page_xunbusy(m); vm_page_lock(m); vm_page_hold(m); if (m->queue == PQ_NONE) { vm_page_deactivate(m); } else { /* Requeue to maintain LRU ordering. */ vm_page_requeue(m); } vm_page_unlock(m); VM_OBJECT_WUNLOCK(obj); error = uiomove_fromphys(&m, offset, tlen, uio); if (uio->uio_rw == UIO_WRITE && error == 0) { VM_OBJECT_WLOCK(obj); vm_page_dirty(m); vm_pager_page_unswapped(m); VM_OBJECT_WUNLOCK(obj); } vm_page_lock(m); vm_page_unhold(m); vm_page_unlock(m); return (error); } int uiomove_object(vm_object_t obj, off_t obj_size, struct uio *uio) { ssize_t resid; size_t len; int error; error = 0; while ((resid = uio->uio_resid) > 0) { if (obj_size <= uio->uio_offset) break; len = MIN(obj_size - uio->uio_offset, resid); if (len == 0) break; error = uiomove_object_page(obj, len, uio); if (error != 0 || resid == uio->uio_resid) break; } return (error); } static int shm_seek(struct file *fp, off_t offset, int whence, struct thread *td) { struct shmfd *shmfd; off_t foffset; int error; shmfd = fp->f_data; foffset = foffset_lock(fp, 0); error = 0; switch (whence) { case L_INCR: if (foffset < 0 || (offset > 0 && foffset > OFF_MAX - offset)) { error = EOVERFLOW; break; } offset += foffset; break; case L_XTND: if (offset > 0 && shmfd->shm_size > OFF_MAX - offset) { error = EOVERFLOW; break; } offset += shmfd->shm_size; break; case L_SET: break; default: error = EINVAL; } if (error == 0) { if (offset < 0 || offset > shmfd->shm_size) error = EINVAL; else td->td_uretoff.tdu_off = offset; } foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0); return (error); } static int shm_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct shmfd *shmfd; void *rl_cookie; int error; shmfd = fp->f_data; foffset_lock_uio(fp, uio, flags); rl_cookie = rangelock_rlock(&shmfd->shm_rl, uio->uio_offset, uio->uio_offset + uio->uio_resid, &shmfd->shm_mtx); #ifdef MAC error = mac_posixshm_check_read(active_cred, fp->f_cred, shmfd); if (error) return (error); #endif error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio); rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); foffset_unlock_uio(fp, uio, flags); return (error); } static int shm_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct shmfd *shmfd; void *rl_cookie; int error; shmfd = fp->f_data; #ifdef MAC error = mac_posixshm_check_write(active_cred, fp->f_cred, shmfd); if (error) return (error); #endif foffset_lock_uio(fp, uio, flags); if ((flags & FOF_OFFSET) == 0) { rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX, &shmfd->shm_mtx); } else { rl_cookie = rangelock_wlock(&shmfd->shm_rl, uio->uio_offset, uio->uio_offset + uio->uio_resid, &shmfd->shm_mtx); } error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio); rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); foffset_unlock_uio(fp, uio, flags); return (error); } static int shm_truncate(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td) { struct shmfd *shmfd; #ifdef MAC int error; #endif shmfd = fp->f_data; #ifdef MAC error = mac_posixshm_check_truncate(active_cred, fp->f_cred, shmfd); if (error) return (error); #endif return (shm_dotruncate(shmfd, length)); } static int shm_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, struct thread *td) { struct shmfd *shmfd; #ifdef MAC int error; #endif shmfd = fp->f_data; #ifdef MAC error = mac_posixshm_check_stat(active_cred, fp->f_cred, shmfd); if (error) return (error); #endif /* * Attempt to return sanish values for fstat() on a memory file * descriptor. */ bzero(sb, sizeof(*sb)); sb->st_blksize = PAGE_SIZE; sb->st_size = shmfd->shm_size; sb->st_blocks = (sb->st_size + sb->st_blksize - 1) / sb->st_blksize; mtx_lock(&shm_timestamp_lock); sb->st_atim = shmfd->shm_atime; sb->st_ctim = shmfd->shm_ctime; sb->st_mtim = shmfd->shm_mtime; sb->st_birthtim = shmfd->shm_birthtime; sb->st_mode = S_IFREG | shmfd->shm_mode; /* XXX */ sb->st_uid = shmfd->shm_uid; sb->st_gid = shmfd->shm_gid; mtx_unlock(&shm_timestamp_lock); sb->st_dev = shm_dev_ino; sb->st_ino = shmfd->shm_ino; return (0); } static int shm_close(struct file *fp, struct thread *td) { struct shmfd *shmfd; shmfd = fp->f_data; fp->f_data = NULL; shm_drop(shmfd); return (0); } static int shm_dotruncate(struct shmfd *shmfd, off_t length) { vm_object_t object; vm_page_t m, ma[1]; vm_pindex_t idx, nobjsize; vm_ooffset_t delta; int base, rv; object = shmfd->shm_object; VM_OBJECT_WLOCK(object); if (length == shmfd->shm_size) { VM_OBJECT_WUNLOCK(object); return (0); } nobjsize = OFF_TO_IDX(length + PAGE_MASK); /* Are we shrinking? If so, trim the end. */ if (length < shmfd->shm_size) { /* * Disallow any requests to shrink the size if this * object is mapped into the kernel. */ if (shmfd->shm_kmappings > 0) { VM_OBJECT_WUNLOCK(object); return (EBUSY); } /* * Zero the truncated part of the last page. */ base = length & PAGE_MASK; if (base != 0) { idx = OFF_TO_IDX(length); retry: m = vm_page_lookup(object, idx); if (m != NULL) { if (vm_page_sleep_if_busy(m, "shmtrc")) goto retry; } else if (vm_pager_has_page(object, idx, NULL, NULL)) { m = vm_page_alloc(object, idx, VM_ALLOC_NORMAL); if (m == NULL) { VM_OBJECT_WUNLOCK(object); VM_WAIT; VM_OBJECT_WLOCK(object); goto retry; } else if (m->valid != VM_PAGE_BITS_ALL) { ma[0] = m; rv = vm_pager_get_pages(object, ma, 1, 0); m = vm_page_lookup(object, idx); } else /* A cached page was reactivated. */ rv = VM_PAGER_OK; vm_page_lock(m); if (rv == VM_PAGER_OK) { vm_page_deactivate(m); vm_page_unlock(m); vm_page_xunbusy(m); } else { vm_page_free(m); vm_page_unlock(m); VM_OBJECT_WUNLOCK(object); return (EIO); } } if (m != NULL) { pmap_zero_page_area(m, base, PAGE_SIZE - base); KASSERT(m->valid == VM_PAGE_BITS_ALL, ("shm_dotruncate: page %p is invalid", m)); vm_page_dirty(m); vm_pager_page_unswapped(m); } } delta = ptoa(object->size - nobjsize); /* Toss in memory pages. */ if (nobjsize < object->size) vm_object_page_remove(object, nobjsize, object->size, 0); /* Toss pages from swap. */ if (object->type == OBJT_SWAP) swap_pager_freespace(object, nobjsize, delta); /* Free the swap accounted for shm */ swap_release_by_cred(delta, object->cred); object->charge -= delta; } else { /* Attempt to reserve the swap */ delta = ptoa(nobjsize - object->size); if (!swap_reserve_by_cred(delta, object->cred)) { VM_OBJECT_WUNLOCK(object); return (ENOMEM); } object->charge += delta; } shmfd->shm_size = length; mtx_lock(&shm_timestamp_lock); vfs_timestamp(&shmfd->shm_ctime); shmfd->shm_mtime = shmfd->shm_ctime; mtx_unlock(&shm_timestamp_lock); object->size = nobjsize; VM_OBJECT_WUNLOCK(object); return (0); } /* * shmfd object management including creation and reference counting * routines. */ static struct shmfd * shm_alloc(struct ucred *ucred, mode_t mode) { struct shmfd *shmfd; int ino; shmfd = malloc(sizeof(*shmfd), M_SHMFD, M_WAITOK | M_ZERO); shmfd->shm_size = 0; shmfd->shm_uid = ucred->cr_uid; shmfd->shm_gid = ucred->cr_gid; shmfd->shm_mode = mode; shmfd->shm_object = vm_pager_allocate(OBJT_DEFAULT, NULL, shmfd->shm_size, VM_PROT_DEFAULT, 0, ucred); KASSERT(shmfd->shm_object != NULL, ("shm_create: vm_pager_allocate")); shmfd->shm_object->pg_color = 0; VM_OBJECT_WLOCK(shmfd->shm_object); vm_object_clear_flag(shmfd->shm_object, OBJ_ONEMAPPING); vm_object_set_flag(shmfd->shm_object, OBJ_COLORED | OBJ_NOSPLIT); VM_OBJECT_WUNLOCK(shmfd->shm_object); vfs_timestamp(&shmfd->shm_birthtime); shmfd->shm_atime = shmfd->shm_mtime = shmfd->shm_ctime = shmfd->shm_birthtime; ino = alloc_unr(shm_ino_unr); if (ino == -1) shmfd->shm_ino = 0; else shmfd->shm_ino = ino; refcount_init(&shmfd->shm_refs, 1); mtx_init(&shmfd->shm_mtx, "shmrl", NULL, MTX_DEF); rangelock_init(&shmfd->shm_rl); #ifdef MAC mac_posixshm_init(shmfd); mac_posixshm_create(ucred, shmfd); #endif return (shmfd); } static struct shmfd * shm_hold(struct shmfd *shmfd) { refcount_acquire(&shmfd->shm_refs); return (shmfd); } static void shm_drop(struct shmfd *shmfd) { if (refcount_release(&shmfd->shm_refs)) { #ifdef MAC mac_posixshm_destroy(shmfd); #endif rangelock_destroy(&shmfd->shm_rl); mtx_destroy(&shmfd->shm_mtx); vm_object_deallocate(shmfd->shm_object); if (shmfd->shm_ino != 0) free_unr(shm_ino_unr, shmfd->shm_ino); free(shmfd, M_SHMFD); } } /* * Determine if the credentials have sufficient permissions for a * specified combination of FREAD and FWRITE. */ static int shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags) { accmode_t accmode; int error; accmode = 0; if (flags & FREAD) accmode |= VREAD; if (flags & FWRITE) accmode |= VWRITE; mtx_lock(&shm_timestamp_lock); error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid, shmfd->shm_gid, accmode, ucred, NULL); mtx_unlock(&shm_timestamp_lock); return (error); } /* * Dictionary management. We maintain an in-kernel dictionary to map * paths to shmfd objects. We use the FNV hash on the path to store * the mappings in a hash table. */ static void shm_init(void *arg) { mtx_init(&shm_timestamp_lock, "shm timestamps", NULL, MTX_DEF); sx_init(&shm_dict_lock, "shm dictionary"); shm_dictionary = hashinit(1024, M_SHMFD, &shm_hash); shm_ino_unr = new_unrhdr(1, INT32_MAX, NULL); KASSERT(shm_ino_unr != NULL, ("shm fake inodes not initialized")); shm_dev_ino = devfs_alloc_cdp_inode(); KASSERT(shm_dev_ino > 0, ("shm dev inode not initialized")); } SYSINIT(shm_init, SI_SUB_SYSV_SHM, SI_ORDER_ANY, shm_init, NULL); static struct shmfd * shm_lookup(char *path, Fnv32_t fnv) { struct shm_mapping *map; LIST_FOREACH(map, SHM_HASH(fnv), sm_link) { if (map->sm_fnv != fnv) continue; if (strcmp(map->sm_path, path) == 0) return (map->sm_shmfd); } return (NULL); } static void shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd) { struct shm_mapping *map; map = malloc(sizeof(struct shm_mapping), M_SHMFD, M_WAITOK); map->sm_path = path; map->sm_fnv = fnv; map->sm_shmfd = shm_hold(shmfd); shmfd->shm_path = path; LIST_INSERT_HEAD(SHM_HASH(fnv), map, sm_link); } static int shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred) { struct shm_mapping *map; int error; LIST_FOREACH(map, SHM_HASH(fnv), sm_link) { if (map->sm_fnv != fnv) continue; if (strcmp(map->sm_path, path) == 0) { #ifdef MAC error = mac_posixshm_check_unlink(ucred, map->sm_shmfd); if (error) return (error); #endif error = shm_access(map->sm_shmfd, ucred, FREAD | FWRITE); if (error) return (error); map->sm_shmfd->shm_path = NULL; LIST_REMOVE(map, sm_link); shm_drop(map->sm_shmfd); free(map->sm_path, M_SHMFD); free(map, M_SHMFD); return (0); } } return (ENOENT); } /* System calls. */ int sys_shm_open(struct thread *td, struct shm_open_args *uap) { struct filedesc *fdp; struct shmfd *shmfd; struct file *fp; char *path; Fnv32_t fnv; mode_t cmode; int fd, error; #ifdef CAPABILITY_MODE /* * shm_open(2) is only allowed for anonymous objects. */ if (IN_CAPABILITY_MODE(td) && (uap->path != SHM_ANON)) return (ECAPMODE); #endif if ((uap->flags & O_ACCMODE) != O_RDONLY && (uap->flags & O_ACCMODE) != O_RDWR) return (EINVAL); if ((uap->flags & ~(O_ACCMODE | O_CREAT | O_EXCL | O_TRUNC | O_CLOEXEC)) != 0) return (EINVAL); fdp = td->td_proc->p_fd; cmode = (uap->mode & ~fdp->fd_cmask) & ACCESSPERMS; error = falloc(td, &fp, &fd, O_CLOEXEC); if (error) return (error); /* A SHM_ANON path pointer creates an anonymous object. */ if (uap->path == SHM_ANON) { /* A read-only anonymous object is pointless. */ if ((uap->flags & O_ACCMODE) == O_RDONLY) { fdclose(td, fp, fd); fdrop(fp, td); return (EINVAL); } shmfd = shm_alloc(td->td_ucred, cmode); } else { path = malloc(MAXPATHLEN, M_SHMFD, M_WAITOK); error = copyinstr(uap->path, path, MAXPATHLEN, NULL); #ifdef KTRACE if (error == 0 && KTRPOINT(curthread, KTR_NAMEI)) ktrnamei(path); #endif /* Require paths to start with a '/' character. */ if (error == 0 && path[0] != '/') error = EINVAL; if (error) { fdclose(td, fp, fd); fdrop(fp, td); free(path, M_SHMFD); return (error); } fnv = fnv_32_str(path, FNV1_32_INIT); sx_xlock(&shm_dict_lock); shmfd = shm_lookup(path, fnv); if (shmfd == NULL) { /* Object does not yet exist, create it if requested. */ if (uap->flags & O_CREAT) { #ifdef MAC error = mac_posixshm_check_create(td->td_ucred, path); if (error == 0) { #endif shmfd = shm_alloc(td->td_ucred, cmode); shm_insert(path, fnv, shmfd); #ifdef MAC } #endif } else { free(path, M_SHMFD); error = ENOENT; } } else { /* * Object already exists, obtain a new * reference if requested and permitted. */ free(path, M_SHMFD); if ((uap->flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) error = EEXIST; else { #ifdef MAC error = mac_posixshm_check_open(td->td_ucred, shmfd, FFLAGS(uap->flags & O_ACCMODE)); if (error == 0) #endif error = shm_access(shmfd, td->td_ucred, FFLAGS(uap->flags & O_ACCMODE)); } /* * Truncate the file back to zero length if * O_TRUNC was specified and the object was * opened with read/write. */ if (error == 0 && (uap->flags & (O_ACCMODE | O_TRUNC)) == (O_RDWR | O_TRUNC)) { #ifdef MAC error = mac_posixshm_check_truncate( td->td_ucred, fp->f_cred, shmfd); if (error == 0) #endif shm_dotruncate(shmfd, 0); } if (error == 0) shm_hold(shmfd); } sx_xunlock(&shm_dict_lock); if (error) { fdclose(td, fp, fd); fdrop(fp, td); return (error); } } finit(fp, FFLAGS(uap->flags & O_ACCMODE), DTYPE_SHM, shmfd, &shm_ops); td->td_retval[0] = fd; fdrop(fp, td); return (0); } int sys_shm_unlink(struct thread *td, struct shm_unlink_args *uap) { char *path; Fnv32_t fnv; int error; path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); error = copyinstr(uap->path, path, MAXPATHLEN, NULL); if (error) { free(path, M_TEMP); return (error); } #ifdef KTRACE if (KTRPOINT(curthread, KTR_NAMEI)) ktrnamei(path); #endif fnv = fnv_32_str(path, FNV1_32_INIT); sx_xlock(&shm_dict_lock); error = shm_remove(path, fnv, td->td_ucred); sx_xunlock(&shm_dict_lock); free(path, M_TEMP); return (error); } -/* - * mmap() helper to validate mmap() requests against shm object state - * and give mmap() the vm_object to use for the mapping. - */ int -shm_mmap(struct shmfd *shmfd, vm_size_t objsize, vm_ooffset_t foff, - vm_object_t *obj) +shm_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t objsize, + vm_prot_t prot, vm_prot_t cap_maxprot, int flags, + vm_ooffset_t foff, struct thread *td) { + struct shmfd *shmfd; + vm_prot_t maxprot; + int error; + shmfd = fp->f_data; + maxprot = VM_PROT_NONE; + + /* FREAD should always be set. */ + if ((fp->f_flag & FREAD) != 0) + maxprot |= VM_PROT_EXECUTE | VM_PROT_READ; + if ((fp->f_flag & FWRITE) != 0) + maxprot |= VM_PROT_WRITE; + + /* Don't permit shared writable mappings on read-only descriptors. */ + if ((flags & MAP_SHARED) != 0 && + (maxprot & VM_PROT_WRITE) == 0 && + (prot & VM_PROT_WRITE) != 0) + return (EACCES); + maxprot &= cap_maxprot; + +#ifdef MAC + error = mac_posixshm_check_mmap(td->td_ucred, shmfd, prot, flags); + if (error != 0) + return (error); +#endif + /* * XXXRW: This validation is probably insufficient, and subject to * sign errors. It should be fixed. */ if (foff >= shmfd->shm_size || foff + objsize > round_page(shmfd->shm_size)) return (EINVAL); mtx_lock(&shm_timestamp_lock); vfs_timestamp(&shmfd->shm_atime); mtx_unlock(&shm_timestamp_lock); vm_object_reference(shmfd->shm_object); - *obj = shmfd->shm_object; + + error = vm_mmap_object(map, addr, objsize, prot, maxprot, flags, + shmfd->shm_object, foff, FALSE, td); + if (error != 0) + vm_object_deallocate(shmfd->shm_object); return (0); } static int shm_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { struct shmfd *shmfd; int error; error = 0; shmfd = fp->f_data; mtx_lock(&shm_timestamp_lock); /* * SUSv4 says that x bits of permission need not be affected. * Be consistent with our shm_open there. */ #ifdef MAC error = mac_posixshm_check_setmode(active_cred, shmfd, mode); if (error != 0) goto out; #endif error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid, shmfd->shm_gid, VADMIN, active_cred, NULL); if (error != 0) goto out; shmfd->shm_mode = mode & ACCESSPERMS; out: mtx_unlock(&shm_timestamp_lock); return (error); } static int shm_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { struct shmfd *shmfd; int error; error = 0; shmfd = fp->f_data; mtx_lock(&shm_timestamp_lock); #ifdef MAC error = mac_posixshm_check_setowner(active_cred, shmfd, uid, gid); if (error != 0) goto out; #endif if (uid == (uid_t)-1) uid = shmfd->shm_uid; if (gid == (gid_t)-1) gid = shmfd->shm_gid; if (((uid != shmfd->shm_uid && uid != active_cred->cr_uid) || (gid != shmfd->shm_gid && !groupmember(gid, active_cred))) && (error = priv_check_cred(active_cred, PRIV_VFS_CHOWN, 0))) goto out; shmfd->shm_uid = uid; shmfd->shm_gid = gid; out: mtx_unlock(&shm_timestamp_lock); return (error); } /* * Helper routines to allow the backing object of a shared memory file * descriptor to be mapped in the kernel. */ int shm_map(struct file *fp, size_t size, off_t offset, void **memp) { struct shmfd *shmfd; vm_offset_t kva, ofs; vm_object_t obj; int rv; if (fp->f_type != DTYPE_SHM) return (EINVAL); shmfd = fp->f_data; obj = shmfd->shm_object; VM_OBJECT_WLOCK(obj); /* * XXXRW: This validation is probably insufficient, and subject to * sign errors. It should be fixed. */ if (offset >= shmfd->shm_size || offset + size > round_page(shmfd->shm_size)) { VM_OBJECT_WUNLOCK(obj); return (EINVAL); } shmfd->shm_kmappings++; vm_object_reference_locked(obj); VM_OBJECT_WUNLOCK(obj); /* Map the object into the kernel_map and wire it. */ kva = vm_map_min(kernel_map); ofs = offset & PAGE_MASK; offset = trunc_page(offset); size = round_page(size + ofs); rv = vm_map_find(kernel_map, obj, offset, &kva, size, 0, VMFS_OPTIMAL_SPACE, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0); if (rv == KERN_SUCCESS) { rv = vm_map_wire(kernel_map, kva, kva + size, VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES); if (rv == KERN_SUCCESS) { *memp = (void *)(kva + ofs); return (0); } vm_map_remove(kernel_map, kva, kva + size); } else vm_object_deallocate(obj); /* On failure, drop our mapping reference. */ VM_OBJECT_WLOCK(obj); shmfd->shm_kmappings--; VM_OBJECT_WUNLOCK(obj); return (vm_mmap_to_errno(rv)); } /* * We require the caller to unmap the entire entry. This allows us to * safely decrement shm_kmappings when a mapping is removed. */ int shm_unmap(struct file *fp, void *mem, size_t size) { struct shmfd *shmfd; vm_map_entry_t entry; vm_offset_t kva, ofs; vm_object_t obj; vm_pindex_t pindex; vm_prot_t prot; boolean_t wired; vm_map_t map; int rv; if (fp->f_type != DTYPE_SHM) return (EINVAL); shmfd = fp->f_data; kva = (vm_offset_t)mem; ofs = kva & PAGE_MASK; kva = trunc_page(kva); size = round_page(size + ofs); map = kernel_map; rv = vm_map_lookup(&map, kva, VM_PROT_READ | VM_PROT_WRITE, &entry, &obj, &pindex, &prot, &wired); if (rv != KERN_SUCCESS) return (EINVAL); if (entry->start != kva || entry->end != kva + size) { vm_map_lookup_done(map, entry); return (EINVAL); } vm_map_lookup_done(map, entry); if (obj != shmfd->shm_object) return (EINVAL); vm_map_remove(map, kva, kva + size); VM_OBJECT_WLOCK(obj); KASSERT(shmfd->shm_kmappings > 0, ("shm_unmap: object not mapped")); shmfd->shm_kmappings--; VM_OBJECT_WUNLOCK(obj); return (0); } static int shm_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { struct shmfd *shmfd; kif->kf_type = KF_TYPE_SHM; shmfd = fp->f_data; mtx_lock(&shm_timestamp_lock); kif->kf_un.kf_file.kf_file_mode = S_IFREG | shmfd->shm_mode; /* XXX */ mtx_unlock(&shm_timestamp_lock); kif->kf_un.kf_file.kf_file_size = shmfd->shm_size; if (shmfd->shm_path != NULL) { sx_slock(&shm_dict_lock); if (shmfd->shm_path != NULL) strlcpy(kif->kf_path, shmfd->shm_path, sizeof(kif->kf_path)); sx_sunlock(&shm_dict_lock); } return (0); } Index: head/sys/kern/vfs_vnops.c =================================================================== --- head/sys/kern/vfs_vnops.c (revision 283997) +++ head/sys/kern/vfs_vnops.c (revision 283998) @@ -1,2364 +1,2460 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Copyright (c) 2012 Konstantin Belousov * Copyright (c) 2013, 2014 The FreeBSD Foundation * * Portions of this software were developed by Konstantin Belousov * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_vnops.c 8.2 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include static fo_rdwr_t vn_read; static fo_rdwr_t vn_write; static fo_rdwr_t vn_io_fault; static fo_truncate_t vn_truncate; static fo_ioctl_t vn_ioctl; static fo_poll_t vn_poll; static fo_kqfilter_t vn_kqfilter; static fo_stat_t vn_statfile; static fo_close_t vn_closefile; +static fo_mmap_t vn_mmap; struct fileops vnops = { .fo_read = vn_io_fault, .fo_write = vn_io_fault, .fo_truncate = vn_truncate, .fo_ioctl = vn_ioctl, .fo_poll = vn_poll, .fo_kqfilter = vn_kqfilter, .fo_stat = vn_statfile, .fo_close = vn_closefile, .fo_chmod = vn_chmod, .fo_chown = vn_chown, .fo_sendfile = vn_sendfile, .fo_seek = vn_seek, .fo_fill_kinfo = vn_fill_kinfo, + .fo_mmap = vn_mmap, .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE }; static const int io_hold_cnt = 16; static int vn_io_fault_enable = 1; SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_enable, CTLFLAG_RW, &vn_io_fault_enable, 0, "Enable vn_io_fault lock avoidance"); static u_long vn_io_faults_cnt; SYSCTL_ULONG(_debug, OID_AUTO, vn_io_faults, CTLFLAG_RD, &vn_io_faults_cnt, 0, "Count of vn_io_fault lock avoidance triggers"); /* * Returns true if vn_io_fault mode of handling the i/o request should * be used. */ static bool do_vn_io_fault(struct vnode *vp, struct uio *uio) { struct mount *mp; return (uio->uio_segflg == UIO_USERSPACE && vp->v_type == VREG && (mp = vp->v_mount) != NULL && (mp->mnt_kern_flag & MNTK_NO_IOPF) != 0 && vn_io_fault_enable); } /* * Structure used to pass arguments to vn_io_fault1(), to do either * file- or vnode-based I/O calls. */ struct vn_io_fault_args { enum { VN_IO_FAULT_FOP, VN_IO_FAULT_VOP } kind; struct ucred *cred; int flags; union { struct fop_args_tag { struct file *fp; fo_rdwr_t *doio; } fop_args; struct vop_args_tag { struct vnode *vp; } vop_args; } args; }; static int vn_io_fault1(struct vnode *vp, struct uio *uio, struct vn_io_fault_args *args, struct thread *td); int vn_open(ndp, flagp, cmode, fp) struct nameidata *ndp; int *flagp, cmode; struct file *fp; { struct thread *td = ndp->ni_cnd.cn_thread; return (vn_open_cred(ndp, flagp, cmode, 0, td->td_ucred, fp)); } /* * Common code for vnode open operations via a name lookup. * Lookup the vnode and invoke VOP_CREATE if needed. * Check permissions, and call the VOP_OPEN or VOP_CREATE routine. * * Note that this does NOT free nameidata for the successful case, * due to the NDINIT being done elsewhere. */ int vn_open_cred(struct nameidata *ndp, int *flagp, int cmode, u_int vn_open_flags, struct ucred *cred, struct file *fp) { struct vnode *vp; struct mount *mp; struct thread *td = ndp->ni_cnd.cn_thread; struct vattr vat; struct vattr *vap = &vat; int fmode, error; restart: fmode = *flagp; if (fmode & O_CREAT) { ndp->ni_cnd.cn_nameiop = CREATE; /* * Set NOCACHE to avoid flushing the cache when * rolling in many files at once. */ ndp->ni_cnd.cn_flags = ISOPEN | LOCKPARENT | LOCKLEAF | NOCACHE; if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0) ndp->ni_cnd.cn_flags |= FOLLOW; if (!(vn_open_flags & VN_OPEN_NOAUDIT)) ndp->ni_cnd.cn_flags |= AUDITVNODE1; if (vn_open_flags & VN_OPEN_NOCAPCHECK) ndp->ni_cnd.cn_flags |= NOCAPCHECK; bwillwrite(); if ((error = namei(ndp)) != 0) return (error); if (ndp->ni_vp == NULL) { VATTR_NULL(vap); vap->va_type = VREG; vap->va_mode = cmode; if (fmode & O_EXCL) vap->va_vaflags |= VA_EXCLUSIVE; if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(ndp, NDF_ONLY_PNBUF); vput(ndp->ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } if ((vn_open_flags & VN_OPEN_NAMECACHE) != 0) ndp->ni_cnd.cn_flags |= MAKEENTRY; #ifdef MAC error = mac_vnode_check_create(cred, ndp->ni_dvp, &ndp->ni_cnd, vap); if (error == 0) #endif error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp, &ndp->ni_cnd, vap); vput(ndp->ni_dvp); vn_finished_write(mp); if (error) { NDFREE(ndp, NDF_ONLY_PNBUF); return (error); } fmode &= ~O_TRUNC; vp = ndp->ni_vp; } else { if (ndp->ni_dvp == ndp->ni_vp) vrele(ndp->ni_dvp); else vput(ndp->ni_dvp); ndp->ni_dvp = NULL; vp = ndp->ni_vp; if (fmode & O_EXCL) { error = EEXIST; goto bad; } fmode &= ~O_CREAT; } } else { ndp->ni_cnd.cn_nameiop = LOOKUP; ndp->ni_cnd.cn_flags = ISOPEN | ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) | LOCKLEAF; if (!(fmode & FWRITE)) ndp->ni_cnd.cn_flags |= LOCKSHARED; if (!(vn_open_flags & VN_OPEN_NOAUDIT)) ndp->ni_cnd.cn_flags |= AUDITVNODE1; if (vn_open_flags & VN_OPEN_NOCAPCHECK) ndp->ni_cnd.cn_flags |= NOCAPCHECK; if ((error = namei(ndp)) != 0) return (error); vp = ndp->ni_vp; } error = vn_open_vnode(vp, fmode, cred, td, fp); if (error) goto bad; *flagp = fmode; return (0); bad: NDFREE(ndp, NDF_ONLY_PNBUF); vput(vp); *flagp = fmode; ndp->ni_vp = NULL; return (error); } /* * Common code for vnode open operations once a vnode is located. * Check permissions, and call the VOP_OPEN routine. */ int vn_open_vnode(struct vnode *vp, int fmode, struct ucred *cred, struct thread *td, struct file *fp) { struct mount *mp; accmode_t accmode; struct flock lf; int error, have_flock, lock_flags, type; if (vp->v_type == VLNK) return (EMLINK); if (vp->v_type == VSOCK) return (EOPNOTSUPP); if (vp->v_type != VDIR && fmode & O_DIRECTORY) return (ENOTDIR); accmode = 0; if (fmode & (FWRITE | O_TRUNC)) { if (vp->v_type == VDIR) return (EISDIR); accmode |= VWRITE; } if (fmode & FREAD) accmode |= VREAD; if (fmode & FEXEC) accmode |= VEXEC; if ((fmode & O_APPEND) && (fmode & FWRITE)) accmode |= VAPPEND; #ifdef MAC if (fmode & O_CREAT) accmode |= VCREAT; if (fmode & O_VERIFY) accmode |= VVERIFY; error = mac_vnode_check_open(cred, vp, accmode); if (error) return (error); accmode &= ~(VCREAT | VVERIFY); #endif if ((fmode & O_CREAT) == 0) { if (accmode & VWRITE) { error = vn_writechk(vp); if (error) return (error); } if (accmode) { error = VOP_ACCESS(vp, accmode, cred, td); if (error) return (error); } } if (vp->v_type == VFIFO && VOP_ISLOCKED(vp) != LK_EXCLUSIVE) vn_lock(vp, LK_UPGRADE | LK_RETRY); if ((error = VOP_OPEN(vp, fmode, cred, td, fp)) != 0) return (error); if (fmode & (O_EXLOCK | O_SHLOCK)) { KASSERT(fp != NULL, ("open with flock requires fp")); lock_flags = VOP_ISLOCKED(vp); VOP_UNLOCK(vp, 0); lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; if (fmode & O_EXLOCK) lf.l_type = F_WRLCK; else lf.l_type = F_RDLCK; type = F_FLOCK; if ((fmode & FNONBLOCK) == 0) type |= F_WAIT; error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type); have_flock = (error == 0); vn_lock(vp, lock_flags | LK_RETRY); if (error == 0 && vp->v_iflag & VI_DOOMED) error = ENOENT; /* * Another thread might have used this vnode as an * executable while the vnode lock was dropped. * Ensure the vnode is still able to be opened for * writing after the lock has been obtained. */ if (error == 0 && accmode & VWRITE) error = vn_writechk(vp); if (error) { VOP_UNLOCK(vp, 0); if (have_flock) { lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; (void) VOP_ADVLOCK(vp, fp, F_UNLCK, &lf, F_FLOCK); } vn_start_write(vp, &mp, V_WAIT); vn_lock(vp, lock_flags | LK_RETRY); (void)VOP_CLOSE(vp, fmode, cred, td); vn_finished_write(mp); /* Prevent second close from fdrop()->vn_close(). */ if (fp != NULL) fp->f_ops= &badfileops; return (error); } fp->f_flag |= FHASLOCK; } if (fmode & FWRITE) { VOP_ADD_WRITECOUNT(vp, 1); CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d", __func__, vp, vp->v_writecount); } ASSERT_VOP_LOCKED(vp, "vn_open_vnode"); return (0); } /* * Check for write permissions on the specified vnode. * Prototype text segments cannot be written. */ int vn_writechk(vp) register struct vnode *vp; { ASSERT_VOP_LOCKED(vp, "vn_writechk"); /* * If there's shared text associated with * the vnode, try to free it up once. If * we fail, we can't allow writing. */ if (VOP_IS_TEXT(vp)) return (ETXTBSY); return (0); } /* * Vnode close call */ int vn_close(vp, flags, file_cred, td) register struct vnode *vp; int flags; struct ucred *file_cred; struct thread *td; { struct mount *mp; int error, lock_flags; if (vp->v_type != VFIFO && (flags & FWRITE) == 0 && MNT_EXTENDED_SHARED(vp->v_mount)) lock_flags = LK_SHARED; else lock_flags = LK_EXCLUSIVE; vn_start_write(vp, &mp, V_WAIT); vn_lock(vp, lock_flags | LK_RETRY); if (flags & FWRITE) { VNASSERT(vp->v_writecount > 0, vp, ("vn_close: negative writecount")); VOP_ADD_WRITECOUNT(vp, -1); CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d", __func__, vp, vp->v_writecount); } error = VOP_CLOSE(vp, flags, file_cred, td); vput(vp); vn_finished_write(mp); return (error); } /* * Heuristic to detect sequential operation. */ static int sequential_heuristic(struct uio *uio, struct file *fp) { ASSERT_VOP_LOCKED(fp->f_vnode, __func__); if (fp->f_flag & FRDAHEAD) return (fp->f_seqcount << IO_SEQSHIFT); /* * Offset 0 is handled specially. open() sets f_seqcount to 1 so * that the first I/O is normally considered to be slightly * sequential. Seeking to offset 0 doesn't change sequentiality * unless previous seeks have reduced f_seqcount to 0, in which * case offset 0 is not special. */ if ((uio->uio_offset == 0 && fp->f_seqcount > 0) || uio->uio_offset == fp->f_nextoff) { /* * f_seqcount is in units of fixed-size blocks so that it * depends mainly on the amount of sequential I/O and not * much on the number of sequential I/O's. The fixed size * of 16384 is hard-coded here since it is (not quite) just * a magic size that works well here. This size is more * closely related to the best I/O size for real disks than * to any block size used by software. */ fp->f_seqcount += howmany(uio->uio_resid, 16384); if (fp->f_seqcount > IO_SEQMAX) fp->f_seqcount = IO_SEQMAX; return (fp->f_seqcount << IO_SEQSHIFT); } /* Not sequential. Quickly draw-down sequentiality. */ if (fp->f_seqcount > 1) fp->f_seqcount = 1; else fp->f_seqcount = 0; return (0); } /* * Package up an I/O request on a vnode into a uio and do it. */ int vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset, enum uio_seg segflg, int ioflg, struct ucred *active_cred, struct ucred *file_cred, ssize_t *aresid, struct thread *td) { struct uio auio; struct iovec aiov; struct mount *mp; struct ucred *cred; void *rl_cookie; struct vn_io_fault_args args; int error, lock_flags; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; aiov.iov_base = base; aiov.iov_len = len; auio.uio_resid = len; auio.uio_offset = offset; auio.uio_segflg = segflg; auio.uio_rw = rw; auio.uio_td = td; error = 0; if ((ioflg & IO_NODELOCKED) == 0) { if ((ioflg & IO_RANGELOCKED) == 0) { if (rw == UIO_READ) { rl_cookie = vn_rangelock_rlock(vp, offset, offset + len); } else { rl_cookie = vn_rangelock_wlock(vp, offset, offset + len); } } else rl_cookie = NULL; mp = NULL; if (rw == UIO_WRITE) { if (vp->v_type != VCHR && (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) goto out; if (MNT_SHARED_WRITES(mp) || ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) lock_flags = LK_SHARED; else lock_flags = LK_EXCLUSIVE; } else lock_flags = LK_SHARED; vn_lock(vp, lock_flags | LK_RETRY); } else rl_cookie = NULL; ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); #ifdef MAC if ((ioflg & IO_NOMACCHECK) == 0) { if (rw == UIO_READ) error = mac_vnode_check_read(active_cred, file_cred, vp); else error = mac_vnode_check_write(active_cred, file_cred, vp); } #endif if (error == 0) { if (file_cred != NULL) cred = file_cred; else cred = active_cred; if (do_vn_io_fault(vp, &auio)) { args.kind = VN_IO_FAULT_VOP; args.cred = cred; args.flags = ioflg; args.args.vop_args.vp = vp; error = vn_io_fault1(vp, &auio, &args, td); } else if (rw == UIO_READ) { error = VOP_READ(vp, &auio, ioflg, cred); } else /* if (rw == UIO_WRITE) */ { error = VOP_WRITE(vp, &auio, ioflg, cred); } } if (aresid) *aresid = auio.uio_resid; else if (auio.uio_resid && error == 0) error = EIO; if ((ioflg & IO_NODELOCKED) == 0) { VOP_UNLOCK(vp, 0); if (mp != NULL) vn_finished_write(mp); } out: if (rl_cookie != NULL) vn_rangelock_unlock(vp, rl_cookie); return (error); } /* * Package up an I/O request on a vnode into a uio and do it. The I/O * request is split up into smaller chunks and we try to avoid saturating * the buffer cache while potentially holding a vnode locked, so we * check bwillwrite() before calling vn_rdwr(). We also call kern_yield() * to give other processes a chance to lock the vnode (either other processes * core'ing the same binary, or unrelated processes scanning the directory). */ int vn_rdwr_inchunks(rw, vp, base, len, offset, segflg, ioflg, active_cred, file_cred, aresid, td) enum uio_rw rw; struct vnode *vp; void *base; size_t len; off_t offset; enum uio_seg segflg; int ioflg; struct ucred *active_cred; struct ucred *file_cred; size_t *aresid; struct thread *td; { int error = 0; ssize_t iaresid; do { int chunk; /* * Force `offset' to a multiple of MAXBSIZE except possibly * for the first chunk, so that filesystems only need to * write full blocks except possibly for the first and last * chunks. */ chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE; if (chunk > len) chunk = len; if (rw != UIO_READ && vp->v_type == VREG) bwillwrite(); iaresid = 0; error = vn_rdwr(rw, vp, base, chunk, offset, segflg, ioflg, active_cred, file_cred, &iaresid, td); len -= chunk; /* aresid calc already includes length */ if (error) break; offset += chunk; base = (char *)base + chunk; kern_yield(PRI_USER); } while (len); if (aresid) *aresid = len + iaresid; return (error); } off_t foffset_lock(struct file *fp, int flags) { struct mtx *mtxp; off_t res; KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed")); #if OFF_MAX <= LONG_MAX /* * Caller only wants the current f_offset value. Assume that * the long and shorter integer types reads are atomic. */ if ((flags & FOF_NOLOCK) != 0) return (fp->f_offset); #endif /* * According to McKusick the vn lock was protecting f_offset here. * It is now protected by the FOFFSET_LOCKED flag. */ mtxp = mtx_pool_find(mtxpool_sleep, fp); mtx_lock(mtxp); if ((flags & FOF_NOLOCK) == 0) { while (fp->f_vnread_flags & FOFFSET_LOCKED) { fp->f_vnread_flags |= FOFFSET_LOCK_WAITING; msleep(&fp->f_vnread_flags, mtxp, PUSER -1, "vofflock", 0); } fp->f_vnread_flags |= FOFFSET_LOCKED; } res = fp->f_offset; mtx_unlock(mtxp); return (res); } void foffset_unlock(struct file *fp, off_t val, int flags) { struct mtx *mtxp; KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed")); #if OFF_MAX <= LONG_MAX if ((flags & FOF_NOLOCK) != 0) { if ((flags & FOF_NOUPDATE) == 0) fp->f_offset = val; if ((flags & FOF_NEXTOFF) != 0) fp->f_nextoff = val; return; } #endif mtxp = mtx_pool_find(mtxpool_sleep, fp); mtx_lock(mtxp); if ((flags & FOF_NOUPDATE) == 0) fp->f_offset = val; if ((flags & FOF_NEXTOFF) != 0) fp->f_nextoff = val; if ((flags & FOF_NOLOCK) == 0) { KASSERT((fp->f_vnread_flags & FOFFSET_LOCKED) != 0, ("Lost FOFFSET_LOCKED")); if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING) wakeup(&fp->f_vnread_flags); fp->f_vnread_flags = 0; } mtx_unlock(mtxp); } void foffset_lock_uio(struct file *fp, struct uio *uio, int flags) { if ((flags & FOF_OFFSET) == 0) uio->uio_offset = foffset_lock(fp, flags); } void foffset_unlock_uio(struct file *fp, struct uio *uio, int flags) { if ((flags & FOF_OFFSET) == 0) foffset_unlock(fp, uio->uio_offset, flags); } static int get_advice(struct file *fp, struct uio *uio) { struct mtx *mtxp; int ret; ret = POSIX_FADV_NORMAL; if (fp->f_advice == NULL) return (ret); mtxp = mtx_pool_find(mtxpool_sleep, fp); mtx_lock(mtxp); if (uio->uio_offset >= fp->f_advice->fa_start && uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end) ret = fp->f_advice->fa_advice; mtx_unlock(mtxp); return (ret); } /* * File table vnode read routine. */ static int vn_read(fp, uio, active_cred, flags, td) struct file *fp; struct uio *uio; struct ucred *active_cred; int flags; struct thread *td; { struct vnode *vp; struct mtx *mtxp; int error, ioflag; int advice; off_t offset, start, end; KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td)); KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET")); vp = fp->f_vnode; ioflag = 0; if (fp->f_flag & FNONBLOCK) ioflag |= IO_NDELAY; if (fp->f_flag & O_DIRECT) ioflag |= IO_DIRECT; advice = get_advice(fp, uio); vn_lock(vp, LK_SHARED | LK_RETRY); switch (advice) { case POSIX_FADV_NORMAL: case POSIX_FADV_SEQUENTIAL: case POSIX_FADV_NOREUSE: ioflag |= sequential_heuristic(uio, fp); break; case POSIX_FADV_RANDOM: /* Disable read-ahead for random I/O. */ break; } offset = uio->uio_offset; #ifdef MAC error = mac_vnode_check_read(active_cred, fp->f_cred, vp); if (error == 0) #endif error = VOP_READ(vp, uio, ioflag, fp->f_cred); fp->f_nextoff = uio->uio_offset; VOP_UNLOCK(vp, 0); if (error == 0 && advice == POSIX_FADV_NOREUSE && offset != uio->uio_offset) { /* * Use POSIX_FADV_DONTNEED to flush clean pages and * buffers for the backing file after a * POSIX_FADV_NOREUSE read(2). To optimize the common * case of using POSIX_FADV_NOREUSE with sequential * access, track the previous implicit DONTNEED * request and grow this request to include the * current read(2) in addition to the previous * DONTNEED. With purely sequential access this will * cause the DONTNEED requests to continously grow to * cover all of the previously read regions of the * file. This allows filesystem blocks that are * accessed by multiple calls to read(2) to be flushed * once the last read(2) finishes. */ start = offset; end = uio->uio_offset - 1; mtxp = mtx_pool_find(mtxpool_sleep, fp); mtx_lock(mtxp); if (fp->f_advice != NULL && fp->f_advice->fa_advice == POSIX_FADV_NOREUSE) { if (start != 0 && fp->f_advice->fa_prevend + 1 == start) start = fp->f_advice->fa_prevstart; else if (fp->f_advice->fa_prevstart != 0 && fp->f_advice->fa_prevstart == end + 1) end = fp->f_advice->fa_prevend; fp->f_advice->fa_prevstart = start; fp->f_advice->fa_prevend = end; } mtx_unlock(mtxp); error = VOP_ADVISE(vp, start, end, POSIX_FADV_DONTNEED); } return (error); } /* * File table vnode write routine. */ static int vn_write(fp, uio, active_cred, flags, td) struct file *fp; struct uio *uio; struct ucred *active_cred; int flags; struct thread *td; { struct vnode *vp; struct mount *mp; struct mtx *mtxp; int error, ioflag, lock_flags; int advice; off_t offset, start, end; KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td)); KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET")); vp = fp->f_vnode; if (vp->v_type == VREG) bwillwrite(); ioflag = IO_UNIT; if (vp->v_type == VREG && (fp->f_flag & O_APPEND)) ioflag |= IO_APPEND; if (fp->f_flag & FNONBLOCK) ioflag |= IO_NDELAY; if (fp->f_flag & O_DIRECT) ioflag |= IO_DIRECT; if ((fp->f_flag & O_FSYNC) || (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS))) ioflag |= IO_SYNC; mp = NULL; if (vp->v_type != VCHR && (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) goto unlock; advice = get_advice(fp, uio); if (MNT_SHARED_WRITES(mp) || (mp == NULL && MNT_SHARED_WRITES(vp->v_mount))) { lock_flags = LK_SHARED; } else { lock_flags = LK_EXCLUSIVE; } vn_lock(vp, lock_flags | LK_RETRY); switch (advice) { case POSIX_FADV_NORMAL: case POSIX_FADV_SEQUENTIAL: case POSIX_FADV_NOREUSE: ioflag |= sequential_heuristic(uio, fp); break; case POSIX_FADV_RANDOM: /* XXX: Is this correct? */ break; } offset = uio->uio_offset; #ifdef MAC error = mac_vnode_check_write(active_cred, fp->f_cred, vp); if (error == 0) #endif error = VOP_WRITE(vp, uio, ioflag, fp->f_cred); fp->f_nextoff = uio->uio_offset; VOP_UNLOCK(vp, 0); if (vp->v_type != VCHR) vn_finished_write(mp); if (error == 0 && advice == POSIX_FADV_NOREUSE && offset != uio->uio_offset) { /* * Use POSIX_FADV_DONTNEED to flush clean pages and * buffers for the backing file after a * POSIX_FADV_NOREUSE write(2). To optimize the * common case of using POSIX_FADV_NOREUSE with * sequential access, track the previous implicit * DONTNEED request and grow this request to include * the current write(2) in addition to the previous * DONTNEED. With purely sequential access this will * cause the DONTNEED requests to continously grow to * cover all of the previously written regions of the * file. * * Note that the blocks just written are almost * certainly still dirty, so this only works when * VOP_ADVISE() calls from subsequent writes push out * the data written by this write(2) once the backing * buffers are clean. However, as compared to forcing * IO_DIRECT, this gives much saner behavior. Write * clustering is still allowed, and clean pages are * merely moved to the cache page queue rather than * outright thrown away. This means a subsequent * read(2) can still avoid hitting the disk if the * pages have not been reclaimed. * * This does make POSIX_FADV_NOREUSE largely useless * with non-sequential access. However, sequential * access is the more common use case and the flag is * merely advisory. */ start = offset; end = uio->uio_offset - 1; mtxp = mtx_pool_find(mtxpool_sleep, fp); mtx_lock(mtxp); if (fp->f_advice != NULL && fp->f_advice->fa_advice == POSIX_FADV_NOREUSE) { if (start != 0 && fp->f_advice->fa_prevend + 1 == start) start = fp->f_advice->fa_prevstart; else if (fp->f_advice->fa_prevstart != 0 && fp->f_advice->fa_prevstart == end + 1) end = fp->f_advice->fa_prevend; fp->f_advice->fa_prevstart = start; fp->f_advice->fa_prevend = end; } mtx_unlock(mtxp); error = VOP_ADVISE(vp, start, end, POSIX_FADV_DONTNEED); } unlock: return (error); } /* * The vn_io_fault() is a wrapper around vn_read() and vn_write() to * prevent the following deadlock: * * Assume that the thread A reads from the vnode vp1 into userspace * buffer buf1 backed by the pages of vnode vp2. If a page in buf1 is * currently not resident, then system ends up with the call chain * vn_read() -> VOP_READ(vp1) -> uiomove() -> [Page Fault] -> * vm_fault(buf1) -> vnode_pager_getpages(vp2) -> VOP_GETPAGES(vp2) * which establishes lock order vp1->vn_lock, then vp2->vn_lock. * If, at the same time, thread B reads from vnode vp2 into buffer buf2 * backed by the pages of vnode vp1, and some page in buf2 is not * resident, we get a reversed order vp2->vn_lock, then vp1->vn_lock. * * To prevent the lock order reversal and deadlock, vn_io_fault() does * not allow page faults to happen during VOP_READ() or VOP_WRITE(). * Instead, it first tries to do the whole range i/o with pagefaults * disabled. If all pages in the i/o buffer are resident and mapped, * VOP will succeed (ignoring the genuine filesystem errors). * Otherwise, we get back EFAULT, and vn_io_fault() falls back to do * i/o in chunks, with all pages in the chunk prefaulted and held * using vm_fault_quick_hold_pages(). * * Filesystems using this deadlock avoidance scheme should use the * array of the held pages from uio, saved in the curthread->td_ma, * instead of doing uiomove(). A helper function * vn_io_fault_uiomove() converts uiomove request into * uiomove_fromphys() over td_ma array. * * Since vnode locks do not cover the whole i/o anymore, rangelocks * make the current i/o request atomic with respect to other i/os and * truncations. */ /* * Decode vn_io_fault_args and perform the corresponding i/o. */ static int vn_io_fault_doio(struct vn_io_fault_args *args, struct uio *uio, struct thread *td) { switch (args->kind) { case VN_IO_FAULT_FOP: return ((args->args.fop_args.doio)(args->args.fop_args.fp, uio, args->cred, args->flags, td)); case VN_IO_FAULT_VOP: if (uio->uio_rw == UIO_READ) { return (VOP_READ(args->args.vop_args.vp, uio, args->flags, args->cred)); } else if (uio->uio_rw == UIO_WRITE) { return (VOP_WRITE(args->args.vop_args.vp, uio, args->flags, args->cred)); } break; } panic("vn_io_fault_doio: unknown kind of io %d %d", args->kind, uio->uio_rw); } /* * Common code for vn_io_fault(), agnostic to the kind of i/o request. * Uses vn_io_fault_doio() to make the call to an actual i/o function. * Used from vn_rdwr() and vn_io_fault(), which encode the i/o request * into args and call vn_io_fault1() to handle faults during the user * mode buffer accesses. */ static int vn_io_fault1(struct vnode *vp, struct uio *uio, struct vn_io_fault_args *args, struct thread *td) { vm_page_t ma[io_hold_cnt + 2]; struct uio *uio_clone, short_uio; struct iovec short_iovec[1]; vm_page_t *prev_td_ma; vm_prot_t prot; vm_offset_t addr, end; size_t len, resid; ssize_t adv; int error, cnt, save, saveheld, prev_td_ma_cnt; prot = uio->uio_rw == UIO_READ ? VM_PROT_WRITE : VM_PROT_READ; /* * The UFS follows IO_UNIT directive and replays back both * uio_offset and uio_resid if an error is encountered during the * operation. But, since the iovec may be already advanced, * uio is still in an inconsistent state. * * Cache a copy of the original uio, which is advanced to the redo * point using UIO_NOCOPY below. */ uio_clone = cloneuio(uio); resid = uio->uio_resid; short_uio.uio_segflg = UIO_USERSPACE; short_uio.uio_rw = uio->uio_rw; short_uio.uio_td = uio->uio_td; save = vm_fault_disable_pagefaults(); error = vn_io_fault_doio(args, uio, td); if (error != EFAULT) goto out; atomic_add_long(&vn_io_faults_cnt, 1); uio_clone->uio_segflg = UIO_NOCOPY; uiomove(NULL, resid - uio->uio_resid, uio_clone); uio_clone->uio_segflg = uio->uio_segflg; saveheld = curthread_pflags_set(TDP_UIOHELD); prev_td_ma = td->td_ma; prev_td_ma_cnt = td->td_ma_cnt; while (uio_clone->uio_resid != 0) { len = uio_clone->uio_iov->iov_len; if (len == 0) { KASSERT(uio_clone->uio_iovcnt >= 1, ("iovcnt underflow")); uio_clone->uio_iov++; uio_clone->uio_iovcnt--; continue; } if (len > io_hold_cnt * PAGE_SIZE) len = io_hold_cnt * PAGE_SIZE; addr = (uintptr_t)uio_clone->uio_iov->iov_base; end = round_page(addr + len); if (end < addr) { error = EFAULT; break; } cnt = atop(end - trunc_page(addr)); /* * A perfectly misaligned address and length could cause * both the start and the end of the chunk to use partial * page. +2 accounts for such a situation. */ cnt = vm_fault_quick_hold_pages(&td->td_proc->p_vmspace->vm_map, addr, len, prot, ma, io_hold_cnt + 2); if (cnt == -1) { error = EFAULT; break; } short_uio.uio_iov = &short_iovec[0]; short_iovec[0].iov_base = (void *)addr; short_uio.uio_iovcnt = 1; short_uio.uio_resid = short_iovec[0].iov_len = len; short_uio.uio_offset = uio_clone->uio_offset; td->td_ma = ma; td->td_ma_cnt = cnt; error = vn_io_fault_doio(args, &short_uio, td); vm_page_unhold_pages(ma, cnt); adv = len - short_uio.uio_resid; uio_clone->uio_iov->iov_base = (char *)uio_clone->uio_iov->iov_base + adv; uio_clone->uio_iov->iov_len -= adv; uio_clone->uio_resid -= adv; uio_clone->uio_offset += adv; uio->uio_resid -= adv; uio->uio_offset += adv; if (error != 0 || adv == 0) break; } td->td_ma = prev_td_ma; td->td_ma_cnt = prev_td_ma_cnt; curthread_pflags_restore(saveheld); out: vm_fault_enable_pagefaults(save); free(uio_clone, M_IOV); return (error); } static int vn_io_fault(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { fo_rdwr_t *doio; struct vnode *vp; void *rl_cookie; struct vn_io_fault_args args; int error; doio = uio->uio_rw == UIO_READ ? vn_read : vn_write; vp = fp->f_vnode; foffset_lock_uio(fp, uio, flags); if (do_vn_io_fault(vp, uio)) { args.kind = VN_IO_FAULT_FOP; args.args.fop_args.fp = fp; args.args.fop_args.doio = doio; args.cred = active_cred; args.flags = flags | FOF_OFFSET; if (uio->uio_rw == UIO_READ) { rl_cookie = vn_rangelock_rlock(vp, uio->uio_offset, uio->uio_offset + uio->uio_resid); } else if ((fp->f_flag & O_APPEND) != 0 || (flags & FOF_OFFSET) == 0) { /* For appenders, punt and lock the whole range. */ rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX); } else { rl_cookie = vn_rangelock_wlock(vp, uio->uio_offset, uio->uio_offset + uio->uio_resid); } error = vn_io_fault1(vp, uio, &args, td); vn_rangelock_unlock(vp, rl_cookie); } else { error = doio(fp, uio, active_cred, flags | FOF_OFFSET, td); } foffset_unlock_uio(fp, uio, flags); return (error); } /* * Helper function to perform the requested uiomove operation using * the held pages for io->uio_iov[0].iov_base buffer instead of * copyin/copyout. Access to the pages with uiomove_fromphys() * instead of iov_base prevents page faults that could occur due to * pmap_collect() invalidating the mapping created by * vm_fault_quick_hold_pages(), or pageout daemon, page laundry or * object cleanup revoking the write access from page mappings. * * Filesystems specified MNTK_NO_IOPF shall use vn_io_fault_uiomove() * instead of plain uiomove(). */ int vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio) { struct uio transp_uio; struct iovec transp_iov[1]; struct thread *td; size_t adv; int error, pgadv; td = curthread; if ((td->td_pflags & TDP_UIOHELD) == 0 || uio->uio_segflg != UIO_USERSPACE) return (uiomove(data, xfersize, uio)); KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt)); transp_iov[0].iov_base = data; transp_uio.uio_iov = &transp_iov[0]; transp_uio.uio_iovcnt = 1; if (xfersize > uio->uio_resid) xfersize = uio->uio_resid; transp_uio.uio_resid = transp_iov[0].iov_len = xfersize; transp_uio.uio_offset = 0; transp_uio.uio_segflg = UIO_SYSSPACE; /* * Since transp_iov points to data, and td_ma page array * corresponds to original uio->uio_iov, we need to invert the * direction of the i/o operation as passed to * uiomove_fromphys(). */ switch (uio->uio_rw) { case UIO_WRITE: transp_uio.uio_rw = UIO_READ; break; case UIO_READ: transp_uio.uio_rw = UIO_WRITE; break; } transp_uio.uio_td = uio->uio_td; error = uiomove_fromphys(td->td_ma, ((vm_offset_t)uio->uio_iov->iov_base) & PAGE_MASK, xfersize, &transp_uio); adv = xfersize - transp_uio.uio_resid; pgadv = (((vm_offset_t)uio->uio_iov->iov_base + adv) >> PAGE_SHIFT) - (((vm_offset_t)uio->uio_iov->iov_base) >> PAGE_SHIFT); td->td_ma += pgadv; KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt, pgadv)); td->td_ma_cnt -= pgadv; uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + adv; uio->uio_iov->iov_len -= adv; uio->uio_resid -= adv; uio->uio_offset += adv; return (error); } int vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize, struct uio *uio) { struct thread *td; vm_offset_t iov_base; int cnt, pgadv; td = curthread; if ((td->td_pflags & TDP_UIOHELD) == 0 || uio->uio_segflg != UIO_USERSPACE) return (uiomove_fromphys(ma, offset, xfersize, uio)); KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt)); cnt = xfersize > uio->uio_resid ? uio->uio_resid : xfersize; iov_base = (vm_offset_t)uio->uio_iov->iov_base; switch (uio->uio_rw) { case UIO_WRITE: pmap_copy_pages(td->td_ma, iov_base & PAGE_MASK, ma, offset, cnt); break; case UIO_READ: pmap_copy_pages(ma, offset, td->td_ma, iov_base & PAGE_MASK, cnt); break; } pgadv = ((iov_base + cnt) >> PAGE_SHIFT) - (iov_base >> PAGE_SHIFT); td->td_ma += pgadv; KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt, pgadv)); td->td_ma_cnt -= pgadv; uio->uio_iov->iov_base = (char *)(iov_base + cnt); uio->uio_iov->iov_len -= cnt; uio->uio_resid -= cnt; uio->uio_offset += cnt; return (0); } /* * File table truncate routine. */ static int vn_truncate(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td) { struct vattr vattr; struct mount *mp; struct vnode *vp; void *rl_cookie; int error; vp = fp->f_vnode; /* * Lock the whole range for truncation. Otherwise split i/o * might happen partly before and partly after the truncation. */ rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX); error = vn_start_write(vp, &mp, V_WAIT | PCATCH); if (error) goto out1; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (vp->v_type == VDIR) { error = EISDIR; goto out; } #ifdef MAC error = mac_vnode_check_write(active_cred, fp->f_cred, vp); if (error) goto out; #endif error = vn_writechk(vp); if (error == 0) { VATTR_NULL(&vattr); vattr.va_size = length; error = VOP_SETATTR(vp, &vattr, fp->f_cred); } out: VOP_UNLOCK(vp, 0); vn_finished_write(mp); out1: vn_rangelock_unlock(vp, rl_cookie); return (error); } /* * File table vnode stat routine. */ static int vn_statfile(fp, sb, active_cred, td) struct file *fp; struct stat *sb; struct ucred *active_cred; struct thread *td; { struct vnode *vp = fp->f_vnode; int error; vn_lock(vp, LK_SHARED | LK_RETRY); error = vn_stat(vp, sb, active_cred, fp->f_cred, td); VOP_UNLOCK(vp, 0); return (error); } /* * Stat a vnode; implementation for the stat syscall */ int vn_stat(vp, sb, active_cred, file_cred, td) struct vnode *vp; register struct stat *sb; struct ucred *active_cred; struct ucred *file_cred; struct thread *td; { struct vattr vattr; register struct vattr *vap; int error; u_short mode; #ifdef MAC error = mac_vnode_check_stat(active_cred, file_cred, vp); if (error) return (error); #endif vap = &vattr; /* * Initialize defaults for new and unusual fields, so that file * systems which don't support these fields don't need to know * about them. */ vap->va_birthtime.tv_sec = -1; vap->va_birthtime.tv_nsec = 0; vap->va_fsid = VNOVAL; vap->va_rdev = NODEV; error = VOP_GETATTR(vp, vap, active_cred); if (error) return (error); /* * Zero the spare stat fields */ bzero(sb, sizeof *sb); /* * Copy from vattr table */ if (vap->va_fsid != VNOVAL) sb->st_dev = vap->va_fsid; else sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0]; sb->st_ino = vap->va_fileid; mode = vap->va_mode; switch (vap->va_type) { case VREG: mode |= S_IFREG; break; case VDIR: mode |= S_IFDIR; break; case VBLK: mode |= S_IFBLK; break; case VCHR: mode |= S_IFCHR; break; case VLNK: mode |= S_IFLNK; break; case VSOCK: mode |= S_IFSOCK; break; case VFIFO: mode |= S_IFIFO; break; default: return (EBADF); }; sb->st_mode = mode; sb->st_nlink = vap->va_nlink; sb->st_uid = vap->va_uid; sb->st_gid = vap->va_gid; sb->st_rdev = vap->va_rdev; if (vap->va_size > OFF_MAX) return (EOVERFLOW); sb->st_size = vap->va_size; sb->st_atim = vap->va_atime; sb->st_mtim = vap->va_mtime; sb->st_ctim = vap->va_ctime; sb->st_birthtim = vap->va_birthtime; /* * According to www.opengroup.org, the meaning of st_blksize is * "a filesystem-specific preferred I/O block size for this * object. In some filesystem types, this may vary from file * to file" * Use miminum/default of PAGE_SIZE (e.g. for VCHR). */ sb->st_blksize = max(PAGE_SIZE, vap->va_blocksize); sb->st_flags = vap->va_flags; if (priv_check(td, PRIV_VFS_GENERATION)) sb->st_gen = 0; else sb->st_gen = vap->va_gen; sb->st_blocks = vap->va_bytes / S_BLKSIZE; return (0); } /* * File table vnode ioctl routine. */ static int vn_ioctl(fp, com, data, active_cred, td) struct file *fp; u_long com; void *data; struct ucred *active_cred; struct thread *td; { struct vattr vattr; struct vnode *vp; int error; vp = fp->f_vnode; switch (vp->v_type) { case VDIR: case VREG: switch (com) { case FIONREAD: vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_GETATTR(vp, &vattr, active_cred); VOP_UNLOCK(vp, 0); if (error == 0) *(int *)data = vattr.va_size - fp->f_offset; return (error); case FIONBIO: case FIOASYNC: return (0); default: return (VOP_IOCTL(vp, com, data, fp->f_flag, active_cred, td)); } default: return (ENOTTY); } } /* * File table vnode poll routine. */ static int vn_poll(fp, events, active_cred, td) struct file *fp; int events; struct ucred *active_cred; struct thread *td; { struct vnode *vp; int error; vp = fp->f_vnode; #ifdef MAC vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); error = mac_vnode_check_poll(active_cred, fp->f_cred, vp); VOP_UNLOCK(vp, 0); if (!error) #endif error = VOP_POLL(vp, events, fp->f_cred, td); return (error); } /* * Acquire the requested lock and then check for validity. LK_RETRY * permits vn_lock to return doomed vnodes. */ int _vn_lock(struct vnode *vp, int flags, char *file, int line) { int error; VNASSERT((flags & LK_TYPE_MASK) != 0, vp, ("vn_lock called with no locktype.")); do { #ifdef DEBUG_VFS_LOCKS KASSERT(vp->v_holdcnt != 0, ("vn_lock %p: zero hold count", vp)); #endif error = VOP_LOCK1(vp, flags, file, line); flags &= ~LK_INTERLOCK; /* Interlock is always dropped. */ KASSERT((flags & LK_RETRY) == 0 || error == 0, ("LK_RETRY set with incompatible flags (0x%x) or an error occured (%d)", flags, error)); /* * Callers specify LK_RETRY if they wish to get dead vnodes. * If RETRY is not set, we return ENOENT instead. */ if (error == 0 && vp->v_iflag & VI_DOOMED && (flags & LK_RETRY) == 0) { VOP_UNLOCK(vp, 0); error = ENOENT; break; } } while (flags & LK_RETRY && error != 0); return (error); } /* * File table vnode close routine. */ static int vn_closefile(fp, td) struct file *fp; struct thread *td; { struct vnode *vp; struct flock lf; int error; vp = fp->f_vnode; fp->f_ops = &badfileops; if (fp->f_type == DTYPE_VNODE && fp->f_flag & FHASLOCK) vref(vp); error = vn_close(vp, fp->f_flag, fp->f_cred, td); if (fp->f_type == DTYPE_VNODE && fp->f_flag & FHASLOCK) { lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; (void) VOP_ADVLOCK(vp, fp, F_UNLCK, &lf, F_FLOCK); vrele(vp); } return (error); } static bool vn_suspendable_mp(struct mount *mp) { return ((mp->mnt_kern_flag & MNTK_SUSPENDABLE) != 0); } static bool vn_suspendable(struct vnode *vp, struct mount **mpp) { if (vp != NULL) *mpp = vp->v_mount; if (*mpp == NULL) return (false); return (vn_suspendable_mp(*mpp)); } /* * Preparing to start a filesystem write operation. If the operation is * permitted, then we bump the count of operations in progress and * proceed. If a suspend request is in progress, we wait until the * suspension is over, and then proceed. */ static int vn_start_write_locked(struct mount *mp, int flags) { int error, mflags; mtx_assert(MNT_MTX(mp), MA_OWNED); error = 0; /* * Check on status of suspension. */ if ((curthread->td_pflags & TDP_IGNSUSP) == 0 || mp->mnt_susp_owner != curthread) { mflags = ((mp->mnt_vfc->vfc_flags & VFCF_SBDRY) != 0 ? (flags & PCATCH) : 0) | (PUSER - 1); while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) { if (flags & V_NOWAIT) { error = EWOULDBLOCK; goto unlock; } error = msleep(&mp->mnt_flag, MNT_MTX(mp), mflags, "suspfs", 0); if (error) goto unlock; } } if (flags & V_XSLEEP) goto unlock; mp->mnt_writeopcount++; unlock: if (error != 0 || (flags & V_XSLEEP) != 0) MNT_REL(mp); MNT_IUNLOCK(mp); return (error); } int vn_start_write(struct vnode *vp, struct mount **mpp, int flags) { struct mount *mp; int error; KASSERT((flags & V_MNTREF) == 0 || (*mpp != NULL && vp == NULL), ("V_MNTREF requires mp")); if (!vn_suspendable(vp, mpp)) { if ((flags & V_MNTREF) != 0) vfs_rel(*mpp); return (0); } error = 0; /* * If a vnode is provided, get and return the mount point that * to which it will write. */ if (vp != NULL) { if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) { *mpp = NULL; if (error != EOPNOTSUPP) return (error); return (0); } } if ((mp = *mpp) == NULL) return (0); /* * VOP_GETWRITEMOUNT() returns with the mp refcount held through * a vfs_ref(). * As long as a vnode is not provided we need to acquire a * refcount for the provided mountpoint too, in order to * emulate a vfs_ref(). */ MNT_ILOCK(mp); if (vp == NULL && (flags & V_MNTREF) == 0) MNT_REF(mp); return (vn_start_write_locked(mp, flags)); } /* * Secondary suspension. Used by operations such as vop_inactive * routines that are needed by the higher level functions. These * are allowed to proceed until all the higher level functions have * completed (indicated by mnt_writeopcount dropping to zero). At that * time, these operations are halted until the suspension is over. */ int vn_start_secondary_write(struct vnode *vp, struct mount **mpp, int flags) { struct mount *mp; int error; KASSERT((flags & V_MNTREF) == 0 || (*mpp != NULL && vp == NULL), ("V_MNTREF requires mp")); if (!vn_suspendable(vp, mpp)) { if ((flags & V_MNTREF) != 0) vfs_rel(*mpp); return (0); } retry: if (vp != NULL) { if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) { *mpp = NULL; if (error != EOPNOTSUPP) return (error); return (0); } } /* * If we are not suspended or have not yet reached suspended * mode, then let the operation proceed. */ if ((mp = *mpp) == NULL) return (0); /* * VOP_GETWRITEMOUNT() returns with the mp refcount held through * a vfs_ref(). * As long as a vnode is not provided we need to acquire a * refcount for the provided mountpoint too, in order to * emulate a vfs_ref(). */ MNT_ILOCK(mp); if (vp == NULL && (flags & V_MNTREF) == 0) MNT_REF(mp); if ((mp->mnt_kern_flag & (MNTK_SUSPENDED | MNTK_SUSPEND2)) == 0) { mp->mnt_secondary_writes++; mp->mnt_secondary_accwrites++; MNT_IUNLOCK(mp); return (0); } if (flags & V_NOWAIT) { MNT_REL(mp); MNT_IUNLOCK(mp); return (EWOULDBLOCK); } /* * Wait for the suspension to finish. */ error = msleep(&mp->mnt_flag, MNT_MTX(mp), (PUSER - 1) | PDROP | ((mp->mnt_vfc->vfc_flags & VFCF_SBDRY) != 0 ? (flags & PCATCH) : 0), "suspfs", 0); vfs_rel(mp); if (error == 0) goto retry; return (error); } /* * Filesystem write operation has completed. If we are suspending and this * operation is the last one, notify the suspender that the suspension is * now in effect. */ void vn_finished_write(mp) struct mount *mp; { if (mp == NULL || !vn_suspendable_mp(mp)) return; MNT_ILOCK(mp); MNT_REL(mp); mp->mnt_writeopcount--; if (mp->mnt_writeopcount < 0) panic("vn_finished_write: neg cnt"); if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 && mp->mnt_writeopcount <= 0) wakeup(&mp->mnt_writeopcount); MNT_IUNLOCK(mp); } /* * Filesystem secondary write operation has completed. If we are * suspending and this operation is the last one, notify the suspender * that the suspension is now in effect. */ void vn_finished_secondary_write(mp) struct mount *mp; { if (mp == NULL || !vn_suspendable_mp(mp)) return; MNT_ILOCK(mp); MNT_REL(mp); mp->mnt_secondary_writes--; if (mp->mnt_secondary_writes < 0) panic("vn_finished_secondary_write: neg cnt"); if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 && mp->mnt_secondary_writes <= 0) wakeup(&mp->mnt_secondary_writes); MNT_IUNLOCK(mp); } /* * Request a filesystem to suspend write operations. */ int vfs_write_suspend(struct mount *mp, int flags) { int error; MPASS(vn_suspendable_mp(mp)); MNT_ILOCK(mp); if (mp->mnt_susp_owner == curthread) { MNT_IUNLOCK(mp); return (EALREADY); } while (mp->mnt_kern_flag & MNTK_SUSPEND) msleep(&mp->mnt_flag, MNT_MTX(mp), PUSER - 1, "wsuspfs", 0); /* * Unmount holds a write reference on the mount point. If we * own busy reference and drain for writers, we deadlock with * the reference draining in the unmount path. Callers of * vfs_write_suspend() must specify VS_SKIP_UNMOUNT if * vfs_busy() reference is owned and caller is not in the * unmount context. */ if ((flags & VS_SKIP_UNMOUNT) != 0 && (mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) { MNT_IUNLOCK(mp); return (EBUSY); } mp->mnt_kern_flag |= MNTK_SUSPEND; mp->mnt_susp_owner = curthread; if (mp->mnt_writeopcount > 0) (void) msleep(&mp->mnt_writeopcount, MNT_MTX(mp), (PUSER - 1)|PDROP, "suspwt", 0); else MNT_IUNLOCK(mp); if ((error = VFS_SYNC(mp, MNT_SUSPEND)) != 0) vfs_write_resume(mp, 0); return (error); } /* * Request a filesystem to resume write operations. */ void vfs_write_resume(struct mount *mp, int flags) { MPASS(vn_suspendable_mp(mp)); MNT_ILOCK(mp); if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) { KASSERT(mp->mnt_susp_owner == curthread, ("mnt_susp_owner")); mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPEND2 | MNTK_SUSPENDED); mp->mnt_susp_owner = NULL; wakeup(&mp->mnt_writeopcount); wakeup(&mp->mnt_flag); curthread->td_pflags &= ~TDP_IGNSUSP; if ((flags & VR_START_WRITE) != 0) { MNT_REF(mp); mp->mnt_writeopcount++; } MNT_IUNLOCK(mp); if ((flags & VR_NO_SUSPCLR) == 0) VFS_SUSP_CLEAN(mp); } else if ((flags & VR_START_WRITE) != 0) { MNT_REF(mp); vn_start_write_locked(mp, 0); } else { MNT_IUNLOCK(mp); } } /* * Helper loop around vfs_write_suspend() for filesystem unmount VFS * methods. */ int vfs_write_suspend_umnt(struct mount *mp) { int error; MPASS(vn_suspendable_mp(mp)); KASSERT((curthread->td_pflags & TDP_IGNSUSP) == 0, ("vfs_write_suspend_umnt: recursed")); /* dounmount() already called vn_start_write(). */ for (;;) { vn_finished_write(mp); error = vfs_write_suspend(mp, 0); if (error != 0) { vn_start_write(NULL, &mp, V_WAIT); return (error); } MNT_ILOCK(mp); if ((mp->mnt_kern_flag & MNTK_SUSPENDED) != 0) break; MNT_IUNLOCK(mp); vn_start_write(NULL, &mp, V_WAIT); } mp->mnt_kern_flag &= ~(MNTK_SUSPENDED | MNTK_SUSPEND2); wakeup(&mp->mnt_flag); MNT_IUNLOCK(mp); curthread->td_pflags |= TDP_IGNSUSP; return (0); } /* * Implement kqueues for files by translating it to vnode operation. */ static int vn_kqfilter(struct file *fp, struct knote *kn) { return (VOP_KQFILTER(fp->f_vnode, kn)); } /* * Simplified in-kernel wrapper calls for extended attribute access. * Both calls pass in a NULL credential, authorizing as "kernel" access. * Set IO_NODELOCKED in ioflg if the vnode is already locked. */ int vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, int *buflen, char *buf, struct thread *td) { struct uio auio; struct iovec iov; int error; iov.iov_len = *buflen; iov.iov_base = buf; auio.uio_iov = &iov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_offset = 0; auio.uio_resid = *buflen; if ((ioflg & IO_NODELOCKED) == 0) vn_lock(vp, LK_SHARED | LK_RETRY); ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); /* authorize attribute retrieval as kernel */ error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL, td); if ((ioflg & IO_NODELOCKED) == 0) VOP_UNLOCK(vp, 0); if (error == 0) { *buflen = *buflen - auio.uio_resid; } return (error); } /* * XXX failure mode if partially written? */ int vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, int buflen, char *buf, struct thread *td) { struct uio auio; struct iovec iov; struct mount *mp; int error; iov.iov_len = buflen; iov.iov_base = buf; auio.uio_iov = &iov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_WRITE; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_offset = 0; auio.uio_resid = buflen; if ((ioflg & IO_NODELOCKED) == 0) { if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0) return (error); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); /* authorize attribute setting as kernel */ error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td); if ((ioflg & IO_NODELOCKED) == 0) { vn_finished_write(mp); VOP_UNLOCK(vp, 0); } return (error); } int vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, struct thread *td) { struct mount *mp; int error; if ((ioflg & IO_NODELOCKED) == 0) { if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0) return (error); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); /* authorize attribute removal as kernel */ error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NULL, td); if (error == EOPNOTSUPP) error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL, NULL, td); if ((ioflg & IO_NODELOCKED) == 0) { vn_finished_write(mp); VOP_UNLOCK(vp, 0); } return (error); } static int vn_get_ino_alloc_vget(struct mount *mp, void *arg, int lkflags, struct vnode **rvp) { return (VFS_VGET(mp, *(ino_t *)arg, lkflags, rvp)); } int vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags, struct vnode **rvp) { return (vn_vget_ino_gen(vp, vn_get_ino_alloc_vget, &ino, lkflags, rvp)); } int vn_vget_ino_gen(struct vnode *vp, vn_get_ino_t alloc, void *alloc_arg, int lkflags, struct vnode **rvp) { struct mount *mp; int ltype, error; ASSERT_VOP_LOCKED(vp, "vn_vget_ino_get"); mp = vp->v_mount; ltype = VOP_ISLOCKED(vp); KASSERT(ltype == LK_EXCLUSIVE || ltype == LK_SHARED, ("vn_vget_ino: vp not locked")); error = vfs_busy(mp, MBF_NOWAIT); if (error != 0) { vfs_ref(mp); VOP_UNLOCK(vp, 0); error = vfs_busy(mp, 0); vn_lock(vp, ltype | LK_RETRY); vfs_rel(mp); if (error != 0) return (ENOENT); if (vp->v_iflag & VI_DOOMED) { vfs_unbusy(mp); return (ENOENT); } } VOP_UNLOCK(vp, 0); error = alloc(mp, alloc_arg, lkflags, rvp); vfs_unbusy(mp); if (*rvp != vp) vn_lock(vp, ltype | LK_RETRY); if (vp->v_iflag & VI_DOOMED) { if (error == 0) { if (*rvp == vp) vunref(vp); else vput(*rvp); } error = ENOENT; } return (error); } int vn_rlimit_fsize(const struct vnode *vp, const struct uio *uio, const struct thread *td) { if (vp->v_type != VREG || td == NULL) return (0); PROC_LOCK(td->td_proc); if ((uoff_t)uio->uio_offset + uio->uio_resid > lim_cur(td->td_proc, RLIMIT_FSIZE)) { kern_psignal(td->td_proc, SIGXFSZ); PROC_UNLOCK(td->td_proc); return (EFBIG); } PROC_UNLOCK(td->td_proc); return (0); } int vn_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { struct vnode *vp; vp = fp->f_vnode; #ifdef AUDIT vn_lock(vp, LK_SHARED | LK_RETRY); AUDIT_ARG_VNODE1(vp); VOP_UNLOCK(vp, 0); #endif return (setfmode(td, active_cred, vp, mode)); } int vn_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { struct vnode *vp; vp = fp->f_vnode; #ifdef AUDIT vn_lock(vp, LK_SHARED | LK_RETRY); AUDIT_ARG_VNODE1(vp); VOP_UNLOCK(vp, 0); #endif return (setfown(td, active_cred, vp, uid, gid)); } void vn_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end) { vm_object_t object; if ((object = vp->v_object) == NULL) return; VM_OBJECT_WLOCK(object); vm_object_page_remove(object, start, end, 0); VM_OBJECT_WUNLOCK(object); } int vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off, struct ucred *cred) { struct vattr va; daddr_t bn, bnp; uint64_t bsize; off_t noff; int error; KASSERT(cmd == FIOSEEKHOLE || cmd == FIOSEEKDATA, ("Wrong command %lu", cmd)); if (vn_lock(vp, LK_SHARED) != 0) return (EBADF); if (vp->v_type != VREG) { error = ENOTTY; goto unlock; } error = VOP_GETATTR(vp, &va, cred); if (error != 0) goto unlock; noff = *off; if (noff >= va.va_size) { error = ENXIO; goto unlock; } bsize = vp->v_mount->mnt_stat.f_iosize; for (bn = noff / bsize; noff < va.va_size; bn++, noff += bsize) { error = VOP_BMAP(vp, bn, NULL, &bnp, NULL, NULL); if (error == EOPNOTSUPP) { error = ENOTTY; goto unlock; } if ((bnp == -1 && cmd == FIOSEEKHOLE) || (bnp != -1 && cmd == FIOSEEKDATA)) { noff = bn * bsize; if (noff < *off) noff = *off; goto unlock; } } if (noff > va.va_size) noff = va.va_size; /* noff == va.va_size. There is an implicit hole at the end of file. */ if (cmd == FIOSEEKDATA) error = ENXIO; unlock: VOP_UNLOCK(vp, 0); if (error == 0) *off = noff; return (error); } int vn_seek(struct file *fp, off_t offset, int whence, struct thread *td) { struct ucred *cred; struct vnode *vp; struct vattr vattr; off_t foffset, size; int error, noneg; cred = td->td_ucred; vp = fp->f_vnode; foffset = foffset_lock(fp, 0); noneg = (vp->v_type != VCHR); error = 0; switch (whence) { case L_INCR: if (noneg && (foffset < 0 || (offset > 0 && foffset > OFF_MAX - offset))) { error = EOVERFLOW; break; } offset += foffset; break; case L_XTND: vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_GETATTR(vp, &vattr, cred); VOP_UNLOCK(vp, 0); if (error) break; /* * If the file references a disk device, then fetch * the media size and use that to determine the ending * offset. */ if (vattr.va_size == 0 && vp->v_type == VCHR && fo_ioctl(fp, DIOCGMEDIASIZE, &size, cred, td) == 0) vattr.va_size = size; if (noneg && (vattr.va_size > OFF_MAX || (offset > 0 && vattr.va_size > OFF_MAX - offset))) { error = EOVERFLOW; break; } offset += vattr.va_size; break; case L_SET: break; case SEEK_DATA: error = fo_ioctl(fp, FIOSEEKDATA, &offset, cred, td); break; case SEEK_HOLE: error = fo_ioctl(fp, FIOSEEKHOLE, &offset, cred, td); break; default: error = EINVAL; } if (error == 0 && noneg && offset < 0) error = EINVAL; if (error != 0) goto drop; VFS_KNOTE_UNLOCKED(vp, 0); td->td_uretoff.tdu_off = offset; drop: foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0); return (error); } int vn_utimes_perm(struct vnode *vp, struct vattr *vap, struct ucred *cred, struct thread *td) { int error; /* * Grant permission if the caller is the owner of the file, or * the super-user, or has ACL_WRITE_ATTRIBUTES permission on * on the file. If the time pointer is null, then write * permission on the file is also sufficient. * * From NFSv4.1, draft 21, 6.2.1.3.1, Discussion of Mask Attributes: * A user having ACL_WRITE_DATA or ACL_WRITE_ATTRIBUTES * will be allowed to set the times [..] to the current * server time. */ error = VOP_ACCESSX(vp, VWRITE_ATTRIBUTES, cred, td); if (error != 0 && (vap->va_vaflags & VA_UTIMES_NULL) != 0) error = VOP_ACCESS(vp, VWRITE, cred, td); return (error); } int vn_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { struct vnode *vp; int error; if (fp->f_type == DTYPE_FIFO) kif->kf_type = KF_TYPE_FIFO; else kif->kf_type = KF_TYPE_VNODE; vp = fp->f_vnode; vref(vp); FILEDESC_SUNLOCK(fdp); error = vn_fill_kinfo_vnode(vp, kif); vrele(vp); FILEDESC_SLOCK(fdp); return (error); } int vn_fill_kinfo_vnode(struct vnode *vp, struct kinfo_file *kif) { struct vattr va; char *fullpath, *freepath; int error; kif->kf_vnode_type = vntype_to_kinfo(vp->v_type); freepath = NULL; fullpath = "-"; error = vn_fullpath(curthread, vp, &fullpath, &freepath); if (error == 0) { strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path)); } if (freepath != NULL) free(freepath, M_TEMP); /* * Retrieve vnode attributes. */ va.va_fsid = VNOVAL; va.va_rdev = NODEV; vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_GETATTR(vp, &va, curthread->td_ucred); VOP_UNLOCK(vp, 0); if (error != 0) return (error); if (va.va_fsid != VNOVAL) kif->kf_un.kf_file.kf_file_fsid = va.va_fsid; else kif->kf_un.kf_file.kf_file_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; kif->kf_un.kf_file.kf_file_fileid = va.va_fileid; kif->kf_un.kf_file.kf_file_mode = MAKEIMODE(va.va_type, va.va_mode); kif->kf_un.kf_file.kf_file_size = va.va_size; kif->kf_un.kf_file.kf_file_rdev = va.va_rdev; return (0); +} + +int +vn_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size, + vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff, + struct thread *td) +{ +#ifdef HWPMC_HOOKS + struct pmckern_map_in pkm; +#endif + struct mount *mp; + struct vnode *vp; + vm_object_t object; + vm_prot_t maxprot; + boolean_t writecounted; + int error; + +#if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \ + defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) + /* + * POSIX shared-memory objects are defined to have + * kernel persistence, and are not defined to support + * read(2)/write(2) -- or even open(2). Thus, we can + * use MAP_ASYNC to trade on-disk coherence for speed. + * The shm_open(3) library routine turns on the FPOSIXSHM + * flag to request this behavior. + */ + if ((fp->f_flag & FPOSIXSHM) != 0) + flags |= MAP_NOSYNC; +#endif + vp = fp->f_vnode; + + /* + * Ensure that file and memory protections are + * compatible. Note that we only worry about + * writability if mapping is shared; in this case, + * current and max prot are dictated by the open file. + * XXX use the vnode instead? Problem is: what + * credentials do we use for determination? What if + * proc does a setuid? + */ + mp = vp->v_mount; + if (mp != NULL && (mp->mnt_flag & MNT_NOEXEC) != 0) + maxprot = VM_PROT_NONE; + else + maxprot = VM_PROT_EXECUTE; + if ((fp->f_flag & FREAD) != 0) + maxprot |= VM_PROT_READ; + else if ((prot & VM_PROT_READ) != 0) + return (EACCES); + + /* + * If we are sharing potential changes via MAP_SHARED and we + * are trying to get write permission although we opened it + * without asking for it, bail out. + */ + if ((flags & MAP_SHARED) != 0) { + if ((fp->f_flag & FWRITE) != 0) + maxprot |= VM_PROT_WRITE; + else if ((prot & VM_PROT_WRITE) != 0) + return (EACCES); + } else { + maxprot |= VM_PROT_WRITE; + cap_maxprot |= VM_PROT_WRITE; + } + maxprot &= cap_maxprot; + + writecounted = FALSE; + error = vm_mmap_vnode(td, size, prot, &maxprot, &flags, vp, + &foff, &object, &writecounted); + if (error != 0) + return (error); + error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object, + foff, writecounted, td); + if (error != 0) { + /* + * If this mapping was accounted for in the vnode's + * writecount, then undo that now. + */ + if (writecounted) + vnode_pager_release_writecount(object, 0, size); + vm_object_deallocate(object); + } +#ifdef HWPMC_HOOKS + /* Inform hwpmc(4) if an executable is being mapped. */ + if (error == 0 && (prot & VM_PROT_EXECUTE) != 0) { + pkm.pm_file = vp; + pkm.pm_address = (uintptr_t) addr; + PMC_CALL_HOOK(td, PMC_FN_MMAP, (void *) &pkm); + } +#endif + return (error); } Index: head/sys/sys/file.h =================================================================== --- head/sys/sys/file.h (revision 283997) +++ head/sys/sys/file.h (revision 283998) @@ -1,396 +1,413 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)file.h 8.3 (Berkeley) 1/9/95 * $FreeBSD$ */ #ifndef _SYS_FILE_H_ #define _SYS_FILE_H_ #ifndef _KERNEL #include /* XXX */ #include #include #else #include #include #include #include +#include struct filedesc; struct stat; struct thread; struct uio; struct knote; struct vnode; struct socket; #endif /* _KERNEL */ #define DTYPE_VNODE 1 /* file */ #define DTYPE_SOCKET 2 /* communications endpoint */ #define DTYPE_PIPE 3 /* pipe */ #define DTYPE_FIFO 4 /* fifo (named pipe) */ #define DTYPE_KQUEUE 5 /* event queue */ #define DTYPE_CRYPTO 6 /* crypto */ #define DTYPE_MQUEUE 7 /* posix message queue */ #define DTYPE_SHM 8 /* swap-backed shared memory */ #define DTYPE_SEM 9 /* posix semaphore */ #define DTYPE_PTS 10 /* pseudo teletype master device */ #define DTYPE_DEV 11 /* Device specific fd type */ #define DTYPE_PROCDESC 12 /* process descriptor */ #define DTYPE_LINUXEFD 13 /* emulation eventfd type */ #ifdef _KERNEL struct file; struct filecaps; struct kinfo_file; struct ucred; #define FOF_OFFSET 0x01 /* Use the offset in uio argument */ #define FOF_NOLOCK 0x02 /* Do not take FOFFSET_LOCK */ #define FOF_NEXTOFF 0x04 /* Also update f_nextoff */ #define FOF_NOUPDATE 0x10 /* Do not update f_offset */ off_t foffset_lock(struct file *fp, int flags); void foffset_lock_uio(struct file *fp, struct uio *uio, int flags); void foffset_unlock(struct file *fp, off_t val, int flags); void foffset_unlock_uio(struct file *fp, struct uio *uio, int flags); static inline off_t foffset_get(struct file *fp) { return (foffset_lock(fp, FOF_NOLOCK)); } typedef int fo_rdwr_t(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td); typedef int fo_truncate_t(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td); typedef int fo_ioctl_t(struct file *fp, u_long com, void *data, struct ucred *active_cred, struct thread *td); typedef int fo_poll_t(struct file *fp, int events, struct ucred *active_cred, struct thread *td); typedef int fo_kqfilter_t(struct file *fp, struct knote *kn); typedef int fo_stat_t(struct file *fp, struct stat *sb, struct ucred *active_cred, struct thread *td); typedef int fo_close_t(struct file *fp, struct thread *td); typedef int fo_chmod_t(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td); typedef int fo_chown_t(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td); typedef int fo_sendfile_t(struct file *fp, int sockfd, struct uio *hdr_uio, struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, int kflags, struct thread *td); typedef int fo_seek_t(struct file *fp, off_t offset, int whence, struct thread *td); typedef int fo_fill_kinfo_t(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp); +typedef int fo_mmap_t(struct file *fp, vm_map_t map, vm_offset_t *addr, + vm_size_t size, vm_prot_t prot, vm_prot_t cap_maxprot, + int flags, vm_ooffset_t foff, struct thread *td); typedef int fo_flags_t; struct fileops { fo_rdwr_t *fo_read; fo_rdwr_t *fo_write; fo_truncate_t *fo_truncate; fo_ioctl_t *fo_ioctl; fo_poll_t *fo_poll; fo_kqfilter_t *fo_kqfilter; fo_stat_t *fo_stat; fo_close_t *fo_close; fo_chmod_t *fo_chmod; fo_chown_t *fo_chown; fo_sendfile_t *fo_sendfile; fo_seek_t *fo_seek; fo_fill_kinfo_t *fo_fill_kinfo; + fo_mmap_t *fo_mmap; fo_flags_t fo_flags; /* DFLAG_* below */ }; #define DFLAG_PASSABLE 0x01 /* may be passed via unix sockets. */ #define DFLAG_SEEKABLE 0x02 /* seekable / nonsequential */ #endif /* _KERNEL */ #if defined(_KERNEL) || defined(_WANT_FILE) /* * Kernel descriptor table. * One entry for each open kernel vnode and socket. * * Below is the list of locks that protects members in struct file. * * (a) f_vnode lock required (shared allows both reads and writes) * (f) protected with mtx_lock(mtx_pool_find(fp)) * (d) cdevpriv_mtx * none not locked */ struct fadvise_info { int fa_advice; /* (f) FADV_* type. */ off_t fa_start; /* (f) Region start. */ off_t fa_end; /* (f) Region end. */ off_t fa_prevstart; /* (f) Previous NOREUSE start. */ off_t fa_prevend; /* (f) Previous NOREUSE end. */ }; struct file { void *f_data; /* file descriptor specific data */ struct fileops *f_ops; /* File operations */ struct ucred *f_cred; /* associated credentials. */ struct vnode *f_vnode; /* NULL or applicable vnode */ short f_type; /* descriptor type */ short f_vnread_flags; /* (f) Sleep lock for f_offset */ volatile u_int f_flag; /* see fcntl.h */ volatile u_int f_count; /* reference count */ /* * DTYPE_VNODE specific fields. */ int f_seqcount; /* (a) Count of sequential accesses. */ off_t f_nextoff; /* next expected read/write offset. */ union { struct cdev_privdata *fvn_cdevpriv; /* (d) Private data for the cdev. */ struct fadvise_info *fvn_advice; } f_vnun; /* * DFLAG_SEEKABLE specific fields */ off_t f_offset; /* * Mandatory Access control information. */ void *f_label; /* Place-holder for MAC label. */ }; #define f_cdevpriv f_vnun.fvn_cdevpriv #define f_advice f_vnun.fvn_advice #define FOFFSET_LOCKED 0x1 #define FOFFSET_LOCK_WAITING 0x2 #define FDEVFS_VNODE 0x4 #endif /* _KERNEL || _WANT_FILE */ /* * Userland version of struct file, for sysctl */ struct xfile { size_t xf_size; /* size of struct xfile */ pid_t xf_pid; /* owning process */ uid_t xf_uid; /* effective uid of owning process */ int xf_fd; /* descriptor number */ void *xf_file; /* address of struct file */ short xf_type; /* descriptor type */ int xf_count; /* reference count */ int xf_msgcount; /* references from message queue */ off_t xf_offset; /* file offset */ void *xf_data; /* file descriptor specific data */ void *xf_vnode; /* vnode pointer */ u_int xf_flag; /* flags (see fcntl.h) */ }; #ifdef _KERNEL extern struct fileops vnops; extern struct fileops badfileops; extern struct fileops socketops; extern int maxfiles; /* kernel limit on number of open files */ extern int maxfilesperproc; /* per process limit on number of open files */ extern volatile int openfiles; /* actual number of open files */ int fget(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp); int fget_mmap(struct thread *td, int fd, cap_rights_t *rightsp, u_char *maxprotp, struct file **fpp); int fget_read(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp); int fget_write(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp); int fget_fcntl(struct thread *td, int fd, cap_rights_t *rightsp, int needfcntl, struct file **fpp); int _fdrop(struct file *fp, struct thread *td); fo_rdwr_t invfo_rdwr; fo_truncate_t invfo_truncate; fo_ioctl_t invfo_ioctl; fo_poll_t invfo_poll; fo_kqfilter_t invfo_kqfilter; fo_chmod_t invfo_chmod; fo_chown_t invfo_chown; fo_sendfile_t invfo_sendfile; fo_sendfile_t vn_sendfile; fo_seek_t vn_seek; fo_fill_kinfo_t vn_fill_kinfo; int vn_fill_kinfo_vnode(struct vnode *vp, struct kinfo_file *kif); void finit(struct file *, u_int, short, void *, struct fileops *); int fgetvp(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp); int fgetvp_exec(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp); int fgetvp_rights(struct thread *td, int fd, cap_rights_t *needrightsp, struct filecaps *havecaps, struct vnode **vpp); int fgetvp_read(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp); int fgetvp_write(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp); int fgetsock(struct thread *td, int fd, cap_rights_t *rightsp, struct socket **spp, u_int *fflagp); void fputsock(struct socket *sp); static __inline int _fnoop(void) { return (0); } #define fhold(fp) \ (refcount_acquire(&(fp)->f_count)) #define fdrop(fp, td) \ (refcount_release(&(fp)->f_count) ? _fdrop((fp), (td)) : _fnoop()) static __inline fo_rdwr_t fo_read; static __inline fo_rdwr_t fo_write; static __inline fo_truncate_t fo_truncate; static __inline fo_ioctl_t fo_ioctl; static __inline fo_poll_t fo_poll; static __inline fo_kqfilter_t fo_kqfilter; static __inline fo_stat_t fo_stat; static __inline fo_close_t fo_close; static __inline fo_chmod_t fo_chmod; static __inline fo_chown_t fo_chown; static __inline fo_sendfile_t fo_sendfile; static __inline int fo_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { return ((*fp->f_ops->fo_read)(fp, uio, active_cred, flags, td)); } static __inline int fo_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { return ((*fp->f_ops->fo_write)(fp, uio, active_cred, flags, td)); } static __inline int fo_truncate(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td) { return ((*fp->f_ops->fo_truncate)(fp, length, active_cred, td)); } static __inline int fo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, struct thread *td) { return ((*fp->f_ops->fo_ioctl)(fp, com, data, active_cred, td)); } static __inline int fo_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { return ((*fp->f_ops->fo_poll)(fp, events, active_cred, td)); } static __inline int fo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, struct thread *td) { return ((*fp->f_ops->fo_stat)(fp, sb, active_cred, td)); } static __inline int fo_close(struct file *fp, struct thread *td) { return ((*fp->f_ops->fo_close)(fp, td)); } static __inline int fo_kqfilter(struct file *fp, struct knote *kn) { return ((*fp->f_ops->fo_kqfilter)(fp, kn)); } static __inline int fo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { return ((*fp->f_ops->fo_chmod)(fp, mode, active_cred, td)); } static __inline int fo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { return ((*fp->f_ops->fo_chown)(fp, uid, gid, active_cred, td)); } static __inline int fo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, int kflags, struct thread *td) { return ((*fp->f_ops->fo_sendfile)(fp, sockfd, hdr_uio, trl_uio, offset, nbytes, sent, flags, kflags, td)); } static __inline int fo_seek(struct file *fp, off_t offset, int whence, struct thread *td) { return ((*fp->f_ops->fo_seek)(fp, offset, whence, td)); } static __inline int fo_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { return ((*fp->f_ops->fo_fill_kinfo)(fp, kif, fdp)); +} + +static __inline int +fo_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size, + vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff, + struct thread *td) +{ + + if (fp->f_ops->fo_mmap == NULL) + return (ENODEV); + return ((*fp->f_ops->fo_mmap)(fp, map, addr, size, prot, cap_maxprot, + flags, foff, td)); } #endif /* _KERNEL */ #endif /* !SYS_FILE_H */ Index: head/sys/sys/mman.h =================================================================== --- head/sys/sys/mman.h (revision 283997) +++ head/sys/sys/mman.h (revision 283998) @@ -1,273 +1,271 @@ /*- * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)mman.h 8.2 (Berkeley) 1/9/95 * $FreeBSD$ */ #ifndef _SYS_MMAN_H_ #define _SYS_MMAN_H_ #include #include #if __BSD_VISIBLE /* * Inheritance for minherit() */ #define INHERIT_SHARE 0 #define INHERIT_COPY 1 #define INHERIT_NONE 2 #endif /* * Protections are chosen from these bits, or-ed together */ #define PROT_NONE 0x00 /* no permissions */ #define PROT_READ 0x01 /* pages can be read */ #define PROT_WRITE 0x02 /* pages can be written */ #define PROT_EXEC 0x04 /* pages can be executed */ /* * Flags contain sharing type and options. * Sharing types; choose one. */ #define MAP_SHARED 0x0001 /* share changes */ #define MAP_PRIVATE 0x0002 /* changes are private */ #if __BSD_VISIBLE #define MAP_COPY MAP_PRIVATE /* Obsolete */ #endif /* * Other flags */ #define MAP_FIXED 0x0010 /* map addr must be exactly as requested */ #if __BSD_VISIBLE #define MAP_RESERVED0020 0x0020 /* previously unimplemented MAP_RENAME */ #define MAP_RESERVED0040 0x0040 /* previously unimplemented MAP_NORESERVE */ #define MAP_RESERVED0080 0x0080 /* previously misimplemented MAP_INHERIT */ #define MAP_RESERVED0100 0x0100 /* previously unimplemented MAP_NOEXTEND */ #define MAP_HASSEMAPHORE 0x0200 /* region may contain semaphores */ #define MAP_STACK 0x0400 /* region grows down, like a stack */ #define MAP_NOSYNC 0x0800 /* page to but do not sync underlying file */ /* * Mapping type */ #define MAP_FILE 0x0000 /* map from file (default) */ #define MAP_ANON 0x1000 /* allocated from memory, swap space */ #ifndef _KERNEL #define MAP_ANONYMOUS MAP_ANON /* For compatibility. */ #endif /* !_KERNEL */ /* * Extended flags */ #define MAP_EXCL 0x00004000 /* for MAP_FIXED, fail if address is used */ #define MAP_NOCORE 0x00020000 /* dont include these pages in a coredump */ #define MAP_PREFAULT_READ 0x00040000 /* prefault mapping for reading */ #ifdef __LP64__ #define MAP_32BIT 0x00080000 /* map in the low 2GB of address space */ #endif /* * Request specific alignment (n == log2 of the desired alignment). * * MAP_ALIGNED_SUPER requests optimal superpage alignment, but does * not enforce a specific alignment. */ #define MAP_ALIGNED(n) ((n) << MAP_ALIGNMENT_SHIFT) #define MAP_ALIGNMENT_SHIFT 24 #define MAP_ALIGNMENT_MASK MAP_ALIGNED(0xff) #define MAP_ALIGNED_SUPER MAP_ALIGNED(1) /* align on a superpage */ #endif /* __BSD_VISIBLE */ #if __POSIX_VISIBLE >= 199309 /* * Process memory locking */ #define MCL_CURRENT 0x0001 /* Lock only current memory */ #define MCL_FUTURE 0x0002 /* Lock all future memory as well */ #endif /* * Error return from mmap() */ #define MAP_FAILED ((void *)-1) /* * msync() flags */ #define MS_SYNC 0x0000 /* msync synchronously */ #define MS_ASYNC 0x0001 /* return immediately */ #define MS_INVALIDATE 0x0002 /* invalidate all cached data */ /* * Advice to madvise */ #define _MADV_NORMAL 0 /* no further special treatment */ #define _MADV_RANDOM 1 /* expect random page references */ #define _MADV_SEQUENTIAL 2 /* expect sequential page references */ #define _MADV_WILLNEED 3 /* will need these pages */ #define _MADV_DONTNEED 4 /* dont need these pages */ #if __BSD_VISIBLE #define MADV_NORMAL _MADV_NORMAL #define MADV_RANDOM _MADV_RANDOM #define MADV_SEQUENTIAL _MADV_SEQUENTIAL #define MADV_WILLNEED _MADV_WILLNEED #define MADV_DONTNEED _MADV_DONTNEED #define MADV_FREE 5 /* dont need these pages, and junk contents */ #define MADV_NOSYNC 6 /* try to avoid flushes to physical media */ #define MADV_AUTOSYNC 7 /* revert to default flushing strategy */ #define MADV_NOCORE 8 /* do not include these pages in a core file */ #define MADV_CORE 9 /* revert to including pages in a core file */ #define MADV_PROTECT 10 /* protect process from pageout kill */ /* * Return bits from mincore */ #define MINCORE_INCORE 0x1 /* Page is incore */ #define MINCORE_REFERENCED 0x2 /* Page has been referenced by us */ #define MINCORE_MODIFIED 0x4 /* Page has been modified by us */ #define MINCORE_REFERENCED_OTHER 0x8 /* Page has been referenced */ #define MINCORE_MODIFIED_OTHER 0x10 /* Page has been modified */ #define MINCORE_SUPER 0x20 /* Page is a "super" page */ /* * Anonymous object constant for shm_open(). */ #define SHM_ANON ((char *)1) #endif /* __BSD_VISIBLE */ /* * XXX missing POSIX_TYPED_MEM_* macros and * posix_typed_mem_info structure. */ #if __POSIX_VISIBLE >= 200112 #define POSIX_MADV_NORMAL _MADV_NORMAL #define POSIX_MADV_RANDOM _MADV_RANDOM #define POSIX_MADV_SEQUENTIAL _MADV_SEQUENTIAL #define POSIX_MADV_WILLNEED _MADV_WILLNEED #define POSIX_MADV_DONTNEED _MADV_DONTNEED #endif #ifndef _MODE_T_DECLARED typedef __mode_t mode_t; #define _MODE_T_DECLARED #endif #ifndef _OFF_T_DECLARED typedef __off_t off_t; #define _OFF_T_DECLARED #endif #ifndef _SIZE_T_DECLARED typedef __size_t size_t; #define _SIZE_T_DECLARED #endif #if defined(_KERNEL) || defined(_WANT_FILE) #include #include #include #include #include struct file; struct shmfd { size_t shm_size; vm_object_t shm_object; int shm_refs; uid_t shm_uid; gid_t shm_gid; mode_t shm_mode; int shm_kmappings; /* * Values maintained solely to make this a better-behaved file * descriptor for fstat() to run on. */ struct timespec shm_atime; struct timespec shm_mtime; struct timespec shm_ctime; struct timespec shm_birthtime; ino_t shm_ino; struct label *shm_label; /* MAC label */ const char *shm_path; struct rangelock shm_rl; struct mtx shm_mtx; }; #endif #ifdef _KERNEL -int shm_mmap(struct shmfd *shmfd, vm_size_t objsize, vm_ooffset_t foff, - vm_object_t *obj); int shm_map(struct file *fp, size_t size, off_t offset, void **memp); int shm_unmap(struct file *fp, void *mem, size_t size); #else /* !_KERNEL */ __BEGIN_DECLS /* * XXX not yet implemented: posix_mem_offset(), posix_typed_mem_get_info(), * posix_typed_mem_open(). */ #if __BSD_VISIBLE int getpagesizes(size_t *, int); int madvise(void *, size_t, int); int mincore(const void *, size_t, char *); int minherit(void *, size_t, int); #endif int mlock(const void *, size_t); #ifndef _MMAP_DECLARED #define _MMAP_DECLARED void * mmap(void *, size_t, int, int, int, off_t); #endif int mprotect(const void *, size_t, int); int msync(void *, size_t, int); int munlock(const void *, size_t); int munmap(void *, size_t); #if __POSIX_VISIBLE >= 200112 int posix_madvise(void *, size_t, int); #endif #if __POSIX_VISIBLE >= 199309 int mlockall(int); int munlockall(void); int shm_open(const char *, int, mode_t); int shm_unlink(const char *); #endif __END_DECLS #endif /* !_KERNEL */ #endif /* !_SYS_MMAN_H_ */ Index: head/sys/vm/vm_extern.h =================================================================== --- head/sys/vm/vm_extern.h (revision 283997) +++ head/sys/vm/vm_extern.h (revision 283998) @@ -1,108 +1,118 @@ /*- * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vm_extern.h 8.2 (Berkeley) 1/12/94 * $FreeBSD$ */ #ifndef _VM_EXTERN_H_ #define _VM_EXTERN_H_ struct pmap; struct proc; struct vmspace; struct vnode; struct vmem; #ifdef _KERNEL +struct cdev; +struct cdevsw; /* These operate on kernel virtual addresses only. */ vm_offset_t kva_alloc(vm_size_t); void kva_free(vm_offset_t, vm_size_t); /* These operate on pageable virtual addresses. */ vm_offset_t kmap_alloc_wait(vm_map_t, vm_size_t); void kmap_free_wakeup(vm_map_t, vm_offset_t, vm_size_t); /* These operate on virtual addresses backed by memory. */ vm_offset_t kmem_alloc_attr(struct vmem *, vm_size_t size, int flags, vm_paddr_t low, vm_paddr_t high, vm_memattr_t memattr); vm_offset_t kmem_alloc_contig(struct vmem *, vm_size_t size, int flags, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, vm_memattr_t memattr); vm_offset_t kmem_malloc(struct vmem *, vm_size_t size, int flags); void kmem_free(struct vmem *, vm_offset_t, vm_size_t); /* This provides memory for previously allocated address space. */ int kmem_back(vm_object_t, vm_offset_t, vm_size_t, int); void kmem_unback(vm_object_t, vm_offset_t, vm_size_t); /* Bootstrapping. */ vm_map_t kmem_suballoc(vm_map_t, vm_offset_t *, vm_offset_t *, vm_size_t, boolean_t); void kmem_init(vm_offset_t, vm_offset_t); void kmem_init_zero_region(void); void kmeminit(void); void swapout_procs(int); int kernacc(void *, int, int); int useracc(void *, int, int); int vm_fault(vm_map_t, vm_offset_t, vm_prot_t, int); void vm_fault_copy_entry(vm_map_t, vm_map_t, vm_map_entry_t, vm_map_entry_t, vm_ooffset_t *); int vm_fault_disable_pagefaults(void); void vm_fault_enable_pagefaults(int save); int vm_fault_hold(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, int fault_flags, vm_page_t *m_hold); int vm_fault_quick_hold_pages(vm_map_t map, vm_offset_t addr, vm_size_t len, vm_prot_t prot, vm_page_t *ma, int max_count); -int vm_forkproc(struct thread *, struct proc *, struct thread *, struct vmspace *, int); +int vm_forkproc(struct thread *, struct proc *, struct thread *, + struct vmspace *, int); void vm_waitproc(struct proc *); -int vm_mmap(vm_map_t, vm_offset_t *, vm_size_t, vm_prot_t, vm_prot_t, int, objtype_t, void *, vm_ooffset_t); +int vm_mmap(vm_map_t, vm_offset_t *, vm_size_t, vm_prot_t, vm_prot_t, int, + objtype_t, void *, vm_ooffset_t); +int vm_mmap_object(vm_map_t, vm_offset_t *, vm_size_t, vm_prot_t, + vm_prot_t, int, vm_object_t, vm_ooffset_t, boolean_t, struct thread *); int vm_mmap_to_errno(int rv); +int vm_mmap_cdev(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *, + int *, struct cdev *, struct cdevsw *, vm_ooffset_t *, vm_object_t *); +int vm_mmap_vnode(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *, int *, + struct vnode *, vm_ooffset_t *, vm_object_t *, boolean_t *); void vm_set_page_size(void); void vm_sync_icache(vm_map_t, vm_offset_t, vm_size_t); typedef int (*pmap_pinit_t)(struct pmap *pmap); struct vmspace *vmspace_alloc(vm_offset_t, vm_offset_t, pmap_pinit_t); struct vmspace *vmspace_fork(struct vmspace *, vm_ooffset_t *); int vmspace_exec(struct proc *, vm_offset_t, vm_offset_t); int vmspace_unshare(struct proc *); void vmspace_exit(struct thread *); struct vmspace *vmspace_acquire_ref(struct proc *); void vmspace_free(struct vmspace *); void vmspace_exitfree(struct proc *); void vnode_pager_setsize(struct vnode *, vm_ooffset_t); int vslock(void *, size_t); void vsunlock(void *, size_t); struct sf_buf *vm_imgact_map_page(vm_object_t object, vm_ooffset_t offset); void vm_imgact_unmap_page(struct sf_buf *sf); void vm_thread_dispose(struct thread *td); int vm_thread_new(struct thread *td, int pages); int vm_mlock(struct proc *, struct ucred *, const void *, size_t); #endif /* _KERNEL */ #endif /* !_VM_EXTERN_H_ */ Index: head/sys/vm/vm_mmap.c =================================================================== --- head/sys/vm/vm_mmap.c (revision 283997) +++ head/sys/vm/vm_mmap.c (revision 283998) @@ -1,1732 +1,1611 @@ /*- * Copyright (c) 1988 University of Utah. * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$ * * @(#)vm_mmap.c 8.4 (Berkeley) 1/12/94 */ /* * Mapped file (mmap) interface to VM */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_hwpmc_hooks.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif int old_mlock = 0; SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RWTUN, &old_mlock, 0, "Do not apply RLIMIT_MEMLOCK on mlockall"); #ifdef MAP_32BIT #define MAP_32BIT_MAX_ADDR ((vm_offset_t)1 << 31) #endif -static int vm_mmap_vnode(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *, - int *, struct vnode *, vm_ooffset_t *, vm_object_t *, boolean_t *); -static int vm_mmap_cdev(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *, - int *, struct cdev *, vm_ooffset_t *, vm_object_t *); -static int vm_mmap_shm(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *, - int *, struct shmfd *, vm_ooffset_t, vm_object_t *); - #ifndef _SYS_SYSPROTO_H_ struct sbrk_args { int incr; }; #endif /* * MPSAFE */ /* ARGSUSED */ int sys_sbrk(td, uap) struct thread *td; struct sbrk_args *uap; { /* Not yet implemented */ return (EOPNOTSUPP); } #ifndef _SYS_SYSPROTO_H_ struct sstk_args { int incr; }; #endif /* * MPSAFE */ /* ARGSUSED */ int sys_sstk(td, uap) struct thread *td; struct sstk_args *uap; { /* Not yet implemented */ return (EOPNOTSUPP); } #if defined(COMPAT_43) #ifndef _SYS_SYSPROTO_H_ struct getpagesize_args { int dummy; }; #endif int ogetpagesize(td, uap) struct thread *td; struct getpagesize_args *uap; { /* MP SAFE */ td->td_retval[0] = PAGE_SIZE; return (0); } #endif /* COMPAT_43 */ /* * Memory Map (mmap) system call. Note that the file offset * and address are allowed to be NOT page aligned, though if * the MAP_FIXED flag it set, both must have the same remainder * modulo the PAGE_SIZE (POSIX 1003.1b). If the address is not * page-aligned, the actual mapping starts at trunc_page(addr) * and the return value is adjusted up by the page offset. * * Generally speaking, only character devices which are themselves * memory-based, such as a video framebuffer, can be mmap'd. Otherwise * there would be no cache coherency between a descriptor and a VM mapping * both to the same character device. */ #ifndef _SYS_SYSPROTO_H_ struct mmap_args { void *addr; size_t len; int prot; int flags; int fd; long pad; off_t pos; }; #endif /* * MPSAFE */ int sys_mmap(td, uap) struct thread *td; struct mmap_args *uap; { -#ifdef HWPMC_HOOKS - struct pmckern_map_in pkm; -#endif struct file *fp; - struct vnode *vp; vm_offset_t addr; vm_size_t size, pageoff; - vm_prot_t cap_maxprot, maxprot; - void *handle; - objtype_t handle_type; + vm_prot_t cap_maxprot; int align, error, flags, prot; off_t pos; struct vmspace *vms = td->td_proc->p_vmspace; cap_rights_t rights; addr = (vm_offset_t) uap->addr; size = uap->len; prot = uap->prot; flags = uap->flags; pos = uap->pos; fp = NULL; /* * Ignore old flags that used to be defined but did not do anything. */ flags &= ~(MAP_RESERVED0020 | MAP_RESERVED0040); /* * Enforce the constraints. * Mapping of length 0 is only allowed for old binaries. * Anonymous mapping shall specify -1 as filedescriptor and * zero position for new code. Be nice to ancient a.out * binaries and correct pos for anonymous mapping, since old * ld.so sometimes issues anonymous map requests with non-zero * pos. */ if (!SV_CURPROC_FLAG(SV_AOUT)) { if ((uap->len == 0 && curproc->p_osrel >= P_OSREL_MAP_ANON) || ((flags & MAP_ANON) != 0 && (uap->fd != -1 || pos != 0))) return (EINVAL); } else { if ((flags & MAP_ANON) != 0) pos = 0; } if (flags & MAP_STACK) { if ((uap->fd != -1) || ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE))) return (EINVAL); flags |= MAP_ANON; pos = 0; } if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | MAP_HASSEMAPHORE | MAP_STACK | MAP_NOSYNC | MAP_ANON | MAP_EXCL | MAP_NOCORE | MAP_PREFAULT_READ | #ifdef MAP_32BIT MAP_32BIT | #endif MAP_ALIGNMENT_MASK)) != 0) return (EINVAL); if ((flags & (MAP_EXCL | MAP_FIXED)) == MAP_EXCL) return (EINVAL); if ((flags & (MAP_SHARED | MAP_PRIVATE)) == (MAP_SHARED | MAP_PRIVATE)) return (EINVAL); if (prot != PROT_NONE && (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)) != 0) return (EINVAL); /* * Align the file position to a page boundary, * and save its page offset component. */ pageoff = (pos & PAGE_MASK); pos -= pageoff; /* Adjust size for rounding (on both ends). */ size += pageoff; /* low end... */ size = (vm_size_t) round_page(size); /* hi end */ /* Ensure alignment is at least a page and fits in a pointer. */ align = flags & MAP_ALIGNMENT_MASK; if (align != 0 && align != MAP_ALIGNED_SUPER && (align >> MAP_ALIGNMENT_SHIFT >= sizeof(void *) * NBBY || align >> MAP_ALIGNMENT_SHIFT < PAGE_SHIFT)) return (EINVAL); /* * Check for illegal addresses. Watch out for address wrap... Note * that VM_*_ADDRESS are not constants due to casts (argh). */ if (flags & MAP_FIXED) { /* * The specified address must have the same remainder * as the file offset taken modulo PAGE_SIZE, so it * should be aligned after adjustment by pageoff. */ addr -= pageoff; if (addr & PAGE_MASK) return (EINVAL); /* Address range must be all in user VM space. */ if (addr < vm_map_min(&vms->vm_map) || addr + size > vm_map_max(&vms->vm_map)) return (EINVAL); if (addr + size < addr) return (EINVAL); #ifdef MAP_32BIT if (flags & MAP_32BIT && addr + size > MAP_32BIT_MAX_ADDR) return (EINVAL); } else if (flags & MAP_32BIT) { /* * For MAP_32BIT, override the hint if it is too high and * do not bother moving the mapping past the heap (since * the heap is usually above 2GB). */ if (addr + size > MAP_32BIT_MAX_ADDR) addr = 0; #endif } else { /* * XXX for non-fixed mappings where no hint is provided or * the hint would fall in the potential heap space, * place it after the end of the largest possible heap. * * There should really be a pmap call to determine a reasonable * location. */ PROC_LOCK(td->td_proc); if (addr == 0 || (addr >= round_page((vm_offset_t)vms->vm_taddr) && addr < round_page((vm_offset_t)vms->vm_daddr + lim_max(td->td_proc, RLIMIT_DATA)))) addr = round_page((vm_offset_t)vms->vm_daddr + lim_max(td->td_proc, RLIMIT_DATA)); PROC_UNLOCK(td->td_proc); } - if (flags & MAP_ANON) { + if (size == 0) { /* + * Return success without mapping anything for old + * binaries that request a page-aligned mapping of + * length 0. For modern binaries, this function + * returns an error earlier. + */ + error = 0; + } else if (flags & MAP_ANON) { + /* * Mapping blank space is trivial. + * + * This relies on VM_PROT_* matching PROT_*. */ - handle = NULL; - handle_type = OBJT_DEFAULT; - maxprot = VM_PROT_ALL; - cap_maxprot = VM_PROT_ALL; + error = vm_mmap_object(&vms->vm_map, &addr, size, prot, + VM_PROT_ALL, flags, NULL, pos, FALSE, td); } else { /* * Mapping file, get fp for validation and don't let the * descriptor disappear on us if we block. Check capability * rights, but also return the maximum rights to be combined * with maxprot later. */ cap_rights_init(&rights, CAP_MMAP); if (prot & PROT_READ) cap_rights_set(&rights, CAP_MMAP_R); if ((flags & MAP_SHARED) != 0) { if (prot & PROT_WRITE) cap_rights_set(&rights, CAP_MMAP_W); } if (prot & PROT_EXEC) cap_rights_set(&rights, CAP_MMAP_X); error = fget_mmap(td, uap->fd, &rights, &cap_maxprot, &fp); if (error != 0) goto done; if ((flags & (MAP_SHARED | MAP_PRIVATE)) == 0 && td->td_proc->p_osrel >= P_OSREL_MAP_FSTRICT) { error = EINVAL; goto done; } - if (fp->f_type == DTYPE_SHM) { - handle = fp->f_data; - handle_type = OBJT_SWAP; - maxprot = VM_PROT_NONE; - /* FREAD should always be set. */ - if (fp->f_flag & FREAD) - maxprot |= VM_PROT_EXECUTE | VM_PROT_READ; - if (fp->f_flag & FWRITE) - maxprot |= VM_PROT_WRITE; - goto map; - } - if (fp->f_type != DTYPE_VNODE) { - error = ENODEV; - goto done; - } -#if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \ - defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) - /* - * POSIX shared-memory objects are defined to have - * kernel persistence, and are not defined to support - * read(2)/write(2) -- or even open(2). Thus, we can - * use MAP_ASYNC to trade on-disk coherence for speed. - * The shm_open(3) library routine turns on the FPOSIXSHM - * flag to request this behavior. - */ - if (fp->f_flag & FPOSIXSHM) - flags |= MAP_NOSYNC; -#endif - vp = fp->f_vnode; - /* - * Ensure that file and memory protections are - * compatible. Note that we only worry about - * writability if mapping is shared; in this case, - * current and max prot are dictated by the open file. - * XXX use the vnode instead? Problem is: what - * credentials do we use for determination? What if - * proc does a setuid? - */ - if (vp->v_mount != NULL && vp->v_mount->mnt_flag & MNT_NOEXEC) - maxprot = VM_PROT_NONE; - else - maxprot = VM_PROT_EXECUTE; - if (fp->f_flag & FREAD) { - maxprot |= VM_PROT_READ; - } else if (prot & PROT_READ) { - error = EACCES; - goto done; - } - /* - * If we are sharing potential changes (either via - * MAP_SHARED or via the implicit sharing of character - * device mappings), and we are trying to get write - * permission although we opened it without asking - * for it, bail out. - */ - if ((flags & MAP_SHARED) != 0) { - if ((fp->f_flag & FWRITE) != 0) { - maxprot |= VM_PROT_WRITE; - } else if ((prot & PROT_WRITE) != 0) { - error = EACCES; - goto done; - } - } else if (vp->v_type != VCHR || (fp->f_flag & FWRITE) != 0) { - maxprot |= VM_PROT_WRITE; - cap_maxprot |= VM_PROT_WRITE; - } - handle = (void *)vp; - handle_type = OBJT_VNODE; + /* This relies on VM_PROT_* matching PROT_*. */ + error = fo_mmap(fp, &vms->vm_map, &addr, size, prot, + cap_maxprot, flags, pos, td); } -map: - td->td_fpop = fp; - maxprot &= cap_maxprot; - /* This relies on VM_PROT_* matching PROT_*. */ - error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot, - flags, handle_type, handle, pos); - td->td_fpop = NULL; -#ifdef HWPMC_HOOKS - /* inform hwpmc(4) if an executable is being mapped */ - if (error == 0 && handle_type == OBJT_VNODE && - (prot & PROT_EXEC)) { - pkm.pm_file = handle; - pkm.pm_address = (uintptr_t) addr; - PMC_CALL_HOOK(td, PMC_FN_MMAP, (void *) &pkm); - } -#endif if (error == 0) td->td_retval[0] = (register_t) (addr + pageoff); done: if (fp) fdrop(fp, td); return (error); } #if defined(COMPAT_FREEBSD6) int freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap) { struct mmap_args oargs; oargs.addr = uap->addr; oargs.len = uap->len; oargs.prot = uap->prot; oargs.flags = uap->flags; oargs.fd = uap->fd; oargs.pos = uap->pos; return (sys_mmap(td, &oargs)); } #endif #ifdef COMPAT_43 #ifndef _SYS_SYSPROTO_H_ struct ommap_args { caddr_t addr; int len; int prot; int flags; int fd; long pos; }; #endif int ommap(td, uap) struct thread *td; struct ommap_args *uap; { struct mmap_args nargs; static const char cvtbsdprot[8] = { 0, PROT_EXEC, PROT_WRITE, PROT_EXEC | PROT_WRITE, PROT_READ, PROT_EXEC | PROT_READ, PROT_WRITE | PROT_READ, PROT_EXEC | PROT_WRITE | PROT_READ, }; #define OMAP_ANON 0x0002 #define OMAP_COPY 0x0020 #define OMAP_SHARED 0x0010 #define OMAP_FIXED 0x0100 nargs.addr = uap->addr; nargs.len = uap->len; nargs.prot = cvtbsdprot[uap->prot & 0x7]; #ifdef COMPAT_FREEBSD32 #if defined(__amd64__) if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32) && nargs.prot != 0) nargs.prot |= PROT_EXEC; #endif #endif nargs.flags = 0; if (uap->flags & OMAP_ANON) nargs.flags |= MAP_ANON; if (uap->flags & OMAP_COPY) nargs.flags |= MAP_COPY; if (uap->flags & OMAP_SHARED) nargs.flags |= MAP_SHARED; else nargs.flags |= MAP_PRIVATE; if (uap->flags & OMAP_FIXED) nargs.flags |= MAP_FIXED; nargs.fd = uap->fd; nargs.pos = uap->pos; return (sys_mmap(td, &nargs)); } #endif /* COMPAT_43 */ #ifndef _SYS_SYSPROTO_H_ struct msync_args { void *addr; size_t len; int flags; }; #endif /* * MPSAFE */ int sys_msync(td, uap) struct thread *td; struct msync_args *uap; { vm_offset_t addr; vm_size_t size, pageoff; int flags; vm_map_t map; int rv; addr = (vm_offset_t) uap->addr; size = uap->len; flags = uap->flags; pageoff = (addr & PAGE_MASK); addr -= pageoff; size += pageoff; size = (vm_size_t) round_page(size); if (addr + size < addr) return (EINVAL); if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE)) return (EINVAL); map = &td->td_proc->p_vmspace->vm_map; /* * Clean the pages and interpret the return value. */ rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0, (flags & MS_INVALIDATE) != 0); switch (rv) { case KERN_SUCCESS: return (0); case KERN_INVALID_ADDRESS: return (ENOMEM); case KERN_INVALID_ARGUMENT: return (EBUSY); case KERN_FAILURE: return (EIO); default: return (EINVAL); } } #ifndef _SYS_SYSPROTO_H_ struct munmap_args { void *addr; size_t len; }; #endif /* * MPSAFE */ int sys_munmap(td, uap) struct thread *td; struct munmap_args *uap; { #ifdef HWPMC_HOOKS struct pmckern_map_out pkm; vm_map_entry_t entry; #endif vm_offset_t addr; vm_size_t size, pageoff; vm_map_t map; addr = (vm_offset_t) uap->addr; size = uap->len; if (size == 0) return (EINVAL); pageoff = (addr & PAGE_MASK); addr -= pageoff; size += pageoff; size = (vm_size_t) round_page(size); if (addr + size < addr) return (EINVAL); /* * Check for illegal addresses. Watch out for address wrap... */ map = &td->td_proc->p_vmspace->vm_map; if (addr < vm_map_min(map) || addr + size > vm_map_max(map)) return (EINVAL); vm_map_lock(map); #ifdef HWPMC_HOOKS /* * Inform hwpmc if the address range being unmapped contains * an executable region. */ pkm.pm_address = (uintptr_t) NULL; if (vm_map_lookup_entry(map, addr, &entry)) { for (; entry != &map->header && entry->start < addr + size; entry = entry->next) { if (vm_map_check_protection(map, entry->start, entry->end, VM_PROT_EXECUTE) == TRUE) { pkm.pm_address = (uintptr_t) addr; pkm.pm_size = (size_t) size; break; } } } #endif vm_map_delete(map, addr, addr + size); #ifdef HWPMC_HOOKS /* downgrade the lock to prevent a LOR with the pmc-sx lock */ vm_map_lock_downgrade(map); if (pkm.pm_address != (uintptr_t) NULL) PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm); vm_map_unlock_read(map); #else vm_map_unlock(map); #endif /* vm_map_delete returns nothing but KERN_SUCCESS anyway */ return (0); } #ifndef _SYS_SYSPROTO_H_ struct mprotect_args { const void *addr; size_t len; int prot; }; #endif /* * MPSAFE */ int sys_mprotect(td, uap) struct thread *td; struct mprotect_args *uap; { vm_offset_t addr; vm_size_t size, pageoff; vm_prot_t prot; addr = (vm_offset_t) uap->addr; size = uap->len; prot = uap->prot & VM_PROT_ALL; pageoff = (addr & PAGE_MASK); addr -= pageoff; size += pageoff; size = (vm_size_t) round_page(size); if (addr + size < addr) return (EINVAL); switch (vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr, addr + size, prot, FALSE)) { case KERN_SUCCESS: return (0); case KERN_PROTECTION_FAILURE: return (EACCES); case KERN_RESOURCE_SHORTAGE: return (ENOMEM); } return (EINVAL); } #ifndef _SYS_SYSPROTO_H_ struct minherit_args { void *addr; size_t len; int inherit; }; #endif /* * MPSAFE */ int sys_minherit(td, uap) struct thread *td; struct minherit_args *uap; { vm_offset_t addr; vm_size_t size, pageoff; vm_inherit_t inherit; addr = (vm_offset_t)uap->addr; size = uap->len; inherit = uap->inherit; pageoff = (addr & PAGE_MASK); addr -= pageoff; size += pageoff; size = (vm_size_t) round_page(size); if (addr + size < addr) return (EINVAL); switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr, addr + size, inherit)) { case KERN_SUCCESS: return (0); case KERN_PROTECTION_FAILURE: return (EACCES); } return (EINVAL); } #ifndef _SYS_SYSPROTO_H_ struct madvise_args { void *addr; size_t len; int behav; }; #endif /* * MPSAFE */ int sys_madvise(td, uap) struct thread *td; struct madvise_args *uap; { vm_offset_t start, end; vm_map_t map; int flags; /* * Check for our special case, advising the swap pager we are * "immortal." */ if (uap->behav == MADV_PROTECT) { flags = PPROT_SET; return (kern_procctl(td, P_PID, td->td_proc->p_pid, PROC_SPROTECT, &flags)); } /* * Check for illegal behavior */ if (uap->behav < 0 || uap->behav > MADV_CORE) return (EINVAL); /* * Check for illegal addresses. Watch out for address wrap... Note * that VM_*_ADDRESS are not constants due to casts (argh). */ map = &td->td_proc->p_vmspace->vm_map; if ((vm_offset_t)uap->addr < vm_map_min(map) || (vm_offset_t)uap->addr + uap->len > vm_map_max(map)) return (EINVAL); if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr) return (EINVAL); /* * Since this routine is only advisory, we default to conservative * behavior. */ start = trunc_page((vm_offset_t) uap->addr); end = round_page((vm_offset_t) uap->addr + uap->len); if (vm_map_madvise(map, start, end, uap->behav)) return (EINVAL); return (0); } #ifndef _SYS_SYSPROTO_H_ struct mincore_args { const void *addr; size_t len; char *vec; }; #endif /* * MPSAFE */ int sys_mincore(td, uap) struct thread *td; struct mincore_args *uap; { vm_offset_t addr, first_addr; vm_offset_t end, cend; pmap_t pmap; vm_map_t map; char *vec; int error = 0; int vecindex, lastvecindex; vm_map_entry_t current; vm_map_entry_t entry; vm_object_t object; vm_paddr_t locked_pa; vm_page_t m; vm_pindex_t pindex; int mincoreinfo; unsigned int timestamp; boolean_t locked; /* * Make sure that the addresses presented are valid for user * mode. */ first_addr = addr = trunc_page((vm_offset_t) uap->addr); end = addr + (vm_size_t)round_page(uap->len); map = &td->td_proc->p_vmspace->vm_map; if (end > vm_map_max(map) || end < addr) return (ENOMEM); /* * Address of byte vector */ vec = uap->vec; pmap = vmspace_pmap(td->td_proc->p_vmspace); vm_map_lock_read(map); RestartScan: timestamp = map->timestamp; if (!vm_map_lookup_entry(map, addr, &entry)) { vm_map_unlock_read(map); return (ENOMEM); } /* * Do this on a map entry basis so that if the pages are not * in the current processes address space, we can easily look * up the pages elsewhere. */ lastvecindex = -1; for (current = entry; (current != &map->header) && (current->start < end); current = current->next) { /* * check for contiguity */ if (current->end < end && (entry->next == &map->header || current->next->start > current->end)) { vm_map_unlock_read(map); return (ENOMEM); } /* * ignore submaps (for now) or null objects */ if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) || current->object.vm_object == NULL) continue; /* * limit this scan to the current map entry and the * limits for the mincore call */ if (addr < current->start) addr = current->start; cend = current->end; if (cend > end) cend = end; /* * scan this entry one page at a time */ while (addr < cend) { /* * Check pmap first, it is likely faster, also * it can provide info as to whether we are the * one referencing or modifying the page. */ object = NULL; locked_pa = 0; retry: m = NULL; mincoreinfo = pmap_mincore(pmap, addr, &locked_pa); if (locked_pa != 0) { /* * The page is mapped by this process but not * both accessed and modified. It is also * managed. Acquire the object lock so that * other mappings might be examined. */ m = PHYS_TO_VM_PAGE(locked_pa); if (m->object != object) { if (object != NULL) VM_OBJECT_WUNLOCK(object); object = m->object; locked = VM_OBJECT_TRYWLOCK(object); vm_page_unlock(m); if (!locked) { VM_OBJECT_WLOCK(object); vm_page_lock(m); goto retry; } } else vm_page_unlock(m); KASSERT(m->valid == VM_PAGE_BITS_ALL, ("mincore: page %p is mapped but invalid", m)); } else if (mincoreinfo == 0) { /* * The page is not mapped by this process. If * the object implements managed pages, then * determine if the page is resident so that * the mappings might be examined. */ if (current->object.vm_object != object) { if (object != NULL) VM_OBJECT_WUNLOCK(object); object = current->object.vm_object; VM_OBJECT_WLOCK(object); } if (object->type == OBJT_DEFAULT || object->type == OBJT_SWAP || object->type == OBJT_VNODE) { pindex = OFF_TO_IDX(current->offset + (addr - current->start)); m = vm_page_lookup(object, pindex); if (m == NULL && vm_page_is_cached(object, pindex)) mincoreinfo = MINCORE_INCORE; if (m != NULL && m->valid == 0) m = NULL; if (m != NULL) mincoreinfo = MINCORE_INCORE; } } if (m != NULL) { /* Examine other mappings to the page. */ if (m->dirty == 0 && pmap_is_modified(m)) vm_page_dirty(m); if (m->dirty != 0) mincoreinfo |= MINCORE_MODIFIED_OTHER; /* * The first test for PGA_REFERENCED is an * optimization. The second test is * required because a concurrent pmap * operation could clear the last reference * and set PGA_REFERENCED before the call to * pmap_is_referenced(). */ if ((m->aflags & PGA_REFERENCED) != 0 || pmap_is_referenced(m) || (m->aflags & PGA_REFERENCED) != 0) mincoreinfo |= MINCORE_REFERENCED_OTHER; } if (object != NULL) VM_OBJECT_WUNLOCK(object); /* * subyte may page fault. In case it needs to modify * the map, we release the lock. */ vm_map_unlock_read(map); /* * calculate index into user supplied byte vector */ vecindex = OFF_TO_IDX(addr - first_addr); /* * If we have skipped map entries, we need to make sure that * the byte vector is zeroed for those skipped entries. */ while ((lastvecindex + 1) < vecindex) { ++lastvecindex; error = subyte(vec + lastvecindex, 0); if (error) { error = EFAULT; goto done2; } } /* * Pass the page information to the user */ error = subyte(vec + vecindex, mincoreinfo); if (error) { error = EFAULT; goto done2; } /* * If the map has changed, due to the subyte, the previous * output may be invalid. */ vm_map_lock_read(map); if (timestamp != map->timestamp) goto RestartScan; lastvecindex = vecindex; addr += PAGE_SIZE; } } /* * subyte may page fault. In case it needs to modify * the map, we release the lock. */ vm_map_unlock_read(map); /* * Zero the last entries in the byte vector. */ vecindex = OFF_TO_IDX(end - first_addr); while ((lastvecindex + 1) < vecindex) { ++lastvecindex; error = subyte(vec + lastvecindex, 0); if (error) { error = EFAULT; goto done2; } } /* * If the map has changed, due to the subyte, the previous * output may be invalid. */ vm_map_lock_read(map); if (timestamp != map->timestamp) goto RestartScan; vm_map_unlock_read(map); done2: return (error); } #ifndef _SYS_SYSPROTO_H_ struct mlock_args { const void *addr; size_t len; }; #endif /* * MPSAFE */ int sys_mlock(td, uap) struct thread *td; struct mlock_args *uap; { return (vm_mlock(td->td_proc, td->td_ucred, uap->addr, uap->len)); } int vm_mlock(struct proc *proc, struct ucred *cred, const void *addr0, size_t len) { vm_offset_t addr, end, last, start; vm_size_t npages, size; vm_map_t map; unsigned long nsize; int error; error = priv_check_cred(cred, PRIV_VM_MLOCK, 0); if (error) return (error); addr = (vm_offset_t)addr0; size = len; last = addr + size; start = trunc_page(addr); end = round_page(last); if (last < addr || end < addr) return (EINVAL); npages = atop(end - start); if (npages > vm_page_max_wired) return (ENOMEM); map = &proc->p_vmspace->vm_map; PROC_LOCK(proc); nsize = ptoa(npages + pmap_wired_count(map->pmap)); if (nsize > lim_cur(proc, RLIMIT_MEMLOCK)) { PROC_UNLOCK(proc); return (ENOMEM); } PROC_UNLOCK(proc); if (npages + vm_cnt.v_wire_count > vm_page_max_wired) return (EAGAIN); #ifdef RACCT if (racct_enable) { PROC_LOCK(proc); error = racct_set(proc, RACCT_MEMLOCK, nsize); PROC_UNLOCK(proc); if (error != 0) return (ENOMEM); } #endif error = vm_map_wire(map, start, end, VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); #ifdef RACCT if (racct_enable && error != KERN_SUCCESS) { PROC_LOCK(proc); racct_set(proc, RACCT_MEMLOCK, ptoa(pmap_wired_count(map->pmap))); PROC_UNLOCK(proc); } #endif return (error == KERN_SUCCESS ? 0 : ENOMEM); } #ifndef _SYS_SYSPROTO_H_ struct mlockall_args { int how; }; #endif /* * MPSAFE */ int sys_mlockall(td, uap) struct thread *td; struct mlockall_args *uap; { vm_map_t map; int error; map = &td->td_proc->p_vmspace->vm_map; error = priv_check(td, PRIV_VM_MLOCK); if (error) return (error); if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0)) return (EINVAL); /* * If wiring all pages in the process would cause it to exceed * a hard resource limit, return ENOMEM. */ if (!old_mlock && uap->how & MCL_CURRENT) { PROC_LOCK(td->td_proc); if (map->size > lim_cur(td->td_proc, RLIMIT_MEMLOCK)) { PROC_UNLOCK(td->td_proc); return (ENOMEM); } PROC_UNLOCK(td->td_proc); } #ifdef RACCT if (racct_enable) { PROC_LOCK(td->td_proc); error = racct_set(td->td_proc, RACCT_MEMLOCK, map->size); PROC_UNLOCK(td->td_proc); if (error != 0) return (ENOMEM); } #endif if (uap->how & MCL_FUTURE) { vm_map_lock(map); vm_map_modflags(map, MAP_WIREFUTURE, 0); vm_map_unlock(map); error = 0; } if (uap->how & MCL_CURRENT) { /* * P1003.1-2001 mandates that all currently mapped pages * will be memory resident and locked (wired) upon return * from mlockall(). vm_map_wire() will wire pages, by * calling vm_fault_wire() for each page in the region. */ error = vm_map_wire(map, vm_map_min(map), vm_map_max(map), VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK); error = (error == KERN_SUCCESS ? 0 : EAGAIN); } #ifdef RACCT if (racct_enable && error != KERN_SUCCESS) { PROC_LOCK(td->td_proc); racct_set(td->td_proc, RACCT_MEMLOCK, ptoa(pmap_wired_count(map->pmap))); PROC_UNLOCK(td->td_proc); } #endif return (error); } #ifndef _SYS_SYSPROTO_H_ struct munlockall_args { register_t dummy; }; #endif /* * MPSAFE */ int sys_munlockall(td, uap) struct thread *td; struct munlockall_args *uap; { vm_map_t map; int error; map = &td->td_proc->p_vmspace->vm_map; error = priv_check(td, PRIV_VM_MUNLOCK); if (error) return (error); /* Clear the MAP_WIREFUTURE flag from this vm_map. */ vm_map_lock(map); vm_map_modflags(map, 0, MAP_WIREFUTURE); vm_map_unlock(map); /* Forcibly unwire all pages. */ error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map), VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK); #ifdef RACCT if (racct_enable && error == KERN_SUCCESS) { PROC_LOCK(td->td_proc); racct_set(td->td_proc, RACCT_MEMLOCK, 0); PROC_UNLOCK(td->td_proc); } #endif return (error); } #ifndef _SYS_SYSPROTO_H_ struct munlock_args { const void *addr; size_t len; }; #endif /* * MPSAFE */ int sys_munlock(td, uap) struct thread *td; struct munlock_args *uap; { vm_offset_t addr, end, last, start; vm_size_t size; #ifdef RACCT vm_map_t map; #endif int error; error = priv_check(td, PRIV_VM_MUNLOCK); if (error) return (error); addr = (vm_offset_t)uap->addr; size = uap->len; last = addr + size; start = trunc_page(addr); end = round_page(last); if (last < addr || end < addr) return (EINVAL); error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end, VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); #ifdef RACCT if (racct_enable && error == KERN_SUCCESS) { PROC_LOCK(td->td_proc); map = &td->td_proc->p_vmspace->vm_map; racct_set(td->td_proc, RACCT_MEMLOCK, ptoa(pmap_wired_count(map->pmap))); PROC_UNLOCK(td->td_proc); } #endif return (error == KERN_SUCCESS ? 0 : ENOMEM); } /* * vm_mmap_vnode() * * Helper function for vm_mmap. Perform sanity check specific for mmap * operations on vnodes. - * - * For VCHR vnodes, the vnode lock is held over the call to - * vm_mmap_cdev() to keep vp->v_rdev valid. */ int vm_mmap_vnode(struct thread *td, vm_size_t objsize, vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp, struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp, boolean_t *writecounted) { struct vattr va; vm_object_t obj; vm_offset_t foff; struct ucred *cred; int error, flags, locktype; cred = td->td_ucred; if ((*maxprotp & VM_PROT_WRITE) && (*flagsp & MAP_SHARED)) locktype = LK_EXCLUSIVE; else locktype = LK_SHARED; if ((error = vget(vp, locktype, td)) != 0) return (error); foff = *foffp; flags = *flagsp; obj = vp->v_object; if (vp->v_type == VREG) { /* * Get the proper underlying object */ if (obj == NULL) { error = EINVAL; goto done; } if (obj->type == OBJT_VNODE && obj->handle != vp) { vput(vp); vp = (struct vnode *)obj->handle; /* * Bypass filesystems obey the mpsafety of the * underlying fs. Tmpfs never bypasses. */ error = vget(vp, locktype, td); if (error != 0) return (error); } if (locktype == LK_EXCLUSIVE) { *writecounted = TRUE; vnode_pager_update_writecount(obj, 0, objsize); } - } else if (vp->v_type == VCHR) { - error = vm_mmap_cdev(td, objsize, prot, maxprotp, flagsp, - vp->v_rdev, foffp, objp); - if (error == 0) - goto mark_atime; - goto done; } else { error = EINVAL; goto done; } if ((error = VOP_GETATTR(vp, &va, cred))) goto done; #ifdef MAC - error = mac_vnode_check_mmap(cred, vp, prot, flags); + /* This relies on VM_PROT_* matching PROT_*. */ + error = mac_vnode_check_mmap(cred, vp, (int)prot, flags); if (error != 0) goto done; #endif if ((flags & MAP_SHARED) != 0) { if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) { - if (prot & PROT_WRITE) { + if (prot & VM_PROT_WRITE) { error = EPERM; goto done; } *maxprotp &= ~VM_PROT_WRITE; } } /* * If it is a regular file without any references * we do not need to sync it. * Adjust object size to be the size of actual file. */ objsize = round_page(va.va_size); if (va.va_nlink == 0) flags |= MAP_NOSYNC; if (obj->type == OBJT_VNODE) { obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff, cred); if (obj == NULL) { error = ENOMEM; goto done; } } else { KASSERT(obj->type == OBJT_DEFAULT || obj->type == OBJT_SWAP, ("wrong object type")); VM_OBJECT_WLOCK(obj); vm_object_reference_locked(obj); #if VM_NRESERVLEVEL > 0 vm_object_color(obj, 0); #endif VM_OBJECT_WUNLOCK(obj); } *objp = obj; *flagsp = flags; -mark_atime: vfs_mark_atime(vp, cred); done: if (error != 0 && *writecounted) { *writecounted = FALSE; vnode_pager_update_writecount(obj, objsize, 0); } vput(vp); return (error); } /* * vm_mmap_cdev() * * MPSAFE * * Helper function for vm_mmap. Perform sanity check specific for mmap * operations on cdevs. */ int -vm_mmap_cdev(struct thread *td, vm_size_t objsize, - vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp, - struct cdev *cdev, vm_ooffset_t *foff, vm_object_t *objp) +vm_mmap_cdev(struct thread *td, vm_size_t objsize, vm_prot_t prot, + vm_prot_t *maxprotp, int *flagsp, struct cdev *cdev, struct cdevsw *dsw, + vm_ooffset_t *foff, vm_object_t *objp) { vm_object_t obj; - struct cdevsw *dsw; - int error, flags, ref; + int error, flags; flags = *flagsp; - dsw = dev_refthread(cdev, &ref); - if (dsw == NULL) - return (ENXIO); if (dsw->d_flags & D_MMAP_ANON) { - dev_relthread(cdev, ref); + *objp = NULL; + *foff = 0; *maxprotp = VM_PROT_ALL; *flagsp |= MAP_ANON; return (0); } /* * cdevs do not provide private mappings of any kind. */ if ((*maxprotp & VM_PROT_WRITE) == 0 && - (prot & PROT_WRITE) != 0) { - dev_relthread(cdev, ref); + (prot & VM_PROT_WRITE) != 0) return (EACCES); - } - if (flags & (MAP_PRIVATE|MAP_COPY)) { - dev_relthread(cdev, ref); + if (flags & (MAP_PRIVATE|MAP_COPY)) return (EINVAL); - } /* * Force device mappings to be shared. */ flags |= MAP_SHARED; #ifdef MAC_XXX - error = mac_cdev_check_mmap(td->td_ucred, cdev, prot); - if (error != 0) { - dev_relthread(cdev, ref); + error = mac_cdev_check_mmap(td->td_ucred, cdev, (int)prot); + if (error != 0) return (error); - } #endif /* * First, try d_mmap_single(). If that is not implemented * (returns ENODEV), fall back to using the device pager. * Note that d_mmap_single() must return a reference to the * object (it needs to bump the reference count of the object * it returns somehow). * * XXX assumes VM_PROT_* == PROT_* */ error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot); - dev_relthread(cdev, ref); if (error != ENODEV) return (error); obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff, td->td_ucred); if (obj == NULL) return (EINVAL); *objp = obj; *flagsp = flags; return (0); } /* - * vm_mmap_shm() + * vm_mmap() * - * MPSAFE - * - * Helper function for vm_mmap. Perform sanity check specific for mmap - * operations on shm file descriptors. + * Internal version of mmap used by exec, sys5 shared memory, and + * various device drivers. Handle is either a vnode pointer, a + * character device, or NULL for MAP_ANON. */ int -vm_mmap_shm(struct thread *td, vm_size_t objsize, - vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp, - struct shmfd *shmfd, vm_ooffset_t foff, vm_object_t *objp) +vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, + vm_prot_t maxprot, int flags, + objtype_t handle_type, void *handle, + vm_ooffset_t foff) { + vm_object_t object; + struct thread *td = curthread; int error; + boolean_t writecounted; - if ((*flagsp & MAP_SHARED) != 0 && - (*maxprotp & VM_PROT_WRITE) == 0 && - (prot & PROT_WRITE) != 0) - return (EACCES); -#ifdef MAC - error = mac_posixshm_check_mmap(td->td_ucred, shmfd, prot, *flagsp); - if (error != 0) - return (error); -#endif - error = shm_mmap(shmfd, objsize, foff, objp); + if (size == 0) + return (EINVAL); + + size = round_page(size); + writecounted = FALSE; + + /* + * Lookup/allocate object. + */ + switch (handle_type) { + case OBJT_DEVICE: { + struct cdevsw *dsw; + struct cdev *cdev; + int ref; + + cdev = handle; + dsw = dev_refthread(cdev, &ref); + if (dsw == NULL) + return (ENXIO); + error = vm_mmap_cdev(td, size, prot, &maxprot, &flags, cdev, + dsw, &foff, &object); + dev_relthread(cdev, ref); + break; + } + case OBJT_VNODE: + error = vm_mmap_vnode(td, size, prot, &maxprot, &flags, + handle, &foff, &object, &writecounted); + break; + case OBJT_DEFAULT: + if (handle == NULL) { + error = 0; + break; + } + /* FALLTHROUGH */ + default: + error = EINVAL; + break; + } if (error) return (error); - return (0); + + error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object, + foff, writecounted, td); + if (error != 0 && object != NULL) { + /* + * If this mapping was accounted for in the vnode's + * writecount, then undo that now. + */ + if (writecounted) + vnode_pager_release_writecount(object, 0, size); + vm_object_deallocate(object); + } + return (error); } /* - * vm_mmap() - * - * MPSAFE - * - * Internal version of mmap. Currently used by mmap, exec, and sys5 - * shared memory. Handle is either a vnode pointer or NULL for MAP_ANON. + * Internal version of mmap that maps a specific VM object into an + * map. Called by mmap for MAP_ANON, vm_mmap, shm_mmap, and vn_mmap. */ int -vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, - vm_prot_t maxprot, int flags, - objtype_t handle_type, void *handle, - vm_ooffset_t foff) +vm_mmap_object(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, + vm_prot_t maxprot, int flags, vm_object_t object, vm_ooffset_t foff, + boolean_t writecounted, struct thread *td) { boolean_t fitit; - vm_object_t object = NULL; - struct thread *td = curthread; int docow, error, findspace, rv; - boolean_t writecounted; - if (size == 0) - return (0); - - size = round_page(size); - if (map == &td->td_proc->p_vmspace->vm_map) { PROC_LOCK(td->td_proc); if (map->size + size > lim_cur(td->td_proc, RLIMIT_VMEM)) { PROC_UNLOCK(td->td_proc); return (ENOMEM); } if (racct_set(td->td_proc, RACCT_VMEM, map->size + size)) { PROC_UNLOCK(td->td_proc); return (ENOMEM); } if (!old_mlock && map->flags & MAP_WIREFUTURE) { if (ptoa(pmap_wired_count(map->pmap)) + size > lim_cur(td->td_proc, RLIMIT_MEMLOCK)) { racct_set_force(td->td_proc, RACCT_VMEM, map->size); PROC_UNLOCK(td->td_proc); return (ENOMEM); } error = racct_set(td->td_proc, RACCT_MEMLOCK, ptoa(pmap_wired_count(map->pmap)) + size); if (error != 0) { racct_set_force(td->td_proc, RACCT_VMEM, map->size); PROC_UNLOCK(td->td_proc); return (error); } } PROC_UNLOCK(td->td_proc); } /* * We currently can only deal with page aligned file offsets. - * The check is here rather than in the syscall because the - * kernel calls this function internally for other mmaping - * operations (such as in exec) and non-aligned offsets will - * cause pmap inconsistencies...so we want to be sure to - * disallow this in all cases. + * The mmap() system call already enforces this by subtracting + * the page offset from the file offset, but checking here + * catches errors in device drivers (e.g. d_single_mmap() + * callbacks) and other internal mapping requests (such as in + * exec). */ if (foff & PAGE_MASK) return (EINVAL); if ((flags & MAP_FIXED) == 0) { fitit = TRUE; *addr = round_page(*addr); } else { if (*addr != trunc_page(*addr)) return (EINVAL); fitit = FALSE; } - writecounted = FALSE; - /* - * Lookup/allocate object. - */ - switch (handle_type) { - case OBJT_DEVICE: - error = vm_mmap_cdev(td, size, prot, &maxprot, &flags, - handle, &foff, &object); - break; - case OBJT_VNODE: - error = vm_mmap_vnode(td, size, prot, &maxprot, &flags, - handle, &foff, &object, &writecounted); - break; - case OBJT_SWAP: - error = vm_mmap_shm(td, size, prot, &maxprot, &flags, - handle, foff, &object); - break; - case OBJT_DEFAULT: - if (handle == NULL) { - error = 0; - break; - } - /* FALLTHROUGH */ - default: - error = EINVAL; - break; - } - if (error) - return (error); if (flags & MAP_ANON) { - object = NULL; + if (object != NULL || foff != 0) + return (EINVAL); docow = 0; - /* - * Unnamed anonymous regions always start at 0. - */ - if (handle == 0) - foff = 0; } else if (flags & MAP_PREFAULT_READ) docow = MAP_PREFAULT; else docow = MAP_PREFAULT_PARTIAL; if ((flags & (MAP_ANON|MAP_SHARED)) == 0) docow |= MAP_COPY_ON_WRITE; if (flags & MAP_NOSYNC) docow |= MAP_DISABLE_SYNCER; if (flags & MAP_NOCORE) docow |= MAP_DISABLE_COREDUMP; /* Shared memory is also shared with children. */ if (flags & MAP_SHARED) docow |= MAP_INHERIT_SHARE; if (writecounted) docow |= MAP_VN_WRITECOUNT; if (flags & MAP_STACK) { if (object != NULL) return (EINVAL); docow |= MAP_STACK_GROWS_DOWN; } if ((flags & MAP_EXCL) != 0) docow |= MAP_CHECK_EXCL; if (fitit) { if ((flags & MAP_ALIGNMENT_MASK) == MAP_ALIGNED_SUPER) findspace = VMFS_SUPER_SPACE; else if ((flags & MAP_ALIGNMENT_MASK) != 0) findspace = VMFS_ALIGNED_SPACE(flags >> MAP_ALIGNMENT_SHIFT); else findspace = VMFS_OPTIMAL_SPACE; rv = vm_map_find(map, object, foff, addr, size, #ifdef MAP_32BIT flags & MAP_32BIT ? MAP_32BIT_MAX_ADDR : #endif 0, findspace, prot, maxprot, docow); } else { rv = vm_map_fixed(map, object, foff, *addr, size, prot, maxprot, docow); } if (rv == KERN_SUCCESS) { /* * If the process has requested that all future mappings * be wired, then heed this. */ if (map->flags & MAP_WIREFUTURE) { vm_map_wire(map, *addr, *addr + size, VM_MAP_WIRE_USER | ((flags & MAP_STACK) ? VM_MAP_WIRE_HOLESOK : VM_MAP_WIRE_NOHOLES)); } - } else { - /* - * If this mapping was accounted for in the vnode's - * writecount, then undo that now. - */ - if (writecounted) - vnode_pager_release_writecount(object, 0, size); - /* - * Lose the object reference. Will destroy the - * object if it's an unnamed anonymous mapping - * or named anonymous without other references. - */ - vm_object_deallocate(object); } return (vm_mmap_to_errno(rv)); } /* * Translate a Mach VM return code to zero on success or the appropriate errno * on failure. */ int vm_mmap_to_errno(int rv) { switch (rv) { case KERN_SUCCESS: return (0); case KERN_INVALID_ADDRESS: case KERN_NO_SPACE: return (ENOMEM); case KERN_PROTECTION_FAILURE: return (EACCES); default: return (EINVAL); } }