Index: head/sys/amd64/sgx/sgx.c
===================================================================
--- head/sys/amd64/sgx/sgx.c (revision 353538)
+++ head/sys/amd64/sgx/sgx.c (revision 353539)
@@ -1,1218 +1,1218 @@
/*-
* Copyright (c) 2017 Ruslan Bukin
* All rights reserved.
*
* This software was developed by BAE Systems, the University of Cambridge
* Computer Laboratory, and Memorial University under DARPA/AFRL contract
* FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent Computing
* (TC) research program.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Design overview.
*
* The driver provides character device for mmap(2) and ioctl(2) system calls
* allowing user to manage isolated compartments ("enclaves") in user VA space.
*
* The driver duties is EPC pages management, enclave management, user data
* validation.
*
* This driver requires Intel SGX support from hardware.
*
* /dev/sgx:
* .mmap:
* sgx_mmap_single() allocates VM object with following pager
* operations:
* a) sgx_pg_ctor():
* VM object constructor does nothing
* b) sgx_pg_dtor():
* VM object destructor destroys the SGX enclave associated
* with the object: it frees all the EPC pages allocated for
* enclave and removes the enclave.
* c) sgx_pg_fault():
* VM object fault handler does nothing
*
* .ioctl:
* sgx_ioctl():
* a) SGX_IOC_ENCLAVE_CREATE
* Adds Enclave SECS page: initial step of enclave creation.
* b) SGX_IOC_ENCLAVE_ADD_PAGE
* Adds TCS, REG pages to the enclave.
* c) SGX_IOC_ENCLAVE_INIT
* Finalizes enclave creation.
*
* Enclave lifecycle:
* .-- ECREATE -- Add SECS page
* Kernel | EADD -- Add TCS, REG pages
* space | EEXTEND -- Measure the page (take unique hash)
* ENCLS | EPA -- Allocate version array page
* '-- EINIT -- Finalize enclave creation
* User .-- EENTER -- Go to entry point of enclave
* space | EEXIT -- Exit back to main application
* ENCLU '-- ERESUME -- Resume enclave execution (e.g. after exception)
*
* Enclave lifecycle from driver point of view:
* 1) User calls mmap() on /dev/sgx: we allocate a VM object
* 2) User calls ioctl SGX_IOC_ENCLAVE_CREATE: we look for the VM object
* associated with user process created on step 1, create SECS physical
* page and store it in enclave's VM object queue by special index
* SGX_SECS_VM_OBJECT_INDEX.
* 3) User calls ioctl SGX_IOC_ENCLAVE_ADD_PAGE: we look for enclave created
* on step 2, create TCS or REG physical page and map it to specified by
* user address of enclave VM object.
* 4) User finalizes enclave creation with ioctl SGX_IOC_ENCLAVE_INIT call.
* 5) User can freely enter to and exit from enclave using ENCLU instructions
* from userspace: the driver does nothing here.
* 6) User proceed munmap(2) system call (or the process with enclave dies):
* we destroy the enclave associated with the object.
*
* EPC page types and their indexes in VM object queue:
* - PT_SECS index is special and equals SGX_SECS_VM_OBJECT_INDEX (-1);
* - PT_TCS and PT_REG indexes are specified by user in addr field of ioctl
* request data and determined as follows:
* pidx = OFF_TO_IDX(addp->addr - vmh->base);
* - PT_VA index is special, created for PT_REG, PT_TCS and PT_SECS pages
* and determined by formula:
* va_page_idx = - SGX_VA_PAGES_OFFS - (page_idx / SGX_VA_PAGE_SLOTS);
* PT_VA page can hold versions of up to 512 pages, and slot for each
* page in PT_VA page is determined as follows:
* va_slot_idx = page_idx % SGX_VA_PAGE_SLOTS;
* - PT_TRIM is unused.
*
* Locking:
* SGX ENCLS set of instructions have limitations on concurrency:
* some instructions can't be executed same time on different CPUs.
* We use sc->mtx_encls lock around them to prevent concurrent execution.
* sc->mtx lock is used to manage list of created enclaves and the state of
* SGX driver.
*
* Eviction of EPC pages:
* Eviction support is not implemented in this driver, however the driver
* manages VA (version array) pages: it allocates a VA slot for each EPC
* page. This will be required for eviction support in future.
* VA pages and slots are currently unused.
*
* Intel® 64 and IA-32 Architectures Software Developer's Manual
* https://software.intel.com/en-us/articles/intel-sdm
*/
#include
__FBSDID("$FreeBSD$");
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#define SGX_DEBUG
#undef SGX_DEBUG
#ifdef SGX_DEBUG
#define dprintf(fmt, ...) printf(fmt, ##__VA_ARGS__)
#else
#define dprintf(fmt, ...)
#endif
static struct cdev_pager_ops sgx_pg_ops;
struct sgx_softc sgx_sc;
static int
sgx_get_epc_page(struct sgx_softc *sc, struct epc_page **epc)
{
vmem_addr_t addr;
int i;
if (vmem_alloc(sc->vmem_epc, PAGE_SIZE, M_FIRSTFIT | M_NOWAIT,
&addr) == 0) {
i = (addr - sc->epc_base) / PAGE_SIZE;
*epc = &sc->epc_pages[i];
return (0);
}
return (ENOMEM);
}
static void
sgx_put_epc_page(struct sgx_softc *sc, struct epc_page *epc)
{
vmem_addr_t addr;
if (epc == NULL)
return;
addr = (epc->index * PAGE_SIZE) + sc->epc_base;
vmem_free(sc->vmem_epc, addr, PAGE_SIZE);
}
static int
sgx_va_slot_init_by_index(struct sgx_softc *sc, vm_object_t object,
uint64_t idx)
{
struct epc_page *epc;
vm_page_t page;
vm_page_t p;
int ret;
VM_OBJECT_ASSERT_WLOCKED(object);
p = vm_page_lookup(object, idx);
if (p == NULL) {
ret = sgx_get_epc_page(sc, &epc);
if (ret) {
dprintf("%s: No free EPC pages available.\n",
__func__);
return (ret);
}
mtx_lock(&sc->mtx_encls);
sgx_epa((void *)epc->base);
mtx_unlock(&sc->mtx_encls);
page = PHYS_TO_VM_PAGE(epc->phys);
- vm_page_insert(page, object, idx);
page->valid = VM_PAGE_BITS_ALL;
+ vm_page_insert(page, object, idx);
}
return (0);
}
static int
sgx_va_slot_init(struct sgx_softc *sc,
struct sgx_enclave *enclave,
uint64_t addr)
{
vm_pindex_t pidx;
uint64_t va_page_idx;
uint64_t idx;
vm_object_t object;
int va_slot;
int ret;
object = enclave->object;
VM_OBJECT_ASSERT_WLOCKED(object);
pidx = OFF_TO_IDX(addr);
va_slot = pidx % SGX_VA_PAGE_SLOTS;
va_page_idx = pidx / SGX_VA_PAGE_SLOTS;
idx = - SGX_VA_PAGES_OFFS - va_page_idx;
ret = sgx_va_slot_init_by_index(sc, object, idx);
return (ret);
}
static int
sgx_mem_find(struct sgx_softc *sc, uint64_t addr,
vm_map_entry_t *entry0, vm_object_t *object0)
{
vm_map_t map;
vm_map_entry_t entry;
vm_object_t object;
map = &curproc->p_vmspace->vm_map;
vm_map_lock_read(map);
if (!vm_map_lookup_entry(map, addr, &entry)) {
vm_map_unlock_read(map);
dprintf("%s: Can't find enclave.\n", __func__);
return (EINVAL);
}
object = entry->object.vm_object;
if (object == NULL || object->handle == NULL) {
vm_map_unlock_read(map);
return (EINVAL);
}
if (object->type != OBJT_MGTDEVICE ||
object->un_pager.devp.ops != &sgx_pg_ops) {
vm_map_unlock_read(map);
return (EINVAL);
}
vm_object_reference(object);
*object0 = object;
*entry0 = entry;
vm_map_unlock_read(map);
return (0);
}
static int
sgx_enclave_find(struct sgx_softc *sc, uint64_t addr,
struct sgx_enclave **encl)
{
struct sgx_vm_handle *vmh;
struct sgx_enclave *enclave;
vm_map_entry_t entry;
vm_object_t object;
int ret;
ret = sgx_mem_find(sc, addr, &entry, &object);
if (ret)
return (ret);
vmh = object->handle;
if (vmh == NULL) {
vm_object_deallocate(object);
return (EINVAL);
}
enclave = vmh->enclave;
if (enclave == NULL || enclave->object == NULL) {
vm_object_deallocate(object);
return (EINVAL);
}
*encl = enclave;
return (0);
}
static int
sgx_enclave_alloc(struct sgx_softc *sc, struct secs *secs,
struct sgx_enclave **enclave0)
{
struct sgx_enclave *enclave;
enclave = malloc(sizeof(struct sgx_enclave),
M_SGX, M_WAITOK | M_ZERO);
enclave->base = secs->base;
enclave->size = secs->size;
*enclave0 = enclave;
return (0);
}
static void
sgx_epc_page_remove(struct sgx_softc *sc,
struct epc_page *epc)
{
mtx_lock(&sc->mtx_encls);
sgx_eremove((void *)epc->base);
mtx_unlock(&sc->mtx_encls);
}
static void
sgx_page_remove(struct sgx_softc *sc, vm_page_t p)
{
struct epc_page *epc;
vm_paddr_t pa;
uint64_t offs;
(void)vm_page_remove(p);
dprintf("%s: p->pidx %ld\n", __func__, p->pindex);
pa = VM_PAGE_TO_PHYS(p);
epc = &sc->epc_pages[0];
offs = (pa - epc->phys) / PAGE_SIZE;
epc = &sc->epc_pages[offs];
sgx_epc_page_remove(sc, epc);
sgx_put_epc_page(sc, epc);
}
static void
sgx_enclave_remove(struct sgx_softc *sc,
struct sgx_enclave *enclave)
{
vm_object_t object;
vm_page_t p, p_secs, p_next;
mtx_lock(&sc->mtx);
TAILQ_REMOVE(&sc->enclaves, enclave, next);
mtx_unlock(&sc->mtx);
object = enclave->object;
VM_OBJECT_WLOCK(object);
/*
* First remove all the pages except SECS,
* then remove SECS page.
*/
p_secs = NULL;
TAILQ_FOREACH_SAFE(p, &object->memq, listq, p_next) {
if (p->pindex == SGX_SECS_VM_OBJECT_INDEX) {
p_secs = p;
continue;
}
sgx_page_remove(sc, p);
}
/* Now remove SECS page */
if (p_secs != NULL)
sgx_page_remove(sc, p_secs);
KASSERT(TAILQ_EMPTY(&object->memq) == 1, ("not empty"));
KASSERT(object->resident_page_count == 0, ("count"));
VM_OBJECT_WUNLOCK(object);
}
static int
sgx_measure_page(struct sgx_softc *sc, struct epc_page *secs,
struct epc_page *epc, uint16_t mrmask)
{
int i, j;
int ret;
mtx_lock(&sc->mtx_encls);
for (i = 0, j = 1; i < PAGE_SIZE; i += 0x100, j <<= 1) {
if (!(j & mrmask))
continue;
ret = sgx_eextend((void *)secs->base,
(void *)(epc->base + i));
if (ret == SGX_EFAULT) {
mtx_unlock(&sc->mtx_encls);
return (ret);
}
}
mtx_unlock(&sc->mtx_encls);
return (0);
}
static int
sgx_secs_validate(struct sgx_softc *sc, struct secs *secs)
{
struct secs_attr *attr;
int i;
if (secs->size == 0)
return (EINVAL);
/* BASEADDR must be naturally aligned on an SECS.SIZE boundary. */
if (secs->base & (secs->size - 1))
return (EINVAL);
/* SECS.SIZE must be at least 2 pages. */
if (secs->size < 2 * PAGE_SIZE)
return (EINVAL);
if ((secs->size & (secs->size - 1)) != 0)
return (EINVAL);
attr = &secs->attributes;
if (attr->reserved1 != 0 ||
attr->reserved2 != 0 ||
attr->reserved3 != 0)
return (EINVAL);
for (i = 0; i < SECS_ATTR_RSV4_SIZE; i++)
if (attr->reserved4[i])
return (EINVAL);
/*
* Intel® Software Guard Extensions Programming Reference
* 6.7.2 Relevant Fields in Various Data Structures
* 6.7.2.1 SECS.ATTRIBUTES.XFRM
* XFRM[1:0] must be set to 0x3.
*/
if ((attr->xfrm & 0x3) != 0x3)
return (EINVAL);
if (!attr->mode64bit)
return (EINVAL);
if (secs->size > sc->enclave_size_max)
return (EINVAL);
for (i = 0; i < SECS_RSV1_SIZE; i++)
if (secs->reserved1[i])
return (EINVAL);
for (i = 0; i < SECS_RSV2_SIZE; i++)
if (secs->reserved2[i])
return (EINVAL);
for (i = 0; i < SECS_RSV3_SIZE; i++)
if (secs->reserved3[i])
return (EINVAL);
for (i = 0; i < SECS_RSV4_SIZE; i++)
if (secs->reserved4[i])
return (EINVAL);
return (0);
}
static int
sgx_tcs_validate(struct tcs *tcs)
{
int i;
if ((tcs->flags) ||
(tcs->ossa & (PAGE_SIZE - 1)) ||
(tcs->ofsbasgx & (PAGE_SIZE - 1)) ||
(tcs->ogsbasgx & (PAGE_SIZE - 1)) ||
((tcs->fslimit & 0xfff) != 0xfff) ||
((tcs->gslimit & 0xfff) != 0xfff))
return (EINVAL);
for (i = 0; i < nitems(tcs->reserved3); i++)
if (tcs->reserved3[i])
return (EINVAL);
return (0);
}
static void
sgx_tcs_dump(struct sgx_softc *sc, struct tcs *t)
{
dprintf("t->flags %lx\n", t->flags);
dprintf("t->ossa %lx\n", t->ossa);
dprintf("t->cssa %x\n", t->cssa);
dprintf("t->nssa %x\n", t->nssa);
dprintf("t->oentry %lx\n", t->oentry);
dprintf("t->ofsbasgx %lx\n", t->ofsbasgx);
dprintf("t->ogsbasgx %lx\n", t->ogsbasgx);
dprintf("t->fslimit %x\n", t->fslimit);
dprintf("t->gslimit %x\n", t->gslimit);
}
static int
sgx_pg_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot,
vm_ooffset_t foff, struct ucred *cred, u_short *color)
{
struct sgx_vm_handle *vmh;
vmh = handle;
if (vmh == NULL) {
dprintf("%s: vmh not found.\n", __func__);
return (0);
}
dprintf("%s: vmh->base %lx foff 0x%lx size 0x%lx\n",
__func__, vmh->base, foff, size);
return (0);
}
static void
sgx_pg_dtor(void *handle)
{
struct sgx_vm_handle *vmh;
struct sgx_softc *sc;
vmh = handle;
if (vmh == NULL) {
dprintf("%s: vmh not found.\n", __func__);
return;
}
sc = vmh->sc;
if (sc == NULL) {
dprintf("%s: sc is NULL\n", __func__);
return;
}
if (vmh->enclave == NULL) {
dprintf("%s: Enclave not found.\n", __func__);
return;
}
sgx_enclave_remove(sc, vmh->enclave);
free(vmh->enclave, M_SGX);
free(vmh, M_SGX);
}
static int
sgx_pg_fault(vm_object_t object, vm_ooffset_t offset,
int prot, vm_page_t *mres)
{
/*
* The purpose of this trivial handler is to handle the race
* when user tries to access mmaped region before or during
* enclave creation ioctl calls.
*/
dprintf("%s: offset 0x%lx\n", __func__, offset);
return (VM_PAGER_FAIL);
}
static struct cdev_pager_ops sgx_pg_ops = {
.cdev_pg_ctor = sgx_pg_ctor,
.cdev_pg_dtor = sgx_pg_dtor,
.cdev_pg_fault = sgx_pg_fault,
};
static void
sgx_insert_epc_page_by_index(vm_page_t page, vm_object_t object,
vm_pindex_t pidx)
{
VM_OBJECT_ASSERT_WLOCKED(object);
- vm_page_insert(page, object, pidx);
page->valid = VM_PAGE_BITS_ALL;
+ vm_page_insert(page, object, pidx);
}
static void
sgx_insert_epc_page(struct sgx_enclave *enclave,
struct epc_page *epc, uint64_t addr)
{
vm_pindex_t pidx;
vm_page_t page;
VM_OBJECT_ASSERT_WLOCKED(enclave->object);
pidx = OFF_TO_IDX(addr);
page = PHYS_TO_VM_PAGE(epc->phys);
sgx_insert_epc_page_by_index(page, enclave->object, pidx);
}
static int
sgx_ioctl_create(struct sgx_softc *sc, struct sgx_enclave_create *param)
{
struct sgx_vm_handle *vmh;
vm_map_entry_t entry;
vm_page_t p;
struct page_info pginfo;
struct secinfo secinfo;
struct sgx_enclave *enclave;
struct epc_page *epc;
struct secs *secs;
vm_object_t object;
vm_page_t page;
int ret;
epc = NULL;
secs = NULL;
enclave = NULL;
object = NULL;
/* SGX Enclave Control Structure (SECS) */
secs = malloc(PAGE_SIZE, M_SGX, M_WAITOK | M_ZERO);
ret = copyin((void *)param->src, secs, sizeof(struct secs));
if (ret) {
dprintf("%s: Can't copy SECS.\n", __func__);
goto error;
}
ret = sgx_secs_validate(sc, secs);
if (ret) {
dprintf("%s: SECS validation failed.\n", __func__);
goto error;
}
ret = sgx_mem_find(sc, secs->base, &entry, &object);
if (ret) {
dprintf("%s: Can't find vm_map.\n", __func__);
goto error;
}
vmh = object->handle;
if (!vmh) {
dprintf("%s: Can't find vmh.\n", __func__);
ret = ENXIO;
goto error;
}
dprintf("%s: entry start %lx offset %lx\n",
__func__, entry->start, entry->offset);
vmh->base = (entry->start - entry->offset);
ret = sgx_enclave_alloc(sc, secs, &enclave);
if (ret) {
dprintf("%s: Can't alloc enclave.\n", __func__);
goto error;
}
enclave->object = object;
enclave->vmh = vmh;
memset(&secinfo, 0, sizeof(struct secinfo));
memset(&pginfo, 0, sizeof(struct page_info));
pginfo.linaddr = 0;
pginfo.srcpge = (uint64_t)secs;
pginfo.secinfo = &secinfo;
pginfo.secs = 0;
ret = sgx_get_epc_page(sc, &epc);
if (ret) {
dprintf("%s: Failed to get free epc page.\n", __func__);
goto error;
}
enclave->secs_epc_page = epc;
VM_OBJECT_WLOCK(object);
p = vm_page_lookup(object, SGX_SECS_VM_OBJECT_INDEX);
if (p) {
VM_OBJECT_WUNLOCK(object);
/* SECS page already added. */
ret = ENXIO;
goto error;
}
ret = sgx_va_slot_init_by_index(sc, object,
- SGX_VA_PAGES_OFFS - SGX_SECS_VM_OBJECT_INDEX);
if (ret) {
VM_OBJECT_WUNLOCK(object);
dprintf("%s: Can't init va slot.\n", __func__);
goto error;
}
mtx_lock(&sc->mtx);
if ((sc->state & SGX_STATE_RUNNING) == 0) {
mtx_unlock(&sc->mtx);
/* Remove VA page that was just created for SECS page. */
p = vm_page_lookup(enclave->object,
- SGX_VA_PAGES_OFFS - SGX_SECS_VM_OBJECT_INDEX);
sgx_page_remove(sc, p);
VM_OBJECT_WUNLOCK(object);
goto error;
}
mtx_lock(&sc->mtx_encls);
ret = sgx_ecreate(&pginfo, (void *)epc->base);
mtx_unlock(&sc->mtx_encls);
if (ret == SGX_EFAULT) {
dprintf("%s: gp fault\n", __func__);
mtx_unlock(&sc->mtx);
/* Remove VA page that was just created for SECS page. */
p = vm_page_lookup(enclave->object,
- SGX_VA_PAGES_OFFS - SGX_SECS_VM_OBJECT_INDEX);
sgx_page_remove(sc, p);
VM_OBJECT_WUNLOCK(object);
goto error;
}
TAILQ_INSERT_TAIL(&sc->enclaves, enclave, next);
mtx_unlock(&sc->mtx);
vmh->enclave = enclave;
page = PHYS_TO_VM_PAGE(epc->phys);
sgx_insert_epc_page_by_index(page, enclave->object,
SGX_SECS_VM_OBJECT_INDEX);
VM_OBJECT_WUNLOCK(object);
/* Release the reference. */
vm_object_deallocate(object);
free(secs, M_SGX);
return (0);
error:
free(secs, M_SGX);
sgx_put_epc_page(sc, epc);
free(enclave, M_SGX);
vm_object_deallocate(object);
return (ret);
}
static int
sgx_ioctl_add_page(struct sgx_softc *sc,
struct sgx_enclave_add_page *addp)
{
struct epc_page *secs_epc_page;
struct sgx_enclave *enclave;
struct sgx_vm_handle *vmh;
struct epc_page *epc;
struct page_info pginfo;
struct secinfo secinfo;
vm_object_t object;
void *tmp_vaddr;
uint64_t page_type;
struct tcs *t;
uint64_t addr;
uint64_t pidx;
vm_page_t p;
int ret;
tmp_vaddr = NULL;
epc = NULL;
object = NULL;
/* Find and get reference to VM object. */
ret = sgx_enclave_find(sc, addp->addr, &enclave);
if (ret) {
dprintf("%s: Failed to find enclave.\n", __func__);
goto error;
}
object = enclave->object;
KASSERT(object != NULL, ("vm object is NULL\n"));
vmh = object->handle;
ret = sgx_get_epc_page(sc, &epc);
if (ret) {
dprintf("%s: Failed to get free epc page.\n", __func__);
goto error;
}
memset(&secinfo, 0, sizeof(struct secinfo));
ret = copyin((void *)addp->secinfo, &secinfo,
sizeof(struct secinfo));
if (ret) {
dprintf("%s: Failed to copy secinfo.\n", __func__);
goto error;
}
tmp_vaddr = malloc(PAGE_SIZE, M_SGX, M_WAITOK | M_ZERO);
ret = copyin((void *)addp->src, tmp_vaddr, PAGE_SIZE);
if (ret) {
dprintf("%s: Failed to copy page.\n", __func__);
goto error;
}
page_type = (secinfo.flags & SECINFO_FLAGS_PT_M) >>
SECINFO_FLAGS_PT_S;
if (page_type != SGX_PT_TCS && page_type != SGX_PT_REG) {
dprintf("%s: page can't be added.\n", __func__);
goto error;
}
if (page_type == SGX_PT_TCS) {
t = (struct tcs *)tmp_vaddr;
ret = sgx_tcs_validate(t);
if (ret) {
dprintf("%s: TCS page validation failed.\n",
__func__);
goto error;
}
sgx_tcs_dump(sc, t);
}
addr = (addp->addr - vmh->base);
pidx = OFF_TO_IDX(addr);
VM_OBJECT_WLOCK(object);
p = vm_page_lookup(object, pidx);
if (p) {
VM_OBJECT_WUNLOCK(object);
/* Page already added. */
ret = ENXIO;
goto error;
}
ret = sgx_va_slot_init(sc, enclave, addr);
if (ret) {
VM_OBJECT_WUNLOCK(object);
dprintf("%s: Can't init va slot.\n", __func__);
goto error;
}
secs_epc_page = enclave->secs_epc_page;
memset(&pginfo, 0, sizeof(struct page_info));
pginfo.linaddr = (uint64_t)addp->addr;
pginfo.srcpge = (uint64_t)tmp_vaddr;
pginfo.secinfo = &secinfo;
pginfo.secs = (uint64_t)secs_epc_page->base;
mtx_lock(&sc->mtx_encls);
ret = sgx_eadd(&pginfo, (void *)epc->base);
if (ret == SGX_EFAULT) {
dprintf("%s: gp fault on eadd\n", __func__);
mtx_unlock(&sc->mtx_encls);
VM_OBJECT_WUNLOCK(object);
goto error;
}
mtx_unlock(&sc->mtx_encls);
ret = sgx_measure_page(sc, enclave->secs_epc_page, epc, addp->mrmask);
if (ret == SGX_EFAULT) {
dprintf("%s: gp fault on eextend\n", __func__);
sgx_epc_page_remove(sc, epc);
VM_OBJECT_WUNLOCK(object);
goto error;
}
sgx_insert_epc_page(enclave, epc, addr);
VM_OBJECT_WUNLOCK(object);
/* Release the reference. */
vm_object_deallocate(object);
free(tmp_vaddr, M_SGX);
return (0);
error:
free(tmp_vaddr, M_SGX);
sgx_put_epc_page(sc, epc);
vm_object_deallocate(object);
return (ret);
}
static int
sgx_ioctl_init(struct sgx_softc *sc, struct sgx_enclave_init *initp)
{
struct epc_page *secs_epc_page;
struct sgx_enclave *enclave;
struct thread *td;
void *tmp_vaddr;
void *einittoken;
void *sigstruct;
vm_object_t object;
int retry;
int ret;
td = curthread;
tmp_vaddr = NULL;
object = NULL;
dprintf("%s: addr %lx, sigstruct %lx, einittoken %lx\n",
__func__, initp->addr, initp->sigstruct, initp->einittoken);
/* Find and get reference to VM object. */
ret = sgx_enclave_find(sc, initp->addr, &enclave);
if (ret) {
dprintf("%s: Failed to find enclave.\n", __func__);
goto error;
}
object = enclave->object;
tmp_vaddr = malloc(PAGE_SIZE, M_SGX, M_WAITOK | M_ZERO);
sigstruct = tmp_vaddr;
einittoken = (void *)((uint64_t)sigstruct + PAGE_SIZE / 2);
ret = copyin((void *)initp->sigstruct, sigstruct,
SGX_SIGSTRUCT_SIZE);
if (ret) {
dprintf("%s: Failed to copy SIGSTRUCT page.\n", __func__);
goto error;
}
ret = copyin((void *)initp->einittoken, einittoken,
SGX_EINITTOKEN_SIZE);
if (ret) {
dprintf("%s: Failed to copy EINITTOKEN page.\n", __func__);
goto error;
}
secs_epc_page = enclave->secs_epc_page;
retry = 16;
do {
mtx_lock(&sc->mtx_encls);
ret = sgx_einit(sigstruct, (void *)secs_epc_page->base,
einittoken);
mtx_unlock(&sc->mtx_encls);
dprintf("%s: sgx_einit returned %d\n", __func__, ret);
} while (ret == SGX_UNMASKED_EVENT && retry--);
if (ret) {
dprintf("%s: Failed init enclave: %d\n", __func__, ret);
td->td_retval[0] = ret;
ret = 0;
}
error:
free(tmp_vaddr, M_SGX);
/* Release the reference. */
vm_object_deallocate(object);
return (ret);
}
static int
sgx_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags,
struct thread *td)
{
struct sgx_enclave_add_page *addp;
struct sgx_enclave_create *param;
struct sgx_enclave_init *initp;
struct sgx_softc *sc;
int ret;
int len;
sc = &sgx_sc;
len = IOCPARM_LEN(cmd);
dprintf("%s: cmd %lx, addr %lx, len %d\n",
__func__, cmd, (uint64_t)addr, len);
if (len > SGX_IOCTL_MAX_DATA_LEN)
return (EINVAL);
switch (cmd) {
case SGX_IOC_ENCLAVE_CREATE:
param = (struct sgx_enclave_create *)addr;
ret = sgx_ioctl_create(sc, param);
break;
case SGX_IOC_ENCLAVE_ADD_PAGE:
addp = (struct sgx_enclave_add_page *)addr;
ret = sgx_ioctl_add_page(sc, addp);
break;
case SGX_IOC_ENCLAVE_INIT:
initp = (struct sgx_enclave_init *)addr;
ret = sgx_ioctl_init(sc, initp);
break;
default:
return (EINVAL);
}
return (ret);
}
static int
sgx_mmap_single(struct cdev *cdev, vm_ooffset_t *offset,
vm_size_t mapsize, struct vm_object **objp, int nprot)
{
struct sgx_vm_handle *vmh;
struct sgx_softc *sc;
sc = &sgx_sc;
dprintf("%s: mapsize 0x%lx, offset %lx\n",
__func__, mapsize, *offset);
vmh = malloc(sizeof(struct sgx_vm_handle),
M_SGX, M_WAITOK | M_ZERO);
vmh->sc = sc;
vmh->size = mapsize;
vmh->mem = cdev_pager_allocate(vmh, OBJT_MGTDEVICE, &sgx_pg_ops,
mapsize, nprot, *offset, NULL);
if (vmh->mem == NULL) {
free(vmh, M_SGX);
return (ENOMEM);
}
VM_OBJECT_WLOCK(vmh->mem);
vm_object_set_flag(vmh->mem, OBJ_PG_DTOR);
VM_OBJECT_WUNLOCK(vmh->mem);
*objp = vmh->mem;
return (0);
}
static struct cdevsw sgx_cdevsw = {
.d_version = D_VERSION,
.d_ioctl = sgx_ioctl,
.d_mmap_single = sgx_mmap_single,
.d_name = "Intel SGX",
};
static int
sgx_get_epc_area(struct sgx_softc *sc)
{
vm_offset_t epc_base_vaddr;
u_int cp[4];
int error;
int i;
cpuid_count(SGX_CPUID, 0x2, cp);
sc->epc_base = ((uint64_t)(cp[1] & 0xfffff) << 32) +
(cp[0] & 0xfffff000);
sc->epc_size = ((uint64_t)(cp[3] & 0xfffff) << 32) +
(cp[2] & 0xfffff000);
sc->npages = sc->epc_size / SGX_PAGE_SIZE;
if (sc->epc_size == 0 || sc->epc_base == 0) {
printf("%s: Incorrect EPC data: EPC base %lx, size %lu\n",
__func__, sc->epc_base, sc->epc_size);
return (EINVAL);
}
if (cp[3] & 0xffff)
sc->enclave_size_max = (1 << ((cp[3] >> 8) & 0xff));
else
sc->enclave_size_max = SGX_ENCL_SIZE_MAX_DEF;
epc_base_vaddr = (vm_offset_t)pmap_mapdev_attr(sc->epc_base,
sc->epc_size, VM_MEMATTR_DEFAULT);
sc->epc_pages = malloc(sizeof(struct epc_page) * sc->npages,
M_DEVBUF, M_WAITOK | M_ZERO);
for (i = 0; i < sc->npages; i++) {
sc->epc_pages[i].base = epc_base_vaddr + SGX_PAGE_SIZE * i;
sc->epc_pages[i].phys = sc->epc_base + SGX_PAGE_SIZE * i;
sc->epc_pages[i].index = i;
}
sc->vmem_epc = vmem_create("SGX EPC", sc->epc_base, sc->epc_size,
PAGE_SIZE, PAGE_SIZE, M_FIRSTFIT | M_WAITOK);
if (sc->vmem_epc == NULL) {
printf("%s: Can't create vmem arena.\n", __func__);
free(sc->epc_pages, M_SGX);
return (EINVAL);
}
error = vm_phys_fictitious_reg_range(sc->epc_base,
sc->epc_base + sc->epc_size, VM_MEMATTR_DEFAULT);
if (error) {
printf("%s: Can't register fictitious space.\n", __func__);
free(sc->epc_pages, M_SGX);
return (EINVAL);
}
return (0);
}
static void
sgx_put_epc_area(struct sgx_softc *sc)
{
vm_phys_fictitious_unreg_range(sc->epc_base,
sc->epc_base + sc->epc_size);
free(sc->epc_pages, M_SGX);
}
static int
sgx_load(void)
{
struct sgx_softc *sc;
int error;
sc = &sgx_sc;
if ((cpu_stdext_feature & CPUID_STDEXT_SGX) == 0)
return (ENXIO);
error = sgx_get_epc_area(sc);
if (error) {
printf("%s: Failed to get Processor Reserved Memory area.\n",
__func__);
return (ENXIO);
}
mtx_init(&sc->mtx_encls, "SGX ENCLS", NULL, MTX_DEF);
mtx_init(&sc->mtx, "SGX driver", NULL, MTX_DEF);
TAILQ_INIT(&sc->enclaves);
sc->sgx_cdev = make_dev(&sgx_cdevsw, 0, UID_ROOT, GID_WHEEL,
0600, "isgx");
sc->state |= SGX_STATE_RUNNING;
printf("SGX initialized: EPC base 0x%lx size %ld (%d pages)\n",
sc->epc_base, sc->epc_size, sc->npages);
return (0);
}
static int
sgx_unload(void)
{
struct sgx_softc *sc;
sc = &sgx_sc;
if ((sc->state & SGX_STATE_RUNNING) == 0)
return (0);
mtx_lock(&sc->mtx);
if (!TAILQ_EMPTY(&sc->enclaves)) {
mtx_unlock(&sc->mtx);
return (EBUSY);
}
sc->state &= ~SGX_STATE_RUNNING;
mtx_unlock(&sc->mtx);
destroy_dev(sc->sgx_cdev);
vmem_destroy(sc->vmem_epc);
sgx_put_epc_area(sc);
mtx_destroy(&sc->mtx_encls);
mtx_destroy(&sc->mtx);
return (0);
}
static int
sgx_handler(module_t mod, int what, void *arg)
{
int error;
switch (what) {
case MOD_LOAD:
error = sgx_load();
break;
case MOD_UNLOAD:
error = sgx_unload();
break;
default:
error = 0;
break;
}
return (error);
}
static moduledata_t sgx_kmod = {
"sgx",
sgx_handler,
NULL
};
DECLARE_MODULE(sgx, sgx_kmod, SI_SUB_LAST, SI_ORDER_ANY);
MODULE_VERSION(sgx, 1);
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c (revision 353538)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c (revision 353539)
@@ -1,2734 +1,2740 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
* Copyright (c) 2019 Datto Inc.
*/
/* Copyright (c) 2013 by Saso Kiselkov. All rights reserved. */
/* Copyright (c) 2013, Joyent, Inc. All rights reserved. */
/* Copyright 2016 Nexenta Systems, Inc. All rights reserved. */
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#ifdef _KERNEL
#include
#include
#include
#endif
/*
* Enable/disable nopwrite feature.
*/
int zfs_nopwrite_enabled = 1;
SYSCTL_DECL(_vfs_zfs);
SYSCTL_INT(_vfs_zfs, OID_AUTO, nopwrite_enabled, CTLFLAG_RDTUN,
&zfs_nopwrite_enabled, 0, "Enable nopwrite feature");
/*
* Tunable to control percentage of dirtied L1 blocks from frees allowed into
* one TXG. After this threshold is crossed, additional dirty blocks from frees
* will wait until the next TXG.
* A value of zero will disable this throttle.
*/
uint32_t zfs_per_txg_dirty_frees_percent = 5;
SYSCTL_INT(_vfs_zfs, OID_AUTO, per_txg_dirty_frees_percent, CTLFLAG_RWTUN,
&zfs_per_txg_dirty_frees_percent, 0,
"Percentage of dirtied indirect blocks from frees allowed in one txg");
/*
* This can be used for testing, to ensure that certain actions happen
* while in the middle of a remap (which might otherwise complete too
* quickly).
*/
int zfs_object_remap_one_indirect_delay_ticks = 0;
const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
{ DMU_BSWAP_UINT8, TRUE, FALSE, "unallocated" },
{ DMU_BSWAP_ZAP, TRUE, TRUE, "object directory" },
{ DMU_BSWAP_UINT64, TRUE, TRUE, "object array" },
{ DMU_BSWAP_UINT8, TRUE, FALSE, "packed nvlist" },
{ DMU_BSWAP_UINT64, TRUE, FALSE, "packed nvlist size" },
{ DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj" },
{ DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj header" },
{ DMU_BSWAP_UINT64, TRUE, FALSE, "SPA space map header" },
{ DMU_BSWAP_UINT64, TRUE, FALSE, "SPA space map" },
{ DMU_BSWAP_UINT64, TRUE, FALSE, "ZIL intent log" },
{ DMU_BSWAP_DNODE, TRUE, FALSE, "DMU dnode" },
{ DMU_BSWAP_OBJSET, TRUE, TRUE, "DMU objset" },
{ DMU_BSWAP_UINT64, TRUE, TRUE, "DSL directory" },
{ DMU_BSWAP_ZAP, TRUE, TRUE, "DSL directory child map" },
{ DMU_BSWAP_ZAP, TRUE, TRUE, "DSL dataset snap map" },
{ DMU_BSWAP_ZAP, TRUE, TRUE, "DSL props" },
{ DMU_BSWAP_UINT64, TRUE, TRUE, "DSL dataset" },
{ DMU_BSWAP_ZNODE, TRUE, FALSE, "ZFS znode" },
{ DMU_BSWAP_OLDACL, TRUE, FALSE, "ZFS V0 ACL" },
{ DMU_BSWAP_UINT8, FALSE, FALSE, "ZFS plain file" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS directory" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS master node" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS delete queue" },
{ DMU_BSWAP_UINT8, FALSE, FALSE, "zvol object" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "zvol prop" },
{ DMU_BSWAP_UINT8, FALSE, FALSE, "other uint8[]" },
{ DMU_BSWAP_UINT64, FALSE, FALSE, "other uint64[]" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "other ZAP" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "persistent error log" },
{ DMU_BSWAP_UINT8, TRUE, FALSE, "SPA history" },
{ DMU_BSWAP_UINT64, TRUE, FALSE, "SPA history offsets" },
{ DMU_BSWAP_ZAP, TRUE, TRUE, "Pool properties" },
{ DMU_BSWAP_ZAP, TRUE, TRUE, "DSL permissions" },
{ DMU_BSWAP_ACL, TRUE, FALSE, "ZFS ACL" },
{ DMU_BSWAP_UINT8, TRUE, FALSE, "ZFS SYSACL" },
{ DMU_BSWAP_UINT8, TRUE, FALSE, "FUID table" },
{ DMU_BSWAP_UINT64, TRUE, FALSE, "FUID table size" },
{ DMU_BSWAP_ZAP, TRUE, TRUE, "DSL dataset next clones" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "scan work queue" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS user/group used" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS user/group quota" },
{ DMU_BSWAP_ZAP, TRUE, TRUE, "snapshot refcount tags" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "DDT ZAP algorithm" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "DDT statistics" },
{ DMU_BSWAP_UINT8, TRUE, FALSE, "System attributes" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "SA master node" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "SA attr registration" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "SA attr layouts" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "scan translations" },
{ DMU_BSWAP_UINT8, FALSE, FALSE, "deduplicated block" },
{ DMU_BSWAP_ZAP, TRUE, TRUE, "DSL deadlist map" },
{ DMU_BSWAP_UINT64, TRUE, TRUE, "DSL deadlist map hdr" },
{ DMU_BSWAP_ZAP, TRUE, TRUE, "DSL dir clones" },
{ DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj subobj" }
};
/*
 * Byteswap routine and short display name for each DMU byteswap function
 * index (the DMU_BSWAP_* values used in the object-type table above).
 * Entry order presumably mirrors the DMU_BSWAP_* enum — keep in sync.
 */
const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = {
	{	byteswap_uint8_array,	"uint8"		},
	{	byteswap_uint16_array,	"uint16"	},
	{	byteswap_uint32_array,	"uint32"	},
	{	byteswap_uint64_array,	"uint64"	},
	{	zap_byteswap,		"zap"		},
	{	dnode_buf_byteswap,	"dnode"		},
	{	dmu_objset_byteswap,	"objset"	},
	{	zfs_znode_byteswap,	"znode"		},
	{	zfs_oldacl_byteswap,	"oldacl"	},
	{	zfs_acl_byteswap,	"acl"		}
};
/*
 * Hold the data buffer covering "offset" in the given dnode without
 * initiating a read.  On success *dbp receives the held buffer; on
 * failure *dbp is set to NULL and EIO is returned.
 */
int
dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset,
    void *tag, dmu_buf_t **dbp)
{
	dmu_buf_impl_t *dbuf;
	uint64_t blk;

	blk = dbuf_whichblock(dn, 0, offset);
	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	dbuf = dbuf_hold(dn, blk, tag);
	rw_exit(&dn->dn_struct_rwlock);

	if (dbuf == NULL) {
		*dbp = NULL;
		return (SET_ERROR(EIO));
	}

	*dbp = &dbuf->db;
	return (0);
}
/*
 * As dmu_buf_hold_noread_by_dnode(), but the dnode is looked up from
 * (os, object) first.  Returns the dnode_hold() error, EIO if the dbuf
 * could not be held, or 0 on success.
 */
int
dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
    void *tag, dmu_buf_t **dbp)
{
	dmu_buf_impl_t *dbuf;
	dnode_t *dn;
	int error;

	error = dnode_hold(os, object, FTAG, &dn);
	if (error != 0)
		return (error);

	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	dbuf = dbuf_hold(dn, dbuf_whichblock(dn, 0, offset), tag);
	rw_exit(&dn->dn_struct_rwlock);
	dnode_rele(dn, FTAG);

	if (dbuf == NULL) {
		*dbp = NULL;
		return (SET_ERROR(EIO));
	}

	*dbp = &dbuf->db;
	return (0);
}
/*
 * Hold and read the buffer covering "offset" in the given dnode.
 * DMU_READ_NO_PREFETCH in "flags" suppresses prefetch.  On failure
 * the buffer is released and *dbp is NULL.
 */
int
dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
    void *tag, dmu_buf_t **dbp, int flags)
{
	dmu_buf_impl_t *dbuf;
	int db_flags, err;

	db_flags = DB_RF_CANFAIL;
	if ((flags & DMU_READ_NO_PREFETCH) != 0)
		db_flags |= DB_RF_NOPREFETCH;

	err = dmu_buf_hold_noread_by_dnode(dn, offset, tag, dbp);
	if (err != 0)
		return (err);

	dbuf = (dmu_buf_impl_t *)(*dbp);
	err = dbuf_read(dbuf, NULL, db_flags);
	if (err != 0) {
		dbuf_rele(dbuf, tag);
		*dbp = NULL;
	}

	return (err);
}
/*
 * Hold and read the buffer covering "offset" of the given object.
 * Equivalent to dmu_buf_hold_by_dnode() with an (os, object) lookup.
 */
int
dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
    void *tag, dmu_buf_t **dbp, int flags)
{
	dmu_buf_impl_t *dbuf;
	int db_flags, err;

	db_flags = DB_RF_CANFAIL;
	if ((flags & DMU_READ_NO_PREFETCH) != 0)
		db_flags |= DB_RF_NOPREFETCH;

	err = dmu_buf_hold_noread(os, object, offset, tag, dbp);
	if (err != 0)
		return (err);

	dbuf = (dmu_buf_impl_t *)(*dbp);
	err = dbuf_read(dbuf, NULL, db_flags);
	if (err != 0) {
		dbuf_rele(dbuf, tag);
		*dbp = NULL;
	}

	return (err);
}
/*
 * Return the maximum bonus buffer length of the original (pre-large-dnode)
 * on-disk dnode layout.
 */
int
dmu_bonus_max(void)
{
	return (DN_OLD_MAX_BONUSLEN);
}
/*
 * Shrink or grow the bonus buffer length recorded in the dnode.
 * Fails with EINVAL when db_fake is not the dnode's bonus buffer or
 * newsize is outside [0, current bonus buffer size].
 */
int
dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	dnode_t *dn;
	int error = 0;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	if (dn->dn_bonus != db || newsize < 0 || newsize > db_fake->db_size)
		error = SET_ERROR(EINVAL);
	else
		dnode_setbonuslen(dn, newsize, tx);

	DB_DNODE_EXIT(db);
	return (error);
}
/*
 * Change the recorded type of the dnode's bonus buffer.  Fails with
 * EINVAL for an invalid DMU object type or when db_fake is not the
 * dnode's bonus buffer.
 */
int
dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	dnode_t *dn;
	int error = 0;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	if (!DMU_OT_IS_VALID(type) || dn->dn_bonus != db)
		error = SET_ERROR(EINVAL);
	else
		dnode_setbonus_type(dn, type, tx);

	DB_DNODE_EXIT(db);
	return (error);
}
/*
 * Return the DMU object type currently recorded for this buffer's
 * dnode bonus area.
 */
dmu_object_type_t
dmu_get_bonustype(dmu_buf_t *db_fake)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	dmu_object_type_t bonustype;

	DB_DNODE_ENTER(db);
	bonustype = DB_DNODE(db)->dn_bonustype;
	DB_DNODE_EXIT(db);

	return (bonustype);
}
/*
 * Remove the spill block from the given object.  Fix: the original code
 * ignored the dnode_hold() result and dereferenced "dn" even when the
 * hold failed, using an uninitialized pointer.  Bail out early instead.
 */
int
dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
{
	dnode_t *dn;
	int error;

	error = dnode_hold(os, object, FTAG, &dn);
	if (error != 0)
		return (error);
	dbuf_rm_spill(dn, tx);
	/* dn_struct_rwlock as writer: dnode_rm_spill changes dnode layout. */
	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
	dnode_rm_spill(dn, tx);
	rw_exit(&dn->dn_struct_rwlock);
	dnode_rele(dn, FTAG);
	return (error);
}
/*
 * Hold the bonus buffer of the given object, creating it if it does not
 * yet exist, and read it in.  Returns ENOENT, EIO, or 0.
 */
int
dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
{
	dnode_t *dn;
	dmu_buf_impl_t *db;
	int error;

	error = dnode_hold(os, object, FTAG, &dn);
	if (error)
		return (error);

	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	if (dn->dn_bonus == NULL) {
		/*
		 * Upgrade to writer and re-check: another thread may have
		 * created the bonus dbuf while the lock was dropped.
		 */
		rw_exit(&dn->dn_struct_rwlock);
		rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
		if (dn->dn_bonus == NULL)
			dbuf_create_bonus(dn);
	}
	db = dn->dn_bonus;

	/* as long as the bonus buf is held, the dnode will be held */
	if (refcount_add(&db->db_holds, tag) == 1) {
		VERIFY(dnode_add_ref(dn, db));
		atomic_inc_32(&dn->dn_dbufs_count);
	}

	/*
	 * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's
	 * hold and incrementing the dbuf count to ensure that dnode_move() sees
	 * a dnode hold for every dbuf.
	 */
	rw_exit(&dn->dn_struct_rwlock);

	dnode_rele(dn, FTAG);

	/* The bonus buffer read cannot fail. */
	VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH));

	*dbp = &db->db;
	return (0);
}
/*
 * returns ENOENT, EIO, or 0.
 *
 * This interface will allocate a blank spill dbuf when a spill blk
 * doesn't already exist on the dnode.
 *
 * if you only want to find an already existing spill db, then
 * dmu_spill_hold_existing() should be used.
 */
int
dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp)
{
	dmu_buf_impl_t *db = NULL;
	int err;

	/* Take dn_struct_rwlock unless the caller says it already holds it. */
	if ((flags & DB_RF_HAVESTRUCT) == 0)
		rw_enter(&dn->dn_struct_rwlock, RW_READER);

	db = dbuf_hold(dn, DMU_SPILL_BLKID, tag);

	if ((flags & DB_RF_HAVESTRUCT) == 0)
		rw_exit(&dn->dn_struct_rwlock);

	ASSERT(db != NULL);
	err = dbuf_read(db, NULL, flags);
	if (err == 0)
		*dbp = &db->db;
	else
		dbuf_rele(db, tag);
	return (err);
}
/*
 * Hold the spill block of the object owning "bonus", but only if one
 * already exists.  Returns EINVAL on pre-SA pool versions, ENOENT when
 * the dnode has no spill block, otherwise the dmu_spill_hold_by_dnode()
 * result.
 */
int
dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
	dnode_t *dn;
	int err;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) {
		/* Spill blocks require system-attribute pool support. */
		err = SET_ERROR(EINVAL);
		DB_DNODE_EXIT(db);
		return (err);
	}

	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	if (!dn->dn_have_spill)
		err = SET_ERROR(ENOENT);
	else
		err = dmu_spill_hold_by_dnode(dn,
		    DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp);
	rw_exit(&dn->dn_struct_rwlock);

	DB_DNODE_EXIT(db);
	return (err);
}
/*
 * Hold (creating if necessary) the spill block of the object owning
 * "bonus"; see dmu_spill_hold_by_dnode().
 */
int
dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
	int err;

	DB_DNODE_ENTER(db);
	err = dmu_spill_hold_by_dnode(DB_DNODE(db), DB_RF_CANFAIL, tag, dbp);
	DB_DNODE_EXIT(db);

	return (err);
}
/*
 * Note: longer-term, we should modify all of the dmu_buf_*() interfaces
 * to take a held dnode rather than (os, object) -- the lookup is wasteful,
 * and can induce severe lock contention when writing to several files
 * whose dnodes are in the same block.
 */
/*
 * Hold an array of dbufs covering [offset, offset + length) of dn.
 * When "read" is set the blocks are read in asynchronously and the
 * function waits for all of them to complete.  On success *dbpp points
 * to a kmem-allocated array of *numbufsp buffers which the caller must
 * release with dmu_buf_rele_array().  Returns EIO on hold/read failure.
 */
int
dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
    boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
{
	dmu_buf_t **dbp;
	uint64_t blkid, nblks, i;
	uint32_t dbuf_flags;
	int err;
	zio_t *zio;

	ASSERT(length <= DMU_MAX_ACCESS);

	/*
	 * Note: We directly notify the prefetch code of this read, so that
	 * we can tell it about the multi-block read.  dbuf_read() only knows
	 * about the one block it is accessing.
	 */
	dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT |
	    DB_RF_NOPREFETCH;

	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	if (dn->dn_datablkshift) {
		int blkshift = dn->dn_datablkshift;
		/* Number of power-of-two-sized blocks spanned by the range. */
		nblks = (P2ROUNDUP(offset + length, 1ULL << blkshift) -
		    P2ALIGN(offset, 1ULL << blkshift)) >> blkshift;
	} else {
		/* Odd-sized object: all data lives in the single block 0. */
		if (offset + length > dn->dn_datablksz) {
			zfs_panic_recover("zfs: accessing past end of object "
			    "%llx/%llx (size=%u access=%llu+%llu)",
			    (longlong_t)dn->dn_objset->
			    os_dsl_dataset->ds_object,
			    (longlong_t)dn->dn_object, dn->dn_datablksz,
			    (longlong_t)offset, (longlong_t)length);
			rw_exit(&dn->dn_struct_rwlock);
			return (SET_ERROR(EIO));
		}
		nblks = 1;
	}
	dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);

#if defined(_KERNEL) && defined(RACCT)
	/* Charge the writing process for the I/O it is about to issue. */
	if (racct_enable && !read) {
		PROC_LOCK(curproc);
		racct_add_force(curproc, RACCT_WRITEBPS, length);
		racct_add_force(curproc, RACCT_WRITEIOPS, nblks);
		PROC_UNLOCK(curproc);
	}
#endif

	/* Root zio that collects the status of all the async reads below. */
	zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
	blkid = dbuf_whichblock(dn, 0, offset);
	for (i = 0; i < nblks; i++) {
		dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag);
		if (db == NULL) {
			/* Release everything held so far and fail. */
			rw_exit(&dn->dn_struct_rwlock);
			dmu_buf_rele_array(dbp, nblks, tag);
			zio_nowait(zio);
			return (SET_ERROR(EIO));
		}

		/* initiate async i/o */
		if (read)
			(void) dbuf_read(db, zio, dbuf_flags);
#ifdef _KERNEL
		else
			curthread->td_ru.ru_oublock++;
#endif
		dbp[i] = &db->db;
	}

	/* Tell the prefetcher about the whole multi-block access. */
	if ((flags & DMU_READ_NO_PREFETCH) == 0 &&
	    DNODE_META_IS_CACHEABLE(dn) && length <= zfetch_array_rd_sz) {
		dmu_zfetch(&dn->dn_zfetch, blkid, nblks,
		    read && DNODE_IS_CACHEABLE(dn));
	}
	rw_exit(&dn->dn_struct_rwlock);

	/* wait for async i/o */
	err = zio_wait(zio);
	if (err) {
		dmu_buf_rele_array(dbp, nblks, tag);
		return (err);
	}

	/* wait for other io to complete */
	if (read) {
		for (i = 0; i < nblks; i++) {
			dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
			mutex_enter(&db->db_mtx);
			while (db->db_state == DB_READ ||
			    db->db_state == DB_FILL)
				cv_wait(&db->db_changed, &db->db_mtx);
			if (db->db_state == DB_UNCACHED)
				err = SET_ERROR(EIO);
			mutex_exit(&db->db_mtx);
			if (err) {
				dmu_buf_rele_array(dbp, nblks, tag);
				return (err);
			}
		}
	}

	*numbufsp = nblks;
	*dbpp = dbp;
	return (0);
}
/*
 * As dmu_buf_hold_array_by_dnode() with default prefetch behavior,
 * looking up the dnode from (os, object) first.
 */
static int
dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
    uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
{
	dnode_t *dn;
	int err;

	err = dnode_hold(os, object, FTAG, &dn);
	if (err != 0)
		return (err);

	err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
	    numbufsp, dbpp, DMU_READ_PREFETCH);
	dnode_rele(dn, FTAG);

	return (err);
}
/*
 * As dmu_buf_hold_array(), but using the dnode already attached to the
 * caller's (bonus) buffer instead of an (os, object) lookup.
 */
int
dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset,
    uint64_t length, boolean_t read, void *tag, int *numbufsp,
    dmu_buf_t ***dbpp)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	int err;

	DB_DNODE_ENTER(db);
	err = dmu_buf_hold_array_by_dnode(DB_DNODE(db), offset, length, read,
	    tag, numbufsp, dbpp, DMU_READ_PREFETCH);
	DB_DNODE_EXIT(db);

	return (err);
}
/*
 * Release every buffer held in an array obtained from one of the
 * dmu_buf_hold_array*() functions and free the array itself.
 */
void
dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag)
{
	dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake;

	if (numbufs == 0)
		return;

	for (int i = 0; i < numbufs; i++) {
		if (dbp[i] != NULL)
			dbuf_rele(dbp[i], tag);
	}

	kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs);
}
/*
 * Issue prefetch i/os for the given blocks.  If level is greater than 0, the
 * indirect blocks prefetched will be those that point to the blocks containing
 * the data starting at offset, and continuing to offset + len.
 *
 * Note that if the indirect blocks above the blocks being prefetched are not
 * in cache, they will be asynchronously read in.
 */
void
dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
    uint64_t len, zio_priority_t pri)
{
	dnode_t *dn;
	uint64_t blkid;
	int nblks, err;

	if (len == 0) { /* they're interested in the bonus buffer */
		dn = DMU_META_DNODE(os);

		if (object == 0 || object >= DN_MAX_OBJECT)
			return;

		/*
		 * Prefetch the meta-dnode block that contains this object's
		 * dnode_phys_t.
		 */
		rw_enter(&dn->dn_struct_rwlock, RW_READER);
		blkid = dbuf_whichblock(dn, level,
		    object * sizeof (dnode_phys_t));
		dbuf_prefetch(dn, level, blkid, pri, 0);
		rw_exit(&dn->dn_struct_rwlock);
		return;
	}

	/*
	 * XXX - Note, if the dnode for the requested object is not
	 * already cached, we will do a *synchronous* read in the
	 * dnode_hold() call.  The same is true for any indirects.
	 */
	err = dnode_hold(os, object, FTAG, &dn);
	if (err != 0)
		return;

	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	/*
	 * offset + len - 1 is the last byte we want to prefetch for, and offset
	 * is the first.  Then dbuf_whichblk(dn, level, off + len - 1) is the
	 * last block we want to prefetch, and dbuf_whichblock(dn, level,
	 * offset) is the first.  Then the number we need to prefetch is the
	 * last - first + 1.
	 */
	if (level > 0 || dn->dn_datablkshift != 0) {
		nblks = dbuf_whichblock(dn, level, offset + len - 1) -
		    dbuf_whichblock(dn, level, offset) + 1;
	} else {
		/* Odd-sized object: at most the single block 0 exists. */
		nblks = (offset < dn->dn_datablksz);
	}

	if (nblks != 0) {
		blkid = dbuf_whichblock(dn, level, offset);
		for (int i = 0; i < nblks; i++)
			dbuf_prefetch(dn, level, blkid + i, pri, 0);
	}

	rw_exit(&dn->dn_struct_rwlock);

	dnode_rele(dn, FTAG);
}
/*
 * Get the next "chunk" of file data to free.  We traverse the file from
 * the end so that the file gets shorter over time (if we crash in the
 * middle, this will leave us in a better state).  We find allocated file
 * data by simply searching the allocated level 1 indirects.
 *
 * On input, *start should be the first offset that does not need to be
 * freed (e.g. "offset + length").  On return, *start will be the first
 * offset that should be freed and l1blks is set to the number of level 1
 * indirect blocks found within the chunk.
 */
static int
get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum, uint64_t *l1blks)
{
	uint64_t blks;
	/* Limit the chunk so a single free fits in one DMU transaction. */
	uint64_t maxblks = DMU_MAX_ACCESS >> (dn->dn_indblkshift + 1);
	/* bytes of data covered by a level-1 indirect block */
	uint64_t iblkrange =
	    dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT);

	ASSERT3U(minimum, <=, *start);

	/*
	 * Check if we can free the entire range assuming that all of the
	 * L1 blocks in this range have data. If we can, we use this
	 * worst case value as an estimate so we can avoid having to look
	 * at the object's actual data.
	 */
	uint64_t total_l1blks =
	    (roundup(*start, iblkrange) - (minimum / iblkrange * iblkrange)) /
	    iblkrange;
	if (total_l1blks <= maxblks) {
		*l1blks = total_l1blks;
		*start = minimum;
		return (0);
	}
	ASSERT(ISP2(iblkrange));

	for (blks = 0; *start > minimum && blks < maxblks; blks++) {
		int err;

		/*
		 * dnode_next_offset(BACKWARDS) will find an allocated L1
		 * indirect block at or before the input offset.  We must
		 * decrement *start so that it is at the end of the region
		 * to search.
		 */
		(*start)--;
		err = dnode_next_offset(dn,
		    DNODE_FIND_BACKWARDS, start, 2, 1, 0);

		/* if there are no indirect blocks before start, we are done */
		if (err == ESRCH) {
			*start = minimum;
			break;
		} else if (err != 0) {
			*l1blks = blks;
			return (err);
		}

		/* set start to the beginning of this L1 indirect */
		*start = P2ALIGN(*start, iblkrange);
	}
	if (*start < minimum)
		*start = minimum;
	*l1blks = blks;
	return (0);
}
/*
 * If this objset is of type OST_ZFS return true if vfs's unmounted flag is
 * set, otherwise return false.
 * Used below in dmu_free_long_range_impl() to enable abort when unmounting.
 */
/*ARGSUSED*/
static boolean_t
dmu_objset_zfs_unmounting(objset_t *os)
{
	boolean_t unmounting = B_FALSE;

#ifdef _KERNEL
	if (dmu_objset_type(os) == DMU_OST_ZFS)
		unmounting = zfs_get_vfs_flag_unmounted(os);
#endif
	return (unmounting);
}
/*
 * Free [offset, offset + length) of dn, one chunk (transaction) at a time,
 * working backwards from the end of the range.  Aborts with EINTR if the
 * containing ZFS filesystem is being unmounted.  Dirty-free accounting per
 * TXG prevents frees alone from triggering the write throttle.
 */
static int
dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
    uint64_t length)
{
	uint64_t object_size = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
	int err;
	uint64_t dirty_frees_threshold;
	dsl_pool_t *dp = dmu_objset_pool(os);

	if (offset >= object_size)
		return (0);

	/* Per-TXG cap on dirty-by-free data; see the throttle comment below. */
	if (zfs_per_txg_dirty_frees_percent <= 100)
		dirty_frees_threshold =
		    zfs_per_txg_dirty_frees_percent * zfs_dirty_data_max / 100;
	else
		dirty_frees_threshold = zfs_dirty_data_max / 20;

	if (length == DMU_OBJECT_END || offset + length > object_size)
		length = object_size - offset;

	while (length != 0) {
		uint64_t chunk_end, chunk_begin, chunk_len;
		uint64_t l1blks;
		dmu_tx_t *tx;

		if (dmu_objset_zfs_unmounting(dn->dn_objset))
			return (SET_ERROR(EINTR));

		chunk_end = chunk_begin = offset + length;

		/* move chunk_begin backwards to the beginning of this chunk */
		err = get_next_chunk(dn, &chunk_begin, offset, &l1blks);
		if (err)
			return (err);
		ASSERT3U(chunk_begin, >=, offset);
		ASSERT3U(chunk_begin, <=, chunk_end);

		chunk_len = chunk_end - chunk_begin;

		tx = dmu_tx_create(os);
		dmu_tx_hold_free(tx, dn->dn_object, chunk_begin, chunk_len);

		/*
		 * Mark this transaction as typically resulting in a net
		 * reduction in space used.
		 */
		dmu_tx_mark_netfree(tx);
		err = dmu_tx_assign(tx, TXG_WAIT);
		if (err) {
			dmu_tx_abort(tx);
			return (err);
		}

		uint64_t txg = dmu_tx_get_txg(tx);

		mutex_enter(&dp->dp_lock);
		uint64_t long_free_dirty =
		    dp->dp_long_free_dirty_pertxg[txg & TXG_MASK];
		mutex_exit(&dp->dp_lock);

		/*
		 * To avoid filling up a TXG with just frees, wait for
		 * the next TXG to open before freeing more chunks if
		 * we have reached the threshold of frees.
		 */
		if (dirty_frees_threshold != 0 &&
		    long_free_dirty >= dirty_frees_threshold) {
			dmu_tx_commit(tx);
			txg_wait_open(dp, 0);
			continue;
		}

		/*
		 * In order to prevent unnecessary write throttling, for each
		 * TXG, we track the cumulative size of L1 blocks being dirtied
		 * in dnode_free_range() below. We compare this number to a
		 * tunable threshold, past which we prevent new L1 dirty freeing
		 * blocks from being added into the open TXG. See
		 * dmu_free_long_range_impl() for details. The threshold
		 * prevents write throttle activation due to dirty freeing L1
		 * blocks taking up a large percentage of zfs_dirty_data_max.
		 */
		mutex_enter(&dp->dp_lock);
		dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] +=
		    l1blks << dn->dn_indblkshift;
		mutex_exit(&dp->dp_lock);
		DTRACE_PROBE3(free__long__range,
		    uint64_t, long_free_dirty, uint64_t, chunk_len,
		    uint64_t, txg);
		dnode_free_range(dn, chunk_begin, chunk_len, tx);

		dmu_tx_commit(tx);

		length -= chunk_len;
	}
	return (0);
}
/*
 * Free [offset, offset + length) of the given object across as many
 * transactions as needed; length may be DMU_OBJECT_END to free to EOF.
 */
int
dmu_free_long_range(objset_t *os, uint64_t object,
    uint64_t offset, uint64_t length)
{
	dnode_t *dn;
	int err;

	err = dnode_hold(os, object, FTAG, &dn);
	if (err != 0)
		return (err);

	err = dmu_free_long_range_impl(os, dn, offset, length);

	/*
	 * It is important to zero out the maxblkid when freeing the entire
	 * file, so that (a) subsequent calls to dmu_free_long_range_impl()
	 * will take the fast path, and (b) dnode_reallocate() can verify
	 * that the entire file has been freed.
	 */
	if (err == 0 && offset == 0 && length == DMU_OBJECT_END)
		dn->dn_maxblkid = 0;

	dnode_rele(dn, FTAG);
	return (err);
}
/*
 * Free all of an object's data and then free the object itself in its
 * own transaction.
 */
int
dmu_free_long_object(objset_t *os, uint64_t object)
{
	dmu_tx_t *tx;
	int err;

	err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END);
	if (err != 0)
		return (err);

	tx = dmu_tx_create(os);
	dmu_tx_hold_bonus(tx, object);
	dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
	dmu_tx_mark_netfree(tx);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_tx_abort(tx);
		return (err);
	}

	err = dmu_object_free(os, object, tx);
	dmu_tx_commit(tx);
	return (err);
}
/*
 * Free [offset, offset + size) of the given object within the caller's
 * already-assigned transaction.  size may be -1ULL to free to EOF.
 */
int
dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
    uint64_t size, dmu_tx_t *tx)
{
	dnode_t *dn;
	int err;

	err = dnode_hold(os, object, FTAG, &dn);
	if (err != 0)
		return (err);

	ASSERT(offset < UINT64_MAX);
	ASSERT(size == -1ULL || size <= UINT64_MAX - offset);
	dnode_free_range(dn, offset, size, tx);
	dnode_rele(dn, FTAG);
	return (0);
}
/*
 * Copy "size" bytes starting at "offset" of dn into "buf".  The range is
 * processed in batches of at most DMU_MAX_ACCESS/2 so the constituent
 * blocks can be read in parallel.  Returns 0 or the first read error.
 */
static int
dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size,
    void *buf, uint32_t flags)
{
	dmu_buf_t **dbp;
	int numbufs, err = 0;

	/*
	 * Deal with odd block sizes, where there can't be data past the first
	 * block.  If we ever do the tail block optimization, we will need to
	 * handle that here as well.
	 */
	if (dn->dn_maxblkid == 0) {
		/* Anything beyond the single block reads back as zeros. */
		int newsz = offset > dn->dn_datablksz ? 0 :
		    MIN(size, dn->dn_datablksz - offset);
		bzero((char *)buf + newsz, size - newsz);
		size = newsz;
	}

	while (size > 0) {
		uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2);
		int i;

		/*
		 * NB: we could do this block-at-a-time, but it's nice
		 * to be reading in parallel.
		 */
		err = dmu_buf_hold_array_by_dnode(dn, offset, mylen,
		    TRUE, FTAG, &numbufs, &dbp, flags);
		if (err)
			break;

		for (i = 0; i < numbufs; i++) {
			int tocpy;
			int bufoff;
			dmu_buf_t *db = dbp[i];

			ASSERT(size > 0);

			/* Copy the portion of this block within the request. */
			bufoff = offset - db->db_offset;
			tocpy = (int)MIN(db->db_size - bufoff, size);

			bcopy((char *)db->db_data + bufoff, buf, tocpy);

			offset += tocpy;
			size -= tocpy;
			buf = (char *)buf + tocpy;
		}
		dmu_buf_rele_array(dbp, numbufs, FTAG);
	}
	return (err);
}
/*
 * Read "size" bytes at "offset" of the given object into "buf"; see
 * dmu_read_impl() for details.
 */
int
dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
    void *buf, uint32_t flags)
{
	dnode_t *dn;
	int err;

	err = dnode_hold(os, object, FTAG, &dn);
	if (err != 0)
		return (err);

	err = dmu_read_impl(dn, offset, size, buf, flags);
	dnode_rele(dn, FTAG);
	return (err);
}
/*
 * As dmu_read(), but for a dnode the caller already holds.
 */
int
dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf,
    uint32_t flags)
{
	return (dmu_read_impl(dn, offset, size, buf, flags));
}
/*
 * Copy "size" bytes from "buf" into the already-held dbufs in dbp.
 * Buffers that are overwritten in full are marked will-fill (no need to
 * read their old contents); partial overwrites dirty the buffer first.
 */
static void
dmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size,
    const void *buf, dmu_tx_t *tx)
{
	int i;

	for (i = 0; i < numbufs; i++) {
		int tocpy;
		int bufoff;
		dmu_buf_t *db = dbp[i];

		ASSERT(size > 0);

		bufoff = offset - db->db_offset;
		tocpy = (int)MIN(db->db_size - bufoff, size);

		/* Only the first and last buffers may be partially written. */
		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);

		if (tocpy == db->db_size)
			dmu_buf_will_fill(db, tx);
		else
			dmu_buf_will_dirty(db, tx);

		bcopy(buf, (char *)db->db_data + bufoff, tocpy);

		if (tocpy == db->db_size)
			dmu_buf_fill_done(db, tx);

		offset += tocpy;
		size -= tocpy;
		buf = (char *)buf + tocpy;
	}
}
/*
 * Write "size" bytes from "buf" at "offset" of the given object within
 * the caller's assigned transaction.
 */
void
dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
    const void *buf, dmu_tx_t *tx)
{
	dmu_buf_t **dbp;
	int numbufs;

	if (size == 0)
		return;

	VERIFY0(dmu_buf_hold_array(os, object, offset, size,
	    FALSE, FTAG, &numbufs, &dbp));
	dmu_write_impl(dbp, numbufs, offset, size, buf, tx);
	dmu_buf_rele_array(dbp, numbufs, FTAG);
}
/*
 * As dmu_write(), but for a dnode the caller already holds.
 */
void
dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size,
    const void *buf, dmu_tx_t *tx)
{
	dmu_buf_t **dbp;
	int numbufs;

	if (size == 0)
		return;

	VERIFY0(dmu_buf_hold_array_by_dnode(dn, offset, size,
	    FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH));
	dmu_write_impl(dbp, numbufs, offset, size, buf, tx);
	dmu_buf_rele_array(dbp, numbufs, FTAG);
}
/*
 * Dirty the L1 indirect block covering "offset" so its blkptrs get
 * rewritten (remapped off any indirect vdev), unless it was already
 * written after the last device removal.  Sleeps briefly afterwards to
 * pace the remap scan.
 */
static int
dmu_object_remap_one_indirect(objset_t *os, dnode_t *dn,
    uint64_t last_removal_txg, uint64_t offset)
{
	uint64_t l1blkid = dbuf_whichblock(dn, 1, offset);
	int err = 0;

	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	dmu_buf_impl_t *dbuf = dbuf_hold_level(dn, 1, l1blkid, FTAG);
	ASSERT3P(dbuf, !=, NULL);

	/*
	 * If the block hasn't been written yet, this default will ensure
	 * we don't try to remap it.
	 */
	uint64_t birth = UINT64_MAX;
	ASSERT3U(last_removal_txg, !=, UINT64_MAX);
	if (dbuf->db_blkptr != NULL)
		birth = dbuf->db_blkptr->blk_birth;
	rw_exit(&dn->dn_struct_rwlock);

	/*
	 * If this L1 was already written after the last removal, then we've
	 * already tried to remap it.
	 */
	if (birth <= last_removal_txg &&
	    dbuf_read(dbuf, NULL, DB_RF_MUST_SUCCEED) == 0 &&
	    dbuf_can_remap(dbuf)) {
		dmu_tx_t *tx = dmu_tx_create(os);
		dmu_tx_hold_remap_l1indirect(tx, dn->dn_object);
		err = dmu_tx_assign(tx, TXG_WAIT);
		if (err == 0) {
			/* Dirtying is all it takes; sync will rewrite it. */
			(void) dbuf_dirty(dbuf, tx);
			dmu_tx_commit(tx);
		} else {
			dmu_tx_abort(tx);
		}
	}

	dbuf_rele(dbuf, FTAG);

	delay(zfs_object_remap_one_indirect_delay_ticks);

	return (err);
}
/*
 * Remap all blockpointers in the object, if possible, so that they reference
 * only concrete vdevs.
 *
 * To do this, iterate over the L0 blockpointers and remap any that reference
 * an indirect vdev.  Note that we only examine L0 blockpointers; since we
 * cannot guarantee that we can remap all blockpointer anyways (due to split
 * blocks), we do not want to make the code unnecessarily complicated to
 * catch the unlikely case that there is an L1 block on an indirect vdev that
 * contains no indirect blockpointers.
 */
int
dmu_object_remap_indirects(objset_t *os, uint64_t object,
    uint64_t last_removal_txg)
{
	uint64_t offset, l1span;
	int err;
	dnode_t *dn;

	err = dnode_hold(os, object, FTAG, &dn);
	if (err != 0) {
		return (err);
	}

	if (dn->dn_nlevels <= 1) {
		/* Allow the scan to be interrupted by a signal. */
		if (issig(JUSTLOOKING) && issig(FORREAL)) {
			err = SET_ERROR(EINTR);
		}

		/*
		 * If the dnode has no indirect blocks, we cannot dirty them.
		 * We still want to remap the blkptr(s) in the dnode if
		 * appropriate, so mark it as dirty.
		 */
		if (err == 0 && dnode_needs_remap(dn)) {
			dmu_tx_t *tx = dmu_tx_create(os);
			dmu_tx_hold_bonus(tx, dn->dn_object);
			if ((err = dmu_tx_assign(tx, TXG_WAIT)) == 0) {
				dnode_setdirty(dn, tx);
				dmu_tx_commit(tx);
			} else {
				dmu_tx_abort(tx);
			}
		}

		dnode_rele(dn, FTAG);
		return (err);
	}

	offset = 0;
	/* Bytes of file data addressed by one L1 indirect block. */
	l1span = 1ULL << (dn->dn_indblkshift - SPA_BLKPTRSHIFT +
	    dn->dn_datablkshift);
	/*
	 * Find the next L1 indirect that is not a hole.
	 */
	while (dnode_next_offset(dn, 0, &offset, 2, 1, 0) == 0) {
		if (issig(JUSTLOOKING) && issig(FORREAL)) {
			err = SET_ERROR(EINTR);
			break;
		}
		if ((err = dmu_object_remap_one_indirect(os, dn,
		    last_removal_txg, offset)) != 0) {
			break;
		}
		offset += l1span;
	}

	dnode_rele(dn, FTAG);
	return (err);
}
/*
 * Mark every block in [offset, offset + size) of the given object as
 * will-not-fill within the caller's transaction (used for preallocation;
 * the blocks' contents will not be read or written).
 */
void
dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
    dmu_tx_t *tx)
{
	dmu_buf_t **dbp;
	int numbufs;

	if (size == 0)
		return;

	VERIFY0(dmu_buf_hold_array(os, object, offset, size,
	    FALSE, FTAG, &numbufs, &dbp));
	for (int i = 0; i < numbufs; i++)
		dmu_buf_will_not_fill(dbp[i], tx);
	dmu_buf_rele_array(dbp, numbufs, FTAG);
}
/*
 * Store small pre-compressed data directly in a block pointer
 * ("embedded" block) for the block at "offset" of the given object.
 */
void
dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset,
    void *data, uint8_t etype, uint8_t comp, int uncompressed_size,
    int compressed_size, int byteorder, dmu_tx_t *tx)
{
	dmu_buf_t *db;

	ASSERT3U(etype, <, NUM_BP_EMBEDDED_TYPES);
	ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS);
	VERIFY0(dmu_buf_hold_noread(os, object, offset, FTAG, &db));

	dmu_buf_write_embedded(db, data, (bp_embedded_type_t)etype,
	    (enum zio_compress)comp, uncompressed_size, compressed_size,
	    byteorder, tx);

	dmu_buf_rele(db, FTAG);
}
/*
 * DMU support for xuio
 */
/*
 * kstat under which the xuio statistics are published; created by
 * xuio_stat_init() and destroyed by xuio_stat_fini().
 */
kstat_t *xuio_ksp = NULL;
/*
 * Set up the xuio's embedded uio with nblk empty iovecs and allocate the
 * private dmu_xuio_t bookkeeping (loaned arc_buf array + saved iovec
 * pointer).  Always returns 0.
 */
int
dmu_xuio_init(xuio_t *xuio, int nblk)
{
	uio_t *uio = &xuio->xu_uio;
	dmu_xuio_t *priv;

	uio->uio_iovcnt = nblk;
	uio->uio_iov = kmem_zalloc(nblk * sizeof (iovec_t), KM_SLEEP);

	priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_SLEEP);
	priv->cnt = nblk;
	priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_SLEEP);
	priv->iovp = uio->uio_iov;
	XUIO_XUZC_PRIV(xuio) = priv;

	/* Account the on-loan buffers to the right read/write counter. */
	if (XUIO_XUZC_RW(xuio) == UIO_READ)
		XUIOSTAT_INCR(xuiostat_onloan_rbuf, nblk);
	else
		XUIOSTAT_INCR(xuiostat_onloan_wbuf, nblk);

	return (0);
}
/*
 * Tear down the bookkeeping created by dmu_xuio_init() and decrement the
 * on-loan statistics.
 */
void
dmu_xuio_fini(xuio_t *xuio)
{
	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
	int nblk = priv->cnt;

	kmem_free(priv->iovp, nblk * sizeof (iovec_t));
	kmem_free(priv->bufs, nblk * sizeof (arc_buf_t *));
	kmem_free(priv, sizeof (dmu_xuio_t));

	if (XUIO_XUZC_RW(xuio) == UIO_READ)
		XUIOSTAT_INCR(xuiostat_onloan_rbuf, -nblk);
	else
		XUIOSTAT_INCR(xuiostat_onloan_wbuf, -nblk);
}
/*
 * Initialize iov[priv->next] and priv->bufs[priv->next] with { off, n, abuf }
 * and increase priv->next by 1.  Always returns 0.
 */
int
dmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n)
{
	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
	uio_t *uio = &xuio->xu_uio;
	struct iovec *iov;
	int slot;

	slot = priv->next++;
	ASSERT(slot < priv->cnt);
	ASSERT(off + n <= arc_buf_lsize(abuf));

	iov = uio->uio_iov + slot;
	iov->iov_base = (char *)abuf->b_data + off;
	iov->iov_len = n;
	priv->bufs[slot] = abuf;
	return (0);
}
/*
 * Return the number of iovec slots this xuio was initialized with.
 */
int
dmu_xuio_cnt(xuio_t *xuio)
{
	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);

	return (priv->cnt);
}
/*
 * Return the arc buffer loaned into slot i, or NULL if the slot was
 * cleared.
 */
arc_buf_t *
dmu_xuio_arcbuf(xuio_t *xuio, int i)
{
	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);

	ASSERT(i < priv->cnt);
	return (priv->bufs[i]);
}
/*
 * Forget the arc buffer loaned into slot i (ownership has passed to the
 * consumer).
 */
void
dmu_xuio_clear(xuio_t *xuio, int i)
{
	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);

	ASSERT(i < priv->cnt);
	priv->bufs[i] = NULL;
}
/*
 * Create and install the kstat exporting the xuio statistics.
 */
static void
xuio_stat_init(void)
{
	xuio_ksp = kstat_create("zfs", 0, "xuio_stats", "misc",
	    KSTAT_TYPE_NAMED, sizeof (xuio_stats) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);
	if (xuio_ksp == NULL)
		return;
	xuio_ksp->ks_data = &xuio_stats;
	kstat_install(xuio_ksp);
}
/*
 * Remove the xuio statistics kstat, if it was created.
 */
static void
xuio_stat_fini(void)
{
	if (xuio_ksp == NULL)
		return;
	kstat_delete(xuio_ksp);
	xuio_ksp = NULL;
}
/*
 * Bump the "write buffer copied" xuio statistic.
 */
void
xuio_stat_wbuf_copied(void)
{
	XUIOSTAT_BUMP(xuiostat_wbuf_copied);
}
/*
 * Bump the "write buffer not copied" xuio statistic.
 */
void
xuio_stat_wbuf_nocopy(void)
{
	XUIOSTAT_BUMP(xuiostat_wbuf_nocopy);
}
#ifdef _KERNEL
/*
 * Copy "size" bytes starting at uio->uio_loffset of dn into the uio.
 * When the uio is an xuio, arc buffers are loaned into it instead of
 * copying; otherwise data is moved with uiomove/vn_io_fault_uiomove.
 * Returns 0 or the first error encountered.
 */
int
dmu_read_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size)
{
	dmu_buf_t **dbp;
	int numbufs, i, err;
	xuio_t *xuio = NULL;

	/*
	 * NB: we could do this block-at-a-time, but it's nice
	 * to be reading in parallel.
	 */
	err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size,
	    TRUE, FTAG, &numbufs, &dbp, 0);
	if (err)
		return (err);

#ifdef UIO_XUIO
	if (uio->uio_extflg == UIO_XUIO)
		xuio = (xuio_t *)uio;
#endif

	for (i = 0; i < numbufs; i++) {
		int tocpy;
		int bufoff;
		dmu_buf_t *db = dbp[i];

		ASSERT(size > 0);

		bufoff = uio->uio_loffset - db->db_offset;
		tocpy = (int)MIN(db->db_size - bufoff, size);

		if (xuio) {
			/*
			 * Loan the arc buffer to the xuio (zero-copy);
			 * dmu_xuio_add() takes ownership of the loan.
			 */
			dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
			arc_buf_t *dbuf_abuf = dbi->db_buf;
			arc_buf_t *abuf = dbuf_loan_arcbuf(dbi);
			err = dmu_xuio_add(xuio, abuf, bufoff, tocpy);
			if (!err) {
				uio->uio_resid -= tocpy;
				uio->uio_loffset += tocpy;
			}

			if (abuf == dbuf_abuf)
				XUIOSTAT_BUMP(xuiostat_rbuf_nocopy);
			else
				XUIOSTAT_BUMP(xuiostat_rbuf_copied);
		} else {
#ifdef illumos
			err = uiomove((char *)db->db_data + bufoff, tocpy,
			    UIO_READ, uio);
#else
			/* FreeBSD variant tolerant of faulting user pages. */
			err = vn_io_fault_uiomove((char *)db->db_data + bufoff,
			    tocpy, uio);
#endif
		}
		if (err)
			break;

		size -= tocpy;
	}
	dmu_buf_rele_array(dbp, numbufs, FTAG);

	return (err);
}
/*
 * Read 'size' bytes into the uio buffer.
 * From object zdb->db_object.
 * Starting at offset uio->uio_loffset.
 *
 * If the caller already has a dbuf in the target object
 * (e.g. its bonus buffer), this routine is faster than dmu_read_uio(),
 * because we don't have to find the dnode_t for the object.
 */
int
dmu_read_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
	int err;

	if (size == 0)
		return (0);

	DB_DNODE_ENTER(db);
	err = dmu_read_uio_dnode(DB_DNODE(db), uio, size);
	DB_DNODE_EXIT(db);

	return (err);
}
/*
 * Read 'size' bytes into the uio buffer.
 * From the specified object.
 * Starting at offset uio->uio_loffset.
 */
int
dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
{
	dnode_t *dn;
	int err;

	if (size == 0)
		return (0);

	err = dnode_hold(os, object, FTAG, &dn);
	if (err != 0)
		return (err);

	err = dmu_read_uio_dnode(dn, uio, size);
	dnode_rele(dn, FTAG);

	return (err);
}
/*
 * Copy "size" bytes from the uio into dn starting at uio->uio_loffset,
 * within the caller's assigned transaction.  Full-block overwrites are
 * marked will-fill; partial ones dirty the buffer first.
 */
int
dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx)
{
	dmu_buf_t **dbp;
	int numbufs;
	int err = 0;
	int i;

	err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size,
	    FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH);
	if (err)
		return (err);

	for (i = 0; i < numbufs; i++) {
		int tocpy;
		int bufoff;
		dmu_buf_t *db = dbp[i];

		ASSERT(size > 0);

		bufoff = uio->uio_loffset - db->db_offset;
		tocpy = (int)MIN(db->db_size - bufoff, size);

		/* Only the first and last buffers may be partially written. */
		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);

		if (tocpy == db->db_size)
			dmu_buf_will_fill(db, tx);
		else
			dmu_buf_will_dirty(db, tx);

#ifdef illumos
		/*
		 * XXX uiomove could block forever (eg. nfs-backed
		 * pages).  There needs to be a uiolockdown() function
		 * to lock the pages in memory, so that uiomove won't
		 * block.
		 */
		err = uiomove((char *)db->db_data + bufoff, tocpy,
		    UIO_WRITE, uio);
#else
		err = vn_io_fault_uiomove((char *)db->db_data + bufoff, tocpy,
		    uio);
#endif
		if (tocpy == db->db_size)
			dmu_buf_fill_done(db, tx);

		if (err)
			break;

		size -= tocpy;
	}

	dmu_buf_rele_array(dbp, numbufs, FTAG);
	return (err);
}
/*
 * Write 'size' bytes from the uio buffer.
 * To object zdb->db_object.
 * Starting at offset uio->uio_loffset.
 *
 * If the caller already has a dbuf in the target object
 * (e.g. its bonus buffer), this routine is faster than dmu_write_uio(),
 * because we don't have to find the dnode_t for the object.
 */
int
dmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size,
    dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
	int err;

	if (size == 0)
		return (0);

	DB_DNODE_ENTER(db);
	err = dmu_write_uio_dnode(DB_DNODE(db), uio, size, tx);
	DB_DNODE_EXIT(db);

	return (err);
}
/*
 * Write 'size' bytes from the uio buffer.
 * To the specified object.
 * Starting at offset uio->uio_loffset.
 */
int
dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size,
    dmu_tx_t *tx)
{
	dnode_t *dn;
	int err;

	if (size == 0)
		return (0);

	err = dnode_hold(os, object, FTAG, &dn);
	if (err != 0)
		return (err);

	err = dmu_write_uio_dnode(dn, uio, size, tx);
	dnode_rele(dn, FTAG);

	return (err);
}
#ifdef illumos
/*
 * illumos variant: copy "size" bytes from the chain of VM pages "pp"
 * into the object at "offset", one PAGESIZE piece at a time, within the
 * caller's assigned transaction.
 */
int
dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
    page_t *pp, dmu_tx_t *tx)
{
	dmu_buf_t **dbp;
	int numbufs, i;
	int err;

	if (size == 0)
		return (0);

	err = dmu_buf_hold_array(os, object, offset, size,
	    FALSE, FTAG, &numbufs, &dbp);
	if (err)
		return (err);

	for (i = 0; i < numbufs; i++) {
		int tocpy, copied, thiscpy;
		int bufoff;
		dmu_buf_t *db = dbp[i];
		caddr_t va;

		ASSERT(size > 0);
		ASSERT3U(db->db_size, >=, PAGESIZE);

		bufoff = offset - db->db_offset;
		tocpy = (int)MIN(db->db_size - bufoff, size);

		/* Only the first and last buffers may be partially written. */
		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);

		if (tocpy == db->db_size)
			dmu_buf_will_fill(db, tx);
		else
			dmu_buf_will_dirty(db, tx);

		/* Copy page-sized pieces, advancing along the page chain. */
		for (copied = 0; copied < tocpy; copied += PAGESIZE) {
			ASSERT3U(pp->p_offset, ==, db->db_offset + bufoff);
			thiscpy = MIN(PAGESIZE, tocpy - copied);
			va = zfs_map_page(pp, S_READ);
			bcopy(va, (char *)db->db_data + bufoff, thiscpy);
			zfs_unmap_page(pp, va);
			pp = pp->p_next;
			bufoff += PAGESIZE;
		}

		if (tocpy == db->db_size)
			dmu_buf_fill_done(db, tx);

		offset += tocpy;
		size -= tocpy;
	}
	dmu_buf_rele_array(dbp, numbufs, FTAG);
	return (err);
}
#else /* !illumos */
/*
 * FreeBSD variant of dmu_write_pages(): copy 'size' bytes from the
 * array of vm_page_t's 'ma' into 'object' at 'offset'.  Returns 0 on
 * success or an error from dmu_buf_hold_array().
 */
int
dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
vm_page_t *ma, dmu_tx_t *tx)
{
dmu_buf_t **dbp;
struct sf_buf *sf;
int numbufs, i;
int err;
if (size == 0)
return (0);
err = dmu_buf_hold_array(os, object, offset, size,
FALSE, FTAG, &numbufs, &dbp);
if (err)
return (err);
for (i = 0; i < numbufs; i++) {
int tocpy, copied, thiscpy;
int bufoff;
dmu_buf_t *db = dbp[i];
caddr_t va;
ASSERT(size > 0);
/* Each dbuf must be able to hold at least one full page. */
ASSERT3U(db->db_size, >=, PAGESIZE);
bufoff = offset - db->db_offset;
tocpy = (int)MIN(db->db_size - bufoff, size);
/* Only the first and last dbufs may be partially written. */
ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
/*
 * A fully-overwritten dbuf need not be read in first; a partial
 * write must dirty the dbuf so existing contents are preserved.
 */
if (tocpy == db->db_size)
dmu_buf_will_fill(db, tx);
else
dmu_buf_will_dirty(db, tx);
/* Copy one page at a time, advancing through the page array. */
for (copied = 0; copied < tocpy; copied += PAGESIZE) {
ASSERT3U(ptoa((*ma)->pindex), ==, db->db_offset + bufoff);
thiscpy = MIN(PAGESIZE, tocpy - copied);
va = zfs_map_page(*ma, &sf);
bcopy(va, (char *)db->db_data + bufoff, thiscpy);
zfs_unmap_page(sf);
ma += 1;
bufoff += PAGESIZE;
}
if (tocpy == db->db_size)
dmu_buf_fill_done(db, tx);
offset += tocpy;
size -= tocpy;
}
dmu_buf_rele_array(dbp, numbufs, FTAG);
return (err);
}
/*
 * Fill the VM pages 'ma' (a getpages request of 'count' pages) with data
 * from 'object'.  Also opportunistically fills up to *rbehind pages
 * before and *rahead pages after the request, updating both counts to
 * the number actually filled.  'last_size' is the number of valid bytes
 * in the final requested page.  Returns 0 on success or an error from
 * dmu_buf_hold_array().
 */
int
dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count,
int *rbehind, int *rahead, int last_size)
{
struct sf_buf *sf;
vm_object_t vmobj;
vm_page_t m;
dmu_buf_t **dbp;
dmu_buf_t *db;
caddr_t va;
int numbufs, i;
int bufoff, pgoff, tocpy;
int mi, di;
int err;
ASSERT3U(ma[0]->pindex + count - 1, ==, ma[count - 1]->pindex);
ASSERT(last_size <= PAGE_SIZE);
err = dmu_buf_hold_array(os, object, IDX_TO_OFF(ma[0]->pindex),
IDX_TO_OFF(count - 1) + last_size, TRUE, FTAG, &numbufs, &dbp);
if (err != 0)
return (err);
#ifdef DEBUG
IMPLY(last_size < PAGE_SIZE, *rahead == 0);
if (dbp[0]->db_offset != 0 || numbufs > 1) {
for (i = 0; i < numbufs; i++) {
ASSERT(ISP2(dbp[i]->db_size));
ASSERT((dbp[i]->db_offset % dbp[i]->db_size) == 0);
ASSERT3U(dbp[i]->db_size, ==, dbp[0]->db_size);
}
}
#endif
vmobj = ma[0]->object;
zfs_vmobject_wlock(vmobj);
db = dbp[0];
/* Opportunistically fill read-behind pages from the first buffer. */
for (i = 0; i < *rbehind; i++) {
m = vm_page_grab(vmobj, ma[0]->pindex - 1 - i,
-		    VM_ALLOC_NORMAL | VM_ALLOC_NOWAIT | VM_ALLOC_NOBUSY);
+		    VM_ALLOC_NORMAL | VM_ALLOC_NOWAIT |
+		    VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY);
if (m == NULL)
break;
-		if (m->valid != 0) {
+		if (!vm_page_none_valid(m)) {
ASSERT3U(m->valid, ==, VM_PAGE_BITS_ALL);
+			vm_page_sunbusy(m);
break;
}
ASSERT(m->dirty == 0);
ASSERT(!pmap_page_is_mapped(m));
ASSERT(db->db_size > PAGE_SIZE);
bufoff = IDX_TO_OFF(m->pindex) % db->db_size;
va = zfs_map_page(m, &sf);
bcopy((char *)db->db_data + bufoff, va, PAGESIZE);
zfs_unmap_page(sf);
-		m->valid = VM_PAGE_BITS_ALL;
+		vm_page_valid(m);
vm_page_lock(m);
if ((m->busy_lock & VPB_BIT_WAITERS) != 0)
vm_page_activate(m);
else
vm_page_deactivate(m);
vm_page_unlock(m);
+		vm_page_sunbusy(m);
}
*rbehind = i;
/* Copy buffer data into the requested pages, page by page. */
bufoff = IDX_TO_OFF(ma[0]->pindex) % db->db_size;
pgoff = 0;
for (mi = 0, di = 0; mi < count && di < numbufs; ) {
if (pgoff == 0) {
m = ma[mi];
if (m != bogus_page) {
vm_page_assert_xbusied(m);
-				ASSERT(m->valid == 0);
+				ASSERT(vm_page_none_valid(m));
ASSERT(m->dirty == 0);
ASSERT(!pmap_page_is_mapped(m));
va = zfs_map_page(m, &sf);
}
}
if (bufoff == 0)
db = dbp[di];
if (m != bogus_page) {
ASSERT3U(IDX_TO_OFF(m->pindex) + pgoff, ==,
db->db_offset + bufoff);
}
/*
 * We do not need to clamp the copy size by the file
 * size as the last block is zero-filled beyond the
 * end of file anyway.
 */
tocpy = MIN(db->db_size - bufoff, PAGESIZE - pgoff);
if (m != bogus_page)
bcopy((char *)db->db_data + bufoff, va + pgoff, tocpy);
pgoff += tocpy;
ASSERT(pgoff <= PAGESIZE);
if (pgoff == PAGESIZE) {
if (m != bogus_page) {
zfs_unmap_page(sf);
-				m->valid = VM_PAGE_BITS_ALL;
+				vm_page_valid(m);
}
ASSERT(mi < count);
mi++;
pgoff = 0;
}
bufoff += tocpy;
ASSERT(bufoff <= db->db_size);
if (bufoff == db->db_size) {
ASSERT(di < numbufs);
di++;
bufoff = 0;
}
}
#ifdef DEBUG
/*
 * Three possibilities:
 * - last requested page ends at a buffer boundary and, thus,
 *   all pages and buffers have been iterated;
 * - all requested pages are filled, but the last buffer
 *   has not been exhausted;
 *   the read-ahead is possible only in this case;
 * - all buffers have been read, but the last page has not been
 *   fully filled;
 *   this is only possible if the file has only a single buffer
 *   with a size that is not a multiple of the page size.
 */
if (mi == count) {
ASSERT(di >= numbufs - 1);
IMPLY(*rahead != 0, di == numbufs - 1);
IMPLY(*rahead != 0, bufoff != 0);
ASSERT(pgoff == 0);
}
if (di == numbufs) {
ASSERT(mi >= count - 1);
ASSERT(*rahead == 0);
IMPLY(pgoff == 0, mi == count);
if (pgoff != 0) {
ASSERT(mi == count - 1);
ASSERT((dbp[0]->db_size & PAGE_MASK) != 0);
}
}
#endif
/* Zero-fill the tail of a partially-filled final page. */
if (pgoff != 0) {
ASSERT(m != bogus_page);
bzero(va + pgoff, PAGESIZE - pgoff);
zfs_unmap_page(sf);
-		m->valid = VM_PAGE_BITS_ALL;
+		vm_page_valid(m);
}
/* Opportunistically fill read-ahead pages from the last buffer. */
for (i = 0; i < *rahead; i++) {
m = vm_page_grab(vmobj, ma[count - 1]->pindex + 1 + i,
-		    VM_ALLOC_NORMAL | VM_ALLOC_NOWAIT | VM_ALLOC_NOBUSY);
+		    VM_ALLOC_NORMAL | VM_ALLOC_NOWAIT |
+		    VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY);
if (m == NULL)
break;
-		if (m->valid != 0) {
+		if (!vm_page_none_valid(m)) {
ASSERT3U(m->valid, ==, VM_PAGE_BITS_ALL);
+			vm_page_sunbusy(m);
break;
}
ASSERT(m->dirty == 0);
ASSERT(!pmap_page_is_mapped(m));
ASSERT(db->db_size > PAGE_SIZE);
bufoff = IDX_TO_OFF(m->pindex) % db->db_size;
tocpy = MIN(db->db_size - bufoff, PAGESIZE);
va = zfs_map_page(m, &sf);
bcopy((char *)db->db_data + bufoff, va, tocpy);
if (tocpy < PAGESIZE) {
ASSERT(i == *rahead - 1);
ASSERT((db->db_size & PAGE_MASK) != 0);
bzero(va + tocpy, PAGESIZE - tocpy);
}
zfs_unmap_page(sf);
-		m->valid = VM_PAGE_BITS_ALL;
+		vm_page_valid(m);
vm_page_lock(m);
if ((m->busy_lock & VPB_BIT_WAITERS) != 0)
vm_page_activate(m);
else
vm_page_deactivate(m);
vm_page_unlock(m);
+		vm_page_sunbusy(m);
}
*rahead = i;
zfs_vmobject_wunlock(vmobj);
dmu_buf_rele_array(dbp, numbufs, FTAG);
return (0);
}
#endif /* illumos */
#endif /* _KERNEL */
/*
* Allocate a loaned anonymous arc buffer.
*/
arc_buf_t *
dmu_request_arcbuf(dmu_buf_t *handle, int size)
{
	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)handle;

	/* The loan is charged against the handle's objset's spa. */
	return (arc_loan_buf(dbi->db_objset->os_spa, B_FALSE, size));
}
/*
* Free a loaned arc buffer.
*/
void
dmu_return_arcbuf(arc_buf_t *buf)
{
/* Take the loan reference back from the caller, then destroy it. */
arc_return_buf(buf, FTAG);
arc_buf_destroy(buf, FTAG);
}
/*
* When possible directly assign passed loaned arc buffer to a dbuf.
* If this is not possible copy the contents of passed arc buf via
* dmu_write().
*/
/*
 * Assign the loaned arc buffer 'buf' to the dbuf of 'dn' covering
 * 'offset'.  If direct assignment is not possible (unaligned offset or
 * size mismatch), the data is copied via dmu_write() and the loaned
 * buffer is returned.  In both cases ownership of 'buf' passes to us.
 */
void
dmu_assign_arcbuf_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf,
dmu_tx_t *tx)
{
dmu_buf_impl_t *db;
uint32_t blksz = (uint32_t)arc_buf_lsize(buf);
uint64_t blkid;
/* Locate and hold the dbuf covering 'offset'. */
rw_enter(&dn->dn_struct_rwlock, RW_READER);
blkid = dbuf_whichblock(dn, 0, offset);
VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL);
rw_exit(&dn->dn_struct_rwlock);
/*
 * We can only assign if the offset is aligned, the arc buf is the
 * same size as the dbuf, and the dbuf is not metadata.
 */
if (offset == db->db.db_offset && blksz == db->db.db_size) {
#ifdef _KERNEL
curthread->td_ru.ru_oublock++;
#ifdef RACCT
/* Charge the write to the process' resource accounting. */
if (racct_enable) {
PROC_LOCK(curproc);
racct_add_force(curproc, RACCT_WRITEBPS, blksz);
racct_add_force(curproc, RACCT_WRITEIOPS, 1);
PROC_UNLOCK(curproc);
}
#endif /* RACCT */
#endif /* _KERNEL */
dbuf_assign_arcbuf(db, buf, tx);
dbuf_rele(db, FTAG);
} else {
objset_t *os;
uint64_t object;
/* compressed bufs must always be assignable to their dbuf */
ASSERT3U(arc_get_compression(buf), ==, ZIO_COMPRESS_OFF);
ASSERT(!(buf->b_flags & ARC_BUF_FLAG_COMPRESSED));
/* Record objset/object before dropping our dbuf hold. */
os = dn->dn_objset;
object = dn->dn_object;
dbuf_rele(db, FTAG);
dmu_write(os, object, offset, blksz, buf->b_data, tx);
dmu_return_arcbuf(buf);
XUIOSTAT_BUMP(xuiostat_wbuf_copied);
}
}
/*
 * Handle-based wrapper around dmu_assign_arcbuf_dnode(): resolves the
 * dnode from the dbuf handle and forwards the call.
 */
void
dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
    dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle;

	DB_DNODE_ENTER(db);
	dmu_assign_arcbuf_dnode(DB_DNODE(db), offset, buf, tx);
	DB_DNODE_EXIT(db);
}
/* State passed from dmu_sync() to its zio ready/done callbacks. */
typedef struct {
dbuf_dirty_record_t *dsa_dr;	/* dirty record being synced; NULL on the late-arrival path */
dmu_sync_cb_t *dsa_done;	/* caller's completion callback */
zgd_t *dsa_zgd;	/* ZIL get-data state (carries zgd_db, zgd_bp, zgd_lwb) */
dmu_tx_t *dsa_tx;	/* tx to commit; used only by the late-arrival path */
} dmu_sync_arg_t;
/* ARGSUSED */
/*
 * zio ready callback for dmu_sync(): fix up the block pointer before
 * the write is issued.
 */
static void
dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg)
{
	dmu_sync_arg_t *dsa = varg;
	dmu_buf_t *db = dsa->dsa_zgd->zgd_db;
	blkptr_t *bp = zio->io_bp;

	if (zio->io_error != 0)
		return;

	if (BP_IS_HOLE(bp)) {
		/*
		 * A block of zeros may compress to a hole, but the
		 * block size still needs to be known for replay.
		 */
		BP_SET_LSIZE(bp, db->db_size);
	} else if (!BP_IS_EMBEDDED(bp)) {
		ASSERT(BP_GET_LEVEL(bp) == 0);
		bp->blk_fill = 1;
	}
}
/* zio ready callback for the late-arrival path; delegates to dmu_sync_ready(). */
static void
dmu_sync_late_arrival_ready(zio_t *zio)
{
dmu_sync_ready(zio, NULL, zio->io_private);
}
/* ARGSUSED */
/*
 * zio done callback for dmu_sync(): record the outcome in the dirty
 * record (so syncing context sees the override, or its absence on
 * error), wake any waiters on the dbuf, and invoke the caller's
 * completion callback.  Frees the dmu_sync_arg_t.
 */
static void
dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
{
dmu_sync_arg_t *dsa = varg;
dbuf_dirty_record_t *dr = dsa->dsa_dr;
dmu_buf_impl_t *db = dr->dr_dbuf;
zgd_t *zgd = dsa->dsa_zgd;
/*
 * Record the vdev(s) backing this blkptr so they can be flushed after
 * the writes for the lwb have completed.
 */
if (zio->io_error == 0) {
zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
}
mutex_enter(&db->db_mtx);
ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
if (zio->io_error == 0) {
dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE);
if (dr->dt.dl.dr_nopwrite) {
blkptr_t *bp = zio->io_bp;
blkptr_t *bp_orig = &zio->io_bp_orig;
uint8_t chksum = BP_GET_CHECKSUM(bp_orig);
/* A nopwrite must have left the on-disk BP untouched. */
ASSERT(BP_EQUAL(bp, bp_orig));
VERIFY(BP_EQUAL(bp, db->db_blkptr));
ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF);
ASSERT(zio_checksum_table[chksum].ci_flags &
ZCHECKSUM_FLAG_NOPWRITE);
}
dr->dt.dl.dr_overridden_by = *zio->io_bp;
dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
dr->dt.dl.dr_copies = zio->io_prop.zp_copies;
/*
 * Old style holes are filled with all zeros, whereas
 * new-style holes maintain their lsize, type, level,
 * and birth time (see zio_write_compress). While we
 * need to reset the BP_SET_LSIZE() call that happened
 * in dmu_sync_ready for old style holes, we do *not*
 * want to wipe out the information contained in new
 * style holes. Thus, only zero out the block pointer if
 * it's an old style hole.
 */
if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by) &&
dr->dt.dl.dr_overridden_by.blk_birth == 0)
BP_ZERO(&dr->dt.dl.dr_overridden_by);
} else {
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
}
/* Wake anyone blocked in dbuf_sync_leaf() waiting on this dbuf. */
cv_broadcast(&db->db_changed);
mutex_exit(&db->db_mtx);
dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
kmem_free(dsa, sizeof (*dsa));
}
/*
 * zio done callback for dmu_sync_late_arrival(): commits the tx opened
 * by dmu_sync_late_arrival(), invokes the caller's completion callback,
 * and releases the abd and the dmu_sync_arg_t.
 */
static void
dmu_sync_late_arrival_done(zio_t *zio)
{
blkptr_t *bp = zio->io_bp;
dmu_sync_arg_t *dsa = zio->io_private;
blkptr_t *bp_orig = &zio->io_bp_orig;
zgd_t *zgd = dsa->dsa_zgd;
if (zio->io_error == 0) {
/*
 * Record the vdev(s) backing this blkptr so they can be
 * flushed after the writes for the lwb have completed.
 */
zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
if (!BP_IS_HOLE(bp)) {
ASSERT(!(zio->io_flags & ZIO_FLAG_NOPWRITE));
ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig));
ASSERT(zio->io_bp->blk_birth == zio->io_txg);
ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
/*
 * NOTE(review): the just-written block is handed to
 * zio_free() here — presumably because syncing context
 * will write the data again and this copy can be freed
 * once its txg syncs; confirm against zio_free().
 */
zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
}
}
dmu_tx_commit(dsa->dsa_tx);
dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
abd_put(zio->io_abd);
kmem_free(dsa, sizeof (*dsa));
}
/*
 * Handle a dmu_sync() request for a txg that is already syncing (or
 * when the pool is frozen): issue an out-of-band write of the current
 * buffer contents under a fresh tx.  Returns 0 if the write was issued,
 * or EIO if the tx could not be assigned.
 */
static int
dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
zio_prop_t *zp, zbookmark_phys_t *zb)
{
dmu_sync_arg_t *dsa;
dmu_tx_t *tx;
tx = dmu_tx_create(os);
dmu_tx_hold_space(tx, zgd->zgd_db->db_size);
if (dmu_tx_assign(tx, TXG_WAIT) != 0) {
dmu_tx_abort(tx);
/* Make zl_get_data do txg_wait_synced() */
return (SET_ERROR(EIO));
}
/*
 * In order to prevent the zgd's lwb from being free'd prior to
 * dmu_sync_late_arrival_done() being called, we have to ensure
 * the lwb's "max txg" takes this tx's txg into account.
 */
zil_lwb_add_txg(zgd->zgd_lwb, dmu_tx_get_txg(tx));
dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
dsa->dsa_dr = NULL;	/* no dirty record on this path */
dsa->dsa_done = done;
dsa->dsa_zgd = zgd;
dsa->dsa_tx = tx;	/* committed in dmu_sync_late_arrival_done() */
/*
 * Since we are currently syncing this txg, it's nontrivial to
 * determine what BP to nopwrite against, so we disable nopwrite.
 *
 * When syncing, the db_blkptr is initially the BP of the previous
 * txg. We can not nopwrite against it because it will be changed
 * (this is similar to the non-late-arrival case where the dbuf is
 * dirty in a future txg).
 *
 * Then dbuf_write_ready() sets bp_blkptr to the location we will write.
 * We can not nopwrite against it because although the BP will not
 * (typically) be changed, the data has not yet been persisted to this
 * location.
 *
 * Finally, when dbuf_write_done() is called, it is theoretically
 * possible to always nopwrite, because the data that was written in
 * this txg is the same data that we are trying to write. However we
 * would need to check that this dbuf is not dirty in any future
 * txg's (as we do in the normal dmu_sync() path). For simplicity, we
 * don't nopwrite in this case.
 */
zp->zp_nopwrite = B_FALSE;
zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size),
zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp,
dmu_sync_late_arrival_ready, NULL, NULL, dmu_sync_late_arrival_done,
dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));
return (0);
}
/*
* Intent log support: sync the block associated with db to disk.
* N.B. and XXX: the caller is responsible for making sure that the
* data isn't changing while dmu_sync() is writing it.
*
* Return values:
*
* EEXIST: this txg has already been synced, so there's nothing to do.
* The caller should not log the write.
*
* ENOENT: the block was dbuf_free_range()'d, so there's nothing to do.
* The caller should not log the write.
*
* EALREADY: this block is already in the process of being synced.
* The caller should track its progress (somehow).
*
* EIO: could not do the I/O.
* The caller should do a txg_wait_synced().
*
* 0: the I/O has been initiated.
* The caller should log this blkptr in the done callback.
* It is possible that the I/O will fail, in which case
* the error will be reported to the done callback and
* propagated to pio from zio_done().
*/
int
dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db;
objset_t *os = db->db_objset;
dsl_dataset_t *ds = os->os_dsl_dataset;
dbuf_dirty_record_t *dr;
dmu_sync_arg_t *dsa;
zbookmark_phys_t zb;
zio_prop_t zp;
dnode_t *dn;
ASSERT(pio != NULL);
ASSERT(txg != 0);
SET_BOOKMARK(&zb, ds->ds_object,
db->db.db_object, db->db_level, db->db_blkid);
/* Compute the write policy for this block before taking db_mtx. */
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp);
DB_DNODE_EXIT(db);
/*
 * If we're frozen (running ziltest), we always need to generate a bp.
 */
if (txg > spa_freeze_txg(os->os_spa))
return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
/*
 * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf()
 * and us. If we determine that this txg is not yet syncing,
 * but it begins to sync a moment later, that's OK because the
 * sync thread will block in dbuf_sync_leaf() until we drop db_mtx.
 */
mutex_enter(&db->db_mtx);
if (txg <= spa_last_synced_txg(os->os_spa)) {
/*
 * This txg has already synced. There's nothing to do.
 */
mutex_exit(&db->db_mtx);
return (SET_ERROR(EEXIST));
}
if (txg <= spa_syncing_txg(os->os_spa)) {
/*
 * This txg is currently syncing, so we can't mess with
 * the dirty record anymore; just write a new log block.
 */
mutex_exit(&db->db_mtx);
return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
}
/* Find the dirty record for the requested txg. */
dr = db->db_last_dirty;
while (dr && dr->dr_txg != txg)
dr = dr->dr_next;
if (dr == NULL) {
/*
 * There's no dr for this dbuf, so it must have been freed.
 * There's no need to log writes to freed blocks, so we're done.
 */
mutex_exit(&db->db_mtx);
return (SET_ERROR(ENOENT));
}
ASSERT(dr->dr_next == NULL || dr->dr_next->dr_txg < txg);
if (db->db_blkptr != NULL) {
/*
 * We need to fill in zgd_bp with the current blkptr so that
 * the nopwrite code can check if we're writing the same
 * data that's already on disk. We can only nopwrite if we
 * are sure that after making the copy, db_blkptr will not
 * change until our i/o completes. We ensure this by
 * holding the db_mtx, and only allowing nopwrite if the
 * block is not already dirty (see below). This is verified
 * by dmu_sync_done(), which VERIFYs that the db_blkptr has
 * not changed.
 */
*zgd->zgd_bp = *db->db_blkptr;
}
/*
 * Assume the on-disk data is X, the current syncing data (in
 * txg - 1) is Y, and the current in-memory data is Z (currently
 * in dmu_sync).
 *
 * We usually want to perform a nopwrite if X and Z are the
 * same. However, if Y is different (i.e. the BP is going to
 * change before this write takes effect), then a nopwrite will
 * be incorrect - we would override with X, which could have
 * been freed when Y was written.
 *
 * (Note that this is not a concern when we are nop-writing from
 * syncing context, because X and Y must be identical, because
 * all previous txgs have been synced.)
 *
 * Therefore, we disable nopwrite if the current BP could change
 * before this TXG. There are two ways it could change: by
 * being dirty (dr_next is non-NULL), or by being freed
 * (dnode_block_freed()). This behavior is verified by
 * zio_done(), which VERIFYs that the override BP is identical
 * to the on-disk BP.
 */
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
if (dr->dr_next != NULL || dnode_block_freed(dn, db->db_blkid))
zp.zp_nopwrite = B_FALSE;
DB_DNODE_EXIT(db);
ASSERT(dr->dr_txg == txg);
if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC ||
dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
/*
 * We have already issued a sync write for this buffer,
 * or this buffer has already been synced. It could not
 * have been dirtied since, or we would have cleared the state.
 */
mutex_exit(&db->db_mtx);
return (SET_ERROR(EALREADY));
}
/* Claim the dirty record; dmu_sync_done() will resolve the state. */
ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC;
mutex_exit(&db->db_mtx);
dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
dsa->dsa_dr = dr;
dsa->dsa_done = done;
dsa->dsa_zgd = zgd;
dsa->dsa_tx = NULL;
zio_nowait(arc_write(pio, os->os_spa, txg,
zgd->zgd_bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db),
&zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa,
ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
return (0);
}
/*
 * Set the data block size (and optionally the indirect block shift,
 * 'ibs') of 'object'.  Returns 0 or an error from dnode_hold()/
 * dnode_set_blksz().
 */
int
dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
    dmu_tx_t *tx)
{
	dnode_t *dnode;
	int error;

	error = dnode_hold(os, object, FTAG, &dnode);
	if (error != 0)
		return (error);

	error = dnode_set_blksz(dnode, size, ibs, tx);
	dnode_rele(dnode, FTAG);

	return (error);
}
/*
 * Set the checksum function of 'object' and mark the dnode dirty so the
 * change is persisted.
 */
void
dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
    dmu_tx_t *tx)
{
	dnode_t *dnode;

	/*
	 * Send streams include each object's checksum function. This
	 * check ensures that the receiving system can understand the
	 * checksum function transmitted.
	 */
	ASSERT3U(checksum, <, ZIO_CHECKSUM_LEGACY_FUNCTIONS);

	VERIFY0(dnode_hold(os, object, FTAG, &dnode));
	ASSERT3U(checksum, <, ZIO_CHECKSUM_FUNCTIONS);
	dnode->dn_checksum = checksum;
	dnode_setdirty(dnode, tx);
	dnode_rele(dnode, FTAG);
}
/*
 * Set the compression function of 'object' and mark the dnode dirty so
 * the change is persisted.
 */
void
dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
    dmu_tx_t *tx)
{
	dnode_t *dnode;

	/*
	 * Send streams include each object's compression function. This
	 * check ensures that the receiving system can understand the
	 * compression function transmitted.
	 */
	ASSERT3U(compress, <, ZIO_COMPRESS_LEGACY_FUNCTIONS);

	VERIFY0(dnode_hold(os, object, FTAG, &dnode));
	dnode->dn_compress = compress;
	dnode_setdirty(dnode, tx);
	dnode_rele(dnode, FTAG);
}
/* Tunable: if nonzero, metadata blocks are written uncompressed. */
int zfs_mdcomp_disable = 0;
SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RWTUN,
&zfs_mdcomp_disable, 0, "Disable metadata compression");
/*
 * When the "redundant_metadata" property is set to "most", only indirect
 * blocks of this level and higher will have an additional ditto block.
 */
int zfs_redundant_metadata_most_ditto_level = 2;
/*
 * Compute the zio write properties (checksum, compression, copies,
 * dedup, nopwrite) for a level-'level' block of 'dn', returning them in
 * '*zp'.  'wp' is a bitmask of WP_* flags (WP_SPILL, WP_NOFILL,
 * WP_DMU_SYNC).  'dn' may be NULL, in which case the type defaults to
 * DMU_OT_OBJSET.
 */
void
dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
{
dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET;
boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) ||
(wp & WP_SPILL));
enum zio_checksum checksum = os->os_checksum;
enum zio_compress compress = os->os_compress;
enum zio_checksum dedup_checksum = os->os_dedup_checksum;
boolean_t dedup = B_FALSE;
boolean_t nopwrite = B_FALSE;
boolean_t dedup_verify = os->os_dedup_verify;
int copies = os->os_copies;
/*
 * We maintain different write policies for each of the following
 * types of data:
 * 1. metadata
 * 2. preallocated blocks (i.e. level-0 blocks of a dump device)
 * 3. all other level 0 blocks
 */
if (ismd) {
if (zfs_mdcomp_disable) {
compress = ZIO_COMPRESS_EMPTY;
} else {
/*
 * XXX -- we should design a compression algorithm
 * that specializes in arrays of bps.
 */
compress = zio_compress_select(os->os_spa,
ZIO_COMPRESS_ON, ZIO_COMPRESS_ON);
}
/*
 * Metadata always gets checksummed. If the data
 * checksum is multi-bit correctable, and it's not a
 * ZBT-style checksum, then it's suitable for metadata
 * as well. Otherwise, the metadata checksum defaults
 * to fletcher4.
 */
if (!(zio_checksum_table[checksum].ci_flags &
ZCHECKSUM_FLAG_METADATA) ||
(zio_checksum_table[checksum].ci_flags &
ZCHECKSUM_FLAG_EMBEDDED))
checksum = ZIO_CHECKSUM_FLETCHER_4;
if (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_ALL ||
(os->os_redundant_metadata ==
ZFS_REDUNDANT_METADATA_MOST &&
(level >= zfs_redundant_metadata_most_ditto_level ||
DMU_OT_IS_METADATA(type) || (wp & WP_SPILL))))
copies++;
} else if (wp & WP_NOFILL) {
ASSERT(level == 0);
/*
 * If we're writing preallocated blocks, we aren't actually
 * writing them so don't set any policy properties. These
 * blocks are currently only used by an external subsystem
 * outside of zfs (i.e. dump) and not written by the zio
 * pipeline.
 */
compress = ZIO_COMPRESS_OFF;
checksum = ZIO_CHECKSUM_NOPARITY;
} else {
compress = zio_compress_select(os->os_spa, dn->dn_compress,
compress);
checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ?
zio_checksum_select(dn->dn_checksum, checksum) :
dedup_checksum;
/*
 * Determine dedup setting. If we are in dmu_sync(),
 * we won't actually dedup now because that's all
 * done in syncing context; but we do want to use the
 * dedup checksum. If the checksum is not strong
 * enough to ensure unique signatures, force
 * dedup_verify.
 */
if (dedup_checksum != ZIO_CHECKSUM_OFF) {
dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE;
if (!(zio_checksum_table[checksum].ci_flags &
ZCHECKSUM_FLAG_DEDUP))
dedup_verify = B_TRUE;
}
/*
 * Enable nopwrite if we have secure enough checksum
 * algorithm (see comment in zio_nop_write) and
 * compression is enabled. We don't enable nopwrite if
 * dedup is enabled as the two features are mutually
 * exclusive.
 */
nopwrite = (!dedup && (zio_checksum_table[checksum].ci_flags &
ZCHECKSUM_FLAG_NOPWRITE) &&
compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled);
}
zp->zp_checksum = checksum;
zp->zp_compress = compress;
ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_INHERIT);
zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type;
zp->zp_level = level;
zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa));
zp->zp_dedup = dedup;
zp->zp_dedup_verify = dedup && dedup_verify;
zp->zp_nopwrite = nopwrite;
}
/*
 * Find the next hole (if 'hole' is true) or data region in 'object',
 * starting the search at *off and storing the result back in *off.
 * Dirty state is synced first so the on-disk block pointers are
 * current.
 */
int
dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
{
	dnode_t *dn;
	int error;

	/*
	 * Sync any current changes before
	 * we go trundling through the block pointers.
	 */
	error = dmu_object_wait_synced(os, object);
	if (error != 0)
		return (error);

	error = dnode_hold(os, object, FTAG, &dn);
	if (error != 0)
		return (error);

	error = dnode_next_offset(dn, hole ? DNODE_FIND_HOLE : 0, off, 1, 1, 0);
	dnode_rele(dn, FTAG);

	return (error);
}
/*
* Given the ZFS object, if it contains any dirty nodes
* this function flushes all dirty blocks to disk. This
* ensures the DMU object info is updated. A more efficient
* future version might just find the TXG with the maximum
* ID and wait for that to be synced.
*/
int
dmu_object_wait_synced(objset_t *os, uint64_t object)
{
	dnode_t *dn;
	int error;
	int i;

	error = dnode_hold(os, object, FTAG, &dn);
	if (error != 0)
		return (error);

	/* Scan the per-txg dirty links to see if any txg holds changes. */
	for (i = 0; i < TXG_SIZE; i++)
		if (list_link_active(&dn->dn_dirty_link[i]))
			break;
	dnode_rele(dn, FTAG);

	/* Found a dirty txg: wait for the pool to sync it out. */
	if (i != TXG_SIZE)
		txg_wait_synced(dmu_objset_pool(os), 0);

	return (0);
}
/*
 * Fill '*doi' with information about 'dn'.  Lock-free worker; callers
 * normally go through dmu_object_info_from_dnode(), which takes
 * dn_struct_rwlock and dn_mtx around this.
 *
 * NOTE(review): the double-underscore prefix is a reserved identifier
 * in standard C; kept for interface compatibility.
 */
void
__dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
{
dnode_phys_t *dnp = dn->dn_phys;
doi->doi_data_block_size = dn->dn_datablksz;
/* dn_indblkshift == 0 means no indirect blocks. */
doi->doi_metadata_block_size = dn->dn_indblkshift ?
1ULL << dn->dn_indblkshift : 0;
doi->doi_type = dn->dn_type;
doi->doi_bonus_type = dn->dn_bonustype;
doi->doi_bonus_size = dn->dn_bonuslen;
doi->doi_dnodesize = dn->dn_num_slots << DNODE_SHIFT;
doi->doi_indirection = dn->dn_nlevels;
doi->doi_checksum = dn->dn_checksum;
doi->doi_compress = dn->dn_compress;
doi->doi_nblkptr = dn->dn_nblkptr;
/* Round up to whole 512-byte sectors. */
doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9;
doi->doi_max_offset = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
/* Sum the fill counts of all top-level block pointers. */
doi->doi_fill_count = 0;
for (int i = 0; i < dnp->dn_nblkptr; i++)
doi->doi_fill_count += BP_GET_FILL(&dnp->dn_blkptr[i]);
}
/*
 * Locked wrapper around __dmu_object_info_from_dnode(): takes
 * dn_struct_rwlock (reader) and dn_mtx for a consistent snapshot.
 */
void
dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
{
rw_enter(&dn->dn_struct_rwlock, RW_READER);
mutex_enter(&dn->dn_mtx);
__dmu_object_info_from_dnode(dn, doi);
mutex_exit(&dn->dn_mtx);
rw_exit(&dn->dn_struct_rwlock);
}
/*
* Get information on a DMU object.
* If doi is NULL, just indicates whether the object exists.
*/
int
dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi)
{
	dnode_t *dn;
	int error;

	error = dnode_hold(os, object, FTAG, &dn);
	if (error != 0)
		return (error);

	/* A NULL doi means the caller only wants an existence check. */
	if (doi != NULL)
		dmu_object_info_from_dnode(dn, doi);

	dnode_rele(dn, FTAG);
	return (0);
}
/*
* As above, but faster; can be used when you have a held dbuf in hand.
*/
/*
 * Fetch object info via a held dbuf, avoiding the dnode lookup that
 * dmu_object_info() would perform.
 */
void
dmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi)
{
	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db_fake;

	DB_DNODE_ENTER(dbi);
	dmu_object_info_from_dnode(DB_DNODE(dbi), doi);
	DB_DNODE_EXIT(dbi);
}
/*
* Faster still when you only care about the size.
* This is specifically optimized for zfs_getattr().
*/
void
dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize,
u_longlong_t *nblk512)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
dnode_t *dn;
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
/* Data block size in bytes. */
*blksize = dn->dn_datablksz;
/* add in number of slots used for the dnode itself */
/* Rounded to the nearest 512-byte block (SPA_MINBLOCKSIZE/2 = round). */
*nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >>
SPA_MINBLOCKSHIFT) + dn->dn_num_slots;
DB_DNODE_EXIT(db);
}
/*
 * Report the on-disk dnode size (in bytes) of the object that 'db_fake'
 * belongs to.
 */
void
dmu_object_dnsize_from_db(dmu_buf_t *db_fake, int *dnsize)
{
	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db_fake;
	dnode_t *dnode;

	DB_DNODE_ENTER(dbi);
	dnode = DB_DNODE(dbi);
	*dnsize = dnode->dn_num_slots << DNODE_SHIFT;
	DB_DNODE_EXIT(dbi);
}
/*
 * Byte-swap an array of uint64_t values in place.  'size' is in bytes
 * and must be a multiple of 8.
 */
void
byteswap_uint64_array(void *vbuf, size_t size)
{
	uint64_t *buf = vbuf;
	size_t count = size >> 3;
	/* size_t, not int: avoids signed/unsigned compare with 'count'
	 * and index overflow for buffers >= 2^31 elements. */
	size_t i;

	ASSERT((size & 7) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_64(buf[i]);
}
/*
 * Byte-swap an array of uint32_t values in place.  'size' is in bytes
 * and must be a multiple of 4.
 */
void
byteswap_uint32_array(void *vbuf, size_t size)
{
	uint32_t *buf = vbuf;
	size_t count = size >> 2;
	/* size_t, not int: avoids signed/unsigned compare with 'count'
	 * and index overflow for buffers >= 2^31 elements. */
	size_t i;

	ASSERT((size & 3) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_32(buf[i]);
}
/*
 * Byte-swap an array of uint16_t values in place.  'size' is in bytes
 * and must be a multiple of 2.
 */
void
byteswap_uint16_array(void *vbuf, size_t size)
{
	uint16_t *buf = vbuf;
	size_t count = size >> 1;
	/* size_t, not int: avoids signed/unsigned compare with 'count'
	 * and index overflow for buffers >= 2^31 elements. */
	size_t i;

	ASSERT((size & 1) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_16(buf[i]);
}
/* ARGSUSED */
/*
 * Intentional no-op: single bytes have no byte order.  Exists so
 * callers can dispatch a byteswap function uniformly for all widths.
 */
void
byteswap_uint8_array(void *vbuf, size_t size)
{
}
/*
 * Bring up the DMU and the subsystems it depends on.  The order of the
 * calls matters (see the teardown-order note in dmu_fini()); change it
 * only with care.
 */
void
dmu_init(void)
{
abd_init();
zfs_dbgmsg_init();
sa_cache_init();
xuio_stat_init();
dmu_objset_init();
dnode_init();
zfetch_init();
zio_compress_init();
l2arc_init();
arc_init();
dbuf_init();
}
/*
 * Tear down the subsystems brought up by dmu_init(), respecting
 * inter-subsystem dependencies.
 */
void
dmu_fini(void)
{
arc_fini(); /* arc depends on l2arc, so arc must go first */
l2arc_fini();
zfetch_fini();
zio_compress_fini();
dbuf_fini();
dnode_fini();
dmu_objset_fini();
xuio_stat_fini();
sa_cache_fini();
zfs_dbgmsg_fini();
abd_fini();
}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c (revision 353538)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c (revision 353539)
@@ -1,6021 +1,6020 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
* Copyright 2017 Nexenta Systems, Inc.
*/
/* Portions Copyright 2007 Jeremy Teo */
/* Portions Copyright 2010 Robert Milkowski */
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
/*
* Programming rules.
*
* Each vnode op performs some logical unit of work. To do this, the ZPL must
* properly lock its in-core state, create a DMU transaction, do the work,
* record this work in the intent log (ZIL), commit the DMU transaction,
* and wait for the intent log to commit if it is a synchronous operation.
* Moreover, the vnode ops must work in both normal and log replay context.
* The ordering of events is important to avoid deadlocks and references
* to freed memory. The example below illustrates the following Big Rules:
*
* (1) A check must be made in each zfs thread for a mounted file system.
* This is done avoiding races using ZFS_ENTER(zfsvfs).
* A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes
* must be checked with ZFS_VERIFY_ZP(zp). Both of these macros
* can return EIO from the calling function.
*
* (2) VN_RELE() should always be the last thing except for zil_commit()
* (if necessary) and ZFS_EXIT(). This is for 3 reasons:
* First, if it's the last reference, the vnode/znode
* can be freed, so the zp may point to freed memory. Second, the last
* reference will call zfs_zinactive(), which may induce a lot of work --
* pushing cached pages (which acquires range locks) and syncing out
* cached atime changes. Third, zfs_zinactive() may require a new tx,
* which could deadlock the system if you were already holding one.
* If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
*
* (3) All range locks must be grabbed before calling dmu_tx_assign(),
* as they can span dmu_tx_assign() calls.
*
* (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
* dmu_tx_assign(). This is critical because we don't want to block
* while holding locks.
*
* If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This
* reduces lock contention and CPU usage when we must wait (note that if
* throughput is constrained by the storage, nearly every transaction
* must wait).
*
* Note, in particular, that if a lock is sometimes acquired before
* the tx assigns, and sometimes after (e.g. z_lock), then failing
* to use a non-blocking assign can deadlock the system. The scenario:
*
* Thread A has grabbed a lock before calling dmu_tx_assign().
* Thread B is in an already-assigned tx, and blocks for this lock.
* Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
* forever, because the previous txg can't quiesce until B's tx commits.
*
* If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
* then drop all locks, call dmu_tx_wait(), and try again. On subsequent
* calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT,
* to indicate that this operation has already called dmu_tx_wait().
* This will ensure that we don't retry forever, waiting a short bit
* each time.
*
* (5) If the operation succeeded, generate the intent log entry for it
* before dropping locks. This ensures that the ordering of events
* in the intent log matches the order in which they actually occurred.
* During ZIL replay the zfs_log_* functions will update the sequence
* number to indicate the zil transaction has replayed.
*
* (6) At the end of each vnode op, the DMU tx must always commit,
* regardless of whether there were any errors.
*
* (7) After dropping all locks, invoke zil_commit(zilog, foid)
* to ensure that synchronous semantics are provided when necessary.
*
* In general, this is how things should be ordered in each vnode op:
*
* ZFS_ENTER(zfsvfs); // exit if unmounted
* top:
* zfs_dirent_lookup(&dl, ...) // lock directory entry (may VN_HOLD())
* rw_enter(...); // grab any other locks you need
* tx = dmu_tx_create(...); // get DMU tx
* dmu_tx_hold_*(); // hold each object you might modify
* error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
* if (error) {
* rw_exit(...); // drop locks
* zfs_dirent_unlock(dl); // unlock directory entry
* VN_RELE(...); // release held vnodes
* if (error == ERESTART) {
* waited = B_TRUE;
* dmu_tx_wait(tx);
* dmu_tx_abort(tx);
* goto top;
* }
* dmu_tx_abort(tx); // abort DMU tx
* ZFS_EXIT(zfsvfs); // finished in zfs
* return (error); // really out of space
* }
* error = do_real_work(); // do whatever this VOP does
* if (error == 0)
* zfs_log_*(...); // on success, make ZIL entry
* dmu_tx_commit(tx); // commit DMU tx -- error or not
* rw_exit(...); // drop locks
* zfs_dirent_unlock(dl); // unlock directory entry
* VN_RELE(...); // release held vnodes
* zil_commit(zilog, foid); // synchronous when necessary
* ZFS_EXIT(zfsvfs); // finished in zfs
* return (error); // done, report error
*/
/* ARGSUSED */
/*
 * VOP_OPEN: enforce append-only restrictions, run the anti-virus scan
 * hook when configured, and count synchronous opens on the znode.
 * Returns 0, EPERM (append-only violation), or EACCES (vscan reject).
 */
static int
zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
{
	znode_t *node = VTOZ(*vpp);
	zfsvfs_t *zvfs = node->z_zfsvfs;
	boolean_t deny_append;

	ZFS_ENTER(zvfs);
	ZFS_VERIFY_ZP(node);

	/* A write open of an append-only file must itself be append-mode. */
	deny_append = (flag & FWRITE) != 0 && (flag & FAPPEND) == 0 &&
	    (node->z_pflags & ZFS_APPENDONLY) != 0;
	if (deny_append) {
		ZFS_EXIT(zvfs);
		return (SET_ERROR(EPERM));
	}

	/* Virus-scan regular, non-quarantined, non-empty files on open. */
	if (!zfs_has_ctldir(node) && node->z_zfsvfs->z_vscan &&
	    ZTOV(node)->v_type == VREG &&
	    !(node->z_pflags & ZFS_AV_QUARANTINED) && node->z_size > 0) {
		if (fs_vscan(*vpp, cr, 0) != 0) {
			ZFS_EXIT(zvfs);
			return (SET_ERROR(EACCES));
		}
	}

	/* Keep a count of the synchronous opens in the znode. */
	if (flag & (FSYNC | FDSYNC))
		atomic_inc_32(&node->z_sync_cnt);

	ZFS_EXIT(zvfs);
	return (0);
}
/* ARGSUSED */
/*
 * VOP_CLOSE: drop per-process file locks and share reservations,
 * undo the synchronous-open count taken in zfs_open(), and run the
 * anti-virus scan hook on close when configured.  Always returns 0.
 */
static int
zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
caller_context_t *ct)
{
	znode_t *node = VTOZ(vp);
	zfsvfs_t *zvfs = node->z_zfsvfs;

	/*
	 * Clean up any byte-range locks and shares held by this
	 * process on the vp.
	 */
	cleanlocks(vp, ddi_get_pid(), 0);
	cleanshares(vp, ddi_get_pid());

	ZFS_ENTER(zvfs);
	ZFS_VERIFY_ZP(node);

	/* Decrement the synchronous opens in the znode. */
	if ((flag & (FSYNC | FDSYNC)) != 0 && count == 1)
		atomic_dec_32(&node->z_sync_cnt);

	if (!zfs_has_ctldir(node) && node->z_zfsvfs->z_vscan &&
	    ZTOV(node)->v_type == VREG &&
	    !(node->z_pflags & ZFS_AV_QUARANTINED) && node->z_size > 0)
		VERIFY(fs_vscan(vp, cr, 1) == 0);

	ZFS_EXIT(zvfs);
	return (0);
}
/*
 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
 * data (cmd == _FIO_SEEK_DATA).  "off" is an in/out parameter.
 *
 * Returns ENXIO when the starting offset is at/past EOF or no
 * hole/data region exists past it; otherwise returns the
 * dmu_offset_next() result and advances *off (never backwards).
 */
static int
zfs_holey(vnode_t *vp, u_long cmd, offset_t *off)
{
	znode_t *node = VTOZ(vp);
	uint64_t probe = (uint64_t)*off;	/* candidate new offset */
	uint64_t fsize = node->z_size;
	boolean_t want_hole = (cmd == _FIO_SEEK_HOLE);
	int rc;

	if (probe >= fsize)
		return (SET_ERROR(ENXIO));

	rc = dmu_offset_next(node->z_zfsvfs->z_os, node->z_id, want_hole,
	    &probe);
	if (rc == ESRCH)
		return (SET_ERROR(ENXIO));

	/*
	 * We could find a hole that begins after the logical end-of-file,
	 * because dmu_offset_next() only works on whole blocks.  If the
	 * EOF falls mid-block, then indicate that the "virtual hole"
	 * at the end of the file begins at the logical EOF, rather than
	 * at the end of the last block.
	 */
	if (probe > fsize) {
		ASSERT(want_hole);
		probe = fsize;
	}

	/* Only move the offset forward. */
	if (probe >= (uint64_t)*off)
		*off = probe;
	return (rc);
}
/* ARGSUSED */
/*
 * VOP_IOCTL entry point.  Handled commands:
 *   _FIOFFS, _FIOGDIO, _FIOSDIO - accepted as no-ops (bfu compatibility).
 *   _FIO_SEEK_DATA / _FIO_SEEK_HOLE - lseek(2) hole/data support; the
 *       offset referenced by "data" is an in/out parameter (zfs_holey()).
 *   _FIO_COUNT_FILLED (illumos only) - number of filled blocks in the
 *       object, after syncing out dirty state.
 * Any other command returns ENOTTY.
 */
static int
zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
int *rvalp, caller_context_t *ct)
{
offset_t off;
offset_t ndata;
dmu_object_info_t doi;
int error;
zfsvfs_t *zfsvfs;
znode_t *zp;
switch (com) {
case _FIOFFS:
{
return (0);
/*
* The following two ioctls are used by bfu. Faking out,
* necessary to avoid bfu errors.
*/
}
case _FIOGDIO:
case _FIOSDIO:
{
return (0);
}
case _FIO_SEEK_DATA:
case _FIO_SEEK_HOLE:
{
#ifdef illumos
if (ddi_copyin((void *)data, &off, sizeof (off), flag))
return (SET_ERROR(EFAULT));
#else
/* On FreeBSD the argument is an in-kernel pointer; no copyin needed. */
off = *(offset_t *)data;
#endif
zp = VTOZ(vp);
zfsvfs = zp->z_zfsvfs;
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
/* offset parameter is in/out */
error = zfs_holey(vp, com, &off);
ZFS_EXIT(zfsvfs);
if (error)
return (error);
#ifdef illumos
if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
return (SET_ERROR(EFAULT));
#else
*(offset_t *)data = off;
#endif
return (0);
}
#ifdef illumos
case _FIO_COUNT_FILLED:
{
/*
* _FIO_COUNT_FILLED adds a new ioctl command which
* exposes the number of filled blocks in a
* ZFS object.
*/
zp = VTOZ(vp);
zfsvfs = zp->z_zfsvfs;
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
/*
* Wait for all dirty blocks for this object
* to get synced out to disk, and the DMU info
* updated.
*/
error = dmu_object_wait_synced(zfsvfs->z_os, zp->z_id);
if (error) {
ZFS_EXIT(zfsvfs);
return (error);
}
/*
* Retrieve fill count from DMU object.
*/
error = dmu_object_info(zfsvfs->z_os, zp->z_id, &doi);
if (error) {
ZFS_EXIT(zfsvfs);
return (error);
}
ndata = doi.doi_fill_count;
ZFS_EXIT(zfsvfs);
if (ddi_copyout(&ndata, (void *)data, sizeof (ndata), flag))
return (SET_ERROR(EFAULT));
return (0);
}
#endif
}
return (SET_ERROR(ENOTTY));
}
/*
 * Shared-busy the resident, fully-valid page backing "start" in vp's VM
 * object (the object write lock must be held).  Write access is removed
 * from the page's mappings and the DEV_BSIZE-aligned portion of
 * [off, off + nbytes) has its dirty bits cleared.  Returns the busied
 * page, or NULL when vm_page_grab_valid() yields no usable page.
 */
static vm_page_t
page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
{
vm_object_t obj;
vm_page_t pp;
int64_t end;
/*
* At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE
* aligned boundaries, if the range is not aligned. As a result a
* DEV_BSIZE subrange with partially dirty data may get marked as clean.
* It may happen that all DEV_BSIZE subranges are marked clean and thus
* the whole page would be considered clean despite having some dirty data.
* For this reason we should shrink the range to DEV_BSIZE aligned
* boundaries before calling vm_page_clear_dirty.
*/
end = rounddown2(off + nbytes, DEV_BSIZE);
off = roundup2(off, DEV_BSIZE);
nbytes = end - off;
obj = vp->v_object;
zfs_vmobject_assert_wlocked(obj);
vm_page_grab_valid(&pp, obj, OFF_TO_IDX(start), VM_ALLOC_NOCREAT |
VM_ALLOC_SBUSY | VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY);
if (pp != NULL) {
ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
vm_object_pip_add(obj, 1);
pmap_remove_write(pp);
if (nbytes != 0)
vm_page_clear_dirty(pp, off, nbytes);
}
return (pp);
}
/*
 * Undo page_busy(): drop the shared-busy state and release the
 * paging-in-progress reference taken on the page's object.
 */
static void
page_unbusy(vm_page_t m)
{
	vm_page_sunbusy(m);
	vm_object_pip_wakeup(m->object);
}
/*
 * Wire the resident, valid page backing "start" in vp's VM object
 * without busying it.  The object write lock must be held.  Returns
 * the wired page, or NULL when no usable page is resident.
 */
static vm_page_t
page_wire(vnode_t *vp, int64_t start)
{
	vm_object_t object = vp->v_object;
	vm_page_t page;

	zfs_vmobject_assert_wlocked(object);
	vm_page_grab_valid(&page, object, OFF_TO_IDX(start),
	    VM_ALLOC_NOCREAT | VM_ALLOC_WIRED | VM_ALLOC_IGN_SBUSY |
	    VM_ALLOC_NOBUSY);
	return (page);
}
/*
 * Undo page_wire(): release the wiring and requeue the page on the
 * active queue.
 */
static void
page_unwire(vm_page_t m)
{
	vm_page_unwire(m, PQ_ACTIVE);
}
/*
 * When a file is memory mapped, we must keep the IO data synchronized
 * between the DMU cache and the memory mapped pages.  What this means:
 *
 * On Write:	If we find a memory mapped page, we write to *both*
 *		the page and the dmu buffer.
 *
 * "start"/"len" describe the byte range just written; each resident
 * page in that range is re-read from the DMU (object "oid" in "os")
 * so the page cache matches the freshly committed data.
 */
static void
update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid,
    int segflg, dmu_tx_t *tx)
{
	vm_object_t obj;
	struct sf_buf *sf;
	caddr_t va;
	int off;

	ASSERT(segflg != UIO_NOCOPY);
	ASSERT(vp->v_mount != NULL);

	obj = vp->v_object;
	ASSERT(obj != NULL);

	off = start & PAGEOFFSET;
	zfs_vmobject_wlock(obj);
	vm_object_pip_add(obj, 1);
	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
		vm_page_t pp;
		int nbytes = imin(PAGESIZE - off, len);

		if ((pp = page_busy(vp, start, off, nbytes)) != NULL) {
			zfs_vmobject_wunlock(obj);

			va = zfs_map_page(pp, &sf);
			/*
			 * Copy the just-written bytes from the DMU into
			 * the mapped page.  (Stray ";;" removed here.)
			 */
			(void) dmu_read(os, oid, start + off, nbytes,
			    va + off, DMU_READ_PREFETCH);
			zfs_unmap_page(sf);

			zfs_vmobject_wlock(obj);
			page_unbusy(pp);
		}
		len -= nbytes;
		off = 0;	/* only the first page can be misaligned */
	}
	vm_object_pip_wakeup(obj);
	zfs_vmobject_wunlock(obj);
}
/*
* Read with UIO_NOCOPY flag means that sendfile(2) requests
* ZFS to populate a range of page cache pages with data.
*
* NOTE: this function could be optimized to pre-allocate
* all pages in advance, drain exclusive busy on all of them,
* map them into contiguous KVA region and populate them
* in one single dmu_read() call.
*/
/*
* NOTE(review): this block still carries unified-diff "-"/"+" markers
* from the r353539 patch; the code lines are left byte-identical.
*/
static int
mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio)
{
znode_t *zp = VTOZ(vp);
objset_t *os = zp->z_zfsvfs->z_os;
struct sf_buf *sf;
vm_object_t obj;
vm_page_t pp;
int64_t start;
caddr_t va;
int len = nbytes;
int off;
int error = 0;
ASSERT(uio->uio_segflg == UIO_NOCOPY);
ASSERT(vp->v_mount != NULL);
obj = vp->v_object;
ASSERT(obj != NULL);
/* sendfile(2) always starts page-aligned. */
ASSERT((uio->uio_loffset & PAGEOFFSET) == 0);
zfs_vmobject_wlock(obj);
for (start = uio->uio_loffset; len > 0; start += PAGESIZE) {
int bytes = MIN(PAGESIZE, len);
pp = vm_page_grab(obj, OFF_TO_IDX(start), VM_ALLOC_SBUSY |
VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY);
- if (pp->valid == 0) {
+ if (vm_page_none_valid(pp)) {
zfs_vmobject_wunlock(obj);
/* Page has no valid data yet: fill it from the DMU. */
va = zfs_map_page(pp, &sf);
error = dmu_read(os, zp->z_id, start, bytes, va,
DMU_READ_PREFETCH);
if (bytes != PAGESIZE && error == 0)
bzero(va + bytes, PAGESIZE - bytes);
zfs_unmap_page(sf);
zfs_vmobject_wlock(obj);
- vm_page_sunbusy(pp);
- if (error) {
- if (!vm_page_busied(pp) && !vm_page_wired(pp) &&
- pp->valid == 0)
- vm_page_free(pp);
- } else {
- pp->valid = VM_PAGE_BITS_ALL;
+ if (error == 0) {
+ vm_page_valid(pp);
vm_page_lock(pp);
vm_page_activate(pp);
vm_page_unlock(pp);
}
+ vm_page_sunbusy(pp);
/*
* NOTE(review): "!vm_page_wired(pp) == 0" parses as
* "(!vm_page_wired(pp)) == 0", i.e. frees only WIRED pages;
* presumably "!vm_page_wired(pp)" was intended — confirm
* against the upstream fix before changing the patch.
*/
+ if (error != 0 && !vm_page_wired(pp) == 0 &&
+ pp->valid == 0 && vm_page_tryxbusy(pp))
+ vm_page_free(pp);
} else {
ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
vm_page_sunbusy(pp);
}
if (error)
break;
uio->uio_resid -= bytes;
uio->uio_offset += bytes;
len -= bytes;
}
zfs_vmobject_wunlock(obj);
return (error);
}
/*
 * When a file is memory mapped, we must keep the IO data synchronized
 * between the DMU cache and the memory mapped pages.  What this means:
 *
 * On Read:	We "read" preferentially from memory mapped pages,
 *		else we default from the dmu buffer.
 *
 * NOTE:  We will always "break up" the IO into PAGESIZE uiomoves when
 *	  the file is memory mapped.
 *
 * Returns 0 on success or the first uiomove/dmu_read error.
 */
static int
mappedread(vnode_t *vp, int nbytes, uio_t *uio)
{
	znode_t *zp = VTOZ(vp);
	vm_object_t obj;
	int64_t start;
	int len = nbytes;
	int off;
	int error = 0;

	ASSERT(vp->v_mount != NULL);
	obj = vp->v_object;
	ASSERT(obj != NULL);

	start = uio->uio_loffset;
	off = start & PAGEOFFSET;
	zfs_vmobject_wlock(obj);
	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
		vm_page_t pp;
		uint64_t bytes = MIN(PAGESIZE - off, len);

		/*
		 * Explicit "!= NULL" added; the original assigned inside
		 * the condition without parenthesizing the comparison.
		 */
		if ((pp = page_wire(vp, start)) != NULL) {
			struct sf_buf *sf;
			caddr_t va;

			/* Resident page: copy directly from the page cache. */
			zfs_vmobject_wunlock(obj);
			va = zfs_map_page(pp, &sf);
#ifdef illumos
			error = uiomove(va + off, bytes, UIO_READ, uio);
#else
			error = vn_io_fault_uiomove(va + off, bytes, uio);
#endif
			zfs_unmap_page(sf);
			zfs_vmobject_wlock(obj);
			page_unwire(pp);
		} else {
			/* Not resident: fall back to the DMU buffer. */
			zfs_vmobject_wunlock(obj);
			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
			    uio, bytes);
			zfs_vmobject_wlock(obj);
		}
		len -= bytes;
		off = 0;	/* only the first page can be misaligned */
		if (error)
			break;
	}
	zfs_vmobject_wunlock(obj);
	return (error);
}
/* Max bytes read per transaction-free chunk of a large read. */
offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
/*
* Read bytes from specified file into supplied buffer.
*
* IN: vp - vnode of file to be read from.
* uio - structure supplying read location, range info,
* and return buffer.
* ioflag - SYNC flags; used to provide FRSYNC semantics.
* cr - credentials of caller.
* ct - caller context
*
* OUT: uio - updated offset and range, buffer filled.
*
* RETURN: 0 on success, error code on failure.
*
* Side Effects:
* vp - atime updated if byte count > 0
*/
/* ARGSUSED */
static int
zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
{
znode_t *zp = VTOZ(vp);
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
ssize_t n, nbytes;
int error = 0;
rl_t *rl;
xuio_t *xuio = NULL;
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
/* Quarantined files may not be read. */
if (zp->z_pflags & ZFS_AV_QUARANTINED) {
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EACCES));
}
/*
* Validate file offset
*/
if (uio->uio_loffset < (offset_t)0) {
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EINVAL));
}
/*
* Fasttrack empty reads
*/
if (uio->uio_resid == 0) {
ZFS_EXIT(zfsvfs);
return (0);
}
/*
* Check for mandatory locks
*/
if (MANDMODE(zp->z_mode)) {
if (error = chklock(vp, FREAD,
uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
ZFS_EXIT(zfsvfs);
return (error);
}
}
/*
* If we're in FRSYNC mode, sync out this znode before reading it.
*/
if (zfsvfs->z_log &&
(ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
zil_commit(zfsvfs->z_log, zp->z_id);
/*
* Lock the range against changes.
*/
rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);
/*
* If we are reading past end-of-file we can skip
* to the end; but we might still need to set atime.
*/
if (uio->uio_loffset >= zp->z_size) {
error = 0;
goto out;
}
ASSERT(uio->uio_loffset < zp->z_size);
/* Bound the read by the bytes remaining before EOF. */
n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
#ifdef illumos
if ((uio->uio_extflg == UIO_XUIO) &&
(((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
int nblk;
int blksz = zp->z_blksz;
uint64_t offset = uio->uio_loffset;
xuio = (xuio_t *)uio;
if ((ISP2(blksz))) {
nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
blksz)) / blksz;
} else {
ASSERT(offset + n <= blksz);
nblk = 1;
}
(void) dmu_xuio_init(xuio, nblk);
if (vn_has_cached_data(vp)) {
/*
* For simplicity, we always allocate a full buffer
* even if we only expect to read a portion of a block.
*/
while (--nblk >= 0) {
(void) dmu_xuio_add(xuio,
dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
blksz), 0, blksz);
}
}
}
#endif /* illumos */
/* Copy out the data in zfs_read_chunk_size-aligned chunks. */
while (n > 0) {
nbytes = MIN(n, zfs_read_chunk_size -
P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
#ifdef __FreeBSD__
if (uio->uio_segflg == UIO_NOCOPY)
error = mappedread_sf(vp, nbytes, uio);
else
#endif /* __FreeBSD__ */
if (vn_has_cached_data(vp)) {
error = mappedread(vp, nbytes, uio);
} else {
error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
uio, nbytes);
}
if (error) {
/* convert checksum errors into IO errors */
if (error == ECKSUM)
error = SET_ERROR(EIO);
break;
}
n -= nbytes;
}
out:
zfs_range_unlock(rl);
ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
ZFS_EXIT(zfsvfs);
return (error);
}
/*
* Write the bytes to a file.
*
* IN: vp - vnode of file to be written to.
* uio - structure supplying write location, range info,
* and data buffer.
* ioflag - FAPPEND, FSYNC, and/or FDSYNC. FAPPEND is
* set if in append mode.
* cr - credentials of caller.
* ct - caller context (NFS/CIFS fem monitor only)
*
* OUT: uio - updated offset and range.
*
* RETURN: 0 on success, error code on failure.
*
* Timestamps:
* vp - ctime|mtime updated if byte count > 0
*/
/* ARGSUSED */
static int
zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
{
znode_t *zp = VTOZ(vp);
rlim64_t limit = MAXOFFSET_T;
ssize_t start_resid = uio->uio_resid;
ssize_t tx_bytes;
uint64_t end_size;
dmu_tx_t *tx;
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
zilog_t *zilog;
offset_t woff;
ssize_t n, nbytes;
rl_t *rl;
int max_blksz = zfsvfs->z_max_blksz;
int error = 0;
arc_buf_t *abuf;
iovec_t *aiov = NULL;
xuio_t *xuio = NULL;
int i_iov = 0;
int iovcnt = uio->uio_iovcnt;
iovec_t *iovp = uio->uio_iov;
int write_eof;
int count = 0;
sa_bulk_attr_t bulk[4];
uint64_t mtime[2], ctime[2];
/*
* Fasttrack empty write
*/
n = start_resid;
if (n == 0)
return (0);
if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
limit = MAXOFFSET_T;
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
/* Attributes updated in bulk at the end of each chunk's tx. */
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
&zp->z_size, 8);
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
&zp->z_pflags, 8);
/*
* In a case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots) our
* callers might not be able to detect properly that we are read-only,
* so check it explicitly here.
*/
if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EROFS));
}
/*
* If immutable or not appending then return EPERM.
* Intentionally allow ZFS_READONLY through here.
* See zfs_zaccess_common()
*/
if ((zp->z_pflags & ZFS_IMMUTABLE) ||
((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
(uio->uio_loffset < zp->z_size))) {
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EPERM));
}
zilog = zfsvfs->z_log;
/*
* Validate file offset
*/
woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
if (woff < 0) {
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EINVAL));
}
/*
* Check for mandatory locks before calling zfs_range_lock()
* in order to prevent a deadlock with locks set via fcntl().
*/
if (MANDMODE((mode_t)zp->z_mode) &&
(error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
ZFS_EXIT(zfsvfs);
return (error);
}
#ifdef illumos
/*
* Pre-fault the pages to ensure slow (eg NFS) pages
* don't hold up txg.
* Skip this if uio contains loaned arc_buf.
*/
if ((uio->uio_extflg == UIO_XUIO) &&
(((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
xuio = (xuio_t *)uio;
else
uio_prefaultpages(MIN(n, max_blksz), uio);
#endif
/*
* If in append mode, set the io offset pointer to eof.
*/
if (ioflag & FAPPEND) {
/*
* Obtain an appending range lock to guarantee file append
* semantics. We reset the write offset once we have the lock.
*/
rl = zfs_range_lock(zp, 0, n, RL_APPEND);
woff = rl->r_off;
if (rl->r_len == UINT64_MAX) {
/*
* We overlocked the file because this write will cause
* the file block size to increase.
* Note that zp_size cannot change with this lock held.
*/
woff = zp->z_size;
}
uio->uio_loffset = woff;
} else {
/*
* Note that if the file block size will change as a result of
* this write, then this range lock will lock the entire file
* so that we can re-write the block safely.
*/
rl = zfs_range_lock(zp, woff, n, RL_WRITER);
}
/* Enforce RLIMIT_FSIZE and the driver's offset limit. */
if (vn_rlimit_fsize(vp, uio, uio->uio_td)) {
zfs_range_unlock(rl);
ZFS_EXIT(zfsvfs);
return (EFBIG);
}
if (woff >= limit) {
zfs_range_unlock(rl);
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EFBIG));
}
if ((woff + n) > limit || woff > (limit - n))
n = limit - woff;
/* Will this write extend the file length? */
write_eof = (woff + n > zp->z_size);
end_size = MAX(zp->z_size, woff + n);
/*
* Write the file in reasonable size chunks. Each chunk is written
* in a separate transaction; this keeps the intent log records small
* and allows us to do more fine-grained space accounting.
*/
while (n > 0) {
abuf = NULL;
woff = uio->uio_loffset;
if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
if (abuf != NULL)
dmu_return_arcbuf(abuf);
error = SET_ERROR(EDQUOT);
break;
}
if (xuio && abuf == NULL) {
/* Zero-copy: take the caller's loaned arc buffer. */
ASSERT(i_iov < iovcnt);
aiov = &iovp[i_iov];
abuf = dmu_xuio_arcbuf(xuio, i_iov);
dmu_xuio_clear(xuio, i_iov);
DTRACE_PROBE3(zfs_cp_write, int, i_iov,
iovec_t *, aiov, arc_buf_t *, abuf);
ASSERT((aiov->iov_base == abuf->b_data) ||
((char *)aiov->iov_base - (char *)abuf->b_data +
aiov->iov_len == arc_buf_size(abuf)));
i_iov++;
} else if (abuf == NULL && n >= max_blksz &&
woff >= zp->z_size &&
P2PHASE(woff, max_blksz) == 0 &&
zp->z_blksz == max_blksz) {
/*
* This write covers a full block. "Borrow" a buffer
* from the dmu so that we can fill it before we enter
* a transaction. This avoids the possibility of
* holding up the transaction if the data copy hangs
* up on a pagefault (e.g., from an NFS server mapping).
*/
size_t cbytes;
abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
max_blksz);
ASSERT(abuf != NULL);
ASSERT(arc_buf_size(abuf) == max_blksz);
if (error = uiocopy(abuf->b_data, max_blksz,
UIO_WRITE, uio, &cbytes)) {
dmu_return_arcbuf(abuf);
break;
}
ASSERT(cbytes == max_blksz);
}
/*
* Start a transaction.
*/
tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
zfs_sa_upgrade_txholds(tx, zp);
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
dmu_tx_abort(tx);
if (abuf != NULL)
dmu_return_arcbuf(abuf);
break;
}
/*
* If zfs_range_lock() over-locked we grow the blocksize
* and then reduce the lock range. This will only happen
* on the first iteration since zfs_range_reduce() will
* shrink down r_len to the appropriate size.
*/
if (rl->r_len == UINT64_MAX) {
uint64_t new_blksz;
if (zp->z_blksz > max_blksz) {
/*
* File's blocksize is already larger than the
* "recordsize" property. Only let it grow to
* the next power of 2.
*/
ASSERT(!ISP2(zp->z_blksz));
new_blksz = MIN(end_size,
1 << highbit64(zp->z_blksz));
} else {
new_blksz = MIN(end_size, max_blksz);
}
zfs_grow_blocksize(zp, new_blksz, tx);
zfs_range_reduce(rl, woff, n);
}
/*
* XXX - should we really limit each write to z_max_blksz?
* Perhaps we should use SPA_MAXBLOCKSIZE chunks?
*/
nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
if (woff + nbytes > zp->z_size)
vnode_pager_setsize(vp, woff + nbytes);
if (abuf == NULL) {
tx_bytes = uio->uio_resid;
error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
uio, nbytes, tx);
tx_bytes -= uio->uio_resid;
} else {
tx_bytes = nbytes;
ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
/*
* If this is not a full block write, but we are
* extending the file past EOF and this data starts
* block-aligned, use assign_arcbuf(). Otherwise,
* write via dmu_write().
*/
if (tx_bytes < max_blksz && (!write_eof ||
aiov->iov_base != abuf->b_data)) {
ASSERT(xuio);
dmu_write(zfsvfs->z_os, zp->z_id, woff,
aiov->iov_len, aiov->iov_base, tx);
dmu_return_arcbuf(abuf);
xuio_stat_wbuf_copied();
} else {
ASSERT(xuio || tx_bytes == max_blksz);
dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl),
woff, abuf, tx);
}
ASSERT(tx_bytes <= uio->uio_resid);
uioskip(uio, tx_bytes);
}
if (tx_bytes && vn_has_cached_data(vp)) {
update_pages(vp, woff, tx_bytes, zfsvfs->z_os,
zp->z_id, uio->uio_segflg, tx);
}
/*
* If we made no progress, we're done. If we made even
* partial progress, update the znode and ZIL accordingly.
*/
if (tx_bytes == 0) {
(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
(void *)&zp->z_size, sizeof (uint64_t), tx);
dmu_tx_commit(tx);
ASSERT(error != 0);
break;
}
/*
* Clear Set-UID/Set-GID bits on successful write if not
* privileged and at least one of the execute bits is set.
*
* It would be nice to do this after all writes have
* been done, but that would still expose the ISUID/ISGID
* to another app after the partial write is committed.
*
* Note: we don't call zfs_fuid_map_id() here because
* user 0 is not an ephemeral uid.
*/
mutex_enter(&zp->z_acl_lock);
if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
(S_IXUSR >> 6))) != 0 &&
(zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
secpolicy_vnode_setid_retain(vp, cr,
(zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
uint64_t newmode;
zp->z_mode &= ~(S_ISUID | S_ISGID);
newmode = zp->z_mode;
(void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
(void *)&newmode, sizeof (uint64_t), tx);
}
mutex_exit(&zp->z_acl_lock);
zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
B_TRUE);
/*
* Update the file size (zp_size) if it has changed;
* account for possible concurrent updates.
*/
while ((end_size = zp->z_size) < uio->uio_loffset) {
(void) atomic_cas_64(&zp->z_size, end_size,
uio->uio_loffset);
#ifdef illumos
ASSERT(error == 0);
#else
ASSERT(error == 0 || error == EFAULT);
#endif
}
/*
* If we are replaying and eof is non zero then force
* the file size to the specified eof. Note, there's no
* concurrency during replay.
*/
if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
zp->z_size = zfsvfs->z_replay_eof;
if (error == 0)
error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
else
(void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
dmu_tx_commit(tx);
if (error != 0)
break;
ASSERT(tx_bytes == nbytes);
n -= nbytes;
#ifdef illumos
if (!xuio && n > 0)
uio_prefaultpages(MIN(n, max_blksz), uio);
#endif
}
zfs_range_unlock(rl);
/*
* If we're in replay mode, or we made no progress, return error.
* Otherwise, it's at least a partial write, so it's successful.
*/
if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
ZFS_EXIT(zfsvfs);
return (error);
}
#ifdef __FreeBSD__
/*
* EFAULT means that at least one page of the source buffer was not
* available. VFS will re-try remaining I/O upon this error.
*/
if (error == EFAULT) {
ZFS_EXIT(zfsvfs);
return (error);
}
#endif
if (ioflag & (FSYNC | FDSYNC) ||
zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
zil_commit(zilog, zp->z_id);
ZFS_EXIT(zfsvfs);
return (0);
}
/* ARGSUSED */
/*
 * Completion callback for the dmu_sync()/zfs_get_data() path: release
 * the dbuf (if held), drop the range lock, release the znode's vnode,
 * and free the zgd descriptor.  "error" is the I/O status (unused here).
 */
void
zfs_get_done(zgd_t *zgd, int error)
{
znode_t *zp = zgd->zgd_private;
objset_t *os = zp->z_zfsvfs->z_os;
if (zgd->zgd_db)
dmu_buf_rele(zgd->zgd_db, zgd);
zfs_range_unlock(zgd->zgd_rl);
/*
* Release the vnode asynchronously as we currently have the
* txg stopped from syncing.
*/
VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
kmem_free(zgd, sizeof (zgd_t));
}
#ifdef DEBUG
/*
 * Debug fault-injection hook: when set non-zero, the next indirect
 * write in zfs_get_data() fails with EIO and the flag self-clears.
 */
static int zil_fault_io = 0;
#endif
/*
* Get data to generate a TX_WRITE intent log record.
*/
/*
 * Called by the ZIL: either copy the write's data into "buf"
 * (immediate write, buf != NULL) or dmu_sync() the block and record a
 * pointer to it (indirect write, buf == NULL).  On the successful
 * indirect path, zfs_get_done() runs as the dmu_sync() callback and
 * releases the holds taken here; on all other paths this function
 * calls zfs_get_done() itself before returning.
 */
int
zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
{
zfsvfs_t *zfsvfs = arg;
objset_t *os = zfsvfs->z_os;
znode_t *zp;
uint64_t object = lr->lr_foid;
uint64_t offset = lr->lr_offset;
uint64_t size = lr->lr_length;
dmu_buf_t *db;
zgd_t *zgd;
int error = 0;
ASSERT3P(lwb, !=, NULL);
ASSERT3P(zio, !=, NULL);
ASSERT3U(size, !=, 0);
/*
* Nothing to do if the file has been removed
*/
if (zfs_zget(zfsvfs, object, &zp) != 0)
return (SET_ERROR(ENOENT));
if (zp->z_unlinked) {
/*
* Release the vnode asynchronously as we currently have the
* txg stopped from syncing.
*/
VN_RELE_ASYNC(ZTOV(zp),
dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
return (SET_ERROR(ENOENT));
}
zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
zgd->zgd_lwb = lwb;
zgd->zgd_private = zp;
/*
* Write records come in two flavors: immediate and indirect.
* For small writes it's cheaper to store the data with the
* log record (immediate); for large writes it's cheaper to
* sync the data and get a pointer to it (indirect) so that
* we don't have to write the data twice.
*/
if (buf != NULL) { /* immediate write */
zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
/* test for truncation needs to be done while range locked */
if (offset >= zp->z_size) {
error = SET_ERROR(ENOENT);
} else {
error = dmu_read(os, object, offset, size, buf,
DMU_READ_NO_PREFETCH);
}
ASSERT(error == 0 || error == ENOENT);
} else { /* indirect write */
/*
* Have to lock the whole block to ensure when it's
* written out and its checksum is being calculated
* that no one can change the data. We need to re-check
* blocksize after we get the lock in case it's changed!
*/
for (;;) {
uint64_t blkoff;
size = zp->z_blksz;
blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
offset -= blkoff;
zgd->zgd_rl = zfs_range_lock(zp, offset, size,
RL_READER);
if (zp->z_blksz == size)
break;
/* Blocksize changed under us; retry with the new one. */
offset += blkoff;
zfs_range_unlock(zgd->zgd_rl);
}
/* test for truncation needs to be done while range locked */
if (lr->lr_offset >= zp->z_size)
error = SET_ERROR(ENOENT);
#ifdef DEBUG
if (zil_fault_io) {
error = SET_ERROR(EIO);
zil_fault_io = 0;
}
#endif
if (error == 0)
error = dmu_buf_hold(os, object, offset, zgd, &db,
DMU_READ_NO_PREFETCH);
if (error == 0) {
blkptr_t *bp = &lr->lr_blkptr;
zgd->zgd_db = db;
zgd->zgd_bp = bp;
ASSERT(db->db_offset == offset);
ASSERT(db->db_size == size);
error = dmu_sync(zio, lr->lr_common.lrc_txg,
zfs_get_done, zgd);
ASSERT(error || lr->lr_length <= size);
/*
* On success, we need to wait for the write I/O
* initiated by dmu_sync() to complete before we can
* release this dbuf. We will finish everything up
* in the zfs_get_done() callback.
*/
if (error == 0)
return (0);
if (error == EALREADY) {
lr->lr_common.lrc_txtype = TX_WRITE2;
/*
* TX_WRITE2 relies on the data previously
* written by the TX_WRITE that caused
* EALREADY. We zero out the BP because
* it is the old, currently-on-disk BP.
*/
zgd->zgd_bp = NULL;
BP_ZERO(bp);
error = 0;
}
}
}
zfs_get_done(zgd, error);
return (error);
}
/*ARGSUSED*/
/*
 * VOP_ACCESS: dispatch to the ACE-mask or rwx-mode access check
 * depending on whether V_ACE_MASK is set in "flag".
 */
static int
zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
caller_context_t *ct)
{
	znode_t *node = VTOZ(vp);
	zfsvfs_t *zvfs = node->z_zfsvfs;
	int rc;

	ZFS_ENTER(zvfs);
	ZFS_VERIFY_ZP(node);
	rc = (flag & V_ACE_MASK) ?
	    zfs_zaccess(node, mode, flag, B_FALSE, cr) :
	    zfs_zaccess_rwx(node, mode, flag, cr);
	ZFS_EXIT(zvfs);
	return (rc);
}
/*
 * vn_vget_ino_gen() callback used by zfs_lookup_lock() for "..":
 * lock the vnode passed through "arg", dropping its reference when
 * the lock cannot be obtained.
 */
static int
zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp)
{
	int rc;

	*vpp = arg;
	rc = vn_lock(*vpp, lkflags);
	if (rc != 0)
		vrele(*vpp);
	return (rc);
}
/*
 * Acquire the requested lock on "vp", the result of looking up "name"
 * in the locked directory "dvp".  Three cases:
 *   "" or "."  - vp is dvp itself: take a new reference and convert the
 *                existing lock type in place if it differs from lkflags.
 *   ".."       - vp is dvp's parent: reversed lock order, handled by
 *                vn_vget_ino_gen() via zfs_dd_callback().
 *   otherwise  - plain vn_lock(); the reference is dropped on failure.
 */
static int
zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags)
{
znode_t *zdp = VTOZ(dvp);
zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
int error;
int ltype;
ASSERT_VOP_LOCKED(dvp, __func__);
#ifdef DIAGNOSTIC
if ((zdp->z_pflags & ZFS_XATTR) == 0)
VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock));
#endif
if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
ASSERT3P(dvp, ==, vp);
vref(dvp);
ltype = lkflags & LK_TYPE_MASK;
if (ltype != VOP_ISLOCKED(dvp)) {
if (ltype == LK_EXCLUSIVE)
vn_lock(dvp, LK_UPGRADE | LK_RETRY);
else /* if (ltype == LK_SHARED) */
vn_lock(dvp, LK_DOWNGRADE | LK_RETRY);
/*
* Relock for the "." case could leave us with
* reclaimed vnode.
*/
if (dvp->v_iflag & VI_DOOMED) {
vrele(dvp);
return (SET_ERROR(ENOENT));
}
}
return (0);
} else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
/*
* Note that in this case, dvp is the child vnode, and we
* are looking up the parent vnode - exactly reverse from
* normal operation. Unlocking dvp requires some rather
* tricky unlock/relock dance to prevent mp from being freed;
* use vn_vget_ino_gen() which takes care of all that.
*
* XXX Note that there is a time window when both vnodes are
* unlocked. It is possible, although highly unlikely, that
* during that window the parent-child relationship between
* the vnodes may change, for example, get reversed.
* In that case we would have a wrong lock order for the vnodes.
* All other filesystems seem to ignore this problem, so we
* do the same here.
* A potential solution could be implemented as follows:
* - using LK_NOWAIT when locking the second vnode and retrying
* if necessary
* - checking that the parent-child relationship still holds
* after locking both vnodes and retrying if it doesn't
*/
error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp);
return (error);
} else {
error = vn_lock(vp, lkflags);
if (error != 0)
vrele(vp);
return (error);
}
}
/*
* Lookup an entry in a directory, or an extended attribute directory.
* If it exists, return a held vnode reference for it.
*
* IN: dvp - vnode of directory to search.
* nm - name of entry to lookup.
* pnp - full pathname to lookup [UNUSED].
* flags - LOOKUP_XATTR set if looking for an attribute.
* rdir - root directory vnode [UNUSED].
* cr - credentials of caller.
* ct - caller context
*
* OUT: vpp - vnode of located entry, NULL if not found.
*
* RETURN: 0 on success, error code on failure.
*
* Timestamps:
* NA
*/
/* ARGSUSED */
static int
zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
    int nameiop, cred_t *cr, kthread_t *td, int flags)
{
	znode_t *zdp = VTOZ(dvp);
	znode_t *zp;
	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
	int error = 0;

	/*
	 * Fast path lookup, however we must skip DNLC lookup
	 * for case folding or normalizing lookups because the
	 * DNLC code only stores the passed in name.  This means
	 * creating 'a' and removing 'A' on a case insensitive
	 * file system would work, but DNLC still thinks 'a'
	 * exists and won't let you create it again on the next
	 * pass through fast path.
	 */
	if (!(flags & LOOKUP_XATTR)) {
		if (dvp->v_type != VDIR) {
			return (SET_ERROR(ENOTDIR));
		} else if (zdp->z_sa_hdl == NULL) {
			/* znode already torn down - treat as I/O error. */
			return (SET_ERROR(EIO));
		}
	}

	DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zdp);

	*vpp = NULL;

	if (flags & LOOKUP_XATTR) {
#ifdef TODO
		/*
		 * If the xattr property is off, refuse the lookup request.
		 */
		if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EINVAL));
		}
#endif

		/*
		 * We don't allow recursive attributes.
		 * Maybe someday we will.
		 */
		if (zdp->z_pflags & ZFS_XATTR) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EINVAL));
		}

		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		/*
		 * Do we have permission to get into attribute directory?
		 */
		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
		    B_FALSE, cr)) {
			vrele(*vpp);
			*vpp = NULL;
		}
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Check accessibility of directory.
	 */
	if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/* Reject names that are not valid UTF-8 when the dataset requires it. */
	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}

	/*
	 * First handle the special cases.
	 */
	if ((cnp->cn_flags & ISDOTDOT) != 0) {
		/*
		 * If we are a snapshot mounted under .zfs, return
		 * the vp for the snapshot directory.
		 */
		if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) {
			struct componentname cn;
			vnode_t *zfsctl_vp;
			int ltype;

			/* Drop dvp's lock while resolving through the ctldir. */
			ZFS_EXIT(zfsvfs);
			ltype = VOP_ISLOCKED(dvp);
			VOP_UNLOCK(dvp, 0);
			error = zfsctl_root(zfsvfs->z_parent, LK_SHARED,
			    &zfsctl_vp);
			if (error == 0) {
				/* Re-run the lookup as "snapshot" under .zfs. */
				cn.cn_nameptr = "snapshot";
				cn.cn_namelen = strlen(cn.cn_nameptr);
				cn.cn_nameiop = cnp->cn_nameiop;
				cn.cn_flags = cnp->cn_flags & ~ISDOTDOT;
				cn.cn_lkflags = cnp->cn_lkflags;
				error = VOP_LOOKUP(zfsctl_vp, vpp, &cn);
				vput(zfsctl_vp);
			}
			vn_lock(dvp, ltype | LK_RETRY);
			return (error);
		}
	}
	if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) {
		ZFS_EXIT(zfsvfs);
		/* Only plain LOOKUP operations are allowed on the .zfs entry. */
		if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP)
			return (SET_ERROR(ENOTSUP));
		error = zfsctl_root(zfsvfs, cnp->cn_lkflags, vpp);
		return (error);
	}

	/*
	 * The loop retries the lookup if the parent-child relationship
	 * changes during the dot-dot locking complexities.
	 */
	for (;;) {
		uint64_t parent;

		error = zfs_dirlook(zdp, nm, &zp);
		if (error == 0)
			*vpp = ZTOV(zp);

		ZFS_EXIT(zfsvfs);
		if (error != 0)
			break;

		error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags);
		if (error != 0) {
			/*
			 * If we've got a locking error, then the vnode
			 * got reclaimed because of a force unmount.
			 * We never enter doomed vnodes into the name cache.
			 */
			*vpp = NULL;
			return (error);
		}

		if ((cnp->cn_flags & ISDOTDOT) == 0)
			break;

		/*
		 * ".." lookup: verify that zp is still zdp's parent; the
		 * relationship may have changed while both vnodes were
		 * unlocked inside zfs_lookup_lock().  Retry if it did.
		 */
		ZFS_ENTER(zfsvfs);
		if (zdp->z_sa_hdl == NULL) {
			error = SET_ERROR(EIO);
		} else {
			error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
			    &parent, sizeof (parent));
		}
		if (error != 0) {
			ZFS_EXIT(zfsvfs);
			vput(ZTOV(zp));
			break;
		}
		if (zp->z_id == parent) {
			ZFS_EXIT(zfsvfs);
			break;
		}
		vput(ZTOV(zp));
	}

out:
	/* NOTE(review): nothing in this function jumps to "out:"; the label is dead. */
	if (error != 0)
		*vpp = NULL;

	/* Translate errors and add SAVENAME when needed. */
	if (cnp->cn_flags & ISLASTCN) {
		switch (nameiop) {
		case CREATE:
		case RENAME:
			if (error == ENOENT) {
				/* Tell namei the entry may be created. */
				error = EJUSTRETURN;
				cnp->cn_flags |= SAVENAME;
				break;
			}
			/* FALLTHROUGH */
		case DELETE:
			if (error == 0)
				cnp->cn_flags |= SAVENAME;
			break;
		}
	}

	/* Insert name into cache (as non-existent) if appropriate. */
	if (zfsvfs->z_use_namecache &&
	    error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0)
		cache_enter(dvp, NULL, cnp);

	/* Insert name into cache if appropriate. */
	if (zfsvfs->z_use_namecache &&
	    error == 0 && (cnp->cn_flags & MAKEENTRY)) {
		if (!(cnp->cn_flags & ISLASTCN) ||
		    (nameiop != DELETE && nameiop != RENAME)) {
			cache_enter(dvp, *vpp, cnp);
		}
	}

	return (error);
}
/*
* Attempt to create a new entry in a directory. If the entry
* already exists, truncate the file if permissible, else return
* an error. Return the vp of the created or trunc'd file.
*
* IN: dvp - vnode of directory to put new file entry in.
* name - name of new file entry.
* vap - attributes of new file.
* excl - flag indicating exclusive or non-exclusive mode.
* mode - mode to open file with.
* cr - credentials of caller.
* flag - large file flag [UNUSED].
* ct - caller context
* vsecp - ACL to be set
*
* OUT: vpp - vnode of created or trunc'd entry.
*
* RETURN: 0 on success, error code on failure.
*
* Timestamps:
* dvp - ctime|mtime updated if new entry created
* vp - ctime|mtime always, atime if new
*/
/* ARGSUSED */
static int
zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
    vnode_t **vpp, cred_t *cr, kthread_t *td)
{
	znode_t *zp, *dzp = VTOZ(dvp);
	zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
	zilog_t *zilog;
	objset_t *os;
	dmu_tx_t *tx;
	int error;
	ksid_t *ksid;
	uid_t uid;
	gid_t gid = crgetgid(cr);
	zfs_acl_ids_t acl_ids;
	boolean_t fuid_dirtied;
	void *vsecp = NULL;		/* no ACL is passed on this path */
	int flag = 0;
	uint64_t txtype;

	/*
	 * If we have an ephemeral id, ACL, or XVATTR then
	 * make sure file system is at proper version
	 */
	ksid = crgetsid(cr, KSID_OWNER);
	if (ksid)
		uid = ksid_getid(ksid);
	else
		uid = crgetuid(cr);

	if (zfsvfs->z_use_fuids == B_FALSE &&
	    (vsecp || (vap->va_mask & AT_XVATTR) ||
	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
		return (SET_ERROR(EINVAL));

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	os = zfsvfs->z_os;
	zilog = zfsvfs->z_log;

	/* Reject names that are not valid UTF-8 when the dataset requires it. */
	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}

	if (vap->va_mask & AT_XVATTR) {
		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
		    crgetuid(cr), cr, vap->va_type)) != 0) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	*vpp = NULL;

	/* Strip the sticky bit unless the caller is privileged to set it. */
	if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
		vap->va_mode &= ~S_ISVTX;

	/* ZNEW: fail with EEXIST if the name is already present. */
	error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
	if (error) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}
	ASSERT3P(zp, ==, NULL);

	/*
	 * Create a new file object and update the directory
	 * to reference it.
	 */
	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
		goto out;
	}

	/*
	 * We only support the creation of regular files in
	 * extended attribute directories.
	 */
	if ((dzp->z_pflags & ZFS_XATTR) &&
	    (vap->va_type != VREG)) {
		error = SET_ERROR(EINVAL);
		goto out;
	}

	if ((error = zfs_acl_ids_create(dzp, 0, vap,
	    cr, vsecp, &acl_ids)) != 0)
		goto out;

	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
		zfs_acl_ids_free(&acl_ids);
		error = SET_ERROR(EDQUOT);
		goto out;
	}

	/* Reserve a vnode before entering the transaction. */
	getnewvnode_reserve(1);

	/*
	 * Declare everything the transaction may touch: the new SA,
	 * the directory ZAP entry, and possibly a spill write for a
	 * large ACL.
	 */
	tx = dmu_tx_create(os);

	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
	    ZFS_SA_BASE_ATTR_SIZE);

	fuid_dirtied = zfsvfs->z_fuid_dirty;
	if (fuid_dirtied)
		zfs_fuid_txhold(zfsvfs, tx);
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
	if (!zfsvfs->z_use_sa &&
	    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
		dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
		    0, acl_ids.z_aclp->z_acl_bytes);
	}
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		/* Assignment failed: undo the reservations and bail. */
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_abort(tx);
		getnewvnode_drop_reserve();
		ZFS_EXIT(zfsvfs);
		return (error);
	}
	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);

	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);

	(void) zfs_link_create(dzp, name, zp, tx, ZNEW);
	txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
	/* Record the create in the intent log. */
	zfs_log_create(zilog, tx, txtype, dzp, zp, name,
	    vsecp, acl_ids.z_fuidp, vap);
	zfs_acl_ids_free(&acl_ids);
	dmu_tx_commit(tx);

	getnewvnode_drop_reserve();

out:
	if (error == 0) {
		*vpp = ZTOV(zp);
	}

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}
/*
* Remove an entry from a directory.
*
* IN: dvp - vnode of directory to remove entry from.
* name - name of entry to remove.
* cr - credentials of caller.
* ct - caller context
* flags - case flags
*
* RETURN: 0 on success, error code on failure.
*
* Timestamps:
* dvp - ctime|mtime
* vp - ctime (if nlink > 0)
*/
/*ARGSUSED*/
static int
zfs_remove(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
{
	znode_t *dzp = VTOZ(dvp);
	znode_t *zp = VTOZ(vp);
	znode_t *xzp;
	zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
	zilog_t *zilog;
	uint64_t xattr_obj;
	uint64_t obj = 0;
	dmu_tx_t *tx;
	boolean_t unlinked;
	uint64_t txtype;
	int error;

	/*
	 * NOTE(review): removed the redundant "zp = VTOZ(vp)" reassignment
	 * (zp is already initialized to that value above) and the unused
	 * locals "acl_obj" and "toobig".
	 */
	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	ZFS_VERIFY_ZP(zp);
	zilog = zfsvfs->z_log;

	xattr_obj = 0;
	xzp = NULL;

	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
		goto out;
	}

	/*
	 * Need to use rmdir for removing directories.
	 */
	if (vp->v_type == VDIR) {
		error = SET_ERROR(EPERM);
		goto out;
	}

	/*
	 * NOTE(review): "ct" is not a parameter of this function;
	 * vnevent_remove() is presumably a no-op macro on FreeBSD - confirm.
	 */
	vnevent_remove(vp, dvp, name, ct);

	obj = zp->z_id;

	/* are there any extended attributes? */
	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
	    &xattr_obj, sizeof (xattr_obj));
	if (error == 0 && xattr_obj) {
		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
		ASSERT0(error);
	}

	/*
	 * We may delete the znode now, or we may put it in the unlinked set;
	 * it depends on whether we're the last link, and on whether there are
	 * other holds on the vnode.  So we dmu_tx_hold() the right things to
	 * allow for either case.
	 */
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);
	zfs_sa_upgrade_txholds(tx, dzp);
	if (xzp) {
		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
	}

	/* charge as an update -- would be nice not to charge at all */
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);

	/*
	 * Mark this transaction as typically resulting in a net free of space
	 */
	dmu_tx_mark_netfree(tx);

	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Remove the directory entry.
	 */
	error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked);

	if (error) {
		dmu_tx_commit(tx);
		goto out;
	}

	if (unlinked) {
		/*
		 * Last link dropped: defer destruction via the unlinked set
		 * and exempt the vnode from syncing.
		 */
		zfs_unlinked_add(zp, tx);
		vp->v_vflag |= VV_NOSYNC;
	}

	txtype = TX_REMOVE;
	zfs_log_remove(zilog, tx, txtype, dzp, name, obj);

	dmu_tx_commit(tx);
out:
	if (xzp)
		vrele(ZTOV(xzp));

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}
/*
* Create a new directory and insert it into dvp using the name
* provided. Return a pointer to the inserted directory.
*
* IN: dvp - vnode of directory to add subdir to.
* dirname - name of new directory.
* vap - attributes of new directory.
* cr - credentials of caller.
* ct - caller context
* flags - case flags
* vsecp - ACL to be set
*
* OUT: vpp - vnode of created directory.
*
* RETURN: 0 on success, error code on failure.
*
* Timestamps:
* dvp - ctime|mtime updated
* vp - ctime|mtime|atime updated
*/
/*ARGSUSED*/
static int
zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr)
{
	znode_t *zp, *dzp = VTOZ(dvp);
	zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
	zilog_t *zilog;
	uint64_t txtype;
	dmu_tx_t *tx;
	int error;
	ksid_t *ksid;
	uid_t uid;
	gid_t gid = crgetgid(cr);
	zfs_acl_ids_t acl_ids;
	boolean_t fuid_dirtied;

	ASSERT(vap->va_type == VDIR);

	/*
	 * If we have an ephemeral id, ACL, or XVATTR then
	 * make sure file system is at proper version
	 */
	ksid = crgetsid(cr, KSID_OWNER);
	if (ksid)
		uid = ksid_getid(ksid);
	else
		uid = crgetuid(cr);
	if (zfsvfs->z_use_fuids == B_FALSE &&
	    ((vap->va_mask & AT_XVATTR) ||
	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
		return (SET_ERROR(EINVAL));

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	/* Directories may not be created inside an xattr directory. */
	if (dzp->z_pflags & ZFS_XATTR) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/* Reject names that are not valid UTF-8 when the dataset requires it. */
	if (zfsvfs->z_utf8 && u8_validate(dirname,
	    strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}
	if (vap->va_mask & AT_XVATTR) {
		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
		    crgetuid(cr), cr, vap->va_type)) != 0) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
	    NULL, &acl_ids)) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * First make sure the new directory doesn't exist.
	 *
	 * Existence is checked first to make sure we don't return
	 * EACCES instead of EEXIST which can cause some applications
	 * to fail.
	 */
	*vpp = NULL;

	if (error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW)) {
		zfs_acl_ids_free(&acl_ids);
		ZFS_EXIT(zfsvfs);
		return (error);
	}
	ASSERT3P(zp, ==, NULL);

	if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
		zfs_acl_ids_free(&acl_ids);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
		zfs_acl_ids_free(&acl_ids);
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EDQUOT));
	}

	/*
	 * Add a new entry to the directory.
	 */
	getnewvnode_reserve(1);
	tx = dmu_tx_create(zfsvfs->z_os);
	/* Hold the parent ZAP, the new directory's ZAP, and FUID/ACL data. */
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
	fuid_dirtied = zfsvfs->z_fuid_dirty;
	if (fuid_dirtied)
		zfs_fuid_txhold(zfsvfs, tx);
	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
		    acl_ids.z_aclp->z_acl_bytes);
	}

	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
	    ZFS_SA_BASE_ATTR_SIZE);

	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		/* Assignment failed: undo the reservations and bail. */
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_abort(tx);
		getnewvnode_drop_reserve();
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Create new node.
	 */
	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);

	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);

	/*
	 * Now put new name in parent dir.
	 */
	(void) zfs_link_create(dzp, dirname, zp, tx, ZNEW);

	*vpp = ZTOV(zp);

	/* Record the mkdir in the intent log. */
	txtype = zfs_log_create_txtype(Z_DIR, NULL, vap);
	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL,
	    acl_ids.z_fuidp, vap);

	zfs_acl_ids_free(&acl_ids);

	dmu_tx_commit(tx);

	getnewvnode_drop_reserve();

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (0);
}
/*
* Remove a directory subdir entry. If the current working
* directory is the same as the subdir to be removed, the
* remove will fail.
*
* IN: dvp - vnode of directory to remove from.
* name - name of directory to be removed.
* cwd - vnode of current working directory.
* cr - credentials of caller.
* ct - caller context
* flags - case flags
*
* RETURN: 0 on success, error code on failure.
*
* Timestamps:
* dvp - ctime|mtime updated
*/
/*ARGSUSED*/
static int
zfs_rmdir(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
{
	znode_t *dzp = VTOZ(dvp);
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
	zilog_t *zilog;
	dmu_tx_t *tx;
	int error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	ZFS_VERIFY_ZP(zp);
	zilog = zfsvfs->z_log;

	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
		goto out;
	}

	/* Only directories may be removed via rmdir. */
	if (vp->v_type != VDIR) {
		error = SET_ERROR(ENOTDIR);
		goto out;
	}

	/*
	 * NOTE(review): "ct" is not a parameter of this function;
	 * vnevent_rmdir() is presumably a no-op macro on FreeBSD - confirm.
	 */
	vnevent_rmdir(vp, dvp, name, ct);

	/* Declare the tx holds: parent ZAP, child SA, unlinked set. */
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
	zfs_sa_upgrade_txholds(tx, zp);
	zfs_sa_upgrade_txholds(tx, dzp);
	dmu_tx_mark_netfree(tx);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/* Invalidate namecache entries under both vnodes. */
	cache_purge(dvp);

	error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL);

	if (error == 0) {
		uint64_t txtype = TX_RMDIR;
		zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
	}

	dmu_tx_commit(tx);

	cache_purge(vp);
out:
	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}
/*
* Read as many directory entries as will fit into the provided
* buffer from the given directory cursor position (specified in
* the uio structure).
*
* IN: vp - vnode of directory to read.
* uio - structure supplying read location, range info,
* and return buffer.
* cr - credentials of caller.
* ct - caller context
* flags - case flags
*
* OUT: uio - updated offset and range, buffer filled.
* eofp - set to true if end-of-file detected.
*
* RETURN: 0 on success, error code on failure.
*
* Timestamps:
* vp - atime updated
*
* Note that the low 4 bits of the cookie returned by zap is always zero.
* This allows us to use the low range for "special" directory entries:
* We use 0 for '.', and 1 for '..'. If this is the root of the filesystem,
* we use the offset 2 for the '.zfs' directory.
*/
/* ARGSUSED */
static int
zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies)
{
	znode_t *zp = VTOZ(vp);
	iovec_t *iovp;
	edirent_t *eodp;
	dirent64_t *odp;
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	objset_t *os;
	caddr_t outbuf;
	size_t bufsize;
	zap_cursor_t zc;
	zap_attribute_t zap;
	uint_t bytes_wanted;
	uint64_t offset; /* must be unsigned; checks for < 1 */
	uint64_t parent;
	int local_eof;
	int outcount;
	int error;
	uint8_t prefetch;
	boolean_t check_sysattrs;
	uint8_t type;
	int ncooks;
	u_long *cooks = NULL;
	int flags = 0;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/* The parent object id is needed for the ".." entry. */
	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
	    &parent, sizeof (parent))) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * If we are not given an eof variable,
	 * use a local one.
	 */
	if (eofp == NULL)
		eofp = &local_eof;

	/*
	 * Check for valid iov_len.
	 */
	if (uio->uio_iov->iov_len <= 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Quit if directory has been removed (posix)
	 */
	if ((*eofp = zp->z_unlinked) != 0) {
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	error = 0;
	os = zfsvfs->z_os;
	offset = uio->uio_loffset;
	prefetch = zp->z_zn_prefetch;

	/*
	 * Initialize the iterator cursor.  Offsets 0-2 are the synthetic
	 * ".", "..", and ".zfs" entries (see the comment above); anything
	 * larger is a serialized ZAP cursor position.
	 */
	if (offset <= 3) {
		/*
		 * Start iteration from the beginning of the directory.
		 */
		zap_cursor_init(&zc, os, zp->z_id);
	} else {
		/*
		 * The offset is a serialized cursor.
		 */
		zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
	}

	/*
	 * Get space to change directory entries into fs independent format.
	 * A bounce buffer is only needed when we cannot write straight into
	 * the caller's single kernel-space iovec.
	 */
	iovp = uio->uio_iov;
	bytes_wanted = iovp->iov_len;
	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
		bufsize = bytes_wanted;
		outbuf = kmem_alloc(bufsize, KM_SLEEP);
		odp = (struct dirent64 *)outbuf;
	} else {
		bufsize = bytes_wanted;
		outbuf = NULL;
		odp = (struct dirent64 *)iovp->iov_base;
	}
	eodp = (struct edirent *)odp;

	if (ncookies != NULL) {
		/*
		 * Minimum entry size is dirent size and 1 byte for a file name.
		 */
		ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1);
		cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK);
		*cookies = cooks;
		*ncookies = ncooks;
	}

	/*
	 * If this VFS supports the system attribute view interface; and
	 * we're looking at an extended attribute directory; and we care
	 * about normalization conflicts on this vfs; then we must check
	 * for normalization conflicts with the sysattr name space.
	 */
#ifdef TODO
	check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
	    (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
	    (flags & V_RDDIR_ENTFLAGS);
#else
	check_sysattrs = 0;
#endif

	/*
	 * Transform to file-system independent format
	 */
	outcount = 0;
	while (outcount < bytes_wanted) {
		ino64_t objnum;
		ushort_t reclen;
		off64_t *next = NULL;

		/*
		 * Special case `.', `..', and `.zfs'.
		 */
		if (offset == 0) {
			(void) strcpy(zap.za_name, ".");
			zap.za_normalization_conflict = 0;
			objnum = zp->z_id;
			type = DT_DIR;
		} else if (offset == 1) {
			(void) strcpy(zap.za_name, "..");
			zap.za_normalization_conflict = 0;
			objnum = parent;
			type = DT_DIR;
		} else if (offset == 2 && zfs_show_ctldir(zp)) {
			(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
			zap.za_normalization_conflict = 0;
			objnum = ZFSCTL_INO_ROOT;
			type = DT_DIR;
		} else {
			/*
			 * Grab next entry.
			 */
			if (error = zap_cursor_retrieve(&zc, &zap)) {
				/* ENOENT from the cursor means end of directory. */
				if ((*eofp = (error == ENOENT)) != 0)
					break;
				else
					goto update;
			}

			if (zap.za_integer_length != 8 ||
			    zap.za_num_integers != 1) {
				cmn_err(CE_WARN, "zap_readdir: bad directory "
				    "entry, obj = %lld, offset = %lld\n",
				    (u_longlong_t)zp->z_id,
				    (u_longlong_t)offset);
				error = SET_ERROR(ENXIO);
				goto update;
			}

			objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
			/*
			 * MacOS X can extract the object type here such as:
			 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
			 */
			type = ZFS_DIRENT_TYPE(zap.za_first_integer);

			if (check_sysattrs && !zap.za_normalization_conflict) {
#ifdef TODO
				zap.za_normalization_conflict =
				    xattr_sysattr_casechk(zap.za_name);
#else
				panic("%s:%u: TODO", __func__, __LINE__);
#endif
			}
		}

		if (flags & V_RDDIR_ACCFILTER) {
			/*
			 * If we have no access at all, don't include
			 * this entry in the returned information
			 */
			znode_t	*ezp;
			if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
				goto skip_entry;
			if (!zfs_has_access(ezp, cr)) {
				vrele(ZTOV(ezp));
				goto skip_entry;
			}
			vrele(ZTOV(ezp));
		}

		if (flags & V_RDDIR_ENTFLAGS)
			reclen = EDIRENT_RECLEN(strlen(zap.za_name));
		else
			reclen = DIRENT64_RECLEN(strlen(zap.za_name));

		/*
		 * Will this entry fit in the buffer?
		 */
		if (outcount + reclen > bufsize) {
			/*
			 * Did we manage to fit anything in the buffer?
			 */
			if (!outcount) {
				error = SET_ERROR(EINVAL);
				goto update;
			}
			break;
		}
		if (flags & V_RDDIR_ENTFLAGS) {
			/*
			 * Add extended flag entry:
			 */
			eodp->ed_ino = objnum;
			eodp->ed_reclen = reclen;
			/* NOTE: ed_off is the offset for the *next* entry. */
			next = &eodp->ed_off;
			eodp->ed_eflags = zap.za_normalization_conflict ?
			    ED_CASE_CONFLICT : 0;
			(void) strncpy(eodp->ed_name, zap.za_name,
			    EDIRENT_NAMELEN(reclen));
			eodp = (edirent_t *)((intptr_t)eodp + reclen);
		} else {
			/*
			 * Add normal entry:
			 */
			odp->d_ino = objnum;
			odp->d_reclen = reclen;
			odp->d_namlen = strlen(zap.za_name);
			/* NOTE: d_off is the offset for the *next* entry. */
			next = &odp->d_off;
			(void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
			odp->d_type = type;
			dirent_terminate(odp);
			odp = (dirent64_t *)((intptr_t)odp + reclen);
		}
		outcount += reclen;

		ASSERT(outcount <= bufsize);

		/* Prefetch znode */
		if (prefetch)
			dmu_prefetch(os, objnum, 0, 0, 0,
			    ZIO_PRIORITY_SYNC_READ);

	skip_entry:
		/*
		 * Move to the next entry, fill in the previous offset.
		 */
		if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
			zap_cursor_advance(&zc);
			offset = zap_cursor_serialize(&zc);
		} else {
			offset += 1;
		}

		/* Fill the offset right after advancing the cursor. */
		if (next != NULL)
			*next = offset;
		if (cooks != NULL) {
			*cooks++ = offset;
			ncooks--;
			KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
		}
	}
	zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */

	/* Subtract unused cookies */
	if (ncookies != NULL)
		*ncookies -= ncooks;

	if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
		/* Entries were written in place; just advance the uio. */
		iovp->iov_base += outcount;
		iovp->iov_len -= outcount;
		uio->uio_resid -= outcount;
	} else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
		/*
		 * Reset the pointer.
		 */
		offset = uio->uio_loffset;
	}

update:
	zap_cursor_fini(&zc);
	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
		kmem_free(outbuf, bufsize);

	if (error == ENOENT)
		error = 0;

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);

	uio->uio_loffset = offset;
	ZFS_EXIT(zfsvfs);
	if (error != 0 && cookies != NULL) {
		/* On failure, release the cookie array we handed out above. */
		free(*cookies, M_TEMP);
		*cookies = NULL;
		*ncookies = 0;
	}
	return (error);
}
/*
 * Value stored into per-thread state (zfs_fsyncer_key) on every fsync;
 * consumed elsewhere (not visible in this file section) - presumably to
 * tune ZIL commit behavior for fsync-heavy threads; confirm at the TSD
 * consumer.
 */
ulong_t zfs_fsync_sync_cnt = 4;
/*
 * VOP_FSYNC: record this thread as an fsync-er via TSD, then force the
 * intent log to commit all pending transactions for this file unless
 * synchronous semantics are disabled on the dataset.  Always returns 0.
 */
static int
zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
{
	znode_t *node = VTOZ(vp);
	zfsvfs_t *fs = node->z_zfsvfs;

	(void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);

	/* Nothing to commit when sync is administratively disabled. */
	if (fs->z_os->os_sync == ZFS_SYNC_DISABLED)
		return (0);

	ZFS_ENTER(fs);
	ZFS_VERIFY_ZP(node);
	zil_commit(fs->z_log, node->z_id);
	ZFS_EXIT(fs);
	return (0);
}
/*
* Get the requested file attributes and place them in the provided
* vattr structure.
*
* IN: vp - vnode of file.
* vap - va_mask identifies requested attributes.
* If AT_XVATTR set, then optional attrs are requested
* flags - ATTR_NOACLCHECK (CIFS server context)
* cr - credentials of caller.
* ct - caller context
*
* OUT: vap - attribute values.
*
* RETURN: 0 (always succeeds).
*/
/* ARGSUSED */
static int
zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int	error = 0;
	uint32_t blksize;
	u_longlong_t nblocks;
	uint64_t mtime[2], ctime[2], crtime[2], rdev;
	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
	xoptattr_t *xoap = NULL;
	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
	sa_bulk_attr_t bulk[4];
	int count = 0;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);

	/* Fetch the timestamps (and rdev for devices) in one SA bulk lookup. */
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16);
	if (vp->v_type == VBLK || vp->v_type == VCHR)
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL,
		    &rdev, 8);

	if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
	 * Also, if we are the owner don't bother, since owner should
	 * always be allowed to read basic attributes of file.
	 */
	if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
	    (vap->va_uid != crgetuid(cr))) {
		if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
		    skipaclchk, cr)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	/*
	 * Return all attributes.  It's cheaper to provide the answer
	 * than to determine whether we were asked the question.
	 */

	vap->va_type = IFTOVT(zp->z_mode);
	vap->va_mode = zp->z_mode & ~S_IFMT;
#ifdef illumos
	vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
#else
	vn_fsid(vp, vap);
#endif
	vap->va_nodeid = zp->z_id;
	vap->va_nlink = zp->z_links;
	/* The synthetic .zfs entry adds one to the root's link count. */
	if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp) &&
	    zp->z_links < ZFS_LINK_MAX)
		vap->va_nlink++;
	vap->va_size = zp->z_size;
#ifdef illumos
	vap->va_rdev = vp->v_rdev;
#else
	if (vp->v_type == VBLK || vp->v_type == VCHR)
		vap->va_rdev = zfs_cmpldev(rdev);
#endif
	vap->va_seq = zp->z_seq;
	vap->va_flags = 0;	/* FreeBSD: Reset chflags(2) flags. */
	vap->va_filerev = zp->z_seq;

	/*
	 * Add in any requested optional attributes and the create time.
	 * Also set the corresponding bits in the returned attribute bitmap.
	 */
	if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
		if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
			xoap->xoa_archive =
			    ((zp->z_pflags & ZFS_ARCHIVE) != 0);
			XVA_SET_RTN(xvap, XAT_ARCHIVE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
			xoap->xoa_readonly =
			    ((zp->z_pflags & ZFS_READONLY) != 0);
			XVA_SET_RTN(xvap, XAT_READONLY);
		}

		if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
			xoap->xoa_system =
			    ((zp->z_pflags & ZFS_SYSTEM) != 0);
			XVA_SET_RTN(xvap, XAT_SYSTEM);
		}

		if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
			xoap->xoa_hidden =
			    ((zp->z_pflags & ZFS_HIDDEN) != 0);
			XVA_SET_RTN(xvap, XAT_HIDDEN);
		}

		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
			xoap->xoa_nounlink =
			    ((zp->z_pflags & ZFS_NOUNLINK) != 0);
			XVA_SET_RTN(xvap, XAT_NOUNLINK);
		}

		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
			xoap->xoa_immutable =
			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
			XVA_SET_RTN(xvap, XAT_IMMUTABLE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
			xoap->xoa_appendonly =
			    ((zp->z_pflags & ZFS_APPENDONLY) != 0);
			XVA_SET_RTN(xvap, XAT_APPENDONLY);
		}

		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
			xoap->xoa_nodump =
			    ((zp->z_pflags & ZFS_NODUMP) != 0);
			XVA_SET_RTN(xvap, XAT_NODUMP);
		}

		if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
			xoap->xoa_opaque =
			    ((zp->z_pflags & ZFS_OPAQUE) != 0);
			XVA_SET_RTN(xvap, XAT_OPAQUE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
			xoap->xoa_av_quarantined =
			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
			XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
			xoap->xoa_av_modified =
			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
			XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
		    vp->v_type == VREG) {
			zfs_sa_get_scanstamp(zp, xvap);
		}

		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
			xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
			XVA_SET_RTN(xvap, XAT_REPARSE);
		}
		if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
			xoap->xoa_generation = zp->z_gen;
			XVA_SET_RTN(xvap, XAT_GEN);
		}

		if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
			xoap->xoa_offline =
			    ((zp->z_pflags & ZFS_OFFLINE) != 0);
			XVA_SET_RTN(xvap, XAT_OFFLINE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
			xoap->xoa_sparse =
			    ((zp->z_pflags & ZFS_SPARSE) != 0);
			XVA_SET_RTN(xvap, XAT_SPARSE);
		}
	}

	ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
	ZFS_TIME_DECODE(&vap->va_mtime, mtime);
	ZFS_TIME_DECODE(&vap->va_ctime, ctime);
	ZFS_TIME_DECODE(&vap->va_birthtime, crtime);

	sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
	vap->va_blksize = blksize;
	vap->va_bytes = nblocks << 9;	/* nblocks * 512 */

	if (zp->z_blksz == 0) {
		/*
		 * Block size hasn't been set; suggest maximal I/O transfers.
		 */
		vap->va_blksize = zfsvfs->z_max_blksz;
	}

	ZFS_EXIT(zfsvfs);
	return (0);
}
/*
* Set the file attributes to the values contained in the
* vattr structure.
*
* IN: vp - vnode of file to be modified.
* vap - new attribute values.
* If AT_XVATTR set, then optional attrs are being set
* flags - ATTR_UTIME set if non-default time values provided.
* - ATTR_NOACLCHECK (CIFS context only).
* cr - credentials of caller.
* ct - caller context
*
* RETURN: 0 on success, error code on failure.
*
* Timestamps:
* vp - ctime updated, mtime updated if size changed.
*/
/* ARGSUSED */
static int
zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
    caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	zilog_t		*zilog;
	dmu_tx_t	*tx;
	vattr_t		oldva;
	xvattr_t	tmpxvattr;
	uint_t		mask = vap->va_mask;
	uint_t		saved_mask = 0;
	uint64_t	saved_mode;
	int		trim_mask = 0;
	uint64_t	new_mode;
	uint64_t	new_uid, new_gid;
	uint64_t	xattr_obj;
	uint64_t	mtime[2], ctime[2];
	znode_t		*attrzp;
	int		need_policy = FALSE;
	int		err, err2;
	zfs_fuid_info_t *fuidp = NULL;
	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
	xoptattr_t	*xoap;
	zfs_acl_t	*aclp;
	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
	boolean_t	fuid_dirtied = B_FALSE;
	sa_bulk_attr_t	bulk[7], xattr_bulk[7];
	int		count = 0, xattr_count = 0;

	/* Nothing requested, nothing to do. */
	if (mask == 0)
		return (0);

	if (mask & AT_NOSET)
		return (SET_ERROR(EINVAL));

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	zilog = zfsvfs->z_log;

	/*
	 * Make sure that if we have ephemeral uid/gid or xvattr specified
	 * that file system is at proper version level
	 */
	if (zfsvfs->z_use_fuids == B_FALSE &&
	    (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
	    ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
	    (mask & AT_XVATTR))) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/* Truncating a directory is meaningless. */
	if (mask & AT_SIZE && vp->v_type == VDIR) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EISDIR));
	}

	/* Size changes only make sense for regular files and FIFOs. */
	if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * If this is an xvattr_t, then get a pointer to the structure of
	 * optional attributes.  If this is NULL, then we have a vattr_t.
	 */
	xoap = xva_getxoptattr(xvap);

	/*
	 * tmpxvattr records which optional attribute requests we trim off
	 * below (because they are no-ops) so the request bits can be
	 * restored before returning to the caller.
	 */
	xva_init(&tmpxvattr);

	/*
	 * Immutable files can only alter immutable bit and atime
	 */
	if ((zp->z_pflags & ZFS_IMMUTABLE) &&
	    ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
	    ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	/*
	 * Note: ZFS_READONLY is handled in zfs_zaccess_common.
	 */

	/*
	 * Verify timestamps doesn't overflow 32 bits.
	 * ZFS can handle large timestamps, but 32bit syscalls can't
	 * handle times greater than 2039.  This check should be removed
	 * once large timestamps are fully supported.
	 */
	if (mask & (AT_ATIME | AT_MTIME)) {
		if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
		    ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EOVERFLOW));
		}
	}
	if (xoap && (mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME) &&
	    TIMESPEC_OVERFLOW(&vap->va_birthtime)) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EOVERFLOW));
	}

	attrzp = NULL;
	aclp = NULL;

	/* Can this be moved to before the top label? */
	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EROFS));
	}

	/*
	 * First validate permissions
	 */

	if (mask & AT_SIZE) {
		/*
		 * XXX - Note, we are not providing any open
		 * mode flags here (like FNDELAY), so we may
		 * block if there are locks present... this
		 * should be addressed in openat().
		 */
		/* XXX - would it be OK to generate a log record here? */
		err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
		if (err) {
			ZFS_EXIT(zfsvfs);
			return (err);
		}
	}

	/*
	 * Any timestamp change or change of a DOS-style optional
	 * attribute requires ACE_WRITE_ATTRIBUTES; record whether the
	 * caller lacks it so the secpolicy check below can decide.
	 */
	if (mask & (AT_ATIME|AT_MTIME) ||
	    ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
	    XVA_ISSET_REQ(xvap, XAT_READONLY) ||
	    XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
	    XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
	    XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
	    XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
	    XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
		need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
		    skipaclchk, cr);
	}

	if (mask & (AT_UID|AT_GID)) {
		int	idmask = (mask & (AT_UID|AT_GID));
		int	take_owner;
		int	take_group;

		/*
		 * NOTE: even if a new mode is being set,
		 * we may clear S_ISUID/S_ISGID bits.
		 */

		if (!(mask & AT_MODE))
			vap->va_mode = zp->z_mode;

		/*
		 * Take ownership or chgrp to group we are a member of
		 */

		take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
		take_group = (mask & AT_GID) &&
		    zfs_groupmember(zfsvfs, vap->va_gid, cr);

		/*
		 * If both AT_UID and AT_GID are set then take_owner and
		 * take_group must both be set in order to allow taking
		 * ownership.
		 *
		 * Otherwise, send the check through secpolicy_vnode_setattr()
		 *
		 */

		if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
		    ((idmask == AT_UID) && take_owner) ||
		    ((idmask == AT_GID) && take_group)) {
			if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
			    skipaclchk, cr) == 0) {
				/*
				 * Remove setuid/setgid for non-privileged users
				 */
				secpolicy_setid_clear(vap, vp, cr);
				trim_mask = (mask & (AT_UID|AT_GID));
			} else {
				need_policy =  TRUE;
			}
		} else {
			need_policy =  TRUE;
		}
	}

	oldva.va_mode = zp->z_mode;
	zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
	if (mask & AT_XVATTR) {
		/*
		 * Update xvattr mask to include only those attributes
		 * that are actually changing.
		 *
		 * the bits will be restored prior to actually setting
		 * the attributes so the caller thinks they were set.
		 */
		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
			if (xoap->xoa_appendonly !=
			    ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
				need_policy = TRUE;
			} else {
				XVA_CLR_REQ(xvap, XAT_APPENDONLY);
				XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
			}
		}

		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
			if (xoap->xoa_nounlink !=
			    ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
				need_policy = TRUE;
			} else {
				XVA_CLR_REQ(xvap, XAT_NOUNLINK);
				XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
			}
		}

		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
			if (xoap->xoa_immutable !=
			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
				need_policy = TRUE;
			} else {
				XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
				XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
			}
		}

		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
			if (xoap->xoa_nodump !=
			    ((zp->z_pflags & ZFS_NODUMP) != 0)) {
				need_policy = TRUE;
			} else {
				XVA_CLR_REQ(xvap, XAT_NODUMP);
				XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
			}
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
			if (xoap->xoa_av_modified !=
			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
				need_policy = TRUE;
			} else {
				XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
				XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
			}
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
			/* Quarantine only makes sense on regular files. */
			if ((vp->v_type != VREG &&
			    xoap->xoa_av_quarantined) ||
			    xoap->xoa_av_quarantined !=
			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
				need_policy = TRUE;
			} else {
				XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
				XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
			}
		}

		/* Reparse points cannot be toggled through setattr. */
		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EPERM));
		}

		if (need_policy == FALSE &&
		    (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
		    XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
			need_policy = TRUE;
		}
	}

	if (mask & AT_MODE) {
		if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
			err = secpolicy_setid_setsticky_clear(vp, vap,
			    &oldva, cr);
			if (err) {
				ZFS_EXIT(zfsvfs);
				return (err);
			}
			trim_mask |= AT_MODE;
		} else {
			need_policy = TRUE;
		}
	}

	if (need_policy) {
		/*
		 * If trim_mask is set then take ownership
		 * has been granted or write_acl is present and user
		 * has the ability to modify mode.  In that case remove
		 * UID|GID and or MODE from mask so that
		 * secpolicy_vnode_setattr() doesn't revoke it.
		 */

		if (trim_mask) {
			saved_mask = vap->va_mask;
			vap->va_mask &= ~trim_mask;
			if (trim_mask & AT_MODE) {
				/*
				 * Save the mode, as secpolicy_vnode_setattr()
				 * will overwrite it with ova.va_mode.
				 */
				saved_mode = vap->va_mode;
			}
		}
		err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
		    (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
		if (err) {
			ZFS_EXIT(zfsvfs);
			return (err);
		}

		if (trim_mask) {
			/* Restore the bits trimmed off above. */
			vap->va_mask |= saved_mask;
			if (trim_mask & AT_MODE) {
				/*
				 * Recover the mode after
				 * secpolicy_vnode_setattr().
				 */
				vap->va_mode = saved_mode;
			}
		}
	}

	/*
	 * secpolicy_vnode_setattr, or take ownership may have
	 * changed va_mask
	 */
	mask = vap->va_mask;
	if ((mask & (AT_UID | AT_GID))) {
		/*
		 * An ownership change must also be applied to the
		 * extended-attribute directory znode (attrzp), if any.
		 */
		err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
		    &xattr_obj, sizeof (xattr_obj));

		if (err == 0 && xattr_obj) {
			err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
			if (err == 0) {
				err = vn_lock(ZTOV(attrzp), LK_EXCLUSIVE);
				if (err != 0)
					vrele(ZTOV(attrzp));
			}
			if (err)
				goto out2;
		}
		if (mask & AT_UID) {
			new_uid = zfs_fuid_create(zfsvfs,
			    (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
			/* Reject a chown that would push the new owner over quota. */
			if (new_uid != zp->z_uid &&
			    zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) {
				if (attrzp)
					vput(ZTOV(attrzp));
				err = SET_ERROR(EDQUOT);
				goto out2;
			}
		}

		if (mask & AT_GID) {
			new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
			    cr, ZFS_GROUP, &fuidp);
			/* Same quota check for the new group. */
			if (new_gid != zp->z_gid &&
			    zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) {
				if (attrzp)
					vput(ZTOV(attrzp));
				err = SET_ERROR(EDQUOT);
				goto out2;
			}
		}
	}
	tx = dmu_tx_create(zfsvfs->z_os);

	if (mask & AT_MODE) {
		uint64_t pmode = zp->z_mode;
		uint64_t acl_obj;
		/* Preserve the file-type bits; take permission bits from vap. */
		new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);

		if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED &&
		    !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
			err = SET_ERROR(EPERM);
			goto out;
		}

		if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
			goto out;

		if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
			/*
			 * Are we upgrading ACL from old V0 format
			 * to V1 format?
			 */
			if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
			    zfs_znode_acl_version(zp) ==
			    ZFS_ACL_VERSION_INITIAL) {
				dmu_tx_hold_free(tx, acl_obj, 0,
				    DMU_OBJECT_END);
				dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
				    0, aclp->z_acl_bytes);
			} else {
				dmu_tx_hold_write(tx, acl_obj, 0,
				    aclp->z_acl_bytes);
			}
		} else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
			    0, aclp->z_acl_bytes);
		}
		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
	} else {
		if ((mask & AT_XVATTR) &&
		    XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
		else
			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	}

	if (attrzp) {
		dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
	}

	fuid_dirtied = zfsvfs->z_fuid_dirty;
	if (fuid_dirtied)
		zfs_fuid_txhold(zfsvfs, tx);

	zfs_sa_upgrade_txholds(tx, zp);

	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err)
		goto out;

	count = 0;
	/*
	 * Set each attribute requested.
	 * We group settings according to the locks they need to acquire.
	 *
	 * Note: you cannot set ctime directly, although it will be
	 * updated as a side-effect of calling this function.
	 */

	if (mask & (AT_UID|AT_GID|AT_MODE))
		mutex_enter(&zp->z_acl_lock);

	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
	    &zp->z_pflags, sizeof (zp->z_pflags));

	if (attrzp) {
		if (mask & (AT_UID|AT_GID|AT_MODE))
			mutex_enter(&attrzp->z_acl_lock);
		SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
		    SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
		    sizeof (attrzp->z_pflags));
	}

	if (mask & (AT_UID|AT_GID)) {

		if (mask & AT_UID) {
			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
			    &new_uid, sizeof (new_uid));
			zp->z_uid = new_uid;
			if (attrzp) {
				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
				    SA_ZPL_UID(zfsvfs), NULL, &new_uid,
				    sizeof (new_uid));
				attrzp->z_uid = new_uid;
			}
		}

		if (mask & AT_GID) {
			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
			    NULL, &new_gid, sizeof (new_gid));
			zp->z_gid = new_gid;
			if (attrzp) {
				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
				    SA_ZPL_GID(zfsvfs), NULL, &new_gid,
				    sizeof (new_gid));
				attrzp->z_gid = new_gid;
			}
		}
		/*
		 * If the mode is not also changing, still record the
		 * (unchanged) mode so zfs_acl_chown_setattr() sees a
		 * consistent view; SA_ADD_BULK_ATTR only stores the
		 * pointer, so the assignment after it is safe.
		 */
		if (!(mask & AT_MODE)) {
			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
			    NULL, &new_mode, sizeof (new_mode));
			new_mode = zp->z_mode;
		}
		err = zfs_acl_chown_setattr(zp);
		ASSERT(err == 0);
		if (attrzp) {
			err = zfs_acl_chown_setattr(attrzp);
			ASSERT(err == 0);
		}
	}

	if (mask & AT_MODE) {
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
		    &new_mode, sizeof (new_mode));
		zp->z_mode = new_mode;
		ASSERT3U((uintptr_t)aclp, !=, 0);
		err = zfs_aclset_common(zp, aclp, cr, tx);
		ASSERT0(err);
		if (zp->z_acl_cached)
			zfs_acl_free(zp->z_acl_cached);
		/* Cache takes ownership of aclp; clear so out: doesn't free it. */
		zp->z_acl_cached = aclp;
		aclp = NULL;
	}


	if (mask & AT_ATIME) {
		ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
		    &zp->z_atime, sizeof (zp->z_atime));
	}

	if (mask & AT_MTIME) {
		ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
		    mtime, sizeof (mtime));
	}

	/* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
	if (mask & AT_SIZE && !(mask & AT_MTIME)) {
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
		    NULL, mtime, sizeof (mtime));
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
		    &ctime, sizeof (ctime));
		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
		    B_TRUE);
	} else if (mask != 0) {
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
		    &ctime, sizeof (ctime));
		zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
		    B_TRUE);
		if (attrzp) {
			SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
			    SA_ZPL_CTIME(zfsvfs), NULL,
			    &ctime, sizeof (ctime));
			zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
			    mtime, ctime, B_TRUE);
		}
	}
	/*
	 * Do this after setting timestamps to prevent timestamp
	 * update from toggling bit
	 */

	if (xoap && (mask & AT_XVATTR)) {

		if (XVA_ISSET_REQ(xvap, XAT_CREATETIME))
			xoap->xoa_createtime = vap->va_birthtime;
		/*
		 * restore trimmed off masks
		 * so that return masks can be set for caller.
		 */

		if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
			XVA_SET_REQ(xvap, XAT_APPENDONLY);
		}
		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
			XVA_SET_REQ(xvap, XAT_NOUNLINK);
		}
		if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
			XVA_SET_REQ(xvap, XAT_IMMUTABLE);
		}
		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
			XVA_SET_REQ(xvap, XAT_NODUMP);
		}
		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
			XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
		}
		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
			XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
			ASSERT(vp->v_type == VREG);

		zfs_xvattr_set(zp, xvap, tx);
	}

	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);

	if (mask != 0)
		zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);

	if (mask & (AT_UID|AT_GID|AT_MODE))
		mutex_exit(&zp->z_acl_lock);

	if (attrzp) {
		if (mask & (AT_UID|AT_GID|AT_MODE))
			mutex_exit(&attrzp->z_acl_lock);
	}
out:
	/* Flush the xattr-dir bulk updates first, then release it. */
	if (err == 0 && attrzp) {
		err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
		    xattr_count, tx);
		ASSERT(err2 == 0);
	}

	if (attrzp)
		vput(ZTOV(attrzp));

	if (aclp)
		zfs_acl_free(aclp);

	if (fuidp) {
		zfs_fuid_info_free(fuidp);
		fuidp = NULL;
	}

	/* On error abort the tx; on success flush our bulk updates and commit. */
	if (err) {
		dmu_tx_abort(tx);
	} else {
		err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
		dmu_tx_commit(tx);
	}

out2:
	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (err);
}
/*
* We acquire all but fdvp locks using non-blocking acquisitions. If we
* fail to acquire any lock in the path we will drop all held locks,
* acquire the new lock in a blocking fashion, and then release it and
* restart the rename. This acquire/release step ensures that we do not
* spin on a lock waiting for release.  On error, release all vnode locks
* and drop vnode references in the same way tmpfs_rename() does.
*/
static int
zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp,
    struct vnode *tdvp, struct vnode **tvpp,
    const struct componentname *scnp, const struct componentname *tcnp)
{
	zfsvfs_t	*zfsvfs;
	struct vnode	*nvp, *svp, *tvp;
	znode_t		*sdzp, *tdzp, *szp, *tzp;
	const char	*snm = scnp->cn_nameptr;
	const char	*tnm = tcnp->cn_nameptr;
	int error;

	/* Caller enters with tdvp (and possibly *tvpp) locked; drop them. */
	VOP_UNLOCK(tdvp, 0);
	if (*tvpp != NULL && *tvpp != tdvp)
		VOP_UNLOCK(*tvpp, 0);

relock:
	/* sdvp is the only vnode we acquire in a blocking fashion here. */
	error = vn_lock(sdvp, LK_EXCLUSIVE);
	if (error)
		goto out;
	sdzp = VTOZ(sdvp);

	error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT);
	if (error != 0) {
		VOP_UNLOCK(sdvp, 0);
		if (error != EBUSY)
			goto out;
		/*
		 * tdvp is contended: block on it once, then drop it and
		 * retry the whole sequence to preserve lock ordering.
		 */
		error = vn_lock(tdvp, LK_EXCLUSIVE);
		if (error)
			goto out;
		VOP_UNLOCK(tdvp, 0);
		goto relock;
	}
	tdzp = VTOZ(tdvp);

	/*
	 * Before using sdzp and tdzp we must ensure that they are live.
	 * As a porting legacy from illumos we have two things to worry
	 * about.  One is typical for FreeBSD and it is that the vnode is
	 * not reclaimed (doomed).  The other is that the znode is live.
	 * The current code can invalidate the znode without acquiring the
	 * corresponding vnode lock if the object represented by the znode
	 * and vnode is no longer valid after a rollback or receive operation.
	 * z_teardown_lock hidden behind ZFS_ENTER and ZFS_EXIT is the lock
	 * that protects the znodes from the invalidation.
	 */
	zfsvfs = sdzp->z_zfsvfs;
	ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs);
	ZFS_ENTER(zfsvfs);

	/*
	 * We can not use ZFS_VERIFY_ZP() here because it could directly return
	 * bypassing the cleanup code in the case of an error.
	 */
	if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
		ZFS_EXIT(zfsvfs);
		VOP_UNLOCK(sdvp, 0);
		VOP_UNLOCK(tdvp, 0);
		error = SET_ERROR(EIO);
		goto out;
	}

	/*
	 * Re-resolve svp to be certain it still exists and fetch the
	 * correct vnode.
	 */
	error = zfs_dirent_lookup(sdzp, snm, &szp, ZEXISTS);
	if (error != 0) {
		/* Source entry invalid or not there. */
		ZFS_EXIT(zfsvfs);
		VOP_UNLOCK(sdvp, 0);
		VOP_UNLOCK(tdvp, 0);
		/* "." and ".." sources are rejected with EINVAL, not ENOENT. */
		if ((scnp->cn_flags & ISDOTDOT) != 0 ||
		    (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.'))
			error = SET_ERROR(EINVAL);
		goto out;
	}
	svp = ZTOV(szp);

	/*
	 * Re-resolve tvp, if it disappeared we just carry on.
	 */
	error = zfs_dirent_lookup(tdzp, tnm, &tzp, 0);
	if (error != 0) {
		ZFS_EXIT(zfsvfs);
		VOP_UNLOCK(sdvp, 0);
		VOP_UNLOCK(tdvp, 0);
		vrele(svp);
		if ((tcnp->cn_flags & ISDOTDOT) != 0)
			error = SET_ERROR(EINVAL);
		goto out;
	}
	if (tzp != NULL)
		tvp = ZTOV(tzp);
	else
		tvp = NULL;

	/*
	 * At present the vnode locks must be acquired before z_teardown_lock,
	 * although it would be more logical to use the opposite order.
	 */
	ZFS_EXIT(zfsvfs);

	/*
	 * Now try acquire locks on svp and tvp.
	 */
	nvp = svp;
	error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
	if (error != 0) {
		VOP_UNLOCK(sdvp, 0);
		VOP_UNLOCK(tdvp, 0);
		if (tvp != NULL)
			vrele(tvp);
		if (error != EBUSY) {
			vrele(nvp);
			goto out;
		}
		/* Block once on svp, release it, and restart (see header). */
		error = vn_lock(nvp, LK_EXCLUSIVE);
		if (error != 0) {
			vrele(nvp);
			goto out;
		}
		VOP_UNLOCK(nvp, 0);
		/*
		 * Concurrent rename race.
		 * XXX ?
		 */
		if (nvp == tdvp) {
			vrele(nvp);
			error = SET_ERROR(EINVAL);
			goto out;
		}
		vrele(*svpp);
		*svpp = nvp;
		goto relock;
	}
	/* Hand the freshly resolved, locked source vnode back to the caller. */
	vrele(*svpp);
	*svpp = nvp;

	if (*tvpp != NULL)
		vrele(*tvpp);
	*tvpp = NULL;
	if (tvp != NULL) {
		nvp = tvp;
		error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
		if (error != 0) {
			VOP_UNLOCK(sdvp, 0);
			VOP_UNLOCK(tdvp, 0);
			VOP_UNLOCK(*svpp, 0);
			if (error != EBUSY) {
				vrele(nvp);
				goto out;
			}
			/* Same block-once-then-restart dance for tvp. */
			error = vn_lock(nvp, LK_EXCLUSIVE);
			if (error != 0) {
				vrele(nvp);
				goto out;
			}
			vput(nvp);
			goto relock;
		}
		*tvpp = nvp;
	}

	return (0);

out:
	return (error);
}
/*
* Note that we must use VRELE_ASYNC in this function as it walks
* up the directory tree and vrele may need to acquire an exclusive
* lock if a last reference to a vnode is dropped.
*/
static int
zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp)
{
	zfsvfs_t	*zfsvfs;
	znode_t		*zp, *zp1;
	uint64_t	parent;
	int		error;

	zfsvfs = tdzp->z_zfsvfs;
	/* Renaming a directory into itself is invalid. */
	if (tdzp == szp)
		return (SET_ERROR(EINVAL));
	/* Same parent directory: nothing to check. */
	if (tdzp == sdzp)
		return (0);
	if (tdzp->z_id == zfsvfs->z_root)
		return (0);
	/*
	 * Walk up from the target directory toward the root; if we meet
	 * the source (szp) on the way, the rename would create a cycle
	 * (e.g. /usr/a/b -> /usr/a/b/c/d), so fail with EINVAL.
	 */
	zp = tdzp;
	for (;;) {
		ASSERT(!zp->z_unlinked);
		if ((error = sa_lookup(zp->z_sa_hdl,
		    SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
			break;
		if (parent == szp->z_id) {
			error = SET_ERROR(EINVAL);
			break;
		}
		if (parent == zfsvfs->z_root)
			break;
		if (parent == sdzp->z_id)
			break;
		error = zfs_zget(zfsvfs, parent, &zp1);
		if (error != 0)
			break;
		/* Drop the previous ancestor (but never tdzp, which we don't own). */
		if (zp != tdzp)
			VN_RELE_ASYNC(ZTOV(zp),
			    dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
		zp = zp1;
	}

	if (error == ENOTDIR)
		panic("checkpath: .. not a directory\n");
	if (zp != tdzp)
		VN_RELE_ASYNC(ZTOV(zp),
		    dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
	return (error);
}
/*
* Move an entry from the provided source directory to the target
* directory. Change the entry name as indicated.
*
* IN: sdvp - Source directory containing the "old entry".
* snm - Old entry name.
* tdvp - Target directory to contain the "new entry".
* tnm - New entry name.
* cr - credentials of caller.
* ct - caller context
* flags - case flags
*
* RETURN: 0 on success, error code on failure.
*
* Timestamps:
* sdvp,tdvp - ctime|mtime updated
*/
/*ARGSUSED*/
static int
zfs_rename(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp,
    vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp,
    cred_t *cr)
{
	zfsvfs_t	*zfsvfs;
	znode_t		*sdzp, *tdzp, *szp, *tzp;
	zilog_t		*zilog = NULL;
	dmu_tx_t	*tx;
	char		*snm = scnp->cn_nameptr;
	char		*tnm = tcnp->cn_nameptr;
	int		error = 0;

	/* Reject renames across filesystems. */
	if ((*svpp)->v_mount != tdvp->v_mount ||
	    ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) {
		error = SET_ERROR(EXDEV);
		goto out;
	}

	/* .zfs control-directory nodes cannot be rename targets. */
	if (zfsctl_is_node(tdvp)) {
		error = SET_ERROR(EXDEV);
		goto out;
	}

	/*
	 * Lock all four vnodes to ensure safety and semantics of renaming.
	 */
	error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp);
	if (error != 0) {
		/* no vnodes are locked in the case of error here */
		return (error);
	}

	tdzp = VTOZ(tdvp);
	sdzp = VTOZ(sdvp);
	zfsvfs = tdzp->z_zfsvfs;
	zilog = zfsvfs->z_log;

	/*
	 * After we re-enter ZFS_ENTER() we will have to revalidate all
	 * znodes involved.
	 */
	ZFS_ENTER(zfsvfs);

	if (zfsvfs->z_utf8 && u8_validate(tnm,
	    strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		error = SET_ERROR(EILSEQ);
		goto unlockout;
	}

	/* If source and target are the same file, there is nothing to do. */
	if ((*svpp) == (*tvpp)) {
		error = 0;
		goto unlockout;
	}

	/* Refuse to rename over or out from under a mount point. */
	if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) ||
	    ((*tvpp) != NULL && (*tvpp)->v_type == VDIR &&
	    (*tvpp)->v_mountedhere != NULL)) {
		error = SET_ERROR(EXDEV);
		goto unlockout;
	}

	/*
	 * We can not use ZFS_VERIFY_ZP() here because it could directly return
	 * bypassing the cleanup code in the case of an error.
	 */
	if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
		error = SET_ERROR(EIO);
		goto unlockout;
	}

	szp = VTOZ(*svpp);
	tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp);
	if (szp->z_sa_hdl == NULL || (tzp != NULL && tzp->z_sa_hdl == NULL)) {
		error = SET_ERROR(EIO);
		goto unlockout;
	}

	/*
	 * This is to prevent the creation of links into attribute space
	 * by renaming a linked file into/outof an attribute directory.
	 * See the comment in zfs_link() for why this is considered bad.
	 */
	if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
		error = SET_ERROR(EINVAL);
		goto unlockout;
	}

	/*
	 * Must have write access at the source to remove the old entry
	 * and write access at the target to create the new entry.
	 * Note that if target and source are the same, this can be
	 * done in a single check.
	 */
	if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
		goto unlockout;

	if ((*svpp)->v_type == VDIR) {
		/*
		 * Avoid ".", "..", and aliases of "." for obvious reasons.
		 */
		if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') ||
		    sdzp == szp ||
		    (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) {
			error = EINVAL;
			goto unlockout;
		}

		/*
		 * Check to make sure rename is valid.
		 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
		 */
		if (error = zfs_rename_check(szp, sdzp, tdzp))
			goto unlockout;
	}

	/*
	 * Does target exist?
	 */
	if (tzp) {
		/*
		 * Source and target must be the same type.
		 */
		if ((*svpp)->v_type == VDIR) {
			if ((*tvpp)->v_type != VDIR) {
				error = SET_ERROR(ENOTDIR);
				goto unlockout;
			} else {
				cache_purge(tdvp);
				if (sdvp != tdvp)
					cache_purge(sdvp);
			}
		} else {
			if ((*tvpp)->v_type == VDIR) {
				error = SET_ERROR(EISDIR);
				goto unlockout;
			}
		}
	}

	vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct);
	if (tzp)
		vnevent_rename_dest(*tvpp, tdvp, tnm, ct);

	/*
	 * notify the target directory if it is not the same
	 * as source directory.
	 */
	if (tdvp != sdvp) {
		vnevent_rename_dest_dir(tdvp, ct);
	}

	/* Reserve every object the rename may touch in one transaction. */
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
	if (sdzp != tdzp) {
		dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
		zfs_sa_upgrade_txholds(tx, tdzp);
	}
	if (tzp) {
		dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
		zfs_sa_upgrade_txholds(tx, tzp);
	}

	zfs_sa_upgrade_txholds(tx, szp);
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		goto unlockout;
	}


	if (tzp)	/* Attempt to remove the existing target */
		error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL);

	if (error == 0) {
		/* Create the new name first, then remove the old one. */
		error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING);
		if (error == 0) {
			szp->z_pflags |= ZFS_AV_MODIFIED;

			error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
			    (void *)&szp->z_pflags, sizeof (uint64_t), tx);
			ASSERT0(error);

			error = zfs_link_destroy(sdzp, snm, szp, tx, ZRENAMING,
			    NULL);
			if (error == 0) {
				zfs_log_rename(zilog, tx, TX_RENAME, sdzp,
				    snm, tdzp, tnm, szp);

				/*
				 * Update path information for the target vnode
				 */
				vn_renamepath(tdvp, *svpp, tnm, strlen(tnm));
			} else {
				/*
				 * At this point, we have successfully created
				 * the target name, but have failed to remove
				 * the source name.  Since the create was done
				 * with the ZRENAMING flag, there are
				 * complications; for one, the link count is
				 * wrong.  The easiest way to deal with this
				 * is to remove the newly created target, and
				 * return the original error.  This must
				 * succeed; fortunately, it is very unlikely to
				 * fail, since we just created it.
				 */
				VERIFY3U(zfs_link_destroy(tdzp, tnm, szp, tx,
				    ZRENAMING, NULL), ==, 0);
			}
		}
		if (error == 0) {
			cache_purge(*svpp);
			if (*tvpp != NULL)
				cache_purge(*tvpp);
			cache_purge_negative(tdvp);
		}
	}

	dmu_tx_commit(tx);

unlockout:			/* all 4 vnodes are locked, ZFS_ENTER called */
	ZFS_EXIT(zfsvfs);
	VOP_UNLOCK(*svpp, 0);
	VOP_UNLOCK(sdvp, 0);

out:				/* original two vnodes are locked */
	/*
	 * error != 0 short-circuits before zfsvfs/zilog are read, so the
	 * early EXDEV gotos (taken before they are initialized) are safe.
	 */
	if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	if (*tvpp != NULL)
		VOP_UNLOCK(*tvpp, 0);
	if (tdvp != *tvpp)
		VOP_UNLOCK(tdvp, 0);
	return (error);
}
/*
* Insert the indicated symbolic reference entry into the directory.
*
* IN: dvp - Directory to contain new symbolic link.
* link - Name for new symlink entry.
* vap - Attributes of new entry.
* cr - credentials of caller.
* ct - caller context
* flags - case flags
*
* RETURN: 0 on success, error code on failure.
*
* Timestamps:
* dvp - ctime|mtime updated
*/
/*ARGSUSED*/
static int
zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link,
    cred_t *cr, kthread_t *td)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	dmu_tx_t	*tx;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	uint64_t	len = strlen(link);
	int		error;
	zfs_acl_ids_t	acl_ids;
	boolean_t	fuid_dirtied;
	uint64_t	txtype = TX_SYMLINK;
	int		flags = 0;

	ASSERT(vap->va_type == VLNK);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}

	if (len > MAXPATHLEN) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(ENAMETOOLONG));
	}

	if ((error = zfs_acl_ids_create(dzp, 0,
	    vap, cr, NULL, &acl_ids)) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Attempt to lock directory; fail if entry already exists.
	 */
	error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
	if (error) {
		zfs_acl_ids_free(&acl_ids);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
		zfs_acl_ids_free(&acl_ids);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
		zfs_acl_ids_free(&acl_ids);
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EDQUOT));
	}

	/* Pre-reserve the vnode we will allocate inside the tx. */
	getnewvnode_reserve(1);
	tx = dmu_tx_create(zfsvfs->z_os);
	fuid_dirtied = zfsvfs->z_fuid_dirty;
	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
	    ZFS_SA_BASE_ATTR_SIZE + len);
	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
		    acl_ids.z_aclp->z_acl_bytes);
	}
	if (fuid_dirtied)
		zfs_fuid_txhold(zfsvfs, tx);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_abort(tx);
		getnewvnode_drop_reserve();
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Create a new object for the symlink.
	 * for version 4 ZPL datasets the symlink will be an SA attribute
	 */
	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);

	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);

	/* Store the link target either as an SA attribute or in the object. */
	if (zp->z_is_sa)
		error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
		    link, len, tx);
	else
		zfs_sa_symlink(zp, link, len, tx);

	zp->z_size = len;
	(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
	    &zp->z_size, sizeof (zp->z_size), tx);
	/*
	 * Insert the new object into the directory.
	 */
	(void) zfs_link_create(dzp, name, zp, tx, ZNEW);

	zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
	*vpp = ZTOV(zp);

	zfs_acl_ids_free(&acl_ids);

	dmu_tx_commit(tx);
	getnewvnode_drop_reserve();

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}
/*
* Return, in the buffer contained in the provided uio structure,
* the symbolic path referred to by vp.
*
* IN: vp - vnode of symbolic link.
* uio - structure to contain the link path.
* cr - credentials of caller.
* ct - caller context
*
* OUT: uio - structure containing the link path.
*
* RETURN: 0 on success, error code on failure.
*
* Timestamps:
* vp - atime updated
*/
/* ARGSUSED */
static int
zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	int		error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/*
	 * The link target lives either in an SA attribute (newer ZPL
	 * versions) or in the object itself; copy it into the uio.
	 */
	if (zp->z_is_sa)
		error = sa_lookup_uio(zp->z_sa_hdl,
		    SA_ZPL_SYMLINK(zfsvfs), uio);
	else
		error = zfs_sa_readlink(zp, uio);

	/* Reading a symlink counts as an access: mark atime for update. */
	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);

	ZFS_EXIT(zfsvfs);
	return (error);
}
/*
* Insert a new entry into directory tdvp referencing svp.
*
* IN: tdvp - Directory to contain new entry.
* svp - vnode of new entry.
* name - name of new entry.
* cr - credentials of caller.
* ct - caller context
*
* RETURN: 0 on success, error code on failure.
*
* Timestamps:
* tdvp - ctime|mtime updated
* svp - ctime updated
*/
/* ARGSUSED */
static int
zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
    caller_context_t *ct, int flags)
{
	znode_t		*dzp = VTOZ(tdvp);
	znode_t		*tzp, *szp;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	dmu_tx_t	*tx;
	int		error;
	uint64_t	parent;
	uid_t		owner;

	ASSERT(tdvp->v_type == VDIR);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	/*
	 * POSIX dictates that we return EPERM here.
	 * Better choices include ENOTSUP or EISDIR.
	 */
	if (svp->v_type == VDIR) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	szp = VTOZ(svp);
	ZFS_VERIFY_ZP(szp);

	/* Files marked append-only, immutable, or read-only cannot be linked. */
	if (szp->z_pflags & (ZFS_APPENDONLY | ZFS_IMMUTABLE | ZFS_READONLY)) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	/* Prevent links to .zfs/shares files */
	if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
	    &parent, sizeof (uint64_t))) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}
	if (parent == zfsvfs->z_shares_dir) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	if (zfsvfs->z_utf8 && u8_validate(name,
	    strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}

	/*
	 * We do not support links between attributes and non-attributes
	 * because of the potential security risk of creating links
	 * into "normal" file space in order to circumvent restrictions
	 * imposed in attribute space.
	 */
	if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/* Only the owner or a suitably privileged caller may create the link. */
	owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
	if (owner != crgetuid(cr) && secpolicy_basic_link(svp, cr) != 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Attempt to lock directory; fail if entry already exists.
	 */
	error = zfs_dirent_lookup(dzp, name, &tzp, ZNEW);
	if (error) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
	zfs_sa_upgrade_txholds(tx, szp);
	zfs_sa_upgrade_txholds(tx, dzp);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	error = zfs_link_create(dzp, name, szp, tx, 0);

	if (error == 0) {
		uint64_t txtype = TX_LINK;
		zfs_log_link(zilog, tx, txtype, dzp, szp, name);
	}

	dmu_tx_commit(tx);

	if (error == 0) {
		vnevent_link(svp, ct);
	}

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}
/*ARGSUSED*/
void
zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
{
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;

	/* Hold off teardown (unmount/rollback) while we inspect the znode. */
	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
	if (zp->z_sa_hdl == NULL) {
		/*
		 * The fs has been unmounted, or we did a
		 * suspend/resume and this file no longer exists.
		 */
		rw_exit(&zfsvfs->z_teardown_inactive_lock);
		vrecycle(vp);
		return;
	}

	if (zp->z_unlinked) {
		/*
		 * Fast path to recycle a vnode of a removed file.
		 */
		rw_exit(&zfsvfs->z_teardown_inactive_lock);
		vrecycle(vp);
		return;
	}

	/* Flush a deferred atime update to disk before the vnode goes idle. */
	if (zp->z_atime_dirty && zp->z_unlinked == 0) {
		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);

		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
		zfs_sa_upgrade_txholds(tx, zp);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			/* Best-effort: drop the atime update on tx failure. */
			dmu_tx_abort(tx);
		} else {
			(void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
			    (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
			zp->z_atime_dirty = 0;
			dmu_tx_commit(tx);
		}
	}
	rw_exit(&zfsvfs->z_teardown_inactive_lock);
}
/*
 * Compile-time checks: both ZFS file-ID layouts must fit inside the
 * generic struct fid that zfs_fid() casts fidp to below.
 */
CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid));
CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid));
/*
 * Build an NFS-exportable file handle: object number + generation, and for
 * filesystems mounted under another (snapshots) also the objset id.
 */
/*ARGSUSED*/
static int
zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
{
    znode_t *zp = VTOZ(vp);
    zfsvfs_t *zfsvfs = zp->z_zfsvfs;
    uint32_t gen;
    uint64_t gen64;
    uint64_t object = zp->z_id;
    zfid_short_t *zfid;
    int size, i, error;

    ZFS_ENTER(zfsvfs);
    ZFS_VERIFY_ZP(zp);
    if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
        &gen64, sizeof (uint64_t))) != 0) {
        ZFS_EXIT(zfsvfs);
        return (error);
    }
    gen = (uint32_t)gen64;

    /* A nested (snapshot) filesystem needs the long fid with the objset id. */
    size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;

#ifdef illumos
    if (fidp->fid_len < size) {
        fidp->fid_len = size;
        ZFS_EXIT(zfsvfs);
        return (SET_ERROR(ENOSPC));
    }
#else
    fidp->fid_len = size;
#endif

    zfid = (zfid_short_t *)fidp;

    zfid->zf_len = size;

    /* Serialize the object number one byte at a time, little-endian. */
    for (i = 0; i < sizeof (zfid->zf_object); i++)
        zfid->zf_object[i] = (uint8_t)(object >> (8 * i));

    /* Must have a non-zero generation number to distinguish from .zfs */
    if (gen == 0)
        gen = 1;
    for (i = 0; i < sizeof (zfid->zf_gen); i++)
        zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));

    if (size == LONG_FID_LEN) {
        uint64_t objsetid = dmu_objset_id(zfsvfs->z_os);
        zfid_long_t *zlfid;

        zlfid = (zfid_long_t *)fidp;

        for (i = 0; i < sizeof (zlfid->zf_setid); i++)
            zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));

        /* XXX - this should be the generation number for the objset */
        for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
            zlfid->zf_setgen[i] = 0;
    }

    ZFS_EXIT(zfsvfs);
    return (0);
}
/*
 * pathconf(2) backend: report ZFS-specific limits; returns EOPNOTSUPP for
 * variables the caller (zfs_freebsd_pathconf) should default generically.
 */
static int
zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
    caller_context_t *ct)
{
    znode_t *zp, *xzp;
    zfsvfs_t *zfsvfs;
    int error;

    switch (cmd) {
    case _PC_LINK_MAX:
        *valp = MIN(LONG_MAX, ZFS_LINK_MAX);
        return (0);

    case _PC_FILESIZEBITS:
        *valp = 64;
        return (0);
#ifdef illumos
    case _PC_XATTR_EXISTS:
        zp = VTOZ(vp);
        zfsvfs = zp->z_zfsvfs;
        ZFS_ENTER(zfsvfs);
        ZFS_VERIFY_ZP(zp);
        *valp = 0;
        /* Probe for the hidden extended-attribute directory. */
        error = zfs_dirent_lookup(zp, "", &xzp,
            ZXATTR | ZEXISTS | ZSHARED);
        if (error == 0) {
            if (!zfs_dirempty(xzp))
                *valp = 1;
            vrele(ZTOV(xzp));
        } else if (error == ENOENT) {
            /*
             * If there aren't extended attributes, it's the
             * same as having zero of them.
             */
            error = 0;
        }
        ZFS_EXIT(zfsvfs);
        return (error);

    case _PC_SATTR_ENABLED:
    case _PC_SATTR_EXISTS:
        *valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
            (vp->v_type == VREG || vp->v_type == VDIR);
        return (0);

    case _PC_ACCESS_FILTERING:
        *valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) &&
            vp->v_type == VDIR;
        return (0);

    case _PC_ACL_ENABLED:
        *valp = _ACL_ACE_ENABLED;
        return (0);
#endif  /* illumos */
    case _PC_MIN_HOLE_SIZE:
        *valp = (int)SPA_MINBLOCKSIZE;
        return (0);
#ifdef illumos
    case _PC_TIMESTAMP_RESOLUTION:
        /* nanosecond timestamp resolution */
        *valp = 1L;
        return (0);
#endif
    case _PC_ACL_EXTENDED:
        *valp = 0;      /* POSIX.1e ACLs are not supported. */
        return (0);

    case _PC_ACL_NFS4:
        *valp = 1;
        return (0);

    case _PC_ACL_PATH_MAX:
        *valp = ACL_MAX_ENTRIES;
        return (0);

    default:
        return (EOPNOTSUPP);
    }
}
/*
 * Fetch the NFSv4 ACL of a znode into *vsecp.
 */
/*ARGSUSED*/
static int
zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
    caller_context_t *ct)
{
    znode_t *node = VTOZ(vp);
    zfsvfs_t *fsvfs = node->z_zfsvfs;
    boolean_t skip;
    int err;

    /* ATTR_NOACLCHECK: the caller already performed access checks. */
    skip = (flag & ATTR_NOACLCHECK) != 0 ? B_TRUE : B_FALSE;

    ZFS_ENTER(fsvfs);
    ZFS_VERIFY_ZP(node);
    err = zfs_getacl(node, vsecp, skip, cr);
    ZFS_EXIT(fsvfs);

    return (err);
}
/*
 * Install a new NFSv4 ACL on a znode, honoring sync=always.
 */
/*ARGSUSED*/
int
zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
    caller_context_t *ct)
{
    znode_t *node = VTOZ(vp);
    zfsvfs_t *fsvfs = node->z_zfsvfs;
    zilog_t *log = fsvfs->z_log;
    boolean_t skip;
    int err;

    /* ATTR_NOACLCHECK: the caller already performed access checks. */
    skip = (flag & ATTR_NOACLCHECK) != 0 ? B_TRUE : B_FALSE;

    ZFS_ENTER(fsvfs);
    ZFS_VERIFY_ZP(node);
    err = zfs_setacl(node, vsecp, skip, cr);

    /* sync=always requires the intent log to be pushed immediately. */
    if (fsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
        zil_commit(log, 0);

    ZFS_EXIT(fsvfs);
    return (err);
}
/*
 * VOP_GETPAGES worker: fill the VM pages in 'ma' (plus optional
 * read-behind/read-ahead pages) with file data read through the DMU.
 * Returns one of the zfs_vm_pagerret_* codes.
 */
static int
zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind,
    int *rahead)
{
    znode_t *zp = VTOZ(vp);
    zfsvfs_t *zfsvfs = zp->z_zfsvfs;
    objset_t *os = zp->z_zfsvfs->z_os;
    rl_t *rl;
    vm_object_t object;
    off_t start, end, obj_size;
    uint_t blksz;
    int pgsin_b, pgsin_a;
    int error;

    ZFS_ENTER(zfsvfs);
    ZFS_VERIFY_ZP(zp);

    /* Byte range spanned by the requested pages. */
    start = IDX_TO_OFF(ma[0]->pindex);
    end = IDX_TO_OFF(ma[count - 1]->pindex + 1);

    /*
     * Lock a range covering all required and optional pages.
     * Note that we need to handle the case of the block size growing.
     */
    for (;;) {
        blksz = zp->z_blksz;
        rl = zfs_range_lock(zp, rounddown(start, blksz),
            roundup(end, blksz) - rounddown(start, blksz), RL_READER);
        /* Retry if the block size changed before we got the lock. */
        if (blksz == zp->z_blksz)
            break;
        zfs_range_unlock(rl);
    }

    object = ma[0]->object;
    zfs_vmobject_wlock(object);
    obj_size = object->un_pager.vnp.vnp_size;
    zfs_vmobject_wunlock(object);
    /* The last requested page starts at or beyond EOF: nothing to read. */
    if (IDX_TO_OFF(ma[count - 1]->pindex) >= obj_size) {
        zfs_range_unlock(rl);
        ZFS_EXIT(zfsvfs);
        return (zfs_vm_pagerret_bad);
    }

    /* Clip read-behind to the start of the block containing 'start'. */
    pgsin_b = 0;
    if (rbehind != NULL) {
        pgsin_b = OFF_TO_IDX(start - rounddown(start, blksz));
        pgsin_b = MIN(*rbehind, pgsin_b);
    }

    /* Clip read-ahead to the end of the block and to EOF. */
    pgsin_a = 0;
    if (rahead != NULL) {
        pgsin_a = OFF_TO_IDX(roundup(end, blksz) - end);
        if (end + IDX_TO_OFF(pgsin_a) >= obj_size)
            pgsin_a = OFF_TO_IDX(round_page(obj_size) - end);
        pgsin_a = MIN(*rahead, pgsin_a);
    }

    /*
     * NB: we need to pass the exact byte size of the data that we expect
     * to read after accounting for the file size. This is required because
     * ZFS will panic if we request DMU to read beyond the end of the last
     * allocated block.
     */
    error = dmu_read_pages(os, zp->z_id, ma, count, &pgsin_b, &pgsin_a,
        MIN(end, obj_size) - (end - PAGE_SIZE));

    zfs_range_unlock(rl);
    ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
    ZFS_EXIT(zfsvfs);

    if (error != 0)
        return (zfs_vm_pagerret_error);

    VM_CNT_INC(v_vnodein);
    VM_CNT_ADD(v_vnodepgsin, count + pgsin_b + pgsin_a);
    /* Report how many optional pages were actually filled. */
    if (rbehind != NULL)
        *rbehind = pgsin_b;
    if (rahead != NULL)
        *rahead = pgsin_a;
    return (zfs_vm_pagerret_ok);
}
/*
 * VOP_GETPAGES entry point: unpack the vop argument structure and
 * forward to zfs_getpages().
 */
static int
zfs_freebsd_getpages(struct vop_getpages_args *ap)
{

    return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind,
        ap->a_rahead));
}
/*
 * VOP_PUTPAGES worker: write dirty VM pages back to the file via the DMU.
 * 'len' is the byte length of the page run 'ma'; per-page status is
 * returned in 'rtvals'.  Returns the status of the first page.
 */
static int
zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
    int *rtvals)
{
    znode_t *zp = VTOZ(vp);
    zfsvfs_t *zfsvfs = zp->z_zfsvfs;
    rl_t *rl;
    dmu_tx_t *tx;
    struct sf_buf *sf;
    vm_object_t object;
    vm_page_t m;
    caddr_t va;
    size_t tocopy;
    size_t lo_len;
    vm_ooffset_t lo_off;
    vm_ooffset_t off;
    uint_t blksz;
    int ncount;
    int pcount;
    int err;
    int i;

    ZFS_ENTER(zfsvfs);
    ZFS_VERIFY_ZP(zp);

    object = vp->v_object;
    pcount = btoc(len);
    ncount = pcount;

    KASSERT(ma[0]->object == object, ("mismatching object"));
    KASSERT(len > 0 && (len & PAGE_MASK) == 0, ("unexpected length"));

    /* Assume failure until a page is actually written. */
    for (i = 0; i < pcount; i++)
        rtvals[i] = zfs_vm_pagerret_error;

    off = IDX_TO_OFF(ma[0]->pindex);
    blksz = zp->z_blksz;
    /* Range-lock whole blocks so a concurrent recordsize change is safe. */
    lo_off = rounddown(off, blksz);
    lo_len = roundup(len + (off - lo_off), blksz);
    rl = zfs_range_lock(zp, lo_off, lo_len, RL_WRITER);

    zfs_vmobject_wlock(object);
    /* Clip the write to the current file size. */
    if (len + off > object->un_pager.vnp.vnp_size) {
        if (object->un_pager.vnp.vnp_size > off) {
            int pgoff;

            len = object->un_pager.vnp.vnp_size - off;
            ncount = btoc(len);
            if ((pgoff = (int)len & PAGE_MASK) != 0) {
                /*
                 * If the object is locked and the following
                 * conditions hold, then the page's dirty
                 * field cannot be concurrently changed by a
                 * pmap operation.
                 */
                m = ma[ncount - 1];
                vm_page_assert_sbusied(m);
                KASSERT(!pmap_page_is_write_mapped(m),
                    ("zfs_putpages: page %p is not read-only", m));
                vm_page_clear_dirty(m, pgoff, PAGE_SIZE -
                    pgoff);
            }
        } else {
            /* The whole run lies beyond EOF. */
            len = 0;
            ncount = 0;
        }
        if (ncount < pcount) {
            for (i = ncount; i < pcount; i++) {
                rtvals[i] = zfs_vm_pagerret_bad;
            }
        }
    }
    zfs_vmobject_wunlock(object);

    if (ncount == 0)
        goto out;

    /* Refuse the write if the owner or group is over quota. */
    if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
        zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
        goto out;
    }

    tx = dmu_tx_create(zfsvfs->z_os);
    dmu_tx_hold_write(tx, zp->z_id, off, len);

    dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
    zfs_sa_upgrade_txholds(tx, zp);
    err = dmu_tx_assign(tx, TXG_WAIT);
    if (err != 0) {
        dmu_tx_abort(tx);
        goto out;
    }

    if (zp->z_blksz < PAGE_SIZE) {
        /* Sub-page blocks: copy page by page through a temp mapping. */
        for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) {
            tocopy = len > PAGE_SIZE ? PAGE_SIZE : len;
            va = zfs_map_page(ma[i], &sf);
            dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx);
            zfs_unmap_page(sf);
        }
    } else {
        err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, ma, tx);
    }

    if (err == 0) {
        uint64_t mtime[2], ctime[2];
        sa_bulk_attr_t bulk[3];
        int count = 0;

        /* Update mtime/ctime/flags and log the write in one tx. */
        SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
            &mtime, 16);
        SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
            &ctime, 16);
        SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
            &zp->z_pflags, 8);
        zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
            B_TRUE);
        err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
        ASSERT0(err);
        zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);

        zfs_vmobject_wlock(object);
        for (i = 0; i < ncount; i++) {
            rtvals[i] = zfs_vm_pagerret_ok;
            vm_page_undirty(ma[i]);
        }
        zfs_vmobject_wunlock(object);
        VM_CNT_INC(v_vnodeout);
        VM_CNT_ADD(v_vnodepgsout, ncount);
    }
    dmu_tx_commit(tx);

out:
    zfs_range_unlock(rl);
    /* Synchronous/invalidating pageouts and sync=always force a ZIL push. */
    if ((flags & (zfs_vm_pagerput_sync | zfs_vm_pagerput_inval)) != 0 ||
        zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
        zil_commit(zfsvfs->z_log, zp->z_id);
    ZFS_EXIT(zfsvfs);
    return (rtvals[0]);
}
/*
 * VOP_PUTPAGES entry point: unpack the vop argument structure and
 * forward to zfs_putpages().
 */
int
zfs_freebsd_putpages(struct vop_putpages_args *ap)
{

    return (zfs_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync,
        ap->a_rtvals));
}
/*
 * VOP_BMAP: ZFS does its own caching, so report a trivial 1:1 mapping
 * with no read-ahead/read-behind clustering.
 */
static int
zfs_freebsd_bmap(struct vop_bmap_args *ap)
{

    if (ap->a_bop != NULL)
        *ap->a_bop = &ap->a_vp->v_bufobj;
    if (ap->a_bnp != NULL)
        *ap->a_bnp = ap->a_bn;
    if (ap->a_runp != NULL)
        *ap->a_runp = 0;
    if (ap->a_runb != NULL)
        *ap->a_runb = 0;

    return (0);
}
/*
 * VOP_OPEN: delegate to zfs_open() and, on success, make sure the vnode
 * has a VM object so it can be mmapped.
 */
static int
zfs_freebsd_open(struct vop_open_args *ap)
{
    vnode_t *vp = ap->a_vp;
    znode_t *zp = VTOZ(vp);
    int error;

    error = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL);
    if (error == 0)
        vnode_create_vobject(vp, zp->z_size, ap->a_td);

    return (error);
}
/*
 * VOP_CLOSE: forward to zfs_close() with fixed count/offset arguments.
 */
static int
zfs_freebsd_close(struct vop_close_args *ap)
{

    return (zfs_close(ap->a_vp, ap->a_fflag, 1, 0, ap->a_cred, NULL));
}
/*
 * VOP_IOCTL: forward to zfs_ioctl(), passing the data pointer as an
 * integer argument as the common code expects.
 */
static int
zfs_freebsd_ioctl(struct vop_ioctl_args *ap)
{

    return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
        ap->a_fflag, ap->a_cred, NULL, NULL));
}
/*
 * Translate vnode-layer IO_* flags into the file (F*) flags that the
 * common ZFS read/write code understands.
 */
static int
ioflags(int ioflag)
{
    int flags;

    flags = 0;
    if ((ioflag & IO_APPEND) != 0)
        flags |= FAPPEND;
    if ((ioflag & IO_NDELAY) != 0)
        flags |= FNONBLOCK;
    if ((ioflag & IO_SYNC) != 0)
        flags |= (FSYNC | FDSYNC | FRSYNC);

    return (flags);
}
/*
 * VOP_READ: translate the ioflag bits and forward to zfs_read().
 */
static int
zfs_freebsd_read(struct vop_read_args *ap)
{

    return (zfs_read(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
        ap->a_cred, NULL));
}
/*
 * VOP_WRITE: translate the ioflag bits and forward to zfs_write().
 */
static int
zfs_freebsd_write(struct vop_write_args *ap)
{

    return (zfs_write(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
        ap->a_cred, NULL));
}
static int
zfs_freebsd_access(ap)
struct vop_access_args /* {
struct vnode *a_vp;
accmode_t a_accmode;
struct ucred *a_cred;
struct thread *a_td;
} */ *ap;
{
vnode_t *vp = ap->a_vp;
znode_t *zp = VTOZ(vp);
accmode_t accmode;
int error = 0;
/*
* ZFS itself only knowns about VREAD, VWRITE, VEXEC and VAPPEND,
*/
accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND);
if (accmode != 0)
error = zfs_access(ap->a_vp, accmode, 0, ap->a_cred, NULL);
/*
* VADMIN has to be handled by vaccess().
*/
if (error == 0) {
accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND);
if (accmode != 0) {
error = vaccess(vp->v_type, zp->z_mode, zp->z_uid,
zp->z_gid, accmode, ap->a_cred, NULL);
}
}
/*
* For VEXEC, ensure that at least one execute bit is set for
* non-directories.
*/
if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR &&
(zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) {
error = EACCES;
}
return (error);
}
/*
 * VOP_LOOKUP: component names are not NUL-terminated, so make a bounded
 * terminated copy before handing off to zfs_lookup().
 */
static int
zfs_freebsd_lookup(struct vop_lookup_args *ap)
{
    struct componentname *cnp = ap->a_cnp;
    char nm[NAME_MAX + 1];

    ASSERT(cnp->cn_namelen < sizeof(nm));
    strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm)));

    return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
        cnp->cn_cred, cnp->cn_thread, 0));
}
/*
 * VOP_LOOKUP dispatcher: go through the VFS name cache when the mount
 * allows it, otherwise fall back to a direct ZFS lookup.
 */
static int
zfs_cache_lookup(struct vop_lookup_args *ap)
{
    zfsvfs_t *zfsvfs;

    zfsvfs = ap->a_dvp->v_mount->mnt_data;
    if (zfsvfs->z_use_namecache)
        return (vfs_cache_lookup(ap));

    return (zfs_freebsd_lookup(ap));
}
/*
 * VOP_CREATE: create a regular file and, when allowed, enter it into the
 * name cache.
 */
static int
zfs_freebsd_create(struct vop_create_args *ap)
{
    struct componentname *cnp = ap->a_cnp;
    vattr_t *vap = ap->a_vap;
    zfsvfs_t *zfsvfs;
    int error, mode;

    ASSERT(cnp->cn_flags & SAVENAME);

    vattr_init_mask(vap);
    mode = vap->va_mode & ALLPERMS;
    zfsvfs = ap->a_dvp->v_mount->mnt_data;

    error = zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode,
        ap->a_vpp, cnp->cn_cred, cnp->cn_thread);
    if (zfsvfs->z_use_namecache &&
        error == 0 && (cnp->cn_flags & MAKEENTRY) != 0)
        cache_enter(ap->a_dvp, *ap->a_vpp, cnp);

    return (error);
}
/*
 * VOP_REMOVE: forward to zfs_remove() with the saved component name.
 */
static int
zfs_freebsd_remove(struct vop_remove_args *ap)
{

    ASSERT(ap->a_cnp->cn_flags & SAVENAME);

    return (zfs_remove(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr,
        ap->a_cnp->cn_cred));
}
/*
 * VOP_MKDIR: normalize the vattr mask and forward to zfs_mkdir().
 */
static int
zfs_freebsd_mkdir(struct vop_mkdir_args *ap)
{
    vattr_t *vap = ap->a_vap;

    ASSERT(ap->a_cnp->cn_flags & SAVENAME);

    vattr_init_mask(vap);

    return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp,
        ap->a_cnp->cn_cred));
}
/*
 * VOP_RMDIR: forward to zfs_rmdir() with the saved component name.
 */
static int
zfs_freebsd_rmdir(struct vop_rmdir_args *ap)
{
    struct componentname *cnp = ap->a_cnp;

    ASSERT(cnp->cn_flags & SAVENAME);

    return (zfs_rmdir(ap->a_dvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred));
}
/*
 * VOP_READDIR: unpack the vop arguments and forward to zfs_readdir().
 */
static int
zfs_freebsd_readdir(struct vop_readdir_args *ap)
{

    return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag,
        ap->a_ncookies, ap->a_cookies));
}
/*
 * VOP_FSYNC: flush buffered vnode state first, then let ZFS commit the
 * file's intent-log records.
 */
static int
zfs_freebsd_fsync(struct vop_fsync_args *ap)
{

    vop_stdfsync(ap);

    return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL));
}
/*
 * VOP_GETATTR: fetch attributes through zfs_getattr() using an extended
 * vattr, then translate ZFS's extended attributes into FreeBSD chflags(2)
 * bits in va_flags.
 */
static int
zfs_freebsd_getattr(struct vop_getattr_args *ap)
{
    vattr_t *vap = ap->a_vap;
    xvattr_t xvap;
    u_long fflags = 0;
    int error;

    xva_init(&xvap);
    xvap.xva_vattr = *vap;
    xvap.xva_vattr.va_mask |= AT_XVATTR;

    /* Convert chflags into ZFS-type flags. */
    /* XXX: what about SF_SETTABLE?. */
    XVA_SET_REQ(&xvap, XAT_IMMUTABLE);
    XVA_SET_REQ(&xvap, XAT_APPENDONLY);
    XVA_SET_REQ(&xvap, XAT_NOUNLINK);
    XVA_SET_REQ(&xvap, XAT_NODUMP);
    XVA_SET_REQ(&xvap, XAT_READONLY);
    XVA_SET_REQ(&xvap, XAT_ARCHIVE);
    XVA_SET_REQ(&xvap, XAT_SYSTEM);
    XVA_SET_REQ(&xvap, XAT_HIDDEN);
    XVA_SET_REQ(&xvap, XAT_REPARSE);
    XVA_SET_REQ(&xvap, XAT_OFFLINE);
    XVA_SET_REQ(&xvap, XAT_SPARSE);

    error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL);
    if (error != 0)
        return (error);

    /* Convert ZFS xattr into chflags. */
#define FLAG_CHECK(fflag, xflag, xfield)    do {            \
    if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0)     \
        fflags |= (fflag);                  \
} while (0)
    FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE,
        xvap.xva_xoptattrs.xoa_immutable);
    FLAG_CHECK(SF_APPEND, XAT_APPENDONLY,
        xvap.xva_xoptattrs.xoa_appendonly);
    FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK,
        xvap.xva_xoptattrs.xoa_nounlink);
    FLAG_CHECK(UF_ARCHIVE, XAT_ARCHIVE,
        xvap.xva_xoptattrs.xoa_archive);
    FLAG_CHECK(UF_NODUMP, XAT_NODUMP,
        xvap.xva_xoptattrs.xoa_nodump);
    FLAG_CHECK(UF_READONLY, XAT_READONLY,
        xvap.xva_xoptattrs.xoa_readonly);
    FLAG_CHECK(UF_SYSTEM, XAT_SYSTEM,
        xvap.xva_xoptattrs.xoa_system);
    FLAG_CHECK(UF_HIDDEN, XAT_HIDDEN,
        xvap.xva_xoptattrs.xoa_hidden);
    FLAG_CHECK(UF_REPARSE, XAT_REPARSE,
        xvap.xva_xoptattrs.xoa_reparse);
    FLAG_CHECK(UF_OFFLINE, XAT_OFFLINE,
        xvap.xva_xoptattrs.xoa_offline);
    FLAG_CHECK(UF_SPARSE, XAT_SPARSE,
        xvap.xva_xoptattrs.xoa_sparse);
#undef  FLAG_CHECK
    *vap = xvap.xva_vattr;
    vap->va_flags = fflags;
    return (0);
}
/*
 * VOP_SETATTR: validate and translate FreeBSD chflags(2) bits (va_flags)
 * into ZFS extended attributes, enforce the system-flag security policy,
 * then hand everything to zfs_setattr().
 */
static int
zfs_freebsd_setattr(struct vop_setattr_args *ap)
{
    vnode_t *vp = ap->a_vp;
    vattr_t *vap = ap->a_vap;
    cred_t *cred = ap->a_cred;
    xvattr_t xvap;
    u_long fflags;
    uint64_t zflags;

    vattr_init_mask(vap);
    vap->va_mask &= ~AT_NOSET;

    xva_init(&xvap);
    xvap.xva_vattr = *vap;

    zflags = VTOZ(vp)->z_pflags;

    if (vap->va_flags != VNOVAL) {
        zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs;
        int error;

        /* Extended attributes require FUID support on the dataset. */
        if (zfsvfs->z_use_fuids == B_FALSE)
            return (EOPNOTSUPP);

        fflags = vap->va_flags;
        /*
         * XXX KDM
         * We need to figure out whether it makes sense to allow
         * UF_REPARSE through, since we don't really have other
         * facilities to handle reparse points and zfs_setattr()
         * doesn't currently allow setting that attribute anyway.
         */
        if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_ARCHIVE|
            UF_NODUMP|UF_SYSTEM|UF_HIDDEN|UF_READONLY|UF_REPARSE|
            UF_OFFLINE|UF_SPARSE)) != 0)
            return (EOPNOTSUPP);
        /*
         * Unprivileged processes are not permitted to unset system
         * flags, or modify flags if any system flags are set.
         * Privileged non-jail processes may not modify system flags
         * if securelevel > 0 and any existing system flags are set.
         * Privileged jail processes behave like privileged non-jail
         * processes if the PR_ALLOW_CHFLAGS permission bit is set;
         * otherwise, they behave like unprivileged processes.
         */
        if (secpolicy_fs_owner(vp->v_mount, cred) == 0 ||
            priv_check_cred(cred, PRIV_VFS_SYSFLAGS) == 0) {
            if (zflags &
                (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
                error = securelevel_gt(cred, 0);
                if (error != 0)
                    return (error);
            }
        } else {
            /*
             * Callers may only modify the file flags on objects they
             * have VADMIN rights for.
             */
            if ((error = VOP_ACCESS(vp, VADMIN, cred, curthread)) != 0)
                return (error);
            if (zflags &
                (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
                return (EPERM);
            }
            if (fflags &
                (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) {
                return (EPERM);
            }
        }

        /* Request a change only for bits that actually differ. */
#define FLAG_CHANGE(fflag, zflag, xflag, xfield)    do {        \
    if (((fflags & (fflag)) && !(zflags & (zflag))) ||      \
        ((zflags & (zflag)) && !(fflags & (fflag)))) {      \
        XVA_SET_REQ(&xvap, (xflag));                \
        (xfield) = ((fflags & (fflag)) != 0);           \
    }                               \
} while (0)
        /* Convert chflags into ZFS-type flags. */
        /* XXX: what about SF_SETTABLE?. */
        FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE,
            xvap.xva_xoptattrs.xoa_immutable);
        FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY,
            xvap.xva_xoptattrs.xoa_appendonly);
        FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK,
            xvap.xva_xoptattrs.xoa_nounlink);
        FLAG_CHANGE(UF_ARCHIVE, ZFS_ARCHIVE, XAT_ARCHIVE,
            xvap.xva_xoptattrs.xoa_archive);
        FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP,
            xvap.xva_xoptattrs.xoa_nodump);
        FLAG_CHANGE(UF_READONLY, ZFS_READONLY, XAT_READONLY,
            xvap.xva_xoptattrs.xoa_readonly);
        FLAG_CHANGE(UF_SYSTEM, ZFS_SYSTEM, XAT_SYSTEM,
            xvap.xva_xoptattrs.xoa_system);
        FLAG_CHANGE(UF_HIDDEN, ZFS_HIDDEN, XAT_HIDDEN,
            xvap.xva_xoptattrs.xoa_hidden);
        FLAG_CHANGE(UF_REPARSE, ZFS_REPARSE, XAT_REPARSE,
            xvap.xva_xoptattrs.xoa_reparse);
        FLAG_CHANGE(UF_OFFLINE, ZFS_OFFLINE, XAT_OFFLINE,
            xvap.xva_xoptattrs.xoa_offline);
        FLAG_CHANGE(UF_SPARSE, ZFS_SPARSE, XAT_SPARSE,
            xvap.xva_xoptattrs.xoa_sparse);
#undef  FLAG_CHANGE
    }
    /* A birthtime update is carried as the XAT_CREATETIME xattr. */
    if (vap->va_birthtime.tv_sec != VNOVAL) {
        xvap.xva_vattr.va_mask |= AT_XVATTR;
        XVA_SET_REQ(&xvap, XAT_CREATETIME);
    }
    return (zfs_setattr(vp, (vattr_t *)&xvap, 0, cred, NULL));
}
/*
 * VOP_RENAME: forward to zfs_rename() and drop the references that the
 * VFS rename protocol hands us.
 */
static int
zfs_freebsd_rename(struct vop_rename_args *ap)
{
    vnode_t *fdvp = ap->a_fdvp;
    vnode_t *fvp = ap->a_fvp;
    vnode_t *tdvp = ap->a_tdvp;
    vnode_t *tvp = ap->a_tvp;
    int error;

    ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART));
    ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART));

    error = zfs_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp,
        ap->a_tcnp, ap->a_fcnp->cn_cred);

    /* Release the four vnode references taken by the caller. */
    vrele(fdvp);
    vrele(fvp);
    vrele(tdvp);
    if (tvp != NULL)
        vrele(tvp);

    return (error);
}
/*
 * VOP_SYMLINK: fill in the vnode type (the syscall only sets va_mode)
 * and forward to zfs_symlink().
 */
static int
zfs_freebsd_symlink(struct vop_symlink_args *ap)
{
    struct componentname *cnp = ap->a_cnp;
    vattr_t *vap = ap->a_vap;

    ASSERT(cnp->cn_flags & SAVENAME);

    vap->va_type = VLNK;    /* FreeBSD: Syscall only sets va_mode. */
    vattr_init_mask(vap);

    return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap,
        __DECONST(char *, ap->a_target), cnp->cn_cred, cnp->cn_thread));
}
/*
 * VOP_READLINK: forward to zfs_readlink().
 */
static int
zfs_freebsd_readlink(struct vop_readlink_args *ap)
{

    return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL));
}
/*
 * VOP_LINK: refuse cross-mount links, then forward to zfs_link().
 */
static int
zfs_freebsd_link(struct vop_link_args *ap)
{
    struct componentname *cnp = ap->a_cnp;
    vnode_t *vp = ap->a_vp;
    vnode_t *tdvp = ap->a_tdvp;

    if (tdvp->v_mount != vp->v_mount)
        return (EXDEV);

    ASSERT(cnp->cn_flags & SAVENAME);

    return (zfs_link(tdvp, vp, cnp->cn_nameptr, cnp->cn_cred, NULL, 0));
}
/*
 * VOP_INACTIVE: forward to zfs_inactive() with the thread's credentials.
 */
static int
zfs_freebsd_inactive(struct vop_inactive_args *ap)
{
    vnode_t *vp = ap->a_vp;

    zfs_inactive(vp, ap->a_td->td_ucred, NULL);

    return (0);
}
/*
 * VOP_RECLAIM: detach the znode from the vnode and free or deactivate it.
 */
static int
zfs_freebsd_reclaim(struct vop_reclaim_args *ap)
{
    vnode_t *vp = ap->a_vp;
    znode_t *zp = VTOZ(vp);
    zfsvfs_t *zfsvfs;

    /*
     * Assert before dereferencing zp; the old code read zp->z_zfsvfs
     * first, which made the assertion useless.
     */
    ASSERT(zp != NULL);
    zfsvfs = zp->z_zfsvfs;

    /*
     * z_teardown_inactive_lock protects from a race with
     * zfs_znode_dmu_fini in zfsvfs_teardown during
     * force unmount.
     */
    rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
    if (zp->z_sa_hdl == NULL)
        zfs_znode_free(zp);
    else
        zfs_zinactive(zp);
    rw_exit(&zfsvfs->z_teardown_inactive_lock);

    vp->v_data = NULL;
    return (0);
}
/*
 * VOP_FID: forward to zfs_fid(); struct fid and fid_t are layout-compatible.
 */
static int
zfs_freebsd_fid(struct vop_fid_args *ap)
{

    return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL));
}
static int
zfs_freebsd_pathconf(ap)
struct vop_pathconf_args /* {
struct vnode *a_vp;
int a_name;
register_t *a_retval;
} */ *ap;
{
ulong_t val;
int error;
error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL);
if (error == 0) {
*ap->a_retval = val;
return (error);
}
if (error != EOPNOTSUPP)
return (error);
switch (ap->a_name) {
case _PC_NAME_MAX:
*ap->a_retval = NAME_MAX;
return (0);
case _PC_PIPE_BUF:
if (ap->a_vp->v_type == VDIR || ap->a_vp->v_type == VFIFO) {
*ap->a_retval = PIPE_BUF;
return (0);
}
return (EINVAL);
default:
return (vop_stdpathconf(ap));
}
}
/*
* FreeBSD's extended attributes namespace defines file name prefix for ZFS'
* extended attribute name:
*
* NAMESPACE PREFIX
* system freebsd:system:
* user (none, can be used to access ZFS fsattr(5) attributes
* created on Solaris)
*/
/*
 * Build the on-disk file name used to store an extended attribute:
 * validate 'name' for the given namespace, then write the (possibly
 * prefixed) result into attrname[size].  Returns 0, EINVAL, or
 * ENAMETOOLONG.
 */
static int
zfs_create_attrname(int attrnamespace, const char *name, char *attrname,
    size_t size)
{
    const char *namespace, *prefix, *suffix;

    /* We don't allow '/' character in attribute name. */
    if (strchr(name, '/') != NULL)
        return (EINVAL);
    /* We don't allow attribute names that start with "freebsd:" string. */
    if (strncmp(name, "freebsd:", 8) == 0)
        return (EINVAL);

    bzero(attrname, size);

    switch (attrnamespace) {
    case EXTATTR_NAMESPACE_USER:
#if 0
        prefix = "freebsd:";
        namespace = EXTATTR_NAMESPACE_USER_STRING;
        suffix = ":";
#else
        /*
         * This is the default namespace by which we can access all
         * attributes created on Solaris.
         */
        prefix = namespace = suffix = "";
#endif
        break;
    case EXTATTR_NAMESPACE_SYSTEM:
        prefix = "freebsd:";
        namespace = EXTATTR_NAMESPACE_SYSTEM_STRING;
        suffix = ":";
        break;
    case EXTATTR_NAMESPACE_EMPTY:
    default:
        return (EINVAL);
    }
    /* snprintf's int return promotes to size_t; >= size means truncated. */
    if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix,
        name) >= size) {
        return (ENAMETOOLONG);
    }
    return (0);
}
/*
 * Vnode operation to retrieve a named extended attribute.
 */
static int
zfs_getextattr(struct vop_getextattr_args *ap)
/*
vop_getextattr {
    IN struct vnode *a_vp;
    IN int a_attrnamespace;
    IN const char *a_name;
    INOUT struct uio *a_uio;
    OUT size_t *a_size;
    IN struct ucred *a_cred;
    IN struct thread *a_td;
};
*/
{
    zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
    struct thread *td = ap->a_td;
    struct nameidata nd;
    char attrname[255];
    struct vattr va;
    vnode_t *xvp = NULL, *vp;
    int error, flags;

    error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
        ap->a_cred, ap->a_td, VREAD);
    if (error != 0)
        return (error);

    /* Map (namespace, name) to the backing file's name. */
    error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
        sizeof(attrname));
    if (error != 0)
        return (error);

    ZFS_ENTER(zfsvfs);

    /* Find the hidden extended-attribute directory of the file. */
    error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
        LOOKUP_XATTR);
    if (error != 0) {
        ZFS_EXIT(zfsvfs);
        return (error);
    }

    flags = FREAD;
    NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
        xvp, td);
    error = vn_open_cred(&nd, &flags, 0, 0, ap->a_cred, NULL);
    vp = nd.ni_vp;
    NDFREE(&nd, NDF_ONLY_PNBUF);
    if (error != 0) {
        ZFS_EXIT(zfsvfs);
        /* A missing backing file means the attribute does not exist. */
        if (error == ENOENT)
            error = ENOATTR;
        return (error);
    }

    /* Size query and data read are mutually exclusive per extattr(9). */
    if (ap->a_size != NULL) {
        error = VOP_GETATTR(vp, &va, ap->a_cred);
        if (error == 0)
            *ap->a_size = (size_t)va.va_size;
    } else if (ap->a_uio != NULL)
        error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred);

    VOP_UNLOCK(vp, 0);
    vn_close(vp, flags, ap->a_cred, td);
    ZFS_EXIT(zfsvfs);

    return (error);
}
/*
 * Vnode operation to remove a named attribute.
 */
int
zfs_deleteextattr(struct vop_deleteextattr_args *ap)
/*
vop_deleteextattr {
    IN struct vnode *a_vp;
    IN int a_attrnamespace;
    IN const char *a_name;
    IN struct ucred *a_cred;
    IN struct thread *a_td;
};
*/
{
    zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
    struct thread *td = ap->a_td;
    struct nameidata nd;
    char attrname[255];
    vnode_t *xvp = NULL, *vp;
    int error;

    error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
        ap->a_cred, ap->a_td, VWRITE);
    if (error != 0)
        return (error);

    /* Map (namespace, name) to the backing file's name. */
    error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
        sizeof(attrname));
    if (error != 0)
        return (error);

    ZFS_ENTER(zfsvfs);

    /* Find the hidden extended-attribute directory of the file. */
    error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
        LOOKUP_XATTR);
    if (error != 0) {
        ZFS_EXIT(zfsvfs);
        return (error);
    }

    NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF,
        UIO_SYSSPACE, attrname, xvp, td);
    error = namei(&nd);
    vp = nd.ni_vp;
    if (error != 0) {
        ZFS_EXIT(zfsvfs);
        NDFREE(&nd, NDF_ONLY_PNBUF);
        /* A missing backing file means the attribute does not exist. */
        if (error == ENOENT)
            error = ENOATTR;
        return (error);
    }

    error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
    NDFREE(&nd, NDF_ONLY_PNBUF);

    vput(nd.ni_dvp);
    if (vp == nd.ni_dvp)
        vrele(vp);
    else
        vput(vp);
    ZFS_EXIT(zfsvfs);

    return (error);
}
/*
* Vnode operation to set a named attribute.
*/
static int
zfs_setextattr(struct vop_setextattr_args *ap)
/*
vop_setextattr {
IN struct vnode *a_vp;
IN int a_attrnamespace;
IN const char *a_name;
INOUT struct uio *a_uio;
IN struct ucred *a_cred;
IN struct thread *a_td;
};
*/
{
zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
struct thread *td = ap->a_td;
struct nameidata nd;
char attrname[255];
struct vattr va;
vnode_t *xvp = NULL, *vp;
int error, flags;
error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
ap->a_cred, ap->a_td, VWRITE);
if (error != 0)
return (error);
error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
sizeof(attrname));
if (error != 0)
return (error);
ZFS_ENTER(zfsvfs);
error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
LOOKUP_XATTR | CREATE_XATTR_DIR);
if (error != 0) {
ZFS_EXIT(zfsvfs);
return (error);
}
flags = FFLAGS(O_WRONLY | O_CREAT);
NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
xvp, td);
error = vn_open_cred(&nd, &flags, 0600, 0, ap->a_cred, NULL);
vp = nd.ni_vp;
NDFREE(&nd, NDF_ONLY_PNBUF);
if (error != 0) {
ZFS_EXIT(zfsvfs);
return (error);
}
VATTR_NULL(&va);
va.va_size = 0;
error = VOP_SETATTR(vp, &va, ap->a_cred);
if (error == 0)
VOP_WRITE(vp, ap->a_uio, IO_UNIT, ap->a_cred);
VOP_UNLOCK(vp, 0);
vn_close(vp, flags, ap->a_cred, td);
ZFS_EXIT(zfsvfs);
return (error);
}
/*
 * Vnode operation to retrieve extended attributes on a vnode.
 */
static int
zfs_listextattr(struct vop_listextattr_args *ap)
/*
vop_listextattr {
    IN struct vnode *a_vp;
    IN int a_attrnamespace;
    INOUT struct uio *a_uio;
    OUT size_t *a_size;
    IN struct ucred *a_cred;
    IN struct thread *a_td;
};
*/
{
    zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
    struct thread *td = ap->a_td;
    struct nameidata nd;
    char attrprefix[16];
    u_char dirbuf[sizeof(struct dirent)];
    struct dirent *dp;
    struct iovec aiov;
    struct uio auio, *uio = ap->a_uio;
    size_t *sizep = ap->a_size;
    size_t plen;
    vnode_t *xvp = NULL, *vp;
    int done, error, eof, pos;

    error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
        ap->a_cred, ap->a_td, VREAD);
    if (error != 0)
        return (error);

    /* Empty name yields just the namespace prefix to match against. */
    error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix,
        sizeof(attrprefix));
    if (error != 0)
        return (error);
    plen = strlen(attrprefix);

    ZFS_ENTER(zfsvfs);

    if (sizep != NULL)
        *sizep = 0;

    /* Find the hidden extended-attribute directory of the file. */
    error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
        LOOKUP_XATTR);
    if (error != 0) {
        ZFS_EXIT(zfsvfs);
        /*
         * ENOATTR means that the EA directory does not yet exist,
         * i.e. there are no extended attributes there.
         */
        if (error == ENOATTR)
            error = 0;
        return (error);
    }

    NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED,
        UIO_SYSSPACE, ".", xvp, td);
    error = namei(&nd);
    vp = nd.ni_vp;
    NDFREE(&nd, NDF_ONLY_PNBUF);
    if (error != 0) {
        ZFS_EXIT(zfsvfs);
        return (error);
    }

    /* Scan the directory one dirent buffer at a time. */
    auio.uio_iov = &aiov;
    auio.uio_iovcnt = 1;
    auio.uio_segflg = UIO_SYSSPACE;
    auio.uio_td = td;
    auio.uio_rw = UIO_READ;
    auio.uio_offset = 0;

    do {
        u_char nlen;

        aiov.iov_base = (void *)dirbuf;
        aiov.iov_len = sizeof(dirbuf);
        auio.uio_resid = sizeof(dirbuf);
        error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL);
        done = sizeof(dirbuf) - auio.uio_resid;
        if (error != 0)
            break;
        for (pos = 0; pos < done;) {
            dp = (struct dirent *)(dirbuf + pos);
            pos += dp->d_reclen;
            /*
             * XXX: Temporarily we also accept DT_UNKNOWN, as this
             * is what we get when attribute was created on Solaris.
             */
            if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN)
                continue;
            /* Hide other namespaces' prefixed names from "user". */
            if (plen == 0 && strncmp(dp->d_name, "freebsd:", 8) == 0)
                continue;
            else if (strncmp(dp->d_name, attrprefix, plen) != 0)
                continue;
            nlen = dp->d_namlen - plen;
            if (sizep != NULL)
                *sizep += 1 + nlen;
            else if (uio != NULL) {
                /*
                 * Format of extattr name entry is one byte for
                 * length and the rest for name.
                 */
                error = uiomove(&nlen, 1, uio->uio_rw, uio);
                if (error == 0) {
                    error = uiomove(dp->d_name + plen, nlen,
                        uio->uio_rw, uio);
                }
                if (error != 0)
                    break;
            }
        }
    } while (!eof && error == 0);

    vput(vp);
    ZFS_EXIT(zfsvfs);

    return (error);
}
int
zfs_freebsd_getacl(ap)
struct vop_getacl_args /* {
struct vnode *vp;
acl_type_t type;
struct acl *aclp;
struct ucred *cred;
struct thread *td;
} */ *ap;
{
int error;
vsecattr_t vsecattr;
if (ap->a_type != ACL_TYPE_NFS4)
return (EINVAL);
vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT;
if (error = zfs_getsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL))
return (error);
error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, vsecattr.vsa_aclcnt);
if (vsecattr.vsa_aclentp != NULL)
kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz);
return (error);
}
/*
 * VOP_SETACL: validate a FreeBSD NFSv4 ACL, convert it to the Solaris
 * ace_t representation and install it via zfs_setsecattr().
 */
int
zfs_freebsd_setacl(struct vop_setacl_args *ap)
{
    vsecattr_t vsecattr;
    aclent_t *aaclp;
    int aclbsize;   /* size of acl list in bytes */
    int error;

    if (ap->a_type != ACL_TYPE_NFS4)
        return (EINVAL);
    if (ap->a_aclp == NULL)
        return (EINVAL);
    if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES)
        return (EINVAL);

    /*
     * With NFSv4 ACLs, chmod(2) may need to add additional entries,
     * splitting every entry into two and appending "canonical six"
     * entries at the end.  Don't allow for setting an ACL that would
     * cause chmod(2) to run out of ACL entries.
     */
    if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES)
        return (ENOSPC);

    error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR);
    if (error != 0)
        return (error);

    /* Convert to the Solaris-style ace_t array. */
    vsecattr.vsa_mask = VSA_ACE;
    aclbsize = ap->a_aclp->acl_cnt * sizeof(ace_t);
    vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP);
    aaclp = vsecattr.vsa_aclentp;
    vsecattr.vsa_aclentsz = aclbsize;
    aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp);

    error = zfs_setsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL);
    kmem_free(aaclp, aclbsize);

    return (error);
}
/*
 * VOP_ACLCHECK is not implemented by ZFS; ACL validation happens in
 * zfs_freebsd_setacl() via acl_nfs4_check() instead.
 */
int
zfs_freebsd_aclcheck(struct vop_aclcheck_args *ap)
{

	return (EOPNOTSUPP);
}
/*
 * VOP_VPTOCNP: translate a vnode into its name and parent vnode, used
 * for reverse name lookups (e.g. vn_fullpath(9)).  On success the name
 * is copied into the tail of ap->a_buf and *ap->a_buflen is reduced by
 * its length; *ap->a_vpp receives the parent vnode.
 *
 * For the root of a snapshot mounted under .zfs the operation is
 * forwarded to the vnode covered by the mount.
 */
static int
zfs_vptocnp(struct vop_vptocnp_args *ap)
{
	vnode_t *covered_vp;
	vnode_t *vp = ap->a_vp;	/* fixed: stray ";;" removed */
	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
	znode_t *zp = VTOZ(vp);
	enum vgetstate vs;
	int ltype;
	int error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/*
	 * If we are a snapshot mounted under .zfs, run the operation
	 * on the covered vnode.
	 */
	if (zp->z_id != zfsvfs->z_root || zfsvfs->z_parent == zfsvfs) {
		char name[MAXNAMLEN + 1];
		znode_t *dzp;
		size_t len;

		error = zfs_znode_parent_and_name(zp, &dzp, name);
		if (error == 0) {
			len = strlen(name);
			if (*ap->a_buflen < len)
				error = SET_ERROR(ENOMEM);
		}
		if (error == 0) {
			/* Copy the name into the tail of the buffer. */
			*ap->a_buflen -= len;
			bcopy(name, ap->a_buf + *ap->a_buflen, len);
			*ap->a_vpp = ZTOV(dzp);
		}
		ZFS_EXIT(zfsvfs);
		return (error);
	}
	ZFS_EXIT(zfsvfs);

	covered_vp = vp->v_mount->mnt_vnodecovered;
	vs = vget_prep(covered_vp);
	ltype = VOP_ISLOCKED(vp);
	VOP_UNLOCK(vp, 0);
	error = vget_finish(covered_vp, LK_SHARED, vs);
	if (error == 0) {
		error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_cred,
		    ap->a_buf, ap->a_buflen);
		vput(covered_vp);
	}
	vn_lock(vp, ltype | LK_RETRY);
	/* The vnode may have been recycled while it was unlocked. */
	if ((vp->v_iflag & VI_DOOMED) != 0)
		error = SET_ERROR(ENOENT);
	return (error);
}
#ifdef DIAGNOSTIC
/*
 * Diagnostic wrapper around vop_stdlock(): after acquiring the vnode
 * lock, assert that the current thread does not also hold the zfsvfs
 * teardown lock (a lock-order check), except for xattr vnodes and
 * doomed/unmounted vnodes.
 */
static int
zfs_lock(ap)
struct vop_lock1_args /* {
struct vnode *a_vp;
int a_flags;
char *file;
int line;
} */ *ap;
{
vnode_t *vp;
znode_t *zp;
int err;
err = vop_stdlock(ap);
/* Only check when the lock was actually acquired by sleeping. */
if (err == 0 && (ap->a_flags & LK_NOWAIT) == 0) {
vp = ap->a_vp;
zp = vp->v_data;
if (vp->v_mount != NULL && (vp->v_iflag & VI_DOOMED) == 0 &&
zp != NULL && (zp->z_pflags & ZFS_XATTR) == 0)
VERIFY(!RRM_LOCK_HELD(&zp->z_zfsvfs->z_teardown_lock));
}
return (err);
}
#endif
/* Forward declarations of the vnode operation vectors defined below. */
struct vop_vector zfs_vnodeops;
struct vop_vector zfs_fifoops;
struct vop_vector zfs_shareops;
/* Vnode operations for regular ZFS files and directories. */
struct vop_vector zfs_vnodeops = {
.vop_default = &default_vnodeops,
.vop_inactive = zfs_freebsd_inactive,
.vop_reclaim = zfs_freebsd_reclaim,
.vop_access = zfs_freebsd_access,
.vop_allocate = VOP_EINVAL,
.vop_lookup = zfs_cache_lookup,
.vop_cachedlookup = zfs_freebsd_lookup,
.vop_getattr = zfs_freebsd_getattr,
.vop_setattr = zfs_freebsd_setattr,
.vop_create = zfs_freebsd_create,
.vop_mknod = zfs_freebsd_create,
.vop_mkdir = zfs_freebsd_mkdir,
.vop_readdir = zfs_freebsd_readdir,
.vop_fsync = zfs_freebsd_fsync,
.vop_open = zfs_freebsd_open,
.vop_close = zfs_freebsd_close,
.vop_rmdir = zfs_freebsd_rmdir,
.vop_ioctl = zfs_freebsd_ioctl,
.vop_link = zfs_freebsd_link,
.vop_symlink = zfs_freebsd_symlink,
.vop_readlink = zfs_freebsd_readlink,
.vop_read = zfs_freebsd_read,
.vop_write = zfs_freebsd_write,
.vop_remove = zfs_freebsd_remove,
.vop_rename = zfs_freebsd_rename,
.vop_pathconf = zfs_freebsd_pathconf,
.vop_bmap = zfs_freebsd_bmap,
.vop_fid = zfs_freebsd_fid,
.vop_getextattr = zfs_getextattr,
.vop_deleteextattr = zfs_deleteextattr,
.vop_setextattr = zfs_setextattr,
.vop_listextattr = zfs_listextattr,
.vop_getacl = zfs_freebsd_getacl,
.vop_setacl = zfs_freebsd_setacl,
.vop_aclcheck = zfs_freebsd_aclcheck,
.vop_getpages = zfs_freebsd_getpages,
.vop_putpages = zfs_freebsd_putpages,
.vop_vptocnp = zfs_vptocnp,
#ifdef DIAGNOSTIC
/* Lock-order checking wrapper, only built with DIAGNOSTIC kernels. */
.vop_lock1 = zfs_lock,
#endif
};
/*
 * Vnode operations for FIFOs stored on ZFS.  Data I/O goes through the
 * fifo layer (fifo_specops); read/write must never reach ZFS directly.
 */
struct vop_vector zfs_fifoops = {
.vop_default = &fifo_specops,
.vop_fsync = zfs_freebsd_fsync,
.vop_access = zfs_freebsd_access,
.vop_getattr = zfs_freebsd_getattr,
.vop_inactive = zfs_freebsd_inactive,
.vop_read = VOP_PANIC,
.vop_reclaim = zfs_freebsd_reclaim,
.vop_setattr = zfs_freebsd_setattr,
.vop_write = VOP_PANIC,
.vop_pathconf = zfs_freebsd_pathconf,
.vop_fid = zfs_freebsd_fid,
.vop_getacl = zfs_freebsd_getacl,
.vop_setacl = zfs_freebsd_setacl,
.vop_aclcheck = zfs_freebsd_aclcheck,
};
/*
 * special share hidden files vnode operations template
 */
struct vop_vector zfs_shareops = {
.vop_default = &default_vnodeops,
.vop_access = zfs_freebsd_access,
.vop_inactive = zfs_freebsd_inactive,
.vop_reclaim = zfs_freebsd_reclaim,
.vop_fid = zfs_freebsd_fid,
.vop_pathconf = zfs_freebsd_pathconf,
};
Index: head/sys/compat/linuxkpi/common/src/linux_compat.c
===================================================================
--- head/sys/compat/linuxkpi/common/src/linux_compat.c (revision 353538)
+++ head/sys/compat/linuxkpi/common/src/linux_compat.c (revision 353539)
@@ -1,2458 +1,2458 @@
/*-
* Copyright (c) 2010 Isilon Systems, Inc.
* Copyright (c) 2010 iX Systems, Inc.
* Copyright (c) 2010 Panasas, Inc.
* Copyright (c) 2013-2018 Mellanox Technologies, Ltd.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice unmodified, this list of conditions, and the following
* disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include
__FBSDID("$FreeBSD$");
#include "opt_stack.h"
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#if defined(__i386__) || defined(__amd64__)
#include
#endif
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#if defined(__i386__) || defined(__amd64__)
#include
#endif
/* sysctl tree and debug knob for the LinuxKPI compatibility layer. */
SYSCTL_NODE(_compat, OID_AUTO, linuxkpi, CTLFLAG_RW, 0, "LinuxKPI parameters");
int linuxkpi_debug;
SYSCTL_INT(_compat_linuxkpi, OID_AUTO, debug, CTLFLAG_RWTUN,
&linuxkpi_debug, 0, "Set to enable pr_debug() prints. Clear to disable.");
MALLOC_DEFINE(M_KMALLOC, "linux", "Linux kmalloc compat");
/* NOTE(review): the include target below was lost in extraction — restore it. */
#include
/* Undo Linux compat changes. */
#undef RB_ROOT
#undef file
#undef cdev
#define RB_ROOT(head) (head)->rbh_root
static void linux_cdev_deref(struct linux_cdev *ldev);
static struct vm_area_struct *linux_cdev_handle_find(void *handle);
/* Global LinuxKPI state: root kobject/device/class and PCI bookkeeping. */
struct kobject linux_class_root;
struct device linux_root_device;
struct class linux_class_misc;
struct list_head pci_drivers;
struct list_head pci_devices;
spinlock_t pci_lock;
unsigned long linux_timer_hz_mask;
/*
 * Comparison callback for the generated red-black tree code.  The
 * LinuxKPI rb-tree users supply their own comparisons, so reaching
 * this function indicates a bug — panic unconditionally.
 */
int
panic_cmp(struct rb_node *one, struct rb_node *two)
{
panic("no cmp");
}
RB_GENERATE(linux_root, rb_node, __entry, panic_cmp);
/*
 * Set a kobject's name from a printf-style format and va_list.
 * Returns 0 on success or a negative errno.  If the kobject already
 * has a name and fmt is NULL, the existing name is kept.  Any '/'
 * characters in the new name are replaced with '!' (sysfs path safety).
 */
int
kobject_set_name_vargs(struct kobject *kobj, const char *fmt, va_list args)
{
va_list tmp_va;
int len;
char *old;
char *name;
char dummy;
old = kobj->name;
if (old && fmt == NULL)
return (0);
/* compute length of string */
va_copy(tmp_va, args);
len = vsnprintf(&dummy, 0, fmt, tmp_va);
va_end(tmp_va);
/* account for zero termination */
len++;
/* check for error */
if (len < 1)
return (-EINVAL);
/* allocate memory for string */
name = kzalloc(len, GFP_KERNEL);
if (name == NULL)
return (-ENOMEM);
vsnprintf(name, len, fmt, args);
kobj->name = name;
/* free old string */
kfree(old);
/* filter new string */
for (; *name != '\0'; name++)
if (*name == '/')
*name = '!';
return (0);
}
/*
 * Varargs wrapper around kobject_set_name_vargs().
 * Returns 0 on success or a negative errno.
 */
int
kobject_set_name(struct kobject *kobj, const char *fmt, ...)
{
va_list args;
int error;
va_start(args, fmt);
error = kobject_set_name_vargs(kobj, fmt, args);
va_end(args);
return (error);
}
/*
 * Finish adding a named kobject under a parent: create its sysfs
 * directory and populate it with the ktype's default attributes.
 * On attribute-creation failure the directory is removed again.
 */
static int
kobject_add_complete(struct kobject *kobj, struct kobject *parent)
{
const struct kobj_type *t;
int error;
kobj->parent = parent;
error = sysfs_create_dir(kobj);
if (error == 0 && kobj->ktype && kobj->ktype->default_attrs) {
struct attribute **attr;
t = kobj->ktype;
/* Create each default attribute file; stop on the first error. */
for (attr = t->default_attrs; *attr != NULL; attr++) {
error = sysfs_create_file(kobj, *attr);
if (error)
break;
}
/* Roll back the directory if any attribute failed. */
if (error)
sysfs_remove_dir(kobj);
}
return (error);
}
/*
 * Name a kobject from a printf-style format string and add it to the
 * hierarchy under the given parent.  Returns 0 or a negative errno.
 */
int
kobject_add(struct kobject *kobj, struct kobject *parent, const char *fmt, ...)
{
	va_list ap;
	int rc;

	va_start(ap, fmt);
	rc = kobject_set_name_vargs(kobj, fmt, ap);
	va_end(ap);
	if (rc != 0)
		return (rc);

	return (kobject_add_complete(kobj, parent));
}
/*
 * Final-release callback for a kobject's kref: tear down its sysfs
 * directory, run the ktype release hook (which may free the kobject
 * itself), then free the saved name last.
 */
void
linux_kobject_release(struct kref *kref)
{
struct kobject *kobj;
char *name;
kobj = container_of(kref, struct kobject, kref);
sysfs_remove_dir(kobj);
/* Save the name; the release hook may free the kobject. */
name = kobj->name;
if (kobj->ktype && kobj->ktype->release)
kobj->ktype->release(kobj);
kfree(name);
}
/* ktype release hook that simply frees the kobject allocation. */
static void
linux_kobject_kfree(struct kobject *kobj)
{
kfree(kobj);
}
static void
linux_kobject_kfree_name(struct kobject *kobj)
{
if (kobj) {
kfree(kobj->name);
}
}
/* ktype for kobjects that are freed with kfree() on last release. */
const struct kobj_type linux_kfree_type = {
.release = linux_kobject_kfree
};
/* Default release hook for devices created by device_create(). */
static void
linux_device_release(struct device *dev)
{
pr_debug("linux_device_release: %s\n", dev_name(dev));
kfree(dev);
}
/*
 * sysfs "show" dispatcher for class attributes: forward to the
 * class_attribute's show callback, or return -EIO if there is none.
 */
static ssize_t
linux_class_show(struct kobject *kobj, struct attribute *attr, char *buf)
{
struct class_attribute *dattr;
ssize_t error;
dattr = container_of(attr, struct class_attribute, attr);
error = -EIO;
if (dattr->show)
error = dattr->show(container_of(kobj, struct class, kobj),
dattr, buf);
return (error);
}
/*
 * sysfs "store" dispatcher for class attributes: forward to the
 * class_attribute's store callback, or return -EIO if there is none.
 */
static ssize_t
linux_class_store(struct kobject *kobj, struct attribute *attr, const char *buf,
size_t count)
{
struct class_attribute *dattr;
ssize_t error;
dattr = container_of(attr, struct class_attribute, attr);
error = -EIO;
if (dattr->store)
error = dattr->store(container_of(kobj, struct class, kobj),
dattr, buf, count);
return (error);
}
static void
linux_class_release(struct kobject *kobj)
{
struct class *class;
class = container_of(kobj, struct class, kobj);
if (class->class_release)
class->class_release(class);
}
/* sysfs operations and ktype shared by all LinuxKPI classes. */
static const struct sysfs_ops linux_class_sysfs = {
.show = linux_class_show,
.store = linux_class_store,
};
const struct kobj_type linux_class_ktype = {
.release = linux_class_release,
.sysfs_ops = &linux_class_sysfs
};
/*
 * kobject release hook for devices.  Prefer the device's own release
 * callback; fall back to the class dev_release hook.
 */
static void
linux_dev_release(struct kobject *kobj)
{
struct device *dev;
dev = container_of(kobj, struct device, kobj);
/* This is the precedence defined by linux. */
if (dev->release)
dev->release(dev);
else if (dev->class && dev->class->dev_release)
dev->class->dev_release(dev);
}
/*
 * sysfs "show" dispatcher for device attributes: forward to the
 * device_attribute's show callback, or return -EIO if there is none.
 */
static ssize_t
linux_dev_show(struct kobject *kobj, struct attribute *attr, char *buf)
{
struct device_attribute *dattr;
ssize_t error;
dattr = container_of(attr, struct device_attribute, attr);
error = -EIO;
if (dattr->show)
error = dattr->show(container_of(kobj, struct device, kobj),
dattr, buf);
return (error);
}
/*
 * sysfs "store" dispatcher for device attributes: forward to the
 * device_attribute's store callback, or return -EIO if there is none.
 */
static ssize_t
linux_dev_store(struct kobject *kobj, struct attribute *attr, const char *buf,
size_t count)
{
struct device_attribute *dattr;
ssize_t error;
dattr = container_of(attr, struct device_attribute, attr);
error = -EIO;
if (dattr->store)
error = dattr->store(container_of(kobj, struct device, kobj),
dattr, buf, count);
return (error);
}
/* sysfs operations and ktype shared by all LinuxKPI devices. */
static const struct sysfs_ops linux_dev_sysfs = {
.show = linux_dev_show,
.store = linux_dev_store,
};
const struct kobj_type linux_dev_ktype = {
.release = linux_dev_release,
.sysfs_ops = &linux_dev_sysfs
};
/*
 * device_create() - allocate a device, name it from the format string
 * and register it under the given class/parent.  The device's release
 * hook frees it when the last reference is dropped.
 */
struct device *
device_create(struct class *class, struct device *parent, dev_t devt,
void *drvdata, const char *fmt, ...)
{
	struct device *dev;
	va_list args;

	/*
	 * Pass the Linux-style GFP_KERNEL flag that kzalloc() expects,
	 * not the FreeBSD M_WAITOK flag, for consistency with the other
	 * kzalloc() calls in this file.  (The LinuxKPI defines
	 * GFP_KERNEL in terms of M_WAITOK, so behavior is unchanged —
	 * confirm against linux/gfp.h in this tree.)
	 */
	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	dev->parent = parent;
	dev->class = class;
	dev->devt = devt;
	dev->driver_data = drvdata;
	dev->release = linux_device_release;
	va_start(args, fmt);
	kobject_set_name_vargs(&dev->kobj, fmt, args);
	va_end(args);
	device_register(dev);
	return (dev);
}
/*
 * Initialize a kobject with the given ktype, set its name from the
 * format string, and add it under the parent.  Returns 0 or a
 * negative errno.
 */
int
kobject_init_and_add(struct kobject *kobj, const struct kobj_type *ktype,
struct kobject *parent, const char *fmt, ...)
{
va_list args;
int error;
kobject_init(kobj, ktype);
kobj->ktype = ktype;
kobj->parent = parent;
kobj->name = NULL;
va_start(args, fmt);
error = kobject_set_name_vargs(kobj, fmt, args);
va_end(args);
if (error)
return (error);
return kobject_add_complete(kobj, parent);
}
static void
linux_kq_lock(void *arg)
{
spinlock_t *s = arg;
spin_lock(s);
}
static void
linux_kq_unlock(void *arg)
{
spinlock_t *s = arg;
spin_unlock(s);
}
/* knlist assert callback: verify the kqueue spinlock is held. */
static void
linux_kq_lock_owned(void *arg)
{
#ifdef INVARIANTS
spinlock_t *s = arg;
mtx_assert(&s->m, MA_OWNED);
#endif
}
/* knlist assert callback: verify the kqueue spinlock is NOT held. */
static void
linux_kq_lock_unowned(void *arg)
{
#ifdef INVARIANTS
spinlock_t *s = arg;
mtx_assert(&s->m, MA_NOTOWNED);
#endif
}
/* Forward declaration; defined below with the kqueue support code. */
static void
linux_file_kqfilter_poll(struct linux_file *, int);
/*
 * Allocate a zeroed linux_file with one reference and the kqueue
 * knlist/spinlock machinery initialized.
 */
struct linux_file *
linux_file_alloc(void)
{
struct linux_file *filp;
filp = kzalloc(sizeof(*filp), GFP_KERNEL);
/* set initial refcount */
filp->f_count = 1;
/* setup fields needed by kqueue support */
spin_lock_init(&filp->f_kqlock);
knlist_init(&filp->f_selinfo.si_note, &filp->f_kqlock,
linux_kq_lock, linux_kq_unlock,
linux_kq_lock_owned, linux_kq_lock_unowned);
return (filp);
}
/*
 * Free a linux_file.  If it was never attached to a FreeBSD struct
 * file, release its shmem object (if any) and free directly; otherwise
 * drop the file reference and let the file's close path free it.
 */
void
linux_file_free(struct linux_file *filp)
{
if (filp->_file == NULL) {
if (filp->f_shmem != NULL)
vm_object_deallocate(filp->f_shmem);
kfree(filp);
} else {
/*
 * The close method of the character device or file
 * will free the linux_file structure:
 */
_fdrop(filp->_file, curthread);
}
}
/*
 * OBJT_DEVICE pager fault handler for LinuxKPI character-device
 * mappings.  Resolves the faulting offset to a physical address from
 * the VM area's pfn and installs/updates a fictitious page in *mres.
 * Returns VM_PAGER_OK on success, VM_PAGER_FAIL when the mapping has
 * no vm_ops or the offset is out of range.
 */
static int
linux_cdev_pager_fault(vm_object_t vm_obj, vm_ooffset_t offset, int prot,
vm_page_t *mres)
{
struct vm_area_struct *vmap;
vmap = linux_cdev_handle_find(vm_obj->handle);
MPASS(vmap != NULL);
MPASS(vmap->vm_private_data == vm_obj->handle);
if (likely(vmap->vm_ops != NULL && offset < vmap->vm_len)) {
vm_paddr_t paddr = IDX_TO_OFF(vmap->vm_pfn) + offset;
vm_page_t page;
if (((*mres)->flags & PG_FICTITIOUS) != 0) {
/*
 * If the passed in result page is a fake
 * page, update it with the new physical
 * address.
 */
page = *mres;
vm_page_updatefake(page, paddr, vm_obj->memattr);
} else {
/*
 * Replace the passed in "mres" page with our
 * own fake page and free up the all of the
 * original pages.
 */
VM_OBJECT_WUNLOCK(vm_obj);
page = vm_page_getfake(paddr, vm_obj->memattr);
VM_OBJECT_WLOCK(vm_obj);
vm_page_replace_checked(page, vm_obj,
(*mres)->pindex, *mres);
vm_page_free(*mres);
*mres = page;
}
- page->valid = VM_PAGE_BITS_ALL;
+ vm_page_valid(page);
return (VM_PAGER_OK);
}
return (VM_PAGER_FAIL);
}
/*
 * OBJT_MGTDEVICE pager populate handler: run the Linux vm_ops->fault
 * callback for the faulting page index while holding the mmap_sem,
 * then translate the Linux VM_FAULT_* result into a VM_PAGER_* code.
 * On success, *first/*last describe the range of pages the fault
 * handler busied.
 */
static int
linux_cdev_pager_populate(vm_object_t vm_obj, vm_pindex_t pidx, int fault_type,
vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last)
{
struct vm_area_struct *vmap;
int err;
linux_set_current(curthread);
/* get VM area structure */
vmap = linux_cdev_handle_find(vm_obj->handle);
MPASS(vmap != NULL);
MPASS(vmap->vm_private_data == vm_obj->handle);
/* Drop the object lock; fault handlers may sleep. */
VM_OBJECT_WUNLOCK(vm_obj);
down_write(&vmap->vm_mm->mmap_sem);
if (unlikely(vmap->vm_ops == NULL)) {
err = VM_FAULT_SIGBUS;
} else {
struct vm_fault vmf;
/* fill out VM fault structure */
vmf.virtual_address = (void *)(uintptr_t)IDX_TO_OFF(pidx);
vmf.flags = (fault_type & VM_PROT_WRITE) ? FAULT_FLAG_WRITE : 0;
vmf.pgoff = 0;
vmf.page = NULL;
vmf.vma = vmap;
vmap->vm_pfn_count = 0;
vmap->vm_pfn_pcount = &vmap->vm_pfn_count;
vmap->vm_obj = vm_obj;
err = vmap->vm_ops->fault(vmap, &vmf);
/* Retry while the handler reports NOPAGE without producing pages. */
while (vmap->vm_pfn_count == 0 && err == VM_FAULT_NOPAGE) {
kern_yield(PRI_USER);
err = vmap->vm_ops->fault(vmap, &vmf);
}
}
/* translate return code */
switch (err) {
case VM_FAULT_OOM:
err = VM_PAGER_AGAIN;
break;
case VM_FAULT_SIGBUS:
err = VM_PAGER_BAD;
break;
case VM_FAULT_NOPAGE:
/*
 * By contract the fault handler will return having
 * busied all the pages itself. If pidx is already
 * found in the object, it will simply xbusy the first
 * page and return with vm_pfn_count set to 1.
 */
*first = vmap->vm_pfn_first;
*last = *first + vmap->vm_pfn_count - 1;
err = VM_PAGER_OK;
break;
default:
err = VM_PAGER_ERROR;
break;
}
up_write(&vmap->vm_mm->mmap_sem);
VM_OBJECT_WLOCK(vm_obj);
return (err);
}
/* Global list of live LinuxKPI VM areas, protected by linux_vma_lock. */
static struct rwlock linux_vma_lock;
static TAILQ_HEAD(, vm_area_struct) linux_vma_head =
TAILQ_HEAD_INITIALIZER(linux_vma_head);
/*
 * Release a VM area struct: drop its file and mm_struct references
 * and free the structure itself.
 */
static void
linux_cdev_handle_free(struct vm_area_struct *vmap)
{
/* Drop reference on vm_file */
if (vmap->vm_file != NULL)
fput(vmap->vm_file);
/* Drop reference on mm_struct */
mmput(vmap->vm_mm);
kfree(vmap);
}
/* Unlink a VM area struct from the global list under the write lock. */
static void
linux_cdev_handle_remove(struct vm_area_struct *vmap)
{
rw_wlock(&linux_vma_lock);
TAILQ_REMOVE(&linux_vma_head, vmap, vm_entry);
rw_wunlock(&linux_vma_lock);
}
/*
 * Look up the VM area struct whose private data matches the pager
 * handle.  Returns NULL when no entry matches.
 */
static struct vm_area_struct *
linux_cdev_handle_find(void *handle)
{
	struct vm_area_struct *ptr;

	rw_rlock(&linux_vma_lock);
	TAILQ_FOREACH(ptr, &linux_vma_head, vm_entry) {
		if (ptr->vm_private_data == handle)
			break;
	}
	rw_runlock(&linux_vma_lock);
	return (ptr);
}
/* Pager constructor: the handle must already be registered; no color. */
static int
linux_cdev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot,
vm_ooffset_t foff, struct ucred *cred, u_short *color)
{
MPASS(linux_cdev_handle_find(handle) != NULL);
*color = 0;
return (0);
}
/*
 * Pager destructor: unregister the VM area, run the Linux vm_ops
 * close callback under the mmap_sem, and free the structure.
 */
static void
linux_cdev_pager_dtor(void *handle)
{
const struct vm_operations_struct *vm_ops;
struct vm_area_struct *vmap;
vmap = linux_cdev_handle_find(handle);
MPASS(vmap != NULL);
/*
 * Remove handle before calling close operation to prevent
 * other threads from reusing the handle pointer.
 */
linux_cdev_handle_remove(vmap);
down_write(&vmap->vm_mm->mmap_sem);
vm_ops = vmap->vm_ops;
if (likely(vm_ops != NULL))
vm_ops->close(vmap);
up_write(&vmap->vm_mm->mmap_sem);
linux_cdev_handle_free(vmap);
}
/*
 * Pager operations, indexed by object type: slot 0 (OBJT_MGTDEVICE)
 * uses the populate path, slot 1 (OBJT_DEVICE) the fault path.
 */
static struct cdev_pager_ops linux_cdev_pager_ops[2] = {
{
/* OBJT_MGTDEVICE */
.cdev_pg_populate = linux_cdev_pager_populate,
.cdev_pg_ctor = linux_cdev_pager_ctor,
.cdev_pg_dtor = linux_cdev_pager_dtor
},
{
/* OBJT_DEVICE */
.cdev_pg_fault = linux_cdev_pager_fault,
.cdev_pg_ctor = linux_cdev_pager_ctor,
.cdev_pg_dtor = linux_cdev_pager_dtor
},
};
/*
 * Remove all physical mappings of the pages backing [address,
 * address + size) in the VM area's object.  Returns 0 on success or
 * -ENOTSUP for unmanaged/missing objects.
 */
int
zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
unsigned long size)
{
vm_object_t obj;
vm_page_t m;
obj = vma->vm_obj;
if (obj == NULL || (obj->flags & OBJ_UNMANAGED) != 0)
return (-ENOTSUP);
VM_OBJECT_RLOCK(obj);
for (m = vm_page_find_least(obj, OFF_TO_IDX(address));
m != NULL && m->pindex < OFF_TO_IDX(address + size);
m = TAILQ_NEXT(m, listq))
pmap_remove_all(m);
VM_OBJECT_RUNLOCK(obj);
return (0);
}
/*
 * Placeholder device used when a linux_cdev is being destroyed; its
 * empty file_operations make further calls harmless.
 */
static struct file_operations dummy_ldev_ops = {
/* XXXKIB */
};
static struct linux_cdev dummy_ldev = {
.ops = &dummy_ldev_ops,
};
/* siref bit layout: destroy flag in bit 0, reference units above it. */
#define LDEV_SI_DTR 0x0001
#define LDEV_SI_REF 0x0002
/*
 * Acquire a reference on the file's linux_cdev and return its
 * file_operations.  Uses a lock-free fcmpset loop on ldev->siref;
 * if the device is being destroyed (LDEV_SI_DTR set), falls back to
 * the dummy device.  Pair with linux_drop_fop().
 */
static void
linux_get_fop(struct linux_file *filp, const struct file_operations **fop,
struct linux_cdev **dev)
{
struct linux_cdev *ldev;
u_int siref;
ldev = filp->f_cdev;
*fop = filp->f_op;
if (ldev != NULL) {
for (siref = ldev->siref;;) {
if ((siref & LDEV_SI_DTR) != 0) {
/* Device is being torn down; switch to the dummy. */
ldev = &dummy_ldev;
siref = ldev->siref;
*fop = ldev->ops;
MPASS((ldev->siref & LDEV_SI_DTR) == 0);
} else if (atomic_fcmpset_int(&ldev->siref, &siref,
siref + LDEV_SI_REF)) {
break;
}
}
}
*dev = ldev;
}
/* Release the siref reference taken by linux_get_fop(). */
static void
linux_drop_fop(struct linux_cdev *ldev)
{
if (ldev == NULL)
return;
MPASS((ldev->siref & ~LDEV_SI_DTR) != 0);
atomic_subtract_int(&ldev->siref, LDEV_SI_REF);
}
/*
 * OPW: run a file-operation expression with (td)->td_fpop temporarily
 * pointing at the given file, restoring the previous value afterwards,
 * and yield the expression's value.
 */
#define OPW(fp,td,code) ({ \
struct file *__fpop; \
__typeof(code) __retval; \
\
__fpop = (td)->td_fpop; \
(td)->td_fpop = (fp); \
__retval = (code); \
(td)->td_fpop = __fpop; \
__retval; \
})
/*
 * cdevsw fdopen handler: create a linux_file for an open of a LinuxKPI
 * character device, run the driver's open callback and install the
 * file via finit().
 *
 * NOTE(review): the final return (ENXIO) is intentional, not an error
 * path — after finit() the kernel open path accepts the already
 * initialized file when fdopen returns ENXIO.  Confirm against the
 * devfs/kern_openat special-file protocol before changing.
 */
static int
linux_dev_fdopen(struct cdev *dev, int fflags, struct thread *td,
struct file *file)
{
struct linux_cdev *ldev;
struct linux_file *filp;
const struct file_operations *fop;
int error;
ldev = dev->si_drv1;
filp = linux_file_alloc();
filp->f_dentry = &filp->f_dentry_store;
filp->f_op = ldev->ops;
filp->f_mode = file->f_flag;
filp->f_flags = file->f_flag;
filp->f_vnode = file->f_vnode;
filp->_file = file;
refcount_acquire(&ldev->refs);
filp->f_cdev = ldev;
linux_set_current(td);
linux_get_fop(filp, &fop, &ldev);
if (fop->open != NULL) {
error = -fop->open(file->f_vnode, filp);
if (error != 0) {
/* Driver open failed: undo references and free the file. */
linux_drop_fop(ldev);
linux_cdev_deref(filp->f_cdev);
kfree(filp);
return (error);
}
}
/* hold on to the vnode - used for fstat() */
vhold(filp->f_vnode);
/* release the file from devfs */
finit(file, filp->f_mode, DTYPE_DEV, filp, &linuxfileops);
linux_drop_fop(ldev);
return (ENXIO);
}
/* Fake user-space address window used to pass kernel ioctl buffers. */
#define LINUX_IOCTL_MIN_PTR 0x10000UL
#define LINUX_IOCTL_MAX_PTR (LINUX_IOCTL_MIN_PTR + IOCPARM_MAX)
/*
 * If *uaddr falls inside the fake ioctl address window, rewrite it to
 * point into the current task's kernel-side ioctl buffer and return 1
 * (with *uaddr set to NULL on bounds/task failure).  Otherwise return
 * 0 and leave the address untouched for a normal copyin/copyout.
 */
static inline int
linux_remap_address(void **uaddr, size_t len)
{
uintptr_t uaddr_val = (uintptr_t)(*uaddr);
if (unlikely(uaddr_val >= LINUX_IOCTL_MIN_PTR &&
uaddr_val < LINUX_IOCTL_MAX_PTR)) {
struct task_struct *pts = current;
if (pts == NULL) {
*uaddr = NULL;
return (1);
}
/* compute data offset */
uaddr_val -= LINUX_IOCTL_MIN_PTR;
/* check that length is within bounds */
if ((len > IOCPARM_MAX) ||
(uaddr_val + len) > pts->bsd_ioctl_len) {
*uaddr = NULL;
return (1);
}
/* re-add kernel buffer address */
uaddr_val += (uintptr_t)pts->bsd_ioctl_data;
/* update address location */
*uaddr = (void *)uaddr_val;
return (1);
}
return (0);
}
/*
 * Linux-style copy_from_user(): handles remapped ioctl buffers with a
 * direct memcpy, otherwise falls back to copyin().  Returns 0 or a
 * negative errno.
 */
int
linux_copyin(const void *uaddr, void *kaddr, size_t len)
{
if (linux_remap_address(__DECONST(void **, &uaddr), len)) {
if (uaddr == NULL)
return (-EFAULT);
memcpy(kaddr, uaddr, len);
return (0);
}
return (-copyin(uaddr, kaddr, len));
}
/*
 * Linux-style copy_to_user(): handles remapped ioctl buffers with a
 * direct memcpy, otherwise falls back to copyout().  Returns 0 or a
 * negative errno.
 */
int
linux_copyout(const void *kaddr, void *uaddr, size_t len)
{
if (linux_remap_address(&uaddr, len)) {
if (uaddr == NULL)
return (-EFAULT);
memcpy(uaddr, kaddr, len);
return (0);
}
return (-copyout(kaddr, uaddr, len));
}
/*
 * Zero a userspace range byte-safely.  Returns 0 on success, or the
 * original length on any fault (matching Linux clear_user semantics).
 */
size_t
linux_clear_user(void *_uaddr, size_t _len)
{
uint8_t *uaddr = _uaddr;
size_t len = _len;
/* make sure uaddr is aligned before going into the fast loop */
while (((uintptr_t)uaddr & 7) != 0 && len > 7) {
if (subyte(uaddr, 0))
return (_len);
uaddr++;
len--;
}
/* zero 8 bytes at a time */
while (len > 7) {
#ifdef __LP64__
if (suword64(uaddr, 0))
return (_len);
#else
if (suword32(uaddr, 0))
return (_len);
if (suword32(uaddr + 4, 0))
return (_len);
#endif
uaddr += 8;
len -= 8;
}
/* zero fill end, if any */
while (len > 0) {
if (subyte(uaddr, 0))
return (_len);
uaddr++;
len--;
}
return (0);
}
/*
 * Check that [uaddr, uaddr + len) is a valid userspace range.  An
 * empty range is always accepted; otherwise the range must not wrap
 * and must end at or below VM_MAXUSER_ADDRESS.
 */
int
linux_access_ok(const void *uaddr, size_t len)
{
	uintptr_t start, end;

	start = (uintptr_t)uaddr;
	end = start + len;
	if (start == end)
		return (1);
	return (end > start && end <= VM_MAXUSER_ADDRESS);
}
/*
* This function should return either EINTR or ERESTART depending on
* the signal type sent to this thread:
*/
static int
linux_get_error(struct task_struct *task, int error)
{
/* check for signal type interrupt code */
if (error == EINTR || error == ERESTARTSYS || error == ERESTART) {
error = -linux_schedule_get_interrupt_value(task);
if (error == 0)
error = EINTR;
}
return (error);
}
/*
 * Dispatch an ioctl to a Linux driver.  For parameterized commands the
 * kernel buffer is published through the fake LINUX_IOCTL_MIN_PTR
 * window (consumed by linux_copyin/linux_copyout); zero-size commands
 * pass the raw user pointer through.  On amd64, 32-bit processes try
 * the compat_ioctl handler first.
 */
static int
linux_file_ioctl_sub(struct file *fp, struct linux_file *filp,
const struct file_operations *fop, u_long cmd, caddr_t data,
struct thread *td)
{
struct task_struct *task = current;
unsigned size;
int error;
size = IOCPARM_LEN(cmd);
/* refer to logic in sys_ioctl() */
if (size > 0) {
/*
 * Setup hint for linux_copyin() and linux_copyout().
 *
 * Background: Linux code expects a user-space address
 * while FreeBSD supplies a kernel-space address.
 */
task->bsd_ioctl_data = data;
task->bsd_ioctl_len = size;
data = (void *)LINUX_IOCTL_MIN_PTR;
} else {
/* fetch user-space pointer */
data = *(void **)data;
}
#if defined(__amd64__)
if (td->td_proc->p_elf_machine == EM_386) {
/* try the compat IOCTL handler first */
if (fop->compat_ioctl != NULL) {
error = -OPW(fp, td, fop->compat_ioctl(filp,
cmd, (u_long)data));
} else {
error = ENOTTY;
}
/* fallback to the regular IOCTL handler, if any */
if (error == ENOTTY && fop->unlocked_ioctl != NULL) {
error = -OPW(fp, td, fop->unlocked_ioctl(filp,
cmd, (u_long)data));
}
} else
#endif
{
if (fop->unlocked_ioctl != NULL) {
error = -OPW(fp, td, fop->unlocked_ioctl(filp,
cmd, (u_long)data));
} else {
error = ENOTTY;
}
}
/* Clear the copyin/copyout hint again. */
if (size > 0) {
task->bsd_ioctl_data = NULL;
task->bsd_ioctl_len = 0;
}
if (error == EWOULDBLOCK) {
/* update kqfilter status, if any */
linux_file_kqfilter_poll(filp,
LINUX_KQ_FLAG_HAS_READ | LINUX_KQ_FLAG_HAS_WRITE);
} else {
error = linux_get_error(task, error);
}
return (error);
}
/* Sentinel poll_table value used by the select/poll path. */
#define LINUX_POLL_TABLE_NORMAL ((poll_table *)1)
/*
 * This function atomically updates the poll wakeup state and returns
 * the previous state at the time of update.
 */
static uint8_t
linux_poll_wakeup_state(atomic_t *v, const uint8_t *pstate)
{
int c, old;
/* CAS loop: map the current state through the transition table. */
c = v->counter;
while ((old = atomic_cmpxchg(v, c, pstate[c])) != c)
c = old;
return (c);
}
/*
 * Wait-queue callback installed by linux_poll_wait(): transition
 * QUEUED -> READY and wake up the pollers; all other states are no-ops.
 */
static int
linux_poll_wakeup_callback(wait_queue_t *wq, unsigned int wq_state, int flags, void *key)
{
static const uint8_t state[LINUX_FWQ_STATE_MAX] = {
[LINUX_FWQ_STATE_INIT] = LINUX_FWQ_STATE_INIT, /* NOP */
[LINUX_FWQ_STATE_NOT_READY] = LINUX_FWQ_STATE_NOT_READY, /* NOP */
[LINUX_FWQ_STATE_QUEUED] = LINUX_FWQ_STATE_READY,
[LINUX_FWQ_STATE_READY] = LINUX_FWQ_STATE_READY, /* NOP */
};
struct linux_file *filp = container_of(wq, struct linux_file, f_wait_queue.wq);
switch (linux_poll_wakeup_state(&filp->f_wait_queue.state, state)) {
case LINUX_FWQ_STATE_QUEUED:
linux_poll_wakeup(filp);
return (1);
default:
return (0);
}
}
/*
 * Linux poll_wait(): record the caller in the selinfo (when invoked
 * from select/poll) and, on the first call, attach the file's wakeup
 * callback to the driver's wait queue.
 */
void
linux_poll_wait(struct linux_file *filp, wait_queue_head_t *wqh, poll_table *p)
{
static const uint8_t state[LINUX_FWQ_STATE_MAX] = {
[LINUX_FWQ_STATE_INIT] = LINUX_FWQ_STATE_NOT_READY,
[LINUX_FWQ_STATE_NOT_READY] = LINUX_FWQ_STATE_NOT_READY, /* NOP */
[LINUX_FWQ_STATE_QUEUED] = LINUX_FWQ_STATE_QUEUED, /* NOP */
[LINUX_FWQ_STATE_READY] = LINUX_FWQ_STATE_QUEUED,
};
/* check if we are called inside the select system call */
if (p == LINUX_POLL_TABLE_NORMAL)
selrecord(curthread, &filp->f_selinfo);
switch (linux_poll_wakeup_state(&filp->f_wait_queue.state, state)) {
case LINUX_FWQ_STATE_INIT:
/* NOTE: file handles can only belong to one wait-queue */
filp->f_wait_queue.wqh = wqh;
filp->f_wait_queue.wq.func = &linux_poll_wakeup_callback;
add_wait_queue(wqh, &filp->f_wait_queue.wq);
atomic_set(&filp->f_wait_queue.state, LINUX_FWQ_STATE_QUEUED);
break;
default:
break;
}
}
/*
 * Detach the file from its wait queue (if attached), resetting the
 * wakeup state to INIT and draining any pending select records.
 */
static void
linux_poll_wait_dequeue(struct linux_file *filp)
{
static const uint8_t state[LINUX_FWQ_STATE_MAX] = {
[LINUX_FWQ_STATE_INIT] = LINUX_FWQ_STATE_INIT, /* NOP */
[LINUX_FWQ_STATE_NOT_READY] = LINUX_FWQ_STATE_INIT,
[LINUX_FWQ_STATE_QUEUED] = LINUX_FWQ_STATE_INIT,
[LINUX_FWQ_STATE_READY] = LINUX_FWQ_STATE_INIT,
};
seldrain(&filp->f_selinfo);
switch (linux_poll_wakeup_state(&filp->f_wait_queue.state, state)) {
case LINUX_FWQ_STATE_NOT_READY:
case LINUX_FWQ_STATE_QUEUED:
case LINUX_FWQ_STATE_READY:
remove_wait_queue(filp->f_wait_queue.wqh, &filp->f_wait_queue.wq);
break;
default:
break;
}
}
/*
 * Wake up all select/poll and kqueue waiters on the file, marking
 * both read and write events as pending.
 */
void
linux_poll_wakeup(struct linux_file *filp)
{
/* this function should be NULL-safe */
if (filp == NULL)
return;
selwakeup(&filp->f_selinfo);
spin_lock(&filp->f_kqlock);
filp->f_kqflags |= LINUX_KQ_FLAG_NEED_READ |
LINUX_KQ_FLAG_NEED_WRITE;
/* make sure the "knote" gets woken up */
KNOTE_LOCKED(&filp->f_selinfo.si_note, 1);
spin_unlock(&filp->f_kqlock);
}
/* kqueue detach: remove the knote from the file's knlist. */
static void
linux_file_kqfilter_detach(struct knote *kn)
{
struct linux_file *filp = kn->kn_hook;
spin_lock(&filp->f_kqlock);
knlist_remove(&filp->f_selinfo.si_note, kn, 1);
spin_unlock(&filp->f_kqlock);
}
/* kqueue event check: is a read event pending?  Called locked. */
static int
linux_file_kqfilter_read_event(struct knote *kn, long hint)
{
struct linux_file *filp = kn->kn_hook;
mtx_assert(&filp->f_kqlock.m, MA_OWNED);
return ((filp->f_kqflags & LINUX_KQ_FLAG_NEED_READ) ? 1 : 0);
}
/* kqueue event check: is a write event pending?  Called locked. */
static int
linux_file_kqfilter_write_event(struct knote *kn, long hint)
{
struct linux_file *filp = kn->kn_hook;
mtx_assert(&filp->f_kqlock.m, MA_OWNED);
return ((filp->f_kqflags & LINUX_KQ_FLAG_NEED_WRITE) ? 1 : 0);
}
/* kqueue filter operations for EVFILT_READ and EVFILT_WRITE. */
static struct filterops linux_dev_kqfiltops_read = {
.f_isfd = 1,
.f_detach = linux_file_kqfilter_detach,
.f_event = linux_file_kqfilter_read_event,
};
static struct filterops linux_dev_kqfiltops_write = {
.f_isfd = 1,
.f_detach = linux_file_kqfilter_detach,
.f_event = linux_file_kqfilter_write_event,
};
/*
 * Re-evaluate the driver's poll state for the given event classes and
 * refresh the file's kqueue NEED_READ/NEED_WRITE flags, waking knotes
 * when any event is ready.  No-op if no matching filter is attached.
 */
static void
linux_file_kqfilter_poll(struct linux_file *filp, int kqflags)
{
struct thread *td;
const struct file_operations *fop;
struct linux_cdev *ldev;
int temp;
if ((filp->f_kqflags & kqflags) == 0)
return;
td = curthread;
linux_get_fop(filp, &fop, &ldev);
/* get the latest polling state */
temp = OPW(filp->_file, td, fop->poll(filp, NULL));
linux_drop_fop(ldev);
spin_lock(&filp->f_kqlock);
/* clear kqflags */
filp->f_kqflags &= ~(LINUX_KQ_FLAG_NEED_READ |
LINUX_KQ_FLAG_NEED_WRITE);
/* update kqflags */
if ((temp & (POLLIN | POLLOUT)) != 0) {
if ((temp & POLLIN) != 0)
filp->f_kqflags |= LINUX_KQ_FLAG_NEED_READ;
if ((temp & POLLOUT) != 0)
filp->f_kqflags |= LINUX_KQ_FLAG_NEED_WRITE;
/* make sure the "knote" gets woken up */
KNOTE_LOCKED(&filp->f_selinfo.si_note, 0);
}
spin_unlock(&filp->f_kqlock);
}
/*
 * fileops kqfilter handler: attach a read or write knote to the file,
 * provided the driver implements poll.  After attaching, poll once to
 * seed the event flags.
 */
static int
linux_file_kqfilter(struct file *file, struct knote *kn)
{
struct linux_file *filp;
struct thread *td;
int error;
td = curthread;
filp = (struct linux_file *)file->f_data;
filp->f_flags = file->f_flag;
/* kqueue support requires a driver poll method. */
if (filp->f_op->poll == NULL)
return (EINVAL);
spin_lock(&filp->f_kqlock);
switch (kn->kn_filter) {
case EVFILT_READ:
filp->f_kqflags |= LINUX_KQ_FLAG_HAS_READ;
kn->kn_fop = &linux_dev_kqfiltops_read;
kn->kn_hook = filp;
knlist_add(&filp->f_selinfo.si_note, kn, 1);
error = 0;
break;
case EVFILT_WRITE:
filp->f_kqflags |= LINUX_KQ_FLAG_HAS_WRITE;
kn->kn_fop = &linux_dev_kqfiltops_write;
kn->kn_hook = filp;
knlist_add(&filp->f_selinfo.si_note, kn, 1);
error = 0;
break;
default:
error = EINVAL;
break;
}
spin_unlock(&filp->f_kqlock);
if (error == 0) {
linux_set_current(td);
/* update kqfilter status, if any */
linux_file_kqfilter_poll(filp,
LINUX_KQ_FLAG_HAS_READ | LINUX_KQ_FLAG_HAS_WRITE);
}
return (error);
}
/*
 * Core of fo_mmap() for LinuxKPI character devices: build an emulated
 * Linux vm_area_struct, invoke the driver's mmap() method on it, and
 * convert the result into a FreeBSD VM object for the caller to map.
 *
 * On success *object holds a new VM object and *offset is reset to 0
 * (the device pager starts at the beginning of the object).  Returns
 * a positive errno on failure.
 */
static int
linux_file_mmap_single(struct file *fp, const struct file_operations *fop,
vm_ooffset_t *offset, vm_size_t size, struct vm_object **object,
int nprot, struct thread *td)
{
struct task_struct *task;
struct vm_area_struct *vmap;
struct mm_struct *mm;
struct linux_file *filp;
vm_memattr_t attr;
int error;
filp = (struct linux_file *)fp->f_data;
filp->f_flags = fp->f_flag;
if (fop->mmap == NULL)
return (EOPNOTSUPP);
linux_set_current(td);
/*
 * The same VM object might be shared by multiple processes
 * and the mm_struct is usually freed when a process exits.
 *
 * The atomic reference below makes sure the mm_struct is
 * available as long as the vmap is in the linux_vma_head.
 */
task = current;
mm = task->mm;
if (atomic_inc_not_zero(&mm->mm_users) == 0)
return (EINVAL);
/* kzalloc() with GFP_KERNEL sleeps and cannot fail here. */
vmap = kzalloc(sizeof(*vmap), GFP_KERNEL);
vmap->vm_start = 0;
vmap->vm_end = size;
vmap->vm_pgoff = *offset / PAGE_SIZE;
vmap->vm_pfn = 0;
vmap->vm_flags = vmap->vm_page_prot = (nprot & VM_PROT_ALL);
vmap->vm_ops = NULL;
vmap->vm_file = get_file(filp);
vmap->vm_mm = mm;
/* Call the driver's mmap() with the emulated mmap_sem write-locked. */
if (unlikely(down_write_killable(&vmap->vm_mm->mmap_sem))) {
error = linux_get_error(task, EINTR);
} else {
error = -OPW(fp, td, fop->mmap(filp, vmap));
error = linux_get_error(task, error);
up_write(&vmap->vm_mm->mmap_sem);
}
if (error != 0) {
linux_cdev_handle_free(vmap);
return (error);
}
attr = pgprot2cachemode(vmap->vm_page_prot);
if (vmap->vm_ops != NULL) {
struct vm_area_struct *ptr;
void *vm_private_data;
bool vm_no_fault;
/*
 * A driver that installs vm_ops must supply open/close and a
 * private-data cookie; the cookie doubles as the pager handle.
 */
if (vmap->vm_ops->open == NULL ||
vmap->vm_ops->close == NULL ||
vmap->vm_private_data == NULL) {
/* free allocated VM area struct */
linux_cdev_handle_free(vmap);
return (EINVAL);
}
vm_private_data = vmap->vm_private_data;
rw_wlock(&linux_vma_lock);
TAILQ_FOREACH(ptr, &linux_vma_head, vm_entry) {
if (ptr->vm_private_data == vm_private_data)
break;
}
/* check if there is an existing VM area struct */
if (ptr != NULL) {
/* check if the VM area structure is invalid */
if (ptr->vm_ops == NULL ||
ptr->vm_ops->open == NULL ||
ptr->vm_ops->close == NULL) {
error = ESTALE;
vm_no_fault = 1;
} else {
error = EEXIST;
vm_no_fault = (ptr->vm_ops->fault == NULL);
}
} else {
/* insert VM area structure into list */
TAILQ_INSERT_TAIL(&linux_vma_head, vmap, vm_entry);
error = 0;
vm_no_fault = (vmap->vm_ops->fault == NULL);
}
rw_wunlock(&linux_vma_lock);
if (error != 0) {
/* free allocated VM area struct */
linux_cdev_handle_free(vmap);
/* check for stale VM area struct */
if (error != EEXIST)
return (error);
}
/*
 * No fault handler: plain OBJT_DEVICE pager; with a fault
 * handler, a managed device pager drives vm_ops->fault().
 */
/* check if there is no fault handler */
if (vm_no_fault) {
*object = cdev_pager_allocate(vm_private_data, OBJT_DEVICE,
&linux_cdev_pager_ops[1], size, nprot, *offset,
td->td_ucred);
} else {
*object = cdev_pager_allocate(vm_private_data, OBJT_MGTDEVICE,
&linux_cdev_pager_ops[0], size, nprot, *offset,
td->td_ucred);
}
/* check if allocating the VM object failed */
if (*object == NULL) {
if (error == 0) {
/* remove VM area struct from list */
linux_cdev_handle_remove(vmap);
/* free allocated VM area struct */
linux_cdev_handle_free(vmap);
}
return (EINVAL);
}
} else {
/*
 * No vm_ops: the driver filled in vm_pfn/vm_len directly, so
 * wrap that physical range in a scatter/gather backed object.
 */
struct sglist *sg;
sg = sglist_alloc(1, M_WAITOK);
sglist_append_phys(sg,
(vm_paddr_t)vmap->vm_pfn << PAGE_SHIFT, vmap->vm_len);
*object = vm_pager_allocate(OBJT_SG, sg, vmap->vm_len,
nprot, 0, td->td_ucred);
linux_cdev_handle_free(vmap);
if (*object == NULL) {
sglist_free(sg);
return (EINVAL);
}
}
/* Apply the caching attribute the driver requested, if any. */
if (attr != VM_MEMATTR_DEFAULT) {
VM_OBJECT_WLOCK(*object);
vm_object_set_memattr(*object, attr);
VM_OBJECT_WUNLOCK(*object);
}
*offset = 0;
return (0);
}
/*
 * Character device switch shared by all LinuxKPI devices; only fdopen
 * is provided, all further operations go through linuxfileops below.
 */
struct cdevsw linuxcdevsw = {
.d_version = D_VERSION,
.d_fdopen = linux_dev_fdopen,
.d_name = "lkpidev",
};
/*
 * fo_read() implementation: forward a single-iovec uio to the Linux
 * driver's read() method and advance the uio by the byte count it
 * returns.  Negative driver return values are translated to errno.
 */
static int
linux_file_read(struct file *file, struct uio *uio, struct ucred *active_cred,
int flags, struct thread *td)
{
struct linux_file *filp;
const struct file_operations *fop;
struct linux_cdev *ldev;
ssize_t bytes;
int error;
error = 0;
filp = (struct linux_file *)file->f_data;
filp->f_flags = file->f_flag;
/* XXX no support for I/O vectors currently */
if (uio->uio_iovcnt != 1)
return (EOPNOTSUPP);
if (uio->uio_resid > DEVFS_IOSIZE_MAX)
return (EINVAL);
linux_set_current(td);
linux_get_fop(filp, &fop, &ldev);
if (fop->read != NULL) {
bytes = OPW(file, td, fop->read(filp,
uio->uio_iov->iov_base,
uio->uio_iov->iov_len, &uio->uio_offset));
if (bytes >= 0) {
/* Consume "bytes" from the sole iovec. */
uio->uio_iov->iov_base =
((uint8_t *)uio->uio_iov->iov_base) + bytes;
uio->uio_iov->iov_len -= bytes;
uio->uio_resid -= bytes;
} else {
error = linux_get_error(current, -bytes);
}
} else
error = ENXIO;
/* update kqfilter status, if any */
linux_file_kqfilter_poll(filp, LINUX_KQ_FLAG_HAS_READ);
linux_drop_fop(ldev);
return (error);
}
/*
 * fo_write() implementation: mirror image of linux_file_read(), calling
 * the driver's write() method for a single-iovec uio.
 */
static int
linux_file_write(struct file *file, struct uio *uio, struct ucred *active_cred,
int flags, struct thread *td)
{
struct linux_file *filp;
const struct file_operations *fop;
struct linux_cdev *ldev;
ssize_t bytes;
int error;
filp = (struct linux_file *)file->f_data;
filp->f_flags = file->f_flag;
/* XXX no support for I/O vectors currently */
if (uio->uio_iovcnt != 1)
return (EOPNOTSUPP);
if (uio->uio_resid > DEVFS_IOSIZE_MAX)
return (EINVAL);
linux_set_current(td);
linux_get_fop(filp, &fop, &ldev);
if (fop->write != NULL) {
bytes = OPW(file, td, fop->write(filp,
uio->uio_iov->iov_base,
uio->uio_iov->iov_len, &uio->uio_offset));
if (bytes >= 0) {
/* Consume "bytes" from the sole iovec. */
uio->uio_iov->iov_base =
((uint8_t *)uio->uio_iov->iov_base) + bytes;
uio->uio_iov->iov_len -= bytes;
uio->uio_resid -= bytes;
error = 0;
} else {
error = linux_get_error(current, -bytes);
}
} else
error = ENXIO;
/* update kqfilter status, if any */
linux_file_kqfilter_poll(filp, LINUX_KQ_FLAG_HAS_WRITE);
linux_drop_fop(ldev);
return (error);
}
/*
 * fo_poll() implementation: ask the driver's poll() method for the
 * currently-ready events and mask them with what the caller requested.
 * A driver without a poll() method reports nothing ready.
 */
static int
linux_file_poll(struct file *file, int events, struct ucred *active_cred,
    struct thread *td)
{
	struct linux_file *filp;
	const struct file_operations *fop;
	struct linux_cdev *ldev;
	int revents;

	filp = (struct linux_file *)file->f_data;
	filp->f_flags = file->f_flag;
	linux_set_current(td);

	linux_get_fop(filp, &fop, &ldev);
	revents = 0;
	if (fop->poll != NULL) {
		revents = OPW(file, td, fop->poll(filp,
		    LINUX_POLL_TABLE_NORMAL)) & events;
	}
	linux_drop_fop(ldev);

	return (revents);
}
/*
 * fo_close() implementation: final close of a LinuxKPI file.  Ordering
 * matters here: poll waiters are dequeued before the driver's release()
 * runs, the vnode hold is dropped before the cdev reference, and the
 * linux_file itself is freed last.
 */
static int
linux_file_close(struct file *file, struct thread *td)
{
struct linux_file *filp;
const struct file_operations *fop;
struct linux_cdev *ldev;
int error;
filp = (struct linux_file *)file->f_data;
/* Only called when the last file reference is gone. */
KASSERT(file_count(filp) == 0,
("File refcount(%d) is not zero", file_count(filp)));
error = 0;
filp->f_flags = file->f_flag;
linux_set_current(td);
linux_poll_wait_dequeue(filp);
linux_get_fop(filp, &fop, &ldev);
if (fop->release != NULL)
error = -OPW(file, td, fop->release(filp->f_vnode, filp));
funsetown(&filp->f_sigio);
if (filp->f_vnode != NULL)
vdrop(filp->f_vnode);
linux_drop_fop(ldev);
if (filp->f_cdev != NULL)
linux_cdev_deref(filp->f_cdev);
kfree(filp);
return (error);
}
/*
 * fo_ioctl() implementation: handle the generic fcntl-style ioctls
 * (FIONBIO/FIOASYNC/FIOSETOWN/FIOGETOWN) locally and hand everything
 * else to the driver via linux_file_ioctl_sub().
 */
static int
linux_file_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *cred,
struct thread *td)
{
struct linux_file *filp;
const struct file_operations *fop;
struct linux_cdev *ldev;
int error;
error = 0;
filp = (struct linux_file *)fp->f_data;
filp->f_flags = fp->f_flag;
linux_get_fop(filp, &fop, &ldev);
linux_set_current(td);
switch (cmd) {
case FIONBIO:
/* Non-blocking mode is tracked via f_flags; nothing to do. */
break;
case FIOASYNC:
if (fop->fasync == NULL)
break;
error = -OPW(fp, td, fop->fasync(0, filp, fp->f_flag & FASYNC));
break;
case FIOSETOWN:
error = fsetown(*(int *)data, &filp->f_sigio);
if (error == 0) {
/* re-arm async notification for the new owner */
if (fop->fasync == NULL)
break;
error = -OPW(fp, td, fop->fasync(0, filp,
fp->f_flag & FASYNC));
}
break;
case FIOGETOWN:
*(int *)data = fgetown(&filp->f_sigio);
break;
default:
error = linux_file_ioctl_sub(fp, filp, fop, cmd, data, td);
break;
}
linux_drop_fop(ldev);
return (error);
}
/*
 * Validate protection and mapping flags for a character-device mmap and
 * forward the request to linux_file_mmap_single().  Devices never offer
 * private (copy-on-write) mappings, so MAP_PRIVATE/MAP_COPY are rejected.
 */
static int
linux_file_mmap_sub(struct thread *td, vm_size_t objsize, vm_prot_t prot,
    vm_prot_t *maxprotp, int *flagsp, struct file *fp,
    vm_ooffset_t *foff, const struct file_operations *fop, vm_object_t *objp)
{
	/* Requesting write access beyond the permitted maximum fails. */
	if ((prot & VM_PROT_WRITE) != 0 &&
	    (*maxprotp & VM_PROT_WRITE) == 0)
		return (EACCES);

	/* No private mappings of any kind for character devices. */
	if ((*flagsp & (MAP_PRIVATE | MAP_COPY)) != 0)
		return (EINVAL);

	return (linux_file_mmap_single(fp, fop, foff, objsize, objp,
	    (int)prot, td));
}
/*
 * fo_mmap() implementation: derive the maximum protection from the
 * mount flags, the open mode and the capability rights, obtain a VM
 * object from the driver, and insert it into the target map.
 */
static int
linux_file_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size,
vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff,
struct thread *td)
{
struct linux_file *filp;
const struct file_operations *fop;
struct linux_cdev *ldev;
struct mount *mp;
struct vnode *vp;
vm_object_t object;
vm_prot_t maxprot;
int error;
filp = (struct linux_file *)fp->f_data;
vp = filp->f_vnode;
if (vp == NULL)
return (EOPNOTSUPP);
/*
 * Ensure that file and memory protections are
 * compatible.
 */
mp = vp->v_mount;
if (mp != NULL && (mp->mnt_flag & MNT_NOEXEC) != 0) {
maxprot = VM_PROT_NONE;
if ((prot & VM_PROT_EXECUTE) != 0)
return (EACCES);
} else
maxprot = VM_PROT_EXECUTE;
if ((fp->f_flag & FREAD) != 0)
maxprot |= VM_PROT_READ;
else if ((prot & VM_PROT_READ) != 0)
return (EACCES);
/*
 * If we are sharing potential changes via MAP_SHARED and we
 * are trying to get write permission although we opened it
 * without asking for it, bail out.
 *
 * Note that most character devices always share mappings.
 *
 * Rely on linux_file_mmap_sub() to fail invalid MAP_PRIVATE
 * requests rather than doing it here.
 */
if ((flags & MAP_SHARED) != 0) {
if ((fp->f_flag & FWRITE) != 0)
maxprot |= VM_PROT_WRITE;
else if ((prot & VM_PROT_WRITE) != 0)
return (EACCES);
}
/* Capability rights may further restrict the maximum protection. */
maxprot &= cap_maxprot;
linux_get_fop(filp, &fop, &ldev);
error = linux_file_mmap_sub(td, size, prot, &maxprot, &flags, fp,
&foff, fop, &object);
if (error != 0)
goto out;
error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object,
foff, FALSE, td);
if (error != 0)
/* drop the reference vm_mmap_object() did not consume */
vm_object_deallocate(object);
out:
linux_drop_fop(ldev);
return (error);
}
/*
 * fo_stat() implementation: stat the backing devfs vnode, if present.
 * Devices opened through LinuxKPI without a vnode cannot be stat'ed.
 */
static int
linux_file_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
    struct thread *td)
{
	struct linux_file *filp = (struct linux_file *)fp->f_data;
	struct vnode *vp = filp->f_vnode;
	int error;

	if (vp == NULL)
		return (EOPNOTSUPP);

	vn_lock(vp, LK_SHARED | LK_RETRY);
	error = vn_stat(vp, sb, td->td_ucred, NOCRED, td);
	VOP_UNLOCK(vp, 0);

	return (error);
}
/*
 * fo_fill_kinfo() implementation for procstat/fstat reporting.  The
 * filedesc shared lock must be dropped around vn_fill_kinfo_vnode()
 * because that call may sleep; the vnode reference keeps it alive
 * while the lock is released.
 */
static int
linux_file_fill_kinfo(struct file *fp, struct kinfo_file *kif,
struct filedesc *fdp)
{
struct linux_file *filp;
struct vnode *vp;
int error;
filp = fp->f_data;
vp = filp->f_vnode;
if (vp == NULL) {
error = 0;
kif->kf_type = KF_TYPE_DEV;
} else {
vref(vp);
FILEDESC_SUNLOCK(fdp);
error = vn_fill_kinfo_vnode(vp, kif);
vrele(vp);
kif->kf_type = KF_TYPE_VNODE;
FILEDESC_SLOCK(fdp);
}
return (error);
}
/*
 * Return the Linux minor number for an inode backed by a LinuxKPI
 * character device, or -1U if the inode does not belong to one.
 */
unsigned int
linux_iminor(struct inode *inode)
{
	struct linux_cdev *ldev;

	if (inode == NULL)
		return (-1U);
	if (inode->v_rdev == NULL || inode->v_rdev->si_devsw != &linuxcdevsw)
		return (-1U);
	ldev = inode->v_rdev->si_drv1;
	return (ldev != NULL ? minor(ldev->dev) : -1U);
}
/*
 * File operations vector installed on every LinuxKPI file; these
 * wrappers translate FreeBSD file semantics into the Linux
 * file_operations callbacks of the owning driver.
 */
struct fileops linuxfileops = {
.fo_read = linux_file_read,
.fo_write = linux_file_write,
.fo_truncate = invfo_truncate,
.fo_kqfilter = linux_file_kqfilter,
.fo_stat = linux_file_stat,
.fo_fill_kinfo = linux_file_fill_kinfo,
.fo_poll = linux_file_poll,
.fo_close = linux_file_close,
.fo_ioctl = linux_file_ioctl,
.fo_mmap = linux_file_mmap,
.fo_chmod = invfo_chmod,
.fo_chown = invfo_chown,
.fo_sendfile = invfo_sendfile,
.fo_flags = DFLAG_PASSABLE,
};
/*
* Hash of vmmap addresses. This is infrequently accessed and does not
* need to be particularly large. This is done because we must store the
* caller's idea of the map size to properly unmap.
*/
/* One tracked ioremap()/vmap() mapping: base address and byte length. */
struct vmmap {
LIST_ENTRY(vmmap) vm_next;
void *vm_addr;
unsigned long vm_size;
};
/* Hash bucket head (open-coded LIST_HEAD). */
struct vmmaphd {
struct vmmap *lh_first;
};
#define VMMAP_HASH_SIZE 64
#define VMMAP_HASH_MASK (VMMAP_HASH_SIZE - 1)
/* Bucket index from the page-aligned mapping address. */
#define VM_HASH(addr) ((uintptr_t)(addr) >> PAGE_SHIFT) & VMMAP_HASH_MASK
static struct vmmaphd vmmaphead[VMMAP_HASH_SIZE];
/* Protects all vmmaphead buckets. */
static struct mtx vmmaplock;
/*
 * Record a new mapping in the vmmap hash so iounmap()/vunmap() can
 * later recover its size from just the address.
 */
static void
vmmap_add(void *addr, unsigned long size)
{
	struct vmmap *entry;

	/* Allocate before taking the lock; GFP_KERNEL may sleep. */
	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
	entry->vm_addr = addr;
	entry->vm_size = size;

	mtx_lock(&vmmaplock);
	LIST_INSERT_HEAD(&vmmaphead[VM_HASH(addr)], entry, vm_next);
	mtx_unlock(&vmmaplock);
}
static struct vmmap *
vmmap_remove(void *addr)
{
struct vmmap *vmmap;
mtx_lock(&vmmaplock);
LIST_FOREACH(vmmap, &vmmaphead[VM_HASH(addr)], vm_next)
if (vmmap->vm_addr == addr)
break;
if (vmmap)
LIST_REMOVE(vmmap, vm_next);
mtx_unlock(&vmmaplock);
return (vmmap);
}
#if defined(__i386__) || defined(__amd64__) || defined(__powerpc__) || defined(__aarch64__)
/*
 * Map a physical address range into KVA with the given memory
 * attribute and register the mapping for a later iounmap().
 * Only provided on architectures with pmap_mapdev_attr().
 */
void *
_ioremap_attr(vm_paddr_t phys_addr, unsigned long size, int attr)
{
void *addr;
addr = pmap_mapdev_attr(phys_addr, size, attr);
if (addr == NULL)
return (NULL);
vmmap_add(addr, size);
return (addr);
}
#endif
/*
 * Undo an ioremap(): look the mapping up by address, unmap it on
 * architectures that mapped it, and free the bookkeeping entry.
 * Unknown addresses are silently ignored, matching Linux semantics.
 */
void
iounmap(void *addr)
{
struct vmmap *vmmap;
vmmap = vmmap_remove(addr);
if (vmmap == NULL)
return;
#if defined(__i386__) || defined(__amd64__) || defined(__powerpc__) || defined(__aarch64__)
pmap_unmapdev((vm_offset_t)addr, vmmap->vm_size);
#endif
kfree(vmmap);
}
void *
vmap(struct page **pages, unsigned int count, unsigned long flags, int prot)
{
vm_offset_t off;
size_t size;
size = count * PAGE_SIZE;
off = kva_alloc(size);
if (off == 0)
return (NULL);
vmmap_add((void *)off, size);
pmap_qenter(off, pages, count);
return ((void *)off);
}
/*
 * Tear down a vmap() mapping: remove the pages from the KVA range,
 * release the KVA, and free the bookkeeping entry.  Addresses that
 * were never vmap()ed are ignored, matching Linux semantics.
 */
void
vunmap(void *addr)
{
	struct vmmap *entry;

	entry = vmmap_remove(addr);
	if (entry == NULL)
		return;

	pmap_qremove((vm_offset_t)addr, entry->vm_size / PAGE_SIZE);
	kva_free((vm_offset_t)addr, entry->vm_size);
	kfree(entry);
}
/*
 * Format into a freshly kmalloc()ed string (Linux kvasprintf()).
 * Returns NULL when the allocation fails; the caller frees the result.
 */
char *
kvasprintf(gfp_t gfp, const char *fmt, va_list ap)
{
	va_list tmp;
	unsigned int len;
	char *buf;

	/* Measure the formatted length without consuming "ap". */
	va_copy(tmp, ap);
	len = vsnprintf(NULL, 0, fmt, tmp);
	va_end(tmp);

	buf = kmalloc(len + 1, gfp);
	if (buf != NULL)
		vsnprintf(buf, len + 1, fmt, ap);

	return (buf);
}
/*
 * Variadic front-end to kvasprintf(); see above for semantics.
 */
char *
kasprintf(gfp_t gfp, const char *fmt, ...)
{
	char *result;
	va_list ap;

	va_start(ap, fmt);
	result = kvasprintf(gfp, fmt, ap);
	va_end(ap);

	return (result);
}
/*
 * Callout trampoline: establish a Linux "current" context for the
 * callout thread, then invoke the Linux timer function.
 */
static void
linux_timer_callback_wrapper(void *context)
{
	struct timer_list *timer = context;

	linux_set_current(curthread);
	timer->function(timer->data);
}
/*
 * (Re)arm a timer to fire at the absolute jiffies value "expires";
 * callout_reset() cancels any pending invocation first.
 */
void
mod_timer(struct timer_list *timer, int expires)
{
timer->expires = expires;
callout_reset(&timer->callout,
linux_timer_jiffies_until(expires),
&linux_timer_callback_wrapper, timer);
}
/*
 * Arm a timer using the expiry already stored in timer->expires.
 */
void
add_timer(struct timer_list *timer)
{
callout_reset(&timer->callout,
linux_timer_jiffies_until(timer->expires),
&linux_timer_callback_wrapper, timer);
}
/*
 * Like add_timer(), but pin the callout to a specific CPU.
 */
void
add_timer_on(struct timer_list *timer, int cpu)
{
callout_reset_on(&timer->callout,
linux_timer_jiffies_until(timer->expires),
&linux_timer_callback_wrapper, timer, cpu);
}
/*
 * Cancel a pending timer.  Returns 1 if the callout was successfully
 * stopped before firing, 0 otherwise (Linux del_timer() semantics).
 */
int
del_timer(struct timer_list *timer)
{
	return (callout_stop(&timer->callout) == -1 ? 0 : 1);
}
/*
 * Boot-time initialization of the jiffies math: round hz up to the
 * next power of two minus one so tick arithmetic divides 2**32 evenly.
 */
static void
linux_timer_init(void *arg)
{
/*
 * Compute an internal HZ value which can divide 2**32 to
 * avoid timer rounding problems when the tick value wraps
 * around 2**32:
 */
linux_timer_hz_mask = 1;
while (linux_timer_hz_mask < (unsigned long)hz)
linux_timer_hz_mask *= 2;
linux_timer_hz_mask--;
}
SYSINIT(linux_timer, SI_SUB_DRIVERS, SI_ORDER_FIRST, linux_timer_init, NULL);
/*
 * Signal a completion.  With "all" set, mark it permanently done
 * (UINT_MAX acts as the complete_all() sentinel) and wake every
 * waiter; otherwise bump the count and wake a single waiter.
 */
void
linux_complete_common(struct completion *c, int all)
{
int wakeup_swapper;
sleepq_lock(c);
if (all) {
c->done = UINT_MAX;
wakeup_swapper = sleepq_broadcast(c, SLEEPQ_SLEEP, 0, 0);
} else {
if (c->done != UINT_MAX)
c->done++;
wakeup_swapper = sleepq_signal(c, SLEEPQ_SLEEP, 0, 0);
}
sleepq_release(c);
if (wakeup_swapper)
kick_proc0();
}
/*
 * Indefinite wait for done != 0 with or without signals.
 *
 * Returns 0 on completion or -ERESTARTSYS if interrupted by a signal
 * (interruptible mode only).  Note the sleepqueue protocol: the lock
 * taken by sleepq_lock() is released implicitly by sleepq_wait*(),
 * so only the "break" path needs an explicit sleepq_release().
 */
int
linux_wait_for_common(struct completion *c, int flags)
{
struct task_struct *task;
int error;
if (SCHEDULER_STOPPED())
return (0);
task = current;
if (flags != 0)
flags = SLEEPQ_INTERRUPTIBLE | SLEEPQ_SLEEP;
else
flags = SLEEPQ_SLEEP;
error = 0;
for (;;) {
sleepq_lock(c);
if (c->done)
break;
sleepq_add(c, NULL, "completion", flags, 0);
if (flags & SLEEPQ_INTERRUPTIBLE) {
DROP_GIANT();
error = -sleepq_wait_sig(c, 0);
PICKUP_GIANT();
if (error != 0) {
/* record the signal for Linux-side errno handling */
linux_schedule_save_interrupt_value(task, error);
error = -ERESTARTSYS;
goto intr;
}
} else {
DROP_GIANT();
sleepq_wait(c, 0);
PICKUP_GIANT();
}
}
/* consume one completion unless complete_all() was used */
if (c->done != UINT_MAX)
c->done--;
sleepq_release(c);
intr:
return (error);
}
/*
 * Time limited wait for done != 0 with or without signals.
 *
 * Returns the number of jiffies remaining on success, 0 on timeout,
 * or -ERESTARTSYS when interrupted by a signal (interruptible mode).
 * As above, sleepq_timedwait*() drops the sleepqueue lock itself.
 */
int
linux_wait_for_timeout_common(struct completion *c, int timeout, int flags)
{
struct task_struct *task;
int end = jiffies + timeout;
int error;
if (SCHEDULER_STOPPED())
return (0);
task = current;
if (flags != 0)
flags = SLEEPQ_INTERRUPTIBLE | SLEEPQ_SLEEP;
else
flags = SLEEPQ_SLEEP;
for (;;) {
sleepq_lock(c);
if (c->done)
break;
sleepq_add(c, NULL, "completion", flags, 0);
sleepq_set_timeout(c, linux_timer_jiffies_until(end));
DROP_GIANT();
if (flags & SLEEPQ_INTERRUPTIBLE)
error = -sleepq_timedwait_sig(c, 0);
else
error = -sleepq_timedwait(c, 0);
PICKUP_GIANT();
if (error != 0) {
/* check for timeout */
if (error == -EWOULDBLOCK) {
error = 0; /* timeout */
} else {
/* signal happened */
linux_schedule_save_interrupt_value(task, error);
error = -ERESTARTSYS;
}
goto done;
}
}
/* consume one completion unless complete_all() was used */
if (c->done != UINT_MAX)
c->done--;
sleepq_release(c);
/* return how many jiffies are left */
error = linux_timer_jiffies_until(end);
done:
return (error);
}
/*
 * Non-blocking completion check: returns non-zero and consumes one
 * completion if available; returns 0 without blocking otherwise.
 * A complete_all()'ed completion (UINT_MAX) is never decremented.
 */
int
linux_try_wait_for_completion(struct completion *c)
{
	int isdone;

	sleepq_lock(c);
	if (c->done == 0) {
		isdone = 0;
	} else {
		isdone = 1;
		if (c->done != UINT_MAX)
			c->done--;
	}
	sleepq_release(c);

	return (isdone);
}
/*
 * Report whether a completion is pending without consuming it.
 */
int
linux_completion_done(struct completion *c)
{
	int done;

	sleepq_lock(c);
	done = (c->done != 0);
	sleepq_release(c);

	return (done);
}
/*
 * Drop one reference on a linux_cdev, freeing it on the last release.
 */
static void
linux_cdev_deref(struct linux_cdev *ldev)
{
if (refcount_release(&ldev->refs))
kfree(ldev);
}
/*
 * kobject release for heap-allocated cdevs: destroy the device node,
 * drop the cdev reference, then release the parent kobject.
 */
static void
linux_cdev_release(struct kobject *kobj)
{
struct linux_cdev *cdev;
struct kobject *parent;
cdev = container_of(kobj, struct linux_cdev, kobj);
parent = kobj->parent;
linux_destroy_dev(cdev);
linux_cdev_deref(cdev);
kobject_put(parent);
}
/*
 * kobject release for statically-allocated cdevs: same as above but
 * without freeing the cdev structure itself.
 */
static void
linux_cdev_static_release(struct kobject *kobj)
{
struct linux_cdev *cdev;
struct kobject *parent;
cdev = container_of(kobj, struct linux_cdev, kobj);
parent = kobj->parent;
linux_destroy_dev(cdev);
kobject_put(parent);
}
/*
 * Destroy the devfs node backing a linux_cdev.  The LDEV_SI_DTR flag
 * blocks new openers; we then spin until all in-flight si-references
 * drain before calling destroy_dev(), which may otherwise deadlock
 * against threads inside the device's fileops.
 */
void
linux_destroy_dev(struct linux_cdev *ldev)
{
if (ldev->cdev == NULL)
return;
MPASS((ldev->siref & LDEV_SI_DTR) == 0);
atomic_set_int(&ldev->siref, LDEV_SI_DTR);
while ((atomic_load_int(&ldev->siref) & ~LDEV_SI_DTR) != 0)
pause("ldevdtr", hz / 4);
destroy_dev(ldev->cdev);
ldev->cdev = NULL;
}
/* kobject type for heap-allocated cdevs (release frees the cdev). */
const struct kobj_type linux_cdev_ktype = {
.release = linux_cdev_release,
};
/* kobject type for static cdevs (release does not free the cdev). */
const struct kobj_type linux_cdev_static_ktype = {
.release = linux_cdev_static_release,
};
/*
 * Translate a FreeBSD ifnet link-state event into the corresponding
 * Linux netdevice notifier call (NETDEV_UP / NETDEV_DOWN).
 */
static void
linux_handle_ifnet_link_event(void *arg, struct ifnet *ifp, int linkstate)
{
	struct notifier_block *nb = arg;

	nb->notifier_call(nb,
	    (linkstate == LINK_STATE_UP) ? NETDEV_UP : NETDEV_DOWN, ifp);
}
/* Forward interface arrival as a NETDEV_REGISTER notification. */
static void
linux_handle_ifnet_arrival_event(void *arg, struct ifnet *ifp)
{
	struct notifier_block *nb = arg;

	nb->notifier_call(nb, NETDEV_REGISTER, ifp);
}
/* Forward interface departure as a NETDEV_UNREGISTER notification. */
static void
linux_handle_ifnet_departure_event(void *arg, struct ifnet *ifp)
{
	struct notifier_block *nb = arg;

	nb->notifier_call(nb, NETDEV_UNREGISTER, ifp);
}
/* Forward link-layer address changes as NETDEV_CHANGEADDR. */
static void
linux_handle_iflladdr_event(void *arg, struct ifnet *ifp)
{
	struct notifier_block *nb = arg;

	nb->notifier_call(nb, NETDEV_CHANGEADDR, ifp);
}
/* Forward protocol address changes as NETDEV_CHANGEIFADDR. */
static void
linux_handle_ifaddr_event(void *arg, struct ifnet *ifp)
{
	struct notifier_block *nb = arg;

	nb->notifier_call(nb, NETDEV_CHANGEIFADDR, ifp);
}
/*
 * Hook a Linux netdevice notifier into the FreeBSD ifnet event
 * handlers; the EVENTHANDLER tags are stashed in nb->tags for
 * unregister_netdevice_notifier().
 */
int
register_netdevice_notifier(struct notifier_block *nb)
{
nb->tags[NETDEV_UP] = EVENTHANDLER_REGISTER(
ifnet_link_event, linux_handle_ifnet_link_event, nb, 0);
nb->tags[NETDEV_REGISTER] = EVENTHANDLER_REGISTER(
ifnet_arrival_event, linux_handle_ifnet_arrival_event, nb, 0);
nb->tags[NETDEV_UNREGISTER] = EVENTHANDLER_REGISTER(
ifnet_departure_event, linux_handle_ifnet_departure_event, nb, 0);
nb->tags[NETDEV_CHANGEADDR] = EVENTHANDLER_REGISTER(
iflladdr_event, linux_handle_iflladdr_event, nb, 0);
return (0);
}
/*
 * Hook a Linux inet-address notifier into the ifaddr_event handler.
 */
int
register_inetaddr_notifier(struct notifier_block *nb)
{
nb->tags[NETDEV_CHANGEIFADDR] = EVENTHANDLER_REGISTER(
ifaddr_event, linux_handle_ifaddr_event, nb, 0);
return (0);
}
/*
 * Remove the event handlers installed by register_netdevice_notifier().
 */
int
unregister_netdevice_notifier(struct notifier_block *nb)
{
EVENTHANDLER_DEREGISTER(ifnet_link_event,
nb->tags[NETDEV_UP]);
EVENTHANDLER_DEREGISTER(ifnet_arrival_event,
nb->tags[NETDEV_REGISTER]);
EVENTHANDLER_DEREGISTER(ifnet_departure_event,
nb->tags[NETDEV_UNREGISTER]);
EVENTHANDLER_DEREGISTER(iflladdr_event,
nb->tags[NETDEV_CHANGEADDR]);
return (0);
}
/*
 * Remove the event handler installed by register_inetaddr_notifier().
 */
int
unregister_inetaddr_notifier(struct notifier_block *nb)
{
EVENTHANDLER_DEREGISTER(ifaddr_event,
nb->tags[NETDEV_CHANGEIFADDR]);
return (0);
}
/* Adapter passing the Linux list_sort() comparator through qsort_r(). */
struct list_sort_thunk {
int (*cmp)(void *, struct list_head *, struct list_head *);
void *priv;
};
/*
 * qsort_r() comparator: unwrap the two struct list_head pointers and
 * delegate to the user-supplied list_sort() comparison callback.
 */
static inline int
linux_le_cmp(void *priv, const void *d1, const void *d2)
{
	struct list_sort_thunk *thunk = priv;
	struct list_head *le1 = *(__DECONST(struct list_head **, d1));
	struct list_head *le2 = *(__DECONST(struct list_head **, d2));

	return (thunk->cmp(thunk->priv, le1, le2));
}
/*
 * Sort a linked list (Linux list_sort() semantics) by copying the
 * node pointers into a temporary array, qsort_r()'ing that array, and
 * rebuilding the list in sorted order.  Note this uses the historical
 * FreeBSD qsort_r() argument order (thunk before comparator).
 */
void
list_sort(void *priv, struct list_head *head, int (*cmp)(void *priv,
struct list_head *a, struct list_head *b))
{
struct list_sort_thunk thunk;
struct list_head **ar, *le;
size_t count, i;
count = 0;
list_for_each(le, head)
count++;
ar = malloc(sizeof(struct list_head *) * count, M_KMALLOC, M_WAITOK);
i = 0;
list_for_each(le, head)
ar[i++] = le;
thunk.cmp = cmp;
thunk.priv = priv;
qsort_r(ar, count, sizeof(struct list_head *), &thunk, linux_le_cmp);
/* relink the nodes in sorted order */
INIT_LIST_HEAD(head);
for (i = 0; i < count; i++)
list_add_tail(ar[i], head);
free(ar, M_KMALLOC);
}
/*
 * Interrupt trampoline: establish a Linux "current" context for the
 * ithread, then call the registered Linux IRQ handler.
 */
void
linux_irq_handler(void *ent)
{
	struct irq_ent *irqe = ent;

	linux_set_current(curthread);
	irqe->handler(irqe->irq, irqe->arg);
}
#if defined(__i386__) || defined(__amd64__)
/*
 * Write back and invalidate caches on all CPUs (Linux
 * wbinvd_on_all_cpus()); pmap_invalidate_cache() performs the
 * cross-CPU rendezvous internally.
 */
int
linux_wbinvd_on_all_cpus(void)
{
pmap_invalidate_cache();
return (0);
}
#endif
/*
 * Run "callback(data)" on every CPU via an SMP rendezvous and wait
 * for completion (Linux on_each_cpu() semantics).
 */
int
linux_on_each_cpu(void callback(void *), void *data)
{
smp_rendezvous(smp_no_rendezvous_barrier, callback,
smp_no_rendezvous_barrier, data);
return (0);
}
/*
 * Approximate Linux in_atomic(): report whether the current thread is
 * in a no-faulting (atomic) section.
 */
int
linux_in_atomic(void)
{
	int atomic;

	atomic = (curthread->td_pflags & TDP_NOFAULTING) != 0;
	return (atomic);
}
/*
 * Find a registered linux_cdev by name and major/minor pair.  Walks
 * the devsw's device list under dev_lock(); returns NULL when no
 * match exists.
 */
struct linux_cdev *
linux_find_cdev(const char *name, unsigned major, unsigned minor)
{
dev_t dev = MKDEV(major, minor);
struct cdev *cdev;
dev_lock();
LIST_FOREACH(cdev, &linuxcdevsw.d_devs, si_list) {
struct linux_cdev *ldev = cdev->si_drv1;
if (ldev->dev == dev &&
strcmp(kobject_name(&ldev->kobj), name) == 0) {
break;
}
}
dev_unlock();
return (cdev != NULL ? cdev->si_drv1 : NULL);
}
/*
 * Register "count" character devices starting at baseminor (Linux
 * __register_chrdev()).  Returns 0 or the first cdev_add() error.
 *
 * NOTE(review): on a mid-loop failure the already-added cdevs are not
 * rolled back here; the caller is expected to __unregister_chrdev().
 */
int
__register_chrdev(unsigned int major, unsigned int baseminor,
unsigned int count, const char *name,
const struct file_operations *fops)
{
struct linux_cdev *cdev;
int ret = 0;
int i;
for (i = baseminor; i < baseminor + count; i++) {
cdev = cdev_alloc();
cdev->ops = fops;
kobject_set_name(&cdev->kobj, name);
ret = cdev_add(cdev, makedev(major, i), 1);
if (ret != 0)
break;
}
return (ret);
}
/*
 * Like __register_chrdev(), but create the device nodes with explicit
 * owner, group and permissions via cdev_add_ext().
 *
 * NOTE(review): as above, no rollback of earlier minors on failure.
 */
int
__register_chrdev_p(unsigned int major, unsigned int baseminor,
unsigned int count, const char *name,
const struct file_operations *fops, uid_t uid,
gid_t gid, int mode)
{
struct linux_cdev *cdev;
int ret = 0;
int i;
for (i = baseminor; i < baseminor + count; i++) {
cdev = cdev_alloc();
cdev->ops = fops;
kobject_set_name(&cdev->kobj, name);
ret = cdev_add_ext(cdev, makedev(major, i), uid, gid, mode);
if (ret != 0)
break;
}
return (ret);
}
/*
 * Tear down the devices created by __register_chrdev(): look each
 * minor up by name and delete it if it exists.
 */
void
__unregister_chrdev(unsigned int major, unsigned int baseminor,
    unsigned int count, const char *name)
{
	struct linux_cdev *ldev;
	int i;

	for (i = baseminor; i < baseminor + count; i++) {
		ldev = linux_find_cdev(name, major, i);
		if (ldev == NULL)
			continue;
		cdev_del(ldev);
	}
}
/*
 * Print the current kernel stack trace (Linux dump_stack()); a no-op
 * unless the kernel is built with "options STACK".
 */
void
linux_dump_stack(void)
{
#ifdef STACK
struct stack st;
stack_zero(&st);
stack_save(&st);
stack_print(&st);
#endif
}
#if defined(__i386__) || defined(__amd64__)
bool linux_cpu_has_clflush;
#endif
/*
 * Boot-time initialization of the LinuxKPI compatibility layer: probe
 * CPU features, create the sysctl-backed /sys class and device
 * hierarchies, register the misc class, and set up the PCI lists and
 * the vmmap hash.
 */
static void
linux_compat_init(void *arg)
{
struct sysctl_oid *rootoid;
int i;
#if defined(__i386__) || defined(__amd64__)
linux_cpu_has_clflush = (cpu_feature & CPUID_CLFSH);
#endif
rw_init(&linux_vma_lock, "lkpi-vma-lock");
rootoid = SYSCTL_ADD_ROOT_NODE(NULL,
OID_AUTO, "sys", CTLFLAG_RD|CTLFLAG_MPSAFE, NULL, "sys");
kobject_init(&linux_class_root, &linux_class_ktype);
kobject_set_name(&linux_class_root, "class");
linux_class_root.oidp = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(rootoid),
OID_AUTO, "class", CTLFLAG_RD|CTLFLAG_MPSAFE, NULL, "class");
kobject_init(&linux_root_device.kobj, &linux_dev_ktype);
kobject_set_name(&linux_root_device.kobj, "device");
linux_root_device.kobj.oidp = SYSCTL_ADD_NODE(NULL,
SYSCTL_CHILDREN(rootoid), OID_AUTO, "device", CTLFLAG_RD, NULL,
"device");
linux_root_device.bsddev = root_bus;
linux_class_misc.name = "misc";
class_register(&linux_class_misc);
INIT_LIST_HEAD(&pci_drivers);
INIT_LIST_HEAD(&pci_devices);
spin_lock_init(&pci_lock);
mtx_init(&vmmaplock, "IO Map lock", NULL, MTX_DEF);
for (i = 0; i < VMMAP_HASH_SIZE; i++)
LIST_INIT(&vmmaphead[i]);
}
SYSINIT(linux_compat, SI_SUB_DRIVERS, SI_ORDER_SECOND, linux_compat_init, NULL);
/*
 * Module teardown: release the kobject names and destroy the locks
 * created in linux_compat_init().
 */
static void
linux_compat_uninit(void *arg)
{
linux_kobject_kfree_name(&linux_class_root);
linux_kobject_kfree_name(&linux_root_device.kobj);
linux_kobject_kfree_name(&linux_class_misc.kobj);
mtx_destroy(&vmmaplock);
spin_lock_destroy(&pci_lock);
rw_destroy(&linux_vma_lock);
}
SYSUNINIT(linux_compat, SI_SUB_DRIVERS, SI_ORDER_SECOND, linux_compat_uninit, NULL);
/*
* NOTE: Linux frequently uses "unsigned long" for pointer to integer
* conversion and vice versa, where in FreeBSD "uintptr_t" would be
* used. Assert these types have the same size, else some parts of the
* LinuxKPI may not work like expected:
*/
CTASSERT(sizeof(unsigned long) == sizeof(uintptr_t));
Index: head/sys/dev/drm2/ttm/ttm_bo_vm.c
===================================================================
--- head/sys/dev/drm2/ttm/ttm_bo_vm.c (revision 353538)
+++ head/sys/dev/drm2/ttm/ttm_bo_vm.c (revision 353539)
@@ -1,555 +1,555 @@
/**************************************************************************
*
* Copyright (c) 2006-2009 VMware, Inc., Palo Alto, CA., USA
* All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sub license, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice (including the
* next paragraph) shall be included in all copies or substantial portions
* of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*
**************************************************************************/
/*
* Authors: Thomas Hellstrom
*/
/*
* Copyright (c) 2013 The FreeBSD Foundation
* All rights reserved.
*
* Portions of this software were developed by Konstantin Belousov
* under sponsorship from the FreeBSD Foundation.
*/
#include
__FBSDID("$FreeBSD$");
#include "opt_vm.h"
#include
#include
#include
#include
#include
#include
#include
#define TTM_BO_VM_NUM_PREFAULT 16
RB_GENERATE(ttm_bo_device_buffer_objects, ttm_buffer_object, vm_rb,
ttm_bo_cmp_rb_tree_items);
/*
 * RB-tree comparator for buffer objects, ordering by the start offset
 * of their address-space nodes.
 */
int
ttm_bo_cmp_rb_tree_items(struct ttm_buffer_object *a,
    struct ttm_buffer_object *b)
{
	if (a->vm_node->start != b->vm_node->start)
		return ((a->vm_node->start < b->vm_node->start) ? -1 : 1);
	return (0);
}
/*
 * Find the buffer object whose address-space node covers the page
 * range [page_start, page_start + num_pages).  Walks the RB tree
 * manually, remembering the best (largest not-greater start) match,
 * and verifies the candidate actually spans the whole request.
 * Returns NULL when no object covers the range.  Caller holds the
 * bdev vm_lock.
 */
static struct ttm_buffer_object *ttm_bo_vm_lookup_rb(struct ttm_bo_device *bdev,
unsigned long page_start,
unsigned long num_pages)
{
unsigned long cur_offset;
struct ttm_buffer_object *bo;
struct ttm_buffer_object *best_bo = NULL;
bo = RB_ROOT(&bdev->addr_space_rb);
while (bo != NULL) {
cur_offset = bo->vm_node->start;
if (page_start >= cur_offset) {
best_bo = bo;
if (page_start == cur_offset)
break;
bo = RB_RIGHT(bo, vm_rb);
} else
bo = RB_LEFT(bo, vm_rb);
}
if (unlikely(best_bo == NULL))
return NULL;
/* reject candidates that do not cover the full requested range */
if (unlikely((best_bo->vm_node->start + best_bo->num_pages) <
(page_start + num_pages)))
return NULL;
return best_bo;
}
static int
ttm_bo_vm_fault(vm_object_t vm_obj, vm_ooffset_t offset,
int prot, vm_page_t *mres)
{
struct ttm_buffer_object *bo = vm_obj->handle;
struct ttm_bo_device *bdev = bo->bdev;
struct ttm_tt *ttm = NULL;
vm_page_t m, m1;
int ret;
int retval = VM_PAGER_OK;
struct ttm_mem_type_manager *man =
&bdev->man[bo->mem.mem_type];
vm_object_pip_add(vm_obj, 1);
if (*mres != NULL) {
(void)vm_page_remove(*mres);
}
retry:
VM_OBJECT_WUNLOCK(vm_obj);
m = NULL;
reserve:
ret = ttm_bo_reserve(bo, false, false, false, 0);
if (unlikely(ret != 0)) {
if (ret == -EBUSY) {
kern_yield(PRI_USER);
goto reserve;
}
}
if (bdev->driver->fault_reserve_notify) {
ret = bdev->driver->fault_reserve_notify(bo);
switch (ret) {
case 0:
break;
case -EBUSY:
case -ERESTARTSYS:
case -EINTR:
kern_yield(PRI_USER);
goto reserve;
default:
retval = VM_PAGER_ERROR;
goto out_unlock;
}
}
/*
* Wait for buffer data in transit, due to a pipelined
* move.
*/
mtx_lock(&bdev->fence_lock);
if (test_bit(TTM_BO_PRIV_FLAG_MOVING, &bo->priv_flags)) {
/*
* Here, the behavior differs between Linux and FreeBSD.
*
* On Linux, the wait is interruptible (3rd argument to
* ttm_bo_wait). There must be some mechanism to resume
* page fault handling, once the signal is processed.
*
* On FreeBSD, the wait is uninteruptible. This is not a
* problem as we can't end up with an unkillable process
* here, because the wait will eventually time out.
*
* An example of this situation is the Xorg process
* which uses SIGALRM internally. The signal could
* interrupt the wait, causing the page fault to fail
* and the process to receive SIGSEGV.
*/
ret = ttm_bo_wait(bo, false, false, false);
mtx_unlock(&bdev->fence_lock);
if (unlikely(ret != 0)) {
retval = VM_PAGER_ERROR;
goto out_unlock;
}
} else
mtx_unlock(&bdev->fence_lock);
ret = ttm_mem_io_lock(man, true);
if (unlikely(ret != 0)) {
retval = VM_PAGER_ERROR;
goto out_unlock;
}
ret = ttm_mem_io_reserve_vm(bo);
if (unlikely(ret != 0)) {
retval = VM_PAGER_ERROR;
goto out_io_unlock;
}
/*
* Strictly, we're not allowed to modify vma->vm_page_prot here,
* since the mmap_sem is only held in read mode. However, we
* modify only the caching bits of vma->vm_page_prot and
* consider those bits protected by
* the bo->mutex, as we should be the only writers.
* There shouldn't really be any readers of these bits except
* within vm_insert_mixed()? fork?
*
* TODO: Add a list of vmas to the bo, and change the
* vma->vm_page_prot when the object changes caching policy, with
* the correct locks held.
*/
if (!bo->mem.bus.is_iomem) {
/* Allocate all page at once, most common usage */
ttm = bo->ttm;
if (ttm->bdev->driver->ttm_tt_populate(ttm)) {
retval = VM_PAGER_ERROR;
goto out_io_unlock;
}
}
if (bo->mem.bus.is_iomem) {
m = PHYS_TO_VM_PAGE(bo->mem.bus.base + bo->mem.bus.offset +
offset);
KASSERT((m->flags & PG_FICTITIOUS) != 0,
("physical address %#jx not fictitious",
(uintmax_t)(bo->mem.bus.base + bo->mem.bus.offset
+ offset)));
pmap_page_set_memattr(m, ttm_io_prot(bo->mem.placement));
} else {
ttm = bo->ttm;
m = ttm->pages[OFF_TO_IDX(offset)];
if (unlikely(!m)) {
retval = VM_PAGER_ERROR;
goto out_io_unlock;
}
pmap_page_set_memattr(m,
(bo->mem.placement & TTM_PL_FLAG_CACHED) ?
VM_MEMATTR_WRITE_BACK : ttm_io_prot(bo->mem.placement));
}
VM_OBJECT_WLOCK(vm_obj);
if (vm_page_busy_acquire(m, VM_ALLOC_WAITFAIL) == 0) {
ttm_mem_io_unlock(man);
ttm_bo_unreserve(bo);
goto retry;
}
m1 = vm_page_lookup(vm_obj, OFF_TO_IDX(offset));
if (m1 == NULL) {
if (vm_page_insert(m, vm_obj, OFF_TO_IDX(offset))) {
vm_page_xunbusy(m);
VM_OBJECT_WUNLOCK(vm_obj);
vm_wait(vm_obj);
VM_OBJECT_WLOCK(vm_obj);
ttm_mem_io_unlock(man);
ttm_bo_unreserve(bo);
goto retry;
}
} else {
KASSERT(m == m1,
("inconsistent insert bo %p m %p m1 %p offset %jx",
bo, m, m1, (uintmax_t)offset));
}
- m->valid = VM_PAGE_BITS_ALL;
+ vm_page_valid(m);
if (*mres != NULL) {
KASSERT(*mres != m, ("losing %p %p", *mres, m));
vm_page_free(*mres);
}
*mres = m;
out_io_unlock1:
ttm_mem_io_unlock(man);
out_unlock1:
ttm_bo_unreserve(bo);
vm_object_pip_wakeup(vm_obj);
return (retval);
out_io_unlock:
VM_OBJECT_WLOCK(vm_obj);
goto out_io_unlock1;
out_unlock:
VM_OBJECT_WLOCK(vm_obj);
goto out_unlock1;
}
/*
 * Device-pager constructor for TTM objects; intentionally does not
 * take a reference (see comment below) and uses pager color 0.
 * Always succeeds.
 */
static int
ttm_bo_vm_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot,
vm_ooffset_t foff, struct ucred *cred, u_short *color)
{
/*
 * On Linux, a reference to the buffer object is acquired here.
 * The reason is that this function is not called when the
 * mmap() is initialized, but only when a process forks for
 * instance. Therefore on Linux, the reference on the bo is
 * acquired either in ttm_bo_mmap() or ttm_bo_vm_open(). It's
 * then released in ttm_bo_vm_close().
 *
 * Here, this function is called during mmap() initialization.
 * Thus, the reference acquired in ttm_bo_mmap_single() is
 * sufficient.
 */
*color = 0;
return (0);
}
static void
ttm_bo_vm_dtor(void *handle)
{
	struct ttm_buffer_object *bo;

	/* Drop the reference transferred to the pager by ttm_bo_mmap_single(). */
	bo = handle;
	ttm_bo_unref(&bo);
}
/* OBJT_MGTDEVICE pager methods backing mmap()ed TTM buffer objects. */
static struct cdev_pager_ops ttm_pager_ops = {
	.cdev_pg_fault = ttm_bo_vm_fault,
	.cdev_pg_ctor = ttm_bo_vm_ctor,
	.cdev_pg_dtor = ttm_bo_vm_dtor
};
/*
 * Back an mmap(2) of a TTM buffer object with an OBJT_MGTDEVICE pager
 * object.  Looks up the buffer object covering *offset under bdev->vm_lock,
 * takes a reference on it, and — on success — transfers that reference to
 * the pager handle (it is released by ttm_bo_vm_dtor()).  *offset is reset
 * to 0 and *obj_res receives the new VM object.
 *
 * Returns 0 on success or a negative errno (Linux convention, as in the
 * rest of the TTM code).
 */
int
ttm_bo_mmap_single(struct ttm_bo_device *bdev, vm_ooffset_t *offset, vm_size_t size,
    struct vm_object **obj_res, int nprot)
{
	struct ttm_bo_driver *driver;
	struct ttm_buffer_object *bo;
	struct vm_object *vm_obj;
	int ret;

	rw_wlock(&bdev->vm_lock);
	bo = ttm_bo_vm_lookup_rb(bdev, OFF_TO_IDX(*offset), OFF_TO_IDX(size));
	if (likely(bo != NULL))
		refcount_acquire(&bo->kref);
	rw_wunlock(&bdev->vm_lock);

	if (unlikely(bo == NULL)) {
		printf("[TTM] Could not find buffer object to map\n");
		return (-EINVAL);
	}

	driver = bo->bdev->driver;
	if (unlikely(!driver->verify_access)) {
		ret = -EPERM;
		goto out_unref;
	}
	ret = driver->verify_access(bo);
	if (unlikely(ret != 0))
		goto out_unref;

	vm_obj = cdev_pager_allocate(bo, OBJT_MGTDEVICE, &ttm_pager_ops,
	    size, nprot, 0, curthread->td_ucred);
	if (vm_obj == NULL) {
		ret = -EINVAL;
		goto out_unref;
	}
	/*
	 * Note: We're transferring the bo reference to vm_obj->handle here.
	 */
	*offset = 0;
	*obj_res = vm_obj;
	/* style(9): parenthesize return values, as elsewhere in this file. */
	return (0);
out_unref:
	ttm_bo_unref(&bo);
	return (ret);
}
/*
 * Revoke all user mappings of a buffer object: remove every resident page
 * from the object's cdev pager object so later accesses refault.  No-op if
 * the bo has no pager object (was never mmap()ed, or already torn down).
 */
void
ttm_bo_release_mmap(struct ttm_buffer_object *bo)
{
	vm_object_t vm_obj;
	vm_page_t m;
	int i;

	vm_obj = cdev_pager_lookup(bo);
	if (vm_obj == NULL)
		return;

	VM_OBJECT_WLOCK(vm_obj);
retry:
	for (i = 0; i < bo->num_pages; i++) {
		m = vm_page_lookup(vm_obj, i);
		if (m == NULL)
			continue;
		/*
		 * VM_ALLOC_WAITFAIL drops and reacquires the object lock if
		 * the page is busy, so the lookup must be restarted from
		 * index 0 — pages may have changed while unlocked.
		 */
		if (vm_page_busy_acquire(m, VM_ALLOC_WAITFAIL) == 0)
			goto retry;
		cdev_pager_free_page(vm_obj, m);
	}
	VM_OBJECT_WUNLOCK(vm_obj);

	/* Drop the reference taken by cdev_pager_lookup(). */
	vm_object_deallocate(vm_obj);
}
#if 0
/*
 * NOTE(review): the functions below are compiled out.  They retain their
 * Linux-derived interfaces (vm_area_struct, __user pointers, read_lock)
 * and appear to be kept only as a porting reference; they are not built.
 */
int ttm_fbdev_mmap(struct vm_area_struct *vma, struct ttm_buffer_object *bo)
{
	if (vma->vm_pgoff != 0)
		return -EACCES;

	vma->vm_ops = &ttm_bo_vm_ops;
	vma->vm_private_data = ttm_bo_reference(bo);
	vma->vm_flags |= VM_IO | VM_MIXEDMAP | VM_DONTEXPAND;
	return 0;
}

/*
 * Read/write "count" bytes at *f_pos of a buffer object found by device
 * offset, bouncing the data through a kernel mapping of the bo.
 */
ssize_t ttm_bo_io(struct ttm_bo_device *bdev, struct file *filp,
		  const char __user *wbuf, char __user *rbuf, size_t count,
		  loff_t *f_pos, bool write)
{
	struct ttm_buffer_object *bo;
	struct ttm_bo_driver *driver;
	struct ttm_bo_kmap_obj map;
	unsigned long dev_offset = (*f_pos >> PAGE_SHIFT);
	unsigned long kmap_offset;
	unsigned long kmap_end;
	unsigned long kmap_num;
	size_t io_size;
	unsigned int page_offset;
	char *virtual;
	int ret;
	bool no_wait = false;
	bool dummy;

	/* Look up and reference the bo covering the device offset. */
	read_lock(&bdev->vm_lock);
	bo = ttm_bo_vm_lookup_rb(bdev, dev_offset, 1);
	if (likely(bo != NULL))
		ttm_bo_reference(bo);
	read_unlock(&bdev->vm_lock);

	if (unlikely(bo == NULL))
		return -EFAULT;

	driver = bo->bdev->driver;
	if (unlikely(!driver->verify_access)) {
		ret = -EPERM;
		goto out_unref;
	}

	ret = driver->verify_access(bo, filp);
	if (unlikely(ret != 0))
		goto out_unref;

	kmap_offset = dev_offset - bo->vm_node->start;
	if (unlikely(kmap_offset >= bo->num_pages)) {
		ret = -EFBIG;
		goto out_unref;
	}

	/* Clamp the transfer to what remains of the object past *f_pos. */
	page_offset = *f_pos & ~PAGE_MASK;
	io_size = bo->num_pages - kmap_offset;
	io_size = (io_size << PAGE_SHIFT) - page_offset;
	if (count < io_size)
		io_size = count;

	kmap_end = (*f_pos + count - 1) >> PAGE_SHIFT;
	kmap_num = kmap_end - kmap_offset + 1;

	ret = ttm_bo_reserve(bo, true, no_wait, false, 0);
	switch (ret) {
	case 0:
		break;
	case -EBUSY:
		ret = -EAGAIN;
		goto out_unref;
	default:
		goto out_unref;
	}

	ret = ttm_bo_kmap(bo, kmap_offset, kmap_num, &map);
	if (unlikely(ret != 0)) {
		ttm_bo_unreserve(bo);
		goto out_unref;
	}

	virtual = ttm_kmap_obj_virtual(&map, &dummy);
	virtual += page_offset;

	if (write)
		ret = copy_from_user(virtual, wbuf, io_size);
	else
		ret = copy_to_user(rbuf, virtual, io_size);

	ttm_bo_kunmap(&map);
	ttm_bo_unreserve(bo);
	ttm_bo_unref(&bo);

	if (unlikely(ret != 0))
		return -EFBIG;

	*f_pos += io_size;
	return io_size;
out_unref:
	ttm_bo_unref(&bo);
	return ret;
}

/*
 * Same as ttm_bo_io(), but for a caller that already holds the bo and
 * addresses it from offset 0 (fbdev path); no lookup or access check.
 */
ssize_t ttm_bo_fbdev_io(struct ttm_buffer_object *bo, const char __user *wbuf,
			char __user *rbuf, size_t count, loff_t *f_pos,
			bool write)
{
	struct ttm_bo_kmap_obj map;
	unsigned long kmap_offset;
	unsigned long kmap_end;
	unsigned long kmap_num;
	size_t io_size;
	unsigned int page_offset;
	char *virtual;
	int ret;
	bool no_wait = false;
	bool dummy;

	kmap_offset = (*f_pos >> PAGE_SHIFT);
	if (unlikely(kmap_offset >= bo->num_pages))
		return -EFBIG;

	page_offset = *f_pos & ~PAGE_MASK;
	io_size = bo->num_pages - kmap_offset;
	io_size = (io_size << PAGE_SHIFT) - page_offset;
	if (count < io_size)
		io_size = count;

	kmap_end = (*f_pos + count - 1) >> PAGE_SHIFT;
	kmap_num = kmap_end - kmap_offset + 1;

	ret = ttm_bo_reserve(bo, true, no_wait, false, 0);
	switch (ret) {
	case 0:
		break;
	case -EBUSY:
		return -EAGAIN;
	default:
		return ret;
	}

	ret = ttm_bo_kmap(bo, kmap_offset, kmap_num, &map);
	if (unlikely(ret != 0)) {
		ttm_bo_unreserve(bo);
		return ret;
	}

	virtual = ttm_kmap_obj_virtual(&map, &dummy);
	virtual += page_offset;

	if (write)
		ret = copy_from_user(virtual, wbuf, io_size);
	else
		ret = copy_to_user(rbuf, virtual, io_size);

	ttm_bo_kunmap(&map);
	ttm_bo_unreserve(bo);
	ttm_bo_unref(&bo);

	if (unlikely(ret != 0))
		return ret;

	*f_pos += io_size;
	return io_size;
}
#endif
Index: head/sys/dev/drm2/ttm/ttm_tt.c
===================================================================
--- head/sys/dev/drm2/ttm/ttm_tt.c (revision 353538)
+++ head/sys/dev/drm2/ttm/ttm_tt.c (revision 353539)
@@ -1,360 +1,360 @@
/**************************************************************************
*
* Copyright (c) 2006-2009 VMware, Inc., Palo Alto, CA., USA
* All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sub license, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice (including the
* next paragraph) shall be included in all copies or substantial portions
* of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*
**************************************************************************/
/*
* Authors: Thomas Hellstrom
*/
/*
* Copyright (c) 2013 The FreeBSD Foundation
* All rights reserved.
*
* Portions of this software were developed by Konstantin Belousov
* under sponsorship from the FreeBSD Foundation.
*/
#include
__FBSDID("$FreeBSD$");
#include
#include
#include
#include
#include
MALLOC_DEFINE(M_TTM_PD, "ttm_pd", "TTM Page Directories");
/**
* Allocates storage for pointers to the pages that back the ttm.
*/
static void ttm_tt_alloc_page_directory(struct ttm_tt *ttm)
{

	/*
	 * One pointer slot per backing page.  M_WAITOK means the allocation
	 * cannot fail; M_ZERO leaves every slot NULL until populated.
	 */
	ttm->pages = malloc(ttm->num_pages * sizeof(void *), M_TTM_PD,
	    M_WAITOK | M_ZERO);
}
static void ttm_dma_tt_alloc_page_directory(struct ttm_dma_tt *ttm)
{
ttm->ttm.pages = malloc(ttm->ttm.num_pages * sizeof(void *),
M_TTM_PD, M_WAITOK | M_ZERO);
ttm->dma_address = malloc(ttm->ttm.num_pages *
sizeof(*ttm->dma_address), M_TTM_PD, M_WAITOK);
}
#if defined(__i386__) || defined(__amd64__)
/*
 * Set the kernel memory attribute of one page to match the requested TTM
 * caching state.  Always succeeds on x86.
 */
static inline int ttm_tt_set_page_caching(vm_page_t p,
					  enum ttm_caching_state c_old,
					  enum ttm_caching_state c_new)
{

	/* XXXKIB our VM does not need this. */
#if 0
	if (c_old != tt_cached) {
		/* p isn't in the default caching state, set it to
		 * writeback first to free its current memtype. */
		pmap_page_set_memattr(p, VM_MEMATTR_WRITE_BACK);
	}
#endif

	if (c_new == tt_wc)
		pmap_page_set_memattr(p, VM_MEMATTR_WRITE_COMBINING);
	else if (c_new == tt_uncached)
		pmap_page_set_memattr(p, VM_MEMATTR_UNCACHEABLE);
	/* tt_cached: the default write-back attribute is left in place. */

	return (0);
}
#else
/* Non-x86: page attributes are not adjusted; report success. */
static inline int ttm_tt_set_page_caching(vm_page_t p,
					  enum ttm_caching_state c_old,
					  enum ttm_caching_state c_new)
{
	return 0;
}
#endif
/*
* Change caching policy for the linear kernel map
* for range of pages in a ttm.
*/
/*
 * Switch every populated page of the ttm to caching state c_state,
 * rolling all pages back to the previous state if any page fails.
 * Returns 0 on success or the per-page error.
 */
static int ttm_tt_set_caching(struct ttm_tt *ttm,
			      enum ttm_caching_state c_state)
{
	int i, j;
	vm_page_t cur_page;
	int ret;

	if (ttm->caching_state == c_state)
		return 0;

	if (ttm->state == tt_unpopulated) {
		/* Change caching but don't populate */
		ttm->caching_state = c_state;
		return 0;
	}

	/* Leaving the cached state: flush CPU caches before retyping pages. */
	if (ttm->caching_state == tt_cached)
		drm_clflush_pages(ttm->pages, ttm->num_pages);

	for (i = 0; i < ttm->num_pages; ++i) {
		cur_page = ttm->pages[i];
		if (likely(cur_page != NULL)) {
			ret = ttm_tt_set_page_caching(cur_page,
						      ttm->caching_state,
						      c_state);
			if (unlikely(ret != 0))
				goto out_err;
		}
	}

	ttm->caching_state = c_state;
	return 0;

out_err:
	/* Undo: restore pages [0, i) to the previous caching state. */
	for (j = 0; j < i; ++j) {
		cur_page = ttm->pages[j];
		if (cur_page != NULL) {
			(void)ttm_tt_set_page_caching(cur_page, c_state,
						      ttm->caching_state);
		}
	}
	return ret;
}
/*
 * Translate TTM placement flags into a caching state and apply it;
 * a placement with neither WC nor UNCACHED set means fully cached.
 */
int ttm_tt_set_placement_caching(struct ttm_tt *ttm, uint32_t placement)
{

	return ttm_tt_set_caching(ttm,
	    (placement & TTM_PL_FLAG_WC) != 0 ? tt_wc :
	    (placement & TTM_PL_FLAG_UNCACHED) != 0 ? tt_uncached :
	    tt_cached);
}
/*
 * Release a ttm: unbind it if bound, unpopulate its pages, drop any
 * non-persistent swap object, then call the backend destructor.
 */
void ttm_tt_destroy(struct ttm_tt *ttm)
{

	if (unlikely(ttm == NULL))
		return;

	/* Tear down in reverse order of setup. */
	if (ttm->state == tt_bound)
		ttm_tt_unbind(ttm);

	if (likely(ttm->pages != NULL))
		ttm->bdev->driver->ttm_tt_unpopulate(ttm);

	if (ttm->swap_storage &&
	    !(ttm->page_flags & TTM_PAGE_FLAG_PERSISTENT_SWAP))
		vm_object_deallocate(ttm->swap_storage);
	ttm->swap_storage = NULL;

	ttm->func->destroy(ttm);
}
/*
 * Initialize a ttm's soft state for a mapping of "size" bytes and allocate
 * its page directory.  Returns 0 or -ENOMEM.
 */
int ttm_tt_init(struct ttm_tt *ttm, struct ttm_bo_device *bdev,
		unsigned long size, uint32_t page_flags,
		vm_page_t dummy_read_page)
{

	ttm->bdev = bdev;
	ttm->glob = bdev->glob;
	ttm->num_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	ttm->caching_state = tt_cached;
	ttm->page_flags = page_flags;
	ttm->dummy_read_page = dummy_read_page;
	ttm->state = tt_unpopulated;
	ttm->swap_storage = NULL;

	ttm_tt_alloc_page_directory(ttm);
	if (ttm->pages == NULL) {
		ttm_tt_destroy(ttm);
		printf("Failed allocating page table\n");
		return -ENOMEM;
	}
	return 0;
}
/* Release the page directory allocated by ttm_tt_alloc_page_directory(). */
void ttm_tt_fini(struct ttm_tt *ttm)
{
	free(ttm->pages, M_TTM_PD);
	ttm->pages = NULL;	/* guard against stale use after free */
}
/*
 * Initialize a DMA-capable ttm: same soft state as ttm_tt_init() plus the
 * pages list and the per-page DMA address array.  Returns 0 or -ENOMEM.
 */
int ttm_dma_tt_init(struct ttm_dma_tt *ttm_dma, struct ttm_bo_device *bdev,
		    unsigned long size, uint32_t page_flags,
		    vm_page_t dummy_read_page)
{
	struct ttm_tt *ttm = &ttm_dma->ttm;

	ttm->bdev = bdev;
	ttm->glob = bdev->glob;
	ttm->num_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	ttm->caching_state = tt_cached;
	ttm->page_flags = page_flags;
	ttm->dummy_read_page = dummy_read_page;
	ttm->state = tt_unpopulated;
	ttm->swap_storage = NULL;

	INIT_LIST_HEAD(&ttm_dma->pages_list);
	ttm_dma_tt_alloc_page_directory(ttm_dma);
	if (ttm->pages == NULL || ttm_dma->dma_address == NULL) {
		ttm_tt_destroy(ttm);
		printf("Failed allocating page table\n");
		return -ENOMEM;
	}
	return 0;
}
/* Release both arrays allocated by ttm_dma_tt_alloc_page_directory(). */
void ttm_dma_tt_fini(struct ttm_dma_tt *ttm_dma)
{
	struct ttm_tt *ttm = &ttm_dma->ttm;

	free(ttm->pages, M_TTM_PD);
	ttm->pages = NULL;
	free(ttm_dma->dma_address, M_TTM_PD);
	ttm_dma->dma_address = NULL;	/* guard against stale use after free */
}
/* Unbind the ttm from the GPU if bound; a no-op otherwise. */
void ttm_tt_unbind(struct ttm_tt *ttm)
{
	int ret;

	if (ttm->state != tt_bound)
		return;

	ret = ttm->func->unbind(ttm);
	MPASS(ret == 0);
	ttm->state = tt_unbound;
}
int ttm_tt_bind(struct ttm_tt *ttm, struct ttm_mem_reg *bo_mem)
{
int ret = 0;
if (!ttm)
return -EINVAL;
if (ttm->state == tt_bound)
return 0;
ret = ttm->bdev->driver->ttm_tt_populate(ttm);
if (ret)
return ret;
ret = ttm->func->bind(ttm, bo_mem);
if (unlikely(ret != 0))
return ret;
ttm->state = tt_bound;
return 0;
}
/*
 * Copy the ttm's contents back from its swap object into the (already
 * populated) ttm->pages array, then drop the swap object unless it is
 * persistent.  Returns 0, -EIO on pager failure, or -ENOMEM if a
 * destination page is missing.
 */
int ttm_tt_swapin(struct ttm_tt *ttm)
{
	vm_object_t obj;
	vm_page_t from_page, to_page;
	int i, ret, rv;

	obj = ttm->swap_storage;

	VM_OBJECT_WLOCK(obj);
	/* pip reference keeps the object from being terminated mid-copy. */
	vm_object_pip_add(obj, 1);
	for (i = 0; i < ttm->num_pages; ++i) {
		rv = vm_page_grab_valid(&from_page, obj, i,
		    VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY);
		if (rv != VM_PAGER_OK) {
			ret = -EIO;
			goto err_ret;
		}
		to_page = ttm->pages[i];
		if (unlikely(to_page == NULL)) {
			ret = -ENOMEM;
			goto err_ret;
		}
		pmap_copy_page(from_page, to_page);
	}
	vm_object_pip_wakeup(obj);
	VM_OBJECT_WUNLOCK(obj);

	if (!(ttm->page_flags & TTM_PAGE_FLAG_PERSISTENT_SWAP))
		vm_object_deallocate(obj);
	ttm->swap_storage = NULL;
	ttm->page_flags &= ~TTM_PAGE_FLAG_SWAPPED;
	return (0);

err_ret:
	vm_object_pip_wakeup(obj);
	VM_OBJECT_WUNLOCK(obj);
	return (ret);
}
/*
 * Copy the ttm's resident pages into a swap-backed VM object (either a
 * caller-supplied persistent one or a freshly allocated one), unpopulate
 * the ttm, and record the object in ttm->swap_storage.  Returns 0 or
 * -ENOMEM.  The ttm must be unbound and fully cached.
 */
int ttm_tt_swapout(struct ttm_tt *ttm, vm_object_t persistent_swap_storage)
{
	vm_object_t obj;
	vm_page_t from_page, to_page;
	int i;

	MPASS(ttm->state == tt_unbound || ttm->state == tt_unpopulated);
	MPASS(ttm->caching_state == tt_cached);

	if (persistent_swap_storage == NULL) {
		obj = vm_pager_allocate(OBJT_SWAP, NULL,
		    IDX_TO_OFF(ttm->num_pages), VM_PROT_DEFAULT, 0,
		    curthread->td_ucred);
		if (obj == NULL) {
			printf("[TTM] Failed allocating swap storage\n");
			return (-ENOMEM);
		}
	} else
		obj = persistent_swap_storage;

	VM_OBJECT_WLOCK(obj);
	/* pip reference keeps the object from being terminated mid-copy. */
	vm_object_pip_add(obj, 1);
	for (i = 0; i < ttm->num_pages; ++i) {
		from_page = ttm->pages[i];
		if (unlikely(from_page == NULL))
			continue;
		to_page = vm_page_grab(obj, i, VM_ALLOC_NORMAL);
		pmap_copy_page(from_page, to_page);
		/* Mark the copy fully valid and dirty so it reaches swap. */
		vm_page_valid(to_page);
		vm_page_dirty(to_page);
		vm_page_xunbusy(to_page);
	}
	vm_object_pip_wakeup(obj);
	VM_OBJECT_WUNLOCK(obj);

	ttm->bdev->driver->ttm_tt_unpopulate(ttm);
	ttm->swap_storage = obj;
	ttm->page_flags |= TTM_PAGE_FLAG_SWAPPED;
	if (persistent_swap_storage != NULL)
		ttm->page_flags |= TTM_PAGE_FLAG_PERSISTENT_SWAP;

	return (0);
}
Index: head/sys/dev/md/md.c
===================================================================
--- head/sys/dev/md/md.c (revision 353538)
+++ head/sys/dev/md/md.c (revision 353539)
@@ -1,2182 +1,2182 @@
/*-
* SPDX-License-Identifier: (Beerware AND BSD-3-Clause)
*
* ----------------------------------------------------------------------------
* "THE BEER-WARE LICENSE" (Revision 42):
* <phk@FreeBSD.ORG> wrote this file. As long as you retain this notice you
* can do whatever you want with this stuff. If we meet some day, and you think
* this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp
* ----------------------------------------------------------------------------
*
* $FreeBSD$
*
*/
/*-
* The following functions are based in the vn(4) driver: mdstart_swap(),
* mdstart_vnode(), mdcreate_swap(), mdcreate_vnode() and mddestroy(),
* and as such under the following copyright:
*
* Copyright (c) 1988 University of Utah.
* Copyright (c) 1990, 1993
* The Regents of the University of California. All rights reserved.
* Copyright (c) 2013 The FreeBSD Foundation
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* the Systems Programming Group of the University of Utah Computer
* Science Department.
*
* Portions of this software were developed by Konstantin Belousov
* under sponsorship from the FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: Utah Hdr: vn.c 1.13 94/04/02
*
* from: @(#)vn.c 8.6 (Berkeley) 4/1/94
* From: src/sys/dev/vn/vn.c,v 1.122 2000/12/16 16:06:03
*/
#include "opt_rootdevname.h"
#include "opt_geom.h"
#include "opt_md.h"
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#define MD_MODVER 1
#define MD_SHUTDOWN 0x10000 /* Tell worker thread to terminate. */
#define MD_EXITING 0x20000 /* Worker thread is exiting. */
#define MD_PROVIDERGONE 0x40000 /* Safe to free the softc */
#ifndef MD_NSECT
#define MD_NSECT (10000 * 2)
#endif
/*
 * Decoded parameters of an md(4) control request, shared by the native
 * and COMPAT_FREEBSD32 ioctl paths — presumably filled from struct
 * md_ioctl (see mdctlioctl; full path not visible here).
 */
struct md_req {
	unsigned	md_unit;	/* unit number */
	enum md_types	md_type;	/* type of disk */
	off_t		md_mediasize;	/* size of disk in bytes */
	unsigned	md_sectorsize;	/* sectorsize */
	unsigned	md_options;	/* options */
	int		md_fwheads;	/* firmware heads */
	int		md_fwsectors;	/* firmware sectors */
	char		*md_file;	/* pathname of file to mount */
	enum uio_seg	md_file_seg;	/* location of md_file */
	char		*md_label;	/* label of the device (userspace) */
	int		*md_units;	/* pointer to units array (kernel) */
	size_t		md_units_nitems; /* items in md_units array */
};
#ifdef COMPAT_FREEBSD32
/*
 * 32-bit ABI layout of the md ioctl argument: pointers shrink to 32-bit
 * handles and the struct is packed so field offsets match 32-bit userland.
 */
struct md_ioctl32 {
	unsigned	md_version;
	unsigned	md_unit;
	enum md_types	md_type;
	uint32_t	md_file;	/* 32-bit userland pointer */
	off_t		md_mediasize;
	unsigned	md_sectorsize;
	unsigned	md_options;
	uint64_t	md_base;
	int		md_fwheads;
	int		md_fwsectors;
	uint32_t	md_label;	/* 32-bit userland pointer */
	int		md_pad[MDNPAD];
} __attribute__((__packed__));
/* Compile-time check that the packed layout matches the 32-bit ABI size. */
CTASSERT((sizeof(struct md_ioctl32)) == 436);

/* 32-bit flavors of the control ioctls, differing only in argument type. */
#define	MDIOCATTACH_32	_IOC_NEWTYPE(MDIOCATTACH, struct md_ioctl32)
#define	MDIOCDETACH_32	_IOC_NEWTYPE(MDIOCDETACH, struct md_ioctl32)
#define	MDIOCQUERY_32	_IOC_NEWTYPE(MDIOCQUERY, struct md_ioctl32)
#define	MDIOCRESIZE_32	_IOC_NEWTYPE(MDIOCRESIZE, struct md_ioctl32)
#endif /* COMPAT_FREEBSD32 */
static MALLOC_DEFINE(M_MD, "md_disk", "Memory Disk");
static MALLOC_DEFINE(M_MDSECT, "md_sectors", "Memory Disk Sectors");
static int md_debug;
SYSCTL_INT(_debug, OID_AUTO, mddebug, CTLFLAG_RW, &md_debug, 0,
"Enable md(4) debug messages");
static int md_malloc_wait;
SYSCTL_INT(_vm, OID_AUTO, md_malloc_wait, CTLFLAG_RW, &md_malloc_wait, 0,
"Allow malloc to wait for memory allocations");
#if defined(MD_ROOT) && !defined(MD_ROOT_FSTYPE)
#define MD_ROOT_FSTYPE "ufs"
#endif
#if defined(MD_ROOT)
/*
* Preloaded image gets put here.
*/
#if defined(MD_ROOT_SIZE)
/*
* We put the mfs_root symbol into the oldmfs section of the kernel object file.
* Applications that patch the object with the image can determine
* the size looking at the oldmfs section size within the kernel.
*/
u_char mfs_root[MD_ROOT_SIZE*1024] __attribute__ ((section ("oldmfs")));
const int mfs_root_size = sizeof(mfs_root);
#elif defined(MD_ROOT_MEM)
/* MD region already mapped in the memory */
u_char *mfs_root;
int mfs_root_size;
#else
extern volatile u_char __weak_symbol mfs_root;
extern volatile u_char __weak_symbol mfs_root_end;
__GLOBL(mfs_root);
__GLOBL(mfs_root_end);
#define mfs_root_size ((uintptr_t)(&mfs_root_end - &mfs_root))
#endif
#endif
static g_init_t g_md_init;
static g_fini_t g_md_fini;
static g_start_t g_md_start;
static g_access_t g_md_access;
static void g_md_dumpconf(struct sbuf *sb, const char *indent,
struct g_geom *gp, struct g_consumer *cp __unused, struct g_provider *pp);
static g_provgone_t g_md_providergone;
static struct cdev *status_dev = NULL;
static struct sx md_sx;
static struct unrhdr *md_uh;
static d_ioctl_t mdctlioctl;
static struct cdevsw mdctl_cdevsw = {
.d_version = D_VERSION,
.d_ioctl = mdctlioctl,
.d_name = MD_NAME,
};
struct g_class g_md_class = {
.name = "MD",
.version = G_VERSION,
.init = g_md_init,
.fini = g_md_fini,
.start = g_md_start,
.access = g_md_access,
.dumpconf = g_md_dumpconf,
.providergone = g_md_providergone,
};
DECLARE_GEOM_CLASS(g_md_class, g_md);
static LIST_HEAD(, md_s) md_softc_list = LIST_HEAD_INITIALIZER(md_softc_list);
#define NINDIR (PAGE_SIZE / sizeof(uintptr_t))
#define NMASK (NINDIR-1)
static int nshift;
static uma_zone_t md_pbuf_zone;
/*
 * One level of the radix tree used by MD_MALLOC-backed devices: "array"
 * holds child-node pointers on interior levels (shift != 0) and per-sector
 * values on leaves (shift == 0).
 */
struct indir {
	uintptr_t	*array;
	u_int		total;	/* slots allocated in array (NINDIR) */
	u_int		used;	/* non-zero slots; drives pruning in s_write() */
	u_int		shift;	/* bit position of this level's index */
};
/* Per-unit softc; linked on md_softc_list. */
struct md_s {
	int unit;
	LIST_ENTRY(md_s) list;
	struct bio_queue_head bio_queue;
	struct mtx queue_mtx;	/* protects bio_queue; see g_md_start() */
	struct mtx stat_mtx;	/* serializes devstat updates */
	struct cdev *dev;
	enum md_types type;
	off_t mediasize;
	unsigned sectorsize;
	unsigned opencount;
	unsigned fwheads;
	unsigned fwsectors;
	char ident[32];
	unsigned flags;
	char name[20];
	struct proc *procp;	/* worker process servicing bio_queue */
	struct g_geom *gp;
	struct g_provider *pp;
	int (*start)(struct md_s *sc, struct bio *bp); /* type-specific I/O */
	struct devstat *devstat;

	/* MD_MALLOC related fields */
	struct indir *indir;	/* radix tree of sector values */
	uma_zone_t uma;		/* zone for per-sector buffers */

	/* MD_PRELOAD related fields */
	u_char *pl_ptr;
	size_t pl_len;

	/* MD_VNODE related fields */
	struct vnode *vnode;
	char file[PATH_MAX];
	char label[PATH_MAX];
	struct ucred *cred;

	/* MD_SWAP related fields */
	vm_object_t object;
};
/*
 * Allocate one zeroed radix-tree node whose index field sits at bit
 * position "shift".  Returns NULL on allocation failure (only possible
 * when md_malloc_wait is off).
 */
static struct indir *
new_indir(u_int shift)
{
	struct indir *ip;
	int flags;

	flags = (md_malloc_wait ? M_WAITOK : M_NOWAIT) | M_ZERO;
	ip = malloc(sizeof(*ip), M_MD, flags);
	if (ip == NULL)
		return (NULL);
	ip->array = malloc(sizeof(uintptr_t) * NINDIR, M_MDSECT, flags);
	if (ip->array == NULL) {
		free(ip, M_MD);
		return (NULL);
	}
	ip->total = NINDIR;
	ip->shift = shift;
	return (ip);
}
/* Free a single radix-tree node (the slot array, then the node itself). */
static void
del_indir(struct indir *ip)
{
	free(ip->array, M_MDSECT);
	free(ip, M_MD);
}
/* Recursively free a radix subtree, returning sector buffers to the zone. */
static void
destroy_indir(struct md_s *sc, struct indir *ip)
{
	int i;

	for (i = 0; i < NINDIR; i++) {
		if (!ip->array[i])
			continue;
		if (ip->shift)
			/* Interior node: recurse into the child level. */
			destroy_indir(sc, (struct indir*)(ip->array[i]));
		else if (ip->array[i] > 255)
			/*
			 * Leaf values <= 255 encode a constant fill byte,
			 * not a pointer (see mdstart_malloc()), so only
			 * real sector buffers are freed here.
			 */
			uma_zfree(sc->uma, (void *)(ip->array[i]));
	}
	del_indir(ip);
}
/*
 * Compute the number of NINDIR-way radix levels needed for a device of
 * "size" sectors and allocate the root node.
 */
static struct indir *
dimension(off_t size)
{
	struct indir *ip;
	off_t remaining;
	int layer;

	for (layer = 0, remaining = size; remaining > NINDIR;
	    remaining /= NINDIR)
		layer++;

	/*
	 * XXX: the top layer is probably not fully populated, so we allocate
	 * too much space for ip->array in here.
	 */
	ip = malloc(sizeof *ip, M_MD, M_WAITOK | M_ZERO);
	ip->array = malloc(sizeof(uintptr_t) * NINDIR, M_MDSECT,
	    M_WAITOK | M_ZERO);
	ip->total = NINDIR;
	ip->shift = layer * nshift;
	return (ip);
}
/*
 * Look up the value stored for a sector: walk interior levels by the
 * per-level index bits and return the leaf slot, or 0 if any level on
 * the path is missing (never-written sector).
 */
static uintptr_t
s_read(struct indir *ip, off_t offset)
{
	struct indir *node;
	int slot;

	if (md_debug > 1)
		printf("s_read(%jd)\n", (intmax_t)offset);

	for (node = ip; node != NULL;
	    node = (struct indir *)node->array[slot]) {
		if (node->shift == 0) {
			/* Leaf: the slot holds the sector value itself. */
			return (node->array[offset & NMASK]);
		}
		slot = (offset >> node->shift) & NMASK;
	}
	return (0);
}
/*
 * Write a given sector, prune the tree if the value is 0
 */
static int
s_write(struct indir *ip, off_t offset, uintptr_t ptr)
{
	struct indir *cip, *lip[10];	/* lip[] records the root-to-leaf path */
	int idx, li;
	uintptr_t up;

	if (md_debug > 1)
		printf("s_write(%jd, %p)\n", (intmax_t)offset, (void *)ptr);
	up = 0;
	li = 0;
	cip = ip;
	for (;;) {
		lip[li++] = cip;
		if (cip->shift) {
			/* Interior node: descend, allocating on demand. */
			idx = (offset >> cip->shift) & NMASK;
			up = cip->array[idx];
			if (up != 0) {
				cip = (struct indir *)up;
				continue;
			}
			/* Allocate branch */
			cip->array[idx] =
			    (uintptr_t)new_indir(cip->shift - nshift);
			if (cip->array[idx] == 0)
				return (ENOSPC);
			cip->used++;
			up = cip->array[idx];
			cip = (struct indir *)up;
			continue;
		}
		/* leafnode */
		idx = offset & NMASK;
		up = cip->array[idx];
		if (up != 0)
			cip->used--;
		cip->array[idx] = ptr;
		if (ptr != 0)
			cip->used++;
		break;
	}
	/* If the leaf is still in use (or is the root), no pruning needed. */
	if (cip->used != 0 || li == 1)
		return (0);
	li--;
	/* Walk back up the recorded path, freeing nodes that became empty. */
	while (cip->used == 0 && cip != ip) {
		li--;
		idx = (offset >> lip[li]->shift) & NMASK;
		up = lip[li]->array[idx];
		KASSERT(up == (uintptr_t)cip, ("md screwed up"));
		del_indir(cip);
		lip[li]->array[idx] = 0;
		lip[li]->used--;
		cip = lip[li];
	}
	return (0);
}
/*
 * GEOM access method: track whether the provider is open and refuse
 * write access to read-only units.
 */
static int
g_md_access(struct g_provider *pp, int r, int w, int e)
{
	struct md_s *sc;

	sc = pp->geom->softc;
	if (sc == NULL) {
		/* Unit already destroyed: only pure close requests succeed. */
		return ((r <= 0 && w <= 0 && e <= 0) ? 0 : ENXIO);
	}

	/* Fold the deltas into the resulting absolute counts. */
	r += pp->acr;
	w += pp->acw;
	e += pp->ace;

	if (w > 0 && (sc->flags & MD_READONLY) != 0)
		return (EROFS);

	if ((pp->acr + pp->acw + pp->ace) == 0 && (r + w + e) > 0)
		sc->opencount = 1;	/* first open */
	else if ((pp->acr + pp->acw + pp->ace) > 0 && (r + w + e) == 0)
		sc->opencount = 0;	/* last close */

	return (0);
}
/* GEOM start method: queue the bio for the unit's worker and wake it. */
static void
g_md_start(struct bio *bp)
{
	struct md_s *sc;

	sc = bp->bio_to->geom->softc;
	if ((bp->bio_cmd == BIO_READ) || (bp->bio_cmd == BIO_WRITE)) {
		/* devstat accounting only for actual data transfers. */
		mtx_lock(&sc->stat_mtx);
		devstat_start_transaction_bio(sc->devstat, bp);
		mtx_unlock(&sc->stat_mtx);
	}
	mtx_lock(&sc->queue_mtx);
	bioq_disksort(&sc->bio_queue, bp);
	/* wakeup(sc) pairs with the worker's sleep on the softc. */
	wakeup(sc);
	mtx_unlock(&sc->queue_mtx);
}
/* Operations for md_malloc_move_ma() / md_malloc_move_vlist(). */
#define	MD_MALLOC_MOVE_ZERO	1	/* zero-fill the destination */
#define	MD_MALLOC_MOVE_FILL	2	/* fill with a single byte value */
#define	MD_MALLOC_MOVE_READ	3	/* sector storage -> bio pages */
#define	MD_MALLOC_MOVE_WRITE	4	/* bio pages -> sector storage */
#define	MD_MALLOC_MOVE_CMP	5	/* test if all bytes are identical */

/*
 * Perform "op" over "sectorsize" bytes of an unmapped bio's page array,
 * advancing *mp/*ma_offs as it goes.  Pages are touched through transient
 * per-CPU sf_bufs, hence the sched_pin() around the loop.  For
 * MD_MALLOC_MOVE_CMP, the first byte is stored through "ptr" (an off_t *)
 * and EDOOFUS is returned if any later byte differs; the cursor is
 * restored on that failure so the caller can redo the I/O.
 */
static int
md_malloc_move_ma(vm_page_t **mp, int *ma_offs, unsigned sectorsize,
    void *ptr, u_char fill, int op)
{
	struct sf_buf *sf;
	vm_page_t m, *mp1;
	char *p, first;
	off_t *uc;
	unsigned n;
	int error, i, ma_offs1, sz, first_read;

	m = NULL;
	error = 0;
	sf = NULL;
	/* if (op == MD_MALLOC_MOVE_CMP) { gcc */
	first = 0;
	first_read = 0;
	uc = ptr;
	mp1 = *mp;
	ma_offs1 = *ma_offs;
	/* } */
	sched_pin();
	for (n = sectorsize; n != 0; n -= sz) {
		sz = imin(PAGE_SIZE - *ma_offs, n);
		if (m != **mp) {
			/* Crossed into a new page: remap the sf_buf. */
			if (sf != NULL)
				sf_buf_free(sf);
			m = **mp;
			sf = sf_buf_alloc(m, SFB_CPUPRIVATE |
			    (md_malloc_wait ? 0 : SFB_NOWAIT));
			if (sf == NULL) {
				error = ENOMEM;
				break;
			}
		}
		p = (char *)sf_buf_kva(sf) + *ma_offs;
		switch (op) {
		case MD_MALLOC_MOVE_ZERO:
			bzero(p, sz);
			break;
		case MD_MALLOC_MOVE_FILL:
			memset(p, fill, sz);
			break;
		case MD_MALLOC_MOVE_READ:
			bcopy(ptr, p, sz);
			cpu_flush_dcache(p, sz);
			break;
		case MD_MALLOC_MOVE_WRITE:
			bcopy(p, ptr, sz);
			break;
		case MD_MALLOC_MOVE_CMP:
			for (i = 0; i < sz; i++, p++) {
				if (!first_read) {
					*uc = (u_char)*p;
					first = *p;
					first_read = 1;
				} else if (*p != first) {
					error = EDOOFUS;
					break;
				}
			}
			break;
		default:
			KASSERT(0, ("md_malloc_move_ma unknown op %d\n", op));
			break;
		}
		if (error != 0)
			break;
		*ma_offs += sz;
		*ma_offs %= PAGE_SIZE;
		if (*ma_offs == 0)
			(*mp)++;
		ptr = (char *)ptr + sz;
	}

	if (sf != NULL)
		sf_buf_free(sf);
	sched_unpin();
	if (op == MD_MALLOC_MOVE_CMP && error != 0) {
		/* Not all bytes equal: rewind so the caller can redo I/O. */
		*mp = mp1;
		*ma_offs = ma_offs1;
	}
	return (error);
}
/*
 * Vector-list counterpart of md_malloc_move_ma(): perform "op" over "len"
 * bytes of a BIO_VLIST request, advancing *pvlist/*pma_offs.  Segments are
 * already kernel-addressable, so no transient mapping is needed.  For
 * MD_MALLOC_MOVE_CMP the cursor is NOT rewound on EDOOFUS (the ma variant
 * rewinds; callers of the vlist path redo from their own saved state).
 */
static int
md_malloc_move_vlist(bus_dma_segment_t **pvlist, int *pma_offs,
    unsigned len, void *ptr, u_char fill, int op)
{
	bus_dma_segment_t *vlist;
	uint8_t *p, *end, first;
	off_t *uc;
	int ma_offs, seg_len;

	vlist = *pvlist;
	ma_offs = *pma_offs;
	uc = ptr;

	for (; len != 0; len -= seg_len) {
		seg_len = imin(vlist->ds_len - ma_offs, len);
		p = (uint8_t *)(uintptr_t)vlist->ds_addr + ma_offs;
		switch (op) {
		case MD_MALLOC_MOVE_ZERO:
			bzero(p, seg_len);
			break;
		case MD_MALLOC_MOVE_FILL:
			memset(p, fill, seg_len);
			break;
		case MD_MALLOC_MOVE_READ:
			bcopy(ptr, p, seg_len);
			cpu_flush_dcache(p, seg_len);
			break;
		case MD_MALLOC_MOVE_WRITE:
			bcopy(p, ptr, seg_len);
			break;
		case MD_MALLOC_MOVE_CMP:
			end = p + seg_len;
			first = *uc = *p;
			/* Confirm all following bytes match the first */
			while (++p < end) {
				if (*p != first)
					return (EDOOFUS);
			}
			break;
		default:
			KASSERT(0, ("md_malloc_move_vlist unknown op %d\n", op));
			break;
		}

		ma_offs += seg_len;
		if (ma_offs == vlist->ds_len) {
			ma_offs = 0;
			vlist++;
		}
		ptr = (uint8_t *)ptr + seg_len;
	}
	*pvlist = vlist;
	*pma_offs = ma_offs;

	return (0);
}
/*
 * I/O strategy for MD_MALLOC units.  Sector values in the radix tree are
 * encoded three ways: 0 = never written (reads as zeros), 1..255 = every
 * byte of the sector equals that value (compression), > 255 = pointer to a
 * UMA-allocated sector buffer.  Handles mapped, unmapped (bio_ma), and
 * BIO_VLIST requests.  Returns 0 or an errno.
 */
static int
mdstart_malloc(struct md_s *sc, struct bio *bp)
{
	u_char *dst;
	vm_page_t *m;
	bus_dma_segment_t *vlist;
	int i, error, error1, ma_offs, notmapped;
	off_t secno, nsec, uc;
	uintptr_t sp, osp;

	switch (bp->bio_cmd) {
	case BIO_READ:
	case BIO_WRITE:
	case BIO_DELETE:
		break;
	default:
		return (EOPNOTSUPP);
	}

	notmapped = (bp->bio_flags & BIO_UNMAPPED) != 0;
	vlist = (bp->bio_flags & BIO_VLIST) != 0 ?
	    (bus_dma_segment_t *)bp->bio_data : NULL;
	if (notmapped) {
		m = bp->bio_ma;
		ma_offs = bp->bio_ma_offset;
		dst = NULL;
		KASSERT(vlist == NULL, ("vlists cannot be unmapped"));
	} else if (vlist != NULL) {
		ma_offs = bp->bio_ma_offset;
		dst = NULL;
	} else {
		dst = bp->bio_data;
	}

	nsec = bp->bio_length / sc->sectorsize;
	secno = bp->bio_offset / sc->sectorsize;
	error = 0;
	while (nsec--) {
		osp = s_read(sc->indir, secno);
		if (bp->bio_cmd == BIO_DELETE) {
			/* Drop the sector; freed below via osp. */
			if (osp != 0)
				error = s_write(sc->indir, secno, 0);
		} else if (bp->bio_cmd == BIO_READ) {
			if (osp == 0) {
				/* Unwritten sector reads as zeros. */
				if (notmapped) {
					error = md_malloc_move_ma(&m, &ma_offs,
					    sc->sectorsize, NULL, 0,
					    MD_MALLOC_MOVE_ZERO);
				} else if (vlist != NULL) {
					error = md_malloc_move_vlist(&vlist,
					    &ma_offs, sc->sectorsize, NULL, 0,
					    MD_MALLOC_MOVE_ZERO);
				} else
					bzero(dst, sc->sectorsize);
			} else if (osp <= 255) {
				/* Compressed sector: one repeated byte. */
				if (notmapped) {
					error = md_malloc_move_ma(&m, &ma_offs,
					    sc->sectorsize, NULL, osp,
					    MD_MALLOC_MOVE_FILL);
				} else if (vlist != NULL) {
					error = md_malloc_move_vlist(&vlist,
					    &ma_offs, sc->sectorsize, NULL, osp,
					    MD_MALLOC_MOVE_FILL);
				} else
					memset(dst, osp, sc->sectorsize);
			} else {
				/* Full sector stored in a UMA buffer. */
				if (notmapped) {
					error = md_malloc_move_ma(&m, &ma_offs,
					    sc->sectorsize, (void *)osp, 0,
					    MD_MALLOC_MOVE_READ);
				} else if (vlist != NULL) {
					error = md_malloc_move_vlist(&vlist,
					    &ma_offs, sc->sectorsize,
					    (void *)osp, 0,
					    MD_MALLOC_MOVE_READ);
				} else {
					bcopy((void *)osp, dst, sc->sectorsize);
					cpu_flush_dcache(dst, sc->sectorsize);
				}
			}
			osp = 0;	/* reads must not free the buffer */
		} else if (bp->bio_cmd == BIO_WRITE) {
			if (sc->flags & MD_COMPRESS) {
				/* Detect an all-same-byte sector (-> uc). */
				if (notmapped) {
					error1 = md_malloc_move_ma(&m, &ma_offs,
					    sc->sectorsize, &uc, 0,
					    MD_MALLOC_MOVE_CMP);
					i = error1 == 0 ? sc->sectorsize : 0;
				} else if (vlist != NULL) {
					error1 = md_malloc_move_vlist(&vlist,
					    &ma_offs, sc->sectorsize, &uc, 0,
					    MD_MALLOC_MOVE_CMP);
					i = error1 == 0 ? sc->sectorsize : 0;
				} else {
					uc = dst[0];
					for (i = 1; i < sc->sectorsize; i++) {
						if (dst[i] != uc)
							break;
					}
				}
			} else {
				i = 0;
				uc = 0;
			}
			if (i == sc->sectorsize) {
				/* Store the fill byte itself as the value. */
				if (osp != uc)
					error = s_write(sc->indir, secno, uc);
			} else {
				if (osp <= 255) {
					/* No buffer yet: allocate and fill. */
					sp = (uintptr_t)uma_zalloc(sc->uma,
					    md_malloc_wait ? M_WAITOK :
					    M_NOWAIT);
					if (sp == 0) {
						error = ENOSPC;
						break;
					}
					if (notmapped) {
						error = md_malloc_move_ma(&m,
						    &ma_offs, sc->sectorsize,
						    (void *)sp, 0,
						    MD_MALLOC_MOVE_WRITE);
					} else if (vlist != NULL) {
						error = md_malloc_move_vlist(
						    &vlist, &ma_offs,
						    sc->sectorsize, (void *)sp,
						    0, MD_MALLOC_MOVE_WRITE);
					} else {
						bcopy(dst, (void *)sp,
						    sc->sectorsize);
					}
					error = s_write(sc->indir, secno, sp);
				} else {
					/* Overwrite the existing buffer. */
					if (notmapped) {
						error = md_malloc_move_ma(&m,
						    &ma_offs, sc->sectorsize,
						    (void *)osp, 0,
						    MD_MALLOC_MOVE_WRITE);
					} else if (vlist != NULL) {
						error = md_malloc_move_vlist(
						    &vlist, &ma_offs,
						    sc->sectorsize, (void *)osp,
						    0, MD_MALLOC_MOVE_WRITE);
					} else {
						bcopy(dst, (void *)osp,
						    sc->sectorsize);
					}
					osp = 0;	/* buffer kept */
				}
			}
		} else {
			error = EOPNOTSUPP;
		}
		/* Free the old buffer when the sector was replaced/deleted. */
		if (osp > 255)
			uma_zfree(sc->uma, (void*)osp);
		if (error != 0)
			break;
		secno++;
		if (!notmapped && vlist == NULL)
			dst += sc->sectorsize;
	}
	bp->bio_resid = 0;
	return (error);
}
/*
 * Copy "len" bytes from kernel memory "src" into a BIO_VLIST segment
 * array, starting "offset" bytes into it.
 */
static void
mdcopyto_vlist(void *src, bus_dma_segment_t *vlist, off_t offset, off_t len)
{
	uint8_t *p;
	off_t chunk;

	/* Skip whole segments that lie before "offset". */
	for (; offset >= vlist->ds_len; vlist++)
		offset -= vlist->ds_len;

	p = src;
	for (; len != 0; len -= chunk, vlist++) {
		chunk = omin(len, vlist->ds_len - offset);
		bcopy(p, (void *)(uintptr_t)(vlist->ds_addr + offset), chunk);
		p += chunk;
		offset = 0;	/* only the first segment is entered mid-way */
	}
}
/*
 * Copy "len" bytes out of a BIO_VLIST segment array, starting "offset"
 * bytes into it, into kernel memory "dst".
 */
static void
mdcopyfrom_vlist(bus_dma_segment_t *vlist, off_t offset, void *dst, off_t len)
{
	uint8_t *p;
	off_t chunk;

	/* Skip whole segments that lie before "offset". */
	for (; offset >= vlist->ds_len; vlist++)
		offset -= vlist->ds_len;

	p = dst;
	for (; len != 0; len -= chunk, vlist++) {
		chunk = omin(len, vlist->ds_len - offset);
		bcopy((void *)(uintptr_t)(vlist->ds_addr + offset), p, chunk);
		p += chunk;
		offset = 0;	/* only the first segment is entered mid-way */
	}
}
/*
 * I/O strategy for preloaded (in-kernel image) memory disks.  The
 * backing store is the flat memory range at sc->pl_ptr, so reads and
 * writes reduce to plain memory copies.  Always succeeds.
 */
static int
mdstart_preload(struct md_s *sc, struct bio *bp)
{
	uint8_t *base;

	base = sc->pl_ptr + bp->bio_offset;
	if (bp->bio_cmd == BIO_READ) {
		if ((bp->bio_flags & BIO_VLIST) != 0)
			mdcopyto_vlist(base, (bus_dma_segment_t *)bp->bio_data,
			    bp->bio_ma_offset, bp->bio_length);
		else
			bcopy(base, bp->bio_data, bp->bio_length);
		cpu_flush_dcache(bp->bio_data, bp->bio_length);
	} else if (bp->bio_cmd == BIO_WRITE) {
		if ((bp->bio_flags & BIO_VLIST) != 0)
			mdcopyfrom_vlist((bus_dma_segment_t *)bp->bio_data,
			    bp->bio_ma_offset, base, bp->bio_length);
		else
			bcopy(bp->bio_data, base, bp->bio_length);
	}
	/* Other commands are silently treated as complete no-ops. */
	bp->bio_resid = 0;
	return (0);
}
/*
 * I/O strategy for vnode-backed memory disks: translate the bio into
 * VOP_READ()/VOP_WRITE()/VOP_FSYNC() calls on the backing vnode.
 * BIO_DELETE is emulated by writing zeros from the shared zero_region.
 * Unmapped bios are handled by temporarily mapping pages in MAXPHYS
 * chunks via a pbuf.  Returns 0 or an errno.
 */
static int
mdstart_vnode(struct md_s *sc, struct bio *bp)
{
	int error;
	struct uio auio;
	struct iovec aiov;
	struct iovec *piov;
	struct mount *mp;
	struct vnode *vp;
	struct buf *pb;
	bus_dma_segment_t *vlist;
	struct thread *td;
	off_t iolen, iostart, len, zerosize;
	int ma_offs, npages;

	switch (bp->bio_cmd) {
	case BIO_READ:
		auio.uio_rw = UIO_READ;
		break;
	case BIO_WRITE:
	case BIO_DELETE:
		/* DELETE is turned into a zero-filling write below. */
		auio.uio_rw = UIO_WRITE;
		break;
	case BIO_FLUSH:
		break;
	default:
		return (EOPNOTSUPP);
	}

	td = curthread;
	vp = sc->vnode;
	pb = NULL;
	piov = NULL;
	ma_offs = bp->bio_ma_offset;
	len = bp->bio_length;

	/*
	 * VNODE I/O
	 *
	 * If an error occurs, we set BIO_ERROR but we do not set
	 * B_INVAL because (for a write anyway), the buffer is
	 * still valid.
	 */

	/* FLUSH needs no uio setup; sync the vnode and return. */
	if (bp->bio_cmd == BIO_FLUSH) {
		(void) vn_start_write(vp, &mp, V_WAIT);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		error = VOP_FSYNC(vp, MNT_WAIT, td);
		VOP_UNLOCK(vp, 0);
		vn_finished_write(mp);
		return (error);
	}

	auio.uio_offset = (vm_ooffset_t)bp->bio_offset;
	auio.uio_resid = bp->bio_length;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_td = td;

	if (bp->bio_cmd == BIO_DELETE) {
		/*
		 * Emulate BIO_DELETE by writing zeros.  Build an iovec
		 * array whose entries all point at the shared zero
		 * region, each at most 'zerosize' (rounded down to a
		 * sector multiple) bytes long.
		 */
		zerosize = ZERO_REGION_SIZE -
		    (ZERO_REGION_SIZE % sc->sectorsize);
		auio.uio_iovcnt = howmany(bp->bio_length, zerosize);
		piov = malloc(sizeof(*piov) * auio.uio_iovcnt, M_MD, M_WAITOK);
		auio.uio_iov = piov;
		while (len > 0) {
			piov->iov_base = __DECONST(void *, zero_region);
			piov->iov_len = len;
			if (len > zerosize)
				piov->iov_len = zerosize;
			len -= piov->iov_len;
			piov++;
		}
		piov = auio.uio_iov;
	} else if ((bp->bio_flags & BIO_VLIST) != 0) {
		/* Translate the bus_dma segment list into an iovec array. */
		piov = malloc(sizeof(*piov) * bp->bio_ma_n, M_MD, M_WAITOK);
		auio.uio_iov = piov;
		vlist = (bus_dma_segment_t *)bp->bio_data;
		while (len > 0) {
			piov->iov_base = (void *)(uintptr_t)(vlist->ds_addr +
			    ma_offs);
			piov->iov_len = vlist->ds_len - ma_offs;
			if (piov->iov_len > len)
				piov->iov_len = len;
			len -= piov->iov_len;
			ma_offs = 0;
			vlist++;
			piov++;
		}
		auio.uio_iovcnt = piov - auio.uio_iov;
		piov = auio.uio_iov;
	} else if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
		/*
		 * Unmapped bio: map up to MAXPHYS worth of the bio's
		 * pages into a pbuf and loop back to unmapped_step for
		 * each chunk until 'len' is exhausted.
		 */
		pb = uma_zalloc(md_pbuf_zone, M_WAITOK);
		bp->bio_resid = len;
unmapped_step:
		npages = atop(min(MAXPHYS, round_page(len + (ma_offs &
		    PAGE_MASK))));
		iolen = min(ptoa(npages) - (ma_offs & PAGE_MASK), len);
		KASSERT(iolen > 0, ("zero iolen"));
		pmap_qenter((vm_offset_t)pb->b_data,
		    &bp->bio_ma[atop(ma_offs)], npages);
		aiov.iov_base = (void *)((vm_offset_t)pb->b_data +
		    (ma_offs & PAGE_MASK));
		aiov.iov_len = iolen;
		auio.uio_iov = &aiov;
		auio.uio_iovcnt = 1;
		auio.uio_resid = iolen;
	} else {
		/* Plain mapped bio: a single iovec covers it. */
		aiov.iov_base = bp->bio_data;
		aiov.iov_len = bp->bio_length;
		auio.uio_iov = &aiov;
		auio.uio_iovcnt = 1;
	}
	iostart = auio.uio_offset;
	if (auio.uio_rw == UIO_READ) {
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		error = VOP_READ(vp, &auio, 0, sc->cred);
		VOP_UNLOCK(vp, 0);
	} else {
		(void) vn_start_write(vp, &mp, V_WAIT);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		error = VOP_WRITE(vp, &auio, sc->flags & MD_ASYNC ? 0 : IO_SYNC,
		    sc->cred);
		VOP_UNLOCK(vp, 0);
		vn_finished_write(mp);
		if (error == 0)
			sc->flags &= ~MD_VERIFY;
	}

	/* When MD_CACHE is set, try to avoid double-caching the data. */
	if (error == 0 && (sc->flags & MD_CACHE) == 0)
		VOP_ADVISE(vp, iostart, auio.uio_offset - 1,
		    POSIX_FADV_DONTNEED);

	if (pb != NULL) {
		pmap_qremove((vm_offset_t)pb->b_data, npages);
		if (error == 0) {
			len -= iolen;
			bp->bio_resid -= iolen;
			ma_offs += iolen;
			if (len > 0)
				goto unmapped_step;
		}
		uma_zfree(md_pbuf_zone, pb);
	}

	free(piov, M_MD);
	/* In the unmapped case bio_resid was maintained incrementally. */
	if (pb == NULL)
		bp->bio_resid = auio.uio_resid;
	return (error);
}
/*
 * Release a page that mdstart_swap() grabbed exclusive-busy: drop the
 * busy state and free it back to the VM system.
 */
static void
md_swap_page_free(vm_page_t m)
{
	vm_page_xunbusy(m);
	vm_page_free(m);
}
/*
 * I/O strategy for swap-backed memory disks.  The backing store is a
 * swap VM object; the bio is processed one page at a time under the
 * object's write lock.  BIO_DELETE zeroes partial pages and frees
 * whole pages back to the pager.  Returns 0 or ENOSPC on pager error.
 *
 * NOTE(review): this span contains unified-diff '-'/'+' marker lines
 * (the m->valid accessor conversion); they are preserved verbatim.
 */
static int
mdstart_swap(struct md_s *sc, struct bio *bp)
{
	vm_page_t m;
	u_char *p;
	vm_pindex_t i, lastp;
	bus_dma_segment_t *vlist;
	int rv, ma_offs, offs, len, lastend;

	switch (bp->bio_cmd) {
	case BIO_READ:
	case BIO_WRITE:
	case BIO_DELETE:
		break;
	default:
		return (EOPNOTSUPP);
	}

	p = bp->bio_data;
	/* ma_offs/vlist are only meaningful for unmapped/vlist bios. */
	ma_offs = (bp->bio_flags & (BIO_UNMAPPED|BIO_VLIST)) != 0 ?
	    bp->bio_ma_offset : 0;
	vlist = (bp->bio_flags & BIO_VLIST) != 0 ?
	    (bus_dma_segment_t *)bp->bio_data : NULL;

	/*
	 * offs is the offset at which to start operating on the
	 * next (ie, first) page.  lastp is the last page on
	 * which we're going to operate.  lastend is the ending
	 * position within that last page (ie, PAGE_SIZE if
	 * we're operating on complete aligned pages).
	 */
	offs = bp->bio_offset % PAGE_SIZE;
	lastp = (bp->bio_offset + bp->bio_length - 1) / PAGE_SIZE;
	lastend = (bp->bio_offset + bp->bio_length - 1) % PAGE_SIZE + 1;

	rv = VM_PAGER_OK;
	VM_OBJECT_WLOCK(sc->object);
	vm_object_pip_add(sc->object, 1);
	for (i = bp->bio_offset / PAGE_SIZE; i <= lastp; i++) {
		/* Bytes to handle on this page. */
		len = ((i == lastp) ? lastend : PAGE_SIZE) - offs;
		m = vm_page_grab(sc->object, i, VM_ALLOC_SYSTEM);
		if (bp->bio_cmd == BIO_READ) {
-			if (m->valid == VM_PAGE_BITS_ALL)
+			if (vm_page_all_valid(m))
				rv = VM_PAGER_OK;
			else
				rv = vm_pager_get_pages(sc->object, &m, 1,
				    NULL, NULL);
			if (rv == VM_PAGER_ERROR) {
				md_swap_page_free(m);
				break;
			} else if (rv == VM_PAGER_FAIL) {
				/*
				 * Pager does not have the page.  Zero
				 * the allocated page, and mark it as
				 * valid. Do not set dirty, the page
				 * can be recreated if thrown out.
				 */
				pmap_zero_page(m);
-				m->valid = VM_PAGE_BITS_ALL;
+				vm_page_valid(m);
			}
			/* Copy page contents out to the bio's buffer. */
			if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
				pmap_copy_pages(&m, offs, bp->bio_ma,
				    ma_offs, len);
			} else if ((bp->bio_flags & BIO_VLIST) != 0) {
				physcopyout_vlist(VM_PAGE_TO_PHYS(m) + offs,
				    vlist, ma_offs, len);
				cpu_flush_dcache(p, len);
			} else {
				physcopyout(VM_PAGE_TO_PHYS(m) + offs, p, len);
				cpu_flush_dcache(p, len);
			}
		} else if (bp->bio_cmd == BIO_WRITE) {
			/* A full-page write need not read the old data. */
-			if (len == PAGE_SIZE || m->valid == VM_PAGE_BITS_ALL)
+			if (len == PAGE_SIZE || vm_page_all_valid(m))
				rv = VM_PAGER_OK;
			else
				rv = vm_pager_get_pages(sc->object, &m, 1,
				    NULL, NULL);
			if (rv == VM_PAGER_ERROR) {
				md_swap_page_free(m);
				break;
			} else if (rv == VM_PAGER_FAIL)
				pmap_zero_page(m);

			/* Copy new data from the bio into the page. */
			if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
				pmap_copy_pages(bp->bio_ma, ma_offs, &m,
				    offs, len);
			} else if ((bp->bio_flags & BIO_VLIST) != 0) {
				physcopyin_vlist(vlist, ma_offs,
				    VM_PAGE_TO_PHYS(m) + offs, len);
			} else {
				physcopyin(p, VM_PAGE_TO_PHYS(m) + offs, len);
			}

-			m->valid = VM_PAGE_BITS_ALL;
+			vm_page_valid(m);
			if (m->dirty != VM_PAGE_BITS_ALL) {
				vm_page_dirty(m);
				vm_pager_page_unswapped(m);
			}
		} else if (bp->bio_cmd == BIO_DELETE) {
-			if (len == PAGE_SIZE || m->valid == VM_PAGE_BITS_ALL)
+			if (len == PAGE_SIZE || vm_page_all_valid(m))
				rv = VM_PAGER_OK;
			else
				rv = vm_pager_get_pages(sc->object, &m, 1,
				    NULL, NULL);
			if (rv == VM_PAGER_ERROR) {
				md_swap_page_free(m);
				break;
			} else if (rv == VM_PAGER_FAIL) {
				/* Pager never had it; nothing to delete. */
				md_swap_page_free(m);
				m = NULL;
			} else {
				/* Page is valid. */
				if (len != PAGE_SIZE) {
					/* Partial page: just zero the range. */
					pmap_zero_page_area(m, offs, len);
					if (m->dirty != VM_PAGE_BITS_ALL) {
						vm_page_dirty(m);
						vm_pager_page_unswapped(m);
					}
				} else {
					/* Whole page: drop it entirely. */
					vm_pager_page_unswapped(m);
					md_swap_page_free(m);
					m = NULL;
				}
			}
		}
		if (m != NULL) {
			vm_page_xunbusy(m);
			/* Keep recently-touched pages on the active queue. */
			vm_page_lock(m);
			if (vm_page_active(m))
				vm_page_reference(m);
			else
				vm_page_activate(m);
			vm_page_unlock(m);
		}

		/* Actions on further pages start at offset 0 */
		p += PAGE_SIZE - offs;
		offs = 0;
		ma_offs += len;
	}
	vm_object_pip_wakeup(sc->object);
	VM_OBJECT_WUNLOCK(sc->object);
	return (rv != VM_PAGER_ERROR ? 0 : ENOSPC);
}
/*
 * I/O strategy for null (bit-bucket) memory disks: reads return zeros,
 * writes are discarded.  Always succeeds.
 */
static int
mdstart_null(struct md_s *sc, struct bio *bp)
{
	if (bp->bio_cmd == BIO_READ) {
		bzero(bp->bio_data, bp->bio_length);
		cpu_flush_dcache(bp->bio_data, bp->bio_length);
	}
	/* BIO_WRITE (and anything else) is a no-op. */
	bp->bio_resid = 0;
	return (0);
}
/*
 * Per-device worker thread.  Pulls bios off sc->bio_queue, dispatches
 * them to the type-specific sc->start() strategy (handling BIO_GETATTR
 * inline), and completes them via g_io_deliver().  Exits through
 * kproc_exit() when MD_SHUTDOWN is set by mddestroy(); never returns.
 */
static void
md_kthread(void *arg)
{
	struct md_s *sc;
	struct bio *bp;
	int error;

	sc = arg;
	thread_lock(curthread);
	sched_prio(curthread, PRIBIO);
	thread_unlock(curthread);
	if (sc->type == MD_VNODE)
		curthread->td_pflags |= TDP_NORUNNINGBUF;

	for (;;) {
		mtx_lock(&sc->queue_mtx);
		if (sc->flags & MD_SHUTDOWN) {
			/* Acknowledge shutdown so mddestroy() can proceed. */
			sc->flags |= MD_EXITING;
			mtx_unlock(&sc->queue_mtx);
			kproc_exit(0);
		}
		bp = bioq_takefirst(&sc->bio_queue);
		if (!bp) {
			/* PDROP: msleep releases queue_mtx for us. */
			msleep(sc, &sc->queue_mtx, PRIBIO | PDROP, "mdwait", 0);
			continue;
		}
		mtx_unlock(&sc->queue_mtx);
		if (bp->bio_cmd == BIO_GETATTR) {
			int isv = ((sc->flags & MD_VERIFY) != 0);

			/*
			 * g_handleattr_* return non-zero (and complete the
			 * bio) when they match; error == -1 marks the bio
			 * as already delivered.
			 */
			if ((sc->fwsectors && sc->fwheads &&
			    (g_handleattr_int(bp, "GEOM::fwsectors",
			    sc->fwsectors) ||
			    g_handleattr_int(bp, "GEOM::fwheads",
			    sc->fwheads))) ||
			    g_handleattr_int(bp, "GEOM::candelete", 1))
				error = -1;
			else if (sc->ident[0] != '\0' &&
			    g_handleattr_str(bp, "GEOM::ident", sc->ident))
				error = -1;
			else if (g_handleattr_int(bp, "MNT::verified", isv))
				error = -1;
			else
				error = EOPNOTSUPP;
		} else {
			error = sc->start(sc, bp);
		}

		if (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE) {
			/*
			 * Devstat uses (bio_bcount, bio_resid) for
			 * determining the length of the completed part of
			 * the i/o.  g_io_deliver() will translate from
			 * bio_completed to that, but it also destroys the
			 * bio so we must do our own translation.
			 */
			bp->bio_bcount = bp->bio_length;
			bp->bio_resid = (error == -1 ? bp->bio_bcount : 0);
			devstat_end_transaction_bio(sc->devstat, bp);
		}
		if (error != -1) {
			bp->bio_completed = bp->bio_length;
			g_io_deliver(bp, error);
		}
	}
}
/*
 * Look up the softc for the given unit number in the global device
 * list.  Returns NULL when no such unit exists.
 */
static struct md_s *
mdfind(int unit)
{
	struct md_s *sc;

	LIST_FOREACH(sc, &md_softc_list, list) {
		if (sc->unit == unit)
			return (sc);
	}
	return (NULL);
}
/*
 * Allocate and minimally initialize a new md softc: reserve a unit
 * number (any free one when unit == -1), set up the bio queue and
 * locks, link it into md_softc_list, and start its worker kthread.
 * On failure everything is rolled back, *errp is set, and NULL is
 * returned; on success *errp is 0.
 */
static struct md_s *
mdnew(int unit, int *errp, enum md_types type)
{
	struct md_s *sc;
	int error;

	*errp = 0;
	if (unit == -1)
		unit = alloc_unr(md_uh);
	else
		unit = alloc_unr_specific(md_uh, unit);
	if (unit == -1) {
		/* Unit number space exhausted or requested unit taken. */
		*errp = EBUSY;
		return (NULL);
	}
	sc = (struct md_s *)malloc(sizeof *sc, M_MD, M_WAITOK | M_ZERO);
	sc->type = type;
	bioq_init(&sc->bio_queue);
	mtx_init(&sc->queue_mtx, "md bio queue", NULL, MTX_DEF);
	mtx_init(&sc->stat_mtx, "md stat", NULL, MTX_DEF);
	sc->unit = unit;
	sprintf(sc->name, "md%d", unit);
	LIST_INSERT_HEAD(&md_softc_list, sc, list);
	error = kproc_create(md_kthread, sc, &sc->procp, 0, 0,"%s", sc->name);
	if (error == 0)
		return (sc);
	/* kthread creation failed: undo everything done above. */
	LIST_REMOVE(sc, list);
	mtx_destroy(&sc->stat_mtx);
	mtx_destroy(&sc->queue_mtx);
	free_unr(md_uh, sc->unit);
	free(sc, M_MD);
	*errp = error;
	return (NULL);
}
/*
 * Publish a configured softc to GEOM: create the geom and provider
 * under the topology lock, advertise unmapped-I/O support for the
 * backing types that can handle it, and register a devstat entry.
 */
static void
mdinit(struct md_s *sc)
{
	struct g_geom *gp;
	struct g_provider *pp;

	g_topology_lock();
	gp = g_new_geomf(&g_md_class, "md%d", sc->unit);
	gp->softc = sc;
	pp = g_new_providerf(gp, "md%d", sc->unit);
	pp->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE;
	pp->mediasize = sc->mediasize;
	pp->sectorsize = sc->sectorsize;
	switch (sc->type) {
	case MD_MALLOC:
	case MD_VNODE:
	case MD_SWAP:
		/* These strategies can consume unmapped bios directly. */
		pp->flags |= G_PF_ACCEPT_UNMAPPED;
		break;
	case MD_PRELOAD:
	case MD_NULL:
		break;
	}
	sc->gp = gp;
	sc->pp = pp;
	g_error_provider(pp, 0);
	g_topology_unlock();
	sc->devstat = devstat_new_entry("md", sc->unit, sc->sectorsize,
	    DEVSTAT_ALL_SUPPORTED, DEVSTAT_TYPE_DIRECT, DEVSTAT_PRIORITY_MAX);
}
/*
 * Configure a malloc-backed memory disk: validate options, build the
 * indirection tree and per-sector UMA zone, and optionally preallocate
 * every sector when MD_RESERVE is requested.  Returns 0 or an errno.
 */
static int
mdcreate_malloc(struct md_s *sc, struct md_req *mdr)
{
	uintptr_t sp;
	int error;
	off_t u;

	error = 0;
	if (mdr->md_options & ~(MD_AUTOUNIT | MD_COMPRESS | MD_RESERVE))
		return (EINVAL);
	if (mdr->md_sectorsize != 0 && !powerof2(mdr->md_sectorsize))
		return (EINVAL);
	/* Compression doesn't make sense if we have reserved space */
	if (mdr->md_options & MD_RESERVE)
		mdr->md_options &= ~MD_COMPRESS;
	if (mdr->md_fwsectors != 0)
		sc->fwsectors = mdr->md_fwsectors;
	if (mdr->md_fwheads != 0)
		sc->fwheads = mdr->md_fwheads;
	sc->flags = mdr->md_options & (MD_COMPRESS | MD_FORCE);
	sc->indir = dimension(sc->mediasize / sc->sectorsize);
	/* 0x1ff: zone alignment mask — presumably 512-byte alignment;
	 * TODO(review) confirm against uma_zcreate() semantics. */
	sc->uma = uma_zcreate(sc->name, sc->sectorsize, NULL, NULL, NULL, NULL,
	    0x1ff, 0);
	if (mdr->md_options & MD_RESERVE) {
		off_t nsectors;

		/* Touch every sector now so ENOMEM surfaces at create
		 * time rather than during I/O. */
		nsectors = sc->mediasize / sc->sectorsize;
		for (u = 0; u < nsectors; u++) {
			sp = (uintptr_t)uma_zalloc(sc->uma, (md_malloc_wait ?
			    M_WAITOK : M_NOWAIT) | M_ZERO);
			if (sp != 0)
				error = s_write(sc->indir, u, sp);
			else
				error = ENOMEM;
			if (error != 0)
				break;
		}
	}
	return (error);
}
/*
 * Record the credentials all subsequent backing-store I/O will run
 * with, replacing any previously held reference.  For vnode-backed
 * devices, immediately issue a one-sector read with those credentials
 * so credential problems (e.g. with NFS backing files) show up now.
 */
static int
mdsetcred(struct md_s *sc, struct ucred *cred)
{
	char *tmpbuf;
	int error = 0;

	/*
	 * Set credits in our softc
	 */

	if (sc->cred)
		crfree(sc->cred);
	sc->cred = crhold(cred);

	/*
	 * Horrible kludge to establish credentials for NFS  XXX.
	 */

	if (sc->vnode) {
		struct uio auio;
		struct iovec aiov;

		tmpbuf = malloc(sc->sectorsize, M_TEMP, M_WAITOK);
		bzero(&auio, sizeof(auio));

		aiov.iov_base = tmpbuf;
		aiov.iov_len = sc->sectorsize;
		auio.uio_iov = &aiov;
		auio.uio_iovcnt = 1;
		auio.uio_offset = 0;
		auio.uio_rw = UIO_READ;
		auio.uio_segflg = UIO_SYSSPACE;
		auio.uio_resid = aiov.iov_len;
		vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY);
		error = VOP_READ(sc->vnode, &auio, 0, sc->cred);
		VOP_UNLOCK(sc->vnode, 0);
		free(tmpbuf, M_TEMP);
	}
	return (error);
}
/*
 * Configure a vnode-backed memory disk: copy in the file name, open
 * the backing file (read-only unless MD_READONLY is clear), require a
 * regular file, mark the vnode with VV_MD, and establish credentials.
 * On failure the vnode is closed and an errno returned.
 */
static int
mdcreate_vnode(struct md_s *sc, struct md_req *mdr, struct thread *td)
{
	struct vattr vattr;
	struct nameidata nd;
	char *fname;
	int error, flags;

	fname = mdr->md_file;
	if (mdr->md_file_seg == UIO_USERSPACE) {
		error = copyinstr(fname, sc->file, sizeof(sc->file), NULL);
		if (error != 0)
			return (error);
	} else if (mdr->md_file_seg == UIO_SYSSPACE)
		strlcpy(sc->file, fname, sizeof(sc->file));
	else
		return (EDOOFUS);

	/*
	 * If the user specified that this is a read only device, don't
	 * set the FWRITE mask before trying to open the backing store.
	 */
	flags = FREAD | ((mdr->md_options & MD_READONLY) ? 0 : FWRITE) \
	    | ((mdr->md_options & MD_VERIFY) ? O_VERIFY : 0);
	NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, sc->file, td);
	error = vn_open(&nd, &flags, 0, NULL);
	if (error != 0)
		return (error);
	NDFREE(&nd, NDF_ONLY_PNBUF);
	if (nd.ni_vp->v_type != VREG) {
		/* Only regular files may back an md device. */
		error = EINVAL;
		goto bad;
	}
	error = VOP_GETATTR(nd.ni_vp, &vattr, td->td_ucred);
	if (error != 0)
		goto bad;
	if (VOP_ISLOCKED(nd.ni_vp) != LK_EXCLUSIVE) {
		vn_lock(nd.ni_vp, LK_UPGRADE | LK_RETRY);
		if (nd.ni_vp->v_iflag & VI_DOOMED) {
			/* Forced unmount. */
			error = EBADF;
			goto bad;
		}
	}
	/* Flag the vnode as in use by md so it can't be backed twice. */
	nd.ni_vp->v_vflag |= VV_MD;
	VOP_UNLOCK(nd.ni_vp, 0);

	if (mdr->md_fwsectors != 0)
		sc->fwsectors = mdr->md_fwsectors;
	if (mdr->md_fwheads != 0)
		sc->fwheads = mdr->md_fwheads;
	snprintf(sc->ident, sizeof(sc->ident), "MD-DEV%ju-INO%ju",
	    (uintmax_t)vattr.va_fsid, (uintmax_t)vattr.va_fileid);
	sc->flags = mdr->md_options & (MD_ASYNC | MD_CACHE | MD_FORCE |
	    MD_VERIFY);
	if (!(flags & FWRITE))
		sc->flags |= MD_READONLY;
	sc->vnode = nd.ni_vp;

	error = mdsetcred(sc, td->td_ucred);
	if (error != 0) {
		/* Undo the VV_MD marking before closing the vnode. */
		sc->vnode = NULL;
		vn_lock(nd.ni_vp, LK_EXCLUSIVE | LK_RETRY);
		nd.ni_vp->v_vflag &= ~VV_MD;
		goto bad;
	}
	return (0);
bad:
	VOP_UNLOCK(nd.ni_vp, 0);
	(void)vn_close(nd.ni_vp, flags, td->td_ucred, td);
	return (error);
}
/*
 * GEOM callback fired once the provider is fully withdrawn.  Records
 * the fact and wakes up mddestroy(), which waits on &sc->flags for
 * MD_PROVIDERGONE before tearing down the rest of the device.
 */
static void
g_md_providergone(struct g_provider *pp)
{
	struct md_s *sc = pp->geom->softc;

	mtx_lock(&sc->queue_mtx);
	sc->flags |= MD_PROVIDERGONE;
	wakeup(&sc->flags);
	mtx_unlock(&sc->queue_mtx);
}
/*
 * Tear down an md device: wither the GEOM provider and wait for
 * g_md_providergone(), retire the devstat entry, signal the worker
 * thread to exit and wait for it, then release the backing store
 * (vnode / VM object / indirection tree / UMA zone), the unit number
 * and the softc itself.  Always returns 0.
 */
static int
mddestroy(struct md_s *sc, struct thread *td)
{

	if (sc->gp) {
		g_topology_lock();
		g_wither_geom(sc->gp, ENXIO);
		g_topology_unlock();

		/* Wait for g_md_providergone() before continuing. */
		mtx_lock(&sc->queue_mtx);
		while (!(sc->flags & MD_PROVIDERGONE))
			msleep(&sc->flags, &sc->queue_mtx, PRIBIO, "mddestroy", 0);
		mtx_unlock(&sc->queue_mtx);
	}
	if (sc->devstat) {
		devstat_remove_entry(sc->devstat);
		sc->devstat = NULL;
	}
	/* Ask md_kthread() to exit and wait until it acknowledges. */
	mtx_lock(&sc->queue_mtx);
	sc->flags |= MD_SHUTDOWN;
	wakeup(sc);
	while (!(sc->flags & MD_EXITING))
		msleep(sc->procp, &sc->queue_mtx, PRIBIO, "mddestroy", hz / 10);
	mtx_unlock(&sc->queue_mtx);
	mtx_destroy(&sc->stat_mtx);
	mtx_destroy(&sc->queue_mtx);
	if (sc->vnode != NULL) {
		vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY);
		sc->vnode->v_vflag &= ~VV_MD;
		VOP_UNLOCK(sc->vnode, 0);
		(void)vn_close(sc->vnode, sc->flags & MD_READONLY ?
		    FREAD : (FREAD|FWRITE), sc->cred, td);
	}
	if (sc->cred != NULL)
		crfree(sc->cred);
	if (sc->object != NULL)
		vm_object_deallocate(sc->object);
	if (sc->indir)
		destroy_indir(sc, sc->indir);
	if (sc->uma)
		uma_zdestroy(sc->uma);
	LIST_REMOVE(sc, list);
	free_unr(md_uh, sc->unit);
	free(sc, M_MD);
	return (0);
}
/*
 * Resize an existing memory disk.  Only vnode, null and swap types
 * are resizable; swap devices additionally adjust the VM object size
 * and the swap accounting (releasing or reserving charged space, and
 * honoring MD_RESERVE by preallocating the grown range).  Finally the
 * GEOM provider is resized.  Returns 0 or an errno.
 */
static int
mdresize(struct md_s *sc, struct md_req *mdr)
{
	int error, res;
	vm_pindex_t oldpages, newpages;

	switch (sc->type) {
	case MD_VNODE:
	case MD_NULL:
		break;
	case MD_SWAP:
		if (mdr->md_mediasize <= 0 ||
		    (mdr->md_mediasize % PAGE_SIZE) != 0)
			return (EDOM);
		oldpages = OFF_TO_IDX(round_page(sc->mediasize));
		newpages = OFF_TO_IDX(round_page(mdr->md_mediasize));
		if (newpages < oldpages) {
			/* Shrink: drop pages/swap past the new end and
			 * return the charge for the released range. */
			VM_OBJECT_WLOCK(sc->object);
			vm_object_page_remove(sc->object, newpages, 0, 0);
			swap_pager_freespace(sc->object, newpages,
			    oldpages - newpages);
			swap_release_by_cred(IDX_TO_OFF(oldpages -
			    newpages), sc->cred);
			sc->object->charge = IDX_TO_OFF(newpages);
			sc->object->size = newpages;
			VM_OBJECT_WUNLOCK(sc->object);
		} else if (newpages > oldpages) {
			/* Grow: reserve the additional charge first. */
			res = swap_reserve_by_cred(IDX_TO_OFF(newpages -
			    oldpages), sc->cred);
			if (!res)
				return (ENOMEM);
			if ((mdr->md_options & MD_RESERVE) ||
			    (sc->flags & MD_RESERVE)) {
				error = swap_pager_reserve(sc->object,
				    oldpages, newpages - oldpages);
				if (error < 0) {
					/* Preallocation failed: give the
					 * reserved charge back. */
					swap_release_by_cred(
					    IDX_TO_OFF(newpages - oldpages),
					    sc->cred);
					return (EDOM);
				}
			}
			VM_OBJECT_WLOCK(sc->object);
			sc->object->charge = IDX_TO_OFF(newpages);
			sc->object->size = newpages;
			VM_OBJECT_WUNLOCK(sc->object);
		}
		break;
	default:
		return (EOPNOTSUPP);
	}

	sc->mediasize = mdr->md_mediasize;

	g_topology_lock();
	g_resize_provider(sc->pp, sc->mediasize);
	g_topology_unlock();
	return (0);
}
/*
 * Configure a swap-backed memory disk: validate the size, allocate an
 * OBJT_SWAP VM object charged to the caller's credentials, optionally
 * preallocate swap (MD_RESERVE), and set I/O credentials.  On failure
 * the object is released and an errno returned.
 */
static int
mdcreate_swap(struct md_s *sc, struct md_req *mdr, struct thread *td)
{
	vm_ooffset_t npage;
	int error;

	/*
	 * Range check.  Disallow negative sizes and sizes not being
	 * multiple of page size.
	 */
	if (sc->mediasize <= 0 || (sc->mediasize % PAGE_SIZE) != 0)
		return (EDOM);

	/*
	 * Allocate an OBJT_SWAP object.
	 *
	 * Note the truncation.
	 */

	if ((mdr->md_options & MD_VERIFY) != 0)
		return (EINVAL);
	npage = mdr->md_mediasize / PAGE_SIZE;
	if (mdr->md_fwsectors != 0)
		sc->fwsectors = mdr->md_fwsectors;
	if (mdr->md_fwheads != 0)
		sc->fwheads = mdr->md_fwheads;
	sc->object = vm_pager_allocate(OBJT_SWAP, NULL, PAGE_SIZE * npage,
	    VM_PROT_DEFAULT, 0, td->td_ucred);
	if (sc->object == NULL)
		return (ENOMEM);
	sc->flags = mdr->md_options & (MD_FORCE | MD_RESERVE);
	if (mdr->md_options & MD_RESERVE) {
		/* Preallocate swap so later writes cannot fail for space. */
		if (swap_pager_reserve(sc->object, 0, npage) < 0) {
			error = EDOM;
			goto finish;
		}
	}
	error = mdsetcred(sc, td->td_ucred);
finish:
	if (error != 0) {
		vm_object_deallocate(sc->object);
		sc->object = NULL;
	}
	return (error);
}
/*
 * Configure a null (bit-bucket) memory disk.  There is no backing
 * store to set up; only the requested size is validated.
 */
static int
mdcreate_null(struct md_s *sc, struct md_req *mdr, struct thread *td)
{
	/* Reject non-positive sizes. */
	if (sc->mediasize <= 0)
		return (EDOM);
	/* Reject sizes that are not a multiple of the page size. */
	if ((sc->mediasize % PAGE_SIZE) != 0)
		return (EDOM);
	return (0);
}
/*
 * Create and attach a new md device according to the request in 'mdr'
 * (MDIOCATTACH).  Validates type and sector size, allocates a softc
 * (auto-selecting the unit for MD_AUTOUNIT), copies in the label,
 * dispatches to the type-specific create routine, trims any partial
 * trailing sector, and publishes the device via mdinit().  Must be
 * called with md_sx held exclusively.  Returns 0 or an errno.
 */
static int
kern_mdattach_locked(struct thread *td, struct md_req *mdr)
{
	struct md_s *sc;
	unsigned sectsize;
	int error, i;

	sx_assert(&md_sx, SA_XLOCKED);

	switch (mdr->md_type) {
	case MD_MALLOC:
	case MD_PRELOAD:
	case MD_VNODE:
	case MD_SWAP:
	case MD_NULL:
		break;
	default:
		return (EINVAL);
	}
	if (mdr->md_sectorsize == 0)
		sectsize = DEV_BSIZE;
	else
		sectsize = mdr->md_sectorsize;
	if (sectsize > MAXPHYS || mdr->md_mediasize < sectsize)
		return (EINVAL);
	if (mdr->md_options & MD_AUTOUNIT)
		sc = mdnew(-1, &error, mdr->md_type);
	else {
		if (mdr->md_unit > INT_MAX)
			return (EINVAL);
		sc = mdnew(mdr->md_unit, &error, mdr->md_type);
	}
	if (sc == NULL)
		return (error);
	/* 'error' is 0 here (set by mdnew() on success). */
	if (mdr->md_label != NULL)
		error = copyinstr(mdr->md_label, sc->label,
		    sizeof(sc->label), NULL);
	if (error != 0)
		goto err_after_new;
	if (mdr->md_options & MD_AUTOUNIT)
		mdr->md_unit = sc->unit;
	sc->mediasize = mdr->md_mediasize;
	sc->sectorsize = sectsize;
	error = EDOOFUS;
	switch (sc->type) {
	case MD_MALLOC:
		sc->start = mdstart_malloc;
		error = mdcreate_malloc(sc, mdr);
		break;
	case MD_PRELOAD:
		/*
		 * We disallow attaching preloaded memory disks via
		 * ioctl.  Preloaded memory disks are automatically
		 * attached in g_md_init().
		 */
		error = EOPNOTSUPP;
		break;
	case MD_VNODE:
		sc->start = mdstart_vnode;
		error = mdcreate_vnode(sc, mdr, td);
		break;
	case MD_SWAP:
		sc->start = mdstart_swap;
		error = mdcreate_swap(sc, mdr, td);
		break;
	case MD_NULL:
		sc->start = mdstart_null;
		error = mdcreate_null(sc, mdr, td);
		break;
	}
err_after_new:
	if (error != 0) {
		mddestroy(sc, td);
		return (error);
	}

	/* Prune off any residual fractional sector */
	i = sc->mediasize % sc->sectorsize;
	sc->mediasize -= i;

	mdinit(sc);
	return (0);
}
/* Locked wrapper: take md_sx exclusively around kern_mdattach_locked(). */
static int
kern_mdattach(struct thread *td, struct md_req *mdr)
{
	int error;

	sx_xlock(&md_sx);
	error = kern_mdattach_locked(td, mdr);
	sx_xunlock(&md_sx);
	return (error);
}
/*
 * Detach (destroy) a device for MDIOCDETACH.  Rejects requests that
 * carry a media size or options other than MD_FORCE, refuses to
 * destroy an open device unless forced, and otherwise hands off to
 * mddestroy().  Must be called with md_sx held exclusively.
 */
static int
kern_mddetach_locked(struct thread *td, struct md_req *mdr)
{
	struct md_s *sc;

	sx_assert(&md_sx, SA_XLOCKED);

	if (mdr->md_mediasize != 0 ||
	    (mdr->md_options & ~MD_FORCE) != 0)
		return (EINVAL);

	sc = mdfind(mdr->md_unit);
	if (sc == NULL)
		return (ENOENT);
	if (sc->opencount != 0 && !(sc->flags & MD_FORCE) &&
	    !(mdr->md_options & MD_FORCE))
		return (EBUSY);
	return (mddestroy(sc, td));
}
/* Locked wrapper: take md_sx exclusively around kern_mddetach_locked(). */
static int
kern_mddetach(struct thread *td, struct md_req *mdr)
{
	int error;

	sx_xlock(&md_sx);
	error = kern_mddetach_locked(td, mdr);
	sx_xunlock(&md_sx);
	return (error);
}
/*
 * Resize a device for MDIOCRESIZE.  Only MD_FORCE/MD_RESERVE options
 * are allowed; shrinking below the current size requires force.  Must
 * be called with md_sx held exclusively.
 */
static int
kern_mdresize_locked(struct md_req *mdr)
{
	struct md_s *sc;

	sx_assert(&md_sx, SA_XLOCKED);

	if ((mdr->md_options & ~(MD_FORCE | MD_RESERVE)) != 0)
		return (EINVAL);

	sc = mdfind(mdr->md_unit);
	if (sc == NULL)
		return (ENOENT);
	if (mdr->md_mediasize < sc->sectorsize)
		return (EINVAL);
	/* Shrinking may lose data, so require MD_FORCE for it. */
	if (mdr->md_mediasize < sc->mediasize &&
	    !(sc->flags & MD_FORCE) &&
	    !(mdr->md_options & MD_FORCE))
		return (EBUSY);
	return (mdresize(sc, mdr));
}
/* Locked wrapper: take md_sx exclusively around kern_mdresize_locked(). */
static int
kern_mdresize(struct md_req *mdr)
{
	int error;

	sx_xlock(&md_sx);
	error = kern_mdresize_locked(mdr);
	sx_xunlock(&md_sx);
	return (error);
}
/*
 * Fill the request with the current parameters of a device for
 * MDIOCQUERY, copying the label and (for vnode/preload devices) the
 * backing file name out to the caller-supplied userspace buffers.
 * Must be called with md_sx held exclusively.
 */
static int
kern_mdquery_locked(struct md_req *mdr)
{
	struct md_s *sc;
	int error;

	sx_assert(&md_sx, SA_XLOCKED);

	sc = mdfind(mdr->md_unit);
	if (sc == NULL)
		return (ENOENT);
	mdr->md_type = sc->type;
	mdr->md_options = sc->flags;
	mdr->md_mediasize = sc->mediasize;
	mdr->md_sectorsize = sc->sectorsize;
	error = 0;
	if (mdr->md_label != NULL) {
		error = copyout(sc->label, mdr->md_label,
		    strlen(sc->label) + 1);
		if (error != 0)
			return (error);
	}
	if (sc->type == MD_VNODE ||
	    (sc->type == MD_PRELOAD && mdr->md_file != NULL))
		error = copyout(sc->file, mdr->md_file,
		    strlen(sc->file) + 1);
	return (error);
}
/* Locked wrapper: take md_sx exclusively around kern_mdquery_locked(). */
static int
kern_mdquery(struct md_req *mdr)
{
	int error;

	sx_xlock(&md_sx);
	error = kern_mdquery_locked(mdr);
	sx_xunlock(&md_sx);
	return (error);
}
/*
 * Copy members that are not userspace pointers from a struct md_ioctl
 * (or its 32-bit compat variant) into the kernel-internal md_req.  The
 * pointer members (md_file, md_label) are set up separately by
 * mdctlioctl(), since their handling differs between native and compat.
 */
#define	MD_IOCTL2REQ(mdio, mdr) do {					\
	(mdr)->md_unit = (mdio)->md_unit;				\
	(mdr)->md_type = (mdio)->md_type;				\
	(mdr)->md_mediasize = (mdio)->md_mediasize;			\
	(mdr)->md_sectorsize = (mdio)->md_sectorsize;			\
	(mdr)->md_options = (mdio)->md_options;				\
	(mdr)->md_fwheads = (mdio)->md_fwheads;				\
	(mdr)->md_fwsectors = (mdio)->md_fwsectors;			\
	(mdr)->md_units = &(mdio)->md_pad[0];				\
	(mdr)->md_units_nitems = nitems((mdio)->md_pad);		\
} while(0)

/*
 * Copy back the members a request handler might have updated (e.g. the
 * unit number chosen for MD_AUTOUNIT, or the geometry filled in by
 * MDIOCQUERY) from the md_req into the user-visible md_ioctl.
 */
#define	MD_REQ2IOCTL(mdr, mdio) do {					\
	(mdio)->md_unit = (mdr)->md_unit;				\
	(mdio)->md_type = (mdr)->md_type;				\
	(mdio)->md_mediasize = (mdr)->md_mediasize;			\
	(mdio)->md_sectorsize = (mdr)->md_sectorsize;			\
	(mdio)->md_options = (mdr)->md_options;				\
	(mdio)->md_fwheads = (mdr)->md_fwheads;				\
	(mdio)->md_fwsectors = (mdr)->md_fwsectors;			\
} while(0)
/*
 * ioctl handler for the /dev/mdctl control device.  Converts the
 * user-visible struct md_ioctl (native or 32-bit compat layout) into
 * the internal struct md_req, dispatches to the kern_md* handler for
 * the command, and copies updated scalar fields back for the commands
 * that return data (ATTACH/QUERY).
 */
static int
mdctlioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags,
    struct thread *td)
{
	struct md_req mdr;
	int error;

	if (md_debug)
		printf("mdctlioctl(%s %lx %p %x %p)\n",
			devtoname(dev), cmd, addr, flags, td);

	bzero(&mdr, sizeof(mdr));
	switch (cmd) {
	case MDIOCATTACH:
	case MDIOCDETACH:
	case MDIOCRESIZE:
	case MDIOCQUERY: {
		struct md_ioctl *mdio = (struct md_ioctl *)addr;
		if (mdio->md_version != MDIOVERSION)
			return (EINVAL);
		MD_IOCTL2REQ(mdio, &mdr);
		mdr.md_file = mdio->md_file;
		mdr.md_file_seg = UIO_USERSPACE;
		/* If the file is adjacent to the md_ioctl it's in kernel. */
		if ((void *)mdio->md_file == (void *)(mdio + 1))
			mdr.md_file_seg = UIO_SYSSPACE;
		mdr.md_label = mdio->md_label;
		break;
	}
#ifdef COMPAT_FREEBSD32
	case MDIOCATTACH_32:
	case MDIOCDETACH_32:
	case MDIOCRESIZE_32:
	case MDIOCQUERY_32: {
		struct md_ioctl32 *mdio = (struct md_ioctl32 *)addr;
		if (mdio->md_version != MDIOVERSION)
			return (EINVAL);
		MD_IOCTL2REQ(mdio, &mdr);
		/* 32-bit layout stores the pointers as integers. */
		mdr.md_file = (void *)(uintptr_t)mdio->md_file;
		mdr.md_file_seg = UIO_USERSPACE;
		mdr.md_label = (void *)(uintptr_t)mdio->md_label;
		break;
	}
#endif
	default:
		/* Fall through to handler switch. */
		break;
	}

	error = 0;
	switch (cmd) {
	case MDIOCATTACH:
#ifdef COMPAT_FREEBSD32
	case MDIOCATTACH_32:
#endif
		error = kern_mdattach(td, &mdr);
		break;
	case MDIOCDETACH:
#ifdef COMPAT_FREEBSD32
	case MDIOCDETACH_32:
#endif
		error = kern_mddetach(td, &mdr);
		break;
	case MDIOCRESIZE:
#ifdef COMPAT_FREEBSD32
	case MDIOCRESIZE_32:
#endif
		error = kern_mdresize(&mdr);
		break;
	case MDIOCQUERY:
#ifdef COMPAT_FREEBSD32
	case MDIOCQUERY_32:
#endif
		error = kern_mdquery(&mdr);
		break;
	default:
		error = ENOIOCTL;
	}

	/* Propagate results back even when 'error' is set (e.g. the
	 * auto-selected unit number on a failed attach is still copied). */
	switch (cmd) {
	case MDIOCATTACH:
	case MDIOCQUERY: {
		struct md_ioctl *mdio = (struct md_ioctl *)addr;
		MD_REQ2IOCTL(&mdr, mdio);
		break;
	}
#ifdef COMPAT_FREEBSD32
	case MDIOCATTACH_32:
	case MDIOCQUERY_32: {
		struct md_ioctl32 *mdio = (struct md_ioctl32 *)addr;
		MD_REQ2IOCTL(&mdr, mdio);
		break;
	}
#endif
	default:
		/* Other commands to not alter mdr. */
		break;
	}

	return (error);
}
/*
 * Attach a preloaded (loader- or kernel-embedded) image as an md
 * device.  Unit 0, when preloaded, becomes the default root device
 * under MD_ROOT (unless ROOTDEVNAME overrides it).  Failures from
 * mdnew() are silently ignored — there is no caller to report to.
 */
static void
md_preloaded(u_char *image, size_t length, const char *name)
{
	struct md_s *sc;
	int error;

	sc = mdnew(-1, &error, MD_PRELOAD);
	if (sc == NULL)
		return;
	sc->mediasize = length;
	sc->sectorsize = DEV_BSIZE;
	sc->pl_ptr = image;
	sc->pl_len = length;
	sc->start = mdstart_preload;
	if (name != NULL)
		strlcpy(sc->file, name, sizeof(sc->file));
#ifdef MD_ROOT
	if (sc->unit == 0) {
#ifndef ROOTDEVNAME
		rootdevnames[0] = MD_ROOT_FSTYPE ":/dev/md0";
#endif
#ifdef MD_ROOT_READONLY
		sc->flags |= MD_READONLY;
#endif
	}
#endif
	mdinit(sc);
	if (name != NULL) {
		printf("%s%d: Preloaded image <%s> %zd bytes at %p\n",
		    MD_NAME, sc->unit, name, length, image);
	} else {
		printf("%s%d: Embedded image %zd bytes at %p\n",
		    MD_NAME, sc->unit, length, image);
	}
}
/*
 * GEOM class init hook: compute log2(NINDIR) for the indirection code,
 * set up the config lock and unit-number allocator, attach any
 * compiled-in root image and all loader-preloaded "md_image"/
 * "mfs_root" modules, create the pbuf zone used for unmapped vnode
 * I/O, and create the /dev/mdctl control device.
 */
static void
g_md_init(struct g_class *mp __unused)
{
	caddr_t mod;
	u_char *ptr, *name, *type;
	unsigned len;
	int i;

	/* figure out log2(NINDIR) */
	for (i = NINDIR, nshift = -1; i; nshift++)
		i >>= 1;

	mod = NULL;
	sx_init(&md_sx, "MD config lock");
	g_topology_unlock();
	md_uh = new_unrhdr(0, INT_MAX, NULL);
#ifdef MD_ROOT
	if (mfs_root_size != 0) {
		sx_xlock(&md_sx);
#ifdef MD_ROOT_MEM
		md_preloaded(mfs_root, mfs_root_size, NULL);
#else
		md_preloaded(__DEVOLATILE(u_char *, &mfs_root), mfs_root_size,
		    NULL);
#endif
		sx_xunlock(&md_sx);
	}
#endif
	/* XXX: are preload_* static or do they need Giant ? */
	while ((mod = preload_search_next_name(mod)) != NULL) {
		name = (char *)preload_search_info(mod, MODINFO_NAME);
		if (name == NULL)
			continue;
		type = (char *)preload_search_info(mod, MODINFO_TYPE);
		if (type == NULL)
			continue;
		/* Only attach modules of the two md image types. */
		if (strcmp(type, "md_image") && strcmp(type, "mfs_root"))
			continue;
		ptr = preload_fetch_addr(mod);
		len = preload_fetch_size(mod);
		if (ptr != NULL && len != 0) {
			sx_xlock(&md_sx);
			md_preloaded(ptr, len, name);
			sx_xunlock(&md_sx);
		}
	}
	md_pbuf_zone = pbuf_zsecond_create("mdpbuf", nswbuf / 10);
	status_dev = make_dev(&mdctl_cdevsw, INT_MAX, UID_ROOT, GID_WHEEL,
	    0600, MDCTL_NAME);
	g_topology_lock();
}
/*
 * GEOM dumpconf hook: emit this device's configuration into 'sb',
 * either in the terse single-line form (indent == NULL) or the
 * multi-line form.
 *
 * NOTE(review): the multi-line format strings here look like they have
 * had their markup element tags stripped (upstream md.c emits XML
 * elements around each value) — confirm against the pristine source.
 * Independently of that, the final sbuf_printf() passed a "%s" format
 * with no matching argument, which is undefined behavior; it now
 * passes 'indent' like every other line in this branch.
 */
static void
g_md_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
    struct g_consumer *cp __unused, struct g_provider *pp)
{
	struct md_s *mp;
	const char *type;	/* points at string literals below */

	mp = gp->softc;
	if (mp == NULL)
		return;

	switch (mp->type) {
	case MD_MALLOC:
		type = "malloc";
		break;
	case MD_PRELOAD:
		type = "preload";
		break;
	case MD_VNODE:
		type = "vnode";
		break;
	case MD_SWAP:
		type = "swap";
		break;
	case MD_NULL:
		type = "null";
		break;
	default:
		type = "unknown";
		break;
	}
	if (pp != NULL) {
		if (indent == NULL) {
			/* Terse one-line form. */
			sbuf_printf(sb, " u %d", mp->unit);
			sbuf_printf(sb, " s %ju", (uintmax_t) mp->sectorsize);
			sbuf_printf(sb, " f %ju", (uintmax_t) mp->fwheads);
			sbuf_printf(sb, " fs %ju", (uintmax_t) mp->fwsectors);
			sbuf_printf(sb, " l %ju", (uintmax_t) mp->mediasize);
			sbuf_printf(sb, " t %s", type);
			if ((mp->type == MD_VNODE && mp->vnode != NULL) ||
			    (mp->type == MD_PRELOAD && mp->file[0] != '\0'))
				sbuf_printf(sb, " file %s", mp->file);
			sbuf_printf(sb, " label %s", mp->label);
		} else {
			/* Multi-line form, one value per line. */
			sbuf_printf(sb, "%s%d\n", indent,
			    mp->unit);
			sbuf_printf(sb, "%s%ju\n",
			    indent, (uintmax_t) mp->sectorsize);
			sbuf_printf(sb, "%s%ju\n",
			    indent, (uintmax_t) mp->fwheads);
			sbuf_printf(sb, "%s%ju\n",
			    indent, (uintmax_t) mp->fwsectors);
			if (mp->ident[0] != '\0') {
				sbuf_printf(sb, "%s", indent);
				g_conf_printf_escaped(sb, "%s", mp->ident);
				sbuf_printf(sb, "\n");
			}
			sbuf_printf(sb, "%s%ju\n",
			    indent, (uintmax_t) mp->mediasize);
			sbuf_printf(sb, "%s%s\n", indent,
			    (mp->flags & MD_COMPRESS) == 0 ? "off": "on");
			sbuf_printf(sb, "%s%s\n", indent,
			    (mp->flags & MD_READONLY) == 0 ? "read-write":
			    "read-only");
			sbuf_printf(sb, "%s%s\n", indent,
			    type);
			if ((mp->type == MD_VNODE && mp->vnode != NULL) ||
			    (mp->type == MD_PRELOAD && mp->file[0] != '\0')) {
				sbuf_printf(sb, "%s", indent);
				g_conf_printf_escaped(sb, "%s", mp->file);
				sbuf_printf(sb, "\n");
			}
			if (mp->type == MD_VNODE)
				sbuf_printf(sb, "%s%s\n", indent,
				    (mp->flags & MD_CACHE) == 0 ? "off": "on");
			/* Fixed: was sbuf_printf(sb, "%s\n"); — missing arg. */
			sbuf_printf(sb, "%s\n", indent);
		}
	}
}
/*
 * GEOM class fini hook: undo g_md_init() — destroy the config lock,
 * the /dev/mdctl device (if it was created), the pbuf zone and the
 * unit-number allocator.
 */
static void
g_md_fini(struct g_class *mp __unused)
{
	sx_destroy(&md_sx);
	if (status_dev != NULL)
		destroy_dev(status_dev);
	uma_zdestroy(md_pbuf_zone);
	delete_unrhdr(md_uh);
}
Index: head/sys/dev/netmap/netmap_freebsd.c
===================================================================
--- head/sys/dev/netmap/netmap_freebsd.c (revision 353538)
+++ head/sys/dev/netmap/netmap_freebsd.c (revision 353539)
@@ -1,1620 +1,1620 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/* $FreeBSD$ */
#include "opt_inet.h"
#include "opt_inet6.h"
#include
#include
#include
#include
#include
#include /* POLLIN, POLLOUT */
#include /* types used in module initialization */
#include /* DEV_MODULE_ORDERED */
#include
#include /* kern_ioctl() */
#include
#include /* vtophys */
#include /* vtophys */
#include
#include
#include
#include
#include
#include
#include /* sockaddrs */
#include
#include /* kthread_add() */
#include /* PROC_LOCK() */
#include /* RFNOWAIT */
#include /* sched_bind() */
#include /* mp_maxid */
#include /* taskqueue_enqueue(), taskqueue_create(), ... */
#include
#include
#include /* IFT_ETHER */
#include /* ether_ifdetach */
#include /* LLADDR */
#include /* bus_dmamap_* */
#include /* in6_cksum_pseudo() */
#include /* in_pseudo(), in_cksum_hdr() */
#include
#include
#include
#include
/* ======================== FREEBSD-SPECIFIC ROUTINES ================== */
/*
 * Taskqueue callback that delivers a deferred kqueue notification for
 * a netmap selinfo.  The hint 0x100 is deliberately non-zero so knote
 * handlers can tell this call apart from the hint=0 call performed
 * directly by kqueue_scan().
 */
static void
nm_kqueue_notify(void *opaque, int pending)
{
    struct nm_selinfo *sel = opaque;

    KNOTE_UNLOCKED(&sel->si.si_note, 0x100);
}
/*
 * Initialize a netmap selinfo: create a single-threaded private
 * taskqueue used to defer kqueue notifications, and set up the knote
 * list protected by a dedicated mutex.
 *
 * Returns 0 on success, -ENOMEM if the taskqueue cannot be allocated,
 * or the taskqueue_start_threads() error code.
 */
int
nm_os_selinfo_init(NM_SELINFO_T *si, const char *name)
{
    int error;

    TASK_INIT(&si->ntfytask, 0, nm_kqueue_notify, si);
    si->ntfytq = taskqueue_create(name, M_NOWAIT,
        taskqueue_thread_enqueue, &si->ntfytq);
    if (si->ntfytq == NULL)
        return (-ENOMEM);
    error = taskqueue_start_threads(&si->ntfytq, 1, PI_NET, "tq %s", name);
    if (error) {
        taskqueue_free(si->ntfytq);
        si->ntfytq = NULL;
        return (error);
    }

    snprintf(si->mtxname, sizeof(si->mtxname), "nmkl%s", name);
    mtx_init(&si->m, si->mtxname, NULL, MTX_DEF);
    knlist_init_mtx(&si->si.si_note, &si->m);
    si->kqueue_users = 0;

    return (0);
}
/*
 * Tear down a selinfo set up by nm_os_selinfo_init(): drain any pending
 * notify task, free the taskqueue, detach and destroy the knote list,
 * and destroy the protecting mutex.  A selinfo whose initialization
 * failed or never ran (ntfytq == NULL) is skipped safely.
 */
void
nm_os_selinfo_uninit(NM_SELINFO_T *si)
{
if (si->ntfytq == NULL) {
return; /* si was not initialized */
}
/* Make sure no deferred notification is still running or queued. */
taskqueue_drain(si->ntfytq, &si->ntfytask);
taskqueue_free(si->ntfytq);
si->ntfytq = NULL;
knlist_delete(&si->si.si_note, curthread, /*islocked=*/0);
knlist_destroy(&si->si.si_note);
/* now we don't need the mutex anymore */
mtx_destroy(&si->m);
}
/*
 * Allocate 'size' zeroed bytes from the M_DEVBUF malloc type.
 * Uses M_NOWAIT, so the result may be NULL; callers must check.
 */
void *
nm_os_malloc(size_t size)
{
    return (malloc(size, M_DEVBUF, M_NOWAIT | M_ZERO));
}
/*
 * Resize an allocation obtained from nm_os_malloc().  The old size is
 * not needed on FreeBSD (the allocator tracks it).  M_NOWAIT: may
 * return NULL, in which case the original block is left untouched.
 */
void *
nm_os_realloc(void *addr, size_t new_size, size_t old_size __unused)
{
    return (realloc(addr, new_size, M_DEVBUF, M_NOWAIT | M_ZERO));
}
/* Release memory obtained from nm_os_malloc()/nm_os_realloc(). */
void
nm_os_free(void *addr)
{
    free(addr, M_DEVBUF);
}
/* Take the global ifnet list read lock (OS-independent wrapper). */
void
nm_os_ifnet_lock(void)
{
IFNET_RLOCK();
}
/* Release the global ifnet list read lock taken by nm_os_ifnet_lock(). */
void
nm_os_ifnet_unlock(void)
{
IFNET_RUNLOCK();
}
/* Count of active users of the netmap module.
 * NOTE(review): incremented/decremented without atomics or a visible
 * lock -- presumably serialized by the callers; confirm before using
 * these from new contexts. */
static int netmap_use_count = 0;
/* Record one more active user of the module. */
void
nm_os_get_module(void)
{
netmap_use_count++;
}
/* Record one fewer active user of the module. */
void
nm_os_put_module(void)
{
netmap_use_count--;
}
/* ifnet arrival event: revive a netmap adapter whose interface
 * previously departed (was turned into a "zombie"). */
static void
netmap_ifnet_arrival_handler(void *arg __unused, struct ifnet *ifp)
{
netmap_undo_zombie(ifp);
}
/* ifnet departure event: neutralize the netmap adapter attached to the
 * vanishing interface by turning it into a "zombie". */
static void
netmap_ifnet_departure_handler(void *arg __unused, struct ifnet *ifp)
{
netmap_make_zombie(ifp);
}
/* Event handler tags, kept so nm_os_ifnet_fini() can deregister. */
static eventhandler_tag nm_ifnet_ah_tag;
static eventhandler_tag nm_ifnet_dh_tag;
/*
 * Register for interface arrival/departure events so netmap adapters
 * track the lifetime of their underlying ifnets.  Always returns 0.
 */
int
nm_os_ifnet_init(void)
{
nm_ifnet_ah_tag =
EVENTHANDLER_REGISTER(ifnet_arrival_event,
netmap_ifnet_arrival_handler,
NULL, EVENTHANDLER_PRI_ANY);
nm_ifnet_dh_tag =
EVENTHANDLER_REGISTER(ifnet_departure_event,
netmap_ifnet_departure_handler,
NULL, EVENTHANDLER_PRI_ANY);
return 0;
}
/* Deregister the ifnet event handlers installed by nm_os_ifnet_init(). */
void
nm_os_ifnet_fini(void)
{
EVENTHANDLER_DEREGISTER(ifnet_arrival_event,
nm_ifnet_ah_tag);
EVENTHANDLER_DEREGISTER(ifnet_departure_event,
nm_ifnet_dh_tag);
}
/*
 * Return the interface MTU.  The field moved between FreeBSD versions,
 * hence the compile-time switch.
 */
unsigned
nm_os_ifnet_mtu(struct ifnet *ifp)
{
#if __FreeBSD_version < 1100030
return ifp->if_data.ifi_mtu;
#else /* __FreeBSD_version >= 1100030 */
return ifp->if_mtu;
#endif
}
/*
 * Accumulate a raw Internet-style checksum over 'len' bytes of 'data'
 * on top of 'cur_sum'.  Bytes are summed as big-endian 16-bit words;
 * a trailing odd byte contributes its value in the high 8 bits.  The
 * accumulator is left unfolded (see nm_os_csum_fold()).
 */
rawsum_t
nm_os_csum_raw(uint8_t *data, size_t len, rawsum_t cur_sum)
{
    /* TODO XXX please use the FreeBSD implementation for this. */
    const uint16_t *words = (const uint16_t *)data;
    int i, nwords;

    nwords = len / 2;
    for (i = 0; i < nwords; i++)
        cur_sum += be16toh(words[i]);

    if (len & 1)
        cur_sum += data[len - 1] << 8;

    return (cur_sum);
}
/*
 * Fold a raw checksum accumulator down to 16 bits (adding back any
 * carries) and return its one's complement.  'cur_sum' is in host
 * byte order; the result is in network byte order, ready to be stored
 * into a packet header.
 */
uint16_t
nm_os_csum_fold(rawsum_t cur_sum)
{
    /* TODO XXX please use the FreeBSD implementation for this. */
    for (;;) {
        rawsum_t carry = cur_sum >> 16;

        if (carry == 0)
            break;
        cur_sum = (cur_sum & 0xFFFF) + carry;
    }
    return (htobe16((~cur_sum) & 0xFFFF));
}
/*
 * Checksum an IPv4 header using the generic raw-sum helpers.
 * (A dead "#if 0" alternative calling in_cksum_hdr() was removed;
 * this path was always the one compiled.)
 */
uint16_t
nm_os_csum_ipv4(struct nm_iphdr *iph)
{
    return (nm_os_csum_fold(nm_os_csum_raw((uint8_t *)iph,
        sizeof(struct nm_iphdr), 0)));
}
/*
 * Compute the TCP/UDP checksum of an IPv4 packet and store it in
 * *check.  The pseudo-header sum is written into *check first; the
 * protocol number is folded into the length passed to in_pseudo().
 * NOTE(review): for the final sum to include the pseudo-header, 'check'
 * presumably points inside 'data' -- confirm with callers.
 * Without INET compiled in, logs an error once and does nothing.
 */
void
nm_os_csum_tcpudp_ipv4(struct nm_iphdr *iph, void *data,
size_t datalen, uint16_t *check)
{
#ifdef INET
uint16_t pseudolen = datalen + iph->protocol;
/* Compute and insert the pseudo-header checksum. */
*check = in_pseudo(iph->saddr, iph->daddr,
htobe16(pseudolen));
/* Compute the checksum on TCP/UDP header + payload
* (includes the pseudo-header).
*/
*check = nm_os_csum_fold(nm_os_csum_raw(data, datalen, 0));
#else
static int notsupported = 0;
if (!notsupported) {
notsupported = 1;
nm_prerr("inet4 segmentation not supported");
}
#endif
}
/*
 * IPv6 counterpart of nm_os_csum_tcpudp_ipv4(): seed *check with the
 * IPv6 pseudo-header checksum via in6_cksum_pseudo(), then checksum
 * the transport header + payload in 'data'.
 * NOTE(review): as in the IPv4 case, 'check' presumably points inside
 * 'data' -- confirm with callers.
 * Without INET6 compiled in, logs an error once and does nothing.
 */
void
nm_os_csum_tcpudp_ipv6(struct nm_ipv6hdr *ip6h, void *data,
size_t datalen, uint16_t *check)
{
#ifdef INET6
*check = in6_cksum_pseudo((void*)ip6h, datalen, ip6h->nexthdr, 0);
*check = nm_os_csum_fold(nm_os_csum_raw(data, datalen, 0));
#else
static int notsupported = 0;
if (!notsupported) {
notsupported = 1;
nm_prerr("inet6 segmentation not supported");
}
#endif
}
/* on FreeBSD we send up one packet at a time */
/*
 * Pass mbuf 'm' up the host stack through the saved if_input routine.
 * 'prev' (a chain accumulator on other platforms) is unused here, and
 * the return value is always NULL since no chain is built.
 */
void *
nm_os_send_up(struct ifnet *ifp, struct mbuf *m, struct mbuf *prev)
{
NA(ifp)->if_input(ifp, m);
return NULL;
}
/*
 * Return non-zero if the mbuf requests TCP/UDP/SCTP checksum offload
 * (IPv4 or IPv6 variants) in its csum_flags.
 */
int
nm_os_mbuf_has_csum_offld(struct mbuf *m)
{
    const int csum_offld_mask = CSUM_TCP | CSUM_UDP | CSUM_SCTP |
        CSUM_TCP_IPV6 | CSUM_UDP_IPV6 | CSUM_SCTP_IPV6;

    return (m->m_pkthdr.csum_flags & csum_offld_mask);
}
/* Return non-zero if the mbuf requests TCP segmentation offload (TSO). */
int
nm_os_mbuf_has_seg_offld(struct mbuf *m)
{
return m->m_pkthdr.csum_flags & CSUM_TSO;
}
/*
 * if_input replacement installed by nm_os_catch_rx(): offer each
 * received mbuf to the emulated netmap adapter; if the adapter does
 * not steal it, fall back to the saved original if_input so the host
 * stack still sees the packet.
 */
static void
freebsd_generic_rx_handler(struct ifnet *ifp, struct mbuf *m)
{
int stolen;
/* The emulated adapter may have been detached under us. */
if (unlikely(!NM_NA_VALID(ifp))) {
nm_prlim(1, "Warning: RX packet intercepted, but no"
" emulated adapter");
return;
}
stolen = generic_rx_handler(ifp, m);
if (!stolen) {
struct netmap_generic_adapter *gna =
(struct netmap_generic_adapter *)NA(ifp);
gna->save_if_input(ifp, m);
}
}
/*
 * Intercept the rx routine in the standard device driver.
 * Second argument is non-zero to intercept, 0 to restore
 */
/*
 * When intercepting, the driver's if_input is saved in the generic
 * adapter and replaced with freebsd_generic_rx_handler(); restoring
 * puts the saved routine back.  Returns 0 on success, EBUSY if RX is
 * already intercepted, EINVAL if there is nothing to restore.  The
 * ifnet lock serializes against concurrent catch/uncatch.
 */
int
nm_os_catch_rx(struct netmap_generic_adapter *gna, int intercept)
{
struct netmap_adapter *na = &gna->up.up;
struct ifnet *ifp = na->ifp;
int ret = 0;
nm_os_ifnet_lock();
if (intercept) {
if (gna->save_if_input) {
nm_prerr("RX on %s already intercepted", na->name);
ret = EBUSY; /* already set */
goto out;
}
gna->save_if_input = ifp->if_input;
ifp->if_input = freebsd_generic_rx_handler;
} else {
if (!gna->save_if_input) {
nm_prerr("Failed to undo RX intercept on %s",
na->name);
ret = EINVAL; /* not saved */
goto out;
}
ifp->if_input = gna->save_if_input;
gna->save_if_input = NULL;
}
out:
nm_os_ifnet_unlock();
return ret;
}
/*
 * Intercept the packet steering routine in the tx path,
 * so that we can decide which queue is used for an mbuf.
 * Second argument is non-zero to intercept, 0 to restore.
 * On freebsd we just intercept if_transmit.
 */
/*
 * The driver's if_transmit is stashed in na->if_transmit and replaced
 * with netmap_transmit; restore swaps it back.  Unlike nm_os_catch_rx()
 * there is no already-intercepted check.  Always returns 0.
 */
int
nm_os_catch_tx(struct netmap_generic_adapter *gna, int intercept)
{
struct netmap_adapter *na = &gna->up.up;
struct ifnet *ifp = netmap_generic_getifp(gna);
nm_os_ifnet_lock();
if (intercept) {
na->if_transmit = ifp->if_transmit;
ifp->if_transmit = netmap_transmit;
} else {
ifp->if_transmit = na->if_transmit;
}
nm_os_ifnet_unlock();
return 0;
}
/*
 * Transmit routine used by generic_netmap_txsync(). Returns 0 on success
 * and non-zero on error (which may be packet drops or other errors).
 * addr and len identify the netmap buffer, m is the (preallocated)
 * mbuf to use for transmissions.
 *
 * We should add a reference to the mbuf so the m_freem() at the end
 * of the transmission does not consume resources.
 *
 * On FreeBSD, and on multiqueue cards, we can force the queue using
 * if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
 * i = m->m_pkthdr.flowid % adapter->num_queues;
 * else
 * i = curcpu % adapter->num_queues;
 *
 */
int
nm_os_generic_xmit_frame(struct nm_os_gen_arg *a)
{
int ret;
u_int len = a->len;
struct ifnet *ifp = a->ifp;
struct mbuf *m = a->m;
#if __FreeBSD_version < 1100000
/*
* Old FreeBSD versions. The mbuf has a cluster attached,
* we need to copy from the cluster to the netmap buffer.
*/
if (MBUF_REFCNT(m) != 1) {
nm_prerr("invalid refcnt %d for %p", MBUF_REFCNT(m), m);
panic("in generic_xmit_frame");
}
/* Never write past the cluster; truncate to its size instead. */
if (m->m_ext.ext_size < len) {
nm_prlim(2, "size %d < len %d", m->m_ext.ext_size, len);
len = m->m_ext.ext_size;
}
bcopy(a->addr, m->m_data, len);
#else /* __FreeBSD_version >= 1100000 */
/* New FreeBSD versions. Link the external storage to
* the netmap buffer, so that no copy is necessary. */
m->m_ext.ext_buf = m->m_data = a->addr;
m->m_ext.ext_size = len;
#endif /* __FreeBSD_version >= 1100000 */
m->m_flags |= M_PKTHDR;
m->m_len = m->m_pkthdr.len = len;
/* mbuf refcnt is not contended, no need to use atomic
* (a memory barrier is enough). */
SET_MBUF_REFCNT(m, 2);
/* Pin the flow to ring a->ring_nr so the driver keeps queue affinity. */
M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
m->m_pkthdr.flowid = a->ring_nr;
m->m_pkthdr.rcvif = ifp; /* used for tx notification */
CURVNET_SET(ifp->if_vnet);
ret = NA(ifp)->if_transmit(ifp, m);
CURVNET_RESTORE();
return ret ? -1 : 0;
}
#if __FreeBSD_version >= 1100005
/* Public accessor: return the netmap adapter attached to an interface. */
struct netmap_adapter *
netmap_getna(if_t ifp)
{
return (NA((struct ifnet *)ifp));
}
#endif /* __FreeBSD_version >= 1100005 */
/*
 * The following two functions are empty until we have a generic
 * way to extract the info from the ifp
 */
/* Stub: leaves *tx and *rx untouched and returns 0 (use defaults). */
int
nm_os_generic_find_num_desc(struct ifnet *ifp, unsigned int *tx, unsigned int *rx)
{
return 0;
}
/*
 * Report the number of tx/rx rings for a generic (emulated) adapter:
 * the netmap_generic_rings tunable, with a floor of one ring each.
 */
void
nm_os_generic_find_num_queues(struct ifnet *ifp, u_int *txq, u_int *rxq)
{
    unsigned nrings;

    nrings = netmap_generic_rings;
    if (nrings == 0)
        nrings = 1;
    *txq = nrings;
    *rxq = nrings;
}
/* Advertise which generic-adapter features this OS layer provides. */
void
nm_os_generic_set_features(struct netmap_generic_adapter *gna)
{
gna->rxsg = 1; /* Supported through m_copydata. */
gna->txqdisc = 0; /* Not supported. */
}
/*
 * Initialize interrupt-mitigation state for ring 'idx' of adapter
 * 'na'.  Mitigation is otherwise a no-op on FreeBSD (the start /
 * restart / active / cleanup hooks below do nothing).
 */
void
nm_os_mitigation_init(struct nm_generic_mit *mit, int idx, struct netmap_adapter *na)
{
    mit->mit_na = na;
    mit->mit_ring_idx = idx;
    mit->mit_pending = 0;
}
/* No-op on FreeBSD: no mitigation timer is armed. */
void
nm_os_mitigation_start(struct nm_generic_mit *mit)
{
}
/* No-op on FreeBSD: there is no mitigation timer to restart. */
void
nm_os_mitigation_restart(struct nm_generic_mit *mit)
{
}
/* Always 0 on FreeBSD: mitigation is never active. */
int
nm_os_mitigation_active(struct nm_generic_mit *mit)
{
return 0;
}
/* No-op on FreeBSD: nothing was allocated by nm_os_mitigation_init(). */
void
nm_os_mitigation_cleanup(struct nm_generic_mit *mit)
{
}
/* ioctl/init placeholder for persistent VIs: rejects every request. */
static int
nm_vi_dummy(struct ifnet *ifp, u_long cmd, caddr_t addr)
{
return EINVAL;
}
/* if_start placeholder: persistent VIs transmit only via netmap, so
 * reaching this routine indicates a programming error. */
static void
nm_vi_start(struct ifnet *ifp)
{
panic("nm_vi_start() must not be called");
}
/*
 * Index manager of persistent virtual interfaces.
 * It is used to decide the lowest byte of the MAC address.
 * We use the same algorithm with management of bridge port index.
 */
#define NM_VI_MAX 255
static struct {
uint8_t index[NM_VI_MAX]; /* XXX just for a reasonable number */
uint8_t active; /* entries [0, active) are currently in use */
struct mtx lock; /* protects index[] and active */
} nm_vi_indices;
/*
 * Set up the pool of persistent-VI indices: all NM_VI_MAX values are
 * initially free, none active, guarded by a dedicated mutex.
 */
void
nm_os_vi_init_index(void)
{
    int n;

    for (n = 0; n < NM_VI_MAX; n++)
        nm_vi_indices.index[n] = n;
    nm_vi_indices.active = 0;
    mtx_init(&nm_vi_indices.lock, "nm_vi_indices_lock", NULL, MTX_DEF);
}
/*
 * Pop a free index from the pool; return -1 when all NM_VI_MAX indices
 * are already in use.
 */
static int
nm_vi_get_index(void)
{
    int idx;

    mtx_lock(&nm_vi_indices.lock);
    if (nm_vi_indices.active == NM_VI_MAX)
        idx = -1;
    else
        idx = nm_vi_indices.index[nm_vi_indices.active++];
    mtx_unlock(&nm_vi_indices.lock);

    return (idx);
}
/*
 * Return index 'val' to the pool: locate it among the active entries,
 * swap it with the last active entry, and shrink the active count.
 * Logs an error if 'val' is not currently active.
 */
static void
nm_vi_free_index(uint8_t val)
{
int i, lim;
mtx_lock(&nm_vi_indices.lock);
lim = nm_vi_indices.active;
for (i = 0; i < lim; i++) {
if (nm_vi_indices.index[i] == val) {
/* swap index[i] and index[lim-1] */
int tmp = nm_vi_indices.index[lim-1];
nm_vi_indices.index[lim-1] = val;
nm_vi_indices.index[i] = tmp;
nm_vi_indices.active--;
break;
}
}
/* active unchanged means the loop never matched val */
if (lim == nm_vi_indices.active)
nm_prerr("Index %u not found", val);
mtx_unlock(&nm_vi_indices.lock);
}
#undef NM_VI_MAX
/*
* Implementation of a netmap-capable virtual interface that
* registered to the system.
* It is based on if_tap.c and ip_fw_log.c in FreeBSD 9.
*
* Note: Linux sets refcount to 0 on allocation of net_device,
* then increments it on registration to the system.
* FreeBSD sets refcount to 1 on if_alloc(), and does not
* increment this refcount on if_attach().
*/
int
nm_os_vi_persist(const char *name, struct ifnet **ret)
{
struct ifnet *ifp;
u_short macaddr_hi;
uint32_t macaddr_mid;
u_char eaddr[6];
int unit = nm_vi_get_index(); /* just to decide MAC address */
if (unit < 0)
return EBUSY;
/*
* We use the same MAC address generation method with tap
* except for the highest octet is 00:be instead of 00:bd
*/
macaddr_hi = htons(0x00be); /* XXX tap + 1 */
macaddr_mid = (uint32_t) ticks;
bcopy(&macaddr_hi, eaddr, sizeof(short));
bcopy(&macaddr_mid, &eaddr[2], sizeof(uint32_t));
eaddr[5] = (uint8_t)unit;
ifp = if_alloc(IFT_ETHER);
if (ifp == NULL) {
nm_prerr("if_alloc failed");
return ENOMEM;
}
if_initname(ifp, name, IF_DUNIT_NONE);
ifp->if_mtu = 65536;
ifp->if_flags = IFF_UP | IFF_SIMPLEX | IFF_MULTICAST;
ifp->if_init = (void *)nm_vi_dummy;
ifp->if_ioctl = nm_vi_dummy;
ifp->if_start = nm_vi_start;
ifp->if_mtu = ETHERMTU;
IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen);
ifp->if_capabilities |= IFCAP_LINKSTATE;
ifp->if_capenable |= IFCAP_LINKSTATE;
ether_ifattach(ifp, eaddr);
*ret = ifp;
return 0;
}
/* unregister from the system and drop the final refcount */
/* Also returns the interface's MAC index (last LLADDR byte) to the pool. */
void
nm_os_vi_detach(struct ifnet *ifp)
{
nm_vi_free_index(((char *)IF_LLADDR(ifp))[5]);
ether_ifdetach(ifp);
if_free(ifp);
}
#ifdef WITH_EXTMEM
#include
#include
/* A user-supplied external memory region wired into the kernel map. */
struct nm_os_extmem {
vm_object_t obj; /* referenced VM object backing the region */
vm_offset_t kva; /* kernel VA where the object is mapped */
vm_offset_t size; /* size of the mapping in bytes */
uintptr_t scan; /* cursor used by nm_os_extmem_nextpage() */
};
/*
 * Release an external memory region: unmapping it from the kernel map
 * also drops the object reference the map holds, then free 'e'.
 */
void
nm_os_extmem_delete(struct nm_os_extmem *e)
{
nm_prinf("freeing %zx bytes", (size_t)e->size);
vm_map_remove(kernel_map, e->kva, e->kva + e->size);
nm_os_free(e);
}
/*
 * Walk the mapping one page at a time: return the kernel address of
 * the next page and advance the cursor, or NULL once the whole region
 * has been consumed.
 */
char *
nm_os_extmem_nextpage(struct nm_os_extmem *e)
{
    char *page = NULL;

    if (e->scan < e->kva + e->size) {
        page = (char *)e->scan;
        e->scan += PAGE_SIZE;
    }
    return (page);
}
/* Two extmem regions are the same iff they share the backing VM object. */
int
nm_os_extmem_isequal(struct nm_os_extmem *e1, struct nm_os_extmem *e2)
{
return (e1->obj == e2->obj);
}
/* Number of pages in the external memory region. */
int
nm_os_extmem_nr_pages(struct nm_os_extmem *e)
{
return e->size >> PAGE_SHIFT;
}
struct nm_os_extmem *
nm_os_extmem_create(unsigned long p, struct nmreq_pools_info *pi, int *perror)
{
vm_map_t map;
vm_map_entry_t entry;
vm_object_t obj;
vm_prot_t prot;
vm_pindex_t index;
boolean_t wired;
struct nm_os_extmem *e = NULL;
int rv, error = 0;
e = nm_os_malloc(sizeof(*e));
if (e == NULL) {
error = ENOMEM;
goto out;
}
map = &curthread->td_proc->p_vmspace->vm_map;
rv = vm_map_lookup(&map, p, VM_PROT_RW, &entry,
&obj, &index, &prot, &wired);
if (rv != KERN_SUCCESS) {
nm_prerr("address %lx not found", p);
goto out_free;
}
/* check that we are given the whole vm_object ? */
vm_map_lookup_done(map, entry);
// XXX can we really use obj after releasing the map lock?
e->obj = obj;
vm_object_reference(obj);
/* wire the memory and add the vm_object to the kernel map,
* to make sure that it is not fred even if the processes that
* are mmap()ing it all exit
*/
e->kva = vm_map_min(kernel_map);
e->size = obj->size << PAGE_SHIFT;
rv = vm_map_find(kernel_map, obj, 0, &e->kva, e->size, 0,
VMFS_OPTIMAL_SPACE, VM_PROT_READ | VM_PROT_WRITE,
VM_PROT_READ | VM_PROT_WRITE, 0);
if (rv != KERN_SUCCESS) {
nm_prerr("vm_map_find(%zx) failed", (size_t)e->size);
goto out_rel;
}
rv = vm_map_wire(kernel_map, e->kva, e->kva + e->size,
VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES);
if (rv != KERN_SUCCESS) {
nm_prerr("vm_map_wire failed");
goto out_rem;
}
e->scan = e->kva;
return e;
out_rem:
vm_map_remove(kernel_map, e->kva, e->kva + e->size);
e->obj = NULL;
out_rel:
vm_object_deallocate(e->obj);
out_free:
nm_os_free(e);
out:
if (perror)
*perror = error;
return NULL;
}
#endif /* WITH_EXTMEM */
/* ================== PTNETMAP GUEST SUPPORT ==================== */
#ifdef WITH_PTNETMAP
#include
#include